/thread/thread.go

http://github.com/drbawb/goChanner · Go · 127 lines · 77 code · 22 blank · 28 comment · 23 complexity · bc6c8b8ccdb0097ddb176da6e55ba390 MD5 · raw file

  1. /*
  2. This is the PonyChan.net driver for GoChanner.
  3. As it is the first [and as of this writing: the only] GoChanner driver, this serves as the API implementation.
  4. I'll factor out the API interface when it's a bit more stable, and move this implementation code to where it belongs (lib/drivers/pchan.go)
  5. See doc/LICENSE for licensing restrictions.
  6. "I'm on the verge, I'm on the verge - unravelling with every word."
  7. */
  8. package thread
  9. import (
  10. // "fmt"
  11. "regexp"
  12. "html"
  13. "strings"
  14. )
  15. // Very simple tree structure of posts
  16. // We will eventually analyze posts for `>>` reply forwards
  17. // These reply forwards will determine the ID of the parent
  18. // Then we can insert the post as a child by looking for the parent in the tree.
  19. type Thread struct {
  20. Node *html.Node
  21. Author Author
  22. Subject, Body string
  23. ThreadNo string
  24. }
  25. type Author struct {
  26. Name, Trip string
  27. }
  28. //Attempts to build the thread from a DOM tree
  29. func (t *Thread) Build(in *html.Node) {
  30. t.Node = in
  31. t.ExtractMeta() //make functional later
  32. }
  33. //Extracts meta-data from a thread that has an underlying DOM tree, returns err. otherwise.
  34. func (t *Thread) ExtractMeta() {
  35. regex := regexp.MustCompile(`[0-9]+`)
  36. for aix := 0; aix < len(t.Node.Attr); aix++ {
  37. if t.Node.Attr[aix].Key == "id" {
  38. //fmt.Printf("threads id is: %s", t.Node.Attr[aix].Val)
  39. t.ThreadNo = regex.FindString(t.Node.Attr[aix].Val)
  40. }
  41. }
  42. for ix := 0; ix < len(t.Node.Child); ix++ {
  43. c := t.Node.Child[ix]
  44. if c.Type == html.ElementNode && c.Data == "label" {
  45. t.extractMetaSpans(c)
  46. }
  47. }
  48. }
  49. func (t *Thread) extractPostNum(in *html.Node) {
  50. //for the ponychan.net impelmentation, the post no. is contained in span.reflinks
  51. //in the body of the second anchor tag. so look for that.
  52. }
  53. //TODO: Every time I look at this, I end up getting depressed. That is probably an indicator that this should be rewritten.
  54. func (t *Thread) extractMetaSpans(in *html.Node) {
  55. for ix := 0; ix < len(in.Child); ix++ {
  56. c := in.Child[ix]
  57. if c.Type == html.ElementNode && c.Data == "span" {
  58. for aix := 0; aix < len(c.Attr); aix++ {
  59. if c.Attr[aix].Key == "class" {
  60. switch c.Attr[aix].Val {
  61. case "postername":
  62. t.Author.Name = t.extAuthor(c)
  63. //fmt.Printf("author name: %s \n", t.Author.Name)
  64. case "filetitle":
  65. t.Subject = t.extSubj(c)
  66. //fmt.Printf("subj: %s \n", t.Subject)
  67. case "postertrip":
  68. //fmt.Printf("getting trip")
  69. t.Author.Trip = t.extTrip(c)
  70. case "":
  71. }
  72. }
  73. }
  74. }
  75. }
  76. }
  77. func (t *Thread) extTrip(in *html.Node) string {
  78. out := "DEBUG-default"
  79. out = strings.TrimLeft(in.Child[0].Data, "\r\n")
  80. return out
  81. }
  82. func (t *Thread) extSubj(in *html.Node) string {
  83. out := "DEBUG-default"
  84. //go over all nodes looking for
  85. out = strings.TrimLeft(in.Child[0].Data, "\r\n")
  86. return out
  87. }
  88. //Gets the author from a subnode of the DOM tree [implementation specific]
  89. //For pChan, the subnode is the first <label> tree under the <div class='thread'>
  90. //The <label> tree has a <span> named postername that is the authors name
  91. func (t *Thread) extAuthor(in *html.Node) string {
  92. out := "DEBUG-default"
  93. for ix := 0; ix < len(in.Child); ix++ {
  94. c := in.Child[ix]
  95. if c.Type == html.ElementNode && c.Data == "a" {
  96. out = c.Child[0].Data
  97. } else {
  98. out = c.Data
  99. }
  100. }
  101. return out
  102. }