PageRenderTime 23ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/clojure/wwmm/pubcrawler/core.clj

https://bitbucket.org/nickday/pubcrawler-clj
Clojure | 62 lines | 52 code | 10 blank | 0 comment | 6 complexity | acdc27631ea8fd15975ddc1bba1a178f MD5 | raw file
  1. (ns wwmm.pubcrawler.core
  2. (:require [com.twinql.clojure.http :as http])
  3. (:import
  4. (java.io StringReader)
  5. (nu.xom Builder XPathContext)
  6. (org.xml.sax.helpers XMLReaderFactory)))
  ;; Struct basis describing a journal; instances carry the keys
  ;; :abbreviation, :title and :volume-offset.
  (def journal (create-struct :abbreviation :title :volume-offset))

  ;; XOM Builder shared by this namespace. It is backed by the SAX reader
  ;; class org.ccil.cowan.tagsoup.Parser (TagSoup), looked up by name.
  (def tagsoup-builder
    (let [tagsoup-reader (XMLReaderFactory/createXMLReader "org.ccil.cowan.tagsoup.Parser")]
      (new Builder tagsoup-reader)))
  (defn get-webpage-string
    "Issues an HTTP GET for url, asking the client for the body as a
    string, and returns the :content of the response."
    [url]
    (let [response (http/get url :as :string)]
      (:content response)))
  (defn get-webpage-xml
    "Downloads the page at url via get-webpage-string and parses the text
    with tagsoup-builder (the Tagsoup-backed XOM builder), returning the
    resulting nu.xom.Document."
    [url]
    (let [page-text (get-webpage-string url)]
      ;; with-open closes the StringReader once the document is built.
      (with-open [reader (StringReader. page-text)]
        (.build tagsoup-builder reader))))
  (defn unpack-nodes
    "Converts a nu.xom.Nodes object to a vector of nu.xom.Node.

    nodes - any object exposing `size` and `get(int)`, as nu.xom.Nodes does.

    Returns a vector holding element 0 up to element size-1, i.e. in the
    same order the container reports them; an empty container yields []."
    [nodes]
    ;; Walk indices upwards: the previous loop started at size-1 and counted
    ;; down while conj-ing onto a vector, which returned the nodes reversed.
    (vec (for [idx (range (. nodes size))]
           (. nodes get idx))))
  (defn xpath-query
    "Runs the XPath expression xpath against dct by calling its `query`
    method with the supplied context, and returns whatever that call
    yields (the matching nodes)."
    [dct xpath context]
    (.query dct xpath context))
  (defn xpath-query-html
    "Runs xpath against the HTML document dct. A fresh XPathContext maps
    the prefix 'x' to the XHTML namespace, so expressions should be
    written like './x:html/x:body'."
    [dct xpath]
    (let [xhtml-context (XPathContext. "x" "http://www.w3.org/1999/xhtml")]
      (xpath-query dct xpath xhtml-context)))
  (defn get-html-elements
    "Fetches and parses the page at url once, then evaluates each of the
    given XPath strings against it. Returns a lazy seq with one entry per
    XPath, each entry being the unpacked nodes matched by that XPath."
    [url & xpaths]
    (let [document  (get-webpage-xml url)
          nodes-for (fn [xpath]
                      (unpack-nodes (xpath-query-html document xpath)))]
      (map nodes-for xpaths)))
  (defn scrape
    "First attempt at a recursive scrape function - NEEDS MORE WORK.

    url     - page to fetch.
    xpaths  - seq of XPath strings; the first is applied to url, the rest
              to the pages reached at each deeper level.
    options - seq of maps, one per level, read for :add-prefix and
              :add-postfix, which are wrapped around every matched value
              to build the URL for the next level.

    Returns a lazy seq of the strings built at the last level reached, or
    nested lazy seqs of them (one nesting level per recursion)."
    [url xpaths options]
    (let [elements         (first (get-html-elements url (first xpaths)))
          values           (map #(.getValue %) elements)
          opts             (first options)
          urls             (map #(str (:add-prefix opts) % (:add-postfix opts)) values)
          ;; `next` yields nil once the xpaths are exhausted, whereas `rest`
          ;; yields a truthy empty seq; testing the remainder keeps us from
          ;; recursing with no XPath left to apply.
          remaining-xpaths (next xpaths)]
      (if (and remaining-xpaths opts)
        ;; Recurse into every URL found at this level. The original called
        ;; the undefined placeholder `something` here.
        (map #(scrape % remaining-xpaths (rest options)) urls)
        urls)))