/src/main/clojure/wwmm/pubcrawler/core.clj
Clojure | 62 lines | 52 code | 10 blank | 0 comment | 6 complexity | acdc27631ea8fd15975ddc1bba1a178f MD5 | raw file
;; Namespace declaration: HTTP client (aliased as http), plus the Java
;; classes used for reading strings and building/querying XOM documents.
(ns wwmm.pubcrawler.core
  (:require [com.twinql.clojure.http :as http])
  (:import (java.io StringReader)
           (nu.xom Builder XPathContext)
           (org.xml.sax.helpers XMLReaderFactory)))
;; Struct basis for a journal with three fixed keys, in this order:
;; :abbreviation, :title and :volume-offset.
;; NOTE(review): nothing in the visible part of this file reads these keys,
;; so their exact meaning should be confirmed against the callers.
(defstruct journal
  :abbreviation
  :title
  :volume-offset)
;; Single nu.xom.Builder created at load time. It is handed an XMLReader
;; instantiated from the class name "org.ccil.cowan.tagsoup.Parser".
;; NOTE(review): this one instance is shared by every caller of
;; get-webpage-xml - confirm that is acceptable if pages are fetched
;; concurrently.
(def tagsoup-builder
  (let [tagsoup-reader (XMLReaderFactory/createXMLReader "org.ccil.cowan.tagsoup.Parser")]
    (Builder. tagsoup-reader)))
-
(defn get-webpage-string
  "Performs an HTTP GET on url, requesting the body as a string
  (:as :string), and returns the :content entry of the response."
  [url]
  (let [response (http/get url :as :string)]
    (:content response)))
-
(defn get-webpage-xml
  "Fetches the page at url as a string with get-webpage-string, wraps it
  in a StringReader and feeds that to tagsoup-builder. Returns whatever
  the builder's build method produces (a nu.xom.Document); the reader is
  closed before returning."
  [url]
  (let [page-text (get-webpage-string url)]
    (with-open [reader (StringReader. page-text)]
      (.build tagsoup-builder reader))))
-
(defn unpack-nodes
  "Converts a nu.xom.Nodes object to a vector of nu.xom.Node, keeping the
  order in which the Nodes object holds them (index 0 first). An empty
  Nodes object yields an empty vector."
  [nodes]
  ;; Indices are visited in ascending order. The previous version counted
  ;; down from size-1 while conj-ing onto a vector; since conj appends to a
  ;; vector, that returned the nodes in reverse order.
  (vec (for [i (range (.size nodes))]
         (.get nodes i))))
-
(defn xpath-query
  "Evaluates the XPath expression xpath against dct by calling dct's
  query method with xpath and context, and returns the result of that
  call (the matching nodes)."
  [dct xpath context]
  (.query dct xpath context))
-
(defn xpath-query-html
  "Runs xpath against the HTML document dct via xpath-query. The XHTML
  namespace (http://www.w3.org/1999/xhtml) is bound to the prefix 'x' in
  the XPathContext, so expressions should look like './x:html/x:body'."
  [dct xpath]
  (let [xhtml-context (XPathContext. "x" "http://www.w3.org/1999/xhtml")]
    (xpath-query dct xpath xhtml-context)))
(defn get-html-elements
  "Fetches and parses the page at url once, then evaluates each of the
  given XPath strings against it. Returns a lazy seq with one entry per
  XPath, in argument order; each entry is the unpacked collection of
  nodes that XPath matched."
  [url & xpaths]
  (let [document    (get-webpage-xml url)
        matches-for (fn [xp]
                      (unpack-nodes (xpath-query-html document xp)))]
    (map matches-for xpaths)))
-
(defn scrape
  "First attempt at a recursive scrape function - NEEDS MORE WORK.

  Fetches url, evaluates the first of xpaths against it and takes the
  string value (getValue) of every matching node. Each value is wrapped
  with the :add-prefix and :add-postfix entries of the first map in
  options (missing entries contribute nothing) to form a new URL string.

  While further xpaths remain and an options map was available for this
  level, each of those URLs is scraped in turn with the remaining xpaths
  and options, giving a lazy seq of nested results. Otherwise the lazy
  seq of URL strings built at this level is returned."
  [url xpaths options]
  (let [elements (first (get-html-elements url (first xpaths)))
        values   (map #(.getValue %) elements)
        opts     (first options)
        urls     (map #(str (:add-prefix opts) % (:add-postfix opts)) values)]
    ;; Recurse only while another XPath is left to apply. (rest xpaths) is
    ;; an empty - but truthy - seq once the xpaths are used up, so testing
    ;; xpaths itself would recurse into a level that has a nil XPath.
    (if (and (next xpaths) opts)
      ;; The recursive call previously referred to an undefined symbol
      ;; (something), which prevented the namespace from compiling.
      (map #(scrape % (rest xpaths) (rest options)) urls)
      urls)))
-
-