core.clj | searchcode

/src/main/clojure/wwmm/pubcrawler/core.clj

https://bitbucket.org/nickday/pubcrawler-clj
Clojure | 62 lines | 52 code | 10 blank | 0 comment | 6 complexity | acdc27631ea8fd15975ddc1bba1a178f MD5 | raw file


(ns wwmm.pubcrawler.core
  (:require [com.twinql.clojure.http :as http])
  (:import 
  	(java.io StringReader)
    (nu.xom Builder XPathContext)
    (org.xml.sax.helpers XMLReaderFactory)))

;; Struct basis describing a journal; instances are created with
;; (struct journal abbreviation title volume-offset) or struct-map.
(def journal
  (create-struct :abbreviation :title :volume-offset))
;; Shared XOM Builder whose underlying SAX reader is the TagSoup parser,
;; created once when this namespace is loaded.
(def tagsoup-builder
  (let [tagsoup-reader (XMLReaderFactory/createXMLReader "org.ccil.cowan.tagsoup.Parser")]
    (Builder. tagsoup-reader)))
	
(defn get-webpage-string
  "Performs an HTTP GET on url, requesting the body as a string
  (:as :string), and returns the :content entry of the response."
  [url]
  (-> (http/get url :as :string)
      :content))
	
(defn get-webpage-xml
  "Fetches the page at url with get-webpage-string and parses it with
  tagsoup-builder, returning the result of the builder's build call
  (a nu.xom.Document).  The intermediate StringReader is closed before
  returning."
  [url]
  (let [page-text (get-webpage-string url)]
    (with-open [reader (StringReader. page-text)]
      (.build tagsoup-builder reader))))
		
(defn unpack-nodes
  "Converts a nu.xom.Nodes object to a vector of nu.xom.Node, keeping
  the nodes in the same order as they appear in nodes (index 0 first).
  Returns an empty vector when nodes is empty."
  [nodes]
  ;; Walk the indices upwards so the result mirrors the order of the
  ;; Nodes collection; counting down while conj-ing onto a vector would
  ;; hand the nodes back reversed.
  (vec (map (fn [i] (. nodes get i))
            (range (. nodes size)))))
		
(defn xpath-query
  "Runs the XPath expression xpath against dct, resolving namespace
  prefixes through context, and returns whatever dct's query method
  yields (the matching nodes)."
  [dct xpath context]
  (.query dct xpath context))
	
(defn xpath-query-html
  "Runs xpath against the HTML document dct via xpath-query.  The XHTML
  namespace (http://www.w3.org/1999/xhtml) is bound to the prefix 'x',
  so expressions should look like './x:html/x:body'."
  [dct xpath]
  (let [xhtml-context (XPathContext. "x" "http://www.w3.org/1999/xhtml")]
    (xpath-query dct xpath xhtml-context)))

(defn get-html-elements
  "Fetches and parses the page at url once, then evaluates each of the
  given XPath strings against it.  Returns a lazy seq with one entry per
  XPath, in argument order; each entry is the unpacked collection of
  nodes matched by that XPath."
  [url & xpaths]
  (let [html (get-webpage-xml url)
        matches-for (fn [xpath]
                      (unpack-nodes (xpath-query-html html xpath)))]
    (map matches-for xpaths)))
	
(defn scrape
  "Recursive scrape - still a first attempt, NEEDS MORE WORK.

  Fetches url, selects the nodes matching the first XPath in xpaths and
  turns each node's string value into a URL by wrapping it with the
  :add-prefix and :add-postfix entries of the first map in options
  (either may be absent).

  While further XPaths remain and an options map exists for the current
  level, each generated URL is scraped in turn with the remaining xpaths
  and options, giving a seq of per-URL results.  Otherwise the seq of
  generated URLs for this level is returned."
  [url xpaths options]
  (let [elements (first (get-html-elements url (first xpaths)))
        values (map #(.getValue %) elements)
        opts (first options)
        urls (map #(str (:add-prefix opts) % (:add-postfix opts)) values)
        remaining-xpaths (rest xpaths)]
    ;; (rest xpaths) is never nil, so test it with seq; recursing with no
    ;; XPath left would pass a nil expression to the query.
    (if (and (seq remaining-xpaths) opts)
      (map #(scrape % remaining-xpaths (rest options)) urls)
      urls)))