PageRenderTime 47ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/src/main/clojure/clojure/contrib/lazy_xml.clj

http://github.com/richhickey/clojure-contrib
Clojure | 215 lines | 173 code | 26 blank | 16 comment | 13 complexity | ac5a232dd31f4b68e52ade76117edda2 MD5 | raw file
  1. ; Copyright (c) Chris Houser, Dec 2008. All rights reserved.
  2. ; The use and distribution terms for this software are covered by the
  3. ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
  4. ; which can be found in the file epl-v10.html at the root of this distribution.
  5. ; By using this software in any fashion, you are agreeing to be bound by
  6. ; the terms of this license.
  7. ; You must not remove this notice, or any other, from this software.
  8. ; Functions to parse xml lazily and emit back to text.
  9. (ns
  10. #^{:author "Chris Houser",
  11. :doc "Functions to parse xml lazily and emit back to text."}
  12. clojure.contrib.lazy-xml
  13. (:use [clojure.xml :as xml :only []]
  14. [clojure.contrib.seq :only [fill-queue]])
  15. (:import (org.xml.sax Attributes InputSource)
  16. (org.xml.sax.helpers DefaultHandler)
  17. (javax.xml.parsers SAXParserFactory)
  18. (java.util.concurrent LinkedBlockingQueue TimeUnit)
  19. (java.lang.ref WeakReference)
  20. (java.io Reader)))
  21. (defstruct node :type :name :attrs :str)
  22. ; http://www.extreme.indiana.edu/xgws/xsoap/xpp/
  23. (def has-pull false)
  24. (defn- parse-seq-pull [& _])
  25. (try
  26. (load "lazy_xml/with_pull")
  27. (catch Exception e
  28. (when-not (re-find #"XmlPullParser" (str e))
  29. (throw e))))
  30. (defn startparse-sax [s ch]
  31. (.. SAXParserFactory newInstance newSAXParser (parse s ch)))
  32. (defn parse-seq
  33. "Parses the source s, which can be a File, InputStream or String
  34. naming a URI. Returns a lazy sequence of maps with two or more of
  35. the keys :type, :name, :attrs, and :str. Other SAX-compatible
  36. parsers can be supplied by passing startparse, a fn taking a source
  37. and a ContentHandler and returning a parser. If a parser is
  38. specified, it will be run in a separate thread and be allowed to get
  39. ahead by queue-size items, which defaults to maxint. If no parser
  40. is specified and org.xmlpull.v1.XmlPullParser is in the classpath,
  41. this superior pull parser will be used."
  42. ([s] (if has-pull
  43. (parse-seq-pull s)
  44. (parse-seq s startparse-sax)))
  45. ([s startparse] (parse-seq s startparse Integer/MAX_VALUE))
  46. ([s startparse queue-size]
  47. (let [s (if (instance? Reader s) (InputSource. s) s)
  48. f (fn filler-func [fill]
  49. (startparse s (proxy [DefaultHandler] []
  50. (startElement [uri local-name q-name #^Attributes atts]
  51. ;(prn :start-element q-name)(flush)
  52. (let [attrs (into {} (for [i (range (.getLength atts))]
  53. [(keyword (.getQName atts i))
  54. (.getValue atts i)]))]
  55. (fill (struct node :start-element (keyword q-name) attrs))))
  56. (endElement [uri local-name q-name]
  57. ;(prn :end-element q-name)(flush)
  58. (fill (struct node :end-element (keyword q-name))))
  59. (characters [ch start length]
  60. ;(prn :characters)(flush)
  61. (let [st (String. ch start length)]
  62. (when (seq (.trim st))
  63. (fill (struct node :characters nil nil st))))))))]
  64. (fill-queue f :queue-size queue-size))))
  65. (defstruct element :tag :attrs :content)
  66. (declare mktree)
  67. (defn- siblings [coll]
  68. (lazy-seq
  69. (when-let [s (seq coll)]
  70. (let [event (first s)]
  71. (condp = (:type event)
  72. :characters (cons (:str event) (siblings (rest s)))
  73. :start-element (let [t (mktree s)]
  74. (cons (first t) (siblings (rest t))))
  75. :end-element [(rest s)])))))
  76. (defn- mktree
  77. [[elem & events]]
  78. (lazy-seq
  79. (let [sibs (siblings events)]
  80. ;(prn :elem elem)
  81. (cons
  82. (struct element (:name elem) (:attrs elem) (drop-last sibs))
  83. (lazy-seq (last sibs))))))
  84. (defn parse-trim
  85. "Parses the source s, which can be a File, InputStream or String
  86. naming a URI. Returns a lazy tree of the clojure.xml/element
  87. struct-map, which has the keys :tag, :attrs, and :content and
  88. accessor fns tag, attrs, and content, with the whitespace trimmed
  89. from around each content string. This format is compatible with what
  90. clojure.xml/parse produces, except :content is a lazy seq instead of
  91. a vector. Other SAX-compatible parsers can be supplied by passing
  92. startparse, a fn taking a source and a ContentHandler and returning
  93. a parser. If a parser is specified, it will be run in a separate
  94. thread and be allowed to get ahead by queue-size items, which
  95. defaults to maxing. If no parser is specified and
  96. org.xmlpull.v1.XmlPullParser is in the classpath, this superior pull
  97. parser will be used."
  98. ([s] (first (mktree (parse-seq s))))
  99. ([s startparse queue-size]
  100. (first (mktree (parse-seq s startparse queue-size)))))
  101. (defn attributes [e]
  102. (let [v (vec (:attrs e))]
  103. (reify org.xml.sax.Attributes
  104. (getLength [_] (count v))
  105. (getURI [_ i] (namespace (key (v i))))
  106. (getLocalName [_ i] (name (key (v i))))
  107. (getQName [_ i] (name (key (v i))))
  108. (getValue [_ uri name] (get (:attrs e) name))
  109. (#^String getValue [_ #^int i] (val (v i)))
  110. (#^String getType [_ #^int i] "CDATA"))))
  111. (defn- emit-element
  112. "Recursively prints as XML text the element struct e. To have it
  113. print extra whitespace like clojure.xml/emit, use the :pad true
  114. option."
  115. [e #^org.xml.sax.ContentHandler ch]
  116. (if (instance? String e)
  117. (.characters ch (.toCharArray #^String e) 0 (count e))
  118. (let [nspace (namespace (:tag e))
  119. qname (name (:tag e))]
  120. (.startElement ch (or nspace "") qname qname (attributes e))
  121. (doseq [c (:content e)]
  122. (emit-element c ch))
  123. (.endElement ch (or nspace "") qname qname))))
  124. (defn emit
  125. [e & {:as opts}]
  126. (let [content-handler (atom nil)
  127. trans (-> (javax.xml.transform.TransformerFactory/newInstance)
  128. .newTransformer)]
  129. (when (:indent opts)
  130. (.setOutputProperty trans "indent" "yes")
  131. (.setOutputProperty trans "{http://xml.apache.org/xslt}indent-amount"
  132. (str (:indent opts))))
  133. (when (contains? opts :xml-declaration)
  134. (.setOutputProperty trans "omit-xml-declaration"
  135. (if (:xml-declaration opts) "no" "yes")))
  136. (when (:encoding opts)
  137. (.setOutputProperty trans "encoding" (:encoding opts)))
  138. (.transform
  139. trans
  140. (javax.xml.transform.sax.SAXSource.
  141. (reify org.xml.sax.XMLReader
  142. (getContentHandler [_] @content-handler)
  143. (setDTDHandler [_ handler])
  144. (setFeature [_ name value])
  145. (setProperty [_ name value])
  146. (setContentHandler [_ ch] (reset! content-handler ch))
  147. (#^void parse [_ #^org.xml.sax.InputSource _]
  148. (when @content-handler
  149. (.startDocument @content-handler)
  150. (emit-element e @content-handler)
  151. (.endDocument @content-handler))))
  152. (org.xml.sax.InputSource.))
  153. (javax.xml.transform.stream.StreamResult. *out*))))
  154. (comment
  155. (def atomstr "<?xml version='1.0' encoding='UTF-8'?>
  156. <feed xmlns='http://www.w3.org/2005/Atom'>
  157. <id>tag:blogger.com,1999:blog-28403206</id>
  158. <updated>2008-02-14T08:00:58.567-08:00</updated>
  159. <title type='text'>n01senet</title>
  160. <link rel='alternate' type='text/html' href='http://n01senet.blogspot.com/'/>
  161. <entry xmlns:foo='http://foo' xmlns:bar='http://bar'>
  162. <id>1</id>
  163. <published>2008-02-13</published>
  164. <title type='text'>clojure is the best lisp yet</title>
  165. <author><name>Chouser</name></author>
  166. </entry>
  167. <entry>
  168. <id>2</id>
  169. <published>2008-02-07</published>
  170. <title type='text'>experimenting with vnc</title>
  171. <author><name>agriffis</name></author>
  172. </entry>
  173. </feed>
  174. ")
  175. (def tree (parse-trim (java.io.StringReader. atomstr)
  176. startparse-sax
  177. 1))
  178. (println "\nsax")
  179. (emit tree)
  180. (def tree (parse-trim (java.io.StringReader. atomstr)))
  181. (println "\ndefault")
  182. (emit tree)
  183. (def tree (xml/parse (org.xml.sax.InputSource. (java.io.StringReader. atomstr))))
  184. (println "\norig")
  185. (emit tree)
  186. ; When used with zip and zip-filter, you can get do queries like this
  187. ; without parsing more than the first few tags:
  188. ; (zip/node (first (xml-> (zip/xml-zip tree) :id)))
  189. )