PageRenderTime 77ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/vendor/gems/facets-2.4.5/lib/more/facets/tagiter.rb

https://bitbucket.org/mediashelf/fedora-migrator
Ruby | 366 lines | 138 code | 35 blank | 193 comment | 12 complexity | ebe0be3a25821968f0b77b11c9bd7306 MD5 | raw file
Possible License(s): GPL-3.0, GPL-2.0, IPL-1.0, AGPL-1.0, LGPL-3.0
  1. # = tagiterator.rb
  2. #
  3. # == Copyright (c) 2000 ?nyasu <nyasu@osk.3web.ne.jp>
  4. #
  5. # Ruby License
  6. #
  7. # This module is free software. You may use, modify, and/or redistribute this
  8. # software under the same terms as Ruby.
  9. #
  10. # This program is distributed in the hope that it will be useful, but WITHOUT
  11. # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. # FOR A PARTICULAR PURPOSE.
  13. #
  14. # == Author(s)
  15. #
  16. # * ?nyasu
  17. # Author:: ?nyasu <nyasu@osk.3web.ne.jp>
  18. # Copyright:: Copyright (c) 2000 ?nyasu
  19. # License:: Ruby License
  20. # = TagIterator (aka Tagiter)
  21. #
  22. # Simple but very useful HTML/XHTML cascading parser.
  23. #
  24. # Quickly iterate through tagged markup documents like HTML and XML.
  25. # TagIterator is great for quick and dirty web scraping.
  26. #
  27. # == Usage
  28. #
  29. # # sample html
  30. # stext = <<-EOF
  31. # <body> This is a test...
  32. # <sub> S1 </sub> <sub> S2 </sub>
  33. # <DL>
  34. # <DT> A1
  35. # <DT> A2
  36. # <DT> A3
  37. # </DL>
  38. # <DL>
  39. # <DT> B1
  40. # <DT> B2
  41. # <DT> B3
  42. # </DL>
  43. # <NEST>
  44. # <P ALIGN="R">TOP</P>
  45. # <NEST>
  46. # <P>SECOND</P>
  47. # <OL>
  48. # <LI>C1
  49. # <LI>C2
  50. # <LI>C3
  51. # <LI>C4
  52. # </OL>
  53. # </NEST>
  54. # <OL>
  55. # <LI>D1
  56. # <LI>D2
  57. # <LI>D3
  58. # <LI>D4
  59. # </OL>
  60. # </NEST>
  61. # </body>
  62. # EOF
  63. #
  64. # a = TagIterator.new(stext)
  65. # a.first("body") do |y|
  66. # y.nth("dl",2) do |dl|
  67. # dl.enumtag("dt") do |t|
  68. # puts t.text.strip
  69. # end
  70. # end
  71. # y.first("nest") do |n|
  72. # n.first("p") do |c|
  73. # print c.text, ' '
  74. # puts c.attributes.collect{ |k,v| "#{k}=#{v}" }
  75. # end.next("nest") do |m|
  76. # m.first("p") do |c|
  77. # puts c.text
  78. # end.next("ol") do |o|
  79. # o.enumtag("li") do |i| puts i.text.strip end
  80. # end
  81. # end.next("ol") do |o|
  82. # o.enumtag("li") do |i| puts i.text.strip end
  83. # end
  84. # end
  85. # end
  86. # a.each_block("sub") do |y|
  87. # puts y.text.strip
  88. # end
  89. #
  90. # _produces_
  91. #
  92. # B1
  93. # B2
  94. # B3
  95. # TOP align=R
  96. # SECOND
  97. # C1
  98. # C2
  99. # C3
  100. # C4
  101. # D1
  102. # D2
  103. # D3
  104. # D4
  105. # S1
  106. # S2
  107. #
  # Cascading iterator over tagged markup (HTML/XML-like text).
  #
  # Every lookup yields a new TagIterator wrapping the text found between an
  # opening tag and its terminator, so lookups can be nested ("cascaded").
  # Matching is done with regular expressions built from the tag name; the
  # tag name is inserted into the pattern verbatim, so it is treated as a
  # regular-expression fragment, not as a literal string.
  class TagIterator
    # The markup fragment this iterator operates on.
    attr_reader :text
    # Name of the tag this fragment was extracted for (nil when constructed
    # directly from a document).
    attr_reader :tag
    # Hash of the opening tag's attributes. Lookups through #[] downcase the
    # requested name first.
    attr_reader :attributes
    # Regexp option used for all tag searches. Defaults to "pi". An Integer is
    # used as Regexp flags, nil/false means case-sensitive, and any other
    # value means case-insensitive.
    attr_accessor :option

    # text::       the markup to search; must be a String.
    # tag::        name of the tag that produced +text+ (optional).
    # attributes:: attribute hash of that tag (optional). A singleton #[] is
    #              defined on this very object, so it must not be frozen.
    #
    # Raises RuntimeError when +text+ is not a String.
    def initialize(text, tag=nil, attributes={})
      raise RuntimeError, "Only String accepted" unless text.is_a?(String)
      @text = text
      @option = "pi"
      @tag = tag
      @attributes = attributes
      # Attribute names are stored downcased (see parse_attribute), so make
      # lookups insensitive to the case the caller uses.
      def @attributes.[](aname)
        super aname.downcase
      end
    end

    # Yields the +n+-th (1-based) occurrence of +tag+ as a new iterator and
    # returns an iterator over the text following that element's terminator.
    #
    # closetag:: when given, the element ends at the next +closetag+ element
    #            instead of at the matching "</tag>".
    #
    # Raises RuntimeError when +n+ is nil/false or when fewer than +n+
    # occurrences exist.
    def nth(tag, n, closetag=nil)
      raise RuntimeError, "nth: number not specified" unless n
      rest_at = 0
      first_at = last_at = 0 # declared here so they outlive the block below
      tag_source = nil
      1.upto(n) do |i|
        first_at, tag_source = find_opentag(tag, rest_at)
        raise RuntimeError, "tag(#{tag}) not found at(#{i})" unless first_at
        last_at = element_end(tag, first_at, closetag)
        rest_at = terminator_end(last_at)
      end
      yield self.class.new(@text[first_at..last_at], tag, parse_attribute(tag_source))
      remainder_from(rest_at)
    end

    # Same as nth(tag, 1, ...).
    def first(tag, *arg, &block)
      nth(tag, 1, *arg, &block)
    end
    alias_method :next, :first

    # Yields every occurrence of +tag+ (searching on after each element's
    # terminator) and returns an iterator over the text after the last
    # terminated element. Iteration stops after an element that has no
    # terminator, since that element extends to the end of the text.
    #
    # Raises RuntimeError when +tag+ does not occur at all.
    def each_block(tag, closetag=nil)
      rest_at = 0
      first_at, tag_source = find_opentag(tag)
      raise RuntimeError, "tag(#{tag}) not found" unless first_at
      while first_at
        last_at = element_end(tag, first_at, closetag)
        yield self.class.new(@text[first_at..last_at], tag, parse_attribute(tag_source))
        break if last_at < 0
        rest_at = terminator_end(last_at)
        first_at, tag_source = find_opentag(tag, rest_at)
      end
      remainder_from(rest_at)
    end

    # Returns the iterators yielded by each_block as an Array.
    def collect(*arg)
      blocks = []
      each_block(*arg) { |block| blocks.push block }
      blocks
    end

    # Iterates over elements that have no closing tag (e.g. <LI>, <DT>): each
    # item runs from the end of its opening tag up to the next opening of the
    # same tag, or to the end of the text for the last one.
    def enumtag(tag)
      first_at, tag_source = find_openenumtag(tag)
      while first_at
        # Search from first_at itself (not first_at + 1) so that a sibling tag
        # starting immediately after this one yields an empty item instead of
        # being swallowed into this item's text.
        last_at = find_closeenumtag(tag, first_at) || -1
        yield self.class.new(@text[first_at..last_at], tag, parse_attribute(tag_source))
        first_at, tag_source = find_openenumtag(tag, first_at)
      end
    end

    # Returns the iterators yielded by enumtag as an Array.
    def enumcollect(tag)
      items = []
      enumtag(tag) { |item| items.push item }
      items
    end

    # Yields the receiver itself.
    def for_this
      yield self
    end

    # Returns the iterator that nth would yield.
    def get_nth(*arg)
      found = nil
      nth(*arg) { |block| found = block }
      found
    end

    # Returns the iterator that first would yield.
    def get_first(*arg)
      found = nil
      first(*arg) { |block| found = block }
      found
    end

    # True when an opening +tag+ occurs at or after offset +st+.
    def tagexist?(tag, st=0)
      find_element(tag, st) ? true : false
    end

    # Returns the first whitespace-free token of the first complete "<...>"
    # in the text, or nil when there is no "<" or it is never closed by ">".
    def tagnext
      open_at = @text.index("<")
      return nil unless open_at
      close_at = @text.index(">", open_at)
      return nil unless close_at
      @text[open_at..close_at].scan(/[^<>\s]+/)[0]
    end

    # Returns the iterator over the text following the +n+-th +tag+ element.
    def nth_tailer(tag, n)
      nth(tag, n) { }
    end

    private

    # Translates @option into Integer flags for Regexp.new. Integers pass
    # through, nil/false mean "no flags", and everything else (including the
    # default "pi") means ignore-case. Passing an Integer keeps this working
    # on Ruby versions that reject unknown letters in a String option.
    def regexp_flags
      case @option
      when Integer then @option
      when nil, false then 0
      else Regexp::IGNORECASE
      end
    end

    # Offset of the "<" that starts +element+ at or after +st+, or nil.
    # +element+ is spliced into the pattern unescaped.
    def find_element(element, st=0)
      rex = Regexp.new('<(\s|\n)*' + element + '(\s|\n|>)', regexp_flags)
      @text.index(rex, st)
    end

    # Builds a hash of downcased attribute names to values from the inside of
    # an opening tag. Unquoted values are taken up to the next whitespace;
    # double-quoted values then override them with the quotes removed.
    def parse_attribute(attstr)
      found = {}
      attstr.scan(/(\w+)=(\S+)/) { |name, value| found[name] = value }
      attstr.scan(/(\w+)="([^"]*)"/) { |name, value| found[name] = value }
      result = {}
      found.each { |name, value| result[name.downcase] = value }
      result
    end

    # Locates the next opening +tag+ at or after +st+. Returns the offset just
    # past its ">" together with the text between "<" and ">", or nil when
    # the tag is missing or never closed by ">".
    def find_opentag(tag, st=0)
      open_at = find_element(tag, st)
      return nil unless open_at
      close_at = @text.index('>', open_at)
      return nil unless close_at
      return close_at + 1, @text[open_at + 1..close_at - 1]
    end
    alias_method :find_openenumtag, :find_opentag

    # Offset of the last character before the terminator that matches the
    # element whose content starts at +st+, or nil when there is none.
    #
    # Without +opentag+ the terminator is "</tag>". With +opentag+ the
    # terminator is the next +tag+ element and +opentag+ is the name that
    # counts as a nested opening.
    def find_closetag(tag, st, opentag=nil)
      if opentag
        close_at = find_element(tag, st)
        open_end, _ = find_opentag(opentag, st)
      else
        close_at = find_element('/\s*' + tag, st)
        open_end, _ = find_opentag(tag, st)
      end
      return nil unless close_at
      close_at -= 1
      # open_end - 1 is the ">" of the next opening tag and close_at + 1 the
      # "<" of the next terminator; the element is nested whenever that ">"
      # comes first, even with zero or one character in between.
      if open_end && close_at >= open_end - 1
        inner = find_closetag(tag, open_end, opentag)
        return nil unless inner # nested element never terminated
        close_at = find_closetag(tag, inner + 2, opentag)
      end
      close_at
    end

    # Offset just before the next "<tag" at or after +st+, or nil.
    def find_closeenumtag(tag, st=0)
      rex = Regexp.new('<\s*' + tag, regexp_flags)
      at = @text.index(rex, st)
      at ? at - 1 : nil
    end

    # Last content offset of the element starting at +first_at+; -1 (meaning
    # "to the end of the text") when no terminator exists.
    def element_end(tag, first_at, closetag)
      last_at = closetag ? find_closetag(closetag, first_at, tag) : find_closetag(tag, first_at)
      last_at || -1
    end

    # Offset of the ">" that ends the terminator following +last_at+, or the
    # text length when no ">" follows.
    def terminator_end(last_at)
      @text.index('>', last_at + 1) || @text.length
    end

    # New iterator over everything after offset +at+ (empty when +at+ is at
    # or past the end of the text).
    def remainder_from(at)
      self.class.new(@text[at + 1..-1] || "")
    end
  end
  246. # _____ _
  247. # |_ _|__ ___| |_
  248. # | |/ _ \/ __| __|
  249. # | | __/\__ \ |_
  250. # |_|\___||___/\__|
  251. #
  252. =begin testing
  253. require 'test/unit'
  254. class TC_TagIterator < Test::Unit::TestCase
  255. STEXT = <<-EOS
  256. <body> This is a test...
  257. <sub> S1 </sub> <sub> S2 </sub>
  258. <DL>
  259. <DT> A1
  260. <DT> A2
  261. <DT> A3
  262. </DL>
  263. <DL>
  264. <DT> B1
  265. <DT> B2
  266. <DT> B3
  267. </DL>
  268. <NEST>
  269. <P ALIGN="R">TOP</P>
  270. <NEST>
  271. <P>SECOND</P>
  272. <OL>
  273. <LI>C1
  274. <LI>C2
  275. <LI>C3
  276. <LI>C4
  277. </OL>
  278. </NEST>
  279. <OL>
  280. <LI>D1
  281. <LI>D2
  282. <LI>D3
  283. <LI>D4
  284. </OL>
  285. </NEST>
  286. </body>
  287. EOS
  288. def test_all
  289. assert_nothing_raised{ @a = TagIterator.new( STEXT ) }
  290. @f = []
  291. assert_nothing_raised {
  292. @a.first("body") do |y|
  293. y.nth("dl",2) do |dl|
  294. dl.enumtag("dt") do |t|
  295. @f << t.text.strip
  296. end
  297. end
  298. y.first("nest") do |n|
  299. n.first("p") do |c|
  300. @f << c.text
  301. @f.concat c.attributes.collect{ |k,v| "#{k}=#{v}" }
  302. end.next("nest") do |m|
  303. m.first("p") do |c|
  304. @f << c.text
  305. end.next("ol") do |o|
  306. o.enumtag("li") do |i| @f << i.text.strip end
  307. end
  308. end.next("ol") do |o|
  309. o.enumtag("li") do |i| @f << i.text.strip end
  310. end
  311. end
  312. end
  313. @a.each_block("sub") do |y|
  314. @f << y.text.strip
  315. end
  316. }
  317. o = [ "B1", "B2", "B3",
  318. "TOP", "align=R", "SECOND",
  319. "C1", "C2", "C3", "C4",
  320. "D1", "D2", "D3", "D4",
  321. "S1", "S2" ]
  322. assert_equal( o, @f )
  323. end
  324. end
  325. =end