PageRenderTime 125ms CodeModel.GetById 40ms app.highlight 49ms RepoModel.GetById 31ms app.codeStats 0ms

/vendor/gems/facets-2.4.5/lib/more/facets/tagiter.rb

https://bitbucket.org/mediashelf/fedora-migrator
Ruby | 366 lines | 138 code | 35 blank | 193 comment | 12 complexity | ebe0be3a25821968f0b77b11c9bd7306 MD5 | raw file
  1# = tagiterator.rb
  2#
  3# == Copyright (c) 2000 ?nyasu <nyasu@osk.3web.ne.jp>
  4#
  5#   Ruby License
  6#
  7#   This module is free software. You may use, modify, and/or redistribute this
  8#   software under the same terms as Ruby.
  9#
 10#   This program is distributed in the hope that it will be useful, but WITHOUT
 11#   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 12#   FOR A PARTICULAR PURPOSE.
 13#
 14# == Author(s)
 15#
 16# * ?nyasu
 17
 18# Author::    ?nyasu <nyasu@osk.3web.ne.jp>
 19# Copyright:: Copyright (c) 2000 ?nyasu
 20# License::   Ruby License
 21
 22# = TagIterator (aka Tagiter)
 23#
 24# Simple but very useful HTML/XHTML cascading parser.
 25#
 26# Quickly iterate through tagged markup documents like HTML and XML.
 27# TagIterator is great for quick and dirty web scraping.
 28#
 29# == Usage
 30#
 31#   # sample html
 32#   stext = <<-EOF
 33#   <body> This is a test...
 34#     <sub> S1 </sub> <sub> S2 </sub>
 35#     <DL>
 36#       <DT> A1
 37#       <DT> A2
 38#       <DT> A3
 39#     </DL>
 40#     <DL>
 41#       <DT> B1
 42#       <DT> B2
 43#       <DT> B3
 44#     </DL>
 45#     <NEST>
 46#       <P ALIGN="R">TOP</P>
 47#       <NEST>
 48#         <P>SECOND</P>
 49#         <OL>
 50#           <LI>C1
 51#           <LI>C2
 52#           <LI>C3
 53#           <LI>C4
 54#         </OL>
 55#       </NEST>
 56#       <OL>
 57#         <LI>D1
 58#         <LI>D2
 59#         <LI>D3
 60#         <LI>D4
 61#       </OL>
 62#     </NEST>
 63#   </body>
 64#   EOF
 65#
 66#   a = TagIterator.new(stext)
 67#   a.first("body") do |y|
 68#     y.nth("dl",2) do |dl|
 69#       dl.enumtag("dt") do |t|
 70#         puts t.text.strip
 71#       end
 72#     end
 73#     y.first("nest") do |n|
 74#       n.first("p") do |c|
 75#         print c.text, ' '
 76#         puts c.attributes.collect{ |k,v| "#{k}=#{v}" }
 77#       end.next("nest") do |m|
 78#         m.first("p") do |c|
 79#           puts c.text
 80#         end.next("ol") do |o|
 81#           o.enumtag("li") do |i| puts i.text.strip end
 82#         end
 83#       end.next("ol") do |o|
 84#         o.enumtag("li") do |i| puts i.text.strip end
 85#       end
 86#     end
 87#   end
 88#   a.each_block("sub") do |y|
 89#     puts y.text.strip
 90#   end
 91#
 92# _produces_
 93#
 94#   B1
 95#   B2
 96#   B3
 97#   TOP align=R
 98#   SECOND
 99#   C1
100#   C2
101#   C3
102#   C4
103#   D1
104#   D2
105#   D3
106#   D4
107#   S1
108#   S2
109#
110
class TagIterator

  # The markup text covered by this iterator.
  attr_reader :text

  # Regexp option used for every tag search. An Integer is used as Regexp
  # flags; nil/false means a case-sensitive search; any other value (such as
  # the default "pi") means a case-insensitive search.
  attr_accessor :option

  # Name of the tag this block was extracted for (nil for a top-level or
  # tailer iterator).
  attr_reader :tag

  # Hash of the open tag's attributes, keyed by down-cased attribute name.
  attr_reader :attributes

  # Wraps +text+ for iteration.
  #
  # text::       the markup; must be a String, otherwise RuntimeError is raised.
  # tag::        tag name the text was extracted for, if any.
  # attributes:: attribute hash of that tag. NOTE: a singleton #[] is defined
  #              on this very hash object so lookups are case-insensitive.
  def initialize(text, tag=nil, attributes={})
    raise RuntimeError, "Only String accepted" unless text.is_a?(String)
    @text = text
    @option = "pi"
    @tag = tag
    @attributes = attributes
    def @attributes.[](aname)
      super aname.downcase
    end
  end

  # Yields the content of the +n+-th (1-based) +tag+ block as a new iterator
  # and returns a "tailer" iterator holding the text that follows the block.
  #
  # closetag:: optional element pattern that terminates the block instead of
  #            the regular </tag>.
  #
  # The block is optional; without one only the tailer is computed.
  # If the tag is never closed, the yielded block and the tailer both run to
  # the end of the text.
  #
  # Raises RuntimeError when +n+ is missing or below 1, or when fewer than
  # +n+ tags exist.
  def nth(tag, n, closetag=nil)
    raise RuntimeError, "nth: number not specified" unless n
    raise RuntimeError, "nth: number must be 1 or greater" if n.is_a?(Numeric) && n < 1

    resume = 0
    start = stop = 0
    attstr = nil

    1.upto(n) do |i|
      start, attstr = find_opentag(tag, resume)
      raise RuntimeError, "tag(#{tag}) not found at(#{i})" unless start

      stop = block_end(tag, start, closetag)
      # With a close tag, continue after its '>'; without one, continue right
      # after the open tag so an earlier '>' in the document is never reused.
      resume = stop >= 0 ? close_bracket_after(stop) : start - 1
    end

    yield self.class.new(@text[start..stop], tag, parse_attribute(attstr)) if block_given?
    tailer_from(resume)
  end

  # Same as nth(tag, 1, ...). Also available as #next, which reads naturally
  # when chained on a tailer.
  def first(tag, *arg, &block)
    nth(tag, 1, *arg, &block)
  end
  alias_method :next, :first

  # Yields every top-level +tag+ block in document order, then returns a
  # tailer iterator for the text after the last closed block.
  #
  # Raises RuntimeError when the tag does not occur at all.
  def each_block(tag, closetag=nil)
    resume = 0
    start, attstr = find_opentag(tag)
    raise RuntimeError, "tag(#{tag}) not found" unless start

    while start
      stop = block_end(tag, start, closetag)
      yield self.class.new(@text[start..stop], tag, parse_attribute(attstr))
      if stop >= 0
        resume = close_bracket_after(stop)
        start, attstr = find_opentag(tag, resume)
      else
        # Unclosed block: it swallowed the rest of the text, so stop here.
        resume = start - 1
        start = nil
      end
    end
    tailer_from(resume)
  end

  # Returns an Array of the iterators each_block would yield.
  def collect(*arg)
    blocks = []
    each_block(*arg) { |blk| blocks.push(blk) }
    blocks
  end

  # Iterates over "enumeration" tags that have no close tag (<li>, <dt>, ...):
  # each item runs from its open tag to just before the next open tag of the
  # same name, or to the end of the text for the last item.
  def enumtag(tag)
    start, attstr = find_openenumtag(tag)
    while start
      # Search from +start+ itself so an empty item ("<li><li>") is seen.
      stop = find_closeenumtag(tag, start) || -1
      yield self.class.new(@text[start..stop], tag, parse_attribute(attstr))
      start, attstr = find_openenumtag(tag, start)
    end
  end

  # Returns an Array of the iterators enumtag would yield.
  def enumcollect(tag)
    items = []
    enumtag(tag) { |item| items.push(item) }
    items
  end

  # Yields the receiver itself.
  def for_this
    yield self
  end

  # Returns the block nth would yield instead of yielding it.
  def get_nth(*arg)
    found = nil
    nth(*arg) { |blk| found = blk }
    found
  end

  # Returns the block first would yield instead of yielding it.
  def get_first(*arg)
    found = nil
    first(*arg) { |blk| found = blk }
    found
  end

  # True when an open +tag+ occurs at or after offset +st+.
  def tagexist?(tag, st=0)
    !find_element(tag, st).nil?
  end

  # Returns the name of the first complete tag in the text (including a
  # leading "/" for a close tag), or nil when there is no complete tag.
  def tagnext
    lt = @text.index("<")
    return nil unless lt
    gt = @text.index(">", lt)
    return nil unless gt
    @text[lt..gt].scan(/[^<>\s]+/)[0]
  end

  # Returns only the tailer that follows the +n+-th +tag+ block.
  def nth_tailer(tag, n)
    nth(tag, n)
  end

  private

  # Translates @option into Integer Regexp flags. Passing the legacy "pi"
  # string straight to Regexp.new only worked because older Rubies treated
  # any truthy option as "ignore case"; newer ones parse a String as flag
  # letters and reject unknown ones.
  def regexp_flags
    case @option
    when nil, false then 0
    when Integer then @option
    else Regexp::IGNORECASE
    end
  end

  # Index of the '<' that opens +element+ at or after +st+, or nil.
  # +element+ is inserted into the pattern verbatim (callers pass regexp
  # fragments such as '/\s*p').
  def find_element(element, st=0)
    rex = Regexp.new('<(\s|\n)*' + element + '(\s|\n|>)', regexp_flags)
    @text.index(rex, st)
  end

  # Parses the inside of an open tag into { "name" => "value" } with
  # down-cased names. Handles double-quoted, single-quoted and bare values in
  # one pass, so "k=v" text inside a quoted value is not mistaken for an
  # attribute. When a name repeats, the last occurrence wins.
  def parse_attribute(attstr)
    attrs = {}
    attstr.scan(/(\w+)=(?:"([^"]*)"|'([^']*)'|(\S+))/) do |name, dquoted, squoted, bare|
      attrs[name.downcase] = dquoted || squoted || bare
    end
    attrs
  end

  # Locates the next open +tag+ at or after +st+. Returns
  # [index just past its '>', text between '<' and '>'], or nil when the tag
  # is absent or never terminated by '>'.
  def find_opentag(tag, st=0)
    lt = find_element(tag, st)
    return nil unless lt

    gt = @text.index('>', lt)
    return nil unless gt
    return gt + 1, @text[lt+1..gt-1]
  end

  # Returns the index of the last content character before the tag that
  # closes the block whose content starts at +st+, skipping over properly
  # nested blocks of the same tag. Returns nil when no matching close exists.
  #
  # Without +opentag+, +tag+ is the tag name and the closer is </tag>.
  # With +opentag+, +tag+ is the closing element pattern and +opentag+ the
  # name whose re-opening counts as nesting.
  def find_closetag(tag, st, opentag=nil)
    if opentag
      close_lt = find_element(tag, st)
      open_end, _attrs = find_opentag(opentag, st)
    else
      close_lt = find_element('/\s*' + tag, st)
      open_end, _attrs = find_opentag(tag, st)
    end
    return nil unless close_lt

    # Another open tag ends at or before the closer: that closer belongs to
    # the nested block. Skip the nested block, then look for ours after it.
    if open_end && close_lt >= open_end
      inner = find_closetag(tag, open_end, opentag)
      return nil unless inner   # unbalanced markup: nothing left to match
      return find_closetag(tag, inner + 2, opentag)
    end

    close_lt - 1
  end

  # Index just before the next open +tag+ at or after +st+, or nil. Uses the
  # same tag-boundary rule as find_element so "li" does not stop at "<link".
  def find_closeenumtag(tag, st=0)
    lt = find_element(tag, st)
    lt ? lt - 1 : nil
  end
  alias_method :find_openenumtag, :find_opentag

  # End index of the block starting at +start+, or -1 (i.e. "to the end of
  # the text") when it is never closed.
  def block_end(tag, start, closetag)
    stop = closetag ? find_closetag(closetag, start, tag) : find_closetag(tag, start)
    stop || -1
  end

  # Index of the first '>' after +pos+, or the text length when none exists.
  def close_bracket_after(pos)
    @text.index('>', pos + 1) || @text.length
  end

  # New iterator over everything after index +pos+; empty when nothing is
  # left (String#[] returns nil past the end, which initialize would reject).
  def tailer_from(pos)
    self.class.new(@text[pos+1..-1] || "")
  end

end
277
278
279
280#  _____         _
281# |_   _|__  ___| |_
282#   | |/ _ \/ __| __|
283#   | |  __/\__ \ |_
284#   |_|\___||___/\__|
285#
286
287=begin testing
288
289require 'test/unit'
290
291class TC_TagIterator < Test::Unit::TestCase
292
293  STEXT = <<-EOS
294  <body> This is a test...
295    <sub> S1 </sub> <sub> S2 </sub>
296    <DL>
297      <DT> A1
298      <DT> A2
299      <DT> A3
300    </DL>
301    <DL>
302      <DT> B1
303      <DT> B2
304      <DT> B3
305    </DL>
306    <NEST>
307      <P ALIGN="R">TOP</P>
308      <NEST>
309        <P>SECOND</P>
310        <OL>
311          <LI>C1
312          <LI>C2
313          <LI>C3
314          <LI>C4
315        </OL>
316      </NEST>
317      <OL>
318        <LI>D1
319        <LI>D2
320        <LI>D3
321        <LI>D4
322      </OL>
323    </NEST>
324  </body>
325  EOS
326
327  def test_all
328    assert_nothing_raised{ @a = TagIterator.new( STEXT ) }
329    @f = []
330    assert_nothing_raised {
331      @a.first("body") do |y|
332        y.nth("dl",2) do |dl|
333          dl.enumtag("dt") do |t|
334            @f << t.text.strip
335          end
336        end
337        y.first("nest") do |n|
338          n.first("p") do |c|
339            @f << c.text
340            @f.concat c.attributes.collect{ |k,v| "#{k}=#{v}" }
341          end.next("nest") do |m|
342            m.first("p") do |c|
343              @f << c.text
344            end.next("ol") do |o|
345              o.enumtag("li") do |i| @f << i.text.strip end
346            end
347          end.next("ol") do |o|
348            o.enumtag("li") do |i| @f << i.text.strip end
349          end
350        end
351      end
352      @a.each_block("sub") do |y|
353        @f << y.text.strip
354      end
355    }
356    o = [ "B1", "B2", "B3",
357          "TOP", "align=R", "SECOND",
358          "C1", "C2", "C3", "C4",
359          "D1", "D2", "D3", "D4",
360          "S1", "S2" ]
361    assert_equal( o, @f )
362  end
363
364end
365
366=end