
/HarvestMan-twisted/pageparser.py

http://harvestman-crawler.googlecode.com/
import re
from sgmllib import SGMLParser

from urltypes import *
from macros import *
# NOTE: objects, debug and CaselessDict come from HarvestMan's common
# module; this flat-layout import path is an assumption.
from common import objects, debug, CaselessDict

class ParseTag(object):
    """ Class representing a tag which is parsed by the HTML parser(s) """
    
    def __init__(self, tag, tagdict, pattern=None, enabled=True):
        # 'tag' is the name of the tag (element) which will be parsed.
        # 'tagdict' is a dictionary mapping the tag attributes we are
        # interested in (keys) to the type of URL the attribute's value
        # will be saved as (values). If there is more than one URL type
        # for an attribute key, the value is a list.
        
        # For example, valid tagdicts are {'href': [URL_TYPE_ANY, URL_TYPE_ANCHOR]}
        # and {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code': URL_TYPE_JAPPLET}.
        self.tag = tag
        self.tagdict = tagdict
        self.enabled = enabled
        self.pattern = pattern

    def disable(self):
        """ Disable parsing of this tag """
        self.enabled = False

    def enable(self):
        """ Enable parsing of this tag """
        self.enabled = True

    def isEnabled(self):
        """ Is this tag enabled? """
        
        return self.enabled

    def setPattern(self, pattern):
        self.pattern = pattern

    def __eq__(self, item):
        return self.tag.lower() == item.lower()
    
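# A minimal usage sketch (illustrative, not part of the original module):
# a ParseTag for <img> whose 'src' attribute is harvested as an image URL.
#
#   img_tag = ParseTag('img', {'src': URL_TYPE_IMAGE})
#   img_tag.isEnabled()     # -> True
#   img_tag == 'IMG'        # -> True; comparison is case-insensitive
#   img_tag.disable()       # skip <img> tags during parsing
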
class HarvestManSimpleParser(SGMLParser):
    """ An HTML/XHTML parser derived from SGMLParser """

    # query_re = re.compile(r'[-.:_a-zA-Z0-9]*\?[-.:_a-zA-Z0-9]*=[-.:_a-zA-Z0-9]*', re.UNICODE)
    # A more lenient form of query regular expression
    query_re = re.compile(r'([^&=\?]*\?)([^&=\?]*=[^&=\?])*', re.UNICODE)
    skip_re = re.compile(r'(javascript:)|(mailto:)|(news:)')
    # Junk URLs obtained by parsing HTML of web-directory pages,
    # i.e. pages with the title "Index of ...". The filtering is done
    # after looking at the title of the page.
    index_page_re = re.compile(r'(\?[a-zA-Z0-9]=[a-zA-Z0-9])')
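
    # Illustrative matches for the patterns above (assumptions, shown
    # for documentation only):
    #   query_re.search('result.php?id=3')       matches -- query-style URL
    #   skip_re.match('mailto:someone@foo.com')  matches -- skipped outright
    #   index_page_re.match('?C=N;O=D')          matches -- Apache "Index of"
    #                                            sort links on directory pages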

    features = [ ParseTag('a', {'href': URL_TYPE_ANY}),
                 ParseTag('base', {'href' : URL_TYPE_BASE}),
                 ParseTag('frame', {'src' : URL_TYPE_FRAME}),
                 ParseTag('img', {'src': URL_TYPE_IMAGE}),
                 ParseTag('form', {'action': URL_TYPE_FORM}),
                 ParseTag('link', {'href': URL_TYPE_ANY}),
                 ParseTag('body', {'background' : URL_TYPE_IMAGE}),
                 ParseTag('script', {'src': URL_TYPE_JAVASCRIPT}),
                 ParseTag('applet', {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code' : URL_TYPE_JAPPLET}),
                 ParseTag('area', {'href': URL_TYPE_ANY}),
                 ParseTag('meta', {'CONTENT': URL_TYPE_ANY, 'content': URL_TYPE_ANY}),
                 ParseTag('embed', {'src': URL_TYPE_ANY}),
                 ParseTag('object', {'data': URL_TYPE_ANY}),
                 ParseTag('option', {'value': URL_TYPE_ANY}, enabled=False) ]

    handled_rel_types = ( URL_TYPE_STYLESHEET, )
    
    def __init__(self):
        self.url = None
        self.links = []
        self.linkpos = {}
        self.images = []
        # Keywords
        self.keywords = []
        # Description of page
        self.description = ''
        # Title of page
        self.title = ''
        self.title_flag = True
        # Fix for <base href="..."> links
        self.base_href = False
        # Base url for above
        self.base = None
        # Anchor links flag
        self._anchors = True
        # For META robots tag
        self.can_index = True
        self.can_follow = True
        # Current tag
        self._tag = ''
        SGMLParser.__init__(self)
        # Type
        self.typ = 0
        
    def save_anchors(self, value):
        """ Set the save anchor links flag """

        # Warning: If you set this to true, anchor links on
        # webpages will be saved as separate files.
        self._anchors = value

    def enable_feature(self, tag):
        """ Enable the given tag feature if it is disabled """

        if tag in self.features:
            parsetag = self.features[self.features.index(tag)]
            parsetag.enable()

    def disable_feature(self, tag):
        """ Disable the given tag feature if it is enabled """

        if tag in self.features:
            parsetag = self.features[self.features.index(tag)]
            parsetag.disable()
                
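    # For example, disable_feature('img') stops image harvesting, and
    # enable_feature('option') turns on the <option> tag, which is
    # disabled by default in the features list.
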
    def filter_link(self, link):
        """ Function to filter links; we decide here whether
        to handle certain kinds of links """

        if not link:
            return LINK_EMPTY

        # Ignore javascript links. (Since version 1.2, javascript
        # links of the form .js are fetched, but we still ignore
        # actual javascript actions since there is no javascript
        # engine.)
        llink = link.lower()

        # Skip javascript, mailto, news and directory special tags.
        if self.skip_re.match(llink):
            return LINK_FILTERED

        # If this is a web-directory index page, then check for a
        # match with junk URLs of such index pages
        if self.title.lower().startswith('index of'):
            if self.index_page_re.match(llink):
                # print 'Filtering link',llink
                return LINK_FILTERED
            
        # Check if we're accepting query style URLs
        if not objects.config.getquerylinks and self.query_re.search(llink):
            debug('Query filtering link', link)
            return LINK_FILTERED

        return LINK_NOT_FILTERED

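    # Illustrative outcomes (assuming query links are disabled in the
    # crawler configuration):
    #   filter_link('')                   -> LINK_EMPTY
    #   filter_link('mailto:me@foo.com')  -> LINK_FILTERED
    #   filter_link('result.php?id=3')    -> LINK_FILTERED
    #   filter_link('index.html')         -> LINK_NOT_FILTERED
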
    def handle_anchor_links(self, link):
        """ Handle links of the form html#... """

        # If this is an anchor link, get rid of the anchor #...
        # part and only add the webpage link
        if not link:
            return LINK_EMPTY

        # Need to do this here also
        self.check_add_link(URL_TYPE_ANCHOR, link)

        # No point in getting #anchor sort of links
        # since typically they point to anchors in the
        # same page

        index = link.rfind('.html#')
        if index != -1:
            newhref = link[:(index + 5)]
            self.check_add_link(URL_TYPE_WEBPAGE, newhref)
            return ANCHOR_LINK_FOUND
        else:
            index = link.rfind('.htm#')
            if index != -1:
                newhref = link[:(index + 4)]
                self.check_add_link(URL_TYPE_WEBPAGE, newhref)
                return ANCHOR_LINK_FOUND

        return ANCHOR_LINK_NOT_FOUND

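    # For example, 'guide.html#intro' is recorded above as an anchor URL,
    # and 'guide.html' itself is queued as a webpage URL.
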
    def unknown_starttag(self, tag, attrs):
        """ This method gives you the tag in the HTML
        page along with its attributes as a list of
        tuples """

        # Raise event for anybody interested in catching a tagparse event...
        if objects.eventmgr and objects.eventmgr.raise_event('beforetag', self.url, None, tag=tag, attrs=attrs)==False:
            # Don't parse this tag
            return
                                     
        # Set as current tag
        self._tag = tag
        # print self._tag, attrs
        
        if not attrs: return
        isBaseTag = not self.base and tag == 'base'
        # print 'Base=>',isBaseTag
        
        if tag in self.features:

            d = CaselessDict(attrs)
            parsetag = self.features[self.features.index(tag)]

            # Don't do anything if the feature is disabled
            if not parsetag.isEnabled():
                return
            
            tagdict = parsetag.tagdict
            
            link = ''

            for key, typ in tagdict.items():
                # If there is a <base href="..."> tag,
                # set self.base_href
                if isBaseTag and key=='href':
                    self.base_href = True
                    try:
                        self.base = d[key]
                    except KeyError:
                        self.base_href = False
                        continue
                
                # If the link already has a value, skip
                # (except for applet tags)
                if tag != 'applet':
                    if link: continue

                if tag == 'link':
                    try:
                        # Fix - only reset typ if it is one
                        # of the valid handled rel types.
                        foundtyp = d['rel'].lower()
                        if foundtyp in self.handled_rel_types:
                            typ = getTypeClass(foundtyp)
                    except KeyError:
                        pass

                try:
                    if tag == 'meta':
                        # Handle meta tag for refresh
                        foundtyp = d.get('http-equiv','').lower()
                        if foundtyp == 'refresh':
                            link = d.get(key,'')
                            if not link: continue
                            # This will be of the form of either
                            # a time-gap (CONTENT="600") or a time-gap
                            # with a URL (CONTENT="0; URL=<url>")
                            items = link.split(';')
                            if len(items)==1:
                                # Only a time-gap, skip it
                                continue
                            elif len(items)==2:
                                # Second one should be a URL
                                reqd = items[1]
                                # print 'Reqd=>',reqd
                                if (reqd.find('URL') != -1 or reqd.find('url') != -1) and reqd.find('=') != -1:
                                    # Split only on the first '=', so URLs
                                    # which themselves contain '=' survive
                                    link = reqd.split('=', 1)[1].strip()
                                    # print 'Link=>',link
                                else:
                                    continue
                        else:
                            # Handle robots meta tag
                            name = d.get('name','').lower()
                            if name=='robots':
                                robots = d.get('content','').lower()
                                # Split on ','
                                contents = [item.strip() for item in robots.split(',')]
                                # Check for nofollow
                                self.can_follow = not ('nofollow' in contents)
                                # Check for noindex
                                self.can_index = not ('noindex' in contents)
                            elif name=='keywords':
                                self.keywords = d.get('content','').split(',')
                                # Trim the keywords list
                                self.keywords = [word.lower().strip() for word in self.keywords]
                            elif name=='description':
                                self.description = d.get('content','').strip()
                            else:
                                continue

                    elif tag != 'applet':
                        link = d[key]
                    else:
                        link += d[key]
                        if key == 'codebase':
                            if link:
                                if link[-1] != '/':
                                    link += '/'
                            continue                                
                except KeyError:
                    continue

                # See if this link is to be filtered
                if self.filter_link(link) != LINK_NOT_FILTERED:
                    # print 'Filtered link',link
                    continue

                # Anchor links in a page should not be saved
                # index = link.find('#')

                # Make sure not to wrongly categorize '#' in query strings
                # as anchor URLs.
                if link.find('#') != -1 and not self.query_re.search(link):
                    # print 'Is an anchor link',link
                    self.handle_anchor_links(link)
                else:
                    # Append to the private list of links
                    self.check_add_link(typ, link)

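    # For instance, <meta http-equiv="refresh" content="0; URL=next.html">
    # yields the link 'next.html', and <meta name="robots"
    # content="noindex,nofollow"> turns off both can_index and can_follow.
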
    def unknown_endtag(self, tag):
            
        self._tag = ''
        if tag=='title':
            self.title_flag = False
            self.title = self.title.strip()
            
    def handle_data(self, data):

        if self._tag.lower()=='title' and self.title_flag:
            self.title += data

    def check_add_link(self, typ, link):
        """ To avoid adding duplicate links """

        if typ == 'image':
            if not (typ, link) in self.images:
                self.images.append((typ, link))
        elif not (typ, link) in self.links:
            # print 'Adding link ', link, typ
            pos = self.getpos()
            self.links.append((typ, link))
            self.linkpos[(typ,link)] = (pos[0],pos[1])
            

    def add_tag_info(self, taginfo):
        """ Add new tag information to this object.
        This can be used to change the behavior of this class
        at runtime by adding new tags """

        # The taginfo object should be a dictionary
        # of the form { tagtype : (elementname, elementtype) }

        # e.g: { 'body' : ('background', 'img') }
        if type(taginfo) != dict:
            raise AttributeError("Attribute type mismatch, taginfo should be a dictionary!")

        # Get the key of the dictionary
        key = (taginfo.keys())[0]
        if len(taginfo[key]) != 2:
            raise ValueError('Value mismatch, size of tag tuple should be 2')

        # Get the value tuple
        tagelname, tageltype = taginfo[key]

        # See if this is an already existing tagtype.
        # NOTE: self.handled is the tag mapping maintained by earlier
        # versions of this parser and is assumed to be present here.
        if key in self.handled:
            _values = self.handled[key]

            f = 0
            for index in xrange(len(_values)):
                # If the element name is also
                # the same, just replace it.
                v = _values[index]

                elname, eltype = v
                if elname == tagelname:
                    f = 1
                    _values[index] = (tagelname, tageltype)
                    break

            # New element, add it to the list
            if f == 0: _values.append((tagelname, tageltype))
            return 
        else:
            # New key, directly modify the dictionary
            elements = []
            elements.append((tagelname, tageltype))
            self.handled[key] = elements 

    def reset(self):
        SGMLParser.reset(self)

        self.url = None
        self.base = None
        self.links = []
        self.images = []
        # Clear stored link positions as well
        self.linkpos = {}
        self.base_href = False
        self.base_url = ''
        self.can_index = True
        self.can_follow = True
        self.title = ''
        self.title_flag = True
        self.description = ''
        self.keywords = []
        
    def base_url_defined(self):
        """ Return whether this url had a base URL of the
        form <base href='...'> defined """

        return self.base_href

    def get_base_url(self):
        return self.base

    def set_url(self, url):
        """ Set the URL whose data is about to be parsed """
        self.url = url
412
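# A minimal driver sketch (illustrative; the exact URL type labels in
# the output depend on the string-like constants defined in urltypes):
#
#   parser = HarvestManSimpleParser()
#   parser.set_url('http://example.com/')
#   parser.feed('<html><head><title>Demo</title></head>'
#               '<body><a href="page.html">link</a>'
#               '<img src="logo.png"></body></html>')
#   parser.close()
#   # parser.links   -> [(URL_TYPE_ANY, 'page.html')]
#   # parser.images  -> [(URL_TYPE_IMAGE, 'logo.png')]
#   # parser.title   -> 'Demo'
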
class HarvestManSGMLOpParser(HarvestManSimpleParser):
    """ A parser based on effbot's sgmlop """

    def __init__(self):
        # This module should be built already!
        import sgmlop
        
        self.parser = sgmlop.SGMLParser()
        self.parser.register(self)
        HarvestManSimpleParser.__init__(self)
        # Type
        self.typ = 1
        
    def finish_starttag(self, tag, attrs):
        self.unknown_starttag(tag, attrs)

    def finish_endtag(self, tag):
        self.unknown_endtag(tag)        

    def feed(self, data):
        self.parser.feed(data)
        
class HarvestManCSSParser(object):
    """ Class to parse stylesheets and extract URLs """

    # Regexp to parse stylesheet imports
    importcss1 = re.compile(r'(\@import\s+\"?)(?!url)([\w.\-:/]+)(\"?)', re.MULTILINE|re.LOCALE|re.UNICODE)
    importcss2 = re.compile(r'(\@import\s+url\(\"?)([\w.\-:/]+)(\"?\))', re.MULTILINE|re.LOCALE|re.UNICODE)
    # Regexp to parse URLs inside CSS files
    cssurl = re.compile(r'(url\()([^\)]+)(\))', re.LOCALE|re.UNICODE)

    def __init__(self):
        # Any imported stylesheet URLs
        self.csslinks = []
        # All URLs including the above
        self.links = []

    def feed(self, data):
        self._parse(data)
        
    def _parse(self, data):
        """ Parse stylesheet data and extract imported css links, if any """

        # Imported css links are appended to self.csslinks, and all
        # extracted URLs (including those) to self.links.
        # This subroutine uses the specification mentioned at
        # http://www.w3.org/TR/REC-CSS2/cascade.html#at-import
        # for doing stylesheet imports.

        # This takes care of @import "style.css",
        # @import url("style.css") and url(...) syntax.
        # Media types specified, if any, are ignored.
        
        # Matches for @import "style.css"
        l1 = self.importcss1.findall(data)
        # Matches for @import url("style.css")
        l2 = self.importcss2.findall(data)
        # Matches for url(...)
        l3 = self.cssurl.findall(data)
        
        for item in (l1+l2):
            if not item: continue
            url = item[1].replace("'",'').replace('"','')
            self.csslinks.append(url)
            self.links.append(url)
            
        for item in l3:
            if not item: continue
            url = item[1].replace("'",'').replace('"','')
            if url not in self.links:
                self.links.append(url)
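
# An illustrative usage sketch for HarvestManCSSParser:
#
#   cssp = HarvestManCSSParser()
#   cssp.feed('@import "base.css"; h1 { background: url(img/banner.png); }')
#   # cssp.csslinks -> ['base.css']
#   # cssp.links    -> ['base.css', 'img/banner.png']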