/HarvestMan-twisted/pageparser.py
import re
from sgmllib import SGMLParser

from urltypes import *
from macros import *

class ParseTag(object):
    """ Class representing a tag which is parsed by the HTML parser(s) """

    def __init__(self, tag, tagdict, pattern=None, enabled=True):
        # Tag is the name of the tag (element) which will be parsed.
        # Tagdict is a dictionary mapping each tag attribute we are
        # interested in to the type of URL its value will be saved as.
        # If an attribute maps to more than one URL type, the value
        # is a list of types.

        # For example, valid tagdicts are {'href': [URL_TYPE_ANY, URL_TYPE_ANCHOR]}
        # and {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code': URL_TYPE_JAPPLET}.
        self.tag = tag
        self.tagdict = tagdict
        self.enabled = enabled
        self.pattern = pattern

    def disable(self):
        """ Disable parsing of this tag """
        self.enabled = False

    def enable(self):
        """ Enable parsing of this tag """
        self.enabled = True

    def isEnabled(self):
        """ Is this tag enabled? """

        return self.enabled

    def setPattern(self, pattern):
        self.pattern = pattern

    def __eq__(self, item):
        return self.tag.lower() == item.lower()

class HarvestManSimpleParser(SGMLParser):
    """ An HTML/XHTML parser derived from SGMLParser """

    # query_re = re.compile(r'[-.:_a-zA-Z0-9]*\?[-.:_a-zA-Z0-9]*=[-.a:_-zA-Z0-9]*', re.UNICODE)
    # A more lenient form of query regular expression
    query_re = re.compile(r'([^&=\?]*\?)([^&=\?]*=[^&=\?])*', re.UNICODE)
    skip_re = re.compile(r'(javascript:)|(mailto:)|(news:)')
    # Junk URLs obtained by parsing the HTML of web-directory pages,
    # i.e. pages with the title "Index of ...". The filtering is done
    # after looking at the title of the page.
    index_page_re = re.compile(r'(\?[a-zA-Z0-9]=[a-zA-Z0-9])')

    features = [ ParseTag('a', {'href': URL_TYPE_ANY}),
                 ParseTag('base', {'href': URL_TYPE_BASE}),
                 ParseTag('frame', {'src': URL_TYPE_FRAME}),
                 ParseTag('img', {'src': URL_TYPE_IMAGE}),
                 ParseTag('form', {'action': URL_TYPE_FORM}),
                 ParseTag('link', {'href': URL_TYPE_ANY}),
                 ParseTag('body', {'background': URL_TYPE_IMAGE}),
                 ParseTag('script', {'src': URL_TYPE_JAVASCRIPT}),
                 ParseTag('applet', {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code': URL_TYPE_JAPPLET}),
                 ParseTag('area', {'href': URL_TYPE_ANY}),
                 ParseTag('meta', {'CONTENT': URL_TYPE_ANY, 'content': URL_TYPE_ANY}),
                 ParseTag('embed', {'src': URL_TYPE_ANY}),
                 ParseTag('object', {'data': URL_TYPE_ANY}),
                 ParseTag('option', {'value': URL_TYPE_ANY}, enabled=False) ]

    handled_rel_types = ( URL_TYPE_STYLESHEET, )

    def __init__(self):
        self.url = None
        self.links = []
        self.linkpos = {}
        self.images = []
        # Keywords
        self.keywords = []
        # Description of page
        self.description = ''
        # Title of page
        self.title = ''
        self.title_flag = True
        # Fix for <base href="..."> links
        self.base_href = False
        # Base url for above
        self.base = None
        # Anchor links flag
        self._anchors = True
        # For META robots tag
        self.can_index = True
        self.can_follow = True
        # Current tag
        self._tag = ''
        SGMLParser.__init__(self)
        # Type
        self.typ = 0

    def save_anchors(self, value):
        """ Set the save anchor links flag """

        # Warning: if you set this to True, anchor links on
        # webpages will be saved as separate files.
        self._anchors = value
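    # An illustrative sketch (not part of the original code): since
    # ParseTag.__eq__ compares tag names case-insensitively, the
    # 'features' list can be searched with plain strings, which is
    # what enable_feature and disable_feature below rely on:
    #
    #   parser = HarvestManSimpleParser()
    #   parser.disable_feature('img')    # skip <img src="..."> URLs
    #   parser.enable_feature('option')  # parse <option value="..."> URLs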
    def enable_feature(self, tag):
        """ Enable the given tag feature if it is disabled """

        if tag in self.features:
            parsetag = self.features[self.features.index(tag)]
            parsetag.enable()

    def disable_feature(self, tag):
        """ Disable the given tag feature if it is enabled """

        if tag in self.features:
            parsetag = self.features[self.features.index(tag)]
            parsetag.disable()

    def filter_link(self, link):
        """ Filter links; we decide here whether to
        handle certain kinds of links """

        if not link:
            return LINK_EMPTY

        # Ignore javascript links. (Since version 1.2, javascript
        # links of the form *.js are fetched, but the actual
        # javascript actions are still ignored since there is no
        # javascript engine.)
        llink = link.lower()

        # Skip javascript, mailto, news and directory special tags.
        if self.skip_re.match(llink):
            return LINK_FILTERED

        # If this is a web-directory index page, check for a match
        # with the junk URLs of such index pages.
        if self.title.lower().startswith('index of'):
            if self.index_page_re.match(llink):
                # print 'Filtering link',llink
                return LINK_FILTERED

        # Check if we're accepting query style URLs
        if not objects.config.getquerylinks and self.query_re.search(llink):
            debug('Query filtering link', link)
            return LINK_FILTERED

        return LINK_NOT_FILTERED

    def handle_anchor_links(self, link):
        """ Handle links of the form html#... """

        # If this is an anchor link, get rid of the #... part
        # and add only the webpage link.
        if not link:
            return LINK_EMPTY

        # Need to do this here also
        self.check_add_link(URL_TYPE_ANCHOR, link)

        # No point in getting #anchor sort of links
        # since typically they point to anchors in the
        # same page

        index = link.rfind('.html#')
        if index != -1:
            newhref = link[:(index + 5)]
            self.check_add_link(URL_TYPE_WEBPAGE, newhref)
            return ANCHOR_LINK_FOUND
        else:
            index = link.rfind('.htm#')
            if index != -1:
                newhref = link[:(index + 4)]
                self.check_add_link(URL_TYPE_WEBPAGE, newhref)
                return ANCHOR_LINK_FOUND

        return ANCHOR_LINK_NOT_FOUND
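    # For illustration (hypothetical URL, not from the original code):
    # given the link 'docs/intro.html#setup', handle_anchor_links above
    # records the full link with type URL_TYPE_ANCHOR and also adds
    # 'docs/intro.html' with type URL_TYPE_WEBPAGE, so the crawl works
    # with the fragment-free URL.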
    def unknown_starttag(self, tag, attrs):
        """ This method gives you each tag in the HTML
        page along with its attributes as a list of
        tuples """

        # Raise event for anybody interested in catching a tagparse event...
        if objects.eventmgr and objects.eventmgr.raise_event('beforetag', self.url, None, tag=tag, attrs=attrs)==False:
            # Don't parse this tag..
            return

        # Set as current tag
        self._tag = tag
        # print self._tag, attrs

        if not attrs: return
        isBaseTag = not self.base and tag == 'base'
        # print 'Base=>',isBaseTag

        if tag in self.features:

            d = CaselessDict(attrs)
            parsetag = self.features[self.features.index(tag)]

            # Don't do anything if the feature is disabled
            if not parsetag.isEnabled():
                return

            tagdict = parsetag.tagdict

            link = ''

            for key, typ in tagdict.items():
                # If there is a <base href="..."> tag,
                # set self.base_href
                if isBaseTag and key=='href':
                    self.base_href = True
                    try:
                        self.base = d[key]
                    except KeyError:
                        self.base_href = False
                        continue

                # If the link already has a value, skip
                # (except for applet tags).
                if tag != 'applet':
                    if link: continue

                if tag == 'link':
                    try:
                        # Fix - only reset typ if it is one
                        # of the valid handled rel types.
                        foundtyp = d['rel'].lower()
                        if foundtyp in self.handled_rel_types:
                            typ = getTypeClass(foundtyp)
                    except KeyError:
                        pass

                try:
                    if tag == 'meta':
                        # Handle meta tag for refresh
                        foundtyp = d.get('http-equiv','').lower()
                        if foundtyp == 'refresh':
                            link = d.get(key,'')
                            if not link: continue
                            # This will be of the form of either
                            # a time-gap (CONTENT="600") or a time-gap
                            # with a URL (CONTENT="0; URL=<url>")
                            items = link.split(';')
                            if len(items)==1:
                                # Only a time-gap, skip it
                                continue
                            elif len(items)==2:
                                # Second one should be a URL
                                reqd = items[1]
                                # print 'Reqd=>',reqd
                                if (reqd.find('URL') != -1 or reqd.find('url') != -1) and reqd.find('=') != -1:
                                    # Split only on the first '=' so URLs
                                    # containing '=' are not truncated.
                                    link = reqd.split('=', 1)[1].strip()
                                    # print 'Link=>',link
                                else:
                                    continue
                        else:
                            # Handle robots meta tag
                            name = d.get('name','').lower()
                            if name=='robots':
                                robots = d.get('content','').lower()
                                # Split on ','
                                contents = [item.strip() for item in robots.split(',')]
                                # Check for nofollow
                                self.can_follow = not ('nofollow' in contents)
                                # Check for noindex
                                self.can_index = not ('noindex' in contents)
                            elif name=='keywords':
                                self.keywords = d.get('content','').split(',')
                                # Trim the keywords list
                                self.keywords = [word.lower().strip() for word in self.keywords]
                            elif name=='description':
                                self.description = d.get('content','').strip()
                            else:
                                continue

                    elif tag != 'applet':
                        link = d[key]
                    else:
                        link += d[key]
                        if key == 'codebase':
                            if link:
                                if link[-1] != '/':
                                    link += '/'
                            continue
                except KeyError:
                    continue

                # See if this link is to be filtered
                if self.filter_link(link) != LINK_NOT_FILTERED:
                    # print 'Filtered link',link
                    continue

                # Anchor links in a page should not be saved
                # index = link.find('#')

                # Make sure not to wrongly categorize '#' in query strings
                # as anchor URLs.
                if link.find('#') != -1 and not self.query_re.search(link):
                    # print 'Is an anchor link',link
                    self.handle_anchor_links(link)
                else:
                    # Append to private list of links
                    self.check_add_link(typ, link)
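    # Illustration of the meta-refresh handling above (hypothetical tag,
    # not from the original code):
    #
    #   <meta http-equiv="refresh" content="0; URL=http://www.foo.com/next.html">
    #
    # splits on ';' into ['0', ' URL=http://www.foo.com/next.html']; the
    # second item yields the redirect URL, which is then filtered and
    # added like any other link.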
    def unknown_endtag(self, tag):

        self._tag = ''
        if tag=='title':
            self.title_flag = False
            self.title = self.title.strip()

    def handle_data(self, data):

        if self._tag.lower()=='title' and self.title_flag:
            self.title += data

    def check_add_link(self, typ, link):
        """ Add a link, avoiding duplicates """

        if typ == 'image':
            if not (typ, link) in self.images:
                self.images.append((typ, link))
        elif not (typ, link) in self.links:
            # print 'Adding link ', link, typ
            pos = self.getpos()
            self.links.append((typ, link))
            self.linkpos[(typ,link)] = (pos[0],pos[1])

    def add_tag_info(self, taginfo):
        """ Add new tag information to this object.
        This can be used to change the behavior of this class
        at runtime by adding new tags """

        # The taginfo object should be a dictionary
        # of the form { tagtype : (elementname, elementtype) }

        # e.g.: { 'body' : ('background', 'img') }
        if type(taginfo) != dict:
            raise AttributeError, "Attribute type mismatch, taginfo should be a dictionary!"

        # Get the key of the dictionary
        key = (taginfo.keys())[0]
        if len(taginfo[key]) != 2:
            raise ValueError, 'Value mismatch, size of tag tuple should be 2'

        # Get the value tuple
        tagelname, tageltype = taginfo[key]

        # See if this is an already existing tagtype.
        # (Note: this method still operates on the legacy 'self.handled'
        # mapping, which is not initialized by this class.)
        if key in self.handled:
            _values = self.handled[key]

            f=0
            for index in xrange(len(_values)):
                # If the element name is also the
                # same, just replace it.
                v = _values[index]

                elname, eltype = v
                if elname == tagelname:
                    f=1
                    _values[index] = (tagelname, tageltype)
                    break

            # New element, add it to the list
            if f==0: _values.append((tagelname, tageltype))
            return
        else:
            # New key, directly modify the dictionary
            elements = []
            elements.append((tagelname, tageltype))
            self.handled[key] = elements

    def reset(self):
        SGMLParser.reset(self)

        self.url = None
        self.base = None
        self.links = []
        self.images = []
        # Also clear saved link positions so stale
        # entries do not survive a reset.
        self.linkpos = {}
        self.base_href = False
        self.base_url = ''
        self.can_index = True
        self.can_follow = True
        self.title = ''
        self.title_flag = True
        self.description = ''
        self.keywords = []

    def base_url_defined(self):
        """ Return whether this url had a base url of
        the form <base href='...'> defined """

        return self.base_href

    def get_base_url(self):
        return self.base

    def set_url(self, url):
        """ Set the URL whose data is about to be parsed """
        self.url = url
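# A minimal usage sketch for the parser above (assumptions: 'html_data'
# is an HTML string, and the 'objects' globals imported from macros have
# been initialized by the HarvestMan runtime, since filter_link consults
# objects.config):
#
#   parser = HarvestManSimpleParser()
#   parser.set_url('http://www.example.com/index.html')
#   parser.feed(html_data)
#   parser.close()
#   for typ, link in parser.links:
#       print typ, link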
class HarvestManSGMLOpParser(HarvestManSimpleParser):
    """ A parser based on effbot's sgmlop """

    def __init__(self):
        # This module should be built already!
        import sgmlop

        self.parser = sgmlop.SGMLParser()
        self.parser.register(self)
        HarvestManSimpleParser.__init__(self)
        # Type
        self.typ = 1

    def finish_starttag(self, tag, attrs):
        self.unknown_starttag(tag, attrs)

    def finish_endtag(self, tag):
        self.unknown_endtag(tag)

    def feed(self, data):
        self.parser.feed(data)

class HarvestManCSSParser(object):
    """ Class to parse stylesheets and extract URLs """

    # Regexps to parse stylesheet imports. Note: the '-' is escaped
    # inside the character classes so it matches a literal hyphen
    # instead of forming a '.-:' character range.
    importcss1 = re.compile(r'(\@import\s+\"?)(?!url)([\w.\-:/]+)(\"?)', re.MULTILINE|re.LOCALE|re.UNICODE)
    importcss2 = re.compile(r'(\@import\s+url\(\"?)([\w.\-:/]+)(\"?\))', re.MULTILINE|re.LOCALE|re.UNICODE)
    # Regexp to parse URLs inside CSS files
    cssurl = re.compile(r'(url\()([^\)]+)(\))', re.LOCALE|re.UNICODE)

    def __init__(self):
        # Any imported stylesheet URLs
        self.csslinks = []
        # All URLs including the above
        self.links = []

    def feed(self, data):
        self._parse(data)

    def _parse(self, data):
        """ Parse stylesheet data and extract imported css links, if any """

        # The result is a list of imported css links.
        # This subroutine uses the specification mentioned at
        # http://www.w3.org/TR/REC-CSS2/cascade.html#at-import
        # for doing stylesheet imports.

        # This takes care of @import "style.css",
        # @import url("style.css") and url(...) syntax.
        # Media types, if specified, are ignored.

        # Matches for @import "style.css"
        l1 = self.importcss1.findall(data)
        # Matches for @import url("style.css")
        l2 = self.importcss2.findall(data)
        # Matches for url(...)
        l3 = self.cssurl.findall(data)

        for item in (l1+l2):
            if not item: continue
            url = item[1].replace("'",'').replace('"','')
            self.csslinks.append(url)
            self.links.append(url)

        for item in l3:
            if not item: continue
            url = item[1].replace("'",'').replace('"','')
            if url not in self.links:
                self.links.append(url)
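if __name__ == '__main__':
    # A small self-test sketch (illustrative CSS data, not part of the
    # original module) demonstrating the three syntaxes handled by
    # HarvestManCSSParser._parse.
    css_data = """
    @import "base.css";
    @import url("theme-dark.css") screen;
    body { background: url(images/bg.png); }
    """
    cssp = HarvestManCSSParser()
    cssp.feed(css_data)
    # Imported stylesheets only
    print cssp.csslinks
    # All extracted URLs, imports included
    print cssp.links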