
/HarvestMan-twisted/pageparser.py

http://harvestman-crawler.googlecode.com/
Possible License(s): GPL-2.0

import re
from sgmllib import SGMLParser

from urltypes import *
from macros import *

# Note: objects, debug and CaselessDict used below are provided by other
# HarvestMan modules; their imports are not shown in this listing.

class ParseTag(object):
    """ Class representing a tag which is parsed by the HTML parser(s) """

    def __init__(self, tag, tagdict, pattern=None, enabled=True):
        # Tag is the name of the tag (element) which will be parsed.
        # Tagdict is a dictionary whose keys are the tag attributes we
        # are interested in and whose values are the URL types the
        # attribute values will be saved as. If there is more than one
        # URL type for an attribute key, the value is a list.
        # For example, valid tagdicts are {'href': [URL_TYPE_ANY, URL_TYPE_ANCHOR]}
        # and {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code': URL_TYPE_JAPPLET}.
        self.tag = tag
        self.tagdict = tagdict
        self.enabled = enabled
        self.pattern = pattern

    def disable(self):
        """ Disable parsing of this tag """
        self.enabled = False

    def enable(self):
        """ Enable parsing of this tag """
        self.enabled = True

    def isEnabled(self):
        """ Is this tag enabled ? """
        return self.enabled

    def setPattern(self, pattern):
        self.pattern = pattern

    def __eq__(self, item):
        return self.tag.lower() == item.lower()
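
# Illustrative only, not part of the original file: how a ParseTag is
# created and toggled at runtime (URL_TYPE_IMAGE comes from urltypes).
#
#   img_tag = ParseTag('img', {'src': URL_TYPE_IMAGE})
#   img_tag.disable()       # the parsers will now skip <img src="..."> URLs
#   img_tag == 'IMG'        # True -- tag comparison is case-insensitive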

class HarvestManSimpleParser(SGMLParser):
    """ An HTML/XHTML parser derived from SGMLParser """

    # query_re = re.compile(r'[-.:_a-zA-Z0-9]*\?[-.:_a-zA-Z0-9]*=[-.:_a-zA-Z0-9]*', re.UNICODE)
    # A more lenient form of query regular expression
    query_re = re.compile(r'([^&=\?]*\?)([^&=\?]*=[^&=\?])*', re.UNICODE)
    skip_re = re.compile(r'(javascript:)|(mailto:)|(news:)')
    # Junk URLs obtained by parsing the HTML of web-directory pages,
    # i.e. pages with a title of the form "Index of ...". The filtering
    # is done after looking at the title of the page.
    index_page_re = re.compile(r'(\?[a-zA-Z0-9]=[a-zA-Z0-9])')
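
    # Illustrative matches, not part of the original file:
    #   query_re.search('showthread.php?id=10')  matches (query-style URL)
    #   index_page_re.match('?N=D')              matches (sort link on an "Index of" page)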

    features = [ ParseTag('a', {'href': URL_TYPE_ANY}),
                 ParseTag('base', {'href' : URL_TYPE_BASE}),
                 ParseTag('frame', {'src' : URL_TYPE_FRAME}),
                 ParseTag('img', {'src': URL_TYPE_IMAGE}),
                 ParseTag('form', {'action': URL_TYPE_FORM}),
                 ParseTag('link', {'href': URL_TYPE_ANY}),
                 ParseTag('body', {'background' : URL_TYPE_IMAGE}),
                 ParseTag('script', {'src': URL_TYPE_JAVASCRIPT}),
                 ParseTag('applet', {'codebase': URL_TYPE_JAPPLET_CODEBASE, 'code' : URL_TYPE_JAPPLET}),
                 ParseTag('area', {'href': URL_TYPE_ANY}),
                 ParseTag('meta', {'CONTENT': URL_TYPE_ANY, 'content': URL_TYPE_ANY}),
                 ParseTag('embed', {'src': URL_TYPE_ANY}),
                 ParseTag('object', {'data': URL_TYPE_ANY}),
                 ParseTag('option', {'value': URL_TYPE_ANY}, enabled=False) ]

    handled_rel_types = ( URL_TYPE_STYLESHEET, )

    def __init__(self):
        self.url = None
        self.links = []
        self.linkpos = {}
        self.images = []
        # Keywords
        self.keywords = []
        # Description of page
        self.description = ''
        # Title of page
        self.title = ''
        self.title_flag = True
        # Fix for <base href="..."> links
        self.base_href = False
        # Base url for above
        self.base = None
        # Anchor links flag
        self._anchors = True
        # For META robots tag
        self.can_index = True
        self.can_follow = True
        # Current tag
        self._tag = ''
        SGMLParser.__init__(self)
        # Type
        self.typ = 0

    def save_anchors(self, value):
        """ Set the save anchor links flag """
        # Warning: If you set this to true, anchor links on
        # webpages will be saved as separate files.
        self._anchors = value

    def enable_feature(self, tag):
        """ Enable the given tag feature if it is disabled """
        if tag in self.features:
            parsetag = self.features[self.features.index(tag)]
            parsetag.enable()

    def disable_feature(self, tag):
        """ Disable the given tag feature if it is enabled """
        if tag in self.features:
            parsetag = self.features[self.features.index(tag)]
            parsetag.disable()

    def filter_link(self, link):
        """ Filter links; we decide here whether to handle
        certain kinds of links """
        if not link:
            return LINK_EMPTY
        # Ignore javascript links. (From version 1.2, javascript
        # links of the form .js are fetched, but we still ignore
        # the actual javascript actions since there is no
        # javascript engine.)
        llink = link.lower()
        # Skip javascript:, mailto: and news: links.
        if self.skip_re.match(llink):
            return LINK_FILTERED
        # If this is a web-directory "Index of" page, then check
        # for a match with junk URLs of such index pages.
        if self.title.lower().startswith('index of'):
            if self.index_page_re.match(llink):
                # print 'Filtering link', llink
                return LINK_FILTERED
        # Check if we're accepting query style URLs
        if not objects.config.getquerylinks and self.query_re.search(llink):
            debug('Query filtering link', link)
            return LINK_FILTERED
        return LINK_NOT_FILTERED

    def handle_anchor_links(self, link):
        """ Handle links of the form html#... """
        # If this is an anchor link, get rid of the #... part
        # and only add the webpage link
        if not link:
            return LINK_EMPTY
        # Need to do this here also
        self.check_add_link(URL_TYPE_ANCHOR, link)
        # No point in getting #anchor sort of links
        # since typically they point to anchors in the
        # same page
        index = link.rfind('.html#')
        if index != -1:
            newhref = link[:(index + 5)]
            self.check_add_link(URL_TYPE_WEBPAGE, newhref)
            return ANCHOR_LINK_FOUND
        else:
            index = link.rfind('.htm#')
            if index != -1:
                newhref = link[:(index + 4)]
                self.check_add_link(URL_TYPE_WEBPAGE, newhref)
                return ANCHOR_LINK_FOUND
        return ANCHOR_LINK_NOT_FOUND
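
    # Illustrative, not in the original file: for link = 'guide.html#intro',
    # handle_anchor_links() records the full link under URL_TYPE_ANCHOR and
    # also adds the parent page 'guide.html' under URL_TYPE_WEBPAGE, so the
    # crawler fetches the page itself rather than the fragment.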

    def unknown_starttag(self, tag, attrs):
        """ This method gives you the tag in the HTML
        page along with its attributes as a list of
        tuples """
        # Raise event for anybody interested in catching a tagparse event...
        if objects.eventmgr and objects.eventmgr.raise_event('beforetag', self.url, None, tag=tag, attrs=attrs)==False:
            # Don't parse this tag..
            return
        # Set as current tag
        self._tag = tag
        # print self._tag, attrs
        if not attrs: return
        isBaseTag = not self.base and tag == 'base'
        # print 'Base=>',isBaseTag
        if tag in self.features:
            d = CaselessDict(attrs)
            parsetag = self.features[self.features.index(tag)]
            # Don't do anything if the feature is disabled
            if not parsetag.isEnabled():
                return
            tagdict = parsetag.tagdict
            link = ''
            for key, typ in tagdict.items():
                # If there is a <base href="..."> tag,
                # set self.base_href
                if isBaseTag and key=='href':
                    self.base_href = True
                    try:
                        self.base = d[key]
                    except:
                        self.base_href = False
                        continue
                # If the link already has a value, skip
                # (except for applet tags)
                if tag != 'applet':
                    if link: continue
                if tag == 'link':
                    try:
                        # Fix - only reset typ if it is one
                        # of the valid handled rel types.
                        foundtyp = d['rel'].lower()
                        if foundtyp in self.handled_rel_types:
                            typ = getTypeClass(foundtyp)
                    except KeyError:
                        pass
                try:
                    if tag == 'meta':
                        # Handle meta tag for refresh
                        foundtyp = d.get('http-equiv','').lower()
                        if foundtyp.lower() == 'refresh':
                            link = d.get(key,'')
                            if not link: continue
                            # This will be of the form of either
                            # a time-gap (CONTENT="600") or a time-gap
                            # with a URL (CONTENT="0; URL=<url>")
                            items = link.split(';')
                            if len(items)==1:
                                # Only a time-gap, skip it
                                continue
                            elif len(items)==2:
                                # Second one should be a URL
                                reqd = items[1]
                                # print 'Reqd=>',reqd
                                if (reqd.find('URL') != -1 or reqd.find('url') != -1) and reqd.find('=') != -1:
                                    link = reqd.split('=')[1].strip()
                                    # print 'Link=>',link
                                else:
                                    continue
                        else:
                            # Handle robots meta tag
                            name = d.get('name','').lower()
                            if name=='robots':
                                robots = d.get('content','').lower()
                                # Split on ','
                                contents = [item.strip() for item in robots.split(',')]
                                # Check for nofollow
                                self.can_follow = not ('nofollow' in contents)
                                # Check for noindex
                                self.can_index = not ('noindex' in contents)
                            elif name=='keywords':
                                self.keywords = d.get('content','').split(',')
                                # Trim the keywords list
                                self.keywords = [word.lower().strip() for word in self.keywords]
                            elif name=='description':
                                self.description = d.get('content','').strip()
                            else:
                                continue
                    elif tag != 'applet':
                        link = d[key]
                    else:
                        link += d[key]
                        if key == 'codebase':
                            if link:
                                if link[-1] != '/':
                                    link += '/'
                            continue
                except KeyError:
                    continue
                # See if this link is to be filtered
                if self.filter_link(link) != LINK_NOT_FILTERED:
                    # print 'Filtered link',link
                    continue
                # Anchor links in a page should not be saved
                # index = link.find('#')
                # Make sure not to wrongly categorize '#' in query strings
                # as anchor URLs.
                if link.find('#') != -1 and not self.query_re.search(link):
                    # print 'Is an anchor link',link
                    self.handle_anchor_links(link)
                else:
                    # Append to private list of links
                    self.check_add_link(typ, link)

    def unknown_endtag(self, tag):
        self._tag = ''
        if tag=='title':
            self.title_flag = False
            self.title = self.title.strip()

    def handle_data(self, data):
        if self._tag.lower()=='title' and self.title_flag:
            self.title += data

    def check_add_link(self, typ, link):
        """ To avoid adding duplicate links """
        if typ == 'image':
            if not (typ, link) in self.images:
                self.images.append((typ, link))
        elif not (typ, link) in self.links:
            # print 'Adding link ', link, typ
            pos = self.getpos()
            self.links.append((typ, link))
            self.linkpos[(typ,link)] = (pos[0],pos[1])

    def add_tag_info(self, taginfo):
        """ Add new tag information to this object.
        This can be used to change the behavior of this class
        at runtime by adding new tags """
        # The taginfo object should be a dictionary
        # of the form { tagtype : (elementname, elementtype) }
        # e.g. { 'body' : ('background', 'img') }
        if type(taginfo) != dict:
            raise AttributeError, "Attribute type mismatch, taginfo should be a dictionary!"
        # Get the key of the dictionary
        key = (taginfo.keys())[0]
        if len(taginfo[key]) != 2:
            raise ValueError, 'Value mismatch, size of tag tuple should be 2'
        # Get the value tuple
        tagelname, tageltype = taginfo[key]
        # See if this is an already existing tagtype
        if key in self.handled:
            _values = self.handled[key]
            f=0
            for index in xrange(len(_values)):
                # If the element name is also
                # the same, just replace it.
                v = _values[index]
                elname, eltype = v
                if elname == tagelname:
                    f=1
                    _values[index] = (tagelname, tageltype)
                    break
            # New element, add it to the list
            if f==0: _values.append((tagelname, tageltype))
            return
        else:
            # New key, directly modify the dictionary
            elements = []
            elements.append((tagelname, tageltype))
            self.handled[key] = elements

    def reset(self):
        SGMLParser.reset(self)
        self.url = None
        self.base = None
        self.links = []
        self.images = []
        self.base_href = False
        self.base_url = ''
        self.can_index = True
        self.can_follow = True
        self.title = ''
        self.title_flag = True
        self.description = ''
        self.keywords = []

    def base_url_defined(self):
        """ Return whether this url had a base url of the
        form <base href='...'> defined """
        return self.base_href

    def get_base_url(self):
        return self.base

    def set_url(self, url):
        """ Set the URL whose data is about to be parsed """
        self.url = url
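
# Illustrative usage, not part of the original file (assumes HarvestMan's
# global configuration in objects.config has been initialised, since
# filter_link() consults it):
#
#   parser = HarvestManSimpleParser()
#   parser.set_url('http://www.example.com/index.html')
#   parser.feed(html_data)        # feed()/close() come from SGMLParser
#   parser.close()
#   print parser.title            # page <title>
#   print parser.links            # list of (URL type, URL) tuples
#   print parser.images           # <img src="..."> links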

class HarvestManSGMLOpParser(HarvestManSimpleParser):
    """ A parser based on effbot's sgmlop """

    def __init__(self):
        # This module should be built already!
        import sgmlop
        self.parser = sgmlop.SGMLParser()
        self.parser.register(self)
        HarvestManSimpleParser.__init__(self)
        # Type
        self.typ = 1

    def finish_starttag(self, tag, attrs):
        self.unknown_starttag(tag, attrs)

    def finish_endtag(self, tag):
        self.unknown_endtag(tag)

    def feed(self, data):
        self.parser.feed(data)

class HarvestManCSSParser(object):
    """ Class to parse stylesheets and extract URLs """

    # Regexps to parse stylesheet imports
    importcss1 = re.compile(r'(\@import\s+\"?)(?!url)([\w.-:/]+)(\"?)', re.MULTILINE|re.LOCALE|re.UNICODE)
    importcss2 = re.compile(r'(\@import\s+url\(\"?)([\w.-:/]+)(\"?\))', re.MULTILINE|re.LOCALE|re.UNICODE)
    # Regexp to parse URLs inside CSS files
    cssurl = re.compile(r'(url\()([^\)]+)(\))', re.LOCALE|re.UNICODE)

    def __init__(self):
        # Any imported stylesheet URLs
        self.csslinks = []
        # All URLs including the above
        self.links = []

    def feed(self, data):
        self._parse(data)

    def _parse(self, data):
        """ Parse stylesheet data and extract imported css links, if any """
        # The imported css links are appended to self.csslinks.
        # This subroutine uses the specification mentioned at
        # http://www.w3.org/TR/REC-CSS2/cascade.html#at-import
        # for doing stylesheet imports.
        # It takes care of @import "style.css",
        # @import url("style.css") and url(...) syntax.
        # Media types, if specified, are ignored.
        # Matches for @import "style.css"
        l1 = self.importcss1.findall(data)
        # Matches for @import url("style.css")
        l2 = self.importcss2.findall(data)
        # Matches for url(...)
        l3 = self.cssurl.findall(data)
        for item in (l1+l2):
            if not item: continue
            url = item[1].replace("'",'').replace('"','')
            self.csslinks.append(url)
            self.links.append(url)
        for item in l3:
            if not item: continue
            url = item[1].replace("'",'').replace('"','')
            if url not in self.links:
                self.links.append(url)
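
# A minimal self-test sketch, not part of the original file, showing how
# HarvestManCSSParser picks up @import and url(...) references.
if __name__ == '__main__':
    css_data = """
    @import "base.css";
    @import url("print.css");
    body { background: url(images/bg.png); }
    """
    cssp = HarvestManCSSParser()
    cssp.feed(css_data)
    print cssp.csslinks     # imported stylesheets only
    print cssp.links        # all URLs found in the CSS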