
/scrapemark.py

https://github.com/Open-Source-GIS/scrapemark
import re
import unicodedata
import urllib, urllib2
import urlparse
import cgi
import cookielib
from htmlentitydefs import name2codepoint

verbose = True
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3'

def scrape(pattern, html=None, url=None, get=None, post=None, headers=None, cookie_jar=None):
    if type(pattern) == str:
        pattern = compile(pattern)
    return pattern.scrape(html, url, get, post, headers, cookie_jar)

def compile(pattern):
    return _Pattern(_compile(pattern, True))

def fetch_html(url, get=None, post=None, headers=None, cookie_jar=None):
    if get:
        if type(get) == str:
            get = cgi.parse_qs(get)
        l = list(urlparse.urlparse(url))
        g = cgi.parse_qs(l[4])
        g.update(get)
        l[4] = urllib.urlencode(g)
        url = urlparse.urlunparse(l)
    if post and type(post) != str:
        post = urllib.urlencode(post)
    if cookie_jar == None:
        cookie_jar = cookielib.CookieJar()
    if not headers:
        headers = {'User-Agent': user_agent}
    else:
        if 'User-Agent' not in headers:
            headers['User-Agent'] = user_agent
    if verbose:
        print 'fetching', url, '...'
    request = urllib2.Request(url, post, headers)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    res = opener.open(request).read()
    if verbose:
        print 'DONE fetching.'
    return res
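
# Example usage (illustrative sketch inferred from the pattern compiler below;
# the URL and field names are placeholders, not taken from the original file):
#
#   import scrapemark
#   result = scrapemark.scrape("""
#       {*
#           <a href='{{ [links].url|abs }}'>{{ [links].title }}</a>
#       *}
#       """,
#       url='http://example.com/')
#   # expected shape: {'links': [{'url': '...', 'title': '...'}, ...]}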

# INTERNALS
# ----------------------------------------------------------------------

class _Pattern:

    def __init__(self, nodes):
        self._nodes = nodes

    def scrape(self, html=None, url=None, get=None, post=None, headers=None, cookie_jar=None):
        if cookie_jar == None:
            cookie_jar = cookielib.CookieJar()
        if html == None:
            html = fetch_html(url, get, post, headers, cookie_jar)
        captures = {}
        if _match(self._nodes, _remove_comments(html), 0, captures, url, cookie_jar) == -1:
            return None
        if len(captures) == 1 and '' in captures:
            return captures['']
        return captures

# node types       # information in tuple
_TEXT = 1          # (_TEXT, regex)
_TAG = 2           # (_TAG, open_regex, close_regex, attributes, children)   attributes = {name: (regex, special_nodes), ...}
_CAPTURE = 3       # (_CAPTURE, name_parts, filters)
_SCAN = 4          # (_SCAN, children)
_GOTO = 5          # (_GOTO, filters, children)

_space_re = re.compile(r'\s+')
_tag_re = re.compile(r'<[^>]*>')
_attr_re = re.compile(r'([\w-]+)(?:\s*=\s*(?:(["\'])(.*?)\2|(\S+)))?', re.S)
_attr_start_re = re.compile(r'([\w-]+)(?:\s*=\s*)?')
_comment_re = re.compile(r'<!--.*?-->', re.S)
_script_re = re.compile(r'<script[^>]*>.*?</script>', re.S | re.I)
_entity_re = re.compile(r'&(#?)(\d{1,5}|\w{1,8});')
_closure_start_re = re.compile(r'<|\{[\{\*\@\#]')
_capture_list_re = re.compile(r'\[(\w*)\]')
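
# Pattern syntax handled by _compile below (summary added for readability,
# inferred from the compiler itself):
#   plain text             -> _TEXT    (matched loosely, whitespace-insensitive)
#   <tag attr='...'> ...   -> _TAG     (attribute values may contain captures)
#   {{ name|filter|... }}  -> _CAPTURE
#   {* ... *}              -> _SCAN    (repeatedly match the enclosed pattern)
#   {@ ... @}              -> _GOTO    (treat the matched text as a URL, fetch it,
#                                       match the enclosed pattern on that page)
#   {# ... #}              -> comment  (ignored)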

# functions for compiling a pattern into nodes
# --------------------------------------------------------------

def _compile(s, re_compile):
    slen = len(s)
    i = 0
    nodes = []
    stack = []
    while i < slen:
        m = _closure_start_re.search(s, i)
        if not m:
            break
        closure_name = m.group(0)

        # text since last closure
        text = s[i:m.start()].strip()
        if text:
            nodes.append((_TEXT, _make_text_re(text, re_compile)))
        i = m.end()

        # an HTML tag
        if closure_name == '<':
            inner, i = _next_closure(s, i, '<', '>')
            inner = inner.strip()
            if inner:
                # end tag
                if inner[0] == '/':
                    if stack:
                        nodes = stack.pop()
                # standalone tag
                elif inner[-1] == '/':
                    l = inner[:-1].split(None, 1)
                    name = l[0].strip()
                    attrs = {} if len(l) == 1 else _compile_attrs(l[1], re_compile)
                    nodes.append((_TAG, _make_start_tag_re(name, re_compile), _make_end_tag_re(name, re_compile), attrs, []))
                # start tag
                else:
                    l = inner.split(None, 1)
                    name = l[0].strip()
                    attrs = {} if len(l) == 1 else _compile_attrs(l[1], re_compile)
                    new_nodes = []
                    nodes.append((_TAG, _make_start_tag_re(name, re_compile), _make_end_tag_re(name, re_compile), attrs, new_nodes))
                    stack.append(nodes)
                    nodes = new_nodes

        # special brackets
        else:
            special_type = closure_name[1]

            # capture
            if special_type == '{':
                inner, i = _next_closure(s, i, '{{', '}}')
                nodes.append(_compile_capture(inner))

            # scan
            elif special_type == '*':
                inner, i = _next_closure(s, i, '{*', '*}')
                nodes.append((_SCAN, _compile(inner, re_compile)))

            # goto
            elif special_type == '@':
                inner, i = _next_closure(s, i, '{@', '@}')
                if inner:
                    filters = []
                    if inner[0] == '|':
                        filters, inner = (inner.split(None, 1) + [''])[:2]
                        filters = filters.split('|')[1:]
                    nodes.append((_GOTO, filters, _compile(inner, True)))

            # comment
            elif special_type == '#':
                i = s.find('#}', i)
                if i == -1:
                    break
                i += 2

    # ending text
    text = s[i:].strip()
    if text:
        nodes.append((_TEXT, _make_text_re(text, re_compile)))
    stack.append(nodes)
    return stack[0]
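
# Illustrative sketch (added annotation, not part of the original file): compiling
# a simple pattern produces a nested node list along the lines of
#   _compile("<a>{{ text }}</a>", True)
#   -> [(_TAG, <compiled '<a ...>' regex>, <compiled '</a>' regex>, {},
#        [(_CAPTURE, ['text'], [])])]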

def _compile_capture(s):  # returns the tuple with _CAPTURE
    filters = s.strip().split('|')
    name = filters.pop(0)
    name_parts = []
    for part in name.split('.'):
        m = _capture_list_re.match(part)
        if m:
            name_parts.append((m.group(1),))
        else:
            name_parts.append(part)
    return (_CAPTURE, name_parts, filters)
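
# Illustrative example (added annotation, not part of the original file):
#   _compile_capture(" [links].url|abs ")
#   -> (_CAPTURE, [('links',), 'url'], ['abs'])
# A bracketed part such as "[links]" is turned into a 1-tuple by _capture_list_re,
# which tells _set_capture to build a list for that name.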

def _compile_attrs(s, re_compile):
    attrs = {}
    i = 0
    slen = len(s)
    while i < slen:
        m = _attr_start_re.search(s, i)
        if not m:
            break
        name = m.group(1).lower()
        i = m.end()
        if i >= slen:
            break
        quote = s[i]

        # no quotes, value ends at next whitespace
        if quote != '"' and quote != "'":
            m = _space_re.search(s, i)
            if m:
                val = s[i:m.start()]
                i = m.end()
            else:
                val = s[i:]
                i = slen

        # quotes
        else:
            i += 1
            start = i
            # find the ending quote, skipping over { }
            while i < slen:
                quote_i = s.find(quote, i)
                bracket_i = s.find('{', i)
                if quote_i == -1:
                    i = slen
                    break
                elif bracket_i == -1 or quote_i < bracket_i:
                    i = quote_i
                    break
                else:
                    inner, i = _next_closure(s, bracket_i + 1, '{', '}')
            val = s[start:i]

        val = val.strip()
        regex = ''
        special_nodes = []
        if val:  # if there is no value, empty regex string won't be compiled
            nodes = _compile(val, False)
            # concatenate regexes
            for node in nodes:
                if node[0] == _TEXT:
                    regex += node[1]
                elif node[0] != _TAG:
                    regex += '(.*)'
                    special_nodes.append(node)
            if regex != '(.*)':
                regex = r'(?:^|\s)' + regex + r'(?:\s|$)'  # match must be flush with whitespace or start/end
            if re_compile:
                regex = re.compile(regex, re.I)
        attrs[name] = (regex, special_nodes)
    return attrs

def _make_start_tag_re(name, re_compile):
    regex = r'<\s*' + re.escape(name) + r'(?:\s+([^>]*)|(\s*\/))?>'
    if re_compile:
        regex = re.compile(regex, re.I)
    return regex

def _make_end_tag_re(name, re_compile):
    regex = r'</\s*' + re.escape(name) + r'\s*>'
    if re_compile:
        regex = re.compile(regex, re.I)
    return regex

def _make_text_re(text, re_compile):
    regex = r'\s+'.join([re.escape(w) for w in text.split()])
    if re_compile:
        regex = re.compile(regex, re.I)
    return regex

# functions for running pattern nodes on html
# ---------------------------------------------------------------
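
# How matching works (summary added for readability, inferred from the code
# below): _match walks the compiled nodes in order. _TEXT and _TAG nodes act as
# anchors that are regex-searched forward through the html; _CAPTURE, _SCAN and
# _GOTO nodes are collected in `special` and then run against the html that lies
# between two anchors. Any failure returns -1 and the whole pattern is treated
# as unmatched.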

def _match(nodes, html, i, captures, base_url, cookie_jar):  # returns substring index after match, -1 if no match
    anchor_i = i
    special = []
    for node in nodes:

        # match text node
        if node[0] == _TEXT:
            m = node[1].search(html, i)
            if not m:
                return -1
            # run previous special nodes
            if not _run_special_nodes(special, html[anchor_i:m.start()], captures, base_url, cookie_jar):
                return -1
            special = []
            i = anchor_i = m.end()

        # match html tag
        elif node[0] == _TAG:
            while True:
                # cycle through tags until all attributes match
                while True:
                    m = node[1].search(html, i)
                    if not m:
                        return -1
                    i = m.end()
                    attrs = _parse_attrs(m.group(1) or '')
                    attrs_matched = _match_attrs(node[3], attrs, captures, base_url, cookie_jar)
                    if attrs_matched == -1:
                        return -1
                    if attrs_matched:
                        break
                if m.group(2):  # standalone tag
                    break
                else:  # make sure children match
                    body, i = _next_tag(html, i, node[1], node[2])
                    nested_captures = {}
                    if _match(node[4], body, 0, nested_captures, base_url, cookie_jar) != -1:
                        _merge_captures(captures, nested_captures)
                        break
            # run previous special nodes
            if not _run_special_nodes(special, html[anchor_i:m.start()], captures, base_url, cookie_jar):
                return -1
            special = []
            anchor_i = i

        else:
            special.append(node)

    if not _run_special_nodes(special, html[i:], captures, base_url, cookie_jar):
        return -1
    return i

def _match_attrs(attr_nodes, attrs, captures, base_url, cookie_jar):  # returns True/False, -1 if failed _run_special_node
    for name, attr_node in attr_nodes.items():
        if name not in attrs:
            return False
        if attr_node[0]:  # if attr_node[0] is empty string, done matching
            m = attr_node[0].match(attrs[name])
            if not m:
                return False
            # run regex captures over parallel list of special nodes
            for i, special_node in enumerate(attr_node[1]):
                if not _run_special_node(special_node, m.group(i+1), captures, base_url, cookie_jar):
                    return -1
    return True

def _run_special_nodes(nodes, s, captures, base_url, cookie_jar):  # returns True/False
    for node in nodes:
        if not _run_special_node(node, s, captures, base_url, cookie_jar):
            return False
    return True

def _run_special_node(node, s, captures, base_url, cookie_jar):  # returns True/False
    if node[0] == _CAPTURE:
        s = _apply_filters(s, node[2], base_url)
        _set_capture(captures, node[1], s)
    elif node[0] == _SCAN:
        i = 0
        while True:
            nested_captures = {}
            i = _match(node[1], s, i, nested_captures, base_url, cookie_jar)
            if i == -1:
                break
            else:
                _merge_captures(captures, nested_captures)
        # scan always ends with an unsuccessful match, so fill in captures that weren't set
        _fill_captures(node[1], captures)
    elif node[0] == _GOTO:
        new_url = _apply_filters(s, node[1] + ['abs'], base_url)
        new_html = fetch_html(new_url, cookie_jar=cookie_jar)
        if _match(node[2], new_html, 0, captures, new_url, cookie_jar) == -1:
            return False
    return True

def _set_capture(captures, name_parts, val, list_append=True):
    obj = captures
    last = len(name_parts) - 1
    for i, part in enumerate(name_parts):
        if i == last:
            new_obj = val
        else:
            new_obj = {}
        if type(part) == tuple:
            if part[0] not in obj:
                if list_append:
                    obj[part[0]] = [new_obj]
                else:
                    obj[part[0]] = []
                    break
            else:
                if type(obj[part[0]]) != list:
                    break
                if i == last or len(obj[part[0]]) == 0 or name_parts[i+1] in obj[part[0]][-1]:
                    if list_append:
                        obj[part[0]].append(new_obj)
                    else:
                        break
                else:
                    new_obj = obj[part[0]][-1]
        else:
            if part not in obj:
                obj[part] = new_obj
            else:
                new_obj = obj[part]
        obj = new_obj
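
# Illustrative example (added annotation, not part of the original file):
#   captures = {}
#   _set_capture(captures, [('links',), 'url'], 'http://example.com/')
#   # captures is now {'links': [{'url': 'http://example.com/'}]}
# With list_append=False (as used by _fill_captures) a missing list capture is
# initialised to [] instead of receiving a placeholder element.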

def _merge_captures(master, slave):
    for name, val in slave.items():
        if name not in master:
            master[name] = val
        else:
            if type(val) == dict and type(master[name]) == dict:
                _merge_captures(master[name], val)
            elif type(val) == list and type(master[name]) == list:
                for e in val:
                    if type(e) == dict:
                        for n, v in e.items():
                            if len(master[name]) == 0 or type(master[name][-1]) != dict or n in master[name][-1]:
                                master[name].append({n: v})
                            else:
                                master[name][-1][n] = v
                    else:
                        master[name].append(e)

def _fill_captures(nodes, captures):
    for node in nodes:
        if node[0] == _TAG:
            _fill_captures(node[4], captures)
            for attr in node[3].values():
                _fill_captures(attr[1], captures)
        elif node[0] == _CAPTURE:
            _set_capture(captures, node[1], _apply_filters(None, node[2], None), False)
        elif node[0] == _SCAN:
            _fill_captures(node[1], captures)
        elif node[0] == _GOTO:
            _fill_captures(node[2], captures)
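
# Capture filters applied below (summary added for readability): by default a
# captured string has markup stripped via _remove_html; the 'html' filter keeps
# the raw markup, 'unescape' decodes backslash escapes, 'abs' resolves the value
# against base_url, and 'int'/'float'/'bool' coerce the value, falling back to
# 0 / 0.0 on error.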

def _apply_filters(s, filters, base_url):
    if 'html' not in filters and issubclass(type(s), basestring):
        s = _remove_html(s)
    for f in filters:
        if f == 'unescape':
            if issubclass(type(s), basestring):
                s = s.decode('string_escape')
        elif f == 'abs':
            if issubclass(type(s), basestring):
                s = urlparse.urljoin(base_url, s)
        elif f == 'int':
            try:
                s = int(s)
            except:
                s = 0
        elif f == 'float':
            try:
                s = float(s)
            except:
                s = 0.0
        elif f == 'bool':
            s = bool(s)
    return s

# html/text utilities
# ---------------------------------------------------------------

def _remove_comments(s):
    return _comment_re.sub('', s)

def _remove_html(s):
    s = _comment_re.sub('', s)
    s = _script_re.sub('', s)
    s = _tag_re.sub('', s)
    s = _space_re.sub(' ', s)
    s = _decode_entities(s)
    s = s.strip()
    return s

def _decode_entities(s):
    if type(s) is not unicode:
        s = unicode(s, 'utf-8', 'ignore')
    s = unicodedata.normalize('NFKD', s)
    return _entity_re.sub(_substitute_entity, s)

def _substitute_entity(m):
    ent = m.group(2)
    if m.group(1) == "#":
        return unichr(int(ent))
    else:
        cp = name2codepoint.get(ent)
        if cp:
            return unichr(cp)
        else:
            return m.group()

def _parse_attrs(s):
    attrs = {}
    for m in _attr_re.finditer(s):
        attrs[m.group(1)] = m.group(3) or m.group(4)
    return attrs

def _next_tag(s, i, tag_open_re, tag_close_re, depth=1):  # returns (tag body, substring index after tag)
    slen = len(s)
    start = i
    while i < slen:
        tag_open = tag_open_re.search(s, i)
        tag_close = tag_close_re.search(s, i)
        if not tag_close:
            i = len(s)
            break
        elif not tag_open or tag_close.start() < tag_open.start():
            i = tag_close.end()
            depth -= 1
            if depth == 0:
                return s[start:tag_close.start()], i
        else:
            if not (tag_open and tag_open.group(2)):  # not a standalone tag
                depth += 1
            i = tag_open.end()
    return s[start:i], i
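
# Illustrative example (added annotation, not part of the original file):
# _next_closure scans forward from i, tracking nesting depth of left_str/right_str
# delimiters, and returns the enclosed body plus the index just past the closing
# delimiter, e.g.
#   _next_closure("a href='x'>rest", 0, '<', '>')  ->  ("a href='x'", 11)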

def _next_closure(s, i, left_str, right_str, depth=1):  # returns (closure body, substring index after closure)
    slen = len(s)
    start = i
    while i < slen:
        left = s.find(left_str, i)
        right = s.find(right_str, i)
        if right == -1:
            i = len(s)
            break
        elif left == -1 or right < left:
            i = right + len(right_str)
            depth -= 1
            if depth == 0:
                return s[start:right], i
        else:
            depth += 1
            i = left + len(left_str)
    return s[start:i], i