PageRenderTime 58ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/Sources/wwwclient/scrape.py

https://github.com/netconstructor/wwwclient
Python | 921 lines | 872 code | 17 blank | 32 comment | 10 complexity | 86efd62e6db9622660a238bce6efe965 MD5 | raw file
Possible License(s): LGPL-3.0
  1. #!/usr/bin/env python
  2. # -----------------------------------------------------------------------------
  3. # Project : WWWClient
  4. # -----------------------------------------------------------------------------
  5. # Author : Sebastien Pierre <sebastien@xprima.com>
  6. # -----------------------------------------------------------------------------
  7. # License : GNU Lesser General Public License
  8. # Credits : Xprima.com
  9. # -----------------------------------------------------------------------------
  10. # Creation : 19-Jun-2006
  11. # Last mod : 19-Mar-2012
  12. # -----------------------------------------------------------------------------
  13. # TODO: The tree could be created by the iterate function, by directly linking
  14. # nodes. So the tree could be unfolded as a list, or kept folded as a tree. This
  15. # would allow to have still one structure. Ideally, the original HTML could be
  16. # kept to allow easy subset extraction (currently, the data is recreated)
  17. import re, string, htmlentitydefs
  18. import form
  19. __doc__ = """\
  20. The scraping module gives a set of functionalities to manipulate HTML data. All
  21. functions are text oriented, so that they work with any subset of an HTML
  22. document. This is very useful, as it does not require the HTML to be
  23. well-formed, and allows easy selection of HTML fragments."""
  24. DEFAULT_ENCODING = "utf-8"
  25. RE_SPACES = re.compile("\s+")
  26. RE_HTMLSTART = re.compile("</?(\w+)", re.I)
  27. RE_HTMLEND = re.compile("/?>")
  28. RE_HTMLLINK = re.compile("<[^<]+(href|src|url)\s*=\s*('[^']*'|\"[^\"]*\"|[^ >]*)", re.I)
  29. RE_HTMLCLASS = re.compile("class\s*=\s*['\"]?([\w\-_\d]+)", re.I)
  30. RE_HTMLID = re.compile("id\s*=\s*['\"]?([\w\-_\d]+)", re.I)
  31. RE_HTMLHREF = re.compile("href\s*=\s*('[^']*'|\"[^\"]*\"|[^ ]*)", re.I)
  32. RE_SPACES = re.compile("\s+", re.MULTILINE)
  33. RE_QUERY = re.compile("^(?P<name>[\w\d_\-]+)?(?P<id>#[\w\d_\-]+)?(?P<class>\.[\w\d_\-]+)?(?P<property>\:[\w\d\-]+)?(?P<count>\[\-?\d+\])?$")
  34. KEEP_ABOVE = "+"
  35. KEEP_SAME = "="
  36. KEEP_BELOW = "-"
  37. # -----------------------------------------------------------------------------
  38. #
  39. # URL
  40. #
  41. # -----------------------------------------------------------------------------
  42. class URL:
  43. @classmethod
  44. def Base( self, url ):
  45. base_url = url
  46. if not url.endswith("/"):
  47. i = base_url.rfind("/")
  48. if i > 0:
  49. base_url = base_url[:i + 1]
  50. return base_url
  51. @classmethod
  52. def Absolute( self, url, siteURL ):
  53. base_url = self.Base(siteURL)
  54. if url.find("http") != 0:
  55. url = base_url + url
  56. return url
  57. # -----------------------------------------------------------------------------
  58. #
  59. # HTML TAG INTERFACE
  60. #
  61. # -----------------------------------------------------------------------------
  62. class Tag:
  63. """A Tag is an abstract decorator for a portion within a string. Tags are
  64. used in this module to identify HTML/XML data within strings."""
  65. OPEN = "open"
  66. CLOSE = "close"
  67. EMPTY = "empty"
  68. def __init__( self, html, start, end ):
  69. """Creates a new new tag."""
  70. self._html = html
  71. self.start = start
  72. self.end = end
  73. def isElement( self ):
  74. return isinstance(self, ElementTag)
  75. def isText( self ):
  76. return isinstance(self, TextTag)
  77. def html( self ):
  78. """Returns the HTML representation of this tag."""
  79. return self._html[self.start:self.end]
  80. def __repr__( self ):
  81. return repr(self._html[self.start:self.end])
  82. class ElementTag(Tag):
  83. """Represents a single element tag (open or close) identified within
  84. a string."""
  85. def __init__( self, html, start, end, astart=None, aend=None, attributes=None,
  86. level=None, type=None ):
  87. """Creates a new tag element extracted from the given 'html' string."""
  88. Tag.__init__(self, html, start, end)
  89. if type == None: type = Tag.OPEN
  90. self._attributes = attributes
  91. self.astart = astart
  92. self.aend = aend
  93. self.level = level
  94. self.type = type
  95. def attributes( self ):
  96. if self._attributes == None:
  97. self._attributes = HTML.parseAttributes(self._html[self.astart:self.aend].strip())
  98. return self._attributes
  99. def has( self, name, value=None ):
  100. """Tells if this tag has the given attribute"""
  101. if value is None:
  102. return self.attributes().has_key(name)
  103. else:
  104. return self.attributes().get(name) == value
  105. def get( self, name ):
  106. """Returns the given attribute set for this tag."""
  107. return self.attributes().get(name)
  108. def name( self ):
  109. """Returns this tag name"""
  110. if self.type == Tag.OPEN or self.type == Tag.EMPTY:
  111. return self._html[self.start+1:self.astart].strip()
  112. else:
  113. return self._html[self.start+2:self.astart].strip()
  114. def nameLike( self, what ):
  115. """Tells if the name is like the given string/list of string/regexp/list
  116. of regexpes."""
  117. if type(what) in (str, unicode):
  118. # TODO: Handle unicode
  119. return re.match(what, self.name(), re.I)
  120. if type(what) in (tuple, list):
  121. for w in what:
  122. if self.nameLike(w): return True
  123. else:
  124. return what.match(self.name(), re.I)
  125. def hasName( self, name ):
  126. return self.name().lower() == name.lower()
  127. def hasClass( self, name ):
  128. """Tells if the element has the given class (case sensitive)"""
  129. element_class = self.attributes().get("class")
  130. if element_class and name in element_class.split() != -1:
  131. return True
  132. else:
  133. return False
  134. def hasId( self, name ):
  135. """Tells if the element has the given id (case sensitive)"""
  136. return self.attributes().get("id") == name
  137. def text(self, encoding=DEFAULT_ENCODING):
  138. return u''
  139. class TextTag(Tag):
  140. """Represents raw text, not an element."""
  141. def __init__( self, html, start, end):
  142. Tag.__init__(self, html, start, end)
  143. def hasName( self, name ):
  144. return False
  145. def hasClass( self, name ):
  146. """Tells if the element has the given class (case sensitive)"""
  147. return False
  148. def hasId( self, name ):
  149. """Tells if the element has the given id (case sensitive)"""
  150. return False
  151. def text(self, encoding=DEFAULT_ENCODING):
  152. return self._html[self.start:self.end].decode(encoding)
  153. def name(self):
  154. return "#text"
  155. class TagList:
  156. """Represents a list of ElementTag and TextTag, which basically corresponds
  157. to the tokenization of an HTML string. The list can be folded as a tree
  158. if necessary."""
  159. def __init__( self, content=None ):
  160. """Creates a blank TagTree. It should be populated with data using the
  161. `fromHTML` method."""
  162. if content == None: content = []
  163. self.content = content
  164. def append( self, content ):
  165. assert isinstance(content, Tag)
  166. content.context = self
  167. self.content.append(content)
  168. return content
  169. def fromHTML( self, html, scraper=None ):
  170. """Creates the tag list content from the given HTML data. This will
  171. erase the content of this tag list, replacing it by this one."""
  172. self.content = []
  173. offset = 0
  174. level = 0
  175. end = False
  176. if scraper == None: scraper = HTML
  177. while not end:
  178. tag = scraper.findNextTag(html, offset)
  179. # If there is no tag, it is the document
  180. if tag == None:
  181. self.append(TextTag( html, start=offset, end=len(html)))
  182. end = True
  183. else:
  184. tag, tag_end_offset = tag
  185. tag_type, tag_name, tag_start, attr_start, attr_end = tag
  186. # There may be text inbetween
  187. if tag_start > offset:
  188. self.append(TextTag(html, start=offset,end=tag_start))
  189. # We process the encountered tag
  190. #new = level, tag_type, tag_name, tag_start, attr_end + 1, attr_start, attr_end
  191. new = ElementTag( html, tag_start, attr_end + 1, attr_start, attr_end, type=tag_type, level=level)
  192. self.append(new)
  193. last = new
  194. offset = tag_end_offset
  195. return self.content
  196. def tagtree( self, asXML=False ):
  197. """Folds this list into a tree, which is returned as result."""
  198. root = TagTree(id=-1)
  199. parents = [root]
  200. counter = 0
  201. tags_stack = []
  202. def find_opening_tag(tag, stack):
  203. for i in range(len(stack)-1,-1,-1):
  204. this_tag = stack[i]
  205. if this_tag.name() == tag.name() and this_tag.type == Tag.OPEN:
  206. return this_tag, i
  207. return None, -1
  208. # We iterate on the tags of this taglist
  209. for tag in self.content:
  210. # We create the node
  211. if isinstance(tag, TextTag):
  212. parents[-1].append(TagTree(tag))
  213. else:
  214. if tag.type in (Tag.OPEN, Tag.EMPTY):
  215. if tags_stack and HTML_closeWhen( tag, tags_stack[-1] ) and not asXML:
  216. # This is the special treatment when we have to close
  217. # tags in HTML
  218. # FIXME: The two variables are not used
  219. parent_tag = tags_stack.pop()
  220. closing_tag = ElementTag(tag._html, tag.start, tag.start, type=Tag.CLOSE)
  221. parents.pop().close(tag)
  222. if tag.type == Tag.EMPTY or (not asXML and HTML_isEmpty(tag)):
  223. node = TagTree(tag, id=counter)
  224. parents[-1].append(node)
  225. counter += 1
  226. else:
  227. node = TagTree(tag, id=counter)
  228. parents[-1].append(node)
  229. parents.append(node)
  230. tags_stack.append(tag)
  231. counter += 1
  232. elif tag.type == Tag.CLOSE:
  233. opening_tag, level = find_opening_tag(tag, tags_stack)
  234. if not opening_tag:
  235. #print "WARNING: no opening tag for ", tag
  236. continue
  237. else:
  238. while len(tags_stack) > level:
  239. stack_tag = tags_stack.pop()
  240. node = parents.pop()
  241. assert stack_tag == opening_tag
  242. node.close(tag)
  243. else:
  244. raise Exception("Unknow Tag.type: %s" % (tag.type))
  245. root._taglist = self
  246. return root
  247. def html( self ):
  248. """Converts this tags list to HTML"""
  249. res = []
  250. for tag in self.content:
  251. assert isinstance(tag, Tag) or isinstance(tag, TextTag)
  252. res.append(tag.html())
  253. return "".join(res)
  254. def innerhtml(self):
  255. res = []
  256. for tag in self.content[1:-1]:
  257. assert isinstance(tag, Tag) or isinstance(tag, TextTag)
  258. res.append(tag.html())
  259. return "".join(res)
  260. def text(self, encoding=DEFAULT_ENCODING):
  261. res = []
  262. for tag in self.content:
  263. res.append(tag.text(encoding))
  264. # FIXME: Unicode
  265. return u"".join(res)
  266. def __iter__( self ):
  267. for tag in self.content:
  268. yield tag
  269. def __str__( self ):
  270. return str(self.content)
  271. # FIXME: Should inherit from TagNode
  272. class TagTree:
  273. """A tag tree wraps one or two tags and allows to structure tags as a tree.
  274. The tree node instance offers a nice interface to manipulate the HTML
  275. document as a tree."""
  276. TEXT = "#text"
  277. def __init__( self, startTag=None, endTag=None, id=None ):
  278. """TagTrees should be created by an HTMLTools, and not really directly.
  279. However, if you really want to create a tree yourself, use the
  280. 'startTag' and 'endTags' to specify start and end tags."""
  281. self._parent = None
  282. self._depth = 0
  283. self._taglist = None
  284. self.startTag = None
  285. self.endTag = None
  286. self.id = id
  287. self.children = []
  288. self.name = None
  289. self.open(startTag)
  290. self.close(endTag)
  291. def clone( self, children=None ):
  292. """Clones this tree. If the 'children' attribute is 'True', then the
  293. children will be cloned as well (deep clone)."""
  294. clone = TagTree()
  295. clone._parent = self._parent
  296. clone._depth = self._depth
  297. clone._taglist = self._taglist
  298. clone.id = self.id
  299. clone.name = self.name
  300. if children is None:
  301. clone.children = []
  302. for child in self.children:
  303. clone.children.append(child.clone())
  304. else:
  305. clone.children = children
  306. clone.open(self.startTag)
  307. clone.close(self.endTag)
  308. return clone
  309. def has( self, name, value=None):
  310. """Tells if the start tag of this tag tree has an attribute of the given
  311. name."""
  312. if self.startTag == None: return None
  313. return self.startTag.has(name, value)
  314. def get( self, name):
  315. """Gets the start tag of this tag tree attribute with the given
  316. 'name'"""
  317. if self.startTag == None: return None
  318. return self.startTag.get(name)
  319. def attribute(self, name):
  320. """Alias for 'get(name)"""
  321. return self.attributes().get(name)
  322. def attributes( self ):
  323. """Returns the attributes of this tag tree start tag"""
  324. if self.startTag == None: return {}
  325. return self.startTag.attributes()
  326. def setParent( self, parent ):
  327. """Sets the parent TagTree for this tag tree."""
  328. self._parent = parent
  329. self._depth = self.parent().depth() + 1
  330. def parent( self ):
  331. """Returns the parnet tag tree (if any)."""
  332. return self._parent
  333. def depth( self ):
  334. """Returns the depth of this tag tree."""
  335. return self._depth
  336. def isRoot( self ):
  337. """Tells if this tag tree is a root (has no parent) or not."""
  338. return self._parent == None
  339. def _cutBelow( self, data, value ):
  340. """Helper function for the `cut()` method."""
  341. depth = self.depth()
  342. if depth > value:
  343. data.append(self)
  344. else:
  345. for child in self.children:
  346. child._cutBelow(data, value)
  347. return data
  348. def cut( self, above=None, below=None, at=None):
  349. res = []
  350. assert not above and not at, "Not implemented"
  351. if not below is None:
  352. root = TagTree()
  353. for child in self._cutBelow(res, below):
  354. root.append(child)
  355. return root
  356. def filter( self, reject=None, accept=None, recursive=False ):
  357. """Returns a clone of this tree where each child node is filtered
  358. through the given 'accept' or 'reject' predicate."""
  359. res = []
  360. root = self.clone(children=res)
  361. for child in self.children:
  362. if not reject is None:
  363. if reject(child): continue
  364. if not accept is None:
  365. if accept(child):
  366. if recursive:
  367. root.append(child.filter(reject=reject,accept=accept,recursive=recursive))
  368. else:
  369. root.append(child.clone())
  370. else:
  371. if recursive:
  372. root.append(child.filter(reject=reject,accept=accept,recursive=recursive))
  373. else:
  374. root.append(child.clone())
  375. return root
  376. def match( self, predicate ):
  377. """Tells if the current TagTree matches the given predicate"""
  378. if self.startTag and predicate(self.startTag):
  379. return True
  380. else:
  381. return False
  382. def find( self, predicate, recursive=True ):
  383. """Returns a list of child nodes (TagTree objects) that match the given predicate. This
  384. operation is recursive by default."""
  385. # NOTE: This has been removed, as find means "find inside"
  386. # if self.startTag and predicate(self.startTag):
  387. # return [self]
  388. res = []
  389. for c in self.children:
  390. assert isinstance(c, TagTree)
  391. if predicate(c):
  392. res.append(c)
  393. if recursive:
  394. res = res + c.find(predicate)
  395. return res
  396. def open( self, startTag):
  397. if startTag==None: return
  398. assert self.startTag == None
  399. assert self.endTag == None
  400. assert isinstance(startTag, Tag)
  401. self.startTag = startTag
  402. if isinstance(startTag, TextTag):
  403. self.name = TagTree.TEXT
  404. else:
  405. self.name = startTag.name()
  406. assert self.name, repr(startTag.html()) + ":" + startTag.name()
  407. return self
  408. def close( self, endTag ):
  409. if endTag==None: return
  410. assert self.endTag == None
  411. assert isinstance(endTag, ElementTag)
  412. self.endTag = endTag
  413. return self
  414. def append( self, node ):
  415. assert isinstance(node, TagTree)
  416. node.setParent(self)
  417. assert node != self
  418. self.children.append(node)
  419. self._taglist = None
  420. return self
  421. def merge( self, node ):
  422. assert isinstance(node, TagTree)
  423. for child in node.children:
  424. self.append(child)
  425. return self
  426. def list( self, contentOnly=False ):
  427. """Returns a tag list from this Tree Node."""
  428. if self._taglist == None:
  429. content = []
  430. if self.startTag: content.append(self.startTag)
  431. for c in self.children: content.extend(c.list(contentOnly=True))
  432. if self.endTag: content.append(self.endTag)
  433. self._taglist = TagList(content=content)
  434. if contentOnly:
  435. return self._taglist.content
  436. else:
  437. return self._taglist
  438. def hasName( self, name ):
  439. """Tells if the element has the given class (case sensitive)"""
  440. if self.startTag: return self.startTag.hasName(name)
  441. else: return None
  442. def hasClass( self, name ):
  443. """Tells if the element has the given class (case sensitive)"""
  444. if self.startTag: return self.startTag.hasClass(name)
  445. else: return None
  446. def hasId( self, name ):
  447. """Tells if the element has the given id (case sensitive)"""
  448. if self.startTag: return self.startTag.hasId(name)
  449. else: return None
  450. def prettyString( self, ):
  451. if self.name == self.TEXT:
  452. return "#text:" + repr(self.startTag.html())
  453. else:
  454. if self._parent == None:
  455. res = "#root\n"
  456. else:
  457. res = self.startTag.name()
  458. res += "["
  459. if self.id != None: res += "#%d" % (self.id)
  460. attr = []
  461. for k,v in self.attributes().items():attr.append("%s=%s" % (k,v))
  462. attr = ",".join(attr)
  463. if attr: attr = "(%s)" % (attr)
  464. res += "@%d]%s\n" % (self.depth(), attr)
  465. for c in self.children:
  466. ctext = ""
  467. for line in c.prettyString().split("\n"):
  468. if not line: continue
  469. if not ctext:
  470. ctext = " <" + line + "\n"
  471. else:
  472. ctext += " " + line + "\n"
  473. res += ctext
  474. return res
  475. def query( self, query ):
  476. """Does a basic CSS-like query on the TagTree. Returns a TagTree"""
  477. if type(query) not in (tuple, list):
  478. selectors = filter(lambda _:_.strip(), query.split(" "))
  479. else:
  480. selectors = filter(lambda _:_.strip(), query)
  481. if selectors:
  482. head = selectors[0]
  483. tail = []
  484. if len(selectors) >= 1: tail = selectors[1:]
  485. predicate = lambda: True
  486. match = RE_QUERY.match(head)
  487. assert match, "Invalid selector expression: " + repr(head)
  488. p_name, p_id, p_class, p_property, p_count = match.group("name"), match.group("id"), match.group("class"), match.group("property"), match.group("count")
  489. if p_name:
  490. predicate = lambda _:predicate and _.hasName(p_name)
  491. if p_id:
  492. predicate = lambda _:predicate and _.hasId(p_id[1:])
  493. if p_class:
  494. predicate = lambda _:predicate and _.hasClass(p_class[1:])
  495. res = []
  496. for sub_tree in self.find(predicate):
  497. res = res + sub_tree.query(tail)
  498. if p_property:
  499. if p_property == ":text":
  500. res = map(lambda _:_.text(), res)
  501. else:
  502. raise Exception("Property selector not supproted yet: " + p_property)
  503. if p_count:
  504. count = int(p_count[1:-1])
  505. if count < 0:
  506. count = len(res) + count
  507. if count < len(res):
  508. return [res[count]]
  509. else:
  510. return [None]
  511. else:
  512. return res
  513. else:
  514. return [self]
  515. def __str__( self ):
  516. return self.prettyString()
  517. def __repr__( self ):
  518. return str(self.list())
  519. def html( self ):
  520. """Converts this tags tree to HTML"""
  521. return self.list().html()
  522. def text( self ):
  523. """Returns only the text tags in this HTML tree"""
  524. return self.list().text()
  525. def innerhtml( self ):
  526. return self.list().innerhtml()
  527. def __iter__( self ):
  528. for tag in self.list():
  529. yield tag
  530. # -----------------------------------------------------------------------------
  531. #
  532. # HTML PRESETS
  533. #
  534. # -----------------------------------------------------------------------------
  535. HTML_EMPTY = """\
  536. AREA BASE BASEFONT BR COL FRAME HR IMG INPUT ISINDEX LINK META PARAM
  537. """[:-1].split()
  538. HTML_MAYBE_EMPTY = """\
  539. A P
  540. """[:-1].split()
  541. def HTML_isEmpty( tag ):
  542. tag_name = tag.name().upper()
  543. if tag_name in HTML_EMPTY: return True
  544. if tag_name == "A" and not tag.has("href"): return True
  545. return False
  546. def HTML_mayBeEmpty( tag ):
  547. tag_name = tag.name().upper()
  548. if tag_name in HTML_MAYBE_EMPTY: return True
  549. return False
  550. def HTML_closeWhen( current, parent ):
  551. cur_name = (current.name() or "").upper()
  552. par_name = (parent.name() or "").upper()
  553. if cur_name == par_name == "TD": return True
  554. if cur_name == par_name == "TR": return True
  555. if cur_name == par_name == "P": return True
  556. if par_name == "P" and cur_name in ("DIV", "TABLE", "UL", "BLOCKQUOTE",
  557. "FORM"): return True
  558. return False
  559. # -----------------------------------------------------------------------------
  560. #
  561. # HTML PARSING FUNCTIONS
  562. #
  563. # -----------------------------------------------------------------------------
  564. class HTMLTools:
  565. """This class contains a set of tools to process HTML text data easily. This
  566. class can operate on a full HTML document, or on any subset of the
  567. document."""
  568. LEVEL_ACCOUNT = [ "html", "head", "body", "div", "table", "tr", "td" ]
  569. def __init__( self ):
  570. pass
  571. # PREDICATES
  572. # ========================================================================
  573. def withClass( self, name ):
  574. """Predicate that filters node by class"""
  575. return lambda n:n.hasClass(name)
  576. def withName( self, name ):
  577. """Predicate that filters node by class"""
  578. return lambda n:n.hasName(name)
  579. # BASIC PARSING OPERATIONS
  580. # ========================================================================
  581. def parse( self, html ):
  582. """Returns a tagtree from the given HTML string, tag list or tree
  583. node."""
  584. return self.tree(html)
  585. def tree( self, html, asXML=False ):
  586. tag_list = TagList()
  587. tag_list.fromHTML(html, scraper=self)
  588. return tag_list.tagtree(asXML)
  589. def list( self, data ):
  590. """Converts the given text or tagtree into a taglist."""
  591. if type(data) in (str, unicode):
  592. tag_list = TagList()
  593. tag_list.fromHTML(data, scraper=self)
  594. return tag_list
  595. elif isinstance(data, TagList):
  596. return data
  597. elif isinstance(data, TagTree):
  598. return data.list()
  599. else:
  600. raise Exception("Unsupported data:" + data)
  601. def html( self, data ):
  602. """Converts the given taglist or tagtree into HTML, and returns
  603. a string or unicode."""
  604. if type(data) == str:
  605. return data
  606. elif type(data) == unicode:
  607. return data
  608. elif isinstance(data, TagList):
  609. return data.html()
  610. elif isinstance(data, TagTree):
  611. return data.html()
  612. else:
  613. raise Exception("Unsupported data:" + repr(data))
  614. # TEXT OPERATIONS
  615. # ========================================================================
  616. def textcut( self, text, cutfrom=None, cutto=None ):
  617. """Cuts the text from the given marker, to the given marker."""
  618. text = self.html(text)
  619. if cutfrom: start = text.find(cutfrom)
  620. else: start = 0
  621. if cutto: end = text.find(cutto)
  622. else: end = -1
  623. if start == -1: start = 0
  624. elif cutfrom: start += len(cutfrom)
  625. return text[start:end]
  626. def textlines( self, text, strip=True, empty=False ):
  627. """Returns a list of lines for the given HTML text. Lines are stripped
  628. and empty lines are filtered out by default."""
  629. text = self.html(text)
  630. lines = text.split("\n")
  631. if strip: lines = map(string.strip, lines)
  632. if not empty: lines = filter(lambda x:x, lines)
  633. return lines
  634. def text( self, data, expand=False, norm=False ):
  635. """Strips the given tags from HTML text"""
  636. res = None
  637. if type(data) in (str, unicode):
  638. res = data
  639. else:
  640. res = data.text()
  641. if expand: res = self.expand(res)
  642. if norm: res = self.norm(res)
  643. return res
  644. def expand( self, text, encoding=None ):
  645. """Expands the entities found in the given text."""
  646. if not (type(text) in (str, unicode)):
  647. text = text.text()
  648. # NOTE: This is based on
  649. # <http://www.shearersoftware.com/software/developers/htmlfilter/>
  650. entityStart = text.find('&')
  651. if entityStart != -1: # only run bulk of code if there are entities present
  652. preferUnicodeToISO8859 = True
  653. prevOffset = 0
  654. textParts = []
  655. while entityStart != -1:
  656. textParts.append(text[prevOffset:entityStart])
  657. entityEnd = text.find(';', entityStart+1)
  658. if entityEnd == -1:
  659. entityEnd = entityStart
  660. entity = '&'
  661. else:
  662. entity = text[entityStart:entityEnd+1]
  663. if len(entity) < 4 or entity[1] != '#':
  664. entity = htmlentitydefs.entitydefs.get(entity[1:-1],entity)
  665. if len(entity) == 1:
  666. if preferUnicodeToISO8859 and ord(entity) > 127 and hasattr(entity, 'decode'):
  667. entity = entity.decode('iso-8859-1')
  668. if type(text) != unicode and encoding:
  669. entity = entity.encode(encoding)
  670. else:
  671. if len(entity) >= 4 and entity[1] == '#':
  672. if entity[2] in ('X','x'):
  673. entityCode = int(entity[3:-1], 16)
  674. else:
  675. entityCode = int(entity[2:-1])
  676. if entityCode > 255:
  677. entity = unichr(entityCode)
  678. else:
  679. entity = chr(entityCode)
  680. if preferUnicodeToISO8859 and hasattr(entity, 'decode'):
  681. entity = entity.decode('iso-8859-1')
  682. if type(text) != unicode and encoding:
  683. entity = entity.encode(encoding)
  684. textParts.append(entity)
  685. prevOffset = entityEnd+1
  686. entityStart = text.find('&', prevOffset)
  687. textParts.append(text[prevOffset:])
  688. text = u''.join(textParts)
  689. return text
  690. # FORMS-RELATED OPERATIONS
  691. # ========================================================================
  692. def forms( self, html ):
  693. return form.parseForms(self, self.html(html))
  694. def images( self, html, like=None ):
  695. """Iterates through the links found in this document. This yields the
  696. tag name and the href value."""
  697. for name, url in self.links(html, like):
  698. if name == "img":
  699. yield url
  700. def links( self, html, like=None ):
  701. """Iterates through the links found in this document. This yields the
  702. tag name and the href value."""
  703. if html:
  704. html = self.html(html)
  705. if like != None:
  706. if type(like) in (str,unicode): like = re.compile(like)
  707. res = []
  708. for match in self.onRE(html, RE_HTMLLINK):
  709. tag = match.group()
  710. tag = tag.replace("\t"," ")
  711. tag = tag.replace("\n"," ")
  712. tag = tag[1:tag.find(" ")]
  713. href = match.group(2)
  714. if href[0] in ("'", '"'): href = href[1:-1]
  715. if not like or like.match(href):
  716. yield tag, href
  717. # UTILITIES
  718. # ========================================================================
  719. def findNextTag( self, html, offset=0 ):
  720. """Finds the next tag in the given HTML text from the given offset. This
  721. returns (tag type, tag name, tag start, attributes start, attributes
  722. end) and tag end or None."""
  723. if offset >= len(html) - 1: return None
  724. m = RE_HTMLSTART.search(html, offset)
  725. if m == None:
  726. return None
  727. n = RE_HTMLEND.search(html, m.end())
  728. if n == None:
  729. return HTMLTools.findNextTag(self, html, m.end())
  730. if m.group()[1] == "/": tag_type = Tag.CLOSE
  731. elif n.group()[0] == "/": tag_type = Tag.EMPTY
  732. else: tag_type = Tag.OPEN
  733. return (tag_type, m.group(1), m.start(), m.end(), n.start()), n.end()
  734. @staticmethod
  735. def onRE( text, regexp, off=0 ):
  736. """Itearates through the matches for the given regular expression."""
  737. res = True
  738. while res:
  739. res = regexp.search(text, off)
  740. if res:
  741. off = res.end()
  742. yield res
  743. @staticmethod
  744. def norm( text ):
  745. """Normalizes the spaces (\t, \n, etc) so that everything gets converted
  746. to single space."""
  747. return RE_SPACES.sub(" ", text).strip()
  748. @staticmethod
  749. def parseTag( text ):
  750. """Parses the HTML/XML tag in the given text, returning its name and
  751. attributes."""
  752. text = text.strip()
  753. space = text.find(" ")
  754. if text[0:2] == "</": start = 2
  755. elif text[0] == "<": start = 1
  756. else: start = 0
  757. if text[-2:0] == "/>": end = -2
  758. elif text[-1] == ">": end = -1
  759. else: end = len(text)
  760. if space:
  761. name = text[start:space]
  762. attr = text[space:end].strip()
  763. return (name, HTML.parseAttributes(attr))
  764. else:
  765. return (text[start:end].strip(), {})
  766. @staticmethod
  767. def parseAttributes(text, attribs = None):
  768. """Parses the HTML/XML attributes described in the given text."""
  769. if attribs == None: attribs = {}
  770. eq = text.find("=")
  771. # There may be attributes without a trailing =
  772. # Like ''id=all type=radio name=meta value="" checked''
  773. if eq == -1:
  774. space = text.find(" ")
  775. if space == -1:
  776. name = text.strip()
  777. if name: attribs[name] = None
  778. return attribs
  779. else:
  780. name = text[:space].strip()
  781. if name: attribs[name] = None
  782. return HTML.parseAttributes(text[space+1:], attribs)
  783. else:
  784. if eq + 1 == len(text):
  785. return attribs
  786. sep = text[eq+1]
  787. if sep == "'": end = text.find( "'", eq + 2 )
  788. elif sep == '"': end = text.find( '"', eq + 2 )
  789. else: end = text.find(" ", eq)
  790. # Did we reach the end ?
  791. name = text[:eq].strip()
  792. if end == -1:
  793. value = text[eq+1:]
  794. if value and value[0] in ("'", '"'): value = value[1:-1]
  795. else: value = value.strip()
  796. attribs[name.lower()] = value
  797. return attribs
  798. else:
  799. value = text[eq+1:end+1]
  800. if value[0] in ("'", '"'): value = value[1:-1]
  801. else: value = value.strip()
  802. attribs[name.lower()] = value
  803. return HTML.parseAttributes(text[end+1:].strip(), attribs)
  804. # We create a shared instance with the scraping tools
  805. HTML = HTMLTools()
  806. # EOF - vim: tw=80 ts=4 sw=4 noet