/BitTorrent/BeautifulSupe.py
https://github.com/mbologna/BitFountain · Python · 132 lines · 97 code · 30 blank · 5 comment · 34 complexity · b49c152ba597a08ae8bdce49015aa770 MD5 · raw file
- # A very, very minimal BeautifulSoup imitation.
- #
- # BS uses SGMLlib to parse, which converts everything to lower case.
- # This uses real xml parsing to mimic the parts of BS we use.
- import xml.dom.minidom
def _getText(node):
    """Collect the text held directly under ``node``.

    Only immediate children whose type is TEXT_NODE are considered; text
    nested inside child elements is not included.

    Returns a list with one ``str`` per text child, in document order.
    """
    return [
        str(child.data)
        for child in node.childNodes
        if child.nodeType == child.TEXT_NODE
    ]
def _getNodesAsTags(root):
    """Wrap every element child of ``root`` in a Tag.

    Only immediate children whose type is ELEMENT_NODE are wrapped; text
    and other node types are skipped.

    Returns a list of Tag objects in document order.
    """
    return [
        Tag(child)
        for child in root.childNodes
        if child.nodeType == child.ELEMENT_NODE
    ]
class Tag(object):
    """Lightweight wrapper around a single DOM node.

    Attributes set by the constructor:
        node     -- the wrapped DOM node
        name     -- the node's ``nodeName``
        contents -- Tag objects for the element children, followed by the
                    strings of the direct text children
        text     -- the direct text children joined into one string
    """

    def __init__(self, node):
        self.node = node
        self.name = node.nodeName
        child_tags = _getNodesAsTags(node)
        pieces = _getText(node)
        # Element children come first, then the raw text strings.
        self.contents = child_tags + pieces
        self.text = ''.join(pieces)

    def child_elements(self):
        """Return the Tag entries of ``contents``, leaving out text strings."""
        return [item for item in self.contents if isinstance(item, Tag)]

    def get(self, tagname):
        """Return the text of the first child Tag named ``tagname``.

        Returns None when there is no such child.
        """
        match = self.first(tagname)
        return match.text if match else None

    def first(self, tagname):
        """Return the first child Tag named ``tagname``, or None if absent."""
        for item in self.contents:
            if isinstance(item, Tag) and item.name == tagname:
                return item
        return None
-
class BeautifulSupe(object):
    """Parse an XML document and offer a few BeautifulSoup-like lookups.

    The parsed document is kept in ``self.dom``. Lookup methods return Tag
    wrappers around the matching DOM nodes.
    """

    def __init__(self, data):
        """Parse ``data`` (text or bytes) after trimming NUL characters.

        Raises whatever ``xml.dom.minidom.parseString`` raises on
        malformed input.
        """
        # please don't give us your null terminators
        # strip() requires an argument of the same type as the data, so
        # pick a bytes or text NUL to match what we were handed.
        nul = b"\0" if isinstance(data, bytes) else chr(0)
        self.dom = xml.dom.minidom.parseString(data.strip(nul))

    def first(self, tagname, root=None):
        """Return a Tag for the first node named ``tagname``, or None.

        Without ``root`` the whole document is searched (any depth).
        With ``root`` only the direct children of ``root`` are examined.
        """
        if root is None:
            candidates = self.dom.getElementsByTagName(tagname)
            found = candidates[0] if candidates else None
        else:
            found = None
            for child in root.childNodes:
                if child.nodeName == tagname:
                    found = child
                    break
        if found is None:
            return None
        return Tag(found)

    def fetch(self, tagname, restraints=None):
        """Return Tags for every ``tagname`` element meeting ``restraints``.

        ``restraints`` maps a child tag name to a text prefix. An element
        matches when, for every entry, it has a direct child with that
        name whose direct text starts with the prefix. With no restraints
        every ``tagname`` element in the document is returned.
        """
        if restraints is None:
            restraints = {}
        matches = []
        for node in self.dom.getElementsByTagName(tagname):
            if self._satisfies(node, restraints):
                matches.append(Tag(node))
        return matches

    def _satisfies(self, node, restraints):
        """Return True when ``node`` meets every child-text restraint."""
        for child_name, prefix in restraints.items():
            child = self.first(child_name, node)
            if child is None:
                return False
            # Compare against the child's joined direct text rather than
            # contents[0]: contents may be empty (e.g. <x/>) or may begin
            # with a Tag, neither of which can be prefix-tested.
            if not child.text.startswith(prefix):
                return False
        return True

    def scour(self, prefix, suffix=None, node=None):
        """Return Tags for descendants whose name matches prefix/suffix.

        Walks every descendant of ``node`` (the document element when
        ``node`` is None) depth-first. A node matches when its nodeName
        starts with ``prefix`` and, if ``suffix`` is given and non-empty,
        also ends with ``suffix``. The starting node itself is not tested.
        """
        if node is None:
            node = self.dom.documentElement
        matches = []
        for child in node.childNodes:
            name = child.nodeName
            if name.startswith(prefix) and (not suffix or name.endswith(suffix)):
                matches.append(Tag(child))
            # Recurse regardless of whether this child matched.
            matches += self.scour(prefix, suffix, child)
        return matches