/BitTorrent/BeautifulSupe.py

https://github.com/mbologna/BitFountain · Python · 132 lines · 97 code · 30 blank · 5 comment · 34 complexity · b49c152ba597a08ae8bdce49015aa770 MD5 · raw file

  1. # A very very minimal BeautifulSoup imitation.
  2. #
  3. # BS uses SGMLlib to parse, which converts everything to lower case.
  4. # This uses real xml parsing to mimic the parts of BS we use.
  5. import xml.dom.minidom
  6. def _getText(node):
  7. nodelist = node.childNodes
  8. rc = []
  9. for node in nodelist:
  10. if node.nodeType == node.TEXT_NODE:
  11. rc.append(str(node.data))
  12. return rc
  13. def _getNodesAsTags(root):
  14. nodelist = root.childNodes
  15. tags = []
  16. for node in nodelist:
  17. if node.nodeType == node.ELEMENT_NODE:
  18. tags.append(Tag(node))
  19. return tags
  20. class Tag(object):
  21. def __init__(self, node):
  22. self.node = node
  23. self.name = node.nodeName
  24. self.contents = _getNodesAsTags(self.node)
  25. text = _getText(self.node)
  26. self.contents += text
  27. self.text = ''.join(text)
  28. def child_elements(self):
  29. children = []
  30. for tag in self.contents:
  31. if isinstance(tag, Tag):
  32. children.append(tag)
  33. return children
  34. def get(self, tagname):
  35. got = self.first(tagname)
  36. if got:
  37. return got.text
  38. def first(self, tagname):
  39. found = None
  40. for tag in self.contents:
  41. if isinstance(tag, Tag):
  42. if tag.name == tagname:
  43. found = tag
  44. break
  45. return found
  46. class BeautifulSupe(object):
  47. def __init__(self, data):
  48. #please don't give us your null terminators
  49. data = data.strip(chr(0))
  50. self.dom = xml.dom.minidom.parseString(data)
  51. def first(self, tagname, root = None):
  52. found = None
  53. if root == None:
  54. e = self.dom.getElementsByTagName(tagname)
  55. if len(e) > 0:
  56. found = e[0]
  57. else:
  58. for node in root.childNodes:
  59. if node.nodeName == tagname:
  60. found = node
  61. break
  62. if not found:
  63. return None
  64. tag = Tag(found)
  65. return tag
  66. def fetch(self, tagname, restraints = {}):
  67. e = self.dom.getElementsByTagName(tagname)
  68. matches = []
  69. for node in e:
  70. match = 1
  71. for restraint in restraints:
  72. f = self.first(restraint, node)
  73. if not f:
  74. match = 0
  75. break
  76. text = restraints[restraint]
  77. if not f.contents[0].startswith(text):
  78. match = 0
  79. break
  80. if match:
  81. tag = Tag(node)
  82. matches.append(tag)
  83. return matches
  84. def scour(self, prefix, suffix = None, node = None):
  85. if node is None:
  86. root = self.dom.getElementsByTagName(self.dom.documentElement.tagName)[0]
  87. node = root
  88. matches = []
  89. for node in node.childNodes:
  90. match = 0
  91. name = node.nodeName
  92. if name.startswith(prefix):
  93. if suffix:
  94. if name.endswith(suffix):
  95. match = 1
  96. else:
  97. match = 1
  98. if match:
  99. tag = Tag(node)
  100. matches.append(tag)
  101. matches += self.scour(prefix, suffix, node)
  102. return matches