PageRenderTime 54ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/script.module.dmd-czech.common/lib/SoupSelector.py

http://dmd-xbmc.googlecode.com/
Python | 323 lines | 314 code | 0 blank | 9 comment | 0 complexity | 3a57dca1ebc6966456ad347a1e9c4297 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0
"""
BeautifulSoup HTML parser extended with CSS selectors.

Supported selectors:

    *               any element
    E               an element of type E
    E.warning       an E element whose class is "warning" (the document
                    language specifies how class is determined)
    E#myid          an E element with ID equal to "myid"
    E[foo]          an E element with a "foo" attribute
    E[foo="bar"]    an E element whose "foo" attribute value is exactly
                    equal to "bar"
    E[foo~="bar"]   an E element whose "foo" attribute value is a list
                    of whitespace-separated values, one of which is
                    exactly equal to "bar"
    E[foo^="bar"]   an E element whose "foo" attribute value begins
                    exactly with the string "bar"
    E[foo$="bar"]   an E element whose "foo" attribute value ends exactly
                    with the string "bar"
    E[foo*="bar"]   an E element whose "foo" attribute value contains
                    the substring "bar"
    E[foo|="en"]    an E element whose "foo" attribute has
                    a hyphen-separated list of values beginning
                    (from the left) with "en"
    E F             an F element descendant of an E element
    E > F           an F element child of an E element
    E + F           an F element immediately preceded by an E element
    E ~ F           an F element preceded by an E element

See also:
    http://www.w3.org/TR/css3-selectors/
"""
# Module metadata.
__author__ = "Tomas Pokorny (tomas.zemres@gmail.com)"
__version__ = "0.1"
__license__ = "GPL"
# Public API: the only names exported by a star-import of this module.
__all__ = ["select", "select_first"]
  35. import re
  36. import BeautifulSoup
# Run of whitespace; compile_selector uses it to split an attribute value
# into words for the ``[attr~="value"]`` operator.
RE_SPACE = re.compile(r'\s+')
# Combinator between two compound selectors: either one of '>', '+', '~'
# (captured in group 1, optional whitespace on both sides) or plain
# whitespace, in which case group 1 is None (descendant combinator).
RE_SEP = re.compile(r'\s*([>+~])\s*|\s+')
# Element type name, or the universal selector '*'.
RE_TAG = re.compile(r'[\w-]+|[*]', re.I)
# Class ('.name') or id ('#name') suffix: group 1 is the symbol,
# group 2 is the name.
RE_EXT = re.compile(r'([.#])([\w-]+)', re.I)
# Attribute selector: '[attr]' or '[attr<op>=value]'.
#   attr    - attribute name
#   match   - the whole '<op>=value' part; None for the bare '[attr]' form
#   op      - optional operator character in front of '=' (one of ~ ^ $ * |)
#   pattern - the value, either wrapped in double quotes or unquoted
# NOTE: a single-quoted value is not accepted (the unquoted alternative
# excludes both quote characters), and a quoted value must be non-empty.
RE_ATTR = re.compile(r'''
\[ # left backet
(?P<attr>[\w-]+) # attribute name
(?P<match> # with/without pattern/value
(?P<op>[~^$*|]?) = # operator
(?P<pattern> # pattern with/without quotes
"[^"]+" | [^"'\]]+
)
)?
\] # right bracket
''', re.I | re.X)
  52. class SelectorError(Exception):
  53. pass
  54. class ResultList(list):
  55. pass
  56. def filter_to_callable(filter_def):
  57. """ Convert filter definition to lamba function """
  58. if callable(filter_def):
  59. return filter_def
  60. if filter_def is True:
  61. return lambda v: v is not None
  62. if hasattr(filter_def, 'match'):
  63. # regexp object
  64. def mkFilterClosure(pattern):
  65. return lambda v: v is not None and pattern.search(v)
  66. return mkFilterClosure(filter_def)
  67. if hasattr(filter_def, '__iter__'):
  68. def mkFilterClosure(pattern):
  69. return lambda v: v in pattern
  70. return mkFilterClosure(filter_def)
  71. if isinstance(filter_def, basestring):
  72. def mkFilterClosure(pattern):
  73. return lambda v: v is not None and v == pattern
  74. return mkFilterClosure(filter_def)
  75. raise Exception("Invalid filter_def value: " + repr(filter_def))
  76. def update_filters(target_dict, **kwargs):
  77. """ Recursive extend filters dictionary given in first argument
  78. by keyword args
  79. """
  80. for key, value in kwargs.items():
  81. if target_dict.has_key(key):
  82. if isinstance(target_dict ,dict) and isinstance(value, dict):
  83. update_filters( target_dict[ key ], **value)
  84. else:
  85. # "AND" filters:
  86. old_filter = filter_to_callable( target_dict[key] )
  87. new_filter = filter_to_callable( value )
  88. target_dict[key] = lambda v: old_filter(v) and new_filter(v)
  89. else:
  90. target_dict[key] = value
  91. def compile_selector(selector):
  92. """ Compile CSS selector string to lits of filter parameters """
  93. outList = []
  94. filters = {}
  95. partno = 0
  96. m = None
  97. part = selector
  98. while part:
  99. if m:
  100. part = part[m.end():]
  101. if not part and partno:
  102. outList.append( filters )
  103. return outList # return valid output
  104. if partno:
  105. m = RE_SEP.match(part)
  106. if m:
  107. partno = 0
  108. op = m.group(1)
  109. outList.append(filters)
  110. filters = {}
  111. if op == '>':
  112. # E > F -- an F element child of an E element
  113. filters['recursive'] = False
  114. elif op == '+':
  115. # E + F -- an F element immediately preceded by an E element
  116. def immediateNextSibling(content, **kwargs):
  117. immediateNext = content.findNextSibling()
  118. if immediateNext:
  119. found = content.findNextSibling(**kwargs)
  120. if found and id(found) == id(immediateNext):
  121. return [ found ]
  122. return []
  123. filters['call'] = immediateNextSibling
  124. elif op == '~':
  125. # E ~ F -- an F element preceded by an E element
  126. filters['call'] = 'findNextSiblings'
  127. elif op:
  128. break # error
  129. continue # next part
  130. partno += 1
  131. if partno == 1:
  132. m = RE_TAG.match( part )
  133. if m:
  134. # Filter tag name or *
  135. tag = m.group(0)
  136. if tag != '*':
  137. update_filters(filters, name=tag)
  138. continue # next part
  139. m = RE_EXT.match( part )
  140. if m:
  141. (symbol, key) = m.groups()
  142. if symbol == '#':
  143. # select by attribute class
  144. update_filters(filters, attrs={'id': key})
  145. elif symbol == '.':
  146. # select by attribute id
  147. update_filters(filters, attrs={
  148. 'class': re.compile("(^|\s)%s($|\s)" % key)
  149. })
  150. else:
  151. break # error
  152. continue # next part
  153. m = RE_ATTR.match( part )
  154. if m:
  155. attr = m.group('attr')
  156. if m.group('match'):
  157. (op, pattern) = (m.group('op'), m.group('pattern'))
  158. if pattern.startswith('"') and pattern.endswith('"'):
  159. pattern = pattern[1:-1]
  160. if not op:
  161. # E[foo="bar"]
  162. update_filters(filters, attrs={ attr : pattern })
  163. elif op == '~':
  164. # E[foo~="bar"]
  165. def mkFilterClosure(pattern):
  166. return lambda v: v and pattern in RE_SPACE.split(v)
  167. update_filters(filters, attrs={
  168. attr : mkFilterClosure(pattern)
  169. })
  170. elif op == '^':
  171. # E[foo^="bar"]
  172. def mkFilterClosure(pattern):
  173. return lambda v: v and v.startswith(pattern)
  174. update_filters(filters, attrs={
  175. attr : mkFilterClosure(pattern)
  176. })
  177. elif op == '$':
  178. # E[foo$="bar"]
  179. def mkFilterClosure(pattern):
  180. return lambda v: v and v.endswith(pattern)
  181. update_filters(filters, attrs={
  182. attr : mkFilterClosure(pattern)
  183. })
  184. elif op == '*':
  185. # E[foo*="bar"]
  186. def mkFilterClosure(pattern):
  187. return lambda v: v and (pattern in v)
  188. update_filters(filters, attrs={
  189. attr : mkFilterClosure(pattern)
  190. })
  191. elif op == '|':
  192. # E[foo|="en"]
  193. def mkFilterClosure(pattern):
  194. return lambda v: v and ( v == pattern \
  195. or v.startswith(pattern + '-') )
  196. update_filters(filters, attrs={
  197. attr : mkFilterClosure(pattern)
  198. })
  199. else:
  200. break # error
  201. else:
  202. update_filters(filters, attrs={ attr : True })
  203. # E[foo] - an E element with a "foo" attribute
  204. continue # next part
  205. break # error - any regexp does not match
  206. # Raise invalid selector error:
  207. raise SelectorError("Invalid Selector: " + repr(selector))
  208. def search_call(content, call='findAll', **filters):
  209. if callable(call):
  210. return call(content, **filters)
  211. return getattr(content, call)(**filters)
  212. def select(content, selector, limit=None):
  213. """ Find all elements by CSS selector
  214. Paramters:
  215. content - BeatifulSoup document
  216. selector - CSS selector string
  217. limit - maximum number of returned items
  218. """
  219. compiledList = compile_selector(selector)
  220. #print "Compiled %r ==> %r" % (selector, compiledList)
  221. if not isinstance(content, list):
  222. content = [ content ]
  223. while compiledList:
  224. filters = compiledList.pop(0)
  225. foundList = ResultList()
  226. added = {}
  227. for searchItem in content:
  228. for foundItem in search_call(searchItem, **filters):
  229. # eliminate duplices in result
  230. if not added.has_key( id(foundItem) ):
  231. added[ id(foundItem) ] = 1
  232. foundList.append( foundItem )
  233. if not compiledList and limit == len(foundList):
  234. return foundList
  235. content = foundList
  236. return content
  237. def select_first(content, selector):
  238. """ Find single element by given CSS selector """
  239. foundList = select(content, selector, limit=1)
  240. if foundList:
  241. return foundList[0]
  242. return None # not found
  243. # Extend BeautifulSoup classes and ResultList:
  244. for cls in ( BeautifulSoup.PageElement, BeautifulSoup.ResultSet, ResultList):
  245. cls.select = select
  246. cls.select_first = select_first