PageRenderTime 45ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/modules/html_filter.py

http://n23.googlecode.com/
Python | 548 lines | 472 code | 31 blank | 45 comment | 15 complexity | 4fe8a56105c618134dbd19b55ddec591 MD5 | raw file
Possible License(s): Apache-2.0, LGPL-2.1
  1. # -*- coding: utf-8 -*-
  2. """
  3. A Python HTML filtering library - html_filter.py, v 1.15.4
  4. Translated to Python by Samuel Adam <samuel.adam@gmail.com>
  5. http://amisphere.com/contrib/python-html-filter/
  6. Original PHP code ( lib_filter.php, v 1.15 ) by Cal Henderson <cal@iamcal.com>
  7. http://iamcal.com/publish/articles/php/processing_html/
  8. http://iamcal.com/publish/articles/php/processing_html_part_2/
  9. This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
  10. http://creativecommons.org/licenses/by-sa/2.5/
  11. """
  12. import re
  13. from cgi import escape
  14. from HTMLParser import HTMLParser
  15. class html_filter:
  16. """
  17. html_filter removes HTML tags that do not belong to a white list
  18. closes open tags and fixes broken ones
  19. removes javascript injections and black listed URLs
  20. makes text URLs and emails clickable
  21. adds rel="no-follow" to links except for white list
  22. default settings are based on Flickr's "Some HTML is OK"
  23. http://www.flickr.com/html.gne
  24. HOWTO
  25. 1. Basic example
  26. from html_filter import html_filter
  27. filter = html_filter()
  28. #change settings to meet your needs
  29. filter.strip_comments = False
  30. filter.allowed['br'] = ()
  31. filter.no_close += 'br',
  32. raw_html = '<p><strong><br><!-- Text to filter !!!<div></p>'
  33. # go() is a shortcut to apply the most common methods
  34. filtered_html = filter.go(raw_html)
  35. # returns <strong><br />&lt;!-- Text to filter !!!</strong>
  36. 2. You can only use one method at a time if you like
  37. from html_filter import html_filter
  38. filter = html_filter()
  39. please_dont_scream_this_is_a_pop_contest = filter.fix_case('HARD ROCK ALELUYAH!!!')
  40. # returns Hard rock aleluyah!!!
  41. filter.break_words_longer_than = 30
  42. wordwrap_text = filter.break_words('MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM...')
  43. # adds html entity "&#8203;" (zero width space) each 30 characters
  44. """
  45. def __init__(self):
  46. ### START Default Config ###
  47. # tags and attributes that are allowed
  48. self.allowed = {
  49. 'a': ('href', 'target'),
  50. 'b': (),
  51. 'blockquote': (),
  52. 'em': (),
  53. 'i': (),
  54. 'img': ('src', 'width', 'height', 'alt', 'title'),
  55. 'strong': (),
  56. 'u': (),
  57. }
  58. # tags which should always be self-closing (e.g. "<img />")
  59. self.no_close = (
  60. 'img',
  61. )
  62. # tags which must always have seperate opening and closing tags (e.g. "<b></b>")
  63. self.always_close = (
  64. 'a',
  65. 'b',
  66. 'blockquote',
  67. 'em',
  68. 'i',
  69. 'strong',
  70. 'u',
  71. )
  72. # tags which should be removed if they contain no content (e.g. "<b></b>" or "<b />")
  73. self.remove_blanks = (
  74. 'a',
  75. 'b',
  76. 'blockquote',
  77. 'em',
  78. 'i',
  79. 'strong',
  80. 'u',
  81. )
  82. # attributes which should be checked for valid protocols
  83. self.protocol_attributes = (
  84. 'src',
  85. 'href',
  86. )
  87. # protocols which are allowed
  88. self.allowed_protocols = (
  89. 'http',
  90. 'https',
  91. 'ftp',
  92. 'mailto',
  93. )
  94. # forbidden urls ( regular expressions ) are replaced by #
  95. self.forbidden_urls = (
  96. r'^/delete-account',
  97. r'^domain.ext/delete-account',
  98. )
  99. # should we make urls clickable ?
  100. self.make_clickable_urls = True
  101. # should we add a rel="nofollow" to the links ?
  102. self.add_no_follow = True
  103. # except for those domains
  104. self.follow_for = (
  105. 'allowed-domain.ext',
  106. )
  107. # should we remove comments?
  108. self.strip_comments = True
  109. # should we removes blanks from beginning and end of data ?
  110. self.strip_data = True
  111. # should we try and make a b tag out of "b>"
  112. self.always_make_tags = False
  113. # entity control options
  114. self.allow_numbered_entities = True
  115. self.allowed_entities = (
  116. 'amp',
  117. 'gt',
  118. 'lt',
  119. 'quot',
  120. )
  121. # should we "break" words longer than x chars ( 0 means "No", minimum is 8 chars )
  122. self.break_words_longer_than = 0
  123. ### END Default Config ###
  124. # INIT
  125. self.tag_counts = {}
  126. # pre-compile some regexp patterns
  127. self.pat_entities = re.compile(r'&([^&;]*)(?=(;|&|$))')
  128. self.pat_quotes = re.compile(r'(>|^)([^<]+?)(<|$)', re.DOTALL|re.IGNORECASE)
  129. self.pat_valid_entity = re.compile(r'^#([0-9]+)$', re.IGNORECASE)
  130. self.pat_decode_entities_dec = re.compile(r'(&)#(\d+);?')
  131. self.pat_decode_entities_hex = re.compile(r'(&)#x([0-9a-f]+);?', re.IGNORECASE)
  132. self.pat_decode_entities_hex2 = re.compile(r'(%)([0-9a-f]{2});?', re.IGNORECASE)
  133. self.pat_entities2 = re.compile(r'&([^&;]*);?', re.IGNORECASE)
  134. self.pat_raw_url = re.compile('(('+'|'.join(self.allowed_protocols)+')://)(([a-z0-9](?:[a-z0-9\\-]*[a-z0-9])?\\.)+(com\\b|edu\\b|biz\\b|gov\\b|in(?:t|fo)\\b|mil\\b|net\\b|org\\b|[a-z][a-z]\\b)|((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])))(:\\d+)?(/[-a-z0-9_:\\\\@&?=+,\\.!/~*\'%\\$]*)*(?<![.,?!])(?!((?!(?:<a )).)*?(?:</a>))(?!((?!(?:<!--)).)*?(?:-->))', re.IGNORECASE)
  135. #
  136. def go(self, data):
  137. data = self.strip_whitespace(data)
  138. data = self.escape_comments(data)
  139. data = self.balance_html(data)
  140. data = self.clickable_urls(data)
  141. data = self.check_tags(data)
  142. data = self.process_remove_blanks(data)
  143. data = self.validate_entities(data)
  144. data = self.break_words(data)
  145. return data
  146. #
  147. def strip_whitespace(self, data):
  148. if self.strip_data:
  149. data = data.strip()
  150. return data
  151. #
  152. def escape_comments(self, data):
  153. pat = re.compile(r'<!--(.*?)-->', re.IGNORECASE)
  154. data = re.sub(pat, self.f0, data)
  155. return data
  156. def f0(self, m):
  157. return '<!--'+escape(m.group(1), True)+'-->'
  158. #
  159. def balance_html(self, data):
  160. # try and form html
  161. if self.always_make_tags:
  162. data = re.sub(r'>>+', r'>', data)
  163. data = re.sub(r'<<+', r'<', data)
  164. data = re.sub(r'^>', r'', data)
  165. data = re.sub(r'<([^>]*?)(?=<|$)', r'<\1>', data)
  166. data = re.sub(r'(^|>)([^<]*?)(?=>)', r'\1<\2', data)
  167. else:
  168. data = data.replace('<>', '&lt;&gt;') # <> as text
  169. data = self.re_sub_overlap(r'<([^>]*?)(?=<|$)', r'&lt;\1', data)
  170. data = self.re_sub_overlap(r'(^|>)([^<]*?)(?=>)', r'\1\2&gt;<', data)
  171. data = re.sub(r'<(\s)+?', r'&lt;\1', data) # consider "< a href" as "&lt; a href"
  172. # this filter introduces an error, so we correct it
  173. data = data.replace('<>', '')
  174. return data
  175. # python re.sub() doesn't overlap matches
  176. def re_sub_overlap(self, pat, repl, data, i=0):
  177. data_temp = re.sub(pat, repl, data[i:])
  178. if data_temp != data[i:]:
  179. data = data[:i] + data_temp
  180. i += 1
  181. data = self.re_sub_overlap(pat, repl, data, i)
  182. return data
  183. #
  184. def clickable_urls(self, data):
  185. if self.make_clickable_urls:
  186. # urls
  187. # pat = re.compile('(('+'|'.join(self.allowed_protocols)+')://)(([a-z0-9](?:[a-z0-9\\-]*[a-z0-9])?\\.)+(com\\b|edu\\b|biz\\b|gov\\b|in(?:t|fo)\\b|mil\\b|net\\b|org\\b|[a-z][a-z]\\b)|((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])))(:\\d+)?(/[-a-z0-9_:\\\\@&?=+,\\.!/~*\'%\\$]*)*(?<![.,?!])(?!((?!(?:<a )).)*?(?:</a>))(?!((?!(?:<!--)).)*?(?:-->))', re.IGNORECASE)
  188. data = re.sub(self.pat_raw_url, self.f7, data)
  189. # emails
  190. if 'mailto' in self.allowed_protocols:
  191. pat = re.compile(r'((([a-z]|[0-9]|!|#|$|%|&|\'|\*|\+|\-|/|=|\?|\^|_|`|\{|\||\}|~)+(\.([a-z]|[0-9]|!|#|$|%|&|\'|\*|\+|\-|/|=|\?|\^|_|`|\{|\||\}|~)+)*)@((((([a-z]|[0-9])([a-z]|[0-9]|\-){0,61}([a-z]|[0-9])\.))*([a-z]|[0-9])([a-z]|[0-9]|\-){0,61}([a-z]|[0-9])\.(com|edu|gov|int|mil|net|org|biz|info|name|pro|aero|coop|museum|arpa|[a-z]{2}))|(((([0-9]){1,3}\.){3}([0-9]){1,3}))|(\[((([0-9]){1,3}\.){3}([0-9]){1,3})\])))(?!((?!(?:<a )).)*?(?:</a>))(?!((?!(?:<!--)).)*?(?:-->))', re.IGNORECASE)
  192. data = re.sub(pat, self.f8, data)
  193. return data
  194. def f7(self, m):
  195. return '<a href="'+m.group(0)+'">'+m.group(0)+'</a>'
  196. def f8(self, m):
  197. return '<a href="mailto:'+m.group(0)+'">'+m.group(0)+'</a>'
  198. #
  199. def check_tags(self, data):
  200. # compile loop regexps
  201. self.pat_end_tag = re.compile(r'^/([a-z0-9]+)', re.DOTALL|re.IGNORECASE)
  202. self.pat_start_tag = re.compile(r'^([a-z0-9]+)(.*?)(/?)$', re.DOTALL|re.IGNORECASE)
  203. self.pat_matches_2 = re.compile(r'([a-z0-9]+)=(["\'])(.*?)\2', re.DOTALL|re.IGNORECASE) # <foo a="b" />
  204. self.pat_matches_1 = re.compile(r'([a-z0-9]+)(=)([^"\s\']+)', re.DOTALL|re.IGNORECASE) # <foo a=b />
  205. self.pat_matches_3 = re.compile(r'([a-z0-9]+)=(["\'])([^"\']*?)\s*$', re.DOTALL|re.IGNORECASE) # <foo a="b />
  206. self.pat_comments = re.compile(r'^!--(.*)--$', re.DOTALL|re.IGNORECASE)
  207. self.pat_param_protocol = re.compile(r'^([^:]+):', re.DOTALL|re.IGNORECASE)
  208. pat = re.compile(r'<(.*?)>', re.DOTALL)
  209. data = re.sub(pat, self.f1, data)
  210. for tag in self.tag_counts:
  211. count = self.tag_counts[tag]
  212. for i in range(count):
  213. data += '</'+tag+'>'
  214. self.tag_counts = {}
  215. return data
  216. def f1(self, m):
  217. return self.process_tag(m.group(1))
  218. #
  219. def process_tag(self, data):
  220. # ending tags
  221. m = re.match(self.pat_end_tag, data)
  222. if m:
  223. name = m.group(1).lower()
  224. if name in self.allowed:
  225. if name not in self.no_close:
  226. if self.tag_counts.has_key(name):
  227. self.tag_counts[name] -= 1
  228. return '</' + name + '>'
  229. else:
  230. return ''
  231. # starting tags
  232. m = re.match(self.pat_start_tag, data)
  233. if m:
  234. name = m.group(1).lower()
  235. body = m.group(2)
  236. ending = m.group(3)
  237. if name in self.allowed:
  238. params = ''
  239. matches_2 = re.findall(self.pat_matches_2, body) # <foo a="b" />
  240. matches_1 = re.findall(self.pat_matches_1, body) # <foo a=b />
  241. matches_3 = re.findall(self.pat_matches_3, body) # <foo a="b />
  242. matches = {}
  243. for match in matches_3:
  244. matches[match[0].lower()] = match[2]
  245. for match in matches_1:
  246. matches[match[0].lower()] = match[2]
  247. for match in matches_2:
  248. matches[match[0].lower()] = match[2]
  249. for pname in matches:
  250. if pname in self.allowed[name]:
  251. value = matches[pname]
  252. if pname in self.protocol_attributes:
  253. processed_value = self.process_param_protocol(value)
  254. # add no_follow
  255. if self.add_no_follow and name== 'a' and pname == 'href' and processed_value == value:
  256. processed_value = re.sub(self.pat_raw_url, self.f9, processed_value)
  257. value = processed_value
  258. params += ' '+pname+'="'+value+'"'
  259. if name in self.no_close:
  260. ending = ' /'
  261. if name in self.always_close:
  262. ending = ''
  263. if not ending:
  264. if self.tag_counts.has_key(name):
  265. self.tag_counts[name] += 1
  266. else:
  267. self.tag_counts[name] = 1
  268. if ending:
  269. ending = ' /'
  270. return '<'+name+params+ending+'>'
  271. else:
  272. return ''
  273. # comments
  274. m = re.match(self.pat_comments, data)
  275. if m:
  276. if self.strip_comments:
  277. return ''
  278. else:
  279. return '<'+data+'>'
  280. # garbage, ignore it
  281. return ''
  282. def f9(self, m):
  283. if m.group(3) not in self.follow_for:
  284. return m.group()+'" rel="no-follow'
  285. return m.group()
  286. #
  287. def process_param_protocol(self, data):
  288. data = self.decode_entities(data)
  289. m = re.match(self.pat_param_protocol, data)
  290. if m:
  291. if not m.group(1) in self.allowed_protocols:
  292. start = len(m.group(1)) + 1
  293. data = '#' + data[start:]
  294. # remove forbidden urls
  295. for pat in self.forbidden_urls:
  296. m = re.search(pat, data)
  297. if m:
  298. data = '#'
  299. return data
  300. #
  301. def process_remove_blanks(self, data):
  302. for tag in self.remove_blanks:
  303. data = re.sub(r'<'+tag+'(\s[^>]*)?></'+tag+'>', r'', data)
  304. data = re.sub(r'<'+tag+'(\s[^>]*)?/>', r'', data)
  305. return data
  306. #
  307. def strip_tags(self, html):
  308. result = []
  309. parser = HTMLParser()
  310. parser.handle_data = result.append
  311. parser.feed(html)
  312. parser.close()
  313. return ''.join(result)
  314. def fix_case(self, data):
  315. # compile loop regexps
  316. self.pat_case_inner = re.compile(r'(^|[^\w\s\';,\\-])(\s*)([a-z])')
  317. data_notags = self.strip_tags(data)
  318. data_notags = re.sub(r'[^a-zA-Z]', r'', data_notags)
  319. if len(data_notags) < 5:
  320. return data
  321. m = re.search(r'[a-z]', data_notags)
  322. if m:
  323. return data
  324. pat = re.compile(r'(>|^)([^<]+?)(<|$)', re.DOTALL)
  325. data = re.sub(pat, self.f2, data)
  326. return data
  327. def f2(self, m):
  328. return m.group(1)+self.fix_case_inner(m.group(2))+m.group(3)
  329. def fix_case_inner(self, data):
  330. return re.sub(self.pat_case_inner, self.f3, data.lower())
  331. def f3(self, m):
  332. return m.group(1)+m.group(2)+m.group(3).upper()
  333. #
  334. def validate_entities(self, data):
  335. # validate entities throughout the string
  336. data = re.sub(self.pat_entities, self.f4, data)
  337. # validate quotes outside of tags
  338. data = re.sub(self.pat_quotes, self.f5, data)
  339. return data
  340. def f4(self, m):
  341. return self.check_entity(m.group(1), m.group(2))
  342. def f5(self, m):
  343. return m.group(1)+m.group(2).replace('"', '&quot;')+m.group(3)
  344. #
  345. def check_entity(self, preamble, term):
  346. if term != ';':
  347. return '&amp;'+preamble
  348. if self.is_valid_entity(preamble):
  349. return '&'+preamble
  350. return '&amp;'+preamble
  351. def is_valid_entity(self, entity):
  352. m = re.match(self.pat_valid_entity, entity)
  353. if m:
  354. if int(m.group(1)) > 127:
  355. return True
  356. return self.allow_numbered_entities
  357. if entity in self.allowed_entities:
  358. return True
  359. return False
  360. #
  361. # within attributes, we want to convert all hex/dec/url escape sequences into
  362. # their raw characters so that we can check we don't get stray quotes/brackets
  363. # inside strings
  364. def decode_entities(self, data):
  365. data = re.sub(self.pat_decode_entities_dec, self.decode_dec_entity, data)
  366. data = re.sub(self.pat_decode_entities_hex, self.decode_hex_entity, data)
  367. data = re.sub(self.pat_decode_entities_hex2, self.decode_hex_entity, data)
  368. data = self.validate_entities(data)
  369. return data
  370. def decode_hex_entity(self, m):
  371. return self.decode_num_entity(m.group(1), int(m.group(2), 16))
  372. def decode_dec_entity(self, m):
  373. return self.decode_num_entity(m.group(1), int(m.group(2)))
  374. def decode_num_entity(self, orig_type, d):
  375. if d < 0:
  376. d = 32 # space
  377. if d > 127:
  378. if orig_type == '%':
  379. return '%' + hex(d)[2:]
  380. if orig_type == '&':
  381. return '&#'+str(d)+';'
  382. return escape(chr(d))
  383. #
  384. def break_words(self, data):
  385. if self.break_words_longer_than > 0:
  386. pat = re.compile(r'(>|^)([\s]*)([^<]+?)([\s]*)(<|$)', re.DOTALL)
  387. data = re.sub(pat, self.f6, data)
  388. return data
  389. def f6(self, m):
  390. return m.group(1)+m.group(2)+self.break_text(m.group(3))+m.group(4)+m.group(5)
  391. def break_text(self, text):
  392. ret = ''
  393. entity_max_length = 8
  394. if self.break_words_longer_than < entity_max_length:
  395. width = entity_max_length
  396. else:
  397. width = self.break_words_longer_than
  398. for word in text.split(' '):
  399. if len(word) > width:
  400. word = word.replace('&#8203;','')
  401. m = re.search(self.pat_entities2, word[width-entity_max_length:width+entity_max_length])
  402. if m:
  403. width = width - entity_max_length + m.end()
  404. ret += word[0:width] + '&#8203;' + self.break_text(word[width:]) # insert "Zero Width" Space - helps wordwrap
  405. else:
  406. ret += word + ' '
  407. return ret.strip()