PageRenderTime 42ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/cola/core/extractor/preprocess.py

https://github.com/Flowerowl/cola
Python | 137 lines | 132 code | 0 blank | 5 comment | 0 complexity | 309f281503066e988c8b15c466803ce5 MD5 | raw file
Possible License(s): Apache-2.0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. '''
  4. Copyright (c) 2013 Qin Xuye <qin@qinxuye.me>
  5. Licensed under the Apache License, Version 2.0 (the "License");
  6. you may not use this file except in compliance with the License.
  7. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. Created on 2013-6-16
  15. @author: Chine
  16. '''
  17. import re
  18. from cola.core.logs import get_logger
  19. from cola.core.utils import beautiful_soup
  20. from cola.core.extractor.utils import absolute_url
  21. __all__ = ['PreProcessor']
  class Replacement(object):
      '''
      A described regular-expression substitution.

      Bundles a short description with a pattern object and the value
      substituted for each match of that pattern.
      '''

      def __init__(self, desc, regex, replacement):
          # desc:        label saying what this rule cleans up
          # regex:       object exposing ``sub(repl, string)`` (a compiled pattern)
          # replacement: handed to ``regex.sub`` as the substitute
          self.desc, self.regex, self.replacement = desc, regex, replacement

      def apply(self, content):
          '''Return ``content`` with every match of the pattern substituted.'''
          substitute = self.regex.sub
          return substitute(self.replacement, content)
  # A bunch of regexes to hack around lousy html. Each rule is a Replacement
  # whose description says what it repairs or removes.
  dodgy_regexes = (
      # Drop <script> elements together with their contents.
      Replacement('javascript',
                  regex=re.compile(r'<script.*?</script[^>]*>',
                                   re.DOTALL | re.IGNORECASE),
                  replacement=''),
      # ="value""  ->  ="value"
      Replacement('double double-quoted attributes',
                  regex=re.compile(r'(="[^"]+")"+'),
                  replacement='\\1'),
      # <a href="x"<b>  ->  <a href="x"><b>
      Replacement('unclosed tags',
                  regex=re.compile(r'(<[a-zA-Z]+[^>]*)(<[a-zA-Z]+[^<>]*>)'),
                  replacement='\\1>\\2'),
      # <img width="100 height="5">  ->  <img width="100" height="5">
      # Raw string: ``\s`` and ``\w`` must reach the regex engine untouched.
      Replacement('unclosed (numerical) attribute values',
                  regex=re.compile(
                      r'(<[^>]*[a-zA-Z]+\s*=\s*"[0-9]+)( [a-zA-Z]+="\w+"|/?>)'),
                  replacement='\\1"\\2'),
      # Non-greedy ``.*?`` so that comments containing hyphens (and empty
      # comments) are removed too; DOTALL lets a comment span several lines.
      Replacement('comment',
                  regex=re.compile(r'<!--.*?-->', re.DOTALL),
                  replacement=''),
  )
  # Strip out a set of nuisance html attributes that can mess up rendering
  # in RSS feeds. Entries of bad_attrs are regex fragments, not plain names.
  bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*']
  single_quoted = "'[^']+'"
  double_quoted = '"[^"]+"'
  non_space = '[^ "\'>]+'
  # Group 1 is the tag text before the unwanted attribute, group 2 the text
  # after it; substituting '<\\1\\2>' drops that one attribute from the tag.
  htmlstrip = re.compile(
      ''.join([
          '<',                                    # open
          '([^>]+) ',                             # prefix
          '(?:' + '|'.join(bad_attrs) + ') *',    # undesirable attributes
          '= *(?:' + '|'.join((non_space,
                               single_quoted,
                               double_quoted)) + ')',  # value
          '([^>]*)',                              # postfix
          '>',                                    # end
      ]),
      re.I)
  class PreProcessor(object):
      '''
      Clean up raw HTML and split it into a title and a parsed body.

      Call :meth:`process` to run the whole pipeline; it returns a
      ``(title, body)`` pair.
      '''

      def __init__(self, html, base_url=None, logger=None):
          '''
          :param html: raw markup to process.
          :param base_url: passed to ``absolute_url`` for every link ``href``
                           and image ``src``; may also be given to
                           :meth:`process` instead.
          :param logger: logger to use; when ``None`` the ``cola_extractor``
                         logger is obtained from ``get_logger``.
          '''
          if logger is None:
              logger = get_logger(name='cola_extractor')
          self.logger = logger
          self.html = html
          self.base_url = base_url
          # Set by process(); defined here so the attribute always exists.
          self.soup = None

      def _remove_crufy_html(self, html):
          '''Apply every rule in ``dodgy_regexes`` to ``html``, in order.'''
          for rule in dodgy_regexes:
              html = rule.apply(html)
          return html

      def _fix_absolute_links(self, base_url):
          '''Rewrite the ``href`` of every ``<a>`` in ``self.soup`` via ``absolute_url``.'''
          for link in self.soup.find_all('a', href=True):
              link['href'] = absolute_url(link['href'], base_url)

      def _fix_absolute_images(self, base_url):
          '''Rewrite the ``src`` of every ``<img>`` in ``self.soup`` via ``absolute_url``.'''
          for image in self.soup.find_all('img', src=True):
              image['src'] = absolute_url(image['src'], base_url)

      def _fix_references(self, base_url):
          '''Rewrite both link and image references against ``base_url``.'''
          self._fix_absolute_links(base_url)
          self._fix_absolute_images(base_url)

      def _normalize_space(self, s):
          '''Collapse every whitespace run in ``s`` to one space and trim the ends.'''
          return ' '.join(s.split())

      def get_title(self, soup):
          '''
          Return the text of the ``<title>`` inside ``<head>`` with whitespace
          normalized, or ``''`` when there is no head or no title.
          '''
          head = soup.head
          if head is None or head.title is None:
              return ''
          return self._normalize_space(head.title.text)

      def _clean_attributes(self, html):
          '''
          Remove the attributes matched by ``htmlstrip`` from ``html``.

          One substitution consumes a whole tag and drops a single attribute
          from it, so passes are repeated until one changes nothing.
          '''
          while True:
              # subn reports the number of replacements, which avoids a
              # separate search() scan before each substitution pass.
              html, changed = htmlstrip.subn('<\\1\\2>', html)
              if not changed:
                  return html

      def get_body(self, soup):
          '''
          Return a freshly parsed body with scripts, links, styles and the
          nuisance attributes removed.

          Note that ``soup`` itself is modified: its ``script``, ``link`` and
          ``style`` elements are extracted from the tree.
          '''
          for elem in soup.find_all(['script', 'link', 'style']):
              elem.extract()
          try:
              to_text = unicode
          except NameError:
              # Python 3 has no ``unicode``; ``str`` is the text type there.
              to_text = str
          # Fall back to the whole document when there is no <body>.
          raw_html = to_text(soup.body or soup)
          cleaned = self._clean_attributes(raw_html)
          # Pass the logger along, as process() does for the first parse.
          return beautiful_soup(cleaned, self.logger)

      def process(self, base_url=None):
          '''
          Run the full clean-up and return ``(title, body)``.

          :param base_url: used only when no truthy ``base_url`` was given to
                           the constructor; with neither, references are left
                           untouched.
          '''
          self.html = self._remove_crufy_html(self.html)
          self.soup = beautiful_soup(self.html, self.logger)
          base_url = self.base_url or base_url
          if base_url is not None:
              self._fix_references(base_url)
          title = self.get_title(self.soup)
          body = self.get_body(self.soup)
          return title, body