# PageRenderTime 52ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms
#
# /www.aaronsw.com/2002/html2text/html2text-1.0.py
#
# https://github.com/jedahan/aaronsw-mirror
# Python | 240 lines | 226 code | 8 blank | 6 comment | 39 complexity | b3f7ea1f22dec5a906f7f51d15ce46ab MD5 | raw file
  1. #!/usr/bin/python2.2
  2. """HTML2Text: Converts HTML to clean and readable plain text."""
  3. __author__ = "Aaron Swartz, based on code by Aaron Swartz and Lars Pind"
  4. __copyright__ = "(C) 2002 Aaron Swartz. GNU GPL 2"
  5. import re, urlparse
  6. re_ftag = re.compile(r'(/?)([^\s]+)(.*)', re.I|re.M|re.S)
  7. re_href = re.compile(r'(href|src)\s*=\s*["\']([^"\']+)["\']', re.I|re.M|re.S)
  8. re_href2 = re.compile(r'(href|src)\s*=\s*([^ ]+)', re.I|re.M|re.S)
  9. re_title = re.compile(r'(title|alt)\s*=\s*["\']([^"\']+)["\']', re.I|re.M|re.S)
  10. re_title2 = re.compile(r'(title|alt)\s*=\s*([^ ]+)', re.I|re.M|re.S)
  11. re_comments = re.compile(r'<!--.*?-->', re.I|re.M|re.S)
  12. def intEnt(m):
  13. m = int(m.groups(1)[0])
  14. return unichr(m).encode('utf-8')
  15. def xEnt(m):
  16. m = int(m.groups(1)[0], 16)
  17. return unichr(m).encode('utf-8')
  18. def expandEntities(text):
  19. text = text.replace("&lt;", "<")
  20. text = text.replace("&gt;", ">")
  21. text = text.replace("&quot;", '"')
  22. text = text.replace("&ob;", "{")
  23. text = text.replace("&cb;", "}")
  24. text = text.replace("&middot;", "*")
  25. text = re.sub("&[rl]squo;", "'", text)
  26. text = re.sub("&[rl]dquo;", '"', text)
  27. text = re.sub("&([aeiou])(grave|acute|circ|tilde|uml|ring);", lambda m: m.groups(1)[0], text)
  28. text = re.sub(r'&#(\d+);', intEnt, text)
  29. text = re.sub(r'&#[Xx](\w+);', xEnt, text)
  30. text = re.sub("&(#169|copy);", "(C)", text)
  31. text = re.sub("&mdash;", "--", text)
  32. return text
  33. class _html2text:
  34. def __call__(self, html, basehref, maxlen=80, showtags=0, showlinks=1):
  35. self.text, self.line, self.maxlen = '', '', maxlen
  36. self.pre, self.p, self.br, self.blockquote, self.space = 0, 0, 0, 0, 0
  37. last_tag_end = 0
  38. href_urls, href_stack = [], []
  39. # remove comments
  40. html = re.sub(re_comments, "", html)
  41. i = html.find('<')
  42. while i != -1:
  43. self.output(html[last_tag_end:i])
  44. # we're inside a tag, find the end
  45. # make i point to the char after the <
  46. tag_start = i + 1
  47. in_quote = 0
  48. for c in html[i:]:
  49. i += 1
  50. if c == ">" and not in_quote: break
  51. if c == '"' and not in_quote: in_quote = 1
  52. if c == '"' and in_quote: in_quote = 0
  53. i -= 1
  54. full_tag = html[tag_start:i]
  55. s = re.findall(re_ftag, full_tag)
  56. if s:
  57. s = s[0]
  58. slash, tagname, attributes = s[0], s[1], s[2]
  59. # valid tag
  60. t = tagname.lower()
  61. if t in ['p', 'ul', 'ol', 'table', 'div']:
  62. self.p = 1
  63. elif t == ["span", 'tbody']: pass
  64. elif t == 'br':
  65. self.text += self.line + '\n'
  66. self.line = " " * self.blockquote
  67. elif t in ['tr', 'td', 'th']:
  68. self.br = 1
  69. elif t == "title":
  70. if slash:
  71. self.p = 1
  72. else:
  73. self.output("TITLE: ")
  74. elif re.match(r'h\d+', t):
  75. if not slash: self.p = 1
  76. out = "=" * int(t[1:])
  77. if slash:
  78. out = ' ' + out
  79. else:
  80. out += ' '
  81. self.output(out)
  82. del out
  83. if slash: self.p = 1
  84. elif t == 'li':
  85. self.br = 1
  86. if not slash:
  87. self.output(" -")
  88. self.line += ' '
  89. elif t in ['strong', 'b']:
  90. self.output('*')
  91. elif t in ['em', 'i', 'cite']:
  92. self.output('_')
  93. elif t == 'a' and showlinks:
  94. if not slash:
  95. href = re.findall(re_href, attributes) or re.findall(re_href2, attributes)
  96. title = re.findall(re_title, attributes) or re.findall(re_title2, attributes)
  97. if href:
  98. href = href[0][1].replace("\n", "").replace("\r", "")
  99. href_no = len(href_urls) + 1
  100. if title:
  101. href_urls.append((href, expandEntities(title[0][1])))
  102. else:
  103. href_urls.append((href, ""))
  104. href_stack.append("["+`href_no`+"]")
  105. else:
  106. href_stack.append("")
  107. else:
  108. if len(href_stack) > 0:
  109. if href_stack[-1]:
  110. self.output(href_stack[-1])
  111. href_stack.pop()
  112. elif t == 'pre':
  113. self.p = 1
  114. if not slash:
  115. self.pre += 1
  116. else:
  117. self.pre -= 1
  118. elif t == 'blockquote':
  119. self.p = 1
  120. if not slash:
  121. self.blockquote += 1
  122. else:
  123. self.blockquote -= 1
  124. elif t == "hr":
  125. self.p = 1
  126. self.output("-" * maxlen)
  127. self.p = 1
  128. elif t == "img":
  129. self.output("[IMG")
  130. href = re.findall(re_href, attributes) or re.findall(re_href2, attributes)
  131. title = re.findall(re_title, attributes) or re.findall(re_title2, attributes)
  132. if href:
  133. href = urlparse.urljoin(basehref, href[0][1].replace("\n", "").replace("\r", ""))
  134. self.output(": " + href)
  135. if title:
  136. self.output(" ("+ expandEntities(title[0][1]) + ")")
  137. self.output("]")
  138. else:
  139. if showtags:
  140. self.output("&lt;"+slash+tagname+attributes+"&gt;")
  141. # set end of last tag to the character following the >
  142. last_tag_end = i + 1
  143. i = html.find("<", i)
  144. # append everything after the last tag
  145. self.output(html[last_tag_end:])
  146. # close all pre tags
  147. self.pre, self.blockquote = 0, 0
  148. self.text += self.line + "\n"
  149. if showlinks:
  150. i = 0
  151. for u in href_urls:
  152. i += 1
  153. self.text += "\n[" + `i` + "]" + (' ' * (len(`len(href_urls)`) - len(`i`) + 1)) + \
  154. urlparse.urljoin(basehref, u[0])
  155. if u[1]:
  156. self.text += "\n " + (' ' * len(`len(href_urls)`)) + u[1]
  157. self.text = self.text.replace("&nbsp;", " ")
  158. self.text = self.text.replace("&amp;", "&")
  159. return self.text
  160. def output(self, text):
  161. text = expandEntities(text)
  162. if self.line == '' and text.isspace(): return
  163. # output the text:
  164. if self.pre <= 0:
  165. # we're not inside a PRE tag
  166. text = re.sub("\s+", " ", text)
  167. if text == ' ': self.space = 1; return
  168. if self.space and self.line != " " * self.blockquote: self.line += " "; self.space = 0
  169. i, l = 0, text.split(' ')
  170. self.dumpbuffer()
  171. for word in l:
  172. word = re.sub("&(nsbp|#160);", " ", word)
  173. if len(self.line) > 0:
  174. if len(self.line) + 1 + len(word) > self.maxlen:
  175. # the next word goes past our maxline, break here
  176. self.text += self.line + '\n'
  177. self.line = " " * self.blockquote
  178. self.line = self.line + word
  179. if i != (len(l) - 1) and self.line != " " * self.blockquote: self.line += " "
  180. i += 1
  181. else:
  182. self.text += self.line
  183. self.line = ''
  184. self.dumpbuffer()
  185. # we are inside a pre tag
  186. if self.blockquote:
  187. # break up by lines and indent
  188. for line in text.split('\n')[:-1]:
  189. self.text += line + '\n' + (' ' * self.blockquote)
  190. self.text += text.split('\n')[-1] # last line, don't add a line break
  191. else:
  192. self.text += text
  193. def dumpbuffer(self):
  194. if self.p or self.br:
  195. # we're going to add some newlines, so empty line buffer
  196. self.text += self.line
  197. if self.text != '': # not the first thing
  198. if self.p:
  199. self.text += "\n\n"
  200. elif self.br:
  201. self.text += "\n"
  202. self.line = " " * self.blockquote
  203. self.p, self.br = 0, 0
  204. html2text = _html2text()
  205. if __name__ == "__main__":
  206. import cgitb; cgitb.enable()
  207. import sys, urllib, cgi
  208. if len(sys.argv) > 1:
  209. url = sys.argv[1]
  210. elif 'url' in cgi.FieldStorage().keys():
  211. import cgitb; cgitb.enable();
  212. url = cgi.FieldStorage()['url'].value
  213. print "Content-type: text/plain; charset=utf-8"
  214. print
  215. else:
  216. print "Content-type: text/plain; charset=utf-8"
  217. print
  218. url = "http://www.aaronsw.com/"
  219. maxlen=80
  220. if 'maxlen' in cgi.FieldStorage().keys():
  221. maxlen=cgi.FieldStorage()['maxlen'].value
  222. print html2text(urllib.urlopen(url).read(), url, maxlen=maxlen).encode('utf-8')