PageRenderTime 46ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 1ms

/html2text.py

https://github.com/jonathanmarvens/html2text
Python | 914 lines | 829 code | 51 blank | 34 comment | 144 complexity | 70c359c5e32253307627952e13a23cb0 MD5 | raw file
Possible License(s): GPL-3.0
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
__version__ = "3.200.3"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
# TODO:
# Support decoded entities with unifiable.
# Compatibility shim for ancient Pythons (pre-2.3) that lack the True/False
# builtins: fall back to the integers 1/0.  On any modern interpreter the
# names already exist (True is even a keyword), so the except branch is dead.
try:
    True
except NameError:
    # NOTE(review): __builtins__ is a dict rather than a module in imported
    # modules; setattr would fail there — confirm this file only runs as main
    # on such old interpreters.
    setattr(__builtins__, 'True', 1)
    setattr(__builtins__, 'False', 0)
  14. def has_key(x, y):
  15. if hasattr(x, 'has_key'): return x.has_key(y)
  16. else: return y in x
  17. try:
  18. import htmlentitydefs
  19. import urlparse
  20. import HTMLParser
  21. except ImportError: #Python3
  22. import html.entities as htmlentitydefs
  23. import urllib.parse as urlparse
  24. import html.parser as HTMLParser
  25. try: #Python3
  26. import urllib.request as urllib
  27. except:
  28. import urllib
  29. import optparse, re, sys, codecs, types
  30. try: from textwrap import wrap
  31. except: pass
# Use Unicode characters instead of their ascii psuedo-replacements
UNICODE_SNOB = 0
# Escape all special characters. Output is less readable, but avoids corner case formatting issues.
ESCAPE_SNOB = 0
# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 78
# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = True
# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True
# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36
# Drop <a> links entirely when set
IGNORE_ANCHORS = False
# Drop <img> tags entirely when set
IGNORE_IMAGES = False
# Drop em/strong markers entirely when set
IGNORE_EMPHASIS = False
  50. ### Entity Nonsense ###
  51. def name2cp(k):
  52. if k == 'apos': return ord("'")
  53. if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
  54. return htmlentitydefs.name2codepoint[k]
  55. else:
  56. k = htmlentitydefs.entitydefs[k]
  57. if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
  58. return ord(codecs.latin_1_decode(k)[0])
  59. unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
  60. 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
  61. 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
  62. 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
  63. 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
  64. 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
  65. 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
  66. 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
  67. 'lrm':'', 'rlm':''}
  68. unifiable_n = {}
  69. for k in unifiable.keys():
  70. unifiable_n[name2cp(k)] = unifiable[k]
  71. ### End Entity Nonsense ###
  72. def onlywhite(line):
  73. """Return true if the line does only consist of whitespace characters."""
  74. for c in line:
  75. if c is not ' ' and c is not ' ':
  76. return c is ' '
  77. return line
  78. def hn(tag):
  79. if tag[0] == 'h' and len(tag) == 2:
  80. try:
  81. n = int(tag[1])
  82. if n in range(1, 10): return n
  83. except ValueError: return 0
  84. def dumb_property_dict(style):
  85. """returns a hash of css attributes"""
  86. return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
  87. def dumb_css_parser(data):
  88. """returns a hash of css selectors, each of which contains a hash of css attributes"""
  89. # remove @import sentences
  90. data += ';'
  91. importIndex = data.find('@import')
  92. while importIndex != -1:
  93. data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
  94. importIndex = data.find('@import')
  95. # parse the css. reverted from dictionary compehension in order to support older pythons
  96. elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
  97. try:
  98. elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
  99. except ValueError:
  100. elements = {} # not that important
  101. return elements
  102. def element_style(attrs, style_def, parent_style):
  103. """returns a hash of the 'final' style attributes of the element"""
  104. style = parent_style.copy()
  105. if 'class' in attrs:
  106. for css_class in attrs['class'].split():
  107. css_style = style_def['.' + css_class]
  108. style.update(css_style)
  109. if 'style' in attrs:
  110. immediate_style = dumb_property_dict(attrs['style'])
  111. style.update(immediate_style)
  112. return style
  113. def google_list_style(style):
  114. """finds out whether this is an ordered or unordered list"""
  115. if 'list-style-type' in style:
  116. list_style = style['list-style-type']
  117. if list_style in ['disc', 'circle', 'square', 'none']:
  118. return 'ul'
  119. return 'ol'
  120. def google_has_height(style):
  121. """check if the style of the element has the 'height' attribute explicitly defined"""
  122. if 'height' in style:
  123. return True
  124. return False
  125. def google_text_emphasis(style):
  126. """return a list of all emphasis modifiers of the element"""
  127. emphasis = []
  128. if 'text-decoration' in style:
  129. emphasis.append(style['text-decoration'])
  130. if 'font-style' in style:
  131. emphasis.append(style['font-style'])
  132. if 'font-weight' in style:
  133. emphasis.append(style['font-weight'])
  134. return emphasis
  135. def google_fixed_width_font(style):
  136. """check if the css of the current element defines a fixed width font"""
  137. font_family = ''
  138. if 'font-family' in style:
  139. font_family = style['font-family']
  140. if 'Courier New' == font_family or 'Consolas' == font_family:
  141. return True
  142. return False
  143. def list_numbering_start(attrs):
  144. """extract numbering from list element attributes"""
  145. if 'start' in attrs:
  146. return int(attrs['start']) - 1
  147. else:
  148. return 0
class HTML2Text(HTMLParser.HTMLParser):
    def __init__(self, out=None, baseurl=''):
        """Build a converter.

        out: optional callable receiving output chunks; defaults to the
             internal buffer writer self.outtextf.
        baseurl: base URL used to resolve relative hrefs in reference lists.
        """
        HTMLParser.HTMLParser.__init__(self)

        # Config options -- seeded from the module-level defaults; callers
        # (e.g. main()) override these attributes directly after construction.
        self.unicode_snob = UNICODE_SNOB
        self.escape_snob = ESCAPE_SNOB
        self.links_each_paragraph = LINKS_EACH_PARAGRAPH
        self.body_width = BODY_WIDTH
        self.skip_internal_links = SKIP_INTERNAL_LINKS
        self.inline_links = INLINE_LINKS
        self.google_list_indent = GOOGLE_LIST_INDENT
        self.ignore_links = IGNORE_ANCHORS
        self.ignore_images = IGNORE_IMAGES
        self.ignore_emphasis = IGNORE_EMPHASIS
        self.google_doc = False
        self.ul_item_mark = '*'
        self.emphasis_mark = '_'
        self.strong_mark = '**'

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out

        self.outtextlist = [] # empty list to store output characters before they are "joined"

        # unicode() on Python 2, plain str on Python 3.
        try:
            self.outtext = unicode()
        except NameError: # Python3
            self.outtext = str()

        self.quiet = 0              # >0 suppresses output (head/style/script, strikethrough)
        self.p_p = 0 # number of newline character to print before next output
        self.outcount = 0           # chunks emitted so far (used for link reference ordering)
        self.start = 1              # at start of document/line group; suppress leading breaks
        self.space = 0              # a pending collapsed space
        self.a = []                 # links awaiting reference-style output
        self.astack = []            # open <a> tags
        self.maybe_automatic_link = None
        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
        self.acount = 0             # running link reference number
        # NOTE: shadows the list builtin inside this class; kept for
        # backward compatibility with external code poking at h.list.
        self.list = []              # stack of open ol/ul contexts
        self.blockquote = 0         # blockquote nesting depth
        self.pre = 0                # inside <pre>
        self.startpre = 0           # just entered <pre>
        self.code = False           # inside fixed-width/code emphasis
        self.br_toggle = ''         # set to a hard-break suffix by soft_br()
        self.lastWasNL = 0          # last emitted char was a newline
        self.lastWasList = False
        self.style = 0              # inside <style> (CSS being collected)
        self.style_def = {}         # parsed CSS rules (google_doc mode)
        self.tag_stack = []         # (tag, attrs, style) stack (google_doc mode)
        self.emphasis = 0           # open emphasis mark count (google_doc mode)
        self.drop_white_space = 0   # pending empty-emphasis cleanup counter
        self.inheader = False
        self.abbr_title = None # current abbreviation definition
        self.abbr_data = None # last inner HTML (for abbr being defined)
        self.abbr_list = {} # stack of abbreviations to write later
        self.baseurl = baseurl

        # NOTE(review): this mutates the module-level entity tables so nbsp
        # survives whitespace collapsing via a placeholder; close() swaps the
        # placeholder back.  Shared across all instances -- confirm intended.
        try: del unifiable_n[name2cp('nbsp')]
        except KeyError: pass
        unifiable['nbsp'] = '&nbsp_place_holder;'
    def feed(self, data):
        # Neutralize the literal "</' + 'script>" fragment (as produced by
        # JS document.write-style tag splitting) so the underlying parser
        # does not treat it as a real closing tag.
        data = data.replace("</' + 'script>", "</ignore>")
        HTMLParser.HTMLParser.feed(self, data)

    def handle(self, data):
        """Convert an HTML string to Markdown text (main entry point)."""
        self.feed(data)
        self.feed("")
        return self.optwrap(self.close())
  214. def outtextf(self, s):
  215. self.outtextlist.append(s)
  216. if s: self.lastWasNL = s[-1] == '\n'
  217. def close(self):
  218. HTMLParser.HTMLParser.close(self)
  219. self.pbr()
  220. self.o('', 0, 'end')
  221. self.outtext = self.outtext.join(self.outtextlist)
  222. if self.unicode_snob:
  223. nbsp = unichr(name2cp('nbsp'))
  224. else:
  225. nbsp = u' '
  226. self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)
  227. return self.outtext
    def handle_charref(self, c):
        # Numeric character reference, e.g. &#160; -- resolved then emitted.
        self.o(self.charref(c), 1)

    def handle_entityref(self, c):
        # Named entity reference, e.g. &amp; -- resolved then emitted.
        self.o(self.entityref(c), 1)

    def handle_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def handle_endtag(self, tag):
        self.handle_tag(tag, None, 0)
  236. def previousIndex(self, attrs):
  237. """ returns the index of certain set of attributes (of a link) in the
  238. self.a list
  239. If the set of attributes is not found, returns None
  240. """
  241. if not has_key(attrs, 'href'): return None
  242. i = -1
  243. for a in self.a:
  244. i += 1
  245. match = 0
  246. if has_key(a, 'href') and a['href'] == attrs['href']:
  247. if has_key(a, 'title') or has_key(attrs, 'title'):
  248. if (has_key(a, 'title') and has_key(attrs, 'title') and
  249. a['title'] == attrs['title']):
  250. match = True
  251. else:
  252. match = True
  253. if match: return i
  254. def drop_last(self, nLetters):
  255. if not self.quiet:
  256. self.outtext = self.outtext[:-nLetters]
    def handle_emphasis(self, start, tag_style, parent_style):
        """handles various text emphases

        Google Docs encodes bold/italic/strikethrough/monospace as CSS rather
        than semantic tags; translate any emphasis this element adds relative
        to its parent into Markdown marks.
        """
        tag_emphasis = google_text_emphasis(tag_style)
        parent_emphasis = google_text_emphasis(parent_style)

        # handle Google's text emphasis
        strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
        fixed = google_fixed_width_font(tag_style) and not \
                google_fixed_width_font(parent_style) and not self.pre

        if start:
            # crossed-out text must be handled before other attributes
            # in order not to output qualifiers unnecessarily
            if bold or italic or fixed:
                self.emphasis += 1
            if strikethrough:
                self.quiet += 1
            if italic:
                self.o(self.emphasis_mark)
                self.drop_white_space += 1
            if bold:
                self.o(self.strong_mark)
                self.drop_white_space += 1
            if fixed:
                self.o('`')
                self.drop_white_space += 1
                self.code = True
        else:
            if bold or italic or fixed:
                # there must not be whitespace before closing emphasis mark
                self.emphasis -= 1
                self.space = 0
                self.outtext = self.outtext.rstrip()
            if fixed:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o('`')
                self.code = False
            if bold:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(2)
                    self.drop_white_space -= 1
                else:
                    self.o(self.strong_mark)
            if italic:
                if self.drop_white_space:
                    # empty emphasis, drop it
                    self.drop_last(1)
                    self.drop_white_space -= 1
                else:
                    self.o(self.emphasis_mark)
            # space is only allowed after *all* emphasis marks
            if (bold or italic) and not self.emphasis:
                self.o(" ")
            if strikethrough:
                self.quiet -= 1
  317. def handle_tag(self, tag, attrs, start):
  318. #attrs = fixattrs(attrs)
  319. if attrs is None:
  320. attrs = {}
  321. else:
  322. attrs = dict(attrs)
  323. if self.google_doc:
  324. # the attrs parameter is empty for a closing tag. in addition, we
  325. # need the attributes of the parent nodes in order to get a
  326. # complete style description for the current element. we assume
  327. # that google docs export well formed html.
  328. parent_style = {}
  329. if start:
  330. if self.tag_stack:
  331. parent_style = self.tag_stack[-1][2]
  332. tag_style = element_style(attrs, self.style_def, parent_style)
  333. self.tag_stack.append((tag, attrs, tag_style))
  334. else:
  335. dummy, attrs, tag_style = self.tag_stack.pop()
  336. if self.tag_stack:
  337. parent_style = self.tag_stack[-1][2]
  338. if hn(tag):
  339. self.p()
  340. if start:
  341. self.inheader = True
  342. self.o(hn(tag)*"#" + ' ')
  343. else:
  344. self.inheader = False
  345. return # prevent redundant emphasis marks on headers
  346. if tag in ['p', 'div']:
  347. if self.google_doc:
  348. if start and google_has_height(tag_style):
  349. self.p()
  350. else:
  351. self.soft_br()
  352. else:
  353. self.p()
  354. if tag == "br" and start: self.o(" \n")
  355. if tag == "hr" and start:
  356. self.p()
  357. self.o("* * *")
  358. self.p()
  359. if tag in ["head", "style", 'script']:
  360. if start: self.quiet += 1
  361. else: self.quiet -= 1
  362. if tag == "style":
  363. if start: self.style += 1
  364. else: self.style -= 1
  365. if tag in ["body"]:
  366. self.quiet = 0 # sites like 9rules.com never close <head>
  367. if tag == "blockquote":
  368. if start:
  369. self.p(); self.o('> ', 0, 1); self.start = 1
  370. self.blockquote += 1
  371. else:
  372. self.blockquote -= 1
  373. self.p()
  374. if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
  375. if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
  376. if tag in ['del', 'strike', 's']:
  377. if start:
  378. self.o("<"+tag+">")
  379. else:
  380. self.o("</"+tag+">")
  381. if self.google_doc:
  382. if not self.inheader:
  383. # handle some font attributes, but leave headers clean
  384. self.handle_emphasis(start, tag_style, parent_style)
  385. if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
  386. if tag == "abbr":
  387. if start:
  388. self.abbr_title = None
  389. self.abbr_data = ''
  390. if has_key(attrs, 'title'):
  391. self.abbr_title = attrs['title']
  392. else:
  393. if self.abbr_title != None:
  394. self.abbr_list[self.abbr_data] = self.abbr_title
  395. self.abbr_title = None
  396. self.abbr_data = ''
  397. if tag == "a" and not self.ignore_links:
  398. if start:
  399. if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
  400. self.astack.append(attrs)
  401. self.maybe_automatic_link = attrs['href']
  402. else:
  403. self.astack.append(None)
  404. else:
  405. if self.astack:
  406. a = self.astack.pop()
  407. if self.maybe_automatic_link:
  408. self.maybe_automatic_link = None
  409. elif a:
  410. if self.inline_links:
  411. self.o("](" + escape_md(a['href']) + ")")
  412. else:
  413. i = self.previousIndex(a)
  414. if i is not None:
  415. a = self.a[i]
  416. else:
  417. self.acount += 1
  418. a['count'] = self.acount
  419. a['outcount'] = self.outcount
  420. self.a.append(a)
  421. self.o("][" + str(a['count']) + "]")
  422. if tag == "img" and start and not self.ignore_images:
  423. if has_key(attrs, 'src'):
  424. attrs['href'] = attrs['src']
  425. alt = attrs.get('alt', '')
  426. self.o("![" + escape_md(alt) + "]")
  427. if self.inline_links:
  428. self.o("(" + escape_md(attrs['href']) + ")")
  429. else:
  430. i = self.previousIndex(attrs)
  431. if i is not None:
  432. attrs = self.a[i]
  433. else:
  434. self.acount += 1
  435. attrs['count'] = self.acount
  436. attrs['outcount'] = self.outcount
  437. self.a.append(attrs)
  438. self.o("[" + str(attrs['count']) + "]")
  439. if tag == 'dl' and start: self.p()
  440. if tag == 'dt' and not start: self.pbr()
  441. if tag == 'dd' and start: self.o(' ')
  442. if tag == 'dd' and not start: self.pbr()
  443. if tag in ["ol", "ul"]:
  444. # Google Docs create sub lists as top level lists
  445. if (not self.list) and (not self.lastWasList):
  446. self.p()
  447. if start:
  448. if self.google_doc:
  449. list_style = google_list_style(tag_style)
  450. else:
  451. list_style = tag
  452. numbering_start = list_numbering_start(attrs)
  453. self.list.append({'name':list_style, 'num':numbering_start})
  454. else:
  455. if self.list: self.list.pop()
  456. self.lastWasList = True
  457. else:
  458. self.lastWasList = False
  459. if tag == 'li':
  460. self.pbr()
  461. if start:
  462. if self.list: li = self.list[-1]
  463. else: li = {'name':'ul', 'num':0}
  464. if self.google_doc:
  465. nest_count = self.google_nest_count(tag_style)
  466. else:
  467. nest_count = len(self.list)
  468. self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
  469. if li['name'] == "ul": self.o(self.ul_item_mark + " ")
  470. elif li['name'] == "ol":
  471. li['num'] += 1
  472. self.o(str(li['num'])+". ")
  473. self.start = 1
  474. if tag in ["table", "tr"] and start: self.p()
  475. if tag == 'td': self.pbr()
  476. if tag == "pre":
  477. if start:
  478. self.startpre = 1
  479. self.pre = 1
  480. else:
  481. self.pre = 0
  482. self.p()
  483. def pbr(self):
  484. if self.p_p == 0:
  485. self.p_p = 1
  486. def p(self):
  487. self.p_p = 2
  488. def soft_br(self):
  489. self.pbr()
  490. self.br_toggle = ' '
  491. def o(self, data, puredata=0, force=0):
  492. if self.abbr_data is not None:
  493. self.abbr_data += data
  494. if not self.quiet:
  495. if self.google_doc:
  496. # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
  497. lstripped_data = data.lstrip()
  498. if self.drop_white_space and not (self.pre or self.code):
  499. data = lstripped_data
  500. if lstripped_data != '':
  501. self.drop_white_space = 0
  502. if puredata and not self.pre:
  503. data = re.sub('\s+', ' ', data)
  504. if data and data[0] == ' ':
  505. self.space = 1
  506. data = data[1:]
  507. if not data and not force: return
  508. if self.startpre:
  509. #self.out(" :") #TODO: not output when already one there
  510. if not data.startswith("\n"): # <pre>stuff...
  511. data = "\n" + data
  512. bq = (">" * self.blockquote)
  513. if not (force and data and data[0] == ">") and self.blockquote: bq += " "
  514. if self.pre:
  515. if not self.list:
  516. bq += " "
  517. #else: list content is already partially indented
  518. for i in xrange(len(self.list)):
  519. bq += " "
  520. data = data.replace("\n", "\n"+bq)
  521. if self.startpre:
  522. self.startpre = 0
  523. if self.list:
  524. data = data.lstrip("\n") # use existing initial indentation
  525. if self.start:
  526. self.space = 0
  527. self.p_p = 0
  528. self.start = 0
  529. if force == 'end':
  530. # It's the end.
  531. self.p_p = 0
  532. self.out("\n")
  533. self.space = 0
  534. if self.p_p:
  535. self.out((self.br_toggle+'\n'+bq)*self.p_p)
  536. self.space = 0
  537. self.br_toggle = ''
  538. if self.space:
  539. if not self.lastWasNL: self.out(' ')
  540. self.space = 0
  541. if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
  542. if force == "end": self.out("\n")
  543. newa = []
  544. for link in self.a:
  545. if self.outcount > link['outcount']:
  546. self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
  547. if has_key(link, 'title'): self.out(" ("+link['title']+")")
  548. self.out("\n")
  549. else:
  550. newa.append(link)
  551. if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
  552. self.a = newa
  553. if self.abbr_list and force == "end":
  554. for abbr, definition in self.abbr_list.items():
  555. self.out(" *[" + abbr + "]: " + definition + "\n")
  556. self.p_p = 0
  557. self.out(data)
  558. self.outcount += 1
    def handle_data(self, data):
        # A JS-escaped closing script tag ("<\/script>") inside text: the
        # opening <script> pushed quiet but the parser will not see a real
        # end tag, so undo the suppression here.
        if r'\/script>' in data: self.quiet -= 1

        if self.style:
            # Inside <style>: collect CSS rules instead of emitting text.
            self.style_def.update(dumb_css_parser(data))

        if not self.maybe_automatic_link is None:
            href = self.maybe_automatic_link
            if href == data and self.absolute_url_matcher.match(href):
                # <a href=X>X</a> with an absolute URL -> automatic link <X>.
                self.o("<" + data + ">")
                return
            else:
                self.o("[")
                self.maybe_automatic_link = None

        if not self.code and not self.pre:
            data = escape_md_section(data, snob=self.escape_snob)
        self.o(data, 1)
    def unknown_decl(self, data): pass

    def charref(self, name):
        """Resolve a numeric character reference (decimal or hex) to text."""
        if name[0] in ['x','X']:
            c = int(name[1:], 16)
        else:
            c = int(name)
        if not self.unicode_snob and c in unifiable_n.keys():
            # prefer the ascii replacement table
            return unifiable_n[c]
        else:
            # unichr on Python 2, chr on Python 3
            try:
                return unichr(c)
            except NameError: #Python3
                return chr(c)
    def entityref(self, c):
        """Resolve a named entity reference to its output text."""
        if not self.unicode_snob and c in unifiable.keys():
            # prefer the ascii replacement table
            return unifiable[c]
        else:
            try: name2cp(c)
            except KeyError: return "&" + c + ';'  # unknown entity: echo verbatim
            else:
                # unichr on Python 2, chr on Python 3
                try:
                    return unichr(name2cp(c))
                except NameError: #Python3
                    return chr(name2cp(c))
    def replaceEntities(self, s):
        # re.sub callback: group(1) is the entity body without '&' and ';'.
        s = s.group(1)
        if s[0] == "#":
            return self.charref(s[1:])
        else: return self.entityref(s)

    # Matches numeric (&#160; / &#xA0;) and named (&amp;) entity references.
    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
    def unescape(self, s):
        """Replace all HTML entities in s with their text equivalents."""
        return self.r_unescape.sub(self.replaceEntities, s)
  606. def google_nest_count(self, style):
  607. """calculate the nesting count of google doc lists"""
  608. nest_count = 0
  609. if 'margin-left' in style:
  610. nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
  611. return nest_count
  612. def optwrap(self, text):
  613. """Wrap all paragraphs in the provided text."""
  614. if not self.body_width:
  615. return text
  616. assert wrap, "Requires Python 2.3."
  617. result = ''
  618. newlines = 0
  619. for para in text.split("\n"):
  620. if len(para) > 0:
  621. if not skipwrap(para):
  622. result += "\n".join(wrap(para, self.body_width))
  623. if para.endswith(' '):
  624. result += " \n"
  625. newlines = 1
  626. else:
  627. result += "\n\n"
  628. newlines = 2
  629. else:
  630. if not onlywhite(para):
  631. result += para + "\n"
  632. newlines = 1
  633. else:
  634. if newlines < 2:
  635. result += "\n"
  636. newlines += 1
  637. return result
  638. ordered_list_matcher = re.compile(r'\d+\.\s')
  639. unordered_list_matcher = re.compile(r'[-\*\+]\s')
  640. md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
  641. md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
  642. md_dot_matcher = re.compile(r"""
  643. ^ # start of line
  644. (\s*\d+) # optional whitespace and a number
  645. (\.) # dot
  646. (?=\s) # lookahead assert whitespace
  647. """, re.MULTILINE | re.VERBOSE)
  648. md_plus_matcher = re.compile(r"""
  649. ^
  650. (\s*)
  651. (\+)
  652. (?=\s)
  653. """, flags=re.MULTILINE | re.VERBOSE)
  654. md_dash_matcher = re.compile(r"""
  655. ^
  656. (\s*)
  657. (-)
  658. (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
  659. # or another dash (header or hr)
  660. """, flags=re.MULTILINE | re.VERBOSE)
  661. slash_chars = r'\`*_{}[]()#+-.!'
  662. md_backslash_matcher = re.compile(r'''
  663. (\\) # match one slash
  664. (?=[%s]) # followed by a char that requires escaping
  665. ''' % re.escape(slash_chars),
  666. flags=re.VERBOSE)
  667. def skipwrap(para):
  668. # If the text begins with four spaces or one tab, it's a code block; don't wrap
  669. if para[0:4] == ' ' or para[0] == '\t':
  670. return True
  671. # If the text begins with only two "--", possibly preceded by whitespace, that's
  672. # an emdash; so wrap.
  673. stripped = para.lstrip()
  674. if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
  675. return False
  676. # I'm not sure what this is for; I thought it was to detect lists, but there's
  677. # a <br>-inside-<span> case in one of the tests that also depends upon it.
  678. if stripped[0:1] == '-' or stripped[0:1] == '*':
  679. return True
  680. # If the text begins with a single -, *, or +, followed by a space, or an integer,
  681. # followed by a ., followed by a space (in either case optionally preceeded by
  682. # whitespace), it's a list; don't wrap.
  683. if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
  684. return True
  685. return False
  686. def wrapwrite(text):
  687. text = text.encode('utf-8')
  688. try: #Python3
  689. sys.stdout.buffer.write(text)
  690. except AttributeError:
  691. sys.stdout.write(text)
  692. def html2text(html, baseurl=''):
  693. h = HTML2Text(baseurl=baseurl)
  694. return h.handle(html)
  695. def unescape(s, unicode_snob=False):
  696. h = HTML2Text()
  697. h.unicode_snob = unicode_snob
  698. return h.unescape(s)
  699. def escape_md(text):
  700. """Escapes markdown-sensitive characters within other markdown constructs."""
  701. return md_chars_matcher.sub(r"\\\1", text)
  702. def escape_md_section(text, snob=False):
  703. """Escapes markdown-sensitive characters across whole document sections."""
  704. text = md_backslash_matcher.sub(r"\\\1", text)
  705. if snob:
  706. text = md_chars_matcher_all.sub(r"\\\1", text)
  707. text = md_dot_matcher.sub(r"\1\\\2", text)
  708. text = md_plus_matcher.sub(r"\1\\\2", text)
  709. text = md_dash_matcher.sub(r"\1\\\2", text)
  710. return text
def main():
    """Command-line entry point: convert a file, URL, or stdin to Markdown."""
    baseurl = ''

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
        default=IGNORE_ANCHORS, help="don't include any formatting for links")
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
        default=IGNORE_IMAGES, help="don't include any formatting for images")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
        default=False, help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
        default=False, help="use a dash rather than a star for unordered list items")
    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
        default=False, help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
        default=False, help="hide strike-through text. only relevant when -g is specified as well")
    p.add_option("--escape-all", action="store_true", dest="escape_snob",
        default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    (options, args) = p.parse_args()

    # process input
    # NOTE(review): encoding is initialised to "utf-8" and only ever set from
    # args[1] (a string), so the `if encoding is None` auto-detection branches
    # below appear unreachable -- confirm against upstream intent.
    encoding = "utf-8"
    if len(args) > 0:
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            # urllib is urllib.request on Python 3 (see the import shim above)
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = sys.stdin.read()

    data = data.decode(encoding)
    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash: h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.ignore_images = options.ignore_images
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob

    wrapwrite(h.handle(data))


if __name__ == "__main__":
    main()