
/ptxt2markdown.py

http://epubia.googlecode.com/
# -*- encoding: utf-8 -*-
# Plain Text to Formatted Text
#
# Added markup
#   & null paragraph
# Disabled markup
#   - list item
# Changed markup
#   --- horizontal line -> predefined paragraph separator
#   * * * horizontal line -> predefined paragraph separator

import codecs
import re

EMPTYLINE_PTN = re.compile(r'^\s*$',re.M|re.U)
PARAEND_PTN = re.compile(r'''([\.\?!'"\)])\s*$''',re.M|re.U)
INDENTSTART_PTN = re.compile(r'''^([ ]{1,3})''',re.M|re.U)  # preserve Markdown block (4 spaces or tab)

# guessing chapter/section
CHAP_PTN1 = re.compile(r'\n\n([^\n]+)\n[ ]*={5,}[ \t]*\n')
CHAP_PTN2 = re.compile(r'\n#[ \t]*')
# chapter heading such as u"제 3 장 ..." (the two Korean characters below are an
# assumption; the originals are not recoverable from this copy)
CHAP_PTN3 = re.compile(ur'^\s*(제\s*\d+\s*장[ \.].{2,})$',re.M)
SECT_PTN1 = re.compile(r'\n##[ \t]*')
SECT_PTN2 = re.compile(r'^\s*([\dIVXivx]+\.?)\s*$',re.M)

class txt_cleaner:
    def __init__(self):
        self.ptn_sl_empty = re.compile(r'([^\n])\n\n([^\n])')
        self.ptn_quote_start = re.compile(r'''([\.!\?"'])\s*\n+(["'])''')
        self.ptn_quote_end = re.compile(r'''(["'])\s*\n+(\S)''')
        self.ptn_dblqt_begin = re.compile('^ *"([^"]*.) *$',re.M|re.U)
        self.ptn_dblqt_end = re.compile('^ *(.[^"]*)" *$',re.M|re.U)
        self.ptn_sglqt_begin = re.compile("^ *'([^']*.) *$",re.M|re.U)
        self.ptn_sglqt_end = re.compile("^ *(.[^']*)' *$",re.M|re.U)
        self.ptn_starhr = re.compile('(\* *\* *\*)',re.M|re.U)
        self.ptn_dash3hr = re.compile('^ *--- *$',re.M)
        self.ptn_minusli = re.compile('^( *)-([^-])',re.M|re.U)
        self.cleaned = False
    def convert(self, txt, start=1):
        # preprocess
        txt = self.preprocess(txt, start)
        # paragraph
        txt = self.format_paragraph(txt)
        # break within word
        txt = self.recover_word(txt)
        # postprocess
        txt = self.postprocess(txt)
        return txt
    def preprocess(self, txt, start=1):
        # line skip
        if start != 1:
            print "start from %d" % start
            txt = '\n'.join( txt.split('\n')[start-1:] )
        # remove spaces in line end
        # --> space in tail can be used as line break directive
        #txt = re.compile(r'[ \r]*$',re.M).sub('', txt)
        # clean empty line
        txt = re.compile(r'^\s*$',re.M|re.U).sub('', txt)
        # filter special character
        #txt = txt.replace(u'“ ','"').replace(u'”','"')
        #txt = txt.replace(u"‘ ","'").replace(u"’","'")
        return txt
    def analyze_paragraph(self, txt):
        # extract patterns
        self.num_line = len( txt.split('\n') )
        self.num_emptyline = len( EMPTYLINE_PTN.findall(txt) )
        self.num_paraend = len( PARAEND_PTN.findall(txt) )
        self.num_indentline = len( INDENTSTART_PTN.findall(txt) )
        print "paragraph: %d %d %d %d" % (self.num_line, self.num_emptyline, self.num_paraend, self.num_indentline)
    def format_paragraph(self, text):
        self.cleaned = False
        # merge multiple empty lines
        txt = re.sub(r'\n{2,}', r'\n\n', text)
        # decide paragraph style
        self.analyze_paragraph(txt)
        if self.num_emptyline > 0.45*self.num_line and self.num_paraend < 0.8*self.num_emptyline:
            # Type-5: line separated with empty line
            print "detect all lines are separated by empty line"
            txt = self.ptn_sl_empty.sub(r'\g<1>\n\g<2>',txt)
            self.cleaned = True
            self.analyze_paragraph(txt)
        if self.num_indentline > self.num_emptyline:
            # Type-2: paragraph by indent
            print "detect paragraph by indent"
            text = INDENTSTART_PTN.sub(r'\n',txt)
            self.cleaned = True
        elif self.num_emptyline < 0.3 * self.num_paraend:
            if self.num_paraend > 0.8 * self.num_line:
                # Type-3: paragraph in one line
                print "detect single line paragraph"
            else:
                # Type-4: not formatted
                print "detect non formatted paragraph"
            text = PARAEND_PTN.sub(r'\1\n',txt)
            self.cleaned = True
        return text
    def recover_word(self, text):
        numline = len(text.split('\n'))
        numsch = len(re.compile('\S\n\S ',re.U).findall(text))  # single character in line start
        #print "word break: %d / %d" % (numsch, numline)
        if numsch > numline*0.08:
            print "Suspect word break over lines"
            text = re.compile('(\w)\n([????][\.\?!])',re.U).sub(r'\1\2\n',text)
            text = re.compile('(\w)\n([???????]) ',re.U).sub(r'\1\2\n',text)
            #text = re.compile(' (\w)\n(\w{2,})',re.U).sub(r'\n\1\2',text)
        return text
    def postprocess(self, txt):
        if self.cleaned:
            # wrap single quote block with empty lines
            txt = self.ptn_quote_start.sub(r'\1\n\n\2', txt)
            txt = self.ptn_quote_end.sub(r'\1\n\n\2', txt)
        else:
            # separate adjacent quoted statements (conservative way)
            txt = re.sub(r'"\n( *)"',r'"\n\n\1"',txt)
            txt = re.sub(r"'\n( *)'",r"'\n\n\1'",txt)
        # transpose " and ' to better shapes
        txt = self.ptn_dblqt_begin.sub(ur'“\1', txt)
        txt = self.ptn_dblqt_end.sub(ur'\1”', txt)
        txt = self.ptn_sglqt_begin.sub(ur'‘\1', txt)
        txt = self.ptn_sglqt_end.sub(ur'\1’', txt)
        # Disable some markdown markers for better output
        #  1) horizontal line drawing by '* * *'
        #  2) list starting with '-'
        # use alternative way starting with '*'
        txt = self.ptn_starhr.sub(r'\t\g<1>', txt)
        txt = self.ptn_dash3hr.sub(r'- - -', txt)
        txt = self.ptn_minusli.sub(r'\g<1>\\-\g<2>', txt)
        return txt
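
# Minimal usage sketch of txt_cleaner (variable names are illustrative; the
# input is assumed to be an already-decoded unicode string):
#
#   cleaner = txt_cleaner()
#   md_text = cleaner.convert(raw_text, start=1)  # preprocess -> paragraph -> word fix -> postprocess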
def mark_chapter(text, toc_hdr):
    start = False
    inTOC = False
    cnt = 0
    for line in text.split('\n'):
        if start:
            cname = re.compile('\d*\s*$').sub('',line).strip()
            if cname:
                inTOC = True
                #print (u"chapter: %s" % cname).encode('utf-8')
                # escape the chapter name in case it contains regex metacharacters
                text = re.compile('^%s$' % re.escape(cname),re.M).sub('%s\n%s' % (cname,'='*2*len(cname)),text)
                cnt += 1
            elif inTOC:
                break
        elif line.find(toc_hdr) >= 0:
            start = True
            inTOC = False
    print "%d chapters found" % cnt
    return text
def guess_block(txt):
    # chapter
    #txt = mark_chapter(txt, u'<? ?>')
    # chapter
    numch = 0
    numch += len(CHAP_PTN1.findall(txt))
    numch += len(CHAP_PTN2.findall(txt))
    if numch < 1:
        txt = CHAP_PTN3.sub(r'\n# \1\n\n', txt)
    # section
    numsec = 0
    numsec += len(SECT_PTN1.findall(txt))
    if numsec < 1:
        txt = SECT_PTN2.sub(r'\n## \1\n\n', txt)
    return txt
def guess_coding(txt, filename=''):
    if filename.find('.utf8') > 0:
        return 'utf-8'
    if filename.find('.cp949') > 0:
        return 'cp949'
    if filename.find('.euckr') > 0:
        return 'euc-kr'
    if filename.find('.johab') > 0:
        return 'johab'
    if txt[:3] == codecs.BOM_UTF8:
        return 'utf-8'
    import chardet
    detsz = min(10000, len(txt))
    coding = chardet.detect(txt[:detsz])['encoding']
    print "coding %s is detected" % coding
    if coding is None or coding == 'EUC-KR':
        coding = 'cp949'
    return coding
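
# Sketch: a filename hint wins over byte-level detection, so a file named e.g.
# 'novel.cp949.txt' (an illustrative name) is read as cp949 without chardet:
#
#   coding = guess_coding(raw_bytes, filename='novel.cp949.txt')  # -> 'cp949'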
#--------------------------------------
def load(fname):
    # load file
    try:
        text = open(fname,'r').read()
    except:
        import sys
        print >> sys.stderr, "fail to open"
        return None
    # convert to unicode
    coding = guess_coding(text, filename=fname)
    text = unicode(text, coding, errors='replace')
    if ord(text[0]) == 0xfeff:  # utf-8 BOM
        return text[1:]
    return text
def clean(text, start=1):
    # convert
    text = txt_cleaner().convert( text, start )
    return guess_block( text )
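
# Typical pipeline sketch (the file name is illustrative): decode the source
# file, then clean it into Markdown-ish text with guessed chapter/section marks.
#
#   text = load('source.txt')
#   if text is not None:
#       text = clean(text, start=1)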
#--------------------------------------
def extract_meta(text):
    lines = text.split('\n')
    info = {}
    meta_found = False
    for line in lines:
        if len(line.strip()) == 0:
            break  # empty line ends meta block
        pos = line.find(':')
        if pos >= 0:
            key = line[:pos].lower()
            if key[0] == ' ' or key[-1] == ' ':
                break
            val = line[pos+1:].strip()
            info[key] = val
            meta_found = True
        elif meta_found and line.startswith(' '*4):
            info[key] += ', ' + line.strip()
        else:
            break
    return info
def insert_meta(text, meta):
    # remove existing meta block
    lines = text.split('\n')
    cnt = 0
    meta_found = False
    for line in lines:
        if len(line.strip()) == 0:
            break  # empty line ends meta block
        pos = line.find(':')
        if pos >= 0:
            meta_found = True
        elif meta_found and line.startswith(' '*4):
            pass  # continued block
        else:
            break
        cnt += 1
    # insert new meta block
    nwlns = []
    for key, val in meta.items():
        if key == 'isbn':
            key = key.upper()
        elif key.find('_') < 0:  # not cover_url
            key = key.title()
        nwlns.append( u"{0:15} {1}".format(key+':', val.replace('\n','')) )
    nwlns.append('')
    nwlns.extend( lines[cnt:] )
    return '\n'.join(nwlns)
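
# Meta block sketch: extract_meta() reads leading "Key: value" lines and
# insert_meta() writes a block back; the keys and values below are made up.
#
#   meta = extract_meta(text)         # e.g. {'title': u'...', 'author': u'...'}
#   meta['author'] = u'Hong Gildong'
#   text = insert_meta(text, meta)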
#--------------------------------------
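# Command-line sketch (file names are illustrative; argument order matches the
# code below: input file, then output file written as UTF-8):
#   python ptxt2markdown.py input.txt output.txt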
if __name__ == '__main__':
    import sys
    rslt = load(sys.argv[1])
    open(sys.argv[2],'w').write( rslt.encode('utf-8') )

# vim:sw=4:ts=8:et