
/old/txt2tags-1.3.py

http://txt2tags.googlecode.com/


  1. #!/usr/bin/env python
  2. # txt2tags - generic text conversion tool
  3. # http://txt2tags.sf.net
  4. #
  5. # Copyright 2001, 2002 Aurélio Marinho Jargas
  6. #
  7. # This program is free software; you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation, version 2.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You have received a copy of the GNU General Public License along
  17. # with this program, on the COPYING file.
  18. #
  19. # the code is getting better, but is still ugly - stay tuned
  20. import re, string, os, sys, getopt, traceback
  21. from time import strftime,time,localtime
  22. my_url = 'http://txt2tags.sf.net'
  23. my_email = 'aurelio@verde666.org'
  24. my_version = '1.3'
  25. DEBUG = 0 # do not edit here, please use --debug
  26. targets = ['txt', 'sgml', 'html', 'pm6', 'mgp', 'moin', 'man', 'tex']
  27. FLAGS = {'noheaders':0,'enumtitle':0,'maskemail':0, 'stdout':0,
  28. 'toconly' :0,'toc' :0,'gui' :0}
  29. regex = {}
  30. TAGS = {}
  31. rules = {}
  32. CMDLINE = ''
  33. currdate = strftime('%Y%m%d',localtime(time())) # ISO current date
  34. splitlevel = '' ; lang = 'english'
  35. doctype = outfile = ''
  36. pipefileid = '-'
  37. #my_version = my_version + '-dev' + currdate[4:] # devel!
  38. # global vars for doClose*()
  39. quotedepth = []
  40. listindent = []
  41. listids = []
  42. subarea = None
  43. tableborder = 0
  44. versionstr = "txt2tags version %s <%s>"%(my_version,my_url)
  45. usage = """
  46. %s
  47. usage: txt2tags -t <type> [OPTIONS] file.t2t
  48. txt2tags -t html -s <split level> -l <lang> file.t2t
  49. -t, --type target document type. actually supported:
  50. %s
  51. --stdout by default, the output is written to file.<type>
  52. with this option, STDOUT is used (no files written)
  53. --noheaders suppress header, title and footer information
  54. --enumtitle enumerate all title lines as 1, 1.1, 1.1.1, etc
  55. --maskemail hide email from spam robots. x@y.z turns to <x (a) y z>
  56. --toc add TOC (Table of Contents) to target document
  57. --toconly print document TOC and exit
  58. --gui invoke Graphical Tk Interface
  59. -h, --help print this help information and exit
  60. -V, --version print program version and exit
  61. extra options for HTML target (needs sgml-tools):
  62. --split split documents. values: 0, 1, 2 (default 0)
  63. --lang document language (default english)
  64. """%(versionstr, re.sub(r"[]'[]",'',repr(targets)))
  65. # here is all the target's templates
  66. # you may edit them to fit your needs
  67. # - the %(HEADERn)s strings represent the Header lines
  68. # - use %% to represent a literal %
  69. #
  70. HEADER_TEMPLATE = {
  71. 'txt': """\
  72. %(HEADER1)s
  73. %(HEADER2)s
  74. %(HEADER3)s
  75. """,
  76. 'sgml': """\
  77. <!doctype linuxdoc system>
  78. <article>
  79. <title>%(HEADER1)s
  80. <author>%(HEADER2)s
  81. <date>%(HEADER3)s
  82. """,
  83. 'html': """\
  84. <HTML>
  85. <HEAD>
  86. <META NAME="GENERATOR" CONTENT="http://txt2tags.sf.net">
  87. <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%(ENCODING)s">
  88. <TITLE>%(HEADER1)s</TITLE></HEAD>
  89. <BODY BGCOLOR="white" TEXT="black">
  90. <P ALIGN="center"><CENTER><H1>%(HEADER1)s</H1>
  91. <FONT SIZE=4>
  92. <I>%(HEADER2)s</I><BR>
  93. %(HEADER3)s
  94. </FONT></CENTER>
  95. """,
  96. # TODO man section 1 is hardcoded...
  97. 'man': """\
  98. .TH "%(HEADER1)s" 1 %(HEADER3)s "%(HEADER2)s"
  99. """,
  100. # TODO style to <HR>
  101. 'pm6': """\
  102. <PMTags1.0 win><C-COLORTABLE ("Preto" 1 0 0 0)
  103. ><@Normal=
  104. <FONT "Times New Roman"><CCOLOR "Preto"><SIZE 11>
  105. <HORIZONTAL 100><LETTERSPACE 0><CTRACK 127><CSSIZE 70><C+SIZE 58.3>
  106. <C-POSITION 33.3><C+POSITION 33.3><P><CBASELINE 0><CNOBREAK 0><CLEADING -0.05>
  107. <GGRID 0><GLEFT 7.2><GRIGHT 0><GFIRST 0><G+BEFORE 7.2><G+AFTER 0>
  108. <GALIGNMENT "justify"><GMETHOD "proportional"><G& "ENGLISH">
  109. <GPAIRS 12><G%% 120><GKNEXT 0><GKWIDOW 0><GKORPHAN 0><GTABS $>
  110. <GHYPHENATION 2 34 0><GWORDSPACE 75 100 150><GSPACE -5 0 25>
  111. ><@Bullet=<@-PARENT "Normal"><FONT "Abadi MT Condensed Light">
  112. <GLEFT 14.4><G+BEFORE 2.15><G%% 110><GTABS(25.2 l "")>
  113. ><@PreFormat=<@-PARENT "Normal"><FONT "Lucida Console"><SIZE 8><CTRACK 0>
  114. <GLEFT 0><G+BEFORE 0><GALIGNMENT "left"><GWORDSPACE 100 100 100><GSPACE 0 0 0>
  115. ><@Title1=<@-PARENT "Normal"><FONT "Arial"><SIZE 14><B>
  116. <GCONTENTS><GLEFT 0><G+BEFORE 0><GALIGNMENT "left">
  117. ><@Title2=<@-PARENT "Title1"><SIZE 12><G+BEFORE 3.6>
  118. ><@Title3=<@-PARENT "Title1"><SIZE 10><GLEFT 7.2><G+BEFORE 7.2>
  119. ><@Title4=<@-PARENT "Title3">
  120. ><@Title5=<@-PARENT "Title3">
  121. ><@Quote=<@-PARENT "Normal"><SIZE 10><I>>
  122. %(HEADER1)s
  123. %(HEADER2)s
  124. %(HEADER3)s
  125. """,
  126. 'mgp': """\
  127. #!/usr/X11R6/bin/mgp -t 90
  128. %%deffont "normal" xfont "utopia-medium-r", charset "iso8859-1"
  129. %%deffont "normal-i" xfont "utopia-medium-i", charset "iso8859-1"
  130. %%deffont "normal-b" xfont "utopia-bold-r" , charset "iso8859-1"
  131. %%deffont "normal-bi" xfont "utopia-bold-i" , charset "iso8859-1"
  132. %%deffont "mono" xfont "courier-medium-r", charset "iso8859-1"
  133. %%default 1 size 5
  134. %%default 2 size 8, fore "yellow", font "normal-b", center
  135. %%default 3 size 5, fore "white", font "normal", left, prefix " "
  136. %%tab 1 size 4, vgap 30, prefix " ", icon arc "red" 40, leftfill
  137. %%tab 2 prefix " ", icon arc "orange" 40, leftfill
  138. %%tab 3 prefix " ", icon arc "brown" 40, leftfill
  139. %%tab 4 prefix " ", icon arc "darkmagenta" 40, leftfill
  140. %%tab 5 prefix " ", icon arc "magenta" 40, leftfill
  141. %%%%------------------------- end of headers -----------------------------
  142. %%page
  143. %%size 10, center, fore "yellow"
  144. %(HEADER1)s
  145. %%font "normal-i", size 6, fore "white", center
  146. %(HEADER2)s
  147. %%font "mono", size 7, center
  148. %(HEADER3)s
  149. """,
  150. # TODO please, improve me!
  151. 'moin': """\
  152. %(HEADER1)s
  153. %(HEADER2)s
  154. %(HEADER3)s
  155. """,
  156. 'tex': \
  157. r"""\documentclass[11pt,a4paper]{article}
  158. \usepackage{amsfonts,amssymb,graphicx,url}
  159. \usepackage[%(ENCODING)s]{inputenc} %% char encoding
  160. \pagestyle{plain} %% do page numbering ('empty' turns off)
  161. \frenchspacing %% no additional spaces after periods
  162. \setlength{\parskip}{8pt}\parindent=0pt %% no paragraph indentation
  163. %% uncomment next line for fancy PDF output on Adobe Acrobat Reader
  164. %%\usepackage[pdfstartview=FitV,colorlinks=true,bookmarks=true]{hyperref}
  165. \title{%(HEADER1)s}
  166. \author{%(HEADER2)s}
  167. \begin{document}
  168. \date{%(HEADER3)s}
  169. \maketitle
  170. """
  171. }
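# Each template above is filled with Python's '%' dict interpolation, which is
# why a literal percent sign must be written as %%, e.g.:
#   '<TITLE>%(HEADER1)s</TITLE> 100%%' % {'HEADER1': 'My Doc'}
#     -> '<TITLE>My Doc</TITLE> 100%'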
  172. #-----------------------------------------------------------------------
  173. def Quit(msg, exitcode=0): print msg ; sys.exit(exitcode)
  174. def Error(msg): print "ERROR: %s"%msg ; sys.exit()
  175. def Debug(msg,i=0,linenr=None):
  176. if i > DEBUG: return
  177. if linenr is not None:
  178. print "(%d) %04d:%s"%(i,linenr,msg)
  179. else:
  180. print "(%d) %s"%(i,msg)
  181. def Readfile(file):
  182. if file == '-':
  183. try: data = sys.stdin.readlines()
  184. except: Error('You must feed me with data on STDIN!')
  185. else:
  186. try: f = open(file); data = f.readlines() ; f.close()
  187. except: Error("Cannot read file:\n %s"%file)
  188. return data
  189. def Savefile(file, contents):
  190. try: f = open(file, 'w')
  191. except: Error("Cannot open file for writing:\n %s"%file)
  192. if type(contents) == type([]): doit = f.writelines
  193. else: doit = f.write
  194. doit(contents) ; f.close()
  195. def NewArea(new, linenr):
  196. if new not in ['head', 'conf', 'body']:
  197. Error("Invalid new AREA '%s' on line '%s'"%(new,linenr))
  198. Debug('NEW AREA: %s'%new, 1, linenr)
  199. return new
  200. def reset_flags():
  201. global FLAGS
  202. for flag in FLAGS.keys(): FLAGS[flag] = 0
  203. def set_outfile_name(infile, doctype):
  204. "dirname is the same for {in,out}file"
  205. if not infile: return
  206. if infile == pipefileid or FLAGS['toconly'] or FLAGS['stdout']:
  207. outfile = pipefileid
  208. else:
  209. outfile = "%s.%s"%(re.sub('\.(txt|t2t)$','',infile), doctype)
  210. Debug(" infile: '%s'"% infile, 1)
  211. Debug("outfile: '%s'"%outfile, 1)
  212. return outfile
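# Example of the naming rule above (with the default FLAGS, i.e. no --stdout
# and no --toconly): the .t2t/.txt extension is swapped for the target type,
# keeping the directory part untouched:
#   set_outfile_name('doc/Manual.t2t', 'html')  ->  'doc/Manual.html'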
  213. def finish_him(outlist, outfile):
  214. "writing output to screen or file"
  215. if outfile == pipefileid:
  216. for line in outlist: print line
  217. else:
  218. Savefile(outfile, addLineBreaks(outlist))
  219. if not FLAGS['gui']: print 'wrote %s'%(outfile)
  220. if splitlevel:
  221. print "--- html..."
  222. os.system('sgml2html --language=%s --split=%s %s'%(
  223. lang,splitlevel,outfile))
  224. def ParseCmdline(cmdline=sys.argv):
  225. "return a dic with all options:value found"
  226. global CMDLINE ; CMDLINE = cmdline # save for dofooter()
  227. Debug("cmdline: %s"%cmdline, 1)
  228. options = {'infile': '', 'infiles':''}
  229. # get cmdline options
  230. longopt = ['help', 'version', 'type=', 'split=', 'lang=']+FLAGS.keys()
  231. try: (opt, args) = getopt.getopt(cmdline[1:], 'hVt:', longopt)
  232. except getopt.GetoptError:
  233. Error('Bad option or missing argument (try --help)')
  234. # get infile, if any
  235. if args:
  236. options['infile'] = args[0]
  237. options['infiles'] = args # multi
  238. for name,val in opt:
  239. # parse information options
  240. if name in ['-h','--help' ]: Quit(usage)
  241. elif name in ['-V','--version']: Quit(versionstr)
  242. # parse short/long options
  243. elif name in ['-t','--type']:
  244. options['doctype'] = val
  245. continue
  246. # just long options
  247. options[name[2:]] = val # del --
  248. Debug("cmdline options: %s"%options, 1)
  249. return options
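# A rough picture of the result for a typical call (key order may vary):
#   ParseCmdline(['txt2tags', '-t', 'html', '--toc', 'file.t2t'])
#     -> {'infile': 'file.t2t', 'infiles': ['file.t2t'],
#         'doctype': 'html', 'toc': ''}
# Valueless flags such as --toc arrive as empty strings; ParseCmdlineOptions()
# below turns them into FLAGS entries set to 1.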
  250. def ParseCmdlineOptions(optdic):
  251. "set vars and flags according to options dic"
  252. global FLAGS, splitlevel, lang
  253. # store flags and vars
  254. myflags = [] # for debug msg
  255. for flag in FLAGS.keys():
  256. if optdic.has_key(flag):
  257. FLAGS[flag] = 1
  258. myflags.append(flag)
  259. doctype = optdic.get('doctype')
  260. infile = optdic.get('infile')
  261. splitlevel = optdic.get('split')
  262. lang = optdic.get('lang')
  263. Debug("cmdline flags: %s"%string.join(myflags,', '), 1)
  264. if not doctype and FLAGS['toconly']: doctype = 'txt' # toconly dft type
  265. if not infile or not doctype: Quit(usage, 1) # no filename/doctype
  266. # sanity check: validate target type
  267. if not targets.count(doctype):
  268. Error("Invalid document type '%s' (try --help)"%(doctype))
  269. outfile = set_outfile_name(infile, doctype)
  270. # sanity check: validate split level
  271. if doctype != 'html': splitlevel = '' # only valid for HTML target
  272. if splitlevel:
  273. # checks
  274. if outfile == pipefileid:
  275. Error('You need to provide a FILE (not STDIN) '
  276. 'when using --split')
  277. if splitlevel[0] not in '012':
  278. Error('Option --split must be 0, 1 or 2')
  279. # check for sgml-tools
  280. #TODO how to test (in a clever way) if an executable is in path?
  281. #TODO os.system() return code? sgml2html w/out --help exit 0?
  282. #TODO bah! implement sgml2html split natively and we're done
  283. # Error("Sorry, you must have 'sgml2html' to use --split")
  284. # set things
  285. FLAGS['stdout'] = 0 # no --stdout
  286. doctype = 'sgml' # 1st do a sgml, then sgml2html
  287. outfile = set_outfile_name(infile, doctype)
  288. # sanity check: source loss!
  289. if infile != pipefileid and infile == outfile:
  290. Error("SUICIDE WARNING!!! (try --stdout)\n source"+\
  291. " and target files has the same name: %s"%outfile)
  292. ### yes, I got my sample.t2t file deleted before adding this test... :/
  293. return infile,outfile,doctype
  294. #TODO splitlevel, lang
  295. #---End of ParseCmdlineOptions
  296. def toc_master(doctype, header, doc, toc):
  297. "decide to include TOC or not on the outlist"
  298. # deal with the TOC options
  299. if FLAGS['toc'] or FLAGS['toconly']:
  300. # format TOC lines
  301. ### here we do toc as a valid t2t marked text (list type)
  302. FLAGS['noheaders'] = 1
  303. x,y,toc = convert(['']+toc+['',''], doctype)
  304. # TOC between bars (not for --toconly)
  305. if FLAGS['toc']:
  306. para = TAGS['paragraph']
  307. tocbar = [para, regex['x'].sub('-'*72,TAGS['bar1']), para]
  308. toc = tocbar + toc + tocbar
  309. if FLAGS['toconly']: header = doc = []
  310. else:
  311. toc = []
  312. # TOC is a tag
  313. if TAGS['TOC'] and not FLAGS['toconly']:
  314. toc = []
  315. return header + toc + doc
  316. def doitall(cmdlinedic):
  317. global outfile
  318. infile,outfile,doctype = ParseCmdlineOptions(cmdlinedic)
  319. header,toc,doc = convert(Readfile(infile), doctype)
  320. outlist = toc_master(doctype,header,doc,toc)
  321. return doctype, outfile, outlist
  322. # set the Line Break across platforms
  323. LB = '\n' # default
  324. if sys.platform[:3] == 'win': LB = '\r\n'
  325. #elif sys.platform[:3] == 'cyg': LB = '\r\n' # not sure if it's best :(
  326. elif sys.platform[:3] == 'mac': LB = '\r'
  327. def escapePythonSpecials(txt):
  328. # drawback of using re.sub() - double escape some specials like \n
  329. # see also: 'force_re' marks on the code
  330. if sys.version[0] == '1':
  331. return re.sub(r'(\\[ntsrfvul])',r'\\\1',txt)
  332. else:
  333. return re.sub(r'(\\[ntsrfv])' ,r'\\\1',txt)
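# The doubling above matters because these strings are later used as re.sub()
# replacement text, where a plain backslash-n or backslash-t would be turned
# into a real newline/tab, e.g.:
#   escapePythonSpecials(r'C:\temp\new')  ->  r'C:\\temp\\new'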
  334. def getTags(doctype):
  335. keys = [
  336. 'paragraph','title1','title2','title3','title4','title5',
  337. 'areaPreOpen','areaPreClose',
  338. 'areaQuoteOpen','areaQuoteClose',
  339. 'fontMonoOpen','fontMonoClose',
  340. 'fontBoldOpen','fontBoldClose',
  341. 'fontItalicOpen','fontItalicClose',
  342. 'fontBolditalicOpen','fontBolditalicClose',
  343. 'fontUnderlineOpen','fontUnderlineClose',
  344. 'listOpen','listClose','listItem',
  345. 'numlistOpen','numlistClose','numlistItem',
  346. 'deflistOpen','deflistClose','deflistItem1','deflistItem2',
  347. 'bar1','bar2',
  348. 'url','urlMark','email','emailMark',
  349. 'img','imgsolo',
  350. 'tableOpen','tableClose','tableLineOpen','tableLineClose',
  351. 'tableCellOpen','tableCellClose',
  352. 'tableTitleCellOpen','tableTitleCellClose',
  353. 'anchor','comment','TOC',
  354. 'EOD'
  355. ]
  356. if doctype == "txt":
  357. tags = {
  358. 'title1' : ' \a' ,
  359. 'title2' : '\t\a' ,
  360. 'title3' : '\t\t\a' ,
  361. 'title4' : '\t\t\t\a' ,
  362. 'title5' : '\t\t\t\t\a',
  363. 'areaQuoteOpen' : ' ' ,
  364. 'listItem' : '- ' ,
  365. 'numlistItem' : '\a. ' ,
  366. 'bar1' : '\a' ,
  367. 'bar2' : '\a' ,
  368. 'url' : '\a' ,
  369. 'urlMark' : '\a (\a)' ,
  370. 'email' : '\a' ,
  371. 'emailMark' : '\a (\a)' ,
  372. 'img' : '[\a]' ,
  373. }
  374. elif doctype == "html":
  375. tags = {
  376. 'paragraph' : '<P>' ,
  377. 'title1' : '<H1>\a</H1>' ,
  378. 'title2' : '<H2>\a</H2>' ,
  379. 'title3' : '<H3>\a</H3>' ,
  380. 'title4' : '<H4>\a</H4>' ,
  381. 'title5' : '<H5>\a</H5>' ,
  382. 'areaPreOpen' : '<PRE>' ,
  383. 'areaPreClose' : '</PRE>' ,
  384. 'areaQuoteOpen' : '<BLOCKQUOTE>' ,
  385. 'areaQuoteClose' : '</BLOCKQUOTE>' ,
  386. 'fontMonoOpen' : '<CODE>' ,
  387. 'fontMonoClose' : '</CODE>' ,
  388. 'fontBoldOpen' : '<B>' ,
  389. 'fontBoldClose' : '</B>' ,
  390. 'fontItalicOpen' : '<I>' ,
  391. 'fontItalicClose' : '</I>' ,
  392. 'fontBolditalicOpen' : '<B><I>' ,
  393. 'fontBolditalicClose': '</I></B>' ,
  394. 'fontUnderlineOpen' : '<U>' ,
  395. 'fontUnderlineClose' : '</U>' ,
  396. 'listOpen' : '<UL>' ,
  397. 'listClose' : '</UL>' ,
  398. 'listItem' : '<LI>' ,
  399. 'numlistOpen' : '<OL>' ,
  400. 'numlistClose' : '</OL>' ,
  401. 'numlistItem' : '<LI>' ,
  402. 'deflistOpen' : '<DL>' ,
  403. 'deflistClose' : '</DL>' ,
  404. 'deflistItem1' : '<DT>\a</DT>' ,
  405. 'deflistItem2' : '<DD>' ,
  406. 'bar1' : '<HR NOSHADE SIZE=1>' ,
  407. 'bar2' : '<HR NOSHADE SIZE=5>' ,
  408. 'url' : '<A HREF="\a">\a</A>' ,
  409. 'urlMark' : '<A HREF="\a">\a</A>' ,
  410. 'email' : '<A HREF="mailto:\a">\a</A>' ,
  411. 'emailMark' : '<A HREF="mailto:\a">\a</A>' ,
  412. 'img' : '<IMG ALIGN="\a" SRC="\a" BORDER="0">',
  413. 'imgsolo' : '<P ALIGN="center">\a</P>' ,
  414. 'tableOpen' : '<table align=center cellpadding=4 border=\a>',
  415. 'tableClose' : '</table>' ,
  416. 'tableLineOpen' : '<tr>' ,
  417. 'tableLineClose' : '</tr>' ,
  418. 'tableCellOpen' : '<td>' ,
  419. 'tableCellClose' : '</td>' ,
  420. 'tableTitleCellOpen' : '<th>' ,
  421. 'tableTitleCellClose': '</th>' ,
  422. 'anchor' : '<a name="\a">' ,
  423. 'comment' : '<!-- \a -->' ,
  424. 'EOD' : '</BODY></HTML>'
  425. }
  426. elif doctype == "sgml":
  427. tags = {
  428. 'paragraph' : '<p>' ,
  429. 'title1' : '<sect>\a<p>' ,
  430. 'title2' : '<sect1>\a<p>' ,
  431. 'title3' : '<sect2>\a<p>' ,
  432. 'title4' : '<sect3>\a<p>' ,
  433. 'title5' : '<sect4>\a<p>' ,
  434. 'areaPreOpen' : '<tscreen><verb>' ,
  435. 'areaPreClose' : '</verb></tscreen>' ,
  436. 'areaQuoteOpen' : '<quote>' ,
  437. 'areaQuoteClose' : '</quote>' ,
  438. 'fontMonoOpen' : '<tt>' ,
  439. 'fontMonoClose' : '</tt>' ,
  440. 'fontBoldOpen' : '<bf>' ,
  441. 'fontBoldClose' : '</bf>' ,
  442. 'fontItalicOpen' : '<em>' ,
  443. 'fontItalicClose' : '</em>' ,
  444. 'fontBolditalicOpen' : '<bf><em>' ,
  445. 'fontBolditalicClose': '</em></bf>' ,
  446. 'fontUnderlineOpen' : '<bf><em>' ,
  447. 'fontUnderlineClose' : '</em></bf>' ,
  448. 'listOpen' : '<itemize>' ,
  449. 'listClose' : '</itemize>' ,
  450. 'listItem' : '<item>' ,
  451. 'numlistOpen' : '<enum>' ,
  452. 'numlistClose' : '</enum>' ,
  453. 'numlistItem' : '<item>' ,
  454. 'deflistOpen' : '<descrip>' ,
  455. 'deflistClose' : '</descrip>' ,
  456. 'deflistItem1' : '<tag>\a</tag>' ,
  457. 'bar1' : '<!-- \a -->' ,
  458. 'bar2' : '<!-- \a -->' ,
  459. 'url' : '<htmlurl url="\a" name="\a">' ,
  460. 'urlMark' : '<htmlurl url="\a" name="\a">' ,
  461. 'email' : '<htmlurl url="mailto:\a" name="\a">' ,
  462. 'emailMark' : '<htmlurl url="mailto:\a" name="\a">' ,
  463. 'img' : '<figure><ph vspace=""><img src="\a"></figure>',
  464. 'tableOpen' : '<table><tabular ca="c">' ,
  465. 'tableClose' : '</tabular></table>' ,
  466. 'tableLineClose' : '<rowsep>' ,
  467. 'tableCellClose' : '<colsep>' ,
  468. 'tableTitleCellClose': '<colsep>' ,
  469. 'comment' : '<!-- \a -->' ,
  470. 'TOC' : '<toc>',
  471. 'EOD' : '</article>'
  472. }
  473. elif doctype == "tex":
  474. tags = {
  475. 'title1' : '\n\\newpage\section{\a}',
  476. 'title2' : '\\subsection{\a}' ,
  477. 'title3' : '\\subsubsection{\a}' ,
  478. # title 4/5: DIRTY: para+BF+\\+\n
  479. 'title4' : '\\paragraph{}\\textbf{\a}\\\\\\\n' ,
  480. 'title5' : '\\paragraph{}\\textbf{\a}\\\\\\\n' ,
  481. 'areaPreOpen' : '\\begin{verbatim}' ,
  482. 'areaPreClose' : '\\end{verbatim}' ,
  483. 'areaQuoteOpen' : '\\begin{quotation}' ,
  484. 'areaQuoteClose' : '\\end{quotation}' ,
  485. 'fontMonoOpen' : '\\texttt{' ,
  486. 'fontMonoClose' : '}' ,
  487. 'fontBoldOpen' : '\\textbf{' ,
  488. 'fontBoldClose' : '}' ,
  489. 'fontItalicOpen' : '\\textit{' ,
  490. 'fontItalicClose' : '}' ,
  491. 'fontBolditalicOpen' : '\\textbf{\\textit{' ,
  492. 'fontBolditalicClose': '}}' ,
  493. 'fontUnderlineOpen' : '\\underline{' ,
  494. 'fontUnderlineClose' : '}' ,
  495. 'listOpen' : '\\begin{itemize}' ,
  496. 'listClose' : '\\end{itemize}' ,
  497. 'listItem' : '\\item ' ,
  498. 'numlistOpen' : '\\begin{enumerate}' ,
  499. 'numlistClose' : '\\end{enumerate}' ,
  500. 'numlistItem' : '\\item ' ,
  501. 'deflistOpen' : '\\begin{description}' ,
  502. 'deflistClose' : '\\end{description}' ,
  503. 'deflistItem1' : '\\item[\a]' ,
  504. 'bar1' : '\n\\hrulefill{}\n' ,
  505. 'bar2' : '\n\\rule{\linewidth}{1mm}\n' ,
  506. 'url' : '\\url{\a}' ,
  507. 'urlMark' : '\\textit{\a} (\\url{\a})' ,
  508. 'email' : '\\url{\a}' ,
  509. 'emailMark' : '\\textit{\a} (\\url{\a})' ,
  510. 'img' : '(\a)' ,
  511. 'tableOpen' : '\\begin{center}\\begin{tabular}',
  512. 'tableClose' : '\\end{tabular}\\end{center}' ,
  513. 'tableLineOpen' : '\\hline ' ,
  514. 'tableLineClose' : ' \\\\' ,
  515. 'tableCellClose' : ' & ' ,
  516. 'tableTitleCellOpen' : '\\textbf{' ,
  517. 'tableTitleCellClose': '} & ' ,
  518. 'comment' : '% \a' ,
  519. 'TOC' : '\\newpage\\tableofcontents',
  520. 'EOD' : '\\end{document}'
  521. }
  522. elif doctype == "moin":
  523. tags = {
  524. 'title1' : '= \a =' ,
  525. 'title2' : '== \a ==' ,
  526. 'title3' : '=== \a ===' ,
  527. 'title4' : '==== \a ====' ,
  528. 'title5' : '===== \a =====' ,
  529. 'areaPreOpen' : '{{{' ,
  530. 'areaPreClose' : '}}}' ,
  531. 'areaQuoteOpen' : ' ' ,
  532. 'fontMonoOpen' : '{{{' ,
  533. 'fontMonoClose' : '}}}' ,
  534. 'fontBoldOpen' : "'''" ,
  535. 'fontBoldClose' : "'''" ,
  536. 'fontItalicOpen' : "''" ,
  537. 'fontItalicClose' : "''" ,
  538. 'fontBolditalicOpen' : "'''''" ,
  539. 'fontBolditalicClose': "'''''" ,
  540. 'fontUnderlineOpen' : "'''''" ,
  541. 'fontUnderlineClose' : "'''''" ,
  542. 'listItem' : '* ' ,
  543. 'numlistItem' : '\a. ' ,
  544. 'bar1' : '----' ,
  545. 'bar2' : '----' ,
  546. 'url' : '[\a]' ,
  547. 'urlMark' : '[\a \a]' ,
  548. 'email' : '[\a]' ,
  549. 'emailMark' : '[\a \a]' ,
  550. 'img' : '[\a]' ,
  551. 'tableLineOpen' : '||' ,
  552. 'tableCellClose' : '||' ,
  553. 'tableTitleCellClose': '||' ,
  554. }
  555. elif doctype == "mgp":
  556. tags = {
  557. 'paragraph' : '%font "normal", size 5\n' ,
  558. 'title1' : '%page\n\n\a' ,
  559. 'title2' : '%page\n\n\a' ,
  560. 'title3' : '%page\n\n\a' ,
  561. 'title4' : '%page\n\n\a' ,
  562. 'title5' : '%page\n\n\a' ,
  563. 'areaPreOpen' : '\n%font "mono"' ,
  564. 'areaPreClose' : '%font "normal"' ,
  565. 'areaQuoteOpen' : '%prefix " "' ,
  566. 'areaQuoteClose' : '%prefix " "' ,
  567. 'fontMonoOpen' : '\n%cont, font "mono"\n' ,
  568. 'fontMonoClose' : '\n%cont, font "normal"\n' ,
  569. 'fontBoldOpen' : '\n%cont, font "normal-b"\n' ,
  570. 'fontBoldClose' : '\n%cont, font "normal"\n' ,
  571. 'fontItalicOpen' : '\n%cont, font "normal-i"\n' ,
  572. 'fontItalicClose' : '\n%cont, font "normal"\n' ,
  573. 'fontBolditalicOpen' : '\n%cont, font "normal-bi"\n' ,
  574. 'fontBolditalicClose': '\n%cont, font "normal"\n' ,
  575. 'fontUnderlineOpen' : '\n%cont, fore "cyan"\n' ,
  576. 'fontUnderlineClose' : '\n%cont, fore "white"\n' ,
  577. 'numlistItem' : '\a. ' ,
  578. 'bar1' : '%bar "white" 5' ,
  579. 'bar2' : '%pause' ,
  580. 'url' : '\n%cont, fore "cyan"\n\a\n%cont, fore "white"\n',
  581. 'urlMark' : '\a \n%cont, fore "cyan"\n\a\n%cont, fore "white"\n',
  582. 'email' : '\n%cont, fore "cyan"\n\a\n%cont, fore "white"\n',
  583. 'emailMark' : '\a \n%cont, fore "cyan"\n\a\n%cont, fore "white"\n',
  584. 'img' : '\n%center\n%newimage "\a", left\n',
  585. 'comment' : '%% \a' ,
  586. 'EOD' : '%%EOD'
  587. }
  588. elif doctype == "man":
  589. tags = {
  590. 'paragraph' : '.P' ,
  591. 'title1' : '.SH \a' ,
  592. 'title2' : '.SS \a' ,
  593. 'title3' : '.SS \a' ,
  594. 'title4' : '.SS \a' ,
  595. 'title5' : '.SS \a' ,
  596. 'areaPreOpen' : '.nf' ,
  597. 'areaPreClose' : '.fi\n' ,
  598. 'areaQuoteOpen' : '\n' ,
  599. 'areaQuoteClose' : '\n' ,
  600. 'fontBoldOpen' : '\\fB' ,
  601. 'fontBoldClose' : '\\fP' ,
  602. 'fontItalicOpen' : '\\fI' ,
  603. 'fontItalicClose' : '\\fP' ,
  604. 'fontBolditalicOpen' : '\n.BI ' ,
  605. 'fontBolditalicClose': '\n\\&' ,
  606. 'listOpen' : '\n.nf' , # pre
  607. 'listClose' : '.fi\n' ,
  608. 'listItem' : '* ' ,
  609. 'numlistOpen' : '\n.nf' , # pre
  610. 'numlistClose' : '.fi\n' ,
  611. 'numlistItem' : '\a. ' ,
  612. 'bar1' : '\n\n' ,
  613. 'bar2' : '\n\n' ,
  614. 'url' : '\a' ,
  615. 'urlMark' : '\a (\a)' ,
  616. 'email' : '\a' ,
  617. 'emailMark' : '\a (\a)' ,
  618. 'img' : '\a' ,
  619. 'comment' : '.\\" \a'
  620. }
  621. elif doctype == "pm6":
  622. tags = {
  623. 'paragraph' : '<@Normal:>' ,
  624. 'title1' : '\n<@Title1:>\a' ,
  625. 'title2' : '\n<@Title2:>\a' ,
  626. 'title3' : '\n<@Title3:>\a' ,
  627. 'title4' : '\n<@Title4:>\a' ,
  628. 'title5' : '\n<@Title5:>\a' ,
  629. 'areaPreOpen' : '<@PreFormat:>' ,
  630. 'areaQuoteOpen' : '<@Quote:>' ,
  631. 'fontMonoOpen' : '<FONT "Lucida Console"><SIZE 9>' ,
  632. 'fontMonoClose' : '<SIZE$><FONT$>' ,
  633. 'fontBoldOpen' : '<B>' ,
  634. 'fontBoldClose' : '<P>' ,
  635. 'fontItalicOpen' : '<I>' ,
  636. 'fontItalicClose' : '<P>' ,
  637. 'fontBolditalicOpen' : '<B><I>' ,
  638. 'fontBolditalicClose': '<P>' ,
  639. 'fontUnderlineOpen' : '<U>' ,
  640. 'fontUnderlineClose' : '<P>' ,
  641. 'listOpen' : '<@Bullet:>' ,
  642. 'listItem' : '\x95 ' , # \x95 == ~U
  643. 'numlistOpen' : '<@Bullet:>' ,
  644. 'numlistItem' : '\x95 ' ,
  645. 'bar1' : '\a' ,
  646. 'bar2' : '\a' ,
  647. 'url' : '<U>\a<P>' , # underline
  648. 'urlMark' : '\a <U>\a<P>' ,
  649. 'email' : '\a' ,
  650. 'emailMark' : '\a \a' ,
  651. 'img' : '\a' ,
  652. }
  653. # create empty tags keys
  654. for key in keys:
  655. if not tags.has_key(key):
  656. tags[key] = ''
  657. else:
  658. tags[key] = escapePythonSpecials(tags[key])
  659. return tags
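# How these tag strings are used later: the '\a' (bell) characters act as
# placeholders, and regex['x'] (compiled from '\a' in getRegexes) fills them
# in, e.g. with the HTML tags above:
#   regex['x'].sub('My Title', '<H1>\a</H1>')  ->  '<H1>My Title</H1>'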
  660. def getRules(doctype):
  661. ret = {}
  662. allrules = [
  663. # target rules (ON/OFF)
  664. 'linkable', # target supports external links
  665. 'tableable', # target supports tables
  666. 'imgalignable', # target supports image alignment
  667. 'listcountable', # target supports numbered lists natively
  668. 'tablecellsplit', # place delimiters only *between* cells
  669. 'listnotnested', # lists cannot be nested
  670. 'quotenotnested', # quotes cannot be nested
  671. 'preareanotescaped', # don't escape specials in PRE area
  672. # target code beautify (ON/OFF)
  673. 'indentprearea', # add leading spaces to PRE area lines
  674. 'breaktablecell', # break lines after any table cell
  675. 'breaktablelineopen', # break line after opening table line
  676. 'keepquoteindent', # don't remove the leading TABs on quotes
  677. # value settings
  678. 'listmaxdepth', # maximum depth for lists
  679. ]
  680. rules = {
  681. 'txt' : {
  682. 'indentprearea':1
  683. },
  684. 'html': {
  685. 'indentprearea':1,
  686. 'linkable':1,
  687. 'imgalignable':1,
  688. 'listcountable':1,
  689. 'tableable':1,
  690. 'breaktablecell':1,
  691. 'breaktablelineopen':1,
  692. 'keepquoteindent':1
  693. },
  694. 'sgml': {
  695. 'linkable':1,
  696. 'listcountable':1,
  697. 'tableable':1,
  698. 'tablecellsplit':1,
  699. 'quotenotnested':1,
  700. 'keepquoteindent':1
  701. },
  702. 'mgp' : {
  703. },
  704. 'tex' : {
  705. 'listcountable':1,
  706. 'tableable':1,
  707. 'tablecellsplit':1,
  708. 'preareanotescaped':1,
  709. 'listmaxdepth':4
  710. },
  711. 'moin': {
  712. 'linkable':1,
  713. 'tableable':1
  714. },
  715. 'man' : {
  716. 'indentprearea':1,
  717. 'listnotnested':1
  718. },
  719. 'pm6' : {
  720. }
  721. }
  722. # populate return dictionary
  723. myrules = rules[doctype]
  724. for key in allrules : ret[key] = 0 # reset all
  725. for key in myrules.keys(): ret[key] = myrules[key] # turn ON
  726. return ret
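# What a caller gets back: every rule name from allrules is present, with the
# per-target entries switched on, e.g.:
#   getRules('html')['tableable']  ->  1
#   getRules('txt')['tableable']   ->  0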
  727. def getRegexes():
  728. regex = {
  729. # extra at end: (\[(?P<label>\w+)\])?
  730. 'title':
  731. re.compile(r'^\s*(?P<tag>={1,5})(?P<txt>[^=].*[^=])\1$'),
  732. 'areaPreOpen':
  733. re.compile(r'^---$'),
  734. 'areaPreClose':
  735. re.compile(r'^---$'),
  736. 'quote':
  737. re.compile(r'^\t+'),
  738. '1linePreOld':
  739. re.compile(r'^ {4}([^\s-])'),
  740. '1linePre':
  741. re.compile(r'^--- '),
  742. 'fontMono':
  743. re.compile(r'`([^`]+)`'),
  744. 'fontBold':
  745. re.compile(r'\*\*([^\s*].*?)\*\*'),
  746. 'fontItalic':
  747. re.compile(r'(^|[^:])//([^ /].*?)//'),
  748. 'fontUnderline':
  749. re.compile(r'__([^_].*?)__'), # underline lead/trailing blank
  750. 'fontBolditalic':
  751. re.compile(r'\*/([^/].*?)/\*'),
  752. 'list':
  753. re.compile(r'^( *)([+-]) ([^ ])'),
  754. 'deflist':
  755. re.compile(r'^( *)(=) ([^:]+):'),
  756. 'bar':
  757. re.compile(r'^\s*([_=-]{20,})\s*$'),
  758. 'table':
  759. re.compile(r'^ *\|\|? '),
  760. 'blankline':
  761. re.compile(r'^\s*$'),
  762. 'comment':
  763. re.compile(r'^%'),
  764. 'raw':
  765. re.compile(r'``(.+?)``')
  766. }
  767. # special char to place data on TAGs contents (\a == bell)
  768. regex['x'] = re.compile('\a')
  769. # %%date [ (formatting) ]
  770. regex['date'] = re.compile(r'%%date\b(\((?P<fmt>.*?)\))?', re.I)
  771. ### complicated regexes begin here ;)
  772. #
  773. # textual descriptions in --help's style: [...] is optional, | is OR
  774. ### first, some auxiliary variables
  775. #
  776. # [image.EXT]
  777. patt_img = r'\[([\w_,.+%$#@!?+~/-]+\.(png|jpe?g|gif|eps|bmp))\]'
  778. # link things
  779. urlskel = {
  780. 'proto' : r'(https?|ftp|news|telnet|gopher|wais)://',
  781. 'guess' : r'(www[23]?|ftp)\.', # w/out proto, try to guess
  782. 'login' : r'A-Za-z0-9_.-', # for ftp://login@domain.com
  783. 'pass' : r'[^ @]*', # for ftp://login:password@domain.com
  784. 'chars' : r'A-Za-z0-9%._/~:,=$@-',# %20(space), :80(port)
  785. 'anchor': r'A-Za-z0-9%._-', # %nn(encoded)
  786. 'form' : r'A-Za-z0-9/%&=+.@*_-', # .@*_-(as is)
  787. 'punct' : r'.,;:!?'
  788. }
  789. # username [ :password ] @
  790. patt_url_login = r'([%s]+(:%s)?@)?'%(urlskel['login'],urlskel['pass'])
  791. # [ http:// ] [ username:password@ ] domain.com [ / ] [ #anchor | ?form=data ]
  792. retxt_url = r'\b(%s%s|%s)[%s]+\b/*(\?[%s]+)?(#[%s]+)?'%(
  793. urlskel['proto'],patt_url_login, urlskel['guess'],
  794. urlskel['chars'],urlskel['form'],urlskel['anchor'])
  795. # filename | [ filename ] #anchor
  796. retxt_url_local = r'[%s]+|[%s]*(#[%s]+)'%(
  797. urlskel['chars'],urlskel['chars'],urlskel['anchor'])
  798. # user@domain [ ?form=data ]
  799. patt_email = r'\b[%s]+@([A-Za-z0-9_-]+\.)+[A-Za-z]{2,4}\b(\?[%s]+)?'%(
  800. urlskel['login'],urlskel['form'])
  801. # saving for future use
  802. regex['_urlskel'] = urlskel
  803. ### and now the real regexes
  804. #
  805. regex['email'] = re.compile(patt_email,re.I)
  806. # email | url
  807. regex['link'] = \
  808. re.compile(r'%s|%s'%(retxt_url,patt_email), re.I)
  809. # \[ label | imagetag url | email | filename \]
  810. regex['linkmark'] = \
  811. re.compile(r'\[(?P<label>%s|[^]]+) (?P<link>%s|%s|%s)\]'%(
  812. patt_img, retxt_url, patt_email, retxt_url_local),
  813. re.L+re.I)
  814. # image
  815. regex['img'] = re.compile(patt_img, re.L+re.I)
  816. # all macros
  817. regex['macro'] = regex['date']
  818. # special things
  819. regex['special'] = re.compile(r'^%!\s*')
  820. regex['setting'] = re.compile(r'(Encoding)\s*:\s*(.+)\s*$',re.I)
  821. return regex
  822. ### END OF regex nightmares
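# A small sanity check for the trickier patterns above:
#   r = getRegexes()
#   m = r['title'].search('== My Section ==')
#   m.group('tag'), m.group('txt')                     ->  ('==', ' My Section ')
#   r['fontBold'].sub(r'<B>\1</B>', 'a **big** deal')  ->  'a <B>big</B> deal'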
  823. class SubareaMaster:
  824. def __init__(self) : self.x = []
  825. def __call__(self) :
  826. if not self.x: return ''
  827. return self.x[-1]
  828. def add(self, area):
  829. if not self.x or (self.x and self.x[-1] != area):
  830. self.x.append(area)
  831. Debug('subarea ++ (%s): %s' % (area,self.x), 1)
  832. def pop(self, area=None):
  833. if area and self.x[-1] == area: self.x.pop()
  834. Debug('subarea -- (%s): %s' % (area,self.x), 1)
  835. def doHeader(doctype, headdic):
  836. if not HEADER_TEMPLATE.has_key(doctype):
  837. Error("doheader: Unknow doctype '%s'"%doctype)
  838. Debug('HEADER data: %s'%headdic, 1)
  839. template = string.split(HEADER_TEMPLATE[doctype], '\n')
  840. # scan for empty dictionary keys
  841. # if found, scan template lines for that key reference
  842. # if found, remove the reference
  843. # if there aren't any other key reference on the same line, remove it
  844. for key in headdic.keys():
  845. if not headdic[key]:
  846. for line in template:
  847. if string.count(line, key):
  848. sline = string.replace(
  849. line, '%%(%s)s'%key, '')
  850. if not string.count(sline, '%(HEADER'):
  851. template.remove(line)
  852. # populate template with data
  853. template = string.join(template, '\n') % headdic
  854. ### post processing
  855. #
  856. # TOC is a header tag
  857. if FLAGS['toc'] and TAGS['TOC']:
  858. toctag = re.sub('.*', TAGS['TOC'], '') #force_re
  859. template = template + toctag
  860. #
  861. # let tex format today
  862. if doctype == 'tex' and headdic['HEADER3'] == currdate:
  863. template = re.sub(r'\\date\{.*?}', r'\date', template)
  864. return string.split(template, '\n')
  865. def doCommentLine(doctype,txt):
  866. # the -- string ends a sgml comment :(
  867. if doctype == 'sgml':
  868. txt = string.replace(txt, '--', '\\-\\-')
  869. if TAGS['comment']:
  870. return regex['x'].sub(txt, TAGS['comment'])
  871. return ''
  872. def doFooter(doctype):
  873. ret = []
  874. typename = doctype
  875. if doctype == 'tex': typename = 'LaTeX2e'
  876. ppgd = '%s code generated by txt2tags %s (%s)'%(
  877. typename,my_version,my_url)
  878. cmdline = 'cmdline: txt2tags %s'%string.join(CMDLINE[1:], ' ')
  879. ret.append('\n'+doCommentLine(doctype,ppgd))
  880. ret.append(doCommentLine(doctype,cmdline))
  881. ret.append(TAGS['EOD'])
  882. return ret
  883. def doEscape(doctype,txt):
  884. if doctype == 'html' or doctype == 'sgml':
  885. txt = re.sub('&','&amp;',txt)
  886. txt = re.sub('<','&lt;',txt)
  887. txt = re.sub('>','&gt;',txt)
  888. if doctype == 'sgml':
  889. txt = re.sub('\xff','&yuml;',txt) # "+y
  890. elif doctype == 'pm6':
  891. txt = re.sub('<','<\#60>',txt)
  892. elif doctype == 'mgp':
  893. txt = re.sub('^%',' %',txt) # add leading blank to avoid parse
  894. #txt = re.sub('^%([^%])','%prefix ""\n %\n%cont, prefix " "\n\\1',txt)
  895. elif doctype == 'man':
  896. txt = re.sub('^\.', ' .',txt) # command ID
  897. txt = doEscapeEscapechar(txt)
  898. elif doctype == 'tex':
  899. txt = string.replace(txt, '\\', r'\verb!\!')
  900. txt = string.replace(txt, '~', r'\verb!~!')
  901. txt = string.replace(txt, '^', r'\verb!^!')
  902. txt = re.sub('([#$&%{}])', r'\\\1', txt)
  903. # TIP the _ is escaped at end
  904. return txt
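# Example of the html/sgml branch above (the ampersand is converted first, so
# the &lt;/&gt; entities produced afterwards are not escaped twice):
#   doEscape('html', 'if a < b & b > c')  ->  'if a &lt; b &amp; b &gt; c'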
  905. def doFinalEscape(doctype, txt):
  906. if doctype == 'pm6' : txt = string.replace(txt, r'\<',r'<\#92><')
  907. elif doctype == 'man' : txt = string.replace(txt, '-', r'\-')
  908. elif doctype == 'tex' : txt = string.replace(txt, '_', r'\_')
  909. elif doctype == 'sgml': txt = string.replace(txt, '[', '&lsqb;')
  910. return txt
  911. def doEscapeEscapechar(txt):
  912. return string.replace(txt, '\\', '\\\\')
  913. def addLineBreaks(list):
  914. "use LB to respect sys.platform"
  915. ret = []
  916. for line in list:
  917. line = string.replace(line,'\n',LB) # embedded \n's
  918. ret.append(line+LB) # add final line break
  919. return ret
  920. def doPreLine(doctype,line):
  921. "Parsing procedures for preformatted (verbatim) lines"
  922. if not rules['preareanotescaped']: line = doEscape(doctype,line)
  923. if rules['indentprearea']: line = ' '+line
  924. if doctype == 'pm6': line = doFinalEscape(doctype, line)
  925. return line
  926. def doCloseTable(doctype):
  927. global subarea, tableborder
  928. ret = ''
  929. if rules['tableable']:
  930. if doctype == 'tex' and tableborder:
  931. ret = TAGS['tableLineOpen']+TAGS['tableClose']+'\n'
  932. else:
  933. ret = TAGS['tableClose']+'\n'
  934. else:
  935. ret = TAGS['areaPreClose']
  936. tableborder = 0
  937. subarea.pop('table')
  938. return ret
  939. def doCloseQuote(howmany=None):
  940. global quotedepth
  941. ret = []
  942. if not howmany: howmany = len(quotedepth)
  943. for i in range(howmany):
  944. quotedepth.pop()
  945. #TODO align open/close tag -> FREE_ALIGN_TAG = 1 (man not)
  946. ret.append(TAGS['areaQuoteClose'])
  947. if not quotedepth: subarea.pop('quote')
  948. return string.join(ret,'\n')
  949. def doCloseList(howmany=None):
  950. global listindent, listids
  951. ret = []
  952. if not howmany: howmany = len(listindent)
  953. for i in range(howmany):
  954. if listids[-1] == '-': tag = TAGS['listClose']
  955. elif listids[-1] == '+': tag = TAGS['numlistClose']
  956. elif listids[-1] == '=': tag = TAGS['deflistClose']
  957. if not tag: tag = TAGS['listClose'] # default
  958. if tag:
  959. # unnested lists are only closed at mother-list
  960. if rules['listnotnested']:
  961. if len(listindent) == 1:
  962. ret.append(tag)
  963. else:
  964. ret.append(listindent[-1]+tag)
  965. del listindent[-1]
  966. del listids[-1]
  967. if not listindent: subarea.pop('list')
  968. return string.join(ret,'\n')
  969. def beautify_me(name, doctype, line):
  970. "where name is: bold, italic, underline or bolditalic"
  971. name = 'font%s' % string.capitalize(name)
  972. open = TAGS['%sOpen'%name]
  973. close = TAGS['%sClose'%name]
  974. txt = r'%s\1%s'%(open, close)
  975. if name == 'fontItalic':
  976. txt = r'\1%s\2%s'%(open, close)
  977. line = regex[name].sub(txt,line)
  978. return line
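# With TAGS and regex already set up by convert() for the 'html' target, the
# bold case behaves like:
#   beautify_me('bold', 'html', 'this is **important** stuff')
#     -> 'this is <B>important</B> stuff'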
  979. def get_tagged_link(doctype, label, url):
  980. ret = ''
  981. # set link type
  982. if regex['email'].match(url):
  983. linktype = 'email'
  984. else:
  985. linktype = 'url';
  986. # adding protocol to guessed link
  987. guessurl = ''
  988. if linktype == 'url' and \
  989. re.match(regex['_urlskel']['guess'], url):
  990. if url[0] == 'w': guessurl = 'http://' +url
  991. else : guessurl = 'ftp://' +url
  992. # not link aware targets -> protocol is useless
  993. if not rules['linkable']: guessurl = ''
  994. # escape specials from TEXT parts
  995. label = doEscape(doctype,label)
  996. if not rules['linkable']:
  997. if doctype == 'tex':
  998. url = re.sub('^#', '\#', url) # ugly, but it compiles
  999. else:
  1000. url = doEscape(doctype,url)
  1001. # simple link (not guessed)
  1002. if not label and not guessurl:
  1003. if FLAGS['maskemail'] and linktype == 'email':
  1004. # do the email mask feature (no TAGs, just text)
  1005. url = string.replace(url,'@',' (a) ')
  1006. url = string.replace(url,'.',' ')
  1007. url = "<%s>" % url
  1008. if rules['linkable']: url = doEscape(doctype,url)
  1009. ret = url
  1010. else:
  1011. # just add link data to tag
  1012. tag = re.sub('.*', TAGS[linktype], '') #force_re
  1013. ret = regex['x'].sub(url,tag)
  1014. # named link or guessed simple link
  1015. else:
  1016. # adjusts for guessed link
  1017. if not label: label = url # no protocol
  1018. if guessurl : url = guessurl # with protocol
  1019. # handle \ on link label
  1020. label = doEscapeEscapechar(label)
  1021. # putting data on the right appearance order
  1022. if rules['linkable']:
  1023. urlorder = [url, label] # link before label
  1024. else:
  1025. urlorder = [label, url] # label before link
  1026. # get tag
  1027. ret = re.sub('.*', TAGS["%sMark"%linktype], '') #force_re
  1028. # add link data to tag (replace \a's)
  1029. for data in urlorder:
  1030. ret = regex['x'].sub(data,ret,1)
  1031. return ret
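# Typical results for the 'html' target (assumes TAGS, rules and regex were
# already set up by convert(), and maskemail is off):
#   get_tagged_link('html', 'txt2tags', 'http://txt2tags.sf.net')
#     -> '<A HREF="http://txt2tags.sf.net">txt2tags</A>'
#   get_tagged_link('html', '', 'www.example.com')    # protocol is guessed
#     -> '<A HREF="http://www.example.com">www.example.com</A>'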
  1032. def get_image_align(line):
  1033. align = ''
  1034. line = string.strip(line)
  1035. m = regex['img'].search(line)
  1036. ini = m.start() ; head = 0
  1037. end = m.end() ; tail = len(line)
  1038. align = 'center' # default align # ^text +img +text$
  1039. if ini == head and end == tail: align = 'para' # ^img$
  1040. elif ini == head: align = 'left' # ^img + text$
  1041. elif end == tail: align = 'right' # ^text + img$
  1042. return align
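# Alignment depends on where the image sits in the (stripped) line, e.g.:
#   get_image_align('[logo.png]')            ->  'para'    (image alone)
#   get_image_align('[logo.png] some text')  ->  'left'
#   get_image_align('some text [logo.png]')  ->  'right'
#   get_image_align('text [logo.png] text')  ->  'center'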
  1043. def get_table_prop(line):
  1044. # default table properties
  1045. ret = {'border': 0, 'header':0, 'cells':[]}
  1046. line = string.strip(line)
  1047. # detect header (title) mark
  1048. if line[1] == '|':
  1049. ret['header'] = 1
  1050. # detect (and delete) border mark
  1051. if line[-1] == '|':
  1052. ret['border'] = 1
  1053. line = line[:-1]
  1054. # delete table mark
  1055. line = regex['table'].sub('', line)
  1056. # split cells
  1057. ret['cells'] = string.split(line, ' | ')
  1058. Debug('Table Prop: %s' % ret, 1)
  1059. return ret
  1060. #TODO if ' | ' table cell is center align
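# Examples of the dictionary built above (cells keep their raw spacing here
# and are stripped later, in tag_table_cells):
#   get_table_prop('| cell 1 | cell 2')
#     -> {'border': 0, 'header': 0, 'cells': ['cell 1', 'cell 2']}
#   get_table_prop('|| Name | Age |')
#     -> {'border': 1, 'header': 1, 'cells': ['Name', 'Age ']}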
  1061. def tag_table_cells(table, doctype):
  1062. ret = ''
  1063. # plain cell
  1064. open, close = TAGS['tableCellOpen'], TAGS['tableCellClose']
  1065. # title cell
  1066. if table['header']:
  1067. open = TAGS['tableTitleCellOpen']
  1068. close = TAGS['tableTitleCellClose']
  1069. # should we break the line?
  1070. if rules['breaktablecell']: close = close+'\n'
  1071. # here we go
  1072. while table['cells']:
  1073. cel = table['cells'].pop(0)
  1074. if not cel and doctype == 'html':
  1075. cel = '&nbsp;'
  1076. # last cell gotchas
  1077. if not table['cells']:
  1078. # don't need cell separator
  1079. if rules['tablecellsplit']: close = ''
  1080. # close beautifier for last title cell
  1081. if doctype == 'tex' and table['header']: close = '}'
  1082. newcell = open + string.strip(cel) + close
  1083. newcell = re.sub('.*', newcell, '') #force_re
  1084. ret = ret + newcell
  1085. return ret
  1086. # reference: http://www.iana.org/assignments/character-sets
  1087. # http://www.drclue.net/F1.cgi/HTML/META/META.html
  1088. def get_encoding_string(enc, doctype):
  1089. if not enc: return ''
  1090. # target specific translation table
  1091. translate = {
  1092. 'tex': {
  1093. # missing: ansinew , applemac , cp437 , cp437de , cp865
  1094. 'us-ascii' : 'ascii',
  1095. 'windows-1250': 'cp1250',
  1096. 'windows-1252': 'cp1252',
  1097. 'ibm850' : 'cp850',
  1098. 'ibm852' : 'cp852',
  1099. 'iso-8859-1' : 'latin1',
  1100. 'iso-8859-2' : 'latin2',
  1101. 'iso-8859-3' : 'latin3',
  1102. 'iso-8859-4' : 'latin4',
  1103. 'iso-8859-5' : 'latin5',
  1104. 'iso-8859-9' : 'latin9',
  1105. 'koi8-r' : 'koi8-r'
  1106. }
  1107. }
  1108. # normalization
  1109. enc = re.sub('(?i)(us[-_]?)?ascii|us|ibm367','us-ascii' , enc)
  1110. enc = re.sub('(?i)(ibm|cp)?85([02])' ,'ibm85\\2' , enc)
  1111. enc = re.sub('(?i)(iso[_-]?)?8859[_-]?' ,'iso-8859-' , enc)
  1112. enc = re.sub('iso-8859-($|[^1-9]).*' ,'iso-8859-1', enc)
  1113. # apply translation table
  1114. try: enc = translate[doctype][string.lower(enc)]
  1115. except: pass
  1116. return enc
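# The normalization above maps common spellings to canonical charset names:
#   get_encoding_string('ISO_8859-1', 'html')  ->  'iso-8859-1'
#   get_encoding_string('ascii', 'html')       ->  'us-ascii'
# and for the 'tex' target the translation table maps 'iso-8859-1' to the
# 'latin1' name that \usepackage[...]{inputenc} expects.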
  1117. ################################################################################
  1118. ###MerryChristmas,IdontwanttofighttonightwithyouImissyourbodyandIneedyourlove###
  1119. ################################################################################
  1120. def convert(inlines, doctype):
  1121. # global vars for doClose*()
  1122. global TAGS, regex, rules, quotedepth, listindent, listids
  1123. global subarea, tableborder
  1124. TAGS = getTags(doctype)
  1125. rules = getRules(doctype)
  1126. regex = getRegexes()
  1127. # the defaults
  1128. linkmask = '@@_link_@@'
  1129. monomask = '@@_mono_@@'
  1130. macromask = '@@_macro_@@'
  1131. rawmask = '@@_raw_@@'
  1132. AREA = NewArea('head',0) # then conf, then body
  1133. subarea = SubareaMaster()
  1134. HEADERS = { 'HEADER1': '-NO TITLE-', 'HEADER2':'', 'HEADER3':'',
  1135. 'ENCODING': '' }
  1136. ret = []
  1137. toclist = []
  1138. header = []
  1139. f_tt = 0
  1140. listindent = []
  1141. listids = []
  1142. listcount = []
  1143. titlecount = ['',0,0,0,0,0]
  1144. f_lastblank = 0
  1145. holdspace = ''
  1146. listholdspace = ''
  1147. quotedepth = []
  1148. tableborder = 0
  1149. tablealign = []
  1150. if outfile != pipefileid:
  1151. if not FLAGS['gui']:
  1152. print "--- %s..."%doctype
  1153. # let's mark it up!
  1154. linenr = 0
  1155. for lineref in range(len(inlines)):
  1156. skip_continue = 0
  1157. linkbank = []
  1158. monobank = []
  1159. macrobank = []
  1160. rawbank = []
  1161. linenr = lineref +1
  1162. untouchedline = inlines[lineref]
  1163. line = string.rstrip(untouchedline)
  1164. Debug('LINE %04d: %s' % (linenr,repr(line)), 1) # for heavy debug
  1165. # detect if head section is over
  1166. #TIP 'not line' depends on previous line.rstrip()
  1167. if (linenr == 4 and AREA == 'head') or \
  1168. (linenr == 1 and not line):
  1169. AREA = NewArea('conf',linenr)
  1170. # we need (not really) to mark each paragraph
  1171. #TODO check if this is really needed
  1172. if doctype == 'pm6' and f_lastblank:
  1173. if f_tt or AREA == 'head' or listindent:
  1174. holdspace = ''
  1175. else:
  1176. holdspace = TAGS['paragraph']+'\n'
  1177. # any NOT table line, closes an open table
  1178. if subarea() == 'table' and not regex['table'].search(line):
  1179. ret.append(doCloseTable(doctype))
  1180. #---------------------[ PRE formatted ]----------------------
  1181. #TIP we'll never support beautifiers inside pre-formatted
  1182. # we're already on a PRE area
  1183. if f_tt:
  1184. # closing PRE
  1185. if regex['areaPreClose'].search(line):
  1186. if doctype != 'pm6':
  1187. ret.append(TAGS['areaPreClose'])
  1188. f_tt = 0
  1189. continue
  1190. # normal PRE-inside line
  1191. line = doPreLine(doctype, line)
  1192. ret.append(line)
  1193. continue
  1194. # detecting PRE area init
  1195. if regex['areaPreOpen'].search(line):
  1196. ret.append(TAGS['areaPreOpen'])
  1197. f_lastblank = 0
  1198. f_tt = 1
  1199. continue
  1200. # one line PRE-formatted text
  1201. if regex['1linePre'].search(line):
  1202. f_lastblank = 0
  1203. line = regex['1linePre'].sub('',line)
  1204. line = doPreLine(doctype, line)
  1205. t1, t2 = TAGS['areaPreOpen'],TAGS['areaPreClose']
  1206. ret.append('%s\n%s\n%s'%(t1,line,t2))
  1207. continue
  1208. #---------------------[ blank lines ]-----------------------
  1209. #TODO "holdspace" to save <p> to not show in closelist
  1210. if regex['blankline'].search(line):
  1211. # closing all open quotes
  1212. if quotedepth:
  1213. ret.append(doCloseQuote())
  1214. # closing all open lists
  1215. if f_lastblank: # 2nd consecutive blank line
  1216. if listindent: # closes list (if any)
  1217. ret.append(doCloseList())
  1218. holdspace = ''
  1219. continue # consecutive blanks are trash
  1220. # normal blank line
  1221. if doctype != 'pm6' and AREA == 'body':
  1222. # paragraph (if any) is wanted inside lists also
  1223. if listindent:
  1224. para = TAGS['paragraph'] + '\n'
  1225. holdspace = holdspace + para
  1226. elif doctype == 'html':
  1227. ret.append(TAGS['paragraph'])
  1228. # sgml: quote close tag must not be \n\n</quote>
  1229. elif doctype == 'sgml' and quotedepth:
  1230. skip_continue = 1
  1231. # otherwise we just print a blank line
  1232. else:
  1233. ret.append('')
  1234. f_lastblank = 1
  1235. if not skip_continue: continue
  1236. else:
  1237. f_lastblank = 0 # reset blank status
  1238. #---------------------[ special ]------------------------
  1239. # just encoding for now
  1240. if regex['special'].search(line):
  1241. special = line[2:]
  1242. # try Settings
  1243. m = regex['setting'].match(special)
  1244. if m:
  1245. name = string.upper(m.group(1))
  1246. val = m.group(2)
  1247. if AREA == 'conf':
  1248. if name == 'ENCODING':
  1249. val = get_encoding_string(val,doctype)
  1250. HEADERS[name] = val
  1251. Debug("Found Setting '%s', value '%s'"%(
  1252. name,val),1,linenr)
  1253. else:
  1254. Debug('Ignoring Setting outside CONF area:'
  1255. ' %s'%name,1,linenr)
  1256. else:
  1257. Debug('Bogus Special Line',1,linenr)
  1258. #---------------------[ comments ]-----------------------
  1259. # just skip them (if not macro or setting)
  1260. if regex['comment'].search(line) and not regex['date'].match(line):
  1261. f_lastblank = 1
  1262. continue
  1263. #---------------------[ BODY detect ]-----------------------
  1264. ### if we got here, it's a header or a valid line
  1265. if AREA == 'conf':
  1266. # oops, not header, so we're now on document BODY
  1267. AREA = NewArea('body', linenr)
  1268. # do headers!
  1269. if not FLAGS['noheaders']:
  1270. header = doHeader(doctype,HEADERS)
  1271. # so, let's print the opening paragraph
  1272. if doctype != 'pm6':
  1273. ret.append(TAGS['paragraph'])
  1274. #---------------------[ Title ]-----------------------
  1275. # man: - should not be escaped, \ turns to \\\\
  1276. #TODO set next blank and set f_lastblank or f_lasttitle
  1277. if regex['title'].search(line) and not listindent and AREA == 'body':
  1278. m = regex['title'].search(line)
  1279. tag = m.group('tag')
  1280. level = len(tag)
  1281. tag = TAGS['title%s'%level]
  1282. txt = string.strip(m.group('txt'))
  1283. if FLAGS['enumtitle']: ### numbered title
  1284. id = '' ; n = level #
  1285. titlecount[n] = titlecount[n] +1 # add count
  1286. if n < len(titlecount)-1: # reset sublevels count
  1287. for i in range(n+1, len(titlecount)): titlecount[i] = 0
  1288. for i in range(n): # compose id from hierarchy
  1289. id = "%s%d."%(id,titlecount[i+1])
  1290. idtxt = "%s %s"%(id, txt) # add id to title
  1291. else:
  1292. idtxt = txt
  1293. anchorid = 'toc%d'%(len(toclist)+1)
  1294. if TAGS['anchor'] and FLAGS['toc'] and level <= 3:
  1295. ret.append(regex['x'].sub(anchorid,TAGS['anchor']))
  1296. # place title tag overriding line
  1297. line = regex['title'].sub(tag,line)
  1298. ### escape title text (unescaped text is used for TOC)
  1299. #
  1300. esctxt = doEscape(doctype,idtxt)
  1301. # sgml: [ is special in titles (and lists) - handled here because of the 'continue'
  1302. if doctype == 'sgml': esctxt = re.sub(r'\[', r'&lsqb;', esctxt)
  1303. esctxt = doEscapeEscapechar(esctxt) # for re.sub()
  1304. # man: \ on title becomes \\\\
  1305. if doctype == 'man': esctxt = doEscapeEscapechar(esctxt)
  1306. # finish title line
  1307. ret.append(regex['x'].sub(esctxt,line))
  1308. # let's do some TOC!
  1309. if TAGS['anchor']:
  1310. # tocitemid = '#toc%d'%(len(toclist)+1)
  1311. # TOC more readable with master topics not linked at number
  1312. # stole the idea from Windows .CHM files (help system)
  1313. if FLAGS['enumtitle'] and level == 1:
  1314. tocitem = '%s+ [``%s`` #%s]'%(' '*level,txt,anchorid)
  1315. else:
  1316. tocitem = '%s- [``%s`` #%s]'%(' '*level,idtxt,anchorid)
  1317. else:
  1318. tocitem = '%s- %s'%(' '*level,idtxt)
  1319. if doctype in ['txt', 'man']:
  1320. tocitem = '%s%s' %(' '*level,idtxt)
  1321. if level <= 3: toclist.append(tocitem) # max toc level: 3
  1322. # add "underline" to text titles
  1323. if doctype == 'txt':
  1324. ret.append(regex['x'].sub('='*len(idtxt),tag))
  1325. continue
  1326. #TODO! labeltxt = ''
  1327. # label = m.group('label')
  1328. # if label: labeltxt = '<label id="%s">' %label
  1329. #---------------------[ apply masks ]-----------------------
  1330. ### protect important structures from escaping and formatting
  1331. while regex['raw'].search(line):
  1332. txt = regex['raw'].search(line).group(1)
  1333. rawbank.append(doEscape(doctype,txt))
  1334. line = regex['raw'].sub(rawmask,line,1)
  1335. # protect pre-formatted font text
  1336. while regex['fontMono'].search(line):
  1337. txt = regex['fontMono'].search(line).group(1)
  1338. txt = doEscape(doctype,txt)
  1339. txt = escapePythonSpecials(txt)
  1340. monobank.append(txt)
  1341. line = regex['fontMono'].sub(monomask,line,1)
  1342. # protect macros
  1343. while regex['macro'].search(line):
  1344. txt = regex['macro'].search(line).group()
  1345. macrobank.append(txt)
  1346. line = regex['macro'].sub(macromask,line,1)
  1347. # protect URLs and emails
  1348. while regex['linkmark'].search(line) or regex['link'].search(line):
  1349. # try to match pl

Large files are truncated; the listing ends here, before the end of the original file.