PageRenderTime 28ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/tags/rel-0-2/tools/dict2tei.py

#
Python | 204 lines | 127 code | 30 blank | 47 comment | 9 complexity | 6d80865c7b708145b334015a223d6eb4 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-3.0, LGPL-2.1, GPL-3.0, CC-BY-SA-3.0
  1. #!/usr/bin/python
  2. # Written by Petergozz, Jan 2004
  3. #
  4. # micha137: renamed from dict2xml.py
  5. #
  6. ## todo GPL 2+ ##
  7. ##### THIS IS ALPHA LEVEL NOT FOR PRODUCTION USE ###########
  8. ####### Requires Python2.3 or later ###########################
  9. ## TODO will need TEI header > proper tei stuff too !!
  10. ## d2X_write_tei_header() (not here)
  11. ## TODO add detect for .dz files and uncompress
  12. ## (if tools on board if not .. explore the gzip modules :)
  13. ## dz is a modded version of gzip so _might_ be doable ?
  14. import sys
  15. import time
  16. import os
  17. import string
  18. import re
  19. #cool new way to do getopts :)
  20. #import optparse
  21. from optparse import OptionParser, OptionGroup
  22. #
  23. # Globals
  24. #
  25. VERSION = "-0.1.1"
  26. chatty = None
  27. app = os.path.basename(sys.argv[0])
  28. start_time = time.asctime()
  29. #
  30. # regex defs (pre-compiles) these are used in d2x_format_xml
  31. #
  32. rex_hdwd = re.compile('^\w.*$') #Headword starts with anything not a white space
  33. rex_descpt = re.compile('^\s\s+.*$') #Description starts with more than one white space
  34. ## TODO add matches for parts of speech, pronunciation etc. here hmm more command line options ...
  35. ## TODO add matches for file names here (to autogen out names)
  36. ## TODO add matches for 00-data etc for dictd headers (possibly)
  37. def d2x_getInput():
  38. d2x_usage = '%prog -f dictfile [options]\n\n Defaults are provided for _everything_ except the dictfmt FILE to read from '
  39. cl_parser = OptionParser(d2x_usage, version="%prog"+VERSION )
  40. cl_parser.add_option("-f", "--file", type="string", action="store", dest="readfile", help="read dictfmt file from FILENAME" )
  41. cl_parser.add_option("-v", "--verbose", action="store_true", dest="verbose", help="Tell me whats going on. ")
  42. cl_parser.add_option("-o", "--out", type="string", action="store", dest="writefile", default="dicttei.xml", help="write TEI/XML format file to FILENAME" )
  43. groupDocT = OptionGroup( cl_parser, "Advanced Options for changing the DOCTYPE", "Use these to set a doctype string that works for your system")
  44. groupDocT.add_option("-s", "--dtdsys", type="string", action="store", dest="DTDsys",default='http://www.tei-c.org/Guidelines/DTD/tei2.dtd' , help="set System DTD to PATH. NB: If your not using an XML/SGML catalog system you should set this to: /your/path/to/tei2.dtd" )
  45. groupDocT.add_option("-p", "--dtdpub", type="string",action="store", dest="DTDpub",default='-//TEI P4//DTD Main Document Type//EN', help="set public DTD to \"Formal Public Identifier\" NB: You _will_ need to quote it" )
  46. groupDocT.add_option("-t", "--dtdtype", type="string", action="store", dest="DTDtype", default="TEI.2", help="set non default DOCTYPE [TEI.2] " )
  47. cl_parser.add_option_group( groupDocT )
  48. groupXML = OptionGroup( cl_parser, "Advanced options for altering the default XML header.", "Use these if you need to change the defaults. There are no single switch options for these." )
  49. groupXML.add_option("--xmlver" , type="string", action="store", dest="XMLver", default='1.0', help="Set XML version attribute. [\"1.0\" ]" )
  50. groupXML.add_option("--xmllang", type="string", action="store", dest="XMLlang", default='en', help="set the XML code language attribute. [en]")
  51. groupXML.add_option("--xmlstand", type="string", action="store", dest="XMLstand", default='no', help="set the XML \"standalone\" attribute. [no]")
  52. groupXML.add_option("--xmlenc", type="string", action="store", dest="XMLenc", default='utf-8', help="set the XML character ISO code attribute. [utf-8] \n ")
  53. cl_parser.add_option_group( groupXML )
  54. ## TODO a really quiet option and a logging option and a dotfile prefs section and group the options so they don't scare the crap out of innocent bystanders.
  55. (cl_options, cl_args) = cl_parser.parse_args()
  56. #pull the exports out of the "getopt"
  57. dictFileIN = cl_options.readfile
  58. teiFileOut = cl_options.writefile
  59. dtdType = cl_options.DTDtype
  60. dtdSys = cl_options.DTDsys
  61. dtdPub = cl_options.DTDpub
  62. chat = cl_options.verbose
  63. xml_v = cl_options.XMLver
  64. xml_lang = cl_options.XMLlang
  65. xml_stand = cl_options.XMLstand
  66. xml_enc = cl_options.XMLenc
  67. # catch-me's here
  68. if len(cl_args) << 1: ## this still broken i will fix later
  69. cl_parser.error("We need at least one thing to do.\n\n Have you supplied a file name for reading ?\n <::For help type::> "+ app +" -h")
  70. elif dictFileIN == None :
  71. print app +" ::> No input file <::\n"
  72. cl_parser.print_help()
  73. sys.exit(0)
  74. else:
  75. print app +" Reading from:::> "+ dictFileIN + " <::\n"
  76. print app +" Writing to: ::> "+ teiFileOut +" <::\n"
  77. #Test for verbosity
  78. # (damm and blast this is clunky)
  79. print app+" REMINDER ::> This is Alpha level software ! <::"
  80. print app+ VERSION +" !!!!!!!!!!!! not for production use !!!!!!!!!!!!!!!!"
  81. if chat == True :
  82. print "command line options :", cl_options
  83. chatty = "Y"
  84. print "Chat mode is on" +chatty
  85. else :
  86. chatty = "N"
  87. print app +" Chat mode off"
  88. #
  89. #Now get to work
  90. #call the workhorses up
  91. #
  92. d2x_write_prolog( app, teiFileOut, dtdType, dtdPub, dtdSys, xml_v, xml_lang, xml_stand, xml_enc, chatty )
  93. d2x_format_xml( dictFileIN, teiFileOut, chatty )
  94. return()
  95. def d2x_write_prolog( this_app, fout, doc_t, doc_type_pub, doc_type_sys, xml_v, xml_lang, xml_stand, xml_enc, chatty ):
  96. if chatty == "Y":
  97. print "entered write prolog function"
  98. xmlfile = file(fout, "w+")
  99. if chatty == "Y":
  100. print "Writing to ::> ", xmlfile
  101. # prolog is just a concat of all the following:
  102. doc_type = '<!DOCTYPE '+ doc_t+ ' PUBLIC \"'+ doc_type_pub +'\" \"' + doc_type_sys +'\" [ \n<!ENTITY % TEI.XML "INCLUDE" >\n<!ENTITY % TEI.dictionaries \"INCLUDE\" > \n]>\n<!--this file auto generated on ' +start_time +' by ' + this_app + VERSION +' \n please edit and rename --> '
  103. xml_head = '<?xml version=\"'+xml_v+'\" encoding=\"'+xml_enc+'\" lang =\"'+xml_lang+'\" standalone=\"'+xml_stand+'\" ?>'
  104. #
  105. #So putting it all together we get
  106. #
  107. prolog = xml_head+'\n'+doc_type+'\n\n'
  108. if chatty == "Y" :
  109. print(prolog)
  110. xmlfile.write( prolog )
  111. xmlfile.close() # this seems safer and dumps the buffer (we need a lot of ram for big files)
  112. def d2x_format_xml(fin, fout, chatty ) :
  113. """d2x_format_xml()
  114. takes a dictd format file and wraps it in TEI print dictionary xml tags.
  115. Command line options exist for most sgml, XML attributes and file in and out names.
  116. Defaults are supplied for all but the file in name.
  117. """
  118. dictfmt = file( fin, 'r+',1 ) #open file in read and write mode line buffed only
  119. xmlfile = file( fout, 'a' ) # reopen the output file for appending
  120. # read all of dictfmt file to a list (as it only has new lines to differentiate with)
  121. dictlist = dictfmt.read()
  122. ##TODO break into 100 line (+/- 8K) blocks use seek to increment through the whole file?
  123. # now split the buffer by "2 or more new lines"
  124. dictarray = dictlist.split('\n\n')
  125. # TODO make a spinner or % readout(after you improve cache and speed)
  126. for record in dictarray:
  127. recSub1 = re.sub('^\n', "" , record)#tidy any leading newlines
  128. recSub2 = re.sub('\t', ' ', recSub1) # replace tabs with 4 spaces
  129. recSub3 = recSub2+'\n'+'</entry>' # tag the true end
  130. sub_string = recSub3.split('\n')
  131. #
  132. #note do not strip leading space from defs (yet)
  133. #
  134. # it should now be the case that headwords start on "col one" and defs etc don't
  135. xmlfile.write('\n<entry>')
  136. for field in sub_string:
  137. if chatty == "Y":
  138. print "found field"
  139. match_H = rex_hdwd.match( field )
  140. match_D = rex_descpt.match( field )
  141. match_End = re.search('</entry>', field)
  142. if match_H :
  143. if chatty == "Y":
  144. print 'Headword Match found: ', match_H.group()
  145. xmlfile.write('\n<form><orth>' )
  146. xmlfile.write(match_H.group())
  147. xmlfile.write('</orth></form>')
  148. elif match_D :
  149. if chatty == "Y":
  150. print 'Description Match found: ', match_D.group()
  151. xmlfile.write( '\n<def>')
  152. xmlfile.write( match_D.group() )
  153. xmlfile.write ('</def>')
  154. elif match_End :
  155. if chatty == "Y":
  156. print 'end entry'
  157. xmlfile.write ('\n</entry>')
  158. else:
  159. if chatty == "Y":
  160. print 'No match'
  161. #
  162. #detect mode of operation and gather an environment etc
  163. #we actually start from here if called to execute
  164. #
  165. if __name__ == "__main__":
  166. d2x_getInput() # NTS this is not C
  167. print app+ ": End Run"