/tags/rel-0-2/tools/dict2tei.py
Python | 204 lines | 127 code | 30 blank | 47 comment | 9 complexity | 6d80865c7b708145b334015a223d6eb4 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-3.0, LGPL-2.1, GPL-3.0, CC-BY-SA-3.0
- #!/usr/bin/python
- # Written by Petergozz, Jan 2004
- #
- # micha137: renamed from dict2xml.py
- #
- ## todo GPL 2+ ##
- ##### THIS IS ALPHA LEVEL NOT FOR PRODUCTION USE ###########
- ####### Requires Python2.3 or later ###########################
## TODO: write a proper TEI header (<teiHeader> etc.), e.g. in a
##       d2x_write_tei_header() function (not implemented here).
## TODO: detect .dz input files and uncompress them first
##       (use external tools if available, otherwise explore the gzip module;
##       dictzip is a modified gzip format, so this _might_ be doable).
- import sys
- import time
- import os
- import string
- import re
- #cool new way to do getopts :)
- #import optparse
- from optparse import OptionParser, OptionGroup
#
# Globals
#
VERSION = "-0.1.1"  # appended directly to the program name in banners and --version
chatty = None  # module-level placeholder; the functions below pass their own "Y"/"N" flag
app = os.path.basename(sys.argv[0])  # script name used as a prefix for console messages
start_time = time.asctime()  # stamped into the generated file's comment by d2x_write_prolog

#
# regex defs (pre-compiled); these are used in d2x_format_xml
#
# Raw strings are used so that \w and \s reach the regex engine as written
# instead of being treated as (invalid) string escape sequences.
#
# Headword: the line starts in column one with a word character (letter, digit
# or underscore).  Lines starting with punctuation such as "<" do NOT match.
rex_hdwd = re.compile(r'^\w.*$')
# Description: the line starts with two or more whitespace characters.
rex_descpt = re.compile(r'^\s\s+.*$')
## TODO add matches for parts of speech, pronunciation etc. here (hmm, more command line options ...)
## TODO add matches for file names here (to autogenerate output names)
## TODO add matches for 00-data etc. for dictd headers (possibly)
def _d2x_build_parser():
    """Build the optparse parser that declares every dict2tei command line option."""
    d2x_usage = '%prog -f dictfile [options]\n\n Defaults are provided for _everything_ except the dictfmt FILE to read from '
    cl_parser = OptionParser(d2x_usage, version="%prog"+VERSION)

    cl_parser.add_option("-f", "--file", type="string", action="store", dest="readfile", help="read dictfmt file from FILENAME")
    cl_parser.add_option("-v", "--verbose", action="store_true", dest="verbose", help="Tell me whats going on. ")
    cl_parser.add_option("-o", "--out", type="string", action="store", dest="writefile", default="dicttei.xml", help="write TEI/XML format file to FILENAME")

    groupDocT = OptionGroup(cl_parser, "Advanced Options for changing the DOCTYPE", "Use these to set a doctype string that works for your system")
    groupDocT.add_option("-s", "--dtdsys", type="string", action="store", dest="DTDsys", default='http://www.tei-c.org/Guidelines/DTD/tei2.dtd', help="set System DTD to PATH. NB: If your not using an XML/SGML catalog system you should set this to: /your/path/to/tei2.dtd")
    groupDocT.add_option("-p", "--dtdpub", type="string", action="store", dest="DTDpub", default='-//TEI P4//DTD Main Document Type//EN', help="set public DTD to \"Formal Public Identifier\" NB: You _will_ need to quote it")
    groupDocT.add_option("-t", "--dtdtype", type="string", action="store", dest="DTDtype", default="TEI.2", help="set non default DOCTYPE [TEI.2] ")
    cl_parser.add_option_group(groupDocT)

    groupXML = OptionGroup(cl_parser, "Advanced options for altering the default XML header.", "Use these if you need to change the defaults. There are no single switch options for these.")
    groupXML.add_option("--xmlver", type="string", action="store", dest="XMLver", default='1.0', help="Set XML version attribute. [\"1.0\" ]")
    groupXML.add_option("--xmllang", type="string", action="store", dest="XMLlang", default='en', help="set the XML code language attribute. [en]")
    groupXML.add_option("--xmlstand", type="string", action="store", dest="XMLstand", default='no', help="set the XML \"standalone\" attribute. [no]")
    groupXML.add_option("--xmlenc", type="string", action="store", dest="XMLenc", default='utf-8', help="set the XML character ISO code attribute. [utf-8] \n ")
    cl_parser.add_option_group(groupXML)
    ## TODO a really quiet option, a logging option and a dotfile prefs section
    return cl_parser


def d2x_getInput(argv=None):
    """Parse the command line, report what will happen, then run the conversion.

    argv -- list of command line arguments WITHOUT the program name.
            Defaults to sys.argv[1:], which is what a no-argument call used
            before this parameter existed.

    On success the XML prolog is written with d2x_write_prolog() and the
    dictionary body is appended with d2x_format_xml().

    Returns an empty tuple (kept from the original ``return()``).

    Exits through SystemExit with status 2 when:
      * no arguments at all were supplied,
      * stray positional arguments were supplied (the input file must be
        named with -f), or
      * no input file was named (usage help is printed first).
    optparse itself exits for -h / --version / malformed options.
    """
    if argv is None:
        argv = sys.argv[1:]

    cl_parser = _d2x_build_parser()

    # The original test read "len(cl_args) << 1": a left shift, not "<".  It
    # never fired for an empty command line and fired, with this unrelated
    # message, for any stray positional argument.
    if not argv:
        cl_parser.error("We need at least one thing to do.\n\n Have you supplied a file name for reading ?\n <::For help type::> "+ app +" -h")

    (cl_options, cl_args) = cl_parser.parse_args(argv)

    if cl_args:
        cl_parser.error("Unexpected argument(s): " + " ".join(cl_args) + "\n\n Use -f FILENAME to name the dictfmt file to read.\n <::For help type::> "+ app +" -h")

    # pull the exports out of the "getopt"
    dictFileIN = cl_options.readfile
    teiFileOut = cl_options.writefile
    dtdType = cl_options.DTDtype
    dtdSys = cl_options.DTDsys
    dtdPub = cl_options.DTDpub
    chat = cl_options.verbose
    xml_v = cl_options.XMLver
    xml_lang = cl_options.XMLlang
    xml_stand = cl_options.XMLstand
    xml_enc = cl_options.XMLenc

    if dictFileIN is None:
        print(app + " ::> No input file <::\n")
        cl_parser.print_help()
        # A missing input file is a usage error, so do not report success.
        sys.exit(2)

    print(app + " Reading from:::> " + dictFileIN + " <::\n")
    print(app + " Writing to: ::> " + teiFileOut + " <::\n")

    print(app + " REMINDER ::> This is Alpha level software ! <::")
    print(app + VERSION + " !!!!!!!!!!!! not for production use !!!!!!!!!!!!!!!!")

    # The worker functions expect verbosity as the strings "Y" / "N".
    if chat:
        print("command line options : " + str(cl_options))
        chatty = "Y"
        print("Chat mode is on" + chatty)
    else:
        chatty = "N"
        print(app + " Chat mode off")

    #
    # Now get to work: call the workhorses up
    #
    d2x_write_prolog(app, teiFileOut, dtdType, dtdPub, dtdSys, xml_v, xml_lang, xml_stand, xml_enc, chatty)
    d2x_format_xml(dictFileIN, teiFileOut, chatty)
    return ()
def d2x_write_prolog( this_app, fout, doc_t, doc_type_pub, doc_type_sys, xml_v, xml_lang, xml_stand, xml_enc, chatty ):
    """Create (or truncate) the output file and write the XML prolog into it.

    The prolog is the XML declaration, the DOCTYPE declaration with its two
    TEI parameter entities, and a comment recording when and by which program
    version the file was generated.

    this_app     -- program name recorded in the generated comment
    fout         -- path of the output file; opened with mode "w+"
    doc_t        -- DOCTYPE name (e.g. "TEI.2")
    doc_type_pub -- public identifier for the DOCTYPE
    doc_type_sys -- system identifier (path or URL of the DTD)
    xml_v        -- value for the declaration's version attribute
    xml_lang     -- accepted for interface compatibility but NOT written: an
                    XML declaration may only carry version, encoding and
                    standalone, so the former lang="..." made the output
                    not well-formed
    xml_stand    -- value for the declaration's standalone attribute
    xml_enc      -- value for the declaration's encoding attribute
    chatty       -- "Y" prints progress and the prolog itself to stdout

    Returns None.  Errors from opening or writing the file propagate; the
    file is closed either way.
    """
    if chatty == "Y":
        print("entered write prolog function")

    # prolog is just a concat of all the following:
    xml_head = '<?xml version="' + xml_v + '" encoding="' + xml_enc + '" standalone="' + xml_stand + '" ?>'
    doc_type = '<!DOCTYPE '+ doc_t+ ' PUBLIC \"'+ doc_type_pub +'\" \"' + doc_type_sys +'\" [ \n<!ENTITY % TEI.XML "INCLUDE" >\n<!ENTITY % TEI.dictionaries \"INCLUDE\" > \n]>\n<!--this file auto generated on ' +start_time +' by ' + this_app + VERSION +' \n please edit and rename --> '
    prolog = xml_head + '\n' + doc_type + '\n\n'

    xmlfile = open(fout, "w+")
    try:
        if chatty == "Y":
            print("Writing to ::>  " + str(xmlfile))
            print(prolog)
        xmlfile.write(prolog)
    finally:
        # Closing here flushes the prolog to disk before d2x_format_xml
        # reopens the same file for appending.
        xmlfile.close()
def d2x_format_xml(fin, fout, chatty ) :
    """Wrap the records of a dictd format file in TEI print dictionary tags.

    fin    -- path of the dictfmt source file (only read, never modified)
    fout   -- path of the XML file to APPEND to (d2x_write_prolog creates it)
    chatty -- "Y" prints a trace of every field to stdout

    Records are separated by two or more newlines.  Within a record a line
    matching rex_hdwd (starts with a word character) is written as
    <form><orth>..</orth></form> and a line matching rex_descpt (starts with
    two or more whitespace characters) as <def>..</def>; leading space of a
    definition is kept.  Any other line is skipped.  Tabs are turned into
    four spaces first so that tab-indented lines count as descriptions.

    The text is XML-escaped (&, <, >) before it is written, and records with
    no content are skipped instead of producing empty <entry> elements.

    Returns None.  I/O errors propagate; both files are closed either way.
    """
    # Local import so this function brings its own dependency with it.
    from xml.sax.saxutils import escape

    # Plain read mode: the former 'r+' needed write permission on the source.
    dictfmt = open(fin, 'r')
    try:
        # the whole file is read at once, as only newlines separate records
        ##TODO break into 100 line (+/- 8K) blocks to cope with big files
        dicttext = dictfmt.read()
    finally:
        dictfmt.close()

    xmlfile = open(fout, 'a')  # reopen the output file for appending
    try:
        # TODO make a spinner or % readout (after you improve cache and speed)
        for record in re.split(r'\n{2,}', dicttext):
            if not record.strip():
                continue  # nothing but white space, e.g. after the last record

            xmlfile.write('\n<entry>')
            # headwords start on "col one", defs etc. don't
            for field in record.replace('\t', '    ').split('\n'):
                if chatty == "Y":
                    print("found field")

                match_H = rex_hdwd.match(field)
                match_D = rex_descpt.match(field)
                if match_H:
                    if chatty == "Y":
                        print('Headword Match found:  ' + match_H.group())
                    xmlfile.write('\n<form><orth>' + escape(match_H.group()) + '</orth></form>')
                elif match_D:
                    if chatty == "Y":
                        print('Description Match found:  ' + match_D.group())
                    xmlfile.write('\n<def>' + escape(match_D.group()) + '</def>')
                else:
                    if chatty == "Y":
                        print('No match')

            # The end tag is written directly rather than via a '</entry>'
            # marker line pushed through the field matching above.
            if chatty == "Y":
                print('end entry')
            xmlfile.write('\n</entry>')
    finally:
        xmlfile.close()
#
# Script entry point: when this file is executed (not imported), run the
# command line front end, which parses the options and does all the work.
#
if __name__ == "__main__":
    d2x_getInput()  # NTS this is not C
    print(app + ": End Run")