PageRenderTime 145ms CodeModel.GetById 60ms app.highlight 41ms RepoModel.GetById 40ms app.codeStats 0ms

/tags/rel-0-2/tools/dict2tei.py

#
Python | 204 lines | 127 code | 30 blank | 47 comment | 8 complexity | 6d80865c7b708145b334015a223d6eb4 MD5 | raw file
  1#!/usr/bin/python
  2# Written by Petergozz, Jan 2004
  3#
  4# micha137: renamed from dict2xml.py
  5#
  6## todo GPL 2+ ##
  7##### THIS IS ALPHA LEVEL NOT FOR PRODUCTION USE ###########
  8####### Requires Python2.3 or later ###########################
  9##  TODO will need TEI header > proper tei stuff too !! 
 10##	d2X_write_tei_header() (not here)
 11## TODO add detect for .dz files and uncompress
 12## (if tools on board if not .. explore the gzip modules :)
 13## dz is a modded version of gzip so _might_ be  doable ?
 14
 15
import os
import re
import string
import sys
import time
#cool new way to do getopts :)
#import optparse
from optparse import OptionParser, OptionGroup
from xml.sax.saxutils import escape
 24
 25
 26
 27#
 28# Globals
 29#
 30VERSION = "-0.1.1"
 31chatty = None
 32app = os.path.basename(sys.argv[0])
 33start_time = time.asctime()
 34#
 35# regex defs (pre-compiles) these are used in d2x_format_xml
 36#
 37rex_hdwd = re.compile('^\w.*$') #Headword  starts with anything not a white space
 38rex_descpt = re.compile('^\s\s+.*$') #Description starts with more than one white space
## TODO add matches for parts of speech, pronunciation etc. here; hmm, more command line options ...
 40
 41## TODO add matches for file names here (to autogen out names)
 42
 43## TODO add matches for 00-data etc for dictd headers (possibly)
 44def d2x_getInput():
 45	d2x_usage = '%prog -f dictfile [options]\n\n Defaults are provided for _everything_ except the dictfmt FILE to read from '
 46	cl_parser = OptionParser(d2x_usage, version="%prog"+VERSION )
 47	
 48	cl_parser.add_option("-f", "--file", type="string", action="store",  dest="readfile",  help="read dictfmt file  from FILENAME" )
 49	cl_parser.add_option("-v", "--verbose", action="store_true", dest="verbose",  help="Tell me whats going on.  ")
 50	cl_parser.add_option("-o", "--out", type="string", action="store", dest="writefile", default="dicttei.xml",  help="write TEI/XML format file to FILENAME" )
 51	groupDocT = OptionGroup( cl_parser, "Advanced Options for changing the DOCTYPE", "Use these to set a doctype string that works for your system")
 52	groupDocT.add_option("-s", "--dtdsys", type="string", action="store", dest="DTDsys",default='http://www.tei-c.org/Guidelines/DTD/tei2.dtd' , help="set System DTD  to PATH.  NB: If your not using an XML/SGML catalog system you should set this to: /your/path/to/tei2.dtd" )
 53	groupDocT.add_option("-p", "--dtdpub", type="string",action="store", dest="DTDpub",default='-//TEI P4//DTD Main Document Type//EN', help="set public DTD to \"Formal Public Identifier\"  NB: You _will_ need to quote it" )
 54	groupDocT.add_option("-t", "--dtdtype", type="string", action="store", dest="DTDtype", default="TEI.2", help="set non default DOCTYPE [TEI.2] " )
 55	cl_parser.add_option_group( groupDocT )
 56	groupXML = OptionGroup( cl_parser, "Advanced options for altering the default XML header.", "Use these if you need to change the defaults. There are no single switch options for these." )
 57	
 58	groupXML.add_option("--xmlver" , type="string", action="store", dest="XMLver", default='1.0', help="Set XML version attribute. [\"1.0\" ]" )
 59	groupXML.add_option("--xmllang", type="string", action="store", dest="XMLlang", default='en', help="set the XML code language attribute. [en]")
 60	groupXML.add_option("--xmlstand", type="string", action="store", dest="XMLstand", default='no', help="set the XML \"standalone\"  attribute. [no]")
 61	groupXML.add_option("--xmlenc", type="string", action="store", dest="XMLenc", default='utf-8', help="set the XML character ISO code attribute. [utf-8] \n ")
 62	cl_parser.add_option_group( groupXML )
 63	## TODO a really quiet option and a logging option and a dotfile prefs section and group the options so they don't scare the crap out of innocent bystanders.
 64	(cl_options, cl_args)  = cl_parser.parse_args()
 65	
 66	#pull the exports out of the "getopt"
 67	dictFileIN = cl_options.readfile
 68	teiFileOut = cl_options.writefile
 69	dtdType = cl_options.DTDtype
 70	dtdSys = cl_options.DTDsys
 71	dtdPub = cl_options.DTDpub
 72	chat = cl_options.verbose
 73	xml_v = cl_options.XMLver
 74	xml_lang = cl_options.XMLlang
 75	xml_stand = cl_options.XMLstand
 76	xml_enc = cl_options.XMLenc
 77	
 78	# catch-me's here
 79	if len(cl_args) << 1: ## this still broken i will fix later
 80		cl_parser.error("We need at least one thing to do.\n\n Have you supplied a file name for reading ?\n <::For help type::> "+ app +" -h")
 81	elif dictFileIN == None :
 82		print app +"      ::>   No input file  <::\n"
 83		cl_parser.print_help()
 84		sys.exit(0)
 85	else:
 86		print app +"        Reading from:::> "+ dictFileIN + "  <::\n"
 87		print app +"        Writing to:  ::> "+ teiFileOut +" <::\n"
 88	
 89	#Test for verbosity
 90	# (damm and blast this is clunky)
 91	
 92	print app+" REMINDER ::> This is Alpha level software ! <::"
 93	print app+ VERSION +" !!!!!!!!!!!!  not for production use !!!!!!!!!!!!!!!!"
 94	if chat == True :
 95		print "command line options   :", cl_options
 96		chatty = "Y"
 97		print "Chat mode is on" +chatty
 98	else :
 99		chatty = "N"
100		print app +" Chat mode off"
101
102
103		
104	#
105	#Now get to work
106	#call the workhorses up
107	#
108	d2x_write_prolog( app, teiFileOut, dtdType, dtdPub, dtdSys, xml_v, xml_lang, xml_stand, xml_enc, chatty  )
109	d2x_format_xml( dictFileIN, teiFileOut, chatty  )
110	return()
111
112
def d2x_write_prolog(this_app, fout, doc_t, doc_type_pub, doc_type_sys, xml_v, xml_lang, xml_stand, xml_enc, chatty):
    """Create (or truncate) fout and write the XML declaration and DOCTYPE.

    Parameters:
        this_app      program name recorded in the generated comment
        fout          path of the output file; any existing content is lost
        doc_t         document type name placed after ``<!DOCTYPE``
        doc_type_pub  public identifier for the DTD
        doc_type_sys  system identifier (URL or path) for the DTD
        xml_v         value of the ``version`` pseudo-attribute
        xml_lang      language code; an XML declaration may only carry
                      version, encoding and standalone, so this is recorded
                      in the generated comment instead of the declaration
        xml_stand     value of the ``standalone`` pseudo-attribute
        xml_enc       value of the ``encoding`` pseudo-attribute
        chatty        "Y" to print progress messages, anything else for quiet

    The module globals ``start_time`` and ``VERSION`` are also written into
    the generated comment.
    """
    verbose = chatty == "Y"
    if verbose:
        print("entered write prolog function")

    # prolog is just a concat of all the following:
    xml_head = (
        '<?xml version="' + xml_v + '"'
        + '  encoding="' + xml_enc + '"'
        + '  standalone="' + xml_stand + '" ?>'
    )
    doc_type = (
        '<!DOCTYPE  ' + doc_t + '  PUBLIC "' + doc_type_pub + '"  "' + doc_type_sys + '" [ \n'
        + '<!ENTITY % TEI.XML            "INCLUDE" >\n'
        + '<!ENTITY % TEI.dictionaries "INCLUDE" > \n'
        + ']>\n'
        + '<!--this file  auto generated on   ' + start_time + '   by ' + this_app + VERSION + ' \n'
        + '     language: ' + xml_lang + ' \n'
        + '     please edit and rename  --> '
    )
    prolog = xml_head + '\n' + doc_type + '\n\n'

    # try/finally rather than "with": the file header promises Python 2.3
    # support.  Closing here flushes the prolog to disk before the entries
    # are appended by a separate open of the same file.
    xmlfile = open(fout, "w")
    try:
        if verbose:
            print("Writing to ::>  " + str(xmlfile))
            print(prolog)
        xmlfile.write(prolog)
    finally:
        xmlfile.close()
135	
136	
def d2x_format_xml(fin, fout, chatty):
    """Append the entries of a dictfmt file to fout, wrapped in TEI tags.

    Takes a dictd format file and wraps it in TEI print dictionary xml tags.
    Records are separated by runs of two or more newlines; blank records are
    skipped so they do not produce empty <entry> elements.  Within a record,
    lines matching ``rex_hdwd`` become <form><orth> elements, lines matching
    ``rex_descpt`` become <def> elements, and any other line is dropped.

    Parameters:
        fin     path of the dictfmt file; opened read-only
        fout    path of the output file; opened in append mode, so whatever
                it already contains is kept
        chatty  "Y" to print progress messages, anything else for quiet
    """
    verbose = chatty == "Y"

    # try/finally rather than "with": the file header promises Python 2.3
    # support.  The input is only ever read, so it is opened read-only and
    # works for dictionaries the user cannot write to.
    dictfmt = open(fin, 'r')
    try:
        ##TODO break into 100 line (+/- 8K) blocks use seek to increment through the whole file?
        dicttext = dictfmt.read()
    finally:
        dictfmt.close()

    # TODO make a spinner or % readout(after you improve cache and speed)
    xmlfile = open(fout, 'a')
    try:
        for record in re.split(r'\n{2,}', dicttext):
            if not record.strip():
                continue  # leading/trailing blank lines give empty records
            _d2x_write_entry(xmlfile, record, verbose)
    finally:
        xmlfile.close()


def _d2x_write_entry(xmlfile, record, verbose):
    """Write one dictfmt record to xmlfile as a single <entry> element."""
    xmlfile.write('\n<entry>')
    # Tabs become four spaces so a tab-indented definition still counts as
    # "two or more whitespace characters".  Leading space is not stripped
    # from definitions (yet).
    for field in record.replace('\t', '    ').split('\n'):
        if verbose:
            print("found field")

        # Headwords start in column one; definitions are indented.
        match_H = rex_hdwd.match(field)
        match_D = rex_descpt.match(field)
        if match_H:
            headword = match_H.group()
            if verbose:
                print('Headword Match found:  ' + headword)
            # escape() turns &, < and > into entities so the text cannot
            # break the surrounding markup.
            xmlfile.write('\n<form><orth>' + escape(headword) + '</orth></form>')
        elif match_D:
            description = match_D.group()
            if verbose:
                print('Description Match found:  ' + description)
            xmlfile.write('\n<def>' + escape(description) + '</def>')
        elif verbose:
            print('No match')

    if verbose:
        print('end entry')
    xmlfile.write('\n</entry>')
196
#
# Script entry point: the conversion only runs when this file is executed
# directly, not when it is imported as a module.
#
if __name__ == "__main__":
    # d2x_getInput() does everything: option parsing, prolog, entries.
    d2x_getInput()
    print(app + ": End Run")
204