PageRenderTime 2470ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/storage/lisa.py

https://github.com/mellterm/translate
Python | 372 lines | 332 code | 10 blank | 30 comment | 7 complexity | f3bdb200b52640e87bab3c2c6f5703cb MD5 | raw file
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2006-2009 Zuza Software Foundation
  5. #
  6. # This file is part of the Translate Toolkit.
  7. #
  8. # This program is free software; you can redistribute it and/or modify
  9. # it under the terms of the GNU General Public License as published by
  10. # the Free Software Foundation; either version 2 of the License, or
  11. # (at your option) any later version.
  12. #
  13. # This program is distributed in the hope that it will be useful,
  14. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. # GNU General Public License for more details.
  17. #
  18. # You should have received a copy of the GNU General Public License
  19. # along with this program; if not, see <http://www.gnu.org/licenses/>.
  20. """Parent class for LISA standards (TMX, TBX, XLIFF)"""
  21. import re
  22. from translate.storage import base
  23. from translate.lang import data
  24. try:
  25. from lxml import etree
  26. from translate.misc.xml_helpers import getText, getXMLlang, setXMLlang, \
  27. getXMLspace, setXMLspace, namespaced
  28. except ImportError, e:
  29. raise ImportError("lxml is not installed. It might be possible to continue without support for XML formats.")
  30. def _findAllMatches(text, re_obj):
  31. """generate match objects for all L{re_obj} matches in L{text}."""
  32. start = 0
  33. max = len(text)
  34. while start < max:
  35. m = re_obj.search(text, start)
  36. if not m:
  37. break
  38. yield m
  39. start = m.end()
  40. #TODO: we can now do better with our proper placeables support
  41. placeholders = ['(%[diouxXeEfFgGcrs])', r'(\\+.?)',
  42. '(%[0-9]$lx)', '(%[0-9]\$[a-z])', '(<.+?>)']
  43. re_placeholders = [re.compile(ph) for ph in placeholders]
  44. def _getPhMatches(text):
  45. """return list of regexp matchobjects for with all place holders in the
  46. L{text}"""
  47. matches = []
  48. for re_ph in re_placeholders:
  49. matches.extend(list(_findAllMatches(text, re_ph)))
  50. # sort them so they come sequentially
  51. matches.sort(lambda a, b: cmp(a.start(), b.start()))
  52. return matches
  53. class LISAunit(base.TranslationUnit):
  54. """
  55. A single unit in the file. Provisional work is done to make several
  56. languages possible.
  57. """
  58. #The name of the root element of this unit type:(termEntry, tu, trans-unit)
  59. rootNode = ""
  60. # The name of the per language element of this unit type:(termEntry, tu,
  61. # trans-unit)
  62. languageNode = ""
  63. #The name of the innermost element of this unit type:(term, seg)
  64. textNode = ""
  65. namespace = None
  66. _default_xml_space = "preserve"
  67. """The default handling of spacing in the absense of an xml:space
  68. attribute.
  69. This is mostly for correcting XLIFF behaviour."""
  70. def __init__(self, source, empty=False, **kwargs):
  71. """Constructs a unit containing the given source string"""
  72. self._rich_source = None
  73. self._rich_target = None
  74. if empty:
  75. self._state_n = 0
  76. return
  77. self.xmlelement = etree.Element(self.namespaced(self.rootNode))
  78. #add descrip, note, etc.
  79. super(LISAunit, self).__init__(source)
  80. def __eq__(self, other):
  81. """Compares two units"""
  82. if not isinstance(other, LISAunit):
  83. return super(LISAunit, self).__eq__(other)
  84. languageNodes = self.getlanguageNodes()
  85. otherlanguageNodes = other.getlanguageNodes()
  86. if len(languageNodes) != len(otherlanguageNodes):
  87. return False
  88. for i in range(len(languageNodes)):
  89. mytext = self.getNodeText(languageNodes[i],
  90. getXMLspace(self.xmlelement,
  91. self._default_xml_space))
  92. othertext = other.getNodeText(otherlanguageNodes[i],
  93. getXMLspace(self.xmlelement,
  94. self._default_xml_space))
  95. if mytext != othertext:
  96. #TODO:^ maybe we want to take children and notes into account
  97. return False
  98. return True
  99. def namespaced(self, name):
  100. """Returns name in Clark notation.
  101. For example namespaced("source") in an XLIFF document might return::
  102. {urn:oasis:names:tc:xliff:document:1.1}source
  103. This is needed throughout lxml.
  104. """
  105. return namespaced(self.namespace, name)
  106. def set_source_dom(self, dom_node):
  107. languageNodes = self.getlanguageNodes()
  108. if len(languageNodes) > 0:
  109. self.xmlelement.replace(languageNodes[0], dom_node)
  110. else:
  111. self.xmlelement.append(dom_node)
  112. def get_source_dom(self):
  113. return self.getlanguageNode(lang=None, index=0)
  114. source_dom = property(get_source_dom, set_source_dom)
  115. def setsource(self, text, sourcelang='en'):
  116. if self._rich_source is not None:
  117. self._rich_source = None
  118. text = data.forceunicode(text)
  119. self.source_dom = self.createlanguageNode(sourcelang, text, "source")
  120. def getsource(self):
  121. return self.getNodeText(self.source_dom,
  122. getXMLspace(self.xmlelement,
  123. self._default_xml_space))
  124. source = property(getsource, setsource)
  125. def set_target_dom(self, dom_node, append=False):
  126. languageNodes = self.getlanguageNodes()
  127. assert len(languageNodes) > 0
  128. if dom_node is not None:
  129. if append or len(languageNodes) == 0:
  130. self.xmlelement.append(dom_node)
  131. else:
  132. self.xmlelement.insert(1, dom_node)
  133. if not append and len(languageNodes) > 1:
  134. self.xmlelement.remove(languageNodes[1])
  135. def get_target_dom(self, lang=None):
  136. if lang:
  137. return self.getlanguageNode(lang=lang)
  138. else:
  139. return self.getlanguageNode(lang=None, index=1)
  140. target_dom = property(get_target_dom)
  141. def settarget(self, text, lang='xx', append=False):
  142. """Sets the "target" string (second language), or alternatively
  143. appends to the list"""
  144. #XXX: we really need the language - can't really be optional, and we
  145. # need to propagate it
  146. if self._rich_target is not None:
  147. self._rich_target = None
  148. text = data.forceunicode(text)
  149. # Firstly deal with reinitialising to None or setting to identical
  150. # string
  151. if self.gettarget() == text:
  152. return
  153. languageNode = self.get_target_dom(None)
  154. if not text is None:
  155. if languageNode is None:
  156. languageNode = self.createlanguageNode(lang, text, "target")
  157. self.set_target_dom(languageNode, append)
  158. else:
  159. if self.textNode:
  160. terms = languageNode.iter(self.namespaced(self.textNode))
  161. try:
  162. languageNode = terms.next()
  163. except StopIteration, e:
  164. pass
  165. languageNode.text = text
  166. else:
  167. self.set_target_dom(None, False)
  168. def gettarget(self, lang=None):
  169. """retrieves the "target" text (second entry), or the entry in the
  170. specified language, if it exists"""
  171. return self.getNodeText(self.get_target_dom(lang),
  172. getXMLspace(self.xmlelement,
  173. self._default_xml_space))
  174. target = property(gettarget, settarget)
  175. def createlanguageNode(self, lang, text, purpose=None):
  176. """Returns a xml Element setup with given parameters to represent a
  177. single language entry. Has to be overridden."""
  178. return None
  179. def createPHnodes(self, parent, text):
  180. """Create the text node in parent containing all the ph tags"""
  181. matches = _getPhMatches(text)
  182. if not matches:
  183. parent.text = text
  184. return
  185. # Now we know there will definitely be some ph tags
  186. start = matches[0].start()
  187. pretext = text[:start]
  188. if pretext:
  189. parent.text = pretext
  190. lasttag = parent
  191. for i, m in enumerate(matches):
  192. #pretext
  193. pretext = text[start:m.start()]
  194. # this will never happen with the first ph tag
  195. if pretext:
  196. lasttag.tail = pretext
  197. #ph node
  198. phnode = etree.SubElement(parent, self.namespaced("ph"))
  199. phnode.set("id", str(i+1))
  200. phnode.text = m.group()
  201. lasttag = phnode
  202. start = m.end()
  203. #post text
  204. if text[start:]:
  205. lasttag.tail = text[start:]
  206. def getlanguageNodes(self):
  207. """Returns a list of all nodes that contain per language information.
  208. """
  209. return list(self.xmlelement.iterchildren(self.namespaced(self.languageNode)))
  210. def getlanguageNode(self, lang=None, index=None):
  211. """Retrieves a languageNode either by language or by index"""
  212. if lang is None and index is None:
  213. raise KeyError("No criterea for languageNode given")
  214. languageNodes = self.getlanguageNodes()
  215. if lang:
  216. for set in languageNodes:
  217. if getXMLlang(set) == lang:
  218. return set
  219. else:#have to use index
  220. if index >= len(languageNodes):
  221. return None
  222. else:
  223. return languageNodes[index]
  224. return None
  225. def getNodeText(self, languageNode, xml_space="preserve"):
  226. """Retrieves the term from the given languageNode"""
  227. if languageNode is None:
  228. return None
  229. if self.textNode:
  230. terms = languageNode.iterdescendants(self.namespaced(self.textNode))
  231. if terms is None:
  232. return None
  233. else:
  234. return getText(terms.next(), xml_space)
  235. else:
  236. return getText(languageNode, xml_space)
  237. def __str__(self):
  238. return etree.tostring(self.xmlelement, pretty_print=True,
  239. encoding='utf-8')
  240. def _set_property(self, name, value):
  241. self.xmlelement.attrib[name] = value
  242. xid = property(lambda self: self.xmlelement.attrib[self.namespaced('xid')],
  243. lambda self, value: self._set_property(self.namespaced('xid'), value))
  244. rid = property(lambda self: self.xmlelement.attrib[self.namespaced('rid')],
  245. lambda self, value: self._set_property(self.namespaced('rid'), value))
  246. def createfromxmlElement(cls, element):
  247. term = cls(None, empty=True)
  248. term.xmlelement = element
  249. return term
  250. createfromxmlElement = classmethod(createfromxmlElement)
  251. class LISAfile(base.TranslationStore):
  252. """A class representing a file store for one of the LISA file formats."""
  253. UnitClass = LISAunit
  254. #The root node of the XML document:
  255. rootNode = ""
  256. #The root node of the content section:
  257. bodyNode = ""
  258. #The XML skeleton to use for empty construction:
  259. XMLskeleton = ""
  260. namespace = None
  261. def __init__(self, inputfile=None, sourcelanguage='en',
  262. targetlanguage=None, unitclass=None):
  263. super(LISAfile, self).__init__(unitclass=unitclass)
  264. if inputfile is not None:
  265. self.parse(inputfile)
  266. assert self.document.getroot().tag == self.namespaced(self.rootNode)
  267. else:
  268. # We strip out newlines to ensure that spaces in the skeleton
  269. # doesn't interfere with the the pretty printing of lxml
  270. self.parse(self.XMLskeleton.replace("\n", ""))
  271. self.setsourcelanguage(sourcelanguage)
  272. self.settargetlanguage(targetlanguage)
  273. self.addheader()
  274. self._encoding = "UTF-8"
  275. def addheader(self):
  276. """Method to be overridden to initialise headers, etc."""
  277. pass
  278. def namespaced(self, name):
  279. """Returns name in Clark notation.
  280. For example namespaced("source") in an XLIFF document might return::
  281. {urn:oasis:names:tc:xliff:document:1.1}source
  282. This is needed throughout lxml.
  283. """
  284. return namespaced(self.namespace, name)
  285. def initbody(self):
  286. """Initialises self.body so it never needs to be retrieved from the
  287. XML again."""
  288. self.namespace = self.document.getroot().nsmap.get(None, None)
  289. self.body = self.document.find('//%s' % self.namespaced(self.bodyNode))
  290. def addsourceunit(self, source):
  291. #TODO: miskien moet hierdie eerder addsourcestring of iets genoem word?
  292. """Adds and returns a new unit with the given string as first entry."""
  293. newunit = self.UnitClass(source)
  294. self.addunit(newunit)
  295. return newunit
  296. def addunit(self, unit, new=True):
  297. unit.namespace = self.namespace
  298. super(LISAfile, self).addunit(unit)
  299. if new:
  300. self.body.append(unit.xmlelement)
  301. def __str__(self):
  302. """Converts to a string containing the file's XML"""
  303. return etree.tostring(self.document, pretty_print=True,
  304. xml_declaration=True, encoding='utf-8')
  305. def parse(self, xml):
  306. """Populates this object from the given xml string"""
  307. if not hasattr(self, 'filename'):
  308. self.filename = getattr(xml, 'name', '')
  309. if hasattr(xml, "read"):
  310. xml.seek(0)
  311. posrc = xml.read()
  312. xml = posrc
  313. if etree.LXML_VERSION >= (2, 1, 0):
  314. #Since version 2.1.0 we can pass the strip_cdata parameter to
  315. #indicate that we don't want cdata to be converted to raw XML
  316. parser = etree.XMLParser(strip_cdata=False)
  317. else:
  318. parser = etree.XMLParser()
  319. self.document = etree.fromstring(xml, parser).getroottree()
  320. self._encoding = self.document.docinfo.encoding
  321. self.initbody()
  322. assert self.document.getroot().tag == self.namespaced(self.rootNode)
  323. for entry in self.document.getroot().iterdescendants(self.namespaced(self.UnitClass.rootNode)):
  324. term = self.UnitClass.createfromxmlElement(entry)
  325. self.addunit(term, new=False)