PageRenderTime 39ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/utils/translation.py

https://gitlab.com/gregtyka/server
Python | 287 lines | 277 code | 5 blank | 5 comment | 6 complexity | 4f441428d9a8db2191871a790d736fba MD5 | raw file
  1. '''
  2. Created on Nov 4, 2012
  3. @author: oferko
  4. '''
  5. from path3 import path
  6. import re
  7. import hashlib
  8. import urllib2
  9. import csv
  10. import itertools
  11. import sys
  12. import ast
# Repository root: this module lives one level below it (utils/translation.py).
APP_ROOT = path(__file__).parent.parent
class TranslatableFile(object):
    """Base class for a source file whose user-visible strings can be
    replaced with translations.

    The first translation run saves the pristine source next to the file
    as ``<name>.orig``; later runs re-read that copy, so translations are
    always applied to original text, never to already-translated output.

    Subclasses provide:
      * ``HEADER``             -- text prepended to the translated output
      * ``_get_files()``       -- classmethod yielding candidate file paths
      * ``iter_translatables()`` -- yields (text, (start, end), line_no, digest)
    """
    HEADER = ""  # prepended verbatim to every translated file
    def __init__(self, f):
        # Path relative to the app root; also the basis of the file's id.
        self.relative_path = APP_ROOT.relpathto(f)
        # Short md5 prefix identifying this file in the translation sheet.
        self.file_digest = hashlib.md5(self.relative_path).hexdigest()[:DIGEST_LEN]
        check_collision(self.file_digest)
        self.path = f
        self.orig = f + ".orig"
        # Prefer the saved pristine copy if a previous run created one.
        source = self.orig if self.orig.isfile() else f
        with open(source) as opened:
            self.text = opened.read()
    def __repr__(self):
        return "%s(%s - %s)" % (self.__class__.__name__, self.file_digest, self.relative_path)
    def preprocess(self, translation):
        # Hook for subclasses to massage a raw translation; identity here.
        return translation
    def translate(self, translations):
        """Rewrite the file in place with translations spliced in.

        translations -- dict mapping text digest -> translated string, or
        the sentinel string "IGNORE" to restore the pristine original.
        """
        if not self.orig.isfile():
            # First run: stash the pristine source before overwriting it.
            with open(self.orig, "w") as f:
                f.write(self.text)
            print self.relative_path, "->", self.orig
        if translations == "IGNORE":
            # Whole file is ignored: restore the original copy and stop.
            print self.relative_path, "<-", self.orig
            self.orig.copy(self.path)
            return
        translated_text = self.HEADER
        cursor = 0
        # Copy untouched text between translatable spans, substituting each
        # translated string inside its span.
        for orig_text, (st, end), line_no, text_digest in self.iter_translatables():
            translated_text += self.text[cursor:st]
            translation = self.preprocess(translations.get(text_digest))
            chunk = self.text[st:end]
            # Replace only the matched text within the chunk so surrounding
            # markup/quotes survive; chunks with no translation pass through.
            translated_text += chunk.replace(orig_text.strip(), translation) if translation is not None else chunk
            cursor = end
        translated_text += self.text[cursor:]
        with open(self.path, "w") as f:
            f.write(translated_text)
        print "%s: %s" % (self.path, len(translations))
    @classmethod
    def get_files(cls):
        # Wrap each candidate path in an instance of the concrete subclass.
        for f in cls._get_files():
            yield cls(f)
class TranslatablePython(TranslatableFile, ast.NodeVisitor):
    """Extracts translatable string literals from badge Python sources.

    Walks the module's AST collecting every non-empty string constant,
    then keeps only strings on lines that look user-visible (a ``return``
    statement or a ``self.description`` assignment).
    """
    # Declare the rewritten source as utf8 so unicode literals survive.
    HEADER = "# coding=utf8\n\n"
    @classmethod
    def _get_files(cls):
        for tmpl in (APP_ROOT + "/badges").walkfiles("*badge*.py"):
            yield tmpl
    def iter_translatables(self):
        """Yield (text, (start, end), line_no, digest) for each candidate string."""
        self.lines = self.text.splitlines()
        # line_breaks[lineno] is the absolute offset just before that line's
        # text (1-based linenos; the [None, 0] pad covers indexes 0 and 1).
        self.line_breaks = [None,0] + [m.start() for m in RE_LINEBREAK.finditer(self.text+"\n")]
        self.strings = []
        tree = ast.parse(self.text, self.relative_path)
        self.visit(tree)  # populates self.strings via visit_Str
        for line_no, span, string in self.strings:
            line = self.lines[line_no-1]
            # Grab from the opening quote onward (find() - 1 backs up onto it).
            nice_text = line[line.find(string)-1:].strip()
            if len(set(nice_text.replace(string, ""))) == 1:
                # there's nothing but quotes around our string
                nice_text = string
            line = line.strip()
            if not (line.startswith("return") or "self.description" in line):
                continue  # only user-facing strings are worth translating
            digest = hashlib.md5(nice_text).hexdigest()[:DIGEST_LEN]
            yield nice_text, span, line_no, digest
    def preprocess(self, translation):
        # Make the translation splicable into Python source: either prefix
        # an already-quoted translation with ``u``, or close the original
        # literal and concatenate a new unicode literal after it.
        if not translation:
            return
        return ("u" + translation
                if translation.startswith("\"")
                else ('"+u"%s' % translation)
                )
    def visit_Str(self, node):
        # AST callback: record every non-empty string with its absolute span
        # (start of the literal through the end of its source line).
        if node.s:
            st = self.line_breaks[node.lineno] + node.col_offset
            end = self.line_breaks[node.lineno+1]
            self.strings.append((node.lineno, (st, end), node.s))
# Tags whose contents are never user-visible text ("[document]" is the
# BeautifulSoup-style root name).
NON_TEXT = set("[document] script style".split())
# Matches template expressions/tags ({{ ... }} and {% ... %}) so they can be
# stripped before deciding whether a span contains real text.
RE_TEMPLATE = re.compile("({{.*?}}|{%.*?%})")
from cgi import escape
  92. def simple_escape(match):
  93. s = match.group(0)
  94. e = escape(s)
  95. return e
from HTMLParser import HTMLParser
htmlparser = HTMLParser()  # used only for its unescape() entity decoder
# One HTML tag -- groups: closing slash, tag name (or "!"), attribute blob,
# self-closing slash -- OR a bare newline, so a single finditer pass can
# both tokenize tags and track line numbers.
RE_TAGS = re.compile("<(/?)(!|\w+)(.*?)(/?)>|\n", flags=re.DOTALL)
# Per-tag patterns for attribute values that carry user-visible text.
RE_ATTR_TAGS = {'input' : re.compile("""(value|placeholder)=(?P<qt>\"|')(.+?)(?P=qt)"""),
                'textarea' : re.compile("""(value|placeholder)=(?P<qt>\"|')(.+?)(?P=qt)"""),
                'a' : re.compile("""(title)=(?P<qt>\"|')(.+?)(?P=qt)"""),
                'img' : re.compile("""(title)=(?P<qt>\"|')(.+?)(?P=qt)"""),
                }
RE_LINEBREAK = re.compile("\n")
  105. class TranslatableHTML(TranslatableFile):
  106. @classmethod
  107. def _get_files(cls):
  108. for tmpl in (APP_ROOT + "/templates").walkfiles("*.html"):
  109. yield tmpl
  110. yield APP_ROOT + "/khan-exercises/exercises/khan-exercise.html"
  111. for tmpl in (APP_ROOT + "/clienttemplates").walkfiles("*.handlebars"):
  112. yield tmpl
  113. def iter_translatables(self):
  114. html = self.text
  115. def iterator():
  116. line_breaks = [m.start() for m in RE_LINEBREAK.finditer(html+"\n")]
  117. line_no = 1
  118. cursor = 0
  119. current_tag = None
  120. matches = RE_TAGS.finditer(html)
  121. for match in matches:
  122. ended, tag, attrs, end = match.groups()
  123. tag_start, tag_end = match.span()
  124. if tag and not ended:
  125. if not end:
  126. current_tag = tag
  127. regex = RE_ATTR_TAGS.get(tag)
  128. if regex:
  129. attrs_start = tag_start + html[tag_start:tag_end].find(attrs)
  130. for match in regex.finditer(attrs):
  131. name, qt, value = match.groups()
  132. attr_start, attr_end = match.start(), match.end()
  133. while attrs_start+attr_start > line_breaks[line_no-1]:
  134. line_no += 1
  135. if value == "Ask a question about this video":
  136. pass
  137. yield value, (attrs_start+attr_start, attrs_start+attr_end), line_no
  138. if current_tag in NON_TEXT:
  139. for match in matches:
  140. ended, tag, _, _ = match.groups()
  141. if tag == current_tag:
  142. assert ended, match.group(0)
  143. tag_end = match.end()
  144. current_tag = None
  145. break
  146. else:
  147. raise Exception("Could not close %s" % current_tag)
  148. else:
  149. while cursor > line_breaks[line_no-1]:
  150. line_no += 1
  151. yield html[cursor:tag_start], (cursor, tag_start), line_no
  152. cursor = tag_end
  153. tag_start = len(html)
  154. while cursor > line_breaks[line_no-1]:
  155. line_no += 1
  156. yield html[cursor:tag_start], (cursor, tag_start), line_no
  157. for text, span, line_no in iterator():
  158. if text:
  159. nice_text = text.strip()
  160. if not is_translateable(nice_text):
  161. continue
  162. digest = hashlib.md5(nice_text).hexdigest()[:DIGEST_LEN]
  163. yield nice_text, span, line_no, digest
  164. def is_translateable(nice_text):
  165. if not nice_text:
  166. return
  167. if nice_text.lower() == "x":
  168. return
  169. if "TODO" in nice_text:
  170. return
  171. cleaned = htmlparser.unescape(RE_TEMPLATE.sub("", nice_text)).strip()
  172. if not any(s.isalpha() for s in cleaned):
  173. return
  174. return True
# Length of the md5 prefix used to identify files and strings in the sheet.
DIGEST_LEN = 6
  176. def check_collision(digest, _digests=set()):
  177. if digest in _digests:
  178. raise Exception("Collision!")
  179. _digests.add(digest)
  180. def iter_files():
  181. return itertools.chain(
  182. TranslatableHTML.get_files(),
  183. TranslatablePython.get_files(),
  184. )
def extract_for_translation():
    """Dump every translatable string to extracted_text3.tab (tab-separated).

    Existing translations are merged in from the online sheet (all=True keeps
    even untranslated/ignored rows), and brand-new strings are echoed to
    stdout.  Leading apostrophes force spreadsheet cells to be read as text.
    """
    data = get_translated_data(all=True)
    # Monotonic row counter, rendered as a string for the "id" column.
    counter = lambda counter=itertools.count(1): str(next(counter))
    new_items = []
    with open("extracted_text3.tab","w") as output:
        print >>output, "\t".join("id,file digest,line no,text digest,text,translation".split(","))
        for f in iter_files():
            translations = data.get(f.file_digest) or {}
            if translations == "IGNORE":
                # File-level ignore: emit one marker row plus a spacer row.
                print "SKIPPING:", f
                print >>output, "%s\t'%s\t%s\t%s\t%s\tIGNORE" % (counter(), f.file_digest, "", "", f.relative_path)
                print >>output, counter()
                continue
            lines = []
            for text, _, line_no, text_digest in f.iter_translatables():
                translation = translations.get(text_digest, "")
                if translation == "NOOP":
                    # NOOP means "known, deliberately left untranslated".
                    translation = ""
                lines.append(("'"+f.file_digest, "#%s" % line_no, "'"+text_digest, "'%s" % text, "'%s" % translation))
                if text_digest not in translations:
                    new_items.append(lines[-1])
            if lines:
                # File header row, one row per string, then a spacer row.
                print >>output, "\t".join((counter(), "'" + f.file_digest, "", "", f.relative_path))
                for line in lines:
                    print >>output, "\t".join((counter(),) + line)
                print >>output, counter()
                print "\t", len(lines)
    # Echo strings that were not yet in the sheet, for copy/paste.
    for line in new_items:
        print "\t".join((counter(),) + line)
# Published CSV export of the Google spreadsheet holding the translations.
TRANSLATION_URL = "https://docs.google.com/spreadsheet/pub?key=0Ap8djBdeiIG7dDF6VDJPbEpFSG5SNWtwOFVrU3Y5Qnc&single=true&gid=5&output=csv"
def get_translated_data(all=False):
    """Download the translation sheet; return {file_digest: translations}.

    A value is either the sentinel string "IGNORE" (skip the whole file) or
    a dict mapping text digest -> translation, where an empty sheet cell
    becomes "NOOP" (only kept when all=True) and "BLANK" becomes "".
    Expects 6 columns per row: id, file digest, tag, text digest, original,
    translation.
    """
    lines = csv.reader(urllib2.urlopen(TRANSLATION_URL))
    _header = next(lines)
    data = {}
    c = 0
    for i, file_digest, _tag, text_digest, _orig, trans in lines:
        c+=1
        if file_digest and not text_digest and trans=="IGNORE": # ignore entire file
            data[file_digest] = "IGNORE"
        if data.get(file_digest) == "IGNORE":
            # File-level IGNORE wins over any per-string rows.
            pass
        elif text_digest:
            if all or (trans and trans!="IGNORE"):
                data.setdefault(file_digest,{})[text_digest] = "NOOP" if not trans else "" if trans == "BLANK" else trans
        if i == "Version":
            # Special sheet row carrying a version marker in the digest column.
            print "VERSION =", file_digest
        elif i and int(i) % 100 == 99:
            print ".",  # progress dot roughly every 100 rows
    print "Read %s lines" % c
    return data
  235. def import_translation():
  236. data = get_translated_data()
  237. for f in iter_files():
  238. translations = data.get(f.file_digest)
  239. if not translations:
  240. continue
  241. f.translate(translations)