PageRenderTime 54ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/jars-src/jython2.2.1/src/org/python/core/codecs.java

https://bitbucket.org/wrapman/frostwire.desktop.translations.pirate
Java | 541 lines | 405 code | 83 blank | 53 comment | 126 complexity | d6a60e29e95e5d10b553254a096400f8 MD5 | raw file
Possible License(s): CC-BY-SA-3.0, LGPL-2.1, BSD-3-Clause, Apache-2.0, MPL-2.0-no-copyleft-exception
  1. /*
  2. * Copyright 2000 Finn Bock
  3. *
  4. * This program contains material copyrighted by:
  5. * Copyright (c) Corporation for National Research Initiatives.
  6. * Originally written by Marc-Andre Lemburg (mal@lemburg.com).
  7. */
  8. package org.python.core;
  9. /**
  10. * Contains the implementation of the builtin codecs.
  11. * @since Jython 2.0
  12. */
  13. public class codecs {
  14. private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
  15. private static PyList searchPath = new PyList();
  16. private static PyStringMap searchCache = new PyStringMap();
  17. private static String default_encoding = "ascii";
  18. public static String getDefaultEncoding() {
  19. return default_encoding;
  20. }
  21. public static void setDefaultEncoding(String encoding) {
  22. lookup(encoding);
  23. default_encoding = encoding;
  24. }
  25. public static void register(PyObject search_function) {
  26. if (!search_function.isCallable()) {
  27. throw Py.TypeError("argument must be callable");
  28. }
  29. searchPath.append(search_function);
  30. }
  31. public static PyTuple lookup(String encoding) {
  32. import_encodings();
  33. PyString v = new PyString(normalizestring(encoding));
  34. PyObject result = searchCache.__finditem__(v);
  35. if (result != null) {
  36. return (PyTuple)result;
  37. }
  38. if (searchPath.__len__() == 0) {
  39. throw new PyException(Py.LookupError,
  40. "no codec search functions registered: " +
  41. "can't find encoding");
  42. }
  43. PyObject iter = searchPath.__iter__();
  44. PyObject func = null;
  45. while ((func = iter.__iternext__()) != null) {
  46. result = func.__call__(v);
  47. if (result == Py.None) {
  48. continue;
  49. }
  50. if (!(result instanceof PyTuple) || result.__len__() != 4) {
  51. throw Py.TypeError("codec search functions must "+
  52. "return 4-tuples");
  53. }
  54. break;
  55. }
  56. if (func == null) {
  57. throw new PyException(Py.LookupError, "unknown encoding " +
  58. encoding);
  59. }
  60. searchCache.__setitem__(v, result);
  61. return (PyTuple)result;
  62. }
  63. private static String normalizestring(String string) {
  64. return string.toLowerCase().replace(' ', '-');
  65. }
  66. private static boolean import_encodings_called = false;
  67. private static void import_encodings() {
  68. if (!import_encodings_called) {
  69. import_encodings_called = true;
  70. try {
  71. __builtin__.__import__("encodings");
  72. } catch (PyException exc) {
  73. if (exc.type != Py.ImportError) {
  74. throw exc;
  75. }
  76. }
  77. }
  78. }
  79. public static String decode(PyString v, String encoding,
  80. String errors)
  81. {
  82. if (encoding == null) {
  83. encoding = getDefaultEncoding();
  84. } else {
  85. encoding = normalizestring(encoding);
  86. }
  87. if (errors != null) {
  88. errors = errors.intern();
  89. }
  90. /* Shortcuts for common default encodings */
  91. /*
  92. if (encoding.equals("utf-8"))
  93. return utf_8_decode(v, errors).__getitem__(0).__str__();
  94. else if (encoding.equals("latin-1"))
  95. ; //return PyUnicode_DecodeLatin1(s, size, errors);
  96. else if (encoding.equals("ascii"))
  97. ; //return PyUnicode_DecodeASCII(s, size, errors);
  98. */
  99. if (encoding.equals("ascii")) {
  100. return PyUnicode_DecodeASCII(v.toString(),
  101. v.__len__(), errors);
  102. }
  103. /* Decode via the codec registry */
  104. PyObject decoder = getDecoder(encoding);
  105. PyObject result = null;
  106. if (errors != null) {
  107. result = decoder.__call__(v, new PyString(errors));
  108. } else {
  109. result = decoder.__call__(v);
  110. }
  111. if (!(result instanceof PyTuple) || result.__len__() != 2) {
  112. throw Py.TypeError("decoder must return a tuple " +
  113. "(object,integer)");
  114. }
  115. return result.__getitem__(0).toString();
  116. }
  117. private static PyObject getDecoder(String encoding) {
  118. PyObject codecs = lookup(encoding);
  119. return codecs.__getitem__(1);
  120. }
  121. public static String encode(PyString v, String encoding,
  122. String errors)
  123. {
  124. if (encoding == null) {
  125. encoding = getDefaultEncoding();
  126. } else {
  127. encoding = normalizestring(encoding);
  128. }
  129. if (errors != null) {
  130. errors = errors.intern();
  131. }
  132. /* Shortcuts for common default encodings */
  133. /*
  134. if (encoding.equals("utf-8"))
  135. return PyUnicode_DecodeUTF8(v.toString(), v.__len__(), errors);
  136. else if (encoding.equals("latin-1"))
  137. return PyUnicode_DecodeLatin1(v.toString(), v.__len__(), errors);
  138. else
  139. */
  140. if (encoding.equals("ascii")) {
  141. return PyUnicode_EncodeASCII(v.toString(),
  142. v.__len__(), errors);
  143. }
  144. /* Decode via the codec registry */
  145. PyObject encoder = getEncoder(encoding);
  146. PyObject result = null;
  147. if (errors != null) {
  148. result = encoder.__call__(v, new PyString(errors));
  149. } else {
  150. result = encoder.__call__(v);
  151. }
  152. if (!(result instanceof PyTuple) || result.__len__() != 2) {
  153. throw Py.TypeError("encoder must return a tuple " +
  154. "(object,integer)");
  155. }
  156. return result.__getitem__(0).toString();
  157. }
  158. private static PyObject getEncoder(String encoding) {
  159. PyObject codecs = lookup(encoding);
  160. return codecs.__getitem__(0);
  161. }
  162. /* --- UTF-8 Codec ---------------------------------------------------- */
  163. private static byte utf8_code_length[] = {
  164. /* Map UTF-8 encoded prefix byte to sequence length. zero means
  165. illegal prefix. see RFC 2279 for details */
  166. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  167. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  168. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  169. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  170. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  171. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  172. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  173. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  174. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  175. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  176. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  177. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  178. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  179. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  180. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  181. 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
  182. };
  183. public static String PyUnicode_DecodeUTF8(String str, String errors) {
  184. int size = str.length();
  185. StringBuffer unicode = new StringBuffer(size);
  186. /* Unpack UTF-8 encoded data */
  187. for (int i = 0; i < size; ) {
  188. int ch = str.charAt(i);
  189. if (ch > 0xFF) {
  190. codecs.decoding_error("utf-8", unicode, errors,
  191. "ordinal not in range(255)");
  192. i++;
  193. continue;
  194. }
  195. if (ch < 0x80) {
  196. unicode.append((char) ch);
  197. i++;
  198. continue;
  199. }
  200. int n = utf8_code_length[ch];
  201. if (i + n > size) {
  202. codecs.decoding_error("utf-8", unicode, errors,
  203. "unexpected end of data");
  204. i++;
  205. continue;
  206. }
  207. switch (n) {
  208. case 0:
  209. codecs.decoding_error("utf-8", unicode, errors,
  210. "unexpected code byte");
  211. i++;
  212. continue;
  213. case 1:
  214. codecs.decoding_error("utf-8", unicode, errors,
  215. "internal error");
  216. i++;
  217. continue;
  218. case 2:
  219. char ch1 = str.charAt(i+1);
  220. if ((ch1 & 0xc0) != 0x80) {
  221. codecs.decoding_error("utf-8", unicode, errors,
  222. "invalid data");
  223. i++;
  224. continue;
  225. }
  226. ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f);
  227. if (ch < 0x80) {
  228. codecs.decoding_error("utf-8", unicode, errors,
  229. "illegal encoding");
  230. i++;
  231. continue;
  232. } else
  233. unicode.append((char) ch);
  234. break;
  235. case 3:
  236. ch1 = str.charAt(i+1);
  237. char ch2 = str.charAt(i+2);
  238. if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) {
  239. codecs.decoding_error("utf-8", unicode, errors,
  240. "invalid data");
  241. i++;
  242. continue;
  243. }
  244. ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f);
  245. if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
  246. codecs.decoding_error("utf-8", unicode, errors,
  247. "illegal encoding");
  248. i++;
  249. continue;
  250. } else
  251. unicode.append((char) ch);
  252. break;
  253. case 4:
  254. ch1 = str.charAt(i+1);
  255. ch2 = str.charAt(i+2);
  256. char ch3 = str.charAt(i+3);
  257. if ((ch1 & 0xc0) != 0x80 ||
  258. (ch2 & 0xc0) != 0x80 ||
  259. (ch3 & 0xc0) != 0x80) {
  260. codecs.decoding_error("utf-8", unicode, errors,
  261. "invalid data");
  262. i++;
  263. continue;
  264. }
  265. ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) +
  266. ((ch2 & 0x3f) << 6) + (ch3 & 0x3f);
  267. /* validate and convert to UTF-16 */
  268. if ((ch < 0x10000) || /* minimum value allowed for 4
  269. byte encoding */
  270. (ch > 0x10ffff)) { /* maximum value allowed for
  271. UTF-16 */
  272. codecs.decoding_error("utf-8", unicode, errors,
  273. "illegal encoding");
  274. i++;
  275. continue;
  276. }
  277. /* compute and append the two surrogates: */
  278. /* translate from 10000..10FFFF to 0..FFFF */
  279. ch -= 0x10000;
  280. /* high surrogate = top 10 bits added to D800 */
  281. unicode.append((char) (0xD800 + (ch >> 10)));
  282. /* low surrogate = bottom 10 bits added to DC00 */
  283. unicode.append((char) (0xDC00 + (ch & ~0xFC00)));
  284. break;
  285. default:
  286. /* Other sizes are only needed for UCS-4 */
  287. codecs.decoding_error("utf-8", unicode, errors,
  288. "unsupported Unicode code range");
  289. i++;
  290. }
  291. i += n;
  292. }
  293. return unicode.toString();
  294. }
  295. public static String PyUnicode_EncodeUTF8(String str, String errors) {
  296. int size = str.length();
  297. StringBuffer v = new StringBuffer(size * 3);
  298. for (int i = 0; i < size; ) {
  299. int ch = str.charAt(i++);
  300. if (ch < 0x80) {
  301. v.append((char) ch);
  302. } else if (ch < 0x0800) {
  303. v.append((char) (0xc0 | (ch >> 6)));
  304. v.append((char) (0x80 | (ch & 0x3f)));
  305. } else {
  306. if (0xD800 <= ch && ch <= 0xDFFF) {
  307. if (i != size) {
  308. int ch2 = str.charAt(i);
  309. if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
  310. /* combine the two values */
  311. ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
  312. v.append((char)((ch >> 18) | 0xf0));
  313. v.append((char)(0x80 | ((ch >> 12) & 0x3f)));
  314. i++;
  315. }
  316. }
  317. } else {
  318. v.append((char)(0xe0 | (ch >> 12)));
  319. }
  320. v.append((char) (0x80 | ((ch >> 6) & 0x3f)));
  321. v.append((char) (0x80 | (ch & 0x3f)));
  322. }
  323. }
  324. return v.toString();
  325. }
  326. /* --- 7-bit ASCII Codec -------------------------------------------- */
  327. public static String PyUnicode_DecodeASCII(String str, int size,
  328. String errors)
  329. {
  330. StringBuffer v = new StringBuffer(size);
  331. for (int i = 0; i < size; i++) {
  332. char ch = str.charAt(i);
  333. if (ch < 128) {
  334. v.append(ch);
  335. } else {
  336. decoding_error("ascii", v, errors,
  337. "ordinal not in range(128)");
  338. continue;
  339. }
  340. }
  341. return v.toString();
  342. }
  343. public static String PyUnicode_EncodeASCII(String str, int size,
  344. String errors)
  345. {
  346. StringBuffer v = new StringBuffer(size);
  347. for (int i = 0; i < size; i++) {
  348. char ch = str.charAt(i);
  349. if (ch >= 128) {
  350. encoding_error("ascii", v, errors,
  351. "ordinal not in range(128)");
  352. } else {
  353. v.append(ch);
  354. }
  355. }
  356. return v.toString();
  357. }
  358. /* --- RawUnicodeEscape Codec ---------------------------------------- */
  359. private static char[] hexdigit = "0123456789ABCDEF".toCharArray();
  360. // The modified flag is used by cPickle.
  361. public static String PyUnicode_EncodeRawUnicodeEscape(String str,
  362. String errors,
  363. boolean modifed)
  364. {
  365. int size = str.length();
  366. StringBuffer v = new StringBuffer(str.length());
  367. for (int i = 0; i < size; i++) {
  368. char ch = str.charAt(i);
  369. if (ch >= 256 || (modifed && (ch == '\n' || ch == '\\'))) {
  370. v.append("\\u");
  371. v.append(hexdigit[(ch >>> 12) & 0xF]);
  372. v.append(hexdigit[(ch >>> 8) & 0xF]);
  373. v.append(hexdigit[(ch >>> 4) & 0xF]);
  374. v.append(hexdigit[ch & 0xF]);
  375. } else {
  376. v.append(ch);
  377. }
  378. }
  379. return v.toString();
  380. }
  381. public static String PyUnicode_DecodeRawUnicodeEscape(String str,
  382. String errors)
  383. {
  384. int size = str.length();
  385. StringBuffer v = new StringBuffer(size);
  386. for (int i = 0; i < size; ) {
  387. char ch = str.charAt(i);
  388. /* Non-escape characters are interpreted as Unicode ordinals */
  389. if (ch != '\\') {
  390. v.append(ch);
  391. i++;
  392. continue;
  393. }
  394. /* \\u-escapes are only interpreted iff the number of leading
  395. backslashes is odd */
  396. int bs = i;
  397. while (i < size) {
  398. ch = str.charAt(i);
  399. if (ch != '\\')
  400. break;
  401. v.append(ch);
  402. i++;
  403. }
  404. if (((i - bs) & 1) == 0 || i >= size || ch != 'u') {
  405. continue;
  406. }
  407. v.setLength(v.length() - 1);
  408. i++;
  409. /* \\uXXXX with 4 hex digits */
  410. int x = 0;
  411. for (int j = 0; j < 4; j++) {
  412. ch = str.charAt(i+j);
  413. int d = Character.digit(ch, 16);
  414. if (d == -1) {
  415. codecs.decoding_error("unicode escape", v, errors,
  416. "truncated \\uXXXX");
  417. break;
  418. }
  419. x = ((x<<4) & ~0xF) + d;
  420. }
  421. i += 4;
  422. v.append((char) x);
  423. }
  424. return v.toString();
  425. }
  426. /* --- Utility methods -------------------------------------------- */
  427. public static void encoding_error(String type, StringBuffer dest,
  428. String errors, String details)
  429. {
  430. if (errors == null || errors == "strict") {
  431. throw Py.UnicodeError(type + " encoding error: " + details);
  432. } else if (errors == "ignore") {
  433. //ignore
  434. } else if (errors == "replace") {
  435. dest.append('?');
  436. } else {
  437. throw Py.ValueError(type + " encoding error; "+
  438. "unknown error handling code: " + errors);
  439. }
  440. }
  441. public static void decoding_error(String type, StringBuffer dest,
  442. String errors, String details)
  443. {
  444. if (errors == null || errors == "strict") {
  445. throw Py.UnicodeError(type + " decoding error: " + details);
  446. }
  447. else if (errors == "ignore") {
  448. //ignore
  449. } else if (errors == "replace") {
  450. if (dest != null) {
  451. dest.append(Py_UNICODE_REPLACEMENT_CHARACTER);
  452. }
  453. } else {
  454. throw Py.ValueError(type + " decoding error; "+
  455. "unknown error handling code: " + errors);
  456. }
  457. }
  458. }