PageRenderTime 53ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/source/src/Data/String/converter.cpp

http://itexmacs.googlecode.com/
C++ | 673 lines | 516 code | 47 blank | 110 comment | 170 complexity | e2b0a30a2b2691fa1980452e1f40b98c MD5 | raw file
Possible License(s): GPL-3.0, GPL-2.0, MPL-2.0-no-copyleft-exception, LGPL-2.0
  1. /******************************************************************************
  2. * MODULE : converter.cpp
  3. * DESCRIPTION: Applies dictionaries to strings in an efficient manner.
  4. * COPYRIGHT : (C) 2002 Felix Breuer
  5. *******************************************************************************
  6. * This software falls under the GNU general public license version 3 or later.
  7. * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
  8. * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
  9. ******************************************************************************/
  10. #include "converter.hpp"
  11. #include "convert.hpp"
  12. #ifdef USE_ICONV
  13. #include <iconv.h>
  14. #endif
  15. #include <errno.h>
  16. RESOURCE_CODE (converter);
  17. /******************************************************************************
  18. * converter methods
  19. ******************************************************************************/
  20. void
  21. operator << (converter c, string str) {
  22. int index = 0;
  23. while (index < N(str))
  24. c->match(str, index);
  25. }
  26. string
  27. apply (converter c, string str) {
  28. c->output = string();
  29. c << str;
  30. return flush(c);
  31. }
  32. string
  33. flush (converter c) {
  34. string result = c->output;
  35. c->output = string();
  36. return result;
  37. }
  38. /******************************************************************************
  39. * method for loading converters
  40. ******************************************************************************/
  41. converter
  42. load_converter (string from, string to) {
  43. string name= from * "-" * to;
  44. if (converter::instances -> contains (name))
  45. return converter (name);
  46. converter conv = tm_new<converter_rep> (from, to);
  47. return conv;
  48. }
  49. /******************************************************************************
  50. * converter_rep methods
  51. ******************************************************************************/
  52. inline bool
  53. converter_rep::has_value(hashtree<char,string> node) {
  54. return node->label != nil_string;
  55. }
  56. inline void
  57. converter_rep::match (string& str, int& index) {
  58. int forward = index;
  59. int last_match = -1;
  60. string value("");
  61. bool done = false;
  62. hashtree<char,string> node = ht;
  63. //cout << "[";
  64. while (!done && forward < N(str)) {
  65. if (node->contains (str[forward])) {
  66. node = node(str[forward]);
  67. //printf("->%x",str[forward]);
  68. if (has_value(node)) {
  69. last_match = forward;
  70. value = node->label;
  71. }
  72. forward++;
  73. }
  74. else done = true;
  75. }
  76. if (last_match==-1) {
  77. if (copy_unmatched)
  78. output << string(str[index]);
  79. index++;
  80. }
  81. else {
  82. //printf(":");for(int i = 0; i < N(value);i++) printf("%x ",value[i]);
  83. output << value;
  84. index = last_match + 1;
  85. }
  86. //cout << "]";
  87. }
  88. void
  89. converter_rep::load () {
  90. // to handle each case individually seems unelegant, but there is simply more
  91. // to be done here than just loading a file.
  92. // cout << "TeXmacs] load converter " << from << " -> " << to << "\n";
  93. if ( from=="Cork" && to=="UTF-8" ) {
  94. hashtree<char,string> dic;
  95. hashtree_from_dictionary (dic,"corktounicode", BIT2BIT, UTF8, false);
  96. hashtree_from_dictionary (dic,"cork-unicode-oneway", BIT2BIT, UTF8, false);
  97. hashtree_from_dictionary (dic,"tmuniversaltounicode", BIT2BIT, UTF8, false);
  98. hashtree_from_dictionary (dic,"symbol-unicode-oneway", BIT2BIT, UTF8, false);
  99. hashtree_from_dictionary (dic,"symbol-unicode-math", BIT2BIT, UTF8, false);
  100. ht = dic;
  101. }
  102. else if ( from=="UTF-8" && to=="Cork") {
  103. hashtree<char,string> dic;
  104. hashtree_from_dictionary (dic,"corktounicode", UTF8, BIT2BIT, true);
  105. hashtree_from_dictionary (dic,"unicode-cork-oneway", UTF8, BIT2BIT, false);
  106. hashtree_from_dictionary (dic,"tmuniversaltounicode", UTF8, BIT2BIT, true);
  107. hashtree_from_dictionary (dic,"unicode-symbol-oneway", UTF8, BIT2BIT, true);
  108. ht = dic;
  109. }
  110. else if ( from=="UTF-8" && to=="HTML") {
  111. hashtree<char,string> dic;
  112. hashtree_from_dictionary (dic, "HTMLlat1" , CHAR_ENTITY, ENTITY_NAME, true);
  113. hashtree_from_dictionary (dic, "HTMLspecial", CHAR_ENTITY, ENTITY_NAME, true);
  114. hashtree_from_dictionary (dic, "HTMLsymbol" , CHAR_ENTITY, ENTITY_NAME, true);
  115. ht = dic;
  116. } else if ( from=="T2A" && to=="UTF-8" ) {
  117. hashtree<char,string> dic;
  118. hashtree_from_dictionary (dic,"corktounicode", BIT2BIT, UTF8, false);
  119. hashtree_from_dictionary (dic,"cork-unicode-oneway", BIT2BIT, UTF8, false);
  120. hashtree_from_dictionary (dic,"tmuniversaltounicode", BIT2BIT, UTF8, false);
  121. hashtree_from_dictionary (dic,"symbol-unicode-oneway", BIT2BIT, UTF8, false);
  122. hashtree_from_dictionary (dic,"symbol-unicode-math", BIT2BIT, UTF8, false);
  123. hashtree_from_dictionary (dic,"t2atounicode", BIT2BIT, UTF8, false);
  124. ht = dic;
  125. }
  126. }
  127. /******************************************************************************
  128. * convenience functions
  129. ******************************************************************************/
  130. bool
  131. check_encoding (string input, string encoding) {
  132. if (encoding == "Cork") return true;
  133. else return check_using_iconv (input, encoding);
  134. }
  135. string
  136. convert (string input, string from, string to) {
  137. if (from == "Cork")
  138. return convert_from_cork (input, to);
  139. else if (to == "Cork")
  140. return convert_to_cork (input,from);
  141. else
  142. return convert_using_iconv (input, from, to);
  143. }
  144. string
  145. convert_to_cork (string input, string from) {
  146. string str;
  147. if (from != "UTF-8")
  148. str = convert_using_iconv (input, from, "UTF-8");
  149. return utf8_to_cork (str);
  150. }
  151. string
  152. convert_from_cork (string input, string to) {
  153. string str = cork_to_utf8 (input);
  154. if (to != "UTF-8")
  155. str = convert_using_iconv (str, "UTF-8", to);
  156. return str;
  157. }
  158. string
  159. utf8_to_cork (string input) {
  160. converter conv= load_converter ("UTF-8", "Cork");
  161. int start, i, n= N(input);
  162. string output;
  163. for (i=0; i<n; ) {
  164. start= i;
  165. unsigned int code= decode_from_utf8 (input, i);
  166. string s= input (start, i);
  167. string r= apply (conv, s);
  168. if (r == s && code >= 256)
  169. r= "<#" * as_hexadecimal (code) * ">";
  170. output << r;
  171. }
  172. return output;
  173. }
  174. string
  175. cork_to_utf8 (string input) {
  176. converter conv= load_converter ("Cork", "UTF-8");
  177. int start= 0, i, n= N(input);
  178. string r;
  179. for (i=0; i<n; i++)
  180. if (input[i] == '<' && i+1<n && input[i+1] == '#') {
  181. r << apply (conv, input (start, i));
  182. start= i= i+2;
  183. while (i<n && input[i] != '>') i++;
  184. r << encode_as_utf8 (from_hexadecimal (input (start, i)));
  185. start= i+1;
  186. }
  187. r << apply (conv, input (start, n));
  188. return r;
  189. }
  190. string
  191. t2a_to_utf8 (string input) {
  192. converter conv= load_converter ("T2A", "UTF-8");
  193. int start= 0, i, n= N(input);
  194. string r;
  195. for (i=0; i<n; i++)
  196. if (input[i] == '<' && i+1<n && input[i+1] == '#') {
  197. r << apply (conv, input (start, i));
  198. start= i= i+2;
  199. while (i<n && input[i] != '>') i++;
  200. r << encode_as_utf8 (from_hexadecimal (input (start, i)));
  201. start= i+1;
  202. }
  203. r << apply (conv, input (start, n));
  204. return r;
  205. }
  206. string
  207. utf8_to_html (string input) {
  208. converter conv = load_converter ("UTF-8", "HTML");
  209. string s = apply (conv, input);
  210. return utf8_to_hex_entities(s);
  211. }
  212. #ifdef USE_ICONV
  213. // auto_array<T> objects ensure that the contained array is deleted when the
  214. // block where it is defined is exited. No spurious delete[], no memory leak.
  215. template<class T> class auto_array {
  216. T* value;
  217. public:
  218. auto_array (T* x) : value (x) {}
  219. ~auto_array () { tm_delete_array (value ); }
  220. operator T* () const { return value; }
  221. };
  222. class iconv_converter {
  223. string from;
  224. string to;
  225. iconv_t cd;
  226. bool show_errors;
  227. bool successful;
  228. public:
  229. iconv_converter (string from, string to, bool errors=true);
  230. ~iconv_converter ();
  231. inline bool is_valid () { return cd != (iconv_t)-1; }
  232. inline bool is_successful () { return successful; }
  233. friend string apply (iconv_converter &conv, string input);
  234. };
  235. iconv_converter::iconv_converter (string from2, string to2, bool errors):
  236. from (from2), to (to2), show_errors (errors), successful (false)
  237. {
  238. auto_array<char> from_cp = as_charp (from);
  239. auto_array<char> to_cp = as_charp (to);
  240. cd = iconv_open (to_cp, from_cp);
  241. if (!is_valid() && show_errors)
  242. system_error ("Initialization of iconv from " * from *
  243. " to " * to * " failed!");
  244. successful= true;
  245. }
  246. iconv_converter::~iconv_converter () {
  247. iconv_close(cd);
  248. }
  249. // From the standard C++ library (remember, TeXmacs does _not_ use std!)
  250. template<typename T>
  251. inline size_t
  252. iconv_adaptor(size_t(*iconv_func)(iconv_t, T, size_t *, char**, size_t*),
  253. iconv_t cd, char **inbuf, size_t *inbytesleft,
  254. char **outbuf, size_t *outbytesleft) {
  255. return iconv_func (cd, (T) ((void*) inbuf), inbytesleft,
  256. outbuf, outbytesleft);
  257. }
  258. string apply (iconv_converter &conv, string input) {
  259. if (! conv.is_valid()) {
  260. conv.successful= false;
  261. return "";
  262. }
  263. string result;
  264. auto_array<char> in_cp= as_charp(input);
  265. char* in_cursor= in_cp;
  266. size_t in_left= N(input);
  267. double expansion= 1.1;
  268. size_t out_counter= 0;
  269. while (in_left > 0) {
  270. size_t out_left= max(int(in_left * expansion), 1024);
  271. auto_array<char> out_cp= tm_new_array<char> (out_left);
  272. char* out_cursor= out_cp;
  273. size_t r = iconv_adaptor(iconv, conv.cd,
  274. &in_cursor, &in_left, &out_cursor, &out_left);
  275. if(r == (size_t)-1 && errno != E2BIG) {
  276. if (conv.show_errors) {
  277. cerr << "\nConverting from " << conv.from << " to " << conv.to << "\n";
  278. system_error ("String conversion using iconv failed!");
  279. }
  280. conv.successful= false;
  281. return "";
  282. }
  283. size_t used_out= out_cursor - out_cp;
  284. result << string(out_cp, used_out);
  285. out_counter += used_out;
  286. expansion= max((double) out_counter / (in_cursor - in_cp), 1.0) + 0.1;
  287. }
  288. conv.successful= true;
  289. return result;
  290. }
  291. #endif // defined USE_ICONV
  292. bool check_using_iconv (string input, string encoding) {
  293. #ifdef USE_ICONV
  294. iconv_converter conv (encoding, encoding, false);
  295. apply (conv, input);
  296. return conv.is_successful();
  297. #else
  298. (void) input;
  299. (void) encoding;
  300. FAILED ("iconv not enabled");
  301. return false;
  302. #endif
  303. }
  304. string
  305. convert_using_iconv (string input, string from, string to) {
  306. #ifdef USE_ICONV
  307. iconv_converter conv (from, to, true);
  308. return apply (conv, input);
  309. #else
  310. (void) input;
  311. (void) from;
  312. (void) to;
  313. FAILED ("iconv not enabled");
  314. return "";
  315. #endif
  316. }
  317. /******************************************************************************
  318. * Functions for hashtree handling
  319. ******************************************************************************/
  320. void
  321. put_prefix_code (string key, string value, hashtree<char,string> tree) {
  322. if (DEBUG_STD) {
  323. hashtree<char,string> ht= find_node (key,tree);
  324. if (ht->label != "")
  325. cout << "overwriting: " << ht->label << " with " << value << '\n';
  326. }
  327. find_node (key,tree)->set_label(value);
  328. }
  329. hashtree<char,string>
  330. find_node (string key, hashtree<char,string> ht) {
  331. int i;
  332. for(i = 0; i < N(key); i++)
  333. ht = ht(key[i]);
  334. return ht;
  335. }
  336. void
  337. hashtree_from_dictionary (
  338. hashtree<char,string> dic, string file_name, escape_type key_escape,
  339. escape_type val_escape, bool reverse)
  340. {
  341. system_info ("Loading",file_name);
  342. string key_string, val_string, file;
  343. file_name = file_name * ".scm";
  344. if (load_string (url ("$TEXMACS_PATH/langs/encoding", file_name), file, false)) {
  345. system_error ("Couldn't open encoding dictionary", file_name);
  346. return;
  347. }
  348. tree t = block_to_scheme_tree (file);
  349. if (!is_tuple (t)) {
  350. system_error ("Malformed encoding dictionary", file_name);
  351. return;
  352. }
  353. for (int i=0; i<N(t); i++) {
  354. if (is_func (t[i], TUPLE, 2) &&
  355. is_atomic (t[i][0]) && is_atomic (t[i][1]))
  356. {
  357. //cout << N(pairs[i]) << "\n" << as_string(pairs[i]) << "\n";
  358. reverse ? key_string = t[i][1]->label : key_string = t[i][0]->label;
  359. reverse ? val_string = t[i][0]->label : val_string = t[i][1]->label;
  360. if (is_quoted (key_string)) key_string = scm_unquote (key_string);
  361. if (is_quoted (val_string)) val_string = scm_unquote (val_string);
  362. //cout << "key: " << key_string << " val: " << val_string << "\n";
  363. if (key_escape == BIT2BIT)
  364. key_string = convert_escapes (key_string, false);
  365. else if (key_escape == UTF8)
  366. key_string = convert_escapes (key_string, true);
  367. else if (key_escape == CHAR_ENTITY)
  368. key_string = convert_char_entities (key_string);
  369. if (val_escape == BIT2BIT)
  370. val_string = convert_escapes (val_string, false);
  371. else if (val_escape == UTF8)
  372. val_string = convert_escapes (val_string, true);
  373. else if (val_escape == ENTITY_NAME)
  374. val_string = "&" * val_string * ";";
  375. //cout << "key: " << key_string << " val: " << val_string << "\n";
  376. put_prefix_code(key_string,val_string,dic);
  377. }
  378. }
  379. }
  380. /***************************************************************************
  381. * Functions for UTF-8 handling
  382. * These functions are helper functions to convert escape string a la "#23F7"
  383. * and HTML/XML character entities to and from UTF-8 byte sequences.
  384. ***************************************************************************/
  385. bool is_hex_digit (char c) {
  386. return
  387. (48 <= c && c <= 57) ||
  388. (65 <= c && c <= 70) ||
  389. (97 <= c && c <= 102);
  390. }
  391. int hex_digit_to_int(unsigned char c) {
  392. if (48 <= c && c <= 57)
  393. return c - 0x30;
  394. else if (65 <= c && c <= 70)
  395. return c - 0x41 + 0x0A;
  396. else if (97 <= c && c <= 102)
  397. return c - 0x61 + 0x0A;
  398. else
  399. return 0;
  400. }
  401. string
  402. convert_escapes (string in, bool utf8) {
  403. // cout << "converting " << in ;
  404. string result;
  405. int i = 0;
  406. while (i < N(in)) {
  407. if (in[i]!='#') result << in[i++];
  408. else {
  409. i++;
  410. unsigned int num = 0;
  411. while (i < N(in) && is_hex_digit(in[i]))
  412. num = 0x10 * num + hex_digit_to_int((unsigned char) in[i++]);
  413. //cout << " to num "; printf("%x",num); cout << " then to ";
  414. if (utf8) result << encode_as_utf8 (num);
  415. else result << string((char)num);
  416. }
  417. }
  418. //for(int i = 0; i < N(result);i++)
  419. // printf("%x ", (unsigned char)result[i]); printf("\n");
  420. return result;
  421. }
  422. string
  423. convert_char_entities (string s) {
  424. int i, n=N(s);
  425. string r;
  426. for (i=0; i<n; /* noop */) {
  427. if (s[i] == '&' && i+1<n && s[i+1] == '#') {
  428. i += 2;
  429. bool okay= false;
  430. string rr= convert_char_entity(s, i, okay);
  431. if (okay) r << rr;
  432. else { r << "&#"; continue; }
  433. }
  434. else r << s[i++];
  435. }
  436. return r;
  437. }
  438. static unsigned int
  439. as_unsigned_int (string s) {
  440. int i=0, n=N(s);
  441. unsigned int val=0;
  442. if (n==0) return 0;
  443. while (i<n) {
  444. if (s[i]<'0') break;
  445. if (s[i]>'9') break;
  446. val *= 10;
  447. val += (int) (s[i]-'0');
  448. i++;
  449. }
  450. return val;
  451. }
  452. string
  453. convert_char_entity (string s, int& start, bool& success) {
  454. // start: position in s after the character entity marker "&#".
  455. success = false;
  456. int i= start;
  457. int n= N(s);
  458. unsigned int num= 0;
  459. if (i >= n) return "";
  460. else if (s[i] == 'x' || s[i] == 'X') {
  461. i++;
  462. // int j=i;
  463. while (i<n && is_hex_digit (s[i])) {
  464. success = true;
  465. num = 0x10 * num + hex_digit_to_int(s[i]);
  466. i++;
  467. }
  468. // if (success) cout << "hex-ent: " << s(j,i) ;
  469. }
  470. else {
  471. int j=i;
  472. while (i<n && is_digit (s[i])) {
  473. success = true;
  474. i++;
  475. }
  476. // if (success) cout << "dec-ent: " << s(j,i) ;
  477. num = as_unsigned_int (s(j,i));
  478. }
  479. if (success) {
  480. if (i<n && s[i]==';') i++;
  481. start= i;
  482. // cout << " --> (" << num << ") " << encode_as_utf8 (num) << '\n' ;
  483. return encode_as_utf8(num);
  484. }
  485. else return "";
  486. }
  487. string
  488. encode_as_utf8 (unsigned int code) {
  489. if (/* 0x0 <= code && */ code <= 0x7F) {
  490. // 0x0ddddddd
  491. return string((char) code);
  492. }
  493. else if (0x80 <= code && code <= 0x7FF) {
  494. // 0x110ddddd 0x10dddddd
  495. string str(2);
  496. str[0] = ((code >> 6) & 0x1F) | 0xC0;
  497. str[1] = (code & 0x3F) | 0x80;
  498. return str;
  499. }
  500. else if (0x800 <= code && code <= 0xFFFF) {
  501. // 0x1110dddd 0x10dddddd 0x10dddddd
  502. string str(3);
  503. str[0] = ((code >> 12) & 0x0F) | 0xE0;
  504. str[1] = ((code >> 6) & 0x3F) | 0x80;
  505. str[2] = (code & 0x3F) | 0x80;
  506. return str;
  507. }
  508. else if (0x10000 <= code && code <= 0x1FFFFF) {
  509. // 0x11110uuu 0x10zzzzzz 0x10yyyyyy 0x10xxxxxx
  510. string str(4);
  511. str[0] = ((code >> 18) & 0x07) | 0xF0;
  512. str[1] = ((code >> 12) & 0x3F) | 0x80;
  513. str[2] = ((code >> 6) & 0x3F) | 0x80;
  514. str[3] = (code & 0x3F) | 0x80;
  515. return str;
  516. }
  517. else return "";
  518. }
  519. unsigned int
  520. decode_from_utf8 (string s, int& i) {
  521. unsigned char c = s[i];
  522. if ((0x80 & c) == 0) {
  523. // 0x0ddddddd
  524. i++;
  525. return (unsigned int) c;
  526. }
  527. unsigned int code;
  528. int trail;
  529. if ((0xE0 & c) == 0xC0) {
  530. // 0x110ddddd 0x10dddddd
  531. trail = 1;
  532. code = c & 0x1F;
  533. }
  534. else if ((0xF0 & c) == 0xE0) {
  535. // 0x1110dddd 0x10dddddd 0x10dddddd
  536. trail = 2;
  537. code = c & 0x0F;
  538. }
  539. else if ((0xF8 & c) == 0xF0) {
  540. // 0x11110dddd 0x10dddddd 0x10dddddd 0x10dddddd
  541. trail = 3;
  542. code = c & 0x07;
  543. }
  544. else {
  545. // failsafe
  546. //cout << "failsafe: " << c << " (" << (unsigned int)(c) << ")\n";
  547. i++;
  548. return (unsigned int) c;
  549. }
  550. for (; trail > 0; trail--) {
  551. i++;
  552. if (i >= N(s)) i= N(s)-1;
  553. c = s[i];
  554. code = (code << 6) | (c & 0x3F);
  555. }
  556. i++;
  557. return code;
  558. }
  559. string
  560. utf8_to_hex_entities (string s) {
  561. string result;
  562. int i, n= N(s);
  563. for (i=0; i<n; ) {
  564. unsigned char c = s[i];
  565. if ((0x80 & c) == 0 || ((0xF8 & c) == 0xF8)) {
  566. result << c;
  567. i++;
  568. }
  569. else {
  570. unsigned int code= decode_from_utf8 (s, i);
  571. string hex= as_hexadecimal (code);
  572. while (N(hex) < 4) hex = "0" * hex;
  573. //cout << "entity: " << hex << " (" << code << ")\n";
  574. result << "&#x" << hex << ";";
  575. }
  576. }
  577. return result;
  578. /*
  579. string result;
  580. const int n = N(s);
  581. int i;
  582. for (i=0; i<n; i++) {
  583. unsigned char c = s[i];
  584. if ((0x80 & c) == 0) {
  585. // 0x0ddddddd
  586. //cout << "ASCII: " << c << '\n';
  587. result << c;
  588. continue;
  589. }
  590. unsigned int code;
  591. int trail;
  592. if ((0xE0 & c) == 0xC0) {
  593. // 0x110ddddd 0x10dddddd
  594. trail = 1;
  595. code = c & 0x1F;
  596. }
  597. else if ((0xF0 & c) == 0xE0) {
  598. // 0x1110dddd 0x10dddddd 0x10dddddd
  599. trail = 2;
  600. code = c & 0x0F;
  601. }
  602. else if ((0xF8 & c) == 0xF0) {
  603. // 0x11110dddd 0x10dddddd 0x10dddddd 0x10dddddd
  604. trail = 3;
  605. code = c & 0x07;
  606. }
  607. else {
  608. // failsafe
  609. //cout << "failsafe: " << c << " (" << (unsigned int)(c) << ")\n";
  610. result << c;
  611. continue;
  612. }
  613. for (; trail > 0; trail--) {
  614. // Garbage in, garbage out. Do not resync when input is bad.
  615. i++;
  616. c = s[i];
  617. code = (code << 6) | (c & 0x3F);
  618. }
  619. string hex= as_hexadecimal (code);
  620. while (N(hex) < 4) hex = "0" * hex;
  621. //cout << "entity: " << hex << " (" << code << ")\n";
  622. result << "&#x" << hex << ";";
  623. }
  624. return result;
  625. */
  626. }