PageRenderTime 57ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/source/src/Data/String/analyze.cpp

http://itexmacs.googlecode.com/
C++ | 1289 lines | 1205 code | 51 blank | 33 comment | 136 complexity | d5994a9c42d49fb2e6b40eed26671704 MD5 | raw file
Possible License(s): GPL-3.0, GPL-2.0, MPL-2.0-no-copyleft-exception, LGPL-2.0
  1. /******************************************************************************
  2. * MODULE : analyze.cpp
  3. * DESCRIPTION: Properties of characters and strings
  4. * COPYRIGHT : (C) 1999 Joris van der Hoeven
  5. *******************************************************************************
  6. * This software falls under the GNU general public license version 3 or later.
  7. * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
  8. * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
  9. ******************************************************************************/
  10. #include "analyze.hpp"
  11. #include "merge_sort.hpp"
  12. #include "converter.hpp"
  13. #include "Scheme/object.hpp"
  14. /******************************************************************************
  15. * Tests for characters
  16. ******************************************************************************/
  // Returns true iff c is an ASCII letter ('a'..'z' or 'A'..'Z').
  bool
  is_alpha (char c) {
    return ((c>='a') && (c<='z')) || ((c>='A') && (c<='Z'));
  }
  // Returns true iff c is an ASCII letter, or an 8-bit code >= 128 other
  // than 159, 189, 190 and 191 (those four codes are explicitly excluded).
  bool
  is_iso_alpha (char c) {
    const int code= (int) ((unsigned char) c);
    if (((c>='a') && (c<='z')) || ((c>='A') && (c<='Z'))) return true;
    return (code >= 128) && (code != 159) &&
           (code != 189) && (code != 190) && (code != 191);
  }
  // Returns true iff c counts as a lower-case letter: 'a'..'z', or an
  // 8-bit code in [160, 189) or >= 224.
  // NOTE(review): the 8-bit ranges presumably follow the Cork encoding
  // used elsewhere in this file -- confirm.
  bool
  is_locase (char c) {
    const int code= (int) ((unsigned char) c);
    if ((c>='a') && (c<='z')) return true;
    if ((code >= 160) && (code < 189)) return true;
    return code >= 224;
  }
  // Returns true iff c counts as an upper-case letter: 'A'..'Z', or an
  // 8-bit code in [128, 159) or [192, 224).
  // NOTE(review): the 8-bit ranges presumably follow the Cork encoding
  // used elsewhere in this file -- confirm.
  bool
  is_upcase (char c) {
    const int code= (int) ((unsigned char) c);
    if ((c>='A') && (c<='Z')) return true;
    if ((code >= 128) && (code < 159)) return true;
    return (code >= 192) && (code < 224);
  }
  // Returns true iff c is a decimal digit '0'..'9'.
  bool
  is_digit (char c) {
    return (c>='0') && (c<='9');
  }
  // Returns true iff c is a decimal digit or the decimal point '.'.
  bool
  is_numeric (char c) {
    if (c == '.') return true;
    return (c>='0') && (c<='9');
  }
  // Returns true iff c is one of the punctuation characters
  // . , : ' ` ; ! ?
  bool
  is_punctuation (char c) {
    switch (c) {
    case '.': case ',': case ':': case '\'':
    case '`': case ';': case '!': case '?':
      return true;
    default:
      return false;
    }
  }
  // Returns true iff c is a space, horizontal tab (octal 11),
  // line feed (octal 12) or carriage return (octal 15).
  bool
  is_space (char c) {
    return (c == ' ') || (c == '\11') || (c == '\12') || (c == '\15');
  }
  63. /******************************************************************************
  64. * Tests for strings
  65. ******************************************************************************/
  66. bool
  67. is_alpha (string s) {
  68. int i;
  69. if (N(s)==0) return false;
  70. for (i=0; i<N(s); i++)
  71. if (!is_alpha (s[i])) return false;
  72. return true;
  73. }
  74. bool
  75. is_locase_alpha (string s) {
  76. int i;
  77. if (N(s)==0) return false;
  78. for (i=0; i<N(s); i++)
  79. if (s[i]<'a' || s[i]>'z') return false;
  80. return true;
  81. }
  82. bool
  83. is_iso_alpha (string s) {
  84. int i;
  85. if (N(s)==0) return false;
  86. for (i=0; i<N(s); i++)
  87. if (!is_iso_alpha (s[i])) return false;
  88. return true;
  89. }
  90. bool
  91. is_numeric (string s) {
  92. int i;
  93. if (N(s)==0) return false;
  94. for (i=0; i<N(s); i++)
  95. if (!is_numeric (s[i])) return false;
  96. return true;
  97. }
  98. /******************************************************************************
  99. * Changing cases
  100. ******************************************************************************/
  101. char
  102. upcase (char c) {
  103. if (is_locase (c))
  104. return (char) (((int) ((unsigned char) c)) - 32);
  105. else return c;
  106. }
  107. char
  108. locase (char c) {
  109. if (is_upcase (c))
  110. return (char) (((int) ((unsigned char) c)) + 32);
  111. else return c;
  112. }
  113. string
  114. upcase_first (string s) {
  115. if ((N(s)==0) || (!is_locase (s[0]))) return s;
  116. return string ((char) (((int) ((unsigned char) s[0]))-32)) * s (1, N(s));
  117. }
  118. string
  119. locase_first (string s) {
  120. if ((N(s)==0) || (!is_upcase (s[0]))) return s;
  121. return string ((char) (((int) ((unsigned char) s[0]))+32)) * s (1, N(s));
  122. }
  123. string
  124. upcase_all (string s) {
  125. int i;
  126. string r (N(s));
  127. for (i=0; i<N(s); i++)
  128. if (!is_locase (s[i])) r[i]= s[i];
  129. else r[i]= (char) (((int) ((unsigned char) s[i]))-32);
  130. return r;
  131. }
  132. string
  133. locase_all (string s) {
  134. int i;
  135. string r (N(s));
  136. for (i=0; i<N(s); i++)
  137. if (!is_upcase (s[i])) r[i]= s[i];
  138. else r[i]= (char) (((int) ((unsigned char) s[i]))+32);
  139. return r;
  140. }
  141. /******************************************************************************
  142. * Inserting or removing a character into a string as a set of characters
  143. ******************************************************************************/
  144. string
  145. string_union (string s1, string s2) {
  146. return string_minus (s1, s2) * s2;
  147. }
  148. string
  149. string_minus (string s1, string s2) {
  150. string r;
  151. int i1, n1= N(s1), i2, n2= N(s2);
  152. for (i1=0; i1<n1; i1++) {
  153. for (i2=0; i2<n2; i2++)
  154. if (s1[i1] == s2[i2]) break;
  155. if (i2==n2) r << s1[i1];
  156. }
  157. return r;
  158. }
  159. /******************************************************************************
  160. * Spanish and German in relation with ispell
  161. ******************************************************************************/
  162. string
  163. ispanish_to_spanish (string s) {
  164. int i, n= N(s);
  165. string r;
  166. for (i=0; i<n; i++)
  167. if ((s[i] == '\'') && ((i+1)<n)) {
  168. switch (s[i+1]) {
  169. case 'A': r << 'Á'; break;
  170. case 'E': r << 'É'; break;
  171. case 'I': r << 'Í'; break;
  172. case 'N': r << '?'; break;
  173. case 'O': r << 'Ó'; break;
  174. case 'U': r << 'Ú'; break;
  175. case 'Y': r << 'Ý'; break;
  176. case 'a': r << 'á'; break;
  177. case 'e': r << 'é'; break;
  178. case 'i': r << 'í'; break;
  179. case 'n': r << '?'; break;
  180. case 'o': r << 'ó'; break;
  181. case 'u': r << 'ú'; break;
  182. case 'y': r << 'ý'; break;
  183. default : r << '\'' << s[i+1];
  184. }
  185. i++;
  186. }
  187. else r << s[i];
  188. return r;
  189. }
  190. string
  191. spanish_to_ispanish (string s) {
  192. int i, n= N(s);
  193. string r;
  194. for (i=0; i<n; i++)
  195. switch (s[i]) {
  196. case 'Á': r << "'A"; break;
  197. case 'É': r << "'E"; break;
  198. case 'Í': r << "'I"; break;
  199. case '?': r << "'N"; break;
  200. case 'Ó': r << "'O"; break;
  201. case 'Ú': r << "'U"; break;
  202. case 'Ý': r << "'Y"; break;
  203. case 'á': r << "'a"; break;
  204. case 'é': r << "'e"; break;
  205. case 'í': r << "'i"; break;
  206. case '?': r << "'n"; break;
  207. case 'ó': r << "'o"; break;
  208. case 'ú': r << "'u"; break;
  209. case 'ý': r << "'y"; break;
  210. default : r << s[i];
  211. }
  212. return r;
  213. }
  214. string
  215. igerman_to_german (string s) {
  216. int i, n= N(s);
  217. string r;
  218. for (i=0; i<n; i++)
  219. if (s[i] == 'ß') r << '?';
  220. else r << s[i];
  221. return r;
  222. }
  223. string
  224. german_to_igerman (string s) {
  225. int i, n= N(s);
  226. string r;
  227. for (i=0; i<n; i++)
  228. if (s[i] == '?') r << 'ß';
  229. else r << s[i];
  230. return r;
  231. }
  232. /******************************************************************************
  233. * Iso latin 2 encoding for polish and czech
  234. ******************************************************************************/
  // Lookup tables used by il2_to_cork (char) and cork_to_il2 (char) below.
  // Entry [code-128] is the replacement for the byte value `code`, so each
  // table is indexed with values 0..127 and needs 128 single-byte entries.
  // NOTE(review): in this copy the literals contain HTML entities such as
  // "&#x20AC;" and '?' placeholders, which looks like transcoding damage;
  // restore both tables from a pristine copy before building.
  235. static string il2_to_cork_string=
  236. "&#x20AC; &#x201A;&#x192;&#x201E;&#x2026;&#x2020;&#x2021;&#x2C6;&#x2030;&#x160;&#x2039;&#x152;?&#x17D;??&#x2018;&#x2019;&#x201C;&#x201D;&#x2022;&#x2013;&#x2014;&#x2DC;&#x2122;&#x161;&#x203A;&#x153;?&#x17E;&#x; &#x160; &#x2030;&#x2018;&#x;&#x2019;&#x201C;&#x201D;&#x2122;&#x161;&#x203A; ? ?Š? ??´š???ÁÂ&#x20AC;Ä&#x2C6;&#x201A;Ç&#x192;É&#x2020;Ë&#x2026;ÍÎ&#x201E;?&#x2039;&#x152;ÓÔ&#x17D;Ö.?&#x2014;Ú&#x2013;ÜÝ&#x2022;??áâ ä¨?ç?é?ë?íî¤&#x17E;??óôŽö/°?ú?üý? ";
  237. static string cork_to_il2_string=
  238. "???????G????? ????Š?????Y?Ž?II?§???????g????? ????š?????y?ž?i!?LAÁÂAÄAAÇEÉEËIÍÎI?NOÓÔOÖOOUÚUÜÝ Saáâaäaaçeéeëiíîi?noóôoöoouúuüý ß";
  239. static char
  240. il2_to_cork (char c) {
  241. int i= (int) ((unsigned char) c);
  242. if (i<128) return c;
  243. return il2_to_cork_string [i-128];
  244. }
  245. static char
  246. cork_to_il2 (char c) {
  247. int i= (int) ((unsigned char) c);
  248. if (i<128) return c;
  249. return cork_to_il2_string [i-128];
  250. }
  251. string
  252. il2_to_cork (string s) {
  253. int i, n= N(s);
  254. string r (n);
  255. for (i=0; i<n; i++)
  256. r[i]= il2_to_cork (s[i]);
  257. return r;
  258. }
  259. string
  260. cork_to_il2 (string s) {
  261. int i, n= N(s);
  262. string r (n);
  263. for (i=0; i<n; i++)
  264. r[i]= cork_to_il2 (s[i]);
  265. return r;
  266. }
  267. /******************************************************************************
  268. * Koi8 encoding for russian
  269. ******************************************************************************/
  // Lookup tables used by koi8_to_iso (char, bool) and iso_to_koi8
  // (char, bool) below. Entry [code-192] is the replacement for the byte
  // value `code`, so each table is indexed with values 0..63 and needs
  // 64 single-byte entries.
  // NOTE(review): the '?' characters in these literals look like
  // transcoding damage in this copy -- restore from a pristine copy.
  270. static string koi8_to_iso_string=
  271. "áâ÷çä?öúé?ë?íî???óô??????ý???ü??ÁÂ×ÇÄ?ÖÚÉ?Ë?ÍÎ???ÓÔ??????Ýß??Ü??";
  272. static string iso_to_koi8_string=
  273. "??áöä?ô???é?ë?íî?????ó?âü?ç?ý?÷ú??ÁÖÄ?Ô???É?Ë?ÍÎ?ß???Ó?ÂÜ?Ç?Ý?×Ú";
  // Converts one byte from the koi8 side to the iso side.
  // Codes 156 and 188 are handled first by dedicated returns. When
  // `ukrainian` is set, the switch below handles a further set of
  // characters. Any remaining code below 192 is returned unchanged;
  // codes from 192 upwards are looked up in koi8_to_iso_string [code-192].
  // NOTE(review): several character literals below ('?', HTML entities
  // such as '&#x2C6;') look like transcoding damage in this copy and will
  // not compile as intended -- restore from a pristine copy.
  274. static char
  275. koi8_to_iso (char c, bool ukrainian) {
  276. int i= (int) ((unsigned char) c);
  277. if (i==156) return '?';
  278. if (i==188) return '?';
  279. if (ukrainian)
  280. {
  281. switch(c)
  282. {
  283. case 'I':return '?';
  284. case '&#x2C6;':return '?';
  285. case '&#x2122;':return '´';
  286. case '&#x20AC;':return '?';
  287. case 'i':return '?';
  288. case '¨':return '§';
  289. case 'š':return '¤';
  290. case ' ':return '­';
  291. }
  292. }
  // No special case applied: pass low codes through, translate the rest.
  293. if (i<192) return c;
  294. return koi8_to_iso_string [i-192];
  295. }
  // Converts one byte from the iso side to the koi8 side.
  // Two characters are mapped first to codes 156 and 188. When `ukrainian`
  // is set, the switch below handles a further set of characters. Any
  // remaining code below 192 is returned unchanged; codes from 192 upwards
  // are looked up in iso_to_koi8_string [code-192].
  // NOTE(review): several character literals below ('?', HTML entities
  // such as '&#x2C6;') look like transcoding damage in this copy; the
  // repeated '?' labels cannot be what was intended -- restore from a
  // pristine copy.
  296. static char
  297. iso_to_koi8 (char c, bool ukrainian) {
  298. int i= (int) ((unsigned char) c);
  299. if (c=='?') return (char) 156;
  300. if (c=='?') return (char) 188;
  301. if (ukrainian)
  302. {
  303. switch(c)
  304. {
  305. case '?':return 'I';
  306. case '?':return '&#x2C6;';
  307. case '´':return '&#x2122;';
  308. case '?':return '&#x20AC;';
  309. case '?':return 'i';
  310. case '§':return '¨';
  311. case '¤':return 'š';
  312. case '­':return ' ';
  313. }
  314. }
  // No special case applied: pass low codes through, translate the rest.
  315. if (i<192) return c;
  316. return iso_to_koi8_string [i-192];
  317. }
  318. string
  319. koi8_to_iso (string s) {
  320. int i, n= N(s);
  321. string r (n);
  322. for (i=0; i<n; i++)
  323. r[i]= koi8_to_iso (s[i], false);
  324. return r;
  325. }
  326. string
  327. iso_to_koi8 (string s) {
  328. int i, n= N(s);
  329. string r (n);
  330. for (i=0; i<n; i++)
  331. r[i]= iso_to_koi8 (s[i], false);
  332. return r;
  333. }
  334. string
  335. koi8uk_to_iso (string s) {
  336. int i, n= N(s);
  337. string r (n);
  338. for (i=0; i<n; i++)
  339. r[i]= koi8_to_iso (s[i], true);
  340. return r;
  341. }
  342. string
  343. iso_to_koi8uk (string s) {
  344. int i, n= N(s);
  345. string r (n);
  346. for (i=0; i<n; i++)
  347. r[i]= iso_to_koi8 (s[i], true);
  348. return r;
  349. }
  350. /******************************************************************************
  351. * Convert between TeXmacs and XML strings
  352. ******************************************************************************/
  353. static bool
  354. is_xml_name (char c) {
  355. return
  356. is_alpha (c) || is_numeric (c) ||
  357. (c == '.') || (c == '-') || (c == ':');
  358. }
  359. string
  360. tm_to_xml_name (string s) {
  361. string r;
  362. int i, n= N(s);
  363. for (i=0; i<n; i++)
  364. if (is_xml_name (s[i])) r << s[i];
  365. else r << "_" << as_string ((int) ((unsigned char) s[i])) << "_";
  366. return r;
  367. }
  368. string
  369. xml_name_to_tm (string s) {
  370. string r;
  371. int i, n= N(s);
  372. for (i=0; i<n; i++)
  373. if (s[i] != '_') r << s[i];
  374. else {
  375. int start= ++i;
  376. while ((i<n) && (s[i]!='_')) i++;
  377. r << (char) ((unsigned char) as_int (s (start, i)));
  378. }
  379. return r;
  380. }
  381. string
  382. old_tm_to_xml_cdata (string s) {
  383. string r;
  384. int i, n= N(s);
  385. for (i=0; i<n; i++)
  386. if (s[i] == '&') r << "&amp;";
  387. else if (s[i] == '>') r << "&gt;";
  388. else if (s[i] != '<') r << s[i];
  389. else {
  390. int start= ++i;
  391. while ((i<n) && (s[i]!='>')) i++;
  392. r << "&" << tm_to_xml_name (s (start, i)) << ";";
  393. }
  394. return r;
  395. }
  // Converts a TeXmacs string into an object describing XML character data.
  // Plain text is accumulated in r ('&' and '>' escaped, other characters
  // passed through cork_to_utf8). A symbol "<...>" is emitted as UTF-8 text
  // when cork_to_utf8 changes it and utf8_to_cork maps it back to the same
  // symbol; otherwise the pending text is flushed and a (tm-sym "name")
  // pair is appended.
  // The result is the empty string object when nothing was produced, the
  // single item when there is exactly one, and otherwise the result of
  // calling "list" on the array (whose first element is the symbol
  // !concat).
  396. object
  397. tm_to_xml_cdata (string s) {
  398. array<object> a;
  399. a << symbol_object ("!concat");
  400. string r;
  401. int i, n= N(s);
  402. for (i=0; i<n; i++)
  403. if (s[i] == '&') r << "&amp;";
  404. else if (s[i] == '>') r << "&gt;";
  405. else if (s[i] == '\\') r << "\\";
  406. else if (s[i] != '<') r << cork_to_utf8 (s (i, i+1));
  407. else {
  // ss is the whole symbol including its angle brackets.
  // NOTE(review): when no closing '>' exists, i+1 exceeds N(s) here.
  408. int start= i++;
  409. while ((i<n) && (s[i]!='>')) i++;
  410. string ss= s (start, i+1);
  411. string rr= cork_to_utf8 (ss);
  412. string qq= utf8_to_cork (rr);
  413. if (rr != ss && qq == ss) r << rr;
  414. else {
  415. if (r != "") a << object (r);
  416. a << cons (symbol_object ("tm-sym"),
  417. cons (ss (1, N(ss)-1),
  418. null_object ()));
  419. r= "";
  420. }
  421. }
  422. if (r != "") a << object (r);
  423. if (N(a) == 1) return object ("");
  424. else if (N(a) == 2) return a[1];
  425. else return call ("list", a);
  426. }
  427. string
  428. old_xml_cdata_to_tm (string s) {
  429. string r;
  430. int i, n= N(s);
  431. for (i=0; i<n; i++)
  432. if (s[i] == '<') r << "<less>";
  433. else if (s[i] == '>') r << "<gtr>";
  434. else if (s[i] != '&') r << s[i];
  435. else {
  436. int start= ++i;
  437. while ((i<n) && (s[i]!=';')) i++;
  438. string x= "<" * xml_name_to_tm (s (start, i)) * ">";
  439. if (x == "<amp>") r << "&";
  440. else r << x;
  441. }
  442. return r;
  443. }
  444. string
  445. xml_unspace (string s, bool first, bool last) {
  446. string r;
  447. int i= 0, n= N(s);
  448. if (first) while ((i<n) && is_space (s[i])) i++;
  449. while (i<n)
  450. if (!is_space (s[i])) r << s[i++];
  451. else {
  452. while ((i<n) && is_space (s[i])) i++;
  453. if ((i<n) || (!last)) r << ' ';
  454. }
  455. return r;
  456. }
  457. bool
  458. contains_unicode_char (string s) {
  459. int i= 0, n= N(s);
  460. while (i+1<n) {
  461. if (s[i] == '<' && s[i+1] == '#') return true;
  462. tm_char_forwards (s, i);
  463. }
  464. return false;
  465. }
  466. /******************************************************************************
  467. * Roman and alpha numbers
  468. ******************************************************************************/
  // Lower-case roman numeral fragments used by roman_nr, indexed by the
  // decimal digit (0..9) in the units, tens and hundreds position.
  469. static string ones[10]= {
  470. "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" };
  471. static string tens[10]= {
  472. "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" };
  473. static string hundreds[10]= {
  474. "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" };
  475. string
  476. roman_nr (int nr) {
  477. if (nr<0) return "-" * roman_nr (nr);
  478. if (nr==0) return "o";
  479. if (nr>1000) return "m" * roman_nr (nr-1000);
  480. if (nr==1000) return "m";
  481. if (nr==999) return "im";
  482. if (nr==499) return "id";
  483. if ((nr%100)==99) return hundreds[nr/100] * "ic";
  484. if ((nr%100)==49) return hundreds[nr/100] * "il";
  485. return hundreds[nr/100] * tens[(nr%100)/10] * ones[nr%10];
  486. }
  487. string
  488. Roman_nr (int nr) {
  489. return upcase_all (roman_nr (nr));
  490. }
  491. string
  492. alpha_nr (int nr) {
  493. if (nr<0) return "-" * alpha_nr (nr);
  494. if (nr==0) return "0";
  495. if (nr<=26) return string ((char) (((int) 'a')+ nr-1));
  496. return alpha_nr ((nr-1)/26) * alpha_nr (((nr-1)%26)+1);
  497. }
  498. string
  499. Alpha_nr (int nr) {
  500. return upcase_all (alpha_nr (nr));
  501. }
  502. string
  503. fnsymbol_nr (int nr) {
  504. string sym, r;
  505. int i, m= (nr-1)%3, n= ((nr-1)/3)+1;
  506. switch (m) {
  507. case 0: sym= "<ast>"; break;
  508. case 1: sym= "<dag>"; break;
  509. case 2: sym= "<ddag>"; break;
  510. }
  511. for (i=0; i<n; i++) r << sym;
  512. return r;
  513. }
  514. /******************************************************************************
  515. * Conversions to and from hexadecimal
  516. ******************************************************************************/
  // Upper-case hexadecimal digits, indexed by nibble value 0..15.
  517. static const char* hex_string= "0123456789ABCDEF";
  518. string
  519. as_hexadecimal (int i) {
  520. if (i<0) return "-" * as_hexadecimal (-i);
  521. if (i<16) return hex_string [i & 15];
  522. return as_hexadecimal (i >> 4) * hex_string [i & 15];
  523. }
  524. string
  525. as_hexadecimal (pointer ptr) {
  526. intptr_t i= (intptr_t) ptr;
  527. if (i<0) return "-" * as_hexadecimal (-i);
  528. if (i<16) return hex_string [i & 15];
  529. return as_hexadecimal (i >> 4) * hex_string [i & 15];
  530. }
  531. string
  532. as_hexadecimal (int i, int len) {
  533. if (len==1) return hex_string [i & 15];
  534. else return as_hexadecimal (i >> 4, len-1) * hex_string [i & 15];
  535. }
  536. int
  537. from_hexadecimal (string s) {
  538. int i, n= N(s), res= 0;
  539. if ((n>0) && (s[0]=='-'))
  540. return -from_hexadecimal (s (1, n));
  541. for (i=0; i<n; i++) {
  542. res= res << 4;
  543. if ((s[i] >= '0') && (s[i] <= '9')) res += (int) (s[i] - '0');
  544. if ((s[i] >= 'A') && (s[i] <= 'F')) res += (int) (s[i] + 10 - 'A');
  545. if ((s[i] >= 'a') && (s[i] <= 'f')) res += (int) (s[i] + 10 - 'a');
  546. }
  547. return res;
  548. }
  549. /******************************************************************************
  550. * Routines for the TeXmacs encoding
  551. ******************************************************************************/
  552. string
  553. tm_encode (string s) {
  554. // verbatim to TeXmacs encoding
  555. register int i;
  556. string r;
  557. for (i=0; i<N(s); i++) {
  558. if (s[i]=='<') r << "<less>";
  559. else if (s[i]=='>') r << "<gtr>";
  560. else r << s[i];
  561. }
  562. return r;
  563. }
  564. string
  565. tm_decode (string s) {
  566. // TeXmacs encoding to verbatim
  567. register int i;
  568. string r;
  569. for (i=0; i<N(s); i++) {
  570. if (s[i]=='<') {
  571. register int j;
  572. for (j=i+1; j<N(s); j++)
  573. if (s[j]=='>') break;
  574. if (j<N(s)) j++;
  575. if (s(i,j) == "<less>") r << "<";
  576. else if (s(i,j) == "<gtr>") r << ">";
  577. i=j-1;
  578. if (s[i]!='>') return r;
  579. }
  580. else if (s[i]!='>') r << s[i];
  581. }
  582. return r;
  583. }
  584. string
  585. tm_var_encode (string s) {
  586. register int i, n= N(s);
  587. string r;
  588. for (i=0; i<n; i++) {
  589. if (s[i]=='<') {
  590. if (i+1 < n && s[i+1] == '#') {
  591. while (i<n && s[i] != '>') r << s[i++];
  592. if (i<n) r << s[i];
  593. }
  594. else r << "<less>";
  595. }
  596. else if (s[i]=='>') r << "<gtr>";
  597. else r << s[i];
  598. }
  599. return r;
  600. }
  601. string
  602. tm_correct (string s) {
  603. register int i;
  604. string r;
  605. for (i=0; i<N(s); i++) {
  606. if (s[i]=='<') {
  607. register bool flag= true;
  608. register int j, k;
  609. for (j=i+1; j<N(s); j++)
  610. if (s[j]=='>') break;
  611. if (j==N(s)) return r;
  612. for (k=i+1; k<j; k++)
  613. if (s[k]=='<') flag= false;
  614. if (flag) r << s(i,j+1);
  615. i=j;
  616. }
  617. else if (s[i]!='>') r << s[i];
  618. }
  619. return r;
  620. }
  621. void
  622. tm_char_forwards (string s, int& pos) {
  623. ASSERT (pos >= 0 && pos <= N(s), "out of range");
  624. int n= N(s);
  625. if (pos == n);
  626. else if (s[pos] != '<') pos++;
  627. else {
  628. while (pos<n && s[pos] != '>') pos++;
  629. if (pos<n) pos++;
  630. }
  631. }
  632. void
  633. tm_char_backwards (string s, int& pos) {
  634. ASSERT (pos >= 0 && pos <= N(s), "out of range");
  635. if (pos == 0);
  636. else if (s[pos-1] != '>') pos--;
  637. else {
  638. while (pos>0 && s[pos-1] != '<') pos--;
  639. if (pos>0) pos--;
  640. }
  641. }
  642. int
  643. tm_char_next (string s, int pos) {
  644. tm_char_forwards (s, pos);
  645. return pos;
  646. }
  647. int
  648. tm_char_previous (string s, int pos) {
  649. tm_char_backwards (s, pos);
  650. return pos;
  651. }
  652. string
  653. tm_forward_access (string s, int k) {
  654. int pos= 0;
  655. for (int i=0; i<k; i++)
  656. tm_char_forwards (s, pos);
  657. int start= pos;
  658. tm_char_forwards (s, pos);
  659. return s (start, pos);
  660. }
  661. string
  662. tm_backward_access (string s, int k) {
  663. int pos= N(s);
  664. for (int i=0; i<k; i++)
  665. tm_char_backwards (s, pos);
  666. int end= pos;
  667. tm_char_backwards (s, pos);
  668. return s (pos, end);
  669. }
  670. int
  671. tm_string_length (string s) {
  672. int i= 0, pos= 0;
  673. while (pos < N(s)) {
  674. tm_char_forwards (s, pos);
  675. i++;
  676. }
  677. return i;
  678. }
  679. array<string>
  680. tm_tokenize (string s) {
  681. array<string> r;
  682. int pos= 0;
  683. while (pos < N(s)) {
  684. int start= pos;
  685. tm_char_forwards (s, pos);
  686. r << s (start, pos);
  687. }
  688. return r;
  689. }
  690. string
  691. tm_recompose (array<string> a) {
  692. string r;
  693. for (int i=0; i<N(a); i++)
  694. r << a[i];
  695. return r;
  696. }
  697. /******************************************************************************
  698. * Quoting
  699. ******************************************************************************/
  700. string
  701. scm_quote (string s) {
  702. // R5RS compliant external string representation.
  703. int i, n= N(s);
  704. string r;
  705. r << '"';
  706. for (i=0; i<n; i++)
  707. switch (s[i]) {
  708. case '\"':
  709. case '\\':
  710. r << '\\' << s[i];
  711. break;
  712. default:
  713. r << s[i];
  714. }
  715. r << '"';
  716. return r;
  717. }
  718. string
  719. scm_unquote (string s) {
  720. if ((N(s)>=2) && (s[0]=='\"') && (s[N(s)-1]=='\"')) {
  721. int i, n= N(s);
  722. string r;
  723. for (i=1; i<n-1; i++)
  724. if (s[i] == '\\' && (s[i+1] == '\"' || s[i+1] == '\\')) r << s[++i];
  725. else r << s[i];
  726. return r;
  727. }
  728. else return s;
  729. }
  730. string
  731. raw_quote (string s) {
  732. // Mark the label of a STRING tree as representing a string and not a symbol.
  733. return "\"" * s * "\"";
  734. }
  735. string
  736. raw_unquote (string s) {
  737. // Get the string value of a STRING tree label representing a string.
  738. if ((N(s)>=2) && (s[0]=='\"') && (s[N(s)-1]=='\"'))
  739. return s (1, N(s)-1);
  740. else return s;
  741. }
  742. /******************************************************************************
  743. * Handling escape characters
  744. ******************************************************************************/
  745. string
  746. escape_sh (string s) {
  747. #if defined (__MINGW__) || defined (__MINGW32__) || defined (OS_WIN32)
  748. return raw_quote (s);
  749. #else
  750. int i, n= N(s);
  751. string r;
  752. for (i=0; i<n; i++)
  753. switch (s[i]) {
  754. case '?':
  755. case '&':
  756. case '$':
  757. case '`':
  758. case '\"':
  759. case '\\':
  760. case ' ':
  761. r << '\\' << s[i];
  762. break;
  763. default:
  764. r << s[i];
  765. }
  766. return r;
  767. #endif
  768. }
  769. string
  770. escape_generic (string s) {
  771. int i, n= N(s);
  772. string r;
  773. for (i=0; i<n; i++) {
  774. if ((s[i] == '\2') || (s[i] == '\5') || (s[i] == '\33')) r << '\33';
  775. r << s[i];
  776. }
  777. return r;
  778. }
  779. string
  780. escape_verbatim (string s) {
  781. int i, n= N(s);
  782. string r;
  783. for (i=0; i<n; i++) {
  784. unsigned char c= (unsigned char) s[i];
  785. if ((c == '\n') || (c == '\t')) r << ' ';
  786. else if (((int) c) >= 32) r << s[i];
  787. }
  788. return r;
  789. }
  790. string
  791. escape_spaces (string s) {
  792. int i, n= N(s);
  793. string r;
  794. for (i=0; i<n; i++) {
  795. unsigned char c= (unsigned char) s[i];
  796. if (c == ' ') r << '\\';
  797. r << c;
  798. }
  799. return r;
  800. }
  801. string
  802. dos_to_better (string s) {
  803. int i, n= N(s);
  804. string r;
  805. for (i=0; i<n; i++)
  806. if (s[i] == '\015');
  807. else r << s[i];
  808. return r;
  809. }
  810. /******************************************************************************
  811. * Reading input from a string
  812. ******************************************************************************/
  813. bool
  814. test (string s, int i, const char* test) {
  815. int n= N(s), j=0;
  816. while (test[j]!='\0') {
  817. if (i>=n) return false;
  818. if (s[i]!=test[j]) return false;
  819. i++; j++;
  820. }
  821. return true;
  822. }
  823. bool
  824. test (string s, int i, string test) {
  825. int n= N(s), m= N(test), j=0;
  826. while (j<m) {
  827. if (i>=n) return false;
  828. if (s[i]!=test[j]) return false;
  829. i++; j++;
  830. }
  831. return true;
  832. }
  833. bool
  834. starts (string s, const char* what) {
  835. return test (s, 0, what);
  836. }
  837. bool
  838. starts (string s, const string what) {
  839. return test (s, 0, what);
  840. }
  841. bool
  842. ends (string s, const char* what) {
  843. string r (what);
  844. if (N(r) > N(s)) return false;
  845. return s (N(s)-N(r), N(s)) == r;
  846. }
  847. bool
  848. ends (string s, const string r) {
  849. if (N(r) > N(s)) return false;
  850. return s (N(s)-N(r), N(s)) == r;
  851. }
  852. bool
  853. read (string s, int& i, const char* test) {
  854. int n= N(s), j=0, k=i;
  855. while (test[j]!='\0') {
  856. if (k>=n) return false;
  857. if (s[k]!=test[j]) return false;
  858. j++; k++;
  859. }
  860. i=k;
  861. return true;
  862. }
  863. bool
  864. read (string s, int& i, string test) {
  865. int n= N(s), m= N(test), j=0, k=i;
  866. while (j<m) {
  867. if (k>=n) return false;
  868. if (s[k]!=test[j]) return false;
  869. j++; k++;
  870. }
  871. i=k;
  872. return true;
  873. }
  874. bool
  875. read_line (string s, int& i, string& result) {
  876. int start= i;
  877. for (; i<N(s); i++) {
  878. if (s[i]=='\n') {
  879. result= s(start,i++);
  880. return true;
  881. }
  882. }
  883. result= s(start,i);
  884. return false;
  885. }
  886. bool
  887. read_int (string s, int& i, int& result) {
  888. int n= N(s), start= i;
  889. result= 0;
  890. if (i==n) return false;
  891. if (s[i]=='-') {
  892. if (i+1==n) return false;
  893. if (!is_digit (s[i+1])) return false;
  894. i++;
  895. }
  896. else if (!is_digit (s[i])) return false;
  897. while ((i<n) && is_digit (s[i])) i++;
  898. result= as_int (s(start,i));
  899. return true;
  900. }
  901. bool
  902. read_double (string s, int& i, double& result) {
  903. int n= N(s), start= i;
  904. result= 0.0;
  905. if (i==n) return false;
  906. if (s[i]=='-') {
  907. if (i+1==n) return false;
  908. if (!is_numeric (s[i+1])) return false;
  909. i++;
  910. }
  911. else if (!is_numeric (s[i])) return false;
  912. while ((i<n) && is_digit (s[i])) i++;
  913. if ((i<n) && (s[i]=='.')) i++;
  914. while ((i<n) && is_digit (s[i])) i++;
  915. if ((i<n) && ((s[i]=='e') || (s[i]=='E'))) {
  916. i++;
  917. if ((i<n) && (s[i]=='-')) i++;
  918. if ((i==n) || (!is_digit (s[i]))) { i=start; return false; }
  919. while ((i<n) && is_digit (s[i])) i++;
  920. }
  921. result= as_double (s(start,i));
  922. return true;
  923. }
  924. void
  925. skip_spaces (string s, int& i) {
  926. int n=N(s);
  927. while ((i<n) && ((s[i]==' ') || (s[i]=='\t'))) i++;
  928. }
  929. void
  930. skip_line (string s, int& i) {
  931. int n=N(s);
  932. while ((i<n) && (s[i]!='\n')) i++;
  933. if (i<n) i++;
  934. }
  935. void
  936. skip_symbol (string s, int& i) {
  937. int n=N(s);
  938. if (i<n) {
  939. if (s[i]=='<') {
  940. for (i++; i<n; i++)
  941. if (s[i-1]=='>') break;
  942. }
  943. else i++;
  944. }
  945. }
  946. /******************************************************************************
  947. * Parsing binary data
  948. ******************************************************************************/
  949. void
  950. parse (string s, int& pos, QI& ret) {
  951. ret= (QI) s[pos++];
  952. }
  953. void
  954. parse (string s, int& pos, QN& ret) {
  955. ret= (QN) s[pos++];
  956. }
  957. void
  958. parse (string s, int& pos, HI& ret) {
  959. QI c1= (QI) s[pos++];
  960. QN c2= (QN) s[pos++];
  961. ret= (((HI) c1)<<8)+ c2;
  962. }
  963. void
  964. parse (string s, int& pos, HN& ret) {
  965. QN c1= (QN) s[pos++];
  966. QN c2= (QN) s[pos++];
  967. ret= (((HN) c1)<<8)+ c2;
  968. }
  969. void
  970. parse (string s, int& pos, SI& ret) {
  971. QI c1= (QI) s[pos++];
  972. QN c2= (QN) s[pos++];
  973. QN c3= (QN) s[pos++];
  974. QN c4= (QN) s[pos++];
  975. ret= (((((((SI) c1)<<8)+ ((SI) c2))<<8)+ ((SI) c3))<<8)+ c4;
  976. }
  977. void
  978. parse (string s, int& pos, SI*& a, int len) {
  979. int i;
  980. a= tm_new_array<int> (len);
  981. for (i=0; i<len; i++) parse (s, pos, a[i]);
  982. }
  983. /******************************************************************************
  984. * Searching, replacing and pattern matching
  985. ******************************************************************************/
  986. int
  987. search_forwards (string s, int pos, string in) {
  988. int k= N(s), n= N(in);
  989. if (k == 0) return pos;
  990. char c= s[0];
  991. while (pos+k <= n) {
  992. if (in[pos] == c && test (in, pos, s)) return pos;
  993. pos++;
  994. }
  995. return -1;
  996. }
  997. int
  998. search_forwards (string s, string in) {
  999. return search_forwards (s, 0, in);
  1000. }
  1001. bool
  1002. occurs (string what, string in) {
  1003. return search_forwards (what, 0, in) >= 0;
  1004. }
  1005. int
  1006. search_backwards (string s, int pos, string in) {
  1007. while (pos >= 0) {
  1008. if (test (in, pos, s)) return pos;
  1009. pos--;
  1010. }
  1011. return -1;
  1012. }
  1013. int
  1014. search_backwards (string s, string in) {
  1015. return search_backwards (s, N(in)-N(s), in);
  1016. }
  1017. int
  1018. count_occurrences (string s, string in) {
  1019. int count= 0;
  1020. int i=0, next, n= N(s);
  1021. while (i<n) {
  1022. next= search_forwards (s, i, in);
  1023. if (next == -1) break;
  1024. count++;
  1025. i= next+1;
  1026. }
  1027. return count;
  1028. }
  1029. string
  1030. replace (string s, string what, string by) {
  1031. int i, n= N(s);
  1032. string r;
  1033. for (i=0; i<n; )
  1034. if (test (s, i, what)) {
  1035. r << by;
  1036. i += N(what);
  1037. }
  1038. else {
  1039. r << s[i];
  1040. i++;
  1041. }
  1042. return r;
  1043. }
  1044. static bool
  1045. match_wildcard (string s, int spos, string w, int wpos) {
  1046. if (wpos == N(w)) return spos == N(s);
  1047. if (w[wpos] != '*')
  1048. return (spos < N(s)) && (s[spos] == w[wpos]) &&
  1049. match_wildcard (s, spos+1, w, wpos+1);
  1050. while ((wpos<N(w)) && (w[wpos]=='*')) wpos++;
  1051. while (spos <= N(s)) {
  1052. if (match_wildcard (s, spos, w, wpos)) return true;
  1053. spos++;
  1054. }
  1055. return false;
  1056. }
  1057. bool
  1058. match_wildcard (string s, string w) {
  1059. return match_wildcard (s, 0, w, 0);
  1060. }
  1061. array<string>
  1062. tokenize (string s, string sep) {
  1063. int start=0;
  1064. array<string> a;
  1065. for (int i=0; i<N(s); )
  1066. if (test (s, i, sep)) {
  1067. a << s (start, i);
  1068. i += N(sep);
  1069. start= i;
  1070. }
  1071. else i++;
  1072. return a;
  1073. }
  1074. string
  1075. recompose (array<string> a, string sep) {
  1076. string r;
  1077. for (int i=0; i<N(a); i++) {
  1078. if (i != 0) r << sep;
  1079. r << a[i];
  1080. }
  1081. return r;
  1082. }
  1083. string
  1084. trim_spaces (string s) {
  1085. int start, end;
  1086. for (start=0; start<N(s) && is_space (s[start]); start++);
  1087. for (end=N(s); end>start && is_space (s[end]); end--);
  1088. return s (start, end);
  1089. }
  1090. array<string>
  1091. trim_spaces (array<string> a) {
  1092. array<string> b (N(a));
  1093. for (int i=0; i<N(a); i++)
  1094. b[i]= trim_spaces (a[i]);
  1095. return b;
  1096. }
  1097. /******************************************************************************
  1098. * Computations with completions
  1099. ******************************************************************************/
  1100. array<string>
  1101. as_completions (hashset<string> h) {
  1102. tree t= (tree) h;
  1103. int i, n= N(t);
  1104. array<string> a (n);
  1105. for (i=0; i<n; i++) a[i]= t[i]->label;
  1106. merge_sort (a);
  1107. return a;
  1108. }
  1109. /*
  1110. static void
  1111. close_completions (hashset<string>& h) {
  1112. array<string> a= as_completions (h);
  1113. int i, j, n= N(a);
  1114. for (i=1; i<n; i++) {
  1115. for (j=0; j < min (N(a[i-1]), N(a[i])); j++)
  1116. if (a[i-1][j] != a[i][j]) break;
  1117. if (j < min (N(a[i-1]), N(a[i])))
  1118. h->insert (a[i](0,j));
  1119. }
  1120. }
  1121. array<string>
  1122. close_completions (array<string> a) {
  1123. int i, n= N(a);
  1124. hashset<string> h;
  1125. for (i=0; i<n; i++) h->insert (a[i]);
  1126. close_completions (h);
  1127. return as_completions (h);
  1128. }
  1129. */
  1130. array<string>
  1131. close_completions (array<string> a) {
  1132. if (N(a) == 0) return a;
  1133. merge_sort (a);
  1134. int i, j, n= N(a), l= N(a[0]);
  1135. for (i=1; i<n; i++) {
  1136. for (j=0; j<l && j<N(a[i]); j++)
  1137. if (a[i-1][j] != a[i][j]) break;
  1138. l= j;
  1139. }
  1140. array<string> r;
  1141. r << a[0] (0, l);
  1142. for (i=0; i<n; i++)
  1143. if (a[i] != r[N(r)-1])
  1144. r << a[i];
  1145. return r;
  1146. }
  1147. array<string>
  1148. strip_completions (array<string> a, string prefix) {
  1149. int i, n= N(a);
  1150. array<string> b;
  1151. for (i=0; i<n; i++)
  1152. if (starts (a[i], prefix))
  1153. b << a[i] (N(prefix), N(a[i]));
  1154. return b;
  1155. }