PageRenderTime 54ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/TeXmacs-1.0.7.11-src/src/Data/Convert/Tex/parsetex.cpp

#
C++ | 960 lines | 827 code | 53 blank | 80 comment | 522 complexity | 2783e653b68ced7c8129955ddad7da52 MD5 | raw file
Possible License(s): GPL-3.0, GPL-2.0, MPL-2.0-no-copyleft-exception
  1. /******************************************************************************
  2. * MODULE : parsetex.cpp
  3. * DESCRIPTION: conversion of tex/latex strings into logical tex/latex trees
  4. * COPYRIGHT : (C) 1999 Joris van der Hoeven
  5. *******************************************************************************
  6. * This software falls under the GNU general public license version 3 or later.
  7. * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
  8. * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
  9. ******************************************************************************/
  10. #include "Tex/convert_tex.hpp"
  11. #include "converter.hpp"
  12. string string_arg (tree t);
  13. /******************************************************************************
  14. * The latex_parser structure
  15. *******************************************************************************
  16. *
  17. * During the parsing, the following global variables are used:
  18. *
  19. * command_type Contains the types of all currently defined tex commands.
  20. * This is either 'command' 'modifier' 'operator'
  21. * 'environment' 'list' 'symbol' 'big-symbol' or 'user'.
  22. * command_arity Contains the corresponding arity.
  23. * command_def Contains the definitions of user commands.
  24. *
  25. * The command_type hashmap also contains some special fields
  26. *
  27. * \<sub> Stands for the subscript command
  28. * \<sup> Stands for the superscript command
  29. *
  30. * !mode Gives the current mode ("text" or "math").
  31. * !verbatim Verbatim mode ("true" or "false")
  32. * !em Emphasized mode ("true" or "false")
  33. *
  34. *******************************************************************************
  35. * WARNING: we recently put the standard LaTeX macros in latex_type and
  36. * latex_arity instead of command_type and command_arity.
  37. ******************************************************************************/
  38. struct latex_parser {
  39. bool unicode;
  40. latex_parser (bool unicode2): unicode (unicode2) {}
  41. void latex_error (string s, int i, string message);
  42. tree parse (string s, int& i, string stop= "", bool change= false);
  43. tree parse_backslash (string s, int& i);
  44. tree parse_symbol (string s, int& i);
  45. tree parse_command (string s, int& i, string which);
  46. tree parse_unknown (string s, int& i, string which);
  47. tree parse_verbatim (string s, int& i, string end);
  48. tree parse (string s, bool change);
  49. };
  50. /******************************************************************************
  51. * Error handling
  52. ******************************************************************************/
  53. void
  54. latex_parser::latex_error (string s, int i, string message) {
  55. cerr << "Latex error] " << message << "\n";
  56. if (i>30) s= "..." * s (i-27, N(s));
  57. if (N(s)>60) s= s (0, 57) * "...";
  58. cerr << "Latex error] in " << s << "\n";
  59. }
  60. /******************************************************************************
  61. * Main parsing routine
  62. ******************************************************************************/
  63. static bool
  64. is_regular (tree t) {
  65. if (!is_tuple (t)) return true;
  66. if (N(t) == 0 || !is_atomic (t[0])) return false;
  67. string s= t[0]->label;
  68. return !starts (s, "\\begin-") && !starts (s, "\\end-");
  69. }
  70. tree
  71. latex_parser::parse (string s, int& i, string stop, bool change) {
  72. bool no_error= true;
  73. int n= N(s);
  74. tree t (CONCAT);
  75. command_type ->extend ();
  76. command_arity->extend ();
  77. command_def ->extend ();
  78. while ((i<n) && is_space (s[i])) i++;
  79. while ((i<n) && no_error &&
  80. (s[i] != '\0' || N (stop) != 0) &&
  81. (N(stop) != 1 || s[i] != stop[0]) &&
  82. (s[i] != '$' || stop != "$$" || i+1>=n || s[i+1] != '$') &&
  83. (stop != "denom" ||
  84. (s[i] != '$' && s[i] != '}' &&
  85. (i+2>n || s(i,i+2) != "\\]") &&
  86. (i+4>n || s(i,i+4) != "\\end")))) {
  87. switch (s[i]) {
  88. case '~':
  89. if (command_type ["!mode"] == "math") t << tuple ("\\sim");
  90. else t << tuple ("\\nbsp");
  91. i++;
  92. break;
  93. case ' ':
  94. case '\t':
  95. case '\r':
  96. while ((i<n) && ((s[i]==' ') || (s[i]=='\t') || (s[i]=='\r'))) i++;
  97. if ((i<n) && (s[i]!='\n')) t << " ";
  98. break;
  99. case '\n': {
  100. int ln=0;
  101. while ((i<n) && is_space (s[i]))
  102. if (s[i++]=='\n') ln++;
  103. if (i<n) {
  104. if (ln == 1) t << " ";
  105. else t << "\n";
  106. }
  107. break;
  108. }
  109. case '%': {
  110. while ((i<n) && (s[i]!='\n')) i++;
  111. if (i<n) i++;
  112. int ln=0;
  113. while ((i<n) && is_space (s[i]))
  114. if (s[i++]=='\n') ln++;
  115. if (ln > 0) {
  116. if ((N(t)>0) && ((t[N(t)-1]==" ") || (t[N(t)-1]=="\n")))
  117. t[N(t)-1]= "\n";
  118. else t << "\n";
  119. }
  120. break;
  121. }
  122. case '#':
  123. i++;
  124. if (i==n) return t;
  125. if (is_numeric (s[i])) {
  126. t << s (i-1, i+1);
  127. i++;
  128. }
  129. else t << s (i-1, i);
  130. break;
  131. case '\\':
  132. if (((i+7)<n) && !is_alpha (s (i+5, i+7)) &&
  133. (s (i, i+5) == "\\over" || s (i, i+5) == "\\atop"))
  134. {
  135. string fr_cmd= s(i,i+5);
  136. if (fr_cmd == "\\over") fr_cmd= "\\frac";
  137. if (fr_cmd == "\\atop") fr_cmd= "\\ontop";
  138. int j;
  139. for (j=N(t); j>0 && is_regular (t[j-1]); j--) {}
  140. tree num= t (j, N(t));
  141. if (N(num) == 0) num= "";
  142. t= t (0, j);
  143. i+=5;
  144. while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
  145. tree den= parse (s, i, "denom");
  146. t << tree (TUPLE, fr_cmd, num, den);
  147. }
  148. else if (((i+5)<n) && (s(i,i+3)=="\\sp") && (!is_alpha(s[i+3]))) {
  149. i+=3;
  150. t << parse_command (s, i, "\\<sup>");
  151. }
  152. else if (((i+5)<n) && (s(i,i+3)=="\\sb") && (!is_alpha(s[i+3]))) {
  153. i+=3;
  154. t << parse_command (s, i, "\\<sub>");
  155. }
  156. else if (((i+10)<n) && (s(i,i+8)=="\\pmatrix")) {
  157. i+=8;
  158. tree arg= parse_command (s, i, "\\pmatrix");
  159. if (is_tuple (arg, "\\pmatrix", 1)) arg= arg[1];
  160. t << tree (TUPLE, "\\begin-pmatrix");
  161. if (is_concat (arg)) t << A (arg);
  162. else t << arg;
  163. t << tree (TUPLE, "\\end-pmatrix");
  164. }
  165. else {
  166. tree u= parse_backslash (s, i);
  167. if (u != "") t << u;
  168. }
  169. break;
  170. case '\'':
  171. i++;
  172. if (command_type ["!mode"] == "math") {
  173. int start= i-1;
  174. while ((i < N(s)) && (s[i] == '\'')) i++;
  175. t << tuple ("\\prime", s (start, i));
  176. }
  177. else t << s (i-1, i);
  178. break;
  179. case '*':
  180. if (command_type ["!mode"] == "math") t << tree (TUPLE, "\\ast");
  181. else t << "*";
  182. i++;
  183. break;
  184. case '_':
  185. i++;
  186. t << parse_command (s, i, "\\<sub>");
  187. /*
  188. if (command_type ["!mode"] == "math")
  189. t << parse_command (s, i, "\\<sub>");
  190. else t << s (i-1, i);
  191. */
  192. break;
  193. case '^':
  194. i++;
  195. t << parse_command (s, i, "\\<sup>");
  196. /*
  197. if (command_type ["!mode"] == "math")
  198. t << parse_command (s, i, "\\<sup>");
  199. else t << s (i-1, i);
  200. */
  201. break;
  202. case '<':
  203. t << tree (TUPLE, "\\<less>");
  204. i++;
  205. break;
  206. case '>':
  207. t << tree (TUPLE, "\\<gtr>");
  208. i++;
  209. break;
  210. case '\244':
  211. i++;
  212. t << parse_verbatim (s, i, "\244");
  213. break;
  214. case '{': {
  215. i++;
  216. t << parse (s, i, "}");
  217. if ((i<n) && (s[i]=='}')) i++;
  218. int ln=0;
  219. if ((i<n) && (!is_space (s[i]))) break;
  220. while ((i<n) && is_space (s[i]))
  221. if (s[i++]=='\n') ln++;
  222. if (ln >= 2) t << "\n";
  223. else if (i<n) t << tree (TUPLE, "\\ ");
  224. break;
  225. }
  226. case '$': {
  227. i++;
  228. if ((i<n) & (s[i]=='$')) {
  229. i++;
  230. t << tree (TUPLE, "\\begin-displaymath");
  231. command_type ("!mode")= "math";
  232. t << parse (s, i, "$$");
  233. command_type ("!mode")= "text";
  234. if ((i<n) && (s[i]=='$')) i++;
  235. if ((i<n) && (s[i]=='$')) i++;
  236. t << tree (TUPLE, "\\end-displaymath");
  237. }
  238. else {
  239. t << tree (TUPLE, "\\begin-math");
  240. command_type ("!mode")= "math";
  241. t << parse (s, i, "$");
  242. command_type ("!mode")= "text";
  243. if ((i<n) && (s[i]=='$')) i++;
  244. t << tree (TUPLE, "\\end-math");
  245. }
  246. break;
  247. }
  248. default:
  249. if (unicode && ((unsigned char) s[i]) >= 128) {
  250. unsigned int code= decode_from_utf8 (s, i);
  251. t << tree (TUPLE, "\\#" * as_hexadecimal (code));
  252. }
  253. else if (!unicode && is_iso_alpha (s[i])) {
  254. // If we encounter too much text in math mode, then return
  255. int start= i;
  256. while ((i<n) && is_iso_alpha (s[i])) i++;
  257. int end= i;
  258. if ((i >= start+3) && (command_type ["!mode"] == "math")) {
  259. while ((i<n) && (is_iso_alpha (s[i]) ||
  260. is_punctuation (s[i]) ||
  261. is_space (s[i])))
  262. i++;
  263. if (i >= start+20) {
  264. int last= i, words= 0, letters= 0;
  265. for (i=start; i<last; i++) {
  266. if (is_iso_alpha (s[i])) {
  267. letters++;
  268. if ((i==start) || (!is_iso_alpha (s[i-1]))) words++;
  269. }
  270. }
  271. if ((words > 3) && (letters/words >= 3) && (letters >= 15)) {
  272. i= start;
  273. no_error= false;
  274. }
  275. }
  276. }
  277. if (no_error)
  278. for (i=start; i<end; i++)
  279. t << s(i, i+1);
  280. }
  281. else {
  282. t << s (i, i+1);
  283. i++;
  284. }
  285. break;
  286. }
  287. }
  288. if (change) {
  289. command_type ->merge ();
  290. command_arity->merge ();
  291. command_def ->merge ();
  292. }
  293. else {
  294. command_type ->shorten ();
  295. command_arity->shorten ();
  296. command_def ->shorten ();
  297. }
  298. if (N(t)==0) return "";
  299. if (N(t)==1) return t[0];
  300. return t;
  301. }
  302. /******************************************************************************
  303. * Parsing commands
  304. ******************************************************************************/
  305. tree
  306. latex_parser::parse_backslash (string s, int& i) {
  307. int n= N(s);
  308. if (((i+7)<n) && (s(i,i+5)=="\\verb")) {
  309. i+=6;
  310. return parse_verbatim (s, i, s(i-1,i));
  311. }
  312. if (((i+29)<n) && (s(i,i+16)=="\\begin{verbatim}")) {
  313. i+=16;
  314. return parse_verbatim (s, i, "\\end{verbatim}");
  315. }
  316. if (((i+5)<n) && (s(i,i+4)=="\\url") && !is_alpha (s[i+5])) {
  317. i+=4;
  318. while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
  319. string ss;
  320. if (i<n && s[i] == '{') {
  321. i++;
  322. int start= i;
  323. while ((i<n) && s[i] != '}') i++;
  324. ss= s (start, i++);
  325. }
  326. return tree (TUPLE, "\\url", ss);
  327. }
  328. if (((i+6)<n) && (s(i,i+5)=="\\href")) {
  329. i+=5;
  330. while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
  331. string ss;
  332. if (i<n && s[i] == '{') {
  333. i++;
  334. int start= i;
  335. while ((i<n) && s[i] != '}') i++;
  336. ss= s (start, i++);
  337. }
  338. tree u= "";
  339. while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
  340. if (i<n && s[i] == '{') { i++; u= parse (s, i, "}"); i++; }
  341. return tree (TUPLE, "\\href", ss, u);
  342. }
  343. /************************ special commands *********************************/
  344. i++;
  345. if (i==n) return "";
  346. if (s[i]==' ') {
  347. i++;
  348. return tree (TUPLE, "\\ ");
  349. }
  350. if (!is_alpha(s[i])) {
  351. i++;
  352. if (s[i-1]=='(') return parse_command (s, i, "\\begin-math");
  353. if (s[i-1]==')') return parse_command (s, i, "\\end-math");
  354. if (s[i-1]=='[') return parse_command (s, i, "\\begin-displaymath");
  355. if (s[i-1]==']') return parse_command (s, i, "\\end-displaymath");
  356. return parse_command (s, i, s (i-2, i));
  357. }
  358. /************************* normal commands *********************************/
  359. int start= i-1;
  360. while ((i<n) && is_alpha (s[i])) i++;
  361. if ((i<n) && (s[i]=='*') && latex_type (s (start, i+1)) != "undefined") i++;
  362. string r= s (start, i);
  363. if ((r == "\\begin") || (r == "\\end")) {
  364. while ((i<n) && is_space (s[i])) i++;
  365. if ((i==n) || (s[i]!='{')) {
  366. latex_error (s, i, "begin or end which environment ?");
  367. return "";
  368. }
  369. i++; start= i;
  370. while ((i<n) && (s[i]!='}')) i++;
  371. r = r * "-" * s (start, i);
  372. if (i<n) i++;
  373. }
  374. return parse_command (s, i, r);
  375. }
  376. static string
  377. sharp_to_arg (string s, tree args) {
  378. int i;
  379. string r;
  380. for (i=0; i<N(s); i++)
  381. if ((s[i]=='#') && ((i+1)<N(s)) && (s[i+1]>='1') && (s[i+1]<='9')) {
  382. int nr= ((int) s[++i]) - ((int) '0');
  383. if (N(args)>nr) r << string_arg (args[nr]);
  384. }
  385. else r << s[i];
  386. return r;
  387. }
  388. tree
  389. latex_parser::parse_symbol (string s, int& i) {
  390. int start= i;
  391. if ((s[i] == '*') && (command_type ["!mode"] == "math")) {
  392. i++; return tree (TUPLE, "\\ast"); }
  393. if (s[i] == '<') { i++; return tree (TUPLE, "\\<less>"); }
  394. if (s[i] == '>') { i++; return tree (TUPLE, "\\<gtr>"); }
  395. if (s[i] != '\\') { i++; return s(start, i); }
  396. i++;
  397. if (i == N(s)) return tree (TUPLE, "\\backslash");
  398. if (!is_alpha (s[i])) { i++; return s(start, i); }
  399. while ((i<N(s)) && is_alpha (s[i])) i++;
  400. if ((i<N(s)) && (s[i]=='*')) i++;
  401. return s(start,i);
  402. }
  403. static bool
  404. is_math_environment (tree t) {
  405. //cout << "t= " << t << "\n";
  406. tree b= t[N(t)-2];
  407. tree e= t[N(t)-1];
  408. if (!is_concat (b)) b= tree (CONCAT, b);
  409. if (!is_concat (e)) e= tree (CONCAT, e);
  410. int i, j;
  411. for (i=N(b)-1; i>=0; i--)
  412. if (is_tuple (b[i]) && N(b[i])>0 && is_atomic (b[i][0]))
  413. if (latex_type (b[i][0]->label) == "math-environment")
  414. break;
  415. for (j=0; j<N(e); j++)
  416. if (is_tuple (e[j]) && N(e[j])>0 && is_atomic (e[j][0]))
  417. if (latex_type (e[j][0]->label) == "math-environment")
  418. break;
  419. if (i >= 0 && j < N(e)) {
  420. string bs= b[i][0]->label;
  421. string es= e[j][0]->label;
  422. bool ok=
  423. starts (bs, "\\begin-") &&
  424. starts (es, "\\end-") &&
  425. bs (7, N(bs)) == es (5, N(es));
  426. //cout << t[1] << " -> " << ok << "\n";
  427. return ok;
  428. }
  429. return false;
  430. }
  431. static bool
  432. is_text_argument (string cmd, int remaining_arity) {
  433. // FIXME: this test should be improved using DRD properties
  434. (void) remaining_arity;
  435. return cmd == "\\label" || cmd == "\\ref";
  436. }
// Parse the arguments of the command 'cmd' whose name has already been read;
// i points just behind the name and is advanced past the consumed arguments.
//
// Returns a tuple whose first child is the command name (with a "*"
// appended when an optional argument was read, except for \newtheorem)
// followed by the parsed arguments. Commands of undefined type are
// delegated to parse_unknown. As side effects the routine updates the
// "!mode" entry of command_type and registers commands and environments
// introduced by \def, \newtheorem and \newenvironment.
tree
latex_parser::parse_command (string s, int& i, string cmd) {
  //cout << cmd << " [" << latex_type (cmd) << ", "
  //<< command_type ["!mode"] << ", " << latex_arity (cmd) << "]" << LF;
  // normalize aliases to the names used in the rest of this routine
  if (cmd == "\\newcommand") cmd= "\\def";
  if (cmd == "\\renewcommand") cmd= "\\def";
  if (cmd == "\\renewenvironment") cmd= "\\newenvironment";
  if (cmd == "\\begin-split") cmd= "\\begin-eqsplit";
  if (cmd == "\\end-split") cmd= "\\end-eqsplit";
  if (cmd == "\\begin-split*") cmd= "\\begin-eqsplit*";
  if (cmd == "\\end-split*") cmd= "\\end-eqsplit*";
  if (latex_type (cmd) == "undefined")
    return parse_unknown (s, i, cmd);
  // entering a math environment switches to math mode, leaving it to text
  if (latex_type (cmd) == "math-environment") {
    if (cmd (0, 6) == "\\begin") command_type ("!mode") = "math";
    else command_type ("!mode") = "text";
  }
  // \text and \mbox inside math temporarily switch to text mode;
  // math mode is restored just before returning
  bool mbox_flag=
    ((cmd == "\\text") || (cmd == "\\mbox")) &&
    (command_type ["!mode"] == "math");
  if (mbox_flag) command_type ("!mode") = "text";
  int n = N(s);
  // a negative arity encodes "optional argument allowed": -1-arity is the
  // number of mandatory arguments
  int arity = latex_arity (cmd);
  bool option= (arity<0);
  if (option) arity= -1-arity;
  /************************ retrieve arguments *******************************/
  tree t (TUPLE, copy (cmd)); // parsed arguments
  tree u (TUPLE, copy (cmd)); // unparsed arguments
  while (i<n && arity>=0 && (arity>0 || option)) {
    int j= i;
    while ((j<n) && is_space (s[j])) j++;
    if (j==n) break;
    if (option && (s[j]=='[')) {
      // optional argument in square brackets
      j++;
      i=j;
      tree opt= parse (s, i, "]");
      if (cmd != "\\newtheorem" && cmd != "\\newtheorem*")
        t << opt;
      u << s (j, i);
      if ((i<n) && (s[i]==']')) i++;
      if (cmd != "\\newtheorem" && cmd != "\\newtheorem*")
        t[0]->label= t[0]->label * "*";
      option= false;
    }
    else if ((arity>0) && (s[j]=='{')) {
      // mandatory argument in braces
      bool text_arg=
        (command_type["!mode"] == "math") && is_text_argument (cmd, arity);
      j++;
      i=j;
      if (text_arg) command_type ("!mode")= "text";
      if ((N(t)==1) && (cmd == "\\def")) {
        // the first argument of \def (the new command name) is taken raw
        while ((i<n) && (s[i]!='}')) i++;
        t << s (j, i);
      }
      else t << parse (s, i, "}");
      if (text_arg) command_type ("!mode")= "math";
      u << s (j, i);
      if ((i<n) && (s[i]=='}')) i++;
      arity--;
      if (arity == 0) option= false;
    }
    else if (s[j] == '}') break;
    else if (option && (s[j]=='#') && (cmd == "\\def")) {
      // parameter text of the form #1#2...#k: skip to the last "#digit"
      // pair and record its digit as the optional argument
      while ((j+3 <= n) && is_numeric (s[j+1]) && (s[j+2] == '#')) j+=2;
      if (j+2<=n) {
        t << s (j+1, j+2);
        u << s (j+1, j+2);
        i= j+2;
      }
      t[0]->label= t[0]->label * "*";
      option= false;
    }
    else {
      // unbraced argument: a single symbol
      if (arity>0) {
        i=j;
        tree st= parse_symbol (s, i);
        t << st;
        u << st;
        arity--;
        if (arity == 0) option= false;
      }
      else break;
    }
  }
  if (arity>0) latex_error (s, i, "too little arguments for " * cmd);
  /******************** new commands and environments ************************/
  // \def without argument count: user command of arity 0
  if (is_tuple (t, "\\def", 2)) {
    string var= string_arg (t[1]);
    command_type (var)= "user";
    command_arity (var)= 0;
    command_def (var)= as_string (u[2]);
  }
  // \def with argument count: t[2] holds the arity, u[3] the raw body
  if (is_tuple (t, "\\def*", 3)) {
    string var= string_arg (t[1]);
    command_type (var)= "user";
    command_arity (var)= as_int (t[2]);
    command_def (var)= as_string (u[3]);
  }
  if (is_tuple (t, "\\newtheorem", 2) || is_tuple (t, "\\newtheorem*", 2)) {
    string var= "\\begin-" * string_arg (t[1]);
    command_type (var)= "environment";
    command_arity (var)= 0;
    var= "\\end-" * string_arg (t[1]);
    command_type (var)= "environment";
    command_arity (var)= 0;
  }
  // new environments are registered as a pair of user commands
  // "\begin-name" and "\end-name"; both are retyped as "math-environment"
  // when is_math_environment holds for the definition
  if (is_tuple (t, "\\newenvironment", 3)) {
    string var= "\\begin-" * string_arg (t[1]);
    command_type (var)= "user";
    command_arity (var)= 0;
    command_def (var)= as_string (u[2]);
    if (is_math_environment (t)) command_type (var)= "math-environment";
    var= "\\end-" * string_arg (t[1]);
    command_type (var)= "user";
    command_arity (var)= 0;
    command_def (var)= as_string (u[3]);
    if (is_math_environment (t)) command_type (var)= "math-environment";
  }
  if (is_tuple (t, "\\newenvironment*", 4)) {
    string var= "\\begin-" * string_arg (t[1]);
    command_type (var)= "user";
    command_arity (var)= as_int (t[2]);
    command_def (var)= as_string (u[3]);
    if (is_math_environment (t)) command_type (var)= "math-environment";
    var= "\\end-" * string_arg (t[1]);
    command_type (var)= "user";
    command_arity (var)= 0;
    command_def (var)= as_string (u[4]);
    if (is_math_environment (t)) command_type (var)= "math-environment";
  }
  /***************** environment changes for user commands ******************/
  if (latex_type (cmd) == "user") {
    // re-parse the expanded body with change=true so that definitions and
    // mode changes made by the macro persist; the expansion replaces t only
    // when the body contains unbalanced \begin / \end
    int pos= 0;
    string body= command_def[cmd];
    if (count_occurrences ("\\begin", body) ==
        count_occurrences ("\\end", body))
      (void) parse (sharp_to_arg (body, u), pos, "", true);
    else t= parse (sharp_to_arg (body, u), pos, "", true);
    // replaces macros by their definitions in the case when
    // the user defined shorthands for \\begin{env} and \\end{env}
  }
  if (mbox_flag) command_type ("!mode") = "math";
  return t;
}
  581. tree
  582. latex_parser::parse_unknown (string s, int& i, string cmd) {
  583. int n = N(s);
  584. bool option= false;
  585. tree t (TUPLE, copy (cmd));
  586. while (i<n) {
  587. int j=i;
  588. while ((j<n) && is_space (s[j])) j++;
  589. if (j==n) break;
  590. if (option && (s[j]=='[')) {
  591. j++;
  592. i=j;
  593. t << parse (s, i, "]");
  594. if ((i<n) && (s[i]==']')) i++;
  595. t[0]->label= t[0]->label * "*";
  596. option= false;
  597. }
  598. else if (s[j]=='{') {
  599. j++;
  600. i=j;
  601. t << parse (s, i, "}");
  602. if ((i<n) && (s[i]=='}')) i++;
  603. }
  604. else break;
  605. }
  606. return t;
  607. }
  608. /******************************************************************************
  609. * Parsing verbatim text
  610. ******************************************************************************/
  611. tree
  612. latex_parser::parse_verbatim (string s, int& i, string end) {
  613. int start=i, n= N(s), e= N(end);
  614. while ((i<(n-e)) && (s(i,i+e)!=end)) i++;
  615. i+=e;
  616. return tree (CONCAT,
  617. tree (TUPLE, "\\begin-verbatim"),
  618. s(start,i-e),
  619. tree (TUPLE, "\\end-verbatim"));
  620. }
  621. /******************************************************************************
  622. * This routine may be used to transform accented characters to the Cork format
  623. ******************************************************************************/
// Base letter for each of the 128 character codes 128..255, indexed by
// (code - 128). accented_to_Cork looks up a (base letter, accent) pair in
// this table and in Cork_accent and returns the character (index + 128).
// The value 25 is the code accented_to_Cork uses for the argument "\i";
// ' ' marks entries without a base letter.
static char Cork_unaccented[128]= {
  'A', 'A', 'C', 'C', 'D', 'E', 'E', 'G',
  'L', 'L', ' ', 'N', 'N', ' ', 'O', 'R',
  'R', 'S', 'S', 'S', 'T', 'T', 'U', 'U',
  'Y', 'Z', 'Z', 'Z', ' ', 'I', 'd', ' ',
  'a', 'a', 'c', 'c', 'd', 'e', 'e', 'g',
  'l', 'l', ' ', 'n', 'n', ' ', 'o', 'r',
  'r', 's', 's', 's', 't', 't', 'u', 'u',
  'y', 'z', 'z', 'z', ' ', ' ', ' ', ' ',
  'A', 'A', 'A', 'A', 'A', 'A', ' ', 'C',
  'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
  'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ',
  ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
  'a', 'a', 'a', 'a', 'a', 'a', ' ', 'c',
  'e', 'e', 'e', 'e', 25 , 25 , 25 , 25 ,
  'd', 'n', 'o', 'o', 'o', 'o', 'o', ' ',
  ' ', 'u', 'u', 'u', 'u', 'y', ' ', ' '
};
// Accent for each of the 128 character codes 128..255, indexed by
// (code - 128) in parallel with Cork_unaccented. Each entry is the second
// character of the LaTeX accent command (e.g. '\'' for \', 'v' for \v);
// ' ' marks entries without an accent. The trailing // " comments only
// balance the escaped double quotes.
static char Cork_accent[128]= {
  'u' , 'k' , '\'', 'v' , 'v' , 'v' , 'k' , 'u' ,
  '\'', 'v' , ' ' , '\'', 'v' , ' ' , 'H' , '\'',
  'v' , '\'', 'v' , 'c' , 'v' , 'c' , 'H' , 'r' ,
  '\"', '\'', 'v' , '.' , ' ' , '.' , '=' , ' ' , // "
  'u' , 'k' , '\'', 'v' , 'v' , 'v' , 'k' , 'u' ,
  '\'', 'v' , ' ' , '\'', 'v' , ' ' , 'H' , '\'',
  'v' , '\'', 'v' , 'c' , 'v' , 'c' , 'H' , 'r' ,
  '\"', '\'', 'v' , '.' , ' ' , ' ' , ' ' , ' ' , // "
  '`' , '\'', '^' , '~' , '\"', ' ' , ' ' , 'c' , // "
  '`' , '\'', '^' , '\"', '`' , '\'', '^' , '\"', // "
  '=' , '~' , '`' , '\'', '^' , '~' , '\"', ' ' , // "
  ' ' , '`' , '\'', '^' , '\"', '\'', ' ' , ' ' , // "
  '`' , '\'', '^' , '~' , '\"', ' ' , ' ' , 'c' , // "
  '`' , '\'', '^' , '\"', '`' , '\'', '^' , '\"', // "
  '=' , '~' , '`' , '\'', '^' , '~' , '\"', ' ' , // "
  ' ' , '`' , '\'', '^' , '\"', '\'', ' ' , ' ' // "
};
  660. tree
  661. accented_to_Cork (tree t) {
  662. if (arity (t) == 0) return t;
  663. int i, n=N(t);
  664. tree r (t, n);
  665. for (i=0; i<n; i++) r[i]= accented_to_Cork (t[i]);
  666. if (is_compound (t[0])) return r;
  667. string s= t[0]->label;
  668. if ((N(s)==2) && (s[0]=='\\') && (n==2) &&
  669. is_atomic (r[1]) && (N(r[1]->label)<=2)) {
  670. string v= r[1]->label;
  671. if (N(v)==0) {
  672. if (s[1] == '`' ) {
  673. string ret_s (1);
  674. ret_s[0]= '\000';
  675. return ret_s;
  676. }
  677. if (s[1] == '\'') return "\001";
  678. if (s[1] == '^' ) return "\136";
  679. if (s[1] == '\"') return "\004"; // "
  680. if (s[1] == '~' ) return "\176";
  681. if (s[1] == '=' ) return "\026";
  682. if (s[1] == '.' ) return "\137";
  683. if (s[1] == 'u' ) return "\025";
  684. if (s[1] == 'v' ) return "\024";
  685. if (s[1] == 'H' ) return "\175";
  686. if (s[1] == 'c' ) return "\030";
  687. }
  688. else {
  689. char c1= v[0], c2= s[1];
  690. if (v == "\\i") c1= (char) 25;
  691. if ((N(v)==1) || (v=="\\i"))
  692. for (i=0; i<127; i++)
  693. if ((Cork_unaccented[i]==c1) && (Cork_accent[i]==c2))
  694. return tree (string ((char) (i+128)));
  695. }
  696. }
  697. if (r == tuple ("\\i")) return "\\i";
  698. return r;
  699. }
  700. /******************************************************************************
  701. * Interface
  702. ******************************************************************************/
  703. tree
  704. latex_parser::parse (string s, bool change) {
  705. command_type ->extend ();
  706. command_arity->extend ();
  707. command_def ->extend ();
  708. // We first cut the string into pieces at strategic places
  709. // This reduces the risk that the parser gets confused
  710. array<string> a;
  711. int i, start=0, n= N(s);
  712. for (i=0; i<n; i++)
  713. if (s[i]=='\n' || (s[i] == '\\' && test (s, i, "\\nextbib"))) {
  714. while ((i<n) && is_space (s[i])) i++;
  715. if (test (s, i, "%%%%%%%%%% Start TeXmacs macros\n")) {
  716. a << s (start, i);
  717. while ((i<n) && (!test (s, i, "%%%%%%%%%% End TeXmacs macros\n")))
  718. i++;
  719. i += 30;
  720. start= i;
  721. continue;
  722. }
  723. if (test (s, i, "\\begin{document}") ||
  724. test (s, i, "\\begin{abstract}") ||
  725. test (s, i, "\\chapter") ||
  726. test (s, i, "\\section") ||
  727. test (s, i, "\\subsection") ||
  728. test (s, i, "\\subsubsection") ||
  729. test (s, i, "\\paragraph") ||
  730. test (s, i, "\\subparagraph") ||
  731. test (s, i, "\\nextbib") ||
  732. test (s, i, "\\newcommand") ||
  733. test (s, i, "\\def") ||
  734. test (s, i, "\\input{") ||
  735. test (s, i, "\\include{"))
  736. {
  737. a << s (start, i);
  738. start= i;
  739. if (test (s, i, "\\input{") || test (s, i, "\\include{")) {
  740. while (i<N(s) && s[i] != '{') i++;
  741. int start_name= i+1;
  742. while (i<N(s) && s[i] != '}') i++;
  743. string name= s (start_name, i);
  744. if (!ends (name, ".tex")) name= name * ".tex";
  745. url incl= relative (get_file_focus (), name);
  746. string body;
  747. if (!exists (incl) || load_string (incl, body, false)) i++;
  748. else {
  749. //cout << "Include " << name << " -> " << incl << "\n";
  750. s= s (0, start) * "\n" * body * "\n" * s (i+1, N(s));
  751. n= N(s);
  752. i= start + 1;
  753. }
  754. start= i;
  755. }
  756. while (i < n && test (s, i, "\\nextbib{}")) {
  757. i += 10;
  758. a << s (start, i);
  759. start= i;
  760. }
  761. }
  762. if (i == n) break;
  763. }
  764. a << s (start, i);
  765. // We now parse each of the pieces
  766. tree t (CONCAT);
  767. for (i=0; i<N(a); i++) {
  768. int j=0;
  769. while (j<N(a[i])) {
  770. int start= j;
  771. command_type ("!mode") = "text";
  772. command_type ("!em") = "false";
  773. tree u= parse (a[i], j, "", true);
  774. if ((N(t)>0) && (t[N(t)-1]!='\n') && (start==0)) t << "\n";
  775. if (is_concat (u)) t << A(u);
  776. else t << u;
  777. if (j == start) j++;
  778. }
  779. }
  780. if (change) {
  781. command_type ->merge ();
  782. command_arity->merge ();
  783. command_def ->merge ();
  784. }
  785. else {
  786. command_type ->shorten ();
  787. command_arity->shorten ();
  788. command_def ->shorten ();
  789. }
  790. //cout << "Parsed " << t << "\n";
  791. return t;
  792. }
  793. static bool
  794. japanese_tex (string& s) {
  795. if (search_forwards ("\\documentclass{jarticle}", s) != -1) {
  796. s= replace (s, "\\documentclass{jarticle}", "\\documentclass{article}");
  797. s= convert (s, "ISO-2022-JP", "UTF-8");
  798. return true;
  799. }
  800. if (search_forwards ("\\documentclass{jbook}", s) != -1) {
  801. s= replace (s, "\\documentclass{jbook}", "\\documentclass{book}");
  802. s= convert (s, "ISO-2022-JP", "UTF-8");
  803. return true;
  804. }
  805. return false;
  806. }
  807. static bool
  808. korean_tex (string& s) {
  809. if (search_forwards ("\\usepackage{hangul}", s) != -1 ||
  810. search_forwards ("\\usepackage{hfont}", s) != -1 ||
  811. search_forwards ("]{hangul}", s) != -1 ||
  812. search_forwards ("]{hfont}", s) != -1)
  813. {
  814. s= replace (s, "\\usepackage{hangul}", "");
  815. s= replace (s, "\\usepackage{hfont}", "");
  816. s= convert (s, "EUC-KR", "UTF-8");
  817. return true;
  818. }
  819. if (search_forwards ("\\usepackage{dhucs}", s) != -1 ||
  820. search_forwards ("\\usepackage{memhangul-ucs}", s) != -1 ||
  821. search_forwards ("]{dhucs}", s) != -1 ||
  822. search_forwards ("]{memhangul-ucs}", s) != -1)
  823. {
  824. s= replace (s, "\\usepackage{dhucs}", "");
  825. s= replace (s, "\\usepackage{memhangul-ucs}", "");
  826. return true;
  827. }
  828. return false;
  829. }
  830. static bool
  831. chinese_tex (string& s) {
  832. if (search_forwards ("\\kaishu", s) != -1)
  833. s= replace (s, "\\kaishu", "");
  834. if (search_forwards ("\\begin{CJK}{GBK}{kai}", s) != -1)
  835. s= replace (s, "\\begin{CJK}{GBK}{kai}", "");
  836. if (search_forwards ("\\begin{CJK*}{GBK}{kai}", s) != -1)
  837. s= replace (s, "\\begin{CJK*}{GBK}{kai}", "");
  838. if (search_forwards ("\\end{CJK}", s) != -1)
  839. s= replace (s, "\\end{CJK}", "");
  840. if (search_forwards ("\\end{CJK*}", s) != -1)
  841. s= replace (s, "\\end{CJK*}", "");
  842. if (search_forwards ("\\CJKindent", s) != -1)
  843. s= replace (s, "\\CJKindent", "");
  844. if (search_forwards ("\\CJKcaption{GBk}", s) != -1)
  845. s= replace (s, "\\CJKcaption{GBK}", "");
  846. if (search_forwards ("\\usepackage{CJK}", s) != -1) {
  847. s= replace (s, "\\usepackage{CJK}", "");
  848. s= convert (s, "cp936", "UTF-8");
  849. return true;
  850. }
  851. if (search_forwards ("\\documentclass{cctart}", s) != -1) {
  852. s= replace (s, "\\documentclass{cctart}", "\\documentclass{article}");
  853. s= convert (s, "cp936", "UTF-8");
  854. return true;
  855. }
  856. if (search_forwards ("\\documentclass[CJK]{cctart}", s) != -1) {
  857. s= replace (s, "\\documentclass[CJK]{cctart}", "\\documentclass{article}");
  858. s= convert (s, "cp936", "UTF-8");
  859. return true;
  860. }
  861. return false;
  862. }
  863. static bool
  864. taiwanese_tex (string& s) {
  865. if (search_forwards ("\\usepackage{CJKvert,type1cm}", s) != -1)
  866. s= replace (s, "\\usepackage{CJKvert,type1cm}", "");
  867. if (search_forwards ("\\begin{CJK}{Bg5}{aming}", s) != -1)
  868. s= replace (s, "\\begin{CJK}{Bg5}{aming}", "");
  869. if (search_forwards ("\\begin{CJK}{Bg5}{kai}", s) != -1)
  870. s= replace (s, "\\begin{CJK}{Bg5}{kai}", "");
  871. if (search_forwards ("\\end{CJK}", s) != -1)
  872. s= replace (s, "\\end{CJK}", "");
  873. if (search_forwards ("\\CJKcaption{Bg5}", s) != -1)
  874. s= replace (s, "\\CJKcaption{Bg5}", "");
  875. if (search_forwards ("\\CJKindent", s) != -1)
  876. s= replace (s, "\\CJKindent", "");
  877. if (search_forwards ("\\usepackage{CJK}", s) != -1) {
  878. s= replace (s, "\\usepackage{CJK}", "");
  879. s= convert (s, "cp950", "UTF-8");
  880. return true;
  881. }
  882. if (search_forwards ("\\usepackage{CJK*}", s) != -1) {
  883. s= replace (s, "\\usepackage{CJK*}", "");
  884. s= convert (s, "cp950", "UTF-8");
  885. return true;
  886. }
  887. return false;
  888. }
  889. tree
  890. parse_latex (string s, bool change) {
  891. s= dos_to_better (s);
  892. string lan= "";
  893. if (japanese_tex (s)) lan= "japanese";
  894. else if (korean_tex (s)) lan= "korean";
  895. else if (taiwanese_tex (s)) lan= "taiwanese";
  896. else if (chinese_tex (s)) lan= "chinese";
  897. bool unicode= (lan == "chinese" || lan == "japanese" ||
  898. lan == "korean" || lan == "taiwanese");
  899. latex_parser ltx (unicode);
  900. tree r= accented_to_Cork (ltx.parse (s, change));
  901. if (lan == "") return r;
  902. return compound ("!language", r, lan);
  903. }
  904. tree
  905. parse_latex_document (string s, bool change) {
  906. return compound ("!file", parse_latex (s, change));
  907. }