PageRenderTime 52ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/source/src/Data/Convert/Tex/parsetex.cpp

http://itexmacs.googlecode.com/
C++ | 959 lines | 826 code | 53 blank | 80 comment | 519 complexity | 4d9af31b7868adb27436d38087bf5d52 MD5 | raw file
Possible License(s): GPL-3.0, GPL-2.0, MPL-2.0-no-copyleft-exception, LGPL-2.0
  1. /******************************************************************************
  2. * MODULE : parsetex.cpp
  3. * DESCRIPTION: conversion of tex/latex strings into logical tex/latex trees
  4. * COPYRIGHT : (C) 1999 Joris van der Hoeven
  5. *******************************************************************************
  6. * This software falls under the GNU general public license version 3 or later.
  7. * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
  8. * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
  9. ******************************************************************************/
  10. #include "Tex/convert_tex.hpp"
  11. #include "converter.hpp"
  12. string string_arg (tree t);
  13. /******************************************************************************
  14. * The latex_parser structure
  15. *******************************************************************************
  16. *
  17. * During the parsing, the following global variables are used:
  18. *
  19. * command_type Contains the types of all currently defined tex commands.
  20. * This is either 'command' 'modifier' 'operator'
  21. * 'environment' 'list' 'symbol' 'big-symbol' or 'user'.
  22. * command_arity Contains the corresponding arity.
  23. * command_def Contains the definitions of user commands.
  24. *
  25. * The command_type hashmap also contains some special fields
  26. *
  27. * \<sub> Stands for the subscript command
  28. * \<sup> Stands for the superscript command
  29. *
  30. * !mode Gives the current mode ("text" or "math").
  31. * !verbatim Verbatim mode ("true" or "false")
  32. * !em Emphasized mode ("true" or "false")
  33. *
  34. *******************************************************************************
  35. * WARNING: we recently put the standard LaTeX macros in latex_type and
  36. * latex_arity instead of command_type and command_arity.
  37. ******************************************************************************/
  38. struct latex_parser {
  39. bool unicode;
  40. latex_parser (bool unicode2): unicode (unicode2) {}
  41. void latex_error (string s, int i, string message);
  42. tree parse (string s, int& i, string stop= "", bool change= false);
  43. tree parse_backslash (string s, int& i);
  44. tree parse_symbol (string s, int& i);
  45. tree parse_command (string s, int& i, string which);
  46. tree parse_unknown (string s, int& i, string which);
  47. tree parse_verbatim (string s, int& i, string end);
  48. tree parse (string s, bool change);
  49. };
  50. /******************************************************************************
  51. * Error handling
  52. ******************************************************************************/
  53. void
  54. latex_parser::latex_error (string s, int i, string message) {
  55. cerr << "Latex error] " << message << "\n";
  56. if (i>30) s= "..." * s (i-27, N(s));
  57. if (N(s)>60) s= s (0, 57) * "...";
  58. cerr << "Latex error] in " << s << "\n";
  59. }
  60. /******************************************************************************
  61. * Main parsing routine
  62. ******************************************************************************/
  63. static bool
  64. is_regular (tree t) {
  65. if (!is_tuple (t)) return true;
  66. if (N(t) == 0 || !is_atomic (t[0])) return false;
  67. string s= t[0]->label;
  68. return !starts (s, "\\begin-") && !starts (s, "\\end-");
  69. }
  70. tree
  71. latex_parser::parse (string s, int& i, string stop, bool change) {
  72. bool no_error= true;
  73. int n= N(s);
  74. tree t (CONCAT);
  75. command_type ->extend ();
  76. command_arity->extend ();
  77. command_def ->extend ();
  78. while ((i<n) && is_space (s[i])) i++;
  79. while ((i<n) && no_error &&
  80. (s[i] != '\0' || N (stop) != 0) &&
  81. (N(stop) != 1 || s[i] != stop[0]) &&
  82. (s[i] != '$' || stop != "$$" || i+1>=n || s[i+1] != '$') &&
  83. (stop != "denom" ||
  84. (s[i] != '$' && s[i] != '}' &&
  85. (i+2>n || s(i,i+2) != "\\]") &&
  86. (i+4>n || s(i,i+4) != "\\end")))) {
  87. switch (s[i]) {
  88. case '~':
  89. t << tuple ("\\nbsp");
  90. i++;
  91. break;
  92. case ' ':
  93. case '\t':
  94. case '\r':
  95. while ((i<n) && ((s[i]==' ') || (s[i]=='\t') || (s[i]=='\r'))) i++;
  96. if ((i<n) && (s[i]!='\n')) t << " ";
  97. break;
  98. case '\n': {
  99. int ln=0;
  100. while ((i<n) && is_space (s[i]))
  101. if (s[i++]=='\n') ln++;
  102. if (i<n) {
  103. if (ln == 1) t << " ";
  104. else t << "\n";
  105. }
  106. break;
  107. }
  108. case '%': {
  109. while ((i<n) && (s[i]!='\n')) i++;
  110. if (i<n) i++;
  111. int ln=0;
  112. while ((i<n) && is_space (s[i]))
  113. if (s[i++]=='\n') ln++;
  114. if (ln > 0) {
  115. if ((N(t)>0) && ((t[N(t)-1]==" ") || (t[N(t)-1]=="\n")))
  116. t[N(t)-1]= "\n";
  117. else t << "\n";
  118. }
  119. break;
  120. }
  121. case '#':
  122. i++;
  123. if (i==n) return t;
  124. if (is_numeric (s[i])) {
  125. t << s (i-1, i+1);
  126. i++;
  127. }
  128. else t << s (i-1, i);
  129. break;
  130. case '\\':
  131. if (((i+7)<n) && !is_alpha (s (i+5, i+7)) &&
  132. (s (i, i+5) == "\\over" || s (i, i+5) == "\\atop"))
  133. {
  134. string fr_cmd= s(i,i+5);
  135. if (fr_cmd == "\\over") fr_cmd= "\\frac";
  136. if (fr_cmd == "\\atop") fr_cmd= "\\ontop";
  137. int j;
  138. for (j=N(t); j>0 && is_regular (t[j-1]); j--) {}
  139. tree num= t (j, N(t));
  140. if (N(num) == 0) num= "";
  141. t= t (0, j);
  142. i+=5;
  143. while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
  144. tree den= parse (s, i, "denom");
  145. t << tree (TUPLE, fr_cmd, num, den);
  146. }
  147. else if (((i+5)<n) && (s(i,i+3)=="\\sp") && (!is_alpha(s[i+3]))) {
  148. i+=3;
  149. t << parse_command (s, i, "\\<sup>");
  150. }
  151. else if (((i+5)<n) && (s(i,i+3)=="\\sb") && (!is_alpha(s[i+3]))) {
  152. i+=3;
  153. t << parse_command (s, i, "\\<sub>");
  154. }
  155. else if (((i+10)<n) && (s(i,i+8)=="\\pmatrix")) {
  156. i+=8;
  157. tree arg= parse_command (s, i, "\\pmatrix");
  158. if (is_tuple (arg, "\\pmatrix", 1)) arg= arg[1];
  159. t << tree (TUPLE, "\\begin-pmatrix");
  160. if (is_concat (arg)) t << A (arg);
  161. else t << arg;
  162. t << tree (TUPLE, "\\end-pmatrix");
  163. }
  164. else {
  165. tree u= parse_backslash (s, i);
  166. if (u != "") t << u;
  167. }
  168. break;
  169. case '\'':
  170. i++;
  171. if (command_type ["!mode"] == "math") {
  172. int start= i-1;
  173. while ((i < N(s)) && (s[i] == '\'')) i++;
  174. t << tuple ("\\prime", s (start, i));
  175. }
  176. else t << s (i-1, i);
  177. break;
  178. case '*':
  179. if (command_type ["!mode"] == "math") t << tree (TUPLE, "\\ast");
  180. else t << "*";
  181. i++;
  182. break;
  183. case '_':
  184. i++;
  185. t << parse_command (s, i, "\\<sub>");
  186. /*
  187. if (command_type ["!mode"] == "math")
  188. t << parse_command (s, i, "\\<sub>");
  189. else t << s (i-1, i);
  190. */
  191. break;
  192. case '^':
  193. i++;
  194. t << parse_command (s, i, "\\<sup>");
  195. /*
  196. if (command_type ["!mode"] == "math")
  197. t << parse_command (s, i, "\\<sup>");
  198. else t << s (i-1, i);
  199. */
  200. break;
  201. case '<':
  202. t << tree (TUPLE, "\\<less>");
  203. i++;
  204. break;
  205. case '>':
  206. t << tree (TUPLE, "\\<gtr>");
  207. i++;
  208. break;
  209. case '\244':
  210. i++;
  211. t << parse_verbatim (s, i, "\244");
  212. break;
  213. case '{': {
  214. i++;
  215. t << parse (s, i, "}");
  216. if ((i<n) && (s[i]=='}')) i++;
  217. int ln=0;
  218. if ((i<n) && (!is_space (s[i]))) break;
  219. while ((i<n) && is_space (s[i]))
  220. if (s[i++]=='\n') ln++;
  221. if (ln >= 2) t << "\n";
  222. else if (i<n) t << tree (TUPLE, "\\ ");
  223. break;
  224. }
  225. case '$': {
  226. i++;
  227. if ((i<n) & (s[i]=='$')) {
  228. i++;
  229. t << tree (TUPLE, "\\begin-displaymath");
  230. command_type ("!mode")= "math";
  231. t << parse (s, i, "$$");
  232. command_type ("!mode")= "text";
  233. if ((i<n) && (s[i]=='$')) i++;
  234. if ((i<n) && (s[i]=='$')) i++;
  235. t << tree (TUPLE, "\\end-displaymath");
  236. }
  237. else {
  238. t << tree (TUPLE, "\\begin-math");
  239. command_type ("!mode")= "math";
  240. t << parse (s, i, "$");
  241. command_type ("!mode")= "text";
  242. if ((i<n) && (s[i]=='$')) i++;
  243. t << tree (TUPLE, "\\end-math");
  244. }
  245. break;
  246. }
  247. default:
  248. if (unicode && ((unsigned char) s[i]) >= 128) {
  249. unsigned int code= decode_from_utf8 (s, i);
  250. t << tree (TUPLE, "\\#" * as_hexadecimal (code));
  251. }
  252. else if (!unicode && is_iso_alpha (s[i])) {
  253. // If we encounter too much text in math mode, then return
  254. int start= i;
  255. while ((i<n) && is_iso_alpha (s[i])) i++;
  256. int end= i;
  257. if ((i >= start+3) && (command_type ["!mode"] == "math")) {
  258. while ((i<n) && (is_iso_alpha (s[i]) ||
  259. is_punctuation (s[i]) ||
  260. is_space (s[i])))
  261. i++;
  262. if (i >= start+20) {
  263. int last= i, words= 0, letters= 0;
  264. for (i=start; i<last; i++) {
  265. if (is_iso_alpha (s[i])) {
  266. letters++;
  267. if ((i==start) || (!is_iso_alpha (s[i-1]))) words++;
  268. }
  269. }
  270. if ((words > 3) && (letters/words >= 3) && (letters >= 15)) {
  271. i= start;
  272. no_error= false;
  273. }
  274. }
  275. }
  276. if (no_error)
  277. for (i=start; i<end; i++)
  278. t << s(i, i+1);
  279. }
  280. else {
  281. t << s (i, i+1);
  282. i++;
  283. }
  284. break;
  285. }
  286. }
  287. if (change) {
  288. command_type ->merge ();
  289. command_arity->merge ();
  290. command_def ->merge ();
  291. }
  292. else {
  293. command_type ->shorten ();
  294. command_arity->shorten ();
  295. command_def ->shorten ();
  296. }
  297. if (N(t)==0) return "";
  298. if (N(t)==1) return t[0];
  299. return t;
  300. }
  301. /******************************************************************************
  302. * Parsing commands
  303. ******************************************************************************/
  304. tree
  305. latex_parser::parse_backslash (string s, int& i) {
  306. int n= N(s);
  307. if (((i+7)<n) && (s(i,i+5)=="\\verb")) {
  308. i+=6;
  309. return parse_verbatim (s, i, s(i-1,i));
  310. }
  311. if (((i+29)<n) && (s(i,i+16)=="\\begin{verbatim}")) {
  312. i+=16;
  313. return parse_verbatim (s, i, "\\end{verbatim}");
  314. }
  315. if (((i+5)<n) && (s(i,i+4)=="\\url") && !is_alpha (s[i+5])) {
  316. i+=4;
  317. while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
  318. string ss;
  319. if (i<n && s[i] == '{') {
  320. i++;
  321. int start= i;
  322. while ((i<n) && s[i] != '}') i++;
  323. ss= s (start, i++);
  324. }
  325. return tree (TUPLE, "\\url", ss);
  326. }
  327. if (((i+6)<n) && (s(i,i+5)=="\\href")) {
  328. i+=5;
  329. while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
  330. string ss;
  331. if (i<n && s[i] == '{') {
  332. i++;
  333. int start= i;
  334. while ((i<n) && s[i] != '}') i++;
  335. ss= s (start, i++);
  336. }
  337. tree u= "";
  338. while (i<n && (s[i] == ' ' || s[i] == '\n' || s[i] == '\t')) i++;
  339. if (i<n && s[i] == '{') { i++; u= parse (s, i, "}"); i++; }
  340. return tree (TUPLE, "\\href", ss, u);
  341. }
  342. /************************ special commands *********************************/
  343. i++;
  344. if (i==n) return "";
  345. if (s[i]==' ') {
  346. i++;
  347. return tree (TUPLE, "\\ ");
  348. }
  349. if (!is_alpha(s[i])) {
  350. i++;
  351. if (s[i-1]=='(') return parse_command (s, i, "\\begin-math");
  352. if (s[i-1]==')') return parse_command (s, i, "\\end-math");
  353. if (s[i-1]=='[') return parse_command (s, i, "\\begin-displaymath");
  354. if (s[i-1]==']') return parse_command (s, i, "\\end-displaymath");
  355. return parse_command (s, i, s (i-2, i));
  356. }
  357. /************************* normal commands *********************************/
  358. int start= i-1;
  359. while ((i<n) && is_alpha (s[i])) i++;
  360. if ((i<n) && (s[i]=='*') && latex_type (s (start, i+1)) != "undefined") i++;
  361. string r= s (start, i);
  362. if ((r == "\\begin") || (r == "\\end")) {
  363. while ((i<n) && is_space (s[i])) i++;
  364. if ((i==n) || (s[i]!='{')) {
  365. latex_error (s, i, "begin or end which environment ?");
  366. return "";
  367. }
  368. i++; start= i;
  369. while ((i<n) && (s[i]!='}')) i++;
  370. r = r * "-" * s (start, i);
  371. if (i<n) i++;
  372. }
  373. return parse_command (s, i, r);
  374. }
  375. static string
  376. sharp_to_arg (string s, tree args) {
  377. int i;
  378. string r;
  379. for (i=0; i<N(s); i++)
  380. if ((s[i]=='#') && ((i+1)<N(s)) && (s[i+1]>='1') && (s[i+1]<='9')) {
  381. int nr= ((int) s[++i]) - ((int) '0');
  382. if (N(args)>nr) r << string_arg (args[nr]);
  383. }
  384. else r << s[i];
  385. return r;
  386. }
  387. tree
  388. latex_parser::parse_symbol (string s, int& i) {
  389. int start= i;
  390. if ((s[i] == '*') && (command_type ["!mode"] == "math")) {
  391. i++; return tree (TUPLE, "\\ast"); }
  392. if (s[i] == '<') { i++; return tree (TUPLE, "\\<less>"); }
  393. if (s[i] == '>') { i++; return tree (TUPLE, "\\<gtr>"); }
  394. if (s[i] != '\\') { i++; return s(start, i); }
  395. i++;
  396. if (i == N(s)) return tree (TUPLE, "\\backslash");
  397. if (!is_alpha (s[i])) { i++; return s(start, i); }
  398. while ((i<N(s)) && is_alpha (s[i])) i++;
  399. if ((i<N(s)) && (s[i]=='*')) i++;
  400. return s(start,i);
  401. }
  402. static bool
  403. is_math_environment (tree t) {
  404. //cout << "t= " << t << "\n";
  405. tree b= t[N(t)-2];
  406. tree e= t[N(t)-1];
  407. if (!is_concat (b)) b= tree (CONCAT, b);
  408. if (!is_concat (e)) e= tree (CONCAT, e);
  409. int i, j;
  410. for (i=N(b)-1; i>=0; i--)
  411. if (is_tuple (b[i]) && N(b[i])>0 && is_atomic (b[i][0]))
  412. if (latex_type (b[i][0]->label) == "math-environment")
  413. break;
  414. for (j=0; j<N(e); j++)
  415. if (is_tuple (e[j]) && N(e[j])>0 && is_atomic (e[j][0]))
  416. if (latex_type (e[j][0]->label) == "math-environment")
  417. break;
  418. if (i >= 0 && j < N(e)) {
  419. string bs= b[i][0]->label;
  420. string es= e[j][0]->label;
  421. bool ok=
  422. starts (bs, "\\begin-") &&
  423. starts (es, "\\end-") &&
  424. bs (7, N(bs)) == es (5, N(es));
  425. //cout << t[1] << " -> " << ok << "\n";
  426. return ok;
  427. }
  428. return false;
  429. }
  430. static bool
  431. is_text_argument (string cmd, int remaining_arity) {
  432. // FIXME: this test should be improved using DRD properties
  433. (void) remaining_arity;
  434. return cmd == "\\label" || cmd == "\\ref";
  435. }
  436. tree
  437. latex_parser::parse_command (string s, int& i, string cmd) {
  438. //cout << cmd << " [" << latex_type (cmd) << ", "
  439. //<< command_type ["!mode"] << ", " << latex_arity (cmd) << "]" << LF;
  440. if (cmd == "\\newcommand") cmd= "\\def";
  441. if (cmd == "\\renewcommand") cmd= "\\def";
  442. if (cmd == "\\renewenvironment") cmd= "\\newenvironment";
  443. if (cmd == "\\begin-split") cmd= "\\begin-eqsplit";
  444. if (cmd == "\\end-split") cmd= "\\end-eqsplit";
  445. if (cmd == "\\begin-split*") cmd= "\\begin-eqsplit*";
  446. if (cmd == "\\end-split*") cmd= "\\end-eqsplit*";
  447. if (latex_type (cmd) == "undefined")
  448. return parse_unknown (s, i, cmd);
  449. if (latex_type (cmd) == "math-environment") {
  450. if (cmd (0, 6) == "\\begin") command_type ("!mode") = "math";
  451. else command_type ("!mode") = "text";
  452. }
  453. bool mbox_flag=
  454. ((cmd == "\\text") || (cmd == "\\mbox")) &&
  455. (command_type ["!mode"] == "math");
  456. if (mbox_flag) command_type ("!mode") = "text";
  457. int n = N(s);
  458. int arity = latex_arity (cmd);
  459. bool option= (arity<0);
  460. if (option) arity= -1-arity;
  461. /************************ retrieve arguments *******************************/
  462. tree t (TUPLE, copy (cmd)); // parsed arguments
  463. tree u (TUPLE, copy (cmd)); // unparsed arguments
  464. while (i<n && arity>=0 && (arity>0 || option)) {
  465. int j= i;
  466. while ((j<n) && is_space (s[j])) j++;
  467. if (j==n) break;
  468. if (option && (s[j]=='[')) {
  469. j++;
  470. i=j;
  471. tree opt= parse (s, i, "]");
  472. if (cmd != "\\newtheorem" && cmd != "\\newtheorem*")
  473. t << opt;
  474. u << s (j, i);
  475. if ((i<n) && (s[i]==']')) i++;
  476. if (cmd != "\\newtheorem" && cmd != "\\newtheorem*")
  477. t[0]->label= t[0]->label * "*";
  478. option= false;
  479. }
  480. else if ((arity>0) && (s[j]=='{')) {
  481. bool text_arg=
  482. (command_type["!mode"] == "math") && is_text_argument (cmd, arity);
  483. j++;
  484. i=j;
  485. if (text_arg) command_type ("!mode")= "text";
  486. if ((N(t)==1) && (cmd == "\\def")) {
  487. while ((i<n) && (s[i]!='}')) i++;
  488. t << s (j, i);
  489. }
  490. else t << parse (s, i, "}");
  491. if (text_arg) command_type ("!mode")= "math";
  492. u << s (j, i);
  493. if ((i<n) && (s[i]=='}')) i++;
  494. arity--;
  495. if (arity == 0) option= false;
  496. }
  497. else if (s[j] == '}') break;
  498. else if (option && (s[j]=='#') && (cmd == "\\def")) {
  499. while ((j+3 <= n) && is_numeric (s[j+1]) && (s[j+2] == '#')) j+=2;
  500. if (j+2<=n) {
  501. t << s (j+1, j+2);
  502. u << s (j+1, j+2);
  503. i= j+2;
  504. }
  505. t[0]->label= t[0]->label * "*";
  506. option= false;
  507. }
  508. else {
  509. if (arity>0) {
  510. i=j;
  511. tree st= parse_symbol (s, i);
  512. t << st;
  513. u << st;
  514. arity--;
  515. if (arity == 0) option= false;
  516. }
  517. else break;
  518. }
  519. }
  520. if (arity>0) latex_error (s, i, "too little arguments for " * cmd);
  521. /******************** new commands and environments ************************/
  522. if (is_tuple (t, "\\def", 2)) {
  523. string var= string_arg (t[1]);
  524. command_type (var)= "user";
  525. command_arity (var)= 0;
  526. command_def (var)= as_string (u[2]);
  527. }
  528. if (is_tuple (t, "\\def*", 3)) {
  529. string var= string_arg (t[1]);
  530. command_type (var)= "user";
  531. command_arity (var)= as_int (t[2]);
  532. command_def (var)= as_string (u[3]);
  533. }
  534. if (is_tuple (t, "\\newtheorem", 2) || is_tuple (t, "\\newtheorem*", 2)) {
  535. string var= "\\begin-" * string_arg (t[1]);
  536. command_type (var)= "environment";
  537. command_arity (var)= 0;
  538. var= "\\end-" * string_arg (t[1]);
  539. command_type (var)= "environment";
  540. command_arity (var)= 0;
  541. }
  542. if (is_tuple (t, "\\newenvironment", 3)) {
  543. string var= "\\begin-" * string_arg (t[1]);
  544. command_type (var)= "user";
  545. command_arity (var)= 0;
  546. command_def (var)= as_string (u[2]);
  547. if (is_math_environment (t)) command_type (var)= "math-environment";
  548. var= "\\end-" * string_arg (t[1]);
  549. command_type (var)= "user";
  550. command_arity (var)= 0;
  551. command_def (var)= as_string (u[3]);
  552. if (is_math_environment (t)) command_type (var)= "math-environment";
  553. }
  554. if (is_tuple (t, "\\newenvironment*", 4)) {
  555. string var= "\\begin-" * string_arg (t[1]);
  556. command_type (var)= "user";
  557. command_arity (var)= as_int (t[2]);
  558. command_def (var)= as_string (u[3]);
  559. if (is_math_environment (t)) command_type (var)= "math-environment";
  560. var= "\\end-" * string_arg (t[1]);
  561. command_type (var)= "user";
  562. command_arity (var)= 0;
  563. command_def (var)= as_string (u[4]);
  564. if (is_math_environment (t)) command_type (var)= "math-environment";
  565. }
  566. /***************** environment changes for user commands ******************/
  567. if (latex_type (cmd) == "user") {
  568. int pos= 0;
  569. string body= command_def[cmd];
  570. if (count_occurrences ("\\begin", body) ==
  571. count_occurrences ("\\end", body))
  572. (void) parse (sharp_to_arg (body, u), pos, "", true);
  573. else t= parse (sharp_to_arg (body, u), pos, "", true);
  574. // replaces macros by their definitions in the case when
  575. // the user defined shorthands for \\begin{env} and \\end{env}
  576. }
  577. if (mbox_flag) command_type ("!mode") = "math";
  578. return t;
  579. }
  580. tree
  581. latex_parser::parse_unknown (string s, int& i, string cmd) {
  582. int n = N(s);
  583. bool option= false;
  584. tree t (TUPLE, copy (cmd));
  585. while (i<n) {
  586. int j=i;
  587. while ((j<n) && is_space (s[j])) j++;
  588. if (j==n) break;
  589. if (option && (s[j]=='[')) {
  590. j++;
  591. i=j;
  592. t << parse (s, i, "]");
  593. if ((i<n) && (s[i]==']')) i++;
  594. t[0]->label= t[0]->label * "*";
  595. option= false;
  596. }
  597. else if (s[j]=='{') {
  598. j++;
  599. i=j;
  600. t << parse (s, i, "}");
  601. if ((i<n) && (s[i]=='}')) i++;
  602. }
  603. else break;
  604. }
  605. return t;
  606. }
  607. /******************************************************************************
  608. * Parsing verbatim text
  609. ******************************************************************************/
  610. tree
  611. latex_parser::parse_verbatim (string s, int& i, string end) {
  612. int start=i, n= N(s), e= N(end);
  613. while ((i<(n-e)) && (s(i,i+e)!=end)) i++;
  614. i+=e;
  615. return tree (CONCAT,
  616. tree (TUPLE, "\\begin-verbatim"),
  617. s(start,i-e),
  618. tree (TUPLE, "\\end-verbatim"));
  619. }
  620. /******************************************************************************
  621. * This routine may be used to transform accented characters to the Cork format
  622. ******************************************************************************/
  // Base letters, indexed in parallel with Cork_accent: accented_to_Cork
  // looks for an index i with Cork_unaccented[i] equal to the base letter
  // and Cork_accent[i] equal to the accent command character, and returns
  // the character with code i+128.
  // The value 25 stands for the argument "\i" (accented_to_Cork uses
  // (char) 25 as base letter for it).
  // NOTE(review): entries ' ' presumably mark codes without a
  // letter/accent decomposition -- confirm against the Cork encoding table.
  623. static char Cork_unaccented[128]= {
  624. 'A', 'A', 'C', 'C', 'D', 'E', 'E', 'G',
  625. 'L', 'L', ' ', 'N', 'N', ' ', 'O', 'R',
  626. 'R', 'S', 'S', 'S', 'T', 'T', 'U', 'U',
  627. 'Y', 'Z', 'Z', 'Z', ' ', 'I', 'd', ' ',
  628. 'a', 'a', 'c', 'c', 'd', 'e', 'e', 'g',
  629. 'l', 'l', ' ', 'n', 'n', ' ', 'o', 'r',
  630. 'r', 's', 's', 's', 't', 't', 'u', 'u',
  631. 'y', 'z', 'z', 'z', ' ', ' ', ' ', ' ',
  632. 'A', 'A', 'A', 'A', 'A', 'A', ' ', 'C',
  633. 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
  634. 'D', 'N', 'O', 'O', 'O', 'O', 'O', ' ',
  635. ' ', 'U', 'U', 'U', 'U', 'Y', ' ', ' ',
  636. 'a', 'a', 'a', 'a', 'a', 'a', ' ', 'c',
  637. 'e', 'e', 'e', 'e', 25 , 25 , 25 , 25 ,
  638. 'd', 'n', 'o', 'o', 'o', 'o', 'o', ' ',
  639. ' ', 'u', 'u', 'u', 'u', 'y', ' ', ' '
  640. };
  // Accent command characters, indexed in parallel with Cork_unaccented:
  // entry i holds the second character of the two-character accent command
  // (e.g. '\'' for \' or 'v' for \v) that accented_to_Cork matches, together
  // with Cork_unaccented[i], to obtain the character with code i+128.
  // The trailing  // "  comments only re-balance the double quotes of the
  // line (presumably for the benefit of syntax highlighters).
  641. static char Cork_accent[128]= {
  642. 'u' , 'k' , '\'', 'v' , 'v' , 'v' , 'k' , 'u' ,
  643. '\'', 'v' , ' ' , '\'', 'v' , ' ' , 'H' , '\'',
  644. 'v' , '\'', 'v' , 'c' , 'v' , 'c' , 'H' , 'r' ,
  645. '\"', '\'', 'v' , '.' , ' ' , '.' , '=' , ' ' , // "
  646. 'u' , 'k' , '\'', 'v' , 'v' , 'v' , 'k' , 'u' ,
  647. '\'', 'v' , ' ' , '\'', 'v' , ' ' , 'H' , '\'',
  648. 'v' , '\'', 'v' , 'c' , 'v' , 'c' , 'H' , 'r' ,
  649. '\"', '\'', 'v' , '.' , ' ' , ' ' , ' ' , ' ' , // "
  650. '`' , '\'', '^' , '~' , '\"', ' ' , ' ' , 'c' , // "
  651. '`' , '\'', '^' , '\"', '`' , '\'', '^' , '\"', // "
  652. '=' , '~' , '`' , '\'', '^' , '~' , '\"', ' ' , // "
  653. ' ' , '`' , '\'', '^' , '\"', '\'', ' ' , ' ' , // "
  654. '`' , '\'', '^' , '~' , '\"', ' ' , ' ' , 'c' , // "
  655. '`' , '\'', '^' , '\"', '`' , '\'', '^' , '\"', // "
  656. '=' , '~' , '`' , '\'', '^' , '~' , '\"', ' ' , // "
  657. ' ' , '`' , '\'', '^' , '\"', '\'', ' ' , ' ' // "
  658. };
  659. tree
  660. accented_to_Cork (tree t) {
  661. if (arity (t) == 0) return t;
  662. int i, n=N(t);
  663. tree r (t, n);
  664. for (i=0; i<n; i++) r[i]= accented_to_Cork (t[i]);
  665. if (is_compound (t[0])) return r;
  666. string s= t[0]->label;
  667. if ((N(s)==2) && (s[0]=='\\') && (n==2) &&
  668. is_atomic (r[1]) && (N(r[1]->label)<=2)) {
  669. string v= r[1]->label;
  670. if (N(v)==0) {
  671. if (s[1] == '`' ) {
  672. string ret_s (1);
  673. ret_s[0]= '\000';
  674. return ret_s;
  675. }
  676. if (s[1] == '\'') return "\001";
  677. if (s[1] == '^' ) return "\136";
  678. if (s[1] == '\"') return "\004"; // "
  679. if (s[1] == '~' ) return "\176";
  680. if (s[1] == '=' ) return "\026";
  681. if (s[1] == '.' ) return "\137";
  682. if (s[1] == 'u' ) return "\025";
  683. if (s[1] == 'v' ) return "\024";
  684. if (s[1] == 'H' ) return "\175";
  685. if (s[1] == 'c' ) return "\030";
  686. }
  687. else {
  688. char c1= v[0], c2= s[1];
  689. if (v == "\\i") c1= (char) 25;
  690. if ((N(v)==1) || (v=="\\i"))
  691. for (i=0; i<127; i++)
  692. if ((Cork_unaccented[i]==c1) && (Cork_accent[i]==c2))
  693. return tree (string ((char) (i+128)));
  694. }
  695. }
  696. if (r == tuple ("\\i")) return "\\i";
  697. return r;
  698. }
  699. /******************************************************************************
  700. * Interface
  701. ******************************************************************************/
  702. tree
  703. latex_parser::parse (string s, bool change) {
  704. command_type ->extend ();
  705. command_arity->extend ();
  706. command_def ->extend ();
  707. // We first cut the string into pieces at strategic places
  708. // This reduces the risk that the parser gets confused
  709. array<string> a;
  710. int i, start=0, n= N(s);
  711. for (i=0; i<n; i++)
  712. if (s[i]=='\n' || (s[i] == '\\' && test (s, i, "\\nextbib"))) {
  713. while ((i<n) && is_space (s[i])) i++;
  714. if (test (s, i, "%%%%%%%%%% Start TeXmacs macros\n")) {
  715. a << s (start, i);
  716. while ((i<n) && (!test (s, i, "%%%%%%%%%% End TeXmacs macros\n")))
  717. i++;
  718. i += 30;
  719. start= i;
  720. continue;
  721. }
  722. if (test (s, i, "\\begin{document}") ||
  723. test (s, i, "\\begin{abstract}") ||
  724. test (s, i, "\\chapter") ||
  725. test (s, i, "\\section") ||
  726. test (s, i, "\\subsection") ||
  727. test (s, i, "\\subsubsection") ||
  728. test (s, i, "\\paragraph") ||
  729. test (s, i, "\\subparagraph") ||
  730. test (s, i, "\\nextbib") ||
  731. test (s, i, "\\newcommand") ||
  732. test (s, i, "\\def") ||
  733. test (s, i, "\\input{") ||
  734. test (s, i, "\\include{"))
  735. {
  736. a << s (start, i);
  737. start= i;
  738. if (test (s, i, "\\input{") || test (s, i, "\\include{")) {
  739. while (i<N(s) && s[i] != '{') i++;
  740. int start_name= i+1;
  741. while (i<N(s) && s[i] != '}') i++;
  742. string name= s (start_name, i);
  743. if (!ends (name, ".tex")) name= name * ".tex";
  744. url incl= relative (get_file_focus (), name);
  745. string body;
  746. if (!exists (incl) || load_string (incl, body, false)) i++;
  747. else {
  748. //cout << "Include " << name << " -> " << incl << "\n";
  749. s= s (0, start) * "\n" * body * "\n" * s (i+1, N(s));
  750. n= N(s);
  751. i= start + 1;
  752. }
  753. start= i;
  754. }
  755. while (i < n && test (s, i, "\\nextbib{}")) {
  756. i += 10;
  757. a << s (start, i);
  758. start= i;
  759. }
  760. }
  761. if (i == n) break;
  762. }
  763. a << s (start, i);
  764. // We now parse each of the pieces
  765. tree t (CONCAT);
  766. for (i=0; i<N(a); i++) {
  767. int j=0;
  768. while (j<N(a[i])) {
  769. int start= j;
  770. command_type ("!mode") = "text";
  771. command_type ("!em") = "false";
  772. tree u= parse (a[i], j, "", true);
  773. if ((N(t)>0) && (t[N(t)-1]!='\n') && (start==0)) t << "\n";
  774. if (is_concat (u)) t << A(u);
  775. else t << u;
  776. if (j == start) j++;
  777. }
  778. }
  779. if (change) {
  780. command_type ->merge ();
  781. command_arity->merge ();
  782. command_def ->merge ();
  783. }
  784. else {
  785. command_type ->shorten ();
  786. command_arity->shorten ();
  787. command_def ->shorten ();
  788. }
  789. //cout << "Parsed " << t << "\n";
  790. return t;
  791. }
  792. static bool
  793. japanese_tex (string& s) {
  794. if (search_forwards ("\\documentclass{jarticle}", s) != -1) {
  795. s= replace (s, "\\documentclass{jarticle}", "\\documentclass{article}");
  796. s= convert (s, "ISO-2022-JP", "UTF-8");
  797. return true;
  798. }
  799. if (search_forwards ("\\documentclass{jbook}", s) != -1) {
  800. s= replace (s, "\\documentclass{jbook}", "\\documentclass{book}");
  801. s= convert (s, "ISO-2022-JP", "UTF-8");
  802. return true;
  803. }
  804. return false;
  805. }
  806. static bool
  807. korean_tex (string& s) {
  808. if (search_forwards ("\\usepackage{hangul}", s) != -1 ||
  809. search_forwards ("\\usepackage{hfont}", s) != -1 ||
  810. search_forwards ("]{hangul}", s) != -1 ||
  811. search_forwards ("]{hfont}", s) != -1)
  812. {
  813. s= replace (s, "\\usepackage{hangul}", "");
  814. s= replace (s, "\\usepackage{hfont}", "");
  815. s= convert (s, "EUC-KR", "UTF-8");
  816. return true;
  817. }
  818. if (search_forwards ("\\usepackage{dhucs}", s) != -1 ||
  819. search_forwards ("\\usepackage{memhangul-ucs}", s) != -1 ||
  820. search_forwards ("]{dhucs}", s) != -1 ||
  821. search_forwards ("]{memhangul-ucs}", s) != -1)
  822. {
  823. s= replace (s, "\\usepackage{dhucs}", "");
  824. s= replace (s, "\\usepackage{memhangul-ucs}", "");
  825. return true;
  826. }
  827. return false;
  828. }
  829. static bool
  830. chinese_tex (string& s) {
  831. if (search_forwards ("\\kaishu", s) != -1)
  832. s= replace (s, "\\kaishu", "");
  833. if (search_forwards ("\\begin{CJK}{GBK}{kai}", s) != -1)
  834. s= replace (s, "\\begin{CJK}{GBK}{kai}", "");
  835. if (search_forwards ("\\begin{CJK*}{GBK}{kai}", s) != -1)
  836. s= replace (s, "\\begin{CJK*}{GBK}{kai}", "");
  837. if (search_forwards ("\\end{CJK}", s) != -1)
  838. s= replace (s, "\\end{CJK}", "");
  839. if (search_forwards ("\\end{CJK*}", s) != -1)
  840. s= replace (s, "\\end{CJK*}", "");
  841. if (search_forwards ("\\CJKindent", s) != -1)
  842. s= replace (s, "\\CJKindent", "");
  843. if (search_forwards ("\\CJKcaption{GBk}", s) != -1)
  844. s= replace (s, "\\CJKcaption{GBK}", "");
  845. if (search_forwards ("\\usepackage{CJK}", s) != -1) {
  846. s= replace (s, "\\usepackage{CJK}", "");
  847. s= convert (s, "cp936", "UTF-8");
  848. return true;
  849. }
  850. if (search_forwards ("\\documentclass{cctart}", s) != -1) {
  851. s= replace (s, "\\documentclass{cctart}", "\\documentclass{article}");
  852. s= convert (s, "cp936", "UTF-8");
  853. return true;
  854. }
  855. if (search_forwards ("\\documentclass[CJK]{cctart}", s) != -1) {
  856. s= replace (s, "\\documentclass[CJK]{cctart}", "\\documentclass{article}");
  857. s= convert (s, "cp936", "UTF-8");
  858. return true;
  859. }
  860. return false;
  861. }
  862. static bool
  863. taiwanese_tex (string& s) {
  864. if (search_forwards ("\\usepackage{CJKvert,type1cm}", s) != -1)
  865. s= replace (s, "\\usepackage{CJKvert,type1cm}", "");
  866. if (search_forwards ("\\begin{CJK}{Bg5}{aming}", s) != -1)
  867. s= replace (s, "\\begin{CJK}{Bg5}{aming}", "");
  868. if (search_forwards ("\\begin{CJK}{Bg5}{kai}", s) != -1)
  869. s= replace (s, "\\begin{CJK}{Bg5}{kai}", "");
  870. if (search_forwards ("\\end{CJK}", s) != -1)
  871. s= replace (s, "\\end{CJK}", "");
  872. if (search_forwards ("\\CJKcaption{Bg5}", s) != -1)
  873. s= replace (s, "\\CJKcaption{Bg5}", "");
  874. if (search_forwards ("\\CJKindent", s) != -1)
  875. s= replace (s, "\\CJKindent", "");
  876. if (search_forwards ("\\usepackage{CJK}", s) != -1) {
  877. s= replace (s, "\\usepackage{CJK}", "");
  878. s= convert (s, "cp950", "UTF-8");
  879. return true;
  880. }
  881. if (search_forwards ("\\usepackage{CJK*}", s) != -1) {
  882. s= replace (s, "\\usepackage{CJK*}", "");
  883. s= convert (s, "cp950", "UTF-8");
  884. return true;
  885. }
  886. return false;
  887. }
  888. tree
  889. parse_latex (string s, bool change) {
  890. s= dos_to_better (s);
  891. string lan= "";
  892. if (japanese_tex (s)) lan= "japanese";
  893. else if (korean_tex (s)) lan= "korean";
  894. else if (taiwanese_tex (s)) lan= "taiwanese";
  895. else if (chinese_tex (s)) lan= "chinese";
  896. bool unicode= (lan == "chinese" || lan == "japanese" ||
  897. lan == "korean" || lan == "taiwanese");
  898. latex_parser ltx (unicode);
  899. tree r= accented_to_Cork (ltx.parse (s, change));
  900. if (lan == "") return r;
  901. return compound ("!language", r, lan);
  902. }
  903. tree
  904. parse_latex_document (string s, bool change) {
  905. return compound ("!file", parse_latex (s, change));
  906. }