PageRenderTime 54ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/source/src/Data/Tree/tree_correct.cpp

http://itexmacs.googlecode.com/
C++ | 928 lines | 790 code | 57 blank | 81 comment | 610 complexity | 214589f22dad55d9ef29c79ba217c905 MD5 | raw file
Possible License(s): GPL-3.0, GPL-2.0, MPL-2.0-no-copyleft-exception, LGPL-2.0
  1. /******************************************************************************
  2. * MODULE : tree_correct.cpp
  3. * DESCRIPTION: make a tree syntactically match a drd
  4. * COPYRIGHT : (C) 2005 Joris van der Hoeven
  5. *******************************************************************************
  6. * This software falls under the GNU general public license version 3 or later.
  7. * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
  8. * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
  9. ******************************************************************************/
  10. #include "tree_correct.hpp"
  11. #include "tree_analyze.hpp"
  12. #include "Scheme/object.hpp"
  13. #include "packrat.hpp"
  14. /******************************************************************************
  15. * DRD based correction
  16. ******************************************************************************/
  17. tree
  18. drd_correct (drd_info drd, tree t) {
  19. if (is_atomic (t)) return t;
  20. else {
  21. int i, n= N(t);
  22. if (drd->contains (as_string (L(t))) &&
  23. !drd->correct_arity (L(t), n))
  24. return "";
  25. tree r (t, n);
  26. for (i=0; i<n; i++)
  27. r[i]= drd_correct (drd, t[i]);
  28. return r;
  29. }
  30. }
  31. /******************************************************************************
  32. * Correct WITHs or WITH-like macros
  33. ******************************************************************************/
  34. tree
  35. with_correct (tree t) {
  36. if (is_atomic (t)) return t;
  37. else {
  38. //cout << "Correcting " << t << LF << INDENT;
  39. tree u (t, N(t));
  40. for (int k=0; k<N(t); k++)
  41. u[k]= with_correct (t[k]);
  42. array<tree> a= concat_decompose (u);
  43. int i, n= N(a);
  44. array<tree> r;
  45. for (i=0; i<n; i++) {
  46. if (is_with_like (a[i])) {
  47. array<tree> b= with_decompose (a[i], with_body (a[i]));
  48. int p= N(b), k1, k2;
  49. for (k1=0; k1<p ; k1++)
  50. if (is_with_like (b[k1]) && with_similar_type (a[i], b[k1]));
  51. else break;
  52. for (k2=p; k2>k1; k2--)
  53. if (is_with_like (b[k2-1]) && with_similar_type (a[i], b[k2-1]));
  54. else break;
  55. array<tree> x;
  56. if (0 < k1) x << range (b, 0, k1);
  57. if (k1 < k2) x << with_recompose (a[i], range (b, k1, k2));
  58. if (k2 < p ) x << range (b, k2, p);
  59. if (N(x) == 0) continue;
  60. if (N(r) != 0 &&
  61. is_with_like (r[N(r)-1]) &&
  62. with_same_type (r[N(r)-1], x[0]))
  63. {
  64. array<tree> c= concat_decompose (with_body (r[N(r)-1]));
  65. c << concat_decompose (with_body (x[0]));
  66. r[N(r)-1]= with_recompose (x[0], c);
  67. r << range (x, 1, N(x));
  68. }
  69. else r << x;
  70. }
  71. else r << a[i];
  72. }
  73. //cout << UNINDENT << "Corrected " << t << " -> "
  74. //<< concat_recompose (r) << LF;
  75. return concat_recompose (r);
  76. }
  77. }
  78. static tree
  79. superfluous_with_correct (tree t, tree env) {
  80. if (is_atomic (t)) return t;
  81. else {
  82. //cout << "Superfluous correcting " << t << ", " << env << LF;
  83. if (is_compound (t, "body", 1))
  84. return compound ("body", superfluous_with_correct (t[0], env));
  85. if (is_func (t, WITH) && ((N(t) & 1) == 0))
  86. t= t * tree (WITH, "");
  87. tree r (t, N(t));
  88. for (int i=0; i<N(t); i++)
  89. r[i]= superfluous_with_correct
  90. (t[i], the_drd->get_env_child (t, i, env));
  91. if (is_compound (r, "math", 1) && r[0] == "") return "";
  92. else if (is_compound (r, "text", 1) && r[0] == "") return "";
  93. else if (is_compound (r, "math", 1) && drd_env_read (env, MODE) == "math")
  94. return r[0];
  95. else if (is_compound (r, "text", 1) && drd_env_read (env, MODE) == "text")
  96. return r[0];
  97. else if (is_func (r, WITH)) {
  98. for (int i=0; i+1<N(r); i+=2)
  99. if (!is_atomic (r[i])) return r;
  100. else if (drd_env_read (env, r[i]->label) != r[i+1]) return r;
  101. return r[N(r)-1];
  102. }
  103. else if (is_func (r, CONCAT)) {
  104. array<tree> a= concat_decompose (r);
  105. return concat_recompose (a);
  106. }
  107. return r;
  108. }
  109. }
  110. tree
  111. superfluous_with_correct (tree t) {
  112. with_drd drd (get_document_drd (t));
  113. return superfluous_with_correct (t, tree (WITH, MODE, "text"));
  114. }
  115. /******************************************************************************
  116. * Replace symbols by appropriate homoglyphs
  117. ******************************************************************************/
  118. static array<tree>
  119. homoglyph_correct (array<tree> a) {
  120. array<int> tp= symbol_types (a);
  121. array<tree> r;
  122. //cout << a << ", " << tp << "\n";
  123. for (int i=0; i<N(a); i++)
  124. if (a[i] == "<minus>") r << tree ("-");
  125. else if (a[i] == "\\" || a[i] == "<backslash>") {
  126. int j1, j2;
  127. for (j1= i-1; j1>=0; j1--)
  128. if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break;
  129. for (j2= i+1; j2<N(a); j2++)
  130. if (tp[j2] != SYMBOL_SKIP && tp[j2] != SYMBOL_SCRIPT) break;
  131. if (j1 < 0 || j2 >= N(a));
  132. else if ((a[i] == "\\" ||
  133. a[i] == "<backslash>") &&
  134. ((tp[j1] == SYMBOL_BASIC) ||
  135. (tp[j1] == SYMBOL_POSTFIX)) &&
  136. ((tp[j2] == SYMBOL_BASIC) ||
  137. (tp[j2] == SYMBOL_PREFIX)))
  138. r << tree ("<setminus>");
  139. else r << a[i];
  140. }
  141. else if (is_func (a[i], NEG, 1) && is_atomic (a[i][0])) {
  142. string s= a[i][0]->label;
  143. if (s == "=") r << tree ("<neq>");
  144. else if (s == "<less>") r << tree ("<nless>");
  145. else if (s == "<gtr>") r << tree ("<ngtr>");
  146. else if (s == "<leq>") r << tree ("<nleq>");
  147. else if (s == "<geq>") r << tree ("<ngeq>");
  148. else if (s == "<leqslant>") r << tree ("<nleqslant>");
  149. else if (s == "<geqslant>") r << tree ("<ngeqslant>");
  150. else if (s == "<prec>") r << tree ("<nprec>");
  151. else if (s == "<succ>") r << tree ("<nsucc>");
  152. else if (s == "<preceq>") r << tree ("<npreceq>");
  153. else if (s == "<succeq>") r << tree ("<nsucceq>");
  154. else if (s == "<preccurlyeq>") r << tree ("<npreccurlyeq>");
  155. else if (s == "<succcurlyeq>") r << tree ("<nsucccurlyeq>");
  156. else if (s == "<rightarrow>") r << tree ("<nrightarrow>");
  157. else if (s == "<Rightarrow>") r << tree ("<nRightarrow>");
  158. else if (s == "<leftarrow>") r << tree ("<nleftarrow>");
  159. else if (s == "<Leftarrow>") r << tree ("<nLeftarrow>");
  160. else if (s == "<leftrightarrow>") r << tree ("<nleftrightarrow>");
  161. else if (s == "<Leftrightarrow>") r << tree ("<nLeftrightarrow>");
  162. else if (s == "<equiv>") r << tree ("<nequiv>");
  163. else if (s == "<sim>") r << tree ("<nsim>");
  164. else if (s == "<simeq>") r << tree ("<nsimeq>");
  165. else if (s == "<approx>") r << tree ("<napprox>");
  166. else if (s == "<cong>") r << tree ("<ncong>");
  167. else if (s == "<asymp>") r << tree ("<nasymp>");
  168. else if (s == "<in>") r << tree ("<nin>");
  169. else if (s == "<ni>") r << tree ("<nni>");
  170. else if (s == "<subset>") r << tree ("<nsubset>");
  171. else if (s == "<supset>") r << tree ("<nsupset>");
  172. else if (s == "<subseteq>") r << tree ("<nsubseteq>");
  173. else if (s == "<supseteq>") r << tree ("<nsupseteq>");
  174. else if (s == "<sqsubset>") r << tree ("<nsqsubset>");
  175. else if (s == "<sqsupset>") r << tree ("<nsqsupset>");
  176. else if (s == "<sqsubseteq>") r << tree ("<nsqsubseteq>");
  177. else if (s == "<sqsupseteq>") r << tree ("<nsqsupseteq>");
  178. else if (s == "<leadsto>") r << tree ("<nleadsto>");
  179. else r << a[i];
  180. }
  181. else if (a[i] == ":" && i+1 < N(a) && a[i+1] == "=") {
  182. r << tree ("<assign>");
  183. i++;
  184. }
  185. else r << a[i];
  186. return r;
  187. }
  188. static tree
  189. homoglyph_correct (tree t, string mode) {
  190. //cout << "Correct " << t << ", " << mode << "\n";
  191. tree r= t;
  192. if (is_compound (t)) {
  193. int i, n= N(t);
  194. r= tree (t, n);
  195. for (i=0; i<n; i++) {
  196. tree tmode= the_drd->get_env_child (t, i, MODE, mode);
  197. string smode= (is_atomic (tmode)? tmode->label: string ("text"));
  198. if (is_correctable_child (t, i))
  199. r[i]= homoglyph_correct (t[i], smode);
  200. else r[i]= t[i];
  201. }
  202. }
  203. if (mode == "math") {
  204. array<tree> a= concat_tokenize (r);
  205. a= homoglyph_correct (a);
  206. tree ret= concat_recompose (a);
  207. //if (ret != r) cout << "< " << r << " >" << LF
  208. //<< "> " << ret << " <" << LF;
  209. return ret;
  210. }
  211. else return r;
  212. }
  213. tree
  214. homoglyph_correct (tree t) {
  215. with_drd drd (get_document_drd (t));
  216. return homoglyph_correct (t, "text");
  217. }
  218. /******************************************************************************
  219. * Remove incorrect spaces and multiplications
  220. ******************************************************************************/
  221. static array<tree>
  222. superfluous_invisible_correct (array<tree> a) {
  223. array<int> tp= symbol_types (a);
  224. array<tree> r;
  225. //cout << a << ", " << tp << "\n";
  226. for (int i=0; i<N(a); i++)
  227. if (a[i] == " " || a[i] == "*") {
  228. int j1, j2;
  229. for (j1= i-1; j1>=0; j1--)
  230. if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break;
  231. else if (a[j1] == " ") break;
  232. for (j2= i+1; j2<N(a); j2++)
  233. if (tp[j2] != SYMBOL_SKIP && tp[j2] != SYMBOL_SCRIPT)
  234. if (a[j2] != " " && a[j2] != "*") break;
  235. //cout << " " << i << ": " << j1 << ", " << j2
  236. //<< "; " << tp[j1] << ", " << tp[j2] << "\n";
  237. if (j1 < 0 || j2 >= N(a));
  238. else if (a[j1] == " " || a[j1] == "*");
  239. else if (tp[j1] == SYMBOL_PREFIX ||
  240. tp[j1] == SYMBOL_INFIX ||
  241. tp[j1] == SYMBOL_SEPARATOR ||
  242. tp[j1] == SYMBOL_PROBABLE_MIDDLE);
  243. else if (tp[j2] == SYMBOL_POSTFIX ||
  244. tp[j2] == SYMBOL_INFIX ||
  245. tp[j2] == SYMBOL_SEPARATOR ||
  246. tp[j2] == SYMBOL_PROBABLE_MIDDLE);
  247. else r << a[i];
  248. }
  249. else if (is_func (a[i], SQRT, 2) && a[i][1] == "")
  250. r << tree (SQRT, a[i][0]);
  251. else if (is_script (a[i]) && a[i][0] == "")
  252. r << tree (L(a[i]), "<nosymbol>");
  253. else r << a[i];
  254. return r;
  255. }
  256. static tree
  257. superfluous_invisible_correct (tree t, string mode) {
  258. //cout << "Correct " << t << ", " << mode << "\n";
  259. tree r= t;
  260. if (is_compound (t)) {
  261. int i, n= N(t);
  262. r= tree (t, n);
  263. for (i=0; i<n; i++) {
  264. tree tmode= the_drd->get_env_child (t, i, MODE, mode);
  265. string smode= (is_atomic (tmode)? tmode->label: string ("text"));
  266. //cout << " " << i << ": " << is_correctable_child (t, i)
  267. //<< ", " << smode << "\n";
  268. if (is_func (t, WITH) && i != N(t)-1)
  269. r[i]= t[i];
  270. else if (is_correctable_child (t, i))
  271. r[i]= superfluous_invisible_correct (t[i], smode);
  272. else r[i]= t[i];
  273. }
  274. }
  275. if (is_func (r, CONCAT)) {
  276. bool ok= true;
  277. int i, found= -1;
  278. for (i=0; i<N(r); i++)
  279. if (is_compound (r[i], "hide-preamble") ||
  280. is_compound (r[i], "show-preamble"))
  281. {
  282. ok= (found == -1);
  283. found= i;
  284. }
  285. else if (!is_atomic (r[i])) ok= false;
  286. else {
  287. string s= r[i]->label;
  288. for (int j=0; j<N(s); j++)
  289. if (s[j] != ' ') ok= false;
  290. }
  291. if (ok) r= r[found];
  292. }
  293. if (is_func (r, INACTIVE, 1) && is_func (r[0], RIGID))
  294. return r[0];
  295. else if (mode == "math") {
  296. array<tree> a= concat_tokenize (r);
  297. a= superfluous_invisible_correct (a);
  298. tree ret= concat_recompose (a);
  299. //if (ret != r) cout << "< " << r << " >" << LF
  300. //<< "> " << ret << " <" << LF;
  301. return ret;
  302. }
  303. else return r;
  304. }
  305. tree
  306. superfluous_invisible_correct (tree t) {
  307. with_drd drd (get_document_drd (t));
  308. return superfluous_invisible_correct (t, "text");
  309. }
  310. /******************************************************************************
  311. * Insert missing multiplications or function applications
  312. ******************************************************************************/
  313. #define SURE_NOTHING 0
  314. #define SURE_TIMES 1
  315. #define SURE_SPACE 2
  316. #define PROBABLE_TIMES 3
  317. #define PROBABLE_SPACE 4
  318. #define BOTH_WAYS 5
  319. struct invisible_corrector {
  320. int force;
  321. hashmap<string,int> times_before;
  322. hashmap<string,int> times_after;
  323. hashmap<string,int> space_before;
  324. hashmap<string,int> space_after;
  325. protected:
  326. bool is_letter_like (string s);
  327. bool contains_infix (tree t);
  328. bool contains_plus_like (tree t);
  329. void count_invisible (array<tree> a);
  330. void count_invisible (tree t, string mode);
  331. int get_status (tree t, bool left, bool script_flag);
  332. array<tree> correct (array<tree> a);
  333. public:
  334. inline invisible_corrector (tree t, int force2):
  335. force (force2), times_before (0), times_after (0), space_after (0) {
  336. count_invisible (t, "text"); }
  337. tree correct (tree t, string mode);
  338. };
  339. bool
  340. invisible_corrector::is_letter_like (string s) {
  341. static language lan= math_language ("std-math");
  342. if (s != "" && is_iso_alpha (s)) return true;
  343. return lan->get_group (s) == "Letter-symbol";
  344. }
  345. bool
  346. invisible_corrector::contains_infix (tree t) {
  347. array<int> tp= symbol_types (concat_tokenize (t));
  348. for (int i=0; i<N(tp); i++)
  349. if (tp[i] == SYMBOL_INFIX)
  350. return true;
  351. return false;
  352. }
  353. bool
  354. invisible_corrector::contains_plus_like (tree t) {
  355. array<tree> a= concat_tokenize (t);
  356. for (int i=1; i<N(a)-1; i++)
  357. if (a[i] == "+" || a[i] == "-")
  358. return true;
  359. return false;
  360. }
  361. void
  362. invisible_corrector::count_invisible (array<tree> a) {
  363. array<int> tp= symbol_types (a);
  364. for (int i=0; i<N(a); i++)
  365. if (is_atomic (a[i]) && is_letter_like (a[i]->label)) {
  366. int j1, j2;
  367. for (j1= i-1; j1>=0; j1--)
  368. if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break;
  369. else if (a[j1] == " ") break;
  370. for (j2= i+1; j2<N(a); j2++)
  371. if (tp[j2] != SYMBOL_SKIP && tp[j2] != SYMBOL_SCRIPT) break;
  372. else if (a[j2] == " ") break;
  373. string s= a[i]->label;
  374. if (j1 >= 0) {
  375. if (a[j1] == "*")
  376. times_before (s)= times_before[s] + 1;
  377. if (a[j1] == " ")
  378. space_before (s)= space_before[s] + 1;
  379. }
  380. if (j2 < N(a)) {
  381. if (a[j2] == "*")
  382. times_after (s)= times_after[s] + 1;
  383. if (a[j2] == " ")
  384. space_after (s)= space_after[s] + 1;
  385. // NOTE: this heuristic might not be a good idea,
  386. // because it inhibits the correction of QR -> Q*R,
  387. // if Q is a polynomial which is applied somewhere Q(1).
  388. // We might introduce a table 'apply_after'.
  389. //if (is_around (a[j2]) && a[j2][0] == "(" &&
  390. //!contains_infix (a[j2][1]))
  391. //space_after (s)= space_after[s] + 1;
  392. }
  393. }
  394. }
  395. void
  396. invisible_corrector::count_invisible (tree t, string mode) {
  397. if (is_compound (t)) {
  398. int i, n= N(t);
  399. for (i=0; i<n; i++) {
  400. tree tmode= the_drd->get_env_child (t, i, MODE, mode);
  401. string smode= (is_atomic (tmode)? tmode->label: string ("text"));
  402. if (is_func (t, WITH) && i != N(t)-1);
  403. else if (is_correctable_child (t, i))
  404. count_invisible (t[i], smode);
  405. }
  406. }
  407. if (mode == "math")
  408. count_invisible (concat_tokenize (t));
  409. }
  410. int
  411. invisible_corrector::get_status (tree t, bool left, bool script_flag) {
  412. if (is_atomic (t)) {
  413. static language lan= math_language ("std-math");
  414. string s= t->label;
  415. string g= lan->get_group (t->label);
  416. if (is_numeric (s))
  417. return (left? SURE_TIMES: PROBABLE_TIMES);
  418. else if (starts (g, "Unary-operator-textual"))
  419. return (left? SURE_SPACE: BOTH_WAYS);
  420. else if (starts (g, "Binary-operator"))
  421. return SURE_SPACE;
  422. else if (starts (g, "N-ary-operator"))
  423. return (left? SURE_SPACE: BOTH_WAYS);
  424. else if (is_letter_like (s)) {
  425. if (left) {
  426. if (times_after[s] > 0 && space_after[s] == 0)
  427. return SURE_TIMES;
  428. else if (space_after[s] > 0 && times_after[s] == 0)
  429. return SURE_SPACE;
  430. else if (times_after[s] > space_after[s])
  431. return PROBABLE_TIMES;
  432. else if (space_after[s] > times_after[s])
  433. return PROBABLE_SPACE;
  434. else if (N(s)>1 && is_iso_alpha (s))
  435. return PROBABLE_SPACE;
  436. else if (script_flag)
  437. return PROBABLE_TIMES;
  438. else return BOTH_WAYS;
  439. }
  440. else {
  441. if (times_before[s] > space_before[s])
  442. return PROBABLE_TIMES;
  443. else if (times_after[s] > 0 && space_after[s] == 0)
  444. return PROBABLE_TIMES;
  445. else if (script_flag && (N(s) == 1 || !is_iso_alpha (s)))
  446. return PROBABLE_TIMES;
  447. else return BOTH_WAYS;
  448. }
  449. }
  450. else if (s == "<cdots>" || s == "<ldots>")
  451. return PROBABLE_TIMES;
  452. else return ((force > 0)? BOTH_WAYS: SURE_NOTHING);
  453. }
  454. else {
  455. if (is_around (t)) {
  456. if (left && contains_plus_like (t[1]))
  457. return ((force > 0)? SURE_TIMES: PROBABLE_TIMES);
  458. else if (contains_plus_like (t[1]))
  459. return ((force > 0)? PROBABLE_TIMES: BOTH_WAYS);
  460. else if (!contains_infix (t[1]))
  461. return (left? BOTH_WAYS: SURE_SPACE);
  462. else return BOTH_WAYS;
  463. }
  464. else if (is_func (t, FRAC) ||
  465. is_func (t, SQRT))
  466. return (left? SURE_TIMES: BOTH_WAYS);
  467. else if (!left && is_func (t, BIG_AROUND, 2) &&
  468. (t[0] == "<sum>" || t[0] == "<amalg>" ||
  469. t[0] == "<oplus>" || t[0] == "<uplus>" ||
  470. t[0] == "<int>" || t[0] == "<oint>" ||
  471. t[0] == "<intlim>" || t[0] == "<ointlim>" ||
  472. t[0] == "<prod>" || t[0] == "<odot>" || t[0] == "<otimes>"))
  473. return PROBABLE_TIMES;
  474. else if (is_func (t, WIDE, 2))
  475. return get_status (t[0], left, script_flag);
  476. else if (is_func (t, WITH))
  477. return get_status (t[N(t)-1], left, script_flag);
  478. else if (N(t) == 0 && L(t) >= START_EXTENSIONS) {
  479. tree def= the_drd->get_syntax (L(t));
  480. if (is_func (def, MACRO, 1))
  481. return get_status (def[0], left, script_flag);
  482. else return SURE_NOTHING;
  483. }
  484. else return SURE_NOTHING;
  485. }
  486. }
  487. static bool
  488. admits_script (array<int> tp, int i) {
  489. i++;
  490. while (i<N(tp))
  491. if (tp[i] == SYMBOL_SCRIPT) return true;
  492. else if (tp[i] == SYMBOL_SKIP) i++;
  493. else return false;
  494. return false;
  495. }
  496. array<tree>
  497. invisible_corrector::correct (array<tree> a) {
  498. //cout << "Correct " << a << "\n";
  499. array<tree> r;
  500. array<int> tp= symbol_types (a);
  501. for (int i=0; i<N(a); i++) {
  502. r << a[i];
  503. if (a[i] != " " && tp[i] == SYMBOL_BASIC) {
  504. int j;
  505. for (j= i+1; j<N(a); j++)
  506. if (tp[j] != SYMBOL_SKIP && tp[j] != SYMBOL_SCRIPT) break;
  507. else if (a[j] == " ") break;
  508. if (j >= N(a) || a[j] == " " || tp[j] != SYMBOL_BASIC)
  509. continue;
  510. string ins= "";
  511. int sti= get_status (a[i], true, admits_script (tp, i));
  512. int stj= get_status (a[j], false, admits_script (tp, j));
  513. //cout << "Pair (" << a[i] << ", " << a[j] << ")"
  514. //<< " -> (" << sti << ", " << stj << ")" << LF;
  515. if (sti == SURE_NOTHING || stj == SURE_NOTHING)
  516. ins= "";
  517. else if (sti == SURE_TIMES && stj != SURE_SPACE)
  518. ins= "*";
  519. else if (sti == SURE_SPACE && stj != SURE_TIMES)
  520. ins= " ";
  521. else if (sti == PROBABLE_TIMES && stj == PROBABLE_TIMES)
  522. ins= "*";
  523. else if (sti == PROBABLE_SPACE && stj == PROBABLE_SPACE)
  524. ins= " ";
  525. else if (sti == PROBABLE_TIMES && stj == BOTH_WAYS)
  526. ins= "*";
  527. else if (sti == PROBABLE_SPACE && stj == BOTH_WAYS)
  528. ins= " ";
  529. else if (sti == BOTH_WAYS && stj == PROBABLE_TIMES)
  530. ins= "*";
  531. else if (sti == BOTH_WAYS && stj == PROBABLE_SPACE)
  532. ins= " ";
  533. else if (sti == BOTH_WAYS && stj == BOTH_WAYS && force == 1 &&
  534. (is_atomic (a[i]) || is_around (a[i])) &&
  535. (is_atomic (a[j]) || is_around (a[j])))
  536. ins= "*";
  537. if (is_around (a[j]))
  538. if (ins == " " || (ins == "*" && force == -1))
  539. ins= "";
  540. if (a[j] == ".") ins= "";
  541. while (i+1 < N(a) && (is_func (a[i+1], RSUB, 1) ||
  542. is_func (a[i+1], RSUP, 1) ||
  543. is_func (a[i+1], RPRIME, 1))) {
  544. i++;
  545. r << a[i];
  546. }
  547. if (ins != "") r << tree (ins);
  548. }
  549. }
  550. return r;
  551. }
  552. tree
  553. invisible_corrector::correct (tree t, string mode) {
  554. //cout << "Correct " << t << ", " << mode << "\n";
  555. tree r= t;
  556. if (is_compound (t)) {
  557. int i, n= N(t);
  558. r= tree (t, n);
  559. for (i=0; i<n; i++) {
  560. tree tmode= the_drd->get_env_child (t, i, MODE, mode);
  561. string smode= (is_atomic (tmode)? tmode->label: string ("text"));
  562. if (is_func (t, WITH) && i != N(t)-1)
  563. r[i]= t[i];
  564. else if (is_correctable_child (t, i))
  565. r[i]= correct (t[i], smode);
  566. else r[i]= t[i];
  567. }
  568. }
  569. if (mode == "math") {
  570. array<tree> a= concat_tokenize (r);
  571. a= correct (a);
  572. tree ret= concat_recompose (a);
  573. //if (ret != r)
  574. // cout << "<< " << r << " >>" << LF
  575. // << ">> " << ret << " <<" << LF;
  576. return ret;
  577. }
  578. else return r;
  579. }
  580. tree
  581. missing_invisible_correct (tree t, int force) {
  582. // force = -1, only correct when sure, and when old markup is incorrect
  583. // force = 0 , only correct when pretty sure
  584. // force = 1 , correct whenever reasonable (used for LaTeX import)
  585. with_drd drd (get_document_drd (t));
  586. invisible_corrector corrector (t, force);
  587. //cout << "Times before " << corrector.times_before << "\n";
  588. //cout << "Space before " << corrector.space_before << "\n";
  589. //cout << "Times after " << corrector.times_after << "\n";
  590. //cout << "Space after " << corrector.space_after << "\n";
  591. return corrector.correct (t, "text");
  592. }
  593. tree
  594. missing_invisible_correct_twice (tree t, int force= -1) {
  595. tree u= missing_invisible_correct (t, force);
  596. if (u == t) return t;
  597. return missing_invisible_correct (u, force);
  598. }
  599. /******************************************************************************
  600. * Miscellaneous corrections
  601. ******************************************************************************/
  602. tree
  603. misc_math_correct (tree t) {
  604. if (is_atomic (t)) return t;
  605. else if (is_compound (t, "math", 1) && is_func (t[0], RSUB, 1))
  606. return tree (RSUB, compound ("math", misc_math_correct (t[0][0])));
  607. else if (is_compound (t, "math", 1) && is_func (t[0], RSUP, 1))
  608. return tree (RSUP, compound ("math", misc_math_correct (t[0][0])));
  609. else if (is_func (t, RSUB, 1) && is_func (t[0], RSUB, 1))
  610. return misc_math_correct (t[0]);
  611. else if (is_func (t, RSUB, 1) && is_func (t[0], RSUP, 1))
  612. return misc_math_correct (tree (RSUB, t[0][0]));
  613. else if (is_func (t, RSUP, 1) && is_func (t[0], RSUB, 1))
  614. return misc_math_correct (tree (RSUP, t[0][0]));
  615. else if (is_func (t, RSUP, 1) && is_func (t[0], RSUP, 1))
  616. return misc_math_correct (t[0]);
  617. else if (is_func (t, RSUP, 1) && is_func (t[0], RPRIME, 1))
  618. return misc_math_correct (t[0]);
  619. else if (is_script (t) && is_compound (t[0], "text", 1) &&
  620. is_atomic (t[0][0]) && is_alpha (t[0][0]->label))
  621. {
  622. if (N(t[0][0]->label) != 1) return tree (L(t), t[0][0]);
  623. else return tree (L(t), tree (WITH, "math-font-family", "trm",
  624. misc_math_correct (t[0])));
  625. }
  626. else if (is_compound (t, "math", 1)) {
  627. tree arg = misc_math_correct (t[0]);
  628. tree last= arg;
  629. if (is_concat (last) && N(last) > 0) last= last[N(last)-1];
  630. if (is_atomic (last) && N(last->label) > 0 &&
  631. is_punctuation (last->label [N(last->label)-1]))
  632. {
  633. string s= last->label;
  634. int i= N(s);
  635. while (i>0 && is_punctuation (s[i-1])) i--;
  636. if (i == N(s)) return compound ("math", arg);
  637. string tail= s (i, N(s));
  638. s= s (0, i);
  639. if (last == arg) {
  640. if (N(s) == 0) return tail;
  641. else return concat (compound ("math", s), tail);
  642. }
  643. else {
  644. tree cc= arg (0, N(arg) - 1);
  645. if (N(s) != 0) cc << tree (s);
  646. if (N(cc) == 1) cc= cc[0];
  647. return concat (compound ("math", cc), tail);
  648. }
  649. }
  650. else return compound ("math", arg);
  651. }
  652. else {
  653. int i, n= N(t);
  654. tree r (t, n);
  655. for (i=0; i<n; i++)
  656. r[i]= misc_math_correct (t[i]);
  657. if (is_concat (r))
  658. r= concat_recompose (concat_decompose (r));
  659. return r;
  660. }
  661. }
  662. /******************************************************************************
  663. * Count errors
  664. ******************************************************************************/
  665. static int
  666. count_math_formula_errors (tree t, int mode) {
  667. if (mode == 1) return 1;
  668. if (packrat_correct ("std-math", "Main", t)) return 0;
  669. else {
  670. if (mode == 2) cout << " ERROR> " << t << "\n";
  671. return 1;
  672. }
  673. }
  674. static int
  675. count_math_table_errors (tree t, int mode) {
  676. if (is_atomic (t)) return 0;
  677. else if (is_func (t, CELL, 1)) {
  678. if (t[0] == "" || t[0] == tree (DOCUMENT, "")) return 0;
  679. if (mode == 1) return 1;
  680. if (packrat_correct ("std-math", "Cell", t[0])) return 0;
  681. else {
  682. if (mode == 2) cout << " ERROR> " << t << "\n";
  683. return 1;
  684. }
  685. }
  686. else {
  687. int sum= 0;
  688. for (int i=0; i<N(t); i++)
  689. sum += count_math_table_errors (t[i], mode);
  690. return sum;
  691. }
  692. }
  693. int
  694. count_math_errors (tree t, int mode) {
  695. if (is_atomic (t)) return 0;
  696. else {
  697. int sum= 0;
  698. for (int i=0; i<N(t); i++) {
  699. tree cmode= the_drd->get_env_child (t, i, MODE, "text");
  700. if (cmode != "math") sum += count_math_errors (t[i], mode);
  701. else {
  702. tree u= t[i];
  703. while (is_func (u, DOCUMENT, 1) ||
  704. is_func (u, TFORMAT) ||
  705. is_func (u, WITH))
  706. u= u[N(u)-1];
  707. if (is_func (u, TABLE)) sum += count_math_table_errors (u, mode);
  708. else sum += count_math_formula_errors (u, mode);
  709. }
  710. }
  711. return sum;
  712. }
  713. }
  714. /******************************************************************************
  715. * Print mathematical status
  716. ******************************************************************************/
  717. static int count_formula= 0;
  718. static int count_initial_errors= 0;
  719. static int count_final_errors= 0;
  720. static int corrected_with= 0;
  721. static int corrected_superfluous_with= 0;
  722. static int corrected_brackets= 0;
  723. static int corrected_move_brackets= 0;
  724. static int corrected_misc= 0;
  725. static int corrected_superfluous_invisible= 0;
  726. static int corrected_homoglyph= 0;
  727. static int corrected_missing_invisible= 0;
  728. static int corrected_zealous_invisible= 0;
  729. void
  730. math_status_cumul_sub (tree t, int& cumul, int& errors) {
  731. int new_errors= count_math_errors (t);
  732. cumul += (errors - new_errors);
  733. errors= new_errors;
  734. }
  735. void
  736. math_status_cumul (tree t) {
  737. with_drd drd (get_document_drd (t));
  738. if (is_func (t, DOCUMENT))
  739. for (int i=0; i<N(t); i++)
  740. if (is_compound (t[i], "body", 1)) {
  741. t= t[i][0];
  742. break;
  743. }
  744. int errors= count_math_errors (t);
  745. count_formula += count_math_errors (t, 1);
  746. count_initial_errors += errors;
  747. t= with_correct (t);
  748. math_status_cumul_sub (t, corrected_with, errors);
  749. t= superfluous_with_correct (t);
  750. math_status_cumul_sub (t, corrected_superfluous_with, errors);
  751. t= upgrade_brackets (t);
  752. math_status_cumul_sub (t, corrected_brackets, errors);
  753. t= move_brackets (t);
  754. math_status_cumul_sub (t, corrected_move_brackets, errors);
  755. t= misc_math_correct (t);
  756. math_status_cumul_sub (t, corrected_misc, errors);
  757. t= superfluous_invisible_correct (t);
  758. math_status_cumul_sub (t, corrected_superfluous_invisible, errors);
  759. t= homoglyph_correct (t);
  760. math_status_cumul_sub (t, corrected_homoglyph, errors);
  761. t= superfluous_invisible_correct (t);
  762. math_status_cumul_sub (t, corrected_superfluous_invisible, errors);
  763. t= missing_invisible_correct (t);
  764. math_status_cumul_sub (t, corrected_missing_invisible, errors);
  765. count_final_errors += errors;
  766. //cout << "Errors= " << errors << "\n";
  767. //(void) count_math_errors (t, 2);
  768. t= missing_invisible_correct (t, 1);
  769. math_status_cumul_sub (t, corrected_zealous_invisible, errors);
  770. }
  771. void
  772. math_status_reset () {
  773. count_formula= 0;
  774. count_initial_errors= 0;
  775. count_final_errors= 0;
  776. corrected_with= 0;
  777. corrected_superfluous_with= 0;
  778. corrected_brackets= 0;
  779. corrected_move_brackets= 0;
  780. corrected_misc= 0;
  781. corrected_superfluous_invisible= 0;
  782. corrected_homoglyph= 0;
  783. corrected_missing_invisible= 0;
  784. }
  785. void
  786. math_status_print () {
  787. cout << "Formulas : " << count_formula << "\n";
  788. cout << "Initial errors : " << count_initial_errors << "\n";
  789. cout << "Final errors : " << count_final_errors << "\n";
  790. cout << "\n";
  791. cout << "With corrected : "
  792. << corrected_with << "\n";
  793. cout << "Superfluous with corrected : "
  794. << corrected_superfluous_with << "\n";
  795. cout << "Upgraded brackets : "
  796. << corrected_brackets << "\n";
  797. cout << "Moved brackets : "
  798. << corrected_move_brackets << "\n";
  799. cout << "Miscellaneous corrected : "
  800. << corrected_misc << "\n";
  801. cout << "Superfluous invisible corrected : "
  802. << corrected_superfluous_invisible << "\n";
  803. cout << "Homoglyphs corrected : "
  804. << corrected_homoglyph << "\n";
  805. cout << "Missing invisible corrected : "
  806. << corrected_missing_invisible << "\n";
  807. cout << "Zealous invisible corrected : "
  808. << corrected_zealous_invisible << "\n";
  809. cout << "\n";
  810. }
  811. /******************************************************************************
  812. * Master routines
  813. ******************************************************************************/
  814. bool
  815. enabled_preference (string s) {
  816. return call ("get-preference", s) == object ("on");
  817. }
  818. tree
  819. latex_correct (tree t) {
  820. // NOTE: matching brackets corrected in upgrade_tex
  821. t= misc_math_correct (t);
  822. //if (enabled_preference ("remove superfluous invisible"))
  823. t= superfluous_invisible_correct (t);
  824. //if (enabled_preference ("homoglyph correct"))
  825. t= homoglyph_correct (t);
  826. //if (enabled_preference ("remove superfluous invisible"))
  827. t= superfluous_invisible_correct (t);
  828. //if (enabled_preference ("insert missing invisible"))
  829. t= missing_invisible_correct_twice (t);
  830. //if (enabled_preference ("insert missing invisible"))
  831. t= missing_invisible_correct (t, 1);
  832. t= downgrade_big (t);
  833. return t;
  834. }
  835. tree
  836. automatic_correct (tree t, string version) {
  837. if (version_inf_eq (version, "1.0.7.9")) {
  838. t= misc_math_correct (t);
  839. if (enabled_preference ("remove superfluous invisible"))
  840. t= superfluous_invisible_correct (t);
  841. if (enabled_preference ("homoglyph correct"))
  842. t= homoglyph_correct (t);
  843. if (enabled_preference ("remove superfluous invisible"))
  844. t= superfluous_invisible_correct (t);
  845. if (enabled_preference ("insert missing invisible"))
  846. t= missing_invisible_correct_twice (t);
  847. if (enabled_preference ("zealous invisible correct"))
  848. t= missing_invisible_correct (t, 1);
  849. }
  850. t= downgrade_big (t);
  851. return t;
  852. }
  853. tree
  854. manual_correct (tree t) {
  855. t= with_correct (t);
  856. t= superfluous_with_correct (t);
  857. t= upgrade_brackets (t);
  858. t= misc_math_correct (t);
  859. if (enabled_preference ("manual remove superfluous invisible"))
  860. t= superfluous_invisible_correct (t);
  861. if (enabled_preference ("manual homoglyph correct"))
  862. t= homoglyph_correct (t);
  863. if (enabled_preference ("manual remove superfluous invisible"))
  864. t= superfluous_invisible_correct (t);
  865. if (enabled_preference ("manual insert missing invisible"))
  866. t= missing_invisible_correct_twice (t);
  867. if (enabled_preference ("manual zealous invisible correct"))
  868. t= missing_invisible_correct (t, 1);
  869. t= downgrade_big (t);
  870. return t;
  871. }