PageRenderTime 52ms CodeModel.GetById 15ms RepoModel.GetById 1ms app.codeStats 0ms

/tesseract-2.04/ccmain/adaptions.cpp

https://github.com/leogomes/Tesseract-iPhone-Demo
C++ | 1082 lines | 896 code | 145 blank | 41 comment | 263 complexity | 24c73f6691c0e0d4567b72b7eb019277 MD5 | raw file
Possible License(s): Apache-2.0
  1. /**********************************************************************
  2. * File: adaptions.cpp (Formerly adaptions.c)
  3. * Description: Functions used to adapt to blobs already confidently
  4. * identified
  5. * Author: Chris Newton
  6. * Created: Thu Oct 7 10:17:28 BST 1993
  7. *
  8. * (C) Copyright 1992, Hewlett-Packard Ltd.
  9. ** Licensed under the Apache License, Version 2.0 (the "License");
  10. ** you may not use this file except in compliance with the License.
  11. ** You may obtain a copy of the License at
  12. ** http://www.apache.org/licenses/LICENSE-2.0
  13. ** Unless required by applicable law or agreed to in writing, software
  14. ** distributed under the License is distributed on an "AS IS" BASIS,
  15. ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. ** See the License for the specific language governing permissions and
  17. ** limitations under the License.
  18. *
  19. **********************************************************************/
  20. #include "mfcpch.h"
  21. #ifdef __UNIX__
  22. #include <assert.h>
  23. #endif
  24. #include <ctype.h>
  25. #include <string.h>
  26. #include "tessbox.h"
  27. #include "tessvars.h"
  28. #include "memry.h"
  29. #include "mainblk.h"
  30. #include "charcut.h"
  31. #include "imgs.h"
  32. #include "scaleimg.h"
  33. #include "reject.h"
  34. #include "control.h"
  35. #include "adaptions.h"
  36. #include "stopper.h"
  37. #include "charsample.h"
  38. #include "matmatch.h"
  39. #include "secname.h"
  40. inT32 demo_word = 0;
  41. #define WINDOWNAMESIZE 13 /*max size of name */
  42. #define EXTERN
  43. EXTERN BOOL_VAR (tessedit_reject_ems, FALSE, "Reject all m's");
  44. EXTERN BOOL_VAR (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");
  45. EXTERN double_VAR (tessedit_cluster_t1, 0.20,
  46. "t1 threshold for clustering samples");
  47. EXTERN double_VAR (tessedit_cluster_t2, 0.40,
  48. "t2 threshold for clustering samples");
  49. EXTERN double_VAR (tessedit_cluster_t3, 0.12,
  50. "Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
  51. EXTERN double_VAR (tessedit_cluster_accept_fraction, 0.80,
  52. "Largest fraction of characters in cluster for it to be used for adaption");
  53. EXTERN INT_VAR (tessedit_cluster_min_size, 3,
  54. "Smallest number of samples in a cluster for it to be used for adaption");
  55. EXTERN BOOL_VAR (tessedit_cluster_debug, FALSE,
  56. "Generate and print debug information for adaption by clustering");
  57. EXTERN BOOL_VAR (tessedit_use_best_sample, FALSE,
  58. "Use best sample from cluster when adapting");
  59. EXTERN BOOL_VAR (tessedit_test_cluster_input, FALSE,
  60. "Set reject map to enable cluster input to be measured");
  61. EXTERN BOOL_VAR (tessedit_matrix_match, TRUE, "Use matrix matcher");
  62. EXTERN BOOL_VAR (tessedit_mm_use_non_adaption_set, FALSE,
  63. "Don't try to adapt to characters on this list");
  64. EXTERN STRING_VAR (tessedit_non_adaption_set, ",.;:'~@*",
  65. "Characters to be avoided when adapting");
  66. EXTERN BOOL_VAR (tessedit_mm_adapt_using_prototypes, TRUE,
  67. "Use prototypes when adapting");
  68. EXTERN BOOL_VAR (tessedit_mm_use_prototypes, TRUE,
  69. "Use prototypes as clusters are built");
  70. EXTERN BOOL_VAR (tessedit_mm_use_rejmap, FALSE,
  71. "Adapt to characters using reject map");
  72. EXTERN BOOL_VAR (tessedit_mm_all_rejects, FALSE,
  73. "Adapt to all characters using, matrix matcher");
  74. EXTERN BOOL_VAR (tessedit_mm_only_match_same_char, FALSE,
  75. "Only match samples against clusters for the same character");
  76. EXTERN BOOL_VAR (tessedit_process_rns, FALSE, "Handle m - rn ambigs");
  77. EXTERN BOOL_VAR (tessedit_demo_adaption, FALSE,
  78. "Display cut images and matrix match for demo purposes");
  79. EXTERN INT_VAR (tessedit_demo_word1, 62,
  80. "Word number of first word to display");
  81. EXTERN INT_VAR (tessedit_demo_word2, 64,
  82. "Word number of second word to display");
  83. EXTERN STRING_VAR (tessedit_demo_file, "academe",
  84. "Name of document containing demo words");
  85. BOOL8 word_adaptable( //should we adapt?
  86. WERD_RES *word,
  87. uinT16 mode) {
  88. BOOL8 status = FALSE;
  89. BITS16 flags(mode);
  90. enum MODES
  91. {
  92. ADAPTABLE_WERD,
  93. ACCEPTABLE_WERD,
  94. CHECK_DAWGS,
  95. CHECK_SPACES,
  96. CHECK_ONE_ELL_CONFLICT,
  97. CHECK_AMBIG_WERD
  98. };
  99. /*
  100. 0: NO adaption
  101. */
  102. if (mode == 0) {
  103. return FALSE;
  104. }
  105. if (flags.bit (ADAPTABLE_WERD))
  106. status |= word->tess_would_adapt;
  107. if (flags.bit (ACCEPTABLE_WERD))
  108. status |= word->tess_accepted;
  109. if (!status) // If not set then
  110. return FALSE; // ignore other checks
  111. if (flags.bit (CHECK_DAWGS) &&
  112. (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
  113. (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
  114. (word->best_choice->permuter () != USER_DAWG_PERM) &&
  115. (word->best_choice->permuter () != NUMBER_PERM))
  116. return FALSE;
  117. if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE))
  118. return FALSE;
  119. if (flags.bit (CHECK_SPACES) &&
  120. (strchr (word->best_choice->string ().string (), ' ') != NULL))
  121. return FALSE;
  122. // if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
  123. if (flags.bit (CHECK_AMBIG_WERD) &&
  124. !NoDangerousAmbig(word->best_choice->string().string(),
  125. word->best_choice->lengths().string(),
  126. NULL))
  127. return FALSE;
  128. return status;
  129. }
  130. void collect_ems_for_adaption(WERD_RES *word,
  131. CHAR_SAMPLES_LIST *char_clusters,
  132. CHAR_SAMPLE_LIST *chars_waiting) {
  133. PBLOB_LIST *blobs = word->outword->blob_list ();
  134. PBLOB_IT blob_it(blobs);
  135. inT16 i;
  136. CHAR_SAMPLE *sample;
  137. PIXROW_LIST *pixrow_list;
  138. PIXROW_IT pixrow_it;
  139. IMAGELINE *imlines; // lines of the image
  140. TBOX pix_box; // box of imlines
  141. // extent
  142. WERD copy_outword; // copy to denorm
  143. PBLOB_IT copy_blob_it;
  144. OUTLINE_IT copy_outline_it;
  145. inT32 resolution = page_image.get_res ();
  146. if (tessedit_reject_ems || tessedit_reject_suspect_ems)
  147. return; // Do nothing
  148. if (word->word->bounding_box ().height () > resolution / 3)
  149. return;
  150. if (tessedit_demo_adaption)
  151. // Make sure not set
  152. tessedit_display_mm.set_value (FALSE);
  153. if (word_adaptable (word, tessedit_em_adaption_mode)
  154. && word->reject_map.reject_count () == 0
  155. && (strchr (word->best_choice->string ().string (), 'm') != NULL
  156. || (tessedit_process_rns
  157. && strstr (word->best_choice->string ().string (),
  158. "rn") != NULL))) {
  159. if (tessedit_process_rns
  160. && strstr (word->best_choice->string ().string (), "rn") != NULL) {
  161. copy_outword = *(word->outword);
  162. copy_blob_it.set_to_list (copy_outword.blob_list ());
  163. i = 0;
  164. while (word->best_choice->string ()[i] != '\0') {
  165. if (word->best_choice->string ()[i] == 'r'
  166. && word->best_choice->string ()[i + 1] == 'n') {
  167. copy_outline_it.set_to_list (copy_blob_it.data ()->
  168. out_list ());
  169. copy_outline_it.add_list_after (copy_blob_it.
  170. data_relative (1)->
  171. out_list ());
  172. copy_blob_it.forward ();
  173. delete (copy_blob_it.extract ());
  174. i++;
  175. }
  176. copy_blob_it.forward ();
  177. i++;
  178. }
  179. }
  180. else
  181. copy_outword = *(word->outword);
  182. copy_outword.baseline_denormalise (&word->denorm);
  183. char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
  184. pixrow_it.set_to_list (pixrow_list);
  185. pixrow_it.move_to_first ();
  186. blob_it.move_to_first ();
  187. for (i = 0;
  188. word->best_choice->string ()[i] != '\0';
  189. i++, pixrow_it.forward (), blob_it.forward ()) {
  190. if (word->best_choice->string ()[i] == 'm'
  191. || (word->best_choice->string ()[i] == 'r'
  192. && word->best_choice->string ()[i + 1] == 'n')) {
  193. #ifndef SECURE_NAMES
  194. if (tessedit_cluster_debug)
  195. tprintf ("Sample %c for adaption found in %s, index %d\n",
  196. word->best_choice->string ()[i],
  197. word->best_choice->string ().string (), i);
  198. #endif
  199. if (tessedit_matrix_match) {
  200. sample = clip_sample (pixrow_it.data (),
  201. imlines,
  202. pix_box,
  203. copy_outword.flag (W_INVERSE),
  204. word->best_choice->string ()[i]);
  205. if (sample == NULL) { //Clip failed
  206. #ifndef SECURE_NAMES
  207. tprintf ("Unable to clip sample from %s, index %d\n",
  208. word->best_choice->string ().string (), i);
  209. #endif
  210. if (word->best_choice->string ()[i] == 'r')
  211. i++;
  212. continue;
  213. }
  214. }
  215. else
  216. sample = new CHAR_SAMPLE (blob_it.data (),
  217. &word->denorm,
  218. word->best_choice->string ()[i]);
  219. cluster_sample(sample, char_clusters, chars_waiting);
  220. if (word->best_choice->string ()[i] == 'r')
  221. i++; // Skip next character
  222. }
  223. }
  224. delete[]imlines; // Free array of imlines
  225. delete pixrow_list;
  226. }
  227. }
  228. void collect_characters_for_adaption(WERD_RES *word,
  229. CHAR_SAMPLES_LIST *char_clusters,
  230. CHAR_SAMPLE_LIST *chars_waiting) {
  231. PBLOB_LIST *blobs = word->outword->blob_list ();
  232. PBLOB_IT blob_it(blobs);
  233. inT16 i;
  234. CHAR_SAMPLE *sample;
  235. PIXROW_LIST *pixrow_list;
  236. PIXROW_IT pixrow_it;
  237. IMAGELINE *imlines; // lines of the image
  238. TBOX pix_box; // box of imlines
  239. // extent
  240. WERD copy_outword; // copy to denorm
  241. inT32 resolution = page_image.get_res ();
  242. if (word->word->bounding_box ().height () > resolution / 3)
  243. return;
  244. if (tessedit_demo_adaption)
  245. // Make sure not set
  246. tessedit_display_mm.set_value (FALSE);
  247. if ((word_adaptable (word, tessedit_cluster_adaption_mode)
  248. && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) {
  249. if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
  250. return; // Reject map set to acceptable
  251. /* Collect information about good matches */
  252. copy_outword = *(word->outword);
  253. copy_outword.baseline_denormalise (&word->denorm);
  254. char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
  255. pixrow_it.set_to_list (pixrow_list);
  256. pixrow_it.move_to_first ();
  257. blob_it.move_to_first ();
  258. for (i = 0;
  259. word->best_choice->string ()[i] != '\0';
  260. i++, pixrow_it.forward (), blob_it.forward ()) {
  261. if (!(tessedit_mm_use_non_adaption_set
  262. && STRING (tessedit_non_adaption_set).contains (word->
  263. best_choice->
  264. string ()[i]))
  265. || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
  266. #ifndef SECURE_NAMES
  267. if (tessedit_cluster_debug)
  268. tprintf ("Sample %c for adaption found in %s, index %d\n",
  269. word->best_choice->string ()[i],
  270. word->best_choice->string ().string (), i);
  271. #endif
  272. sample = clip_sample (pixrow_it.data (),
  273. imlines,
  274. pix_box,
  275. copy_outword.flag (W_INVERSE),
  276. word->best_choice->string ()[i]);
  277. if (sample == NULL) { //Clip failed
  278. #ifndef SECURE_NAMES
  279. tprintf ("Unable to clip sample from %s, index %d\n",
  280. word->best_choice->string ().string (), i);
  281. #endif
  282. continue;
  283. }
  284. cluster_sample(sample, char_clusters, chars_waiting);
  285. }
  286. }
  287. delete[]imlines; // Free array of imlines
  288. delete pixrow_list;
  289. }
  290. else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
  291. // Set word to all rejects
  292. word->reject_map.rej_word_tess_failure ();
  293. }
  294. void cluster_sample(CHAR_SAMPLE *sample,
  295. CHAR_SAMPLES_LIST *char_clusters,
  296. CHAR_SAMPLE_LIST *chars_waiting) {
  297. CHAR_SAMPLES *best_cluster = NULL;
  298. CHAR_SAMPLES_IT c_it = char_clusters;
  299. CHAR_SAMPLE_IT cw_it = chars_waiting;
  300. float score;
  301. float best_score = MAX_INT32;
  302. if (c_it.empty ())
  303. c_it.add_to_end (new CHAR_SAMPLES (sample));
  304. else {
  305. for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
  306. score = c_it.data ()->match_score (sample);
  307. if (score < best_score) {
  308. best_score = score;
  309. best_cluster = c_it.data ();
  310. }
  311. }
  312. if (tessedit_cluster_debug)
  313. tprintf ("Sample's best score %f\n", best_score);
  314. if (best_score < tessedit_cluster_t1) {
  315. if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) {
  316. best_cluster->add_sample (sample);
  317. check_wait_list(chars_waiting, sample, best_cluster);
  318. #ifndef SECURE_NAMES
  319. if (tessedit_cluster_debug)
  320. tprintf ("Sample added to an existing cluster\n");
  321. #endif
  322. }
  323. else {
  324. #ifndef SECURE_NAMES
  325. if (tessedit_cluster_debug)
  326. tprintf
  327. ("Sample dropped, good match to an existing cluster\n");
  328. #endif
  329. }
  330. }
  331. else if (best_score > tessedit_cluster_t2) {
  332. c_it.add_to_end (new CHAR_SAMPLES (sample));
  333. #ifndef SECURE_NAMES
  334. if (tessedit_cluster_debug)
  335. tprintf ("New cluster created for this sample\n");
  336. #endif
  337. }
  338. else {
  339. cw_it.add_to_end (sample);
  340. if (tessedit_cluster_debug)
  341. tprintf ("Sample added to the wait list\n");
  342. }
  343. }
  344. }
  345. void check_wait_list(CHAR_SAMPLE_LIST *chars_waiting,
  346. CHAR_SAMPLE *sample,
  347. CHAR_SAMPLES *best_cluster) {
  348. CHAR_SAMPLE *wait_sample;
  349. CHAR_SAMPLE *test_sample = sample;
  350. CHAR_SAMPLE_IT cw_it = chars_waiting;
  351. CHAR_SAMPLE_LIST add_list; //Samples added to best cluster
  352. CHAR_SAMPLE_IT add_it = &add_list;
  353. float score;
  354. add_list.clear ();
  355. if (!cw_it.empty ()) {
  356. do {
  357. if (!add_list.empty ()) {
  358. add_it.forward ();
  359. test_sample = add_it.extract ();
  360. best_cluster->add_sample (test_sample);
  361. }
  362. for (cw_it.mark_cycle_pt ();
  363. !cw_it.cycled_list (); cw_it.forward ()) {
  364. wait_sample = cw_it.data ();
  365. if (tessedit_mm_use_prototypes)
  366. score = best_cluster->match_score (wait_sample);
  367. else
  368. score = sample->match_sample (wait_sample, FALSE);
  369. if (score < tessedit_cluster_t1) {
  370. if (score > tessedit_cluster_t3
  371. || tessedit_mm_use_prototypes) {
  372. add_it.add_after_stay_put (cw_it.extract ());
  373. #ifndef SECURE_NAMES
  374. if (tessedit_cluster_debug)
  375. tprintf
  376. ("Wait sample added to an existing cluster\n");
  377. #endif
  378. }
  379. else {
  380. #ifndef SECURE_NAMES
  381. if (tessedit_cluster_debug)
  382. tprintf
  383. ("Wait sample dropped, good match to an existing cluster\n");
  384. #endif
  385. }
  386. }
  387. }
  388. }
  389. while (!add_list.empty ());
  390. }
  391. }
  392. void complete_clustering(CHAR_SAMPLES_LIST *char_clusters,
  393. CHAR_SAMPLE_LIST *chars_waiting) {
  394. CHAR_SAMPLES *best_cluster;
  395. CHAR_SAMPLES_IT c_it = char_clusters;
  396. CHAR_SAMPLE_IT cw_it = chars_waiting;
  397. CHAR_SAMPLE *sample;
  398. inT32 total_sample_count = 0;
  399. while (!cw_it.empty ()) {
  400. cw_it.move_to_first ();
  401. sample = cw_it.extract ();
  402. best_cluster = new CHAR_SAMPLES (sample);
  403. c_it.add_to_end (best_cluster);
  404. check_wait_list(chars_waiting, sample, best_cluster);
  405. }
  406. for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
  407. c_it.data ()->assign_to_char ();
  408. if (tessedit_use_best_sample)
  409. c_it.data ()->find_best_sample ();
  410. else if (tessedit_mm_adapt_using_prototypes)
  411. c_it.data ()->build_prototype ();
  412. if (tessedit_cluster_debug)
  413. total_sample_count += c_it.data ()->n_samples ();
  414. }
  415. #ifndef SECURE_NAMES
  416. if (tessedit_cluster_debug)
  417. tprintf ("Clustering completed, %d samples in all\n", total_sample_count);
  418. #endif
  419. #ifndef GRAPHICS_DISABLED
  420. if (tessedit_demo_adaption)
  421. display_cluster_prototypes(char_clusters);
  422. #endif
  423. }
  424. void adapt_to_good_ems(WERD_RES *word,
  425. CHAR_SAMPLES_LIST *char_clusters,
  426. CHAR_SAMPLE_LIST *chars_waiting) {
  427. PBLOB_LIST *blobs = word->outword->blob_list ();
  428. PBLOB_IT blob_it(blobs);
  429. inT16 i;
  430. CHAR_SAMPLE *sample;
  431. CHAR_SAMPLES_IT c_it = char_clusters;
  432. CHAR_SAMPLE_IT cw_it = chars_waiting;
  433. float score;
  434. float best_score;
  435. char best_char;
  436. CHAR_SAMPLES *best_cluster;
  437. PIXROW_LIST *pixrow_list;
  438. PIXROW_IT pixrow_it;
  439. IMAGELINE *imlines; // lines of the image
  440. TBOX pix_box; // box of imlines
  441. // extent
  442. WERD copy_outword; // copy to denorm
  443. TBOX b_box;
  444. PBLOB_IT copy_blob_it;
  445. OUTLINE_IT copy_outline_it;
  446. PIXROW *pixrow = NULL;
  447. static inT32 word_number = 0;
  448. #ifndef GRAPHICS_DISABLED
  449. ScrollView* demo_win = NULL;
  450. #endif
  451. inT32 resolution = page_image.get_res ();
  452. if (word->word->bounding_box ().height () > resolution / 3)
  453. return;
  454. word_number++;
  455. if (strchr (word->best_choice->string ().string (), 'm') == NULL
  456. && (tessedit_process_rns
  457. && strstr (word->best_choice->string ().string (), "rn") == NULL))
  458. return;
  459. if (tessedit_reject_ems)
  460. reject_all_ems(word);
  461. else if (tessedit_reject_suspect_ems)
  462. reject_suspect_ems(word);
  463. else {
  464. if (char_clusters->length () == 0) {
  465. #ifndef SECURE_NAMES
  466. if (tessedit_cluster_debug)
  467. tprintf ("No clusters to use for em adaption\n");
  468. #endif
  469. return;
  470. }
  471. if (!cw_it.empty ()) {
  472. complete_clustering(char_clusters, chars_waiting);
  473. print_em_stats(char_clusters, chars_waiting);
  474. }
  475. if ((!word_adaptable (word, tessedit_em_adaption_mode) ||
  476. word->reject_map.reject_count () != 0)
  477. && (strchr (word->best_choice->string ().string (), 'm') != NULL
  478. || (tessedit_process_rns
  479. && strstr (word->best_choice->string ().string (),
  480. "rn") != NULL))) {
  481. if (tessedit_process_rns
  482. && strstr (word->best_choice->string ().string (),
  483. "rn") != NULL) {
  484. copy_outword = *(word->outword);
  485. copy_blob_it.set_to_list (copy_outword.blob_list ());
  486. i = 0;
  487. while (word->best_choice->string ()[i] != '\0') {
  488. if (word->best_choice->string ()[i] == 'r'
  489. && word->best_choice->string ()[i + 1] == 'n') {
  490. copy_outline_it.set_to_list (copy_blob_it.data ()->
  491. out_list ());
  492. copy_outline_it.add_list_after (copy_blob_it.
  493. data_relative (1)->
  494. out_list ());
  495. copy_blob_it.forward ();
  496. delete (copy_blob_it.extract ());
  497. i++;
  498. }
  499. copy_blob_it.forward ();
  500. i++;
  501. }
  502. }
  503. else
  504. copy_outword = *(word->outword);
  505. copy_outword.baseline_denormalise (&word->denorm);
  506. copy_blob_it.set_to_list (copy_outword.blob_list ());
  507. char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
  508. pixrow_it.set_to_list (pixrow_list);
  509. pixrow_it.move_to_first ();
  510. // For debugging only
  511. b_box = copy_outword.bounding_box ();
  512. pixrow = pixrow_it.data ();
  513. blob_it.move_to_first ();
  514. copy_blob_it.move_to_first ();
  515. for (i = 0;
  516. word->best_choice->string ()[i] != '\0';
  517. i++, pixrow_it.forward (), blob_it.forward (),
  518. copy_blob_it.forward ()) {
  519. if ((word->best_choice->string ()[i] == 'm'
  520. || (word->best_choice->string ()[i] == 'r'
  521. && word->best_choice->string ()[i + 1] == 'n'))
  522. && !word->reject_map[i].perm_rejected ()) {
  523. if (tessedit_cluster_debug)
  524. tprintf ("Sample %c to check found in %s, index %d\n",
  525. word->best_choice->string ()[i],
  526. word->best_choice->string ().string (), i);
  527. if (tessedit_demo_adaption)
  528. tprintf
  529. ("Sample %c to check found in %s (%d), index %d\n",
  530. word->best_choice->string ()[i],
  531. word->best_choice->string ().string (), word_number,
  532. i);
  533. if (tessedit_matrix_match) {
  534. TBOX copy_box = copy_blob_it.data ()->bounding_box ();
  535. sample = clip_sample (pixrow_it.data (),
  536. imlines,
  537. pix_box,
  538. copy_outword.flag (W_INVERSE),
  539. word->best_choice->string ()[i]);
  540. //Clip failed
  541. if (sample == NULL) {
  542. tprintf
  543. ("Unable to clip sample from %s, index %d\n",
  544. word->best_choice->string ().string (), i);
  545. #ifndef SECURE_NAMES
  546. if (tessedit_cluster_debug)
  547. tprintf ("Sample rejected (no sample)\n");
  548. #endif
  549. word->reject_map[i].setrej_mm_reject ();
  550. if (word->best_choice->string ()[i] == 'r') {
  551. word->reject_map[i + 1].setrej_mm_reject ();
  552. i++;
  553. }
  554. continue;
  555. }
  556. }
  557. else
  558. sample = new CHAR_SAMPLE (blob_it.data (),
  559. &word->denorm,
  560. word->best_choice->
  561. string ()[i]);
  562. best_score = MAX_INT32;
  563. best_char = '\0';
  564. best_cluster = NULL;
  565. for (c_it.mark_cycle_pt ();
  566. !c_it.cycled_list (); c_it.forward ()) {
  567. if (c_it.data ()->character () != '\0') {
  568. score = c_it.data ()->match_score (sample);
  569. if (score < best_score) {
  570. best_cluster = c_it.data ();
  571. best_score = score;
  572. best_char = c_it.data ()->character ();
  573. }
  574. }
  575. }
  576. if (best_score > tessedit_cluster_t1) {
  577. #ifndef SECURE_NAMES
  578. if (tessedit_cluster_debug)
  579. tprintf ("Sample rejected (score %f)\n", best_score);
  580. if (tessedit_demo_adaption)
  581. tprintf ("Sample rejected (score %f)\n", best_score);
  582. #endif
  583. word->reject_map[i].setrej_mm_reject ();
  584. if (word->best_choice->string ()[i] == 'r')
  585. word->reject_map[i + 1].setrej_mm_reject ();
  586. }
  587. else {
  588. if (word->best_choice->string ()[i] == best_char) {
  589. #ifndef SECURE_NAMES
  590. if (tessedit_cluster_debug)
  591. tprintf ("Sample accepted (score %f)\n",
  592. best_score);
  593. if (tessedit_demo_adaption)
  594. tprintf ("Sample accepted (score %f)\n",
  595. best_score);
  596. #endif
  597. word->reject_map[i].setrej_mm_accept ();
  598. if (word->best_choice->string ()[i] == 'r')
  599. word->reject_map[i + 1].setrej_mm_accept ();
  600. }
  601. else {
  602. #ifndef SECURE_NAMES
  603. if (tessedit_cluster_debug)
  604. tprintf ("Sample rejected (char %c, score %f)\n",
  605. best_char, best_score);
  606. if (tessedit_demo_adaption)
  607. tprintf ("Sample rejected (char %c, score %f)\n",
  608. best_char, best_score);
  609. #endif
  610. word->reject_map[i].setrej_mm_reject ();
  611. if (word->best_choice->string ()[i] == 'r')
  612. word->reject_map[i + 1].setrej_mm_reject ();
  613. }
  614. }
  615. if (tessedit_demo_adaption) {
  616. if (strcmp (imagebasename.string (),
  617. tessedit_demo_file.string ()) != 0
  618. || word_number == tessedit_demo_word1
  619. || word_number == tessedit_demo_word2) {
  620. #ifndef GRAPHICS_DISABLED
  621. demo_win =
  622. display_clip_image(&copy_outword,
  623. page_image,
  624. pixrow_list,
  625. pix_box);
  626. #endif
  627. demo_word = word_number;
  628. best_cluster->match_score (sample);
  629. demo_word = 0;
  630. }
  631. }
  632. if (word->best_choice->string ()[i] == 'r')
  633. i++; // Skip next character
  634. }
  635. }
  636. delete[]imlines; // Free array of imlines
  637. delete pixrow_list;
  638. }
  639. }
  640. }
  641. void adapt_to_good_samples(WERD_RES *word,
  642. CHAR_SAMPLES_LIST *char_clusters,
  643. CHAR_SAMPLE_LIST *chars_waiting) {
  644. PBLOB_LIST *blobs = word->outword->blob_list ();
  645. PBLOB_IT blob_it(blobs);
  646. inT16 i;
  647. CHAR_SAMPLE *sample;
  648. CHAR_SAMPLES_IT c_it = char_clusters;
  649. CHAR_SAMPLE_IT cw_it = chars_waiting;
  650. float score;
  651. float best_score;
  652. char best_char;
  653. CHAR_SAMPLES *best_cluster;
  654. PIXROW_LIST *pixrow_list;
  655. PIXROW_IT pixrow_it;
  656. IMAGELINE *imlines; // lines of the image
  657. TBOX pix_box; // box of imlines
  658. // extent
  659. WERD copy_outword; // copy to denorm
  660. TBOX b_box;
  661. PBLOB_IT copy_blob_it;
  662. PIXROW *pixrow = NULL;
  663. static inT32 word_number = 0;
  664. #ifndef GRAPHICS_DISABLED
  665. ScrollView* demo_win = NULL;
  666. #endif
  667. inT32 resolution = page_image.get_res ();
  668. word_number++;
  669. if (tessedit_test_cluster_input)
  670. return;
  671. if (word->word->bounding_box ().height () > resolution / 3)
  672. return;
  673. if (char_clusters->length () == 0) {
  674. #ifndef SECURE_NAMES
  675. if (tessedit_cluster_debug)
  676. tprintf ("No clusters to use for adaption\n");
  677. #endif
  678. return;
  679. }
  680. if (!cw_it.empty ()) {
  681. complete_clustering(char_clusters, chars_waiting);
  682. print_em_stats(char_clusters, chars_waiting);
  683. }
  684. if ((!word_adaptable (word, tessedit_cluster_adaption_mode)
  685. && word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) {
  686. if (tessedit_cluster_debug) {
  687. tprintf ("\nChecking: \"%s\" MAP ",
  688. word->best_choice->string ().string ());
  689. word->reject_map.print (debug_fp);
  690. tprintf ("\n");
  691. }
  692. copy_outword = *(word->outword);
  693. copy_outword.baseline_denormalise (&word->denorm);
  694. copy_blob_it.set_to_list (copy_outword.blob_list ());
  695. char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
  696. pixrow_it.set_to_list (pixrow_list);
  697. pixrow_it.move_to_first ();
  698. // For debugging only
  699. b_box = copy_outword.bounding_box ();
  700. pixrow = pixrow_it.data ();
  701. blob_it.move_to_first ();
  702. copy_blob_it.move_to_first ();
  703. for (i = 0;
  704. word->best_choice->string ()[i] != '\0';
  705. i++, pixrow_it.forward (), blob_it.forward (),
  706. copy_blob_it.forward ()) {
  707. if (word->reject_map[i].recoverable ()
  708. || (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) {
  709. TBOX copy_box = copy_blob_it.data ()->bounding_box ();
  710. if (tessedit_cluster_debug)
  711. tprintf ("Sample %c to check found in %s, index %d\n",
  712. word->best_choice->string ()[i],
  713. word->best_choice->string ().string (), i);
  714. if (tessedit_demo_adaption)
  715. tprintf ("Sample %c to check found in %s (%d), index %d\n",
  716. word->best_choice->string ()[i],
  717. word->best_choice->string ().string (),
  718. word_number, i);
  719. sample = clip_sample (pixrow_it.data (),
  720. imlines,
  721. pix_box,
  722. copy_outword.flag (W_INVERSE),
  723. word->best_choice->string ()[i]);
  724. if (sample == NULL) { //Clip failed
  725. tprintf ("Unable to clip sample from %s, index %d\n",
  726. word->best_choice->string ().string (), i);
  727. #ifndef SECURE_NAMES
  728. if (tessedit_cluster_debug)
  729. tprintf ("Sample rejected (no sample)\n");
  730. #endif
  731. word->reject_map[i].setrej_mm_reject ();
  732. continue;
  733. }
  734. best_score = MAX_INT32;
  735. best_char = '\0';
  736. best_cluster = NULL;
  737. for (c_it.mark_cycle_pt ();
  738. !c_it.cycled_list (); c_it.forward ()) {
  739. if (c_it.data ()->character () != '\0') {
  740. score = c_it.data ()->match_score (sample);
  741. if (score < best_score) {
  742. best_cluster = c_it.data ();
  743. best_score = score;
  744. best_char = c_it.data ()->character ();
  745. }
  746. }
  747. }
  748. if (best_score > tessedit_cluster_t1) {
  749. #ifndef SECURE_NAMES
  750. if (tessedit_cluster_debug)
  751. tprintf ("Sample rejected (score %f)\n", best_score);
  752. if (tessedit_demo_adaption)
  753. tprintf ("Sample rejected (score %f)\n", best_score);
  754. #endif
  755. word->reject_map[i].setrej_mm_reject ();
  756. }
  757. else {
  758. if (word->best_choice->string ()[i] == best_char) {
  759. #ifndef SECURE_NAMES
  760. if (tessedit_cluster_debug)
  761. tprintf ("Sample accepted (score %f)\n", best_score);
  762. if (tessedit_demo_adaption)
  763. tprintf ("Sample accepted (score %f)\n", best_score);
  764. #endif
  765. if (tessedit_test_adaption)
  766. word->reject_map[i].setrej_minimal_rej_accept ();
  767. else
  768. word->reject_map[i].setrej_mm_accept ();
  769. }
  770. else {
  771. #ifndef SECURE_NAMES
  772. if (tessedit_cluster_debug)
  773. tprintf ("Sample rejected (char %c, score %f)\n",
  774. best_char, best_score);
  775. if (tessedit_demo_adaption)
  776. tprintf ("Sample rejected (char %c, score %f)\n",
  777. best_char, best_score);
  778. #endif
  779. word->reject_map[i].setrej_mm_reject ();
  780. }
  781. }
  782. if (tessedit_demo_adaption) {
  783. if (strcmp (imagebasename.string (),
  784. tessedit_demo_file.string ()) != 0
  785. || word_number == tessedit_demo_word1
  786. || word_number == tessedit_demo_word2) {
  787. #ifndef GRAPHICS_DISABLED
  788. demo_win =
  789. display_clip_image(&copy_outword,
  790. page_image,
  791. pixrow_list,
  792. pix_box);
  793. #endif
  794. demo_word = word_number;
  795. best_cluster->match_score (sample);
  796. demo_word = 0;
  797. }
  798. }
  799. }
  800. }
  801. delete[]imlines; // Free array of imlines
  802. delete pixrow_list;
  803. if (tessedit_cluster_debug) {
  804. tprintf ("\nFinal: \"%s\" MAP ",
  805. word->best_choice->string ().string ());
  806. word->reject_map.print (debug_fp);
  807. tprintf ("\n");
  808. }
  809. }
  810. }
  811. void print_em_stats(CHAR_SAMPLES_LIST *char_clusters,
  812. CHAR_SAMPLE_LIST *chars_waiting) {
  813. CHAR_SAMPLES_IT c_it = char_clusters;
  814. if (!tessedit_cluster_debug)
  815. return;
  816. #ifndef SECURE_NAMES
  817. tprintf ("There are %d clusters and %d samples waiting\n",
  818. char_clusters->length (), chars_waiting->length ());
  819. for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ())
  820. c_it.data ()->print (debug_fp);
  821. #endif
  822. tprintf ("\n");
  823. }
  824. CHAR_SAMPLE *clip_sample( //lines of the image
  825. PIXROW *pixrow,
  826. IMAGELINE *imlines,
  827. TBOX pix_box, //box of imlines extent
  828. BOOL8 white_on_black,
  829. char c) {
  830. TBOX b_box = pixrow->bounding_box ();
  831. float baseline_pos = 0;
  832. inT32 resolution = page_image.get_res ();
  833. if (!b_box.null_box ()) {
  834. ASSERT_HOST (b_box.width () < page_image.get_xsize () &&
  835. b_box.height () < page_image.get_ysize ());
  836. if (b_box.width () > resolution || b_box.height () > resolution) {
  837. tprintf ("clip sample: sample too big (%d x %d)\n",
  838. b_box.width (), b_box.height ());
  839. return NULL;
  840. }
  841. IMAGE *image = new (IMAGE);
  842. if (image->create (b_box.width (), b_box.height (), 1) == -1) {
  843. tprintf ("clip sample: create image failed (%d x %d)\n",
  844. b_box.width (), b_box.height ());
  845. delete image;
  846. return NULL;
  847. }
  848. if (!white_on_black)
  849. invert_image(image); // Set background to white
  850. pixrow->char_clip_image (imlines, pix_box, NULL, *image, baseline_pos);
  851. if (white_on_black)
  852. invert_image(image); //invert white on black for scaling &NN
  853. return new CHAR_SAMPLE (image, c);
  854. }
  855. else
  856. return NULL;
  857. }
  858. #ifndef GRAPHICS_DISABLED
  859. void display_cluster_prototypes(CHAR_SAMPLES_LIST *char_clusters) {
  860. inT16 proto_number = 0;
  861. CHAR_SAMPLES_IT c_it = char_clusters;
  862. char title[WINDOWNAMESIZE];
  863. for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
  864. proto_number++;
  865. #ifndef SECURE_NAMES
  866. tprintf ("Displaying proto number %d\n", proto_number);
  867. #endif
  868. if (c_it.data ()->prototype () != NULL) {
  869. sprintf (title, "Proto - %d", proto_number);
  870. display_image (c_it.data ()->prototype ()->make_image (),
  871. title, (proto_number - 1) * 400, 0, FALSE);
  872. }
  873. }
  874. }
  875. #endif
  876. // *********************************************************************
  877. // Simplistic routines to test the effect of rejecting ems and fullstops
  878. // *********************************************************************
  879. void reject_all_ems(WERD_RES *word) {
  880. inT16 i;
  881. for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
  882. if (word->best_choice->string ()[i] == 'm')
  883. // reject all ems
  884. word->reject_map[i].setrej_mm_reject ();
  885. }
  886. }
  887. void reject_all_fullstops(WERD_RES *word) {
  888. inT16 i;
  889. for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
  890. if (word->best_choice->string ()[i] == '.')
  891. // reject all fullstops
  892. word->reject_map[i].setrej_mm_reject ();
  893. }
  894. }
  895. void reject_suspect_ems(WERD_RES *word) {
  896. inT16 i;
  897. if (!word_adaptable (word, tessedit_cluster_adaption_mode))
  898. for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
  899. if (word->best_choice->string ()[i] == 'm' && suspect_em (word, i))
  900. // reject all ems
  901. word->reject_map[i].setrej_mm_reject ();
  902. }
  903. }
  904. void reject_suspect_fullstops(WERD_RES *word) {
  905. inT16 i;
  906. for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
  907. if (word->best_choice->string ()[i] == '.'
  908. && suspect_fullstop (word, i))
  909. // reject all commas
  910. word->reject_map[i].setrej_mm_reject ();
  911. }
  912. }
  913. BOOL8 suspect_em(WERD_RES *word, inT16 index) {
  914. PBLOB_LIST *blobs = word->outword->blob_list ();
  915. PBLOB_IT blob_it(blobs);
  916. inT16 j;
  917. for (j = 0; j < index; j++)
  918. blob_it.forward ();
  919. return (blob_it.data ()->out_list ()->length () != 1);
  920. }
  921. BOOL8 suspect_fullstop(WERD_RES *word, inT16 i) {
  922. float aspect_ratio;
  923. PBLOB_LIST *blobs = word->outword->blob_list ();
  924. PBLOB_IT blob_it(blobs);
  925. inT16 j;
  926. TBOX box;
  927. inT16 width;
  928. inT16 height;
  929. for (j = 0; j < i; j++)
  930. blob_it.forward ();
  931. box = blob_it.data ()->bounding_box ();
  932. width = box.width ();
  933. height = box.height ();
  934. aspect_ratio = ((width > height) ? ((float) width) / height :
  935. ((float) height) / width);
  936. return (aspect_ratio > tessed_fullstop_aspect_ratio);
  937. }