PageRenderTime 35ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/FuzzyOcr-3.6.0/FuzzyOcr/Config.pm

#
Perl | 1032 lines | 900 code | 93 blank | 39 comment | 85 complexity | b8128d6cc6819c422d3184cc9264e0e9 MD5 | raw file
Possible License(s): Apache-2.0
  1. # <@LICENSE>
  2. # Licensed to the Apache Software Foundation (ASF) under one or more
  3. # contributor license agreements. See the NOTICE file distributed with
  4. # this work for additional information regarding copyright ownership.
  5. # The ASF licenses this file to you under the Apache License, Version 2.0
  6. # (the "License"); you may not use this file except in compliance with
  7. # the License. You may obtain a copy of the License at:
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. # </@LICENSE>
  17. use strict;
  18. package FuzzyOcr::Config;
  19. use lib qw(..);
  20. use FuzzyOcr::Logging qw(debuglog infolog warnlog errorlog);
  21. use FuzzyOcr::Scanset;
  22. use FuzzyOcr::Preprocessor;
  23. use Mail::SpamAssassin::Logger;
  24. use Fcntl qw(O_RDWR O_CREAT);
  25. use base 'Exporter';
  26. our @EXPORT_OK = qw/
  27. parse_config
  28. finish_parsing_end
  29. get_config
  30. set_config
  31. set_pid
  32. unset_pid
  33. kill_pid
  34. set_tmpdir
  35. get_tmpdir
  36. get_all_tmpdirs
  37. get_pms
  38. save_pms
  39. get_timeout
  40. get_scansets
  41. get_preprocessor
  42. get_thresholds
  43. get_wordlist
  44. get_mysql_ddb
  45. get_db_ref
  46. set_db_ref
  47. read_words
  48. /;
  49. use constant HAS_DBI => eval { require DBI; };
  50. use constant HAS_DBD_MYSQL => eval { require DBD::mysql; };
  51. use constant HAS_MLDBM => eval { require MLDBM; require MLDBM::Sync;};
  52. use constant HAS_DB_FILE => eval { require DB_File; };
  53. use constant HAS_STORABLE => eval { require Storable; };
  # Package-level state: defaults, parsed configuration and wordlists.
  our %Threshold = ();
  our %words     = ();
  our (@scansets, @preprocessors);
  our ($conf, $pms, $timeout, $pid, $tmpdir);
  our @tmpdirs;
  our $dbref;

  # State of the plugin: becomes true once the post-config work has been done.
  our $initialized = 0;

  # Helper programs; each one is configurable through focr_bin_<name>.
  our @bin_utils = qw(
      gifsicle giffix giftext gifinter
      giftopnm jpegtopnm pngtopnm bmptopnm tifftopnm
      ppmhist pamfile
      ocrad gocr
  );

  # Directories searched for the helper programs until focr_path_bin replaces them.
  our @paths = ('/usr/local/netpbm/bin', '/usr/local/bin', '/usr/bin');

  # Image types that get a focr_skip_<type> and focr_max_size_<type> setting.
  my @img_types = ('gif', 'png', 'jpeg', 'bmp', 'tiff');
  # Return the shared Mail::SpamAssassin::Timeout object, building it from the
  # focr_timeout setting the first time it is asked for.
  sub get_timeout {
      if (!defined $timeout) {
          my %args = (secs => $conf->{focr_timeout});
          $timeout = Mail::SpamAssassin::Timeout->new(\%args);
      }
      return $timeout;
  }
  # Remember the pid of the helper process that kill_pid() may have to stop.
  sub set_pid {
      ($pid) = @_;
      debuglog("Saved pid: $pid");
  }
  # Forget the saved helper pid (0 means "no process recorded").
  sub unset_pid {
      return $pid = 0;
  }
  # Terminate the helper process recorded by set_pid().
  #
  # Returns a two element list:
  #   (result of kill, pid)  when a pid was recorded
  #   (-1, 0)                when no pid is recorded
  sub kill_pid {
      return (-1, 0) unless $pid;

      infolog("Sending SIGTERM to pid: $pid",2);
      # Use the signal name so this does not depend on POSIX having been
      # loaded by some other module.
      my $ret = kill 'TERM', $pid;
      # Reap exactly the process we signalled so it does not linger as a
      # zombie; a bare wait() could reap (or block on) an unrelated child.
      waitpid($pid, 0);
      return ($ret, $pid);
  }
  # Make the given directory the current temp dir and add it to the list of
  # all temp dirs handed out so far.
  sub set_tmpdir {
      ($tmpdir) = @_;
      push @tmpdirs, $tmpdir;
  }
  # Return the temp dir most recently passed to set_tmpdir().
  sub get_tmpdir {
      return $tmpdir;
  }
  # Return every temp dir recorded so far (the number of them in scalar context).
  sub get_all_tmpdirs {
      return @tmpdirs;
  }
  # Store the object passed in so that get_pms() can hand it back later.
  sub save_pms {
      return $pms = $_[0];
  }
  # Return the object last stored with save_pms().
  sub get_pms {
      return $pms;
  }
  # Return the configuration object captured while parsing the config.
  sub get_config {
      return $conf;
  }
  # Return a reference to the live word => threshold table filled by read_words().
  sub get_wordlist {
      return \%words;
  }
  # Return a reference to the scanset list. With focr_autosort_scanset enabled
  # the list is first re-ordered in place, highest hit_counter first.
  sub get_scansets {
      @scansets = sort { $b->{hit_counter} <=> $a->{hit_counter} } @scansets
          if $conf->{focr_autosort_scanset};
      return \@scansets;
  }
  # Look up a loaded preprocessor by its label.
  # Returns the preprocessor object, or 0 when no preprocessor has that label.
  sub get_preprocessor {
      my ($label) = @_;
      for my $candidate (@preprocessors) {
          return $candidate if $candidate->{label} eq $label;
      }
      return 0;
  }
  # Return a reference to the live table of focr_threshold_* values.
  sub get_thresholds {
      return \%Threshold;
  }
  # Store the database reference that get_db_ref() returns.
  sub set_db_ref {
      return $dbref = $_[0];
  }
  # Return the database reference last stored with set_db_ref().
  sub get_db_ref {
      return $dbref;
  }
  # Open a connection to the MySQL database described by the focr_mysql_*
  # settings.
  #
  # Returns the DBI handle, or undef when DBI / DBD::mysql are not installed
  # or DBI->connect did not return a handle.
  sub get_mysql_ddb {
      return undef unless (HAS_DBI and HAS_DBD_MYSQL);

      my $cfg = get_config();

      # A configured socket takes precedence over host/port; the port is only
      # added to the DSN when it differs from the MySQL default.
      my @dsn_parts = ("dbi:mysql:database=" . $cfg->{focr_mysql_db});
      if (defined($cfg->{focr_mysql_socket})) {
          push @dsn_parts, "mysql_socket=" . $cfg->{focr_mysql_socket};
      } else {
          push @dsn_parts, "host=" . $cfg->{focr_mysql_host};
          push @dsn_parts, "port=" . $cfg->{focr_mysql_port}
              if $cfg->{focr_mysql_port} != 3306;
      }
      my $dsn = join(';', @dsn_parts);

      debuglog("Connecting to: $dsn");
      my $ddb = DBI->connect(
          $dsn,
          $cfg->{focr_mysql_user},
          $cfg->{focr_mysql_pass},
          { AutoCommit => 1 },
      );
      return $ddb;
  }
  # Register every focr_* setting with the SpamAssassin configuration parser.
  #
  #   $self - the caller's object (not used here)
  #   $conf - configuration object; the command list is handed to
  #           $conf->{parser}->register_commands
  #
  # Returns whatever register_commands returns.
  sub set_config {
      my ($self, $conf) = @_;

      my $NUMERIC = $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC;
      my $BOOL    = $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL;
      my $STRING  = $Mail::SpamAssassin::Conf::CONF_TYPE_STRING;

      # Build a typed command; the 'default' key only exists when a default
      # value is supplied.
      my $typed = sub {
          my ($type, $setting, @default) = @_;
          return {
              setting => $setting,
              (@default ? (default => $default[0]) : ()),
              type    => $type,
          };
      };
      my $num  = sub { return $typed->($NUMERIC, @_) };
      my $bool = sub { return $typed->($BOOL,    @_) };
      my $str  = sub { return $typed->($STRING,  @_) };

      # Build a command with its own validator: the value must be present and
      # match $allowed, and is then stored as a number.
      my $checked = sub {
          my ($setting, $default, $allowed) = @_;
          return {
              setting => $setting,
              default => $default,
              code    => sub {
                  my ($self, $key, $value, $line) = @_;
                  unless (defined $value && $value !~ m/^$/) {
                      return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
                  }
                  unless ($value =~ $allowed) {
                      return $Mail::SpamAssassin::Conf::INVALID_VALUE;
                  }
                  $self->{$setting} = $value+0;
              },
          };
      };

      my @cmds;

      # Thresholds and image dimension limits
      push @cmds, $num->('focr_threshold_' . $_, 0.01) foreach (qw/s h w cn/);
      push @cmds, $num->('focr_threshold_' . $_, 5)    foreach (qw/c max_hash/);
      foreach my $dim (qw/height width/) {
          push @cmds,
              $num->('focr_min_' . $dim, 4),
              $num->('focr_max_' . $dim, 800);
      }

      push @cmds,
          $num->('focr_threshold', 0.25),
          $num->('focr_counts_required', 2),
          $checked->('focr_verbose', 1, qr/^[0-9]+$/),
          $num->('focr_timeout', 10),
          $bool->('focr_global_timeout', 0),
          $str->('focr_logfile'),
          $bool->('focr_log_stderr', 1),
          $bool->('focr_log_pmsinfo', 1),
          $checked->('focr_enable_image_hashing', 0, qr/^[0123]$/),
          $bool->('focr_hashing_learn_scanned', 1),
          $bool->('focr_skip_updates', 0),
          $str->('focr_digest_db', "/etc/mail/spamassassin/FuzzyOcr.hashdb"),
          $str->('focr_global_wordlist', "/etc/mail/spamassassin/FuzzyOcr.words"),
          $str->('focr_personal_wordlist', "__userstate__/FuzzyOcr.words"),
          $bool->('focr_no_homedirs', 0),
          $str->('focr_db_hash', "/etc/mail/spamassassin/FuzzyOcr.db"),
          $str->('focr_db_safe', "/etc/mail/spamassassin/FuzzyOcr.safe.db"),
          $num->('focr_db_max_days', 35),
          $checked->('focr_keep_bad_images', 0, qr/^[012]$/),
          $num->('focr_strip_numbers', 1),
          $num->('focr_twopass_scoring_factor', 1.5),
          $bool->('focr_unique_matches', 0),
          $bool->('focr_score_ham', 0),
          $num->('focr_base_score', 5),
          $num->('focr_add_score', 1),
          $num->('focr_corrupt_score', 2.5),
          $num->('focr_corrupt_unfixable_score', 5),
          $num->('focr_wrongctype_score', 1.5),
          $num->('focr_wrongext_score', 1.5),
          $num->('focr_autodisable_score', 10),
          $num->('focr_autodisable_negative_score', -5),
          $str->('focr_path_bin', '/usr/local/netpbm/bin:/usr/local/bin:/usr/bin');

      # One path setting per helper program
      push @cmds, $str->('focr_bin_' . $_) foreach (@bin_utils);

      # Per image type switches
      foreach my $img (@img_types) {
          push @cmds,
              $bool->('focr_skip_' . $img, 0),
              $num->('focr_max_size_' . $img);
      }

      push @cmds,
          $bool->('focr_scan_pdfs', 0),
          $num->('focr_pdf_maxpages', 1),
          $str->('focr_scanset_file', '/etc/mail/spamassassin/FuzzyOcr.scansets'),
          $str->('focr_preprocessor_file', '/etc/mail/spamassassin/FuzzyOcr.preps'),
          $bool->('focr_minimal_scanset', 1),
          $bool->('focr_autosort_scanset', 1),
          $num->('focr_autosort_buffer', 10),
          $str->('focr_mysql_host', 'localhost'),
          $num->('focr_mysql_port', 3306),
          $str->('focr_mysql_socket'),
          $str->('focr_mysql_db', 'FuzzyOcr'),
          $str->('focr_mysql_hash', 'Hash'),
          $str->('focr_mysql_safe', 'Safe'),
          $bool->('focr_mysql_update_hash', 0);
      push @cmds, $str->('focr_mysql_' . $_, 'fuzzyocr') foreach (qw/user pass/);

      $conf->{parser}->register_commands(\@cmds);
  }
  # Handle the two configuration keys this module parses itself.
  #
  #   focr_end_config  - loads the preprocessor and scanset definition files
  #   focr_bin_helper  - comma separated list of additional helper programs;
  #                      a focr_bin_<name> setting is registered for each new one
  #
  #   $self - the caller's object (not used here)
  #   $opts - hash ref with {key}, {value} and {conf}
  #
  # Returns 1 when the key was handled (or the plugin is already initialized)
  # and 0 when the key is unknown or a definition file could not be parsed.
  sub parse_config {
      my ($self, $opts) = @_;

      # Don't parse a config twice
      return 1 if $initialized;

      my $key = $opts->{key};

      if ($key eq 'focr_end_config') {
          $conf = $opts->{conf};
          my $retcode;

          # Parse preprocessor file
          my $pfile = $conf->{'focr_preprocessor_file'};
          infolog("Starting preprocessor parser for file \"$pfile\"...");
          ($retcode, @preprocessors) = parse_preprocessors($pfile);
          if ($retcode) {
              errorlog("Error parsing preprocessor file \"$pfile\", aborting...");
              return 0;
          }

          # Parse scanset file
          my $sfile = $conf->{'focr_scanset_file'};
          infolog("Starting scanset parser for file \"$sfile\"...");
          ($retcode, @scansets) = parse_scansets($sfile);
          if ($retcode) {
              errorlog("Error parsing scanset file \"$sfile\", aborting...");
              return 0;
          }
          return 1;
      }

      if ($key eq 'focr_bin_helper') {
          $conf = $opts->{conf};
          my @cmd;
          my $val = Mail::SpamAssassin::Util::untaint_var($opts->{value});
          $val = '' unless defined $val;
          $val =~ s/\s+//g;
          debuglog("focr_bin_helper: '$val'");
          foreach my $bin (split(',', $val)) {
              # Ignore empty entries such as the one in "a,,b"
              next unless length $bin;
              # Compare whole names: a regex match would treat "ocr" as a
              # duplicate of "gocr" and interpret metacharacters in the name.
              if (grep { $_ eq $bin } @bin_utils) {
                  warnlog("$bin is already defined, skipping...");
                  next;
              }
              push @bin_utils, $bin;
              push @cmd, {
                  setting => 'focr_bin_'.$bin,
                  type    => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING
              };
          }
          if (scalar(@cmd)>0) {
              infolog("Adding <".scalar(@cmd)."> new helper apps");
              $conf->{parser}->register_commands(\@cmd);
          }
          return 1;
      }

      return 0;
  }
  # Post-configuration work, run once after the whole config has been read:
  # locate the helper binaries, collect the thresholds, resolve the commands
  # of preprocessors and scansets, check the image hashing dependencies,
  # update the hash databases and load the global wordlist.
  #
  #   $self - the caller's object (not used here)
  #   $opts - hash ref; {conf} is the configuration object
  #
  # Returns 1.
  sub finish_parsing_end {
      my ($self, $opts) = @_;

      # Don't call this function twice
      return 1 if $initialized;

      $conf = $opts->{conf};

      # find external binaries
      @paths = split(/:/, $conf->{focr_path_bin});
      infolog("Searching in: $_") foreach @paths;
      foreach my $util (@bin_utils) {
          my $key = "focr_bin_$util";
          if (defined $conf->{$key} and ! -x $conf->{$key}) {
              infolog("cannot exec $util, removing...");
              delete $conf->{$key};
          }
          if (defined $conf->{$key}) {
              $conf->{$key} = Mail::SpamAssassin::Util::untaint_var($conf->{$key});
              debuglog("Using $util => $conf->{$key}");
              next;
          }
          # Not configured (or not executable): take the first hit on the path
          foreach my $dir (@paths) {
              my $candidate = "$dir/$util";
              next unless -x $candidate;
              $conf->{$key} = $candidate;
              last;
          }
          if (defined $conf->{$key}) {
              infolog("Using $util => $conf->{$key}");
          } else {
              warnlog("Cannot find executable for $util");
          }
      }

      # Allow scanning if in debug mode?
      $conf->{focr_autodisable_score} = 1000
          if $Mail::SpamAssassin::Logger::LOG_SA{level} == 3;

      # Extract Thresholds
      foreach my $k (keys %{$conf}) {
          if ($k =~ m/^focr_threshold_(\S+)/) {
              $Threshold{$1} = $conf->{$k};
              debuglog("Threshold[$1] => $conf->{$k}");
          }
      }

      # Display All Options (binaries and thresholds were logged above, the
      # MySQL password is never logged)
      foreach my $k (sort keys %{$conf}) {
          next unless $k =~ m/^focr_/;
          next if $k =~ m/^focr_bin_/;
          next if $k =~ m/^focr_mysql_pass/;
          next if $k =~ m/^focr_threshold_/;
          debuglog(" $k => ".$conf->{$k});
      }

      unless (@scansets) {
          # The keyword handled by parse_config is "focr_end_config"
          warn("No scansets loaded, did you remove the \"focr_end_config\" line at the end of the .cf file?");
      }

      _focr_resolve_commands('Loaded preprocessor', @preprocessors);
      _focr_resolve_commands('Using scan', @scansets);

      my $has_mldbm = (HAS_MLDBM and HAS_DB_FILE and HAS_STORABLE) ? 1 : 0;

      if ($conf->{focr_enable_image_hashing} == 3) {
          unless (HAS_DBI and HAS_DBD_MYSQL) {
              $conf->{focr_enable_image_hashing} = 0;
              errorlog("Disable Image Hashing");
              errorlog("Missing DBI") unless HAS_DBI;
              errorlog("Missing DBD::mysql") unless HAS_DBD_MYSQL;
          }
          # Warn if MLDBM databases are present, but can't be imported
          if ((-r $conf->{focr_db_hash} or -r $conf->{focr_db_safe}) and !$has_mldbm) {
              infolog("Importing for MLDBM databases not available (dependencies missing)");
          }
      }
      if ($conf->{focr_enable_image_hashing} == 2) {
          unless ($has_mldbm) {
              $conf->{focr_enable_image_hashing} = 0;
              errorlog("Disable Image Hashing");
              errorlog("Missing MLDBM and/or MLDBM::Sync") unless HAS_MLDBM;
              errorlog("Missing DB_File") unless HAS_DB_FILE;
              errorlog("Missing Storable") unless HAS_STORABLE;
          }
      }

      unless ($conf->{focr_skip_updates}) {
          if ($conf->{focr_enable_image_hashing} == 2 and -r $conf->{focr_digest_db}) {
              _focr_update_mldbm();
          }
          # Check the cheap preconditions first so no MySQL connection is
          # opened (and left open) when there is nothing to import.
          if ($conf->{focr_enable_image_hashing} == 3 and $has_mldbm
              and (-r $conf->{focr_db_hash} or -r $conf->{focr_db_safe})) {
              my $ddb = get_mysql_ddb();
              if (defined $ddb) {
                  MLDBM->import(qw(DB_File Storable));
                  my $db = $conf->{focr_mysql_db};
                  _focr_import_into_mysql($ddb, $db, $conf->{focr_mysql_hash}, $conf->{focr_db_hash});
                  _focr_import_into_mysql($ddb, $db, $conf->{focr_mysql_safe}, $conf->{focr_db_safe});
                  debuglog("done updating MySQL database");
                  $ddb->disconnect;
              }
          }
      }

      read_words( $conf->{focr_global_wordlist} , 'Global');

      # Important: We parsed the config now and did all post config parsing stuff
      # don't do it again (for amavisd and other 3rd party applications using the SA API directly)
      $initialized = 1;
      return 1;
  }

  # Replace the command of each item with the full path of the matching
  # focr_bin_* helper when one is known, then log the resulting command line.
  # A leading "$" in the command name is ignored for the lookup.
  sub _focr_resolve_commands {
      my ($logprefix, @items) = @_;
      foreach my $item (@items) {
          my $off    = ($item->{command} =~ m/^\$/) ? 1 : 0;
          my $binkey = 'focr_bin_'.substr($item->{command}, $off);
          #Replace command with full path if known
          $item->{command} = $conf->{$binkey} if defined $conf->{$binkey};
          my $cmdline = $item->{command};
          $cmdline .= ' ' . $item->{args} if defined $item->{args};
          infolog("$logprefix $item->{label}: $cmdline");
      }
      return;
  }

  # Delete every record of the (tied) hash whose {check} time is older than
  # $cutoff. Returns the number of deleted records.
  sub _focr_expire_hashes {
      my ($db, $cutoff) = @_;
      my $expired = 0;
      foreach my $k (keys %{$db}) {
          my $rec = $db->{$k};
          if ($rec->{check} < $cutoff) {
              infolog("Expire: <$k> Reason: $rec->{check} < $cutoff");
              delete $db->{$k};
              $expired++;
          }
      }
      infolog("Expired <$expired> Image Hashes after $conf->{focr_db_max_days} day(s)")
          if ($expired>0);
      return $expired;
  }

  # Maintain the MLDBM databases: expire old records from the BAD and GOOD
  # hash databases and import new entries from the digest file into the BAD one.
  sub _focr_update_mldbm {
      MLDBM->import(qw(DB_File Storable));
      my $cutoff = time - ($conf->{focr_db_max_days}*86400);

      my %bad;
      my $sdbm = tie %bad, 'MLDBM::Sync', $conf->{focr_db_hash};
      if (!$sdbm) {
          errorlog("Could not open \"$conf->{focr_db_hash}\"");
      } else {
          $sdbm->Lock;
          infolog("Expiring records prior to: ".scalar(localtime($cutoff)));
          _focr_expire_hashes(\%bad, $cutoff);

          my $imported = 0;
          if (open(my $digest, '<', $conf->{focr_digest_db})) {
              while (my $line = <$digest>) {
                  chomp $line;
                  my ($score, $basic, $key) = split('::', $line, 3);
                  next unless defined $key;      # not a score::basic::key line
                  next if (defined $bad{$key});
                  my $stamp = time;
                  # MLDBM needs the whole record assigned in one go
                  $bad{$key} = {
                      score => $score,
                      basic => $basic,
                      input => $stamp,
                      check => $stamp,
                      match => 1,
                  };
                  $imported++;
              }
              close $digest;
          } else {
              errorlog("Could not open \"$conf->{focr_digest_db}\": $!");
          }
          infolog("Imported <$imported> Image Hashes from \"$conf->{focr_digest_db}\"")
              if ($imported>0);
          my $known = scalar(keys %bad);
          infolog("<$known> Known BAD Image Hashes Available");
          $sdbm->UnLock;
          undef $sdbm;
          untie %bad;
      }

      my %good;
      $sdbm = tie %good, 'MLDBM::Sync', $conf->{focr_db_safe};
      if (!$sdbm) {
          errorlog("Could not open \"$conf->{focr_db_safe}\"");
      } else {
          $sdbm->Lock;
          _focr_expire_hashes(\%good, $cutoff);
          my $known = scalar(keys %good);
          infolog("<$known> Known GOOD Image Hashes Available");
          $sdbm->UnLock;
          undef $sdbm;
          untie %good;
      }
      return;
  }

  # Copy every record of the MLDBM database $file that is not yet present in
  # the MySQL table $db.$tab into that table.
  #
  # The table name comes from the configuration and cannot be bound, but
  # every value read from the hash database (keys, file names, content
  # types, ...) is passed as a bind parameter instead of being pasted into
  # the statement.
  sub _focr_import_into_mysql {
      my ($ddb, $db, $tab, $file) = @_;

      my %DB;
      my $sdbm = tie %DB, 'MLDBM::Sync', $file;
      unless ($sdbm) {
          errorlog("Could not open \"$file\"");
          return;
      }

      my $select = "select score from $db.$tab where $tab.key=?";
      my $insert = "insert into $db.$tab values (?,?,?,?,?,?,?,?,?,?)";
      my $stored = 0;

      $sdbm->ReadLock;
      foreach my $k (keys %DB) {
          my $rec   = $DB{$k};
          my @found = $ddb->selectrow_array($select, undef, $k);
          next if scalar(@found)>0;

          # Numeric file type derived from the content type
          my $ctype = defined($rec->{ctype}) ? $rec->{ctype} : '';
          my $ftype = 0;
          if    ($ctype =~ m/gif/i)      { $ftype = 1; }
          elsif ($ctype =~ m/jpg|jpeg/i) { $ftype = 2; }
          elsif ($ctype =~ m/png/i)      { $ftype = 3; }
          elsif ($ctype =~ m/bmp/i)      { $ftype = 4; }
          elsif ($ctype =~ m/tiff/i)     { $ftype = 5; }

          # Missing fields are stored as empty strings
          my @values = (
              $k,
              (map { defined($rec->{$_}) ? $rec->{$_} : '' } qw/basic fname ctype/),
              $ftype,
              (map { defined($rec->{$_}) ? $rec->{$_} : '' } qw/match input check score dinfo/),
          );
          debuglog("$insert [key: $k]");
          if ($ddb->do($insert, undef, @values)) {
              $stored++;
          } else {
              errorlog("Could not store <$k> in $db.$tab");
          }
      }
      $sdbm->UnLock;
      undef $sdbm;
      untie %DB;
      infolog("Stored [$stored] Hashes in $db.$tab") if $stored>0;
      return;
  }
  # Load a wordlist file into the shared word => threshold table.
  #
  #   $_[0] - path of the wordlist; a file that does not exist is ignored
  #   $_[1] - name used in log messages (defaults to 'Personal')
  #
  # Each line holds one word, optionally followed by "::<threshold>" with a
  # threshold of the form 0 or 0.<digits>; text after "#" is a comment.
  # Words without an explicit threshold get focr_threshold, scaled down for
  # short words.
  sub read_words {
      my $wfile = $_[0];
      return unless ( -e $wfile );
      my $tfile = $_[1] || 'Personal';
      unless ( -r $wfile ) {
          warnlog("Cannot read $tfile wordlist: \"$wfile\"\n Please check file path and permissions are correct.");
          return;
      }
      my $wordlist;
      unless (open($wordlist, '<', $wfile)) {
          warnlog("Cannot open $tfile wordlist \"$wfile\": $!");
          return;
      }
      my $cnt = 0;
      while (my $w = <$wordlist>) {
          chomp($w);
          $w =~ s/#(.*)//;
          # Trim both ends, so "word   # comment" and CRLF files yield "word"
          $w =~ s/^\s+//;
          $w =~ s/\s+$//;
          next unless $w;
          my $wt = $conf->{focr_threshold};
          if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) {
              # The pattern only accepts a well-formed number, no re-check needed
              ($w, $wt) = (lc($1), $2);
          } else {
              $wt *= 0.750 if length($w) == 5;
              $wt *= 0.500 if length($w) == 4;
              $wt *= 0.250 if length($w) < 4;
          }
          $words{$w} = $wt;
          $cnt++;
      }
      close $wordlist;
      infolog("Added <$cnt> words from \"$wfile\"") if ($cnt>0);
  }
  # Parse a scanset definition file.
  #
  # A scanset is written as
  #     scanset <label> {
  #         preprocessors = <label>[, <label>...]
  #         command = <program>
  #         args = <arguments>
  #         force_output_in = <value>
  #     }
  # Text after "#" is a comment. "command" and "args" are required.
  #
  #   $file - path of the scanset file
  #
  # Returns (0, @scansets) on success and 1 on any error.
  sub parse_scansets {
      my ($file) = @_;
      # Lexical handle: it is closed automatically on every early return
      my $sfile;
      unless (open($sfile, '<', $file)) {
          warnlog("Failed to open scanset file \"$file\", aborting...");
          return 1;
      }
      my @slabels;
      my @scanlist;
      my $scanset;
      while (my $line = <$sfile>) {
          # Strip comments and ignore blank lines
          chomp($line);
          $line =~ s/(\s)*#(.*)//;
          next unless $line;

          # We are in the middle of a scanset
          if (defined $scanset) {
              debuglog("line $line");
              if ($line =~ /^\s*preprocessors\s*=\s*(.*)$/i) {
                  my $prep = $1;
                  $scanset->{preprocessors} = $prep;
                  $prep =~ s/ //g;
                  foreach my $name (split(',', $prep)) {
                      unless (get_preprocessor($name)) {
                          errorlog("Unknown preprocessor \"$name\" used in scansets line $., aborting...");
                          return 1;
                      }
                  }
              } elsif ($line =~ /^\s*(command|args)\s*=\s*(.*)$/i) {
                  # The keyword is matched case-insensitively, so store it in
                  # lower case where the required-tag check below looks for it.
                  my $tag = lc($1);
                  my $val = $2;
                  if ($val =~ /(<|>|\||;)/) {
                      errorlog("OCR $tag may not contain \"< > | ;\", aborting...");
                      return 1;
                  }
                  $scanset->{$tag} = $val;
              } elsif ($line =~ /^\s*force_output_in\s*=\s*(.*)$/i) {
                  $scanset->{force_output_in} = $1;
              # Scanset is closing
              } elsif ($line =~ /^\s*\}/) {
                  foreach my $tag (qw/command args/) {
                      unless ($scanset->{$tag}) {
                          my $label = $scanset->{label};
                          errorlog("Scanset \"$label\" is missing $tag line, aborting...");
                          return 1;
                      }
                  }
                  push(@scanlist, $scanset);
                  $scanset = undef;
              } else {
                  errorlog("Unknown token at line $., aborting...");
                  return 1;
              }
          # Start a new scanset
          } elsif ($line =~ /^\s*scanset\s+(.+?)\s+\{\s*$/i) {
              my $label = $1;
              debuglog("line $line");
              if (grep { $_ eq $label } @slabels) {
                  errorlog("Label already used earlier in line $., aborting...");
                  return 1;
              }
              $scanset = FuzzyOcr::Scanset->new($label);
              push(@slabels, $label);
          }
      }
      close($sfile);
      if (defined $scanset) {
          warnlog("Scanset \"$scanset->{label}\" in \"$file\" is not closed with \"}\", ignoring it");
      }
      return (0, @scanlist);
  }
  # Parse a preprocessor definition file.
  #
  # A preprocessor is written as
  #     preprocessor <label> {
  #         command = <program>
  #         args = <arguments>
  #     }
  # Text after "#" is a comment. "command" is required.
  #
  #   $file - path of the preprocessor file
  #
  # Returns (0, @preprocessors) on success and 1 on any error.
  sub parse_preprocessors {
      my ($file) = @_;
      # Lexical handle: it is closed automatically on every early return
      my $pfile;
      unless (open($pfile, '<', $file)) {
          errorlog("Failed to open preprocessor file \"$file\", aborting...");
          return 1;
      }
      my @plabels;
      my @preplist;
      my $preprocessor;
      while (my $line = <$pfile>) {
          # Strip comments and ignore blank lines
          chomp($line);
          $line =~ s/(\s)*#(.*)//;
          next unless $line;

          # We are in the middle of a preprocessor
          if (defined $preprocessor) {
              debuglog("line: $line");
              if ($line =~ /^\s*(command|args)\s*=\s*(.*)$/i) {
                  # The keyword is matched case-insensitively, so store it in
                  # lower case where the required-tag check below looks for it.
                  my $tag = lc($1);
                  my $val = $2;
                  if ($val =~ /(<|>|\||;)/) {
                      errorlog("Preprocessor $tag may not contain \"< > | ;\", aborting...");
                      return 1;
                  }
                  $preprocessor->{$tag} = $val;
              # Preprocessor is closing
              } elsif ($line =~ /^\s*\}/) {
                  foreach my $tag (qw/command/) {
                      unless ($preprocessor->{$tag}) {
                          my $label = $preprocessor->{label};
                          errorlog("Preprocessor \"$label\" is missing $tag line, aborting...");
                          return 1;
                      }
                  }
                  push(@preplist, $preprocessor);
                  $preprocessor = undef;
              } else {
                  errorlog("Unknown token at line $., aborting...");
                  return 1;
              }
          # Start a new preprocessor
          } elsif ($line =~ /^\s*preprocessor\s+(.+?)\s+\{\s*$/i) {
              my $label = $1;
              debuglog("line: $line");
              if (grep { $_ eq $label } @plabels) {
                  errorlog("Error, label already used earlier in line $., aborting...");
                  return 1;
              }
              $preprocessor = FuzzyOcr::Preprocessor->new($label);
              push(@plabels, $label);
          }
      }
      close($pfile);
      if (defined $preprocessor) {
          warnlog("Preprocessor \"$preprocessor->{label}\" in \"$file\" is not closed with \"}\", ignoring it");
      }
      return (0, @preplist);
  }
  939. 1;