PageRenderTime 61ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/lib/Mail/SpamAssassin/BayesStore/BDB.pm

https://gitlab.com/mba811/spamassassin
Perl | 1576 lines | 1259 code | 243 blank | 74 comment | 128 complexity | fdc2d84be81830c2dfd430504d2018e5 MD5 | raw file
Possible License(s): Apache-2.0
  1. # <@LICENSE>
  2. # Licensed to the Apache Software Foundation (ASF) under one or more
  3. # contributor license agreements. See the NOTICE file distributed with
  4. # this work for additional information regarding copyright ownership.
  5. # The ASF licenses this file to you under the Apache License, Version 2.0
  6. # (the "License"); you may not use this file except in compliance with
  7. # the License. You may obtain a copy of the License at:
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. # </@LICENSE>
  17. =head1 NAME
  18. Mail::SpamAssassin::BayesStore::BDB - BerkeleyDB Bayesian Storage Module Implementation
  19. =head1 SYNOPSIS
  20. =head1 DESCRIPTION
  21. This module implementes a BDB based bayesian storage module.
  22. =cut
  23. package Mail::SpamAssassin::BayesStore::BDB;
  24. use strict;
  25. use warnings;
  26. use bytes;
  27. use re 'taint';
  28. use Errno qw(EBADF);
  29. #use Data::Dumper;
  30. use File::Basename;
  31. use File::Path;
  32. BEGIN {
  33. eval { require Digest::SHA; import Digest::SHA qw(sha1); 1 }
  34. or do { require Digest::SHA1; import Digest::SHA1 qw(sha1) }
  35. }
  36. use Mail::SpamAssassin::BayesStore;
  37. use Mail::SpamAssassin::Logger;
  38. use vars qw( @ISA );
  39. @ISA = qw( Mail::SpamAssassin::BayesStore );
  40. use constant HAS_BDB => eval { require BerkeleyDB; BerkeleyDB->import; };
  41. my $rmw = DB_RMW;
  42. my $next = DB_NEXT;
  43. =head1 METHODS
  44. =head2 new
  45. public class (Mail::SpamAssassin::BayesStore::SQL) new (Mail::Spamassassin::Plugin::Bayes $bayes)
  46. Description:
  47. This methods creates a new instance of the Mail::SpamAssassin::BayesStore::BDB
  48. object. It expects to be passed an instance of the Mail::SpamAssassin:Bayes
  49. object which is passed into the Mail::SpamAssassin::BayesStore parent object.
  50. =cut
  51. sub new {
  52. my $class = shift;
  53. $class = ref($class) || $class;
  54. my $self = $class->SUPER::new(@_);
  55. $self->{supported_db_version} = 3;
  56. $self->{is_really_open} = 0;
  57. $self->{is_writable} = 0;
  58. $self->{is_officially_open} = 0;
  59. return $self;
  60. }
  61. sub DESTROY {
  62. my $self = shift;
  63. $self->_close_db;
  64. }
  65. =head2 tie_db_readonly
  66. public instance (Boolean) tie_db_readonly ();
  67. Description:
  68. This method ensures that the database connection is properly setup and
  69. working.
  70. =cut
  71. sub tie_db_readonly {
  72. my($self) = @_;
  73. #dbg("bayes: tie_db_readonly");
  74. # my $result = ($self->{is_really_open} && !$self->{is_writable})
  75. # || $self->_open_db(0);
  76. my $result = $self->{is_really_open} || $self->_open_db(0);
  77. dbg("bayes: tie_db_readonly, result is $result");
  78. return $result;
  79. }
  80. =head2 tie_db_writable
  81. public instance (Boolean) tie_db_writable ()
  82. Description:
  83. This method ensures that the database connection is properly setup and
  84. working. If necessary it will initialize the database so that they can
  85. begin using the database immediately.
  86. =cut
  87. sub tie_db_writable {
  88. my($self) = @_;
  89. #dbg("bayes: tie_db_writable");
  90. my $result = ($self->{is_really_open} && $self->{is_writable})
  91. || $self->_open_db(1);
  92. dbg("bayes: tie_db_writable, result is $result");
  93. return $result;
  94. }
  95. =head2 _open_db
  96. private instance (Boolean) _open_db (Boolean $writable)
  97. Description:
  98. This method ensures that the database connection is properly setup and
  99. working. It will initialize a users bayes variables so that they
  100. can begin using the database immediately.
  101. =cut
  102. sub _open_db {
  103. my($self, $writable) = @_;
  104. dbg("bayes: _open_db(%s, %s); BerkeleyDB %s, libdb %s",
  105. $writable ? 'for writing' : 'for reading',
  106. $self->{is_really_open} ? 'already open' : 'not yet open',
  107. BerkeleyDB->VERSION, $BerkeleyDB::db_version);
  108. # Always notice state changes
  109. $self->{is_writable} = $writable;
  110. return 1 if $self->{is_really_open};
  111. #dbg("bayes: not already tied");
  112. my $main = $self->{bayes}->{main};
  113. if (!defined($main->{conf}->{bayes_path})) {
  114. dbg("bayes: bayes_path not defined");
  115. return 0;
  116. }
  117. #dbg("bayes: Reading db configs");
  118. $self->read_db_configs();
  119. my $path = dirname $main->sed_path($main->{conf}->{bayes_path});
  120. #dbg("bayes: Path is $path");
  121. # Path must exist or we must be in writable mode
  122. if (-d $path) {
  123. # All is cool
  124. } elsif ($writable) {
  125. # Create the path
  126. eval {
  127. mkpath($path, 0, (oct($main->{conf}->{bayes_file_mode}) & 0777));
  128. };
  129. warn("bayes: Couldn't create path: $@") if $@;
  130. } else {
  131. # FAIL
  132. warn("bayes: bayes_path doesn't exist and can't create: $path");
  133. return 0;
  134. }
  135. # Now we can set up our environment
  136. my $flags = DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN;
  137. $flags |= DB_CREATE if $writable;
  138. # DB_REGISTER|DB_RECOVER|
  139. # In the Berkeley DB 4.7 release, the logging subsystem is configured
  140. # using the DB_ENV->log_set_config method instead of the previously used
  141. # DB_ENV->set_flags method. The DB_ENV->set_flags method no longer accepts
  142. # flags DB_DIRECT_LOG, DB_DSYNC_LOG, DB_LOG_INMEMORY or DB_LOG_AUTOREMOVE.
  143. # Applications should be modified to use the equivalent flags accepted by
  144. # the DB_ENV->log_set_config method.
  145. # -SetFlags => DB_LOG_AUTOREMOVE
  146. dbg("bayes: %s environment: %s, 0x%x, %s",
  147. $writable ? 'Opening or creating' : 'Opening existing',
  148. $path, $flags, $main->{conf}->{bayes_file_mode});
  149. unless ($self->{env} = BerkeleyDB::Env->new(
  150. -Cachesize => 67108864, -Home => $path, -Flags => $flags,
  151. -Mode => (oct($main->{conf}->{bayes_file_mode}) & 0666),
  152. )) {
  153. dbg("bayes: berkeleydb environment couldn't initialize: $BerkeleyDB::Error");
  154. return 0;
  155. }
  156. $flags = $writable ? DB_CREATE : 0;
  157. #dbg("bayes: Opening vars");
  158. unless ($self->{handles}->{vars} = BerkeleyDB::Btree->new(
  159. -Env => $self->{env}, -Filename => "vars.db", -Flags => $flags)) {
  160. warn("bayes: couldn't open vars.db: $BerkeleyDB::Error");
  161. delete $self->{handles}->{vars};
  162. $self->untie_db;
  163. return 0;
  164. }
  165. #dbg("bayes: Looking for db_version");
  166. unless ($self->{db_version} = $self->_get(vars => "DB_VERSION")) {
  167. if ($writable) {
  168. $self->{db_version} = $self->DB_VERSION;
  169. $self->{handles}->{vars}->db_put(DB_VERSION => $self->{db_version}) == 0
  170. or die "Couldn't put record: $BerkeleyDB::Error";
  171. $self->{handles}->{vars}->db_put(NTOKENS => 0) == 0
  172. or die "Couldn't put record: $BerkeleyDB::Error";
  173. dbg("bayes: new db, set db version %s and 0 tokens",$self->{db_version});
  174. } else {
  175. warn("bayes: vars.db not intialized: $BerkeleyDB::Error");
  176. $self->untie_db;
  177. return 0;
  178. }
  179. } elsif ($self->{db_version}) {
  180. dbg("bayes: found bayes db version $self->{db_version}");
  181. if ($self->{db_version} != $self->DB_VERSION) {
  182. warn("bayes: bayes db version $self->{db_version} is not able to be used, aborting: $BerkeleyDB::Error");
  183. $self->untie_db();
  184. return 0;
  185. }
  186. }
  187. #dbg("bayes: Opening tokens");
  188. unless ($self->{handles}->{tokens} = BerkeleyDB::Btree->new(
  189. -Env => $self->{env}, -Filename => "tokens.db",
  190. -Flags => $flags, -Property => DB_REVSPLITOFF)) {
  191. warn("bayes: couldn't open tokens.db: $BerkeleyDB::Error");
  192. delete $self->{handles}->{tokens};
  193. $self->untie_db;
  194. return 0;
  195. }
  196. #dbg("bayes: Opening atime secondary DB");
  197. unless ($self->{handles}->{atime} = BerkeleyDB::Btree->new(
  198. -Env => $self->{env}, -Filename => "atime.db",
  199. -Flags => $flags, -Property => DB_DUP|DB_DUPSORT)) {
  200. warn("bayes: couldn't open atime.db: $BerkeleyDB::Error");
  201. delete $self->{handles}->{atime};
  202. $self->untie_db;
  203. return 0;
  204. }
  205. #dbg("bayes: Opening seen DB");
  206. unless ($self->{handles}->{seen} = BerkeleyDB::Btree->new(
  207. -Env => $self->{env}, -Filename => "seen.db", -Flags => $flags)) {
  208. warn("bayes: couldn't open tokens.db: $BerkeleyDB::Error");
  209. delete $self->{handles}->{seen};
  210. $self->untie_db;
  211. return 0;
  212. }
  213. # This MUST be outside the transaction that opens the DB,
  214. # or it just doesn't work. Dunno Why.
  215. !$self->{handles}->{tokens}->associate($self->{handles}->{atime},
  216. \&_extract_atime)
  217. or die "Couldn't associate DBs: $BerkeleyDB::Error";
  218. $self->{is_really_open} = 1;
  219. $self->{is_officially_open} = 1;
  220. dbg("bayes: _open_db done");
  221. return 1;
  222. }
  223. =head2 untie_db
  224. public instance () untie_db ()
  225. Description:
  226. Closes any open db handles. You can safely call this at any time.
  227. =cut
  228. sub untie_db {
  229. my $self = shift;
  230. dbg("bayes: pretend to be closing a database");
  231. $self->{is_writable} = 0;
  232. $self->{is_officially_open} = 0;
  233. $self->{env}->txn_checkpoint(128, 1) if $self->{env};
  234. for my $handle (keys %{$self->{handles}}) {
  235. my $handles = $self->{handles};
  236. if (defined $handles && $handles->{$handle}) {
  237. $handles->{$handle}->db_sync == 0
  238. or die "Couldn't sync $handle: $BerkeleyDB::Error";
  239. }
  240. }
  241. return;
  242. }
  243. sub _close_db {
  244. my $self = shift;
  245. dbg("bayes: really closing a database");
  246. $self->{is_writable} = 0;
  247. $self->{is_really_open} = 0;
  248. $self->{is_officially_open} = 0;
  249. $self->{db_version} = undef;
  250. for my $handle (keys %{$self->{handles}}) {
  251. my $handles = $self->{handles};
  252. if (defined $handles && $handles->{$handle}) {
  253. dbg("bayes: closing database $handle");
  254. eval { $handles->{$handle}->db_close }; # ignoring status
  255. }
  256. delete $handles->{$handle};
  257. }
  258. delete $self->{env};
  259. return;
  260. }
  261. =head2 calculate_expire_delta
  262. public instance (%) calculate_expire_delta (
  263. Integer $newest_atime, Integer $start, Integer $max_expire_mult)
  264. Description:
  265. This method performs a calculation on the data to determine the
  266. optimum atime for token expiration.
  267. =cut
  268. sub calculate_expire_delta {
  269. my($self, $newest_atime, $start, $max_expire_mult) = @_;
  270. dbg("bayes: calculate_expire_delta starting");
  271. my %delta; # use a hash since an array is going to be very sparse
  272. my $cursor = $self->{handles}->{atime}->db_cursor;
  273. $cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
  274. my($atime, $value) = ("", "");
  275. # Do the first pass, figure out atime delta by iterating over our
  276. # *secondary* index, avoiding the decoding overhead
  277. while ($cursor->c_get($atime, $value, $next) == 0) {
  278. # Go through from $start * 1 to $start * 512, mark how many tokens
  279. # we would expire
  280. my $age = $newest_atime - $atime;
  281. for (my $i = 1; $i <= $max_expire_mult; $i <<= 1) {
  282. if ($age >= $start * $i) {
  283. $delta{$i}++;
  284. } else {
  285. # If the token age is less than the expire delta, it'll be
  286. # less for all upcoming checks too, so abort early.
  287. last;
  288. }
  289. }
  290. }
  291. $cursor->c_close == 0
  292. or die "Couldn't close cursor: $BerkeleyDB::Error";
  293. undef $cursor;
  294. dbg("bayes: calculate_expire_delta done");
  295. return %delta;
  296. }
  297. =head2 token_expiration
  298. public instance (Integer, Integer,
  299. Integer, Integer) token_expiration (\% $opts,
  300. Integer $newdelta,
  301. @ @vars)
  302. Description:
  303. This method performs the database specific expiration of tokens based on
  304. the passed in C<$newdelta> and C<@vars>.
  305. =cut
  306. sub token_expiration {
  307. my($self, $opts, $newdelta, @vars) = @_;
  308. dbg("bayes: Entering token_expiration");
  309. my($kept, $deleted, $hapaxes, $lowfreq) = (0, 0, 0, 0);
  310. # Reset stray too-new tokens
  311. {
  312. my $cursor = $self->{handles}->{atime}->db_cursor;
  313. $cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
  314. # Grab the token for a tight RWM loop
  315. my($atime, $flag) = ($vars[10], DB_SET_RANGE|$rmw);
  316. # Find the first token eq or gt the current newest
  317. while ($cursor->c_pget($atime, my $token, my $value, $flag) == 0) {
  318. my($ts, $th, $current) = _unpack_token($value);
  319. $self->{handles}->{tokens}->db_put($token,
  320. _pack_token($ts, $th, $atime)) == 0
  321. or die "Couldn't put record: $BerkeleyDB::Error";
  322. # We need to adjust our flag to continue on from the first rec
  323. $flag = $next|$rmw;
  324. }
  325. $cursor->c_close == 0
  326. or die "Couldn't close cursor: $BerkeleyDB::Error";
  327. undef $cursor;
  328. }
  329. # Figure out how old is too old...
  330. my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
  331. dbg("bayes: Too old is $too_old");
  332. dbg("bayes: Getting db stats");
  333. my $count;
  334. # Estimate the number of keys to be deleted
  335. {
  336. my $stats = $self->{handles}->{atime}->db_stat(DB_FAST_STAT);
  337. #dbg("bayes: Stats: %s", Dumper($stats));
  338. # Scan if we've never gotten stats before
  339. $stats = $self->{handles}->{atime}->db_stat if $stats->{bt_ndata} == 0;
  340. #dbg("bayes: Stats: %s", Dumper($stats));
  341. if ($self->{handles}->{atime}->db_key_range(
  342. $too_old, my $less, my $equal, my $greater) == 0) {
  343. dbg("bayes: less is $less, equal is $equal, greater is $greater");
  344. $count = $stats->{bt_ndata} - $stats->{bt_ndata} * $greater;
  345. }
  346. }
  347. dbg("bayes: Considering deleting $vars[3], $count");
  348. # As long as too many tokens wouldn't be deleted
  349. if ($vars[3] - $count >= 100000) {
  350. dbg("bayes: Preparing to iterate");
  351. my $cursor = $self->{handles}->{atime}->db_cursor;
  352. $cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
  353. my ($atime, $oldest, $token, $value);
  354. $atime = 0;
  355. while ($cursor->c_pget($atime, $token, $value, $next) == 0) {
  356. # We're traversing in order, so done
  357. $oldest = $atime, last if $atime >= $too_old;
  358. dbg("bayes: Deleting record");
  359. $cursor->c_del;
  360. $deleted++;
  361. my($ts, $th, $atime) = _unpack_token($value);
  362. if ($ts + $th == 1) {
  363. $hapaxes++;
  364. } elsif ($ts < 8 && $th < 8) {
  365. $lowfreq++;
  366. }
  367. }
  368. dbg("bayes: Done with cursor");
  369. $cursor->c_close == 0
  370. or die "Couldn't close cursor: $BerkeleyDB::Error";
  371. undef $cursor;
  372. $kept = $self->_get(vars => "NTOKENS", $rmw) - $deleted;
  373. $self->{handles}->{vars}->db_put(NTOKENS => $kept) == 0
  374. or die "Couldn't put record: $BerkeleyDB::Error";
  375. $self->{handles}->{vars}->db_put(LAST_EXPIRE => time) == 0
  376. or die "Couldn't put record: $BerkeleyDB::Error";
  377. $self->{handles}->{vars}->db_put(OLDEST_TOKEN_AGE => $oldest) == 0
  378. or die "Couldn't put record: $BerkeleyDB::Error";
  379. $self->{handles}->{vars}->db_put(LAST_EXPIRE_REDUCE => $deleted) == 0
  380. or die "Couldn't put record: $BerkeleyDB::Error";
  381. $self->{handles}->{vars}->db_put(LAST_ATIME_DELTA => $newdelta) == 0
  382. or die "Couldn't put record: $BerkeleyDB::Error";
  383. #$self->{handles}->{atime}->compact;
  384. #$self->{handles}->{tokens}->compact;
  385. #$self->{handles}->{vars}->compact;
  386. } else {
  387. dbg("bayes: Update vars to regenerate histogram");
  388. # Make sure we regenerate our histogramn
  389. $kept = $self->_get(vars => "NTOKENS");
  390. $self->{handles}->{vars}->db_put(LAST_EXPIRE => time) == 0
  391. or die "Couldn't put record: $BerkeleyDB::Error";
  392. $self->{handles}->{vars}->db_put(LAST_ATIME_DELTA => 0) == 0
  393. or die "Couldn't put record: $BerkeleyDB::Error";
  394. $self->{handles}->{vars}->db_put(LAST_EXPIRE_REDUCE => 0) == 0
  395. or die "Couldn't put record: $BerkeleyDB::Error";
  396. }
  397. dbg("bayes: token_expiration done");
  398. return($kept, $deleted, $hapaxes, $lowfreq);
  399. }
  400. =head2 sync_due
  401. public instance (Boolean) sync_due ()
  402. Description:
  403. This method determines if a database sync is currently required.
  404. Unused for BDB implementation.
  405. =cut
  406. sub sync_due {
  407. return 0;
  408. }
  409. =head2 seen_get
  410. public instance (String) seen_get (string $msgid)
  411. Description:
  412. This method retrieves the stored value, if any, for C<$msgid>. The return
  413. value is the stored string ('s' for spam and 'h' for ham) or undef if C<$msgid>
  414. is not found.
  415. =cut
  416. sub seen_get {
  417. my($self, $msgid) = @_;
  418. dbg("bayes: Entering seen_get");
  419. my $value = $self->_get(seen => $msgid);
  420. return $value;
  421. }
  422. =head2 seen_put
  423. public (Boolean) seen_put (string $msgid, char $flag)
  424. Description:
  425. This method records C<$msgid> as the type given by C<$flag>. C<$flag> is one
  426. of two values 's' for spam and 'h' for ham.
  427. =cut
  428. sub seen_put {
  429. my($self, $msgid, $flag) = @_;
  430. dbg("bayes: Entering seen_put");
  431. $self->{handles}->{seen}->db_put($msgid, $flag) == 0
  432. or die "Couldn't put record: $BerkeleyDB::Error";
  433. return 1;
  434. }
  435. =head2 seen_delete
  436. public instance (Boolean) seen_delete (string $msgid)
  437. Description:
  438. This method removes C<$msgid> from the database.
  439. =cut
  440. sub seen_delete {
  441. my($self, $msgid) = @_;
  442. dbg("bayes: Entering seen_delete");
  443. my $result;
  444. my $status = $self->{handles}->{seen}->db_del($msgid);
  445. if ($status == 0) {
  446. $result = 1;
  447. } elsif ($status == DB_NOTFOUND) {
  448. $result = 0E0;
  449. } else {
  450. die "Couldn't delete record: $BerkeleyDB::Error";
  451. }
  452. return $result;
  453. }
  454. =head2 get_storage_variables
  455. public instance (@) get_storage_variables ()
  456. Description:
  457. This method retrieves the various administrative variables used by
  458. the Bayes process and database.
  459. The values returned in the array are in the following order:
  460. 0: scan count base
  461. 1: number of spam
  462. 2: number of ham
  463. 3: number of tokens in db
  464. 4: last expire atime
  465. 5: oldest token in db atime
  466. 6: db version value
  467. 7: last journal sync
  468. 8: last atime delta
  469. 9: last expire reduction count
  470. 10: newest token in db atime
  471. =cut
  472. sub get_storage_variables {
  473. my($self) = @_;
  474. dbg("bayes: get_storage_variables starting");
  475. my @values;
  476. for my $token (qw{LAST_JOURNAL_SYNC NSPAM NHAM NTOKENS LAST_EXPIRE
  477. OLDEST_TOKEN_AGE DB_VERSION LAST_JOURNAL_SYNC
  478. LAST_ATIME_DELTA LAST_EXPIRE_REDUCE NEWEST_TOKEN_AGE}) {
  479. my $value = $self->_get(vars => $token);
  480. $value = 0 unless $value && $value =~ /\d+/;
  481. push @values, $value;
  482. }
  483. dbg("bayes: get_storage_variables done");
  484. return @values;
  485. }
  486. =head2 dump_tokens
  487. public instance () dump_tokens (String $template, String $regex, Array @vars)
  488. Description:
  489. This method loops over all tokens, computing the probability for the token
  490. and then printing it out according to the passed in token.
  491. =cut
  492. sub dump_db_toks { dump_tokens(@_) }
  493. sub dump_tokens {
  494. my($self, $template, $regex, @vars) = @_;
  495. dbg("bayes: dump_tokens starting");
  496. my $cursor = $self->{handles}->{tokens}->db_cursor;
  497. $cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
  498. my ($token, $value) = ("", "");
  499. while ($cursor->c_get($token, $value, $next) == 0) {
  500. next if defined $regex && $token !~ /$regex/o;
  501. my($ts, $th, $atime) = _unpack_token($value);
  502. my $prob = $self->{bayes}->_compute_prob_for_token(
  503. $token, $vars[1], $vars[2], $ts, $th) || 0.5;
  504. my $encoded = unpack("H*",$token);
  505. printf $template, $prob, $ts, $th, $atime, $encoded;
  506. }
  507. $cursor->c_close == 0
  508. or die "Couldn't close cursor: $BerkeleyDB::Error";
  509. undef $cursor;
  510. dbg("bayes: dump_tokens done");
  511. return 1;
  512. }
  513. =head2 set_last_expire
  514. public instance (Boolean) set_last_expire (Integer $time)
  515. Description:
  516. This method sets the last expire time.
  517. =cut
  518. sub set_last_expire {
  519. my($self, $time) = @_;
  520. dbg("bayes: Entering set_last_expire");
  521. $self->{handles}->{vars}->db_put(LAST_EXPIRE => $time) == 0
  522. or die "Couldn't put record: $BerkeleyDB::Error";
  523. return 1;
  524. }
  525. =head2 get_running_expire_tok
  526. public instance (String $time) get_running_expire_tok ()
  527. Description:
  528. This method determines if an expire is currently running and returns
  529. the last time set.
  530. There can be multiple times, so we just pull the greatest (most recent)
  531. value.
  532. =cut
  533. sub get_running_expire_tok {
  534. my($self) = @_;
  535. dbg("bayes: Entering get_running_expire_tok");
  536. my $value = $self->_get(vars => "RUNNING_EXPIRE") || "";
  537. my $result;
  538. $result = $value if $value =~ /^\d+$/;
  539. dbg("bayes: get_running_expire_tok exiting with %s",
  540. !defined $result ? 'UNDEF' : $result);
  541. return $result;
  542. }
  543. =head2 set_running_expire_tok
  544. public instance (String $time) set_running_expire_tok ()
  545. Description:
  546. This method sets the time that an expire starts running.
  547. =cut
  548. sub set_running_expire_tok {
  549. my($self) = @_;
  550. my $time = time;
  551. $self->{handles}->{vars}->db_put(RUNNING_EXPIRE => $time) == 0
  552. or die "Couldn't put record: $BerkeleyDB::Error";
  553. return $time;
  554. }
  555. =head2 remove_running_expire_tok
  556. public instance (Boolean) remove_running_expire_tok ()
  557. Description:
  558. This method removes the row in the database that indicates that
  559. and expire is currently running.
  560. =cut
  561. sub remove_running_expire_tok {
  562. my($self) = @_;
  563. my $status = $self->{handles}->{vars}->db_del("RUNNING_EXPIRE");
  564. my $result;
  565. if ($status == 0) {
  566. $result = 1;
  567. } elsif ($status == DB_NOTFOUND) {
  568. $result = 0E0;
  569. } else {
  570. die "Couldn't delete record: $BerkeleyDB::Error";
  571. }
  572. return $result;
  573. }
  574. =head2 tok_get
  575. public instance (Integer, Integer, Integer) tok_get (String $token)
  576. Description:
  577. This method retrieves a specificed token (C<$token>) from the database
  578. and returns its spam_count, ham_count and last access time.
  579. =cut
  580. sub tok_get {
  581. my($self, $token) = @_;
  582. dbg("bayes: Entering tok_get");
  583. my $array = $self->tok_get_all($token);
  584. return !@$array ? () : (@{$array->[0]})[1,2,3];
  585. }
  586. =head2 tok_get_all
  587. public instance (\@) tok_get (@ $tokens)
  588. Description:
  589. This method retrieves the specified tokens (C<$tokens>) from storage and
  590. returns an array ref of arrays spam count, ham acount and last access time.
  591. =cut
  592. sub tok_get_all {
  593. my($self, @keys) = @_;
  594. #dbg("bayes: Entering tok_get_all");
  595. my @results = $self->_mget(tokens => \@keys);
  596. my @values;
  597. for my $token (@keys) {
  598. my $value = shift(@results);
  599. push(@values, [$token, _unpack_token($value)]) if defined $value;
  600. }
  601. dbg("bayes: tok_get_all found %d tokens out of %d search keys",
  602. scalar(@values), scalar(@keys));
  603. #dbg("bayes: tok_get_all returning with %s", Dumper(\@values));
  604. return \@values;
  605. }
  606. =head2 tok_count_change
  607. public instance (Boolean) tok_count_change (
  608. Integer $dspam, Integer $dham, String $token, String $newatime)
  609. Description:
  610. This method takes a C<$spam_count> and C<$ham_count> and adds it to
  611. C<$tok> along with updating C<$tok>s atime with C<$atime>.
  612. =cut
  613. sub tok_count_change {
  614. my($self, $dspam, $dham, $token, $newatime) = @_;
  615. dbg("bayes: Entering tok_count_change");
  616. $self->multi_tok_count_change($dspam, $dham, {$token => 1}, $newatime);
  617. }
  618. =head2 multi_tok_count_change
  619. public instance (Boolean) multi_tok_count_change (
  620. Integer $dspam, Integer $dham, \% $tokens, String $newatime)
  621. Description:
  622. This method takes a C<$dspam> and C<$dham> and adds it to all of the
  623. tokens in the C<$tokens> hash ref along with updating each tokens
  624. atime with C<$atime>.
  625. =cut
  626. sub multi_tok_count_change {
  627. my($self, $dspam, $dham, $tokens, $newatime) = @_;
  628. # Make sure we have some values
  629. $dspam ||= 0;
  630. $dham ||= 0;
  631. $newatime ||= 0;
  632. # No changes, just return
  633. return 1 unless ($dspam or $dham);
  634. # Collect this for updates at the end
  635. my $newtokens = 0;
  636. for my $token (keys %{$tokens}) {
  637. #dbg("bayes: token %s", $tokens->{$token});
  638. my $status = $self->{handles}->{tokens}->db_get($token => my $value, $rmw);
  639. if ($status == 0) {
  640. my ($spam, $ham, $oldatime) = _unpack_token($value);
  641. $spam += $dspam;
  642. $spam = 0 if $spam < 0;
  643. $ham += $dham;
  644. $ham = 0 if $ham < 0;
  645. my $newvalue = _pack_token($spam, $ham, $newatime);
  646. $self->{handles}->{tokens}->db_put($token => $newvalue) == 0
  647. or die "Couldn't put record: $BerkeleyDB::Error";
  648. }
  649. elsif ($status == DB_NOTFOUND) {
  650. my $spam = $dspam;
  651. $spam = 0 if $spam < 0;
  652. my $ham = $dham;
  653. $ham = 0 if $ham < 0;
  654. my $newvalue = _pack_token($spam, $ham, $newatime);
  655. $self->{handles}->{tokens}->db_put($token => $newvalue) == 0
  656. or die "Couldn't put record: $BerkeleyDB::Error";
  657. $newtokens++;
  658. }
  659. else {
  660. die "Couldn't get record: $BerkeleyDB::Error";
  661. }
  662. }
  663. if ($newtokens) {
  664. my $ntokens = $self->_get(vars => "NTOKENS", $rmw) || 0;
  665. $ntokens += $newtokens;
  666. $ntokens = 0 if $ntokens < 0;
  667. $self->{handles}->{vars}->db_put(NTOKENS => $ntokens) == 0
  668. or die "Couldn't put record: $BerkeleyDB::Error";
  669. }
  670. my $newmagic = $self->_get(vars => "NEWEST_TOKEN_AGE", $rmw) || 0;
  671. if ($newatime > $newmagic) {
  672. $self->{handles}->{vars}->db_put(NEWEST_TOKEN_AGE => $newatime) == 0
  673. or die "Couldn't put record: $BerkeleyDB::Error";
  674. }
  675. my $oldmagic = $self->_get(vars => "OLDEST_TOKEN_AGE", $rmw) || time;
  676. if ($newatime && $newatime < $oldmagic) {
  677. $self->{handles}->{vars}->db_put(OLDEST_TOKEN_AGE => $newatime) == 0
  678. or die "Couldn't put record: $BerkeleyDB::Error";
  679. }
  680. return 1;
  681. }
  682. =head2 nspam_nham_get
  683. public instance ($spam_count, $ham_count) nspam_nham_get ()
  684. Description:
  685. This method retrieves the total number of spam and the total number of
  686. ham learned.
  687. =cut
  688. sub nspam_nham_get {
  689. my($self) = @_;
  690. dbg("bayes: Entering nspam_nham_get");
  691. my @vars = $self->get_storage_variables();
  692. ($vars[1], $vars[2]);
  693. }
  694. =head2 nspam_nham_change
  695. public instance (Boolean) nspam_nham_change (Integer $num_spam,
  696. Integer $num_ham)
  697. Description:
  698. This method updates the number of spam and the number of ham in the database.
  699. =cut
  700. sub nspam_nham_change {
  701. my($self, $ds, $dh) = @_;
  702. my $nspam = $self->_get(vars => "NSPAM", $rmw) || 0;
  703. $nspam += ($ds || 0);
  704. $nspam = 0 if $nspam < 0;
  705. $self->{handles}->{vars}->db_put(NSPAM => $nspam) == 0
  706. or die "Couldn't put record: $BerkeleyDB::Error";
  707. my $nham = $self->_get(vars => "NHAM", $rmw) || 0;
  708. $nham += ($dh || 0);
  709. $nham = 0 if $nham < 0;
  710. $self->{handles}->{vars}->db_put(NHAM => $nham) == 0
  711. or die "Couldn't put record: $BerkeleyDB::Error";
  712. return 1;
  713. }
  714. =head2 tok_touch
  715. public instance (Boolean) tok_touch (String $token,
  716. String $atime)
  717. Description:
  718. This method updates the given tokens (C<$token>) atime.
  719. The assumption is that the token already exists in the database.
  720. We will never update to an older atime
  721. =cut
  722. sub tok_touch {
  723. my($self, $token, $atime) = @_;
  724. return $self->tok_touch_all([$token], $atime);
  725. }
  726. =head2 tok_touch_all
  727. public instance (Boolean) tok_touch (\@ $tokens
  728. String $atime)
  729. Description:
  730. This method does a mass update of the given list of tokens C<$tokens>,
  731. if the existing token atime is < C<$atime>.
  732. The assumption is that the tokens already exist in the database.
  733. We should never be touching more than N_SIGNIFICANT_TOKENS, so we can
  734. make some assumptions about how to handle the data (ie no need to
  735. batch like we do in tok_get_all)
  736. =cut
  737. sub tok_touch_all {
  738. my($self, $tokens, $newatime) = @_;
  739. for my $token (@{$tokens}) {
  740. my $status = $self->{handles}->{tokens}->db_get($token => my $value, $rmw);
  741. if ($status == 0) {
  742. my ($spam, $ham, $oldatime) = _unpack_token($value);
  743. my $newvalue = _pack_token($spam, $ham, $newatime);
  744. $self->{handles}->{tokens}->db_put($token => $newvalue) == 0
  745. or die "Couldn't put record: $BerkeleyDB::Error";
  746. }
  747. elsif ($status == DB_NOTFOUND) {
  748. # Do nothing
  749. }
  750. else {
  751. die "Couldn't get record: $BerkeleyDB::Error";
  752. }
  753. }
  754. return 1;
  755. }
  756. =head2 cleanup
  757. public instance (Boolean) cleanup ()
  758. Description:
  759. This method perfoms any cleanup necessary before moving onto the next
  760. operation.
  761. =cut
  762. sub cleanup {
  763. my ($self) = @_;
  764. dbg("Running cleanup");
  765. return 1;
  766. }
  767. =head2 get_magic_re
  768. public instance (String) get_magic_re ()
  769. Description:
  770. This method returns a regexp which indicates a magic token.
  771. Unused in BDB implementation.
  772. =cut
  773. use constant get_magic_re => undef;
  774. =head2 sync
  775. public instance (Boolean) sync (\% $opts)
  776. Description:
  777. This method performs a sync of the database
  778. =cut
  779. sub sync {
  780. my($self, $opts) = @_;
  781. dbg("Running sync");
  782. return 1;
  783. }
  784. =head2 perform_upgrade
  785. public instance (Boolean) perform_upgrade (\% $opts);
  786. Description:
  787. Performs an upgrade of the database from one version to another, not
  788. currently used in this implementation.
  789. =cut
  790. sub perform_upgrade {
  791. dbg("bayes: Entering perform_upgrade");
  792. return 1;
  793. }
  794. =head2 clear_database
  795. public instance (Boolean) clear_database ()
  796. Description:
  797. This method deletes all records for a particular user.
  798. Callers should be aware that any errors returned by this method
  799. could causes the database to be inconsistent for the given user.
  800. =cut
  801. sub clear_database {
  802. my($self) = @_;
  803. dbg("bayes: Entering clear_database");
  804. $self->untie_db();
  805. dbg("bayes: removing db.");
  806. my $main = $self->{bayes}->{main};
  807. my $path = $main->sed_path($main->{conf}->{bayes_path});
  808. eval {rmpath($path)};
  809. return 1;
  810. }
  811. =head2 backup_database
  812. public instance (Boolean) backup_database ()
  813. Description:
  814. This method will dump the users database in a machine readable format.
  815. =cut
  816. sub backup_database {
  817. my($self) = @_;
  818. dbg("bayes: Entering backup_database");
  819. return 0 unless $self->tie_db_writable;
  820. my @vars = $self->get_storage_variables;
  821. print "v\t$vars[6]\tdb_version # this must be the first line!!!\n";
  822. print "v\t$vars[1]\tnum_spam\n";
  823. print "v\t$vars[2]\tnum_nonspam\n";
  824. my $tokens = $self->{handles}->{tokens}->db_cursor;
  825. $tokens or die "Couldn't get cursor: $BerkeleyDB::Error";
  826. my($token, $value) = ("", "");
  827. while ($tokens->c_get($token, $value, $next) == 0) {
  828. my($ts, $th, $atime) = _unpack_token($value);
  829. my $encoded = unpack("H*", $token);
  830. print "t\t$ts\t$th\t$atime\t$encoded\n";
  831. }
  832. $tokens->c_close == 0
  833. or die "Couldn't close cursor: $BerkeleyDB::Error";
  834. undef $tokens;
  835. my $seen = $self->{handles}->{seen}->db_cursor;
  836. $seen or die "Couldn't get cursor: $BerkeleyDB::Error";
  837. $token = "";
  838. while ($seen->c_get($token, $value, $next) == 0) {
  839. print "s\t$token\t$value\n";
  840. }
  841. $seen->c_close == 0
  842. or die "Couldn't close cursor: $BerkeleyDB::Error";
  843. undef $seen;
  844. $self->untie_db();
  845. return 1;
  846. }
  847. =head2 restore_database
  848. public instance (Boolean) restore_database (String $filename, Boolean $showdots)
  849. Description:
  850. This method restores a database from the given filename, C<$filename>.
  851. Callers should be aware that any errors returned by this method
  852. could causes the database to be inconsistent for the given user.
  853. =cut
  854. sub restore_database {
  855. my ($self, $filename, $showdots) = @_;
  856. dbg("bayes: Entering restore_database");
  857. local *DUMPFILE;
  858. if (!open(DUMPFILE, '<', $filename)) {
  859. dbg("bayes: unable to open backup file $filename: $!");
  860. return 0;
  861. }
  862. # This is the critical phase (moving sql around), so don't allow it
  863. # to be interrupted.
  864. local $SIG{'INT'} = 'IGNORE';
  865. local $SIG{'HUP'} = 'IGNORE'
  866. if !Mail::SpamAssassin::Util::am_running_on_windows();
  867. local $SIG{'TERM'} = 'IGNORE';
  868. unless ($self->clear_database()) {
  869. return 0;
  870. }
  871. # we need to go ahead close the db connection so we can then open it up
  872. # in a fresh state after clearing
  873. $self->untie_db();
  874. unless ($self->tie_db_writable()) {
  875. return 0;
  876. }
  877. my $token_count = 0;
  878. my $db_version;
  879. my $num_spam;
  880. my $num_ham;
  881. my $error_p = 0;
  882. my $line_count = 0;
  883. my $line = <DUMPFILE>;
  884. defined $line or die "Error reading dump file: $!";
  885. $line_count++;
  886. # We require the database version line to be the first in the file so we can
  887. # figure out how to properly deal with the file. If it is not the first
  888. # line then fail
  889. if ($line =~ m/^v\s+(\d+)\s+db_version/) {
  890. $db_version = $1;
  891. } else {
  892. dbg("bayes: database version must be the first line in the backup file, correct and re-run");
  893. return 0;
  894. }
  895. unless ($db_version == 2 || $db_version == 3) {
  896. warn("bayes: database version $db_version is unsupported, must be version 2 or 3");
  897. return 0;
  898. }
  899. my $token_error_count = 0;
  900. my $seen_error_count = 0;
  901. for ($!=0; defined($line=<DUMPFILE>); $!=0) {
  902. chomp($line);
  903. $line_count++;
  904. if ($line_count % 1000 == 0) {
  905. print STDERR "." if $showdots;
  906. }
  907. if ($line =~ /^v\s+/) { # variable line
  908. my @parsed_line = split(/\s+/, $line, 3);
  909. my $value = $parsed_line[1] + 0;
  910. if ($parsed_line[2] eq 'num_spam') {
  911. $num_spam = $value;
  912. } elsif ($parsed_line[2] eq 'num_nonspam') {
  913. $num_ham = $value;
  914. } else {
  915. dbg("bayes: restore_database: skipping unknown line: $line");
  916. }
  917. } elsif ($line =~ /^t\s+/) { # token line
  918. my @parsed_line = split(/\s+/, $line, 5);
  919. my $spam_count = $parsed_line[1] + 0;
  920. my $ham_count = $parsed_line[2] + 0;
  921. my $atime = $parsed_line[3] + 0;
  922. my $token = $parsed_line[4];
  923. my $token_warn_p = 0;
  924. my @warnings;
  925. if ($spam_count < 0) {
  926. $spam_count = 0;
  927. push(@warnings, 'spam count < 0, resetting');
  928. $token_warn_p = 1;
  929. }
  930. if ($ham_count < 0) {
  931. $ham_count = 0;
  932. push(@warnings, 'ham count < 0, resetting');
  933. $token_warn_p = 1;
  934. }
  935. if ($spam_count == 0 && $ham_count == 0) {
  936. dbg("bayes: token has zero spam and ham count, skipping");
  937. next;
  938. }
  939. if ($atime > time()) {
  940. $atime = time();
  941. push(@warnings, 'atime > current time, resetting');
  942. $token_warn_p = 1;
  943. }
  944. if ($token_warn_p) {
  945. dbg("bayes: token (%s) has the following warnings:\n%s",
  946. $token, join("\n",@warnings));
  947. }
  948. if ($db_version < 3) {
  949. # versions < 3 use plain text tokens, so we need to convert to hash
  950. $token = substr(sha1($token), -5);
  951. } else {
  952. # turn unpacked binary token back into binary value
  953. $token = pack("H*",$token);
  954. }
  955. unless ($self->_put_token($token, $spam_count, $ham_count, $atime)) {
  956. dbg("bayes: error inserting token for line: $line");
  957. $token_error_count++;
  958. }
  959. $token_count++;
  960. } elsif ($line =~ /^s\s+/) { # seen line
  961. my @parsed_line = split(/\s+/, $line, 3);
  962. my $flag = $parsed_line[1];
  963. my $msgid = $parsed_line[2];
  964. unless ($flag eq 'h' || $flag eq 's') {
  965. dbg("bayes: unknown seen flag ($flag) for line: $line, skipping");
  966. next;
  967. }
  968. unless ($msgid) {
  969. dbg("bayes: blank msgid for line: $line, skipping");
  970. next;
  971. }
  972. unless ($self->seen_put($msgid, $flag)) {
  973. dbg("bayes: error inserting msgid in seen table for line: $line");
  974. $seen_error_count++;
  975. }
  976. } else {
  977. dbg("bayes: skipping unknown line: $line");
  978. next;
  979. }
  980. if ($token_error_count >= 20) {
  981. warn "bayes: encountered too many errors (20) while parsing token line, reverting to empty database and exiting\n";
  982. $self->clear_database();
  983. return 0;
  984. }
  985. if ($seen_error_count >= 20) {
  986. warn "bayes: encountered too many errors (20) while parsing seen lines, reverting to empty database and exiting\n";
  987. $self->clear_database();
  988. return 0;
  989. }
  990. }
  991. defined $line || $!==0 or
  992. $!==EBADF ? dbg("bayes: error reading dump file: $!")
  993. : die "error reading dump file: $!";
  994. close(DUMPFILE) or die "Can't close dump file: $!";
  995. print STDERR "\n" if $showdots;
  996. unless (defined($num_spam)) {
  997. dbg("bayes: unable to find num spam, please check file");
  998. $error_p = 1;
  999. }
  1000. unless (defined($num_ham)) {
  1001. dbg("bayes: unable to find num ham, please check file");
  1002. $error_p = 1;
  1003. }
  1004. if ($error_p) {
  1005. dbg("bayes: error(s) while attempting to load $filename, clearing database, correct and re-run");
  1006. $self->clear_database();
  1007. return 0;
  1008. }
  1009. if ($num_spam || $num_ham) {
  1010. unless ($self->nspam_nham_change($num_spam, $num_ham)) {
  1011. dbg("bayes: error updating num spam and num ham, clearing database");
  1012. $self->clear_database();
  1013. return 0;
  1014. }
  1015. }
  1016. dbg("bayes: parsed $line_count lines");
  1017. dbg("bayes: created database with $token_count tokens based on $num_spam spam messages and $num_ham ham messages");
  1018. $self->untie_db();
  1019. return 1;
  1020. }
  1021. =head2 db_readable
  1022. public instance (Boolean) db_readable()
  1023. Description:
  1024. This method returns a boolean value indicating if the database is in a
  1025. readable state.
  1026. =cut
  1027. sub db_readable {
  1028. my($self) = @_;
  1029. #dbg("bayes: Entering db_readable");
  1030. return $self->{is_really_open} && $self->{is_officially_open};
  1031. }
  1032. =head2 db_writable
  1033. public instance (Boolean) db_writable()
  1034. Description:
  1035. This method returns a boolean value indicating if the database is in a
  1036. writable state.
  1037. =cut
  1038. sub db_writable {
  1039. my($self) = @_;
  1040. dbg("bayes: Entering db_writable");
  1041. return $self->{is_really_open} && $self->{is_officially_open} &&
  1042. $self->{is_writable};
  1043. }
  1044. =head2 _extract_atime
  1045. private instance () _extract_atime (String $token,
  1046. String $value,
  1047. String $index)
  1048. Description:
  1049. This method ensures that the database connection is properly setup and
  1050. working. If appropriate it will initialize a users bayes variables so
  1051. that they can begin using the database immediately.
  1052. =cut
  1053. sub _extract_atime {
  1054. my ($token, $value) = @_;
  1055. #dbg("bayes: Entering _extract_atime");
  1056. my($ts, $th, $atime) = _unpack_token($value);
  1057. #dbg("bayes: _extract_atime found $atime for $token");
  1058. $_[2] = $atime;
  1059. #dbg("bayes: Leaving db_writable");
  1060. return 0;
  1061. }
  1062. =head2 _put_token
  1063. FIXME: This is rarely a good interface, because of the churn that will
  1064. often happen in the "magic" tokens. Open-code this stuff in the
  1065. presence of loops.
  1066. =cut
  1067. sub _put_token {
  1068. my($self, $token, $ts, $th, $atime) = @_;
  1069. dbg("bayes: Entering _put_token");
  1070. $ts ||= 0;
  1071. $th ||= 0;
  1072. dbg("bayes: $token has spam $ts, ham $th, atime $atime");
  1073. my $value = $self->_get(tokens => $token, $rmw);
  1074. my $exists_already = defined $value ? 1 : 0;
  1075. dbg("bayes: $token exists: $exists_already");
  1076. if ($ts == 0 && $th == 0) {
  1077. return unless $exists_already; # If the token doesn't exist, just return
  1078. my $ntokens = $self->_get(vars => "NTOKENS", $rmw);
  1079. $self->{handles}->{vars}->db_put(NTOKENS => --$ntokens) == 0
  1080. or die "Couldn't put record: $BerkeleyDB::Error";
  1081. dbg("bayes: ntokens is $ntokens");
  1082. my $status = $self->{handles}->{tokens}->db_del($token);
  1083. $status == 0 || $status == DB_NOTFOUND
  1084. or die "Couldn't delete record: $BerkeleyDB::Error";
  1085. dbg("bayes: $token deleted");
  1086. } else {
  1087. unless ($exists_already) {
  1088. # If the token doesn't exist, raise the token count
  1089. my $ntokens = $self->_get(vars => "NTOKENS", $rmw);
  1090. $self->{handles}->{vars}->db_put(NTOKENS => ++$ntokens) == 0
  1091. or die "Couldn't put record: $BerkeleyDB::Error";
  1092. dbg("bayes: ntokens is $ntokens");
  1093. }
  1094. my $newmagic = $self->_get(vars => "NEWEST_TOKEN_AGE", $rmw) || 0;
  1095. dbg("bayes: NEWEST_TOKEN_AGE is $newmagic");
  1096. if ($atime > $newmagic) {
  1097. dbg("bayes: Updating NEWEST_TOKEN_AGE");
  1098. $self->{handles}->{vars}->db_put(NEWEST_TOKEN_AGE => $atime) == 0
  1099. or die "Couldn't put record: $BerkeleyDB::Error";
  1100. }
  1101. my $oldmagic = $self->_get(vars => "OLDEST_TOKEN_AGE", $rmw) || time;
  1102. dbg("bayes: OLDEST_TOKEN_AGE is $oldmagic");
  1103. if ($atime && $atime < $oldmagic) {
  1104. dbg("bayes: Updating OLDEST_TOKEN_AGE to $atime");
  1105. $self->{handles}->{vars}->db_put(OLDEST_TOKEN_AGE => $atime) == 0
  1106. or die "Couldn't put record: $BerkeleyDB::Error";
  1107. }
  1108. my $value = _pack_token($ts, $th, $atime);
  1109. dbg("bayes: Setting $token to $value");
  1110. dbg("bayes: Handle is $self->{handles}->{tokens}");
  1111. $self->{handles}->{tokens}->db_put($token, $value) == 0
  1112. or die "Couldn't put record: $BerkeleyDB::Error";
  1113. }
  1114. dbg("bayes: Leaving _put_token");
  1115. return 1;
  1116. }
  1117. # token marshalling format for tokens.
  1118. # Since we may have many entries with few hits, especially thousands of hapaxes
  1119. # (1-occurrence entries), use a flexible entry format, instead of simply "2
  1120. # packed ints", to keep the memory and disk space usage down. In my
  1121. # 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we
  1122. # can use a 1-byte representation for the other 91% of low-hitting entries
  1123. # and save masses of space.
  1124. # This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3
  1125. # ham-count bits). If XX in the first byte is 11, it's packed as this 1-byte
  1126. # representation; otherwise, if XX in the first byte is 00, it's packed as
  1127. # "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format.
  1128. # Savings: roughly halves size of toks db, at the cost of a ~10% slowdown.
  1129. use constant FORMAT_FLAG => 0xc0; # 11000000
  1130. use constant ONE_BYTE_FORMAT => 0xc0; # 11000000
  1131. use constant TWO_LONGS_FORMAT => 0x00; # 00000000
  1132. use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000
  1133. use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111
  1134. sub _unpack_token {
  1135. my $value = shift || 0;
  1136. my($packed, $ts, $th, $atime) = unpack("CVVV", $value);
  1137. if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
  1138. return (($packed & ONE_BYTE_SSS_BITS) >> 3,
  1139. $packed & ONE_BYTE_HHH_BITS,
  1140. $ts || 0);
  1141. # The one-byte-format uses that first 32-bit long as atime
  1142. } elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
  1143. return ($ts || 0, $th || 0, $atime || 0);
  1144. } else {
  1145. warn "bayes: unknown packing format for bayes db, please re-learn: $packed";
  1146. return (0, 0, 0);
  1147. }
  1148. }
  1149. sub _pack_token {
  1150. my($ts, $th, $atime) = @_;
  1151. $ts ||= 0; $th ||= 0; $atime ||= 0;
  1152. if ($ts < 8 && $th < 8) {
  1153. return pack("CV", (ONE_BYTE_FORMAT | ($ts << 3) | $th) & 255, $atime);
  1154. } else {
  1155. return pack("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
  1156. }
  1157. }
  1158. sub _get {
  1159. my ($self, $table, $key, $flags) = @_;
  1160. $flags |= 0;
  1161. my $value = "";
  1162. my $status = $self->{handles}->{$table}->db_get($key => $value, $flags);
  1163. if ($status == 0) {
  1164. return $value;
  1165. } elsif ($status == DB_NOTFOUND) {
  1166. return;
  1167. } else {
  1168. die "Couldn't get record: $BerkeleyDB::Error";
  1169. }
  1170. }
  1171. sub _mget {
  1172. my ($self, $table, $keys, $flags) = @_;
  1173. my @results;
  1174. $flags |= 0;
  1175. my $handle = $self->{handles}->{$table};
  1176. for my $key (@$keys) {
  1177. my $value = "";
  1178. my $status = $handle->db_get($key => $value, $flags);
  1179. undef $value if $status != 0;
  1180. $status == 0 || $status == DB_NOTFOUND
  1181. or die "Couldn't get record: $BerkeleyDB::Error";
  1182. push(@results, $value);
  1183. }
  1184. return @results;
  1185. }
  1186. sub sa_die { Mail::SpamAssassin::sa_die(@_); }
  1187. 1;