PageRenderTime 48ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/MogileFS/Worker/Delete.pm

http://github.com/mogilefs/MogileFS-Server
Perl | 385 lines | 238 code | 68 blank | 79 comment | 37 complexity | 59ca047415b748a7d9626a1bb0d29c05 MD5 | raw file
  1. package MogileFS::Worker::Delete;
  2. # deletes files
  3. use strict;
  4. use base 'MogileFS::Worker';
  5. use MogileFS::Util qw(error);
  6. use MogileFS::Server;
  # Batch sizing for the legacy delete queue: process_deletes selects up to
  # LIMIT rows but only works on a random PER_BATCH of them, to allow for
  # stateless parallelism between workers.
  use constant {
      LIMIT     => 1000,
      PER_BATCH => 100,
  };
  # Constructor.
  #   $class - class to instantiate
  #   $psock - passed straight through to the parent class constructor
  #            (presumably the socket to the parent process -- confirm
  #            against MogileFS::Worker)
  # Returns the newly created, fields-based worker object.
  sub new {
      my $class = shift;
      my $psock = shift;

      my $worker = fields::new($class);
      $worker->SUPER::new($psock);

      return $worker;
  }
  # Returns the constant 120.
  # NOTE(review): presumably the watchdog timeout for this worker, in
  # seconds -- confirm against the base class.
  sub watchdog_timeout {
      return 120;
  }
  # Main loop of the delete worker; never returns.
  #
  # Every iteration:
  #   1. tells the parent we are bored and reads from the parent,
  #   2. skips the rest of the pass unless validate_dbh succeeds,
  #   3. cleans up old tempfiles if the 'mgfs:tempfiles' lock can be taken,
  #   4. services the old delete queue (subject to a backoff when it is idle),
  #   5. services the new-style delete queue,
  #   6. sleeps for a growing interval (capped at $sleep_max) when none of
  #      the steps reported any work.
  #
  # The process_* methods return true when they did work and false when they
  # think they are done for now.
  sub work {
      my $self = shift;

      my $sleep_for         = 0;   # grows by one per idle pass until it hits $sleep_max
      my $sleep_max         = 5;   # longest sleep when there's nothing to do
      my $old_queue_check   = 0;   # next time at which the old queue is checked
      my $old_queue_backoff = 0;   # backoff index for the old queue
      my $lock              = 'mgfs:tempfiles';

      while (1) {
          $self->send_to_parent("worker_bored 50 delete");
          $self->read_from_parent(1);
          next unless $self->validate_dbh;

          # Tempfile cleanup isn't something we need to wait for: we just need
          # to ensure somebody is doing it, so skip it when the lock is not
          # obtained.
          my $tempres;
          if (Mgd::get_store()->get_lock($lock, 0)) {
              $tempres = $self->process_tempfiles;
              Mgd::get_store()->release_lock($lock);
          }

          my $delres;
          if (time() > $old_queue_check) {
              $self->reenqueue_delayed_deletes;
              $delres = $self->process_deletes;

              if ($delres) {
                  # the old queue had work: check it again right away
                  $old_queue_backoff = 0;
                  $old_queue_check   = 0;
              }
              else {
                  # if we did no work, crawl the backoff; the next check is
                  # only pushed into the future once the index exceeds 360.
                  if ($old_queue_backoff > 360) {
                      $old_queue_check = time() + $old_queue_backoff;
                  }
                  $old_queue_backoff++ unless $old_queue_backoff > 1800;
              }
          }

          my $delres2 = $self->process_deletes2;

          # someone did some work: go straight round again
          if ($tempres || $delres || $delres2) {
              $sleep_for = 0;
              next;
          }

          # nobody did any work, so let's sleep
          $sleep_for++ if $sleep_for < $sleep_max;
          sleep $sleep_for;
      }
  }
  # Deletes a given DevFID from its storage device by issuing an HTTP DELETE
  # for the DevFID's uri_path to the host of its device.
  #   $dfid - object providing ->url, ->uri_path and ->device->host
  # Returns 1 when the response code is 2xx or 404, otherwise logs the
  # status line and returns 0.
  sub delete_devfid {
      my ($self, $dfid) = @_;

      # send delete request
      error("Sending delete for " . $dfid->url) if $Mgd::DEBUG >= 2;

      my $response;
      my $host = $dfid->device->host;
      $host->http("DELETE", $dfid->uri_path, undef, sub { ($response) = @_ });

      # Run the event loop; the post-loop callback stays true for as long as
      # the completion callback above has not stored a response yet.
      Danga::Socket->SetPostLoopCallback(sub { !defined $response });
      Danga::Socket->EventLoop;

      my $code = $response->code;

      # 2xx, or 404 (nothing there to delete): effectively means all went well
      my $succeeded = ($code >= 200 && $code <= 299) || $code == 404;
      return 1 if $succeeded;

      my $status = $response->status_line;
      error("Error: unlink failure: " . $dfid->url . ": HTTP code $status");
      return 0;
  }
  # Purges old rows from the tempfile table and queues whatever they may have
  # left behind on storage devices for deletion.
  #
  # The age threshold passed to the store's old_tempfiles() is 3600 unless
  # overridden by $ENV{T_TEMPFILE_TOO_OLD} (presumably seconds -- confirm
  # against the store implementation).
  #
  # Returns 1 if any tempfile was queued for deletion, 0 otherwise.
  sub process_tempfiles {
      my $self = shift;

      # also clean the tempfile table
      #mysql> select * from tempfile where createtime < unix_timestamp() - 86400 limit 50;
      #+--------+------------+---------+------+---------+--------+
      #| fid    | createtime | classid | dmid | dkey    | devids |
      #+--------+------------+---------+------+---------+--------+
      #|   3253 | 1149451058 |       1 |    1 | file574 | 1,2    |
      #|   4559 | 1149451156 |       1 |    1 | file83  | 1,2    |
      #|  11024 | 1149451697 |       1 |    1 | file836 | 2,1    |
      #|  19885 | 1149454542 |       1 |    1 | file531 | 1,2    |
      #
      # BUT NOTE:
      # the fids might exist on one of the devices in the devids column if we
      # assigned them those, they wrote some to one of them, then they died or
      # for whatever reason didn't create_close to us, so we shouldn't delete
      # from tempfile before going on a hunt of the missing fid.  perhaps we
      # should just add to the file_on table for both devids, and let the
      # regular delete process discover via 404 that they're not there.
      #
      # so we should:
      #   select fid, devids from tempfile where createtime < unix_timestamp() - 86400
      #   add file_on rows for both of those,
      #   add fid to fids_to_delete table,
      #   delete from tempfile where fid=?

      # dig up some temporary files to purge
      my $sto       = Mgd::get_store();
      my $too_old   = int($ENV{T_TEMPFILE_TOO_OLD} || 3600);
      my $tempfiles = $sto->old_tempfiles($too_old);
      return 0 unless $tempfiles && @$tempfiles;

      # insert the right rows into file_on and file_to_delete and remove the
      # now expunged (or soon to be) rows from tempfile
      my (@devfids, @fidids);
      foreach my $row (@$tempfiles) {
          my $fidid  = $row->[0];
          my $devids = $row->[1];

          # If FID is still loadable, we've arrived here due to a bug or race
          # condition elsewhere.  Remove the tempfile row but don't delete the
          # file!
          my $fid = MogileFS::FID->new($fidid);
          if ($fid->exists) {
              $sto->delete_tempfile_row($fidid);
              next;
          }
          push @fidids, $fidid;

          # Sanity check the string column: it must be exactly a comma
          # separated list of integers.  \A and \z are used because a trailing
          # '$' would also accept a value ending in a newline, which would
          # then leak into the last devid.  A row failing the check still has
          # its fid queued, it just contributes no file_on rows.
          next unless defined $devids && $devids =~ /\A\d+(?:,\d+)*\z/;

          foreach my $devid (split /,/, $devids) {
              push @devfids, MogileFS::DevFID->new($devid, $fidid);
          }
      }

      # We might've done no work due to discovering the tempfiles are real.
      return 0 unless @fidids;

      $sto->mass_insert_file_on(@devfids);
      $sto->enqueue_fids_to_delete2(@fidids);
      # @fidids holds the fid column values read back from the tempfile table.
      $sto->dbh->do("DELETE FROM tempfile WHERE fid IN (" . join(',', @fidids) . ")");
      return 1;
  }
  # New style delete queueing.  Not a lot of effort has gone into commonizing
  # code between the old queue (process_deletes) and this one.  Feel free to
  # send a patch!
  #
  # Takes the 'delete' work items from queue_todo and, for every fid, tries to
  # delete it from each device it is recorded on.  Devices that cannot be
  # handled right now cause the fid to be rescheduled; once no device is left
  # pending the fid is dropped from the delete queue.
  #
  # Returns 0 when the queue was empty, 1 otherwise.
  sub process_deletes2 {
      my $self = shift;
      my $sto  = Mgd::get_store();

      my $queue_todo = $self->queue_todo('delete');
      return 0 unless @$queue_todo;    # No work.

      FID:
      while (my $todo = shift @$queue_todo) {
          $self->still_alive;

          # load all the devids related to this fid, and delete.
          my $fid   = MogileFS::FID->new($todo->{fid});
          my $fidid = $fid->id;

          # if it's currently being replicated, wait for replication to finish
          # before deleting to avoid stale files
          unless ($sto->should_begin_replicating_fidid($fidid)) {
              $sto->reschedule_file_to_delete2_relative($fidid, 1);
              next FID;
          }

          $sto->delete_fidid_enqueued($fidid);

          my @devids  = $fid->devids;
          # devids that still (possibly) hold a copy of the fid
          my %pending = map { $_ => 1 } @devids;

          DEVICE:
          for my $devid (@devids) {
              my $dev = $devid ? Mgd::device_factory()->get_by_id($devid) : undef;
              error("deleting fid $fidid, on devid ".($devid || 'NULL')."...") if $Mgd::DEBUG >= 2;

              # unknown device: nothing we can do with it here
              next DEVICE unless $dev;

              # permanently dead device: just forget the location
              if ($dev->dstate->is_perm_dead) {
                  $sto->remove_fidid_from_devid($fidid, $devid);
                  delete $pending{$devid};
                  next DEVICE;
              }

              # devid is observed down/readonly: delay for at least
              # 10 minutes.
              if (! $dev->observed_writeable) {
                  $sto->reschedule_file_to_delete2_relative($fidid,
                      60 * (10 + $todo->{failcount}));
                  next DEVICE;
              }

              # devid is marked readonly/down/etc: delay for
              # at least 1 hour.
              if (! $dev->can_delete_from) {
                  $sto->reschedule_file_to_delete2_relative($fidid,
                      60 * 60 * (1 + $todo->{failcount}));
                  next DEVICE;
              }

              my $dfid = MogileFS::DevFID->new($dev, $fidid);
              my $path = $dfid->url;

              # dormando: "There are cases where url can return undefined,
              # Mogile appears to try to replicate to bogus devices
              # sometimes?"
              if (! $path) {
                  error("in deleter, url(devid=$devid, fid=$fidid) returned nothing");
                  next DEVICE;
              }

              if ($self->delete_devfid($dfid)) {
                  # effectively means all went well
                  $sto->remove_fidid_from_devid($fidid, $devid);
                  delete $pending{$devid};
              }
              else {
                  # remote file system error? connect failure? retry in 30min
                  $sto->reschedule_file_to_delete2_relative($fidid,
                      60 * 30 * (1 + $todo->{failcount}));
              }
          }

          # fid has no pants: no location is left pending, so drop it from
          # the queue.
          $sto->delete_fid_from_file_to_delete2($fidid) unless keys %pending;

          $sto->note_done_replicating($fidid);
      }

      # did work.
      return 1;
  }
  # Old style delete queue (file_to_delete table).
  #
  # Selects up to LIMIT (fid, devid) pairs by left-joining file_to_delete
  # against file_on, shuffles them and handles at most PER_BATCH of them, so
  # that several workers can run this without coordinating.
  #
  # Returns 0 when the query produced no rows, 1 otherwise ("as far as we
  # know, we have more work to do").  Dies if removing a file_on row leaves an
  # error on the database handle.
  sub process_deletes {
      my $self = shift;
      my $sto  = Mgd::get_store();
      my $dbh  = $sto->dbh;

      # shuffle() is called by its fully qualified name below and this file
      # does not otherwise load List::Util, so load it here instead of
      # depending on some other module having done so.
      require List::Util;

      my $delmap = $dbh->selectall_arrayref("SELECT fd.fid, fo.devid ".
                                            "FROM file_to_delete fd ".
                                            "LEFT JOIN file_on fo ON fd.fid=fo.fid ".
                                            "LIMIT " . LIMIT);
      return 0 unless $delmap && @$delmap;

      my $done = 0;
      foreach my $dm (List::Util::shuffle(@$delmap)) {
          last if ++$done > PER_BATCH;
          $self->still_alive;

          my ($fid, $devid) = @$dm;
          error("deleting fid $fid, on devid ".($devid || 'NULL')."...") if $Mgd::DEBUG >= 2;

          # Drops the fid from the old delete queue.  Callers pass a reason
          # label as the first argument; it is not used.
          my $done_with_fid = sub {
              $dbh->do("DELETE FROM file_to_delete WHERE fid=?", undef, $fid);
              $sto->condthrow("Failure to delete from file_to_delete for fid=$fid");
          };

          # Forgets that the fid lives on this devid.  Callers pass a reason
          # label as the first argument; it is not used.
          my $done_with_devid = sub {
              $dbh->do("DELETE FROM file_on WHERE fid=? AND devid=?",
                       undef, $fid, $devid);
              $sto->condthrow("Failure to delete from file_on for $fid/$devid");
              die "Failed to delete from file_on: " . $dbh->errstr if $dbh->err;
          };

          # Moves the fid to file_to_delete_later, due $secs seconds from now,
          # and removes it from the current queue.
          my $reschedule_fid = sub {
              my ($secs, $reason) = (int(shift), shift);
              $sto->insert_ignore("INTO file_to_delete_later (fid, delafter) ".
                                  "VALUES (?,".$sto->unix_timestamp."+$secs)", undef,
                                  $fid);
              error("delete of fid $fid rescheduled: $reason") if $Mgd::DEBUG >= 2;
              $done_with_fid->("rescheduled");
          };

          # Cases:
          #   devid is null:  doesn't exist anywhere anymore, we're done with this fid.
          #   devid is observed down/readonly: delay for 10 minutes
          #   devid is marked readonly: delay for 2 hours
          #   devid is marked dead or doesn't exist: consider it deleted on this devid.

          # CASE: devid is null, which means we're done deleting all instances.
          unless (defined $devid) {
              $done_with_fid->("no_more_locations");
              next;
          }

          # CASE: devid is marked dead or doesn't exist: consider it deleted on this devid.
          # (Note: we're tolerant of '0' as a devid, due to old buggy version which
          # would sometimes put that in there)
          my $dev = $devid ? Mgd::device_factory()->get_by_id($devid) : undef;
          unless ($dev) {
              $done_with_devid->("devid_doesnt_exist");
              next;
          }
          if ($dev->dstate->is_perm_dead) {
              $done_with_devid->("devid_marked_dead");
              next;
          }

          # CASE: devid is observed down/readonly: delay for 10 minutes
          unless ($dev->observed_writeable) {
              $reschedule_fid->(60 * 10, "not_observed_writeable");
              next;
          }

          # CASE: devid is marked readonly/down/etc: delay for 2 hours
          unless ($dev->can_delete_from) {
              $reschedule_fid->(60 * 60 * 2, "devid_marked_not_alive");
              next;
          }

          my $dfid = MogileFS::DevFID->new($dev, $fid);
          my $path = $dfid->url;

          # dormando: "There are cases where url can return undefined,
          # Mogile appears to try to replicate to bogus devices
          # sometimes?"
          unless ($path) {
              error("in deleter, url(devid=$devid, fid=$fid) returned nothing");
              next;
          }

          if ($self->delete_devfid($dfid)) {
              $done_with_devid->("deleted");
          } else {
              # remote file system error?  connect failure?  retry in 30min
              $reschedule_fid->(60 * 30, "http_failure");
          }
      }

      # as far as we know, we have more work to do
      return 1;
  }
  # Moves the fids reported by the store's fids_to_delete_again back onto the
  # delete queue and removes them from file_to_delete_later.
  #
  # Returns nothing when there are no such fids; otherwise returns whatever
  # the final condthrow call on the store returns.
  sub reenqueue_delayed_deletes {
      my $self = shift;
      my $sto  = Mgd::get_store();
      my $dbh  = $sto->dbh;

      my @fidids = $sto->fids_to_delete_again;
      return unless @fidids;

      $sto->enqueue_fids_to_delete(@fidids);

      my $id_list = join(",", @fidids);
      $dbh->do("DELETE FROM file_to_delete_later WHERE fid IN ($id_list)");
      return $sto->condthrow("reenqueue file_to_delete_later delete");
  }
  312. 1;
  313. # Local Variables:
  314. # mode: perl
  315. # c-basic-indent: 4
  316. # indent-tabs-mode: nil
  317. # End: