PageRenderTime 53ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/apps/couch/src/couch_replication_manager.erl

http://github.com/cloudant/bigcouch
Erlang | 629 lines | 510 code | 93 blank | 26 comment | 0 complexity | d74b49f4e210f64038416664a0c3862c MD5 | raw file
Possible License(s): Apache-2.0
  1. % Licensed under the Apache License, Version 2.0 (the "License"); you may not
  2. % use this file except in compliance with the License. You may obtain a copy of
  3. % the License at
  4. %
  5. % http://www.apache.org/licenses/LICENSE-2.0
  6. %
  7. % Unless required by applicable law or agreed to in writing, software
  8. % distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  9. % WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  10. % License for the specific language governing permissions and limitations under
  11. % the License.
  % Module declaration, exports, shared macros and the records used to track
  % replications started from documents in the replicator database.
  -module(couch_replication_manager).
  -behaviour(gen_server).

  % public API
  -export([
      replication_started/1,
      replication_completed/1,
      replication_error/2
  ]).

  % gen_server callbacks
  -export([
      start_link/0,
      init/1,
      handle_call/3,
      handle_info/2,
      handle_cast/2,
      code_change/3,
      terminate/2
  ]).

  -include("couch_db.hrl").
  -include("couch_js_functions.hrl").

  % Named ETS tables: replication document id -> replication id, and
  % replication id -> #rep_state{}.
  -define(DOC_TO_REP, couch_rep_doc_id_to_rep_id).
  -define(REP_TO_STATE, couch_rep_id_to_rep_state).

  % Retry back-off bounds.
  -define(INITIAL_WAIT, 2.5). % seconds
  -define(MAX_WAIT, 600). % seconds

  % gen_server state.
  -record(state, {
      changes_feed_loop = nil,
      db_notifier = nil,
      rep_db_name = nil,
      rep_start_pids = [],
      max_retries
  }).

  % Per-replication bookkeeping stored in the ?REP_TO_STATE table.
  -record(rep_state, {
      doc_id,
      user_ctx,
      doc,
      starting,
      retries_left,
      max_retries,
      wait = ?INITIAL_WAIT
  }).

  -import(couch_util, [
      get_value/2,
      get_value/3,
      to_binary/1
  ]).
  % Starts the replication manager gen_server, registered locally under the
  % module name.
  start_link() ->
      ServerName = {local, ?MODULE},
      gen_server:start_link(ServerName, ?MODULE, [], []).
  % Called when the replication RepId has started. If the replication is
  % tracked in the state table, its document is marked as "triggered" (and
  % tagged with the base replication id) and the server is told about it.
  % Untracked replications are ignored.
  replication_started({BaseId, _} = RepId) ->
      case rep_state(RepId) of
          #rep_state{doc_id = DocId} ->
              Fields = [
                  {<<"_replication_state">>, <<"triggered">>},
                  {<<"_replication_id">>, ?l2b(BaseId)}
              ],
              update_rep_doc(DocId, Fields),
              ok = gen_server:call(?MODULE, {rep_started, RepId}, infinity),
              ?LOG_INFO("Document `~s` triggered replication `~s`",
                  [DocId, pp_rep_id(RepId)]);
          nil ->
              ok
      end.
  % Called when the replication RepId has finished. If the replication is
  % tracked, its document is marked as "completed" and the server is asked to
  % forget the replication state. Untracked replications are ignored.
  replication_completed(RepId) ->
      case rep_state(RepId) of
          #rep_state{doc_id = DocId} ->
              Fields = [{<<"_replication_state">>, <<"completed">>}],
              update_rep_doc(DocId, Fields),
              ok = gen_server:call(?MODULE, {rep_complete, RepId}, infinity),
              ?LOG_INFO("Replication `~s` finished (triggered by document `~s`)",
                  [pp_rep_id(RepId), DocId]);
          nil ->
              ok
      end.
  % Called when the replication RepId failed with Error. If the replication is
  % tracked, its document is marked as "error" (and tagged with the base
  % replication id) and the error is handed to the server, which decides
  % whether to retry. Untracked replications are ignored.
  replication_error({BaseId, _} = RepId, Error) ->
      case rep_state(RepId) of
          #rep_state{doc_id = DocId} ->
              % TODO: maybe add error reason to replication document
              Fields = [
                  {<<"_replication_state">>, <<"error">>},
                  {<<"_replication_id">>, ?l2b(BaseId)}
              ],
              update_rep_doc(DocId, Fields),
              ok = gen_server:call(?MODULE, {rep_error, RepId, Error}, infinity);
          nil ->
              ok
      end.
  % gen_server init callback.
  %
  % Traps exits (the changes feed loop, the db update notifier and the
  % replication starter processes are all linked), creates the two named ETS
  % tables, registers a configuration-change callback, starts the changes feed
  % loop on the replicator database and starts the db update notifier.
  %
  % Returns {ok, #state{}}.
  init(_) ->
      process_flag(trap_exit, true),
      ?DOC_TO_REP = ets:new(?DOC_TO_REP, [named_table, set, protected]),
      ?REP_TO_STATE = ets:new(?REP_TO_STATE, [named_table, set, protected]),
      Server = self(),
      ok = couch_config:register(
          fun("replicator", "db", NewName) ->
                  ok = gen_server:cast(Server, {rep_db_changed, ?l2b(NewName)});
             ("replicator", "max_replication_retry_count", V) ->
                  ok = gen_server:cast(Server, {set_max_retries, retries_value(V)});
             (_Section, _Key, _Value) ->
                  % NOTE(review): presumably the callback is invoked for every
                  % configuration change; settings this module does not care
                  % about are ignored here instead of raising function_clause
                  % inside the callback.
                  ok
          end
      ),
      {Loop, RepDbName} = changes_feed_loop(),
      Notifier = db_update_notifier(),
      MaxRetries = retries_value(
          couch_config:get("replicator", "max_replication_retry_count", "10")),
      {ok, #state{
          changes_feed_loop = Loop,
          rep_db_name = RepDbName,
          db_notifier = Notifier,
          max_retries = MaxRetries
      }}.
  % gen_server call handler.
  %
  %   {rep_db_update, Change}    - a change row from the replicator database
  %                                feed; failures while processing it are
  %                                reported on the document, never crash the
  %                                server.
  %   {rep_started, RepId}       - resets the retry bookkeeping of a tracked
  %                                replication.
  %   {rep_complete, RepId}      - drops the state of a finished replication.
  %   {rep_error, RepId, Error}  - lets replication_error/3 decide on a retry.
  %
  % Any other call is logged and stops the server.
  handle_call({rep_db_update, {ChangeProps} = Change}, _From, State) ->
      NewState =
          try
              process_update(State, Change)
          catch
              _Class:Reason ->
                  {DocProps} = get_value(doc, ChangeProps),
                  rep_db_update_error(Reason, get_value(<<"_id">>, DocProps)),
                  State
          end,
      {reply, ok, NewState};
  handle_call({rep_started, RepId}, _From, State) ->
      case rep_state(RepId) of
          nil ->
              ok;
          Tracked ->
              MaxRetries = State#state.max_retries,
              Running = Tracked#rep_state{
                  starting = false,
                  retries_left = MaxRetries,
                  max_retries = MaxRetries,
                  wait = ?INITIAL_WAIT
              },
              true = ets:insert(?REP_TO_STATE, {RepId, Running})
      end,
      {reply, ok, State};
  handle_call({rep_complete, RepId}, _From, State) ->
      true = ets:delete(?REP_TO_STATE, RepId),
      {reply, ok, State};
  handle_call({rep_error, RepId, Error}, _From, State) ->
      NewState = replication_error(State, RepId, Error),
      {reply, ok, NewState};
  handle_call(Msg, From, State) ->
      ?LOG_ERROR("Replication manager received unexpected call ~p from ~p",
          [Msg, From]),
      {stop, {error, {unexpected_call, Msg}}, State}.
  % gen_server cast handler.
  %
  % rep_db_changed / rep_db_created carry a database name: when it equals the
  % name already being followed nothing happens, otherwise everything is
  % restarted against the (new) replicator database. set_max_retries updates
  % the retry limit used for replications started from now on. Any other cast
  % is logged and stops the server.
  handle_cast({Event, DbName}, State)
          when Event =:= rep_db_changed; Event =:= rep_db_created ->
      case State#state.rep_db_name of
          DbName ->
              {noreply, State};
          _Other ->
              {noreply, restart(State)}
      end;
  handle_cast({set_max_retries, MaxRetries}, State) ->
      {noreply, State#state{max_retries = MaxRetries}};
  handle_cast(Msg, State) ->
      ?LOG_ERROR("Replication manager received unexpected cast ~p", [Msg]),
      {stop, {error, {unexpected_cast, Msg}}, State}.
  % gen_server info handler. Clause order matters: the changes feed loop and
  % the notifier are recognised before the generic "normal exit" clause.
  handle_info({'EXIT', Pid, normal}, #state{changes_feed_loop = Pid} = State) ->
      % replicator DB deleted
      Cleared = State#state{changes_feed_loop = nil, rep_db_name = nil},
      {noreply, Cleared};
  handle_info({'EXIT', Pid, Reason}, #state{db_notifier = Pid} = State) ->
      ?LOG_ERROR("Database update notifier died. Reason: ~p", [Reason]),
      {stop, {db_update_notifier_died, Reason}, State};
  handle_info({'EXIT', Pid, normal}, #state{rep_start_pids = StartPids} = State) ->
      % one of the replication start processes terminated successfully
      Remaining = lists:delete(Pid, StartPids),
      {noreply, State#state{rep_start_pids = Remaining}};
  handle_info({'DOWN', _Ref, _, _, _}, State) ->
      % From a db monitor created by a replication process. Ignore.
      {noreply, State};
  handle_info(Msg, State) ->
      ?LOG_ERROR("Replication manager received unexpected message ~p", [Msg]),
      {stop, {unexpected_msg, Msg}, State}.
  % gen_server terminate callback: ends every tracked replication, stops the
  % changes feed loop and the replication starter processes, deletes both ETS
  % tables and finally stops the db update notifier.
  terminate(_Reason, State) ->
      Loop = State#state.changes_feed_loop,
      StartPids = State#state.rep_start_pids,
      Notifier = State#state.db_notifier,
      stop_all_replications(),
      % `catch` absorbs the badarg raised when Loop is `nil`.
      _ = [begin
               catch unlink(Pid),
               catch exit(Pid, stop)
           end || Pid <- [Loop | StartPids]],
      true = ets:delete(?REP_TO_STATE),
      true = ets:delete(?DOC_TO_REP),
      couch_db_update_notifier:stop(Notifier).
  % gen_server code_change callback: the state needs no conversion.
  code_change(_FromVsn, Unchanged, _Extra) ->
      {ok, Unchanged}.
  % Spawns a linked process that follows the continuous changes feed of the
  % replicator database and forwards every change whose document id is
  % acceptable (see has_valid_rep_id/1) to the calling process as a
  % synchronous {rep_db_update, Change} call.
  %
  % Returns {LoopPid, ReplicatorDbName}.
  changes_feed_loop() ->
      {ok, RepDb} = ensure_rep_db_exists(),
      Manager = self(),
      FeedArgs = #changes_args{
          include_docs = true,
          feed = "continuous",
          timeout = infinity,
          db_open_options = [sys_db]
      },
      OnEvent =
          fun({change, Change, _}, _) ->
                  case has_valid_rep_id(Change) of
                      true ->
                          ok = gen_server:call(
                              Manager, {rep_db_update, Change}, infinity);
                      false ->
                          ok
                  end;
             (_, _) ->
                  ok
          end,
      Pid = spawn_link(
          fun() ->
              Feed = couch_changes:handle_changes(
                  FeedArgs, {json_req, null}, RepDb),
              Feed(OnEvent)
          end
      ),
      % The handle opened above is closed here, right after the feed process
      % has been spawned.
      couch_db:close(RepDb),
      {Pid, couch_db:name(RepDb)}.
  % Accepts either a change row ({Props}, whose <<"id">> is inspected) or a
  % document id. Returns false for design document ids and true for
  % everything else.
  has_valid_rep_id({ChangeProps}) ->
      DocId = get_value(<<"id">>, ChangeProps),
      has_valid_rep_id(DocId);
  has_valid_rep_id(DocId) ->
      case DocId of
          <<?DESIGN_DOC_PREFIX, _/binary>> ->
              false;
          _ ->
              true
      end.
  % Starts a linked db update notifier that casts {rep_db_created, DbName} to
  % the calling process whenever a database named like the configured
  % replicator database is created. Returns the notifier.
  db_update_notifier() ->
      Manager = self(),
      OnDbEvent =
          fun({created, DbName}) ->
                  Configured =
                      ?l2b(couch_config:get("replicator", "db", "_replicator")),
                  case Configured =:= DbName of
                      true ->
                          ok = gen_server:cast(Manager, {rep_db_created, DbName});
                      false ->
                          ok
                  end;
             (_) ->
                  % no need to handle the 'deleted' event - the changes feed
                  % loop dies when the database is deleted
                  ok
          end,
      {ok, Notifier} = couch_db_update_notifier:start_link(OnDbEvent),
      Notifier.
  % Ends every tracked replication, stops the current changes feed loop and
  % all replication starter processes, then starts a fresh changes feed loop.
  % Returns the state updated with the new loop and replicator database name.
  restart(#state{changes_feed_loop = Loop, rep_start_pids = StartPids} = State) ->
      stop_all_replications(),
      % `catch` absorbs the badarg raised when Loop is `nil`.
      _ = [begin
               catch unlink(Pid),
               catch exit(Pid, rep_db_changed)
           end || Pid <- [Loop | StartPids]],
      {NewLoop, NewRepDbName} = changes_feed_loop(),
      State#state{
          changes_feed_loop = NewLoop,
          rep_db_name = NewRepDbName,
          rep_start_pids = []
      }.
  % Handles one change row of the replicator database and returns the
  % (possibly updated) server state.
  %
  %   deleted document            -> its replication, if any, is stopped
  %   no state / "triggered"      -> the replication is started if needed
  %   "completed"                 -> bookkeeping for the document is dropped
  %   "error"                     -> restarted only when the document is not
  %                                  tracked in the ?DOC_TO_REP table
  process_update(State, {Change}) ->
      {RepProps} = JsonRepDoc = get_value(doc, Change),
      DocId = get_value(<<"_id">>, RepProps),
      Deleted = get_value(<<"deleted">>, Change, false),
      case Deleted of
          true ->
              rep_doc_deleted(DocId),
              State;
          false ->
              case get_value(<<"_replication_state">>, RepProps) of
                  RepStatus when RepStatus =:= undefined;
                                 RepStatus =:= <<"triggered">> ->
                      maybe_start_replication(State, DocId, JsonRepDoc);
                  <<"completed">> ->
                      replication_complete(DocId),
                      State;
                  <<"error">> ->
                      case ets:member(?DOC_TO_REP, DocId) of
                          false ->
                              maybe_start_replication(State, DocId, JsonRepDoc);
                          true ->
                              State
                      end
              end
      end.
  % Logs a failure to process replication document DocId and marks the
  % document with the "error" state. For {bad_rep_doc, Reason} errors the
  % inner reason is logged; any other error term is converted with
  % to_binary/1 first.
  rep_db_update_error(Error, DocId) ->
      Reason =
          case Error of
              {bad_rep_doc, Why} ->
                  Why;
              _ ->
                  to_binary(Error)
          end,
      ?LOG_ERROR("Replication manager, error processing document `~s`: ~s",
          [DocId, Reason]),
      update_rep_doc(DocId, [{<<"_replication_state">>, <<"error">>}]).
  % Builds the #user_ctx{} for a replication document from its
  % <<"user_ctx">> object. A missing object yields the default record;
  % missing name/roles default to null and [] respectively.
  rep_user_ctx({RepDoc}) ->
      case get_value(<<"user_ctx">>, RepDoc) of
          {CtxProps} ->
              Name = get_value(<<"name">>, CtxProps, null),
              Roles = get_value(<<"roles">>, CtxProps, []),
              #user_ctx{name = Name, roles = Roles};
          undefined ->
              #user_ctx{}
      end.
  % Starts the replication described by RepDoc unless it is already tracked.
  %
  % Untracked: a #rep_state{} is stored, the document id is mapped to the
  % replication id and a linked starter process is spawned (no initial delay).
  % Tracked by this very document: nothing to do.
  % Tracked by another document: this document is only tagged with the
  % replication id.
  %
  % Returns the (possibly updated) server state.
  maybe_start_replication(State, DocId, RepDoc) ->
      UserCtx = rep_user_ctx(RepDoc),
      {BaseId, _} = RepId = make_rep_id(RepDoc, UserCtx),
      case rep_state(RepId) of
          nil ->
              MaxRetries = State#state.max_retries,
              Fresh = #rep_state{
                  doc_id = DocId,
                  user_ctx = UserCtx,
                  doc = RepDoc,
                  starting = true,
                  retries_left = MaxRetries,
                  max_retries = MaxRetries
              },
              true = ets:insert(?REP_TO_STATE, {RepId, Fresh}),
              true = ets:insert(?DOC_TO_REP, {DocId, RepId}),
              ?LOG_INFO("Attempting to start replication `~s` (document `~s`).",
                  [pp_rep_id(RepId), DocId]),
              Manager = self(),
              Starter = spawn_link(
                  fun() ->
                      start_replication(Manager, RepDoc, RepId, UserCtx, 0)
                  end),
              State#state{rep_start_pids = [Starter | State#state.rep_start_pids]};
          #rep_state{doc_id = DocId} ->
              % DocId is already bound: same document, already handled.
              State;
          #rep_state{starting = false, doc_id = OwnerDocId} ->
              ?LOG_INFO("The replication specified by the document `~s` was already"
                  " triggered by the document `~s`", [DocId, OwnerDocId]),
              maybe_tag_rep_doc(DocId, RepDoc, ?l2b(BaseId)),
              State;
          #rep_state{starting = true, doc_id = OwnerDocId} ->
              ?LOG_INFO("The replication specified by the document `~s` is already"
                  " being triggered by the document `~s`", [DocId, OwnerDocId]),
              maybe_tag_rep_doc(DocId, RepDoc, ?l2b(BaseId)),
              State
      end.
  % Computes the replication id for RepDoc. Every failure is rethrown as
  % {bad_rep_doc, Reason}: thrown {error, Reason} tuples keep their reason,
  % anything else is converted to a binary of {Class, Reason}.
  make_rep_id(RepDoc, UserCtx) ->
      try couch_rep:make_replication_id(RepDoc, UserCtx) of
          RepId ->
              RepId
      catch
          throw:{error, Reason} ->
              throw({bad_rep_doc, Reason});
          Class:Reason ->
              throw({bad_rep_doc, to_binary({Class, Reason})})
      end.
  % Writes RepId into the document's <<"_replication_id">> field unless the
  % document already carries exactly that value.
  maybe_tag_rep_doc(DocId, {RepProps}, RepId) ->
      CurrentTag = get_value(<<"_replication_id">>, RepProps),
      if
          CurrentTag =:= RepId ->
              ok;
          true ->
              update_rep_doc(DocId, [{<<"_replication_id">>, RepId}])
      end.
  % Body of a replication starter process: sleeps Wait seconds, then tries to
  % start the replication. On success the server is notified and the result
  % of the replication is awaited; anything other than a pid is reported
  % through replication_error/2.
  start_replication(Server, RepDoc, RepId, UserCtx, Wait) ->
      WaitMs = Wait * 1000,
      ok = timer:sleep(WaitMs),
      Started = (catch couch_rep:start_replication(RepDoc, RepId, UserCtx, ?MODULE)),
      case is_pid(Started) of
          true ->
              ok = gen_server:call(Server, {rep_started, RepId}, infinity),
              couch_rep:get_result(Started, RepId, RepDoc, UserCtx);
          false ->
              replication_error(RepId, Started)
      end.
  % Handles a document seen in the "completed" state. When the document is
  % mapped to a replication id, the mapping is removed; if that replication
  % has no entry in the state table, couch_rep:end_replication/1 is called
  % for it first.
  replication_complete(DocId) ->
      Mapping = ets:lookup(?DOC_TO_REP, DocId),
      case Mapping of
          [{DocId, RepId}] ->
              case rep_state(RepId) of
                  #rep_state{} ->
                      ok;
                  nil ->
                      couch_rep:end_replication(RepId)
              end,
              true = ets:delete(?DOC_TO_REP, DocId);
          _ ->
              ok
      end.
  % Handles the deletion of a replication document: if the document is
  % mapped to a replication, that replication is ended and both table entries
  % are removed.
  rep_doc_deleted(DocId) ->
      case ets:lookup(?DOC_TO_REP, DocId) of
          [] ->
              ok;
          [{DocId, RepId}] ->
              couch_rep:end_replication(RepId),
              true = ets:delete(?REP_TO_STATE, RepId),
              true = ets:delete(?DOC_TO_REP, DocId),
              ?LOG_INFO("Stopped replication `~s` because replication document `~s`"
                  " was deleted", [pp_rep_id(RepId), DocId])
      end.
  % Server-side handling of a failed replication: untracked replications
  % leave the state untouched, tracked ones go through
  % maybe_retry_replication/4. Returns the resulting server state.
  replication_error(State, RepId, Error) ->
      Tracked = rep_state(RepId),
      case Tracked =:= nil of
          true ->
              State;
          false ->
              maybe_retry_replication(RepId, Tracked, Error, State)
      end.
  % Decides what happens after a replication error.
  %
  % No retries left: the replication is ended, its table entries are removed
  % and the failure is logged. Otherwise the retry bookkeeping is advanced
  % (see state_after_error/1), stored, and a linked starter process is
  % spawned that waits the new back-off time before trying again.
  %
  % Returns the (possibly updated) server state.
  maybe_retry_replication(RepId, #rep_state{retries_left = 0} = RepState, Error, State) ->
      DocId = RepState#rep_state.doc_id,
      MaxRetries = RepState#rep_state.max_retries,
      couch_rep:end_replication(RepId),
      true = ets:delete(?REP_TO_STATE, RepId),
      true = ets:delete(?DOC_TO_REP, DocId),
      ?LOG_ERROR("Error in replication `~s` (triggered by document `~s`): ~s"
          "~nReached maximum retry attempts (~p).",
          [pp_rep_id(RepId), DocId, to_binary(error_reason(Error)), MaxRetries]),
      State;
  maybe_retry_replication(RepId, RepState, Error, State) ->
      DocId = RepState#rep_state.doc_id,
      UserCtx = RepState#rep_state.user_ctx,
      RepDoc = RepState#rep_state.doc,
      Backoff = state_after_error(RepState),
      Wait = Backoff#rep_state.wait,
      true = ets:insert(?REP_TO_STATE, {RepId, Backoff}),
      ?LOG_ERROR("Error in replication `~s` (triggered by document `~s`): ~s"
          "~nRestarting replication in ~p seconds.",
          [pp_rep_id(RepId), DocId, to_binary(error_reason(Error)), Wait]),
      Manager = self(),
      Starter = spawn_link(
          fun() ->
              start_replication(Manager, RepDoc, RepId, UserCtx, Wait)
          end),
      State#state{rep_start_pids = [Starter | State#state.rep_start_pids]}.
  % Ends every replication listed in the ?DOC_TO_REP table and then empties
  % both tables.
  stop_all_replications() ->
      ?LOG_INFO("Stopping all ongoing replications because the replicator"
          " database was deleted or changed", []),
      EndOne =
          fun({_DocId, RepId}, _Acc) ->
              couch_rep:end_replication(RepId)
          end,
      ets:foldl(EndOne, ok, ?DOC_TO_REP),
      true = ets:delete_all_objects(?REP_TO_STATE),
      true = ets:delete_all_objects(?DOC_TO_REP).
  % Applies the key/value pairs KVs to the latest revision of the replication
  % document RepDocId (see update_rep_doc/3). Nothing happens when the
  % document cannot be opened.
  %
  % A thrown `conflict` is logged and the whole update is retried after a
  % short pause. The database handle is always closed before the retry, so a
  % retry never holds on to the previous handle and the retry itself is a
  % tail call.
  %
  % Returns ok when there was nothing to do, otherwise the result of
  % update_rep_doc/3.
  update_rep_doc(RepDocId, KVs) ->
      {ok, RepDb} = ensure_rep_db_exists(),
      Outcome =
          try
              case couch_db:open_doc(RepDb, RepDocId, []) of
                  {ok, LatestRepDoc} ->
                      {done, update_rep_doc(RepDb, LatestRepDoc, KVs)};
                  _ ->
                      {done, ok}
              end
          catch
              throw:conflict ->
                  retry
          after
              couch_db:close(RepDb)
          end,
      case Outcome of
          {done, Result} ->
              Result;
          retry ->
              % Shouldn't happen, as by default only the role _replicator can
              % update replication documents.
              ?LOG_ERROR("Conflict error when updating replication document `~s`."
                  " Retrying.", [RepDocId]),
              ok = timer:sleep(5),
              update_rep_doc(RepDocId, KVs)
      end.
  % Merges KVs into the body of RepDoc and saves the document when the body
  % actually changed. Setting <<"_replication_state">> to a new value also
  % refreshes <<"_replication_state_time">>; setting it to the value it
  % already has changes nothing.
  update_rep_doc(RepDb, #doc{body = {RepDocBody}} = RepDoc, KVs) ->
      ApplyKV =
          fun({<<"_replication_state">> = Key, NewState} = Pair, Acc) ->
                  case get_value(Key, Acc) of
                      NewState ->
                          Acc;
                      _ ->
                          TimeKey = <<"_replication_state_time">>,
                          WithState = lists:keystore(Key, 1, Acc, Pair),
                          lists:keystore(
                              TimeKey, 1, WithState, {TimeKey, timestamp()})
                  end;
             ({Key, _Value} = Pair, Acc) ->
                  lists:keystore(Key, 1, Acc, Pair)
          end,
      case lists:foldl(ApplyKV, RepDocBody, KVs) of
          RepDocBody ->
              % RepDocBody is already bound: the body did not change.
              ok;
          NewRepDocBody ->
              % Might not succeed - when the replication doc is deleted right
              % before this update (not an error, ignore).
              couch_db:update_doc(RepDb, RepDoc#doc{body = {NewRepDocBody}}, [])
      end.
  % RFC3339 timestamps.
  % Note: doesn't include the time seconds fraction (RFC3339 says it's optional).
  %
  % Returns the current local time as a binary of the form
  % YYYY-MM-DDThh:mm:ss+hh:mm (or -hh:mm). The clock is read once and both the
  % local date/time and the UTC offset are derived from that single reading,
  % so the two parts always describe the same instant.
  timestamp() ->
      UTime = erlang:universaltime(),
      LocalTime = calendar:universal_time_to_local_time(UTime),
      {{Year, Month, Day}, {Hour, Min, Sec}} = LocalTime,
      DiffSecs = calendar:datetime_to_gregorian_seconds(LocalTime) -
          calendar:datetime_to_gregorian_seconds(UTime),
      Zone = zone(DiffSecs div 3600, (DiffSecs rem 3600) div 60),
      iolist_to_binary(
          io_lib:format("~4..0w-~2..0w-~2..0wT~2..0w:~2..0w:~2..0w~s",
              [Year, Month, Day, Hour, Min, Sec, Zone])).
  % Formats a UTC offset given as hours and minutes into "+hh:mm" when both
  % parts are non-negative and "-hh:mm" (absolute values) otherwise.
  % Returns the result of io_lib:format/2.
  zone(Hr, Min) ->
      {Format, Args} =
          case Hr >= 0 andalso Min >= 0 of
              true ->
                  {"+~2..0w:~2..0w", [Hr, Min]};
              false ->
                  {"-~2..0w:~2..0w", [abs(Hr), abs(Min)]}
          end,
      io_lib:format(Format, Args).
  % Opens the configured replicator database (default name "_replicator"),
  % creating it when it cannot be opened, and makes sure its design document
  % exists. Returns {ok, Db}; the caller is responsible for closing Db.
  ensure_rep_db_exists() ->
      DbName = ?l2b(couch_config:get("replicator", "db", "_replicator")),
      AdminCtx = #user_ctx{roles=[<<"_admin">>, <<"_replicator">>]},
      Opts = [{user_ctx, AdminCtx}, sys_db],
      {ok, Db} =
          case couch_db:open(DbName, Opts) of
              {ok, _} = Opened ->
                  Opened;
              _NotOpened ->
                  couch_db:create(DbName, Opts)
          end,
      ok = ensure_rep_ddoc_exists(Db, <<"_design/_replicator">>),
      {ok, Db}.
  % Creates the design document DDocID (a JavaScript validate_doc_update
  % function) in RepDb unless a document with that id can already be opened.
  % Returns ok.
  ensure_rep_ddoc_exists(RepDb, DDocID) ->
      case couch_db:open_doc(RepDb, DDocID, []) of
          {ok, _Existing} ->
              ok;
          _Missing ->
              Fields = [
                  {<<"_id">>, DDocID},
                  {<<"language">>, <<"javascript">>},
                  {<<"validate_doc_update">>, ?REP_DB_DOC_VALIDATE_FUN}
              ],
              DDoc = couch_doc:from_json_obj({Fields}),
              {ok, _Rev} = couch_db:update_doc(RepDb, DDoc, []),
              ok
      end.
  % pretty-print replication id: the base id followed by its extension
  pp_rep_id({BaseId, Ext}) ->
      lists:append(BaseId, Ext).
  % Looks up the #rep_state{} stored for RepId in the ?REP_TO_STATE table.
  % Returns the stored state, or nil when there is no entry.
  rep_state(RepId) ->
      case ets:lookup(?REP_TO_STATE, RepId) of
          [] ->
              nil;
          [{RepId, Found}] ->
              Found
      end.
  % Unwraps {error, Reason} tuples; any other term is returned unchanged.
  error_reason(Error) ->
      case Error of
          {error, Reason} ->
              Reason;
          Other ->
              Other
      end.
  % Converts the configured retry count (a string) into the atom infinity or
  % an integer.
  retries_value(Value) ->
      case Value of
          "infinity" ->
              infinity;
          _ ->
              list_to_integer(Value)
      end.
  % Advances the retry bookkeeping after a failure: the wait time is doubled
  % (truncated to an integer and capped at ?MAX_WAIT seconds) and, unless the
  % number of retries is unlimited, one retry is consumed.
  state_after_error(#rep_state{retries_left = Left, wait = Wait} = RepState) ->
      NextWait = erlang:min(trunc(Wait * 2), ?MAX_WAIT),
      Backoff = RepState#rep_state{wait = NextWait},
      case Left of
          infinity ->
              Backoff;
          _ ->
              Backoff#rep_state{retries_left = Left - 1}
      end.