/src/ns_rebalancer.erl

Source: http://github.com/membase/ns_server · Erlang · 1510 lines (1195 code · 210 blank · 105 comment)

Note: large files are truncated; the listing below is not the complete file.

%% @author Couchbase <info@couchbase.com>
%% @copyright 2010-2018 Couchbase, Inc.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%      http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% Monitor and maintain the vbucket layout of each bucket.
%% There is one of these per bucket.
%%
%% @doc Rebalancing functions.
%%
-module(ns_rebalancer).

-include("cut.hrl").
-include("ns_common.hrl").
-include("ns_stats.hrl").

-include_lib("eunit/include/eunit.hrl").

-export([orchestrate_failover/1,
         check_graceful_failover_possible/2,
         validate_autofailover/1,
         generate_initial_map/1,
         start_link_rebalance/5,
         move_vbuckets/2,
         unbalanced/2,
         map_options_changed/1,
         eject_nodes/1,
         maybe_cleanup_old_buckets/1,
         get_delta_recovery_nodes/2,
         verify_replication/3,
         start_link_graceful_failover/1,
         generate_vbucket_map_options/2,
         run_failover/2,
         rebalance_topology_aware_services/4]).

-export([wait_local_buckets_shutdown_complete/0]). % used via rpc:multicall

-define(DATA_LOST, 1).
-define(BAD_REPLICATORS, 2).

-define(DEFAULT_BUCKETS_SHUTDOWN_WAIT_TIMEOUT, 20000).

-define(REBALANCER_READINESS_WAIT_TIMEOUT,
        ns_config:get_timeout({ns_rebalancer, readiness}, 60000)).
-define(REBALANCER_QUERY_STATES_TIMEOUT,
        ns_config:get_timeout({ns_rebalancer, query_states}, 10000)).
-define(REBALANCER_APPLY_CONFIG_TIMEOUT,
        ns_config:get_timeout({ns_rebalancer, apply_config}, 300000)).
-define(FAILOVER_CONFIG_SYNC_TIMEOUT,
        ns_config:get_timeout({ns_rebalancer, failover_config_sync}, 2000)).

%%
%% API
%%
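
%% Fail over Nodes if check_failover_possible/1 allows it. The failover is
%% run as a leader activity requiring majority quorum, with AllowUnsafe
%% passed through as the activity's 'unsafe' option; loss of quorum is
%% mapped to 'orchestration_unsafe'.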
run_failover(Nodes, AllowUnsafe) ->
    ok = check_no_tap_buckets(),
    case check_failover_possible(Nodes) of
        ok ->
            Result = leader_activities:run_activity(
                       failover, majority,
                       ?cut(orchestrate_failover(Nodes)),
                       [{unsafe, AllowUnsafe}]),

            case Result of
                {leader_activities_error, _, quorum_lost} ->
                    orchestration_unsafe;
                {leader_activities_error, _, {no_quorum, _}} ->
                    orchestration_unsafe;
                _ ->
                    Result
            end;
        Error ->
            Error
    end.

orchestrate_failover(Nodes) ->
    ale:info(?USER_LOGGER, "Starting failing over ~p", [Nodes]),
    master_activity_events:note_failover(Nodes),

    ErrorNodes = failover(Nodes),

    case ErrorNodes of
        [] ->
            ns_cluster:counter_inc(failover_complete),
            ale:info(?USER_LOGGER, "Failed over ~p: ok", [Nodes]);
        _ ->
            ns_cluster:counter_inc(failover_incomplete),
            ale:error(?USER_LOGGER,
                      "Failover couldn't "
                      "complete on some nodes:~n~p", [ErrorNodes])
    end,

    ns_cluster:counter_inc(failover),
    deactivate_nodes(Nodes),

    ok.

deactivate_nodes([]) ->
    ok;
deactivate_nodes(Nodes) ->
    ale:info(?USER_LOGGER, "Deactivating failed over nodes ~p", [Nodes]),
    ns_cluster_membership:deactivate(Nodes),

    OtherNodes = ns_node_disco:nodes_wanted() -- Nodes,
    LiveNodes = leader_utils:live_nodes(OtherNodes),

    ns_config_rep:ensure_config_seen_by_nodes(LiveNodes,
                                              ?FAILOVER_CONFIG_SYNC_TIMEOUT).

%% @doc Fail one or more nodes. Doesn't eject the node from the cluster. Takes
%% effect immediately.
failover(Nodes) ->
    lists:umerge([failover_buckets(Nodes),
                  failover_services(Nodes)]).
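
%% Fail over every bucket on Nodes: record the vbuckets lost by each node
%% under {node, N, failover_vbuckets} in ns_config and return the list of
%% nodes for which any bucket failover reported a non-ok status.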
failover_buckets(Nodes) ->
    Results = lists:flatmap(fun ({Bucket, BucketConfig}) ->
                                    failover_bucket(Bucket, BucketConfig, Nodes)
                            end, ns_bucket:get_buckets()),

    failover_buckets_handle_failover_vbuckets(Results),
    failover_handle_results(Results).

failover_buckets_handle_failover_vbuckets(Results) ->
    FailoverVBuckets =
        misc:groupby_map(fun (Result) ->
                                 Node = proplists:get_value(node, Result),
                                 Bucket = proplists:get_value(bucket, Result),
                                 VBs = proplists:get_value(vbuckets, Result),

                                 {Node, {Bucket, VBs}}
                         end, Results),

    KVs = [{{node, N, failover_vbuckets}, VBs} || {N, VBs} <- FailoverVBuckets],
    ns_config:set(KVs).

failover_handle_results(Results) ->
    NodeStatuses =
        misc:groupby_map(fun (Result) ->
                                 Node = proplists:get_value(node, Result),
                                 Status = proplists:get_value(status, Result),

                                 {Node, Status}
                         end, Results),

    lists:filtermap(fun ({Node, Statuses}) ->
                            NonOKs = [S || S <- Statuses, S =/= ok],

                            case NonOKs of
                                [] ->
                                    false;
                                _ ->
                                    {true, Node}
                            end
                    end, NodeStatuses).

failover_bucket(Bucket, BucketConfig, Nodes) ->
    master_activity_events:note_bucket_failover_started(Bucket, Nodes),

    Type = ns_bucket:bucket_type(BucketConfig),
    Result = do_failover_bucket(Type, Bucket, BucketConfig, Nodes),

    master_activity_events:note_bucket_failover_ended(Bucket, Nodes),

    Result.

do_failover_bucket(memcached, Bucket, BucketConfig, Nodes) ->
    failover_memcached_bucket(Nodes, Bucket, BucketConfig),
    [];
do_failover_bucket(membase, Bucket, BucketConfig, Nodes) ->
    Map = proplists:get_value(map, BucketConfig, []),
    R = failover_membase_bucket(Nodes, Bucket, BucketConfig, Map),

    [[{bucket, Bucket},
      {node, N},
      {status, R},
      {vbuckets, node_vbuckets(Map, N)}] || N <- Nodes].

failover_services(Nodes) ->
    failover_services(cluster_compat_mode:is_cluster_41(), Nodes).

failover_services(false, _Nodes) ->
    [];
failover_services(true, Nodes) ->
    Config = ns_config:get(),
    Services0 = lists:flatmap(
                  ns_cluster_membership:node_services(Config, _), Nodes),
    Services = lists:usort(Services0) -- [kv],

    Results = lists:flatmap(failover_service(Config, _, Nodes), Services),
    failover_handle_results(Results).

failover_service(Config, Service, Nodes) ->
    ns_cluster_membership:failover_service_nodes(Config, Service, Nodes),

    %% We're refetching the config since failover_service_nodes updated the
    %% one that we had.
    Result = service_janitor:complete_service_failover(ns_config:get(),
                                                       Service,
                                                       Nodes),
    case Result of
        ok ->
            ?log_debug("Failed over service ~p on nodes ~p successfully",
                       [Service, Nodes]);
        _ ->
            ?log_error("Failed to failover service ~p on nodes ~p: ~p",
                       [Service, Nodes, Result])
    end,

    [[{node, Node},
      {status, Result},
      {service, Service}] || Node <- Nodes].

get_failover_vbuckets(Config, Node) ->
    ns_config:search(Config, {node, Node, failover_vbuckets}, []).
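
%% Decide whether auto-failover of Nodes is safe for every bucket: a membase
%% bucket is considered unsafe if promoting replicas for Nodes would leave
%% some vbucket without an active copy. Returns ok or {error, UnsafeBuckets}.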
validate_autofailover(Nodes) ->
    BucketPairs = ns_bucket:get_buckets(),
    UnsafeBuckets =
        [BucketName
         || {BucketName, BucketConfig} <- BucketPairs,
            validate_autofailover_bucket(BucketConfig, Nodes) =:= false],
    case UnsafeBuckets of
        [] -> ok;
        _ -> {error, UnsafeBuckets}
    end.

validate_autofailover_bucket(BucketConfig, Nodes) ->
    case proplists:get_value(type, BucketConfig) of
        membase ->
            Map = proplists:get_value(map, BucketConfig),
            Map1 = mb_map:promote_replicas(Map, Nodes),
            case Map1 of
                undefined ->
                    true;
                _ ->
                    case [I || {I, [undefined|_]} <- misc:enumerate(Map1, 0)] of
                        [] -> true;
                        _MissingVBuckets ->
                            false
                    end
            end;
        _ ->
            true
    end.

failover_memcached_bucket(Nodes, Bucket, BucketConfig) ->
    remove_nodes_from_server_list(Nodes, Bucket, BucketConfig).

failover_membase_bucket(Nodes, Bucket, BucketConfig, Map) when Map =:= [] ->
    %% this is possible if the bucket just got created and ns_janitor didn't
    %% get a chance to create a map yet; or, alternatively, if it failed to
    %% do so because, for example, one of the nodes was down
    failover_membase_bucket_with_no_map(Nodes, Bucket, BucketConfig);
failover_membase_bucket(Nodes, Bucket, BucketConfig, Map) ->
    failover_membase_bucket_with_map(Nodes, Bucket, BucketConfig, Map).

failover_membase_bucket_with_no_map(Nodes, Bucket, BucketConfig) ->
    ?log_debug("Skipping failover of bucket ~p because it has no vbuckets. "
               "Config:~n~p", [Bucket, BucketConfig]),

    %% we still need to make sure to remove ourselves from the bucket server
    %% list
    remove_nodes_from_server_list(Nodes, Bucket, BucketConfig),
    ok.

failover_membase_bucket_with_map(Nodes, Bucket, BucketConfig, Map) ->
    %% Promote replicas of vbuckets on this node
    NewMap = mb_map:promote_replicas(Map, Nodes),
    true = (NewMap =/= undefined),
    case [I || {I, [undefined|_]} <- misc:enumerate(NewMap, 0)] of
        [] -> ok; % Phew!
        MissingVBuckets ->
            ?rebalance_error("Lost data in ~p for ~w", [Bucket, MissingVBuckets]),
            ?user_log(?DATA_LOST,
                      "Data has been lost for ~B% of vbuckets in bucket ~p.",
                      [length(MissingVBuckets) * 100 div length(Map), Bucket])
    end,

    ns_bucket:set_fast_forward_map(Bucket, undefined),
    ns_bucket:set_map(Bucket, NewMap),
    remove_nodes_from_server_list(Nodes, Bucket, BucketConfig),
    try ns_janitor:cleanup(Bucket, []) of
        ok ->
            ok;
        {error, _, BadNodes} ->
            ?rebalance_error("Skipped vbucket activations and "
                             "replication topology changes because not "
                             "all remaining nodes were found to have "
                             "healthy bucket ~p: ~p", [Bucket, BadNodes]),
            janitor_failed
    catch
        E:R ->
            ?rebalance_error("Janitor cleanup of ~p failed after failover of ~p: ~p",
                             [Bucket, Nodes, {E, R}]),
            janitor_failed
    end.

remove_nodes_from_server_list(Nodes, Bucket, BucketConfig) ->
    Servers = proplists:get_value(servers, BucketConfig),
    ns_bucket:set_servers(Bucket, Servers -- Nodes).
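
%% Compute the map generation options for a bucket given the nodes being
%% kept. When more than one non-empty server group exists, nodes are tagged
%% with their server-group UUIDs so the generated map can respect group
%% placement; a race with concurrent server_groups changes raises
%% server_groups_race_detected.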
generate_vbucket_map_options(KeepNodes, BucketConfig) ->
    Config = ns_config:get(),
    generate_vbucket_map_options(KeepNodes, BucketConfig, Config).

generate_vbucket_map_options(KeepNodes, BucketConfig, Config) ->
    Tags = case ns_config:search(Config, server_groups) of
               false ->
                   undefined;
               {value, ServerGroups} ->
                   case [G || G <- ServerGroups,
                              proplists:get_value(nodes, G) =/= []] of
                       [_] ->
                           %% note that we don't need to handle this case
                           %% specially; but unfortunately removing it would
                           %% make 2.5 nodes always believe that rebalance is
                           %% required in case there's only one server group
                           undefined;
                       _ ->
                           Tags0 = [case proplists:get_value(uuid, G) of
                                        T ->
                                            [{N, T} || N <- proplists:get_value(nodes, G),
                                                       lists:member(N, KeepNodes)]
                                    end || G <- ServerGroups],

                           TagsRV = lists:append(Tags0),

                           case KeepNodes -- [N || {N, _T} <- TagsRV] of
                               [] -> ok;
                               _ ->
                                   %% there's a tiny race between the start of
                                   %% rebalance and somebody changing
                                   %% server_groups. We largely ignore it, but
                                   %% in cases where it can clearly cause a
                                   %% problem we raise an exception
                                   erlang:error(server_groups_race_detected)
                           end,
                           TagsRV
                   end
           end,

    Opts0 = ns_bucket:config_to_map_options(BucketConfig),

    %% Note that we don't need to have replication_topology here (in fact as
    %% of today it's still returned by ns_bucket:config_to_map_options/1), but
    %% these options are used to compute map_opts_hash which in turn is used
    %% to decide if rebalance is needed. So if we remove this, old nodes will
    %% wrongly believe that rebalance is needed even when the cluster is
    %% balanced. See MB-15543 for details.
    misc:update_proplist(Opts0, [{replication_topology, star},
                                 {tags, Tags}]).
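
%% Generate the target vbucket map for KeepNodes. If the bucket carries a
%% deltaRecoveryMap that is trivially compatible with the current map and
%% options, reuse it; otherwise ask mb_map:generate_map/3, seeding it with
%% the history of past vbucket maps.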
generate_vbucket_map(CurrentMap, KeepNodes, BucketConfig) ->
    Opts = generate_vbucket_map_options(KeepNodes, BucketConfig),

    Map0 =
        case lists:keyfind(deltaRecoveryMap, 1, BucketConfig) of
            {deltaRecoveryMap, DRMapAndOpts} when DRMapAndOpts =/= undefined ->
                {DRMap, DROpts} = DRMapAndOpts,

                case mb_map:is_trivially_compatible_past_map(KeepNodes, CurrentMap,
                                                             Opts, DRMap, DROpts) of
                    true ->
                        DRMap;
                    false ->
                        undefined
                end;
            _ ->
                undefined
        end,

    Map = case Map0 of
              undefined ->
                  EffectiveOpts = [{maps_history, ns_bucket:past_vbucket_maps()} | Opts],
                  mb_map:generate_map(CurrentMap, KeepNodes, EffectiveOpts);
              _ ->
                  Map0
          end,

    {Map, Opts}.

generate_initial_map(BucketConfig) ->
    Chain = lists:duplicate(proplists:get_value(num_replicas, BucketConfig) + 1,
                            undefined),
    Map1 = lists:duplicate(proplists:get_value(num_vbuckets, BucketConfig),
                           Chain),
    Servers = proplists:get_value(servers, BucketConfig),
    generate_vbucket_map(Map1, Servers, BucketConfig).
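
%% Wait until every bucket instance that is still running locally but no
%% longer belongs on this node has shut down. Driven by buckets_events
%% notifications; a single timeout message flips CanWait to false, after
%% which any remaining bucket causes the process to exit.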
local_buckets_shutdown_loop(Ref, CanWait) ->
    ExcessiveBuckets = ns_memcached:active_buckets() -- ns_bucket:node_bucket_names(node()),
    case ExcessiveBuckets of
        [] ->
            ok;
        _ ->
            case CanWait of
                false ->
                    exit({old_buckets_shutdown_wait_failed, ExcessiveBuckets});
                true ->
                    ?log_debug("Waiting until the following old bucket instances are gone: ~p", [ExcessiveBuckets]),
                    receive
                        {Ref, timeout} ->
                            local_buckets_shutdown_loop(Ref, false);
                        {Ref, _Msg} ->
                            local_buckets_shutdown_loop(Ref, true)
                    end
            end
    end.

%% note: this is rpc:multicall-ed
wait_local_buckets_shutdown_complete() ->
    ExcessiveBuckets =
        ns_memcached:active_buckets() -- ns_bucket:node_bucket_names(node()),
    do_wait_local_buckets_shutdown_complete(ExcessiveBuckets).

do_wait_local_buckets_shutdown_complete([]) ->
    ok;
do_wait_local_buckets_shutdown_complete(ExcessiveBuckets) ->
    Timeout = ns_config:get_timeout(buckets_shutdown, ?DEFAULT_BUCKETS_SHUTDOWN_WAIT_TIMEOUT)
        * length(ExcessiveBuckets),
    misc:executing_on_new_process(
      fun () ->
              Ref = erlang:make_ref(),
              Parent = self(),
              Subscription = ns_pubsub:subscribe_link(buckets_events,
                                                      fun ({stopped, _, _, _} = StoppedMsg) ->
                                                              Parent ! {Ref, StoppedMsg};
                                                          (_) ->
                                                              ok
                                                      end),
              erlang:send_after(Timeout, Parent, {Ref, timeout}),
              try
                  local_buckets_shutdown_loop(Ref, true)
              after
                  (catch ns_pubsub:unsubscribe(Subscription))
              end
      end).

do_wait_buckets_shutdown(KeepNodes) ->
    {Good, ReallyBad, FailedNodes} =
        misc:rpc_multicall_with_plist_result(
          KeepNodes, ns_rebalancer, wait_local_buckets_shutdown_complete, []),
    NonOk = [Pair || {_Node, Result} = Pair <- Good,
                     Result =/= ok],
    Failures = ReallyBad ++ NonOk ++ [{N, node_was_down} || N <- FailedNodes],

    case Failures of
        [] ->
            ok;
        _ ->
            ?rebalance_error("Failed to wait deletion of some buckets on some nodes: ~p~n", [Failures]),
            exit({buckets_shutdown_wait_failed, Failures})
    end.

sanitize(Config) ->
    misc:rewrite_key_value_tuple(sasl_password, "*****", Config).

pull_and_push_config(Nodes) ->
    case ns_config_rep:pull_remotes(Nodes) of
        ok ->
            ok;
        Error ->
            exit({config_sync_failed, Error})
    end,

    %% And after we have that, make sure recovery, rebalance and
    %% graceful failover, all start with latest config reliably
    case ns_config_rep:ensure_config_seen_by_nodes(Nodes) of
        ok ->
            cool;
        {error, SyncFailedNodes} ->
            exit({config_sync_failed, SyncFailedNodes})
    end.

start_link_rebalance(KeepNodes, EjectNodes,
                     FailedNodes, DeltaNodes, DeltaRecoveryBucketNames) ->
    proc_lib:start_link(
      erlang, apply,
      [fun () ->
               ok = check_no_tap_buckets(),

               KVKeep = ns_cluster_membership:service_nodes(KeepNodes, kv),
               case KVKeep =:= [] of
                   true ->
                       proc_lib:init_ack({error, no_kv_nodes_left}),
                       exit(normal);
                   false ->
                       ok
               end,

               KVDeltaNodes = ns_cluster_membership:service_nodes(DeltaNodes,
                                                                  kv),
               BucketConfigs = ns_bucket:get_buckets(),
               case build_delta_recovery_buckets(KVKeep, KVDeltaNodes,
                                                 BucketConfigs, DeltaRecoveryBucketNames) of
                   {ok, DeltaRecoveryBucketTuples} ->
                       proc_lib:init_ack({ok, self()}),

                       master_activity_events:note_rebalance_start(
                         self(), KeepNodes, EjectNodes, FailedNodes, DeltaNodes),

                       rebalance(KeepNodes, EjectNodes, FailedNodes,
                                 BucketConfigs,
                                 DeltaNodes, DeltaRecoveryBucketTuples);
                   {error, not_possible} ->
                       proc_lib:init_ack({error, delta_recovery_not_possible})
               end
       end, []]).
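
%% Apply an explicit list of {VBucket, TargetChain} moves to a bucket's
%% current map and run the vbucket mover towards the resulting map.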
move_vbuckets(Bucket, Moves) ->
    {ok, Config} = ns_bucket:get_bucket(Bucket),
    Map = proplists:get_value(map, Config),
    TMap = lists:foldl(fun ({VBucket, TargetChain}, Map0) ->
                               setelement(VBucket+1, Map0, TargetChain)
                       end, list_to_tuple(Map), Moves),
    NewMap = tuple_to_list(TMap),
    ProgressFun = make_progress_fun(0, 1),
    run_mover(Bucket, Config,
              proplists:get_value(servers, Config),
              ProgressFun, Map, NewMap).
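
%% Rebalance all non-kv services. Services without topology awareness only
%% get their service map updated; topology-aware services additionally go
%% through service_rebalancer. Per-service rebalance timestamps are collected
%% and used to possibly delay node ejection.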
rebalance_services(KeepNodes, EjectNodes) ->
    Config = ns_config:get(),

    AllServices = ns_cluster_membership:cluster_supported_services() -- [kv],
    TopologyAwareServices = ns_cluster_membership:topology_aware_services(),
    SimpleServices = AllServices -- TopologyAwareServices,

    SimpleTSs = rebalance_simple_services(Config, SimpleServices, KeepNodes),
    TopologyAwareTSs = rebalance_topology_aware_services(Config, TopologyAwareServices,
                                                         KeepNodes, EjectNodes),

    maybe_delay_eject_nodes(SimpleTSs ++ TopologyAwareTSs, EjectNodes).

rebalance_simple_services(Config, Services, KeepNodes) ->
    case cluster_compat_mode:is_cluster_41(Config) of
        true ->
            lists:filtermap(
              fun (Service) ->
                      ServiceNodes = ns_cluster_membership:service_nodes(KeepNodes, Service),
                      Updated = update_service_map_with_config(Config, Service, ServiceNodes),

                      case Updated of
                          false ->
                              false;
                          true ->
                              {true, {Service, os:timestamp()}}
                      end
              end, Services);
        false ->
            []
    end.

update_service_map_with_config(Config, Service, ServiceNodes0) ->
    CurrentNodes0 = ns_cluster_membership:get_service_map(Config, Service),
    update_service_map(Service, CurrentNodes0, ServiceNodes0).

update_service_map(Service, CurrentNodes0, ServiceNodes0) ->
    CurrentNodes = lists:sort(CurrentNodes0),
    ServiceNodes = lists:sort(ServiceNodes0),

    case CurrentNodes =:= ServiceNodes of
        true ->
            false;
        false ->
            ?rebalance_info("Updating service map for ~p:~n~p",
                            [Service, ServiceNodes]),
            ok = ns_cluster_membership:set_service_map(Service, ServiceNodes),
            true
    end.

rebalance_topology_aware_services(Config, Services, KeepNodesAll, EjectNodesAll) ->
    %% TODO: support this one day
    DeltaNodesAll = [],

    lists:filtermap(
      fun (Service) ->
              KeepNodes = ns_cluster_membership:service_nodes(Config, KeepNodesAll, Service),
              DeltaNodes = ns_cluster_membership:service_nodes(Config, DeltaNodesAll, Service),

              %% if a node being ejected is not active, then it means that it
              %% was never rebalanced in in the first place; so we can
              %% postpone the heat death of the universe a little bit by
              %% ignoring such nodes
              ActiveNodes = ns_cluster_membership:get_service_map(Config, Service),
              EjectNodes = [N || N <- EjectNodesAll,
                                 lists:member(N, ActiveNodes)],

              AllNodes = EjectNodes ++ KeepNodes,

              case AllNodes of
                  [] ->
                      false;
                  _ ->
                      update_service_map_with_config(Config, Service, AllNodes),
                      ok = rebalance_topology_aware_service(Service, KeepNodes,
                                                            EjectNodes, DeltaNodes),
                      update_service_map(Service, AllNodes, KeepNodes),
                      {true, {Service, os:timestamp()}}
              end
      end, Services).

rebalance_topology_aware_service(Service, KeepNodes, EjectNodes, DeltaNodes) ->
    ProgressCallback =
        fun (Progress) ->
                ns_orchestrator:update_progress(Service, Progress)
        end,

    misc:with_trap_exit(
      fun () ->
              {Pid, MRef} = service_rebalancer:spawn_monitor_rebalance(
                              Service, KeepNodes,
                              EjectNodes, DeltaNodes, ProgressCallback),

              receive
                  {'EXIT', _Pid, Reason} ->
                      misc:terminate_and_wait(Pid, Reason),
                      exit(Reason);
                  {'DOWN', MRef, _, _, Reason} ->
                      case Reason of
                          normal ->
                              ok;
                          _ ->
                              exit({service_rebalance_failed, Service, Reason})
                      end
              end
      end).

get_service_eject_delay(Service) ->
    Default =
        case Service of
            n1ql ->
                20000;
            fts ->
                10000;
            _ ->
                0
        end,

    ns_config:get_timeout({eject_delay, Service}, Default).
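
%% Possibly sleep before ejecting nodes so recently rebalanced services get
%% a grace period (per-service eject delay, counted from the service's
%% rebalance completion timestamp).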
maybe_delay_eject_nodes(Timestamps, EjectNodes) ->
    case cluster_compat_mode:is_cluster_41() of
        true ->
            do_maybe_delay_eject_nodes(Timestamps, EjectNodes);
        false ->
            ok
    end.

do_maybe_delay_eject_nodes(_Timestamps, []) ->
    ok;
do_maybe_delay_eject_nodes(Timestamps, EjectNodes) ->
    EjectedServices =
        ordsets:union([ordsets:from_list(ns_cluster_membership:node_services(N))
                       || N <- EjectNodes]),
    Now = os:timestamp(),

    Delays = [begin
                  ServiceDelay = get_service_eject_delay(Service),

                  case proplists:get_value(Service, Timestamps) of
                      undefined ->
                          %% it's possible that a node is ejected without ever
                          %% getting rebalanced in; there's no point in
                          %% delaying anything in such a case
                          0;
                      RebalanceTS ->
                          SinceRebalance = max(0, timer:now_diff(Now, RebalanceTS) div 1000),
                          ServiceDelay - SinceRebalance
                  end
              end || Service <- EjectedServices],

    Delay = lists:max(Delays),

    case Delay > 0 of
        true ->
            ?log_info("Waiting ~pms before ejecting nodes:~n~p",
                      [Delay, EjectNodes]),
            timer:sleep(Delay);
        false ->
            ok
    end.
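
%% Top-level rebalance entry point: the whole rebalance runs as a leader
%% activity that holds a quorum over all kept and ejected nodes.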
rebalance(KeepNodes, EjectNodesAll, FailedNodesAll,
          BucketConfigs,
          DeltaNodes, DeltaRecoveryBuckets) ->
    ok = leader_activities:run_activity(
           rebalance,
           {all, KeepNodes ++ EjectNodesAll},
           ?cut(rebalance_body(KeepNodes, EjectNodesAll,
                               FailedNodesAll, BucketConfigs,
                               DeltaNodes, DeltaRecoveryBuckets))).

rebalance_body(KeepNodes,
               EjectNodesAll,
               FailedNodesAll,
               BucketConfigs, DeltaNodes, DeltaRecoveryBuckets) ->
    KVDeltaNodes = ns_cluster_membership:service_nodes(DeltaNodes, kv),

    ok = drop_old_2i_indexes(KeepNodes),
    ok = apply_delta_recovery_buckets(DeltaRecoveryBuckets,
                                      KVDeltaNodes, BucketConfigs),
    ok = maybe_clear_full_recovery_type(KeepNodes),
    ok = service_janitor:cleanup(),

    ns_cluster_membership:activate(KeepNodes),

    pull_and_push_config(EjectNodesAll ++ KeepNodes),

    %% Eject failed nodes first so they don't cause trouble
    FailedNodes = FailedNodesAll -- [node()],
    eject_nodes(FailedNodes),

    rebalance_kv(KeepNodes, EjectNodesAll, BucketConfigs, DeltaRecoveryBuckets),
    rebalance_services(KeepNodes, EjectNodesAll),

    %% don't eject ourselves at all here; this will be handled by
    %% ns_orchestrator
    EjectNowNodes = EjectNodesAll -- [node()],

    %% Deactivate the nodes first so that they are excluded from quorums.
    ns_cluster_membership:deactivate(EjectNowNodes),

    %% Generally, the nodes to be ejected are supposed to be healthy, so it's
    %% ok to try to sync the config to them too. The intent here is to improve
    %% the safety: we want those nodes to realize that they are not part of
    %% the normal quorum anymore. Obviously, it still doesn't guarantee
    %% complete safety.
    ok = ns_config_rep:ensure_config_seen_by_nodes(KeepNodes ++ EjectNodesAll),
    ok = leader_activities:switch_quorum({all, KeepNodes}),

    eject_nodes(EjectNowNodes),

    ok.

make_progress_fun(BucketCompletion, NumBuckets) ->
    fun (P) ->
            Progress = dict:map(fun (_, N) ->
                                        N / NumBuckets + BucketCompletion
                                end, P),
            update_kv_progress(Progress)
    end.

update_kv_progress(Progress) ->
    ns_orchestrator:update_progress(kv, Progress).

update_kv_progress(Nodes, Progress) ->
    update_kv_progress(dict:from_list([{N, Progress} || N <- Nodes])).

rebalance_kv(KeepNodes, EjectNodes, BucketConfigs, DeltaRecoveryBuckets) ->
    %% wait until all bucket shutdowns are done on the nodes we're
    %% adding (or maybe adding)
    do_wait_buckets_shutdown(KeepNodes),

    NumBuckets = length(BucketConfigs),
    ?rebalance_debug("BucketConfigs = ~p", [sanitize(BucketConfigs)]),

    KeepKVNodes = ns_cluster_membership:service_nodes(KeepNodes, kv),
    LiveKVNodes = ns_cluster_membership:service_nodes(KeepNodes ++ EjectNodes, kv),

    case maybe_cleanup_old_buckets(KeepNodes) of
        ok ->
            ok;
        Error ->
            exit(Error)
    end,

    {ok, RebalanceObserver} = ns_rebalance_observer:start_link(length(BucketConfigs)),

    lists:foreach(fun ({I, {BucketName, BucketConfig}}) ->
                          BucketCompletion = I / NumBuckets,
                          update_kv_progress(LiveKVNodes, BucketCompletion),

                          ProgressFun = make_progress_fun(BucketCompletion, NumBuckets),
                          rebalance_bucket(BucketName, BucketConfig, ProgressFun,
                                           KeepKVNodes, EjectNodes, DeltaRecoveryBuckets)
                  end, misc:enumerate(BucketConfigs, 0)),

    update_kv_progress(LiveKVNodes, 1.0),

    misc:unlink_terminate_and_wait(RebalanceObserver, shutdown).

rebalance_bucket(BucketName, BucketConfig, ProgressFun,
                 KeepKVNodes, EjectNodes, DeltaRecoveryBuckets) ->
    ale:info(?USER_LOGGER, "Started rebalancing bucket ~s", [BucketName]),
    ?rebalance_info("Rebalancing bucket ~p with config ~p",
                    [BucketName, sanitize(BucketConfig)]),
    case proplists:get_value(type, BucketConfig) of
        memcached ->
            rebalance_memcached_bucket(BucketName, KeepKVNodes);
        membase ->
            rebalance_membase_bucket(BucketName, BucketConfig, ProgressFun,
                                     KeepKVNodes, EjectNodes, DeltaRecoveryBuckets)
    end.

rebalance_memcached_bucket(BucketName, KeepKVNodes) ->
    master_activity_events:note_bucket_rebalance_started(BucketName),
    ns_bucket:set_servers(BucketName, KeepKVNodes),
    master_activity_events:note_bucket_rebalance_ended(BucketName).

rebalance_membase_bucket(BucketName, BucketConfig, ProgressFun,
                         KeepKVNodes, EjectNodes, DeltaRecoveryBuckets) ->
    %% Only start one bucket at a time to avoid
    %% overloading things
    ThisEjected = ordsets:intersection(lists:sort(proplists:get_value(servers, BucketConfig, [])),
                                       lists:sort(EjectNodes)),
    ThisLiveNodes = KeepKVNodes ++ ThisEjected,
    ns_bucket:set_servers(BucketName, ThisLiveNodes),
    ?rebalance_info("Waiting for bucket ~p to be ready on ~p", [BucketName, ThisLiveNodes]),
    {ok, _States, Zombies} = janitor_agent:query_states(BucketName, ThisLiveNodes, ?REBALANCER_READINESS_WAIT_TIMEOUT),
    case Zombies of
        [] ->
            ?rebalance_info("Bucket is ready on all nodes"),
            ok;
        _ ->
            exit({not_all_nodes_are_ready_yet, Zombies})
    end,

    run_janitor_pre_rebalance(BucketName),

    {ok, NewConf} =
        ns_bucket:get_bucket(BucketName),

    master_activity_events:note_bucket_rebalance_started(BucketName),
    {NewMap, MapOptions} =
        do_rebalance_membase_bucket(BucketName, NewConf,
                                    KeepKVNodes, ProgressFun, DeltaRecoveryBuckets),
    ns_bucket:set_map_opts(BucketName, MapOptions),
    ns_bucket:update_bucket_props(BucketName,
                                  [{deltaRecoveryMap, undefined}]),
    master_activity_events:note_bucket_rebalance_ended(BucketName),

    run_verify_replication(BucketName, KeepKVNodes, NewMap).

run_janitor_pre_rebalance(BucketName) ->
    case ns_janitor:cleanup(BucketName,
                            [{query_states_timeout, ?REBALANCER_QUERY_STATES_TIMEOUT},
                             {apply_config_timeout, ?REBALANCER_APPLY_CONFIG_TIMEOUT}]) of
        ok ->
            ok;
        {error, _, BadNodes} ->
            exit({pre_rebalance_janitor_run_failed, BadNodes})
    end.

%% @doc Rebalance the cluster. Operates on a single bucket. Will
%% either return ok or exit with reason 'stopped' or whatever reason
%% was given by whatever failed.
do_rebalance_membase_bucket(Bucket, Config,
                            KeepNodes, ProgressFun, DeltaRecoveryBuckets) ->
    Map = proplists:get_value(map, Config),
    {FastForwardMap, MapOptions} =
        case lists:keyfind(Bucket, 1, DeltaRecoveryBuckets) of
            false ->
                generate_vbucket_map(Map, KeepNodes, Config);
            {_, _, V} ->
                V
        end,

    ns_bucket:update_vbucket_map_history(FastForwardMap, MapOptions),
    ?rebalance_debug("Target map options: ~p (hash: ~p)", [MapOptions, erlang:phash2(MapOptions)]),

    {run_mover(Bucket, Config, KeepNodes, ProgressFun, Map, FastForwardMap),
     MapOptions}.

run_mover(Bucket, Config, KeepNodes, ProgressFun, Map, FastForwardMap) ->
    ?rebalance_info("Target map (distance: ~p):~n~p", [(catch mb_map:vbucket_movements(Map, FastForwardMap)), FastForwardMap]),

    ns_bucket:set_fast_forward_map(Bucket, FastForwardMap),
    misc:with_trap_exit(
      fun () ->
              {ok, Pid} = ns_vbucket_mover:start_link(Bucket, Map,
                                                      FastForwardMap,
                                                      ProgressFun),
              wait_for_mover(Pid)
      end),

    HadRebalanceOut = ((proplists:get_value(servers, Config, []) -- KeepNodes) =/= []),
    case HadRebalanceOut of
        true ->
            SecondsToWait = ns_config:read_key_fast(rebalance_out_delay_seconds, 10),
            ?rebalance_info("Waiting ~w seconds before completing rebalance out."
                            " So that clients receive graceful not my vbucket instead of silent closed connection", [SecondsToWait]),
            timer:sleep(SecondsToWait * 1000);
        false ->
            ok
    end,

    ns_bucket:set_fast_forward_map(Bucket, undefined),
    ns_bucket:set_servers(Bucket, KeepNodes),
    FastForwardMap.
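
%% A bucket's map is considered unbalanced if some chain is missing a copy
%% (only counting as many copies as there are servers) or if the number of
%% active or replica vbuckets per server differs by more than one across
%% servers.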
unbalanced(Map, BucketConfig) ->
    Servers = proplists:get_value(servers, BucketConfig, []),
    NumServers = length(Servers),

    R = lists:any(
          fun (Chain) ->
                  lists:member(
                    undefined,
                    %% Don't warn about missing replicas when you have
                    %% fewer servers than your copy count!
                    lists:sublist(Chain, NumServers))
          end, Map),

    R orelse do_unbalanced(Map, Servers).

do_unbalanced(Map, Servers) ->
    {Masters, Replicas} =
        lists:foldl(
          fun ([M | R], {AccM, AccR}) ->
                  {[M | AccM], R ++ AccR}
          end, {[], []}, Map),
    Masters1 = lists:sort([M || M <- Masters, lists:member(M, Servers)]),
    Replicas1 = lists:sort([R || R <- Replicas, lists:member(R, Servers)]),

    MastersCounts = misc:uniqc(Masters1),
    ReplicasCounts = misc:uniqc(Replicas1),

    NumServers = length(Servers),

    lists:any(
      fun (Counts0) ->
              Counts1 = [C || {_, C} <- Counts0],

              Len = length(Counts1),
              Counts = case Len < NumServers of
                           true ->
                               lists:duplicate(NumServers - Len, 0) ++ Counts1;
                           false ->
                               true = Len =:= NumServers,
                               Counts1
                       end,

              Counts =/= [] andalso lists:max(Counts) - lists:min(Counts) > 1
      end, [MastersCounts, ReplicasCounts]).

map_options_changed(BucketConfig) ->
    Config = ns_config:get(),

    Servers = proplists:get_value(servers, BucketConfig, []),

    Opts = generate_vbucket_map_options(Servers, BucketConfig, Config),
    OptsHash = proplists:get_value(map_opts_hash, BucketConfig),
    case OptsHash of
        undefined ->
            true;
        _ ->
            erlang:phash2(Opts) =/= OptsHash
    end.

%%
%% Internal functions
%%

%% @private
%% @doc Eject a list of nodes from the cluster, making sure this node is last.
eject_nodes(Nodes) ->
    %% Leave myself last
    LeaveNodes = case lists:member(node(), Nodes) of
                     true ->
                         (Nodes -- [node()]) ++ [node()];
                     false ->
                         Nodes
                 end,
    lists:foreach(fun (N) ->
                          ns_cluster_membership:deactivate([N]),
                          ns_cluster:leave(N)
                  end, LeaveNodes).

run_verify_replication(Bucket, Nodes, Map) ->
    Pid = proc_lib:spawn_link(?MODULE, verify_replication, [Bucket, Nodes, Map]),
    ?log_debug("Spawned verify_replication worker: ~p", [Pid]),
    {trap_exit, false} = erlang:process_info(self(), trap_exit),
    misc:wait_for_process(Pid, infinity).

verify_replication(Bucket, Nodes, Map) ->
    ExpectedReplicators0 = ns_bucket:map_to_replicas(Map),
    ExpectedReplicators = lists:sort(ExpectedReplicators0),

    {ActualReplicators, BadNodes} = janitor_agent:get_src_dst_vbucket_replications(Bucket, Nodes),
    case BadNodes of
        [] -> ok;
        _ ->
            ale:error(?USER_LOGGER, "Rebalance is done, but failed to verify replications on following nodes:~p", [BadNodes]),
            exit(bad_replicas_due_to_bad_results)
    end,

    case misc:comm(ExpectedReplicators, ActualReplicators) of
        {[], [], _} ->
            ok;
        {Missing, Extra, _} ->
            ?user_log(?BAD_REPLICATORS,
                      "Bad replicators after rebalance:~nMissing = ~p~nExtras = ~p",
                      [Missing, Extra]),
            exit(bad_replicas)
    end.
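
%% Wait for the vbucket mover to finish. A {shutdown, stop} exit from our
%% parent is forwarded to the mover (logging diagnostics if the stop is
%% slow); any other crash of the mover is reported as {mover_crashed, Reason}.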
wait_for_mover(Pid) ->
    receive
        {'EXIT', Pid, Reason} ->
            case Reason of
                normal ->
                    ok;
                {shutdown, stop} = Stop ->
                    exit(Stop);
                _ ->
                    exit({mover_crashed, Reason})
            end;
        {'EXIT', _Pid, {shutdown, stop} = Stop} ->
            ?log_debug("Got rebalance stop request"),
            TimeoutPid = diag_handler:arm_timeout(
                           5000,
                           fun (_) ->
                                   ?log_debug("Observing slow rebalance stop (mover pid: ~p)", [Pid]),
                                   timeout_diag_logger:log_diagnostics(slow_rebalance_stop)
                           end),
            try
                exit(Pid, Stop),
                wait_for_mover(Pid)
            after
                diag_handler:disarm_timeout(TimeoutPid)
            end;
        {'EXIT', _Pid, Reason} ->
            exit(Reason)
    end.

maybe_cleanup_old_buckets(KeepNodes) ->
    case misc:rpc_multicall_with_plist_result(KeepNodes, ns_storage_conf, delete_unused_buckets_db_files, []) of
        {_, _, DownNodes} when DownNodes =/= [] ->
            ?rebalance_error("Failed to cleanup old buckets on some nodes: ~p",
                             [DownNodes]),
            {buckets_cleanup_failed, DownNodes};
        {Good, ReallyBad, []} ->
            ReallyBadNodes =
                case ReallyBad of
                    [] ->
                        [];
                    _ ->
                        ?rebalance_error(
                           "Failed to cleanup old buckets on some nodes: ~n~p",
                           [ReallyBad]),
                        lists:map(fun ({Node, _}) -> Node end, ReallyBad)
                end,

            FailedNodes =
                lists:foldl(
                  fun ({Node, Result}, Acc) ->
                          case Result of
                              ok ->
                                  Acc;
                              Error ->
                                  ?rebalance_error(
                                     "Failed to cleanup old buckets on node ~p: ~p",
                                     [Node, Error]),
                                  [Node | Acc]
                          end
                  end, [], Good),

            case FailedNodes ++ ReallyBadNodes of
                [] ->
                    ok;
                AllFailedNodes ->
                    {buckets_cleanup_failed, AllFailedNodes}
            end
    end.
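
%% Return the (0-based) ids of all vbuckets whose chain mentions Node. For
%% example, node_vbuckets([[a, b], [b, c], [c, a]], a) yields [0, 2].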
node_vbuckets(Map, Node) ->
    [V || {V, Chain} <- misc:enumerate(Map, 0),
          lists:member(Node, Chain)].
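
%% Look through the history of past vbucket maps for one that matches the
%% current map/options and, for every delta node, assigns it exactly the
%% vbuckets recorded in {node, N, failover_vbuckets} at failover time.
%% Returns {TargetMap, Options} or false.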
find_delta_recovery_map(Config, AllNodes, DeltaNodes, Bucket, BucketConfig) ->
    {map, CurrentMap} = lists:keyfind(map, 1, BucketConfig),
    CurrentOptions = generate_vbucket_map_options(AllNodes, BucketConfig),

    History = ns_bucket:past_vbucket_maps(Config),
    MatchingMaps = mb_map:find_matching_past_maps(AllNodes, CurrentMap,
                                                  CurrentOptions, History),

    find_delta_recovery_map_loop(MatchingMaps,
                                 Config, Bucket, CurrentOptions, DeltaNodes).

find_delta_recovery_map_loop([], _Config, _Bucket, _Options, _DeltaNodes) ->
    false;
find_delta_recovery_map_loop([TargetMap | Rest], Config, Bucket, Options, DeltaNodes) ->
    {_, TargetVBucketsDict} =
        lists:foldl(
          fun (Chain, {V, D}) ->
                  D1 = lists:foldl(
                         fun (Node, Acc) ->
                                 case lists:member(Node, Chain) of
                                     true ->
                                         dict:update(Node,
                                                     fun (Vs) ->
                                                             [V | Vs]
                                                     end, Acc);
                                     false ->
                                         Acc
                                 end
                         end, D, DeltaNodes),

                  {V+1, D1}
          end,
          {0, dict:from_list([{N, []} || N <- DeltaNodes])}, TargetMap),

    Usable =
        lists:all(
          fun (Node) ->
                  AllFailoverVBuckets = get_failover_vbuckets(Config, Node),
                  FailoverVBuckets = proplists:get_value(Bucket, AllFailoverVBuckets),
                  TargetVBuckets = lists:reverse(dict:fetch(Node, TargetVBucketsDict)),

                  TargetVBuckets =:= FailoverVBuckets
          end, DeltaNodes),

    case Usable of
        true ->
            {TargetMap, Options};
        false ->
            find_delta_recovery_map_loop(Rest, Config, Bucket, Options, DeltaNodes)
    end.

membase_delta_recovery_buckets(DeltaRecoveryBuckets, MembaseBucketConfigs) ->
    MembaseBuckets = [Bucket || {Bucket, _} <- MembaseBucketConfigs],

    case DeltaRecoveryBuckets of
        all ->
            MembaseBuckets;
        _ when is_list(DeltaRecoveryBuckets) ->
            ordsets:to_list(ordsets:intersection(ordsets:from_list(MembaseBuckets),
                                                 ordsets:from_list(DeltaRecoveryBuckets)))
    end.

build_delta_recovery_buckets(_AllNodes, [] = _DeltaNodes, _AllBucketConfigs, _DeltaRecoveryBuckets) ->
    {ok, []};
build_delta_recovery_buckets(AllNodes, DeltaNodes, AllBucketConfigs, DeltaRecoveryBuckets0) ->
    Config = ns_config:get(),

    MembaseBuckets = [P || {_, BucketConfig} = P <- AllBucketConfigs,
                           proplists:get_value(type, BucketConfig) =:= membase],
    DeltaRecoveryBuckets = membase_delta_recovery_buckets(DeltaRecoveryBuckets0, MembaseBuckets),

    %% such a non-lazy computation of the recovery maps is suboptimal, but
    %% not by much; it's done this way for better testability of
    %% build_delta_recovery_buckets_loop
    MappedConfigs = [{Bucket,
                      BucketConfig,
                      find_delta_recovery_map(Config, AllNodes, DeltaNodes,
                                              Bucket, BucketConfig)}
                     || {Bucket, BucketConfig} <- MembaseBuckets],

    case build_delta_recovery_buckets_loop(MappedConfigs, DeltaRecoveryBuckets, []) of
        {ok, Recovered0} ->
            RV = [{Bucket,
                   build_transitional_bucket_config(BucketConfig, Map, Opts, DeltaNodes),
                   {Map, Opts}}
                  || {Bucket, BucketConfig, {Map, Opts}} <- Recovered0],
            {ok, RV};
        Error ->
            Error
    end.

build_delta_recovery_buckets_loop([] = _MappedConfigs, _DeltaRecoveryBuckets, Acc) ->
    {ok, Acc};
build_delta_recovery_buckets_loop(MappedConfigs, DeltaRecoveryBuckets, Acc) ->
    [{Bucket, BucketConfig, RecoverResult0} | RestMapped] = MappedConfigs,

    NeedBucket = lists:member(Bucket, DeltaRecoveryBuckets),
    RecoverResult = case NeedBucket of
                        true ->
                            RecoverResult0;
                        false ->
                            false
                    end,
    case RecoverResult of
        {Map, Opts} ->
            ?rebalance_debug("Found delta recovery map for bucket ~s: ~p",
                             [Bucket, {Map, Opts}]),

            NewAcc = [{Bucket, BucketConfig, {Map, Opts}} | Acc],
            build_delta_recovery_buckets_loop(RestMapped, DeltaRecoveryBuckets, NewAcc);
        false ->
            case NeedBucket of
                true ->
                    ?rebalance_debug("Couldn't delta recover bucket ~s when we care about delta recovery of that bucket", [Bucket]),
                    %% run rest of elements for logging
                    _ = build_delta_recovery_buckets_loop(RestMapped, DeltaRecoveryBuckets, []),
                    {error, not_possible};
                false ->
                    build_delta_recovery_buckets_loop(RestMapped, DeltaRecoveryBuckets, Acc)
            end
    end.

membase_delta_recovery_buckets_test() ->
    MembaseBuckets = [{"b1", conf}, {"b3", conf}],
    ["b1", "b3"] = membase_delta_recovery_buckets(["b1", "b2", "b3", "b4"], MembaseBuckets),
    ["b1", "b3"] = membase_delta_recovery_buckets(all, MembaseBuckets).

build_delta_recovery_buckets_loop_test() ->
    MappedConfigs = [{"b1", conf1, {map, opts}},
                     {"b2", conf2, false}],
    All = membase_delta_recovery_buckets(all, [{"b1", conf}, {"b2", conf}]),

    {ok, []} = build_delta_recovery_buckets_loop([], All, []),
    {error, not_possible} = build_delta_recovery_buckets_loop(MappedConfigs, All, []),
    {error, not_possible} = build_delta_recovery_buckets_loop(MappedConfigs, ["b2"], []),
    {error, not_possible} = build_delta_recovery_buckets_loop(MappedConfigs, ["b1", "b2"], []),
    {ok, []} = build_delta_recovery_buckets_loop(MappedConfigs, [], []),
    ?assertEqual({ok, [{"b1", conf1, {map, opts}}]},
                 build_delta_recovery_buckets_loop(MappedConfigs, ["b1"], [])),
    ?assertEqual({ok, [{"b1", conf1, {map, opts}}]},
                 build_delta_recovery_buckets_loop([hd(MappedConfigs)], All, [])).
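
%% Install the transitional bucket configs computed for delta recovery,
%% mark the delta nodes active with their recovery markers cleared, sync the
%% config to those nodes, and wait until each recovered bucket is ready on
%% them.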
apply_delta_recovery_buckets([], _DeltaNodes, _CurrentBuckets) ->
    ok;
apply_delta_recovery_buckets(DeltaRecoveryBuckets, DeltaNodes, CurrentBuckets) ->
    NewBuckets = misc:update_proplist(
                   CurrentBuckets,
                   [{Bucket, BucketConfig} ||
                       {Bucket, BucketConfig, _} <- DeltaRecoveryBuckets]),
    NodeChanges = [[{{node, N, recovery_type}, none},
                    {{node, N, failover_vbuckets}, []},
                    {{node, N, membership}, active}] || N <- DeltaNodes],
    BucketChanges = {buckets, [{configs, NewBuckets}]},

    Changes = lists:flatten([BucketChanges, NodeChanges]),
    ok = ns_config:set(Changes),

    case ns_config_rep:ensure_config_seen_by_nodes(DeltaNodes) of
        ok ->
            cool;
        {error, SyncFailedNodes} ->
            exit({delta_recovery_config_synchronization_failed, SyncFailedNodes})
    end,

    lists:foreach(
      fun ({Bucket, _, _}) ->
              ok = wait_for_bucket(Bucket, DeltaNodes)
      end, DeltaRecoveryBuckets),

    ok.

maybe_clear_full_recovery_type(Nodes) ->
    Cfg = ns_config:latest(),
    NodeChanges = [[{{node, N, recovery_type}, none},
                    {{node, N, failover_vbuckets}, []}]
                   || N <- Nodes,
                      ns_cluster_membership:get_recovery_type(Cfg, N) =:= full],
    ok = ns_config:set(lists:flatten(NodeChanges)).

wait_for_bucket(Bucket, Nodes) ->
    ?log_debug("Waiting until bucket ~p gets ready on nodes ~p", [Bucket, Nodes]),
    do_wait_for_bucket(Bucket, Nodes).

do_wait_for_bucket(Bucket, Nodes) ->
    case janitor_agent:query_states_details(Bucket, Nodes, 60000) of
        {ok, _States, []} ->
            ?log_debug("Bucket ~p became ready on nodes ~p", [Bucket, Nodes]),
            ok;
        {ok, _States, Failures} ->
            case check_f