/src/rabbit_disk_monitor.erl

http://github.com/rabbitmq/rabbitmq-server · Erlang · 326 lines · 197 code · 58 blank · 71 comment · 0 complexity · 6af72b59e8e99b89dc9ccc18920dfbf5 MD5 · raw file

  1. % The contents of this file are subject to the Mozilla Public License
  2. %% Version 1.1 (the "License"); you may not use this file except in
  3. %% compliance with the License. You may obtain a copy of the License
  4. %% at https://www.mozilla.org/MPL/
  5. %%
  6. %% Software distributed under the License is distributed on an "AS IS"
  7. %% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
  8. %% the License for the specific language governing rights and
  9. %% limitations under the License.
  10. %%
  11. %% The Original Code is RabbitMQ.
  12. %%
  13. %% The Initial Developer of the Original Code is GoPivotal, Inc.
  14. %% Copyright (c) 2007-2020 VMware, Inc. or its affiliates. All rights reserved.
  15. %%
  16. -module(rabbit_disk_monitor).
  17. %% Disk monitoring server. Monitors free disk space
  18. %% periodically and sets alarms when it is below a certain
  19. %% watermark (configurable either as an absolute value or
  20. %% relative to the memory limit).
  21. %%
%% Disk monitoring is done by shelling out to `df` (located via the
%% PATH) on Unix-like systems and to `dir` on Windows, instead of
%% related built-in OTP functions, because currently this is the most
%% reliable way of determining free disk space for the partition our
%% internal database is on.
  26. %%
  27. %% Update interval is dynamically calculated assuming disk
  28. %% space is being filled at FAST_RATE.
  29. -behaviour(gen_server).
  30. -export([start_link/1]).
  31. -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
  32. terminate/2, code_change/3]).
  33. -export([get_disk_free_limit/0, set_disk_free_limit/1,
  34. get_min_check_interval/0, set_min_check_interval/1,
  35. get_max_check_interval/0, set_max_check_interval/1,
  36. get_disk_free/0, set_enabled/1]).
  37. -define(SERVER, ?MODULE).
  38. -define(DEFAULT_MIN_DISK_CHECK_INTERVAL, 100).
  39. -define(DEFAULT_MAX_DISK_CHECK_INTERVAL, 10000).
  40. -define(DEFAULT_DISK_FREE_LIMIT, 50000000).
  41. %% 250MB/s i.e. 250kB/ms
  42. -define(FAST_RATE, (250 * 1000)).
  43. -record(state, {
  44. %% monitor partition on which this directory resides
  45. dir,
  46. %% configured limit in bytes
  47. limit,
  48. %% last known free disk space amount in bytes
  49. actual,
  50. %% minimum check interval
  51. min_interval,
  52. %% maximum check interval
  53. max_interval,
  54. %% timer that drives periodic checks
  55. timer,
  56. %% is free disk space alarm currently in effect?
  57. alarmed,
  58. %% is monitoring enabled? false on unsupported
  59. %% platforms
  60. enabled,
  61. %% number of retries to enable monitoring if it fails
  62. %% on start-up
  63. retries,
  64. %% Interval between retries
  65. interval
  66. }).
  67. %%----------------------------------------------------------------------------
  68. -type disk_free_limit() :: (integer() | string() | {'mem_relative', float() | integer()}).
  69. %%----------------------------------------------------------------------------
  70. %% Public API
  71. %%----------------------------------------------------------------------------
  72. -spec get_disk_free_limit() -> integer().
  73. get_disk_free_limit() ->
  74. gen_server:call(?MODULE, get_disk_free_limit, infinity).
  75. -spec set_disk_free_limit(disk_free_limit()) -> 'ok'.
  76. set_disk_free_limit(Limit) ->
  77. gen_server:call(?MODULE, {set_disk_free_limit, Limit}, infinity).
  78. -spec get_min_check_interval() -> integer().
  79. get_min_check_interval() ->
  80. gen_server:call(?MODULE, get_min_check_interval, infinity).
  81. -spec set_min_check_interval(integer()) -> 'ok'.
  82. set_min_check_interval(Interval) ->
  83. gen_server:call(?MODULE, {set_min_check_interval, Interval}, infinity).
  84. -spec get_max_check_interval() -> integer().
  85. get_max_check_interval() ->
  86. gen_server:call(?MODULE, get_max_check_interval, infinity).
  87. -spec set_max_check_interval(integer()) -> 'ok'.
  88. set_max_check_interval(Interval) ->
  89. gen_server:call(?MODULE, {set_max_check_interval, Interval}, infinity).
  90. -spec get_disk_free() -> (integer() | 'unknown').
  91. -spec set_enabled(string()) -> 'ok'.
  92. get_disk_free() ->
  93. gen_server:call(?MODULE, get_disk_free, infinity).
  94. set_enabled(Enabled) ->
  95. gen_server:call(?MODULE, {set_enabled, Enabled}, infinity).
  96. %%----------------------------------------------------------------------------
  97. %% gen_server callbacks
  98. %%----------------------------------------------------------------------------
  99. -spec start_link(disk_free_limit()) -> rabbit_types:ok_pid_or_error().
  100. start_link(Args) ->
  101. gen_server:start_link({local, ?SERVER}, ?MODULE, [Args], []).
  102. init([Limit]) ->
  103. Dir = dir(),
  104. {ok, Retries} = application:get_env(rabbit, disk_monitor_failure_retries),
  105. {ok, Interval} = application:get_env(rabbit, disk_monitor_failure_retry_interval),
  106. State = #state{dir = Dir,
  107. min_interval = ?DEFAULT_MIN_DISK_CHECK_INTERVAL,
  108. max_interval = ?DEFAULT_MAX_DISK_CHECK_INTERVAL,
  109. alarmed = false,
  110. enabled = true,
  111. limit = Limit,
  112. retries = Retries,
  113. interval = Interval},
  114. {ok, enable(State)}.
  115. handle_call(get_disk_free_limit, _From, State = #state{limit = Limit}) ->
  116. {reply, Limit, State};
  117. handle_call({set_disk_free_limit, _}, _From, #state{enabled = false} = State) ->
  118. rabbit_log:info("Cannot set disk free limit: "
  119. "disabled disk free space monitoring", []),
  120. {reply, ok, State};
  121. handle_call({set_disk_free_limit, Limit}, _From, State) ->
  122. {reply, ok, set_disk_limits(State, Limit)};
  123. handle_call(get_min_check_interval, _From, State) ->
  124. {reply, State#state.min_interval, State};
  125. handle_call(get_max_check_interval, _From, State) ->
  126. {reply, State#state.max_interval, State};
  127. handle_call({set_min_check_interval, MinInterval}, _From, State) ->
  128. {reply, ok, State#state{min_interval = MinInterval}};
  129. handle_call({set_max_check_interval, MaxInterval}, _From, State) ->
  130. {reply, ok, State#state{max_interval = MaxInterval}};
  131. handle_call(get_disk_free, _From, State = #state { actual = Actual }) ->
  132. {reply, Actual, State};
  133. handle_call({set_enabled, _Enabled = true}, _From, State) ->
  134. start_timer(set_disk_limits(State, State#state.limit)),
  135. rabbit_log:info("Free disk space monitor was enabled"),
  136. {reply, ok, State#state{enabled = true}};
  137. handle_call({set_enabled, _Enabled = false}, _From, State) ->
  138. erlang:cancel_timer(State#state.timer),
  139. rabbit_log:info("Free disk space monitor was manually disabled"),
  140. {reply, ok, State#state{enabled = false}};
  141. handle_call(_Request, _From, State) ->
  142. {noreply, State}.
  143. handle_cast(_Request, State) ->
  144. {noreply, State}.
  145. handle_info(try_enable, #state{retries = Retries} = State) ->
  146. {noreply, enable(State#state{retries = Retries - 1})};
  147. handle_info(update, State) ->
  148. {noreply, start_timer(internal_update(State))};
  149. handle_info(_Info, State) ->
  150. {noreply, State}.
  151. terminate(_Reason, _State) ->
  152. ok.
  153. code_change(_OldVsn, State, _Extra) ->
  154. {ok, State}.
  155. %%----------------------------------------------------------------------------
  156. %% Server Internals
  157. %%----------------------------------------------------------------------------
  158. % the partition / drive containing this directory will be monitored
  159. dir() -> rabbit_mnesia:dir().
  160. set_disk_limits(State, Limit0) ->
  161. Limit = interpret_limit(Limit0),
  162. State1 = State#state { limit = Limit },
  163. rabbit_log:info("Disk free limit set to ~pMB~n",
  164. [trunc(Limit / 1000000)]),
  165. internal_update(State1).
  166. internal_update(State = #state { limit = Limit,
  167. dir = Dir,
  168. alarmed = Alarmed}) ->
  169. CurrentFree = get_disk_free(Dir),
  170. NewAlarmed = CurrentFree < Limit,
  171. case {Alarmed, NewAlarmed} of
  172. {false, true} ->
  173. emit_update_info("insufficient", CurrentFree, Limit),
  174. rabbit_alarm:set_alarm({{resource_limit, disk, node()}, []});
  175. {true, false} ->
  176. emit_update_info("sufficient", CurrentFree, Limit),
  177. rabbit_alarm:clear_alarm({resource_limit, disk, node()});
  178. _ ->
  179. ok
  180. end,
  181. State #state {alarmed = NewAlarmed, actual = CurrentFree}.
  182. get_disk_free(Dir) ->
  183. get_disk_free(Dir, os:type()).
  184. get_disk_free(Dir, {unix, Sun})
  185. when Sun =:= sunos; Sun =:= sunos4; Sun =:= solaris ->
  186. Df = os:find_executable("df"),
  187. parse_free_unix(rabbit_misc:os_cmd(Df ++ " -k " ++ Dir));
  188. get_disk_free(Dir, {unix, _}) ->
  189. Df = os:find_executable("df"),
  190. parse_free_unix(rabbit_misc:os_cmd(Df ++ " -kP " ++ Dir));
  191. get_disk_free(Dir, {win32, _}) ->
  192. %% On Windows, the Win32 API enforces a limit of 260 characters
  193. %% (MAX_PATH). If we call `dir` with a path longer than that, it
  194. %% fails with "File not found". Starting with Windows 10 version
  195. %% 1607, this limit was removed, but the administrator has to
  196. %% configure that.
  197. %%
  198. %% NTFS supports paths up to 32767 characters. Therefore, paths
  199. %% longer than 260 characters exist but they are "inaccessible" to
  200. %% `dir`.
  201. %%
  202. %% A workaround is to tell the Win32 API to not parse a path and
  203. %% just pass it raw to the underlying filesystem. To do this, the
  204. %% path must be prepended with "\\?\". That's what we do here.
  205. %%
  206. %% However, the underlying filesystem may not support forward
  207. %% slashes transparently, as the Win32 API does. Therefore, we
  208. %% convert all forward slashes to backslashes.
  209. %%
  210. %% See the following page to learn more about this:
  211. %% https://ss64.com/nt/syntax-filenames.html
  212. RawDir = "\\\\?\\" ++ string:replace(Dir, "/", "\\", all),
  213. parse_free_win32(rabbit_misc:os_cmd("dir /-C /W \"" ++ RawDir ++ "\"")).
  214. parse_free_unix(Str) ->
  215. case string:tokens(Str, "\n") of
  216. [_, S | _] -> case string:tokens(S, " \t") of
  217. [_, _, _, Free | _] -> list_to_integer(Free) * 1024;
  218. _ -> exit({unparseable, Str})
  219. end;
  220. _ -> exit({unparseable, Str})
  221. end.
  222. parse_free_win32(CommandResult) ->
  223. LastLine = lists:last(string:tokens(CommandResult, "\r\n")),
  224. {match, [Free]} = re:run(lists:reverse(LastLine), "(\\d+)",
  225. [{capture, all_but_first, list}]),
  226. list_to_integer(lists:reverse(Free)).
  227. interpret_limit({mem_relative, Relative})
  228. when is_number(Relative) ->
  229. round(Relative * vm_memory_monitor:get_total_memory());
  230. interpret_limit(Absolute) ->
  231. case rabbit_resource_monitor_misc:parse_information_unit(Absolute) of
  232. {ok, ParsedAbsolute} -> ParsedAbsolute;
  233. {error, parse_error} ->
  234. rabbit_log:error("Unable to parse disk_free_limit value ~p",
  235. [Absolute]),
  236. ?DEFAULT_DISK_FREE_LIMIT
  237. end.
  238. emit_update_info(StateStr, CurrentFree, Limit) ->
  239. rabbit_log:info(
  240. "Free disk space is ~s. Free bytes: ~p. Limit: ~p~n",
  241. [StateStr, CurrentFree, Limit]).
  242. start_timer(State) ->
  243. State#state{timer = erlang:send_after(interval(State), self(), update)}.
  244. interval(#state{alarmed = true,
  245. max_interval = MaxInterval}) ->
  246. MaxInterval;
  247. interval(#state{limit = Limit,
  248. actual = Actual,
  249. min_interval = MinInterval,
  250. max_interval = MaxInterval}) ->
  251. IdealInterval = 2 * (Actual - Limit) / ?FAST_RATE,
  252. trunc(erlang:max(MinInterval, erlang:min(MaxInterval, IdealInterval))).
  253. enable(#state{retries = 0} = State) ->
  254. State;
  255. enable(#state{dir = Dir, interval = Interval, limit = Limit, retries = Retries}
  256. = State) ->
  257. case {catch get_disk_free(Dir),
  258. vm_memory_monitor:get_total_memory()} of
  259. {N1, N2} when is_integer(N1), is_integer(N2) ->
  260. rabbit_log:info("Enabling free disk space monitoring~n", []),
  261. start_timer(set_disk_limits(State, Limit));
  262. Err ->
  263. rabbit_log:info("Free disk space monitor encountered an error "
  264. "(e.g. failed to parse output from OS tools): ~p, retries left: ~b~n",
  265. [Err, Retries]),
  266. erlang:send_after(Interval, self(), try_enable),
  267. State#state{enabled = false}
  268. end.