PageRenderTime 356ms CodeModel.GetById 110ms app.highlight 87ms RepoModel.GetById 155ms app.codeStats 0ms

/src/clj/backtype/storm/crate/ganglia.clj

http://github.com/nathanmarz/storm-deploy
Clojure | 377 lines | 291 code | 39 blank | 47 comment | 0 complexity | 8f8d8412ae2c5443caed6a36475175de MD5 | raw file
  1(ns backtype.storm.crate.ganglia
  2  "Install and configure ganglia."
  3  (:require
  4   [pallet.argument :as argument]
  5   [pallet.compute :as compute]
  6   [pallet.session :as session]
  7   [pallet.action :as action]
  8   [pallet.stevedore :as stevedore]
  9   [pallet.action.remote-file :as remote-file]
 10   [pallet.action.file :as file]
 11   [pallet.action.package :as package]
 12   [pallet.crate.nagios-config :as nagios-config]
 13   [pallet.action.service :as action-service]
 14   [clojure.string :as string]))
 15
 16(defn install
 17  [session]
 18  (-> session
 19      (package/packages
 20       :aptitude ["rrdtool" "librrds-perl" "librrd-dev" "php5-gd" "libapache2-mod-php5"
 21                  "ganglia-monitor" "ganglia-webfrontend" "gmetad"])
 22      (file/symbolic-link
 23       "/usr/share/ganglia-webfrontend" "/var/www/ganglia")))
 24
 25(defn monitor
 26  [session]
 27  (package/packages session :aptitude ["ganglia-monitor"]))
 28
 29(defn data-source
 30  [session [id {:keys [interval hosts] :or {interval 15}}]]
 31  (format
 32   "data_source \"%s\" %d %s\n"
 33   id interval
 34   (string/join " " (if (or (seq? hosts) (vector? hosts))
 35                      hosts
 36                      (map
 37                       compute/private-ip
 38                       (session/nodes-in-group session hosts))))))
 39
 40(defn configure*
 41  [session {:keys [data_sources rras trusted_hosts]
 42            :as options}]
 43  (str
 44   (reduce #(str %1 (data-source session %2)) "" data_sources)
 45   (when rras
 46     (reduce #(str %1 (format " \"%s\"" %2)) "RRAs" rras))
 47   (when trusted_hosts
 48     (reduce #(str %1 (format " %s" %2)) "trusted_hosts" trusted_hosts))
 49   (reduce
 50    #(str %1 (format "%s %s" (name (first %2)) (second %2)))
 51    ""
 52    (select-keys
 53     options
 54     [:scalable :gridname :authority :all_trusted
 55      :setuid :setuid_username :xml_port :interactive_port
 56      :server_threads :rrd_rootdir]))))
 57
 58(defn configure
 59  "Each data source is a map, keyed by data source name.
 60     :interval   (15s)
 61     :hosts      list of hosts, or tag name"
 62  [session & {:keys [data_sources] :as options}]
 63  (remote-file/remote-file
 64   session
 65   "/etc/ganglia/gmetad.conf"
 66   :content (argument/delayed [session]
 67             (configure* session options))
 68   :mode 644))
 69
 70
 71(declare format-value)
 72
 73(defn format-map
 74  [[key value]]
 75  (cond
 76   (map? value) (format
 77                 "%s {\n%s}\n" (name key) (format-value value))
 78   (or (seq? value)
 79       (vector? value)) (string/join
 80                         ""
 81                         (map #(format-value {key %}) value))
 82   (= :include key) (format "%s (%s)\n" (name key) (format-value value))
 83   :else (format "%s = %s\n" (name key) (format-value value))))
 84
 85(defn format-value
 86  [value]
 87  (cond
 88   (map? value) (string/join
 89                 ""
 90                 (map format-map value))
 91   (string? value) (format "\"%s\"" value)
 92   (keyword? value) (name value)
 93   :else (format "%s" value)))
 94
 95(defn metrics
 96  "Configure metrics"
 97  [session master-group {:as options}]
 98  (let [master-nodes (session/nodes-in-group session master-group)
 99        master-ip (-> master-nodes first compute/private-ip)]
100    (remote-file/remote-file
101     session
102     "/etc/ganglia/gmond.conf"
103     :content (format-value (assoc-in options [:udp_send_channel :host] (keyword master-ip)))
104     :mode 644)))
105
;; Default gmond configuration, expressed as nested Clojure data.
;; It is passed to `metrics`, which renders it with `format-value`:
;;   - a nested map becomes a `name { ... }` section
;;   - a vector repeats its key once per element
;;   - keywords are written bare, strings are written in double quotes
;;   - the :include entry is written as `include (...)`
(def default-metrics
  {:globals {:daemonize :yes
             :setuid :yes
             :user :ganglia
             :debug_level 0
             :max_udp_msg_len  1472
             :mute :false
             :deaf :false
             :host_dmax  0              ; secs
             :cleanup_threshold  300    ; secs
             :gexec :no
             :send_metadata_interval  0}

   ;; If a cluster attribute is specified, then all gmond hosts are wrapped
   ;; inside of a <CLUSTER> tag.  If you do not specify a cluster tag, then all
   ;; <HOSTS> will NOT be wrapped inside of a <CLUSTER> tag.
   :cluster {:name "unspecified"
             :owner "unspecified"
             :latlong "unspecified"
             :url "unspecified"}

   ;;  The host section describes attributes of the host, like the location
   :host {:location  "unspecified"}

   ;; Feel free to specify as many udp_send_channels as you like.  Gmond used
   ;; to only support having a single channel
   ;; :host is a placeholder here; `metrics` overwrites it with the master
   ;; node's private IP before rendering.
   :udp_send_channel {:host nil
                      :port  8650}

   ;; You can specify as many udp_recv_channels as you like as well.
   :udp_recv_channel {:port  8650
                      :family :inet4}

   ;; You can specify as many tcp_accept_channels as you like to share
   ;; an xml description of the state of the cluster
   :tcp_accept_channel { :port  8649 }

   ;; Each metrics module that is referenced by gmond must be specified and
   ;; loaded. If the module has been statically linked with gmond, it does not
   ;; require a load path. However all dynamically loadable modules must include
   ;; a load path.
   :modules {:module [{:name  "core_metrics"}
                      {:name  "cpu_module"
                       :path  "/usr/lib/ganglia/modcpu.so"}
                      {:name  "disk_module"
                       :path  "/usr/lib/ganglia/moddisk.so"}
                      {:name  "load_module"
                       :path  "/usr/lib/ganglia/modload.so"}
                      {:name  "mem_module"
                       :path  "/usr/lib/ganglia/modmem.so"}
                      {:name  "net_module"
                       :path  "/usr/lib/ganglia/modnet.so"}
                      {:name  "proc_module"
                       :path  "/usr/lib/ganglia/modproc.so"}
                      {:name  "sys_module"
                       :path  "/usr/lib/ganglia/modsys.so"}]}

   :include "/etc/ganglia/conf.d/*.conf"


   ;; The old internal 2.5.x metric array has been replaced by the following
   ;; collection_group directives.  What follows is the default behavior for
   ;; collecting and sending metrics that is as close to 2.5.x behavior as
   ;; possible.

   ;; This collection group will cause a heartbeat (or beacon) to be sent every
   ;; 20 seconds.  In the heartbeat is the GMOND_STARTED data which expresses
   ;; the age of the running gmond.
   :collection_group
   [{:collect_once  :yes
     :time_threshold  20
     :metric {:name  "heartbeat"}}

    ;; This collection group will send general info about this host every 1200
    ;; secs. This information doesn't change between reboots and is only
    ;; collected once.
    {:collect_once  :yes
     :time_threshold  1200
     :metric [{:name  "cpu_num"
               :title  "CPU Count"}
              {:name  "cpu_speed"
               :title  "CPU Speed"}
              {:name  "mem_total"
               :title  "Memory Total"}
              ;; Should this be here? Swap can be added/removed
              ;; between reboots.
              {:name  "swap_total"
               :title  "Swap Space Total"}
              {:name  "boottime"
               :title  "Last Boot Time"}
              {:name  "machine_type"
               :title  "Machine Type"}
              {:name  "os_name"
               :title  "Operating System"}
              {:name  "os_release"
               :title  "Operating System Release"}
              {:name  "location"
               :title  "Location"}]}

    ;; This collection group will send the status of gexecd for this host every
    ;; 300 secs. Unlike 2.5.x the default behavior is to report gexecd OFF.
    {:collect_once  :yes
     :time_threshold  300
     :metric {:name  "gexec"
              :title  "Gexec Status"}}

    ;; This collection group will collect the CPU status info every 20 secs.  The
    ;; time threshold is set to 90 seconds.  In honesty, this time_threshold
    ;; could be set significantly higher to reduce unnecessary network
    ;; chatter.
    {
     :collect_every  20
     :time_threshold  90
     ;;  CPU status
     :metric [{:name  "cpu_user"
               :value_threshold  "1.0"
               :title  "CPU User"}
              {:name  "cpu_system"
               :value_threshold  "1.0"
               :title  "CPU System"}
              {:name  "cpu_idle"
               :value_threshold  "5.0"
               :title  "CPU Idle"}
              {:name  "cpu_nice"
               :value_threshold  "1.0"
               :title  "CPU Nice"}
              {:name  "cpu_aidle"
               :value_threshold  "5.0"
               :title  "CPU aidle"}
              {:name  "cpu_wio"
               :value_threshold  "1.0"
               :title  "CPU wio"}

              ;; The next two metrics are optional if you want
              ;; more detail, since they are accounted
              ;; for in cpu_system.

              ;; {:name  "cpu_intr"
              ;;  :value_threshold  "1.0"
              ;;  :title  "CPU intr"}
              ;; {:name  "cpu_sintr"
              ;;  :value_threshold  "1.0"
              ;;  :title  "CPU sintr"}
              ]}

    {:collect_every  20
     :time_threshold  90
     ;; Load Averages
     :metric [{:name  "load_one"
               :value_threshold  "1.0"
               :title  "One Minute Load Average"}
              {:name  "load_five"
               :value_threshold  "1.0"
               :title  "Five Minute Load Average"}
              {:name  "load_fifteen"
               :value_threshold  "1.0"
               :title  "Fifteen Minute Load Average"}]}

    ;; This group collects the number of running and total processes
    {:collect_every  80
     :time_threshold  950
     :metric [{:name  "proc_run"
               :value_threshold  "1.0"
               :title  "Total Running Processes"}
              {:name  "proc_total"
               :value_threshold  "1.0"
               :title  "Total Processes"}]}

    ;; This collection group grabs the volatile memory metrics every 40 secs and
    ;; sends them at least every 180 secs.  This time_threshold can be increased
    ;; significantly to reduce unneeded network traffic.
    {
     :collect_every  40
     :time_threshold  180
     :metric [{:name  "mem_free"
               :value_threshold  "1024.0"
               :title  "Free Memory"}
              {:name  "mem_shared"
               :value_threshold  "1024.0"
               :title  "Shared Memory"}
              {:name  "mem_buffers"
               :value_threshold  "1024.0"
               :title  "Memory Buffers"}
              {:name  "mem_cached"
               :value_threshold  "1024.0"
               :title  "Cached Memory"}
              {:name  "swap_free"
               :value_threshold  "1024.0"
               :title  "Free Swap Space"}]}

    ;; Network traffic: bytes and packets sent and received.
    {:collect_every  40
     :time_threshold  300
     :metric [{:name  "bytes_out"
               :value_threshold  4096
               :title  "Bytes Sent"}
              {:name  "bytes_in"
               :value_threshold  4096
               :title  "Bytes Received"}
              {:name  "pkts_in"
               :value_threshold  256
               :title  "Packets Received"}
              {:name  "pkts_out"
               :value_threshold  256
               :title  "Packets Sent"}]}

    ;; Different than 2.5.x default since the old config made no sense
    {:collect_every  1800
     :time_threshold  3600
     :metric {:name  "disk_total"
              :value_threshold  1.0
              :title  "Total Disk Space"}}

    ;; Disk usage: free space and the fullest partition.
    {:collect_every  40
     :time_threshold  180
     :metric [{:name  "disk_free"
               :value_threshold  1.0
               :title  "Disk Space Available"}
              {:name  "part_max_used"
               :value_threshold  1.0
               :title  "Maximum Disk Space Used"}]}]})
326
(defn nagios-monitor
  "Monitor ganglia web frontend using nagios.

   Options:
     :url                  URL path checked over HTTP (default \"/ganglia\")
     :service_description  nagios service description
                           (default \"Ganglia Web Frontend\")

   Both values are handed to nagios-config/monitor-http."
  [session & {:keys [url service_description]
      :or {service_description "Ganglia Web Frontend"}
      :as options}]
  (nagios-config/monitor-http
   session
   ;; Honour the caller's :url; a missing or nil :url keeps the default path.
   :url (or url "/ganglia")
   :service_description service_description))
336
(defn check-ganglia-script
  "Install the check_ganglia.py nagios plugin from the
   crate/ganglia/check_ganglia.py template (mode 0755) and register a
   `check_ganglia` nagios command that invokes it with the host name and
   three arguments (passed to the script as -m, -w and -c)."
  [session]
  (let [with-plugin (remote-file/remote-file
                     session
                     "/usr/lib/nagios/plugins/check_ganglia.py"
                     :template "crate/ganglia/check_ganglia.py"
                     :mode "0755")]
    (nagios-config/command
     with-plugin
     :command_name "check_ganglia"
     :command_line
     "$USER1$/check_ganglia.py -h $HOSTNAME$ -m $ARG1$ -w $ARG2$ -c $ARG3$")))
348
(defn nagios-monitor-metric
  "Register a nagios service that checks a ganglia metric through the
   `check_ganglia` command, passing `metric`, `warn` and `critical` as its
   three arguments.

   Options:
     :service_description  defaults to the metric rendered as a string
     :servicegroups        defaults to [:ganglia-metrics]"
  [session metric warn critical
   & {:keys [service_description servicegroups]
      :or {servicegroups [:ganglia-metrics]}}]
  (let [description (if service_description
                      service_description
                      (format "%s" metric))
        check (format "check_ganglia!%s!%s!%s" metric warn critical)]
    (-> session
        (nagios-config/service
         {:service_description description
          :servicegroups servicegroups
          :check_command check}))))
358
(defn ganglia-master
  "Set up a ganglia master: install the packages, write gmetad.conf with a
   single \"localhost\" data source, install the monitor package, write
   gmond.conf from `default-metrics` pointing at `master-group`, and restart
   apache2."
  [req master-group]
  (let [local-sources {"localhost" {:hosts ["localhost"]}}]
    (-> req
        (install)
        (configure :data_sources local-sources)
        (monitor)
        (metrics master-group default-metrics)
        (action-service/service "apache2" :action :restart))))
368
(defn ganglia-node
  "Set up a ganglia node: install the monitor package, then write gmond.conf
   from `default-metrics` pointing at `master-group`."
  [req master-group]
  (metrics (monitor req) master-group default-metrics))
374
(defn ganglia-finish
  "Restart the ganglia-monitor service."
  [req]
  (-> req
      (action-service/service "ganglia-monitor" :action :restart)))
377