/src/clj/backtype/storm/crate/ganglia.clj

http://github.com/nathanmarz/storm-deploy · Clojure · 377 lines · 291 code · 39 blank · 47 comment · 7 complexity · 8f8d8412ae2c5443caed6a36475175de MD5 · raw file

  1. (ns backtype.storm.crate.ganglia
  2. "Install and configure ganglia."
  3. (:require
  4. [pallet.argument :as argument]
  5. [pallet.compute :as compute]
  6. [pallet.session :as session]
  7. [pallet.action :as action]
  8. [pallet.stevedore :as stevedore]
  9. [pallet.action.remote-file :as remote-file]
  10. [pallet.action.file :as file]
  11. [pallet.action.package :as package]
  12. [pallet.crate.nagios-config :as nagios-config]
  13. [pallet.action.service :as action-service]
  14. [clojure.string :as string]))
  (defn install
    "Install the ganglia server-side packages (rrdtool, the PHP/Apache
     pieces, ganglia-monitor, ganglia-webfrontend and gmetad) through
     aptitude, then symlink /usr/share/ganglia-webfrontend and
     /var/www/ganglia (presumably to expose the frontend under the web
     root -- confirm against pallet's symbolic-link argument order).
     Returns the updated session."
    [session]
    (let [ganglia-packages ["rrdtool" "librrds-perl" "librrd-dev" "php5-gd" "libapache2-mod-php5"
                            "ganglia-monitor" "ganglia-webfrontend" "gmetad"]
          with-packages (package/packages session :aptitude ganglia-packages)]
      (file/symbolic-link
       with-packages
       "/usr/share/ganglia-webfrontend" "/var/www/ganglia")))
  (defn monitor
    "Install only the ganglia-monitor package through aptitude and
     return the updated session."
    [session]
    (-> session
        (package/packages :aptitude ["ganglia-monitor"])))
  (defn data-source
    "Render one gmetad `data_source` line, terminated by a newline.

     The second argument is an [id spec] pair, where spec may contain:
       :interval  polling interval, defaults to 15
       :hosts     either a seq/vector of host strings, used verbatim, or
                  any other value, which is treated as a group name and
                  expanded to the private IPs of that group's nodes"
    [session [id {:keys [interval hosts] :or {interval 15}}]]
    (let [literal-hosts? (or (seq? hosts) (vector? hosts))
          addresses (if literal-hosts?
                      hosts
                      (map compute/private-ip
                           (session/nodes-in-group session hosts)))]
      (format "data_source \"%s\" %d %s\n"
              id
              interval
              (string/join " " addresses))))
  (defn configure*
    "Build the text of gmetad.conf from `options`.

     Recognised keys:
       :data_sources   map of data source name -> spec, one `data_source`
                       line per entry (rendered by `data-source`)
       :rras           collection of RRA definitions, emitted double-quoted
                       on a single `RRAs` line
       :trusted_hosts  collection of hosts, emitted on a single
                       `trusted_hosts` line
       :scalable :gridname :authority :all_trusted :setuid
       :setuid_username :xml_port :interactive_port :server_threads
       :rrd_rootdir    scalar directives, emitted as `<name> <value>`

     Every directive is terminated by a newline so that each one lands on
     its own line of the configuration file. Empty :rras / :trusted_hosts
     collections produce no line at all. Scalar values are written
     verbatim; callers must supply any quoting gmetad expects."
    [session {:keys [data_sources rras trusted_hosts]
              :as options}]
    (str
     ;; data-source already ends each line with "\n"
     (string/join (map #(data-source session %) data_sources))
     (when (seq rras)
       (str "RRAs" (string/join (map #(format " \"%s\"" %) rras)) "\n"))
     (when (seq trusted_hosts)
       (str "trusted_hosts"
            (string/join (map #(format " %s" %) trusted_hosts))
            "\n"))
     (string/join
      (for [[directive value] (select-keys
                               options
                               [:scalable :gridname :authority :all_trusted
                                :setuid :setuid_username :xml_port :interactive_port
                                :server_threads :rrd_rootdir])]
        ;; newline-terminated so consecutive directives do not run together
        (format "%s %s\n" (name directive) value)))))
  (defn configure
    "Write /etc/ganglia/gmetad.conf (mode 644) from the keyword options.

     :data_sources is a map keyed by data source name; each value may hold
       :interval  polling interval (15s by default)
       :hosts     list of hosts, or a tag name
     The remaining options are passed through to `configure*`. The content
     is wrapped in `argument/delayed`, so it is presumably rendered when
     the action runs rather than when it is declared."
    [session & {:keys [data_sources] :as options}]
    (let [gmetad-conf (argument/delayed [session]
                        (configure* session options))]
      (remote-file/remote-file
       session
       "/etc/ganglia/gmetad.conf"
       :content gmetad-conf
       :mode 644)))
  (declare format-value)

  (defn format-map
    "Render one [key value] entry of a configuration map as text.

       map value            -> `key {` newline, nested entries, `}`
       seq or vector value  -> the entry repeated once per element
       :include key         -> `include (value)`
       anything else        -> `key = value`"
    [[k v]]
    (cond
      (map? v)
      (str (name k) " {\n" (format-value v) "}\n")

      (or (seq? v) (vector? v))
      (apply str (map (fn [item] (format-value {k item})) v))

      (= :include k)
      (str (name k) " (" (format-value v) ")\n")

      :else
      (str (name k) " = " (format-value v) "\n")))

  (defn format-value
    "Render a configuration value as text: maps become the concatenation
     of their rendered entries, strings are wrapped in double quotes,
     keywords are written as their bare name, and everything else goes
     through `format` with %s."
    [value]
    (cond
      (map? value)     (apply str (map format-map value))
      (string? value)  (str \" value \")
      (keyword? value) (name value)
      :else            (format "%s" value)))
  (defn metrics
    "Write /etc/ganglia/gmond.conf (mode 644) from the `options` map.

     The private IP of the first node in `master-group` is stored under
     [:udp_send_channel :host] before rendering. It is converted to a
     keyword so that `format-value` writes it without surrounding quotes."
    [session master-group {:as options}]
    (let [master-ip (compute/private-ip
                     (first (session/nodes-in-group session master-group)))
          gmond-conf (assoc-in options
                               [:udp_send_channel :host]
                               (keyword master-ip))]
      (remote-file/remote-file
       session
       "/etc/ganglia/gmond.conf"
       :content (format-value gmond-conf)
       :mode 644)))
  ;; Default gmond settings, expressed as a nested map. `ganglia-master` and
  ;; `ganglia-node` pass this map to `metrics`, which renders it with
  ;; `format-value` into /etc/ganglia/gmond.conf. In the rendered text,
  ;; keywords are written bare, strings are double-quoted, maps become
  ;; `name { ... }` sections and vectors repeat the section once per element.
  ;; [:udp_send_channel :host] is nil here; `metrics` replaces it with the
  ;; private IP of the master node.
  95. (def default-metrics
  96. {:globals {:daemonize :yes
  97. :setuid :yes
  98. :user :ganglia
  99. :debug_level 0
  100. :max_udp_msg_len 1472
  101. :mute :false
  102. :deaf :false
  103. :host_dmax 0 ; secs
  104. :cleanup_threshold 300 ; secs
  105. :gexec :no
  106. :send_metadata_interval 0}
  107. ;; If a cluster attribute is specified, then all gmond hosts are wrapped
  108. ;; inside of a <CLUSTER> tag. If you do not specify a cluster tag, then all
  109. ;; <HOSTS> will NOT be wrapped inside of a <CLUSTER> tag.
  110. :cluster {:name "unspecified"
  111. :owner "unspecified"
  112. :latlong "unspecified"
  113. :url "unspecified"}
  114. ;; The host section describes attributes of the host, like the location
  115. :host {:location "unspecified"}
  116. ;; Feel free to specify as many udp_send_channels as you like. Gmond used
  117. ;; to only support having a single channel
  118. :udp_send_channel {:host nil
  119. :port 8650}
  120. ;; You can specify as many udp_recv_channels as you like as well.
  121. :udp_recv_channel {:port 8650
  122. :family :inet4}
  123. ;; You can specify as many tcp_accept_channels as you like to share
  124. ;; an xml description of the state of the cluster
  125. :tcp_accept_channel { :port 8649 }
  126. ;; Each metrics module that is referenced by gmond must be specified and
  127. ;; loaded. If the module has been statically linked with gmond, it does not
  128. ;; require a load path. However all dynamically loadable modules must include
  129. ;; a load path.
  130. :modules {:module [{:name "core_metrics"}
  131. {:name "cpu_module"
  132. :path "/usr/lib/ganglia/modcpu.so"}
  133. {:name "disk_module"
  134. :path "/usr/lib/ganglia/moddisk.so"}
  135. {:name "load_module"
  136. :path "/usr/lib/ganglia/modload.so"}
  137. {:name "mem_module"
  138. :path "/usr/lib/ganglia/modmem.so"}
  139. {:name "net_module"
  140. :path "/usr/lib/ganglia/modnet.so"}
  141. {:name "proc_module"
  142. :path "/usr/lib/ganglia/modproc.so"}
  143. {:name "sys_module"
  144. :path "/usr/lib/ganglia/modsys.so"}]}
  145. :include "/etc/ganglia/conf.d/*.conf"
  146. ;; The old internal 2.5.x metric array has been replaced by the following
  147. ;; collection_group directives. What follows is the default behavior for
  148. ;; collecting and sending metrics that is as close to 2.5.x behavior as
  149. ;; possible.
  150. ;; This collection group will cause a heartbeat (or beacon) to be sent every
  151. ;; 20 seconds. In the heartbeat is the GMOND_STARTED data which expresses
  152. ;; the age of the running gmond.
  153. :collection_group
  154. [{:collect_once :yes
  155. :time_threshold 20
  156. :metric {:name "heartbeat"}}
  157. ;; This collection group will send general info about this host every 1200
  158. ;; secs. This information doesn't change between reboots and is only
  159. ;; collected once.
  160. {:collect_once :yes
  161. :time_threshold 1200
  162. :metric [{:name "cpu_num"
  163. :title "CPU Count"}
  164. {:name "cpu_speed"
  165. :title "CPU Speed"}
  166. {:name "mem_total"
  167. :title "Memory Total"}
  168. ;; Should this be here? Swap can be added/removed
  169. ;; between reboots.
  170. {:name "swap_total"
  171. :title "Swap Space Total"}
  172. {:name "boottime"
  173. :title "Last Boot Time"}
  174. {:name "machine_type"
  175. :title "Machine Type"}
  176. {:name "os_name"
  177. :title "Operating System"}
  178. {:name "os_release"
  179. :title "Operating System Release"}
  180. {:name "location"
  181. :title "Location"}]}
  182. ;; This collection group will send the status of gexecd for this host every
  183. ;; 300 secs. Unlike 2.5.x the default behavior is to report gexecd OFF.
  184. {:collect_once :yes
  185. :time_threshold 300
  186. :metric {:name "gexec"
  187. :title "Gexec Status"}}
  188. ;; This collection group will collect the CPU status info every 20 secs. The
  189. ;; time threshold is set to 90 seconds. In honesty, this time_threshold
  190. ;; could be set significantly higher to reduce unnecessary network
  191. ;; chatter.
  192. {
  193. :collect_every 20
  194. :time_threshold 90
  195. ;; CPU status
  196. :metric [{:name "cpu_user"
  197. :value_threshold "1.0"
  198. :title "CPU User"}
  199. {:name "cpu_system"
  200. :value_threshold "1.0"
  201. :title "CPU System"}
  202. {:name "cpu_idle"
  203. :value_threshold "5.0"
  204. :title "CPU Idle"}
  205. {:name "cpu_nice"
  206. :value_threshold "1.0"
  207. :title "CPU Nice"}
  208. {:name "cpu_aidle"
  209. :value_threshold "5.0"
  210. :title "CPU aidle"}
  211. {:name "cpu_wio"
  212. :value_threshold "1.0"
  213. :title "CPU wio"}
  214. ;; The next two metrics are optional if you want
  215. ;; more detail, since they are accounted
  216. ;; for in cpu_system.
  217. ;; {:name "cpu_intr"
  218. ;; :value_threshold "1.0"
  219. ;; :title "CPU intr"}
  220. ;; {:name "cpu_sintr"
  221. ;; :value_threshold "1.0"
  222. ;; :title "CPU sintr"}
  223. ]}
  224. {:collect_every 20
  225. :time_threshold 90
  226. ;; Load Averages
  227. :metric [{:name "load_one"
  228. :value_threshold "1.0"
  229. :title "One Minute Load Average"}
  230. {:name "load_five"
  231. :value_threshold "1.0"
  232. :title "Five Minute Load Average"}
  233. {:name "load_fifteen"
  234. :value_threshold "1.0"
  235. :title "Fifteen Minute Load Average"}]}
  236. ;; This group collects the number of running and total processes
  237. {:collect_every 80
  238. :time_threshold 950
  239. :metric [{:name "proc_run"
  240. :value_threshold "1.0"
  241. :title "Total Running Processes"}
  242. {:name "proc_total"
  243. :value_threshold "1.0"
  244. :title "Total Processes"}]}
  245. ;; This collection group grabs the volatile memory metrics every 40 secs and
  246. ;; sends them at least every 180 secs. This time_threshold can be increased
  247. ;; significantly to reduce unneeded network traffic.
  248. {
  249. :collect_every 40
  250. :time_threshold 180
  251. :metric [{:name "mem_free"
  252. :value_threshold "1024.0"
  253. :title "Free Memory"}
  254. {:name "mem_shared"
  255. :value_threshold "1024.0"
  256. :title "Shared Memory"}
  257. {:name "mem_buffers"
  258. :value_threshold "1024.0"
  259. :title "Memory Buffers"}
  260. {:name "mem_cached"
  261. :value_threshold "1024.0"
  262. :title "Cached Memory"}
  263. {:name "swap_free"
  264. :value_threshold "1024.0"
  265. :title "Free Swap Space"}]}
  ;; Network traffic counters: bytes and packets in each direction.
  266. {:collect_every 40
  267. :time_threshold 300
  268. :metric [{:name "bytes_out"
  269. :value_threshold 4096
  270. :title "Bytes Sent"}
  271. {:name "bytes_in"
  272. :value_threshold 4096
  273. :title "Bytes Received"}
  274. {:name "pkts_in"
  275. :value_threshold 256
  276. :title "Packets Received"}
  277. {:name "pkts_out"
  278. :value_threshold 256
  279. :title "Packets Sent"}]}
  280. ;; Different than 2.5.x default since the old config made no sense
  281. {:collect_every 1800
  282. :time_threshold 3600
  283. :metric {:name "disk_total"
  284. :value_threshold 1.0
  285. :title "Total Disk Space"}}
  ;; Disk usage: free space and the fullest partition.
  286. {:collect_every 40
  287. :time_threshold 180
  288. :metric [{:name "disk_free"
  289. :value_threshold 1.0
  290. :title "Disk Space Available"}
  291. {:name "part_max_used"
  292. :value_threshold 1.0
  293. :title "Maximum Disk Space Used"}]}]})
  (defn nagios-monitor
    "Monitor the ganglia web frontend using nagios.

     Options:
       :url                  path checked over HTTP, defaults to \"/ganglia\"
       :service_description  nagios service description, defaults to
                             \"Ganglia Web Frontend\""
    [session & {:keys [url service_description]
                :or {service_description "Ganglia Web Frontend"}
                :as options}]
    (nagios-config/monitor-http
     session
     ;; Honour a caller-supplied :url; it used to be silently ignored in
     ;; favour of the hard-coded default.
     :url (or url "/ganglia")
     :service_description service_description))
  (defn check-ganglia-script
    "Install the check_ganglia.py nagios plugin from the
     crate/ganglia/check_ganglia.py template (mode 0755) and register a
     `check_ganglia` nagios command that calls it with the host name plus
     metric, warning and critical arguments. Returns the updated session."
    [session]
    (let [with-plugin (remote-file/remote-file
                       session
                       "/usr/lib/nagios/plugins/check_ganglia.py"
                       :template "crate/ganglia/check_ganglia.py"
                       :mode "0755")]
      (nagios-config/command
       with-plugin
       :command_name "check_ganglia"
       :command_line
       "$USER1$/check_ganglia.py -h $HOSTNAME$ -m $ARG1$ -w $ARG2$ -c $ARG3$")))
  (defn nagios-monitor-metric
    "Register a nagios service that checks a ganglia `metric` through the
     check_ganglia command, using the `warn` and `critical` thresholds.

     Options:
       :service_description  defaults to the metric formatted as a string
       :servicegroups        defaults to [:ganglia-metrics]"
    [session metric warn critical
     & {:keys [service_description servicegroups]
        :or {servicegroups [:ganglia-metrics]}}]
    (let [description (if service_description
                        service_description
                        (format "%s" metric))
          check-command (format "check_ganglia!%s!%s!%s" metric warn critical)]
      (nagios-config/service
       session
       {:service_description description
        :servicegroups servicegroups
        :check_command check-command})))
  (defn ganglia-master
    "Set up a ganglia master: install the server packages, write
     gmetad.conf with a single \"localhost\" data source, install the
     monitor package, write gmond.conf from `default-metrics` pointing at
     `master-group`, and restart apache2."
    [req master-group]
    (let [local-sources {"localhost" {:hosts ["localhost"]}}]
      (-> req
          (install)
          (configure :data_sources local-sources)
          (monitor)
          (metrics master-group default-metrics)
          (action-service/service "apache2" :action :restart))))
  (defn ganglia-node
    "Set up a monitored node: install the monitor package, then write
     gmond.conf from `default-metrics` pointing at `master-group`."
    [req master-group]
    (let [with-monitor (monitor req)]
      (metrics with-monitor master-group default-metrics)))
  (defn ganglia-finish
    "Restart the ganglia-monitor service."
    [req]
    (-> req
        (action-service/service "ganglia-monitor" :action :restart)))