PageRenderTime 29ms CodeModel.GetById 31ms RepoModel.GetById 1ms app.codeStats 0ms

/src/clustrz/core.clj

https://github.com/dirtyvagabond/clustrz
Clojure | 398 lines | 296 code | 66 blank | 36 comment | 10 complexity | 15096ed4ecb156201c1c99c62f05f1d4 MD5 | raw file
  1. ;;
  2. ;; Provides a flexible library for managing remote Linux environments.
  3. ;; Treats each environment as a simple "node", and supports grouping
  4. ;; nodes into "clusters".
  5. ;;
  6. ;; Higher order functions can be written to perform remote actions on
  7. ;; a node, and then those functions may be run against individual nodes,
  8. ;; or against all nodes in a cluster in parallel.
  9. ;;
  10. ;; You must have your public key on each node that you wish to work with.
  11. ;;
  12. (ns clustrz.core
  13. (:use [clojure.string :only (split split-lines blank? join trim trim-newline)])
  14. (:use [clojure.java.shell :only (sh)])
  15. (:use [clojure.pprint :only (pprint)])
  16. (:use [clojure.contrib.duck-streams :only (make-parents)])
  17. (:use [clojure.contrib.java-utils :only (file delete-file)])
  18. (:require [clojure.contrib.jmx :as jmx])
  19. (:gen-class))
;;; Defines where clustrz stores certain data on nodes.
;; Base directory; the two paths below are built by appending to it.
(def *home* "~/.clustrz/")
;; Directory used by kvs-file to build the per-key file path.
(def *kvs-dir* (str *home* "kvs/"))
;; File that log-at appends messages to.
(def *log* (str *home* "node.log"))
  24. (defn now [] (java.util.Date.))
  25. ;;; TODO: consider security; "bash injection" attacks,
  26. ;;; e.g., passing hostile escaped bash code to things like assoc-at.
  27. (defn ssh-exec
  28. "Runs cmd on the specified node and returns a hashmap of the
  29. remote result of running the command."
  30. [{:keys [host user]} cmd]
  31. (sh "ssh" (str user "@" host) cmd))
  32. (defn shout
  33. "Runs cmd on node and returns the textual result that went to stdout.
  34. Throws an exception if the exit status of the command run on the node
  35. was non-zero."
  36. [node cmd]
  37. (let [{:keys [exit out err]} (ssh-exec node cmd)]
  38. (if (= 0 exit)
  39. (trim out)
  40. (throw (Exception. (str "shout error: exit=" exit ", err=\"" (trim-newline err) "\""))))))
  41. (defn ps-map
  42. "Utility function to parse the textual line output of our custom ps command.
  43. Returns a hashmap representing the remote process data parsed from the line."
  44. [line]
  45. (let [[pid time pctcpu cmd] (split line #"\|")]
  46. {:pid (Long/parseLong (trim pid))
  47. :time (trim time)
  48. :pctcpu (Float/parseFloat (trim pctcpu))
  49. :cmd cmd}))
  50. (defn ps
  51. "Fetches data about running processes at node. Returns a list of hashmaps. Each
  52. hashmap in the list represents a running process. Keys of the map include:
  53. :pid
  54. :time
  55. :pctcpu
  56. :cmd"
  57. [node]
  58. (let [cmd (str "ps --no-header -u " (:user node) " -o \"%p|%x|%C|%a\"")
  59. lines (split-lines (shout node cmd))]
  60. (map #(ps-map %) lines)))
  61. (defn mkdir-at
  62. "Creates the specified path at node. All subdirectories will be created if
  63. the don't already exist."
  64. [node path]
  65. (ssh-exec node (str "mkdir -p " path)))
  66. (defn delete-file-at
  67. "Deletes the specified file at node."
  68. [node file]
  69. (ssh-exec node (str "rm " file)))
  70. (defn last-lines
  71. "Returns the last n lines from file at node."
  72. [node file n]
  73. (shout node (str "tail -" n " " file)))
  74. (defn last-line
  75. "Returns the last line from file at node."
  76. [node file]
  77. (last-lines node file 1))
  78. ;; TODO: accidentally trying (execs f a-node) results in opaque error.
  79. ;; should i have just one exec, that asks (seq? nodes) ?
  80. ;; or at least put a precondition on execs that nodes be a seq?
  81. (defn exec
  82. "Runs f against node, and wraps the result in a hashmap that contains
  83. extra information about the run. The result of f itself will be at the
  84. key :out.
  85. Keys in the returned hashmap include:
  86. :out the result of running f on node
  87. :host the node's hostname
  88. :time how long the run took"
  89. [f node]
  90. (let [start (System/currentTimeMillis)
  91. out (trim (f node))
  92. t (- (System/currentTimeMillis) start)]
  93. {:out out
  94. :host (node :host)
  95. :time t}))
  96. (defn execs
  97. "Runs f against all nodes in parallel. Returns a sequence of results,
  98. where each element in the sequence is a result from running f on one
  99. of the nodes."
  100. [f nodes]
  101. (doall (apply pcalls (map #(partial f %) nodes))))
  102. (defn uptime-at
  103. "Returns the raw uptime string from node."
  104. [node]
  105. (shout node "uptime"))
  106. (defn nice-report-str
  107. [hashmaps]
  108. (join "\n" (map #(str (:host %) ": " (:out %)) hashmaps)))
  109. (defn nice-seq [thing]
  110. (if (seq? thing) thing (list thing)))
  111. ;;TODO: is it goofy to transparently treat a node as a cluster?
  112. ;; this means that, e.g., ($ f a-single-node) will return
  113. ;; a sequence, so the caller will need to call first
  114. ;; to get just the result. But I don't know if I like having
  115. ;; one function that expects a single node, and another
  116. ;; function that expects nodes, like the exec and execs
  117. ;; functions... this can get tedious when writing calling
  118. ;; code.
  119. (defn $
  120. "Runs f across all nodes in parallel. Each result is wrapped in
  121. a hashmap per our custom exec function, and returned in a sequence."
  122. [f nodes]
  123. (doall (apply pcalls (map #(partial exec f %) (nice-seq nodes)))))
  124. (defn report [fn nodes]
  125. (nice-report-str ($ fn nodes)))
  126. (defn tmp-file []
  127. (str "/tmp/clustrz_tmp_" (java.util.UUID/randomUUID)))
  128. (defn copy-to
  129. "Copies local-file to the specified host destination, copying it
  130. to the file path specified by dest-file."
  131. [local-file {:keys [host user]} dest-file]
  132. (sh "scp" local-file (str user "@" host ":" dest-file)))
  133. (defn copy-files-to
  134. "Copies local files to the specified destination folder on the
  135. specified remote host."
  136. [files {:keys [host user]} dest-path]
  137. (let [dest (str user "@" host ":" dest-path)]
  138. (apply sh (flatten ["scp" files dest]))))
  139. (defn spit-at
  140. "Puts the textual representation of val in dest-file at node.
  141. The file will be overwritten if it already exists."
  142. [node dest-file val]
  143. (let [tmp-local-file (tmp-file)]
  144. (spit tmp-local-file (str val))
  145. (let [res (copy-to tmp-local-file node dest-file)]
  146. (delete-file tmp-local-file)
  147. res)))
  148. (defn append-spit-at
  149. "Appends the textual representation of s to dest-file at node."
  150. [node dest-file s]
  151. (let [tmp-dest-file (tmp-file)]
  152. (spit-at node tmp-dest-file s)
  153. (ssh-exec node (str "cat " tmp-dest-file " >> " dest-file "; rm " tmp-dest-file))))
  154. (defn slurp-at [node file]
  155. (shout node "cat " file))
  156. (defn kvs-file [key]
  157. (str *kvs-dir* key))
  158. (defn assoc-at
  159. "Associates val with key, at node. val can be any Clojure object."
  160. [node key val]
  161. ;;; OPTIMIZE: mkdir is only needed once per node, and only if the dir isn't there. how to track?
  162. (mkdir-at node *kvs-dir*)
  163. (spit-at node (kvs-file key) (with-out-str (pr val))))
  164. (defn get-at
  165. ([node key not-found]
  166. "Returns the object associated with key at node, or not-found if none."
  167. (let [out (:out (ssh-exec node (str "cat " (kvs-file key))))]
  168. (if (= 0 (.length out))
  169. not-found
  170. (read-string out))))
  171. ([node key]
  172. "Returns the object associated with key at node, or nil if none."
  173. (get-at node key nil)))
  174. (defn dissoc-at
  175. "Removes key at node."
  176. [node key]
  177. (delete-file-at node (kvs-file key)))
  178. ;;; TODO: creates a DateFormat each time for thread safety. better way?
  179. (defn bash-time
  180. "Converts a bash time string to a java Date.
  181. Example input, t: 'Fri Dec 3 02:51:12 PST 2010'"
  182. [t]
  183. (let [df (java.text.SimpleDateFormat. "EEE MMM d HH:mm:ss z yyyyy")]
  184. (.parse df t)))
  185. (defn slurp-at
  186. [node file]
  187. "Returns the contents of file at node."
  188. (shout node (str "cat " file)))
  189. (defn log-at
  190. [node msg]
  191. "Appends msg to the central log stored at node."
  192. (append-spit-at node *log* (str (now) ": " msg "\n")))
  193. (defn log [msg]
  194. (println (str (now) ": " msg)))
  195. (defn log2 [node msg]
  196. (log-at node msg)
  197. (log (str (node :host) ": " msg)))
  198. (defn up?
  199. [{:keys [user host pid] :as node}]
  200. "Returns true if and only if the specified remote process is
  201. running."
  202. (let [out (shout node (str "ps --no-header -p " pid))]
  203. (not (blank? out))))
  204. (def down?
  205. (complement up?))
  206. (defn java?
  207. [proc]
  208. "Returns true if and only if the specified remote
  209. process is a Java process."
  210. (not (nil? (re-matches #".*java" (first (split (proc :cmd) #"\s"))))))
  211. (defn clojure?
  212. [proc]
  213. "Returns true if and only if the specified remote process is a
  214. Clojure process."
  215. (not (nil? (re-matches #".* clojure\.main .*" (proc :cmd)))))
  216. (defn chmod-at [node opts file]
  217. (ssh-exec node (str "chmod " opts " " file)))
  218. (defn wget-at
  219. ([node url dest-dir]
  220. "Runs wget at node for the specified url, with pwd set to dest-dir."
  221. (wget-at node url dest-dir ""))
  222. ([node url dest-dir opts]
  223. "Runs wget at node for the specified url, with pwd set to dest-dir.
  224. opts must be valid options as one string, or an empty string."
  225. (ssh-exec node (str "cd " dest-dir "; wget " opts " " url))))
  226. (defn tagger
  227. [node key f]
  228. "Associates key to node, where the value of key is the result of
  229. calling f on node."
  230. (assoc node key (f node)))
  231. (defn >+ [f nodes]
  232. (doall (apply pcalls (map
  233. #(partial tagger % (keyword (:name (meta f))) f)
  234. (nice-seq nodes)))))
  235. ;;
  236. ;; JMX related
  237. ;;
  238. (defn jmx-props [node]
  239. {:host (:host node),
  240. :port (get-in node [:jmx :port]),
  241. :environment {"jmx.remote.credentials" (into-array [(get-in node [:jmx :user])
  242. (get-in node [:jmx :pwd])])}})
  243. (defn jmx-names
  244. [node]
  245. "Returns a sequence of JMX ObjectNames, where each ObjectName
  246. represents a JMX mbean available at node. This can be used to
  247. discover the full set of mbeans available at node."
  248. (into #{}
  249. (jmx/with-connection (jmx-props node)
  250. (jmx/mbean-names "*:*"))))
  251. (defn jmx-type-at
  252. ([node package type]
  253. "Returns the JMX mbean available at node for type,
  254. under package."
  255. (jmx/with-connection (jmx-props node)
  256. (jmx/mbean (str package ":type=" type))))
  257. ([node type]
  258. "Returns the JMX mbean available at node for type,
  259. under the java.lang package."
  260. (jmx-type-at node "java.lang" type)))
  261. (defn start-time-at
  262. [proc]
  263. "Returns a Java Date representing the time that the specified remote
  264. JMX-enabled process was started."
  265. (java.util.Date.
  266. (:StartTime
  267. (jmx-type-at proc "Runtime"))))
  268. (defn os-at
  269. [proc]
  270. "Returns the OperatingSystem mbean data for the JMX-enabled proc."
  271. (jmx-type-at proc "OperatingSystem"))
  272. (defn load-avg-at
  273. [proc]
  274. "Returns the load average for the OS running the JMX-enabled proc,
  275. as a Double."
  276. (:SystemLoadAverage
  277. (os-at proc)))
  278. (defn threading-at
  279. [proc]
  280. "Returns the Threading mbean data for the JMX-enabled proc."
  281. (jmx-type-at proc "Threading"))
  282. (defn thread-count-at [node]
  283. (:ThreadCount (jmx-type-at node "java.lang" "Threading")))
  284. ;;
  285. ;; Factual/Quartz specific
  286. ;;
  287. (defn vote-server? [proc]
  288. (and
  289. (java? proc)
  290. (not (nil? (re-matches #".* quartz.voteserver.rest.VoteServerRestBootstrap .*" (proc :cmd))))))
  291. (def oome-log "/u/apps/PRODUCTION/quartz/shared/bin/oome.log")
  292. (def vot-hosts (map #(str "vot0" %) ["04" "05" "06" "07" "09" "14" "10" "11" "12" "13"]))
  293. (def quartz-props {:user "rails_deploy",
  294. :jmx {:port 8021, :user "monitorRole", :pwd "quartz"}})
  295. ;; Quartz vote servers. Every host has the same properties (except host name).
  296. (def quartz (map #(merge quartz-props {:host %}) vot-hosts))
;; Disabled: the whole macro sits inside a (comment ...) form, so it is
;; never defined. It expands to a doseq over cluster that defs one var
;; per node.
;; NOTE(review): the def name is computed at macro-expansion time from
;; the gensym symbol rather than from each node's :host value --
;; presumably why this is disabled; confirm before re-enabling.
(comment
(defmacro def-hosts [cluster]
(let [node (gensym "node")]
`(doseq [~node ~cluster]
(def ~(symbol (str (:host `~node))) ~node)))))
;; A sample vote server node: the first entry of the quartz cluster.
(def vot (first quartz))
  304. (defn vote-servers-at [node]
  305. (filter vote-server? (ps node)))
  306. ;; TODO: ambiguous if >1 vs is running :-/
  307. (defn pct-cpu-at [node]
  308. (:pctcpu
  309. (first (vote-servers-at node))))
  310. (defn restart-vs [node]
  311. (shout node "/u/apps/PRODUCTION/quartz/shared/bin/vot_restart.sh"))
  312. (defn new-oome-vs [node oome-date-str]
  313. (log2 node (str "Found new oome: " oome-date-str))
  314. (restart-vs node)
  315. (log2 node "Restarted VoteServer")
  316. (assoc-at node :last-seen-oome oome-date-str))
  317. (defn get-last-seen-oome [node]
  318. (bash-time (get-at node :last-seen-oome)))
  319. (defn check-oome [node]
  320. (let [last-oome-str (last-line node oome-log)
  321. last-oome (bash-time last-oome-str)
  322. last-seen-oome (get-last-seen-oome node)]
  323. (if (.after last-oome last-seen-oome)
  324. (do
  325. (new-oome-vs node last-oome-str)
  326. {:new-oome last-oome})
  327. (do
  328. (log2 node "No new oomes")
  329. {:new-oome false}))))
  330. (defn -main []
  331. (log "Checking all vote servers for oomes...")
  332. (execs check-oome quartz))