PageRenderTime 27ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/fluent/plugin/in_td_monitor_agent.rb

https://gitlab.com/CORP-RESELLER/fluent-plugin-td-monitoring
Ruby | 485 lines | 425 code | 44 blank | 16 comment | 10 complexity | 7ce3461dbe2326fd4d1d05d77d5d3205 MD5 | raw file
  1. module Fluent
  2. require_relative 'tdms_ext_fluentd'
  3. require_relative 'out_td_counter'
  4. class TDMonitorAgentInput < Input
  # Plugin version; sent in the User-Agent string and in every request body.
  VERSION = "0.2.1"

  Plugin.register_input('td_monitor_agent', self)

  # API key placed in the Authorization header (marked secret in the config).
  config_param :apikey, :string, secret: true
  # Interval between metric submissions.
  config_param :emit_interval, :time, default: 60
  # Base URL of the monitoring service.
  config_param :endpoint, :string, default: 'https://api.treasuredata.com:443'
  # Optional proxy handed to the HTTP client.
  config_param :http_proxy, :string, default: nil
  # Explicit agent id; when nil the id is derived at configure time.
  config_param :instance_id, :string, default: nil
  # Number of send attempts per timer tick.
  config_param :retry_limit, :integer, default: 5
  # HTTP client timeouts.
  config_param :connect_timeout, :integer, default: 10
  config_param :read_timeout, :integer, default: 10
  config_param :send_timeout, :integer, default: 10
  # When true, CPU/disk/memory/bandwidth metrics are not collected.
  config_param :disable_node_info, :bool, default: true

  # Fall back to the global logger when the base class offers no `log` method.
  define_method(:log) { $log } unless method_defined?(:log)
  # Runs the parent initializer, then loads the libraries the plugin relies
  # on (json, ohai, httpclient), in that order.
  def initialize
    super
    %w(json ohai httpclient).each { |library| require library }
  end
  # Timer that ticks with a fixed period of 10 and only invokes the callback
  # once every `interval / 10` ticks, so that a long emit interval does not
  # translate into a long shutdown time.
  class TimerWatcher < Coolio::TimerWatcher
    # @param interval requested callback interval (divided by 10 to get the
    #   number of ticks between callback invocations)
    # @param repeat forwarded unchanged to the parent timer
    # @param log logger used to report errors raised while ticking
    # @param callback block invoked when enough ticks have elapsed
    def initialize(interval, repeat, log, &callback)
      @log = log
      @callback = callback
      @call_interval = interval / 10
      @num_call = 0
      # Avoid long shutdown time
      super(10, repeat)
    end

    # Counts one tick; fires the callback and resets the counter once the
    # tick count reaches the threshold. Any StandardError is logged with
    # its backtrace instead of propagating.
    def on_timer
      @num_call += 1
      return if @num_call < @call_interval

      @num_call = 0
      @callback.call
    rescue => error
      @log.error error.to_s
      @log.error_backtrace
    end
  end
  # Applies the configuration through the parent class, then resolves the
  # values reused by every request: the agent id, the MAC address and the
  # path of the CA bundle.
  #
  # @param conf configuration object handed to the parent `configure`
  def configure(conf)
    super
    @agent_id = get_agent_id
    @mac_address = Mac.address
    @ca_file = find_ca_file
    # Use the `log` accessor like every other method of this class instead
    # of the global $log; the class defines `log` when the base lacks it.
    log.warn "crt file not found. Use VERIFY_NONE in SSL context" if @ca_file.nil?
  end
  # Starts the plugin: prepares metric collectors, sends the initial
  # instance information and launches the timer loop in its own thread.
  def start
    Engine.set_tag_path
    @started_at = Time.now.to_i
    @monitor_agent = ExMonitorAgentInput.new

    unless @disable_node_info
      begin
        @cpu_stat = CpuStat.new
        @disk_stat = DiskStat.new(FileBuffer.class_variable_get(:@@buffer_paths).keys)
        @memory_stat = MemoryStat.new
        @bandwidth_stat = BandwidthStat.new(@emit_interval)
      rescue => error
        # A collector could not be set up: run without node metrics.
        @disable_node_info = true
        log.warn "Failed to get system metrics. Set 'disable_node_info' to true: #{error}"
      end
    end

    @counters = collect_counters
    log.warn "Can't register instance information at start" unless register_instance_info

    @loop = Coolio::Loop.new
    @timer = TimerWatcher.new(@emit_interval, true, log, &method(:on_timer))
    @loop.attach(@timer)
    @thread = Thread.new { run }
  end
  # Stops the plugin: detaches every watcher from the event loop, stops the
  # loop and waits for the worker thread to finish.
  def shutdown
    log.info "shutdown td_monitor_agent plugin"
    @loop.watchers.each do |watcher|
      watcher.detach
    end
    @loop.stop
    @thread.join
  end
  # Thread body: runs the event loop and logs (message plus backtrace) any
  # StandardError that escapes from it.
  def run
    begin
      @loop.run
    rescue => error
      log.error "unexpected error", error: error.to_s
      log.error_backtrace
    end
  end
  # Path of the endpoint that receives both registration and metric events.
  EVENT_ENDPOINT_PATH = '/v1/monitoring/start'

  # Timer callback: sends the current metrics, trying up to `@retry_limit`
  # times with a 2 second pause after each failed attempt. A success after
  # at least one failure is reported with a warning; exhausting all
  # attempts is reported as an error.
  def on_timer
    @retry_limit.times do |attempt|
      if send_to_tdms(EVENT_ENDPOINT_PATH, collect_info)
        # attempt > 0 means at least one earlier try failed.
        log.warn "retry succeeded after #{attempt} retry" if attempt > 0
        return
      end
      sleep 2
    end
    log.error "Send instance metrics failed. Try next #{@emit_interval} seconds"
  end
  105. private
  # Locates the CA bundle, probing first `../../../data/ca-bundle.crt`
  # and then `ca-bundle.crt`, both relative to this file's directory.
  #
  # @return [String, nil] expanded path of the first candidate that can be
  #   read, or nil when neither file exists
  def find_ca_file
    here = File.dirname(__FILE__)
    candidates = [
      File.join(here, '..', '..', '..', 'data', 'ca-bundle.crt'),
      File.join(here, 'ca-bundle.crt')
    ]
    candidates.each do |candidate|
      begin
        File.read(candidate)
        return File.expand_path(candidate)
      rescue Errno::ENOENT
        # Missing file: fall through to the next candidate.
      end
    end
    nil
  end
  # Names of the Ohai plugins loaded before reading basic host information.
  BASIC_INFO_PLUGINS = ['os', 'platform', 'hostname']
  # Sends the basic host information merged with the current metrics to the
  # event endpoint.
  #
  # @return whatever `send_to_tdms` returns for the request
  def register_instance_info
    send_to_tdms(EVENT_ENDPOINT_PATH, basic_info.merge(collect_info))
  end
  # Host description (OS, OS version, hostname) read through Ohai. The
  # hash is built on the first call and reused afterwards.
  #
  # @return [Hash] `{'info' => {'os' => ..., 'os_version' => ..., 'hostname' => ...}}`
  def basic_info
    return @basic_info unless @basic_info.nil?

    system = Ohai::System.new
    BASIC_INFO_PLUGINS.each do |plugin_name|
      system.require_plugin(plugin_name)
    end
    details = {
      'os' => system[:platform],
      'os_version' => system[:platform_version],
      'hostname' => system[:fqdn]
    }
    @basic_info = { 'info' => details }
    @basic_info
  end
  # Builds the metrics payload: plugin information always, node data unless
  # node info is disabled, traffic counts when counters exist, plus the
  # basic host information.
  #
  # @return [Hash] the assembled payload
  def collect_info
    payload = { 'plugins' => collect_fluentd_info }
    payload['node_data'] = collect_node_info unless @disable_node_info
    payload['traffic'] = collect_traffic_info unless @counters.empty?
    payload.merge(basic_info)
  end
  # Gathers one sample from each node-level collector.
  #
  # @return [Hash] samples keyed by 'cpu', 'disk', 'memory' and 'bandwidth'
  def collect_node_info
    {
      'cpu' => @cpu_stat.stats,
      'disk' => @disk_stat.stats,
      'memory' => @memory_stat.stats,
      'bandwidth' => @bandwidth_stat.stats
    }
  end
  # Indexes the monitor agent's plugin descriptions by plugin id. The
  # 'plugin_id' entry is removed from each description hash in the process.
  #
  # @return [Hash] plugin id => remaining plugin description
  def collect_fluentd_info
    @monitor_agent.plugins_info_all.each_with_object({}) do |plugin, by_id|
      plugin_id = plugin.delete('plugin_id')
      by_id[plugin_id] = plugin
    end
  end
  # Flushes every counter and sums the byte and record counts per tag.
  # The first hash seen for a tag is stored as-is and later values are
  # added into it.
  #
  # @return [Hash] tag => counts hash holding the summed fields
  def collect_traffic_info
    flushed = @counters.map { |counter| counter.flush_counts }
    flushed.each_with_object({}) do |counts, totals|
      counts.each do |tag, count|
        existing = totals[tag]
        if existing
          existing[Fluent::TDCounterOutput::BYTES_FIELD] += count[Fluent::TDCounterOutput::BYTES_FIELD]
          existing[Fluent::TDCounterOutput::COUNT_FIELD] += count[Fluent::TDCounterOutput::COUNT_FIELD]
        else
          totals[tag] = count
        end
      end
    end
  end
  # Posts `info` to `path` and reports whether the service accepted it.
  # A response whose code does not start with '2', or any StandardError
  # raised while sending, is logged as a warning.
  #
  # @param path [String] endpoint path
  # @param info [Hash] payload handed to `post`
  # @return [Boolean] true on a 2xx response, false otherwise
  def send_to_tdms(path, info)
    #puts JSON.pretty_generate('agent_id' => @agent_id, 'data' => info, 'time' => Time.now.to_i); return true
    response = post(path, info)
    return true if response.code.to_s.start_with?('2')

    log.warn "Get an error response: code = #{response.code}, message = #{response.body}"
    false
  rescue => error
    log.warn "Failed to send metrics: error = #{error.to_s}"
    false
  end
  # Determines the agent id: the configured `instance_id` when present,
  # otherwise the `@config_path` of a live Fluent::Supervisor object (the
  # last one visited), or nil when no supervisor exists.
  #
  # @return [String, nil] the agent id
  def get_agent_id
    return @instance_id unless @instance_id.nil?

    config_path = nil
    ObjectSpace.each_object(Fluent::Supervisor) do |supervisor|
      # TODO: Improve getting id using instance-id or something
      config_path = supervisor.instance_variable_get(:@config_path)
    end
    config_path
  end
  # Finds every live Fluent::TDCounterOutput instance in the process.
  #
  # @return [Array] the counter output objects
  def collect_counters
    ObjectSpace.each_object(Fluent::TDCounterOutput).to_a
  end
  # Sends one JSON request to the monitoring service. The body carries the
  # agent identity, timestamps and version, with `params` itself serialized
  # to JSON and embedded under 'data'.
  #
  # @param path [String] endpoint path appended to the configured endpoint
  # @param params payload to embed (serialized with `to_json`)
  # @return the HTTP client's response object
  def post(path, params = nil)
    http, headers = new_client
    headers['Content-Type'] = 'application/json'
    url = build_endpoint(path)
    envelope = {
      'mac_addr' => @mac_address,
      'agent_id' => @agent_id,
      'started_at' => @started_at,
      'time' => Time.now.to_i,
      'version' => VERSION,
      'data' => params.to_json
    }
    # TODO: Use post_content supports redirect
    http.post(url, envelope.to_json, headers)
  end
  # Joins the configured endpoint and a request path with exactly one "/".
  # The endpoint path used by this plugin already starts with "/", so plain
  # interpolation produced ".../​/v1/..." style URLs with a doubled slash;
  # trailing slashes on the endpoint and leading slashes on the path are
  # therefore stripped before joining.
  #
  # @param path [String] request path, with or without a leading "/"
  # @return [String] the full request URL
  def build_endpoint(path)
    base = @endpoint.to_s.sub(%r{/+\z}, '')
    relative = path.to_s.sub(%r{\A/+}, '')
    "#{base}/#{relative}"
  end
  # Creates an HTTP client configured with the plugin's proxy, timeouts and
  # (for https endpoints) certificate verification, together with the base
  # request headers.
  #
  # @param opts [Hash] accepted but not used by this method
  # @return [Array] `[client, header]`
  def new_client(opts = {})
    client = HTTPClient.new(@http_proxy, "TDMS Agent #{VERSION}")
    client.connect_timeout = @connect_timeout
    client.receive_timeout = @read_timeout
    client.send_timeout = @send_timeout

    if ssl?
      # Verify the peer only when a CA bundle was found.
      mode = OpenSSL::SSL::VERIFY_NONE
      if @ca_file
        client.ssl_config.add_trust_ca(@ca_file)
        mode = OpenSSL::SSL::VERIFY_PEER
      end
      client.ssl_config.verify_mode = mode
    end

    header = {}
    header['Authorization'] = "TD1 #{@apikey}" if @apikey
    header['Date'] = Time.now.rfc2822
    [client, header]
  end
  # @return [Boolean] true when the configured endpoint uses the https scheme
  def ssl?
    URI.parse(@endpoint).scheme == 'https'
  end
  # URL-encodes the string form of `s` using CGI.escape.
  #
  # @param s value to encode (converted with `to_s`)
  # @return [String] the escaped text
  def e(s)
    require 'cgi'
    text = s.to_s
    CGI.escape(text)
  end
  # TODO: Get fluentd's process usage of CPU and Memory

  # Reports CPU usage between consecutive calls, based on the aggregate
  # "cpu" line of a /proc/stat style file, plus the 1-minute load average.
  class CpuStat
    # Counter names in the order they follow the "cpu" label.
    CPU_KEYS = %W(user nice system idle iowait irq sirq)
    # Indexes into CPU_KEYS of the counters that are reported.
    USE_CPU_KEYS = [0, 2]

    # @param stat_path [String] file holding the CPU counters
    # @param loadavg_path [String] file holding the load averages
    def initialize(stat_path = "/proc/stat", loadavg_path = "/proc/loadavg")
      @stat_path = stat_path
      @loadavg_path = loadavg_path
      @stats = cpu_stats
    end

    # Takes a new sample and returns the share of the reported counters in
    # the time elapsed since the previous sample.
    #
    # @return [Hash] 'user' and 'system' as percentages (Float) and
    #   'loadavg1' as read from the load average file
    def stats
      current = cpu_stats
      deltas = @stats.each_with_index.map { |previous, i| current[i] - previous }
      total = deltas.inject(0) { |sum, n| sum + n }
      total = 1 if total.zero? # avoid dividing by zero when nothing changed

      res = {}
      USE_CPU_KEYS.each { |i| res[CPU_KEYS[i]] = deltas[i].to_f / total * 100 }
      @stats = current
      res['loadavg1'] = loadavg_stats
      res
    end

    private

    # Reads the first line of the stat file and returns one integer per
    # entry of CPU_KEYS. The leading "cpu" label is dropped so that index 0
    # really is the user counter; missing fields count as 0.
    def cpu_stats
      fields = File.open(@stat_path) { |f| f.gets }.to_s.split(' ')
      fields.shift
      Array.new(CPU_KEYS.size) { |i| fields[i].to_i }
    end

    # Returns the first field of the load average file as a Float.
    def loadavg_stats
      File.open(@loadavg_path) { |f| f.gets }.to_s.split(' ', 2).first.to_f
    end
  end
  # Reports the usage percentage of the mount points that hold a given set
  # of paths, based on the output of `df -B G -P`.
  class DiskStat
    # @param paths [Array<String>] paths whose mount points are monitored
    def initialize(paths)
      mounts = mount_points
      # compact: select_mount yields nil when df listed no mount at all.
      @targets = paths.map { |path| select_mount(path, mounts) }.compact.sort.uniq
    end

    # @return [Hash] mount point => value of df's second-to-last column
    #   with its trailing character removed, as an Integer
    def stats
      res = {}
      df_rows.each do |columns|
        mount = columns[-1]
        res[mount] = columns[-2].chop.to_i if @targets.include?(mount)
      end
      res
    end

    private

    # Picks the mount point containing `path`: the longest mount that is a
    # directory prefix of the path. Falls back to the first listed mount
    # (nil for an empty list) when none matches, e.g. for a relative path.
    def select_mount(path, mounts)
      containing = mounts.select { |mount| contains?(mount, path) }
      return mounts.first if containing.empty?
      containing.max_by { |mount| mount.length }
    end

    # True when `path` is the mount itself or lies below it. The comparison
    # is done on whole path components so "/var" does not claim "/variable".
    def contains?(mount, path)
      prefix = mount.end_with?('/') ? mount : "#{mount}/"
      path == mount || path.start_with?(prefix)
    end

    # Mount point (last column) of every df data row.
    def mount_points
      df_rows.map { |columns| columns[-1] }
    end

    # Runs df and returns its data rows (header skipped) split into columns;
    # rows with fewer than two columns are ignored.
    def df_rows
      `df -B G -P`.each_line.drop(1).map { |line| line.strip.split(' ') }.select { |columns| columns.size >= 2 }
    end
  end
  # Reports memory usage as a percentage.
  #
  # The previous implementation parsed `free -o`; that option is no longer
  # accepted by current procps releases, which left the result empty. The
  # same figures are now read directly from a /proc/meminfo style file:
  # memory counted as free is MemFree + Buffers + Cached.
  class MemoryStat
    # Entries that are added together as "free" memory.
    FREE_KEYS = %w(MemFree Buffers Cached)

    # @param meminfo_path [String] file with "Key:   value kB" lines
    def initialize(meminfo_path = '/proc/meminfo')
      @meminfo_path = meminfo_path
    end

    # @return [Hash] `{'usage' => Integer percentage}`, or an empty hash
    #   when the file reports no positive MemTotal
    # @raise [SystemCallError] when the meminfo file cannot be read
    def stats
      res = {}
      meminfo = read_meminfo
      total = meminfo['MemTotal'].to_i
      return res unless total > 0

      free = FREE_KEYS.inject(0) { |sum, key| sum + meminfo[key].to_i }
      res['usage'] = ((total - free).to_f / total * 100).to_i
      # Swap usage is intentionally not reported.
      res
    end

    private

    # Parses the meminfo file into a hash of entry name => Integer value.
    def read_meminfo
      File.foreach(@meminfo_path).each_with_object({}) do |line, info|
        key, value = line.split(':', 2)
        info[key.strip] = value.to_i if value
      end
    end
  end
  # bandwidth used ratio in bytes/s
  #
  # Tracks the received + transmitted byte counters of one network
  # interface, read from a /proc/net/dev style file, and reports the
  # difference between consecutive samples divided by the interval.
  class BandwidthStat
    # @param interval sampling interval used as the divisor in #stats
    # @param interface [String] interface name to monitor (exact match)
    # @param dev_path [String] file listing per-interface counters
    def initialize(interval, interface = 'eth0', dev_path = '/proc/net/dev')
      @interval = interval
      @interface = interface
      @dev_path = dev_path
      @bytes_cache = current_total_bytes
    end

    # @return [Hash] `{'ratio' => byte delta since the last sample / interval}`
    def stats
      res = {}
      last_bytes, @bytes_cache = @bytes_cache, current_total_bytes
      res['ratio'] = (@bytes_cache - last_bytes) / @interval
      res
    end

    # Sum of the received-bytes (1st) and transmitted-bytes (9th) counters
    # of the monitored interface. Best effort: returns 0 when the interface
    # is not listed or the file cannot be read.
    #
    # @return [Integer]
    def current_total_bytes
      File.foreach(@dev_path) do |line|
        name, counters = line.split(':', 2)
        # Header lines have no ':'; compare the whole name so that e.g.
        # "veth0" is not mistaken for "eth0".
        next unless counters && name.strip == @interface
        fields = counters.split(' ')
        return fields[0].to_i + fields[8].to_i
      end
      0
    rescue
      0
    end
  end
  # from macaddr gem
  #
  # Discovers a MAC address of the host by running the usual system
  # commands and scanning their output.
  module Mac
    # Matches a MAC address (six hex pairs separated by ':' or '-') together
    # with the single delimiting character on each side, if any.
    RE = %r/(?:[^:\-]|\A)(?:[0-9A-F][0-9A-F][:\-]){5}[0-9A-F][0-9A-F](?:[^:\-]|\Z)/io

    class << self
      ##
      # Accessor for the system's first MAC address, requires a call to #address
      # first
      attr_accessor "mac_address"

      ##
      # Discovers and returns the system's MAC addresses. Returns the first
      # MAC address, and includes an accessor #list for the remaining addresses:
      #
      #   Mac.address      # => first address
      #   Mac.address.list # => all addresses
      #
      # The result is cached after the first successful call.
      # Raises RuntimeError when no command produced output or when the
      # output holds no MAC address.
      def address
        return @mac_address if defined?(@mac_address) && @mac_address

        cmds = '/sbin/ifconfig', '/bin/ifconfig', 'ifconfig', 'ipconfig /all', 'cat /sys/class/net/*/address'
        null = test(?e, '/dev/null') ? '/dev/null' : 'NUL'

        output = nil
        cmds.each do |cmd|
          stdout = capture_stdout(cmd, null)
          next unless stdout && stdout.size > 0
          output = stdout
          break
        end
        raise "all of #{ cmds.join ' ' } failed" unless output

        @mac_address = parse(output)
      end

      # Extracts the MAC addresses found in `output`.
      #
      # Returns the first address as a String that also responds to #list
      # (all addresses found). Raises RuntimeError when none is present.
      def parse(output)
        lines = output.split(/\n/)

        candidates = lines.select{|line| line =~ RE}
        raise 'no mac address candidates' unless candidates.first
        candidates.map!{|c| c[RE].strip}

        maddr = candidates.first
        raise 'no mac address found' unless maddr

        maddr.strip!
        maddr.instance_eval{ @list = candidates; def list() @list end }
        maddr
      end

      private

      # Runs `cmd` and returns what it wrote to stdout, or nil when the
      # command could not be run. stderr is sent to `null`. The output is
      # read before waiting for the child so a large output cannot block
      # it on a full pipe, and both pipe ends are always closed.
      def capture_stdout(cmd, null)
        reader, writer = IO.pipe
        pid = spawn(cmd, :out => writer, :err => null)
        writer.close
        stdout = reader.read
        ::Process.waitpid(pid)
        stdout
      rescue
        # go to next command!
        nil
      ensure
        [reader, writer].each { |io| io.close if io && !io.closed? }
      end
    end
  end
  409. end
  410. end