
/apiary/base.py

https://bitbucket.org/lindenlab/apiary/
#
# $LicenseInfo:firstyear=2010&license=mit$
#
# Copyright (c) 2010, Linden Research, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# $/LicenseInfo$
#
'''
This module contains the base QueenBee, BeeKeeper, and WorkerBee
classes.  It is responsible for all things related to message dispatch
and collection.  It contains nothing specific to the target protocol,
nor to configuration, nor to process management.
'''
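
# Roughly how the pieces below fit together (a sketch based on the queue
# names used in this module; the AMQP broker sits between each arrow):
#
#     QueenBee --'worker-job'--> WorkerBee(s) --'worker-status'--> StatsGatherer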

# *FIX: Some bug exists which leads to the same span being inserted
# multiple times in http.  It probably has to do with the interplay
# between message semantics.

import optparse
import os
import re
import random
import socket
import sys
import tempfile
import cPickle

import MySQLdb
import time
import warnings

from multiprocessing import Value

import amqplib.client_0_8 as amqp

from apiary.tools import stattools
from apiary.tools.counter import Counter
from apiary.tools.childprocess import ChildProcess
from apiary.tools.transport import Transport, ConnectionError
from apiary.tools.debug import debug, traced_func, traced_method

# We use an amqp virtual host called "/apiary".
# A virtual host holds a cluster of exchanges, queues, and bindings.
# We use a virtual host for permissions purposes (user apiary has access
# to everything in /apiary).
# Exchanges are routers with routing tables.
# Queues are where your messages end up.
# Bindings are rules for routing tables.  We use a "direct" exchange.
# credit: http://blogs.digitar.com/jjww/
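
# For orientation, a minimal sketch of that direct-exchange plumbing with
# amqplib (illustrative only; apiary's real wiring lives in
# apiary.tools.transport, and the exchange/queue/key names here are made up):
#
#     conn = amqp.Connection(host='localhost', userid='apiary',
#                            password='beehonest', virtual_host='/apiary')
#     chan = conn.channel()
#     chan.exchange_declare('sorting_room', 'direct')
#     chan.queue_declare('po_box')
#     chan.queue_bind('po_box', 'sorting_room', routing_key='po_box')
#     chan.basic_publish(amqp.Message('hello'), 'sorting_room',
#                        routing_key='po_box')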

verbose = False


class BeeKeeper(object):
    """Manages the hive, including QueenBee, WorkerBees, and StatsGatherer."""

    def __init__(self, options, arguments):
        self.options = options
        self.arguments = arguments

    def start(self):
        """Run the load test."""

        start_time = time.time()

        workers = []

        for i in xrange(self.options.workers):
            worker = WorkerBee(self.options)
            worker.start()
            workers.append(worker)

        # TODO: consider waiting until workers are ready

        stats_gatherer = StatsGatherer(self.options)
        stats_gatherer.start()

        queen = QueenBee(self.options, self.arguments)
        queen.start()

        # Now wait while the queen does its thing.
        try:
            queen.join()
        except KeyboardInterrupt:
            print "Interrupted, shutting down..."
            queen.terminate()

        print "Waiting for workers to complete jobs and terminate (may take up to %d seconds)..." % self.options.max_ahead

        try:
            # All jobs have been sent to RabbitMQ.  Now tell workers to stop.
            transport = Transport(self.options)
            transport.connect()
            transport.queue('worker-job', clean=False)

            for worker in workers:
                transport.send('worker-job', cPickle.dumps(Message(Message.STOP_WORKER)))
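
            # One STOP_WORKER per worker should suffice: each worker cancels
            # its consumer as soon as it sees one (see WorkerBee.process_job),
            # so with prefetch set to 1 no worker swallows a second sentinel.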

            # Now wait for the workers to get the message.  This may take a
            # few minutes as the QueenBee likes to stay ahead by a bit.

            for worker in workers:
                worker.join()

            # Tell the Stats Gatherer that it's done.
            transport.queue('worker-status', clean=False)
            transport.send('worker-status', cPickle.dumps(Message(Message.STOP_STATS_GATHERER)))

            # Wait for it to finish.
            stats_gatherer.join()

            print "Completed %d jobs in %0.2f seconds." % (queen.jobs_sent.value, time.time() - start_time)
        except KeyboardInterrupt:
            print "Interrupted before shutdown process completed."


class StatsGatherer(ChildProcess):

    def __init__(self, options):
        super(StatsGatherer, self).__init__()

        self._options = options
        self._verbose = options.verbose
        self._tally = {}
        self._tally_time = time.time() + 15.0
        self._worker_count = 0
        self._table_dne_re = re.compile('''500 \(1146, "Table '.*' doesn't exist"\)''')
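        # e.g. this matches worker statuses like (illustrative):
        #     500 (1146, "Table 'test.missing' doesn't exist")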

    def tally(self, msg):
        # aggregate these error codes since we see a lot of them (1062/1064)
        if "Duplicate entry" in msg:
            msg = '501 (1062, "Duplicate entry for key")'
        elif "You have an error in your SQL syntax" in msg:
            msg = '501 (1064, "You have an error in your SQL syntax")'
        elif self._table_dne_re.match(msg):
            msg = '''501 (1146, "Table ___ doesn't exist")'''
        self._tally[msg] = self._tally.get(msg, 0) + 1
        if time.time() > self._tally_time:
            self.print_tally()

    def print_tally(self):
        keys = self._tally.keys()
        keys.sort()
        print
        print "       count - message"
        print "------------ -------------------------------------------"
        for k in keys:
            print ("%12d - %s" % (self._tally[k], k))
        self._tally_time = time.time() + 15.0

    @traced_method
    def worker_status(self, msg):
        debug("received worker status: %s" % msg.body)
        body = msg.body
        message = cPickle.loads(body)
        if message.type == Message.WORKER_NEW:
            self._worker_count += 1
            debug('new-worker: now %d workers.', self._worker_count)
            return
        elif message.type == Message.WORKER_HALTED:
            self._worker_count -= 1
            debug('worker-stopped: now %d workers.', self._worker_count)
            return
        elif message.type == Message.STOP_STATS_GATHERER:
            debug('Stopping stats gatherer.')
            self.print_tally()
            msg.channel.basic_cancel('worker-status')
        elif message.type == Message.JOB_STARTED:
            self.tally("100 Start Job")
        elif message.type == Message.JOB_COMPLETED:
            self.tally("200 OK")
        elif message.type == Message.JOB_ERROR:
            self.tally("500 %s" % message.body)
        else:
            print >> sys.stderr, "Received unknown worker status: %s" % message

    def run_child_process(self):
        t = Transport(self._options)
        t.connect()
        t.usequeue('worker-status')
        if self._verbose > 2:
            print "consuming worker-status"
        t.consume('worker-status', 'worker-status', self.worker_status)
        t.wait()
        t.close()


class QueenBee(ChildProcess):
    """A QueenBee process that distributes sequences of events"""

    def __init__(self, options, arguments):
        super(QueenBee, self).__init__()
        self._options = options
        self._verbose = options.verbose
        self._sequence_file = arguments[0]
        self._time_scale = 1.0 / options.speedup
        self._last_warning = 0
        self.jobs_sent = Value('L', 0)

    def run_child_process(self):
        transport = Transport(self._options)
        transport.connect()
        transport.queue('worker-job', clean=True)

        start_time = time.time()

        sequence_file = open(self._sequence_file, 'rb')

        job_num = 0

        while True:
            try:
                job = cPickle.load(sequence_file)
                job_num += 1

                # Jobs look like this:
                # (job_id, ((time, SQL), (time, SQL), ...))
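                # For example, with illustrative values:
                #     ('client-42', ((0.0, 'BEGIN'), (0.5, 'SELECT 1'), (1.0, 'COMMIT')))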

                # The job is ready to shove onto the wire as is.  However,
                # let's check to make sure we're not falling behind, and
                # throttle sending so as not to overfill the queue.

                if not self._options.asap and len(job[1]) > 0:
                    base_time = job[1][0][0]
                    offset = base_time * self._time_scale - (time.time() - start_time)

                    if offset > self._options.max_ahead:
                        time.sleep(offset - self._options.max_ahead)
                    elif offset < -10.0:
                        if time.time() - self._last_warning > 60:
                            print "WARNING: QueenBee is %0.2f seconds behind." % (-offset)
                            self._last_warning = time.time()

                message = Message(Message.JOB, job)
                message = cPickle.dumps(message)
                transport.send('worker-job', message)
            except EOFError:
                break

        self.jobs_sent.value = job_num


class WorkerBee(ChildProcess):
    """A WorkerBee that processes sequences of events"""

    def __init__(self, options):
        super(WorkerBee, self).__init__()
        self._options = options
        self._asap = options.asap
        self._verbose = options.verbose >= 1
        self._debug = options.debug
        self._no_mysql = options.no_mysql
        self._connect_options = {}
        self._connect_options['host'] = options.mysql_host
        self._connect_options['port'] = options.mysql_port
        self._connect_options['user'] = options.mysql_user
        self._connect_options['passwd'] = options.mysql_passwd
        self._connect_options['db'] = options.mysql_db
        self._start_time = time.time()
        self._time_scale = 1.0 / options.speedup

    def status(self, status, body=None):
        self._transport.send('worker-status', cPickle.dumps(Message(status, body)))

    def process_job(self, msg):
        message = cPickle.loads(msg.body)

        if message.type == Message.STOP_WORKER:
            msg.channel.basic_cancel('worker-job')
            msg.channel.basic_ack(msg.delivery_tag)
        elif message.type == Message.JOB:
            # Jobs look like this:
            # (job_id, ((time, SQL), (time, SQL), ...))

            job_id, tasks = message.body

            self.status(Message.JOB_STARTED)

            if self._no_mysql:
                self.status(Message.JOB_COMPLETED)
                return

            try:
                connection = MySQLdb.connect(**self._connect_options)
            except Exception, e:
                self.status(Message.JOB_ERROR, str(e))
                return

            for timestamp, query in tasks:
                target_time = timestamp * self._time_scale + self._start_time
                offset = target_time - time.time()
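
                # e.g. with --speedup 2.0 the time scale is 0.5, so a query
                # captured 10s into the log comes due 5s after worker start
                # (illustrative numbers).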

                # TODO: warn if falling behind?

                if offset > 0:
                    debug('sleeping %0.4f seconds' % offset)
                    if offset > 120 and self._verbose:
                        print "long wait of %ds for job %s" % (offset, job_id)
                    time.sleep(offset)

                query = query.strip()

                # "Quit" is for compatibility with a bug in genjobs.py.
                # TODO: remove this
                if query and query != "Quit":
                    try:
                        cursor = connection.cursor()
                        rows = cursor.execute(query)
                        if rows:
                            cursor.fetchall()
                        cursor.close()
                    except Exception, e:  # TODO: more restrictive error catching?
                        self.status(Message.JOB_ERROR, "%s" % e)

                        try:
                            cursor.close()
                            connection.close()
                        except:
                            pass

                        msg.channel.basic_ack(msg.delivery_tag)
                        return

            try:
                # Sometimes pt-query-digest neglects to mention the commit.
                cursor.execute('COMMIT;')
            except:
                pass

            try:
                connection.close()
            except:
                pass

            self.status(Message.JOB_COMPLETED)
            msg.channel.basic_ack(msg.delivery_tag)

    def run_child_process(self):
        if not self._debug:
            warnings.filterwarnings('ignore', category=MySQLdb.Warning)

        self._transport = Transport(self._options)
        self._transport.connect()
        self._transport.set_prefetch(1)
        self._transport.usequeue('worker-job')
        self._transport.usequeue('worker-status')

        self.status(Message.WORKER_NEW)
        self._transport.consume('worker-job', 'worker-job', self.process_job, exclusive=False)
        self._transport.wait()
        self.status(Message.WORKER_HALTED)

        debug("worker ended")

        self._transport.close()
        self._transport = None


def clean(options):
    transport = Transport(options)
    transport.connect()
    transport.queue('worker-job')
    transport.queue('worker-status')
    transport.close()


class Message(object):
    WORKER_NEW = 1
    WORKER_HALTED = 2
    STOP_WORKER = 3
    STOP_STATS_GATHERER = 4
    JOB_STARTED = 5
    JOB_COMPLETED = 6
    JOB_ERROR = 7
    JOB = 8

    def __init__(self, type, body=None):
        self.type = type
        self.body = body

    def __str__(self):
        return repr(self)

    def __repr__(self):
        return "Message(%s, %s)" % (self.type, repr(self.body))
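
# Messages cross the wire as cPickle blobs; a doctest-style sketch of the
# round trip:
#
#     >>> cPickle.loads(cPickle.dumps(Message(Message.JOB_ERROR, 'oops')))
#     Message(7, 'oops')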


def add_options(parser):
    parser.add_option('-v', '--verbose',
                      default=0, action='count',
                      help='increase output (0-2 times)')
    parser.add_option('--profile', default=False, action='store_true',
                      help='Print profiling data.  This will impact performance.')
    parser.add_option('--debug', default=False, action='store_true', dest='debug',
                      help='Print debug messages.')
    parser.add_option('--asap',
                      action='store_true', default=False,
                      help='send queries as fast as possible (default: off)')
    parser.add_option('-w', '--workers', metavar='N',
                      default=100, type='int',
                      help='number of worker bee processes (default: 100)')
    parser.add_option('--clean',
                      action='store_true', default=False,
                      help='clean up all queues')
    parser.add_option('--speedup', default=1.0, dest='speedup', type='float',
                      help="Time multiple used when replaying query logs.  2.0 means "
                           "that queries run twice as fast (and the entire run takes "
                           "half the time the capture ran for).")
    parser.add_option('--max-ahead', default=300, type='int', metavar='SECONDS',
                      help='''How many seconds ahead the QueenBee may get in sending
                           jobs to the queue.  Only change this if RabbitMQ runs out
                           of memory.''')

    # Option groups:
    g = optparse.OptionGroup(parser, 'AMQP options')
    g.add_option('--amqp-host',
                 default="localhost", metavar='HOST',
                 help='AMQP server to connect to (default: %default)')
    g.add_option('--amqp-vhost',
                 default="/apiary", metavar='PATH',
                 help='AMQP virtual host to use (default: %default)')
    g.add_option('--amqp-userid',
                 default="apiary", metavar='USER',
                 help='AMQP userid to authenticate as (default: %default)')
    g.add_option('--amqp-password',
                 default="beehonest", metavar='PW',
                 help='AMQP password to authenticate with (default: %default)')
    g.add_option('--amqp-ssl',
                 action='store_true', default=False,
                 help='Enable SSL (default: not enabled)')
    parser.add_option_group(g)

    g = optparse.OptionGroup(parser, 'MySQL options')
    g.add_option('--no-mysql', default=False, dest='no_mysql', action='store_true',
                 help="Don't make MySQL connections.  Return '200 OK' instead.")
    g.add_option('--mysql-host',
                 default="localhost", metavar='HOST',
                 help='MySQL server to connect to (default: %default)')
    g.add_option('--mysql-port',
                 default=3306, type='int', metavar='PORT',
                 help='MySQL port to connect on (default: %default)')
    g.add_option('--mysql-user',
                 default='guest', metavar='USER',
                 help='MySQL user to connect as (default: %default)')
    g.add_option('--mysql-passwd',
                 default='', metavar='PW',
                 help='MySQL password to connect with (default: %default)')
    g.add_option('--mysql-db',
                 default='test', metavar='DB',
                 help='MySQL database to connect to (default: %default)')
    parser.add_option_group(g)
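
# A minimal driver sketch (hypothetical; apiary's real entry point lives
# elsewhere in the package) showing how these options and classes are
# meant to be wired together:
#
#     parser = optparse.OptionParser(usage='%prog [options] sequence_file')
#     add_options(parser)
#     options, arguments = parser.parse_args()
#     if options.clean:
#         clean(options)
#     else:
#         BeeKeeper(options, arguments).start()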