PageRenderTime 66ms CodeModel.GetById 41ms app.highlight 20ms RepoModel.GetById 2ms app.codeStats 0ms

/apiary/base.py

https://bitbucket.org/lindenlab/apiary/
Python | 466 lines | 388 code | 30 blank | 48 comment | 7 complexity | a3bd99749f483779aaa1d0b4a08dbea7 MD5 | raw file
  1#
  2# $LicenseInfo:firstyear=2010&license=mit$
  3# 
  4# Copyright (c) 2010, Linden Research, Inc.
  5# 
  6# Permission is hereby granted, free of charge, to any person obtaining a copy
  7# of this software and associated documentation files (the "Software"), to deal
  8# in the Software without restriction, including without limitation the rights
  9# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10# copies of the Software, and to permit persons to whom the Software is
 11# furnished to do so, subject to the following conditions:
 12# 
 13# The above copyright notice and this permission notice shall be included in
 14# all copies or substantial portions of the Software.
 15# 
 16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22# THE SOFTWARE.
 23# $/LicenseInfo$
 24#
 25
 26
 27'''
 28This module stores the base classes for the QueenBee, BeeKeeper, and
 29WorkerBee classes.  It is responsible for all things related to message
 30dispatch and collection.  It contains nothing specific to the target
 31protocol, nor to configuration, nor process management.
 32'''
 33
 34# *FIX: Some bug exists which leads to the same span being inserted multiple times in http.
 35# It probably has to do with the interplay between message semantics.
 36
 37import optparse
 38import os
 39import re
 40import random
 41import socket
 42import sys
 43import tempfile
 44import cPickle
 45import MySQLdb
 46import time
 47import warnings
 48from multiprocessing import Value
 49
 50import amqplib.client_0_8 as amqp
 51
 52from apiary.tools import stattools
 53from apiary.tools.counter import Counter
 54from apiary.tools.childprocess import ChildProcess
 55from apiary.tools.transport import Transport, ConnectionError
 56from apiary.tools.debug import debug, traced_func, traced_method
 57
 58# We use an amqp virtual host called "/apiary".
 59# A virtual host holds a cluster of exchanges, queues, and bindings.
 60# We use a virtual host for permissions purposes (user apiary has access to everything in /apiary)
 61# Exchanges are routers with routing tables.  
 62# Queues are where your messages end up.
 63# Bindings are rules for routing tables.  We use a "direct" exchange.
 64
 65# credit: http://blogs.digitar.com/jjww/
 66
# Module-level verbosity flag.  NOTE(review): nothing in this module reads it —
# per-process verbosity comes from options.verbose; possibly used by importers.
verbose = False
 68
 69class BeeKeeper(object):
 70    """Manages the hive, including QueenBee, WorkerBees, and StatsGatherer."""
 71    
 72    def __init__(self, options, arguments):
 73        self.options = options
 74        self.arguments = arguments
 75    
 76    def start(self):
 77        """Run the load test."""
 78        
 79        start_time = time.time()
 80        
 81        workers = []
 82        
 83        for i in xrange(self.options.workers):
 84            worker = WorkerBee(self.options)
 85            worker.start()
 86            workers.append(worker)
 87        
 88        # TODO: consider waiting until workers are ready
 89        
 90        stats_gatherer = StatsGatherer(self.options)
 91        stats_gatherer.start()
 92        
 93        queen = QueenBee(self.options, self.arguments)
 94        queen.start()
 95        
 96        # Now wait while the queen does its thing.
 97        try:
 98            queen.join()
 99        except KeyboardInterrupt:
100            print "Interrupted, shutting down..."
101            queen.terminate()
102        
103        print "Waiting for workers to complete jobs and terminate (may take up to %d seconds)..." % self.options.max_ahead
104        
105        try:
106            # All jobs have been sent to rabbitMQ.  Now tell workers to stop.
107            transport = Transport(self.options)
108            transport.connect()
109            transport.queue('worker-job', clean=False)
110            
111            for worker in workers:
112                transport.send('worker-job', cPickle.dumps(Message(Message.STOP_WORKER)))
113            
114            # Now wait for the workers to get the message.  This may take a few 
115            # minutes as the QueenBee likes to stay ahead by a bit.
116            
117            for worker in workers:
118                worker.join()
119            
120            # Tell the Stats Gatherer that it's done.
121            transport.queue('worker-status', clean=False)
122            transport.send('worker-status', cPickle.dumps(Message(Message.STOP_STATS_GATHERER)))
123            
124            # Wait for it to finish.
125            stats_gatherer.join()
126            
127            print "Completed %d jobs in %0.2f seconds." % (queen.jobs_sent.value, time.time() - start_time)
128        except KeyboardInterrupt:
129            print "Interrupted before shutdown process completed."
130    
131class StatsGatherer(ChildProcess):
132
133    def __init__(self, options):
134        super(StatsGatherer, self).__init__()
135        
136        self._options = options
137        self._verbose = options.verbose
138        self._tally = {}
139        self._tally_time = time.time() + 15.0
140        self._worker_count = 0
141        self._table_dne_re = re.compile('''500 \(1146, "Table '.*' doesn't exist"\)''')
142    
143    def tally(self, msg):
144        # aggregate these error codes since we see a lot of them (1062/1064)
145        if "Duplicate entry" in msg:
146            msg = '501 (1062, "Duplicate entry for key")'
147        elif "You have an error in your SQL syntax" in msg:
148            msg = '501 (1064, "You have an error in your SQL syntax")'
149        elif self._table_dne_re.match(msg):
150            msg = '''501 (1146, "Table ___ doesn't exist")'''
151        self._tally[msg] = self._tally.get(msg, 0) + 1
152        if time.time() > self._tally_time:
153            self.print_tally()
154
155    
156    def print_tally(self):
157        keys = self._tally.keys()
158        keys.sort()
159        print
160        print "       count - message"
161        print "------------   -------------------------------------------"
162        for k in keys:
163            print ("%12d - %s" % (self._tally[k], k))
164        self._tally_time = time.time() + 15.0    
165    
166    @traced_method
167    def worker_status(self, msg):
168        debug("received worker status: %s" % msg.body)
169        body = msg.body
170        message = cPickle.loads(body)
171        if message.type == Message.WORKER_NEW:
172            self._worker_count += 1
173            debug('new-worker: now %d workers.',
174                  self._worker_count)
175            return
176        elif message.type == Message.WORKER_HALTED:
177            self._worker_count -= 1
178            debug('worker-stopped: now %d workers.',
179                  self._worker_count)
180            return
181        elif message.type == Message.STOP_STATS_GATHERER:
182            debug('Stopping stats gatherer.')
183            self.print_tally()
184            msg.channel.basic_cancel('worker-status')
185        elif message.type == Message.JOB_STARTED:
186            self.tally("100 Start Job")
187        elif message.type == Message.JOB_COMPLETED:
188            self.tally("200 OK")
189        elif message.type == Message.JOB_ERROR:
190            self.tally("500 %s" % message.body)
191        else:
192            print >> sys.stderr, "Received unknown worker status: %s" % message
193    
194    def run_child_process(self):
195        t = Transport(self._options)
196        t.connect()
197        t.usequeue('worker-status')
198        if self._verbose > 2:
199            print "consuming worker-status"
200        t.consume('worker-status', 'worker-status', self.worker_status)
201        t.wait()
202        t.close()
203
204        
205
206class QueenBee(ChildProcess):
207    """A QueenBee process that distributes sequences of events"""
208    
209    def __init__(self, options, arguments):
210        super(QueenBee, self).__init__()
211
212        self._options = options
213        self._verbose = options.verbose
214        self._sequence_file = arguments[0]
215        self._time_scale = 1.0 / options.speedup
216        self._last_warning = 0
217        self.jobs_sent = Value('L', 0)
218    
219    def run_child_process(self):
220        transport = Transport(self._options)
221        transport.connect()
222        transport.queue('worker-job', clean=True)
223        
224        start_time = time.time()
225        
226        sequence_file = open(self._sequence_file, 'rb')
227
228        job_num = 0
229
230        while True:
231            try:
232                job = cPickle.load(sequence_file)
233                job_num += 1
234                
235                # Jobs look like this:
236                # (job_id, ((time, SQL), (time, SQL), ...))
237                
238                # The job is ready to shove onto the wire as is.  However, 
239                # let's check to make sure we're not falling behind, and 
240                # throttle sending so as not to overfill the queue.
241                
242                if not self._options.asap and len(job[1]) > 0:
243                    base_time = job[1][0][0]
244                    offset = base_time * self._time_scale - (time.time() - start_time)
245                    
246                    if offset > self._options.max_ahead:
247                        time.sleep(offset - self._options.max_ahead)
248                    elif offset < -10.0:
249                        if time.time() - self._last_warning > 60:
250                            print "WARNING: Queenbee is %0.2f seconds behind." % (-offset)
251                            self._last_warning = time.time()
252                
253                message = Message(Message.JOB, job)
254                message = cPickle.dumps(message)
255                transport.send('worker-job', message)
256            except EOFError:
257                break
258        
259        self.jobs_sent.value = job_num
260
261class WorkerBee(ChildProcess):
262    """A WorkerBee that processes a sequences of events"""
263    
264    def __init__(self, options):
265        super(WorkerBee, self).__init__()
266        
267        self._options = options
268        self._asap = options.asap
269        self._verbose = options.verbose >= 1
270        self._debug = options.debug
271        self._no_mysql = options.no_mysql
272        self._connect_options = {}
273        self._connect_options['host'] = options.mysql_host
274        self._connect_options['port'] = options.mysql_port
275        self._connect_options['user'] = options.mysql_user
276        self._connect_options['passwd'] = options.mysql_passwd
277        self._connect_options['db'] = options.mysql_db
278        self._start_time = time.time()
279        self._time_scale = 1.0 / options.speedup
280    
281    def status(self, status, body=None):
282        self._transport.send('worker-status', cPickle.dumps(Message(status, body)))
283    
284    def process_job(self, msg):
285        message = cPickle.loads(msg.body)
286        
287        if message.type == Message.STOP_WORKER:
288            msg.channel.basic_cancel('worker-job')
289            msg.channel.basic_ack(msg.delivery_tag)
290        elif message.type == Message.JOB:
291            # Jobs look like this:
292            # (job_id, ((time, SQL), (time, SQL), ...))
293            
294            job_id, tasks = message.body
295            
296            self.status(Message.JOB_STARTED)
297            
298            if self._no_mysql:
299                self.status(Message.JOB_COMPLETED)
300                return
301
302            try:
303                connection = MySQLdb.connect(**self._connect_options)
304            except Exception, e:
305                self.status(Message.JOB_ERROR, str(e))
306                return
307            
308            for timestamp, query in tasks:
309                target_time = timestamp * self._time_scale + self._start_time
310                offset = target_time - time.time()
311                
312                # TODO: warn if falling behind?
313                
314                if offset > 0:
315                    #if self._verbose:
316                    debug('sleeping %0.4f seconds' % offset)
317                    if offset > 120 and self._verbose:
318                        print "long wait of %ds for job %s" % (offset, job_id)
319                    time.sleep(offset)
320                
321                query = query.strip()
322                if query and query != "Quit": # "Quit" is for compatibility with a bug in genjobs.py.  TODO: remove this
323                    try:
324                        cursor = connection.cursor()
325                        rows = cursor.execute(query)
326                        if rows:
327                            cursor.fetchall()
328                        cursor.close()
329                    except Exception, e: # TODO: more restrictive error catching?
330                        self.status(Message.JOB_ERROR, "%s" % e)
331                        
332                        try:
333                            cursor.close()
334                            connection.close()
335                        except:
336                            pass
337                        
338                        msg.channel.basic_ack(msg.delivery_tag)
339                        return
340            
341            try:
342                # Sometimes pt-query-digest neglects to mention the commit.
343                cursor.execute('COMMIT;')
344            except:
345                pass
346            
347            try:
348                connection.close()
349            except:
350                pass
351            
352            self.status(Message.JOB_COMPLETED)
353            msg.channel.basic_ack(msg.delivery_tag)
354    
355    def run_child_process(self):
356        if not self._debug:
357            warnings.filterwarnings('ignore', category=MySQLdb.Warning)
358    
359        self._transport = Transport(self._options)
360        self._transport.connect()
361        self._transport.set_prefetch(1)
362        self._transport.usequeue('worker-job')
363        self._transport.usequeue('worker-status')
364        self.status(Message.WORKER_NEW)
365
366        self._transport.consume('worker-job', 'worker-job', self.process_job, exclusive=False)
367        self._transport.wait()
368        self.status(Message.WORKER_HALTED)
369        debug("worker ended")
370        self._transport.close()
371        self._transport = None
372
373
def clean(options):
    """Reset the apiary queues by redeclaring them, then disconnect."""
    transport = Transport(options)
    transport.connect()
    for queue_name in ('worker-job', 'worker-status'):
        transport.queue(queue_name)
    transport.close()
380
381
class Message (object):
    """A tiny tagged envelope passed between apiary processes.

    `type` is one of the class-level constants below; `body` is an
    optional, type-specific payload.
    """

    WORKER_NEW = 1
    WORKER_HALTED = 2
    STOP_WORKER = 3
    STOP_STATS_GATHERER = 4
    JOB_STARTED = 5
    JOB_COMPLETED = 6
    JOB_ERROR = 7
    JOB = 8

    def __init__(self, type, body=None):
        self.type = type
        self.body = body

    def __repr__(self):
        return "Message(%s, %s)" % (self.type, repr(self.body))

    def __str__(self):
        # Delegate so str() and repr() always agree.
        return repr(self)
401
402
def add_options(parser):
    """Install apiary's command-line options on an optparse parser.

    Adds the general load-test options plus two option groups
    ('AMQP options' and 'MySQL options').  Mutates `parser` in place.
    """
    parser.add_option('-v', '--verbose',
                        default=0, action='count',
                        # BUG FIX: help string was missing its closing paren.
                        help='increase output (0~2 times)')
    parser.add_option('--profile', default=False, action='store_true',
                      help='Print profiling data.  This will impact performance.')
    parser.add_option('--debug', default=False, action='store_true', dest='debug',
                      help='Print debug messages.')
    parser.add_option('--asap',
                      action='store_true', default=False,
                      help='send queries as fast as possible (default: off)')
    parser.add_option('-w', '--workers', metavar='N',
                      default=100, type='int',
                      help='number of worker bee processes (default: 100)')
    parser.add_option('--clean',
                      action='store_true', default=False,
                      help='clean up all queues')
    parser.add_option('--speedup', default=1.0, dest='speedup', type='float',
                      help="Time multiple used when replaying query logs.  2.0 means "
                           "that queries run twice as fast (and the entire run takes "
                           "half the time the capture ran for).")
    parser.add_option('--max-ahead', default=300, type='int', metavar='SECONDS',
                      help='''How many seconds ahead the QueenBee may get in sending
                           jobs to the queue.  Only change this if RabbitMQ runs out
                           of memory.''')

    # Option groups:
    g = optparse.OptionGroup(parser, 'AMQP options')
    g.add_option('--amqp-host',
                      default="localhost", metavar='HOST',
                      help='AMQP server to connect to (default: %default)')
    g.add_option('--amqp-vhost',
                      default="/apiary", metavar='PATH',
                      help='AMQP virtual host to use (default: %default)')
    g.add_option('--amqp-userid',
                      default="apiary", metavar='USER',
                      help='AMQP userid to authenticate as (default: %default)')
    g.add_option('--amqp-password',
                      default="beehonest", metavar='PW',
                      help='AMQP password to authenticate with (default: %default)')
    g.add_option('--amqp-ssl',
                      action='store_true', default=False,
                      help='Enable SSL (default: not enabled)')
    parser.add_option_group(g)

    g = optparse.OptionGroup(parser, 'MySQL options')
    g.add_option('--no-mysql', default=False, dest='no_mysql', action='store_true',
                      help="Don't make mysql connections.  Return '200 OK' instead.")
    g.add_option('--mysql-host',
                      default="localhost", metavar='HOST',
                      help='MySQL server to connect to (default: %default)')
    g.add_option('--mysql-port',
                      default=3306, type='int', metavar='PORT',
                      help='MySQL port to connect on (default: %default)')
    g.add_option('--mysql-user',
                      default='guest', metavar='USER',
                      help='MySQL user to connect as (default: %default)')
    g.add_option('--mysql-passwd',
                      default='', metavar='PW',
                      help='MySQL password to connect with (default: %default)')
    g.add_option('--mysql-db',
                      default='test', metavar='DB',
                      help='MySQL database to connect to (default: %default)')
    parser.add_option_group(g)