
/historical/hive.py

https://bitbucket.org/lindenlab/apiary/
#
# $LicenseInfo:firstyear=2010&license=mit$
#
# Copyright (c) 2010, Linden Research, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# $/LicenseInfo$
#

from optparse import OptionParser
import os
import random
import sys
import thread
import threading
import time

import amqplib.client_0_8 as amqp

import stattools


# We use an amqp virtual host called "/hive".
# A virtual host holds a cluster of exchanges, queues, and bindings.
# We use a virtual host for permissions purposes (user hive has access to
# everything in /hive).
# Exchanges are routers with routing tables.
# Queues are where your messages end up.
# Bindings are rules for routing tables.  We use a "direct" exchange.

# credit: http://blogs.digitar.com/jjww/

amqp_host = 'localhost'
amqp_userid = 'hive'
amqp_password = 'resistanceisfutile'
amqp_vhost = '/hive'
amqp_exchange = 'b.direct'
timeout = 10.0

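# Illustrative sketch of the raw amqplib calls behind the concepts above:
# declare a direct exchange, declare and bind a queue, then publish with a
# routing key equal to the queue name.  This helper and its default queue
# name are hypothetical; the Transport class below is the real wrapper.
def _example_direct_exchange(ch, qname='example-queue'):
    ch.exchange_declare(amqp_exchange, 'direct', durable=False, auto_delete=True)
    qname, _, _ = ch.queue_declare(qname, durable=False, auto_delete=False)
    ch.queue_bind(qname, amqp_exchange, qname)
    ch.basic_publish(amqp.Message('hello'), amqp_exchange, qname)

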
class TimeoutError(Exception):
    pass


class Transport(object):
    """A simple message queue-like transport system

    Built on AMQP, hides much of the details of that interface and presents
    a simple set of utilities for sending and receiving messages on named
    queues.

    """

    def __init__(self, options=None):
        self._amqp_host = getattr(options, 'amqp_host', amqp_host)
        self._amqp_vhost = getattr(options, 'amqp_vhost', amqp_vhost)
        self._amqp_userid = getattr(options, 'amqp_userid', amqp_userid)
        self._amqp_password = getattr(options, 'amqp_password', amqp_password)
        self._timeout = getattr(options, 'timeout', timeout)

    def _server_connect(self):
        self._conn = amqp.Connection(
                self._amqp_host, virtual_host=self._amqp_vhost,
                userid=self._amqp_userid, password=self._amqp_password)
        self._ch = self._conn.channel()
        # request active access so we can get, create, delete
        self._ch.access_request('/data', active=True, write=True, read=True)
        # durable=False means the exchange is not persisted across restarts.
        # auto_delete=True means the exchange goes away once nothing is bound
        # to it.  Hardly matters, though, since we manually delete the queues
        # in close().
        self._ch.exchange_declare(amqp_exchange, 'direct', durable=False, auto_delete=True)

    def _server_close(self):
        # best-effort shutdown: ignore errors from already-closed channels
        try:
            self._ch.close()
            self._ch = None
        except:
            pass
        try:
            self._conn.close()
            self._conn = None
        except:
            pass

    # vestigial
    def _server_reconnect(self):
        self._server_close()
        self._server_connect()

    def connect(self):
        self._server_connect()
        self._queues = []

    def close(self):
        for qname in self._queues:
            self._ch.queue_delete(qname)
        self._queues = []
        self._server_close()

    def queue(self, queue='', inControl=True, clean=False):
        queue, _, _ = self._ch.queue_declare(queue, durable=False, auto_delete=False)
        self._ch.queue_bind(queue, amqp_exchange, queue)
        if inControl:
            self._queues.append(queue)
        if clean:
            # we purge the queue when we first initialize it
            #print "purging queue " + queue
            self._ch.queue_purge(queue)
        return queue

    # Same as queue(), but with inControl=False: the queue is not tracked for
    # deletion on close().  (These names could be clearer.)  We are having
    # problems with usequeue, not with queue.
    def usequeue(self, queue, clean=False):
        self.queue(queue, inControl=False, clean=clean)

    # we aren't using this -- should we be?
#    def donequeue(self, queue):
#        self._ch.queue_delete(queue)

    def send(self, queue, data):
        msg = amqp.Message(data)
        self._ch.basic_publish(msg, amqp_exchange, queue)

    def recv(self, queue):
        t = time.time()
        while True:
            # m is a sequence of SQL statements (preprocessed)
            m = self._ch.basic_get(queue, no_ack=True)
            if m is not None:
                return m.body
            # On timeout, return None rather than raising TimeoutError:
            # raising here would disrupt the workers, which treat an empty
            # message as the signal to shut down.  (Alternatively, fix
            # TimeoutError so that it closes the queues first.)
            if (time.time() - t) > self._timeout:
                #print 'Timeout waiting for data on queue ' + queue
                return None

            time.sleep(0.1)

    def consume(self, queue, tag, fn):
        return self._ch.basic_consume(queue, tag,
                            no_ack=True, exclusive=True, callback=fn)

    def cancelconsume(self, tag):
        self._ch.basic_cancel(tag)

    def wait(self):
        while self._ch.callbacks:
            self._ch.wait()


_STATE_WAITING_PARTIAL  = 0
_STATE_WAITING_COMPLETE = 1
_STATE_RUNNING_PARTIAL  = 2
_STATE_RUNNING_COMPLETE = 3

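# Job lifecycle (see the methods below): central_start() enters a job as
# WAITING_PARTIAL; central_end() moves *_PARTIAL to *_COMPLETE;
# worker_start() moves WAITING_* to RUNNING_*; worker_end() removes the
# finished job entirely.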
class JobMinder(object):
    """A check on jobs to run"""

    def __init__(self, options):
        self._options = options
        self._verbose = options.verbose
        self._throttle = options.throttle
        self._lock = threading.Lock()
        self._ok_to_start = threading.Event()
        self._ok_to_start.set()
        self._all_done = threading.Event()
        self._all_done.set()
        self._jobs = { }
        self._counts = [ 0, 0, 0, 0 ]
        self._results = [ ]
        self._running_stats = stattools.StatValue()
        self._waiting_stats = stattools.StatValue()

    def _recompute(self):
        # must have lock!
        #print 'counts:', ','.join(map(str, self._counts))
        running = (self._counts[_STATE_RUNNING_PARTIAL]
                    + self._counts[_STATE_RUNNING_COMPLETE])
        waiting = (self._counts[_STATE_WAITING_PARTIAL]
                    + self._counts[_STATE_WAITING_COMPLETE])

        self._running_stats.sample(running)
        self._waiting_stats.sample(waiting)

        if self._counts[_STATE_RUNNING_PARTIAL] > 0:
            # if there are started jobs that aren't complete, keep going
            self._ok_to_start.set()
            self._all_done.clear()
            return

        if not self._throttle or waiting < (100 * (running + 1)):
            self._ok_to_start.set()
        else:
            self._ok_to_start.clear()

        if running == 0 and waiting == 0:
            self._all_done.set()
        else:
            self._all_done.clear()


    def central_start(self, job):
        if self._verbose >= 2:
            print "central_start", job
        self._ok_to_start.wait()
        self._lock.acquire()
        self._jobs[job] = _STATE_WAITING_PARTIAL
        self._counts[_STATE_WAITING_PARTIAL] += 1
        self._recompute()
        self._lock.release()
        return True

    def central_end(self, job):
        if self._verbose >= 2:
            print "central_end", job
        self._lock.acquire()
        s = self._jobs[job]
        self._counts[s] -= 1
        if s == _STATE_WAITING_PARTIAL:
            s = _STATE_WAITING_COMPLETE
        if s == _STATE_RUNNING_PARTIAL:
            s = _STATE_RUNNING_COMPLETE
        self._jobs[job] = s
        self._counts[s] += 1
        self._recompute()
        self._lock.release()

    def worker_start(self, job):
        if self._verbose >= 2:
            print "worker_start", job
        self._lock.acquire()
        if job in self._jobs:
            s = self._jobs[job]
            self._counts[s] -= 1
            if s == _STATE_WAITING_PARTIAL:
                s = _STATE_RUNNING_PARTIAL
            if s == _STATE_WAITING_COMPLETE:
                s = _STATE_RUNNING_COMPLETE
            self._jobs[job] = s
            self._counts[s] += 1
            self._recompute()
#        else:
#            print "Received worker start of unknown job:", job
        self._lock.release()

    def worker_end(self, msg):
        result = msg.split(',', 1)
        job = result[0]
        if self._verbose >= 2:
            print "worker_end", job
        self._lock.acquire()
        if job in self._jobs:
            del self._jobs[job]
            self._counts[_STATE_RUNNING_COMPLETE] -= 1
            self._recompute()
            self._results.append(result)
#        else:
#            print "Received worker end of unknown job:", job
        self._lock.release()

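    # Status messages on the worker-status queue are comma-separated:
    # "start,<job>" when a worker picks a job up, and "end,<job>,<result>"
    # when it finishes (see Worker.main below).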
    def worker_status(self, msg):
        parts = msg.body.split(',', 1)
        if len(parts) != 2:
            print "Received malformed status:", msg.body
            return
        if parts[0] == 'start':
            self.worker_start(parts[1])
        elif parts[0] == 'end':
            self.worker_end(parts[1])
        else:
            print "Received unknown status:", msg.body

    def wait_for_done(self, timeout=None):
        self._all_done.wait(timeout)
        if self._verbose >= 2:
            print
        return self._all_done.isSet()

    def not_done(self):
        return not self._all_done.isSet()

    def results(self):
        self._lock.acquire()
        rs = self._results
        self._results = []
        self._lock.release()
        return rs

    def run(self):
        t = Transport(self._options)
        t.connect()
        t.usequeue('minder-end')
        t.usequeue('worker-status')
        # t.wait() blocks until the consumers are cancelled in stop(); until
        # then we never reach t.close().
        #print "consuming minder-end"
        t.consume('minder-end', 'm0', self.stop)
        #print "consuming worker-status"
        t.consume('worker-status', 'm1', self.worker_status)
        t.wait()
        t.close()

    # basic_cancel() with a consumer tag stops the consume(), which would
    # otherwise run forever.  We should be using t.cancelconsume() here.
    # http://hg.barryp.org/py-amqplib/raw-file/tip/docs/overview.txt
    def stop(self, msg):
        msg.channel.basic_cancel('m0')
        msg.channel.basic_cancel('m1')
        print "running concurrency:", self._running_stats.format()
        print "waiting concurrency:", self._waiting_stats.format()

# Encode a job name and its data items (the preprocessed query sequences)
# into one message, escaping '~' and '|' so the fields can be pipe-delimited.
def _job_encode(job, data_list):
    escaped_items = [
        item.replace('~', '~t').replace('|', '~p')
        for item in [job] + data_list]
    return '|'.join(escaped_items)

def _job_decode(message):
    escaped_items = message.split('|')
    data_list = [
        item.replace('~p', '|').replace('~t', '~')
        for item in escaped_items]
    job = data_list.pop(0)
    return (job, data_list)

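# Illustrative round trip (hypothetical values, for documentation only):
#   _job_encode('job1', ['a|b', 'c~d'])  ->  'job1|a~pb|c~td'
#   _job_decode('job1|a~pb|c~td')        ->  ('job1', ['a|b', 'c~d'])
# '~' is escaped before '|', so a literal '~' followed by 'p' in the data
# can never be mistaken for the '~p' pipe escape when decoding.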

class Central(object):
    """A Central process that distributes sequences of events"""

    def __init__(self, options, arguments):
        self._options = options
        self._timeout = options.timeout
        self._transport = Transport(options)
        self._send = self._transport.send
        self._recv = self._transport.recv
        self._jobs = {}

    # Methods to override in subclasses

    def next(self):
        """generate the next event

        Should call one of the following:
            self.start(seq)
            self.event(seq, data)
            self.end(seq)

        return False if there are no more events, True otherwise
        """
        # default implementation generates no events
        return False

    def result(self, seq, data):
        """The result returned by the worker"""
        pass

    # methods called by subclasses, from next()

    def start(self, job):
        if job not in self._jobs:
            self._jobs[job] = []

    def event(self, job, data):
        if job not in self._jobs:
            self._jobs[job] = [data]
        else:
            self._jobs[job].append(data)

    def end(self, job):
        if job not in self._jobs:
            return
        data_list = self._jobs[job]
        del self._jobs[job]
        message = _job_encode(job, data_list)
        self._minder.central_start(job)
        self._send("worker-job", message)
        self._minder.central_end(job)

    def flush_results(self):
        for r in self._minder.results():
            self.result(r[0], r[1])

    def main(self):
        self._transport.connect()
        self._transport.queue('minder-end', clean=True)
        self._transport.queue('worker-job', clean=True)
        self._transport.queue('worker-status', clean=True)

        self._minder = JobMinder(self._options)
        minder_thread = threading.Thread(target=self._minder.run)
        minder_thread.setDaemon(True)
        minder_thread.start()

        while self.next():
            self.flush_results()
        for job in self._jobs.keys():
            self.end(job)

        # main loop: poll for results until the minder reports all jobs done
        while self._minder.not_done():
            self.flush_results()
            time.sleep(1.0)
        self.flush_results()

        self._send('minder-end', '')
        minder_thread.join(self._timeout)
        if minder_thread.isAlive():
            raise TimeoutError('Timeout waiting for job minder to exit.')

        # delete the queues -- in practice we never get here (see JobMinder.run)
        print "closing transport"
        self._transport.close()


# Identify the worker threads.  random.jumpahead (Python 2) perturbs the
# shared PRNG state per process, so forked workers don't all draw the same
# IDs.
_randomized = False
def genWorkerID():
    global _randomized
    if not _randomized:
        random.jumpahead(os.getpid())
        _randomized = True
    return "worker-%02d-%02d-%02d" % (
        random.randint(0,99),
        random.randint(0,99),
        random.randint(0,99))

class Worker(object):
    """A Worker that processes a sequence of events"""

    def __init__(self, options, arguments):
        self._id = genWorkerID()
        self._transport = Transport(options)
        self._send = self._transport.send
        self._recv = self._transport.recv
        self._verbose = options.verbose >= 1
        self._logtime = time.time()

    # Methods to override in subclasses

    def start(self):
        """start of a sequence of events"""
        pass

    def event(self, data):
        """an event in a sequence"""
        pass

    def end(self):
        """the end of a sequence"""
        return ''

    def log(self, msg):
        if self._verbose < 1:
            return
        t = time.time()
        # time elapsed between each action and the next
        print ("%s (%8.4f)" % (self._id, t - self._logtime)), msg
        self._logtime = t


    # Implementation

    def main(self):
        self.log("starting AMQP connection")
        self._transport.connect()
        self._transport.usequeue('worker-job')
        # Should this be queue() rather than usequeue(), since we send on it
        # instead of receiving?  And what is the difference between using
        # basic_consume and basic_get here?
        self._transport.usequeue('worker-status')
        #self._transport.queue('worker-status', inControl=True)

        while True:
            try:
                self.log("getting from worker-job queue")
                message = self._recv('worker-job')
            except amqp.AMQPException, ae:
                self.log("Got AMQP error: " + str(ae))
                # This break is what pops the worker out of the loop and lets
                # it close its AMQP connection; an AMQP 404 error is what
                # usually triggers it.  So how do we break without a 404, or
                # arrange to always get the error?  See recv().
                break

            # avoid stack traces from decoding an empty message
            if message is not None:
                (job, data_list) = _job_decode(message)

                self._send('worker-status', 'start,' + job)
                self.start()
                for item in data_list:
                    self.event(item)
                result = self.end()
                self._send('worker-status', 'end,' + job + ',' + result)

            # If we don't explicitly kill off a worker here, it just keeps
            # timing out on the worker-job queue every 20 seconds, forever.
            # So we kill it off when we get an empty message -- but that can
            # happen while SOME workers still have things to do.  Is there a
            # way to kill it off only when the last worker is idle, or when
            # the minder thinks we're finished?  Or do we need a fanout
            # queue?
            else:
#                self.log("killing AMQP connection")
                self._transport.close()
                self._transport = None
                # break, or else we will keep trying to recv from the
                # worker-job queue
                break


# Declaring the three queues with inControl=True and then closing the
# transport deletes them, which is what causes any old workers still
# consuming to quit.
def clean(options):
    transport = Transport(options)
    transport.connect()
    transport.queue('minder-end')
    transport.queue('worker-job')
    transport.queue('worker-status')
    transport.close()



# Fork K detached worker processes.  The parent returns with workers
# disabled (it may still run central); a short-lived child setsids and forks
# K grandchildren; each grandchild returns from this function to run its
# worker threads.
def start_forks(options):
    if options.workers == 0:
        options.workers = 1
    if os.fork() == 0:
        # now in child
        os.setsid() # magic that creates a new process group
        options.central = False # ensure forks don't run central
        for i in xrange(0, options.fork):
            if os.fork() == 0:
                # now in grandchild
                return # escape loop, keep processing
        sys.exit(0)
    else:
        options.workers = 0 # ensure parent doesn't run workers

def run_worker(worker_cls, options, arguments):
    w = worker_cls(options, arguments)
    try:
        w.main()
    except KeyboardInterrupt:
        thread.interrupt_main()

def start_workers(worker_cls, n, options, arguments):
    threads = []
    for i in xrange(0, n):
        t = threading.Thread(target=run_worker,
                                args=(worker_cls, options, arguments))
        threads.append(t)
        t.start()
    return threads

def run_central(central_cls, options, arguments):
    c = central_cls(options, arguments)
    c.main()


class Hive(object):
    def __init__(self, central_cls, worker_cls):
        self.central_cls = central_cls
        self.worker_cls = worker_cls

    def add_options(self, parser):
        # AMQP options
        parser.add_option('--amqp-host',
                            default=amqp_host, metavar='HOST',
                            help='AMQP server to connect to (default: %default)')
        parser.add_option('--amqp-vhost',
                            default=amqp_vhost, metavar='PATH',
                            help='AMQP virtual host to use (default: %default)')
        parser.add_option('--amqp-userid',
                            default=amqp_userid, metavar='USER',
                            help='AMQP userid to authenticate as (default: %default)')
        parser.add_option('--amqp-password',
                            default=amqp_password, metavar='PW',
                            help='AMQP password to authenticate with (default: %default)')
        parser.add_option('--amqp-ssl',
                            action='store_true', default=False,
                            help='Enable SSL (default: not enabled)')

        # Central options
        parser.add_option('-c', '--central',
                            default=False, action='store_true',
                            help='run a central job distributor')

        parser.add_option('--throttle',
                            default=False, action='store_true',
                            help='attempt to throttle jobs in queue')

        # Worker options
        parser.add_option('-w', '--workers', metavar='N',
                            default=0, type='int',
                            help='create N worker threads (default: 0)')
        parser.add_option('-f', '--fork', metavar='K',
                            default=0, type='int',
                            help='fork K detached processes (default: 0)')

        parser.add_option('--clean',
                            action='store_true', default=False,
                            help='clean up all queues, causing old workers to quit')

        # Generic options
        parser.add_option('--timeout', metavar='T',
                            default=timeout, type='float',
                            help='set timeout to T seconds (default: %default)')
        parser.add_option('-v', '--verbose',
                            default=0, action='count',
                            help='increase output (0-2 times)')


    def default_options(self):
        parser = OptionParser()
        self.add_options(parser)
        options, arguments = parser.parse_args(args=[])
        return options

    def main(self, args=None):
        parser = OptionParser()
        self.add_options(parser)

        options, arguments = parser.parse_args(args=args)

        if options.clean:
            clean(options)

        if (not options.clean and not options.central
            and options.fork == 0 and options.workers == 0):
            sys.exit('Nothing to do: specify one or more of --central, --workers or --clean')

        if options.fork > 0:
            start_forks(options)

        if options.workers > 0:
            start_workers(self.worker_cls, options.workers, options, arguments)

        if options.central:
            run_central(self.central_cls, options, arguments)


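# Illustrative usage sketch: a minimal application subclasses Central and
# Worker and hands both classes to Hive.  The class names, job names, and
# payloads below are hypothetical.
class ExampleCentral(Central):
    def __init__(self, options, arguments):
        Central.__init__(self, options, arguments)
        self._pending = ['job-1', 'job-2']

    def next(self):
        if not self._pending:
            return False                     # no more events to generate
        job = self._pending.pop()
        self.start(job)
        self.event(job, 'payload for ' + job)
        self.end(job)
        return True

    def result(self, seq, data):
        print "result from", seq, ":", data

class ExampleWorker(Worker):
    def start(self):
        self._items = []

    def event(self, data):
        self._items.append(data)             # "process" each event

    def end(self):
        return ','.join(self._items)         # sent back to Central.result()

# e.g.  Hive(ExampleCentral, ExampleWorker).main()
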
if __name__ == '__main__':
    Hive(None, None).main()