/historical/hive.py
Python | 657 lines | 461 code | 93 blank | 103 comment | 72 complexity | 903c6be4175b08337a17e9b23261679c MD5 | raw file
1# 2# $LicenseInfo:firstyear=2010&license=mit$ 3# 4# Copyright (c) 2010, Linden Research, Inc. 5# 6# Permission is hereby granted, free of charge, to any person obtaining a copy 7# of this software and associated documentation files (the "Software"), to deal 8# in the Software without restriction, including without limitation the rights 9# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10# copies of the Software, and to permit persons to whom the Software is 11# furnished to do so, subject to the following conditions: 12# 13# The above copyright notice and this permission notice shall be included in 14# all copies or substantial portions of the Software. 15# 16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22# THE SOFTWARE. 23# $/LicenseInfo$ 24# 25 26from optparse import OptionParser 27import os 28import random 29import sys 30import thread 31import threading 32import time 33 34import amqplib.client_0_8 as amqp 35 36import stattools 37 38 39# We use an amqp virtual host called "/hive". 40# A virtual host holds a cluster of exchanges, queues, and bindings. 41# We use a virtual host for permissions purposes (user hive has access to everything in /hive) 42# Exchanges are routers with routing tables. 43# Queues are where your messages end up. 44# Bindings are rules for routing tables. We use a "direct" exchange. 45 46# credit: http://blogs.digitar.com/jjww/ 47 48amqp_host = 'localhost' 49amqp_userid = 'hive' 50amqp_password = 'resistanceisfutile' 51amqp_vhost = '/hive' 52amqp_exchange = 'b.direct' 53timeout = 10.0 54 55class TimeoutError(Exception): 56 pass 57 58 59class Transport(object): 60 """A simple message queue-like transport system 61 62 Built on AMQP, hides much of the details of that interface and presents 63 a simple set of utilities for sending and receiving messages on named 64 queues. 65 66 """ 67 68 def __init__(self, options=None): 69 self._amqp_host = getattr(options, 'amqp_host', amqp_host) 70 self._amqp_vhost = getattr(options, 'amqp_vhost', amqp_vhost) 71 self._amqp_userid = getattr(options, 'amqp_userid', amqp_userid) 72 self._amqp_password = getattr(options, 'amqp_password', amqp_password) 73 self._timeout = getattr(options, 'timeout', timeout) 74 75 def _server_connect(self): 76 self._conn = amqp.Connection( 77 self._amqp_host, virtual_host=self._amqp_vhost, 78 userid=self._amqp_userid, password=self._amqp_password) 79 self._ch = self._conn.channel() 80 # request active access so we can get, create, delete 81 self._ch.access_request('/data', active=True, write=True, read=True) 82 # durable=False means flush messages out to disk 83 # auto_delete=True means get rid of the queue when there are no more messages to consume. 84 # Doesn't hardly matter though since we manually delete the queues with close 85 self._ch.exchange_declare(amqp_exchange, 'direct', durable=False, auto_delete=True) 86 87 def _server_close(self): 88 try: 89 self._ch.close() 90 self._ch = None 91 except: 92 pass 93 try: 94 self._conn.close() 95 self._conn = None 96 except: 97 pass 98 99 # vestigial 100 def _server_reconnect(self): 101 self._server_close() 102 self._server_connect() 103 104 def connect(self): 105 self._server_connect() 106 self._queues = [] 107 108 def close(self): 109 for qname in self._queues: 110 self._ch.queue_delete(qname) 111 self._queues = [] 112 self._server_close() 113 114 def queue(self, queue='', inControl=True, clean=False): 115 queue, _, _ = self._ch.queue_declare(queue, durable=False, auto_delete=False) 116 self._ch.queue_bind(queue, amqp_exchange, queue) 117 if inControl: 118 self._queues.append(queue) 119 if clean: 120 # we purge the queues when we first initialize them 121 #print "purging queue " + queue 122 self._ch.queue_purge(queue) 123 return queue 124 125 # same as queue, only without inControl, so it purges, instead of appending (these names suck) 126 # we are having problems with usequeue, not with queue 127 def usequeue(self, queue, clean=False): 128 self.queue(queue, inControl=False, clean=clean) 129 130 # we aren't using this -- should we be? 131# def donequeue(self, queue): 132# self._ch.queue_delete(queue) 133 134 def send(self, queue, data): 135 msg = amqp.Message(data) 136 self._ch.basic_publish(msg, amqp_exchange, queue) 137 138 def recv(self, queue): 139 t = time.time() 140 while True: 141 # m is a sequence of SQL statements (preprocessed) 142 m = self._ch.basic_get(queue, no_ack=True) 143 if m is not None: 144 return m.body 145# Should just take out TimeoutError, and return None when there's no more messages. 146# Or fix TimeoutError so it closes the queues? 147# Anytime we call TimeoutError, it's going to screw up the workers. 148# if (time.time() - t) > self._timeout: 149# raise TimeoutError('Timeout waiting for data on queue ' + queue) 150 if (time.time() - t) > self._timeout: 151 #print 'Timeout waiting for data on queue ' + queue 152 break 153 154 time.sleep(0.1) 155 156 def consume(self, queue, tag, fn): 157 return self._ch.basic_consume(queue, tag, 158 no_ack=True, exclusive=True, callback=fn) 159 160 def cancelconsume(self, tag): 161 self._ch.basic_cancel(tag) 162 163 def wait(self): 164 while self._ch.callbacks: 165 self._ch.wait() 166 167 168_STATE_WAITING_PARTIAL = 0 169_STATE_WAITING_COMPLETE = 1 170_STATE_RUNNING_PARTIAL = 2 171_STATE_RUNNING_COMPLETE = 3 172 173class JobMinder(object): 174 """A check on jobs to run""" 175 176 def __init__(self, options): 177 self._options = options 178 self._verbose = options.verbose 179 self._throttle = options.throttle 180 self._lock = threading.Lock() 181 self._ok_to_start = threading.Event() 182 self._ok_to_start.set() 183 self._all_done = threading.Event() 184 self._all_done.set() 185 self._jobs = { } 186 self._counts = [ 0, 0, 0, 0 ] 187 self._results = [ ] 188 self._running_stats = stattools.StatValue() 189 self._waiting_stats = stattools.StatValue() 190 191 def _recompute(self): 192 #print 'counts:', ','.join(map(str, self._counts)) 193 # must have lock! 194 running = (self._counts[_STATE_RUNNING_PARTIAL] 195 + self._counts[_STATE_RUNNING_COMPLETE]) 196 waiting = (self._counts[_STATE_WAITING_PARTIAL] 197 + self._counts[_STATE_WAITING_COMPLETE]) 198 199 self._running_stats.sample(running) 200 self._waiting_stats.sample(waiting) 201 202 if self._counts[_STATE_RUNNING_PARTIAL] > 0: 203 # if there are started jobs that aren't complete, keep going 204 self._ok_to_start.set() 205 self._all_done.clear() 206 return 207 208 if not self._throttle or waiting < (100 * (running + 1)): 209 self._ok_to_start.set() 210 else: 211 self._ok_to_start.clear() 212 213 if running == 0 and waiting == 0: 214 self._all_done.set() 215 else: 216 self._all_done.clear() 217 218 219 def central_start(self, job): 220 if self._verbose >= 2: 221 print "central_start", job 222 self._ok_to_start.wait() 223 self._lock.acquire() 224 self._jobs[job] = _STATE_WAITING_PARTIAL 225 self._counts[_STATE_WAITING_PARTIAL] += 1 226 self._recompute() 227 self._lock.release() 228 return True 229 230 def central_end(self, job): 231 if self._verbose >= 2: 232 print "central_end", job 233 self._lock.acquire() 234 s = self._jobs[job] 235 self._counts[s] -= 1 236 if s == _STATE_WAITING_PARTIAL: 237 s = _STATE_WAITING_COMPLETE 238 if s == _STATE_RUNNING_PARTIAL: 239 s = _STATE_RUNNING_COMPLETE 240 self._jobs[job] = s 241 self._counts[s] += 1 242 self._recompute() 243 self._lock.release() 244 245 def worker_start(self, job): 246 if self._verbose >= 2: 247 print "worker_start", job 248 self._lock.acquire() 249 if job in self._jobs: 250 s = self._jobs[job] 251 self._counts[s] -= 1 252 if s == _STATE_WAITING_PARTIAL: 253 s = _STATE_RUNNING_PARTIAL 254 if s == _STATE_WAITING_COMPLETE: 255 s = _STATE_RUNNING_COMPLETE 256 self._jobs[job] = s 257 self._counts[s] += 1 258 self._recompute() 259# else: 260# print "Received worker start of unknown job:", job 261 self._lock.release() 262 263 def worker_end(self, msg): 264 result = msg.split(',', 1) 265 job = result[0] 266 if self._verbose >= 2: 267 print "worker_end", job 268 self._lock.acquire() 269 if job in self._jobs: 270 del self._jobs[job] 271 self._counts[_STATE_RUNNING_COMPLETE] -= 1 272 self._recompute() 273 self._results.append(result) 274# else: 275# print "Received worker end of unknown job:", job 276 self._lock.release() 277 278 def worker_status(self, msg): 279 parts = msg.body.split(',', 1) 280 if len(parts) != 2: 281 print "Received malformed status:", msg.body 282 return 283 if parts[0] == 'start': 284 self.worker_start(parts[1]) 285 elif parts[0] == 'end': 286 self.worker_end(parts[1]) 287 else: 288 print "Received unknown status:", msg.body 289 290 def wait_for_done(self, timeout=None): 291 self._all_done.wait(timeout) 292 if self._verbose >= 2: 293 print 294 return self._all_done.isSet() 295 296 def not_done(self): 297 return not self._all_done.isSet() 298 299 def results(self): 300 self._lock.acquire() 301 rs = self._results 302 self._results = [] 303 self._lock.release() 304 return rs 305 306 def run(self): 307 t = Transport(self._options) 308 t.connect() 309 t.usequeue('minder-end') 310 t.usequeue('worker-status') 311 # so ... this never ends. we never get to the rest of this. 312 #print "consuming minder-end" 313 t.consume('minder-end', 'm0', self.stop) 314 #print "consuming worker-status" 315 t.consume('worker-status', 'm1', self.worker_status) 316 t.wait() 317 t.close() 318 319# basic_cancel() with a consumer tag to stop the consume(), or it will consume forever 320# we should be using t.cancelconsume() here 321# http://hg.barryp.org/py-amqplib/raw-file/tip/docs/overview.txt 322 def stop(self, msg): 323 msg.channel.basic_cancel('m0') 324 msg.channel.basic_cancel('m1') 325 print "running concurrency:", self._running_stats.format() 326 print "waiting concurrency:", self._waiting_stats.format() 327 328# How we encode sequences of queries (?) 329def _job_encode(job, data_list): 330 escaped_items = [ 331 item.replace('~', '~t').replace('|', '~p') 332 for item in [job] + data_list] 333 return '|'.join(escaped_items) 334 335def _job_decode(message): 336 escaped_items = message.split('|') 337 data_list = [ 338 item.replace('~p', '|').replace('~t', '~') 339 for item in escaped_items] 340 job = data_list.pop(0) 341 return (job, data_list) 342 343 344class Central(object): 345 """A Central process that distributes sequences of events""" 346 347 def __init__(self, options, arguments): 348 self._options = options 349 self._timeout = options.timeout 350 self._transport = Transport(options) 351 self._send = self._transport.send 352 self._recv = self._transport.recv 353 self._jobs = {} 354 355 # Methods to override in subclasses 356 357 def next(self): 358 """generate the next event 359 360 Should call one of the following: 361 self.start(seq) 362 self.event(seq, data) 363 self.end(seq) 364 365 return False if there are no more events, True otherwise 366 """ 367 # what does this do? 368 self.endrun() 369 370 def result(self, seq, data): 371 """The result returned by the worker""" 372 pass 373 374 # methods that are sent by subclasses, from next() 375 376 def start(self, job): 377 if job not in self._jobs: 378 self._jobs[job] = [] 379 380 def event(self, job, data): 381 if job not in self._jobs: 382 self._jobs[job] = [data] 383 else: 384 self._jobs[job].append(data) 385 386 def end(self, job): 387 if job not in self._jobs: 388 return; 389 data_list = self._jobs[job] 390 del self._jobs[job] 391 message = _job_encode(job, data_list) 392 self._minder.central_start(job) 393 self._send("worker-job", message) 394 self._minder.central_end(job) 395 396 def flush_results(self): 397 for r in self._minder.results(): 398 self.result(r[0], r[1]) 399 400 def main(self): 401 self._transport.connect() 402 self._transport.queue('minder-end', clean=True) 403 self._transport.queue('worker-job', clean=True) 404 self._transport.queue('worker-status', clean=True) 405 406 self._minder = JobMinder(self._options) 407 minder_thread = threading.Thread(target=self._minder.run) 408 minder_thread.setDaemon(True) 409 minder_thread.start() 410 411 while self.next(): 412 self.flush_results() 413 for job in self._jobs.keys(): 414 self.end(job) 415 416 # Main main main loop 417 while self._minder.not_done(): 418 self.flush_results() 419 time.sleep(1.0) 420 self.flush_results() 421 422 self._send('minder-end', '') 423 minder_thread.join(self._timeout) 424 if minder_thread.isAlive(): 425 raise TimeoutError('Timeout waiting for job minder to exit.') 426 427 # delete the queues -- this never happens 428 print "closing transport" 429 self._transport.close() 430 431 432# identify the worker threads 433_randomized = False 434def genWorkerID(): 435 global _randomized 436 if not _randomized: 437 random.jumpahead(os.getpid()) 438 _randomized = True 439 return "worker-%02d-%02d-%02d" % ( 440 random.randint(0,99), 441 random.randint(0,99), 442 random.randint(0,99)) 443 444class Worker(object): 445 """A Worker that processes a sequences of events""" 446 447 def __init__(self, options, arguments): 448 self._id = genWorkerID() 449 self._transport = Transport(options) 450 self._send = self._transport.send 451 self._recv = self._transport.recv 452 self._verbose = options.verbose >= 1 453 self._logtime = time.time() 454 455 # Methods to override in subclasses 456 457 def start(self): 458 """start of a sequence of events""" 459 pass 460 461 def event(self, data): 462 """an event in a sequence""" 463 pass 464 465 def end(self): 466 """the end of a sequence""" 467 return '' 468 469 def log(self, msg): 470 if self._verbose < 1: 471 return 472 t = time.time() 473 # time elapsed between each action and the next action 474 print ("%s (%8.4f)" % (self._id, t - self._logtime)), msg 475 self._logtime = t 476 477 478 # Implementation 479 480 def main(self): 481 self.log("starting AMQP connection") 482 self._transport.connect() 483 self._transport.usequeue('worker-job') 484 # should be queue, not usequeue? -- sending, not receiving? 485 # what's the difference between using basic_consume and basic_get? 486 self._transport.usequeue('worker-status') 487 #self._transport.queue('worker-status', inControl=True) 488 489 while True: 490 try: 491 self.log("getting from worker-job queue") 492 message = self._recv('worker-job') 493 except amqp.AMQPException, ae: 494 self.log ("Got AMQP error: " + str(ae)) 495 # the break is what pops the worker out of this loop and allows it to close the AMQP connections 496 # the AMQP 404 error is what causes the worker to break 497 # so how do we break without a 404 error, or how do we manage to always get the error? 498 # see recv 499 break 500 501 # get rid of stack traces complaining about decoding empty message 502 if message is not None: 503 (job, data_list) = _job_decode(message) 504 505 self._send('worker-status', 'start,' + job) 506 self.start() 507 for item in data_list: 508 self.event(item) 509 result = self.end() 510 self._send('worker-status', 'end,' + job + ',' + result) 511 512 # ok, I see what's happening here. If we don't explicitly kill off the workers 513 # here, it will just keep timing out and saying "timeout getting from worker-job 514 # queue" every 20 seconds forever and ever. 515 # So we will kill it off when we get an empty message. But this will kill it off 516 # when SOME of the workers still have things to do. Is there a way to kill it off 517 # when the last worker is empty? Can we kill it off when the minder thinks we're 518 # finished? Or do we need a fanout queue? 519 520 else: 521# self.log("killing AMQP connection") 522 self._transport.close() 523 self._transport = None 524 # break, or else we will keep trying to recv from worker-job queue 525 break 526 527 528def clean(options): 529 transport = Transport(options) 530 transport.connect() 531 transport.queue('minder-end') 532 transport.queue('worker-job') 533 transport.queue('worker-status') 534 transport.close() 535 536 537 538def start_forks(options): 539 if options.workers == 0: 540 options.workers = 1 541 if os.fork() == 0: 542 # now in child 543 os.setsid() # magic that creates a new process group 544 options.central = False # ensure forks don't run central 545 for i in xrange(0, options.fork): 546 if os.fork() == 0: 547 # now in grandchild 548 return # escape loop, keep processing 549 sys.exit(0) 550 else: 551 options.workers = 0 # ensure parent doesn't run workers 552 553def run_worker(worker_cls, options, arguments): 554 w = worker_cls(options, arguments) 555 try: 556 w.main() 557 except KeyboardInterrupt: 558 thread.interrupt_main() 559 560def start_workers(worker_cls, n, options, arguments): 561 threads = [] 562 for i in xrange(0, n): 563 t = threading.Thread(target=run_worker, 564 args=(worker_cls, options, arguments)) 565 threads.append(t) 566 t.start() 567 return threads 568 569def run_central(central_cls, options, arguments): 570 c = central_cls(options, arguments) 571 c.main() 572 573 574class Hive(object): 575 def __init__(self, central_cls, worker_cls): 576 self.central_cls = central_cls 577 self.worker_cls = worker_cls 578 579 def add_options(self, parser): 580 # AMQP options 581 parser.add_option('--amqp-host', 582 default=amqp_host, metavar='HOST', 583 help='AMQP server to connect to (default: %default)') 584 parser.add_option('--amqp-vhost', 585 default=amqp_vhost, metavar='PATH', 586 help='AMQP virtual host to use (default: %default)') 587 parser.add_option('--amqp-userid', 588 default=amqp_userid, metavar='USER', 589 help='AMQP userid to authenticate as (default: %default)') 590 parser.add_option('--amqp-password', 591 default=amqp_password, metavar='PW', 592 help='AMQP password to authenticate with (default: %default)') 593 parser.add_option('--amqp-ssl', 594 action='store_true', default=False, 595 help='Enable SSL (default: not enabled)') 596 597 # Central options 598 parser.add_option('-c', '--central', 599 default=False, action='store_true', 600 help='run a central job distributor') 601 602 parser.add_option('--throttle', 603 default=False, action='store_true', 604 help='attempt to throttle jobs in queue') 605 606 # Worker options 607 parser.add_option('-w', '--workers', metavar='N', 608 default=0, type='int', 609 help='create N worker threads (default: 0)') 610 parser.add_option('-f', '--fork', metavar='K', 611 default=0, type='int', 612 help='fork K detached processes (default: 0)') 613 614 parser.add_option('--clean', 615 action='store_true', default=False, 616 help='clean up all queues, causing old workers to quit') 617 618 # Generic options 619 parser.add_option('--timeout', metavar='T', 620 default=timeout, type='float', 621 help='set timeout to T seconds (default: %default)') 622 parser.add_option('-v', '--verbose', 623 default=0, action='count', 624 help='increase output (0~2 times') 625 626 627 def default_options(self): 628 parser = OptionParser() 629 self.add_options(parser) 630 options, arguments = parser.parse_args(args=[]) 631 return options 632 633 def main(self, args=None): 634 parser = OptionParser() 635 self.add_options(parser) 636 637 options, arguments = parser.parse_args(args=args) 638 639 if options.clean: 640 clean(options) 641 642 if (not options.clean and not options.central 643 and options.fork == 0 and options.workers == 0): 644 sys.exit('Nothing to do: specify one or more of --central, --workers or --clean') 645 646 if options.fork > 0: 647 start_forks(options) 648 649 if options.workers > 0: 650 start_workers(self.worker_cls, options.workers, options, arguments) 651 652 if options.central: 653 run_central(self.central_cls, options, arguments) 654 655 656if __name__ == '__main__': 657 Hive(None, None).main()