/historical/hive.py

https://bitbucket.org/lindenlab/apiary/ · Python

#
# $LicenseInfo:firstyear=2010&license=mit$
#
# Copyright (c) 2010, Linden Research, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# $/LicenseInfo$
#
from optparse import OptionParser
import os
import random
import sys
import thread
import threading
import time

import amqplib.client_0_8 as amqp

import stattools

# We use an AMQP virtual host called "/hive".
# A virtual host holds a cluster of exchanges, queues, and bindings.
# We use a virtual host for permissions purposes (user hive has access to
# everything in /hive).
# Exchanges are routers with routing tables.
# Queues are where your messages end up.
# Bindings are rules for routing tables. We use a "direct" exchange.
# credit: http://blogs.digitar.com/jjww/

amqp_host = 'localhost'
amqp_userid = 'hive'
amqp_password = 'resistanceisfutile'
amqp_vhost = '/hive'
amqp_exchange = 'b.direct'
timeout = 10.0

class TimeoutError(Exception):
    pass


class Transport(object):
    """A simple message queue-like transport system

    Built on AMQP, it hides much of the detail of that interface and
    presents a simple set of utilities for sending and receiving messages
    on named queues.
    """

    def __init__(self, options=None):
        self._amqp_host = getattr(options, 'amqp_host', amqp_host)
        self._amqp_vhost = getattr(options, 'amqp_vhost', amqp_vhost)
        self._amqp_userid = getattr(options, 'amqp_userid', amqp_userid)
        self._amqp_password = getattr(options, 'amqp_password', amqp_password)
        self._timeout = getattr(options, 'timeout', timeout)

    def _server_connect(self):
        self._conn = amqp.Connection(
            self._amqp_host, virtual_host=self._amqp_vhost,
            userid=self._amqp_userid, password=self._amqp_password)
        self._ch = self._conn.channel()
        # request active access so we can get, create, delete
        self._ch.access_request('/data', active=True, write=True, read=True)
        # durable=False means the exchange will not survive a broker restart.
        # auto_delete=True means the exchange goes away once all the queues
        # bound to it are gone. Hardly matters, though, since we manually
        # delete the queues in close().
        self._ch.exchange_declare(amqp_exchange, 'direct', durable=False, auto_delete=True)

    def _server_close(self):
        try:
            self._ch.close()
            self._ch = None
        except:
            pass
        try:
            self._conn.close()
            self._conn = None
        except:
            pass

    # vestigial
    def _server_reconnect(self):
        self._server_close()
        self._server_connect()

    def connect(self):
        self._server_connect()
        self._queues = []

    def close(self):
        for qname in self._queues:
            self._ch.queue_delete(qname)
        self._queues = []
        self._server_close()

    def queue(self, queue='', inControl=True, clean=False):
        queue, _, _ = self._ch.queue_declare(queue, durable=False, auto_delete=False)
        self._ch.queue_bind(queue, amqp_exchange, queue)
        if inControl:
            self._queues.append(queue)
        if clean:
            # we purge the queues when we first initialize them
            #print "purging queue " + queue
            self._ch.queue_purge(queue)
        return queue

    # same as queue(), only with inControl=False, so the queue is not
    # remembered and therefore not deleted by close() (these names suck)
    # we are having problems with usequeue, not with queue
    def usequeue(self, queue, clean=False):
        self.queue(queue, inControl=False, clean=clean)

    # we aren't using this -- should we be?
    # def donequeue(self, queue):
    #     self._ch.queue_delete(queue)

    def send(self, queue, data):
        msg = amqp.Message(data)
        self._ch.basic_publish(msg, amqp_exchange, queue)

    def recv(self, queue):
        t = time.time()
        while True:
            # m is a sequence of SQL statements (preprocessed)
            m = self._ch.basic_get(queue, no_ack=True)
            if m is not None:
                return m.body
            # Should we just take out TimeoutError and return None when there
            # are no more messages? Or fix TimeoutError so it closes the queues?
            # Any time we raise TimeoutError, it's going to screw up the workers.
            # if (time.time() - t) > self._timeout:
            #     raise TimeoutError('Timeout waiting for data on queue ' + queue)
            if (time.time() - t) > self._timeout:
                #print 'Timeout waiting for data on queue ' + queue
                break  # give up; fall through and implicitly return None
            time.sleep(0.1)

    def consume(self, queue, tag, fn):
        return self._ch.basic_consume(queue, tag,
                                      no_ack=True, exclusive=True, callback=fn)

    def cancelconsume(self, tag):
        self._ch.basic_cancel(tag)

    def wait(self):
        while self._ch.callbacks:
            self._ch.wait()
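
# For orientation, a minimal sketch of how Transport is meant to be used
# (hypothetical queue name; assumes a broker reachable with the credentials
# configured above):
#
#   t = Transport()
#   t.connect()
#   t.queue('example-jobs', clean=True)  # declare, bind, and purge the queue
#   t.send('example-jobs', 'hello')      # routing key is the queue name
#   print t.recv('example-jobs')         # -> 'hello', or None on timeout
#   t.close()                            # deletes the queues we control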

_STATE_WAITING_PARTIAL = 0
_STATE_WAITING_COMPLETE = 1
_STATE_RUNNING_PARTIAL = 2
_STATE_RUNNING_COMPLETE = 3


class JobMinder(object):
    """A check on jobs to run"""

    def __init__(self, options):
        self._options = options
        self._verbose = options.verbose
        self._throttle = options.throttle
        self._lock = threading.Lock()
        self._ok_to_start = threading.Event()
        self._ok_to_start.set()
        self._all_done = threading.Event()
        self._all_done.set()
        self._jobs = { }
        self._counts = [ 0, 0, 0, 0 ]
        self._results = [ ]
        self._running_stats = stattools.StatValue()
        self._waiting_stats = stattools.StatValue()

    def _recompute(self):
        # must have lock!
        #print 'counts:', ','.join(map(str, self._counts))
        running = (self._counts[_STATE_RUNNING_PARTIAL]
                   + self._counts[_STATE_RUNNING_COMPLETE])
        waiting = (self._counts[_STATE_WAITING_PARTIAL]
                   + self._counts[_STATE_WAITING_COMPLETE])
        self._running_stats.sample(running)
        self._waiting_stats.sample(waiting)
        if self._counts[_STATE_RUNNING_PARTIAL] > 0:
            # if there are started jobs that aren't complete, keep going
            self._ok_to_start.set()
            self._all_done.clear()
            return
        if not self._throttle or waiting < (100 * (running + 1)):
            self._ok_to_start.set()
        else:
            self._ok_to_start.clear()
        if running == 0 and waiting == 0:
            self._all_done.set()
        else:
            self._all_done.clear()

    def central_start(self, job):
        if self._verbose >= 2:
            print "central_start", job
        self._ok_to_start.wait()
        self._lock.acquire()
        self._jobs[job] = _STATE_WAITING_PARTIAL
        self._counts[_STATE_WAITING_PARTIAL] += 1
        self._recompute()
        self._lock.release()
        return True

    def central_end(self, job):
        if self._verbose >= 2:
            print "central_end", job
        self._lock.acquire()
        s = self._jobs[job]
        self._counts[s] -= 1
        if s == _STATE_WAITING_PARTIAL:
            s = _STATE_WAITING_COMPLETE
        if s == _STATE_RUNNING_PARTIAL:
            s = _STATE_RUNNING_COMPLETE
        self._jobs[job] = s
        self._counts[s] += 1
        self._recompute()
        self._lock.release()

    def worker_start(self, job):
        if self._verbose >= 2:
            print "worker_start", job
        self._lock.acquire()
        if job in self._jobs:
            s = self._jobs[job]
            self._counts[s] -= 1
            if s == _STATE_WAITING_PARTIAL:
                s = _STATE_RUNNING_PARTIAL
            if s == _STATE_WAITING_COMPLETE:
                s = _STATE_RUNNING_COMPLETE
            self._jobs[job] = s
            self._counts[s] += 1
            self._recompute()
        # else:
        #     print "Received worker start of unknown job:", job
        self._lock.release()

    def worker_end(self, msg):
        result = msg.split(',', 1)
        job = result[0]
        if self._verbose >= 2:
            print "worker_end", job
        self._lock.acquire()
        if job in self._jobs:
            del self._jobs[job]
            self._counts[_STATE_RUNNING_COMPLETE] -= 1
            self._recompute()
            self._results.append(result)
        # else:
        #     print "Received worker end of unknown job:", job
        self._lock.release()

    def worker_status(self, msg):
        parts = msg.body.split(',', 1)
        if len(parts) != 2:
            print "Received malformed status:", msg.body
            return
        if parts[0] == 'start':
            self.worker_start(parts[1])
        elif parts[0] == 'end':
            self.worker_end(parts[1])
        else:
            print "Received unknown status:", msg.body
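
    # The status wire format is just a comma-delimited string; for example,
    # with a hypothetical job name "job-42":
    #   "start,job-42"            -> worker_start('job-42')
    #   "end,job-42,some-result"  -> worker_end('job-42,some-result'),
    # whose argument is split once more on ',' to recover job and result.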

    def wait_for_done(self, timeout=None):
        self._all_done.wait(timeout)
        if self._verbose >= 2:
            print
        return self._all_done.isSet()

    def not_done(self):
        return not self._all_done.isSet()

    def results(self):
        self._lock.acquire()
        rs = self._results
        self._results = []
        self._lock.release()
        return rs

    def run(self):
        t = Transport(self._options)
        t.connect()
        t.usequeue('minder-end')
        t.usequeue('worker-status')
        # so ... this never ends. we never get to the rest of this.
        #print "consuming minder-end"
        t.consume('minder-end', 'm0', self.stop)
        #print "consuming worker-status"
        t.consume('worker-status', 'm1', self.worker_status)
        t.wait()
        t.close()

    # call basic_cancel() with a consumer tag to stop the consume(), or it
    # will consume forever -- we should be using t.cancelconsume() here
    # http://hg.barryp.org/py-amqplib/raw-file/tip/docs/overview.txt
    def stop(self, msg):
        msg.channel.basic_cancel('m0')
        msg.channel.basic_cancel('m1')
        print "running concurrency:", self._running_stats.format()
        print "waiting concurrency:", self._waiting_stats.format()

# How we encode sequences of queries (?)
def _job_encode(job, data_list):
    escaped_items = [
        item.replace('~', '~t').replace('|', '~p')
        for item in [job] + data_list]
    return '|'.join(escaped_items)


def _job_decode(message):
    escaped_items = message.split('|')
    data_list = [
        item.replace('~p', '|').replace('~t', '~')
        for item in escaped_items]
    job = data_list.pop(0)
    return (job, data_list)
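
# A quick round-trip example of the escaping scheme (hypothetical values).
# '~' is escaped first so the '~p' sequences produced for '|' are not
# themselves re-escaped; decoding reverses the replacements in the opposite
# order.
#
#   _job_encode('job1', ['a|b', 'c~d'])  ->  'job1|a~pb|c~td'
#   _job_decode('job1|a~pb|c~td')        ->  ('job1', ['a|b', 'c~d'])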

class Central(object):
    """A Central process that distributes sequences of events"""

    def __init__(self, options, arguments):
        self._options = options
        self._timeout = options.timeout
        self._transport = Transport(options)
        self._send = self._transport.send
        self._recv = self._transport.recv
        self._jobs = {}

    # Methods to override in subclasses

    def next(self):
        """generate the next event

        Should call one of the following:
            self.start(seq)
            self.event(seq, data)
            self.end(seq)
        return False if there are no more events, True otherwise
        """
        # what does this do? endrun() isn't defined anywhere in this module
        self.endrun()

    def result(self, seq, data):
        """The result returned by the worker"""
        pass

    # methods that are called by subclasses, from next()

    def start(self, job):
        if job not in self._jobs:
            self._jobs[job] = []

    def event(self, job, data):
        if job not in self._jobs:
            self._jobs[job] = [data]
        else:
            self._jobs[job].append(data)

    def end(self, job):
        if job not in self._jobs:
            return
        data_list = self._jobs[job]
        del self._jobs[job]
        message = _job_encode(job, data_list)
        self._minder.central_start(job)
        self._send("worker-job", message)
        self._minder.central_end(job)

    def flush_results(self):
        for r in self._minder.results():
            self.result(r[0], r[1])

    def main(self):
        self._transport.connect()
        self._transport.queue('minder-end', clean=True)
        self._transport.queue('worker-job', clean=True)
        self._transport.queue('worker-status', clean=True)
        self._minder = JobMinder(self._options)
        minder_thread = threading.Thread(target=self._minder.run)
        minder_thread.setDaemon(True)
        minder_thread.start()
        while self.next():
            self.flush_results()
        for job in self._jobs.keys():
            self.end(job)
        # Main main main loop
        while self._minder.not_done():
            self.flush_results()
            time.sleep(1.0)
        self.flush_results()
        self._send('minder-end', '')
        minder_thread.join(self._timeout)
        if minder_thread.isAlive():
            raise TimeoutError('Timeout waiting for job minder to exit.')
        # delete the queues -- this never happens
        print "closing transport"
        self._transport.close()
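
# A minimal sketch of a concrete Central subclass (hypothetical; the real
# drivers live elsewhere in apiary). next() emits the events for one job per
# call and returns False when done; result() sees each worker's reply.
#
#   class CountingCentral(Central):
#       def __init__(self, options, arguments):
#           Central.__init__(self, options, arguments)
#           self._n = 0
#       def next(self):
#           if self._n >= 10:
#               return False
#           job = 'job-%d' % self._n
#           self.start(job)
#           self.event(job, 'payload %d' % self._n)
#           self.end(job)
#           self._n += 1
#           return True
#       def result(self, job, data):
#           print job, '->', data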

# identify the worker threads
_randomized = False

def genWorkerID():
    global _randomized
    if not _randomized:
        random.jumpahead(os.getpid())
        _randomized = True
    return "worker-%02d-%02d-%02d" % (
        random.randint(0, 99),
        random.randint(0, 99),
        random.randint(0, 99))

class Worker(object):
    """A Worker that processes sequences of events"""

    def __init__(self, options, arguments):
        self._id = genWorkerID()
        self._transport = Transport(options)
        self._send = self._transport.send
        self._recv = self._transport.recv
        self._verbose = options.verbose >= 1
        self._logtime = time.time()

    # Methods to override in subclasses

    def start(self):
        """start of a sequence of events"""
        pass

    def event(self, data):
        """an event in a sequence"""
        pass

    def end(self):
        """the end of a sequence"""
        return ''

    def log(self, msg):
        if self._verbose < 1:
            return
        t = time.time()
        # log the time elapsed between each action and the next
        print ("%s (%8.4f)" % (self._id, t - self._logtime)), msg
        self._logtime = t

    # Implementation

    def main(self):
        self.log("starting AMQP connection")
        self._transport.connect()
        self._transport.usequeue('worker-job')
        # should this be queue, not usequeue? -- we send on it, we don't receive
        # what's the difference between using basic_consume and basic_get?
        self._transport.usequeue('worker-status')
        #self._transport.queue('worker-status', inControl=True)
        while True:
            try:
                self.log("getting from worker-job queue")
                message = self._recv('worker-job')
            except amqp.AMQPException, ae:
                self.log("Got AMQP error: " + str(ae))
                # the break is what pops the worker out of this loop and lets
                # it close the AMQP connections. The AMQP 404 error is what
                # causes the worker to break, so how do we break without a 404
                # error, or how do we manage to always get the error? see recv()
                break
            # guard against stack traces from decoding an empty message
            if message is not None:
                (job, data_list) = _job_decode(message)
                self._send('worker-status', 'start,' + job)
                self.start()
                for item in data_list:
                    self.event(item)
                result = self.end()
                self._send('worker-status', 'end,' + job + ',' + result)
            # ok, I see what's happening here. If we don't explicitly kill off
            # the worker here, it will just keep timing out and saying "timeout
            # getting from worker-job queue" every 20 seconds forever. So we
            # kill it off when we get an empty message. But this will kill it
            # off while SOME of the workers still have things to do. Is there a
            # way to kill it off when the last worker is empty? Can we kill it
            # off when the minder thinks we're finished? Or do we need a fanout
            # queue?
            else:
                # self.log("killing AMQP connection")
                self._transport.close()
                self._transport = None
                # break, or else we will keep trying to recv from the worker-job queue
                break
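
# A minimal sketch of a concrete Worker subclass (hypothetical): start()
# resets per-job state, event() handles one item from the job's sequence,
# and end() returns the result string sent back on worker-status.
#
#   class EchoWorker(Worker):
#       def start(self):
#           self._items = []
#       def event(self, data):
#           self._items.append(data)
#       def end(self):
#           return ';'.join(self._items)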

def clean(options):
    transport = Transport(options)
    transport.connect()
    transport.queue('minder-end')
    transport.queue('worker-job')
    transport.queue('worker-status')
    transport.close()


def start_forks(options):
    if options.workers == 0:
        options.workers = 1
    if os.fork() == 0:
        # now in child
        os.setsid()  # magic that creates a new process group
        options.central = False  # ensure forks don't run central
        for i in xrange(0, options.fork):
            if os.fork() == 0:
                # now in grandchild
                return  # escape loop, keep processing
        sys.exit(0)
    else:
        options.workers = 0  # ensure parent doesn't run workers

def run_worker(worker_cls, options, arguments):
    w = worker_cls(options, arguments)
    try:
        w.main()
    except KeyboardInterrupt:
        thread.interrupt_main()


def start_workers(worker_cls, n, options, arguments):
    threads = []
    for i in xrange(0, n):
        t = threading.Thread(target=run_worker,
                             args=(worker_cls, options, arguments))
        threads.append(t)
        t.start()
    return threads


def run_central(central_cls, options, arguments):
    c = central_cls(options, arguments)
    c.main()

class Hive(object):
    def __init__(self, central_cls, worker_cls):
        self.central_cls = central_cls
        self.worker_cls = worker_cls

    def add_options(self, parser):
        # AMQP options
        parser.add_option('--amqp-host',
            default=amqp_host, metavar='HOST',
            help='AMQP server to connect to (default: %default)')
        parser.add_option('--amqp-vhost',
            default=amqp_vhost, metavar='PATH',
            help='AMQP virtual host to use (default: %default)')
        parser.add_option('--amqp-userid',
            default=amqp_userid, metavar='USER',
            help='AMQP userid to authenticate as (default: %default)')
        parser.add_option('--amqp-password',
            default=amqp_password, metavar='PW',
            help='AMQP password to authenticate with (default: %default)')
        parser.add_option('--amqp-ssl',
            action='store_true', default=False,
            help='Enable SSL (default: not enabled)')
        # Central options
        parser.add_option('-c', '--central',
            default=False, action='store_true',
            help='run a central job distributor')
        parser.add_option('--throttle',
            default=False, action='store_true',
            help='attempt to throttle jobs in queue')
        # Worker options
        parser.add_option('-w', '--workers', metavar='N',
            default=0, type='int',
            help='create N worker threads (default: 0)')
        parser.add_option('-f', '--fork', metavar='K',
            default=0, type='int',
            help='fork K detached processes (default: 0)')
        parser.add_option('--clean',
            action='store_true', default=False,
            help='clean up all queues, causing old workers to quit')
        # Generic options
        parser.add_option('--timeout', metavar='T',
            default=timeout, type='float',
            help='set timeout to T seconds (default: %default)')
        parser.add_option('-v', '--verbose',
            default=0, action='count',
            help='increase output (0-2 times)')

    def default_options(self):
        parser = OptionParser()
        self.add_options(parser)
        options, arguments = parser.parse_args(args=[])
        return options

    def main(self, args=None):
        parser = OptionParser()
        self.add_options(parser)
        options, arguments = parser.parse_args(args=args)
        if options.clean:
            clean(options)
        if (not options.clean and not options.central
                and options.fork == 0 and options.workers == 0):
            sys.exit('Nothing to do: specify one or more of --central, --workers or --clean')
        if options.fork > 0:
            start_forks(options)
        if options.workers > 0:
            start_workers(self.worker_cls, options.workers, options, arguments)
        if options.central:
            run_central(self.central_cls, options, arguments)


if __name__ == '__main__':
    Hive(None, None).main()
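
# Typical wiring (hypothetical subclasses; see the sketches above): a script
# subclasses Central and Worker, hands both to Hive, and the same script can
# then be launched in either role:
#
#   hive.Hive(CountingCentral, EchoWorker).main()
#
#   $ python myscript.py --central      # run the job distributor
#   $ python myscript.py --workers 4    # run four worker threads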