/multiprocessing/verbose.py

https://gitlab.com/skororu/pysnippets

#!/usr/bin/env python3
"""
test of a suitable method to replace the rather serial main control loop
of dtr (*), as the script was blocking while network file transfers
completed, delaying the start of rendering the next block.

(*) https://gitlab.com/skororu/dtr

simulated scenario:
* 1 computer to run the script
* 3 computers (nodes) to render the blocks

thus the script creates:
* 3 processes, 1 for each node, running function 'render'
  to simulate despatching blocks to each node
* 2 processes, running function 'collect'
  to simulate collecting completed blocks from nodes
plus 1 control loop that checks if the blocks have all been received

in essence the design carries out the renders and performs fetching of
completed blocks concurrently, allowing the next block render to start
immediately - network file transfer is largely eliminated as a source
of overhead.

+-------+    +------+    +---------+                 +--------+
| BLOCK |>>>>|render|>>>>| COLLECT |                 | RETIRE |
| QUEUE |    +------+    |  QUEUE  |    +-------+    | QUEUE  |
|       |    +------+    |         |>>>>|collect|>>>>|        |    +----------+
|       |>>>>|render|>>>>|         |    +-------+    |        |    |  CHECK   |
|       |    +------+    |         |    +-------+    |        |>>>>| COMPLETE |
|       |    +------+    |         |>>>>|collect|>>>>|        |    |          |
|       |>>>>|render|>>>>|         |    +-------+    |        |    +----------+
+-------+    +------+    +---------+                 +--------+         |
                 |                          |                            |
                 +-----<-----<-----<--------+-----<-----<-----<----------+
                    RUN QUEUE (all processes terminate if not empty)
"""

import math                   # ceil
import multiprocessing as mp  # Process, Queue
import os                     # getpid
import queue                  # Empty exception
import random                 # random
import sys                    # argv
import time                   # sleep


##############################################################################
# functions to be run as processes
##############################################################################
def render(node, bl_q, co_q, ru_q):
    """
    simulates despatching blocks to be rendered on remote nodes

    it takes a block number from the block queue (bl_q), waits around for a
    token short period of time to simulate rendering, then places the
    block number on the collection queue (co_q) for later processing
    --------------------------------------------------------------------------
    args
        node : list [x, y]
            where:
                x = string, ip address
                y = int, benchmark time
        bl_q : multiprocessing.Queue
            queue containing ints (blocks to be rendered)
        co_q : multiprocessing.Queue containing lists [x, z]
            where:
                x = string, ip address
                z = int, block number
            (blocks to be collected)
        ru_q : multiprocessing.Queue
            if this queue is not empty, this function should terminate
    --------------------------------------------------------------------------
    returns
        co_q
            mutable type amended in place, no explicit return
    --------------------------------------------------------------------------
    """
    process_str = ' render (' + str(os.getpid()) + '):'
    print(process_str, 'starting')

    while ru_q.empty():
        try:
            # timeout=1 limits poll rate when queue is empty
            block_num = bl_q.get(timeout=1)
        except queue.Empty:
            # handle exception when the queue is empty towards the finish
            continue
        else:
            print(process_str, 'despatching block', block_num, 'to node', node[0])
            # short blocking delay to simulate "render time"
            time.sleep(node[1])
            print(process_str, 'finished block', block_num, 'on node', node[0])
            # add completed block to queue so it can be collected later
            co_q.put([node[0], block_num])

    print(process_str, 'exiting')


def collect(bl_q, co_q, ch_q, ru_q):
    """
    simulates initiating collection of blocks from remote nodes

    it takes a block number from the collection queue (co_q), waits around
    for a token short period of time to simulate file transfer, then places
    the block number on the check queue (ch_q) for later processing.
    --------------------------------------------------------------------------
    args
        bl_q : multiprocessing.Queue
            queue containing ints (blocks to be despatched)
        co_q : multiprocessing.Queue containing lists [x, z]
            where:
                x = string, ip address
                z = int, block number
            (blocks to be fetched)
        ch_q : multiprocessing.Queue containing lists [x, z]
            where:
                x = string, ip address
                z = int, block number
            (blocks to be checked)
        ru_q : multiprocessing.Queue
            if this queue is not empty, this function should terminate
    --------------------------------------------------------------------------
    returns
        ch_q
            mutable type amended in place, no explicit return
    --------------------------------------------------------------------------
    """
    process_str = 'collect (' + str(os.getpid()) + '):'
    print(process_str, 'starting')

    while ru_q.empty():
        try:
            # timeout=1 limits poll rate when queue is empty
            block_info = co_q.get(timeout=1)
        except queue.Empty:
            # handle exception when the queue is empty towards the finish
            continue
        else:
            print(process_str, 'collecting block', block_info[1], 'from node', block_info[0])
            # short blocking delay to simulate "network transfer"
            time.sleep(random.random())

            # deliberately fail some of the collected blocks
            if random.random() > 0.2:
                ch_q.put(block_info)
            else:
                # despatch block again
                print('** collection of block', block_info[1], 'failed, despatch again **')
                bl_q.put(block_info[1])

    print(process_str, 'exiting')


##############################################################################
def main():
    """
    initialise concurrent processes and checking

    we will use one render process per node, as the processes will block
    during rendering.

    for collect processes, given that:
    * the time taken to render a block is very much greater than the time
      taken to transfer it over the network
    * variance in render times between nodes of similar performance is
      usually greater than the network transfer time
    * rendering and network transfers proceed concurrently
    it is thus unlikely we will need to handle many concurrent network file
    transfers, and even if a few are queued they will all be transferred
    long before the next render completes.

    in essence, we need fewer collect processes than render processes
    """
    ##########################################################################
    # user settings: get num_nodes from command line if present
    ##########################################################################
    try:
        num_nodes = int(sys.argv[1])
    except (IndexError, ValueError):
        num_nodes = 3

    num_blocks = num_nodes * 2

    ##########################################################################
    # initialise
    ##########################################################################
    blocks_q = mp.Queue()
    collect_q = mp.Queue()
    retire_q = mp.Queue()
    run_q = mp.Queue()

    # fill queue with blocks to be rendered
    all_blocks = list(range(num_blocks))
    for block in all_blocks:
        blocks_q.put(block)

    # create some nodes with random performance data, one entry per node
    nodes = [[n, random.randint(1, 8)] for n in range(num_nodes)]
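    # nodes might now look like [[0, 3], [1, 7], [2, 5]]: each entry pairs a
    # node identifier (standing in for an ip address) with a simulated
    # per-block render time in seconds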

    # initialise a render process for each node
    parfun = []
    for node in nodes:
        parfun.append(mp.Process(target=render, args=(node, blocks_q, collect_q, run_q)))

    # initialise an appropriate number of collect processes
    num_collectors = math.ceil(len(nodes) / 4)
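    # one collector per four render nodes, rounded up - for example
    # 3 nodes -> ceil(3/4) = 1 collector, 8 nodes -> ceil(8/4) = 2 collectors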
    for _ in range(num_collectors):
        parfun.append(mp.Process(target=collect, args=(blocks_q, collect_q, retire_q, run_q)))

    # give the user basic configuration information
    print(num_blocks, 'blocks to be rendered on', num_nodes, 'nodes, using',
          num_nodes, 'render processes and', num_collectors, 'collector processes')
    for node in nodes:
        print('node', node[0], 'block render time', str(node[1]) + 's')

    ##########################################################################
    # start
    ##########################################################################
    # start up render and collect processes for each node
    for pfu in parfun:
        pfu.start()

    # check that all the block transfers have completed
    checked = []
    while len(checked) < len(all_blocks):
        try:
            # timeout=1 limits poll rate when queue is empty
            block_info = retire_q.get(timeout=1)
        except queue.Empty:
            # handle exception when the queue is empty towards the finish
            continue
        else:
            checked.append(block_info[1])
            print('transfer of block', block_info[1], 'succeeded')

    ##########################################################################
    # tidy up and exit
    ##########################################################################
    # indicate to render and collect processes that they can terminate now
    run_q.put(1)

    # tidy up
    for pfu in parfun:
        pfu.join()

    if sorted(checked) != all_blocks:
        sys.exit("failed")


##############################################################################
if __name__ == '__main__':
    main()