/multiprocessing/verbose.py
https://gitlab.com/skororu/pysnippets
#!/usr/bin/env python3
"""
test of a suitable method to replace the rather serial main control loop
of dtr (*) as the script was blocking while network file transfers were
completed, delaying the start of rendering the next block.
(*) https://gitlab.com/skororu/dtr
simulated scenario:
* 1 computer to run the script
* 3 computers (nodes) to render the blocks
thus the script creates:
* 3 processes, 1 for each node, running function 'render'
  to simulate despatching blocks to each node
* 2 processes, running function 'collect'
  to simulate collecting completed blocks from nodes
* 1 control loop that checks if the blocks have all been received
in essence the design carries out the renders and performs
fetching of completed blocks concurrently, allowing the next block
render to start immediately - network file transfer is largely eliminated
as a source of overhead.
+-------+    +------+    +---------+                 +--------+
| BLOCK |>>>>|render|>>>>| COLLECT |                 | RETIRE |
| QUEUE |    +------+    |  QUEUE  |    +-------+    | QUEUE  |
|       |                |         |>>>>|collect|>>>>|        |    +----------+
|       |    +------+    |         |    +-------+    |        |    |  CHECK   |
|       |>>>>|render|>>>>|         |                 |        |>>>>| COMPLETE |
|       |    +------+    |         |    +-------+    |        |    |          |
|       |                |         |>>>>|collect|>>>>|        |    +----------+
|       |    +------+    |         |    +-------+    |        |         |
|       |>>>>|render|>>>>|         |                 |        |         |
+-------+    +------+    +---------+        |        +--------+         |
    |                                       |                           |
    +---<-----<-----<-----<-----<-----<-----+--<-----<-----<-----<------+
              RUN QUEUE (all processes terminate if not empty)
"""
import math                   # ceil
import multiprocessing as mp  # Process, Queue
import os                     # getpid
import queue                  # Empty exception
import random                 # random
import sys                    # argv
import time                   # sleep
##############################################################################
# functions to be run as processes
##############################################################################
def render(node, bl_q, co_q, ru_q):
    """
    simulates despatching blocks to be rendered on remote nodes
    it takes a block number from the block queue (bl_q), waits for a
    short token period of time to simulate rendering, then places the
    block number on the collection queue (co_q) for later processing
    --------------------------------------------------------------------------
    args
        node : list [x, y]
            where:
                x = string, ip address
                y = int, benchmark time
        bl_q : multiprocessing.Queue
            queue containing ints (blocks to be rendered)
        co_q : multiprocessing.Queue containing lists [x, z]
            where:
                x = string, ip address
                z = int, block number
            (blocks to be collected)
        ru_q : multiprocessing.Queue
            if this queue is not empty, this function should terminate
    --------------------------------------------------------------------------
    returns
        co_q
            mutable type amended in place, no explicit return
    --------------------------------------------------------------------------
    """
    process_str = ' render (' + str(os.getpid()) + '):'
    print(process_str, 'starting')
    while ru_q.empty():
        try:
            # timeout=1 limits poll rate when queue is empty
            block_num = bl_q.get(timeout=1)
        except queue.Empty:
            # handle exception when the queue is empty towards the finish
            continue
        else:
            print(process_str, 'despatching block', block_num, 'to node', node[0])
            # short blocking delay to simulate "render time"
            time.sleep(node[1])
            print(process_str, 'finished block', block_num, 'on node', node[0])
            # add completed block to queue so it can be collected later
            co_q.put([node[0], block_num])
    print(process_str, 'exiting')
def collect(bl_q, co_q, ch_q, ru_q):
    """
    simulates initiating collection of blocks from remote nodes
    it takes a block number from the collection queue (co_q), waits for a
    short token period of time to simulate file transfer, then places
    the block number on the check queue (ch_q) for later processing.
    --------------------------------------------------------------------------
    args
        bl_q : multiprocessing.Queue
            queue containing ints (blocks to be despatched)
        co_q : multiprocessing.Queue containing lists [x, z]
            where:
                x = string, ip address
                z = int, block number
            (blocks to be fetched)
        ch_q : multiprocessing.Queue containing lists [x, z]
            where:
                x = string, ip address
                z = int, block number
            (blocks to be checked)
        ru_q : multiprocessing.Queue
            if this queue is not empty, this function should terminate
    --------------------------------------------------------------------------
    returns
        ch_q
            mutable type amended in place, no explicit return
    --------------------------------------------------------------------------
    """
    process_str = 'collect (' + str(os.getpid()) + '):'
    print(process_str, 'starting')
    while ru_q.empty():
        try:
            # timeout=1 limits poll rate when queue is empty
            block_info = co_q.get(timeout=1)
        except queue.Empty:
            # handle exception when the queue is empty towards the finish
            continue
        else:
            print(process_str, 'collecting block', block_info[1], 'from node', block_info[0])
            # short blocking delay to simulate "network transfer"
            time.sleep(random.random())
            # deliberately fail some of the collected blocks
            if random.random() > 0.2:
                ch_q.put(block_info)
            else:
                # despatch block again
                print('** collection of block', block_info[1], 'failed, despatch again **')
                bl_q.put(block_info[1])
    print(process_str, 'exiting')
##############################################################################
def main():
    """
    initialise concurrent processes and checking
    we will use one render process per node, as the processes will block
    during rendering.
    for collect processes, given that:
    * the time taken to render a block is very much greater than the time
      taken to transfer it over the network
    * variance in render times between nodes of similar performance is
      usually greater than the network transfer time
    * rendering and network transfers proceed concurrently
    it is thus unlikely we will need to handle many concurrent
    network file transfers, and even if a few are queued they will all be
    transferred long before the next render completes.
    in essence, we need fewer collect processes than render processes
    (see the worked example just below)
    """
    ##########################################################################
    # user settings: get num_nodes from command line if present
    ##########################################################################
    try:
        num_nodes = int(sys.argv[1])
    except (IndexError, ValueError):
        # no argument given, or it wasn't an integer: fall back to the default
        num_nodes = 3
    num_blocks = num_nodes * 2
    ##########################################################################
    # initialise
    ##########################################################################
    blocks_q = mp.Queue()
    collect_q = mp.Queue()
    retire_q = mp.Queue()
    run_q = mp.Queue()
    # fill queue with blocks to be rendered
    all_blocks = list(range(num_blocks))
    for block in all_blocks:
        blocks_q.put(block)
    # create some nodes with random performance data (one entry per node)
    nodes = [[n, random.randint(1, 8)] for n in range(num_nodes)]
    # initialise a render process for each node
    parfun = []
    for node in nodes:
        parfun.append(mp.Process(target=render, args=(node, blocks_q, collect_q, run_q)))
    # initialise an appropriate number of collect processes
    num_collectors = math.ceil(len(nodes) / 4)
    for _ in range(num_collectors):
        parfun.append(mp.Process(target=collect, args=(blocks_q, collect_q, retire_q, run_q)))
    # give the user basic configuration information
    print(num_blocks, 'blocks to be rendered on', num_nodes, 'nodes, using',
          num_nodes, 'render processes and', num_collectors, 'collector processes')
    for node in nodes:
        print('node', node[0], 'block render time', str(node[1]) + 's')
    ##########################################################################
    # start
    ##########################################################################
    # start up render and collect processes for each node
    for pfu in parfun:
        pfu.start()
    # check that all the block transfers have completed
    checked = []
    while len(checked) < len(all_blocks):
        try:
            # timeout=1 limits poll rate when queue is empty
            block_info = retire_q.get(timeout=1)
        except queue.Empty:
            # handle exception when the queue is empty towards the finish
            continue
        else:
            checked.append(block_info[1])
            print('transfer of block', block_info[1], 'succeeded')
    ##########################################################################
    # tidy up and exit
    ##########################################################################
    # indicate to render and collect processes that they can terminate now
    run_q.put(1)
    # tidy up
    for pfu in parfun:
        pfu.join()
    if sorted(checked) != all_blocks:
        sys.exit("failed")
##############################################################################
if __name__ == '__main__':
    main()