/dumbo/backends/unix.py

https://github.com/uwstatsclub/mapred_intro · Python · 131 lines · 105 code · 21 blank · 5 comment · 20 complexity · 0399c9499f67aa7546ad4eeee0441958 MD5 · raw file

  1. '''
  2. Created on 26 Jul 2010
  3. @author: klaas
  4. '''
  5. import sys
  6. import operator
  7. from dumbo.backends.common import Backend, Iteration, FileSystem
  8. from dumbo.util import getopt, getopts, configopts, envdef, execute
  9. from dumbo.cmd import decodepipe
  class UnixBackend(Backend):
      """Backend whose iterations and file system use the Unix classes below."""

      def matches(self, opts):
          """Report whether this backend can handle ``opts``; always True."""
          # always matches, but it's last in the list
          return True

      def create_iteration(self, opts):
          """Return a UnixIteration for the first 'prog' value found in ``opts``."""
          prog = getopt(opts, 'prog')[0]
          return UnixIteration(prog, opts)

      def create_filesystem(self, opts):
          """Return a new UnixFileSystem; ``opts`` is not consulted."""
          return UnixFileSystem()
  class UnixIteration(Iteration):
      """Run a single iteration locally as one Unix shell pipeline.

      The pipeline chains dumbo's ``encodepipe`` command, the mapper and,
      unless zero reduce tasks were requested, ``sort`` and the reducer.
      The resulting command line is handed to ``execute``.
      """

      def __init__(self, prog, opts):
          """Initialise the base iteration and add the 'unix' config options.

          prog -- the program this iteration runs
          opts -- the options for this iteration; after base initialisation
                  ``self.opts`` is extended with the result of
                  ``configopts('unix', prog, self.opts)``
          """
          Iteration.__init__(self, prog, opts)
          self.opts += configopts('unix', prog, self.opts)

      def run(self):
          """Build the shell pipeline for this iteration and execute it.

          Returns the result of ``Iteration.run`` when that is non-zero, 1
          when no input or no output option was given, and otherwise the
          value returned by ``execute`` for the pipeline (presumably the
          shell exit status -- confirm against dumbo.util.execute).
          """
          retval = Iteration.run(self)
          if retval != 0:
              return retval
          addedopts = getopts(self.opts, [
              'input',
              'output',
              'mapper',
              'reducer',
              'libegg',
              'delinputs',
              'cmdenv',
              'pv',
              'addpath',
              'inputformat',
              'outputformat',
              'numreducetasks',
              'python',
              'pypath',
              'sorttmpdir',
              'sortbufsize',
          ])
          # NOTE(review): mapper and reducer are indexed before any
          # validation; presumably Iteration.run guarantees both are set.
          mapper = addedopts['mapper'][0]
          reducer = addedopts['reducer'][0]
          if not addedopts['input'] or not addedopts['output']:
              sys.stderr.write('ERROR: input or output not specified\n')
              return 1
          # A single input option value may hold several space-separated
          # paths; flatten all of them into one list.
          inputs = [path for spec in addedopts['input']
                    for path in spec.split(' ')]
          output = addedopts['output'][0]
          pyenv = envdef('PYTHONPATH', addedopts['libegg'],
                         shortcuts=dict(configopts('eggs', self.prog)),
                         extrapaths=addedopts['pypath'])
          # Split on the first '=' only, so a value that itself contains '='
          # (e.g. KEY=a=b) still yields exactly one name and one value
          # instead of breaking the two-slot format string.
          cmdenv = ' '.join("%s='%s'" % tuple(arg.split('=', 1))
                            for arg in addedopts['cmdenv'])
          # Optional progress meters: one pv stage after each pipeline step.
          if addedopts['pv'] and addedopts['pv'][0] == 'yes':
              mpv = '| pv -s `du -b %s | cut -f 1` -cN map ' % ' '.join(inputs)
              (spv, rpv) = ('| pv -cN sort ', '| pv -cN reduce ')
          else:
              (mpv, spv, rpv) = ('', '', '')
          # Extra arguments for sort: temporary directory and buffer size.
          (sorttmpdir, sortbufsize) = ('', '')
          if addedopts['sorttmpdir']:
              sorttmpdir = "-T %s" % addedopts['sorttmpdir'][0]
          if addedopts['sortbufsize']:
              sortbufsize = "-S %s" % addedopts['sortbufsize'][0]
          python = addedopts['python'][0]
          encodepipe = pyenv + ' ' + python + \
              ' -m dumbo.cmd encodepipe -file ' + ' -file '.join(inputs)
          if addedopts['inputformat'] and addedopts['inputformat'][0] == 'code':
              encodepipe += ' -alreadycoded yes'
          if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
              encodepipe += ' -addpath yes'
          if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
              # Map-only job: the mapper output goes straight to the output.
              retval = execute("%s | %s %s %s %s > '%s'" % (encodepipe,
                                                            pyenv,
                                                            cmdenv,
                                                            mapper,
                                                            mpv,
                                                            output))
          else:
              # LC_ALL=C is set for the sort stage between mapper and reducer.
              retval = execute("%s | %s %s %s %s| LC_ALL=C sort %s %s %s| %s %s %s %s> '%s'"
                               % (encodepipe,
                                  pyenv,
                                  cmdenv,
                                  mapper,
                                  mpv,
                                  sorttmpdir,
                                  sortbufsize,
                                  spv,
                                  pyenv,
                                  cmdenv,
                                  reducer,
                                  rpv,
                                  output))
          # NOTE(review): inputs are removed even when the pipeline result is
          # a failure -- confirm that this is intended.
          if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
              for inputspec in addedopts['input']:
                  execute('rm ' + inputspec)
          return retval
  class UnixFileSystem(FileSystem):
      """File-system operations carried out with local Unix commands.

      Apart from ``cat``, every method formats a shell command, runs it
      through ``execute`` with ``printcmd=False`` and returns the result
      of ``execute`` unchanged.
      """

      def cat(self, path, opts):
          """Return ``decodepipe`` applied to ``opts`` plus a 'file' option."""
          fileopt = ('file', path)
          return decodepipe(opts + [fileopt])

      def ls(self, path, opts):
          """Run ``ls -l`` on ``path``."""
          command = "ls -l '%s'" % path
          return execute(command, printcmd=False)

      def exists(self, path, opts):
          """Run ``test -e`` on ``path``."""
          command = "test -e '%s'" % path
          return execute(command, printcmd=False)

      def rm(self, path, opts):
          """Run ``rm -rf`` on ``path``."""
          command = "rm -rf '%s'" % path
          return execute(command, printcmd=False)

      def put(self, path1, path2, opts):
          """Copy ``path1`` to ``path2`` with ``cp``."""
          command = "cp '%s' '%s'" % (path1, path2)
          return execute(command, printcmd=False)

      def get(self, path1, path2, opts):
          """Copy ``path1`` to ``path2`` with ``cp``."""
          command = "cp '%s' '%s'" % (path1, path2)
          return execute(command, printcmd=False)