
/pydoop/app/script.py

https://bitbucket.org/jagan/pydoop
#!/usr/bin/env python

# BEGIN_COPYRIGHT
#
# Copyright 2012 CRS4.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

"""
A quick and easy-to-use interface for running simple MapReduce jobs.
"""

import os, sys, warnings, logging

logging.basicConfig(level=logging.INFO)

import pydoop
import pydoop.hdfs as hdfs
import pydoop.hadut as hadut
# Template for the pipes driver script: it is instantiated with the user's
# module name and map/reduce function names, then shipped to the cluster as
# the job executable.
PIPES_TEMPLATE = """
import sys, os
sys.path.insert(0, os.getcwd())

import pydoop.pipes
import %(module)s


class ContextWriter(object):

    def __init__(self, context):
        self.context = context
        self.counters = {}

    def emit(self, k, v):
        self.context.emit(str(k), str(v))

    def count(self, what, howmany):
        if what in self.counters:
            counter = self.counters[what]
        else:
            counter = self.context.getCounter('%(module)s', what)
            self.counters[what] = counter
        self.context.incrementCounter(counter, howmany)

    def status(self, msg):
        self.context.setStatus(msg)

    def progress(self):
        self.context.progress()


class PydoopScriptMapper(pydoop.pipes.Mapper):

    def __init__(self, ctx):
        super(type(self), self).__init__(ctx)
        self.writer = ContextWriter(ctx)

    def map(self, ctx):
        %(module)s.%(map_fn)s(ctx.getInputKey(), ctx.getInputValue(), self.writer)


class PydoopScriptReducer(pydoop.pipes.Reducer):

    def __init__(self, ctx):
        super(type(self), self).__init__(ctx)
        self.writer = ContextWriter(ctx)

    @staticmethod
    def iter(ctx):
        while ctx.nextValue():
            yield ctx.getInputValue()

    def reduce(self, ctx):
        key = ctx.getInputKey()
        %(module)s.%(reduce_fn)s(key, PydoopScriptReducer.iter(ctx), self.writer)


if __name__ == '__main__':
    result = pydoop.pipes.runTask(pydoop.pipes.Factory(
        PydoopScriptMapper, PydoopScriptReducer
    ))
    sys.exit(0 if result else 1)
"""
DEFAULT_REDUCE_TASKS = 3 * hadut.get_num_nodes()


def find_pydoop_jar():
    pydoop_jar_path = os.path.join(
        os.path.dirname(pydoop.__file__), pydoop.__jar_name__
    )
    if os.path.exists(pydoop_jar_path):
        return pydoop_jar_path
    else:
        return None


def kv_pair(s):
    # argparse type for -D NAME=VALUE: split on the first '=' only
    return s.split("=", 1)
class PydoopScript(object):

    DESCRIPTION = "Easy MapReduce scripting with Pydoop"

    def __init__(self):
        self.logger = logging.getLogger("PydoopScript")
        self.logger.setLevel(logging.DEBUG)  # TODO: expose as a cli param
        self.properties = {
            'hadoop.pipes.java.recordreader': 'true',
            'hadoop.pipes.java.recordwriter': 'true',
            'mapred.cache.files': '',
            'mapred.create.symlink': 'yes',
            'mapred.compress.map.output': 'true',
            'bl.libhdfs.opts': '-Xmx48m',
        }
        self.args = None
        self.runner = None

    def set_args(self, args):
        parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
        prefix = hdfs.path.join(parent, "pydoop_script_")
        self.runner = hadut.PipesRunner(prefix=prefix, logger=self.logger)
        module_bn = os.path.basename(args.module)
        self.properties['mapred.job.name'] = module_bn
        self.properties.update(dict(args.D or []))
        self.properties['mapred.reduce.tasks'] = args.num_reducers
        self.properties['mapred.textoutputformat.separator'] = args.kv_separator
        # ship the user's module to the task nodes via the distributed cache
        remote_module = hdfs.path.join(self.runner.wd, module_bn)
        hdfs.put(args.module, remote_module)
        dist_cache_parameter = "%s#%s" % (remote_module, module_bn)
        if self.properties['mapred.cache.files']:
            self.properties['mapred.cache.files'] += ','
        self.properties['mapred.cache.files'] += dist_cache_parameter
        self.args = args
    def __generate_pipes_code(self):
        lines = []
        ld_path = os.environ.get('LD_LIBRARY_PATH', None)
        pypath = os.environ.get('PYTHONPATH', '')
        # bash/Python polyglot header: bash parses the '""":"' line as the
        # no-op ':' command and goes on to exec the Python interpreter on
        # this same file, while Python parses it as the start of a string
        # literal that swallows the bash lines
        lines.append("#!/bin/bash")
        lines.append('""":"')
        if ld_path:
            lines.append('export LD_LIBRARY_PATH="%s"' % ld_path)
        if pypath:
            lines.append('export PYTHONPATH="%s"' % pypath)
        # override the script's home directory, unless told otherwise
        if ("mapreduce.admin.user.home.dir" not in self.properties and
                'HOME' in os.environ and
                not self.args.no_override_home):
            lines.append('export HOME="%s"' % os.environ['HOME'])
        lines.append('exec "%s" -u "$0" "$@"' % sys.executable)
        lines.append('":"""')
        template_args = {
            'module': os.path.splitext(os.path.basename(self.args.module))[0],
            'map_fn': self.args.map_fn,
            'reduce_fn': self.args.reduce_fn,
        }
        lines.append(PIPES_TEMPLATE % template_args)
        return os.linesep.join(lines) + os.linesep
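    # For illustration (not in the original source): the generated driver
    # begins with a header roughly like the following (interpreter path and
    # environment values vary by host):
    #
    #   #!/bin/bash
    #   """:"
    #   export PYTHONPATH="/home/user/lib"
    #   exec "/usr/bin/python" -u "$0" "$@"
    #   ":"""
    #
    # followed by PIPES_TEMPLATE filled in with the user's module and
    # function names.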
    def __validate(self):
        if not hdfs.path.exists(self.args.input):
            raise RuntimeError("%r does not exist" % (self.args.input,))
        if hdfs.path.exists(self.args.output):
            raise RuntimeError("%r already exists" % (self.args.output,))
    def run(self):
        if self.args is None:
            raise RuntimeError("cannot run without args, please call set_args")
        self.__validate()
        pipes_args = []
        if self.properties['mapred.textoutputformat.separator'] == '':
            # an empty separator requires the custom output format shipped
            # in the pydoop jar
            pydoop_jar = find_pydoop_jar()
            if pydoop_jar is not None:
                self.properties[
                    'mapred.output.format.class'
                ] = 'it.crs4.pydoop.NoSeparatorTextOutputFormat'
                pipes_args.extend(['-libjars', pydoop_jar])
            else:
                warnings.warn(
                    "Can't find pydoop.jar, output will probably be tab-separated"
                )
        pipes_code = self.__generate_pipes_code()
        self.runner.set_input(self.args.input)
        self.runner.set_output(self.args.output)
        self.runner.set_exe(pipes_code)
        self.runner.run(more_args=pipes_args, properties=self.properties)
def run(args):
    script = PydoopScript()
    script.set_args(args)
    script.run()
    return 0
def add_parser(subparsers):
    parser = subparsers.add_parser(
        "script", description=PydoopScript.DESCRIPTION
    )
    parser.add_argument('module', metavar='MODULE', help='python module file')
    parser.add_argument('input', metavar='INPUT', help='hdfs input path')
    parser.add_argument('output', metavar='OUTPUT', help='hdfs output path')
    parser.add_argument('-m', '--map-fn', metavar='MAP', default='mapper',
                        help="name of map function within module")
    parser.add_argument('-r', '--reduce-fn', metavar='RED', default='reducer',
                        help="name of reduce function within module")
    parser.add_argument('-t', '--kv-separator', metavar='SEP', default='\t',
                        help="output key-value separator")
    parser.add_argument(
        '--num-reducers', metavar='INT', type=int, default=DEFAULT_REDUCE_TASKS,
        help="Number of reduce tasks. Specify 0 to only perform map phase"
    )
    parser.add_argument(
        '--no-override-home', action='store_true',
        help="Don't set the script's HOME directory to the $HOME in your "
             "environment. Hadoop will set it to the value of the "
             "'mapreduce.admin.user.home.dir' property"
    )
    parser.add_argument(
        '-D', metavar="NAME=VALUE", type=kv_pair, action="append",
        help='Set a Hadoop property, such as -D mapred.compress.map.output=true'
    )
    parser.set_defaults(func=run)
    return parser
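
Usage sketch (assumptions: this module is exposed as the "script" sub-command
of the pydoop command-line tool, wc.py is a local file defining the mapper and
reducer functions shown in the example above, and all paths are illustrative):

    pydoop script wc.py /user/me/input /user/me/wc_output --num-reducers 4

The module file is copied into the job's HDFS working directory and shipped to
the task nodes through mapred.cache.files; additional Hadoop properties can be
passed with repeated -D NAME=VALUE options.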