PageRenderTime 93ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/boto-2.5.2/boto/emr/step.py

#
Python | 232 lines | 104 code | 38 blank | 90 comment | 16 complexity | 8f18cd50a2eb26586f88d64e05411149 MD5 | raw file
  1. # Copyright (c) 2010 Spotify AB
  2. # Copyright (c) 2010-2011 Yelp
  3. #
  4. # Permission is hereby granted, free of charge, to any person obtaining a
  5. # copy of this software and associated documentation files (the
  6. # "Software"), to deal in the Software without restriction, including
  7. # without limitation the rights to use, copy, modify, merge, publish, dis-
  8. # tribute, sublicense, and/or sell copies of the Software, and to permit
  9. # persons to whom the Software is furnished to do so, subject to the fol-
  10. # lowing conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included
  13. # in all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  16. # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
  17. # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
  18. # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  19. # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21. # IN THE SOFTWARE.
  22. class Step(object):
  23. """
  24. Jobflow Step base class
  25. """
  26. def jar(self):
  27. """
  28. :rtype: str
  29. :return: URI to the jar
  30. """
  31. raise NotImplemented()
  32. def args(self):
  33. """
  34. :rtype: list(str)
  35. :return: List of arguments for the step
  36. """
  37. raise NotImplemented()
  38. def main_class(self):
  39. """
  40. :rtype: str
  41. :return: The main class name
  42. """
  43. raise NotImplemented()
  44. class JarStep(Step):
  45. """
  46. Custom jar step
  47. """
  48. def __init__(self, name, jar, main_class=None,
  49. action_on_failure='TERMINATE_JOB_FLOW', step_args=None):
  50. """
  51. A elastic mapreduce step that executes a jar
  52. :type name: str
  53. :param name: The name of the step
  54. :type jar: str
  55. :param jar: S3 URI to the Jar file
  56. :type main_class: str
  57. :param main_class: The class to execute in the jar
  58. :type action_on_failure: str
  59. :param action_on_failure: An action, defined in the EMR docs to take on failure.
  60. :type step_args: list(str)
  61. :param step_args: A list of arguments to pass to the step
  62. """
  63. self.name = name
  64. self._jar = jar
  65. self._main_class = main_class
  66. self.action_on_failure = action_on_failure
  67. if isinstance(step_args, basestring):
  68. step_args = [step_args]
  69. self.step_args = step_args
  70. def jar(self):
  71. return self._jar
  72. def args(self):
  73. args = []
  74. if self.step_args:
  75. args.extend(self.step_args)
  76. return args
  77. def main_class(self):
  78. return self._main_class
  79. class StreamingStep(Step):
  80. """
  81. Hadoop streaming step
  82. """
  83. def __init__(self, name, mapper, reducer=None, combiner=None,
  84. action_on_failure='TERMINATE_JOB_FLOW',
  85. cache_files=None, cache_archives=None,
  86. step_args=None, input=None, output=None,
  87. jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar'):
  88. """
  89. A hadoop streaming elastic mapreduce step
  90. :type name: str
  91. :param name: The name of the step
  92. :type mapper: str
  93. :param mapper: The mapper URI
  94. :type reducer: str
  95. :param reducer: The reducer URI
  96. :type combiner: str
  97. :param combiner: The combiner URI. Only works for Hadoop 0.20 and later!
  98. :type action_on_failure: str
  99. :param action_on_failure: An action, defined in the EMR docs to take on failure.
  100. :type cache_files: list(str)
  101. :param cache_files: A list of cache files to be bundled with the job
  102. :type cache_archives: list(str)
  103. :param cache_archives: A list of jar archives to be bundled with the job
  104. :type step_args: list(str)
  105. :param step_args: A list of arguments to pass to the step
  106. :type input: str or a list of str
  107. :param input: The input uri
  108. :type output: str
  109. :param output: The output uri
  110. :type jar: str
  111. :param jar: The hadoop streaming jar. This can be either a local path on the master node, or an s3:// URI.
  112. """
  113. self.name = name
  114. self.mapper = mapper
  115. self.reducer = reducer
  116. self.combiner = combiner
  117. self.action_on_failure = action_on_failure
  118. self.cache_files = cache_files
  119. self.cache_archives = cache_archives
  120. self.input = input
  121. self.output = output
  122. self._jar = jar
  123. if isinstance(step_args, basestring):
  124. step_args = [step_args]
  125. self.step_args = step_args
  126. def jar(self):
  127. return self._jar
  128. def main_class(self):
  129. return None
  130. def args(self):
  131. args = []
  132. # put extra args BEFORE -mapper and -reducer so that e.g. -libjar
  133. # will work
  134. if self.step_args:
  135. args.extend(self.step_args)
  136. args.extend(['-mapper', self.mapper])
  137. if self.combiner:
  138. args.extend(['-combiner', self.combiner])
  139. if self.reducer:
  140. args.extend(['-reducer', self.reducer])
  141. else:
  142. args.extend(['-jobconf', 'mapred.reduce.tasks=0'])
  143. if self.input:
  144. if isinstance(self.input, list):
  145. for input in self.input:
  146. args.extend(('-input', input))
  147. else:
  148. args.extend(('-input', self.input))
  149. if self.output:
  150. args.extend(('-output', self.output))
  151. if self.cache_files:
  152. for cache_file in self.cache_files:
  153. args.extend(('-cacheFile', cache_file))
  154. if self.cache_archives:
  155. for cache_archive in self.cache_archives:
  156. args.extend(('-cacheArchive', cache_archive))
  157. return args
  158. def __repr__(self):
  159. return '%s.%s(name=%r, mapper=%r, reducer=%r, action_on_failure=%r, cache_files=%r, cache_archives=%r, step_args=%r, input=%r, output=%r, jar=%r)' % (
  160. self.__class__.__module__, self.__class__.__name__,
  161. self.name, self.mapper, self.reducer, self.action_on_failure,
  162. self.cache_files, self.cache_archives, self.step_args,
  163. self.input, self.output, self._jar)
  164. class ScriptRunnerStep(JarStep):
  165. ScriptRunnerJar = 's3n://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar'
  166. def __init__(self, name, **kw):
  167. JarStep.__init__(self, name, self.ScriptRunnerJar, **kw)
  168. class PigBase(ScriptRunnerStep):
  169. BaseArgs = ['s3n://us-east-1.elasticmapreduce/libs/pig/pig-script',
  170. '--base-path', 's3n://us-east-1.elasticmapreduce/libs/pig/']
  171. class InstallPigStep(PigBase):
  172. """
  173. Install pig on emr step
  174. """
  175. InstallPigName = 'Install Pig'
  176. def __init__(self, pig_versions='latest'):
  177. step_args = []
  178. step_args.extend(self.BaseArgs)
  179. step_args.extend(['--install-pig'])
  180. step_args.extend(['--pig-versions', pig_versions])
  181. ScriptRunnerStep.__init__(self, self.InstallPigName, step_args=step_args)
  182. class PigStep(PigBase):
  183. """
  184. Pig script step
  185. """
  186. def __init__(self, name, pig_file, pig_versions='latest', pig_args=[]):
  187. step_args = []
  188. step_args.extend(self.BaseArgs)
  189. step_args.extend(['--pig-versions', pig_versions])
  190. step_args.extend(['--run-pig-script', '--args', '-f', pig_file])
  191. step_args.extend(pig_args)
  192. ScriptRunnerStep.__init__(self, name, step_args=step_args)