PageRenderTime 75ms CodeModel.GetById 64ms app.highlight 8ms RepoModel.GetById 1ms app.codeStats 0ms

/boto-2.5.2/boto/emr/step.py

#
Python | 232 lines | 104 code | 38 blank | 90 comment | 17 complexity | 8f18cd50a2eb26586f88d64e05411149 MD5 | raw file
  1# Copyright (c) 2010 Spotify AB
  2# Copyright (c) 2010-2011 Yelp
  3#
  4# Permission is hereby granted, free of charge, to any person obtaining a
  5# copy of this software and associated documentation files (the
  6# "Software"), to deal in the Software without restriction, including
  7# without limitation the rights to use, copy, modify, merge, publish, dis-
  8# tribute, sublicense, and/or sell copies of the Software, and to permit
  9# persons to whom the Software is furnished to do so, subject to the fol-
 10# lowing conditions:
 11#
 12# The above copyright notice and this permission notice shall be included
 13# in all copies or substantial portions of the Software.
 14#
 15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 16# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
 17# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
 18# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 19# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 21# IN THE SOFTWARE.
 22
 23class Step(object):
 24    """
 25    Jobflow Step base class
 26    """
 27    def jar(self):
 28        """
 29        :rtype: str
 30        :return: URI to the jar
 31        """
 32        raise NotImplemented()
 33
 34    def args(self):
 35        """
 36        :rtype: list(str)
 37        :return: List of arguments for the step
 38        """
 39        raise NotImplemented()
 40
 41    def main_class(self):
 42        """
 43        :rtype: str
 44        :return: The main class name
 45        """
 46        raise NotImplemented()
 47
 48
 49class JarStep(Step):
 50    """
 51    Custom jar step
 52    """
 53    def __init__(self, name, jar, main_class=None,
 54                 action_on_failure='TERMINATE_JOB_FLOW', step_args=None):
 55        """
 56        A elastic mapreduce step that executes a jar
 57
 58        :type name: str
 59        :param name: The name of the step
 60        :type jar: str
 61        :param jar: S3 URI to the Jar file
 62        :type main_class: str
 63        :param main_class: The class to execute in the jar
 64        :type action_on_failure: str
 65        :param action_on_failure: An action, defined in the EMR docs to take on failure.
 66        :type step_args: list(str)
 67        :param step_args: A list of arguments to pass to the step
 68        """
 69        self.name = name
 70        self._jar = jar
 71        self._main_class = main_class
 72        self.action_on_failure = action_on_failure
 73
 74        if isinstance(step_args, basestring):
 75            step_args = [step_args]
 76
 77        self.step_args = step_args
 78
 79    def jar(self):
 80        return self._jar
 81
 82    def args(self):
 83        args = []
 84
 85        if self.step_args:
 86            args.extend(self.step_args)
 87
 88        return args
 89
 90    def main_class(self):
 91        return self._main_class
 92
 93
 94class StreamingStep(Step):
 95    """
 96    Hadoop streaming step
 97    """
 98    def __init__(self, name, mapper, reducer=None, combiner=None,
 99                 action_on_failure='TERMINATE_JOB_FLOW',
100                 cache_files=None, cache_archives=None,
101                 step_args=None, input=None, output=None,
102                 jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar'):
103        """
104        A hadoop streaming elastic mapreduce step
105
106        :type name: str
107        :param name: The name of the step
108        :type mapper: str
109        :param mapper: The mapper URI
110        :type reducer: str
111        :param reducer: The reducer URI
112        :type combiner: str
113        :param combiner: The combiner URI. Only works for Hadoop 0.20 and later!
114        :type action_on_failure: str
115        :param action_on_failure: An action, defined in the EMR docs to take on failure.
116        :type cache_files: list(str)
117        :param cache_files: A list of cache files to be bundled with the job
118        :type cache_archives: list(str)
119        :param cache_archives: A list of jar archives to be bundled with the job
120        :type step_args: list(str)
121        :param step_args: A list of arguments to pass to the step
122        :type input: str or a list of str
123        :param input: The input uri
124        :type output: str
125        :param output: The output uri
126        :type jar: str
127        :param jar: The hadoop streaming jar. This can be either a local path on the master node, or an s3:// URI.
128        """
129        self.name = name
130        self.mapper = mapper
131        self.reducer = reducer
132        self.combiner = combiner
133        self.action_on_failure = action_on_failure
134        self.cache_files = cache_files
135        self.cache_archives = cache_archives
136        self.input = input
137        self.output = output
138        self._jar = jar
139
140        if isinstance(step_args, basestring):
141            step_args = [step_args]
142
143        self.step_args = step_args
144
145    def jar(self):
146        return self._jar
147
148    def main_class(self):
149        return None
150
151    def args(self):
152        args = []
153
154        # put extra args BEFORE -mapper and -reducer so that e.g. -libjar
155        # will work
156        if self.step_args:
157            args.extend(self.step_args)
158
159        args.extend(['-mapper', self.mapper])
160
161        if self.combiner:
162            args.extend(['-combiner', self.combiner])
163
164        if self.reducer:
165            args.extend(['-reducer', self.reducer])
166        else:
167            args.extend(['-jobconf', 'mapred.reduce.tasks=0'])
168
169        if self.input:
170            if isinstance(self.input, list):
171                for input in self.input:
172                    args.extend(('-input', input))
173            else:
174                args.extend(('-input', self.input))
175        if self.output:
176            args.extend(('-output', self.output))
177
178        if self.cache_files:
179            for cache_file in self.cache_files:
180                args.extend(('-cacheFile', cache_file))
181
182        if self.cache_archives:
183           for cache_archive in self.cache_archives:
184                args.extend(('-cacheArchive', cache_archive))
185
186        return args
187
188    def __repr__(self):
189        return '%s.%s(name=%r, mapper=%r, reducer=%r, action_on_failure=%r, cache_files=%r, cache_archives=%r, step_args=%r, input=%r, output=%r, jar=%r)' % (
190            self.__class__.__module__, self.__class__.__name__,
191            self.name, self.mapper, self.reducer, self.action_on_failure,
192            self.cache_files, self.cache_archives, self.step_args,
193            self.input, self.output, self._jar)
194
class ScriptRunnerStep(JarStep):
    """
    Jar step whose jar is always the script-runner jar
    """

    ScriptRunnerJar = 's3n://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar'

    def __init__(self, name, **kw):
        """
        :type name: str
        :param name: The name of the step
        :param kw: Remaining keyword arguments, forwarded unchanged to
            ``JarStep.__init__``
        """
        runner_jar = self.ScriptRunnerJar
        JarStep.__init__(self, name, runner_jar, **kw)
201
class PigBase(ScriptRunnerStep):
    """
    Common base class for the Pig steps
    """

    # Leading step arguments that the Pig step subclasses start from.
    BaseArgs = [
        's3n://us-east-1.elasticmapreduce/libs/pig/pig-script',
        '--base-path',
        's3n://us-east-1.elasticmapreduce/libs/pig/',
    ]
206
class InstallPigStep(PigBase):
    """
    Install pig on emr step
    """

    InstallPigName = 'Install Pig'

    def __init__(self, pig_versions='latest'):
        """
        :type pig_versions: str
        :param pig_versions: Value passed after ``--pig-versions``
        """
        # Start from a copy of the shared arguments so BaseArgs is untouched.
        install_args = list(self.BaseArgs)
        install_args += ['--install-pig', '--pig-versions', pig_versions]
        ScriptRunnerStep.__init__(self, self.InstallPigName,
                                  step_args=install_args)
220
class PigStep(PigBase):
    """
    Pig script step
    """

    def __init__(self, name, pig_file, pig_versions='latest', pig_args=None):
        """
        :type name: str
        :param name: The name of the step
        :type pig_file: str
        :param pig_file: Value passed after ``-f`` (the pig script to run)
        :type pig_versions: str
        :param pig_versions: Value passed after ``--pig-versions``
        :type pig_args: list(str)
        :param pig_args: Extra arguments appended after the script; None
            (the default) means no extra arguments
        """
        # Copy the shared arguments so the class-level BaseArgs is untouched.
        step_args = list(self.BaseArgs)
        step_args.extend(['--pig-versions', pig_versions])
        step_args.extend(['--run-pig-script', '--args', '-f', pig_file])
        # None replaces the former mutable ``[]`` default.
        if pig_args is not None:
            step_args.extend(pig_args)
        ScriptRunnerStep.__init__(self, name, step_args=step_args)