
/mrjob/fs/hadoop.py

https://bitbucket.org/wangqiang8511/mrjob
# Copyright 2009-2012 Yelp and Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import posixpath
import re
from subprocess import Popen
from subprocess import PIPE
from subprocess import CalledProcessError

try:
    from cStringIO import StringIO
    StringIO  # quiet "redefinition of unused ..." warning from pyflakes
except ImportError:
    from StringIO import StringIO

from mrjob.fs.base import Filesystem
from mrjob.parse import is_uri
from mrjob.parse import urlparse
from mrjob.util import cmd_line
from mrjob.util import read_file

log = logging.getLogger('mrjob.fs.hadoop')

# used by mkdir()
HADOOP_FILE_EXISTS_RE = re.compile(r'.*File exists.*')

# used by ls()
HADOOP_LSR_NO_SUCH_FILE = re.compile(
    r'^lsr: Cannot access .*: No such file or directory.')

# used by rm() (see below)
HADOOP_RMR_NO_SUCH_FILE = re.compile(r'^rmr: hdfs://.*$')


class HadoopFilesystem(Filesystem):
    """Filesystem for URIs accepted by ``hadoop fs``. Typically you will get
    one of these via ``HadoopJobRunner().fs``, composed with
    :py:class:`~mrjob.fs.local.LocalFilesystem`.
    """

    def __init__(self, hadoop_bin):
        """:param hadoop_bin: path to ``hadoop`` binary"""
        super(HadoopFilesystem, self).__init__()
        self._hadoop_bin = hadoop_bin
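
    # Note (added comment, not in the original source): invoke_hadoop() and
    # _cat_file() build their command lines as ``self._hadoop_bin + args``,
    # so ``hadoop_bin`` is expected to be a list of arguments (e.g.
    # ``['hadoop']`` or ``['/usr/bin/hadoop']``) rather than a bare path
    # string.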

    def can_handle_path(self, path):
        return is_uri(path)

    def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                      return_stdout=False):
        """Run the given hadoop command, raising an exception on non-zero
        return code. Unless ``return_stdout`` is set, this only works for
        commands whose output we don't care about.

        Args:
            ok_returncodes -- a list/tuple/set of return codes we expect to
                get back from hadoop (e.g. [0, 1]). By default, we only
                expect 0. If we get an unexpected return code, we raise a
                CalledProcessError.
            ok_stderr -- don't log STDERR or raise CalledProcessError if
                stderr matches a regex in this list (even if the returncode
                is bad)
            return_stdout -- return the stdout from the hadoop command
                rather than logging it. If this is False, we return the
                returncode instead.
        """
        args = self._hadoop_bin + args

        log.debug('> %s' % cmd_line(args))

        proc = Popen(args, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate()

        log_func = log.debug if proc.returncode == 0 else log.error
        if not return_stdout:
            for line in StringIO(stdout):
                log_func('STDOUT: ' + line.rstrip('\r\n'))

        # check if STDERR is okay
        stderr_is_ok = False
        if ok_stderr:
            for stderr_re in ok_stderr:
                if stderr_re.match(stderr):
                    stderr_is_ok = True
                    break

        if not stderr_is_ok:
            for line in StringIO(stderr):
                log_func('STDERR: ' + line.rstrip('\r\n'))

        ok_returncodes = ok_returncodes or [0]

        if not stderr_is_ok and proc.returncode not in ok_returncodes:
            raise CalledProcessError(proc.returncode, args)

        if return_stdout:
            return stdout
        else:
            return proc.returncode
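
    # Example call (added for illustration; mirrors path_exists() below):
    #
    #     returncode = self.invoke_hadoop(
    #         ['fs', '-test', '-e', 'hdfs:///user/dave/data'],
    #         ok_returncodes=(0, 1))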

    def du(self, path_glob):
        """Get the total size (in bytes) of the files matching
        ``path_glob``, by parsing the output of ``hadoop fs -dus``.
        Raises IOError if the command fails or prints something we
        can't parse."""
        try:
            stdout = self.invoke_hadoop(['fs', '-dus', path_glob],
                                        return_stdout=True)
        except CalledProcessError:
            raise IOError(path_glob)

        try:
            # each line of -dus output ends with the total size in bytes
            return sum(int(line.split()[1])
                       for line in stdout.split('\n')
                       if line.strip())
        except (ValueError, TypeError, IndexError):
            raise IOError(
                'Unexpected output from hadoop fs -dus: %r' % stdout)

    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        try:
            stdout = self.invoke_hadoop(
                ['fs', '-lsr', path_glob],
                return_stdout=True,
                ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        path_index = None
        for line in StringIO(stdout):
            fields = line.rstrip('\r\n').split()

            # Throw out directories
            if fields[0].startswith('d'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            # -rw-r--r--   3 dave users  3276 2010-01-13 14:00 /foo/bar  # HDFS
            # -rwxrwxrwx   1             3276 2010-01-13 14:00 /foo/bar  # S3
            if not path_index:
                for index, field in enumerate(fields):
                    if len(field) == 5 and field[2] == ':':
                        path_index = (index + 1)
                if not path_index:
                    raise IOError(
                        "Could not locate path in string '%s'" % line)

            path = ' '.join(fields[path_index:])
            yield hdfs_prefix + path
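
    # Usage note (added for illustration): ls() yields full URIs, rebuilt
    # from the scheme and netloc of ``path_glob``, e.g.
    #
    #     for uri in fs.ls('hdfs://namenode:54310/user/dave/output/part-*'):
    #         ...  # e.g. 'hdfs://namenode:54310/user/dave/output/part-00000'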

    def _cat_file(self, filename):
        # stream from HDFS
        cat_args = self._hadoop_bin + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def stream():
            for line in cat_proc.stdout:
                yield line

            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            returncode = cat_proc.wait()

            if returncode != 0:
                raise IOError("Could not stream %s" % filename)

        return read_file(filename, stream())
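
    # Note (added; an assumption about code outside this file): _cat_file()
    # is presumably invoked by the base Filesystem's cat(), and
    # mrjob.util.read_file() is expected to pick a decompressor (.gz/.bz2)
    # based on ``filename`` while reading lines from the stream() generator.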

    def mkdir(self, path):
        try:
            self.invoke_hadoop(
                ['fs', '-mkdir', path], ok_stderr=[HADOOP_FILE_EXISTS_RE])
        except CalledProcessError:
            raise IOError("Could not mkdir %s" % path)

    def path_exists(self, path_glob):
        """Does the given path exist?

        If dest is a directory (ends with a "/"), we check if there are
        any files starting with that path.
        """
        try:
            # 'hadoop fs -test -e' exits 0 if the path exists, 1 if not
            return_code = self.invoke_hadoop(['fs', '-test', '-e', path_glob],
                                             ok_returncodes=(0, 1))
            return (return_code == 0)
        except CalledProcessError:
            raise IOError("Could not check path %s" % path_glob)

    def path_join(self, dirname, filename):
        return posixpath.join(dirname, filename)

    def rm(self, path_glob):
        if not is_uri(path_glob):
            super(HadoopFilesystem, self).rm(path_glob)

        if self.path_exists(path_glob):
            # hadoop fs -rmr will print something like:
            #   Moved to trash: hdfs://hdnamenode:54310/user/dave/asdf
            # to STDOUT, which we don't care about.
            #
            # if we ask to delete a path that doesn't exist, it prints
            # to STDERR something like:
            #   rmr: <path>
            # which we can safely ignore
            try:
                self.invoke_hadoop(
                    ['fs', '-rmr', path_glob],
                    return_stdout=True, ok_stderr=[HADOOP_RMR_NO_SUCH_FILE])
            except CalledProcessError:
                raise IOError("Could not rm %s" % path_glob)

    def touchz(self, dest):
        try:
            self.invoke_hadoop(['fs', '-touchz', dest])
        except CalledProcessError:
            raise IOError("Could not touchz %s" % dest)
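
# Example (added for illustration only; assumes a reachable HDFS namenode and
# a ``hadoop`` binary on the PATH):
#
#     fs = HadoopFilesystem(['hadoop'])
#     out_dir = 'hdfs:///user/dave/output/'
#     if fs.path_exists(out_dir):
#         fs.rm(out_dir)
#     fs.mkdir(out_dir)
#     fs.touchz(fs.path_join(out_dir, '_SUCCESS'))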