
/bioy_pkg/utils.py

https://bitbucket.org/crosenth/bioy
Python | 282 lines | GPL-3.0
# This file is part of Bioy
#
# Bioy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Bioy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bioy. If not, see <http://www.gnu.org/licenses/>.

import os
import bz2
import gzip
import logging
import pandas
import re
import shutil
import sys
import signal
import contextlib
import tempfile

from itertools import takewhile, izip_longest, groupby
from csv import DictReader
from collections import Iterable, OrderedDict
from os import path

log = logging.getLogger(__name__)

def apply_df_status(func, df, msg=''):
    """
    Apply `func` to each row of DataFrame `df`, writing percent-complete
    progress (prefixed with `msg`) to stderr.
    """
    tmp_column = 'index_number'
    row_count = float(len(df))
    df[tmp_column] = xrange(int(row_count))
    msg += ' {:.0%}\r'

    def apply_func(item, msg):
        sys.stderr.write(msg.format(item[tmp_column] / row_count))
        return func(item)

    df = df.apply(apply_func, args=[msg], axis=1)

    return df.drop(tmp_column, axis=1)

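# Usage sketch (not part of the original module): apply a row-wise function
# while progress is reported to stderr. The DataFrame and lambda below are
# hypothetical examples.
#
#   >>> df = pandas.DataFrame({'a': [1, 2, 3]})
#   >>> df = apply_df_status(lambda row: row * 2, df, msg='doubling')
#   >>> df['a'].tolist()
#   [2, 4, 6]
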
def flattener(iterable):
    """
    Flatten nested iterables (not strings or dict-like objects).

    Poached from http://stackoverflow.com/questions/2158395
    /flatten-an-irregular-list-of-lists-in-python
    """
    for el in iterable:
        if isinstance(el, Iterable) and \
                not (isinstance(el, basestring) or hasattr(el, 'get')):
            for sub in flattener(el):
                yield sub
        else:
            yield el

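# Usage sketch (not part of the original module): flattener recurses into
# nested iterables but leaves strings and dict-like objects intact.
#
#   >>> list(flattener([1, [2, [3, 'abc']], {'k': 4}]))
#   [1, 2, 3, 'abc', {'k': 4}]
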
def chunker(seq, size, combine_last=None):
    """
    Break sequence seq into lists of length `size`. If the length of
    the final list is < `combine_last`, it is appended to the end of
    the penultimate element.
    """
    chunks = [seq[pos:pos + size] for pos in xrange(0, len(seq), size)]

    if combine_last and len(chunks[-1]) < combine_last:
        chunks[-2].extend(chunks.pop(-1))

    return iter(chunks)

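# Usage sketch (not part of the original module): split a sequence into
# fixed-size chunks, optionally folding a short final chunk into the
# previous one.
#
#   >>> list(chunker(range(7), 3))
#   [[0, 1, 2], [3, 4, 5], [6]]
#   >>> list(chunker(range(7), 3, combine_last=2))
#   [[0, 1, 2], [3, 4, 5, 6]]
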
def grouper(n, iterable, pad=True):
    """
    Return sequence of n-tuples composed of successive elements of
    iterable; last tuple is padded with None if necessary. Not safe
    for iterables with None elements.
    """
    args = [iter(iterable)] * n
    iterout = izip_longest(fillvalue=None, *args)

    if pad:
        return iterout
    else:
        return (takewhile(lambda x: x is not None, c) for c in iterout)

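# Usage sketch (not part of the original module): group an iterable into
# n-tuples, padded with None by default, or truncated when pad=False.
#
#   >>> list(grouper(3, [1, 2, 3, 4, 5]))
#   [(1, 2, 3), (4, 5, None)]
#   >>> [tuple(g) for g in grouper(3, [1, 2, 3, 4, 5], pad=False)]
#   [(1, 2, 3), (4, 5)]
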
def cast(val):
    """
    Coerce `val` to an int, then a float, falling back to a stripped string.
    """
    for func in [int, float, lambda x: x.strip()]:
        try:
            return func(val)
        except ValueError:
            pass

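# Usage sketch (not part of the original module): cast coerces strings to the
# narrowest matching type.
#
#   >>> cast('3'), cast('2.5'), cast(' text ')
#   (3, 2.5, 'text')
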
def mkdir(dirpath, clobber=False):
    """
    Create a (potentially existing) directory without errors. Raise
    OSError if directory can't be created. If clobber is True, remove
    dirpath if it exists.
    """
    if clobber:
        shutil.rmtree(dirpath, ignore_errors=True)

    try:
        os.mkdir(dirpath)
    except OSError:
        pass

    if not path.exists(dirpath):
        raise OSError('Failed to create %s' % dirpath)

    return dirpath

def parse_extras(s, numeric=True):
    """
    Return an OrderedDict parsed from a string in the format
    "key1:val1,key2:val2"
    """
    # allow for escaped quoted text
    commas = re.compile(r"""((?:[^,"']|"[^"]*"|'[^']*')+)""")
    colons = re.compile(r"""((?:[^:"']|"[^"]*"|'[^']*')+)""")

    extras = commas.split(s)[1::2]
    extras = (colons.split(e)[1::2] for e in extras)
    extras = ((k, cast(v) if numeric else v) for k, v in extras)
    extras = OrderedDict(extras)

    return extras

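# Usage sketch (not part of the original module): parse a comma-separated
# key:value string, casting values to numbers by default.
#
#   >>> parse_extras('name:sample1,cutoff:0.05,reps:3')
#   OrderedDict([('name', 'sample1'), ('cutoff', 0.05), ('reps', 3)])
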
class Opener(object):
    """Factory for creating file objects

    Keyword Arguments:
        - mode -- A string indicating how the file is to be opened. Accepts
            the same values as the builtin open() function.
        - bufsize -- The file's desired buffer size. Accepts the same values
            as the builtin open() function.
    """

    def __init__(self, mode='r', bufsize=-1):
        self._mode = mode
        self._bufsize = bufsize

    def __call__(self, string):
        if string is sys.stdout or string is sys.stdin:
            return string
        elif string == '-':
            return sys.stdin if 'r' in self._mode else sys.stdout
        elif string.endswith('.bz2'):
            return bz2.BZ2File(string, self._mode, self._bufsize)
        elif string.endswith('.gz'):
            return gzip.open(string, self._mode, self._bufsize)
        else:
            return open(string, self._mode, self._bufsize)

    def __repr__(self):
        args = self._mode, self._bufsize
        args_str = ', '.join(repr(arg) for arg in args if arg != -1)
        return '{}({})'.format(type(self).__name__, args_str)


def opener(pth, mode='r', bufsize=-1):
    return Opener(mode, bufsize)(pth)

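# Usage sketch (not part of the original module): Opener/opener pick a file
# handler from the path suffix and treat '-' as stdin/stdout;
# 'example.csv.bz2' and `parser` below are hypothetical.
#
#   >>> fh = opener('example.csv.bz2')   # returns a bz2.BZ2File
#   >>> opener('-')                      # returns sys.stdin for mode 'r'
#
# An Opener instance is also usable as an argparse `type=`, much like
# argparse.FileType:
#
#   parser.add_argument('infile', type=Opener('r'), default=sys.stdin)
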
class Csv2Dict(object):
    """Easy way to convert a csv file into a dictionary using the argparse
    type function.

    If no arguments are given, the first column of the csv is the key and
    every column is the value, as an OrderedDict.

    Keyword Arguments:
        - index -- csv column to key index the dictionary
        - value -- csv column to value the dictionary
        - fieldnames -- csv column names
    """

    def __init__(self, index=None, value=None, *args, **kwds):
        self.index = index
        self.value = value
        self.args = args
        self.kwds = kwds

    def __call__(self, pth):
        reader = DictReader(opener(pth), *self.args, **self.kwds)

        if not self.index:
            self.index = reader.fieldnames[0]

        results = {}

        for r in reader:
            key = r[self.index]
            if self.value:
                results[key] = r[self.value]
            else:
                fields = lambda k: reader.fieldnames.index(k[0])
                results[key] = OrderedDict(sorted(r.items(), key=fields))

        return results

def csv2dict(pth, index=None, value=None, *args, **kwds):
    # unpack the extra arguments so they reach DictReader as intended,
    # rather than being passed as a single tuple and dict
    return Csv2Dict(index, value, *args, **kwds)(pth)

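# Usage sketch (not part of the original module): given a hypothetical
# 'samples.csv' with columns name,count, key the rows by name. DictReader
# leaves all values as strings.
#
#   >>> csv2dict('samples.csv', index='name', value='count')
#   {'s1': '10', 's2': '25'}
#   >>> csv2dict('samples.csv')   # full rows keyed by the first column
#   {'s1': OrderedDict([('name', 's1'), ('count', '10')]), ...}
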
def groupbyl(li, key=None, as_dict=False):
    """
    Sort and group the elements of `li` by `key`, returning (group, members)
    pairs, or a {group: members} dict if as_dict is True.
    """
    groups = sorted(li, key=key)
    groups = groupby(groups, key=key)
    groups = ((g, list(l)) for g, l in groups)

    if as_dict:
        return dict(groups)
    else:
        return groups

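# Usage sketch (not part of the original module): sort and group a list by a
# key function.
#
#   >>> groupbyl([1, 2, 3, 4, 5], key=lambda x: x % 2, as_dict=True)
#   {0: [2, 4], 1: [1, 3, 5]}
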
def _exit_on_signal(sig, status=None, message=None):
    def exit(sig, frame):
        if message:
            print >> sys.stderr, message
        raise SystemExit(status)
    signal.signal(sig, exit)


def exit_on_sigint(status=1, message="Canceled."):
    """
    Set program to exit on SIGINT, with provided status and message.
    """
    _exit_on_signal(signal.SIGINT, status, message)


def exit_on_sigpipe(status=None):
    """
    Set program to exit on SIGPIPE
    """
    _exit_on_signal(signal.SIGPIPE, status)

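# Usage sketch (not part of the original module): scripts typically call
# these once at startup so that Ctrl-C and broken pipes (e.g. piping output
# into `head`) exit cleanly instead of printing a traceback.
#
#   exit_on_sigint()
#   exit_on_sigpipe()
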
def read_csv(filename, compression=None, limit=None, **kwargs):
    """Read a csv file using pandas.read_csv with compression defined by
    the file suffix unless provided.
    """
    suffixes = {'.bz2': 'bz2', '.gz': 'gzip'}
    compression = compression or suffixes.get(path.splitext(filename)[-1])
    kwargs['compression'] = compression

    return pandas.read_csv(filename, **kwargs)

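# Usage sketch (not part of the original module): 'results.csv.bz2' is a
# hypothetical file; compression is inferred from the suffix and any extra
# keyword arguments are passed through to pandas.read_csv.
#
#   >>> df = read_csv('results.csv.bz2', usecols=['name', 'score'])
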
@contextlib.contextmanager
def named_tempfile(*args, **kwargs):
    """Near-clone of tempfile.NamedTemporaryFile, but the file is deleted
    when the context manager exits, rather than when it's closed.
    """
    kwargs['delete'] = False
    tf = tempfile.NamedTemporaryFile(*args, **kwargs)
    try:
        with tf:
            yield tf
    finally:
        os.unlink(tf.name)
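
# Usage sketch (not part of the original module): unlike NamedTemporaryFile,
# the file still exists on disk (by name) after it is closed inside the
# block, and is removed only when the context manager exits.
#
#   >>> with named_tempfile(suffix='.txt') as tf:
#   ...     tf.write('data')
#   ...     tf.close()                 # file remains on disk at tf.name
#   ... # tf.name is unlinked here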