utils.py | searchcode

/bioy_pkg/utils.py

https://bitbucket.org/crosenth/bioy
Python | 282 lines | 252 code | 5 blank | 25 comment | 1 complexity | 75df23eea1fce394c67e0122f5fc527b MD5 | raw file
Possible License(s): GPL-3.0

# This file is part of Bioy
#
#    Bioy is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    Bioy is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with Bioy.  If not, see <http://www.gnu.org/licenses/>.

import os
import bz2
import gzip
import logging
import pandas
import re
import shutil
import sys
import signal
import contextlib
import tempfile

from itertools import takewhile, izip_longest, groupby
from csv import DictReader
from collections import Iterable, OrderedDict
from os import path

log = logging.getLogger(__name__)


def apply_df_status(func, df, msg=''):
    """
    """
    tmp_column = 'index_number'
    row_count = float(len(df))
    df[tmp_column] = xrange(int(row_count))
    msg += ' {:.0%}\r'

    def apply_func(item, msg):
        sys.stderr.write(msg.format(item[tmp_column] / row_count))
        return func(item)

    df = df.apply(apply_func, args=[msg], axis=1)
    return df.drop(tmp_column, axis=1)


def flattener(iterable):
    """
    Flatten nested iterables (not strings or dict-like objects).

    Poached from http://stackoverflow.com/questions/2158395
                       /flatten-an-irregular-list-of-lists-in-python
    """
    for el in iterable:
        if isinstance(el, Iterable) and \
                not (isinstance(el, basestring) or hasattr(el, 'get')):
            for sub in flattener(el):
                yield sub
        else:
            yield el


def chunker(seq, size, combine_last=None):
    """
    Break sequence seq into lists of length `size`. If the length of
    the final list is < 'combine_last', it is appended to the end of
    the penultimate element.
    """

    chunks = [seq[pos:pos + size] for pos in xrange(0, len(seq), size)]
    if combine_last and len(chunks[-1]) < combine_last:
        chunks[-2].extend(chunks.pop(-1))

    return iter(chunks)


def grouper(n, iterable, pad=True):
    """
    Return sequence of n-tuples composed of successive elements of
    iterable; last tuple is padded with None if necessary. Not safe
    for iterables with None elements.
    """

    args = [iter(iterable)] * n
    iterout = izip_longest(fillvalue=None, *args)

    if pad:
        return iterout
    else:
        return (takewhile(lambda x: x is not None, c) for c in iterout)


def cast(val):
    for func in [int, float, lambda x: x.strip()]:
        try:
            return func(val)
        except ValueError:
            pass


def mkdir(dirpath, clobber=False):
    """
    Create a (potentially existing) directory without errors. Raise
    OSError if directory can't be created. If clobber is True, remove
    dirpath if it exists.
    """

    if clobber:
        shutil.rmtree(dirpath, ignore_errors=True)

    try:
        os.mkdir(dirpath)
    except OSError:
        pass

    if not path.exists(dirpath):
        raise OSError('Failed to create %s' % dirpath)

    return dirpath


def parse_extras(s, numeric=True):
    """
    Return an OrderedDict parsed from a string in the format
    "key1:val1,key2:val2"
    """

    # allow for escaped quoted text
    commas = re.compile(r"""((?:[^,"']|"[^"]*"|'[^']*')+)""")
    colons = re.compile(r"""((?:[^:"']|"[^"]*"|'[^']*')+)""")

    extras = commas.split(s)[1::2]
    extras = (colons.split(e)[1::2] for e in extras)
    extras = ((k, cast(v) if numeric else v) for k, v in extras)
    extras = OrderedDict(extras)

    return extras


class Opener(object):

    """Factory for creating file objects

    Keyword Arguments:
        - mode -- A string indicating how the file is to be opened. Accepts the
            same values as the builtin open() function.
        - bufsize -- The file's desired buffer size. Accepts the same values as
            the builtin open() function.
    """

    def __init__(self, mode='r', bufsize=-1):
        self._mode = mode
        self._bufsize = bufsize

    def __call__(self, string):
        if string is sys.stdout or string is sys.stdin:
            return string
        elif string == '-':
            return sys.stdin if 'r' in self._mode else sys.stdout
        elif string.endswith('.bz2'):
            return bz2.BZ2File(string, self._mode, self._bufsize)
        elif string.endswith('.gz'):
            return gzip.open(string, self._mode, self._bufsize)
        else:
            return open(string, self._mode, self._bufsize)

    def __repr__(self):
        args = self._mode, self._bufsize
        args_str = ', '.join(repr(arg) for arg in args if arg != -1)
        return '{}({})'.format(type(self).__name__, args_str)


def opener(pth, mode='r', bufsize=-1):
    return Opener(mode, bufsize)(pth)


class Csv2Dict(object):

    """Easy way to convert a csv file into a
    dictionary using the argparse type function

    If no arguments the first column of the csv
    will be the key and every column
    will be the value in an OrderedDict.

    Keyword Arguments:
        - index -- csv column to key index the dictionary
        - value -- csv column to value the dictionary
        - fieldnames -- csv column names
    """

    def __init__(self, index=None, value=None, *args, **kwds):
        self.index = index
        self.value = value
        self.args = args
        self.kwds = kwds

    def __call__(self, pth):
        reader = DictReader(opener(pth), *self.args, **self.kwds)

        if not self.index:
            self.index = reader.fieldnames[0]

        results = {}
        for r in reader:
            key = r[self.index]
            if self.value:
                results[key] = r[self.value]
            else:
                fields = lambda k: reader.fieldnames.index(k[0])
                results[key] = OrderedDict(sorted(r.items(), key=fields))

        return results


def csv2dict(pth, index=None, value=None, *args, **kwds):
    return Csv2Dict(index, value, args, kwds)(pth)


def groupbyl(li, key=None, as_dict=False):
    groups = sorted(li, key=key)
    groups = groupby(groups, key=key)
    groups = ((g, list(l)) for g, l in groups)
    if as_dict:
        return(dict(groups))
    else:
        return groups


def _exit_on_signal(sig, status=None, message=None):
    def exit(sig, frame):
        if message:
            print >> sys.stderr, message
        raise SystemExit(status)
    signal.signal(sig, exit)


def exit_on_sigint(status=1, message="Canceled."):
    """
    Set program to exit on SIGINT, with provided status and message.
    """
    _exit_on_signal(signal.SIGINT, status, message)


def exit_on_sigpipe(status=None):
    """
    Set program to exit on SIGPIPE
    """
    _exit_on_signal(signal.SIGPIPE, status)


def read_csv(filename, compression=None, limit=None, **kwargs):
    """Read a csv file using pandas.read_csv with compression defined by
    the file suffix unless provided.
    """

    suffixes = {'.bz2': 'bz2', '.gz': 'gzip'}
    compression = compression or suffixes.get(path.splitext(filename)[-1])
    kwargs['compression'] = compression

    return pandas.read_csv(filename, **kwargs)


@contextlib.contextmanager
def named_tempfile(*args, **kwargs):
    """Near-clone of tempfile.NamedTemporaryFile, but the file is deleted
    when the context manager exits, rather than when it's closed.

    """

    kwargs['delete'] = False
    tf = tempfile.NamedTemporaryFile(*args, **kwargs)
    try:
        with tf:
            yield tf
    finally:
        os.unlink(tf.name)