kmj_lincs /lincs/io/genepix.py

Language Python Lines 79
MD5 Hash 6d58e177d31293e6bbece86f4feff095 Estimated Cost $848 (why?)
Repository https://bitbucket.org/kljensen/kmj_lincs.git View Raw File View Project SPDX
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas
import numpy
import logging
from collections import OrderedDict


class GenepixParseError(Exception):
    """Signal that a Genepix spreadsheet could not be parsed."""


def parse_genepix_sheet(xls, sheet_name):
    """ Parses a single sheet from an Excel spreadsheet containing
        Genepix data.  This sheet should contain the data for one
        column of a device used in a LINCS experiment.

        Every row up to and including the header row (the first row
        whose cells include both 'Block' and 'Row') is discarded, the
        header row's values become the column labels, the remaining
        rows are renumbered from zero and every "Error" cell is
        replaced with NaN.

        :param xls: Excel file object containing sheets
        :type xls: pandas.ExcelFile.
        :param sheet_name: The name of the sheet to parse.
        :type sheet_name: str.
        :returns:  data - a pandas.DataFrame object.
        :raises: GenepixParseError if no header row is found.

    """
    raw_data = xls.parse(sheet_name)

    header_set = {'Block', 'Row'}
    for (starting_row_idx, row) in raw_data.iterrows():
        if header_set.issubset(set(row)):
            logging.debug("\t...skipping first {0} rows".format(
                starting_row_idx
            ))
            break
    else:
        # Without this guard an empty sheet left starting_row_idx
        # undefined, and a sheet lacking the header silently used its
        # last row as column labels.
        raise GenepixParseError(
            "Could not find a header row containing 'Block' and 'Row' "
            "in sheet {0!r}".format(sheet_name)
        )

    if starting_row_idx > 200:
        msg = "Starting row for genepix file suspciously high: {0}".format(
            starting_row_idx
        )
        logging.error(msg)

    # iterrows() yields index labels, so use label-based access (.loc);
    # the old .ix indexer no longer exists in current pandas.
    data = raw_data.drop(list(range(starting_row_idx + 1)))
    data.columns = raw_data.loc[starting_row_idx]
    data.index = range(data.shape[0])
    data = data.replace("Error", numpy.nan)
    return data


def read_genepix(filepath, max_sheets=None, needed_sheets=None):
    """ Parses an Excel spreadsheet containing Genepix data.  The
        spreadsheet should have one sheet for each column on the
        device.

        :param filepath: Path to Excel file to parse
        :type filepath: str.
        :param max_sheets: Max number of sheets to parse.  None (the
            default) parses every requested sheet.
        :type max_sheets: int.
        :param needed_sheets: List of names of sheets to be extracted.
            When empty or None, every sheet in the file is used.
        :type needed_sheets: iterable.
        :returns:  parsed_sheets -- OrderedDict mapping each sheet name
            to its parsed DataFrame, in the order the sheets were read.
        :raises: GenepixParseError
    """
    parsed_sheets = OrderedDict()

    # The context manager closes the workbook handle even when a sheet
    # fails to parse.
    with pandas.ExcelFile(filepath) as xls:
        if not needed_sheets:
            needed_sheets = xls.sheet_names

        for i, sheet_name in enumerate(needed_sheets):
            # i is zero-based: stop once max_sheets sheets are parsed.
            if max_sheets is not None and i >= max_sheets:
                break
            logging.debug("Genepix sheet: {0}".format(sheet_name))
            parsed_sheets[sheet_name] = parse_genepix_sheet(xls, sheet_name)

    return parsed_sheets
Back to Top