kmj_lincs /lincs/io/elements.py

Language Python Lines 158
MD5 Hash 4b295031ab2d2ce29e5fb6e27896be21 Estimated Cost $1,830 (why?)
Repository https://bitbucket.org/kljensen/kmj_lincs.git View Raw File View Project SPDX
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import pandas
import numpy
import logging
import itertools
from collections import OrderedDict, Counter
from ..analysis.elements import extract_cell_counts


class ElementsParseError(Exception):
    """ Signals that an Elements spreadsheet could not be parsed.

        Raised by the sheet parser in this module when a sheet does not
        have the expected layout.
    """


def read_cell_counts(filepath, sheetdict, manual_columns):
    """ Reads the cell counts for every device column in `sheetdict`,
        combining the columns that were counted by hand with those
        that were counted by the Elements software.

        :param filepath: Path to Excel file to parse
        :type filepath: str.
        :param sheetdict: Mapping from col number to sheet names
        :type sheetdict: dict
        :param manual_columns: `True` if every column was counted by
            hand, otherwise a container holding the keys of `sheetdict`
            that were.  `None` or `False` means no column was.
        :returns:  dict keyed like `sheetdict`, holding the manual counts
            updated with the automated counts.
        :raises: ElementsParseError
    """
    all_manual = manual_columns is True
    if manual_columns is None or manual_columns is False or all_manual:
        # Nothing to look keys up in; avoids `k in None` / `k in False`.
        manual_keys = ()
    else:
        manual_keys = manual_columns

    # Those columns that were counted by hand
    manual_col_dict = {}
    # Those columns that were counted automatically
    automated_col_dict = {}
    # `.items()` (not `.iteritems()`) works on both Python 2 and 3.
    for col_no, sheet_names in sheetdict.items():
        if all_manual or col_no in manual_keys:
            manual_col_dict[col_no] = sheet_names
        else:
            automated_col_dict[col_no] = sheet_names

    # Handle the case in which cell counting was
    # done manually
    #
    manual_counts = read_manual_cell_counts(filepath, manual_col_dict)

    # Handle the case in which the cell counting was
    # done using the Elements software.
    #
    if automated_col_dict:
        # Only the sheets referenced by automated columns are parsed.
        elem = read_elements(
            filepath,
            needed_sheets=itertools.chain.from_iterable(
                automated_col_dict.values()
            ),
        )

        # Calculate the Elements cell counts
        #
        automated_counts = extract_cell_counts(
            automated_col_dict,
            elem,
        )
    else:
        automated_counts = {}

    # Combine the two counts.  The two dicts are built from disjoint
    # key sets of `sheetdict`, so updating in place loses nothing as
    # long as `extract_cell_counts` keys its result the same way.
    #
    total_counts = manual_counts
    total_counts.update(automated_counts)
    return total_counts


def read_manual_cell_counts(filepath, sheetdict):
    """ Parses an Excel spreadsheet containing cell counts.  Returns
        a dict of Counter objects, the same as the `extract_cell_counts`
        function in `analysis.elements`.

        Each sheet is read without a header row; the row label is used
        as the well number and the first column as that well's count.

        :param filepath: Path to Excel file to parse
        :type filepath: str.
        :param sheetdict: Mapping from col number to sheet names
        :type sheetdict: dict
        :returns:  dict of Counter objects, keyed by col number.  Empty
            if `sheetdict` is empty (the file is then never opened).
    """
    if not sheetdict:
        return {}

    xls = pandas.ExcelFile(filepath)

    cell_counts = {}
    # `.items()` (not `.iteritems()`) works on both Python 2 and 3.
    for col_no, sheet_names in sheetdict.items():
        col_cell_count = Counter()
        for sheet_name in sheet_names:
            raw_data = xls.parse(sheet_name, header=None)
            # NOTE(review): counts are assigned, not added, so when a
            # column lists several sheets a later sheet overwrites the
            # wells of an earlier one -- confirm this is intended.
            for well_no, cell_count in raw_data.iterrows():
                col_cell_count[well_no] = cell_count[0]
        cell_counts[col_no] = col_cell_count
    return cell_counts



def parse_elements_sheet(xls, sheet_name):
    """ Parses a single sheet from an Excel spreadsheet containing
        Elements data.  This sheet should contain the data for one
        column of a device used in a LINCS experiment.

        The sheet is scanned from the bottom up: first past the row
        whose first cell is "Feature", then up to the first row whose
        first cell converts to an integer.  Everything below that row
        is discarded and "N/A" cells are replaced with NaN.

        :param xls: Excel file object containing sheets
        :type xls: pandas.io.parsers.ExcelFile.
        :param sheet_name: The name of the sheet to parse.
        :type sheet_name: str.
        :returns:  data - a pandas.DataFrame object.
        :raises: ElementsParseError if the sheet does not have 22 or
            23 columns.

    """
    raw_data = xls.parse(sheet_name)
    if raw_data.shape[-1] not in (22, 23):
        raise ElementsParseError("Sheet does not have 22 or 23 columns.")

    n_rows = raw_data.shape[0]

    # Fallback used when the scan below never finds an integer row:
    # keep only the first row (or nothing at all for an empty sheet).
    # Row 0 itself is never inspected by the scan.
    stop_row_idx = min(1, n_rows)

    passed_feature = False
    for row_idx in range(n_rows - 1, 0, -1):
        # Positional access, independent of the column labels.
        first_cell = raw_data.iloc[row_idx, 0]

        if not passed_feature:
            # Go back until we've passed the metadata at the bottom
            # of the spreadsheet.
            if first_cell == "Feature":
                passed_feature = True
            continue

        # Now go back until we find a row that begins with an
        # integer.  `int()` raises TypeError for None but ValueError
        # for NaN and non-numeric text, so both must be skipped.
        try:
            int(first_cell)
        except (TypeError, ValueError):
            continue
        stop_row_idx = row_idx + 1
        break

    # Rows were located by position, so cut by position as well.
    data = raw_data.iloc[:stop_row_idx]
    data = data.replace("N/A", numpy.nan)
    return data


def read_elements(filepath, max_sheets=None, needed_sheets=None):
    """ Parses an Excel spreadsheet containing Elements data.  The
        spreadsheet should have one sheet for each column on the
        device.

        :param filepath: Path to Excel file to parse
        :type filepath: str.
        :param max_sheets: Max number of sheets to parse.  `None`
            (the default) means no limit.
        :type max_sheets: int.
        :param needed_sheets: Names of sheets to be extracted.  If
            omitted or empty, every sheet in the file is parsed.
        :type needed_sheets: iterable.
        :returns:  OrderedDict mapping each parsed sheet name to its
            pandas.DataFrame, in the order the sheets were parsed.
        :raises: ElementsParseError
    """
    xls = pandas.ExcelFile(filepath)
    parsed_sheets = OrderedDict()

    if not needed_sheets:
        needed_sheets = xls.sheet_names

    for i, sheet_name in enumerate(needed_sheets):
        # `i` is zero-based, so `i == max_sheets` means the limit has
        # already been reached.
        if max_sheets is not None and i >= max_sheets:
            break
        logging.debug("Elements sheet: %s", sheet_name)
        parsed_sheets[sheet_name] = parse_elements_sheet(xls, sheet_name)

    return parsed_sheets
Back to Top