elements.py - Those columns that were counted by hand Those…

/lincs/io/elements.py

https://bitbucket.org/kljensen/kmj_lincs · Python · 157 lines · 77 code · 26 blank · 54 comment · 21 complexity · 4b295031ab2d2ce29e5fb6e27896be21 MD5 · raw file

import pandas
import numpy
import logging
import itertools
from collections import OrderedDict, Counter
from ..analysis.elements import extract_cell_counts


class ElementsParseError(Exception):
    """ Exception raised when an Elements spreadsheet cannot be parsed.
    """


def read_cell_counts(filepath, sheetdict, manual_columns):
    """ Reads the cell counts for every column listed in `sheetdict`,
        combining the columns that were counted by hand with the
        columns that were counted by the Elements software.

        :param filepath: Path to Excel file to parse
        :type filepath: str.
        :param sheetdict: Mapping from col number to sheet names
        :type sheetdict: dict
        :param manual_columns: ``True`` if every column was counted by
            hand, otherwise a container holding the keys of `sheetdict`
            that were counted by hand.  ``None`` or ``False`` means
            that no column was counted by hand.
        :returns:  dict keyed like `sheetdict`.  Entries for manual
            columns come from `read_manual_cell_counts`, the others
            from `extract_cell_counts`.
    """
    # A missing/False selection used to blow up on the membership test
    # below; treat it as "no manual columns" instead.
    if manual_columns is None or manual_columns is False:
        manual_columns = ()

    # Those columns that were counted by hand
    manual_col_dict = {}
    # Those columns that were counted automatically
    automated_col_dict = {}
    # `.items()` works on both Python 2 and 3 (`.iteritems()` does not).
    for col_no, sheet_names in sheetdict.items():
        if manual_columns is True or col_no in manual_columns:
            manual_col_dict[col_no] = sheet_names
        else:
            automated_col_dict[col_no] = sheet_names

    total_counts = {}

    # Handle the case in which cell counting was
    # done manually
    #
    if manual_col_dict:
        total_counts.update(
            read_manual_cell_counts(filepath, manual_col_dict)
        )

    # Handle the case in which the cell counting was
    # done using the Elements software.
    #
    if automated_col_dict:
        # NOTE: this stays a lazy chain object, which is always truthy.
        # `read_elements` falls back to *all* sheets when
        # `needed_sheets` is falsy, so materialising this into a
        # (possibly empty) list would change which sheets get parsed.
        elem = read_elements(
            filepath,
            needed_sheets=itertools.chain.from_iterable(
                automated_col_dict.values()
            ),
        )

        # Calculate the Elements cell counts; on a key clash these
        # entries win over the manual ones.
        #
        total_counts.update(
            extract_cell_counts(
                automated_col_dict,
                elem,
            )
        )

    return total_counts


def read_manual_cell_counts(filepath, sheetdict):
    """ Parses an Excel spreadsheet containing cell counts.  Returns
        a dict of Counter objects, the same as the `extract_cell_counts`
        function in `analysis.elements`.

        Each sheet is read without a header row.  For every row, the
        value in the first column is stored in the Counter under that
        row's index.  When a column lists several sheets, a later sheet
        overwrites (it does not add to) the entries of an earlier sheet
        that share a row index.

        :param filepath: Path to Excel file to parse
        :type filepath: str.
        :param sheetdict: Mapping from col number to sheet names
        :type sheetdict: dict
        :returns:  dict of Counter objects, keyed like `sheetdict`.
            An empty dict if `sheetdict` is empty or None; the file is
            not opened in that case.
    """
    if not sheetdict:
        return {}

    cell_counts = {}
    # The context manager closes the workbook handle even if parsing
    # one of the sheets fails.
    with pandas.ExcelFile(filepath) as xls:
        # `.items()` works on both Python 2 and 3.
        for col_no, sheet_names in sheetdict.items():
            col_cell_count = Counter()
            for sheet_name in sheet_names:
                raw_data = xls.parse(sheet_name, header=None)
                for well_no, cell_count in raw_data.iterrows():
                    # Assignment, not accumulation: see docstring.
                    col_cell_count[well_no] = cell_count[0]
            cell_counts[col_no] = col_cell_count
    return cell_counts



def parse_elements_sheet(xls, sheet_name):
    """ Parses a single sheet from an Excel spreadsheet containing
        Elements data.  This sheet should contain the data for one
        column of a device used in a LINCS experiment.

        The sheet is scanned from the bottom up: first for a row whose
        first cell is "Feature" (the metadata at the bottom of the
        sheet), then for the nearest row above it whose first cell can
        be converted with `int()`.  All rows below that one are
        discarded and every "N/A" cell is replaced by NaN.

        If no such row is found, only the first row of the sheet is
        kept.  NOTE(review): this mirrors the historical fall-through
        behaviour; confirm whether raising would be preferable.

        :param xls: Excel file object containing sheets; anything with
            a `parse(sheet_name)` method that returns a DataFrame.
        :type xls: pandas.ExcelFile.
        :param sheet_name: The name of the sheet to parse.
        :type sheet_name: str.
        :returns:  data - a pandas.DataFrame object.
        :raises: ElementsParseError if the sheet does not have 22 or
            23 columns.

    """
    raw_data = xls.parse(sheet_name)
    if raw_data.shape[-1] not in (22, 23):
        raise ElementsParseError("Sheet does not have 22 or 23 columns.")

    n_rows = raw_data.shape[0]

    # Fallback used when the scan below finds no boundary.  `min` also
    # covers sheets with zero or one row, for which the loop never runs.
    stop_row_idx = min(1, n_rows)

    passed_feature = False
    for row_idx in range(n_rows - 1, 0, -1):
        # Positional access to the first cell of the row; avoids the
        # removed `irow` and label-vs-position ambiguity of `row[0]`.
        first_cell = raw_data.iloc[row_idx, 0]

        if not passed_feature:
            # Go back until we've passed the metadata at the bottom
            # of the spreadsheet.
            if first_cell == "Feature":
                passed_feature = True
            continue

        # Now go back until we find a row that begins with an
        # integer.  Text cells and NaN raise ValueError, other
        # non-numeric objects raise TypeError.
        try:
            int(first_cell)
        except (TypeError, ValueError):
            continue
        stop_row_idx = row_idx + 1
        break

    data = raw_data.iloc[:stop_row_idx]
    data = data.replace("N/A", numpy.nan)
    return data


def read_elements(filepath, max_sheets=None, needed_sheets=None):
    """ Parses an Excel spreadsheet containing Elements data.  The
        spreadsheet should have one sheet for each column on the
        device.

        :param filepath: Path to Excel file to parse
        :type filepath: str.
        :param max_sheets: Max number of sheets to parse; None means
            no limit.
        :type max_sheets: int.
        :param needed_sheets: Names of sheets to be extracted.  If
            falsy (None, empty list, ...) every sheet in the file is
            parsed.  Note that a lazy iterator is always truthy, even
            when it yields nothing.
        :type needed_sheets: iterable.
        :returns:  OrderedDict mapping each sheet name to the DataFrame
            returned by `parse_elements_sheet`, in the order parsed.
        :raises: ElementsParseError (from `parse_elements_sheet`)
    """
    parsed_sheets = OrderedDict()

    # The context manager closes the workbook handle even if one of
    # the sheets fails to parse.
    with pandas.ExcelFile(filepath) as xls:
        if not needed_sheets:
            needed_sheets = xls.sheet_names

        for i, sheet_name in enumerate(needed_sheets):
            # `>=` so that at most `max_sheets` sheets are parsed; the
            # previous `>` let one extra sheet through.
            if max_sheets is not None and i >= max_sheets:
                break
            logging.debug("Elements sheet: {0}".format(sheet_name))
            parsed_sheets[sheet_name] = parse_elements_sheet(
                xls, sheet_name
            )

    return parsed_sheets
Tech Fingerprint

Alerts (5)

'def' Ensure functions have docstrings for documentation
15
Complexity hotspot; lines 21 to 22 (total complexity: 3)
21 22
Complexity hotspot; lines 150 to 151 (total complexity: 3)
150 151