/lincs/io/elements.py
Python | 157 lines | 77 code | 26 blank | 54 comment | 21 complexity | 4b295031ab2d2ce29e5fb6e27896be21 MD5 | raw file
- import pandas
- import numpy
- import logging
- import itertools
- from collections import OrderedDict, Counter
- from ..analysis.elements import extract_cell_counts
class ElementsParseError(Exception):
    """ Error signalling that an Elements spreadsheet could not be
    parsed.
    """
def read_cell_counts(filepath, sheetdict, manual_columns):
    """ Reads the cell counts for every column listed in `sheetdict`.

    Columns named in `manual_columns` are read with
    `read_manual_cell_counts`; all remaining columns are parsed with
    `read_elements` and counted with `extract_cell_counts`.

    :param filepath: Path to Excel file to parse
    :type filepath: str.
    :param sheetdict: Mapping from col number to sheet names
    :type sheetdict: dict
    :param manual_columns: Columns that were counted by hand. ``True``
                           marks every column as manual; ``None`` or
                           ``False`` marks none.
    :type manual_columns: bool or container.
    :returns: dict -- the manual counts, updated with the automated
              counts.
    """
    all_manual = manual_columns is True
    # `None`/`False` cannot be used with `in`; treat them as "no manual
    # columns" instead of failing with a TypeError.
    if all_manual or manual_columns is None or manual_columns is False:
        manual_keys = ()
    else:
        manual_keys = manual_columns

    # Those columns that were counted by hand
    manual_col_dict = {}
    # Those columns that were counted automatically
    automated_col_dict = {}
    # `.items()` works on both Python 2 and Python 3.
    for col_no, sheet_names in sheetdict.items():
        if all_manual or col_no in manual_keys:
            manual_col_dict[col_no] = sheet_names
        else:
            automated_col_dict[col_no] = sheet_names

    # Handle the case in which cell counting was done manually.
    total_counts = read_manual_cell_counts(filepath, manual_col_dict)

    # Handle the case in which the cell counting was done using the
    # Elements software.
    if automated_col_dict:
        elem = read_elements(
            filepath,
            needed_sheets=itertools.chain.from_iterable(
                automated_col_dict.values()
            ),
        )
        # Calculate the Elements cell counts and merge them into the
        # manual ones.
        total_counts.update(extract_cell_counts(automated_col_dict, elem))

    return total_counts
def read_manual_cell_counts(filepath, sheetdict):
    """ Parses an Excel spreadsheet containing cell counts. Returns
    a dict of Counter objects, the same as the `extract_cell_counts`
    function in `analysis.elements`.

    Each sheet is read without a header row; the row index is used as
    the Counter key and the value in the first column as the count.
    The file is not opened at all when `sheetdict` is empty.

    :param filepath: Path to Excel file to parse
    :type filepath: str.
    :param sheetdict: Mapping from col number to sheet names
    :type sheetdict: dict
    :returns: dict of Counter objects.
    """
    if not sheetdict:
        return {}

    cell_counts = {}
    # The context manager closes the workbook even if parsing fails.
    with pandas.ExcelFile(filepath) as xls:
        # `.items()` works on both Python 2 and Python 3.
        for col_no, sheet_names in sheetdict.items():
            col_cell_count = Counter()
            for sheet_name in sheet_names:
                raw_data = xls.parse(sheet_name, header=None)
                # NOTE(review): counts are assigned, not added, so a
                # later sheet overwrites an earlier one for the same
                # row index -- confirm this is intended.
                for well_no, cell_count in raw_data.iterrows():
                    col_cell_count[well_no] = cell_count[0]
            cell_counts[col_no] = col_cell_count
    return cell_counts
def parse_elements_sheet(xls, sheet_name):
    """ Parses a single sheet from an Excel spreadsheet containing
    Elements data. This sheet should contain the data for one
    column of a device used in a LINCS experiment.

    The sheet is scanned from the bottom up: first for the row whose
    first cell is "Feature" (the metadata at the bottom of the
    spreadsheet), then for the closest row above it whose first cell
    converts to an integer. Everything below that row is discarded and
    "N/A" cells are replaced with NaN.

    :param xls: Excel file object containing sheets
    :type xls: pandas.io.parsers.ExcelFile.
    :param sheet_name: The name of the sheet to parse.
    :type sheet_name: str.
    :returns: data - a pandas.DataFrame object.
    :raises: ElementsParseError if the sheet does not have 22 or 23
             columns, or if the end of the data rows cannot be found.
    """
    raw_data = xls.parse(sheet_name)
    n_rows, n_cols = raw_data.shape
    if n_cols not in (22, 23):
        raise ElementsParseError("Sheet does not have 22 or 23 columns.")

    passed_feature = False
    stop_row_idx = None
    for row_idx in range(n_rows - 1, -1, -1):
        # Purely positional access; does not depend on column labels.
        first_cell = raw_data.iloc[row_idx, 0]
        if not passed_feature:
            # Go back until we've passed the metadata at the bottom
            # of the spreadsheet.
            if first_cell == "Feature":
                passed_feature = True
            continue
        # Now go back until we find a row that begins with an integer.
        try:
            int(first_cell)
        except (TypeError, ValueError, OverflowError):
            # Blank (NaN) cells and non-numeric text raise ValueError,
            # None raises TypeError, infinities raise OverflowError.
            continue
        stop_row_idx = row_idx + 1
        break

    if stop_row_idx is None:
        # Fail loudly rather than silently returning a truncated sheet.
        raise ElementsParseError(
            "Could not find the end of the data rows in the sheet."
        )

    data = raw_data.iloc[:stop_row_idx]
    return data.replace("N/A", numpy.nan)
def read_elements(filepath, max_sheets=None, needed_sheets=None):
    """ Parses an Excel spreadsheet containing Elements data. The
    spreadsheet should have one sheet for each column on the
    device.

    :param filepath: Path to Excel file to parse
    :type filepath: str.
    :param max_sheets: Max number of sheets to parse. ``None`` means
                       no limit.
    :type max_sheets: int.
    :param needed_sheets: List of names of sheets to be extracted.
                          When empty or ``None``, every sheet in the
                          file is parsed.
    :type needed_sheets: iterable.
    :returns: OrderedDict -- maps each parsed sheet name to the
              DataFrame returned by `parse_elements_sheet`, in the
              order the sheets were parsed.
    :raises: ElementsParseError
    """
    parsed_sheets = OrderedDict()
    # The context manager closes the workbook even if parsing fails.
    with pandas.ExcelFile(filepath) as xls:
        if not needed_sheets:
            needed_sheets = xls.sheet_names
        for i, sheet_name in enumerate(needed_sheets):
            # `i` sheets have been parsed so far; stop once the limit
            # is reached so that at most `max_sheets` are parsed.
            if max_sheets is not None and i >= max_sheets:
                break
            logging.debug("Elements sheet: %s", sheet_name)
            parsed_sheets[sheet_name] = parse_elements_sheet(xls, sheet_name)
    return parsed_sheets