/lincs/io/genepix.py
Python | 78 lines | 37 code | 11 blank | 30 comment | 7 complexity | 6d58e177d31293e6bbece86f4feff095 MD5 | raw file
- import pandas
- import numpy
- import logging
- from collections import OrderedDict
class GenepixParseError(Exception):
    """Error signalling that a Genepix spreadsheet could not be parsed."""
def parse_genepix_sheet(xls, sheet_name):
    """ Parses a single sheet from an Excel spreadsheet containing
    Genepix data. This sheet should contain the data for one
    column of a device used in a LINCS experiment.

    The sheet is read with ``xls.parse(sheet_name)`` and scanned from the
    top for the first row containing both ``'Block'`` and ``'Row'``. That
    row becomes the column header; it and every row above it are
    discarded. Cells equal to the string ``"Error"`` are replaced with
    NaN and the remaining rows are re-indexed from 0.

    :param xls: Excel file object containing sheets
    :type xls: pandas.ExcelFile.
    :param sheet_name: The name of the sheet to parse.
    :type sheet_name: str.
    :returns: data - a pandas.DataFrame object.
    :raises: GenepixParseError if no row contains both header labels
        (including when the sheet has no rows at all).
    """
    raw_data = xls.parse(sheet_name)
    header_labels = {'Block', 'Row'}

    # Locate the header by position so the slicing below does not depend
    # on how the sheet's index happens to be labelled.
    header_pos = None
    for pos, values in enumerate(raw_data.itertuples(index=False, name=None)):
        if header_labels.issubset(values):
            logging.debug("\t...skipping first {0} rows".format(pos))
            header_pos = pos
            break

    if header_pos is None:
        # Without this guard the last row would silently become the header
        # (or, for an empty sheet, the position would be undefined).
        raise GenepixParseError(
            "No header row containing {0} found in sheet {1!r}".format(
                sorted(header_labels), sheet_name
            )
        )

    if header_pos > 200:
        # Not fatal, but a header this far down suggests a malformed file.
        msg = "Starting row for genepix file suspciously high: {0}".format(
            header_pos
        )
        logging.error(msg)

    # Keep only the rows below the header and renumber them from 0.
    data = raw_data.iloc[header_pos + 1:].reset_index(drop=True)
    data.columns = raw_data.iloc[header_pos]
    return data.replace("Error", numpy.nan)
def read_genepix(filepath, max_sheets=None, needed_sheets=None):
    """ Parses an Excel spreadsheet containing Genepix data. The
    spreadsheet should have one sheet for each column on the
    device.

    :param filepath: Path to Excel file to parse
    :type filepath: str.
    :param max_sheets: Max number of sheets to parse; ``None`` parses
        every candidate sheet.
    :type max_sheets: int.
    :param needed_sheets: List of names of sheets to be extracted. When
        empty or ``None``, all sheets in the workbook are used.
    :type needed_sheets: iterable.
    :returns: parsed_sheets -- OrderedDict mapping each sheet name to its
        parsed DataFrame, in the order the sheets were processed
    :raises: GenepixParseError
    """
    parsed_sheets = OrderedDict()
    # The context manager closes the workbook even if a sheet fails to parse.
    with pandas.ExcelFile(filepath) as xls:
        sheet_names = needed_sheets if needed_sheets else xls.sheet_names
        for i, sheet_name in enumerate(sheet_names):
            # `>=` so that exactly max_sheets sheets are parsed, not one more.
            if max_sheets is not None and i >= max_sheets:
                break
            logging.debug("Genepix sheet: {0}".format(sheet_name))
            parsed_sheets[sheet_name] = parse_genepix_sheet(xls, sheet_name)
    return parsed_sheets