PageRenderTime 25ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/lincs/io/genepix.py

https://bitbucket.org/kljensen/kmj_lincs
Python | 78 lines | 37 code | 11 blank | 30 comment | 7 complexity | 6d58e177d31293e6bbece86f4feff095 MD5 | raw file
  1. import pandas
  2. import numpy
  3. import logging
  4. from collections import OrderedDict
  5. class GenepixParseError(Exception):
  6. """ Exception raised when a Genepix spreadsheet cannot be parsed.
  7. """
  8. pass
  9. def parse_genepix_sheet(xls, sheet_name):
  10. """ Parses a single sheet from an Excel spreadsheet containing
  11. Genepix data. This sheet should contain the data for one
  12. column of a device used in a LINCS experiment.
  13. :param xls: Excel file object containing sheets
  14. :type xls: pandas.io.parsers.ExcelFile.
  15. :param sheet_name: The name of the sheet to parse.
  16. :type sheet_name: str.
  17. :returns: data - a pandas.DataFrame object.
  18. :raises: GenepixParseError
  19. """
  20. raw_data = xls.parse(sheet_name)
  21. # if raw_data.shape[-1] != 77:
  22. # import IPython; IPython.embed()
  23. # raise GenepixParseError("Sheet does not have 77 columns.")
  24. header_set = set(['Block', 'Row'])
  25. for (starting_row_idx, row) in raw_data.iterrows():
  26. if header_set.issubset(set(row)):
  27. logging.debug("\t...skipping first {0} rows".format(
  28. starting_row_idx
  29. ))
  30. break
  31. if starting_row_idx > 200:
  32. msg = "Starting row for genepix file suspciously high: {0}".format(
  33. starting_row_idx
  34. )
  35. logging.error(msg)
  36. data = raw_data.drop(range(starting_row_idx + 1))
  37. data.columns = raw_data.ix[starting_row_idx]
  38. data.index = range(data.shape[0])
  39. data = data.replace("Error", numpy.nan)
  40. return data
  41. def read_genepix(filepath, max_sheets=None, needed_sheets=None):
  42. """ Parses an Excel spreadsheet containing Genepix data. The
  43. spreadsheet should have one sheet for each column on the
  44. device.
  45. :param filepath: Path to Excel file to parse
  46. :type xls: str.
  47. :param max_sheets: Max number of sheets to parse
  48. :type max_sheets: int.
  49. :param needed_sheets: List of names of sheets to be extracted.
  50. :type needed_sheets: iterable.
  51. :returns: parsed_sheets -- list of DataFrames, one for each sheet
  52. :raises: GenepixParseError
  53. """
  54. xls = pandas.ExcelFile(filepath)
  55. parsed_sheets = OrderedDict()
  56. if not needed_sheets:
  57. needed_sheets = xls.sheet_names
  58. for i, sheet_name in enumerate(needed_sheets):
  59. if max_sheets != None and i > max_sheets:
  60. break
  61. logging.debug("Genepix sheet: {0}".format(sheet_name))
  62. data = parse_genepix_sheet(xls, sheet_name)
  63. parsed_sheets[sheet_name] = data
  64. return parsed_sheets