PageRenderTime 48ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/lincs/io/elements.py

https://bitbucket.org/kljensen/kmj_lincs
Python | 157 lines | 77 code | 26 blank | 54 comment | 21 complexity | 4b295031ab2d2ce29e5fb6e27896be21 MD5 | raw file
  1. import pandas
  2. import numpy
  3. import logging
  4. import itertools
  5. from collections import OrderedDict, Counter
  6. from ..analysis.elements import extract_cell_counts
  class ElementsParseError(Exception):
      """Signal that an Elements spreadsheet could not be parsed."""
  def read_cell_counts(filepath, sheetdict, manual_columns):
      """ Reads the cell counts for every column listed in `sheetdict`.

      Columns named in `manual_columns` are read directly with
      `read_manual_cell_counts`; all remaining columns are parsed with
      `read_elements` and counted with `extract_cell_counts`.

      :param filepath: Path to Excel file to parse
      :type filepath: str.
      :param sheetdict: Mapping from col number to sheet names
      :type sheetdict: dict
      :param manual_columns: Columns that were counted by hand.  `True`
          means every column; `None` or `False` means no column.
      :type manual_columns: bool, None or container of col numbers
      :returns: dict -- the manual counts updated with the automated
          counts (automated entries win if a key appears in both).
      """
      # Treat "nothing was counted by hand" uniformly so that the
      # membership test below cannot fail on None / False.
      if manual_columns is None or manual_columns is False:
          manual_columns = ()

      # Split the columns into those counted by hand and those counted
      # automatically.  `.items()` works on both Python 2 and 3.
      manual_col_dict = {}
      automated_col_dict = {}
      for col_no, sheet_names in sheetdict.items():
          if manual_columns is True or col_no in manual_columns:
              manual_col_dict[col_no] = sheet_names
          else:
              automated_col_dict[col_no] = sheet_names

      # Handle the case in which cell counting was done manually.
      total_counts = read_manual_cell_counts(filepath, manual_col_dict)

      # Handle the case in which the cell counting was done using the
      # Elements software.
      if automated_col_dict:
          elem = read_elements(
              filepath,
              needed_sheets=itertools.chain.from_iterable(
                  automated_col_dict.values()
              ),
          )
          # Calculate the Elements cell counts and combine the two.
          total_counts.update(extract_cell_counts(automated_col_dict, elem))

      return total_counts
  def read_manual_cell_counts(filepath, sheetdict):
      """ Parses an Excel spreadsheet containing cell counts.  Returns
      a dict of Counter objects, the same as the `extract_cell_counts`
      function in `analysis.elements`.

      Each sheet is read without a header row; the row index is used as
      the well number and the first cell of the row as its cell count.

      :param filepath: Path to Excel file to parse
      :type filepath: str.
      :param sheetdict: Mapping from col number to sheet names
      :type sheetdict: dict
      :returns: dict of Counter objects, keyed by col number.  An empty
          dict is returned (and the file is not opened) when `sheetdict`
          is empty.
      """
      if not sheetdict:
          return {}

      cell_counts = {}
      # The context manager closes the workbook even if parsing fails.
      with pandas.ExcelFile(filepath) as xls:
          # `.items()` works on both Python 2 and 3.
          for col_no, sheet_names in sheetdict.items():
              col_cell_count = Counter()
              for sheet_name in sheet_names:
                  raw_data = xls.parse(sheet_name, header=None)
                  for well_no, cell_count in raw_data.iterrows():
                      # NOTE(review): when a column lists several sheets,
                      # a later sheet overwrites (does not add to) the
                      # count of the same well -- confirm this is intended.
                      col_cell_count[well_no] = cell_count[0]
              cell_counts[col_no] = col_cell_count
      return cell_counts
  def parse_elements_sheet(xls, sheet_name):
      """ Parses a single sheet from an Excel spreadsheet containing
      Elements data.  This sheet should contain the data for one
      column of a device used in a LINCS experiment.

      The sheet is scanned from the bottom up: first past the row whose
      first cell is "Feature" (the metadata at the bottom of the sheet),
      then up to the nearest row whose first cell converts to an integer.
      That row and everything above it are kept, the rest is dropped, and
      "N/A" cells are replaced by NaN.

      :param xls: Excel file object containing sheets
      :type xls: pandas.io.parsers.ExcelFile.
      :param sheet_name: The name of the sheet to parse.
      :type sheet_name: str.
      :returns: data - a pandas.DataFrame object.
      :raises: ElementsParseError if the sheet does not have 22 or 23
          columns, or if no "Feature" row is found below the first row.
      """
      raw_data = xls.parse(sheet_name)
      if raw_data.shape[-1] not in (22, 23):
          raise ElementsParseError("Sheet does not have 22 or 23 columns.")

      n_rows = raw_data.shape[0]
      # Positional access: the column labels come from the sheet header,
      # so label-based `[0]` lookups are not reliable here.
      first_column = raw_data.iloc[:, 0]

      passed_feature = False
      # Row 0 is never inspected by the scan; if no integer row is found
      # above the "Feature" marker only that first row is kept.
      stop_row_idx = 1
      for row_idx in range(n_rows - 1, 0, -1):
          first_cell = first_column.iloc[row_idx]
          if not passed_feature:
              # Go back until we've passed the metadata at the bottom
              # of the spreadsheet.
              if first_cell == "Feature":
                  passed_feature = True
              continue
          # Now go back until we find a row that begins with an integer.
          # Blank (NaN) and text cells raise ValueError, other
          # non-numeric objects raise TypeError; both mean "keep looking".
          try:
              int(first_cell)
          except (TypeError, ValueError):
              continue
          stop_row_idx = row_idx + 1
          break

      if not passed_feature:
          raise ElementsParseError("Sheet does not contain a 'Feature' row.")

      data = raw_data.iloc[:stop_row_idx]
      return data.replace("N/A", numpy.nan)
  def read_elements(filepath, max_sheets=None, needed_sheets=None):
      """ Parses an Excel spreadsheet containing Elements data.  The
      spreadsheet should have one sheet for each column on the
      device.

      :param filepath: Path to Excel file to parse
      :type filepath: str.
      :param max_sheets: Max number of sheets to parse; `None` means no
          limit.
      :type max_sheets: int.
      :param needed_sheets: Names of sheets to be extracted.  When empty
          or `None`, every sheet in the workbook is parsed.
      :type needed_sheets: iterable.
      :returns: OrderedDict mapping each parsed sheet name to the
          DataFrame returned by `parse_elements_sheet`, in parse order.
      :raises: ElementsParseError
      """
      parsed_sheets = OrderedDict()
      # The context manager closes the workbook even if a sheet fails
      # to parse.
      with pandas.ExcelFile(filepath) as xls:
          if not needed_sheets:
              needed_sheets = xls.sheet_names
          for i, sheet_name in enumerate(needed_sheets):
              # `>=` so that at most `max_sheets` sheets are parsed
              # (i is zero-based).
              if max_sheets is not None and i >= max_sheets:
                  break
              logging.debug("Elements sheet: %s", sheet_name)
              parsed_sheets[sheet_name] = parse_elements_sheet(xls, sheet_name)
      return parsed_sheets