PageRenderTime 102ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/lincs/analysis/genepix.py

https://bitbucket.org/kljensen/kmj_lincs
Python | 637 lines | 630 code | 2 blank | 5 comment | 3 complexity | 4464310b9551d5cb5cc8d6424cc5601a MD5 | raw file
  1. import numpy
  2. import pandas
  3. import logging
  4. from collections import namedtuple
  5. from .elements import counts_to_dataframe
  6. from .pairwise2 import align
  7. class GenepixProcessingError(Exception):
  8. """docstring for GenepixProcessingError"""
  9. pass
  10. def align_intensities_to_cells(gcol, ccol):
  11. """ Align Genepix intensity readings to cell counts in each ROI.
  12. :param gcol: Column of intensities
  13. :type sheetdict: pandas.DataFrame
  14. :param ccol: Column of cell counts
  15. :type genepdict: collections.Counter
  16. :returns: alignment.
  17. """
  18. # Sum intensity for each row of a column on the device
  19. #
  20. sumi = [x.sum() for _, x in gcol.iterrows()]
  21. # Number of wells in this column that have cells in them
  22. #
  23. num_populated_wells = len(ccol.keys())
  24. # Assuming only wells that have cells should have intensity,
  25. # what is our threshold_intensity? That is, if 30% of the
  26. # wells have cells, our threshold_intensity should be the
  27. # 30% percentile of intensitites.
  28. #
  29. threshold_intensity = sorted(sumi, reverse=True)[num_populated_wells]
  30. ibool = [
  31. (1 if x >= threshold_intensity else 0)
  32. for x in sumi
  33. ]
  34. cbool = [
  35. (1 if ccol[i] > 0 else 0)
  36. for i in range(max(ccol.keys()) + 1)
  37. ]
  38. sjoin = lambda x: "".join(str(y) for y in x)
  39. alignment = align.localms(
  40. sjoin(ibool),
  41. sjoin(cbool),
  42. 2, -1, -4, -0.25,
  43. penalize_end_gaps=False,
  44. one_alignment_only=True
  45. )
  46. return alignment
  47. def collect_intensity_per_row(xls_sheet):
  48. """ Collect the a DataFrame of intensities for each well in
  49. for which there is data in this xls_sheet. Returns a
  50. list of DataFrame objects, one per well.
  51. """
  52. num_ab = None
  53. intensities = []
  54. grouped_data = xls_sheet.groupby(('Block', 'Row'))
  55. for (i, ((block, row), group)) in enumerate(grouped_data):
  56. # Ensure we have a consisten number of columns,
  57. # which correspond to antibody intensities.
  58. #
  59. if not num_ab:
  60. num_ab = len(group)
  61. logging.info("Found {0} antibody intensities".format(num_ab))
  62. elif len(group) != num_ab:
  63. msg = "Varying number of antibodies in Genepix file."
  64. msg += " Expected {0} but found {1}".format(num_ab, len(group))
  65. raise GenepixProcessingError(msg)
  66. # `group` is a DataFrame of shape, e.g. (20, 77), where 20 is
  67. # the number of antibodies and 77 are the various Genepix readings.
  68. #
  69. intensities.append(group)
  70. return intensities
  71. def process_ab_intensities_for_column(gpix_wells, flow_pattern,
  72. reference_ab, ab_channels, column_number, ab_names,
  73. channel_shifts=None,
  74. num_end_wells_ignored=None, inverted=False):
  75. """ Take per-well data for a single column of the device and
  76. extract the intensities for the Abs of interest, computing
  77. their "normalized" intensities, e.g. the mean minus median
  78. of the background for a particular channel.
  79. :param gpix_wells: List of DataFrame objects,
  80. each with Genepix data for a single well.
  81. :type gpix_wells: list
  82. :param flow_pattern: Mapping from flow lines to an Ab name.
  83. :type flow_pattern: dictionary
  84. :param reference_ab: Name of reference Ab.
  85. :type reference_ab: string or unicode
  86. :param ab_channels: Mapping from Ab name to a fluorescence channel.
  87. :type ab_channels: dictionary or defaultdict
  88. :returns: ??.
  89. """
  90. num_gpix_wells = len(gpix_wells)
  91. if inverted:
  92. well_iterator = reversed(gpix_wells)
  93. else:
  94. well_iterator = iter(gpix_wells)
  95. well_indicies = []
  96. well_intensities = []
  97. for (i, well) in enumerate(well_iterator):
  98. logging.debug("\tWell {0}".format(i))
  99. # Are we at the top of the device?
  100. #
  101. if num_end_wells_ignored and i < num_end_wells_ignored:
  102. continue
  103. # Or at the end of the device?
  104. #
  105. if (num_end_wells_ignored
  106. and i > num_gpix_wells - num_end_wells_ignored):
  107. break
  108. ab_intensities = \
  109. process_ab_intensities_for_well(well, reference_ab,
  110. ab_channels, flow_pattern,
  111. ab_names,
  112. channel_shifts=channel_shifts,
  113. )
  114. well_indicies.append((column_number, i))
  115. well_intensities.append(ab_intensities)
  116. df = pandas.DataFrame(
  117. well_intensities,
  118. index=pandas.MultiIndex.from_tuples(well_indicies),
  119. columns=ab_names + ["error"],
  120. )
  121. return df
  122. def mean_minus_bg_channel_name(channel):
  123. """ The channel names we want out of the Genepix data
  124. look like "F488 Mean - B488" which is the foreground
  125. mean fluorescence minus the median of the background,
  126. calculated using a small border around the "target box"
  127. """
  128. return "F{0} Mean - B{0}".format(channel)
  129. def process_ab_intensities_for_well(well, reference_ab, ab_channels,
  130. flow_pattern, ab_names, channel_shifts=None):
  131. """ Given data for a single well and the flow pattern, try to
  132. figure out the line indexes from which we should take each
  133. Ab's reading.
  134. """
  135. # Find the row number (row in the pandas sense) that
  136. # corresponds to the reference Ab. This is assumed
  137. # to be the row with the largest fluorescence for the
  138. # reference_ab's fluorescence channel.
  139. #
  140. channel_name = mean_minus_bg_channel_name(ab_channels[reference_ab])
  141. reference_ab_col_idx = numpy.argmax(well[channel_name])
  142. logging.debug("\t\t{0} at position {1}".format(
  143. reference_ab,
  144. reference_ab_col_idx
  145. ))
  146. # See if it is at the left or right of the well
  147. # (equivalently top or bottom of the `well` object here).
  148. # The reference Ab is ALWAYS ASSUMED to be on one side of the
  149. # flow pattern. If it is not, we have no way to orient ourselves!
  150. #
  151. if reference_ab_col_idx < (len(well) - reference_ab_col_idx):
  152. direction = -1
  153. logging.debug("\t\tLeft to right")
  154. else:
  155. direction = 1
  156. logging.debug("\t\tRight to left")
  157. # Reset the index so that it is 0->20-ish instead
  158. # of the original Genepix data file row numbers
  159. #
  160. well.index = range(len(well))
  161. # import IPython; IPython.embed()
  162. ab_intensities = []
  163. had_exception = False
  164. for ab in ab_names:
  165. # How many positions is this Ab offset from the
  166. # reference_ab in the flow pattern?
  167. #
  168. delta = flow_pattern[ab] - flow_pattern[reference_ab]
  169. # Find the index in the data for this Ab. This is
  170. # the reference_ab_col_idx plus this Ab's offset in
  171. # the flow pattern from the reference_ab. We must
  172. # multiply this by the (-1 or +1) direction because
  173. # the flow pattern is inverted in some cases (our
  174. # "zig-zag" pattern). Finally, sometimes the Genepix
  175. # software will mis-align images it takes of different
  176. # fluorescence patterns. Such mis-alignments are a
  177. # constant factor per-channel, as specified in the
  178. # the channel_shifts dictionary which is a user-set
  179. # parameter in the configuration file.
  180. #
  181. idx = reference_ab_col_idx + (direction * delta)
  182. # Get the channel_shift for this channel, by default zero.
  183. #
  184. if channel_shifts:
  185. idx += channel_shifts.get(ab_channels[ab], 0)
  186. logging.debug("\t\t{0} at position {1}".format(ab, idx))
  187. # Get the fluorescence column name for the channel in
  188. # which this Ab fluorescences.
  189. #
  190. col = mean_minus_bg_channel_name(ab_channels[ab])
  191. # Extract the data
  192. #
  193. try:
  194. # Do not allow negative intensities, instead,
  195. # consider them to be zero, else they will
  196. # oddly affect statistics such as sum intensitites
  197. # for a row, etc.
  198. #
  199. this_ab_intensity = max(well[col][idx], 0.)
  200. ab_intensities.append(this_ab_intensity)
  201. except KeyError:
  202. had_exception = True
  203. break
  204. if had_exception:
  205. ab_intensities = [None for ab in ab_names] + [True]
  206. else:
  207. ab_intensities.append(False)
  208. return ab_intensities
  209. def extract_intensities(sheetdict, genepdict,
  210. num_columns, num_wells,
  211. flow_pattern, reference_ab, ab_channels,
  212. num_end_wells_ignored=None,
  213. channel_shifts=None,
  214. inverted=False):
  215. """ Compute the intensities for each well in each well of each column
  216. on the device. The intensities will typically have 20 values,
  217. one for each antibody.
  218. :param sheetdict: Mapping from col number to sheet names
  219. :type sheetdict: dict
  220. :param genepdict: Mapping from sheet name to sheet data as DataFrames
  221. :type genepdict: dict
  222. :param num_columns: Number of columns on the device
  223. :type num_columns: int
  224. :param num_wells: Number of wells in each column of the device
  225. :type num_wells: int
  226. :param flow_pattern: Mapping from flow lines to an Ab name.
  227. :type flow_pattern: dictionary
  228. :param reference_ab: Name of reference Ab.
  229. :type reference_ab: string or unicode
  230. :param ab_channels: Mapping from Ab name to a fluorescence channel.
  231. :type ab_channels: dictionary or defaultdict
  232. :returns: None.
  233. """
  234. all_intensities = None
  235. # Iterate over each column on the device and get a list of the
  236. # spreadsheet sheets
  237. #
  238. for col_num, sheets in sheetdict.iteritems():
  239. logging.info("Processing Genepix col {0}, sheets: {1}".format(
  240. col_num,
  241. ", ".join(sheets)
  242. ))
  243. this_col_intensities = None
  244. # Iterate over each sheet of the spreadsheet for this column
  245. # of the device. Most will have a 1-to-1 mapping from column
  246. # to spreadsheet. In cases where it is 1-to-N, the sheets must
  247. # be specified in the configuration file in the order they are
  248. # in the column: top-to-bottom.
  249. #
  250. for sheet in sheets:
  251. # Get the spreadsheet
  252. #
  253. xls_sheet = genepdict[sheet]
  254. # Extract the intensities for each well
  255. #
  256. # import IPython; IPython.embed()
  257. this_sheet_intensities = collect_intensity_per_row(
  258. xls_sheet,
  259. )
  260. # Append intensities for these wells to the other
  261. # intensities for this column if we have a 1-to-N column
  262. # to sheet mapping.
  263. #
  264. if this_col_intensities:
  265. this_col_intensities.extend(
  266. this_sheet_intensities
  267. )
  268. else:
  269. this_col_intensities = this_sheet_intensities
  270. # Raise an exception if the data don't look right.
  271. #
  272. if len(this_col_intensities) != num_wells:
  273. msg = ("Wrong number of rows in Genepix "
  274. "col {0}, got {1} expected {2}".format(
  275. col_num, len(this_col_intensities), num_wells
  276. ))
  277. logging.warning(msg)
  278. # raise GenepixProcessingError(msg)
  279. # See if the columns are inverted. That is, the orientation
  280. # of the device was rotated in the Genepix analysis relative
  281. # to the Elements analysis.
  282. #
  283. if inverted:
  284. real_col_num = num_columns - (col_num + 1)
  285. else:
  286. real_col_num = col_num
  287. # all_intensities is a dictiontionary, the keys of which are
  288. # column numbers. The values are lists of length num_wells.
  289. # The members of each list are DataFrame objects describing
  290. # the fluorescence values of each well. These are generally
  291. # of shape (20, 77) where 20 is the number of flow lines and
  292. # 77 are the Genepix data for each line.
  293. #
  294. # all_intensities[col_num] = this_col_intensities
  295. logging.info("Genepix col {0} (real {1})".format(
  296. col_num,
  297. real_col_num
  298. ))
  299. ab_names = sorted(flow_pattern, key=flow_pattern.get)
  300. col_intensities = process_ab_intensities_for_column(
  301. this_col_intensities, flow_pattern,
  302. reference_ab, ab_channels, real_col_num,
  303. ab_names,
  304. num_end_wells_ignored=num_end_wells_ignored,
  305. inverted=inverted,
  306. channel_shifts=channel_shifts,
  307. )
  308. if all_intensities is None:
  309. all_intensities = col_intensities
  310. else:
  311. all_intensities = all_intensities.append(col_intensities)
  312. # import IPython; IPython.embed()
  313. return all_intensities
  314. def join_counts_and_intensities(cell_counts, intensities):
  315. """ Join together the intensities and the cell counts.
  316. """
  317. cell_count_df = counts_to_dataframe(cell_counts, intensities.index)
  318. joined_intensities = intensities.join(cell_count_df)
  319. return joined_intensities
  320. def add_sum_nonref_column(intensities, reference_ab,
  321. ab_channels, col_name="sum_nonref"):
  322. """ Take a dataframe of intensities and add a column that
  323. is the sum of the intensities of all antibodies that
  324. are not the reference_ab.
  325. """
  326. ab_names = set(ab_channels.keys())
  327. abs_for_aln = [
  328. c for c in intensities.columns
  329. if c in ab_names and c != reference_ab
  330. ]
  331. intensities[col_name] = intensities.filter(abs_for_aln).apply(sum, axis=1)
  332. AlignmentDetails = namedtuple(
  333. "AlignmentDetails",
  334. "alignment aln_score_per_well percent_populated imin cmin cmax imax"
  335. )
  336. def align_all(intensities, cell_counts, reference_ab,
  337. ab_channels, num_columns):
  338. """ Compute optimal alignments of the
  339. """
  340. add_sum_nonref_column(intensities, reference_ab, ab_channels)
  341. mappings = {}
  342. alignment_details = {}
  343. warnings = []
  344. for col_num in range(num_columns):
  345. logging.info("Getting mapping for col {0}".format(col_num))
  346. had_warning = False
  347. (aln, pp, imin, cmin, cmax, imax) \
  348. = align_column(intensities, cell_counts, col_num)
  349. # Check for too many or too few cells in this column's wells.
  350. #
  351. if not (0.10 < pp < 0.90):
  352. msg = ("Col {0} had a populated well frequecy of {1} "
  353. "(it should be > 0.1 and < 0.9) "
  354. "and was therefore not aligned automatically."
  355. ).format(col_num, pp)
  356. warnings.append(msg)
  357. had_warning = True
  358. # Check for a low-scoring alignment, in which case we should
  359. # just keep the default alignment.
  360. #
  361. aln_score_per_well = float(aln[2]) / min(imax - imin, cmax - cmin)
  362. if aln_score_per_well <= 1.6:
  363. msg = ("Col {0} had aln_score_per_well of {1} "
  364. "(it should be > 1.6) "
  365. "and was therefore not aligned automatically."
  366. ).format(col_num, pp)
  367. warnings.append(msg)
  368. had_warning = True
  369. # In the absence of warnings, make a new intensity-to-cell count
  370. # row mapping.
  371. #
  372. if had_warning:
  373. mapping = None
  374. else:
  375. istr = aln[0]
  376. cstr = aln[1]
  377. mapping = create_mapping_from_alignment(
  378. istr, cstr,
  379. imin, cmin,
  380. cmax, imax,
  381. extend_counts=True,
  382. ipy=False,
  383. )
  384. mappings[col_num] = mapping
  385. alignment_details[col_num] = AlignmentDetails(
  386. aln,
  387. aln_score_per_well,
  388. pp,
  389. imin,
  390. cmin,
  391. cmax,
  392. imax
  393. )
  394. return (mappings, alignment_details, warnings)
  395. def align_column(intensities, cell_counts, col_num):
  396. """ Align a single column of cell counts against a single
  397. column of intensities.
  398. """
  399. # Find min and max indexes of the wells in this particular column.
  400. # E.g. we probably threw out the first five wells, because intensities
  401. # read from them
  402. #
  403. imin = min(intensities.ix[col_num].index)
  404. imax = max(intensities.ix[col_num].index)
  405. # Find the percent of wells that have cells.
  406. #
  407. percent_populated = float(
  408. len([v for v in cell_counts[col_num].values() if v > 0])) \
  409. / (max(cell_counts[col_num].keys()) + 1)
  410. # If 3% of wells have cells, then we expect the top 3% of
  411. # intensities to correspond to these wells.
  412. #
  413. threshold_idx = int(
  414. percent_populated * len(intensities['sum_nonref'].ix[col_num])
  415. )
  416. # Handle case in which 100% of the wells are populated
  417. threshold_idx = min(
  418. threshold_idx,
  419. len(intensities['sum_nonref'].ix[col_num]) - 1
  420. )
  421. # Find the threshold intensity, which the the ith highest
  422. # intensity, where i = threshold_idx.
  423. #
  424. ithreshold = sorted(
  425. intensities['sum_nonref'].ix[col_num], reverse=True
  426. )[threshold_idx]
  427. thresholded_intensities = ['0'] * (imax - imin + 1)
  428. for wellno, well in intensities.ix[col_num].iterrows():
  429. if well['sum_nonref'] > ithreshold:
  430. thresholded_intensities[wellno - imin] = '1'
  431. istr = ''.join(thresholded_intensities)
  432. cmin = min(cell_counts[col_num].keys())
  433. cmax = max(cell_counts[col_num].keys())
  434. cstr = ''.join([
  435. ('1' if cell_counts[col_num][i] > 0 else '0')
  436. for i in range(
  437. cmin,
  438. cmax + 1
  439. )
  440. ])
  441. aln = align.globalms(
  442. istr,
  443. cstr,
  444. 2,
  445. -1,
  446. -10,
  447. -1,
  448. penalize_end_gaps=False,
  449. one_alignment_only=True
  450. )[0]
  451. return (aln, percent_populated, imin, cmin, cmax, imax)
  452. def create_mapping_from_alignment(istr, cstr,
  453. imin, cmin, cmax, imax, gapchar='-',
  454. extend_counts=True, ipy=False):
  455. """ Takes a string alignment of the thresholded intensities
  456. and cell counts. Returns a mapping from intensity row
  457. index to cell row index.
  458. If `extend_counts` is True, then we will not terminate
  459. the mapping at cmax. This is because the elements
  460. software only outputs the locations of cells and does
  461. not output something like "ROI #300 has zero cells".
  462. Therefore, the highest ROI number of which we have data
  463. may be substantially lower than number of
  464. """
  465. # Ensure the alignments are of the same size
  466. #
  467. if len(istr) != len(cstr):
  468. msg = "Itensity and cell count alignments must be same length!"
  469. raise Exception(msg)
  470. else:
  471. alignment_length = len(istr)
  472. # The mapping is a dictionary that translates the intensity
  473. # row number to a corresponding cell count well number.
  474. #
  475. mapping = {}
  476. # Keep track of how many non-gap characters we've seen
  477. # of each the intensitites and the cell counts.
  478. #
  479. i_chars_seen = 0
  480. c_chars_seen = 0
  481. c_past_end = False
  482. c_num_past_end = 0
  483. # Iterate over the whole aligment.
  484. #
  485. for j in range(alignment_length):
  486. # Break out if we're at the end
  487. #
  488. if (i_chars_seen + imin > imax):
  489. logging.info("\tBroke out due to imax")
  490. break
  491. if (c_chars_seen + cmin > cmax):
  492. c_past_end = True
  493. if not extend_counts:
  494. logging.info("\tBroke out due to cmax")
  495. break
  496. # If two non-gap characters, make a mapping
  497. #
  498. if istr[j] != gapchar:
  499. if cstr[j] != gapchar or (c_past_end and extend_counts):
  500. mapping[i_chars_seen + imin] \
  501. = c_chars_seen + cmin + c_num_past_end
  502. # If either one is non-gap, increment number
  503. # of charaters seen.
  504. #
  505. if istr[j] != gapchar:
  506. i_chars_seen += 1
  507. if cstr[j] != gapchar:
  508. c_chars_seen += 1
  509. # If we're past the end of the cell counts, c_chars_seen
  510. # will not be incremented but instead we'll increment
  511. # c_num_past_end.
  512. #
  513. if c_past_end and extend_counts:
  514. c_num_past_end += 1
  515. if ipy:
  516. import IPython
  517. IPython.embed()
  518. return mapping
  519. def find_high_zero_cell(data, antibodies, cell_col, percentile=75):
  520. """ Takes a dataframe with intensity data and a list of antibodies.
  521. Adds a column called "high_empty_well_abs". For each well with
  522. zero cells, this function will list those antibodies that
  523. have intensities about the 75th percentile of intensities
  524. for the wells with one cell. That is, we're finding wells
  525. with zero cells that have suspiciously high intensities.
  526. """
  527. thresholds = dict(
  528. (ab, numpy.percentile(data[data[cell_col] == 1][ab], 75))
  529. for ab in antibodies
  530. )
  531. def label_suspicious_for_row(x):
  532. suspicious = [
  533. ab for ab in antibodies
  534. if x[cell_col] == 0 and x[ab] > thresholds[ab]
  535. ]
  536. return ", ".join(suspicious)
  537. # Returns a pandas.Series object
  538. #
  539. suspicious = data.apply(label_suspicious_for_row, axis=1)
  540. data['high_empty_well_abs'] = suspicious