/lincs/analysis/genepix.py
Python | 637 lines | 630 code | 2 blank | 5 comment | 3 complexity | 4464310b9551d5cb5cc8d6424cc5601a MD5 | raw file
- import numpy
- import pandas
- import logging
- from collections import namedtuple
- from .elements import counts_to_dataframe
- from .pairwise2 import align
- class GenepixProcessingError(Exception):
- """docstring for GenepixProcessingError"""
- pass
- def align_intensities_to_cells(gcol, ccol):
- """ Align Genepix intensity readings to cell counts in each ROI.
- :param gcol: Column of intensities
- :type sheetdict: pandas.DataFrame
- :param ccol: Column of cell counts
- :type genepdict: collections.Counter
- :returns: alignment.
- """
- # Sum intensity for each row of a column on the device
- #
- sumi = [x.sum() for _, x in gcol.iterrows()]
- # Number of wells in this column that have cells in them
- #
- num_populated_wells = len(ccol.keys())
- # Assuming only wells that have cells should have intensity,
- # what is our threshold_intensity? That is, if 30% of the
- # wells have cells, our threshold_intensity should be the
- # 30% percentile of intensitites.
- #
- threshold_intensity = sorted(sumi, reverse=True)[num_populated_wells]
- ibool = [
- (1 if x >= threshold_intensity else 0)
- for x in sumi
- ]
- cbool = [
- (1 if ccol[i] > 0 else 0)
- for i in range(max(ccol.keys()) + 1)
- ]
- sjoin = lambda x: "".join(str(y) for y in x)
- alignment = align.localms(
- sjoin(ibool),
- sjoin(cbool),
- 2, -1, -4, -0.25,
- penalize_end_gaps=False,
- one_alignment_only=True
- )
- return alignment
- def collect_intensity_per_row(xls_sheet):
- """ Collect the a DataFrame of intensities for each well in
- for which there is data in this xls_sheet. Returns a
- list of DataFrame objects, one per well.
- """
- num_ab = None
- intensities = []
- grouped_data = xls_sheet.groupby(('Block', 'Row'))
- for (i, ((block, row), group)) in enumerate(grouped_data):
- # Ensure we have a consisten number of columns,
- # which correspond to antibody intensities.
- #
- if not num_ab:
- num_ab = len(group)
- logging.info("Found {0} antibody intensities".format(num_ab))
- elif len(group) != num_ab:
- msg = "Varying number of antibodies in Genepix file."
- msg += " Expected {0} but found {1}".format(num_ab, len(group))
- raise GenepixProcessingError(msg)
- # `group` is a DataFrame of shape, e.g. (20, 77), where 20 is
- # the number of antibodies and 77 are the various Genepix readings.
- #
- intensities.append(group)
- return intensities
- def process_ab_intensities_for_column(gpix_wells, flow_pattern,
- reference_ab, ab_channels, column_number, ab_names,
- channel_shifts=None,
- num_end_wells_ignored=None, inverted=False):
- """ Take per-well data for a single column of the device and
- extract the intensities for the Abs of interest, computing
- their "normalized" intensities, e.g. the mean minus median
- of the background for a particular channel.
- :param gpix_wells: List of DataFrame objects,
- each with Genepix data for a single well.
- :type gpix_wells: list
- :param flow_pattern: Mapping from flow lines to an Ab name.
- :type flow_pattern: dictionary
- :param reference_ab: Name of reference Ab.
- :type reference_ab: string or unicode
- :param ab_channels: Mapping from Ab name to a fluorescence channel.
- :type ab_channels: dictionary or defaultdict
- :returns: ??.
- """
- num_gpix_wells = len(gpix_wells)
- if inverted:
- well_iterator = reversed(gpix_wells)
- else:
- well_iterator = iter(gpix_wells)
- well_indicies = []
- well_intensities = []
- for (i, well) in enumerate(well_iterator):
- logging.debug("\tWell {0}".format(i))
- # Are we at the top of the device?
- #
- if num_end_wells_ignored and i < num_end_wells_ignored:
- continue
- # Or at the end of the device?
- #
- if (num_end_wells_ignored
- and i > num_gpix_wells - num_end_wells_ignored):
- break
- ab_intensities = \
- process_ab_intensities_for_well(well, reference_ab,
- ab_channels, flow_pattern,
- ab_names,
- channel_shifts=channel_shifts,
- )
- well_indicies.append((column_number, i))
- well_intensities.append(ab_intensities)
- df = pandas.DataFrame(
- well_intensities,
- index=pandas.MultiIndex.from_tuples(well_indicies),
- columns=ab_names + ["error"],
- )
- return df
- def mean_minus_bg_channel_name(channel):
- """ The channel names we want out of the Genepix data
- look like "F488 Mean - B488" which is the foreground
- mean fluorescence minus the median of the background,
- calculated using a small border around the "target box"
- """
- return "F{0} Mean - B{0}".format(channel)
- def process_ab_intensities_for_well(well, reference_ab, ab_channels,
- flow_pattern, ab_names, channel_shifts=None):
- """ Given data for a single well and the flow pattern, try to
- figure out the line indexes from which we should take each
- Ab's reading.
- """
- # Find the row number (row in the pandas sense) that
- # corresponds to the reference Ab. This is assumed
- # to be the row with the largest fluorescence for the
- # reference_ab's fluorescence channel.
- #
- channel_name = mean_minus_bg_channel_name(ab_channels[reference_ab])
- reference_ab_col_idx = numpy.argmax(well[channel_name])
- logging.debug("\t\t{0} at position {1}".format(
- reference_ab,
- reference_ab_col_idx
- ))
- # See if it is at the left or right of the well
- # (equivalently top or bottom of the `well` object here).
- # The reference Ab is ALWAYS ASSUMED to be on one side of the
- # flow pattern. If it is not, we have no way to orient ourselves!
- #
- if reference_ab_col_idx < (len(well) - reference_ab_col_idx):
- direction = -1
- logging.debug("\t\tLeft to right")
- else:
- direction = 1
- logging.debug("\t\tRight to left")
- # Reset the index so that it is 0->20-ish instead
- # of the original Genepix data file row numbers
- #
- well.index = range(len(well))
- # import IPython; IPython.embed()
- ab_intensities = []
- had_exception = False
- for ab in ab_names:
- # How many positions is this Ab offset from the
- # reference_ab in the flow pattern?
- #
- delta = flow_pattern[ab] - flow_pattern[reference_ab]
- # Find the index in the data for this Ab. This is
- # the reference_ab_col_idx plus this Ab's offset in
- # the flow pattern from the reference_ab. We must
- # multiply this by the (-1 or +1) direction because
- # the flow pattern is inverted in some cases (our
- # "zig-zag" pattern). Finally, sometimes the Genepix
- # software will mis-align images it takes of different
- # fluorescence patterns. Such mis-alignments are a
- # constant factor per-channel, as specified in the
- # the channel_shifts dictionary which is a user-set
- # parameter in the configuration file.
- #
- idx = reference_ab_col_idx + (direction * delta)
- # Get the channel_shift for this channel, by default zero.
- #
- if channel_shifts:
- idx += channel_shifts.get(ab_channels[ab], 0)
- logging.debug("\t\t{0} at position {1}".format(ab, idx))
- # Get the fluorescence column name for the channel in
- # which this Ab fluorescences.
- #
- col = mean_minus_bg_channel_name(ab_channels[ab])
- # Extract the data
- #
- try:
- # Do not allow negative intensities, instead,
- # consider them to be zero, else they will
- # oddly affect statistics such as sum intensitites
- # for a row, etc.
- #
- this_ab_intensity = max(well[col][idx], 0.)
- ab_intensities.append(this_ab_intensity)
- except KeyError:
- had_exception = True
- break
- if had_exception:
- ab_intensities = [None for ab in ab_names] + [True]
- else:
- ab_intensities.append(False)
- return ab_intensities
- def extract_intensities(sheetdict, genepdict,
- num_columns, num_wells,
- flow_pattern, reference_ab, ab_channels,
- num_end_wells_ignored=None,
- channel_shifts=None,
- inverted=False):
- """ Compute the intensities for each well in each well of each column
- on the device. The intensities will typically have 20 values,
- one for each antibody.
- :param sheetdict: Mapping from col number to sheet names
- :type sheetdict: dict
- :param genepdict: Mapping from sheet name to sheet data as DataFrames
- :type genepdict: dict
- :param num_columns: Number of columns on the device
- :type num_columns: int
- :param num_wells: Number of wells in each column of the device
- :type num_wells: int
- :param flow_pattern: Mapping from flow lines to an Ab name.
- :type flow_pattern: dictionary
- :param reference_ab: Name of reference Ab.
- :type reference_ab: string or unicode
- :param ab_channels: Mapping from Ab name to a fluorescence channel.
- :type ab_channels: dictionary or defaultdict
- :returns: None.
- """
- all_intensities = None
- # Iterate over each column on the device and get a list of the
- # spreadsheet sheets
- #
- for col_num, sheets in sheetdict.iteritems():
- logging.info("Processing Genepix col {0}, sheets: {1}".format(
- col_num,
- ", ".join(sheets)
- ))
- this_col_intensities = None
- # Iterate over each sheet of the spreadsheet for this column
- # of the device. Most will have a 1-to-1 mapping from column
- # to spreadsheet. In cases where it is 1-to-N, the sheets must
- # be specified in the configuration file in the order they are
- # in the column: top-to-bottom.
- #
- for sheet in sheets:
- # Get the spreadsheet
- #
- xls_sheet = genepdict[sheet]
- # Extract the intensities for each well
- #
- # import IPython; IPython.embed()
- this_sheet_intensities = collect_intensity_per_row(
- xls_sheet,
- )
- # Append intensities for these wells to the other
- # intensities for this column if we have a 1-to-N column
- # to sheet mapping.
- #
- if this_col_intensities:
- this_col_intensities.extend(
- this_sheet_intensities
- )
- else:
- this_col_intensities = this_sheet_intensities
- # Raise an exception if the data don't look right.
- #
- if len(this_col_intensities) != num_wells:
- msg = ("Wrong number of rows in Genepix "
- "col {0}, got {1} expected {2}".format(
- col_num, len(this_col_intensities), num_wells
- ))
- logging.warning(msg)
- # raise GenepixProcessingError(msg)
- # See if the columns are inverted. That is, the orientation
- # of the device was rotated in the Genepix analysis relative
- # to the Elements analysis.
- #
- if inverted:
- real_col_num = num_columns - (col_num + 1)
- else:
- real_col_num = col_num
- # all_intensities is a dictiontionary, the keys of which are
- # column numbers. The values are lists of length num_wells.
- # The members of each list are DataFrame objects describing
- # the fluorescence values of each well. These are generally
- # of shape (20, 77) where 20 is the number of flow lines and
- # 77 are the Genepix data for each line.
- #
- # all_intensities[col_num] = this_col_intensities
- logging.info("Genepix col {0} (real {1})".format(
- col_num,
- real_col_num
- ))
- ab_names = sorted(flow_pattern, key=flow_pattern.get)
- col_intensities = process_ab_intensities_for_column(
- this_col_intensities, flow_pattern,
- reference_ab, ab_channels, real_col_num,
- ab_names,
- num_end_wells_ignored=num_end_wells_ignored,
- inverted=inverted,
- channel_shifts=channel_shifts,
- )
- if all_intensities is None:
- all_intensities = col_intensities
- else:
- all_intensities = all_intensities.append(col_intensities)
- # import IPython; IPython.embed()
- return all_intensities
- def join_counts_and_intensities(cell_counts, intensities):
- """ Join together the intensities and the cell counts.
- """
- cell_count_df = counts_to_dataframe(cell_counts, intensities.index)
- joined_intensities = intensities.join(cell_count_df)
- return joined_intensities
- def add_sum_nonref_column(intensities, reference_ab,
- ab_channels, col_name="sum_nonref"):
- """ Take a dataframe of intensities and add a column that
- is the sum of the intensities of all antibodies that
- are not the reference_ab.
- """
- ab_names = set(ab_channels.keys())
- abs_for_aln = [
- c for c in intensities.columns
- if c in ab_names and c != reference_ab
- ]
- intensities[col_name] = intensities.filter(abs_for_aln).apply(sum, axis=1)
- AlignmentDetails = namedtuple(
- "AlignmentDetails",
- "alignment aln_score_per_well percent_populated imin cmin cmax imax"
- )
- def align_all(intensities, cell_counts, reference_ab,
- ab_channels, num_columns):
- """ Compute optimal alignments of the
- """
- add_sum_nonref_column(intensities, reference_ab, ab_channels)
- mappings = {}
- alignment_details = {}
- warnings = []
- for col_num in range(num_columns):
- logging.info("Getting mapping for col {0}".format(col_num))
- had_warning = False
- (aln, pp, imin, cmin, cmax, imax) \
- = align_column(intensities, cell_counts, col_num)
- # Check for too many or too few cells in this column's wells.
- #
- if not (0.10 < pp < 0.90):
- msg = ("Col {0} had a populated well frequecy of {1} "
- "(it should be > 0.1 and < 0.9) "
- "and was therefore not aligned automatically."
- ).format(col_num, pp)
- warnings.append(msg)
- had_warning = True
- # Check for a low-scoring alignment, in which case we should
- # just keep the default alignment.
- #
- aln_score_per_well = float(aln[2]) / min(imax - imin, cmax - cmin)
- if aln_score_per_well <= 1.6:
- msg = ("Col {0} had aln_score_per_well of {1} "
- "(it should be > 1.6) "
- "and was therefore not aligned automatically."
- ).format(col_num, pp)
- warnings.append(msg)
- had_warning = True
- # In the absence of warnings, make a new intensity-to-cell count
- # row mapping.
- #
- if had_warning:
- mapping = None
- else:
- istr = aln[0]
- cstr = aln[1]
- mapping = create_mapping_from_alignment(
- istr, cstr,
- imin, cmin,
- cmax, imax,
- extend_counts=True,
- ipy=False,
- )
- mappings[col_num] = mapping
- alignment_details[col_num] = AlignmentDetails(
- aln,
- aln_score_per_well,
- pp,
- imin,
- cmin,
- cmax,
- imax
- )
- return (mappings, alignment_details, warnings)
- def align_column(intensities, cell_counts, col_num):
- """ Align a single column of cell counts against a single
- column of intensities.
- """
- # Find min and max indexes of the wells in this particular column.
- # E.g. we probably threw out the first five wells, because intensities
- # read from them
- #
- imin = min(intensities.ix[col_num].index)
- imax = max(intensities.ix[col_num].index)
- # Find the percent of wells that have cells.
- #
- percent_populated = float(
- len([v for v in cell_counts[col_num].values() if v > 0])) \
- / (max(cell_counts[col_num].keys()) + 1)
- # If 3% of wells have cells, then we expect the top 3% of
- # intensities to correspond to these wells.
- #
- threshold_idx = int(
- percent_populated * len(intensities['sum_nonref'].ix[col_num])
- )
- # Handle case in which 100% of the wells are populated
- threshold_idx = min(
- threshold_idx,
- len(intensities['sum_nonref'].ix[col_num]) - 1
- )
- # Find the threshold intensity, which the the ith highest
- # intensity, where i = threshold_idx.
- #
- ithreshold = sorted(
- intensities['sum_nonref'].ix[col_num], reverse=True
- )[threshold_idx]
- thresholded_intensities = ['0'] * (imax - imin + 1)
- for wellno, well in intensities.ix[col_num].iterrows():
- if well['sum_nonref'] > ithreshold:
- thresholded_intensities[wellno - imin] = '1'
- istr = ''.join(thresholded_intensities)
- cmin = min(cell_counts[col_num].keys())
- cmax = max(cell_counts[col_num].keys())
- cstr = ''.join([
- ('1' if cell_counts[col_num][i] > 0 else '0')
- for i in range(
- cmin,
- cmax + 1
- )
- ])
- aln = align.globalms(
- istr,
- cstr,
- 2,
- -1,
- -10,
- -1,
- penalize_end_gaps=False,
- one_alignment_only=True
- )[0]
- return (aln, percent_populated, imin, cmin, cmax, imax)
- def create_mapping_from_alignment(istr, cstr,
- imin, cmin, cmax, imax, gapchar='-',
- extend_counts=True, ipy=False):
- """ Takes a string alignment of the thresholded intensities
- and cell counts. Returns a mapping from intensity row
- index to cell row index.
- If `extend_counts` is True, then we will not terminate
- the mapping at cmax. This is because the elements
- software only outputs the locations of cells and does
- not output something like "ROI #300 has zero cells".
- Therefore, the highest ROI number of which we have data
- may be substantially lower than number of
- """
- # Ensure the alignments are of the same size
- #
- if len(istr) != len(cstr):
- msg = "Itensity and cell count alignments must be same length!"
- raise Exception(msg)
- else:
- alignment_length = len(istr)
- # The mapping is a dictionary that translates the intensity
- # row number to a corresponding cell count well number.
- #
- mapping = {}
- # Keep track of how many non-gap characters we've seen
- # of each the intensitites and the cell counts.
- #
- i_chars_seen = 0
- c_chars_seen = 0
- c_past_end = False
- c_num_past_end = 0
- # Iterate over the whole aligment.
- #
- for j in range(alignment_length):
- # Break out if we're at the end
- #
- if (i_chars_seen + imin > imax):
- logging.info("\tBroke out due to imax")
- break
- if (c_chars_seen + cmin > cmax):
- c_past_end = True
- if not extend_counts:
- logging.info("\tBroke out due to cmax")
- break
- # If two non-gap characters, make a mapping
- #
- if istr[j] != gapchar:
- if cstr[j] != gapchar or (c_past_end and extend_counts):
- mapping[i_chars_seen + imin] \
- = c_chars_seen + cmin + c_num_past_end
- # If either one is non-gap, increment number
- # of charaters seen.
- #
- if istr[j] != gapchar:
- i_chars_seen += 1
- if cstr[j] != gapchar:
- c_chars_seen += 1
- # If we're past the end of the cell counts, c_chars_seen
- # will not be incremented but instead we'll increment
- # c_num_past_end.
- #
- if c_past_end and extend_counts:
- c_num_past_end += 1
- if ipy:
- import IPython
- IPython.embed()
- return mapping
- def find_high_zero_cell(data, antibodies, cell_col, percentile=75):
- """ Takes a dataframe with intensity data and a list of antibodies.
- Adds a column called "high_empty_well_abs". For each well with
- zero cells, this function will list those antibodies that
- have intensities about the 75th percentile of intensities
- for the wells with one cell. That is, we're finding wells
- with zero cells that have suspiciously high intensities.
- """
- thresholds = dict(
- (ab, numpy.percentile(data[data[cell_col] == 1][ab], 75))
- for ab in antibodies
- )
- def label_suspicious_for_row(x):
- suspicious = [
- ab for ab in antibodies
- if x[cell_col] == 0 and x[ab] > thresholds[ab]
- ]
- return ", ".join(suspicious)
- # Returns a pandas.Series object
- #
- suspicious = data.apply(label_suspicious_for_row, axis=1)
- data['high_empty_well_abs'] = suspicious