kmj_lincs /lincs/analysis/genepix.py

Language Python Lines 638
MD5 Hash 4464310b9551d5cb5cc8d6424cc5601a Estimated Cost $8,006 (why?)
Repository https://bitbucket.org/kljensen/kmj_lincs.git View Raw File View Project SPDX
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
import numpy
import pandas
import logging
from collections import namedtuple
from .elements import counts_to_dataframe
from .pairwise2 import align


class GenepixProcessingError(Exception):
    """Raised when Genepix spreadsheet data cannot be processed
    (e.g. inconsistent antibody counts between wells).
    """


def align_intensities_to_cells(gcol, ccol):
    """ Align Genepix intensity readings to cell counts in each ROI.

        :param gcol: Per-well intensity readings for one device column.
        :type gcol: pandas.DataFrame
        :param ccol: Cell counts per well index for the same column.
        :type ccol: collections.Counter
        :returns: local alignment(s) of the thresholded-intensity string
                  against the has-cells string, as returned by
                  ``align.localms``.
    """

    # Total intensity for each well (row) of this device column.
    #
    sumi = [x.sum() for _, x in gcol.iterrows()]

    # Number of wells in this column that have cells in them.
    #
    num_populated_wells = len(ccol.keys())

    # Assuming only wells that have cells should have intensity,
    # what is our threshold_intensity?  That is, if 30% of the
    # wells have cells, our threshold_intensity should be the
    # 30th percentile (from the top) of intensities.
    #
    # Clamp the rank so a fully-populated column (as many populated
    # wells as intensity readings) cannot index past the end of the
    # sorted list.
    #
    rank = min(num_populated_wells, len(sumi) - 1)
    threshold_intensity = sorted(sumi, reverse=True)[rank]

    # Binary strings: which wells look "bright" vs which wells have cells.
    #
    ibool = [
        (1 if x >= threshold_intensity else 0)
        for x in sumi
    ]
    cbool = [
        (1 if ccol[i] > 0 else 0)
        for i in range(max(ccol.keys()) + 1)
    ]
    sjoin = lambda x: "".join(str(y) for y in x)
    alignment = align.localms(
                    sjoin(ibool),
                    sjoin(cbool),
                    2, -1, -4, -0.25,
                    penalize_end_gaps=False,
                    one_alignment_only=True
    )

    return alignment


def collect_intensity_per_row(xls_sheet):
    """ Collect a DataFrame of intensities for each well for which
        there is data in this xls_sheet.

        :param xls_sheet: Genepix data with 'Block' and 'Row' columns.
        :type xls_sheet: pandas.DataFrame
        :returns: list of DataFrame objects, one per (Block, Row) well.
        :raises GenepixProcessingError: if wells disagree on the number
                                        of antibody rows.
    """
    num_ab = None
    intensities = []

    # Group by both key columns.  A list of labels (not a tuple) is
    # required: tuple group keys were deprecated and later removed
    # by pandas.
    #
    grouped_data = xls_sheet.groupby(['Block', 'Row'])
    for (block, row), group in grouped_data:

        # Ensure we have a consistent number of rows per well; each
        # row corresponds to one antibody intensity.
        #
        if num_ab is None:
            num_ab = len(group)
            logging.info("Found {0} antibody intensities".format(num_ab))
        elif len(group) != num_ab:
            msg = "Varying number of antibodies in Genepix file."
            msg += " Expected {0} but found {1}".format(num_ab, len(group))
            raise GenepixProcessingError(msg)

        # `group` is a DataFrame of shape, e.g. (20, 77), where 20 is
        # the number of antibodies and 77 are the various Genepix
        # readings.
        #
        intensities.append(group)

    return intensities


def process_ab_intensities_for_column(gpix_wells, flow_pattern,
    reference_ab, ab_channels, column_number, ab_names,
    channel_shifts=None,
    num_end_wells_ignored=None, inverted=False):
    """ Take per-well data for a single column of the device and
        extract the intensities for the Abs of interest, computing
        their "normalized" intensities, e.g. the mean minus median
        of the background for a particular channel.

        :param gpix_wells: List of DataFrame objects,
                           each with Genepix data for a single well.
        :type gpix_wells: list
        :param flow_pattern: Mapping from flow lines to an Ab name.
        :type flow_pattern: dictionary
        :param reference_ab: Name of reference Ab.
        :type reference_ab: string or unicode
        :param ab_channels: Mapping from Ab name to a fluorescence channel.
        :type ab_channels: dictionary or defaultdict
        :param column_number: Device column number used in the index.
        :type column_number: int
        :param ab_names: Ab names, in flow-pattern order.
        :type ab_names: list
        :param channel_shifts: Optional per-channel integer offsets that
                               correct constant image mis-alignment.
        :param num_end_wells_ignored: Number of wells to skip at each
                                      end of the column.
        :param inverted: Iterate the wells bottom-to-top when True.
        :returns: DataFrame indexed by (column_number, well index) with
                  one column per Ab plus an "error" flag column.
    """

    num_gpix_wells = len(gpix_wells)
    if inverted:
        well_iterator = reversed(gpix_wells)
    else:
        well_iterator = iter(gpix_wells)

    well_indices = []
    well_intensities = []

    for (i, well) in enumerate(well_iterator):

        logging.debug("\tWell {0}".format(i))

        # Skip the ignored wells at the top of the device.
        #
        if num_end_wells_ignored and i < num_end_wells_ignored:
            continue

        # Stop once we reach the ignored wells at the bottom of the
        # device.  BUG FIX: the original used `>` here, which skipped
        # one fewer well at the bottom than at the top; `>=` makes the
        # two ends symmetric.
        #
        if (num_end_wells_ignored
            and i >= num_gpix_wells - num_end_wells_ignored):
            break

        ab_intensities = \
            process_ab_intensities_for_well(well, reference_ab,
                ab_channels, flow_pattern,
                ab_names,
                channel_shifts=channel_shifts,
        )
        well_indices.append((column_number, i))
        well_intensities.append(ab_intensities)

    df = pandas.DataFrame(
        well_intensities,
        index=pandas.MultiIndex.from_tuples(well_indices),
        columns=ab_names + ["error"],
    )
    return df


def mean_minus_bg_channel_name(channel):
    """ Build the Genepix column name that holds the background-corrected
        signal for `channel`: the foreground mean fluorescence minus the
        median background, measured in a small border around the
        "target box" — e.g. "F488 Mean - B488".
    """
    template = "F{0} Mean - B{0}"
    return template.format(channel)


def process_ab_intensities_for_well(well, reference_ab, ab_channels,
    flow_pattern, ab_names, channel_shifts=None):
    """ Given data for a single well and the flow pattern, try to
        figure out the line indexes from which we should take each
        Ab's reading.

        :param well: Genepix rows for one well, one row per flow line.
        :type well: pandas.DataFrame
        :param reference_ab: Ab used to orient the flow pattern.
        :param ab_channels: Mapping from Ab name to fluorescence channel.
        :param flow_pattern: Mapping from Ab name to flow-line position.
        :param ab_names: Ab names whose intensities to extract.
        :param channel_shifts: Optional per-channel integer offsets
                               correcting constant image mis-alignment.
        :returns: list of one intensity per Ab followed by a boolean
                  "error" flag; on lookup failure every intensity is
                  None and the flag is True.

        NOTE(review): this mutates `well` in place by resetting its
        index — callers must not rely on the original Genepix row
        numbers afterwards.
    """
    # Find the row number (row in the pandas sense) that
    # corresponds to the reference Ab.  This is assumed
    # to be the row with the largest fluorescence for the
    # reference_ab's fluorescence channel.
    #
    channel_name = mean_minus_bg_channel_name(ab_channels[reference_ab])
    reference_ab_col_idx = numpy.argmax(well[channel_name])
    logging.debug("\t\t{0} at position {1}".format(
        reference_ab,
        reference_ab_col_idx
    ))

    # See if it is at the left or right of the well
    # (equivalently top or bottom of the `well` object here).
    # The reference Ab is ALWAYS ASSUMED to be on one side of the
    # flow pattern.  If it is not, we have no way to orient ourselves!
    #
    if reference_ab_col_idx < (len(well) - reference_ab_col_idx):
        direction = -1
        logging.debug("\t\tLeft to right")
    else:
        direction = 1
        logging.debug("\t\tRight to left")

    # Reset the index so that it is 0->20-ish instead
    # of the original Genepix data file row numbers.
    # Subsequent label lookups (`well[col][idx]`) depend on this.
    #
    well.index = range(len(well))

    ab_intensities = []
    had_exception = False
    for ab in ab_names:

        # How many positions is this Ab offset from the
        # reference_ab in the flow pattern?
        #
        delta = flow_pattern[ab] - flow_pattern[reference_ab]

        # Find the index in the data for this Ab.  This is
        # the reference_ab_col_idx plus this Ab's offset in
        # the flow pattern from the reference_ab.  We must
        # multiply this by the (-1 or +1) direction because
        # the flow pattern is inverted in some cases (our
        # "zig-zag" pattern).  Finally, sometimes the Genepix
        # software will mis-align images it takes of different
        # fluorescence patterns.  Such mis-alignments are a
        # constant factor per-channel, as specified in the
        # the channel_shifts dictionary which is a user-set
        # parameter in the configuration file.
        #
        idx = reference_ab_col_idx + (direction * delta)

        # Get the channel_shift for this channel, by default zero.
        #
        if channel_shifts:
            idx += channel_shifts.get(ab_channels[ab],  0)

        logging.debug("\t\t{0} at position {1}".format(ab, idx))

        # Get the fluorescence column name for the channel in
        # which this Ab fluorescences.
        #
        col = mean_minus_bg_channel_name(ab_channels[ab])

        # Extract the data
        #
        try:
            # Do not allow negative intensities, instead,
            # consider them to be zero, else they will
            # oddly affect statistics such as sum intensitites
            # for a row, etc.
            #
            this_ab_intensity = max(well[col][idx], 0.)
            ab_intensities.append(this_ab_intensity)
        except KeyError:
            # idx fell outside the well's rows (e.g. a channel shift
            # pushed it past the edge); flag the whole well as an error.
            had_exception = True
            break

    # On failure every Ab reading becomes None; the final list element
    # is the "error" flag consumed by the caller's DataFrame column.
    #
    if had_exception:
        ab_intensities = [None for ab in ab_names] + [True]
    else:
        ab_intensities.append(False)

    return ab_intensities


def extract_intensities(sheetdict, genepdict,
    num_columns, num_wells,
    flow_pattern, reference_ab, ab_channels,
    num_end_wells_ignored=None,
    channel_shifts=None,
    inverted=False):
    """ Compute the intensities for each well of each column on the
        device.  The intensities will typically have 20 values, one for
        each antibody.

        :param sheetdict: Mapping from col number to sheet names
        :type sheetdict: dict
        :param genepdict: Mapping from sheet name to sheet data as DataFrames
        :type genepdict: dict
        :param num_columns: Number of columns on the device
        :type num_columns: int
        :param num_wells: Number of wells in each column of the device
        :type num_wells: int
        :param flow_pattern: Mapping from flow lines to an Ab name.
        :type flow_pattern: dictionary
        :param reference_ab: Name of reference Ab.
        :type reference_ab: string or unicode
        :param ab_channels: Mapping from Ab name to a fluorescence channel.
        :type ab_channels: dictionary or defaultdict
        :param num_end_wells_ignored: Wells to skip at each column end.
        :param channel_shifts: Per-channel image mis-alignment offsets.
        :param inverted: Device orientation was rotated relative to the
                         Elements analysis.

        :returns: DataFrame of per-well intensities for all columns, or
                  None when `sheetdict` is empty.  (The original
                  docstring said None unconditionally, which was wrong.)
    """
    # Per-column DataFrames, concatenated at the end.  This replaces
    # the removed pandas DataFrame.append() idiom.
    #
    frames = []

    # Iterate over each column on the device and get a list of the
    # spreadsheet sheets.  dict.items() replaces the Python-2-only
    # iteritems().
    #
    for col_num, sheets in sheetdict.items():

        logging.info("Processing Genepix col {0}, sheets: {1}".format(
            col_num,
            ", ".join(sheets)
        ))
        this_col_intensities = None

        # Iterate over each sheet of the spreadsheet for this column
        # of the device.  Most will have a 1-to-1 mapping from column
        # to spreadsheet.  In cases where it is 1-to-N, the sheets must
        # be specified in the configuration file in the order they are
        # in the column: top-to-bottom.
        #
        for sheet in sheets:

            # Get the spreadsheet and extract the per-well intensities.
            #
            xls_sheet = genepdict[sheet]
            this_sheet_intensities = collect_intensity_per_row(
                xls_sheet,
            )

            # Append intensities for these wells to the other
            # intensities for this column if we have a 1-to-N column
            # to sheet mapping.
            #
            if this_col_intensities:
                this_col_intensities.extend(
                    this_sheet_intensities
                )
            else:
                this_col_intensities = this_sheet_intensities

        # Warn (deliberately not raise) if the data don't look right.
        #
        if len(this_col_intensities) != num_wells:
            msg = ("Wrong number of rows in Genepix "
                    "col {0}, got {1} expected {2}".format(
                        col_num, len(this_col_intensities), num_wells
            ))
            logging.warning(msg)

        # See if the columns are inverted.  That is, the orientation
        # of the device was rotated in the Genepix analysis relative
        # to the Elements analysis.
        #
        if inverted:
            real_col_num = num_columns - (col_num + 1)
        else:
            real_col_num = col_num

        logging.info("Genepix col {0} (real {1})".format(
            col_num,
            real_col_num
        ))

        # Ab names ordered by their flow-pattern position.
        #
        ab_names = sorted(flow_pattern, key=flow_pattern.get)

        col_intensities = process_ab_intensities_for_column(
            this_col_intensities, flow_pattern,
            reference_ab, ab_channels, real_col_num,
            ab_names,
            num_end_wells_ignored=num_end_wells_ignored,
            inverted=inverted,
            channel_shifts=channel_shifts,
        )
        frames.append(col_intensities)

    if not frames:
        return None
    return pandas.concat(frames)


def join_counts_and_intensities(cell_counts, intensities):
    """ Attach per-well cell counts to the intensity table.

        Builds a cell-count DataFrame indexed identically to
        `intensities` and joins it on, returning the combined frame.
    """
    counts = counts_to_dataframe(cell_counts, intensities.index)
    return intensities.join(counts)


def add_sum_nonref_column(intensities, reference_ab,
    ab_channels, col_name="sum_nonref"):
    """ Add a column named `col_name` to the intensities DataFrame
        holding, for each well, the sum of the intensities of every
        antibody except the reference_ab.  Mutates `intensities`
        in place.
    """
    known_abs = set(ab_channels)
    selected = []
    for column in intensities.columns:
        if column in known_abs and column != reference_ab:
            selected.append(column)
    intensities[col_name] = intensities.filter(selected).apply(sum, axis=1)

# Per-column alignment diagnostics returned by align_all():
#   alignment          -- the raw pairwise alignment tuple
#   aln_score_per_well -- alignment score normalized by overlap length
#   percent_populated  -- fraction of wells containing cells
#   imin/imax          -- min/max well index on the intensity side
#   cmin/cmax          -- min/max well index on the cell-count side
#
AlignmentDetails = namedtuple(
    "AlignmentDetails",
    "alignment aln_score_per_well percent_populated  imin cmin cmax imax"
)


def align_all(intensities, cell_counts, reference_ab,
    ab_channels, num_columns):
    """ Compute optimal alignments of thresholded intensities against
        cell counts for every column on the device.

        :param intensities: DataFrame indexed by (col, well); a
                            "sum_nonref" column is added in place.
        :param cell_counts: Mapping from col number to per-well counts.
        :param reference_ab: Name of reference Ab.
        :param ab_channels: Mapping from Ab name to fluorescence channel.
        :param num_columns: Number of columns on the device.
        :returns: (mappings, alignment_details, warnings), where
                  mappings[col] is an intensity-row -> cell-row dict or
                  None when the column could not be aligned
                  automatically.
    """
    add_sum_nonref_column(intensities, reference_ab, ab_channels)

    mappings = {}
    alignment_details = {}
    warnings = []
    for col_num in range(num_columns):
        logging.info("Getting mapping for col {0}".format(col_num))
        had_warning = False

        (aln, pp,  imin, cmin, cmax, imax) \
            = align_column(intensities, cell_counts, col_num)

        # Check for too many or too few cells in this column's wells.
        #
        if not (0.10 < pp < 0.90):
            msg = ("Col {0} had a populated well frequency of {1} "
                    "(it should be > 0.1 and < 0.9) "
                    "and was therefore not aligned automatically."
            ).format(col_num, pp)
            warnings.append(msg)
            had_warning = True

        # Check for a low-scoring alignment, in which case we should
        # just keep the default alignment.  Guard the divisor: a
        # single-well overlap on either side would otherwise divide
        # by zero.
        #
        span = min(imax - imin, cmax - cmin)
        aln_score_per_well = float(aln[2]) / max(span, 1)
        if aln_score_per_well <= 1.6:
            # BUG FIX: the message previously formatted `pp` here even
            # though it reports aln_score_per_well.
            msg = ("Col {0} had aln_score_per_well of {1} "
                    "(it should be > 1.6) "
                    "and was therefore not aligned automatically."
            ).format(col_num, aln_score_per_well)
            warnings.append(msg)
            had_warning = True

        # In the absence of warnings, make a new intensity-to-cell count
        # row mapping.
        #
        if had_warning:
            mapping = None
        else:
            istr = aln[0]
            cstr = aln[1]
            mapping = create_mapping_from_alignment(
                istr, cstr,
                imin, cmin,
                cmax, imax,
                extend_counts=True,
                ipy=False,
            )
        mappings[col_num] = mapping
        alignment_details[col_num] = AlignmentDetails(
            aln,
            aln_score_per_well,
            pp,
            imin,
            cmin,
            cmax,
            imax
        )

    return (mappings, alignment_details, warnings)


def align_column(intensities, cell_counts, col_num):
    """ Align a single column of cell counts against a single
        column of intensities.

        :param intensities: DataFrame indexed by (col, well) with a
                            "sum_nonref" column.
        :param cell_counts: Mapping from col number to per-well counts.
        :param col_num: Column to align.
        :returns: (aln, percent_populated, imin, cmin, cmax, imax).
    """
    # Select this column's rows once.  .loc replaces the long-removed
    # pandas .ix accessor.
    #
    col_intensities = intensities.loc[col_num]
    col_sums = intensities['sum_nonref'].loc[col_num]
    col_counts = cell_counts[col_num]

    # Find min and max indexes of the wells in this particular column.
    # E.g. we probably threw out the first five wells, because
    # intensities read from them.
    #
    imin = min(col_intensities.index)
    imax = max(col_intensities.index)

    # Find the percent of wells that have cells.
    #
    percent_populated = float(
        len([v for v in col_counts.values() if v > 0])) \
        / (max(col_counts.keys()) + 1)

    #  If 3% of wells have cells, then we expect the top 3% of
    #  intensities to correspond to these wells.
    #
    threshold_idx = int(percent_populated * len(col_sums))

    # Handle case in which 100% of the wells are populated.
    #
    threshold_idx = min(threshold_idx, len(col_sums) - 1)

    # Find the threshold intensity, which is the ith highest
    # intensity, where i = threshold_idx.
    #
    ithreshold = sorted(col_sums, reverse=True)[threshold_idx]

    # Binary string of which wells look "bright".
    #
    thresholded_intensities = ['0'] * (imax - imin + 1)
    for wellno, well in col_intensities.iterrows():
        if well['sum_nonref'] > ithreshold:
            thresholded_intensities[wellno - imin] = '1'
    istr = ''.join(thresholded_intensities)

    # Binary string of which wells contain cells.
    #
    cmin = min(col_counts.keys())
    cmax = max(col_counts.keys())
    cstr = ''.join([
        ('1' if col_counts[i] > 0 else '0')
        for i in range(
            cmin,
            cmax + 1
        )
    ])
    aln = align.globalms(
            istr,
            cstr,
            2,
            -1,
            -10,
            -1,
            penalize_end_gaps=False,
            one_alignment_only=True
    )[0]
    return (aln, percent_populated, imin, cmin, cmax, imax)


def create_mapping_from_alignment(istr, cstr,
    imin, cmin, cmax, imax, gapchar='-',
    extend_counts=True, ipy=False):
    """ Takes a string alignment of the thresholded intensities
        and cell counts.  Returns a mapping from intensity row
        index to cell row index.

        If `extend_counts` is True, then we will not terminate
        the mapping at cmax.  This is because the elements
        software only outputs the locations of cells and does
        not output something like "ROI #300 has zero cells".
        Therefore, the highest ROI number for which we have data
        may be substantially lower than the number of wells.

        :param istr: Aligned thresholded-intensity string.
        :param cstr: Aligned cell-count string (same length as istr).
        :param imin/imax: First/last intensity well index.
        :param cmin/cmax: First/last cell-count well index.
        :param gapchar: Character marking a gap in the alignment.
        :param ipy: Drop into an IPython shell before returning (debug).
        :returns: dict mapping intensity well index -> cell well index.
        :raises ValueError: if the two alignment strings differ in
                            length.
    """

    # Ensure the alignments are of the same size.  ValueError is still
    # an Exception subclass, so pre-existing broad handlers keep
    # working.  (Also fixes the "Itensity" typo in the message.)
    #
    if len(istr) != len(cstr):
        msg = "Intensity and cell count alignments must be same length!"
        raise ValueError(msg)
    alignment_length = len(istr)

    # The mapping is a dictionary that translates the intensity
    # row number to a corresponding cell count well number.
    #
    mapping = {}

    # Keep track of how many non-gap characters we've seen
    # of each the intensities and the cell counts.
    #
    i_chars_seen = 0
    c_chars_seen = 0
    c_past_end = False
    c_num_past_end = 0

    # Iterate over the whole alignment.
    #
    for j in range(alignment_length):

        # Break out if we're at the end of the intensities; past the
        # end of the counts we either break or (with extend_counts)
        # keep synthesizing count indexes.
        #
        if (i_chars_seen + imin > imax):
            logging.info("\tBroke out due to imax")
            break
        if (c_chars_seen + cmin > cmax):
            c_past_end = True
            if not extend_counts:
                logging.info("\tBroke out due to cmax")
                break

        # If two non-gap characters, make a mapping.
        #
        if istr[j] != gapchar:
            if cstr[j] != gapchar or (c_past_end and extend_counts):
                mapping[i_chars_seen + imin] \
                    = c_chars_seen + cmin + c_num_past_end

        # If either one is non-gap, increment number
        # of characters seen.
        #
        if istr[j] != gapchar:
            i_chars_seen += 1
        if cstr[j] != gapchar:
            c_chars_seen += 1

        # If we're past the end of the cell counts, c_chars_seen
        # will not be incremented but instead we'll increment
        # c_num_past_end.
        #
        if c_past_end and extend_counts:
            c_num_past_end += 1

    if ipy:
        import IPython
        IPython.embed()

    return mapping


def find_high_zero_cell(data, antibodies, cell_col, percentile=75):
    """ Takes a dataframe with intensity data and a list of antibodies.
        Adds a column called "high_empty_well_abs".  For each well with
        zero cells, this function lists those antibodies whose
        intensity exceeds the `percentile`-th percentile of intensities
        among wells with exactly one cell.  That is, we're finding
        wells with zero cells that have suspiciously high intensities.

        :param data: Intensity DataFrame; mutated in place.
        :param antibodies: Antibody column names to examine.
        :param cell_col: Name of the cell-count column.
        :param percentile: Percentile (0-100) defining "suspiciously
                           high".  BUG FIX: previously the literal 75
                           was hard-coded here, silently ignoring this
                           parameter.
    """
    thresholds = dict(
        (ab, numpy.percentile(data[data[cell_col] == 1][ab], percentile))
        for ab in antibodies
    )

    def label_suspicious_for_row(x):
        # Comma-separated names of over-threshold Abs for zero-cell rows.
        suspicious = [
            ab for ab in antibodies
            if x[cell_col] == 0 and x[ab] > thresholds[ab]
        ]
        return ", ".join(suspicious)

    # apply() returns a pandas.Series aligned with `data`.
    #
    suspicious = data.apply(label_suspicious_for_row, axis=1)
    data['high_empty_well_abs'] = suspicious
Back to Top