kmj_lincs /lincs/analysis/genepix.py

Language Python Lines 638
MD5 Hash 4464310b9551d5cb5cc8d6424cc5601a Estimated Cost $8,006 (why?)
Repository https://bitbucket.org/kljensen/kmj_lincs.git View Raw File View Project SPDX
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
import numpy
import pandas
import logging
from collections import namedtuple
from .elements import counts_to_dataframe
from .pairwise2 import align


class GenepixProcessingError(Exception):
    """Raised when Genepix spreadsheet data cannot be processed
    (e.g. inconsistent antibody counts between wells).
    """


def align_intensities_to_cells(gcol, ccol):
    """ Align Genepix intensity readings to cell counts in each ROI.

        :param gcol: Per-well intensity readings for one device column.
        :type gcol: pandas.DataFrame
        :param ccol: Cell counts per well index for the same column.
        :type ccol: collections.Counter
        :returns: local alignment(s) of the thresholded-intensity string
                  against the has-cells string, as returned by
                  ``align.localms``.
    """

    # Total intensity for each well (row) of this device column.
    #
    sumi = [x.sum() for _, x in gcol.iterrows()]

    # Number of wells in this column that have cells in them.
    #
    num_populated_wells = len(ccol.keys())

    # Assuming only wells that have cells should have intensity,
    # what is our threshold_intensity?  That is, if 30% of the
    # wells have cells, our threshold_intensity should be the
    # 30th percentile (from the top) of intensities.
    #
    # Clamp the rank so a fully-populated column (as many populated
    # wells as intensity readings) cannot index past the end of the
    # sorted list.
    #
    rank = min(num_populated_wells, len(sumi) - 1)
    threshold_intensity = sorted(sumi, reverse=True)[rank]

    # Binary strings: which wells look "bright" vs which wells have cells.
    #
    ibool = [
        (1 if x >= threshold_intensity else 0)
        for x in sumi
    ]
    cbool = [
        (1 if ccol[i] > 0 else 0)
        for i in range(max(ccol.keys()) + 1)
    ]
    sjoin = lambda x: "".join(str(y) for y in x)
    alignment = align.localms(
                    sjoin(ibool),
                    sjoin(cbool),
                    2, -1, -4, -0.25,
                    penalize_end_gaps=False,
                    one_alignment_only=True
    )

    return alignment


def collect_intensity_per_row(xls_sheet):
    """ Collect a DataFrame of intensities for each well for which
        there is data in this xls_sheet.

        :param xls_sheet: Genepix data with 'Block' and 'Row' columns.
        :type xls_sheet: pandas.DataFrame
        :returns: list of DataFrame objects, one per (Block, Row) well.
        :raises GenepixProcessingError: if wells disagree on the number
                                        of antibody rows.
    """
    num_ab = None
    intensities = []

    # Group by both key columns.  A list of labels (not a tuple) is
    # required: tuple group keys were deprecated and later removed
    # by pandas.
    #
    grouped_data = xls_sheet.groupby(['Block', 'Row'])
    for (block, row), group in grouped_data:

        # Ensure we have a consistent number of rows per well; each
        # row corresponds to one antibody intensity.
        #
        if num_ab is None:
            num_ab = len(group)
            logging.info("Found {0} antibody intensities".format(num_ab))
        elif len(group) != num_ab:
            msg = "Varying number of antibodies in Genepix file."
            msg += " Expected {0} but found {1}".format(num_ab, len(group))
            raise GenepixProcessingError(msg)

        # `group` is a DataFrame of shape, e.g. (20, 77), where 20 is
        # the number of antibodies and 77 are the various Genepix
        # readings.
        #
        intensities.append(group)

    return intensities


def process_ab_intensities_for_column(gpix_wells, flow_pattern,
    reference_ab, ab_channels, column_number, ab_names,
    channel_shifts=None,
    num_end_wells_ignored=None, inverted=False):
    """ Take per-well data for a single column of the device and
        extract the intensities for the Abs of interest, computing
        their "normalized" intensities, e.g. the mean minus median
        of the background for a particular channel.

        :param gpix_wells: List of DataFrame objects,
                           each with Genepix data for a single well.
        :type gpix_wells: list
        :param flow_pattern: Mapping from flow lines to an Ab name.
        :type flow_pattern: dictionary
        :param reference_ab: Name of reference Ab.
        :type reference_ab: string or unicode
        :param ab_channels: Mapping from Ab name to a fluorescence channel.
        :type ab_channels: dictionary or defaultdict
        :param column_number: Device column number used in the index.
        :type column_number: int
        :param ab_names: Ab names, in flow-pattern order.
        :type ab_names: list
        :param channel_shifts: Optional per-channel integer offsets that
                               correct constant image mis-alignment.
        :param num_end_wells_ignored: Number of wells to skip at each
                                      end of the column.
        :param inverted: Iterate the wells bottom-to-top when True.
        :returns: DataFrame indexed by (column_number, well index) with
                  one column per Ab plus an "error" flag column.
    """

    num_gpix_wells = len(gpix_wells)
    if inverted:
        well_iterator = reversed(gpix_wells)
    else:
        well_iterator = iter(gpix_wells)

    well_indices = []
    well_intensities = []

    for (i, well) in enumerate(well_iterator):

        logging.debug("\tWell {0}".format(i))

        # Skip the ignored wells at the top of the device.
        #
        if num_end_wells_ignored and i < num_end_wells_ignored:
            continue

        # Stop once we reach the ignored wells at the bottom of the
        # device.  BUG FIX: the original used `>` here, which skipped
        # one fewer well at the bottom than at the top; `>=` makes the
        # two ends symmetric.
        #
        if (num_end_wells_ignored
            and i >= num_gpix_wells - num_end_wells_ignored):
            break

        ab_intensities = \
            process_ab_intensities_for_well(well, reference_ab,
                ab_channels, flow_pattern,
                ab_names,
                channel_shifts=channel_shifts,
        )
        well_indices.append((column_number, i))
        well_intensities.append(ab_intensities)

    df = pandas.DataFrame(
        well_intensities,
        index=pandas.MultiIndex.from_tuples(well_indices),
        columns=ab_names + ["error"],
    )
    return df


def mean_minus_bg_channel_name(channel):
    """ Build the Genepix column name that holds the background-corrected
        signal for `channel`: the foreground mean fluorescence minus the
        median background, measured in a small border around the
        "target box" — e.g. "F488 Mean - B488".
    """
    template = "F{0} Mean - B{0}"
    return template.format(channel)


def process_ab_intensities_for_well(well, reference_ab, ab_channels,
    flow_pattern, ab_names, channel_shifts=None):
    """ Given data for a single well and the flow pattern, try to
        figure out the line indexes from which we should take each
        Ab's reading.

        :param well: Genepix rows for one well, one row per flow line.
        :type well: pandas.DataFrame
        :param reference_ab: Ab used to orient the flow pattern.
        :param ab_channels: Mapping from Ab name to fluorescence channel.
        :param flow_pattern: Mapping from Ab name to flow-line position.
        :param ab_names: Ab names whose intensities to extract.
        :param channel_shifts: Optional per-channel integer offsets
                               correcting constant image mis-alignment.
        :returns: list of one intensity per Ab followed by a boolean
                  "error" flag; on lookup failure every intensity is
                  None and the flag is True.

        NOTE(review): this mutates `well` in place by resetting its
        index — callers must not rely on the original Genepix row
        numbers afterwards.
    """
    # Find the row number (row in the pandas sense) that
    # corresponds to the reference Ab.  This is assumed
    # to be the row with the largest fluorescence for the
    # reference_ab's fluorescence channel.
    #
    channel_name = mean_minus_bg_channel_name(ab_channels[reference_ab])
    reference_ab_col_idx = numpy.argmax(well[channel_name])
    logging.debug("\t\t{0} at position {1}".format(
        reference_ab,
        reference_ab_col_idx
    ))

    # See if it is at the left or right of the well
    # (equivalently top or bottom of the `well` object here).
    # The reference Ab is ALWAYS ASSUMED to be on one side of the
    # flow pattern.  If it is not, we have no way to orient ourselves!
    #
    if reference_ab_col_idx < (len(well) - reference_ab_col_idx):
        direction = -1
        logging.debug("\t\tLeft to right")
    else:
        direction = 1
        logging.debug("\t\tRight to left")

    # Reset the index so that it is 0->20-ish instead
    # of the original Genepix data file row numbers.
    # Subsequent label lookups (`well[col][idx]`) depend on this.
    #
    well.index = range(len(well))

    ab_intensities = []
    had_exception = False
    for ab in ab_names:

        # How many positions is this Ab offset from the
        # reference_ab in the flow pattern?
        #
        delta = flow_pattern[ab] - flow_pattern[reference_ab]

        # Find the index in the data for this Ab.  This is
        # the reference_ab_col_idx plus this Ab's offset in
        # the flow pattern from the reference_ab.  We must
        # multiply this by the (-1 or +1) direction because
        # the flow pattern is inverted in some cases (our
        # "zig-zag" pattern).  Finally, sometimes the Genepix
        # software will mis-align images it takes of different
        # fluorescence patterns.  Such mis-alignments are a
        # constant factor per-channel, as specified in the
        # the channel_shifts dictionary which is a user-set
        # parameter in the configuration file.
        #
        idx = reference_ab_col_idx + (direction * delta)

        # Get the channel_shift for this channel, by default zero.
        #
        if channel_shifts:
            idx += channel_shifts.get(ab_channels[ab],  0)

        logging.debug("\t\t{0} at position {1}".format(ab, idx))

        # Get the fluorescence column name for the channel in
        # which this Ab fluorescences.
        #
        col = mean_minus_bg_channel_name(ab_channels[ab])

        # Extract the data
        #
        try:
            # Do not allow negative intensities, instead,
            # consider them to be zero, else they will
            # oddly affect statistics such as sum intensitites
            # for a row, etc.
            #
            this_ab_intensity = max(well[col][idx], 0.)
            ab_intensities.append(this_ab_intensity)
        except KeyError:
            # idx fell outside the well's rows (e.g. a channel shift
            # pushed it past the edge); flag the whole well as an error.
            had_exception = True
            break

    # On failure every Ab reading becomes None; the final list element
    # is the "error" flag consumed by the caller's DataFrame column.
    #
    if had_exception:
        ab_intensities = [None for ab in ab_names] + [True]
    else:
        ab_intensities.append(False)

    return ab_intensities


def extract_intensities(sheetdict, genepdict,
    num_columns, num_wells,
    flow_pattern, reference_ab, ab_channels,
    num_end_wells_ignored=None,
    channel_shifts=None,
    inverted=False):
    """ Compute the intensities for each well of each column on the
        device.  The intensities will typically have 20 values, one for
        each antibody.

        :param sheetdict: Mapping from col number to sheet names
        :type sheetdict: dict
        :param genepdict: Mapping from sheet name to sheet data as DataFrames
        :type genepdict: dict
        :param num_columns: Number of columns on the device
        :type num_columns: int
        :param num_wells: Number of wells in each column of the device
        :type num_wells: int
        :param flow_pattern: Mapping from flow lines to an Ab name.
        :type flow_pattern: dictionary
        :param reference_ab: Name of reference Ab.
        :type reference_ab: string or unicode
        :param ab_channels: Mapping from Ab name to a fluorescence channel.
        :type ab_channels: dictionary or defaultdict
        :param num_end_wells_ignored: Wells to skip at each column end.
        :param channel_shifts: Per-channel image mis-alignment offsets.
        :param inverted: Device orientation was rotated relative to the
                         Elements analysis.

        :returns: DataFrame of per-well intensities for all columns, or
                  None when `sheetdict` is empty.  (The original
                  docstring said None unconditionally, which was wrong.)
    """
    # Per-column DataFrames, concatenated at the end.  This replaces
    # the removed pandas DataFrame.append() idiom.
    #
    frames = []

    # Iterate over each column on the device and get a list of the
    # spreadsheet sheets.  dict.items() replaces the Python-2-only
    # iteritems().
    #
    for col_num, sheets in sheetdict.items():

        logging.info("Processing Genepix col {0}, sheets: {1}".format(
            col_num,
            ", ".join(sheets)
        ))
        this_col_intensities = None

        # Iterate over each sheet of the spreadsheet for this column
        # of the device.  Most will have a 1-to-1 mapping from column
        # to spreadsheet.  In cases where it is 1-to-N, the sheets must
        # be specified in the configuration file in the order they are
        # in the column: top-to-bottom.
        #
        for sheet in sheets:

            # Get the spreadsheet and extract the per-well intensities.
            #
            xls_sheet = genepdict[sheet]
            this_sheet_intensities = collect_intensity_per_row(
                xls_sheet,
            )

            # Append intensities for these wells to the other
            # intensities for this column if we have a 1-to-N column
            # to sheet mapping.
            #
            if this_col_intensities:
                this_col_intensities.extend(
                    this_sheet_intensities
                )
            else:
                this_col_intensities = this_sheet_intensities

        # Warn (deliberately not raise) if the data don't look right.
        #
        if len(this_col_intensities) != num_wells:
            msg = ("Wrong number of rows in Genepix "
                    "col {0}, got {1} expected {2}".format(
                        col_num, len(this_col_intensities), num_wells
            ))
            logging.warning(msg)

        # See if the columns are inverted.  That is, the orientation
        # of the device was rotated in the Genepix analysis relative
        # to the Elements analysis.
        #
        if inverted:
            real_col_num = num_columns - (col_num + 1)
        else:
            real_col_num = col_num

        logging.info("Genepix col {0} (real {1})".format(
            col_num,
            real_col_num
        ))

        # Ab names ordered by their flow-pattern position.
        #
        ab_names = sorted(flow_pattern, key=flow_pattern.get)

        col_intensities = process_ab_intensities_for_column(
            this_col_intensities, flow_pattern,
            reference_ab, ab_channels, real_col_num,
            ab_names,
            num_end_wells_ignored=num_end_wells_ignored,
            inverted=inverted,
            channel_shifts=channel_shifts,
        )
        frames.append(col_intensities)

    if not frames:
        return None
    return pandas.concat(frames)


def join_counts_and_intensities(cell_counts, intensities):
    """ Attach per-well cell counts to the intensity table.

        Builds a cell-count DataFrame indexed identically to
        `intensities` and joins it on, returning the combined frame.
    """
    counts = counts_to_dataframe(cell_counts, intensities.index)
    return intensities.join(counts)


def add_sum_nonref_column(intensities, reference_ab,
    ab_channels, col_name="sum_nonref"):
    """ Add a column named `col_name` to the intensities DataFrame
        holding, for each well, the sum of the intensities of every
        antibody except the reference_ab.  Mutates `intensities`
        in place.
    """
    known_abs = set(ab_channels)
    selected = []
    for column in intensities.columns:
        if column in known_abs and column != reference_ab:
            selected.append(column)
    intensities[col_name] = intensities.filter(selected).apply(sum, axis=1)

# Per-column alignment diagnostics returned by align_all():
#   alignment          -- the raw pairwise alignment tuple
#   aln_score_per_well -- alignment score normalized by overlap length
#   percent_populated  -- fraction of wells containing cells
#   imin/imax          -- min/max well index on the intensity side
#   cmin/cmax          -- min/max well index on the cell-count side
#
AlignmentDetails = namedtuple(
    "AlignmentDetails",
    "alignment aln_score_per_well percent_populated  imin cmin cmax imax"
)


def align_all(intensities, cell_counts, reference_ab,
    ab_channels, num_columns):
    """ Compute optimal alignments of thresholded intensities against
        cell counts for every column on the device.

        :param intensities: DataFrame indexed by (col, well); a
                            "sum_nonref" column is added in place.
        :param cell_counts: Mapping from col number to per-well counts.
        :param reference_ab: Name of reference Ab.
        :param ab_channels: Mapping from Ab name to fluorescence channel.
        :param num_columns: Number of columns on the device.
        :returns: (mappings, alignment_details, warnings), where
                  mappings[col] is an intensity-row -> cell-row dict or
                  None when the column could not be aligned
                  automatically.
    """
    add_sum_nonref_column(intensities, reference_ab, ab_channels)

    mappings = {}
    alignment_details = {}
    warnings = []
    for col_num in range(num_columns):
        logging.info("Getting mapping for col {0}".format(col_num))
        had_warning = False

        (aln, pp,  imin, cmin, cmax, imax) \
            = align_column(intensities, cell_counts, col_num)

        # Check for too many or too few cells in this column's wells.
        #
        if not (0.10 < pp < 0.90):
            msg = ("Col {0} had a populated well frequency of {1} "
                    "(it should be > 0.1 and < 0.9) "
                    "and was therefore not aligned automatically."
            ).format(col_num, pp)
            warnings.append(msg)
            had_warning = True

        # Check for a low-scoring alignment, in which case we should
        # just keep the default alignment.  Guard the divisor: a
        # single-well overlap on either side would otherwise divide
        # by zero.
        #
        span = min(imax - imin, cmax - cmin)
        aln_score_per_well = float(aln[2]) / max(span, 1)
        if aln_score_per_well <= 1.6:
            # BUG FIX: the message previously formatted `pp` here even
            # though it reports aln_score_per_well.
            msg = ("Col {0} had aln_score_per_well of {1} "
                    "(it should be > 1.6) "
                    "and was therefore not aligned automatically."
            ).format(col_num, aln_score_per_well)
            warnings.append(msg)
            had_warning = True

        # In the absence of warnings, make a new intensity-to-cell count
        # row mapping.
        #
        if had_warning:
            mapping = None
        else:
            istr = aln[0]
            cstr = aln[1]
            mapping = create_mapping_from_alignment(
                istr, cstr,
                imin, cmin,
                cmax, imax,
                extend_counts=True,
                ipy=False,
            )
        mappings[col_num] = mapping
        alignment_details[col_num] = AlignmentDetails(
            aln,
            aln_score_per_well,
            pp,
            imin,
            cmin,
            cmax,
            imax
        )

    return (mappings, alignment_details, warnings)


def align_column(intensities, cell_counts, col_num):
    """ Align a single column of cell counts against a single
        column of intensities.

        :param intensities: DataFrame indexed by (col, well) with a
                            "sum_nonref" column.
        :param cell_counts: Mapping from col number to per-well counts.
        :param col_num: Column to align.
        :returns: (aln, percent_populated, imin, cmin, cmax, imax).
    """
    # Select this column's rows once.  .loc replaces the long-removed
    # pandas .ix accessor.
    #
    col_intensities = intensities.loc[col_num]
    col_sums = intensities['sum_nonref'].loc[col_num]
    col_counts = cell_counts[col_num]

    # Find min and max indexes of the wells in this particular column.
    # E.g. we probably threw out the first five wells, because
    # intensities read from them.
    #
    imin = min(col_intensities.index)
    imax = max(col_intensities.index)

    # Find the percent of wells that have cells.
    #
    percent_populated = float(
        len([v for v in col_counts.values() if v > 0])) \
        / (max(col_counts.keys()) + 1)

    #  If 3% of wells have cells, then we expect the top 3% of
    #  intensities to correspond to these wells.
    #
    threshold_idx = int(percent_populated * len(col_sums))

    # Handle case in which 100% of the wells are populated.
    #
    threshold_idx = min(threshold_idx, len(col_sums) - 1)

    # Find the threshold intensity, which is the ith highest
    # intensity, where i = threshold_idx.
    #
    ithreshold = sorted(col_sums, reverse=True)[threshold_idx]

    # Binary string of which wells look "bright".
    #
    thresholded_intensities = ['0'] * (imax - imin + 1)
    for wellno, well in col_intensities.iterrows():
        if well['sum_nonref'] > ithreshold:
            thresholded_intensities[wellno - imin] = '1'
    istr = ''.join(thresholded_intensities)

    # Binary string of which wells contain cells.
    #
    cmin = min(col_counts.keys())
    cmax = max(col_counts.keys())
    cstr = ''.join([
        ('1' if col_counts[i] > 0 else '0')
        for i in range(
            cmin,
            cmax + 1
        )
    ])
    aln = align.globalms(
            istr,
            cstr,
            2,
            -1,
            -10,
            -1,
            penalize_end_gaps=False,
            one_alignment_only=True
    )[0]
    return (aln, percent_populated, imin, cmin, cmax, imax)


def create_mapping_from_alignment(istr, cstr,
    imin, cmin, cmax, imax, gapchar='-',
    extend_counts=True, ipy=False):
    """ Takes a string alignment of the thresholded intensities
        and cell counts.  Returns a mapping from intensity row
        index to cell row index.

        If `extend_counts` is True, then we will not terminate
        the mapping at cmax.  This is because the elements
        software only outputs the locations of cells and does
        not output something like "ROI #300 has zero cells".
        Therefore, the highest ROI number for which we have data
        may be substantially lower than the number of wells.

        :param istr: Aligned thresholded-intensity string.
        :param cstr: Aligned cell-count string (same length as istr).
        :param imin/imax: First/last intensity well index.
        :param cmin/cmax: First/last cell-count well index.
        :param gapchar: Character marking a gap in the alignment.
        :param ipy: Drop into an IPython shell before returning (debug).
        :returns: dict mapping intensity well index -> cell well index.
        :raises ValueError: if the two alignment strings differ in
                            length.
    """

    # Ensure the alignments are of the same size.  ValueError is still
    # an Exception subclass, so pre-existing broad handlers keep
    # working.  (Also fixes the "Itensity" typo in the message.)
    #
    if len(istr) != len(cstr):
        msg = "Intensity and cell count alignments must be same length!"
        raise ValueError(msg)
    alignment_length = len(istr)

    # The mapping is a dictionary that translates the intensity
    # row number to a corresponding cell count well number.
    #
    mapping = {}

    # Keep track of how many non-gap characters we've seen
    # of each the intensities and the cell counts.
    #
    i_chars_seen = 0
    c_chars_seen = 0
    c_past_end = False
    c_num_past_end = 0

    # Iterate over the whole alignment.
    #
    for j in range(alignment_length):

        # Break out if we're at the end of the intensities; past the
        # end of the counts we either break or (with extend_counts)
        # keep synthesizing count indexes.
        #
        if (i_chars_seen + imin > imax):
            logging.info("\tBroke out due to imax")
            break
        if (c_chars_seen + cmin > cmax):
            c_past_end = True
            if not extend_counts:
                logging.info("\tBroke out due to cmax")
                break

        # If two non-gap characters, make a mapping.
        #
        if istr[j] != gapchar:
            if cstr[j] != gapchar or (c_past_end and extend_counts):
                mapping[i_chars_seen + imin] \
                    = c_chars_seen + cmin + c_num_past_end

        # If either one is non-gap, increment number
        # of characters seen.
        #
        if istr[j] != gapchar:
            i_chars_seen += 1
        if cstr[j] != gapchar:
            c_chars_seen += 1

        # If we're past the end of the cell counts, c_chars_seen
        # will not be incremented but instead we'll increment
        # c_num_past_end.
        #
        if c_past_end and extend_counts:
            c_num_past_end += 1

    if ipy:
        import IPython
        IPython.embed()

    return mapping


def find_high_zero_cell(data, antibodies, cell_col, percentile=75):
    """ Takes a dataframe with intensity data and a list of antibodies.
        Adds a column called "high_empty_well_abs".  For each well with
        zero cells, this function lists those antibodies whose
        intensity exceeds the `percentile`-th percentile of intensities
        among wells with exactly one cell.  That is, we're finding
        wells with zero cells that have suspiciously high intensities.

        :param data: Intensity DataFrame; mutated in place.
        :param antibodies: Antibody column names to examine.
        :param cell_col: Name of the cell-count column.
        :param percentile: Percentile (0-100) defining "suspiciously
                           high".  BUG FIX: previously the literal 75
                           was hard-coded here, silently ignoring this
                           parameter.
    """
    thresholds = dict(
        (ab, numpy.percentile(data[data[cell_col] == 1][ab], percentile))
        for ab in antibodies
    )

    def label_suspicious_for_row(x):
        # Comma-separated names of over-threshold Abs for zero-cell rows.
        suspicious = [
            ab for ab in antibodies
            if x[cell_col] == 0 and x[ab] > thresholds[ab]
        ]
        return ", ".join(suspicious)

    # apply() returns a pandas.Series aligned with `data`.
    #
    suspicious = data.apply(label_suspicious_for_row, axis=1)
    data['high_empty_well_abs'] = suspicious
Back to Top