config.py - Check for missing sheets

/lincs/io/config.py

https://bitbucket.org/kljensen/kmj_lincs · Python · 157 lines · 148 code · 4 blank · 5 comment · 2 complexity · f5ec6890db0c8e202e48b0105400db7b MD5 · raw file

import yaml
import logging
import itertools
import pandas
from collections import defaultdict

# Top-level config keys that describe an input workbook; each entry is
# read for its 'path' and its column -> sheets mapping.
REQUIRED_FILES = (
    'elements_file',
    'genepix_file',
)

# Top-level config keys that must always be present.  Names starting
# with 'num_' are additionally required to be ints by check_config.
REQUIRED_FIELDS = ('experiment_name', 'num_columns', 'num_wells')


class ConfigParseError(Exception):
    """ Raised when the experiment config is missing a required value
        or contains a value that cannot be used.
    """


def check_config(config):
    """ Check to make sure all config values
        are present and in the correct format.

        Every name in REQUIRED_FIELDS and REQUIRED_FILES must be a key
        of `config`, every required field whose name starts with
        'num_' must be an int, and each required file must map every
        column index in range(config['num_columns']) to sheets that
        exist in its workbook.

        Default sheets are filled in (in place) by fill_default_sheets
        before the column check.  Returns None.

        Raises ConfigParseError on the first problem found.
    """
    for field in REQUIRED_FIELDS + REQUIRED_FILES:
        if field not in config:
            raise ConfigParseError("Config is missing {0} field".format(field))

    for field in REQUIRED_FIELDS:
        if not field.startswith('num_'):
            continue
        value = config[field]
        # bool is a subclass of int, so a YAML `true`/`yes` would
        # otherwise be accepted as a count.
        if isinstance(value, bool) or not isinstance(value, int):
            raise ConfigParseError("Config {0} should be an int".format(field))

    config = fill_default_sheets(config)
    required_cols = set(range(config['num_columns']))
    for file_type in REQUIRED_FILES:
        sheets = config[file_type].get("sheets")
        if sheets is None:
            # Neither an explicit mapping nor default_sheets was given;
            # report it as a config problem rather than a KeyError.
            raise ConfigParseError(
                "{0} is missing its sheets specification".format(file_type)
            )

        missing_cols = required_cols - set(sheets.keys())
        if missing_cols:
            msg = "{0} is missing col specifications for cols: {1}".format(
                file_type,
                missing_cols
            )
            raise ConfigParseError(msg)

        # Check for missing sheets
        #
        find_missing_sheets(config, file_type)


def find_missing_sheets(config, file_type):
    """ Ensure all the requested sheets actually exist

        Collects every sheet name listed under
        config[file_type]['sheets'] and compares it with the sheet
        names of the workbook at config[file_type]['path'].

        Returns the set of missing sheet names, which is always empty
        when the function returns normally.

        Raises ConfigParseError if any requested sheet is absent.
    """
    requested_sheets = set(itertools.chain.from_iterable(
        config[file_type]['sheets'].values()
    ))
    xls_file = pandas.ExcelFile(config[file_type]['path'])
    missing_sheets = requested_sheets - set(xls_file.sheet_names)
    if missing_sheets:
        msg = "Missing sheets in {0} file: {1}".format(
            file_type, missing_sheets
        )
        raise ConfigParseError(msg)

    # Single-argument call form: emits the same text whether `print` is
    # the Python 2 statement or the Python 3 function.
    print("No missing sheets for {0}".format(file_type))

    return missing_sheets


def fill_default_sheets(config):
    """ Fill in the column -> sheets mapping for files that ask for
        default sheets.

        For each required file whose 'default_sheets' value is truthy,
        the workbook at its 'path' is opened and column i is mapped to
        a one-element list holding the workbook's i-th sheet name, for
        i in range(config['num_columns']).  Any existing 'sheets'
        mapping for that file is replaced.

        Alters `config` in place and also returns it.

        Raises ConfigParseError if a workbook has fewer sheets than
        config['num_columns'].
    """
    for file_type in REQUIRED_FILES:
        file_config = config[file_type]
        if not file_config.get('default_sheets'):
            continue

        msg = "default_sheets specified for {0}, inspecting {1}".format(
            file_type,
            file_config['path'],
        )
        logging.debug(msg)
        sheet_names = pandas.ExcelFile(file_config['path']).sheet_names
        num_columns = config['num_columns']

        # Check up front so the caller gets a config error, not a bare
        # IndexError, and the mapping is never left half filled.
        if len(sheet_names) < num_columns:
            raise ConfigParseError(
                "{0} has {1} sheets but num_columns is {2}; "
                "cannot assign default sheets".format(
                    file_type, len(sheet_names), num_columns
                )
            )

        file_config["sheets"] = dict(
            (i, [sheet_names[i]]) for i in range(num_columns)
        )
    return config


# def check_inversion(config):
#     """ Invert columns-sheet mapping if inversion is specified.  This means
#         that, if there are 12 columns, each mapped to a sheet in sequence,
#         we will instead map sheet 0 to column 11, sheet 1 to column 10, etc.

#         NOTE: this does not do the inversion of rows!  That is done when the
#         data is actually read-in.
#     """
#     for file_type in REQUIRED_FILES:
#         if config[file_type].get('inverted'):
#             logging.info("Inverting sheets for {0}".format(file_type))

#             num_columns = config['num_columns']
#             sheets = config[file_type]["sheets"]
#             new_sheets = {}

#             for i in range(num_columns):
#                 j = num_columns - (i + 1)
#                 new_sheets[j] = sheets[i]
#                 logging.info("\t{0} -> {1}".format(i, j))

#             config[file_type]["sheets"] = new_sheets


def process_default_channels(config):
    """ If the user specified a default channel in the ab_channels
        parameter, fill in the channels for antibodies that are in the
        flow pattern for which a channel was not specified in the
        config.  Alters the config in place.

        The 'default' entry is removed from ab_channels once it has
        been read.  Nothing changes when there is no 'default' entry
        or its value is falsy.  Returns None.
    """
    channels = config['genepix_file']['ab_channels']
    default_channel = channels.get('default')
    if not default_channel:
        return

    del channels['default']
    # Iterate the mapping directly: dict.iterkeys() exists only on
    # Python 2, while plain iteration yields the keys on 2 and 3.
    for ab in config['genepix_file']['flow_pattern']:
        channels.setdefault(ab, default_channel)


def handle_manual_cell_counting(config):
    """ The configuration file may specify that some columns of the
        device were counted manually rather than using the Elements
        software.  Setting `manual` to True means that all the columns
        were counted manually.  Or, it can be a list of columns that
        were counted manually.

        Normalises config['elements_file']['manual'] in place so that
        it is always a list: True becomes the list of keys of the
        elements_file sheets mapping, a missing/None/False value
        becomes an empty list, and a list is kept as given.

        Raises ConfigParseError for any other value.
    """
    manual_value = config['elements_file'].get('manual')
    if manual_value is True:
        # list() so the stored value is a real list on Python 3 as
        # well, where keys() returns a view.
        manual_value = list(config['elements_file']['sheets'].keys())
    elif manual_value in [None, False]:
        manual_value = []
    elif not isinstance(manual_value, list):
        msg = "config['elements_file']['manual'] has bad value!"
        raise ConfigParseError(msg)
    config['elements_file']['manual'] = manual_value


def read_config(file_path):
    """ Load the YAML config at `file_path`, validate it and fill in
        its defaults.

        Returns the config dict after default sheets, default antibody
        channels and the manual-counting setting have been filled in.

        Raises ConfigParseError if the file does not hold a mapping or
        fails validation in check_config.
    """
    with open(file_path) as fh:
        config = yaml.safe_load(fh)

    # An empty file loads as None and a bare scalar/list as a
    # non-dict; report both as config errors.
    if not isinstance(config, dict):
        raise ConfigParseError(
            "Config file {0} does not contain a mapping".format(file_path)
        )

    # Validate before the other helpers touch the config so that a
    # missing required key is reported as a ConfigParseError rather
    # than a KeyError.  check_config also fills the default sheets in
    # place, which handle_manual_cell_counting relies on.
    check_config(config)
    process_default_channels(config)
    handle_manual_cell_counting(config)

    return config
Tech Fingerprint

Alerts (5)

'isinstance(' Overuse may indicate design issues; consider polymorphism
32 140
'def' Ensure functions have docstrings for documentation
70 148
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
122