/lincs/io/config_stats.py
Python | 96 lines | 88 code | 4 blank | 4 comment | 0 complexity | b7d17bf0ebe9751b9fc3d0e1218074f0 MD5 | raw file
- # -*- coding: utf-8 -*-
- """ Code for parsing the configuration file used for
- running statistics on Lincs experimental treatments.
- """
- import yaml
- import os
- import pandas
# Top-level keys every configuration must define, each paired with the
# Python type its value is expected to have after YAML parsing.
REQUIRED_FIELDS = (
    ("experiment_name", str),
    ("treatments", dict),
    ("signals", list),
    ("on_threshold", int),
)
class ConfigParseError(Exception):
    """Raised when the statistics configuration is missing or malformed.

    The message names the offending field, treatment, file, sheet or
    column.
    """
def check_config(config):
    """ Validate a parsed configuration and fill in per-treatment defaults.

    Checks that every entry of ``REQUIRED_FIELDS`` is present with the
    expected type, that a ``control`` treatment exists, and that each
    treatment points at an existing Excel file containing the requested
    sheet plus one column per signal and a cell-count column.

    The configuration is updated in place: a treatment without a ``sheet``
    gets the first sheet of its workbook, and a treatment without a
    ``cell_count_column`` gets ``"cell count"``.

    :param config: Parsed configuration.
    :type config: dict.
    :raises ConfigParseError: if any of the checks above fails.
    """
    # An empty or scalar YAML document does not parse to a mapping; report
    # it as a configuration error instead of failing with a TypeError below.
    if not isinstance(config, dict):
        raise ConfigParseError("Configuration should be a mapping of fields")

    for field, obj_type in REQUIRED_FIELDS:
        if field not in config:
            raise ConfigParseError("Missing field in config: {0}".format(
                field
            ))
        if not isinstance(config[field], obj_type):
            msg = "Field '{0}' should be of type {1}".format(field, obj_type)
            raise ConfigParseError(msg)

    treatments = config["treatments"]
    if "control" not in treatments:
        raise ConfigParseError("Configuration missing a control treatment!")

    signals = config["signals"]

    # items() instead of iteritems(): available on both Python 2 and 3.
    # Only the nested per-treatment dicts are modified inside the loop, so
    # the mapping being iterated never changes size.
    for name, options in treatments.items():
        # Make sure each treatment has an Excel file listed. A treatment
        # that is not a mapping at all cannot carry a path either.
        #
        if not isinstance(options, dict) or not options.get("path"):
            msg = "Treatment {0} is missing a path to data file".format(name)
            raise ConfigParseError(msg)
        path = options["path"]

        # Make sure that file exists
        #
        if not os.path.isfile(path):
            msg = "Treatment {0} data file {1} not found!".format(name, path)
            raise ConfigParseError(msg)

        # Open the file inside a context manager so the workbook handle is
        # released even when one of the checks below raises.
        #
        with pandas.ExcelFile(path) as xls_file:
            # Make sure the required sheet exists.
            # Set the required sheet if it's the default.
            #
            if "sheet" not in options:
                options["sheet"] = xls_file.sheet_names[0]
            elif options["sheet"] not in xls_file.sheet_names:
                msg = "Sheet {0} not found in {1}!".format(
                    options["sheet"], path
                )
                raise ConfigParseError(msg)

            # Ensure there is a cell_count_column
            #
            options.setdefault("cell_count_column", "cell count")

            # Make sure there is a column for each signal and the cell_count
            #
            data = xls_file.parse(options["sheet"])

        needed_columns = signals + [options["cell_count_column"]]
        for col in needed_columns:
            if col not in data.columns:
                msg = "Column {0} is missing in sheet {1} of {2}".format(
                    col,
                    options["sheet"],
                    path
                )
                raise ConfigParseError(msg)
def read_config(file_path):
    """ Load the YAML configuration file and validate it.

    The file is parsed with ``yaml.safe_load`` and the result is handed to
    ``check_config`` before being returned.

    :param file_path: Path to the config file
    :type file_path: string.
    :returns: dictionary version of config file.
    """
    with open(file_path) as stream:
        parsed = yaml.safe_load(stream)
    check_config(parsed)
    return parsed