/roof/iotools.py
Python | 277 lines | 185 code | 50 blank | 42 comment | 29 complexity | 7835b2c96d38d662a879c0b3c1548cf4 MD5 | raw file
- # -*- coding: utf-8 -*-
- """
- Tools for processing data from Decagon Devices sensors.
- Aims to support data exported as XLS files (or converted to CSV) as well as raw
- DZD format (zipped SQL database).
- """
- __date__ = "Oct 21, 2012"
- import re, os
- import numpy as np
- import pandas as pd
- from roof.data import *
- from roof import decagon # import *
- INPUT_FREQ = pd.offsets.Minute(5)
- force_freq = False # whether to force converted data to conform to INPUT_FREQ
- # Callback, used to decide what to do when conflicting sensor values
- # are encountered while converting.
- # anything else: raise exception
- sensor_ambiguous = max
- # Used to handle an unkown sensor
- sensor_unknown = lambda idnum: {123: 120, 189: 188, 249: 120}[idnum]
- def process_file(path, convert=True, freq=INPUT_FREQ):
- """
- Processes a file and returns a dataframe.
- """
- from mimetypes import guess_type
- # from magic import from_file
-
- # by mime type
- handlers = {
- 'text/csv' : _process_csv,
- 'text/plain' : _process_csv,
- 'application/vnd.ms-excel' : _process_xls,
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
- : _process_xlsx,
- 'application/octet-stream' : _process_sql,
- 'application/zip' : _process_dzd,
- }
- # mime = from_file(path, mime=True).decode('utf-8')
- mime, enc = guess_type(path)
- try:
- handler = handlers[mime]
- except KeyError:
- from os.path import splitext
- base, ext = splitext(path)
- if ext == '.db':
- handler = _process_sql
- elif ext == '.dzd':
- handler = _process_dzd
- else: raise Exception("File type not supported.")
- df = handler(path, convert)
- # df = pd.DataFrame(df)
-
- df.sort(axis=1, inplace=True)
- # why do this here?
- try:
- df.set_index('time', inplace=True)
- # except KeyError:
- # try: df.set_index(df.columns[0], inplace=True)
- except: pass
- if force_freq: # force a frequency
- if df.index.inferred_freq != freq:
- raise Exception("Can't infer frequency for "+path)
- else:
- df = df.asfreq(freq)
- return LabeledFrame(df)
- # return LabeledFrame(df, file=os.path.basename(path))
- def _process_csv(path, convert):
- return _process_csv_plain(path, convert)
- # df, exn = None, None
- # for handler in [_process_csv_excel, _process_csv_plain]:
- # try:
- # df = handler(path, convert)
- # except Exception as e:
- # exn = e
- # if df is None:
- # raise exn
- return df
- def _process_csv_plain(path, convert):
- df = pd.read_csv(path, na_values='.', parse_dates=[0], index_col=0)
- # df.columns = [Label.from_str(s) for s in df.columns]
- cols = []
- for s in df.columns:
- try:
- s = Label.from_str(s)
- except: pass
- cols.append(s)
- df.columns = cols
- return df
- def _labeler(platform=None):
- # Generates a function which...
- def label_fn(col):
- # Generates a label for each column in CSV header
- name, srlabel, meas = col[:3]
- meas = [s for s in meas.strip().split(' ') if s]
- m = re.search('port (\d+)', name, re.I)
- port = int(m.group(1)) if m else 0
- sensor = srlabel
- if len(meas) > 1:
- units, var = meas[:2]
- else:
- units, var = None, meas[-1]
- sensor = decagon.sensors_bylabel.get(srlabel, decagon.UNKNOWN_SENSOR)
- for m in sensor.measurements:
- # find out which measurement it is
- if re.search(str(m), var, re.I):
- units, var = m.units, m
- break
- if isinstance(var, str):
- if re.search('liters', var, re.I):
- var = 'runoff'
- lbl = Label(port, platform, sensor, var, units)
- return lbl
-
- return label_fn
- def _process_csv_excel(path, convert):
- # Process a CSV file obtained from a converted Excel file
- # read just header
- hdr = pd.read_csv(path, index_col=0, nrows=3, header=None)
- # greenroof platform (treatment)
- plat = get_platform(os.path.basename(path))
- labels = hdr.apply(_labeler(plat))
- df = pd.read_csv(path, index_col = 0, names = labels, skiprows = 4,
- parse_dates = True, na_values = ['***'])
-
- return df
-
- def _process_xls(path, convert):
- xls = pd.ExcelFile(path)
- sheet = 'Processed' if convert else 'Unprocessed'
- df = xls.parse(sheet, index_col = 0, header = None)
- plat = get_platform(os.path.basename(path))
- labels = df.head(3).apply(_labeler(plat))
- df = df.tail(-3).convert_objects()
- df.columns = labels
-
- return df
- _process_xlsx = _process_xls
- def _read_raw_time(times):
- from datetime import datetime
- # from pytz import timezone
- # tz = timezone('US/Eastern')
- # origin = tz.localize(datetime(2000, 1, 1, 0))
- # "Origin" date, in seconds: Jan 1, 2000 00:00
- origin = datetime(2000, 1, 1, 0)
- if hasattr(origin, 'timestamp'): # py3 & py2 compat.
- origin = origin.timestamp()
- else:
- from time import mktime
- origin = mktime(origin.timetuple())
-
- tseries = times + int(origin)
- return tseries.astype('datetime64[s]')
-
- def _process_sql(path, convert):
- from sqlite3 import connect
- from pandas.io.sql import read_frame
- plat = get_platform(os.path.basename(path))
-
- qstr = "select * from %s"
- db = connect(path)
- ports = read_frame(qstr % 'ports', db, coerce_float=False)
- ports.set_index('portNum', inplace=True)
- raw = read_frame(qstr % 'raw', db, coerce_float=False)
- data = {}
- from re import match
- # Determine sensor types
- for col in raw.columns:
- add = {}
- if col == 'time':
- add[TimeLabel()] = _read_raw_time(raw[col])
- elif col == 'displaytime':
- continue
- else:
- sr = decagon.UNKNOWN_SENSOR
- comment = '' # Comment for label, to log what goes wrong
- try: # Try to determine sensor.
- m = match('ch(\d+)', col)
- if m is None: continue
- ch = int(m.group(1))
- p = ports.ix[ch]
- # There can be multiple entries for a port number, but they
- # should have the same sensor. If not, throw.
- idnum = p['sensorID']
- if np.size(idnum) > 1:
- idnum = idnum[idnum != 255] # "None" ID
- idnum = idnum.unique()
- if len(idnum) > 1:
- comment = "Multiple sensor IDs for Port %s: %s" % (ch, idnum)
- if callable(sensor_ambiguous):
- idnum = sensor_ambiguous(idnum)
- # elif isinstance(sensor_ambiguous, str):
- # idnum = eval(sensor_ambiguous)(idnum)
- else:
- raise Exception(comment)
- try:
- sr = decagon.sensors[int(idnum)]
- except KeyError:
- comment = "Unknown sensor id: %s" % idnum
- try:
- idnum_ = sensor_unknown(int(idnum))
- comment += ", trying %s" % idnum_
- sr = decagon.sensors[idnum_]
- except KeyError as e:
- raise Exception(comment)
- if not convert:
- raise Exception("Not converting values.")
- full = convert != 'split'
- for meas, vals in sr.convert(raw[col], full).items():
- label = Label(ch, plat, sr, meas, sr.get_units(meas))
- if comment: label.comment = comment
- add[label] = vals
- except Exception as e:
- add[RawLabel(ch, plat, sr, e)] = raw[col]
- data.update(add)
-
- return pd.DataFrame(data)
- def _process_dzd(path, convert):
- from zipfile import ZipFile
- from tempfile import TemporaryDirectory
-
- df = None
- with ZipFile(path, 'r') as zf:
- ls = zf.namelist()
- if not ls:
- raise Exception("Archive is empty.")
- elif len(ls) > 1:
- raise Exception("Multiple files in archive.")
- with TemporaryDirectory() as tdir:
- tpath = zf.extract(ls[0], tdir)
- df = _process_sql(tpath, convert)
- return df