
/roof/iotools.py

https://bitbucket.org/roysc/roof
# -*- coding: utf-8 -*-
"""
Tools for processing data from Decagon Devices sensors.
Aims to support data exported as XLS files (or converted to CSV) as well as raw
DZD format (zipped SQL database).
"""

__date__ = "Oct 21, 2012"

import re, os
import numpy as np
import pandas as pd

from roof.data import *
from roof import decagon  # import *

INPUT_FREQ = pd.offsets.Minute(5)
force_freq = False  # whether to force converted data to conform to INPUT_FREQ

# Callback used to decide what to do when conflicting sensor values
# are encountered while converting.
# If callable, it is applied to the conflicting IDs; anything else: raise exception.
sensor_ambiguous = max

# Used to handle an unknown sensor ID by mapping it to a known one.
sensor_unknown = lambda idnum: {123: 120, 189: 188, 249: 120}[idnum]

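# Example of overriding the hooks above (hypothetical; the defaults stand
# unless a caller reassigns them before process_file is used):
#
#     from roof import iotools
#     iotools.sensor_ambiguous = min   # prefer the lowest conflicting sensor ID
#     iotools.force_freq = True        # insist on INPUT_FREQ spacing
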
def process_file(path, convert=True, freq=INPUT_FREQ):
    """
    Process a file and return a LabeledFrame.

    The handler is chosen by MIME type, with a fallback on the file
    extension for .db and .dzd files.
    """
    from mimetypes import guess_type
    # from magic import from_file
    # by mime type
    handlers = {
        'text/csv': _process_csv,
        'text/plain': _process_csv,
        'application/vnd.ms-excel': _process_xls,
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
            _process_xlsx,
        'application/octet-stream': _process_sql,
        'application/zip': _process_dzd,
    }
    # mime = from_file(path, mime=True).decode('utf-8')
    mime, enc = guess_type(path)
    try:
        handler = handlers[mime]
    except KeyError:
        from os.path import splitext
        base, ext = splitext(path)
        if ext == '.db':
            handler = _process_sql
        elif ext == '.dzd':
            handler = _process_dzd
        else:
            raise Exception("File type not supported.")
    df = handler(path, convert)
    # df = pd.DataFrame(df)
    df.sort(axis=1, inplace=True)
    # why do this here?
    try:
        df.set_index('time', inplace=True)
    # except KeyError:
    #     try: df.set_index(df.columns[0], inplace=True)
    except: pass
    if force_freq:  # force a frequency
        if df.index.inferred_freq != freq:
            raise Exception("Can't infer frequency for " + path)
        else:
            df = df.asfreq(freq)
    return LabeledFrame(df)
    # return LabeledFrame(df, file=os.path.basename(path))

def _process_csv(path, convert):
    return _process_csv_plain(path, convert)
    # Alternative: try the Excel-style CSV parser first, then the plain one.
    # df, exn = None, None
    # for handler in [_process_csv_excel, _process_csv_plain]:
    #     try:
    #         df = handler(path, convert)
    #     except Exception as e:
    #         exn = e
    # if df is None:
    #     raise exn
    # return df

def _process_csv_plain(path, convert):
    df = pd.read_csv(path, na_values='.', parse_dates=[0], index_col=0)
    # df.columns = [Label.from_str(s) for s in df.columns]
    cols = []
    for s in df.columns:
        try:
            s = Label.from_str(s)
        except: pass
        cols.append(s)
    df.columns = cols
    return df

def _labeler(platform=None):
    # Generates a function which builds a Label for each CSV header column.
    def label_fn(col):
        # Generates a label for each column in CSV header
        name, srlabel, meas = col[:3]
        meas = [s for s in meas.strip().split(' ') if s]
        m = re.search(r'port (\d+)', name, re.I)
        port = int(m.group(1)) if m else 0
        sensor = srlabel
        if len(meas) > 1:
            units, var = meas[:2]
        else:
            units, var = None, meas[-1]
        sensor = decagon.sensors_bylabel.get(srlabel, decagon.UNKNOWN_SENSOR)
        for m in sensor.measurements:
            # find out which measurement it is
            if re.search(str(m), var, re.I):
                units, var = m.units, m
                break
        if isinstance(var, str):
            if re.search('liters', var, re.I):
                var = 'runoff'
        lbl = Label(port, platform, sensor, var, units)
        return lbl
    return label_fn

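# For illustration (hypothetical header values; the exact strings depend on the
# export and on the decagon module's sensor tables), a header column such as
# ('Port 1', 'ECRN-50', 'mm Precipitation') would be mapped by label_fn to
# something like Label(1, platform, <ECRN-50 sensor>, 'Precipitation', 'mm').
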
def _process_csv_excel(path, convert):
    # Process a CSV file obtained from a converted Excel file
    # read just header
    hdr = pd.read_csv(path, index_col=0, nrows=3, header=None)
    # greenroof platform (treatment)
    plat = get_platform(os.path.basename(path))
    labels = hdr.apply(_labeler(plat))
    df = pd.read_csv(path, index_col=0, names=labels, skiprows=4,
                     parse_dates=True, na_values=['***'])
    return df

def _process_xls(path, convert):
    xls = pd.ExcelFile(path)
    sheet = 'Processed' if convert else 'Unprocessed'
    df = xls.parse(sheet, index_col=0, header=None)
    plat = get_platform(os.path.basename(path))
    labels = df.head(3).apply(_labeler(plat))
    df = df.tail(-3).convert_objects()
    df.columns = labels
    return df

_process_xlsx = _process_xls

def _read_raw_time(times):
    from datetime import datetime
    # from pytz import timezone
    # tz = timezone('US/Eastern')
    # origin = tz.localize(datetime(2000, 1, 1, 0))
    # "Origin" date, in seconds: Jan 1, 2000 00:00
    origin = datetime(2000, 1, 1, 0)
    if hasattr(origin, 'timestamp'):  # py3 & py2 compat.
        origin = origin.timestamp()
    else:
        from time import mktime
        origin = mktime(origin.timetuple())
    tseries = times + int(origin)
    return tseries.astype('datetime64[s]')

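# Intended decoding (ignoring any timezone offset introduced by timestamp() /
# mktime(), which use local time): a raw value of 0 maps to the origin itself,
# 2000-01-01 00:00:00, and a raw value of 86400 to one day later,
# 2000-01-02 00:00:00.
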
def _process_sql(path, convert):
    from sqlite3 import connect
    from pandas.io.sql import read_frame
    plat = get_platform(os.path.basename(path))
    qstr = "select * from %s"
    db = connect(path)
    ports = read_frame(qstr % 'ports', db, coerce_float=False)
    ports.set_index('portNum', inplace=True)
    raw = read_frame(qstr % 'raw', db, coerce_float=False)
    data = {}
    from re import match
    # Determine sensor types
    for col in raw.columns:
        add = {}
        if col == 'time':
            add[TimeLabel()] = _read_raw_time(raw[col])
        elif col == 'displaytime':
            continue
        else:
            sr = decagon.UNKNOWN_SENSOR
            comment = ''  # Comment for label, to log what goes wrong
            try:  # Try to determine sensor.
                m = match(r'ch(\d+)', col)
                if m is None: continue
                ch = int(m.group(1))
                p = ports.ix[ch]
                # There can be multiple entries for a port number, but they
                # should have the same sensor. If not, throw.
                idnum = p['sensorID']
                if np.size(idnum) > 1:
                    idnum = idnum[idnum != 255]  # "None" ID
                    idnum = idnum.unique()
                    if len(idnum) > 1:
                        comment = "Multiple sensor IDs for Port %s: %s" % (ch, idnum)
                        if callable(sensor_ambiguous):
                            idnum = sensor_ambiguous(idnum)
                        # elif isinstance(sensor_ambiguous, str):
                        #     idnum = eval(sensor_ambiguous)(idnum)
                        else:
                            raise Exception(comment)
                try:
                    sr = decagon.sensors[int(idnum)]
                except KeyError:
                    comment = "Unknown sensor id: %s" % idnum
                    try:
                        idnum_ = sensor_unknown(int(idnum))
                        comment += ", trying %s" % idnum_
                        sr = decagon.sensors[idnum_]
                    except KeyError as e:
                        raise Exception(comment)
                if not convert:
                    raise Exception("Not converting values.")
                full = convert != 'split'
                for meas, vals in sr.convert(raw[col], full).items():
                    label = Label(ch, plat, sr, meas, sr.get_units(meas))
                    if comment: label.comment = comment
                    add[label] = vals
            except Exception as e:
                add[RawLabel(ch, plat, sr, e)] = raw[col]
        data.update(add)
    return pd.DataFrame(data)

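# The frame built above is keyed by label objects: a TimeLabel column for the
# decoded timestamps, Label columns for successfully converted readings, and
# RawLabel columns (carrying the exception) wherever conversion was skipped or
# failed.
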
def _process_dzd(path, convert):
    from zipfile import ZipFile
    from tempfile import TemporaryDirectory
    df = None
    with ZipFile(path, 'r') as zf:
        ls = zf.namelist()
        if not ls:
            raise Exception("Archive is empty.")
        elif len(ls) > 1:
            raise Exception("Multiple files in archive.")
        with TemporaryDirectory() as tdir:
            tpath = zf.extract(ls[0], tdir)
            df = _process_sql(tpath, convert)
    return df

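# A minimal command-line sketch (not part of the original interface; assumes
# the roof package and its data/decagon modules are importable): process each
# file named on the command line and print the resulting frame.
if __name__ == '__main__':
    import sys
    for _path in sys.argv[1:]:
        print(process_file(_path))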