# /roof/term.py
# Python | 251 lines | 208 code | 17 blank | 26 comment | 4 complexity | 74e92a81f47fcfca538b5d017460707d MD5 | raw file
- import os, re
- from glob import glob
- from datetime import datetime, timedelta
- import pandas as pd, numpy as np
- import roof
- from roof.iotools import *
- from roof.decagon import *
- from roof.data import *
- from roof.calc import *
- from roof import quirks
def files(path, ext='dzd', plat=None):
    """
    List data files in *path* with extension *ext* (default 'dzd').

    If *plat* is given, keep only filenames starting with that platform
    prefix (e.g. 'a1').  Returns bare filenames, not joined with *path*.
    """
    fs = os.listdir(path)
    # BUGFIX: the old pattern '.%s$' left the dot unescaped, so it matched
    # any character (e.g. 'oddzd' passed for ext 'dzd').  Escape ext so the
    # separator dot and the extension are matched literally.
    pat = re.compile(r'\.%s$' % re.escape(ext))
    fs = [f for f in fs if pat.search(f)]
    if plat:
        # treat plat as a literal prefix, not a regex
        fs = [f for f in fs if f.startswith(plat)]
    return fs
def do_sample(file):
    """Read one sampled CSV and return its Penman-Monteith ET estimate."""
    data = pd.read_csv(file, index_col=0, parse_dates=[0])
    net_radiation = data['solar radiation']
    return est_ETpm(data.dailytemp, data.humidity * 100,
                    calc_pressure(21), net_radiation, data.avgwindspeed)
- # def packall(s, letter, plats='1234', concat=True, ext='dzd'):
- # "join all dataframes in glob(s)"
- # from glob import glob
- # # "[aA]*.ext" or "[wW]eather*.ext"
- # s += "*["+letter.lower()+letter.upper()+"]"
- # if letter == 'w': s += 'eather'
- # else: s += '['+plats+']'
- # s += "*."+ext
- # ls = glob(s)
- # print('\n'.join(ls))
- # dfs = pack(ls)
-
- # if letter == 'w':
- # return concat(dfs.values())
- # h = {}
- # for n in range(1,5):
- # ds = [d for k,d in dfs.items() if int(get_platform(k)[1]) == n]
- # if ds and concat:
- # print("Concatenating %s #%s platforms" % (len(ds), n))
- # h[n] = concat(ds)
- # return h
def do_packplat(plat, join_platforms=True):
    """
    Pack every *.dzd file in the cwd belonging to platform *plat*.

    Delegates to do_packdata (defined in this module); the bare name
    `packdata` used previously is not defined here.
    """
    ls = [f for f in glob("*.dzd") if get_platform(f) == plat]
    return do_packdata(ls, join_platforms)
def do_packdata(ls, join_platforms=True):
    """
    Read DataFrames from a list of filenames.

    Returns a dict: platform -> list of DataFrames.  When join_platforms
    is true, each list is concatenated (and sorted) into a single frame.
    """
    byplat = {}
    for fname in ls:
        plat = get_platform(fname)
        if not plat:
            continue
        print("Processing", fname)
        byplat.setdefault(plat, []).append(process_file(fname))

    if join_platforms:
        byplat = {plat: do_concat(frames).sort()
                  for plat, frames in byplat.items()}
    return byplat
-
def do_concat(dfs):
    """Concatenate a sequence of frames, dropping duplicate rows."""
    joined = pd.concat(dfs)
    return LabeledFrame(joined.drop_duplicates())
def do_packall_write(letter, daily=False):
    """Pack all '*201*' season dirs for *letter* and write CSVs under text/."""
    packed = packall("*201*/", letter)
    for key, frame in packed.items():
        suffix = '_daily' if daily else ''
        outfile = 'text/' + letter.lower() + str(key) + suffix + '.csv'
        if daily:
            frame = frame.aggregate()

        print('writing', outfile)
        frame.to_csv(outfile, na_rep='.')
def do_query(dfs, dti):
    """Join each frame in *dfs* onto the times in DatetimeIndex *dti*."""
    joined = LabeledFrame(index=dti)
    for frame in dfs:
        joined = joined.join(frame).drop_duplicates().sort()
    return joined
def splitby_rainevents(waterdata):
    """
    Split a rainfall series into distinct rain events.

    A new event starts when rain falls more than 12 hours after the
    previous rainfall.  Returns a list of [start, end] timestamp pairs,
    one per event (empty list for empty input).

    NOTE(review): rain at the very first timestamp opens an event only if
    it is >12h after waterdata.index[0]; preserved from the original.
    """
    rainevts = []
    # Empty data
    if len(waterdata) == 0:
        return rainevts

    # Distinct event := rain falling >12 hours after the previous rainfall
    interval = timedelta(hours=12)
    t_evt = waterdata.index[0]  # time of the most recent rainfall seen
    for t, rain in waterdata.items():
        if rain > 0:
            if (t - t_evt) > interval:  # a new event begins
                if rainevts:
                    rainevts[-1][1] = t_evt  # close out the previous event
                rainevts.append([t, t])
            t_evt = t
    # BUGFIX: close out the final event — previously its end time was only
    # written when a *subsequent* event started, so the last event always
    # kept end == start.
    if rainevts:
        rainevts[-1][1] = t_evt
    # a list of [start, stop] pairs
    return rainevts
def get_startstoplists(xlspath):
    """Read start/stop DatetimeIndexes for every platform sheet of a workbook."""
    xl = pd.ExcelFile(xlspath)
    byplat = {}
    for sheet in xl.sheet_names:
        plat = get_platform(sheet)
        if not plat:
            print("Can't tell platform from sheet name '%s'" % sheet)
            continue
        dti = get_startstoplist(xl, sheet)
        if dti is not None:
            byplat[plat] = dti

    return byplat
def get_startstoplist(xl, sheet):
    """
    Read a list of start/stop times from one Excel sheet.

    Returns a DatetimeIndex built from the first column that parses as
    datetimes, or None when the sheet is empty or has no such column.
    """
    try:
        df = xl.parse(sheet, header=None, parse_dates=[1])
    except StopIteration:
        print("Could not parse sheet", sheet, "(possibly empty)")
        return None
    # BUGFIX: the old loop called `cs.remove(c)` on a pandas Index (which
    # has no .remove); the AttributeError was swallowed by a bare `except`,
    # so `break` never ran and the *last* parseable column won (or `dti`
    # stayed unbound).  Return the first datetime-parseable column instead.
    for c in df.columns:
        try:
            return pd.DatetimeIndex(df[c])
        except (TypeError, ValueError):
            pass
    print("No datetime column found at sheet", sheet)
    return None
def query(dfdict, dtis):
    """
    Look up each platform's DataFrame at its query times.

    dfdict: platform -> DataFrame;  dtis: platform -> DatetimeIndex.
    Returns a dict: platform -> de-duplicated rows at the query times.
    Platforms missing from dfdict are reported and skipped.
    """
    byplat = {}
    for plat, dti in dtis.items():
        df = dfdict.get(plat)
        if df is None:
            print("Data dict doesn't contain data for platform", plat)
            continue
        # BUGFIX: .ix was removed in pandas 1.0; these are label (time)
        # lookups, so .loc is the equivalent accessor.
        byplat[plat] = df.loc[dti].drop_duplicates()
    return byplat
def filter_var(df, var='vwc'):
    """Keep only the columns whose Label .variable equals *var*."""
    keep = [col for col in df.columns if col.variable == var]
    return LabeledFrame(df[keep])
def season_of(dt):
    """
    Return the meteorological season of a datetime or timestamp.

    defn: spring = Mar-May, summer = Jun-Aug,
          fall = Sep-Nov,  winter = Dec-Feb.
    """
    month = dt.month
    if 3 <= month <= 5:
        return 'spring'
    if 6 <= month <= 8:
        return 'summer'
    if 9 <= month <= 11:
        return 'fall'
    return 'winter'
-
def year_season_of(dt):
    """Return the (year, season) pair for *dt*."""
    return dt.year, season_of(dt)
def sumwater(col):
    """
    Aggregate one column: sum for water_vol columns, mean for everything else.

    The column name may be a Label or a Label-parseable string; anything
    else (or an unparseable string) falls through to mean().
    """
    lbl = col.name
    if isinstance(lbl, str):
        try:
            lbl = Label.from_str(col.name)
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; narrow to Exception.
        except Exception:
            pass  # not a parseable label; treat as a plain name
    if isinstance(lbl, Label) and lbl.variable == 'water_vol':
        return col.sum()
    return col.mean()
-
def do_fixtime(df, time, freq=INPUT_FREQ):
    """
    Shift pieces of DF with time gap by timedelta (on index) and concat
    to create new DF without gaps.
    """
    # NOTE(review): `.ix` and `tshift` were removed in pandas 1.0 / 2.0;
    # this code targets an older pandas release.
    # Split at the gap: rows strictly before `time` vs. from `time` onward.
    df0, df1 = df.ix[:time - freq], df.ix[time:]
    # Shift the earlier piece forward so its last sample lands one `freq`
    # before `time` — presumably closing the gap from the left; confirm.
    df0_shifted = df0.tshift(freq = time - freq - df0.index[-1])
    # Shift the later piece back by its offset from `time` so it abuts
    # the (now-shifted) earlier piece — verify sign convention against tshift docs.
    df1_shifted = df1.tshift(periods = -1, freq = df1.index[0] - time)
    return LabeledFrame(pd.concat([df0_shifted, df1_shifted]))
-
- # def outliers(dfs):
- # """Find outliers in data"""