/etm_interface.py
Python | 345 lines | 240 code | 22 blank | 83 comment | 35 complexity | 1d3678e677ff0ea8027465ada5b3c234 MD5 | raw file
- #!/usr/bin/python
- """ Collection of functions and classes to deal with CMWP data
- """
- #-------------------------------------------------------------------------------
- #Imports
- #-------------------------------------------------------------------------------
- import os
- import netCDF4
- import scipy as sp
- import numpy as np
- import pandas as pd
- from pandas import Panel, DataFrame, Series
- import seawater.csiro as sw
- from etm_data.lisst import *
- import etm_data.etm_time as t
#--------------------------------------------------------------------------------
#Constants
#--------------------------------------------------------------------------------
# DataFrame must have equal sized arrays, so skip x/y when loading data
# ('x'/'y' are the single-element lon/lat variables written by saveNetCDF;
# 'time' is skipped because it becomes the DataFrame index instead).
ship_keys = ['x', 'y', 'time']
# Instruments sampled at a high rate; these are decimated 100x (data[::100])
# in _open_and_save_file before the 2s resampling.
sub_sample = ['VG1', 'VG2', 'ACC1', 'ACC2', 'uT', 'uC', 'OXY']
#--------------------------------------------------------------------------------
#Functions
#--------------------------------------------------------------------------------
def formatLisstDataForDataFrame(netCDF_data):
    """Format processed LISST-100x data from a netCDF file for pandas.

    Parameters
    ----------
    netCDF_data - netCDF4.Dataset
        Must contain 'vol' (n x 32), 'diams' (n x 14) and 'time' variables.

    Returns
    -------
    (vol, diams) - Tuple of DataFrame of volume and diameters of lisst data,
        both indexed by the converted time axis.
    """
    raw_time = netCDF_data.variables['time'][:]
    # Heuristic: epoch seconds are large (> 1e6) floats, while MATLAB
    # datenums for these cruises are much smaller.  1000000 is arbitrary,
    # but effective for this.
    if isinstance(raw_time[0], np.float64) and raw_time[0] > 1000000:
        index = pd.DatetimeIndex(t.epochToDatetime(raw_time))
    else:
        # NOTE(review): only this branch localizes to UTC — presumably
        # epochToDatetime already yields UTC-aware stamps; confirm.
        index = pd.DatetimeIndex(t.datenumToDatetime(raw_time)).tz_localize('UTC')
    # 32 volume-concentration size classes, keyed by column number.
    vol = DataFrame({c: netCDF_data.variables['vol'][:, c] for c in range(32)},
                    index=index)
    # 14 diameter columns, keyed the same way.
    diams = DataFrame({c: netCDF_data.variables['diams'][:, c] for c in range(14)},
                      index=index)
    return (vol, diams)
def formatDataForDataFrame(netCDF_data):
    """Format data from a netCDF file for loading into a pandas DataFrame.

    Parameters
    ----------
    netCDF_data - netCDF4.Dataset

    Returns
    -------
    df - DataFrame object with instrument data, indexed by converted time
    """
    # Load every variable except the ship position/time bookkeeping ones;
    # 'time' is handled separately below as the index.
    data = {name: var[:]
            for name, var in netCDF_data.variables.items()
            if name not in ship_keys}
    raw_time = netCDF_data.variables['time'][:]
    # Large floats (> 1e6) are epoch seconds; otherwise MATLAB datenums.
    if isinstance(raw_time[0], np.float64) and raw_time[0] > 1000000:
        index = pd.DatetimeIndex(t.epochToDatetime(raw_time))
    else:
        index = pd.DatetimeIndex(t.datenumToDatetime(raw_time)).tz_localize('UTC')
    return DataFrame(data, index=index)
#TODO: Format like LISST and junk this approach
def formatADCPForDataFrame(adcpNetCDFData):
    """ Formats ADCP data from wh300.nc for loading into pandas containers.

    It seems you can't place DataFrames with different dimensions into a
    single Panel, so this returns a dictionary with a:
    - DataFrame of 1d data
    - Panel of 2d data

    Parameters
    ----------
    adcpNetCDFData - netCDF object with ADCP data

    Returns
    -------
    df - Dict of 1d data in DataFrame ('1d') and 2d data in Panel ('2d')
    """
    # Bug fix: on Python 3, .keys() returns a view that supports neither
    # .count() nor .remove(); materialize a list first.
    keys = list(adcpNetCDFData.variables.keys())
    # 'trajectory' is empty so I'm throwing it out
    if 'trajectory' in keys:
        keys.remove('trajectory')
    dim1 = {}
    dim2 = {}
    for k in keys:
        if len(adcpNetCDFData.variables[k].shape) == 2:
            dim2[k] = adcpNetCDFData.variables[k][:]
        else:
            dim1[k] = adcpNetCDFData.variables[k][:]
    # NOTE(review): pandas.Panel was removed in pandas 1.0; this function
    # needs an xarray / MultiIndex replacement to run on modern pandas.
    df = {}
    df['1d'] = DataFrame(dim1)
    df['2d'] = Panel(dim2)
    return df
def loadNetCDF(file, mode='r'):
    """Loads netCDF file of data and returns DataFrame of all vars.

    Parameters
    ----------
    file - Path to netCDF file of data (note: shadows the builtin 'file';
           kept for backward compatibility with keyword callers)
    mode - Mode passed to netCDF4.Dataset; only reading makes sense here

    Returns
    -------
    df - DataFrame of instrument data, or a (vol, diams) tuple of
         DataFrames for processed LISST files (4-variable LISST datasets)
    """
    data = netCDF4.Dataset(file, mode)
    try:
        if 'LISST' in file:
            print('Loading LISST data')
            # Processed LISST files carry exactly 4 variables and get the
            # special (vol, diams) treatment.
            if len(data.variables.keys()) == 4:
                dfData = formatLisstDataForDataFrame(data)
            else:
                dfData = formatDataForDataFrame(data)
        else:
            dfData = formatDataForDataFrame(data)
    finally:
        # Bug fix: close the dataset even when formatting raises, so the
        # file handle is not leaked.
        data.close()
    return dfData
def processData(path, dirBase='oc1204B.0.F.', optInsts=None):
    """ Processes stepA of all data (except LISST) for all days.

    Parameters
    ----------
    path     - String of directory holding the raw day files; also used as
               the root for the per-instrument output directories
    dirBase  - String prefix for the per-instrument output directories
    optInsts - Optional list of instruments to process; defaults to the
               module-level procInst (from the star-imported constants)
    """
    days = np.arange(1, 8)  # cruise days 1-7
    # Bug fix: this previously read the undefined name 'opt_insts' and
    # raised NameError whenever an instrument subset was passed in.
    if optInsts is not None:
        insts = optInsts
    else:
        insts = procInst
    for inst in insts:
        outDir = path + dirBase + inst_name[inst]
        if not os.path.exists(outDir):
            print('Making dir ' + outDir)
            os.mkdir(outDir)
    for d in days:
        print('Processing day %s' % d)
        procStepA(d, inPath=path)
def procStepA(day, inPath='./', outPath='./', type='Normal', ship='oc1204B'):
    """ Begins processing of Spring ETM cruise data.

    Step A:
    1. Subsample to 2s intervals to make size a bit more managable
    2. Add lat, long, and depth
    3. Saves changes to oc1204B.0.F.INSTRUMENT/dayDAY.nc

    Parameters
    ----------
    day     - Int of day of cruise to process
    inPath  - String of path to input files (where cmwp_day?_*?.nc exist)
    outPath - String of path to place output files
    type    - 'Normal' or 'LISST'; anything else is a no-op.  (Shadows the
              builtin; name kept for backward compatibility with callers.)
    ship    - String cruise/ship identifier used in output directory names
    """
    if type not in ('Normal', 'LISST'):
        return
    # Both branches need the 2s-resampled CTD record as the common time
    # base, so load it once here instead of in each branch.
    f = os.path.join(inPath, 'cmwp_day%d_CTD.nc' % day)
    ctd = loadNetCDF(f).resample('2s')
    if type == 'Normal':
        print('Processing normal data')
        #oowTimes = calcOutOfWater(ctd)
        for inst in procInst:
            if inst == 'LISST':
                continue
            _open_and_save_file(day, inst, ctd, inPath, outPath, ship)
    else:  # 'LISST'
        print('Processing LISST data')
        _open_and_save_file(day, 'LISST_PROC', ctd, inPath, outPath, ship)
# Used to clear out memory after closing scope.
def _open_and_save_file(day, inst, ctd, inPath, outPath, ship):
    """Open one instrument's day file, align it to the CTD time base, add
    depth, and save it back out as a netCDF file.

    Parameters
    ----------
    day     - Int day of cruise
    inst    - Instrument key ('LISST_PROC' gets the tuple treatment)
    ctd     - 2s-resampled CTD DataFrame supplying the target time index
    inPath  - Directory of input cmwp_day files
    outPath - Root directory for output
    ship    - Cruise/ship identifier prefix for the output directory
    """
    # Open old file
    f = 'cmwp_day%d_%s.nc' % (day, inst)
    f = os.path.join(inPath, f)
    print('Opening %s' % f)
    data = loadNetCDF(f, mode='r')
    if inst == 'LISST_PROC':
        # Processed LISST data is a (volumes, diameters) tuple; only the
        # volume frame gets the depth column added.
        d1 = data[0].resample('2s').reindex(ctd.index)
        d2 = data[1].resample('2s').reindex(ctd.index)
        d1 = addDepth(d1, ctd)
        data = (d1, d2)
        ship = ship + '.0.F.' + 'LISST'
        type = 'LISST'
    else:
        if inst in sub_sample:
            # High-rate instruments are decimated 100x before resampling.
            data = data[::100].resample('2s').reindex(ctd.index)
        else:
            data = data.resample('2s').reindex(ctd.index)
        data = addDepth(data, ctd)
        ship = ship + '.0.F.' + inst_name[inst]
        type = 'Normal'
    # Save new netCDF file as <outPath>/<ship dir>/day<day>.nc
    fDir = os.path.join(outPath, ship)
    dayFile = 'day%d.nc' % day
    f = os.path.join(fDir, dayFile)
    saveNetCDF(f, data, cruise='fall_1', type=type)
def saveNetCDF(filePath, df, cruise, type='Normal', cache=True):
    """ Saves data from an instrument (DataFrame) into a netCDF4 file.

    Also adds:
        lat, lon, and global description (offering)

    Parameters
    ----------
    filePath - String of path for file (Assumes inclusion of dir
               structure for description)
    df       - DateFrame object of data to save to file, or a
               (volumes, diameters) tuple of DataFrames for type='LISST'
    cruise   - 'fall_1', 'fall_2', or 'spring' - Used to get units
    type     - String like origin of data
               - 'Normal' not from ADCP or LISST
               - 'ADCP'   (not implemented)
               - 'LISST'  - Raw lisst data
               - 'LISST2' - LISST data that has already been dumped into
                 netcdf files and pandas
    cache    -
        True: Adheres to netCDF like cache path structure
              e.g. oceanus1204c.0.F.LISST/day8.nc

    Raises
    ------
    ValueError          - if type is not one of the supported origins
    NotImplementedError - for type='ADCP'
    """
    if type not in ('Normal', 'ADCP', 'LISST', 'LISST2'):
        # Bug fix: this used a bare 'raise' with no active exception,
        # which itself errors out; raise something callers can catch.
        raise ValueError('Data type not supported ' + type)
    # Bug fix: function-level 'from x import *' is a SyntaxError on
    # Python 3; import the cruise constants module and bind the names
    # explicitly instead.
    if cruise == 'fall_1':
        import etm_data.fall_1_consts as consts
    elif cruise == 'fall_2':
        import etm_data.fall_2_consts as consts
    elif cruise == 'spring':
        # NOTE(review): original imported 'etm_data.spring_conts', which
        # looks like a typo for 'spring_consts' (cf. the __main__ block);
        # confirm the actual module name.
        import etm_data.spring_consts as consts
    units = consts.units
    lat = consts.lat
    lon = consts.lon
    # NOTE(review): the lisst_* name tables are assumed to come from the
    # module-level 'from etm_data.lisst import *'; rev_names from consts.
    print('Creating netCDF file ' + filePath)
    # NOTE(review): zlib is a createVariable option, not a Dataset one —
    # verify netCDF4 accepts (or ignores) it here.
    f = netCDF4.Dataset(filePath, 'w', format='NETCDF4', zlib=True)
    if cache:
        # Strip the trailing '/dayN.nc' so the description names the
        # cache directory (offering).
        f.description = filePath[:-8]
    else:
        f.description = filePath
    if type == 'Normal' or type == 'LISST2':
        f.createDimension('time', df.shape[0])
        times = t.datetime64ToEpoch(df.index.values)
    else:
        # Tuple-shaped LISST data: use the volume frame's time axis.
        f.createDimension('time', df[0].shape[0])
        times = t.datetime64ToEpoch(df[0].index.values)
    f.createDimension('x', 1)
    f.createDimension('y', 1)
    tmp = f.createVariable('time', 'float64', ('time',))
    tmp[:] = times
    tmp.units = units['time']
    tmp = f.createVariable('x', 'float', ('x',))
    tmp[:] = lon
    tmp.units = 'latlon'
    tmp = f.createVariable('y', 'float', ('y',))
    tmp[:] = lat
    tmp.units = 'latlon'
    # Treat Data Origin
    if type == 'Normal':
        rev_names = consts.rev_names
        for v in df.keys():
            print('Variable ' + rev_names[v] + ' : ' + units[rev_names[v]])
            tmp = f.createVariable(v, df[v].dtype, ('time',))
            tmp[:] = df[v].values
            tmp.units = units[rev_names[v]]
            f.sync()
    elif type == 'LISST':
        # Treat volume and diameter df separately
        for i, v in enumerate(df[0].keys()):
            print('Variable ' + lisst_vol_names[i] + ' : ' + 'um')
            tmp = f.createVariable(lisst_vol_names[i], 'float64', ('time',))
            if v == 'elev':
                # Depth column added by addDepth rides along with volumes.
                tmp[:] = df[0]['elev'].values
                tmp.units = 'm'
            else:
                tmp[:] = df[0][i].values
                tmp.units = 'um'
            f.sync()
        for i, v in enumerate(df[1].keys()):
            print('Variable ' + lisst_diams_names[i] + ' : ' + lisst_diams_units[i])
            tmp = f.createVariable(lisst_diams_names[i], 'float64', ('time',))
            tmp[:] = df[1][i].values
            tmp.units = lisst_diams_units[i]
            f.sync()
    elif type == 'LISST2':
        for v in df.keys():
            print('Variable ' + v)
            tmp = f.createVariable(v, 'float64', ('time',))
            tmp[:] = df[v].values
            tmp.units = lisst_units[v]
            f.sync()
    else:  # 'ADCP' (guaranteed by the guard at the top)
        # Bug fix: was a bare 'raise' that also leaked the open file.
        f.close()
        raise NotImplementedError('ADCP saving not implemented')
    f.close()
- #-------------------------------------------------------------------------------
- # Main
- #-------------------------------------------------------------------------------
if __name__ == "__main__":
    # Bug fix: OptionParser was never imported anywhere in the file.
    from optparse import OptionParser

    cruises = ['spring', 'fall1', 'fall2']
    usage = "Usage: %prog raw_matlab_dir etm_cruise ['spring', 'fall1', 'fall2']"
    parser = OptionParser(usage=usage)
    # Bug fixes: add_option used 'default_value' instead of 'default', an
    # unquoted dest (NameError), and a 'type' keyword that is invalid in
    # combination with action="store_true".
    parser.add_option("-l", action="store_true", help="Process LISST data",
                      default=False, dest="lisst")
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("Incorrect number of args")
    # Bug fix: 'arg' was an undefined name; the parsed positionals are
    # in 'args'.
    if args[1] not in cruises:
        parser.error("That's not a valid ETM cruise.")
    elif args[1] == 'spring':
        import etm.spring_consts as consts
    elif args[1] == 'fall1':
        import etm.fall_1_consts as consts
    else:
        import etm.fall_2_consts as consts
    # Bug fix: the flag lives on the parsed options object, not on a
    # bare name 'lisst'.
    if options.lisst:
        lisstMatToNc(args[0])
    else:
        cmwpMatToNc(args[0])