/etm_interface.py
Python | 345 lines | 240 code | 22 blank | 83 comment | 35 complexity | 1d3678e677ff0ea8027465ada5b3c234 MD5 | raw file
- #!/usr/bin/python
- """ Collection of functions and classes to deal with CMWP data
- """
- #-------------------------------------------------------------------------------
- #Imports
- #-------------------------------------------------------------------------------
- import os
- import netCDF4
- import scipy as sp
- import numpy as np
- import pandas as pd
- from pandas import Panel, DataFrame, Series
- import seawater.csiro as sw
- from etm_data.lisst import *
- import etm_data.etm_time as t
#--------------------------------------------------------------------------------
#Constants
#--------------------------------------------------------------------------------
# DataFrame must have equal sized arrays, so skip x/y when loading data
# ('x'/'y' are the single-element lon/lat variables written by saveNetCDF;
# 'time' is skipped because it becomes the DataFrame index instead).
ship_keys = ['x', 'y', 'time']
# Instruments sampled at a high rate; these are decimated 100x (data[::100])
# in _open_and_save_file before the 2s resampling.
sub_sample = ['VG1', 'VG2', 'ACC1', 'ACC2', 'uT', 'uC', 'OXY']
#--------------------------------------------------------------------------------
#Functions
#--------------------------------------------------------------------------------
def formatLisstDataForDataFrame(netCDF_data):
    """Format processed LISST-100x data from a netCDF file for pandas.

    Parameters
    ----------
    netCDF_data - netCDF4.Dataset
        Must contain 'vol' (n x 32), 'diams' (n x 14) and 'time' variables.

    Returns
    -------
    (vol, diams) - Tuple of DataFrame of volume and diameters of lisst data,
        both indexed by the converted time axis.
    """
    raw_time = netCDF_data.variables['time'][:]
    # Heuristic: epoch seconds are large (> 1e6) floats, while MATLAB
    # datenums for these cruises are much smaller.  1000000 is arbitrary,
    # but effective for this.
    if isinstance(raw_time[0], np.float64) and raw_time[0] > 1000000:
        index = pd.DatetimeIndex(t.epochToDatetime(raw_time))
    else:
        # NOTE(review): only this branch localizes to UTC — presumably
        # epochToDatetime already yields UTC-aware stamps; confirm.
        index = pd.DatetimeIndex(t.datenumToDatetime(raw_time)).tz_localize('UTC')
    # 32 volume-concentration size classes, keyed by column number.
    vol = DataFrame({c: netCDF_data.variables['vol'][:, c] for c in range(32)},
                    index=index)
    # 14 diameter columns, keyed the same way.
    diams = DataFrame({c: netCDF_data.variables['diams'][:, c] for c in range(14)},
                      index=index)
    return (vol, diams)
def formatDataForDataFrame(netCDF_data):
    """Format data from a netCDF file for loading into a pandas DataFrame.

    Parameters
    ----------
    netCDF_data - netCDF4.Dataset

    Returns
    -------
    df - DataFrame object with instrument data, indexed by converted time
    """
    # Load every variable except the ship position/time bookkeeping ones;
    # 'time' is handled separately below as the index.
    data = {name: var[:]
            for name, var in netCDF_data.variables.items()
            if name not in ship_keys}
    raw_time = netCDF_data.variables['time'][:]
    # Large floats (> 1e6) are epoch seconds; otherwise MATLAB datenums.
    if isinstance(raw_time[0], np.float64) and raw_time[0] > 1000000:
        index = pd.DatetimeIndex(t.epochToDatetime(raw_time))
    else:
        index = pd.DatetimeIndex(t.datenumToDatetime(raw_time)).tz_localize('UTC')
    return DataFrame(data, index=index)
#TODO: Format like LISST and junk this approach
def formatADCPForDataFrame(adcpNetCDFData):
    """ Formats ADCP data from wh300.nc for loading into pandas containers.

    It seems you can't place DataFrames with different dimensions into a
    single Panel, so this returns a dictionary with a:
    - DataFrame of 1d data
    - Panel of 2d data

    Parameters
    ----------
    adcpNetCDFData - netCDF object with ADCP data

    Returns
    -------
    df - Dict of 1d data in DataFrame ('1d') and 2d data in Panel ('2d')
    """
    # Bug fix: on Python 3, .keys() returns a view that supports neither
    # .count() nor .remove(); materialize a list first.
    keys = list(adcpNetCDFData.variables.keys())
    # 'trajectory' is empty so I'm throwing it out
    if 'trajectory' in keys:
        keys.remove('trajectory')
    dim1 = {}
    dim2 = {}
    for k in keys:
        if len(adcpNetCDFData.variables[k].shape) == 2:
            dim2[k] = adcpNetCDFData.variables[k][:]
        else:
            dim1[k] = adcpNetCDFData.variables[k][:]
    # NOTE(review): pandas.Panel was removed in pandas 1.0; this function
    # needs an xarray / MultiIndex replacement to run on modern pandas.
    df = {}
    df['1d'] = DataFrame(dim1)
    df['2d'] = Panel(dim2)
    return df
def loadNetCDF(file, mode='r'):
    """Loads netCDF file of data and returns DataFrame of all vars.

    Parameters
    ----------
    file - Path to netCDF file of data (note: shadows the builtin 'file';
           kept for backward compatibility with keyword callers)
    mode - Mode passed to netCDF4.Dataset; only reading makes sense here

    Returns
    -------
    df - DataFrame of instrument data, or a (vol, diams) tuple of
         DataFrames for processed LISST files (4-variable LISST datasets)
    """
    data = netCDF4.Dataset(file, mode)
    try:
        if 'LISST' in file:
            print('Loading LISST data')
            # Processed LISST files carry exactly 4 variables and get the
            # special (vol, diams) treatment.
            if len(data.variables.keys()) == 4:
                dfData = formatLisstDataForDataFrame(data)
            else:
                dfData = formatDataForDataFrame(data)
        else:
            dfData = formatDataForDataFrame(data)
    finally:
        # Bug fix: close the dataset even when formatting raises, so the
        # file handle is not leaked.
        data.close()
    return dfData
def processData(path, dirBase='oc1204B.0.F.', optInsts=None):
    """ Processes stepA of all data (except LISST) for all days.

    Parameters
    ----------
    path     - String of directory holding the raw day files; also used as
               the root for the per-instrument output directories
    dirBase  - String prefix for the per-instrument output directories
    optInsts - Optional list of instruments to process; defaults to the
               module-level procInst (from the star-imported constants)
    """
    days = np.arange(1, 8)  # cruise days 1-7
    # Bug fix: this previously read the undefined name 'opt_insts' and
    # raised NameError whenever an instrument subset was passed in.
    if optInsts is not None:
        insts = optInsts
    else:
        insts = procInst
    for inst in insts:
        outDir = path + dirBase + inst_name[inst]
        if not os.path.exists(outDir):
            print('Making dir ' + outDir)
            os.mkdir(outDir)
    for d in days:
        print('Processing day %s' % d)
        procStepA(d, inPath=path)
def procStepA(day, inPath='./', outPath='./', type='Normal', ship='oc1204B'):
    """ Begins processing of Spring ETM cruise data.

    Step A:
    1. Subsample to 2s intervals to make size a bit more managable
    2. Add lat, long, and depth
    3. Saves changes to oc1204B.0.F.INSTRUMENT/dayDAY.nc

    Parameters
    ----------
    day     - Int of day of cruise to process
    inPath  - String of path to input files (where cmwp_day?_*?.nc exist)
    outPath - String of path to place output files
    type    - 'Normal' or 'LISST'; anything else is a no-op.  (Shadows the
              builtin; name kept for backward compatibility with callers.)
    ship    - String cruise/ship identifier used in output directory names
    """
    if type not in ('Normal', 'LISST'):
        return
    # Both branches need the 2s-resampled CTD record as the common time
    # base, so load it once here instead of in each branch.
    f = os.path.join(inPath, 'cmwp_day%d_CTD.nc' % day)
    ctd = loadNetCDF(f).resample('2s')
    if type == 'Normal':
        print('Processing normal data')
        #oowTimes = calcOutOfWater(ctd)
        for inst in procInst:
            if inst == 'LISST':
                continue
            _open_and_save_file(day, inst, ctd, inPath, outPath, ship)
    else:  # 'LISST'
        print('Processing LISST data')
        _open_and_save_file(day, 'LISST_PROC', ctd, inPath, outPath, ship)
# Used to clear out memory after closing scope.
def _open_and_save_file(day, inst, ctd, inPath, outPath, ship):
    """Open one instrument's day file, align it to the CTD time base, add
    depth, and save it back out as a netCDF file.

    Parameters
    ----------
    day     - Int day of cruise
    inst    - Instrument key ('LISST_PROC' gets the tuple treatment)
    ctd     - 2s-resampled CTD DataFrame supplying the target time index
    inPath  - Directory of input cmwp_day files
    outPath - Root directory for output
    ship    - Cruise/ship identifier prefix for the output directory
    """
    # Open old file
    f = 'cmwp_day%d_%s.nc' % (day, inst)
    f = os.path.join(inPath, f)
    print('Opening %s' % f)
    data = loadNetCDF(f, mode='r')
    if inst == 'LISST_PROC':
        # Processed LISST data is a (volumes, diameters) tuple; only the
        # volume frame gets the depth column added.
        d1 = data[0].resample('2s').reindex(ctd.index)
        d2 = data[1].resample('2s').reindex(ctd.index)
        d1 = addDepth(d1, ctd)
        data = (d1, d2)
        ship = ship + '.0.F.' + 'LISST'
        type = 'LISST'
    else:
        if inst in sub_sample:
            # High-rate instruments are decimated 100x before resampling.
            data = data[::100].resample('2s').reindex(ctd.index)
        else:
            data = data.resample('2s').reindex(ctd.index)
        data = addDepth(data, ctd)
        ship = ship + '.0.F.' + inst_name[inst]
        type = 'Normal'
    # Save new netCDF file as <outPath>/<ship dir>/day<day>.nc
    fDir = os.path.join(outPath, ship)
    dayFile = 'day%d.nc' % day
    f = os.path.join(fDir, dayFile)
    saveNetCDF(f, data, cruise='fall_1', type=type)
def saveNetCDF(filePath, df, cruise, type='Normal', cache=True):
    """ Saves data from an instrument (DataFrame) into a netCDF4 file.

    Also adds:
        lat, lon, and global description (offering)

    Parameters
    ----------
    filePath - String of path for file (Assumes inclusion of dir
               structure for description)
    df       - DateFrame object of data to save to file, or a
               (volumes, diameters) tuple of DataFrames for type='LISST'
    cruise   - 'fall_1', 'fall_2', or 'spring' - Used to get units
    type     - String like origin of data
               - 'Normal' not from ADCP or LISST
               - 'ADCP'   (not implemented)
               - 'LISST'  - Raw lisst data
               - 'LISST2' - LISST data that has already been dumped into
                 netcdf files and pandas
    cache    -
        True: Adheres to netCDF like cache path structure
              e.g. oceanus1204c.0.F.LISST/day8.nc

    Raises
    ------
    ValueError          - if type is not one of the supported origins
    NotImplementedError - for type='ADCP'
    """
    if type not in ('Normal', 'ADCP', 'LISST', 'LISST2'):
        # Bug fix: this used a bare 'raise' with no active exception,
        # which itself errors out; raise something callers can catch.
        raise ValueError('Data type not supported ' + type)
    # Bug fix: function-level 'from x import *' is a SyntaxError on
    # Python 3; import the cruise constants module and bind the names
    # explicitly instead.
    if cruise == 'fall_1':
        import etm_data.fall_1_consts as consts
    elif cruise == 'fall_2':
        import etm_data.fall_2_consts as consts
    elif cruise == 'spring':
        # NOTE(review): original imported 'etm_data.spring_conts', which
        # looks like a typo for 'spring_consts' (cf. the __main__ block);
        # confirm the actual module name.
        import etm_data.spring_consts as consts
    units = consts.units
    lat = consts.lat
    lon = consts.lon
    # NOTE(review): the lisst_* name tables are assumed to come from the
    # module-level 'from etm_data.lisst import *'; rev_names from consts.
    print('Creating netCDF file ' + filePath)
    # NOTE(review): zlib is a createVariable option, not a Dataset one —
    # verify netCDF4 accepts (or ignores) it here.
    f = netCDF4.Dataset(filePath, 'w', format='NETCDF4', zlib=True)
    if cache:
        # Strip the trailing '/dayN.nc' so the description names the
        # cache directory (offering).
        f.description = filePath[:-8]
    else:
        f.description = filePath
    if type == 'Normal' or type == 'LISST2':
        f.createDimension('time', df.shape[0])
        times = t.datetime64ToEpoch(df.index.values)
    else:
        # Tuple-shaped LISST data: use the volume frame's time axis.
        f.createDimension('time', df[0].shape[0])
        times = t.datetime64ToEpoch(df[0].index.values)
    f.createDimension('x', 1)
    f.createDimension('y', 1)
    tmp = f.createVariable('time', 'float64', ('time',))
    tmp[:] = times
    tmp.units = units['time']
    tmp = f.createVariable('x', 'float', ('x',))
    tmp[:] = lon
    tmp.units = 'latlon'
    tmp = f.createVariable('y', 'float', ('y',))
    tmp[:] = lat
    tmp.units = 'latlon'
    # Treat Data Origin
    if type == 'Normal':
        rev_names = consts.rev_names
        for v in df.keys():
            print('Variable ' + rev_names[v] + ' : ' + units[rev_names[v]])
            tmp = f.createVariable(v, df[v].dtype, ('time',))
            tmp[:] = df[v].values
            tmp.units = units[rev_names[v]]
            f.sync()
    elif type == 'LISST':
        # Treat volume and diameter df separately
        for i, v in enumerate(df[0].keys()):
            print('Variable ' + lisst_vol_names[i] + ' : ' + 'um')
            tmp = f.createVariable(lisst_vol_names[i], 'float64', ('time',))
            if v == 'elev':
                # Depth column added by addDepth rides along with volumes.
                tmp[:] = df[0]['elev'].values
                tmp.units = 'm'
            else:
                tmp[:] = df[0][i].values
                tmp.units = 'um'
            f.sync()
        for i, v in enumerate(df[1].keys()):
            print('Variable ' + lisst_diams_names[i] + ' : ' + lisst_diams_units[i])
            tmp = f.createVariable(lisst_diams_names[i], 'float64', ('time',))
            tmp[:] = df[1][i].values
            tmp.units = lisst_diams_units[i]
            f.sync()
    elif type == 'LISST2':
        for v in df.keys():
            print('Variable ' + v)
            tmp = f.createVariable(v, 'float64', ('time',))
            tmp[:] = df[v].values
            tmp.units = lisst_units[v]
            f.sync()
    else:  # 'ADCP' (guaranteed by the guard at the top)
        # Bug fix: was a bare 'raise' that also leaked the open file.
        f.close()
        raise NotImplementedError('ADCP saving not implemented')
    f.close()
- #-------------------------------------------------------------------------------
- # Main
- #-------------------------------------------------------------------------------
if __name__ == "__main__":
    # Bug fix: OptionParser was never imported anywhere in the file.
    from optparse import OptionParser

    cruises = ['spring', 'fall1', 'fall2']
    usage = "Usage: %prog raw_matlab_dir etm_cruise ['spring', 'fall1', 'fall2']"
    parser = OptionParser(usage=usage)
    # Bug fixes: add_option used 'default_value' instead of 'default', an
    # unquoted dest (NameError), and a 'type' keyword that is invalid in
    # combination with action="store_true".
    parser.add_option("-l", action="store_true", help="Process LISST data",
                      default=False, dest="lisst")
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("Incorrect number of args")
    # Bug fix: 'arg' was an undefined name; the parsed positionals are
    # in 'args'.
    if args[1] not in cruises:
        parser.error("That's not a valid ETM cruise.")
    elif args[1] == 'spring':
        import etm.spring_consts as consts
    elif args[1] == 'fall1':
        import etm.fall_1_consts as consts
    else:
        import etm.fall_2_consts as consts
    # Bug fix: the flag lives on the parsed options object, not on a
    # bare name 'lisst'.
    if options.lisst:
        lisstMatToNc(args[0])
    else:
        cmwpMatToNc(args[0])