try_interchange.py | searchcode

/statsmodels/sandbox/examples/thirdparty/try_interchange.py

http://github.com/statsmodels/statsmodels
Python | 74 lines | 54 code | 4 blank | 16 comment | 5 complexity | fe3e1195efe0fc1d3f3e9c03ad7fcfac MD5 | raw file
Possible License(s): BSD-3-Clause

# -*- coding: utf-8 -*-
"""groupmean, groupby in pandas, la and tabular from a scikits.timeseries

after a question on the scipy-user mailing list I tried to do
groupmeans, which in this case are duplicate dates, in the 3 packages.

I'm using the versions that I had installed, which are all based on
repository checkout, but are not fully up-to-date

some brief comments

* la.larry and pandas.DataFrame require unique labels/index so
  groups have to represented in a separate data structure
* pandas is missing GroupBy in the docs, but the docstring is helpful
* both la and pandas handle datetime objects as object arrays
* tabular requires conversion to structured dtype, but easy helper
  functions or methods are available in scikits.timeseries and tabular

* not too bad for a first try

Created on Sat Jan 30 08:33:11 2010
Author: josef-pktd
"""
from statsmodels.compat.python import lrange
import numpy as np
import scikits.timeseries as ts
import la
import pandas
import tabular as tb
from finance import msft, ibm  # hack to make it run as standalone

s = ts.time_series([1,2,3,4,5],
            dates=ts.date_array(["2001-01","2001-01",
            "2001-02","2001-03","2001-03"],freq="M"))

print('\nUsing la')
dta = la.larry(s.data, label=[lrange(len(s.data))])
dat = la.larry(s.dates.tolist(), label=[lrange(len(s.data))])
s2 = ts.time_series(dta.group_mean(dat).x,dates=ts.date_array(dat.x,freq="M"))
s2u = ts.remove_duplicated_dates(s2)
print(repr(s))
print(dat)
print(repr(s2))
print(repr(s2u))

print('\nUsing pandas')
pdta = pandas.DataFrame(s.data, np.arange(len(s.data)), [1])
pa = pdta.groupby(dict(zip(np.arange(len(s.data)),
            s.dates.tolist()))).aggregate(np.mean)
s3 = ts.time_series(pa.values.ravel(),
            dates=ts.date_array(pa.index.tolist(),freq="M"))

print(pa)
print(repr(s3))

print('\nUsing tabular')
X = tb.tabarray(array=s.torecords(), dtype=s.torecords().dtype)
tabx = X.aggregate(On=['_dates'], AggFuncDict={'_data':np.mean,'_mask':np.all})
s4 = ts.time_series(tabx['_data'],dates=ts.date_array(tabx['_dates'],freq="M"))
print(tabx)
print(repr(s4))

#after running pandas/examples/finance.py
larmsft = la.larry(msft.values, [msft.index.tolist(), msft.columns.tolist()])
laribm = la.larry(ibm.values, [ibm.index.tolist(), ibm.columns.tolist()])
lar1 = la.larry(np.dstack((msft.values,ibm.values)), [ibm.index.tolist(), ibm.columns.tolist(), ['msft', 'ibm']])
print(lar1.mean(0))


y = la.larry([[1.0, 2.0], [3.0, 4.0]], [['a', 'b'], ['c', 'd']])
ysr = np.empty(y.x.shape[0],dtype=([('index','S1')]+[(i,np.float) for i in y.label[1]]))
ysr['index'] = y.label[0]
for i in ysr.dtype.names[1:]:
    ysr[i] = y[y.labelindex(i, axis=1)].x