PageRenderTime 15ms CodeModel.GetById 2ms app.highlight 9ms RepoModel.GetById 1ms app.codeStats 0ms

/statsmodels/sandbox/examples/thirdparty/try_interchange.py

http://github.com/statsmodels/statsmodels
Python | 74 lines | 54 code | 4 blank | 16 comment | 10 complexity | fe3e1195efe0fc1d3f3e9c03ad7fcfac MD5 | raw file
 1# -*- coding: utf-8 -*-
 2"""groupmean, groupby in pandas, la and tabular from a scikits.timeseries
 3
 4after a question on the scipy-user mailing list I tried to do
 5groupmeans, which in this case are duplicate dates, in the 3 packages.
 6
 7I'm using the versions that I had installed, which are all based on
 8repository checkout, but are not fully up-to-date
 9
10some brief comments
11
12* la.larry and pandas.DataFrame require unique labels/index so
13  groups have to be represented in a separate data structure
14* pandas is missing GroupBy in the docs, but the docstring is helpful
15* both la and pandas handle datetime objects as object arrays
16* tabular requires conversion to structured dtype, but easy helper
17  functions or methods are available in scikits.timeseries and tabular
18
19* not too bad for a first try
20
21Created on Sat Jan 30 08:33:11 2010
22Author: josef-pktd
23"""
24from statsmodels.compat.python import lrange
25import numpy as np
26import scikits.timeseries as ts
27import la
28import pandas
29import tabular as tb
30from finance import msft, ibm  # hack to make it run as standalone
31
# Monthly sample series whose dates contain duplicates ("2001-01" and
# "2001-03" each appear twice); these duplicates are the groups averaged below.
s = ts.time_series(
    [1, 2, 3, 4, 5],
    dates=ts.date_array(
        ["2001-01", "2001-01", "2001-02", "2001-03", "2001-03"],
        freq="M",
    ),
)
35
print('\nUsing la')
# Label every observation by its position, and keep the (duplicated) dates in
# a second larry with the same labels so it can serve as the grouping key.
dta = la.larry(
    s.data,
    label=[lrange(len(s.data))],
)
dat = la.larry(
    s.dates.tolist(),
    label=[lrange(len(s.data))],
)
# Group means of the data by date, wrapped back into a monthly time series.
s2 = ts.time_series(
    dta.group_mean(dat).x,
    dates=ts.date_array(dat.x, freq="M"),
)
s2u = ts.remove_duplicated_dates(s2)
# Show the input series, the grouping dates, and the result before and after
# removing duplicated dates.
print(repr(s))
print(dat)
print(repr(s2))
print(repr(s2u))
45
print('\nUsing pandas')
# Single-column frame (column label 1) indexed by observation position.
pdta = pandas.DataFrame(
    s.data,
    index=np.arange(len(s.data)),
    columns=[1],
)
# groupby receives a mapping from each positional index label to its date;
# the mean is then taken within every date group.
pa = pdta.groupby(
    dict(zip(np.arange(len(s.data)), s.dates.tolist()))
).aggregate(np.mean)
# Rebuild a monthly time series from the aggregated values and group keys.
s3 = ts.time_series(
    pa.values.ravel(),
    dates=ts.date_array(pa.index.tolist(), freq="M"),
)

print(pa)
print(repr(s3))
55
print('\nUsing tabular')
# tabular works on structured arrays, so go through the record form of the
# series (fields '_dates', '_data' and '_mask' are referenced below).
X = tb.tabarray(
    array=s.torecords(),
    dtype=s.torecords().dtype,
)
# Aggregate on the date field: average the data, combine the mask with all().
tabx = X.aggregate(
    On=['_dates'],
    AggFuncDict={'_data': np.mean, '_mask': np.all},
)
s4 = ts.time_series(
    tabx['_data'],
    dates=ts.date_array(tabx['_dates'], freq="M"),
)
print(tabx)
print(repr(s4))
62
# Requires msft and ibm from pandas/examples/finance.py (imported above).
# Each frame becomes a 2d larry labelled by its own index and columns.
larmsft = la.larry(
    msft.values,
    [msft.index.tolist(), msft.columns.tolist()],
)
laribm = la.larry(
    ibm.values,
    [ibm.index.tolist(), ibm.columns.tolist()],
)
# Stack both value arrays along a third axis labelled by ticker; the ibm
# labels are reused for the first two axes.
# NOTE(review): presumably msft and ibm share index and columns -- confirm.
lar1 = la.larry(
    np.dstack((msft.values, ibm.values)),
    [ibm.index.tolist(), ibm.columns.tolist(), ['msft', 'ibm']],
)
print(lar1.mean(0))
68
69
# Convert a 2d larry into a NumPy structured array: an 'index' field holding
# the axis-0 labels plus one float field per axis-1 label.
y = la.larry([[1.0, 2.0], [3.0, 4.0]], [['a', 'b'], ['c', 'd']])
# Use the builtin float: the np.float alias was deprecated in NumPy 1.20 and
# removed in 1.24, and it was only ever an alias for the builtin.
ysr = np.empty(
    y.x.shape[0],
    dtype=[('index', 'S1')] + [(i, float) for i in y.label[1]],
)
ysr['index'] = y.label[0]
for i in ysr.dtype.names[1:]:
    # NOTE(review): labelindex(i, axis=1) is a position along axis 1, yet it is
    # used here to index the first axis of y. Confirm whether the column
    # y[:, idx] was intended; the square 2x2 example hides a shape mismatch.
    ysr[i] = y[y.labelindex(i, axis=1)].x