# /python/algo_local/clean_sentiment_data.py
# Python | 71 lines | 40 code | 20 blank | 11 comment | 2 complexity | 1625407fb9d131878641888e9e0c5630 MD5 | raw file
import datetime as dt
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas
# Set the process time zone, otherwise dates are interpreted in the wrong zone.
# os.putenv() does not update os.environ, and the C time routines do not
# re-read TZ once they have been initialised, so assign through os.environ
# and ask the C library to reload its conversion rules.
os.environ["TZ"] = "America/New_York"
if hasattr(time, "tzset"):  # time.tzset() only exists on Unix
    time.tzset()
# Load the pickled input frames.
# NOTE(review): pandas.load is a legacy API that newer pandas releases no
# longer provide (read_pickle is the replacement); kept as-is to match the
# pandas version this script targets -- confirm before upgrading.
path = "/backtest_data/1second/teams_data/team3/data/"
# cp is needed to merge with the date index used in the price files.
sent, cp = (
    pandas.load(path + filename)
    for filename in ("recorded_future.bin", "adj_close_px_all_from_09.bin")
)
# Tickers known (from prior experience) to be missing sentiment data; the
# market symbol itself is left out of the sentiment universe as well.
market = 'SPY'
excluded_tickers = [market,'EEM','EWJ','FAS','FAZ','GLD','IWM','NOK','QQQ','RIMM','SLV','TLT','UNG','USO','XLE','XLF']
# Start from every ticker with available price data and keep, in the original
# column order, only those that are not on the exclusion list.
keep_mask = np.array(
    [name not in excluded_tickers for name in cp.columns], dtype=bool
)
tickers = cp.columns[keep_mask]

# Flatten out the sentiment data: one wide frame per field, one column per ticker.
# NOTE(review): `.ix` and `.save` are legacy pandas APIs that newer releases no
# longer provide (`.loc` / `.to_pickle` are the replacements); kept as-is to
# match the pandas version this script targets -- confirm before upgrading.
def _flatten_by_ticker(panel, field, symbols):
    """Return ``panel[field]`` as a frame with one column per entry of *symbols*.

    The slice for each symbol is outer-merged on its index onto the slices
    merged so far, then the columns are labelled with *symbols* in order.
    Raises IndexError if *symbols* is empty.
    """
    flat = pandas.DataFrame(panel[field].ix[symbols[0]])
    for width, symbol in enumerate(symbols[1:], 2):
        flat = pandas.merge(flat, pandas.DataFrame(panel[field].ix[symbol]),
                            left_index=True, right_index=True, how='outer')
        # Renumber the columns after every merge so the next merge does not
        # run into repeated column names (presumably every slice arrives
        # under the same name -- verify against the input file).
        flat.columns = np.arange(width)
    # rename the columns by tickers
    flat.columns = symbols
    return flat


w30 = _flatten_by_ticker(sent, 'w30', tickers)
w24h = _flatten_by_ticker(sent, 'w24h', tickers)
count = _flatten_by_ticker(sent, 'Count', tickers)

# Save the flattened frames, still on their own arrival-time index.
w30.save(path+"weighted_30min_sentiment_arrival_time.bin")
w24h.save(path+"weighted_24h_sentiment_arrival_time.bin")
count.save(path+"news_count_arrival_time.bin")
##################################################################################
# Put the sentiment data index on the same index as price data and fill forward.
# The market price column is merged in only for its index and dropped again
# right after; `market` is reused so the symbol is spelled out in one place.
# NOTE(review): how='outer' keeps the union of both indexes, not just the
# price index -- confirm that is the intended result.
# NOTE(review): `.save` and fillna(method='pad') are legacy pandas spellings
# (`.to_pickle` / `.ffill()` in newer releases); kept to match the pandas
# version this script targets.
w30 = pandas.merge(pandas.DataFrame(cp[market]), w30, how='outer',
                   left_index=True, right_index=True)
w30.pop(market)
# save the aligned but not yet filled frame to a new file
w30.save(path+"weighted_30_min_sentiment_no_pad.bin")
# fill forward missing data, then fill the remaining NaNs at the beginning of
# each series with zero
w30 = w30.fillna(method='pad').fillna(0)
# save to new file
w30.save(path+"weighted_30_min_sentiment.bin")