PageRenderTime 45ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/python/algo_local/clean_sentiment_data.py

https://bitbucket.org/vroomzel/semester2
Python | 71 lines | 40 code | 20 blank | 11 comment | 2 complexity | 1625407fb9d131878641888e9e0c5630 MD5 | raw file
  1. import datetime as dt
  2. import pandas
  3. import numpy as np
  4. import os
  5. import matplotlib.pyplot as plt
  6. import sys
  7. #set time zone, otherwise dates are screwed
  8. os.putenv("TZ","America/New_York")
  9. #load data
  10. path="/backtest_data/1second/teams_data/team3/data/"
  11. sent=pandas.load(path+"recorded_future.bin")
  12. cp=pandas.load(path+"adj_close_px_all_from_09.bin") #need this to merge with the date index used in price files
  13. #tickers with available price data
  14. tickers=cp.columns
  15. #filter out tickers which are missing sentiment data (know from prior experience)
  16. market='SPY'
  17. excluded_tickers=[market,'EEM','EWJ','FAS','FAZ','GLD','IWM','NOK','QQQ','RIMM','SLV','TLT','UNG','USO','XLE','XLF']
  18. for symbol in excluded_tickers:
  19. tickers=tickers[tickers!=symbol]
  20. #flatten out the sentiment data
  21. w30=pandas.DataFrame(sent['w30'].ix[tickers[0]])
  22. w24h=pandas.DataFrame(sent['w24h'].ix[tickers[0]])
  23. count=pandas.DataFrame(sent['Count'].ix[tickers[0]])
  24. ticker_count=1
  25. for symbol in tickers[1:]:
  26. w30=pandas.merge(w30, pandas.DataFrame(sent['w30'].ix[symbol]), left_index=True, right_index=True, how='outer')
  27. w24h=pandas.merge(w24h, pandas.DataFrame(sent['w24h'].ix[symbol]), left_index=True, right_index=True, how='outer')
  28. count=pandas.merge(count, pandas.DataFrame(sent['Count'].ix[symbol]), left_index=True, right_index=True, how='outer')
  29. col_ind=np.arange(0,ticker_count+1)
  30. w30.columns=col_ind
  31. w24h.columns=col_ind
  32. count.columns=col_ind
  33. ticker_count+=1
  34. #rename the columns by tickers
  35. w30.columns=tickers
  36. w24h.columns=tickers
  37. count.columns=tickers
  38. w30.save(path+"weighted_30min_sentiment_arrival_time.bin")
  39. w24h.save(path+"weighted_24h_sentiment_arrival_time.bin")
  40. count.save(path+"news_count_arrival_time.bin")
  41. ##################################################################################
  42. #put the sentiment data index on the same index as price data and fill forward
  43. w30=pandas.merge(pandas.DataFrame(cp.SPY), w30, how='outer',left_index=True, right_index=True)
  44. w30.pop('SPY')
  45. #save to new file
  46. w30.save(path+"weighted_30_min_sentiment_no_pad.bin")
  47. #fill forward missing data
  48. w30=w30.fillna(method='pad')
  49. w30=w30.fillna(0) #fill remaining NANs in the beginning of series with zero
  50. #save to new file
  51. w30.save(path+"weighted_30_min_sentiment.bin")