PageRenderTime 43ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/pythalesians/market/loaders/timeseriesio.py

https://gitlab.com/oytunistrator/pythalesians
Python | 388 lines | 139 code | 66 blank | 183 comment | 30 complexity | 106ed881a9fb02ad014072dcedbb8987 MD5 | raw file
  1. __author__ = 'saeedamen' # Saeed Amen / saeed@thalesians.com
  2. #
  3. # Copyright 2015 Thalesians Ltd. - http//www.thalesians.com / @thalesians
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
  6. # License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
  7. #
  8. # Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
  9. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. #
  11. # See the License for the specific language governing permissions and limitations under the License.
  12. #
  13. """
  14. TimeSeriesIO
  15. Write and reads time series data to disk in various formats, CSV and HDF5 format. (planning to add other interfaces too).
  16. """
  17. import pandas
  18. import codecs
  19. import datetime
  20. from dateutil.parser import parse
  21. import shutil
  22. from openpyxl import load_workbook
  23. import os.path
  24. from pythalesians.util.constants import Constants
  25. from pythalesians.util.loggermanager import LoggerManager
  26. class TimeSeriesIO:
  27. def __init__(self):
  28. self.logger = LoggerManager().getLogger(__name__)
  29. ### functions to handle Excel on disk
  30. def write_time_series_to_excel(self, fname, sheet, data_frame, create_new=False):
  31. """
  32. write_time_series_to_excel - writes Pandas data frame to disk in Excel format
  33. Parameters
  34. ----------
  35. fname : str
  36. Excel filename to be written to
  37. sheet : str
  38. sheet in excel
  39. data_frame : DataFrame
  40. data frame to be written
  41. create_new : boolean
  42. to create a new Excel file
  43. """
  44. if(create_new):
  45. writer = pandas.ExcelWriter(fname, engine='xlsxwriter')
  46. else:
  47. if os.path.isfile(fname):
  48. book = load_workbook(fname)
  49. writer = pandas.ExcelWriter(fname, engine='xlsxwriter')
  50. writer.book = book
  51. writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
  52. else:
  53. writer = pandas.ExcelWriter(fname, engine='xlsxwriter')
  54. data_frame.to_excel(writer, sheet_name=sheet, engine='xlsxwriter')
  55. writer.save()
  56. writer.close()
  57. def write_time_series_to_excel_writer(self, writer, sheet, data_frame):
  58. """
  59. write_time_series_to_excel_writer - writes Pandas data frame to disk in Excel format for a writer
  60. Parameters
  61. ----------
  62. writer : ExcelWriter
  63. File handle to use for writing Excel file to disk
  64. sheet : str
  65. sheet in excel
  66. data_frame : DataFrame
  67. data frame to be written
  68. """
  69. data_frame.to_excel(writer, sheet, engine='xlsxwriter')
  70. def read_excel_data_frame(self, f_name, excel_sheet, freq, cutoff = None, dateparse = None,
  71. postfix = '.close', intraday_tz = 'UTC'):
  72. """
  73. read_excel_data_frame - Reads Excel from disk into DataFrame
  74. Parameters
  75. ----------
  76. f_name : str
  77. Excel file path to read
  78. freq : str
  79. Frequency of data to read (intraday/daily etc)
  80. cutoff : DateTime (optional)
  81. end date to read up to
  82. dateparse : str (optional)
  83. date parser to use
  84. postfix : str (optional)
  85. postfix to add to each columns
  86. intraday_tz : str
  87. timezone of file if uses intraday data
  88. Returns
  89. -------
  90. DataFrame
  91. """
  92. return self.read_csv_data_frame(f_name, freq, cutoff = cutoff, dateparse = dateparse,
  93. postfix = postfix, intraday_tz = intraday_tz, excel_sheet = excel_sheet)
  94. ### functions to handle HDF5 on disk
  95. def write_time_series_cache_to_disk(self, fname, data_frame):
  96. """
  97. write_time_series_cache_to_disk - writes Pandas data frame to disk as HDF5 format
  98. Parmeters
  99. ---------
  100. fname : str
  101. path of file
  102. data_frame : DataFrame
  103. data frame to be written to disk
  104. """
  105. h5_filename_temp = self.get_h5_filename(fname + ".temp")
  106. h5_filename = self.get_h5_filename(fname)
  107. # delete the old copy
  108. try:
  109. # os.remove(h5_filename_temp)
  110. p =0
  111. except: pass
  112. store = pandas.HDFStore(h5_filename_temp, complib="blosc", complevel=9)
  113. if ('intraday' in fname):
  114. data_frame = data_frame.astype('float32')
  115. store['data'] = data_frame
  116. store.close()
  117. # delete the old copy
  118. try:
  119. os.remove(h5_filename)
  120. except: pass
  121. # once written to disk rename
  122. os.rename(h5_filename_temp, h5_filename)
  123. def get_h5_filename(self, fname):
  124. """
  125. get_h5_filename - Strips h5 off filename returning first portion of filename
  126. Parameters
  127. ----------
  128. fname : str
  129. h5 filename to strip
  130. Returns
  131. -------
  132. str
  133. """
  134. if fname[-3:] == '.h5':
  135. return fname
  136. return fname + ".h5"
  137. def write_r_compatible_hdf_dataframe(self, data_frame, fname, fields = None):
  138. """
  139. write_r_compatible_hdf_dataframe - Write a DataFrame to disk in as an R compatible HDF5 file
  140. Parameters
  141. ----------
  142. data_frame : DataFrame
  143. data frame to be written
  144. fname : str
  145. file path to be written
  146. fields : list(str)
  147. columns to be written
  148. """
  149. fname_r = self.get_h5_filename(fname)
  150. self.logger.info("About to dump R binary HDF5 - " + fname_r)
  151. data_frame32 = data_frame.astype('float32')
  152. if fields is None:
  153. fields = data_frame32.columns.values
  154. # decompose date/time into individual fields (easier to pick up in R)
  155. data_frame32['Year'] = data_frame.index.year
  156. data_frame32['Month'] = data_frame.index.month
  157. data_frame32['Day'] = data_frame.index.day
  158. data_frame32['Hour'] = data_frame.index.hour
  159. data_frame32['Minute'] = data_frame.index.minute
  160. data_frame32['Second'] = data_frame.index.second
  161. data_frame32['Millisecond'] = data_frame.index.microsecond / 1000
  162. data_frame32 = data_frame32[
  163. ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second', 'Millisecond'] + fields]
  164. cols = data_frame32.columns
  165. store_export = pandas.HDFStore(fname_r)
  166. store_export.put('df_for_r', data_frame32, data_columns=cols)
  167. store_export.close()
  168. def read_time_series_cache_from_disk(self, fname):
  169. """
  170. read_time_series_cache_from_disk - Reads time series cache from disk
  171. Parameters
  172. ----------
  173. fname : str
  174. file to be read from
  175. Returns
  176. -------
  177. DataFrame
  178. """
  179. if os.path.isfile(self.get_h5_filename(fname)):
  180. store = pandas.HDFStore(self.get_h5_filename(fname))
  181. data_frame = store.select("data")
  182. if ('intraday' in fname):
  183. data_frame = data_frame.astype('float32')
  184. store.close()
  185. return data_frame
  186. return None
  187. ### functions for CSV reading and writing
  188. def write_time_series_to_csv(self, csv_path, data_frame):
  189. data_frame.to_csv(csv_path)
  190. def read_csv_data_frame(self, f_name, freq, cutoff = None, dateparse = None,
  191. postfix = '.close', intraday_tz = 'UTC', excel_sheet = None):
  192. """
  193. read_csv_data_frame - Reads CSV/Excel from disk into DataFrame
  194. Parameters
  195. ----------
  196. f_name : str
  197. CSV/Excel file path to read
  198. freq : str
  199. Frequency of data to read (intraday/daily etc)
  200. cutoff : DateTime (optional)
  201. end date to read up to
  202. dateparse : str (optional)
  203. date parser to use
  204. postfix : str (optional)
  205. postfix to add to each columns
  206. intraday_tz : str (optional)
  207. timezone of file if uses intraday data
  208. excel_sheet : str (optional)
  209. Excel sheet to be read
  210. Returns
  211. -------
  212. DataFrame
  213. """
  214. if(freq == 'intraday'):
  215. if dateparse is None:
  216. dateparse = lambda x: datetime.datetime(*map(int, [x[6:10], x[3:5], x[0:2],
  217. x[11:13], x[14:16], x[17:19]]))
  218. elif dateparse is 'dukascopy':
  219. dateparse = lambda x: datetime.datetime(*map(int, [x[0:4], x[5:7], x[8:10],
  220. x[11:13], x[14:16], x[17:19]]))
  221. elif dateparse is 'c':
  222. # use C library for parsing dates, several hundred times quicker
  223. # requires compilation of library to install
  224. import ciso8601
  225. dateparse = lambda x: ciso8601.parse_datetime(x)
  226. if excel_sheet is None:
  227. data_frame = pandas.read_csv(f_name, index_col = 0, parse_dates = True, date_parser = dateparse)
  228. else:
  229. data_frame = pandas.read_excel(f_name, excel_sheet, index_col = 0, na_values=['NA'])
  230. data_frame = data_frame.astype('float32')
  231. data_frame.index.names = ['Date']
  232. old_cols = data_frame.columns
  233. new_cols = []
  234. # add '.close' to each column name
  235. for col in old_cols:
  236. new_cols.append(col + postfix)
  237. data_frame.columns = new_cols
  238. else:
  239. # daily data
  240. if 'events' in f_name:
  241. data_frame = pandas.read_csv(f_name)
  242. # very slow conversion
  243. data_frame = data_frame.convert_objects(convert_dates = 'coerce')
  244. else:
  245. if excel_sheet is None:
  246. data_frame = pandas.read_csv(f_name, index_col=0, parse_dates =["DATE"], date_parser = dateparse)
  247. else:
  248. data_frame = pandas.read_excel(f_name, excel_sheet, index_col = 0, na_values=['NA'])
  249. # convert Date to Python datetime
  250. # datetime data_frame['Date1'] = data_frame.index
  251. # slower method: lambda x: pandas.datetime.strptime(x, '%d/%m/%Y %H:%M:%S')
  252. # data_frame['Date1'].apply(lambda x: datetime.datetime(int(x[6:10]), int(x[3:5]), int(x[0:2]),
  253. # int(x[12:13]), int(x[15:16]), int(x[18:19])))
  254. # data_frame.index = data_frame['Date1']
  255. # data_frame.drop('Date1')
  256. # slower method: data_frame.index = pandas.to_datetime(data_frame.index)
  257. if(freq == 'intraday'):
  258. # assume time series are already in UTC and assign this (can specify other time zones)
  259. data_frame = data_frame.tz_localize(intraday_tz)
  260. # end cutoff date
  261. if cutoff is not None:
  262. if (isinstance(cutoff, str)):
  263. cutoff = parse(cutoff)
  264. data_frame = data_frame.loc[data_frame.index < cutoff]
  265. return data_frame
  266. def convert_csv_data_frame(self, f_name, category, freq, cutoff=None, dateparse=None):
  267. """
  268. convert_csv_data_frame - Converts CSV file to HDF5 file
  269. Parameters
  270. ----------
  271. f_name : str
  272. File name to be read
  273. category : str
  274. data category of file (used in HDF5 filename)
  275. freq : str
  276. intraday/daily frequency (used in HDF5 filename)
  277. cutoff : DateTime (optional)
  278. filter dates up to here
  279. dateparse : str
  280. date parser to use
  281. """
  282. self.logger.info("About to read... " + f_name)
  283. data_frame = self.read_csv_data_frame(f_name, freq, cutoff=cutoff, dateparse=dateparse)
  284. category_f_name = self.create_cache_file_name(category)
  285. self.write_time_series_cache_to_disk(
  286. category_f_name, data_frame)
  287. def clean_csv_file(self, f_name):
  288. """
  289. clean_csv_file - Cleans up CSV file (removing empty characters) before writing back to disk
  290. Parameters
  291. ----------
  292. f_name : str
  293. CSV file to be cleaned
  294. """
  295. with codecs.open (f_name, 'rb', 'utf-8') as myfile:
  296. data = myfile.read()
  297. # clean file first if dirty
  298. if data.count( '\x00' ):
  299. self.logger.info('Cleaning CSV...')
  300. with codecs.open(f_name + '.tmp', 'w', 'utf-8') as of:
  301. of.write(data.replace('\x00', ''))
  302. shutil.move(f_name + '.tmp', f_name)
  303. def create_cache_file_name(self, filename):
  304. return Constants().folder_time_series_data + "/" + filename