/pythalesians/market/loaders/timeseriesio.py
Python | 388 lines | 139 code | 66 blank | 183 comment | 30 complexity | 106ed881a9fb02ad014072dcedbb8987 MD5 | raw file
- __author__ = 'saeedamen' # Saeed Amen / saeed@thalesians.com
- #
- # Copyright 2015 Thalesians Ltd. - http://www.thalesians.com / @thalesians
- #
- # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
- # License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- #
- # See the License for the specific language governing permissions and limitations under the License.
- #
- """
- TimeSeriesIO
- Writes and reads time series data to disk in various formats, CSV and HDF5 format. (planning to add other interfaces too).
- """
- import pandas
- import codecs
- import datetime
- from dateutil.parser import parse
- import shutil
- from openpyxl import load_workbook
- import os.path
- from pythalesians.util.constants import Constants
- from pythalesians.util.loggermanager import LoggerManager
class TimeSeriesIO:
    """Reads and writes time series data (CSV, Excel, HDF5) to and from disk."""

    def __init__(self):
        self.logger = LoggerManager().getLogger(__name__)

    ### functions to handle Excel on disk

    def write_time_series_to_excel(self, fname, sheet, data_frame, create_new=False):
        """
        write_time_series_to_excel - writes Pandas data frame to disk in Excel format

        Parameters
        ----------
        fname : str
            Excel filename to be written to
        sheet : str
            sheet in excel
        data_frame : DataFrame
            data frame to be written
        create_new : boolean
            to create a new Excel file
        """
        if create_new:
            writer = pandas.ExcelWriter(fname, engine='xlsxwriter')
        elif os.path.isfile(fname):
            # appending a sheet to an existing workbook requires the openpyxl
            # engine: xlsxwriter cannot load pre-existing files, so the original
            # engine='xlsxwriter' here silently discarded the loaded workbook
            book = load_workbook(fname)
            writer = pandas.ExcelWriter(fname, engine='openpyxl')
            writer.book = book
            writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
        else:
            writer = pandas.ExcelWriter(fname, engine='xlsxwriter')

        # note: the 'engine' kwarg is ignored by to_excel when a writer object
        # is supplied, so it is not passed here
        data_frame.to_excel(writer, sheet_name=sheet)

        # close() saves the workbook; calling save() and then close() (as
        # before) triggered a redundant double-save on some engines
        writer.close()

    def write_time_series_to_excel_writer(self, writer, sheet, data_frame):
        """
        write_time_series_to_excel_writer - writes Pandas data frame to disk in Excel format for a writer

        Parameters
        ----------
        writer : ExcelWriter
            File handle to use for writing Excel file to disk
        sheet : str
            sheet in excel
        data_frame : DataFrame
            data frame to be written
        """
        data_frame.to_excel(writer, sheet)

    def read_excel_data_frame(self, f_name, excel_sheet, freq, cutoff=None, dateparse=None,
                              postfix='.close', intraday_tz='UTC'):
        """
        read_excel_data_frame - Reads Excel from disk into DataFrame

        Thin wrapper which delegates to read_csv_data_frame with excel_sheet set.

        Parameters
        ----------
        f_name : str
            Excel file path to read
        excel_sheet : str
            Excel sheet to be read
        freq : str
            Frequency of data to read (intraday/daily etc)
        cutoff : DateTime (optional)
            end date to read up to
        dateparse : str (optional)
            date parser to use
        postfix : str (optional)
            postfix to add to each columns
        intraday_tz : str
            timezone of file if uses intraday data

        Returns
        -------
        DataFrame
        """
        return self.read_csv_data_frame(f_name, freq, cutoff=cutoff, dateparse=dateparse,
                                        postfix=postfix, intraday_tz=intraday_tz,
                                        excel_sheet=excel_sheet)

    ### functions to handle HDF5 on disk

    def write_time_series_cache_to_disk(self, fname, data_frame):
        """
        write_time_series_cache_to_disk - writes Pandas data frame to disk as HDF5 format

        Writes to a temporary file first and renames it into place, so a
        concurrent reader never sees a partially written cache file.

        Parameters
        ----------
        fname : str
            path of file
        data_frame : DataFrame
            data frame to be written to disk
        """
        h5_filename_temp = self.get_h5_filename(fname + ".temp")
        h5_filename = self.get_h5_filename(fname)

        # remove any stale temp file left over from an aborted previous write
        # (the original code had this cleanup commented out behind a dead
        # 'p = 0' placeholder)
        try:
            os.remove(h5_filename_temp)
        except OSError:
            pass

        store = pandas.HDFStore(h5_filename_temp, complib="blosc", complevel=9)

        if 'intraday' in fname:
            # intraday caches are large: store as float32 to halve disk usage
            data_frame = data_frame.astype('float32')

        store['data'] = data_frame
        store.close()

        # delete the old copy, then atomically swap in the new one
        try:
            os.remove(h5_filename)
        except OSError:
            pass

        os.rename(h5_filename_temp, h5_filename)

    def get_h5_filename(self, fname):
        """
        get_h5_filename - Returns fname with a '.h5' extension, appending it
        only if not already present

        Parameters
        ----------
        fname : str
            filename to normalise

        Returns
        -------
        str
        """
        if fname.endswith('.h5'):
            return fname

        return fname + ".h5"

    def write_r_compatible_hdf_dataframe(self, data_frame, fname, fields=None):
        """
        write_r_compatible_hdf_dataframe - Write a DataFrame to disk in as an R compatible HDF5 file

        Parameters
        ----------
        data_frame : DataFrame
            data frame to be written
        fname : str
            file path to be written
        fields : list(str)
            columns to be written
        """
        fname_r = self.get_h5_filename(fname)

        self.logger.info("About to dump R binary HDF5 - " + fname_r)

        data_frame32 = data_frame.astype('float32')

        # force a plain list: columns.values is an ndarray, and
        # list + ndarray below would broadcast elementwise rather than
        # concatenate (the original bug)
        if fields is None:
            fields = list(data_frame32.columns.values)
        else:
            fields = list(fields)

        # decompose date/time into individual fields (easier to pick up in R)
        data_frame32['Year'] = data_frame.index.year
        data_frame32['Month'] = data_frame.index.month
        data_frame32['Day'] = data_frame.index.day
        data_frame32['Hour'] = data_frame.index.hour
        data_frame32['Minute'] = data_frame.index.minute
        data_frame32['Second'] = data_frame.index.second
        data_frame32['Millisecond'] = data_frame.index.microsecond / 1000

        data_frame32 = data_frame32[
            ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second', 'Millisecond'] + fields]

        cols = data_frame32.columns

        store_export = pandas.HDFStore(fname_r)
        store_export.put('df_for_r', data_frame32, data_columns=cols)
        store_export.close()

    def read_time_series_cache_from_disk(self, fname):
        """
        read_time_series_cache_from_disk - Reads time series cache from disk

        Parameters
        ----------
        fname : str
            file to be read from (with or without the '.h5' extension)

        Returns
        -------
        DataFrame or None
            None when no cache file exists on disk
        """
        if os.path.isfile(self.get_h5_filename(fname)):
            store = pandas.HDFStore(self.get_h5_filename(fname))
            data_frame = store.select("data")

            if 'intraday' in fname:
                data_frame = data_frame.astype('float32')

            store.close()

            return data_frame

        return None

    ### functions for CSV reading and writing

    def write_time_series_to_csv(self, csv_path, data_frame):
        """
        write_time_series_to_csv - writes Pandas data frame to disk as CSV

        Parameters
        ----------
        csv_path : str
            CSV file path to be written
        data_frame : DataFrame
            data frame to be written
        """
        data_frame.to_csv(csv_path)

    def read_csv_data_frame(self, f_name, freq, cutoff=None, dateparse=None,
                            postfix='.close', intraday_tz='UTC', excel_sheet=None):
        """
        read_csv_data_frame - Reads CSV/Excel from disk into DataFrame

        Parameters
        ----------
        f_name : str
            CSV/Excel file path to read
        freq : str
            Frequency of data to read (intraday/daily etc)
        cutoff : DateTime (optional)
            end date to read up to
        dateparse : str (optional)
            date parser to use ('dukascopy', 'c' or a callable)
        postfix : str (optional)
            postfix to add to each columns
        intraday_tz : str (optional)
            timezone of file if uses intraday data
        excel_sheet : str (optional)
            Excel sheet to be read

        Returns
        -------
        DataFrame
        """
        if freq == 'intraday':
            # use '==' not 'is' for the sentinel strings below: identity
            # comparison on str literals is interning-dependent
            if dateparse is None:
                # DD/MM/YYYY HH:MM:SS layout
                dateparse = lambda x: datetime.datetime(*map(int, [x[6:10], x[3:5], x[0:2],
                                                                   x[11:13], x[14:16], x[17:19]]))
            elif dateparse == 'dukascopy':
                # YYYY-MM-DD HH:MM:SS layout
                dateparse = lambda x: datetime.datetime(*map(int, [x[0:4], x[5:7], x[8:10],
                                                                   x[11:13], x[14:16], x[17:19]]))
            elif dateparse == 'c':
                # use C library for parsing dates, several hundred times quicker
                # requires compilation of library to install
                import ciso8601
                dateparse = lambda x: ciso8601.parse_datetime(x)

            if excel_sheet is None:
                data_frame = pandas.read_csv(f_name, index_col=0, parse_dates=True,
                                             date_parser=dateparse)
            else:
                data_frame = pandas.read_excel(f_name, excel_sheet, index_col=0, na_values=['NA'])

            data_frame = data_frame.astype('float32')
            data_frame.index.names = ['Date']

            # add the postfix (eg. '.close') to each column name
            data_frame.columns = [col + postfix for col in data_frame.columns]
        else:
            # daily data
            if 'events' in f_name:
                data_frame = pandas.read_csv(f_name)

                # coerce object columns to datetime where possible
                # (replaces DataFrame.convert_objects, removed from pandas)
                for col in data_frame.columns[data_frame.dtypes == object]:
                    data_frame[col] = pandas.to_datetime(data_frame[col], errors='coerce')
            else:
                if excel_sheet is None:
                    if dateparse is None:
                        # do not pass date_parser=None explicitly: the keyword
                        # is deprecated in newer pandas when supplied
                        data_frame = pandas.read_csv(f_name, index_col=0, parse_dates=["DATE"])
                    else:
                        data_frame = pandas.read_csv(f_name, index_col=0, parse_dates=["DATE"],
                                                     date_parser=dateparse)
                else:
                    data_frame = pandas.read_excel(f_name, excel_sheet, index_col=0, na_values=['NA'])

        if freq == 'intraday':
            # assume time series are already in UTC and assign this (can specify other time zones)
            data_frame = data_frame.tz_localize(intraday_tz)

        # end cutoff date
        if cutoff is not None:
            if isinstance(cutoff, str):
                cutoff = parse(cutoff)

            data_frame = data_frame.loc[data_frame.index < cutoff]

        return data_frame

    def convert_csv_data_frame(self, f_name, category, freq, cutoff=None, dateparse=None):
        """
        convert_csv_data_frame - Converts CSV file to HDF5 file

        Parameters
        ----------
        f_name : str
            File name to be read
        category : str
            data category of file (used in HDF5 filename)
        freq : str
            intraday/daily frequency (used in HDF5 filename)
        cutoff : DateTime (optional)
            filter dates up to here
        dateparse : str
            date parser to use
        """
        self.logger.info("About to read... " + f_name)

        data_frame = self.read_csv_data_frame(f_name, freq, cutoff=cutoff, dateparse=dateparse)

        category_f_name = self.create_cache_file_name(category)

        self.write_time_series_cache_to_disk(category_f_name, data_frame)

    def clean_csv_file(self, f_name):
        """
        clean_csv_file - Cleans up CSV file (removing empty characters) before writing back to disk

        Parameters
        ----------
        f_name : str
            CSV file to be cleaned
        """
        with codecs.open(f_name, 'rb', 'utf-8') as myfile:
            data = myfile.read()

            # clean file first if dirty
            if data.count('\x00'):
                self.logger.info('Cleaning CSV...')

                # write cleaned copy alongside, then swap it into place
                with codecs.open(f_name + '.tmp', 'w', 'utf-8') as of:
                    of.write(data.replace('\x00', ''))

                shutil.move(f_name + '.tmp', f_name)

    def create_cache_file_name(self, filename):
        """
        create_cache_file_name - Prefixes filename with the configured time
        series cache folder

        Parameters
        ----------
        filename : str
            base name of cache file

        Returns
        -------
        str
        """
        return Constants().folder_time_series_data + "/" + filename