PageRenderTime 71ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 0ms

/savReaderWriter/savWriter.py

https://bitbucket.org/fomcl/savreaderwriter
Python | 438 lines | 398 code | 21 blank | 19 comment | 20 complexity | 55f0f57e292d4161a51120a74282604d MD5 | raw file
Possible License(s): CC-BY-SA-3.0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. from ctypes import *
  4. import os
  5. import time
  6. import locale
  7. from collections import Iterable
  8. try:
  9. pandasOK = True
  10. import pandas as pd
  11. except ImportError:
  12. pandasOK = False
  13. try:
  14. numpyOK = True
  15. import numpy as np
  16. except ImportError:
  17. numpyOK = False
  18. from savReaderWriter import *
  19. from py3k import *
  20. from header import *
  21. if cWriterowOK and not isPy3k:
  22. cWriterow = cWriterow.cWriterow
class SavWriter(Header):

    """ Write SPSS system files (.sav, .zsav)

    Below, the associated SPSS commands are given in `CAPS`.

    Parameters
    ----------
    savFileName : str
        The file name of the spss data file.

        * File names that end with '.sav' are compressed using the 'old'
          compression scheme
        * File names that end with '_uncompressed.sav' are, well, not
          compressed. This is useful when you intend to read the files with
          the faster :py:class:`savReaderWriter.SavReaderNp` class
        * File names that end with '.zsav' are compressed using the ZLIB
          (ZSAV) compression scheme (requires v21 SPSS I/O files)
    varNames : list
        list of strings of the variable names in the order in which they
        should appear in the spss data file. See also under
        :py:meth:`savReaderWriter.Header.varNamesTypes`.
    varTypes : dict
        varTypes dictionary `{varName: varType}`

        * varType == 0 --> numeric
        * varType > 0 --> character of that length (in bytes!)

        See also under :py:meth:`savReaderWriter.Header.varNamesTypes`.
    valueLabels : dict, optional
        value label dictionary ``{varName: {value: label}}`` Cf.
        `VALUE LABELS`. See also under
        :py:meth:`savReaderWriter.Header.valueLabels`.
    varLabels : dict, optional
        variable label dictionary ``{varName: varLabel}``. Cf.
        `VARIABLE LABELS`. See also under
        :py:meth:`savReaderWriter.Header.varLabels`.
    formats : dict, optional
        format dictionary ``{varName: printFmt}``. Cf. `FORMATS`.
        See also under :py:meth:`savReaderWriter.Header.formats`, under
        :ref:`formats` and under :ref:`dateformats`.
    missingValues : dict, optional
        missing values dictionary ``{varName: {missing value spec}}``.
        Cf. `MISSING VALUES`. See also under
        :py:meth:`savReaderWriter.Header.missingValues`
    measureLevels : dict, optional
        measurement level dictionary ``{varName: <level>}``.
        Valid levels are: "unknown", "nominal", "ordinal", "scale",
        "ratio", "flag", "typeless". Cf. `VARIABLE LEVEL`
        See also under :py:meth:`savReaderWriter.Header.measureLevels`.

        .. warning::
            `measureLevels`, `columnWidths` and `alignments` must all three
            be set, if used
    columnWidths : dict, optional
        column display width dictionary ``{varName: <int>}``.
        Cf. `VARIABLE WIDTH`. (default: None --> >= 10 [stringVars] or
        automatic [numVars])
        See also under :py:meth:`savReaderWriter.Header.columnWidths`.
    alignments : dict, optional
        variable alignment dictionary ``{varName: <left/center/right>}``.
        Cf. `VARIABLE ALIGNMENT` (default: None --> left)
        See also under :py:meth:`savReaderWriter.Header.alignments`.
    varSets : dict, optional
        sets dictionary ``{setName: list_of_valid_varNames}``.
        Cf. `SETSMR` command.
        See also under :py:meth:`savReaderWriter.Header.varSets`
    varRoles : dict, optional
        variable roles dictionary ``{varName: varRole}``, where varRole
        may be any of the following: 'both', 'frequency', 'input', 'none',
        'partition', 'record ID', 'split', 'target'. Cf. `VARIABLE ROLE`
        See also under :py:meth:`savReaderWriter.Header.varRoles`.
    varAttributes : dict, optional
        variable attributes dictionary ``{varName: {attribName:
        attribValue}``. Cf. `VARIABLE ATTRIBUTES`.
        See also under :py:meth:`savReaderWriter.Header.varAttributes`.
    fileAttributes : dict, optional
        file attributes dictionary ``{attribName: attribValue}``.
        Cf. FILE ATTRIBUTES. See also under
        :py:meth:`savReaderWriter.Header.fileAttributes`.
    fileLabel : dict, optional
        file label string, which defaults to "File created by user
        <username> at <datetime>" if fileLabel is None. Cf. `FILE LABEL`
        See also under :py:meth:`savReaderWriter.Header.fileLabel`.
    multRespDefs : dict, optional
        multiple response sets definitions (dichotomy groups or
        category groups) dictionary ``{setName: <set definition>}``. In SPSS
        syntax, 'setName' has a dollar prefix ('$someSet'). Cf. `MRSETS`.
        See also under :py:meth:`savReaderWriter.Header.multRespDefs`.
    caseWeightVar : str, optional
        valid varName that is set as case weight (cf. `WEIGHT BY`). See also
        under :py:meth:`savReaderWriter.Header.caseWeightVar`.
    overwrite : bool, optional
        indicates whether an existing SPSS file should be overwritten
    ioUtf8 : bool, optional
        indicates the mode in which text communicated to or from the
        I/O Module will be. This refers to unicode mode (`SET UNICODE=ON`)
        and codepage mode in SPSS (`SET UNICODE=OFF`).
        See also under :py:meth:`savReaderWriter.Generic.ioUtf8` and
        under ``ioUtf8`` in :py:class:`savReaderWriter.SavReader`.

        * `ioUtf8=False`. Use the current ioLocale setting to determine the
          encoding for writing data.
        * `ioUtf8=True`. Use Unicode encoding (UTF-8) for writing data.

        Note: Data files saved in Unicode encoding cannot be read by versions
        of IBM SPSS Statistics prior to 16. Unicode mode is the default since
        IBM SPSS Statistics version 21. When opening code page IBM SPSS
        Statistics data files in Unicode mode or saving data files as Unicode
        in codepage mode, defined string widths are automatically *tripled*.

        .. seealso::
            `<http://www-01.ibm.com/support/knowledgecenter/SSLVMB_21.0.0/com.ibm.spss.statistics.help/faq_unicode.htm>`_
    ioLocale : bool, optional
        indicates the locale of the I/O module, cf. `SET LOCALE` (default:
        ``None``, which is the same as ``locale.setlocale(locale.LC_CTYPE)``).
        See also under :py:meth:`savReaderWriter.Generic.ioLocale`
    mode : str, optional
        indicates the mode in which ``savFileName`` should be opened. Possible
        values are:

        * "wb" --> write
        * "ab" --> append
        * "cp" --> copy: initialize header using ``refSavFileName`` as a
          reference file, cf. `APPLY DICTIONARY`.
    refSavFileName : str, optional
        reference file that should be used to initialize the header (aka the
        SPSS data dictionary) containing variable label, value label, missing
        value, etc. etc. definitions. Only relevant in conjunction with
        ``mode="cp"``.

    See also
    --------
    savReaderWriter.Header : for details about how to define individual
        metadata items

    Examples
    --------
    Typical use::

        records = [[b'Test1', 1, 1], [b'Test2', 2, 1]]
        varNames = [b'var1', b'v2', b'v3']
        varTypes = {b'var1': 5, b'v2': 0, b'v3': 0}
        savFileName = 'someFile.sav'
        with SavWriter(savFileName, varNames, varTypes) as writer:
            for record in records:
                writer.writerow(record)
    """

    def __init__(self, savFileName, varNames, varTypes, valueLabels=None,
                 varLabels=None, formats=None, missingValues=None,
                 measureLevels=None, columnWidths=None, alignments=None,
                 varSets=None, varRoles=None, varAttributes=None,
                 fileAttributes=None, fileLabel=None, multRespDefs=None,
                 caseWeightVar=None, overwrite=True, ioUtf8=False,
                 ioLocale=None, mode=b"wb", refSavFileName=None):
        """ Constructor. Initializes all vars that can be recycled """
        # NOTE(review): super(Header, self) deliberately skips
        # Header.__init__ and calls the next class in the MRO -- presumably
        # Generic, which initializes the I/O module; confirm against header.py
        super(Header, self).__init__(savFileName, ioUtf8, ioLocale)
        self.savFileName = savFileName
        self.varNames = self.encode(varNames)
        self.varTypes = self.encode(varTypes)
        self.overwrite = overwrite
        self.mode = mode
        self.refSavFileName = refSavFileName
        # open the .sav file; in "cp" mode refSavFileName serves as the
        # dictionary template
        self.fh = super(Header, self).openSavFile(self.savFileName, self.mode,
                                                  self.refSavFileName)
        # pre-built struct for packing one whole record into the case buffer
        self.myStruct = self.getStruct(self.varTypes, self.varNames, self.mode)
        self.pack_into = self.myStruct.pack_into
        # cache frequently used attributes as plain instance slots
        self.sysmis_ = self.sysmis
        self.ioUtf8_ = ioUtf8
        # string-length -> padded-width lookup (see _getPaddingLookupTable)
        self.pad_8_lookup = self._getPaddingLookupTable(self.varTypes)
        self.pad_string = self._pyWriterow_pad_string(isPy3k)
        self.bytify = bytify(self.encoding)  # from py3k module
        if self.mode == b"wb":
            self._openWrite(self.savFileName, self.overwrite)
            # the assignments below go through Header property setters,
            # which push the metadata through the SPSS I/O module; their
            # order follows the original implementation
            self.varNamesTypes = self.varNames, self.varTypes
            self.valueLabels = valueLabels
            self.varLabels = varLabels
            self.formats = formats
            self.missingValues = missingValues
            self.measureLevels = measureLevels
            self.columnWidths = columnWidths
            self.alignments = alignments
            self.varSets = varSets
            self.varRoles = varRoles
            self.varAttributes = varAttributes
            self.fileAttributes = fileAttributes
            self.fileLabel = fileLabel
            self.multRespDefs = multRespDefs
            self.caseWeightVar = caseWeightVar
            # when none of the display triplet is given, fall back to a
            # sensible default column width
            triplet = [measureLevels, columnWidths, alignments]
            if all([item is None for item in triplet]):
                self._setColWidth10()
            self.textInfo = self.savFileName
        if self.mode in (b"wb", b"cp"):
            self._commitHeader()
        self.caseBuffer = self.getCaseBuffer()
  208. def __enter__(self):
  209. """This function returns the writer object itself so the writerow and
  210. writerows methods become available for use with 'with' statements"""
  211. return self
  212. def __exit__(self, type, value, tb):
  213. """ This function closes the spss data file.
  214. .. warning::
  215. Always ensure the the .sav file is properly closed, either by
  216. using a context manager (``with`` statement) or by using
  217. ``close()``"""
  218. if type is not None:
  219. pass # Exception occurred
  220. self.close()
  221. def close(self):
  222. """ This function closes the spss data file."""
  223. self.closeSavFile(self.fh, self.mode)
  224. try:
  225. locale.resetlocale() # fails on Windows
  226. except:
  227. locale.setlocale(locale.LC_ALL, "")
  228. def _openWrite(self, savFileName, overwrite):
  229. """ This function opens a file in preparation for creating a new IBM
  230. SPSS Statistics data file"""
  231. if os.path.exists(savFileName) and not os.access(savFileName, os.W_OK):
  232. raise IOError("No write access for file %r" % savFileName)
  233. b = isinstance(savFileName, bytes)
  234. u = isinstance(savFileName, unicode)
  235. fn_endswith = savFileName.lower().endswith
  236. if overwrite or not os.path.exists(savFileName):
  237. if b and fn_endswith(b".zsav") or u and fn_endswith(u".zsav"):
  238. self.fileCompression = b"zlib" # only with v21 libraries!
  239. elif ( b and fn_endswith(b"_uncompressed.sav") or
  240. u and fn_endswith(u"_uncompressed.sav") ):
  241. self.fileCompression = b"uncompressed"
  242. else:
  243. self.fileCompression = b"standard"
  244. elif not overwrite and os.path.exists(savFileName):
  245. raise IOError("File %r already exists!" % savFileName)
  246. def convertDate(self, day, month, year):
  247. """This function converts a Gregorian date expressed as day-month-year
  248. to the internal SPSS date format. The time portion of the date variable
  249. is set to 0:00. To set the time portion if the date variable to another
  250. value, use convertTime."""
  251. func = self.spssio.spssConvertDate
  252. func.argtypes = [c_int, c_int, c_int, POINTER(c_double)]
  253. spssDate = c_double()
  254. retcode = func(day, month, year, spssDate)
  255. if retcode:
  256. msg = "Problem converting date value '%s-%s-%s'" % (day, month, year)
  257. checkErrsWarns(msg, retcode)
  258. return spssDate.value
  259. def convertTime(self, day, hour, minute, second):
  260. """This function converts a time given as day, hours, minutes, and
  261. seconds to the internal SPSS time format."""
  262. func = self.spssio.spssConvertTime
  263. func.argtypes = [c_int, c_int, c_int, c_double, POINTER(c_double)]
  264. spssTime = c_double()
  265. retcode = func(day, hour, minute, float(second), spssTime)
  266. if retcode:
  267. msg = "Problem converting time value '%s %s:%s:%s'"
  268. checkErrsWarns(msg % (day, hour, minute, second), retcode)
  269. return spssTime.value
  270. def spssDateTime(self, datetimeStr=b"2001-12-08", strptimeFmt="%Y-%m-%d"):
  271. """ This function converts a date/time string into an SPSS date,
  272. using a strptime format. See also :ref:`dateformats`"""
  273. try:
  274. if isinstance(datetimeStr, bytes):
  275. datetimeStr = datetimeStr.decode("utf-8")
  276. dt = time.strptime(datetimeStr, strptimeFmt)
  277. except (ValueError, TypeError, AttributeError):
  278. return self.sysmis
  279. day, month, year = dt.tm_mday, dt.tm_mon, dt.tm_year
  280. hour, minute, second = dt.tm_hour, dt.tm_min, dt.tm_sec
  281. return (self.convertDate(day, month, year) +
  282. self.convertTime(0, hour, minute, second))
  283. def _commitHeader(self):
  284. """This function writes the data dictionary to the data file associated
  285. with file handle 'fh'. Before any case data can be written, the
  286. dictionary must be committed; once the dictionary has been committed,
  287. no further changes can be made to it."""
  288. retcode = self.spssio.spssCommitHeader(c_int(self.fh))
  289. if retcode:
  290. checkErrsWarns("Problem committing header", retcode)
  291. def _getPaddingLookupTable(self, varTypes):
  292. """Helper function that returns a lookup table that maps string lengths
  293. to string lengths to the nearest ceiled multiple of 8. For example:
  294. {1:%-8s, 7:%-8s, 9: %-16s, 24: %-24s}. Purpose: Get rid of trailing
  295. null bytes"""
  296. strLengths = varTypes.values()
  297. if isPy3k:
  298. return dict([(i, (-8 * (i // -8))) for i in strLengths])
  299. return dict([(i, "%%-%ds" % (-8 * (i // -8))) for i in strLengths])
  300. def _pyWriterow_pad_string(self, isPy3k):
  301. """Helper that returns a function to pad string values using
  302. _getPaddingLookupTable. Padding is done differently for Python 2 and
  303. 3 (probably the latter is slower)"""
  304. if isPy3k:
  305. def _padStringValue(value, varType):
  306. # % replacement is not possible with bytes
  307. return value.ljust(self.pad_8_lookup[varType])
  308. else:
  309. def _padStringValue(value, varType):
  310. # Get rid of trailing null bytes --> 7 x faster than 'ljust'
  311. return self.pad_8_lookup[varType] % value
  312. return _padStringValue
  313. def _pyWriterow(self, record):
  314. """ This function writes one record, which is a Python list,
  315. compare this Python version with the Cython version cWriterow."""
  316. float_ = float
  317. encoding = self.encoding
  318. pad_string = self.pad_string
  319. for i, value in enumerate(record):
  320. varName = self.varNames[i]
  321. varType = self.varTypes[varName]
  322. if varType == 0:
  323. try:
  324. value = float_(value)
  325. except (ValueError, TypeError):
  326. value = self.sysmis_
  327. else:
  328. value = pad_string(value, varType)
  329. if self.ioUtf8_ and isinstance(value, unicode):
  330. value = value.encode("utf-8")
  331. record[i] = value
  332. self.record = record
  333. def writerow(self, record):
  334. """This function writes one record, which is a Python list."""
  335. if cWriterowOK:
  336. cWriterow(self, record)
  337. return
  338. self._pyWriterow(record)
  339. def writerows(self, records):
  340. """This function writes all records.
  341. Parameters
  342. ----------
  343. records : list, tuple, numpy.ndarray, pandas.DataFrame, or similar
  344. the records to be written to the .sav file
  345. Raises
  346. ------
  347. TypeError : if the records instance is not of a suitable type
  348. ValueError : if bool(records) == False, or if the array/DataFrame
  349. is empty
  350. """
  351. def is_empty(records):
  352. if hasattr(records, "empty"): # pandas
  353. return records.empty
  354. elif hasattr(records, "size"): # numpy
  355. return not records.size
  356. else:
  357. return not records
  358. if is_empty(records):
  359. raise ValueError("No data")
  360. elif numpyOK and pandasOK and isinstance(records, np.ndarray): # issue #25
  361. is_string = [bool(self.varTypes[v]) for v in self.varNames]
  362. if any(is_string):
  363. is_nan_string = is_string & pd.isnull(records)
  364. records = np.where(is_nan_string, b'', records)
  365. records = np.where(pd.isnull(records), self.sysmis, records)
  366. for i in xrange(len(records)):
  367. self.writerow( records[i].tolist() )
  368. elif pandasOK and isinstance(records, pd.DataFrame):
  369. is_string = records.dtypes == np.object
  370. is_nan_string = is_string & pd.isnull(records)
  371. records = records.where(~is_nan_string, b'')
  372. records = records.fillna(self.sysmis)
  373. for record in records.itertuples(index=False):
  374. self.writerow(list(record))
  375. elif isinstance(records, Iterable) and hasattr(records[0], "__iter__"):
  376. for record in records: # (named)tuple
  377. self.writerow(list(record)) # need item assignment
  378. else:
  379. try:
  380. for record in records:
  381. self.writerow(record)
  382. except:
  383. types = (tuple, list)
  384. if numpyOK: types += (np.array, )
  385. if pandasOK: types += (pd.DataFrame,)
  386. if not isinstance(records, types):
  387. msg = ('records instance type must be one of list, tuple, '
  388. 'numpy.array, pandas.DataFrame but got %s')
  389. raise TypeError( msg % (type(records), ))
  390. raise