PageRenderTime 71ms CodeModel.GetById 30ms RepoModel.GetById 0ms app.codeStats 0ms

/savReaderWriter/savWriter.py

https://bitbucket.org/fomcl/savreaderwriter
Python | 438 lines | 398 code | 21 blank | 19 comment | 20 complexity | 55f0f57e292d4161a51120a74282604d MD5 | raw file
Possible License(s): CC-BY-SA-3.0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. from ctypes import *
  4. import os
  5. import time
  6. import locale
  7. from collections import Iterable
  8. try:
  9. pandasOK = True
  10. import pandas as pd
  11. except ImportError:
  12. pandasOK = False
  13. try:
  14. numpyOK = True
  15. import numpy as np
  16. except ImportError:
  17. numpyOK = False
  18. from savReaderWriter import *
  19. from py3k import *
  20. from header import *
  21. if cWriterowOK and not isPy3k:
  22. cWriterow = cWriterow.cWriterow
class SavWriter(Header):

    """ Write SPSS system files (.sav, .zsav)

    Below, the associated SPSS commands are given in `CAPS`.

    Parameters
    ----------
    savFileName : str
        The file name of the spss data file.

        * File names that end with '.sav' are compressed using the 'old'
          compression scheme
        * File names that end with '_uncompressed.sav' are, well, not
          compressed. This is useful when you intend to read the files with
          the faster :py:class:`savReaderWriter.SavReaderNp` class
        * File names that end with '.zsav' are compressed using the ZLIB
          (ZSAV) compression scheme (requires v21 SPSS I/O files)
    varNames : list
        list of strings of the variable names in the order in which they
        should appear in the spss data file. See also under
        :py:meth:`savReaderWriter.Header.varNamesTypes`.
    varTypes : dict
        varTypes dictionary `{varName: varType}`

        * varType == 0 --> numeric
        * varType > 0 --> character of that length (in bytes!)

        See also under :py:meth:`savReaderWriter.Header.varNamesTypes`.
    valueLabels : dict, optional
        value label dictionary ``{varName: {value: label}}`` Cf.
        `VALUE LABELS`. See also under
        :py:meth:`savReaderWriter.Header.valueLabels`.
    varLabels : dict, optional
        variable label dictionary ``{varName: varLabel}``. Cf.
        `VARIABLE LABELS`. See also under
        :py:meth:`savReaderWriter.Header.varLabels`.
    formats : dict, optional
        format dictionary ``{varName: printFmt}``. Cf. `FORMATS`.
        See also under :py:meth:`savReaderWriter.Header.formats`, under
        :ref:`formats` and under :ref:`dateformats`.
    missingValues : dict, optional
        missing values dictionary ``{varName: {missing value spec}}``.
        Cf. `MISSING VALUES`. See also under
        :py:meth:`savReaderWriter.Header.missingValues`
    measureLevels : dict, optional
        measurement level dictionary ``{varName: <level>}``.
        Valid levels are: "unknown", "nominal", "ordinal", "scale",
        "ratio", "flag", "typeless". Cf. `VARIABLE LEVEL`
        See also under :py:meth:`savReaderWriter.Header.measureLevels`.

        .. warning::
            `measureLevels`, `columnWidths` and `alignments` must all three
            be set, if used
    columnWidths : dict, optional
        column display width dictionary ``{varName: <int>}``.
        Cf. `VARIABLE WIDTH`. (default: None --> >= 10 [stringVars] or
        automatic [numVars])
        See also under :py:meth:`savReaderWriter.Header.columnWidths`.
    alignments : dict, optional
        variable alignment dictionary ``{varName: <left/center/right>}``.
        Cf. `VARIABLE ALIGNMENT` (default: None --> left)
        See also under :py:meth:`savReaderWriter.Header.alignments`.
    varSets : dict, optional
        sets dictionary ``{setName: list_of_valid_varNames}``.
        Cf. `SETSMR` command.
        See also under :py:meth:`savReaderWriter.Header.varSets`
    varRoles : dict, optional
        variable roles dictionary ``{varName: varRole}``, where varRole
        may be any of the following: 'both', 'frequency', 'input', 'none',
        'partition', 'record ID', 'split', 'target'. Cf. `VARIABLE ROLE`
        See also under :py:meth:`savReaderWriter.Header.varRoles`.
    varAttributes : dict, optional
        variable attributes dictionary ``{varName: {attribName:
        attribValue}``. Cf. `VARIABLE ATTRIBUTES`.
        See also under :py:meth:`savReaderWriter.Header.varAttributes`.
    fileAttributes : dict, optional
        file attributes dictionary ``{attribName: attribValue}``.
        Cf. FILE ATTRIBUTES. See also under
        :py:meth:`savReaderWriter.Header.fileAttributes`.
    fileLabel : dict, optional
        file label string, which defaults to "File created by user
        <username> at <datetime>" if fileLabel is None. Cf. `FILE LABEL`
        See also under :py:meth:`savReaderWriter.Header.fileLabel`.
    multRespDefs : dict, optional
        multiple response sets definitions (dichotomy groups or
        category groups) dictionary ``{setName: <set definition>}``. In SPSS
        syntax, 'setName' has a dollar prefix ('$someSet'). Cf. `MRSETS`.
        See also under :py:meth:`savReaderWriter.Header.multRespDefs`.
    caseWeightVar : str, optional
        valid varName that is set as case weight (cf. `WEIGHT BY`). See also
        under :py:meth:`savReaderWriter.Header.caseWeightVar`.
    overwrite : bool, optional
        indicates whether an existing SPSS file should be overwritten
    ioUtf8 : bool, optional
        indicates the mode in which text communicated to or from the
        I/O Module will be. This refers to unicode mode (`SET UNICODE=ON`)
        and codepage mode in SPSS (`SET UNICODE=OFF`).
        See also under :py:meth:`savReaderWriter.Generic.ioUtf8` and
        under ``ioUtf8`` in :py:class:`savReaderWriter.SavReader`.

        * `ioUtf8=False`. Use the current ioLocale setting to determine the
          encoding for writing data.
        * `ioUtf8=True`. Use Unicode encoding (UTF-8) for writing data.

        Note: Data files saved in Unicode encoding cannot be read by versions
        of IBM SPSS Statistics prior to 16. Unicode mode is the default since
        IBM SPSS Statistics version 21. When opening code page IBM SPSS
        Statistics data files in Unicode mode or saving data files as Unicode
        in codepage mode, defined string widths are automatically *tripled*.

        .. seealso::
            `<http://www-01.ibm.com/support/knowledgecenter/SSLVMB_21.0.0/com.ibm.spss.statistics.help/faq_unicode.htm>`_
    ioLocale : bool, optional
        indicates the locale of the I/O module, cf. `SET LOCALE` (default:
        ``None``, which is the same as ``locale.setlocale(locale.LC_CTYPE)``).
        See also under :py:meth:`savReaderWriter.Generic.ioLocale`
    mode : str, optional
        indicates the mode in which ``savFileName`` should be opened. Possible
        values are:

        * "wb" --> write
        * "ab" --> append
        * "cp" --> copy: initialize header using ``refSavFileName`` as a
          reference file, cf. `APPLY DICTIONARY`.
    refSavFileName : str, optional
        reference file that should be used to initialize the header (aka the
        SPSS data dictionary) containing variable label, value label, missing
        value, etc. etc. definitions. Only relevant in conjunction with
        ``mode="cp"``.

    See also
    --------
    savReaderWriter.Header : for details about how to define individual
        metadata items

    Examples
    --------
    Typical use::

        records = [[b'Test1', 1, 1], [b'Test2', 2, 1]]
        varNames = [b'var1', b'v2', b'v3']
        varTypes = {b'var1': 5, b'v2': 0, b'v3': 0}
        savFileName = 'someFile.sav'
        with SavWriter(savFileName, varNames, varTypes) as writer:
            for record in records:
                writer.writerow(record)
    """

    def __init__(self, savFileName, varNames, varTypes, valueLabels=None,
                 varLabels=None, formats=None, missingValues=None,
                 measureLevels=None, columnWidths=None, alignments=None,
                 varSets=None, varRoles=None, varAttributes=None,
                 fileAttributes=None, fileLabel=None, multRespDefs=None,
                 caseWeightVar=None, overwrite=True, ioUtf8=False,
                 ioLocale=None, mode=b"wb", refSavFileName=None):
        """ Constructor. Initializes all vars that can be recycled """
        # NOTE(review): super(Header, self) deliberately skips
        # Header.__init__ and calls the next class in the MRO -- presumably
        # Generic, which initializes the I/O module; confirm against header.py
        super(Header, self).__init__(savFileName, ioUtf8, ioLocale)
        self.savFileName = savFileName
        self.varNames = self.encode(varNames)
        self.varTypes = self.encode(varTypes)
        self.overwrite = overwrite
        self.mode = mode
        self.refSavFileName = refSavFileName
        # open the .sav file; in "cp" mode refSavFileName serves as the
        # dictionary template
        self.fh = super(Header, self).openSavFile(self.savFileName, self.mode,
                                                  self.refSavFileName)
        # pre-built struct for packing one whole record into the case buffer
        self.myStruct = self.getStruct(self.varTypes, self.varNames, self.mode)
        self.pack_into = self.myStruct.pack_into
        # cache frequently used attributes as plain instance slots
        self.sysmis_ = self.sysmis
        self.ioUtf8_ = ioUtf8
        # string-length -> padded-width lookup (see _getPaddingLookupTable)
        self.pad_8_lookup = self._getPaddingLookupTable(self.varTypes)
        self.pad_string = self._pyWriterow_pad_string(isPy3k)
        self.bytify = bytify(self.encoding)  # from py3k module
        if self.mode == b"wb":
            self._openWrite(self.savFileName, self.overwrite)
            # the assignments below go through Header property setters,
            # which push the metadata through the SPSS I/O module; their
            # order follows the original implementation
            self.varNamesTypes = self.varNames, self.varTypes
            self.valueLabels = valueLabels
            self.varLabels = varLabels
            self.formats = formats
            self.missingValues = missingValues
            self.measureLevels = measureLevels
            self.columnWidths = columnWidths
            self.alignments = alignments
            self.varSets = varSets
            self.varRoles = varRoles
            self.varAttributes = varAttributes
            self.fileAttributes = fileAttributes
            self.fileLabel = fileLabel
            self.multRespDefs = multRespDefs
            self.caseWeightVar = caseWeightVar
            # when none of the display triplet is given, fall back to a
            # sensible default column width
            triplet = [measureLevels, columnWidths, alignments]
            if all([item is None for item in triplet]):
                self._setColWidth10()
            self.textInfo = self.savFileName
        if self.mode in (b"wb", b"cp"):
            self._commitHeader()
        self.caseBuffer = self.getCaseBuffer()
  208. def __enter__(self):
  209. """This function returns the writer object itself so the writerow and
  210. writerows methods become available for use with 'with' statements"""
  211. return self
  212. def __exit__(self, type, value, tb):
  213. """ This function closes the spss data file.
  214. .. warning::
  215. Always ensure the the .sav file is properly closed, either by
  216. using a context manager (``with`` statement) or by using
  217. ``close()``"""
  218. if type is not None:
  219. pass # Exception occurred
  220. self.close()
  221. def close(self):
  222. """ This function closes the spss data file."""
  223. self.closeSavFile(self.fh, self.mode)
  224. try:
  225. locale.resetlocale() # fails on Windows
  226. except:
  227. locale.setlocale(locale.LC_ALL, "")
  228. def _openWrite(self, savFileName, overwrite):
  229. """ This function opens a file in preparation for creating a new IBM
  230. SPSS Statistics data file"""
  231. if os.path.exists(savFileName) and not os.access(savFileName, os.W_OK):
  232. raise IOError("No write access for file %r" % savFileName)
  233. b = isinstance(savFileName, bytes)
  234. u = isinstance(savFileName, unicode)
  235. fn_endswith = savFileName.lower().endswith
  236. if overwrite or not os.path.exists(savFileName):
  237. if b and fn_endswith(b".zsav") or u and fn_endswith(u".zsav"):
  238. self.fileCompression = b"zlib" # only with v21 libraries!
  239. elif ( b and fn_endswith(b"_uncompressed.sav") or
  240. u and fn_endswith(u"_uncompressed.sav") ):
  241. self.fileCompression = b"uncompressed"
  242. else:
  243. self.fileCompression = b"standard"
  244. elif not overwrite and os.path.exists(savFileName):
  245. raise IOError("File %r already exists!" % savFileName)
  246. def convertDate(self, day, month, year):
  247. """This function converts a Gregorian date expressed as day-month-year
  248. to the internal SPSS date format. The time portion of the date variable
  249. is set to 0:00. To set the time portion if the date variable to another
  250. value, use convertTime."""
  251. func = self.spssio.spssConvertDate
  252. func.argtypes = [c_int, c_int, c_int, POINTER(c_double)]
  253. spssDate = c_double()
  254. retcode = func(day, month, year, spssDate)
  255. if retcode:
  256. msg = "Problem converting date value '%s-%s-%s'" % (day, month, year)
  257. checkErrsWarns(msg, retcode)
  258. return spssDate.value
  259. def convertTime(self, day, hour, minute, second):
  260. """This function converts a time given as day, hours, minutes, and
  261. seconds to the internal SPSS time format."""
  262. func = self.spssio.spssConvertTime
  263. func.argtypes = [c_int, c_int, c_int, c_double, POINTER(c_double)]
  264. spssTime = c_double()
  265. retcode = func(day, hour, minute, float(second), spssTime)
  266. if retcode:
  267. msg = "Problem converting time value '%s %s:%s:%s'"
  268. checkErrsWarns(msg % (day, hour, minute, second), retcode)
  269. return spssTime.value
  270. def spssDateTime(self, datetimeStr=b"2001-12-08", strptimeFmt="%Y-%m-%d"):
  271. """ This function converts a date/time string into an SPSS date,
  272. using a strptime format. See also :ref:`dateformats`"""
  273. try:
  274. if isinstance(datetimeStr, bytes):
  275. datetimeStr = datetimeStr.decode("utf-8")
  276. dt = time.strptime(datetimeStr, strptimeFmt)
  277. except (ValueError, TypeError, AttributeError):
  278. return self.sysmis
  279. day, month, year = dt.tm_mday, dt.tm_mon, dt.tm_year
  280. hour, minute, second = dt.tm_hour, dt.tm_min, dt.tm_sec
  281. return (self.convertDate(day, month, year) +
  282. self.convertTime(0, hour, minute, second))
  283. def _commitHeader(self):
  284. """This function writes the data dictionary to the data file associated
  285. with file handle 'fh'. Before any case data can be written, the
  286. dictionary must be committed; once the dictionary has been committed,
  287. no further changes can be made to it."""
  288. retcode = self.spssio.spssCommitHeader(c_int(self.fh))
  289. if retcode:
  290. checkErrsWarns("Problem committing header", retcode)
  291. def _getPaddingLookupTable(self, varTypes):
  292. """Helper function that returns a lookup table that maps string lengths
  293. to string lengths to the nearest ceiled multiple of 8. For example:
  294. {1:%-8s, 7:%-8s, 9: %-16s, 24: %-24s}. Purpose: Get rid of trailing
  295. null bytes"""
  296. strLengths = varTypes.values()
  297. if isPy3k:
  298. return dict([(i, (-8 * (i // -8))) for i in strLengths])
  299. return dict([(i, "%%-%ds" % (-8 * (i // -8))) for i in strLengths])
  300. def _pyWriterow_pad_string(self, isPy3k):
  301. """Helper that returns a function to pad string values using
  302. _getPaddingLookupTable. Padding is done differently for Python 2 and
  303. 3 (probably the latter is slower)"""
  304. if isPy3k:
  305. def _padStringValue(value, varType):
  306. # % replacement is not possible with bytes
  307. return value.ljust(self.pad_8_lookup[varType])
  308. else:
  309. def _padStringValue(value, varType):
  310. # Get rid of trailing null bytes --> 7 x faster than 'ljust'
  311. return self.pad_8_lookup[varType] % value
  312. return _padStringValue
  313. def _pyWriterow(self, record):
  314. """ This function writes one record, which is a Python list,
  315. compare this Python version with the Cython version cWriterow."""
  316. float_ = float
  317. encoding = self.encoding
  318. pad_string = self.pad_string
  319. for i, value in enumerate(record):
  320. varName = self.varNames[i]
  321. varType = self.varTypes[varName]
  322. if varType == 0:
  323. try:
  324. value = float_(value)
  325. except (ValueError, TypeError):
  326. value = self.sysmis_
  327. else:
  328. value = pad_string(value, varType)
  329. if self.ioUtf8_ and isinstance(value, unicode):
  330. value = value.encode("utf-8")
  331. record[i] = value
  332. self.record = record
  333. def writerow(self, record):
  334. """This function writes one record, which is a Python list."""
  335. if cWriterowOK:
  336. cWriterow(self, record)
  337. return
  338. self._pyWriterow(record)
  339. def writerows(self, records):
  340. """This function writes all records.
  341. Parameters
  342. ----------
  343. records : list, tuple, numpy.ndarray, pandas.DataFrame, or similar
  344. the records to be written to the .sav file
  345. Raises
  346. ------
  347. TypeError : if the records instance is not of a suitable type
  348. ValueError : if bool(records) == False, or if the array/DataFrame
  349. is empty
  350. """
  351. def is_empty(records):
  352. if hasattr(records, "empty"): # pandas
  353. return records.empty
  354. elif hasattr(records, "size"): # numpy
  355. return not records.size
  356. else:
  357. return not records
  358. if is_empty(records):
  359. raise ValueError("No data")
  360. elif numpyOK and pandasOK and isinstance(records, np.ndarray): # issue #25
  361. is_string = [bool(self.varTypes[v]) for v in self.varNames]
  362. if any(is_string):
  363. is_nan_string = is_string & pd.isnull(records)
  364. records = np.where(is_nan_string, b'', records)
  365. records = np.where(pd.isnull(records), self.sysmis, records)
  366. for i in xrange(len(records)):
  367. self.writerow( records[i].tolist() )
  368. elif pandasOK and isinstance(records, pd.DataFrame):
  369. is_string = records.dtypes == np.object
  370. is_nan_string = is_string & pd.isnull(records)
  371. records = records.where(~is_nan_string, b'')
  372. records = records.fillna(self.sysmis)
  373. for record in records.itertuples(index=False):
  374. self.writerow(list(record))
  375. elif isinstance(records, Iterable) and hasattr(records[0], "__iter__"):
  376. for record in records: # (named)tuple
  377. self.writerow(list(record)) # need item assignment
  378. else:
  379. try:
  380. for record in records:
  381. self.writerow(record)
  382. except:
  383. types = (tuple, list)
  384. if numpyOK: types += (np.array, )
  385. if pandasOK: types += (pd.DataFrame,)
  386. if not isinstance(records, types):
  387. msg = ('records instance type must be one of list, tuple, '
  388. 'numpy.array, pandas.DataFrame but got %s')
  389. raise TypeError( msg % (type(records), ))
  390. raise