
/sas7bdat.py

https://bitbucket.org/jaredhobbs/sas7bdat
Python | 1559 lines | 1533 code | 16 blank | 10 comment


#!/usr/bin/env python
"""
This module will read sas7bdat files using pure Python (2.7+, 3+).
No SAS software required!
"""
from __future__ import division, absolute_import, print_function,\
    unicode_literals
import atexit
import csv
import logging
import math
import os
import platform
import struct
import sys
from codecs import open
from datetime import datetime, timedelta

import six

xrange = six.moves.range

__all__ = ['SAS7BDAT']
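
# A minimal usage sketch (the path 'example.sas7bdat' is hypothetical; the
# class defined below supports both iteration and the context-manager
# protocol):
#
#     with SAS7BDAT('example.sas7bdat') as f:
#         for row in f:
#             print(row)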


def _debug(t, v, tb):
    if hasattr(sys, 'ps1') or not sys.stderr.isatty():
        sys.__excepthook__(t, v, tb)
    else:
        import pdb
        import traceback
        traceback.print_exception(t, v, tb)
        print()
        pdb.pm()
        os._exit(1)


def _get_color_emit(prefix, fn):
    # This doesn't work on Windows since Windows doesn't support
    # the ANSI escape characters
    def _new(handler):
        levelno = handler.levelno
        if levelno >= logging.CRITICAL:
            color = '\x1b[31m'  # red
        elif levelno >= logging.ERROR:
            color = '\x1b[31m'  # red
        elif levelno >= logging.WARNING:
            color = '\x1b[33m'  # yellow
        elif levelno >= logging.INFO:
            color = '\x1b[32m'  # green or normal
        elif levelno >= logging.DEBUG:
            color = '\x1b[35m'  # pink
        else:
            color = '\x1b[0m'  # normal
        try:
            handler.msg = '%s[%s] %s%s' % (
                color, prefix, handler.msg, '\x1b[0m'
            )
        except UnicodeDecodeError:
            handler.msg = '%s[%s] %s%s' % (
                color, prefix, handler.msg.decode('utf-8'), '\x1b[0m'
            )
        return fn(handler)
    return _new


class ParseError(Exception):
    pass


class Decompressor(object):
    def __init__(self, parent):
        self.parent = parent

    def decompress_row(self, offset, length, result_length, page):
        raise NotImplementedError

    @staticmethod
    def to_ord(int_or_str):
        if isinstance(int_or_str, int):
            return int_or_str
        return ord(int_or_str)

    @staticmethod
    def to_chr(int_or_str):
        py2 = six.PY2
        if isinstance(int_or_str, (bytes, bytearray)):
            return int_or_str
        if py2:
            return chr(int_or_str)
        return bytes([int_or_str])


class RLEDecompressor(Decompressor):
    """
    Decompresses data using the Run Length Encoding algorithm
    """
    def decompress_row(self, offset, length, result_length, page):
        b = self.to_ord
        c = self.to_chr
        current_result_array_index = 0
        result = []
        i = 0
        for j in xrange(length):
            if i != j:
                continue
            control_byte = b(page[offset + i]) & 0xF0
            end_of_first_byte = b(page[offset + i]) & 0x0F
            if control_byte == 0x00:
                if i != (length - 1):
                    count_of_bytes_to_copy = (
                        (b(page[offset + i + 1]) & 0xFF) +
                        64 +
                        end_of_first_byte * 256
                    )
                    start = offset + i + 2
                    end = start + count_of_bytes_to_copy
                    result.append(c(page[start:end]))
                    i += count_of_bytes_to_copy + 1
                    current_result_array_index += count_of_bytes_to_copy
            elif control_byte == 0x40:
                copy_counter = (
                    end_of_first_byte * 16 +
                    (b(page[offset + i + 1]) & 0xFF)
                )
                for _ in xrange(copy_counter + 18):
                    result.append(c(page[offset + i + 2]))
                    current_result_array_index += 1
                i += 2
            elif control_byte == 0x60:
                for _ in xrange(end_of_first_byte * 256 +
                                (b(page[offset + i + 1]) & 0xFF) + 17):
                    result.append(c(0x20))
                    current_result_array_index += 1
                i += 1
            elif control_byte == 0x70:
                for _ in xrange(end_of_first_byte * 256 +
                                (b(page[offset + i + 1]) & 0xFF) + 17):
                    result.append(c(0x00))
                    current_result_array_index += 1
                i += 1
            elif control_byte == 0x80:
                count_of_bytes_to_copy = min(end_of_first_byte + 1,
                                             length - (i + 1))
                start = offset + i + 1
                end = start + count_of_bytes_to_copy
                result.append(c(page[start:end]))
                i += count_of_bytes_to_copy
                current_result_array_index += count_of_bytes_to_copy
            elif control_byte == 0x90:
                count_of_bytes_to_copy = min(end_of_first_byte + 17,
                                             length - (i + 1))
                start = offset + i + 1
                end = start + count_of_bytes_to_copy
                result.append(c(page[start:end]))
                i += count_of_bytes_to_copy
                current_result_array_index += count_of_bytes_to_copy
            elif control_byte == 0xA0:
                count_of_bytes_to_copy = min(end_of_first_byte + 33,
                                             length - (i + 1))
                start = offset + i + 1
                end = start + count_of_bytes_to_copy
                result.append(c(page[start:end]))
                i += count_of_bytes_to_copy
                current_result_array_index += count_of_bytes_to_copy
            elif control_byte == 0xB0:
                count_of_bytes_to_copy = min(end_of_first_byte + 49,
                                             length - (i + 1))
                start = offset + i + 1
                end = start + count_of_bytes_to_copy
                result.append(c(page[start:end]))
                i += count_of_bytes_to_copy
                current_result_array_index += count_of_bytes_to_copy
            elif control_byte == 0xC0:
                for _ in xrange(end_of_first_byte + 3):
                    result.append(c(page[offset + i + 1]))
                    current_result_array_index += 1
                i += 1
            elif control_byte == 0xD0:
                for _ in xrange(end_of_first_byte + 2):
                    result.append(c(0x40))
                    current_result_array_index += 1
            elif control_byte == 0xE0:
                for _ in xrange(end_of_first_byte + 2):
                    result.append(c(0x20))
                    current_result_array_index += 1
            elif control_byte == 0xF0:
                for _ in xrange(end_of_first_byte + 2):
                    result.append(c(0x00))
                    current_result_array_index += 1
            else:
                self.parent.logger.error('unknown control byte: %s',
                                         control_byte)
            i += 1
        result = b''.join(result)
        if len(result) != result_length:
            self.parent.logger.error('unexpected result length: %d != %d' %
                                     (len(result), result_length))
        return result


class RDCDecompressor(Decompressor):
    """
    Decompresses data using the Ross Data Compression algorithm
    http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/
    articles/CUJ/1992/9210/ross/ross.htm
    """
    def decompress_row(self, offset, length, result_length, page):
        src_row = [self.to_ord(x) for x in page[offset:offset + length]]
        out_row = [0] * result_length
        ctrl_mask = 0
        ctrl_bits = 0
        src_offset = 0
        out_offset = 0
        # process each item in src_row
        while src_offset < (len(src_row) - 2):
            # get new load of control bits if needed
            ctrl_mask = ctrl_mask >> 1
            if ctrl_mask == 0:
                ctrl_bits = (src_row[src_offset] << 8) +\
                    src_row[src_offset + 1]
                src_offset += 2
                ctrl_mask = 0x8000
            # just copy this char if control bit is zero
            if (ctrl_bits & ctrl_mask) == 0:
                out_row[out_offset] = src_row[src_offset]
                out_offset += 1
                src_offset += 1
                continue
            # undo the compression code
            cmd = (src_row[src_offset] >> 4) & 0x0F
            cnt = src_row[src_offset] & 0x0F
            src_offset += 1
            if cmd == 0:  # short rle
                cnt += 3
                for k in xrange(cnt):
                    out_row[out_offset + k] = src_row[src_offset]
                out_offset += cnt
                src_offset += 1
            elif cmd == 1:  # long rle
                cnt += src_row[src_offset] << 4
                cnt += 19
                src_offset += 1
                for k in xrange(cnt):
                    out_row[out_offset + k] = src_row[src_offset]
                out_offset += cnt
                src_offset += 1
            elif cmd == 2:  # long pattern
                ofs = cnt + 3
                ofs += src_row[src_offset] << 4
                src_offset += 1
                cnt = src_row[src_offset]
                src_offset += 1
                cnt += 16
                for k in xrange(cnt):
                    out_row[out_offset + k] = out_row[out_offset - ofs + k]
                out_offset += cnt
            elif cmd >= 3 and cmd <= 15:  # short pattern
                ofs = cnt + 3
                ofs += src_row[src_offset] << 4
                src_offset += 1
                for k in xrange(cmd):
                    out_row[out_offset + k] = out_row[out_offset - ofs + k]
                out_offset += cmd
            else:
                self.parent.logger.error(
                    'unknown marker %s at offset %s', src_row[src_offset],
                    src_offset
                )
                break
        return b''.join([self.to_chr(x) for x in out_row])


class SAS7BDAT(object):
    """
    SAS7BDAT(path[, log_level[, extra_time_format_strings[, \
extra_date_time_format_strings[, extra_date_format_strings[, \
fh=fh]]]]]) -> \
SAS7BDAT object

    Open a SAS7BDAT file or use an existing file handle.
    The log level is a standard logging level (defaults to logging.INFO).
    If your sas7bdat file uses non-standard format strings for time,
    datetime, or date values, pass those strings into the constructor
    using the appropriate kwarg.
    The file will be opened from the path supplied, unless a file handle
    is supplied. The file handle should be opened in binary mode for
    correct operation.
    """
    _open_files = []
    RLE_COMPRESSION = b'SASYZCRL'
    RDC_COMPRESSION = b'SASYZCR2'
    COMPRESSION_LITERALS = set([
        RLE_COMPRESSION, RDC_COMPRESSION
    ])
    DECOMPRESSORS = {
        RLE_COMPRESSION: RLEDecompressor,
        RDC_COMPRESSION: RDCDecompressor
    }
    TIME_FORMAT_STRINGS = set([
        'TIME'
    ])
    DATE_TIME_FORMAT_STRINGS = set([
        'DATETIME'
    ])
    DATE_FORMAT_STRINGS = set([
        'YYMMDD', 'MMDDYY', 'DDMMYY', 'DATE', 'JULIAN', 'MONYY'
    ])

    def __init__(self, path, log_level=logging.INFO,
                 extra_time_format_strings=None,
                 extra_date_time_format_strings=None,
                 extra_date_format_strings=None,
                 skip_header=False,
                 encoding='utf8',
                 encoding_errors='ignore',
                 align_correction=True,
                 fh=None, strip_whitespace_from_strings=True):
        """
        x.__init__(...) initializes x; see help(type(x)) for signature
        """
        if log_level == logging.DEBUG:
            sys.excepthook = _debug
        self.path = path
        self.endianess = None
        self.u64 = False
        self.logger = self._make_logger(level=log_level)
        self._update_format_strings(
            self.TIME_FORMAT_STRINGS, extra_time_format_strings
        )
        self._update_format_strings(
            self.DATE_TIME_FORMAT_STRINGS, extra_date_time_format_strings
        )
        self._update_format_strings(
            self.DATE_FORMAT_STRINGS, extra_date_format_strings
        )
        self.skip_header = skip_header
        self.strip_whitespace_from_strings = strip_whitespace_from_strings
        self.encoding = encoding
        self.encoding_errors = encoding_errors
        self.align_correction = align_correction
        self._file = fh or open(self.path, 'rb')
        self._open_files.append(self._file)
        self.cached_page = None
        self.current_page_type = None
        self.current_page_block_count = None
        self.current_page_subheaders_count = None
        self.current_file_position = 0
        self.current_page_data_subheader_pointers = []
        self.current_row = []
        self.column_names_strings = []
        self.column_names = []
        self.column_types = []
        self.column_data_offsets = []
        self.column_data_lengths = []
        self.columns = []
        self.header = SASHeader(self)
        self.properties = self.header.properties
        self.header.parse_metadata()
        self.logger.debug('\n%s', self.header)
        self._iter = self.readlines()

    def __repr__(self):
        """
        x.__repr__() <==> repr(x)
        """
        return 'SAS7BDAT file: %s' % os.path.basename(self.path)

    def __enter__(self):
        """
        __enter__() -> self.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        __exit__(*excinfo) -> None. Closes the file.
        """
        self.close()

    def __iter__(self):
        """
        x.__iter__() <==> iter(x)
        """
        return self.readlines()

    def _update_format_strings(self, var, format_strings):
        if format_strings is not None:
            if isinstance(format_strings, str):
                var.add(format_strings)
            elif isinstance(format_strings, (set, list, tuple)):
                var.update(set(format_strings))
            else:
                raise NotImplementedError

    def close(self):
        """
        close() -> None or (perhaps) an integer. Close the file.

        A closed file cannot be used for further I/O operations.
        close() may be called more than once without error.
        Some kinds of file objects (for example, opened by popen())
        may return an exit status upon closing.
        """
        return self._file.close()

    def _make_logger(self, level=logging.INFO):
        """
        Create a custom logger with the specified properties.
        """
        logger = logging.getLogger(self.path)
        logger.setLevel(level)
        fmt = '%(message)s'
        stream_handler = logging.StreamHandler()
        if platform.system() != 'Windows':
            stream_handler.emit = _get_color_emit(
                os.path.basename(self.path),
                stream_handler.emit
            )
        else:
            fmt = '[%s] %%(message)s' % os.path.basename(self.path)
        formatter = logging.Formatter(fmt, '%y-%m-%d %H:%M:%S')
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)
        return logger

    def _read_bytes(self, offsets_to_lengths):
        result = {}
        if not self.cached_page:
            for offset, length in six.iteritems(offsets_to_lengths):
                skipped = 0
                while skipped < (offset - self.current_file_position):
                    seek = offset - self.current_file_position - skipped
                    skipped += seek
                    self._file.seek(seek, 0)
                tmp = self._file.read(length)
                if len(tmp) < length:
                    self.logger.error(
                        'failed to read %s bytes from sas7bdat file', length
                    )
                self.current_file_position = offset + length
                result[offset] = tmp
        else:
            for offset, length in six.iteritems(offsets_to_lengths):
                result[offset] = self.cached_page[offset:offset + length]
        return result
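
    # A hedged sketch of the contract above: _read_bytes({200: 4}) returns
    # {200: b'...'}, the four bytes at offset 200, taken from the cached
    # page when one is loaded and read from the file otherwise.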
    def _read_val(self, fmt, raw_bytes, size):
        if fmt == 'i' and self.u64 and size == 8:
            fmt = 'q'
        newfmt = fmt
        if fmt == 's':
            newfmt = '%ds' % min(size, len(raw_bytes))
        elif fmt in set(['number', 'datetime', 'date', 'time']):
            newfmt = 'd'
            if len(raw_bytes) != size:
                size = len(raw_bytes)
            if size < 8:
                if self.endianess == 'little':
                    raw_bytes = b''.join([b'\x00' * (8 - size), raw_bytes])
                else:
                    raw_bytes += b'\x00' * (8 - size)
                size = 8
        if self.endianess == 'big':
            newfmt = '>%s' % newfmt
        else:
            newfmt = '<%s' % newfmt
        val = struct.unpack(str(newfmt), raw_bytes[:size])[0]
        if fmt == 's':
            val = val.strip(b'\x00')
            if self.strip_whitespace_from_strings:
                val = val.strip()
        elif math.isnan(val):
            val = None
        elif fmt == 'datetime':
            val = datetime(1960, 1, 1) + timedelta(seconds=val)
        elif fmt == 'time':
            val = (datetime(1960, 1, 1) + timedelta(seconds=val)).time()
        elif fmt == 'date':
            try:
                val = (datetime(1960, 1, 1) + timedelta(days=val)).date()
            except OverflowError:
                # Some data sets flagged with a date format are actually
                # stored as datetime values
                val = datetime(1960, 1, 1) + timedelta(seconds=val)
        return val
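
    # SAS's epoch is 1960-01-01: datetimes are stored as seconds and dates
    # as days since that date, which is what the timedelta arithmetic in
    # _read_val above decodes. E.g. a raw value of 86400.0 under a DATETIME
    # format becomes datetime(1960, 1, 2, 0, 0).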
    def readlines(self):
        """
        readlines() -> generator which yields lists of values, each a line
        from the file.

        Possible values in the list are None, string, float,
        datetime.datetime, datetime.date, and datetime.time.
        """
        bit_offset = self.header.PAGE_BIT_OFFSET
        subheader_pointer_length = self.header.SUBHEADER_POINTER_LENGTH
        row_count = self.header.properties.row_count
        current_row_in_file_index = 0
        current_row_on_page_index = 0
        if not self.skip_header:
            yield [x.name.decode(self.encoding, self.encoding_errors)
                   for x in self.columns]
        if not self.cached_page:
            self._file.seek(self.properties.header_length)
            self._read_next_page()
        while current_row_in_file_index < row_count:
            current_row_in_file_index += 1
            current_page_type = self.current_page_type
            if current_page_type == self.header.PAGE_META_TYPE:
                try:
                    current_subheader_pointer =\
                        self.current_page_data_subheader_pointers[
                            current_row_on_page_index
                        ]
                except IndexError:
                    self._read_next_page()
                    current_row_on_page_index = 0
                else:
                    current_row_on_page_index += 1
                    cls = self.header.SUBHEADER_INDEX_TO_CLASS.get(
                        self.header.DATA_SUBHEADER_INDEX
                    )
                    if cls is None:
                        raise NotImplementedError
                    cls(self).process_subheader(
                        current_subheader_pointer.offset,
                        current_subheader_pointer.length
                    )
                    if current_row_on_page_index ==\
                            len(self.current_page_data_subheader_pointers):
                        self._read_next_page()
                        current_row_on_page_index = 0
            elif current_page_type in self.header.PAGE_MIX_TYPE:
                if self.align_correction:
                    align_correction = (
                        bit_offset + self.header.SUBHEADER_POINTERS_OFFSET +
                        self.current_page_subheaders_count *
                        subheader_pointer_length
                    ) % 8
                else:
                    align_correction = 0
                offset = (
                    bit_offset + self.header.SUBHEADER_POINTERS_OFFSET +
                    align_correction + self.current_page_subheaders_count *
                    subheader_pointer_length + current_row_on_page_index *
                    self.properties.row_length
                )
                try:
                    self.current_row = self._process_byte_array_with_data(
                        offset,
                        self.properties.row_length
                    )
                except:
                    self.logger.exception(
                        'failed to process data (you might want to try '
                        'passing align_correction=%s to the SAS7BDAT '
                        'constructor)' % (not self.align_correction)
                    )
                    raise
                current_row_on_page_index += 1
                if current_row_on_page_index == min(
                        self.properties.row_count,
                        self.properties.mix_page_row_count
                ):
                    self._read_next_page()
                    current_row_on_page_index = 0
            elif current_page_type == self.header.PAGE_DATA_TYPE:
                self.current_row = self._process_byte_array_with_data(
                    bit_offset + self.header.SUBHEADER_POINTERS_OFFSET +
                    current_row_on_page_index *
                    self.properties.row_length,
                    self.properties.row_length
                )
                current_row_on_page_index += 1
                if current_row_on_page_index == self.current_page_block_count:
                    self._read_next_page()
                    current_row_on_page_index = 0
            else:
                self.logger.error('unknown page type: %s', current_page_type)
            yield self.current_row

    def _read_next_page(self):
        self.current_page_data_subheader_pointers = []
        self.cached_page = self._file.read(self.properties.page_length)
        if len(self.cached_page) <= 0:
            return
        if len(self.cached_page) != self.properties.page_length:
            self.logger.error(
                'failed to read complete page from file (read %s of %s bytes)',
                len(self.cached_page), self.properties.page_length
            )
        self.header.read_page_header()
        if self.current_page_type == self.header.PAGE_META_TYPE:
            self.header.process_page_metadata()
        if self.current_page_type not in [
            self.header.PAGE_META_TYPE,
            self.header.PAGE_DATA_TYPE
        ] + self.header.PAGE_MIX_TYPE:
            self._read_next_page()

    def _process_byte_array_with_data(self, offset, length):
        row_elements = []
        if self.properties.compression and length < self.properties.row_length:
            decompressor = self.DECOMPRESSORS.get(
                self.properties.compression
            )
            source = decompressor(self).decompress_row(
                offset, length, self.properties.row_length,
                self.cached_page
            )
            offset = 0
        else:
            source = self.cached_page
        for i in xrange(self.properties.column_count):
            length = self.column_data_lengths[i]
            if length == 0:
                break
            start = offset + self.column_data_offsets[i]
            end = offset + self.column_data_offsets[i] + length
            temp = source[start:end]
            if self.columns[i].type == 'number':
                if self.column_data_lengths[i] <= 2:
                    row_elements.append(self._read_val(
                        'h', temp, length
                    ))
                else:
                    fmt = self.columns[i].format
                    if not fmt:
                        row_elements.append(self._read_val(
                            'number', temp, length
                        ))
                    elif fmt in self.TIME_FORMAT_STRINGS:
                        row_elements.append(self._read_val(
                            'time', temp, length
                        ))
                    elif fmt in self.DATE_TIME_FORMAT_STRINGS:
                        row_elements.append(self._read_val(
                            'datetime', temp, length
                        ))
                    elif fmt in self.DATE_FORMAT_STRINGS:
                        row_elements.append(self._read_val(
                            'date', temp, length
                        ))
                    else:
                        row_elements.append(self._read_val(
                            'number', temp, length
                        ))
            else:  # string
                row_elements.append(self._read_val(
                    's', temp, length
                ).decode(self.encoding, self.encoding_errors))
        return row_elements

    def convert_file(self, out_file, delimiter=',', step_size=100000,
                     encoding=None):
        """
        convert_file(out_file[, delimiter[, step_size]]) -> None

        A convenience method to convert a SAS7BDAT file into a delimited
        text file. Defaults to comma separated. The step_size parameter
        is used to show progress on longer running conversions.
        """
        delimiter = str(delimiter)
        self.logger.debug('saving as: %s', out_file)
        out_f = None
        success = True
        try:
            if out_file == '-':
                out_f = sys.stdout
            else:
                out_f = open(out_file, 'w', encoding=encoding)
            out = csv.writer(out_f, lineterminator='\n', delimiter=delimiter)
            i = 0
            for i, line in enumerate(self, 1):
                if len(line) != (self.properties.column_count or 0):
                    msg = 'parsed line into %s columns but was ' \
                          'expecting %s.\n%s' %\
                          (len(line), self.properties.column_count, line)
                    self.logger.error(msg)
                    success = False
                    if self.logger.level == logging.DEBUG:
                        raise ParseError(msg)
                    break
                if not i % step_size:
                    self.logger.info(
                        '%.1f%% complete',
                        float(i) / self.properties.row_count * 100.0
                    )
                try:
                    out.writerow(line)
                except IOError:
                    self.logger.warn('wrote %s lines before interruption', i)
                    success = False
                    break
            self.logger.info('\u27f6 [%s] wrote %s of %s lines',
                             os.path.basename(out_file), i - 1,
                             self.properties.row_count or 0)
        except Exception as e:
            self.logger.exception(e)
            success = False
        finally:
            if out_f is not None:
                out_f.close()
        return success

    def to_data_frame(self):
        """
        to_data_frame() -> pandas.DataFrame object

        A convenience method to convert a SAS7BDAT file into a pandas
        DataFrame.
        """
        import pandas as pd
        data = list(self.readlines())
        return pd.DataFrame(data[1:], columns=data[0])
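
# A to_data_frame() sketch (hypothetical path; requires pandas): the header
# row yielded by readlines() supplies the column names and the remaining
# rows become the DataFrame body.
#
#     df = SAS7BDAT('example.sas7bdat').to_data_frame()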


class Column(object):
    def __init__(self, col_id, name, label, col_format, col_type, length):
        self.col_id = col_id
        self.name = name
        self.label = label
        self.format = col_format.decode("utf-8")
        self.type = col_type
        self.length = length

    def __repr__(self):
        return self.name


class SubheaderPointer(object):
    def __init__(self, offset=None, length=None, compression=None,
                 p_type=None):
        self.offset = offset
        self.length = length
        self.compression = compression
        self.type = p_type


class ProcessingSubheader(object):
    TEXT_BLOCK_SIZE_LENGTH = 2
    ROW_LENGTH_OFFSET_MULTIPLIER = 5
    ROW_COUNT_OFFSET_MULTIPLIER = 6
    COL_COUNT_P1_MULTIPLIER = 9
    COL_COUNT_P2_MULTIPLIER = 10
    ROW_COUNT_ON_MIX_PAGE_OFFSET_MULTIPLIER = 15  # rowcountfp
    COLUMN_NAME_POINTER_LENGTH = 8
    COLUMN_NAME_TEXT_SUBHEADER_OFFSET = 0
    COLUMN_NAME_TEXT_SUBHEADER_LENGTH = 2
    COLUMN_NAME_OFFSET_OFFSET = 2
    COLUMN_NAME_OFFSET_LENGTH = 2
    COLUMN_NAME_LENGTH_OFFSET = 4
    COLUMN_NAME_LENGTH_LENGTH = 2
    COLUMN_DATA_OFFSET_OFFSET = 8
    COLUMN_DATA_LENGTH_OFFSET = 8
    COLUMN_DATA_LENGTH_LENGTH = 4
    COLUMN_TYPE_OFFSET = 14
    COLUMN_TYPE_LENGTH = 1
    COLUMN_FORMAT_TEXT_SUBHEADER_INDEX_OFFSET = 22
    COLUMN_FORMAT_TEXT_SUBHEADER_INDEX_LENGTH = 2
    COLUMN_FORMAT_OFFSET_OFFSET = 24
    COLUMN_FORMAT_OFFSET_LENGTH = 2
    COLUMN_FORMAT_LENGTH_OFFSET = 26
    COLUMN_FORMAT_LENGTH_LENGTH = 2
    COLUMN_LABEL_TEXT_SUBHEADER_INDEX_OFFSET = 28
    COLUMN_LABEL_TEXT_SUBHEADER_INDEX_LENGTH = 2
    COLUMN_LABEL_OFFSET_OFFSET = 30
    COLUMN_LABEL_OFFSET_LENGTH = 2
    COLUMN_LABEL_LENGTH_OFFSET = 32
    COLUMN_LABEL_LENGTH_LENGTH = 2
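
    # A hedged reading of the constants above: the *_MULTIPLIER values are
    # scaled by int_length (4 bytes, or 8 for u64 files) to locate fields
    # in the row size subheader, while the remaining values are raw byte
    # offsets and lengths within their respective subheaders.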

    def __init__(self, parent):
        self.parent = parent
        self.logger = parent.logger
        self.properties = parent.header.properties
        self.int_length = 8 if self.properties.u64 else 4

    def process_subheader(self, offset, length):
        raise NotImplementedError


class RowSizeSubheader(ProcessingSubheader):
    def process_subheader(self, offset, length):
        int_len = self.int_length
        lcs = offset + (682 if self.properties.u64 else 354)
        lcp = offset + (706 if self.properties.u64 else 378)
        vals = self.parent._read_bytes({
            offset + self.ROW_LENGTH_OFFSET_MULTIPLIER * int_len: int_len,
            offset + self.ROW_COUNT_OFFSET_MULTIPLIER * int_len: int_len,
            offset + self.ROW_COUNT_ON_MIX_PAGE_OFFSET_MULTIPLIER * int_len:
                int_len,
            offset + self.COL_COUNT_P1_MULTIPLIER * int_len: int_len,
            offset + self.COL_COUNT_P2_MULTIPLIER * int_len: int_len,
            lcs: 2,
            lcp: 2,
        })
        if self.properties.row_length is not None:
            self.logger.error('found more than one row length subheader')
        if self.properties.row_count is not None:
            self.logger.error('found more than one row count subheader')
        if self.properties.col_count_p1 is not None:
            self.logger.error('found more than one col count p1 subheader')
        if self.properties.col_count_p2 is not None:
            self.logger.error('found more than one col count p2 subheader')
        if self.properties.mix_page_row_count is not None:
            self.logger.error('found more than one mix page row count '
                              'subheader')
        self.properties.row_length = self.parent._read_val(
            'i',
            vals[offset + self.ROW_LENGTH_OFFSET_MULTIPLIER * int_len],
            int_len
        )
        self.properties.row_count = self.parent._read_val(
            'i',
            vals[offset + self.ROW_COUNT_OFFSET_MULTIPLIER * int_len],
            int_len
        )
        self.properties.col_count_p1 = self.parent._read_val(
            'i',
            vals[offset + self.COL_COUNT_P1_MULTIPLIER * int_len],
            int_len
        )
        self.properties.col_count_p2 = self.parent._read_val(
            'i',
            vals[offset + self.COL_COUNT_P2_MULTIPLIER * int_len],
            int_len
        )
        self.properties.mix_page_row_count = self.parent._read_val(
            'i',
            vals[offset + self.ROW_COUNT_ON_MIX_PAGE_OFFSET_MULTIPLIER *
                 int_len],
            int_len
        )
        self.properties.lcs = self.parent._read_val('h', vals[lcs], 2)
        self.properties.lcp = self.parent._read_val('h', vals[lcp], 2)


class ColumnSizeSubheader(ProcessingSubheader):
    def process_subheader(self, offset, length):
        offset += self.int_length
        vals = self.parent._read_bytes({
            offset: self.int_length
        })
        if self.properties.column_count is not None:
            self.logger.error('found more than one column count subheader')
        self.properties.column_count = self.parent._read_val(
            'i', vals[offset], self.int_length
        )
        if self.properties.col_count_p1 + self.properties.col_count_p2 !=\
                self.properties.column_count:
            self.logger.warning('column count mismatch')


class SubheaderCountsSubheader(ProcessingSubheader):
    def process_subheader(self, offset, length):
        pass  # Not sure what to do here yet


class ColumnTextSubheader(ProcessingSubheader):
    def process_subheader(self, offset, length):
        offset += self.int_length
        vals = self.parent._read_bytes({
            offset: self.TEXT_BLOCK_SIZE_LENGTH
        })
        text_block_size = self.parent._read_val(
            'h', vals[offset], self.TEXT_BLOCK_SIZE_LENGTH
        )
        vals = self.parent._read_bytes({
            offset: text_block_size
        })
        self.parent.column_names_strings.append(vals[offset])
        if len(self.parent.column_names_strings) == 1:
            column_name = self.parent.column_names_strings[0]
            compression_literal = None
            for cl in SAS7BDAT.COMPRESSION_LITERALS:
                if cl in column_name:
                    compression_literal = cl
                    break
            self.properties.compression = compression_literal
            offset -= self.int_length
            vals = self.parent._read_bytes({
                offset + (20 if self.properties.u64 else 16): 8
            })
            compression_literal = self.parent._read_val(
                's',
                vals[offset + (20 if self.properties.u64 else 16)],
                8
            ).strip()
            if compression_literal == b'':
                self.properties.lcs = 0
                vals = self.parent._read_bytes({
                    offset + 16 + (20 if self.properties.u64 else 16):
                        self.properties.lcp
                })
                creatorproc = self.parent._read_val(
                    's',
                    vals[offset + 16 + (20 if self.properties.u64 else 16)],
                    self.properties.lcp
                )
                self.properties.creator_proc = creatorproc
            elif compression_literal == SAS7BDAT.RLE_COMPRESSION:
                vals = self.parent._read_bytes({
                    offset + 24 + (20 if self.properties.u64 else 16):
                        self.properties.lcp
                })
                creatorproc = self.parent._read_val(
                    's',
                    vals[offset + 24 + (20 if self.properties.u64 else 16)],
                    self.properties.lcp
                )
                self.properties.creator_proc = creatorproc
            elif self.properties.lcs > 0:
                self.properties.lcp = 0
                vals = self.parent._read_bytes({
                    offset + (20 if self.properties.u64 else 16):
                        self.properties.lcs
                })
                creator = self.parent._read_val(
                    's',
                    vals[offset + (20 if self.properties.u64 else 16)],
                    self.properties.lcs
                )
                self.properties.creator = creator


class ColumnNameSubheader(ProcessingSubheader):
    def process_subheader(self, offset, length):
        offset += self.int_length
        column_name_pointers_count = (length - 2 * self.int_length - 12) // 8
        for i in xrange(column_name_pointers_count):
            text_subheader = (
                offset + self.COLUMN_NAME_POINTER_LENGTH * (i + 1) +
                self.COLUMN_NAME_TEXT_SUBHEADER_OFFSET
            )
            col_name_offset = (
                offset + self.COLUMN_NAME_POINTER_LENGTH * (i + 1) +
                self.COLUMN_NAME_OFFSET_OFFSET
            )
            col_name_length = (
                offset + self.COLUMN_NAME_POINTER_LENGTH * (i + 1) +
                self.COLUMN_NAME_LENGTH_OFFSET
            )
            vals = self.parent._read_bytes({
                text_subheader: self.COLUMN_NAME_TEXT_SUBHEADER_LENGTH,
                col_name_offset: self.COLUMN_NAME_OFFSET_LENGTH,
                col_name_length: self.COLUMN_NAME_LENGTH_LENGTH,
            })
            idx = self.parent._read_val(
                'h', vals[text_subheader],
                self.COLUMN_NAME_TEXT_SUBHEADER_LENGTH
            )
            col_offset = self.parent._read_val(
                'h', vals[col_name_offset],
                self.COLUMN_NAME_OFFSET_LENGTH
            )
            col_len = self.parent._read_val(
                'h', vals[col_name_length],
                self.COLUMN_NAME_LENGTH_LENGTH
            )
            name_str = self.parent.column_names_strings[idx]
            self.parent.column_names.append(
                name_str[col_offset:col_offset + col_len]
            )


class ColumnAttributesSubheader(ProcessingSubheader):
    def process_subheader(self, offset, length):
        int_len = self.int_length
        column_attributes_vectors_count = (
            (length - 2 * int_len - 12) // (int_len + 8)
        )
        for i in xrange(column_attributes_vectors_count):
            col_data_offset = (
                offset + int_len + self.COLUMN_DATA_OFFSET_OFFSET + i *
                (int_len + 8)
            )
            col_data_len = (
                offset + 2 * int_len + self.COLUMN_DATA_LENGTH_OFFSET + i *
                (int_len + 8)
            )
            col_types = (
                offset + 2 * int_len + self.COLUMN_TYPE_OFFSET + i *
                (int_len + 8)
            )
            vals = self.parent._read_bytes({
                col_data_offset: int_len,
                col_data_len: self.COLUMN_DATA_LENGTH_LENGTH,
                col_types: self.COLUMN_TYPE_LENGTH,
            })
            self.parent.column_data_offsets.append(self.parent._read_val(
                'i', vals[col_data_offset], int_len
            ))
            self.parent.column_data_lengths.append(self.parent._read_val(
                'i', vals[col_data_len], self.COLUMN_DATA_LENGTH_LENGTH
            ))
            ctype = self.parent._read_val(
                'b', vals[col_types], self.COLUMN_TYPE_LENGTH
            )
            self.parent.column_types.append(
                'number' if ctype == 1 else 'string'
            )


class FormatAndLabelSubheader(ProcessingSubheader):
    def process_subheader(self, offset, length):
        int_len = self.int_length
        text_subheader_format = (
            offset + self.COLUMN_FORMAT_TEXT_SUBHEADER_INDEX_OFFSET + 3 *
            int_len
        )
        col_format_offset = (
            offset + self.COLUMN_FORMAT_OFFSET_OFFSET + 3 * int_len
        )
        col_format_len = (
            offset + self.COLUMN_FORMAT_LENGTH_OFFSET + 3 * int_len
        )
        text_subheader_label = (
            offset + self.COLUMN_LABEL_TEXT_SUBHEADER_INDEX_OFFSET + 3 *
            int_len
        )
        col_label_offset = (
            offset + self.COLUMN_LABEL_OFFSET_OFFSET + 3 * int_len
        )
        col_label_len = (
            offset + self.COLUMN_LABEL_LENGTH_OFFSET + 3 * int_len
        )
        vals = self.parent._read_bytes({
            text_subheader_format:
                self.COLUMN_FORMAT_TEXT_SUBHEADER_INDEX_LENGTH,
            col_format_offset: self.COLUMN_FORMAT_OFFSET_LENGTH,
            col_format_len: self.COLUMN_FORMAT_LENGTH_LENGTH,
            text_subheader_label:
                self.COLUMN_LABEL_TEXT_SUBHEADER_INDEX_LENGTH,
            col_label_offset: self.COLUMN_LABEL_OFFSET_LENGTH,
            col_label_len: self.COLUMN_LABEL_LENGTH_LENGTH,
        })
        # min used to prevent incorrect data which appear in some files
        format_idx = min(
            self.parent._read_val(
                'h', vals[text_subheader_format],
                self.COLUMN_FORMAT_TEXT_SUBHEADER_INDEX_LENGTH
            ),
            len(self.parent.column_names_strings) - 1
        )
        format_start = self.parent._read_val(
            'h', vals[col_format_offset],
            self.COLUMN_FORMAT_OFFSET_LENGTH
        )
        format_len = self.parent._read_val(
            'h', vals[col_format_len],
            self.COLUMN_FORMAT_LENGTH_LENGTH
        )
        # min used to prevent incorrect data which appear in some files
        label_idx = min(
            self.parent._read_val(
                'h', vals[text_subheader_label],
                self.COLUMN_LABEL_TEXT_SUBHEADER_INDEX_LENGTH,
            ),
            len(self.parent.column_names_strings) - 1
        )
        label_start = self.parent._read_val(
            'h', vals[col_label_offset],
            self.COLUMN_LABEL_OFFSET_LENGTH
        )
        label_len = self.parent._read_val(
            'h', vals[col_label_len],
            self.COLUMN_LABEL_LENGTH_LENGTH
        )
        label_names = self.parent.column_names_strings[label_idx]
        column_label = label_names[label_start:label_start + label_len]
        format_names = self.parent.column_names_strings[format_idx]
        column_format = format_names[format_start:format_start + format_len]
        current_column_number = len(self.parent.columns)
        self.parent.columns.append(
            Column(current_column_number,
                   self.parent.column_names[current_column_number],
                   column_label,
                   column_format,
                   self.parent.column_types[current_column_number],
                   self.parent.column_data_lengths[current_column_number])
        )


class ColumnListSubheader(ProcessingSubheader):
    def process_subheader(self, offset, length):
        pass  # Not sure what to do with this yet


class DataSubheader(ProcessingSubheader):
    def process_subheader(self, offset, length):
        self.parent.current_row = self.parent._process_byte_array_with_data(
            offset, length
        )


class SASProperties(object):
    def __init__(self):
        self.u64 = False
        self.endianess = None
        self.platform = None
        self.name = None
        self.file_type = None
        self.date_created = None
        self.date_modified = None
        self.header_length = None
        self.page_length = None
        self.page_count = None
        self.sas_release = None
        self.server_type = None
        self.os_type = None
        self.os_name = None
        self.compression = None
        self.row_length = None
        self.row_count = None
        self.col_count_p1 = None
        self.col_count_p2 = None
        self.mix_page_row_count = None
        self.lcs = None
        self.lcp = None
        self.creator = None
        self.creator_proc = None
        self.column_count = None
        self.filename = None


class SASHeader(object):
    MAGIC = b'\x00\x00\x00\x00\x00\x00\x00\x00' \
            b'\x00\x00\x00\x00\xc2\xea\x81\x60' \
            b'\xb3\x14\x11\xcf\xbd\x92\x08\x00' \
            b'\x09\xc7\x31\x8c\x18\x1f\x10\x11'
    ROW_SIZE_SUBHEADER_INDEX = 'row_size'
    COLUMN_SIZE_SUBHEADER_INDEX = 'column_size'
    SUBHEADER_COUNTS_SUBHEADER_INDEX = 'subheader_counts'
    COLUMN_TEXT_SUBHEADER_INDEX = 'column_text'
    COLUMN_NAME_SUBHEADER_INDEX = 'column_name'
    COLUMN_ATTRIBUTES_SUBHEADER_INDEX = 'column_attributes'
    FORMAT_AND_LABEL_SUBHEADER_INDEX = 'format_and_label'
    COLUMN_LIST_SUBHEADER_INDEX = 'column_list'
    DATA_SUBHEADER_INDEX = 'data'
    # Subheader signatures, 32 and 64 bit, little and big endian
    SUBHEADER_SIGNATURE_TO_INDEX = {
        b'\xF7\xF7\xF7\xF7': ROW_SIZE_SUBHEADER_INDEX,
        b'\x00\x00\x00\x00\xF7\xF7\xF7\xF7': ROW_SIZE_SUBHEADER_INDEX,
        b'\xF7\xF7\xF7\xF7\x00\x00\x00\x00': ROW_SIZE_SUBHEADER_INDEX,
        b'\xF6\xF6\xF6\xF6': COLUMN_SIZE_SUBHEADER_INDEX,
        b'\x00\x00\x00\x00\xF6\xF6\xF6\xF6': COLUMN_SIZE_SUBHEADER_INDEX,
        b'\xF6\xF6\xF6\xF6\x00\x00\x00\x00': COLUMN_SIZE_SUBHEADER_INDEX,
        b'\x00\xFC\xFF\xFF': SUBHEADER_COUNTS_SUBHEADER_INDEX,
        b'\xFF\xFF\xFC\x00': SUBHEADER_COUNTS_SUBHEADER_INDEX,
        b'\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF': SUBHEADER_COUNTS_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00': SUBHEADER_COUNTS_SUBHEADER_INDEX,
        b'\xFD\xFF\xFF\xFF': COLUMN_TEXT_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFD': COLUMN_TEXT_SUBHEADER_INDEX,
        b'\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF': COLUMN_TEXT_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD': COLUMN_TEXT_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFF': COLUMN_NAME_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF': COLUMN_NAME_SUBHEADER_INDEX,
        b'\xFC\xFF\xFF\xFF': COLUMN_ATTRIBUTES_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFC': COLUMN_ATTRIBUTES_SUBHEADER_INDEX,
        b'\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF': COLUMN_ATTRIBUTES_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC': COLUMN_ATTRIBUTES_SUBHEADER_INDEX,
        b'\xFE\xFB\xFF\xFF': FORMAT_AND_LABEL_SUBHEADER_INDEX,
        b'\xFF\xFF\xFB\xFE': FORMAT_AND_LABEL_SUBHEADER_INDEX,
        b'\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF': FORMAT_AND_LABEL_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE': FORMAT_AND_LABEL_SUBHEADER_INDEX,
        b'\xFE\xFF\xFF\xFF': COLUMN_LIST_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFE': COLUMN_LIST_SUBHEADER_INDEX,
        b'\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF': COLUMN_LIST_SUBHEADER_INDEX,
        b'\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE': COLUMN_LIST_SUBHEADER_INDEX,
    }
    SUBHEADER_INDEX_TO_CLASS = {
        ROW_SIZE_SUBHEADER_INDEX: RowSizeSubheader,
        COLUMN_SIZE_SUBHEADER_INDEX: ColumnSizeSubheader,
        SUBHEADER_COUNTS_SUBHEADER_INDEX: SubheaderCountsSubheader,
        COLUMN_TEXT_SUBHEADER_INDEX: ColumnTextSubheader,
        COLUMN_NAME_SUBHEADER_INDEX: ColumnNameSubheader,
        COLUMN_ATTRIBUTES_SUBHEADER_INDEX: ColumnAttributesSubheader,
        FORMAT_AND_LABEL_SUBHEADER_INDEX: FormatAndLabelSubheader,
        COLUMN_LIST_SUBHEADER_INDEX: ColumnListSubheader,
        DATA_SUBHEADER_INDEX: DataSubheader,
    }
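
    # Dispatch sketch: a subheader pointer's signature is looked up in
    # SUBHEADER_SIGNATURE_TO_INDEX, and the resulting index selects a
    # handler class from SUBHEADER_INDEX_TO_CLASS; e.g. readlines() uses
    # SUBHEADER_INDEX_TO_CLASS.get(DATA_SUBHEADER_INDEX) to get
    # DataSubheader for row data.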
    ALIGN_1_CHECKER_VALUE = b'3'
    ALIGN_1_OFFSET = 32
    ALIGN_1_LENGTH = 1
    ALIGN_1_VALUE = 4
    U64_BYTE_CHECKER_VALUE = b'3'
    ALIGN_2_OFFSET = 35
    ALIGN_2_LENGTH = 1
    ALIGN_2_VALUE = 4
    ENDIANNESS_OFFSET = 37
    ENDIANNESS_LENGTH = 1
    PLATFORM_OFFSET = 39
    PLATFORM_LENGTH = 1
    DATASET_OFFSET = 92
    DATASET_LENGTH = 64
    FILE_TYPE_OFFSET = 156
    FILE_TYPE_LENGTH = 8
    DATE_CREATED_OFFSET = 164
    DATE_CREATED_LENGTH = 8
    DATE_MODIFIED_OFFSET = 172
    DATE_MODIFIED_LENGTH = 8
    HEADER_SIZE_OFFSET = 196
    HEADER_SIZE_LENGTH = 4
    PAGE_SIZE_OFFSET = 200
    PAGE_SIZE_LENGTH = 4
    PAGE_COUNT_OFFSET = 204
    PAGE_COUNT_LENGTH = 4
    SAS_RELEASE_OFFSET = 216
    SAS_RELEASE_LENGTH = 8
    SAS_SERVER_TYPE_OFFSET = 224
    SAS_SERVER_TYPE_LENGTH = 16
    OS_VERSION_NUMBER_OFFSET = 240
    OS_VERSION_NUMBER_LENGTH = 16
    OS_MAKER_OFFSET = 256
    OS_MAKER_LENGTH = 16
    OS_NAME_OFFSET = 272
    OS_NAME_LENGTH = 16
    PAGE_BIT_OFFSET_X86 = 16
    PAGE_BIT_OFFSET_X64 = 32
    SUBHEADER_POINTER_LENGTH_X86 = 12
    SUBHEADER_POINTER_LENGTH_X64 = 24
    PAGE_TYPE_OFFSET = 0
    PAGE_TYPE_LENGTH = 2
    BLOCK_COUNT_OFFSET = 2
    BLOCK_COUNT_LENGTH = 2
    SUBHEADER_COUNT_OFFSET = 4
    SUBHEADER_COUNT_LENGTH = 2
    PAGE_META_TYPE = 0
    PAGE_DATA_TYPE = 256
    PAGE_MIX_TYPE = [512, 640]
    PAGE_AMD_TYPE = 1024
    PAGE_METC_TYPE = 16384
    PAGE_COMP_TYPE = -28672
    PAGE_MIX_DATA_TYPE = PAGE_MIX_TYPE + [PAGE_DATA_TYPE]
    PAGE_META_MIX_AMD = [PAGE_META_TYPE] + PAGE_MIX_TYPE + [PAGE_AMD_TYPE]
    PAGE_ANY = PAGE_META_MIX_AMD +\
        [PAGE_DATA_TYPE, PAGE_METC_TYPE, PAGE_COMP_TYPE]
    SUBHEADER_POINTERS_OFFSET = 8
    TRUNCATED_SUBHEADER_ID = 1
    COMPRESSED_SUBHEADER_ID = 4
    COMPRESSED_SUBHEADER_TYPE = 1

    def __init__(self, parent):
        self.parent = parent
        self.properties = SASProperties()
        self.properties.filename = os.path.basename(parent.path)
        # Check magic number
        h = parent.cached_page = parent._file.read(288)
        if len(h) < 288:
            parent.logger.error('header too short (not a sas7bdat file?)')
            return
        if not self.check_magic_number(h):
            parent.logger.error('magic number mismatch')
            return
        align1 = 0
        align2 = 0
        offsets_and_lengths = {
            self.ALIGN_1_OFFSET: self.ALIGN_1_LENGTH,
            self.ALIGN_2_OFFSET: self.ALIGN_2_LENGTH,
        }
        align_vals = parent._read_bytes(offsets_and_lengths)
        if align_vals[self.ALIGN_1_OFFSET] == self.U64_BYTE_CHECKER_VALUE:
            align2 = self.ALIGN_2_VALUE
            self.properties.u64 = True
        if align_vals[self.ALIGN_2_OFFSET] == self.ALIGN_1_CHECKER_VALUE:
            align1 = self.ALIGN_1_VALUE
        total_align = align1 + align2
        offsets_and_lengths = {
            self.ENDIANNESS_OFFSET: self.ENDIANNESS_LENGTH,
            self.PLATFORM_OFFSET: self.PLATFORM_LENGTH,
            self.DATASET_OFFSET: self.DATASET_LENGTH,
            self.FILE_TYPE_OFFSET: self.FILE_TYPE_LENGTH,
            self.DATE_CREATED_OFFSET + align1: self.DATE_CREATED_LENGTH,
            self.DATE_MODIFIED_OFFSET + align1: self.DATE_MODIFIED_LENGTH,
            self.HEADER_

[Listing truncated here; the full file is available at the repository URL above.]