PageRenderTime 94ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/pandas/core/format.py

http://github.com/pydata/pandas
Python | 2298 lines | 2261 code | 24 blank | 13 comment | 33 complexity | b71d18b5fbb16c4590d0e20f32c22215 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0

Large files are truncated, but you can click here to view the full file

  1. #coding: utf-8
  2. from __future__ import print_function
  3. # pylint: disable=W0141
  4. import sys
  5. import re
  6. from pandas.core.base import PandasObject
  7. from pandas.core.common import adjoin, isnull, notnull
  8. from pandas.core.index import Index, MultiIndex, _ensure_index
  9. from pandas import compat
  10. from pandas.compat import(StringIO, lzip, range, map, zip, reduce, u,
  11. OrderedDict)
  12. from pandas.util.terminal import get_terminal_size
  13. from pandas.core.config import get_option, set_option, reset_option
  14. import pandas.core.common as com
  15. import pandas.lib as lib
  16. from pandas.tslib import iNaT
  17. import numpy as np
  18. import itertools
  19. import csv
  20. from datetime import time
  21. from pandas.tseries.period import PeriodIndex, DatetimeIndex
# Shared "Parameters" / "Returns" documentation text. It is appended to
# DataFrameFormatter.__doc__ when that class body is executed.
docstring_to_string = """
Parameters
----------
frame : DataFrame
object to render
buf : StringIO-like, optional
buffer to write to
columns : sequence, optional
the subset of columns to write; default None writes all columns
col_space : int, optional
the minimum width of each column
header : bool, optional
whether to print column labels, default True
index : bool, optional
whether to print index (row) labels, default True
na_rep : string, optional
string representation of NAN to use, default 'NaN'
formatters : list or dict of one-parameter functions, optional
formatter functions to apply to columns' elements by position or name,
default None. The result of each function must be a unicode string.
List must be of length equal to the number of columns.
float_format : one-parameter function, optional
formatter function to apply to columns' elements if they are floats,
default None. The result of this function must be a unicode string.
sparsify : bool, optional
Set to False for a DataFrame with a hierarchical index to print every
multiindex key at each row, default True
justify : {'left', 'right'}, default None
Left or right-justify the column labels. If None uses the option from
the print configuration (controlled by set_option), 'right' out
of the box.
index_names : bool, optional
Prints the names of the indexes, default True
force_unicode : bool, default False
Always return a unicode result. Deprecated in v0.10.0 as string
formatting is now rendered to unicode by default.
Returns
-------
formatted : string (or unicode, depending on data and options)"""
  61. class CategoricalFormatter(object):
  62. def __init__(self, categorical, buf=None, length=True,
  63. na_rep='NaN', name=False, footer=True):
  64. self.categorical = categorical
  65. self.buf = buf if buf is not None else StringIO(u(""))
  66. self.name = name
  67. self.na_rep = na_rep
  68. self.length = length
  69. self.footer = footer
  70. def _get_footer(self):
  71. footer = ''
  72. if self.name:
  73. name = com.pprint_thing(self.categorical.name,
  74. escape_chars=('\t', '\r', '\n'))
  75. footer += ('Name: %s' % name if self.categorical.name is not None
  76. else '')
  77. if self.length:
  78. if footer:
  79. footer += ', '
  80. footer += "Length: %d" % len(self.categorical)
  81. levheader = 'Levels (%d): ' % len(self.categorical.levels)
  82. # TODO: should max_line_width respect a setting?
  83. levstring = np.array_repr(self.categorical.levels, max_line_width=60)
  84. indent = ' ' * (levstring.find('[') + len(levheader) + 1)
  85. lines = levstring.split('\n')
  86. levstring = '\n'.join([lines[0]] +
  87. [indent + x.lstrip() for x in lines[1:]])
  88. if footer:
  89. footer += ', '
  90. footer += levheader + levstring
  91. return compat.text_type(footer)
  92. def _get_formatted_values(self):
  93. return format_array(np.asarray(self.categorical), None,
  94. float_format=None,
  95. na_rep=self.na_rep)
  96. def to_string(self):
  97. categorical = self.categorical
  98. if len(categorical) == 0:
  99. if self.footer:
  100. return self._get_footer()
  101. else:
  102. return u('')
  103. fmt_values = self._get_formatted_values()
  104. pad_space = 10
  105. result = ['%s' % i for i in fmt_values]
  106. if self.footer:
  107. footer = self._get_footer()
  108. if footer:
  109. result.append(footer)
  110. return compat.text_type(u('\n').join(result))
  111. class SeriesFormatter(object):
  112. def __init__(self, series, buf=None, header=True, length=True,
  113. na_rep='NaN', name=False, float_format=None, dtype=True):
  114. self.series = series
  115. self.buf = buf if buf is not None else StringIO()
  116. self.name = name
  117. self.na_rep = na_rep
  118. self.length = length
  119. self.header = header
  120. if float_format is None:
  121. float_format = get_option("display.float_format")
  122. self.float_format = float_format
  123. self.dtype = dtype
  124. def _get_footer(self):
  125. footer = u('')
  126. if self.name:
  127. if getattr(self.series.index, 'freq', None):
  128. footer += 'Freq: %s' % self.series.index.freqstr
  129. if footer and self.series.name is not None:
  130. footer += ', '
  131. series_name = com.pprint_thing(self.series.name,
  132. escape_chars=('\t', '\r', '\n'))
  133. footer += ("Name: %s" %
  134. series_name) if self.series.name is not None else ""
  135. if self.length:
  136. if footer:
  137. footer += ', '
  138. footer += 'Length: %d' % len(self.series)
  139. if self.dtype:
  140. name = getattr(self.series.dtype, 'name', None)
  141. if name:
  142. if footer:
  143. footer += ', '
  144. footer += 'dtype: %s' % com.pprint_thing(name)
  145. return compat.text_type(footer)
  146. def _get_formatted_index(self):
  147. index = self.series.index
  148. is_multi = isinstance(index, MultiIndex)
  149. if is_multi:
  150. have_header = any(name for name in index.names)
  151. fmt_index = index.format(names=True)
  152. else:
  153. have_header = index.name is not None
  154. fmt_index = index.format(name=True)
  155. return fmt_index, have_header
  156. def _get_formatted_values(self):
  157. return format_array(self.series.values, None,
  158. float_format=self.float_format,
  159. na_rep=self.na_rep)
  160. def to_string(self):
  161. series = self.series
  162. if len(series) == 0:
  163. return u('')
  164. fmt_index, have_header = self._get_formatted_index()
  165. fmt_values = self._get_formatted_values()
  166. maxlen = max(len(x) for x in fmt_index)
  167. pad_space = min(maxlen, 60)
  168. result = ['%s %s'] * len(fmt_values)
  169. for i, (k, v) in enumerate(zip(fmt_index[1:], fmt_values)):
  170. idx = k.ljust(pad_space)
  171. result[i] = result[i] % (idx, v)
  172. if self.header and have_header:
  173. result.insert(0, fmt_index[0])
  174. footer = self._get_footer()
  175. if footer:
  176. result.append(footer)
  177. return compat.text_type(u('\n').join(result))
  178. def _strlen_func():
  179. if compat.PY3: # pragma: no cover
  180. _strlen = len
  181. else:
  182. encoding = get_option("display.encoding")
  183. def _strlen(x):
  184. try:
  185. return len(x.decode(encoding))
  186. except UnicodeError:
  187. return len(x)
  188. return _strlen
  189. class TableFormatter(object):
  190. is_truncated = False
  191. show_dimensions = None
  192. @property
  193. def should_show_dimensions(self):
  194. return self.show_dimensions is True or (self.show_dimensions == 'truncate' and self.is_truncated)
  195. def _get_formatter(self, i):
  196. if isinstance(self.formatters, (list, tuple)):
  197. if com.is_integer(i):
  198. return self.formatters[i]
  199. else:
  200. return None
  201. else:
  202. if com.is_integer(i) and i not in self.columns:
  203. i = self.columns[i]
  204. return self.formatters.get(i, None)
  205. class DataFrameFormatter(TableFormatter):
  206. """
  207. Render a DataFrame
  208. self.to_string() : console-friendly tabular output
  209. self.to_html() : html table
  210. self.to_latex() : LaTeX tabular environment table
  211. """
  212. __doc__ = __doc__ if __doc__ else ''
  213. __doc__ += docstring_to_string
  214. def __init__(self, frame, buf=None, columns=None, col_space=None,
  215. header=True, index=True, na_rep='NaN', formatters=None,
  216. justify=None, float_format=None, sparsify=None,
  217. index_names=True, line_width=None, max_rows=None,
  218. max_cols=None, show_dimensions=False, **kwds):
  219. self.frame = frame
  220. self.buf = buf if buf is not None else StringIO()
  221. self.show_index_names = index_names
  222. if sparsify is None:
  223. sparsify = get_option("display.multi_sparse")
  224. self.sparsify = sparsify
  225. self.float_format = float_format
  226. self.formatters = formatters if formatters is not None else {}
  227. self.na_rep = na_rep
  228. self.col_space = col_space
  229. self.header = header
  230. self.index = index
  231. self.line_width = line_width
  232. self.max_rows = max_rows
  233. self.max_cols = max_cols
  234. self.max_rows_displayed = min(max_rows or len(self.frame),
  235. len(self.frame))
  236. self.show_dimensions = show_dimensions
  237. if justify is None:
  238. self.justify = get_option("display.colheader_justify")
  239. else:
  240. self.justify = justify
  241. self.kwds = kwds
  242. if columns is not None:
  243. self.columns = _ensure_index(columns)
  244. self.frame = self.frame[self.columns]
  245. else:
  246. self.columns = frame.columns
  247. self._chk_truncate()
  248. def _chk_truncate(self):
  249. from pandas.tools.merge import concat
  250. truncate_h = self.max_cols and (len(self.columns) > self.max_cols)
  251. truncate_v = self.max_rows and (len(self.frame) > self.max_rows)
  252. # Cut the data to the information actually printed
  253. max_cols = self.max_cols
  254. max_rows = self.max_rows
  255. frame = self.frame
  256. if truncate_h:
  257. if max_cols > 1:
  258. col_num = (max_cols // 2)
  259. frame = concat( (frame.iloc[:,:col_num],frame.iloc[:,-col_num:]),axis=1 )
  260. else:
  261. col_num = max_cols
  262. frame = frame.iloc[:,:max_cols]
  263. self.tr_col_num = col_num
  264. if truncate_v:
  265. if max_rows > 1:
  266. row_num = max_rows // 2
  267. frame = concat( (frame.iloc[:row_num,:],frame.iloc[-row_num:,:]) )
  268. else:
  269. row_num = max_rows
  270. frame = frame.iloc[:max_rows,:]
  271. self.tr_row_num = row_num
  272. self.tr_frame = frame
  273. self.truncate_h = truncate_h
  274. self.truncate_v = truncate_v
  275. self.is_truncated = self.truncate_h or self.truncate_v
  276. def _to_str_columns(self):
  277. """
  278. Render a DataFrame to a list of columns (as lists of strings).
  279. """
  280. _strlen = _strlen_func()
  281. frame = self.tr_frame
  282. # may include levels names also
  283. str_index = self._get_formatted_index(frame)
  284. str_columns = self._get_formatted_column_labels(frame)
  285. if self.header:
  286. stringified = []
  287. col_headers = frame.columns
  288. for i, c in enumerate(frame):
  289. cheader = str_columns[i]
  290. max_colwidth = max(self.col_space or 0,
  291. *(_strlen(x) for x in cheader))
  292. fmt_values = self._format_col(i)
  293. fmt_values = _make_fixed_width(fmt_values, self.justify,
  294. minimum=max_colwidth)
  295. max_len = max(np.max([_strlen(x) for x in fmt_values]),
  296. max_colwidth)
  297. if self.justify == 'left':
  298. cheader = [x.ljust(max_len) for x in cheader]
  299. else:
  300. cheader = [x.rjust(max_len) for x in cheader]
  301. stringified.append(cheader + fmt_values)
  302. else:
  303. stringified = []
  304. for i, c in enumerate(frame):
  305. formatter = self._get_formatter(i)
  306. fmt_values = self._format_col(i)
  307. fmt_values = _make_fixed_width(fmt_values, self.justify)
  308. stringified.append(fmt_values)
  309. strcols = stringified
  310. if self.index:
  311. strcols.insert(0, str_index)
  312. # Add ... to signal truncated
  313. truncate_h = self.truncate_h
  314. truncate_v = self.truncate_v
  315. if truncate_h:
  316. col_num = self.tr_col_num
  317. col_width = len(strcols[col_num][0]) # infer from column header
  318. strcols.insert(col_num + 1, ['...'.center(col_width)] * (len(str_index)))
  319. if truncate_v:
  320. n_header_rows = len(str_index) - len(frame)
  321. row_num = self.tr_row_num
  322. for ix,col in enumerate(strcols):
  323. cwidth = len(strcols[ix][row_num]) # infer from above row
  324. is_dot_col = False
  325. if truncate_h:
  326. is_dot_col = ix == col_num + 1
  327. if cwidth > 3 or is_dot_col:
  328. my_str = '...'
  329. else:
  330. my_str = '..'
  331. if ix == 0:
  332. dot_str = my_str.ljust(cwidth)
  333. elif is_dot_col:
  334. dot_str = my_str.center(cwidth)
  335. else:
  336. dot_str = my_str.rjust(cwidth)
  337. strcols[ix].insert(row_num + n_header_rows, dot_str)
  338. return strcols
  339. def to_string(self):
  340. """
  341. Render a DataFrame to a console-friendly tabular output.
  342. """
  343. frame = self.frame
  344. if len(frame.columns) == 0 or len(frame.index) == 0:
  345. info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
  346. % (type(self.frame).__name__,
  347. com.pprint_thing(frame.columns),
  348. com.pprint_thing(frame.index)))
  349. text = info_line
  350. else:
  351. strcols = self._to_str_columns()
  352. if self.line_width is None:
  353. text = adjoin(1, *strcols)
  354. else:
  355. text = self._join_multiline(*strcols)
  356. self.buf.writelines(text)
  357. if self.should_show_dimensions:
  358. self.buf.write("\n\n[%d rows x %d columns]"
  359. % (len(frame), len(frame.columns)))
  360. def _join_multiline(self, *strcols):
  361. lwidth = self.line_width
  362. adjoin_width = 1
  363. strcols = list(strcols)
  364. if self.index:
  365. idx = strcols.pop(0)
  366. lwidth -= np.array([len(x) for x in idx]).max() + adjoin_width
  367. col_widths = [np.array([len(x) for x in col]).max()
  368. if len(col) > 0 else 0
  369. for col in strcols]
  370. col_bins = _binify(col_widths, lwidth)
  371. nbins = len(col_bins)
  372. if self.max_rows and len(self.frame) > self.max_rows:
  373. nrows = self.max_rows + 1
  374. else:
  375. nrows = len(self.frame)
  376. str_lst = []
  377. st = 0
  378. for i, ed in enumerate(col_bins):
  379. row = strcols[st:ed]
  380. row.insert(0, idx)
  381. if nbins > 1:
  382. if ed <= len(strcols) and i < nbins - 1:
  383. row.append([' \\'] + [' '] * (nrows - 1))
  384. else:
  385. row.append([' '] * nrows)
  386. str_lst.append(adjoin(adjoin_width, *row))
  387. st = ed
  388. return '\n\n'.join(str_lst)
  389. def to_latex(self, column_format=None, longtable=False):
  390. """
  391. Render a DataFrame to a LaTeX tabular/longtable environment output.
  392. """
  393. self.escape = self.kwds.get('escape', True)
  394. #TODO: column_format is not settable in df.to_latex
  395. def get_col_type(dtype):
  396. if issubclass(dtype.type, np.number):
  397. return 'r'
  398. else:
  399. return 'l'
  400. frame = self.frame
  401. if len(frame.columns) == 0 or len(frame.index) == 0:
  402. info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
  403. % (type(self.frame).__name__,
  404. frame.columns, frame.index))
  405. strcols = [[info_line]]
  406. else:
  407. strcols = self._to_str_columns()
  408. if column_format is None:
  409. dtypes = self.frame.dtypes.values
  410. if self.index:
  411. column_format = 'l%s' % ''.join(map(get_col_type, dtypes))
  412. else:
  413. column_format = '%s' % ''.join(map(get_col_type, dtypes))
  414. elif not isinstance(column_format,
  415. compat.string_types): # pragma: no cover
  416. raise AssertionError('column_format must be str or unicode, not %s'
  417. % type(column_format))
  418. def write(buf, frame, column_format, strcols, longtable=False):
  419. if not longtable:
  420. buf.write('\\begin{tabular}{%s}\n' % column_format)
  421. buf.write('\\toprule\n')
  422. else:
  423. buf.write('\\begin{longtable}{%s}\n' % column_format)
  424. buf.write('\\toprule\n')
  425. nlevels = frame.index.nlevels
  426. for i, row in enumerate(zip(*strcols)):
  427. if i == nlevels:
  428. buf.write('\\midrule\n') # End of header
  429. if longtable:
  430. buf.write('\\endhead\n')
  431. buf.write('\\midrule\n')
  432. buf.write('\\multicolumn{3}{r}{{Continued on next '
  433. 'page}} \\\\\n')
  434. buf.write('\midrule\n')
  435. buf.write('\endfoot\n\n')
  436. buf.write('\\bottomrule\n')
  437. buf.write('\\endlastfoot\n')
  438. if self.escape:
  439. crow = [(x.replace('\\', '\\textbackslash') # escape backslashes first
  440. .replace('_', '\\_')
  441. .replace('%', '\\%')
  442. .replace('$', '\\$')
  443. .replace('#', '\\#')
  444. .replace('{', '\\{')
  445. .replace('}', '\\}')
  446. .replace('~', '\\textasciitilde')
  447. .replace('^', '\\textasciicircum')
  448. .replace('&', '\\&') if x else '{}') for x in row]
  449. else:
  450. crow = [x if x else '{}' for x in row]
  451. buf.write(' & '.join(crow))
  452. buf.write(' \\\\\n')
  453. if not longtable:
  454. buf.write('\\bottomrule\n')
  455. buf.write('\\end{tabular}\n')
  456. else:
  457. buf.write('\\end{longtable}\n')
  458. if hasattr(self.buf, 'write'):
  459. write(self.buf, frame, column_format, strcols, longtable)
  460. elif isinstance(self.buf, compat.string_types):
  461. with open(self.buf, 'w') as f:
  462. write(f, frame, column_format, strcols, longtable)
  463. else:
  464. raise TypeError('buf is not a file name and it has no write '
  465. 'method')
  466. def _format_col(self, i):
  467. frame = self.tr_frame
  468. formatter = self._get_formatter(i)
  469. return format_array(
  470. (frame.iloc[:, i]).get_values(),
  471. formatter, float_format=self.float_format, na_rep=self.na_rep,
  472. space=self.col_space
  473. )
  474. def to_html(self, classes=None):
  475. """
  476. Render a DataFrame to a html table.
  477. """
  478. html_renderer = HTMLFormatter(self, classes=classes,
  479. max_rows=self.max_rows,
  480. max_cols=self.max_cols)
  481. if hasattr(self.buf, 'write'):
  482. html_renderer.write_result(self.buf)
  483. elif isinstance(self.buf, compat.string_types):
  484. with open(self.buf, 'w') as f:
  485. html_renderer.write_result(f)
  486. else:
  487. raise TypeError('buf is not a file name and it has no write '
  488. ' method')
  489. def _get_formatted_column_labels(self,frame):
  490. from pandas.core.index import _sparsify
  491. def is_numeric_dtype(dtype):
  492. return issubclass(dtype.type, np.number)
  493. columns = frame.columns
  494. if isinstance(columns, MultiIndex):
  495. fmt_columns = columns.format(sparsify=False, adjoin=False)
  496. fmt_columns = lzip(*fmt_columns)
  497. dtypes = self.frame.dtypes.values
  498. need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
  499. str_columns = list(zip(*[
  500. [' ' + y if y not in self.formatters and need_leadsp[x]
  501. else y for y in x] for x in fmt_columns]))
  502. if self.sparsify:
  503. str_columns = _sparsify(str_columns)
  504. str_columns = [list(x) for x in zip(*str_columns)]
  505. else:
  506. fmt_columns = columns.format()
  507. dtypes = self.frame.dtypes
  508. need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
  509. str_columns = [[' ' + x
  510. if not self._get_formatter(i) and need_leadsp[x]
  511. else x]
  512. for i, (col, x) in
  513. enumerate(zip(columns, fmt_columns))]
  514. if self.show_index_names and self.has_index_names:
  515. for x in str_columns:
  516. x.append('')
  517. return str_columns
  518. @property
  519. def has_index_names(self):
  520. return _has_names(self.frame.index)
  521. @property
  522. def has_column_names(self):
  523. return _has_names(self.frame.columns)
  524. def _get_formatted_index(self,frame):
  525. # Note: this is only used by to_string(), not by to_html().
  526. index = frame.index
  527. columns = frame.columns
  528. show_index_names = self.show_index_names and self.has_index_names
  529. show_col_names = (self.show_index_names and self.has_column_names)
  530. fmt = self._get_formatter('__index__')
  531. if isinstance(index, MultiIndex):
  532. fmt_index = index.format(sparsify=self.sparsify, adjoin=False,
  533. names=show_index_names,
  534. formatter=fmt)
  535. else:
  536. fmt_index = [index.format(name=show_index_names, formatter=fmt)]
  537. adjoined = adjoin(1, *fmt_index).split('\n')
  538. # empty space for columns
  539. if show_col_names:
  540. col_header = ['%s' % x for x in self._get_column_name_list()]
  541. else:
  542. col_header = [''] * columns.nlevels
  543. if self.header:
  544. return col_header + adjoined
  545. else:
  546. return adjoined
  547. def _get_column_name_list(self):
  548. names = []
  549. columns = self.frame.columns
  550. if isinstance(columns, MultiIndex):
  551. names.extend('' if name is None else name
  552. for name in columns.names)
  553. else:
  554. names.append('' if columns.name is None else columns.name)
  555. return names
  556. class HTMLFormatter(TableFormatter):
  557. indent_delta = 2
  558. def __init__(self, formatter, classes=None, max_rows=None, max_cols=None):
  559. self.fmt = formatter
  560. self.classes = classes
  561. self.frame = self.fmt.frame
  562. self.columns = self.fmt.tr_frame.columns
  563. self.elements = []
  564. self.bold_rows = self.fmt.kwds.get('bold_rows', False)
  565. self.escape = self.fmt.kwds.get('escape', True)
  566. self.max_rows = max_rows or len(self.fmt.frame)
  567. self.max_cols = max_cols or len(self.fmt.columns)
  568. self.show_dimensions = self.fmt.show_dimensions
  569. self.is_truncated = self.max_rows < len(self.fmt.frame) or self.max_cols < len(self.fmt.columns)
  570. def write(self, s, indent=0):
  571. rs = com.pprint_thing(s)
  572. self.elements.append(' ' * indent + rs)
  573. def write_th(self, s, indent=0, tags=None):
  574. if (self.fmt.col_space is not None
  575. and self.fmt.col_space > 0):
  576. tags = (tags or "")
  577. tags += 'style="min-width: %s;"' % self.fmt.col_space
  578. return self._write_cell(s, kind='th', indent=indent, tags=tags)
  579. def write_td(self, s, indent=0, tags=None):
  580. return self._write_cell(s, kind='td', indent=indent, tags=tags)
  581. def _write_cell(self, s, kind='td', indent=0, tags=None):
  582. if tags is not None:
  583. start_tag = '<%s %s>' % (kind, tags)
  584. else:
  585. start_tag = '<%s>' % kind
  586. if self.escape:
  587. # escape & first to prevent double escaping of &
  588. esc = OrderedDict(
  589. [('&', r'&amp;'), ('<', r'&lt;'), ('>', r'&gt;')]
  590. )
  591. else:
  592. esc = {}
  593. rs = com.pprint_thing(s, escape_chars=esc)
  594. self.write(
  595. '%s%s</%s>' % (start_tag, rs, kind), indent)
  596. def write_tr(self, line, indent=0, indent_delta=4, header=False,
  597. align=None, tags=None, nindex_levels=0):
  598. if tags is None:
  599. tags = {}
  600. if align is None:
  601. self.write('<tr>', indent)
  602. else:
  603. self.write('<tr style="text-align: %s;">' % align, indent)
  604. indent += indent_delta
  605. for i, s in enumerate(line):
  606. val_tag = tags.get(i, None)
  607. if header or (self.bold_rows and i < nindex_levels):
  608. self.write_th(s, indent, tags=val_tag)
  609. else:
  610. self.write_td(s, indent, tags=val_tag)
  611. indent -= indent_delta
  612. self.write('</tr>', indent)
  613. def write_result(self, buf):
  614. indent = 0
  615. frame = self.frame
  616. _classes = ['dataframe'] # Default class.
  617. if self.classes is not None:
  618. if isinstance(self.classes, str):
  619. self.classes = self.classes.split()
  620. if not isinstance(self.classes, (list, tuple)):
  621. raise AssertionError(('classes must be list or tuple, '
  622. 'not %s') % type(self.classes))
  623. _classes.extend(self.classes)
  624. self.write('<table border="1" class="%s">' % ' '.join(_classes),
  625. indent)
  626. indent += self.indent_delta
  627. indent = self._write_header(indent)
  628. indent = self._write_body(indent)
  629. self.write('</table>', indent)
  630. if self.should_show_dimensions:
  631. by = chr(215) if compat.PY3 else unichr(215) # ×
  632. self.write(u('<p>%d rows %s %d columns</p>') %
  633. (len(frame), by, len(frame.columns)))
  634. _put_lines(buf, self.elements)
  635. def _write_header(self, indent):
  636. truncate_h = self.fmt.truncate_h
  637. row_levels = self.frame.index.nlevels
  638. if not self.fmt.header:
  639. # write nothing
  640. return indent
  641. def _column_header():
  642. if self.fmt.index:
  643. row = [''] * (self.frame.index.nlevels - 1)
  644. else:
  645. row = []
  646. if isinstance(self.columns, MultiIndex):
  647. if self.fmt.has_column_names and self.fmt.index:
  648. row.append(single_column_table(self.columns.names))
  649. else:
  650. row.append('')
  651. style = "text-align: %s;" % self.fmt.justify
  652. row.extend([single_column_table(c, self.fmt.justify, style) for
  653. c in self.columns])
  654. else:
  655. if self.fmt.index:
  656. row.append(self.columns.name or '')
  657. row.extend(self.columns)
  658. return row
  659. self.write('<thead>', indent)
  660. row = []
  661. indent += self.indent_delta
  662. if isinstance(self.columns, MultiIndex):
  663. template = 'colspan="%d" halign="left"'
  664. if self.fmt.sparsify:
  665. # GH3547
  666. sentinel = com.sentinel_factory()
  667. else:
  668. sentinel = None
  669. levels = self.columns.format(sparsify=sentinel,
  670. adjoin=False, names=False)
  671. level_lengths = _get_level_lengths(levels, sentinel)
  672. inner_lvl = len(level_lengths) - 1
  673. for lnum, (records, values) in enumerate(zip(level_lengths,
  674. levels)):
  675. if truncate_h:
  676. # modify the header lines
  677. ins_col = self.fmt.tr_col_num
  678. if self.fmt.sparsify:
  679. recs_new = {}
  680. # Increment tags after ... col.
  681. for tag,span in list(records.items()):
  682. if tag >= ins_col:
  683. recs_new[tag + 1] = span
  684. elif tag + span > ins_col:
  685. recs_new[tag] = span + 1
  686. if lnum == inner_lvl:
  687. values = values[:ins_col] + (u('...'),) + \
  688. values[ins_col:]
  689. else: # sparse col headers do not receive a ...
  690. values = values[:ins_col] + \
  691. (values[ins_col - 1],) + values[ins_col:]
  692. else:
  693. recs_new[tag] = span
  694. # if ins_col lies between tags, all col headers get ...
  695. if tag + span == ins_col:
  696. recs_new[ins_col] = 1
  697. values = values[:ins_col] + (u('...'),) + \
  698. values[ins_col:]
  699. records = recs_new
  700. inner_lvl = len(level_lengths) - 1
  701. if lnum == inner_lvl:
  702. records[ins_col] = 1
  703. else:
  704. recs_new = {}
  705. for tag,span in list(records.items()):
  706. if tag >= ins_col:
  707. recs_new[tag + 1] = span
  708. else:
  709. recs_new[tag] = span
  710. recs_new[ins_col] = 1
  711. records = recs_new
  712. values = values[:ins_col] + [u('...')] + values[ins_col:]
  713. name = self.columns.names[lnum]
  714. row = [''] * (row_levels - 1) + ['' if name is None
  715. else com.pprint_thing(name)]
  716. tags = {}
  717. j = len(row)
  718. for i, v in enumerate(values):
  719. if i in records:
  720. if records[i] > 1:
  721. tags[j] = template % records[i]
  722. else:
  723. continue
  724. j += 1
  725. row.append(v)
  726. self.write_tr(row, indent, self.indent_delta, tags=tags,
  727. header=True)
  728. else:
  729. col_row = _column_header()
  730. align = self.fmt.justify
  731. if truncate_h:
  732. ins_col = row_levels + self.fmt.tr_col_num
  733. col_row.insert(ins_col, '...')
  734. self.write_tr(col_row, indent, self.indent_delta, header=True,
  735. align=align)
  736. if self.fmt.has_index_names:
  737. row = [
  738. x if x is not None else '' for x in self.frame.index.names
  739. ] + [''] * min(len(self.columns), self.max_cols)
  740. if truncate_h:
  741. ins_col = row_levels + self.fmt.tr_col_num
  742. row.insert(ins_col, '')
  743. self.write_tr(row, indent, self.indent_delta, header=True)
  744. indent -= self.indent_delta
  745. self.write('</thead>', indent)
  746. return indent
  747. def _write_body(self, indent):
  748. self.write('<tbody>', indent)
  749. indent += self.indent_delta
  750. fmt_values = {}
  751. for i in range(min(len(self.columns), self.max_cols)):
  752. fmt_values[i] = self.fmt._format_col(i)
  753. # write values
  754. if self.fmt.index:
  755. if isinstance(self.frame.index, MultiIndex):
  756. self._write_hierarchical_rows(fmt_values, indent)
  757. else:
  758. self._write_regular_rows(fmt_values, indent)
  759. else:
  760. for i in range(len(self.frame)):
  761. row = [fmt_values[j][i] for j in range(len(self.columns))]
  762. self.write_tr(row, indent, self.indent_delta, tags=None)
  763. indent -= self.indent_delta
  764. self.write('</tbody>', indent)
  765. indent -= self.indent_delta
  766. return indent
  767. def _write_regular_rows(self, fmt_values, indent):
  768. truncate_h = self.fmt.truncate_h
  769. truncate_v = self.fmt.truncate_v
  770. ncols = len(self.fmt.tr_frame.columns)
  771. nrows = len(self.fmt.tr_frame)
  772. fmt = self.fmt._get_formatter('__index__')
  773. if fmt is not None:
  774. index_values = self.fmt.tr_frame.index.map(fmt)
  775. else:
  776. index_values = self.fmt.tr_frame.index.format()
  777. for i in range(nrows):
  778. if truncate_v and i == (self.fmt.tr_row_num):
  779. str_sep_row = [ '...' for ele in row ]
  780. self.write_tr(str_sep_row, indent, self.indent_delta, tags=None,
  781. nindex_levels=1)
  782. row = []
  783. row.append(index_values[i])
  784. row.extend(fmt_values[j][i] for j in range(ncols))
  785. if truncate_h:
  786. dot_col_ix = self.fmt.tr_col_num + 1
  787. row.insert(dot_col_ix, '...')
  788. self.write_tr(row, indent, self.indent_delta, tags=None,
  789. nindex_levels=1)
  790. def _write_hierarchical_rows(self, fmt_values, indent):
  791. template = 'rowspan="%d" valign="top"'
  792. truncate_h = self.fmt.truncate_h
  793. truncate_v = self.fmt.truncate_v
  794. frame = self.fmt.tr_frame
  795. ncols = len(frame.columns)
  796. nrows = len(frame)
  797. row_levels = self.frame.index.nlevels
  798. idx_values = frame.index.format(sparsify=False, adjoin=False,
  799. names=False)
  800. idx_values = lzip(*idx_values)
  801. if self.fmt.sparsify:
  802. # GH3547
  803. sentinel = com.sentinel_factory()
  804. levels = frame.index.format(sparsify=sentinel,
  805. adjoin=False, names=False)
  806. level_lengths = _get_level_lengths(levels, sentinel)
  807. inner_lvl = len(level_lengths) - 1
  808. if truncate_v:
  809. # Insert ... row and adjust idx_values and
  810. # level_lengths to take this into account.
  811. ins_row = self.fmt.tr_row_num
  812. for lnum,records in enumerate(level_lengths):
  813. rec_new = {}
  814. for tag,span in list(records.items()):
  815. if tag >= ins_row:
  816. rec_new[tag + 1] = span
  817. elif tag + span > ins_row:
  818. rec_new[tag] = span + 1
  819. dot_row = list(idx_values[ins_row - 1])
  820. dot_row[-1] = u('...')
  821. idx_values.insert(ins_row,tuple(dot_row))
  822. else:
  823. rec_new[tag] = span
  824. # If ins_row lies between tags, all cols idx cols receive ...
  825. if tag + span == ins_row:
  826. rec_new[ins_row] = 1
  827. if lnum == 0:
  828. idx_values.insert(ins_row,tuple([u('...')]*len(level_lengths)))
  829. level_lengths[lnum] = rec_new
  830. level_lengths[inner_lvl][ins_row] = 1
  831. for ix_col in range(len(fmt_values)):
  832. fmt_values[ix_col].insert(ins_row,'...')
  833. nrows += 1
  834. for i in range(nrows):
  835. row = []
  836. tags = {}
  837. sparse_offset = 0
  838. j = 0
  839. for records, v in zip(level_lengths, idx_values[i]):
  840. if i in records:
  841. if records[i] > 1:
  842. tags[j] = template % records[i]
  843. else:
  844. sparse_offset += 1
  845. continue
  846. j += 1
  847. row.append(v)
  848. row.extend(fmt_values[j][i] for j in range(ncols))
  849. if truncate_h:
  850. row.insert(row_levels - sparse_offset + self.fmt.tr_col_num, '...')
  851. self.write_tr(row, indent, self.indent_delta, tags=tags,
  852. nindex_levels=len(levels) - sparse_offset)
  853. else:
  854. for i in range(len(frame)):
  855. idx_values = list(zip(*frame.index.format(sparsify=False,
  856. adjoin=False,
  857. names=False)))
  858. row = []
  859. row.extend(idx_values[i])
  860. row.extend(fmt_values[j][i] for j in range(ncols))
  861. if truncate_h:
  862. row.insert(row_levels + self.fmt.tr_col_num, '...')
  863. self.write_tr(row, indent, self.indent_delta, tags=None,
  864. nindex_levels=frame.index.nlevels)
  865. def _get_level_lengths(levels, sentinel=''):
  866. from itertools import groupby
  867. def _make_grouper():
  868. record = {'count': 0}
  869. def grouper(x):
  870. if x != sentinel:
  871. record['count'] += 1
  872. return record['count']
  873. return grouper
  874. result = []
  875. for lev in levels:
  876. i = 0
  877. f = _make_grouper()
  878. recs = {}
  879. for key, gpr in groupby(lev, f):
  880. values = list(gpr)
  881. recs[i] = len(values)
  882. i += len(values)
  883. result.append(recs)
  884. return result
  885. class CSVFormatter(object):
  886. def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
  887. cols=None, header=True, index=True, index_label=None,
  888. mode='w', nanRep=None, encoding=None, quoting=None,
  889. line_terminator='\n', chunksize=None, engine=None,
  890. tupleize_cols=False, quotechar='"', date_format=None,
  891. doublequote=True, escapechar=None):
  892. self.engine = engine # remove for 0.13
  893. self.obj = obj
  894. if path_or_buf is None:
  895. path_or_buf = StringIO()
  896. self.path_or_buf = path_or_buf
  897. self.sep = sep
  898. self.na_rep = na_rep
  899. self.float_format = float_format
  900. self.header = header
  901. self.index = index
  902. self.index_label = index_label
  903. self.mode = mode
  904. self.encoding = encoding
  905. if quoting is None:
  906. quoting = csv.QUOTE_MINIMAL
  907. self.quoting = quoting
  908. if quoting == csv.QUOTE_NONE:
  909. # prevents crash in _csv
  910. quotechar = None
  911. self.quotechar = quotechar
  912. self.doublequote = doublequote
  913. self.escapechar = escapechar
  914. self.line_terminator = line_terminator
  915. self.date_format = date_format
  916. # GH3457
  917. if not self.obj.columns.is_unique and engine == 'python':
  918. raise NotImplementedError("columns.is_unique == False not "
  919. "supported with engine='python'")
  920. self.tupleize_cols = tupleize_cols
  921. self.has_mi_columns = isinstance(obj.columns, MultiIndex
  922. ) and not self.tupleize_cols
  923. # validate mi options
  924. if self.has_mi_columns:
  925. if cols is not None:
  926. raise TypeError("cannot specify cols with a MultiIndex on the "
  927. "columns")
  928. if cols is not None:
  929. if isinstance(cols, Index):
  930. cols = cols.to_native_types(na_rep=na_rep,
  931. float_format=float_format,
  932. date_format=date_format)
  933. else:
  934. cols = list(cols)
  935. self.obj = self.obj.loc[:, cols]
  936. # update columns to include possible multiplicity of dupes
  937. # and make sure sure cols is just a list of labels
  938. cols = self.obj.columns
  939. if isinstance(cols, Index):
  940. cols = cols.to_native_types(na_rep=na_rep,
  941. float_format=float_format,
  942. date_format=date_format)
  943. else:
  944. cols = list(cols)
  945. # save it
  946. self.cols = cols
  947. # preallocate data 2d list
  948. self.blocks = self.obj._data.blocks
  949. ncols = sum(b.shape[0] for b in self.blocks)
  950. self.data = [None] * ncols
  951. if chunksize is None:
  952. chunksize = (100000 / (len(self.cols) or 1)) or 1
  953. self.chunksize = int(chunksize)
  954. self.data_index = obj.index
  955. if isinstance(obj.index, PeriodIndex):
  956. self.data_index = obj.index.to_timestamp()
  957. if (isinstance(self.data_index, DatetimeIndex) and
  958. date_format is not None):
  959. self.data_index = Index([x.strftime(date_format)
  960. if notnull(x) else ''
  961. for x in self.data_index])
  962. self.nlevels = getattr(self.data_index, 'nlevels', 1)
  963. if not index:
  964. self.nlevels = 0
  965. # original python implem. of df.to_csv
  966. # invoked by df.to_csv(engine=python)
  967. def _helper_csv(self, writer, na_rep=None, cols=None,
  968. header=True, index=True,
  969. index_label=None, float_format=None, date_format=None):
  970. if cols is None:
  971. cols = self.columns
  972. has_aliases = isinstance(header, (tuple, list, np.ndarray))
  973. if has_aliases or header:
  974. if index:
  975. # should write something for index label
  976. if index_label is not False:
  977. if index_label is None:
  978. if isinstance(self.obj.index, MultiIndex):
  979. index_label = []
  980. for i, name in enumerate(self.obj.index.names):
  981. if name is None:
  982. name = ''
  983. index_label.append(name)
  984. else:
  985. index_label = self.obj.index.name
  986. if index_label is None:
  987. index_label = ['']
  988. else:
  989. index_label = [index_label]
  990. elif not isinstance(index_label,
  991. (list, tuple, np.ndarray)):
  992. # given a string for a DF with Index
  993. index_label = [index_label]
  994. encoded_labels = list(index_label)
  995. else:
  996. encoded_labels = []
  997. if has_aliases:
  998. if len(header) != len(cols):
  999. raise ValueError(('Writing %d cols but got %d aliases'
  1000. % (len(cols), len(header))))
  1001. else:
  1002. write_cols = header
  1003. else:
  1004. write_cols = cols
  1005. encoded_cols = list(write_cols)
  1006. writer.writerow(encoded_labels + encoded_cols)
  1007. else:
  1008. encoded_cols = list(cols)
  1009. writer.writerow(encoded_cols)
  1010. if date_format is None:
  1011. date_formatter = lambda x: lib.Timestamp(x)._repr_base
  1012. else:
  1013. def strftime_with_nulls(x):
  1014. x = lib.Timestamp(x)
  1015. if notnull(x):
  1016. return x.strftime(date_format)
  1017. date_formatter = lambda x: strftime_with_nulls(x)
  1018. data_index = self.obj.index
  1019. if isinstance(self.obj.index, PeriodIndex):
  1020. data_index = self.obj.index.to_timestamp()
  1021. if isinstance(data_index, DatetimeIndex) and date_format is not None:
  1022. data_index = Index([date_formatter(x) for x in data_index])
  1023. values = self.obj.copy()
  1024. values.index = data_index
  1025. values.columns = values.columns.to_native_types(
  1026. na_rep=na_rep, float_format=float_format,
  1027. date_format=date_format)
  1028. values = values[cols]
  1029. series = {}
  1030. for k, v in compat.iteritems(values._series):
  1031. series[k] = v.values
  1032. nlevels = getattr(data_index, 'nlevels', 1)
  1033. for j, idx in enumerate(data_index):
  1034. row_fields = []
  1035. if index:
  1036. if nlevels == 1:
  1037. row_fields = [idx]
  1038. else: # handle MultiIndex
  1039. row_fields = list(idx)
  1040. for i, col in enumerate(cols):
  1041. val = series[col][j]
  1042. if lib.checknull(val):
  1043. val = na_rep
  1044. if float_format is not None and com.is_float(val):
  1045. val = float_format % val
  1046. elif isinstance(val, (np.datetime64, lib.Timestamp)):
  1047. val = date_formatter(val)
  1048. row_fields.append(val)
  1049. writer.writerow(row_fields)
  1050. def save(self):
  1051. # create the writer & save
  1052. if hasattr(self.path_or_buf, 'write'):
  1053. f = self.path_or_buf
  1054. close = False
  1055. else:
  1056. f = com._get_handle(self.path_or_buf, self.mode,
  1057. encoding=self.encoding)
  1058. close = True
  1059. try:
  1060. writer_kwargs = dict(lineterminator=self.line_terminator,
  1061. delimiter=self.sep, quoting=self.quoting,
  1062. doublequote=self.doublequote,
  1063. escapechar=self.escapechar,
  1064. quotechar=self.quotechar)
  1065. if self.encoding is not None:
  1066. writer_kwargs['encoding'] = self.encoding
  1067. self.writer = com.UnicodeWriter(f, **writer_kwargs)
  1068. else:
  1069. self.writer = csv.writer(f, **writer_kwargs)
  1070. if self.engine == 'python':
  1071. # to be removed in 0.13
  1072. self._helper_csv(self.writer, na_rep=self.na_rep,
  1073. float_format=self.float_format,
  1074. cols=self.cols, header=self.header,
  1075. index=self.index,
  1076. index_label=self.index_label,
  1077. date_format=self.date_format)
  1078. else:
  1079. self._save()
  1080. finally:
  1081. if close:
  1082. f.close()
  1083. def _save_header(self):
  1084. writer = self.writer
  1085. obj = self.obj
  1086. index_label = self.index_label
  1087. cols = self.cols
  1088. has_mi_columns = self.has_mi_columns
  1089. header = self.header
  1090. encoded_labels = []
  1091. has_aliases = isinstance(header, (tuple, list, np.ndarray))
  1092. if not (has_aliases or self.header):
  1093. return
  1094. if has_aliases:
  1095. if len(header) != len(cols):
  1096. raise ValueError(('Writing %d cols but got %d aliases'
  1097. % (len(cols), len(header))))
  1098. else:
  1099. write_cols = header
  1100. else:
  1101. write_cols = cols
  1102. if self.index:
  1103. # should write something for index label
  1104. if index_label is not False:
  1105. if index_label is None:
  1106. if isinstance(obj.index, MultiIndex):
  1107. index_label = []
  1108. for i, name in enumerate(obj.index.names):
  1109. if name is None:
  1110. name = ''
  1111. index_label.append(name)
  1112. else:
  1113. index_label = obj.index.name
  1114. if index_label is None:
  1115. index_label = ['']
  1116. else:
  1117. index_label = [index_label]
  1118. elif not isinstance(index_label, (list, tuple, np.ndarray)):
  1119. # given a string for a DF with Index
  1120. index_label = [index_label]
  1121. encoded_labels = list(index_label)
  1122. else:
  1123. encoded_labels = []
  1124. if not has_mi_columns:
  1125. encoded_labels += list(write_cols)
  1126. # write out the mi
  1127. if has_mi_columns:
  1128. columns = obj.columns
  1129. # write out the names for each level, then ALL of the values for
  1130. # each level
  1131. for i in range(columns.nlevels):
  1132. # we need at least 1 index column to write our col names
  1133. col_line = []
  1134. if self.index:
  1135. # name is the first column

Large files are truncated, but you can click here to view the full file