/pandas/core/format.py
Python | 2298 lines | 2261 code | 24 blank | 13 comment | 33 complexity | b71d18b5fbb16c4590d0e20f32c22215 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0
Large files are truncated, but you can click here to view the full file
- #coding: utf-8
- from __future__ import print_function
- # pylint: disable=W0141
- import sys
- import re
- from pandas.core.base import PandasObject
- from pandas.core.common import adjoin, isnull, notnull
- from pandas.core.index import Index, MultiIndex, _ensure_index
- from pandas import compat
- from pandas.compat import(StringIO, lzip, range, map, zip, reduce, u,
- OrderedDict)
- from pandas.util.terminal import get_terminal_size
- from pandas.core.config import get_option, set_option, reset_option
- import pandas.core.common as com
- import pandas.lib as lib
- from pandas.tslib import iNaT
- import numpy as np
- import itertools
- import csv
- from datetime import time
- from pandas.tseries.period import PeriodIndex, DatetimeIndex
# Shared description of the rendering options; this string is appended to
# ``DataFrameFormatter.__doc__`` where that class is defined in this module.
- docstring_to_string = """
- Parameters
- ----------
- frame : DataFrame
- object to render
- buf : StringIO-like, optional
- buffer to write to
- columns : sequence, optional
- the subset of columns to write; default None writes all columns
- col_space : int, optional
- the minimum width of each column
- header : bool, optional
- whether to print column labels, default True
- index : bool, optional
- whether to print index (row) labels, default True
- na_rep : string, optional
- string representation of NAN to use, default 'NaN'
- formatters : list or dict of one-parameter functions, optional
- formatter functions to apply to columns' elements by position or name,
- default None. The result of each function must be a unicode string.
- List must be of length equal to the number of columns.
- float_format : one-parameter function, optional
- formatter function to apply to columns' elements if they are floats,
- default None. The result of this function must be a unicode string.
- sparsify : bool, optional
- Set to False for a DataFrame with a hierarchical index to print every
- multiindex key at each row, default True
- justify : {'left', 'right'}, default None
- Left or right-justify the column labels. If None uses the option from
- the print configuration (controlled by set_option), 'right' out
- of the box.
- index_names : bool, optional
- Prints the names of the indexes, default True
- force_unicode : bool, default False
- Always return a unicode result. Deprecated in v0.10.0 as string
- formatting is now rendered to unicode by default.
- Returns
- -------
- formatted : string (or unicode, depending on data and options)"""
class CategoricalFormatter(object):
    """
    Render a Categorical as text: one formatted value per line, optionally
    followed by a footer (name, length and levels).

    Parameters
    ----------
    categorical : Categorical
        object to render
    buf : StringIO-like, optional
        stored on the instance; a fresh StringIO is created when omitted
    length : bool, default True
        whether the footer reports ``Length: <n>``
    na_rep : string, default 'NaN'
        missing-value representation handed to ``format_array``
    name : bool, default False
        whether the footer reports ``Name: <name>``
    footer : bool, default True
        whether ``to_string`` appends the footer at all
    """

    def __init__(self, categorical, buf=None, length=True,
                 na_rep='NaN', name=False, footer=True):
        self.categorical = categorical
        self.buf = buf if buf is not None else StringIO(u(""))
        self.name = name
        self.na_rep = na_rep
        self.length = length
        self.footer = footer

    def _get_footer(self):
        """Build the footer text: optional name and length, then levels."""
        footer = ''

        if self.name:
            name = com.pprint_thing(self.categorical.name,
                                    escape_chars=('\t', '\r', '\n'))
            footer += ('Name: %s' % name if self.categorical.name is not None
                       else '')

        if self.length:
            if footer:
                footer += ', '
            footer += "Length: %d" % len(self.categorical)

        levheader = 'Levels (%d): ' % len(self.categorical.levels)

        # TODO: should max_line_width respect a setting?
        levstring = np.array_repr(self.categorical.levels, max_line_width=60)
        # continuation lines of the array repr are re-indented so that they
        # line up underneath the first level, after the header text
        indent = ' ' * (levstring.find('[') + len(levheader) + 1)
        lines = levstring.split('\n')
        levstring = '\n'.join([lines[0]] +
                              [indent + x.lstrip() for x in lines[1:]])

        if footer:
            footer += ', '
        footer += levheader + levstring

        return compat.text_type(footer)

    def _get_formatted_values(self):
        """Format the categorical's values via ``format_array``."""
        return format_array(np.asarray(self.categorical), None,
                            float_format=None,
                            na_rep=self.na_rep)

    def to_string(self):
        """
        Return the text rendering.

        An empty categorical renders as just the footer (or an empty string
        when the footer is disabled).
        """
        categorical = self.categorical

        if len(categorical) == 0:
            if self.footer:
                return self._get_footer()
            else:
                return u('')

        fmt_values = self._get_formatted_values()

        result = ['%s' % i for i in fmt_values]
        if self.footer:
            footer = self._get_footer()
            if footer:
                result.append(footer)

        return compat.text_type(u('\n').join(result))
class SeriesFormatter(object):
    """
    Render a Series as text: one ``index value`` line per element, an
    optional index-name header line and an optional footer.
    """

    def __init__(self, series, buf=None, header=True, length=True,
                 na_rep='NaN', name=False, float_format=None, dtype=True):
        if buf is None:
            buf = StringIO()
        if float_format is None:
            # fall back to the globally configured float formatter
            float_format = get_option("display.float_format")

        self.series = series
        self.buf = buf
        self.name = name
        self.na_rep = na_rep
        self.length = length
        self.header = header
        self.float_format = float_format
        self.dtype = dtype

    def _get_footer(self):
        """Join the enabled footer fields (Freq, Name, Length, dtype)."""
        series = self.series
        parts = []

        if self.name:
            if getattr(series.index, 'freq', None):
                parts.append('Freq: %s' % series.index.freqstr)
            if series.name is not None:
                shown = com.pprint_thing(series.name,
                                         escape_chars=('\t', '\r', '\n'))
                parts.append("Name: %s" % shown)

        if self.length:
            parts.append('Length: %d' % len(series))

        if self.dtype:
            dtype_name = getattr(series.dtype, 'name', None)
            if dtype_name:
                parts.append('dtype: %s' % com.pprint_thing(dtype_name))

        return compat.text_type(u(', ').join(parts))

    def _get_formatted_index(self):
        """Return ``(formatted index incl. name line, has a named index)``."""
        index = self.series.index

        if isinstance(index, MultiIndex):
            return (index.format(names=True),
                    any(name for name in index.names))

        return index.format(name=True), index.name is not None

    def _get_formatted_values(self):
        """Format the series' values via ``format_array``."""
        return format_array(self.series.values, None,
                            float_format=self.float_format,
                            na_rep=self.na_rep)

    def to_string(self):
        """Return the text rendering; an empty series renders as ''."""
        if len(self.series) == 0:
            return u('')

        fmt_index, have_header = self._get_formatted_index()
        fmt_values = self._get_formatted_values()

        # index labels are left-justified to the widest label, capped at 60
        widest = max(len(label) for label in fmt_index)
        pad_space = min(widest, 60)

        lines = ['%s %s'] * len(fmt_values)
        # fmt_index[0] is the name line, so data labels start at position 1
        pairs = zip(fmt_index[1:], fmt_values)
        for pos, (label, value) in enumerate(pairs):
            lines[pos] = lines[pos] % (label.ljust(pad_space), value)

        if self.header and have_header:
            lines.insert(0, fmt_index[0])

        footer = self._get_footer()
        if footer:
            lines.append(footer)

        return compat.text_type(u('\n').join(lines))
def _strlen_func():
    """
    Return the callable used to measure the display length of a string.

    On Python 3 this is plain ``len``.  Otherwise the returned function
    decodes its argument with the ``display.encoding`` option before
    measuring, and falls back to ``len`` of the raw value when decoding
    raises ``UnicodeError``.
    """
    if compat.PY3:  # pragma: no cover
        return len

    encoding = get_option("display.encoding")

    def _strlen(x):
        try:
            decoded = x.decode(encoding)
        except UnicodeError:
            return len(x)
        return len(decoded)

    return _strlen
class TableFormatter(object):
    """
    Shared helpers for the tabular renderers.

    Subclasses are expected to provide ``formatters`` and ``columns``
    (both are read by ``_get_formatter``).
    """

    is_truncated = False
    show_dimensions = None

    @property
    def should_show_dimensions(self):
        """Whether the ``rows x columns`` summary should be emitted."""
        if self.show_dimensions is True:
            return True
        return self.show_dimensions == 'truncate' and self.is_truncated

    def _get_formatter(self, i):
        """
        Look up the formatter for column ``i``.

        With a list/tuple of formatters only integer positions resolve (any
        other key yields None).  With a mapping, an integer that is not
        itself a column label is first translated to the label at that
        position.
        """
        if isinstance(self.formatters, (list, tuple)):
            return self.formatters[i] if com.is_integer(i) else None

        if com.is_integer(i) and i not in self.columns:
            i = self.columns[i]
        return self.formatters.get(i, None)
- class DataFrameFormatter(TableFormatter):
- """
- Render a DataFrame
- self.to_string() : console-friendly tabular output
- self.to_html() : html table
- self.to_latex() : LaTeX tabular environment table
- """
- __doc__ = __doc__ if __doc__ else ''
- __doc__ += docstring_to_string
def __init__(self, frame, buf=None, columns=None, col_space=None,
             header=True, index=True, na_rep='NaN', formatters=None,
             justify=None, float_format=None, sparsify=None,
             index_names=True, line_width=None, max_rows=None,
             max_cols=None, show_dimensions=False, **kwds):
    """
    Store the rendering options and precompute the truncated frame.

    Options left as None fall back to the print configuration
    (``display.multi_sparse`` for ``sparsify``, ``display.colheader_justify``
    for ``justify``).  Extra keyword arguments are kept in ``self.kwds``.
    """
    self.frame = frame
    self.buf = StringIO() if buf is None else buf
    self.show_index_names = index_names

    if sparsify is None:
        sparsify = get_option("display.multi_sparse")
    self.sparsify = sparsify

    self.float_format = float_format
    self.formatters = {} if formatters is None else formatters
    self.na_rep = na_rep
    self.col_space = col_space
    self.header = header
    self.index = index
    self.line_width = line_width
    self.max_rows = max_rows
    self.max_cols = max_cols

    n_rows = len(self.frame)
    self.max_rows_displayed = min(max_rows or n_rows, n_rows)
    self.show_dimensions = show_dimensions

    if justify is None:
        justify = get_option("display.colheader_justify")
    self.justify = justify

    self.kwds = kwds

    if columns is None:
        self.columns = frame.columns
    else:
        # restrict the frame to the requested subset of columns
        self.columns = _ensure_index(columns)
        self.frame = self.frame[self.columns]

    self._chk_truncate()
- def _chk_truncate(self):
- from pandas.tools.merge import concat
- truncate_h = self.max_cols and (len(self.columns) > self.max_cols)
- truncate_v = self.max_rows and (len(self.frame) > self.max_rows)
- # Cut the data to the information actually printed
- max_cols = self.max_cols
- max_rows = self.max_rows
- frame = self.frame
- if truncate_h:
- if max_cols > 1:
- col_num = (max_cols // 2)
- frame = concat( (frame.iloc[:,:col_num],frame.iloc[:,-col_num:]),axis=1 )
- else:
- col_num = max_cols
- frame = frame.iloc[:,:max_cols]
- self.tr_col_num = col_num
- if truncate_v:
- if max_rows > 1:
- row_num = max_rows // 2
- frame = concat( (frame.iloc[:row_num,:],frame.iloc[-row_num:,:]) )
- else:
- row_num = max_rows
- frame = frame.iloc[:max_rows,:]
- self.tr_row_num = row_num
- self.tr_frame = frame
- self.truncate_h = truncate_h
- self.truncate_v = truncate_v
- self.is_truncated = self.truncate_h or self.truncate_v
def _to_str_columns(self):
    """
    Render a DataFrame to a list of columns (as lists of strings).

    The result holds one list of equally wide strings per printed column,
    preceded by the formatted index when ``self.index`` is set.  When the
    frame was truncated, a ``...`` column and/or a ``...`` row is inserted
    at the position of the gap.
    """
    _strlen = _strlen_func()
    frame = self.tr_frame

    # may include levels names also
    str_index = self._get_formatted_index(frame)
    str_columns = self._get_formatted_column_labels(frame)

    stringified = []
    if self.header:
        for i, _ in enumerate(frame):
            cheader = str_columns[i]
            max_colwidth = max(self.col_space or 0,
                               *(_strlen(x) for x in cheader))

            fmt_values = self._format_col(i)
            fmt_values = _make_fixed_width(fmt_values, self.justify,
                                           minimum=max_colwidth)

            max_len = max(np.max([_strlen(x) for x in fmt_values]),
                          max_colwidth)
            if self.justify == 'left':
                cheader = [x.ljust(max_len) for x in cheader]
            else:
                cheader = [x.rjust(max_len) for x in cheader]

            stringified.append(cheader + fmt_values)
    else:
        for i, _ in enumerate(frame):
            fmt_values = self._format_col(i)
            fmt_values = _make_fixed_width(fmt_values, self.justify)
            stringified.append(fmt_values)

    strcols = stringified
    if self.index:
        strcols.insert(0, str_index)

    # data columns are shifted right by one only when the index column is
    # actually present at position 0
    index_offset = 1 if self.index else 0

    # Add ... to signal truncated
    truncate_h = self.truncate_h
    truncate_v = self.truncate_v

    dot_col_ix = None
    if truncate_h:
        dot_col_ix = self.tr_col_num + index_offset
        # infer the width from the header of the column left of the gap
        col_width = len(strcols[dot_col_ix - 1][0])
        strcols.insert(dot_col_ix,
                       ['...'.center(col_width)] * (len(str_index)))

    if truncate_v:
        n_header_rows = len(str_index) - len(frame)
        row_num = self.tr_row_num
        for ix, col in enumerate(strcols):
            cwidth = len(col[row_num])  # infer from above row
            is_dot_col = ix == dot_col_ix

            if cwidth > 3 or is_dot_col:
                my_str = '...'
            else:
                my_str = '..'

            if ix == 0 and self.index:
                # the index column is left-aligned
                dot_str = my_str.ljust(cwidth)
            elif is_dot_col:
                dot_str = my_str.center(cwidth)
            else:
                dot_str = my_str.rjust(cwidth)

            col.insert(row_num + n_header_rows, dot_str)

    return strcols
def to_string(self):
    """
    Render a DataFrame to a console-friendly tabular output.

    The text is written to ``self.buf``; nothing is returned.  A frame
    without rows or without columns is rendered as a short ``Empty ...``
    summary.  When ``line_width`` is set, the columns are wrapped into
    several blocks by ``_join_multiline``.
    """
    frame = self.frame

    if len(frame.columns) == 0 or len(frame.index) == 0:
        info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
                     % (type(self.frame).__name__,
                        com.pprint_thing(frame.columns),
                        com.pprint_thing(frame.index)))
        text = info_line
    else:
        strcols = self._to_str_columns()
        if self.line_width is None:
            text = adjoin(1, *strcols)
        else:
            text = self._join_multiline(*strcols)

    # ``text`` is one string: write it in a single call rather than via
    # writelines(), which would iterate it character by character
    self.buf.write(text)

    if self.should_show_dimensions:
        self.buf.write("\n\n[%d rows x %d columns]"
                       % (len(frame), len(frame.columns)))
def _join_multiline(self, *strcols):
    """
    Wrap the string columns into blocks that fit ``self.line_width``.

    The columns are distributed over bins by ``_binify``; every block
    repeats the index column (when the index is printed) and all but the
    last block end with a continuation marker column.  Blocks are separated
    by an empty line.
    """
    lwidth = self.line_width
    adjoin_width = 1
    strcols = list(strcols)

    idx = None
    if self.index:
        # the index column is repeated in every block, so its width is not
        # available to the data columns
        idx = strcols.pop(0)
        lwidth -= np.array([len(x) for x in idx]).max() + adjoin_width

    col_widths = [np.array([len(x) for x in col]).max()
                  if len(col) > 0 else 0
                  for col in strcols]
    col_bins = _binify(col_widths, lwidth)
    nbins = len(col_bins)

    if self.max_rows and len(self.frame) > self.max_rows:
        nrows = self.max_rows + 1
    else:
        nrows = len(self.frame)

    str_lst = []
    st = 0
    for i, ed in enumerate(col_bins):
        row = strcols[st:ed]
        if self.index:
            # guarded: without an index column there is nothing to repeat
            row.insert(0, idx)
        if nbins > 1:
            if ed <= len(strcols) and i < nbins - 1:
                row.append([' \\'] + [' '] * (nrows - 1))
            else:
                row.append([' '] * nrows)

        str_lst.append(adjoin(adjoin_width, *row))
        st = ed
    return '\n\n'.join(str_lst)
def to_latex(self, column_format=None, longtable=False):
    """
    Render a DataFrame to a LaTeX tabular/longtable environment output.

    Parameters
    ----------
    column_format : string, optional
        LaTeX column specification; when omitted, one is derived from the
        dtypes ('r' for numeric columns, 'l' otherwise, plus a leading 'l'
        for the index)
    longtable : bool, default False
        emit a ``longtable`` environment (with repeated head/foot) instead
        of ``tabular``

    Raises
    ------
    AssertionError
        if ``column_format`` is given but is not a string
    TypeError
        if ``self.buf`` is neither writable nor a file name

    The output is written to ``self.buf`` (or to the file it names).
    Cells are LaTeX-escaped unless ``escape=False`` was passed to the
    formatter.
    """
    self.escape = self.kwds.get('escape', True)

    # TODO: column_format is not settable in df.to_latex
    def get_col_type(dtype):
        if issubclass(dtype.type, np.number):
            return 'r'
        else:
            return 'l'

    frame = self.frame

    if len(frame.columns) == 0 or len(frame.index) == 0:
        info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
                     % (type(self.frame).__name__,
                        frame.columns, frame.index))
        strcols = [[info_line]]
    else:
        strcols = self._to_str_columns()

    if column_format is None:
        dtypes = self.frame.dtypes.values
        if self.index:
            column_format = 'l%s' % ''.join(map(get_col_type, dtypes))
        else:
            column_format = '%s' % ''.join(map(get_col_type, dtypes))
    elif not isinstance(column_format,
                        compat.string_types):  # pragma: no cover
        raise AssertionError('column_format must be str or unicode, not %s'
                             % type(column_format))

    def escape_cell(x):
        # backslashes are escaped first so that the backslashes introduced
        # by the later replacements are not escaped again
        return (x.replace('\\', '\\textbackslash')
                 .replace('_', '\\_')
                 .replace('%', '\\%')
                 .replace('$', '\\$')
                 .replace('#', '\\#')
                 .replace('{', '\\{')
                 .replace('}', '\\}')
                 .replace('~', '\\textasciitilde')
                 .replace('^', '\\textasciicircum')
                 .replace('&', '\\&'))

    def write(buf, frame, column_format, strcols, longtable=False):
        if not longtable:
            buf.write('\\begin{tabular}{%s}\n' % column_format)
            buf.write('\\toprule\n')
        else:
            buf.write('\\begin{longtable}{%s}\n' % column_format)
            buf.write('\\toprule\n')

        nlevels = frame.index.nlevels
        # the "continued" footer must span every column of the table
        ncols = len(strcols)
        for i, row in enumerate(zip(*strcols)):
            if i == nlevels:
                buf.write('\\midrule\n')  # End of header
                if longtable:
                    buf.write('\\endhead\n')
                    buf.write('\\midrule\n')
                    buf.write('\\multicolumn{%d}{r}{{Continued on next '
                              'page}} \\\\\n' % ncols)
                    buf.write('\\midrule\n')
                    buf.write('\\endfoot\n\n')
                    buf.write('\\bottomrule\n')
                    buf.write('\\endlastfoot\n')
            if self.escape:
                crow = [escape_cell(x) if x else '{}' for x in row]
            else:
                crow = [x if x else '{}' for x in row]
            buf.write(' & '.join(crow))
            buf.write(' \\\\\n')

        if not longtable:
            buf.write('\\bottomrule\n')
            buf.write('\\end{tabular}\n')
        else:
            buf.write('\\end{longtable}\n')

    if hasattr(self.buf, 'write'):
        write(self.buf, frame, column_format, strcols, longtable)
    elif isinstance(self.buf, compat.string_types):
        with open(self.buf, 'w') as f:
            write(f, frame, column_format, strcols, longtable)
    else:
        raise TypeError('buf is not a file name and it has no write '
                        'method')
def _format_col(self, i):
    """Format the values of the i-th column of the truncated frame."""
    tr_frame = self.tr_frame
    col_formatter = self._get_formatter(i)
    column_values = tr_frame.iloc[:, i].get_values()
    return format_array(column_values, col_formatter,
                        float_format=self.float_format,
                        na_rep=self.na_rep,
                        space=self.col_space)
def to_html(self, classes=None):
    """
    Render a DataFrame to a html table.

    Parameters
    ----------
    classes : string or list/tuple of strings, optional
        passed on to ``HTMLFormatter``

    Raises
    ------
    TypeError
        if ``self.buf`` is neither writable nor a file name

    The markup is written to ``self.buf`` (or to the file it names).
    """
    html_renderer = HTMLFormatter(self, classes=classes,
                                  max_rows=self.max_rows,
                                  max_cols=self.max_cols)
    if hasattr(self.buf, 'write'):
        html_renderer.write_result(self.buf)
    elif isinstance(self.buf, compat.string_types):
        with open(self.buf, 'w') as f:
            html_renderer.write_result(f)
    else:
        # same wording as to_latex (the message used to contain a doubled
        # space between "write" and "method")
        raise TypeError('buf is not a file name and it has no write '
                        'method')
def _get_formatted_column_labels(self, frame):
    """
    Return the header cells of every column of ``frame``.

    The result has one list per column, holding one string per header
    row.  Labels of numeric columns without a custom formatter get a
    leading space.  When index names are shown, an empty cell is appended
    to every column.
    """
    from pandas.core.index import _sparsify

    def is_numeric_dtype(dtype):
        return issubclass(dtype.type, np.number)

    columns = frame.columns

    # NOTE(review): the dtypes below come from self.frame while the labels
    # come from ``frame``; if ``frame`` is a truncated copy the two may not
    # line up positionally -- confirm.
    if isinstance(columns, MultiIndex):
        fmt_columns = lzip(*columns.format(sparsify=False, adjoin=False))
        dtypes = self.frame.dtypes.values
        need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))

        padded = []
        for labels in fmt_columns:
            padded.append(
                [' ' + lab
                 if lab not in self.formatters and need_leadsp[labels]
                 else lab
                 for lab in labels])
        str_columns = list(zip(*padded))

        if self.sparsify:
            str_columns = _sparsify(str_columns)

        str_columns = [list(cells) for cells in zip(*str_columns)]
    else:
        fmt_columns = columns.format()
        dtypes = self.frame.dtypes
        need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))

        str_columns = []
        for i, (_, label) in enumerate(zip(columns, fmt_columns)):
            if not self._get_formatter(i) and need_leadsp[label]:
                str_columns.append([' ' + label])
            else:
                str_columns.append([label])

    if self.show_index_names and self.has_index_names:
        for cells in str_columns:
            cells.append('')

    return str_columns
@property
def has_index_names(self):
    """Result of ``_has_names`` applied to the frame's row index."""
    row_index = self.frame.index
    return _has_names(row_index)
@property
def has_column_names(self):
    """Result of ``_has_names`` applied to the frame's columns."""
    column_index = self.frame.columns
    return _has_names(column_index)
def _get_formatted_index(self, frame):
    """
    Return the index column of ``frame`` as a list of strings.

    When the header is printed, the list starts with one cell per column
    level (the column names, or blanks) so that it lines up with the
    header rows of the data columns.
    """
    # Note: this is only used by to_string(), not by to_html().
    index = frame.index
    columns = frame.columns

    show_index_names = self.show_index_names and self.has_index_names
    show_col_names = (self.show_index_names and self.has_column_names)

    fmt = self._get_formatter('__index__')

    if isinstance(index, MultiIndex):
        levels = index.format(sparsify=self.sparsify, adjoin=False,
                              names=show_index_names,
                              formatter=fmt)
    else:
        levels = [index.format(name=show_index_names, formatter=fmt)]

    adjoined = adjoin(1, *levels).split('\n')

    if not self.header:
        return adjoined

    # empty space for columns
    if show_col_names:
        col_header = ['%s' % x for x in self._get_column_name_list()]
    else:
        col_header = [''] * columns.nlevels

    return col_header + adjoined
- def _get_column_name_list(self):
- names = []
- columns = self.frame.columns
- if isinstance(columns, MultiIndex):
- names.extend('' if name is None else name
- for name in columns.names)
- else:
- names.append('' if columns.name is None else columns.name)
- return names
- class HTMLFormatter(TableFormatter):
- indent_delta = 2
- def __init__(self, formatter, classes=None, max_rows=None, max_cols=None):
- self.fmt = formatter
- self.classes = classes
- self.frame = self.fmt.frame
- self.columns = self.fmt.tr_frame.columns
- self.elements = []
- self.bold_rows = self.fmt.kwds.get('bold_rows', False)
- self.escape = self.fmt.kwds.get('escape', True)
- self.max_rows = max_rows or len(self.fmt.frame)
- self.max_cols = max_cols or len(self.fmt.columns)
- self.show_dimensions = self.fmt.show_dimensions
- self.is_truncated = self.max_rows < len(self.fmt.frame) or self.max_cols < len(self.fmt.columns)
def write(self, s, indent=0):
    """Queue ``s`` as one output line, prefixed by ``indent`` pad strings."""
    text = com.pprint_thing(s)
    padding = ' ' * indent
    self.elements.append(padding + text)
def write_th(self, s, indent=0, tags=None):
    """
    Write a ``<th>`` cell.

    When the formatter has a positive ``col_space``, a ``min-width`` style
    attribute is added after any attributes already present in ``tags``.
    """
    col_space = self.fmt.col_space
    if col_space is not None and col_space > 0:
        style = 'style="min-width: %s;"' % col_space
        # attributes must be separated by whitespace to stay valid markup
        tags = '%s %s' % (tags, style) if tags else style

    return self._write_cell(s, kind='th', indent=indent, tags=tags)
def write_td(self, s, indent=0, tags=None):
    """Write a ``<td>`` cell; returns whatever ``_write_cell`` returns."""
    cell_kind = 'td'
    return self._write_cell(s, kind=cell_kind, indent=indent, tags=tags)
def _write_cell(self, s, kind='td', indent=0, tags=None):
    """
    Write one table cell ``<kind [tags]>s</kind>``.

    When ``self.escape`` is set, the characters ``&``, ``<`` and ``>`` in
    the cell text are replaced by their HTML character references.
    """
    if tags is not None:
        start_tag = '<%s %s>' % (kind, tags)
    else:
        start_tag = '<%s>' % kind

    if self.escape:
        # escape & first to prevent double escaping of &
        esc = OrderedDict(
            [('&', r'&amp;'), ('<', r'&lt;'), ('>', r'&gt;')]
        )
    else:
        esc = {}
    rs = com.pprint_thing(s, escape_chars=esc)
    self.write(
        '%s%s</%s>' % (start_tag, rs, kind), indent)
def write_tr(self, line, indent=0, indent_delta=4, header=False,
             align=None, tags=None, nindex_levels=0):
    """
    Write one table row.

    Parameters
    ----------
    line : sequence
        the cell contents
    indent, indent_delta : int
        indentation of the ``<tr>`` tag and extra indentation of its cells
    header : bool, default False
        write every cell as ``<th>``
    align : string, optional
        value for a ``text-align`` style on the row
    tags : dict, optional
        maps cell position to an attribute string for that cell
    nindex_levels : int, default 0
        with ``bold_rows`` set, that many leading cells are written as
        ``<th>``
    """
    cell_tags = {} if tags is None else tags

    if align is None:
        opening = '<tr>'
    else:
        opening = '<tr style="text-align: %s;">' % align
    self.write(opening, indent)

    cell_indent = indent + indent_delta
    for pos, cell in enumerate(line):
        attrs = cell_tags.get(pos, None)
        as_heading = header or (self.bold_rows and pos < nindex_levels)
        if as_heading:
            self.write_th(cell, cell_indent, tags=attrs)
        else:
            self.write_td(cell, cell_indent, tags=attrs)

    self.write('</tr>', indent)
def write_result(self, buf):
    """
    Render the whole table and hand the collected lines to ``_put_lines``.

    ``self.classes`` may be a whitespace-separated string or a list/tuple
    of CSS class names; they are added after the default ``dataframe``
    class.

    Raises
    ------
    AssertionError
        if ``self.classes`` is neither a string nor a list/tuple
    """
    indent = 0
    frame = self.frame

    _classes = ['dataframe']  # Default class.
    if self.classes is not None:
        # compat.string_types (not plain ``str``) so that unicode strings
        # are split too, as elsewhere in this module
        if isinstance(self.classes, compat.string_types):
            self.classes = self.classes.split()
        if not isinstance(self.classes, (list, tuple)):
            raise AssertionError(('classes must be list or tuple, '
                                  'not %s') % type(self.classes))
        _classes.extend(self.classes)

    self.write('<table border="1" class="%s">' % ' '.join(_classes),
               indent)

    indent += self.indent_delta
    indent = self._write_header(indent)
    indent = self._write_body(indent)

    self.write('</table>', indent)
    if self.should_show_dimensions:
        # code point 215 is the multiplication sign (U+00D7)
        by = chr(215) if compat.PY3 else unichr(215)
        self.write(u('<p>%d rows %s %d columns</p>') %
                   (len(frame), by, len(frame.columns)))
    _put_lines(buf, self.elements)
- def _write_header(self, indent):
- truncate_h = self.fmt.truncate_h
- row_levels = self.frame.index.nlevels
- if not self.fmt.header:
- # write nothing
- return indent
- def _column_header():
- if self.fmt.index:
- row = [''] * (self.frame.index.nlevels - 1)
- else:
- row = []
- if isinstance(self.columns, MultiIndex):
- if self.fmt.has_column_names and self.fmt.index:
- row.append(single_column_table(self.columns.names))
- else:
- row.append('')
- style = "text-align: %s;" % self.fmt.justify
- row.extend([single_column_table(c, self.fmt.justify, style) for
- c in self.columns])
- else:
- if self.fmt.index:
- row.append(self.columns.name or '')
- row.extend(self.columns)
- return row
- self.write('<thead>', indent)
- row = []
- indent += self.indent_delta
- if isinstance(self.columns, MultiIndex):
- template = 'colspan="%d" halign="left"'
- if self.fmt.sparsify:
- # GH3547
- sentinel = com.sentinel_factory()
- else:
- sentinel = None
- levels = self.columns.format(sparsify=sentinel,
- adjoin=False, names=False)
- level_lengths = _get_level_lengths(levels, sentinel)
- inner_lvl = len(level_lengths) - 1
- for lnum, (records, values) in enumerate(zip(level_lengths,
- levels)):
- if truncate_h:
- # modify the header lines
- ins_col = self.fmt.tr_col_num
- if self.fmt.sparsify:
- recs_new = {}
- # Increment tags after ... col.
- for tag,span in list(records.items()):
- if tag >= ins_col:
- recs_new[tag + 1] = span
- elif tag + span > ins_col:
- recs_new[tag] = span + 1
- if lnum == inner_lvl:
- values = values[:ins_col] + (u('...'),) + \
- values[ins_col:]
- else: # sparse col headers do not receive a ...
- values = values[:ins_col] + \
- (values[ins_col - 1],) + values[ins_col:]
- else:
- recs_new[tag] = span
- # if ins_col lies between tags, all col headers get ...
- if tag + span == ins_col:
- recs_new[ins_col] = 1
- values = values[:ins_col] + (u('...'),) + \
- values[ins_col:]
- records = recs_new
- inner_lvl = len(level_lengths) - 1
- if lnum == inner_lvl:
- records[ins_col] = 1
- else:
- recs_new = {}
- for tag,span in list(records.items()):
- if tag >= ins_col:
- recs_new[tag + 1] = span
- else:
- recs_new[tag] = span
- recs_new[ins_col] = 1
- records = recs_new
- values = values[:ins_col] + [u('...')] + values[ins_col:]
- name = self.columns.names[lnum]
- row = [''] * (row_levels - 1) + ['' if name is None
- else com.pprint_thing(name)]
- tags = {}
- j = len(row)
- for i, v in enumerate(values):
- if i in records:
- if records[i] > 1:
- tags[j] = template % records[i]
- else:
- continue
- j += 1
- row.append(v)
- self.write_tr(row, indent, self.indent_delta, tags=tags,
- header=True)
- else:
- col_row = _column_header()
- align = self.fmt.justify
- if truncate_h:
- ins_col = row_levels + self.fmt.tr_col_num
- col_row.insert(ins_col, '...')
- self.write_tr(col_row, indent, self.indent_delta, header=True,
- align=align)
- if self.fmt.has_index_names:
- row = [
- x if x is not None else '' for x in self.frame.index.names
- ] + [''] * min(len(self.columns), self.max_cols)
- if truncate_h:
- ins_col = row_levels + self.fmt.tr_col_num
- row.insert(ins_col, '')
- self.write_tr(row, indent, self.indent_delta, header=True)
- indent -= self.indent_delta
- self.write('</thead>', indent)
- return indent
- def _write_body(self, indent):
- self.write('<tbody>', indent)
- indent += self.indent_delta
- fmt_values = {}
- for i in range(min(len(self.columns), self.max_cols)):
- fmt_values[i] = self.fmt._format_col(i)
- # write values
- if self.fmt.index:
- if isinstance(self.frame.index, MultiIndex):
- self._write_hierarchical_rows(fmt_values, indent)
- else:
- self._write_regular_rows(fmt_values, indent)
- else:
- for i in range(len(self.frame)):
- row = [fmt_values[j][i] for j in range(len(self.columns))]
- self.write_tr(row, indent, self.indent_delta, tags=None)
- indent -= self.indent_delta
- self.write('</tbody>', indent)
- indent -= self.indent_delta
- return indent
- def _write_regular_rows(self, fmt_values, indent):
- truncate_h = self.fmt.truncate_h
- truncate_v = self.fmt.truncate_v
- ncols = len(self.fmt.tr_frame.columns)
- nrows = len(self.fmt.tr_frame)
- fmt = self.fmt._get_formatter('__index__')
- if fmt is not None:
- index_values = self.fmt.tr_frame.index.map(fmt)
- else:
- index_values = self.fmt.tr_frame.index.format()
- for i in range(nrows):
- if truncate_v and i == (self.fmt.tr_row_num):
- str_sep_row = [ '...' for ele in row ]
- self.write_tr(str_sep_row, indent, self.indent_delta, tags=None,
- nindex_levels=1)
- row = []
- row.append(index_values[i])
- row.extend(fmt_values[j][i] for j in range(ncols))
- if truncate_h:
- dot_col_ix = self.fmt.tr_col_num + 1
- row.insert(dot_col_ix, '...')
- self.write_tr(row, indent, self.indent_delta, tags=None,
- nindex_levels=1)
def _write_hierarchical_rows(self, fmt_values, indent):
    """
    Write the body rows for a frame with a MultiIndex on the rows.

    ``fmt_values`` maps column position to the list of formatted values;
    with vertical truncation and sparsified output it is modified in place
    (a ``...`` entry is inserted at the gap).  With sparsified output,
    repeated outer index labels are merged into ``rowspan`` cells.
    """
    template = 'rowspan="%d" valign="top"'

    truncate_h = self.fmt.truncate_h
    truncate_v = self.fmt.truncate_v
    frame = self.fmt.tr_frame
    ncols = len(frame.columns)
    nrows = len(frame)
    row_levels = self.frame.index.nlevels

    # one tuple of level labels per row
    idx_values = frame.index.format(sparsify=False, adjoin=False,
                                    names=False)
    idx_values = lzip(*idx_values)

    if self.fmt.sparsify:
        # GH3547
        sentinel = com.sentinel_factory()
        levels = frame.index.format(sparsify=sentinel,
                                    adjoin=False, names=False)

        level_lengths = _get_level_lengths(levels, sentinel)
        inner_lvl = len(level_lengths) - 1
        if truncate_v:
            # Insert ... row and adjust idx_values and
            # level_lengths to take this into account.
            ins_row = self.fmt.tr_row_num
            for lnum, records in enumerate(level_lengths):
                rec_new = {}
                for tag, span in list(records.items()):
                    if tag >= ins_row:
                        rec_new[tag + 1] = span
                    elif tag + span > ins_row:
                        rec_new[tag] = span + 1
                        dot_row = list(idx_values[ins_row - 1])
                        dot_row[-1] = u('...')
                        idx_values.insert(ins_row, tuple(dot_row))
                    else:
                        rec_new[tag] = span
                    # If ins_row lies between tags, all cols idx cols
                    # receive ...
                    if tag + span == ins_row:
                        rec_new[ins_row] = 1
                        if lnum == 0:
                            idx_values.insert(
                                ins_row,
                                tuple([u('...')] * len(level_lengths)))
                level_lengths[lnum] = rec_new

            level_lengths[inner_lvl][ins_row] = 1
            for ix_col in range(len(fmt_values)):
                fmt_values[ix_col].insert(ins_row, '...')
            nrows += 1

        for i in range(nrows):
            row = []
            tags = {}

            sparse_offset = 0
            j = 0
            for records, v in zip(level_lengths, idx_values[i]):
                if i in records:
                    if records[i] > 1:
                        tags[j] = template % records[i]
                else:
                    # cell covered by a rowspan started in an earlier row
                    sparse_offset += 1
                    continue

                j += 1
                row.append(v)

            row.extend(fmt_values[j][i] for j in range(ncols))
            if truncate_h:
                row.insert(row_levels - sparse_offset +
                           self.fmt.tr_col_num, '...')
            self.write_tr(row, indent, self.indent_delta, tags=tags,
                          nindex_levels=len(levels) - sparse_offset)
    else:
        # idx_values computed above already holds the unsparsified labels;
        # re-formatting the whole index for every row would be quadratic
        for i in range(len(frame)):
            row = []
            row.extend(idx_values[i])
            row.extend(fmt_values[j][i] for j in range(ncols))
            if truncate_h:
                row.insert(row_levels + self.fmt.tr_col_num, '...')
            self.write_tr(row, indent, self.indent_delta, tags=None,
                          nindex_levels=frame.index.nlevels)
- def _get_level_lengths(levels, sentinel=''):
- from itertools import groupby
- def _make_grouper():
- record = {'count': 0}
- def grouper(x):
- if x != sentinel:
- record['count'] += 1
- return record['count']
- return grouper
- result = []
- for lev in levels:
- i = 0
- f = _make_grouper()
- recs = {}
- for key, gpr in groupby(lev, f):
- values = list(gpr)
- recs[i] = len(values)
- i += len(values)
- result.append(recs)
- return result
- class CSVFormatter(object):
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
             cols=None, header=True, index=True, index_label=None,
             mode='w', nanRep=None, encoding=None, quoting=None,
             line_terminator='\n', chunksize=None, engine=None,
             tupleize_cols=False, quotechar='"', date_format=None,
             doublequote=True, escapechar=None):
    """
    Store the CSV writing options and prepare column labels, the block
    list and the index used for writing.

    Raises
    ------
    NotImplementedError
        for non-unique columns combined with ``engine='python'``
    TypeError
        if ``cols`` is given while the columns are a MultiIndex that is
        not tupleized
    """

    def _as_labels(labels):
        # an Index is converted to its native (string-able) values, any
        # other iterable is simply materialized
        if isinstance(labels, Index):
            return labels.to_native_types(na_rep=na_rep,
                                          float_format=float_format,
                                          date_format=date_format)
        return list(labels)

    self.engine = engine  # remove for 0.13
    self.obj = obj

    if path_or_buf is None:
        path_or_buf = StringIO()
    self.path_or_buf = path_or_buf

    self.sep = sep
    self.na_rep = na_rep
    self.float_format = float_format

    self.header = header
    self.index = index
    self.index_label = index_label
    self.mode = mode
    self.encoding = encoding

    if quoting is None:
        quoting = csv.QUOTE_MINIMAL
    self.quoting = quoting

    if quoting == csv.QUOTE_NONE:
        # prevents crash in _csv
        quotechar = None
    self.quotechar = quotechar

    self.doublequote = doublequote
    self.escapechar = escapechar

    self.line_terminator = line_terminator

    self.date_format = date_format

    # GH3457
    if not self.obj.columns.is_unique and engine == 'python':
        raise NotImplementedError("columns.is_unique == False not "
                                  "supported with engine='python'")

    self.tupleize_cols = tupleize_cols
    self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                           not self.tupleize_cols)

    # validate mi options
    if self.has_mi_columns and cols is not None:
        raise TypeError("cannot specify cols with a MultiIndex on the "
                        "columns")

    if cols is not None:
        cols = _as_labels(cols)
        self.obj = self.obj.loc[:, cols]

    # update columns to include possible multiplicity of dupes
    # and make sure cols is just a list of labels
    cols = _as_labels(self.obj.columns)

    # save it
    self.cols = cols

    # preallocate data 2d list
    self.blocks = self.obj._data.blocks
    ncols = sum(b.shape[0] for b in self.blocks)
    self.data = [None] * ncols

    if chunksize is None:
        chunksize = (100000 / (len(self.cols) or 1)) or 1
    self.chunksize = int(chunksize)

    data_index = obj.index
    if isinstance(obj.index, PeriodIndex):
        data_index = obj.index.to_timestamp()

    if isinstance(data_index, DatetimeIndex) and date_format is not None:
        data_index = Index([x.strftime(date_format)
                            if notnull(x) else ''
                            for x in data_index])
    self.data_index = data_index

    self.nlevels = getattr(self.data_index, 'nlevels', 1)
    if not index:
        self.nlevels = 0
- # original python implem. of df.to_csv
- # invoked by df.to_csv(engine=python)
def _helper_csv(self, writer, na_rep=None, cols=None,
                header=True, index=True,
                index_label=None, float_format=None, date_format=None):
    """
    Write ``self.obj`` to ``writer`` one row at a time.

    This is the pure-python code path; ``save`` calls it when
    ``self.engine == 'python'``.

    Parameters
    ----------
    writer : object exposing ``writerow(list)``
        Receives the optional header row followed by one row per index
        entry.
    na_rep : object, optional
        Written in place of any data value for which ``lib.checknull``
        is true.
    cols : sequence of labels, optional
        Columns to write. When None, the labels stored on ``self.cols``
        are used.
    header : bool or list/tuple/ndarray, default True
        Truthy writes a header row. A sequence is treated as aliases that
        replace the column labels in the header row.
    index : bool, default True
        Whether each row starts with its index value(s).
    index_label : None, False, label, or list/tuple/ndarray, optional
        Header text for the index column(s). None derives it from the
        index name(s); False writes no index label.
    float_format : str, optional
        ``%``-style format applied to float data values.
    date_format : str, optional
        ``strftime`` format applied to datetime values and to a
        datetime-like index.

    Raises
    ------
    ValueError
        If ``header`` is a sequence whose length differs from ``cols``.
    """
    if cols is None:
        # __init__ stores the prepared column labels on ``self.cols``.
        cols = self.cols

    has_aliases = isinstance(header, (tuple, list, np.ndarray))
    if has_aliases or header:
        # Resolve the column part of the header first so that aliases are
        # validated and applied whether or not the index is written.
        if has_aliases:
            if len(header) != len(cols):
                raise ValueError(('Writing %d cols but got %d aliases'
                                  % (len(cols), len(header))))
            write_cols = header
        else:
            write_cols = cols

        encoded_labels = []
        # index_label=False means "write no label for the index".
        if index and index_label is not False:
            if index_label is None:
                if isinstance(self.obj.index, MultiIndex):
                    # one label per level; unnamed levels become ''
                    index_label = ['' if name is None else name
                                   for name in self.obj.index.names]
                else:
                    index_label = self.obj.index.name
                    if index_label is None:
                        index_label = ['']
                    else:
                        index_label = [index_label]
            elif not isinstance(index_label, (list, tuple, np.ndarray)):
                # given a string for a DF with Index
                index_label = [index_label]
            encoded_labels = list(index_label)

        writer.writerow(encoded_labels + list(write_cols))

    if date_format is None:
        def date_formatter(x):
            return lib.Timestamp(x)._repr_base
    else:
        def date_formatter(x):
            x = lib.Timestamp(x)
            if notnull(x):
                return x.strftime(date_format)
            # null timestamps are passed through as None
            return None

    data_index = self.obj.index
    if isinstance(self.obj.index, PeriodIndex):
        data_index = self.obj.index.to_timestamp()

    if isinstance(data_index, DatetimeIndex) and date_format is not None:
        data_index = Index([date_formatter(x) for x in data_index])

    # Work on a copy so relabelling index/columns leaves self.obj intact.
    values = self.obj.copy()
    values.index = data_index
    values.columns = values.columns.to_native_types(
        na_rep=na_rep, float_format=float_format,
        date_format=date_format)
    values = values[cols]

    # column label -> raw value array, for positional row access below
    series = dict((k, v.values)
                  for k, v in compat.iteritems(values._series))

    nlevels = getattr(data_index, 'nlevels', 1)
    for j, idx in enumerate(data_index):
        if not index:
            row_fields = []
        elif nlevels == 1:
            row_fields = [idx]
        else:  # handle MultiIndex
            row_fields = list(idx)

        for col in cols:
            val = series[col][j]
            if lib.checknull(val):
                val = na_rep

            if float_format is not None and com.is_float(val):
                val = float_format % val
            elif isinstance(val, (np.datetime64, lib.Timestamp)):
                val = date_formatter(val)

            row_fields.append(val)

        writer.writerow(row_fields)
def save(self):
    """
    Build the CSV writer on the target and write the frame through it.

    When ``self.path_or_buf`` already has a ``write`` attribute it is
    used as-is and left open. Otherwise a handle is obtained from
    ``com._get_handle`` and closed once writing finishes or fails.
    The writer is stored on ``self.writer``.
    """
    target = self.path_or_buf
    # We only close handles that this method opened itself.
    owns_handle = not hasattr(target, 'write')
    if owns_handle:
        handle = com._get_handle(target, self.mode,
                                 encoding=self.encoding)
    else:
        handle = target

    try:
        options = {
            'lineterminator': self.line_terminator,
            'delimiter': self.sep,
            'quoting': self.quoting,
            'doublequote': self.doublequote,
            'escapechar': self.escapechar,
            'quotechar': self.quotechar,
        }
        if self.encoding is None:
            self.writer = csv.writer(handle, **options)
        else:
            # an explicit encoding is routed through the unicode writer
            options['encoding'] = self.encoding
            self.writer = com.UnicodeWriter(handle, **options)

        if self.engine != 'python':
            self._save()
        else:
            # to be removed in 0.13
            self._helper_csv(self.writer, na_rep=self.na_rep,
                             float_format=self.float_format,
                             cols=self.cols, header=self.header,
                             index=self.index,
                             index_label=self.index_label,
                             date_format=self.date_format)
    finally:
        if owns_handle:
            handle.close()
- def _save_header(self):
- writer = self.writer
- obj = self.obj
- index_label = self.index_label
- cols = self.cols
- has_mi_columns = self.has_mi_columns
- header = self.header
- encoded_labels = []
- has_aliases = isinstance(header, (tuple, list, np.ndarray))
- if not (has_aliases or self.header):
- return
- if has_aliases:
- if len(header) != len(cols):
- raise ValueError(('Writing %d cols but got %d aliases'
- % (len(cols), len(header))))
- else:
- write_cols = header
- else:
- write_cols = cols
- if self.index:
- # should write something for index label
- if index_label is not False:
- if index_label is None:
- if isinstance(obj.index, MultiIndex):
- index_label = []
- for i, name in enumerate(obj.index.names):
- if name is None:
- name = ''
- index_label.append(name)
- else:
- index_label = obj.index.name
- if index_label is None:
- index_label = ['']
- else:
- index_label = [index_label]
- elif not isinstance(index_label, (list, tuple, np.ndarray)):
- # given a string for a DF with Index
- index_label = [index_label]
- encoded_labels = list(index_label)
- else:
- encoded_labels = []
- if not has_mi_columns:
- encoded_labels += list(write_cols)
- # write out the mi
- if has_mi_columns:
- columns = obj.columns
- # write out the names for each level, then ALL of the values for
- # each level
- for i in range(columns.nlevels):
- # we need at least 1 index column to write our col names
- col_line = []
- if self.index:
- # name is the first column
- …
Large files are truncated, but you can click here to view the full file