PageRenderTime 2ms CodeModel.GetById 52ms app.highlight 68ms RepoModel.GetById 2ms app.codeStats 0ms

/astropy/table/column.py

https://github.com/mdboom/astropy
Python | 1124 lines | 976 code | 48 blank | 100 comment | 32 complexity | efe0cd2c7440cbe9ebd828cb6e3dc4c2 MD5 | raw file
   1# Licensed under a 3-clause BSD style license - see LICENSE.rst
   2from __future__ import (absolute_import, division, print_function,
   3                        unicode_literals)
   4from ..extern import six
   5
   6import weakref
   7
   8from copy import deepcopy
   9
  10import numpy as np
  11from numpy import ma
  12
  13from ..units import Unit, Quantity
  14from ..utils.compat import NUMPY_LT_1_8
  15from ..utils.console import color_print
  16from ..utils.metadata import MetaData
  17from ..utils.data_info import BaseColumnInfo, InfoDescriptor, dtype_info_name
  18from . import groups
  19from . import pprint
  20from .np_utils import fix_column_name
  21
  22from ..config import ConfigAlias
  23
  24
# NOTE(review): presumably a backwards-compatibility alias (introduced in
# version 0.4) that maps the old ``astropy.table.column.AUTO_COLNAME``
# setting onto ``astropy.table.conf.auto_colname`` -- confirm against
# ``ConfigAlias``.
AUTO_COLNAME = ConfigAlias(
    '0.4', 'AUTO_COLNAME', 'auto_colname',
    'astropy.table.column', 'astropy.table')

# Create a generic TableFormatter object for use by bare columns with no
# parent table.
FORMATTER = pprint.TableFormatter()
# Types treated as a scalar integer index by _NDColumnProxyShim.__getitem__.
# ``long`` only exists on Python 2, so it is only referenced on that branch.
INTEGER_TYPES = (int, long, np.integer) if six.PY2 else (int, np.integer)
  33
  34def _auto_names(n_cols):
  35    from . import conf
  36    return [str(conf.auto_colname).format(i) for i in range(n_cols)]
  37
  38
# Set of unary and binary numpy functions that return boolean-like results
# (comparisons and element tests), which sometimes return a Column class and
# sometimes a plain array.  Used in __array_wrap__ to ensure they only return
# plain (masked) arrays (see #1446 and #1685)
_comparison_functions = set(
    [np.greater, np.greater_equal, np.less, np.less_equal,
     np.not_equal, np.equal,
     np.isfinite, np.isinf, np.isnan, np.sign, np.signbit])
  46
  47
  48def col_copy(col):
  49    """
  50    This is a mixin-safe version of Column.copy() (with copy_data=True).
  51    """
  52    if isinstance(col, BaseColumn):
  53        return col.copy()
  54
  55    # The new column should have None for the parent_table ref.  If the
  56    # original parent_table weakref there at the point of copying then it
  57    # generates an infinite recursion.  Instead temporarily remove the weakref
  58    # on the original column and restore after the copy in an exception-safe
  59    # manner.
  60
  61    parent_table = col.info.parent_table
  62    col.info.parent_table = None
  63
  64    try:
  65        newcol = col.copy() if hasattr(col, 'copy') else deepcopy(col)
  66        newcol.info = col.info
  67    finally:
  68        col.info.parent_table = parent_table
  69
  70    return newcol
  71
  72
  73class FalseArray(np.ndarray):
  74    def __new__(cls, shape):
  75        obj = np.zeros(shape, dtype=np.bool).view(cls)
  76        return obj
  77
  78    def __setitem__(self, item, val):
  79        val = np.asarray(val)
  80        if np.any(val):
  81            raise ValueError('Cannot set any element of {0} class to True'
  82                             .format(self.__class__.__name__))
  83
  84    def __setslice__(self, start, stop, val):
  85        val = np.asarray(val)
  86        if np.any(val):
  87            raise ValueError('Cannot set any element of {0} class to True'
  88                             .format(self.__class__.__name__))
  89
  90
class ColumnInfo(BaseColumnInfo):
    # Info class attached to columns through ``BaseColumn.info``.
    # NOTE(review): presumably names the info attributes that are stored on
    # the parent column itself rather than on the info object -- here every
    # name in ``BaseColumnInfo.attr_names``; confirm against BaseColumnInfo.
    attrs_from_parent = BaseColumnInfo.attr_names
  93
  94
  95class _NDColumnProxyShim(np.ndarray):
  96    """
  97    This mixin class exists solely to provide an override to
  98    ndarray.__getitem__ that provides the desirable behavior for single
  99    item gets on columns with multi-dimensional data types.  The default
 100    behavior from Numpy is to automatically view-cast these to the ndarray
 101    subclass (i.e. Column), but the multi-dimensional array elements of
 102    multi-dimensional columns are not, themselves, Columns.
 103
 104    This class is shimmed into a new class used for any BaseColumn instances
 105    that contain multi-dimensional data via BaseColumn._get_nd_proxy_class
 106    (this is also done explicitly in MaskedColumn.__new__ due to the
 107    peculiarities of MaskedColumn).
 108    """
 109
 110    def __getitem__(self, item):
 111        if isinstance(item, INTEGER_TYPES):
 112            return self.data[item]  # Return as plain ndarray or ma.MaskedArray
 113        else:
 114            return super(_NDColumnProxyShim, self).__getitem__(item)
 115
 116
 117class BaseColumn(np.ndarray):
 118
 119    meta = MetaData()
 120
 121    _nd_proxy_classes = {}
 122    """
 123    Alternate versions of BaseColumn and any subclasses that have the
 124    _NDColumnProxyShim, mapped to by the original class.  The shimmed
 125    classes have the same name as the original class and are otherwise
 126    indistinguishable.  This hack exists only as a performance tweak.
 127    """
 128
    def __new__(cls, data=None, name=None,
                dtype=None, shape=(), length=0,
                description=None, unit=None, format=None, meta=None, copy=False):
        """
        Build the array behind a column and attach the column attributes.

        Where the data come from depends on ``data``:

        - `None`: a zero-filled array of ``length`` rows, each row having
          shape ``shape`` and type ``dtype``.
        - a `BaseColumn` that has a ``_name`` attribute: its data are used,
          and any of ``description``, ``unit``, ``format``, ``meta`` and
          ``name`` left as `None` are taken from it (``meta`` is
          deep-copied).
        - a `Quantity`: its values are used, converted to ``unit`` when one
          is given; otherwise the unit of the quantity is adopted.  Any of
          ``description``, ``format`` and ``meta`` left as `None` are taken
          from ``data.info``.
        - anything else: handed to ``np.array``.

        ``dtype`` and ``copy`` are passed on to ``np.array`` whenever
        ``data`` is given.  The returned object is a view of that array as
        ``cls``, or as the proxy class from ``_get_nd_proxy_class``, and has
        no parent table.
        """

        if data is None:
            # Sub-array dtype: every row is an array of shape ``shape``.
            dtype = (np.dtype(dtype).str, shape)
            self_data = np.zeros(length, dtype=dtype)
        elif isinstance(data, BaseColumn) and hasattr(data, '_name'):
            # When unpickling a MaskedColumn, ``data`` will be a bare
            # BaseColumn with none of the expected attributes.  In this case
            # do NOT execute this block which initializes from ``data``
            # attributes.
            self_data = np.array(data.data, dtype=dtype, copy=copy)
            if description is None:
                description = data.description
            if unit is None:
                # ``unit`` is None here, so this evaluates to ``data.unit``.
                unit = unit or data.unit
            if format is None:
                format = data.format
            if meta is None:
                meta = deepcopy(data.meta)
            if name is None:
                name = data.name
        elif isinstance(data, Quantity):
            if unit is None:
                self_data = np.array(data, dtype=dtype, copy=copy)
                unit = data.unit
            else:
                # Convert the values to the requested unit first.
                self_data = np.array(data.to(unit), dtype=dtype, copy=copy)
            if description is None:
                description = data.info.description
            if format is None:
                format = data.info.format
            if meta is None:
                meta = deepcopy(data.info.meta)

        else:
            self_data = np.array(data, dtype=dtype, copy=copy)

        # Multi-dimensional data get the class with the item-access shim.
        cls = cls._get_nd_proxy_class(self_data)

        self = self_data.view(cls)
        self._name = fix_column_name(name)
        self.unit = unit
        self.format = format
        self.description = description
        self.meta = meta
        self._parent_table = None

        return self
 179
 180    @classmethod
 181    def _get_nd_proxy_class(cls, data):
 182        """
 183        Creates new classes with the _NDColumnProxyShim.  See the docstring
 184        for _NDColumnProxyShim for more detail.
 185
 186        The data argument should be the array data that will be held by the
 187        column--this can be used to determine what proxy class to use if any at
 188        all.
 189        """
 190
 191        if data.ndim < 2:
 192            # We only this special proxy for columns whose individual elements
 193            # are themselves arrays
 194            return cls
 195
 196        if cls not in cls._nd_proxy_classes:
 197            cls._nd_proxy_classes[cls] = type(cls.__name__,
 198                                              (_NDColumnProxyShim, cls), {})
 199        return cls._nd_proxy_classes[cls]
 200
    @property
    def data(self):
        """
        The column values as a plain `numpy.ndarray` view of this object
        (no copy is made).
        """
        return self.view(np.ndarray)
 204
 205    @property
 206    def parent_table(self):
 207        if self._parent_table is None:
 208            return None
 209        else:
 210            return self._parent_table()
 211
 212    @parent_table.setter
 213    def parent_table(self, table):
 214        if table is None:
 215            self._parent_table = None
 216        else:
 217            self._parent_table = weakref.ref(table)
 218
 219    info = InfoDescriptor(ColumnInfo)
 220
    def copy(self, order='C', data=None, copy_data=True):
        """
        Return a copy of the current instance.

        If ``data`` is supplied then a view (reference) of ``data`` is used,
        and ``copy_data`` is ignored.

        Parameters
        ----------
        order : {'C', 'F', 'A', 'K'}, optional
            Controls the memory layout of the copy. 'C' means C-order,
            'F' means F-order, 'A' means 'F' if ``a`` is Fortran contiguous,
            'C' otherwise. 'K' means match the layout of ``a`` as closely
            as possible. (Note that this function and :func:`numpy.copy` are
            very similar, but have different default values for their order=
            arguments.)  Default is 'C'.
        data : array, optional
            If supplied then use a view of ``data`` instead of the instance
            data.  This allows copying the instance attributes and meta.
        copy_data : bool, optional
            Make a copy of the internal numpy array instead of using a
            reference.  Default is True.

        Returns
        -------
        col : Column or MaskedColumn
            Copy of the current column (same type as original)
        """
        if data is None:
            data = self.data
            if copy_data:
                data = data.copy(order)

        out = data.view(self.__class__)
        # Carry the column attributes and meta of ``self`` over to ``out``.
        out.__array_finalize__(self)
        # for MaskedColumn, MaskedArray.__array_finalize__ also copies mask
        # from self, which is not the idea here, so undo
        if isinstance(self, MaskedColumn):
            out._mask = data._mask

        self._copy_groups(out)

        return out
 264
 265    def __setstate__(self, state):
 266        """
 267        Restore the internal state of the Column/MaskedColumn for pickling
 268        purposes.  This requires that the last element of ``state`` is a
 269        5-tuple that has Column-specific state values.
 270        """
 271        # Get the Column attributes and meta
 272        name, unit, format, description, meta = state[-1]
 273        state = state[:-1]
 274
 275        # Using super(type(self), self).__setstate__() gives an infinite
 276        # recursion.  Manually call the right super class to actually set up
 277        # the array object.
 278        super_class = ma.MaskedArray if isinstance(self, ma.MaskedArray) else np.ndarray
 279        super_class.__setstate__(self, state)
 280
 281        # Set the Column attributes and meta
 282        self._name = name
 283        self.unit = unit
 284        self.format = format
 285        self.description = description
 286        self.meta = meta
 287
 288    def __reduce__(self):
 289        """
 290        Return a 3-tuple for pickling a Column.  Use the super-class
 291        functionality but then add in a 5-tuple of Column-specific values
 292        that get used in __setstate__.
 293        """
 294        super_class = ma.MaskedArray if isinstance(self, ma.MaskedArray) else np.ndarray
 295        reconstruct_func, reconstruct_func_args, state = super_class.__reduce__(self)
 296
 297        # Define Column-specific attrs and meta that gets added to state.
 298        column_state = (self.name, self.unit, self.format, self.description,
 299                        self.meta)
 300        state = state + (column_state,)
 301
 302        return reconstruct_func, reconstruct_func_args, state
 303
    # avoid == and != to be done based on type of subclass
    # (helped solve #1446; see also __array_wrap__)
    def __eq__(self, other):
        """Compare the plain ``data`` view of this column with ``other``."""
        return self.data.__eq__(other)

    def __ne__(self, other):
        """Compare the plain ``data`` view of this column with ``other``."""
        return self.data.__ne__(other)
 311
    def __array_finalize__(self, obj):
        """
        Initialise the column attributes of an array created from ``obj``.

        Nothing is done when ``obj`` is `None`.  Otherwise the parent table
        reference is cleared and ``name``, ``unit``, ``format``,
        ``description`` and ``meta`` are copied from ``obj`` where it has
        them (see ``_copy_attrs``).
        """
        # Obj will be none for direct call to Column() creator
        if obj is None:
            return

        # Give the parent class its own finalisation first, when it
        # provides a callable hook.
        if six.callable(super(BaseColumn, self).__array_finalize__):
            super(BaseColumn, self).__array_finalize__(obj)

        # Self was created from template (e.g. obj[slice] or (obj * 2))
        # or viewcast e.g. obj.view(Column).  In either case we want to
        # init Column attributes for self from obj if possible.
        self.parent_table = None
        self._copy_attrs(obj)
 325
 326    def __array_wrap__(self, out_arr, context=None):
 327        """
 328        __array_wrap__ is called at the end of every ufunc.
 329
 330        Normally, we want a Column object back and do not have to do anything
 331        special. But there are two exceptions:
 332
 333        1) If the output shape is different (e.g. for reduction ufuncs
 334           like sum() or mean()), a Column still linking to a parent_table
 335           makes little sense, so we return the output viewed as the
 336           column content (ndarray or MaskedArray).
 337           For this case, we use "[()]" to select everything, and to ensure we
 338           convert a zero rank array to a scalar. (For some reason np.sum()
 339           returns a zero rank scalar array while np.mean() returns a scalar;
 340           So the [()] is needed for this case.
 341
 342        2) When the output is created by any function that returns a boolean
 343           we also want to consistently return an array rather than a column
 344           (see #1446 and #1685)
 345        """
 346        out_arr = super(BaseColumn, self).__array_wrap__(out_arr, context)
 347        if (self.shape != out_arr.shape or
 348            (isinstance(out_arr, BaseColumn) and
 349             (context is not None and context[0] in _comparison_functions))):
 350            return out_arr.data[()]
 351        else:
 352            return out_arr
 353
 354    @property
 355    def name(self):
 356        """
 357        The name of this column.
 358        """
 359        return self._name
 360
 361    @name.setter
 362    def name(self, val):
 363        val = fix_column_name(val)
 364
 365        if self.parent_table is not None:
 366            table = self.parent_table
 367            table.columns._rename_column(self.name, val)
 368
 369        self._name = val
 370
 371    @property
 372    def descr(self):
 373        """Array-interface compliant full description of the column.
 374
 375        This returns a 3-tuple (name, type, shape) that can always be
 376        used in a structured array dtype definition.
 377        """
 378        return (self.name, self.dtype.str, self.shape[1:])
 379
 380    def iter_str_vals(self):
 381        """
 382        Return an iterator that yields the string-formatted values of this
 383        column.
 384
 385        Returns
 386        -------
 387        str_vals : iterator
 388            Column values formatted as strings
 389        """
 390        # Iterate over formatted values with no max number of lines, no column
 391        # name, no unit, and ignoring the returned header info in outs.
 392        _pformat_col_iter = self._formatter._pformat_col_iter
 393        for str_val in _pformat_col_iter(self, -1, show_name=False, show_unit=False,
 394                                         show_dtype=False, outs={}):
 395            yield str_val
 396
 397    def attrs_equal(self, col):
 398        """Compare the column attributes of ``col`` to this object.
 399
 400        The comparison attributes are: ``name``, ``unit``, ``dtype``,
 401        ``format``, ``description``, and ``meta``.
 402
 403        Parameters
 404        ----------
 405        col : Column
 406            Comparison column
 407
 408        Returns
 409        -------
 410        equal : boolean
 411            True if all attributes are equal
 412        """
 413        if not isinstance(col, BaseColumn):
 414            raise ValueError('Comparison `col` must be a Column or '
 415                             'MaskedColumn object')
 416
 417        attrs = ('name', 'unit', 'dtype', 'format', 'description', 'meta')
 418        equal = all(getattr(self, x) == getattr(col, x) for x in attrs)
 419
 420        return equal
 421
 422    @property
 423    def _formatter(self):
 424        return FORMATTER if (self.parent_table is None) else self.parent_table.formatter
 425
 426    def pformat(self, max_lines=None, show_name=True, show_unit=False, show_dtype=False,
 427                html=False):
 428        """Return a list of formatted string representation of column values.
 429
 430        If no value of ``max_lines`` is supplied then the height of the
 431        screen terminal is used to set ``max_lines``.  If the terminal
 432        height cannot be determined then the default will be
 433        determined using the ``astropy.conf.max_lines`` configuration
 434        item. If a negative value of ``max_lines`` is supplied then
 435        there is no line limit applied.
 436
 437        Parameters
 438        ----------
 439        max_lines : int
 440            Maximum lines of output (header + data rows)
 441
 442        show_name : bool
 443            Include column name (default=True)
 444
 445        show_unit : bool
 446            Include a header row for unit (default=False)
 447
 448        show_dtype : bool
 449            Include column dtype (default=False)
 450
 451        html : bool
 452            Format the output as an HTML table (default=False)
 453
 454        Returns
 455        -------
 456        lines : list
 457            List of lines with header and formatted column values
 458
 459        """
 460        _pformat_col = self._formatter._pformat_col
 461        lines, outs = _pformat_col(self, max_lines, show_name=show_name,
 462                                   show_unit=show_unit, show_dtype=show_dtype,
 463                                   html=html)
 464        return lines
 465
    def pprint(self, max_lines=None, show_name=True, show_unit=False, show_dtype=False):
        """Print a formatted string representation of column values.

        If no value of ``max_lines`` is supplied then the height of the
        screen terminal is used to set ``max_lines``.  If the terminal
        height cannot be determined then the default will be
        determined using the ``astropy.conf.max_lines`` configuration
        item. If a negative value of ``max_lines`` is supplied then
        there is no line limit applied.

        Parameters
        ----------
        max_lines : int
            Maximum number of values in output

        show_name : bool
            Include column name (default=True)

        show_unit : bool
            Include a header row for unit (default=False)

        show_dtype : bool
            Include column dtype (default=False)
        """
        _pformat_col = self._formatter._pformat_col
        lines, outs = _pformat_col(self, max_lines, show_name=show_name, show_unit=show_unit,
                                   show_dtype=show_dtype)

        # The first ``n_header`` lines are header rows; they are printed
        # in red, the data rows are printed plainly.
        n_header = outs['n_header']
        for i, line in enumerate(lines):
            if i < n_header:
                color_print(line, 'red')
            else:
                print(line)
 500
 501    def more(self, max_lines=None, show_name=True, show_unit=False):
 502        """Interactively browse column with a paging interface.
 503
 504        Supported keys::
 505
 506          f, <space> : forward one page
 507          b : back one page
 508          r : refresh same page
 509          n : next row
 510          p : previous row
 511          < : go to beginning
 512          > : go to end
 513          q : quit browsing
 514          h : print this help
 515
 516        Parameters
 517        ----------
 518        max_lines : int
 519            Maximum number of lines in table output
 520
 521        show_name : bool
 522            Include a header row for column names (default=True)
 523
 524        show_unit : bool
 525            Include a header row for unit (default=False)
 526
 527        """
 528        _more_tabcol = self._formatter._more_tabcol
 529        _more_tabcol(self, max_lines=max_lines, show_name=show_name,
 530                     show_unit=show_unit)
 531
 532    @property
 533    def unit(self):
 534        """
 535        The unit associated with this column.  May be a string or a
 536        `astropy.units.UnitBase` instance.
 537
 538        Setting the ``unit`` property does not change the values of the
 539        data.  To perform a unit conversion, use ``convert_unit_to``.
 540        """
 541        return self._unit
 542
 543    @unit.setter
 544    def unit(self, unit):
 545        if unit is None:
 546            self._unit = None
 547        else:
 548            self._unit = Unit(unit, parse_strict='silent')
 549
 550    @unit.deleter
 551    def unit(self):
 552        self._unit = None
 553
 554    def convert_unit_to(self, new_unit, equivalencies=[]):
 555        """
 556        Converts the values of the column in-place from the current
 557        unit to the given unit.
 558
 559        To change the unit associated with this column without
 560        actually changing the data values, simply set the ``unit``
 561        property.
 562
 563        Parameters
 564        ----------
 565        new_unit : str or `astropy.units.UnitBase` instance
 566            The unit to convert to.
 567
 568        equivalencies : list of equivalence pairs, optional
 569           A list of equivalence pairs to try if the unit are not
 570           directly convertible.  See :ref:`unit_equivalencies`.
 571
 572        Raises
 573        ------
 574        astropy.units.UnitsError
 575            If units are inconsistent
 576        """
 577        if self.unit is None:
 578            raise ValueError("No unit set on column")
 579        self.data[:] = self.unit.to(
 580            new_unit, self.data, equivalencies=equivalencies)
 581        self.unit = new_unit
 582
 583    @property
 584    def groups(self):
 585        if not hasattr(self, '_groups'):
 586            self._groups = groups.ColumnGroups(self)
 587        return self._groups
 588
 589    def group_by(self, keys):
 590        """
 591        Group this column by the specified ``keys``
 592
 593        This effectively splits the column into groups which correspond to
 594        unique values of the ``keys`` grouping object.  The output is a new
 595        `Column` or `MaskedColumn` which contains a copy of this column but
 596        sorted by row according to ``keys``.
 597
 598        The ``keys`` input to ``group_by`` must be a numpy array with the
 599        same length as this column.
 600
 601        Parameters
 602        ----------
 603        keys : numpy array
 604            Key grouping object
 605
 606        Returns
 607        -------
 608        out : Column
 609            New column with groups attribute set accordingly
 610        """
 611        return groups.column_group_by(self, keys)
 612
 613    def _copy_groups(self, out):
 614        """
 615        Copy current groups into a copy of self ``out``
 616        """
 617        if self.parent_table:
 618            if hasattr(self.parent_table, '_groups'):
 619                out._groups = groups.ColumnGroups(out, indices=self.parent_table._groups._indices)
 620        elif hasattr(self, '_groups'):
 621            out._groups = groups.ColumnGroups(out, indices=self._groups._indices)
 622
 623    # Strip off the BaseColumn-ness for repr and str so that
 624    # MaskedColumn.data __repr__ does not include masked_BaseColumn(data =
 625    # [1 2], ...).
 626    def __repr__(self):
 627        return np.asarray(self).__repr__()
 628
 629    @property
 630    def quantity(self):
 631        """
 632        A view of this table column as a `~astropy.units.Quantity` object with
 633        units given by the Column's `unit` parameter.
 634        """
 635        # the Quantity initializer is used here because it correctly fails
 636        # if the column's values are non-numeric (like strings), while .view
 637        # will happily return a quantity with gibberish for numerical values
 638        return Quantity(self, copy=False, dtype=self.dtype, order='A')
 639
 640    def to(self, unit, equivalencies=[], **kwargs):
 641        """
 642        Converts this table column to a `~astropy.units.Quantity` object with
 643        the requested units.
 644
 645        Parameters
 646        ----------
 647        unit : `~astropy.units.Unit` or str
 648            The unit to convert to (i.e., a valid argument to the
 649            :meth:`astropy.units.Quantity.to` method).
 650        equivalencies : list of equivalence pairs, optional
 651            Equivalencies to use for this conversion.  See
 652            :meth:`astropy.units.Quantity.to` for more details.
 653
 654        Returns
 655        -------
 656        quantity : `~astropy.units.Quantity`
 657            A quantity object with the contents of this column in the units
 658            ``unit``.
 659        """
 660        return self.quantity.to(unit, equivalencies)
 661
 662    def _copy_attrs(self, obj):
 663        """
 664        Copy key column attributes from ``obj`` to self
 665        """
 666        for attr in ('name', 'unit', 'format', 'description'):
 667            val = getattr(obj, attr, None)
 668            setattr(self, attr, val)
 669        self.meta = deepcopy(getattr(obj, 'meta', {}))
 670
 671
 672class Column(BaseColumn):
 673    """Define a data column for use in a Table object.
 674
 675    Parameters
 676    ----------
 677    data : list, ndarray or None
 678        Column data values
 679    name : str
 680        Column name and key for reference within Table
 681    dtype : numpy.dtype compatible value
 682        Data type for column
 683    shape : tuple or ()
 684        Dimensions of a single row element in the column data
 685    length : int or 0
 686        Number of row elements in column data
 687    description : str or None
 688        Full description of column
 689    unit : str or None
 690        Physical unit
 691    format : str or None or function or callable
 692        Format string for outputting column values.  This can be an
 693        "old-style" (``format % value``) or "new-style" (`str.format`)
 694        format specification string or a function or any callable object that
 695        accepts a single value and returns a string.
 696    meta : dict-like or None
 697        Meta-data associated with the column
 698
 699    Examples
 700    --------
 701    A Column can be created in two different ways:
 702
 703    - Provide a ``data`` value but not ``shape`` or ``length`` (which are
 704      inferred from the data).
 705
 706      Examples::
 707
 708        col = Column(data=[1, 2], name='name')  # shape=(2,)
 709        col = Column(data=[[1, 2], [3, 4]], name='name')  # shape=(2, 2)
 710        col = Column(data=[1, 2], name='name', dtype=float)
 711        col = Column(data=np.array([1, 2]), name='name')
 712        col = Column(data=['hello', 'world'], name='name')
 713
 714      The ``dtype`` argument can be any value which is an acceptable
 715      fixed-size data-type initializer for the numpy.dtype() method.  See
 716      `<http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html>`_.
 717      Examples include:
 718
 719      - Python non-string type (float, int, bool)
 720      - Numpy non-string type (e.g. np.float32, np.int64, np.bool)
 721      - Numpy.dtype array-protocol type strings (e.g. 'i4', 'f8', 'S15')
 722
 723      If no ``dtype`` value is provide then the type is inferred using
 724      ``np.array(data)``.
 725
 726    - Provide ``length`` and optionally ``shape``, but not ``data``
 727
 728      Examples::
 729
 730        col = Column(name='name', length=5)
 731        col = Column(name='name', dtype=int, length=10, shape=(3,4))
 732
 733      The default ``dtype`` is ``np.float64``.  The ``shape`` argument is the
 734      array shape of a single cell in the column.
 735    """
 736
 737    def __new__(cls, data=None, name=None,
 738                dtype=None, shape=(), length=0,
 739                description=None, unit=None, format=None, meta=None, copy=False):
 740
 741        if isinstance(data, MaskedColumn) and np.any(data.mask):
 742            raise TypeError("Cannot convert a MaskedColumn with masked value to a Column")
 743
 744        self = super(Column, cls).__new__(cls, data=data, name=name, dtype=dtype,
 745                                          shape=shape, length=length, description=description,
 746                                          unit=unit, format=format, meta=meta, copy=copy)
 747        return self
 748
 749    def _base_repr_(self, html=False):
 750        # If scalar then just convert to correct numpy type and use numpy repr
 751        if self.ndim == 0:
 752            return repr(self.item())
 753
 754        descr_vals = [self.__class__.__name__]
 755        unit = None if self.unit is None else str(self.unit)
 756        shape = None if self.ndim <= 1 else self.shape[1:]
 757        for attr, val in (('name', self.name),
 758                          ('dtype', dtype_info_name(self.dtype)),
 759                          ('shape', shape),
 760                          ('unit', unit),
 761                          ('format', self.format),
 762                          ('description', self.description),
 763                          ('length', len(self))):
 764
 765            if val is not None:
 766                descr_vals.append('{0}={1}'.format(attr, repr(val)))
 767
 768        descr = '<' + ' '.join(descr_vals) + '>\n'
 769
 770        if html:
 771            from ..utils.xml.writer import xml_escape
 772            descr = xml_escape(descr)
 773
 774        data_lines, outs = self._formatter._pformat_col(
 775            self, show_name=False, show_unit=False, show_length=False, html=html)
 776
 777        out = descr + '\n'.join(data_lines)
 778        if six.PY2 and isinstance(out, six.text_type):
 779            out = out.encode('utf-8')
 780
 781        return out
 782
    def _repr_html_(self):
        """Return the HTML form of the column representation."""
        return self._base_repr_(html=True)

    def __repr__(self):
        """Return the plain-text form of the column representation."""
        return self._base_repr_(html=False)
 788
    def __unicode__(self):
        """Return the formatted column as text, one line per output row."""
        # If scalar then just convert to the matching Python scalar and use
        # its str
        if self.ndim == 0:
            return str(self.item())

        lines, outs = self._formatter._pformat_col(self)
        return '\n'.join(lines)
    # On Python 3 ``str()`` uses the text representation directly.
    if six.PY3:
        __str__ = __unicode__
 798
    def __bytes__(self):
        """Return the text representation encoded as UTF-8."""
        return six.text_type(self).encode('utf-8')
    # On Python 2 ``str()`` must give bytes, so it uses the encoded form.
    if six.PY2:
        __str__ = __bytes__
 803
    # Set items using a view of the underlying data, as it gives an
    # order-of-magnitude speed-up. [#2994]
    def __setitem__(self, index, value):
        """Assign ``value`` at ``index`` through the plain ``data`` view."""
        self.data[index] = value

    # Set slices using a view of the underlying data, as it gives an
    # order-of-magnitude speed-up.  Only gets called in Python 2.  [#3020]
    def __setslice__(self, start, stop, value):
        """Assign ``value`` to a slice through the plain ``data`` view."""
        self.data.__setslice__(start, stop, value)
 813
 814    def insert(self, obj, values):
 815        """
 816        Insert values before the given indices in the column and return
 817        a new `~astropy.table.Column` object.
 818
 819        Parameters
 820        ----------
 821        obj : int, slice or sequence of ints
 822            Object that defines the index or indices before which ``values`` is
 823            inserted.
 824        values : array_like
 825            Value(s) to insert.  If the type of ``values`` is different
 826            from that of quantity, ``values`` is converted to the matching type.
 827            ``values`` should be shaped so that it can be broadcast appropriately
 828
 829        Returns
 830        -------
 831        out : `~astropy.table.Column`
 832            A copy of column with ``values`` and ``mask`` inserted.  Note that the
 833            insertion does not occur in-place: a new column is returned.
 834        """
 835        if self.dtype.kind == 'O':
 836            # Even if values is array-like (e.g. [1,2,3]), insert as a single
 837            # object.  Numpy.insert instead inserts each element in an array-like
 838            # input individually.
 839            data = np.insert(self, obj, None, axis=0)
 840            data[obj] = values
 841        else:
 842            # Explicitly convert to dtype of this column.  Needed because numpy 1.7
 843            # enforces safe casting by default, so .  This isn't the case for 1.6 or 1.8+.
 844            values = np.asarray(values, dtype=self.dtype)
 845            data = np.insert(self, obj, values, axis=0)
 846        out = data.view(self.__class__)
 847        out.__array_finalize__(self)
 848        return out
 849
    # Re-expose selected BaseColumn attributes as explicit attributes of this
    # class.  We do this to make the methods show up in the API docs.
    name = BaseColumn.name
    unit = BaseColumn.unit
    copy = BaseColumn.copy
    more = BaseColumn.more
    pprint = BaseColumn.pprint
    pformat = BaseColumn.pformat
    convert_unit_to = BaseColumn.convert_unit_to
    quantity = BaseColumn.quantity
    to = BaseColumn.to
 860
 861
 862class MaskedColumn(Column, ma.MaskedArray):
 863    """Define a masked data column for use in a Table object.
 864
 865    Parameters
 866    ----------
 867    data : list, ndarray or None
 868        Column data values
 869    name : str
 870        Column name and key for reference within Table
 871    mask : list, ndarray or None
 872        Boolean mask for which True indicates missing or invalid data
 873    fill_value : float, int, str or None
 874        Value used when filling masked column elements
 875    dtype : numpy.dtype compatible value
 876        Data type for column
 877    shape : tuple or ()
 878        Dimensions of a single row element in the column data
 879    length : int or 0
 880        Number of row elements in column data
 881    description : str or None
 882        Full description of column
 883    unit : str or None
 884        Physical unit
 885    format : str or None or function or callable
 886        Format string for outputting column values.  This can be an
 887        "old-style" (``format % value``) or "new-style" (`str.format`)
 888        format specification string or a function or any callable object that
 889        accepts a single value and returns a string.
 890    meta : dict-like or None
 891        Meta-data associated with the column
 892
 893    Examples
 894    --------
 895    A MaskedColumn is similar to a Column except that it includes ``mask`` and
 896    ``fill_value`` attributes.  It can be created in two different ways:
 897
 898    - Provide a ``data`` value but not ``shape`` or ``length`` (which are
 899      inferred from the data).
 900
 901      Examples::
 902
 903        col = MaskedColumn(data=[1, 2], name='name')
 904        col = MaskedColumn(data=[1, 2], name='name', mask=[True, False])
 905        col = MaskedColumn(data=[1, 2], name='name', dtype=float, fill_value=99)
 906
 907      The ``mask`` argument will be cast as a boolean array and specifies
 908      which elements are considered to be missing or invalid.
 909
 910      The ``dtype`` argument can be any value which is an acceptable
 911      fixed-size data-type initializer for the numpy.dtype() method.  See
 912      `<http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html>`_.
 913      Examples include:
 914
 915      - Python non-string type (float, int, bool)
 916      - Numpy non-string type (e.g. np.float32, np.int64, np.bool)
 917      - Numpy.dtype array-protocol type strings (e.g. 'i4', 'f8', 'S15')
 918
 919      If no ``dtype`` value is provide then the type is inferred using
 920      ``np.array(data)``.  When ``data`` is provided then the ``shape``
 921      and ``length`` arguments are ignored.
 922
 923    - Provide ``length`` and optionally ``shape``, but not ``data``
 924
 925      Examples::
 926
 927        col = MaskedColumn(name='name', length=5)
 928        col = MaskedColumn(name='name', dtype=int, length=10, shape=(3,4))
 929
 930      The default ``dtype`` is ``np.float64``.  The ``shape`` argument is the
 931      array shape of a single cell in the column.
 932    """
 933
 934    def __new__(cls, data=None, name=None, mask=None, fill_value=None,
 935                dtype=None, shape=(), length=0,
 936                description=None, unit=None, format=None, meta=None, copy=False):
 937
 938        if mask is None and hasattr(data, 'mask'):
 939            mask = data.mask
 940        else:
 941            mask = deepcopy(mask)
 942
 943        # Create self using MaskedArray as a wrapper class, following the example of
 944        # class MSubArray in
 945        # https://github.com/numpy/numpy/blob/maintenance/1.8.x/numpy/ma/tests/test_subclassing.py
 946        # This pattern makes it so that __array_finalize__ is called as expected (e.g. #1471 and
 947        # https://github.com/astropy/astropy/commit/ff6039e8)
 948
 949        # First just pass through all args and kwargs to BaseColumn, then wrap that object
 950        # with MaskedArray.
 951        self_data = BaseColumn(data, dtype=dtype, shape=shape, length=length, name=name,
 952                               unit=unit, format=format, description=description, meta=meta, copy=copy)
 953
 954        cls = cls._get_nd_proxy_class(self_data)
 955
 956        self = ma.MaskedArray.__new__(cls, data=self_data, mask=mask)
 957
 958        # Note: do not set fill_value in the MaskedArray constructor because this does not
 959        # go through the fill_value workarounds (see _fix_fill_value below).
 960        if fill_value is None and hasattr(data, 'fill_value'):
 961            fill_value = data.fill_value
 962        self.fill_value = fill_value
 963
 964        self.parent_table = None
 965
 966        return self
 967
 968    def _fix_fill_value(self, val):
 969        """Fix a fill value (if needed) to work around a bug with setting the fill
 970        value of a string array in MaskedArray with Python 3.x.  See
 971        https://github.com/numpy/numpy/pull/2733.  This mimics the check in
 972        numpy.ma.core._check_fill_value() (version < 1.8) which incorrectly sets
 973        fill_value to a default if self.dtype.char is 'U' (which is the case for Python
 974        3).  Here we change the string to a byte string so that in Python 3 the
 975        isinstance(val, basestring) part fails.
 976        """
 977
 978        if (NUMPY_LT_1_8 and isinstance(val, six.string_types) and
 979                (self.dtype.char not in 'SV')):
 980            val = val.encode()
 981        return val
 982
 983    @property
 984    def fill_value(self):
 985        return self.get_fill_value()  # defer to native ma.MaskedArray method
 986
 987    @fill_value.setter
 988    def fill_value(self, val):
 989        """Set fill value both in the masked column view and in the parent table
 990        if it exists.  Setting one or the other alone doesn't work."""
 991        val = self._fix_fill_value(val)
 992
 993        # Yet another ma bug workaround: If the value of fill_value for a string array is
 994        # requested but not yet set then it gets created as 'N/A'.  From this point onward
 995        # any new fill_values are truncated to 3 characters.  Note that this does not
 996        # occur if the masked array is a structured array (as in the previous block that
 997        # deals with the parent table).
 998        #
 999        # >>> x = ma.array(['xxxx'])
1000        # >>> x.fill_value  # fill_value now gets represented as an 'S3' array
1001        # 'N/A'
1002        # >>> x.fill_value='yyyy'
1003        # >>> x.fill_value
1004        # 'yyy'
1005        #
1006        # To handle this we are forced to reset a private variable first:
1007        self._fill_value = None
1008
1009        self.set_fill_value(val)  # defer to native ma.MaskedArray method
1010
1011    @property
1012    def data(self):
1013        out = self.view(ma.MaskedArray)
1014        # The following is necessary because of a bug in Numpy, which was
1015        # fixed in numpy/numpy#2703. The fix should be included in Numpy 1.8.0.
1016        out.fill_value = self.fill_value
1017        return out
1018
1019    def filled(self, fill_value=None):
1020        """Return a copy of self, with masked values filled with a given value.
1021
1022        Parameters
1023        ----------
1024        fill_value : scalar; optional
1025            The value to use for invalid entries (`None` by default).  If
1026            `None`, the ``fill_value`` attribute of the array is used
1027            instead.
1028
1029        Returns
1030        -------
1031        filled_column : Column
1032            A copy of ``self`` with masked entries replaced by `fill_value`
1033            (be it the function argument or the attribute of ``self``).
1034        """
1035        if fill_value is None:
1036            fill_value = self.fill_value
1037        fill_value = self._fix_fill_value(fill_value)
1038
1039        data = super(MaskedColumn, self).filled(fill_value)
1040        # Use parent table definition of Column if available
1041        column_cls = self.parent_table.Column if (self.parent_table is not None) else Column
1042        out = column_cls(name=self.name, data=data, unit=self.unit,
1043                         format=self.format, description=self.description,
1044                         meta=deepcopy(self.meta))
1045        return out
1046
1047    def insert(self, obj, values, mask=None):
1048        """
1049        Insert values along the given axis before the given indices and return
1050        a new `~astropy.table.MaskedColumn` object.
1051
1052        Parameters
1053        ----------
1054        obj : int, slice or sequence of ints
1055            Object that defines the index or indices before which ``values`` is
1056            inserted.
1057        values : array_like
1058            Value(s) to insert.  If the type of ``values`` is different
1059            from that of quantity, ``values`` is converted to the matching type.
1060            ``values`` should be shaped so that it can be broadcast appropriately
1061        mask : boolean array_like
1062            Mask value(s) to insert.  If not supplied then False is used.
1063
1064        Returns
1065        -------
1066        out : `~astropy.table.MaskedColumn`
1067            A copy of column with ``values`` and ``mask`` inserted.  Note that the
1068            insertion does not occur in-place: a new masked column is returned.
1069        """
1070        self_ma = self.data  # self viewed as MaskedArray
1071
1072        if self.dtype.kind == 'O':
1073            # Even if values is array-like (e.g. [1,2,3]), insert as a single
1074            # object.  Numpy.insert instead inserts each element in an array-like
1075            # input individually.
1076            new_data = np.insert(self_ma.data, obj, None, axis=0)
1077            new_data[obj] = values
1078        else:
1079            # Explicitly convert to dtype of this column.  Needed because numpy 1.7
1080            # enforces safe casting by default, so .  This isn't the case for 1.6 or 1.8+.
1081            values = np.asarray(values, dtype=self.dtype)
1082            new_data = np.insert(self_ma.data, obj, values, axis=0)
1083
1084        if mask is None:
1085            if self.dtype.kind == 'O':
1086                mask = False
1087            else:
1088                mask = np.zeros(values.shape, dtype=np.bool)
1089        new_mask = np.insert(self_ma.mask, obj, mask, axis=0)
1090        new_ma = np.ma.array(new_data, mask=new_mask, copy=False)
1091
1092        out = new_ma.view(self.__class__)
1093        out.parent_table = None
1094        out._copy_attrs(self)
1095
1096        return out
1097
1098    def __getitem__(self, item):
1099        out = super(MaskedColumn, self).__getitem__(item)
1100
1101        # Fixes issue #3023: when calling getitem with a MaskedArray subclass
1102        # the original object attributes are not copied.
1103        if out.__class__ is self.__class__:
1104            out.parent_table = None
1105            out._copy_attrs(self)
1106
1107        return out
1108
1109    # Set items and slices using MaskedArray method, instead of falling through
1110    # to the (faster) Column version which uses an ndarray view.  This doesn't
1111    # copy the mask properly. See test_setting_from_masked_column test.
1112    def __setitem__(self, index, value):
1113        ma.MaskedArray.__setitem__(self, index, value)
1114
1115    def __setslice__(self, start, stop, value):
1116        ma.MaskedArray.__setslice__(self, start, stop, value)
1117
1118    # We do this to make the methods show up in the API docs
1119    name = BaseColumn.name
1120    copy = BaseColumn.copy
1121    more = BaseColumn.more
1122    pprint = BaseColumn.pprint
1123    pformat = BaseColumn.pformat
1124    convert_unit_to = BaseColumn.convert_unit_to