/pandas/io/json/table_schema.py

https://github.com/neurodebian/pandas · Python · 182 lines · 144 code · 4 blank · 34 comment · 9 complexity · e8136e8fb6b9f7dbe2c2a885b0d3e453 MD5 · raw file

  1. """
  2. Table Schema builders
  3. http://specs.frictionlessdata.io/json-table-schema/
  4. """
  5. from pandas.core.common import _all_not_none
  6. from pandas.core.dtypes.common import (
  7. is_integer_dtype, is_timedelta64_dtype, is_numeric_dtype,
  8. is_bool_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
  9. is_categorical_dtype, is_period_dtype, is_string_dtype
  10. )
  def as_json_table_type(x):
      """
      Map a NumPy / pandas dtype onto the name of a Table Schema type.

      Parameters
      ----------
      x : array or dtype

      Returns
      -------
      t : str
          the Table Schema data type name

      Notes
      -----
      The dtype predicates are tried in the order shown below and the
      first one that matches decides the result.

      ================ =================
      Pandas type      Table Schema type
      ================ =================
      int64            integer
      bool             boolean
      float64          number
      datetime64[ns]   datetime
      datetime64 + tz  datetime
      period           datetime
      timedelta64[ns]  duration
      categorical      any
      object           string
      ================ =================

      Anything that matches none of the predicates is reported as ``any``.
      """
      # Order matters: it mirrors the precedence of the dtype checks
      # (integer before bool before the generic numeric check, and
      # categorical before the string check).
      ordered_checks = (
          (is_integer_dtype, 'integer'),
          (is_bool_dtype, 'boolean'),
          (is_numeric_dtype, 'number'),
          (is_datetime64_dtype, 'datetime'),
          (is_datetime64tz_dtype, 'datetime'),
          (is_period_dtype, 'datetime'),
          (is_timedelta64_dtype, 'duration'),
          (is_categorical_dtype, 'any'),
          (is_string_dtype, 'string'),
      )
      for matches, schema_type in ordered_checks:
          if matches(x):
              return schema_type
      return 'any'
  def set_default_names(data):
      """
      Give every unnamed index level a default name.

      A single-level index without a name becomes ``'index'``; unnamed
      levels of a multi-level index become ``'level_<position>'``.

      Parameters
      ----------
      data : Series, DataFrame

      Returns
      -------
      The object passed in, unchanged, when every index level is already
      named; otherwise a copy of it carrying the filled-in names.
      """
      if _all_not_none(*data.index.names):
          return data

      # Work on a copy so the names are not assigned on the caller's object.
      renamed = data.copy()
      if renamed.index.nlevels > 1:
          filled = []
          for position, label in enumerate(renamed.index.names):
              if label is None:
                  label = 'level_{}'.format(position)
              filled.append(label)
          renamed.index.names = filled
      else:
          renamed.index.name = renamed.index.name or 'index'
      return renamed
  def make_field(arr, dtype=None):
      """
      Build the Table Schema field descriptor for one index or column.

      Parameters
      ----------
      arr : Series or Index
          Object exposing ``name`` and ``dtype``.
      dtype : dtype, optional
          Used instead of ``arr.dtype`` to derive the field ``type``.

      Returns
      -------
      field : dict
          Always has ``'name'`` (``'values'`` when ``arr.name`` is None)
          and ``'type'``. Categorical data adds ``'constraints'`` (an
          ``enum`` list of the categories) and ``'ordered'``; period data
          adds ``'freq'``; timezone-aware datetimes add ``'tz'``.
      """
      # Compare against None explicitly: truth-testing a dtype is
      # unreliable (an unstructured ``numpy.dtype`` has length 0 and may
      # evaluate as falsy), which would discard an explicitly passed dtype.
      if dtype is None:
          dtype = arr.dtype

      name = 'values' if arr.name is None else arr.name
      field = {'name': name,
               'type': as_json_table_type(dtype)}

      if is_categorical_dtype(arr):
          # An object exposing ``categories`` directly is used as-is;
          # otherwise go through the ``.cat`` accessor.
          source = arr if hasattr(arr, 'categories') else arr.cat
          field['constraints'] = {"enum": list(source.categories)}
          field['ordered'] = source.ordered
      elif is_period_dtype(arr):
          field['freq'] = arr.freqstr
      elif is_datetime64tz_dtype(arr):
          tz = arr.dt.tz if hasattr(arr, 'dt') else arr.tz
          # ``zone`` is only provided by pytz timezones. Fall back to
          # ``key`` (zoneinfo) and finally ``str(tz)`` instead of raising
          # AttributeError for other tzinfo implementations.
          field['tz'] = (getattr(tz, 'zone', None) or
                         getattr(tz, 'key', None) or
                         str(tz))
      return field
  def build_table_schema(data, index=True, primary_key=None, version=True):
      """
      Create a Table schema from ``data``.

      Parameters
      ----------
      data : Series, DataFrame
      index : bool, default True
          Whether to include ``data.index`` in the schema.
      primary_key : list-like or None, default None
          Column names to designate as the primary key; a value other
          than None is stored under ``'primaryKey'`` as given.
          The default `None` will set `'primaryKey'` to the index
          level or levels if the index is included and unique.
      version : bool, default True
          Whether to include a field `pandas_version` with the version
          of pandas that generated the schema.

      Returns
      -------
      schema : dict

      Examples
      --------
      >>> df = pd.DataFrame(
      ...     {'A': [1, 2, 3],
      ...      'B': ['a', 'b', 'c'],
      ...      'C': pd.date_range('2016-01-01', freq='d', periods=3),
      ...     }, index=pd.Index(range(3), name='idx'))
      >>> build_table_schema(df)
      {'fields': [{'name': 'idx', 'type': 'integer'},
      {'name': 'A', 'type': 'integer'},
      {'name': 'B', 'type': 'string'},
      {'name': 'C', 'type': 'datetime'}],
      'pandas_version': '0.20.0',
      'primaryKey': ['idx']}

      Notes
      -----
      See `as_json_table_type` for conversion types.
      Timedeltas are converted to ISO8601 duration format with
      9 decimal places after the seconds field for nanosecond precision.

      Categoricals are converted to the `any` dtype, and use the `enum` field
      constraint to list the allowed values. The `ordered` attribute is
      included in an `ordered` field.
      """
      fields = []

      if index:
          # Fill in default names whenever the index is emitted, so that
          # every index field (and the primary key) has a usable name.
          data = set_default_names(data)
          if data.index.nlevels > 1:
              # Take each name from the MultiIndex itself rather than
              # relying on the level objects carrying it.
              for level, name in zip(data.index.levels, data.index.names):
                  level_field = make_field(level)
                  level_field['name'] = name
                  fields.append(level_field)
          else:
              fields.append(make_field(data.index))

      if data.ndim > 1:
          # Positional column access: ``DataFrame.iteritems`` is no longer
          # available in recent pandas, and this also copes with duplicate
          # column labels.
          fields.extend(make_field(data.iloc[:, pos])
                        for pos in range(data.shape[1]))
      else:
          fields.append(make_field(data))

      schema = {'fields': fields}

      if index and data.index.is_unique and primary_key is None:
          if data.index.nlevels == 1:
              schema['primaryKey'] = [data.index.name]
          else:
              schema['primaryKey'] = data.index.names
      elif primary_key is not None:
          schema['primaryKey'] = primary_key

      if version:
          schema['pandas_version'] = '0.20.0'
      return schema