/pandas/io/tests/json/test_json_norm.py

http://github.com/wesm/pandas · Python · 230 lines · 183 code · 46 blank · 1 comment · 7 complexity · 05f692de09335d8a2c0e6c1bacb987ed MD5 · raw file

  1. import nose
  2. from pandas import DataFrame
  3. import numpy as np
  4. import json
  5. import pandas.util.testing as tm
  6. from pandas import compat
  7. from pandas.io.json import json_normalize, nested_to_record
  8. def _assert_equal_data(left, right):
  9. if not left.columns.equals(right.columns):
  10. left = left.reindex(columns=right.columns)
  11. tm.assert_frame_equal(left, right)
  12. class TestJSONNormalize(tm.TestCase):
  13. def setUp(self):
  14. self.state_data = [
  15. {'counties': [{'name': 'Dade', 'population': 12345},
  16. {'name': 'Broward', 'population': 40000},
  17. {'name': 'Palm Beach', 'population': 60000}],
  18. 'info': {'governor': 'Rick Scott'},
  19. 'shortname': 'FL',
  20. 'state': 'Florida'},
  21. {'counties': [{'name': 'Summit', 'population': 1234},
  22. {'name': 'Cuyahoga', 'population': 1337}],
  23. 'info': {'governor': 'John Kasich'},
  24. 'shortname': 'OH',
  25. 'state': 'Ohio'}]
  26. def test_simple_records(self):
  27. recs = [{'a': 1, 'b': 2, 'c': 3},
  28. {'a': 4, 'b': 5, 'c': 6},
  29. {'a': 7, 'b': 8, 'c': 9},
  30. {'a': 10, 'b': 11, 'c': 12}]
  31. result = json_normalize(recs)
  32. expected = DataFrame(recs)
  33. tm.assert_frame_equal(result, expected)
  34. def test_simple_normalize(self):
  35. result = json_normalize(self.state_data[0], 'counties')
  36. expected = DataFrame(self.state_data[0]['counties'])
  37. tm.assert_frame_equal(result, expected)
  38. result = json_normalize(self.state_data, 'counties')
  39. expected = []
  40. for rec in self.state_data:
  41. expected.extend(rec['counties'])
  42. expected = DataFrame(expected)
  43. tm.assert_frame_equal(result, expected)
  44. result = json_normalize(self.state_data, 'counties', meta='state')
  45. expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
  46. tm.assert_frame_equal(result, expected)
  47. def test_more_deeply_nested(self):
  48. data = [{'country': 'USA',
  49. 'states': [{'name': 'California',
  50. 'cities': [{'name': 'San Francisco',
  51. 'pop': 12345},
  52. {'name': 'Los Angeles',
  53. 'pop': 12346}]
  54. },
  55. {'name': 'Ohio',
  56. 'cities': [{'name': 'Columbus',
  57. 'pop': 1234},
  58. {'name': 'Cleveland',
  59. 'pop': 1236}]}
  60. ]
  61. },
  62. {'country': 'Germany',
  63. 'states': [{'name': 'Bayern',
  64. 'cities': [{'name': 'Munich', 'pop': 12347}]
  65. },
  66. {'name': 'Nordrhein-Westfalen',
  67. 'cities': [{'name': 'Duesseldorf', 'pop': 1238},
  68. {'name': 'Koeln', 'pop': 1239}]}
  69. ]
  70. }
  71. ]
  72. result = json_normalize(data, ['states', 'cities'],
  73. meta=['country', ['states', 'name']])
  74. # meta_prefix={'states': 'state_'})
  75. ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
  76. 'states.name': ['California', 'California', 'Ohio', 'Ohio',
  77. 'Bayern', 'Nordrhein-Westfalen',
  78. 'Nordrhein-Westfalen'],
  79. 'name': ['San Francisco', 'Los Angeles', 'Columbus',
  80. 'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'],
  81. 'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]}
  82. expected = DataFrame(ex_data, columns=result.columns)
  83. tm.assert_frame_equal(result, expected)
  84. def test_shallow_nested(self):
  85. data = [{'state': 'Florida',
  86. 'shortname': 'FL',
  87. 'info': {
  88. 'governor': 'Rick Scott'
  89. },
  90. 'counties': [{'name': 'Dade', 'population': 12345},
  91. {'name': 'Broward', 'population': 40000},
  92. {'name': 'Palm Beach', 'population': 60000}]},
  93. {'state': 'Ohio',
  94. 'shortname': 'OH',
  95. 'info': {
  96. 'governor': 'John Kasich'
  97. },
  98. 'counties': [{'name': 'Summit', 'population': 1234},
  99. {'name': 'Cuyahoga', 'population': 1337}]}]
  100. result = json_normalize(data, 'counties',
  101. ['state', 'shortname',
  102. ['info', 'governor']])
  103. ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
  104. 'Cuyahoga'],
  105. 'state': ['Florida'] * 3 + ['Ohio'] * 2,
  106. 'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
  107. 'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
  108. 'population': [12345, 40000, 60000, 1234, 1337]}
  109. expected = DataFrame(ex_data, columns=result.columns)
  110. tm.assert_frame_equal(result, expected)
  111. def test_meta_name_conflict(self):
  112. data = [{'foo': 'hello',
  113. 'bar': 'there',
  114. 'data': [{'foo': 'something', 'bar': 'else'},
  115. {'foo': 'something2', 'bar': 'else2'}]}]
  116. self.assertRaises(ValueError, json_normalize, data,
  117. 'data', meta=['foo', 'bar'])
  118. result = json_normalize(data, 'data', meta=['foo', 'bar'],
  119. meta_prefix='meta')
  120. for val in ['metafoo', 'metabar', 'foo', 'bar']:
  121. self.assertTrue(val in result)
  122. def test_record_prefix(self):
  123. result = json_normalize(self.state_data[0], 'counties')
  124. expected = DataFrame(self.state_data[0]['counties'])
  125. tm.assert_frame_equal(result, expected)
  126. result = json_normalize(self.state_data, 'counties',
  127. meta='state',
  128. record_prefix='county_')
  129. expected = []
  130. for rec in self.state_data:
  131. expected.extend(rec['counties'])
  132. expected = DataFrame(expected)
  133. expected = expected.rename(columns=lambda x: 'county_' + x)
  134. expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
  135. tm.assert_frame_equal(result, expected)
  136. def test_non_ascii_key(self):
  137. if compat.PY3:
  138. testjson = (
  139. b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' +
  140. b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
  141. ).decode('utf8')
  142. else:
  143. testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
  144. '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]')
  145. testdata = {
  146. u'sub.A': [1, 3],
  147. u'sub.B': [2, 4],
  148. b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1]
  149. }
  150. expected = DataFrame(testdata)
  151. result = json_normalize(json.loads(testjson))
  152. tm.assert_frame_equal(result, expected)
  153. class TestNestedToRecord(tm.TestCase):
  154. def test_flat_stays_flat(self):
  155. recs = [dict(flat1=1, flat2=2),
  156. dict(flat1=3, flat2=4),
  157. ]
  158. result = nested_to_record(recs)
  159. expected = recs
  160. self.assertEqual(result, expected)
  161. def test_one_level_deep_flattens(self):
  162. data = dict(flat1=1,
  163. dict1=dict(c=1, d=2))
  164. result = nested_to_record(data)
  165. expected = {'dict1.c': 1,
  166. 'dict1.d': 2,
  167. 'flat1': 1}
  168. self.assertEqual(result, expected)
  169. def test_nested_flattens(self):
  170. data = dict(flat1=1,
  171. dict1=dict(c=1, d=2),
  172. nested=dict(e=dict(c=1, d=2),
  173. d=2))
  174. result = nested_to_record(data)
  175. expected = {'dict1.c': 1,
  176. 'dict1.d': 2,
  177. 'flat1': 1,
  178. 'nested.d': 2,
  179. 'nested.e.c': 1,
  180. 'nested.e.d': 2}
  181. self.assertEqual(result, expected)
  182. if __name__ == '__main__':
  183. nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb',
  184. '--pdb-failure', '-s'], exit=False)