/pandas/tools/tests/test_util.py

http://github.com/wesm/pandas · Python · 381 lines · 272 code · 88 blank · 21 comment · 26 complexity · e64f86405e648b17d4b9384d7df2c501 MD5 · raw file

  1. import os
  2. import locale
  3. import codecs
  4. import nose
  5. import numpy as np
  6. import pandas as pd
  7. from pandas import date_range, Index
  8. import pandas.util.testing as tm
  9. from pandas.tools.util import cartesian_product, to_numeric
  10. CURRENT_LOCALE = locale.getlocale()
  11. LOCALE_OVERRIDE = os.environ.get('LOCALE_OVERRIDE', None)
  12. class TestCartesianProduct(tm.TestCase):
  13. def test_simple(self):
  14. x, y = list('ABC'), [1, 22]
  15. result1, result2 = cartesian_product([x, y])
  16. expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C'])
  17. expected2 = np.array([1, 22, 1, 22, 1, 22])
  18. tm.assert_numpy_array_equal(result1, expected1)
  19. tm.assert_numpy_array_equal(result2, expected2)
  20. def test_datetimeindex(self):
  21. # regression test for GitHub issue #6439
  22. # make sure that the ordering on datetimeindex is consistent
  23. x = date_range('2000-01-01', periods=2)
  24. result1, result2 = [Index(y).day for y in cartesian_product([x, x])]
  25. expected1 = np.array([1, 1, 2, 2], dtype=np.int32)
  26. expected2 = np.array([1, 2, 1, 2], dtype=np.int32)
  27. tm.assert_numpy_array_equal(result1, expected1)
  28. tm.assert_numpy_array_equal(result2, expected2)
  29. class TestLocaleUtils(tm.TestCase):
  30. @classmethod
  31. def setUpClass(cls):
  32. super(TestLocaleUtils, cls).setUpClass()
  33. cls.locales = tm.get_locales()
  34. if not cls.locales:
  35. raise nose.SkipTest("No locales found")
  36. tm._skip_if_windows()
  37. @classmethod
  38. def tearDownClass(cls):
  39. super(TestLocaleUtils, cls).tearDownClass()
  40. del cls.locales
  41. def test_get_locales(self):
  42. # all systems should have at least a single locale
  43. assert len(tm.get_locales()) > 0
  44. def test_get_locales_prefix(self):
  45. if len(self.locales) == 1:
  46. raise nose.SkipTest("Only a single locale found, no point in "
  47. "trying to test filtering locale prefixes")
  48. first_locale = self.locales[0]
  49. assert len(tm.get_locales(prefix=first_locale[:2])) > 0
  50. def test_set_locale(self):
  51. if len(self.locales) == 1:
  52. raise nose.SkipTest("Only a single locale found, no point in "
  53. "trying to test setting another locale")
  54. if LOCALE_OVERRIDE is not None:
  55. lang, enc = LOCALE_OVERRIDE.split('.')
  56. else:
  57. lang, enc = 'it_CH', 'UTF-8'
  58. enc = codecs.lookup(enc).name
  59. new_locale = lang, enc
  60. if not tm._can_set_locale(new_locale):
  61. with tm.assertRaises(locale.Error):
  62. with tm.set_locale(new_locale):
  63. pass
  64. else:
  65. with tm.set_locale(new_locale) as normalized_locale:
  66. new_lang, new_enc = normalized_locale.split('.')
  67. new_enc = codecs.lookup(enc).name
  68. normalized_locale = new_lang, new_enc
  69. self.assertEqual(normalized_locale, new_locale)
  70. current_locale = locale.getlocale()
  71. self.assertEqual(current_locale, CURRENT_LOCALE)
  72. class TestToNumeric(tm.TestCase):
  73. def test_series(self):
  74. s = pd.Series(['1', '-3.14', '7'])
  75. res = to_numeric(s)
  76. expected = pd.Series([1, -3.14, 7])
  77. tm.assert_series_equal(res, expected)
  78. s = pd.Series(['1', '-3.14', 7])
  79. res = to_numeric(s)
  80. tm.assert_series_equal(res, expected)
  81. def test_series_numeric(self):
  82. s = pd.Series([1, 3, 4, 5], index=list('ABCD'), name='XXX')
  83. res = to_numeric(s)
  84. tm.assert_series_equal(res, s)
  85. s = pd.Series([1., 3., 4., 5.], index=list('ABCD'), name='XXX')
  86. res = to_numeric(s)
  87. tm.assert_series_equal(res, s)
  88. # bool is regarded as numeric
  89. s = pd.Series([True, False, True, True],
  90. index=list('ABCD'), name='XXX')
  91. res = to_numeric(s)
  92. tm.assert_series_equal(res, s)
  93. def test_error(self):
  94. s = pd.Series([1, -3.14, 'apple'])
  95. msg = 'Unable to parse string "apple" at position 2'
  96. with tm.assertRaisesRegexp(ValueError, msg):
  97. to_numeric(s, errors='raise')
  98. res = to_numeric(s, errors='ignore')
  99. expected = pd.Series([1, -3.14, 'apple'])
  100. tm.assert_series_equal(res, expected)
  101. res = to_numeric(s, errors='coerce')
  102. expected = pd.Series([1, -3.14, np.nan])
  103. tm.assert_series_equal(res, expected)
  104. s = pd.Series(['orange', 1, -3.14, 'apple'])
  105. msg = 'Unable to parse string "orange" at position 0'
  106. with tm.assertRaisesRegexp(ValueError, msg):
  107. to_numeric(s, errors='raise')
  108. def test_error_seen_bool(self):
  109. s = pd.Series([True, False, 'apple'])
  110. msg = 'Unable to parse string "apple" at position 2'
  111. with tm.assertRaisesRegexp(ValueError, msg):
  112. to_numeric(s, errors='raise')
  113. res = to_numeric(s, errors='ignore')
  114. expected = pd.Series([True, False, 'apple'])
  115. tm.assert_series_equal(res, expected)
  116. # coerces to float
  117. res = to_numeric(s, errors='coerce')
  118. expected = pd.Series([1., 0., np.nan])
  119. tm.assert_series_equal(res, expected)
  120. def test_list(self):
  121. s = ['1', '-3.14', '7']
  122. res = to_numeric(s)
  123. expected = np.array([1, -3.14, 7])
  124. tm.assert_numpy_array_equal(res, expected)
  125. def test_list_numeric(self):
  126. s = [1, 3, 4, 5]
  127. res = to_numeric(s)
  128. tm.assert_numpy_array_equal(res, np.array(s, dtype=np.int64))
  129. s = [1., 3., 4., 5.]
  130. res = to_numeric(s)
  131. tm.assert_numpy_array_equal(res, np.array(s))
  132. # bool is regarded as numeric
  133. s = [True, False, True, True]
  134. res = to_numeric(s)
  135. tm.assert_numpy_array_equal(res, np.array(s))
  136. def test_numeric(self):
  137. s = pd.Series([1, -3.14, 7], dtype='O')
  138. res = to_numeric(s)
  139. expected = pd.Series([1, -3.14, 7])
  140. tm.assert_series_equal(res, expected)
  141. s = pd.Series([1, -3.14, 7])
  142. res = to_numeric(s)
  143. tm.assert_series_equal(res, expected)
  144. def test_all_nan(self):
  145. s = pd.Series(['a', 'b', 'c'])
  146. res = to_numeric(s, errors='coerce')
  147. expected = pd.Series([np.nan, np.nan, np.nan])
  148. tm.assert_series_equal(res, expected)
  149. def test_type_check(self):
  150. # GH 11776
  151. df = pd.DataFrame({'a': [1, -3.14, 7], 'b': ['4', '5', '6']})
  152. with tm.assertRaisesRegexp(TypeError, "1-d array"):
  153. to_numeric(df)
  154. for errors in ['ignore', 'raise', 'coerce']:
  155. with tm.assertRaisesRegexp(TypeError, "1-d array"):
  156. to_numeric(df, errors=errors)
  157. def test_scalar(self):
  158. self.assertEqual(pd.to_numeric(1), 1)
  159. self.assertEqual(pd.to_numeric(1.1), 1.1)
  160. self.assertEqual(pd.to_numeric('1'), 1)
  161. self.assertEqual(pd.to_numeric('1.1'), 1.1)
  162. with tm.assertRaises(ValueError):
  163. to_numeric('XX', errors='raise')
  164. self.assertEqual(to_numeric('XX', errors='ignore'), 'XX')
  165. self.assertTrue(np.isnan(to_numeric('XX', errors='coerce')))
  166. def test_numeric_dtypes(self):
  167. idx = pd.Index([1, 2, 3], name='xxx')
  168. res = pd.to_numeric(idx)
  169. tm.assert_index_equal(res, idx)
  170. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  171. tm.assert_series_equal(res, pd.Series(idx, name='xxx'))
  172. res = pd.to_numeric(idx.values)
  173. tm.assert_numpy_array_equal(res, idx.values)
  174. idx = pd.Index([1., np.nan, 3., np.nan], name='xxx')
  175. res = pd.to_numeric(idx)
  176. tm.assert_index_equal(res, idx)
  177. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  178. tm.assert_series_equal(res, pd.Series(idx, name='xxx'))
  179. res = pd.to_numeric(idx.values)
  180. tm.assert_numpy_array_equal(res, idx.values)
  181. def test_str(self):
  182. idx = pd.Index(['1', '2', '3'], name='xxx')
  183. exp = np.array([1, 2, 3], dtype='int64')
  184. res = pd.to_numeric(idx)
  185. tm.assert_index_equal(res, pd.Index(exp, name='xxx'))
  186. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  187. tm.assert_series_equal(res, pd.Series(exp, name='xxx'))
  188. res = pd.to_numeric(idx.values)
  189. tm.assert_numpy_array_equal(res, exp)
  190. idx = pd.Index(['1.5', '2.7', '3.4'], name='xxx')
  191. exp = np.array([1.5, 2.7, 3.4])
  192. res = pd.to_numeric(idx)
  193. tm.assert_index_equal(res, pd.Index(exp, name='xxx'))
  194. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  195. tm.assert_series_equal(res, pd.Series(exp, name='xxx'))
  196. res = pd.to_numeric(idx.values)
  197. tm.assert_numpy_array_equal(res, exp)
  198. def test_datetimelike(self):
  199. for tz in [None, 'US/Eastern', 'Asia/Tokyo']:
  200. idx = pd.date_range('20130101', periods=3, tz=tz, name='xxx')
  201. res = pd.to_numeric(idx)
  202. tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
  203. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  204. tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
  205. res = pd.to_numeric(idx.values)
  206. tm.assert_numpy_array_equal(res, idx.asi8)
  207. def test_timedelta(self):
  208. idx = pd.timedelta_range('1 days', periods=3, freq='D', name='xxx')
  209. res = pd.to_numeric(idx)
  210. tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
  211. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  212. tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
  213. res = pd.to_numeric(idx.values)
  214. tm.assert_numpy_array_equal(res, idx.asi8)
  215. def test_period(self):
  216. idx = pd.period_range('2011-01', periods=3, freq='M', name='xxx')
  217. res = pd.to_numeric(idx)
  218. tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
  219. # ToDo: enable when we can support native PeriodDtype
  220. # res = pd.to_numeric(pd.Series(idx, name='xxx'))
  221. # tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
  222. def test_non_hashable(self):
  223. # Test for Bug #13324
  224. s = pd.Series([[10.0, 2], 1.0, 'apple'])
  225. res = pd.to_numeric(s, errors='coerce')
  226. tm.assert_series_equal(res, pd.Series([np.nan, 1.0, np.nan]))
  227. res = pd.to_numeric(s, errors='ignore')
  228. tm.assert_series_equal(res, pd.Series([[10.0, 2], 1.0, 'apple']))
  229. with self.assertRaisesRegexp(TypeError, "Invalid object type"):
  230. pd.to_numeric(s)
  231. def test_downcast(self):
  232. # see gh-13352
  233. mixed_data = ['1', 2, 3]
  234. int_data = [1, 2, 3]
  235. date_data = np.array(['1970-01-02', '1970-01-03',
  236. '1970-01-04'], dtype='datetime64[D]')
  237. invalid_downcast = 'unsigned-integer'
  238. msg = 'invalid downcasting method provided'
  239. smallest_int_dtype = np.dtype(np.typecodes['Integer'][0])
  240. smallest_uint_dtype = np.dtype(np.typecodes['UnsignedInteger'][0])
  241. # support below np.float32 is rare and far between
  242. float_32_char = np.dtype(np.float32).char
  243. smallest_float_dtype = float_32_char
  244. for data in (mixed_data, int_data, date_data):
  245. with self.assertRaisesRegexp(ValueError, msg):
  246. pd.to_numeric(data, downcast=invalid_downcast)
  247. expected = np.array([1, 2, 3], dtype=np.int64)
  248. res = pd.to_numeric(data)
  249. tm.assert_numpy_array_equal(res, expected)
  250. res = pd.to_numeric(data, downcast=None)
  251. tm.assert_numpy_array_equal(res, expected)
  252. expected = np.array([1, 2, 3], dtype=smallest_int_dtype)
  253. for signed_downcast in ('integer', 'signed'):
  254. res = pd.to_numeric(data, downcast=signed_downcast)
  255. tm.assert_numpy_array_equal(res, expected)
  256. expected = np.array([1, 2, 3], dtype=smallest_uint_dtype)
  257. res = pd.to_numeric(data, downcast='unsigned')
  258. tm.assert_numpy_array_equal(res, expected)
  259. expected = np.array([1, 2, 3], dtype=smallest_float_dtype)
  260. res = pd.to_numeric(data, downcast='float')
  261. tm.assert_numpy_array_equal(res, expected)
  262. # if we can't successfully cast the given
  263. # data to a numeric dtype, do not bother
  264. # with the downcast parameter
  265. data = ['foo', 2, 3]
  266. expected = np.array(data, dtype=object)
  267. res = pd.to_numeric(data, errors='ignore',
  268. downcast='unsigned')
  269. tm.assert_numpy_array_equal(res, expected)
  270. # cannot cast to an unsigned integer because
  271. # we have a negative number
  272. data = ['-1', 2, 3]
  273. expected = np.array([-1, 2, 3], dtype=np.int64)
  274. res = pd.to_numeric(data, downcast='unsigned')
  275. tm.assert_numpy_array_equal(res, expected)
  276. # cannot cast to an integer (signed or unsigned)
  277. # because we have a float number
  278. data = ['1.1', 2, 3]
  279. expected = np.array([1.1, 2, 3], dtype=np.float64)
  280. for downcast in ('integer', 'signed', 'unsigned'):
  281. res = pd.to_numeric(data, downcast=downcast)
  282. tm.assert_numpy_array_equal(res, expected)
  283. # the smallest integer dtype need not be np.(u)int8
  284. data = ['256', 257, 258]
  285. for downcast, expected_dtype in zip(
  286. ['integer', 'signed', 'unsigned'],
  287. [np.int16, np.int16, np.uint16]):
  288. expected = np.array([256, 257, 258], dtype=expected_dtype)
  289. res = pd.to_numeric(data, downcast=downcast)
  290. tm.assert_numpy_array_equal(res, expected)
  291. if __name__ == '__main__':
  292. nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
  293. exit=False)