/petl/test/transform/test_sorts.py

http://github.com/alimanfoo/petl · Python · 525 lines · 404 code · 106 blank · 15 comment · 8 complexity · a5deef1978d912c7773f9c17f424e6f5 MD5 · raw file

  1. from __future__ import absolute_import, print_function, division
  2. import os
  3. import gc
  4. import logging
  5. from datetime import datetime
  6. import sys
  7. from petl.compat import next
  8. from petl.test.helpers import ieq, eq_
  9. from petl.util import nrows
  10. from petl.transform.basics import cat
  11. from petl.transform.sorts import sort, mergesort, issorted
# Module-level logger named after this module; ``debug`` is a shorthand
# alias for ``logger.debug`` used by the tests below.
logger = logging.getLogger(__name__)
debug = logger.debug
  14. def test_sort_1():
  15. table = (('foo', 'bar'),
  16. ('C', '2'),
  17. ('A', '9'),
  18. ('A', '6'),
  19. ('F', '1'),
  20. ('D', '10'))
  21. result = sort(table, 'foo')
  22. expectation = (('foo', 'bar'),
  23. ('A', '9'),
  24. ('A', '6'),
  25. ('C', '2'),
  26. ('D', '10'),
  27. ('F', '1'))
  28. ieq(expectation, result)
  29. def test_sort_2():
  30. table = (('foo', 'bar'),
  31. ('C', '2'),
  32. ('A', '9'),
  33. ('A', '6'),
  34. ('F', '1'),
  35. ('D', '10'))
  36. result = sort(table, key=('foo', 'bar'))
  37. expectation = (('foo', 'bar'),
  38. ('A', '6'),
  39. ('A', '9'),
  40. ('C', '2'),
  41. ('D', '10'),
  42. ('F', '1'))
  43. ieq(expectation, result)
  44. result = sort(table) # default is lexical sort
  45. expectation = (('foo', 'bar'),
  46. ('A', '6'),
  47. ('A', '9'),
  48. ('C', '2'),
  49. ('D', '10'),
  50. ('F', '1'))
  51. ieq(expectation, result)
  52. def test_sort_3():
  53. table = (('foo', 'bar'),
  54. ('C', '2'),
  55. ('A', '9'),
  56. ('A', '6'),
  57. ('F', '1'),
  58. ('D', '10'))
  59. result = sort(table, 'bar')
  60. expectation = (('foo', 'bar'),
  61. ('F', '1'),
  62. ('D', '10'),
  63. ('C', '2'),
  64. ('A', '6'),
  65. ('A', '9'))
  66. ieq(expectation, result)
  67. def test_sort_4():
  68. table = (('foo', 'bar'),
  69. ('C', 2),
  70. ('A', 9),
  71. ('A', 6),
  72. ('F', 1),
  73. ('D', 10))
  74. result = sort(table, 'bar')
  75. expectation = (('foo', 'bar'),
  76. ('F', 1),
  77. ('C', 2),
  78. ('A', 6),
  79. ('A', 9),
  80. ('D', 10))
  81. ieq(expectation, result)
  82. def test_sort_5():
  83. table = (('foo', 'bar'),
  84. (2.3, 2),
  85. (1.2, 9),
  86. (2.3, 6),
  87. (3.2, 1),
  88. (1.2, 10))
  89. expectation = (('foo', 'bar'),
  90. (1.2, 9),
  91. (1.2, 10),
  92. (2.3, 2),
  93. (2.3, 6),
  94. (3.2, 1))
  95. # can use either field names or indices (from 1) to specify sort key
  96. result = sort(table, key=('foo', 'bar'))
  97. ieq(expectation, result)
  98. result = sort(table, key=(0, 1))
  99. ieq(expectation, result)
  100. result = sort(table, key=('foo', 1))
  101. ieq(expectation, result)
  102. result = sort(table, key=(0, 'bar'))
  103. ieq(expectation, result)
  104. def test_sort_6():
  105. table = (('foo', 'bar'),
  106. (2.3, 2),
  107. (1.2, 9),
  108. (2.3, 6),
  109. (3.2, 1),
  110. (1.2, 10))
  111. expectation = (('foo', 'bar'),
  112. (3.2, 1),
  113. (2.3, 6),
  114. (2.3, 2),
  115. (1.2, 10),
  116. (1.2, 9))
  117. result = sort(table, key=('foo', 'bar'), reverse=True)
  118. ieq(expectation, result)
  119. def test_sort_buffered():
  120. table = (('foo', 'bar'),
  121. ('C', 2),
  122. ('A', 9),
  123. ('A', 6),
  124. ('F', 1),
  125. ('D', 10))
  126. # test sort forwards
  127. expectation = (('foo', 'bar'),
  128. ('F', 1),
  129. ('C', 2),
  130. ('A', 6),
  131. ('A', 9),
  132. ('D', 10))
  133. result = sort(table, 'bar')
  134. ieq(expectation, result)
  135. result = sort(table, 'bar', buffersize=2)
  136. ieq(expectation, result)
  137. # sort in reverse
  138. expectation = (('foo', 'bar'),
  139. ('D', 10),
  140. ('A', 9),
  141. ('A', 6),
  142. ('C', 2),
  143. ('F', 1))
  144. result = sort(table, 'bar', reverse=True)
  145. ieq(expectation, result)
  146. result = sort(table, 'bar', reverse=True, buffersize=2)
  147. ieq(expectation, result)
  148. # no key
  149. expectation = (('foo', 'bar'),
  150. ('F', 1),
  151. ('D', 10),
  152. ('C', 2),
  153. ('A', 9),
  154. ('A', 6))
  155. result = sort(table, reverse=True)
  156. ieq(expectation, result)
  157. result = sort(table, reverse=True, buffersize=2)
  158. ieq(expectation, result)
  159. def test_sort_buffered_tempdir():
  160. table = (('foo', 'bar'),
  161. ('C', 2),
  162. ('A', 9),
  163. ('A', 6),
  164. ('F', 1),
  165. ('D', 10))
  166. # test sort forwards
  167. expectation = (('foo', 'bar'),
  168. ('F', 1),
  169. ('C', 2),
  170. ('A', 6),
  171. ('A', 9),
  172. ('D', 10))
  173. result = sort(table, 'bar')
  174. ieq(expectation, result)
  175. tempdir = 'tmp'
  176. if not os.path.exists(tempdir):
  177. os.mkdir(tempdir)
  178. result = sort(table, 'bar', buffersize=2, tempdir=tempdir)
  179. ieq(expectation, result)
  180. def test_sort_buffered_independent():
  181. table = (('foo', 'bar'),
  182. ('C', 2),
  183. ('A', 9),
  184. ('A', 6),
  185. ('F', 1),
  186. ('D', 10))
  187. expectation = (('foo', 'bar'),
  188. ('F', 1),
  189. ('C', 2),
  190. ('A', 6),
  191. ('A', 9),
  192. ('D', 10))
  193. result = sort(table, 'bar', buffersize=4)
  194. nrows(result) # cause data to be cached
  195. # check that two row iterators are independent, i.e., consuming rows
  196. # from one does not affect the other
  197. it1 = iter(result)
  198. it2 = iter(result)
  199. eq_(expectation[0], next(it1))
  200. eq_(expectation[1], next(it1))
  201. eq_(expectation[0], next(it2))
  202. eq_(expectation[1], next(it2))
  203. eq_(expectation[2], next(it2))
  204. eq_(expectation[2], next(it1))
  205. def _get_names(l):
  206. return [x.name for x in l]
  207. def test_sort_buffered_cleanup():
  208. table = (('foo', 'bar'),
  209. ('C', 2),
  210. ('A', 9),
  211. ('A', 6),
  212. ('F', 1),
  213. ('D', 10))
  214. result = sort(table, 'bar', buffersize=2)
  215. debug('initially filecache should be empty')
  216. eq_(None, result._filecache)
  217. debug('pull rows through, should populate file cache')
  218. eq_(5, nrows(result))
  219. eq_(3, len(result._filecache))
  220. debug('check all files exist')
  221. filenames = _get_names(result._filecache)
  222. for fn in filenames:
  223. assert os.path.exists(fn), fn
  224. debug('delete object and garbage collect')
  225. del result
  226. gc.collect()
  227. debug('check all files have been deleted')
  228. for fn in filenames:
  229. assert not os.path.exists(fn), fn
  230. import platform
  231. if platform.python_implementation() == 'PyPy':
  232. print('SKIP sort cleanup test (PyPy)', file=sys.stderr)
  233. else:
  234. def test_sort_buffered_cleanup_open_iterator():
  235. table = (('foo', 'bar'),
  236. ('C', 2),
  237. ('A', 9),
  238. ('A', 6),
  239. ('F', 1),
  240. ('D', 10))
  241. # check if cleanup is robust against open iterators
  242. result = sort(table, 'bar', buffersize=2)
  243. debug('pull rows through, should populate file cache')
  244. eq_(5, nrows(result))
  245. eq_(3, len(result._filecache))
  246. debug('check all files exist')
  247. filenames = _get_names(result._filecache)
  248. for fn in filenames:
  249. assert os.path.exists(fn), fn
  250. debug(filenames)
  251. debug('open an iterator')
  252. it = iter(result)
  253. next(it)
  254. next(it)
  255. debug('delete objects and garbage collect')
  256. del result
  257. del it
  258. gc.collect()
  259. for fn in filenames:
  260. assert not os.path.exists(fn), fn
  261. def test_sort_empty():
  262. table = (('foo', 'bar'),)
  263. expect = (('foo', 'bar'),)
  264. actual = sort(table)
  265. ieq(expect, actual)
  266. def test_sort_none():
  267. table = (('foo', 'bar'),
  268. ('C', 2),
  269. ('A', 9),
  270. ('A', None),
  271. ('F', 1),
  272. ('D', 10))
  273. result = sort(table, 'bar')
  274. print(list(result))
  275. expectation = (('foo', 'bar'),
  276. ('A', None),
  277. ('F', 1),
  278. ('C', 2),
  279. ('A', 9),
  280. ('D', 10))
  281. ieq(expectation, result)
  282. dt = datetime.now().replace
  283. table = (('foo', 'bar'),
  284. ('C', dt(hour=5)),
  285. ('A', dt(hour=1)),
  286. ('A', None),
  287. ('F', dt(hour=9)),
  288. ('D', dt(hour=17)))
  289. result = sort(table, 'bar')
  290. expectation = (('foo', 'bar'),
  291. ('A', None),
  292. ('A', dt(hour=1)),
  293. ('C', dt(hour=5)),
  294. ('F', dt(hour=9)),
  295. ('D', dt(hour=17)))
  296. ieq(expectation, result)
  297. # TODO test sort with native comparison
  298. def test_mergesort_1():
  299. table1 = (('foo', 'bar'),
  300. ('A', 6),
  301. ('C', 2),
  302. ('D', 10),
  303. ('A', 9),
  304. ('F', 1))
  305. table2 = (('foo', 'bar'),
  306. ('B', 3),
  307. ('D', 10),
  308. ('A', 10),
  309. ('F', 4))
  310. # should be same as concatenate then sort (but more efficient, esp. when
  311. # presorted)
  312. expect = sort(cat(table1, table2))
  313. actual = mergesort(table1, table2)
  314. ieq(expect, actual)
  315. ieq(expect, actual)
  316. actual = mergesort(sort(table1), sort(table2), presorted=True)
  317. ieq(expect, actual)
  318. ieq(expect, actual)
  319. def test_mergesort_2():
  320. table1 = (('foo', 'bar'),
  321. ('A', 9),
  322. ('C', 2),
  323. ('D', 10),
  324. ('A', 6),
  325. ('F', 1))
  326. table2 = (('foo', 'baz'),
  327. ('B', 3),
  328. ('D', 10),
  329. ('A', 10),
  330. ('F', 4))
  331. # should be same as concatenate then sort (but more efficient, esp. when
  332. # presorted)
  333. expect = sort(cat(table1, table2), key='foo')
  334. actual = mergesort(table1, table2, key='foo')
  335. ieq(expect, actual)
  336. ieq(expect, actual)
  337. actual = mergesort(sort(table1, key='foo'), sort(table2, key='foo'),
  338. key='foo', presorted=True)
  339. ieq(expect, actual)
  340. ieq(expect, actual)
  341. def test_mergesort_3():
  342. table1 = (('foo', 'bar'),
  343. ('A', 9),
  344. ('C', 2),
  345. ('D', 10),
  346. ('A', 6),
  347. ('F', 1))
  348. table2 = (('foo', 'baz'),
  349. ('B', 3),
  350. ('D', 10),
  351. ('A', 10),
  352. ('F', 4))
  353. # should be same as concatenate then sort (but more efficient, esp. when
  354. # presorted)
  355. expect = sort(cat(table1, table2), key='foo', reverse=True)
  356. actual = mergesort(table1, table2, key='foo', reverse=True)
  357. ieq(expect, actual)
  358. ieq(expect, actual)
  359. actual = mergesort(sort(table1, key='foo', reverse=True),
  360. sort(table2, key='foo', reverse=True),
  361. key='foo', reverse=True, presorted=True)
  362. ieq(expect, actual)
  363. ieq(expect, actual)
  364. def test_mergesort_4():
  365. table1 = (('foo', 'bar', 'baz'),
  366. (1, 'A', True),
  367. (2, 'B', None),
  368. (4, 'C', True))
  369. table2 = (('bar', 'baz', 'quux'),
  370. ('A', True, 42.0),
  371. ('B', False, 79.3),
  372. ('C', False, 12.4))
  373. expect = sort(cat(table1, table2), key='bar')
  374. actual = mergesort(table1, table2, key='bar')
  375. ieq(expect, actual)
  376. ieq(expect, actual)
  377. def test_mergesort_empty():
  378. table1 = (('foo', 'bar'),
  379. ('A', 9),
  380. ('C', 2),
  381. ('D', 10),
  382. ('F', 1))
  383. table2 = (('foo', 'bar'),)
  384. expect = table1
  385. actual = mergesort(table1, table2, key='foo')
  386. ieq(expect, actual)
  387. ieq(expect, actual)
  388. def test_issorted():
  389. table1 = (('foo', 'bar', 'baz'),
  390. ('a', 1, True),
  391. ('b', 3, True),
  392. ('b', 2))
  393. assert issorted(table1, key='foo')
  394. assert not issorted(table1, key='foo', reverse=True)
  395. assert not issorted(table1, key='foo', strict=True)
  396. table2 = (('foo', 'bar', 'baz'),
  397. ('b', 2, True),
  398. ('a', 1, True),
  399. ('b', 3))
  400. assert not issorted(table2, key='foo')
  401. table3 = (('foo', 'bar', 'baz'),
  402. ('a', 1, True),
  403. ('b', 2, True),
  404. ('b', 3))
  405. assert issorted(table3, key=('foo', 'bar'))
  406. assert issorted(table3)
  407. table4 = (('foo', 'bar', 'baz'),
  408. ('a', 1, True),
  409. ('b', 3, True),
  410. ('b', 2))
  411. assert not issorted(table4, key=('foo', 'bar'))
  412. assert not issorted(table4)
  413. table5 = (('foo', 'bar', 'baz'),
  414. ('b', 3, True),
  415. ('b', 2),
  416. ('a', 1, True))
  417. assert not issorted(table5, key='foo')
  418. assert issorted(table5, key='foo', reverse=True)
  419. assert not issorted(table5, key='foo', reverse=True, strict=True)