PageRenderTime 43ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/pandas/_libs/join.pyx

https://github.com/neurodebian/pandas
Cython | 267 lines | 188 code | 62 blank | 17 comment | 49 complexity | 3921bcce405e4ed340b61f7565bb63c6 MD5 | raw file
  1. # cython: profile=False
  2. cimport numpy as np
  3. import numpy as np
  4. cimport cython
  5. from cython cimport Py_ssize_t
  6. np.import_array()
  7. from numpy cimport (ndarray,
  8. int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
  9. uint32_t, uint64_t, float16_t, float32_t, float64_t)
  10. cdef double NaN = <double> np.NaN
  11. cdef double nan = NaN
  12. from pandas._libs.algos import groupsort_indexer, ensure_platform_int
  13. from pandas.core.algorithms import take_nd
  14. include "join_func_helper.pxi"
  15. def inner_join(ndarray[int64_t] left, ndarray[int64_t] right,
  16. Py_ssize_t max_groups):
  17. cdef:
  18. Py_ssize_t i, j, k, count = 0
  19. ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
  20. ndarray[int64_t] left_indexer, right_indexer
  21. int64_t lc, rc
  22. # NA group in location 0
  23. left_sorter, left_count = groupsort_indexer(left, max_groups)
  24. right_sorter, right_count = groupsort_indexer(right, max_groups)
  25. # First pass, determine size of result set, do not use the NA group
  26. for i in range(1, max_groups + 1):
  27. lc = left_count[i]
  28. rc = right_count[i]
  29. if rc > 0 and lc > 0:
  30. count += lc * rc
  31. # group 0 is the NA group
  32. cdef:
  33. Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
  34. Py_ssize_t offset
  35. # exclude the NA group
  36. left_pos = left_count[0]
  37. right_pos = right_count[0]
  38. left_indexer = np.empty(count, dtype=np.int64)
  39. right_indexer = np.empty(count, dtype=np.int64)
  40. for i in range(1, max_groups + 1):
  41. lc = left_count[i]
  42. rc = right_count[i]
  43. if rc > 0 and lc > 0:
  44. for j in range(lc):
  45. offset = position + j * rc
  46. for k in range(rc):
  47. left_indexer[offset + k] = left_pos + j
  48. right_indexer[offset + k] = right_pos + k
  49. position += lc * rc
  50. left_pos += lc
  51. right_pos += rc
  52. return (_get_result_indexer(left_sorter, left_indexer),
  53. _get_result_indexer(right_sorter, right_indexer))
  54. def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
  55. Py_ssize_t max_groups, sort=True):
  56. cdef:
  57. Py_ssize_t i, j, k, count = 0
  58. ndarray[int64_t] left_count, right_count
  59. ndarray left_sorter, right_sorter, rev
  60. ndarray[int64_t] left_indexer, right_indexer
  61. int64_t lc, rc
  62. # NA group in location 0
  63. left_sorter, left_count = groupsort_indexer(left, max_groups)
  64. right_sorter, right_count = groupsort_indexer(right, max_groups)
  65. # First pass, determine size of result set, do not use the NA group
  66. for i in range(1, max_groups + 1):
  67. if right_count[i] > 0:
  68. count += left_count[i] * right_count[i]
  69. else:
  70. count += left_count[i]
  71. # group 0 is the NA group
  72. cdef:
  73. Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0
  74. Py_ssize_t offset
  75. # exclude the NA group
  76. left_pos = left_count[0]
  77. right_pos = right_count[0]
  78. left_indexer = np.empty(count, dtype=np.int64)
  79. right_indexer = np.empty(count, dtype=np.int64)
  80. for i in range(1, max_groups + 1):
  81. lc = left_count[i]
  82. rc = right_count[i]
  83. if rc == 0:
  84. for j in range(lc):
  85. left_indexer[position + j] = left_pos + j
  86. right_indexer[position + j] = -1
  87. position += lc
  88. else:
  89. for j in range(lc):
  90. offset = position + j * rc
  91. for k in range(rc):
  92. left_indexer[offset + k] = left_pos + j
  93. right_indexer[offset + k] = right_pos + k
  94. position += lc * rc
  95. left_pos += lc
  96. right_pos += rc
  97. left_indexer = _get_result_indexer(left_sorter, left_indexer)
  98. right_indexer = _get_result_indexer(right_sorter, right_indexer)
  99. if not sort: # if not asked to sort, revert to original order
  100. if len(left) == len(left_indexer):
  101. # no multiple matches for any row on the left
  102. # this is a short-cut to avoid groupsort_indexer
  103. # otherwise, the `else` path also works in this case
  104. left_sorter = ensure_platform_int(left_sorter)
  105. rev = np.empty(len(left), dtype=np.intp)
  106. rev.put(left_sorter, np.arange(len(left)))
  107. else:
  108. rev, _ = groupsort_indexer(left_indexer, len(left))
  109. rev = ensure_platform_int(rev)
  110. right_indexer = right_indexer.take(rev)
  111. left_indexer = left_indexer.take(rev)
  112. return left_indexer, right_indexer
  113. def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
  114. Py_ssize_t max_groups):
  115. cdef:
  116. Py_ssize_t i, j, k, count = 0
  117. ndarray[int64_t] left_count, right_count, left_sorter, right_sorter
  118. ndarray[int64_t] left_indexer, right_indexer
  119. int64_t lc, rc
  120. # NA group in location 0
  121. left_sorter, left_count = groupsort_indexer(left, max_groups)
  122. right_sorter, right_count = groupsort_indexer(right, max_groups)
  123. # First pass, determine size of result set, do not use the NA group
  124. for i in range(1, max_groups + 1):
  125. lc = left_count[i]
  126. rc = right_count[i]
  127. if rc > 0 and lc > 0:
  128. count += lc * rc
  129. else:
  130. count += lc + rc
  131. # group 0 is the NA group
  132. cdef:
  133. int64_t left_pos = 0, right_pos = 0
  134. Py_ssize_t offset, position = 0
  135. # exclude the NA group
  136. left_pos = left_count[0]
  137. right_pos = right_count[0]
  138. left_indexer = np.empty(count, dtype=np.int64)
  139. right_indexer = np.empty(count, dtype=np.int64)
  140. for i in range(1, max_groups + 1):
  141. lc = left_count[i]
  142. rc = right_count[i]
  143. if rc == 0:
  144. for j in range(lc):
  145. left_indexer[position + j] = left_pos + j
  146. right_indexer[position + j] = -1
  147. position += lc
  148. elif lc == 0:
  149. for j in range(rc):
  150. left_indexer[position + j] = -1
  151. right_indexer[position + j] = right_pos + j
  152. position += rc
  153. else:
  154. for j in range(lc):
  155. offset = position + j * rc
  156. for k in range(rc):
  157. left_indexer[offset + k] = left_pos + j
  158. right_indexer[offset + k] = right_pos + k
  159. position += lc * rc
  160. left_pos += lc
  161. right_pos += rc
  162. return (_get_result_indexer(left_sorter, left_indexer),
  163. _get_result_indexer(right_sorter, right_indexer))
  164. def _get_result_indexer(sorter, indexer):
  165. if len(sorter) > 0:
  166. res = take_nd(sorter, indexer, fill_value=-1)
  167. else:
  168. # length-0 case
  169. res = np.empty(len(indexer), dtype=np.int64)
  170. res.fill(-1)
  171. return res
  172. def ffill_indexer(ndarray[int64_t] indexer):
  173. cdef:
  174. Py_ssize_t i, n = len(indexer)
  175. ndarray[int64_t] result
  176. int64_t val, last_obs
  177. result = np.empty(n, dtype=np.int64)
  178. last_obs = -1
  179. for i in range(n):
  180. val = indexer[i]
  181. if val == -1:
  182. result[i] = last_obs
  183. else:
  184. result[i] = val
  185. last_obs = val
  186. return result
  187. def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids,
  188. int64_t max_group):
  189. cdef:
  190. Py_ssize_t i, n = len(indexer)
  191. ndarray[int64_t] result, last_obs
  192. int64_t gid, val
  193. result = np.empty(n, dtype=np.int64)
  194. last_obs = np.empty(max_group, dtype=np.int64)
  195. last_obs.fill(-1)
  196. for i in range(n):
  197. gid = group_ids[i]
  198. val = indexer[i]
  199. if val == -1:
  200. result[i] = last_obs[gid]
  201. else:
  202. result[i] = val
  203. last_obs[gid] = val
  204. return result
  205. include "join_helper.pxi"