PageRenderTime 54ms CodeModel.GetById 26ms RepoModel.GetById 1ms app.codeStats 0ms

/src/hashsig.c

https://github.com/rowanj/libgit2
C | 347 lines | 258 code | 73 blank | 16 comment | 54 complexity | 0ed38a3a2c59d50a3c73afa597de54eb MD5 | raw file
  1. /*
  2. * Copyright (C) the libgit2 contributors. All rights reserved.
  3. *
  4. * This file is part of libgit2, distributed under the GNU GPL v2 with
  5. * a Linking Exception. For full terms see the included COPYING file.
  6. */
  7. #include "hashsig.h"
  8. #include "fileops.h"
  9. #include "util.h"
  /* One stored hash value: the running state truncated to 32 bits. */
  typedef uint32_t hashsig_t;

  /* Accumulator used while hashing a single run of input bytes. */
  typedef uint64_t hashsig_state;

  /* Similarity scores are scaled so that a full match equals this value. */
  #define HASHSIG_SCALE 100

  /* Maximum number of bytes mixed into one hash before the run is cut. */
  #define HASHSIG_MAX_RUN 80

  /* Seed the accumulator is reset to at the start of every run. */
  #define HASHSIG_HASH_START 0x012345678ABCDEF0LL

  /* Mix one byte into the state: S = (S << SHIFT) - S + CH, i.e. S * 31 + CH
   * in wrapping 64-bit arithmetic. S is evaluated more than once. */
  #define HASHSIG_HASH_SHIFT 5
  #define HASHSIG_HASH_MIX(S,CH) \
      ((S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH))

  /* Capacity of each hash heap (127 entries). */
  #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)

  /* Fewest hashes a signature must hold to be considered usable. */
  #define HASHSIG_HEAP_MIN_SIZE 4
  20. typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
  21. typedef struct {
  22. int size, asize;
  23. hashsig_cmp cmp;
  24. hashsig_t values[HASHSIG_HEAP_SIZE];
  25. } hashsig_heap;
  26. struct git_hashsig {
  27. hashsig_heap mins;
  28. hashsig_heap maxs;
  29. git_hashsig_option_t opt;
  30. int considered;
  31. };
  /* Index arithmetic for a binary heap stored in a 0-based array. */
  #define HEAP_LCHILD_OF(I) (((I) * 2) + 1)
  #define HEAP_RCHILD_OF(I) (((I) * 2) + 2)

  /* Division is used instead of '>> 1' so that HEAP_PARENT_OF(0) is well
   * defined (it yields 0); right-shifting the negative intermediate value
   * would be implementation-defined. Results are unchanged for I >= 1. */
  #define HEAP_PARENT_OF(I) (((I) - 1) / 2)
  35. static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
  36. {
  37. h->size = 0;
  38. h->asize = HASHSIG_HEAP_SIZE;
  39. h->cmp = cmp;
  40. }
  41. static int hashsig_cmp_max(const void *a, const void *b, void *payload)
  42. {
  43. hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
  44. GIT_UNUSED(payload);
  45. return (av < bv) ? -1 : (av > bv) ? 1 : 0;
  46. }
  47. static int hashsig_cmp_min(const void *a, const void *b, void *payload)
  48. {
  49. hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
  50. GIT_UNUSED(payload);
  51. return (av > bv) ? -1 : (av < bv) ? 1 : 0;
  52. }
  53. static void hashsig_heap_up(hashsig_heap *h, int el)
  54. {
  55. int parent_el = HEAP_PARENT_OF(el);
  56. while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
  57. hashsig_t t = h->values[el];
  58. h->values[el] = h->values[parent_el];
  59. h->values[parent_el] = t;
  60. el = parent_el;
  61. parent_el = HEAP_PARENT_OF(el);
  62. }
  63. }
  64. static void hashsig_heap_down(hashsig_heap *h, int el)
  65. {
  66. hashsig_t v, lv, rv;
  67. /* 'el < h->size / 2' tests if el is bottom row of heap */
  68. while (el < h->size / 2) {
  69. int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
  70. v = h->values[el];
  71. lv = h->values[lel];
  72. rv = h->values[rel];
  73. if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
  74. break;
  75. swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
  76. h->values[el] = h->values[swapel];
  77. h->values[swapel] = v;
  78. el = swapel;
  79. }
  80. }
  81. static void hashsig_heap_sort(hashsig_heap *h)
  82. {
  83. /* only need to do this at the end for signature comparison */
  84. git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
  85. }
  86. static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
  87. {
  88. /* if heap is not full, insert new element */
  89. if (h->size < h->asize) {
  90. h->values[h->size++] = val;
  91. hashsig_heap_up(h, h->size - 1);
  92. }
  93. /* if heap is full, pop top if new element should replace it */
  94. else if (h->cmp(&val, &h->values[0], NULL) > 0) {
  95. h->size--;
  96. h->values[0] = h->values[h->size];
  97. hashsig_heap_down(h, 0);
  98. }
  99. }
  /* Scanner state carried between successive hashsig_add_hashes() calls. */
  typedef struct {
      int use_ignores;         /* non-zero: currently skipping whitespace */
      uint8_t ignore_ch[256];  /* per-byte flag table filled at init time */
  } hashsig_in_progress;
  104. static void hashsig_in_progress_init(
  105. hashsig_in_progress *prog, git_hashsig *sig)
  106. {
  107. int i;
  108. switch (sig->opt) {
  109. case GIT_HASHSIG_IGNORE_WHITESPACE:
  110. for (i = 0; i < 256; ++i)
  111. prog->ignore_ch[i] = git__isspace_nonlf(i);
  112. prog->use_ignores = 1;
  113. break;
  114. case GIT_HASHSIG_SMART_WHITESPACE:
  115. for (i = 0; i < 256; ++i)
  116. prog->ignore_ch[i] = git__isspace(i);
  117. prog->use_ignores = 1;
  118. break;
  119. default:
  120. memset(prog, 0, sizeof(*prog));
  121. break;
  122. }
  123. }
  124. #define HASHSIG_IN_PROGRESS_INIT { 1 }
  125. static int hashsig_add_hashes(
  126. git_hashsig *sig,
  127. const uint8_t *data,
  128. size_t size,
  129. hashsig_in_progress *prog)
  130. {
  131. const uint8_t *scan = data, *end = data + size;
  132. hashsig_state state = HASHSIG_HASH_START;
  133. int use_ignores = prog->use_ignores, len;
  134. uint8_t ch;
  135. while (scan < end) {
  136. state = HASHSIG_HASH_START;
  137. for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
  138. ch = *scan;
  139. if (use_ignores)
  140. for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
  141. ++scan;
  142. else if (sig->opt != GIT_HASHSIG_NORMAL)
  143. for (; scan < end && ch == '\r'; ch = *scan)
  144. ++scan;
  145. /* peek at next character to decide what to do next */
  146. if (sig->opt == GIT_HASHSIG_SMART_WHITESPACE)
  147. use_ignores = (ch == '\n');
  148. if (scan >= end)
  149. break;
  150. ++scan;
  151. /* check run terminator */
  152. if (ch == '\n' || ch == '\0')
  153. break;
  154. ++len;
  155. HASHSIG_HASH_MIX(state, ch);
  156. }
  157. if (len > 0) {
  158. hashsig_heap_insert(&sig->mins, (hashsig_t)state);
  159. hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
  160. sig->considered++;
  161. while (scan < end && (*scan == '\n' || !*scan))
  162. ++scan;
  163. }
  164. }
  165. prog->use_ignores = use_ignores;
  166. return 0;
  167. }
  168. static int hashsig_finalize_hashes(git_hashsig *sig)
  169. {
  170. if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE) {
  171. giterr_set(GITERR_INVALID,
  172. "File too small for similarity signature calculation");
  173. return GIT_EBUFS;
  174. }
  175. hashsig_heap_sort(&sig->mins);
  176. hashsig_heap_sort(&sig->maxs);
  177. return 0;
  178. }
  179. static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
  180. {
  181. git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
  182. if (!sig)
  183. return NULL;
  184. hashsig_heap_init(&sig->mins, hashsig_cmp_min);
  185. hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
  186. sig->opt = opts;
  187. return sig;
  188. }
  189. int git_hashsig_create(
  190. git_hashsig **out,
  191. const char *buf,
  192. size_t buflen,
  193. git_hashsig_option_t opts)
  194. {
  195. int error;
  196. hashsig_in_progress prog;
  197. git_hashsig *sig = hashsig_alloc(opts);
  198. GITERR_CHECK_ALLOC(sig);
  199. hashsig_in_progress_init(&prog, sig);
  200. error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog);
  201. if (!error)
  202. error = hashsig_finalize_hashes(sig);
  203. if (!error)
  204. *out = sig;
  205. else
  206. git_hashsig_free(sig);
  207. return error;
  208. }
  209. int git_hashsig_create_fromfile(
  210. git_hashsig **out,
  211. const char *path,
  212. git_hashsig_option_t opts)
  213. {
  214. uint8_t buf[0x1000];
  215. ssize_t buflen = 0;
  216. int error = 0, fd;
  217. hashsig_in_progress prog;
  218. git_hashsig *sig = hashsig_alloc(opts);
  219. GITERR_CHECK_ALLOC(sig);
  220. if ((fd = git_futils_open_ro(path)) < 0) {
  221. git__free(sig);
  222. return fd;
  223. }
  224. hashsig_in_progress_init(&prog, sig);
  225. while (!error) {
  226. if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
  227. if ((error = (int)buflen) < 0)
  228. giterr_set(GITERR_OS,
  229. "Read error on '%s' calculating similarity hashes", path);
  230. break;
  231. }
  232. error = hashsig_add_hashes(sig, buf, buflen, &prog);
  233. }
  234. p_close(fd);
  235. if (!error)
  236. error = hashsig_finalize_hashes(sig);
  237. if (!error)
  238. *out = sig;
  239. else
  240. git_hashsig_free(sig);
  241. return error;
  242. }
  243. void git_hashsig_free(git_hashsig *sig)
  244. {
  245. git__free(sig);
  246. }
  247. static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
  248. {
  249. int matches = 0, i, j, cmp;
  250. assert(a->cmp == b->cmp);
  251. /* hash heaps are sorted - just look for overlap vs total */
  252. for (i = 0, j = 0; i < a->size && j < b->size; ) {
  253. cmp = a->cmp(&a->values[i], &b->values[j], NULL);
  254. if (cmp < 0)
  255. ++i;
  256. else if (cmp > 0)
  257. ++j;
  258. else {
  259. ++i; ++j; ++matches;
  260. }
  261. }
  262. return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
  263. }
  264. int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
  265. {
  266. /* if we have fewer than the maximum number of elements, then just use
  267. * one array since the two arrays will be the same
  268. */
  269. if (a->mins.size < HASHSIG_HEAP_SIZE)
  270. return hashsig_heap_compare(&a->mins, &b->mins);
  271. else
  272. return (hashsig_heap_compare(&a->mins, &b->mins) +
  273. hashsig_heap_compare(&a->maxs, &b->maxs)) / 2;
  274. }