/src/common/lookup_table.cpp

https://github.com/xiaochuanle/MECAT · C++ · 160 lines · 148 code · 12 blank · 0 comment · 27 complexity · 2eb03bd2d8e2500a4125cf00e1ab0f4a MD5 · raw file

  1. #include "lookup_table.h"
  2. #include "packed_db.h"
  3. #include <cstdio>
  4. ref_index*
  5. destroy_ref_index(ref_index* ridx)
  6. {
  7. safe_free(ridx->kmer_counts);
  8. safe_free(ridx->kmer_starts);
  9. safe_free(ridx->kmer_offsets);
  10. safe_free(ridx);
  11. return NULL;
  12. }
  13. typedef struct
  14. {
  15. ref_index* ridx;
  16. uint32_t min_key;
  17. uint32_t max_key;
  18. volume_t* v;
  19. int kmer_size;
  20. } ref_index_thread_info;
  21. void*
  22. fill_ref_index_offsets_func(void* arg)
  23. {
  24. ref_index_thread_info* riti = (ref_index_thread_info*)(arg);
  25. volume_t* v = riti->v;
  26. int num_reads = v->num_reads;
  27. ref_index* index = riti->ridx;
  28. int kmer_size = riti->kmer_size;
  29. uint32_t index_count = 1 << (kmer_size * 2);
  30. uint32_t leftnum = 34 - 2 * kmer_size;
  31. int i, j;
  32. for (i = 0; i < num_reads; ++i)
  33. {
  34. int read_start = v->offset_list->offset_list[i].offset;
  35. int read_size = v->offset_list->offset_list[i].size;
  36. uint32_t eit = 0;
  37. for (j = 0; j < read_size; ++j)
  38. {
  39. int k = read_start + j;
  40. uint8_t c = PackedDB::get_char(v->data, k);
  41. eit = (eit << 2) | c;
  42. assert(eit < index_count);
  43. if (j >= kmer_size - 1)
  44. {
  45. if (index->kmer_starts[eit] && eit >= riti->min_key && eit <= riti->max_key)
  46. {
  47. index->kmer_starts[eit][index->kmer_counts[eit]] = k + 1 - kmer_size;
  48. ++index->kmer_counts[eit];
  49. }
  50. eit <<= leftnum;
  51. eit >>= leftnum;
  52. }
  53. }
  54. }
  55. return NULL;
  56. }
  57. ref_index*
  58. create_ref_index(volume_t* v, int kmer_size, int num_threads)
  59. {
  60. DynamicTimer dtimer(__func__);
  61. uint32_t index_count = 1 << (kmer_size * 2);
  62. uint32_t leftnum = 34 - 2 * kmer_size;
  63. ref_index* index = (ref_index*)malloc(sizeof(ref_index));
  64. safe_calloc(index->kmer_counts, int, index_count);
  65. int num_reads = v->num_reads;
  66. for (uint32_t i = 0; i != index_count; ++i) assert(index->kmer_counts[i] == 0);
  67. for (int i = 0; i != num_reads; ++i)
  68. {
  69. int read_start = v->offset_list->offset_list[i].offset;
  70. int read_size = v->offset_list->offset_list[i].size;
  71. uint32_t eit = 0;
  72. for (int j = 0; j < read_size; ++j)
  73. {
  74. int k = read_start + j;
  75. uint8_t c = PackedDB::get_char(v->data, k);
  76. assert(c>= 0 && c < 4);
  77. eit = (eit << 2) | c;
  78. if (j >= kmer_size - 1)
  79. {
  80. assert(eit < index_count);
  81. ++index->kmer_counts[eit];
  82. eit = eit << leftnum;
  83. eit = eit >> leftnum;
  84. }
  85. }
  86. }
  87. int num_kmers = 0;
  88. for (uint32_t i = 0; i != index_count; ++i)
  89. {
  90. if (index->kmer_counts[i] > 128) index->kmer_counts[i] = 0;
  91. num_kmers += index->kmer_counts[i];
  92. }
  93. printf("number of kmers: %d\n", num_kmers);
  94. safe_malloc(index->kmer_offsets, int, num_kmers);
  95. safe_malloc(index->kmer_starts, int*, index_count);
  96. if (v->curr < 10 * 1000000) num_threads = 1;
  97. int kmers_per_thread = (num_kmers + num_threads - 1) / num_threads;
  98. fprintf(stderr, "%d threads are used for filling offset lists.\n", num_threads);
  99. uint32_t hash_boundaries[2 * num_threads];
  100. uint32_t L = 0;
  101. num_kmers = 0;
  102. int kmer_cnt = 0;
  103. int tid = 0;
  104. for (uint32_t i = 0; i != index_count; ++i)
  105. {
  106. if (index->kmer_counts[i])
  107. {
  108. index->kmer_starts[i] = index->kmer_offsets + num_kmers;
  109. num_kmers += index->kmer_counts[i];
  110. kmer_cnt += index->kmer_counts[i];
  111. index->kmer_counts[i] = 0;
  112. if (kmer_cnt >= kmers_per_thread)
  113. {
  114. printf("thread %d: %d\t%d\n", tid, L, i);
  115. hash_boundaries[2 * tid] = L;
  116. hash_boundaries[2 * tid + 1] = i;
  117. ++tid;
  118. L = i + 1;
  119. kmer_cnt = 0;
  120. }
  121. }
  122. else
  123. {
  124. index->kmer_starts[i] = NULL;
  125. }
  126. }
  127. if (kmer_cnt)
  128. {
  129. printf("thread %d: %d\t%d\n", tid, L, index_count - 1);
  130. hash_boundaries[2 * tid] = L;
  131. hash_boundaries[2 * tid + 1] = index_count - 1;
  132. }
  133. ref_index_thread_info ritis[num_threads];
  134. for (int i = 0; i != num_threads; ++i)
  135. {
  136. ritis[i].ridx = index;
  137. ritis[i].min_key = hash_boundaries[2 * i];
  138. ritis[i].max_key = hash_boundaries[2 * i + 1];
  139. ritis[i].v = v;
  140. ritis[i].kmer_size = kmer_size;
  141. }
  142. pthread_t tids[num_threads];
  143. for (int j = 0; j < num_threads; ++j)
  144. pthread_create(tids + j, NULL, fill_ref_index_offsets_func, (void*)(ritis + j));
  145. for (int j = 0; j < num_threads; ++j)
  146. pthread_join(tids[j], NULL);
  147. return index;
  148. }