/src/grappa/binencode.c

https://code.google.com/p/poy/ · C · 619 lines · 519 code · 92 blank · 8 comment · 94 complexity · 6dfa153e35c3026e053c14279858ed6c MD5 · raw file

  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #ifndef CYGWINNT
  4. #include <string.h>
  5. #endif
  6. #include <math.h>
  7. #include "structs.h"
  8. #include "binencode.h"
  9. FILE *outfile;
  10. extern int NUM_GENOMES;
  11. int
  12. mapToPolarity ( int gene, int num_genes )
  13. {
  14. if ( gene > num_genes )
  15. gene -= ( 2 * num_genes + 1 );
  16. return ( gene );
  17. }
  18. int
  19. mapToLinear ( int gene, int num_genes )
  20. {
  21. if ( gene < 0 )
  22. gene += ( 2 * num_genes + 1 );
  23. return ( gene );
  24. }
  25. void
  26. fillGenomes ( struct genome_struct *g, int num_genomes, int num_genes )
  27. {
  28. int i, j;
  29. int r, tmp;
  30. g[0].genome_num = 1;
  31. sprintf ( g[0].gnamePtr, "Genome %3d", g[0].genome_num );
  32. for ( j = 0; j < num_genes; j++ )
  33. g[0].genes[j] = ( j + 1 );
  34. for ( i = 1; i < num_genomes; i++ )
  35. {
  36. g[i].genome_num = i + 1;
  37. sprintf ( g[i].gnamePtr, "Genome %3d", g[i].genome_num );
  38. for ( j = 0; j < num_genes; j++ )
  39. {
  40. if ( random ( ) <= MAX_RAND / 2 )
  41. g[i].genes[j] = ( j + 1 );
  42. else
  43. g[i].genes[j] = -( j + 1 );
  44. }
  45. for ( j = num_genes - 1; j >= 1; j-- )
  46. {
  47. r = ( int ) floor ( ( double ) ( j - 1 ) *
  48. ( ( double ) random ( ) /
  49. ( double ) MAX_RAND ) );
  50. tmp = g[i].genes[r];
  51. g[i].genes[r] = g[i].genes[j];
  52. g[i].genes[j] = tmp;
  53. }
  54. }
  55. return;
  56. }
  57. void
  58. fillGenomeRand ( int *genes, int num_genes )
  59. {
  60. int j;
  61. int r, tmp;
  62. for ( j = 0; j < num_genes; j++ )
  63. {
  64. if ( random ( ) <= MAX_RAND / 2 )
  65. genes[j] = ( j + 1 );
  66. else
  67. genes[j] = -( j + 1 );
  68. }
  69. for ( j = num_genes - 1; j >= 1; j-- )
  70. {
  71. r = ( int ) floor ( ( double ) ( j - 1 ) *
  72. ( ( double ) random ( ) /
  73. ( double ) MAX_RAND ) );
  74. tmp = genes[r];
  75. genes[r] = genes[j];
  76. genes[j] = tmp;
  77. }
  78. return;
  79. }
  80. void
  81. printGenomes ( struct genome_struct *g, int num_genomes, int num_genes )
  82. {
  83. int i, j;
  84. fprintf ( outfile, "Genomes:\n\n" );
  85. for ( i = 0; i < num_genomes; i++ )
  86. {
  87. fprintf ( outfile, "(%3d) %16s:", g[i].genome_num, g[i].gnamePtr );
  88. for ( j = 0; j < num_genes; j++ )
  89. {
  90. fprintf ( outfile, "%3d ", g[i].genes[j] );
  91. }
  92. fprintf ( outfile, "\n" );
  93. }
  94. fprintf ( outfile, "\n" );
  95. return;
  96. }
  97. void
  98. createGeneAdjMatrix ( int ***adj, int num_genes )
  99. {
  100. int i, j, num;
  101. num = 2 * num_genes + 1;
  102. *adj = ( int ** ) malloc ( num * sizeof ( int * ) );
  103. if ( *adj == ( int ** ) NULL )
  104. fprintf ( stderr, "ERROR: adj NULL\n" );
  105. for ( i = 0; i < num; i++ )
  106. {
  107. ( *adj )[i] = ( int * ) malloc ( num * sizeof ( int ) );
  108. if ( ( *adj )[i] == ( int * ) NULL )
  109. fprintf ( stderr, "ERROR: adj[%d] NULL\n", i );
  110. for ( j = 0; j < num; j++ )
  111. ( *adj )[i][j] = 0;
  112. }
  113. return;
  114. }
  115. void
  116. destroyGeneAdjMatrix ( int **adj, int num_genes )
  117. {
  118. int i, num;
  119. num = 2 * num_genes;
  120. for ( i = num - 1; i >= 0; i-- )
  121. free ( adj[i] );
  122. free ( adj );
  123. return;
  124. }
  125. void
  126. setAdj ( int **adj, int num_genes, int g0, int g1 )
  127. {
  128. int index0, index1;
  129. index0 = mapToLinear ( g0, num_genes );
  130. index1 = mapToLinear ( g1, num_genes );
  131. adj[index0][index1] = 1;
  132. index0 = mapToLinear ( -g1, num_genes );
  133. index1 = mapToLinear ( -g0, num_genes );
  134. adj[index0][index1] = 1;
  135. return;
  136. }
  137. void
  138. fillGeneAdjMatrix ( int **adj, struct genome_struct *mygenomes,
  139. int num_genomes, int num_genes )
  140. {
  141. int i, j;
  142. int g0, g1;
  143. for ( i = 0; i < num_genomes; i++ )
  144. {
  145. for ( j = 0; j < num_genes; j++ )
  146. {
  147. g0 = mygenomes[i].genes[j];
  148. g1 = mygenomes[i].genes[( j + 1 ) % num_genes];
  149. setAdj ( adj, num_genes, g0, g1 );
  150. }
  151. }
  152. return;
  153. }
  154. void
  155. printGeneAdjMatrix ( int **adj, int num_genes )
  156. {
  157. int i, j;
  158. int num;
  159. num = 2 * num_genes;
  160. fprintf ( outfile, " \t" );
  161. for ( j = 1; j <= num; j++ )
  162. {
  163. fprintf ( outfile, "%3d \t", mapToPolarity ( j, num_genes ) );
  164. }
  165. fprintf ( outfile, "\n" );
  166. for ( i = 1; i <= num; i++ )
  167. {
  168. fprintf ( outfile, "%3d: \t", mapToPolarity ( i, num_genes ) );
  169. for ( j = 1; j <= num; j++ )
  170. {
  171. fprintf ( outfile, "%3d \t",
  172. mapToPolarity ( adj[i][j], num_genes ) );
  173. }
  174. fprintf ( outfile, "\n" );
  175. }
  176. fprintf ( outfile, "\n" );
  177. return;
  178. }
  179. int
  180. getAdjNum ( int **adj, int num_genes )
  181. {
  182. int i, j;
  183. int num;
  184. int k;
  185. num = 2 * num_genes;
  186. k = 0;
  187. for ( i = 1; i <= num; i++ )
  188. for ( j = 1; j <= num; j++ )
  189. k += adj[i][j];
  190. k /= 2;
  191. return ( k );
  192. }
  193. int
  194. containsPair ( struct genome_struct *g, int G0, int G1, int num_genes )
  195. {
  196. int found;
  197. int i, p0, p1;
  198. found = FALSE;
  199. i = 0;
  200. while ( ( i < num_genes ) && ( !found ) )
  201. {
  202. p0 = g->genes[i];
  203. p1 = g->genes[( i + 1 ) % num_genes];
  204. found = ( ( p0 == G0 && p1 == G1 ) || ( p0 == -G1 && p1 == -G0 ) );
  205. i++;
  206. }
  207. return ( found );
  208. }
  209. void
  210. encodeAdjGenomes ( int **adj, struct genome_struct *mygenomes,
  211. int num_genomes, int num_genes )
  212. {
  213. int i, j, v, num;
  214. int width;
  215. int *geneVectorG0;
  216. int *geneVectorG1;
  217. width = getAdjNum ( adj, num_genes );
  218. geneVectorG0 = ( int * ) malloc ( width * sizeof ( int ) );
  219. if ( geneVectorG0 == ( int * ) NULL )
  220. fprintf ( stderr, "ERROR: geneVectorG0 NULL\n" );
  221. geneVectorG1 = ( int * ) malloc ( width * sizeof ( int ) );
  222. if ( geneVectorG1 == ( int * ) NULL )
  223. fprintf ( stderr, "ERROR: geneVectorG1 NULL\n" );
  224. v = 0;
  225. for ( i = 1; i < num_genes; i++ )
  226. {
  227. geneVectorG0[v] = i;
  228. geneVectorG1[v] = i + 1;
  229. adj[mapToLinear ( i, num_genes )][mapToLinear ( i + 1, num_genes )] =
  230. 0;
  231. adj[mapToLinear ( -( i + 1 ), num_genes )][mapToLinear
  232. ( -i, num_genes )] = 0;
  233. v++;
  234. }
  235. geneVectorG0[v] = num_genes;
  236. geneVectorG1[v] = 1;
  237. adj[mapToLinear ( num_genes, num_genes )][mapToLinear ( 1, num_genes )] =
  238. 0;
  239. adj[mapToLinear ( -1, num_genes )][mapToLinear ( -num_genes, num_genes )]
  240. = 0;
  241. v++;
  242. num = 2 * num_genes;
  243. for ( i = 1; i <= num; i++ )
  244. for ( j = 1; j <= num; j++ )
  245. if ( adj[i][j] )
  246. {
  247. geneVectorG0[v] = mapToPolarity ( i, num_genes );
  248. geneVectorG1[v] = mapToPolarity ( j, num_genes );
  249. adj[mapToLinear ( i, num_genes )][mapToLinear
  250. ( j, num_genes )] = 0;
  251. adj[mapToLinear ( -j, num_genes )][mapToLinear
  252. ( -i, num_genes )] = 0;
  253. v++;
  254. }
  255. #if DEBUG
  256. fprintf ( outfile, "VECTOR GENES:\n" );
  257. fprintf ( outfile, "width: %d v: %d\n", width, v );
  258. for ( i = 0; i < v; i++ )
  259. fprintf ( outfile, "V[%3d]: ( %4d, %4d)\n", i, geneVectorG0[i],
  260. geneVectorG1[i] );
  261. fprintf ( outfile, "\n" );
  262. #endif
  263. for ( i = 0; i < num_genomes; i++ )
  264. {
  265. mygenomes[i].encoding =
  266. ( char * ) malloc ( ( width + 1 ) * sizeof ( char ) );
  267. if ( mygenomes[i].encoding == ( char * ) NULL )
  268. fprintf ( stderr, "ERROR: encoding NULL\n" );
  269. for ( j = 0; j < width; j++ )
  270. {
  271. if ( containsPair ( mygenomes + i,
  272. geneVectorG0[j], geneVectorG1[j],
  273. num_genes ) )
  274. {
  275. mygenomes[i].encoding[j] = '1';
  276. }
  277. else
  278. {
  279. mygenomes[i].encoding[j] = '0';
  280. }
  281. }
  282. mygenomes[i].encoding[width] = '\0';
  283. }
  284. free ( geneVectorG1 );
  285. free ( geneVectorG0 );
  286. return;
  287. }
  288. void
  289. freeGenomes ( struct genome_struct *mygenomes, int num_genomes )
  290. {
  291. int i;
  292. for ( i = 0; i < num_genomes; i++ )
  293. free ( mygenomes[i].encoding );
  294. free ( mygenomes );
  295. return;
  296. }
  297. void
  298. printGenomeEncodings ( struct genome_struct *g, int num_genomes )
  299. {
  300. int i;
  301. fprintf ( outfile, "GENOME ENCODINGS:\n" );
  302. for ( i = 0; i < num_genomes; i++ )
  303. {
  304. fprintf ( outfile, "%s: %s\n", g[i].gnamePtr, g[i].encoding );
  305. }
  306. fprintf ( outfile, "\n" );
  307. return;
  308. }
  309. void
  310. encodeGenomes ( struct genome_struct *mygenomes,
  311. int num_genomes, int num_genes )
  312. {
  313. int **adj;
  314. createGeneAdjMatrix ( &adj, num_genes );
  315. fillGeneAdjMatrix ( adj, mygenomes, num_genomes, num_genes );
  316. #if DEBUG
  317. printGeneAdjMatrix ( adj, num_genes );
  318. #endif
  319. encodeAdjGenomes ( adj, mygenomes, num_genomes, num_genes );
  320. destroyGeneAdjMatrix ( adj, num_genes );
  321. return;
  322. }
  323. int
  324. hamming_distance_from_encoding ( struct genome_struct *g1,
  325. struct genome_struct *g2 )
  326. {
  327. int i, num;
  328. int width;
  329. num = 0;
  330. width = strlen ( g1->encoding );
  331. if ( width != strlen ( g2->encoding ) )
  332. fprintf ( stderr, "ERROR: hamming_distance_from_encoding\n" );
  333. for ( i = 0; i < width; i++ )
  334. {
  335. num += ( g1->encoding[i] ^ g2->encoding[i] );
  336. }
  337. return ( num );
  338. }
  339. void
  340. findAllHammingDistancesFromEncoding ( struct genome_struct *mygenomes,
  341. int num_genomes, int num_genes )
  342. {
  343. int i, j, dist;
  344. fprintf ( outfile, "HAMMING DISTANCE From ENCODING:\n" );
  345. for ( i = 0; i < num_genomes; i++ )
  346. {
  347. for ( j = i + 1; j < num_genomes; j++ )
  348. {
  349. dist =
  350. hamming_distance_from_encoding ( mygenomes + i,
  351. mygenomes + j );
  352. fprintf ( outfile,
  353. "The Hamming Distance between %s and %s is %3d (BP=%3d)\n",
  354. mygenomes[i].gnamePtr, mygenomes[j].gnamePtr, dist,
  355. dist / 2 );
  356. }
  357. }
  358. fprintf ( outfile, "\n" );
  359. return;
  360. }
  361. int
  362. hamming_distance_quad ( struct genome_struct *g1, struct genome_struct *g2,
  363. int num_genes )
  364. {
  365. int i, num;
  366. int gene0, gene1;
  367. num = 0;
  368. for ( i = 0; i < num_genes; i++ )
  369. {
  370. gene0 = g1->genes[i];
  371. gene1 = g1->genes[( i + 1 ) % num_genes];
  372. num += containsPair ( g2, gene0, gene1, num_genes );
  373. }
  374. num = num_genes - num;
  375. return ( num );
  376. }
  377. #define setAdj(g0, g1, gAdj, n) \
  378. if ((g0)>0) \
  379. gAdj[(g0)] = (g1); \
  380. else \
  381. gAdj[(n) - (g0)] = (g1); \
  382. if ((g1)>0) \
  383. gAdj[(n) + (g1)] = -(g0); \
  384. else \
  385. gAdj[-(g1)] = -(g0);
  386. #define checkAdj(g0, g1, gAdj, n) \
  387. if ((g0)>0) { \
  388. if ((g1)>0) { \
  389. if ((gAdj[(g0)] != (g1)) && (gAdj[(n) + (g1)] != -(g0))) \
  390. num++; \
  391. } \
  392. else { \
  393. if ((gAdj[(g0)] != (g1)) && (gAdj[-(g1)] != -(g0))) \
  394. num++; \
  395. } \
  396. } \
  397. else { \
  398. if ((g1)>0) { \
  399. if ((gAdj[(n) - (g0)] != (g1)) && (gAdj[(n) + (g1)] != -(g0))) \
  400. num++; \
  401. } \
  402. else { \
  403. if ((gAdj[(n) - (g0)] != (g1)) && (gAdj[-(g1)] != -(g0))) \
  404. num++; \
  405. } \
  406. }
  407. int
  408. hamming_distance ( struct genome_struct *g1, struct genome_struct *g2,
  409. int num_genes, int CIRCULAR, int *geneAdj )
  410. {
  411. register int i;
  412. register int gene0, gene1, firstgene;
  413. int num;
  414. num = 0;
  415. firstgene = gene0 = g1->genes[0];
  416. gene1 = g1->genes[1];
  417. setAdj ( gene0, gene1, geneAdj, num_genes );
  418. for ( i = 1; i < num_genes - 1; i++ )
  419. {
  420. gene0 = gene1;
  421. gene1 = g1->genes[i + 1];
  422. setAdj ( gene0, gene1, geneAdj, num_genes );
  423. }
  424. gene0 = gene1;
  425. gene1 = firstgene;
  426. setAdj ( gene0, gene1, geneAdj, num_genes );
  427. firstgene = gene0 = g2->genes[0];
  428. gene1 = g2->genes[1];
  429. checkAdj ( gene0, gene1, geneAdj, num_genes );
  430. for ( i = 1; i < num_genes - 1; i++ )
  431. {
  432. gene0 = gene1;
  433. gene1 = g2->genes[i + 1];
  434. checkAdj ( gene0, gene1, geneAdj, num_genes );
  435. }
  436. if ( CIRCULAR )
  437. {
  438. gene0 = gene1;
  439. gene1 = firstgene;
  440. checkAdj ( gene0, gene1, geneAdj, num_genes );
  441. }
  442. return ( num );
  443. }
  444. int
  445. hamming_distance_nomem ( struct genome_struct *g1, struct genome_struct *g2,
  446. int num_genes, int CIRCULAR )
  447. {
  448. int dist;
  449. int *geneAdj;
  450. geneAdj = ( int * ) malloc ( ( num_genes + 1 ) * 2 * sizeof ( int ) );
  451. if ( geneAdj == ( int * ) NULL )
  452. fprintf ( stderr, "ERROR: geneAdj NULL\n" );
  453. dist = hamming_distance ( g1, g2, num_genes, CIRCULAR, geneAdj );
  454. free ( geneAdj );
  455. return ( dist );
  456. }
  457. void
  458. setBPmatrix ( int **distmatrix, struct genome_struct *genomes,
  459. int num_genes, int num_genomes, distmem_t * distmem,
  460. int CIRCULAR )
  461. {
  462. int i, j;
  463. for ( i = 0; i < num_genomes; i++ )
  464. {
  465. distmatrix[i][i] = 0;
  466. for ( j = i + 1; j < num_genomes; j++ )
  467. {
  468. distmatrix[i][j] = distmatrix[j][i] =
  469. hamming_distance ( genomes + i, genomes + j, num_genes,
  470. CIRCULAR, distmem->hammingArr );
  471. }
  472. }
  473. }
  474. void
  475. findAllHammingDistancesFromScratch ( struct genome_struct *mygenomes,
  476. int num_genomes, int num_genes,
  477. int CIRCULAR )
  478. {
  479. int i, j, dist;
  480. int *geneAdj;
  481. geneAdj = ( int * ) malloc ( ( num_genes + 1 ) * 2 * sizeof ( int ) );
  482. fprintf ( outfile, "HAMMING DISTANCE From SCRATCH:\n" );
  483. for ( i = 0; i < num_genomes; i++ )
  484. {
  485. for ( j = i + 1; j < num_genomes; j++ )
  486. {
  487. dist = hamming_distance ( mygenomes + i, mygenomes + j, num_genes,
  488. CIRCULAR, geneAdj );
  489. fprintf ( outfile,
  490. "The Hamming Distance between %s and %s is %3d (BP=%3d)\n",
  491. mygenomes[i].gnamePtr, mygenomes[j].gnamePtr, dist,
  492. dist / 2 );
  493. }
  494. }
  495. fprintf ( outfile, "\n" );
  496. free ( geneAdj );
  497. return;
  498. }
  499. void
  500. main_binencode ( )
  501. {
  502. int NUM_GENES = 0;
  503. int NUM_GENOMES = 0;
  504. int CIRCULAR = TRUE;
  505. struct genome_struct *mygenomes;
  506. /* Allocate the genomes */
  507. mygenomes = ( struct genome_struct * )
  508. malloc ( NUM_GENOMES * sizeof ( struct genome_struct ) );
  509. if ( mygenomes == ( struct genome_struct * ) NULL )
  510. fprintf ( stderr, "ERROR: mygenomes is NULL\n" );
  511. /* Fill the genomes with sample data */
  512. fillGenomes ( mygenomes, NUM_GENOMES, NUM_GENES );
  513. /* Print out the original genomes */
  514. printGenomes ( mygenomes, NUM_GENOMES, NUM_GENES );
  515. /* Encode the genomes */
  516. encodeGenomes ( mygenomes, NUM_GENOMES, NUM_GENES );
  517. /* Print out the genome encodings */
  518. printGenomeEncodings ( mygenomes, NUM_GENOMES );
  519. /* Calculate Hamming Distances */
  520. findAllHammingDistancesFromEncoding ( mygenomes, NUM_GENOMES, NUM_GENES );
  521. /* Calculate Hamming Distances */
  522. findAllHammingDistancesFromScratch ( mygenomes, NUM_GENOMES, NUM_GENES,
  523. CIRCULAR );
  524. /* Free the genomes */
  525. freeGenomes ( mygenomes, NUM_GENOMES );
  526. return;
  527. }