/tool_shed/java-tools/BamTools/src/main/java/ca/mcgill/genome/bamtools/commands/FilterDuplicates.java

https://bitbucket.org/CharlesJB/mugqic_pipeline · Java · 369 lines · 329 code · 35 blank · 5 comment · 73 complexity · 5c38f0a3e8ed17c05df137df61b3a8ef MD5 · raw file

  1. package ca.mcgill.genome.bamtools.commands;
  2. import java.io.BufferedWriter;
  3. import java.io.File;
  4. import java.io.FileOutputStream;
  5. import java.io.IOException;
  6. import java.io.OutputStreamWriter;
  7. import java.nio.charset.Charset;
  8. import java.util.zip.GZIPOutputStream;
  9. import ca.mcgill.genome.bamtools.DefaultTool;
  10. import ca.mcgill.genome.bamtools.filterdups.DuplicateKmerBuilder;
  11. import ca.mcgill.genome.bamtools.filterdups.SequenceLengthException;
  12. import ca.mcgill.genome.bamtools.parsers.FASTQFileParser;
  13. import ca.mcgill.genome.bamtools.parsers.FASTQPEFileParser;
  14. import ca.mcgill.genome.bamtools.parsers.Sequence;
  15. import ca.mcgill.genome.bamtools.parsers.SequenceFileParser;
  16. public class FilterDuplicates extends DefaultTool {
  17. private final DuplicateKmerBuilder duplicateKmerBuilder = new DuplicateKmerBuilder();
  18. private File read1 = null;
  19. private File read2 = null;
  20. private byte qualOffset = 33;
  21. private String prefix = null;
  22. private boolean onlyOutputReadNames = false;
  23. @Override
  24. public String getCmdName() {
  25. return "filterdups";
  26. }
  27. @Override
  28. public String getCmdUsage() {
  29. return super.getCmdUsage();
  30. }
  31. private void printUsage(String errMsg) {
  32. System.out.println(errMsg);
  33. printUsageHeader();
  34. System.out.println("\t--read1 Fastq of read1");
  35. System.out.println("\t--read2 Fastq of read2 (optional)");
  36. System.out.println("\t-Q Fastq Quality offset. (default: "+ qualOffset + ")");
  37. System.out.println("\t--prefix Output prefix (default original filename .dup.gz)");
  38. System.out.println("\t--readOutput Only output duplicate read names");
  39. System.out.println("\t-k,--kmer kmer size (default: " + duplicateKmerBuilder.getKmerSize() + ")");
  40. System.out.println("\t-o,--offset Read offset to get kmer (default: " + duplicateKmerBuilder.getOffset() + ")");
  41. }
  42. @Override
  43. public int run(String[] args) {
  44. try {
  45. for (int idx = 1; idx < args.length; idx++) {
  46. if (args[idx].equals("--read1")) {
  47. idx++;
  48. read1 = new File(args[idx]);
  49. } else if (args[idx].equals("--read2")) {
  50. idx++;
  51. read2 = new File(args[idx]);
  52. } else if (args[idx].equals("-Q")) {
  53. idx++;
  54. qualOffset = Byte.parseByte(args[idx]);
  55. } else if (args[idx].equals("--readOutput")) {
  56. onlyOutputReadNames = true;
  57. } else if (args[idx].equals("--prefix")) {
  58. idx++;
  59. prefix = args[idx];
  60. } else if (args[idx].equals("--kmer") || args[idx].equals("-k")) {
  61. idx++;
  62. duplicateKmerBuilder.setKmerSize(Integer.parseInt(args[idx]));
  63. } else if (args[idx].equals("--offset")
  64. || args[idx].equals("-o")) {
  65. idx++;
  66. duplicateKmerBuilder.setOffset(Integer.parseInt(args[idx]));
  67. }
  68. }
  69. if (read1 == null) {
  70. printUsage("read1 not set");
  71. return 1;
  72. }
  73. filterDups();
  74. } catch (Exception e) {
  75. throw new RuntimeException(e);
  76. }
  77. return 0;
  78. }
  79. public void filterDups() {
  80. BufferedWriter read1Writer = null;
  81. BufferedWriter read2Writer = null;
  82. try {
  83. String read1OutputFile;
  84. if (prefix != null) {
  85. read1OutputFile = prefix + ".read1.gz";
  86. } else {
  87. read1OutputFile = read1 + ".dup.read1.gz";
  88. }
  89. String read2OutputFile = null;
  90. SequenceFileParser fileParser;
  91. long totalFileSize = read1.length();
  92. if (read2 != null) {
  93. if (prefix != null) {
  94. read2OutputFile = prefix + ".read2.gz";
  95. } else {
  96. read2OutputFile = read2 + ".dup.read2.gz";
  97. }
  98. totalFileSize += read2OutputFile.length();
  99. fileParser = new FASTQPEFileParser(read1, read2, qualOffset);
  100. duplicateKmerBuilder.setPairedSequence(true);
  101. } else {
  102. fileParser = new FASTQFileParser(read1, qualOffset);
  103. }
  104. long lastPercent = -1;
  105. System.out.println("Build kmer ["
  106. + duplicateKmerBuilder.getKmerSize() + "] Hash");
  107. while (true) {
  108. Sequence sequence = fileParser.nextSequence();
  109. if (sequence == null)
  110. break;
  111. if (sequence.getSequence().length() <= (duplicateKmerBuilder
  112. .getKmerSize() + duplicateKmerBuilder.getOffset())) {
  113. continue;
  114. }
  115. duplicateKmerBuilder.processSequence(sequence);
  116. long percent = fileParser.getByteCount() * 100l / totalFileSize;
  117. if (percent != lastPercent) {
  118. lastPercent = percent;
  119. System.out.print("\rFile read: " + percent + "%");
  120. }
  121. }
  122. System.out.println();
  123. fileParser.close();
  124. double nbKmers = duplicateKmerBuilder.getNbKnownSequencesFound()
  125. .size();
  126. double total = duplicateKmerBuilder.getTotalNbReads();
  127. System.out.println("Dup: " + (100.0 - (nbKmers * 100.0d / total))
  128. + '%');
  129. // Second pass
  130. fileParser = new FASTQFileParser(read1, qualOffset);
  131. totalFileSize = read1.length();
  132. read1Writer = new BufferedWriter(
  133. new OutputStreamWriter(new GZIPOutputStream(
  134. new FileOutputStream(read1OutputFile)),
  135. Charset.forName("ASCII")));
  136. if (read2 != null) {
  137. FASTQFileParser filePEParser = new FASTQFileParser(read2,
  138. qualOffset);
  139. read2Writer = new BufferedWriter(new OutputStreamWriter(
  140. new GZIPOutputStream(new FileOutputStream(
  141. read2OutputFile)), Charset.forName("ASCII")));
  142. lastPercent = -1;
  143. System.out.println("Extract 'best' unique reads");
  144. while (true) {
  145. Sequence read1Sequence = fileParser.nextSequence();
  146. Sequence read2Sequence = filePEParser.nextSequence();
  147. if (read1Sequence == null)
  148. break;
  149. String sequence = read1Sequence.getSequence()
  150. + read2Sequence.getSequence();
  151. if (sequence.length() <= (duplicateKmerBuilder
  152. .getKmerSize() + duplicateKmerBuilder.getOffset())) {
  153. if (onlyOutputReadNames) {
  154. read1Writer.append(read1Sequence.getHeader());
  155. read1Writer.newLine();
  156. read2Writer.append(read2Sequence.getHeader());
  157. read2Writer.newLine();
  158. }
  159. continue;
  160. }
  161. int qualitySum = 0;
  162. for (int idx = 0; idx < read1Sequence.getBaseQualities().length; idx++) {
  163. qualitySum += read1Sequence.getBaseQualities()[idx];
  164. }
  165. for (int idx = 0; idx < read2Sequence.getBaseQualities().length; idx++) {
  166. qualitySum += read2Sequence.getBaseQualities()[idx];
  167. }
  168. boolean isWritable;
  169. try {
  170. isWritable = duplicateKmerBuilder
  171. .getKmerCountAndMark(
  172. sequence,
  173. qualitySum
  174. / (read1Sequence
  175. .getBaseQualities().length + read2Sequence
  176. .getBaseQualities().length));
  177. } catch (SequenceLengthException e) {
  178. throw new RuntimeException(
  179. "Kmer too long, or offset to big: "
  180. + read1Sequence.getHeader());
  181. }
  182. if (isWritable && !onlyOutputReadNames) {
  183. read1Writer.append('@').append(
  184. read1Sequence.getHeader());
  185. read1Writer.newLine();
  186. read1Writer.append(read1Sequence.getSequence());
  187. read1Writer.newLine();
  188. read1Writer.append('+').append(
  189. read1Sequence.getHeader());
  190. read1Writer.newLine();
  191. for (int idx = 0; idx < read1Sequence
  192. .getBaseQualities().length; idx++) {
  193. read1Writer.append((char) (read1Sequence
  194. .getBaseQualities()[idx] + 33));
  195. }
  196. read1Writer.newLine();
  197. read2Writer.append('@').append(
  198. read2Sequence.getHeader());
  199. read2Writer.newLine();
  200. read2Writer.append(read2Sequence.getSequence());
  201. read2Writer.newLine();
  202. read2Writer.append('+').append(
  203. read2Sequence.getHeader());
  204. read2Writer.newLine();
  205. for (int idx = 0; idx < read2Sequence
  206. .getBaseQualities().length; idx++) {
  207. read2Writer.append((char) (read2Sequence
  208. .getBaseQualities()[idx] + 33));
  209. }
  210. read2Writer.newLine();
  211. } else if (!isWritable && onlyOutputReadNames) {
  212. read1Writer.append(read1Sequence.getHeader());
  213. read1Writer.newLine();
  214. read2Writer.append(read2Sequence.getHeader());
  215. read2Writer.newLine();
  216. }
  217. long percent = fileParser.getByteCount() * 100l
  218. / totalFileSize;
  219. if (percent != lastPercent) {
  220. lastPercent = percent;
  221. System.out.print("\rFile read: " + percent + "%");
  222. }
  223. }
  224. System.out.println();
  225. read1Writer.close();
  226. read2Writer.close();
  227. fileParser.close();
  228. filePEParser.close();
  229. } else {
  230. lastPercent = -1;
  231. System.out.println("Extract 'best' unique reads");
  232. while (true) {
  233. Sequence read1Sequence = fileParser.nextSequence();
  234. if (read1Sequence == null)
  235. break;
  236. String sequence = read1Sequence.getSequence();
  237. if (sequence.length() <= (duplicateKmerBuilder
  238. .getKmerSize() + duplicateKmerBuilder.getOffset())) {
  239. if (onlyOutputReadNames) {
  240. read1Writer.append(read1Sequence.getHeader());
  241. read1Writer.newLine();
  242. }
  243. continue;
  244. }
  245. int qualitySum = 0;
  246. for (int idx = 0; idx < read1Sequence.getBaseQualities().length; idx++) {
  247. qualitySum += read1Sequence.getBaseQualities()[idx];
  248. }
  249. boolean isWritable;
  250. try {
  251. isWritable = duplicateKmerBuilder
  252. .getKmerCountAndMark(
  253. sequence,
  254. qualitySum
  255. / read1Sequence
  256. .getBaseQualities().length);
  257. } catch (SequenceLengthException e) {
  258. throw new RuntimeException(
  259. "Kmer too long, or offset to big: "
  260. + read1Sequence.getHeader());
  261. }
  262. if (isWritable && !onlyOutputReadNames) {
  263. read1Writer.append('@').append(
  264. read1Sequence.getHeader());
  265. read1Writer.newLine();
  266. read1Writer.append(read1Sequence.getSequence());
  267. read1Writer.newLine();
  268. read1Writer.append('+').append(
  269. read1Sequence.getHeader());
  270. read1Writer.newLine();
  271. for (int idx = 0; idx < read1Sequence
  272. .getBaseQualities().length; idx++) {
  273. read1Writer.append((char) (read1Sequence
  274. .getBaseQualities()[idx] + 33));
  275. }
  276. read1Writer.newLine();
  277. } else if (!isWritable && onlyOutputReadNames) {
  278. read1Writer.append(read1Sequence.getHeader());
  279. read1Writer.newLine();
  280. }
  281. long percent = fileParser.getByteCount() * 100l
  282. / totalFileSize;
  283. if (percent != lastPercent) {
  284. lastPercent = percent;
  285. System.out.print("\rFile read: " + percent + "%");
  286. }
  287. }
  288. System.out.println();
  289. fileParser.close();
  290. }
  291. } catch (IOException e) {
  292. e.printStackTrace();
  293. } finally {
  294. if (read1Writer != null) {
  295. try {
  296. read1Writer.close();
  297. } catch (IOException e) {
  298. // oh well
  299. }
  300. }
  301. if (read2Writer != null) {
  302. try {
  303. read2Writer.close();
  304. } catch (IOException e) {
  305. // oh well
  306. }
  307. }
  308. }
  309. }
  310. public static void printGC() {
  311. runGC();
  312. System.out.println("Mem: " + (usedMemory() / 1024 / 1024));
  313. }
  314. private static void runGC() {
  315. // It helps to call Runtime.gc()
  316. // using several method calls:
  317. for (int r = 0; r < 4; ++r)
  318. _runGC();
  319. }
  320. private static void _runGC() {
  321. long usedMem1 = usedMemory(), usedMem2 = Long.MAX_VALUE;
  322. for (int i = 0; (usedMem1 < usedMem2) && (i < 500); ++i) {
  323. s_runtime.runFinalization();
  324. s_runtime.gc();
  325. Thread.yield();
  326. usedMem2 = usedMem1;
  327. usedMem1 = usedMemory();
  328. }
  329. }
  330. private static long usedMemory() {
  331. return s_runtime.totalMemory() - s_runtime.freeMemory();
  332. }
  333. private static final Runtime s_runtime = Runtime.getRuntime();
  334. }