PageRenderTime 43ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/00_SCRIPTS/hinton_lab/00_will/RNA-SEQ-pipeline/QC_data.py

https://gitlab.com/will_rowe/biodock
Python | 270 lines | 249 code | 2 blank | 19 comment | 13 complexity | 79c7ff4f795b47e444a96483e803dc27 MD5 | raw file
  1. #!/usr/bin/env python3
  2. import os, sys, subprocess, re, getopt
  3. """
  4. ################################################################################
  5. ### INFO
  6. This script QCs the supplied FASTQ file. It runs FastQC and Trimmomatic (with a sliding window, quality based trim), it then re-runs FastQC and inspects the reads with Kraken.
  7. version = 0.2
  8. author = Will Rowe
  9. email = will.rowe@liverpool.ac.uk
  10. ################################################################################
  11. ### NOTES
  12. This script is not complete:
  13. need to add customisation of parameters
  14. need to make stand alone (name==main)
  15. Still working the most appropriate interpretation of Kraken results. At the moment the script looks at the number of unclassified reads and the number belonging to our genus of interest (Salmonella). Considering adding a feature to raise a warning flag if either of these are outside a 'normal' range.
  16. """
  17. ################################################################################
  18. ### SET PATHS AND DEFAULTS
  19. # path to software on cluster
  20. fastqc_path = '/usr/local/share/FastQC/fastqc'
  21. kraken_path = '/pub46/willr/000_HOME/0005_RNA-SEQ-PIPELINE/01_BIN/kraken'
  22. kraken_report_path = '/pub46/willr/000_HOME/0005_RNA-SEQ-PIPELINE/01_BIN/kraken-report'
  23. kraken_db = '/pub46/willr/000_HOME/0002_REF_DATA/0004_DBs/minikraken_20141208'
  24. trimm_path = '/pub46/willr/000_HOME/0005_RNA-SEQ-PIPELINE/01_BIN/trimmomatic-0.36.jar'
  25. # default settings
  26. threads = '20'
  27. length_cutoff = '20'
  28. results_sub_dir = '.'
  29. kraken_search_term = 'Salmonella\n' # include \n to only match Genus in Kraken report file
  30. ################################################################################
  31. ### FUNCTIONS
  32. """
  33. QC data
  34. this function runs FastQC and Trimmomatic on the raw fastq data, then runs Kraken and FastQC on the trimmed reads.
  35. returns a QC results file and fastq.gz file of trimmed reads
  36. """
  37. def qc_data(input_fastq_file, length_cutoff, results_sub_dir, file_basename, QC_results_file):
  38. # save trimmed input data as
  39. trimmed_data = results_sub_dir + '/trimmed.' + file_basename
  40. # QC commands
  41. fastqc_cmd = fastqc_path + ' -t ' + threads + ' ' + input_fastq_file + ' --outdir ' + results_sub_dir
  42. trimm_cmd = trimm_path + ' SE -threads ' + threads + ' ' + input_fastq_file + ' ' + trimmed_data + ' ' + ' SLIDINGWINDOW:4:20 MINLEN:' + length_cutoff + ' &> ' + results_sub_dir + '/trimm.log'
  43. kraken_cmd = kraken_path + ' --threads ' + threads + ' --preload --fastq-input --gzip-compressed --db ' + kraken_db + ' ' + trimmed_data + ' | ' + kraken_report_path + ' --db ' + kraken_db + ' > ' + results_sub_dir + '/krakenreport'
  44. fastqc_cmd_2 = fastqc_path + ' -t ' + threads + ' ' + trimmed_data + ' --outdir ' + results_sub_dir
  45. # run fastqc subprocess
  46. processes = []
  47. with open('%s/fastqc.log' % results_sub_dir, 'w') as fastqc_log:
  48. p1 = subprocess.Popen(fastqc_cmd, shell=True, stdout=fastqc_log, stderr=fastqc_log)
  49. processes.append(p1)
  50. # os.system for trimmomatic
  51. trimm_log = results_sub_dir + '/trimm.log'
  52. os.system(trimm_cmd)
  53. # wait for fastqc subprocesses to complete:
  54. exit_codes = [p.wait() for p in processes]
  55. # retrieve QC stats for raw data
  56. sample_stats = {}
  57. # retrieve QC stats for raw data - initial fastqc of data
  58. fastqc_file = file_basename + '_fastqc/fastqc_data.txt'
  59. fastqc_file = fastqc_file.replace('.fastq.gz','')
  60. with open('{}/{}' .format(results_sub_dir, fastqc_file), 'r') as fh:
  61. fastqc_lines = fh.readlines()
  62. fastqc_base_seq_qual = [line for line in fastqc_lines if re.search('>>Per base sequence quality\t', line)]
  63. fastqc_len_dist = [line for line in fastqc_lines if re.search('>>Sequence Length Distribution\t', line)]
  64. fastqc_seq_len = [line for line in fastqc_lines if re.match('Sequence length', line)]
  65. sample_stats['fastqc'] = fastqc_base_seq_qual, fastqc_len_dist, fastqc_seq_len
  66. # retrieve QC stats for raw data - trimmomatic summary
  67. with open('{}/trimm.log' .format(results_sub_dir), 'r') as fh:
  68. trimm_lines = fh.readlines()
  69. trimm_summary = [line for line in trimm_lines if re.search(' Surviving: ', line)]
  70. sample_stats['trimmomatic'] = trimm_summary
  71. # parse QC stats for raw data - initial fastqc
  72. if re.search('pass', str(sample_stats["fastqc"][0])) is not None:
  73. print ('\t\t*FastQC:\tsequence base quality\t==>\tPASS')
  74. base_qual = 'PASS'
  75. else:
  76. print ('\t\t*FastQC:\tsequence base quality\t==>\tFAIL')
  77. base_qual = 'FAIL'
  78. if re.search('pass', str(sample_stats["fastqc"][1])) is not None:
  79. print ('\t\t*FastQC:\tseq length distribution\t==>\tPASS')
  80. len_dist = 'PASS'
  81. else:
  82. print ('\t\t*FastQC:\tseq length distribution\t==>\tFAIL')
  83. len_dist = 'FAIL'
  84. fqc_search = re.compile(r"Sequence length\\t(.*)\\t\\n")
  85. fastqc_match = re.search(fqc_search, str(sample_stats["fastqc"][2]))
  86. if fastqc_match:
  87. print ('\t\t*FastQC:\tinitial sequence length\t==>\t{}' .format(fastqc_match.group(1)))
  88. seq_length = fastqc_match.group(1)
  89. else:
  90. print ('\t\t*FastQC:\tinitial sequence length\t==>\terror with parsing fastqc results\n')
  91. seq_length = 'ERROR'
  92. # parse QC stats for raw data - trimmomatic
  93. trimm_match = re.search('Dropped:\s+\d+\s+\((\S+)\)', str(sample_stats["trimmomatic"]))
  94. if trimm_match:
  95. print ('\t\t*Trimmomatic:\tdropped sequences\t==>\t{}' .format(trimm_match.group(1)))
  96. dropped_seqs = trimm_match.group(1)
  97. else:
  98. print ('\t\t*Trimmomatic:\tdropped sequences\t==>\terror with parsing trimmomatic results\n')
  99. dropped_seqs = 'ERROR'
  100. # run second fastqc + kraken subprocesses
  101. print ('\n\n\t*Completed FastQC and Trimmomatic jobs!\n\n\n\t*Running FastQC and and Kraken on trimmed data . . .\n\n')
  102. processes2 = []
  103. with open('%s/fastqc2.log' % results_sub_dir, 'w') as fastqc_log2:
  104. p1 = subprocess.Popen(fastqc_cmd_2, shell=True, stdout=fastqc_log2, stderr=fastqc_log2)
  105. processes2.append(p1)
  106. with open('%s/kraken.log' % results_sub_dir, 'w') as kraken_log:
  107. p2 = subprocess.Popen(kraken_cmd, shell=True, stdout=kraken_log, stderr=kraken_log)
  108. processes2.append(p2)
  109. # wait for subprocesses to complete:
  110. exit_codes = [p.wait() for p in processes2]
  111. # retrieve QC stats - use kraken log to find number of unclassified sequences and the total number of sequences in the trimmed dataset
  112. with open('{}/kraken.log' .format(results_sub_dir), 'r') as fh:
  113. kraken_lines = fh.readlines()
  114. kraken_unclassified = [line for line in kraken_lines if re.search('sequences unclassified ', line)]
  115. sample_stats['kraken'] = kraken_unclassified
  116. for line in kraken_lines:
  117. seq_num_match = re.search('\r(\d+) sequences \(.*\) processed in', line)
  118. if seq_num_match:
  119. seq_num = seq_num_match.group(1)
  120. break
  121. else:
  122. seq_num = 'Could not parse Kraken log file for seq num'
  123. # retrieve QC stats - use krakenreport to find number of salmonella sequences
  124. with open('{}/krakenreport' .format(results_sub_dir), 'r') as fh:
  125. krakenreport_lines = fh.readlines()
  126. kraken_search = [line for line in krakenreport_lines if re.search(kraken_search_term, line)]
  127. sample_stats['kraken_search_result'] = kraken_search
  128. # retrieve QC stats - fastqc results for trimmed data
  129. trimmed_basename = os.path.basename(trimmed_data)
  130. fastqc_file2 = trimmed_basename + '_fastqc/fastqc_data.txt'
  131. fastqc_file2 = fastqc_file2.replace('.fastq.gz','')
  132. with open('{}/{}' .format(results_sub_dir, fastqc_file2), 'r') as fh:
  133. fastqc_lines2 = fh.readlines()
  134. fastqc_base_seq_qual_trimmed = [line for line in fastqc_lines2 if re.search('>>Per base sequence quality\t', line)]
  135. fastqc_len_dist_trimmed = [line for line in fastqc_lines2 if re.search('>>Sequence Length Distribution\t', line)]
  136. fastqc_seq_len_trimmed = [line for line in fastqc_lines2 if re.match('Sequence length', line)]
  137. sample_stats['fastqc_trimmed'] = fastqc_base_seq_qual_trimmed, fastqc_len_dist_trimmed, fastqc_seq_len_trimmed
  138. # parse QC stats for results - kraken
  139. kraken_match = re.search('\((\S+)\)', str(sample_stats["kraken"]))
  140. if kraken_match:
  141. print ('\t\t*Kraken:\tunclassified sequences\t==>\t{}' .format(kraken_match.group(1)))
  142. unclassified_reads = kraken_match.group(1)
  143. else:
  144. print ('\t\t*Kraken:\tunclassified sequences\t==>\terror with parsing kraken results\n')
  145. unclassified_reads = 'ERROR'
  146. kraken_search_match = re.search('\s+(\d+\.\d+)', str(sample_stats["kraken_search_result"]))
  147. if kraken_search_match:
  148. print ('\t\t*Kraken:\tSalmonella sequences\t==>\t{}%' .format(kraken_search_match.group(1)))
  149. search_term_reads = kraken_search_match.group(1)
  150. else:
  151. print ('\t\t*Kraken:\tSalmonella sequences\t==>\terror with parsing kraken results for search term ({})\n' .format(kraken_search_term))
  152. search_term_reads = 'ERROR'
  153. # parse QC stats for results - fastqc
  154. if re.search('pass', str(sample_stats["fastqc_trimmed"][0])) is not None:
  155. print ('\t\t*FastQC:\tsequence base quality\t==>\tPASS')
  156. base_qual_trimmed = 'PASS'
  157. else:
  158. print ('\t\t*FastQC:\tsequence base quality\t==>\tFAIL')
  159. base_qual_trimmed = 'FAIL'
  160. if re.search('pass', str(sample_stats["fastqc_trimmed"][1])) is not None:
  161. print ('\t\t*FastQC:\tseq length distribution\t==>\tPASS')
  162. len_dist_trimmed = 'PASS'
  163. else:
  164. print ('\t\t*FastQC:\tseq length distribution\t==>\tFAIL')
  165. len_dist_trimmed = 'FAIL'
  166. fqc_search = re.compile(r"Sequence length\\t(.*)\\t\\n")
  167. fastqc2_match = re.search(fqc_search, str(sample_stats["fastqc_trimmed"][2]))
  168. if fastqc2_match:
  169. print ('\t\t*FastQC:\ttrimmed sequence length\t==>\t{}' .format(fastqc2_match.group(1)))
  170. seq_length_trimmed = fastqc2_match.group(1)
  171. else:
  172. print ('\t\t*FastQC:\ttrimmed sequence length\t==>\terror with parsing fastqc results\n')
  173. seq_length_trimmed = 'ERROR'
  174. # write results to file
  175. with open(QC_results_file, 'w') as fh:
  176. fh.write('{}\t{}\t{}\t{}\t{}\t\t{}\t{}\t{}\t{}\t{}%\t{}\t\n' .format(file_basename, seq_length, base_qual, len_dist, dropped_seqs, seq_length_trimmed, base_qual_trimmed, len_dist_trimmed, unclassified_reads, search_term_reads, seq_num))
  177. return trimmed_data
  178. ################################################################################
  179. ### MAIN SCRIPT
  180. if __name__ == "__main__":
  181. # read commandline arguments and create variables
  182. I_met, O_met = (False,)*3
  183. try:
  184. myopts, args = getopt.getopt(sys.argv[1:],'i:o:d:l:')
  185. except getopt.GetoptError as e:
  186. print (str(e))
  187. print ('usage: {} -i [input .fastq.gz file] -o [output file] -d [directory for output files] -l [length cut-off value]' .format(sys.argv[0]))
  188. sys.exit(2)
  189. for option, argument in myopts:
  190. # get input file (required)
  191. if option == '-i':
  192. input_fastq_file = os.path.abspath(argument)
  193. try:
  194. os.path.isfile(input_fastq_file)
  195. except Exception, e:
  196. print >> sys.stderr, 'Supplied input does not appear to be a file . . .'
  197. print >> sys.stderr, 'Exception: %s' % str(e)
  198. sys.exit(1)
  199. file_basename = os.path.basename(input_fastq_file)
  200. I_met = True
  201. # get name for QC summary file (required)
  202. if option == '-o':
  203. QC_results_file = os.path.abspath(argument)
  204. O_met = True
  205. # get directory for output files (kraken report etc.)
  206. if option == '-d':
  207. results_sub_dir = os.path.abspath(argument)
  208. if not os.path.exists(results_sub_dir):
  209. try:
  210. os.makedirs(results_sub_dir)
  211. except Exception, e:
  212. print >> sys.stderr, 'Can\'t create the supplied results directory . . .'
  213. print >> sys.stderr, 'Exception: %s' % str(e)
  214. sys.exit(1)
  215. # get length cutoff value
  216. if option == '-l':
  217. length_cutoff = argument
  218. try:
  219. length_cutoff.isdigit()
  220. except Exception, e:
  221. print >> sys.stderr, 'The supplied length cut-off value does not appear to be a digit . . .'
  222. print >> sys.stderr, 'Exception: %s' % str(e)
  223. sys.exit(1)
  224. if not any((I_met, O_met)):
  225. print ('options i and o must both be supplied!\n')
  226. print ('usage: {} -i [input .fastq.gz file] -o [output file] -d [directory for output files] -l [length cut-off value]' .format(sys.argv[0]))
  227. sys.exit(2)
  228. # run the QC module
  229. try:
  230. QC_results = qc_data(input_fastq_file, length_cutoff, results_sub_dir, file_basename, QC_results_file)
  231. except Exception, e:
  232. print >> sys.stderr, 'Can\'t run QC module . . .'
  233. print >> sys.stderr, 'Exception: %s' % str(e)
  234. sys.exit(1)