/00_SCRIPTS/hinton_lab/00_will/RNA-SEQ-pipeline/QC_data.py
Python | 270 lines | 249 code | 2 blank | 19 comment | 13 complexity | 79c7ff4f795b47e444a96483e803dc27 MD5 | raw file
- #!/usr/bin/env python3
- import os, sys, subprocess, re, getopt
- """
- ################################################################################
- ### INFO
- This script QCs the supplied FASTQ file. It runs FastQC and Trimmomatic (with a sliding window, quality based trim), it then re-runs FastQC and inspects the reads with Kraken.
- version = 0.2
- author = Will Rowe
- email = will.rowe@liverpool.ac.uk
- ################################################################################
- ### NOTES
- This script is not complete:
- need to add customisation of parameters
- need to make stand alone (name==main)
- Still working the most appropriate interpretation of Kraken results. At the moment the script looks at the number of unclassified reads and the number belonging to our genus of interest (Salmonella). Considering adding a feature to raise a warning flag if either of these are outside a 'normal' range.
- """
- ################################################################################
- ### SET PATHS AND DEFAULTS
- # path to software on cluster
- fastqc_path = '/usr/local/share/FastQC/fastqc'
- kraken_path = '/pub46/willr/000_HOME/0005_RNA-SEQ-PIPELINE/01_BIN/kraken'
- kraken_report_path = '/pub46/willr/000_HOME/0005_RNA-SEQ-PIPELINE/01_BIN/kraken-report'
- kraken_db = '/pub46/willr/000_HOME/0002_REF_DATA/0004_DBs/minikraken_20141208'
- trimm_path = '/pub46/willr/000_HOME/0005_RNA-SEQ-PIPELINE/01_BIN/trimmomatic-0.36.jar'
- # default settings
- threads = '20'
- length_cutoff = '20'
- results_sub_dir = '.'
- kraken_search_term = 'Salmonella\n' # include \n to only match Genus in Kraken report file
- ################################################################################
- ### FUNCTIONS
- """
- QC data
- this function runs FastQC and Trimmomatic on the raw fastq data, then runs Kraken and FastQC on the trimmed reads.
- returns a QC results file and fastq.gz file of trimmed reads
- """
- def qc_data(input_fastq_file, length_cutoff, results_sub_dir, file_basename, QC_results_file):
- # save trimmed input data as
- trimmed_data = results_sub_dir + '/trimmed.' + file_basename
- # QC commands
- fastqc_cmd = fastqc_path + ' -t ' + threads + ' ' + input_fastq_file + ' --outdir ' + results_sub_dir
- trimm_cmd = trimm_path + ' SE -threads ' + threads + ' ' + input_fastq_file + ' ' + trimmed_data + ' ' + ' SLIDINGWINDOW:4:20 MINLEN:' + length_cutoff + ' &> ' + results_sub_dir + '/trimm.log'
- kraken_cmd = kraken_path + ' --threads ' + threads + ' --preload --fastq-input --gzip-compressed --db ' + kraken_db + ' ' + trimmed_data + ' | ' + kraken_report_path + ' --db ' + kraken_db + ' > ' + results_sub_dir + '/krakenreport'
- fastqc_cmd_2 = fastqc_path + ' -t ' + threads + ' ' + trimmed_data + ' --outdir ' + results_sub_dir
- # run fastqc subprocess
- processes = []
- with open('%s/fastqc.log' % results_sub_dir, 'w') as fastqc_log:
- p1 = subprocess.Popen(fastqc_cmd, shell=True, stdout=fastqc_log, stderr=fastqc_log)
- processes.append(p1)
- # os.system for trimmomatic
- trimm_log = results_sub_dir + '/trimm.log'
- os.system(trimm_cmd)
- # wait for fastqc subprocesses to complete:
- exit_codes = [p.wait() for p in processes]
- # retrieve QC stats for raw data
- sample_stats = {}
- # retrieve QC stats for raw data - initial fastqc of data
- fastqc_file = file_basename + '_fastqc/fastqc_data.txt'
- fastqc_file = fastqc_file.replace('.fastq.gz','')
- with open('{}/{}' .format(results_sub_dir, fastqc_file), 'r') as fh:
- fastqc_lines = fh.readlines()
- fastqc_base_seq_qual = [line for line in fastqc_lines if re.search('>>Per base sequence quality\t', line)]
- fastqc_len_dist = [line for line in fastqc_lines if re.search('>>Sequence Length Distribution\t', line)]
- fastqc_seq_len = [line for line in fastqc_lines if re.match('Sequence length', line)]
- sample_stats['fastqc'] = fastqc_base_seq_qual, fastqc_len_dist, fastqc_seq_len
- # retrieve QC stats for raw data - trimmomatic summary
- with open('{}/trimm.log' .format(results_sub_dir), 'r') as fh:
- trimm_lines = fh.readlines()
- trimm_summary = [line for line in trimm_lines if re.search(' Surviving: ', line)]
- sample_stats['trimmomatic'] = trimm_summary
- # parse QC stats for raw data - initial fastqc
- if re.search('pass', str(sample_stats["fastqc"][0])) is not None:
- print ('\t\t*FastQC:\tsequence base quality\t==>\tPASS')
- base_qual = 'PASS'
- else:
- print ('\t\t*FastQC:\tsequence base quality\t==>\tFAIL')
- base_qual = 'FAIL'
- if re.search('pass', str(sample_stats["fastqc"][1])) is not None:
- print ('\t\t*FastQC:\tseq length distribution\t==>\tPASS')
- len_dist = 'PASS'
- else:
- print ('\t\t*FastQC:\tseq length distribution\t==>\tFAIL')
- len_dist = 'FAIL'
- fqc_search = re.compile(r"Sequence length\\t(.*)\\t\\n")
- fastqc_match = re.search(fqc_search, str(sample_stats["fastqc"][2]))
- if fastqc_match:
- print ('\t\t*FastQC:\tinitial sequence length\t==>\t{}' .format(fastqc_match.group(1)))
- seq_length = fastqc_match.group(1)
- else:
- print ('\t\t*FastQC:\tinitial sequence length\t==>\terror with parsing fastqc results\n')
- seq_length = 'ERROR'
- # parse QC stats for raw data - trimmomatic
- trimm_match = re.search('Dropped:\s+\d+\s+\((\S+)\)', str(sample_stats["trimmomatic"]))
- if trimm_match:
- print ('\t\t*Trimmomatic:\tdropped sequences\t==>\t{}' .format(trimm_match.group(1)))
- dropped_seqs = trimm_match.group(1)
- else:
- print ('\t\t*Trimmomatic:\tdropped sequences\t==>\terror with parsing trimmomatic results\n')
- dropped_seqs = 'ERROR'
- # run second fastqc + kraken subprocesses
- print ('\n\n\t*Completed FastQC and Trimmomatic jobs!\n\n\n\t*Running FastQC and and Kraken on trimmed data . . .\n\n')
- processes2 = []
- with open('%s/fastqc2.log' % results_sub_dir, 'w') as fastqc_log2:
- p1 = subprocess.Popen(fastqc_cmd_2, shell=True, stdout=fastqc_log2, stderr=fastqc_log2)
- processes2.append(p1)
- with open('%s/kraken.log' % results_sub_dir, 'w') as kraken_log:
- p2 = subprocess.Popen(kraken_cmd, shell=True, stdout=kraken_log, stderr=kraken_log)
- processes2.append(p2)
- # wait for subprocesses to complete:
- exit_codes = [p.wait() for p in processes2]
- # retrieve QC stats - use kraken log to find number of unclassified sequences and the total number of sequences in the trimmed dataset
- with open('{}/kraken.log' .format(results_sub_dir), 'r') as fh:
- kraken_lines = fh.readlines()
- kraken_unclassified = [line for line in kraken_lines if re.search('sequences unclassified ', line)]
- sample_stats['kraken'] = kraken_unclassified
- for line in kraken_lines:
- seq_num_match = re.search('\r(\d+) sequences \(.*\) processed in', line)
- if seq_num_match:
- seq_num = seq_num_match.group(1)
- break
- else:
- seq_num = 'Could not parse Kraken log file for seq num'
- # retrieve QC stats - use krakenreport to find number of salmonella sequences
- with open('{}/krakenreport' .format(results_sub_dir), 'r') as fh:
- krakenreport_lines = fh.readlines()
- kraken_search = [line for line in krakenreport_lines if re.search(kraken_search_term, line)]
- sample_stats['kraken_search_result'] = kraken_search
- # retrieve QC stats - fastqc results for trimmed data
- trimmed_basename = os.path.basename(trimmed_data)
- fastqc_file2 = trimmed_basename + '_fastqc/fastqc_data.txt'
- fastqc_file2 = fastqc_file2.replace('.fastq.gz','')
- with open('{}/{}' .format(results_sub_dir, fastqc_file2), 'r') as fh:
- fastqc_lines2 = fh.readlines()
- fastqc_base_seq_qual_trimmed = [line for line in fastqc_lines2 if re.search('>>Per base sequence quality\t', line)]
- fastqc_len_dist_trimmed = [line for line in fastqc_lines2 if re.search('>>Sequence Length Distribution\t', line)]
- fastqc_seq_len_trimmed = [line for line in fastqc_lines2 if re.match('Sequence length', line)]
- sample_stats['fastqc_trimmed'] = fastqc_base_seq_qual_trimmed, fastqc_len_dist_trimmed, fastqc_seq_len_trimmed
- # parse QC stats for results - kraken
- kraken_match = re.search('\((\S+)\)', str(sample_stats["kraken"]))
- if kraken_match:
- print ('\t\t*Kraken:\tunclassified sequences\t==>\t{}' .format(kraken_match.group(1)))
- unclassified_reads = kraken_match.group(1)
- else:
- print ('\t\t*Kraken:\tunclassified sequences\t==>\terror with parsing kraken results\n')
- unclassified_reads = 'ERROR'
- kraken_search_match = re.search('\s+(\d+\.\d+)', str(sample_stats["kraken_search_result"]))
- if kraken_search_match:
- print ('\t\t*Kraken:\tSalmonella sequences\t==>\t{}%' .format(kraken_search_match.group(1)))
- search_term_reads = kraken_search_match.group(1)
- else:
- print ('\t\t*Kraken:\tSalmonella sequences\t==>\terror with parsing kraken results for search term ({})\n' .format(kraken_search_term))
- search_term_reads = 'ERROR'
- # parse QC stats for results - fastqc
- if re.search('pass', str(sample_stats["fastqc_trimmed"][0])) is not None:
- print ('\t\t*FastQC:\tsequence base quality\t==>\tPASS')
- base_qual_trimmed = 'PASS'
- else:
- print ('\t\t*FastQC:\tsequence base quality\t==>\tFAIL')
- base_qual_trimmed = 'FAIL'
- if re.search('pass', str(sample_stats["fastqc_trimmed"][1])) is not None:
- print ('\t\t*FastQC:\tseq length distribution\t==>\tPASS')
- len_dist_trimmed = 'PASS'
- else:
- print ('\t\t*FastQC:\tseq length distribution\t==>\tFAIL')
- len_dist_trimmed = 'FAIL'
- fqc_search = re.compile(r"Sequence length\\t(.*)\\t\\n")
- fastqc2_match = re.search(fqc_search, str(sample_stats["fastqc_trimmed"][2]))
- if fastqc2_match:
- print ('\t\t*FastQC:\ttrimmed sequence length\t==>\t{}' .format(fastqc2_match.group(1)))
- seq_length_trimmed = fastqc2_match.group(1)
- else:
- print ('\t\t*FastQC:\ttrimmed sequence length\t==>\terror with parsing fastqc results\n')
- seq_length_trimmed = 'ERROR'
- # write results to file
- with open(QC_results_file, 'w') as fh:
- fh.write('{}\t{}\t{}\t{}\t{}\t\t{}\t{}\t{}\t{}\t{}%\t{}\t\n' .format(file_basename, seq_length, base_qual, len_dist, dropped_seqs, seq_length_trimmed, base_qual_trimmed, len_dist_trimmed, unclassified_reads, search_term_reads, seq_num))
- return trimmed_data
- ################################################################################
- ### MAIN SCRIPT
- if __name__ == "__main__":
- # read commandline arguments and create variables
- I_met, O_met = (False,)*3
- try:
- myopts, args = getopt.getopt(sys.argv[1:],'i:o:d:l:')
- except getopt.GetoptError as e:
- print (str(e))
- print ('usage: {} -i [input .fastq.gz file] -o [output file] -d [directory for output files] -l [length cut-off value]' .format(sys.argv[0]))
- sys.exit(2)
- for option, argument in myopts:
- # get input file (required)
- if option == '-i':
- input_fastq_file = os.path.abspath(argument)
- try:
- os.path.isfile(input_fastq_file)
- except Exception, e:
- print >> sys.stderr, 'Supplied input does not appear to be a file . . .'
- print >> sys.stderr, 'Exception: %s' % str(e)
- sys.exit(1)
- file_basename = os.path.basename(input_fastq_file)
- I_met = True
- # get name for QC summary file (required)
- if option == '-o':
- QC_results_file = os.path.abspath(argument)
- O_met = True
- # get directory for output files (kraken report etc.)
- if option == '-d':
- results_sub_dir = os.path.abspath(argument)
- if not os.path.exists(results_sub_dir):
- try:
- os.makedirs(results_sub_dir)
- except Exception, e:
- print >> sys.stderr, 'Can\'t create the supplied results directory . . .'
- print >> sys.stderr, 'Exception: %s' % str(e)
- sys.exit(1)
- # get length cutoff value
- if option == '-l':
- length_cutoff = argument
- try:
- length_cutoff.isdigit()
- except Exception, e:
- print >> sys.stderr, 'The supplied length cut-off value does not appear to be a digit . . .'
- print >> sys.stderr, 'Exception: %s' % str(e)
- sys.exit(1)
- if not any((I_met, O_met)):
- print ('options i and o must both be supplied!\n')
- print ('usage: {} -i [input .fastq.gz file] -o [output file] -d [directory for output files] -l [length cut-off value]' .format(sys.argv[0]))
- sys.exit(2)
- # run the QC module
- try:
- QC_results = qc_data(input_fastq_file, length_cutoff, results_sub_dir, file_basename, QC_results_file)
- except Exception, e:
- print >> sys.stderr, 'Can\'t run QC module . . .'
- print >> sys.stderr, 'Exception: %s' % str(e)
- sys.exit(1)