/filter_meme.py
Python | 113 lines | 59 code | 13 blank | 41 comment | 10 complexity | 78ed75940a0c7f3b40952caca903c16a MD5 | raw file
- #!/usr/bin/env python
- # John Blischak
- # 29 Oct 2012
- # Prepares fasta files from filtered ChIP-seq peaks in order to find enriched motifs
- # using MEMECHIP.
- # 1) Filter out peaks that do not meet criteria.
- # 2) Make a bed file of the summit +/- 50 bp.
- # 2) Extracts the fasta sequence of summits from the specified genome.
- ################################################################################
- '''
- Usage:
- filter_meme.py peaks.xls hg19.fasta.gz additional_args > filtered.fasta
- where:
- peaks.xls is the output from MACS.
- hg19.fasta.gz is the fasta sequence of an entire genome (gzipped or not).
- additional_args filter the peaks (optional):
- length=# --remove peaks with lenghts greater than #.
- tags=# --remove peaks with fewer than # tags.
- pval=# --remove peaks with -10*log10(p) less than #.
- enrich=# --remove peaks with fold enrichment less than #.
- FDR=# --remove peaks with FDR(%) greater than #.
- '''
- ################################################################################
- import os
- import sys
- import gzip
- import pybedtools as bt
- import pandas as pd
- ################################################################################
- def set_argvs(sys_argvs):
- '''
- Sets the filters by reading in user defined system arguments.
- '''
- # Default values (low stringency)
- length = 3000000000
- tags = 0
- pval = 0
- enrich = 1
- FDR = 100
- # User specified values
- for argv in sys_argvs:
- if argv[:6] == 'length':
- length = int(argv.split('=')[1])
- elif argv[:4] == 'tags':
- tags = int(argv.split('=')[1])
- elif argv[:4] == 'pval':
- pval = float(argv.split('=')[1])
- elif argv[:6] == 'enrich':
- enrich = float(argv.split('=')[1])
- elif argv[:3] == 'FDR':
- FDR = float(argv.split('=')[1])
- else:
- sys.sterr.write('Did not recognize parameter %s\n'%(argv))
- return length, tags, pval, enrich, FDR
-
-
- ################################################################################
- if __name__ == '__main__':
- sys.stderr.write('Working directory: ' + os.getcwd() + '\n')
- peaks_xls = sys.argv[1]
- #peaks_xls = '/mnt/lustre/home/jdblischak/tag/sequences/JUNB/18358_C/MACS_sub_0.01_10,30/MACS_sub_0.01_10,30_peaks.xls'
- sys.stderr.write('peaks.xls file: ' + peaks_xls + '\n')
- genome = sys.argv[2]
- #genome = '/mnt/lustre/data/share/HumanGenome/hg19/allhg19_norandom.fasta.gz'
- #genome = '/mnt/lustre/data/share/HCR_Chipseq/Genomes/panTro3/panTro3_nonrandom.fa'
- sys.stderr.write('Genome file: ' + genome + '\n')
- peaks = pd.read_table(peaks_xls, skiprows = 23)
- ## peaks.shape
- ## peaks.index
- ## peaks.columns
- ## peaks.dtypes
- ## peaks.ix[0:3, 0:3]
- sys.stderr.write('Filtering...\n')
- length, tags, pval, enrich, FDR = set_argvs(sys.argv[3:])
- sys.stderr.write('length\t%d\ntags\t%d\n-10*log10(p-value)\t%.2f\nfold_enrichment\t%.2f\nFDR(%%)\t%.2f\n'%(
- length, tags, pval, enrich, FDR))
- filtered = peaks[(peaks['length'] < length) &
- (peaks['tags'] > tags) &
- (peaks['-10*log10(pvalue)'] > pval) &
- (peaks['fold_enrichment'] > enrich) &
- (peaks['FDR(%)'] < FDR)]
- assert filtered.shape[0] > 0, 'No peaks meet criteria.'
- summit_locs = filtered['start'] + filtered['summit']
- summits = pd.concat([filtered['chr'], summit_locs - 50, summit_locs + 50],
- keys = ['chr', 'start', 'end'], axis = 1)
- sys.stderr.write('Converting from pandas DataFrame to BedTool...\n')
- summits_bed = bt.BedTool(summits.to_string(header = False, index = False),
- from_string = True)
- summits_bed.count()
- if genome[-2:] == 'gz':
- genome_fasta = gzip.open(genome, 'rb')
- else:
- genome_fasta = open(genome, 'r')
- sys.stderr.write('Extracting fasta sequences...\n')
- # bt.sequence is a wrapper for fastaFromBed.
- summits_bed = summits_bed.sequence(fi = genome_fasta)
- sys.stdout.write(open(summits_bed.seqfn).read())
- sys.stderr.write('Finished successfully.\n')
- genome_fasta.close()
- sys.exit(0)
- ################################################################################