# gqSeqUtils/R/utils.r
# TODO:
# - input can be filename or what not or comma separated
# - Vectorize for multiple FASTAs input...
#   Other plots? report?
# - units in Mb, kb
# - GC content? dinucleotide content?
# - Coverage estimate relative to expected genome size
# - filter small contigs??
# - etc.
# - NG50 *****
# - Parallelize
# - Biostrings extension?
# - line at N50
# - ggvis
# - would be nice if it would support XStringSet + mcols for coloring attributes,
#   any input type
# - issue warning if duplicate sequences
# - exclude MASKed Ns from statistics???
# - kmers???
# - Are you sure this will work for RNA, DNA, AA? If not, would be nice one day
#' Compute NX length statistics (N10..N90, or NG10..NG90 given a genome size).
#'
#' @param s An XStringSet-like object (anything supporting width()).
#' @param genome.size Optional expected genome/assembly size. When supplied, the
#'   cumulative fractions are computed against it instead of the total sequence
#'   length and the statistics are reported with an "NG" prefix.
#' @return Named numeric vector of N10..N90 (or NG10..NG90) lengths. Entries are
#'   NA when the cumulative length never reaches the corresponding fraction of
#'   genome.size (possible only in the NG case).
NX <- function(s, genome.size = NULL)
{
  # Denominator: total sequence length for N stats, expected size for NG stats.
  # Use if/else rather than ifelse(): ifelse() is vectorized, evaluates oddly on
  # scalar conditions and is not meant for control flow.
  g = if (is.null(genome.size)) sum(width(s)) else genome.size

  s.width.ordered = sort(width(s), decreasing = TRUE)
  cum.fraction = cumsum(s.width.ordered) / g

  percents = seq(10, 90, by = 10)
  # vapply over sapply for a type-stable result
  s.N = vapply(percents, function(p) {
    # Length of the sequence at which the cumulative length first reaches p%
    as.numeric(s.width.ordered[which(cum.fraction >= p / 100)[1]])
  }, numeric(1))

  names(s.N) = paste0(if (is.null(genome.size)) 'N' else "NG", percents)
  return(s.N)
}
# TODO: not sure NG will always work!
#' Compute summary statistics (length distribution, N50/NG50, base composition)
#' from DNA FASTA files.
#'
#' @param sequences Character vector of FASTA file names (each element may itself
#'   contain several names joined with sequences.sep), or a list of DNAStringSet objects.
#' @param mask.Ns Boolean; intended to mask Ns before computing length statistics.
#'   NOTE(review): masking is not implemented yet -- the flag is currently ignored
#'   (it was also ignored in the original implementation, see its TODO).
#' @param target.assembly.length If set, NG statistics are computed relative to this
#'   expected assembly/genome size, in addition to the plain N statistics.
#' @param mc.cores Number of cores passed to parallel::mclapply.
#' @param sequences.sep If sequences is character, each element is split on this separator.
#' @return A named list (one element per input FASTA / DNAStringSet) of named
#'   numeric statistic vectors.
#' @export
DnaFastaSummary = function(
  sequences,
  mask.Ns = TRUE,
  target.assembly.length = NULL,
  mc.cores = 1,
  sequences.sep = ','
){
  require(Biostrings)
  require(parallel)  # mclapply (was implicit in the original, which then failed)

  ## Check input sequences: all elements must share one class
  if (length(unique(sapply(sequences, class))) != 1) {
    stop("sequences must be objects of the same class (character or DNAStringSet)")
  }
  if (is.character(sequences)) {
    # Allow sequences.sep-joined file lists and drop duplicate file names
    sequences = unique(unlist(strsplit(sequences, split = sequences.sep, fixed = TRUE)))
    if (any(!file.exists(sequences))) {stop("One of the files input does not exist")}
  } else {
    if (any(!sapply(sequences, is, "DNAStringSet"))) {
      stop("sequences must be either character vector or a list of DNAStringSet objects")
    }
  }

  ## Iterate over inputs and compute per-FASTA statistics.
  ## NOTE(review): the original post-loop report code referenced undefined
  ## variables (s, s.af, type, write.stats, output.prefix) and could never run;
  ## those computations were moved inside the loop and the report generation was
  ## dropped pending a proper reimplementation.
  dat = mclapply(sequences, function(s) {
    message("Processing one sequence set...")
    if (is.character(s)) {
      s = readDNAStringSet(s)
    }

    # Length distribution statistics
    l = c(
      "Maximum Length" = max(width(s)),
      "Minimum Length" = min(width(s)),
      "Mean Length" = mean(width(s)),
      "Median Length" = median(width(s)),
      "1st. Quartile Length" = quantile(width(s), 0.25, names = FALSE),
      "3rd. Quartile Length" = quantile(width(s), 0.75, names = FALSE)
    )
    l = c(l, NX(s))
    if (!is.null(target.assembly.length)) {
      l = c(l, NX(s, target.assembly.length))
    }

    # Alphabet (base) composition as percentage of total length
    s.af = alphabetFrequency(s)
    s.af.total.perc = apply(s.af, MARGIN = 2, sum) / sum(width(s)) * 100
    names(s.af.total.perc) = paste0("Total ", names(s.af.total.perc), " %")

    # Per-sequence N-content distribution (percent Ns per sequence, summarized)
    s.n.summary = unlist(as.list(summary(s.af[, "N"] / width(s) * 100, digits = Inf)))
    names(s.n.summary) = paste0(names(s.n.summary), " Sequence N %")

    c(
      "Nb. Sequences" = length(s),
      "Total Sequences Length (bp)" = sum(width(s)),
      l,
      "Nb. Duplicate Sequences" = sum(duplicated(s)),
      s.n.summary,
      s.af.total.perc
    )
  }, mc.cores = mc.cores)

  # Name results by input file when inputs were file names
  if (is.character(sequences)) {
    names(dat) = sequences
  }
  return(dat)
}
#' This function reads a fasta file and writes back shuffled chunks of equal sizes.
#'
#' @param fasta Path to the input FASTA file.
#' @param output.dir Directory where chunk FASTA files are written (created if missing).
#' @param chunk.size Number of sequences per chunk.
#' @param shuffle Randomize sequence-to-chunk assignment before splitting.
#' @param mc.cores Number of cores used to write chunks in parallel.
#' @return Character vector of the chunk file names written.
#' @export
splitDNAFasta = function(fasta, output.dir = 'chunks', chunk.size = 1000, shuffle = TRUE, mc.cores = 8) {
  require(Biostrings)
  # 'multicore' is defunct; mclapply/mcmapply now live in the base 'parallel' package
  require(parallel)
  dir.create(output.dir, showWarnings = FALSE)
  s = readDNAStringSet(fasta)
  # Chunk assignment factor: 1,1,...,2,2,... (chunk.size sequences per level)
  f = ceiling(seq_len(length(s)) / chunk.size)
  if (shuffle) {
    f = sample(f, size = length(f))
  }
  s = split(s, f)
  # Zero-pad chunk names so files sort lexicographically
  names(s) = format(names(s), width = ceiling(log10(1 + max(f))), justify = 'right')
  names(s) = gsub(' ', '0', names(s))
  # mcmapply so the mc.cores argument is actually honored (it was unused before)
  x = mcmapply(function(o, n) {
    names(o) = gsub('^[0-9]*?\\.', '', names(o))
    fn = file.path(output.dir, paste0(n, '.fasta'))
    writeXStringSet(o, filepath = fn)
    fn
  }, s, names(s), mc.cores = mc.cores)
  return(unlist(x)) # returns chunk filenames
}
#' The role of this function is to extract the gene level annotation data.frame from a gtf file. It returns a data.frame with
#' columns "featureID" (parsed from gene_id attribute), "SYMBOL" (attempts to find gene_name attribute, if none, sets to gene_id)
#' and "gene.length" (via gqSeqUtils::calculateGeneLengthsFromGtf()).
#'
#' @param gtf Path to a gtf file. Alternatively, a GRanges object.
#' @param fn Optional file name to write the table (tab-separated)
#' @param ... further arguments passed down to calculateGeneLengthsFromGtf(). See default values
#'
#' @import IRanges
#' @import GenomicRanges
#' @import rtracklayer
#' @export
extractGeneAnnotationFromGtf = function(gtf, fn = NULL,...) {
  message("parsing gtf... this might take a while")

  # Import gtf to GRanges object; format = "gtf" parses all attributes into mcols,
  # so no manual str_extract() of the attribute column is needed anymore.
  message("importing gtf to memory...")
  gr = rtracklayer::import(gtf, format = 'gtf')
  message("done importing.")

  # gene_name and gene_biotype are two OPTIONAL but interesting attributes.
  # Therefore, if absent, set them to NA so the mcols selection below never fails.
  if (! "gene_biotype" %in% colnames(mcols(gr))) {mcols(gr)[['gene_biotype']] = NA}
  if (! "gene_name" %in% colnames(mcols(gr))) {mcols(gr)[['gene_name']] = NA}

  # Collapse transcript/exon records to one row per distinct
  # (gene_id, gene_name, gene_biotype) combination
  ann = unique(mcols(gr)[, c("gene_id", "gene_name", "gene_biotype")])

  # make sure gene_name and biotype are not screwing up unicity: if one gene_id
  # still maps to several names/biotypes, drop both optional columns entirely
  if (anyDuplicated(ann$gene_id)) {
    ann[['gene_name']] = NA; ann[['gene_biotype']] = NA
    ann = unique(ann)
    warning("gene_id not in 1:n relationship with gene_name and gene_biotype. The latter are dropped")
  }
  rownames(ann) = ann$gene_id

  # Replace empty symbols with gene_id so SYMBOL is always populated
  ann$gene_name[is.na(ann$gene_name)] = rownames(ann)[is.na(ann$gene_name)]

  # Rename columns to the documented output names
  colnames(ann)[1:2] = c("featureID", "SYMBOL")

  # Gene length (reduced exonic/CDS length), aligned on gene_id via names
  ann[["gene.length"]] = calculateGeneLengthsFromGtf(gr,...)[rownames(ann)]

  ann = as.data.frame(ann)
  # In case one wants to write to file directly.
  # NOTE(review): write.delim is not base R -- presumably provided by gqUtils;
  # confirm it accepts these write.table-style arguments.
  if (!is.null(fn)) {
    write.delim(ann, file = fn, row.names = FALSE, col.names = TRUE, quote = FALSE, sep = '\t')
  }
  return(ann)
}
#' A small function that reads a gtf file (with rtracklayer), splits by gene_id, and then applies a reduce (GenomicRanges) operation to
#' the Ranges to get an estimated genomic length
#' of gene models. Return value is a named vector of gene lengths or a data.frame.
#'
#' @param gtf Path to a gtf file. Alternatively, a GRanges object.
#' @param feature.types Kinds of features in the gtf to consider, defaults to CDS and exon.
#' @param as.data.frame return a data.frame instead of named vector
#' @param output.file If as.data.frame is TRUE, a file name where to write it as tab-separated values
#'
#' @import IRanges
#' @import GenomicRanges
#' @import rtracklayer
#' @export
calculateGeneLengthsFromGtf = function(gtf, feature.types = c("exon", "CDS"), as.data.frame = FALSE, output.file = NULL) {
  message("Calculating gene lengths...")
  # Import gtf to GRanges object (unless one was passed in directly)
  message("importing...")
  if (is.character(gtf)) {
    message("importing")
    # NOTE: the 'asRangedData' argument was removed from rtracklayer;
    # import.gff() now always returns a GRanges object, so passing it errors.
    gl = rtracklayer::import.gff(gtf, format = 'gtf')
    message("done importing.")
  } else {
    gl = gtf
  }
  # Subset only to the requested feature tracks (exon/CDS by default)
  message(sprintf("Subsetting to %s", paste(feature.types, collapse = ' and ')))
  gl = gl[gl$type %in% feature.types, ]
  # gene_id must have been parsed into the metadata columns
  message("checking gene_id")
  if (! "gene_id" %in% colnames(mcols(gl))) {stop("gene_id not a meta-data column of GRanges object")}

  # Split to a GRangesList object (split on a GRanges signature returns GRangesList)
  message("split")
  gl = GenomicRanges::split(gl, f = gl$gene_id)

  # Reduce to unique disjoint intervals per gene (strand aware) so overlapping
  # exons are not double-counted
  message("reduce")
  gl = GenomicRanges::reduce(gl)

  # Sum interval lengths per gene
  message("sum")
  gl = sum(width(gl))

  # Coerce to double while keeping names: summed lengths can exceed
  # .Machine$integer.max for large genomes
  n = names(gl)
  gl = as.numeric(gl)
  names(gl) = n
  message("done calculating gene lengths.")

  if (as.data.frame) {
    gl = data.frame("gene_id" = names(gl), "length" = gl, stringsAsFactors = FALSE)
    if (!is.null(output.file)) {
      # No header line, matching the original behavior (col.names = FALSE)
      write.table(gl, file = output.file, sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
    }
  }

  return(gl)
}
#' This function writes 'masked csv' files.
#'
#' The file starts with two header lines: a '.MASK' line flagging each column as
#' include/exclude and a '.TYPE' line flagging each column as num/char; a '.MASK'
#' data column flags each row. See readMaskedTable for the reading counterpart.
#'
#' @param df A data frame to write out
#' @param fn Filename
#' @param rows,cols Logical, character or integer vector specifying which rows or
#'   columns should be labeled "include" in the masked-format table, see readMaskedTable
#' @param num.cols Logical, character or integer vector specifying which columns should
#'   be labeled "num" in the masked-format table; all others are labeled "char".
#'   See readMaskedTable
#' @return The file name, invisibly.
#' @export
writeMaskedTable = function(df, fn
  , rows = rownames(df)
  , cols = colnames(df)
  , num.cols = NULL) {

  # Validate the two column selectors (visible columns and numeric columns)
  for (co in list(cols, num.cols)) {
    if (is.character(co) && any(!co %in% colnames(df))) {
      stop('Undefined column selected as visible')
    }
    if (is.logical(co) && length(co) != ncol(df)) {
      stop('Incorrect length for logical vector spec. columns')
    }
    if (is.numeric(co) && any(!abs(co) %in% seq_len(ncol(df)))) {
      stop('Incorrect indices spec. columns')
    }
  }
  # Validate the row selector
  if (is.character(rows) && any(!rows %in% rownames(df))) {
    stop('Undefined row selected')
  }
  if (is.logical(rows) && length(rows) != nrow(df)) {
    stop('Incorrect length for logical vector spec. rows')
  }
  if (is.numeric(rows) && any(!abs(rows) %in% seq_len(nrow(df)))) {
    stop('Incorrect indices spec. rows')
  }

  # Column mask: everything excluded unless selected by 'cols'
  col.mask = rep('exclude', times = ncol(df))
  names(col.mask) = colnames(df)
  col.mask[cols] = 'include'

  # Row mask: everything excluded unless selected by 'rows'
  row.mask = rep('exclude', times = nrow(df))
  names(row.mask) = rownames(df)
  row.mask[rows] = 'include'

  # Type mask: 'num' columns are re-coerced to numeric on read
  num.mask = rep('char', times = ncol(df))
  names(num.mask) = colnames(df)
  num.mask[num.cols] = 'num'

  # First data column carries the row mask; two header lines carry the
  # column and type masks
  dat = data.frame('.MASK' = row.mask, df, stringsAsFactors = FALSE)
  headers = rbind(c('.MASK', col.mask), c('.TYPE', num.mask))

  write.table(headers, file = fn, sep = ',', row.names = FALSE, col.names = FALSE, quote = TRUE, qmethod = 'double', dec = '.', append = FALSE)
  # write.table warns about appending column names to an existing file; expected here
  suppressWarnings(
    write.table(dat, file = fn, sep = ',', row.names = FALSE, col.names = TRUE, quote = TRUE, qmethod = 'double', dec = '.', append = TRUE)
  )
  invisible(fn)
}
#' This function reads 'masked csv' files.
#'
#' The first two lines of the file carry a column include/exclude mask and a
#' char/num type indicator; the first data column carries a per-row mask.
#'
#' @param fn Filename
#' @param apply.mask.rows,apply.mask.cols Whether or not to apply the mask on rows or columns
#' @export
readMaskedTable = function(fn, apply.mask.rows = TRUE, apply.mask.cols = TRUE) {
  # Read only the two mask header lines, everything as character
  mask.lines = read.table(fn, sep = ',', quote = '\"', comment.char = ''
    , check.names = FALSE, nrows = 2
    , stringsAsFactors = FALSE, header = FALSE
    , colClasses = "character")

  # Column mask (first header line, minus the leading '.MASK' label)
  exclude.col = as.character(mask.lines[1, ])[-1]
  if (any(!exclude.col %in% c('include', 'exclude'))) {stop('Col mask values should be \"include or exclude\"')}
  exclude.col = (exclude.col == 'exclude') & apply.mask.cols

  # Numeric-type mask (second header line)
  is.num = as.character(mask.lines[2, ])[-1]
  if (any(!is.num %in% c('char', 'num'))) {stop('Num mask values should be \"char or num\"')}
  is.num = is.num == 'num'

  # Data body starts after the two mask lines; read as character and coerce later
  body = read.table(fn, sep = ',', quote = '\"', comment.char = '', skip = 2
    , check.names = FALSE, stringsAsFactors = FALSE, header = TRUE
    , colClasses = "character")

  # Row mask travels in the first data column; pull it out then drop the column
  exclude.row = as.character(body[, 1])
  body[, 1] = NULL
  if (any(!exclude.row %in% c('include', 'exclude'))) {stop('Row mask values should be \"include or exclude\"')}
  exclude.row = (exclude.row == 'exclude') & apply.mask.rows

  # Coerce the columns flagged as numeric
  for (j in which(is.num)) {
    body[, j] = as.numeric(body[, j])
  }

  # Apply both masks; drop = FALSE keeps a data.frame even for one column
  body = body[!exclude.row, !exclude.col, drop = FALSE]
  rownames(body) = NULL
  body
}
#' This function takes a df of numbers and returns formatted numbers with
#' percentages relative to the first column.
#'
#' @param df A data.frame or matrix of numbers; the first column is the
#'   reference, so its cells all read "x (100%)".
#' @return A data.frame of character columns with the same dimnames as df.
#' @export
formatCountsTablePerc = function(df) {
  reference = df[, 1]
  cols = lapply(seq_len(ncol(df)), function(j) {
    counts = df[, j]
    pct = round(100 * counts / reference, 2)
    cell = paste0(format(counts, big.mark = ',', trim = TRUE), ' (', pct, '%)')
    names(cell) = rownames(df)
    cell
  })
  names(cols) = colnames(df)
  formatted = do.call(cbind, cols)
  return(as.data.frame(formatted, stringsAsFactors = FALSE, check.names = FALSE))
}
#' This function performs a series of gsub() operations.
#'
#' Each patterns[i] is replaced by replacements[i] in turn, so later patterns
#' see the result of earlier substitutions.
#'
#' @param patterns Character vector of regular expressions.
#' @param replacements Character vector of replacements, same length as patterns.
#' @param x Character vector to transform.
#' @return x with all substitutions applied (unchanged when patterns is empty).
#' @export
mgsub = function(patterns, replacements, x) {
  if (length(patterns) != length(replacements)) {stop('patterns must be same size as replacements')}
  # seq_along (not 1:length): the original 1:length(patterns) iterated c(1, 0)
  # and errored on empty input
  for (i in seq_along(patterns)) {
    x = gsub(patterns[i], replacements[i], x)
  }
  x
}
# TODO: move me to gqUtils
#' This function returns TRUE if every element of x is a syntactically valid R name.
#'
#' @param x Character vector of candidate names.
#' @return TRUE when make.names() leaves x unchanged, FALSE otherwise.
#' @export
synthaxValid = function(x) {
  # make.names() is the canonical sanitizer; valid input passes through untouched
  identical(x, make.names(x))
}
#' Guess which compute cluster this code is running on.
#'
#' Matches the hostname, then the DNS domain name, against known patterns;
#' stops with an error when neither matches.
#' @return One of "abacus", "guillimin" or "mammouth".
#' @export
guessCluster = function() {
  # Patterns checked in order; first hit wins (same order as the original chain)
  host.patterns = c(abacus = '^abacus', guillimin = '^lg-', mammouth = '^ip03$')
  domain.patterns = c(abacus = 'ferrier.genome.mcgill.ca',
                      guillimin = 'guillimin.clumeq.ca',
                      mammouth = '^m$')

  host = system('hostname', intern = TRUE)
  hit = names(host.patterns)[vapply(host.patterns, grepl, logical(1), x = host)]
  if (length(hit) > 0) {return(hit[1])}

  # Fall back to the DNS domain only when the hostname was inconclusive
  dnsdomain = system('dnsdomainname', intern = TRUE)
  hit = names(domain.patterns)[vapply(domain.patterns, grepl, logical(1), x = dnsdomain)]
  if (length(hit) > 0) {return(hit[1])}

  stop('Could not guess cluster!!')
}
#' This function is a wrapper around biomaRt for the simple case of no filter.
#' Unfortunately, access to previous ensembl releases is constrained by the
#' existence of a biomart instance.
#'
#' Each of host, mart, dataset and attributes is validated in turn; an invalid
#' or missing value triggers an informative error listing the valid choices.
#'
#' @param host Biomart host, e.g. "www.ensembl.org" or an archive host.
#' @param mart Mart name as listed by listMarts() on that host.
#' @param dataset Dataset name as listed by listDatasets().
#' @param attributes Character vector of attribute names to retrieve.
#' @param out.tsv.file Optional file to write the result as headerless TSV.
#' @param service.path Biomart service path on the host.
#' @return data.frame of results from getBM().
#' @export
getBMsimple <- function(
  host = NA,
  mart = NA,
  dataset = NA,
  attributes = NA,
  out.tsv.file = NULL,
  service.path = "/biomart/martservice"
){
  # A host is mandatory; list a few common ones when it is missing
  if (is.na(host)) {
    example.hosts = c("www.ensembl.org", "grch37.ensembl.org", "metazoa.ensembl.org",
                      "plants.ensembl.org", "fungi.ensembl.org", "protists.ensembl.org",
                      "feb2012.archive.ensembl.org")
    stop("Host must be specified. Examples include:\n",
         paste(example.hosts, collapse = '\n'),
         "\n See http://www.ensembl.org/info/website/archives/index.html")
  }

  # Validate the requested mart against what the host advertises
  available.marts = as.character(listMarts(host = host, path = service.path)[, 1])
  if (is.na(mart) | !mart %in% available.marts) {
    stop("Valid marts are:\n", paste(available.marts, collapse = "\n"))
  }
  biomart = useMart(mart, host = host, path = service.path)

  # Validate the dataset against the selected mart
  available.datasets = sort(listDatasets(biomart)[, 1])
  if (is.na(dataset) | !dataset %in% available.datasets) {
    message("Valid datasets are:")
    writeLines(available.datasets)
    stop("Invalid dataset specified")
  }
  biomart = useMart(mart, host = host, path = service.path, dataset = dataset)

  # Validate the requested attributes; on failure, also print a shortlist of
  # attributes that look interesting (GO terms, gene ids, GO slims)
  available.attributes = unique(listAttributes(biomart))
  available.attributes = available.attributes[order(available.attributes$name), ]
  if (any(is.na(attributes)) | any(!attributes %in% available.attributes$name)) {
    message("Valid attributes are:")
    print(available.attributes)
    message("\n\nInteresting attributes are perhaps:")
    print(available.attributes[
      grepl("^GO|^go|gene_id|goa|goslim", available.attributes$name)
      | grepl("GO|slim|tology|Ensembl Gene", available.attributes$description)
      , ])
    stop("Invalid attribute specified")
  }

  results = getBM(attributes = attributes, mart = biomart)

  # Optionally dump as headerless TSV
  if (!is.null(out.tsv.file)) {
    write.table(results, file = out.tsv.file, quote = FALSE, sep = '\t',
                col.names = FALSE, row.names = FALSE)
  }

  return(results)
}
# GO slims are particularly useful for giving a summary of the results of GO annotation of a genome, microarray, or cDNA collection when broad classification of gene product function is required. See the map2slim.pl section for a perl implementation of this.
# res = getBMsimple("www.ensembl.org","ENSEMBL_MART_ENSEMBL","dmelanogaster_gene_ensembl",c("ensembl_gene_id","goslim_goa_accession","goslim_goa_description")) ; nrow(unique(res[,1:2]))

# Merge cuffdiff isoform FPKM tracking and differential-expression files into one
# q-value-sorted table, written as CSV and returned.
#
# designFolderPath: folder containing "isoforms.fpkm_tracking" and "isoform_exp.diff"
# outputFilePath:   CSV file to write (parent directories are created as needed)
# Returns the merged table ordered by ascending q_value.
formatCuffdiffOutput = function(
  designFolderPath,
  outputFilePath
){
  tracking = read.table(file.path(designFolderPath, "isoforms.fpkm_tracking"),
                        header = TRUE, sep = "\t", comment.char = "", check.names = FALSE)
  diffFile = read.table(file.path(designFolderPath, "isoform_exp.diff"),
                        header = TRUE, sep = "\t", comment.char = "", check.names = FALSE)
  # Shared columns (gene_id, locus) get .x/.y suffixes; .x comes from diffFile
  mergeFiles = merge(diffFile, tracking, by.y = "tracking_id", by.x = "test_id")
  newFormatedTable = data.frame(
    test_id = mergeFiles$test_id,
    gene_id = mergeFiles$gene_id.x,
    tss_id = mergeFiles$tss_id,
    nearest_ref_id = mergeFiles$nearest_ref_id,
    class_code = mergeFiles$class_code,
    gene = mergeFiles$gene,
    locus = mergeFiles$locus.x,
    length = mergeFiles$length,
    log2FC = mergeFiles$'log2(fold_change)',
    test_stat = mergeFiles$test_stat,
    p_value = mergeFiles$p_value,
    q_value = mergeFiles$q_value)
  # Sort once by ascending q-value; reuse for both the file and the return value
  newFormatedTable = newFormatedTable[order(newFormatedTable$q_value, decreasing = FALSE), ]
  if (!file.exists(dirname(outputFilePath))) {
    # recursive = TRUE so nested output paths work too
    dir.create(dirname(outputFilePath), recursive = TRUE)
  }
  write.csv(newFormatedTable, file = outputFilePath, row.names = FALSE)
  return(newFormatedTable)
}