
/gqSeqUtils/R/utils.r

https://bitbucket.org/mugqic/rpackages
# TODO:
# - input can be a filename, an object, or a comma-separated list of files
# - Vectorize for multiple FASTA inputs...
# - Other plots? report?
# - units in Mb, kb
# - GC content? dinucleotide content?
# - Coverage estimate relative to expected genome size
# - filter small contigs??
# - etc.
# - NG50 *****
# - Parallelize
# - Biostrings extension?
# - line at N50
# - ggvis
# - would be nice if it supported XStringSet + mcols for coloring attributes,
#   any input type
# - issue warning if duplicate sequences
# - exclude MASKed Ns from statistics???
# - kmers???
# - Are you sure this will work for RNA, DNA, AA? If not, would be nice one day

# NX: compute N10..N90 length statistics from an XStringSet; if genome.size is
# supplied, fractions are computed relative to it instead (NG10..NG90).
NX <- function(s, genome.size = NULL)
{
    g = ifelse(is.null(genome.size), sum(width(s)), genome.size)
    s.width.ordered = rev(sort(width(s)))
    # cum.fraction = cumsum(s.width.ordered) / sum(s.width.ordered)
    cum.fraction = cumsum(s.width.ordered) / g
    s.N = sapply(as.character(seq(10, 90, by = 10)), function(p) {
        s.width.ordered[which(cum.fraction >= as.numeric(p) / 100)[1]]
    }, simplify = TRUE)
    names(s.N) = paste0(ifelse(is.null(genome.size), 'N', "NG"), names(s.N))
    return(s.N)
}
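# Usage sketch for NX() (illustrative only; the toy contigs below are made up):
# library(Biostrings)
# contigs = DNAStringSet(c(ctg1 = paste(rep("A", 5000), collapse = ""),
#                          ctg2 = paste(rep("C", 3000), collapse = ""),
#                          ctg3 = paste(rep("G", 1000), collapse = "")))
# NX(contigs)        # N10..N90 relative to the total assembly length (N50 = 5000 here)
# NX(contigs, 1e4)   # NG10..NG90 relative to an expected genome size of 10 kb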
# TODO: not sure NG will always work!

#' A simple function that leverages Biostrings to generate a set of statistics from DNA FASTA files.
#'
#' @param sequences Character vector of FASTA file names (or a list of DNAStringSet objects). File names can also be given as a single string separated by sequences.sep.
#' @param mask.Ns Boolean; masking Ns out of the length statistics is planned but not implemented yet (see TODO below).
#' @param target.assembly.length If set, NG10-NG90 statistics relative to this expected assembly length are also reported.
#' @param mc.cores Number of cores passed to parallel::mclapply()
#' @param sequences.sep If sequences is a character vector, each element is split on this separator.
#' @export
DnaFastaSummary = function(
    sequences,
    mask.Ns = TRUE,
    target.assembly.length = NULL,
    # type = c("generic", "genome", "trinity"),
    # write.report = TRUE,
    # report.prefix = 'stats',
    mc.cores = 1,
    sequences.sep = ','
) {
    # type = match.arg(type)
    # Interactive test values:
    # cd /netmount/ip03_home/lefebvr3/dna; myr
    # sequences = list.files(pattern = "\\.fasta$"); mc.cores = 12; sequences.sep = ','; target.assembly.length = 207607558; mask.Ns = TRUE
    require(Biostrings)
    require(parallel) # for mclapply()
    # require(magrittr)
    # require(hwriter)
    # require(Rmarkdown)
    # require(ggvis)
    # require(stringr)
    # require(knitr)

    ## Check input sequences
    if (length(unique(sapply(sequences, class))) != 1) {
        stop("sequences must be objects of the same class (character or DNAStringSet)")
    }
    if (is.character(sequences)) {
        sequences = unique(unlist(strsplit(sequences, split = sequences.sep, fixed = TRUE)))
        if (any(!file.exists(sequences))) { stop("One of the input files does not exist") }
    } else {
        if (any(!sapply(sequences, is, "DNAStringSet"))) {
            stop("sequences must be either a character vector or a list of DNAStringSet objects")
        }
    }

    ## Iterate over sequences and calculate statistics
    dat = mclapply(sequences, function(seq) {
        message("Processing one sequence...")
        # seq = sequences[1]
        if (is.character(seq)) {
            seq = readDNAStringSet(seq)
        }
        # TODO: still need to figure out how to mask properly (mask.Ns is not used yet)
        l = list(
            "Maximum Length"       = max(width(seq)),
            "Minimum Length"       = min(width(seq)),
            "Mean Length"          = mean(width(seq)),
            "Median Length"        = median(width(seq)),
            "1st. Quartile Length" = quantile(width(seq), 0.25, names = FALSE),
            "3rd. Quartile Length" = quantile(width(seq), 0.75, names = FALSE)
        )
        l = c(l, NX(seq))
        if (!is.null(target.assembly.length)) {
            l = c(l, NX(seq, target.assembly.length))
        }
        # Alphabet frequencies, as a percentage of the total length
        s.af = alphabetFrequency(seq)                                      # was alphabetFrequency(s): 's' is not defined in this scope
        s.af.total.perc = apply(s.af, MARGIN = 2, sum) / sum(width(seq)) * 100
        names(s.af.total.perc) = paste0("Total ", names(s.af.total.perc), " %")
        # Per-sequence N-content distribution
        s.n.summary = unlist(as.list(summary(s.af[, "N"] / width(seq) * 100, digits = Inf)))
        names(s.n.summary) = paste0(names(s.n.summary), " Sequence N %")
        # TODO: it was nice to have something report-ready with bp indicated!
        c("Nb. Sequences"               = length(seq),
          "Total Sequences Length (bp)" = sum(width(seq)),
          "Nb. Duplicate Sequences"     = sum(duplicated(seq)),
          unlist(l), s.n.summary, s.af.total.perc)
    }, mc.cores = mc.cores)
    if (is.character(sequences)) { names(dat) = sequences }   # was names(stats) = sequences: 'stats' is not defined
    ## ------------------------------------------------------------------------
    ## Legacy single-FASTA report code. It predates the vectorized rewrite above
    ## and still references variables and arguments that no longer exist at this
    ## point (s, s.summary, s.N, type, write.stats, output.prefix), as well as
    ## ggplot2 / gplots::textplot / stringr. Kept commented out for reference.
    ## ------------------------------------------------------------------------
    # # N-content distribution
    # s.n.summary = unlist(as.list(summary(s.af[, "N"] / width(s) * 100, digits = Inf)))
    # names(s.n.summary) = paste0(names(s.n.summary), " Sequence N %")
    # # GC-content
    # # s.gc = letterFrequency(s, letters = c("GC")) / width(s) * 100
    # # Generic statistics
    # stats = c(
    #     "Nb. Sequences" = length(s),
    #     "Total Sequences Length (bp)" = sum(width(s)),
    #     # "N50" = N50(width(s)),
    #     s.summary,
    #     "Nb. Duplicate Sequences" = sum(duplicated(s)),
    #     s.N,
    #     s.n.summary,
    #     s.af.total.perc
    # )
    # # Axis labels for histogram plot
    # xlabel = "Sequence Length (bp, log10 scale)"
    # ylabel = "Sequence Count"
    # if (type == 'trinity') {
    #     names(stats) = gsub("Sequence", "Transcript", names(stats))
    #     xlabel = gsub("Sequence", "Transcript", xlabel)
    #     ylabel = gsub("Sequence", "Transcript", ylabel)
    #     # Insert "Nb. Components" as second column
    #     stats = c(stats[1], "Nb. Components" = length(unique(str_extract(names(s), "(^comp[0-9]+_c[0-9]+_)|(^c[0-9]+_g[0-9]+_)"))), stats[2:length(stats)])
    # }
    # if (type == 'genome') {
    #     names(stats) = gsub("Sequence", "Contig", names(stats))
    #     xlabel = gsub("Sequence", "Contig", xlabel)
    #     ylabel = gsub("Sequence", "Contig", ylabel)
    # }
    # ## Sliding window frequency content
    # # plot(letterFrequencyInSlidingView(largest.contig, view.width = 1000, c("GC"))[, 1] / 1000, ylab = "GC content")
    # ## dinucl. freqs
    # # head(dinucleotideFrequency(assembly))
    # if (write.stats) {
    #     # Create CSV stats file
    #     write.csv(data.frame(stats), file = paste0(output.prefix, ".csv"))
    #     # Create histogram plot of sequence length distribution with N50
    #     sequence.lengths = ggplot(data.frame(length = width(s)), aes(x = length))
    #     sequence.lengths = sequence.lengths +
    #         geom_histogram(colour = "darkgrey", fill = "white", binwidth = 0.02) +
    #         scale_x_log10(breaks = c(200, 500, 1000, 2000, 5000, 10000, 20000, 50000)) +
    #         geom_vline(
    #             data = data.frame(Legend = paste0("N50 = ", stats["N50 (bp)"], " bp"), vals = stats["N50 (bp)"]),
    #             aes(xintercept = vals, shape = Legend),
    #             colour = "black",
    #             show_guide = TRUE) +
    #         xlab(xlabel) +
    #         ylab(ylabel) +
    #         ggtitle("Transcript Length Distribution") +
    #         theme(plot.title = element_text(face = "bold"), legend.title = element_blank())
    #     # Create PDF file of stats + histogram plot
    #     pdf(file = paste0(output.prefix, ".pdf"), height = 5, width = 7)
    #     textplot(data.frame("Statistic" = format(stats,
    #         digits = 2,
    #         big.mark = ",",
    #         scientific = FALSE,
    #         drop0trailing = TRUE)))
    #     suppressWarnings(print(sequence.lengths))
    #     dev.off()
    #     # Create JPEG image of histogram plot
    #     jpeg(file = paste0(output.prefix, ".jpg"), height = 500, width = 700)
    #     sequence.lengths = sequence.lengths +
    #         theme(plot.title = element_text(size = 16),
    #               axis.title = element_text(size = 16),
    #               axis.text = element_text(size = 14),
    #               legend.text = element_text(size = 16))
    #     print(sequence.lengths)
    #     dev.off()
    # }
    # return(stats)
    return(dat)
}
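# Usage sketch (hypothetical file names; assumes Biostrings and parallel are available).
# With the vectorized interface above, the return value is a list of named statistic
# vectors, one per input FASTA:
# fa.files = c("assembly_v1.fasta", "assembly_v2.fasta")
# res = DnaFastaSummary(fa.files, target.assembly.length = 2e8, mc.cores = 2)
# res[["assembly_v1.fasta"]]   # length, NX/NG, N-content and base-composition statistics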
#' This function reads a FASTA file and writes back shuffled chunks of equal size.
#'
#' @export
splitDNAFasta = function(fasta, output.dir = 'chunks', chunk.size = 1000, shuffle = TRUE, mc.cores = 8) {
    # fasta = 'assembly/Trinity.fasta'; output.dir = 'chunks'; chunk.size = 1000; shuffle = TRUE
    require(Biostrings)
    # require(multicore) # defunct package; mc.cores is currently unused (the mapply below is serial)
    dir.create(output.dir, showWarnings = FALSE)
    s = readDNAStringSet(fasta)
    f = ceiling(c(1:length(s)) / chunk.size) # chunk grouping factor
    if (shuffle) {
        f = sample(f, size = length(f))
    }
    s = split(s, f)
    # Zero-pad chunk names so the output files sort naturally
    names(s) = format(names(s), width = ceiling(log10(1 + max(f))), justify = 'right')
    names(s) = gsub(' ', '0', names(s))
    x = mapply(function(o, n) {
        # print(n)
        names(o) = gsub('^[0-9]*?\\.', '', names(o))
        fn = file.path(output.dir, paste0(n, '.fasta'))
        writeXStringSet(o, filepath = fn)
        fn
    }, s, names(s))
    return(unlist(x)) # returns the chunk file names
}
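# Usage sketch (hypothetical input path); one FASTA file per chunk is written under output.dir:
# chunk.files = splitDNAFasta("assembly/Trinity.fasta", output.dir = "chunks",
#                             chunk.size = 500, shuffle = TRUE)
# length(chunk.files)   # number of chunk files written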
#' The role of this function is to extract the gene-level annotation data.frame from a gtf file. It returns a data.frame with
#' columns "featureID" (parsed from the gene_id attribute), "SYMBOL" (attempts to find the gene_name attribute; if none, set to gene_id)
#' and "gene.length" (via gqSeqUtils::calculateGeneLengthsFromGtf()).
#'
#'
#' @param gtf Path to a gtf file. Alternatively, a GRanges object.
#' @param fn Optional file name to write the table
#' @param ... further arguments passed down to calculateGeneLengthsFromGtf(). See default values
#'
#' @import IRanges
#' @import GenomicRanges
#' @import rtracklayer
#' @export
extractGeneAnnotationFromGtf = function(gtf, fn = NULL, ...) {
    # require(stringr)
    # require(rtracklayer)
    # require(GenomicRanges)
    message("parsing gtf... this might take a while")
    # Import gtf to GRanges object
    message("importing gtf to memory...")
    gr = rtracklayer::import(gtf, format = 'gtf')
    message("done importing.")
    # Parse out gene_id and other, optional attributes (deprecated: format = "gtf" above already parses everything correctly)
    # gr$gene_id = gsub('((gene_id |;)|\")', '', str_extract(as.character(gr$group), 'gene_id.*?;')) # mandatory
    # gr$gene_name = gsub('((gene_name |;)|\")', '', str_extract(as.character(gr$group), 'gene_name.*?;')) # optional
    # gr$gene_biotype = gsub('((gene_biotype |;)|\")', '', str_extract(as.character(gr$group), 'gene_biotype.*?;')) # optional
    # gene_name and gene_biotype are two OPTIONAL but interesting attributes. Therefore, if absent, set them to NA
    if (! "gene_biotype" %in% colnames(mcols(gr))) { mcols(gr)[['gene_biotype']] = NA }
    if (! "gene_name" %in% colnames(mcols(gr))) { mcols(gr)[['gene_name']] = NA }
    # Collapse transcripts to obtain gene annotation
    ann = unique(mcols(gr)[, c("gene_id", "gene_name", "gene_biotype")])
    # Make sure gene_name and gene_biotype do not break gene_id uniqueness
    if (anyDuplicated(ann$gene_id)) {
        ann[['gene_name']] = NA; ann[['gene_biotype']] = NA
        ann = unique(ann)
        warning("gene_id not in 1:n relationship with gene_name and gene_biotype. The latter are dropped")
    }
    rownames(ann) = ann$gene_id
    # Replace empty symbols with gene_id
    ann$gene_name[is.na(ann$gene_name)] = rownames(ann)[is.na(ann$gene_name)]
    # Rename columns
    colnames(ann)[1:2] = c("featureID", "SYMBOL")
    # Gene length
    ann[["gene.length"]] = calculateGeneLengthsFromGtf(gr, ...)[rownames(ann)]
    ann = as.data.frame(ann)
    # In case one wants to write to file directly
    if (!is.null(fn)) {
        write.delim(ann, file = fn, row.names = FALSE, col.names = TRUE, quote = FALSE, sep = '\t')
    }
    return(ann)
}
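# Usage sketch (the gtf path is hypothetical):
# ann = extractGeneAnnotationFromGtf("Annotation/Genes/genes.gtf")
# head(ann[, c("featureID", "SYMBOL", "gene.length")])
# extractGeneAnnotationFromGtf("genes.gtf", fn = "gene_annotation.tsv")   # also writes the table to disk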
#' A small function that reads a gtf file (with rtracklayer), splits by gene_id, and then applies a reduce (GenomicRanges) operation to
#' the ranges to get an estimated genomic length
#' of gene models. The return value is a named vector of gene lengths or a data.frame.
#'
#' @param gtf Path to a gtf file. Alternatively, a GRanges object.
#' @param feature.types Kinds of features in the gtf to consider, defaults to CDS and exon.
#' @param as.data.frame return a data.frame instead of a named vector
#' @param output.file If as.data.frame is TRUE, a file name where to write it as tab-separated values
#'
#' @import IRanges
#' @import GenomicRanges
#' @import rtracklayer
#' @export
calculateGeneLengthsFromGtf = function(gtf, feature.types = c("exon", "CDS"), as.data.frame = FALSE, output.file = NULL) {
    # require(rtracklayer)
    # require(GenomicRanges)
    # require(stringr)
    message("Calculating gene lengths...")
    # Import gtf to GRanges object
    if (is.character(gtf)) {
        message("importing...")
        # NOTE: asRangedData was removed from recent rtracklayer versions; drop the argument if it errors
        gl = rtracklayer::import.gff(gtf, format = 'gtf', asRangedData = FALSE)
        message("done importing.")
    } else {
        gl = gtf
    }
    # Subset to the requested feature types only
    message(sprintf("Subsetting to %s", paste(feature.types, collapse = ' and ')))
    gl = gl[gl$type %in% feature.types, ]
    # Parse out the gene_id
    message("checking gene_id")
    # gl$geneid = gsub('((gene_id |;)|\")', '', str_extract(as.character(gl$group), 'gene_id.*?;'))
    if (! "gene_id" %in% colnames(mcols(gl))) { stop("gene_id is not a meta-data column of the GRanges object") }
    # Split to a GRangesList object
    message("split")
    gl = GenomicRanges::split(gl, f = gl$gene_id) # split with the GRanges signature returns a GRangesList
    # Reduce to unique disjoint intervals (strand aware)
    message("reduce")
    gl = GenomicRanges::reduce(gl) #, ignore.strand = FALSE) # reduce eliminates overlaps
    # Sum interval lengths
    message("sum")
    gl = sum(width(gl))
    n = names(gl)
    gl = as.numeric(gl) # convert away from integer to avoid overflow on very long genes
    names(gl) = n
    message("done calculating gene lengths.")
    if (as.data.frame)
    {
        gl = data.frame("gene_id" = names(gl), "length" = gl, stringsAsFactors = FALSE)
        if (!is.null(output.file))
        {
            write.table(gl, file = output.file, sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
        }
    }
    return(gl)
    # Benchmark against GenomicFeatures:
    # library(GenomicFeatures)
    # library(gqSeqUtils)
    # gtf = "/sb/programs/analyste/genomes/Rattus_norvegicus/UCSC/rn5/Annotation/Genes/genes.gtf"
    #
    # # With GenomicFeatures
    # t0 = Sys.time()
    # txdb = makeTranscriptDbFromGFF("genes.gtf", format = 'gtf')
    # x = exonsBy(txdb, by = "gene")
    # x = sum(width(reduce(x)))
    # print(Sys.time() - t0)
    #
    # # With homebrew
    # t0 = Sys.time()
    # y = calculateGeneLengthsFromGtf(gtf)
    # print(Sys.time() - t0)
    #
    # # Compare
    # identical(names(x), names(y))
    # summary(abs(x - y))
    # identical(x, y)
    # pdf("~/temp.pdf")
    # plot(x, y)
    # dev.off()
}
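# Usage sketch (hypothetical gtf path); returns a named numeric vector of summed, reduced exon/CDS widths per gene_id:
# len = calculateGeneLengthsFromGtf("genes.gtf")
# head(len)
# calculateGeneLengthsFromGtf("genes.gtf", as.data.frame = TRUE, output.file = "gene_lengths.tsv")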
#' This function writes 'masked csv' files.
#'
#' @param df A data frame to write out
#' @param fn Filename
#' @param rows, cols logical, character or integer vectors specifying which rows or columns should be labeled "include" in the masked-format table, see readMaskedTable
#' @param num.cols logical, character or integer vector specifying which columns should be labeled "num" in the masked-format table. All others are labeled "char". See readMaskedTable
#' @export
writeMaskedTable = function(df, fn
                            , rows = rownames(df)
                            , cols = colnames(df)
                            , num.cols = NULL) {
    # df = cars; fn = 'temp.txt'; rows = rownames(df); cols = colnames(df); num.cols = NULL
    # Checks on rows, cols and num.cols
    lapply(list(cols, num.cols), function(co) {
        if (is.character(co)) {
            if (any(! co %in% colnames(df))) { stop('Undefined column selected as visible') }
        }
        if (is.logical(co)) {
            if (length(co) != ncol(df)) { stop('Incorrect length for logical vector specifying columns') }
        }
        if (is.numeric(co)) {
            if (any(! abs(co) %in% 1:ncol(df))) { stop('Incorrect indices specifying columns') }
        }
    })
    if (is.character(rows)) {
        if (any(! rows %in% rownames(df))) { stop('Undefined row selected') }
    }
    if (is.logical(rows)) {
        if (length(rows) != nrow(df)) { stop('Incorrect length for logical vector specifying rows') }
    }
    if (is.numeric(rows)) {
        if (any(! abs(rows) %in% 1:nrow(df))) { stop('Incorrect indices specifying rows') }
    }
    # Define column mask
    col.mask = rep('exclude', times = ncol(df))
    names(col.mask) = colnames(df)
    col.mask[cols] = 'include'
    # Define row mask
    row.mask = rep('exclude', times = nrow(df))
    names(row.mask) = rownames(df)
    row.mask[rows] = 'include'
    # Define which columns are numerical
    num.mask = rep('char', times = ncol(df))
    names(num.mask) = colnames(df)
    num.mask[num.cols] = 'num'
    dat = data.frame('.MASK' = row.mask, df, stringsAsFactors = FALSE)
    headers = rbind(c('.MASK', col.mask), c('.TYPE', num.mask))
    # write.table(headers, file = fn, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE)
    # suppressWarnings(write.table(dat, file = fn, sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE, append = TRUE))
    write.table(headers, file = fn, sep = ',', row.names = FALSE, col.names = FALSE, quote = TRUE, qmethod = 'double', dec = '.', append = FALSE)
    suppressWarnings(
        write.table(dat, file = fn, sep = ',', row.names = FALSE, col.names = TRUE, quote = TRUE, qmethod = 'double', dec = '.', append = TRUE)
    )
}
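# Usage sketch (arbitrary file name): write mtcars with only a few rows/columns flagged
# as 'include' and mpg/hp typed as numeric:
# writeMaskedTable(mtcars, fn = "mtcars_masked.csv",
#                  rows = rownames(mtcars)[1:10],
#                  cols = c("mpg", "cyl", "hp"),
#                  num.cols = c("mpg", "hp"))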
#' This function reads 'masked csv' files.
#'
#' @param fn Filename
#' @param apply.mask.rows, apply.mask.cols Whether or not to apply the mask on rows or columns
#' @export
readMaskedTable = function(fn, apply.mask.rows = TRUE, apply.mask.cols = TRUE) {
    # fn = 'temp.txt'; apply.mask.rows = TRUE; apply.mask.cols = TRUE
    # Read the column mask and numerical indicator lines
    # headers = read.table(fn, sep = '\t', quote = '', comment.char = ''
    #                      , check.names = FALSE, nrows = 2
    #                      , stringsAsFactors = FALSE, header = FALSE
    #                      , colClasses = "character")
    headers = read.table(fn, sep = ',', quote = '\"', comment.char = ''
                         , check.names = FALSE, nrows = 2
                         , stringsAsFactors = FALSE, header = FALSE
                         , colClasses = "character")
    # Column mask
    col.mask = as.character(headers[1, ])[-1]
    if (any(! col.mask %in% c('include', 'exclude'))) { stop('Col mask values should be "include" or "exclude"') }
    col.mask = col.mask == 'exclude'
    col.mask = col.mask & apply.mask.cols
    # Numerical mask
    num.mask = as.character(headers[2, ])[-1]
    if (any(! num.mask %in% c('char', 'num'))) { stop('Num mask values should be "char" or "num"') }
    num.mask = num.mask == 'num'
    # Read the actual table
    # dat = read.table(fn, sep = '\t', quote = '', comment.char = '', skip = 2
    #                  , check.names = FALSE, stringsAsFactors = FALSE, header = TRUE
    #                  , colClasses = "character")
    dat = read.table(fn, sep = ',', quote = '\"', comment.char = '', skip = 2
                     , check.names = FALSE, stringsAsFactors = FALSE, header = TRUE
                     , colClasses = "character")
    # Extract row mask
    row.mask = as.character(dat[, 1])
    dat[, 1] = NULL # remove the row mask column
    if (any(! row.mask %in% c('include', 'exclude'))) { stop('Row mask values should be "include" or "exclude"') }
    row.mask = row.mask == 'exclude'
    row.mask = row.mask & apply.mask.rows
    # Apply num mask
    for (i in which(num.mask)) {
        dat[, i] = as.numeric(dat[, i])
    }
    # Apply row and col masks
    dat = dat[!row.mask, !col.mask, drop = FALSE]
    rownames(dat) = NULL
    return(dat)
}
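# Round-trip usage sketch with the file written in the writeMaskedTable() example above.
# By default both masks are applied, so only 'include' rows/columns come back:
# kept = readMaskedTable("mtcars_masked.csv")
# full = readMaskedTable("mtcars_masked.csv", apply.mask.rows = FALSE, apply.mask.cols = FALSE)
# dim(kept); dim(full)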
#' This function takes a data.frame of numbers and returns formatted numbers with percentages relative to the first column
#'
#' @param df A data.frame or matrix
#'
#' @export
formatCountsTablePerc = function(df) {
    df = apply(df, MARGIN = 2, function(co) {
        co = paste0(format(co, big.mark = ',', trim = TRUE), ' (', round(100 * co / df[, 1], 2), '%)')
        names(co) = rownames(df); co
    })
    return(as.data.frame(df, stringsAsFactors = FALSE, check.names = FALSE))
}
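# Usage sketch (made-up read counts at successive pipeline steps); each cell becomes
# e.g. "1,100,000 (91.67%)", with percentages relative to the first column:
# counts = data.frame(raw = c(1200000, 950000), trimmed = c(1100000, 900000),
#                     aligned = c(1000000, 700000), row.names = c("sampleA", "sampleB"))
# formatCountsTablePerc(counts)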
#' This function performs a series of gsub() operations
#' @export
mgsub = function(patterns, replacements, x) {
    if (length(patterns) != length(replacements)) { stop('patterns must be the same length as replacements') }
    for (i in seq_along(patterns)) {
        x = gsub(patterns[i], replacements[i], x)
    }
    x
}
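# Usage sketch: substitutions are applied sequentially, each gsub() seeing the previous result:
# mgsub(c("\\.fastq\\.gz$", "_R([12])$"), c("", ".read\\1"), "sampleA_R1.fastq.gz")
# # [1] "sampleA.read1"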
# TODO: move me to gqUtils
#' This function returns TRUE if x is syntactically valid (i.e. unchanged by make.names())
#'
#' @export
synthaxValid = function(x) {
    return(identical(x, make.names(x)))
}
#' Guess which compute cluster the code is running on, based on the hostname or DNS domain name
#' @export
guessCluster = function() {
    host = system('hostname', intern = TRUE)
    if (grepl('^abacus', host)) { return('abacus') }
    if (grepl('^lg-', host)) { return('guillimin') }
    if (grepl('^ip03$', host)) { return('mammouth') }
    dnsdomain = system('dnsdomainname', intern = TRUE)
    if (grepl('ferrier.genome.mcgill.ca', dnsdomain)) { return('abacus') }
    if (grepl('guillimin.clumeq.ca', dnsdomain)) { return('guillimin') }
    if (grepl('^m$', dnsdomain)) { return('mammouth') }
    stop('Could not guess cluster!!')
}
#' This function is a wrapper around biomaRt for the simple case of no filter. Unfortunately, access to previous Ensembl releases is constrained by the existence of a biomart instance
#'
#'
#' @export
getBMsimple <- function(
    host = NA,
    mart = NA,
    dataset = NA,
    attributes = NA,
    out.tsv.file = NULL,
    service.path = "/biomart/martservice"
) {
    require(biomaRt)
    # # E.g. jul2012.archive.ensembl.org; see possibilities at http://www.ensembl.org/info/website/archives/index.html or other archives
    # library(biomaRt); host = NA; mart = NA; dataset = NA; attributes = NA; out.tsv.file = NULL; service.path = "/biomart/martservice"
    if (is.na(host)) {
        example.hosts = c("www.ensembl.org", "grch37.ensembl.org", "metazoa.ensembl.org", "plants.ensembl.org", "fungi.ensembl.org", "protists.ensembl.org", "feb2012.archive.ensembl.org")
        stop("Host must be specified. Examples include:\n", paste(example.hosts, collapse = '\n'), "\n See http://www.ensembl.org/info/website/archives/index.html")
    }
    # Check mart
    available.marts = as.character(listMarts(host = host, path = service.path)[, 1])
    if (is.na(mart) | ! mart %in% available.marts) {
        stop("Valid marts are:\n", paste(available.marts, collapse = "\n"))
    }
    # mart = "plants_mart_24"
    m = useMart(mart, host = host, path = service.path)
    # Check dataset
    available.datasets = sort(listDatasets(m)[, 1])
    if (is.na(dataset) | ! dataset %in% available.datasets) {
        message("Valid datasets are:")
        writeLines(available.datasets)
        stop("Invalid dataset specified")
    }
    # dataset = "athaliana_eg_gene"
    m = useMart(mart, host = host, path = service.path, dataset = dataset)
    # Attributes
    # available.attributes = sort(listAttributes(m)[,1])
    available.attributes = unique(listAttributes(m))
    available.attributes = available.attributes[order(available.attributes$name), ]
    if (any(is.na(attributes)) | any(! attributes %in% available.attributes$name)) {
        message("Valid attributes are:")
        print(available.attributes)
        message("\n\nInteresting attributes are perhaps:")
        print(available.attributes[
            grepl("^GO|^go|gene_id|goa|goslim", available.attributes$name)
            | grepl("GO|slim|tology|Ensembl Gene", available.attributes$description)
            , ])
        stop("Invalid attribute specified")
    }
    # attributes = c("ensembl_gene_id", "go_accession", "go_name_1006")
    results = getBM(attributes = attributes, mart = m) # 182026
    # out.tsv.file = "temp.tsv"
    if (!is.null(out.tsv.file))
    {
        write.table(results, file = out.tsv.file, quote = FALSE, sep = '\t', col.names = FALSE, row.names = FALSE)
    }
    return(results)
}
# GO slims are particularly useful for giving a summary of the results of GO annotation of a genome, microarray, or cDNA collection when a broad classification of gene product function is required. See the map2slim.pl section for a perl implementation of this.
# res = getBMsimple("www.ensembl.org", "ENSEMBL_MART_ENSEMBL", "dmelanogaster_gene_ensembl", c("ensembl_gene_id", "goslim_goa_accession", "goslim_goa_description")); nrow(unique(res[, 1:2]))
# Merge the cuffdiff FPKM tracking and differential expression files into one general file
formatCuffdiffOutput = function(
    designFolderPath,
    outputFilePath
) {
    tracking = read.table(file.path(designFolderPath, "isoforms.fpkm_tracking"), header = TRUE, sep = "\t", comment.char = "", check.names = FALSE)
    diffFile = read.table(file.path(designFolderPath, "isoform_exp.diff"), header = TRUE, sep = "\t", comment.char = "", check.names = FALSE)
    mergeFiles = merge(diffFile, tracking, by.y = "tracking_id", by.x = "test_id")
    newFormatedTable = data.frame(
        test_id        = mergeFiles$test_id,
        gene_id        = mergeFiles$gene_id.x,
        tss_id         = mergeFiles$tss_id,
        nearest_ref_id = mergeFiles$nearest_ref_id,
        class_code     = mergeFiles$class_code,
        gene           = mergeFiles$gene,
        locus          = mergeFiles$locus.x,
        length         = mergeFiles$length,
        log2FC         = mergeFiles$'log2(fold_change)',
        test_stat      = mergeFiles$test_stat,
        p_value        = mergeFiles$p_value,
        q_value        = mergeFiles$q_value)
    if (! file.exists(dirname(outputFilePath))) {
        dir.create(dirname(outputFilePath))
    }
    # Sort by increasing q-value, write the csv and return the same table
    newFormatedTable = newFormatedTable[order(newFormatedTable$q_value, decreasing = FALSE), ]
    write.csv(newFormatedTable, file = outputFilePath, row.names = FALSE)
    return(newFormatedTable)
}
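# Usage sketch (hypothetical cuffdiff output directory and output path):
# formatted = formatCuffdiffOutput(designFolderPath = "cuffdiff/WT_vs_KO",
#                                  outputFilePath   = "DGE/WT_vs_KO/isoforms_merged.csv")
# head(formatted[, c("test_id", "gene_id", "log2FC", "q_value")])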