
/gqSeqUtils/R/utils.r

https://bitbucket.org/mugqic/rpackages
# TODO:
# - input can be a filename, an object, or a comma-separated list of files
# - Vectorize for multiple FASTA inputs...
# - Other plots? report?
# - units in Mb, kb
# - GC content? dinucleotide content?
# - Coverage estimate relative to expected genome size
# - filter small contigs??
# - etc.
# - NG50 *****
# - Parallelize
# - Biostrings extension?
# - line at N50
# - ggvis
# - would be nice if it supported XStringSet + mcols for coloring attributes,
#   any input type
# - issue warning if duplicate sequences
# - exclude MASKed Ns from statistics???
# - kmers???
# - Are you sure this will work for RNA, DNA, AA? If not, would be nice one day

# NX: compute N10..N90 length statistics from an XStringSet; if genome.size is
# supplied, fractions are computed relative to it instead (NG10..NG90).
NX <- function(s, genome.size = NULL)
{
    g = ifelse(is.null(genome.size), sum(width(s)), genome.size)
    s.width.ordered = rev(sort(width(s)))
    # cum.fraction = cumsum(s.width.ordered) / sum(s.width.ordered)
    cum.fraction = cumsum(s.width.ordered) / g
    s.N = sapply(as.character(seq(10, 90, by = 10)), function(p) {
        s.width.ordered[which(cum.fraction >= as.numeric(p) / 100)[1]]
    }, simplify = TRUE)
    names(s.N) = paste0(ifelse(is.null(genome.size), 'N', "NG"), names(s.N))
    return(s.N)
}
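# Usage sketch for NX() (illustrative only; the toy contigs below are made up):
# library(Biostrings)
# contigs = DNAStringSet(c(ctg1 = paste(rep("A", 5000), collapse = ""),
#                          ctg2 = paste(rep("C", 3000), collapse = ""),
#                          ctg3 = paste(rep("G", 1000), collapse = "")))
# NX(contigs)        # N10..N90 relative to the total assembly length (N50 = 5000 here)
# NX(contigs, 1e4)   # NG10..NG90 relative to an expected genome size of 10 kb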
# TODO: not sure NG will always work!

#' A simple function that leverages Biostrings to generate a set of statistics from DNA FASTA files.
#'
#' @param sequences Character vector of FASTA file names (or a list of DNAStringSet objects). File names can also be given as a single string separated by sequences.sep.
#' @param mask.Ns Boolean; masking Ns out of the length statistics is planned but not implemented yet (see TODO below).
#' @param target.assembly.length If set, NG10-NG90 statistics relative to this expected assembly length are also reported.
#' @param mc.cores Number of cores passed to parallel::mclapply()
#' @param sequences.sep If sequences is a character vector, each element is split on this separator.
#' @export
DnaFastaSummary = function(
    sequences,
    mask.Ns = TRUE,
    target.assembly.length = NULL,
    # type = c("generic", "genome", "trinity"),
    # write.report = TRUE,
    # report.prefix = 'stats',
    mc.cores = 1,
    sequences.sep = ','
) {
    # type = match.arg(type)
    # Interactive test values:
    # cd /netmount/ip03_home/lefebvr3/dna; myr
    # sequences = list.files(pattern = "\\.fasta$"); mc.cores = 12; sequences.sep = ','; target.assembly.length = 207607558; mask.Ns = TRUE
    require(Biostrings)
    require(parallel) # for mclapply()
    # require(magrittr)
    # require(hwriter)
    # require(Rmarkdown)
    # require(ggvis)
    # require(stringr)
    # require(knitr)

    ## Check input sequences
    if (length(unique(sapply(sequences, class))) != 1) {
        stop("sequences must be objects of the same class (character or DNAStringSet)")
    }
    if (is.character(sequences)) {
        sequences = unique(unlist(strsplit(sequences, split = sequences.sep, fixed = TRUE)))
        if (any(!file.exists(sequences))) { stop("One of the input files does not exist") }
    } else {
        if (any(!sapply(sequences, is, "DNAStringSet"))) {
            stop("sequences must be either a character vector or a list of DNAStringSet objects")
        }
    }

    ## Iterate over sequences and calculate statistics
    dat = mclapply(sequences, function(seq) {
        message("Processing one sequence...")
        # seq = sequences[1]
        if (is.character(seq)) {
            seq = readDNAStringSet(seq)
        }
        # TODO: still need to figure out how to mask properly (mask.Ns is not used yet)
        l = list(
            "Maximum Length"       = max(width(seq)),
            "Minimum Length"       = min(width(seq)),
            "Mean Length"          = mean(width(seq)),
            "Median Length"        = median(width(seq)),
            "1st. Quartile Length" = quantile(width(seq), 0.25, names = FALSE),
            "3rd. Quartile Length" = quantile(width(seq), 0.75, names = FALSE)
        )
        l = c(l, NX(seq))
        if (!is.null(target.assembly.length)) {
            l = c(l, NX(seq, target.assembly.length))
        }
        # Alphabet frequencies, as a percentage of the total length
        s.af = alphabetFrequency(seq)                                      # was alphabetFrequency(s): 's' is not defined in this scope
        s.af.total.perc = apply(s.af, MARGIN = 2, sum) / sum(width(seq)) * 100
        names(s.af.total.perc) = paste0("Total ", names(s.af.total.perc), " %")
        # Per-sequence N-content distribution
        s.n.summary = unlist(as.list(summary(s.af[, "N"] / width(seq) * 100, digits = Inf)))
        names(s.n.summary) = paste0(names(s.n.summary), " Sequence N %")
        # TODO: it was nice to have something report-ready with bp indicated!
        c("Nb. Sequences"               = length(seq),
          "Total Sequences Length (bp)" = sum(width(seq)),
          "Nb. Duplicate Sequences"     = sum(duplicated(seq)),
          unlist(l), s.n.summary, s.af.total.perc)
    }, mc.cores = mc.cores)
    if (is.character(sequences)) { names(dat) = sequences }   # was names(stats) = sequences: 'stats' is not defined
    ## ------------------------------------------------------------------------
    ## Legacy single-FASTA report code. It predates the vectorized rewrite above
    ## and still references variables and arguments that no longer exist at this
    ## point (s, s.summary, s.N, type, write.stats, output.prefix), as well as
    ## ggplot2 / gplots::textplot / stringr. Kept commented out for reference.
    ## ------------------------------------------------------------------------
    # # N-content distribution
    # s.n.summary = unlist(as.list(summary(s.af[, "N"] / width(s) * 100, digits = Inf)))
    # names(s.n.summary) = paste0(names(s.n.summary), " Sequence N %")
    # # GC-content
    # # s.gc = letterFrequency(s, letters = c("GC")) / width(s) * 100
    # # Generic statistics
    # stats = c(
    #     "Nb. Sequences" = length(s),
    #     "Total Sequences Length (bp)" = sum(width(s)),
    #     # "N50" = N50(width(s)),
    #     s.summary,
    #     "Nb. Duplicate Sequences" = sum(duplicated(s)),
    #     s.N,
    #     s.n.summary,
    #     s.af.total.perc
    # )
    # # Axis labels for histogram plot
    # xlabel = "Sequence Length (bp, log10 scale)"
    # ylabel = "Sequence Count"
    # if (type == 'trinity') {
    #     names(stats) = gsub("Sequence", "Transcript", names(stats))
    #     xlabel = gsub("Sequence", "Transcript", xlabel)
    #     ylabel = gsub("Sequence", "Transcript", ylabel)
    #     # Insert "Nb. Components" as second column
    #     stats = c(stats[1], "Nb. Components" = length(unique(str_extract(names(s), "(^comp[0-9]+_c[0-9]+_)|(^c[0-9]+_g[0-9]+_)"))), stats[2:length(stats)])
    # }
    # if (type == 'genome') {
    #     names(stats) = gsub("Sequence", "Contig", names(stats))
    #     xlabel = gsub("Sequence", "Contig", xlabel)
    #     ylabel = gsub("Sequence", "Contig", ylabel)
    # }
    # ## Sliding window frequency content
    # # plot(letterFrequencyInSlidingView(largest.contig, view.width = 1000, c("GC"))[, 1] / 1000, ylab = "GC content")
    # ## dinucl. freqs
    # # head(dinucleotideFrequency(assembly))
    # if (write.stats) {
    #     # Create CSV stats file
    #     write.csv(data.frame(stats), file = paste0(output.prefix, ".csv"))
    #     # Create histogram plot of sequence length distribution with N50
    #     sequence.lengths = ggplot(data.frame(length = width(s)), aes(x = length))
    #     sequence.lengths = sequence.lengths +
    #         geom_histogram(colour = "darkgrey", fill = "white", binwidth = 0.02) +
    #         scale_x_log10(breaks = c(200, 500, 1000, 2000, 5000, 10000, 20000, 50000)) +
    #         geom_vline(
    #             data = data.frame(Legend = paste0("N50 = ", stats["N50 (bp)"], " bp"), vals = stats["N50 (bp)"]),
    #             aes(xintercept = vals, shape = Legend),
    #             colour = "black",
    #             show_guide = TRUE) +
    #         xlab(xlabel) +
    #         ylab(ylabel) +
    #         ggtitle("Transcript Length Distribution") +
    #         theme(plot.title = element_text(face = "bold"), legend.title = element_blank())
    #     # Create PDF file of stats + histogram plot
    #     pdf(file = paste0(output.prefix, ".pdf"), height = 5, width = 7)
    #     textplot(data.frame("Statistic" = format(stats,
    #         digits = 2,
    #         big.mark = ",",
    #         scientific = FALSE,
    #         drop0trailing = TRUE)))
    #     suppressWarnings(print(sequence.lengths))
    #     dev.off()
    #     # Create JPEG image of histogram plot
    #     jpeg(file = paste0(output.prefix, ".jpg"), height = 500, width = 700)
    #     sequence.lengths = sequence.lengths +
    #         theme(plot.title = element_text(size = 16),
    #               axis.title = element_text(size = 16),
    #               axis.text = element_text(size = 14),
    #               legend.text = element_text(size = 16))
    #     print(sequence.lengths)
    #     dev.off()
    # }
    # return(stats)
    return(dat)
}
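# Usage sketch (hypothetical file names; assumes Biostrings and parallel are available).
# With the vectorized interface above, the return value is a list of named statistic
# vectors, one per input FASTA:
# fa.files = c("assembly_v1.fasta", "assembly_v2.fasta")
# res = DnaFastaSummary(fa.files, target.assembly.length = 2e8, mc.cores = 2)
# res[["assembly_v1.fasta"]]   # length, NX/NG, N-content and base-composition statistics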
#' This function reads a FASTA file and writes back shuffled chunks of equal size.
#'
#' @export
splitDNAFasta = function(fasta, output.dir = 'chunks', chunk.size = 1000, shuffle = TRUE, mc.cores = 8) {
    # fasta = 'assembly/Trinity.fasta'; output.dir = 'chunks'; chunk.size = 1000; shuffle = TRUE
    require(Biostrings)
    # require(multicore) # defunct package; mc.cores is currently unused (the mapply below is serial)
    dir.create(output.dir, showWarnings = FALSE)
    s = readDNAStringSet(fasta)
    f = ceiling(c(1:length(s)) / chunk.size) # chunk grouping factor
    if (shuffle) {
        f = sample(f, size = length(f))
    }
    s = split(s, f)
    # Zero-pad chunk names so the output files sort naturally
    names(s) = format(names(s), width = ceiling(log10(1 + max(f))), justify = 'right')
    names(s) = gsub(' ', '0', names(s))
    x = mapply(function(o, n) {
        # print(n)
        names(o) = gsub('^[0-9]*?\\.', '', names(o))
        fn = file.path(output.dir, paste0(n, '.fasta'))
        writeXStringSet(o, filepath = fn)
        fn
    }, s, names(s))
    return(unlist(x)) # returns the chunk file names
}
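# Usage sketch (hypothetical input path); one FASTA file per chunk is written under output.dir:
# chunk.files = splitDNAFasta("assembly/Trinity.fasta", output.dir = "chunks",
#                             chunk.size = 500, shuffle = TRUE)
# length(chunk.files)   # number of chunk files written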
#' The role of this function is to extract the gene-level annotation data.frame from a gtf file. It returns a data.frame with
#' columns "featureID" (parsed from the gene_id attribute), "SYMBOL" (attempts to find the gene_name attribute; if none, set to gene_id)
#' and "gene.length" (via gqSeqUtils::calculateGeneLengthsFromGtf()).
#'
#'
#' @param gtf Path to a gtf file. Alternatively, a GRanges object.
#' @param fn Optional file name to write the table
#' @param ... further arguments passed down to calculateGeneLengthsFromGtf(). See default values
#'
#' @import IRanges
#' @import GenomicRanges
#' @import rtracklayer
#' @export
extractGeneAnnotationFromGtf = function(gtf, fn = NULL, ...) {
    # require(stringr)
    # require(rtracklayer)
    # require(GenomicRanges)
    message("parsing gtf... this might take a while")
    # Import gtf to GRanges object
    message("importing gtf to memory...")
    gr = rtracklayer::import(gtf, format = 'gtf')
    message("done importing.")
    # Parse out gene_id and other, optional attributes (deprecated: format = "gtf" above already parses everything correctly)
    # gr$gene_id = gsub('((gene_id |;)|\")', '', str_extract(as.character(gr$group), 'gene_id.*?;')) # mandatory
    # gr$gene_name = gsub('((gene_name |;)|\")', '', str_extract(as.character(gr$group), 'gene_name.*?;')) # optional
    # gr$gene_biotype = gsub('((gene_biotype |;)|\")', '', str_extract(as.character(gr$group), 'gene_biotype.*?;')) # optional
    # gene_name and gene_biotype are two OPTIONAL but interesting attributes. Therefore, if absent, set them to NA
    if (! "gene_biotype" %in% colnames(mcols(gr))) { mcols(gr)[['gene_biotype']] = NA }
    if (! "gene_name" %in% colnames(mcols(gr))) { mcols(gr)[['gene_name']] = NA }
    # Collapse transcripts to obtain gene annotation
    ann = unique(mcols(gr)[, c("gene_id", "gene_name", "gene_biotype")])
    # Make sure gene_name and gene_biotype do not break gene_id uniqueness
    if (anyDuplicated(ann$gene_id)) {
        ann[['gene_name']] = NA; ann[['gene_biotype']] = NA
        ann = unique(ann)
        warning("gene_id not in 1:n relationship with gene_name and gene_biotype. The latter are dropped")
    }
    rownames(ann) = ann$gene_id
    # Replace empty symbols with gene_id
    ann$gene_name[is.na(ann$gene_name)] = rownames(ann)[is.na(ann$gene_name)]
    # Rename columns
    colnames(ann)[1:2] = c("featureID", "SYMBOL")
    # Gene length
    ann[["gene.length"]] = calculateGeneLengthsFromGtf(gr, ...)[rownames(ann)]
    ann = as.data.frame(ann)
    # In case one wants to write to file directly
    if (!is.null(fn)) {
        write.delim(ann, file = fn, row.names = FALSE, col.names = TRUE, quote = FALSE, sep = '\t')
    }
    return(ann)
}
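# Usage sketch (the gtf path is hypothetical):
# ann = extractGeneAnnotationFromGtf("Annotation/Genes/genes.gtf")
# head(ann[, c("featureID", "SYMBOL", "gene.length")])
# extractGeneAnnotationFromGtf("genes.gtf", fn = "gene_annotation.tsv")   # also writes the table to disk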
#' A small function that reads a gtf file (with rtracklayer), splits by gene_id, and then applies a reduce (GenomicRanges) operation to
#' the ranges to get an estimated genomic length
#' of gene models. The return value is a named vector of gene lengths or a data.frame.
#'
#' @param gtf Path to a gtf file. Alternatively, a GRanges object.
#' @param feature.types Kinds of features in the gtf to consider, defaults to CDS and exon.
#' @param as.data.frame return a data.frame instead of a named vector
#' @param output.file If as.data.frame is TRUE, a file name where to write it as tab-separated values
#'
#' @import IRanges
#' @import GenomicRanges
#' @import rtracklayer
#' @export
calculateGeneLengthsFromGtf = function(gtf, feature.types = c("exon", "CDS"), as.data.frame = FALSE, output.file = NULL) {
    # require(rtracklayer)
    # require(GenomicRanges)
    # require(stringr)
    message("Calculating gene lengths...")
    # Import gtf to GRanges object
    if (is.character(gtf)) {
        message("importing...")
        # NOTE: asRangedData was removed from recent rtracklayer versions; drop the argument if it errors
        gl = rtracklayer::import.gff(gtf, format = 'gtf', asRangedData = FALSE)
        message("done importing.")
    } else {
        gl = gtf
    }
    # Subset to the requested feature types only
    message(sprintf("Subsetting to %s", paste(feature.types, collapse = ' and ')))
    gl = gl[gl$type %in% feature.types, ]
    # Parse out the gene_id
    message("checking gene_id")
    # gl$geneid = gsub('((gene_id |;)|\")', '', str_extract(as.character(gl$group), 'gene_id.*?;'))
    if (! "gene_id" %in% colnames(mcols(gl))) { stop("gene_id is not a meta-data column of the GRanges object") }
    # Split to a GRangesList object
    message("split")
    gl = GenomicRanges::split(gl, f = gl$gene_id) # split with the GRanges signature returns a GRangesList
    # Reduce to unique disjoint intervals (strand aware)
    message("reduce")
    gl = GenomicRanges::reduce(gl) #, ignore.strand = FALSE) # reduce eliminates overlaps
    # Sum interval lengths
    message("sum")
    gl = sum(width(gl))
    n = names(gl)
    gl = as.numeric(gl) # convert away from integer to avoid overflow on very long genes
    names(gl) = n
    message("done calculating gene lengths.")
    if (as.data.frame)
    {
        gl = data.frame("gene_id" = names(gl), "length" = gl, stringsAsFactors = FALSE)
        if (!is.null(output.file))
        {
            write.table(gl, file = output.file, sep = '\t', quote = FALSE, row.names = FALSE, col.names = FALSE)
        }
    }
    return(gl)
    # Benchmark against GenomicFeatures:
    # library(GenomicFeatures)
    # library(gqSeqUtils)
    # gtf = "/sb/programs/analyste/genomes/Rattus_norvegicus/UCSC/rn5/Annotation/Genes/genes.gtf"
    #
    # # With GenomicFeatures
    # t0 = Sys.time()
    # txdb = makeTranscriptDbFromGFF("genes.gtf", format = 'gtf')
    # x = exonsBy(txdb, by = "gene")
    # x = sum(width(reduce(x)))
    # print(Sys.time() - t0)
    #
    # # With homebrew
    # t0 = Sys.time()
    # y = calculateGeneLengthsFromGtf(gtf)
    # print(Sys.time() - t0)
    #
    # # Compare
    # identical(names(x), names(y))
    # summary(abs(x - y))
    # identical(x, y)
    # pdf("~/temp.pdf")
    # plot(x, y)
    # dev.off()
}
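# Usage sketch (hypothetical gtf path); returns a named numeric vector of summed, reduced exon/CDS widths per gene_id:
# len = calculateGeneLengthsFromGtf("genes.gtf")
# head(len)
# calculateGeneLengthsFromGtf("genes.gtf", as.data.frame = TRUE, output.file = "gene_lengths.tsv")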
#' This function writes 'masked csv' files.
#'
#' @param df A data frame to write out
#' @param fn Filename
#' @param rows, cols logical, character or integer vectors specifying which rows or columns should be labeled "include" in the masked-format table, see readMaskedTable
#' @param num.cols logical, character or integer vector specifying which columns should be labeled "num" in the masked-format table. All others are labeled "char". See readMaskedTable
#' @export
writeMaskedTable = function(df, fn
                            , rows = rownames(df)
                            , cols = colnames(df)
                            , num.cols = NULL) {
    # df = cars; fn = 'temp.txt'; rows = rownames(df); cols = colnames(df); num.cols = NULL
    # Checks on rows, cols and num.cols
    lapply(list(cols, num.cols), function(co) {
        if (is.character(co)) {
            if (any(! co %in% colnames(df))) { stop('Undefined column selected as visible') }
        }
        if (is.logical(co)) {
            if (length(co) != ncol(df)) { stop('Incorrect length for logical vector specifying columns') }
        }
        if (is.numeric(co)) {
            if (any(! abs(co) %in% 1:ncol(df))) { stop('Incorrect indices specifying columns') }
        }
    })
    if (is.character(rows)) {
        if (any(! rows %in% rownames(df))) { stop('Undefined row selected') }
    }
    if (is.logical(rows)) {
        if (length(rows) != nrow(df)) { stop('Incorrect length for logical vector specifying rows') }
    }
    if (is.numeric(rows)) {
        if (any(! abs(rows) %in% 1:nrow(df))) { stop('Incorrect indices specifying rows') }
    }
    # Define column mask
    col.mask = rep('exclude', times = ncol(df))
    names(col.mask) = colnames(df)
    col.mask[cols] = 'include'
    # Define row mask
    row.mask = rep('exclude', times = nrow(df))
    names(row.mask) = rownames(df)
    row.mask[rows] = 'include'
    # Define which columns are numerical
    num.mask = rep('char', times = ncol(df))
    names(num.mask) = colnames(df)
    num.mask[num.cols] = 'num'
    dat = data.frame('.MASK' = row.mask, df, stringsAsFactors = FALSE)
    headers = rbind(c('.MASK', col.mask), c('.TYPE', num.mask))
    # write.table(headers, file = fn, sep = '\t', row.names = FALSE, col.names = FALSE, quote = FALSE)
    # suppressWarnings(write.table(dat, file = fn, sep = '\t', row.names = FALSE, col.names = TRUE, quote = FALSE, append = TRUE))
    write.table(headers, file = fn, sep = ',', row.names = FALSE, col.names = FALSE, quote = TRUE, qmethod = 'double', dec = '.', append = FALSE)
    suppressWarnings(
        write.table(dat, file = fn, sep = ',', row.names = FALSE, col.names = TRUE, quote = TRUE, qmethod = 'double', dec = '.', append = TRUE)
    )
}
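# Usage sketch (arbitrary file name): write mtcars with only a few rows/columns flagged
# as 'include' and mpg/hp typed as numeric:
# writeMaskedTable(mtcars, fn = "mtcars_masked.csv",
#                  rows = rownames(mtcars)[1:10],
#                  cols = c("mpg", "cyl", "hp"),
#                  num.cols = c("mpg", "hp"))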
#' This function reads 'masked csv' files.
#'
#' @param fn Filename
#' @param apply.mask.rows, apply.mask.cols Whether or not to apply the mask on rows or columns
#' @export
readMaskedTable = function(fn, apply.mask.rows = TRUE, apply.mask.cols = TRUE) {
    # fn = 'temp.txt'; apply.mask.rows = TRUE; apply.mask.cols = TRUE
    # Read the column mask and numerical indicator lines
    # headers = read.table(fn, sep = '\t', quote = '', comment.char = ''
    #                      , check.names = FALSE, nrows = 2
    #                      , stringsAsFactors = FALSE, header = FALSE
    #                      , colClasses = "character")
    headers = read.table(fn, sep = ',', quote = '\"', comment.char = ''
                         , check.names = FALSE, nrows = 2
                         , stringsAsFactors = FALSE, header = FALSE
                         , colClasses = "character")
    # Column mask
    col.mask = as.character(headers[1, ])[-1]
    if (any(! col.mask %in% c('include', 'exclude'))) { stop('Col mask values should be "include" or "exclude"') }
    col.mask = col.mask == 'exclude'
    col.mask = col.mask & apply.mask.cols
    # Numerical mask
    num.mask = as.character(headers[2, ])[-1]
    if (any(! num.mask %in% c('char', 'num'))) { stop('Num mask values should be "char" or "num"') }
    num.mask = num.mask == 'num'
    # Read the actual table
    # dat = read.table(fn, sep = '\t', quote = '', comment.char = '', skip = 2
    #                  , check.names = FALSE, stringsAsFactors = FALSE, header = TRUE
    #                  , colClasses = "character")
    dat = read.table(fn, sep = ',', quote = '\"', comment.char = '', skip = 2
                     , check.names = FALSE, stringsAsFactors = FALSE, header = TRUE
                     , colClasses = "character")
    # Extract row mask
    row.mask = as.character(dat[, 1])
    dat[, 1] = NULL # remove the row mask column
    if (any(! row.mask %in% c('include', 'exclude'))) { stop('Row mask values should be "include" or "exclude"') }
    row.mask = row.mask == 'exclude'
    row.mask = row.mask & apply.mask.rows
    # Apply num mask
    for (i in which(num.mask)) {
        dat[, i] = as.numeric(dat[, i])
    }
    # Apply row and col masks
    dat = dat[!row.mask, !col.mask, drop = FALSE]
    rownames(dat) = NULL
    return(dat)
}
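# Round-trip usage sketch with the file written in the writeMaskedTable() example above.
# By default both masks are applied, so only 'include' rows/columns come back:
# kept = readMaskedTable("mtcars_masked.csv")
# full = readMaskedTable("mtcars_masked.csv", apply.mask.rows = FALSE, apply.mask.cols = FALSE)
# dim(kept); dim(full)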
#' This function takes a data.frame of numbers and returns formatted numbers with percentages relative to the first column
#'
#' @param df A data.frame or matrix
#'
#' @export
formatCountsTablePerc = function(df) {
    df = apply(df, MARGIN = 2, function(co) {
        co = paste0(format(co, big.mark = ',', trim = TRUE), ' (', round(100 * co / df[, 1], 2), '%)')
        names(co) = rownames(df); co
    })
    return(as.data.frame(df, stringsAsFactors = FALSE, check.names = FALSE))
}
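# Usage sketch (made-up read counts at successive pipeline steps); each cell becomes
# e.g. "1,100,000 (91.67%)", with percentages relative to the first column:
# counts = data.frame(raw = c(1200000, 950000), trimmed = c(1100000, 900000),
#                     aligned = c(1000000, 700000), row.names = c("sampleA", "sampleB"))
# formatCountsTablePerc(counts)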
#' This function performs a series of gsub() operations
#' @export
mgsub = function(patterns, replacements, x) {
    if (length(patterns) != length(replacements)) { stop('patterns must be the same length as replacements') }
    for (i in seq_along(patterns)) {
        x = gsub(patterns[i], replacements[i], x)
    }
    x
}
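# Usage sketch: substitutions are applied sequentially, each gsub() seeing the previous result:
# mgsub(c("\\.fastq\\.gz$", "_R([12])$"), c("", ".read\\1"), "sampleA_R1.fastq.gz")
# # [1] "sampleA.read1"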
# TODO: move me to gqUtils
#' This function returns TRUE if x is syntactically valid (i.e. unchanged by make.names())
#'
#' @export
synthaxValid = function(x) {
    return(identical(x, make.names(x)))
}
#' Guess which compute cluster the code is running on, based on the hostname or DNS domain name
#' @export
guessCluster = function() {
    host = system('hostname', intern = TRUE)
    if (grepl('^abacus', host)) { return('abacus') }
    if (grepl('^lg-', host)) { return('guillimin') }
    if (grepl('^ip03$', host)) { return('mammouth') }
    dnsdomain = system('dnsdomainname', intern = TRUE)
    if (grepl('ferrier.genome.mcgill.ca', dnsdomain)) { return('abacus') }
    if (grepl('guillimin.clumeq.ca', dnsdomain)) { return('guillimin') }
    if (grepl('^m$', dnsdomain)) { return('mammouth') }
    stop('Could not guess cluster!!')
}
#' This function is a wrapper around biomaRt for the simple case of no filter. Unfortunately, access to previous Ensembl releases is constrained by the existence of a biomart instance
#'
#'
#' @export
getBMsimple <- function(
    host = NA,
    mart = NA,
    dataset = NA,
    attributes = NA,
    out.tsv.file = NULL,
    service.path = "/biomart/martservice"
) {
    require(biomaRt)
    # # E.g. jul2012.archive.ensembl.org; see possibilities at http://www.ensembl.org/info/website/archives/index.html or other archives
    # library(biomaRt); host = NA; mart = NA; dataset = NA; attributes = NA; out.tsv.file = NULL; service.path = "/biomart/martservice"
    if (is.na(host)) {
        example.hosts = c("www.ensembl.org", "grch37.ensembl.org", "metazoa.ensembl.org", "plants.ensembl.org", "fungi.ensembl.org", "protists.ensembl.org", "feb2012.archive.ensembl.org")
        stop("Host must be specified. Examples include:\n", paste(example.hosts, collapse = '\n'), "\n See http://www.ensembl.org/info/website/archives/index.html")
    }
    # Check mart
    available.marts = as.character(listMarts(host = host, path = service.path)[, 1])
    if (is.na(mart) | ! mart %in% available.marts) {
        stop("Valid marts are:\n", paste(available.marts, collapse = "\n"))
    }
    # mart = "plants_mart_24"
    m = useMart(mart, host = host, path = service.path)
    # Check dataset
    available.datasets = sort(listDatasets(m)[, 1])
    if (is.na(dataset) | ! dataset %in% available.datasets) {
        message("Valid datasets are:")
        writeLines(available.datasets)
        stop("Invalid dataset specified")
    }
    # dataset = "athaliana_eg_gene"
    m = useMart(mart, host = host, path = service.path, dataset = dataset)
    # Attributes
    # available.attributes = sort(listAttributes(m)[,1])
    available.attributes = unique(listAttributes(m))
    available.attributes = available.attributes[order(available.attributes$name), ]
    if (any(is.na(attributes)) | any(! attributes %in% available.attributes$name)) {
        message("Valid attributes are:")
        print(available.attributes)
        message("\n\nInteresting attributes are perhaps:")
        print(available.attributes[
            grepl("^GO|^go|gene_id|goa|goslim", available.attributes$name)
            | grepl("GO|slim|tology|Ensembl Gene", available.attributes$description)
            , ])
        stop("Invalid attribute specified")
    }
    # attributes = c("ensembl_gene_id", "go_accession", "go_name_1006")
    results = getBM(attributes = attributes, mart = m) # 182026
    # out.tsv.file = "temp.tsv"
    if (!is.null(out.tsv.file))
    {
        write.table(results, file = out.tsv.file, quote = FALSE, sep = '\t', col.names = FALSE, row.names = FALSE)
    }
    return(results)
}
# GO slims are particularly useful for giving a summary of the results of GO annotation of a genome, microarray, or cDNA collection when a broad classification of gene product function is required. See the map2slim.pl section for a perl implementation of this.
# res = getBMsimple("www.ensembl.org", "ENSEMBL_MART_ENSEMBL", "dmelanogaster_gene_ensembl", c("ensembl_gene_id", "goslim_goa_accession", "goslim_goa_description")); nrow(unique(res[, 1:2]))
# Merge the cuffdiff FPKM tracking and differential expression files into one general file
formatCuffdiffOutput = function(
    designFolderPath,
    outputFilePath
) {
    tracking = read.table(file.path(designFolderPath, "isoforms.fpkm_tracking"), header = TRUE, sep = "\t", comment.char = "", check.names = FALSE)
    diffFile = read.table(file.path(designFolderPath, "isoform_exp.diff"), header = TRUE, sep = "\t", comment.char = "", check.names = FALSE)
    mergeFiles = merge(diffFile, tracking, by.y = "tracking_id", by.x = "test_id")
    newFormatedTable = data.frame(
        test_id        = mergeFiles$test_id,
        gene_id        = mergeFiles$gene_id.x,
        tss_id         = mergeFiles$tss_id,
        nearest_ref_id = mergeFiles$nearest_ref_id,
        class_code     = mergeFiles$class_code,
        gene           = mergeFiles$gene,
        locus          = mergeFiles$locus.x,
        length         = mergeFiles$length,
        log2FC         = mergeFiles$'log2(fold_change)',
        test_stat      = mergeFiles$test_stat,
        p_value        = mergeFiles$p_value,
        q_value        = mergeFiles$q_value)
    if (! file.exists(dirname(outputFilePath))) {
        dir.create(dirname(outputFilePath))
    }
    # Sort by increasing q-value, write the csv and return the same table
    newFormatedTable = newFormatedTable[order(newFormatedTable$q_value, decreasing = FALSE), ]
    write.csv(newFormatedTable, file = outputFilePath, row.names = FALSE)
    return(newFormatedTable)
}
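# Usage sketch (hypothetical cuffdiff output directory and output path):
# formatted = formatCuffdiffOutput(designFolderPath = "cuffdiff/WT_vs_KO",
#                                  outputFilePath   = "DGE/WT_vs_KO/isoforms_merged.csv")
# head(formatted[, c("test_id", "gene_id", "log2FC", "q_value")])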