PageRenderTime 26ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/data/pcc6803/cyanobase.R

https://gitlab.com/raim/genomeBrowser
R | 324 lines | 210 code | 46 blank | 68 comment | 7 complexity | 54eb395bb73c7998701b50015f5baf25 MD5 | raw file
  1. require(segmenTools)
  2. require(readxl)
  3. ## load functions
  4. browser.path <- sub("GENBRO=","",system("env|grep GENBRO",intern=TRUE))
  5. data.path <- sub("GENDAT=","",system("env|grep GENDAT",intern=TRUE))
  6. source(file.path(browser.path,"src/genomeBrowser_utils.R"))
  7. ## data path
  8. cyano.path <- sub("PCC6803DAT=","",system("env|grep PCC6803DAT",intern=TRUE))
  9. ## working with IDs, set this to false!
  10. op <- options(stringsAsFactors=FALSE)
  11. ### CYANOBASE ANNOTATION DATA
  12. ## NOTE: this assumes equal sequence lengths
  13. ## between cyanobase annotation and ncbi releases!
  14. ## checked for cyanobase 20170923 vs. .1 versions of NCBI
  15. ## TODO behle21 and kopf14 data are parsed here AFTER this
  16. ## file was required
  17. ## chromosome Length Index
  18. chrIdx <- read.delim(file.path(cyano.path,"chromosomes","pcc6803.csv"),
  19. header=TRUE,row.names=1)
  20. chrL <- chrIdx[,2]
  21. chrS <- segmenTools::getChrSum(chrL) # chromosome index
  22. ## mapping of different chromosome IDs
  23. chrMap <- read.delim(file.path(browser.path,"data","pcc6803","parameters",
  24. "chromosomemapping.txt"),
  25. header=FALSE,row.names=1)
  26. ## parse basic gene annotation
  27. file <- file.path(cyano.path,"originalData", paste("genes.txt",sep=""))
  28. cyan <- read.delim(file,header=FALSE)
  29. colnames(cyan) <- c("assembly","ID","chr","start","end","strand","function")
  30. cyan[,"chr"] <- as.numeric(chrMap[cyan[,"chr"],1])
  31. ## correct negative coordinates of circular genes
  32. ## USING NCBI chromosome lengths!!
  33. circ <- which(cyan[,"start"]<0)
  34. cyan[circ,"start"] <- chrL[cyan[circ,"chr"]]+cyan[circ,"start"]
  35. ## rm unused columns
  36. cyan <- cyan[,-1]
  37. ## NAMES
  38. ## use cyanobase GENE SYMBOLS as names/alias
  39. genes <- read.delim(file.path(cyano.path,"originalData","gene_symbol.txt"),
  40. header=FALSE)
  41. cyan <- cbind.data.frame(cyan,
  42. alias=genes[match(cyan[,"ID"],genes[,"V2"]),"V4"])
  43. nms <- sub("/.*","",sub(",.*","",cyan[,"alias"]))
  44. cyan <- cbind.data.frame(cyan, name=nms)
  45. ## use IDs where no name is present
  46. cyan[is.na(cyan[,"name"]),"name"] <- cyan[is.na(cyan[,"name"]),"ID"]
  47. ## TODO: use first of ,- or /-separated lists
  48. ## store full names in "alias" or similar
  49. ## in annotation.RData - add kopf14 transcripts and cyanobase, but
  50. ## only for plotFeatures.R reference (rm redundant from plot types)
  51. ## add annotation data?
  52. ## record and write index file!
  53. cat <- read.delim(file.path(cyano.path,"originalData","category.txt"),
  54. header=FALSE)
  55. cat <- apply(cat, 2, function(x) gsub("&amp;","&",gsub("&#x27;","'",x)))
  56. catstr <- unlist(gsub(";$","",apply(cat[match(cyan[,"ID"],cat[,"V1"]),
  57. c("V3","V5")],1,
  58. paste, collapse=";")))
  59. cyan <- cbind.data.frame(cyan, category=catstr)
  60. ## ADD UNIPROT GO
  61. go.file <- file.path(cyano.path,"originalData","pcc6803_uniprot_go.txt")
  62. go <- read.delim(go.file)
  63. colnames(go) <- c("uniprot","cyanobaseID","go")
  64. go$cyanobaseID <- gsub("syn:","",go$cyanobaseID)
  65. ## extract all GO terms & descriptions
  66. got <- strsplit(go$go,";")
  67. god <- got <- unique(unlist(got))
  68. got <- sapply(got, function(x) gsub("\\]","",gsub(".*\\[","",x)))
  69. god <- sapply(god, function(x) trimws(gsub("\\[.*","",x)))
  70. names(god) <- names(got) <- NULL
  71. god <- cbind(term=got, description=god)
  72. god <- god[!duplicated(god[,1]),]
  73. ## WRITE OUT GO MAPPING
  74. go.map <- file.path(cyano.path,"processedData","pcc6803_uniprot_go_terms.tsv")
  75. write.table(god, file=go.map, quote=FALSE, row.names=FALSE, sep="\t")
  76. ## generate pure GO term column
  77. got <- strsplit(go$go,";")
  78. got <- lapply(got, function(x) gsub("\\]","",gsub(".*\\[","",x)))
  79. got <- unlist(lapply(got, paste0, collapse=";"))
  80. ## expand genes
  81. ggenes <- strsplit(go$cyanobaseID,";")
  82. glen <- unlist(lapply(ggenes, length))
  83. ## loose two empty!!
  84. rm <- which(glen==0)
  85. go <- go[-rm,]
  86. ggenes <- ggenes[-rm]
  87. glen <- glen[-rm]
  88. got <- got[-rm]
  89. ## expand multiple cyanobase genes
  90. tmp <- sapply(1:length(glen), function(x) rep(got[x], glen[x]))
  91. ok <- sum(unlist(lapply(tmp,length)) != unlist(lapply(ggenes,length)))
  92. if ( ok>0 ) stop("wrong length of GO gene expansion")
  93. ## gene - go table!
  94. allgo <- cbind.data.frame(ID=unlist(ggenes), GO=unlist(tmp))
  95. ##tmp <- merge(cyan, allgo, by="ID", all=TRUE)
  96. cyan <- cbind.data.frame(cyan, GO=allgo$GO[match(cyan$ID, allgo$ID)])
  97. ## load lehmann14 transcriptome data and add clusterings and colors!
  98. leh14 <- read.delim(file.path(cyano.path,"originalData",
  99. "supp_gku641_SupportingDataFile2.csv"))
  100. leh14cols <- c(CL_prakash09_sc="supercoiling_transcriptome",
  101. CL_leh14="diurnal_transcriptome",
  102. CL_leh14_at2="AT2_periodicity")
  103. leh14dat <- leh14[match(cyan[,"ID"],leh14[,"ID"]),leh14cols]
  104. colnames(leh14dat) <- names(leh14cols)
  105. cyan <- cbind(cyan, leh14dat)
  106. ## TODO: load this from file!
  107. diur.cls <- as.character(cyan[,"CL_leh14"])
  108. cls.srt <- sort(as.numeric(unique(diur.cls)))
  109. ## set cluster colors
  110. cls.col <- rep("gray", length(cls.srt))
  111. names(cls.col) <- cls.srt
  112. ### main
  113. cls.col[c("2")] <- "#8493CA" # pastel blue
  114. cls.col[c("7")] <- "#0000FF" # blue
  115. cls.col[c("3")] <- "#00E6FF" # cyan"
  116. cls.col[c("8")] <- "#FF0000" # red
  117. cls.col[c("6")] <- rgb(1,.8,0) # dark yellow
  118. cls.col[c("1")] <- "#82CA9D" # pastel green
  119. ## background
  120. cls.col[c("5")] <- "#626262" # gray: foreign genes?? transposons!
  121. cls.col[c("4")] <- rgb(0,0,.6) # dark blue
  122. cls.col[c("9")] <- "#6ECFF6" # pastel cyan
  123. cls.col[c("10")]<- "#F7977A" # pastel red
  124. ## not present on array
  125. cls.col[c("11")]<- "#000000" # black #"#E1E1E1" # light gray
  126. ftcols <- cls.col[as.character(cyan[,"CL_leh14"])]
  127. ftcols[is.na(ftcols)] <- "#000000"
  128. cyan <- cbind(cyan,CL_leh14_col=ftcols)
  129. ## load saha16 transcriptome clustering (segmenTier/segmenTools)
  130. load(file.path(cyano.path,"saha16","saha16.RData"))
  131. cset <- cdat$cset
  132. k <- selected(cset)
  133. cls <- cset$clusters[,k]
  134. cls.col <- cset$colors[[k]]
  135. ftcols <- cls.col[as.character(cls[cyan[,"ID"]])]
  136. cyan <- cbind(cyan,
  137. CL_saha16=as.character(cls[cyan[,"ID"]]),
  138. CL_saha16_col=ftcols)
  139. ## load Zavrel et al. 2019 proteome clustering
  140. zav <- read.delim(file.path(cyano.path,"processedData","zavrel19_proteome.tsv"))
  141. ## search gene IDs
  142. zlst <- strsplit(zav$kegg,";")
  143. zids <- lapply(zlst, function(x) {
  144. y <- c(match(x, cyan$ID))
  145. y[!is.na(y)]})
  146. znum <- unlist(lapply(zids, length))
  147. cat(paste(sum(znum==2), "proteins with multiple genes:\n",
  148. paste(zav[znum==2,"protein"], collapse=" "),"\n",
  149. paste(zav[znum==2,"gene"], collapse=" "),
  150. "\nonly counting first\n"))
  151. zids <- unlist(lapply(zlst, function(x) x[1]))
  152. length(match(zids, cyan$ID))
  153. cyan <- cbind(cyan, zav[match(cyan$ID, zids),
  154. c("CL_zavrel19","CL_zavrel19_super")])
  155. ## load Behle et al. 2021 topA-OX clusters and colors
  156. ## TODO: REPLACE THIS BY SUPPLEMENT OF THE PAPER (add to download list!)
  157. behle21.file <- "~/work/CoilHack/experiments/RNAseq/20201204_RM_topA/analysis/genome_0/20201204_RM_topA_results.tsv"
  158. behle21.batch <- "~/work/CoilHack/experiments/RNAseq/20200602_WM_coilhack_endpoint/analysis_4/20200602_WM_coilhack_endpoint_results.tsv"
  159. if ( file.exists(behle21.file) ) {
  160. b21 <- read.delim(behle21.file)
  161. b21 <- b21[match(cyan[,"ID"], b21$cyanobaseID),]
  162. b21b <- read.delim(behle21.batch)
  163. b21b <- b21b[match(cyan[,"ID"], b21b$cyanobaseID),]
  164. cyan <- cbind(cyan, b21[,c("CL_behle21","CL_behle21_col")])
  165. ## gray scale for log2 FC
  166. num2col <- function(x,n=100, limits, pal, colf=viridis::viridis){
  167. if ( missing(pal) ) pal <- colf(n)
  168. if ( missing(limits) ) limits <- range(x, na.rm=TRUE)
  169. pal[findInterval(x,seq(limits[1],limits[2],
  170. length.out=length(pal)+1), all.inside=TRUE)]
  171. }
  172. cyan <- cbind(cyan, b21b[,grep("log2FoldChange",colnames(b21b))])
  173. topa.col <- num2col(cyan$TOPA_log2FoldChange, limits=c(-1,1))
  174. gyrb.col <- num2col(cyan$GyrB_log2FoldChange, limits=c(-2,2))
  175. gyra.col <- num2col(cyan$GyrA_log2FoldChange, limits=c(-2,2))
  176. cyan <- cbind(cyan, TOPA_color=topa.col,
  177. GyrA_color=gyra.col, GyrB_color=gyrb.col)
  178. }
  179. ## add kopf14 TU and add CL_kopf14, if it exists already,
  180. ## TODO: generate kopf14 file in preprocessing step!
  181. kopf14.file <- file.path(cyano.path,"processedData","kopf14_tu_genes.tsv")
  182. if ( file.exists(kopf14.file) ) {
  183. kpf <- read.delim(kopf14.file)
  184. kpf <- kpf[match(cyan[,"ID"], kpf$gene),]
  185. cyan <- cbind(cyan, TU_kopf14=kpf[,"TU.ID"],
  186. CL_kopf14=kpf[,"Max.cond."])
  187. }
  188. ## add Koskinen et al. 2016 transcriptome data
  189. kosk16.file <- file.path(cyano.path,"originalData",
  190. "MMI_13214_supp-0001-Dataset_S1.xlsx")
  191. kosk16 <- readxl::read_xlsx(kosk16.file, sheet=1, skip=1)
  192. kosk16 <- kosk16[match(cyan[,"ID"], kosk16$`Gene ID`),2:3]
  193. colnames(kosk16) <- paste0("sigBCDE_",gsub(" ","",colnames(kosk16)))
  194. cyan <- cbind(cyan, kosk16)
  195. ## FUNCTIONAL CATEGORIES
  196. ## extract categories and store index!
  197. maincat <- cbind(unique(cat[,3]), unique(cat[,4]))
  198. tmp <- unique(cat[,5])
  199. tmp <- tmp[tmp!=""]
  200. subcat <- cbind(tmp,rep(NA,length(tmp)))
  201. for ( i in 1:length(tmp) )
  202. subcat[i,2] <- c(cat[which(cat[,5]==tmp[i])[1],6])
  203. catidx <- rbind(maincat,subcat)
  204. catidx <- catidx[order(catidx[,1]),]
  205. file.name <- file.path(cyano.path,"processedData","annotation_index.txt")
  206. write.table(catidx, file=file.name, quote=F, col.names=F, row.names=F,sep="\t")
  207. ## GO TERMS - TODO
  208. #go <- read.delim(file.path(cyano.path,"originalData","goterm.txt"),header=FALSE)
  209. ## add type column
  210. cyan <- cbind(cyan,type=rep("gene",nrow(cyan)))
  211. ## reorder
  212. mainCols <- c("ID","name","type","chr","start","end","strand")
  213. cyan <- cbind(cyan[,mainCols],cyan[,!colnames(cyan)%in%mainCols])
  214. ## add chromosomes!
  215. chr <- as.data.frame(matrix(NA, nrow=length(chrL), ncol=ncol(cyan)))
  216. colnames(chr) <- colnames(cyan)
  217. chr[,"chr"] <- 1:length(chrL)
  218. chr[,"start"] <- 1
  219. chr[,"end"] <- chrL
  220. chr[,"strand"] <- "."
  221. chr[,"ID"] <- 1:length(chrL)
  222. chr[,"name"] <- as.character(chrIdx[,1])
  223. chr[,"type"] <- "chromosome"
  224. cyan <- rbind(cyan,chr)
  225. ## * RENAME GENES!
  226. ## gyrA/gyrA:
  227. ## rename sll1941 to gyrA2 (48.61% AA identity to S.coccus PCC7942)
  228. ## keep slr0417 as gyrA (62.4% AA identity to S.coccus PCC7942)
  229. idx <- which(cyan[,"ID"]=="sll1941")
  230. cyan[idx,"name"] <- "gyrA2"
  231. ## WRITE-OUT TABLE FILE
  232. file.name <- file.path(cyano.path,"features_cyanobase.csv")
  233. write.table(cyan, file=file.name, sep="\t", row.names=F, quote=F, na="")
  234. ## EXPAND CIRCULAR GENES for overlap analyses and genomeBrowser
  235. ## keep color annotation!
  236. cyan.circ <- segmenTools::expandCircularFeatures(cyan, chrL, copyCols=TRUE)
  237. ## genomeBrowser Data!!
  238. columns <- c(ID="ID", name="name", type="type", chr="chr", strand="strand",
  239. start="start", end="end", color="CL_behle21_col")
  240. types <- c("gene","gene-circ2")
  241. out.path <- file.path(data.path,"pcc6803")
  242. odat <- list(ID="cyanobase",
  243. description="annotated genome features",
  244. data=cyan.circ,
  245. settings=list(type="plotFeatures",
  246. types=types,columns=columns,
  247. typord=FALSE,cuttypes=TRUE,names=TRUE,
  248. ylab="cyano-\nbase",hgt=.75,xaxis=FALSE))
  249. ## store data
  250. file.name <- file.path(out.path,paste(odat$ID,".RData",sep=""))
  251. save(odat,file=file.name)
  252. ## features with color gradient by log2-fold change in gyrA-kd
  253. ## features with color gradient by log2-fold change in topA-OX
  254. ## copy each feature
  255. columns["color"] <- "behle21_batch_color"
  256. cyx <- rbind(cyan.circ,
  257. cyan.circ,
  258. cyan.circ)
  259. cyx <- cbind(cyx,
  260. "behle21_batch_color"=c(cyan.circ$TOPA_color,
  261. cyan.circ$GyrA_color,
  262. cyan.circ$GyrB_color))
  263. cyx$type <- c(rep("TopA-OX",nrow(cyan.circ)),
  264. rep("GyrA-kd",nrow(cyan.circ)),
  265. rep("GyrB-kd",nrow(cyan.circ)))
  266. types <- unique(cyx$type)
  267. odat <- list(ID="behle21_batch",
  268. description="features by expression level in topoisomerase experiments, Behle et al. 2021",
  269. data=cyx,
  270. settings=list(type="plotFeatures",
  271. arrow = list(length=0, code=0, angle=0,
  272. pch=NA, lwd=2),
  273. types=types,columns=columns,
  274. typord=TRUE,cuttypes=FALSE,names=FALSE,
  275. ylab="",hgt=.25,xaxis=FALSE))
  276. ## store data
  277. file.name <- file.path(out.path,paste(odat$ID,".RData",sep=""))
  278. save(odat,file=file.name)