/data/pcc6803/cyanobase.R
R | 324 lines | 210 code | 46 blank | 68 comment | 7 complexity | 54eb395bb73c7998701b50015f5baf25 MD5 | raw file
- require(segmenTools)
- require(readxl)
- ## load functions
- browser.path <- sub("GENBRO=","",system("env|grep GENBRO",intern=TRUE))
- data.path <- sub("GENDAT=","",system("env|grep GENDAT",intern=TRUE))
- source(file.path(browser.path,"src/genomeBrowser_utils.R"))
- ## data path
- cyano.path <- sub("PCC6803DAT=","",system("env|grep PCC6803DAT",intern=TRUE))
- ## working with IDs, set this to false!
- op <- options(stringsAsFactors=FALSE)
- ### CYANOBASE ANNOTATION DATA
- ## NOTE: this assumes equal sequence lengths
- ## between cyanobase annotation and ncbi releases!
- ## checked for cyanobase 20170923 vs. .1 versions of NCBI
- ## TODO behle21 and kopf14 data are parsed here AFTER this
- ## file was required
- ## chromosome Length Index
- chrIdx <- read.delim(file.path(cyano.path,"chromosomes","pcc6803.csv"),
- header=TRUE,row.names=1)
- chrL <- chrIdx[,2]
- chrS <- segmenTools::getChrSum(chrL) # chromosome index
- ## mapping of different chromosome IDs
- chrMap <- read.delim(file.path(browser.path,"data","pcc6803","parameters",
- "chromosomemapping.txt"),
- header=FALSE,row.names=1)
- ## parse basic gene annotation
- file <- file.path(cyano.path,"originalData", paste("genes.txt",sep=""))
- cyan <- read.delim(file,header=FALSE)
- colnames(cyan) <- c("assembly","ID","chr","start","end","strand","function")
- cyan[,"chr"] <- as.numeric(chrMap[cyan[,"chr"],1])
- ## correct negative coordinates of circular genes
- ## USING NCBI chromosome lengths!!
- circ <- which(cyan[,"start"]<0)
- cyan[circ,"start"] <- chrL[cyan[circ,"chr"]]+cyan[circ,"start"]
- ## rm unused columns
- cyan <- cyan[,-1]
- ## NAMES
- ## use cyanobase GENE SYMBOLS as names/alias
- genes <- read.delim(file.path(cyano.path,"originalData","gene_symbol.txt"),
- header=FALSE)
- cyan <- cbind.data.frame(cyan,
- alias=genes[match(cyan[,"ID"],genes[,"V2"]),"V4"])
- nms <- sub("/.*","",sub(",.*","",cyan[,"alias"]))
- cyan <- cbind.data.frame(cyan, name=nms)
- ## use IDs where no name is present
- cyan[is.na(cyan[,"name"]),"name"] <- cyan[is.na(cyan[,"name"]),"ID"]
- ## TODO: use first of ,- or /-separated lists
- ## store full names in "alias" or similar
- ## in annotation.RData - add kopf14 transcripts and cyanobase, but
- ## only for plotFeatures.R reference (rm redundant from plot types)
- ## add annotation data?
- ## record and write index file!
- cat <- read.delim(file.path(cyano.path,"originalData","category.txt"),
- header=FALSE)
- cat <- apply(cat, 2, function(x) gsub("&","&",gsub("'","'",x)))
- catstr <- unlist(gsub(";$","",apply(cat[match(cyan[,"ID"],cat[,"V1"]),
- c("V3","V5")],1,
- paste, collapse=";")))
- cyan <- cbind.data.frame(cyan, category=catstr)
- ## ADD UNIPROT GO
- go.file <- file.path(cyano.path,"originalData","pcc6803_uniprot_go.txt")
- go <- read.delim(go.file)
- colnames(go) <- c("uniprot","cyanobaseID","go")
- go$cyanobaseID <- gsub("syn:","",go$cyanobaseID)
- ## extract all GO terms & descriptions
- got <- strsplit(go$go,";")
- god <- got <- unique(unlist(got))
- got <- sapply(got, function(x) gsub("\\]","",gsub(".*\\[","",x)))
- god <- sapply(god, function(x) trimws(gsub("\\[.*","",x)))
- names(god) <- names(got) <- NULL
- god <- cbind(term=got, description=god)
- god <- god[!duplicated(god[,1]),]
- ## WRITE OUT GO MAPPING
- go.map <- file.path(cyano.path,"processedData","pcc6803_uniprot_go_terms.tsv")
- write.table(god, file=go.map, quote=FALSE, row.names=FALSE, sep="\t")
- ## generate pure GO term column
- got <- strsplit(go$go,";")
- got <- lapply(got, function(x) gsub("\\]","",gsub(".*\\[","",x)))
- got <- unlist(lapply(got, paste0, collapse=";"))
- ## expand genes
- ggenes <- strsplit(go$cyanobaseID,";")
- glen <- unlist(lapply(ggenes, length))
- ## loose two empty!!
- rm <- which(glen==0)
- go <- go[-rm,]
- ggenes <- ggenes[-rm]
- glen <- glen[-rm]
- got <- got[-rm]
- ## expand multiple cyanobase genes
- tmp <- sapply(1:length(glen), function(x) rep(got[x], glen[x]))
- ok <- sum(unlist(lapply(tmp,length)) != unlist(lapply(ggenes,length)))
- if ( ok>0 ) stop("wrong length of GO gene expansion")
- ## gene - go table!
- allgo <- cbind.data.frame(ID=unlist(ggenes), GO=unlist(tmp))
- ##tmp <- merge(cyan, allgo, by="ID", all=TRUE)
- cyan <- cbind.data.frame(cyan, GO=allgo$GO[match(cyan$ID, allgo$ID)])
- ## load lehmann14 transcriptome data and add clusterings and colors!
- leh14 <- read.delim(file.path(cyano.path,"originalData",
- "supp_gku641_SupportingDataFile2.csv"))
- leh14cols <- c(CL_prakash09_sc="supercoiling_transcriptome",
- CL_leh14="diurnal_transcriptome",
- CL_leh14_at2="AT2_periodicity")
- leh14dat <- leh14[match(cyan[,"ID"],leh14[,"ID"]),leh14cols]
- colnames(leh14dat) <- names(leh14cols)
- cyan <- cbind(cyan, leh14dat)
-
- ## TODO: load this from file!
- diur.cls <- as.character(cyan[,"CL_leh14"])
- cls.srt <- sort(as.numeric(unique(diur.cls)))
- ## set cluster colors
- cls.col <- rep("gray", length(cls.srt))
- names(cls.col) <- cls.srt
- ### main
- cls.col[c("2")] <- "#8493CA" # pastel blue
- cls.col[c("7")] <- "#0000FF" # blue
- cls.col[c("3")] <- "#00E6FF" # cyan"
- cls.col[c("8")] <- "#FF0000" # red
- cls.col[c("6")] <- rgb(1,.8,0) # dark yellow
- cls.col[c("1")] <- "#82CA9D" # pastel green
- ## background
- cls.col[c("5")] <- "#626262" # gray: foreign genes?? transposons!
- cls.col[c("4")] <- rgb(0,0,.6) # dark blue
- cls.col[c("9")] <- "#6ECFF6" # pastel cyan
- cls.col[c("10")]<- "#F7977A" # pastel red
- ## not present on array
- cls.col[c("11")]<- "#000000" # black #"#E1E1E1" # light gray
- ftcols <- cls.col[as.character(cyan[,"CL_leh14"])]
- ftcols[is.na(ftcols)] <- "#000000"
- cyan <- cbind(cyan,CL_leh14_col=ftcols)
- ## load saha16 transcriptome clustering (segmenTier/segmenTools)
- load(file.path(cyano.path,"saha16","saha16.RData"))
- cset <- cdat$cset
- k <- selected(cset)
- cls <- cset$clusters[,k]
- cls.col <- cset$colors[[k]]
- ftcols <- cls.col[as.character(cls[cyan[,"ID"]])]
- cyan <- cbind(cyan,
- CL_saha16=as.character(cls[cyan[,"ID"]]),
- CL_saha16_col=ftcols)
- ## load Zavrel et al. 2019 proteome clustering
- zav <- read.delim(file.path(cyano.path,"processedData","zavrel19_proteome.tsv"))
- ## search gene IDs
- zlst <- strsplit(zav$kegg,";")
- zids <- lapply(zlst, function(x) {
- y <- c(match(x, cyan$ID))
- y[!is.na(y)]})
- znum <- unlist(lapply(zids, length))
- cat(paste(sum(znum==2), "proteins with multiple genes:\n",
- paste(zav[znum==2,"protein"], collapse=" "),"\n",
- paste(zav[znum==2,"gene"], collapse=" "),
- "\nonly counting first\n"))
- zids <- unlist(lapply(zlst, function(x) x[1]))
- length(match(zids, cyan$ID))
- cyan <- cbind(cyan, zav[match(cyan$ID, zids),
- c("CL_zavrel19","CL_zavrel19_super")])
- ## load Behle et al. 2021 topA-OX clusters and colors
- ## TODO: REPLACE THIS BY SUPPLEMENT OF THE PAPER (add to download list!)
- behle21.file <- "~/work/CoilHack/experiments/RNAseq/20201204_RM_topA/analysis/genome_0/20201204_RM_topA_results.tsv"
- behle21.batch <- "~/work/CoilHack/experiments/RNAseq/20200602_WM_coilhack_endpoint/analysis_4/20200602_WM_coilhack_endpoint_results.tsv"
- if ( file.exists(behle21.file) ) {
- b21 <- read.delim(behle21.file)
- b21 <- b21[match(cyan[,"ID"], b21$cyanobaseID),]
- b21b <- read.delim(behle21.batch)
- b21b <- b21b[match(cyan[,"ID"], b21b$cyanobaseID),]
- cyan <- cbind(cyan, b21[,c("CL_behle21","CL_behle21_col")])
- ## gray scale for log2 FC
- num2col <- function(x,n=100, limits, pal, colf=viridis::viridis){
- if ( missing(pal) ) pal <- colf(n)
- if ( missing(limits) ) limits <- range(x, na.rm=TRUE)
- pal[findInterval(x,seq(limits[1],limits[2],
- length.out=length(pal)+1), all.inside=TRUE)]
- }
- cyan <- cbind(cyan, b21b[,grep("log2FoldChange",colnames(b21b))])
- topa.col <- num2col(cyan$TOPA_log2FoldChange, limits=c(-1,1))
- gyrb.col <- num2col(cyan$GyrB_log2FoldChange, limits=c(-2,2))
- gyra.col <- num2col(cyan$GyrA_log2FoldChange, limits=c(-2,2))
- cyan <- cbind(cyan, TOPA_color=topa.col,
- GyrA_color=gyra.col, GyrB_color=gyrb.col)
- }
- ## add kopf14 TU and add CL_kopf14, if it exists already,
- ## TODO: generate kopf14 file in preprocessing step!
- kopf14.file <- file.path(cyano.path,"processedData","kopf14_tu_genes.tsv")
- if ( file.exists(kopf14.file) ) {
- kpf <- read.delim(kopf14.file)
- kpf <- kpf[match(cyan[,"ID"], kpf$gene),]
- cyan <- cbind(cyan, TU_kopf14=kpf[,"TU.ID"],
- CL_kopf14=kpf[,"Max.cond."])
- }
- ## add Koskinen et al. 2016 transcriptome data
- kosk16.file <- file.path(cyano.path,"originalData",
- "MMI_13214_supp-0001-Dataset_S1.xlsx")
- kosk16 <- readxl::read_xlsx(kosk16.file, sheet=1, skip=1)
- kosk16 <- kosk16[match(cyan[,"ID"], kosk16$`Gene ID`),2:3]
- colnames(kosk16) <- paste0("sigBCDE_",gsub(" ","",colnames(kosk16)))
- cyan <- cbind(cyan, kosk16)
- ## FUNCTIONAL CATEGORIES
- ## extract categories and store index!
- maincat <- cbind(unique(cat[,3]), unique(cat[,4]))
- tmp <- unique(cat[,5])
- tmp <- tmp[tmp!=""]
- subcat <- cbind(tmp,rep(NA,length(tmp)))
- for ( i in 1:length(tmp) )
- subcat[i,2] <- c(cat[which(cat[,5]==tmp[i])[1],6])
- catidx <- rbind(maincat,subcat)
- catidx <- catidx[order(catidx[,1]),]
- file.name <- file.path(cyano.path,"processedData","annotation_index.txt")
- write.table(catidx, file=file.name, quote=F, col.names=F, row.names=F,sep="\t")
- ## GO TERMS - TODO
- #go <- read.delim(file.path(cyano.path,"originalData","goterm.txt"),header=FALSE)
- ## add type column
- cyan <- cbind(cyan,type=rep("gene",nrow(cyan)))
- ## reorder
- mainCols <- c("ID","name","type","chr","start","end","strand")
- cyan <- cbind(cyan[,mainCols],cyan[,!colnames(cyan)%in%mainCols])
- ## add chromosomes!
- chr <- as.data.frame(matrix(NA, nrow=length(chrL), ncol=ncol(cyan)))
- colnames(chr) <- colnames(cyan)
- chr[,"chr"] <- 1:length(chrL)
- chr[,"start"] <- 1
- chr[,"end"] <- chrL
- chr[,"strand"] <- "."
- chr[,"ID"] <- 1:length(chrL)
- chr[,"name"] <- as.character(chrIdx[,1])
- chr[,"type"] <- "chromosome"
- cyan <- rbind(cyan,chr)
- ## * RENAME GENES!
- ## gyrA/gyrA:
- ## rename sll1941 to gyrA2 (48.61% AA identity to S.coccus PCC7942)
- ## keep slr0417 as gyrA (62.4% AA identity to S.coccus PCC7942)
- idx <- which(cyan[,"ID"]=="sll1941")
- cyan[idx,"name"] <- "gyrA2"
- ## WRITE-OUT TABLE FILE
- file.name <- file.path(cyano.path,"features_cyanobase.csv")
- write.table(cyan, file=file.name, sep="\t", row.names=F, quote=F, na="")
- ## EXPAND CIRCULAR GENES for overlap analyses and genomeBrowser
- ## keep color annotation!
- cyan.circ <- segmenTools::expandCircularFeatures(cyan, chrL, copyCols=TRUE)
- ## genomeBrowser Data!!
- columns <- c(ID="ID", name="name", type="type", chr="chr", strand="strand",
- start="start", end="end", color="CL_behle21_col")
- types <- c("gene","gene-circ2")
- out.path <- file.path(data.path,"pcc6803")
- odat <- list(ID="cyanobase",
- description="annotated genome features",
- data=cyan.circ,
- settings=list(type="plotFeatures",
- types=types,columns=columns,
- typord=FALSE,cuttypes=TRUE,names=TRUE,
- ylab="cyano-\nbase",hgt=.75,xaxis=FALSE))
- ## store data
- file.name <- file.path(out.path,paste(odat$ID,".RData",sep=""))
- save(odat,file=file.name)
- ## features with color gradient by log2-fold change in gyrA-kd
- ## features with color gradient by log2-fold change in topA-OX
- ## copy each feature
- columns["color"] <- "behle21_batch_color"
- cyx <- rbind(cyan.circ,
- cyan.circ,
- cyan.circ)
- cyx <- cbind(cyx,
- "behle21_batch_color"=c(cyan.circ$TOPA_color,
- cyan.circ$GyrA_color,
- cyan.circ$GyrB_color))
- cyx$type <- c(rep("TopA-OX",nrow(cyan.circ)),
- rep("GyrA-kd",nrow(cyan.circ)),
- rep("GyrB-kd",nrow(cyan.circ)))
- types <- unique(cyx$type)
- odat <- list(ID="behle21_batch",
- description="features by expression level in topoisomerase experiments, Behle et al. 2021",
- data=cyx,
- settings=list(type="plotFeatures",
- arrow = list(length=0, code=0, angle=0,
- pch=NA, lwd=2),
- types=types,columns=columns,
- typord=TRUE,cuttypes=FALSE,names=FALSE,
- ylab="",hgt=.25,xaxis=FALSE))
- ## store data
- file.name <- file.path(out.path,paste(odat$ID,".RData",sep=""))
- save(odat,file=file.name)