cyanobase.R | searchcode

/data/pcc6803/cyanobase.R

https://gitlab.com/raim/genomeBrowser
R | 324 lines | 210 code | 46 blank | 68 comment | 7 complexity | 54eb395bb73c7998701b50015f5baf25 MD5 | raw file


require(segmenTools)
require(readxl)

## load functions
browser.path <- sub("GENBRO=","",system("env|grep GENBRO",intern=TRUE))
data.path <- sub("GENDAT=","",system("env|grep GENDAT",intern=TRUE))
source(file.path(browser.path,"src/genomeBrowser_utils.R"))
## data path
cyano.path <- sub("PCC6803DAT=","",system("env|grep PCC6803DAT",intern=TRUE))

## working with IDs, set this to false!
op <- options(stringsAsFactors=FALSE)


### CYANOBASE ANNOTATION DATA
## NOTE: this assumes equal sequence lengths
## between cyanobase annotation and ncbi releases!
## checked for cyanobase 20170923 vs. .1 versions of NCBI

## TODO behle21 and kopf14 data are parsed here AFTER this
## file was required

## chromosome Length Index
chrIdx <- read.delim(file.path(cyano.path,"chromosomes","pcc6803.csv"),
                     header=TRUE,row.names=1)
chrL <- chrIdx[,2]
chrS <- segmenTools::getChrSum(chrL) # chromosome index
## mapping of different chromosome IDs
chrMap <- read.delim(file.path(browser.path,"data","pcc6803","parameters",
                               "chromosomemapping.txt"),
                     header=FALSE,row.names=1)

## parse basic gene annotation
file <- file.path(cyano.path,"originalData", paste("genes.txt",sep=""))
cyan <- read.delim(file,header=FALSE)
colnames(cyan) <- c("assembly","ID","chr","start","end","strand","function")
cyan[,"chr"] <- as.numeric(chrMap[cyan[,"chr"],1])
## correct negative coordinates of circular genes
## USING NCBI chromosome lengths!!
circ <- which(cyan[,"start"]<0)
cyan[circ,"start"] <- chrL[cyan[circ,"chr"]]+cyan[circ,"start"]
## rm unused columns
cyan <- cyan[,-1]

## NAMES

## use cyanobase GENE SYMBOLS as names/alias
genes <- read.delim(file.path(cyano.path,"originalData","gene_symbol.txt"),
                    header=FALSE)
cyan <- cbind.data.frame(cyan,
                         alias=genes[match(cyan[,"ID"],genes[,"V2"]),"V4"])
nms <- sub("/.*","",sub(",.*","",cyan[,"alias"]))
cyan <- cbind.data.frame(cyan, name=nms)

## use IDs where no name is present
cyan[is.na(cyan[,"name"]),"name"] <- cyan[is.na(cyan[,"name"]),"ID"]

## TODO: use first of ,- or /-separated lists
## store full names in "alias" or similar
## in annotation.RData - add kopf14 transcripts and cyanobase, but
## only for plotFeatures.R reference (rm redundant from plot types)


## add annotation data?
## record and write index file!
cat <- read.delim(file.path(cyano.path,"originalData","category.txt"),
                  header=FALSE)
cat <- apply(cat, 2, function(x) gsub("&amp;","&",gsub("&#x27;","'",x)))

catstr <- unlist(gsub(";$","",apply(cat[match(cyan[,"ID"],cat[,"V1"]),
                                        c("V3","V5")],1,
                                    paste, collapse=";")))
cyan <- cbind.data.frame(cyan, category=catstr)

## ADD UNIPROT GO
go.file <- file.path(cyano.path,"originalData","pcc6803_uniprot_go.txt")
go <- read.delim(go.file)
colnames(go) <- c("uniprot","cyanobaseID","go")
go$cyanobaseID <- gsub("syn:","",go$cyanobaseID)
## extract all GO terms & descriptions
got <- strsplit(go$go,";")
god <- got <- unique(unlist(got))
got <- sapply(got, function(x) gsub("\\]","",gsub(".*\\[","",x)))
god <- sapply(god, function(x) trimws(gsub("\\[.*","",x)))
names(god) <- names(got) <- NULL
god <- cbind(term=got, description=god)
god <- god[!duplicated(god[,1]),]
## WRITE OUT GO MAPPING
go.map <- file.path(cyano.path,"processedData","pcc6803_uniprot_go_terms.tsv")
write.table(god, file=go.map, quote=FALSE, row.names=FALSE, sep="\t")

## generate pure GO term column
got <- strsplit(go$go,";")
got <- lapply(got, function(x) gsub("\\]","",gsub(".*\\[","",x)))
got <- unlist(lapply(got, paste0, collapse=";"))
## expand genes
ggenes <- strsplit(go$cyanobaseID,";")
glen <- unlist(lapply(ggenes, length))
## loose two empty!!
rm <- which(glen==0)
go <- go[-rm,]
ggenes <- ggenes[-rm]
glen <- glen[-rm]
got <- got[-rm]
## expand multiple cyanobase genes
tmp <- sapply(1:length(glen), function(x) rep(got[x], glen[x]))
ok <- sum(unlist(lapply(tmp,length)) != unlist(lapply(ggenes,length)))
if ( ok>0 ) stop("wrong length of GO gene expansion")
## gene - go table!
allgo <- cbind.data.frame(ID=unlist(ggenes), GO=unlist(tmp))

##tmp <- merge(cyan, allgo, by="ID", all=TRUE)

cyan <- cbind.data.frame(cyan, GO=allgo$GO[match(cyan$ID, allgo$ID)])

## load lehmann14 transcriptome data and add clusterings and colors!
leh14 <- read.delim(file.path(cyano.path,"originalData",
                              "supp_gku641_SupportingDataFile2.csv"))
leh14cols <- c(CL_prakash09_sc="supercoiling_transcriptome",
               CL_leh14="diurnal_transcriptome",
               CL_leh14_at2="AT2_periodicity")
leh14dat <- leh14[match(cyan[,"ID"],leh14[,"ID"]),leh14cols]
colnames(leh14dat) <- names(leh14cols)
cyan <- cbind(cyan, leh14dat)
  
## TODO: load this from file!
diur.cls <- as.character(cyan[,"CL_leh14"])
cls.srt <- sort(as.numeric(unique(diur.cls)))
## set cluster colors
cls.col <- rep("gray", length(cls.srt))
names(cls.col) <- cls.srt
### main
cls.col[c("2")] <- "#8493CA" # pastel blue
cls.col[c("7")] <- "#0000FF"   # blue
cls.col[c("3")] <- "#00E6FF" # cyan"
cls.col[c("8")] <- "#FF0000"   # red
cls.col[c("6")] <- rgb(1,.8,0) # dark yellow
cls.col[c("1")] <- "#82CA9D" # pastel green
## background
cls.col[c("5")] <- "#626262" # gray: foreign genes?? transposons!
cls.col[c("4")] <- rgb(0,0,.6) # dark blue
cls.col[c("9")] <- "#6ECFF6" # pastel cyan
cls.col[c("10")]<- "#F7977A" # pastel red
## not present on array
cls.col[c("11")]<- "#000000" # black #"#E1E1E1" # light gray

ftcols <- cls.col[as.character(cyan[,"CL_leh14"])]
ftcols[is.na(ftcols)] <- "#000000"
cyan <- cbind(cyan,CL_leh14_col=ftcols)

## load saha16 transcriptome clustering (segmenTier/segmenTools)
load(file.path(cyano.path,"saha16","saha16.RData"))
cset <- cdat$cset
k <- selected(cset)
cls <- cset$clusters[,k]
cls.col <-  cset$colors[[k]]
ftcols <- cls.col[as.character(cls[cyan[,"ID"]])]
cyan <- cbind(cyan,
              CL_saha16=as.character(cls[cyan[,"ID"]]),
              CL_saha16_col=ftcols)

## load Zavrel et al. 2019 proteome clustering
zav <- read.delim(file.path(cyano.path,"processedData","zavrel19_proteome.tsv"))

## search gene IDs
zlst <- strsplit(zav$kegg,";")
zids <- lapply(zlst, function(x) {
    y <- c(match(x, cyan$ID))
    y[!is.na(y)]})
znum <- unlist(lapply(zids, length))

cat(paste(sum(znum==2), "proteins with multiple genes:\n",
          paste(zav[znum==2,"protein"], collapse=" "),"\n",
          paste(zav[znum==2,"gene"], collapse=" "),
          "\nonly counting first\n"))
zids <- unlist(lapply(zlst, function(x) x[1]))

length(match(zids, cyan$ID))

cyan <- cbind(cyan, zav[match(cyan$ID, zids),
                        c("CL_zavrel19","CL_zavrel19_super")])

## load Behle et al. 2021 topA-OX clusters and colors
## TODO: REPLACE THIS BY SUPPLEMENT OF THE PAPER (add to download list!)
behle21.file <- "~/work/CoilHack/experiments/RNAseq/20201204_RM_topA/analysis/genome_0/20201204_RM_topA_results.tsv"
behle21.batch <- "~/work/CoilHack/experiments/RNAseq/20200602_WM_coilhack_endpoint/analysis_4/20200602_WM_coilhack_endpoint_results.tsv"
if ( file.exists(behle21.file) ) {
    b21 <- read.delim(behle21.file)
    b21 <- b21[match(cyan[,"ID"], b21$cyanobaseID),]
    b21b <- read.delim(behle21.batch)
    b21b <- b21b[match(cyan[,"ID"], b21b$cyanobaseID),]

    cyan <- cbind(cyan, b21[,c("CL_behle21","CL_behle21_col")])

    ## gray scale for log2 FC
    num2col <- function(x,n=100, limits, pal, colf=viridis::viridis){
        if ( missing(pal) ) pal <- colf(n)
        if ( missing(limits) ) limits <- range(x, na.rm=TRUE)
        pal[findInterval(x,seq(limits[1],limits[2],
                               length.out=length(pal)+1), all.inside=TRUE)]
    }
    cyan <- cbind(cyan, b21b[,grep("log2FoldChange",colnames(b21b))])
    topa.col <- num2col(cyan$TOPA_log2FoldChange, limits=c(-1,1))
    gyrb.col <- num2col(cyan$GyrB_log2FoldChange, limits=c(-2,2))
    gyra.col <- num2col(cyan$GyrA_log2FoldChange, limits=c(-2,2))
    cyan <- cbind(cyan, TOPA_color=topa.col,
                  GyrA_color=gyra.col, GyrB_color=gyrb.col)
}

## add kopf14 TU and add CL_kopf14, if it exists already,
## TODO: generate kopf14 file in preprocessing step!
kopf14.file <-  file.path(cyano.path,"processedData","kopf14_tu_genes.tsv")
if ( file.exists(kopf14.file) ) {
    kpf <- read.delim(kopf14.file)
    kpf <- kpf[match(cyan[,"ID"], kpf$gene),]
    cyan <- cbind(cyan, TU_kopf14=kpf[,"TU.ID"],
                  CL_kopf14=kpf[,"Max.cond."])
}

## add Koskinen et al. 2016 transcriptome data
kosk16.file <- file.path(cyano.path,"originalData",
                         "MMI_13214_supp-0001-Dataset_S1.xlsx")
kosk16 <- readxl::read_xlsx(kosk16.file, sheet=1, skip=1)
kosk16 <- kosk16[match(cyan[,"ID"], kosk16$`Gene ID`),2:3]
colnames(kosk16) <- paste0("sigBCDE_",gsub(" ","",colnames(kosk16)))
cyan <- cbind(cyan, kosk16)

## FUNCTIONAL CATEGORIES
## extract categories and store index!
maincat <- cbind(unique(cat[,3]), unique(cat[,4]))
tmp <- unique(cat[,5])
tmp <- tmp[tmp!=""]
subcat <- cbind(tmp,rep(NA,length(tmp)))
for ( i in 1:length(tmp) ) 
    subcat[i,2] <- c(cat[which(cat[,5]==tmp[i])[1],6])
catidx <- rbind(maincat,subcat)
catidx <- catidx[order(catidx[,1]),]
file.name <- file.path(cyano.path,"processedData","annotation_index.txt")
write.table(catidx, file=file.name, quote=F, col.names=F, row.names=F,sep="\t")

## GO TERMS - TODO
#go <- read.delim(file.path(cyano.path,"originalData","goterm.txt"),header=FALSE)

## add type column
cyan <- cbind(cyan,type=rep("gene",nrow(cyan)))

## reorder
mainCols <- c("ID","name","type","chr","start","end","strand")
cyan <- cbind(cyan[,mainCols],cyan[,!colnames(cyan)%in%mainCols])

## add chromosomes!
chr <- as.data.frame(matrix(NA, nrow=length(chrL), ncol=ncol(cyan)))
colnames(chr) <- colnames(cyan)
chr[,"chr"] <- 1:length(chrL)
chr[,"start"] <- 1
chr[,"end"] <- chrL
chr[,"strand"] <- "."
chr[,"ID"] <- 1:length(chrL)
chr[,"name"] <- as.character(chrIdx[,1])
chr[,"type"] <- "chromosome"

cyan <- rbind(cyan,chr)

## * RENAME GENES!
## gyrA/gyrA:
## rename sll1941 to gyrA2 (48.61% AA identity to S.coccus PCC7942)
## keep slr0417 as gyrA (62.4% AA identity to S.coccus PCC7942)
idx <- which(cyan[,"ID"]=="sll1941")
cyan[idx,"name"] <- "gyrA2"

## WRITE-OUT TABLE FILE
file.name <- file.path(cyano.path,"features_cyanobase.csv")
write.table(cyan, file=file.name, sep="\t", row.names=F, quote=F, na="")

## EXPAND CIRCULAR GENES for overlap analyses and genomeBrowser
## keep color annotation!
cyan.circ <- segmenTools::expandCircularFeatures(cyan, chrL, copyCols=TRUE)


## genomeBrowser Data!!
columns <- c(ID="ID", name="name", type="type", chr="chr", strand="strand",
             start="start", end="end", color="CL_behle21_col")
types <- c("gene","gene-circ2")
out.path <- file.path(data.path,"pcc6803")
odat <- list(ID="cyanobase",
             description="annotated genome features",
             data=cyan.circ,
             settings=list(type="plotFeatures",
               types=types,columns=columns,
               typord=FALSE,cuttypes=TRUE,names=TRUE,
               ylab="cyano-\nbase",hgt=.75,xaxis=FALSE))
## store data
file.name <- file.path(out.path,paste(odat$ID,".RData",sep=""))
save(odat,file=file.name)

## features with color gradient by log2-fold change in gyrA-kd
## features with color gradient by log2-fold change in topA-OX
## copy each feature

columns["color"] <- "behle21_batch_color"
cyx <- rbind(cyan.circ,
             cyan.circ,
             cyan.circ)
cyx <- cbind(cyx,
             "behle21_batch_color"=c(cyan.circ$TOPA_color,
                                     cyan.circ$GyrA_color,
                                     cyan.circ$GyrB_color))
cyx$type <- c(rep("TopA-OX",nrow(cyan.circ)),
              rep("GyrA-kd",nrow(cyan.circ)),
              rep("GyrB-kd",nrow(cyan.circ)))
types <- unique(cyx$type)
odat <- list(ID="behle21_batch",
             description="features by expression level in topoisomerase experiments, Behle et al. 2021",
             data=cyx,
             settings=list(type="plotFeatures",
                           arrow = list(length=0, code=0, angle=0,
                                        pch=NA, lwd=2),
                           types=types,columns=columns,
                           typord=TRUE,cuttypes=FALSE,names=FALSE,
                           ylab="",hgt=.25,xaxis=FALSE))
## store data
file.name <- file.path(out.path,paste(odat$ID,".RData",sep=""))
save(odat,file=file.name)