/gqData/inst/extdata/microarray_annotations_library_sources/trace.r
R | 933 lines | 545 code | 204 blank | 184 comment | 1 complexity | 7a896b0a1efc839816bb189963b9d317 MD5 | raw file
- library(GEOquery)
- library(limma)
- library("org.Hs.eg.db")
- library("org.Mm.eg.db")
- library(multicore)
- library(gdata)
- library(oligo)
- library(RCurl)
- library(ExiMiR) # for Exiqon
- library(LVSmiRNA) # for Agilent
- library(AgiMicroRna) # for Agilent
- library(miRNApath) # for pathway analysis
- library(RmiR) # working with mirna in genral
- library(microRNA) # working with mirna in genral
- library(charm)
- library(magrittr)
- library(data.table)
# Keep strings as character vectors in every data frame built in this script.
options(stringsAsFactors = FALSE)

## Lookup tables shared by some of the platform sections below
# Human: gene symbol -> Entrez gene id, restricted to the mapped symbols.
x.human <- org.Hs.egSYMBOL2EG
mapped_genes.human <- mappedkeys(x.human)
xx.human <- as.list(x.human[mapped_genes.human])
# Mouse: symbol -> Entrez (reverse of the SYMBOL map), Entrez -> symbol,
# and RefSeq accession -> Entrez, each materialised as a plain named list.
mm.s2eg <- as.list(revmap(org.Mm.egSYMBOL))
mm.eg2s <- as.list(org.Mm.egSYMBOL)
mm.refseq2eg <- as.list(org.Mm.egREFSEQ2EG)
- ############################################################################################################
- ############################################################################################################
- # Expression Arrays
- ############################################################################################################
- ############################################################################################################
## Agilent_017942_D_F_20071024_AK_12_MRD027-1_US09473739_251794210018_S01_GE1_107_Sep09_1_4.txt
## Custom design from Geraldine Butler
# The chip design is taken from a data file: columns 7 and 8 are kept
# (they are used below as ProbeName / SystematicName).
fn <- "Agilent_017942_D_F_20071024_AK_12_MRD027-1_US09473739_251794210018_S01_GE1_107_Sep09_1_4.txt"
f <- read.delim(fn, skip = 9, comment.char = "", quote = "")
f <- unique(f[, c(7, 8)])
# Strip the "gb_" prefix so the systematic name can be matched to CGD features.
f$parsedCgdFeature <- gsub("^gb_", "", f$SystematicName)
#f = f[1:(nrow(f)-1),]

# Annotations from CGD (header-less table; column names are assigned by hand)
fn <- "http://www.candidagenome.org/download/chromosomal_feature_files/C_albicans_SC5314/C_albicans_SC5314_A21_current_chromosomal_feature.tab"
ann <- read.delim(fn, skip = 8, comment.char = "", quote = "", header = FALSE)
colnames(ann) <- c(
  "Feature name",
  "Gene name",
  "Aliases",
  "Feature type",
  "Chromosome",
  "Start Coordinate",
  "Stop Coordinate",
  "Strand",
  "Primary CGDID",
  "Secondary CGDID",
  "Description",
  "Date Created",
  "Sequence Coordinate Version Date (if any)",
  "Blank",
  "Blank2",
  "Date of gene name reservation",
  "Has the reserved gene name become the standard name",
  "Name of S. cerevisiae ortholog(s)"
)
rownames(ann) <- ann[["Feature name"]]

# Join probes to CGD features (left join on the parsed feature name).
# The table() call is a diagnostic: how many probes have a CGD match.
table(f$parsedCgdFeature %in% rownames(ann))
missing <- setdiff(f$parsedCgdFeature, rownames(ann))
fdata <- merge(
  f, ann,
  by.x = "parsedCgdFeature", by.y = 0,
  all.x = TRUE, all.y = FALSE
)

# Controlled column names first, then every merged column.
fdata <- data.frame(
  ProbeID = fdata[["ProbeName"]],
  SYMBOL = fdata[["Gene name"]],
  fdata,
  row.names = fdata[["ProbeName"]],
  check.names = FALSE
)
# Probes without a gene name fall back to the parsed CGD feature name.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$parsedCgdFeature[no_symbol]
#fdata$ENTREZID[fdata$ENTREZID==''] = NA # set empty entrez id as NA
#fdata[['GO_ID']] = NULL # this is useless anyway
save(fdata, file = "../../../data/Agilent017942.RData")
## Agilent gene-expression arrays: 028279 (rat), 028004 (human), 028005 (mouse),
## 039494 (human v2) and 026440 (pig). All five share one pipeline, so it is
## factored into two helpers instead of five copies.

# Read an Agilent "GEO_<design>_D_GEO_<date>.txt" design file.
#   path: design file to read.
#   skip: number of leading lines to skip before the column header.
# Returns the unique rows of columns 4:20, after dropping the last row of the
# file (presumably a trailer line -- TODO confirm against the file format).
# quote = "" and comment.char = "" stop '"' and '#' characters inside the
# annotation text from being interpreted by the parser; the 039494 file used
# to be read without them, unlike its siblings.
read_agilent_design <- function(path, skip = 42) {
  tab <- read.delim(
    path,
    skip = skip, comment.char = "", quote = "",
    stringsAsFactors = FALSE
  )
  tab <- head(tab, -1L)
  unique(tab[, 4:20])
}

# Put the controlled columns (ProbeID, SYMBOL, ENTREZID, Control, Chromosome)
# in front of an Agilent annotation table.
#   tab:        annotation table (design file or GEO platform table).
#   probe_col:  column holding the probe identifier (also used as row names).
#   entrez_col: column holding the Entrez gene id.
# Returns the standardized data frame with the GO_ID column removed.
standardize_agilent_fdata <- function(tab, probe_col = "NAME",
                                      entrez_col = "LOCUSLINK_ID") {
  out <- data.frame(
    ProbeID = tab[[probe_col]],
    SYMBOL = tab[["GENE_SYMBOL"]],
    ENTREZID = tab[[entrez_col]],
    Control = tab[["CONTROL_TYPE"]],
    # keep only the sequence name, i.e. everything before the first ':'
    Chromosome = gsub(":.*", "", tab[["CHROMOSOMAL_LOCATION"]]),
    tab,
    row.names = tab[[probe_col]],
    stringsAsFactors = FALSE
  )
  # Probes without a symbol are labelled with their probe id.
  out$SYMBOL[out$SYMBOL == ""] <- NA
  no_symbol <- is.na(out$SYMBOL)
  out$SYMBOL[no_symbol] <- out$ProbeID[no_symbol]
  out$ENTREZID[out$ENTREZID == ""] <- NA # set empty entrez id as NA
  out[["GO_ID"]] <- NULL # this is useless anyway
  out
}

## Agilent-028279 SurePrint G3 Rat GE 8x60K Microarray GEO_028279_D_GEO_20130204.txt
# http://www.genomics.agilent.com/CollectionSubpage.aspx?PageType=Product&SubPageType=ProductData&PageID=1524
fdata <- standardize_agilent_fdata(
  read_agilent_design("GEO_028279_D_GEO_20130204.txt")
)
save(fdata, file = "../../../data/Agilent028279.RData")

## Agilent-028004 SurePrint G3 Human Gene Expression 8x60K Microarray Kit
# http://www.genomics.agilent.com/CollectionSubpage.aspx?PageType=Product&SubPageType=ProductData&PageID=1516
fdata <- standardize_agilent_fdata(
  read_agilent_design("GEO_028004_D_GEO_20120411.txt")
)
save(fdata, file = "../../../data/Agilent028004.RData")

## Agilent-028005 SurePrint G3 Mouse GE 8x60K Microarray Sat 7 Jul 06:27:30 2012
# This one comes from the GEO platform record, whose probe and Entrez columns
# are named ID and GENE.
gpl <- "GPL10787"
fdata <- standardize_agilent_fdata(
  Table(getGEO(gpl)),
  probe_col = "ID", entrez_col = "GENE"
)
save(fdata, file = "../../../data/Agilent028005.RData")

# Agilent-039494 SurePrint G3 Human GE v2 8x60K Microarray Wed 15 Aug 17:35:19 2012
# GEO_039494_D_GEO_20120628.txt
# Name: SurePrint G3 Human GE v2 8x60K Microarray
# Design ID: 039494
# Design Format: 8 x 60 K
# Control Grid: IS-62976-8-V2_60kby8_GX_EQC_201000210
# Build Version: hg19:GRCh37:Feb2009
fdata <- standardize_agilent_fdata(
  read_agilent_design("GEO_039494_D_GEO_20120628.txt")
)
save(fdata, file = "../../../data/Agilent039494.RData")

## Agilent-026440 Wed 10 Oct 15:42:34 2012
# Name: S. scrofa (Pig) Oligo Microarray v2
# Design ID: 026440
# Design Format: 4 X 44K
# Control Grid: IS-45220-4-V1_4x44K_GX_EQC_V20060608
# Build Version: Not Applicable
# Original author's note: the file is broken, hence quote = "" / comment.char = "".
fdata <- standardize_agilent_fdata(
  read_agilent_design("GEO_026440_D_GEO_20120509.txt")
)
save(fdata, file = "../../../data/Agilent026440.RData")
## Affymetrix [Mouse430_2] Affymetrix Mouse Genome 430 2.0 Array
# Annotation is taken from the GEO platform record.
# Three leftover miRNA debugging statements were removed here: they referenced
# mirbase.db objects, but that package is never attached in this script, so
# they stopped the run before the table was built.
gpl <- "GPL1261"
temp <- Table(getGEO(gpl))
fdata <- temp
fdata <- data.frame(
  ProbeID = fdata[["ID"]],
  SYMBOL = fdata[["Gene Symbol"]],
  ENTREZID = fdata[["ENTREZ_GENE_ID"]],
  Control = "",
  Chromosome = "",
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Probes without a gene symbol are labelled with their probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$ProbeID[no_symbol]
fdata <- fdata[, 1:18] # remove GO (keeps the first 18 columns only)
save(fdata, file = "../../../data/Affymetrix_Mouse430_2.RData")
## Affymetrix 3p IVT Yeast Genome 2.0 Array
# Combines the Bioconductor yeast2.db annotation (GENENAME, ORF) with the
# vendor annotation CSV, keyed by probe set id.
library(yeast2.db)
options(stringsAsFactors = FALSE)

bioc.fdata <- select(
  yeast2.db,
  keys = keys(yeast2.db),
  columns = c("GENENAME", "ORF")
)
rownames(bioc.fdata) <- bioc.fdata[["PROBEID"]]

fn <- "Yeast_2.na34.annot.csv"
fdata <- read.csv(fn, comment.char = "#", check.names = FALSE)
probe_set_ids <- fdata[["Probe Set ID"]]
fdata <- data.frame(
  ProbeID = probe_set_ids,
  SYMBOL = fdata[["Gene Symbol"]],
  ENTREZID = fdata[["Entrez Gene"]],
  bioc.fdata[probe_set_ids, ], # Bioconductor rows aligned to the CSV order
  fdata,
  row.names = probe_set_ids,
  check.names = FALSE
)

# Multi-gene probe sets ("A /// B ...") are shortened to "A and others";
# "---" means "no symbol" and falls back to the probe id.
fdata$SYMBOL <- gsub("///.*", "and others", fdata$SYMBOL)
fdata$SYMBOL[fdata$SYMBOL == "---"] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$ProbeID[no_symbol]
save(fdata, file = "../../../data/Affymetrix_Yeast_2.RData")
## Illumina mouse WG-6 Mon 11 Feb 11:02:51 2013
# Note: the probe annotation of "MouseWG-6_V2_0_R3_11278593_A.bgx" is
# extracted from an actual SampleProbeProfile export.
fdata <- read.delim(
  "MouseWG-6_V2_0_R3_11278593_A.bgx_SampleProbeProfile_20130118.txt",
  check.names = FALSE, comment.char = "", quote = "",
  colClasses = "character"
)
# Drop the columns whose name starts with "MHA"
# (presumably the per-sample columns of the profile -- verify against the file).
is_mha_col <- grepl("^MHA", colnames(fdata))
fdata <- fdata[, !is_mha_col]
rownames(fdata) <- fdata$ProbeID

# ENTREZID is a copy of ENTREZ_GENE_ID with empty strings turned into NA.
entrez <- fdata[["ENTREZ_GENE_ID"]]
entrez[entrez == ""] <- NA
fdata[["ENTREZID"]] <- entrez

# Controlled columns first, every other column after them in original order.
lead_cols <- c("ProbeID", "SYMBOL", "ENTREZID")
fdata <- fdata[, c(lead_cols, setdiff(colnames(fdata), lead_cols))]
fdata <- fdata[, !grepl("ONTOLOGY_", colnames(fdata))]
save(fdata, file = "../../../data/MouseWG-6_V2_0_R3_11278593_A.RData")
## Illumina HT12 from example sample probe report Sat 23 Mar 22:07:41 2013
library("org.Hs.eg.db")
fdata <- read.delim(
  "HumanHT-12_v4_sample_probe_report.txt",
  check.names = FALSE, comment.char = "", quote = "",
  colClasses = "character", skip = 3
)
# Keep the first 22 columns, minus the ontology ones.
fdata <- fdata[, c(1:22)]
fdata <- fdata[, !grepl("ONTOLOGY", colnames(fdata))]
rownames(fdata) <- fdata$ProbeID

# Probes without a symbol are labelled with their probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$ProbeID[no_symbol]

# Entrez ids are looked up from the symbol. mget() returns one list element
# per symbol, and an element may hold several ids. Collapsing each element
# with ';' keeps one string per probe; as.character() on the list would turn
# multi-id entries into deparsed 'c("1", "2")' text.
fdata[["ENTREZID"]] <- NA
has_entrez <- fdata$SYMBOL %in% keys(org.Hs.egSYMBOL2EG)
entrez_hits <- mget(fdata$SYMBOL[has_entrez], org.Hs.egSYMBOL2EG)
fdata[["ENTREZID"]][has_entrez] <- vapply(
  entrez_hits,
  function(ids) paste(unique(ids), collapse = ";"),
  character(1),
  USE.NAMES = FALSE
)
save(fdata, file = "../../../data/HumanHT-12_v4.RData")
## Mouse Gene 1.0 ST Array : MoGene-1_0-st-v1
# source("http://bioconductor.org/biocLite.R")
# biocLite("mogene10sttranscriptcluster.db")
library(mogene10sttranscriptcluster.db) # ls("package:mogene10sttranscriptcluster.db")
options(stringsAsFactors = FALSE)

gpl <- "GPL6246"
fdata <- Table(getGEO(gpl))
# Discard the transcript clusters GEO flags as unmapped.
fdata <- fdata[fdata[["SPOT_ID"]] != "Not currently mapped to latest genome", ]

# Attempt at a symbol: second ' // '-separated field of gene_assignment.
assignment_fields <- strsplit(fdata[["gene_assignment"]], split = " // ")
fdata[["GeneSymbol"]] <- sapply(assignment_fields, function(z) z[2])

# SYMBOL / ENTREZID / GENENAME come from the Bioconductor chip package,
# looked up by transcript cluster id.
cluster_ids <- fdata[["ID"]]
fdata <- data.frame(
  ProbeID = cluster_ids,
  SYMBOL = unlist(mget(cluster_ids, mogene10sttranscriptclusterSYMBOL)),
  ENTREZID = unlist(mget(cluster_ids, mogene10sttranscriptclusterENTREZID)),
  GENENAME = unlist(mget(cluster_ids, mogene10sttranscriptclusterGENENAME)),
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Clusters without a symbol are labelled with their probe id.
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata[["ProbeID"]][no_symbol]
save(fdata, file = "../../../data/Affymetrix_MoGene-1_0-st-v1.RData")
# TODO: use Bioc library to add a few things...
# [RaGene-2_0-st] Affymetrix Rat Gene 2.0 ST Array
library(ragene20sttranscriptcluster.db)

gpl <- "GPL17117"
fdata <- Table(getGEO(gpl))
# Keep only the transcript clusters known to the Bioconductor chip package.
known_cluster <- fdata$ID %in% keys(ragene20sttranscriptclusterSYMBOL)
fdata <- fdata[known_cluster, ]

# Attempt at a symbol: second ' // '-separated field of gene_assignment.
assignment_fields <- strsplit(fdata[["gene_assignment"]], split = " // ")
fdata[["GeneSymbol"]] <- sapply(assignment_fields, function(z) z[2])

cluster_ids <- fdata[["ID"]]
fdata <- data.frame(
  ProbeID = cluster_ids,
  SYMBOL = unlist(mget(cluster_ids, ragene20sttranscriptclusterSYMBOL)),
  ENTREZID = unlist(mget(cluster_ids, ragene20sttranscriptclusterENTREZID)),
  GENENAME = unlist(mget(cluster_ids, ragene20sttranscriptclusterGENENAME)),
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Clusters without a symbol are labelled with their probe id.
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata[["ProbeID"]][no_symbol]
# Drop the GO_* columns.
fdata <- fdata[, !grepl("^GO_", colnames(fdata))]
save(fdata, file = "../../../data/Affymetrix_RaGene-2_0-st.RData")
#raw = oligo::rma(oligo::read.celfiles(list.files("~/Dropbox/projects/2013/gsebire_Sebire001_PRJBFX-482/chips",full.names=TRUE)))
# Options: use bioc annot, use annot from probe profile, use annot from switchtoi
# [MoGene-2_0-st] Affymetrix Mouse Gene 2.0 ST Array [transcript (gene) version]
library(mogene20sttranscriptcluster.db)

gpl <- "GPL16570"
fdata <- Table(getGEO(gpl))
# Keep only the transcript clusters known to the Bioconductor chip package.
known_cluster <- fdata$ID %in% keys(mogene20sttranscriptclusterSYMBOL)
fdata <- fdata[known_cluster, ]
# Diagnostic: do GEO and the chip package cover exactly the same clusters?
setequal(fdata$ID, keys(mogene20sttranscriptclusterSYMBOL))

# Attempt at a symbol: second ' // '-separated field of gene_assignment.
assignment_fields <- strsplit(fdata[["gene_assignment"]], split = " // ")
fdata[["GeneSymbol"]] <- sapply(assignment_fields, function(z) z[2])

cluster_ids <- fdata[["ID"]]
fdata <- data.frame(
  ProbeID = cluster_ids,
  SYMBOL = unlist(mget(cluster_ids, mogene20sttranscriptclusterSYMBOL)),
  ENTREZID = unlist(mget(cluster_ids, mogene20sttranscriptclusterENTREZID)),
  GENENAME = unlist(mget(cluster_ids, mogene20sttranscriptclusterGENENAME)),
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Clusters without a symbol are labelled with their probe id.
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata[["ProbeID"]][no_symbol]
# Drop the GO_* columns.
fdata <- fdata[, !grepl("^GO_", colnames(fdata))]
save(fdata, file = "../../../data/Affymetrix_MoGene-2_0-st.RData")
# [HuGene-2_0-st] Affymetrix Human Gene 2.0 ST Array [transcript (gene) version]
# Built from the vendor transcript CSV, restricted to and enriched with the
# Bioconductor chip package annotation.
library(hugene20sttranscriptcluster.db)
#fn = "HuGene-2_0-st-v1.na33.2.hg19.transcript.csv"
fn <- "HuGene-2_0-st-v1.na34.hg19.transcript.csv"
fdata <- read.csv(fn, skip = 23, comment.char = "", colClasses = "character")
rownames(fdata) <- fdata$transcript_cluster_id

# Keep only the transcript clusters known to the chip package.
known_cluster <- rownames(fdata) %in% keys(hugene20sttranscriptclusterSYMBOL)
fdata <- fdata[known_cluster, ]
# Diagnostic: do the CSV and the chip package cover exactly the same clusters?
setequal(rownames(fdata), keys(hugene20sttranscriptclusterSYMBOL))

cluster_ids <- rownames(fdata)
fdata <- data.frame(
  ProbeID = cluster_ids,
  SYMBOL = unlist(mget(cluster_ids, hugene20sttranscriptclusterSYMBOL)),
  ENTREZID = unlist(mget(cluster_ids, hugene20sttranscriptclusterENTREZID)),
  GENENAME = unlist(mget(cluster_ids, hugene20sttranscriptclusterGENENAME)),
  fdata,
  row.names = cluster_ids,
  check.names = FALSE
)

# Fields 2 and 4 of the ' // '-separated gene_assignment are used as an
# attempt at a symbol and a cytoband respectively.
assignment_fields <- strsplit(fdata[["gene_assignment"]], split = " // ")
fdata[["GeneSymbol"]] <- sapply(assignment_fields, function(z) z[2])
fdata[["Cytoband"]] <- sapply(assignment_fields, function(z) z[4])

# Symbol fallbacks: chip package first, then the CSV symbol, then the probe id.
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata[["GeneSymbol"]][no_symbol]
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata[["ProbeID"]][no_symbol]
fdata <- fdata[, !grepl("^GO_", colnames(fdata))]
save(fdata, file = "../../../data/Affymetrix_HuGene-2_0-st.RData")
- ############################################################################################################
- ############################################################################################################
- # Clariom
- ############################################################################################################
- ############################################################################################################
# Clariom arrays: S Mouse HT, S Mouse, D Human, S Human, S Human HT.
# All five used the same copy-pasted code, so the shared part lives in one
# helper; only the input CSV and the output .RData path differ per array.
library(data.table)
library(stringr)
library(magrittr)

# Build the fdata annotation table from a "<array>.transcript.csv" file
# (the original comments say these files were downloaded from NetAffx / Affymetrix).
#   csv_path: path of the transcript annotation CSV.
# Returns a data frame indexed by transcript_cluster_id, with ProbeID, SYMBOL,
# ENTREZID and GeneName in front of the retained CSV columns.
build_clariom_fdata <- function(csv_path) {
  keep_cols <- c( # columns to keep!
    "transcript_cluster_id",
    "probeset_id",
    "seqname",
    "strand",
    "start",
    "stop",
    "total_probes",
    "gene_assignment",
    "category",
    "locus type",
    "notes"
  )
  tab <- as.data.frame(fread(csv_path))
  rownames(tab) <- tab[["transcript_cluster_id"]]
  tab <- tab[, keep_cols]

  # gene_assignment is a ' /// '-separated list of transcripts, each a
  # ' // '-separated record. Splitting on both separators at once flattens it,
  # and fields 2, 3 and 5 of the flattened vector are used as symbol, gene
  # name and Entrez id. The split is done once and reused for all three.
  fields <- str_split(tab[["gene_assignment"]], "( /// | // )")
  nth_field <- function(i) vapply(fields, function(x) x[i], character(1))

  out <- data.frame(
    ProbeID = rownames(tab),
    SYMBOL = nth_field(2),
    ENTREZID = nth_field(5),
    GeneName = nth_field(3),
    tab,
    row.names = rownames(tab),
    check.names = FALSE
  )
  # Clusters without a symbol are labelled with their probe id
  # (original note on this step: "warnings are expected").
  no_symbol <- is.na(out$SYMBOL)
  out$SYMBOL[no_symbol] <- out[["ProbeID"]][no_symbol]

  # Keep only the first transcript of gene_assignment and say so in the name.
  out[["gene_assignment"]] <- vapply(
    str_split(out[["gene_assignment"]], " /// "),
    function(x) x[1],
    character(1)
  )
  colnames(out) <- gsub(
    "^gene_assignment$", "gene_assignment (first transcript only)",
    colnames(out)
  )
  out
}

# Clariom_S_Mouse_HT
fdata <- build_clariom_fdata("/Users/flefebvr/Dropbox/projects/2016/skimmins_kimmins006_PRJBFX-1415/Clariom_S_Mouse_HT.na36.mm10.transcript.csv")
save(fdata, file = "../../../data/Affymetrix_Clariom_S_Mouse_HT.RData")
# compare with Eloi's file
# x = fread("/Users/flefebvr/Dropbox/projects/2016/skimmins_kimmins006_PRJBFX-1415/kimmins006.summary.SST-RMA-GENE-FULL.txt") %>% as.data.frame
# rownames(x) = x[,1]
# setequal(rownames(x),rownames(fdata))

# Clariom_S_Mouse
fdata <- build_clariom_fdata("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_S_Mouse.na36.mm10.transcript.csv")
save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_S_Mouse.RData")
# compare with Eloi's file
# x = fread("/Users/emercier/Workdir/Boerboom008_PRJBFX-1419/data/boerboom008.summary.SST-RMA-GENE-FULL.txt") %>% as.data.frame
# rownames(x) = x[,1]
# setequal(rownames(x),rownames(fdata))

# Clariom D human
fdata <- build_clariom_fdata("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_D_Human.na36.hg38.transcript.csv")
save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_D_Human.RData")

# Clariom S human
fdata <- build_clariom_fdata("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_S_Human.na36.hg38.transcript.csv")
save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_S_Human.RData")

# Clariom S human HT (original note: exactly the same as Clariom S)
fdata <- build_clariom_fdata("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_S_Human_HT.na36.hg38.transcript.csv")
save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_S_Human_HT.RData")
- ############################################################################################################
- ############################################################################################################
- # 450k Methylation arrays
- ############################################################################################################
- ############################################################################################################
## Illumina 450k Fri 23 Nov 11:05:37 2012 . Illumina Inc. updated this Nov 20 2012
# mclapply() is called through the 'parallel' namespace (shipped with R)
# rather than relying on the defunct 'multicore' package being attached.
gpl <- "GPL13534"
temp <- Table(getGEO(gpl))
fdata <- temp
fdata <- data.frame(
  ProbeID = fdata[["ID"]],
  SYMBOL = NA,
  ENTREZID = NA,
  Control = "",
  Chromosome = fdata[["CHR"]],
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)

# Define symbols: unique UCSC RefGene names per probe, joined with ';'.
sy <- strsplit(fdata$UCSC_RefGene_Name, split = ";")
fdata$SYMBOL <- unlist(parallel::mclapply(
  sy,
  function(x) paste(unique(x), collapse = ";"),
  mc.cores = 8
))
# Probes without a symbol are labelled with their probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$ProbeID[no_symbol]

# Define entrez ids: map each probe's RefSeq accessions through
# org.Hs.egREFSEQ2EG and join the unique ids with ';'.
eg <- strsplit(fdata$UCSC_RefGene_Accession, split = ";")
eg <- parallel::mclapply(eg, unique, mc.cores = 8)
# Dictionary limited to the accessions that occur on the array and are known
# to the annotation package.
dict <- unique(unlist(eg))
dict <- dict[dict %in% names(as.list(org.Hs.egREFSEQ2EG))]
dict <- as.list(org.Hs.egREFSEQ2EG[dict])
gc()
# Original note: should take 5 minutes if not swapping.
eg <- parallel::mclapply(
  eg,
  function(accs) unlist(dict[accs]), # accessions missing from dict are dropped
  mc.cores = 4
)
fdata$ENTREZID <- unlist(parallel::mclapply(
  eg,
  function(x) paste(unique(x), collapse = ";"),
  mc.cores = 8
))
fdata$ENTREZID[fdata$ENTREZID == ""] <- NA
gc()
save(fdata, file = "../../../data/IlluminaHumanMethylation450.RData")
- ############################################################################################################
- ############################################################################################################
- # Agilent CpG Island Arrays
- ############################################################################################################
- ############################################################################################################
## Agilent-015279 Mouse CpG Island ChIP-on-Chip Microarray 2x105K (
library(BSgenome.Mmusculus.UCSC.mm9) # this will load object Mmusculus
bs.genome <- Mmusculus # only used by the commented-out sequence code below

# na.strings is set to a string that never occurs, so nothing is converted to
# NA on read; literal "NA" ids are filtered out explicitly a few lines down.
f <- read.delim(
  "GEO_015279_D_GEO_20111102.txt",
  skip = 34, comment.char = "", quote = "",
  na.strings = "fbnjwknfwjfw"
)
f <- head(f, -1L) # drop the last row of the file
f <- unique(f[, 4:ncol(f)])
f[["ID"]] <- f[["SPOT_ID"]]
f <- f[!grepl("^NA\\.", f$ID), ]
f <- f[f$ID != "NA", ]
fdata <- f

# CHROMOSOMAL_LOCATION is split on ':' and '-'. Fields 2 and 3 become the
# numeric start and end, and the part before the first ':' is the sequence.
loc_fields <- strsplit(fdata[["CHROMOSOMAL_LOCATION"]], split = "(:|-)")
loc_field <- function(i) {
  as.numeric(vapply(loc_fields, function(z) z[i], character(1)))
}
fdata <- data.frame(
  ProbeID = fdata[["ID"]],
  SYMBOL = "", # fdata[['GENE_SYMBOL']]
  ENTREZID = "",
  Control = fdata[["CONTROL_TYPE"]],
  Chromosome = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  seq = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  start = loc_field(2),
  end = loc_field(3),
  SEQUENCE = NA,
  cpg.density = NA, # to be tuned according to fragment size... which seems to be between 200 and 1000 bp
  cpg.density.windowSize = 1000,
  gc.count = NA,
  #,'LOCATION' = fdata[['CHROMOSOMAL_LOCATION']]
  #,'GENOME' = 'mm9'
  #,'ISLAND_LOCATION' = fdata[['DESCRIPTION']]
  fdata,
  row.names = fdata$ID
)

# define SYMBOL
# SYMBOL should be assigned through GB_ACC instead, their column is odd.
fdata$SYMBOL <- fdata$GENE_SYMBOL
empty_symbol <- fdata$SYMBOL == ""
fdata$SYMBOL[empty_symbol] <- fdata$ID[empty_symbol]
#fdata$SYMBOL[is.na(fdata$SYMBOL)] = fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes

# define ENTREZID
# mget() returns one list element per symbol and an element may hold several
# Entrez ids. Each element is collapsed with ';' so the replacement always has
# exactly one value per probe. unlist() would produce a longer vector as soon
# as one symbol is multi-mapped and misalign every following probe.
fdata$ENTREZID <- NA
has_entrez <- fdata$SYMBOL %in% keys(org.Mm.egSYMBOL2EG)
entrez_hits <- mget(fdata$SYMBOL[has_entrez], org.Mm.egSYMBOL2EG)
fdata$ENTREZID[has_entrez] <- vapply(
  entrez_hits,
  function(ids) paste(unique(ids), collapse = ";"),
  character(1),
  USE.NAMES = FALSE
)
#fdata$ENTREZID[fdata$ENTREZID==''] = NA # set empty entrez id as NA
#fdata$ENTREZID = as.character(sapply( gsub(':.*','',fdata[['GB_ACC']]) ,function(gb)mm.refseq2eg[[gb]]))
#fdata$ENTREZID[fdata$ENTREZID=='NULL'] = NA # set empty entrez id as NA
# define SYMBOL
# SYMBOL should be assigned through GB_ACC instead, their column is odd.
#fdata$SYMBOL = as.character(sapply(fdata[['ENTREZID']],function(eg) mm.eg2s[[eg]] ))
#fdata$SYMBOL[fdata$SYMBOL=='NULL'] = NA
#fdata$SYMBOL[is.na(fdata$SYMBOL)] = fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes
# Define Sequence
#wo = fdata[['chr']] %in% names(bs.genome)
#fdata[['SEQUENCE']][wo] = getSeq(bs.genome, fdata[['chr']][wo], start=fdata[['chr.start']][wo],end=fdata[['chr.end']][wo], as.character=TRUE) # haha the GEO annotation has locs outside of chromosome ranges
# GC count
#seqs = DNAStringSet(fdata[['SEQUENCE']][wo] )
#tmp <- alphabetFrequency(seqs, baseOnly = TRUE)
#fdata[['gc.count']][wo] = tmp[, "C"] + tmp[, "G"]
# Define window CpG density
#fdata[['cpg.density']] = cpgdensity(bs.genome, chr = fdata[['chr']], pos = fdata[['chr.start']], windowSize = unique(fdata[['cpg.density.windowSize']]) )
save(fdata, file = "../../../data/Agilent015279.RData")
- ############################################################################################################
- ############################################################################################################
- # miRNA arrays
- ############################################################################################################
- ############################################################################################################
- # mirBase stuff
- #library(mirbase.db)
- #ls("package:mirbase.db")
- #all.mirbase.ids = names(as.list(mirbaseID2ACC))
- #all.mirbase.acc = as.character((as.list(mirbaseID2ACC)))
- #all.mirbase.mature = as.character(unlist(lapply( mget(mappedkeys(mirbaseMATURE), mirbaseMATURE), function(x) matureName(x))))
- #ask.mirbase <- function(ids)
- #{
- # sapply(ids,function(id){
- #Sys.sleep(0.1)
- # print(id)
- # html = readLines(paste('http://www.mirbase.org/cgi-bin/query.pl?terms=',id,sep=''))
- # if(any(grepl('We found <b>1</b> unique result for your query',html,fixed=TRUE)))
- # {
- # html = html[grepl('acc=',html)][2]
- # html = gsub('.*\\">','',html)
- # html = gsub('</a></td>','',html)
- # #html = gsub('.*acc=.*acc=.*\\">','',html)
- # #html = gsub('</a></td>.*','',html)
- #
- # }else{html = NA}
- # return(html)
- #
- # },simplify=FALSE)
- #}
## Agilent-035430 mouse miRNA array (miRBase release 18 miRNA ID version)
# TODO: There are two problems with the Agilent annot. First, not mirbase 18 as stated. Then, not a probe annot, and no content, so useless.
gpl <- "GPL15547"
temp <- Table(getGEO(gpl))

# mirBase stuff: build a miRNA id -> accession(s) lookup from the miRBase 17
# sheet. Columns 2, 6, 9, 12 and 15 hold the ids attached to each accession
# (presumably the precursor and mature ids -- verify against the sheet).
mb <- read.xls("../miRbase/miRNA_17.xls")
rownames(mb) <- mb[["Accession"]]
id_cols <- mb[, c(2, 6, 9, 12, 15)]
ids <- lapply(seq_len(nrow(id_cols)), function(i) {
  x <- as.character(id_cols[i, ])
  x[x != ""]
})
names(ids) <- rownames(mb)
ids <- revmap(ids) # accession -> ids becomes id -> accessions

# Agilent annotation?
#ann = read.delim('GEO_035430_D_GEO_20111226.txt',skip=34)
#ann = ann[1:(nrow(ann)-1),]
#ann = ann[!is.na(ann$SPOT_ID),]
#nrow(ann)

# Comma-separated, sorted accessions for one id ("" when the id is unknown).
# The accessions are sorted BEFORE pasting; sorting the pasted string, as the
# code used to do, is a no-op on a single string.
accessions_for <- function(key) {
  acc <- ids[[as.character(key)]]
  if (is.null(acc)) "" else paste(sort(acc), collapse = ",")
}

fdata <- temp
fdata <- data.frame(
  ProbeID = fdata[["ID"]],
  SYMBOL = fdata[["miRNA_ID"]],
  ACCESSION = vapply(fdata[["ID"]], accessions_for, character(1),
                     USE.NAMES = FALSE),
  ENTREZID = "",
  Control = "",
  Chromosome = "",
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$ProbeID[no_symbol]
save(fdata, file = "../../../data/Agilent035430.RData")
## Affy miRNA Mon 9 Jul 13:42:03 2012
gpl <- "GPL14613"
temp <- Table(getGEO(gpl))

# Derive a miRBase-style id from the array's transcript id: "-star" becomes
# "*", and repeated ' // '-separated entries are reduced to unique ones.
mirbase_id <- temp[["Transcript ID(Array Design)"]]
mirbase_id <- gsub("-star", "*", mirbase_id, fixed = TRUE)
mirbase_id <- sapply(
  strsplit(mirbase_id, split = " // "),
  function(x) paste(unique(x), collapse = " // ")
)
temp[["my.mirbase.id"]] <- mirbase_id

# miRNA id -> accession(s) lookup from the miRBase 15 sheet. Columns 2, 6, 9,
# 12 and 15 hold the ids attached to each accession
# (presumably the precursor and mature ids -- verify against the sheet).
mb <- read.xls("../miRbase/miRNA_15.xls")
rownames(mb) <- mb[["Accession"]]
id_cols <- mb[, c(2, 6, 9, 12, 15)]
ids <- lapply(seq_len(nrow(id_cols)), function(i) {
  x <- as.character(id_cols[i, ])
  x[x != ""]
})
names(ids) <- rownames(mb)
ids <- revmap(ids) # accession -> ids becomes id -> accessions
#temp[['my.mirbase.id']][!temp[['my.mirbase.id']] %in% names(ids)]

# Comma-separated, sorted accessions for one id ("" when the id is unknown).
# The accessions are sorted BEFORE pasting; sorting the pasted string, as the
# code used to do, is a no-op on a single string.
accessions_for <- function(key) {
  acc <- ids[[as.character(key)]]
  if (is.null(acc)) "" else paste(sort(acc), collapse = ",")
}

fdata <- temp
fdata <- data.frame(
  ProbeID = fdata[["ID"]],
  SYMBOL = fdata[["my.mirbase.id"]],
  ENTREZID = "",
  ACCESSION = vapply(fdata[["my.mirbase.id"]], accessions_for, character(1),
                     USE.NAMES = FALSE),
  Control = "",
  Chromosome = gsub(":.*", "", fdata[["Alignments"]]),
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$ProbeID[no_symbol]
save(fdata, file = "../../../data/Affymetrix_miRNA_2_0.RData")
# TODO: see how the packages microRNA, miRNApath and Rmir can help you do pathway analysis
# Affymetrix_miRNA_3_0
# [miRNA-3_0] Affymetrix Multispecies miRNA-3 Array
## Affy miRNA Mar 4 2014
# The GEO platform table is used as is, with ProbeID and SYMBOL put in front.
gpl <- "GPL16384"
fdata <- Table(getGEO(gpl))
fdata <- data.frame(
  ProbeID = fdata[["ID"]],
  SYMBOL = fdata[["Transcript ID(Array Design)"]],
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Probes without a transcript id are labelled with their probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$ProbeID[no_symbol]
save(fdata, file = "../../../data/Affymetrix_miRNA_3_0.RData")
# ## Agilent-031181_MarkBasik (Mark Basik's version... not exactly 031181, probe names not exactly the same)
# #GEO_031181_D_GEO_20121224.txt
# The probe list comes from the gene table of an actual scan, because the
# probe names on this chip differ from the stock design file.
e <- read.maimages(
  "/Users/flefebvr/Dropbox/projects/2012/yriazalhosseini_450k_Nov2012_PRJBFX-348/miRNA/chips/253118113936_201302211504_S01_miRNA_107_Sep09_1_1.txt",
  green.only = TRUE, source = "agilent"
)$genes
# cannot share the latter file on bitbucket
e <- unique(e[, 3:ncol(e)])
rownames(e) <- e$ProbeName

# Stock design annotation, indexed by SPOT_ID (rows without one are dropped).
fdata <- read.delim("GEO_031181_D_GEO_20121224.txt", skip = 34)
fdata <- unique(fdata[, 4:ncol(fdata)])
has_spot_id <- fdata$SPOT_ID != "" & !is.na(fdata$SPOT_ID)
fdata <- fdata[has_spot_id, ]
rownames(fdata) <- fdata$SPOT_ID
#setdiff(rownames(e),rownames(fdata))
#setdiff(rownames(fdata),rownames(e))

# Left join on row names: every scanned probe is kept, annotated when possible.
fdata <- merge(e, fdata, by = 0, all.x = TRUE, all.y = FALSE)
rownames(fdata) <- fdata$Row.names
fdata$Row.names <- NULL
fdata <- data.frame(
  ProbeID = fdata[["ProbeName"]],
  SYMBOL = fdata[["SystematicName"]],
  Control = fdata[["ControlType"]],
  Chromosome = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  fdata,
  row.names = fdata[["ProbeName"]]
)
save(fdata, file = "../../../data/Agilent031181_MarkBasik.RData")
- #
- # # Agilent-039494 SurePrint G3 Human GE v2 8x60K Microarray Wed 15 Aug 17:35:19 2012
- # # GEO_039494_D_GEO_20120628.txt
- # # Name: SurePrint G3 Human GE v2 8x60K Microarray
- # # Design ID: 039494
- # # Design Format: 8 x 60 K
- # # Control Grid: IS-62976-8-V2_60kby8_GX_EQC_201000210
- # # Build Version: hg19:GRCh37:Feb2009
- # f = read.delim('GEO_039494_D_GEO_20120628.txt',skip=42)
- # f = f[1:(nrow(f)-1),]
- # f = unique(f[,4:20])
- # fdata = f
- # fdata = data.frame(
- # 'ProbeID' = fdata[['NAME']]
- # ,'SYMBOL' = fdata[['GENE_SYMBOL']]
- # ,'ENTREZID' = fdata[['LOCUSLINK_ID']]
- # ,'Control' = fdata[['CONTROL_TYPE']]
- # ,'Chromosome' = gsub(':.*','',fdata[['CHROMOSOMAL_LOCATION']])
- # ,fdata,row.names=fdata[['NAME']])
- # fdata$SYMBOL[fdata$SYMBOL==''] = NA
- # fdata$SYMBOL[is.na(fdata$SYMBOL)] = fdata$ProbeID[is.na(fdata$SYMBOL)] # replace emtpy probes
- # fdata$ENTREZID[fdata$ENTREZID==''] = NA # set emptpy entrez id as NA
- # fdata[['GO_ID']] = NULL # this is useless anyway
- # save(fdata ,file='../../../data/Agilent039494.RData')
- #
## Agilent-070155 Mouse_miRNA_V21.0_Microarray 8 x 60 K
# check.names is spelled FALSE (F is an ordinary, reassignable variable) and
# the last row is dropped with head(), which also behaves on very short tables.
f <- read.delim(
  "GEO_070155_D_GEO_20141006.txt",
  skip = 34, comment.char = "", quote = "", check.names = FALSE
)
f <- head(f, -1L) # drop the last row of the file
f <- unique(f[, 4:ncol(f)])
fdata <- f
fdata <- fdata[!is.na(fdata$SPOT_ID), ]
fdata <- data.frame(
  ProbeID = fdata[["SPOT_ID"]],
  SYMBOL = fdata[["GENE_SYMBOL"]],
  Control = fdata[["CONTROL_TYPE"]],
  fdata,
  row.names = fdata[["SPOT_ID"]]
)
# Probes without a symbol are labelled with their probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$ProbeID[no_symbol]
save(fdata, file = "../../../data/Agilent070155.RData")
# Affymetrix_miRNA_4_0
# [miRNA-4_0] Affymetrix Multispecies miRNA-4 Array
## Affy miRNA Sept 2015
# The GEO platform table is used as is, with ProbeID and SYMBOL put in front.
gpl <- "GPL19117"
fdata <- Table(getGEO(gpl))
fdata <- data.frame(
  ProbeID = fdata[["ID"]],
  SYMBOL = fdata[["Transcript ID(Array Design)"]],
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Probes without a transcript id are labelled with their probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
no_symbol <- is.na(fdata$SYMBOL)
fdata$SYMBOL[no_symbol] <- fdata$ProbeID[no_symbol]
save(fdata, file = "/lb/project/mugqic/analyste_dev/software/mugqic_R_packages/mugqic_R_packages-master/gqData/data/Affymetrix_miRNA_4_0.RData")