# trace.r - Expression Arrays
#
# /gqData/inst/extdata/microarray_annotations_library_sources/trace.r
#
# https://bitbucket.org/mugqic/rpackages · R · 933 lines · 545 code · 204 blank · 184 comment · 1 complexity · 7a896b0a1efc839816bb189963b9d317 MD5 · raw file

library(GEOquery)
library(limma)
library("org.Hs.eg.db")
library("org.Mm.eg.db")
library(multicore)
library(gdata)
library(oligo)
library(RCurl)
library(ExiMiR) # for Exiqon
library(LVSmiRNA) # for Agilent
library(AgiMicroRna) # for Agilent
library(miRNApath) # for pathway analysis
library(RmiR) # working with mirna in genral
library(microRNA) # working with mirna in genral
library(charm)
library(magrittr)
library(data.table)
options(stringsAsFactors=FALSE)





## Lookup tables useful for some platforms
# Human: org.Hs.egSYMBOL2EG restricted to its mapped keys, then turned into a
# plain list (names are the map's keys, values the mapped identifiers).
x.human <- org.Hs.egSYMBOL2EG
mapped_genes.human <- mappedkeys(x.human)
xx.human <- as.list(x.human[mapped_genes.human])

# Mouse: org.Mm.egSYMBOL as a list, its reverse map, and org.Mm.egREFSEQ2EG.
mm.eg2s <- as.list(org.Mm.egSYMBOL)
mm.s2eg <- as.list(revmap(org.Mm.egSYMBOL))
mm.refseq2eg <- as.list(org.Mm.egREFSEQ2EG)



############################################################################################################
############################################################################################################
# Expression Arrays
############################################################################################################
############################################################################################################


## Agilent 017942 -- custom design from Geraldine Butler
## Design file:
## Agilent_017942_D_F_20071024_AK_12_MRD027-1_US09473739_251794210018_S01_GE1_107_Sep09_1_4.txt

# Chip design: skip the 9 leading lines, keep the unique combinations of
# columns 7 and 8 (they provide ProbeName and SystematicName, both used below)
# and derive the CGD feature name by dropping a leading "gb_".
fn <- "Agilent_017942_D_F_20071024_AK_12_MRD027-1_US09473739_251794210018_S01_GE1_107_Sep09_1_4.txt"
f <- read.delim(fn, skip = 9, comment.char = "", quote = "")
f <- unique(f[, c(7, 8)])
f$parsedCgdFeature <- gsub("^gb_", "", f$SystematicName)
# f <- f[1:(nrow(f) - 1), ]

# Annotations from CGD: headerless tab-delimited file, column names set by hand.
fn <- "http://www.candidagenome.org/download/chromosomal_feature_files/C_albicans_SC5314/C_albicans_SC5314_A21_current_chromosomal_feature.tab"
ann <- read.delim(fn, skip = 8, comment.char = "", quote = "", header = FALSE)
colnames(ann) <- c(
  "Feature name",
  "Gene name",
  "Aliases",
  "Feature type",
  "Chromosome",
  "Start Coordinate",
  "Stop Coordinate",
  "Strand",
  "Primary CGDID",
  "Secondary CGDID",
  "Description",
  "Date Created",
  "Sequence Coordinate Version Date (if any)",
  "Blank",
  "Blank2",
  "Date of gene name reservation",
  "Has the reserved gene name become the standard name",
  "Name of S. cerevisiae ortholog(s)"
)
rownames(ann) <- ann[["Feature name"]]

# Join the design to the annotation (left join on the parsed feature name
# against the annotation row names); `missing` lists features with no match.
table(f$parsedCgdFeature %in% rownames(ann))
missing <- setdiff(f$parsedCgdFeature, rownames(ann))
fdata <- merge(
  f, ann,
  all.x = TRUE, all.y = FALSE,
  by.x = "parsedCgdFeature", by.y = 0
)

# Controlled column names: ProbeID and SYMBOL come first, then everything else.
fdata <- data.frame(
  "ProbeID" = fdata[["ProbeName"]],
  "SYMBOL" = fdata[["Gene name"]],
  fdata,
  row.names = fdata[["ProbeName"]],
  check.names = FALSE
)
# Empty symbols become NA, then NA symbols fall back to the parsed CGD feature.
fdata[["SYMBOL"]][fdata[["SYMBOL"]] == ""] <- NA
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["parsedCgdFeature"]][is.na(fdata[["SYMBOL"]])]
# fdata$ENTREZID[fdata$ENTREZID == ""] <- NA # set empty entrez id as NA
# fdata[["GO_ID"]] <- NULL # this is useless anyway
save(fdata, file = "../../../data/Agilent017942.RData")




## Agilent-028279 SurePrint G3 Rat GE 8x60K Microarray GEO_028279_D_GEO_20130204.txt
# http://www.genomics.agilent.com/CollectionSubpage.aspx?PageType=Product&SubPageType=ProductData&PageID=1524
# Read the design file after its 42 leading lines, drop the final row, and keep
# the unique rows of columns 4 to 20.
f <- read.delim(
  "GEO_028279_D_GEO_20130204.txt",
  skip = 42, comment.char = "", quote = ""
)
f <- f[1:(nrow(f) - 1), ]
fdata <- unique(f[, 4:20])

# Standardised leading columns followed by the original ones; rows are named
# by probe NAME.
fdata <- data.frame(
  "ProbeID" = fdata[["NAME"]],
  "SYMBOL" = fdata[["GENE_SYMBOL"]],
  "ENTREZID" = fdata[["LOCUSLINK_ID"]],
  "Control" = fdata[["CONTROL_TYPE"]],
  "Chromosome" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  fdata,
  row.names = fdata[["NAME"]]
)
fdata[["SYMBOL"]][fdata[["SYMBOL"]] == ""] <- NA
# Probes without a symbol are labelled with their probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
# Empty Entrez ids become NA.
fdata[["ENTREZID"]][fdata[["ENTREZID"]] == ""] <- NA
fdata$GO_ID <- NULL # this is useless anyway
save(fdata, file = "../../../data/Agilent028279.RData")



## Agilent-028004  SurePrint G3 Human Gene Expression 8x60K Microarray Kit
# http://www.genomics.agilent.com/CollectionSubpage.aspx?PageType=Product&SubPageType=ProductData&PageID=1516
# Read the design file after its 42 leading lines, drop the final row, and keep
# the unique rows of columns 4 to 20.
f <- read.delim(
  "GEO_028004_D_GEO_20120411.txt",
  skip = 42, comment.char = "", quote = ""
)
f <- f[1:(nrow(f) - 1), ]
f <- unique(f[, 4:20])
fdata <- f

# Standardised leading columns followed by the original ones; rows are named
# by probe NAME.
fdata <- data.frame(
  "ProbeID" = fdata[["NAME"]],
  "SYMBOL" = fdata[["GENE_SYMBOL"]],
  "ENTREZID" = fdata[["LOCUSLINK_ID"]],
  "Control" = fdata[["CONTROL_TYPE"]],
  "Chromosome" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  fdata,
  row.names = fdata[["NAME"]]
)
fdata[["SYMBOL"]][fdata[["SYMBOL"]] == ""] <- NA
# Probes without a symbol are labelled with their probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
# Empty Entrez ids become NA.
fdata[["ENTREZID"]][fdata[["ENTREZID"]] == ""] <- NA
fdata$GO_ID <- NULL # this is useless anyway
save(fdata, file = "../../../data/Agilent028004.RData")


## Agilent-028005 SurePrint G3 Mouse GE 8x60K Microarray Sat  7 Jul 06:27:30 2012
# The platform table is fetched from GEO rather than read from a local file.
gpl <- "GPL10787"
fdata <- Table(getGEO(gpl))

# Standardised leading columns followed by the original ones; rows are named
# by the GEO ID column.
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = fdata[["GENE_SYMBOL"]],
  "ENTREZID" = fdata[["GENE"]],
  "Control" = fdata[["CONTROL_TYPE"]],
  "Chromosome" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  fdata,
  row.names = fdata$ID
)
fdata[["SYMBOL"]][fdata[["SYMBOL"]] == ""] <- NA
# Probes without a symbol are labelled with their probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
# Empty Entrez ids become NA.
fdata[["ENTREZID"]][fdata[["ENTREZID"]] == ""] <- NA
fdata$GO_ID <- NULL # this is useless anyway
save(fdata, file = "../../../data/Agilent028005.RData")

# Agilent-039494 SurePrint G3 Human GE v2 8x60K Microarray Wed 15 Aug 17:35:19 2012
#   File:          GEO_039494_D_GEO_20120628.txt
#   Name:          SurePrint G3 Human GE v2 8x60K Microarray
#   Design ID:     039494
#   Design Format: 8 x 60 K
#   Control Grid:  IS-62976-8-V2_60kby8_GX_EQC_201000210
#   Build Version: hg19:GRCh37:Feb2009
# Read the design file after its 42 leading lines (read.delim defaults for
# quoting are used here), drop the final row, keep unique rows of columns 4-20.
f <- read.delim("GEO_039494_D_GEO_20120628.txt", skip = 42)
f <- f[1:(nrow(f) - 1), ]
f <- unique(f[, 4:20])
fdata <- f

# Standardised leading columns followed by the original ones; rows are named
# by probe NAME.
fdata <- data.frame(
  "ProbeID" = fdata[["NAME"]],
  "SYMBOL" = fdata[["GENE_SYMBOL"]],
  "ENTREZID" = fdata[["LOCUSLINK_ID"]],
  "Control" = fdata[["CONTROL_TYPE"]],
  "Chromosome" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  fdata,
  row.names = fdata[["NAME"]]
)
fdata[["SYMBOL"]][fdata[["SYMBOL"]] == ""] <- NA
# Probes without a symbol are labelled with their probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
# Empty Entrez ids become NA.
fdata[["ENTREZID"]][fdata[["ENTREZID"]] == ""] <- NA
fdata$GO_ID <- NULL # this is useless anyway
save(fdata, file = "../../../data/Agilent039494.RData")


## Agilent-026440 Wed 10 Oct 15:42:34 2012
#   Name:          S. scrofa (Pig) Oligo Microarray v2
#   Design ID:     026440
#   Design Format: 4 X 44K
#   Control Grid:  IS-45220-4-V1_4x44K_GX_EQC_V20060608
#   Build Version: Not Applicable
# Quoting and comment handling are switched off for this read (the original
# author notes that the file is broken).
f <- read.delim(
  "GEO_026440_D_GEO_20120509.txt",
  skip = 42, comment.char = "", quote = ""
)
f <- f[1:(nrow(f) - 1), ]
f <- unique(f[, 4:20])
fdata <- f

# Standardised leading columns followed by the original ones; rows are named
# by probe NAME.
fdata <- data.frame(
  "ProbeID" = fdata[["NAME"]],
  "SYMBOL" = fdata[["GENE_SYMBOL"]],
  "ENTREZID" = fdata[["LOCUSLINK_ID"]],
  "Control" = fdata[["CONTROL_TYPE"]],
  "Chromosome" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  fdata,
  row.names = fdata[["NAME"]]
)
fdata[["SYMBOL"]][fdata[["SYMBOL"]] == ""] <- NA
# Probes without a symbol are labelled with their probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
# Empty Entrez ids become NA.
fdata[["ENTREZID"]][fdata[["ENTREZID"]] == ""] <- NA
fdata$GO_ID <- NULL # this is useless anyway
save(fdata, file = "../../../data/Agilent026440.RData")







## Affymetrix [Mouse430_2] Affymetrix Mouse Genome 430 2.0 Array
# The platform table is fetched from GEO.
gpl <- "GPL1261"
temp <- Table(getGEO(gpl))

# Three leftover inspection statements (ls("package:mirbase.db"), a lookup of
# mirbaseACC2ID and a filter on fdata$ACC) were dropped from this section:
# the library(mirbase.db) call in this script is commented out, so they stop
# the script, and nothing they computed was used for the saved table.
fdata <- temp

# Standardised leading columns followed by the GEO columns; Control and
# Chromosome are left empty for this platform. Rows are named by the GEO ID.
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = fdata[["Gene Symbol"]],
  "ENTREZID" = fdata[["ENTREZ_GENE_ID"]],
  "Control" = "",
  "Chromosome" = "",
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)

# Empty symbols become NA, then NA symbols fall back to the probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]
# Keep only the first 18 columns; the original author's note says this is
# meant to remove the GO columns.
fdata <- fdata[, 1:18]

save(fdata, file = "../../../data/Affymetrix_Mouse430_2.RData")



## Affymetrix 3p IVT  Yeast Genome 2.0 Array
library(yeast2.db)
options(stringsAsFactors = FALSE)

# Bioconductor annotation (GENENAME and ORF) for every key of yeast2.db,
# with rows named by PROBEID so they can be looked up by probe set.
bioc.fdata <- select(
  yeast2.db,
  columns = c("GENENAME", "ORF"),
  keys = keys(yeast2.db)
)
rownames(bioc.fdata) <- bioc.fdata[["PROBEID"]]

# Annotation CSV: lines starting with '#' are treated as comments and the
# column names are kept exactly as written in the file.
fn <- "Yeast_2.na34.annot.csv"
fdata <- read.csv(fn, comment.char = "#", check.names = FALSE)

# Standardised leading columns, then the Bioconductor columns for the same
# probe sets, then the CSV columns.
fdata <- data.frame(
  "ProbeID" = fdata[["Probe Set ID"]],
  "SYMBOL" = fdata[["Gene Symbol"]],
  "ENTREZID" = fdata[["Entrez Gene"]],
  bioc.fdata[fdata[["Probe Set ID"]], ],
  fdata,
  row.names = fdata[["Probe Set ID"]],
  check.names = FALSE
)
# Everything from the first "///" onward is replaced by "and others".
fdata[["SYMBOL"]] <- gsub("///.*", "and others", fdata[["SYMBOL"]])
# "---" marks a missing symbol; those probes are labelled with their probe id.
fdata[["SYMBOL"]][fdata[["SYMBOL"]] == "---"] <- NA
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
save(fdata, file = "../../../data/Affymetrix_Yeast_2.RData")






## Illumina mouse WG-6 Mon 11 Feb 11:02:51 2013
# Note: the probe annotation of "MouseWG-6_V2_0_R3_11278593_A.bgx" is extracted
# from an actual SampleProbeProfile export; every column is read as character.
fdata <- read.delim(
  "MouseWG-6_V2_0_R3_11278593_A.bgx_SampleProbeProfile_20130118.txt",
  check.names = FALSE, comment.char = "", quote = "",
  colClasses = "character"
)
# Drop the columns whose name starts with "MHA".
fdata <- fdata[, !grepl("^MHA", colnames(fdata))]
rownames(fdata) <- fdata$ProbeID

# ENTREZID is a copy of ENTREZ_GENE_ID with empty strings turned into NA.
fdata[["ENTREZID"]] <- fdata[["ENTREZ_GENE_ID"]]
fdata[["ENTREZID"]][fdata[["ENTREZID"]] == ""] <- NA

# Put ProbeID, SYMBOL and ENTREZID first, then drop the ONTOLOGY_ columns.
fdata <- fdata[, c(
  "ProbeID", "SYMBOL", "ENTREZID",
  setdiff(colnames(fdata), c("ProbeID", "SYMBOL", "ENTREZID"))
)]
fdata <- fdata[, !grepl("ONTOLOGY_", colnames(fdata))]
save(fdata, file = "../../../data/MouseWG-6_V2_0_R3_11278593_A.RData")



## Illumina HT12 from example sample probe report Sat 23 Mar 22:07:41 2013
library("org.Hs.eg.db")
# Read the probe report as character columns (3 leading lines skipped), keep
# the first 22 columns and drop the ONTOLOGY columns.
fdata <- read.delim(
  "HumanHT-12_v4_sample_probe_report.txt",
  check.names = FALSE, comment.char = "", quote = "",
  colClasses = "character", skip = 3
)
fdata <- fdata[, c(1:22)]
fdata <- fdata[, !grepl("ONTOLOGY", colnames(fdata))]
rownames(fdata) <- fdata$ProbeID

# Empty symbols become NA, then NA symbols fall back to the probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]

# Entrez ids are looked up by symbol in org.Hs.egSYMBOL2EG. mget() returns a
# list with one element per symbol, and an element can hold several ids.
# Calling as.character() on that list would deparse such elements into
# literal 'c("1", "2")' strings, so each element is collapsed explicitly into
# a single ";"-separated string instead (one string per probe).
has_entrez <- fdata$SYMBOL %in% keys(org.Hs.egSYMBOL2EG)
entrez_hits <- mget(fdata$SYMBOL[has_entrez], org.Hs.egSYMBOL2EG)
fdata[["ENTREZID"]] <- NA
fdata[["ENTREZID"]][has_entrez] <- vapply(
  entrez_hits,
  function(ids) paste(unique(ids), collapse = ";"),
  character(1),
  USE.NAMES = FALSE
)

save(fdata, file = "../../../data/HumanHT-12_v4.RData")









## Mouse Gene 1.0 ST Array : MoGene-1_0-st-v1
#    source("http://bioconductor.org/biocLite.R")
#    biocLite("mogene10sttranscriptcluster.db")
library(mogene10sttranscriptcluster.db) # ls("package:mogene10sttranscriptcluster.db")
options(stringsAsFactors = FALSE)

# GEO platform table, without the rows flagged as not mapped to the latest
# genome in SPOT_ID.
gpl <- "GPL6246"
fdata <- Table(getGEO(gpl))
fdata <- fdata[
  fdata[["SPOT_ID"]] != "Not currently mapped to latest genome",
]
# Attempt at a symbol: second " // "-separated field of gene_assignment.
fdata[["GeneSymbol"]] <- sapply(
  strsplit(fdata[["gene_assignment"]], split = " // "),
  function(z) z[2]
)

# SYMBOL, ENTREZID and GENENAME are taken from the Bioconductor chip package,
# looked up by the GEO ID; the GEO columns follow.
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = unlist(mget(fdata[["ID"]], mogene10sttranscriptclusterSYMBOL)),
  "ENTREZID" = unlist(mget(fdata[["ID"]], mogene10sttranscriptclusterENTREZID)),
  "GENENAME" = unlist(mget(fdata[["ID"]], mogene10sttranscriptclusterGENENAME)),
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
save(fdata, file = "../../../data/Affymetrix_MoGene-1_0-st-v1.RData")

# TODO: use Bioc library to add a few things...



# [RaGene-2_0-st] Affymetrix Rat Gene 2.0 ST Array
library(ragene20sttranscriptcluster.db)

# GEO platform table, restricted to the IDs known to the Bioconductor chip
# package.
gpl <- "GPL17117"
fdata <- Table(getGEO(gpl))
fdata <- fdata[fdata$ID %in% keys(ragene20sttranscriptclusterSYMBOL), ]
# Attempt at a symbol: second " // "-separated field of gene_assignment.
fdata[["GeneSymbol"]] <- sapply(
  strsplit(fdata[["gene_assignment"]], split = " // "),
  function(z) z[2]
)

# SYMBOL, ENTREZID and GENENAME are taken from the Bioconductor chip package,
# looked up by the GEO ID; the GEO columns follow.
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = unlist(mget(fdata[["ID"]], ragene20sttranscriptclusterSYMBOL)),
  "ENTREZID" = unlist(mget(fdata[["ID"]], ragene20sttranscriptclusterENTREZID)),
  "GENENAME" = unlist(mget(fdata[["ID"]], ragene20sttranscriptclusterGENENAME)),
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
# Drop every column whose name starts with "GO_".
fdata <- fdata[, !grepl("^GO_", colnames(fdata))]
save(fdata, file = "../../../data/Affymetrix_RaGene-2_0-st.RData")
#raw = oligo::rma(oligo::read.celfiles(list.files("~/Dropbox/projects/2013/gsebire_Sebire001_PRJBFX-482/chips",full.names=TRUE)))


# Options: use bioc annot, use annot from probe profile, use annot from switchtoi












# [MoGene-2_0-st] Affymetrix Mouse Gene 2.0 ST Array [transcript (gene) version]
library(mogene20sttranscriptcluster.db)

# GEO platform table, restricted to the IDs known to the Bioconductor chip
# package; setequal() is an interactive check that both id sets coincide.
gpl <- "GPL16570"
fdata <- Table(getGEO(gpl))
fdata <- fdata[fdata$ID %in% keys(mogene20sttranscriptclusterSYMBOL), ]
setequal(fdata$ID, keys(mogene20sttranscriptclusterSYMBOL))
# Attempt at a symbol: second " // "-separated field of gene_assignment.
fdata[["GeneSymbol"]] <- sapply(
  strsplit(fdata[["gene_assignment"]], split = " // "),
  function(z) z[2]
)

# SYMBOL, ENTREZID and GENENAME are taken from the Bioconductor chip package,
# looked up by the GEO ID; the GEO columns follow.
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = unlist(mget(fdata[["ID"]], mogene20sttranscriptclusterSYMBOL)),
  "ENTREZID" = unlist(mget(fdata[["ID"]], mogene20sttranscriptclusterENTREZID)),
  "GENENAME" = unlist(mget(fdata[["ID"]], mogene20sttranscriptclusterGENENAME)),
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
# Drop every column whose name starts with "GO_".
fdata <- fdata[, !grepl("^GO_", colnames(fdata))]
save(fdata, file = "../../../data/Affymetrix_MoGene-2_0-st.RData")










# [HuGene-2_0-st] Affymetrix Human Gene 2.0 ST Array [transcript (gene) version]
library(hugene20sttranscriptcluster.db)
# fn <- "HuGene-2_0-st-v1.na33.2.hg19.transcript.csv"
fn <- "HuGene-2_0-st-v1.na34.hg19.transcript.csv"

# Transcript annotation CSV read as character columns (23 leading lines
# skipped), keyed by transcript_cluster_id and restricted to the ids known to
# the Bioconductor chip package; setequal() is an interactive check.
fdata <- read.csv(fn, skip = 23, comment.char = "", colClasses = "character")
rownames(fdata) <- fdata$transcript_cluster_id
fdata <- fdata[rownames(fdata) %in% keys(hugene20sttranscriptclusterSYMBOL), ]
setequal(rownames(fdata), keys(hugene20sttranscriptclusterSYMBOL))

# SYMBOL, ENTREZID and GENENAME are taken from the Bioconductor chip package,
# looked up by transcript cluster id; the CSV columns follow.
fdata <- data.frame(
  "ProbeID" = rownames(fdata),
  "SYMBOL" = unlist(mget(rownames(fdata), hugene20sttranscriptclusterSYMBOL)),
  "ENTREZID" = unlist(mget(rownames(fdata), hugene20sttranscriptclusterENTREZID)),
  "GENENAME" = unlist(mget(rownames(fdata), hugene20sttranscriptclusterGENENAME)),
  fdata,
  row.names = rownames(fdata),
  check.names = FALSE
)
# Second and fourth " // "-separated fields of gene_assignment, stored as
# GeneSymbol and Cytoband.
fdata[["GeneSymbol"]] <- sapply(
  strsplit(fdata[["gene_assignment"]], split = " // "),
  function(z) z[2]
)
fdata[["Cytoband"]] <- sapply(
  strsplit(fdata[["gene_assignment"]], split = " // "),
  function(z) z[4]
)
# Missing symbols fall back to GeneSymbol first, then to the probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["GeneSymbol"]][is.na(fdata[["SYMBOL"]])]
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]

# Drop every column whose name starts with "GO_".
fdata <- fdata[, !grepl("^GO_", colnames(fdata))]
save(fdata, file = "../../../data/Affymetrix_HuGene-2_0-st.RData")

############################################################################################################
############################################################################################################
# Clariom
############################################################################################################
############################################################################################################

# Clariom_S_Mouse_HT

library(data.table)
library(stringr)
library(magrittr)

# Transcript annotation CSV (downloaded from NetAffx), read with fread and
# converted to a plain data.frame keyed by transcript_cluster_id.
fdata <- as.data.frame(
  fread("/Users/flefebvr/Dropbox/projects/2016/skimmins_kimmins006_PRJBFX-1415/Clariom_S_Mouse_HT.na36.mm10.transcript.csv")
)
rownames(fdata) <- fdata[["transcript_cluster_id"]]

# Columns to keep
cols <- c(
  "transcript_cluster_id", "probeset_id", "seqname", "strand", "start",
  "stop", "total_probes", "gene_assignment", "category", "locus type",
  "notes"
)
fdata <- fdata[, cols]

# gene_assignment is split on both " /// " and " // "; fields 2, 5 and 3 of
# the flattened result are used as SYMBOL, ENTREZID and GeneName.
ga_fields <- str_split(fdata[["gene_assignment"]], "( /// | // )")
fdata <- data.frame(
  "ProbeID" = rownames(fdata),
  "SYMBOL" = sapply(ga_fields, function(x) x[2]),
  "ENTREZID" = sapply(ga_fields, function(x) x[5]),
  "GeneName" = sapply(ga_fields, function(x) x[3]),
  fdata,
  row.names = rownames(fdata),
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id.
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]

# Keep only the part of gene_assignment before the first " /// " and rename
# the column to say so.
fdata[["gene_assignment"]] <- sapply(
  str_split(fdata[["gene_assignment"]], " /// "),
  function(x) x[1]
)
colnames(fdata) <- gsub(
  "^gene_assignment$", "gene_assignment (first transcript only)",
  colnames(fdata)
)
save(fdata, file = "../../../data/Affymetrix_Clariom_S_Mouse_HT.RData")
# compare with Eloi's file
# x = fread("/Users/flefebvr/Dropbox/projects/2016/skimmins_kimmins006_PRJBFX-1415/kimmins006.summary.SST-RMA-GENE-FULL.txt") %>% as.data.frame
# rownames(x) = x[,1]
# setequal(rownames(x),rownames(fdata))

# Clariom_S_Mouse

library(data.table)
library(stringr)
library(magrittr)

# Transcript annotation CSV (downloaded from NetAffx), read with fread and
# converted to a plain data.frame keyed by transcript_cluster_id.
fdata <- as.data.frame(
  fread("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_S_Mouse.na36.mm10.transcript.csv")
)
rownames(fdata) <- fdata[["transcript_cluster_id"]]

# Columns to keep
cols <- c(
  "transcript_cluster_id", "probeset_id", "seqname", "strand", "start",
  "stop", "total_probes", "gene_assignment", "category", "locus type",
  "notes"
)
fdata <- fdata[, cols]

# gene_assignment is split on both " /// " and " // "; fields 2, 5 and 3 of
# the flattened result are used as SYMBOL, ENTREZID and GeneName.
ga_fields <- str_split(fdata[["gene_assignment"]], "( /// | // )")
fdata <- data.frame(
  "ProbeID" = rownames(fdata),
  "SYMBOL" = sapply(ga_fields, function(x) x[2]),
  "ENTREZID" = sapply(ga_fields, function(x) x[5]),
  "GeneName" = sapply(ga_fields, function(x) x[3]),
  fdata,
  row.names = rownames(fdata),
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id
# (the original author notes that warnings are expected here).
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]

# Keep only the part of gene_assignment before the first " /// " and rename
# the column to say so.
fdata[["gene_assignment"]] <- sapply(
  str_split(fdata[["gene_assignment"]], " /// "),
  function(x) x[1]
)
colnames(fdata) <- gsub(
  "^gene_assignment$", "gene_assignment (first transcript only)",
  colnames(fdata)
)
save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_S_Mouse.RData")
# compare with Eloi's file
# x = fread("/Users/emercier/Workdir/Boerboom008_PRJBFX-1419/data/boerboom008.summary.SST-RMA-GENE-FULL.txt") %>% as.data.frame
# rownames(x) = x[,1]
# setequal(rownames(x),rownames(fdata))

# Clariom D human
library(data.table)
library(stringr)
library(magrittr)

# Transcript annotation CSV (downloaded from NetAffx), read with fread and
# converted to a plain data.frame keyed by transcript_cluster_id.
fdata <- as.data.frame(
  fread("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_D_Human.na36.hg38.transcript.csv")
)
rownames(fdata) <- fdata[["transcript_cluster_id"]]

# Columns to keep
cols <- c(
  "transcript_cluster_id", "probeset_id", "seqname", "strand", "start",
  "stop", "total_probes", "gene_assignment", "category", "locus type",
  "notes"
)
fdata <- fdata[, cols]

# gene_assignment is split on both " /// " and " // "; fields 2, 5 and 3 of
# the flattened result are used as SYMBOL, ENTREZID and GeneName.
ga_fields <- str_split(fdata[["gene_assignment"]], "( /// | // )")
fdata <- data.frame(
  "ProbeID" = rownames(fdata),
  "SYMBOL" = sapply(ga_fields, function(x) x[2]),
  "ENTREZID" = sapply(ga_fields, function(x) x[5]),
  "GeneName" = sapply(ga_fields, function(x) x[3]),
  fdata,
  row.names = rownames(fdata),
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id
# (the original author notes that warnings are expected here).
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]

# Keep only the part of gene_assignment before the first " /// " and rename
# the column to say so.
fdata[["gene_assignment"]] <- sapply(
  str_split(fdata[["gene_assignment"]], " /// "),
  function(x) x[1]
)
colnames(fdata) <- gsub(
  "^gene_assignment$", "gene_assignment (first transcript only)",
  colnames(fdata)
)
save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_D_Human.RData")


# Clariom S human
library(data.table)
library(stringr)
library(magrittr)

# Transcript annotation CSV (downloaded from Affymetrix), read with fread and
# converted to a plain data.frame keyed by transcript_cluster_id.
fdata <- as.data.frame(
  fread("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_S_Human.na36.hg38.transcript.csv")
)
rownames(fdata) <- fdata[["transcript_cluster_id"]]

# Columns to keep
cols <- c(
  "transcript_cluster_id", "probeset_id", "seqname", "strand", "start",
  "stop", "total_probes", "gene_assignment", "category", "locus type",
  "notes"
)
fdata <- fdata[, cols]

# gene_assignment is split on both " /// " and " // "; fields 2, 5 and 3 of
# the flattened result are used as SYMBOL, ENTREZID and GeneName.
ga_fields <- str_split(fdata[["gene_assignment"]], "( /// | // )")
fdata <- data.frame(
  "ProbeID" = rownames(fdata),
  "SYMBOL" = sapply(ga_fields, function(x) x[2]),
  "ENTREZID" = sapply(ga_fields, function(x) x[5]),
  "GeneName" = sapply(ga_fields, function(x) x[3]),
  fdata,
  row.names = rownames(fdata),
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id
# (the original author notes that warnings are expected here).
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]

# Keep only the part of gene_assignment before the first " /// " and rename
# the column to say so.
fdata[["gene_assignment"]] <- sapply(
  str_split(fdata[["gene_assignment"]], " /// "),
  function(x) x[1]
)
colnames(fdata) <- gsub(
  "^gene_assignment$", "gene_assignment (first transcript only)",
  colnames(fdata)
)
save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_S_Human.RData")


# Clariom S human HT (the original author notes: exactly the same as Clariom S)
library(data.table)
library(stringr)
library(magrittr)

# Transcript annotation CSV (downloaded from Affymetrix), read with fread and
# converted to a plain data.frame keyed by transcript_cluster_id.
fdata <- as.data.frame(
  fread("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_S_Human_HT.na36.hg38.transcript.csv")
)
rownames(fdata) <- fdata[["transcript_cluster_id"]]

# Columns to keep
cols <- c(
  "transcript_cluster_id", "probeset_id", "seqname", "strand", "start",
  "stop", "total_probes", "gene_assignment", "category", "locus type",
  "notes"
)
fdata <- fdata[, cols]

# gene_assignment is split on both " /// " and " // "; fields 2, 5 and 3 of
# the flattened result are used as SYMBOL, ENTREZID and GeneName.
ga_fields <- str_split(fdata[["gene_assignment"]], "( /// | // )")
fdata <- data.frame(
  "ProbeID" = rownames(fdata),
  "SYMBOL" = sapply(ga_fields, function(x) x[2]),
  "ENTREZID" = sapply(ga_fields, function(x) x[5]),
  "GeneName" = sapply(ga_fields, function(x) x[3]),
  fdata,
  row.names = rownames(fdata),
  check.names = FALSE
)
# Probes without a symbol are labelled with their probe id
# (the original author notes that warnings are expected here).
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]

# Keep only the part of gene_assignment before the first " /// " and rename
# the column to say so.
fdata[["gene_assignment"]] <- sapply(
  str_split(fdata[["gene_assignment"]], " /// "),
  function(x) x[1]
)
colnames(fdata) <- gsub(
  "^gene_assignment$", "gene_assignment (first transcript only)",
  colnames(fdata)
)
save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_S_Human_HT.RData")

############################################################################################################
############################################################################################################
# 450k Methylation arrays
############################################################################################################
############################################################################################################

## Illumina 450k Fri 23 Nov 11:05:37 2012 . Illumina Inc. updated this Nov 20 2012
gpl <- "GPL13534"
temp <- Table(getGEO(gpl))

# Standardised leading columns (SYMBOL and ENTREZID are filled in below),
# followed by the GEO columns; rows are named by the GEO ID.
fdata <- temp
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = NA,
  "ENTREZID" = NA,
  "Control" = "",
  "Chromosome" = fdata[["CHR"]],
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)

# Define symbols: the unique ";"-separated entries of UCSC_RefGene_Name,
# joined back with ";". Probes left without a symbol get their probe id.
sy <- strsplit(fdata$UCSC_RefGene_Name, split = ";")
sy <- mclapply(sy, unique, mc.cores = 8)
fdata$SYMBOL <- unlist(
  mclapply(sy, function(x) paste(x, collapse = ";"), mc.cores = 8)
)
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]

# Define entrez ids: the accessions of UCSC_RefGene_Accession are translated
# through org.Hs.egREFSEQ2EG. `dict` is the accession -> Entrez lookup limited
# to the accessions that occur on the array and exist in the map.
eg <- strsplit(fdata$UCSC_RefGene_Accession, split = ";")
eg <- mclapply(eg, unique, mc.cores = 8)
dict <- unique(unlist(eg))
dict <- dict[dict %in% names(as.list(org.Hs.egREFSEQ2EG))]
dict <- as.list(org.Hs.egREFSEQ2EG[dict])
gc()
# z <- Sys.time()
# Per the original author: should take 5 minutes if not swapping.
eg <- mclapply(eg, function(accs) {
  unlist(dict[accs])
}, mc.cores = 4)
# print(Sys.time() - z)
fdata$ENTREZID <- unlist(
  mclapply(eg, function(x) paste(unique(x), collapse = ";"), mc.cores = 8)
)
fdata$ENTREZID[fdata$ENTREZID == ""] <- NA
gc()
save(fdata, file = "../../../data/IlluminaHumanMethylation450.RData")





############################################################################################################
############################################################################################################
# Agilent CpG Island Arrays
############################################################################################################
############################################################################################################



## Agilent-015279 Mouse CpG Island ChIP-on-Chip Microarray 2x105K (
library(BSgenome.Mmusculus.UCSC.mm9) # this will load object Mmusculus
bs.genome <- Mmusculus

# Read the design file after its 34 leading lines. na.strings is set to a
# string that is not expected to occur, so "NA" text stays a literal string.
f <- read.delim(
  "GEO_015279_D_GEO_20111102.txt",
  skip = 34, comment.char = "", quote = "",
  na.strings = "fbnjwknfwjfw"
)
f <- f[1:(nrow(f) - 1), ]
f <- unique(f[, 4:ncol(f)])
# ID is a copy of SPOT_ID; rows whose ID starts with "NA." or equals "NA" are
# dropped.
f[["ID"]] <- f[["SPOT_ID"]]
f <- f[!grepl("^NA\\.", f$ID), ]
f <- f[f$ID != "NA", ]
fdata <- f

# CHROMOSOMAL_LOCATION is split on ":" and "-"; the 2nd and 3rd pieces are
# used as numeric start and end, the part before the first ":" as sequence.
loc_parts <- strsplit(fdata[["CHROMOSOMAL_LOCATION"]], split = "(:|-)")
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = "", # fdata[["GENE_SYMBOL"]]
  "ENTREZID" = "",
  "Control" = fdata[["CONTROL_TYPE"]],
  "Chromosome" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  "seq" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  "start" = as.numeric(sapply(loc_parts, function(z) z[2])),
  "end" = as.numeric(sapply(loc_parts, function(z) z[3])),
  "SEQUENCE" = NA,
  # This will be tuned according to fragment size... which seems to be
  # between 200 and 1000 bp
  "cpg.density" = NA,
  "cpg.density.windowSize" = 1000, # this will always be relative to the
  "gc.count" = NA,
  # "LOCATION" = fdata[["CHROMOSOMAL_LOCATION"]],
  # "GENOME" = "mm9",
  # "ISLAND_LOCATION" = fdata[["DESCRIPTION"]],
  fdata,
  row.names = fdata$ID
)



# define SYMBOL
# SYMBOL should be assigned through GB_ACC instead, their column is odd.
# For now GENE_SYMBOL is used, and probes with an empty symbol get their ID.
fdata$SYMBOL <- fdata$GENE_SYMBOL
fdata$SYMBOL[fdata$SYMBOL == ""] <- fdata$ID[fdata$SYMBOL == ""]
# fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes

# define ENTREZID
# Symbols present in org.Mm.egSYMBOL2EG are translated to Entrez ids. mget()
# returns one list element per symbol and an element can hold several ids, so
# unlist() could return more values than there are probes and shift every
# later id onto the wrong row. Each element is therefore collapsed into one
# ";"-separated string, which keeps exactly one value per probe.
has_entrez <- fdata$SYMBOL %in% keys(org.Mm.egSYMBOL2EG)
entrez_hits <- mget(fdata$SYMBOL[has_entrez], org.Mm.egSYMBOL2EG)
fdata$ENTREZID <- NA
fdata$ENTREZID[has_entrez] <- vapply(
  entrez_hits,
  function(ids) paste(unique(ids), collapse = ";"),
  character(1),
  USE.NAMES = FALSE
)



#fdata$ENTREZID[fdata$ENTREZID==''] = NA # set emptpy entrez id as NA
#fdata$ENTREZID  = as.character(sapply( gsub(':.*','',fdata[['GB_ACC']])   ,function(gb)mm.refseq2eg[[gb]]))
#fdata$ENTREZID[fdata$ENTREZID=='NULL'] = NA # set emptpy entrez id as NA

# define SYMBOL
# SYMBOL should be assigned thourgh GB_ACC instead, their columnn is odd.
#fdata$SYMBOL  = as.character(sapply(fdata[['ENTREZID']],function(eg) mm.eg2s[[eg]] ))
#fdata$SYMBOL[fdata$SYMBOL=='NULL'] = NA
#fdata$SYMBOL[is.na(fdata$SYMBOL)] = fdata$ProbeID[is.na(fdata$SYMBOL)] # replace emtpy probes


# Define Sequence
#wo = fdata[['chr']] %in% names(bs.genome)
#fdata[['SEQUENCE']][wo] = getSeq(bs.genome, fdata[['chr']][wo], start=fdata[['chr.start']][wo],end=fdata[['chr.end']][wo], as.character=TRUE) # haha the GEO annotation has locs outside of chromosome ranges

# GC count
#seqs = DNAStringSet(fdata[['SEQUENCE']][wo] )
#tmp <- alphabetFrequency(seqs, baseOnly = TRUE)
#fdata[['gc.count']][wo] = tmp[, "C"] + tmp[, "G"]

# Define window CpG density
#fdata[['cpg.density']] = cpgdensity(bs.genome, chr = fdata[['chr']], pos = fdata[['chr.start']], windowSize = unique(fdata[['cpg.density.windowSize']]) )




# Write the annotation table `fdata` to the package data directory.
save(fdata ,file='../../../data/Agilent015279.RData')





















############################################################################################################
############################################################################################################
# miRNA arrays
############################################################################################################
############################################################################################################


# mirBase stuff
#library(mirbase.db)
#ls("package:mirbase.db")
#all.mirbase.ids = names(as.list(mirbaseID2ACC))
#all.mirbase.acc = as.character((as.list(mirbaseID2ACC)))
#all.mirbase.mature = as.character(unlist(lapply( mget(mappedkeys(mirbaseMATURE), mirbaseMATURE), function(x) matureName(x))))
#ask.mirbase <- function(ids)
#{
#	sapply(ids,function(id){
			#Sys.sleep(0.1)
#			print(id)
#			html = readLines(paste('http://www.mirbase.org/cgi-bin/query.pl?terms=',id,sep=''))
#			if(any(grepl('We found <b>1</b> unique result for your query',html,fixed=TRUE)))
#			{
#				html = html[grepl('acc=',html)][2]
#				html = gsub('.*\\">','',html)
#				html = gsub('</a></td>','',html)
#				#html = gsub('.*acc=.*acc=.*\\">','',html)
#				#html = gsub('</a></td>.*','',html)
#
#			}else{html = NA}
#			return(html)
#
#		},simplify=FALSE)
#}






## Agilent-035430 mouse miRNA array (miRBase release 18 miRNA ID version)
# TODO: There are two problems with the Agilent annot. First, not mirbase 18 as stated. Then, not a probe annot, and no content, so useless.
gpl <- "GPL15547"
temp <- Table(getGEO(gpl))

# miRBase spreadsheet, keyed by Accession. Columns 2, 6, 9, 12 and 15 hold the
# identifiers attached to each accession (presumably precursor and mature ids
# -- confirm against the sheet). The non-empty identifiers of every row are
# collected into a list named by accession, which revmap() then inverts into
# an identifier -> accession(s) lookup.
mb <- read.xls("../miRbase/miRNA_17.xls")
rownames(mb) <- mb[["Accession"]]
ids <- mb[, c(2, 6, 9, 12, 15)]
# seq_len() keeps this safe when the sheet has no rows.
ids <- lapply(seq_len(nrow(ids)), function(i) {
  x <- as.character(ids[i, ])
  x[x != ""]
})
names(ids) <- rownames(mb)
ids <- revmap(ids)

# Agilent annotation?
# ann <- read.delim("GEO_035430_D_GEO_20111226.txt", skip = 34)
# ann <- ann[1:(nrow(ann) - 1), ]
# ann <- ann[!is.na(ann$SPOT_ID), ]
# nrow(ann)

# Standardised leading columns followed by the GEO columns. ACCESSION is the
# sorted, comma-separated list of accessions found for the probe ID ("" when
# there is none). The accessions are sorted before being pasted; sorting the
# pasted string, as was done previously, has no effect on a single string.
fdata <- temp
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = fdata[["miRNA_ID"]],
  "ACCESSION" = sapply(fdata[["ID"]], function(x) {
    acc <- ids[[x]]
    if (is.null(acc)) "" else paste(sort(acc, na.last = TRUE), collapse = ",")
  }),
  "ENTREZID" = "",
  "Control" = "",
  "Chromosome" = "",
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)

# Empty symbols become NA, then NA symbols fall back to the probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]
save(fdata, file = "../../../data/Agilent035430.RData")



## Affy miRNA Mon  9 Jul 13:42:03 2012
gpl <- "GPL14613"
temp <- Table(getGEO(gpl))

# my.mirbase.id: the array's transcript id with "-star" rewritten as "*" and
# duplicated " // "-separated entries removed.
temp[["my.mirbase.id"]] <- temp[["Transcript ID(Array Design)"]]
temp[["my.mirbase.id"]] <- gsub("-star", "*", temp[["my.mirbase.id"]], fixed = TRUE)
temp[["my.mirbase.id"]] <- sapply(
  strsplit(temp[["my.mirbase.id"]], split = " // "),
  function(x) paste(unique(x), collapse = " // ")
)

# miRBase spreadsheet, keyed by Accession. Columns 2, 6, 9, 12 and 15 hold the
# identifiers attached to each accession (presumably precursor and mature ids
# -- confirm against the sheet). The non-empty identifiers of every row are
# collected into a list named by accession, which revmap() then inverts into
# an identifier -> accession(s) lookup.
mb <- read.xls("../miRbase/miRNA_15.xls")
rownames(mb) <- mb[["Accession"]]
ids <- mb[, c(2, 6, 9, 12, 15)]
# seq_len() keeps this safe when the sheet has no rows.
ids <- lapply(seq_len(nrow(ids)), function(i) {
  x <- as.character(ids[i, ])
  x[x != ""]
})
names(ids) <- rownames(mb)
ids <- revmap(ids)
# temp[["my.mirbase.id"]][!temp[["my.mirbase.id"]] %in% names(ids)]

# Standardised leading columns followed by the GEO columns. ACCESSION is the
# sorted, comma-separated list of accessions found for my.mirbase.id ("" when
# there is none). The accessions are sorted before being pasted; sorting the
# pasted string, as was done previously, has no effect on a single string.
fdata <- temp
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = fdata[["my.mirbase.id"]],
  "ENTREZID" = "",
  "ACCESSION" = sapply(fdata[["my.mirbase.id"]], function(x) {
    acc <- ids[[x]]
    if (is.null(acc)) "" else paste(sort(acc, na.last = TRUE), collapse = ",")
  }),
  "Control" = "",
  "Chromosome" = gsub(":.*", "", fdata[["Alignments"]]),
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)

# Empty symbols become NA, then NA symbols fall back to the probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]

save(fdata, file = "../../../data/Affymetrix_miRNA_2_0.RData")
# TODO:  see how the packages microRNA, miRNApath and Rmir can help you do pathway analysis



# Affymetrix_miRNA_3_0
# [miRNA-3_0] Affymetrix Multispecies miRNA-3 Array
## Affy miRNA Mar 4 2014
gpl <- "GPL16384"
fdata <- Table(getGEO(gpl))

# ProbeID and SYMBOL first (SYMBOL is the array's transcript id), then the
# GEO columns; rows are named by the GEO ID.
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = fdata[["Transcript ID(Array Design)"]],
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Empty symbols become NA, then NA symbols fall back to the probe id.
fdata[["SYMBOL"]][fdata[["SYMBOL"]] == ""] <- NA
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
save(fdata, file = "../../../data/Affymetrix_miRNA_3_0.RData")





## Agilent-031181_MarkBasik (Mark Basik's version... not exactly 031181, probe names not exactly the same)
# GEO_031181_D_GEO_20121224.txt
# The probe list is taken from the `genes` table of one scanned chip.
# (That chip file cannot be shared on bitbucket.)
e <- read.maimages(
  "/Users/flefebvr/Dropbox/projects/2012/yriazalhosseini_450k_Nov2012_PRJBFX-348/miRNA/chips/253118113936_201302211504_S01_miRNA_107_Sep09_1_1.txt",
  green.only = TRUE, source = "agilent"
)$genes
e <- unique(e[, 3:ncol(e)])
rownames(e) <- e$ProbeName

# Agilent GEO annotation: unique rows from column 4 onward, without the rows
# whose SPOT_ID is empty or NA, keyed by SPOT_ID.
fdata <- read.delim("GEO_031181_D_GEO_20121224.txt", skip = 34)
fdata <- unique(fdata[, 4:ncol(fdata)])
fdata <- fdata[fdata$SPOT_ID != "" & !is.na(fdata$SPOT_ID), ]
rownames(fdata) <- fdata$SPOT_ID
# setdiff(rownames(e), rownames(fdata))
# setdiff(rownames(fdata), rownames(e))

# Left join of the chip probes onto the annotation by row names; merge() puts
# the key in a Row.names column, which is moved back to the row names.
fdata <- merge(e, fdata, by = 0, all.x = TRUE, all.y = FALSE)
rownames(fdata) <- fdata$Row.names
fdata$Row.names <- NULL

# Standardised leading columns followed by the merged columns.
fdata <- data.frame(
  "ProbeID" = fdata[["ProbeName"]],
  "SYMBOL" = fdata[["SystematicName"]],
  "Control" = fdata[["ControlType"]],
  "Chromosome" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
  fdata,
  row.names = fdata[["ProbeName"]]
)
save(fdata, file = "../../../data/Agilent031181_MarkBasik.RData")

#
# # Agilent-039494 SurePrint G3 Human GE v2 8x60K Microarray Wed 15 Aug 17:35:19 2012
# #  GEO_039494_D_GEO_20120628.txt
# #	Name: 	SurePrint G3 Human GE v2 8x60K Microarray
# #  	Design ID: 	039494
# #  	Design Format: 	8 x 60 K
# #  	Control Grid: 	IS-62976-8-V2_60kby8_GX_EQC_201000210
# #  	Build Version: 	hg19:GRCh37:Feb2009
# f = read.delim('GEO_039494_D_GEO_20120628.txt',skip=42)
# f  = f[1:(nrow(f)-1),]
# f = unique(f[,4:20])
# fdata = f
# fdata = data.frame(
# 	 'ProbeID'     = fdata[['NAME']]
# 	,'SYMBOL'      = fdata[['GENE_SYMBOL']]
# 	,'ENTREZID'    = fdata[['LOCUSLINK_ID']]
# 	,'Control'     = fdata[['CONTROL_TYPE']]
# 	,'Chromosome'  = gsub(':.*','',fdata[['CHROMOSOMAL_LOCATION']])
# 	,fdata,row.names=fdata[['NAME']])
# fdata$SYMBOL[fdata$SYMBOL==''] = NA
# fdata$SYMBOL[is.na(fdata$SYMBOL)] = fdata$ProbeID[is.na(fdata$SYMBOL)] # replace emtpy probes
# fdata$ENTREZID[fdata$ENTREZID==''] = NA # set emptpy entrez id as NA
# fdata[['GO_ID']] = NULL # this is useless anyway
# save(fdata ,file='../../../data/Agilent039494.RData')
#




## Agilent-070155 Mouse_miRNA_V21.0_Microarray 8 x 60 K
# Read the design file after its 34 leading lines, keeping the column names
# exactly as written (check.names is spelled FALSE because `F` is an ordinary
# variable that can be reassigned).
f <- read.delim(
  "GEO_070155_D_GEO_20141006.txt",
  skip = 34, comment.char = "", quote = "", check.names = FALSE
)
# Drop the final row. head(f, -1) also behaves for a file with a single data
# row, where 1:(nrow(f) - 1) would count backwards and keep that row.
f <- head(f, -1)
f <- unique(f[, 4:ncol(f)])
fdata <- f
fdata <- fdata[!is.na(fdata$SPOT_ID), ]

# Standardised leading columns followed by the original ones; rows are named
# by SPOT_ID.
fdata <- data.frame(
  "ProbeID" = fdata[["SPOT_ID"]],
  "SYMBOL" = fdata[["GENE_SYMBOL"]],
  "Control" = fdata[["CONTROL_TYPE"]],
  fdata,
  row.names = fdata[["SPOT_ID"]]
)
# Empty symbols become NA, then NA symbols fall back to the probe id.
fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]
save(fdata, file = "../../../data/Agilent070155.RData")





# Affymetrix_miRNA_4_0
# [miRNA-4_0] Affymetrix Multispecies miRNA-4 Array
## Affy miRNA Sept 2015
gpl <- "GPL19117"
fdata <- Table(getGEO(gpl))

# ProbeID and SYMBOL first (SYMBOL is the array's transcript id), then the
# GEO columns; rows are named by the GEO ID.
fdata <- data.frame(
  "ProbeID" = fdata[["ID"]],
  "SYMBOL" = fdata[["Transcript ID(Array Design)"]],
  fdata,
  row.names = fdata$ID,
  check.names = FALSE
)
# Empty symbols become NA, then NA symbols fall back to the probe id.
fdata[["SYMBOL"]][fdata[["SYMBOL"]] == ""] <- NA
fdata[["SYMBOL"]][is.na(fdata[["SYMBOL"]])] <-
  fdata[["ProbeID"]][is.na(fdata[["SYMBOL"]])]
save(fdata, file = "/lb/project/mugqic/analyste_dev/software/mugqic_R_packages/mugqic_R_packages-master/gqData/data/Affymetrix_miRNA_4_0.RData")