PageRenderTime 172ms CodeModel.GetById 18ms RepoModel.GetById 3ms app.codeStats 1ms

/gqData/inst/extdata/microarray_annotations_library_sources/trace.r

https://bitbucket.org/mugqic/rpackages
R | 933 lines | 545 code | 204 blank | 184 comment | 1 complexity | 7a896b0a1efc839816bb189963b9d317 MD5 | raw file
  1. library(GEOquery)
  2. library(limma)
  3. library("org.Hs.eg.db")
  4. library("org.Mm.eg.db")
  5. library(multicore)
  6. library(gdata)
  7. library(oligo)
  8. library(RCurl)
  9. library(ExiMiR) # for Exiqon
  10. library(LVSmiRNA) # for Agilent
  11. library(AgiMicroRna) # for Agilent
  12. library(miRNApath) # for pathway analysis
  13. library(RmiR) # working with mirna in genral
  14. library(microRNA) # working with mirna in genral
  15. library(charm)
  16. library(magrittr)
  17. library(data.table)
  18. options(stringsAsFactors=FALSE)
  ## Mappings useful for some platforms
  # Mouse lookups as plain lists: Entrez -> symbol, symbol -> Entrez (reverse
  # map of the former), and RefSeq -> Entrez.
  mm.eg2s <- as.list(org.Mm.egSYMBOL)
  mm.s2eg <- as.list(revmap(org.Mm.egSYMBOL))
  mm.refseq2eg <- as.list(org.Mm.egREFSEQ2EG)
  # Human symbol -> Entrez map, restricted to the keys that have a mapping.
  x.human <- org.Hs.egSYMBOL2EG
  mapped_genes.human <- mappedkeys(x.human)
  xx.human <- as.list(x.human[mapped_genes.human])
  26. ############################################################################################################
  27. ############################################################################################################
  28. # Expression Arrays
  29. ############################################################################################################
  30. ############################################################################################################
  ## Agilent 017942 -- custom design from Geraldine Butler
  ## (Agilent_017942_D_F_20071024_AK_12_MRD027-1_US09473739_251794210018_S01_GE1_107_Sep09_1_4.txt)
  # Chip design: load it from a data file and keep the distinct values of
  # columns 7 and 8 (presumably ProbeName and SystematicName, which are the
  # two design columns used below -- confirm against the file).
  fn <- "Agilent_017942_D_F_20071024_AK_12_MRD027-1_US09473739_251794210018_S01_GE1_107_Sep09_1_4.txt"
  f <- read.delim(fn, skip = 9, comment.char = "", quote = "")
  f <- unique(f[, c(7, 8)])
  # Join key: SystematicName without its leading "gb_".
  f$parsedCgdFeature <- gsub("^gb_", "", f$SystematicName)
  # f = f[1:(nrow(f)-1),]   (left disabled)

  # Annotations from CGD (the file is read without a header, so the column
  # names are assigned by hand).
  fn <- "http://www.candidagenome.org/download/chromosomal_feature_files/C_albicans_SC5314/C_albicans_SC5314_A21_current_chromosomal_feature.tab"
  ann <- read.delim(fn, skip = 8, comment.char = "", quote = "", header = FALSE)
  colnames(ann) <- c(
    "Feature name",
    "Gene name",
    "Aliases",
    "Feature type",
    "Chromosome",
    "Start Coordinate",
    "Stop Coordinate",
    "Strand",
    "Primary CGDID",
    "Secondary CGDID",
    "Description",
    "Date Created",
    "Sequence Coordinate Version Date (if any)",
    "Blank",
    "Blank2",
    "Date of gene name reservation",
    "Has the reserved gene name become the standard name",
    "Name of S. cerevisiae ortholog(s)"
  )
  rownames(ann) <- ann[["Feature name"]]

  # Join: tabulate how many design features are found in CGD, remember the
  # ones that are not, then left-join the design onto the annotation row names.
  table(f$parsedCgdFeature %in% rownames(ann))
  missing <- unique(f$parsedCgdFeature[!f$parsedCgdFeature %in% rownames(ann)])
  fdata <- merge(
    f, ann,
    all.x = TRUE, all.y = FALSE,
    by.x = "parsedCgdFeature", by.y = 0
  )

  # Controlled column names come first, followed by every merged column.
  fdata <- data.frame(
    "ProbeID" = fdata[["ProbeName"]],
    "SYMBOL" = fdata[["Gene name"]],
    fdata,
    row.names = fdata[["ProbeName"]],
    check.names = FALSE
  )
  # Empty gene names become NA, then fall back to the parsed CGD feature name.
  fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$parsedCgdFeature[is.na(fdata$SYMBOL)]
  # fdata$ENTREZID[fdata$ENTREZID==''] = NA   (disabled: set empty Entrez id as NA)
  # fdata[['GO_ID']] = NULL                   (disabled)
  save(fdata, file = "../../../data/Agilent017942.RData")
  ## Agilent gene-expression designs (028279, 028004, 028005, 039494, 026440)
  ## All five sections build the same controlled feature table, so the shared
  ## steps live in the two helpers below.

  # Read an Agilent "GEO_*_D_GEO_*.txt" design file: skip the 42-line preamble,
  # drop the last row, and keep the distinct rows of columns 4 to 20.
  # Extra arguments are forwarded to read.delim().
  # head(, -1L) replaces f[1:(nrow(f)-1),], which returned the wrong rows for
  # tables of one row or fewer.
  read_agilent_design <- function(path, ...) {
    design <- read.delim(path, skip = 42, ...)
    unique(head(design, -1L)[, 4:20])
  }

  # Build the feature table: ProbeID, SYMBOL, ENTREZID, Control and Chromosome
  # first, then every original column. `id_col` and `entrez_col` name the
  # columns holding the probe id and the Entrez id.
  agilent_ge_fdata <- function(design, id_col = "NAME", entrez_col = "LOCUSLINK_ID") {
    out <- data.frame(
      "ProbeID" = design[[id_col]],
      "SYMBOL" = design[["GENE_SYMBOL"]],
      "ENTREZID" = design[[entrez_col]],
      "Control" = design[["CONTROL_TYPE"]],
      # Chromosome is the part of CHROMOSOMAL_LOCATION before the first ':'.
      "Chromosome" = gsub(":.*", "", design[["CHROMOSOMAL_LOCATION"]]),
      design,
      row.names = design[[id_col]]
    )
    out$SYMBOL[out$SYMBOL == ""] <- NA
    no_symbol <- is.na(out$SYMBOL)
    out$SYMBOL[no_symbol] <- out$ProbeID[no_symbol] # probes without a symbol get their probe id
    out$ENTREZID[out$ENTREZID == ""] <- NA # set empty Entrez id as NA
    out[["GO_ID"]] <- NULL # this is useless anyway
    out
  }

  ## Agilent-028279 SurePrint G3 Rat GE 8x60K Microarray GEO_028279_D_GEO_20130204.txt
  # http://www.genomics.agilent.com/CollectionSubpage.aspx?PageType=Product&SubPageType=ProductData&PageID=1524
  f <- read_agilent_design("GEO_028279_D_GEO_20130204.txt", comment.char = "", quote = "")
  fdata <- agilent_ge_fdata(f)
  save(fdata, file = "../../../data/Agilent028279.RData")

  ## Agilent-028004 SurePrint G3 Human Gene Expression 8x60K Microarray Kit
  # http://www.genomics.agilent.com/CollectionSubpage.aspx?PageType=Product&SubPageType=ProductData&PageID=1516
  f <- read_agilent_design("GEO_028004_D_GEO_20120411.txt", comment.char = "", quote = "")
  fdata <- agilent_ge_fdata(f)
  save(fdata, file = "../../../data/Agilent028004.RData")

  ## Agilent-028005 SurePrint G3 Mouse GE 8x60K Microarray Sat 7 Jul 06:27:30 2012
  # This one comes from the GEO platform record, whose probe id and Entrez id
  # columns are named ID and GENE.
  gpl <- "GPL10787"
  fdata <- agilent_ge_fdata(Table(getGEO(gpl)), id_col = "ID", entrez_col = "GENE")
  save(fdata, file = "../../../data/Agilent028005.RData")

  # Agilent-039494 SurePrint G3 Human GE v2 8x60K Microarray Wed 15 Aug 17:35:19 2012
  # GEO_039494_D_GEO_20120628.txt
  # Name: SurePrint G3 Human GE v2 8x60K Microarray
  # Design ID: 039494
  # Design Format: 8 x 60 K
  # Control Grid: IS-62976-8-V2_60kby8_GX_EQC_201000210
  # Build Version: hg19:GRCh37:Feb2009
  # Note: this file is read with read.delim()'s default quoting, as before.
  f <- read_agilent_design("GEO_039494_D_GEO_20120628.txt")
  fdata <- agilent_ge_fdata(f)
  save(fdata, file = "../../../data/Agilent039494.RData")

  ## Agilent-026440 Wed 10 Oct 15:42:34 2012
  # Name: S. scrofa (Pig) Oligo Microarray v2
  # Design ID: 026440
  # Design Format: 4 X 44K
  # Control Grid: IS-45220-4-V1_4x44K_GX_EQC_V20060608
  # Build Version: Not Applicable
  # The file is broken, hence comment.char = "" and quote = "".
  f <- read_agilent_design("GEO_026440_D_GEO_20120509.txt", comment.char = "", quote = "")
  fdata <- agilent_ge_fdata(f)
  save(fdata, file = "../../../data/Agilent026440.RData")
  ## Affymetrix [Mouse430_2] Affymetrix Mouse Genome 430 2.0 Array
  # The stray miRBase lines that used to sit here (ls("package:mirbase.db"),
  # as.list(mirbaseACC2ID), fdata$ACC lookup) were removed: this file never
  # loads mirbase.db, so they stopped the section with an error and had no
  # effect on the saved table.
  gpl <- "GPL1261"
  temp <- Table(getGEO(gpl))
  fdata <- temp
  # Controlled columns first (Control and Chromosome are left empty for this
  # platform), then every column of the GEO platform table.
  fdata <- data.frame(
    "ProbeID" = fdata[["ID"]],
    "SYMBOL" = fdata[["Gene Symbol"]],
    "ENTREZID" = fdata[["ENTREZ_GENE_ID"]],
    "Control" = "",
    "Chromosome" = "",
    fdata,
    row.names = fdata$ID,
    check.names = FALSE
  )
  fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes
  # Remove GO: keeps the first 18 columns by position.
  # NOTE(review): this assumes the GO columns are the trailing ones -- confirm
  # if the GEO table layout changes.
  fdata <- fdata[, 1:18]
  save(fdata, file = "../../../data/Affymetrix_Mouse430_2.RData")
  ## Affymetrix 3p IVT Yeast Genome 2.0 Array
  library(yeast2.db)
  options(stringsAsFactors = FALSE)
  # Bioconductor annotation (GENENAME and ORF) for every key of yeast2.db,
  # indexed by probe id.
  bioc.fdata <- select(
    yeast2.db,
    columns = c("GENENAME", "ORF"),
    keys = keys(yeast2.db)
  )
  rownames(bioc.fdata) <- bioc.fdata[["PROBEID"]]
  # Annotation CSV; lines starting with '#' are skipped.
  fn <- "Yeast_2.na34.annot.csv"
  fdata <- read.csv(fn, comment.char = "#", check.names = FALSE)
  # Controlled columns, then the Bioconductor columns looked up by probe set
  # id, then the full CSV content.
  fdata <- data.frame(
    "ProbeID" = fdata[["Probe Set ID"]],
    "SYMBOL" = fdata[["Gene Symbol"]],
    "ENTREZID" = fdata[["Entrez Gene"]],
    bioc.fdata[fdata[["Probe Set ID"]], ],
    fdata,
    row.names = fdata[["Probe Set ID"]],
    check.names = FALSE
  )
  # Everything from the first "///" onward is replaced by "and others";
  # '---' symbols become NA and then fall back to the probe id.
  fdata$SYMBOL <- gsub("///.*", "and others", fdata$SYMBOL)
  fdata$SYMBOL[fdata$SYMBOL == "---"] <- NA
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]
  save(fdata, file = "../../../data/Affymetrix_Yeast_2.RData")
  ## Illumina mouse WG-6 Mon 11 Feb 11:02:51 2013
  # Note: the probe annotation of "MouseWG-6_V2_0_R3_11278593_A.bgx" is
  # extracted from an actual SampleProbeProfile export, read as all-character.
  fdata <- read.delim(
    "MouseWG-6_V2_0_R3_11278593_A.bgx_SampleProbeProfile_20130118.txt",
    check.names = FALSE,
    comment.char = "",
    quote = "",
    colClasses = "character"
  )
  # Drop the columns whose names start with "MHA".
  fdata <- fdata[, !grepl("^MHA", colnames(fdata))]
  rownames(fdata) <- fdata$ProbeID
  # ENTREZID is a copy of ENTREZ_GENE_ID with empty strings turned into NA.
  fdata[["ENTREZID"]] <- fdata[["ENTREZ_GENE_ID"]]
  fdata[["ENTREZID"]][fdata[["ENTREZID"]] == ""] <- NA
  # Put the controlled columns first, keep the remaining ones in their order,
  # then drop the ONTOLOGY_ columns.
  fdata <- fdata[, c(
    "ProbeID", "SYMBOL", "ENTREZID",
    setdiff(colnames(fdata), c("ProbeID", "SYMBOL", "ENTREZID"))
  )]
  fdata <- fdata[, !grepl("ONTOLOGY_", colnames(fdata))]
  save(fdata, file = "../../../data/MouseWG-6_V2_0_R3_11278593_A.RData")
  ## Illumina HT12 from example sample probe report Sat 23 Mar 22:07:41 2013
  library("org.Hs.eg.db")
  fdata <- read.delim(
    "HumanHT-12_v4_sample_probe_report.txt",
    check.names = FALSE,
    comment.char = "",
    quote = "",
    colClasses = "character",
    skip = 3
  )
  # Keep the first 22 columns, minus the ONTOLOGY ones.
  fdata <- fdata[, c(1:22)]
  fdata <- fdata[, !grepl("ONTOLOGY", colnames(fdata))]
  rownames(fdata) <- fdata$ProbeID
  fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes
  # Entrez ids are looked up from the symbol. mget() returns one character
  # vector per symbol, and a symbol can map to several ids; as.character() on
  # that list used to deparse such entries into strings like 'c("1", "2")'.
  # Each entry is now collapsed to a single ';'-separated string, the same
  # convention the 450k section of this file uses for ENTREZID.
  has_entrez <- fdata$SYMBOL %in% keys(org.Hs.egSYMBOL2EG)
  fdata[["ENTREZID"]] <- NA_character_
  fdata[["ENTREZID"]][has_entrez] <- vapply(
    mget(fdata$SYMBOL[has_entrez], org.Hs.egSYMBOL2EG),
    function(entrez) paste(unique(entrez), collapse = ";"),
    character(1),
    USE.NAMES = FALSE
  )
  save(fdata, file = "../../../data/HumanHT-12_v4.RData")
  ## Mouse Gene 1.0 ST Array : MoGene-1_0-st-v1
  # To install the annotation package:
  #   source("http://bioconductor.org/biocLite.R")
  #   biocLite("mogene10sttranscriptcluster.db")
  library(mogene10sttranscriptcluster.db) # ls("package:mogene10sttranscriptcluster.db")
  options(stringsAsFactors = FALSE)
  gpl <- "GPL6246"
  fdata <- Table(getGEO(gpl))
  # Discard the rows flagged as not mapped to the latest genome.
  fdata <- fdata[fdata[["SPOT_ID"]] != "Not currently mapped to latest genome", ]
  # Attempt at a symbol: second ' // '-separated field of gene_assignment.
  fdata[["GeneSymbol"]] <- sapply(
    strsplit(fdata[["gene_assignment"]], split = " // "),
    function(z) z[2]
  )
  # SYMBOL, ENTREZID and GENENAME come from the Bioconductor annotation
  # package; the GEO columns follow.
  fdata <- data.frame(
    "ProbeID" = fdata[["ID"]],
    "SYMBOL" = unlist(mget(fdata[["ID"]], mogene10sttranscriptclusterSYMBOL)),
    "ENTREZID" = unlist(mget(fdata[["ID"]], mogene10sttranscriptclusterENTREZID)),
    "GENENAME" = unlist(mget(fdata[["ID"]], mogene10sttranscriptclusterGENENAME)),
    fdata,
    row.names = fdata$ID,
    check.names = FALSE
  )
  # Probes without a symbol get their probe id.
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata[["ProbeID"]][is.na(fdata$SYMBOL)]
  save(fdata, file = "../../../data/Affymetrix_MoGene-1_0-st-v1.RData")
  245. # TODO: use Bioc library to add a few things...
  # [RaGene-2_0-st] Affymetrix Rat Gene 2.0 ST Array
  library(ragene20sttranscriptcluster.db)
  gpl <- "GPL17117"
  fdata <- Table(getGEO(gpl))
  # Keep only the ids known to the Bioconductor annotation package.
  fdata <- fdata[fdata$ID %in% keys(ragene20sttranscriptclusterSYMBOL), ]
  # Attempt at a symbol: second ' // '-separated field of gene_assignment.
  fdata[["GeneSymbol"]] <- sapply(
    strsplit(fdata[["gene_assignment"]], split = " // "),
    function(z) z[2]
  )
  # SYMBOL, ENTREZID and GENENAME come from the annotation package; the GEO
  # columns follow.
  fdata <- data.frame(
    "ProbeID" = fdata[["ID"]],
    "SYMBOL" = unlist(mget(fdata[["ID"]], ragene20sttranscriptclusterSYMBOL)),
    "ENTREZID" = unlist(mget(fdata[["ID"]], ragene20sttranscriptclusterENTREZID)),
    "GENENAME" = unlist(mget(fdata[["ID"]], ragene20sttranscriptclusterGENENAME)),
    fdata,
    row.names = fdata$ID,
    check.names = FALSE
  )
  # Probes without a symbol get their probe id; columns named GO_* are dropped.
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata[["ProbeID"]][is.na(fdata$SYMBOL)]
  fdata <- fdata[, !grepl("^GO_", colnames(fdata))]
  save(fdata, file = "../../../data/Affymetrix_RaGene-2_0-st.RData")
  261. #raw = oligo::rma(oligo::read.celfiles(list.files("~/Dropbox/projects/2013/gsebire_Sebire001_PRJBFX-482/chips",full.names=TRUE)))
  262. # Options: use bioc annot, use annot from probe profile, use annot from switchtoi
  # [MoGene-2_0-st] Affymetrix Mouse Gene 2.0 ST Array [transcript (gene) version]
  library(mogene20sttranscriptcluster.db)
  gpl <- "GPL16570"
  fdata <- Table(getGEO(gpl))
  # Keep only the ids known to the Bioconductor annotation package, then check
  # (interactively) whether both id sets coincide.
  fdata <- fdata[fdata$ID %in% keys(mogene20sttranscriptclusterSYMBOL), ]
  setequal(fdata$ID, keys(mogene20sttranscriptclusterSYMBOL))
  # Attempt at a symbol: second ' // '-separated field of gene_assignment.
  fdata[["GeneSymbol"]] <- sapply(
    strsplit(fdata[["gene_assignment"]], split = " // "),
    function(z) z[2]
  )
  # SYMBOL, ENTREZID and GENENAME come from the annotation package; the GEO
  # columns follow.
  fdata <- data.frame(
    "ProbeID" = fdata[["ID"]],
    "SYMBOL" = unlist(mget(fdata[["ID"]], mogene20sttranscriptclusterSYMBOL)),
    "ENTREZID" = unlist(mget(fdata[["ID"]], mogene20sttranscriptclusterENTREZID)),
    "GENENAME" = unlist(mget(fdata[["ID"]], mogene20sttranscriptclusterGENENAME)),
    fdata,
    row.names = fdata$ID,
    check.names = FALSE
  )
  # Probes without a symbol get their probe id; columns named GO_* are dropped.
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata[["ProbeID"]][is.na(fdata$SYMBOL)]
  fdata <- fdata[, !grepl("^GO_", colnames(fdata))]
  save(fdata, file = "../../../data/Affymetrix_MoGene-2_0-st.RData")
  # [HuGene-2_0-st] Affymetrix Human Gene 2.0 ST Array [transcript (gene) version]
  library(hugene20sttranscriptcluster.db)
  # Previous annotation release: "HuGene-2_0-st-v1.na33.2.hg19.transcript.csv"
  fn <- "HuGene-2_0-st-v1.na34.hg19.transcript.csv"
  fdata <- read.csv(fn, skip = 23, comment.char = "", colClasses = "character")
  rownames(fdata) <- fdata$transcript_cluster_id
  # Keep only the transcript clusters known to the Bioconductor annotation
  # package, then check (interactively) whether both id sets coincide.
  fdata <- fdata[rownames(fdata) %in% keys(hugene20sttranscriptclusterSYMBOL), ]
  setequal(rownames(fdata), keys(hugene20sttranscriptclusterSYMBOL))
  # SYMBOL, ENTREZID and GENENAME come from the annotation package; the CSV
  # columns follow.
  fdata <- data.frame(
    "ProbeID" = rownames(fdata),
    "SYMBOL" = unlist(mget(rownames(fdata), hugene20sttranscriptclusterSYMBOL)),
    "ENTREZID" = unlist(mget(rownames(fdata), hugene20sttranscriptclusterENTREZID)),
    "GENENAME" = unlist(mget(rownames(fdata), hugene20sttranscriptclusterGENENAME)),
    fdata,
    row.names = rownames(fdata),
    check.names = FALSE
  )
  # Attempt at a symbol and a cytoband: 2nd and 4th ' // '-separated fields of
  # gene_assignment.
  assignment_fields <- strsplit(fdata[["gene_assignment"]], split = " // ")
  fdata[["GeneSymbol"]] <- sapply(assignment_fields, function(z) z[2])
  fdata[["Cytoband"]] <- sapply(assignment_fields, function(z) z[4])
  # Missing symbols fall back to the parsed symbol first, then to the probe id.
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata[["GeneSymbol"]][is.na(fdata$SYMBOL)]
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata[["ProbeID"]][is.na(fdata$SYMBOL)]
  fdata <- fdata[, !grepl("^GO_", colnames(fdata))]
  save(fdata, file = "../../../data/Affymetrix_HuGene-2_0-st.RData")
  299. ############################################################################################################
  300. ############################################################################################################
  301. # Clariom
  302. ############################################################################################################
  303. ############################################################################################################
  # Clariom arrays: Clariom_S_Mouse_HT, Clariom_S_Mouse, Clariom_D_Human,
  # Clariom_S_Human, Clariom_S_Human_HT.
  # The five sections ran the same pipeline on different transcript CSV files,
  # so the shared steps live in build_clariom_fdata().
  library(data.table)
  library(stringr)
  library(magrittr)

  # Build the feature table for one Clariom transcript annotation CSV.
  #   csv_path: path of the "*.transcript.csv" annotation file.
  # Returns a data.frame indexed by transcript_cluster_id with ProbeID, SYMBOL,
  # ENTREZID and GeneName first, followed by the kept annotation columns.
  build_clariom_fdata <- function(csv_path) {
    keep_cols <- c( # columns to keep!
      "transcript_cluster_id",
      "probeset_id",
      "seqname",
      "strand",
      "start",
      "stop",
      "total_probes",
      "gene_assignment",
      "category",
      "locus type",
      "notes"
    )
    annot <- as.data.frame(fread(csv_path))
    rownames(annot) <- annot[["transcript_cluster_id"]]
    annot <- annot[, keep_cols]

    # gene_assignment is split once on both " /// " and " // "; fields 2, 3
    # and 5 are taken as symbol, gene name and Entrez id (NA when absent).
    fields <- str_split(annot[["gene_assignment"]], "( /// | // )")
    nth_field <- function(i) vapply(fields, function(x) x[i], character(1))

    out <- data.frame(
      "ProbeID" = rownames(annot),
      "SYMBOL" = nth_field(2),
      "ENTREZID" = nth_field(5),
      "GeneName" = nth_field(3),
      annot,
      row.names = rownames(annot),
      check.names = FALSE
    )
    # Probes without a symbol get their probe id.
    no_symbol <- is.na(out$SYMBOL)
    out$SYMBOL[no_symbol] <- out[["ProbeID"]][no_symbol]
    # Keep only the part of gene_assignment before the first " /// " and
    # rename the column to say so.
    out[["gene_assignment"]] <- vapply(
      str_split(out[["gene_assignment"]], " /// "),
      function(x) x[1],
      character(1)
    )
    colnames(out) <- gsub(
      "^gene_assignment$",
      "gene_assignment (first transcript only)",
      colnames(out)
    )
    out
  }

  # Clariom_S_Mouse_HT (file downloaded from netaffx)
  fdata <- build_clariom_fdata("/Users/flefebvr/Dropbox/projects/2016/skimmins_kimmins006_PRJBFX-1415/Clariom_S_Mouse_HT.na36.mm10.transcript.csv")
  save(fdata, file = "../../../data/Affymetrix_Clariom_S_Mouse_HT.RData")
  # compare with Eloi's file
  # x = fread("/Users/flefebvr/Dropbox/projects/2016/skimmins_kimmins006_PRJBFX-1415/kimmins006.summary.SST-RMA-GENE-FULL.txt") %>% as.data.frame
  # rownames(x) = x[,1]
  # setequal(rownames(x),rownames(fdata))

  # Clariom_S_Mouse (file downloaded from netaffx)
  fdata <- build_clariom_fdata("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_S_Mouse.na36.mm10.transcript.csv")
  save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_S_Mouse.RData")
  # compare with Eloi's file
  # x = fread("/Users/emercier/Workdir/Boerboom008_PRJBFX-1419/data/boerboom008.summary.SST-RMA-GENE-FULL.txt") %>% as.data.frame
  # rownames(x) = x[,1]
  # setequal(rownames(x),rownames(fdata))

  # Clariom D human (file downloaded from netaffx)
  fdata <- build_clariom_fdata("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_D_Human.na36.hg38.transcript.csv")
  save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_D_Human.RData")

  # Clariom S human (file downloaded from affy)
  fdata <- build_clariom_fdata("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_S_Human.na36.hg38.transcript.csv")
  save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_S_Human.RData")

  # Clariom S human HT, exactly the same as Clariom S (file downloaded from affy)
  fdata <- build_clariom_fdata("/Users/emercier/Workdir/gqMicroarray_newPlatform/data/Clariom_S_Human_HT.na36.hg38.transcript.csv")
  save(fdata, file = "~/Tools/rpackages/gqData/data/Affymetrix_Clariom_S_Human_HT.RData")
  457. ############################################################################################################
  458. ############################################################################################################
  459. # 450k Methylation arrays
  460. ############################################################################################################
  461. ############################################################################################################
  ## Illumina 450k Fri 23 Nov 11:05:37 2012 . Illumina Inc. updated this Nov 20 2012
  gpl <- "GPL13534"
  temp <- Table(getGEO(gpl))
  fdata <- temp
  # Controlled columns first; SYMBOL and ENTREZID are placeholders filled in
  # below, Control is left empty and Chromosome is copied from CHR.
  fdata <- data.frame(
    "ProbeID" = fdata[["ID"]],
    "SYMBOL" = NA,
    "ENTREZID" = NA,
    "Control" = "",
    "Chromosome" = fdata[["CHR"]],
    fdata,
    row.names = fdata$ID,
    check.names = FALSE
  )

  # Define symbols: distinct ';'-separated names of UCSC_RefGene_Name, joined
  # back with ';'. Probes left without a symbol get their probe id.
  sy <- strsplit(fdata$UCSC_RefGene_Name, split = ";")
  sy <- mclapply(sy, unique, mc.cores = 8)
  fdata$SYMBOL <- unlist(
    mclapply(sy, function(x) paste(x, collapse = ";"), mc.cores = 8)
  )
  fdata$SYMBOL[fdata$SYMBOL == ""] <- NA
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]

  # Define Entrez ids: the distinct accessions of UCSC_RefGene_Accession are
  # translated through org.Hs.egREFSEQ2EG. `dict` holds the translation for
  # the accessions that occur on the array and are present in that map.
  eg <- strsplit(fdata$UCSC_RefGene_Accession, split = ";")
  eg <- mclapply(eg, unique, mc.cores = 8)
  dict <- unique(unlist(eg))
  dict <- dict[dict %in% names(as.list(org.Hs.egREFSEQ2EG))]
  dict <- as.list(org.Hs.egREFSEQ2EG[dict])
  gc()
  # z =Sys.time()
  # Should take 5 minutes if not swapping.
  eg <- mclapply(
    eg,
    function(accs) {
      unlist(dict[accs])
    },
    mc.cores = 4
  )
  # print(Sys.time()-z)
  fdata$ENTREZID <- unlist(
    mclapply(eg, function(x) paste(unique(x), collapse = ";"), mc.cores = 8)
  )
  fdata$ENTREZID[fdata$ENTREZID == ""] <- NA
  gc()
  save(fdata, file = "../../../data/IlluminaHumanMethylation450.RData")
  496. ############################################################################################################
  497. ############################################################################################################
  498. # Agilent CpG Island Arrays
  499. ############################################################################################################
  500. ############################################################################################################
  ## Agilent-015279 Mouse CpG Island ChIP-on-Chip Microarray 2x105K
  library(BSgenome.Mmusculus.UCSC.mm9) # this will load object Mmusculus
  bs.genome <- Mmusculus
  # na.strings is set to a nonsense token so that literal "NA" values in the
  # file are read as the string "NA" rather than as missing values.
  f <- read.delim(
    "GEO_015279_D_GEO_20111102.txt",
    skip = 34,
    comment.char = "",
    quote = "",
    na.strings = "fbnjwknfwjfw"
  )
  # Drop the last row (head() stays correct even for tiny tables), keep the
  # distinct rows of columns 4 onward, and use SPOT_ID as the probe id.
  f <- head(f, -1L)
  f <- unique(f[, 4:ncol(f)])
  f[["ID"]] <- f[["SPOT_ID"]]
  # Discard the probes whose id is "NA" or starts with "NA.".
  f <- f[!grepl("^NA\\.", f$ID), ]
  f <- f[f$ID != "NA", ]
  fdata <- f
  # Split CHROMOSOMAL_LOCATION ("<seq>:<start>-<end>") once for start and end.
  location_parts <- strsplit(fdata[["CHROMOSOMAL_LOCATION"]], split = "(:|-)")
  fdata <- data.frame(
    "ProbeID" = fdata[["ID"]],
    "SYMBOL" = "", # filled in below
    "ENTREZID" = "", # filled in below
    "Control" = fdata[["CONTROL_TYPE"]],
    "Chromosome" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
    "seq" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
    "start" = as.numeric(sapply(location_parts, function(z) z[2])),
    "end" = as.numeric(sapply(location_parts, function(z) z[3])),
    "SEQUENCE" = NA,
    "cpg.density" = NA, # to be tuned according to fragment size, which seems to be between 200 and 1000 bp
    "cpg.density.windowSize" = 1000,
    "gc.count" = NA,
    # ,'LOCATION' = fdata[['CHROMOSOMAL_LOCATION']]
    # ,'GENOME' = 'mm9'
    # ,'ISLAND_LOCATION' = fdata[['DESCRIPTION']]
    fdata,
    row.names = fdata$ID
  )

  # Define SYMBOL.
  # SYMBOL should be assigned through GB_ACC instead, their column is odd.
  fdata$SYMBOL <- fdata$GENE_SYMBOL
  fdata$SYMBOL[fdata$SYMBOL == ""] <- fdata$ID[fdata$SYMBOL == ""]
  # fdata$SYMBOL[is.na(fdata$SYMBOL)] = fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes

  # Define ENTREZID from the symbol. mget() returns one character vector per
  # symbol and a symbol can map to several Entrez ids, so unlist() could yield
  # more values than there are rows to fill and shift every later id. Each
  # entry is collapsed to one ';'-separated string instead, the same
  # convention the 450k section of this file uses for ENTREZID.
  has_entrez <- fdata$SYMBOL %in% keys(org.Mm.egSYMBOL2EG)
  fdata$ENTREZID <- NA_character_
  fdata$ENTREZID[has_entrez] <- vapply(
    mget(fdata$SYMBOL[has_entrez], org.Mm.egSYMBOL2EG),
    function(entrez) paste(unique(entrez), collapse = ";"),
    character(1),
    USE.NAMES = FALSE
  )
  # fdata$ENTREZID[fdata$ENTREZID==''] = NA # set empty entrez id as NA
  # fdata$ENTREZID = as.character(sapply( gsub(':.*','',fdata[['GB_ACC']]) ,function(gb)mm.refseq2eg[[gb]]))
  # fdata$ENTREZID[fdata$ENTREZID=='NULL'] = NA # set empty entrez id as NA

  # Define SYMBOL (disabled alternative).
  # SYMBOL should be assigned through GB_ACC instead, their column is odd.
  # fdata$SYMBOL = as.character(sapply(fdata[['ENTREZID']],function(eg) mm.eg2s[[eg]] ))
  # fdata$SYMBOL[fdata$SYMBOL=='NULL'] = NA
  # fdata$SYMBOL[is.na(fdata$SYMBOL)] = fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes

  # Define Sequence (disabled; the GEO annotation has locations outside of the
  # chromosome ranges).
  # wo = fdata[['chr']] %in% names(bs.genome)
  # fdata[['SEQUENCE']][wo] = getSeq(bs.genome, fdata[['chr']][wo], start=fdata[['chr.start']][wo],end=fdata[['chr.end']][wo], as.character=TRUE)

  # GC count (disabled).
  # seqs = DNAStringSet(fdata[['SEQUENCE']][wo] )
  # tmp <- alphabetFrequency(seqs, baseOnly = TRUE)
  # fdata[['gc.count']][wo] = tmp[, "C"] + tmp[, "G"]

  # Define window CpG density (disabled).
  # fdata[['cpg.density']] = cpgdensity(bs.genome, chr = fdata[['chr']], pos = fdata[['chr.start']], windowSize = unique(fdata[['cpg.density.windowSize']]) )
  save(fdata, file = "../../../data/Agilent015279.RData")
  555. ############################################################################################################
  556. ############################################################################################################
  557. # miRNA arrays
  558. ############################################################################################################
  559. ############################################################################################################
  560. # mirBase stuff
  561. #library(mirbase.db)
  562. #ls("package:mirbase.db")
  563. #all.mirbase.ids = names(as.list(mirbaseID2ACC))
  564. #all.mirbase.acc = as.character((as.list(mirbaseID2ACC)))
  565. #all.mirbase.mature = as.character(unlist(lapply( mget(mappedkeys(mirbaseMATURE), mirbaseMATURE), function(x) matureName(x))))
  566. #ask.mirbase <- function(ids)
  567. #{
  568. # sapply(ids,function(id){
  569. #Sys.sleep(0.1)
  570. # print(id)
  571. # html = readLines(paste('http://www.mirbase.org/cgi-bin/query.pl?terms=',id,sep=''))
  572. # if(any(grepl('We found <b>1</b> unique result for your query',html,fixed=TRUE)))
  573. # {
  574. # html = html[grepl('acc=',html)][2]
  575. # html = gsub('.*\\">','',html)
  576. # html = gsub('</a></td>','',html)
  577. # #html = gsub('.*acc=.*acc=.*\\">','',html)
  578. # #html = gsub('</a></td>.*','',html)
  579. #
  580. # }else{html = NA}
  581. # return(html)
  582. #
  583. # },simplify=FALSE)
  584. #}
  ## Agilent-035430 mouse miRNA array (miRBase release 18 miRNA ID version)
  # TODO: There are two problems with the Agilent annotation. First, it is not
  # miRBase 18 as stated. Second, it is not a probe-level annotation and has no
  # content, so it is useless; the GEO platform table is used instead.
  gpl <- 'GPL15547'
  temp <- Table(getGEO(gpl))

  # miRBase: build a lookup from identifier -> miRBase accession(s).
  mb <- read.xls('../miRbase/miRNA_17.xls')
  rownames(mb) <- mb[['Accession']]
  # NOTE(review): columns 2, 6, 9, 12 and 15 are presumably the identifier/name
  # columns of the miRBase sheet -- confirm against the spreadsheet header.
  ids <- mb[, c(2, 6, 9, 12, 15)]
  # One character vector of non-empty identifiers per sheet row. seq_len() keeps
  # the iteration empty for a zero-row sheet (1:nrow() would yield c(1, 0)).
  ids <- lapply(seq_len(nrow(ids)), function(i) {
    x <- as.character(ids[i, ])
    x[x != '']
  })
  names(ids) <- rownames(mb)
  # Invert the list so that it is keyed by identifier and holds accessions.
  ids <- revmap(ids)

  # Agilent annotation?
  #ann = read.delim('GEO_035430_D_GEO_20111226.txt',skip=34)
  #ann = ann[1:(nrow(ann)-1),]
  #ann = ann[!is.na(ann$SPOT_ID),]
  #nrow(ann)

  fdata <- temp
  fdata <- data.frame(
    'ProbeID' = fdata[['ID']],
    'SYMBOL' = fdata[['miRNA_ID']],
    # Comma-separated accessions for each probe ID, '' when there is no match.
    # The accessions are sorted BEFORE being joined; sorting the joined string
    # (as the previous sort(paste(...)) did) is a no-op.
    'ACCESSION' = vapply(fdata[['ID']], function(x) {
      acc <- ids[[x]]
      if (is.null(acc)) '' else paste(sort(acc), collapse = ',')
    }, character(1)),
    'ENTREZID' = '',
    'Control' = '',
    'Chromosome' = '',
    fdata,
    row.names = fdata$ID,
    check.names = FALSE
  )
  # Blank symbols become NA, then every NA symbol falls back to the probe ID.
  fdata$SYMBOL[fdata$SYMBOL == ''] <- NA
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes
  save(fdata, file = '../../../data/Agilent035430.RData')
  ## Affy miRNA Mon 9 Jul 13:42:03 2012
  gpl <- 'GPL14613'
  temp <- Table(getGEO(gpl))

  # Derive a miRBase-style identifier from the array design transcript ID:
  # '-star' is rewritten to '*', and repeated entries within a ' // '-separated
  # field are collapsed to their unique values.
  temp[['my.mirbase.id']] <- temp[['Transcript ID(Array Design)']]
  temp[['my.mirbase.id']] <- gsub('-star', '*', temp[['my.mirbase.id']], fixed = TRUE)
  temp[['my.mirbase.id']] <- vapply(
    strsplit(temp[['my.mirbase.id']], split = ' // '),
    function(x) paste(unique(x), collapse = ' // '),
    character(1)
  )

  # miRBase: build a lookup from identifier -> miRBase accession(s).
  mb <- read.xls('../miRbase/miRNA_15.xls')
  rownames(mb) <- mb[['Accession']]
  # NOTE(review): columns 2, 6, 9, 12 and 15 are presumably the identifier/name
  # columns of the miRBase sheet -- confirm against the spreadsheet header.
  ids <- mb[, c(2, 6, 9, 12, 15)]
  # One character vector of non-empty identifiers per sheet row. seq_len() keeps
  # the iteration empty for a zero-row sheet (1:nrow() would yield c(1, 0)).
  ids <- lapply(seq_len(nrow(ids)), function(i) {
    x <- as.character(ids[i, ])
    x[x != '']
  })
  names(ids) <- rownames(mb)
  # Invert the list so that it is keyed by identifier and holds accessions.
  ids <- revmap(ids)
  #temp[['my.mirbase.id']][!temp[['my.mirbase.id']] %in% names(ids)]

  fdata <- temp
  fdata <- data.frame(
    'ProbeID' = fdata[['ID']],
    'SYMBOL' = fdata[['my.mirbase.id']],
    'ENTREZID' = '',
    # Comma-separated accessions for each identifier, '' when there is no match.
    # The accessions are sorted BEFORE being joined; sorting the joined string
    # (as the previous sort(paste(...)) did) is a no-op.
    'ACCESSION' = vapply(fdata[['my.mirbase.id']], function(x) {
      acc <- ids[[x]]
      if (is.null(acc)) '' else paste(sort(acc), collapse = ',')
    }, character(1)),
    'Control' = '',
    # Keep only the text before the first ':' of the alignment string.
    'Chromosome' = gsub(':.*', '', fdata[['Alignments']]),
    fdata,
    row.names = fdata$ID,
    check.names = FALSE
  )
  # Blank symbols become NA, then every NA symbol falls back to the probe ID.
  fdata$SYMBOL[fdata$SYMBOL == ''] <- NA
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes
  save(fdata, file = '../../../data/Affymetrix_miRNA_2_0.RData')
  644. # TODO: see how the packages microRNA, miRNApath and Rmir can help you do pathway analysis
  # Affymetrix_miRNA_3_0
  # [miRNA-3_0] Affymetrix Multispecies miRNA-3 Array
  ## Affy miRNA Mar 4 2014
  # Download the GEO platform table and prepend standard ProbeID/SYMBOL columns.
  gpl <- "GPL16384"
  fdata <- Table(getGEO(gpl))
  fdata <- data.frame(
    "ProbeID" = fdata[["ID"]],
    "SYMBOL" = fdata[["Transcript ID(Array Design)"]],
    fdata,
    row.names = fdata$ID,
    check.names = FALSE
  )
  # Mark blank symbols as missing ...
  is.na(fdata$SYMBOL) <- fdata$SYMBOL == ""
  # ... and let every missing symbol fall back to its probe ID.
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]
  save(fdata, file = "../../../data/Affymetrix_miRNA_3_0.RData")
  # ## Agilent-031181_MarkBasik (Mark Basik's version: not exactly 031181, the
  # ## probe names are not exactly the same)
  # #GEO_031181_D_GEO_20121224.txt
  # Probe/gene table taken from one scanned Agilent feature-extraction file.
  # NOTE(review): hard-coded local path; the file cannot be shared on bitbucket.
  e <- read.maimages(
    "/Users/flefebvr/Dropbox/projects/2012/yriazalhosseini_450k_Nov2012_PRJBFX-348/miRNA/chips/253118113936_201302211504_S01_miRNA_107_Sep09_1_1.txt",
    green.only = TRUE,
    source = "agilent"
  )$genes
  # Keep columns 3 onward, de-duplicate, and key the rows by probe name.
  e <- unique(e[, 3:ncol(e)])
  rownames(e) <- e$ProbeName

  # GEO annotation: keep columns 4 onward, de-duplicate, and discard rows whose
  # SPOT_ID is blank or missing before keying the rows by SPOT_ID.
  fdata <- read.delim("GEO_031181_D_GEO_20121224.txt", skip = 34)
  fdata <- unique(fdata[, 4:ncol(fdata)])
  fdata <- fdata[!is.na(fdata$SPOT_ID) & fdata$SPOT_ID != "", ]
  rownames(fdata) <- fdata$SPOT_ID
  #setdiff(rownames(e),rownames(fdata))
  #setdiff(rownames(fdata),rownames(e))

  # Left join on row names: every probe on the chip is kept, annotated or not.
  fdata <- merge(e, fdata, by = "row.names", all.x = TRUE)
  rownames(fdata) <- fdata$Row.names
  fdata[["Row.names"]] <- NULL

  fdata <- data.frame(
    "ProbeID" = fdata[["ProbeName"]],
    "SYMBOL" = fdata[["SystematicName"]],
    "Control" = fdata[["ControlType"]],
    # Keep only the text before the first ':' of the location string.
    "Chromosome" = gsub(":.*", "", fdata[["CHROMOSOMAL_LOCATION"]]),
    fdata,
    row.names = fdata[["ProbeName"]]
  )
  save(fdata, file = "../../../data/Agilent031181_MarkBasik.RData")
  679. #
  680. # # Agilent-039494 SurePrint G3 Human GE v2 8x60K Microarray Wed 15 Aug 17:35:19 2012
  681. # # GEO_039494_D_GEO_20120628.txt
  682. # # Name: SurePrint G3 Human GE v2 8x60K Microarray
  683. # # Design ID: 039494
  684. # # Design Format: 8 x 60 K
  685. # # Control Grid: IS-62976-8-V2_60kby8_GX_EQC_201000210
  686. # # Build Version: hg19:GRCh37:Feb2009
  687. # f = read.delim('GEO_039494_D_GEO_20120628.txt',skip=42)
  688. # f = f[1:(nrow(f)-1),]
  689. # f = unique(f[,4:20])
  690. # fdata = f
  691. # fdata = data.frame(
  692. # 'ProbeID' = fdata[['NAME']]
  693. # ,'SYMBOL' = fdata[['GENE_SYMBOL']]
  694. # ,'ENTREZID' = fdata[['LOCUSLINK_ID']]
  695. # ,'Control' = fdata[['CONTROL_TYPE']]
  696. # ,'Chromosome' = gsub(':.*','',fdata[['CHROMOSOMAL_LOCATION']])
  697. # ,fdata,row.names=fdata[['NAME']])
  698. # fdata$SYMBOL[fdata$SYMBOL==''] = NA
  699. # fdata$SYMBOL[is.na(fdata$SYMBOL)] = fdata$ProbeID[is.na(fdata$SYMBOL)] # replace emtpy probes
  700. # fdata$ENTREZID[fdata$ENTREZID==''] = NA # set emptpy entrez id as NA
  701. # fdata[['GO_ID']] = NULL # this is useless anyway
  702. # save(fdata ,file='../../../data/Agilent039494.RData')
  703. #
  ## Agilent-070155 Mouse_miRNA_V21.0_Microarray 8 x 60 K
  # Read the GEO-format design file as-is (no comment or quote characters, column
  # names left unmodified). FALSE is spelled out because F can be reassigned.
  f <- read.delim("GEO_070155_D_GEO_20141006.txt", skip = 34, comment.char = '',
                  quote = '', check.names = FALSE)
  # Drop the last row (presumably a trailing footer line -- TODO confirm).
  # head(f, -1L) also behaves for 0- or 1-row input, unlike 1:(nrow(f) - 1).
  f <- head(f, -1L)
  # Keep columns 4 onward and de-duplicate.
  f <- unique(f[, 4:ncol(f)])
  fdata <- f
  fdata <- fdata[!is.na(fdata$SPOT_ID), ]
  fdata <- data.frame(
    'ProbeID' = fdata[["SPOT_ID"]],
    'SYMBOL' = fdata[['GENE_SYMBOL']],
    'Control' = fdata[['CONTROL_TYPE']],
    fdata,
    row.names = fdata[['SPOT_ID']]
  )
  # Blank symbols become NA, then every NA symbol falls back to the probe ID.
  fdata$SYMBOL[fdata$SYMBOL == ''] <- NA
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)] # replace empty probes
  save(fdata, file = '../../../data/Agilent070155.RData')
  # Affymetrix_miRNA_4_0
  # [miRNA-4_0] Affymetrix Multispecies miRNA-4 Array
  ## Affy miRNA Sept 2015
  # Download the GEO platform table and prepend standard ProbeID/SYMBOL columns.
  gpl <- "GPL19117"
  fdata <- Table(getGEO(gpl))
  fdata <- data.frame(
    "ProbeID" = fdata[["ID"]],
    "SYMBOL" = fdata[["Transcript ID(Array Design)"]],
    fdata,
    row.names = fdata$ID,
    check.names = FALSE
  )
  # Mark blank symbols as missing ...
  is.na(fdata$SYMBOL) <- fdata$SYMBOL == ""
  # ... and let every missing symbol fall back to its probe ID.
  fdata$SYMBOL[is.na(fdata$SYMBOL)] <- fdata$ProbeID[is.na(fdata$SYMBOL)]
  # NOTE(review): absolute cluster path, unlike the relative '../../../data/'
  # used by the other sections of this script -- confirm before relocating.
  save(
    fdata,
    file = "/lb/project/mugqic/analyste_dev/software/mugqic_R_packages/mugqic_R_packages-master/gqData/data/Affymetrix_miRNA_4_0.RData"
  )