# PageRenderTime 52ms CodeModel.GetById 31ms app.highlight 16ms RepoModel.GetById 1ms app.codeStats 0ms

# /scripts/factor_analysis.R

# http://github.com/hpiwowar/alt-metrics_stats
# R | 535 lines | 294 code | 142 blank | 99 comment | 14 complexity | ddffee62a0eb79f390204ed13e689686 MD5 | raw file
  1bSaveImage = T
  2
# to get latex to work, first in the R GUI go to 
# Misc, start X11 server
# And execute the following line
# Append the TeX binaries directory to PATH (macOS MacTeX location).
Sys.setenv( PATH=paste(Sys.getenv("PATH"),"/usr/texbin",sep=":") ) 

# Start an Rserve daemon so other processes can connect to this R session;
# --no-save avoids the save-workspace prompt on exit.
library(Rserve)
Rserve(args="--no-save")

#library(plyr)
# Project helpers used throughout (presumably %nin%, tr(), hetcor.modified(),
# adjust.to.positive.definite(), print.thresh(), factor.scores.bartlett() --
# confirm in helper_functions.R; preprocessing.R is intentionally commented out).
source("scripts/helper_functions.R")
#source("scripts/preprocessing.R")
 14
### Get just the dataset we want for this
## Restrict to Research Articles
## get rid of other columns we won't be using

# Drop metadata columns not used as altmetrics indicators.
# %nin% ("not in") comes from the sourced helper functions.
# NOTE(review): dat.eventcounts must already exist in the workspace
# (built by the commented-out preprocessing step).
excludeColumns = c("journal", "articleType", "authorsCount")
dat.indep.stats = dat.eventcounts[,names(dat.eventcounts) %nin% excludeColumns]
# Keep only Research Articles. Row indices taken from dat.eventcounts still
# line up with dat.indep.stats here because only columns were dropped above.
dat.indep.stats = dat.indep.stats[which(as.character(dat.eventcounts$articleType) == "Research Article"),]
summary(dat.indep.stats)

# Column groups: article metadata vs. the altmetrics event counts themselves.
metadataColumns = c("doi", "pubDate", "daysSincePublished", "journal", "articleType", "authorsCount")
altmetricsColumns = names(dat.eventcounts)[names(dat.eventcounts) %nin% metadataColumns]
 26
#### transform

# Apply tr() (defined in the sourced helpers -- presumably a
# variance-stabilising transform; confirm there) to all altmetrics columns.
dat.indep.stats.tr = dat.indep.stats
dat.indep.stats.tr[,altmetricsColumns] = tr(dat.indep.stats.tr[,altmetricsColumns])
summary(dat.indep.stats.tr)
 32
 33###### Normalize
 34
 35
 36inWindow = function(x, windowSize=60) {
 37	inWindowVals = list()
 38	for (ii in seq(along=x)) {
 39		inWindowVals[ii] = list(which((x > (x[ii] - windowSize/2)) & (x < (x[ii] + windowSize/2))))
 40	}
 41	return(inWindowVals)
 42}
 43
 44meanInWindow = function(whichInWindow, y) {
 45	a = sapply(seq(along=y), function(i, x, y) {mean(y[x[[i]]], na.rm=T)}, whichInWindow, y)
 46	return(a)
 47}
 48
 49varInWindow = function(whichInWindow, y, ii) {
 50	a = sapply(seq(along=y), function(i, x, y) {var(y[x[[i]]], na.rm=T)}, whichInWindow, y)
 51	return(a)
 52}
 53
 54
 55
# Copies that will hold the time-normalized versions of the altmetrics, plus
# a frame collecting the per-article moving background means.
dat.indep.stats.tr.norm = dat.indep.stats.tr
dat.indep.stats.tr.meannorm = dat.indep.stats.tr
dat.background = data.frame(pubDate = dat.indep.stats.tr.norm$pubDate)

# For each article, indices of all articles published within a 180-day
# window centred on it (computed once, reused for every metric below).
inWindow180 = inWindow(dat.indep.stats.tr$daysSincePublished, 180)


# For each metric: subtract the moving background mean ("meannorm"), and also
# divide by the moving background variance ("norm").
# NOTE(review): division is by the variance, not the standard deviation --
# confirm that is intended.
for (col in altmetricsColumns) {
	print(col)
	background = meanInWindow(inWindow180, dat.indep.stats.tr[,col])
	background.var = varInWindow(inWindow180, dat.indep.stats.tr[,col])
    dat.indep.stats.tr.norm[,col] = (dat.indep.stats.tr[, col] - background) / background.var
    dat.indep.stats.tr.meannorm[,col] = (dat.indep.stats.tr[, col] - background) 
	dat.background[,col] = background
}
 71
# Overview grid: each (transformed) metric vs. publication date, with the
# moving background mean overlaid in red.
quartz()  # macOS-only on-screen graphics device
par(mfrow = c(ceiling(length(altmetricsColumns)/4), 4), oma=c(2,2,4,2), mar=c(2, 1, 1, 1))
for (col in altmetricsColumns) {
	print(col)
	#pdf(paste("results/timing/", col, ".pdf", sep=""))
	plot(dat.indep.stats.tr$pubDate, dat.indep.stats.tr[,col], main=col, col="black", pch=20, cex=.5)	
	points(dat.indep.stats.tr$pubDate, dat.background[,col], col="red", lwd=5, main=col)
}
title("Trends over time", outer=TRUE)
 81
# Per-metric windows: raw values (black), background mean (red), and the
# mean-normalized values (blue, plotted on the right-hand axis) over time.
for (col in altmetricsColumns) {
	print(col)
	#pdf(paste("results/timing/", col, ".pdf", sep=""))

	#quartz()
	#plot(dat.indep.stats.tr$pubDate, dat.indep.stats.tr[, col]^2 - 1, main=col)
	#points(dat.indep.stats.tr$pubDate, dat.background[,col]^2 - 1, col="red", lwd=5, main=col)

	quartz()
	plot(dat.indep.stats.tr.norm$pubDate, dat.indep.stats.tr[,col], main=col, col="black", pch=20, cex=.5)
	points(dat.indep.stats.tr.norm$pubDate, dat.background[,col], col="red", lwd=5, main=col, pch=20, cex=.25)
	par(new=T)  # overlay the next plot on the same device
	plot(dat.indep.stats.tr.norm$pubDate, dat.indep.stats.tr.meannorm[,col], main=col, col="blue", pch=20, cex=.5, axes=F)
	axis(side=4)
}
summary(dat.indep.stats.tr.norm)
summary(dat.background)
 99
100
101
102
############## Get correlation matrix

# Need to do special sorts of correlations because have binary data, not just this:
# mycor = rcorr(as.matrix(dat.indep.stats))$r

library(polycor)
#myhetcorr = hetcor.modified(dat.indep.stats, use="complete.obs", std.err=FALSE, pd=FALSE)
# Heterogeneous correlation matrix via the project-modified hetcor (sourced
# helper): Spearman for continuous pairs, pairwise deletion of missing values,
# no standard errors, and no positive-definiteness forcing here (done below).
myhetcorr = hetcor.modified(dat.indep.stats.tr.meannorm[,altmetricsColumns], 
							use="pairwise.complete.obs", 
							std.err=FALSE, 
							pd=FALSE, 
							type="spearman")
#							type="pearson")
mycor.unadjusted = myhetcorr$correlations
#write.table(mycor.unadjusted,"/Users/hpiwowar/stats link/mycor.unadjusted.txt",append=F,quote=F,sep="\t",row.names=T)

# Are some correlations NA?
which(is.na(mycor.unadjusted))
#mycor.unadjusted[which(is.na(mycor.unadjusted))] = 1

# Now fix the correlation matrix if it is not positive-definite
# (helper from helper_functions.R).
mycor = adjust.to.positive.definite(mycor.unadjusted)
125
#display
library(gplots)
# Map the correlation range onto the middle of a 32-step blue-red palette,
# so that a correlation of 0 lands near the palette midpoint.
colorRange = round(range(mycor) * 15) + 16
colorChoices = bluered(32)[colorRange[1]:colorRange[2]]
# Heatmap goes to a PNG when bSaveImage, otherwise to the current device.
if (bSaveImage) png("results/heatmap.png")
	heatmap.2(mycor, col=colorChoices, cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", margins=c(10,10), key=FALSE, keysize=0.1)
if (bSaveImage) dev.off()

# Now a heatmap with a subsample of articles and the variables
set.seed(42)
# 1000 articles sampled with replacement from the restricted dataset.
dat.subsample = as.matrix(dat.indep.stats[sample(1:dim(dat.indep.stats)[1], 1000, TRUE),altmetricsColumns])
m=32
# using canberra distance because tried several and this one had the most interperable dendrogram
heatmap.2(t(dat.subsample), col=bluered(m*2)[m:(m*2-1)], 
	distfun = function(x) dist(x, method="canberra"), 
	cexRow=.6, cexCol=.6, dend = "both", scale="row", trace="none", 
	margins=c(1,10), key=FALSE, keysize=0.1, breaks=c(0:m)/m)
143
144
# Draw a horizontal strip displaying the given colour palette (used to
# eyeball the palettes passed to heatmap.2), with both axes suppressed.
showpanel <- function(col) {
  strip <- matrix(seq_len(100), ncol = 1)
  image(z = strip, col = col, xaxt = "n", yaxt = "n")
}
# Preview the palettes used for the heatmaps above.
quartz()
showpanel(colorChoices)
showpanel(bluered(32))
151
152
############## FIRST ORDER ANALYSIS

##############  Determine number of First-Order factors

# Determine Number of Factors to Extract
library(nFactors)
eigenvectors.1st <- eigen(mycor) # get eigenvalues
# this line takes a long time
# Parallel analysis: eigenvalues expected under random data, for comparison
# against the observed eigenvalues in the scree test.
aparallel.1st <- parallel(subject=nrow(dat.indep.stats.tr.meannorm), var=ncol(dat.indep.stats.tr.meannorm), rep=100, cent=.05)
scree.results.1st <- nScree(eigenvectors.1st$values, aparallel.1st$eigen$qevpea)
summary(scree.results.1st)
plotnScree(scree.results.1st) 

# Pull out the "Optimal Coordinate"
# defined in nScree help page as 
# The optimal coordinates (OC) corresponds to an extrapolation of the preceding eigenvalue by a regression line between the eignvalue coordinates and the last eigenvalue coordinate
#number.factors.1st = scree.results.1st$Components$noc
# Hard-coded override of the automatic scree-based choice above.
number.factors.1st = 5
171
172
##############  Do First-Order Factor Analysis

# Maximum liklihood 
# Fit on the adjusted correlation matrix with oblique (oblimin) rotation.
fit.ml = factanal(dat.indep.stats.tr.meannorm, number.factors.1st, rotation="oblimin", covmat=mycor)
print(fit.ml, sort=TRUE)


# Use princip axis when maximum liklihood fails to converge:
library(psych)

# Minimum-residual factoring on the correlation matrix.
# NOTE(review): n.obs=max(dim(...)) only equals the number of observations
# when there are more rows than columns; nrow() would state the intent.
fit.fa.1st = fa(mycor, number.factors.1st, fm="minres", rotate="oblimin", 
                scores=FALSE, residuals=TRUE, n.obs=max(dim(dat.indep.stats.tr.meannorm)))

#to show the loadings sorted by absolute value
print(fit.fa.1st, sort=TRUE)

#fa.diagram(fit.fa.1st)
#cluster.plot(fit.fa.1st)

# look at residuals
#fit.fa.1st.residuals = fit.fa.1st$residual
#heatmap.2(fit.fa.1st.residuals, col=bluered(16), cexRow=0.5, cexCol = .8, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(15,15), key=FALSE, keysize=0.1)


# Placeholder names for the five first-order factors (not yet interpreted).
factor.names.1st = c(
"MR1"="MR1",
"MR2"="MR2",
"MR3"="MR3",
"MR4"="MR4",
"MR5"="MR5")

# Print, per factor, the variables loading above 0.3 and below -0.3
# (print.thresh comes from the sourced helper functions).
for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    print.thresh(fit.fa.1st$loadings[, afactor], .3, TRUE)
    print.thresh(fit.fa.1st$loadings[, afactor], -0.3, FALSE)
}
209
210
211
############## SECOND ORDER ANALYSIS

##############  Determine number of Second-Order factors
# Equations as per Factor Analysis, Second Edition, by Gorsuch

# Correlation of my first order results
# r.scores: correlations among the first-order factors; relabel with the
# factor display names.
fit.fa.1st.cor = fit.fa.1st$r.scores
colnames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]
rownames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]

# Scree + parallel analysis on the factor-correlation matrix.
# NOTE(review): subject=nrow(fit.fa.1st.cor) is the number of factors, not
# the number of observations -- confirm that is what parallel() should get.
eigenvectors.2nd <- eigen(fit.fa.1st.cor) # get eigenvalues
aparallel.2nd <- parallel(subject=nrow(fit.fa.1st.cor), var=ncol(fit.fa.1st.cor), rep=100, cent=.05)
scree.results.2nd <- nScree(eigenvectors.2nd$values, aparallel.2nd$eigen$qevpea)
scree.results.2nd
plotnScree(scree.results.2nd) 

#number.factors.2nd = scree.results.2nd$Components$noc
# Hard-coded override of the automatic scree-based choice above.
number.factors.2nd = 3
230
231
##############  Do Second-Order Factor Analysis

# Ideally uncorrelated, but want it to be a good fit
#fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fa="minres", rotate="varimax")
# Minres factoring of the factor-correlation matrix, orthogonal rotation.
fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fm="minres", rotate="varimax")
print(fit.fa.2nd, sort=TRUE)

#fa.diagram(fit.fa.2nd)
#cluster.plot(fit.fa.2nd)

##############  Map variables directly to second order analysis
# Equations as per Factor Analysis, Second Edition, by Gorsuch

# U: diagonal matrix of the second-order uniquenesses (vector * identity
# keeps each uniqueness on the diagonal, zeros elsewhere);
# P: second-order loadings; Pvf: first-order loadings.
U = fit.fa.2nd$uniquenesses * diag(number.factors.1st)
P = fit.fa.2nd$loadings
A = cbind(P, U)
Pvf = fit.fa.1st$loadings

# Pvo: loadings of the original variables on the second-order factors.
Pvo = Pvf %*% A
251
############# HAVE A LOOK AT THESE RESULTS

#Pvo[1:24, 1:4]
# Interesting:  last.author.num.prev.geoae.sharing.tr          0.134139157 -0.066308705  0.138307260 -0.04026715
# what about how it would correlate with our main endpoint?


# Interpreted names for the three second-order factors.
factor.names.2nd = c(
	"MR1"="Mendeley",
	"MR2"="Citations and downloads",
	"MR3"="Facebook and Comments")

# On original variables
# Variables loading > |0.3| on each second-order factor (via Pvo).
for (afactor in names(factor.names.2nd)) {
    print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))    
    print.thresh(Pvo[, afactor], .3, TRUE)
    print.thresh(Pvo[, afactor], -0.3, FALSE)
}

# On first-order factors
# First-order factors loading > |0.3| on each second-order factor.
for (afactor in names(factor.names.2nd)) {
    print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))
    print.thresh(fit.fa.2nd$loadings[, afactor], .3, TRUE)
    print.thresh(fit.fa.2nd$loadings[, afactor], -0.3, FALSE)
}
########   IMPUTE VARIABLES SO CAN CALCULATE SCORES


# Just want one output variables
#dat$dataset.in.geo.or.ae.int = dat.nums$in.ae.or.geo
#dat.impute.input = dat[,!names(dat) %in% c("dataset.in.geo", "dataset.in.geo.or.ae")]
# BUG FIX: was dat.indep.stats.norm, which is never defined in this script;
# the time-normalized data frame built above is dat.indep.stats.tr.norm.
dat.impute.input = dat.indep.stats.tr.norm

# Show the pattern of NAs
library(mice)
#md.pattern(dat.impute.input)

### Impute variables
# Single imputation (m=1); the trailing comment notes the system is singular.
mice.output = mice(dat.impute.input, m=1)  # singular

 
# Now flush out the rest of the scores 
dat.imputed = complete(mice.output, 1)
#dat.for.scores = dat.imputed[,!names(dat.imputed) %in% c("dataset.in.geo.or.ae.int")]
dat.for.scores = dat.imputed
297
298
299
###### COMPUTE FIRST ORDER SCORES

# Bartlett factor scores from the imputed data (sourced helper function).
scores.1st = factor.scores.bartlett(dat.for.scores, fit.fa.1st)

# NOTE(review): `dat` is not defined in this script -- presumably created by
# the (commented-out) preprocessing step; confirm before running.
dat.scores.merge = cbind(dat, scores.1st)

# Smoothed trend of each factor score over time, overlaying (in red, on the
# right axis) the trends of the variables loading > |0.3| on that factor.
for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    quartz()
    plsmo(-1*dat.scores.merge$days.since.published, dat.scores.merge[,afactor], ylab=factor.names.1st[afactor], datadensity=TRUE)
    title(factor.names.1st[afactor])
    loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))
    par(new=TRUE) 
    for (loaded_var in loaded) {
        print(loaded_var)
        plsmo(-1*dat.scores.merge$days.since.published, dat.scores.merge[,loaded_var], add=T, datadensity=T, col="red", axes=FALSE)
        axis(4)    
    }
}

# NOTE(review): stray statement -- sits outside any loop, uses `loaded` left
# over from the last iteration, references dat.merge (defined further down),
# and add=T is not a plot.default argument.  Looks like leftover debugging.
    plot(dat.scores.merge$days.since.published, dat.scores.merge[,names(dat.merge) %in% loaded], add=T)
321
######### SECOND ORDER 


# Loadings of the original variables on the second-order factors (Pvo above).
loadings.2nd = Pvo

#fit.pa.2nd.tovars = list(loadings=loadings.2nd[,(1+length(colnames(fit.pa.2nd$weights))):length(colnames(loadings.2nd))])
# Keep only the second-order factor columns of Pvo and derive uniquenesses.
# NOTE(review): "uniquenesses" here is the row SUM of squared loadings
# (i.e. the communality), not 1 - communality -- confirm intended.
fit.pa.2nd.tovars = list(loadings=loadings.2nd[,colnames(fit.fa.2nd$loadings)])
fit.pa.2nd.tovars$uniquenesses = apply(fit.pa.2nd.tovars$loadings^2, 1, sum)

# Bartlett scores of the second-order factors directly from the imputed data.
scores.to.dat.2nd = factor.scores.bartlett(dat.for.scores, fit.pa.2nd.tovars)


# NOTE(review): dat.regress and dat.merge are defined later in this script;
# this loop only works on re-runs against an existing workspace.
for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    quartz()
    plsmo(-1*dat.regress$days.since.published, dat.regress[afactor], ylab=factor.names.1st[afactor])
    title(factor.names.1st[afactor])
    loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))
    plot(-1*dat.merge$days.since.published, dat.merge[,names(dat.merge) %in% loaded], add=T)
}
342
####### MERGE WITH AIM3
# Load dat.aim3 from the earlier aim3 analysis (absolute local path).
load("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats/dat_aim3.RData")

dat.plos = cbind(dat, scores.1st)

# Join on PubMed ID.
dat.merge = merge(dat.plos, dat.aim3, by="pmid")

# Regression frame: outcome (dataset deposited in GEO/ArrayExpress) plus the
# first-order factor scores.
dat.regress = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int)
score.names = dimnames(scores.1st)[[2]]
dat.regress[,score.names] = dat.merge[,names(dat.merge) %in% score.names]

# Candidate independent variables pulled from the merged aim3 data.
vars.indep = c("days.since.published", 
"num.authors.tr",
"first.author.num.prev.pubs.tr", 
"first.author.num.prev.pmc.cites.tr",
"last.author.num.prev.pubs.tr",
"last.author.num.prev.pmc.cites.tr",
"num.grant.numbers.tr",
"pubmed.is.humans",
"pubmed.is.cancer",
"country.usa",
"institution.rank",
"nih.sum.sum.dollars.tr")

dat.regress[,vars.indep] = dat.merge[,vars.indep]
368
369
370
371
372
373
# Quick exploratory plots: factor scores vs. the data-sharing indicator.
# `data=` is spelled out; the original relied on partial matching of `dat=`.
plot(MR2 ~ dataset.in.geo.or.ae.int, data=dat.regress)
boxplot(MR2 ~ dataset.in.geo.or.ae.int, data=dat.regress, main="MR2")
# NOTE(review): factor scores can be negative, so log() yields NaNs/warnings
# for those values -- confirm a log scale is really wanted here.
boxplot(log(MR1) ~ dataset.in.geo.or.ae.int, data=dat.regress)
boxplot(log(MR2) ~ dataset.in.geo.or.ae.int, data=dat.regress)
boxplot(log(MR3) ~ dataset.in.geo.or.ae.int, data=dat.regress)
boxplot(log(MR4) ~ dataset.in.geo.or.ae.int, data=dat.regress)
boxplot(log(MR5) ~ dataset.in.geo.or.ae.int, data=dat.regress)

# One boxplot per named first-order factor, each in its own quartz window.
for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    quartz()
    boxplot(dat.regress[,afactor] ~ dat.regress$dataset.in.geo.or.ae.int)
    title(factor.names.1st[afactor])
}
388
389
390
391##############################################################################
392
393####### FIRST ORDER REGRESSION
394# Not sure if this will be a primary result
395
396
397# If I name them here, then setting up the regressions is more awkward due to long names
398#names(dat.regress) = c("dataset.in.geo.or.ae.int", factor.names.1st[names(dat.regress)[-1]])
399
400library(rms)
401dd.regress = datadist(dat.regress)
402options(datadist='dd.regress')
403options(digits=2)
404
405f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +
406    (rcs(days.since.aug.2009, 10) +
407    rcs(num.authors.tr, 3) ) +
408    rcs(first.author.num.prev.pubs.tr, 3) + 
409    rcs(first.author.num.prev.pmc.cites.tr, 3) +
410    rcs(last.author.num.prev.pubs.tr, 3) + 
411    rcs(last.author.num.prev.pmc.cites.tr, 3) + 
412    as.numeric(pubmed.is.humans) + 
413    as.numeric(pubmed.is.cancer) + 
414    as.numeric(country.usa) )
415#    rcs(institution.rank, 3) )
416    ,
417    dat=dat.regress, x=T, y=T)
418anova(f.1st.nonlinear.interactions.reduced)
419
# Reduced model: fewer covariates, same outcome (note: this overwrites the
# fit object created just above).
f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +
    rcs(days.since.aug.2009, 10) + 
    (rcs(num.authors.tr, 3) + 
    rcs(last.author.num.prev.pmc.cites.tr, 3) + 
    as.numeric(pubmed.is.humans) + 
    as.numeric(pubmed.is.cancer) + 
    as.numeric(country.usa) ) )
    #rcs(institution.rank, 3) )
    # which plos journal?
    # watch degrees of freedom!
    # try num.cites.scopus
    # read up about ols.  Am I requiring it to be linear?
    # is MR3 normal?  does it matter?  sqrt it or log it?
    ,
    dat=dat.regress, x=T, y=T)
anova(f.1st.nonlinear.interactions.reduced)

s = summary(f.1st.nonlinear.interactions.reduced)
s
plot(s)




# Re-label every other row of the rms effect summary (the estimate rows) with
# the factor display names, then plot the effect estimates.
summ.1st.nonlinear.interactions.reduced = summary(f.1st.nonlinear.interactions.reduced)
summ.1st.nonlinear.interactions.reduced.dimnames = dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)]
dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.1st[summ.1st.nonlinear.interactions.reduced.dimnames]
plot(summ.1st.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)
title("Multivariate nonlinear regressions with interactions")
449
### Dots of first-order factors
# Copy with human-readable column names for the dot-chart labels.
dat.regress.named = dat.regress
names(dat.regress.named) = c("dataset.in.geo.or.ae.int", factor.names.1st[names(dat.regress)[-1]])

# Hmisc-style summary.formula dot chart: sharing rate across each factor.
dots.1st.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,
    dat=dat.regress.named)
plot(dots.1st.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7, 
    xlab="Percentage of studies with datasets in GEO or ArrayExpress", 
    main="Univariate data sharing behaviour on first order factors")
#plot.summary.formula.response
#?summary.formula
461			
462
463
### HERE

# Second-order regression frame: outcome + second-order factor scores + the
# same independent variables used for the first-order regressions.
dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int) 
score.names.2nd = dimnames(scores.to.dat.2nd)[[2]]
dat.regress.2nd[,score.names.2nd] = dat.merge[,names(dat.merge) %in% score.names.2nd]

dat.regress.2nd[,vars.indep] = dat.merge[,vars.indep]

library(rms)

# Register the second-order frame's data distribution for rms.
dd.regress.2nd = datadist(dat.regress.2nd)
options(datadist='dd.regress.2nd')
options(digits=2)
477
478
# Logistic model: sharing ~ time and the MR1 score, with all 2-way
# interactions ("^2").
f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ 
    (rcs(days.since.aug.2009, 10) + rcs(MR1, 3))^2,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

# OLS in the reverse direction: MR1 score ~ sharing and time (note each of
# these fits overwrites the previous one -- only the last is plotted below).
f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +
    rcs(days.since.aug.2009, 10))^2, 
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

# Full-covariate OLS on MR1.
f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +
    (rcs(days.since.aug.2009, 10) +
    rcs(num.authors.tr, 3) ) +
    rcs(first.author.num.prev.pubs.tr, 3) + 
    rcs(first.author.num.prev.pmc.cites.tr, 3) +
    rcs(last.author.num.prev.pubs.tr, 3) + 
    rcs(last.author.num.prev.pmc.cites.tr, 3) + 
    as.numeric(pubmed.is.humans) + 
    as.numeric(pubmed.is.cancer) + 
    as.numeric(country.usa) )
#    rcs(institution.rank, 3) )
    ,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)


# Logistic model on the two leading second-order scores; this is the fit
# that the summary/plot code below uses.
f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ 
    (rcs(MR1, 4) + rcs(MR2, 4))^2,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)
509
# Re-label every other summary row with the second-order factor names, then
# plot the effect estimates of the last model fit above.
summ.2nd.nonlinear.interactions.reduced = summary(f.2nd.nonlinear.interactions.reduced)
summ.2nd.nonlinear.interactions.reduced.dimnames = dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)]
dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.2nd[summ.2nd.nonlinear.interactions.reduced.dimnames]
plot(summ.2nd.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)
title("Multivariate nonlinear regression with interactions")

### Dots of second-order factors
# Copy with human-readable column names for the dot-chart labels.
dat.regress.2nd.named = dat.regress.2nd
names(dat.regress.2nd.named) = c("dataset.in.geo.or.ae.int", factor.names.2nd[names(dat.regress.2nd)[-1]])

# Dot chart: sharing rate across each second-order factor.
dots.2nd.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,
    dat=dat.regress.2nd.named)
plot(dots.2nd.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7, 
    xlab="Percentage of studies with datasets in GEO or ArrayExpress", 
    main="Univariate data sharing behaviour\non second order factors")
525
526
527
528########
529
530#save.image("image.RData")
531#setwd("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats")
532#load("image.RData")
533
534#setwd("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats")
535#for (iii in 1:4) source("aim3_stats_20100217b.R", echo=TRUE)