PageRenderTime 1830ms CodeModel.GetById 38ms RepoModel.GetById 10ms app.codeStats 0ms

/scripts/factor_analysis.R

http://github.com/hpiwowar/alt-metrics_stats
R | 535 lines | 294 code | 142 blank | 99 comment | 14 complexity | ddffee62a0eb79f390204ed13e689686 MD5 | raw file
Possible License(s): MIT
  1. bSaveImage = T
  2. # to get latex to work, first in the R GUI go to
  3. # Misc, start X11 server
  4. # And execute the following line
  5. Sys.setenv( PATH=paste(Sys.getenv("PATH"),"/usr/texbin",sep=":") )
  6. library(Rserve)
  7. Rserve(args="--no-save")
  8. #library(plyr)
  9. source("scripts/helper_functions.R")
  10. #source("scripts/preprocessing.R")
  11. ### Get just the dataset we want for this
  12. ## Restrict to Research Articles
  13. ## get rid of other columns we won't be using
  14. excludeColumns = c("journal", "articleType", "authorsCount")
  15. dat.indep.stats = dat.eventcounts[,names(dat.eventcounts) %nin% excludeColumns]
  16. dat.indep.stats = dat.indep.stats[which(as.character(dat.eventcounts$articleType) == "Research Article"),]
  17. summary(dat.indep.stats)
  18. metadataColumns = c("doi", "pubDate", "daysSincePublished", "journal", "articleType", "authorsCount")
  19. altmetricsColumns = names(dat.eventcounts)[names(dat.eventcounts) %nin% metadataColumns]
  20. #### transform
  21. dat.indep.stats.tr = dat.indep.stats
  22. dat.indep.stats.tr[,altmetricsColumns] = tr(dat.indep.stats.tr[,altmetricsColumns])
  23. summary(dat.indep.stats.tr)
  24. ###### Normalize
  25. inWindow = function(x, windowSize=60) {
  26. inWindowVals = list()
  27. for (ii in seq(along=x)) {
  28. inWindowVals[ii] = list(which((x > (x[ii] - windowSize/2)) & (x < (x[ii] + windowSize/2))))
  29. }
  30. return(inWindowVals)
  31. }
  32. meanInWindow = function(whichInWindow, y) {
  33. a = sapply(seq(along=y), function(i, x, y) {mean(y[x[[i]]], na.rm=T)}, whichInWindow, y)
  34. return(a)
  35. }
  36. varInWindow = function(whichInWindow, y, ii) {
  37. a = sapply(seq(along=y), function(i, x, y) {var(y[x[[i]]], na.rm=T)}, whichInWindow, y)
  38. return(a)
  39. }
  40. dat.indep.stats.tr.norm = dat.indep.stats.tr
  41. dat.indep.stats.tr.meannorm = dat.indep.stats.tr
  42. dat.background = data.frame(pubDate = dat.indep.stats.tr.norm$pubDate)
  43. inWindow180 = inWindow(dat.indep.stats.tr$daysSincePublished, 180)
  44. for (col in altmetricsColumns) {
  45. print(col)
  46. background = meanInWindow(inWindow180, dat.indep.stats.tr[,col])
  47. background.var = varInWindow(inWindow180, dat.indep.stats.tr[,col])
  48. dat.indep.stats.tr.norm[,col] = (dat.indep.stats.tr[, col] - background) / background.var
  49. dat.indep.stats.tr.meannorm[,col] = (dat.indep.stats.tr[, col] - background)
  50. dat.background[,col] = background
  51. }
  52. quartz()
  53. par(mfrow = c(ceiling(length(altmetricsColumns)/4), 4), oma=c(2,2,4,2), mar=c(2, 1, 1, 1))
  54. for (col in altmetricsColumns) {
  55. print(col)
  56. #pdf(paste("results/timing/", col, ".pdf", sep=""))
  57. plot(dat.indep.stats.tr$pubDate, dat.indep.stats.tr[,col], main=col, col="black", pch=20, cex=.5)
  58. points(dat.indep.stats.tr$pubDate, dat.background[,col], col="red", lwd=5, main=col)
  59. }
  60. title("Trends over time", outer=TRUE)
  61. for (col in altmetricsColumns) {
  62. print(col)
  63. #pdf(paste("results/timing/", col, ".pdf", sep=""))
  64. #quartz()
  65. #plot(dat.indep.stats.tr$pubDate, dat.indep.stats.tr[, col]^2 - 1, main=col)
  66. #points(dat.indep.stats.tr$pubDate, dat.background[,col]^2 - 1, col="red", lwd=5, main=col)
  67. quartz()
  68. plot(dat.indep.stats.tr.norm$pubDate, dat.indep.stats.tr[,col], main=col, col="black", pch=20, cex=.5)
  69. points(dat.indep.stats.tr.norm$pubDate, dat.background[,col], col="red", lwd=5, main=col, pch=20, cex=.25)
  70. par(new=T)
  71. plot(dat.indep.stats.tr.norm$pubDate, dat.indep.stats.tr.meannorm[,col], main=col, col="blue", pch=20, cex=.5, axes=F)
  72. axis(side=4)
  73. }
  74. summary(dat.indep.stats.tr.norm)
  75. summary(dat.background)
  76. ############## Get correlation matrix
  77. # Need to do special sorts of correlations because have binary data, not just this:
  78. # mycor = rcorr(as.matrix(dat.indep.stats))$r
  79. library(polycor)
  80. #myhetcorr = hetcor.modified(dat.indep.stats, use="complete.obs", std.err=FALSE, pd=FALSE)
  81. myhetcorr = hetcor.modified(dat.indep.stats.tr.meannorm[,altmetricsColumns],
  82. use="pairwise.complete.obs",
  83. std.err=FALSE,
  84. pd=FALSE,
  85. type="spearman")
  86. # type="pearson")
  87. mycor.unadjusted = myhetcorr$correlations
  88. #write.table(mycor.unadjusted,"/Users/hpiwowar/stats link/mycor.unadjusted.txt",append=F,quote=F,sep="\t",row.names=T)
  89. # Are some correlations NA?
  90. which(is.na(mycor.unadjusted))
  91. #mycor.unadjusted[which(is.na(mycor.unadjusted))] = 1
  92. # Now fix the correlation matrix if it is not positive-definite
  93. mycor = adjust.to.positive.definite(mycor.unadjusted)
  94. #display
  95. library(gplots)
  96. colorRange = round(range(mycor) * 15) + 16
  97. colorChoices = bluered(32)[colorRange[1]:colorRange[2]]
  98. if (bSaveImage) png("results/heatmap.png")
  99. heatmap.2(mycor, col=colorChoices, cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", margins=c(10,10), key=FALSE, keysize=0.1)
  100. if (bSaveImage) dev.off()
  101. # Now a heatmap with a subsample of articles and the variables
  102. set.seed(42)
  103. dat.subsample = as.matrix(dat.indep.stats[sample(1:dim(dat.indep.stats)[1], 1000, TRUE),altmetricsColumns])
  104. m=32
  105. # using canberra distance because tried several and this one had the most interperable dendrogram
  106. heatmap.2(t(dat.subsample), col=bluered(m*2)[m:(m*2-1)],
  107. distfun = function(x) dist(x, method="canberra"),
  108. cexRow=.6, cexCol=.6, dend = "both", scale="row", trace="none",
  109. margins=c(1,10), key=FALSE, keysize=0.1, breaks=c(0:m)/m)
  110. showpanel <- function(col) {
  111. image(z=matrix(1:100, ncol=1), col=col, xaxt="n", yaxt="n" )
  112. }
  113. quartz()
  114. showpanel(colorChoices)
  115. showpanel(bluered(32))
  116. ############## FIRST ORDER ANALYSIS
  117. ############## Determine number of First-Order factors
  118. # Determine Number of Factors to Extract
  119. library(nFactors)
  120. eigenvectors.1st <- eigen(mycor) # get eigenvalues
  121. # this line takes a long time
  122. aparallel.1st <- parallel(subject=nrow(dat.indep.stats.tr.meannorm), var=ncol(dat.indep.stats.tr.meannorm), rep=100, cent=.05)
  123. scree.results.1st <- nScree(eigenvectors.1st$values, aparallel.1st$eigen$qevpea)
  124. summary(scree.results.1st)
  125. plotnScree(scree.results.1st)
  126. # Pull out the "Optimal Coordinate"
  127. # defined in nScree help page as
  128. # The optimal coordinates (OC) corresponds to an extrapolation of the preceding eigenvalue by a regression line between the eignvalue coordinates and the last eigenvalue coordinate
  129. #number.factors.1st = scree.results.1st$Components$noc
  130. number.factors.1st = 5
  131. ############## Do First-Order Factor Analysis
  132. # Maximum liklihood
  133. fit.ml = factanal(dat.indep.stats.tr.meannorm, number.factors.1st, rotation="oblimin", covmat=mycor)
  134. print(fit.ml, sort=TRUE)
  135. # Use princip axis when maximum liklihood fails to converge:
  136. library(psych)
  137. fit.fa.1st = fa(mycor, number.factors.1st, fm="minres", rotate="oblimin",
  138. scores=FALSE, residuals=TRUE, n.obs=max(dim(dat.indep.stats.tr.meannorm)))
  139. #to show the loadings sorted by absolute value
  140. print(fit.fa.1st, sort=TRUE)
  141. #fa.diagram(fit.fa.1st)
  142. #cluster.plot(fit.fa.1st)
  143. # look at residuals
  144. #fit.fa.1st.residuals = fit.fa.1st$residual
  145. #heatmap.2(fit.fa.1st.residuals, col=bluered(16), cexRow=0.5, cexCol = .8, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(15,15), key=FALSE, keysize=0.1)
  146. factor.names.1st = c(
  147. "MR1"="MR1",
  148. "MR2"="MR2",
  149. "MR3"="MR3",
  150. "MR4"="MR4",
  151. "MR5"="MR5")
  152. for (afactor in names(factor.names.1st)) {
  153. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
  154. print.thresh(fit.fa.1st$loadings[, afactor], .3, TRUE)
  155. print.thresh(fit.fa.1st$loadings[, afactor], -0.3, FALSE)
  156. }
  157. ############## SECOND ORDER ANALYSIS
  158. ############## Determine number of Second-Order factors
  159. # Equations as per Factor Analysis, Second Edition, by Gorsuch
  160. # Correlation of my first order results
  161. fit.fa.1st.cor = fit.fa.1st$r.scores
  162. colnames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]
  163. rownames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]
  164. eigenvectors.2nd <- eigen(fit.fa.1st.cor) # get eigenvalues
  165. aparallel.2nd <- parallel(subject=nrow(fit.fa.1st.cor), var=ncol(fit.fa.1st.cor), rep=100, cent=.05)
  166. scree.results.2nd <- nScree(eigenvectors.2nd$values, aparallel.2nd$eigen$qevpea)
  167. scree.results.2nd
  168. plotnScree(scree.results.2nd)
  169. #number.factors.2nd = scree.results.2nd$Components$noc
  170. number.factors.2nd = 3
  171. ############## Do Second-Order Factor Analysis
  172. # Ideally uncorrelated, but want it to be a good fit
  173. #fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fa="minres", rotate="varimax")
  174. fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fm="minres", rotate="varimax")
  175. print(fit.fa.2nd, sort=TRUE)
  176. #fa.diagram(fit.fa.2nd)
  177. #cluster.plot(fit.fa.2nd)
  178. ############## Map variables directly to second order analysis
  179. # Equations as per Factor Analysis, Second Edition, by Gorsuch
  180. U = fit.fa.2nd$uniquenesses * diag(number.factors.1st)
  181. P = fit.fa.2nd$loadings
  182. A = cbind(P, U)
  183. Pvf = fit.fa.1st$loadings
  184. Pvo = Pvf %*% A
  185. ############# HAVE A LOOK AT THESE RESULTS
  186. #Pvo[1:24, 1:4]
  187. # Interesting: last.author.num.prev.geoae.sharing.tr 0.134139157 -0.066308705 0.138307260 -0.04026715
  188. # what about how it would correlate with our main endpoint?
  189. factor.names.2nd = c(
  190. "MR1"="Mendeley",
  191. "MR2"="Citations and downloads",
  192. "MR3"="Facebook and Comments")
  193. # On original variables
  194. for (afactor in names(factor.names.2nd)) {
  195. print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))
  196. print.thresh(Pvo[, afactor], .3, TRUE)
  197. print.thresh(Pvo[, afactor], -0.3, FALSE)
  198. }
  199. # On first-order factors
  200. for (afactor in names(factor.names.2nd)) {
  201. print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))
  202. print.thresh(fit.fa.2nd$loadings[, afactor], .3, TRUE)
  203. print.thresh(fit.fa.2nd$loadings[, afactor], -0.3, FALSE)
  204. }
  205. ######## IMPUTE VARIABLES SO CAN CALCULATE SCORES
  206. # Just want one output variables
  207. #dat$dataset.in.geo.or.ae.int = dat.nums$in.ae.or.geo
  208. #dat.impute.input = dat[,!names(dat) %in% c("dataset.in.geo", "dataset.in.geo.or.ae")]
  209. dat.impute.input = dat.indep.stats.norm
  210. # Show the pattern of NAs
  211. library(mice)
  212. #md.pattern(dat.impute.input)
  213. ### Impute variables
  214. mice.output = mice(dat.impute.input, m=1) # singular
  215. # Now flush out the rest of the scores
  216. dat.imputed = complete(mice.output, 1)
  217. #dat.for.scores = dat.imputed[,!names(dat.imputed) %in% c("dataset.in.geo.or.ae.int")]
  218. dat.for.scores = dat.imputed
  219. ###### COMPUTE FIRST ORDER SCORES
  220. scores.1st = factor.scores.bartlett(dat.for.scores, fit.fa.1st)
  221. dat.scores.merge = cbind(dat, scores.1st)
  222. for (afactor in names(factor.names.1st)) {
  223. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
  224. quartz()
  225. plsmo(-1*dat.scores.merge$days.since.published, dat.scores.merge[,afactor], ylab=factor.names.1st[afactor], datadensity=TRUE)
  226. title(factor.names.1st[afactor])
  227. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))
  228. par(new=TRUE)
  229. for (loaded_var in loaded) {
  230. print(loaded_var)
  231. plsmo(-1*dat.scores.merge$days.since.published, dat.scores.merge[,loaded_var], add=T, datadensity=T, col="red", axes=FALSE)
  232. axis(4)
  233. }
  234. }
  235. plot(dat.scores.merge$days.since.published, dat.scores.merge[,names(dat.merge) %in% loaded], add=T)
  236. ######### SECOND ORDER
  237. loadings.2nd = Pvo
  238. #fit.pa.2nd.tovars = list(loadings=loadings.2nd[,(1+length(colnames(fit.pa.2nd$weights))):length(colnames(loadings.2nd))])
  239. fit.pa.2nd.tovars = list(loadings=loadings.2nd[,colnames(fit.fa.2nd$loadings)])
  240. fit.pa.2nd.tovars$uniquenesses = apply(fit.pa.2nd.tovars$loadings^2, 1, sum)
  241. scores.to.dat.2nd = factor.scores.bartlett(dat.for.scores, fit.pa.2nd.tovars)
  242. for (afactor in names(factor.names.1st)) {
  243. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
  244. quartz()
  245. plsmo(-1*dat.regress$days.since.published, dat.regress[afactor], ylab=factor.names.1st[afactor])
  246. title(factor.names.1st[afactor])
  247. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))
  248. plot(-1*dat.merge$days.since.published, dat.merge[,names(dat.merge) %in% loaded], add=T)
  249. }
  250. ####### MERGE WITH AIM3
  251. load("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats/dat_aim3.RData")
  252. dat.plos = cbind(dat, scores.1st)
  253. dat.merge = merge(dat.plos, dat.aim3, by="pmid")
  254. dat.regress = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int)
  255. score.names = dimnames(scores.1st)[[2]]
  256. dat.regress[,score.names] = dat.merge[,names(dat.merge) %in% score.names]
  257. vars.indep = c("days.since.published",
  258. "num.authors.tr",
  259. "first.author.num.prev.pubs.tr",
  260. "first.author.num.prev.pmc.cites.tr",
  261. "last.author.num.prev.pubs.tr",
  262. "last.author.num.prev.pmc.cites.tr",
  263. "num.grant.numbers.tr",
  264. "pubmed.is.humans",
  265. "pubmed.is.cancer",
  266. "country.usa",
  267. "institution.rank",
  268. "nih.sum.sum.dollars.tr")
  269. dat.regress[,vars.indep] = dat.merge[,vars.indep]
  270. plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
  271. boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress, main="MR2")
  272. boxplot(log(MR1) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
  273. boxplot(log(MR2) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
  274. boxplot(log(MR3) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
  275. boxplot(log(MR4) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
  276. boxplot(log(MR5) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
  277. for (afactor in names(factor.names.1st)) {
  278. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
  279. quartz()
  280. boxplot(dat.regress[,afactor] ~ dat.regress$dataset.in.geo.or.ae.int)
  281. title(factor.names.1st[afactor])
  282. }
  283. ##############################################################################
  284. ####### FIRST ORDER REGRESSION
  285. # Not sure if this will be a primary result
  286. # If I name them here, then setting up the regressions is more awkward due to long names
  287. #names(dat.regress) = c("dataset.in.geo.or.ae.int", factor.names.1st[names(dat.regress)[-1]])
  288. library(rms)
  289. dd.regress = datadist(dat.regress)
  290. options(datadist='dd.regress')
  291. options(digits=2)
  292. f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +
  293. (rcs(days.since.aug.2009, 10) +
  294. rcs(num.authors.tr, 3) ) +
  295. rcs(first.author.num.prev.pubs.tr, 3) +
  296. rcs(first.author.num.prev.pmc.cites.tr, 3) +
  297. rcs(last.author.num.prev.pubs.tr, 3) +
  298. rcs(last.author.num.prev.pmc.cites.tr, 3) +
  299. as.numeric(pubmed.is.humans) +
  300. as.numeric(pubmed.is.cancer) +
  301. as.numeric(country.usa) )
  302. # rcs(institution.rank, 3) )
  303. ,
  304. dat=dat.regress, x=T, y=T)
  305. anova(f.1st.nonlinear.interactions.reduced)
  306. f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +
  307. rcs(days.since.aug.2009, 10) +
  308. (rcs(num.authors.tr, 3) +
  309. rcs(last.author.num.prev.pmc.cites.tr, 3) +
  310. as.numeric(pubmed.is.humans) +
  311. as.numeric(pubmed.is.cancer) +
  312. as.numeric(country.usa) ) )
  313. #rcs(institution.rank, 3) )
  314. # which plos journal?
  315. # watch degrees of freedom!
  316. # try num.cites.scopus
  317. # read up about ols. Am I requiring it to be linear?
  318. # is MR3 normal? does it matter? sqrt it or log it?
  319. ,
  320. dat=dat.regress, x=T, y=T)
  321. anova(f.1st.nonlinear.interactions.reduced)
  322. s = summary(f.1st.nonlinear.interactions.reduced)
  323. s
  324. plot(s)
  325. summ.1st.nonlinear.interactions.reduced = summary(f.1st.nonlinear.interactions.reduced)
  326. summ.1st.nonlinear.interactions.reduced.dimnames = dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)]
  327. dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.1st[summ.1st.nonlinear.interactions.reduced.dimnames]
  328. plot(summ.1st.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)
  329. title("Multivariate nonlinear regressions with interactions")
  330. ### Dots of first-order factors
  331. dat.regress.named = dat.regress
  332. names(dat.regress.named) = c("dataset.in.geo.or.ae.int", factor.names.1st[names(dat.regress)[-1]])
  333. dots.1st.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,
  334. dat=dat.regress.named)
  335. plot(dots.1st.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7,
  336. xlab="Percentage of studies with datasets in GEO or ArrayExpress",
  337. main="Univariate data sharing behaviour on first order factors")
  338. #plot.summary.formula.response
  339. #?summary.formula
  340. ### HERE
  341. dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int)
  342. score.names.2nd = dimnames(scores.to.dat.2nd)[[2]]
  343. dat.regress.2nd[,score.names.2nd] = dat.merge[,names(dat.merge) %in% score.names.2nd]
  344. dat.regress.2nd[,vars.indep] = dat.merge[,vars.indep]
  345. library(rms)
  346. dd.regress.2nd = datadist(dat.regress.2nd)
  347. options(datadist='dd.regress.2nd')
  348. options(digits=2)
  349. f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~
  350. (rcs(days.since.aug.2009, 10) + rcs(MR1, 3))^2,
  351. dat=dat.regress.2nd, x=T, y=T)
  352. anova(f.2nd.nonlinear.interactions.reduced)
  353. f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +
  354. rcs(days.since.aug.2009, 10))^2,
  355. dat=dat.regress.2nd, x=T, y=T)
  356. anova(f.2nd.nonlinear.interactions.reduced)
  357. f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +
  358. (rcs(days.since.aug.2009, 10) +
  359. rcs(num.authors.tr, 3) ) +
  360. rcs(first.author.num.prev.pubs.tr, 3) +
  361. rcs(first.author.num.prev.pmc.cites.tr, 3) +
  362. rcs(last.author.num.prev.pubs.tr, 3) +
  363. rcs(last.author.num.prev.pmc.cites.tr, 3) +
  364. as.numeric(pubmed.is.humans) +
  365. as.numeric(pubmed.is.cancer) +
  366. as.numeric(country.usa) )
  367. # rcs(institution.rank, 3) )
  368. ,
  369. dat=dat.regress.2nd, x=T, y=T)
  370. anova(f.2nd.nonlinear.interactions.reduced)
  371. f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~
  372. (rcs(MR1, 4) + rcs(MR2, 4))^2,
  373. dat=dat.regress.2nd, x=T, y=T)
  374. anova(f.2nd.nonlinear.interactions.reduced)
  375. summ.2nd.nonlinear.interactions.reduced = summary(f.2nd.nonlinear.interactions.reduced)
  376. summ.2nd.nonlinear.interactions.reduced.dimnames = dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)]
  377. dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.2nd[summ.2nd.nonlinear.interactions.reduced.dimnames]
  378. plot(summ.2nd.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)
  379. title("Multivariate nonlinear regression with interactions")
  380. ### Dots of second-order factors
  381. dat.regress.2nd.named = dat.regress.2nd
  382. names(dat.regress.2nd.named) = c("dataset.in.geo.or.ae.int", factor.names.2nd[names(dat.regress.2nd)[-1]])
  383. dots.2nd.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,
  384. dat=dat.regress.2nd.named)
  385. plot(dots.2nd.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7,
  386. xlab="Percentage of studies with datasets in GEO or ArrayExpress",
  387. main="Univariate data sharing behaviour\non second order factors")
  388. ########
  389. #save.image("image.RData")
  390. #setwd("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats")
  391. #load("image.RData")
  392. #setwd("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats")
  393. #for (iii in 1:4) source("aim3_stats_20100217b.R", echo=TRUE)