
/.Rhistory

http://github.com/hpiwowar/alt-metrics_stats
library(rmeta)

strip_newlines = function(str) {
    return(gsub("\n", "", str))
}

start_results = function(str, ...) {
    formatted_string = get_formatted_string(str, ...)
    results <<- formatted_string
    all_results <<- paste(all_results, formatted_string, sep="")
}

get_formatted_string = function(str, ...) {
    if (length(list(...)) > 0) {
        formatted_string = sprintf(str, ...)
    } else {
        formatted_string = str
    }
    return(formatted_string)
}
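# Usage sketch (illustrative): get_formatted_string() behaves like
# sprintf() when extra arguments are supplied and passes the string
# through unchanged otherwise, e.g.
#   get_formatted_string("%d studies", 85)   # "85 studies"
#   get_formatted_string("plain text")       # "plain text"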

append_to_results = function(str, ...) {
    formatted_string = get_formatted_string(str, ...)
    results <<- paste(results, formatted_string, sep="")
    all_results <<- paste(all_results, formatted_string, sep="")
}

count_unique_items = function(column) {
    return(dim(table(column)))
}

count_true_items = function(column) {
    return(sum(column==TRUE))
}

count_false_items = function(column) {
    return(sum(column!=TRUE))
}

all_results = ""
start_results("\nOchsner et al.{Ochsner} identified ")

dat.read = read.csv("rawdata.csv", header=TRUE, sep=",")
number_of_data_producing_publications_found_by_ochsner =
    count_unique_items(dat.read$pmid)
append_to_results(number_of_data_producing_publications_found_by_ochsner)
append_to_results(" studies that generated gene expression microarray data.")
results

start_results(" Ochsner's search found ")
number_of_datasets_found_by_ochsner =
    count_true_items(dat.read$ochsner_found_any_data)
append_to_results(number_of_datasets_found_by_ochsner)
append_to_results(" publicly-available datasets, ")
append_to_results("representing ")
number_of_data_sharing_publications_found_by_ochsner =
    count_unique_items(dat.read$pmid[dat.read$ochsner_found_any_data == TRUE])
append_to_results(number_of_data_sharing_publications_found_by_ochsner)
append_to_results(" distinct studies")
append_to_results(" (or %.1f%% of all %d data-producing studies; some studies have more than one associated dataset).",
    100*number_of_data_sharing_publications_found_by_ochsner / number_of_data_producing_publications_found_by_ochsner,
    number_of_data_producing_publications_found_by_ochsner)
results

# NOTE: the denominator below was referenced but never defined in this
# history; by its name it is assumed to equal the count of data-sharing
# publications computed above.
number_of_data_producing_publications_found_by_ochsner_that_shared_data =
    number_of_data_sharing_publications_found_by_ochsner

start_results(" Of the datasets, ")
number_of_datasets_found_in_geo_by_ochsner = count_true_items(dat.read$ochsner_found_geo_data)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_in_geo_by_ochsner,
    100*number_of_datasets_found_in_geo_by_ochsner / number_of_data_producing_publications_found_by_ochsner_that_shared_data)
append_to_results(" were found in the Gene Expression Omnibus (GEO) database, ")
number_of_datasets_found_in_arrayexpress_by_ochsner = count_true_items(dat.read$ochsner_found_arrayexpress_data)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_in_arrayexpress_by_ochsner,
    100*number_of_datasets_found_in_arrayexpress_by_ochsner / number_of_data_producing_publications_found_by_ochsner_that_shared_data)
append_to_results(" in ArrayExpress, ")
number_of_datasets_found_on_journal_sites_by_ochsner = count_true_items(dat.read$ochsner_journal)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_on_journal_sites_by_ochsner,
    100*number_of_datasets_found_on_journal_sites_by_ochsner / number_of_data_producing_publications_found_by_ochsner_that_shared_data)
append_to_results(" hosted on journal websites, and ")
number_of_datasets_found_elsewhere_by_ochsner = count_true_items(dat.read$ochsner_other)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_elsewhere_by_ochsner,
    100*number_of_datasets_found_elsewhere_by_ochsner / number_of_data_producing_publications_found_by_ochsner_that_shared_data)
append_to_results(" on lab websites and smaller online data repositories ")
append_to_results("(some datasets were found in more than one location).")
results

start_results(" Combined, GEO and ArrayExpress housed ")
number_of_datasets_found_in_geo_or_arrayexpress_by_ochsner =
    count_true_items(dat.read$ochsner_found_geo_data | dat.read$ochsner_found_arrayexpress_data)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_in_geo_or_arrayexpress_by_ochsner,
    100*number_of_datasets_found_in_geo_or_arrayexpress_by_ochsner / number_of_data_producing_publications_found_by_ochsner_that_shared_data)
append_to_results(" of the manually-located datasets.")
results

start_results("\nNext, we ran automated queries. We queried the GEO and ArrayExpress databases with the PubMed IDs of the ")
append_to_results(number_of_data_producing_publications_found_by_ochsner)
append_to_results(" data-producing studies.")
results

start_results(" The automated queries returned ")
number_of_ochsner_pmid_link_to_geo_or_arrayexpress = count_true_items(dat.read$centralized_link_to_ochsner_pmid & (dat.read$in_geo | dat.read$in_arrayexpress))
append_to_results(number_of_ochsner_pmid_link_to_geo_or_arrayexpress)
append_to_results(" datasets in total, with ")
number_of_ochsner_pmid_link_to_geo = count_true_items(dat.read$centralized_link_to_ochsner_pmid & dat.read$in_geo)
append_to_results(number_of_ochsner_pmid_link_to_geo)
append_to_results(" datasets from GEO and ")
number_of_ochsner_pmid_link_to_arrayexpress = count_true_items(dat.read$centralized_link_to_ochsner_pmid & dat.read$in_arrayexpress)
append_to_results(number_of_ochsner_pmid_link_to_arrayexpress)
append_to_results(" datasets from ArrayExpress, including ")
number_of_ochsner_pmid_link_to_geo_AND_arrayexpress = count_true_items(dat.read$centralized_link_to_ochsner_pmid & dat.read$in_geo & dat.read$in_arrayexpress)
append_to_results(number_of_ochsner_pmid_link_to_geo_AND_arrayexpress)
append_to_results(" datasets in both databases (ArrayExpress has a policy of importing selected GEO submissions).")
results
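# Counting pattern used above (illustrative): count_true_items() on the
# conjunction of logical columns counts rows where all are TRUE, e.g.
#   count_true_items(c(TRUE, TRUE, FALSE) & c(TRUE, FALSE, FALSE))   # 1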

start_results("\nWe compared the two retrieval sets: the automated search found ")
number_datasets_found_by_automated_not_ochsner_geo_arrayexpress =
    count_true_items(dat.read$centralized_link_to_ochsner_pmid & !dat.read$ochsner_found_any_data)
append_to_results(number_datasets_found_by_automated_not_ochsner_geo_arrayexpress)
append_to_results(" datasets overlooked by the manual search, ")
append_to_results("while the manual search found ")
number_datasets_found_by_ochsner_geo_arrayexpress_not_automated =
    count_true_items((dat.read$ochsner_found_geo_data | dat.read$ochsner_found_arrayexpress_data) & !dat.read$centralized_link_to_ochsner_pmid)
append_to_results(number_datasets_found_by_ochsner_geo_arrayexpress_not_automated)
append_to_results(" datasets in GEO and ArrayExpress that were not found automatically")
append_to_results(" (plus ")
number_datasets_ochsner_not_geo_arrayexpress_or_automated =
    count_true_items((dat.read$ochsner_journal | dat.read$ochsner_other) & !dat.read$centralized_link_to_ochsner_pmid)
append_to_results(number_datasets_ochsner_not_geo_arrayexpress_or_automated)
append_to_results(" datasets in other internet locations, as described above).")
results

start_results(" A union of the manual and automated searches includes ")
number_of_datasets_found_via_manual_or_automated = count_true_items(dat.read$data_available)
append_to_results(number_of_datasets_found_via_manual_or_automated)
append_to_results(" distinct datasets")
append_to_results("; we consider this our reference set of 'publicly-available datasets' in the analysis below.")

results

start_results("Figure 1 shows the relative recall of the search strategies.")
append_to_results(" As illustrated in the first bar of Figure 1, the manual search retrieved ")
append_to_results("%.1f%%",
    100*number_of_datasets_found_by_ochsner / number_of_datasets_found_via_manual_or_automated)
append_to_results(" of the publicly-available datasets.")

append_to_results(" The second bar highlights the datasets available in either GEO or ArrayExpress: ")
number_datasets_found_geo_arrayexpress_union =
    count_true_items(dat.read$ochsner_found_geo_data | dat.read$ochsner_found_arrayexpress_data | dat.read$centralized_link_to_ochsner_pmid)
append_to_results("%.1f%%",
    100*number_datasets_found_geo_arrayexpress_union / number_of_datasets_found_via_manual_or_automated)
append_to_results(" of the total.")

append_to_results(" Automated searches retrieve ")
append_to_results("%.1f%%",
    100*number_of_ochsner_pmid_link_to_geo_or_arrayexpress / number_datasets_found_geo_arrayexpress_union)
append_to_results(" of the datasets housed in GEO and/or ArrayExpress, or ")
append_to_results("%.1f%%",
    100*number_of_ochsner_pmid_link_to_geo_or_arrayexpress / number_of_datasets_found_via_manual_or_automated)
append_to_results(" of datasets housed anywhere.")

append_to_results(" Finally, the last two bars depict the databases individually. Automated searches of just GEO and just ArrayExpress retrieve ")
append_to_results("%.1f%%",
    100*number_of_ochsner_pmid_link_to_geo / number_of_datasets_found_via_manual_or_automated)
append_to_results(" and ")
append_to_results("%.1f%%",
    100*number_of_ochsner_pmid_link_to_arrayexpress / number_of_datasets_found_via_manual_or_automated)
append_to_results(" of all available datasets, respectively.")
results

start_results("\n\n\n<<TABLE 1 goes here!>>\n\n\n")
cat(all_results)

start_results("Next, we looked at univariate patterns to see if the datasets retrieved automatically ")
append_to_results("differed from those that were retrieved manually. ")
append_to_results(strip_newlines("
The odds that a dataset was about cancer, performed on an Affymetrix platform, 
involved humans, or involved cultured cells were not significantly associated 
with whether the dataset was retrievable through automated mechanisms (p>0.3). 
These odds ratios and their confidence intervals are depicted in Figure 2. 
Similarly, in ANOVA analysis, the species was not significantly different 
between datasets found automatically and those not found automatically (p=0.7).
"))

library(Hmisc,T)
library(Design,T)
library(sciplot)
library(rmeta)

dat.raw = dat.read[which(dat.read$data_available == '1'),]
dim(dat.raw)
names(dat.raw) = gsub("_", ".", names(dat.raw))
names(dat.raw) = gsub(" ", ".", names(dat.raw))
names(dat.raw)

dat = dat.raw

dat$all = dat$pmid > ""
dat$in.arrayexpress.or.geo = dat$in.arrayexpress | dat$in.geo
dat$centralized.arrayexpress = dat$in.arrayexpress & dat$centralized.link.to.ochsner.pmid
dat$centralized.geo = dat$in.geo & dat$centralized.link.to.ochsner.pmid
dat$centralized.arrayexpress.or.geo = dat$in.arrayexpress.or.geo & dat$centralized.link.to.ochsner.pmid

# Collapse rare species into "Other", and bucket array designs into
# Affymetrix vs. Other
levels(dat$species)[table(dat$species) < .03*length(dat$species)] = "Other"

affy = grep("Affymetrix", levels(dat$array.design))
levels(dat$array.design)[-affy] = "Other"
affy = grep("Affymetrix", levels(dat$array.design))
levels(dat$array.design)[affy] = "Affymetrix"
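# Level-collapsing sketch (illustrative): assigning a duplicated level
# name merges those levels, e.g.
#   f = factor(c("Affymetrix U133A", "Agilent", "Affymetrix Mouse"))
#   levels(f)[-grep("Affymetrix", levels(f))] = "Other"
#   levels(f)[grep("Affymetrix", levels(f))] = "Affymetrix"
#   table(f)   # Affymetrix: 2, Other: 1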

fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$array.design))
fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$pubmed.is.cancer))
fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$pubmed.is.cultured.cells))
fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$pubmed.is.humans))
fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$species))

anova(lrm(species ~ centralized.link.to.ochsner.pmid, dat=dat))

results

start_results("\n")

wilcox.test(dat$pubmed.number.times.cited.in.pmc ~ dat$centralized.link.to.ochsner.pmid)
wilcox.test(dat$number.samples ~ dat$centralized.link.to.ochsner.pmid)

append_to_results(strip_newlines("We did find some differences. "))
wilcox.test(dat$impact.factor ~ dat$centralized.link.to.ochsner.pmid)

append_to_results(strip_newlines("Histograms of the distribution of 
citations (Figure 3), impact factors (Figure 4), and sample size (Figure 5) 
illustrate the different distributions for studies with database citation links 
versus those without. Searching centralized links in GEO and ArrayExpress can find "))
table_impact_factor = table(dat$centralized.link.to.ochsner.pmid, cut(dat$impact.factor, c(0, 10, 20, 100)))
prop_table_impact_factor = prop.table(table_impact_factor, 2)
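# prop.table(x, 2) normalizes each column to sum to 1, so row 2 of
# prop_table_impact_factor is the share of datasets with a centralized
# link (TRUE) within each impact-factor bin; e.g. [2, 3] is that share
# for the (20,100] bin.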
append_to_results("%.1f%%", 100*prop_table_impact_factor[2, 3])
append_to_results(" of datasets associated with articles published in journals with impact factors greater than 20, but only ")
append_to_results("%.1f%%", 100*prop_table_impact_factor[2, 2])
append_to_results(" of those with impact factors from 10-20, and only ")
append_to_results("%.1f%%", 100*prop_table_impact_factor[2, 1])
append_to_results(" of the datasets for papers published with impact factors less than 10.")
results

start_results("\n")
append_to_results("Similarly, our automated search found ")
table_citations = table(dat$centralized.link.to.ochsner.pmid, cut(dat$pubmed.number.times.cited.in.pmc, c(-1, 0, 5, 10000)))
prop_table_citations = prop.table(table_citations, 2)
prop_table_citations
append_to_results("%.1f%%", 100*prop_table_citations[2, 3])
append_to_results(" of datasets whose associated articles had more than 5 citations, ")
append_to_results("%.1f%%", 100*prop_table_citations[2, 2])
append_to_results(" with 0-5 citations, and only ")
append_to_results("%.1f%%", 100*prop_table_citations[2, 1])
append_to_results(" with 0 citations.")

append_to_results("\nOur automated queries retrieved ")
table_sample_size = table(dat$centralized.link.to.ochsner.pmid, cut(dat$number.samples, c(0, 10, 100, 10000)))
prop_table_sample_size = prop.table(table_sample_size, 2)
prop_table_sample_size
append_to_results("%.1f%%", 100*prop_table_sample_size[2, 3])
append_to_results(" of datasets with more than 100 datapoints, ")
append_to_results("%.1f%%", 100*prop_table_sample_size[2, 2])
append_to_results(" with between 10 and 100, and ")
append_to_results("%.1f%%", 100*prop_table_sample_size[2, 1])
append_to_results(" of datasets with fewer than 10 datapoints.")
results

start_results("\n")
append_to_results(strip_newlines("A journal policy requiring an accession number in published articles 
may increase the number of citation links in centralized databases ("))
table_journal_policy = table(dat$centralized.link.to.ochsner.pmid, dat$journal.policy.requires)
prop_table_journal_policy = prop.table(table_journal_policy, 2)
append_to_results("%.1f%%", 100*prop_table_journal_policy[2, 2])
append_to_results(strip_newlines(" of datasets can be found through citation links for journals that 
require an accession number, vs. "))
append_to_results("%.1f%%", 100*prop_table_journal_policy[2, 1])
append_to_results(strip_newlines(" of datasets in other journals), but this 
difference was not statistically significant. "))

fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$journal.policy.requires))
results

start_results("\n")
append_to_results(strip_newlines("Multivariate regressions did not reveal any significant relationships between 
dataset retrievability and impact factor, number of citations, sample size, 
or journal policy."))

f = lrm(formula = centralized.link.to.ochsner.pmid
    ~ rcs(log(impact.factor), 4)
    + journal.policy.requires
    , dat=dat, x=T, y=T)
f
anova(f)

results
cat(all_results)


plot_on_same_graph = function() {
    par(new=TRUE)
}

plot_on_new_graph = function() {
    par(new=FALSE)
}

plot_ci_bar = function(x, bar_number=1, max_number_bars=1, bar_name="", col="black") {
    y_max = 1
    bar_position = bar_number*2 - 1
    x_spaces = max_number_bars*2
    no_factors = rep(TRUE, length(x))
    plot_on_same_graph()
    bargraph.CI(x.factor = no_factors,
        response = x,
        xlim=c(0, x_spaces),
        ylim=c(0, y_max),
        names.arg=c(bar_name),
        col=col,
        space=bar_position)
}
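# Usage sketch (illustrative): each call draws one bar (the mean of a
# logical vector, i.e. a proportion) with error bars into a fixed grid of
# slots; sciplot's bargraph.CI defaults to mean +/- standard error, e.g.
#   plot_on_new_graph()
#   plot_ci_bar(dat$in.geo, bar_number=2, max_number_bars=4, bar_name="GEO")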

greys = grey(0:8 / 8)
light_grey = greys[8]
medium_grey = greys[4]
dark_grey = greys[2]

plot_on_new_graph()

plot_ci_bar(dat$all, 1, 4, "", light_grey)
plot_ci_bar(dat$all, 2, 4, "", light_grey)
plot_ci_bar(dat$all, 3, 4, "", light_grey)

plot_ci_bar(dat$in.arrayexpress.or.geo, 1, 4, "GEO and/or\nArrayExpress", medium_grey)
plot_ci_bar(dat$in.geo, 2, 4, "GEO", medium_grey)
plot_ci_bar(dat$in.arrayexpress, 3, 4, "ArrayExpress", medium_grey)

plot_ci_bar(dat$centralized.arrayexpress.or.geo, 1, 4, "", dark_grey)
plot_ci_bar(dat$centralized.geo, 2, 4, "", dark_grey)
plot_ci_bar(dat$centralized.arrayexpress, 3, 4, "", dark_grey)

legend("right", c("\nDatasets not at this location\n", "\nDatasets found only\nby Ochsner search\n", "Datasets found by\nsearch of databases\nfor PubMed IDs"),
    fill=c(light_grey, medium_grey, dark_grey), bty='n')

title(main=NULL, ylab="Proportion of article-related publicly-available datasets")


hist_back_back_wrapper = function(histdata, splitdata, breaks=NULL,
        title="Histogram", ylabels="", xlabels="",
        log="n", sigfigs=0){
    plotdata = histdata
    if (log=="y") {
        plotdata = log(histdata)
    }
    sp = split(plotdata, splitdata)
    out = histbackback(sp, probability=TRUE, brks=breaks)
    print(out)
    frequency_maximum = max(c(out$left, out$right))
    frequency_plot_limit = frequency_maximum * 1.1
    print(frequency_plot_limit)
    out = histbackback(sp, probability=TRUE,
        main = title, brks=breaks,
        xlim=c(-frequency_plot_limit, frequency_plot_limit),
        xlab=xlabels,
        axes=FALSE)
    print(out)
    if (log=="y") {
        breaklabels = round(exp(out$breaks), sigfigs)
    } else {
        breaklabels = out$breaks
    }
    title(ylab=ylabels, xlab=xlabels)
    xpoints = c(-frequency_plot_limit/2, 0, frequency_plot_limit/2)
    print(xpoints)
    axis(2, at=0:(length(breaklabels)-1), labels=breaklabels)
    axis(1, at=xpoints, labels=round(abs(xpoints)/sum(out$left), 2))

    barplot(-out$left, col=medium_grey, horiz=TRUE, space=0, add=TRUE, axes=FALSE)
    barplot(out$right, col=light_grey, horiz=TRUE, space=0, add=TRUE, axes=FALSE)
    return(out)
}
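# histbackback() is from Hmisc: given a two-group split it draws paired
# horizontal histograms back to back and returns the breaks plus left/right
# bar heights; the wrapper above rescales the axes and overpaints the bars
# with barplot(add=TRUE) to control the two grey fills.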

hist_above_wrapper = function(histdata, splitdata, breaks=NULL,
        title=c("", ""), ylabels="", xlabels="",
        use_log="n", sigfigs=0){
    plotdata = histdata
    if (use_log=="y") {
        plotdata = log(histdata)
    }
    sp = split(plotdata, splitdata)
    h = hist(plotdata, freq=FALSE, axes=FALSE)
    print(h)

    par(mfrow=c(2, 1))
    hist_min = 0.9*min(plotdata)
    hist_max = 1.1*max(plotdata)
    h_1 = hist(sp$`1`, breaks=h$breaks, main=title[1], col=medium_grey, xlim=c(hist_min, hist_max), axes=FALSE, xlab="", ylab="")

    if (use_log=="y") {
        breaklabels = round(exp(h$breaks), sigfigs)
    } else {
        breaklabels = round(h$breaks, sigfigs)
    }
    print(breaklabels)
    title(ylab=ylabels)
    axis(1, at=h$breaks, labels=breaklabels)
    freq_label = c(0, round(max(h_1$counts)/sum(h_1$counts), 2))
    axis(2, at=c(0, max(h_1$counts)), labels=freq_label)

    h_0 = hist(sp$`0`, breaks=h$breaks, main=title[2], col=medium_grey, xlim=c(hist_min, hist_max), axes=FALSE, xlab="", ylab="")
    title(ylab=ylabels, xlab=xlabels)
    axis(1, at=h$breaks, labels=breaklabels)
    freq_label = c(0, round(max(h_0$counts)/sum(h_0$counts), 2))
    axis(2, at=c(0, max(h_0$counts)), labels=freq_label)
}

quartz()
hist_above_wrapper(dat$impact.factor,
    dat$centralized.link.to.ochsner.pmid,
    seq(0.5, 4, by=0.25),
    c(paste("Histogram of Impact Factor for datasets\nFound by query of databases for PubMed IDs (n=", count_true_items(dat$centralized.link.to.ochsner.pmid), ")", sep=""),
      paste("Not found by query of databases (n=", count_false_items(dat$centralized.link.to.ochsner.pmid), ")", sep="")),
    "Proportion of datasets",
    "Impact factor (log scale)",
    use_log="y", sigfigs=0)

quartz()
hist_above_wrapper(dat$pubmed.number.times.cited.in.pmc+.5,
    dat$centralized.link.to.ochsner.pmid,
    NULL,
    c(paste("Histogram of Received Citations for datasets\nFound by query of databases for PubMed IDs (n=", count_true_items(dat$centralized.link.to.ochsner.pmid), ")", sep=""),
      paste("Not found by query of databases (n=", count_false_items(dat$centralized.link.to.ochsner.pmid), ")", sep="")),
    "Proportion of datasets",
    "Number of citations by articles in PubMed Central (log scale)",
    use_log="y", sigfigs=0)

quartz()
# The split vector is subset to match the NA-filtered sample sizes
hist_above_wrapper(dat$number.samples[!is.na(dat$number.samples)],
    dat$centralized.link.to.ochsner.pmid[!is.na(dat$number.samples)],
    seq(0, 8, by=1),
    c(paste("Histogram of Dataset Size for datasets\nFound by query of databases for PubMed IDs (n=", count_true_items(dat$centralized.link.to.ochsner.pmid[!is.na(dat$number.samples)]), ")", sep=""),
      paste("Not found by query of databases (n=", count_false_items(dat$centralized.link.to.ochsner.pmid[!is.na(dat$number.samples)]), ")", sep="")),
    "Proportion of datasets",
    "Number of samples in microarray study (log scale)",
    use_log="y", sigfigs=0)

Sys.setenv( PATH=paste(Sys.getenv("PATH"), "/usr/texbin", sep=":") )

library(Rserve)
Rserve(args="--no-save")

library(plyr)

setwd("/Users/hpiwowar/Documents/Projects/PLoS ArticleImpact")
source("aim3_functions_20100215.R")
dat.raw = read.csv("PLoS_Metrics.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
dat = data.frame(doi = dat.raw$DOI)

dat$days.since.aug.2009 = dat.raw$Days.Since.Aug.2009
dat$pmid = dat.raw$PMID

dat$is.plos.biology = ifelse(dat.raw$Journal == "PLoS Biology", 1, 0)
dat$is.plos.clinical.trials = ifelse(dat.raw$Journal == "PLoS Clinical Trials", 1, 0)
dat$is.plos.comp.bio = ifelse(dat.raw$Journal == "PLoS Computational Biology", 1, 0)
dat$is.plos.genetics = ifelse(dat.raw$Journal == "PLoS Genetics", 1, 0)
dat$is.plos.medicine = ifelse(dat.raw$Journal == "PLoS Medicine", 1, 0)
dat$is.plos.neglect.tropical = ifelse(dat.raw$Journal == "PLoS Neglected Tropical Diseases", 1, 0)
dat$is.plos.one = ifelse(dat.raw$Journal == "PLoS ONE", 1, 0)
dat$is.plos.pathogens = ifelse(dat.raw$Journal == "PLoS Pathogens", 1, 0)
dat$is.research = factor(dat.raw$X.Research.Article..or..non.Research.Article..)

dat$num.cites.crossref = dat.raw$Citations...CrossRef
dat$num.cites.pmc = dat.raw$Citations...PubMed.Central
dat$num.cites.scopus = dat.raw$Citations...Scopus
dat$num.page.views.html = dat.raw$Total.HTML.Page.Views
dat$num.downloads.pdf = dat.raw$Total.PDF.Downloads
dat$num.downloads.xml = dat.raw$Total.XML.Downloads

dat$num.blogs.postgenomic = dat.raw$Blog.Coverage...Postgenomic
dat$num.blogs.natureblogs = dat.raw$Blog.Coverage...Nature.Blogs
dat$num.blogs.bloglines = dat.raw$Blog.Coverage...Bloglines
dat$num.trackbacks = dat.raw$Number.of.Trackbacks
dat$num.bookmarks.citeulike = dat.raw$Social.Bookmarking...CiteULike
dat$num.bookmarks.connotea = dat.raw$Social.Bookmarking...Connotea
dat$num.ratings = dat.raw$Number.of.Ratings
dat$avg.rating = dat.raw$Average.Rating
dat$num.note.threads = dat.raw$Number.of.Note.threads
dat$num.note.replies = dat.raw$Number.of.replies.to.Notes
dat$num.comment.threads = dat.raw$Number.of.Comment.threads

dat$num.ratings.with.comment = dat.raw$Number.of..Star.Ratings..that.also.include.a.text.comment

dat.transformed = dat
vars.no.transform = c("doi", "pmid", "days.since.aug.2009")

vars.factor = c("is.research", "is.plos.clinical.trials", "is.plos.biology",
    "is.plos.comp.bio", "is.plos.genetics", "is.plos.medicine", "is.plos.neglect.tropical", "is.plos.one", "is.plos.pathogens")
dat.transformed[,vars.factor] = colwise(factor)(dat[,vars.factor])

vars.to.transform = !(names(dat.transformed) %in% c(vars.no.transform, vars.factor))
dat.transformed[,vars.to.transform] = tr(dat[,vars.to.transform])
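# NOTE: tr() used above is not base R; it is assumed to be the variable
# transformation helper defined in the sourced aim3_functions_20100215.R.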

summary(dat.transformed)
library(Hmisc)
library(psych)

Hmisc::describe(dat, listunique=0)
dat.indep.stats = dat.transformed[,vars.to.transform]
library(polycor)

myhetcorr = hetcor.modified(dat.indep.stats, use="pairwise.complete.obs", std.err=FALSE, pd=FALSE)
mycor.unadjusted = myhetcorr$correlations

which(is.na(mycor.unadjusted))

mycor = adjust.to.positive.definite(mycor.unadjusted)
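# NOTE: hetcor.modified() and adjust.to.positive.definite() are assumed to
# be local variants (of polycor::hetcor and a nearPD-style correction)
# defined in the sourced helper file. hetcor computes a heterogeneous
# correlation matrix, choosing Pearson, polyserial, or polychoric
# correlations as appropriate for each pair of variables.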

# Explore color palettes for the correlation heatmap
library(gplots)

# Helper to preview a color palette as a horizontal band
showpanel <- function(col) {
    image(z=matrix(1:100, ncol=1), col=col, xaxt="n", yaxt="n")
}

heatmap.2(mycor, col=bluered(16), cexRow=0.5, cexCol=.8, symm=TRUE, dend="row", trace="none", main="Thesis Data", margins=c(15,15), key=FALSE, keysize=0.1)
mycor
bluered(16)
?bluered
showpanel(bluered(64))
heatmap.2(mycor, col=bluered(16), cexRow=0.5, cexCol=.9, symm=TRUE, dend="row", trace="none", main="Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
heatmap.2(mycor, col=bluered(16), cexRow=0.9, cexCol=.9, symm=TRUE, dend="row", trace="none", main="Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
?colors
heat.colors
heatmap.2(mycor, col=heat.colors(16), cexRow=0.9, cexCol=.9, symm=TRUE, dend="row", trace="none", main="Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
heatmap.2(mycor, col=topo.colors(16), cexRow=0.9, cexCol=.9, symm=TRUE, dend="row", trace="none", main="Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
heatmap.2(mycor, col=terrain.colors(16), cexRow=0.9, cexCol=.9, symm=TRUE, dend="row", trace="none", main="Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
heatmap.2(mycor, col=cm.colors(16), cexRow=0.9, cexCol=.9, symm=TRUE, dend="row", trace="none", main="Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
cm.colors
?cm.colors
showpanel(cm.colors(16))
cm.colors(16)
cm.colors(16)[1:8]
showpanel(cm.colors(16)[1:8])
showpanel(bluered(16)[1:8])
showpanel(bluered(32)[1:16])
showpanel(bluered(32)[17:32])
heatmap.2(mycor, col=bluered(32)[17:32], cexRow=0.9, cexCol=.9, symm=TRUE, dend="row", trace="none", main="Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)

library(nFactors)
eigenvectors.1st <- eigen(mycor)

aparallel.1st <- parallel(subject=nrow(dat.indep.stats), var=ncol(dat.indep.stats), rep=100, cent=.05)
scree.results.1st <- nScree(eigenvectors.1st$values, aparallel.1st$eigen$qevpea)
summary(scree.results.1st)
plotnScree(scree.results.1st)

number.factors.1st = 5
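# Parallel analysis retains factors whose observed eigenvalues exceed those
# of random data with the same dimensions; nScree() also reports the
# optimal-coordinates and acceleration-factor rules. The choice of 5
# first-order factors appears to follow the scree output above.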

fit.ml = factanal(dat, number.factors.1st, rotation="promax", covmat=mycor)
print(fit.ml, sort=TRUE)

library(psych)
fit.fa.1st = fa(mycor, number.factors.1st, fm="minres", rotate="promax",
    scores=FALSE, residuals=TRUE, n.obs=max(dim(dat.indep.stats)))

print(fit.fa.1st, sort=TRUE)

factor.names.1st = c(
    "MR1"="Ratings",
    "MR2"="Citations",
    "MR3"="Views",
    "MR4"="Bookmarks",
    "MR5"="Notes")

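# NOTE: print.thresh() used below is assumed to come from the sourced
# helper file, printing the variables whose loadings on a factor are above
# (or below) the given threshold.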
for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    print.thresh(fit.fa.1st$loadings[, afactor], .4, TRUE)
    print.thresh(fit.fa.1st$loadings[, afactor], -0.4, FALSE)
}

fit.fa.1st.cor = fit.fa.1st$r.scores
colnames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]
rownames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]

eigenvectors.2nd <- eigen(fit.fa.1st.cor)
aparallel.2nd <- parallel(subject=nrow(fit.fa.1st.cor), var=ncol(fit.fa.1st.cor), rep=100, cent=.05)
scree.results.2nd <- nScree(eigenvectors.2nd$values, aparallel.2nd$eigen$qevpea)
scree.results.2nd
plotnScree(scree.results.2nd)

number.factors.2nd = 2

# fm= (not fa=) selects the minres fitting method in psych::fa
fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fm="minres", rotate="varimax")
print(fit.fa.2nd, sort=TRUE)

U = fit.fa.2nd$uniquenesses * diag(number.factors.1st)
P = fit.fa.2nd$loadings
A = cbind(P, U)
Pvf = fit.fa.1st$loadings

Pvo = Pvf %*% A
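# Pvo projects the observed variables onto the second-order factors: A
# binds the second-order loadings P with the diagonal uniqueness matrix U,
# so Pvo = Pvf %*% A carries each variable's first-order loadings through
# to the second-order solution, in the spirit of a Schmid-Leiman
# transformation.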

factor.names.2nd = c(
    "MR1"="Views, citations, bookmarks",
    "MR2"="Notes and ratings")

for (afactor in names(factor.names.2nd)) {
    print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))
    print.thresh(Pvo[, afactor], .3, TRUE)
    print.thresh(Pvo[, afactor], -0.3, FALSE)
}

for (afactor in names(factor.names.2nd)) {
    print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))
    print.thresh(fit.fa.2nd$loadings[, afactor], .3, TRUE)
    print.thresh(fit.fa.2nd$loadings[, afactor], -0.3, FALSE)
}

dat.impute.input = dat.indep.stats

library(mice)

mice.output = mice(dat.impute.input, m=1)

dat.imputed = complete(mice.output, 1)

dat.for.scores = dat.imputed

scores.1st = factor.scores.bartlett(dat.for.scores, fit.fa.1st)
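# NOTE: factor.scores.bartlett() is assumed to be defined in the sourced
# helper file; Bartlett-style scoring weights the (imputed) observed
# variables by the factor loadings and uniquenesses to estimate scores.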

load("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats/dat_aim3.RData")

dat.plos = cbind(dat, scores.1st)

dat.merge = merge(dat.plos, dat.aim3, by="pmid")

dat.regress = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int)
score.names = dimnames(scores.1st)[[2]]
dat.regress[,score.names] = dat.merge[,names(dat.merge) %in% score.names]

vars.indep = c("days.since.aug.2009",
    "num.authors.tr",
    "first.author.num.prev.pubs.tr",
    "first.author.num.prev.pmc.cites.tr",
    "last.author.num.prev.pubs.tr",
    "last.author.num.prev.pmc.cites.tr",
    "num.grant.numbers.tr",
    "pubmed.is.humans",
    "pubmed.is.cancer",
    "country.usa",
    "institution.rank",
    "nih.sum.sum.dollars.tr")

dat.regress[,vars.indep] = dat.merge[,vars.indep]

plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress, main="MR2")
boxplot(log(MR1) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
boxplot(log(MR2) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
boxplot(log(MR3) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
boxplot(log(MR4) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
boxplot(log(MR5) ~ dataset.in.geo.or.ae.int, dat=dat.regress)

library(rms)
dd.regress = datadist(dat.regress)
options(datadist='dd.regress')
options(digits=2)

f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +
        (rcs(days.since.aug.2009, 3) +
        rcs(num.authors.tr, 3) ) +
        rcs(first.author.num.prev.pubs.tr, 3) +
        rcs(first.author.num.prev.pmc.cites.tr, 3) +
        rcs(last.author.num.prev.pubs.tr, 3) +
        rcs(last.author.num.prev.pmc.cites.tr, 3) +
        as.numeric(pubmed.is.humans) +
        as.numeric(pubmed.is.cancer) +
        as.numeric(country.usa) ),
    dat=dat.regress, x=T, y=T)
anova(f.1st.nonlinear.interactions.reduced)

f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +
        rcs(days.since.aug.2009, 3) +
        (rcs(num.authors.tr, 3) +
        rcs(last.author.num.prev.pmc.cites.tr, 3) +
        as.numeric(pubmed.is.humans) +
        as.numeric(pubmed.is.cancer) +
        as.numeric(country.usa) ) ),
    dat=dat.regress, x=T, y=T)
anova(f.1st.nonlinear.interactions.reduced)

s = summary(f.1st.nonlinear.interactions.reduced)
s
plot(s)
names(dat)

f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +
        (rcs(days.since.aug.2009, 10) +
        rcs(num.authors.tr, 3) ) +
        rcs(first.author.num.prev.pubs.tr, 3) +
        rcs(first.author.num.prev.pmc.cites.tr, 3) +
        rcs(last.author.num.prev.pubs.tr, 3) +
        rcs(last.author.num.prev.pmc.cites.tr, 3) +
        as.numeric(pubmed.is.humans) +
        as.numeric(pubmed.is.cancer) +
        as.numeric(country.usa) ),
    dat=dat.regress, x=T, y=T)
anova(f.1st.nonlinear.interactions.reduced)

f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +
        rcs(days.since.aug.2009, 10) +
        (rcs(num.authors.tr, 3) +
        rcs(last.author.num.prev.pmc.cites.tr, 3) +
        as.numeric(pubmed.is.humans) +
        as.numeric(pubmed.is.cancer) +
        as.numeric(country.usa) ) ),
    dat=dat.regress, x=T, y=T)
anova(f.1st.nonlinear.interactions.reduced)

s = summary(f.1st.nonlinear.interactions.reduced)
s
plot(s)

summ.1st.nonlinear.interactions.reduced = summary(f.1st.nonlinear.interactions.reduced)
summ.1st.nonlinear.interactions.reduced.dimnames = dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1, length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]), 2)]
dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1, length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]), 2)] = factor.names.1st[summ.1st.nonlinear.interactions.reduced.dimnames]
plot(summ.1st.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)
title("Multivariate nonlinear regressions with interactions")

dat.regress.named = dat.regress
names(dat.regress.named) = c("dataset.in.geo.or.ae.int", factor.names.1st[names(dat.regress)[-1]])

dots.1st.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,
    dat=dat.regress.named)
plot(dots.1st.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7,
    xlab="Percentage of studies with datasets in GEO or ArrayExpress",
    main="Univariate data sharing behaviour on first order factors")

loadings.2nd = Pvo

fit.pa.2nd.tovars = list(loadings=loadings.2nd[,colnames(fit.fa.2nd$loadings)])
fit.pa.2nd.tovars$uniquenesses = apply(fit.pa.2nd.tovars$loadings^2, 1, sum)

scores.to.dat.2nd = factor.scores.bartlett(dat.for.scores, fit.pa.2nd.tovars)

dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat$dataset.in.geo.or.ae.int)
dat.regress.2nd[,dimnames(scores.to.dat.2nd)[[2]]] = scores.to.dat.2nd

library(rms)

dd.regress.2nd = datadist(dat.regress.2nd)
options(datadist='dd.regress.2nd')
options(digits=2)

f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~
    rcs(MR3, 4) + (rcs(MR1, 4) + rcs(MR4, 4) + rcs(MR5, 4) + rcs(MR2, 4))^2,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)
names(dat.regress.2nd)
dimnames(scores.to.dat.2nd)[[2]]
scores.to.dat.2nd
dim(dat.regress.2nd)
str(dat.regress.2nd)
names(dat)

# dat has no dataset.in.geo.or.ae.int column; rebuild the second-order
# regression frame from the merged data instead
dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int)
dim(dat.merge)
dim(scores.to.dat.2nd)
dim(scores.1st)
score.names.2nd = dimnames(scores.to.dat.2nd)[[2]]
dat.regress.2nd[,score.names.2nd] = dat.merge[,names(dat.merge) %in% score.names.2nd]

library(rms)

dd.regress.2nd = datadist(dat.regress.2nd)
options(datadist='dd.regress.2nd')
options(digits=2)

f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~
    rcs(MR3, 4) + (rcs(MR1, 4) + rcs(MR4, 4) + rcs(MR5, 4) + rcs(MR2, 4))^2,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~
    (rcs(MR1, 4) + rcs(MR2, 4))^2,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ rcs(MR1, 4),
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

names(dat.merge)

# days.since.aug.2009 must be copied in (with a quoted column name)
# before it can be modelled
dat.regress.2nd[,"days.since.aug.2009"] = dat.merge[,"days.since.aug.2009"]
f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~
    rcs(days.since.aug.2009, 10) + rcs(MR1, 4),
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~
    rcs(days.since.aug.2009, 10) + rcs(MR1, 3),
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~
    (rcs(days.since.aug.2009, 10) + rcs(MR1, 3))^2,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +
    rcs(days.since.aug.2009, 10))^2,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~
    (rcs(MR1, 4) + rcs(MR2, 4))^2,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

summ.2nd.nonlinear.interactions.reduced = summary(f.2nd.nonlinear.interactions.reduced)
summ.2nd.nonlinear.interactions.reduced.dimnames = dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1, length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]), 2)]
dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1, length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]), 2)] = factor.names.2nd[summ.2nd.nonlinear.interactions.reduced.dimnames]
plot(summ.2nd.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)
title("Multivariate nonlinear regression with interactions")
dat.regress.2nd[,vars.indep] = dat.merge[,vars.indep]

library(rms)

dd.regress.2nd = datadist(dat.regress.2nd)
options(datadist='dd.regress.2nd')
options(digits=2)

f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +
        (rcs(days.since.aug.2009, 10) +
        rcs(num.authors.tr, 3) ) +
        rcs(first.author.num.prev.pubs.tr, 3) +
        rcs(first.author.num.prev.pmc.cites.tr, 3) +
        rcs(last.author.num.prev.pubs.tr, 3) +
        rcs(last.author.num.prev.pmc.cites.tr, 3) +
        as.numeric(pubmed.is.humans) +
        as.numeric(pubmed.is.cancer) +
        as.numeric(country.usa) ),
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)

f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +
        (rcs(days.since.aug.2009, 10) +
        rcs(num.authors.tr, 3) ) +
        rcs(first.author.num.prev.pubs.tr, 3) +
        rcs(first.author.num.prev.pmc.cites.tr, 3) +
        rcs(last.author.num.prev.pubs.tr, 3) +
        rcs(last.author.num.prev.pmc.cites.tr, 3) +
        as.numeric(pubmed.is.humans) +
        as.numeric(pubmed.is.cancer) +
        as.numeric(country.usa) )^2,
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)
f.2nd.nonlinear.interactions.reduced

f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +
        (rcs(days.since.aug.2009, 10) +
        rcs(num.authors.tr, 3) ) +
        rcs(first.author.num.prev.pubs.tr, 3) +
        rcs(first.author.num.prev.pmc.cites.tr, 3) +
        rcs(last.author.num.prev.pubs.tr, 3) +
        rcs(last.author.num.prev.pmc.cites.tr, 3) +
        as.numeric(pubmed.is.humans) +
        as.numeric(pubmed.is.cancer) +
        as.numeric(country.usa) ),
    dat=dat.regress.2nd, x=T, y=T)
anova(f.2nd.nonlinear.interactions.reduced)
f.2nd.nonlinear.interactions.reduced
dim(dat.regress.2nd)

summ.2nd.nonlinear.interactions.reduced = summary(f.2nd.nonlinear.interactions.reduced)
summ.2nd.nonlinear.interactions.reduced.dimnames = dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1, length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]), 2)]
dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1, length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]), 2)] = factor.names.2nd[summ.2nd.nonlinear.interactions.reduced.dimnames]
plot(summ.2nd.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)
title("Multivariate nonlinear regression with interactions")
dat.regress.2nd.named = dat.regress.2nd
names(dat.regress.2nd.named) = c("dataset.in.geo.or.ae.int", factor.names.2nd[names(dat.regress.2nd)[-1]])

dots.2nd.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,
    dat=dat.regress.2nd.named)
plot(dots.2nd.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7,
    xlab="Percentage of studies with datasets in GEO or ArrayExpress",
    main="Univariate data sharing behaviour\non second order factors")
?plsmo
names(dat.regress)
plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1)
?par
par(mfcol=c(5, 1))
plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1, main="MR1")
par(mfcol=c(1, 2))
plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1, main="MR1")

for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    plsmo(dat.regress$days.since.aug.2009, dat.regress[afactor], main=factor.names.1st[afactor])
}

for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    quartz()
    plsmo(dat.regress$days.since.aug.2009, dat.regress[afactor], main=factor.names.1st[afactor])
}

for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    quartz()
    plsmo(dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])
    title(factor.names.1st[afactor])
}

?rev
max(dat.regress$days.since.aug.2009)
?plot

# Negate the time axis so the oldest articles appear on the left
for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    quartz()
    plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])
    title(factor.names.1st[afactor])
}
# Box plots of each factor score by data-sharing status. Note: indexing
# with dat.regress[afactor] yields a one-column data frame, which log()
# and boxplot()'s formula interface handle poorly; use dat.regress[,afactor].
afactor = "MR1"
summary(dat.regress[afactor])
plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
boxplot(MR1 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
boxplot(log(MR1) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
boxplot(log(dat.regress[,"MR1"]) ~ dat.regress$dataset.in.geo.or.ae.int)

for (afactor in names(factor.names.1st)) {
    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))
    quartz()
    boxplot(dat.regress[,afactor] ~ dat.regress$dataset.in.geo.or.ae.int)
    title(factor.names.1st[afactor])
}
  1168. names(dat)
  1169. fit.fa.2nd$loadings[, afactor]
  1170. fit.fa.1st$loadings[, afactor]
  1171. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))
  1172. loaded
  1173. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
  1174. quartz()#
  1175. plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
  1176. title(factor.names.1st[afactor])#
  1177. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
  1178. plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[names(dat.regress) %in% loaded], add=T)
  1179. [names(dat.regress) %in% loaded
  1180. names(dat.regress) %in% loaded
  1181. loaded
  1182. names(dat.regress)
  1183. for (afactor in names(factor.names.1st)) {#
  1184. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
  1185. quartz()#
  1186. plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
  1187. title(factor.names.1st[afactor])#
  1188. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
  1189. plsmo(-1*dat.merge$days.since.aug.2009, dat.merge[names(dat.marge) %in% loaded], add=T)#
  1190. }
  1191. plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
  1192. title(factor.names.1st[afactor])#
  1193. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
  1194. plsmo(-1*dat.merge$days.since.aug.2009, dat.merge[names(dat.merge) %in% loaded], add=T)
  1195. names(dat.merge) %in% loaded
  1196. plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
  1197. title(factor.names.1st[afactor])#
  1198. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
  1199. plsmo(-1*dat.merge$days.since.aug.2009, dat.merge[,names(dat.merge) %in% loaded], add=T)
  1200. dim(dat.merge[,names(dat.merge) %in% loaded])
  1201. dim(dat.merge$days.since.aug.2009)
  1202. length(dat.merge$days.since.aug.2009)
  1203. dat.merge[,names(dat.merge) %in% loaded]
  1204. for (afactor in names(factor.names.1st)) {#
  1205. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
  1206. quartz()#
  1207. plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
  1208. title(factor.names.1st[afactor])#
  1209. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
  1210. matpoints(-1*dat.merge$days.since.aug.2009, as.matrix(dat.merge[,names(dat.merge) %in% loaded]))  # plot() has no add= argument; matpoints() overlays the loaded variables#
  1211. }
  1212. setwd("/Users/hpiwowar/Documents/Projects/PLoS ArticleImpact")#
  1213. source("aim3_functions_20100215.R")#
  1214. #
  1215. #
  1216. #
  1217. dat.raw = read.csv("summary_alm_data.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
  1218. setwd("/Users/hpiwowar/Documents/Projects/PLoS ArticleImpact")#
  1219. source("aim3_functions_20100215.R")#
  1220. #
  1221. #
  1222. #
  1223. dat.raw = read.csv("summary_alm_data_201002.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
  1224. dim(dat.raw)
  1225. names(dat.raw)
  1226. summary(dat.raw)
  1227. names(dat.raw)[0:20]
  1228. dat.raw[0:20,]
  1229. dat = data.frame(doi = dat.raw$DOI)#
  1230. #
  1231. #
  1232. dat$publication.date = dat.raw$Publication.Date#
  1233. #
  1234. #
  1235. #
  1236. dat$is.plos.biology = ifelse(dat.raw$Journal == "PLoS Biology", 1, 0)#
  1237. dat$is.plos.clinical.trials = ifelse(dat.raw$Journal == "PLoS Clinical Trials", 1, 0)#
  1238. dat$is.plos.comp.bio = ifelse(dat.raw$Journal == "PLoS Computational Biology", 1, 0)#
  1239. dat$is.plos.genetics = ifelse(dat.raw$Journal == "PLoS Genetics", 1, 0)#
  1240. dat$is.plos.medicine = ifelse(dat.raw$Journal == "PLoS Medicine", 1, 0)#
  1241. dat$is.plos.neglect.tropical = ifelse(dat.raw$Journal == "PLoS Neglected Tropical Diseases", 1, 0)#
  1242. dat$is.plos.one = ifelse(dat.raw$Journal == "PLoS ONE", 1, 0)#
  1243. dat$is.plos.pathogens = ifelse(dat.raw$Journal == "PLoS Pathogens", 1, 0)#
  1244. dat$is.research = factor(dat.raw$X.Research.Article..or..non.Research.Article..)
  1245. dat$num.cites.crossref = dat.raw$Citations...CrossRef#
  1246. dat$num.cites.pmc = dat.raw$Citations...PubMed.Central#
  1247. dat$num.cites.scopus = dat.raw$Citations...Scopus#
  1248. dat$num.page.views.html = dat.raw$Total.HTML.Page.Views#
  1249. dat$num.downloads.pdf = dat.raw$Total.PDF.Downloads#
  1250. dat$num.downloads.xml = dat.raw$Total.XML.Downloads#
  1251. #
  1252. dat$num.blogs.postgenomic = dat.raw$Blog.Coverage...Postgenomic#
  1253. dat$num.blogs.natureblogs = dat.raw$Blog.Coverage...Nature.Blogs#
  1254. dat$num.blogs.bloglines = dat.raw$Blog.Coverage...Bloglines#
  1255. dat$num.trackbacks = dat.raw$Number.of.Trackbacks#
  1256. dat$num.bookmarks.citeulike = dat.raw$Social.Bookmarking...CiteULike#
  1257. dat$num.bookmarks.connotea = dat.raw$Social.Bookmarking...Connotea#
  1258. dat$num.ratings = dat.raw$Number.of.Ratings#
  1259. dat$avg.rating = dat.raw$Average.Rating#
  1260. dat$num.note.threads = dat.raw$Number.of.Note.threads#
  1261. dat$num.note.replies = dat.raw$Number.of.replies.to.Notes#
  1262. dat$num.comment.threads = dat.raw$Number.of.Comment.threads#
  1263. #
  1264. #
  1265. dat$num.ratings.with.comment = dat.raw$Number.of..Star.Ratings..that.also.include.a.text.comment
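# Hedged aside: the eight per-journal indicator columns above can be built
# in one pass. A minimal sketch on a scratch copy (derived column names,
# not necessarily the exact ones used in this session):
dat.sketch = data.frame(doi = dat.raw$DOI)#
for (j in unique(dat.raw$Journal)) {#
    dat.sketch[[paste("is.", make.names(tolower(j)), sep="")]] = ifelse(dat.raw$Journal == j, 1, 0)#
}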
  1266. dat.transformed = dat#
  1267. vars.no.transform = c("doi", "pmid", "publication.date")#
  1268. #
  1269. vars.factor = c("is.research", "is.plos.clinical.trials", "is.plos.biology",#
  1270. "is.plos.comp.bio", "is.plos.genetics", "is.plos.medicine", "is.plos.neglect.tropical", "is.plos.one", "is.plos.pathogens")#
  1271. dat.transformed[,vars.factor] = colwise(factor)(dat[,vars.factor])#
  1272. #
  1273. vars.to.transform = !(names(dat.transformed) %in% c(vars.no.transform, vars.factor))#
  1274. dat.transformed[,vars.to.transform] = tr(dat[,vars.to.transform])#
  1275. #
  1276. #
  1277. summary(dat.transformed)
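# tr() comes from the sourced aim3_functions file and its definition is not
# shown in this history. Assuming it is a normalizing transform for
# right-skewed counts, a hypothetical stand-in (NOT the author's code):
tr.sketch = function(df) {#
    as.data.frame(lapply(df, function(col) log(1 + as.numeric(col))))#
}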
  1278. #
  1279. library(Hmisc)#
  1280. library(psych)#
  1281. #
  1282. Hmisc::describe(dat, listunique=0)
  1283. table(dat.raw$Journal)
  1284. dat.indep.stats = dat.transformed[,vars.to.transform]#
  1285. #
  1286. #
  1287. #
  1288. #
  1289. #
  1290. #
  1291. #
  1292. library(polycor)#
  1293. #
  1294. myhetcorr = hetcor.modified(dat.indep.stats, use="pairwise.complete.obs", std.err=FALSE, pd=FALSE)#
  1295. mycor.unadjusted = myhetcorr$correlations#
  1296. #
  1297. #
  1298. #
  1299. #
  1300. which(is.na(mycor.unadjusted))#
  1301. #
  1302. #
  1303. #
  1304. #
  1305. #
  1306. mycor = adjust.to.positive.definite(mycor.unadjusted)
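# adjust.to.positive.definite() is another sourced helper. One common
# approach (an assumption about the helper, not its actual source) is to
# snap the matrix to the nearest positive-definite correlation matrix:
library(Matrix)#
mycor.nearpd = as.matrix(nearPD(mycor.unadjusted, corr=TRUE)$mat)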
  1307. library(gplots)#
  1308. heatmap.2(mycor, col=bluered(32)[17:32], cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  1311. dat$days.since.published = max(dat.raw$Publication.Date) - dat.raw$Publication.Date
  1312. dat.indep.stats = dat.transformed[,vars.to.transform]#
  1313. #
  1314. #
  1315. #
  1316. #
  1317. #
  1318. #
  1319. #
  1320. library(polycor)#
  1321. #
  1322. myhetcorr = hetcor.modified(dat.indep.stats, use="pairwise.complete.obs", std.err=FALSE, pd=FALSE)#
  1323. mycor.unadjusted = myhetcorr$correlations#
  1324. #
  1325. #
  1326. #
  1327. #
  1328. which(is.na(mycor.unadjusted))#
  1329. #
  1330. #
  1331. #
  1332. #
  1333. #
  1334. mycor = adjust.to.positive.definite(mycor.unadjusted)#
  1335. #
  1336. #
  1337. library(gplots)#
  1338. heatmap.2(mycor, col=bluered(32)[17:32], cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  1340. names(dat.indep.stats)
  1341. names(dat)
  1342. #
  1343. dat.transformed = dat#
  1344. vars.no.transform = c("doi", "publication.date")#
  1345. #
  1346. vars.factor = c("is.research", "is.plos.clinical.trials", "is.plos.biology",#
  1347. "is.plos.comp.bio", "is.plos.genetics", "is.plos.medicine", "is.plos.neglect.tropical", "is.plos.one", "is.plos.pathogens")#
  1348. dat.transformed[,vars.factor] = colwise(factor)(dat[,vars.factor])#
  1349. #
  1350. vars.to.transform = !(names(dat.transformed) %in% c(vars.no.transform, vars.factor))#
  1351. dat.transformed[,vars.to.transform] = tr(dat[,vars.to.transform])#
  1352. #
  1353. #
  1354. summary(dat.transformed)
  1355. vars.to.transform
  1356. dat.transformed[,vars.to.transform]
  1357. summary(dat.indep.stats )
  1358. vars.to.transform
  1359. names(dat.transformed)
  1360. summary(dat.transformed[,vars.to.transform])
  1361. dat.indep.stats = dat.transformed[,vars.to.transform]
  1362. summary(dat.indep.stats)
  1363. library(polycor)#
  1364. #
  1365. myhetcorr = hetcor.modified(dat.indep.stats, use="pairwise.complete.obs", std.err=FALSE, pd=FALSE)#
  1366. mycor.unadjusted = myhetcorr$correlations#
  1367. #
  1368. #
  1369. #
  1370. #
  1371. which(is.na(mycor.unadjusted))#
  1372. #
  1373. #
  1374. #
  1375. #
  1376. #
  1377. mycor = adjust.to.positive.definite(mycor.unadjusted)#
  1378. #
  1379. #
  1380. library(gplots)#
  1381. heatmap.2(mycor, col=bluered(32)[17:32], cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  1382. dat.transformed = dat#
  1383. vars.no.transform = c("doi", "publication.date")#
  1384. #
  1385. vars.factor = c("is.research", "is.plos.clinical.trials", "is.plos.biology",#
  1386. "is.plos.comp.bio", "is.plos.genetics", "is.plos.medicine", "is.plos.neglect.tropical", "is.plos.one", "is.plos.pathogens")#
  1387. dat.transformed[,vars.factor] = colwise(factor)(dat[,vars.factor])#
  1388. #
  1389. vars.to.transform = !(names(dat.transformed) %in% c(vars.no.transform, vars.factor))#
  1390. dat.transformed[,vars.to.transform] = tr(dat[,vars.to.transform])#
  1391. #
  1392. #
  1393. summary(dat.transformed)#
  1394. #
  1395. #
  1396. #
  1397. #
  1398. library(Hmisc)#
  1399. library(psych)#
  1400. #
  1401. Hmisc::describe(dat, listunique=0)#
  1402. #
  1403. #
  1404. #
  1405. #
  1406. #
  1407. #
  1408. #
  1409. #
  1410. #
  1411. #
  1412. #
  1413. #
  1414. #
  1415. #
  1416. #
  1417. #
  1418. #
  1419. #
  1420. #
  1421. #
  1422. #
  1423. #
  1424. #
  1425. #
  1426. #
  1427. dat.indep.stats = dat.transformed[,vars.to.transform]#
  1428. #
  1429. #
  1430. #
  1431. #
  1432. #
  1433. #
  1434. #
  1435. library(polycor)#
  1436. #
  1437. myhetcorr = hetcor.modified(dat.indep.stats, use="pairwise.complete.obs", std.err=FALSE, pd=FALSE)#
  1438. mycor.unadjusted = myhetcorr$correlations#
  1439. #
  1440. #
  1441. #
  1442. #
  1443. which(is.na(mycor.unadjusted))#
  1444. #
  1445. #
  1446. #
  1447. #
  1448. #
  1449. mycor = adjust.to.positive.definite(mycor.unadjusted)#
  1450. #
  1451. #
  1452. library(gplots)#
  1453. heatmap.2(mycor, col=bluered(32)[17:32], cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  1454. #
  1455. vars.no.transform = c("doi", "publication.date", "days.since.published")#
  1456. #
  1457. vars.factor = c("is.research", "is.plos.clinical.trials", "is.plos.biology",#
  1458. "is.plos.comp.bio", "is.plos.genetics", "is.plos.medicine", "is.plos.neglect.tropical", "is.plos.one", "is.plos.pathogens")#
  1459. dat.transformed[,vars.factor] = colwise(factor)(dat[,vars.factor])#
  1460. #
  1461. vars.to.transform = !(names(dat.transformed) %in% c(vars.no.transform, vars.factor))#
  1462. dat.transformed[,vars.to.transform] = tr(dat[,vars.to.transform])#
  1463. #
  1464. #
  1465. summary(dat.transformed)#
  1466. #
  1467. #
  1468. #
  1469. #
  1470. library(Hmisc)#
  1471. library(psych)#
  1472. #
  1473. Hmisc::describe(dat, listunique=0)#
  1474. #
  1475. #
  1476. #
  1477. #
  1478. #
  1479. #
  1480. #
  1481. #
  1482. #
  1483. #
  1484. #
  1485. #
  1486. #
  1487. #
  1488. #
  1489. #
  1490. #
  1491. #
  1492. #
  1493. #
  1494. #
  1495. #
  1496. #
  1497. #
  1498. #
  1499. dat.indep.stats = dat.transformed[,vars.to.transform]#
  1500. #
  1501. #
  1502. #
  1503. #
  1504. #
  1505. #
  1506. #
  1507. library(polycor)#
  1508. #
  1509. myhetcorr = hetcor.modified(dat.indep.stats, use="pairwise.complete.obs", std.err=FALSE, pd=FALSE)#
  1510. mycor.unadjusted = myhetcorr$correlations#
  1511. #
  1512. #
  1513. #
  1514. #
  1515. which(is.na(mycor.unadjusted))#
  1516. #
  1517. #
  1518. #
  1519. #
  1520. #
  1521. mycor = adjust.to.positive.definite(mycor.unadjusted)#
  1522. #
  1523. #
  1524. library(gplots)#
  1525. heatmap.2(mycor, col=bluered(32)[17:32], cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  1526. library(nFactors)#
  1527. eigenvectors.1st <- eigen(mycor) #
  1528. #
  1529. aparallel.1st <- parallel(subject=nrow(dat.indep.stats), var=ncol(dat.indep.stats), rep=100, cent=.05)#
  1530. scree.results.1st <- nScree(eigenvectors.1st$values, aparallel.1st$eigen$qevpea)#
  1531. summary(scree.results.1st)#
  1532. plotnScree(scree.results.1st)
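# Parallel analysis retains factors while the observed eigenvalue exceeds
# the corresponding quantile eigenvalue from random data of the same
# dimensions; the count below should agree with the nScree output above:
sum(eigenvectors.1st$values > aparallel.1st$eigen$qevpea)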
  1533. number.factors.1st = 5#
  1534. #
  1535. #
  1536. #
  1537. #
  1538. #
  1539. fit.ml = factanal(factors=number.factors.1st, covmat=mycor, rotation="promax", n.obs=nrow(dat.indep.stats))#
  1540. print(fit.ml, sort=TRUE)#
  1541. #
  1542. #
  1543. #
  1544. library(psych)#
  1545. fit.fa.1st = fa(mycor, number.factors.1st, fm="minres", rotate="promax", #
  1546. scores=FALSE, residuals=TRUE, n.obs=max(dim(dat.indep.stats)))#
  1547. #
  1548. #
  1549. print(fit.fa.1st, sort=TRUE)#
  1550. #
  1551. #
  1552. #
  1553. #
  1554. #
  1555. #
  1556. #
  1557. #
  1558. #
  1559. factor.names.1st = c(#
  1560. "MR1"="Ratings",#
  1561. "MR2"="Citations",#
  1562. "MR3"="Views",#
  1563. "MR4"="Bookmarks",#
  1564. "MR5"="Notes")#
  1565. #
  1566. for (afactor in names(factor.names.1st)) {#
  1567. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
  1568. print.thresh(fit.fa.1st$loadings[, afactor], .4, TRUE)#
  1569. print.thresh(fit.fa.1st$loadings[, afactor], -0.4, FALSE)#
  1570. }
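# print.thresh() is from the sourced helpers; it presumably prints the
# loadings beyond a signed threshold. A hypothetical stand-in (an
# assumption, not the author's definition):
print.thresh.sketch = function(loadings, thresh, above=TRUE) {#
    hits = if (above) loadings >= thresh else loadings <= thresh#
    print(round(loadings[hits], 2))#
}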
  1571. number.factors.1st = 6#
  1572. #
  1573. #
  1574. #
  1575. #
  1576. #
  1577. fit.ml = factanal(factors=number.factors.1st, covmat=mycor, rotation="promax", n.obs=nrow(dat.indep.stats))#
  1578. print(fit.ml, sort=TRUE)#
  1579. #
  1580. #
  1581. #
  1582. library(psych)#
  1583. fit.fa.1st = fa(mycor, number.factors.1st, fm="minres", rotate="promax", #
  1584. scores=FALSE, residuals=TRUE, n.obs=max(dim(dat.indep.stats)))#
  1585. #
  1586. #
  1587. print(fit.fa.1st, sort=TRUE)
  1588. number.factors.1st = 5#
  1589. #
  1590. #
  1591. #
  1592. #
  1593. #
  1594. fit.ml = factanal(factors=number.factors.1st, covmat=mycor, rotation="promax", n.obs=nrow(dat.indep.stats))#
  1595. print(fit.ml, sort=TRUE)#
  1596. #
  1597. #
  1598. #
  1599. library(psych)#
  1600. fit.fa.1st = fa(mycor, number.factors.1st, fm="minres", rotate="promax", #
  1601. scores=FALSE, residuals=TRUE, n.obs=max(dim(dat.indep.stats)))#
  1602. #
  1603. #
  1604. print(fit.fa.1st, sort=TRUE)
  1605. fit.fa.1st.cor = fit.fa.1st$r.scores#
  1606. colnames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]#
  1607. rownames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]#
  1608. #
  1609. eigenvectors.2nd <- eigen(fit.fa.1st.cor) #
  1610. aparallel.2nd <- parallel(subject=nrow(fit.fa.1st.cor), var=ncol(fit.fa.1st.cor), rep=100, cent=.05)#
  1611. scree.results.2nd <- nScree(eigenvectors.2nd$values, aparallel.2nd$eigen$qevpea)#
  1612. scree.results.2nd#
  1613. plotnScree(scree.results.2nd)
  1614. number.factors.2nd = 2#
  1615. #
  1616. #
  1617. #
  1618. #
  1619. #
  1620. #
  1621. fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fm="minres", rotate="varimax")#
  1622. print(fit.fa.2nd, sort=TRUE)
  1623. number.factors.2nd = 3#
  1624. #
  1625. #
  1626. #
  1627. #
  1628. #
  1629. #
  1630. fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fm="minres", rotate="varimax")#
  1631. print(fit.fa.2nd, sort=TRUE)
  1632. number.factors.2nd = 1#
  1633. #
  1634. #
  1635. #
  1636. #
  1637. #
  1638. #
  1639. fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fm="minres", rotate="varimax")#
  1640. print(fit.fa.2nd, sort=TRUE)
  1641. number.factors.2nd = 2#
  1642. #
  1643. #
  1644. #
  1645. #
  1646. #
  1647. #
  1648. fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fm="minres", rotate="varimax")#
  1649. print(fit.fa.2nd, sort=TRUE)#
  1650. #
  1651. #
  1652. #
  1653. #
  1654. #
  1655. #
  1656. #
  1657. U = diag(fit.fa.2nd$uniquenesses)   # diagonal matrix of the second-order uniquenesses#
  1658. P = fit.fa.2nd$loadings#
  1659. A = cbind(P, U)#
  1660. Pvf = fit.fa.1st$loadings#
  1661. #
  1662. Pvo = Pvf %*% A#
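# The projection above folds the second-order solution back onto the
# observed variables: A = [P | U] appends the second-order uniquenesses to
# the second-order pattern P, and Pvo = Pvf %*% A gives each variable's
# loading on the second-order factors (first columns) plus residualized
# first-order blocks. For reference, the classical Schmid-Leiman solution
# scales the residual block by the square root of the uniquenesses; a
# sketch of that variant for comparison (not a correction of the above):
A.sl = cbind(fit.fa.2nd$loadings, diag(sqrt(fit.fa.2nd$uniquenesses)))#
Pvo.sl = fit.fa.1st$loadings %*% A.sl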
  1663. #
  1664. #
  1665. #
  1666. #
  1667. #
  1668. #
  1669. #
  1670. #
  1671. factor.names.2nd = c(#
  1672. "MR1"="Views, citations, bookmarks",#
  1673. "MR2"="Notes and ratings")#
  1674. #
  1675. #
  1676. for (afactor in names(factor.names.2nd)) {#
  1677. print(paste(afactor, ": ", factor.names.2nd[afactor], sep="")) #
  1678. print.thresh(Pvo[, afactor], .3, TRUE)#
  1679. print.thresh(Pvo[, afactor], -0.3, FALSE)#
  1680. }#
  1681. #
  1682. #
  1683. for (afactor in names(factor.names.2nd)) {#
  1684. print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))#
  1685. print.thresh(fit.fa.2nd$loadings[, afactor], .3, TRUE)#
  1686. print.thresh(fit.fa.2nd$loadings[, afactor], -0.3, FALSE)#
  1687. }
  1688. #
  1689. #
  1690. #
  1691. #
  1692. dat.impute.input = dat.indep.stats#
  1693. #
  1694. #
  1695. library(mice)#
  1696. #
  1697. #
  1698. #
  1699. mice.output = mice(dat.impute.input, m=1) #
  1700. #
  1701. #
  1702. #
  1703. dat.imputed = complete(mice.output, 1)#
  1704. #
  1705. dat.for.scores = dat.imputed#
  1706. #
  1707. #
  1708. #
  1709. scores.1st = factor.scores.bartlett(dat.for.scores, fit.fa.1st)
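# factor.scores.bartlett() is from the sourced helpers. Bartlett scores
# weight the standardized variables by W = U^-1 P (P' U^-1 P)^-1, with P
# the loading matrix and U the diagonal matrix of uniquenesses. A minimal
# sketch of that formula (an assumption about the helper, not its source):
bartlett.scores.sketch = function(x, fit) {#
    Z = scale(data.matrix(x))#
    P = fit$loadings#
    Uinv = diag(1 / fit$uniquenesses)#
    Z %*% Uinv %*% P %*% solve(t(P) %*% Uinv %*% P)#
}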
  1710. #
  1711. loadings.2nd = Pvo#
  1712. #
  1713. #
  1714. fit.pa.2nd.tovars = list(loadings=loadings.2nd[,colnames(fit.fa.2nd$loadings)])#
  1715. fit.pa.2nd.tovars$uniquenesses = apply(fit.pa.2nd.tovars$loadings^2, 1, sum)   # note: this sum is the communality; if the scoring helper expects true uniquenesses, 1 minus this sum may be intended#
  1716. #
  1717. scores.to.dat.2nd = factor.scores.bartlett(dat.for.scores, fit.pa.2nd.tovars)
  1718. load("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats/dat_aim3.RData")#
  1719. #
  1720. dat.plos = cbind(dat, scores.1st)#
  1721. #
  1722. dat.merge = merge(dat.plos, dat.aim3, by="pmid")#
  1723. #
  1724. dat.regress = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int)#
  1725. score.names = dimnames(scores.1st)[[2]]#
  1726. dat.regress[,score.names] = dat.merge[,names(dat.merge) %in% score.names]#
  1727. #
  1728. vars.indep = c("days.since.published", #
  1729. "num.authors.tr",#
  1730. "first.author.num.prev.pubs.tr", #
  1731. "first.author.num.prev.pmc.cites.tr",#
  1732. "last.author.num.prev.pubs.tr",#
  1733. "last.author.num.prev.pmc.cites.tr",#
  1734. "num.grant.numbers.tr",#
  1735. "pubmed.is.humans",#
  1736. "pubmed.is.cancer",#
  1737. "country.usa",#
  1738. "institution.rank",#
  1739. "nih.sum.sum.dollars.tr")#
  1740. #
  1741. dat.regress[,vars.indep] = dat.merge[,vars.indep]#
  1742. #
  1743. #
  1744. #
  1745. plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
  1746. boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress, main="MR2")
  1747. for (afactor in names(factor.names.1st)) {#
  1748. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
  1749. quartz()#
  1750. boxplot(dat.regress[,afactor] ~ dat.regress$dataset.in.geo.or.ae.int)#
  1751. title(factor.names.1st[afactor])#
  1752. }
  1753. for (afactor in names(factor.names.1st)) {#
  1754. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
  1755. quartz()#
  1756. plsmo(-1*dat.regress$days.since.published, dat.regress[afactor], ylab=factor.names.1st[afactor])#
  1757. title(factor.names.1st[afactor])#
  1758. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
  1759. matpoints(-1*dat.merge$days.since.published, as.matrix(dat.merge[,names(dat.merge) %in% loaded]))  # plot() has no add= argument; matpoints() overlays the loaded variables#
  1760. }
  1761. dim(dat.regress)
  1762. dim(dat.merge)
  1763. names(dat.merge)
  1764. afactor
  1765. print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
  1766. quartz()#
  1767. plsmo(-1*dat.regress$days.since.published, dat.regress[afactor], ylab=factor.names.1st[afactor], datadensity=TRUE)#
  1768. title(factor.names.1st[afactor])#
  1769. loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
  1770. matpoints(-1*dat.merge$days.since.published, as.matrix(dat.merge[,names(dat.merge) %in% loaded]))  # plot() has no add= argument; matpoints() overlays the loaded variables
  1771. names(dat.regress)
  1772. names(scores.1st)
  1773. dim(scores.1st)
  1774. dim(dat.plos)
  1775. dat.scores.merge = cbind(dat.plos, scores.1st)
  1776. dim(dat.scores.merge)