/.Rhistory
#! | 1776 lines | 1776 code | 0 blank | 0 comment | 0 complexity | a1e04d14a7b24fc5ae481dae58dfe9c3 MD5 | raw file
Possible License(s): MIT
Large files are truncated, but you can click here to view the full file
- library(rmeta)
- #
- #
# Remove every newline character from a string (vectorized over `str`).
strip_newlines <- function(str) {
  gsub("\n", "", str)
}
- #
# Begin a fresh results paragraph: format `str` (sprintf-style when
# extra arguments are supplied), overwrite the global `results` buffer
# with it, and append it to the global `all_results` transcript.
start_results <- function(str, ...) {
  chunk <- get_formatted_string(str, ...)
  results <<- chunk
  all_results <<- paste(all_results, chunk, sep = "")
}
- #
# sprintf-style formatting helper: when extra arguments are supplied the
# string is treated as an sprintf template; otherwise it is returned
# unchanged (so literal `%` in plain text is safe).
# Bug fix: the original was missing the opening `{` after the function
# signature, which made the whole file fail to parse.
get_formatted_string = function(str, ...) {
  if (length(list(...)) > 0) {
    formatted_string = sprintf(str, ...)
  } else {
    formatted_string = str
  }
  return(formatted_string)
}
- #
# Append formatted text to the current paragraph: format `str`
# (sprintf-style when extra arguments are supplied) and concatenate it
# onto both the global `results` buffer and the global `all_results`
# transcript.
append_to_results <- function(str, ...) {
  chunk <- get_formatted_string(str, ...)
  results <<- paste(results, chunk, sep = "")
  all_results <<- paste(all_results, chunk, sep = "")
}
- #
# Count the distinct (non-NA) values in `column`, i.e. the size of its
# frequency table.
count_unique_items <- function(column) {
  dim(table(column))
}
- #
# Count the elements of `column` that equal TRUE.  NA elements make the
# comparison NA and therefore propagate into the sum, matching the
# original behavior.
count_true_items <- function(column) {
  sum(column == TRUE)
}
- #
# Count the elements of `column` that are not TRUE.
# NOTE(review): `column != TRUE` yields NA for NA elements, so any NA in
# `column` makes the whole count NA — presumably the inputs here are
# complete logical columns; confirm before relying on this with NAs.
count_false_items <- function(column) {
  sum(column != TRUE)
}
- #
- #
- #
- all_results = ""#
- start_results("\nOchsner et al.{Ochsner} identified ")#
- #
- dat.read = read.csv("rawdata.csv", header=TRUE, sep=",")#
- number_of_data_producing_publications_found_by_ochsner = #
- count_unique_items(dat.read$pmid)#
- append_to_results(number_of_data_producing_publications_found_by_ochsner) #
- append_to_results(" studies that generated gene expression microarray data.")#
- results#
- #
- start_results(" Ochsner's search found ")#
- number_of_datasets_found_by_ochsner = #
- count_true_items(dat.read$ochsner_found_any_data)#
- append_to_results(number_of_datasets_found_by_ochsner)#
- append_to_results(" publicly-available datasets, ")#
- append_to_results("representing ")#
- number_of_data_sharing_publications_found_by_ochsner = #
- count_unique_items(dat.read$pmid[dat.read$ochsner_found_any_data == TRUE])#
- append_to_results(number_of_data_sharing_publications_found_by_ochsner)#
- append_to_results(" distinct studies")#
- append_to_results(" (or %.1f%% of all %d data-producing studies; some studies have more than one associated dataset).", #
- 100*number_of_data_sharing_publications_found_by_ochsner / number_of_data_producing_publications_found_by_ochsner, #
- number_of_data_producing_publications_found_by_ochsner)#
- results#
- #
# --- Paragraph: where the manually-located datasets were found ---
# Bug fix: the original divided every percentage by
# `number_of_data_producing_publications_found_by_ochsner_that_shared_data`,
# a variable never defined anywhere in this transcript (the run would
# stop with "object not found").  The sentence reads "Of the datasets,
# ...", so the denominator used here is the total number of datasets
# located by the manual search (`number_of_datasets_found_by_ochsner`,
# computed above) -- TODO confirm against the published figures.
start_results(" Of the datasets, ")
number_of_datasets_found_in_geo_by_ochsner = count_true_items(dat.read$ochsner_found_geo_data)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_in_geo_by_ochsner,
    100*number_of_datasets_found_in_geo_by_ochsner / number_of_datasets_found_by_ochsner)
append_to_results(" were found in the Gene Expression Omnibus (GEO) database, ")
number_of_datasets_found_in_arrayexpress_by_ochsner = count_true_items(dat.read$ochsner_found_arrayexpress_data)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_in_arrayexpress_by_ochsner,
    100*number_of_datasets_found_in_arrayexpress_by_ochsner / number_of_datasets_found_by_ochsner)
append_to_results(" in ArrayExpress, ")
number_of_datasets_found_on_journal_sites_by_ochsner = count_true_items(dat.read$ochsner_journal)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_on_journal_sites_by_ochsner,
    100*number_of_datasets_found_on_journal_sites_by_ochsner / number_of_datasets_found_by_ochsner)
append_to_results(" hosted on journal websites, and ")
number_of_datasets_found_elsewhere_by_ochsner = count_true_items(dat.read$ochsner_other)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_elsewhere_by_ochsner,
    100*number_of_datasets_found_elsewhere_by_ochsner / number_of_datasets_found_by_ochsner)
append_to_results(" on lab websites and smaller online data repositories ")
append_to_results("(some datasets were found in more than one location).")
results
- #
# --- Paragraph: combined GEO + ArrayExpress coverage of manual search ---
# Bug fix: same as the location paragraph above -- the original divided
# by an undefined variable
# (`number_of_data_producing_publications_found_by_ochsner_that_shared_data`);
# the denominator here is taken to be the total number of manually-found
# datasets (`number_of_datasets_found_by_ochsner`) -- TODO confirm.
start_results(" Combined, GEO and ArrayExpress housed ")
number_of_datasets_found_in_geo_or_arrayexpress_by_ochsner =
    count_true_items(dat.read$ochsner_found_geo_data | dat.read$ochsner_found_arrayexpress_data)
append_to_results("%d (%.1f%%)",
    number_of_datasets_found_in_geo_or_arrayexpress_by_ochsner,
    100*number_of_datasets_found_in_geo_or_arrayexpress_by_ochsner / number_of_datasets_found_by_ochsner)
append_to_results(" of the manually-located datasets.")
results
- #
- start_results("\nNext, we ran automated queries. We queried the GEO and ArrayExpress databases with the PubMed IDs of the ")#
- append_to_results(number_of_data_producing_publications_found_by_ochsner)#
- append_to_results(" data-producing studies.")#
- results#
- #
- #
- start_results(" The automated queries returned ")#
- number_of_oschnser_pmid_link_to_geo_or_arrayexpress = count_true_items(dat.read$centralized_link_to_ochsner_pmid & (dat.read$in_geo | dat.read$in_arrayexpress))#
- append_to_results(number_of_oschnser_pmid_link_to_geo_or_arrayexpress)#
- append_to_results(" datasets in total, with ")#
- number_of_oschnser_pmid_link_to_geo = count_true_items(dat.read$centralized_link_to_ochsner_pmid & dat.read$in_geo)#
- append_to_results(number_of_oschnser_pmid_link_to_geo)#
- append_to_results(" datasets from GEO and ")#
- number_of_oschnser_pmid_link_to_arrayexpress = count_true_items(dat.read$centralized_link_to_ochsner_pmid & dat.read$in_arrayexpress)#
- append_to_results(number_of_oschnser_pmid_link_to_arrayexpress)#
- append_to_results(" datasets from ArrayExpress, including ")#
- number_of_oschnser_pmid_link_to_geo_AND_arrayexpress = count_true_items(dat.read$centralized_link_to_ochsner_pmid & dat.read$in_geo & dat.read$in_arrayexpress)#
- append_to_results(number_of_oschnser_pmid_link_to_geo_AND_arrayexpress)#
- append_to_results(" datasets in both databases (ArrayExpress has a policy of importing selected GEO submissions).")#
- results#
- #
- start_results("\nWe compared the two retrieval sets: the automated search found ")#
- number_datasets_found_by_automated_not_ochsner_geo_arrayexpress = #
- count_true_items(dat.read$centralized_link_to_ochsner_pmid & !dat.read$ochsner_found_any_data)#
- append_to_results(number_datasets_found_by_automated_not_ochsner_geo_arrayexpress)#
- append_to_results(" datasets overlooked by the manual search, ")#
- append_to_results("while the manual search found ")#
- number_datasets_found_by_ochsner_geo_arrayexpress_not_automated = #
- count_true_items((dat.read$ochsner_found_geo_data | dat.read$ochsner_found_arrayexpress_data) & !dat.read$centralized_link_to_ochsner_pmid)#
- append_to_results(number_datasets_found_by_ochsner_geo_arrayexpress_not_automated) #
- append_to_results(" datasets in GEO and ArrayExpress that were not found automatically")#
- append_to_results(" (plus ")#
- number_datasets_ochsner_not_geo_arrayexpress_or_automated = #
- count_true_items((dat.read$ochsner_journal | dat.read$ochsner_other) & !dat.read$centralized_link_to_ochsner_pmid)#
- append_to_results(number_datasets_ochsner_not_geo_arrayexpress_or_automated) #
- append_to_results(" datasets in other internet locations, as described above).")#
- results#
- #
- start_results(" A union of the manual and automated searches includes ")#
- number_of_datasets_found_via_manual_or_automated = count_true_items(dat.read$data_available)#
- append_to_results(number_of_datasets_found_via_manual_or_automated)#
- append_to_results(" distinct datasets")#
- append_to_results("; we consider this our reference set of 'publicly-available datasets' in the analysis below.")#
- #
- results#
- #
- start_results("Figure 1 shows the relative recall of the search strategies.")#
- append_to_results(" As illustrated in the first bar of Figure A, the manual search retrieved ")#
- append_to_results("%.1f%%", #
- 100*number_of_datasets_found_by_ochsner / number_of_datasets_found_via_manual_or_automated) #
- append_to_results(" of the publicly-available datasets.")#
- #
- append_to_results(" The second bar highlights the datasets available in either GEO or ArrayExpress: ")#
- number_datasets_found_geo_arrayexpress_union = #
- count_true_items(dat.read$ochsner_found_geo_data | dat.read$ochsner_found_arrayexpress_data | dat.read$centralized_link_to_ochsner_pmid)#
- append_to_results("%.1f%%", #
- 100*number_datasets_found_geo_arrayexpress_union / number_of_datasets_found_via_manual_or_automated) #
- append_to_results(" of the total.")#
- #
- append_to_results(" Automated searches retrieve ")#
- append_to_results("%.1f%%", #
- 100*number_of_oschnser_pmid_link_to_geo_or_arrayexpress / number_datasets_found_geo_arrayexpress_union) #
- append_to_results(" of the datasets housed in GEO and/or ArrayExpress, or ")#
- append_to_results("%.1f%%", #
- 100*number_of_oschnser_pmid_link_to_geo_or_arrayexpress / number_of_datasets_found_via_manual_or_automated) #
- append_to_results(" of datasets housed anywhere.")#
- #
- append_to_results(" Finally, the last two bars depict on the databases individually. Automated searches of just GEO and just ArrayExpress retrieve ")#
- append_to_results("%.1f%%", #
- 100*number_of_oschnser_pmid_link_to_geo / number_of_datasets_found_via_manual_or_automated) #
- append_to_results(" and ")#
- append_to_results("%.1f%%", #
- 100*number_of_oschnser_pmid_link_to_arrayexpress / number_of_datasets_found_via_manual_or_automated) #
- append_to_results(" of all available datasets, respectively.")#
- results#
- #
- start_results("\n\n\n<<TABLE 1 goes here!>>\n\n\n")#
- cat(all_results)#
- #
- #
- start_results("Next, we looked at univariate patterns to see if the datasets retrieved automatically ")#
- append_to_results("differed from those that were retrieved manually. ")#
- append_to_results(strip_newlines("#
- The odds that a dataset was about cancer, performed on an Affymetrix platform, #
- involved humans, or involved cultured cells were not significantly associated #
- with whether the dataset was retrievable through automated mechanisms (p>0.3). #
- These odds ratios and their confidence intervals are depicted in Figure 2. #
- Similarly, in ANOVA analysis, the species was not significantly different #
- between datasets found automatically and those not found automatically (p=0.7).#
- "))#
- #
- #
- #
- library(Hmisc,T)#
- library(Design,T)#
- library(sciplot)#
- library(rmeta)#
- #
- dat.raw = dat.read[which(dat.read$data_available == '1'),]#
- dim(dat.raw)#
- names(dat.raw) = gsub("_", ".", names(dat.raw))#
- names(dat.raw) = gsub(" ", ".", names(dat.raw))#
- names(dat.raw)#
- #
- dat = dat.raw#
- #
- #
- #
- dat$all = dat$pmid > ""#
- dat$in.arrayexpress.or.geo = dat$in.arrayexpress | dat$in.geo#
- dat$centralized.arrayexpress = dat$in.arrayexpress & dat$centralized.link.to.ochsner.pmid#
- dat$centralized.geo = dat$in.geo & dat$centralized.link.to.ochsner.pmid#
- dat$centralized.arrayexpress.or.geo = dat$in.arrayexpress.or.geo & dat$centralized.link.to.ochsner.pmid#
- #
- #
- levels(dat$species)[table(dat$species) < .03*length(dat$species)] = "Other"#
- #
- #
- affy = grep("Affymetrix", levels(dat$array.design))#
- levels(dat$array.design)[-affy] = "Other"#
- affy = grep("Affymetrix", levels(dat$array.design))#
- levels(dat$array.design)[affy] = "Affymetrix"#
- #
- #
- #
- fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$array.design))#
- fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$pubmed.is.cancer))#
- fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$pubmed.is.cultured.cells))#
- fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$pubmed.is.humans))#
- fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$species))#
- #
- #
- anova(lrm(species ~ centralized.link.to.ochsner.pmid, dat=dat))#
- #
- results#
- #
- #
- #
- #
- start_results("\n")#
- #
- wilcox.test(dat$pubmed.number.times.cited.in.pmc ~ dat$centralized.link.to.ochsner.pmid)#
- wilcox.test(dat$number.samples ~ dat$centralized.link.to.ochsner.pmid)#
- #
- append_to_results(strip_newlines("We did find some differences. "))#
- wilcox.test(dat$impact.factor ~ dat$centralized.link.to.ochsner.pmid)#
- #
- #
- append_to_results(strip_newlines("Histograms of the distribution of #
- citations (Figure 3), impact factors (Figure 4), and sample size (Figure 5) #
- illustrate the different distributions for studies with database citation links #
- to those without. Searching centralized links in GEO and ArrayExpress can find "))#
- table_impact_factor = table(dat$centralized.link.to.ochsner.pmid, cut(dat$impact.factor, c(0, 10, 20, 100)))#
- prop_table_impact_factor = prop.table(table_impact_factor, 2)#
- append_to_results("%.1f%%", 100*prop_table_impact_factor[2, 3])#
- append_to_results(" of datasets associated with articles published in journals with impact factors greater than 20, but only ")#
- append_to_results("%.1f%%", 100*prop_table_impact_factor[2, 2])#
- append_to_results(" of those with impact factors from 10-20, and only ")#
- append_to_results("%.1f%%", 100*prop_table_impact_factor[2, 1])#
- append_to_results(" of the datasets for papers published with impact factors less than 10.")#
- results#
- #
- start_results("\n")#
- append_to_results("Similarly, our automated search found ")#
- table_citations = table(dat$centralized.link.to.ochsner.pmid, cut(dat$pubmed.number.times.cited.in.pmc, c(-1, 0, 5, 10000)))#
- prop_table_citations = prop.table(table_citations, 2)#
- prop_table_citations#
- append_to_results("%.1f%%", 100*prop_table_citations[2, 3])#
- append_to_results(" of datasets whose associated articles had more than 5 citations, ")#
- append_to_results("%.1f%%", 100*prop_table_citations[2, 2])#
- append_to_results(" with 0-5 citations, and only ")#
- append_to_results("%.1f%%", 100*prop_table_citations[2, 1])#
- append_to_results(" with 0 citations.")#
- #
- append_to_results("\nOur automated queries retrieved ")#
- table_sample_size = table(dat$centralized.link.to.ochsner.pmid, cut(dat$number.samples, c(0, 10, 100, 10000)))#
- prop_table_sample_size = prop.table(table_sample_size, 2)#
- prop_table_sample_size#
- append_to_results("%.1f%%", 100*prop_table_sample_size[2, 3])#
- append_to_results(" of datasets with more than 100 datapoints, ")#
- append_to_results("%.1f%%", 100*prop_table_sample_size[2, 2])#
- append_to_results(" with between 10 and 100, and ")#
- append_to_results("%.1f%%", 100*prop_table_sample_size[2, 1]) #
- append_to_results(" of datasets with fewer than 10 datapoints.")#
- results#
- #
- start_results("\n")#
- append_to_results(strip_newlines("A journal policy requiring an accession number in published articles #
- may increase the number of citation links in centralized database ("))#
- table_journal_policy = table(dat$centralized.link.to.ochsner.pmid, dat$journal.policy.requires)#
- prop_table_journal_policy = prop.table(table_journal_policy, 2)#
- append_to_results("%.1f%%", 100*prop_table_journal_policy[2, 2])#
- append_to_results(strip_newlines(" of datasets can be found through citation links for journals that #
- require an accession number, vs. "))#
- append_to_results("%.1f%%", 100*prop_table_journal_policy[2, 1])#
- append_to_results(strip_newlines(" of datasets in other journals), but this #
- difference was not statistically significant. "))#
- #
- #
- fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$journal.policy.requires))#
- results#
- #
- start_results("\n")#
- append_to_results(strip_newlines("Multivariate regressions did not reveal any significant relationships between #
- dataset retrievability and impact factor, number of citations, sample size, #
- or journal policy."))#
- #
- #
- f = lrm(formula = centralized.link.to.ochsner.pmid #
- ~ rcs(log(impact.factor), 4) #
- + journal.policy.requires #
- #
- #
- , dat=dat, x=T, y=T)#
- f#
- anova(f)#
- #
- results#
- cat(all_results)#
- #
- #
- #
- #
- #
- #
- #
# Overlay the next plot on top of the current graphics device contents.
plot_on_same_graph <- function() {
  par(new = TRUE)
}
- #
# Start the next plot on a fresh frame (restore the default behavior).
plot_on_new_graph <- function() {
  par(new = FALSE)
}
- #
# Draw a single mean +/- CI bar (sciplot::bargraph.CI) for the logical or
# numeric vector `x`, placed in slot `bar_number` of `max_number_bars`
# slots, overlaid on the current plot.  The y axis is fixed to [0, 1],
# which suits proportions of a logical vector.
plot_ci_bar = function(x, bar_number=1, max_number_bars=1, bar_name="", col="black") {
  slot_position = bar_number * 2 - 1   # every bar gets one slot of padding
  total_slots = max_number_bars * 2
  single_group = rep(TRUE, length(x))  # one factor level -> one bar
  plot_on_same_graph()
  bargraph.CI(x.factor = single_group,
              response = x,
              xlim = c(0, total_slots),
              ylim = c(0, 1),
              names.arg = c(bar_name),
              col = col,
              space = slot_position)
}
- #
- greys = grey(0:8 / 8)#
- light_grey = greys[8]#
- medium_grey = greys[4]#
- dark_grey = greys[2]#
- #
- plot_on_new_graph()#
- #
- plot_ci_bar(dat$all, 1, 4, "", light_grey)#
- plot_ci_bar(dat$all, 2, 4, "", light_grey)#
- plot_ci_bar(dat$all, 3, 4, "", light_grey)#
- #
- plot_ci_bar(dat$in.arrayexpress.or.geo, 1, 4, "GEO and/or\nArrayExpress", medium_grey)#
- plot_ci_bar(dat$in.geo, 2, 4, "GEO", medium_grey)#
- plot_ci_bar(dat$in.arrayexpress, 3, 4, "ArrayExpress", medium_grey)#
- #
- #
- #
- plot_ci_bar(dat$centralized.arrayexpress.or.geo, 1, 4, "", dark_grey)#
- plot_ci_bar(dat$centralized.geo, 2, 4, "", dark_grey)#
- plot_ci_bar(dat$centralized.arrayexpress, 3, 4, "", dark_grey)#
- #
- #
- #
- legend("right", c("\nDatasets not at this location\n", "\nDatasets found only\nby Ochsner search\n", "Datasets found by\nsearch of databases\nfor PubMed IDs"), #
- fill=c(light_grey, medium_grey, dark_grey), bty='n')#
- #
- title(main=NULL, ylab="Proportion of article-related publicly-available datasets")#
- #
- #
- #
- #
- #
- #
- #
- #
- #
- #
# Back-to-back histogram of `histdata` split by the binary `splitdata`,
# drawn with Hmisc::histbackback.  When log=="y" the data are
# log-transformed and the break labels are back-transformed with exp().
# Returns the histbackback() result of the final draw.
hist_back_back_wrapper = function(histdata, splitdata, breaks=NULL,
                                  title="Histogram", ylabels="", xlabels="",
                                  log="n", sigfigs=0) {
  values = histdata
  if (log == "y") {
    values = log(histdata)
  }
  groups = split(values, splitdata)

  # First pass just to learn the largest bar, so the final plot can get
  # symmetric x-limits with 10% headroom.
  first_pass = histbackback(groups, probability=TRUE, brks=breaks)
  print(first_pass)
  biggest_bar = max(c(first_pass$left, first_pass$right))
  x_limit = biggest_bar * 1.1
  print(x_limit)

  # Second pass: the real plot, without default axes.
  out = histbackback(groups, probability=TRUE,
                     main = title, brks=breaks,
                     xlim=c(-x_limit, x_limit),
                     xlab=xlabels,
                     axes=FALSE)
  print(out)

  if (log == "y") {
    breaklabels = round(exp(out$breaks), sigfigs)   # back-transform labels
  } else {
    breaklabels = out$breaks
  }
  title(ylab=ylabels, xlab=xlabels)
  xpoints = c(-x_limit/2, 0, x_limit/2)
  print(xpoints)
  axis(2, at=0:(length(breaklabels)-1), labels=breaklabels)
  axis(1, at=xpoints, labels=round(abs(xpoints)/sum(out$left), 2))

  # Re-draw the two sides with the transcript's grey palette
  # (medium_grey / light_grey are globals defined elsewhere in this file).
  barplot(-out$left, col=medium_grey, horiz=TRUE, space=0, add=TRUE, axes=FALSE)
  barplot(out$right, col=light_grey, horiz=TRUE, space=0, add=TRUE, axes=FALSE)
  return(out)
}
- #
# Two stacked histograms of `histdata`, split by the binary factor
# `splitdata` (level "1" on top, level "0" below), sharing one set of
# breaks.  When use_log=="y" the data are log-transformed and the x-axis
# labels are back-transformed with exp().  Bar color comes from the
# global `medium_grey` defined elsewhere in this transcript.
hist_above_wrapper = function(histdata, splitdata, breaks=NULL,
                              title=c("", ""), ylabels="", xlabels="",
                              use_log="n", sigfigs=0) {
    plotdata = histdata
    if (use_log=="y") {
        plotdata = log(histdata)
    }
    sp = split(plotdata, splitdata)
    # Bug fix: the `breaks` argument was accepted but never forwarded to
    # hist(), so caller-supplied break points (see the quartz() calls in
    # this transcript) were silently ignored.  NOTE(review): callers must
    # supply breaks that span the (possibly log-transformed) data, or
    # hist() will error; confirm the seq(...) ranges used by the callers.
    if (is.null(breaks)) {
        h = hist(plotdata, freq=FALSE, axes=FALSE)
    } else {
        h = hist(plotdata, breaks=breaks, freq=FALSE, axes=FALSE)
    }
    print(h)

    par(mfrow=c(2, 1))   # two panels, one above the other
    hist_min = 0.9*min(plotdata)
    hist_max = 1.1*max(plotdata)
    h_1 = hist(sp$`1`, breaks=h$breaks, main=title[1], col=medium_grey, xlim=c(hist_min, hist_max), axes=FALSE, xlab="", ylab="")

    if (use_log=="y") {
        breaklabels = round(exp(h$breaks), sigfigs)   # back-transform labels
    } else {
        breaklabels = round(h$breaks, sigfigs)
    }
    print(breaklabels)
    title(ylab=ylabels)
    axis(1, at=h$breaks, labels=breaklabels)
    # y axis shows 0 and the proportion in the tallest bin
    freq_label = c(0, round(max(h_1$counts)/sum(h_1$counts), 2))
    axis(2, at=c(0, max(h_1$counts)), labels=freq_label)

    h_0 = hist(sp$`0`, breaks=h$breaks, main=title[2], col=medium_grey, xlim=c(hist_min, hist_max), axes=FALSE, xlab="", ylab="")
    title(ylab=ylabels, xlab=xlabels)
    axis(1, at=h$breaks, labels=breaklabels)
    freq_label = c(0, round(max(h_0$counts)/sum(h_0$counts), 2))
    axis(2, at=c(0, max(h_0$counts)), labels=freq_label)
}
- #
- quartz()#
- hist_above_wrapper(dat$impact.factor, #
- dat$centralized.link.to.ochsner.pmid, #
- seq(0.5, 4, by=0.25), #
- c(paste("Histogram of Impact Factor for datasets\nFound by query of databases for PubMed IDs (n=",count_true_items(dat$centralized.link.to.ochsner.pmid),")", sep=""), #
- paste("Not found by query of databases (n=",count_false_items(dat$centralized.link.to.ochsner.pmid),")", sep="")), #
- "Proportion of datasets",#
- "Impact factor (log scale)", #
- use_log="y", sigfigs=0)#
- #
- #
- #
- quartz()#
- hist_above_wrapper(dat$pubmed.number.times.cited.in.pmc+.5, #
- dat$centralized.link.to.ochsner.pmid, #
- NULL, #
- c(paste("Histogram of Received Citations for datasets\nFound by query of databases for PubMed IDs (n=",count_true_items(dat$centralized.link.to.ochsner.pmid),")", sep=""), #
- paste("Not found by query of databases (n=",count_false_items(dat$centralized.link.to.ochsner.pmid),")", sep="")), #
- "Proportion of datasets",#
- "Number of citations by articles in PubMed Central (log scale)",#
- use_log="y", sigfigs=0)#
- #
- quartz()#
- hist_above_wrapper(dat$number.samples[!is.na(dat$number.samples)], #
- dat$centralized.link.to.ochsner.pmid, #
- seq(0, 8, by=1), #
- c(paste("Histogram of Dataset Size for datasets\nFound by query of databases for PubMed IDs (n=",count_true_items(dat$centralized.link.to.ochsner.pmid[!is.na(dat$number.samples)]),")", sep=""), #
- paste("Not found by query of databases (n=",count_false_items(dat$centralized.link.to.ochsner.pmid[!is.na(dat$number.samples)]),")", sep="")), #
- "Proportion of datasets",#
- "Number of samples in microarray study (log scale)",#
- use_log="y", sigfigs=0)
- #
- #
- #
- Sys.setenv( PATH=paste(Sys.getenv("PATH"),"/usr/texbin",sep=":") ) #
- #
- #
- library(Rserve)#
- Rserve(args="--no-save")#
- #
- library(plyr)#
- #
- setwd("/Users/hpiwowar/Documents/Projects/PLoS ArticleImpact")#
- source("aim3_functions_20100215.R")
- dat.raw = read.csv("PLoS_Metrics.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
- dat = data.frame(doi = dat.raw$DOI)#
- #
- #
- dat$days.since.aug.2009 = dat.raw$Days.Since.Aug.2009#
- dat$pmid = dat.raw$PMID#
- #
- #
- #
- dat$is.plos.biology = ifelse(dat.raw$Journal == "PLoS Biology", 1, 0)#
- dat$is.plos.clinical.trials = ifelse(dat.raw$Journal == "PLoS Clinical Trials", 1, 0)#
- dat$is.plos.comp.bio = ifelse(dat.raw$Journal == "PLoS Computational Biology", 1, 0)#
- dat$is.plos.genetics = ifelse(dat.raw$Journal == "PLoS Genetics", 1, 0)#
- dat$is.plos.medicine = ifelse(dat.raw$Journal == "PLoS Medicine", 1, 0)#
- dat$is.plos.neglect.tropical = ifelse(dat.raw$Journal == "PLoS Neglected Tropical Diseases", 1, 0)#
- dat$is.plos.one = ifelse(dat.raw$Journal == "PLoS ONE", 1, 0)#
- dat$is.plos.pathogens = ifelse(dat.raw$Journal == "PLoS Pathogens", 1, 0)#
- dat$is.research = factor(dat.raw$X.Research.Article..or..non.Research.Article..)#
- #
- dat$num.cites.crossref = dat.raw$Citations...CrossRef#
- dat$num.cites.pmc = dat.raw$Citations...PubMed.Central#
- dat$num.cites.scopus = dat.raw$Citations...Scopus#
- dat$num.page.views.html = dat.raw$Total.HTML.Page.Views#
- dat$num.downloads.pdf = dat.raw$Total.PDF.Downloads#
- dat$num.downloads.xml = dat.raw$Total.XML.Downloads#
- #
- dat$num.blogs.postgenomic = dat.raw$Blog.Coverage...Postgenomic#
- dat$num.blogs.natureblogs = dat.raw$Blog.Coverage...Nature.Blogs#
- dat$num.blogs.bloglines = dat.raw$Blog.Coverage...Bloglines#
- dat$num.trackbacks = dat.raw$Number.of.Trackbacks#
- dat$num.bookmarks.citeulike = dat.raw$Social.Bookmarking...CiteULike#
- dat$num.bookmarks.connotea = dat.raw$Social.Bookmarking...Connotea#
- dat$num.ratings = dat.raw$Number.of.Ratings#
- dat$avg.rating = dat.raw$Average.Rating#
- dat$num.note.threads = dat.raw$Number.of.Note.threads#
- dat$num.note.replies = dat.raw$Number.of.replies.to.Notes#
- dat$num.comment.threads = dat.raw$Number.of.Comment.threads#
- #
- #
- dat$num.ratings.with.comment = dat.raw$Number.of..Star.Ratings..that.also.include.a.text.comment#
- #
- dat.transformed = dat#
- vars.no.transform = c("doi", "pmid", "days.since.aug.2009")#
- #
- vars.factor = c("is.research", "is.plos.clinical.trials", "is.plos.biology",#
- "is.plos.comp.bio", "is.plos.genetics", "is.plos.medicine", "is.plos.neglect.tropical", "is.plos.one", "is.plos.pathogens")#
- dat.transformed[,vars.factor] = colwise(factor)(dat[,vars.factor])#
- #
- vars.to.transform = !(names(dat.transformed) %in% c(vars.no.transform, vars.factor))#
- dat.transformed[,vars.to.transform] = tr(dat[,vars.to.transform])#
- #
- #
- summary(dat.transformed)
- library(Hmisc)#
- library(psych)#
- #
- Hmisc::describe(dat, listunique=0)
- dat.indep.stats = dat.transformed[,vars.to.transform]
- library(polycor)#
- #
- myhetcorr = hetcor.modified(dat.indep.stats, use="pairwise.complete.obs", std.err=FALSE, pd=FALSE)#
- mycor.unadjusted = myhetcorr$correlations#
- #
- #
- #
- #
- which(is.na(mycor.unadjusted))#
- #
- #
- #
- #
- #
- mycor = adjust.to.positive.definite(mycor.unadjusted)#
- #
- #
- library(gplots)#
- heatmap.2(mycor, col=bluered(16), cexRow=0.5, cexCol = .8, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(15,15), key=FALSE, keysize=0.1)
- mycor
- legend()
- bluered(16)
- blue(16)
- grey(16)
- greys(16)
- ?bluered
- showpanel(bluered(64))
- library(gplots)
- showpanel(bluered(64))
# Display a color palette `col` as a horizontal strip with no axes.
showpanel <- function(col) {
  image(z = matrix(1:100, ncol = 1), col = col, xaxt = "n", yaxt = "n")
}
- showpanel(bluered(64))
- heatmap.2(mycor, col=bluered(16), cexRow=0.5, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
- heatmap.2(mycor, col=bluered(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
- topo
- topocolors
- topo()
- ?colors
- heat.colors
- heatmap.2(mycor, col=heat.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
- heatmap.2(mycor, col=topo.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
- heatmap.2(mycor, col=terrain.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
- heatmap.2(mycor, col=cm.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
- cm.colors
- ?cm.colors
- showpanel(cm.colors(16))
- library(gplots)#
- heatmap.2(mycor, col=cm.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
- cm.colors(16)
- cm.colors(16)[0:8]
- showpanel(cm.colors(16)[0:8])
- showpanel(bluered(16)[0:8])
- showpanel(bluered(32)[0:16])
- showpanel(bluered(32)[17:32])
- heatmap.2(mycor, col=bluered(32)[17:32], cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
- #
- #
- library(nFactors)#
- eigenvectors.1st <- eigen(mycor) #
- #
- aparallel.1st <- parallel(subject=nrow(dat.indep.stats), var=ncol(dat.indep.stats), rep=100, cent=.05)#
- scree.results.1st <- nScree(eigenvectors.1st$values, aparallel.1st$eigen$qevpea)#
- summary(scree.results.1st)#
- plotnScree(scree.results.1st) #
- #
- #
- #
- #
- #
- number.factors.1st = 5
- #
- #
- fit.ml = factanal(dat, number.factors.1st, rotation="promax", covmat=mycor)#
- print(fit.ml, sort=TRUE)#
- #
- #
- #
- library(psych)#
- fit.fa.1st = fa(mycor, number.factors.1st, fm="minres", rotate="promax", #
- scores=FALSE, residuals=TRUE, n.obs=max(dim(dat.indep.stats)))#
- #
- #
- print(fit.fa.1st, sort=TRUE)#
- #
- #
- #
- #
- #
- #
- #
- #
- #
- factor.names.1st = c(#
- "MR1"="Ratings",#
- "MR2"="Citations",#
- "MR3"="Views",#
- "MR4"="Bookmarks",#
- "MR5"="Notes")#
- #
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- print.thresh(fit.fa.1st$loadings[, afactor], .4, TRUE)#
- print.thresh(fit.fa.1st$loadings[, afactor], -0.4, FALSE)#
- }
- #
- #
- fit.fa.1st.cor = fit.fa.1st$r.scores#
- colnames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]#
- rownames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]#
- #
- eigenvectors.2nd <- eigen(fit.fa.1st.cor) #
- aparallel.2nd <- parallel(subject=nrow(fit.fa.1st.cor), var=ncol(fit.fa.1st.cor), rep=100, cent=.05)#
- scree.results.2nd <- nScree(eigenvectors.2nd$values, aparallel.2nd$eigen$qevpea)#
- scree.results.2nd#
- plotnScree(scree.results.2nd) #
- #
- #
number.factors.2nd = 2

# Second-order factor analysis on the first-order factor correlation
# matrix.
# Bug fix: psych::fa() takes the factoring method as `fm=`, not `fa=`;
# the original `fa="minres"` fell into `...` and was ignored (the call
# only worked because "minres" happens to be fa()'s default).  "minres"
# matches the first-order fit earlier in this transcript.
fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fm="minres", rotate="varimax")
print(fit.fa.2nd, sort=TRUE)
- #
- U = fit.fa.2nd$uniquenesses * diag(number.factors.1st)#
- P = fit.fa.2nd$loadings#
- A = cbind(P, U)#
- Pvf = fit.fa.1st$loadings#
- #
- Pvo = Pvf %*% A
- #
- #
- factor.names.2nd = c(#
- "MR1"="Views, bookmarks, citations",#
- "MR2"="Notes and ratings")#
- #
- #
- for (afactor in names(factor.names.2nd)) {#
- print(paste(afactor, ": ", factor.names.2nd[afactor], sep="")) #
- print.thresh(Pvo[, afactor], .3, TRUE)#
- print.thresh(Pvo[, afactor], -0.3, FALSE)#
- }#
- #
- #
- for (afactor in names(factor.names.2nd)) {#
- print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))#
- print.thresh(fit.fa.2nd$loadings[, afactor], .3, TRUE)#
- print.thresh(fit.fa.2nd$loadings[, afactor], -0.3, FALSE)#
- }
- #
- #
- factor.names.2nd = c(#
- "MR1"="Views, citations, bookmarks",#
- "MR2"="Notes and ratings")#
- #
- #
- for (afactor in names(factor.names.2nd)) {#
- print(paste(afactor, ": ", factor.names.2nd[afactor], sep="")) #
- print.thresh(Pvo[, afactor], .3, TRUE)#
- print.thresh(Pvo[, afactor], -0.3, FALSE)#
- }#
- #
- #
- for (afactor in names(factor.names.2nd)) {#
- print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))#
- print.thresh(fit.fa.2nd$loadings[, afactor], .3, TRUE)#
- print.thresh(fit.fa.2nd$loadings[, afactor], -0.3, FALSE)#
- }
- #
- #
- #
- #
- dat.impute.input = dat.indep.stats#
- #
- #
- library(mice)#
- #
- #
- #
- mice.output = mice(dat.impute.input, m=1) #
- #
- #
- #
- dat.imputed = complete(mice.output, 1)#
- #
- dat.for.scores = dat.imputed#
- #
- #
- #
- scores.1st = factor.scores.bartlett(dat.for.scores, fit.fa.1st)
- #
- #
- load("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats/dat_aim3.RData")#
- #
- dat.plos = cbind(dat, scores.1st)#
- #
- dat.merge = merge(dat.plos, dat.aim3, by="pmid")#
- #
- dat.regress = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int)#
- score.names = dimnames(scores.1st)[[2]]#
- dat.regress[,score.names] = dat.merge[,names(dat.merge) %in% score.names]#
- #
- vars.indep = c("days.since.aug.2009", #
- "num.authors.tr",#
- "first.author.num.prev.pubs.tr", #
- "first.author.num.prev.pmc.cites.tr",#
- "last.author.num.prev.pubs.tr",#
- "last.author.num.prev.pmc.cites.tr",#
- "num.grant.numbers.tr",#
- "pubmed.is.humans",#
- "pubmed.is.cancer",#
- "country.usa",#
- "institution.rank",#
- "nih.sum.sum.dollars.tr")#
- #
- dat.regress[,vars.indep] = dat.merge[,vars.indep]#
- #
- #
- #
- plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
- boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
- boxplot(log(MR1) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
- boxplot(log(MR2) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
- boxplot(log(MR3) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
- boxplot(log(MR4) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
- boxplot(log(MR5) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
- boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress, main="MR2")
- boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress, main="MR2")
- plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
- boxplot(log(MR2) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
- #
- #
- #
- #
- #
- library(rms)#
- dd.regress = datadist(dat.regress)#
- options(datadist='dd.regress')#
- options(digits=2)#
- #
- f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
- (rcs(days.since.aug.2009, 3) +#
- rcs(num.authors.tr, 3) ) +#
- rcs(first.author.num.prev.pubs.tr, 3) + #
- rcs(first.author.num.prev.pmc.cites.tr, 3) +#
- rcs(last.author.num.prev.pubs.tr, 3) + #
- rcs(last.author.num.prev.pmc.cites.tr, 3) + #
- as.numeric(pubmed.is.humans) + #
- as.numeric(pubmed.is.cancer) + #
- as.numeric(country.usa) )#
- #
- ,#
- dat=dat.regress, x=T, y=T)#
- anova(f.1st.nonlinear.interactions.reduced)
- f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
- rcs(days.since.aug.2009, 3) + #
- (rcs(num.authors.tr, 3) + #
- rcs(last.author.num.prev.pmc.cites.tr, 3) + #
- as.numeric(pubmed.is.humans) + #
- as.numeric(pubmed.is.cancer) + #
- as.numeric(country.usa) ) )#
- #
- #
- #
- #
- #
- #
- ,#
- dat=dat.regress, x=T, y=T)#
- anova(f.1st.nonlinear.interactions.reduced)
- s = summary(f.1st.nonlinear.interactions.reduced)#
- s#
- plot(s)
- names(dat)
- f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
- (rcs(days.since.aug.2009, 10) +#
- rcs(num.authors.tr, 3) ) +#
- rcs(first.author.num.prev.pubs.tr, 3) + #
- rcs(first.author.num.prev.pmc.cites.tr, 3) +#
- rcs(last.author.num.prev.pubs.tr, 3) + #
- rcs(last.author.num.prev.pmc.cites.tr, 3) + #
- as.numeric(pubmed.is.humans) + #
- as.numeric(pubmed.is.cancer) + #
- as.numeric(country.usa) )#
- #
- ,#
- dat=dat.regress, x=T, y=T)#
- anova(f.1st.nonlinear.interactions.reduced)
- f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
- rcs(days.since.aug.2009, 10) + #
- (rcs(num.authors.tr, 3) + #
- rcs(last.author.num.prev.pmc.cites.tr, 3) + #
- as.numeric(pubmed.is.humans) + #
- as.numeric(pubmed.is.cancer) + #
- as.numeric(country.usa) ) )#
- #
- #
- #
- #
- #
- #
- ,#
- dat=dat.regress, x=T, y=T)#
- anova(f.1st.nonlinear.interactions.reduced)
- s = summary(f.1st.nonlinear.interactions.reduced)#
- s#
- plot(s)
- summ.1st.nonlinear.interactions.reduced = summary(f.1st.nonlinear.interactions.reduced)#
- summ.1st.nonlinear.interactions.reduced.dimnames = dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)]#
- dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.1st[summ.1st.nonlinear.interactions.reduced.dimnames]#
- plot(summ.1st.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)#
- title("Multivariate nonlinear regressions with interactions")
- summ.1st.nonlinear.interactions.reduced = summary(f.1st.nonlinear.interactions.reduced)#
- summ.1st.nonlinear.interactions.reduced.dimnames = dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)]#
- dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.1st[summ.1st.nonlinear.interactions.reduced.dimnames]#
- plot(summ.1st.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)#
- title("Multivariate nonlinear regressions with interactions")
- #
- dat.regress.named = dat.regress#
- names(dat.regress.named) = c("dataset.in.geo.or.ae.int", factor.names.1st[names(dat.regress)[-1]])#
- #
- dots.1st.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,#
- dat=dat.regress.named)#
- plot(dots.1st.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7, #
- xlab="Percentage of studies with datasets in GEO or ArrayExpress", #
- main="Univariate data sharing behaviour on first order factors")
- #
- #
- loadings.2nd = Pvo#
- #
- #
- fit.pa.2nd.tovars = list(loadings=loadings.2nd[,colnames(fit.fa.2nd$loadings)])#
- fit.pa.2nd.tovars$uniquenesses = apply(fit.pa.2nd.tovars$loadings^2, 1, sum)#
- #
- scores.to.dat.2nd = factor.scores.bartlett(dat.for.scores, fit.pa.2nd.tovars)#
- #
- dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat$dataset.in.geo.or.ae.int) #
- dat.regress.2nd[,dimnames(scores.to.dat.2nd)[[2]]] = scores.to.dat.2nd#
- #
- library(rms)#
- #
- dd.regress.2nd = datadist(dat.regress.2nd)#
- options(datadist='dd.regress.2nd')#
- options(digits=2)#
- #
- #
- f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
- rcs(MR3, 4) + (rcs(MR1, 4) + rcs(MR4, 4) + rcs(MR5, 4) + rcs(MR2, 4))^2,#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- names(dat.regress.2nd)
- dimnames(scores.to.dat.2nd)[[2]]
- dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat$dataset.in.geo.or.ae.int)
- dat.regress.2nd[,dimnames(scores.to.dat.2nd)[[2]]] = scores.to.dat.2nd
- scores.to.dat.2nd
- dim(dat.retress.2nd)
- dim(dat.regress.2nd)
- dim(dat.regress.2nd)dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat$dataset.in.geo.or.ae.int)
- dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat$dataset.in.geo.or.ae.int)
- dim(dat.regress.2nd)
- ?str(dat.regress.2nd)
- str(dat.regress.2nd)
- names(dat)
- dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat.aim3$dataset.in.geo.or.ae.int) #
- dat.regress.2nd[,dimnames(scores.to.dat.2nd)[[2]]] = scores.to.dat.2nd
- dim(dat.regress.2nd)
- dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int) #
- dat.regress.2nd[,dimnames(scores.to.dat.2nd)[[2]]] = scores.to.dat.2nd
- dim(dat.merge)
- dim(scores.to.dat.2nd)
- dim(scores.to.dat.1st)
- dim(scores.1st)
- dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int) #
- score.names.2nd = dimnames(scores.to.dat.2nd)[[2]]#
- dat.regress.2nd[,score.names.2nd] = dat.merge[,names(dat.merge) %in% score.names.2nd]
- #
- library(rms)#
- #
- dd.regress.2nd = datadist(dat.regress.2nd)#
- options(datadist='dd.regress.2nd')#
- options(digits=2)#
- #
- #
- f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
- rcs(MR3, 4) + (rcs(MR1, 4) + rcs(MR4, 4) + rcs(MR5, 4) + rcs(MR2, 4))^2,#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
- (rcs(MR1, 4) + rcs(MR2, 4))^2,#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ rcs(MR1, 4),#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- names(dat.merge)
- f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
- rcs(days.since.aug.2009, 10) + rcs(MR1, 4),#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- dat.regress.2nd[,days.since.aug.2009] = dat.merge[,days.since.aug.2009]
- dat.regress.2nd[,"days.since.aug.2009"] = dat.merge[,"days.since.aug.2009"]
- f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
- rcs(days.since.aug.2009, 10) + rcs(MR1, 4),#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
- rcs(days.since.aug.2009, 10) + rcs(MR1, 3),#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
- (rcs(days.since.aug.2009, 10) + rcs(MR1, 3))^2,#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
- (rcs(days.since.aug.2009, 10)))#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ dataset.in.geo.or.ae.int +#
- (rcs(days.since.aug.2009, 10)))#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.1st.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
- rcs(days.since.aug.2009, 10))^2, #
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
- rcs(days.since.aug.2009, 10))^2, #
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- #
- f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
- (rcs(MR1, 4) + rcs(MR2, 4))^2,#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- summ.2nd.nonlinear.interactions.reduced = summary(f.2nd.nonlinear.interactions.reduced)#
- summ.2nd.nonlinear.interactions.reduced.dimnames = dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)]#
- dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.2nd[summ.2nd.nonlinear.interactions.reduced.dimnames]#
- plot(summ.2nd.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)#
- title("Multivariate nonlinear regression with interactions")
- dat.regress.2nd[,vars.indep] = dat.merge[,vars.indep]#
- #
- library(rms)#
- #
- dd.regress.2nd = datadist(dat.regress.2nd)#
- options(datadist='dd.regress.2nd')#
- options(digits=2)
- f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
- (rcs(days.since.aug.2009, 10) +#
- rcs(num.authors.tr, 3) ) +#
- rcs(first.author.num.prev.pubs.tr, 3) + #
- rcs(first.author.num.prev.pmc.cites.tr, 3) +#
- rcs(last.author.num.prev.pubs.tr, 3) + #
- rcs(last.author.num.prev.pmc.cites.tr, 3) + #
- as.numeric(pubmed.is.humans) + #
- as.numeric(pubmed.is.cancer) + #
- as.numeric(country.usa) )#
- #
- ,#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
- (rcs(days.since.aug.2009, 10) +#
- rcs(num.authors.tr, 3) ) +#
- rcs(first.author.num.prev.pubs.tr, 3) + #
- rcs(first.author.num.prev.pmc.cites.tr, 3) +#
- rcs(last.author.num.prev.pubs.tr, 3) + #
- rcs(last.author.num.prev.pmc.cites.tr, 3) + #
- as.numeric(pubmed.is.humans) + #
- as.numeric(pubmed.is.cancer) + #
- as.numeric(country.usa) )^2#
- #
- ,#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.2nd.nonlinear.interactions.reduced
- f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
- (rcs(days.since.aug.2009, 10) +#
- rcs(num.authors.tr, 3) ) +#
- rcs(first.author.num.prev.pubs.tr, 3) + #
- rcs(first.author.num.prev.pmc.cites.tr, 3) +#
- rcs(last.author.num.prev.pubs.tr, 3) + #
- rcs(last.author.num.prev.pmc.cites.tr, 3) + #
- as.numeric(pubmed.is.humans) + #
- as.numeric(pubmed.is.cancer) + #
- as.numeric(country.usa) )#
- #
- ,#
- dat=dat.regress.2nd, x=T, y=T)#
- anova(f.2nd.nonlinear.interactions.reduced)
- f.2nd.nonlinear.interactions.reduced
- dim(dat.regress.2nd)
- summ.2nd.nonlinear.interactions.reduced = summary(f.2nd.nonlinear.interactions.reduced)#
- summ.2nd.nonlinear.interactions.reduced.dimnames = dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)]#
- dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.2nd[summ.2nd.nonlinear.interactions.reduced.dimnames]#
- plot(summ.2nd.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)#
- title("Multivariate nonlinear regression with interactions")
- dat.regress.2nd.named = dat.regress.2nd#
- names(dat.regress.2nd.named) = c("dataset.in.geo.or.ae.int", factor.names.2nd[names(dat.regress.2nd)[-1]])#
- #
- dots.2nd.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,#
- dat=dat.regress.2nd.named)#
- plot(dots.2nd.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7, #
- xlab="Percentage of studies with datasets in GEO or ArrayExpress", #
- main="Univariate data sharing behaviour\non second order factors")
- dots.2nd.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,#
- dat=dat.regress.2nd.named)
- dat.regress.2nd.named = dat.regress.2nd#
- names(dat.regress.2nd.named) = c("dataset.in.geo.or.ae.int", factor.names.2nd[names(dat.regress.2nd)[-1]])#
- #
- dots.2nd.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,#
- dat=dat.regress.2nd.named)#
- plot(dots.2nd.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7, #
- xlab="Percentage of studies with datasets in GEO or ArrayExpress", #
- main="Univariate data sharing behaviour\non second order factors")
- ?plsmo
- plsmo(days.since.aug.2009, MR1, dat=dat.regress)
- names(dat.regress)
- plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1)
- par(c(5, 1))#
- plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1, main="MR1")
- ?par
- par(mfcol=c(5, 1))#
- plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1, main="MR1")
- par(mfcol=c(1, 2))#
- plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1, main="MR1")
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- plsmo(dat.regress$days.since.aug.2009, dat.regress[afactor], main=factor.names.1st[afactor])#
- #
- }
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- plsmo(dat.regress$days.since.aug.2009, dat.regress[afactor], main=factor.names.1st[afactor])#
- }
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- plsmo(dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
- title(factor.names.1st[afactor])#
- }
- ?reverse
- ?rev
- max(dat.regress$days.since.aug.2009)
- ?plot
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
- title(factor.names.1st[afactor])#
- }
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
- title(factor.names.1st[afactor])#
- }
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- boxplot(log(afactor) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
- title(factor.names.1st[afactor])#
- }
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- boxplot(log(dat.regress[afactor]) ~ dat.regress$dataset.in.geo.or.ae.int)#
- title(factor.names.1st[afactor])#
- }
- afactor="MR1"
- boxplot(log(dat.regress[afactor]) ~ dat.regress$dataset.in.geo.or.ae.int)
- max(log(dat.regress[afactor]))
- max(log(dat.regress[afactor], na.rm=T))
- summary(dat.regress[afactor])
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- boxplot(at.regress[afactor] ~ dat.regress$dataset.in.geo.or.ae.int)#
- title(factor.names.1st[afactor])#
- }
- #
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- boxplot(dat.regress[afactor] ~ dat.regress$dataset.in.geo.or.ae.int)#
- title(factor.names.1st[afactor])#
- }
- boxplot(dat.regress[afactor] ~ dat.regress$dataset.in.geo.or.ae.int)
- plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
- boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
- boxplot(MR1 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
- boxplot(log(MR1) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
- boxplot(log(dat.regress$MR1) ~ dat.regress$dataset.in.geo.or.ae.int)
- boxplot(log(dat.regress["MR1"]) ~ dat.regress$dataset.in.geo.or.ae.int)
- boxplot(log(dat.regress["MR1",]) ~ dat.regress$dataset.in.geo.or.ae.int)
- boxplot(log(dat.regress[,"MR1"]) ~ dat.regress$dataset.in.geo.or.ae.int)
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- boxplot(dat.regress[,afactor] ~ dat.regress$dataset.in.geo.or.ae.int)#
- title(factor.names.1st[afactor])#
- }
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- boxplot(dat.regress[,afactor] ~ dat.regress$dataset.in.geo.or.ae.int)#
- title(factor.names.1st[afactor])#
- }
- names(dat)
- fit.fa.2nd$loadings[, afactor]
- fit.fa.1st$loadings[, afactor]
- loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))
- loaded
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
- title(factor.names.1st[afactor])#
- loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
- plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[names(dat.regress) %in% loaded], add=T)
- [names(dat.regress) %in% loaded
- names(dat.regress) %in% loaded
- loaded
- names(dat.regress)
- for (afactor in names(factor.names.1st)) {#
- print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
- quartz()#
- plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
- title(factor.names.1st[afactor])#
- loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
- plsmo(-1*dat.merge$days.since.aug.2009, dat.merge[names(dat.marge) %in% loaded], add=T)#
- }
- plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
- title(factor.names.1st[afactor])#
- loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
- plsmo(-1*dat.merge$days.since.aug.2009, dat.merge[names(dat.merge) %in% loaded], add=…
Large files files are truncated, but you can click here to view the full file