PageRenderTime 77ms CodeModel.GetById 16ms app.highlight 50ms RepoModel.GetById 1ms app.codeStats 0ms

/.Rhistory

http://github.com/hpiwowar/alt-metrics_stats
#! | 1776 lines | 1776 code | 0 blank | 0 comment | 0 complexity | a1e04d14a7b24fc5ae481dae58dfe9c3 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1library(rmeta)
   2#
   3#
   4strip_newlines = function(str) {#
   5    return(gsub("\n", "", str))#
   6}#
   7#
   8start_results = function(str, ...) {#
   9    formatted_string = get_formatted_string(str, ...)#
  10    results <<- formatted_string#
  11    all_results <<- paste(all_results, formatted_string, sep="")#
  12}#
  13#
  14get_formatted_string = function(str, ...)#
  15    if (length(list(...)) > 0) {#
  16        formatted_string = sprintf(str, ...)#
  17    } else {#
  18        formatted_string = str#
  19    }#
  20    return(formatted_string)#
  21}#
  22#
  23append_to_results = function(str, ...) {#
  24    formatted_string = get_formatted_string(str, ...)#
  25    results <<- paste(results, formatted_string, sep="")#
  26    all_results <<- paste(all_results, formatted_string, sep="")#
  27}#
  28#
  29count_unique_items = function(column) {#
  30    return(dim(table(column)))#
  31}#
  32#
  33count_true_items = function(column) {#
  34    return(sum(column==TRUE))#
  35}#
  36#
  37count_false_items = function(column) {#
  38    return(sum(column!=TRUE))#
  39}#
  40#
  41#
  42#
  43all_results = ""#
  44start_results("\nOchsner et al.{Ochsner} identified ")#
  45#
  46dat.read = read.csv("rawdata.csv", header=TRUE, sep=",")#
  47number_of_data_producing_publications_found_by_ochsner = #
  48    count_unique_items(dat.read$pmid)#
  49append_to_results(number_of_data_producing_publications_found_by_ochsner) #
  50append_to_results(" studies that generated gene expression microarray data.")#
  51results#
  52#
  53start_results("  Ochsnerâ&#x20AC;&#x2122;s search found ")#
  54number_of_datasets_found_by_ochsner = #
  55    count_true_items(dat.read$ochsner_found_any_data)#
  56append_to_results(number_of_datasets_found_by_ochsner)#
  57append_to_results(" publicly-available datasets, ")#
  58append_to_results("representing ")#
  59number_of_data_sharing_publications_found_by_ochsner = #
  60    count_unique_items(dat.read$pmid[dat.read$ochsner_found_any_data == TRUE])#
  61append_to_results(number_of_data_sharing_publications_found_by_ochsner)#
  62append_to_results(" distinct studies")#
  63append_to_results(" (or %.1f%% of all %d data-producing studies; some studies have more than one associated dataset).", #
  64                100*number_of_data_sharing_publications_found_by_ochsner / number_of_data_producing_publications_found_by_ochsner, #
  65                number_of_data_producing_publications_found_by_ochsner)#
  66results#
  67#
  68start_results("  Of the datasets, ")#
  69number_of_datasets_found_in_geo_by_ochsner = count_true_items(dat.read$ochsner_found_geo_data)#
  70append_to_results("%d (%.1f%%)", #
  71                number_of_datasets_found_in_geo_by_ochsner, #
  72                100*number_of_datasets_found_in_geo_by_ochsner / number_of_data_producing_publications_found_by_ochsner_that_shared_data)#
  73append_to_results(" were found in the Gene Expression Omnibus (GEO) database, ")#
  74number_of_datasets_found_in_arrayexpress_by_ochsner = count_true_items(dat.read$ochsner_found_arrayexpress_data)#
  75append_to_results("%d (%.1f%%)", #
  76                number_of_datasets_found_in_arrayexpress_by_ochsner, #
  77                100*number_of_datasets_found_in_arrayexpress_by_ochsner /  number_of_data_producing_publications_found_by_ochsner_that_shared_data)               #
  78append_to_results(" in ArrayExpress, ")#
  79number_of_datasets_found_on_journal_sites_by_ochsner = count_true_items(dat.read$ochsner_journal)#
  80append_to_results("%d (%.1f%%)", #
  81                number_of_datasets_found_on_journal_sites_by_ochsner, #
  82                100*number_of_datasets_found_on_journal_sites_by_ochsner /  number_of_data_producing_publications_found_by_ochsner_that_shared_data)               #
  83append_to_results(" hosted on journal websites, and ")#
  84number_of_datasets_found_elsewhere_by_ochsner = count_true_items(dat.read$ochsner_other)#
  85append_to_results("%d (%.1f%%)", #
  86                number_of_datasets_found_elsewhere_by_ochsner, #
  87                100*number_of_datasets_found_elsewhere_by_ochsner /  number_of_data_producing_publications_found_by_ochsner_that_shared_data)               #
  88append_to_results(" on lab websites and smaller online data repositories ")#
  89append_to_results("(some datasets were found in more than one location).")#
  90results#
  91#
  92start_results("  Combined, GEO and ArrayExpress housed ")#
  93number_of_datasets_found_in_geo_or_arrayexpress_by_ochsner = #
  94    count_true_items(dat.read$ochsner_found_geo_data | dat.read$ochsner_found_arrayexpress_data)#
  95append_to_results("%d (%.1f%%)", #
  96                number_of_datasets_found_in_geo_or_arrayexpress_by_ochsner, #
  97                100*number_of_datasets_found_in_geo_or_arrayexpress_by_ochsner /  number_of_data_producing_publications_found_by_ochsner_that_shared_data)               #
  98append_to_results(" of the manually-located datasets.")#
  99results#
 100#
 101start_results("\nNext, we ran automated queries.  We queried the GEO and ArrayExpress databases with the PubMed IDs of the ")#
 102append_to_results(number_of_data_producing_publications_found_by_ochsner)#
 103append_to_results(" data-producing studies.")#
 104results#
 105#
 106#
 107start_results("  The automated queries returned ")#
 108number_of_oschnser_pmid_link_to_geo_or_arrayexpress = count_true_items(dat.read$centralized_link_to_ochsner_pmid & (dat.read$in_geo | dat.read$in_arrayexpress))#
 109append_to_results(number_of_oschnser_pmid_link_to_geo_or_arrayexpress)#
 110append_to_results(" datasets in total, with ")#
 111number_of_oschnser_pmid_link_to_geo = count_true_items(dat.read$centralized_link_to_ochsner_pmid & dat.read$in_geo)#
 112append_to_results(number_of_oschnser_pmid_link_to_geo)#
 113append_to_results(" datasets from GEO and ")#
 114number_of_oschnser_pmid_link_to_arrayexpress = count_true_items(dat.read$centralized_link_to_ochsner_pmid & dat.read$in_arrayexpress)#
 115append_to_results(number_of_oschnser_pmid_link_to_arrayexpress)#
 116append_to_results(" datasets from ArrayExpress, including ")#
 117number_of_oschnser_pmid_link_to_geo_AND_arrayexpress = count_true_items(dat.read$centralized_link_to_ochsner_pmid & dat.read$in_geo & dat.read$in_arrayexpress)#
 118append_to_results(number_of_oschnser_pmid_link_to_geo_AND_arrayexpress)#
 119append_to_results(" datasets in both databases (ArrayExpress has a policy of importing selected GEO submissions).")#
 120results#
 121#
 122start_results("\nWe compared the two retrieval sets: the automated search found ")#
 123number_datasets_found_by_automated_not_ochsner_geo_arrayexpress = #
 124    count_true_items(dat.read$centralized_link_to_ochsner_pmid & !dat.read$ochsner_found_any_data)#
 125append_to_results(number_datasets_found_by_automated_not_ochsner_geo_arrayexpress)#
 126append_to_results(" datasets overlooked by the manual search, ")#
 127append_to_results("while the manual search found ")#
 128number_datasets_found_by_ochsner_geo_arrayexpress_not_automated = #
 129    count_true_items((dat.read$ochsner_found_geo_data | dat.read$ochsner_found_arrayexpress_data) & !dat.read$centralized_link_to_ochsner_pmid)#
 130append_to_results(number_datasets_found_by_ochsner_geo_arrayexpress_not_automated)    #
 131append_to_results(" datasets in GEO and ArrayExpress that were not found automatically")#
 132append_to_results(" (plus ")#
 133number_datasets_ochsner_not_geo_arrayexpress_or_automated = #
 134    count_true_items((dat.read$ochsner_journal | dat.read$ochsner_other) & !dat.read$centralized_link_to_ochsner_pmid)#
 135append_to_results(number_datasets_ochsner_not_geo_arrayexpress_or_automated)        #
 136append_to_results(" datasets in other internet locations, as described above).")#
 137results#
 138#
 139start_results("  A union of the manual and automated searches includes ")#
 140number_of_datasets_found_via_manual_or_automated = count_true_items(dat.read$data_available)#
 141append_to_results(number_of_datasets_found_via_manual_or_automated)#
 142append_to_results(" distinct datasets")#
 143append_to_results("; we consider this our reference set of 'publicly-available datasets' in the analysis below.")#
 144#
 145results#
 146#
 147start_results("Figure 1 shows the relative recall of the search strategies.")#
 148append_to_results("  As illustrated in the first bar of Figure A, the manual search retrieved ")#
 149append_to_results("%.1f%%", #
 150                100*number_of_datasets_found_by_ochsner / number_of_datasets_found_via_manual_or_automated)               #
 151append_to_results(" of the publicly-available datasets.")#
 152#
 153append_to_results("  The second bar highlights the datasets available in either GEO or ArrayExpress: ")#
 154number_datasets_found_geo_arrayexpress_union = #
 155    count_true_items(dat.read$ochsner_found_geo_data | dat.read$ochsner_found_arrayexpress_data | dat.read$centralized_link_to_ochsner_pmid)#
 156append_to_results("%.1f%%", #
 157                100*number_datasets_found_geo_arrayexpress_union / number_of_datasets_found_via_manual_or_automated)               #
 158append_to_results(" of the total.")#
 159#
 160append_to_results("  Automated searches retrieve ")#
 161append_to_results("%.1f%%", #
 162                100*number_of_oschnser_pmid_link_to_geo_or_arrayexpress / number_datasets_found_geo_arrayexpress_union)               #
 163append_to_results(" of the datasets housed in GEO and/or ArrayExpress, or ")#
 164append_to_results("%.1f%%", #
 165                100*number_of_oschnser_pmid_link_to_geo_or_arrayexpress / number_of_datasets_found_via_manual_or_automated)               #
 166append_to_results(" of datasets housed anywhere.")#
 167#
 168append_to_results("  Finally, the last two bars depict on the databases individually.  Automated searches of just GEO and just ArrayExpress retrieve ")#
 169append_to_results("%.1f%%", #
 170                100*number_of_oschnser_pmid_link_to_geo / number_of_datasets_found_via_manual_or_automated)               #
 171append_to_results(" and ")#
 172append_to_results("%.1f%%", #
 173                100*number_of_oschnser_pmid_link_to_arrayexpress / number_of_datasets_found_via_manual_or_automated)               #
 174append_to_results(" of all available datasets, respectively.")#
 175results#
 176#
 177start_results("\n\n\n<<TABLE 1 goes here!>>\n\n\n")#
 178cat(all_results)#
 179#
 180#
 181start_results("Next, we looked at univariate patterns to see if the datasets retrieved automatically ")#
 182append_to_results("differed from those that were retrieved manually.  ")#
 183append_to_results(strip_newlines("#
 184The odds that a dataset was about cancer, performed on an Affymetrix platform, #
 185involved humans, or involved cultured cells were not significantly associated #
 186with whether the dataset was retrievable through automated mechanisms  (p>0.3). #
 187These odds ratios and their confidence intervals are depicted in Figure 2. #
 188Similarly, in ANOVA analysis, the species was not significantly different #
 189between datasets found automatically and those not found automatically (p=0.7).#
 190"))#
 191#
 192#
 193#
 194library(Hmisc,T)#
 195library(Design,T)#
 196library(sciplot)#
 197library(rmeta)#
 198#
 199dat.raw = dat.read[which(dat.read$data_available == '1'),]#
 200dim(dat.raw)#
 201names(dat.raw) = gsub("_", ".", names(dat.raw))#
 202names(dat.raw) = gsub(" ", ".", names(dat.raw))#
 203names(dat.raw)#
 204#
 205dat = dat.raw#
 206#
 207#
 208#
 209dat$all = dat$pmid > ""#
 210dat$in.arrayexpress.or.geo = dat$in.arrayexpress | dat$in.geo#
 211dat$centralized.arrayexpress = dat$in.arrayexpress & dat$centralized.link.to.ochsner.pmid#
 212dat$centralized.geo = dat$in.geo & dat$centralized.link.to.ochsner.pmid#
 213dat$centralized.arrayexpress.or.geo = dat$in.arrayexpress.or.geo & dat$centralized.link.to.ochsner.pmid#
 214#
 215#
 216levels(dat$species)[table(dat$species) < .03*length(dat$species)] = "Other"#
 217#
 218#
 219affy = grep("Affymetrix", levels(dat$array.design))#
 220levels(dat$array.design)[-affy] = "Other"#
 221affy = grep("Affymetrix", levels(dat$array.design))#
 222levels(dat$array.design)[affy] = "Affymetrix"#
 223#
 224#
 225#
 226fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$array.design))#
 227fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$pubmed.is.cancer))#
 228fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$pubmed.is.cultured.cells))#
 229fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$pubmed.is.humans))#
 230fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$species))#
 231#
 232#
 233anova(lrm(species ~ centralized.link.to.ochsner.pmid, dat=dat))#
 234#
 235results#
 236#
 237#
 238#
 239#
 240start_results("\n")#
 241#
 242wilcox.test(dat$pubmed.number.times.cited.in.pmc ~ dat$centralized.link.to.ochsner.pmid)#
 243wilcox.test(dat$number.samples ~ dat$centralized.link.to.ochsner.pmid)#
 244#
 245append_to_results(strip_newlines("We did find some differences.  "))#
 246wilcox.test(dat$impact.factor ~ dat$centralized.link.to.ochsner.pmid)#
 247#
 248#
 249append_to_results(strip_newlines("Histograms of the distribution of #
 250citations (Figure 3), impact factors (Figure 4), and sample size (Figure 5) #
 251illustrate the different distributions for studies with database citation links #
 252to those without.  Searching centralized links in GEO and ArrayExpress can find "))#
 253table_impact_factor = table(dat$centralized.link.to.ochsner.pmid, cut(dat$impact.factor, c(0, 10, 20, 100)))#
 254prop_table_impact_factor = prop.table(table_impact_factor, 2)#
 255append_to_results("%.1f%%", 100*prop_table_impact_factor[2, 3])#
 256append_to_results(" of datasets associated with articles published in journals with impact factors greater than 20, but only ")#
 257append_to_results("%.1f%%", 100*prop_table_impact_factor[2, 2])#
 258append_to_results(" of those with impact factors from 10-20, and only ")#
 259append_to_results("%.1f%%", 100*prop_table_impact_factor[2, 1])#
 260append_to_results(" of the datasets for papers published with impact factors less than 10.")#
 261results#
 262#
 263start_results("\n")#
 264append_to_results("Similarly, our automated search found ")#
 265table_citations = table(dat$centralized.link.to.ochsner.pmid, cut(dat$pubmed.number.times.cited.in.pmc, c(-1, 0, 5, 10000)))#
 266prop_table_citations = prop.table(table_citations, 2)#
 267prop_table_citations#
 268append_to_results("%.1f%%", 100*prop_table_citations[2, 3])#
 269append_to_results(" of datasets whose associated articles had more than 5 citations, ")#
 270append_to_results("%.1f%%", 100*prop_table_citations[2, 2])#
 271append_to_results(" with 0-5 citations, and only ")#
 272append_to_results("%.1f%%", 100*prop_table_citations[2, 1])#
 273append_to_results(" with 0 citations.")#
 274#
 275append_to_results("\nOur automated queries retrieved ")#
 276table_sample_size = table(dat$centralized.link.to.ochsner.pmid, cut(dat$number.samples, c(0, 10, 100, 10000)))#
 277prop_table_sample_size = prop.table(table_sample_size, 2)#
 278prop_table_sample_size#
 279append_to_results("%.1f%%", 100*prop_table_sample_size[2, 3])#
 280append_to_results(" of datasets with more than 100 datapoints, ")#
 281append_to_results("%.1f%%", 100*prop_table_sample_size[2, 2])#
 282append_to_results(" with between 10 and 100, and  ")#
 283append_to_results("%.1f%%", 100*prop_table_sample_size[2, 1]) #
 284append_to_results(" of datasets with fewer than 10 datapoints.")#
 285results#
 286#
 287start_results("\n")#
 288append_to_results(strip_newlines("A journal policy requiring an accession number in published articles #
 289may increase the number of citation links in centralized database ("))#
 290table_journal_policy = table(dat$centralized.link.to.ochsner.pmid, dat$journal.policy.requires)#
 291prop_table_journal_policy = prop.table(table_journal_policy, 2)#
 292append_to_results("%.1f%%", 100*prop_table_journal_policy[2, 2])#
 293append_to_results(strip_newlines(" of datasets can be found through citation links for journals that #
 294require an accession number, vs. "))#
 295append_to_results("%.1f%%", 100*prop_table_journal_policy[2, 1])#
 296append_to_results(strip_newlines(" of datasets in other journals), but this #
 297difference was not statistically significant.  "))#
 298#
 299#
 300fisher.test(table(dat$centralized.link.to.ochsner.pmid, dat$journal.policy.requires))#
 301results#
 302#
 303start_results("\n")#
 304append_to_results(strip_newlines("Multivariate regressions did not reveal any significant relationships between #
 305dataset retrievability and impact factor, number of citations, sample size, #
 306or journal policy."))#
 307#
 308#
 309f = lrm(formula = centralized.link.to.ochsner.pmid #
 310    ~ rcs(log(impact.factor), 4)  #
 311    + journal.policy.requires  #
 312    #
 313    #
 314    , dat=dat, x=T, y=T)#
 315f#
 316anova(f)#
 317#
 318results#
 319cat(all_results)#
 320#
 321#
 322#
 323#
 324#
 325#
 326#
 327plot_on_same_graph = function() {#
 328    par(new=TRUE)#
 329}#
 330#
 331plot_on_new_graph = function() {#
 332    par(new=FALSE)#
 333}#
 334#
 335plot_ci_bar = function(x, bar_number=1, max_number_bars=1, bar_name="", col="black") {#
 336    y_max = 1#
 337    bar_position = bar_number*2 - 1#
 338    x_spaces = max_number_bars*2#
 339    no_factors = rep(TRUE, length(x))#
 340    plot_on_same_graph()#
 341    bargraph.CI(x.factor = no_factors, #
 342                response = x, #
 343                xlim=c(0, x_spaces), #
 344                ylim=c(0, y_max), #
 345                names.arg=c(bar_name), #
 346                col=col, #
 347                space=bar_position)#
 348}#
 349#
 350greys = grey(0:8 / 8)#
 351light_grey = greys[8]#
 352medium_grey = greys[4]#
 353dark_grey = greys[2]#
 354#
 355plot_on_new_graph()#
 356#
 357plot_ci_bar(dat$all, 1, 4, "", light_grey)#
 358plot_ci_bar(dat$all, 2, 4, "", light_grey)#
 359plot_ci_bar(dat$all, 3, 4, "", light_grey)#
 360#
 361plot_ci_bar(dat$in.arrayexpress.or.geo, 1, 4, "GEO and/or\nArrayExpress", medium_grey)#
 362plot_ci_bar(dat$in.geo, 2, 4, "GEO", medium_grey)#
 363plot_ci_bar(dat$in.arrayexpress, 3, 4, "ArrayExpress", medium_grey)#
 364#
 365#
 366#
 367plot_ci_bar(dat$centralized.arrayexpress.or.geo, 1, 4, "", dark_grey)#
 368plot_ci_bar(dat$centralized.geo, 2, 4, "", dark_grey)#
 369plot_ci_bar(dat$centralized.arrayexpress, 3, 4, "", dark_grey)#
 370#
 371#
 372#
 373legend("right", c("\nDatasets not at this location\n", "\nDatasets found only\nby Ochsner search\n", "Datasets found by\nsearch of databases\nfor PubMed IDs"), #
 374    fill=c(light_grey, medium_grey, dark_grey), bty='n')#
 375    #
 376title(main=NULL, ylab="Proportion of article-related publicly-available datasets")#
 377#
 378#
 379         #
 380#
 381#
 382#
 383#
 384#
 385#
 386#
 387hist_back_back_wrapper = function(histdata, splitdata, breaks=NULL, #
 388    title="Histogram", ylabels="", xlabels="", #
 389    log="n", sigfigs=0){#
 390    plotdata = histdata#
 391    if (log=="y") {#
 392        plotdata = log(histdata)#
 393    }#
 394    sp = split(plotdata, splitdata)                    #
 395    out = histbackback(sp, probability=TRUE, brks=breaks)#
 396    print(out)#
 397    frequency_maximum = max(c(out$left, out$right))#
 398    frequency_plot_limit = frequency_maximum * 1.1#
 399    print(frequency_plot_limit)#
 400    out = histbackback(sp, probability=TRUE, #
 401                         main = title, brks=breaks, #
 402                         xlim=c(-frequency_plot_limit, frequency_plot_limit), #
 403                         xlab=xlabels,#
 404                         axes=FALSE)#
 405    print(out)#
 406    if (log=="y") {#
 407        breaklabels = round(exp(out$breaks), sigfigs)#
 408    } else {#
 409        breaklabels = out$breaks#
 410    }#
 411    title(ylab=ylabels, xlab=xlabels)#
 412    xpoints = c(-frequency_plot_limit/2, 0, frequency_plot_limit/2)#
 413    print(xpoints)#
 414    axis(2, at=0:(length(breaklabels)-1), labels=breaklabels)#
 415    axis(1, at=xpoints, labels=round(abs(xpoints)/sum(out$left), 2))#
 416    #
 417    #
 418    barplot(-out$left, col=medium_grey , horiz=TRUE, space=0, add=TRUE, axes=FALSE) #
 419    barplot(out$right, col=light_grey, horiz=TRUE, space=0, add=TRUE, axes=FALSE) #
 420    return(out)#
 421}#
 422#
 423hist_above_wrapper = function(histdata, splitdata, breaks=NULL, #
 424    title=c("", ""), ylabels="", xlabels="", #
 425    use_log="n", sigfigs=0){#
 426    plotdata = histdata#
 427    if (use_log=="y") {#
 428        plotdata = log(histdata)#
 429    }#
 430    sp = split(plotdata, splitdata)                    #
 431    h = hist(plotdata, freq=FALSE, axes=FALSE) #
 432    print(h)#
 433#
 434    par(mfrow=c(2, 1))#
 435    hist_min = 0.9*min(plotdata)#
 436    hist_max = 1.1*max(plotdata)#
 437    h_1 = hist(sp$`1`, breaks=h$breaks, main=title[1], col=medium_grey, xlim=c(hist_min, hist_max), axes=FALSE, xlab="", ylab="") #
 438    #
 439    if (use_log=="y") {#
 440        breaklabels = round(exp(h$breaks), sigfigs)#
 441    } else {#
 442        breaklabels = round(h$breaks, sigfigs)#
 443    }#
 444    print(breaklabels)#
 445    title(ylab=ylabels) #
 446    axis(1, at=h$breaks, labels=breaklabels)#
 447    freq_label = c(0, round(max(h_1$counts)/sum(h_1$counts), 2))#
 448    axis(2, at=c(0, max(h_1$counts)), labels=freq_label)#
 449    #
 450    h_0 = hist(sp$`0`, breaks=h$breaks, main=title[2], col=medium_grey, xlim=c(hist_min, hist_max), axes=FALSE, xlab="", ylab="") #
 451    title(ylab=ylabels, xlab=xlabels)#
 452    axis(1, at=h$breaks, labels=breaklabels)#
 453    freq_label = c(0, round(max(h_0$counts)/sum(h_0$counts), 2))#
 454    axis(2, at=c(0, max(h_0$counts)), labels=freq_label)#
 455#
 456}#
 457#
 458quartz()#
 459hist_above_wrapper(dat$impact.factor, #
 460    dat$centralized.link.to.ochsner.pmid, #
 461    seq(0.5, 4, by=0.25), #
 462    c(paste("Histogram of Impact Factor for datasets\nFound by query of databases for PubMed IDs (n=",count_true_items(dat$centralized.link.to.ochsner.pmid),")", sep=""), #
 463      paste("Not found by query of databases (n=",count_false_items(dat$centralized.link.to.ochsner.pmid),")", sep="")), #
 464    "Proportion of datasets",#
 465    "Impact factor (log scale)", #
 466    use_log="y", sigfigs=0)#
 467#
 468#
 469#
 470quartz()#
 471hist_above_wrapper(dat$pubmed.number.times.cited.in.pmc+.5, #
 472    dat$centralized.link.to.ochsner.pmid, #
 473    NULL, #
 474    c(paste("Histogram of Received Citations for datasets\nFound by query of databases for PubMed IDs (n=",count_true_items(dat$centralized.link.to.ochsner.pmid),")", sep=""), #
 475      paste("Not found by query of databases (n=",count_false_items(dat$centralized.link.to.ochsner.pmid),")", sep="")), #
 476    "Proportion of datasets",#
 477    "Number of citations by articles in PubMed Central (log scale)",#
 478    use_log="y", sigfigs=0)#
 479    #
 480quartz()#
 481hist_above_wrapper(dat$number.samples[!is.na(dat$number.samples)], #
 482    dat$centralized.link.to.ochsner.pmid, #
 483    seq(0, 8, by=1), #
 484    c(paste("Histogram of Dataset Size for datasets\nFound by query of databases for PubMed IDs (n=",count_true_items(dat$centralized.link.to.ochsner.pmid[!is.na(dat$number.samples)]),")", sep=""), #
 485      paste("Not found by query of databases (n=",count_false_items(dat$centralized.link.to.ochsner.pmid[!is.na(dat$number.samples)]),")", sep="")), #
 486    "Proportion of datasets",#
 487    "Number of samples in microarray study (log scale)",#
 488    use_log="y", sigfigs=0)
 489#
 490#
 491#
 492Sys.setenv( PATH=paste(Sys.getenv("PATH"),"/usr/texbin",sep=":") ) #
 493#
 494#
 495library(Rserve)#
 496Rserve(args="--no-save")#
 497#
 498library(plyr)#
 499#
 500setwd("/Users/hpiwowar/Documents/Projects/PLoS ArticleImpact")#
 501source("aim3_functions_20100215.R")
 502dat.raw = read.csv("PLoS_Metrics.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
 503dat = data.frame(doi = dat.raw$DOI)#
 504#
 505#
 506dat$days.since.aug.2009         = dat.raw$Days.Since.Aug.2009#
 507dat$pmid                        = dat.raw$PMID#
 508#
 509#
 510#
 511dat$is.plos.biology             = ifelse(dat.raw$Journal == "PLoS Biology", 1, 0)#
 512dat$is.plos.clinical.trials     = ifelse(dat.raw$Journal == "PLoS Clinical Trials", 1, 0)#
 513dat$is.plos.comp.bio            = ifelse(dat.raw$Journal == "PLoS Computational Biology", 1, 0)#
 514dat$is.plos.genetics            = ifelse(dat.raw$Journal == "PLoS Genetics", 1, 0)#
 515dat$is.plos.medicine            = ifelse(dat.raw$Journal == "PLoS Medicine", 1, 0)#
 516dat$is.plos.neglect.tropical    = ifelse(dat.raw$Journal == "PLoS Neglected Tropical Diseases", 1, 0)#
 517dat$is.plos.one                 = ifelse(dat.raw$Journal == "PLoS ONE", 1, 0)#
 518dat$is.plos.pathogens           = ifelse(dat.raw$Journal == "PLoS Pathogens", 1, 0)#
 519dat$is.research                 = factor(dat.raw$X.Research.Article..or..non.Research.Article..)#
 520#
 521dat$num.cites.crossref          = dat.raw$Citations...CrossRef#
 522dat$num.cites.pmc               = dat.raw$Citations...PubMed.Central#
 523dat$num.cites.scopus            = dat.raw$Citations...Scopus#
 524dat$num.page.views.html         = dat.raw$Total.HTML.Page.Views#
 525dat$num.downloads.pdf           = dat.raw$Total.PDF.Downloads#
 526dat$num.downloads.xml           = dat.raw$Total.XML.Downloads#
 527#
 528dat$num.blogs.postgenomic       = dat.raw$Blog.Coverage...Postgenomic#
 529dat$num.blogs.natureblogs       = dat.raw$Blog.Coverage...Nature.Blogs#
 530dat$num.blogs.bloglines         = dat.raw$Blog.Coverage...Bloglines#
 531dat$num.trackbacks              = dat.raw$Number.of.Trackbacks#
 532dat$num.bookmarks.citeulike     = dat.raw$Social.Bookmarking...CiteULike#
 533dat$num.bookmarks.connotea      = dat.raw$Social.Bookmarking...Connotea#
 534dat$num.ratings                 = dat.raw$Number.of.Ratings#
 535dat$avg.rating                  = dat.raw$Average.Rating#
 536dat$num.note.threads            = dat.raw$Number.of.Note.threads#
 537dat$num.note.replies            = dat.raw$Number.of.replies.to.Notes#
 538dat$num.comment.threads         = dat.raw$Number.of.Comment.threads#
 539#
 540#
 541dat$num.ratings.with.comment    = dat.raw$Number.of..Star.Ratings..that.also.include.a.text.comment#
 542#
 543dat.transformed = dat#
 544vars.no.transform = c("doi", "pmid", "days.since.aug.2009")#
 545#
 546vars.factor = c("is.research", "is.plos.clinical.trials", "is.plos.biology",#
 547"is.plos.comp.bio", "is.plos.genetics", "is.plos.medicine", "is.plos.neglect.tropical", "is.plos.one", "is.plos.pathogens")#
 548dat.transformed[,vars.factor] = colwise(factor)(dat[,vars.factor])#
 549#
 550vars.to.transform = !(names(dat.transformed) %in% c(vars.no.transform, vars.factor))#
 551dat.transformed[,vars.to.transform] = tr(dat[,vars.to.transform])#
 552#
 553#
 554summary(dat.transformed)
 555library(Hmisc)#
 556library(psych)#
 557#
 558Hmisc::describe(dat, listunique=0)
 559dat.indep.stats = dat.transformed[,vars.to.transform]
 560library(polycor)#
 561#
 562myhetcorr = hetcor.modified(dat.indep.stats, use="pairwise.complete.obs", std.err=FALSE, pd=FALSE)#
 563mycor.unadjusted = myhetcorr$correlations#
 564#
 565#
 566#
 567#
 568which(is.na(mycor.unadjusted))#
 569#
 570#
 571#
 572#
 573#
 574mycor = adjust.to.positive.definite(mycor.unadjusted)#
 575#
 576#
 577library(gplots)#
 578heatmap.2(mycor, col=bluered(16), cexRow=0.5, cexCol = .8, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(15,15), key=FALSE, keysize=0.1)
# Interactive console exploration of colour palettes for the heatmap,
# preserved verbatim from the captured session history.  Several lines are
# not runnable as-is: legend() without arguments errors, grey()/greys()
# expect levels in [0, 1] rather than a count, blue() does not exist, and
# '?bluered' opens help.  The leading integers are pasted transcript line
# numbers, not R code.
  579mycor
  580legend()
  581bluered(16)
  582blue(16)
  583grey(16)
  584greys(16)
  585?bluered
  586showpanel(bluered(64))
  587library(gplots)
  588showpanel(bluered(64))
 589showpanel <- function(col)#
 590{#
 591  image(z=matrix(1:100, ncol=1), col=col, xaxt="n", yaxt="n" )#
 592}
# More palette experiments from the interactive session, preserved verbatim
# (leading integers are captured-transcript line numbers, not R code).
# Bare names (topo, heat.colors, cm.colors) print function definitions;
# 'topocolors' and 'topo()' error; '?colors' / '?cm.colors' open help.
# The heatmap.2 calls try different palettes and label sizes on `mycor`;
# the final call keeps only the red half of a blue-red ramp.
  593showpanel(bluered(64))
  594heatmap.2(mycor, col=bluered(16), cexRow=0.5, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  595heatmap.2(mycor, col=bluered(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  596topo
  597topocolors
  598topo()
  599?colors
  600heat.colors
  601heatmap.2(mycor, col=heat.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  602heatmap.2(mycor, col=topo.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  603heatmap.2(mycor, col=terrain.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  604heatmap.2(mycor, col=cm.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  605cm.colors
  606?cm.colors
  607showpanel(cm.colors(16))
  608library(gplots)#
  609heatmap.2(mycor, col=cm.colors(16), cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
  610cm.colors(16)
  611cm.colors(16)[0:8]
  612showpanel(cm.colors(16)[0:8])
  613showpanel(bluered(16)[0:8])
  614showpanel(bluered(32)[0:16])
  615showpanel(bluered(32)[17:32])
  616heatmap.2(mycor, col=bluered(32)[17:32], cexRow=0.9, cexCol = .9, symm = TRUE, dend = "row", trace = "none", main = "Thesis Data", margins=c(10,10), key=FALSE, keysize=0.1)
 617#
 618#
 619library(nFactors)#
 620eigenvectors.1st <- eigen(mycor) #
 621#
 622aparallel.1st <- parallel(subject=nrow(dat.indep.stats), var=ncol(dat.indep.stats), rep=100, cent=.05)#
 623scree.results.1st <- nScree(eigenvectors.1st$values, aparallel.1st$eigen$qevpea)#
 624summary(scree.results.1st)#
 625plotnScree(scree.results.1st) #
 626#
 627#
 628#
 629#
 630#
 631number.factors.1st = 5
 632#
 633#
 634fit.ml = factanal(dat, number.factors.1st, rotation="promax", covmat=mycor)#
 635print(fit.ml, sort=TRUE)#
 636#
 637#
 638#
 639library(psych)#
 640fit.fa.1st = fa(mycor, number.factors.1st, fm="minres", rotate="promax", #
 641                scores=FALSE, residuals=TRUE, n.obs=max(dim(dat.indep.stats)))#
 642#
 643#
 644print(fit.fa.1st, sort=TRUE)#
 645#
 646#
 647#
 648#
 649#
 650#
 651#
 652#
 653#
 654factor.names.1st = c(#
 655"MR1"="Ratings",#
 656"MR2"="Citations",#
 657"MR3"="Views",#
 658"MR4"="Bookmarks",#
 659"MR5"="Notes")#
 660#
 661for (afactor in names(factor.names.1st)) {#
 662    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
 663    print.thresh(fit.fa.1st$loadings[, afactor], .4, TRUE)#
 664    print.thresh(fit.fa.1st$loadings[, afactor], -0.4, FALSE)#
 665}
 666#
 667#
 668fit.fa.1st.cor = fit.fa.1st$r.scores#
 669colnames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]#
 670rownames(fit.fa.1st.cor) = factor.names.1st[colnames(fit.fa.1st$loadings)]#
 671#
 672eigenvectors.2nd <- eigen(fit.fa.1st.cor) #
 673aparallel.2nd <- parallel(subject=nrow(fit.fa.1st.cor), var=ncol(fit.fa.1st.cor), rep=100, cent=.05)#
 674scree.results.2nd <- nScree(eigenvectors.2nd$values, aparallel.2nd$eigen$qevpea)#
 675scree.results.2nd#
 676plotnScree(scree.results.2nd) #
 677#
 678#
 679number.factors.2nd = 2
 680#
 681#
 682#
 683fit.fa.2nd = fa(fit.fa.1st.cor, number.factors.2nd, fa="minres", rotate="varimax")#
 684print(fit.fa.2nd, sort=TRUE)
 685#
 686U = fit.fa.2nd$uniquenesses * diag(number.factors.1st)#
 687P = fit.fa.2nd$loadings#
 688A = cbind(P, U)#
 689Pvf = fit.fa.1st$loadings#
 690#
 691Pvo = Pvf %*% A
 692#
 693#
 694factor.names.2nd = c(#
 695"MR1"="Views, bookmarks, citations",#
 696"MR2"="Notes and ratings")#
 697#
 698#
 699for (afactor in names(factor.names.2nd)) {#
 700    print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))    #
 701    print.thresh(Pvo[, afactor], .3, TRUE)#
 702    print.thresh(Pvo[, afactor], -0.3, FALSE)#
 703}#
 704#
 705#
 706for (afactor in names(factor.names.2nd)) {#
 707    print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))#
 708    print.thresh(fit.fa.2nd$loadings[, afactor], .3, TRUE)#
 709    print.thresh(fit.fa.2nd$loadings[, afactor], -0.3, FALSE)#
 710}
 711#
 712#
 713factor.names.2nd = c(#
 714"MR1"="Views, citations, bookmarks",#
 715"MR2"="Notes and ratings")#
 716#
 717#
 718for (afactor in names(factor.names.2nd)) {#
 719    print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))    #
 720    print.thresh(Pvo[, afactor], .3, TRUE)#
 721    print.thresh(Pvo[, afactor], -0.3, FALSE)#
 722}#
 723#
 724#
 725for (afactor in names(factor.names.2nd)) {#
 726    print(paste(afactor, ": ", factor.names.2nd[afactor], sep=""))#
 727    print.thresh(fit.fa.2nd$loadings[, afactor], .3, TRUE)#
 728    print.thresh(fit.fa.2nd$loadings[, afactor], -0.3, FALSE)#
 729}
 730#
 731#
 732#
 733#
 734dat.impute.input = dat.indep.stats#
 735#
 736#
 737library(mice)#
 738#
 739#
 740#
 741mice.output = mice(dat.impute.input, m=1)  #
 742#
 743 #
 744#
 745dat.imputed = complete(mice.output, 1)#
 746#
 747dat.for.scores = dat.imputed#
 748#
 749#
 750#
 751scores.1st = factor.scores.bartlett(dat.for.scores, fit.fa.1st)
 752#
 753#
 754load("/Users/hpiwowar/Documents/Code/hpiwowar/pypub/trunk/src/aim3/stats/dat_aim3.RData")#
 755#
 756dat.plos = cbind(dat, scores.1st)#
 757#
 758dat.merge = merge(dat.plos, dat.aim3, by="pmid")#
 759#
 760dat.regress = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int)#
 761score.names = dimnames(scores.1st)[[2]]#
 762dat.regress[,score.names] = dat.merge[,names(dat.merge) %in% score.names]#
 763#
 764vars.indep = c("days.since.aug.2009", #
 765"num.authors.tr",#
 766"first.author.num.prev.pubs.tr", #
 767"first.author.num.prev.pmc.cites.tr",#
 768"last.author.num.prev.pubs.tr",#
 769"last.author.num.prev.pmc.cites.tr",#
 770"num.grant.numbers.tr",#
 771"pubmed.is.humans",#
 772"pubmed.is.cancer",#
 773"country.usa",#
 774"institution.rank",#
 775"nih.sum.sum.dollars.tr")#
 776#
 777dat.regress[,vars.indep] = dat.merge[,vars.indep]#
 778#
 779#
 780#
 781plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
 782boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
 783boxplot(log(MR1) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
 784boxplot(log(MR2) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
 785boxplot(log(MR3) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
 786boxplot(log(MR4) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
 787boxplot(log(MR5) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
 788boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress, main="MR2")
 789boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress, main="MR2")
 790plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
 791boxplot(log(MR2) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
 792#
 793#
 794#
 795#
 796#
 797library(rms)#
 798dd.regress = datadist(dat.regress)#
 799options(datadist='dd.regress')#
 800options(digits=2)#
 801#
 802f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
 803    (rcs(days.since.aug.2009, 3) +#
 804    rcs(num.authors.tr, 3) ) +#
 805    rcs(first.author.num.prev.pubs.tr, 3) + #
 806    rcs(first.author.num.prev.pmc.cites.tr, 3) +#
 807    rcs(last.author.num.prev.pubs.tr, 3) + #
 808    rcs(last.author.num.prev.pmc.cites.tr, 3) + #
 809    as.numeric(pubmed.is.humans) + #
 810    as.numeric(pubmed.is.cancer) + #
 811    as.numeric(country.usa) )#
 812#
 813    ,#
 814    dat=dat.regress, x=T, y=T)#
 815anova(f.1st.nonlinear.interactions.reduced)
 816f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
 817    rcs(days.since.aug.2009, 3) + #
 818    (rcs(num.authors.tr, 3) + #
 819    rcs(last.author.num.prev.pmc.cites.tr, 3) + #
 820    as.numeric(pubmed.is.humans) + #
 821    as.numeric(pubmed.is.cancer) + #
 822    as.numeric(country.usa) ) )#
 823    #
 824    #
 825    #
 826    #
 827    #
 828    #
 829    ,#
 830    dat=dat.regress, x=T, y=T)#
 831anova(f.1st.nonlinear.interactions.reduced)
 832s = summary(f.1st.nonlinear.interactions.reduced)#
 833s#
 834plot(s)
 835names(dat)
 836f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
 837    (rcs(days.since.aug.2009, 10) +#
 838    rcs(num.authors.tr, 3) ) +#
 839    rcs(first.author.num.prev.pubs.tr, 3) + #
 840    rcs(first.author.num.prev.pmc.cites.tr, 3) +#
 841    rcs(last.author.num.prev.pubs.tr, 3) + #
 842    rcs(last.author.num.prev.pmc.cites.tr, 3) + #
 843    as.numeric(pubmed.is.humans) + #
 844    as.numeric(pubmed.is.cancer) + #
 845    as.numeric(country.usa) )#
 846#
 847    ,#
 848    dat=dat.regress, x=T, y=T)#
 849anova(f.1st.nonlinear.interactions.reduced)
 850f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
 851    rcs(days.since.aug.2009, 10) + #
 852    (rcs(num.authors.tr, 3) + #
 853    rcs(last.author.num.prev.pmc.cites.tr, 3) + #
 854    as.numeric(pubmed.is.humans) + #
 855    as.numeric(pubmed.is.cancer) + #
 856    as.numeric(country.usa) ) )#
 857    #
 858    #
 859    #
 860    #
 861    #
 862    #
 863    ,#
 864    dat=dat.regress, x=T, y=T)#
 865anova(f.1st.nonlinear.interactions.reduced)
 866s = summary(f.1st.nonlinear.interactions.reduced)#
 867s#
 868plot(s)
 869summ.1st.nonlinear.interactions.reduced = summary(f.1st.nonlinear.interactions.reduced)#
 870summ.1st.nonlinear.interactions.reduced.dimnames = dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)]#
 871dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.1st[summ.1st.nonlinear.interactions.reduced.dimnames]#
 872plot(summ.1st.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)#
 873title("Multivariate nonlinear regressions with interactions")
 874summ.1st.nonlinear.interactions.reduced = summary(f.1st.nonlinear.interactions.reduced)#
 875summ.1st.nonlinear.interactions.reduced.dimnames = dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)]#
 876dimnames(summ.1st.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.1st.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.1st[summ.1st.nonlinear.interactions.reduced.dimnames]#
 877plot(summ.1st.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)#
 878title("Multivariate nonlinear regressions with interactions")
 879#
 880dat.regress.named = dat.regress#
 881names(dat.regress.named) = c("dataset.in.geo.or.ae.int", factor.names.1st[names(dat.regress)[-1]])#
 882#
 883dots.1st.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,#
 884    dat=dat.regress.named)#
 885plot(dots.1st.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7, #
 886    xlab="Percentage of studies with datasets in GEO or ArrayExpress", #
 887    main="Univariate data sharing behaviour on first order factors")
 888#
 889#
 890loadings.2nd = Pvo#
 891#
 892#
 893fit.pa.2nd.tovars = list(loadings=loadings.2nd[,colnames(fit.fa.2nd$loadings)])#
 894fit.pa.2nd.tovars$uniquenesses = apply(fit.pa.2nd.tovars$loadings^2, 1, sum)#
 895#
 896scores.to.dat.2nd = factor.scores.bartlett(dat.for.scores, fit.pa.2nd.tovars)#
 897#
 898dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat$dataset.in.geo.or.ae.int) #
 899dat.regress.2nd[,dimnames(scores.to.dat.2nd)[[2]]] = scores.to.dat.2nd#
 900#
 901library(rms)#
 902#
 903dd.regress.2nd = datadist(dat.regress.2nd)#
 904options(datadist='dd.regress.2nd')#
 905options(digits=2)#
 906#
 907#
 908f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
 909    rcs(MR3, 4) + (rcs(MR1, 4) + rcs(MR4, 4) + rcs(MR5, 4) + rcs(MR2, 4))^2,#
 910    dat=dat.regress.2nd, x=T, y=T)#
 911anova(f.2nd.nonlinear.interactions.reduced)
 912names(dat.regress.2nd)
 913dimnames(scores.to.dat.2nd)[[2]]
 914dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat$dataset.in.geo.or.ae.int)
 915dat.regress.2nd[,dimnames(scores.to.dat.2nd)[[2]]] = scores.to.dat.2nd
 916scores.to.dat.2nd
 917dim(dat.retress.2nd)
 918dim(dat.regress.2nd)
 919dim(dat.regress.2nd)dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat$dataset.in.geo.or.ae.int)
 920dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat$dataset.in.geo.or.ae.int)
 921dim(dat.regress.2nd)
 922?str(dat.regress.2nd)
 923str(dat.regress.2nd)
 924names(dat)
 925dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat.aim3$dataset.in.geo.or.ae.int) #
 926dat.regress.2nd[,dimnames(scores.to.dat.2nd)[[2]]] = scores.to.dat.2nd
 927dim(dat.regress.2nd)
 928dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int) #
 929dat.regress.2nd[,dimnames(scores.to.dat.2nd)[[2]]] = scores.to.dat.2nd
 930dim(dat.merge)
 931dim(scores.to.dat.2nd)
 932dim(scores.to.dat.1st)
 933dim(scores.1st)
 934dat.regress.2nd = data.frame(dataset.in.geo.or.ae.int = dat.merge$dataset.in.geo.or.ae.int) #
 935score.names.2nd = dimnames(scores.to.dat.2nd)[[2]]#
 936dat.regress.2nd[,score.names.2nd] = dat.merge[,names(dat.merge) %in% score.names.2nd]
 937#
 938library(rms)#
 939#
 940dd.regress.2nd = datadist(dat.regress.2nd)#
 941options(datadist='dd.regress.2nd')#
 942options(digits=2)#
 943#
 944#
 945f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
 946    rcs(MR3, 4) + (rcs(MR1, 4) + rcs(MR4, 4) + rcs(MR5, 4) + rcs(MR2, 4))^2,#
 947    dat=dat.regress.2nd, x=T, y=T)#
 948anova(f.2nd.nonlinear.interactions.reduced)
 949f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
 950    (rcs(MR1, 4) + rcs(MR2, 4))^2,#
 951    dat=dat.regress.2nd, x=T, y=T)#
 952anova(f.2nd.nonlinear.interactions.reduced)
 953f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ rcs(MR1, 4),#
 954    dat=dat.regress.2nd, x=T, y=T)#
 955anova(f.2nd.nonlinear.interactions.reduced)
 956names(dat.merge)
 957f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
 958    rcs(days.since.aug.2009, 10) + rcs(MR1, 4),#
 959    dat=dat.regress.2nd, x=T, y=T)#
 960anova(f.2nd.nonlinear.interactions.reduced)
 961dat.regress.2nd[,days.since.aug.2009] = dat.merge[,days.since.aug.2009]
 962dat.regress.2nd[,"days.since.aug.2009"] = dat.merge[,"days.since.aug.2009"]
 963f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
 964    rcs(days.since.aug.2009, 10) + rcs(MR1, 4),#
 965    dat=dat.regress.2nd, x=T, y=T)#
 966anova(f.2nd.nonlinear.interactions.reduced)
 967f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
 968    rcs(days.since.aug.2009, 10) + rcs(MR1, 3),#
 969    dat=dat.regress.2nd, x=T, y=T)#
 970anova(f.2nd.nonlinear.interactions.reduced)
 971f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
 972    (rcs(days.since.aug.2009, 10) + rcs(MR1, 3))^2,#
 973    dat=dat.regress.2nd, x=T, y=T)#
 974anova(f.2nd.nonlinear.interactions.reduced)
 975f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ (dataset.in.geo.or.ae.int +#
 976    (rcs(days.since.aug.2009, 10)))#
 977anova(f.2nd.nonlinear.interactions.reduced)
 978f.1st.nonlinear.interactions.reduced = ols(formula = MR2 ~ dataset.in.geo.or.ae.int +#
 979    (rcs(days.since.aug.2009, 10)))#
 980anova(f.2nd.nonlinear.interactions.reduced)
 981f.1st.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
 982    rcs(days.since.aug.2009, 10))^2, #
 983    dat=dat.regress.2nd, x=T, y=T)#
 984anova(f.2nd.nonlinear.interactions.reduced)
 985f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
 986    rcs(days.since.aug.2009, 10))^2, #
 987    dat=dat.regress.2nd, x=T, y=T)#
 988anova(f.2nd.nonlinear.interactions.reduced)
 989#
 990f.2nd.nonlinear.interactions.reduced = lrm(formula = dataset.in.geo.or.ae.int ~ #
 991    (rcs(MR1, 4) + rcs(MR2, 4))^2,#
 992    dat=dat.regress.2nd, x=T, y=T)#
 993anova(f.2nd.nonlinear.interactions.reduced)
 994summ.2nd.nonlinear.interactions.reduced = summary(f.2nd.nonlinear.interactions.reduced)#
 995summ.2nd.nonlinear.interactions.reduced.dimnames = dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)]#
 996dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.2nd[summ.2nd.nonlinear.interactions.reduced.dimnames]#
 997plot(summ.2nd.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)#
 998title("Multivariate nonlinear regression with interactions")
 999dat.regress.2nd[,vars.indep] = dat.merge[,vars.indep]#
1000#
1001library(rms)#
1002#
1003dd.regress.2nd = datadist(dat.regress.2nd)#
1004options(datadist='dd.regress.2nd')#
1005options(digits=2)
1006f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
1007    (rcs(days.since.aug.2009, 10) +#
1008    rcs(num.authors.tr, 3) ) +#
1009    rcs(first.author.num.prev.pubs.tr, 3) + #
1010    rcs(first.author.num.prev.pmc.cites.tr, 3) +#
1011    rcs(last.author.num.prev.pubs.tr, 3) + #
1012    rcs(last.author.num.prev.pmc.cites.tr, 3) + #
1013    as.numeric(pubmed.is.humans) + #
1014    as.numeric(pubmed.is.cancer) + #
1015    as.numeric(country.usa) )#
1016#
1017    ,#
1018    dat=dat.regress.2nd, x=T, y=T)#
1019anova(f.2nd.nonlinear.interactions.reduced)
1020f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
1021    (rcs(days.since.aug.2009, 10) +#
1022    rcs(num.authors.tr, 3) ) +#
1023    rcs(first.author.num.prev.pubs.tr, 3) + #
1024    rcs(first.author.num.prev.pmc.cites.tr, 3) +#
1025    rcs(last.author.num.prev.pubs.tr, 3) + #
1026    rcs(last.author.num.prev.pmc.cites.tr, 3) + #
1027    as.numeric(pubmed.is.humans) + #
1028    as.numeric(pubmed.is.cancer) + #
1029    as.numeric(country.usa) )^2#
1030#
1031    ,#
1032    dat=dat.regress.2nd, x=T, y=T)#
1033anova(f.2nd.nonlinear.interactions.reduced)
1034f.2nd.nonlinear.interactions.reduced
1035f.2nd.nonlinear.interactions.reduced = ols(formula = MR1 ~ (dataset.in.geo.or.ae.int +#
1036    (rcs(days.since.aug.2009, 10) +#
1037    rcs(num.authors.tr, 3) ) +#
1038    rcs(first.author.num.prev.pubs.tr, 3) + #
1039    rcs(first.author.num.prev.pmc.cites.tr, 3) +#
1040    rcs(last.author.num.prev.pubs.tr, 3) + #
1041    rcs(last.author.num.prev.pmc.cites.tr, 3) + #
1042    as.numeric(pubmed.is.humans) + #
1043    as.numeric(pubmed.is.cancer) + #
1044    as.numeric(country.usa) )#
1045#
1046    ,#
1047    dat=dat.regress.2nd, x=T, y=T)#
1048anova(f.2nd.nonlinear.interactions.reduced)
1049f.2nd.nonlinear.interactions.reduced
1050dim(dat.regress.2nd)
1051summ.2nd.nonlinear.interactions.reduced = summary(f.2nd.nonlinear.interactions.reduced)#
1052summ.2nd.nonlinear.interactions.reduced.dimnames = dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)]#
1053dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]][seq(1,length(dimnames(summ.2nd.nonlinear.interactions.reduced)[[1]]),2)] = factor.names.2nd[summ.2nd.nonlinear.interactions.reduced.dimnames]#
1054plot(summ.2nd.nonlinear.interactions.reduced, q = c(0.95), col=gray(0.5), log=T, cex=.8)#
1055title("Multivariate nonlinear regression with interactions")
1056dat.regress.2nd.named = dat.regress.2nd#
1057names(dat.regress.2nd.named) = c("dataset.in.geo.or.ae.int", factor.names.2nd[names(dat.regress.2nd)[-1]])#
1058#
1059dots.2nd.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,#
1060    dat=dat.regress.2nd.named)#
1061plot(dots.2nd.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7, #
1062    xlab="Percentage of studies with datasets in GEO or ArrayExpress", #
1063    main="Univariate data sharing behaviour\non second order factors")
1064dots.2nd.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,#
1065    dat=dat.regress.2nd.named)
1066dat.regress.2nd.named = dat.regress.2nd#
1067names(dat.regress.2nd.named) = c("dataset.in.geo.or.ae.int", factor.names.2nd[names(dat.regress.2nd)[-1]])#
1068#
1069dots.2nd.nonlinear.interactions.reduced = summary(dataset.in.geo.or.ae.int ~ .,#
1070    dat=dat.regress.2nd.named)#
1071plot(dots.2nd.nonlinear.interactions.reduced, cex.labels=0.5, cex=0.7, #
1072    xlab="Percentage of studies with datasets in GEO or ArrayExpress", #
1073    main="Univariate data sharing behaviour\non second order factors")
1074?plsmo
1075plsmo(days.since.aug.2009, MR1, dat=dat.regress)
1076names(dat.regress)
1077plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1)
1078par(c(5, 1))#
1079plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1, main="MR1")
1080?par
1081par(mfcol=c(5, 1))#
1082plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1, main="MR1")
1083par(mfcol=c(1, 2))#
1084plsmo(dat.regress$days.since.aug.2009, dat.regress$MR1, main="MR1")
1085for (afactor in names(factor.names.1st)) {#
1086    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1087    plsmo(dat.regress$days.since.aug.2009, dat.regress[afactor], main=factor.names.1st[afactor])#
1088#
1089}
1090for (afactor in names(factor.names.1st)) {#
1091    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1092    quartz()#
1093    plsmo(dat.regress$days.since.aug.2009, dat.regress[afactor], main=factor.names.1st[afactor])#
1094}
1095for (afactor in names(factor.names.1st)) {#
1096    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1097    quartz()#
1098    plsmo(dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
1099    title(factor.names.1st[afactor])#
1100}
1101?reverse
1102?rev
1103max(dat.regress$days.since.aug.2009)
1104?plot
1105for (afactor in names(factor.names.1st)) {#
1106    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1107    quartz()#
1108    plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
1109    title(factor.names.1st[afactor])#
1110}
1111for (afactor in names(factor.names.1st)) {#
1112    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1113    quartz()#
1114    plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
1115    title(factor.names.1st[afactor])#
1116}
1117for (afactor in names(factor.names.1st)) {#
1118    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1119    quartz()#
1120    boxplot(log(afactor) ~ dataset.in.geo.or.ae.int, dat=dat.regress)#
1121    title(factor.names.1st[afactor])#
1122}
1123for (afactor in names(factor.names.1st)) {#
1124    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1125    quartz()#
1126    boxplot(log(dat.regress[afactor]) ~ dat.regress$dataset.in.geo.or.ae.int)#
1127    title(factor.names.1st[afactor])#
1128}
1129afactor="MR1"
1130    boxplot(log(dat.regress[afactor]) ~ dat.regress$dataset.in.geo.or.ae.int)
1131max(log(dat.regress[afactor]))
1132max(log(dat.regress[afactor], na.rm=T))
1133summary(dat.regress[afactor])
1134for (afactor in names(factor.names.1st)) {#
1135    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1136    quartz()#
1137    boxplot(at.regress[afactor] ~ dat.regress$dataset.in.geo.or.ae.int)#
1138    title(factor.names.1st[afactor])#
1139}
1140#
1141for (afactor in names(factor.names.1st)) {#
1142    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1143    quartz()#
1144    boxplot(dat.regress[afactor] ~ dat.regress$dataset.in.geo.or.ae.int)#
1145    title(factor.names.1st[afactor])#
1146}
1147    boxplot(dat.regress[afactor] ~ dat.regress$dataset.in.geo.or.ae.int)
1148plot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
1149boxplot(MR2 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
1150boxplot(MR1 ~ dataset.in.geo.or.ae.int, dat=dat.regress)
1151boxplot(log(MR1) ~ dataset.in.geo.or.ae.int, dat=dat.regress)
1152boxplot(log(dat.regress$MR1) ~ dat.regress$dataset.in.geo.or.ae.int)
1153boxplot(log(dat.regress["MR1"]) ~ dat.regress$dataset.in.geo.or.ae.int)
1154boxplot(log(dat.regress["MR1",]) ~ dat.regress$dataset.in.geo.or.ae.int)
1155boxplot(log(dat.regress[,"MR1"]) ~ dat.regress$dataset.in.geo.or.ae.int)
1156for (afactor in names(factor.names.1st)) {#
1157    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1158    quartz()#
1159    boxplot(dat.regress[,afactor] ~ dat.regress$dataset.in.geo.or.ae.int)#
1160    title(factor.names.1st[afactor])#
1161}
1162for (afactor in names(factor.names.1st)) {#
1163    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1164    quartz()#
1165    boxplot(dat.regress[,afactor] ~ dat.regress$dataset.in.geo.or.ae.int)#
1166    title(factor.names.1st[afactor])#
1167}
1168names(dat)
1169fit.fa.2nd$loadings[, afactor]
1170fit.fa.1st$loadings[, afactor]
1171    loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))
1172loaded
1173    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1174    quartz()#
1175    plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
1176    title(factor.names.1st[afactor])#
1177    loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
1178    plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[names(dat.regress) %in% loaded], add=T)
1179[names(dat.regress) %in% loaded
1180names(dat.regress) %in% loaded
1181loaded
1182names(dat.regress)
1183for (afactor in names(factor.names.1st)) {#
1184    print(paste(afactor, ": ", factor.names.1st[afactor], sep=""))#
1185    quartz()#
1186    plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
1187    title(factor.names.1st[afactor])#
1188    loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
1189    plsmo(-1*dat.merge$days.since.aug.2009, dat.merge[names(dat.marge) %in% loaded], add=T)#
1190}
1191    plsmo(-1*dat.regress$days.since.aug.2009, dat.regress[afactor], ylab=factor.names.1st[afactor])#
1192    title(factor.names.1st[afactor])#
1193    loaded = names(which(abs(fit.fa.1st$loadings[, afactor]) > 0.3))#
1194    plsmo(-1*dat.merge$days.since.aug.2009, dat.merge[names(dat.merge) %in% loaded], add=…

Large files files are truncated, but you can click here to view the full file