PageRenderTime 47ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/preprocessing_eventcounts.R

http://github.com/hpiwowar/alt-metrics_stats
R | 83 lines | 35 code | 23 blank | 25 comment | 2 complexity | 03d07d635a793026426f0437548787d1 MD5 | raw file
Possible License(s): MIT
  #library(Rserve)
  #Rserve(args="--no-save")

  ######## event_counts.txt
  ## This data contains info on metrics for which we only have aggregate counts.
  ##
  ## Script outline:
  ##   1. read the raw tab-separated file into dat.raw.eventcounts
  ##   2. copy it to dat.eventcounts and clean the copy column by column
  ##   3. print summaries along the way for interactive inspection
  ## The raw data frame is never modified, so cleaned columns can always be
  ## compared against what was read from disk.

  ### READ DATA
  dat.raw.eventcounts <- read.csv(
    "data/raw/event_counts.txt",
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE
  )

  ## Look at it
  dim(dat.raw.eventcounts)
  names(dat.raw.eventcounts)
  summary(dat.raw.eventcounts)

  ## A bit of data cleaning
  dat.eventcounts <- dat.raw.eventcounts

  ## Make sure all data has good DOIs, detects rogue line breaks etc.
  # A missing DOI counts as a bad DOI. Without the is.na() guard the mask would
  # contain NA, and indexing rows with NA prints all-NA phantom rows instead of
  # the offending records.
  hasGoodDoi <- !is.na(dat.raw.eventcounts$doi) &
    substr(dat.raw.eventcounts$doi, 1, 3) == "10."
  summary(hasGoodDoi)
  dat.eventcounts[!hasGoodDoi, ]

  # Now create a date type variable
  # strptime() returns POSIXlt; convert to POSIXct, which is the date-time
  # class meant for data frame columns. Only the date part is parsed (the
  # format stops at the literal "T"). Parsing in UTC keeps every timestamp at
  # midnight with no daylight-saving shifts, so day differences are whole.
  dat.eventcounts$pubDate <- as.POSIXct(
    strptime(dat.eventcounts$pubDate, "%Y-%m-%dT", tz = "UTC")
  )
  summary(dat.eventcounts$pubDate)

  # Create a column that has days since published
  # Measured relative to the most recent publication date in the data.
  # na.rm = TRUE keeps a single unparseable date from turning the whole
  # column into NA; rows with a missing pubDate still get NA.
  dat.eventcounts$daysSincePublished <- as.integer(difftime(
    max(dat.eventcounts$pubDate, na.rm = TRUE),
    dat.eventcounts$pubDate,
    units = "days"
  ))
  hist(dat.eventcounts$daysSincePublished)

  ## Adjust some fields so they are the right datatype.

  # Change journal strings to factors
  dat.eventcounts$journal <- factor(dat.raw.eventcounts$journal)

  # Change f1000Factor strings to integer counts. "false" means count of 0.
  # as.integer() turns non-numeric strings into NA (with a coercion warning);
  # those NAs are then replaced by 0.
  dat.eventcounts$f1000Factor <- as.integer(dat.raw.eventcounts$f1000Factor)
  dat.eventcounts$f1000Factor[is.na(dat.eventcounts$f1000Factor)] <- 0

  # Change wikipediaCites NAs to 0s
  dat.eventcounts$wikipediaCites[is.na(dat.eventcounts$wikipediaCites)] <- 0

  # Change mendeleyReadersCount NAs to 0s
  dat.eventcounts$mendeleyReadersCount[is.na(dat.eventcounts$mendeleyReadersCount)] <- 0

  # Change facebookClickCount NAs to 0s
  dat.eventcounts$facebookClickCount[is.na(dat.eventcounts$facebookClickCount)] <- 0

  # delicious count looks strange for now
  # Non-numeric values become 0, and implausibly large counts (> 1000000)
  # are replaced by 1.
  dat.eventcounts$deliciousCount <- as.integer(dat.raw.eventcounts$deliciousCount)
  dat.eventcounts$deliciousCount[is.na(dat.eventcounts$deliciousCount)] <- 0
  dat.eventcounts$deliciousCount[dat.eventcounts$deliciousCount > 1000000] <- 1

  # rename PMC column
  # Done as add-then-drop, so the renamed column ends up last in the frame.
  dat.eventcounts$almPubMedCentralCount <- dat.eventcounts$almPubMedCount
  dat.eventcounts$almPubMedCount <- NULL

  # There are a few Facebook results from Facebook API with negative numbers
  # Not clear what this means (not in Facebook API docs), so setting to NA
  facebookColumns <- c(
    "facebookShareCount",
    "facebookLikeCount",
    "facebookCommentCount",
    "facebookClickCount"
  )
  for (col in facebookColumns) {
    # which() drops NA comparisons, so existing NAs are left untouched.
    dat.eventcounts[which(dat.eventcounts[[col]] < 0), col] <- NA
  }

  ## article Type, set NAs to "Research Article"
  # and store as a factor
  dat.eventcounts$articleType[is.na(dat.eventcounts$articleType)] <- "Research Article"
  dat.eventcounts$articleType <- factor(dat.eventcounts$articleType)
  summary(dat.eventcounts$articleType)

  ## authorsCount
  dat.eventcounts$authorsCount <- as.numeric(dat.raw.eventcounts$authorsCount)

  ## eliminate columns not in use right now
  dat.eventcounts$pmid <- NULL
  dat.eventcounts$plosSubjectTags <- NULL
  dat.eventcounts$plosSubSubjectTags <- NULL

  ## Look again
  summary(dat.eventcounts)