PageRenderTime 18ms CodeModel.GetById 2ms app.highlight 11ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/preprocessing_eventcounts.R

http://github.com/hpiwowar/alt-metrics_stats
R | 83 lines | 35 code | 23 blank | 25 comment | 2 complexity | 03d07d635a793026426f0437548787d1 MD5 | raw file
 1#library(Rserve)
 2#Rserve(args="--no-save")
 3
 4######## event_counts.txt
 5## This data contains into on metrics for which we only have aggregate counts
 6
 7### READ DATA
 8dat.raw.eventcounts = read.csv("data/raw/event_counts.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
 9
10## Look at it
11dim(dat.raw.eventcounts)
12names(dat.raw.eventcounts)
13summary(dat.raw.eventcounts)
14
## A bit of data cleaning
# Work on a copy so the raw import stays available for reference.
dat.eventcounts <- dat.raw.eventcounts

## Make sure all data has good DOIs, detects rogue line breaks etc.
# A good DOI starts with "10.".  %in% never yields NA, so a missing DOI is
# flagged as bad (FALSE) instead of producing an NA in the mask; an NA mask
# would make the row listing below return all-NA filler rows rather than the
# offending records.
hasGoodDoi <- substr(dat.raw.eventcounts$doi, 1, 3) %in% "10."
summary(hasGoodDoi)
dat.eventcounts[!hasGoodDoi, ]
22
# Now create a date type variable
# The format matches a leading YYYY-MM-DD followed by a literal "T".
# Parsing in UTC (instead of the machine's local zone) means no value can sit
# on either side of a daylight-saving change, so the day differences computed
# below are always whole numbers.
dat.eventcounts$pubDate <- strptime(dat.eventcounts$pubDate, "%Y-%m-%dT", tz = "UTC")
summary(dat.eventcounts$pubDate)

# Create a column that has days since published, counted back from the most
# recent publication date in the data set.
# na.rm = TRUE keeps one unparseable date from turning the reference date,
# and with it the whole column, into NA; rows whose own date is NA still get
# NA here.
dat.eventcounts$daysSincePublished <- as.integer(difftime(
  max(dat.eventcounts$pubDate, na.rm = TRUE),
  dat.eventcounts$pubDate,
  units = "days"
))
hist(dat.eventcounts$daysSincePublished)
30
## Adjust some fields so they are the right datatype.

# Journal names are categorical: store them as a factor
dat.eventcounts$journal <- factor(dat.raw.eventcounts$journal)

# f1000Factor strings become counts.  Values as.integer() cannot parse
# (e.g. "false") come back as NA and are then recorded as a count of 0.
dat.eventcounts$f1000Factor <- local({
  f1000 <- as.integer(dat.raw.eventcounts$f1000Factor)
  replace(f1000, is.na(f1000), 0)
})
39
# Change NAs to 0s in wikipediaCites, mendeleyReadersCount and
# facebookClickCount: a missing count is recorded as zero events.
for (col in c("wikipediaCites", "mendeleyReadersCount", "facebookClickCount")) {
  dat.eventcounts[[col]][is.na(dat.eventcounts[[col]])] <- 0
}
48
# delicious count looks strange for now
# Coerce to integer, record unparseable/missing values as 0, and collapse
# any count above one million down to 1.
dat.eventcounts$deliciousCount <- local({
  delicious <- as.integer(dat.raw.eventcounts$deliciousCount)
  delicious <- replace(delicious, is.na(delicious), 0)
  replace(delicious, delicious > 1000000, 1)
})
53
# rename PMC column
# Done as copy-then-drop, so the renamed column ends up as the last column
# of the data frame rather than in its original position.
dat.eventcounts[["almPubMedCentralCount"]] <- dat.eventcounts[["almPubMedCount"]]
dat.eventcounts[["almPubMedCount"]] <- NULL
57
# There are a few Facebook results from Facebook API with negative numbers
# Not clear what this means (not in Facebook API docs), so setting to NA
facebookColumns <- c(
  "facebookShareCount",
  "facebookLikeCount",
  "facebookCommentCount",
  "facebookClickCount"
)
for (col in facebookColumns) {
  # which() drops NA comparisons, so only genuinely negative entries are hit
  is.na(dat.eventcounts[[col]]) <- which(dat.eventcounts[[col]] < 0)
}
64
## article Type: entries that are NA in the raw data are labelled
# "Research Article", then the column is stored as a factor
dat.eventcounts$articleType <- factor(replace(
  dat.eventcounts$articleType,
  is.na(dat.raw.eventcounts$articleType),
  "Research Article"
))
summary(dat.eventcounts$articleType)
70
## authorsCount: store as a numeric (double) column
dat.eventcounts$authorsCount <- as.double(dat.raw.eventcounts$authorsCount)

## eliminate columns not in use right now
# Keep every column whose name is not in the unused list; the remaining
# columns stay in their existing order.
dat.eventcounts <- dat.eventcounts[
  !(names(dat.eventcounts) %in% c("pmid", "plosSubjectTags", "plosSubSubjectTags"))
]


## Look again at the cleaned data
summary(dat.eventcounts)
82
83