PageRenderTime 15ms CodeModel.GetById 12ms app.highlight 1ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/preprocessing_events.R

http://github.com/hpiwowar/alt-metrics_stats
R | 41 lines | 18 code | 7 blank | 16 comment | 2 complexity | 950bd3a266abd6640197a3a81d53a794 MD5 | raw file
 1#library(Rserve)
 2#Rserve(args="--no-save")
 3
 4###### events.txt
 5# file has one row for each event.
 6# in some cases, an event contains a count of occurances that day in the "values" column
 7
 8## READ DATA
 9dat.raw.events = read.csv("data/raw/events.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
10
11## Look at it
12dim(dat.raw.events)
13names(dat.raw.events)
14summary(dat.raw.events)
15
## Adjust for rows that stand for more than one occurrence.
## A new column, number.events, is usually 1, but for event types that pack
## several occurrences into one row it takes the count stored in the "value"
## column.
dat.events <- dat.raw.events
number.rows <- table(dat.events$eventType)
eventTypes <- names(number.rows)

# Start by reading "value" as the number of occurrences. Entries that are not
# numeric text become NA here (R emits a coercion warning).
dat.events$number.events <- as.integer(dat.events$value)

# For many event types "value" is really a text string, not a count.
# Every event type with fewer rows than the most frequent one is treated as
# having one row per event, so its count is overwritten with 1.
events.with.individual.rows <- names(
  number.rows[number.rows < max(number.rows)]
)
rows.with.single.event <-
  dat.events$eventType %in% events.with.individual.rows
# Only assign when something matches, so the column is left untouched
# (still integer) if no event type qualifies.
if (any(rows.with.single.event)) {
  dat.events$number.events[rows.with.single.event] <- 1
}
31
## Consolidate into one table: a row per DOI, a column per event type, and
## the summed number.events in each cell.
dat.events.perDoi <- as.data.frame(
  tapply(
    X = dat.events$number.events,
    INDEX = list(dat.events$doi, dat.events$eventType),
    FUN = sum
  )
)
# Keep the DOI as an ordinary column in addition to the row names.
dat.events.perDoi[["doi"]] <- rownames(dat.events.perDoi)
# tapply leaves NA for DOI / event-type pairs that have no rows; record
# those as zero events.
# NOTE(review): a sum that is NA because number.events itself contains NA is
# zeroed here as well -- confirm that is intended.
dat.events.perDoi[is.na(dat.events.perDoi)] <- 0

## Quick inspection: column names, one example DOI, per-column summary
colnames(dat.events.perDoi)
dat.events.perDoi["10.1371/journal.pone.0008280", ]
summary(dat.events.perDoi)
41