preprocessing_events.R - This R code reads a CSV file conta…

/scripts/preprocessing_events.R

http://github.com/hpiwowar/alt-metrics_stats · R · 41 lines · 18 code · 7 blank · 16 comment · 2 complexity · 950bd3a266abd6640197a3a81d53a794 MD5 · raw file


#library(Rserve)
#Rserve(args="--no-save")

###### events.txt
# The file has one row for each event.
# In some cases, a row holds a count of that day's occurrences in the "value" column.

## READ DATA
# events.txt is tab-separated with a header row, which is what read.delim()
# expects by default. Text columns are kept as character, not factor.
dat.raw.events <- read.delim("data/raw/events.txt", stringsAsFactors = FALSE)

## Look at it: dimensions, column names, and a per-column summary
dim(dat.raw.events)
colnames(dat.raw.events)
summary(dat.raw.events)

## Some rows stand for several occurrences of an event, so add a column
## number.events giving how many events each row represents: usually 1, but
## the count from the "value" column for event types that store one there.
dat.events <- dat.raw.events
number.rows <- table(dat.events$eventType)
eventTypes <- names(number.rows)
# Event types with fewer rows than the most frequent type are the ones
# treated as having one row per occurrence.
events.with.individual.rows <- names(number.rows[number.rows < max(number.rows)])
# Start from the "value" column read as an integer count; values that are
# really text strings become NA here.
dat.events$number.events <- as.integer(dat.events$value)
# Then count every row of the one-row-per-occurrence types as a single event.
if (length(events.with.individual.rows) > 0) {
  dat.events$number.events[
    dat.events$eventType %in% events.with.individual.rows
  ] <- 1
}

## Consolidate the events into a single table with one row per DOI, one column
## per event type, and the total event count in each cell.
# na.rm = TRUE keeps a single missing count (e.g. a "value" that as.integer()
# could not parse) from turning the whole DOI/eventType total into NA, which
# the zero-fill below would then silently report as 0 events.
dat.events.perDoi <- as.data.frame(
  tapply(
    dat.events$number.events,
    list(dat.events$doi, dat.events$eventType),
    sum,
    na.rm = TRUE
  )
)
dat.events.perDoi$doi <- rownames(dat.events.perDoi)
# tapply() leaves NA for DOI/eventType pairs that have no rows at all;
# those are genuine zeros.
dat.events.perDoi[is.na(dat.events.perDoi)] <- 0

# Look at it: column names, the row for one example DOI, and a summary
colnames(dat.events.perDoi)
dat.events.perDoi["10.1371/journal.pone.0008280", ]
summary(dat.events.perDoi)