PageRenderTime 37ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/preprocessing_events.R

http://github.com/hpiwowar/alt-metrics_stats
R | 41 lines | 18 code | 7 blank | 16 comment | 2 complexity | 950bd3a266abd6640197a3a81d53a794 MD5 | raw file
Possible License(s): MIT
  1. #library(Rserve)
  2. #Rserve(args="--no-save")
  3. ###### events.txt
  4. # file has one row for each event.
  5. # in some cases, an event contains a count of occurances that day in the "values" column
  6. ## READ DATA
  7. dat.raw.events = read.csv("data/raw/events.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
  8. ## Look at it
  9. dim(dat.raw.events)
  10. names(dat.raw.events)
  11. summary(dat.raw.events)
  12. ## Now adjust because some events include multiple occurances
  13. ## add a new column called number.events that is ususally 1
  14. ## but is set to the number of occurances in the "value" column
  15. ### for datatypes with more than one occurance per row
  16. dat.events = dat.raw.events
  17. number.rows = table(dat.events$eventType)
  18. eventTypes = names(number.rows)
  19. # Assign the contents of value as the number of occurances
  20. dat.events$number.events = as.integer(dat.events$value)
  21. # But for many datatypes the value content is actually a text string.
  22. # for these, overwrite with the number of occurances with 1.
  23. events.with.individual.rows = names(number.rows[number.rows < max(number.rows)])
  24. for (myEventType in events.with.individual.rows) {
  25. dat.events$number.events[dat.events$eventType == myEventType] = 1
  26. }
  27. ## now consolidate these events into a single table of dois, eventsType, and total count
  28. dat.events.perDoi = as.data.frame(tapply(dat.events$number.events, list(dat.events$doi, dat.events$eventType), sum))
  29. dat.events.perDoi$doi = rownames(dat.events.perDoi)
  30. dat.events.perDoi[is.na(dat.events.perDoi)] = 0
  31. # look at it
  32. colnames(dat.events.perDoi)
  33. dat.events.perDoi["10.1371/journal.pone.0008280",]
  34. summary(dat.events.perDoi)