PageRenderTime 39ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/examples/ip_map.R

http://github.com/drewconway/infochimps
R | 77 lines | 38 code | 15 blank | 24 comment | 1 complexity | c8cfceaff4159a4f0df20c94d915ad86 MD5 | raw file
  1. # File-Name: ip_map.R
  2. # Date: 2010-12-03
  3. # Author: Drew Conway
  4. # Email: drew.conway@nyu.edu
  5. # Purpose: Generate map of IP addresses from blog post http://www.drewconway.com/zia/?p=2537
  6. # Data Used: Raw log files from drewconway.com
  7. # Packages Used: infochimps, ggplot2
  8. # Output File:
  9. # Data Output:
  10. # Machine: Drew Conway's MacBook Pro
  11. # Copyright (c) 2010, under the Simplified BSD License.
  12. # For more information on FreeBSD see: http://www.opensource.org/licenses/bsd-license.php
  13. # All rights reserved.
  14. library(infochimps)
  15. library(ggplot2)
  16. library(maps)
  # Load the raw access log and reduce it to the three fields used below.
  # Note: your log data may be formatted differently, in which case the column
  # picks and clean-up rules in this section may not work properly.
  log.data <- read.delim("your.log.data.txt", sep = "-", header = FALSE, as.is = TRUE)
  log.data <- data.frame(
    IP = log.data$V1,
    Date.Time = log.data$V3,
    Log = log.data$V4,
    stringsAsFactors = FALSE
  )
  # Strip the padding spaces left around the IP by splitting on "-"
  log.data$IP <- gsub(" ", "", log.data$IP)
  # Get the dates into a usable format: drop "[" and spaces, then parse.
  # strptime() returns POSIXlt (a list-based class); convert to POSIXct so the
  # column is a plain atomic vector that sorts, merges and groups reliably
  # inside a data frame.
  log.data$Date.Time <- gsub("[\\[ ]", "", log.data$Date.Time)
  log.data$Date.Time <- as.POSIXct(
    strptime(log.data$Date.Time, format = "%d/%b/%Y:%H:%M:%S ")
  )
  # Keep only the requests whose log entry matches "?p=" or "index.php"
  log.data <- log.data[grep("(\\?p\\=|index\\.php)", log.data$Log), ]
  # Create infochimps session.
  # Replace the placeholder with your own key, or export INFOCHIMPS_API_KEY so
  # the key does not have to be written into the script.
  api.key <- Sys.getenv("INFOCHIMPS_API_KEY", unset = "your.infochimps.api.key")
  ic <- infochimps(api.key)
  # Latitude/longitude is looked up per IP, so first reduce the log to the
  # distinct addresses (first occurrence of each, in log order)
  ips <- log.data$IP[!duplicated(log.data$IP)]
  # Look up the geolocation of one IP address through the infochimps session.
  #
  # Args:
  #   ip: a single IP address string.
  # Returns:
  #   A character vector of length 3: c(ip, latitude, longitude). A coordinate
  #   that the lookup did not return is NA, so every result has the same length
  #   and the results can be row-bound without silent recycling.
  # Relies on the global session object `ic`.
  get.latlong <- function(ip) {
    # Local name differs from the script-level `geo.data` list to avoid shadowing
    geo <- ip.geo(ip, ic)
    # NULL / zero-length fields become NA; otherwise keep the first value
    first.or.na <- function(value) {
      if (length(value) == 0) NA else value[[1]]
    }
    # `$lat` is kept as in the original lookup (it partially matches a longer
    # field name such as "latitude" on a list) -- NOTE(review): confirm the
    # exact field names returned by ip.geo()
    c(ip, first.or.na(geo$lat), first.or.na(geo$longitude))
  }
  # Geocode every unique IP (one API call each, so this can take a while for
  # large logs), stack the results into a table and join it onto the log rows
  geo.data <- lapply(ips, get.latlong)
  geo.df <- as.data.frame(do.call("rbind", geo.data), stringsAsFactors = FALSE)
  colnames(geo.df) <- c("IP", "Latitude", "Longitude")
  log.geo <- merge(log.data, geo.df, by = "IP")
  # The coordinates arrive as text (they were combined with the IP string)
  log.geo[c("Latitude", "Longitude")] <- lapply(
    log.geo[c("Latitude", "Longitude")],
    as.numeric
  )
  # Create counts: one row per IP / timestamp / location with the number of
  # matching log entries, then sort chronologically.
  # ddply(), .() and summarise() live in plyr, which this script never attaches
  # itself; qualify them so the step does not depend on another package
  # attaching plyr or on which package last masked `summarise`.
  log.count <- plyr::ddply(
    log.geo,
    plyr::.(IP, Date.Time, Latitude, Longitude),
    plyr::summarise,
    Count = length(Log)
  )
  log.count <- log.count[order(log.count$Date.Time), ]
  # Ready to visualize: write one PNG frame per row of log.count, each showing
  # the world outline plus the hits recorded at that row's timestamp.
  world.map <- data.frame(map(plot = FALSE)[c("x", "y")])
  # Make sure the frame directory exists before the first ggsave() call
  frame.dir <- file.path("images", "maps")
  dir.create(frame.dir, recursive = TRUE, showWarnings = FALSE)
  # Loop-invariant work: format the timestamps once and build the outline once
  frame.stamps <- strftime(log.count$Date.Time)
  base.map <- ggplot(world.map, aes(x = x, y = y)) + geom_path(aes(colour = "grey"))
  # NOTE(review): legend=, opts() and theme_blank() are the ggplot2 API this
  # script was written against; later ggplot2 releases replaced them with
  # guide=, theme() and element_blank().
  # NOTE(review): if several rows share a timestamp, the same frame is written
  # once per row -- confirm that this repetition is intended.
  plot.num <- 1
  for (d in frame.stamps) {
    # All rows recorded at this frame's timestamp
    log.sub <- log.count[which(frame.stamps == d), ]
    geo.plot <- base.map +
      geom_point(data = log.sub, aes(x = Longitude, y = Latitude, color = "red", alpha = 0.75, size = Count)) +
      annotate("text", x = -125, y = -5, label = strftime(d, format = "%H:%M:%S")) +
      theme_bw() +
      scale_colour_manual(values = c("grey" = "grey", "red" = "red"), legend = FALSE) +
      scale_alpha(legend = FALSE) +
      scale_size(legend = FALSE) +
      coord_map(projection = "lagrange", ylim = c(-40, 70), xlim = c(-145, 155)) +
      opts(
        panel.grid.major = theme_blank(),
        axis.ticks = theme_blank(),
        axis.text.x = theme_blank(),
        axis.text.y = theme_blank()
      ) +
      xlab("") +
      ylab("")
    # Frames are numbered 1, 2, 3, ... to match ffmpeg's %d.png input pattern
    ggsave(
      plot = geo.plot,
      filename = file.path(frame.dir, paste(plot.num, ".png", sep = "")),
      width = 6,
      height = 4
    )
    plot.num <- plot.num + 1
  }
  61. # Run this at the command-line to join the files as a movie
  62. # ffmpeg -f image2 -r 10 -i images/maps/%d.png -b 600k blogpost.mp4