PageRenderTime 45ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/examples/slides/infochimps.tex

http://github.com/drewconway/infochimps
LaTeX | 427 lines | 352 code | 53 blank | 22 comment | 0 complexity | bdcd31760c8ace7a2854f3ec4f3b7261 MD5 | raw file
  1. %
  2. % infochimps.tex
  3. %
  4. % Created by Drew Conway on 2010-12-07
  5. %
  6. %
  7. \documentclass[xcolor=dvipsnames, 9pt,handout]{beamer}
  8. \newenvironment{code}{\begin{semiverbatim} \begin{footnotesize}}
  9. {\end{footnotesize}\end{semiverbatim}}
  10. \usepackage{graphicx}
  11. \usepackage{amssymb}
  12. \usepackage{amsfonts}
  13. \usepackage{amsmath}
  14. \usepackage{hyperref}
  15. \usepackage{natbib}
  16. \usepackage{color}
  17. \usepackage{pdfsync}
  18. \usepackage{chancery}
  19. \usepackage{movie15}
  20. \usepackage{pgfpages}
  21. \usepackage{fancyvrb}
  22. \usepackage{colortbl}
  23. \usepackage{listings}
  24. % \definecolor{white}{rgb}{255,255,255}
  25. % \definecolor{darkred}{rgb}{0.5,0,0}
  26. % \definecolor{darkgreen}{rgb}{0,0.5,0}
  27. % \definecolor{lightblue}{rgb}{0,0,0.7}
  28. % \hypersetup{colorlinks,
  29. % linkcolor=white,
  30. % filecolor=darkred,
  31. % urlcolor=lightblue,
  32. % citecolor=darkblue}
  33. \usepackage{beamerthemesplit}
  34. \usetheme{Warsaw}
  35. \usecolortheme[named=Tan]{structure}
  36. \setbeamertemplate{navigation symbols}{}
  37. \setbeamertemplate{itemize items}[triangle]
  38. \setbeamertemplate{enumerate items}[default]
  39. %\setbeameroption{show notes on second screen}
  40. %\logo{\includegraphics[width = 2cm]{nyulogo.png}}
  41. \newcommand{\R}{\mathbb{R}}
  42. \renewcommand{\d}{\mathsf{d}}
  43. \newcommand{\dd}{\partial}
  44. \newcommand{\E}{\mathsf{E}}
  45. \newcommand{\bb}{\mathbf}
  46. \title{How To Be a Real Data Monkey\\Hacking the Infochimps API with R}
  47. \author{Drew Conway}
  48. \date{December 16, 2010}
  49. \begin{document}
  50. \begin{frame}[plain]
  51. \titlepage
  52. \end{frame}
  53. \begin{frame}
  54. \frametitle{Introduction}
  55. \begin{columns}
  56. \column{.5\textwidth}
  57. What is infochimps?
  58. \begin{itemize}
  59. \item Data clearinghouse
  60. \item API
  61. \end{itemize}
  62. The \texttt{infochimps} R package
  63. \begin{itemize}
  64. \item Basic usage framework
  65. \item Looking at the guts
  66. \end{itemize}
  67. Examples
  68. \begin{itemize}
  69. \item Geo-location of blog hits
  70. \item Programming language mentions on Twitter
  71. \end{itemize}
  72. \column{.5\textwidth}
  73. \includegraphics[width=5.5cm]{images/monkeys.jpg}
  74. \end{columns}
  75. \end{frame}
  76. \section{Introduction to Infochimps} % (fold)
  77. \label{sec:introduction_to_infochimps}
  78. \begin{frame}[fragile]
  79. \frametitle{Infochimps.com}
  80. \alert{DISCLAIMER}: I do not work for Infochimps
  81. \begin{itemize}
  82. \item I just think they're awesome
  83. \item Much more info at \url{http://infochimps.com/about/}
  84. \end{itemize}
  85. \vspace{2mm}
  86. %\uncover<2->{\includegraphics[width=3cm]{images/infochimps_logo.jpg}\\
  87. \uncover<2->{\begin{block}{Mission}
  88. We make lists, spreadsheets and datasets easy to find and monkey around with.
  89. \end{block}}
  90. \vspace{2mm}
  91. \uncover<3->{\begin{columns}
  92. \column{.4\textwidth}
  93. Data clearinghouse
  94. \begin{itemize}
  95. \item Buy and sell data sets
  96. \item Handle all overhead
  97. \item Many free data sets, very useful for researchers
  98. \end{itemize}
  99. \column{.6\textwidth}
  100. \includegraphics[width=6.5cm]{images/game_logs.png}
  101. \end{columns}}
  102. \end{frame}
  103. \begin{frame}[fragile]
  104. \frametitle{Infochimps API}
  105. \fbox{\includegraphics[width=11cm]{images/infochimps_screenshot.png}} \\
  106. \vspace{2mm}
  107. \uncover<2->{For more info see vid by Flip Kromer: \url{http://vimeo.com/16819171}}
  108. \end{frame}
  109. % section introduction_to_infochimps (end)
  110. \section{The \texttt{infochimps} R package} % (fold)
  111. \label{sec:infochimps_r_package}
  112. \begin{frame}[fragile]
  113. \frametitle{\texttt{infochimps} R package}
  114. \textbf{Idea}: create functions for every API call to integrate querying in \texttt{R}
  115. \begin{itemize}
  116. \item My first package accepted to CRAN!
  117. \item \url{http://cran.r-project.org/web/packages/infochimps/}
  118. \item Updated as new API calls roll out
  119. \end{itemize}
  120. \uncover<2->{Inspired by other R API wrappers
  121. \begin{itemize}
  122. \item \href{http://cran.r-project.org/web/packages/twitteR/}{\texttt{twitteR}} by Jeff Gentry
  123. \item \href{http://cran.r-project.org/web/packages/IBrokers/}{\texttt{IBrokers}} by Jeffrey Ryan
  124. \item \href{http://cran.r-project.org/src/contrib/Archive/nytR/}{\texttt{nytR}} by Shane Conway (archived)
  125. \end{itemize}}
  126. \uncover<3->{\begin{columns}
  127. \column{.5\textwidth}
  128. \begin{block}{Twitter related}
  129. \begin{itemize}
  130. \item \texttt{conversations}
  131. \item \texttt{influence}
  132. \item \texttt{strong.links}
  133. \item \texttt{trstrank}
  134. \item \texttt{word.bag}
  135. \item \texttt{word.stats}
  136. \end{itemize}
  137. \end{block}
  138. \column{.5\textwidth}
  139. \begin{block}{Geo-location related}
  140. \begin{itemize}
  141. \item \texttt{census}
  142. \item \texttt{demographics}
  143. \item \texttt{domain}
  144. \item \texttt{ip.geo}
  145. \end{itemize}
  146. \end{block}
  147. \vspace{8.5mm}
  148. \end{columns}}
  149. \end{frame}
  150. \begin{frame}[fragile]
  151. \frametitle{Basic usage framework}
  152. \begin{center}
  153. \includegraphics[width=8cm]{images/struct.pdf}
  154. \end{center}
  155. \end{frame}
  156. \begin{frame}[fragile]
  157. \frametitle{My first \texttt{infochimps} call} % two console listings: create a session, then query word.stats
  158. \alert<1>{Generate \texttt{infochimps} session} % R strings in the listings use straight quotes; TeX-style quotes are printed verbatim by listings and are not valid R
  159. \begin{lstlisting}
  160. > library(infochimps)
  161. > my.api<-"some.long.alpha.numeric"
  162. > ic<-infochimps(my.api)
  163. \end{lstlisting}
  164. \alert<2>{Get statistics for word ``data''}
  165. \begin{lstlisting}
  166. > data.stats<-word.stats("data",ic)
  167. > print(data.stats)
  168. $global_stdev_ppb
  169. [1] 2376464
  170. $range
  171. [1] 0.01266617
  172. $tok
  173. [1] "data"
  174. $global_freq_ppb
  175. [1] 151562.4
  176. \end{lstlisting}
  177. \end{frame}
  178. \begin{frame}[fragile]
  179. \frametitle{The \texttt{word.stats} function}
  180. \begin{lstlisting}[language=R]
  181. word.stats <-
  182. function(tok,session) {
  183. word.url<-paste(session$base,"word_stats.json?tok=",
  184. tok,"&apikey=",session$api.key,sep="")
  185. word.get<-getURL(word.url)
  186. word.data<-fromJSON(word.get)
  187. # Simple error checking
  188. if(is.null(word.data$error)) {
  189. return(word.data)
  190. }
  191. else {
  192. warning(word.data$message[[1]])
  193. return(NA)
  194. }
  195. }
  196. \end{lstlisting}
  197. \uncover<2->{\alert<2>{All functions follow this basic framework}}
  198. \end{frame}
  199. % section infochimps_r_package (end)
  200. \section{Blog hits map} % (fold)
  201. \label{sec:blog_hits_map}
  202. \begin{frame}[fragile]
  203. \frametitle{Visualizing the location of blog visitors}
  204. The \texttt{ip.geo} function provides detailed geo-location data for a given IP address
  205. \begin{itemize}
  206. \item City, metro, country and continent codes (with confidence)
  207. \item Zip codes
  208. \item Latitude/Longitude
  209. \item Much more...
  210. \end{itemize}
  211. \uncover<2->{Using web log data from \url{http://drewconway.com/zia}, visualize one day's worth of blog hits
  212. \begin{enumerate}
  213. \item Parse log file by IP address and date/time
  214. \item Use \texttt{ip.geo} to find lat/long for each hit
  215. \item Plot on map using \texttt{ggplot2}
  216. \end{enumerate}}
  217. \end{frame}
  218. \begin{frame}[fragile]
  219. \frametitle{Step 1: Parse log file}
  220. \scriptsize{\begin{lstlisting}[language=R]
  221. # Load libraries
  222. library(infochimps)
  223. library(ggplot2)
  224. library(maps)
  225. # Need to load and clean the data
  226. log.data<-read.delim("data/drewconway_com-Dec-2010.txt",
  227. sep="-", header=FALSE, as.is=TRUE)
  228. log.data<-data.frame(list("IP"=log.data$V1, "Date.Time"=log.data$V3,
  229. "Log"=log.data$V4), stringsAsFactors=FALSE)
  230. log.data$IP<-gsub(" ","", log.data$IP)
  231. # First, get the dates in useable format
  232. log.data$Date.Time<-gsub("[\\[ ]","",log.data$Date.Time)
  233. log.data$Date.Time<-strptime(log.data$Date.Time, format="%d/%b/%Y:%H:%M:%S ")
  234. # Filter out only those logs accessing the right blog post
  235. log.data<-log.data[grep("(\\?p\\=|index\\.php)",log.data$Log),]
  236. \end{lstlisting}}
  237. \end{frame}
  238. \begin{frame}[fragile]
  239. \frametitle{Step 2: Get lat/long data}
  240. \scriptsize{\begin{lstlisting}[language=R]
  241. # Create infochimps session
  242. api.key<-"my.long.alpha.numeric"
  243. ic<-infochimps(api.key)
  244. # Get latitude and longitude data for all of the IPs
  245. ips<-unique(log.data$IP)
  246. get.latlong<-function(ip) {
  247. geo.data<-ip.geo(ip,ic)
  248. return(c(ip, geo.data$lat,geo.data$longitude))
  249. }
  250. # Create data frame to merge into log data
  251. geo.data<-lapply(ips, get.latlong)
  252. geo.df<-as.data.frame(do.call("rbind", geo.data),stringsAsFactors=FALSE)
  253. names(geo.df)<-c("IP","Latitude","Longitude")
  254. log.geo<-merge(log.data,geo.df,by="IP")
  255. log.geo$Latitude<-as.numeric(log.geo$Latitude)
  256. log.geo$Longitude<-as.numeric(log.geo$Longitude)
  257. # Create counts, and sort chronologically
  258. log.count<-ddply(log.geo,.(IP, Date.Time, Latitude, Longitude),
  259. summarise, Count=length(Log))
  260. log.count<-log.count[with(log.count, order(Date.Time)),]
  261. \end{lstlisting}}
  262. \end{frame}
  263. \begin{frame}[fragile]
  264. \frametitle{Step 3: Visualize on global map} % R listing: the loop saves one PNG per iteration (numbered by plot.num); the trailing ffmpeg note joins them into a movie
  265. \scriptsize{\begin{lstlisting}[language=R]
  266. # Ready to visualize
  267. world.map<-data.frame(map(plot=FALSE)[c("x","y")])
  268. # Create frame for every second in data
  269. plot.num<-1
  270. for(d in strftime(log.count$Date.Time)) {
  271. log.sub<-log.count[which(strftime(log.count$Date.Time)==d),]
  272. geo.plot<-ggplot(world.map, aes(x=x,y=y))+geom_path(aes(colour="grey"))
  273. geo.plot<-geo.plot+geom_point(data=log.sub, aes(x=Longitude, y=Latitude,
  274. color="red", alpha=0.75, size=Count))+
  275. annotate("text",x=-125,y=-5,label=strftime(d, format="%H:%M:%S"))+
  276. theme_bw()+scale_colour_manual(values=c("grey"="grey","red"="red"),
  277. legend=FALSE)+
  278. scale_alpha(legend=FALSE)+scale_size(legend=FALSE)+
  279. coord_map(projection="lagrange",ylim=c(-40,70),xlim=c(-145,155))+
  280. opts(panel.grid.major=theme_blank(),axis.ticks=theme_blank(),
  281. axis.text.x=theme_blank(),axis.text.y=theme_blank())+
  282. xlab("")+ylab("")
  283. ggsave(plot=geo.plot,filename=paste("images/maps/",plot.num,".png",sep=""),
  284. width=6,height=4)
  285. plot.num<-plot.num+1
  286. }
  287. # Run this at the command-line to join the files as a movie
  288. # ffmpeg -f image2 -r 5 -i images/maps/%d.png -b 600k blogpost.mp4
  289. \end{lstlisting}}
  290. \end{frame}
  291. \begin{frame}[fragile]
  292. \frametitle{The movie}
  293. \begin{center}
  294. \includemovie[
  295. poster,
  296. text={Blog hits map}
  297. ]{9cm}{7cm}{../blogpost.mp4}
  298. \end{center}
  299. \end{frame}
  300. % section blog_hits_map (end)
  301. \section{Programming languages on Twitter} % (fold)
  302. \label{sec:comparison_of_programming_language_tweeting}
  303. \begin{frame}[fragile]
  304. \frametitle{How do people tweet about different languages?}
  305. \begin{center}
  306. \includegraphics[width=10cm]{images/lang_pop.png}
  307. \end{center}
  308. \begin{tabular}{r}
  309. \hline \\
  310. \scriptsize{Source: \url{http://langpop.com/}, Last updated Sat Nov 27 08:45:50}
  311. \end{tabular}
  312. \end{frame}
  313. \begin{frame}[fragile]
  314. \frametitle{Use the \texttt{word.stats} function}
  315. The \texttt{word.stats} function returns token frequency data
  316. \begin{itemize}
  317. \item Global frequency (parts per billion)
  318. \item Standard deviation of frequency (parts per billion)
  319. \item Range (normalized number of unique users who have used it)
  320. \end{itemize}
  321. \uncover<2->{Use the global frequency to create chart for Twitter mentions
  322. \begin{enumerate}
  323. \item Create vector of computer languages
  324. \item Use \texttt{word.stats} to collect frequency data
  325. \item Plot bar chart with \texttt{ggplot2}
  326. \end{enumerate}}
  327. \uncover<3->{\alert{Disadvantage}: cannot get stats for languages like C, C\#, C++}
  328. \end{frame}
  329. \begin{frame}[fragile]
  330. \frametitle{Getting the data}
  331. \scriptsize{\begin{lstlisting}[language=R]
  332. # Programming languages
  333. prog.langs<-c("java","php","javascript","python","sql","perl","ruby",
  334. "actionscript","assembly","lisp","delphi","pascal","scheme","haskell",
  335. "tcl","lua","fortran","coldfusion","ada","cobol","erlang","smalltalk",
  336. "scala","ocaml","forth","rexx","rstats")
  337. # Get word stats for all languages
  338. lang.stats<-lapply(prog.langs,function(t) unlist(word.stats(t,ic)))
  339. lang.df<-as.data.frame(do.call("rbind",lang.stats), stringsAsFactors=FALSE)
  340. lang.df$global_stdev_ppb<-as.numeric(lang.df$global_stdev_ppb)
  341. lang.df$range<-as.numeric(lang.df$range)
  342. lang.df$global_freq_ppb<-as.numeric(lang.df$global_freq_ppb)
  343. # Dummy for common words
  344. common<-rep(0,nrow(lang.df))
  345. common[match(c("ruby","assembly","lisp","scheme","ada","forth"),lang.df$tok)]<-1
  346. lang.df$common<-as.factor(common)
  347. # Sort by frequency
  348. lang.df<-lang.df[with(lang.df, order(global_freq_ppb)),]
  349. \end{lstlisting}}
  350. \end{frame}
  351. \begin{frame}[fragile]
  352. \frametitle{Visualizing computer languages mentioned on Twitter}
  353. \includegraphics[width=10cm]{images/lang_pop1.png}
  354. \end{frame}
  355. \begin{frame}[fragile]
  356. \frametitle{Ambiguous terms}
  357. \includegraphics[width=10cm]{images/lang_pop2.png}
  358. \end{frame}
  359. \begin{frame}[fragile]
  360. \frametitle{Terms cleaned and logs taken}
  361. \includegraphics[width=10cm]{images/lang_pop3.png}
  362. \end{frame}
  363. % section comparison_of_programming_language_tweeting (end)
  364. \begin{frame}[fragile]
  365. \frametitle{Thank You!}
  366. \begin{tabular}{ll}
  367. E-mail: & drew.conway@nyu.edu \\
  368. Web: & \url{http://drewconway.com/zia} \\
  369. Twitter:& @drewconway
  370. \end{tabular}
  371. \vspace{2cm} \\
  372. Example code and slides available at \url{https://github.com/drewconway/infochimps/tree/master/examples}
  373. \end{frame}
  374. \end{document}