PageRenderTime 50ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/github-key/main.go

https://bitbucket.org/ymotongpoo/github-keys
Go | 333 lines | 265 code | 33 blank | 35 comment | 56 complexity | 1fcc81789aa2cbf764ff34a5f2cb35e4 MD5 | raw file
  1. package main
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "io/ioutil"
  7. "log"
  8. "net/http"
  9. "os"
  10. "regexp"
  11. "runtime"
  12. "strconv"
  13. "strings"
  14. "time"
  15. "code.google.com/p/go.net/html"
  16. gq "github.com/PuerkitoBio/goquery"
  17. "github.com/vmihailenco/redis"
  18. )
  // General crawler settings.
  const (
  	// BufferSize is the capacity used for the buffered channels created in main.
  	BufferSize = 1000
  	// Filename is the path of the old text-file database that main tries to
  	// restore into Redis at startup.
  	Filename = "github-keys.txt"
  )

  // GitHub settings.
  const (
  	// TimelineURL is the GitHub timeline page that is polled for usernames.
  	TimelineURL = "https://github.com/timeline"
  	// SearchURL is the base URL of GitHub search; the query string is appended to it.
  	SearchURL = "https://github.com/search"
  	// GistTimelineURL is the Gist discovery page URL; the page number is appended to it.
  	GistTimelineURL = "https://gist.github.com/discover?page="
  	// GistMaxPages is the number of pages walked by the initial Gist and search crawls.
  	GistMaxPages = 100
  	// GistIteratePages is the number of Gist pages fetched on every periodic Gist poll.
  	GistIteratePages = 20
  	// ChromeUA is the User-Agent header value sent with page requests.
  	ChromeUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
  )

  // Redis settings.
  const (
  	// RedisSockFile is the UNIX socket file used to connect to Redis.
  	RedisSockFile = "/tmp/redis.sock"
  	// RedisUserSet is the key name of the set holding all known usernames.
  	RedisUserSet = "user"
  	// RedisKeyList is the format of the key name of the list holding one
  	// user's SSH keys; %s is replaced by the username.
  	RedisKeyList = "key:%s"
  	// RedisKeyCount is the key name of the counter tracking the total number
  	// of SSH keys in the database.
  	RedisKeyCount = "keycount"
  )
  // Pacing intervals used by the crawler and the Redis writer.
  var (
  	// TLPollInterval is the delay between two polls of the GitHub timeline.
  	TLPollInterval = 30 * time.Second
  	// PagenationInterval is the pause between two Gist discovery page requests
  	// during the initial crawl.
  	PagenationInterval = 3 * time.Second
  	// SearchInterval is the pause between two search page requests.
  	SearchInterval = 3 * time.Second
  	// KeyFetchInterval is the pause after each key fetch is started.
  	KeyFetchInterval = time.Second
  	// GistPollInterval is the delay between two periodic Gist polls.
  	GistPollInterval = time.Hour
  	// RedisSaveInterval is the delay between two background saves of Redis.
  	RedisSaveInterval = 10 * time.Minute
  )

  // UsernameRe captures the first path segment made of letters, digits,
  // underscores and hyphens that follows a slash.
  var UsernameRe = regexp.MustCompile(`\/([0-9A-Za-z_\-]+)`)
  // Keys pairs a username with the SSH keys fetched for that user.
  // The field order matters: values are built with positional literals.
  type Keys struct {
  	// User is the GitHub username the keys belong to.
  	User string
  	// SSHKeys holds the user's SSH public keys, one entry per line of the
  	// fetched key file.
  	SSHKeys []string
  }
  54. // extractUsername extract selector s from GitHub site wide
  55. // "#wrapper" element, and returns username and quit flag.
  56. func extractUsername(d *gq.Document, s string) (<-chan string, <-chan int) {
  57. c := make(chan string)
  58. quit := make(chan int)
  59. go func() {
  60. d.Find("#wrapper").
  61. Find(s).
  62. Find("a").Each(func(_ int, s *gq.Selection) {
  63. href, exist := s.Attr("href")
  64. if !exist {
  65. return
  66. }
  67. match := UsernameRe.FindStringSubmatch(href)
  68. if len(match) == 2 {
  69. c <- match[1]
  70. }
  71. })
  72. quit <- 0
  73. }()
  74. return c, quit
  75. }
  76. // extractUsernameFromTL extract username from GitHub timeline page,
  77. // and returns channels of username and quit flag.
  78. // TODO(ymotongpoo): Extract username from repository path as well.
  79. func extractUsernameFromTL(d *gq.Document) (<-chan string, <-chan int) {
  80. return extractUsername(d, "div.title")
  81. }
  82. // extractUsernameFromGist extract username from Gist discovery page,
  83. // and returns channels of username and quit flag.
  84. func extractUsernameFromGist(d *gq.Document) (<-chan string, <-chan int) {
  85. return extractUsername(d, "span.creator")
  86. }
  87. // extractUsernameFromSearch extract username from GitHub search page,
  88. // and returns channels of username and quit flag.
  89. func extractUsernameFromSearch(d *gq.Document) (<-chan string, <-chan int) {
  90. return extractUsername(d, "div.user-list-item")
  91. }
  92. // getPage stores usernames and errors into passed channels.
  93. func getPage(url, message string,
  94. extractFunc func(d *gq.Document) (<-chan string, <-chan int),
  95. queue chan<- string, e chan<- error) {
  96. req, err := http.NewRequest("GET", url, nil)
  97. if err != nil {
  98. e <- err
  99. return
  100. }
  101. client := &http.Client{}
  102. req.Header.Set("User-Agent", ChromeUA)
  103. resp, err := client.Do(req)
  104. if err != nil {
  105. e <- err
  106. return
  107. }
  108. defer resp.Body.Close()
  109. log.Println(message + " : " + resp.Status)
  110. if resp.StatusCode == 200 {
  111. root, err := html.Parse(resp.Body)
  112. if err != nil {
  113. e <- err
  114. }
  115. doc := gq.NewDocumentFromNode(root)
  116. c, quit := extractFunc(doc)
  117. for {
  118. select {
  119. case username := <-c:
  120. queue <- username
  121. case <-quit:
  122. return
  123. }
  124. }
  125. }
  126. return
  127. }
  128. // GetTimeline stores usernames on GitHub timeline and errors
  129. // into channels queue and e.
  130. func GetTimeline(queue chan<- string, e chan<- error) {
  131. getPage(TimelineURL, "timeline", extractUsernameFromTL, queue, e)
  132. return
  133. }
  134. // GetGistTimeline stores usernames on a gist discovery page into channnel queue
  135. // and put errors into channel e.
  136. func GetGistTimeline(page int, queue chan<- string, e chan<- error) {
  137. url := GistTimelineURL + strconv.Itoa(page)
  138. message := "gist page " + strconv.Itoa(page)
  139. getPage(url, message, extractUsernameFromGist, queue, e)
  140. return
  141. }
  142. // GetSearchResult stores usernaem on a GitHub search page into channel queue
  143. // and put errors into channel e.
  144. func GetSearchResult(option string, queue chan<- string, e chan<- error) {
  145. url := SearchURL + option
  146. message := option
  147. getPage(url, message, extractUsernameFromSearch, queue, e)
  148. return
  149. }
  150. // FetchKeys fetch ssh key data from user specific key page and store those
  151. // into channel keys, and put errors into channel e.
  152. func FetchKeys(username string, keys chan<- Keys, e chan<- error) {
  153. url := "https://github.com/" + username + ".keys"
  154. resp, err := http.Get(url)
  155. if err != nil {
  156. e <- err
  157. return
  158. }
  159. defer resp.Body.Close()
  160. log.Println("key " + username + " : " + resp.Status)
  161. if resp.StatusCode == 200 {
  162. data, err := ioutil.ReadAll(resp.Body)
  163. if err != nil {
  164. e <- err
  165. return
  166. }
  167. sshKeys := strings.Split(string(data), "\n")
  168. keys <- Keys{username, sshKeys}
  169. }
  170. return
  171. }
  172. // InitRedis restores user and SSH key data from old text file database.
  173. func InitRedis(file *os.File) error {
  174. client := redis.NewUnixClient(RedisSockFile, "", -1)
  175. defer client.Close()
  176. var exist *redis.BoolReq
  177. data, err := ioutil.ReadAll(file)
  178. if err != nil {
  179. return err
  180. }
  181. b := bytes.NewBuffer(data)
  182. for {
  183. buf, err := b.ReadString('\n')
  184. switch err {
  185. case io.EOF:
  186. line := strings.TrimSpace(buf)
  187. columns := strings.Split(line, "\t")
  188. if len(columns) == 2 {
  189. username := columns[0]
  190. exist = client.SIsMember(RedisUserSet, username)
  191. if !exist.Val() {
  192. client.SAdd(RedisUserSet, username)
  193. }
  194. }
  195. count := client.SCard(RedisUserSet)
  196. log.Println("init: loaded " + strconv.FormatInt(count.Val(), 10) + " users")
  197. return nil
  198. case nil:
  199. line := strings.TrimSpace(buf)
  200. columns := strings.Split(line, "\t")
  201. if len(columns) == 2 {
  202. username := columns[0]
  203. exist = client.SIsMember(RedisUserSet, username)
  204. if !exist.Val() {
  205. client.SAdd(RedisUserSet, username)
  206. }
  207. }
  208. default:
  209. return err
  210. }
  211. }
  212. return nil
  213. }
  214. func StartCrawling(usernames chan string, errors chan error) {
  215. // initial call
  216. go GetTimeline(usernames, errors)
  217. go func() {
  218. for i := 0; i < GistMaxPages; i++ {
  219. go GetGistTimeline(i+1, usernames, errors)
  220. time.Sleep(PagenationInterval)
  221. }
  222. }()
  223. // Searching with advanced option: "followers = 0"
  224. go func() {
  225. for i := 0; i < GistMaxPages; i++ {
  226. option := "?q=followers%3A0&type=Users&p=" + strconv.Itoa(i+1)
  227. go GetSearchResult(option, usernames, errors)
  228. time.Sleep(SearchInterval)
  229. }
  230. }()
  231. // interval call to github timeline and gist respectively
  232. for {
  233. select {
  234. case <-time.After(TLPollInterval):
  235. log.Println(time.Now())
  236. go GetTimeline(usernames, errors)
  237. case <-time.After(GistPollInterval):
  238. log.Println(time.Now())
  239. for i := 0; i < GistIteratePages; i++ {
  240. go GetGistTimeline(i+1, usernames, errors)
  241. }
  242. }
  243. }
  244. }
  245. // CheckUserKeys create batches of unchecked users and
  246. func CheckUserKeys(usernames chan string, keys chan Keys, errors chan error) {
  247. userClient := redis.NewUnixClient(RedisSockFile, "", -1)
  248. defer userClient.Close()
  249. var exist *redis.BoolReq
  250. for {
  251. select {
  252. case u := <-usernames:
  253. exist = userClient.SIsMember(RedisUserSet, u)
  254. if !exist.Val() {
  255. userClient.SAdd(RedisUserSet, u)
  256. go FetchKeys(u, keys, errors)
  257. time.Sleep(KeyFetchInterval)
  258. }
  259. case e := <-errors:
  260. log.Println(e)
  261. }
  262. }
  263. }
  264. func main() {
  265. runtime.GOMAXPROCS(runtime.NumCPU())
  266. // Check if old text file database exists.
  267. var file *os.File
  268. var err error
  269. file, err = os.Open(Filename)
  270. if err == nil { // If it exsits, restore all data into Redis server.
  271. defer file.Close()
  272. log.Println("init: loading users...")
  273. err = InitRedis(file)
  274. } else {
  275. log.Println("init: starting new db...")
  276. }
  277. usernames := make(chan string, BufferSize)
  278. errors := make(chan error, BufferSize)
  279. keys := make(chan Keys, BufferSize)
  280. // Trigger goroutine
  281. go StartCrawling(usernames, errors)
  282. // username & error channel handling
  283. go CheckUserKeys(usernames, keys, errors)
  284. // key channel handling
  285. client := redis.NewUnixClient(RedisSockFile, "", -1)
  286. var status *redis.StatusReq
  287. var numKeys *redis.IntReq
  288. for {
  289. select {
  290. case k := <-keys:
  291. numKeys = client.RPush(fmt.Sprintf(RedisKeyList, k.User), k.SSHKeys...)
  292. client.IncrBy(RedisKeyCount, numKeys.Val())
  293. case <-time.After(RedisSaveInterval):
  294. status = client.BgSave()
  295. if status.Val() != "" {
  296. log.Println(status.Val())
  297. }
  298. }
  299. }
  300. }