/github-key/main.go
Go | 333 lines | 265 code | 33 blank | 35 comment | 56 complexity | 1fcc81789aa2cbf764ff34a5f2cb35e4 MD5 | raw file
- package main
- import (
- "bytes"
- "fmt"
- "io"
- "io/ioutil"
- "log"
- "net/http"
- "os"
- "regexp"
- "runtime"
- "strconv"
- "strings"
- "time"
- "code.google.com/p/go.net/html"
- gq "github.com/PuerkitoBio/goquery"
- "github.com/vmihailenco/redis"
- )
const (
	// BufferSize is the capacity used for the buffered username, error and
	// key channels created in main.
	BufferSize = 1000
	// Filename is the old text file database that is loaded into Redis at
	// startup when it exists.
	Filename = "github-keys.txt"
	// GitHub settings
	//
	// TimelineURL is the GitHub timeline page polled for usernames.
	TimelineURL = "https://github.com/timeline"
	// SearchURL is the GitHub search endpoint; a query string is appended to it.
	SearchURL = "https://github.com/search"
	// GistTimelineURL is the Gist discovery page; a page number is appended to it.
	GistTimelineURL = "https://gist.github.com/discover?page="
	// GistMaxPages is the number of pages walked by the initial gist and
	// search crawls.
	GistMaxPages = 100
	// GistIteratePages is the number of gist pages fetched on each periodic
	// gist poll.
	GistIteratePages = 20
	// ChromeUA is the User-Agent header value sent when fetching pages.
	ChromeUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
	// Redis settings
	//
	// RedisSockFile is the UNIX socket file for redis.
	RedisSockFile = "/tmp/redis.sock"
	// RedisUserSet is the key name of the set holding all known usernames.
	RedisUserSet = "user"
	// RedisKeyList is the key name format of the list holding one user's
	// SSH keys; %s is replaced with the username.
	RedisKeyList = "key:%s"
	// RedisKeyCount is the key name of the counter tracking the total number
	// of SSH keys in the database.
	RedisKeyCount = "keycount"
)
var (
	// TLPollInterval is the period between polls of the GitHub timeline.
	TLPollInterval = 30 * time.Second
	// PagenationInterval is the pause between successive gist page requests
	// during the initial crawl.
	PagenationInterval = 3 * time.Second
	// SearchInterval is the pause between successive search page requests
	// during the initial crawl.
	SearchInterval = 3 * time.Second
	// KeyFetchInterval is the pause taken after launching a key fetch for a
	// newly seen user.
	KeyFetchInterval = 1 * time.Second
	// GistPollInterval is the period between re-polls of the gist pages.
	GistPollInterval = 1 * time.Hour
	// RedisSaveInterval is the period after which a Redis background save
	// is requested.
	RedisSaveInterval = 10 * time.Minute
	// UsernameRe matches a slash followed by a run of letters, digits,
	// underscores or hyphens, capturing that run as the username.
	UsernameRe = regexp.MustCompile(`\/([0-9A-Za-z_\-]+)`)
)
// Keys pairs a username with the list of SSH keys fetched for that user.
type Keys struct {
	// User is the GitHub username the keys belong to.
	User string
	// SSHKeys holds the lines read from the user's ".keys" page.
	SSHKeys []string
}
- // extractUsername extract selector s from GitHub site wide
- // "#wrapper" element, and returns username and quit flag.
- func extractUsername(d *gq.Document, s string) (<-chan string, <-chan int) {
- c := make(chan string)
- quit := make(chan int)
- go func() {
- d.Find("#wrapper").
- Find(s).
- Find("a").Each(func(_ int, s *gq.Selection) {
- href, exist := s.Attr("href")
- if !exist {
- return
- }
- match := UsernameRe.FindStringSubmatch(href)
- if len(match) == 2 {
- c <- match[1]
- }
- })
- quit <- 0
- }()
- return c, quit
- }
- // extractUsernameFromTL extract username from GitHub timeline page,
- // and returns channels of username and quit flag.
- // TODO(ymotongpoo): Extract username from repository path as well.
- func extractUsernameFromTL(d *gq.Document) (<-chan string, <-chan int) {
- return extractUsername(d, "div.title")
- }
- // extractUsernameFromGist extract username from Gist discovery page,
- // and returns channels of username and quit flag.
- func extractUsernameFromGist(d *gq.Document) (<-chan string, <-chan int) {
- return extractUsername(d, "span.creator")
- }
- // extractUsernameFromSearch extract username from GitHub search page,
- // and returns channels of username and quit flag.
- func extractUsernameFromSearch(d *gq.Document) (<-chan string, <-chan int) {
- return extractUsername(d, "div.user-list-item")
- }
- // getPage stores usernames and errors into passed channels.
- func getPage(url, message string,
- extractFunc func(d *gq.Document) (<-chan string, <-chan int),
- queue chan<- string, e chan<- error) {
- req, err := http.NewRequest("GET", url, nil)
- if err != nil {
- e <- err
- return
- }
- client := &http.Client{}
- req.Header.Set("User-Agent", ChromeUA)
- resp, err := client.Do(req)
- if err != nil {
- e <- err
- return
- }
- defer resp.Body.Close()
- log.Println(message + " : " + resp.Status)
- if resp.StatusCode == 200 {
- root, err := html.Parse(resp.Body)
- if err != nil {
- e <- err
- }
- doc := gq.NewDocumentFromNode(root)
- c, quit := extractFunc(doc)
- for {
- select {
- case username := <-c:
- queue <- username
- case <-quit:
- return
- }
- }
- }
- return
- }
- // GetTimeline stores usernames on GitHub timeline and errors
- // into channels queue and e.
- func GetTimeline(queue chan<- string, e chan<- error) {
- getPage(TimelineURL, "timeline", extractUsernameFromTL, queue, e)
- return
- }
- // GetGistTimeline stores usernames on a gist discovery page into channnel queue
- // and put errors into channel e.
- func GetGistTimeline(page int, queue chan<- string, e chan<- error) {
- url := GistTimelineURL + strconv.Itoa(page)
- message := "gist page " + strconv.Itoa(page)
- getPage(url, message, extractUsernameFromGist, queue, e)
- return
- }
- // GetSearchResult stores usernaem on a GitHub search page into channel queue
- // and put errors into channel e.
- func GetSearchResult(option string, queue chan<- string, e chan<- error) {
- url := SearchURL + option
- message := option
- getPage(url, message, extractUsernameFromSearch, queue, e)
- return
- }
- // FetchKeys fetch ssh key data from user specific key page and store those
- // into channel keys, and put errors into channel e.
- func FetchKeys(username string, keys chan<- Keys, e chan<- error) {
- url := "https://github.com/" + username + ".keys"
- resp, err := http.Get(url)
- if err != nil {
- e <- err
- return
- }
- defer resp.Body.Close()
- log.Println("key " + username + " : " + resp.Status)
- if resp.StatusCode == 200 {
- data, err := ioutil.ReadAll(resp.Body)
- if err != nil {
- e <- err
- return
- }
- sshKeys := strings.Split(string(data), "\n")
- keys <- Keys{username, sshKeys}
- }
- return
- }
- // InitRedis restores user and SSH key data from old text file database.
- func InitRedis(file *os.File) error {
- client := redis.NewUnixClient(RedisSockFile, "", -1)
- defer client.Close()
- var exist *redis.BoolReq
- data, err := ioutil.ReadAll(file)
- if err != nil {
- return err
- }
- b := bytes.NewBuffer(data)
- for {
- buf, err := b.ReadString('\n')
- switch err {
- case io.EOF:
- line := strings.TrimSpace(buf)
- columns := strings.Split(line, "\t")
- if len(columns) == 2 {
- username := columns[0]
- exist = client.SIsMember(RedisUserSet, username)
- if !exist.Val() {
- client.SAdd(RedisUserSet, username)
- }
- }
- count := client.SCard(RedisUserSet)
- log.Println("init: loaded " + strconv.FormatInt(count.Val(), 10) + " users")
- return nil
- case nil:
- line := strings.TrimSpace(buf)
- columns := strings.Split(line, "\t")
- if len(columns) == 2 {
- username := columns[0]
- exist = client.SIsMember(RedisUserSet, username)
- if !exist.Val() {
- client.SAdd(RedisUserSet, username)
- }
- }
- default:
- return err
- }
- }
- return nil
- }
- func StartCrawling(usernames chan string, errors chan error) {
- // initial call
- go GetTimeline(usernames, errors)
- go func() {
- for i := 0; i < GistMaxPages; i++ {
- go GetGistTimeline(i+1, usernames, errors)
- time.Sleep(PagenationInterval)
- }
- }()
- // Searching with advanced option: "followers = 0"
- go func() {
- for i := 0; i < GistMaxPages; i++ {
- option := "?q=followers%3A0&type=Users&p=" + strconv.Itoa(i+1)
- go GetSearchResult(option, usernames, errors)
- time.Sleep(SearchInterval)
- }
- }()
- // interval call to github timeline and gist respectively
- for {
- select {
- case <-time.After(TLPollInterval):
- log.Println(time.Now())
- go GetTimeline(usernames, errors)
- case <-time.After(GistPollInterval):
- log.Println(time.Now())
- for i := 0; i < GistIteratePages; i++ {
- go GetGistTimeline(i+1, usernames, errors)
- }
- }
- }
- }
- // CheckUserKeys create batches of unchecked users and
- func CheckUserKeys(usernames chan string, keys chan Keys, errors chan error) {
- userClient := redis.NewUnixClient(RedisSockFile, "", -1)
- defer userClient.Close()
- var exist *redis.BoolReq
- for {
- select {
- case u := <-usernames:
- exist = userClient.SIsMember(RedisUserSet, u)
- if !exist.Val() {
- userClient.SAdd(RedisUserSet, u)
- go FetchKeys(u, keys, errors)
- time.Sleep(KeyFetchInterval)
- }
- case e := <-errors:
- log.Println(e)
- }
- }
- }
- func main() {
- runtime.GOMAXPROCS(runtime.NumCPU())
- // Check if old text file database exists.
- var file *os.File
- var err error
- file, err = os.Open(Filename)
- if err == nil { // If it exsits, restore all data into Redis server.
- defer file.Close()
- log.Println("init: loading users...")
- err = InitRedis(file)
- } else {
- log.Println("init: starting new db...")
- }
- usernames := make(chan string, BufferSize)
- errors := make(chan error, BufferSize)
- keys := make(chan Keys, BufferSize)
- // Trigger goroutine
- go StartCrawling(usernames, errors)
- // username & error channel handling
- go CheckUserKeys(usernames, keys, errors)
- // key channel handling
- client := redis.NewUnixClient(RedisSockFile, "", -1)
- var status *redis.StatusReq
- var numKeys *redis.IntReq
- for {
- select {
- case k := <-keys:
- numKeys = client.RPush(fmt.Sprintf(RedisKeyList, k.User), k.SSHKeys...)
- client.IncrBy(RedisKeyCount, numKeys.Val())
- case <-time.After(RedisSaveInterval):
- status = client.BgSave()
- if status.Val() != "" {
- log.Println(status.Val())
- }
- }
- }
- }