/github-fetch.js
JavaScript | 421 lines | 311 code | 45 blank | 65 comment | 45 complexity | f2281a1adc078087f3aeda07e402612a MD5 | raw file
- var config = require('./config');
- var fs = require('fs');
- var Stumpy = require('stumpy');
- var github = require('octonode');
- var mongoose = require('mongoose');
- var Schema = mongoose.Schema;
/* Logging setup */
// Stumpy logger instance labelled "Git4D". Every optional feature visible here
// is switched off; with replaceConsole disabled the console.* calls used in
// this file are presumably left untouched -- confirm against Stumpy's docs.
var stumpy = Stumpy('Git4D', {
  showLogType: false,
  showLogId: false,
  showTrace: false,
  replaceConsole: false
});
/* Mongoose init */
// Build the connection string from the settings in config.mongo.
// The user name and password are percent-encoded so that reserved URI
// characters inside them (':', '/', '@', '?', '#', ...) cannot corrupt the
// connection string.
// NOTE(review): this assumes config.mongo.user / config.mongo.password hold
// the raw, un-encoded values -- confirm against config.js.
mongoose.connect('mongodb://'
  + encodeURIComponent(config.mongo.user) + ':'
  + encodeURIComponent(config.mongo.password) + '@'
  + config.mongo.host + ':'
  + config.mongo.port + '/'
  + config.mongo.db);

// Shared connection handle; the crawler is started from its 'open' event.
var db = mongoose.connection;
// Report connection-level errors on stderr.
db.on('error', console.error.bind(console, 'connection error:'));
// Schema of one stored file record. The fields mirror what saveFiles() copies
// from a GitHub tree entry, plus the derived `extension` and the owning
// repository id. Sample values are shown after each field.
var fileSchema = new Schema({
  path: String,       // e.g. "dist/amd/toggle.js"
  extension: String,  // lower-cased extension computed by getExtension()
  mode: String,       // e.g. "100644"
  type: String,       // e.g. "blob"
  sha: String,        // e.g. "3ab8ea6bd5bbc22207f178702ed08032839c1fe8"
  size: Number,       // e.g. 1432
  url: String,        // e.g. "https://api.github.com/repos/friendsoftheweb/ftw-toggle/git/blobs/3ab8ea6bd5bbc22207f178702ed08032839c1fe8"
  repo_id: { type: Number, ref: 'Repo' }  // _id of the Repo document this file belongs to
});
// Schema of one stored repository. `_id` is the numeric GitHub repository id
// (see saveRepo), the next block of fields is copied from the GitHub repo
// payload, and the last two fields are filled in by getTree().
// Sample values are shown after each field.
var repoSchema = new Schema({
  _id: Number,                // e.g. 20000168
  name: String,               // e.g. "beavis"
  full_name: String,          // e.g. "pssdbt/beavis"
  private: Boolean,           // e.g. false
  description: String,        // e.g. "Static blog generator written in python, exercise from Python Practice Projects."
  fork: Boolean,              // e.g. false
  created_at: Date,           // e.g. "2014-05-20T22:42:22Z"
  updated_at: Date,           // e.g. "2014-11-21T16:25:16Z"
  pushed_at: Date,            // e.g. "2014-05-27T22:06:38Z"
  homepage: String,           // e.g. null
  size: Number,               // e.g. 616
  stargazers_count: Number,   // e.g. 1
  watchers_count: Number,     // e.g. 1
  language: String,           // e.g. "CSS"
  has_pages: Boolean,         // e.g. false
  forks_count: Number,        // e.g. 0
  mirror_url: String,         // e.g. null
  open_issues_count: Number,  // e.g. 0
  forks: Number,              // e.g. 0
  open_issues: Number,        // e.g. 0
  watchers: Number,           // e.g. 1
  default_branch: String,     // e.g. "master"
  network_count: Number,      // e.g. 0
  subscribers_count: Number,  // e.g. 1
  truncated: Boolean,         // set when the tree request returned a truncated file list
  nb_files: Number            // number of files and folders in the repo (may be truncated)
});
// Mongoose models built from fileSchema and repoSchema.
var File = mongoose.model('File', fileSchema);
var Repo = mongoose.model('Repo', repoSchema);

/* Github init */
// API client authenticated with the user/password pair from config.github.
var client = github.client({
  password: config.github.password,
  username: config.github.user
});
// Search handle, used by fetchRepos().
var ghsearch = client.search();
// Search for repos
/**
 * Runs a repository search and logs the remaining rate-limit budget together
 * with the full name of the first hit.
 *
 * @param {string} query - Search expression, sent as the `q` parameter.
 * @param {Function} callback - Currently NOT invoked: handing `data.items` to
 *   the caller was disabled in the original code and stays disabled here.
 */
var fetchRepos = function (query, callback) {
  // NOTE(review): GitHub documents a maximum per_page of 100 -- confirm that
  // `per_page: 1000` together with the fixed `page: 10` is really intended.
  ghsearch.repos({
    q: query,
    sort: 'updated',
    order: 'asc',
    page: 10,
    per_page: 1000
  }, function (err, data, headers) {
    // The header name contains dashes, so it must be read with bracket
    // notation; headers may also be missing when the request itself failed.
    if (headers) {
      console.info(JSON.stringify("x-ratelimit-remaining : " + headers['x-ratelimit-remaining'], null, '\t'));
    }
    if (err) {
      console.error('Search >>> ' + err);
      return;
    }
    var items = (data && data.items) || [];
    if (items.length === 0) {
      console.info('Search >>> no result');
      return;
    }
    console.log(JSON.stringify(items[0].full_name, null, '\t'));
  });
};
/**
 * Refreshes the shared crawl state from the headers of a GitHub response.
 *
 * Writes two variables declared elsewhere in this file:
 *   - `reset`   : value of "x-ratelimit-reset" (epoch seconds, it is
 *                 multiplied by 1000 before being turned into a Date);
 *   - `last_id` : the number carried by the "link" header, later passed as
 *                 the `since` parameter of the next listing request.
 * A header that is missing or cannot be parsed leaves its variable untouched
 * instead of overwriting it with NaN / null.
 *
 * @param {Object} headers - Response headers (lower-cased keys).
 */
var parseHeadersInfo = function (headers) {
  var source = headers || {};

  var parsedReset = parseInt(source['x-ratelimit-reset'], 10);
  if (!isNaN(parsedReset)) {
    reset = parsedReset;
  }

  var link = source.link;
  if (typeof link === 'string') {
    // Prefer the explicit `since=<id>` query parameter; fall back to the first
    // run of digits, which is what the previous implementation matched.
    var found = /[?&]since=([0-9]+)/.exec(link) || /([0-9]+)/.exec(link);
    if (found) {
      last_id = parseInt(found[1], 10);
    }
  }

  console.info("Next reset of queries limit: " + new Date(reset * 1000));
  console.info("Next search last index: " + last_id);
};
// Get all repos by order of creation
/**
 * Requests one page of '/repositories' starting after `lastId` and hands the
 * page to checkRepo(). On a 403 answer the rate-limit headers are parsed and
 * queryOrWait() is scheduled; any other error is only logged.
 *
 * @param {number} lastId - Sent as the `since` query parameter.
 */
var getReposFrom = function (lastId) {
  client.get('/repositories', { since: lastId }, function (err, status, data, headers) {
    if (err) {
      console.error(err);
      // NOTE(review): the client presumably reports HTTP failures through the
      // error object alone (statusCode / headers fields) -- confirm against
      // octonode. Both sources are accepted here.
      var statusCode = status || err.statusCode;
      var errorHeaders = headers || err.headers;
      if (statusCode === 403) {
        if (errorHeaders) {
          // Header parsing must never prevent the retry from being scheduled.
          try {
            parseHeadersInfo(errorHeaders);
          } catch (parseError) {
            console.error(parseError);
          }
        }
        queryOrWait();
      }
      return;
    }
    parseHeadersInfo(headers);
    checkRepo(data, 0);
  });
};
/**
 * Processes `repos[index]` and then moves on to `index + 1`; once the page is
 * exhausted it calls queryOrWait() to fetch the next one.
 *
 * For every non-fork repository: fetch its full info, store it when it is not
 * in the database yet, and, when its size is > 0, index its files through
 * getTree(). Forks, API errors and database errors are logged and skipped so
 * a single failure never stops the crawl.
 *
 * @param {Array} repos - One page of repository summaries.
 * @param {number} index - Position of the repository to process.
 */
var checkRepo = function (repos, index) {
  // Page finished (or no usable page at all): ask for the next one.
  if (!repos || index >= repos.length) {
    queryOrWait();
    return;
  }

  var summary = repos[index];
  var next = function () {
    checkRepo(repos, index + 1);
  };

  console.log("Checking " + (index+1) + "/" + repos.length + " repos : " + summary.full_name);
  // Persist the crawl position so a crash can be resumed.
  saveLocalSetting();

  if (summary.fork) {
    console.info(summary.full_name + " is a fork");
    next();
    return;
  }

  var ghrepo = client.repo(summary.full_name);
  // Get full info about that repo.
  ghrepo.info(function (err, data, headers) {
    if (err) {
      console.error("checkRepo >>> " + err);
      next();
      return;
    }

    // Not the repo being checked but the one just before it, so a restart
    // processes the current repo again.
    last_checked_repo = data.id - 1;

    // Check whether the repo already exists in the database.
    Repo.findOne({ _id: data.id }, 'name', function (err, repo) {
      if (err) {
        // Previously routed to an undefined handleError(), which threw.
        console.error("checkRepo: findOne >>> " + err);
        next();
        return;
      }

      if (repo) {
        console.info('REPO ' + repo.name + " already exists");
      } else {
        saveRepo(data);
      }

      if (data.size > 0) {
        getTree(ghrepo, data.id, next);
      } else {
        console.info("Repo " + data.full_name + " is empty");
        next();
      }
    });
  });
};
/**
 * Decides whether the crawl may continue right now.
 *
 * Asks the client for the remaining request budget. With fewer than 150
 * requests left it sleeps until the time stored in `reset` (or 5 seconds when
 * that time is already in the past) and checks again; otherwise it requests
 * the next page through getReposFrom(last_id). A failing rate-limit lookup is
 * logged and retried after 5 seconds instead of silently ending the crawl.
 */
var queryOrWait = function () {
  var RETRY_DELAY_MS = 5000;

  client.limit(function (err, left, max) {
    if (err) {
      console.error("queryOrWait >>> " + err);
      setTimeout(queryOrWait, RETRY_DELAY_MS);
      return;
    }

    console.log("left: " + left + " max: " + max );

    if (left < 150) {
      var now = (new Date()).getTime();
      console.warn("Reset: " + reset );
      console.warn("Now: " + now );
      // `reset` is in seconds, timers work in milliseconds.
      var delay = (reset * 1000) - Math.round(now);
      if (delay < 0) {
        delay = RETRY_DELAY_MS;
      }
      console.warn("waiting " + Math.round(delay/1000) + " secs..." );
      setTimeout(queryOrWait, delay);
    } else {
      getReposFrom(last_id);
    }
  });
};
- /*
- // Get all files from a repo
- var fetchRepoContent = function (ghrepo, path, branch, callback ){
- ghrepo.contents( path , branch, function(err, content, headers){
- if (err){
- console.error('Get content >>> ' + err);
- } else {
- content.forEach(function( element ){
- if (element.type === "dir" ) {
- fetchRepoContent( ghrepo, element.path, branch, callback);
- } else {
- callback(element);
- }
- });
- }
- });
- }
- */
// Get all files from repo via Tree
/**
 * Indexes the files of one repository.
 *
 * Steps: list the commits, request the tree of the first returned commit with
 * the recursive flag set, record the entry count / truncation flag on the
 * stored Repo document, then pass the tree to saveFiles() together with the
 * SHAs already stored for this repository.
 *
 * `callback` is invoked exactly once on every path (success, API error or
 * database error) so the caller can always move on to the next repository.
 * The Repo document update runs on the side and is not waited for.
 *
 * @param {Object} ghrepo - Repository handle exposing commits() and tree().
 * @param {number} repo_id - Repository id; key of the Repo document and
 *   `repo_id` of its File documents.
 * @param {Function} callback - Called without arguments when done.
 */
var getTree = function (ghrepo, repo_id, callback) {
  console.info("Getting commits for " + repo_id);
  ghrepo.commits(function (err, data, headers) {
    if (err) {
      console.error("getTree: commits >>> " + err + " repo: " + repo_id);
      callback();
      return;
    }

    var head = data && data[0];
    if (!head) {
      console.error("getTree: commits >>> no commit returned, repo: " + repo_id);
      callback();
      return;
    }

    // Fetching the tree recursively returns all the files in the directory
    // from that particular commit.
    console.info("Getting Tree for " + repo_id );
    ghrepo.tree(head.commit.tree.sha, true, function (err, files, headers) {
      if (err) {
        console.error("getTree: tree >>> " + err + " repo: " + repo_id);
        callback();
        return;
      }
      if (!files || !files.tree) {
        console.error("getTree: tree >>> no tree returned, repo: " + repo_id);
        callback();
        return;
      }

      // Update repo info with file total etc...
      console.info("Getting existing repo " + repo_id );
      Repo.findById(repo_id, function (err, repo) {
        if (err) {
          console.error("getTree: findById >>> " + err + " repo: " + repo_id);
          return;
        }
        if (!repo) {
          return;
        }
        var nb_files = files.tree.length;
        if (nb_files) {
          repo.nb_files = nb_files;
        }
        if (files.truncated) {
          repo.truncated = true;
        }
        console.info("Updating existing repo " + repo_id );
        repo.save(function (err) {
          if (err) {
            console.error("getTree: repo save >>> " + err + " repo: " + repo_id);
          }
        });
      });

      console.info("Searching for files in db from repo " + repo_id );
      File.find({ repo_id: repo_id }, 'sha', function (err, saved_files) {
        if (err) {
          console.error("getTree: find files >>> " + err + " repo: " + repo_id);
          callback();
          return;
        }
        var sha_array = saved_files.map(function (file) { return file.sha; });
        saveFiles(files, repo_id, sha_array);
        callback();
      });
    });
  });
};
/**
 * Extracts the lower-cased file extension from a path.
 *
 * @param {string} path - File path using '/' as separator.
 * @returns {string|undefined} The text after the last '.', lower-cased, or
 *   undefined when there is no usable extension: non-string input, no dot,
 *   dot as first or last character, dot directly after a '/' (hidden file
 *   such as "dir/.gitignore"), or a '/' after the dot (the dot belongs to a
 *   directory name).
 */
var getExtension = function (path) {
  if (typeof path !== 'string') {
    return undefined;
  }

  var dot = path.lastIndexOf('.');
  // The '.' must be neither the first nor the last character.
  if (dot <= 0 || dot >= path.length - 1) {
    return undefined;
  }
  if (path.charAt(dot - 1) === '/') {
    return undefined;
  }

  var ext = path.substring(dot + 1).toLowerCase();
  return ext.indexOf('/') < 0 ? ext : undefined;
};
// Store data
/**
 * Stores every tree entry whose extension is on the ext_regex list and whose
 * SHA is not already known for this repository.
 *
 * Save errors are logged; they never interrupt the loop.
 *
 * @param {Object} files - Tree response; entries are read from `files.tree`.
 * @param {number} repo_id - Id stored on each new File document.
 * @param {Array} shas - SHAs already stored for this repository.
 */
var saveFiles = function (files, repo_id, shas) {
  var entries = (files && files.tree) || [];

  // One-time lookup table instead of an indexOf() scan per entry.
  var known = Object.create(null);
  (shas || []).forEach(function (sha) {
    known[sha] = true;
  });

  entries.forEach(function (file) {
    var extension = getExtension(file.path);
    // Entries without an extension are skipped silently.
    if (extension === undefined || extension === null) {
      return;
    }
    // Only extensions from our list are stored.
    if (!ext_regex.test(extension)) {
      console.info("extension: " + extension);
      return;
    }

    // The '%' characters are stripped from the logged path only, to work
    // around a "bug" in Stumpy with that character.
    var stumpy_path = String(file.path).replace(/%/g, '');

    if (known[file.sha] === true) {
      console.info('FILE ' + stumpy_path + " ALREADY EXISTS in db");
      return;
    }

    console.log( "Saving : " + stumpy_path );
    var f = new File({
      path: file.path,
      extension: extension,
      mode: file.mode,
      type: file.type,
      sha: file.sha,
      size: file.size,
      url: file.url,
      repo_id: repo_id
    });
    f.save(function (err) {
      if (err) {
        console.error("saveFiles >>> " + err + " file: " + stumpy_path);
      }
    });
  });
};
/**
 * Stores one repository, keyed by its GitHub id.
 *
 * The save is fire-and-forget: a failure is logged, nothing is returned and
 * no completion callback exists.
 *
 * @param {Object} repo - Repository payload as returned by the GitHub API.
 */
var saveRepo = function (repo) {
  console.log( "Saving : " + repo.full_name );

  // A missing timestamp stays unset instead of becoming the Unix epoch
  // (`new Date(null)`) or an invalid date (`new Date(undefined)`).
  var toDate = function (value) {
    return value ? new Date(value) : undefined;
  };

  var r = new Repo({
    _id: repo.id,
    name: repo.name,
    full_name: repo.full_name,
    private: repo.private,
    description: repo.description,
    fork: repo.fork,
    created_at: toDate(repo.created_at),
    updated_at: toDate(repo.updated_at),
    pushed_at: toDate(repo.pushed_at),
    homepage: repo.homepage, // was read from the misspelled `repo.hoempage`
    size: repo.size,
    stargazers_count: repo.stargazers_count,
    watchers_count: repo.watchers_count,
    language: repo.language,
    has_pages: repo.has_pages,
    forks_count: repo.forks_count,
    mirror_url: repo.mirror_url,
    open_issues_count: repo.open_issues_count,
    forks: repo.forks,
    open_issues: repo.open_issues,
    watchers: repo.watchers,
    default_branch: repo.default_branch,
    network_count: repo.network_count,
    subscribers_count: repo.subscribers_count
  });

  r.save(function (err) {
    if (err) {
      console.error("saveRepo >>> " + err + " repo: " + repo.full_name);
    }
  });
};
/* Keep track of the state of the script in case of crash */
/**
 * Writes the current value of `last_checked_repo` to './last_checked'.
 *
 * The value is converted to a string first: fs.writeFile only accepts
 * strings and binary data, and current Node.js versions reject a Number.
 * A write failure is logged and otherwise ignored.
 */
var saveLocalSetting = function () {
  fs.writeFile('./last_checked', String(last_checked_repo), function (err) {
    if (err) {
      console.warn('There has been an error saving the last checked repo');
      console.error(err.message);
      return;
    }
    console.info('Last checked repo saved.');
  });
};
/**
 * Restores `last_id` from './last_checked' when that file holds a number.
 *
 * A missing file is the normal first-run case and is ignored. Any other read
 * error, or content that is not a number, is reported and leaves `last_id`
 * at its current value. `callback` is always invoked, exactly once.
 *
 * @param {Function} callback - Called without arguments when done.
 */
var loadLastCheckedRepo = function (callback) {
  fs.readFile('./last_checked', 'utf8', function (err, contents) {
    if (err) {
      if (err.code !== 'ENOENT') {
        console.warn('There has been an error loading the last checked repo');
        console.error(err.message);
      }
    } else {
      var parsed = parseInt(contents, 10);
      if (isNaN(parsed)) {
        console.warn('Ignoring invalid content of ./last_checked');
      } else {
        last_id = parsed;
        console.info("Loaded last_id from file: " + last_id);
      }
    }
    callback();
  });
};
-
// MAIN
/* Settings */
//var query = 'stars:40000..50000';
//var query = 'stars:>5';
// The list of file extensions we are looking for. The pattern is anchored and
// case-insensitive, and is tested against the bare extension (no dot).
// "kdenlive" replaces the former misspelling "kdendive", which could never
// match a real file.
var ext_regex = /^(ora|jpg|jpeg|jfif|tif|tiff|ari|r3d|gif|bmp|png|ppm|pgm|pbm|pnm|webp|cd5|cit|cr2|dds|dib|djvu|ecw|icns|ico|fits|fit|fts|iff|ilbm|lbm|jng|jp2|jps|jpe|jxr|hdp|wdp|liff|mng|nrrd|otb|pam|pdd|pcx|pdn|pgf|pgm|pxr|raw|sgi|rgb|rgba|bw|int|inta|sid|ras|sun|tga|cpt|psd|tub|psp|pspimage|psptub|xcf|vtf|xpm|3dv|cgm|cmx|gbr|svg|ai|cdr|odg|otg|fodg|hpgl|plt|vml|wmf|emf|wmz|emz|xar|oxps|gdraw|amf|blend|dae|dwf|dwg|dxf|flt|ma|mb|obj|ogex|prc|step|skp|stl|u3d|vrml|xaml|xvl|xvrml|x3d|x3dv|x3db|x3dz|x3dbz|x3dvz|3d|3df|3dm|3ds|3dxml|3dmlw|acp|aec|an8|aoi|art|b3d|brep|c4d|cal3d|cad|cob|fbx|iob|iam|idw|ipn|ipt|lwo|md2|md3|mdx|mesh|mrc|ply|pov|rvt|rfa|scad|sldasm|slddrw|sldprt|xsi|tct|tcw|vwx|wrl|x|z3d|eps|pdf|ps|pict|pct|pic|swf|indd|idml|mcf|ppp|pub|qxd|fm|sla|scd|abf|afm|bdf|bmf|fnt|fon|mgf|otf|pcf|pfa|pfb|pfm|afm|fond|sfd|snf|tdf|ttf|woff|woff2|act|gpl|pal|8svx|iff|aif|aifc|aiff|au|bwf|cdda|wav|flac|wv|wma|mp2|mp3|spx|gsm|aac|ogg|ogv|3gp|asf|avchd|avi|cam|flv|m1v|m2v|fla|m4v|mkv|mng|mov|mpeg|mpg|mpe|mp4|h264|svi|swf|fcp|mswmm|imovieproj|wlmp|kdenlive|utx|umx|mdl|roq|cur|ani|asset)$/i;
// Crawl state shared by the functions of this file.
// Starting point of the crawl, taken from the configuration
// (loadLastCheckedRepo may replace it with the value saved on disk).
var last_id = config.last_id;
// Position reported on exit and persisted by saveLocalSetting().
var last_checked_repo = last_id;
// Next rate-limit reset, in seconds since the epoch; initialised to the time
// the script starts.
var reset = Math.round(Date.now() / 1000);

// Start crawling once the database connection is open.
db.once('open', function (callback) {
  console.info('connected');
  loadLastCheckedRepo(function startCrawl() {
    queryOrWait();
  });
});
// On Ctrl-C: report where the crawl stopped, so the value can be copied into
// config.js, then terminate with exit code 1 after a 10 ms delay.
process.on('SIGINT', function onInterrupt() {
  console.warn(' Received Exit Signal');

  var reportAndExit = function () {
    console.info('Last repo checked: ' + (last_checked_repo) );
    console.warn('-------------------------------------------------');
    console.warn('Please update config.js file with this new value: ');
    console.warn('config.last_id = ' + (last_checked_repo) + ';');
    process.exit(1);
  };

  setTimeout(reportAndExit, 10);
});
- /*
- fetchRepos( query, function(data){
- data.forEach( function( repo, index ){
- console.log("Getting info for " + repo.full_name);
-
- var ghrepo = client.repo(repo.full_name);
- fetchRepoContent(ghrepo, '', repo.default_branch, function (data){
- console.log(JSON.stringify(data.path));
- });
-
- });
- });
- */