/extract.py

https://github.com/sbenthall/topical-topology · Python · 89 lines · 63 code · 23 blank · 3 comment · 18 complexity · 022ac96929dc65e5585fcd82a12d506d MD5 · raw file

  1. import re
  2. import os
  3. import sys
  4. from pprint import pprint as pp
  5. import simplejson as json
  6. from settings import *
  7. from utils import *
  8. if len(sys.argv) > 1:
  9. SNOWBALL_PATH = sys.argv[1]
  10. def get_screen_names():
  11. snowball = load_snowball()
  12. return [m['screen_name'] for m in snowball.values() if m.has_key('screen_name')]
  13. URL_REGEX = "http\://\S*"
  14. def clean(tweet):
  15. clean_tweet = re.sub(URL_REGEX, '', tweet)
  16. clean_tweet = re.sub("[\n\r\t]",' ',clean_tweet)
  17. #clean out retweet 'RT'
  18. clean_tweet = re.sub("^RT ",'',clean_tweet)
  19. #remove usernames
  20. clean_tweet = re.sub("@(\w*)",'',clean_tweet)
  21. clean_tweet = re.sub("'ll",'ll',clean_tweet)
  22. clean_tweet = re.sub("'ve",'ve',clean_tweet)
  23. clean_tweet = re.sub("'t",'t',clean_tweet)
  24. if AGGREGATE_TWEETS:
  25. try:
  26. clean_tweet.decode('ascii')
  27. except UnicodeEncodeError:
  28. return ""
  29. return clean_tweet
  30. def parse_json_log(username):
  31. log_name = "%s%s.json"%(LOG_PATH,username)
  32. if os.path.isfile(log_name):
  33. log = json.loads(open(log_name,'r').read())
  34. return [clean(tweet['text']) for tweet in log]
  35. else:
  36. print "No log %s found, returning blank" % (log_name)
  37. return ""
  38. if not os.path.exists(DUMP_PATH):
  39. os.makedirs(DUMP_PATH)
  40. def main():
  41. # w+ create file if it doesn't exist, but overwrite if it does
  42. tweet_file = open("%s" % (DUMP_FILE),'w+')
  43. screen_names = get_screen_names()
  44. for screen_name in screen_names:
  45. print("Parsing tweets for %s" % screen_name)
  46. clean_tweets = []
  47. clean_tweets = parse_json_log(screen_name)
  48. if len(clean_tweets) < CUTOFF:
  49. print "%s has fewer than %d tweets. Leaving out of sample data."%(screen_name, CUTOFF)
  50. else:
  51. if not AGGREGATE_TWEETS:
  52. for clean_tweet in clean_tweets:
  53. print(clean_tweet)
  54. try:
  55. tweet_file.write(u"twitter %s %s\n" % (screen_name, clean_tweet))
  56. tweet_file.flush()
  57. except:
  58. 'Error: Exception writing this tweet'
  59. else:
  60. all_tweets = " ".join(clean_tweets)
  61. print(all_tweets)
  62. tweet_file.write(u"twitter %s %s\n" % (screen_name, all_tweets))
  63. tweet_file.flush()
  64. tweet_file.close()
  65. if __name__ == "__main__":
  66. main()