PageRenderTime 1580ms CodeModel.GetById 28ms RepoModel.GetById 1ms app.codeStats 0ms

/socialnews/news/libs/redditstories.py

https://github.com/tuxcanfly/django-socialnews
Python | 249 lines | 228 code | 7 blank | 14 comment | 0 complexity | 7905e9466b11fc95ec7f186b6e1092ec MD5 | raw file
#!/usr/bin/python
#
# Peteris Krumins (peter@catonmat.net)
# http://www.catonmat.net -- good coders code, great reuse
#
# Released under GNU GPL
#
# Developed as a part of redditriver.com project
# Read how it was designed:
# http://www.catonmat.net/blog/designing-redditriver-dot-com-website
#
  12. import re
  13. import sys
  14. import time
  15. import socket
  16. import urllib2
  17. import datetime
  18. from BeautifulSoup import BeautifulSoup
  19. version = "1.0"
  20. reddit_url = 'http://reddit.com'
  21. subreddit_url = 'http://reddit.com/r'
  22. socket.setdefaulttimeout(30)
  23. class RedesignError(Exception):
  24. """ An exception class thrown when it seems that Reddit has redesigned """
  25. pass
  26. class StoryError(Exception):
  27. """ An exception class thrown when something serious happened """
  28. pass
  29. def get_stories(subreddit="front_page", pages=1, new=False):
  30. """ If subreddit front_page, goes to http://reddit.com, otherwise goes to
  31. http://reddit.com/r/subreddit. Finds all stories accross 'pages' pages
  32. and returns a list of dictionaries of stories.
  33. If new is True, gets new stories at http://reddit.com/new or
  34. http://reddit.com/r/subreddit/new"""
  35. stories = []
  36. if subreddit == "front_page":
  37. url = reddit_url
  38. else:
  39. url = subreddit_url + '/' + subreddit
  40. if new: url += '/new'
  41. position = 1
  42. for i in range(pages):
  43. content = _get_page(url)
  44. entries = _extract_stories(content)
  45. stories.extend(entries)
  46. for story in stories:
  47. story['url'] = story['url'].replace('&', '&')
  48. story['position'] = position
  49. story['subreddit'] = subreddit
  50. position += 1
  51. url = _get_next_page(content)
  52. if not url:
  53. break
  54. return stories;
  55. def _extract_stories(content):
  56. """Given an HTML page, extracts all the stories and returns a list of dicts of them.
  57. See the 'html.examples/story.entry.txt' for an example how HTML of an entry looks like"""
  58. stories = []
  59. soup = BeautifulSoup(content)
  60. entries = soup.findAll('div', id=re.compile('entry_.*'))
  61. for entry in entries:
  62. div_title = entry.find('div', id=re.compile('titlerow_.*'));
  63. if not div_title:
  64. raise RedesignError, "titlerow div was not found"
  65. div_little = entry.find('div', attrs={'class': 'little'});
  66. if not div_little:
  67. raise RedesignError, "little div was not found"
  68. title_a = div_title.find('a', id=re.compile('title_.*'))
  69. if not title_a:
  70. raise RedesignError, "title a was not found"
  71. m = re.search(r'title_t\d_(.+)', title_a['id'])
  72. if not m:
  73. raise RedesignError, "title did not contain a reddit id"
  74. id = m.group(1)
  75. title = title_a.string.strip()
  76. url = title_a['href']
  77. if url.startswith('/'): # link to reddit itself
  78. url = 'http://reddit.com' + url
  79. score_span = div_little.find('span', id=re.compile('score_.*'))
  80. if score_span:
  81. m = re.search(r'(\d+) point', score_span.string)
  82. if not m:
  83. raise RedesignError, "unable to extract score"
  84. score = int(m.group(1))
  85. else: # for just posted links
  86. score = 0 # TODO: when this is merged into module, use redditscore to get the actual score
  87. user_a = div_little.find(lambda tag: tag.name == 'a' and tag['href'].startswith('/user/'))
  88. if not user_a:
  89. user = '(deleted)'
  90. else:
  91. m = re.search('/user/(.+)/', user_a['href'])
  92. if not m:
  93. raise RedesignError, "user 'a' tag did not contain href in format /user/(.+)/"
  94. user = m.group(1)
  95. posted_re = re.compile("posted(?: |\s)+(.+)(?: |\s)+ago") # funny nbsps
  96. posted_text = div_little.find(text = posted_re)
  97. if not posted_text:
  98. raise RedesignError, "posted ago text was not found"
  99. m = posted_re.search(posted_text);
  100. posted_ago = m.group(1)
  101. unix_time = _ago_to_unix(posted_ago)
  102. if not unix_time:
  103. raise RedesignError, "unable to extract story date"
  104. human_time = time.ctime(unix_time)
  105. comment_a = div_little.find(lambda tag: tag.name == 'a' and tag['href'].endswith('/comments/'))
  106. if not comment_a:
  107. raise RedesignError, "no comment 'a' tag was found"
  108. if comment_a.string == "comment":
  109. comments = 0
  110. else:
  111. m = re.search(r'(\d+) comment', comment_a.string)
  112. if not m:
  113. raise RedesignError, "comment could could not be extracted"
  114. comments = int(m.group(1))
  115. stories.append({
  116. 'id': id.encode('utf8'),
  117. 'title': title.encode('utf8'),
  118. 'url': url.encode('utf8'),
  119. 'score': score,
  120. 'comments': comments,
  121. 'user': user.encode('utf8'),
  122. 'unix_time': unix_time,
  123. 'human_time': human_time.encode('utf8')})
  124. return stories
  125. def _ago_to_unix(ago):
  126. m = re.search(r'(\d+) (\w+)', ago, re.IGNORECASE)
  127. if not m:
  128. return 0
  129. delta = int(m.group(1))
  130. units = m.group(2)
  131. if not units.endswith('s'): # singular
  132. units += 's' # append 's' to make it plural
  133. if units == "months":
  134. units = "days"
  135. delta *= 30 # lets take 30 days in a month
  136. elif units == "years":
  137. units = "days"
  138. delta *= 365
  139. dt = datetime.datetime.now() - datetime.timedelta(**{units: delta})
  140. return int(time.mktime(dt.timetuple()))
  141. def _get_page(url):
  142. """ Gets and returns a web page at url """
  143. request = urllib2.Request(url)
  144. request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
  145. try:
  146. response = urllib2.urlopen(request)
  147. content = response.read()
  148. except (urllib2.HTTPError, urllib2.URLError, socket.error, socket.sslerror), e:
  149. raise StoryError, e
  150. return content
  151. def _get_next_page(content):
  152. soup = BeautifulSoup(content)
  153. a = soup.find(lambda tag: tag.name == 'a' and tag.string == 'next')
  154. if a:
  155. return reddit_url + a['href']
  156. def print_stories_paragraph(stories):
  157. """ Given a list of dictionaries of stories, prints them out paragraph at a time. """
  158. for story in stories:
  159. print 'position:', story['position']
  160. print 'subreddit:', story['subreddit']
  161. print 'id:', story['id']
  162. print 'title:', story['title']
  163. print 'url:', story['url']
  164. print 'score:', story['score']
  165. print 'comments:', story['comments']
  166. print 'user:', story['user']
  167. print 'unix_time:', story['unix_time']
  168. print 'human_time:', story['human_time']
  169. print
  170. def print_stories_json(stories):
  171. """ Given a list of dictionaries of stories, prints them out in json format."""
  172. import simplejson
  173. print simplejson.dumps(stories, indent=4)
  174. if __name__ == '__main__':
  175. from optparse import OptionParser
  176. description = "A program by Peteris Krumins (http://www.catonmat.net)"
  177. usage = "%prog [options]"
  178. parser = OptionParser(description=description, usage=usage)
  179. parser.add_option("-o", action="store", dest="output", default="paragraph",
  180. help="Output format: paragraph or json. Default: paragraph.")
  181. parser.add_option("-p", action="store", type="int", dest="pages",
  182. default=1, help="How many pages of stories to output. Default: 1.")
  183. parser.add_option("-s", action="store", dest="subreddit", default="front_page",
  184. help="Subreddit to retrieve stories from. Default: front_page.")
  185. parser.add_option("-n", action="store_true", dest="new",
  186. help="Retrieve new stories. Default: nope.")
  187. options, args = parser.parse_args()
  188. output_printers = { 'paragraph': print_stories_paragraph,
  189. 'json': print_stories_json }
  190. if options.output not in output_printers:
  191. print >>sys.stderr, "Valid -o parameter values are: paragraph or json!"
  192. sys.exit(1)
  193. try:
  194. stories = get_stories(options.subreddit, options.pages, options.new)
  195. except RedesignError, e:
  196. print >>sys.stderr, "Reddit has redesigned! %s!" % e
  197. sys.exit(1)
  198. except StoryError, e:
  199. print >>sys.stderr, "Serious error: %s!" % e
  200. sys.exit(1)
  201. output_printers[options.output](stories)