PageRenderTime 44ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/poll_chart/make_chart.py

https://github.com/egor83/hn-stuff
Python | 181 lines | 168 code | 10 blank | 3 comment | 2 complexity | 84d89cad06b8ac03e9d8698f4b5d7a98 MD5 | raw file
  1. import BeautifulSoup
  2. import logging
  3. import datetime
  4. import traceback
  5. from urllib import quote_plus
  6. from google.appengine.ext import db
  7. from google.appengine.api.urlfetch import DownloadError
  8. import gae_tools
  9. import poll_chart.parsing
  10. __author__ = 'egor83, egor.ryabkov()gmail.com'
  11. def create_chart(thread_id, chart_type, show_percents):
  12. poll_data = get_poll_data(thread_id)
  13. if poll_data is not None:
  14. chart_url = build_chart_url(poll_data, chart_type, show_percents)
  15. else:
  16. logging.error('No data fetched for thread %s', thread_id)
  17. raise NoDataError(thread_id)
  18. return chart_url
  19. def build_chart_url(poll_data, chart_type, show_percents):
  20. """(?)Construct chart URL from (parsed?) poll data."""
  21. # from http://code.google.com/intl/en/apis/chart/image/docs/chart_wizard.html
  22. chart_url_base = "http://chart.apis.google.com/chart"
  23. # set the common chart arguments (shared by both types)
  24. chart_arguments = [
  25. "chco=0000FF,008080,00FF00,808000,FF0000", # color
  26. "chs=750x400", # size
  27. "".join(["chtt=", quote_plus(poll_data.title)]), # title
  28. ]
  29. votes = poll_data.votes
  30. votes_text = ",".join(map(str, votes))
  31. # "chd=t:10,50,60,80,40,60,30", # data
  32. chart_arguments.append("".join(["chd=t:", votes_text]))
  33. max_votes = max(votes)
  34. # scale for text format with custom range
  35. chart_arguments.append("chds=0,%i" % max_votes)
  36. labels = map(quote_plus, poll_data.options)
  37. if show_percents:
  38. labels = add_percentages(labels, poll_data.percentages)
  39. # set type-specific chart arguments
  40. if chart_type == 'pie':
  41. chart_arguments.extend([
  42. "cht=p", # chart type
  43. "chp=4.71" # start pie slices from top (3*pi/2 = 4.71 radians)
  44. ])
  45. labels[0] = ("chl=%s" % labels[0]) # labels
  46. chart_arguments.append("|".join(labels))
  47. else: #if chart_type == 'bar': # default option
  48. chart_arguments.extend([
  49. "cht=bhs", # chart type
  50. "chxt=x,y", # visible axes
  51. "chbh=a", # bar width and spacing - needed?
  52. ])
  53. # Y labels (would contain voting options description)
  54. # "chxl=1:|one|two|three"
  55. # data set runs top-to-bottom, and labels the other way around, so I'll
  56. # reverse the latter
  57. labels.reverse()
  58. labels.insert(0, "chxl=1:")
  59. chart_arguments.append("|".join(labels))
  60. # axis ranges - calculate max votes number, put here
  61. chart_arguments.append("chxr=0,0,%i" % max_votes)
  62. args_line = "&".join(chart_arguments)
  63. return "?".join([chart_url_base, args_line])
  64. def add_percentages(labels, percentages):
  65. for idx in range(len(labels)):
  66. labels[idx] = "%s (%.1f%%)" % (labels[idx], percentages[idx])
  67. return labels
  68. def get_poll_data(thread_id):
  69. """Check poll data in cache, if not present or too old - fetch and parse"""
  70. caching_period = datetime.timedelta(minutes = 5)
  71. poll_data = PollData.gql("where thread_id = :1", thread_id).get()
  72. if(poll_data is None or
  73. datetime.datetime.now() - poll_data.caching_time > caching_period):
  74. # no data or data too old, fetch
  75. logging.info('Data for thread %s not found or too old, fetching' %
  76. thread_id)
  77. try:
  78. url = 'http://news.ycombinator.com/item?id=%s' % thread_id
  79. header = 'Crawler, contact owner at egor.ryabkov(at)gmail.com'
  80. page = gae_tools.gae_fetch(url, header, 5)
  81. except DownloadError, de:
  82. # return cached version or None if no cached version is present
  83. logging.warning('Fetching failed (DownloadError: %s), using \
  84. cached data', de)
  85. return poll_data
  86. soup = BeautifulSoup.BeautifulSoup(page)
  87. try:
  88. title, options, votes = poll_chart.parsing.parse_data(soup)
  89. except AttributeError, ae: # not a poll or access to the old page was denied
  90. logging.error('Thread %s page has no poll', thread_id)
  91. # logging.warning(page.read())
  92. # logging.error(traceback.format_exc())
  93. if poll_data:
  94. logging.warning('Access (probably) denied, using cached version.')
  95. return poll_data
  96. else:
  97. raise NoPollError(thread_id)
  98. except IndexError: # not a HN page
  99. logging.error(
  100. 'IndexError @ thread %s - maybe trying to parse a non-HN page?',
  101. thread_id)
  102. raise NoPollOrNotHNPageError(thread_id)
  103. # add percentages, adjust title with total votes count
  104. if votes is not None:
  105. total = sum(votes)
  106. title = ('%s (%i votes)' % (title, total))
  107. percentages = map(lambda x: float(100*x)/total, votes)
  108. else:
  109. percentages = None
  110. # trim option descriptions to 40 chars max (chart look bad otherwise)
  111. max_desc_len = 40
  112. for opt_idx in range(len(options)):
  113. if len(options[opt_idx]) > max_desc_len:
  114. options[opt_idx] = options[opt_idx][0:max_desc_len] + '...'
  115. if poll_data is None:
  116. poll_data = PollData()
  117. poll_data.thread_id = thread_id
  118. poll_data.title = str(title)
  119. poll_data.options = options
  120. poll_data.votes = votes
  121. poll_data.percentages = percentages
  122. poll_data.put()
  123. else:
  124. logging.debug('Obtained data for thread %s from cache' % thread_id)
  125. return poll_data
class PollData(db.Model):
    """Datastore entity holding the cached, parsed poll of one thread."""
    # id of the thread; get_poll_data() looks entities up by this value
    thread_id = db.StringProperty()
    # auto_now=True: timestamp maintained by the datastore layer on store;
    # get_poll_data() compares it with its caching period
    caching_time = db.DateTimeProperty(auto_now=True)
    # poll title, already extended with the total vote count
    title = db.StringProperty()
    # option descriptions, trimmed by get_poll_data()
    options = db.StringListProperty()
    # share of each option in percent, parallel to votes
    percentages = db.ListProperty(float)
    votes = db.ListProperty(int) # int instead of IntegerProperty, see http://goo.gl/hnDpm
  133. class ChartBuilderError(Exception):
  134. pass
  135. class NoPollError(ChartBuilderError):
  136. def __init__(self, thread_id):
  137. self.thread_id = thread_id
  138. class NoPollOrNotHNPageError(ChartBuilderError):
  139. def __init__(self, thread_id):
  140. self.thread_id = thread_id
  141. class NoDataError(ChartBuilderError):
  142. def __init__(self, thread_id):
  143. self.thread_id = thread_id