
/lib/scripts/generate_featured_pages.py

https://github.com/pcdinh/trendingtopics
#!/usr/bin/env python
# encoding: utf-8
"""
generate_featured_pages.py

Created by Peter Skomoroch on 2009-06-20.
Copyright (c) 2009 Data Wrangling LLC. All rights reserved.
"""

import sys
import getopt
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
import datetime
import MySQLdb

# TODO pass as parameters
MYSERVER = 'trendingtopics.org'
DBNAME = 'trendingtopics_production'
USER = 'root'
PASSWD = ''
help_message = '''
Dynamically creates a blacklist of page ids for a given date by removing
Wikipedia featured articles and "on this day" references from the main page.

Usage:

  $ python generate_featured_pages.py -d 20090618 > featured_pages.txt
'''

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

def pageid(title):
    """Quick hack to look up a page_id directly in the db, since the
    rails app might not be running yet. Falls back to id 1 on any error."""
    try:
        conn = MySQLdb.connect(db=DBNAME, user=USER, passwd=PASSWD)
        cursor = conn.cursor()
        # use parameter binding so titles containing quotes don't break the query
        cursor.execute("SELECT id FROM pages WHERE title = %s;", (title,))
        row = cursor.fetchone()
        pageid = row[0]
        cursor.close()
        conn.close()
    except:
        pageid = 1
    return pageid
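
# Example (hypothetical title): pageid('Albert Einstein') returns that row's
# id from the pages table, or the fallback id 1 if the database lookup fails.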

def get_titles(soup):
    """
    Extract wikipedia links from soup instance
    """
    links = [x['href'] for x in soup.findAll('a') if x['href'][0:5] == '/wiki']
    ns_zero_urls = [x.replace('/wiki/', '') for x in links if x.find(':') == -1]
    titles = [urllib.unquote_plus(x.replace('_', ' ')) for x in ns_zero_urls]
    return titles
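
# Example: an anchor like <a href="/wiki/Albert_Einstein"> yields the title
# 'Albert Einstein'; hrefs containing a colon (e.g. /wiki/File:Foo.jpg or
# /wiki/Wikipedia:About) are non-article namespaces and are skipped.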

def soupify_url(url):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'TrendingTopics/0.1')]
    page = opener.open(url).read()
    soup = BeautifulSoup(page)
    return soup

def featured_pages(date):
    base = 'http://en.wikipedia.org/wiki/Wikipedia:Today%27s_featured_article/'
    # featured article for the given date; main() walks back over prior days
    url = base + date.strftime("%B_%d,_%Y")
    try:
        soup = soupify_url(url)
        div = soup.findAll(id="bodyContent")
        titles = get_titles(div[0])
    except:
        titles = []
    return titles
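
# For 2009-06-18 the constructed URL is:
# http://en.wikipedia.org/wiki/Wikipedia:Today%27s_featured_article/June_18,_2009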

def featured_pictures(date):
    base = 'http://en.wikipedia.org/wiki/Template:POTD/'
    url = base + date.strftime("%Y-%m-%d")
    try:
        soup = soupify_url(url)
        table = soup.findAll(cellspacing="5")
        titles = get_titles(table[0])
    except:
        titles = []
    return titles

def date_pages(date):
    return [date.strftime("%B %d")]

def anniversaries(date):
    base = 'http://en.wikipedia.org/wiki/Wikipedia:Selected_anniversaries/'
    url = base + date.strftime("%B_%d")
    try:
        soup = soupify_url(url)
        div = soup.findAll(id="bodyContent")
        titles = get_titles(div[0])
    except:
        titles = []
    return titles

def titles_for_date(date):
    titles = featured_pages(date)
    titles.extend(featured_pictures(date))
    titles.extend(date_pages(date))
    titles.extend(anniversaries(date))
    return titles

def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "hd:v", ["help", "date="])
        except getopt.error, msg:
            raise Usage(msg)
        maxdate = None
        # option processing
        for option, value in opts:
            if option == "-v":
                verbose = True
            if option in ("-h", "--help"):
                raise Usage(help_message)
            if option in ("-d", "--date"):
                datestr = value
                maxdate = datetime.date(int(datestr[0:4]),
                                        int(datestr[4:6]),
                                        int(datestr[6:8]))
        if maxdate is None:
            # -d/--date is required
            raise Usage(help_message)
        # find urls recently featured on the main page of wikipedia
        titles = titles_for_date(maxdate)
        # also omit any titles featured in the previous 2 weeks
        for i in range(14):
            titles.extend(titles_for_date(maxdate - datetime.timedelta(i+1)))
        # generate blacklist of page_ids:
        pageids = [pageid(x) for x in set(titles)]
        for x in pageids:
            try:
                sys.stdout.write('%s\n' % x)
            except:
                pass
    except Usage, err:
        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
        print >> sys.stderr, "\t for help use --help"
        return 2

if __name__ == "__main__":
    sys.exit(main())
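
For reference, a minimal sketch of how the emitted blacklist might be consumed
downstream. The file name matches the usage example above, but this loader is
an assumption for illustration, not code from the repo:

    # Hypothetical consumer (Python 2, matching the script above): read
    # featured_pages.txt, one page id per line, into a set so trend
    # calculations can skip recently featured pages.
    def load_blacklist(path='featured_pages.txt'):
        with open(path) as f:
            # skip blank lines; the script writes one id per line
            return set(int(line) for line in f if line.strip())

    blacklist = load_blacklist()
    print '%d page ids blacklisted' % len(blacklist)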