/tools/get_kottke.py

https://github.com/wilson428/Robottke · Python · 97 lines · 74 code · 14 blank · 9 comment · 18 complexity · 29239fc0d48edbdb5da12beee29178c2 MD5 · raw file

  1. # Scans the kottke.org monthly archives and stores each post's metadata in kottke.db.
  2. # Probably not important to run this often.
  3. import urllib2
  4. import sqlite3
  5. import re
  6. from datetime import datetime
  7. from BeautifulSoup import BeautifulSoup
  # --- kottke.org archive crawler: setup, parsing helpers and crawl loop ---
prefix = '../'
conn = sqlite3.connect(prefix + 'kottke.db')
c = conn.cursor()

# Month abbreviation (first word of a post's permalink text) -> two-digit month.
months = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}

# Patterns applied to the serialized HTML of a post; compiled once, used per post.
_HREF_RE = re.compile(r'<a href="(.+?)"')
_PERMALINK_RE = re.compile(r'title="permanent link">([a-zA-Z0-9,_ ]+)</a>')
_YOUTUBE_RE = re.compile(r'src="http://www.youtube.com/v/([a-zA-Z0-9_]+)')
_VIMEO_RE = re.compile(r'src="http://player.vimeo.com/video/([0-9]+)')


def _split_links(bodytext):
    """Split the hrefs found in *bodytext* into ordinary links and a hat tip.

    A link is treated as the hat tip when the text contains
    ``(via <a href="LINK">NAME</a>``; it is then formatted as ``NAME<LINK>``.
    If several links qualify, the last one wins.  Every other link is
    collected into a string in which each URL is followed by a comma.

    Returns a ``(links, hattip)`` tuple of strings.
    """
    plain = []
    hattip = ""
    for link in _HREF_RE.findall(bodytext):
        # The URL must be matched literally: left unescaped, characters such
        # as "?", "+" or parentheses are read as regex syntax, which either
        # hides a genuine hat tip or raises re.error and aborts the scan.
        via = re.findall(
            r'\(via <a href="' + re.escape(link) + r'">(.+?)</a>', bodytext)
        if via:
            hattip = via[0] + "<" + link + ">"
        else:
            plain.append(link + ",")
    return "".join(plain), hattip


def _extract_tags(metatext):
    """Return the tag names linked from *metatext*, each followed by a comma.

    A tag is any href starting with ``/tag/``; that prefix is stripped.
    """
    return "".join(href[5:] + ","
                   for href in _HREF_RE.findall(metatext)
                   if href.startswith("/tag/"))


def _parse_published(metatext):
    """Return the permalink date found in *metatext* as ``YYYY-MM-DD``.

    The permalink text is split on runs of spaces into month abbreviation,
    day and year (presumably of the form ``Oct 5, 2011`` -- confirm against
    the live markup).

    Raises IndexError when no permalink (or too few fields) is found and
    KeyError when the month abbreviation is not in ``months``.
    """
    stamp = _PERMALINK_RE.findall(metatext)
    fields = re.split(" +", stamp[0])
    day = fields[1][:-1]  # drop the last character (presumably the comma)
    if len(day) == 1:
        day = "0" + day
    return fields[2] + "-" + months[fields[0]] + "-" + day


def _extract_videos(bodytext):
    """Return the YouTube, then Vimeo, embed URLs in *bodytext*.

    Each URL is followed by a comma.
    """
    found = ["http://youtube.com/v/" + vid + ","
             for vid in _YOUTUBE_RE.findall(bodytext)]
    found += ["http://player.vimeo.com/video/" + vid + ","
              for vid in _VIMEO_RE.findall(bodytext)]
    return "".join(found)


# Create table
c.execute("create table if not exists kottke(id INTEGER PRIMARY KEY AUTOINCREMENT, title blob, url blob, published blob, source blob, videos blob, tags blob, hattip blob)")

# Two-digit year (98, 99, 0-9, 10, 11) and month (1-12) at which to begin crawling.
year = 7
month = 1
# The crawl stops when it reaches this month (exclusive).
# Inelegant, but update these to stop at the current month.
END_YEAR = 11
END_MONTH = 10

try:
    while year != END_YEAR or month != END_MONTH:
        url = "http://kottke.org/%02d/%02d/" % (year, month)
        print("<-----------------------" + url)
        soup = BeautifulSoup(urllib2.urlopen(url))
        for entry in soup.findAll('div', {"class": "post"}):
            # The title link sits in an <h3>; fall back to <h2> when the post
            # has no usable <h3>.
            try:
                title = entry.h3.contents[0]
            except (AttributeError, IndexError):
                title = entry.h2.contents[0]
            urlf = title['href']
            head = unicode(title.contents[0])

            bodytext = "".join(str(p) for p in entry.findAll('p'))
            links, hattip = _split_links(bodytext)

            metatext = str(entry.findAll('div', {"class": "meta"})[0])
            tags = _extract_tags(metatext)
            dat = _parse_published(metatext)
            videos = _extract_videos(bodytext)

            # Best effort: a row SQLite refuses to store is reported and
            # skipped so that the rest of the crawl continues.
            try:
                c.execute("insert into kottke(title, url, published, source, videos, tags, hattip) values (?,?,?,?,?,?,?)", (head, urlf, dat, links, videos, tags, hattip))
            except sqlite3.Error:
                print("bad character")
        # Persist the rows gathered from this archive page.
        conn.commit()
        month = month + 1
        if month == 13:
            month = 1
            # Two-digit years wrap from 99 back to 0.
            year = (year + 1) % 100
finally:
    # Release the database handles even if the crawl dies part-way through.
    c.close()
    conn.close()