get_kottke.py - Probably not important to run this often Cr…

/tools/get_kottke.py

https://github.com/wilson428/Robottke · Python · 97 lines · 74 code · 14 blank · 9 comment · 18 complexity · 29239fc0d48edbdb5da12beee29178c2 MD5 · raw file


#scans kottke archives and gathers relevant data.
#Probably not important to run this often

import urllib2 
import sqlite3
import re
from datetime import datetime

from BeautifulSoup import BeautifulSoup

prefix = '../'

conn = sqlite3.connect(prefix + 'kottke.db')
c = conn.cursor()
t = datetime.now()

months = {"Jan" : "01", "Feb" : "02", "Mar" : "03", "Apr" : "04", "May" : "05", "Jun" : "06", "Jul" : "07", "Aug" : "08", "Sep" : "09", "Oct" : "10", "Nov" : "11", "Dec" : "12" }

# Create table
c.execute("create table if not exists kottke(id INTEGER PRIMARY KEY AUTOINCREMENT, title blob, url blob, published blob, source blob, videos blob, tags blob, hattip blob)")

#year (98,99,0-9,10,11) and month (1-12) to beginning crawling
year = 7
month = 1

#Patterns applied to every post; compiled once rather than once per entry.
_HREF_RE = re.compile(r'<a href="(.+?)"')
_PERMALINK_RE = re.compile(r'title="permanent link">([a-zA-Z0-9,_ ]+)</a>')
#YouTube ids may contain "-" as well as "_", so the class allows both.
_YOUTUBE_RE = re.compile(r'src="http://www.youtube.com/v/([a-zA-Z0-9_-]+)')
_VIMEO_RE = re.compile(r'src="http://player.vimeo.com/video/([0-9]+)')


def _split_links(bodytext):
    """Separate a post's outbound links from its "(via ...)" hat tip.

    Returns a (links, hattip) tuple. links holds every href that is not a
    hat tip, each followed by a comma. hattip is "name<url>" for the last
    "(via <a href=...>name</a>" found, or "" when there is none.
    """
    links = []
    hattip = ""
    for link in _HREF_RE.findall(bodytext):
        #The href is text scraped from the page: escape it so "?", "+", "("
        #and friends are matched literally instead of being read as regex
        #syntax (which used to mismatch or raise and lose the other links).
        via = re.findall(r'\(via <a href="' + re.escape(link) + r'">(.+?)</a>', bodytext)
        if via:
            hattip = via[0] + "<" + link + ">"
        else:
            links.append(link)
    return "".join(link + "," for link in links), hattip


def _parse_published(meta_html):
    """Return the permalink date found in meta_html as "YYYY-MM-DD".

    Returns None when no permalink date is present or when it does not
    split into a known month abbreviation, a day and a year.
    """
    found = _PERMALINK_RE.findall(meta_html)
    if not found:
        return None
    parts = re.split(" +", found[0])
    if len(parts) < 3 or parts[0] not in months:
        return None
    #The day token's last character is dropped (presumably the comma in
    #"Jan 5, 2007" - the pattern above allows commas in the date text).
    day = parts[1][:-1]
    if len(day) == 1:
        day = "0" + day
    return parts[2] + "-" + months[parts[0]] + "-" + day


#Walk the monthly archive pages from (year, month) up to, but not including,
#year 11 / month 10, storing one row per post.
while year != 11 or month != 10: #inelegant, but update to stop at current month
    url = "http://kottke.org/%02d/%02d/" % (year, month)
    print("<-----------------------" + url)
    soup = BeautifulSoup(urllib2.urlopen(url))
    for entry in soup.findAll('div', { "class" : "post" }):
        #The headline link sits in an <h3> on some pages and an <h2> on others.
        try:
            title = entry.h3.contents[0]
        except (AttributeError, IndexError):
            title = entry.h2.contents[0]
        urlf = title['href']
        head = unicode(title.contents[0])

        #Render the paragraphs as unicode (like the title) rather than as
        #UTF-8 byte strings: sqlite3 refuses non-ASCII byte strings, which
        #used to drop such posts with "bad character".
        bodytext = u"".join(unicode(p) for p in entry.findAll('p'))
        links, hattip = _split_links(bodytext)

        meta = entry.findAll('div', {"class" : "meta"})
        if not meta:
            #One malformed post should not abort the whole crawl.
            print("no meta block, skipping " + urlf)
            continue
        meta_html = unicode(meta[0])

        #Tag links look like "/tag/<name>"; keep only the name.
        tags = "".join(
            href[5:] + "," for href in _HREF_RE.findall(meta_html)
            if href.startswith("/tag/")
        )

        dat = _parse_published(meta_html)
        if dat is None:
            print("no publication date, skipping " + urlf)
            continue

        videos = "".join(
            ["http://youtube.com/v/" + video + ","
             for video in _YOUTUBE_RE.findall(bodytext)]
            + ["http://player.vimeo.com/video/" + video + ","
               for video in _VIMEO_RE.findall(bodytext)]
        )

        try:
            c.execute("insert into kottke(title, url, published, source, videos, tags, hattip) values (?,?,?,?,?,?,?)", (head, urlf, dat, links, videos, tags, hattip))
        except sqlite3.Error as err:
            #Keep crawling, but say what the database actually objected to.
            print("bad character: %s" % err)
        #Commit per post so an interrupted crawl keeps what it has gathered.
        conn.commit()

    month += 1
    if month == 13:
        month = 1
        year += 1

#Release the cursor, then the connection itself; closing only the cursor
#leaves the database handle open until the interpreter exits.
c.close()
conn.close()

Tech Fingerprint

Alerts (3)

'sqlite3.connect(' Use 'with sqlite3.connect()' for automatic connection closure
13
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
59 86