
/scrape.py

https://github.com/newhouseb/SCPD-Scraper
Python | 56 lines
import re
import os
import passwords
import sys
from subprocess import Popen
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
from multiprocessing import Pool

def download(work):
    if os.path.exists(work[1]):
        print "Already downloaded", work
        return
    print "Starting", work
    # Put it in a temp file
    if os.system("mimms -c %s %s" % (work[0], "_" + work[1])) == 0:
        # Move the file on success
        os.system("mv %s %s" % ("_" + work[1], work[1]))
    print "Finished", work

if __name__ == '__main__':
    # Pretend we're just a regular old user (this is naughty, don't try this at home kids)
    br = Browser()
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6; en-us) AppleWebKit/531.9 (KHTML, like Gecko) Version/4.0.3 Safari/531.9')]
    br.set_handle_robots(False)
    br.open("https://myvideosu.stanford.edu/oce/currentquarter.aspx")
    assert br.viewing_html()

    # Import from a module outside of version control your SUNET id and password
    br.select_form(name="login")
    br["username"] = passwords.my_username
    br["password"] = passwords.my_password

    # Open the course page for the title you're looking for
    response = br.submit()
    response = br.follow_link(text=sys.argv[1])

    # Build up a list of lectures
    links = []
    for link in br.links(text="WMP"):
        links.append(re.search(r"'(.*)'", link.url).group(1))

    videos = []
    # These are done serially purely to not look suspicious; we could probably parallelize this as well
    for link in links:
        response = br.open(link)
        soup = BeautifulSoup(response.read())
        video = soup.find('object', id='WMPlayer')['data']
        video = re.sub("http", "mms", video)
        output_name = re.search(r"[a-z]+[0-9]+[a-z]?/[0-9]+", video).group(0).replace("/", "_") + ".wmv"
        videos.append((video, output_name))

    # Make a pool of 5 worker processes and download 5 files at a time
    p = Pool(processes=5)
    p.map(download, videos)
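
The script is Python 2 (print statements, mechanize, the BeautifulSoup 3 import) and reads its login credentials from a passwords module kept outside version control, as the comment above br.select_form notes. A minimal passwords.py sketch, assuming plain-string values and the attribute names the script actually reads (my_username, my_password):

    # passwords.py -- keep this file out of version control
    my_username = "sunetid"        # your SUNET ID
    my_password = "your-password"  # the matching password

To run it, mimms has to be installed, since the download step shells out to it, and the single command-line argument must match the course link text on the MyVideoSU current-quarter page exactly, e.g. (hypothetical course title):

    python scrape.py "Machine Learning"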