/crawl.py

https://github.com/tarunrs/osu-events-server · Python · 70 lines · 63 code · 7 blank · 0 comment · 6 complexity · 0b9040664a35a7b97f8621fa454265a8 MD5 · raw file

  1. import urllib2
  2. import re
  3. import time
  4. from datetime import date, timedelta, datetime
  5. from BeautifulSoup import BeautifulSoup, NavigableString, Tag
  6. from Events import OSUEvents, Categories, Locations, Event_Types
  7. from sqlalchemy.orm import sessionmaker
  8. from sqlalchemy import *
  9. def strip_for_dict(str):
  10. pattern = re.compile('[\W_]+')
  11. return pattern.sub('', str)
  12. def load_categories(session):
  13. dict_c = {}
  14. db_categories = session.query(Categories)
  15. for c in db_categories:
  16. dict_c[strip_for_dict(c.category_title)] = c.category_id
  17. return dict_c
  18. def load_event_types(session):
  19. dict_et = {}
  20. db_event_types = session.query(Event_Types)
  21. for c in db_event_types:
  22. dict_et[strip_for_dict(c.event_type_title)] = c.event_type_id
  23. return dict_et
  24. def load_locations(session):
  25. dict_l = {}
  26. db_locations = session.query(Locations)
  27. for l in db_locations:
  28. dict_l[strip_for_dict(l.location_title)] = l.location_id
  29. return dict_l
  30. def get_osu_events(num_days, session, categories, locations, event_types):
  31. which_date = date.today()
  32. td = timedelta(days=1)
  33. for i in range(num_days):
  34. str_date = which_date.strftime('%Y-%m-%-d')
  35. page_url = 'http://www.osu.edu/events/indexDay.php?Event_ID=&Date=' + str_date
  36. html_doc = urllib2.urlopen(page_url).read()
  37. soup = BeautifulSoup(html_doc)
  38. events = soup.table.contents[3].td.findAll("p")
  39. for e in events:
  40. event_name = e.contents[0].text
  41. event_link = "http://www.osu.edu/events/" + str(e.contents[0]['href'])
  42. print event_link
  43. event = OSUEvents(event_name, event_link)
  44. event.load_details(session, categories, locations, event_types)
  45. db_event = session.query(OSUEvents).filter_by(event_link=event.event_link).first()
  46. if not db_event :
  47. session.add(event)
  48. session.commit()
  49. which_date = which_date + td
  50. db = create_engine('mysql://root:tarun123@localhost/osu_events')
  51. Session = sessionmaker(bind=db)
  52. db.echo = False
  53. metadata = MetaData()
  54. metadata.create_all(db)
  55. session = Session()
  56. d_locations = load_locations(session)
  57. d_categories = load_categories(session)
  58. d_event_types = load_event_types(session)
  59. print d_locations
  60. print d_categories
  61. print d_event_types
  62. get_osu_events(12, session, d_categories, d_locations, d_event_types)
  63. print d_categories