
/lib/catalogparser.py

https://github.com/jeffh/YACS
import urllib2
import re

from BeautifulSoup import BeautifulSoup

from rpi_courses.config import DEPARTMENTS


def load_page(url, data=None):
    # Load the page at the given url. Do not include "http://" in the url;
    # it is prepended here. Returns None if the request fails.
    try:
        website = urllib2.urlopen("http://" + url, data)
        return website.read()
    except urllib2.URLError as e:
        print "Could not retrieve catalog url because:", e.reason


def get_catalogs(frontpage):
    # Get the catalog ids for all available years from the front page.
    ids = re.findall(r'<option value="\d+"\s*\w*>Rensselaer Catalog', frontpage)
    return [re.search(r'\d+', i).group(0) for i in ids]


def get_courses_link_id(page):
    # Get the navigation id of the "Courses" link on a catalog page.
    link = re.search(r'\d+" class="navbar" \w+="\d+">Courses</a>', page)
    return re.search(r'\d+', link.group(0)).group(0)


def get_course_ids(department_page):
    # Get the course ids (coid) listed on a department page.
    ids = re.findall(r'coid=\d+"', department_page)
    return [re.search(r'\d+', i).group(0) for i in ids]


def get_course_detail(course_page):
    # Parse a course preview page into a dict with the department, number,
    # title, description and prerequisites. Returns None if the title
    # cannot be parsed.
    course_page = re.sub('<br */?>', '\n', course_page)
    soup = BeautifulSoup(course_page, convertEntities=BeautifulSoup.HTML_ENTITIES)
    title_text = soup.findAll('h1 h2 h3 h4 h5 h6'.split(' '))[0].text
    title = re.search(r'([\w+\s]+) (\d+\w+) \- (.*)', title_text)
    if title:
        course = {
            'department': title.group(1),
            'num': title.group(2),
            'title': title.group(3),
            'description': get_course_description(soup.findAll('hr')[0].nextSibling),
            'prereqs': get_course_reqs(soup),
        }
    else:
        print "Failed to parse course:", title_text
        course = None
    return course


def get_course_description(tag):
    # Collect the text siblings following the given tag until the next
    # <em>, <strong> or <p> element.
    current = tag
    contents = []
    while current and getattr(current, 'name', 'text') not in ('em', 'strong', 'p'):
        contents.append(getattr(current, 'text', current.string))
        current = current.nextSibling
    return ''.join(contents).strip()


def get_course_reqs(tag):
    # Return the text following the "Prerequisites/Corequisites:" label,
    # or 'None' if the page has no such label.
    if tag.findAll('strong'):
        current = tag.findAll('strong')[0]
        if getattr(current, 'text', current.string) == 'Prerequisites/Corequisites:':
            return ''.join(current.nextSibling).strip()
    return 'None'


def special(tags):
    # Strip html tags from a string, keeping only the text between them.
    contents = re.findall('>??(.*?)<.*?>', tags)
    return "".join(contents)


def parse_catalog(a=False):
    # Parse course info from the RPI catalog. If a is True, parse every
    # catalog year; otherwise only the first catalog listed on the front page.
    courses = {}
    url = "catalog.rpi.edu"
    ids = get_catalogs(load_page(url))
    if a:
        catalogs = len(ids)
    else:
        catalogs = 1
    for i in range(catalogs):
        catalog_url = url + "/index.php?catoid=" + ids[i]
        link_id = get_courses_link_id(load_page(catalog_url))
        courses_url = url + "/content.php?catoid=" + ids[i] + "&navoid=" + link_id
        # We need to parse the coid (course id) out of each department's list of
        # courses, then use it in the url:
        #   http://catalog.rpi.edu/preview_course.php?catoid=<id>&navoid=<link_id>&coid=<course>
        # This brings up the description and info for just that course.
        for dept in DEPARTMENTS.keys():
            print "parsing", dept
            course_ids = get_course_ids(load_page(courses_url, "filter[27]=" + dept))
            for coid in course_ids:
                detail_url = url + "/preview_course.php?catoid=" + ids[i] + "&coid=" + coid
                temp = get_course_detail(load_page(detail_url))
                if temp:
                    key = temp['department'] + temp['num']
                    if (key not in courses or temp['description'].strip() != '') and re.search('Topics in', temp['title']) is None:
                        courses[key] = temp
    return courses
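
A minimal usage sketch of the module's entry point. This driver script is hypothetical and not part of the repo; it assumes the file above is importable as catalogparser and that rpi_courses.config.DEPARTMENTS resolves on the Python path.

# Hypothetical driver (not part of the repo): scrape the catalog and print
# each course key with its title and prerequisites.
from catalogparser import parse_catalog

courses = parse_catalog()  # pass a=True to walk every catalog year, not just the first one listed
for key in sorted(courses):
    course = courses[key]
    print key, "-", course['title']
    print "    prereqs:", course['prereqs']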