/lib/catalogparser.py
import urllib2
import re

from BeautifulSoup import BeautifulSoup
from rpi_courses.config import DEPARTMENTS

def load_page(url, data=None):
    # Load the page at the given url; "http://" is prepended automatically,
    # so do not include it. If data is given, it is sent as a POST body.
    try:
        website = urllib2.urlopen("http://" + url, data)
        return website.read()
    except urllib2.URLError, e:
        print "Could not retrieve catalog url because:", e.reason

def get_catalogs(frontpage):
    # Get the catoid of every catalog year listed on the front page.
    ids = re.findall(r'<option value="\d+"\s*\w*>Rensselaer Catalog', frontpage)
    out = []
    for i in ids:
        out.append(re.search(r"\d+", i).group(0))
    return out
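
# For reference, the front-page markup this scrapes looks roughly like the
# abbreviated line below (the id and year are illustrative only), so
# get_catalogs(front) would return a list of catoid strings such as ['13']:
#
#   <option value="13" selected>Rensselaer Catalog 2011-2012</option>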

def get_courses_link_id(page):
    # Get the navoid of the "Courses" link on a catalog page.
    link = re.search(r'\d+" class="navbar" \w+="\d+">Courses</a>', page)
    return re.search(r"\d+", link.group(0)).group(0)

def get_course_ids(department_page):
    # Get the coid (course id) of every course listed on a department page.
    ids = re.findall(r'coid=\d+"', department_page)
    out = []
    for i in ids:
        out.append(re.search(r'\d+', i).group(0))
    return out

def get_course_detail(course_page):
    # Parse a single course preview page into a dict of course fields, or
    # return None if the heading does not look like "<DEPT> <NUM> - <TITLE>".
    course_page = re.sub('<br */?>', '\n', course_page)
    soup = BeautifulSoup(course_page, convertEntities=BeautifulSoup.HTML_ENTITIES)
    title_text = soup.findAll('h1 h2 h3 h4 h5 h6'.split(' '))[0].text
    title = re.search(r'([\w\s]+) (\d+\w+) - (.*)', title_text)
    if title:
        course = {
            'department': title.group(1),
            'num': title.group(2),
            'title': title.group(3),
            'description': get_course_description(soup.findAll('hr')[0].nextSibling),
            'prereqs': get_course_reqs(soup),
        }
    else:
        print "Failed to parse course:", title_text
        course = None
    return course
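
# A hedged illustration of the dict shape get_course_detail returns for a
# heading such as "CSCI 1100 - Computer Science I" (field values are examples,
# not real catalog data):
#
#   {'department': 'CSCI', 'num': '1100', 'title': 'Computer Science I',
#    'description': '...', 'prereqs': 'None'}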

def get_course_description(tag):
    # Collect the text of every sibling after the given tag, stopping at the
    # next em/strong/p element.
    current = tag
    contents = []
    while current and getattr(current, 'name', 'text') not in ('em', 'strong', 'p'):
        contents.append(getattr(current, 'text', current.string))
        current = current.nextSibling
    return ''.join(contents).strip()

def get_course_reqs(tag):
    # Return the "Prerequisites/Corequisites:" text for a course page, or
    # 'None' if the page has no such section.
    strongs = tag.findAll('strong')
    if strongs:
        current = strongs[0]
        if getattr(current, "text", current.string) == 'Prerequisites/Corequisites:':
            return ''.join(current.nextSibling).strip()
    return 'None'

def special(tags):
    # Strip markup from a fragment of html, keeping only the text between tags.
    contents = re.findall('>??(.*?)<.*?>', tags)
    return "".join(contents)

def parse_catalog(a=False):
    # Parse every course in the catalog. By default only the most recent
    # catalog year is parsed; pass a=True to walk all available catalogs.
    courses = {}
    url = "catalog.rpi.edu"
    ids = get_catalogs(load_page(url))
    if a:
        catalogs = len(ids)
    else:
        catalogs = 1
    for i in range(catalogs):
        catalog_url = url + "/index.php?catoid=" + ids[i]
        link_id = get_courses_link_id(load_page(catalog_url))
        courses_url = url + "/content.php?catoid=" + ids[i] + "&navoid=" + link_id
        # The coid (course id) must be parsed out of each department's list of
        # courses, then used in the url:
        #   http://catalog.rpi.edu/preview_course.php?catoid=<id>&navoid=<link_id>&coid=<course>
        # which brings up the description and info for just that course.
        for e in DEPARTMENTS.keys():
            print "parsing", e
            course_ids = get_course_ids(load_page(courses_url, "filter[27]=" + e))
            for coid in course_ids:
                detail_url = url + "/preview_course.php?catoid=" + ids[i] + "&coid=" + coid
                temp = get_course_detail(load_page(detail_url))
                if temp:
                    key = temp['department'] + temp['num']
                    # Store the course if it is new or has a non-empty
                    # description, skipping any "Topics in" listings.
                    if (key not in courses or temp['description'].strip() != '') and re.search('Topics in', temp['title']) is None:
                        courses[key] = temp
    return courses
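
# A small, optional driver sketch: running the module directly parses the most
# recent catalog and prints a short summary. The summary format is an
# assumption for illustration, not part of the original module.
if __name__ == '__main__':
    parsed = parse_catalog()  # pass parse_catalog(True) to walk every catalog year
    print "parsed", len(parsed), "courses"
    for key in sorted(parsed.keys())[:5]:
        print key, "-", parsed[key]['title']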