/utils/fetcher.py
Python | 912 lines | 892 code | 0 blank | 20 comment | 0 complexity | 8dcdf04f4982b6e6316076d2d27b8c35 MD5 | raw file
Possible License(s): BSD-3-Clause
- #!/usr/bin/env python
- # coding=utf-8
- """
- copyright (c) 2009, paketka@gmail.com et. al
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the <ORGANIZATION> nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- """
- import urllib
- import httplib
- import HTMLParser
- import libxml2
- import re
- import os
- import sys
- import StringIO
- import unicodedata
# TODO: these definitions below should be
# moved to global settings
PSP_URL = 'www.psp.cz'  # host of the Czech Chamber of Deputies website
def fixStr(s):
    """Decode a UTF-8 byte string and normalize Czech diacritics.

    S/s/Z/z with caron are dropped entirely; the remaining accented
    characters are collapsed to a literal '?' placeholder, exactly as
    the original per-character replacement chain did.
    """
    # Build a translate table: ordinal -> replacement (None deletes).
    table = {}
    for dropped in u'ŠšžŽ':
        table[ord(dropped)] = None
    for collapsed in u'řŘČčďĎěňŇůťľĽ':
        table[ord(collapsed)] = u'?'
    return s.decode('utf-8').translate(table)
def processHref(node, pattern, keys, record):
    """Extract named regex groups from a node's ``href`` attribute.

    node    -- libxml2 node that may carry an ``href`` property
    pattern -- compiled regex with named groups, e.g.
               re.compile(r'(?P<key>.+):(?P<val>\d+)')
               (the original docstring example was missing the '?'
               before 'P<...>', which is invalid group syntax)
    keys    -- list of group names to copy out, e.g. ['key', 'val']
    record  -- dict the matched groups are written into

    Returns the number of keys copied into ``record``; 0 when the node
    has no href property or the pattern does not match.
    """
    retVal = 0
    href = node.hasProp('href')
    if href is not None:
        hrefVal = href.getContent()
        match = pattern.match(hrefVal)
        if match is not None:
            for k in keys:
                record[k] = match.group(k)
                retVal += 1
    return retVal
-
def fetchTerms():
    """Fetch the list of electoral terms of the Chamber of Deputies.

    Parses http://www.psp.cz/sqw/fsnem.sqw?zvo=1 (zvo = choose electoral
    term) and keeps only the entries that look like
    'Poslanecka snemovna (...)' headings.

    Returns a list of dicts with keys 'termId', 'yearStart', 'yearEnd'
    ('yearEnd' stays '' for the still-running, open-ended term).
    """
    conn = httplib.HTTPConnection(PSP_URL)
    local_path = '/sqw/fsnem.sqw?zvo=1'
    conn.request('GET', local_path)
    resp = conn.getresponse()
    retVal = []
    if resp.status == 200:
        htmlPage = resp.read().decode('cp1250', 'ignore')
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage.encode('iso8859-2', 'ignore'), '', None, opts)
        divs = doc.xpathEval("//div[@class='text no-secmenu']")
        if len(divs) == 1:
            termPattern = re.compile(r'fsnem.sqw\?o=(?P<termId>\d+)')
            for term in divs[0].xpathEval('a'):
                tmpRec = {
                    'yearStart': None,
                    'yearEnd': None,
                    'termId': '',
                }
                if processHref(term, termPattern, ['termId'], tmpRec) > 0:
                    retVal.append(tmpRec)
            # BUG FIX: the local used to be named `str`, shadowing the builtin.
            content = fixStr(divs[0].get_content())
            # BUG FIX: the patterns previously read 'sn?movna', where the
            # unescaped '?' is an optional-quantifier, so they could never
            # match the literal '?' that fixStr() substitutes for 'e-caron'.
            startEnd = re.compile(u'\s*Poslanecká sn\?movna \((?P<yearStart>\d+) - (?P<yearEnd>\d+)\).*', re.U)
            startOnly = re.compile(u'\s*Poslanecká sn\?movna \(od (?P<yearStart>\d+)\).*', re.U)
            i = 0
            matchedStartOnly = False
            for chunk in content.split(','):
                match = startEnd.match(chunk)
                start = ''
                end = ''
                if match is not None:
                    start = match.group('yearStart')
                    end = match.group('yearEnd')
                elif not matchedStartOnly:
                    # At most one open-ended ("od YYYY") term is expected.
                    match = startOnly.match(chunk)
                    if match is not None:
                        matchedStartOnly = True
                        start = match.group('yearStart')
                # Guard the index: the page may list more year ranges than
                # term links were collected above.
                if match is not None and i < len(retVal):
                    retVal[i]['yearStart'] = start
                    retVal[i]['yearEnd'] = end
                    i += 1
        doc.freeDoc()
    return retVal
def fetchPSPMeetings(electTerm):
    """Fetch the list of plenary meetings for one electoral term.

    electTerm -- electoral term id; five terms so far:
        1. 1992-1996, 2. 1996-1998, 3. 1998-2002,
        4. 2002-2006, 5. 2006-2010
    A term normally lasts four years (shorter if the Chamber dissolves).
    Meeting numbers are unique only within a term, and one meeting may
    span several days.

    Returns a list of dicts: name, url, days (list of
    {day, month, year, url}), elTerm, meetingNo.
    """
    conn = httplib.HTTPConnection(PSP_URL)
    local_path = '/sqw/hlasovani.sqw?o=%s' % electTerm
    conn.request('GET', local_path)
    resp = conn.getresponse()
    retVal = []
    if resp.status == 200:
        htmlPage = resp.read().decode('cp1250', 'ignore')
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage.encode('iso8859-2', 'ignore'), '', None, opts)
        divs = doc.xpathEval("//div[@class='text no-secmenu']")
        if len(divs) == 1:
            meetings = divs[0].xpathEval('//b/a')
            dates = divs[0].xpathEval('//b/following-sibling::*')
            datePattern = re.compile(
                r'hl.sqw\?o=\d+&s=\d+&d=(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})'
            )
            mNoPattern = re.compile(r'hl\.sqw\?o=\d+&s=(?P<meetingNo>\d+)')
            for meeting in meetings:
                record = {
                    'name': fixStr(meeting.get_content()),
                    'url': meeting.hasProp('href').get_content(),
                    'days': [],
                    'elTerm': electTerm,
                    'meetingNo': '',
                }
                match = mNoPattern.match(record['url'])
                if match is not None:
                    record['meetingNo'] = match.group('meetingNo')
                # Consume the day links belonging to this meeting; a <br>
                # element terminates the per-meeting list.
                while len(dates) > 0 and dates[0].name != 'br':
                    date = dates.pop(0)
                    tmpRec = {}
                    if processHref(date, datePattern, ['day', 'month', 'year'], tmpRec) == 0:
                        continue
                    record['days'].append({
                        'day': int(tmpRec['day']),
                        'month': int(tmpRec['month']),
                        'year': int(tmpRec['year']),
                        'url': date.hasProp('href').get_content(),
                    })
                # BUG FIX: the original popped unconditionally here and
                # raised IndexError once the sibling list was exhausted.
                if dates:
                    dates.pop(0)  # drop the <br> separator
                retVal.append(record)
        doc.freeDoc()
    return retVal
def fetchPSPMembers(electTerm):
    """Fetch the members (MPs) of the Chamber for one electoral term.

    electTerm -- electoral term id (see fetchPSPMeetings for the list).

    Returns a list of member dicts: name, titlePre, titlePost, pspId,
    partyId, areaId, boardIds, commIds, delegIds (the last three are
    lists of organisation ids).
    """
    conn = httplib.HTTPConnection(PSP_URL)
    local_path = '/sqw/snem.sqw?P1=0&P2=0&l=cz&o=%s' % electTerm
    conn.request('GET', local_path)
    resp = conn.getresponse()
    retVal = []
    if resp.status == 200:
        htmlPage = resp.read().decode('cp1250', 'ignore')
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage.encode('iso8859-2', 'ignore'), '', None, opts)
        divs = doc.xpathEval("//div[@class='text no-secmenu']")
        if len(divs) == 1:
            tableRows = divs[0].xpathEval('//table/tr')
            pspPat = re.compile(r'.*detail\.sqw\?id=(?P<pspId>\d+).*')
            postTitle = re.compile(r'.*,(?P<postTitle>.+)')
            areaId = re.compile(r'.*snem.sqw\?.*id=(?P<areaId>\d+).*')
            partyId = re.compile(r'.*snem.sqw\?.*id=(?P<partyId>\d+).*')
            for row in tableRows:
                td = row.get_children()
                memberRecord = {
                    'name': '',
                    'titlePre': '',
                    'titlePost': '',
                    'pspId': '',
                    'partyId': '',
                    'areaId': '',
                    'boardIds': [],
                    'commIds': [],
                    'delegIds': [],
                }
                memberRecord['titlePre'] = fixStr(td.get_content())
                td = td.get_next()
                node = td.get_children()
                if processHref(node, pspPat, ['pspId'], memberRecord) == 0:
                    continue
                td = td.get_next()
                td = td.get_next()
                # The MP's name comes from the detail-link node two cells back.
                memberRecord['name'] = fixStr(node.get_content())
                match = postTitle.match(td.get_content())
                if match is not None:
                    memberRecord['titlePost'] = fixStr(match.group('postTitle'))
                node = td.get_children()
                if processHref(node, areaId, ['areaId'], memberRecord) == 0:
                    continue
                td = td.get_next()
                td = td.get_next()
                node = td.get_children()
                if processHref(node, partyId, ['partyId'], memberRecord) == 0:
                    continue
                td = td.get_next()
                td = td.get_next()
                for key in ['boardIds', 'commIds', 'delegIds']:
                    node = td.get_children()
                    # BUG FIX: this local used to be named `list`,
                    # shadowing the builtin.
                    ids = []
                    while node is not None:
                        tmpRec = {}
                        # NOTE(review): reuses the generic 'id=...' pattern
                        # (partyId) for board/committee/delegation links --
                        # the pattern is generic enough, but verify.
                        if processHref(node, partyId, ['partyId'], tmpRec) != 0:
                            ids.append(tmpRec['partyId'])
                        node = node.get_next()
                    memberRecord[key] = ids
                    td = td.get_next()
                    if td is None:
                        break
                    td = td.get_next()
                    if td is None:
                        break
                retVal.append(memberRecord)
        doc.freeDoc()
    return retVal
def fetchResultsOverview(electTerm, meetingNo):
    """Fetch the overview of all polls (votes) of one meeting.

    Walks the paginated listing /sqw/phlasa.sqw page by page until the
    results table comes back empty (or the expected div is missing).

    electTerm -- electoral term id
    meetingNo -- meeting number within the term

    Returns a list of poll dicts: pollNo, pollId, pollUrl, pointNo,
    pollName, pollDetailsURL, pollResult, pollDate ({year, month, day}).
    """
    conn = httplib.HTTPConnection(PSP_URL)
    stop = False
    retVal = []
    page = 1
    pollNo = 1
    # Compile once, outside the pagination and row loops.
    pollPattern = re.compile(r'.*hlasy.sqw\?G=(?P<pollId>\d+).*')
    datePat = re.compile(r'[^0-9]*(?P<day>\d+)\.\s*[^0-9]*(?P<month>\d+)\.\s*[^0-9]*(?P<year>\d+).*', re.U)
    while not stop:
        local_path = '/sqw/phlasa.sqw?o=%s&s=%s&pg=%u' % (electTerm, meetingNo, page)
        page += 1
        conn.request('GET', local_path)
        resp = conn.getresponse()
        if resp.status == 200:
            htmlPage = resp.read().decode('cp1250', 'ignore')
            opts = libxml2.HTML_PARSE_RECOVER
            opts += libxml2.HTML_PARSE_NOERROR
            opts += libxml2.HTML_PARSE_NOWARNING
            doc = libxml2.htmlReadDoc(htmlPage.encode('iso8859-2', 'ignore'), '', None, opts)
            divs = doc.xpathEval("//div[@class='text no-secmenu']")
            if len(divs) == 1:
                tableRows = divs[0].xpathEval('//table/tr')
                for row in tableRows:
                    td = row.get_children()
                    pollRecord = {
                        'pollNo': '',
                        'pollId': '',
                        'pollUrl': '',
                        'pointNo': '',
                        'pollName': '',
                        'pollDetailsURL': '',
                        'pollResult': '',
                        'pollDate': {
                            'year': 0,
                            'month': 0,
                            'day': 0,
                        },
                    }
                    td = td.get_next()
                    node = td.get_children()
                    if processHref(node, pollPattern, ['pollId'], pollRecord) == 0:
                        continue
                    pollRecord['pollUrl'] = 'http://www.psp.cz/sqw/' + node.hasProp('href').get_content()
                    td = td.get_next()
                    pollRecord['pointNo'] = td.get_content()
                    td = td.get_next()
                    pollRecord['pollName'] = fixStr(td.get_content())
                    td = td.get_next()
                    node = td.get_children()
                    if node.hasProp('href') is not None:
                        pollRecord['pollDetailsURL'] = node.hasProp('href').get_content()
                    match = datePat.match(td.get_content())
                    if match is not None:
                        pollRecord['pollDate']['year'] = int(match.group('year'))
                        pollRecord['pollDate']['month'] = int(match.group('month'))
                        pollRecord['pollDate']['day'] = int(match.group('day'))
                    td = td.get_next()
                    pollRecord['pollResult'] = fixStr(td.get_content())
                    pollRecord['pollNo'] = pollNo
                    pollNo += 1
                    retVal.append(pollRecord)
                # An empty table means we walked past the last page.
                stop = len(tableRows) == 0
            else:
                stop = True
            doc.freeDoc()
        else:
            # BUG FIX: a non-200 response previously left `stop` untouched,
            # so the loop retried the same page forever.
            stop = True
    return retVal
def fetchPollResult(electTerm, pollId):
    """Fetch the per-MP vote records of one poll.

    electTerm -- not used by this function (kept for call-site symmetry
                 with the other fetch* helpers)
    pollId    -- poll id, the G= query parameter of hlasy.sqw

    Returns a list of {'pspId': str, 'result': str} dicts, where result
    is one of the single-letter vote codes 'A', 'N', '0', 'Z', 'M', 'X'.
    """
    conn = httplib.HTTPConnection(PSP_URL)
    result = []  # NOTE(review): never used
    local_path = '/sqw/hlasy.sqw?G=%s' % (pollId)
    conn.request('GET', local_path)
    resp = conn.getresponse()
    retVal = []
    if resp.status == 200:
        # Unlike the other fetchers, the page is parsed as raw bytes
        # (no cp1250 decode) -- only ASCII fields are extracted here.
        htmlPage = resp.read()
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage, '', None, opts)
        divs = doc.xpathEval("//div[@class='text no-secmenu']")
        if len(divs) == 1:
            tableRows = divs[0].xpathEval('//table/tr')
            pspPat = re.compile(r'.*detail\.sqw\?id=(?P<pspId>\d+).*')
            for row in tableRows:
                td = row.get_children()
                # Each row holds repeated (vote-letter cell, MP-link cell)
                # pairs; walk them until a cell breaks the expected shape.
                while td:
                    pollRecord = {
                        'pspId': '',
                        'result': '',
                    }
                    pollRecord['result'] = td.get_content()
                    if not pollRecord['result'] in ['A', 'N', '0', 'Z', 'M', 'X']:
                        break
                    td = td.get_next()
                    node = td.get_children()
                    if node == None:
                        break
                    if processHref(node, pspPat, ['pspId'], pollRecord) == 0:
                        break
                    retVal.append(pollRecord)
                    td = td.get_next()
        doc.freeDoc()
    return retVal
def fetchMPPhoto(mpId, period):
    """Download the portrait photo of an MP into /tmp.

    mpId   -- MP id (detail.sqw id= parameter)
    period -- electoral term id (o= parameter)

    Returns the path of the saved JPEG file, or None when the detail
    page has no photo or a request fails.
    """
    conn = httplib.HTTPConnection(PSP_URL)
    local_path = '/sqw/detail.sqw?id=%s&o=%s' % (mpId, period)
    conn.request('GET', local_path)
    resp = conn.getresponse()
    retVal = None
    if resp.status == 200:
        htmlPage = resp.read()
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage, '', None, opts)
        img = doc.xpathEval("//div[@class='text no-secmenu']/table/tr/td/a/img")
        if len(img) == 1 and img[0].hasProp('src') and img[0].hasProp('src').getContent():
            url = img[0].hasProp('src').getContent()
            conn.request('GET', url)
            resp = conn.getresponse()
            if resp.status == 200:
                fname = os.path.join('/tmp/', str(mpId) + '.jpeg')
                # BUG FIX: the file used to be opened in text mode ('w');
                # JPEG data must be written in binary mode.  try/finally
                # guarantees the handle is closed even if write() fails.
                f = open(fname, 'wb')
                try:
                    f.write(resp.read())
                finally:
                    f.close()
                retVal = fname
        doc.freeDoc()

    return retVal
def getTParm(doc):
    """Collect the t= parameters carried by the '+' expander links.

    Scans every <tt><a> anchor in *doc*; anchors whose text is '+' hold
    a detail.sqw?...t=<id-list> href.  All extracted id lists are joined
    into one comma-separated string.
    """
    pattern = re.compile(r'.*detail.sqw\?.*t=(?P<tParm>\d+(\,\d+)*).*')
    parts = []
    for anchor in doc.xpathEval("//tt/a"):
        record = {}
        isExpander = anchor.get_content() == '+'
        if isExpander and processHref(anchor, pattern, ['tParm'], record) == 1:
            parts.append(record['tParm'])
    return ','.join(parts)
-
def fetchMembership(local_path):
    """Fetch the organisation memberships of one MP.

    local_path -- a /sqw/detail.sqw?... path including the t= parameter
                  (see getTParm) that expands all membership sections.

    Returns a list of dicts: orgName, orgId ('-1' when no id link was
    found), orgType (the section heading, e.g. 'Klub'), post, and
    start/end date dicts ({day, month, year}; zeros when unknown).
    """
    # membPat relies on the greedy (?P<start>...) group also swallowing
    # the optional 'do <date>' (until) tail; the end date is then
    # re-extracted from the 'start' group via dateTo below.
    membPat = re.compile(r'(?P<orgName>.+),?(?P<post>.+)od(?P<start>.+\d\d\d\d)(.*do(?P<end>.+))?', re.U)
    dateTo = re.compile(r'.*do(?P<dateTo>.+)', re.U)
    datePat = re.compile(r'[^0-9]*(?P<day>\d+)\.[^0-9]*(?P<month>\d+)\.[^0-9]*(?P<year>\d+).*', re.U)
    orgIdPat = re.compile(r'.*fsnem\.sqw\?id=(?P<orgId>\d+).*', re.U)
    govIdPat = re.compile(r'.*fsnem\.sqw\?org=(?P<orgId>\d+).*', re.U)
    conn = httplib.HTTPConnection(PSP_URL)
    result = []  # NOTE(review): never used
    conn.request('GET', local_path)
    resp = conn.getresponse()
    retVal = []
    if resp.status == 200:
        htmlPage = u''
        htmlPage += resp.read().decode('cp1250', 'ignore')
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage.encode('iso8859-2', 'ignore'), '', None, opts)
        infoBox = doc.xpathEval("//div[@id='text-related-secmenu']/div[@class='text no-secmenu']/table//tr/td/ul/p")
        type = ''  # NOTE(review): shadows the builtin `type`
        if len(infoBox) == 0:
            # Fallback for the alternate page layout.
            infoBox = doc.xpathEval("//div[@id='text-related-secmenu']")
        lines = fixStr(infoBox[0].get_content()).splitlines()
        i = 0
        links = infoBox[0].xpathEval('//a')
        # Skip leading links/lines until the first 'o <section>' heading.
        while links[0].get_content().strip() != 'o':
            links.pop(0)
        while not lines[0].startswith('o'):
            lines.pop(0)

        """
        we need to make the list of orgs, the MP is member of parsable
        we are converting it into form as follows:
        o Parlament
        Poslanecká sn#movna,poslanec od 3. 6. 2006
        o Výbor
        Rozpo#tový výbor,#len od 12. 9. 2006
        o Podvýbor
        Podvýbor pro finan#ní hospoda#ení územních samospráv a pro vyu#ívání fond# Evropské unie,#len od 1. 12. 2006
        Podvýbor pro finan#ní hospoda#ení územních samospráv a pro vyu#ívání fond# Evropské unie,p#edseda od 6. 12. 2006
        o Komise
        Stálá komise pro bankovnictví,#len od 8. 11. 2006
        o Klub
        Poslanecký klub Komunistické strany #ech a Moravy,#len od 8. 6. 2006
        o Meziparlamentní skupina v rámci MPU
        Skupina #R - ASEAN,#len od 1. 3. 2007
        Skupina #R - Rakousko,#len od 1. 3. 2007
        Skupina #R - #ína,#len od 21. 3. 2007
        o Instituce
        Prezídium Pozemkového fondu #eské republiky,#len od 16. 12. 2005
        """
        line = u''
        for l in lines:
            line += l
        # NOTE(review): the 4th positional argument of re.sub is *count*,
        # not flags -- passing re.U (== 32) caps the number of
        # substitutions at 32; flags=re.U was almost certainly intended.
        s = re.sub('(?P<digit>\d)o', '\g<digit>\no', line, re.U)
        s = re.sub('(?P<remember>[^o])\xa0\xa0', '\g<remember>\n', s, re.U)
        lines = s.splitlines()
        for l in links:
            member = {
                'orgName': '',
                'orgId': '',
                'orgType': '',
                'post': '',
                'start': {
                    'day': 0,
                    'month': 0,
                    'year': 0,
                },
                'end': {
                    'day': 0,
                    'month': 0,
                    'year': 0,
                }
            }
            if l.get_content().startswith('o'):
                # Section heading: remember the org type for the rows below.
                type = lines[i].strip('o').strip(' ')
            elif processHref(l, orgIdPat, ['orgId'], member) != 1:
                # Government bodies use org= instead of id= in the href.
                processHref(l, govIdPat, ['orgId'], member)
            if len(member['orgId']) == 0:
                member['orgId'] = '-1'
            membMatch = None
            try:
                membMatch = membPat.match(lines[i].strip())
            except:  # NOTE(review): bare except -- hides IndexError when links outnumber lines
                break
            if membMatch != None:
                for k in ['orgName', 'post']:
                    member[k] = membMatch.group(k).strip()

                start = membMatch.group('start')
                match = datePat.match(start)
                for k in member['start'].keys():
                    member['start'][k] = int(match.group(k))
                try:
                    # Optional 'do <date>' tail -> membership end date.
                    endDate = dateTo.match(start).group('dateTo')
                    match = datePat.match(endDate)
                    for k in member['end'].keys():
                        member['end'][k] = int(match.group(k))
                except:  # no end date -> membership still running
                    pass
                member['orgType'] = type
                retVal.append(member)
            i += 1
        doc.freeDoc()
    return retVal
-
def fetchMP(mpId, period):
    """Fetch the detail record of one MP.

    mpId   -- MP id (detail.sqw id= parameter)
    period -- electoral term id (o= parameter)

    Returns None on failure, otherwise a dict with birthDate
    ({year, month, day}), photoFile (path from fetchMPPhoto), office,
    regOffice, homePage, email, phone and membship (the list produced
    by fetchMembership).
    """
    local_path = '/sqw/detail.sqw?id=%s&o=%s' % (mpId, period)
    conn = httplib.HTTPConnection(PSP_URL)
    result = []  # NOTE(review): never used
    conn.request('GET', local_path)
    resp = conn.getresponse()
    retVal = None
    if resp.status == 200:
        htmlPage = u''
        htmlPage += resp.read().decode('cp1250', 'ignore')
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage.encode('iso8859-2', 'ignore'), '', None, opts)
        infoBox = doc.xpathEval('//td/ul')
        if len(infoBox) == 0:
            # Fallback for the alternate page layout.
            infoBox = doc.xpathEval('//td')
        if len(infoBox) != 0:
            info = None
            # Pick the cell carrying the 'Narozen' (born) line.
            for i in infoBox:
                if i.get_content().rfind('Narozen') != -1:
                    info = i
                    break
            if info == None:
                doc.freeDoc()
                return None
            retVal = {
                'birthDate': {
                    'year': 0,
                    'day': 0,
                    'month': 0,
                },
                'photoFile': '',
                'office': '',
                'regOffice': '',
                'homePage': '',
                'email': '',
                'phone': '',
                'membship': [],
            }
            # 'Narozena?' also matches the feminine form 'Narozena'.
            birthPat = re.compile(r'Narozena?:[^0-9]*(?P<day>\d+)\.[^0-9]*(?P<month>\d+)\.[^0-9]*(?P<year>\d+).*', re.U)
            match = None
            retVal['photoFile'] = fetchMPPhoto(mpId, period)
            for line in info.get_content().splitlines():
                match = birthPat.match(line)
                if match != None:
                    for key in retVal['birthDate'].keys():
                        retVal['birthDate'][key] = int(match.group(key))
                    break

            # Re-request the detail page with the t= expanders so the
            # full membership listing is present.
            tParm = getTParm(info)
            newLocalPath = '/sqw/detail.sqw?id=%s&t=%s&o=%s' % (mpId, tParm, period)
            retVal['membship'] = fetchMembership(newLocalPath)
            try:
                li = info.xpathEval('//tr/td/ul/b')
                retVal['office'] = fixStr(li[0].get_content())
                retVal['phone'] = li[1].get_content()
                retVal['regOffice'] = fixStr(li[2].get_content())
            except:  # NOTE(review): bare except; missing <b> cells are skipped silently
                pass
            aLinks = info.xpathEval('//a')
            for i in aLinks:
                if i.hasProp('href') and i.hasProp('href').getContent().lower().startswith('http://'):
                    retVal['homePage'] = i.hasProp('href').getContent()
                    break
                if i.hasProp('href') and i.hasProp('href').getContent().lower().startswith('mailto:'):
                    retVal['email'] = i.hasProp('href').getContent()[len('mailto:'):]

        doc.freeDoc()
    return retVal
def getOrgName(orgId, termId):
    """Fetch the display name of an organisation (committee, club, ...).

    orgId  -- organisation id (snem.sqw id= parameter)
    termId -- electoral term id (o= parameter)

    Returns the fixed-up heading text, or '' when the request fails or
    the page has no heading.
    """
    local_path = '/sqw/snem.sqw?id=%s&o=%s' % (orgId, termId)
    conn = httplib.HTTPConnection(PSP_URL)
    conn.request('GET', local_path)
    resp = conn.getresponse()
    retVal = ''
    if resp.status == 200:
        htmlPage = resp.read().decode('cp1250', 'ignore')
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage.encode('iso8859-2', 'ignore'), '', None, opts)
        try:
            retVal = fixStr(doc.xpathEval("//div[@id='main-zahlavi']/h2")[0].get_content())
        except IndexError:
            # BUG FIX: was a bare `except:` -- only the missing-heading
            # case (empty xpath result) should be tolerated.
            pass
        doc.freeDoc()
    return retVal
def fetchMemberByID(id):
    """Fetch just the name, titles and birth date of one MP.

    It fetches just name, birthdate and mpId; this creates and updates
    the initial list of members in the DB.

    id -- MP id (detail.sqw id= parameter); NOTE(review): the parameter
          shadows the builtin `id`.

    Returns None when the page lacks the name heading, otherwise a dict
    with titlePre, name, surname, titlePost and birthDate
    ({year, month, day}, defaulting to 1/1/1).
    """
    url = '/sqw/detail.sqw?id=%s' % id
    conn = httplib.HTTPConnection(PSP_URL)
    conn.request('GET', url)
    resp = conn.getresponse()
    retVal = None
    if resp.status == 200:
        htmlPage = u''
        htmlPage += resp.read().decode('cp1250', 'ignore')
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage.encode('iso8859-2', 'ignore'), '', None, opts)

        name = doc.xpathEval("//div[@id='main-zahlavi']/h2")
        # 'Narozena?' also matches the feminine form 'Narozena'.
        birthPat = re.compile(r'Narozena?:[^0-9]*(?P<day>\d+)\.[^0-9]*(?P<month>\d+)\.[^0-9]*(?P<year>\d+).*', re.U)
        if len(name) == 1:
            retVal = {
                'titlePre': '',
                'name': '',
                'surname': '',
                'titlePost': '',
                'birthDate': {
                    'year': 1,
                    'month': 1,
                    'day': 1,
                },
            }
            # Heading format: "<titles.> Name Surname[, titles.]".
            namePattern = re.compile(u'(?P<titlePre>.*\.|.*,)*(?P<name>\s*[^,. ]+)\s+(?P<surname>[^,.]+)(?P<titlePost>,?(.*\.|.*,)+)*', re.U)
            match = namePattern.match( fixStr( name[0].get_content() ) )
            if match != None:
                for i in ['titlePre', 'name', 'surname', 'titlePost']:
                    if match.group(i) != None:
                        retVal[i] = match.group(i).strip()
            birthAndDiv = doc.xpathEval("//div[@class='text no-secmenu']/ul")

            if len(birthAndDiv) == 0:
                # Alternate layout nests the list inside a table.
                birthAndDiv = doc.xpathEval("//div[@class='text no-secmenu']/table/tr/td/ul")
            if len(birthAndDiv) > 0:
                birthDateStr = birthAndDiv[0].get_content().splitlines()[0]
                match = birthPat.match(fixStr(birthDateStr))
                if match != None:
                    retVal['birthDate']['year'] = int(match.group('year'))
                    retVal['birthDate']['month'] = int(match.group('month'))
                    retVal['birthDate']['day'] = int(match.group('day'))
        doc.freeDoc()
    return retVal
def fetchStenoProto(url):
    """Fetch the paragraph nodes of a stenographic protocol page.

    url -- absolute 'http://...' URL of the protocol page

    Returns a (paragraphNodes, doc) pair on success; the caller owns
    `doc`, must keep it alive while using the nodes, and must call
    doc.freeDoc() afterwards.  On a non-200 response the pair is
    (None, None).

    NOTE(review): when `url` does not match the http:// pattern a bare
    None is returned instead of a 2-tuple -- callers that unpack the
    result will raise TypeError in that case.
    """
    pattern = re.compile(r'http://(?P<server>[^/]+)(?P<localPath>.+)?')
    match = pattern.match(url)
    if match == None:
        return None

    server = match.group('server')
    localPath = match.group('localPath')
    if localPath == None:
        localPath = ''
    conn = httplib.HTTPConnection(server)
    conn.request('GET', localPath)
    resp = conn.getresponse()
    retVal = None
    doc = None
    if resp.status == 200:
        htmlPage = u''
        htmlPage += resp.read().decode('cp1250', 'ignore')
        opts = libxml2.HTML_PARSE_RECOVER
        opts += libxml2.HTML_PARSE_NOERROR
        opts += libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(htmlPage.encode('iso8859-2', 'ignore'), '', None, opts)
        retVal = doc.xpathEval("//div[@class='text no-secmenu']/p")
    return retVal, doc
-
def main():
    """Ad-hoc manual test driver.

    Most experiments are kept commented out; the current body repeatedly
    fetches one poll result (debug/stress run against the live server).
    """
    #m = fetchMembership('/sqw/detail.sqw?id=5270&t=11,3,4,2,1,13,41&o=5')
    #m = fetchMembership('/sqw/detail.sqw?id=295&t=11,3,4,1,83,5&o=5')
    #for i in m:
    #    print i['orgName']
    #meetings = fetchPSPMeetings('1')
    #for m in meetings:
    #    for k in m.keys():
    #        print '%s:\t%s' % (k, m[k])
    #    print '-----------------------------------------------------------------------------'
    #print '============================================================================='
    #print fetchMemberByID('23')
    #results = fetchResultsOverview('1', '15')
    #for r in results:
    #    for k in r.keys():
    #        print '%s:\t%s' % (k, r[k])
    #    print '-----------------------------------------------------------------------------'
    #print '============================================================================='
    # NOTE(review): hits the live server 100 times in a row -- debug only.
    for i in range(100):
        print fetchPollResult(5, 43836)
    #for p in polls:
    #    for k in p.keys():
    #        print '%s:\t%s' % (k, p[k])
    #    print '-----------------------------------------------------------------------------'

    #print fetchMP(285, 3)
    #print fetchRegisterByName('Libor', 'Ambrozek', None)
if __name__ == '__main__':
    main()
    sys.exit(0)