/modules/internets/api/feed.py
Python | 194 lines | 187 code | 7 blank | 0 comment | 13 complexity | eb8aa6c3ea8679923e691d600b7fbda2 MD5 | raw file
- import httplib
- import json
- import socket
- import urllib2
- import xpath
- from BaseHTTPServer import BaseHTTPRequestHandler
- from BeautifulSoup import BeautifulSoup
- from decimal import Decimal
- from StringIO import StringIO
- from urlparse import urlparse
- from xml.dom.minidom import Element, Document
- from xml.dom.minidom import parse
class InputError(Exception):
    """Raised when a feed is handed a missing or wrongly-typed input value."""

    def __init__(self, msg):
        # Kept on the instance so __str__ can render it on demand.
        self.msg = msg

    def __str__(self):
        """Return the stored message converted with ``str()``."""
        rendered = str(self.msg)
        return rendered
class FeedError(Exception):
    """Error raised when a feed cannot be fetched or reports a failure.

    Wraps either an exception-like object or a plain message and exposes:

    * ``msg``  -- human-readable description (what ``str()`` returns),
    * ``code`` -- the wrapped object's ``code`` attribute, else ``None``,
    * ``url``  -- the wrapped object's ``url`` attribute, else ``None``.

    ``e`` is inspected in this order: an object with a ``code`` attribute,
    an object with a ``reason`` attribute, an object whose ``message`` is
    the empty string, and finally anything else (rendered as text).
    """

    # Messages for the status codes that get a dedicated wording.
    _STATUS_MESSAGES = {
        404: 'not found.',
        406: 'this resource is unavailable.',
        500: 'the server has encountered an unexpected error.',
        502: 'invalid response from the server. Try again later.',
        503: 'this resource is temporarily unavailable. Try again later.',
        512: 'this resource is not supported.',
    }

    def __init__(self, e):
        # Always initialise every attribute so __str__ and callers reading
        # .code/.url never hit AttributeError, whatever `e` turns out to be.
        self.code = None
        self.url = None

        if hasattr(e, 'code'):
            code = e.code
            if code in self._STATUS_MESSAGES:
                self.msg = self._STATUS_MESSAGES[code]
            else:
                # Non-standard status codes are absent from the table;
                # fall back to the bare number instead of raising KeyError.
                known = BaseHTTPRequestHandler.responses.get(code)
                phrase = known[0] if known else 'HTTP %s' % (code,)
                self.msg = 'something went wrong while connecting (%s)' % phrase
            self.code = code
            self.url = getattr(e, 'url', None)
        elif hasattr(e, 'reason'):
            reason = str(e.reason)
            if reason == 'timed out':
                self.msg = 'connection timed out. Try again later.'
            else:
                self.msg = reason
        elif getattr(e, 'message', None) == '':
            self.msg = 'invalid response from the server. Try again later.'
        else:
            # Plain strings (e.g. an error message extracted from a feed)
            # and any other exception: use their text form. %-formatting
            # is used rather than str() so a unicode message does not
            # raise while the error itself is being constructed.
            self.msg = '%s' % (e,)

    def __str__(self):
        return self.msg
class HtmlFeed(object):
    """Fetch a URL and keep the raw response body.

    ``value`` is the URL to open (spaces are replaced with ``%20``).
    When ``fake_ua`` is true a browser User-Agent is sent instead of the
    bot's own one.

    Raises ``InputError`` if ``value`` is ``None`` or not a string, and
    ``FeedError`` if opening or reading the URL fails with
    ``urllib2.URLError`` or ``httplib.BadStatusLine``.
    """

    _BROWSER_UA = ('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) '
                   'Gecko/20100101 Firefox/7.0.1')
    _BOT_UA = 'Rizon Internets bot - www.rizon.net'

    def __init__(self, value, fake_ua=False):
        if value is None:
            raise InputError('Invalid feed input.')
        if not isinstance(value, basestring):
            raise InputError('Invalid feed input type.')

        opener = urllib2.build_opener()
        # The header value must not repeat the "User-Agent: " field name;
        # doing so sends a malformed "User-Agent: User-Agent: ..." header.
        agent = self._BROWSER_UA if fake_ua else self._BOT_UA
        opener.addheaders = [('User-Agent', agent)]

        try:
            feed = opener.open(value.replace(' ', '%20'), timeout=20)
            try:
                self._html = feed.read()
            finally:
                # Release the connection even when read() fails.
                feed.close()
        except (urllib2.URLError, httplib.BadStatusLine) as e:
            raise FeedError(e)

    def html(self):
        """Return the response body exactly as it was read."""
        return self._html

    def get_soup(self):
        """Return the body parsed by BeautifulSoup with HTML entities converted."""
        return BeautifulSoup(self._html, convertEntities=BeautifulSoup.HTML_ENTITIES)
def get_json(value):
    """Fetch ``value`` as a URL and return its body decoded from JSON.

    Raises ``InputError`` if ``value`` is ``None`` or not a string. Errors
    raised by ``HtmlFeed`` while fetching, and by the JSON decoder while
    parsing, propagate to the caller unchanged.
    """
    if value is None:
        raise InputError('Invalid feed input.')
    if not isinstance(value, basestring):
        raise InputError('Invalid feed input type.')

    # The body is already an in-memory string, so decode it directly
    # instead of wrapping it in a file-like object first.
    return json.loads(HtmlFeed(value).html())
class XmlFeed(object):
    """XPath-based reader over an XML document or element.

    ``value`` is either a URL string (fetched through ``HtmlFeed`` and
    parsed with ``xml.dom.minidom``) or an existing minidom ``Element`` /
    ``Document``. ``namespaces`` is handed to the xpath library on every
    query.

    Raises ``InputError`` if ``value`` is ``None`` or of another type, and
    ``FeedError`` if the document has a value at ``/error/message``.
    """

    def __init__(self, value, namespaces=None):
        if value is None:
            raise InputError('Invalid feed input.')
        self.namespaces = namespaces

        if isinstance(value, basestring):
            feed = HtmlFeed(value)
            self._element = parse(StringIO(feed.html()))
        elif isinstance(value, (Element, Document)):
            self._element = value
        else:
            raise InputError('Invalid feed input type.')

        # A value at /error/message is treated as the feed reporting a failure.
        error = xpath.findvalue('/error/message', self._element)
        if error is not None:
            raise FeedError(error)

    def elements(self, query):
        """Return an ``XmlFeed`` for every node matched by ``query``."""
        matches = xpath.find(query, self._element, namespaces=self.namespaces)
        return [XmlFeed(node, self.namespaces) for node in matches]

    def text(self, query, default=None):
        """Return the stripped value found by ``query``, or ``default``.

        ``default`` is used when the query yields nothing or an empty
        string. A unicode result is additionally round-tripped through
        latin-1 -> utf-8; when that round trip fails the value is kept
        as it is.
        """
        result = xpath.findvalue(query, self._element, namespaces=self.namespaces)
        value = result.strip() if result else default

        if isinstance(value, unicode):
            try:
                value = value.encode('latin-1').decode('utf-8')
            except UnicodeError:
                # Not representable in latin-1 or not valid utf-8:
                # best effort only, keep the text untouched.
                pass

        return value

    def int(self, query, default=None):
        """Return the text at ``query`` as an integer, or ``default``."""
        result = self.text(query, None)
        if result is None:
            return default
        try:
            return int(result)
        except (TypeError, ValueError):
            return default

    def decimal(self, query, default=None):
        """Return the text at ``query`` as a ``Decimal``, or ``default``."""
        result = self.text(query, None)
        if result is None:
            return default
        try:
            return Decimal(result)
        except (ArithmeticError, TypeError, ValueError):
            # Decimal signals unparsable text with InvalidOperation,
            # which is an ArithmeticError.
            return default

    def bool(self, query, default=None):
        """Return the text at ``query`` as a boolean, or ``default``.

        Text containing "true" (any case) or equal to "1" is True; text
        containing "false" or equal to "0" is False; any other text is
        True when it parses as an integer greater than zero. ``default``
        is returned when there is no text or it cannot be interpreted.
        """
        result = self.text(query, None)
        if result is None:
            return default

        lowered = result.lower()
        if 'true' in lowered or result == '1':
            return True
        if 'false' in lowered or result == '0':
            return False
        try:
            return int(result) > 0
        except (TypeError, ValueError):
            return default

    def attribute(self, query, attr, default=None, checkEveryOccurrence=False):
        """Return attribute ``attr`` of an element matched by ``query``.

        Only the first match is examined unless ``checkEveryOccurrence``
        is true, in which case the first match that has the attribute
        wins. ``default`` is returned when no examined element has it.
        """
        elements = xpath.find(query, self._element, namespaces=self.namespaces)
        if not checkEveryOccurrence:
            elements = elements[:1]

        for element in elements:
            if element.hasAttribute(attr):
                return element.getAttribute(attr)
        return default