scrape.py | searchcode

/Sources/wwwclient/scrape.py

https://github.com/netconstructor/wwwclient
Python | 921 lines | 872 code | 17 blank | 32 comment | 10 complexity | 86efd62e6db9622660a238bce6efe965 MD5 | raw file
Possible License(s): LGPL-3.0

#!/usr/bin/env python
# -----------------------------------------------------------------------------
# Project   : WWWClient
# -----------------------------------------------------------------------------
# Author    : Sebastien Pierre <sebastien@xprima.com>
# -----------------------------------------------------------------------------
# License   : GNU Lesser General Public License
# Credits   : Xprima.com
# -----------------------------------------------------------------------------
# Creation  : 19-Jun-2006
# Last mod  : 19-Mar-2012
# -----------------------------------------------------------------------------

# TODO: The tree could be created by the iterate function, by directly linking
# nodes. So the tree could be unfolded as a list, or kept folded as a tree. This
# would allow to have still one structure. Ideally, the original HTML could be
# kept to allow easy subset extraction (currently, the data is recreated)

import re, string, htmlentitydefs
import form

__doc__ = """\
The scraping module gives a set of functionalities to manipulate HTML data. All
functions are text oriented, so that they work with any subset of an HTML
document. This is very useful, as it does not require the HTML to be
well-formed, and allows easy selection of HTML fragments."""

DEFAULT_ENCODING = "utf-8"
RE_SPACES    = re.compile("\s+")
RE_HTMLSTART = re.compile("</?(\w+)",      re.I)
RE_HTMLEND   = re.compile("/?>")
RE_HTMLLINK  = re.compile("<[^<]+(href|src|url)\s*=\s*('[^']*'|\"[^\"]*\"|[^ >]*)", re.I)

RE_HTMLCLASS = re.compile("class\s*=\s*['\"]?([\w\-_\d]+)", re.I)
RE_HTMLID    = re.compile("id\s*=\s*['\"]?([\w\-_\d]+)", re.I)
RE_HTMLHREF  = re.compile("href\s*=\s*('[^']*'|\"[^\"]*\"|[^ ]*)", re.I)

RE_SPACES    = re.compile("\s+", re.MULTILINE)
RE_QUERY     = re.compile("^(?P<name>[\w\d_\-]+)?(?P<id>#[\w\d_\-]+)?(?P<class>\.[\w\d_\-]+)?(?P<property>\:[\w\d\-]+)?(?P<count>\[\-?\d+\])?$")

KEEP_ABOVE    = "+"
KEEP_SAME     = "="
KEEP_BELOW    = "-"

# -----------------------------------------------------------------------------
#
# URL
#
# -----------------------------------------------------------------------------

class URL:

	@classmethod
	def Base( self, url ):
		base_url = url
		if not url.endswith("/"):
			i = base_url.rfind("/")
			if i > 0:
				base_url = base_url[:i + 1]
		return base_url

	@classmethod
	def Absolute( self, url, siteURL ):
		base_url = self.Base(siteURL)
		if url.find("http") != 0:
			url = base_url + url
		return url

# -----------------------------------------------------------------------------
#
# HTML TAG INTERFACE
#
# -----------------------------------------------------------------------------

class Tag:
	"""A Tag is an abstract decorator for a portion within a string. Tags are
	used in this module to identify HTML/XML data within strings."""
	OPEN  = "open"
	CLOSE = "close"
	EMPTY = "empty"

	def __init__( self, html, start, end ):
		"""Creates a new new tag."""
		self._html  = html
		self.start  = start
		self.end    = end
	
	def isElement( self ):
		return isinstance(self, ElementTag)

	def isText( self ):
		return isinstance(self, TextTag)

	def html( self ):
		"""Returns the HTML representation of this tag."""
		return self._html[self.start:self.end]

	def __repr__( self ):
		return repr(self._html[self.start:self.end])

class ElementTag(Tag):
	"""Represents a single element tag (open or close) identified within
	a string."""

	def __init__( self, html, start, end, astart=None, aend=None, attributes=None,
	level=None, type=None ):
		"""Creates a new tag element extracted from the given 'html' string."""
		Tag.__init__(self, html, start, end)
		if type == None: type = Tag.OPEN
		self._attributes = attributes
		self.astart      = astart
		self.aend        = aend
		self.level       = level
		self.type        = type

	def attributes( self ):
		if self._attributes == None:
			self._attributes = HTML.parseAttributes(self._html[self.astart:self.aend].strip())
		return self._attributes

	def has( self, name, value=None ):
		"""Tells if this tag has the given attribute"""
		if value is None:
			return self.attributes().has_key(name)
		else:
			return self.attributes().get(name) == value

	def get( self, name ):
		"""Returns the given attribute set for this tag."""
		return self.attributes().get(name)

	def name( self ):
		"""Returns this tag name"""
		if self.type == Tag.OPEN or self.type == Tag.EMPTY:
			return self._html[self.start+1:self.astart].strip()
		else:
			return self._html[self.start+2:self.astart].strip()

	def nameLike( self, what ):
		"""Tells if the name is like the given string/list of string/regexp/list
		of regexpes."""
		if type(what) in (str, unicode):
			# TODO: Handle unicode
			return re.match(what, self.name(), re.I)
		if type(what) in (tuple, list):
			for w in what:
				if self.nameLike(w): return True
		else:
			return what.match(self.name(), re.I)

	def hasName( self, name ):
		return self.name().lower() == name.lower()

	def hasClass( self, name ):
		"""Tells if the element has the given class (case sensitive)"""
		element_class = self.attributes().get("class")
		if element_class and name in element_class.split() != -1:
			return True
		else:
			return False

	def hasId( self, name ):
		"""Tells if the element has the given id (case sensitive)"""
		return self.attributes().get("id") == name

	def text(self, encoding=DEFAULT_ENCODING):
		return u''

class TextTag(Tag):
	"""Represents raw text, not an element."""

	def __init__( self, html, start, end):
		Tag.__init__(self, html, start, end)

	def hasName( self, name ):
		return False

	def hasClass( self, name ):
		"""Tells if the element has the given class (case sensitive)"""
		return False

	def hasId( self, name ):
		"""Tells if the element has the given id (case sensitive)"""
		return False

	def text(self, encoding=DEFAULT_ENCODING):
		return self._html[self.start:self.end].decode(encoding)

	def name(self):
		return "#text"

class TagList:
	"""Represents a list of ElementTag and TextTag, which basically corresponds
	to the tokenization of an HTML string. The list can be folded as a tree
	if necessary."""

	def __init__( self, content=None ):
		"""Creates a blank TagTree. It should be populated with data using the
		`fromHTML` method."""
		if content == None: content = []
		self.content = content

	def append( self, content ):
		assert isinstance(content, Tag) 
		content.context = self
		self.content.append(content)
		return content

	def fromHTML( self, html, scraper=None ):
		"""Creates the tag list content from the given HTML data. This will
		erase the content of this tag list, replacing it by this one."""
		self.content = []
		offset    = 0
		level     = 0 
		end       = False
		if scraper == None: scraper = HTML
		while not end:
			tag = scraper.findNextTag(html, offset)
			# If there is no tag, it is the document
			if tag == None:
				self.append(TextTag( html, start=offset, end=len(html)))
				end = True
			else:
				tag, tag_end_offset = tag
				tag_type, tag_name, tag_start, attr_start, attr_end = tag
				# There may be text inbetween
				if tag_start > offset:
					self.append(TextTag(html, start=offset,end=tag_start))
				# We process the encountered tag
				#new  = level, tag_type, tag_name, tag_start, attr_end + 1, attr_start, attr_end
				new = ElementTag( html, tag_start, attr_end + 1, attr_start, attr_end, type=tag_type, level=level)
				self.append(new)
				last = new
				offset = tag_end_offset
		return self.content

	def tagtree( self, asXML=False ):
		"""Folds this list into a tree, which is returned as result."""
		root       = TagTree(id=-1)
		parents    = [root]
		counter    = 0
		tags_stack = []
		def find_opening_tag(tag, stack):
			for i in range(len(stack)-1,-1,-1):
				this_tag = stack[i]
				if this_tag.name() == tag.name() and this_tag.type == Tag.OPEN:
					return this_tag, i
			return None, -1
		# We iterate on the tags of this taglist
		for tag in self.content:
			#  We create the node
			if isinstance(tag, TextTag):
				parents[-1].append(TagTree(tag))
			else:
				if tag.type in (Tag.OPEN, Tag.EMPTY):
					if tags_stack and HTML_closeWhen( tag, tags_stack[-1] ) and not asXML:
						# This is the special treatment when we have to close
						# tags in HTML
						# FIXME: The two variables are not used
						parent_tag = tags_stack.pop()
						closing_tag = ElementTag(tag._html, tag.start, tag.start, type=Tag.CLOSE)
						parents.pop().close(tag)
					if tag.type == Tag.EMPTY or (not asXML and HTML_isEmpty(tag)):
						node = TagTree(tag, id=counter)
						parents[-1].append(node)
						counter   += 1
					else:
						node = TagTree(tag, id=counter)
						parents[-1].append(node)
						parents.append(node)
						tags_stack.append(tag)
						counter   += 1
				elif tag.type == Tag.CLOSE:
					opening_tag, level = find_opening_tag(tag, tags_stack)
					if not opening_tag:
						#print "WARNING: no opening tag for ", tag
						continue
					else:
						while len(tags_stack) > level:
							stack_tag = tags_stack.pop()
							node      = parents.pop()
						assert stack_tag == opening_tag
						node.close(tag)
				else:
					raise Exception("Unknow Tag.type: %s" % (tag.type))
		root._taglist = self
		return root

	def html( self ):
		"""Converts this tags list to HTML"""
		res = []
		for tag in self.content:
			assert isinstance(tag, Tag) or isinstance(tag, TextTag)
			res.append(tag.html())
		return "".join(res)
	
	def innerhtml(self):
		res = []
		for tag in self.content[1:-1]:
			assert isinstance(tag, Tag) or isinstance(tag, TextTag)
			res.append(tag.html())
		return "".join(res)

	def text(self, encoding=DEFAULT_ENCODING):
		res = []
		for tag in self.content:
			res.append(tag.text(encoding))
		# FIXME: Unicode
		return u"".join(res)

	def __iter__( self ):
		for tag in self.content:
			yield tag
	
	def __str__( self ):
		return str(self.content)

# FIXME: Should inherit from TagNode
class TagTree:
	"""A tag tree wraps one or two tags and allows to structure tags as a tree.
	The tree node instance offers a nice interface to manipulate the HTML
	document as a tree."""

	TEXT  = "#text"

	def __init__( self, startTag=None, endTag=None, id=None ):
		"""TagTrees should be created by an HTMLTools, and not really directly.
		However, if you really want to create a tree yourself, use the
		'startTag' and 'endTags' to specify start and end tags."""
		self._parent   = None
		self._depth    = 0
		self._taglist  = None
		self.startTag  = None
		self.endTag    = None
		self.id        = id
		self.children  = []
		self.name      = None
		self.open(startTag)
		self.close(endTag)
	
	def clone( self, children=None ):
		"""Clones this tree. If the 'children' attribute is 'True', then the
		children will be cloned as well (deep clone)."""
		clone           = TagTree()
		clone._parent   = self._parent
		clone._depth    = self._depth
		clone._taglist  = self._taglist 
		clone.id        = self.id
		clone.name      = self.name
		if children is None:
			clone.children  = []
			for child in self.children:
				clone.children.append(child.clone())
		else:
			clone.children = children
		clone.open(self.startTag)
		clone.close(self.endTag)
		return clone

	def has( self, name, value=None):
		"""Tells if the start tag of this tag tree has an attribute of the given
		name."""
		if self.startTag == None: return None
		return self.startTag.has(name, value)

	def get( self, name):
		"""Gets the start tag of this tag tree attribute with the given
		'name'"""
		if self.startTag == None: return None
		return self.startTag.get(name)

	def attribute(self, name):
		"""Alias for 'get(name)"""
		return self.attributes().get(name)

	def attributes( self ):
		"""Returns the attributes of this tag tree start tag"""
		if self.startTag == None: return {}
		return self.startTag.attributes()

	def setParent( self, parent ):
		"""Sets the parent TagTree for this tag tree."""
		self._parent = parent
		self._depth  = self.parent().depth() + 1

	def parent( self ):
		"""Returns the parnet tag tree (if any)."""
		return self._parent

	def depth( self ):
		"""Returns the depth of this tag tree."""
		return self._depth

	def isRoot( self ):
		"""Tells if this tag tree is a root (has no parent) or not."""
		return self._parent == None

	def _cutBelow( self, data, value ):
		"""Helper function for the `cut()` method."""
		depth = self.depth()
		if depth > value:
			data.append(self)
		else:
			for child in self.children:
				child._cutBelow(data, value) 
		return data

	def cut( self, above=None, below=None, at=None):
		res = []
		assert not above and not at, "Not implemented"
		if not below is None:
			root = TagTree()
			for child in self._cutBelow(res, below):
				root.append(child)
			return root

	def filter( self, reject=None, accept=None, recursive=False ):
		"""Returns a clone of this tree where each child node is filtered
		through the given 'accept' or 'reject' predicate."""
		res  = []
		root = self.clone(children=res)
		for child in self.children:
			if not reject is None:
				if reject(child): continue
			if not accept is None:
				if accept(child):
					if recursive:
						root.append(child.filter(reject=reject,accept=accept,recursive=recursive))
					else:
						root.append(child.clone())
			else:
				if recursive:
					root.append(child.filter(reject=reject,accept=accept,recursive=recursive))
				else:
					root.append(child.clone())
		return root

	def match( self, predicate ):
		"""Tells if the current TagTree matches the given predicate"""
		if self.startTag and predicate(self.startTag):
			return True
		else:
			return False

	def find( self, predicate, recursive=True ):
		"""Returns a list of child nodes (TagTree objects) that match the given predicate. This
		operation is recursive by default."""
		# NOTE: This has been removed, as find means "find inside"
		# if self.startTag and predicate(self.startTag):
		# 	return [self]
		res  = []
		for c in self.children:
			assert isinstance(c, TagTree)
			if predicate(c):
				res.append(c)
			if recursive:
				res = res + c.find(predicate)
		return res

	def open( self, startTag):
		if startTag==None: return
		assert self.startTag == None
		assert self.endTag == None
		assert isinstance(startTag, Tag)
		self.startTag = startTag
		if isinstance(startTag, TextTag):
			self.name     = TagTree.TEXT
		else:
			self.name     = startTag.name()
		assert self.name, repr(startTag.html()) + ":" + startTag.name()
		return self

	def close( self, endTag ):
		if endTag==None: return
		assert self.endTag == None
		assert isinstance(endTag, ElementTag)
		self.endTag = endTag
		return self

	def append( self, node ):
		assert isinstance(node, TagTree)
		node.setParent(self)
		assert node != self
		self.children.append(node)
		self._taglist = None
		return self

	def merge( self, node ):
		assert isinstance(node, TagTree)
		for child in node.children:
			self.append(child)
		return self

	def list( self, contentOnly=False ):
		"""Returns a tag list from this Tree Node."""
		if self._taglist == None:
			content = []
			if self.startTag: content.append(self.startTag)
			for c in self.children: content.extend(c.list(contentOnly=True))
			if self.endTag: content.append(self.endTag)
			self._taglist = TagList(content=content)
		if contentOnly:
			return self._taglist.content
		else:
			return self._taglist

	def hasName( self, name ):
		"""Tells if the element has the given class (case sensitive)"""
		if self.startTag: return self.startTag.hasName(name)
		else: return None

	def hasClass( self, name ):
		"""Tells if the element has the given class (case sensitive)"""
		if self.startTag: return self.startTag.hasClass(name)
		else: return None

	def hasId( self, name ):
		"""Tells if the element has the given id (case sensitive)"""
		if self.startTag: return self.startTag.hasId(name)
		else: return None

	def prettyString( self, ):
		if self.name == self.TEXT:
			return "#text:" + repr(self.startTag.html())
		else:
			if self._parent == None:
				res =  "#root\n"
			else:
				res =  self.startTag.name() 
				res += "["
				if self.id != None: res += "#%d" % (self.id) 
				attr  = []
				for k,v in self.attributes().items():attr.append("%s=%s" % (k,v))
				attr = ",".join(attr)
				if attr: attr = "(%s)" % (attr)
				res += "@%d]%s\n" % (self.depth(), attr)
			for c in self.children:
				ctext = ""
				for line in c.prettyString().split("\n"):
					if not line: continue
					if not ctext:
						ctext  = "   <" + line + "\n"
					else:
						ctext += "    " + line + "\n"
				res += ctext
			return res

	def query( self, query ):
		"""Does a basic CSS-like query on the TagTree. Returns a TagTree"""
		if type(query) not in (tuple, list):
			selectors = filter(lambda _:_.strip(), query.split(" "))
		else:
			selectors = filter(lambda _:_.strip(), query)
		if selectors:
			head      = selectors[0]
			tail      = []
			if len(selectors) >= 1: tail = selectors[1:]
			predicate = lambda: True
			match     = RE_QUERY.match(head)
			assert match, "Invalid selector expression: " + repr(head)
			p_name, p_id, p_class, p_property, p_count = match.group("name"), match.group("id"), match.group("class"), match.group("property"), match.group("count")
			if p_name:
				predicate = lambda _:predicate and _.hasName(p_name)
			if p_id:
				predicate = lambda _:predicate and _.hasId(p_id[1:])
			if p_class:
				predicate = lambda _:predicate and _.hasClass(p_class[1:])
			res = []
			for sub_tree in self.find(predicate):
				res = res + sub_tree.query(tail)
			if p_property:
				if p_property == ":text":
					res = map(lambda _:_.text(), res)
				else:
					raise Exception("Property selector not supproted yet: " + p_property)
			if p_count:
				count = int(p_count[1:-1])
				if count < 0:
					count = len(res) + count
				if count < len(res):
					return [res[count]]
				else:
					return [None]
			else:
				return res
		else:
			return [self]

	def __str__( self ):
		return self.prettyString()
	
	def __repr__( self ):
		return str(self.list())

	def html( self ):
		"""Converts this tags tree to HTML"""
		return self.list().html()

	def text( self ):
		"""Returns only the text tags in this HTML tree"""
		return self.list().text()

	def innerhtml( self ):
		return self.list().innerhtml()

	def __iter__( self ):
		for tag in self.list():
			yield tag

# -----------------------------------------------------------------------------
#
# HTML PRESETS
#
# -----------------------------------------------------------------------------

HTML_EMPTY = """\
AREA BASE BASEFONT BR COL FRAME HR IMG INPUT ISINDEX LINK META PARAM
"""[:-1].split()

HTML_MAYBE_EMPTY = """\
A P
"""[:-1].split()

def HTML_isEmpty( tag ):
	tag_name = tag.name().upper()
	if tag_name in HTML_EMPTY: return True
	if tag_name == "A" and not tag.has("href"): return True
	return False

def HTML_mayBeEmpty( tag ):
	tag_name = tag.name().upper()
	if tag_name in HTML_MAYBE_EMPTY: return True
	return False

def HTML_closeWhen( current, parent ):
	cur_name = (current.name() or "").upper()
	par_name = (parent.name() or "").upper()
	if cur_name == par_name == "TD": return True
	if cur_name == par_name == "TR": return True
	if cur_name == par_name == "P": return True
	if par_name == "P" and cur_name in ("DIV", "TABLE", "UL", "BLOCKQUOTE",
	"FORM"): return True
	return False

# -----------------------------------------------------------------------------
#
# HTML PARSING FUNCTIONS
#
# -----------------------------------------------------------------------------

class HTMLTools:
	"""This class contains a set of tools to process HTML text data easily. This
	class can operate on a full HTML document, or on any subset of the
	document."""

	LEVEL_ACCOUNT = [ "html", "head", "body", "div", "table", "tr", "td" ]

	def __init__( self ):
		pass
	
	# PREDICATES
	# ========================================================================

	def withClass( self, name ):
		"""Predicate that filters node by class"""
		return lambda n:n.hasClass(name)

	def withName( self, name ):
		"""Predicate that filters node by class"""
		return lambda n:n.hasName(name)

	# BASIC PARSING OPERATIONS
	# ========================================================================

	def parse( self, html ):
		"""Returns a tagtree from the given HTML string, tag list or tree
		node."""
		return self.tree(html)
	
	def tree( self, html, asXML=False ):
		tag_list = TagList()
		tag_list.fromHTML(html, scraper=self)
		return tag_list.tagtree(asXML)

	def list( self, data ):
		"""Converts the given text or tagtree into a taglist."""
		if type(data) in (str, unicode):
			tag_list = TagList()
			tag_list.fromHTML(data, scraper=self)
			return tag_list
		elif isinstance(data, TagList):
			return data
		elif isinstance(data, TagTree):
			return data.list()
		else:
			raise Exception("Unsupported data:" + data)

	def html( self, data ):
		"""Converts the given taglist or tagtree into HTML, and returns
		a string or unicode.""" 
		if type(data) == str:
			return data
		elif type(data) == unicode:
			return data
		elif isinstance(data, TagList):
			return data.html()
		elif isinstance(data, TagTree):
			return data.html()
		else:
			raise Exception("Unsupported data:" + repr(data))

	# TEXT OPERATIONS
	# ========================================================================

	def textcut( self, text, cutfrom=None, cutto=None ):
		"""Cuts the text from the given marker, to the given marker."""
		text = self.html(text)
		if cutfrom: start = text.find(cutfrom)
		else: start = 0
		if cutto: end = text.find(cutto)
		else: end = -1
		if start == -1: start = 0
		elif cutfrom: start += len(cutfrom)
		return text[start:end]

	def textlines( self, text, strip=True, empty=False ):
		"""Returns a list of lines for the given HTML text. Lines are stripped
		and empty lines are filtered out by default."""
		text = self.html(text)
		lines = text.split("\n")
		if strip: lines = map(string.strip, lines)
		if not empty: lines = filter(lambda x:x, lines)
		return lines

	def text( self, data, expand=False, norm=False ):
		"""Strips the given tags from HTML text"""
		res = None
		if type(data) in (str, unicode):
			res = data
		else:
			res = data.text()
		if expand: res = self.expand(res)
		if norm: res = self.norm(res)
		return res

	def expand( self, text, encoding=None ):
		"""Expands the entities found in the given text."""
		if not (type(text) in (str, unicode)):
			text = text.text()
		# NOTE: This is based on
		# <http://www.shearersoftware.com/software/developers/htmlfilter/>
		entityStart = text.find('&')
		if entityStart != -1:          # only run bulk of code if there are entities present
			preferUnicodeToISO8859 = True
			prevOffset = 0
			textParts = []
			while entityStart != -1:
				textParts.append(text[prevOffset:entityStart])
				entityEnd = text.find(';', entityStart+1)
				if entityEnd == -1:
					entityEnd = entityStart
					entity = '&'
				else:
					entity = text[entityStart:entityEnd+1]
					if len(entity) < 4 or entity[1] != '#':
						entity = htmlentitydefs.entitydefs.get(entity[1:-1],entity)
					if len(entity) == 1:
						if preferUnicodeToISO8859 and ord(entity) > 127 and hasattr(entity, 'decode'):
							entity = entity.decode('iso-8859-1')
							if type(text) != unicode and encoding:
								entity = entity.encode(encoding)
					else:
						if len(entity) >= 4 and entity[1] == '#':
							if entity[2] in ('X','x'):
								entityCode = int(entity[3:-1], 16)
							else:
								entityCode = int(entity[2:-1])
							if entityCode > 255:
								entity = unichr(entityCode)
							else:
								entity = chr(entityCode)
								if preferUnicodeToISO8859 and hasattr(entity, 'decode'):
									entity = entity.decode('iso-8859-1')
									if type(text) != unicode and encoding:
										entity = entity.encode(encoding)
					textParts.append(entity)
				prevOffset = entityEnd+1
				entityStart = text.find('&', prevOffset)
			textParts.append(text[prevOffset:])
			text = u''.join(textParts)
		return text


	# FORMS-RELATED OPERATIONS
	# ========================================================================

	def forms( self, html ):
		return form.parseForms(self, self.html(html))

	def images( self, html, like=None ):
		"""Iterates through the links found in this document. This yields the
		tag name and the href value."""
		for name, url in self.links(html, like):
			if name == "img":
				yield url

	def links( self, html, like=None ):
		"""Iterates through the links found in this document. This yields the
		tag name and the href value."""
		if html:
			html = self.html(html)
			if like != None:
				if type(like) in (str,unicode): like = re.compile(like)
			res = []
			for match in self.onRE(html, RE_HTMLLINK):
				tag  = match.group()
				tag  = tag.replace("\t"," ")
				tag  = tag.replace("\n"," ")
				tag  = tag[1:tag.find(" ")]
				href = match.group(2)
				if href[0] in ("'", '"'): href = href[1:-1]
				if not like or like.match(href):
					yield tag, href

	# UTILITIES
	# ========================================================================

	def findNextTag( self, html, offset=0 ):
		"""Finds the next tag in the given HTML text from the given offset. This
		returns (tag type, tag name, tag start, attributes start, attributes
		end) and tag end or None."""
		if offset >= len(html) - 1: return None
		m = RE_HTMLSTART.search(html, offset)
		if m == None:
			return None
		n = RE_HTMLEND.search(html, m.end())
		if n == None:
			return HTMLTools.findNextTag(self, html, m.end())
		if m.group()[1] == "/": tag_type = Tag.CLOSE
		elif n.group()[0] == "/": tag_type = Tag.EMPTY
		else: tag_type = Tag.OPEN
		return (tag_type, m.group(1), m.start(), m.end(), n.start()), n.end()

	@staticmethod
	def onRE( text, regexp, off=0 ):
		"""Itearates through the matches for the given regular expression."""
		res = True
		while res:
			res = regexp.search(text, off)
			if res:
				off = res.end()
				yield res

	@staticmethod
	def norm( text ):
		"""Normalizes the spaces (\t, \n, etc) so that everything gets converted
		to single space."""
		return RE_SPACES.sub(" ", text).strip()

	@staticmethod
	def parseTag( text ):
		"""Parses the HTML/XML tag in the given text, returning its name and
		attributes."""
		text  = text.strip()
		space = text.find(" ")
		if   text[0:2] == "</":  start = 2
		elif text[0]   == "<":   start = 1
		else:                    start = 0
		if   text[-2:0] == "/>": end    = -2
		elif text[-1]   == ">":  end   = -1
		else:                    end   = len(text)
		if space:
			name  = text[start:space]
			attr  = text[space:end].strip()
			return (name, HTML.parseAttributes(attr))
		else:
			return (text[start:end].strip(), {})

	@staticmethod
	def parseAttributes(text, attribs = None):
		"""Parses the HTML/XML attributes described in the given text."""
		if attribs == None: attribs = {}
		eq = text.find("=")
		# There may be attributes without a trailing =
		# Like  ''id=all type=radio name=meta value="" checked''
		if eq == -1:
			space = text.find(" ")
			if space == -1:
				name = text.strip()
				if name: attribs[name] = None
				return attribs
			else:
				name = text[:space].strip()
				if name: attribs[name] = None
				return HTML.parseAttributes(text[space+1:], attribs)
		else:
			if eq + 1 == len(text):
				return attribs
			sep = text[eq+1]
			if   sep == "'": end = text.find( "'", eq + 2 )
			elif sep == '"': end = text.find( '"', eq + 2 )
			else: end = text.find(" ", eq)
			# Did we reach the end ?
			name = text[:eq].strip()
			if end == -1:
				value = text[eq+1:]
				if value and value[0] in ("'", '"'): value = value[1:-1]
				else: value = value.strip()
				attribs[name.lower()] = value
				return attribs
			else:
				value = text[eq+1:end+1]
				if value[0] in ("'", '"'): value = value[1:-1]
				else: value = value.strip()
				attribs[name.lower()] = value
				return HTML.parseAttributes(text[end+1:].strip(), attribs)

# We create a shared instance with the scraping tools
HTML = HTMLTools()

# EOF - vim: tw=80 ts=4 sw=4 noet