/lib/pwiki/StringOps.py
Python | 2087 lines | 1916 code | 56 blank | 115 comment | 17 complexity | 11bb72f8177a3572d741e7ddc2ef3916 MD5 | raw file
Possible License(s): LGPL-2.1
Large files files are truncated, but you can click here to view the full file
- ## -*- coding: ISO-8859-1 -*-
-
- """
- Various string operations, like unicode encoding/decoding,
- creating diff information for plain byte sequences
- """
-
- import os, traceback
-
- from struct import pack, unpack
-
- import difflib, codecs, os.path, random, base64, locale, hashlib, tempfile, math
-
- # import urllib_red as urllib
- import urllib, urlparse, cgi
-
- from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE
-
- import wx
-
- import re as _re # import pwiki.srePersistent as reimport pwiki.srePersistent as _re
- from WikiExceptions import *
-
- from Utilities import between
-
-
- LINEEND_SPLIT_RE = _re.compile(r"\r\n?|\n", _re.UNICODE)
-
- from SystemInfo import isUnicode, isOSX, isLinux, isWindows, isWin9x
-
-
- # To generate dependencies for py2exe/py2app
- import encodings.utf_8, encodings.latin_1, encodings.utf_16, \
- encodings.utf_16_be, encodings.utf_16_le, encodings.ascii
-
-
-
- # ---------- Encoding conversion ----------
-
-
- utf8Enc = codecs.getencoder("utf-8")
- utf8Dec = codecs.getdecoder("utf-8")
- utf8Reader = codecs.getreader("utf-8")
- utf8Writer = codecs.getwriter("utf-8")
-
def convertLineEndings(text, newLe):
    """
    Replace every line ending in text by newLe which should be
    "\n", "\r" or "\r\n". If newLe or text is unicode, the result
    will be unicode, too.
    """
    pieces = LINEEND_SPLIT_RE.split(text)
    return newLe.join(pieces)
-
def lineendToInternal(text):
    """Normalize all line endings of text to the internal "\n" form."""
    return convertLineEndings(text, "\n")
-
-
-
# Select the platform's multi-byte codec ("mbcs") and the OS line-ending
# convention.  _mbcsEnc/_mbcsDec are the raw codec functions used by
# mbcsEnc()/mbcsDec() below.
if isOSX():
    # generate dependencies for py2app
    import encodings.mac_roman
    _mbcsEnc = codecs.getencoder("mac_roman")
    _mbcsDec = codecs.getdecoder("mac_roman")
    mbcsReader = codecs.getreader("mac_roman")
    mbcsWriter = codecs.getwriter("mac_roman")

    def lineendToOs(text):
        # Classic Mac convention
        return convertLineEndings(text, "\r")

elif isLinux():
    # Could be wrong encoding
    # LINUX_ENCODING = "latin-1"
    # LINUX_ENCODING = "utf8"
    LINUX_ENCODING = locale.getpreferredencoding()

    # Fall back to UTF-8 if the locale does not report an encoding
    if not LINUX_ENCODING:
        LINUX_ENCODING = "utf8"

    _mbcsEnc = codecs.getencoder(LINUX_ENCODING)
    _mbcsDec = codecs.getdecoder(LINUX_ENCODING)
    mbcsReader = codecs.getreader(LINUX_ENCODING)
    mbcsWriter = codecs.getwriter(LINUX_ENCODING)

    def lineendToOs(text):
        return convertLineEndings(text, "\n")

else:
    # Windows (and anything else): use the "mbcs" codec which maps to the
    # current ANSI code page
    # generate dependencies for py2exe
    import encodings.ascii
    import encodings.mbcs
    _mbcsEnc = codecs.getencoder("mbcs")
    _mbcsDec = codecs.getdecoder("mbcs")
    mbcsReader = codecs.getreader("mbcs")
    mbcsWriter = codecs.getwriter("mbcs")

    def lineendToOs(text):
        return convertLineEndings(text, "\r\n")
-
-
def mbcsEnc(input, errors="strict"):
    # Encode unicode to the OS multi-byte charset.  Byte strings (Python 2
    # str) are assumed to be encoded already and pass through unchanged;
    # the (result, length) tuple mimics the codec API.
    if isinstance(input, str):
        return input, len(input)
    else:
        return _mbcsEnc(input, errors)
-
-
def mbcsDec(input, errors="strict"):
    # Decode bytes from the OS multi-byte charset.  Unicode input is
    # already decoded and passes through unchanged (codec-style tuple).
    if isinstance(input, unicode):
        return input, len(input)
    else:
        return _mbcsDec(input, errors)
-
-
-
# pathEnc/pathDec convert between unicode and the representation the
# filesystem expects for path names.
if os.path.supports_unicode_filenames:
    # Filesystem accepts unicode directly -> identity
    def dummy(s):
        return s

    pathEnc = dummy
    pathDec = dummy
else:
    # Filesystem needs byte strings -> go through the OS charset
    def pathEnc(s):
        if s is None:
            return None
        return mbcsEnc(s, "replace")[0]

    def pathDec(s):
        if s is None:
            return None
        return mbcsDec(s, "replace")[0]
-
-
if isWindows():
    if not os.path.supports_unicode_filenames:
        raise InternalError("This Python version does not support unicode paths")

    # To process pathes longer than 255 characters, Windows (NT and following)
    # expects an absolute path prefixed with \\?\

    def longPathEnc(s):
        if s is None:
            return None
        # Already prefixed (or a UNC path) -> leave untouched
        # if s.startswith("\\\\?\\"):
        if s.startswith("\\\\"):
            return s

        return u"\\\\?\\" + os.path.abspath(s)

    def longPathDec(s):
        if s is None:
            return None
        # Strip the \\?\ prefix again
        if s.startswith("\\\\?\\"):
            return s[4:]

        return s

else:
    # Non-Windows: no special long-path handling needed
    longPathEnc = pathEnc
    longPathDec = pathDec
-
-
# Conversion between unicode and whatever string type the wx GUI build
# expects (unicode builds need no conversion).
if isUnicode():
    def uniToGui(text):
        """
        Convert unicode text to a format usable for wx GUI
        """
        return text  # Nothing to do

    def guiToUni(text):
        """
        Convert wx GUI string format to unicode
        """
        return text  # Nothing to do
else:
    def uniToGui(text):
        """
        Convert unicode text to a format usable for wx GUI
        """
        return mbcsEnc(text, "replace")[0]

    def guiToUni(text):
        """
        Convert wx GUI string format to unicode
        """
        return mbcsDec(text, "replace")[0]
-
-
- # TODO!
# TODO!
def unicodeToCompFilename(us):
    """
    Encode a unicode filename to a filename compatible to (hopefully)
    any filesystem encoding by converting unicode to '=xx' for
    characters up to 255 and '$xxxx' above. Each 'x' represents a hex
    character.
    """
    allowed = (u"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            u"abcdefghijklmnopqrstuvwxyz{}()+-_,.%")

    encoded = []
    for ch in us:
        code = ord(ch)
        if code > 255:
            encoded.append("$%04x" % code)
        elif ch in allowed:
            encoded.append(str(ch))
        else:
            encoded.append("=%02x" % code)

    return "".join(encoded)
-
-
- # def unicodeToAllCharFilename
-
def strWithNone(s):
    """Map None to the empty string, return anything else unchanged."""
    return "" if s is None else s
-
def uniWithNone(u):
    """Map None to the empty unicode string, return anything else unchanged."""
    return u"" if u is None else u
-
-
def strToBool(s, default=False):
    """
    Try to interpret string (or unicode) s as
    boolean, return default if string can't be
    interpreted
    """
    if s is None:
        return default

    # An integer literal means True iff it is nonzero
    try:
        return int(s) != 0
    except ValueError:
        pass

    lowered = s.lower()
    if lowered in (u"true", u"yes", u"on"):
        return True
    if lowered in (u"false", u"no", u"off"):
        return False

    return default
-
-
- # TODO More formats
# TODO More formats
def fileContentToUnicode(content):
    """
    Try to detect the text encoding of content
    and return converted unicode
    """
    # A leading byte-order mark decides the encoding; otherwise fall back
    # to the OS multi-byte charset.
    for bom, enc in ((BOM_UTF8, "utf-8"), (BOM_UTF16_BE, "utf-16-be"),
            (BOM_UTF16_LE, "utf-16-le")):
        if content.startswith(bom):
            return content[len(bom):].decode(enc, "replace")

    return mbcsDec(content, "replace")[0]
-
-
def contentToUnicode(content):
    """
    Try to detect the text encoding of content
    and return converted unicode
    """
    # Already unicode -> nothing to do
    if isinstance(content, unicode):
        return content

    # A leading byte-order mark decides the encoding unambiguously
    if content.startswith(BOM_UTF8):
        return content[len(BOM_UTF8):].decode("utf-8", "replace")
    elif content.startswith(BOM_UTF16_BE):
        return content[len(BOM_UTF16_BE):].decode("utf-16-be", "replace")
    elif content.startswith(BOM_UTF16_LE):
        return content[len(BOM_UTF16_LE):].decode("utf-16-le", "replace")
    else:
        # No BOM: try strict UTF-8 first, fall back to the OS charset
        try:
            return content.decode("utf-8", "strict")
        except UnicodeDecodeError:
            return mbcsDec(content, "replace")[0]
-
-
-
-
-
def loadEntireTxtFile(filename):
    """
    Read the whole file in text mode ("rU") and return its content.
    """
    txtFile = open(pathEnc(filename), "rU")
    try:
        return txtFile.read()
    finally:
        txtFile.close()
-
-
- # def writeEntireTxtFile(filename, content):
- # """
- # Write entire file (text mode).
- # content can either be a byte string or a tuple or list of byte strings
- # which are then written one by one to the file.
- # """
- # rf = open(pathEnc(filename), "w")
- # try:
- # if isinstance(content, tuple) or isinstance(content, list):
- # for c in content:
- # rf.write(c)
- # else:
- # rf.write(content)
- # return
- # finally:
- # rf.close()
-
-
- # def writeEntireFileFast(filename, content, textMode=False):
- # """
- # Fast write of bytestring content without temporary file and
- # error checking.
- # """
- # if textMode:
- # rf = open(pathEnc(filename), "w")
- # else:
- # rf = open(pathEnc(filename), "wb")
- #
- # try:
- # rf.write(content)
- # finally:
- # rf.close()
-
-
def loadEntireFile(filename, textMode=False):
    """
    Load entire file and return its content.

    textMode -- open in universal-newline text mode instead of binary
    """
    mode = "rU" if textMode else "rb"
    f = open(pathEnc(filename), mode)
    try:
        return f.read()
    finally:
        f.close()
-
-
-
def writeEntireFile(filename, content, textMode=False):
    """
    Write entire file.
    content can either be a bytestring or a tuple or list of bytestrings
    which are then written one by one to the file.
    If textMode is True, content can also be a unistring or sequence
    of them (no mixed bytestring/unistring sequences allowed!)
    which are then converted to UTF-8 and written to file with prefixed BOM
    for utf-8. In textMode, lineEndings are properly converted to the
    appropriate for the OS.
    """
    import TempFileSet

    # Write to a temp file in the target directory first, then replace the
    # destination, so a crash can't leave a half-written file.
    basePath = os.path.dirname(filename)
    if basePath == "":
        basePath = u"."

    suffix = os.path.splitext(filename)[1]

    tempPath = TempFileSet.createTempFile(content, suffix=suffix,
            path=basePath, textMode=textMode)

    if os.path.exists(filename):
        os.unlink(filename)

    os.rename(tempPath, filename)
-
-
-
def getFileSignatureBlock(filename, timeCoarsening=None):
    """
    Returns the file signature block for a given file. It is a bytestring
    containing size and modification date of the file and can be compared to a
    db-stored version to check for file changes outside of WikidPad.

    The timeCoarsening can be a number of seconds (or fractions thereof).
    The modification time is rounded UP to a number divisible by timeCoarsening.

    If a wiki is moved between file systems with different time granularity
    (e.g. NTFS uses 100ns, FAT uses 2s for mod. time) the file would be seen as
    dirty and cache data would be rebuild without need without coarsening.
    """
    statinfo = os.stat(pathEnc(filename))

    mtime = statinfo.st_mtime
    if timeCoarsening is not None and timeCoarsening > 0:
        # Round modification time up to the next multiple of timeCoarsening
        mtime = int(math.ceil(mtime / timeCoarsening)) * timeCoarsening

    return pack(">BQd", 0, statinfo.st_size, mtime)
-
-
-
-
def removeBracketsFilename(fn):
    """
    Strip one pair of enclosing square brackets (real brackets, not
    configurable) from the name part of a filename; the extension is kept.
    """
    name, ext = os.path.splitext(fn)
    if name.startswith(u"[") and name.endswith(u"]"):
        return name[1:-1] + ext
    return name + ext
-
-
def revStr(s):
    """
    Return reversed string
    """
    # reversed() avoids the list()/reverse()/join round-trip; joining with
    # u"" keeps the original behavior of always returning unicode.
    return u"".join(reversed(s))
-
def splitKeep(s, delim):
    """
    Similar to split, but keeps the delimiter as separate element, e.g.
    splitKeep("aaabaaabaa", "b") -> ["aaa", "b", "aaa", "b", "aa"]
    """
    result = []
    for i, piece in enumerate(s.split(delim)):
        if i > 0:
            result.append(delim)
        result.append(piece)
    return result
-
def splitIndentDeepness(text):
    """
    Return tuple (d, t) where d is deepness of indentation and t is text
    without the indentation.
    """
    stripped = text.lstrip()
    return (len(text) - len(stripped), stripped)
-
def splitIndent(text):
    """
    Return tuple (ind, t) where ind is a string of the indentation characters
    (normally spaces) and t is text without the indentation.
    """
    stripped = text.lstrip()
    cut = len(text) - len(stripped)
    return (text[:cut], stripped)
-
def measureIndent(indent):
    """Return the width of the indentation string (one column per character)."""
    return len(indent)
-
-
def findLineStart(text, pos):
    """
    Return index of the first character of the line containing pos.
    Also correct when no newline precedes pos (rfind yields -1 -> 0).
    """
    nlPos = text.rfind(u"\n", 0, pos)
    return nlPos + 1
-
-
def findLineEnd(text, pos):
    """
    Return index of the end of the line containing pos: the position of the
    next "\n" or len(text) if there is none.
    """
    nlPos = text.find(u"\n", pos)
    return len(text) if nlPos == -1 else nlPos
-
-
-
# Empty group 1 marks the start of the last word before the match limit
LASTWORDSTART_RE = _re.compile(r"(?:.*\W)?()\w", _re.UNICODE)
# Empty group 1 marks the end of the first word (or end of string)
FIRSTWORDEND_RE = _re.compile(r".*?()(?:\W|(?!.))", _re.UNICODE)
-
-
-
def getNearestWordStart(text, pos):
    """Return start index of the word at/around pos, or pos if there is none."""
    lineStart = findLineStart(text, pos)

    match = LASTWORDSTART_RE.match(text, lineStart, pos + 1)
    if match is None:
        return pos
    return match.start(1)
-
-
def getNearestWordEnd(text, pos):
    """Return end index of the word starting at/after pos, or pos if none."""
    match = FIRSTWORDEND_RE.match(text, pos)
    if match is None:
        return pos
    return match.start(1)
-
-
def styleSelection(text, start, afterEnd, startChars, endChars=None):
    """
    Called when selected text (between start and afterEnd)
    e.g. in editor should be styled with startChars and endChars
    text -- Whole text
    start -- Start position of selection
    afterEnd -- After end position of selection

    startChars -- Characters to place before selection
    endChars -- Characters to place after selection. If None, startChars
            is used for that, too

    Returns tuple (replacement, repStart, repAfterEnd, selStart, selAfterEnd) where

    replacement -- replacement text
    repStart -- Start of characters to delete in original text
    repAfterEnd -- After end of characters to delete
    selStart -- Recommended start of editor selection after replacement
            was done
    selAfterEnd -- Recommended after end of editor selection after replacement
    """
    if endChars is None:
        endChars = startChars

    # An empty selection is expanded to the word around the cursor
    if start == afterEnd:
        start = getNearestWordStart(text, start)
        afterEnd = getNearestWordEnd(text, start)

    replacement = startChars + text[start:afterEnd] + endChars

    if start == afterEnd:
        # Selection still empty: cursor should end up between the
        # style characters
        cursorPos = afterEnd + len(startChars)
    else:
        # Otherwise the cursor goes after the styled word
        cursorPos = afterEnd + len(startChars) + len(endChars)

    return (replacement, start, afterEnd, cursorPos, cursorPos)
-
-
-
def splitFill(text, delim, count, fill=u""):
    """
    Split text by delim into up to count pieces. If less
    pieces than count+1 are available, additional pieces are added containing
    fill.
    """
    pieces = text.split(delim, count)
    missing = (count + 1) - len(pieces)
    if missing > 0:
        pieces.extend([fill] * missing)
    return pieces
-
-
- # def splitUnifName(unifName):
- # """
- # Split a unified name path and return a list of components.
- # If a part of the path must contain a slash it is quoted as double slash.
- #
- # Some unified names shouldn't be processed by this function, especially
- # "wikipage/..." unifNames
- # """
- # result =
-
-
-
def matchWhole(reObj, s):
    """
    reObj -- Compiled regular expression
    s -- String to match

    Similar to reObj.match(s), but returns MatchObject only if the
    whole string s is covered by the match, returns None otherwise
    """
    mat = reObj.match(s)
    if mat is None or mat.end(0) < len(s):
        return None
    return mat
-
-
-
def obfuscateShortcut(shortcut):
    """
    Necessary to prevent wxPython from interpreting e.g. CTRL+LEFT in a menu
    item as being a shortcut. I haven't found a better way.
    Unused at the moment.

    A zero-width space is inserted before each character.
    """
    pieces = []
    for ch in shortcut:
        pieces.append(u"\u200B")
        pieces.append(ch)
    return u"".join(pieces)
-
-
-
## Copied from xml.sax.saxutils and modified to reduce dependencies
def escapeHtml(data):
    """
    Escape &, <, > and line breaks in a unicode string of data.
    """
    # must do ampersand first, otherwise the '&' of the entities inserted
    # below would be escaped again.  (The entity names had been corrupted
    # to plain characters, making the replaces no-ops.)
    return data.replace(u"&", u"&amp;").replace(u">", u"&gt;").\
            replace(u"<", u"&lt;").replace(u"\n", u"<br />\n")
-
-
def escapeHtmlNoBreaks(data):
    """
    Escape &, <, and > (no line breaks) in a unicode string of data.
    """
    # must do ampersand first, otherwise the '&' of the entities inserted
    # below would be escaped again.  (The entity names had been corrupted
    # to plain characters, making the replaces no-ops.)
    return data.replace(u"&", u"&amp;").replace(u">", u"&gt;").\
            replace(u"<", u"&lt;")
-
-
-
-
class AbstractHtmlItem:
    """
    Abstract base for some "things" appearing in HTML. This and derived classes
    mainly needed for the "htmlEquivalent" token in a wiki AST
    """
    def __init__(self):
        pass

    def asString(self):
        # Subclasses must return the HTML text representation
        raise NotImplementedError

    def clone(self):
        # Subclasses must return a copy of themselves
        raise NotImplementedError

    def __repr__(self):
        return "%s:%s" % (self.__class__.__name__, self.asString())
-
-
class HtmlStartTag(AbstractHtmlItem):
    """
    Regular start tag.  Attribute values are stored HTML-escaped; double
    quotes become &quot; because values are emitted inside double-quoted
    attributes.
    """
    def __init__(self, tag, attributes=None):
        self.tag = tag
        if attributes is None:
            self.attributes = {}
        else:
            # The &quot; entity had been corrupted to a bare '"' which
            # broke the string literal; restored here.
            self.attributes = dict((k, escapeHtml(v).replace(u"\"", u"&quot;"))
                    for k, v in attributes.iteritems())

    def addAttribute(self, key, value):
        # A None value means a boolean-style attribute: value = key
        if value is None:
            value = key

        self.attributes[key] = escapeHtml(value).replace(u"\"", u"&quot;")


    def addEscapedAttribute(self, key, value):
        # value must already be HTML-escaped by the caller
        if value is None:
            value = key

        self.attributes[key] = value


    def addEscapedAttributes(self, attrSeq):
        for key, value in attrSeq:
            self.addEscapedAttribute(key, value)


    def getTag(self):
        return self.tag

    def getStringForAttributes(self):
        # Values are escaped already, so they can be emitted verbatim
        return u" ".join(
                k + u"=\"" + v + u"\""
                for k, v in self.attributes.iteritems())

    def asString(self):
        if len(self.attributes) == 0:
            return u"<" + self.tag + u">"

        attrString = self.getStringForAttributes()
        return u"<" + self.tag + u" " + attrString + u">"


    def clone(self):
        # Copy the attribute dict directly; routing it through __init__
        # would HTML-escape the already escaped values a second time.
        result = HtmlStartTag(self.tag)
        result.attributes = dict(self.attributes)
        return result
-
-
class HtmlEmptyTag(HtmlStartTag):
    """
    Start tag which is also end tag (e.g. <br />)
    """

    def asString(self):
        if len(self.attributes) == 0:
            return u"<" + self.tag + u" />"

        attrString = self.getStringForAttributes()
        return u"<" + self.tag + u" " + attrString + u" />"

    def clone(self):
        # Copy the attribute dict directly; routing it through __init__
        # would HTML-escape the already escaped values a second time.
        result = HtmlEmptyTag(self.tag)
        result.attributes = dict(self.attributes)
        return result
-
-
class HtmlEndTag(AbstractHtmlItem):
    """
    Regular end tag, e.g. </div>
    """
    def __init__(self, tag):
        self.tag = tag

    def asString(self):
        return u"</%s>" % self.tag

    def clone(self):
        return HtmlEndTag(self.tag)
-
-
class HtmlEntity(AbstractHtmlItem):
    """
    Entity; stored normalized to the full "&name;" form.
    """
    def __init__(self, entity):
        # Add leading '&' and trailing ';' if they are missing
        if entity[0] != "&":
            entity = "&" + entity
        if entity[-1] != ";":
            entity = entity + ";"

        self.entity = entity

    def asString(self):
        return self.entity

    def clone(self):
        return HtmlEntity(self.entity)
-
-
-
def escapeForIni(text, toEscape=u""):
    """
    Return an escaped version of string. Always escaped will be backslash and
    all characters with ASCII value < 32. Additional characters can be given in
    the toEscape parameter (as unicode string, only characters < 128,
    not the backslash).

    Returns: unicode string
    """
    # Escape '\' first so the "\xnn" sequences inserted below are not
    # escaped again
    text = text.replace(u"\\", u"\\x%02x" % ord("\\"))

    # Escape everything with ord < 32
    for i in xrange(32):
        text = text.replace(unichr(i), u"\\x%02x" % i)

    # Escape the caller-supplied extra characters
    for c in toEscape:
        text = text.replace(c, u"\\x%02x" % ord(c))

    return text
-
-
def _unescapeForIniHelper(match):
    # Convert one "\xnn" escape (group 1 = two hex digits) back to its char
    return unichr(int(match.group(1), 16))
-
def unescapeForIni(text):
    """
    Inverse of escapeForIni()
    """
    # Replace every "\xnn" sequence by the character with hex code nn
    return _re.sub(ur"\\x([0-9a-f]{2})", _unescapeForIniHelper, text)
-
-
- # def escapeWithRe(text):
- # return text.replace(u"\\", u"\\\\").replace("\n", "\\n").\
- # replace("\r", "\\r")
-
def unescapeWithRe(text):
    """
    Unescape things like \n or \f. Throws exception if unescaping fails
    """
    # Trick: text is passed as the *replacement template* of re.sub.
    # Substituting the empty pattern once in an empty string makes the re
    # module expand the backslash escapes (\n, \t, ...) contained in text;
    # a malformed escape raises an error from the re module.
    return _re.sub(u"", text, u"", 1)
-
-
def re_sub_escape(pattern):
    """
    Escape the replacement pattern for a re.sub function
    """
    result = pattern.replace(u"\\", u"\\\\")
    for plain, escaped in ((u"\n", u"\\n"), (u"\r", u"\\r"),
            (u"\t", u"\\t"), (u"\f", u"\\f")):
        result = result.replace(plain, escaped)
    return result
-
-
# Matches HTML 3- or 6-digit color specs like "#4E2" or "#C0D623"
HTML_DIGITCOLOR = _re.compile(
        ur"^#[0-9a-fA-F]{3}(?:[0-9a-fA-F]{3})?$",
        _re.DOTALL | _re.UNICODE | _re.MULTILINE)
-
-
- # def htmlColorToRgbTuple(desc):
# def htmlColorToRgbTuple(desc):
def colorDescToRgbTuple(desc):
    """
    Converts a color description to an RGB tuple or None if
    description is invalid.
    Color description can be:
    HTML 6-digits color, e.g. #C0D623
    HTML 3-digits color, e.g. #4E2 which converts to #44EE22 (TODO: HTML standard?)
    HTML color name
    """
    if not HTML_DIGITCOLOR.match(desc):
        # Not a #-digit form -> try lookup as a color name
        try:
            desc = _COLORBASE[desc.replace(" ", "").lower()]
        except KeyError:
            return None

    if len(desc) == 4:
        # Expand #rgb to #rrggbb by doubling each digit
        desc = "#" + desc[1] + desc[1] + desc[2] + desc[2] + desc[3] + desc[3]
    try:
        r = int(desc[1:3], 16)
        g = int(desc[3:5], 16)
        b = int(desc[5:7], 16)
        return (r, g, b)
    except ValueError:
        # Catch only the parse error; the previous bare except would also
        # have swallowed e.g. KeyboardInterrupt
        return None
-
-
- # def colorDescToRgbTuple(desc):
- # """
- # Converts a color description to an RGB tuple or None if
- # description is invalid.
- # Color description can be:
- # HTML 6-digits color, e.g. #C0D623
- # HTML 3-digits color, e.g. #4E2 which converts to #44EE22 (TODO: HTML standard?)
- # HTML color name
- # """
- # desc = desc.strip()
- # if len(desc) == 0:
- # return None
- #
- # if desc[0] != "#":
- # desc = desc.replace(" ", "").lower()
- # desc = _COLORBASE.get(desc)
- # if desc is None:
- # return None
- #
- # if len(desc) == 4:
- # desc = "#" + desc[1] + desc[1] + desc[2] + desc[2] + desc[3] + desc[3]
- #
- # if len(desc) != 7:
- # return None
- # try:
- # r = int(desc[1:3], 16)
- # g = int(desc[3:5], 16)
- # b = int(desc[5:7], 16)
- # return (r, g, b)
- # except:
- # return None
-
-
def rgbToHtmlColor(r, g, b):
    """
    Return HTML color '#hhhhhh' format string.
    """
    return "#" + "".join("%02X" % component for component in (r, g, b))
-
-
def base64BlockEncode(data):
    """
    Cut a sequence of base64 characters into chunks of 70 characters
    and join them with newlines. Pythons base64 decoder can read this.
    """
    b64 = base64.b64encode(data)

    chunks = []
    pos = 0
    while pos < len(b64):
        chunks.append(b64[pos:pos + 70])
        pos += 70

    return u"\n".join(chunks)
-
-
- # Just for completeness
- base64BlockDecode = base64.b64decode
-
-
-
# Tokenizes a strftime-like format string; "%u" (ISO weekday) is kept as
# its own token so formatWxDate() can expand it manually.
EXTENDED_STRFTIME_RE = _re.compile(
        r"([^%]+|%(?:%|[%aAbBcdHIJmMpSUwWxXyYZ])|(?:%u))",
        _re.DOTALL | _re.UNICODE | _re.MULTILINE)
-
-
def formatWxDate(frmStr, date):
    """
    Format a date (wxDateTime) according to frmStr similar to strftime.
    """
    if frmStr == "":
        return frmStr

    processed = []

    for token in EXTENDED_STRFTIME_RE.split(frmStr):
        if not token:
            continue
        if token == "%u":
            # ISO-8601 weekday number: Monday=1 ... Sunday=7
            weekday = date.GetWeekDay()
            if weekday == 0:
                weekday = 7
            processed.append("%i" % weekday)
        else:
            processed.append(token)

    return date.Format(unescapeWithRe("".join(processed)))
-
-
def strftimeUB(frmStr, timet=None):
    """
    Similar to time.strftime, but uses a time_t number as time (no structure),
    also unescapes some backslash codes, supports unicode and shows local time
    if timet is GMT.
    """
    if timet is None:
        # No timestamp given -> use current time
        return formatWxDate(frmStr, wx.DateTime_Now())

    try:
        return formatWxDate(frmStr, wx.DateTimeFromTimeT(timet))
    except TypeError:
        return _(u"Inval. timestamp")  # TODO Better errorhandling?
-
-
-
def splitpath(path):
    """
    Cut a path into all of its pieces, starting with drive name, through
    all path components up to the name of the file (if any).
    Returns a list of the elements, first and/or last element may be
    empty strings.
    Maybe use os.path.abspath before calling it
    """
    drive, remainder = os.path.splitdrive(path)
    parts = []
    while True:
        head, tail = os.path.split(remainder)
        if head == remainder:
            # os.path.split makes no more progress -> done
            break
        parts.append(tail)
        remainder = head
    parts.append(drive)
    parts.reverse()
    return parts
-
-
def getRelativeFilePathAndTestContained(location, toFilePath):
    """
    Returns a relative (if possible) path to address the file
    toFilePath if you are in directory location as first tuple item.


    Function returns None as first tuple item if an absolute path is needed!

    Tests if toFilePath is a file or dir contained in location and returns
    truth value in second tuple item

    Both parameters should be normalized with os.path.abspath
    location -- Directory where you are
    toFilePath -- absolute path to file you want to reach
    """
    locParts = splitpath(location)
    # Drop a trailing empty component (location ending in a separator)
    if locParts[-1] == "":
        del locParts[-1]

    locLen = len(locParts)
    fileParts = splitpath(toFilePath)

    # Strip the common prefix of both paths (case-insensitively where the
    # OS compares paths that way)
    for i in xrange(len(locParts)):
        if len(fileParts) == 0:
            break  # TODO Error ???

        if os.path.normcase(locParts[0]) != os.path.normcase(fileParts[0]):
            break

        del locParts[0]
        del fileParts[0]

    result = []

    if len(locParts) == locLen:
        # Nothing matches at all, absolute path needed
        return None, False

    isContained = len(fileParts) > 0
    if len(locParts) > 0:
        # go back some steps
        result += [".."] * len(locParts)
        isContained = False

    result += fileParts

    if len(result) == 0:
        # Same directory
        return u"", False
    else:
        return os.path.join(*result), isContained
-
-
-
def relativeFilePath(location, toFilePath):
    """
    Returns a relative (if possible) path to address the file
    toFilePath if you are in directory location.
    Both parameters should be normalized with os.path.abspath

    Function returns None if an absolute path is needed!

    location -- Directory where you are
    toFilePath -- absolute path to file you want to reach
    """
    relPath, dummy = getRelativeFilePathAndTestContained(location, toFilePath)
    return relPath
-
-
def testContainedInDir(location, toFilePath):
    """
    Tests if toFilePath is a file or dir contained in location.
    Both parameters should be normalized with os.path.abspath
    """
    dummy, contained = getRelativeFilePathAndTestContained(location, toFilePath)
    return contained
-
-
-
-
def _asciiFlexibleUrlUnquote(part):
    """
    Unquote ascii-only parts of an url
    """
    if len(part) == 0:
        return u""
    # Get bytes out of percent-quoted URL
    linkBytes = urllib.unquote(part)
    # Try to interpret bytes as UTF-8
    try:
        return linkBytes.decode("utf8", "strict")
    except UnicodeDecodeError:
        # Failed -> try mbcs
        try:
            return mbcsDec(linkBytes, "strict")[0]
        except UnicodeDecodeError:
            # Failed, too -> leave link part unmodified. TODO: Doesn't make sense, will fail as well.
            return unicode(part)
-
-
def flexibleUrlUnquote(link):
    """
    Tries to unquote an url.
    TODO: Faster and more elegantly.

    link -- unistring
    """
    if link is None:
        return None

    # Walk the link and alternate between runs of pure-ASCII characters
    # (which may contain %xx quoting and are unquoted) and runs of
    # non-ASCII characters (which are passed through unchanged).
    i = 0
    result = SnippetCollector()

    while i < len(link):

        asciiPart = ""
        while i < len(link) and ord(link[i]) < 128:
            asciiPart += chr(ord(link[i]))
            i += 1

        result += _asciiFlexibleUrlUnquote(asciiPart)

        unicodePart = u""
        while i < len(link) and ord(link[i]) >= 128:
            unicodePart += link[i]
            i += 1

        result += unicodePart

    return unicode(result.value())
-
-
-
# Characters urlQuote() always percent-quotes: RFC 2396 reserved characters
# plus characters considered unsafe in URLs
URL_RESERVED = frozenset((u";", u"?", u":", u"@", u"&", u"=", u"+", u",", u"/",
        u"{", u"}", u"|", u"\\", u"^", u"~", u"[", u"]", u"`", u'"', u"%"))
-
-
-
def urlQuote(s, safe='/'):
    """
    Modified version of urllib.quote supporting unicode.

    Percent-quotes every character that is either a control/space character
    (ord < 33) or a member of URL_RESERVED, unless it is listed in safe.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters:

    reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
               "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.  The function is intended for
    quoting the path section of a URL, thus '/' is safe by default.

    The characters u"{", u"}", u"|", u"\", u"^", u"~", u"[", u"]", u"`"
    are considered unsafe and are quoted as well.
    """
    quoted = []

    for c in s:
        if c in safe:
            quoted.append(c)
        elif ord(c) < 33 or c in URL_RESERVED:
            quoted.append("%%%02X" % ord(c))
        else:
            quoted.append(c)

    return "".join(quoted)
-
-
-
def urlQuoteSpecific(s, toQuote=''):
    """
    Percent-quote exactly the characters listed in toQuote, leave all
    others untouched.
    """
    return "".join(
            ("%%%02X" % ord(c)) if c in toQuote else c
            for c in s)
-
-
-
def ntUrlFromPathname(p, addSafe=''):
    r"""
    Modified version of nturl2path.pathname2url.

    Convert a DOS/Windows path name to a file url.

            C:\foo\bar\spam.foo

    becomes

            ///C:/foo/bar/spam.foo
    """
    if not ':' in p:
        # No drive specifier, just convert slashes and quote the name
        # if p[:2] == '\\\\':
        #     # path is something like \\host\path\on\remote\host
        #     # convert this to ////host/path/on/remote/host
        #     # (notice doubling of slashes at the start of the path)
        #     p = '\\\\' + p
        components = p.split('\\')
        return urlQuote('/'.join(components), safe='/' + addSafe)
    comp = p.split(':')
    # Exactly one drive letter is allowed before the ':'
    if len(comp) != 2 or len(comp[0]) > 1:
        error = 'Bad path: ' + p
        raise IOError, error

    # Drive letter present: quote drive and each path component separately
    drive = urlQuote(comp[0].upper(), safe='/' + addSafe)
    components = comp[1].split('\\')
    path = '///' + drive + ':'
    for comp in components:
        if comp:
            path = path + '/' + urlQuote(comp, safe='/' + addSafe)
    return path
-
-
-
def _macpncomp2url(component, addSafe):
    # Mac path components are limited to 31 characters; slashes inside a
    # component get quoted because '/' is not in the safe set here.
    return urlQuote(component[:31], safe=addSafe)
-
def macUrlFromPathname(pathname, addSafe=''):
    """
    Modified version of macurl2path.pathname2url.

    convert mac pathname to /-delimited pathname
    """
    if '/' in pathname:
        raise RuntimeError, "Cannot convert pathname containing slashes"
    components = pathname.split(':')
    # Remove empty first and/or last component
    if components[0] == '':
        del components[0]
    if components[-1] == '':
        del components[-1]
    # Replace empty string ('::') by .. (will result in '/../' later)
    for i in range(len(components)):
        if components[i] == '':
            components[i] = '..'
    # Truncate names longer than 31 bytes
    components = [_macpncomp2url(c, addSafe) for c in components]
    # components = map(_macpncomp2url, components)

    # A leading colon marks a relative mac path -> relative URL
    if os.path.isabs(pathname):
        return '/' + '/'.join(components)
    else:
        return '/'.join(components)
-
-
# Select the platform-specific pathname -> URL conversion
if os.name == 'nt':
    urlFromPathname = ntUrlFromPathname
elif os.name == 'mac':
    urlFromPathname = macUrlFromPathname
else:
    def urlFromPathname(fn, addSafe=''):
        # Posix: encode unicode as UTF-8 bytes, then percent-quote
        if isinstance(fn, unicode):
            fn = utf8Enc(fn, "replace")[0]

        # riscos not supported
        url = urlQuote(fn, safe='/$' + addSafe)
        # url.replace("%24", "$")

        return url
-
-
-
-
def ntPathnameFromUrl(url, testFileType=True):
    r"""
    Modified version of nturl2path.url2pathname.

    Convert a URL to a DOS path.

            ///C|/foo/bar/spam.foo

    becomes

            C:\foo\bar\spam.foo

    testFileType -- ensure that URL has type "file" (and starts with "file:")
            throw RuntimeError if not.
    """
    import string
    if url.startswith("file:") or url.startswith("wiki:"):
        url = url[5:]
    elif testFileType:
        raise RuntimeError, 'Cannot convert non-local URL to pathname'

    # Strip fragment or query if present
    url, dummy = decomposeUrlQsFrag(url)

    if (not ':' in url) and (not '|' in url) and (not '%3A' in url) and (not '%3a' in url):
        # No drive specifier, just convert slashes
        if url[:4] == '////':
            # path is something like ////host/path/on/remote/host
            # convert this to \\host\path\on\remote\host
            # (notice halving of slashes at the start of the path)
            url = url[2:]
        components = url.split('/')
        # make sure not to convert quoted slashes :-)
        return flexibleUrlUnquote('\\'.join(components))

    # Try the possible spellings of the drive delimiter until one yields a
    # valid "<letter><delim><rest>" split
    comp = None
    for driveDelim in ('|', ':', '%3A', '%3a'):
        comp = url.split(driveDelim)
        if len(comp) != 2 or len(comp[0]) == 0 or comp[0][-1] not in string.ascii_letters:
            comp = None
            continue
        break

    if comp is None:
        error = 'Bad URL: ' + url
        raise IOError(error)


    # comp = url.split('|')
    # if len(comp) == 1:
    #     comp = url.split(':')
    #
    # if len(comp) != 2 or len(comp[0]) == 0 or comp[0][-1] not in string.ascii_letters:
    #     error = 'Bad URL: ' + url
    #     raise IOError, error

    drive = comp[0][-1].upper()
    components = comp[1].split('/')
    path = drive + ':'
    for comp in components:
        if comp:
            path = path + '\\' + flexibleUrlUnquote(comp)
    return path
-
-
-
def macPathnameFromUrl(url, testFileType=True):
    "Convert /-delimited url to mac pathname"
    #
    # XXXX The .. handling should be fixed...
    #
    tp = urllib.splittype(url)[0]
    if tp and tp != 'file' and tp != 'wiki':
        raise RuntimeError, 'Cannot convert non-local URL to pathname'
    # Turn starting /// into /, an empty hostname means current host
    if url[:3] == '///':
        url = url[2:]
    elif url[:2] == '//':
        raise RuntimeError, 'Cannot convert non-local URL to pathname'

    # Strip fragment or query if present
    url, dummy = decomposeUrlQsFrag(url)

    components = url.split('/')
    # Remove . and embedded ..
    i = 0
    while i < len(components):
        if components[i] == '.':
            del components[i]
        elif components[i] == '..' and i > 0 and \
                components[i-1] not in ('', '..'):
            # "a/.." cancels out -> remove both components
            del components[i-1:i+1]
            i = i-1
        elif components[i] == '' and i > 0 and components[i-1] != '':
            del components[i]
        else:
            i = i+1
    if not components[0]:
        # Absolute unix path, don't start with colon
        rv = ':'.join(components[1:])
    else:
        # relative unix path, start with colon. First replace
        # leading .. by empty strings (giving ::file)
        i = 0
        while i < len(components) and components[i] == '..':
            components[i] = ''
            i = i + 1
        rv = ':' + ':'.join(components)
    # and finally unquote slashes and other funny characters
    return flexibleUrlUnquote(rv)
-
-
def elsePathnameFromUrl(url, testFileType=True):
    """
    Convert a /-delimited "file:" or "wiki:" URL to a pathname on
    platforms other than Windows and classic Mac.

    testFileType -- if True, raise RuntimeError for URLs with any other
            scheme instead of passing them through.
    """
    #
    # XXXX The .. handling should be fixed...
    #
    if url[:8] in ("file:///", "wiki:///"):
        url = url[7:]  # Third '/' remains
    elif url[:5] in ("file:", "wiki:"):
        url = url[5:]
    elif testFileType:
        raise RuntimeError('Cannot convert non-local URL to pathname')

    # Discard any query string or fragment part
    mainPart, dummy = decomposeUrlQsFrag(url)

    return flexibleUrlUnquote(mainPart)
-
-
-
-
# Select the platform-specific URL -> pathname conversion function once
# at import time. os.name == 'mac' applies to classic (pre-OS X)
# MacPython only; OS X reports 'posix' and uses elsePathnameFromUrl.
if os.name == 'nt':
    pathnameFromUrl = ntPathnameFromUrl
elif os.name == 'mac':
    pathnameFromUrl = macPathnameFromUrl
else:
    # pathnameFromUrl = flexibleUrlUnquote
    pathnameFromUrl = elsePathnameFromUrl
-
-
-
- _DECOMPOSE_URL_RE = _re.compile(ur"([^?#]*)((?:[?#].*)?)", _re.UNICODE | _re.DOTALL);
-
-
def decomposeUrlQsFrag(url):
    """
    Split url at the first '?' or '#' (query string or fragment) so the
    parts can be (un-)quoted differently.

    Returns a 2-tuple (mainPart, additionalPart); additionalPart is
    empty if the URL contains no '?' or '#'.
    """
    matchObj = _DECOMPOSE_URL_RE.match(url)
    mainPart, additional = matchObj.groups()
    return (mainPart, additional)
-
-
def composeUrlQsFrag(mainUrl, additional):
    """
    Inverse of decomposeUrlQsFrag(): join the main URL part and the
    query-string/fragment part back into one URL. Currently a very
    simple function but may become more complex later.
    """
    result = mainUrl + additional
    return result
-
-
-
def _quoteChar(c):
    """
    Return the filename escape sequence for unicode character c:
    "%XX" for code points below 256, "@XXXX" for all others
    (X = one hex digit).
    """
    oc = ord(c)
    if oc >= 256:
        return u"@%04X" % oc
    return u"%%%02X" % oc
-
-
# Characters used by the quoting scheme itself; they must always be escaped
_ESCAPING_CHARACTERS = u"%@~"

# Characters never allowed in a generated filename
_FORBIDDEN_CHARACTERS = frozenset(u":/\\*?\"'<>|;![]" + _ESCAPING_CHARACTERS)
# Additional characters not allowed as the FIRST character
_FORBIDDEN_START = _FORBIDDEN_CHARACTERS | frozenset(u".$ -")

# Allowed ascii characters remaining: #&()+,=^_`{}
-
-
def iterCompatibleFilename(baseName, suffix, asciiOnly=False, maxLength=120,
        randomLength=10):
    """
    Generator to create filenames compatible to (hopefully) all major
    OSs/filesystems.

    Encode a unicode filename to a filename compatible to (hopefully)
    any filesystem encoding by converting unicode to '%xx' for
    characters below 256 and '@xxxx' above. Each 'x' represents a hex
    character.

    If the resulting name is too long it is shortened.

    If the first returned filename isn't accepted, a sequence of random
    characters, delimited by a tilde '~' is added. If the filename is then
    too long it is also shortened.

    The first random sequence isn't random but a MD5-hash of baseName.

    Each time you ask for next filename, a new sequence of random characters
    is created.

    baseName - Base name to use for the filename
    suffix - Suffix (must include the dot) of the filename. The suffix must not
        be empty, is not quoted in any way and should follow the
        rules of the filesystem(s)
    asciiOnly - Iff True, all non-ascii characters are replaced.
    maxLength - Maximum length of filename including encoded basename,
        random sequence and suffix
    randomLength - Length of the random sequence (without leading tilde)
    """
    # between() presumably clamps maxLength to
    # [20 + len(suffix) + randomLength, 250] -- confirm in Utilities
    maxLength = between(20 + len(suffix) + randomLength, maxLength, 250)

    # Decode from the system's multi-byte encoding if necessary
    baseName = mbcsDec(baseName)[0]

    if len(baseName) > 0:
        # First character uses the stricter _FORBIDDEN_START set
        c = baseName[0]
        if ord(c) < 32 or c in _FORBIDDEN_START or \
                (asciiOnly and ord(c) > 127):
            baseQuoted = [_quoteChar(c)]
        else:
            baseQuoted = [c]

        # Remaining characters use _FORBIDDEN_CHARACTERS
        for c in baseName[1:]:
            if ord(c) < 32 or c in _FORBIDDEN_CHARACTERS or \
                    (asciiOnly and ord(c) > 127):
                baseQuoted.append(_quoteChar(c))
            else:
                baseQuoted.append(c)

    else:
        baseQuoted = []

    overallLength = sum(len(bq) for bq in baseQuoted) + len(suffix)

    # Shorten baseQuoted if needed. This method ensures that no half-quoted
    # character (e.g. "@3") is remaining
    while overallLength > maxLength:
        overallLength -= len(baseQuoted.pop())

    if len(baseName) > 0:
        # First try, no random part
        yield u"".join(baseQuoted) + suffix

    # Add random part to length
    overallLength += 1 + randomLength

    # Shorten baseQuoted again
    while overallLength > maxLength:
        overallLength -= len(baseQuoted.pop())

    beforeRandom = u"".join(baseQuoted) + u"~"

    # Now we try MD5-Hash. This is one last try to create a filename which
    # is non-ambigously connected to the baseName
    hashStr = getMd5B36ByString(baseName)[-randomLength:]
    if len(hashStr) < randomLength:
        # Left-pad with zeros to the full random length
        hashStr = u"0" * (randomLength - len(hashStr)) + hashStr

    yield beforeRandom + hashStr + suffix

    # Now build infinite random names
    while True:
        yield beforeRandom + createRandomString(randomLength) + suffix
-
-
def _unquoteCharRepl(matchObj):
    """
    re.sub() replacement callback inverting _quoteChar(): decode a
    matched "%XX" or "@XXXX" escape back to the unicode character.
    """
    s = matchObj.group(0)

    if s[0] == "%":
        # Two hex digits follow '%'
        v = int(s[1:3], 16)
        return unichr(v)
    else: # s[0] == "@":
        # Four hex digits follow '@'
        v = int(s[1:5], 16)
        return unichr(v)
-
-
# Matches a single escape produced by _quoteChar(): "%XX" or "@XXXX"
_FILENAME_UNQUOTE_RE = _re.compile(ur"%[A-Fa-f0-9]{2}|@[A-Fa-f0-9]{4}",
        _re.UNICODE | _re.DOTALL | _re.MULTILINE)
-
-
def guessBaseNameByFilename(filename, suffix=u""):
    """
    Try to guess the basename for a particular file name created by
    iterCompatibleFilename() as far as it can be reconstructed.

    filename -- file name, may contain a path which is stripped first
    suffix -- suffix (including the dot) to remove from the end if present
    """
    # Filename may contain a path, so at first, strip it
    filename = os.path.basename(filename)

    # BUG FIX: with the default empty suffix, filename.endswith(u"") is
    # always True and filename[:-0] would yield the empty string, so the
    # function previously always returned u"". Guard against empty suffix.
    if suffix and filename.endswith(suffix):
        filename = filename[:-len(suffix)]
    # else?

    # After a tilde begins the random part, so remove
    tildI = filename.find(u"~")
    if tildI > 0: # tildI == 0 would mean a nameless file
        filename = filename[:tildI]

    # Decode the "%XX"/"@XXXX" escapes back to unicode characters
    return _FILENAME_UNQUOTE_RE.sub(_unquoteCharRepl, filename)
-
-
-
-
# Digits and uppercase letters: alphabet for random name parts and for
# the base-36 representation in getMd5B36ByString()
_RNDBASESEQ = u"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def createRandomString(length):
    """
    Create a unicode string of length random characters and digits
    """
    chars = []
    for _unused in range(length):
        chars.append(random.choice(_RNDBASESEQ))
    return u"".join(chars)
-
-
-
- # _RNDBASENOHEX = u"GHIJKLMNOPQRSTUVWXYZ"
- #
- # def createRandomStringNoHexFirst(length):
- # """
- # Create a unicode string of length random characters and digits.
- # First char. must not be a possible hexadecimal digit.
- # """
- # if length == 0:
- # return u""
- #
- # return random.choice(_RNDBASENOHEX) + u"".join([random.choice(_RNDBASESEQ)
- # for i in range(length - 1)])
-
-
def getMd5B36ByString(text):
    """
    Calculate the MD5 hash of text (if unicode after conversion to utf-8)
    and return it as unistring for numeric base 36.

    Based on http://code.activestate.com/recipes/111286/
    """
    if isinstance(text, unicode):
        text = text.encode("utf-8")

    # Interpret the hex digest as one large integer
    x = int(hashlib.md5(text).hexdigest(), 16)

    base = len(_RNDBASESEQ)   # 36

    if x == 0:
        return _RNDBASESEQ[0]

    # Collect base-36 digits, least significant first, then reverse
    digits = []
    while x > 0:
        digits.append(_RNDBASESEQ[x % base])
        x //= base
    digits.reverse()
    return "".join(digits)
-
-
-
-
def boolToChar(b):
    """Encode truth value b as one character: "1" if true, NUL if false."""
    return "1" if b else "\0"
-
def charToBool(c):
    """Decode a character written by boolToChar() back to a bool."""
    if c == "\0":
        return False
    return True
-
def boolToInt(b):
    """Encode truth value b as integer 1 or 0."""
    return 1 if b else 0
-
-
def strToBin(s):
    """
    s -- String to convert to binary (NOT unicode!)

    Returns a 4-byte big-endian length header followed by s itself.
    """
    header = pack(">I", len(s))  # Why big-endian? Why not?
    return header + s
-
def binToStr(b):
    """
    Returns tuple (s, br) with string s and rest of the binary data br.
    Inverse of strToBin() applied to the head of b.
    """
    # First 4 bytes: big-endian payload length
    length = unpack(">I", b[:4])[0]
    payload = b[4:4 + length]
    rest = b[4 + length:]
    return (payload, rest)
-
-
- # def orderBySuggestion(strs, sugg):
- # """
- # Order string iterable strs in a way that all strings also present in
- # sequence sugg come first in resulting list, then the strings from strs
- # which are not in sugg in arbitrary order.
- # """
- # s = set(strs)
- # result = []
- # for e in sugg:
- # if e in s:
- # result.append(e)
- # s.remove(e)
- #
- # for e in s:
- # result.append(e)
- #
- # return result
-
-
def wikiUrlToPathWordAndAnchor(url):
    """
    Split a "wiki:" protocol URL into the path of the config file,
    the name of the wikiword and the anchor to open if given in query string.

    Returns (path, wikiword, anchor) where wikiword and/or anchor may be None
    """
    # Change "wiki:" url to "http:" for urlparse
    linkHt = "http:" + url[5:]
    parsed = urlparse.urlparse(linkHt)
    # Parse query string into dictionary
    queryDict = cgi.parse_qs(parsed[4])
    # Retrieve wikiword to open if existing
    # queryDict values are lists of values therefore this expression
    # NOTE(review): passes None through when key missing — presumably
    # flexibleUrlUnquote is None-safe; confirm
    wikiWordToOpen = flexibleUrlUnquote(queryDict.get("page", (None,))[0])
    anchorToOpen = flexibleUrlUnquote(queryDict.get("anchor", (None,))[0])

    # Modify parsed to create clean url by clearing query and fragment
    parsed = list(parsed)
    parsed[4] = ""
    parsed[5] = ""
    parsed = tuple(parsed)

    # [5:] strips the "http:" prefix added above before pathname conversion
    filePath = pathnameFromUrl(urlparse.urlunparse(parsed)[5:], False)

    # filePath = urllib.url2pathname(url)

    return (filePath, wikiWordToOpen, anchorToOpen)
-
-
def pathWordAndAnchorToWikiUrl(filePath, wikiWordToOpen, anchorToOpen):
    """
    Inverse of wikiUrlToPathWordAndAnchor(): build a "wiki:" URL from
    the config file path and optional wikiword and anchor (each may be
    None to omit it from the query string).
    """
    params = []
    if wikiWordToOpen is not None:
        params.append("page=" + urlQuote(wikiWordToOpen, safe=""))
    if anchorToOpen is not None:
        params.append("anchor=" + urlQuote(anchorToOpen, safe=""))

    url = "wiki:" + urlFromPathname(filePath)
    if params:
        url += "?" + "&".join(params)

    return url
-
-
def joinRegexes(patternList):
    """
    Combine the regex pattern strings in patternList into one pattern
    matching any of them, each wrapped in a non-capturing group.
    """
    inner = u")|(?:".join(patternList)
    return u"(?:(?:" + inner + u"))"
-
-
-
class SnippetCollector(object):
    """
    Collects (byte/uni)string snippets in a list. This is faster than
    using string += string.
    """
    def __init__(self):
        self.snippets = []   # collected pieces, in append order
        self.length = 0      # total length of all pieces

    def drop(self, length):
        """
        Remove last length (byte/uni)characters
        """
        assert self.length >= length

        # Walk backwards: discard whole snippets until the remainder to
        # drop is smaller than the last snippet, then trim that one.
        # (The original re-tested the same condition in a redundant
        # second `if`; it was always true when reached.)
        while length > 0 and len(self.snippets) > 0:
            last = self.snippets[-1]
            if length < len(last):
                # Only part of the last snippet is dropped
                self.snippets[-1] = last[:-length]
                self.length -= length
                break

            # The whole last snippet is dropped; continue with the rest
            length -= len(last)
            self.length -= len(last)
            del self.snippets[-1]

    def append(self, s):
        """Append snippet s; empty snippets are ignored."""
        if len(s) == 0:
            return

        self.length += len(s)
        self.snippets.append(s)

    def __iadd__(self, s):
        self.append(s)
        return self

    def value(self):
        """Return the concatenation of all collected snippets."""
        return "".join(self.snippets)

    def __len__(self):
        return self.length
-
-
class Conjunction:
    """
    Used to create SQL statements. Example:
    conjunction = Conjunction("where ", "and ")
    whereClause = ""
    if ...:
        whereClause += conjunction() + "word = ? "
    if ...:
        whereClause += conjunction() + "key = ? "

    will always create a valid where-clause
    """
    def __init__(self, firstpart, otherpart):
        # firstpart is returned by the first call only, otherpart by
        # all subsequent calls
        self.firstpart = firstpart
        self.otherpart = otherpart
        self.first = True

    def __call__(self):
        if not self.first:
            return self.otherpart

        self.first = False
        return self.firstpart

    def __repr__(self):
        return "<Conjunction(%s, %s) %s>" % (self.firstpart, self.otherpart,
                self.first)
-
-
-
- # ---------- Handling diff information ----------
-
-
- def difflibToCompact(ops, b):
- """
- Rewrite sequence of op_codes returned by difflib.SequenceMatcher.get_opcodes
- to the compact opcode format.
-
- 0: replace, 1: delete, 2: insert
-
- b -- second string to match
- """
- result = []
- # ops.reverse()
- for tag, i1, i2, j1, j2 in ops:
- if tag == "equal":
- continue
- elif tag == "replace":
- result.append((0, i1, i2, b[j1:j2]))
- elif tag == "delete":
- result.append((1, i1, i2))
- elif tag == "insert":
- result.append((2, i1, b[j1:j2]))
-
- retur…
Large files files are truncated, but you can click here to view the full file