/python/lib/django-0.96/django/utils/text.py
Python | 204 lines | 196 code | 3 blank | 5 comment | 10 complexity | e2f3d5fc01742ae5df69770756f54381 MD5 | raw file
- import re
- from django.conf import settings
def capfirst(x):
    """
    Returns ``x`` with its first character upper-cased and the rest left
    untouched. Falsy values (empty string, None) are returned unchanged.
    """
    if not x:
        return x
    return x[0].upper() + x[1:]
def wrap(text, width):
    """
    Word-wraps ``text`` so that lines break once they would exceed ``width``
    columns, keeping existing line breaks and most spaces intact. Existing
    line breaks are expected to be posix newlines.
    """
    words = text.split(' ')
    # str.split(' ') always yields at least one element, even for ''.
    first = words[0]
    pieces = [first]
    # Column reached after the first word, counted from its last newline.
    column = len(first) - first.rfind('\n') - 1
    for word in words[1:]:
        # A "word" may itself span several lines; only its first segment
        # extends the current line and only its last one starts the next.
        segments = word.split('\n')
        column += len(segments[0]) + 1
        if column > width:
            pieces.append('\n')
            column = len(segments[-1])
        else:
            pieces.append(' ')
            if len(segments) > 1:
                column = len(segments[-1])
        pieces.append(word)
    return "".join(pieces)
def truncate_words(s, num):
    """
    Truncates a string after a certain number of words.

    ``num`` is coerced with int(). Words are separated by any run of
    whitespace and re-joined with single spaces. When words are dropped, a
    trailing '...' word is appended unless the last kept word already ends
    with '...'. A ``num`` of zero or less returns the empty string.
    """
    length = int(num)
    if length <= 0:
        # Without this guard words[:length] is empty (or a meaningless
        # negative slice) and words[-1] below raises IndexError.
        return ''
    words = s.split()
    if len(words) > length:
        words = words[:length]
        if not words[-1].endswith('...'):
            words.append('...')
    return ' '.join(words)
def truncate_html_words(s, num):
    """
    Truncates html to a certain number of words (not counting tags and comments).
    Closes opened tags if they were correctly closed in the given html.

    ``num`` is coerced with int(); zero or less returns the empty string.
    If the text holds no more than ``num`` words it is returned unchanged.
    Otherwise the text is cut right after the ``num``-th word, ' ...' is
    appended, and closing tags are added for every element still open at
    the cut.
    """
    length = int(num)
    if length <= 0:
        return ''
    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
    # Set up regular expressions
    re_words = re.compile(r'&.*?;|<.*?>|([A-Za-z0-9][\w-]*)')
    # Groups: closing slash, tag name, self-closing slash. The self-closing
    # slash is accepted with or without whitespace before it, so both
    # '<br />' and '<br/>' are recognised.
    re_tag = re.compile(r'<(/)?([^ ]+?)(?:(\s*/)| .*?)?>')
    # Count non-HTML words and keep note of open tags
    pos = 0
    ellipsis_pos = 0
    words = 0
    open_tags = []
    while words <= length:
        m = re_words.search(s, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m.group(1):
            # It's an actual non-HTML word
            words += 1
            if words == length:
                ellipsis_pos = pos
            continue
        # Check for tag
        tag = re_tag.match(m.group(0))
        if not tag or ellipsis_pos:
            # Don't worry about non tags or tags after our truncate point
            continue
        closing_tag, tagname, self_closing = tag.groups()
        tagname = tagname.lower()  # Element names are always case-insensitive
        if tagname[0] in '!?':
            # Comments, doctypes and processing instructions are not
            # elements, so they must never be given a closing tag.
            continue
        if self_closing or tagname in html4_singlets:
            continue
        if closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag,
                # all unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i+1:]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)
    if words <= length:
        # Don't try to close tags if we don't need to truncate
        return s
    # Cut after the last kept word and close any tags still open,
    # innermost first.
    closers = ''.join(['</%s>' % name for name in open_tags])
    return s[:ellipsis_pos] + ' ...' + closers
def get_valid_filename(s):
    """
    Converts the given string into one that is usable as a clean filename:
    surrounding whitespace is stripped, remaining spaces become underscores,
    and every character other than ASCII letters, digits, '-', '_' and '.'
    is dropped.

    >>> get_valid_filename("john's portrait in 2004.jpg")
    'johns_portrait_in_2004.jpg'
    """
    underscored = s.strip().replace(' ', '_')
    unsafe_chars = re.compile(r'[^-A-Za-z0-9_.]')
    return unsafe_chars.sub('', underscored)
def get_text_list(list_, last_word='or'):
    """
    Returns the items of ``list_`` as one human-readable string: items are
    separated by ', ' except the final pair, which is joined by
    ``last_word``. The result is always a string, including when the list
    holds a single non-string item.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([1])
    '1'
    >>> get_text_list([])
    ''
    """
    count = len(list_)
    if count == 0:
        return ''
    if count == 1:
        # Format rather than return the raw item so that a lone non-string
        # item yields text, just like the multi-item path does.
        return '%s' % (list_[0],)
    leading = ', '.join([str(item) for item in list_[:-1]])
    return '%s %s %s' % (leading, last_word, list_[-1])
def normalize_newlines(text):
    "Replaces every CRLF, lone CR and LF line ending in text with a single LF."
    line_ending = re.compile(r'\r\n|\r|\n')
    return line_ending.sub('\n', text)
def recapitalize(text):
    "Recapitalizes text, placing caps after end-of-sentence punctuation."
    lowered = text.lower()
    # A sentence start is an a-z letter at the very beginning of the text or
    # directly after '.', '?' or '!' followed by exactly one space.
    sentence_start = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')

    def _upper(match):
        return match.group(1).upper()

    return sentence_start.sub(_upper, lowered)
def phone2numeric(phone):
    """
    Converts a phone number with letters into its numeric equivalent.

    Letters are matched case-insensitively and replaced by their telephone
    keypad digit (ABC=2, DEF=3, GHI=4, JKL=5, MNO=6, PQRS=7, TUV=8, WXYZ=9).
    All other characters are left as they are.
    """
    keypad = (
        ('abc', '2'), ('def', '3'), ('ghi', '4'), ('jkl', '5'),
        ('mno', '6'), ('pqrs', '7'), ('tuv', '8'), ('wxyz', '9'),
    )
    digit_for = {}
    for group, digit in keypad:
        for letter in group:
            digit_for[letter] = digit
    letters = re.compile(r'[A-Z]', re.I)

    def char2number(m):
        char = m.group(0)
        # Fall back to the matched character so the substitution callback
        # never returns None for a match that has no table entry.
        return digit_for.get(char.lower(), char)

    return letters.sub(char2number, phone)
# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
# Used with permission.
def compress_string(s):
    """
    Returns the byte string ``s`` compressed in gzip format (compression
    level 6), built entirely in memory.
    """
    import gzip
    import io
    zbuf = io.BytesIO()
    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    try:
        zfile.write(s)
    finally:
        # Closing writes the gzip trailer; do it even if write() fails so
        # the GzipFile is never left open.
        zfile.close()
    return zbuf.getvalue()
# Matches any single character in the range U+0080 - U+FFFF.
ustring_re = re.compile(u"([\u0080-\uffff])")

def javascript_quote(s, quote_double_quotes=False):
    """
    Escapes ``s`` for use inside a single-quoted JavaScript string literal.

    Backslashes, carriage returns, newlines, tabs and single quotes are
    backslash-escaped, and characters matched by ``ustring_re`` are written
    as \\uXXXX escapes. If ``quote_double_quotes`` is true, double quotes
    are replaced by the HTML entity ``&quot;``.

    Byte strings are first decoded using settings.DEFAULT_CHARSET. Raises
    TypeError if ``s`` is neither a byte string nor a unicode string.
    """
    def fix(match):
        return r"\u%04x" % ord(match.group(1))

    if isinstance(s, bytes):
        s = s.decode(settings.DEFAULT_CHARSET)
    elif not isinstance(s, type(u"")):
        raise TypeError(s)
    # The backslash must be escaped first, otherwise the backslashes
    # introduced by the later replacements would be doubled.
    replacements = (
        ('\\', '\\\\'),
        ('\r', '\\r'),
        ('\n', '\\n'),
        ('\t', '\\t'),
        ("'", "\\'"),
    )
    for raw, escaped in replacements:
        s = s.replace(raw, escaped)
    if quote_double_quotes:
        s = s.replace('"', '&quot;')
    return str(ustring_re.sub(fix, s))
# One token is a double-quoted phrase, a single-quoted phrase (either may
# contain backslash escapes), or a run of non-whitespace characters.
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')

def smart_split(text):
    """
    Generator that splits a string on whitespace while keeping quoted
    phrases in one piece. Both single and double quotes are supported, and
    quotes can be escaped with backslashes. Quoted phrases are yielded with
    their surrounding quote marks; inside them, escaped quotes of the same
    kind and escaped backslashes are unescaped.

    >>> list(smart_split('This is "a person\'s" test.'))
    ['This', 'is', '"a person\'s"', 'test.']
    """
    for match in smart_split_re.finditer(text):
        token = match.group(0)
        quote = token[0]
        if quote not in ('"', "'"):
            yield token
            continue
        # Unescape the quote character first, then escaped backslashes.
        inner = token[1:-1].replace('\\' + quote, quote)
        inner = inner.replace('\\\\', '\\')
        yield quote + inner + quote