/donomo_archive/lib/reportlab/lib/textsplit.py
Python | 210 lines | 204 code | 1 blank | 5 comment | 0 complexity | b60572385b3fb21e3b44f88f589e8471 MD5 | raw file
Possible License(s): BSD-3-Clause
- #Copyright ReportLab Europe Ltd. 2000-2006
- #see license.txt for license details
- #history http://www.reportlab.co.uk/cgi-bin/viewcvs.cgi/public/reportlab/trunk/reportlab/lib/textsplit.py
- """Helpers for text wrapping, hyphenation, Asian text splitting and kinsoku shori.
- How to split a 'big word' depends on the language and the writing system. This module
- works on a Unicode string. It ought to grow by allowing more algorithms to be plugged
- in based on possible knowledge of the language and desirable 'niceness' of the algorithm.
- """
- __version__=''' $Id: textsplit.py 2833 2006-04-05 16:01:20Z rgbecker $ '''
- from types import StringType, UnicodeType
- import unicodedata
- from reportlab.pdfbase.pdfmetrics import stringWidth
- from reportlab.rl_config import _FUZZ
#Kinsoku shori character classes (Japanese line-breaking rules, see CJKV by
#Ken Lunde). These strings are only used for membership tests with `in`, so
#the occasional duplicated character (e.g. \u3011 below) is harmless.
CANNOT_START_LINE = [
    #strongly prohibited e.g. end brackets, stop, exclamation...
    u'!\',.:;?!")]\u3001\u3002\u300d\u300f\u3011\u3015\uff3d\u3011\uff09',
    #middle priority e.g. continuation small vowels - wrapped on two lines but one string...
    u'\u3005\u2015\u3041\u3043\u3045\u3047\u3049\u3063\u3083\u3085\u3087\u308e\u30a1\u30a3'
    u'\u30a5\u30a7\u30a9\u30c3\u30e3\u30e5\u30e7\u30ee\u30fc\u30f5\u30f6',
    #weakly prohibited - continuations, celsius symbol etc.
    u'\u309b\u309c\u30fb\u30fd\u30fe\u309d\u309e\u2015\u2010\xb0\u2032\u2033\u2103\uffe0\uff05\u2030'
    ]
#flattened version for fast membership tests in dumbSplit
ALL_CANNOT_START = u''.join(CANNOT_START_LINE)
CANNOT_END_LINE = [
    #strongly prohibited
    u'\u2018\u201c\uff08[{\uff08\u3014\uff3b\uff5b\u3008\u300a\u300c\u300e\u3010',
    #weaker - currency symbols, hash, postcode - prefixes
    u'$\u00a3@#\uffe5\uff04\uffe1\uff20\u3012\u00a7'
    ]
#flattened version; currently only the list form is consulted in this module
ALL_CANNOT_END = u''.join(CANNOT_END_LINE)
def getCharWidths(word, fontName, fontSize):
    """Returns a list of glyph widths.  Should be easy to optimize in _rl_accel

    >>> getCharWidths('Hello', 'Courier', 10)
    [6.0, 6.0, 6.0, 6.0, 6.0]
    >>> from reportlab.pdfbase.cidfonts import UnicodeCIDFont
    >>> from reportlab.pdfbase.pdfmetrics import registerFont
    >>> registerFont(UnicodeCIDFont('HeiseiMin-W3'))
    >>> getCharWidths(u'\u6771\u4EAC', 'HeiseiMin-W3', 10) #most kanji are 100 ems
    [10.0, 10.0]
    """
    #one stringWidth call per character; the performance is going to SUCK
    widths = []
    for ch in word:
        widths.append(stringWidth(ch, fontName, fontSize))
    return widths
def wordSplit(word, availWidth, fontName, fontSize, encoding='utf8'):
    """Attempts to break a word which lacks spaces into two parts, the first of which
    fits in the remaining space.  It is allowed to add hyphens or whatever it wishes.

    This is intended as a wrapper for some language- and user-choice-specific splitting
    algorithms.  It should only be called after line breaking on spaces, which covers western
    languages and is highly optimised already.  It works on the 'last unsplit word'.

    Presumably with further study one could write a Unicode splitting algorithm for text
    fragments which was much faster.

    Courier characters should be 6 points wide.
    >>> wordSplit('HelloWorld', 30, 'Courier', 10)
    [[0.0, 'Hello'], [0.0, 'World']]
    >>> wordSplit('HelloWorld', 31, 'Courier', 10)
    [[1.0, 'Hello'], [1.0, 'World']]
    """
    #Accept byte strings as well as unicode and return the result in the same
    #form the caller gave us.  isinstance(word, bytes) is equivalent to the old
    #`type(word) is not UnicodeType` test on Python 2 (where bytes is str) and,
    #unlike the removed `types` names, also works on Python 3.
    isBytes = isinstance(word, bytes)
    if isBytes:
        uword = word.decode(encoding)
    else:
        uword = word

    charWidths = getCharWidths(uword, fontName, fontSize)
    lines = dumbSplit(uword, charWidths, availWidth)

    if isBytes:
        #convert the text parts back to the caller's encoding
        lines = [[extraSpace, text.encode(encoding)]
                 for (extraSpace, text) in lines]
    return lines
def dumbSplit(word, widths, availWidth):
    """This function attempts to fit as many characters as possible into the available
    space, cutting "like a knife" between characters.  This would do for Chinese.
    It returns a list of [extraSpace, text] items where text is a Unicode string,
    and extraSpace is the points of unused space available on the line.  This is a
    structure which is fairly easy to display, and supports 'backtracking' approaches
    after the fact.

    Test cases assume each character is ten points wide...

    >>> dumbSplit(u'Hello', [10]*5, 60)
    [[10.0, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 50)
    [[0.0, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 40)
    [[0.0, u'Hell'], [30, u'o']]
    """
    #disabled doctests kept for future work (Japanese sample data)
    _more = """
    #>>> dumbSplit(u'Hello', [10]*5, 4) # less than one character
    #(u'', u'Hello')
    # this says 'Nihongo wa muzukashii desu ne!' (Japanese is difficult isn't it?) in 12 characters
    >>> jtext = u'\u65e5\u672c\u8a9e\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01'
    >>> dumbSplit(jtext, [10]*11, 30)   #
    (u'\u65e5\u672c\u8a9e', u'\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01')
    """
    lines = []          #accumulated [extraSpace, text] pairs
    widthUsed = 0.0     #width consumed by the line currently being built
    lineStartPos = 0    #index in word where the current line starts
    for i, w in enumerate(widths):
        widthUsed += w
        if widthUsed > availWidth + _FUZZ:
            #used more than can fit - close the line before this character
            extraSpace = availWidth - widthUsed + w
            selected = word[lineStartPos:i]
            #This is the most important of the Japanese typography rules.
            #If the next character cannot start a line (closing bracket,
            #full stop etc.), wrap it up to this line so it hangs in the
            #right margin.  We won't do two or more though - that's unlikely
            #and would result in growing ugliness.
            nextChar = word[i]
            if nextChar in ALL_CANNOT_START:
                selected += nextChar
                extraSpace -= w
                #the wrapped-up character is consumed: the next line starts
                #after it with no width used yet.  (The original carried w
                #over here, inflating the next line's measured width and
                #emitting a bogus empty trailing line when the wrapped
                #character was the last one.)
                lineStartPos = i + 1
                widthUsed = 0.0
            else:
                lineStartPos = i
                widthUsed = w
            lines.append([extraSpace, selected])
    #any characters left?  flush them as the final, short line
    if widthUsed > 0:
        lines.append([availWidth - widthUsed, word[lineStartPos:]])
    return lines
def kinsokuShoriSplit(word, widths, availWidth):
    #NOT USED OR FINISHED YET!
    """Split according to Japanese rules according to CJKV (Lunde).

    Essentially look for "nice splits" so that we don't end a line
    with an open bracket, or start one with a full stop, or stuff like
    that.  There is no attempt to try to split compound words into
    constituent kanji.  It currently uses wrap-down: packs as much
    on a line as possible, then backtracks if needed

    This returns a number of words each of which should just about fit
    on a line.  If you give it a whole paragraph at once, it will
    do all the splits.

    It's possible we might slightly step over the width limit
    if we do hanging punctuation marks in future (e.g. dangle a Japanese
    full stop in the right margin rather than using a whole character
    box.
    """
    #NOTE(review): this function is unfinished - the index `i` is never
    #advanced inside the loop below, so as written it would spin forever
    #on the first character.  Do not call until it is completed.
    lines = []
    assert len(word) == len(widths)
    curWidth = 0.0      #width consumed on the line being built
    curLine = []        #characters accumulated for the current line
    i = 0 #character index - we backtrack at times so cannot use for loop
    while 1:
        ch = word[i]
        w = widths[i]
        if curWidth + w < availWidth:
            curLine.append(ch)
            curWidth += w
        else:
            #end of line. check legality
            if ch in CANNOT_END_LINE[0]:
                pass
    #to be completed
# This recipe refers:
#
#  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
import re
#matches (and captures) any single character in the CJK / upper-BMP range
rx=re.compile(u"([\u2e80-\uffff])", re.UNICODE)
def cjkwrap(text, width, encoding="utf8"):
    #NOTE(review): Python 2 only - relies on the `unicode` builtin and the
    #global `reduce`, neither of which exists under Python 3.
    #How it works: rx.sub marks every CJK character with a trailing '\0 '
    #so that split(' ') yields breakable "words"; the reduce then rejoins
    #them, picking ' ', '\n' or '' as separator depending on whether the
    #current line would exceed `width` (index 1 -> newline) or the previous
    #piece ended in a '\0' marker (index 2 -> no space); finally the '\0'
    #markers are stripped and the text re-encoded.
    return reduce(lambda line, word, width=width: '%s%s%s' %
                (line,
                 [' ','\n', ''][(len(line)-line.rfind('\n')-1
                       + len(word.split('\n',1)[0] ) >= width) or
                      line[-1:] == '\0' and 2],
                 word),
                rx.sub(r'\1\0 ', unicode(text,encoding)).split(' ')
               ).replace('\0', '').encode(encoding)
if __name__=='__main__':
    #run the doctests embedded in this module's functions
    import doctest
    import textsplit
    doctest.testmod(textsplit)