PageRenderTime 27ms CodeModel.GetById 26ms RepoModel.GetById 0ms app.codeStats 0ms

/donomo_archive/lib/reportlab/lib/textsplit.py

https://github.com/alexissmirnov/donomo
Python | 210 lines | 204 code | 1 blank | 5 comment | 0 complexity | b60572385b3fb21e3b44f88f589e8471 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. #Copyright ReportLab Europe Ltd. 2000-2006
  2. #see license.txt for license details
  3. #history http://www.reportlab.co.uk/cgi-bin/viewcvs.cgi/public/reportlab/trunk/reportlab/lib/textsplit.py
  4. """Helpers for text wrapping, hyphenation, Asian text splitting and kinsoku shori.
  5. How to split a 'big word' depends on the language and the writing system. This module
  6. works on a Unicode string. It ought to grow by allowing more algorithms to be plugged
  7. in based on possible knowledge of the language and desirable 'niceness' of the algorithm.
  8. """
  9. __version__=''' $Id: textsplit.py 2833 2006-04-05 16:01:20Z rgbecker $ '''
  10. from types import StringType, UnicodeType
  11. import unicodedata
  12. from reportlab.pdfbase.pdfmetrics import stringWidth
  13. from reportlab.rl_config import _FUZZ
#Japanese kinsoku-shori character classes: characters that must not begin
#a line, grouped from strongest to weakest prohibition.  Used by dumbSplit.
CANNOT_START_LINE = [
    #strongly prohibited e.g. end brackets, stop, exclamation...
    u'!\',.:;?!")]\u3001\u3002\u300d\u300f\u3011\u3015\uff3d\u3011\uff09',
    #middle priority e.g. continuation small vowels - wrapped on two lines but one string...
    u'\u3005\u2015\u3041\u3043\u3045\u3047\u3049\u3063\u3083\u3085\u3087\u308e\u30a1\u30a3'
    u'\u30a5\u30a7\u30a9\u30c3\u30e3\u30e5\u30e7\u30ee\u30fc\u30f5\u30f6',
    #weakly prohibited - continuations, celsius symbol etc.
    u'\u309b\u309c\u30fb\u30fd\u30fe\u309d\u309e\u2015\u2010\xb0\u2032\u2033\u2103\uffe0\uff05\u2030'
    ]
#flattened into one string for fast 'in' membership tests
ALL_CANNOT_START = u''.join(CANNOT_START_LINE)

#Characters that must not end a line: opening quotes/brackets and
#prefix symbols (currency signs, hash, postcode mark...).
CANNOT_END_LINE = [
    #strongly prohibited
    u'\u2018\u201c\uff08[{\uff08\u3014\uff3b\uff5b\u3008\u300a\u300c\u300e\u3010',
    #weaker - currency symbols, hash, postcode - prefixes
    u'$\u00a3@#\uffe5\uff04\uffe1\uff20\u3012\u00a7'
    ]
#flattened counterpart of ALL_CANNOT_START (only the grouped list is used
#by kinsokuShoriSplit so far)
ALL_CANNOT_END = u''.join(CANNOT_END_LINE)
  31. def getCharWidths(word, fontName, fontSize):
  32. """Returns a list of glyph widths. Should be easy to optimize in _rl_accel
  33. >>> getCharWidths('Hello', 'Courier', 10)
  34. [6.0, 6.0, 6.0, 6.0, 6.0]
  35. >>> from reportlab.pdfbase.cidfonts import UnicodeCIDFont
  36. >>> from reportlab.pdfbase.pdfmetrics import registerFont
  37. >>> registerFont(UnicodeCIDFont('HeiseiMin-W3'))
  38. >>> getCharWidths(u'\u6771\u4EAC', 'HeiseiMin-W3', 10) #most kanji are 100 ems
  39. [10.0, 10.0]
  40. """
  41. #character-level function call; the performance is going to SUCK
  42. return [stringWidth(uChar, fontName, fontSize) for uChar in word]
  43. def wordSplit(word, availWidth, fontName, fontSize, encoding='utf8'):
  44. """Attempts to break a word which lacks spaces into two parts, the first of which
  45. fits in the remaining space. It is allowed to add hyphens or whatever it wishes.
  46. This is intended as a wrapper for some language- and user-choice-specific splitting
  47. algorithms. It should only be called after line breaking on spaces, which covers western
  48. languages and is highly optimised already. It works on the 'last unsplit word'.
  49. Presumably with further study one could write a Unicode splitting algorithm for text
  50. fragments whick was much faster.
  51. Courier characters should be 6 points wide.
  52. >>> wordSplit('HelloWorld', 30, 'Courier', 10)
  53. [[0.0, 'Hello'], [0.0, 'World']]
  54. >>> wordSplit('HelloWorld', 31, 'Courier', 10)
  55. [[1.0, 'Hello'], [1.0, 'World']]
  56. """
  57. if type(word) is not UnicodeType:
  58. uword = word.decode(encoding)
  59. else:
  60. uword = word
  61. charWidths = getCharWidths(uword, fontName, fontSize)
  62. lines = dumbSplit(uword, charWidths, availWidth)
  63. if type(word) is not UnicodeType:
  64. lines2 = []
  65. #convert back
  66. for (extraSpace, text) in lines:
  67. lines2.append([extraSpace, text.encode(encoding)])
  68. lines = lines2
  69. return lines
  70. def dumbSplit(word, widths, availWidth):
  71. """This function attempts to fit as many characters as possible into the available
  72. space, cutting "like a knife" between characters. This would do for Chinese.
  73. It returns a list of (text, extraSpace) items where text is a Unicode string,
  74. and extraSpace is the points of unused space available on the line. This is a
  75. structure which is fairly easy to display, and supports 'backtracking' approaches
  76. after the fact.
  77. Test cases assume each character is ten points wide...
  78. >>> dumbSplit(u'Hello', [10]*5, 60)
  79. [[10.0, u'Hello']]
  80. >>> dumbSplit(u'Hello', [10]*5, 50)
  81. [[0.0, u'Hello']]
  82. >>> dumbSplit(u'Hello', [10]*5, 40)
  83. [[0.0, u'Hell'], [30, u'o']]
  84. """
  85. _more = """
  86. #>>> dumbSplit(u'Hello', [10]*5, 4) # less than one character
  87. #(u'', u'Hello')
  88. # this says 'Nihongo wa muzukashii desu ne!' (Japanese is difficult isn't it?) in 12 characters
  89. >>> jtext = u'\u65e5\u672c\u8a9e\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01'
  90. >>> dumbSplit(jtext, [10]*11, 30) #
  91. (u'\u65e5\u672c\u8a9e', u'\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01')
  92. """
  93. assert type(word) is UnicodeType
  94. lines = []
  95. widthUsed = 0.0
  96. lineStartPos = 0
  97. for (i, w) in enumerate(widths):
  98. widthUsed += w
  99. if widthUsed > availWidth + _FUZZ:
  100. #used more than can fit...
  101. #ping out with previous cut, then set up next line with one character
  102. extraSpace = availWidth - widthUsed + w
  103. #print 'ending a line; used %d, available %d' % (widthUsed, availWidth)
  104. selected = word[lineStartPos:i]
  105. #This is the most important of the Japanese typography rules.
  106. #if next character cannot start a line, wrap it up to this line so it hangs
  107. #in the right margin. We won't do two or more though - that's unlikely and
  108. #would result in growing ugliness.
  109. nextChar = word[i]
  110. if nextChar in ALL_CANNOT_START:
  111. #it's punctuation or a closing bracket of some kind. 'wrap up'
  112. #so it stays on the line above, slightly exceeding our target width.
  113. #print 'wrapping up', repr(nextChar)
  114. selected += nextChar
  115. extraSpace -= w
  116. i += 1
  117. lines.append([extraSpace, selected])
  118. lineStartPos = i
  119. widthUsed = w
  120. i -= 1
  121. #any characters left?
  122. if widthUsed > 0:
  123. extraSpace = availWidth - widthUsed
  124. lines.append([extraSpace, word[lineStartPos:]])
  125. return lines
def kinsokuShoriSplit(word, widths, availWidth):
    #NOT USED OR FINISHED YET!
    """Split according to Japanese rules according to CJKV (Lunde).

    Essentially look for "nice splits" so that we don't end a line
    with an open bracket, or start one with a full stop, or stuff like
    that.  There is no attempt to try to split compound words into
    constituent kanji.  It currently uses wrap-down: packs as much
    on a line as possible, then backtracks if needed.

    This returns a number of words each of which should just about fit
    on a line.  If you give it a whole paragraph at once, it will
    do all the splits.

    It's possible we might slightly step over the width limit
    if we do hanging punctuation marks in future (e.g. dangle a Japanese
    full stop in the right margin rather than using a whole character
    box).
    """
    #NOTE(review): as written this never terminates or returns anything -
    #'i' is never advanced inside the loop and there is no break.  It is an
    #unfinished sketch, per the marker above; left untouched.
    lines = []
    assert len(word) == len(widths)
    curWidth = 0.0
    curLine = []
    i = 0 #character index - we backtrack at times so cannot use for loop
    while 1:
        ch = word[i]
        w = widths[i]
        if curWidth + w < availWidth:
            curLine.append(ch)
            curWidth += w
        else:
            #end of line. check legality
            if ch in CANNOT_END_LINE[0]:
                pass
            #to be completed
# This recipe refers:
#
#  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
import re
#matches any single character in the CJK range U+2E80..U+FFFF
rx=re.compile(u"([\u2e80-\uffff])", re.UNICODE)
def cjkwrap(text, width, encoding="utf8"):
    """Wrap byte-string text (in the given encoding) to at most width
    columns, treating every CJK character as an individually breakable
    word; returns a byte string in the same encoding.
    """
    #How it works: each CJK char is tagged with '\0 ' so split(' ') treats
    #it as a one-character word.  The reduce re-joins the words, picking
    #the separator from [' ', '\n', ''] by index:
    #  1 ('\n') when adding the word would reach/exceed width on the
    #    current line,
    #  2 ('')   otherwise, when the previous word ends in the '\0' tag
    #    (i.e. was CJK - no space inside CJK runs),
    #  0 (' ')  otherwise.
    #Finally the '\0' tags are stripped and the text re-encoded.
    return reduce(lambda line, word, width=width: '%s%s%s' %
                  (line,
                   [' ','\n', ''][(len(line)-line.rfind('\n')-1
                                   + len(word.split('\n',1)[0] ) >= width) or
                                  line[-1:] == '\0' and 2],
                   word),
                  rx.sub(r'\1\0 ', unicode(text,encoding)).split(' ')
                  ).replace('\0', '').encode(encoding)
  172. if __name__=='__main__':
  173. import doctest, textsplit
  174. doctest.testmod(textsplit)