PageRenderTime 515ms CodeModel.GetById 157ms app.highlight 226ms RepoModel.GetById 124ms app.codeStats 1ms

/src/linebreak/linebreak.c

http://ftk.googlecode.com/
C | 734 lines | 482 code | 39 blank | 213 comment | 67 complexity | 2c9982e70e464692f69c1c214521dace MD5 | raw file
  1/* vim: set tabstop=4 shiftwidth=4: */
  2
  3/*
  4 * Line breaking in a Unicode sequence.  Designed to be used in a
  5 * generic text renderer.
  6 *
  7 * Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
  8 *
  9 * This software is provided 'as-is', without any express or implied
 10 * warranty.  In no event will the author be held liable for any damages
 11 * arising from the use of this software.
 12 *
 13 * Permission is granted to anyone to use this software for any purpose,
 14 * including commercial applications, and to alter it and redistribute
 15 * it freely, subject to the following restrictions:
 16 *
 17 * 1. The origin of this software must not be misrepresented; you must
 18 *    not claim that you wrote the original software.  If you use this
 19 *    software in a product, an acknowledgement in the product
 20 *    documentation would be appreciated but is not required.
 21 * 2. Altered source versions must be plainly marked as such, and must
 22 *    not be misrepresented as being the original software.
 23 * 3. This notice may not be removed or altered from any source
 24 *    distribution.
 25 *
 26 * The main reference is Unicode Standard Annex 14 (UAX #14):
 27 *		<URL:http://www.unicode.org/reports/tr14/>
 28 *
 29 * When this library was designed, this annex was at Revision 19, for
 30 * Unicode 5.0.0:
 31 *		<URL:http://www.unicode.org/reports/tr14/tr14-19.html>
 32 *
 33 * This library has been updated according to Revision 24, for
 34 * Unicode 5.2.0:
 35 *		<URL:http://www.unicode.org/reports/tr14/tr14-24.html>
 36 *
 37 * The Unicode Terms of Use are available at
 38 *		<URL:http://www.unicode.org/copyright.html>
 39 */
 40
 41/**
 42 * @file	linebreak.c
 43 *
 44 * Implementation of the line breaking algorithm as described in Unicode
 45 * Standard Annex 14.
 46 *
 47 * @version	2.0, 2010/01/03
 48 * @author	Wu Yongwei
 49 */
 50
 51#include <assert.h>
 52#include <stddef.h>
 53#include <string.h>
 54#include "linebreak.h"
 55#include "linebreakdef.h"
 56
/**
 * Number of entries in the second-level index to the line breaking
 * properties, i.e. the size of the lb_prop_index array.
 */
#define LINEBREAK_INDEX_SIZE 40

/**
 * Version number of the library, taken from LINEBREAK_VERSION.
 */
const int linebreak_version = LINEBREAK_VERSION;
 66
 67/**
 68 * Enumeration of break actions.  They are used in the break action
 69 * pair table below.
 70 */
 71enum BreakAction
 72{
 73	DIR_BRK,		/**< Direct break opportunity */
 74	IND_BRK,		/**< Indirect break opportunity */
 75	CMI_BRK,		/**< Indirect break opportunity for combining marks */
 76	CMP_BRK,		/**< Prohibited break for combining marks */
 77	PRH_BRK			/**< Prohibited break */
 78};
 79
 80/**
 81 * Break action pair table.  This is a direct mapping of Table 2 of
 82 * Unicode Standard Annex 14, Revision 24.
 83 */
 84static enum BreakAction baTable[LBP_JT][LBP_JT] = {
 85	{	/* OP */
 86		PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
 87		PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
 88		PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
 89		PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
 90	{	/* CL */
 91		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
 92		PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
 93		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 94		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
 95	{	/* CP */
 96		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
 97		PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
 98		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
 99		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
100	{	/* QU */
101		PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
102		PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
103		IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
104		PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
105	{	/* GL */
106		IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
107		PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
108		IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
109		PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
110	{	/* NS */
111		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
112		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
113		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
114		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
115	{	/* EX */
116		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
117		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
118		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
119		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
120	{	/* SY */
121		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
122		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
123		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
124		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
125	{	/* IS */
126		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
127		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
128		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
129		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
130	{	/* PR */
131		IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
132		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
133		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
134		PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
135	{	/* PO */
136		IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
137		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
138		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
139		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
140	{	/* NU */
141		IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
142		PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
143		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
144		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
145	{	/* AL */
146		IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
147		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
148		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
149		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
150	{	/* ID */
151		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
152		PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
153		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
154		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
155	{	/* IN */
156		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
157		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
158		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
159		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
160	{	/* HY */
161		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
162		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
163		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
164		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
165	{	/* BA */
166		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
167		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
168		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
169		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
170	{	/* BB */
171		IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
172		PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
173		IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
174		PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
175	{	/* B2 */
176		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
177		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
178		DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
179		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
180	{	/* ZW */
181		DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
182		DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
183		DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
184		DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
185	{	/* CM */
186		IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
187		PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
188		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
189		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
190	{	/* WJ */
191		IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
192		PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
193		IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
194		PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
195	{	/* H2 */
196		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
197		PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
198		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
199		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
200	{	/* H3 */
201		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
202		PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
203		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
204		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
205	{	/* JL */
206		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
207		PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
208		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
209		PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
210	{	/* JV */
211		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
212		PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
213		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
214		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
215	{	/* JT */
216		DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
217		PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
218		IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
219		PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
220};
221
/**
 * Entry of the second-level index to the line breaking properties.
 * Each entry pairs the last code point it is responsible for with the
 * position in a line breaking properties array where the search for
 * such a code point should begin.
 */
struct LineBreakPropertiesIndex
{
	utf32_t end;					/**< Last code point covered by this entry */
	struct LineBreakProperties *lbp;/**< First properties entry to search from */
};
230
/**
 * Second-level index to the line breaking properties.
 *
 * Only the first entry is initialized statically: its end value is
 * 0xFFFFFFFF and it points at the start of lb_prop_default, so a lookup
 * made before #init_linebreak is called stops at this entry and scans
 * the default properties from their beginning.
 */
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
{
	{ 0xFFFFFFFF, lb_prop_default }
};
238
239/**
240 * Initializes the second-level index to the line breaking properties.
241 * If it is not called, the performance of #get_char_lb_class_lang (and
242 * thus the main functionality) can be pretty bad, especially for big
243 * code points like those of Chinese.
244 */
245void init_linebreak(void)
246{
247	size_t i;
248	size_t iPropDefault;
249	size_t len;
250	size_t step;
251
252	len = 0;
253	while (lb_prop_default[len].prop != LBP_Undefined)
254		++len;
255	step = len / LINEBREAK_INDEX_SIZE;
256	iPropDefault = 0;
257	for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
258	{
259		lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
260		iPropDefault += step;
261		lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
262	}
263	lb_prop_index[--i].end = 0xFFFFFFFF;
264}
265
266/**
267 * Gets the language-specific line breaking properties.
268 *
269 * @param lang	language of the text
270 * @return		pointer to the language-specific line breaking
271 *				properties array if found; \c NULL otherwise
272 */
273static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
274{
275	struct LineBreakPropertiesLang *lbplIter;
276	if (lang != NULL)
277	{
278		for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
279		{
280			if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
281			{
282				return lbplIter->lbp;
283			}
284		}
285	}
286	return NULL;
287}
288
289/**
290 * Gets the line breaking class of a character from a line breaking
291 * properties array.
292 *
293 * @param ch	character to check
294 * @param lbp	pointer to the line breaking properties array
295 * @return		the line breaking class if found; \c LBP_XX otherwise
296 */
297static enum LineBreakClass get_char_lb_class(
298		utf32_t ch,
299		struct LineBreakProperties *lbp)
300{
301	while (lbp->prop != LBP_Undefined && ch >= lbp->start)
302	{
303		if (ch <= lbp->end)
304			return lbp->prop;
305		++lbp;
306	}
307	return LBP_XX;
308}
309
310/**
311 * Gets the line breaking class of a character from the default line
312 * breaking properties array.
313 *
314 * @param ch	character to check
315 * @return		the line breaking class if found; \c LBP_XX otherwise
316 */
317static enum LineBreakClass get_char_lb_class_default(
318		utf32_t ch)
319{
320	size_t i = 0;
321	while (ch > lb_prop_index[i].end)
322		++i;
323	assert(i < LINEBREAK_INDEX_SIZE);
324	return get_char_lb_class(ch, lb_prop_index[i].lbp);
325}
326
327/**
328 * Gets the line breaking class of a character for a specific
329 * language.  This function will check the language-specific data first,
330 * and then the default data if there is no language-specific property
331 * available for the character.
332 *
333 * @param ch		character to check
334 * @param lbpLang	pointer to the language-specific line breaking
335 *					properties array
336 * @return			the line breaking class if found; \c LBP_XX
337 *					otherwise
338 */
339static enum LineBreakClass get_char_lb_class_lang(
340		utf32_t ch,
341		struct LineBreakProperties *lbpLang)
342{
343	enum LineBreakClass lbcResult;
344
345	/* Find the language-specific line breaking class for a character */
346	if (lbpLang)
347	{
348		lbcResult = get_char_lb_class(ch, lbpLang);
349		if (lbcResult != LBP_XX)
350			return lbcResult;
351	}
352
353	/* Find the generic language-specific line breaking class, if no
354	 * language context is provided, or language-specific data are not
355	 * available for the specific character in the specified language */
356	return get_char_lb_class_default(ch);
357}
358
359/**
360 * Resolves the line breaking class for certain ambiguous or complicated
361 * characters.  They are treated in a simplistic way in this
362 * implementation.
363 *
364 * @param lbc	line breaking class to resolve
365 * @param lang	language of the text
366 * @return		the resolved line breaking class
367 */
368static enum LineBreakClass resolve_lb_class(
369		enum LineBreakClass lbc,
370		const char *lang)
371{
372	switch (lbc)
373	{
374	case LBP_AI:
375		if (lang != NULL &&
376				(strncmp(lang, "zh", 2) == 0 ||	/* Chinese */
377				 strncmp(lang, "ja", 2) == 0 ||	/* Japanese */
378				 strncmp(lang, "ko", 2) == 0))	/* Korean */
379		{
380			return LBP_ID;
381		}
382		/* Fall through */
383	case LBP_SA:
384	case LBP_SG:
385	case LBP_XX:
386		return LBP_AL;
387	default:
388		return lbc;
389	}
390}
391
392/**
393 * Gets the next Unicode character in a UTF-8 sequence.  The index will
394 * be advanced to the next complete character, unless the end of string
395 * is reached in the middle of a UTF-8 sequence.
396 *
397 * @param[in]     s		input UTF-8 string
398 * @param[in]     len	length of the string in bytes
399 * @param[in,out] ip	pointer to the index
400 * @return				the Unicode character beginning at the index; or
401 *						#EOS if end of input is encountered
402 */
403utf32_t lb_get_next_char_utf8(
404		const utf8_t *s,
405		size_t len,
406		size_t *ip)
407{
408	utf8_t ch;
409	utf32_t res;
410
411	assert(*ip <= len);
412	if (*ip == len)
413		return EOS;
414	ch = s[*ip];
415
416	if (ch < 0xC2 || ch > 0xF4)
417	{	/* One-byte sequence, tail (should not occur), or invalid */
418		*ip += 1;
419		return ch;
420	}
421	else if (ch < 0xE0)
422	{	/* Two-byte sequence */
423		if (*ip + 2 > len)
424			return EOS;
425		res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
426		*ip += 2;
427		return res;
428	}
429	else if (ch < 0xF0)
430	{	/* Three-byte sequence */
431		if (*ip + 3 > len)
432			return EOS;
433		res = ((ch & 0x0F) << 12) +
434			  ((s[*ip + 1] & 0x3F) << 6) +
435			  ((s[*ip + 2] & 0x3F));
436		*ip += 3;
437		return res;
438	}
439	else
440	{	/* Four-byte sequence */
441		if (*ip + 4 > len)
442			return EOS;
443		res = ((ch & 0x07) << 18) +
444			  ((s[*ip + 1] & 0x3F) << 12) +
445			  ((s[*ip + 2] & 0x3F) << 6) +
446			  ((s[*ip + 3] & 0x3F));
447		*ip += 4;
448		return res;
449	}
450}
451
452/**
453 * Gets the next Unicode character in a UTF-16 sequence.  The index will
454 * be advanced to the next complete character, unless the end of string
455 * is reached in the middle of a UTF-16 surrogate pair.
456 *
457 * @param[in]     s		input UTF-16 string
458 * @param[in]     len	length of the string in words
459 * @param[in,out] ip	pointer to the index
460 * @return				the Unicode character beginning at the index; or
461 *						#EOS if end of input is encountered
462 */
463utf32_t lb_get_next_char_utf16(
464		const utf16_t *s,
465		size_t len,
466		size_t *ip)
467{
468	utf16_t ch;
469
470	assert(*ip <= len);
471	if (*ip == len)
472		return EOS;
473	ch = s[(*ip)++];
474
475	if (ch < 0xD800 || ch > 0xDBFF)
476	{	/* If the character is not a high surrogate */
477		return ch;
478	}
479	if (*ip == len)
480	{	/* If the input ends here (an error) */
481		--(*ip);
482		return EOS;
483	}
484	if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
485	{	/* If the next character is not the low surrogate (an error) */
486		return ch;
487	}
488	/* Return the constructed character and advance the index again */
489	return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
490}
491
492/**
493 * Gets the next Unicode character in a UTF-32 sequence.  The index will
494 * be advanced to the next character.
495 *
496 * @param[in]     s		input UTF-32 string
497 * @param[in]     len	length of the string in dwords
498 * @param[in,out] ip	pointer to the index
499 * @return				the Unicode character beginning at the index; or
500 *						#EOS if end of input is encountered
501 */
502utf32_t lb_get_next_char_utf32(
503		const utf32_t *s,
504		size_t len,
505		size_t *ip)
506{
507	assert(*ip <= len);
508	if (*ip == len)
509		return EOS;
510	return s[(*ip)++];
511}
512
513/**
514 * Sets the line breaking information for a generic input string.
515 *
516 * @param[in]  s			input string
517 * @param[in]  len			length of the input
518 * @param[in]  lang			language of the input
519 * @param[out] brks			pointer to the output breaking data,
520 *							containing #LINEBREAK_MUSTBREAK,
521 *							#LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
522 *							or #LINEBREAK_INSIDEACHAR
523 * @param[in] get_next_char	function to get the next UTF-32 character
524 */
525void set_linebreaks(
526		const void *s,
527		size_t len,
528		const char *lang,
529		char *brks,
530		get_next_char_t get_next_char)
531{
532	utf32_t ch;
533	enum LineBreakClass lbcCur;
534	enum LineBreakClass lbcNew;
535	enum LineBreakClass lbcLast;
536	struct LineBreakProperties *lbpLang;
537	size_t posCur = 0;
538	size_t posLast = 0;
539
540	--posLast;	/* To be ++'d later */
541	ch = get_next_char(s, len, &posCur);
542	if (ch == EOS)
543		return;
544	lbpLang = get_lb_prop_lang(lang);
545	lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
546	lbcNew = LBP_Undefined;
547
548nextline:
549
550	/* Special treatment for the first character */
551	switch (lbcCur)
552	{
553	case LBP_LF:
554	case LBP_NL:
555		lbcCur = LBP_BK;
556		break;
557	case LBP_SP:
558		lbcCur = LBP_WJ;
559		break;
560	default:
561		break;
562	}
563
564	/* Process a line till an explicit break or end of string */
565	for (;;)
566	{
567		for (++posLast; posLast < posCur - 1; ++posLast)
568		{
569			brks[posLast] = LINEBREAK_INSIDEACHAR;
570		}
571		assert(posLast == posCur - 1);
572		lbcLast = lbcNew;
573		ch = get_next_char(s, len, &posCur);
574		if (ch == EOS)
575			break;
576		lbcNew = get_char_lb_class_lang(ch, lbpLang);
577		if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
578		{
579			brks[posLast] = LINEBREAK_MUSTBREAK;
580			lbcCur = resolve_lb_class(lbcNew, lang);
581			goto nextline;
582		}
583
584		switch (lbcNew)
585		{
586		case LBP_SP:
587			brks[posLast] = LINEBREAK_NOBREAK;
588			continue;
589		case LBP_BK:
590		case LBP_LF:
591		case LBP_NL:
592			brks[posLast] = LINEBREAK_NOBREAK;
593			lbcCur = LBP_BK;
594			continue;
595		case LBP_CR:
596			brks[posLast] = LINEBREAK_NOBREAK;
597			lbcCur = LBP_CR;
598			continue;
599		case LBP_CB:
600			brks[posLast] = LINEBREAK_ALLOWBREAK;
601			lbcCur = LBP_BA;
602			continue;
603		default:
604			break;
605		}
606
607		lbcNew = resolve_lb_class(lbcNew, lang);
608
609		assert(lbcCur <= LBP_JT);
610		assert(lbcNew <= LBP_JT);
611		switch (baTable[lbcCur - 1][lbcNew - 1])
612		{
613		case DIR_BRK:
614			brks[posLast] = LINEBREAK_ALLOWBREAK;
615			break;
616		case CMI_BRK:
617		case IND_BRK:
618			if (lbcLast == LBP_SP)
619			{
620				brks[posLast] = LINEBREAK_ALLOWBREAK;
621			}
622			else
623			{
624				brks[posLast] = LINEBREAK_NOBREAK;
625			}
626			break;
627		case CMP_BRK:
628			brks[posLast] = LINEBREAK_NOBREAK;
629			if (lbcLast != LBP_SP)
630				continue;
631			break;
632		case PRH_BRK:
633			brks[posLast] = LINEBREAK_NOBREAK;
634			break;
635		}
636
637		lbcCur = lbcNew;
638	}
639
640	assert(posLast == posCur - 1 && posCur <= len);
641	/* Break after the last character */
642	brks[posLast] = LINEBREAK_MUSTBREAK;
643	/* When the input contains incomplete sequences */
644	while (posCur < len)
645	{
646		brks[posCur++] = LINEBREAK_INSIDEACHAR;
647	}
648}
649
650/**
651 * Sets the line breaking information for a UTF-8 input string.
652 *
653 * @param[in]  s	input UTF-8 string
654 * @param[in]  len	length of the input
655 * @param[in]  lang	language of the input
656 * @param[out] brks	pointer to the output breaking data, containing
657 *					#LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
658 *					#LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
659 */
660void set_linebreaks_utf8(
661		const utf8_t *s,
662		size_t len,
663		const char *lang,
664		char *brks)
665{
666	set_linebreaks(s, len, lang, brks,
667				   (get_next_char_t)lb_get_next_char_utf8);
668}
669
670/**
671 * Sets the line breaking information for a UTF-16 input string.
672 *
673 * @param[in]  s	input UTF-16 string
674 * @param[in]  len	length of the input
675 * @param[in]  lang	language of the input
676 * @param[out] brks	pointer to the output breaking data, containing
677 *					#LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
678 *					#LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
679 */
680void set_linebreaks_utf16(
681		const utf16_t *s,
682		size_t len,
683		const char *lang,
684		char *brks)
685{
686	set_linebreaks(s, len, lang, brks,
687				   (get_next_char_t)lb_get_next_char_utf16);
688}
689
690/**
691 * Sets the line breaking information for a UTF-32 input string.
692 *
693 * @param[in]  s	input UTF-32 string
694 * @param[in]  len	length of the input
695 * @param[in]  lang	language of the input
696 * @param[out] brks	pointer to the output breaking data, containing
697 *					#LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
698 *					#LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
699 */
700void set_linebreaks_utf32(
701		const utf32_t *s,
702		size_t len,
703		const char *lang,
704		char *brks)
705{
706	set_linebreaks(s, len, lang, brks,
707				   (get_next_char_t)lb_get_next_char_utf32);
708}
709
710/**
711 * Tells whether a line break can occur between two Unicode characters.
712 * This is a wrapper function to expose a simple interface.  Generally
713 * speaking, it is better to use #set_linebreaks_utf32 instead, since
714 * complicated cases involving combining marks, spaces, etc. cannot be
715 * correctly processed.
716 *
717 * @param char1 the first Unicode character
718 * @param char2 the second Unicode character
719 * @param lang  language of the input
720 * @return      one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
721 *				#LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
722 */
723int is_line_breakable(
724		utf32_t char1,
725		utf32_t char2,
726		const char* lang)
727{
728	utf32_t s[2];
729	char brks[2];
730	s[0] = char1;
731	s[1] = char2;
732	set_linebreaks_utf32(s, 2, lang, brks);
733	return brks[0];
734}