PageRenderTime 117ms CodeModel.GetById 13ms app.highlight 97ms RepoModel.GetById 1ms app.codeStats 0ms

/core/unicode.d

http://github.com/wilkie/djehuty
D | 1081 lines | 725 code | 205 blank | 151 comment | 209 complexity | b305b6c838fc8b68a11c4b8195c90c9c MD5 | raw file
   1/*
   2 * unicode.d
   3 *
   4 * This module implements unicode functions that were badly needed.
   5 *
   6 * Author: Dave Wilkinson
   7 *
   8 */
   9
  10module core.unicode;
  11
  12import core.definitions;
  13
// Surrogate-pair arithmetic: a supplementary code point has halfBase
// subtracted, and the remainder is split into a high part (>> halfShift) and
// a low part (& halfMask).
private static const uint halfShift = 10;
private static const uint halfBase = 0x0010000;
private static const uint halfMask = 0x3FF;

// Bounds of the UTF-16 high (lead) and low (trail) surrogate ranges.
private const auto UNI_SUR_HIGH_START	= 0xD800;
private const auto UNI_SUR_HIGH_END		= 0xDBFF;
private const auto UNI_SUR_LOW_START	= 0xDC00;
private const auto UNI_SUR_LOW_END		= 0xDFFF;

// U+FFFD is written by the converters in place of values they reject; the
// UNI_MAX_* values are the upper limits the converters compare against.
private const auto UNI_REPLACEMENT_CHAR = cast(dchar)0x0000FFFD;
private const auto UNI_MAX_BMP = cast(dchar)0x0000FFFF;
private const auto UNI_MAX_UTF16 = cast(dchar)0x0010FFFF;
private const auto UNI_MAX_UTF32 = cast(dchar)0x7FFFFFFF;
private const auto UNI_MAX_LEGAL_UTF32 = cast(dchar)0x0010FFFF;

// Indexed by the total number of bytes in a UTF-8 sequence; the entry is
// OR-ed into the first byte of the sequence by the toUtf8 encoders.
private static const ubyte firstByteMark[7] = [ 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC ];
  30
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion, which
 * was allowed in earlier algorithms.
 * Bytes 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
 * 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
 */
static const char trailingBytesForUTF8[256] = [
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
];

/*
 * Magic values subtracted from a buffer value during UTF-8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence; it is indexed by the trailing-byte count. The
 * decoders add the raw bytes of a sequence together (shifting by 6 between
 * bytes) and then subtract the entry to obtain the code point.
 */
static const uint offsetsFromUTF8[6] = [ 0x00000000, 0x00003080, 0x000E2080,
		     0x03C82080, 0xFA082080, 0x82082080 ];
  56
  57
  58/*
  59 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
  60 * This must be called with the length pre-determined by the first byte.
  61 * If not calling this from ConvertUTF8to*, then the length can be set by:
  62 *  length = trailingBytesForUTF8[*source]+1;
  63 * and the sequence is illegal right away if there aren't that many bytes
  64 * available.
  65 * If presented with a length > 4, this returns false.  The Unicode
  66 * definition of UTF-8 goes up to 4-byte sequences.
  67 */
  68
  69private bool isLegalUTF8(char* source, int length) {
  70    char a;
  71    char *srcptr = source+length;
  72    switch (length) {
  73	    default: return false;
  74		/* Everything else falls through when "true"... */
  75	    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  76	    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
  77	    case 2: if ((a = (*--srcptr)) > 0xBF) return false;
  78
  79		switch (*source) {
  80		    /* no fall-through in this inner switch */
  81		    case 0xE0: if (a < 0xA0) return false; break;
  82		    case 0xED: if (a > 0x9F) return false; break;
  83		    case 0xF0: if (a < 0x90) return false; break;
  84		    case 0xF4: if (a > 0x8F) return false; break;
  85		    default:   if (a < 0x80) return false;
  86		}
  87
  88	    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
  89    }
  90    if (*source > 0xF4) return false;
  91    return true;
  92}
  93
  94// For efficiency, we have full
  95// control of the buffer length.
  96
  97struct Unicode {
  98static:
  99
	// The input is already UTF-8, so the conversion is a plain copy. The
	// result is a newly allocated duplicate of src.
	string toUtf8(string src) {
		auto copy = src.dup;
		return cast(string)copy;
	}
 103
	// Converts a UTF-16 string to UTF-8.
	//
	// Surrogate pairs are combined into one code point before encoding.
	// Conversion stops at the first unpaired surrogate (a high surrogate that
	// is not followed by a low one, or a stray low surrogate) and whatever was
	// converted up to that point is returned.
	//
	// A high surrogate in the very last position is treated as unpaired; the
	// previous code dereferenced one element past the end of src in that case.
	string toUtf8(wstring src) {
		if (src.length == 0) {
			return cast(string)"";
		}

		// A BMP unit needs at most 3 bytes and a surrogate pair (2 units)
		// needs 4, so 4 bytes per unit is always enough.
		char[] container = new char[src.length*4];
		size_t written = 0;
		size_t pos = 0;

		while (pos < src.length) {
			dchar ch = src[pos];

			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				// A high surrogate is only valid with a low surrogate after it.
				if (pos + 1 >= src.length) {
					break;
				}

				dchar ch2 = src[pos + 1];
				if (ch2 < UNI_SUR_LOW_START || ch2 > UNI_SUR_LOW_END) {
					// TODO: emit a replacement character and continue instead
					break;
				}

				ch = cast(dchar)(((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase);
				pos += 2;
			}
			else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				// Low surrogate with no preceding high surrogate.
				// TODO: emit a replacement character and continue instead
				break;
			}
			else {
				pos++;
			}

			/* Figure out how many bytes the result will require */
			uint bytesToWrite;
			if (ch < cast(dchar)0x80) {
				bytesToWrite = 1;
			}
			else if (ch < cast(dchar)0x800) {
				bytesToWrite = 2;
			}
			else if (ch < cast(dchar)0x10000) {
				bytesToWrite = 3;
			}
			else if (ch < cast(dchar)0x110000) {
				bytesToWrite = 4;
			}
			else {
				bytesToWrite = 3;
				ch = UNI_REPLACEMENT_CHAR;
			}

			// Fill the sequence back to front: trailing bytes carry six bits
			// each, the lead byte carries the rest plus the length marker.
			size_t slot = written + bytesToWrite;
			for (uint k = bytesToWrite; k > 1; k--) {
				container[--slot] = cast(char)((ch | 0x80) & 0xBF);
				ch >>= 6;
			}
			container[--slot] = cast(char)(ch | firstByteMark[bytesToWrite]);

			written += bytesToWrite;
		}

		return cast(string)container[0..written];
	}
 188
	// Converts a UTF-32 string to UTF-8.
	//
	// Values that are not Unicode scalar values -- anything above U+10FFFF and
	// the surrogate range U+D800..U+DFFF -- are encoded as U+FFFD. Surrogates
	// used to be encoded as three-byte sequences, which is not well-formed
	// UTF-8 and is rejected by this module's own isLegalUTF8.
	string toUtf8(dstring src) {
		if (src.length == 0) {
			return cast(string)"";
		}

		// No code point needs more than four UTF-8 bytes.
		char[] container = new char[src.length*4];
		size_t written = 0;

		foreach (dchar unit; src) {
			dchar ch = unit;

			if (ch > UNI_MAX_LEGAL_UTF32 || (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
				ch = UNI_REPLACEMENT_CHAR;
			}

			/* Figure out how many bytes the result will require. */
			uint bytesToWrite;
			if (ch < cast(dchar)0x80) {
				bytesToWrite = 1;
			}
			else if (ch < cast(dchar)0x800) {
				bytesToWrite = 2;
			}
			else if (ch < cast(dchar)0x10000) {
				bytesToWrite = 3;
			}
			else {
				bytesToWrite = 4;
			}

			// Fill the sequence back to front: trailing bytes carry six bits
			// each, the lead byte carries the rest plus the length marker.
			size_t slot = written + bytesToWrite;
			for (uint k = bytesToWrite; k > 1; k--) {
				container[--slot] = cast(char)((ch | 0x80) & 0xBF);
				ch >>= 6;
			}
			container[--slot] = cast(char)(ch | firstByteMark[bytesToWrite]);

			written += bytesToWrite;
		}

		return cast(string)container[0..written];
	}
 254
	// Converts a UTF-8 string to UTF-16.
	//
	// Decoding stops (returning what has been produced so far) when a
	// sequence runs past the end of src or fails isLegalUTF8. Decoded values
	// in the surrogate range or above UNI_MAX_UTF16 are written as U+FFFD;
	// values above the BMP are written as a surrogate pair.
	wstring toUtf16(string src) {
		if (src.length == 0) {
			return cast(wstring)"";
		}

		// UTF-16 never needs more code units than UTF-8 needs bytes.
		wchar[] container = new wchar[src.length];
		size_t count = 0;
		size_t pos = 0;

		while (pos < src.length) {
			ushort extraBytesToRead = trailingBytesForUTF8[src[pos]];

			// sourceExhausted: the sequence would extend beyond the input
			if (pos + extraBytesToRead >= src.length) {
				break;
			}

			// sourceIllegal
			if (!isLegalUTF8(cast(char*)src.ptr + pos, extraBytesToRead+1)) {
				break;
			}

			// Accumulate the raw bytes six bits apart, then strip the
			// marker bits in one subtraction.
			dchar ch = 0;
			for (uint k = 0; k <= extraBytesToRead; k++) {
				ch <<= 6;
				ch += src[pos++];
			}
			ch -= offsetsFromUTF8[extraBytesToRead];

			if (ch > UNI_MAX_BMP) {
				if (ch > UNI_MAX_UTF16) {
					// illegal
					container[count++] = cast(wchar)UNI_REPLACEMENT_CHAR;
				}
				else {
					/* character in range 0x10000 - 0x10FFFF: emit a pair */
					ch -= halfBase;
					container[count++] = cast(wchar)((ch >> halfShift) + UNI_SUR_HIGH_START);
					container[count++] = cast(wchar)((ch & halfMask) + UNI_SUR_LOW_START);
				}
			}
			else if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
				/* UTF-16 surrogate values are illegal as decoded characters */
				container[count++] = cast(wchar)UNI_REPLACEMENT_CHAR;
			}
			else {
				container[count++] = cast(wchar)ch; /* normal case */
			}
		}

		return cast(wstring)container[0..count];
	}
 321
	// The input is already UTF-16, so the conversion is a plain copy. The
	// result is a newly allocated duplicate of src.
	wstring toUtf16(wstring src) {
		auto copy = src.dup;
		return cast(wstring)copy;
	}
 325
	// Converts a UTF-32 string to UTF-16.
	//
	// Surrogate code points and values above U+10FFFF are written as U+FFFD.
	// Code points above the BMP are written as a surrogate pair, so the
	// output may hold up to two code units per input element. The buffer is
	// sized for that; it used to hold only src.length units and was overrun
	// by any input containing a supplementary-plane character.
	wstring toUtf16(dstring src) {
		if (src.length == 0) {
			return cast(wstring)"";
		}

		wchar[] container = new wchar[src.length*2];
		size_t count = 0;

		foreach (dchar ch; src) {
			if (ch <= UNI_MAX_BMP) {
				/* Target is a character <= 0xFFFF */

				/* UTF-16 surrogate values are illegal in UTF-32 */
				if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
					container[count++] = cast(wchar)UNI_REPLACEMENT_CHAR;
				}
				else {
					container[count++] = cast(wchar)ch; /* normal case */
				}
			}
			else if (ch > UNI_MAX_LEGAL_UTF32) {
				container[count++] = cast(wchar)UNI_REPLACEMENT_CHAR;
			}
			else {
				/* target is a character in range 0x10000 - 0x10FFFF. */
				uint rest = ch - halfBase;
				container[count++] = cast(wchar)((rest >> halfShift) + UNI_SUR_HIGH_START);
				container[count++] = cast(wchar)((rest & halfMask) + UNI_SUR_LOW_START);
			}
		}

		return cast(wstring)container[0..count];
	}
 367
	// Converts a UTF-8 string to UTF-32.
	//
	// Decoding stops (returning what has been produced so far) when a
	// sequence runs past the end of src or fails isLegalUTF8. A decoded value
	// that is a surrogate or lies above U+10FFFF is stored as U+FFFD.
	dstring toUtf32(string src) {
		if (src.length == 0) {
			return cast(dstring)"";
		}

		// One output element per input byte is the upper bound.
		dchar[] container = new dchar[src.length];
		size_t count = 0;
		size_t pos = 0;

		while (pos < src.length) {
			ushort extraBytesToRead = trailingBytesForUTF8[src[pos]];

			// sourceExhausted
			if (pos + extraBytesToRead >= src.length) {
				break;
			}

			// sourceIllegal
			if (!isLegalUTF8(cast(char*)src.ptr + pos, extraBytesToRead+1)) {
				break;
			}

			// Accumulate the raw bytes six bits apart, then strip the
			// marker bits in one subtraction.
			dchar ch = 0;
			for (uint k = 0; k <= extraBytesToRead; k++) {
				ch <<= 6;
				ch += src[pos++];
			}
			ch -= offsetsFromUTF8[extraBytesToRead];

			/*
			 * UTF-16 surrogate values are illegal in UTF-32, and anything
			 * over Plane 17 (> 0x10FFFF) is illegal.
			 */
			bool isSurrogate = (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END);
			if (ch > UNI_MAX_LEGAL_UTF32 || isSurrogate) {
				container[count++] = UNI_REPLACEMENT_CHAR;
			}
			else {
				container[count++] = ch;
			}
		}

		return cast(dstring)container[0..count];
	}
 435
	// Converts a UTF-16 string to UTF-32.
	//
	// A high surrogate followed by a low surrogate is combined into one code
	// point. Any unpaired surrogate -- a high surrogate that is last or is
	// followed by something else, or a low surrogate on its own -- is stored
	// as U+FFFD. Previously such units were copied into the result unchanged
	// (surrogates are not valid UTF-32) and a trailing high surrogate was
	// dropped.
	dstring toUtf32(wstring src) {
		if (src.length == 0) {
			return cast(dstring)"";
		}

		// Each output element consumes at least one input unit.
		dchar[] container = new dchar[src.length];
		size_t count = 0;
		size_t pos = 0;

		while (pos < src.length) {
			dchar ch = src[pos++];

			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				bool paired = false;

				/* If the 16 bits following the high surrogate are in the source buffer... */
				if (pos < src.length) {
					dchar ch2 = src[pos];
					/* If it's a low surrogate, convert to UTF32. */
					if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
						ch = cast(dchar)(((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase);
						pos++;
						paired = true;
					}
				}

				if (!paired) {
					// The following unit (if any) is left for the next pass.
					ch = UNI_REPLACEMENT_CHAR;
				}
			}
			else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				// low surrogate without a preceding high surrogate
				ch = UNI_REPLACEMENT_CHAR;
			}

			container[count++] = ch;
		}

		return cast(dstring)container[0..count];
	}
 477
	// The input is already UTF-32, so the conversion is a plain copy. The
	// result is a newly allocated duplicate of src.
	dstring toUtf32(dstring src) {
		auto copy = src.dup;
		return cast(dstring)copy;
	}
 481
 482	// character conversions
	// Returns the first code point of src as a UTF-32 character.
	//
	// The whole string is converted with toUtf32 and the first element of the
	// result is returned. When the conversion yields nothing (src is empty or
	// toUtf32 produced no output) U+FFFD is returned instead of indexing an
	// empty array.
	dchar toUtf32Char(string src) {
		dstring decoded = toUtf32(src);
		if (decoded.length == 0) {
			return UNI_REPLACEMENT_CHAR;
		}
		return decoded[0];
	}

	// UTF-16 variant of the above; the same empty-result rule applies.
	dchar toUtf32Char(wstring src) {
		dstring decoded = toUtf32(src);
		if (decoded.length == 0) {
			return UNI_REPLACEMENT_CHAR;
		}
		return decoded[0];
	}

	// UTF-32 variant: simply the first element, or U+FFFD for empty input.
	dchar toUtf32Char(dstring src) {
		if (src.length == 0) {
			return UNI_REPLACEMENT_CHAR;
		}
		return src[0];
	}
 499
	// Tells whether the first character of chr is a dead (combining)
	// character, as decided by isDeadChar(dchar). Empty input has no first
	// character and yields false rather than being indexed or decoded.
	bool isDeadChar(char[] chr) {
		if (chr.length == 0) {
			return false;
		}
		dchar dchr = toUtf32Char(chr);
		return isDeadChar(dchr);
	}

	// UTF-16 variant of the above.
	bool isDeadChar(wchar[] chr) {
		if (chr.length == 0) {
			return false;
		}
		dchar dchr = toUtf32Char(chr);
		return isDeadChar(dchr);
	}

	// UTF-32 variant of the above; only chr[0] is inspected.
	bool isDeadChar(dchar[] chr) {
		if (chr.length == 0) {
			return false;
		}
		return isDeadChar(chr[0]);
	}
 513
	// Tells whether chr lies in one of the four combining-mark blocks listed
	// below; such characters attach to the character before them.
	bool isDeadChar(dchar chr) {
		// Combining Diacritical Marks
		if (chr >= 0x300 && chr <= 0x36F) {
			return true;
		}

		// Combining Diacritical Marks Supplement
		if (chr >= 0x1DC0 && chr <= 0x1DFF) {
			return true;
		}

		// Combining Diacritical Marks for Symbols
		if (chr >= 0x20D0 && chr <= 0x20FF) {
			return true;
		}

		// Combining Half Marks
		if (chr >= 0xFE20 && chr <= 0xFE2F) {
			return true;
		}

		return false;
	}
 523
 524	// character conversions
	// Decodes the first character of a UTF-8 string together with the dead
	// (combining) characters that directly follow it, and returns them as
	// UTF-32.
	//
	// Decoding ends at the first character after the leading one that is not
	// a dead character. When a sequence is truncated, fails isLegalUTF8, or
	// decodes to a surrogate or to a value above U+10FFFF, the function
	// returns what it has; if that is nothing, it returns a single U+FFFD.
	dchar[] toUtf32Chars(string src) {
		dchar[] container;

		if (src.length == 0) {
			return [];
		}

		size_t pos = 0;

		while (pos < src.length) {
			ushort extraBytesToRead = trailingBytesForUTF8[src[pos]];
			dchar ch = 0;

			// The sequence must fit inside src and be legal before decoding.
			bool usable = (pos + extraBytesToRead < src.length)
				&& isLegalUTF8(cast(char*)src.ptr + pos, extraBytesToRead+1);

			if (usable) {
				// Accumulate the raw bytes six bits apart, then strip the
				// marker bits in one subtraction.
				for (uint k = 0; k <= extraBytesToRead; k++) {
					ch <<= 6;
					ch += src[pos++];
				}
				ch -= offsetsFromUTF8[extraBytesToRead];

				/*
				 * UTF-16 surrogate values are illegal in UTF-32, and anything
				 * over Plane 17 (> 0x10FFFF) is illegal.
				 */
				if (ch > UNI_MAX_LEGAL_UTF32) {
					usable = false;
				}
				else if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
					usable = false;
				}
			}

			if (!usable) {
				// Only substitute when there is nothing else to return.
				if (container.length == 0) {
					container ~= UNI_REPLACEMENT_CHAR;
				}
				return container;
			}

			// After the first character, only dead characters are taken.
			if (container.length > 0 && !isDeadChar(ch)) {
				break;
			}

			container ~= ch;
		}

		return container;
	}
 611
	// Decodes the first character of a UTF-16 string together with the dead
	// (combining) characters that directly follow it, and returns them as
	// UTF-32.
	//
	// Decoding ends at the first character after the leading one that is not
	// a dead character. When an unpaired surrogate is met the function
	// returns what it has; if that is nothing, it returns a single U+FFFD.
	// This matches the UTF-8 overload. Earlier, U+FFFD was appended even
	// after a valid leading character, a lone low surrogate was returned
	// as-is, and the low half of a consumed pair was not skipped.
	dchar[] toUtf32Chars(wstring src) {
		dchar[] container;

		if (src.length == 0) {
			return [];
		}

		size_t pos = 0;

		while (pos < src.length) {
			dchar ch = src[pos++];
			bool usable = true;

			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				/* A high surrogate needs a low surrogate right after it. */
				usable = false;
				if (pos < src.length) {
					dchar ch2 = src[pos];
					if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
						ch = cast(dchar)(((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase);
						pos++;
						usable = true;
					}
				}
			}
			else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
				// low surrogate without a preceding high surrogate
				usable = false;
			}

			if (!usable) {
				// Only substitute when there is nothing else to return.
				if (container.length == 0) {
					container ~= UNI_REPLACEMENT_CHAR;
				}
				return container;
			}

			// After the first character, only dead characters are taken.
			if (container.length > 0 && !isDeadChar(ch)) {
				break;
			}

			container ~= ch;
		}

		return container;
	}
 667
	// Returns a copy of the first element of src followed by every dead
	// (combining) character that comes directly after it. Empty input gives
	// an empty array.
	dchar[] toUtf32Chars(dstring src) {
		if (src.length == 0) {
			return [];
		}

		// The first element is always taken; extend over trailing dead chars.
		size_t stop = 1;
		while (stop < src.length && isDeadChar(src[stop])) {
			stop++;
		}

		return src[0..stop].dup;
	}
 688
	// Encodes, as UTF-16, the first character of src together with the dead
	// (combining) characters that directly follow it.
	//
	// Encoding ends at the first character after the leading one that is not
	// a dead character. When a surrogate code point or a value above U+10FFFF
	// is met the function returns what it has; if that is nothing, it returns
	// a single U+FFFD.
	//
	// The dead-character test is made on the code point itself. It used to be
	// made after halfBase had been subtracted from a supplementary-plane code
	// point, so e.g. U+10300 was mistaken for the combining mark U+0300.
	wchar[] toUtf16Chars(dstring src) {
		wchar[] container;

		if (src.length == 0) {
			return container;
		}

		foreach (dchar ch; src) {
			/* UTF-16 surrogate values are illegal in UTF-32, as is anything above U+10FFFF */
			bool illegal = (ch > UNI_MAX_LEGAL_UTF32)
				|| (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END);

			if (illegal) {
				// Only substitute when there is nothing else to return.
				if (container.length == 0) {
					container ~= cast(wchar)UNI_REPLACEMENT_CHAR;
				}
				return container;
			}

			// After the first character, only dead characters are taken.
			if (container.length > 0 && !isDeadChar(ch)) {
				break;
			}

			if (ch <= UNI_MAX_BMP) {
				container ~= cast(wchar)ch; /* normal case */
			}
			else {
				/* character in range 0x10000 - 0x10FFFF: emit a surrogate pair */
				uint rest = ch - halfBase;
				container ~= cast(wchar)((rest >> halfShift) + UNI_SUR_HIGH_START);
				container ~= cast(wchar)((rest & halfMask) + UNI_SUR_LOW_START);
			}
		}

		return container;
	}
 739
	// Encodes, as UTF-8, the first character of src together with the dead
	// (combining) characters that directly follow it.
	//
	// This was an unfinished stub that always returned an empty array. It now
	// follows the same rules as toUtf16Chars: a leading surrogate code point
	// or value above U+10FFFF yields a single U+FFFD, and collection ends at
	// the first following character that is not a dead character.
	char[] toUtf8Chars(dstring src) {
		if (src.length == 0) {
			return [];
		}

		dchar[] cluster;
		dchar first = src[0];

		if (first > UNI_MAX_LEGAL_UTF32 || (first >= UNI_SUR_HIGH_START && first <= UNI_SUR_LOW_END)) {
			// Illegal leading value: report it as a replacement character only.
			cluster ~= UNI_REPLACEMENT_CHAR;
		}
		else {
			cluster ~= first;

			// Dead characters all lie in legal BMP ranges, so no further
			// validation is needed while extending the cluster.
			size_t next = 1;
			while (next < src.length && isDeadChar(src[next])) {
				cluster ~= src[next];
				next++;
			}
		}

		// toUtf8 returns a freshly allocated buffer, so handing it out as a
		// mutable array does not alias caller data.
		return cast(char[])toUtf8(cast(dstring)cluster);
	}
 754
 755	// string length stuffs
	// Counts the characters of a UTF-8 string, not counting dead (combining)
	// characters.
	//
	// Counting stops at a sequence that runs past the end of src or fails
	// isLegalUTF8. A decoded surrogate or value above U+10FFFF is treated as
	// U+FFFD, which counts as one character.
	uint utflen(string src) {
		if (src.length == 0) {
			return 0;
		}

		uint len = 0;
		size_t pos = 0;

		while (pos < src.length) {
			ushort extraBytesToRead = trailingBytesForUTF8[src[pos]];

			// sourceExhausted
			if (pos + extraBytesToRead >= src.length) {
				break;
			}

			// sourceIllegal
			if (!isLegalUTF8(cast(char*)src.ptr + pos, extraBytesToRead+1)) {
				break;
			}

			// Accumulate the raw bytes six bits apart, then strip the
			// marker bits in one subtraction.
			dchar ch = 0;
			for (uint k = 0; k <= extraBytesToRead; k++) {
				ch <<= 6;
				ch += src[pos++];
			}
			ch -= offsetsFromUTF8[extraBytesToRead];

			/*
			 * UTF-16 surrogate values are illegal in UTF-32, and anything
			 * over Plane 17 (> 0x10FFFF) is illegal.
			 */
			bool isSurrogate = (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END);
			if (ch > UNI_MAX_LEGAL_UTF32 || isSurrogate) {
				ch = UNI_REPLACEMENT_CHAR;
			}

			// dead characters do not add to the length
			if (!isDeadChar(ch)) {
				len++;
			}
		}

		return len;
	}
 823
	// Counts the characters of a UTF-16 string, not counting dead
	// (combining) characters.
	//
	// A surrogate pair counts as one character. A high surrogate followed by
	// something other than a low surrogate counts as one (replacement)
	// character and scanning resumes at the following unit. A high surrogate
	// in the last position ends the count without being counted.
	//
	// Two defects are fixed here: the low half of a valid pair was never
	// skipped (so a pair counted twice), and on an invalid pair the read
	// position was moved back onto the high surrogate, which made the loop
	// run forever.
	uint utflen(wstring src) {
		if (src.length == 0) {
			return 0;
		}

		uint len = 0;
		size_t pos = 0;

		while (pos < src.length) {
			dchar ch = src[pos++];

			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				if (pos >= src.length) {
					// nothing follows the high surrogate
					break;
				}

				dchar ch2 = src[pos];
				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
					ch = cast(dchar)(((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase);
					pos++;
				}
				else {
					// invalid surrogate; pos stays on the unit that follows
					ch = UNI_REPLACEMENT_CHAR;
				}
			}

			// dead characters do not add to the length
			if (!isDeadChar(ch)) {
				len++;
			}
		}

		return len;
	}
 864
	// Counts the elements of a UTF-32 string that are not dead (combining)
	// characters.
	uint utflen(dstring src) {
		uint len = 0;

		foreach (dchar ch; src) {
			// dead characters do not add to the length
			if (isDeadChar(ch)) {
				continue;
			}
			len++;
		}

		return len;
	}
 882
 883	// Unicode Indices
 884
	// Returns, for every character of a UTF-8 string that is not a dead
	// (combining) character, the byte offset at which its sequence starts.
	//
	// Scanning stops at a sequence that runs past the end of src or fails
	// isLegalUTF8. A decoded surrogate or value above U+10FFFF is treated as
	// U+FFFD and therefore gets an entry.
	uint[] calcIndices(string src) {
		if (src.length == 0) {
			return [];
		}

		// There can be no more characters than bytes.
		uint[] ret = new uint[src.length];
		uint len = 0;
		size_t pos = 0;

		while (pos < src.length) {
			ushort extraBytesToRead = trailingBytesForUTF8[src[pos]];

			// sourceExhausted
			if (pos + extraBytesToRead >= src.length) {
				break;
			}

			// sourceIllegal
			if (!isLegalUTF8(cast(char*)src.ptr + pos, extraBytesToRead+1)) {
				break;
			}

			// Remember where this sequence begins before consuming it.
			uint start = cast(uint)pos;

			// Accumulate the raw bytes six bits apart, then strip the
			// marker bits in one subtraction.
			dchar ch = 0;
			for (uint k = 0; k <= extraBytesToRead; k++) {
				ch <<= 6;
				ch += src[pos++];
			}
			ch -= offsetsFromUTF8[extraBytesToRead];

			/*
			 * UTF-16 surrogate values are illegal in UTF-32, and anything
			 * over Plane 17 (> 0x10FFFF) is illegal.
			 */
			bool isSurrogate = (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END);
			if (ch > UNI_MAX_LEGAL_UTF32 || isSurrogate) {
				ch = UNI_REPLACEMENT_CHAR;
			}

			// dead characters get no entry of their own
			if (!isDeadChar(ch)) {
				ret[len] = start;
				len++;
			}
		}

		return ret[0..len];
	}
 959
	// Returns, for every character of a UTF-16 string that is not a dead
	// (combining) character, the code-unit offset at which it starts.
	//
	// A surrogate pair is one character. A high surrogate followed by
	// something other than a low surrogate is treated as U+FFFD and scanning
	// resumes at the following unit. A high surrogate in the last position
	// ends the scan without producing an entry.
	uint[] calcIndices(wstring src) {
		if (src.length == 0) {
			return [];
		}

		// There can be no more characters than code units.
		uint[] ret = new uint[src.length];
		uint len = 0;
		size_t pos = 0;

		while (pos < src.length) {
			// Remember where this character begins before consuming it.
			uint start = cast(uint)pos;
			dchar ch = src[pos++];

			if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
				if (pos >= src.length) {
					break;
				}

				dchar ch2 = src[pos];
				if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
					ch = cast(dchar)(((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase);
					pos++;
				}
				else {
					// invalid surrogate; pos stays on the unit that follows
					ch = UNI_REPLACEMENT_CHAR;
				}
			}

			// dead characters get no entry of their own
			if (!isDeadChar(ch)) {
				ret[len] = start;
				len++;
			}
		}

		return ret[0..len];
	}
1012
1013	uint[] calcIndices(dstring src) {
1014		if (src is null || src == "") {
1015			return [];
1016		}
1017
1018		uint[] ret = new uint[src.length];
1019
1020		uint len;
1021
1022		for (int i=0; i<src.length; i++) {
1023			// if it is not a dead character
1024			if (!isDeadChar(src[i])) {
1025				// it is a valid character
1026				ret[len] = i;
1027				len++;
1028			}
1029		}
1030
1031		return ret;
1032	}
1033
1034	bool isStartChar(char chr) {
1035		// Look for non-surrogate entries
1036		if ((chr & 0b11000000) == 0b10000000) { // Signature for a follow up byte
1037			return false;
1038		}
1039		return true;
1040	}
1041
1042	bool isStartChar(wchar chr) {
1043		// Look for non-surrogate entries
1044		if (chr >= UNI_SUR_LOW_START && chr <= UNI_SUR_LOW_END) {
1045			return false;
1046		}
1047		return true;
1048	}
1049
1050	bool isStartChar(dchar chr) {
1051		// Obvious
1052		return true;
1053	}
1054
1055	dchar fromCP866(char chr) {
1056		if (chr < 0x80) {
1057			return cast(dchar)chr;
1058		}
1059
1060		return CP866_to_UTF32[chr-128];
1061	}
1062
1063
1064private:
1065	// Codepage Encodings
1066
	// Lookup table for fromCP866: entry n holds the UTF-32 value for code
	// page byte 0x80 + n. Eight rows of sixteen entries cover 0x80..0xFF.
	dchar CP866_to_UTF32[] = [

		// 0x80-0xAF: U+0410..U+043F
		0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
		0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
		0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,

		// 0xB0-0xDF: values in U+2500..U+2593
		0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
		0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
		0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,

		// 0xE0-0xEF: U+0440..U+044F; 0xF0-0xFF: assorted values ending in U+00A0
		0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044F,
		0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040E, 0x045E, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x2116, 0x00A4, 0x25A0, 0x00A0,

	];
1081}