PageRenderTime 39ms CodeModel.GetById 1ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/java/com/alibaba/fastjson/util/UTF8Decoder.java

https://bitbucket.org/xiejuntao/xdesktop
Java | 215 lines | 154 code | 24 blank | 37 comment | 68 complexity | 224975a572953925b771ee020ba09a17 MD5 | raw file
  1. package com.alibaba.fastjson.util;
  2. import java.nio.Buffer;
  3. import java.nio.ByteBuffer;
  4. import java.nio.CharBuffer;
  5. import java.nio.charset.Charset;
  6. import java.nio.charset.CharsetDecoder;
  7. import java.nio.charset.CoderResult;
  8. /* Legal UTF-8 Byte Sequences
  9. *
  10. * # Code Points Bits Bit/Byte pattern
  11. * 1 7 0xxxxxxx
  12. * U+0000..U+007F 00..7F
  13. *
  14. * 2 11 110xxxxx 10xxxxxx
  15. * U+0080..U+07FF C2..DF 80..BF
  16. *
  17. * 3 16 1110xxxx 10xxxxxx 10xxxxxx
  18. * U+0800..U+0FFF E0 A0..BF 80..BF
  19. * U+1000..U+FFFF E1..EF 80..BF 80..BF
  20. *
  21. * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  22. * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
  23. * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
  24. * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
  25. *
  26. */
  /**
   * A UTF-8 {@link CharsetDecoder} whose main loop is optimized for ASCII input.
   *
   * <p>Decoding is done directly on the backing arrays of the source and destination
   * buffers when both expose one. Buffers without an accessible array (direct or
   * read-only buffers) are decoded through temporary heap copies.
   *
   * <p>Validation notes, as implemented below: overlong 2-, 3- and 4-byte forms and
   * 4-byte values above U+10FFFF are reported as malformed; 5- and 6-byte lead bytes are
   * always malformed. Three-byte sequences that encode surrogate code points
   * (U+D800..U+DFFF) are NOT rejected by this decoder.
   */
  public class UTF8Decoder extends CharsetDecoder {

      private static final Charset charset = Charset.forName("UTF-8");

      /** Creates a decoder that reports one char per byte as both average and maximum. */
      public UTF8Decoder() {
          super(charset, 1.0f, 1.0f);
      }

      /** Returns {@code true} if {@code b} is not of the form {@code 10xxxxxx}. */
      private static boolean isNotContinuation(int b) {
          return (b & 0xc0) != 0x80;
      }

      // [C2..DF] [80..BF]
      /** Rejects the overlong lead bytes C0/C1 and a non-continuation second byte. */
      private static boolean isMalformed2(int b1, int b2) {
          return (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80;
      }

      // [E0] [A0..BF] [80..BF]
      // [E1..EF] [80..BF] [80..BF]
      /** Rejects the overlong form E0 80..9F and non-continuation trailing bytes. */
      private static boolean isMalformed3(int b1, int b2, int b3) {
          return (b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80)
                  || (b2 & 0xc0) != 0x80
                  || (b3 & 0xc0) != 0x80;
      }

      // [F0] [90..BF] [80..BF] [80..BF]
      // [F1..F3] [80..BF] [80..BF] [80..BF]
      // [F4] [80..8F] [80..BF] [80..BF]
      // Only the 80..BF continuation range is checked here; the [F0, 80..8F] and
      // [F4, 90..] cases are caught by Surrogate.neededFor(uc) on the decoded value.
      private static boolean isMalformed4(int b2, int b3, int b4) {
          return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80;
      }

      /**
       * Reads up to {@code n - 1} trailing bytes from {@code src} and reports how many
       * bytes of an {@code n}-byte (5 or 6 byte) sequence form the malformed prefix.
       */
      private static CoderResult lookupN(ByteBuffer src, int n) {
          for (int i = 1; i < n; i++) {
              if (isNotContinuation(src.get())) {
                  return CoderResult.malformedForLength(i);
              }
          }
          return CoderResult.malformedForLength(n);
      }

      /**
       * Computes the malformed-input result for a sequence that starts at the current
       * position of {@code src}.
       *
       * <p>This method reads from {@code src} with relative {@code get()} calls and
       * therefore advances its position; callers that need the position preserved must
       * restore it afterwards.
       *
       * @param src buffer positioned at the first byte of the offending sequence
       * @param nb  the sequence length implied by the lead byte (1 to 4); 1 is also used
       *            for lead bytes that match no 1- to 4-byte pattern
       * @return a malformed result with the length of the offending prefix, or
       *         {@link CoderResult#UNDERFLOW} when a 5/6-byte sequence is incomplete
       * @throws IllegalStateException if {@code nb} is not in 1..4
       */
      public static CoderResult malformedN(ByteBuffer src, int nb) {
          int b1;
          int b2;
          switch (nb) {
              case 1:
                  b1 = src.get();
                  if ((b1 >> 2) == -2) {
                      // 5 bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                      if (src.remaining() < 4) {
                          return CoderResult.UNDERFLOW;
                      }
                      return lookupN(src, 5);
                  }
                  if ((b1 >> 1) == -2) {
                      // 6 bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                      if (src.remaining() < 5) {
                          return CoderResult.UNDERFLOW;
                      }
                      return lookupN(src, 6);
                  }
                  return CoderResult.malformedForLength(1);
              case 2: // always 1
                  return CoderResult.malformedForLength(1);
              case 3:
                  b1 = src.get();
                  b2 = src.get(); // no need to look at b3
                  return CoderResult.malformedForLength(
                          ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80) || isNotContinuation(b2)) ? 1 : 2);
              case 4: // speed does not matter on this path
                  b1 = src.get() & 0xff;
                  b2 = src.get() & 0xff;
                  if (b1 > 0xf4
                          || (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf))
                          || (b1 == 0xf4 && (b2 & 0xf0) != 0x80)
                          || isNotContinuation(b2)) {
                      return CoderResult.malformedForLength(1);
                  }
                  if (isNotContinuation(src.get())) {
                      return CoderResult.malformedForLength(2);
                  }
                  return CoderResult.malformedForLength(3);
              default:
                  throw new IllegalStateException();
          }
      }

      /**
       * Builds the malformed result for the sequence at array index {@code sp} and then
       * stores the decode progress back into both buffers.
       *
       * @param sp index into the backing array of {@code src} (array offset included)
       * @param dp index into the backing array of {@code dst} (array offset included)
       * @param nb sequence length implied by the lead byte
       */
      private static CoderResult malformed(ByteBuffer src, int sp, CharBuffer dst, int dp, int nb) {
          int srcPos = sp - src.arrayOffset();
          src.position(srcPos);
          CoderResult cr = malformedN(src, nb);
          // malformedN advanced src; put it back on the first offending byte.
          updatePositions(src, srcPos, dst, dp - dst.arrayOffset());
          return cr;
      }

      /**
       * Stores the decode progress back into both buffers and reports why decoding
       * stopped: underflow when the input is exhausted ({@code nb == 0}) or fewer than
       * {@code nb} source bytes remain, overflow otherwise.
       *
       * @param sp index into the backing array of {@code src} (array offset included)
       * @param sl end index (exclusive) into the backing array of {@code src}
       * @param dp index into the backing array of {@code dst} (array offset included)
       * @param nb number of source bytes the pending sequence needs, or 0 at end of input
       */
      private static CoderResult xflow(ByteBuffer src, int sp, int sl, CharBuffer dst, int dp, int nb) {
          // Array indices include arrayOffset(); buffer positions do not.
          updatePositions(src, sp - src.arrayOffset(), dst, dp - dst.arrayOffset());
          return (nb == 0 || sl - sp < nb) ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
      }

      /**
       * Decodes between two array-backed buffers, working directly on the backing arrays.
       * All local indices are absolute array indices (array offset included).
       */
      private CoderResult decodeArrayLoop(ByteBuffer src, CharBuffer dst) {
          // This method is optimized for ASCII input.
          byte[] srcArray = src.array();
          int srcPosition = src.arrayOffset() + src.position();
          int srcLength = src.arrayOffset() + src.limit();

          char[] destArray = dst.array();
          int destPosition = dst.arrayOffset() + dst.position();
          int destLength = dst.arrayOffset() + dst.limit();
          int destLengthASCII = destPosition + Math.min(srcLength - srcPosition, destLength - destPosition);

          // ASCII only loop
          while (destPosition < destLengthASCII && srcArray[srcPosition] >= 0) {
              destArray[destPosition++] = (char) srcArray[srcPosition++];
          }

          while (srcPosition < srcLength) {
              int b1 = srcArray[srcPosition];
              if (b1 >= 0) {
                  // 1 byte, 7 bits: 0xxxxxxx
                  if (destPosition >= destLength) {
                      return xflow(src, srcPosition, srcLength, dst, destPosition, 1);
                  }
                  destArray[destPosition++] = (char) b1;
                  srcPosition++;
              } else if ((b1 >> 5) == -2) {
                  // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                  if (srcLength - srcPosition < 2 || destPosition >= destLength) {
                      return xflow(src, srcPosition, srcLength, dst, destPosition, 2);
                  }
                  int b2 = srcArray[srcPosition + 1];
                  if (isMalformed2(b1, b2)) {
                      return malformed(src, srcPosition, dst, destPosition, 2);
                  }
                  // XOR with 0x0f80 strips the sign-extended marker bits of both bytes.
                  destArray[destPosition++] = (char) (((b1 << 6) ^ b2) ^ 0x0f80);
                  srcPosition += 2;
              } else if ((b1 >> 4) == -2) {
                  // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                  if (srcLength - srcPosition < 3 || destPosition >= destLength) {
                      return xflow(src, srcPosition, srcLength, dst, destPosition, 3);
                  }
                  int b2 = srcArray[srcPosition + 1];
                  int b3 = srcArray[srcPosition + 2];
                  if (isMalformed3(b1, b2, b3)) {
                      return malformed(src, srcPosition, dst, destPosition, 3);
                  }
                  // XOR with 0x1f80 strips the marker bits once truncated to 16 bits.
                  destArray[destPosition++] = (char) (((b1 << 12) ^ (b2 << 6) ^ b3) ^ 0x1f80);
                  srcPosition += 3;
              } else if ((b1 >> 3) == -2) {
                  // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                  // Needs room for a surrogate pair (two chars) in the destination.
                  if (srcLength - srcPosition < 4 || destLength - destPosition < 2) {
                      return xflow(src, srcPosition, srcLength, dst, destPosition, 4);
                  }
                  int b2 = srcArray[srcPosition + 1];
                  int b3 = srcArray[srcPosition + 2];
                  int b4 = srcArray[srcPosition + 3];
                  int uc = ((b1 & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3f) << 6) | (b4 & 0x3f);
                  if (isMalformed4(b2, b3, b4) || !Surrogate.neededFor(uc)) {
                      return malformed(src, srcPosition, dst, destPosition, 4);
                  }
                  destArray[destPosition++] = Surrogate.high(uc);
                  destArray[destPosition++] = Surrogate.low(uc);
                  srcPosition += 4;
              } else {
                  return malformed(src, srcPosition, dst, destPosition, 1);
              }
          }
          return xflow(src, srcPosition, srcLength, dst, destPosition, 0);
      }

      /**
       * Decodes buffers that do not expose a backing array by running the array loop on
       * temporary heap copies and then transferring the progress and output back.
       */
      private CoderResult decodeViaHeapCopy(ByteBuffer src, CharBuffer dst) {
          ByteBuffer heapSrc = ByteBuffer.allocate(src.remaining());
          heapSrc.put(src.duplicate()); // duplicate() leaves src's own position untouched
          heapSrc.flip();
          CharBuffer heapDst = CharBuffer.allocate(dst.remaining());

          CoderResult result = decodeArrayLoop(heapSrc, heapDst);

          // heapSrc started at position 0, so its position is the number of bytes consumed.
          src.position(src.position() + heapSrc.position());
          heapDst.flip();
          dst.put(heapDst);
          return result;
      }

      /**
       * Decodes as many bytes as possible from {@code src} into {@code dst}.
       *
       * @param src the input bytes; its position is advanced past the consumed bytes
       * @param dst the output chars; its position is advanced past the produced chars
       * @return underflow, overflow, or a malformed-input result
       */
      @Override
      protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
          if (src.hasArray() && dst.hasArray()) {
              return decodeArrayLoop(src, dst);
          }
          return decodeViaHeapCopy(src, dst);
      }

      /**
       * Sets the positions of both buffers.
       *
       * @param sp new position of {@code src}, relative to the buffer (not the backing array)
       * @param dp new position of {@code dst}, relative to the buffer (not the backing array)
       */
      static final void updatePositions(Buffer src, int sp, Buffer dst, int dp) {
          src.position(sp);
          dst.position(dp);
      }

      /** Helpers for splitting a supplementary code point into a UTF-16 surrogate pair. */
      private static class Surrogate {

          public static final int UCS4_MIN = 0x10000;
          public static final int UCS4_MAX = (1 << 20) + UCS4_MIN - 1;

          /** Returns {@code true} if {@code uc} lies in U+10000..U+10FFFF. */
          public static boolean neededFor(int uc) {
              return (uc >= UCS4_MIN) && (uc <= UCS4_MAX);
          }

          /** Returns the high (leading) surrogate for {@code uc}. */
          public static char high(int uc) {
              assert neededFor(uc);
              return (char) (0xd800 | (((uc - UCS4_MIN) >> 10) & 0x3ff));
          }

          /** Returns the low (trailing) surrogate for {@code uc}. */
          public static char low(int uc) {
              assert neededFor(uc);
              return (char) (0xdc00 | ((uc - UCS4_MIN) & 0x3ff));
          }
      }
  }