/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs

https://github.com/dotnet/runtime · C# · 217 lines · 77 code · 38 blank · 102 comment · 6 complexity · 3756ea5f4b8b0b04d1498cc055fbf59d MD5 · raw file

  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. using System.Runtime.CompilerServices;
  4. using System.Diagnostics;
  5. namespace System.Text.Unicode
  6. {
  internal static partial class Utf16Utility
  {
      /// <summary>
      /// Returns true iff the UInt32 represents two ASCII UTF-16 characters in machine endianness.
      /// </summary>
      /// <param name="value">Two packed UTF-16 code units.</param>
      /// <returns>true if neither 16-bit word has a bit set outside its low 7 bits.</returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool AllCharsInUInt32AreAscii(uint value)
      {
          return (value & ~0x007F_007Fu) == 0;
      }

      /// <summary>
      /// Returns true iff the UInt64 represents four ASCII UTF-16 characters in machine endianness.
      /// </summary>
      /// <param name="value">Four packed UTF-16 code units.</param>
      /// <returns>true if none of the 16-bit words has a bit set outside its low 7 bits.</returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool AllCharsInUInt64AreAscii(ulong value)
      {
          return (value & ~0x007F_007F_007F_007Ful) == 0;
      }

      /// <summary>
      /// Given a UInt32 that represents two ASCII UTF-16 characters, returns the invariant
      /// lowercase representation of those characters. Requires the input value to contain
      /// two ASCII UTF-16 characters in machine endianness.
      /// </summary>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      /// <param name="value">Two packed ASCII UTF-16 code units.</param>
      /// <returns>The input with every [A-Z] word replaced by its [a-z] counterpart.</returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static uint ConvertAllAsciiCharsInUInt32ToLowercase(uint value)
      {
          // ASSUMPTION: Caller has validated that input value is ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(value));

          // Each word is at most 0x7F, so the per-word additions below never carry
          // or borrow across the 16-bit word boundary.

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'A'
          uint lowerIndicator = value + 0x0080_0080u - 0x0041_0041u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'Z'
          uint upperIndicator = value + 0x0080_0080u - 0x005B_005Bu;

          // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z'
          uint combinedIndicator = (lowerIndicator ^ upperIndicator);

          // the 0x20 bit of each word of 'mask' will be set iff the word has value >= 'A' and <= 'Z'
          uint mask = (combinedIndicator & 0x0080_0080u) >> 2;

          return value ^ mask; // bit flip uppercase letters [A-Z] => [a-z]
      }

      /// <summary>
      /// Given a UInt32 that represents two ASCII UTF-16 characters, returns the invariant
      /// uppercase representation of those characters. Requires the input value to contain
      /// two ASCII UTF-16 characters in machine endianness.
      /// </summary>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      /// <param name="value">Two packed ASCII UTF-16 code units.</param>
      /// <returns>The input with every [a-z] word replaced by its [A-Z] counterpart.</returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static uint ConvertAllAsciiCharsInUInt32ToUppercase(uint value)
      {
          // Intrinsified in mono interpreter
          // ASSUMPTION: Caller has validated that input value is ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(value));

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'a'
          uint lowerIndicator = value + 0x0080_0080u - 0x0061_0061u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'z'
          uint upperIndicator = value + 0x0080_0080u - 0x007B_007Bu;

          // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z'
          uint combinedIndicator = (lowerIndicator ^ upperIndicator);

          // the 0x20 bit of each word of 'mask' will be set iff the word has value >= 'a' and <= 'z'
          uint mask = (combinedIndicator & 0x0080_0080u) >> 2;

          return value ^ mask; // bit flip lowercase letters [a-z] => [A-Z]
      }

      /// <summary>
      /// Given a UInt32 that represents two ASCII UTF-16 characters, returns true iff
      /// the input contains one or more lowercase ASCII characters.
      /// </summary>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      /// <param name="value">Two packed ASCII UTF-16 code units.</param>
      /// <returns>true if at least one word is in [a-z].</returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool UInt32ContainsAnyLowercaseAsciiChar(uint value)
      {
          // ASSUMPTION: Caller has validated that input value is ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(value));

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'a'
          uint lowerIndicator = value + 0x0080_0080u - 0x0061_0061u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'z'
          uint upperIndicator = value + 0x0080_0080u - 0x007B_007Bu;

          // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'a' and <= 'z'
          uint combinedIndicator = (lowerIndicator ^ upperIndicator);

          return (combinedIndicator & 0x0080_0080u) != 0;
      }

      /// <summary>
      /// Given a UInt32 that represents two ASCII UTF-16 characters, returns true iff
      /// the input contains one or more uppercase ASCII characters.
      /// </summary>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      /// <param name="value">Two packed ASCII UTF-16 code units.</param>
      /// <returns>true if at least one word is in [A-Z].</returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool UInt32ContainsAnyUppercaseAsciiChar(uint value)
      {
          // ASSUMPTION: Caller has validated that input value is ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(value));

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'A'
          uint lowerIndicator = value + 0x0080_0080u - 0x0041_0041u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff the word has value > 'Z'
          uint upperIndicator = value + 0x0080_0080u - 0x005B_005Bu;

          // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word has value >= 'A' and <= 'Z'
          uint combinedIndicator = (lowerIndicator ^ upperIndicator);

          return (combinedIndicator & 0x0080_0080u) != 0;
      }

      /// <summary>
      /// Given two UInt32s that represent two ASCII UTF-16 characters each, returns true iff
      /// the two inputs are equal using an ordinal case-insensitive comparison.
      /// </summary>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      /// <param name="valueA">First pair of packed ASCII UTF-16 code units.</param>
      /// <param name="valueB">Second pair of packed ASCII UTF-16 code units.</param>
      /// <returns>
      /// true if, word by word, the inputs are identical or are the same letter [A-Za-z]
      /// differing only in case; otherwise false.
      /// </returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool UInt32OrdinalIgnoreCaseAscii(uint valueA, uint valueB)
      {
          // Intrinsified in mono interpreter
          // NOTE(review): the managed logic below is what this file defines; any interpreter
          // intrinsic lives elsewhere and should be confirmed to agree on the '@' / '`' boundary.
          // ASSUMPTION: Caller has validated that input values are ASCII.
          Debug.Assert(AllCharsInUInt32AreAscii(valueA));
          Debug.Assert(AllCharsInUInt32AreAscii(valueB));

          // a mask of all bits which are different between A and B
          uint differentBits = valueA ^ valueB;

          // Fold case *before* doing either range check. Checking the lower bound against the
          // raw word ('A') while checking the upper bound against the folded word ('z') would
          // classify '`' (0x60) as a letter, and since '`' and '@' (0x40) differ only in the
          // 0x20 bit they would then compare as equal. Each folded word is in [0x20, 0x7F], so
          // the additions below cannot carry or borrow across the 16-bit word boundary.
          uint foldedA = valueA | 0x0020_0020u;

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff (word | 0x20) has value < 'a'
          uint lowerIndicator = foldedA + 0x0100_0100u - 0x0061_0061u;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff (word | 0x20) has value > 'z'
          uint upperIndicator = foldedA + 0x0080_0080u - 0x007B_007Bu;

          // the 0x80 bit of each word of 'combinedIndicator' will be set iff the word is *not* [A-Za-z]
          uint combinedIndicator = lowerIndicator | upperIndicator;

          // Shift all the 0x80 bits of 'combinedIndicator' into the 0x20 positions, then set all bits
          // aside from 0x20. This creates a mask where all bits are set *except* for the 0x20 bits
          // which correspond to alpha chars (either lower or upper). For these alpha chars only, the
          // 0x20 bit is allowed to differ between the two input values. Every other char must be an
          // exact bitwise match between the two input values. In other words, (valueA & mask) will
          // convert valueA to uppercase, so (valueA & mask) == (valueB & mask) answers "is the uppercase
          // form of valueA equal to the uppercase form of valueB?" (If valueA has an alpha char in the
          // same position as a non-alpha char in valueB, the only word that differs from the alpha
          // char solely in the 0x20 bit is that letter's other-case form, so the comparison still
          // computes as unequal, which is what we want.)
          //
          // The line below is a more efficient way of doing the same check taking advantage of the XOR
          // computation we performed at the beginning of the method.
          return (((combinedIndicator >> 2) | ~0x0020_0020u) & differentBits) == 0;
      }

      /// <summary>
      /// Given two UInt64s that represent four ASCII UTF-16 characters each, returns true iff
      /// the two inputs are equal using an ordinal case-insensitive comparison.
      /// </summary>
      /// <remarks>
      /// This is a branchless implementation.
      /// </remarks>
      /// <param name="valueA">First group of four packed ASCII UTF-16 code units.</param>
      /// <param name="valueB">Second group of four packed ASCII UTF-16 code units.</param>
      /// <returns>
      /// true if, word by word, the inputs are identical or are the same letter [A-Za-z]
      /// differing only in case; otherwise false.
      /// </returns>
      [MethodImpl(MethodImplOptions.AggressiveInlining)]
      internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB)
      {
          // Intrinsified in mono interpreter
          // NOTE(review): the managed logic below is what this file defines; any interpreter
          // intrinsic lives elsewhere and should be confirmed to agree on the '@' / '`' boundary.
          // ASSUMPTION: Caller has validated that input values are ASCII.
          Debug.Assert(AllCharsInUInt64AreAscii(valueA));
          Debug.Assert(AllCharsInUInt64AreAscii(valueB));

          // Fold case *before* doing either range check; mixing a raw lower bound ('A') with a
          // folded upper bound ('z') would treat '`' (0x60) as a letter and make it compare
          // equal to '@' (0x40). Each folded word is in [0x20, 0x7F], so the additions below
          // cannot carry or borrow across the 16-bit word boundary.
          ulong foldedA = valueA | 0x0020_0020_0020_0020ul;

          // the 0x80 bit of each word of 'lowerIndicator' will be set iff (word | 0x20) has value >= 'a'
          ulong lowerIndicator = foldedA + 0x0080_0080_0080_0080ul - 0x0061_0061_0061_0061ul;

          // the 0x80 bit of each word of 'upperIndicator' will be set iff (word | 0x20) has value <= 'z'
          ulong upperIndicator = foldedA + 0x0100_0100_0100_0100ul - 0x007B_007B_007B_007Bul;

          // the 0x20 bit of each word of 'combinedIndicator' will be set iff the word is [A-Za-z]
          ulong combinedIndicator = (0x0080_0080_0080_0080ul & lowerIndicator & upperIndicator) >> 2;

          // Convert both values to lowercase (using the combined indicator from the first value)
          // and compare for equality. It's possible that the first value will contain an alpha character
          // where the second value doesn't (or vice versa), and applying the combined indicator will
          // create nonsensical data, but the comparison would have failed anyway in this case so it's
          // a safe operation to perform.
          //
          // This 64-bit method is similar to the 32-bit method, but it performs the equivalent of convert-to-
          // lowercase-then-compare rather than convert-to-uppercase-and-compare. This particular operation
          // happens to be faster on x64.
          return (valueA | combinedIndicator) == (valueB | combinedIndicator);
      }
  }
  179. }