/src/tools/wrappers-generator/unicode/utf8_parser.e

http://github.com/tybor/Liberty · Specman e · 162 lines · 105 code · 6 blank · 51 comment · 10 complexity · 97496a742be9949e0992a052d9e7a667 MD5 · raw file

  1. -- See the Copyright notice at the end of this file.
  2. --
  expanded class UTF8_PARSER
     --
     -- Decoder that reads STRINGs holding UTF-8 encoded bytes and appends the
     -- decoded code points to UNICODE_STRING objects.
     --
     -- Decoding never stops early: every malformed sequence is replaced by
     -- the "invalid" character 65533 (U+FFFD) and only the first problem met
     -- is reported through `first_error', `first_error_index' and
     -- `first_error_character'.
     --

  insert
     STRING_HANDLER

  feature {ANY}
     decode (utf8_string: STRING; unicode_string: UNICODE_STRING) is
           -- Decode the UTF-8 bytes of `utf8_string' and append the resulting
           -- code points at the end of `unicode_string'.
           --
           -- `first_error' is reset at the beginning of the call. It is Void
           -- afterwards when the whole input was well formed; otherwise it
           -- describes the first problem met, and:
           --  * `first_error_index' is the 1-based byte position of that
           --    problem in `utf8_string' (the offending or missing byte for
           --    a bad lead/continuation byte, the first byte of the sequence
           --    for an overlong or invalid code point);
           --  * `first_error_character' is the value of
           --    `unicode_string.count' just after the matching 65533
           --    character was appended.
           --
           -- The STRING attached to `first_error' is a `once' manifest
           -- string which, for some errors, is rewritten in place; a later
           -- call may therefore change its content.
        require
           utf8_string /= Void
           unicode_string /= Void
        local
           i, k, seq_length: INTEGER; v: INTEGER; storage: NATIVE_ARRAY[CHARACTER]; maxi: INTEGER
        do
           from
              first_error := Void
              storage := utf8_string.storage
              -- `i' is a 0-based offset in `storage'; `maxi' is the offset
              -- of the last byte.
              maxi := utf8_string.count - 1
              i := 0
           until
              i > maxi
           loop
              -- Read the lead byte. Once `i' is incremented it is equal to
              -- the 1-based position of the byte just read.
              v := storage.item(i).code
              i := i + 1
              -- Classify the lead byte: `k' becomes the total length of the
              -- sequence (0 when nothing more has to be read) and `v' keeps
              -- only the payload bits of the lead byte.
              inspect
                 v
              when 0 .. 127 then
                 -- Single byte (ASCII): the byte is the code point.
                 unicode_string.extend(v)
                 k := 0
              when 192 .. 223 then
                 v := v - 192
                 k := 2
              when 224 .. 239 then
                 v := v - 224
                 k := 3
              when 240 .. 247 then
                 v := v - 240
                 k := 4
              else
                 -- Neither ASCII nor a lead byte accepted above
                 -- (128 .. 191 and 248 .. 255).
                 unicode_string.extend(65533)
                 if first_error = Void then
                    first_error := once "Invalid byte as first character of %
                                        %UTF-8 sequence."
                    first_error_index := i
                    first_error_character := unicode_string.count
                 end
                 k := 0
              end
              -- Read the continuation bytes (128 .. 191), each one adding
              -- 6 bits to `v'. The loop ends with `k = 1' when the sequence
              -- is complete and with `k = 0' when it is broken or when there
              -- was nothing to read.
              from
                 seq_length := k
              until
                 k <= 1
              loop
                 if i <= maxi and then storage.item(i).code.in_range(128, 191) then
                    v := v * 64 + storage.item(i).code - 128
                    i := i + 1
                    k := k - 1
                 else
                    -- Bad continuation byte or end of input. `i' is not
                    -- advanced, hence the offending byte (if any) is
                    -- examined again as a lead byte by the enclosing loop.
                    unicode_string.extend(65533)
                    if first_error = Void then
                       first_error := once ""
                       first_error.clear_count
                       if i <= maxi then
                          first_error.append(once "Invalid byte in UTF-8 %
                                                  %sequence. This character is %
                                                  % number ")
                       else
                          first_error.append(once " Missing character number ")
                       end
                       (seq_length - k + 2).append_in(first_error)
                       first_error.append(once " in ")
                       seq_length.append_in(first_error)
                       first_error.append(once " bytes sequence.")
                       -- Here `i' is still the 0-based offset of the
                       -- offending (or missing) byte: add 1 in order to
                       -- report a 1-based position like all other errors.
                       first_error_index := i + 1
                       first_error_character := unicode_string.count
                    end
                    k := 0
                 end
              end
              if k = 1 then
                 -- A complete multi-byte sequence was read; `i' is now just
                 -- after it, so `i - seq_length + 1' is the 1-based position
                 -- of its lead byte.
                 if v < 128 or else v < 2048 and then seq_length > 2 or else v < 65536 and then seq_length > 3 then
                    -- Overlong: the value would fit in a shorter sequence
                    -- (1 byte below 128, 2 bytes below 2048, 3 bytes below
                    -- 65536).
                    unicode_string.extend(65533)
                    if first_error = Void then
                       first_error := once "Overlong sequence, must be refused %
                                           %by any UTF-8 complient decoder for%
                                           % security reasons."
                       first_error_index := i - seq_length + 1
                       first_error_character := unicode_string.count
                    end
                 elseif not unicode_string.valid_unicode(v) then
                    -- Well formed sequence, but the value is rejected by
                    -- `valid_unicode'.
                    unicode_string.extend(65533)
                    if first_error = Void then
                       first_error := once ""
                       first_error.copy(once "Invalid unicode value: 0x")
                       v.to_hexadecimal_in(first_error)
                       first_error.append(once ". Please check unicode charts.")
                       first_error_index := i - seq_length + 1
                       first_error_character := unicode_string.count
                    end
                 else
                    unicode_string.extend(v)
                 end
              end
           end
        end

     first_error: STRING
           -- Void when the last `decode' call met no error; otherwise an
           -- explanation of the first error that occurred during that call.

     first_error_index: INTEGER
           -- Meaningful only when `first_error' is not Void: 1-based byte
           -- position of the first error in the `utf8_string' given to
           -- `decode'.

     first_error_character: INTEGER
           -- Meaningful only when `first_error' is not Void: value of
           -- `unicode_string.count' just after the first error character
           -- (65533) was appended by `decode'.

  end -- class UTF8_PARSER
  130. --
  131. -- ------------------------------------------------------------------------------------------------------------
  132. -- Copyright notice below. Please read.
  133. --
  134. -- This file is part of the SmartEiffel standard library.
  135. -- Copyright(C) 1994-2002: INRIA - LORIA (INRIA Lorraine) - ESIAL U.H.P. - University of Nancy 1 - FRANCE
  136. -- Copyright(C) 2003-2006: INRIA - LORIA (INRIA Lorraine) - I.U.T. Charlemagne - University of Nancy 2 - FRANCE
  137. --
  138. -- Authors: Dominique COLNET, Philippe RIBET, Cyril ADRIAN, Vincent CROIZIER, Frederic MERIZEN
  139. --
  140. -- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
  141. -- documentation files (the "Software"), to deal in the Software without restriction, including without
  142. -- limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
  143. -- the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
  144. -- conditions:
  145. --
  146. -- The above copyright notice and this permission notice shall be included in all copies or substantial
  147. -- portions of the Software.
  148. --
  149. -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
  150. -- LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
  151. -- EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  152. -- AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
  153. -- OR OTHER DEALINGS IN THE SOFTWARE.
  154. --
  155. -- http://SmartEiffel.loria.fr - SmartEiffel@loria.fr
  156. -- ------------------------------------------------------------------------------------------------------------