PageRenderTime 20ms CodeModel.GetById 16ms app.highlight 1ms RepoModel.GetById 2ms app.codeStats 0ms

/src/tools/wrappers-generator/unicode/utf8_parser.e

http://github.com/tybor/Liberty
Specman e | 162 lines | 105 code | 6 blank | 51 comment | 10 complexity | 97496a742be9949e0992a052d9e7a667 MD5 | raw file
  1-- See the Copyright notice at the end of this file.
  2--
  3expanded class UTF8_PARSER
  4	--
  5	-- The purpose of the UTF8_PARSER is to analyse STRINGs encoded with
  6	-- UTF-8 format in order to create corresponding UNICODE_STRING objects.
  7	--
  8	-- Any sequence is decoded, "invalid" character is used where UTF-8
  9	-- sequence is wrong.
 10
 11insert
 12	STRING_HANDLER
 13
 14feature {ANY}
 15	decode (utf8_string: STRING; unicode_string: UNICODE_STRING) is
 16			-- Decode the `utf8_string' and append it in `unicode_string'.
 17			-- After this call the `first_error' attribute may be non Void
 18			-- to explain the error encountered during the `utf8_string'
 19			-- decoding.
 20			-- In case of error, the `first_error_index' is also used to
 21			-- indicate the position of the first error into the original
 22			-- `utf8_string', `first_error_character' indicate the
 23			-- position of the first error character added in `unicode_string'.
 24		require
 25			utf8_string /= Void
 26			unicode_string /= Void
 27		local
 28			i, k, seq_length: INTEGER; v: INTEGER; storage: NATIVE_ARRAY[CHARACTER]; maxi: INTEGER
 29		do
 30			from
 31				first_error := Void
 32				storage := utf8_string.storage
 33				maxi := utf8_string.count - 1
 34			until
 35				i > maxi
 36			loop
 37				v := storage.item(i).code
 38				i := i + 1
 39				inspect
 40					v
 41				when 0 .. 127 then
 42					unicode_string.extend(v)
 43					k := 0
 44				when 192 .. 223 then
 45					v := v - 192
 46					k := 2
 47				when 224 .. 239 then
 48					v := v - 224
 49					k := 3
 50				when 240 .. 247 then
 51					v := v - 240
 52					k := 4
 53				else
 54					unicode_string.extend(65533)
 55					if first_error = Void then
 56						first_error := once "Invalid byte as first character of %
 57				      %UTF-8 sequence."
 58						first_error_index := i
 59						first_error_character := unicode_string.count
 60					end
 61					k := 0
 62				end
 63				from
 64					seq_length := k
 65				until
 66					k <= 1
 67				loop
 68					if i <= maxi and then storage.item(i).code.in_range(128, 191) then
 69						v := v * 64 + storage.item(i).code - 128
 70						i := i + 1
 71						k := k - 1
 72					else
 73						unicode_string.extend(65533)
 74						if first_error = Void then
 75							first_error := once ""
 76							first_error.clear_count
 77							if i <= maxi then
 78								first_error.append(once "Invalid byte in UTF-8 %
 79						 %sequence. This character is %
 80						 % number ")
 81							else
 82								first_error.append(once " Missing character number ")
 83							end
 84							(seq_length - k + 2).append_in(first_error)
 85							first_error.append(once " in ")
 86							seq_length.append_in(first_error)
 87							first_error.append(once " bytes sequence.")
 88							first_error_index := i
 89							first_error_character := unicode_string.count
 90						end
 91						k := 0
 92					end
 93				end
 94				if k = 1 then
 95					if v < 128 or else v < 2048 and then seq_length > 2 or else v < 65536 and then seq_length > 3 then
 96						unicode_string.extend(65533)
 97						if first_error = Void then
 98							first_error := once "Overlong sequence, must be refused %
 99					%by any UTF-8 complient decoder for%
100					% security reasons."
101							first_error_index := i - seq_length + 1
102							first_error_character := unicode_string.count
103						end
104					elseif not unicode_string.valid_unicode(v) then
105						unicode_string.extend(65533)
106						if first_error = Void then
107							first_error := once ""
108							first_error.copy(once "Invalid unicode value: 0x")
109							v.to_hexadecimal_in(first_error)
110							first_error.append(once ". Please check unicode charts.")
111							first_error_index := i - seq_length + 1
112							first_error_character := unicode_string.count
113						end
114					else
115						unicode_string.extend(v)
116					end
117				end
118			end
119		end
120
121	first_error: STRING
122			-- When `first_error' is not Void, this means that an error
123			-- has occured during the last `decode' call. In such a
124			-- situation, `first_error' is an explanation of the error.
125
126	first_error_index: INTEGER
127			-- Meaningful only when `first_error' is not Void to indicate
128			-- the position of the error in `utf8_string' (see `decode').
129
130	first_error_character: INTEGER
131			-- Meaningful only when `first_error' is not Void to indicate
132			-- the position of the first error character added to
133			-- `unicode_string' (see `decode').
134
135end -- class UTF8_PARSER
136--
137-- ------------------------------------------------------------------------------------------------------------
138-- Copyright notice below. Please read.
139--
140-- This file is part of the SmartEiffel standard library.
141-- Copyright(C) 1994-2002: INRIA - LORIA (INRIA Lorraine) - ESIAL U.H.P.       - University of Nancy 1 - FRANCE
142-- Copyright(C) 2003-2006: INRIA - LORIA (INRIA Lorraine) - I.U.T. Charlemagne - University of Nancy 2 - FRANCE
143--
144-- Authors: Dominique COLNET, Philippe RIBET, Cyril ADRIAN, Vincent CROIZIER, Frederic MERIZEN
145--
146-- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
147-- documentation files (the "Software"), to deal in the Software without restriction, including without
148-- limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
149-- the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
150-- conditions:
151--
152-- The above copyright notice and this permission notice shall be included in all copies or substantial
153-- portions of the Software.
154--
155-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
156-- LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
157-- EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
158-- AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
159-- OR OTHER DEALINGS IN THE SOFTWARE.
160--
161-- http://SmartEiffel.loria.fr - SmartEiffel@loria.fr
162-- ------------------------------------------------------------------------------------------------------------