/src/tools/wrappers-generator/unicode/utf8_parser.e
Specman e | 162 lines | 105 code | 6 blank | 51 comment | 10 complexity | 97496a742be9949e0992a052d9e7a667 MD5 | raw file
-- See the Copyright notice at the end of this file.
--
expanded class UTF8_PARSER
   --
   -- Reads STRINGs holding UTF-8 encoded text and appends the decoded values
   -- to UNICODE_STRING objects.
   --
   -- The input is always decoded up to its end: wherever the UTF-8 data is
   -- wrong, the "invalid" character (65533, i.e. 0xFFFD) is appended instead
   -- and the first such problem is reported through `first_error'.
   --

insert
   STRING_HANDLER

feature {ANY}
   decode (utf8_string: STRING; unicode_string: UNICODE_STRING) is
         -- Decode the bytes of `utf8_string' and append the result to
         -- `unicode_string'.
         --
         -- `first_error' is reset to Void on entry. When at least one
         -- problem is met, it describes the first one, and:
         --  * `first_error_index' locates that problem in `utf8_string';
         --  * `first_error_character' is the count of `unicode_string'
         --    just after the matching "invalid" character was appended.
         -- Decoding goes on after an error; only the first one is recorded.
      require
         utf8_string /= Void
         unicode_string /= Void
      local
         pos, pending, seq_length: INTEGER; code_point: INTEGER
         bytes: NATIVE_ARRAY[CHARACTER]; last_pos: INTEGER; overlong: BOOLEAN
      do
         from
            -- `pos' starts at its default value 0: `bytes' is indexed
            -- from 0 up to `last_pos'.
            first_error := Void
            bytes := utf8_string.storage
            last_pos := utf8_string.count - 1
         until
            pos > last_pos
         loop
            -- Step 1: read the first byte of the next sequence.
            -- `pending' becomes the announced length of the sequence
            -- (0 when nothing more has to be read).
            code_point := bytes.item(pos).code
            pos := pos + 1
            inspect
               code_point
            when 0 .. 127 then
               -- Single byte character, stored as is.
               pending := 0
               unicode_string.extend(code_point)
            when 192 .. 223 then
               -- First byte of a 2 bytes sequence.
               pending := 2
               code_point := code_point - 192
            when 224 .. 239 then
               -- First byte of a 3 bytes sequence.
               pending := 3
               code_point := code_point - 224
            when 240 .. 247 then
               -- First byte of a 4 bytes sequence.
               pending := 4
               code_point := code_point - 240
            else
               -- Any other value is rejected as first byte.
               pending := 0
               unicode_string.extend(65533)
               if first_error = Void then
                  first_error := once "Invalid byte as first character of %
                                      %UTF-8 sequence."
                  -- `pos' has already been moved past the rejected byte.
                  first_error_index := pos
                  first_error_character := unicode_string.count
               end
            end
            -- Step 2: read the continuation bytes (128 .. 191), each one
            -- bringing 6 more bits. `pending' reaches 1 when all of them
            -- were read, or is forced to 0 when the sequence is broken.
            from
               seq_length := pending
            until
               pending <= 1
            loop
               if pos > last_pos or else not bytes.item(pos).code.in_range(128, 191) then
                  -- Input exhausted or unexpected byte. The unexpected byte
                  -- is not consumed: the outer loop reads it again as the
                  -- first byte of a new sequence.
                  unicode_string.extend(65533)
                  if first_error = Void then
                     first_error := once ""
                     first_error.clear_count
                     if pos > last_pos then
                        first_error.append(once " Missing character number ")
                     else
                        first_error.append(once "Invalid byte in UTF-8 %
                                                %sequence. This character is %
                                                % number ")
                     end
                     (seq_length - pending + 2).append_in(first_error)
                     first_error.append(once " in ")
                     seq_length.append_in(first_error)
                     first_error.append(once " bytes sequence.")
                     -- NOTE(review): this stores the index in `bytes' of the
                     -- offending (or missing) byte, whereas the other error
                     -- cases store an index in `bytes' plus one -- confirm
                     -- that this difference is intended.
                     first_error_index := pos
                     first_error_character := unicode_string.count
                  end
                  pending := 0
               else
                  code_point := code_point * 64 + bytes.item(pos).code - 128
                  pos := pos + 1
                  pending := pending - 1
               end
            end
            -- Step 3: a complete multi-bytes sequence was read, check the
            -- decoded value before storing it.
            if pending = 1 then
               -- The value would have fit in a shorter sequence.
               overlong := code_point < 128
                  or else (code_point < 2048 and then seq_length > 2)
                  or else (code_point < 65536 and then seq_length > 3)
               if overlong then
                  unicode_string.extend(65533)
                  if first_error = Void then
                     first_error := once "Overlong sequence, must be refused %
                                         %by any UTF-8 complient decoder for%
                                         % security reasons."
                     -- Refers to the first byte of the sequence.
                     first_error_index := pos - seq_length + 1
                     first_error_character := unicode_string.count
                  end
               elseif not unicode_string.valid_unicode(code_point) then
                  unicode_string.extend(65533)
                  if first_error = Void then
                     first_error := once ""
                     first_error.copy(once "Invalid unicode value: 0x")
                     code_point.to_hexadecimal_in(first_error)
                     first_error.append(once ". Please check unicode charts.")
                     -- Refers to the first byte of the sequence.
                     first_error_index := pos - seq_length + 1
                     first_error_character := unicode_string.count
                  end
               else
                  unicode_string.extend(code_point)
               end
            end
         end
      end

   first_error: STRING
         -- Void when the last `decode' call met no error; otherwise a
         -- description of the first error met during that call.

   first_error_index: INTEGER
         -- Meaningful only when `first_error' is not Void: location of
         -- the first error in the `utf8_string' given to `decode'.

   first_error_character: INTEGER
         -- Meaningful only when `first_error' is not Void: position, in
         -- the `unicode_string' given to `decode', of the first "invalid"
         -- character appended because of an error.

end -- class UTF8_PARSER
--
-- ------------------------------------------------------------------------------------------------------------
-- Copyright notice below. Please read.
--
-- This file is part of the SmartEiffel standard library.
-- Copyright(C) 1994-2002: INRIA - LORIA (INRIA Lorraine) - ESIAL U.H.P.       - University of Nancy 1 - FRANCE
-- Copyright(C) 2003-2006: INRIA - LORIA (INRIA Lorraine) - I.U.T. Charlemagne - University of Nancy 2 - FRANCE
--
-- Authors: Dominique COLNET, Philippe RIBET, Cyril ADRIAN, Vincent CROIZIER, Frederic MERIZEN
--
-- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
-- documentation files (the "Software"), to deal in the Software without restriction, including without
-- limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
-- the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
-- conditions:
--
-- The above copyright notice and this permission notice shall be included in all copies or substantial
-- portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
-- LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
-- EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
-- AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-- OR OTHER DEALINGS IN THE SOFTWARE.
--
-- http://SmartEiffel.loria.fr - SmartEiffel@loria.fr
-- ------------------------------------------------------------------------------------------------------------