/src/tools/wrappers-generator/unicode/unicode_parser_buffer.e
Specman e | 380 lines | 316 code | 24 blank | 40 comment | 29 complexity | 77bb4635e1482fd8cbc79af12db08f21 MD5 | raw file
-- See the Copyright notice at the end of this file.
--
class UNICODE_PARSER_BUFFER
   --
   -- A UTF-8 aware parser buffer for XML parsing.
   --
   -- Codes read from the input of `url' are decoded according to the selected encoding and
   -- appended to an internal backlog, so that the parser can step backwards (`previous') and
   -- jump to saved positions (`restore_position') without re-reading the stream.
   --

insert
   RECYCLABLE
   PLATFORM

creation {ANY}
   connect_to

feature {ANY}
   unknown_character: INTEGER is 0x0000fffd
         -- Code stored in the backlog in place of a byte sequence that cannot be decoded.

feature {ANY}
   connect_to (a_url: like url; a_encoding: STRING) is
         -- Attach the buffer to `a_url', reset the position counters and read the first code.
         -- When `a_encoding' is Void no encoding is selected (`size' = 0): in that mode `next'
         -- flags any code above 0x7f as an error.
      require
         not is_connected
      do
         if backlog = Void then
            create backlog.make(0)
         else
            check
               backlog.is_empty
            end
         end
         url := a_url
         if a_encoding /= Void then
            set_encoding(a_encoding)
         else
            size := 0
         end
         index := -1
         line := 1
         column := 1
         at_error := False
         next
      end

   disconnect is
         -- Forget the URL and empty the backlog.
      do
         url := Void
         backlog.clear_count
      ensure
         not is_connected
      end

   set_encoding (a_encoding: STRING) is
         -- Select the decoding mode from the (case-insensitive) encoding name:
         -- "UTF-8" selects multi-byte decoding, "UTF-16" is not implemented yet,
         -- anything else is read one byte per code.
      require
         a_encoding /= Void
      local
         enc: STRING
      do
         -- Work on an upper-cased copy so that the caller's string is left untouched.
         enc := once ""
         enc.copy(a_encoding)
         enc.to_upper
         inspect
            enc
         when "UTF-8" then
            size := 4
         when "UTF-16" then
            size := 8
            not_yet_implemented
         else
            size := 1
         end
      end

   url: URL
         -- The URL the codes are read from; Void when not connected.

   index, line, column: INTEGER
         -- Current position: `index' in the backlog, and the `line' / `column' counters
         -- maintained by `next' and `previous'.

   at_error: BOOLEAN
         -- Set by `next' when a non-ASCII code is met while no encoding is selected.

   is_connected: BOOLEAN is
         -- Is there a URL attached, and is that URL itself connected?
      do
         Result := url /= Void and then url.is_connected
      end

   end_of_input: BOOLEAN is
         -- Is the position past the last buffered code while the input itself is exhausted?
      require
         is_connected
      do
         Result := index > backlog.upper and then url.input.end_of_input
      end

   can_read_character: BOOLEAN is
         -- Can `next' make progress, either inside the backlog or by reading the input?
      do
         Result := index < backlog.upper or else url.input.can_read_character
      end

   next is
         -- Move to the next code. When the position is on the last buffered code, one more code
         -- is first read from the input and decoded according to the selected encoding.
         -- Undecodable UTF-8 sequences are stored as `unknown_character'.
      require
         is_connected
         not end_of_input
         can_read_character
         not at_error
      local
         input: INPUT_STREAM; n: INTEGER
      do
         if index = backlog.upper then
            input := url.input
            inspect
               size
            when 0 then
               -- no encoding set, it is an error to have a non-ASCII character
               input.read_character
               if not input.end_of_input then
                  if input.last_character.code > 0x7f then
                     at_error := True
                  else
                     backlog.add_last(input.last_character.code)
                  end
               end
            when 1 then
               -- ASCII and 8-bit pages
               input.read_character
               if not input.end_of_input then
                  backlog.add_last(input.last_character.code)
               end
            when 4 then
               -- UTF-8
               input.read_character
               if not input.end_of_input then
                  n := input.last_character.code
                  if n < 0x00000080 then
                     -- plain ASCII, one byte
                     backlog.add_last(n)
                  else
                     -- lead byte of a multi-byte sequence (or a stray byte)
                     backlog.add_last(utf8_multibyte_code(input, n))
                  end
               end
            when 8 then
               -- UTF-16
               not_yet_implemented
            end
         end
         if not at_error then
            index := index + 1

            if not end_of_input then
               -- The counters are updated from the code that has just become current.
               if code = '%N'.code then
                  line := line + 1
                  column := 1
               else
                  column := column + 1
               end
            end
         end
      ensure
         at_error or else (not url.input.end_of_input implies backlog.valid_index(index))
         at_error or else (url.input.end_of_input implies index = backlog.upper + 1)
      end

   previous is
         -- Move back to the previous code and update `line' and `column' accordingly.
      require
         is_connected
         index > 0
      local
         i: INTEGER
      do
         index := index - 1
         if code = '%N'.code then
            line := line - 1
            -- NOTE(review): the scan starts at `index', whose item is the newline just tested
            -- above, so the loop exits at once and `column' is left at 0 -- confirm that this
            -- is the intended convention.
            from
               column := 0
               i := index
            until
               i = 0 or else backlog.item(i) = '%N'.code
            loop
               column := column + 1
               i := i - 1
            end
         else
            column := column - 1
         end
      ensure
         not end_of_input
         backlog.valid_index(index)
      end

   save_position: UNICODE_PARSER_POSITION is
         -- The current `index', `line' and `column', for a later `restore_position'.
      do
         Result.set(index, line, column)
      end

   restore_position (a_position: UNICODE_PARSER_POSITION) is
         -- Set `index', `line' and `column' back to the values held by `a_position'.
      do
         index := a_position.index
         line := a_position.line
         column := a_position.column
      end

   set_index (a_index: like index) is
         -- Jump to `a_index'; `line' and `column' are not updated.
      require
         valid_index(a_index)
      do
         index := a_index
      end

   valid_index (a_index: like index): BOOLEAN is
         -- Does `a_index' designate a code held in the backlog?
      do
         -- The argument must be tested here, not the current `index'.
         Result := backlog.valid_index(a_index)
      end

   character: CHARACTER is
         -- The current code as a CHARACTER.
      require
         is_connected
         is_character
      do
         Result := code.to_character
      ensure
         Result.code = code
      end

   code: INTEGER is
         -- The code at the current `index' of the backlog.
      require
         is_connected
      do
         Result := backlog.item(index)
      end

   is_character: BOOLEAN is
         -- Does the current code lie between 0 and `Maximum_character_code'?
      require
         is_connected
      local
         i: INTEGER
      do
         i := code
         Result := i >= 0 and then i <= Maximum_character_code
      end

   append_substring_in (string: UNICODE_STRING; first, last: INTEGER) is
         -- Extend `string' with the buffered codes from `first' to `last', both included.
      require
         string /= Void
         first <= last
         valid_index(first)
         valid_index(last)
      local
         i: INTEGER
      do
         from
            i := first
         until
            i > last
         loop
            string.extend(backlog.item(i))
            i := i + 1
         end
      end

feature {}
   backlog: FAST_ARRAY[INTEGER]
         -- The codes decoded so far.

   size: INTEGER
         -- Decoding mode: 0 = no encoding selected, 1 = one byte per code, 4 = UTF-8, 8 = UTF-16.

   utf8_multibyte_code (input: INPUT_STREAM; lead: INTEGER): INTEGER is
         -- The code of the UTF-8 sequence whose first byte `lead' has just been read from
         -- `input'. The continuation bytes are read from `input'. The Result is
         -- `unknown_character' when `lead' is not a valid 2-, 3- or 4-byte lead, when the input
         -- ends inside the sequence, or when a byte is not of the form 10xxxxxx; reading stops
         -- at the first such byte. Overlong forms and surrogate values are not rejected.
         --
         -- The masks are written with eight hexadecimal digits like the other constants of
         -- this class.
      require
         input /= Void
         lead >= 0x00000080
      local
         remaining, byte: INTEGER
      do
         if lead & 0x000000e0 = 0x000000c0 then
            -- 110xxxxx: 2 bytes
            Result := lead & 0x0000001f
            remaining := 1
         elseif lead & 0x000000f0 = 0x000000e0 then
            -- 1110xxxx: 3 bytes
            Result := lead & 0x0000000f
            remaining := 2
         elseif lead & 0x000000f8 = 0x000000f0 then
            -- 11110xxx: 4 bytes
            Result := lead & 0x00000007
            remaining := 3
         else
            -- invalid code
            Result := unknown_character
         end
         from
         until
            remaining = 0
         loop
            input.read_character
            if input.end_of_input then
               -- truncated sequence
               Result := unknown_character
               remaining := 0
            else
               byte := input.last_character.code
               if byte & 0x000000c0 = 0x00000080 then
                  -- Each continuation byte contributes its six low bits.
                  Result := (Result |<< 6) | (byte & 0x0000003f)
                  remaining := remaining - 1
               else
                  Result := unknown_character
                  remaining := 0
               end
            end
         end
      end

feature {RECYCLING_POOL}
   recycle is
         -- Drop the reference to the URL.
      do
         url := Void
      end

end -- class UNICODE_PARSER_BUFFER
--
-- ------------------------------------------------------------------------------------------------------------
-- Copyright notice below. Please read.
--
-- This file is part of the SmartEiffel standard library.
-- Copyright(C) 1994-2002: INRIA - LORIA (INRIA Lorraine) - ESIAL U.H.P.       - University of Nancy 1 - FRANCE
-- Copyright(C) 2003-2006: INRIA - LORIA (INRIA Lorraine) - I.U.T. Charlemagne - University of Nancy 2 - FRANCE
--
-- Authors: Dominique COLNET, Philippe RIBET, Cyril ADRIAN, Vincent CROIZIER, Frederic MERIZEN
--
-- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
-- documentation files (the "Software"), to deal in the Software without restriction, including without
-- limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
-- the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
-- conditions:
--
-- The above copyright notice and this permission notice shall be included in all copies or substantial
-- portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
-- LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
-- EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
-- AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-- OR OTHER DEALINGS IN THE SOFTWARE.
--
-- http://SmartEiffel.loria.fr - SmartEiffel@loria.fr
-- ------------------------------------------------------------------------------------------------------------