/src/tools/wrappers-generator/unicode/unicode_parser_buffer.e

http://github.com/tybor/Liberty · Specman e · 380 lines · 316 code · 24 blank · 40 comment · 29 complexity · 77bb4635e1482fd8cbc79af12db08f21 MD5 · raw file

  1. -- See the Copyright notice at the end of this file.
  2. --
  3. class UNICODE_PARSER_BUFFER
  4. --
  5. -- A UTF-8 aware parser buffer for XML parsing
  6. --
  7. insert
  8. RECYCLABLE
  9. PLATFORM
  10. creation {ANY}
  11. connect_to
  12. feature {ANY}
  unknown_character: INTEGER is 0x0000fffd
        -- Code appended to the backlog by `next' in place of a UTF-8 byte sequence it
        -- cannot decode (0xFFFD is the Unicode replacement character).
  14. feature {ANY}
  connect_to (a_url: like url; a_encoding: STRING) is
        -- Attach the buffer to `a_url', reset the cursor and read the first character.
        -- A Void `a_encoding' leaves `size' at 0, in which case `next' accepts only
        -- 7-bit characters; otherwise the decoder is chosen by `set_encoding'.
     require
        not is_connected
     do
        -- The backlog is allocated once and reused across connections; a reused one
        -- is expected to have been emptied beforehand.
        if backlog /= Void then
           check
              backlog.is_empty
           end
        else
           create backlog.make(0)
        end
        url := a_url
        if a_encoding = Void then
           size := 0
        else
           set_encoding(a_encoding)
        end
        -- Reset the cursor; the call to `next' below moves `index' from -1 to 0.
        at_error := False
        column := 1
        line := 1
        index := -1
        next
     end
  disconnect is
        -- Empty the backlog and forget the URL. Nothing is called on the URL itself:
        -- the reference is simply dropped.
     do
        backlog.clear_count
        url := Void
     ensure
        not is_connected
     end
  set_encoding (a_encoding: STRING) is
        -- Choose the decoder used by `next' from the upper-cased encoding name:
        -- "UTF-8" sets `size' to 4; "UTF-16" sets `size' to 8 and then calls
        -- `not_yet_implemented'; any other name sets `size' to 1 (one byte per character).
     require
        a_encoding /= Void
     local
        upper_name: STRING
     do
        -- A single shared scratch string holds the upper-cased copy of the name,
        -- so `a_encoding' itself is left untouched.
        upper_name := once ""
        upper_name.copy(a_encoding)
        upper_name.to_upper
        if upper_name.is_equal(once "UTF-8") then
           size := 4
        elseif upper_name.is_equal(once "UTF-16") then
           size := 8
           not_yet_implemented
        else
           size := 1
        end
     end
  url: URL
        -- The source being read; Void once `disconnect' or `recycle' has been called.

  index: INTEGER
        -- Position of the current character in the backlog.

  line, column: INTEGER
        -- Text position counters maintained by `next' and `previous';
        -- both are reset to 1 by `connect_to'.

  at_error: BOOLEAN
        -- Set by `next' when a character above 0x7f is read while no encoding is set.
  is_connected: BOOLEAN is
        -- Is a URL attached, and is that URL itself connected?
     do
        if url /= Void then
           Result := url.is_connected
        end
     end
  end_of_input: BOOLEAN is
        -- Is the cursor past the last buffered character while the underlying
        -- input stream also reports end of input?
     require
        is_connected
     do
        if index > backlog.upper then
           Result := url.input.end_of_input
        end
     end
  can_read_character: BOOLEAN is
        -- Can `next' deliver a character, either from the backlog ahead of the cursor
        -- or, failing that, from the underlying input stream?
     do
        if index < backlog.upper then
           -- Already buffered: the stream is not consulted.
           Result := True
        else
           Result := url.input.can_read_character
        end
     end
  next is
        -- Move the cursor to the next character. When the cursor is on the last
        -- buffered character, one more character is first decoded from the input
        -- stream (according to `size') and appended to the backlog.
        --
        -- UTF-8 sequences that are malformed, or cut short by the end of the input,
        -- yield `unknown_character'. With no encoding set (`size' = 0) a byte above
        -- 0x7f sets `at_error' and the cursor does not move.
        --
        -- `line' and `column' are updated for the character reached: a newline
        -- increments `line' and resets `column' to 1, anything else increments `column'.
     require
        is_connected
        not end_of_input
        can_read_character
        not at_error
     local
        input: INPUT_STREAM; n, pending, value: INTEGER
     do
        if index = backlog.upper then
           -- Nothing is buffered ahead of the cursor: read from the stream.
           input := url.input
           inspect
              size
           when 0 then
              -- no encoding set, it is an error to have a non-ASCII character
              input.read_character
              if not input.end_of_input then
                 if input.last_character.code > 0x7f then
                    at_error := True
                 else
                    backlog.add_last(input.last_character.code)
                 end
              end
           when 1 then
              -- ASCII and 8-bit pages
              input.read_character
              if not input.end_of_input then
                 backlog.add_last(input.last_character.code)
              end
           when 4 then
              -- UTF-8
              input.read_character
              if not input.end_of_input then
                 n := input.last_character.code
                 if n < 0x00000080 then
                    -- Single byte (7-bit) character.
                    backlog.add_last(n)
                 else
                    -- The lead byte gives the number of continuation bytes to expect
                    -- and carries the most significant payload bits.
                    if (n & 0x000000e0) = 0x000000c0 then
                       -- 110xxxxx: 2 bytes
                       pending := 1
                       value := n & 0x0000001f
                    elseif (n & 0x000000f0) = 0x000000e0 then
                       -- 1110xxxx: 3 bytes
                       pending := 2
                       value := n & 0x0000000f
                    elseif (n & 0x000000f8) = 0x000000f0 then
                       -- 11110xxx: 4 bytes
                       pending := 3
                       value := n & 0x00000007
                    else
                       -- Stray continuation byte or invalid lead byte.
                       pending := 0
                       value := unknown_character
                    end
                    -- Each continuation byte (10xxxxxx) contributes 6 more bits.
                    from
                    until
                       pending = 0
                    loop
                       input.read_character
                       if input.end_of_input then
                          -- Sequence cut short by the end of the input.
                          value := unknown_character
                          pending := 0
                       else
                          n := input.last_character.code
                          if (n & 0x000000c0) = 0x00000080 then
                             value := (value |<< 6) | (n & 0x0000003f)
                             pending := pending - 1
                          else
                             -- Not a continuation byte; the offending byte has been
                             -- consumed and is dropped together with the sequence.
                             value := unknown_character
                             pending := 0
                          end
                       end
                    end
                    backlog.add_last(value)
                 end
              end
           when 8 then
              -- UTF-16
              not_yet_implemented
           end
        end
        if not at_error then
           index := index + 1
           if not end_of_input then
              if code = '%N'.code then
                 line := line + 1
                 column := 1
              else
                 column := column + 1
              end
           end
        end
     ensure
        -- Stated with the buffer's own `end_of_input': the stream may already be
        -- exhausted while decoded characters are still waiting in the backlog.
        at_error or else (not end_of_input implies backlog.valid_index(index))
        at_error or else (end_of_input implies index = backlog.upper + 1)
     end
  previous is
        -- Move the cursor back by one character and undo the `line'/`column'
        -- bookkeeping that `next' performed when it reached the character being left.
        --
        -- NOTE(review): the earlier version tested the character it landed on, so its
        -- column recount started on the newline itself and never iterated (column was
        -- always 0). Callers that relied on that behaviour should be checked.
     require
        is_connected
        index > 0
     local
        i: INTEGER; leaving_newline: BOOLEAN
     do
        if backlog.valid_index(index) then
           leaving_newline := backlog.item(index) = '%N'.code
           index := index - 1
           if leaving_newline then
              -- `next' incremented `line' and reset `column' when it reached the
              -- newline: restore the line and recount the column the way `next'
              -- does (1 at line start, plus one per character up to the cursor).
              line := line - 1
              from
                 column := 1
                 i := index
              until
                 i < 0 or else backlog.item(i) = '%N'.code
              loop
                 column := column + 1
                 i := i - 1
              end
           else
              column := column - 1
           end
        else
           -- The cursor is past the last buffered character (end of input): `next'
           -- left `line' and `column' untouched when it got there, so only the
           -- index has to move.
           index := index - 1
        end
     ensure
        not end_of_input
        backlog.valid_index(index)
     end
  save_position: UNICODE_PARSER_POSITION is
        -- Snapshot of the cursor (`index', `line', `column'), suitable for a later
        -- `restore_position'.
        -- NOTE(review): `Result' is used without being created, so
        -- UNICODE_PARSER_POSITION is presumably an expanded type -- confirm.
     do
        Result.set(index, line, column)
     end
  restore_position (a_position: UNICODE_PARSER_POSITION) is
        -- Put the cursor back where `a_position' says: its `index', `line' and
        -- `column' are copied verbatim, without any validity check.
     do
        column := a_position.column
        line := a_position.line
        index := a_position.index
     end
  set_index (a_index: like index) is
        -- Jump to backlog position `a_index'.
        -- Only `index' changes: `line' and `column' keep their current values.
     require
        valid_index(a_index)
     do
        index := a_index
     end
  valid_index (a_index: like index): BOOLEAN is
        -- Does `a_index' designate a character currently held in the backlog?
        -- (The argument is tested, not the current `index'.)
     do
        Result := backlog.valid_index(a_index)
     end
  character: CHARACTER is
        -- The current character, for the case where its code fits in a CHARACTER
        -- (see `is_character').
     require
        is_connected
        is_character
     do
        Result := code.to_character
     ensure
        Result.code = code
     end
  code: INTEGER is
        -- The code stored in the backlog at the cursor position `index'.
     require
        is_connected
     do
        Result := backlog.item(index)
     end
  is_character: BOOLEAN is
        -- Does the current `code' lie between 0 and `Maximum_character_code', so that
        -- `character' may be called?
     require
        is_connected
     do
        if code >= 0 then
           Result := code <= Maximum_character_code
        end
     end
  append_substring_in (string: UNICODE_STRING; first, last: INTEGER) is
        -- Append to `string' the backlog codes at positions `first' through `last',
        -- both included, in increasing order.
     require
        string /= Void
        first <= last
        valid_index(first)
        valid_index(last)
     local
        position: INTEGER
     do
        from
           position := first
        until
           position > last
        loop
           string.extend(backlog.item(position))
           position := position + 1
        end
     end
  321. feature {}
  backlog: FAST_ARRAY[INTEGER]
        -- Codes of every character decoded so far, in reading order; `index' points
        -- into this array.

  size: INTEGER
        -- Decoder selector used by `next': 0 = no encoding set, 1 = one byte per
        -- character, 4 = UTF-8, 8 = UTF-16 (see `set_encoding').
  324. feature {RECYCLING_POOL}
  recycle is
        -- Release the URL and empty the backlog so that the object can later be
        -- reused through `connect_to', which expects an empty backlog.
     do
        url := Void
        if backlog /= Void then
           backlog.clear_count
        end
     end
  329. end -- class UNICODE_PARSER_BUFFER
  330. --
  331. -- ------------------------------------------------------------------------------------------------------------
  332. -- Copyright notice below. Please read.
  333. --
  334. -- This file is part of the SmartEiffel standard library.
  335. -- Copyright(C) 1994-2002: INRIA - LORIA (INRIA Lorraine) - ESIAL U.H.P. - University of Nancy 1 - FRANCE
  336. -- Copyright(C) 2003-2006: INRIA - LORIA (INRIA Lorraine) - I.U.T. Charlemagne - University of Nancy 2 - FRANCE
  337. --
  338. -- Authors: Dominique COLNET, Philippe RIBET, Cyril ADRIAN, Vincent CROIZIER, Frederic MERIZEN
  339. --
  340. -- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
  341. -- documentation files (the "Software"), to deal in the Software without restriction, including without
  342. -- limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
  343. -- the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
  344. -- conditions:
  345. --
  346. -- The above copyright notice and this permission notice shall be included in all copies or substantial
  347. -- portions of the Software.
  348. --
  349. -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
  350. -- LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
  351. -- EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  352. -- AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
  353. -- OR OTHER DEALINGS IN THE SOFTWARE.
  354. --
  355. -- http://SmartEiffel.loria.fr - SmartEiffel@loria.fr
  356. -- ------------------------------------------------------------------------------------------------------------