/core/io/encodings/utf8/utf8.factor

http://github.com/abeaumont/factor · Factor · 111 lines · 84 code · 23 blank · 4 comment · 16 complexity · 497b1cb8be413899eee19e00a874cf79 MD5 · raw file

  1. ! Copyright (C) 2006, 2008 Daniel Ehrenberg.
  2. ! See http://factorcode.org/license.txt for BSD license.
  3. USING: accessors byte-arrays math math.order kernel sequences
  4. sbufs vectors growable io continuations namespaces io.encodings
  5. combinators strings ;
  6. IN: io.encodings.utf8
  7. ! Decoding UTF-8
  8. SINGLETON: utf8
  9. <PRIVATE
  10. : starts-2? ( char -- ? )
  11. dup [ -6 shift 0b10 number= ] when ; inline
  12. : append-nums ( stream byte -- stream char )
  13. over stream-read1 dup starts-2?
  14. [ [ 6 shift ] dip 0b111111 bitand bitor ]
  15. [ 2drop replacement-char ] if ; inline
  16. : minimum-code-point ( char minimum -- char )
  17. over > [ drop replacement-char ] when ; inline
  18. : maximum-code-point ( char maximum -- char )
  19. over < [ drop replacement-char ] when ; inline
  20. : double ( stream byte -- stream char )
  21. 0b11111 bitand append-nums
  22. 0x80 minimum-code-point ; inline
  23. : triple ( stream byte -- stream char )
  24. 0b1111 bitand append-nums append-nums
  25. 0x800 minimum-code-point ; inline
  26. : quadruple ( stream byte -- stream char )
  27. 0b111 bitand append-nums append-nums append-nums
  28. 0x10000 minimum-code-point
  29. 0x10FFFF maximum-code-point ; inline
  30. : begin-utf8 ( stream byte -- stream char )
  31. dup 127 > [
  32. {
  33. { [ dup -5 shift 0b110 = ] [ double ] }
  34. { [ dup -4 shift 0b1110 = ] [ triple ] }
  35. { [ dup -3 shift 0b11110 = ] [ quadruple ] }
  36. [ drop replacement-char ]
  37. } cond
  38. ] when ; inline
  39. : decode-utf8 ( stream -- char/f )
  40. dup stream-read1 dup [ begin-utf8 ] when nip ; inline
  41. M: utf8 decode-char
  42. drop decode-utf8 ; inline
  43. ! Encoding UTF-8
  44. : encoded ( stream char -- )
  45. 0b111111 bitand 0b10000000 bitor swap stream-write1 ; inline
  46. : char>utf8 ( char stream -- )
  47. over 127 <= [ stream-write1 ] [
  48. swap {
  49. { [ dup -11 shift zero? ] [
  50. 2dup -6 shift 0b11000000 bitor swap stream-write1
  51. encoded
  52. ] }
  53. { [ dup -16 shift zero? ] [
  54. 2dup -12 shift 0b11100000 bitor swap stream-write1
  55. 2dup -6 shift encoded
  56. encoded
  57. ] }
  58. [
  59. 2dup -18 shift 0b11110000 bitor swap stream-write1
  60. 2dup -12 shift encoded
  61. 2dup -6 shift encoded
  62. encoded
  63. ]
  64. } cond
  65. ] if ; inline
  66. M: utf8 encode-char
  67. drop char>utf8 ;
  68. M: utf8 encode-string
  69. drop
  70. over aux>>
  71. [ [ char>utf8 ] curry each ]
  72. [ [ >byte-array ] dip stream-write ] if ;
  73. PRIVATE>
  74. : code-point-length ( n -- x )
  75. [ 1 ] [
  76. log2 {
  77. { [ dup 0 6 between? ] [ 1 ] }
  78. { [ dup 7 10 between? ] [ 2 ] }
  79. { [ dup 11 15 between? ] [ 3 ] }
  80. { [ dup 16 20 between? ] [ 4 ] }
  81. } cond nip
  82. ] if-zero ;
  83. : code-point-offsets ( string -- indices )
  84. 0 [ code-point-length + ] accumulate swap suffix ;
  85. : utf8-index> ( n string -- n' )
  86. code-point-offsets [ <= ] with find drop ;
  87. : >utf8-index ( n string -- n' )
  88. code-point-offsets nth ;