PageRenderTime 36ms CodeModel.GetById 18ms app.highlight 14ms RepoModel.GetById 1ms app.codeStats 0ms

/core/io/encodings/utf8/utf8.factor

http://github.com/abeaumont/factor
Unknown | 111 lines | 88 code | 23 blank | 0 comment | 0 complexity | 497b1cb8be413899eee19e00a874cf79 MD5 | raw file
  1! Copyright (C) 2006, 2008 Daniel Ehrenberg.
  2! See http://factorcode.org/license.txt for BSD license.
  3USING: accessors byte-arrays math math.order kernel sequences
  4sbufs vectors growable io continuations namespaces io.encodings
  5combinators strings ;
  6IN: io.encodings.utf8
  7
  8! Decoding UTF-8
  9
 10SINGLETON: utf8
 11
 12<PRIVATE 
 13
 14: starts-2? ( char -- ? )
 15    dup [ -6 shift 0b10 number= ] when ; inline
 16
 17: append-nums ( stream byte -- stream char )
 18    over stream-read1 dup starts-2?
 19    [ [ 6 shift ] dip 0b111111 bitand bitor ]
 20    [ 2drop replacement-char ] if ; inline
 21
 22: minimum-code-point ( char minimum -- char )
 23    over > [ drop replacement-char ] when ; inline
 24
 25: maximum-code-point ( char maximum -- char )
 26    over < [ drop replacement-char ] when ; inline
 27
 28: double ( stream byte -- stream char )
 29    0b11111 bitand append-nums
 30    0x80 minimum-code-point ; inline
 31
 32: triple ( stream byte -- stream char )
 33    0b1111 bitand append-nums append-nums
 34    0x800 minimum-code-point ; inline
 35
 36: quadruple ( stream byte -- stream char )
 37    0b111 bitand append-nums append-nums append-nums
 38    0x10000 minimum-code-point
 39    0x10FFFF maximum-code-point ; inline
 40
 41: begin-utf8 ( stream byte -- stream char )
 42    dup 127 > [
 43        {
 44            { [ dup -5 shift 0b110 = ] [ double ] }
 45            { [ dup -4 shift 0b1110 = ] [ triple ] }
 46            { [ dup -3 shift 0b11110 = ] [ quadruple ] }
 47            [ drop replacement-char ]
 48        } cond
 49    ] when ; inline
 50
 51: decode-utf8 ( stream -- char/f )
 52    dup stream-read1 dup [ begin-utf8 ] when nip ; inline
 53
 54M: utf8 decode-char
 55    drop decode-utf8 ; inline
 56
 57! Encoding UTF-8
 58
 59: encoded ( stream char -- )
 60    0b111111 bitand 0b10000000 bitor swap stream-write1 ; inline
 61
 62: char>utf8 ( char stream -- )
 63    over 127 <= [ stream-write1 ] [
 64        swap {
 65            { [ dup -11 shift zero? ] [
 66                2dup -6 shift 0b11000000 bitor swap stream-write1
 67                encoded
 68            ] }
 69            { [ dup -16 shift zero? ] [
 70                2dup -12 shift 0b11100000 bitor swap stream-write1
 71                2dup -6 shift encoded
 72                encoded
 73            ] }
 74            [
 75                2dup -18 shift 0b11110000 bitor swap stream-write1
 76                2dup -12 shift encoded
 77                2dup -6 shift encoded
 78                encoded
 79            ]
 80        } cond
 81    ] if ; inline
 82
 83M: utf8 encode-char
 84    drop char>utf8 ;
 85
 86M: utf8 encode-string
 87    drop
 88    over aux>>
 89    [ [ char>utf8 ] curry each ]
 90    [ [ >byte-array ] dip stream-write ] if ;
 91
 92PRIVATE>
 93
 94: code-point-length ( n -- x )
 95    [ 1 ] [
 96        log2 {
 97            { [ dup 0 6 between? ] [ 1 ] }
 98            { [ dup 7 10 between? ] [ 2 ] }
 99            { [ dup 11 15 between? ] [ 3 ] }
100            { [ dup 16 20 between? ] [ 4 ] }
101        } cond nip
102    ] if-zero ;
103
104: code-point-offsets ( string -- indices )
105    0 [ code-point-length + ] accumulate swap suffix ;
106
107: utf8-index> ( n string -- n' )
108    code-point-offsets [ <= ] with find drop ;
109
110: >utf8-index ( n string -- n' )
111    code-point-offsets nth ;