PageRenderTime 63ms CodeModel.GetById 19ms app.highlight 39ms RepoModel.GetById 2ms app.codeStats 0ms

/utf8/core.h

https://code.google.com/p/phonetisaurus/
C++ Header | 358 lines | 269 code | 46 blank | 43 comment | 84 complexity | b73d30e88b9db3eab9732728cadabd76 MD5 | raw file
  1// Copyright 2006 Nemanja Trifunovic
  2
  3/*
  4Permission is hereby granted, free of charge, to any person or organization
  5obtaining a copy of the software and accompanying documentation covered by
  6this license (the "Software") to use, reproduce, display, distribute,
  7execute, and transmit the Software, and to prepare derivative works of the
  8Software, and to permit third-parties to whom the Software is furnished to
  9do so, all subject to the following:
 10
 11The copyright notices in the Software and this entire statement, including
 12the above license grant, this restriction and the following disclaimer,
 13must be included in all copies of the Software, in whole or in part, and
 14all derivative works of the Software, unless such copies or derivative
 15works are solely in the form of machine-executable object code generated by
 16a source language processor.
 17
 18THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24DEALINGS IN THE SOFTWARE.
 25*/
 26
 27
 28#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30
 31#include <iterator>
 32
 33namespace utf8
 34{
    // Unsigned integer aliases used throughout the library for octets (8-bit),
    // UTF-16 code units (16-bit) and code points (32-bit).
    // The underlying types chosen here assume an 8-bit char, a 16-bit short and
    // a 32-bit int; you may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint,
    // but they are declared inside namespace utf8.
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;
 41
 42// Helper code - not intended to be directly called by the library users. May be changed at any time
 43namespace internal
 44{
 45    // Unicode constants
 46    // Leading (high) surrogates: 0xd800 - 0xdbff
 47    // Trailing (low) surrogates: 0xdc00 - 0xdfff
 48    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
 49    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
 50    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
 51    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
 52    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
 53    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
 54
 55    // Maximum valid value for a Unicode code point
 56    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
 57
 58    template<typename octet_type>
 59    inline uint8_t mask8(octet_type oc)
 60    {
 61        return static_cast<uint8_t>(0xff & oc);
 62    }
 63    template<typename u16_type>
 64    inline uint16_t mask16(u16_type oc)
 65    {
 66        return static_cast<uint16_t>(0xffff & oc);
 67    }
 68    template<typename octet_type>
 69    inline bool is_trail(octet_type oc)
 70    {
 71        return ((mask8(oc) >> 6) == 0x2);
 72    }
 73
 74    template <typename u16>
 75    inline bool is_lead_surrogate(u16 cp)
 76    {
 77        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
 78    }
 79
 80    template <typename u16>
 81    inline bool is_trail_surrogate(u16 cp)
 82    {
 83        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 84    }
 85
 86    template <typename u16>
 87    inline bool is_surrogate(u16 cp)
 88    {
 89        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 90    }
 91
 92    template <typename u32>
 93    inline bool is_code_point_valid(u32 cp)
 94    {
 95        return (cp <= CODE_POINT_MAX && !is_surrogate(cp));
 96    }
 97
 98    template <typename octet_iterator>
 99    inline typename std::iterator_traits<octet_iterator>::difference_type
100    sequence_length(octet_iterator lead_it)
101    {
102        uint8_t lead = mask8(*lead_it);
103        if (lead < 0x80)
104            return 1;
105        else if ((lead >> 5) == 0x6)
106            return 2;
107        else if ((lead >> 4) == 0xe)
108            return 3;
109        else if ((lead >> 3) == 0x1e)
110            return 4;
111        else
112            return 0;
113    }
114
115    template <typename octet_difference_type>
116    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
117    {
118        if (cp < 0x80) {
119            if (length != 1) 
120                return true;
121        }
122        else if (cp < 0x800) {
123            if (length != 2) 
124                return true;
125        }
126        else if (cp < 0x10000) {
127            if (length != 3) 
128                return true;
129        }
130
131        return false;
132    }
133
134    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
135
136    /// get_sequence_x functions decode utf-8 sequences of the length x
137
138    template <typename octet_iterator>
139    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
140    {
141        if (it != end) {
142            if (code_point)
143                *code_point = mask8(*it);
144            return UTF8_OK;
145        }
146        return NOT_ENOUGH_ROOM;
147    }
148
149    template <typename octet_iterator>
150    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
151    {
152        utf_error ret_code = NOT_ENOUGH_ROOM;
153
154        if (it != end) {
155            uint32_t cp = mask8(*it);
156            if (++it != end) {
157                if (is_trail(*it)) {
158                    cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
159
160                    if (code_point)
161                        *code_point = cp;
162                    ret_code = UTF8_OK;
163                }
164                else
165                    ret_code = INCOMPLETE_SEQUENCE;
166            }
167            else
168                ret_code = NOT_ENOUGH_ROOM;
169        }
170
171        return ret_code;
172    }
173
174    template <typename octet_iterator>
175    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
176    {
177        utf_error ret_code = NOT_ENOUGH_ROOM;
178
179        if (it != end) {
180            uint32_t cp = mask8(*it);
181            if (++it != end) {
182                if (is_trail(*it)) {
183                    cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
184                    if (++it != end) {
185                        if (is_trail(*it)) {
186                            cp += (*it) & 0x3f;
187
188                            if (code_point)
189                                *code_point = cp;
190                            ret_code = UTF8_OK;
191                        }
192                        else 
193                            ret_code = INCOMPLETE_SEQUENCE;
194                    }
195                    else
196                        ret_code = NOT_ENOUGH_ROOM;
197                }
198                else
199                    ret_code = INCOMPLETE_SEQUENCE;
200            }
201            else
202                ret_code = NOT_ENOUGH_ROOM;
203        }
204
205        return ret_code;
206    }
207
208    template <typename octet_iterator>
209    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
210    {
211        utf_error ret_code = NOT_ENOUGH_ROOM;
212
213        if (it != end) {
214            uint32_t cp = mask8(*it);
215            if (++it != end) {
216                if (is_trail(*it)) {
217                    cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
218                    if (++it != end) {
219                        if (is_trail(*it)) {
220                            cp += (mask8(*it) << 6) & 0xfff;
221                            if (++it != end) {
222                                if (is_trail(*it)) {
223                                    cp += (*it) & 0x3f;
224
225                                    if (code_point)
226                                        *code_point = cp;
227                                    ret_code = UTF8_OK;
228                                }
229                                else
230                                    ret_code = INCOMPLETE_SEQUENCE;
231                            }
232                            else
233                                ret_code = NOT_ENOUGH_ROOM;
234                        }
235                        else
236                            ret_code = INCOMPLETE_SEQUENCE;
237                    }
238                    else
239                        ret_code = NOT_ENOUGH_ROOM;
240                }
241                else 
242                    ret_code = INCOMPLETE_SEQUENCE;
243            }
244            else
245                ret_code = NOT_ENOUGH_ROOM;
246        }
247
248        return ret_code;
249    }
250
251    template <typename octet_iterator>
252    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
253    {
254        // Save the original value of it so we can go back in case of failure
255        // Of course, it does not make much sense with i.e. stream iterators
256        octet_iterator original_it = it;
257
258        uint32_t cp = 0;
259        // Determine the sequence length based on the lead octet
260        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
261        octet_difference_type length = sequence_length(it);
262        if (length == 0)
263            return INVALID_LEAD;
264
265        // Now that we have a valid sequence length, get trail octets and calculate the code point
266        utf_error err = UTF8_OK;
267        switch (length) {
268            case 1:
269                err = get_sequence_1(it, end, &cp);
270                break;
271            case 2:
272                err = get_sequence_2(it, end, &cp);
273            break;
274            case 3:
275                err = get_sequence_3(it, end, &cp);
276            break;
277            case 4:
278                err = get_sequence_4(it, end, &cp);
279            break;
280        }
281
282        if (err == UTF8_OK) {
283            // Decoding succeeded. Now, security checks...
284            if (is_code_point_valid(cp)) {
285                if (!is_overlong_sequence(cp, length)){
286                    // Passed! Return here.
287                    if (code_point)
288                        *code_point = cp;
289                    ++it;
290                    return UTF8_OK;
291                }
292                else
293                    err = OVERLONG_SEQUENCE;
294            }
295            else 
296                err = INVALID_CODE_POINT;
297        }
298
299        // Failure branch - restore the original value of the iterator
300        it = original_it;
301        return err;
302    }
303
304    template <typename octet_iterator>
305    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
306        return validate_next(it, end, 0);
307    }
308
309} // namespace internal
310
311    /// The library API - functions intended to be called by the users
312
    // Byte order mark: the three octets that starts_with_bom() and is_bom()
    // compare against, in order. They are the UTF-8 encoding of U+FEFF.
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};
315
316    template <typename octet_iterator>
317    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
318    {
319        octet_iterator result = start;
320        while (result != end) {
321            internal::utf_error err_code = internal::validate_next(result, end);
322            if (err_code != internal::UTF8_OK)
323                return result;
324        }
325        return result;
326    }
327
328    template <typename octet_iterator>
329    inline bool is_valid(octet_iterator start, octet_iterator end)
330    {
331        return (find_invalid(start, end) == end);
332    }
333
334    template <typename octet_iterator>
335    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
336    {
337        return (
338            ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
339            ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
340            ((it != end) && (internal::mask8(*it))   == bom[2])
341           );
342    }
343	
344	//Deprecated in release 2.3 
345    template <typename octet_iterator>
346    inline bool is_bom (octet_iterator it)
347    {
348        return (
349            (internal::mask8(*it++)) == bom[0] &&
350            (internal::mask8(*it++)) == bom[1] &&
351            (internal::mask8(*it))   == bom[2]
352           );
353    }
354} // namespace utf8
355
356#endif // header guard
357
358