// PageRenderTime 81ms CodeModel.GetById 9ms app.highlight 66ms RepoModel.GetById 1ms app.codeStats 0ms
//
// /Src/Dependencies/Boost/boost/spirit/home/support/char_encoding/unicode/create_tables.cpp
//
// http://hadesmem.googlecode.com/
// C++ | 583 lines | 491 code | 68 blank | 24 comment | 27 complexity | 27698ec7a3dc184f72b17c8bdd967ac6 MD5 | raw file
/*=============================================================================
    Copyright (c) 2001-2011 Joel de Guzman

    Distributed under the Boost Software License, Version 1.0. (See accompanying
    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
=============================================================================*/
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/unordered_map.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/assert.hpp>
#include <boost/cstdint.hpp>
#include <boost/foreach.hpp>
#include <boost/array.hpp>
#include <boost/scoped_array.hpp>
#include <boost/range/iterator_range.hpp>

#include <algorithm>
#include <cstddef>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
 24
 25// We place the data here. Each line comprises various fields
 26typedef std::vector<std::string> ucd_line;
 27typedef std::vector<ucd_line> ucd_vector;
 28typedef std::vector<ucd_line>::iterator ucd_iterator;
 29
 30// spirit and phoenix using declarations
 31using boost::spirit::qi::parse;
 32using boost::spirit::qi::hex;
 33using boost::spirit::qi::char_;
 34using boost::spirit::qi::eol;
 35using boost::spirit::qi::rule;
 36using boost::spirit::qi::omit;
 37using boost::spirit::qi::_1;
 38using boost::spirit::qi::_val;
 39using boost::phoenix::push_back;
 40using boost::phoenix::ref;
 41
 42// basic unsigned types
 43using boost::uint8_t;
 44using boost::uint16_t;
 45using boost::uint32_t;
 46
 47// a char range
 48struct ucd_range
 49{
 50    ucd_range(uint32_t start, uint32_t finish)
 51        : start(start), finish(finish) {}
 52            
 53    // we need this so we can use ucd_range as a multimap key 
 54    friend bool operator<(ucd_range const& a, ucd_range const& b)
 55    {
 56        return a.start < b.start;
 57    }
 58            
 59    uint32_t start;
 60    uint32_t finish;
 61};
 62
 63class ucd_info
 64{
 65public:
 66
 67    ucd_info(char const* filename)
 68    {
 69        std::ifstream in(filename, std::ios_base::in);
 70        if (!in)
 71        {
 72            std::cerr << "Error: Could not open input file: "
 73                << filename << std::endl;
 74        }
 75        else
 76        {
 77            std::string data;               // We will read the contents here.
 78            in.unsetf(std::ios::skipws);    // No white space skipping!
 79            std::copy(
 80                std::istream_iterator<char>(in),
 81                std::istream_iterator<char>(),
 82                std::back_inserter(data));
 83
 84            typedef std::string::const_iterator iterator_type;
 85            iterator_type f = data.begin();
 86            iterator_type l = data.end();
 87
 88            rule<iterator_type> endl = -('#' >> *(char_-eol)) >> eol;
 89            rule<iterator_type, std::string()> field = *(char_-(';'|endl)) >> (';'|&endl);
 90            rule<iterator_type, ucd_line()> line = +(field-endl) >> endl;
 91            rule<iterator_type, std::vector<ucd_line>()> file = +(endl | line[push_back(_val, _1)]);
 92            
 93            parse(f, l, file, info);
 94        }
 95    }
 96    
 97    template <typename Array>
 98    void collect(Array& data, int field, bool collect_properties = true) const
 99    {
100        BOOST_ASSERT(!info.empty());
101        ucd_vector::const_iterator current = info.begin();
102        ucd_vector::const_iterator end = info.end();
103        
104        while (current != end)
105        {
106            std::string range = (*current)[0];
107            boost::trim(range);
108            
109            std::string::const_iterator f = range.begin();
110            std::string::const_iterator l = range.end();
111
112            // get the code-point range
113            uint32_t start;
114            uint32_t finish;
115            parse(f, l, hex[ref(start) = ref(finish) = _1] >> -(".." >> hex[ref(finish) = _1]));
116            
117            // special case for UnicodeData.txt ranges:
118            if ((*current)[1].find("First>") != std::string::npos)
119            {
120                ++current;
121                BOOST_ASSERT(current != end);
122                BOOST_ASSERT((*current)[1].find("Last>") != std::string::npos);
123                                
124                std::string range = (*current)[0];
125                boost::trim(range);
126                f = range.begin();
127                l = range.end();
128
129                parse(f, l, hex[ref(finish) = _1]);
130            }
131            
132            std::string code;
133            if (field < int(current->size()))
134                code = (*current)[field];
135            boost::trim(code);
136            // Only collect properties we are interested in
137            if (collect_properties) // code for properties
138            {
139                if (!ignore_property(code)) 
140                {
141                    for (uint32_t i = start; i <= finish; ++i)
142                        data[i] |= map_property(code);
143                }
144            }
145            else // code for actual numeric values
146            {
147                for (uint32_t i = start; i <= finish; ++i)
148                {
149                    if (code.empty())
150                    {
151                        data[i] = 0; // signal that this code maps to itself
152                    }
153                    else
154                    {
155                        f = code.begin();
156                        l = code.end();
157                        parse(f, l, hex, data[i]);
158                    }
159                }
160            }
161            ++current;
162        }
163    }
164    
165private:
166
167    static bool ignore_property(std::string const& p)
168    {
169        // We don't handle all properties
170        std::map<std::string, int>& pm = get_property_map();
171        std::map<std::string, int>::iterator i = pm.find(p);
172        return i == pm.end();        
173    }
174
175    static int
176    map_property(std::string const& p)
177    {
178        std::map<std::string, int>& pm = get_property_map();
179        std::map<std::string, int>::iterator i = pm.find(p);
180        BOOST_ASSERT(i != pm.end());
181        return i->second;
182    }
183
184    static std::map<std::string, int>& 
185    get_property_map()
186    {
187        // The properties we are interested in:
188        static std::map<std::string, int> map;
189        if (map.empty())
190        {
191            // General_Category
192            map["Lu"] = 0;
193            map["Ll"] = 1;
194            map["Lt"] = 2;
195            map["Lm"] = 3;
196            map["Lo"] = 4;
197            
198            map["Mn"] = 8;
199            map["Me"] = 9;
200            map["Mc"] = 10;
201            
202            map["Nd"] = 16;
203            map["Nl"] = 17;
204            map["No"] = 18;
205            
206            map["Zs"] = 24;
207            map["Zl"] = 25;
208            map["Zp"] = 26;
209            
210            map["Cc"] = 32;
211            map["Cf"] = 33;
212            map["Co"] = 34;
213            map["Cs"] = 35;
214            map["Cn"] = 36;
215            
216            map["Pd"] = 40;
217            map["Ps"] = 41;
218            map["Pe"] = 42;
219            map["Pc"] = 43;
220            map["Po"] = 44;
221            map["Pi"] = 45;
222            map["Pf"] = 46;
223            
224            map["Sm"] = 48;
225            map["Sc"] = 49;
226            map["Sk"] = 50;
227            map["So"] = 51;
228            
229            // Derived Properties.
230            map["Alphabetic"] = 64;
231            map["Uppercase"] = 128;
232            map["Lowercase"] = 256;
233            map["White_Space"] = 512;
234            map["Hex_Digit"] = 1024;
235            map["Noncharacter_Code_Point"] = 2048;
236            map["Default_Ignorable_Code_Point"] = 4096;
237
238            // Script
239            map["Arabic"] = 0;
240            map["Imperial_Aramaic"] = 1;
241            map["Armenian"] = 2;
242            map["Avestan"] = 3;
243            map["Balinese"] = 4;
244            map["Bamum"] = 5;
245            map["Bengali"] = 6;
246            map["Bopomofo"] = 7;
247            map["Braille"] = 8;
248            map["Buginese"] = 9;
249            map["Buhid"] = 10;
250            map["Canadian_Aboriginal"] = 11;
251            map["Carian"] = 12;
252            map["Cham"] = 13;
253            map["Cherokee"] = 14;
254            map["Coptic"] = 15;
255            map["Cypriot"] = 16;
256            map["Cyrillic"] = 17;
257            map["Devanagari"] = 18;
258            map["Deseret"] = 19;
259            map["Egyptian_Hieroglyphs"] = 20;
260            map["Ethiopic"] = 21;
261            map["Georgian"] = 22;
262            map["Glagolitic"] = 23;
263            map["Gothic"] = 24;
264            map["Greek"] = 25;
265            map["Gujarati"] = 26;
266            map["Gurmukhi"] = 27;
267            map["Hangul"] = 28;
268            map["Han"] = 29;
269            map["Hanunoo"] = 30;
270            map["Hebrew"] = 31;
271            map["Hiragana"] = 32;
272            map["Katakana_Or_Hiragana"] = 33;
273            map["Old_Italic"] = 34;
274            map["Javanese"] = 35;
275            map["Kayah_Li"] = 36;
276            map["Katakana"] = 37;
277            map["Kharoshthi"] = 38;
278            map["Khmer"] = 39;
279            map["Kannada"] = 40;
280            map["Kaithi"] = 41;
281            map["Tai_Tham"] = 42;
282            map["Lao"] = 43;
283            map["Latin"] = 44;
284            map["Lepcha"] = 45;
285            map["Limbu"] = 46;
286            map["Linear_B"] = 47;
287            map["Lisu"] = 48;
288            map["Lycian"] = 49;
289            map["Lydian"] = 50;
290            map["Malayalam"] = 51;
291            map["Mongolian"] = 52;
292            map["Meetei_Mayek"] = 53;
293            map["Myanmar"] = 54;
294            map["Nko"] = 55;
295            map["Ogham"] = 56;
296            map["Ol_Chiki"] = 57;
297            map["Old_Turkic"] = 58;
298            map["Oriya"] = 59;
299            map["Osmanya"] = 60;
300            map["Phags_Pa"] = 61;
301            map["Inscriptional_Pahlavi"] = 62;
302            map["Phoenician"] = 63;
303            map["Inscriptional_Parthian"] = 64;
304            map["Rejang"] = 65;
305            map["Runic"] = 66;
306            map["Samaritan"] = 67;
307            map["Old_South_Arabian"] = 68;
308            map["Saurashtra"] = 69;
309            map["Shavian"] = 70;
310            map["Sinhala"] = 71;
311            map["Sundanese"] = 72;
312            map["Syloti_Nagri"] = 73;
313            map["Syriac"] = 74;
314            map["Tagbanwa"] = 75;
315            map["Tai_Le"] = 76;
316            map["New_Tai_Lue"] = 77;
317            map["Tamil"] = 78;
318            map["Tai_Viet"] = 79;
319            map["Telugu"] = 80;
320            map["Tifinagh"] = 81;
321            map["Tagalog"] = 82;
322            map["Thaana"] = 83;
323            map["Thai"] = 84;
324            map["Tibetan"] = 85;
325            map["Ugaritic"] = 86;
326            map["Vai"] = 87;
327            map["Old_Persian"] = 88;
328            map["Cuneiform"] = 89;
329            map["Yi"] = 90;
330            map["Inherited"] = 91;
331            map["Common"] = 92;
332            map["Unknown"] = 93;
333        }
334        return map;
335    }
336
337    ucd_vector info;
338};
339
340template <typename T, uint32_t block_size_ = 256>
341class ucd_table_builder
342{
343public:
344
345    static uint32_t const block_size = block_size_;
346    static uint32_t const full_span = 0x110000;
347    typedef T value_type;
348
349    ucd_table_builder() : p(new T[full_span])
350    {
351        for (uint32_t i = 0; i < full_span; ++i)
352            p[i] = 0;
353    }
354    
355    void collect(char const* filename, int field, bool collect_properties = true)
356    {
357        std::cout << "collecting " << filename << std::endl;
358        ucd_info info(filename);
359        info.collect(p, field, collect_properties);
360    }
361    
362    void build(std::vector<uint8_t>& stage1, std::vector<T const*>& stage2)
363    {        
364        std::cout << "building tables" << std::endl;
365        std::map<block_ptr, std::vector<T const*> > blocks;
366        for (T const* i = p.get(); i < (p.get() + full_span); i += block_size)
367            blocks[block_ptr(i)].push_back(i);
368        
369        // Not enough bits to store the block indices.
370        BOOST_ASSERT(blocks.size() < (1 << (sizeof(uint8_t) * 8)));
371        
372        typedef std::pair<block_ptr, std::vector<T const*> > blocks_value_type;
373        std::map<T const*, std::vector<T const*> > sorted_blocks;
374        BOOST_FOREACH(blocks_value_type const& val, blocks)
375        {
376            sorted_blocks[val.first.p] = val.second;
377        }
378
379        stage1.clear();
380        stage1.reserve(full_span / block_size);
381        stage1.resize(full_span / block_size);
382        stage2.clear();
383        stage2.reserve(blocks.size());
384
385        typedef std::pair<T const*, std::vector<T const*> > sorted_blocks_value_type;
386        BOOST_FOREACH(sorted_blocks_value_type const& val, sorted_blocks)
387        {
388            stage2.push_back(val.first);
389            BOOST_FOREACH(T const* val2, val.second)
390            {
391                stage1[(val2 - p.get()) / block_size] = stage2.size() - 1;
392            }
393        }
394    }
395        
396private:
397    
398    struct block_ptr
399    {
400        block_ptr(T const* p) : p(p) {}
401
402        friend bool operator<(block_ptr a, block_ptr b)
403        {
404            return std::lexicographical_compare(
405                a.p, a.p + block_size, b.p, b.p + block_size);
406        }
407        
408        T const* p;
409    };
410
411    boost::scoped_array<T> p;
412};
413
// Writes `tab` space characters to `out`. Nothing is written when `tab` is
// zero or negative.
template <typename Out>
void print_tab(Out& out, int tab)
{
    for (int remaining = tab; remaining > 0; --remaining)
        out << ' ';
}
420
421template <typename Out, typename C>
422void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
423{
424    int const tab = 4;
425    C::size_type size = c.size();
426    BOOST_ASSERT(size > 1);
427    print_tab(out, tab);
428    out << std::setw(width) << int(c[0]);
429    for (C::size_type i = 1; i < size; ++i)
430    {
431        out << ", ";
432        if ((i % group) == 0)
433        {
434            out << std::endl;
435            print_tab(out, tab);
436        }
437        out << std::setw(width) << int(c[i]);
438    }
439    
440    if (trailing_comma)
441        out << ", " << std::endl;
442}
443
// Writes the beginning of a generated table header to `out`: the licence
// banner, the <boost/cstdint.hpp> include and the opening of namespace
// boost::spirit::ucd::detail. No line break follows the opening brace.
template <typename Out>
void print_head(Out& out)
{
    out
        << "/*=============================================================================\n"
           "    Copyright (c) 2001-2011 Joel de Guzman\n"
           "\n"
           "    Distributed under the Boost Software License, Version 1.0. (See accompanying\n"
           "    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n"
           "\n"
           "    AUTOGENERATED. DO NOT EDIT!!!\n"
           "==============================================================================*/\n"
           "#include <boost/cstdint.hpp>\n"
           "\n"
           "namespace boost { namespace spirit { namespace ucd { namespace detail\n"
           "{"
        ;
}
462
// Writes the end of a generated table header to `out`: the braces closing
// the four namespaces opened by print_head. The trailing comment names
// those same namespaces (boost::spirit::ucd::detail).
template <typename Out>
void print_tail(Out& out)
{
    out
        << "\n"
        << "}}}} // namespace boost::spirit::ucd::detail\n"
        ;
}
471
// Returns the name of the Boost unsigned integer type that is `size` bytes
// wide (1, 2, 4 or 8). Any other size is a programming error: it trips
// BOOST_ASSERT and, if assertions are disabled, returns a null pointer.
char const* get_int_type_name(int size)
{
    switch (size)
    {
        case 1: return "::boost::uint8_t";
        case 2: return "::boost::uint16_t";
        case 4: return "::boost::uint32_t";
        case 8: return "::boost::uint64_t"; // a 64-bit integer is 8 bytes
        default: BOOST_ASSERT(false); return 0; // invalid size
    }
}
483
484template <typename Out, typename Builder>
485void print_file(Out& out, Builder& builder, int field_width, char const* name)
486{
487    std::cout << "Generating " << name << " tables" << std::endl;
488
489    uint32_t const block_size = Builder::block_size;
490    typedef typename Builder::value_type value_type;
491    print_head(out);
492    
493    std::vector<uint8_t> stage1;
494    std::vector<value_type const*> stage2;
495    builder.build(stage1, stage2);
496    std::cout << "Block Size: " << block_size << std::endl;
497    std::cout << "Total Bytes: " 
498        << stage1.size()+(stage2.size()*block_size*sizeof(value_type)) 
499        << std::endl;
500
501    out
502        << "\n"
503        << "    static const ::boost::uint8_t " << name << "_stage1[] = {\n"
504        << "\n"
505        ;
506    
507    print_table(out, stage1, false, 3);
508    char const* int_name = get_int_type_name(sizeof(value_type));
509
510    out 
511        << "\n"
512        << "    };"
513        << "\n"
514        << "\n"
515        << "    static const " << int_name << ' ' << name << "_stage2[] = {"
516        ;
517
518    int block_n = 0;
519    for (int i = 0; i < int(stage2.size()); ++i)
520    {
521        value_type const* p = stage2[i];
522        bool last = (i+1 == stage2.size());
523        out << "\n\n    // block " << block_n++ << std::endl;
524        print_table(out, 
525            boost::iterator_range<value_type const*>(p, p+block_size), !last, field_width);
526    }
527
528    out 
529        << "\n"
530        << "    };"
531        << "\n"
532        ;
533    
534    out 
535        << "\n"
536        << "    inline " << int_name << ' ' << name << "_lookup(::boost::uint32_t ch)\n"
537        << "    {\n"
538        << "        ::boost::uint32_t block_offset = " << name << "_stage1[ch / " << block_size << "] * " << block_size << ";\n"
539        << "        return " << name << "_stage2[block_offset + ch % " << block_size << "];\n"
540        << "    }\n"
541        ;
542    
543    print_tail(out);
544}
545
546int main()
547{
548    // The category tables
549    {
550        std::ofstream out("category_table.hpp");
551        ucd_table_builder<uint16_t, 256> builder;
552        builder.collect("UnicodeData.txt", 2);
553        builder.collect("DerivedCoreProperties.txt", 1);
554        builder.collect("PropList.txt", 1);
555        print_file(out, builder, 4, "category");
556    }
557    
558    // The script tables
559    {
560        std::ofstream out("script_table.hpp");
561        ucd_table_builder<uint8_t, 256> builder;
562        builder.collect("Scripts.txt", 1);
563        print_file(out, builder, 3, "script");
564    }
565    
566    // The lowercase tables
567    {
568        std::ofstream out("lowercase_table.hpp");
569        ucd_table_builder<uint32_t, 256> builder;
570        builder.collect("UnicodeData.txt", 13, false);
571        print_file(out, builder, 6, "lowercase");
572    }
573    
574    // The uppercase tables
575    {
576        std::ofstream out("uppercase_table.hpp");
577        ucd_table_builder<uint32_t, 256> builder;
578        builder.collect("UnicodeData.txt", 12, false);
579        print_file(out, builder, 6, "uppercase");
580    }
581
582    return 0;
583}