/Src/Dependencies/Boost/boost/spirit/home/support/char_encoding/unicode/create_tables.cpp

http://hadesmem.googlecode.com/ · C++ · 583 lines · 491 code · 68 blank · 24 comment · 27 complexity · 27698ec7a3dc184f72b17c8bdd967ac6 MD5 · raw file

  1. /*=============================================================================
  2. Copyright (c) 2001-2011 Joel de Guzman
  3. Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. =============================================================================*/
  #include <boost/config/warning_disable.hpp>
  #include <boost/spirit/include/qi.hpp>
  #include <boost/spirit/include/phoenix.hpp>
  #include <boost/unordered_map.hpp>
  #include <boost/algorithm/string/trim.hpp>
  #include <boost/assert.hpp>
  #include <boost/cstdint.hpp>
  #include <boost/foreach.hpp>
  #include <boost/array.hpp>
  #include <boost/scoped_array.hpp>
  #include <boost/range/iterator_range.hpp>
  #include <algorithm>
  #include <cassert>
  #include <cstddef>
  #include <fstream>
  #include <iomanip>
  #include <iostream>
  #include <iterator>
  #include <map>
  #include <string>
  #include <vector>
  22. // We place the data here. Each line comprises various fields
  23. typedef std::vector<std::string> ucd_line;
  24. typedef std::vector<ucd_line> ucd_vector;
  25. typedef std::vector<ucd_line>::iterator ucd_iterator;
  26. // spirit and phoenix using declarations
  27. using boost::spirit::qi::parse;
  28. using boost::spirit::qi::hex;
  29. using boost::spirit::qi::char_;
  30. using boost::spirit::qi::eol;
  31. using boost::spirit::qi::rule;
  32. using boost::spirit::qi::omit;
  33. using boost::spirit::qi::_1;
  34. using boost::spirit::qi::_val;
  35. using boost::phoenix::push_back;
  36. using boost::phoenix::ref;
  37. // basic unsigned types
  38. using boost::uint8_t;
  39. using boost::uint16_t;
  40. using boost::uint32_t;
  41. // a char range
  // A range of characters described by its first and last code point.
  struct ucd_range
  {
      ucd_range(uint32_t first, uint32_t last)
        : start(first)
        , finish(last)
      {
      }

      // Strict ordering on the start of the range only (finish is ignored);
      // this is what allows ucd_range to be used as a multimap key.
      friend bool operator<(ucd_range const& lhs, ucd_range const& rhs)
      {
          return rhs.start > lhs.start;
      }

      uint32_t start;
      uint32_t finish;
  };
  54. class ucd_info
  55. {
  56. public:
  57. ucd_info(char const* filename)
  58. {
  59. std::ifstream in(filename, std::ios_base::in);
  60. if (!in)
  61. {
  62. std::cerr << "Error: Could not open input file: "
  63. << filename << std::endl;
  64. }
  65. else
  66. {
  67. std::string data; // We will read the contents here.
  68. in.unsetf(std::ios::skipws); // No white space skipping!
  69. std::copy(
  70. std::istream_iterator<char>(in),
  71. std::istream_iterator<char>(),
  72. std::back_inserter(data));
  73. typedef std::string::const_iterator iterator_type;
  74. iterator_type f = data.begin();
  75. iterator_type l = data.end();
  76. rule<iterator_type> endl = -('#' >> *(char_-eol)) >> eol;
  77. rule<iterator_type, std::string()> field = *(char_-(';'|endl)) >> (';'|&endl);
  78. rule<iterator_type, ucd_line()> line = +(field-endl) >> endl;
  79. rule<iterator_type, std::vector<ucd_line>()> file = +(endl | line[push_back(_val, _1)]);
  80. parse(f, l, file, info);
  81. }
  82. }
  83. template <typename Array>
  84. void collect(Array& data, int field, bool collect_properties = true) const
  85. {
  86. BOOST_ASSERT(!info.empty());
  87. ucd_vector::const_iterator current = info.begin();
  88. ucd_vector::const_iterator end = info.end();
  89. while (current != end)
  90. {
  91. std::string range = (*current)[0];
  92. boost::trim(range);
  93. std::string::const_iterator f = range.begin();
  94. std::string::const_iterator l = range.end();
  95. // get the code-point range
  96. uint32_t start;
  97. uint32_t finish;
  98. parse(f, l, hex[ref(start) = ref(finish) = _1] >> -(".." >> hex[ref(finish) = _1]));
  99. // special case for UnicodeData.txt ranges:
  100. if ((*current)[1].find("First>") != std::string::npos)
  101. {
  102. ++current;
  103. BOOST_ASSERT(current != end);
  104. BOOST_ASSERT((*current)[1].find("Last>") != std::string::npos);
  105. std::string range = (*current)[0];
  106. boost::trim(range);
  107. f = range.begin();
  108. l = range.end();
  109. parse(f, l, hex[ref(finish) = _1]);
  110. }
  111. std::string code;
  112. if (field < int(current->size()))
  113. code = (*current)[field];
  114. boost::trim(code);
  115. // Only collect properties we are interested in
  116. if (collect_properties) // code for properties
  117. {
  118. if (!ignore_property(code))
  119. {
  120. for (uint32_t i = start; i <= finish; ++i)
  121. data[i] |= map_property(code);
  122. }
  123. }
  124. else // code for actual numeric values
  125. {
  126. for (uint32_t i = start; i <= finish; ++i)
  127. {
  128. if (code.empty())
  129. {
  130. data[i] = 0; // signal that this code maps to itself
  131. }
  132. else
  133. {
  134. f = code.begin();
  135. l = code.end();
  136. parse(f, l, hex, data[i]);
  137. }
  138. }
  139. }
  140. ++current;
  141. }
  142. }
  143. private:
  144. static bool ignore_property(std::string const& p)
  145. {
  146. // We don't handle all properties
  147. std::map<std::string, int>& pm = get_property_map();
  148. std::map<std::string, int>::iterator i = pm.find(p);
  149. return i == pm.end();
  150. }
  151. static int
  152. map_property(std::string const& p)
  153. {
  154. std::map<std::string, int>& pm = get_property_map();
  155. std::map<std::string, int>::iterator i = pm.find(p);
  156. BOOST_ASSERT(i != pm.end());
  157. return i->second;
  158. }
  159. static std::map<std::string, int>&
  160. get_property_map()
  161. {
  162. // The properties we are interested in:
  163. static std::map<std::string, int> map;
  164. if (map.empty())
  165. {
  166. // General_Category
  167. map["Lu"] = 0;
  168. map["Ll"] = 1;
  169. map["Lt"] = 2;
  170. map["Lm"] = 3;
  171. map["Lo"] = 4;
  172. map["Mn"] = 8;
  173. map["Me"] = 9;
  174. map["Mc"] = 10;
  175. map["Nd"] = 16;
  176. map["Nl"] = 17;
  177. map["No"] = 18;
  178. map["Zs"] = 24;
  179. map["Zl"] = 25;
  180. map["Zp"] = 26;
  181. map["Cc"] = 32;
  182. map["Cf"] = 33;
  183. map["Co"] = 34;
  184. map["Cs"] = 35;
  185. map["Cn"] = 36;
  186. map["Pd"] = 40;
  187. map["Ps"] = 41;
  188. map["Pe"] = 42;
  189. map["Pc"] = 43;
  190. map["Po"] = 44;
  191. map["Pi"] = 45;
  192. map["Pf"] = 46;
  193. map["Sm"] = 48;
  194. map["Sc"] = 49;
  195. map["Sk"] = 50;
  196. map["So"] = 51;
  197. // Derived Properties.
  198. map["Alphabetic"] = 64;
  199. map["Uppercase"] = 128;
  200. map["Lowercase"] = 256;
  201. map["White_Space"] = 512;
  202. map["Hex_Digit"] = 1024;
  203. map["Noncharacter_Code_Point"] = 2048;
  204. map["Default_Ignorable_Code_Point"] = 4096;
  205. // Script
  206. map["Arabic"] = 0;
  207. map["Imperial_Aramaic"] = 1;
  208. map["Armenian"] = 2;
  209. map["Avestan"] = 3;
  210. map["Balinese"] = 4;
  211. map["Bamum"] = 5;
  212. map["Bengali"] = 6;
  213. map["Bopomofo"] = 7;
  214. map["Braille"] = 8;
  215. map["Buginese"] = 9;
  216. map["Buhid"] = 10;
  217. map["Canadian_Aboriginal"] = 11;
  218. map["Carian"] = 12;
  219. map["Cham"] = 13;
  220. map["Cherokee"] = 14;
  221. map["Coptic"] = 15;
  222. map["Cypriot"] = 16;
  223. map["Cyrillic"] = 17;
  224. map["Devanagari"] = 18;
  225. map["Deseret"] = 19;
  226. map["Egyptian_Hieroglyphs"] = 20;
  227. map["Ethiopic"] = 21;
  228. map["Georgian"] = 22;
  229. map["Glagolitic"] = 23;
  230. map["Gothic"] = 24;
  231. map["Greek"] = 25;
  232. map["Gujarati"] = 26;
  233. map["Gurmukhi"] = 27;
  234. map["Hangul"] = 28;
  235. map["Han"] = 29;
  236. map["Hanunoo"] = 30;
  237. map["Hebrew"] = 31;
  238. map["Hiragana"] = 32;
  239. map["Katakana_Or_Hiragana"] = 33;
  240. map["Old_Italic"] = 34;
  241. map["Javanese"] = 35;
  242. map["Kayah_Li"] = 36;
  243. map["Katakana"] = 37;
  244. map["Kharoshthi"] = 38;
  245. map["Khmer"] = 39;
  246. map["Kannada"] = 40;
  247. map["Kaithi"] = 41;
  248. map["Tai_Tham"] = 42;
  249. map["Lao"] = 43;
  250. map["Latin"] = 44;
  251. map["Lepcha"] = 45;
  252. map["Limbu"] = 46;
  253. map["Linear_B"] = 47;
  254. map["Lisu"] = 48;
  255. map["Lycian"] = 49;
  256. map["Lydian"] = 50;
  257. map["Malayalam"] = 51;
  258. map["Mongolian"] = 52;
  259. map["Meetei_Mayek"] = 53;
  260. map["Myanmar"] = 54;
  261. map["Nko"] = 55;
  262. map["Ogham"] = 56;
  263. map["Ol_Chiki"] = 57;
  264. map["Old_Turkic"] = 58;
  265. map["Oriya"] = 59;
  266. map["Osmanya"] = 60;
  267. map["Phags_Pa"] = 61;
  268. map["Inscriptional_Pahlavi"] = 62;
  269. map["Phoenician"] = 63;
  270. map["Inscriptional_Parthian"] = 64;
  271. map["Rejang"] = 65;
  272. map["Runic"] = 66;
  273. map["Samaritan"] = 67;
  274. map["Old_South_Arabian"] = 68;
  275. map["Saurashtra"] = 69;
  276. map["Shavian"] = 70;
  277. map["Sinhala"] = 71;
  278. map["Sundanese"] = 72;
  279. map["Syloti_Nagri"] = 73;
  280. map["Syriac"] = 74;
  281. map["Tagbanwa"] = 75;
  282. map["Tai_Le"] = 76;
  283. map["New_Tai_Lue"] = 77;
  284. map["Tamil"] = 78;
  285. map["Tai_Viet"] = 79;
  286. map["Telugu"] = 80;
  287. map["Tifinagh"] = 81;
  288. map["Tagalog"] = 82;
  289. map["Thaana"] = 83;
  290. map["Thai"] = 84;
  291. map["Tibetan"] = 85;
  292. map["Ugaritic"] = 86;
  293. map["Vai"] = 87;
  294. map["Old_Persian"] = 88;
  295. map["Cuneiform"] = 89;
  296. map["Yi"] = 90;
  297. map["Inherited"] = 91;
  298. map["Common"] = 92;
  299. map["Unknown"] = 93;
  300. }
  301. return map;
  302. }
  303. ucd_vector info;
  304. };
  305. template <typename T, uint32_t block_size_ = 256>
  306. class ucd_table_builder
  307. {
  308. public:
  309. static uint32_t const block_size = block_size_;
  310. static uint32_t const full_span = 0x110000;
  311. typedef T value_type;
  312. ucd_table_builder() : p(new T[full_span])
  313. {
  314. for (uint32_t i = 0; i < full_span; ++i)
  315. p[i] = 0;
  316. }
  317. void collect(char const* filename, int field, bool collect_properties = true)
  318. {
  319. std::cout << "collecting " << filename << std::endl;
  320. ucd_info info(filename);
  321. info.collect(p, field, collect_properties);
  322. }
  323. void build(std::vector<uint8_t>& stage1, std::vector<T const*>& stage2)
  324. {
  325. std::cout << "building tables" << std::endl;
  326. std::map<block_ptr, std::vector<T const*> > blocks;
  327. for (T const* i = p.get(); i < (p.get() + full_span); i += block_size)
  328. blocks[block_ptr(i)].push_back(i);
  329. // Not enough bits to store the block indices.
  330. BOOST_ASSERT(blocks.size() < (1 << (sizeof(uint8_t) * 8)));
  331. typedef std::pair<block_ptr, std::vector<T const*> > blocks_value_type;
  332. std::map<T const*, std::vector<T const*> > sorted_blocks;
  333. BOOST_FOREACH(blocks_value_type const& val, blocks)
  334. {
  335. sorted_blocks[val.first.p] = val.second;
  336. }
  337. stage1.clear();
  338. stage1.reserve(full_span / block_size);
  339. stage1.resize(full_span / block_size);
  340. stage2.clear();
  341. stage2.reserve(blocks.size());
  342. typedef std::pair<T const*, std::vector<T const*> > sorted_blocks_value_type;
  343. BOOST_FOREACH(sorted_blocks_value_type const& val, sorted_blocks)
  344. {
  345. stage2.push_back(val.first);
  346. BOOST_FOREACH(T const* val2, val.second)
  347. {
  348. stage1[(val2 - p.get()) / block_size] = stage2.size() - 1;
  349. }
  350. }
  351. }
  352. private:
  353. struct block_ptr
  354. {
  355. block_ptr(T const* p) : p(p) {}
  356. friend bool operator<(block_ptr a, block_ptr b)
  357. {
  358. return std::lexicographical_compare(
  359. a.p, a.p + block_size, b.p, b.p + block_size);
  360. }
  361. T const* p;
  362. };
  363. boost::scoped_array<T> p;
  364. };
  // Writes `tab` space characters to `out`; nothing when `tab` <= 0.
  template <typename Out>
  void print_tab(Out& out, int tab)
  {
      for (int i = 0; i < tab; ++i)
          out << ' ';
  }

  // Writes the elements of `c` to `out` as a comma separated list of
  // integers, indented by four spaces.
  //
  // c:              container with size(), operator[] and a size_type;
  //                 an empty container produces no output at all.
  // trailing_comma: when true, ", " and a newline follow the last element.
  // width:          minimum field width of every number.
  // group:          numbers per output line; a value <= 0 disables wrapping.
  template <typename Out, typename C>
  void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
  {
      // C is a template parameter, so its nested type needs 'typename'.
      typedef typename C::size_type size_type;

      int const tab = 4;
      size_type const size = c.size();
      if (size == 0)
          return; // nothing to print, and c[0] would be out of range

      // With per_line == size the wrap test below never fires, which also
      // keeps group == 0 from becoming a modulo by zero.
      size_type const per_line =
          group > 0 ? static_cast<size_type>(group) : size;

      print_tab(out, tab);
      out << std::setw(width) << static_cast<int>(c[0]);
      for (size_type i = 1; i < size; ++i)
      {
          out << ", ";
          if ((i % per_line) == 0)
          {
              // '\n' rather than std::endl: same bytes, no flush per row
              out << '\n';
              print_tab(out, tab);
          }
          out << std::setw(width) << static_cast<int>(c[i]);
      }

      if (trailing_comma)
          out << ", " << '\n';
  }
  // Emits the opening part of a generated table header: the license
  // comment, the <boost/cstdint.hpp> include and the opening of namespace
  // boost::spirit::ucd::detail (left open for print_tail to close).
  template <typename Out>
  void print_head(Out& out)
  {
      // Each entry is streamed with its own insertion, in order.
      static char const* const pieces[] =
      {
          "/*=============================================================================\n",
          " Copyright (c) 2001-2011 Joel de Guzman\n",
          "\n",
          " Distributed under the Boost Software License, Version 1.0. (See accompanying\n",
          " file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n",
          "\n",
          " AUTOGENERATED. DO NOT EDIT!!!\n",
          "==============================================================================*/\n",
          "#include <boost/cstdint.hpp>\n",
          "\n",
          "namespace boost { namespace spirit { namespace ucd { namespace detail\n",
          "{"
      };

      char const* const* const stop = pieces + sizeof(pieces) / sizeof(pieces[0]);
      for (char const* const* piece = pieces; piece != stop; ++piece)
          out << *piece;
  }
  // Emits the closing part of a generated table header: the four braces
  // that close namespace boost::spirit::ucd::detail. The trailing comment
  // names `ucd`, the namespace the generated header actually opens, rather
  // than the `unicode` it used to claim.
  template <typename Out>
  void print_tail(Out& out)
  {
      out
          << "\n"
          << "}}}} // namespace boost::spirit::ucd::detail\n"
          ;
  }
  // Returns the spelling of the Boost fixed-width unsigned integer type
  // that is `size` bytes wide (1, 2, 4 or 8).
  //
  // Any other size is a programming error: it asserts, and returns a null
  // pointer when assertions are compiled out.
  char const* get_int_type_name(int size)
  {
      switch (size)
      {
          case 1: return "::boost::uint8_t";
          case 2: return "::boost::uint16_t";
          case 4: return "::boost::uint32_t";
          case 8: return "::boost::uint64_t"; // 64 bits are 8 bytes, not 5
          default: assert(!"invalid integer size"); return 0;
      }
  }
  429. template <typename Out, typename Builder>
  430. void print_file(Out& out, Builder& builder, int field_width, char const* name)
  431. {
  432. std::cout << "Generating " << name << " tables" << std::endl;
  433. uint32_t const block_size = Builder::block_size;
  434. typedef typename Builder::value_type value_type;
  435. print_head(out);
  436. std::vector<uint8_t> stage1;
  437. std::vector<value_type const*> stage2;
  438. builder.build(stage1, stage2);
  439. std::cout << "Block Size: " << block_size << std::endl;
  440. std::cout << "Total Bytes: "
  441. << stage1.size()+(stage2.size()*block_size*sizeof(value_type))
  442. << std::endl;
  443. out
  444. << "\n"
  445. << " static const ::boost::uint8_t " << name << "_stage1[] = {\n"
  446. << "\n"
  447. ;
  448. print_table(out, stage1, false, 3);
  449. char const* int_name = get_int_type_name(sizeof(value_type));
  450. out
  451. << "\n"
  452. << " };"
  453. << "\n"
  454. << "\n"
  455. << " static const " << int_name << ' ' << name << "_stage2[] = {"
  456. ;
  457. int block_n = 0;
  458. for (int i = 0; i < int(stage2.size()); ++i)
  459. {
  460. value_type const* p = stage2[i];
  461. bool last = (i+1 == stage2.size());
  462. out << "\n\n // block " << block_n++ << std::endl;
  463. print_table(out,
  464. boost::iterator_range<value_type const*>(p, p+block_size), !last, field_width);
  465. }
  466. out
  467. << "\n"
  468. << " };"
  469. << "\n"
  470. ;
  471. out
  472. << "\n"
  473. << " inline " << int_name << ' ' << name << "_lookup(::boost::uint32_t ch)\n"
  474. << " {\n"
  475. << " ::boost::uint32_t block_offset = " << name << "_stage1[ch / " << block_size << "] * " << block_size << ";\n"
  476. << " return " << name << "_stage2[block_offset + ch % " << block_size << "];\n"
  477. << " }\n"
  478. ;
  479. print_tail(out);
  480. }
  481. int main()
  482. {
  483. // The category tables
  484. {
  485. std::ofstream out("category_table.hpp");
  486. ucd_table_builder<uint16_t, 256> builder;
  487. builder.collect("UnicodeData.txt", 2);
  488. builder.collect("DerivedCoreProperties.txt", 1);
  489. builder.collect("PropList.txt", 1);
  490. print_file(out, builder, 4, "category");
  491. }
  492. // The script tables
  493. {
  494. std::ofstream out("script_table.hpp");
  495. ucd_table_builder<uint8_t, 256> builder;
  496. builder.collect("Scripts.txt", 1);
  497. print_file(out, builder, 3, "script");
  498. }
  499. // The lowercase tables
  500. {
  501. std::ofstream out("lowercase_table.hpp");
  502. ucd_table_builder<uint32_t, 256> builder;
  503. builder.collect("UnicodeData.txt", 13, false);
  504. print_file(out, builder, 6, "lowercase");
  505. }
  506. // The uppercase tables
  507. {
  508. std::ofstream out("uppercase_table.hpp");
  509. ucd_table_builder<uint32_t, 256> builder;
  510. builder.collect("UnicodeData.txt", 12, false);
  511. print_file(out, builder, 6, "uppercase");
  512. }
  513. return 0;
  514. }