PageRenderTime 161ms CodeModel.GetById 35ms RepoModel.GetById 0ms app.codeStats 0ms

/src/intern/drw_textcodec.cpp

https://gitlab.com/benley/libdxfrw
C++ | 498 lines | 433 code | 34 blank | 31 comment | 262 complexity | ae7a4f5e3e8af09fc554e0a3579c6ffa MD5 | raw file
Possible License(s): GPL-2.0
  #include "drw_textcodec.h"
  #include <algorithm>
  #include <cctype>
  #include <cstdlib>
  #include <iomanip>
  #include <sstream>
  #include "../drw_base.h"
  #include "drw_cptables.h"
  #include "drw_cptable932.h"
  #include "drw_cptable936.h"
  #include "drw_cptable949.h"
  #include "drw_cptable950.h"
  11. DRW_TextCodec::DRW_TextCodec() {
  12. version = DRW::AC1021;
  13. conv = new DRW_Converter(NULL, 0);
  14. }
  15. DRW_TextCodec::~DRW_TextCodec() {
  16. delete conv;
  17. }
  18. void DRW_TextCodec::setVersion(std::string *v){
  19. std::string versionStr = *v;
  20. if (versionStr == "AC1009" || versionStr == "AC1006") {
  21. version = DRW::AC1009;
  22. cp = "ANSI_1252";
  23. setCodePage(&cp);
  24. } else if (versionStr == "AC1012" || versionStr == "AC1014"
  25. || versionStr == "AC1015" || versionStr == "AC1018") {
  26. version = DRW::AC1015;
  27. if (cp.empty()) { //codepage not set, initialize
  28. cp = "ANSI_1252";
  29. setCodePage(&cp);
  30. }
  31. } else {
  32. version = DRW::AC1021;
  33. cp = "ANSI_1252";
  34. }
  35. }
  36. void DRW_TextCodec::setCodePage(std::string *c){
  37. cp = correctCodePage(*c);
  38. delete conv;
  39. if (version == DRW::AC1009 || version == DRW::AC1015) {
  40. if (cp == "ANSI_874")
  41. conv = new DRW_ConvTable(DRW_Table874, CPLENGHTCOMMON);
  42. else if (cp == "ANSI_932")
  43. conv = new DRW_Conv932Table(DRW_Table932, DRW_LeadTable932,
  44. DRW_DoubleTable932, CPLENGHT932);
  45. else if (cp == "ANSI_936")
  46. conv = new DRW_ConvDBCSTable(DRW_Table936, DRW_LeadTable936,
  47. DRW_DoubleTable936, CPLENGHT936);
  48. else if (cp == "ANSI_949")
  49. conv = new DRW_ConvDBCSTable(DRW_Table949, DRW_LeadTable949,
  50. DRW_DoubleTable949, CPLENGHT949);
  51. else if (cp == "ANSI_950")
  52. conv = new DRW_ConvDBCSTable(DRW_Table950, DRW_LeadTable950,
  53. DRW_DoubleTable950, CPLENGHT950);
  54. else if (cp == "ANSI_1250")
  55. conv = new DRW_ConvTable(DRW_Table1250, CPLENGHTCOMMON);
  56. else if (cp == "ANSI_1251")
  57. conv = new DRW_ConvTable(DRW_Table1251, CPLENGHTCOMMON);
  58. else if (cp == "ANSI_1253")
  59. conv = new DRW_ConvTable(DRW_Table1253, CPLENGHTCOMMON);
  60. else if (cp == "ANSI_1254")
  61. conv = new DRW_ConvTable(DRW_Table1254, CPLENGHTCOMMON);
  62. else if (cp == "ANSI_1255")
  63. conv = new DRW_ConvTable(DRW_Table1255, CPLENGHTCOMMON);
  64. else if (cp == "ANSI_1256")
  65. conv = new DRW_ConvTable(DRW_Table1256, CPLENGHTCOMMON);
  66. else if (cp == "ANSI_1257")
  67. conv = new DRW_ConvTable(DRW_Table1257, CPLENGHTCOMMON);
  68. else if (cp == "ANSI_1258")
  69. conv = new DRW_ConvTable(DRW_Table1258, CPLENGHTCOMMON);
  70. else if (cp == "UTF-8") { //DXF older than 2007 are write in win codepages
  71. cp = "ANSI_1252";
  72. conv = new DRW_Converter(NULL, 0);
  73. } else
  74. conv = new DRW_ConvTable(DRW_Table1252, CPLENGHTCOMMON);
  75. } else {
  76. conv = new DRW_Converter(NULL, 0);
  77. }
  78. }
  79. std::string DRW_TextCodec::toUtf8(std::string s) {
  80. return conv->toUtf8(&s);
  81. }
  82. std::string DRW_TextCodec::fromUtf8(std::string s) {
  83. return conv->fromUtf8(&s);
  84. }
  85. std::string DRW_Converter::toUtf8(std::string *s) {
  86. std::string result;
  87. int j = 0;
  88. unsigned int i= 0;
  89. for (i=0; i < s->length(); i++) {
  90. unsigned char c = s->at(i);
  91. if (c < 0x80) { //ascii check for /U+????
  92. if (c == '\\' && i+6 < s->length() && s->at(i+1) == 'U' && s->at(i+2) == '+') {
  93. result += s->substr(j,i-j);
  94. result += encodeText(s->substr(i,7));
  95. i +=6;
  96. j = i+1;
  97. }
  98. } else if (c < 0xE0 ) {//2 bits
  99. i++;
  100. } else if (c < 0xF0 ) {//3 bits
  101. i +=2;
  102. } else if (c < 0xF8 ) {//4 bits
  103. i +=3;
  104. }
  105. }
  106. result += s->substr(j);
  107. return result;
  108. }
  109. std::string DRW_ConvTable::fromUtf8(std::string *s) {
  110. std::string result;
  111. bool notFound;
  112. int code;
  113. int j = 0;
  114. for (unsigned int i=0; i < s->length(); i++) {
  115. unsigned char c = s->at(i);
  116. if (c > 0x7F) { //need to decode
  117. result += s->substr(j,i-j);
  118. std::string part1 = s->substr(i,4);
  119. int l;
  120. code = decodeNum(part1, &l);
  121. j = i+l;
  122. i = j - 1;
  123. notFound = true;
  124. for (int k=0; k<cpLenght; k++){
  125. if(table[k] == code) {
  126. result += CPOFFSET + k; //translate from table
  127. notFound = false;
  128. break;
  129. }
  130. }
  131. if (notFound)
  132. result += decodeText(code);
  133. }
  134. }
  135. result += s->substr(j);
  136. return result;
  137. }
  138. std::string DRW_ConvTable::toUtf8(std::string *s) {
  139. std::string res;
  140. std::string::iterator it;
  141. for ( it=s->begin() ; it < s->end(); it++ ) {
  142. unsigned char c = *it;
  143. if (c < 0x80) {
  144. //check for \U+ encoded text
  145. if (c == '\\') {
  146. if (it+6 < s->end() && *(it+1) == 'U' && *(it+2) == '+') {
  147. res += encodeText(std::string(it, it+7));
  148. it +=6;
  149. } else {
  150. res +=c; //no \U+ encoded text write
  151. }
  152. } else
  153. res +=c; //c!='\' ascii char write
  154. } else {//end c < 0x80
  155. res += encodeNum(table[c-0x80]); //translate from table
  156. }
  157. } //end for
  158. return res;
  159. }
  160. std::string DRW_Converter::encodeText(std::string stmp){
  161. int code;
  162. #if defined(__APPLE__)
  163. int Succeeded = sscanf (&( stmp.substr(3,4)[0]), "%x", &code );
  164. if ( !Succeeded || Succeeded == EOF )
  165. code = 0;
  166. #else
  167. std::istringstream sd(stmp.substr(3,4));
  168. sd >> std::hex >> code;
  169. #endif
  170. return encodeNum(code);
  171. }
  172. std::string DRW_Converter::decodeText(int c){
  173. std::string res = "\\U+";
  174. std::string num;
  175. #if defined(__APPLE__)
  176. std::string str(16, '\0');
  177. snprintf (&(str[0]), 16, "%04X", c );
  178. num = str;
  179. #else
  180. std::stringstream ss;
  181. ss << std::uppercase << std::setfill('0') << std::setw(4) << std::hex << c;
  182. ss >> num;
  183. #endif
  184. res += num;
  185. return res;
  186. }
  187. std::string DRW_Converter::encodeNum(int c){
  188. unsigned char ret[5];
  189. if (c < 128) { // 0-7F US-ASCII 7 bits
  190. ret[0] = c;
  191. ret[1] = 0;
  192. } else if (c < 0x800) { //80-07FF 2 bytes
  193. ret[0] = 0xC0 | (c >> 6);
  194. ret[1] = 0x80 | (c & 0x3f);
  195. ret[2] = 0;
  196. } else if (c< 0x10000) { //800-FFFF 3 bytes
  197. ret[0] = 0xe0 | (c >> 12);
  198. ret[1] = 0x80 | ((c >> 6) & 0x3f);
  199. ret[2] = 0x80 | (c & 0x3f);
  200. ret[3] = 0;
  201. } else { //10000-10FFFF 4 bytes
  202. ret[0] = 0xf0 | (c >> 18);
  203. ret[1] = 0x80 | ((c >> 12) & 0x3f);
  204. ret[2] = 0x80 | ((c >> 6) & 0x3f);
  205. ret[3] = 0x80 | (c & 0x3f);
  206. ret[4] = 0;
  207. }
  208. return std::string((char*)ret);
  209. }
  210. /** 's' is a string with at least 4 bytes lenght
  211. ** returned 'b' is byte lenght of encoded char: 2,3 or 4
  212. **/
  213. int DRW_Converter::decodeNum(std::string s, int *b){
  214. int code= 0;
  215. unsigned char c = s.at(0);
  216. if ( (c& 0xE0) == 0xC0) { //2 bytes
  217. code = ( c&0x1F)<<6;
  218. code = (s.at(1) &0x3F) | code;
  219. *b = 2;
  220. } else if ( (c& 0xF0) == 0xE0) { //3 bytes
  221. code = ( c&0x0F)<<12;
  222. code = ((s.at(1) &0x3F)<<6) | code;
  223. code = (s.at(2) &0x3F) | code;
  224. *b = 3;
  225. } else if ( (c& 0xF8) == 0xF0) { //4 bytes
  226. code = ( c&0x07)<<18;
  227. code = ((s.at(1) &0x3F)<<12) | code;
  228. code = ((s.at(2) &0x3F)<<6) | code;
  229. code = (s.at(3) &0x3F) | code;
  230. *b = 4;
  231. }
  232. return code;
  233. }
  234. std::string DRW_ConvDBCSTable::fromUtf8(std::string *s) {
  235. std::string result;
  236. bool notFound;
  237. int code;
  238. int j = 0;
  239. for (unsigned int i=0; i < s->length(); i++) {
  240. unsigned char c = s->at(i);
  241. if (c > 0x7F) { //need to decode
  242. result += s->substr(j,i-j);
  243. std::string part1 = s->substr(i,4);
  244. int l;
  245. code = decodeNum(part1, &l);
  246. j = i+l;
  247. i = j - 1;
  248. notFound = true;
  249. for (int k=0; k<cpLenght; k++){
  250. if(doubleTable[k][1] == code) {
  251. int data = doubleTable[k][0];
  252. char d[3];
  253. d[0] = data >> 8;
  254. d[1] = data & 0xFF;
  255. d[2]= '\0';
  256. result += d; //translate from table
  257. notFound = false;
  258. break;
  259. }
  260. }
  261. if (notFound)
  262. result += decodeText(code);
  263. } //direct conversion
  264. }
  265. result += s->substr(j);
  266. return result;
  267. }
  268. std::string DRW_ConvDBCSTable::toUtf8(std::string *s) {
  269. std::string res;
  270. std::string::iterator it;
  271. for ( it=s->begin() ; it < s->end(); it++ ) {
  272. bool notFound = true;
  273. unsigned char c = *it;
  274. if (c < 0x80) {
  275. notFound = false;
  276. //check for \U+ encoded text
  277. if (c == '\\') {
  278. if (it+6 < s->end() && *(it+1) == 'U' && *(it+2) == '+') {
  279. res += encodeText(std::string(it, it+7));
  280. it +=6;
  281. } else {
  282. res +=c; //no \U+ encoded text write
  283. }
  284. } else
  285. res +=c; //c!='\' ascii char write
  286. } else if(c == 0x80 ){//1 byte table
  287. notFound = false;
  288. res += encodeNum(0x20AC);//euro sign
  289. } else {//2 bytes
  290. ++it;
  291. int code = (c << 8) | (unsigned char )(*it);
  292. int sta = leadTable[c-0x81];
  293. int end = leadTable[c-0x80];
  294. for (int k=sta; k<end; k++){
  295. if(doubleTable[k][0] == code) {
  296. res += encodeNum(doubleTable[k][1]); //translate from table
  297. notFound = false;
  298. break;
  299. }
  300. }
  301. }
  302. //not found
  303. if (notFound) res += encodeNum(NOTFOUND936);
  304. } //end for
  305. return res;
  306. }
  307. std::string DRW_Conv932Table::fromUtf8(std::string *s) {
  308. std::string result;
  309. bool notFound;
  310. int code;
  311. int j = 0;
  312. for (unsigned int i=0; i < s->length(); i++) {
  313. unsigned char c = s->at(i);
  314. if (c > 0x7F) { //need to decode
  315. result += s->substr(j,i-j);
  316. std::string part1 = s->substr(i,4);
  317. int l;
  318. code = decodeNum(part1, &l);
  319. j = i+l;
  320. i = j - 1;
  321. notFound = true;
  322. // 1 byte table
  323. if (code > 0xff60 && code < 0xFFA0) {
  324. result += code - CPOFFSET932; //translate from table
  325. notFound = false;
  326. }
  327. if (notFound && ( code<0xF8 || (code>0x390 && code<0x542) ||
  328. (code>0x200F && code<0x9FA1) || code>0xF928 )) {
  329. for (int k=0; k<cpLenght; k++){
  330. if(doubleTable[k][1] == code) {
  331. int data = doubleTable[k][0];
  332. char d[3];
  333. d[0] = data >> 8;
  334. d[1] = data & 0xFF;
  335. d[2]= '\0';
  336. result += d; //translate from table
  337. notFound = false;
  338. break;
  339. }
  340. }
  341. }
  342. if (notFound)
  343. result += decodeText(code);
  344. } //direct conversion
  345. }
  346. result += s->substr(j);
  347. return result;
  348. }
  349. std::string DRW_Conv932Table::toUtf8(std::string *s) {
  350. std::string res;
  351. std::string::iterator it;
  352. for ( it=s->begin() ; it < s->end(); it++ ) {
  353. bool notFound = true;
  354. unsigned char c = *it;
  355. if (c < 0x80) {
  356. notFound = false;
  357. //check for \U+ encoded text
  358. if (c == '\\') {
  359. if (it+6 < s->end() && *(it+1) == 'U' && *(it+2) == '+') {
  360. res += encodeText(std::string(it, it+7));
  361. it +=6;
  362. } else {
  363. res +=c; //no \U+ encoded text write
  364. }
  365. } else
  366. res +=c; //c!='\' ascii char write
  367. } else if(c > 0xA0 && c < 0xE0 ){//1 byte table
  368. notFound = false;
  369. res += encodeNum(c + CPOFFSET932); //translate from table
  370. } else {//2 bytes
  371. ++it;
  372. int code = (c << 8) | (unsigned char )(*it);
  373. int sta;
  374. int end=0;
  375. if (c > 0x80 && c < 0xA0) {
  376. sta = DRW_LeadTable932[c-0x81];
  377. end = DRW_LeadTable932[c-0x80];
  378. } else if (c > 0xDF && c < 0xFD){
  379. sta = DRW_LeadTable932[c-0xC1];
  380. end = DRW_LeadTable932[c-0xC0];
  381. }
  382. if (end > 0) {
  383. for (int k=sta; k<end; k++){
  384. if(DRW_DoubleTable932[k][0] == code) {
  385. res += encodeNum(DRW_DoubleTable932[k][1]); //translate from table
  386. notFound = false;
  387. break;
  388. }
  389. }
  390. }
  391. }
  392. //not found
  393. if (notFound) res += encodeNum(NOTFOUND932);
  394. } //end for
  395. return res;
  396. }
  397. std::string DRW_TextCodec::correctCodePage(const std::string& s) {
  398. //stringstream cause crash in OS/X, bug#3597944
  399. std::string cp=s;
  400. transform(cp.begin(), cp.end(), cp.begin(), toupper);
  401. //Latin/Thai
  402. if (cp=="ANSI_874" || cp=="CP874" || cp=="ISO8859-11" || cp=="TIS-620") {
  403. return "ANSI_874";
  404. //Central Europe and Eastern Europe
  405. } else if (cp=="ANSI_1250" || cp=="CP1250" || cp=="ISO8859-2") {
  406. return "ANSI_1250";
  407. //Cyrillic script
  408. } else if (cp=="ANSI_1251" || cp=="CP1251" || cp=="ISO8859-5" || cp=="KOI8-R" ||
  409. cp=="KOI8-U" || cp=="IBM 866") {
  410. return "ANSI_1251";
  411. //Western Europe
  412. } else if (cp=="ANSI_1252" || cp=="CP1252" || cp=="LATIN1" || cp=="ISO-8859-1" ||
  413. cp=="CP819" || cp=="CSISO" || cp=="IBM819" || cp=="ISO_8859-1" || cp=="APPLE ROMAN" ||
  414. cp=="ISO8859-1" || cp=="ISO8859-15" || cp=="ISO-IR-100" || cp=="L1" || cp=="IBM 850") {
  415. return "ANSI_1252";
  416. //Greek
  417. } else if (cp=="ANSI_1253" || cp=="CP1253" || cp=="iso8859-7") {
  418. return "ANSI_1253";
  419. //Turkish
  420. } else if (cp=="ANSI_1254" || cp=="CP1254" || cp=="iso8859-9" || cp=="iso8859-3") {
  421. return "ANSI_1254";
  422. //Hebrew
  423. } else if (cp=="ANSI_1255" || cp=="CP1255" || cp=="iso8859-8") {
  424. return "ANSI_1255";
  425. //Arabic
  426. } else if (cp=="ANSI_1256" || cp=="CP1256" || cp=="ISO8859-6") {
  427. return "ANSI_1256";
  428. //Baltic
  429. } else if (cp=="ANSI_1257" || cp=="CP1257" || cp=="ISO8859-4" || cp=="ISO8859-10" || cp=="ISO8859-13") {
  430. return "ANSI_1257";
  431. //Vietnamese
  432. } else if (cp=="ANSI_1258" || cp=="CP1258") {
  433. return "ANSI_1258";
  434. //Japanese
  435. } else if (cp=="ANSI_932" || cp=="SHIFT-JIS" || cp=="SHIFT_JIS" || cp=="CSSHIFTJIS" ||
  436. cp=="CSWINDOWS31J" || cp=="MS_KANJI" || cp=="X-MS-CP932" || cp=="X-SJIS" ||
  437. cp=="EUCJP" || cp=="EUC-JP" || cp=="CSEUCPKDFMTJAPANESE" || cp=="X-EUC" ||
  438. cp=="X-EUC-JP" || cp=="JIS7") {
  439. return "ANSI_932";
  440. //Chinese PRC GBK (XGB) simplified
  441. } else if (cp=="ANSI_936" || cp=="GBK" || cp=="GB2312" || cp=="CHINESE" || cp=="CN-GB" ||
  442. cp=="CSGB2312" || cp=="CSGB231280" || cp=="CSISO58BG231280" ||
  443. cp=="GB_2312-80" || cp=="GB231280" || cp=="GB2312-80" || cp=="GBK" ||
  444. cp=="ISO-IR-58" || cp=="GB18030") {
  445. return "ANSI_936";
  446. //Korean
  447. } else if (cp=="ANSI_949" || cp=="EUCKR") {
  448. return "ANSI_949";
  449. //Chinese Big5 (Taiwan, Hong Kong SAR)
  450. } else if (cp=="ANSI_950" || cp=="BIG5" || cp=="CN-BIG5" || cp=="CSBIG5" ||
  451. cp=="X-X-BIG5" || cp=="BIG5-HKSCS") {
  452. return "ANSI_950";
  453. //celtic
  454. /* } else if (cp=="ISO8859-14") {
  455. return "ISO8859-14";
  456. } else if (cp=="TSCII") {
  457. return "TSCII"; //tamil
  458. } else if (cp=="UTF16") {
  459. return "UTF16"; */
  460. } else if (cp=="UTF-8" || cp=="UTF8" || cp=="UTF88-BIT") {
  461. return "UTF-8";
  462. }
  463. return "ANSI_1252";
  464. }