PageRenderTime 1609ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 1ms

/googleclient/third_party/icu38/files/source/common/ucnvhz.c

http://o3d.googlecode.com/
C | 591 lines | 445 code | 73 blank | 73 comment | 81 complexity | 3c65c6943143f32bea2ae2b3e133b088 MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.0, LGPL-3.0, LGPL-2.1, MPL-2.0-no-copyleft-exception, BSD-3-Clause, GPL-2.0, Apache-2.0, MIT, CPL-1.0
  1. /*
  2. **********************************************************************
  3. * Copyright (C) 2000-2006, International Business Machines
  4. * Corporation and others. All Rights Reserved.
  5. **********************************************************************
  6. * file name: ucnvhz.c
  7. * encoding: US-ASCII
  8. * tab size: 8 (not used)
  9. * indentation:4
  10. *
  11. * created on: 2000oct16
  12. * created by: Ram Viswanadha
  13. * 10/31/2000 Ram Implemented offsets logic function
  14. *
  15. */
  16. #include "unicode/utypes.h"
  17. #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  18. #include "cmemory.h"
  19. #include "unicode/ucnv.h"
  20. #include "unicode/ucnv_cb.h"
  21. #include "unicode/uset.h"
  22. #include "ucnv_bld.h"
  23. #include "ucnv_cnv.h"
  /* The HZ escape character and the bytes that are tested after it. */
  #define UCNV_TILDE        0x7E    /* ~ */
  #define UCNV_OPEN_BRACE   0x7B    /* { */
  #define UCNV_CLOSE_BRACE  0x7D    /* } */

  /* Two-byte escape sequences written by the fromUnicode direction. */
  #define SB_ESCAPE     "\x7E\x7D"  /* "~}" : written when shifting to single-byte output */
  #define DB_ESCAPE     "\x7E\x7B"  /* "~{" : written when shifting to double-byte output */
  #define TILDE_ESCAPE  "\x7E\x7E"  /* "~~" : written for a literal tilde */
  #define ESC_LEN       2           /* number of bytes in each sequence above */
  /*
   * Appends `len` bytes starting at `strToAppend` to the output of a
   * fromUnicode call.
   *
   *   args         - conversion arguments; ->target, ->offsets and ->converter are used
   *   targetIndex  - lvalue: next free index in args->target, advanced per byte written
   *   targetLength - capacity of args->target
   *   strToAppend  - lvalue pointer to the bytes to write; advanced past them
   *   err          - UErrorCode pointer; set to U_BUFFER_OVERFLOW_ERROR on overflow
   *   len          - lvalue byte count; consumed by the loop (ends at -1)
   *   sourceIndex  - index just past the source unit; (sourceIndex-1) is the offset recorded
   *
   * Bytes that do not fit into the target are appended to the converter's
   * charErrorBuffer instead.
   *
   * NOTE: besides its arguments the macro advances a variable named `offsets`
   * that must exist in the calling scope (the caller's running copy of
   * args->offsets).
   *
   * The body is wrapped in do { } while(0) so that an invocation followed by
   * ';' is exactly one statement and is safe inside an unbraced if/else.
   */
  #define CONCAT_ESCAPE_MACRO( args, targetIndex,targetLength,strToAppend, err, len,sourceIndex) \
      do { \
          while((len)-- > 0){ \
              if((targetIndex) < (targetLength)){ \
                  (args)->target[(targetIndex)] = (unsigned char) *(strToAppend); \
                  if((args)->offsets!=NULL){ \
                      *(offsets++) = (sourceIndex)-1; \
                  } \
                  (targetIndex)++; \
              } \
              else{ \
                  (args)->converter->charErrorBuffer[(int)(args)->converter->charErrorBufferLength++] = (unsigned char) *(strToAppend); \
                  *(err) =U_BUFFER_OVERFLOW_ERROR; \
              } \
              (strToAppend)++; \
          } \
      } while(0)
  47. typedef struct{
  48. UConverter* gbConverter;
  49. int32_t targetIndex;
  50. int32_t sourceIndex;
  51. UBool isEscapeAppended;
  52. UBool isStateDBCS;
  53. UBool isTargetUCharDBCS;
  54. }UConverterDataHZ;
  55. static void
  56. _HZOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, UErrorCode *errorCode){
  57. cnv->toUnicodeStatus = 0;
  58. cnv->fromUnicodeStatus= 0;
  59. cnv->mode=0;
  60. cnv->fromUChar32=0x0000;
  61. cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
  62. if(cnv->extraInfo != NULL){
  63. uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
  64. ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
  65. }
  66. else {
  67. *errorCode = U_MEMORY_ALLOCATION_ERROR;
  68. return;
  69. }
  70. }
  71. static void
  72. _HZClose(UConverter *cnv){
  73. if(cnv->extraInfo != NULL) {
  74. ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter);
  75. if(!cnv->isExtraLocal) {
  76. uprv_free(cnv->extraInfo);
  77. }
  78. cnv->extraInfo = NULL;
  79. }
  80. }
  81. static void
  82. _HZReset(UConverter *cnv, UConverterResetChoice choice){
  83. if(choice<=UCNV_RESET_TO_UNICODE) {
  84. cnv->toUnicodeStatus = 0;
  85. cnv->mode=0;
  86. if(cnv->extraInfo != NULL){
  87. ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
  88. }
  89. }
  90. if(choice!=UCNV_RESET_TO_UNICODE) {
  91. cnv->fromUnicodeStatus= 0;
  92. cnv->fromUChar32=0x0000;
  93. if(cnv->extraInfo != NULL){
  94. ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
  95. ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
  96. ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
  97. ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
  98. }
  99. }
  100. }
  101. /**************************************HZ Encoding*************************************************
  102. * Rules for HZ encoding
  103. *
  104. * In ASCII mode, a byte is interpreted as an ASCII character, unless a
  105. * '~' is encountered. The character '~' is an escape character. By
  106. * convention, it must be immediately followed ONLY by '~', '{' or '\n'
  107. * (<LF>), with the following special meaning.
  108. * 1. The escape sequence '~~' is interpreted as a '~'.
  109. * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
  110. * 3. The escape sequence '~\n' is a line-continuation marker to be
  111. * consumed with no output produced.
  112. * In GB mode, characters are interpreted two bytes at a time as (pure)
  113. * GB codes until the escape-from-GB code '~}' is read. This code
  114. * switches the mode from GB back to ASCII. (Note that the escape-
  115. * from-GB code '~}' ($7E7D) is outside the defined GB range.)
  116. *
  117. * Source: RFC 1842
  118. */
  119. static void
  120. UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
  121. UErrorCode* err){
  122. char tempBuf[2];
  123. const char *mySource = ( char *) args->source;
  124. UChar *myTarget = args->target;
  125. const char *mySourceLimit = args->sourceLimit;
  126. UChar32 targetUniChar = 0x0000;
  127. UChar mySourceChar = 0x0000;
  128. UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
  129. tempBuf[0]=0;
  130. tempBuf[1]=0;
  131. if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
  132. *err = U_ILLEGAL_ARGUMENT_ERROR;
  133. return;
  134. }
  135. while(mySource< mySourceLimit){
  136. if(myTarget < args->targetLimit){
  137. mySourceChar= (unsigned char) *mySource++;
  138. switch(mySourceChar){
  139. case 0x0A:
  140. if(args->converter->mode ==UCNV_TILDE){
  141. args->converter->mode=0;
  142. }
  143. *(myTarget++)=(UChar)mySourceChar;
  144. continue;
  145. case UCNV_TILDE:
  146. if(args->converter->mode ==UCNV_TILDE){
  147. *(myTarget++)=(UChar)mySourceChar;
  148. args->converter->mode=0;
  149. continue;
  150. }
  151. else if(args->converter->toUnicodeStatus !=0){
  152. args->converter->mode=0;
  153. break;
  154. }
  155. else{
  156. args->converter->mode = UCNV_TILDE;
  157. continue;
  158. }
  159. case UCNV_OPEN_BRACE:
  160. if(args->converter->mode == UCNV_TILDE){
  161. args->converter->mode=0;
  162. myData->isStateDBCS = TRUE;
  163. continue;
  164. }
  165. else{
  166. break;
  167. }
  168. case UCNV_CLOSE_BRACE:
  169. if(args->converter->mode == UCNV_TILDE){
  170. args->converter->mode=0;
  171. myData->isStateDBCS = FALSE;
  172. continue;
  173. }
  174. else{
  175. break;
  176. }
  177. default:
  178. /* if the first byte is equal to TILDE and the trail byte
  179. * is not a valid byte then it is an error condition
  180. */
  181. if(args->converter->mode == UCNV_TILDE){
  182. args->converter->mode=0;
  183. mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
  184. goto SAVE_STATE;
  185. }
  186. break;
  187. }
  188. if(myData->isStateDBCS){
  189. if(args->converter->toUnicodeStatus == 0x00){
  190. args->converter->toUnicodeStatus = (UChar) mySourceChar;
  191. continue;
  192. }
  193. else{
  194. tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
  195. tempBuf[1] = (char) (mySourceChar+0x80);
  196. mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
  197. args->converter->toUnicodeStatus =0x00;
  198. targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
  199. tempBuf, 2, args->converter->useFallback);
  200. }
  201. }
  202. else{
  203. if(args->converter->fromUnicodeStatus == 0x00){
  204. targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
  205. mySource - 1, 1, args->converter->useFallback);
  206. }
  207. else{
  208. goto SAVE_STATE;
  209. }
  210. }
  211. if(targetUniChar < 0xfffe){
  212. if(args->offsets) {
  213. args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
  214. }
  215. *(myTarget++)=(UChar)targetUniChar;
  216. }
  217. else if(targetUniChar>=0xfffe){
  218. SAVE_STATE:
  219. if(targetUniChar == 0xfffe){
  220. *err = U_INVALID_CHAR_FOUND;
  221. }
  222. else{
  223. *err = U_ILLEGAL_CHAR_FOUND;
  224. }
  225. if(myData->isStateDBCS){
  226. /* this should never occur since isStateDBCS is set to true
  227. * only after tempBuf[0] and tempBuf[1]
  228. * are set to the input .. just to please BEAM
  229. */
  230. if(tempBuf[0]==0 || tempBuf[1]==0){
  231. *err = U_INTERNAL_PROGRAM_ERROR;
  232. }else{
  233. args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
  234. args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
  235. args->converter->toULength=2;
  236. }
  237. }
  238. else{
  239. args->converter->toUBytes[0] = (uint8_t)mySourceChar;
  240. args->converter->toULength=1;
  241. }
  242. break;
  243. }
  244. }
  245. else{
  246. *err =U_BUFFER_OVERFLOW_ERROR;
  247. break;
  248. }
  249. }
  250. args->target = myTarget;
  251. args->source = mySource;
  252. }
  253. static void
  254. UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
  255. UErrorCode * err){
  256. const UChar *mySource = args->source;
  257. char *myTarget = args->target;
  258. int32_t* offsets = args->offsets;
  259. int32_t mySourceIndex = 0;
  260. int32_t myTargetIndex = 0;
  261. int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
  262. int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
  263. int32_t length=0;
  264. uint32_t targetUniChar = 0x0000;
  265. UChar32 mySourceChar = 0x0000;
  266. UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
  267. UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
  268. UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
  269. int len =0;
  270. const char* escSeq=NULL;
  271. if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
  272. *err = U_ILLEGAL_ARGUMENT_ERROR;
  273. return;
  274. }
  275. if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
  276. goto getTrail;
  277. }
  278. /*writing the char to the output stream */
  279. while (mySourceIndex < mySourceLength){
  280. targetUniChar = missingCharMarker;
  281. if (myTargetIndex < targetLength){
  282. mySourceChar = (UChar) mySource[mySourceIndex++];
  283. oldIsTargetUCharDBCS = isTargetUCharDBCS;
  284. if(mySourceChar ==UCNV_TILDE){
  285. /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
  286. len = ESC_LEN;
  287. escSeq = TILDE_ESCAPE;
  288. CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
  289. continue;
  290. }
  291. else{
  292. length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
  293. mySourceChar,&targetUniChar,args->converter->useFallback);
  294. }
  295. /* only DBCS or SBCS characters are expected*/
  296. /* DB haracters with high bit set to 1 are expected */
  297. if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
  298. targetUniChar= missingCharMarker;
  299. }
  300. if (targetUniChar != missingCharMarker){
  301. myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
  302. if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){
  303. /*Shifting from a double byte to single byte mode*/
  304. if(!isTargetUCharDBCS){
  305. len =ESC_LEN;
  306. escSeq = SB_ESCAPE;
  307. CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
  308. myConverterData->isEscapeAppended = TRUE;
  309. }
  310. else{ /* Shifting from a single byte to double byte mode*/
  311. len =ESC_LEN;
  312. escSeq = DB_ESCAPE;
  313. CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
  314. myConverterData->isEscapeAppended = TRUE;
  315. }
  316. }
  317. if(isTargetUCharDBCS){
  318. if( myTargetIndex <targetLength){
  319. myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
  320. if(offsets){
  321. *(offsets++) = mySourceIndex-1;
  322. }
  323. if(myTargetIndex < targetLength){
  324. myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
  325. if(offsets){
  326. *(offsets++) = mySourceIndex-1;
  327. }
  328. }else{
  329. args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
  330. *err = U_BUFFER_OVERFLOW_ERROR;
  331. }
  332. }else{
  333. args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
  334. args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
  335. *err = U_BUFFER_OVERFLOW_ERROR;
  336. }
  337. }else{
  338. if( myTargetIndex <targetLength){
  339. myTarget[myTargetIndex++] = (char) (targetUniChar );
  340. if(offsets){
  341. *(offsets++) = mySourceIndex-1;
  342. }
  343. }else{
  344. args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
  345. *err = U_BUFFER_OVERFLOW_ERROR;
  346. }
  347. }
  348. }
  349. else{
  350. /* oops.. the code point is unassigned */
  351. /*Handle surrogates */
  352. /*check if the char is a First surrogate*/
  353. if(UTF_IS_SURROGATE(mySourceChar)) {
  354. if(UTF_IS_SURROGATE_FIRST(mySourceChar)) {
  355. args->converter->fromUChar32=mySourceChar;
  356. getTrail:
  357. /*look ahead to find the trail surrogate*/
  358. if(mySourceIndex < mySourceLength) {
  359. /* test the following code unit */
  360. UChar trail=(UChar) args->source[mySourceIndex];
  361. if(UTF_IS_SECOND_SURROGATE(trail)) {
  362. ++mySourceIndex;
  363. mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail);
  364. args->converter->fromUChar32=0x00;
  365. /* there are no surrogates in GB2312*/
  366. *err = U_INVALID_CHAR_FOUND;
  367. /* exit this condition tree */
  368. } else {
  369. /* this is an unmatched lead code unit (1st surrogate) */
  370. /* callback(illegal) */
  371. *err=U_ILLEGAL_CHAR_FOUND;
  372. }
  373. } else {
  374. /* no more input */
  375. *err = U_ZERO_ERROR;
  376. }
  377. } else {
  378. /* this is an unmatched trail code unit (2nd surrogate) */
  379. /* callback(illegal) */
  380. *err=U_ILLEGAL_CHAR_FOUND;
  381. }
  382. } else {
  383. /* callback(unassigned) for a BMP code point */
  384. *err = U_INVALID_CHAR_FOUND;
  385. }
  386. args->converter->fromUChar32=mySourceChar;
  387. break;
  388. }
  389. }
  390. else{
  391. *err = U_BUFFER_OVERFLOW_ERROR;
  392. break;
  393. }
  394. targetUniChar=missingCharMarker;
  395. }
  396. args->target += myTargetIndex;
  397. args->source += mySourceIndex;
  398. myConverterData->isTargetUCharDBCS = isTargetUCharDBCS;
  399. }
  400. static void
  401. _HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
  402. UConverter *cnv = args->converter;
  403. UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo;
  404. char *p;
  405. char buffer[4];
  406. p = buffer;
  407. if( convData->isTargetUCharDBCS){
  408. *p++= UCNV_TILDE;
  409. *p++= UCNV_CLOSE_BRACE;
  410. convData->isTargetUCharDBCS=FALSE;
  411. }
  412. *p++= (char)cnv->subChars[0];
  413. ucnv_cbFromUWriteBytes(args,
  414. buffer, (int32_t)(p - buffer),
  415. offsetIndex, err);
  416. }
  417. /*
  418. * Structure for cloning an HZ converter into a single memory block.
  419. * ucnv_safeClone() of the HZ converter will align the entire cloneHZStruct,
  420. * and then ucnv_safeClone() of the sub-converter may additionally align
  421. * subCnv inside the cloneHZStruct, for which we need the deadSpace after
  422. * subCnv. This is because UAlignedMemory may be larger than the actually
  423. * necessary alignment size for the platform.
  424. * The other cloneHZStruct fields will not be moved around,
  425. * and are aligned properly with cloneHZStruct's alignment.
  426. */
  427. struct cloneHZStruct
  428. {
  429. UConverter cnv;
  430. UConverter subCnv;
  431. UAlignedMemory deadSpace;
  432. UConverterDataHZ mydata;
  433. };
  434. static UConverter *
  435. _HZ_SafeClone(const UConverter *cnv,
  436. void *stackBuffer,
  437. int32_t *pBufferSize,
  438. UErrorCode *status)
  439. {
  440. struct cloneHZStruct * localClone;
  441. int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct);
  442. if (U_FAILURE(*status)){
  443. return 0;
  444. }
  445. if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
  446. *pBufferSize = bufferSizeNeeded;
  447. return 0;
  448. }
  449. localClone = (struct cloneHZStruct *)stackBuffer;
  450. /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
  451. uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
  452. localClone->cnv.extraInfo = &localClone->mydata;
  453. localClone->cnv.isExtraLocal = TRUE;
  454. /* deep-clone the sub-converter */
  455. size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
  456. ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter =
  457. ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status);
  458. return &localClone->cnv;
  459. }
  460. static void
  461. _HZ_GetUnicodeSet(const UConverter *cnv,
  462. const USetAdder *sa,
  463. UConverterUnicodeSet which,
  464. UErrorCode *pErrorCode) {
  465. /* the tilde '~' is hardcoded in the converter */
  466. sa->add(sa->set, 0x7e);
  467. /* add all of the code points that the sub-converter handles */
  468. ((UConverterDataHZ*)cnv->extraInfo)->
  469. gbConverter->sharedData->impl->
  470. getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
  471. sa, which, pErrorCode);
  472. }
  473. static const UConverterImpl _HZImpl={
  474. UCNV_HZ,
  475. NULL,
  476. NULL,
  477. _HZOpen,
  478. _HZClose,
  479. _HZReset,
  480. UConverter_toUnicode_HZ_OFFSETS_LOGIC,
  481. UConverter_toUnicode_HZ_OFFSETS_LOGIC,
  482. UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
  483. UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
  484. NULL,
  485. NULL,
  486. NULL,
  487. _HZ_WriteSub,
  488. _HZ_SafeClone,
  489. _HZ_GetUnicodeSet
  490. };
  491. static const UConverterStaticData _HZStaticData={
  492. sizeof(UConverterStaticData),
  493. "HZ",
  494. 0,
  495. UCNV_IBM,
  496. UCNV_HZ,
  497. 1,
  498. 4,
  499. { 0x1a, 0, 0, 0 },
  500. 1,
  501. FALSE,
  502. FALSE,
  503. 0,
  504. 0,
  505. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
  506. };
  507. const UConverterSharedData _HZData={
  508. sizeof(UConverterSharedData),
  509. ~((uint32_t) 0),
  510. NULL,
  511. NULL,
  512. &_HZStaticData,
  513. FALSE,
  514. &_HZImpl,
  515. 0
  516. };
  517. #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */