/thirdparty/icu4c/common/ucnvhz.cpp
C++ | 625 lines | 464 code | 58 blank | 103 comment | 86 complexity | 684562e9e3ae73ec150a52507bd6eb51 MD5 | raw file
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- **********************************************************************
- * Copyright (C) 2000-2015, International Business Machines
- * Corporation and others. All Rights Reserved.
- **********************************************************************
- * file name: ucnvhz.c
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2000oct16
- * created by: Ram Viswanadha
- * 10/31/2000 Ram Implemented offsets logic function
- *
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
- #include "cmemory.h"
- #include "unicode/ucnv.h"
- #include "unicode/ucnv_cb.h"
- #include "unicode/uset.h"
- #include "unicode/utf16.h"
- #include "ucnv_bld.h"
- #include "ucnv_cnv.h"
- #include "ucnv_imp.h"
- #define UCNV_TILDE 0x7E /* ~ */
- #define UCNV_OPEN_BRACE 0x7B /* { */
- #define UCNV_CLOSE_BRACE 0x7D /* } */
- #define SB_ESCAPE "\x7E\x7D"
- #define DB_ESCAPE "\x7E\x7B"
- #define TILDE_ESCAPE "\x7E\x7E"
- #define ESC_LEN 2
- #define CONCAT_ESCAPE_MACRO(args, targetIndex,targetLength,strToAppend, err, len,sourceIndex) UPRV_BLOCK_MACRO_BEGIN { \
- while(len-->0){ \
- if(targetIndex < targetLength){ \
- args->target[targetIndex] = (unsigned char) *strToAppend; \
- if(args->offsets!=NULL){ \
- *(offsets++) = sourceIndex-1; \
- } \
- targetIndex++; \
- } \
- else{ \
- args->converter->charErrorBuffer[(int)args->converter->charErrorBufferLength++] = (unsigned char) *strToAppend; \
- *err =U_BUFFER_OVERFLOW_ERROR; \
- } \
- strToAppend++; \
- } \
- } UPRV_BLOCK_MACRO_END
- typedef struct{
- UConverter* gbConverter;
- int32_t targetIndex;
- int32_t sourceIndex;
- UBool isEscapeAppended;
- UBool isStateDBCS;
- UBool isTargetUCharDBCS;
- UBool isEmptySegment;
- }UConverterDataHZ;
- U_CDECL_BEGIN
- static void U_CALLCONV
- _HZOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
- UConverter *gbConverter;
- if(pArgs->onlyTestIsLoadable) {
- ucnv_canCreateConverter("GBK", errorCode); /* errorCode carries result */
- return;
- }
- gbConverter = ucnv_open("GBK", errorCode);
- if(U_FAILURE(*errorCode)) {
- return;
- }
- cnv->toUnicodeStatus = 0;
- cnv->fromUnicodeStatus= 0;
- cnv->mode=0;
- cnv->fromUChar32=0x0000;
- cnv->extraInfo = uprv_calloc(1, sizeof(UConverterDataHZ));
- if(cnv->extraInfo != NULL){
- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = gbConverter;
- }
- else {
- ucnv_close(gbConverter);
- *errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- }
- static void U_CALLCONV
- _HZClose(UConverter *cnv){
- if(cnv->extraInfo != NULL) {
- ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter);
- if(!cnv->isExtraLocal) {
- uprv_free(cnv->extraInfo);
- }
- cnv->extraInfo = NULL;
- }
- }
- static void U_CALLCONV
- _HZReset(UConverter *cnv, UConverterResetChoice choice){
- if(choice<=UCNV_RESET_TO_UNICODE) {
- cnv->toUnicodeStatus = 0;
- cnv->mode=0;
- if(cnv->extraInfo != NULL){
- ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
- ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
- }
- }
- if(choice!=UCNV_RESET_TO_UNICODE) {
- cnv->fromUnicodeStatus= 0;
- cnv->fromUChar32=0x0000;
- if(cnv->extraInfo != NULL){
- ((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
- ((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
- ((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
- ((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
- }
- }
- }
- /**************************************HZ Encoding*************************************************
- * Rules for HZ encoding
- *
- * In ASCII mode, a byte is interpreted as an ASCII character, unless a
- * '~' is encountered. The character '~' is an escape character. By
- * convention, it must be immediately followed ONLY by '~', '{' or '\n'
- * (<LF>), with the following special meaning.
- * 1. The escape sequence '~~' is interpreted as a '~'.
- * 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
- * 3. The escape sequence '~\n' is a line-continuation marker to be
- * consumed with no output produced.
- * In GB mode, characters are interpreted two bytes at a time as (pure)
- * GB codes until the escape-from-GB code '~}' is read. This code
- * switches the mode from GB back to ASCII. (Note that the escape-
- * from-GB code '~}' ($7E7D) is outside the defined GB range.)
- *
- * Source: RFC 1842
- *
- * Note that the formal syntax in RFC 1842 is invalid. I assume that the
- * intended definition of single-byte-segment is as follows (pedberg):
- * single-byte-segment = single-byte-seq 1*single-byte-char
- */
- static void U_CALLCONV
- UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
- UErrorCode* err){
- char tempBuf[2];
- const char *mySource = ( char *) args->source;
- UChar *myTarget = args->target;
- const char *mySourceLimit = args->sourceLimit;
- UChar32 targetUniChar = 0x0000;
- int32_t mySourceChar = 0x0000;
- UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
- tempBuf[0]=0;
- tempBuf[1]=0;
- /* Calling code already handles this situation. */
- /*if ((args->converter == NULL) || (args->targetLimit < args->target) || (mySourceLimit < args->source)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }*/
-
- while(mySource< mySourceLimit){
-
- if(myTarget < args->targetLimit){
-
- mySourceChar= (unsigned char) *mySource++;
- if(args->converter->mode == UCNV_TILDE) {
- /* second byte after ~ */
- args->converter->mode=0;
- switch(mySourceChar) {
- case 0x0A:
- /* no output for ~\n (line-continuation marker) */
- continue;
- case UCNV_TILDE:
- if(args->offsets) {
- args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
- }
- *(myTarget++)=(UChar)mySourceChar;
- myData->isEmptySegment = FALSE;
- continue;
- case UCNV_OPEN_BRACE:
- case UCNV_CLOSE_BRACE:
- myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
- if (myData->isEmptySegment) {
- myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- args->converter->toUCallbackReason = UCNV_IRREGULAR;
- args->converter->toUBytes[0] = UCNV_TILDE;
- args->converter->toUBytes[1] = static_cast<uint8_t>(mySourceChar);
- args->converter->toULength = 2;
- args->target = myTarget;
- args->source = mySource;
- return;
- }
- myData->isEmptySegment = TRUE;
- continue;
- default:
- /* if the first byte is equal to TILDE and the trail byte
- * is not a valid byte then it is an error condition
- */
- /*
- * Ticket 5691: consistent illegal sequences:
- * - We include at least the first byte in the illegal sequence.
- * - If any of the non-initial bytes could be the start of a character,
- * we stop the illegal sequence before the first one of those.
- */
- myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- args->converter->toUBytes[0] = UCNV_TILDE;
- if( myData->isStateDBCS ?
- (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
- mySourceChar <= 0x7f
- ) {
- /* The current byte could be the start of a character: Back it out. */
- args->converter->toULength = 1;
- --mySource;
- } else {
- /* Include the current byte in the illegal sequence. */
- args->converter->toUBytes[1] = static_cast<uint8_t>(mySourceChar);
- args->converter->toULength = 2;
- }
- args->target = myTarget;
- args->source = mySource;
- return;
- }
- } else if(myData->isStateDBCS) {
- if(args->converter->toUnicodeStatus == 0x00){
- /* lead byte */
- if(mySourceChar == UCNV_TILDE) {
- args->converter->mode = UCNV_TILDE;
- } else {
- /* add another bit to distinguish a 0 byte from not having seen a lead byte */
- args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
- myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
- }
- continue;
- }
- else{
- /* trail byte */
- int leadIsOk, trailIsOk;
- uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
- targetUniChar = 0xffff;
- /*
- * Ticket 5691: consistent illegal sequences:
- * - We include at least the first byte in the illegal sequence.
- * - If any of the non-initial bytes could be the start of a character,
- * we stop the illegal sequence before the first one of those.
- *
- * In HZ DBCS, if the second byte is in the 21..7e range,
- * we report only the first byte as the illegal sequence.
- * Otherwise we convert or report the pair of bytes.
- */
- leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
- trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
- if (leadIsOk && trailIsOk) {
- tempBuf[0] = (char) (leadByte+0x80) ;
- tempBuf[1] = (char) (mySourceChar+0x80);
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
- tempBuf, 2, args->converter->useFallback);
- mySourceChar= (leadByte << 8) | mySourceChar;
- } else if (trailIsOk) {
- /* report a single illegal byte and continue with the following DBCS starter byte */
- --mySource;
- mySourceChar = (int32_t)leadByte;
- } else {
- /* report a pair of illegal bytes if the second byte is not a DBCS starter */
- /* add another bit so that the code below writes 2 bytes in case of error */
- mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
- }
- args->converter->toUnicodeStatus =0x00;
- }
- }
- else{
- if(mySourceChar == UCNV_TILDE) {
- args->converter->mode = UCNV_TILDE;
- continue;
- } else if(mySourceChar <= 0x7f) {
- targetUniChar = (UChar)mySourceChar; /* ASCII */
- myData->isEmptySegment = FALSE; /* the segment has something valid */
- } else {
- targetUniChar = 0xffff;
- myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
- }
- }
- if(targetUniChar < 0xfffe){
- if(args->offsets) {
- args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
- }
- *(myTarget++)=(UChar)targetUniChar;
- }
- else /* targetUniChar>=0xfffe */ {
- if(targetUniChar == 0xfffe){
- *err = U_INVALID_CHAR_FOUND;
- }
- else{
- *err = U_ILLEGAL_CHAR_FOUND;
- }
- if(mySourceChar > 0xff){
- args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
- args->converter->toUBytes[1] = (uint8_t)mySourceChar;
- args->converter->toULength=2;
- }
- else{
- args->converter->toUBytes[0] = (uint8_t)mySourceChar;
- args->converter->toULength=1;
- }
- break;
- }
- }
- else{
- *err =U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- args->target = myTarget;
- args->source = mySource;
- }
- static void U_CALLCONV
- UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
- UErrorCode * err){
- const UChar *mySource = args->source;
- char *myTarget = args->target;
- int32_t* offsets = args->offsets;
- int32_t mySourceIndex = 0;
- int32_t myTargetIndex = 0;
- int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
- int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
- uint32_t targetUniChar = 0x0000;
- UChar32 mySourceChar = 0x0000;
- UConverterDataHZ *myConverterData=(UConverterDataHZ*)args->converter->extraInfo;
- UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
- UBool oldIsTargetUCharDBCS;
- int len =0;
- const char* escSeq=NULL;
-
- /* Calling code already handles this situation. */
- /*if ((args->converter == NULL) || (args->targetLimit < myTarget) || (args->sourceLimit < args->source)){
- *err = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }*/
- if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
- goto getTrail;
- }
- /*writing the char to the output stream */
- while (mySourceIndex < mySourceLength){
- targetUniChar = missingCharMarker;
- if (myTargetIndex < targetLength){
-
- mySourceChar = (UChar) mySource[mySourceIndex++];
-
- oldIsTargetUCharDBCS = isTargetUCharDBCS;
- if(mySourceChar ==UCNV_TILDE){
- /*concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);*/
- len = ESC_LEN;
- escSeq = TILDE_ESCAPE;
- CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
- continue;
- } else if(mySourceChar <= 0x7f) {
- targetUniChar = mySourceChar;
- } else {
- int32_t length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
- mySourceChar,&targetUniChar,args->converter->useFallback);
- /* we can only use lead bytes 21..7D and trail bytes 21..7E */
- if( length == 2 &&
- (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
- (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
- ) {
- targetUniChar -= 0x8080;
- } else {
- targetUniChar = missingCharMarker;
- }
- }
- if (targetUniChar != missingCharMarker){
- myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
- if(oldIsTargetUCharDBCS != isTargetUCharDBCS || !myConverterData->isEscapeAppended ){
- /*Shifting from a double byte to single byte mode*/
- if(!isTargetUCharDBCS){
- len =ESC_LEN;
- escSeq = SB_ESCAPE;
- CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
- myConverterData->isEscapeAppended = TRUE;
- }
- else{ /* Shifting from a single byte to double byte mode*/
- len =ESC_LEN;
- escSeq = DB_ESCAPE;
- CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
- myConverterData->isEscapeAppended = TRUE;
-
- }
- }
-
- if(isTargetUCharDBCS){
- if( myTargetIndex <targetLength){
- myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
- if(offsets){
- *(offsets++) = mySourceIndex-1;
- }
- if(myTargetIndex < targetLength){
- myTarget[myTargetIndex++] =(char) targetUniChar;
- if(offsets){
- *(offsets++) = mySourceIndex-1;
- }
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
- }else{
- if( myTargetIndex <targetLength){
- myTarget[myTargetIndex++] = (char) (targetUniChar );
- if(offsets){
- *(offsets++) = mySourceIndex-1;
- }
-
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
- }
- }
- else{
- /* oops.. the code point is unassigned */
- /*Handle surrogates */
- /*check if the char is a First surrogate*/
- if(U16_IS_SURROGATE(mySourceChar)) {
- if(U16_IS_SURROGATE_LEAD(mySourceChar)) {
- args->converter->fromUChar32=mySourceChar;
- getTrail:
- /*look ahead to find the trail surrogate*/
- if(mySourceIndex < mySourceLength) {
- /* test the following code unit */
- UChar trail=(UChar) args->source[mySourceIndex];
- if(U16_IS_TRAIL(trail)) {
- ++mySourceIndex;
- mySourceChar=U16_GET_SUPPLEMENTARY(args->converter->fromUChar32, trail);
- args->converter->fromUChar32=0x00;
- /* there are no surrogates in GB2312*/
- *err = U_INVALID_CHAR_FOUND;
- /* exit this condition tree */
- } else {
- /* this is an unmatched lead code unit (1st surrogate) */
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- }
- } else {
- /* no more input */
- *err = U_ZERO_ERROR;
- }
- } else {
- /* this is an unmatched trail code unit (2nd surrogate) */
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- }
- } else {
- /* callback(unassigned) for a BMP code point */
- *err = U_INVALID_CHAR_FOUND;
- }
- args->converter->fromUChar32=mySourceChar;
- break;
- }
- }
- else{
- *err = U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- targetUniChar=missingCharMarker;
- }
- args->target += myTargetIndex;
- args->source += mySourceIndex;
- myConverterData->isTargetUCharDBCS = isTargetUCharDBCS;
- }
- static void U_CALLCONV
- _HZ_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
- UConverter *cnv = args->converter;
- UConverterDataHZ *convData=(UConverterDataHZ *) cnv->extraInfo;
- char *p;
- char buffer[4];
- p = buffer;
-
- if( convData->isTargetUCharDBCS){
- *p++= UCNV_TILDE;
- *p++= UCNV_CLOSE_BRACE;
- convData->isTargetUCharDBCS=FALSE;
- }
- *p++= (char)cnv->subChars[0];
- ucnv_cbFromUWriteBytes(args,
- buffer, (int32_t)(p - buffer),
- offsetIndex, err);
- }
- /*
- * Structure for cloning an HZ converter into a single memory block.
- */
- struct cloneHZStruct
- {
- UConverter cnv;
- UConverter subCnv;
- UConverterDataHZ mydata;
- };
- static UConverter * U_CALLCONV
- _HZ_SafeClone(const UConverter *cnv,
- void *stackBuffer,
- int32_t *pBufferSize,
- UErrorCode *status)
- {
- struct cloneHZStruct * localClone;
- int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct);
- if (U_FAILURE(*status)){
- return nullptr;
- }
- if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
- *pBufferSize = bufferSizeNeeded;
- return nullptr;
- }
- localClone = (struct cloneHZStruct *)stackBuffer;
- /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
- uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
- localClone->cnv.extraInfo = &localClone->mydata;
- localClone->cnv.isExtraLocal = TRUE;
- /* deep-clone the sub-converter */
- size = (int32_t)sizeof(UConverter);
- ((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter =
- ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status);
- return &localClone->cnv;
- }
- static void U_CALLCONV
- _HZ_GetUnicodeSet(const UConverter *cnv,
- const USetAdder *sa,
- UConverterUnicodeSet which,
- UErrorCode *pErrorCode) {
- /* HZ converts all of ASCII */
- sa->addRange(sa->set, 0, 0x7f);
- /* add all of the code points that the sub-converter handles */
- ucnv_MBCSGetFilteredUnicodeSetForUnicode(
- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
- sa, which, UCNV_SET_FILTER_HZ,
- pErrorCode);
- }
- U_CDECL_END
- static const UConverterImpl _HZImpl={
- UCNV_HZ,
-
- NULL,
- NULL,
-
- _HZOpen,
- _HZClose,
- _HZReset,
-
- UConverter_toUnicode_HZ_OFFSETS_LOGIC,
- UConverter_toUnicode_HZ_OFFSETS_LOGIC,
- UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
- UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
- NULL,
-
- NULL,
- NULL,
- _HZ_WriteSub,
- _HZ_SafeClone,
- _HZ_GetUnicodeSet,
- NULL,
- NULL
- };
- static const UConverterStaticData _HZStaticData={
- sizeof(UConverterStaticData),
- "HZ",
- 0,
- UCNV_IBM,
- UCNV_HZ,
- 1,
- 4,
- { 0x1a, 0, 0, 0 },
- 1,
- FALSE,
- FALSE,
- 0,
- 0,
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
- };
- const UConverterSharedData _HZData=
- UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_HZStaticData, &_HZImpl);
- #endif /* #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION */