PageRenderTime 57ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/ATF2/control-software/epics-3.14.8/extensions/src/ChannelArchiver/ThirdParty/xerces-c-src2_4_0/src/xercesc/util/XMLUTF8Transcoder.cpp

http://atf2flightsim.googlecode.com/
C++ | 452 lines | 225 code | 50 blank | 177 comment | 39 complexity | 653a9a8b07c133bfbb5e4a6b3d90673a MD5 | raw file
Possible License(s): BSD-2-Clause, LGPL-2.0, IPL-1.0, BSD-3-Clause
  1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. * Copyright (c) 1999-2000 The Apache Software Foundation. All rights
  5. * reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions
  9. * are met:
  10. *
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. *
  14. * 2. Redistributions in binary form must reproduce the above copyright
  15. * notice, this list of conditions and the following disclaimer in
  16. * the documentation and/or other materials provided with the
  17. * distribution.
  18. *
  19. * 3. The end-user documentation included with the redistribution,
  20. * if any, must include the following acknowledgment:
  21. * "This product includes software developed by the
  22. * Apache Software Foundation (http://www.apache.org/)."
  23. * Alternately, this acknowledgment may appear in the software itself,
  24. * if and wherever such third-party acknowledgments normally appear.
  25. *
  26. * 4. The names "Xerces" and "Apache Software Foundation" must
  27. * not be used to endorse or promote products derived from this
  28. * software without prior written permission. For written
  29. * permission, please contact apache\@apache.org.
  30. *
  31. * 5. Products derived from this software may not be called "Apache",
  32. * nor may "Apache" appear in their name, without prior written
  33. * permission of the Apache Software Foundation.
  34. *
  35. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  36. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  37. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  38. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  39. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  40. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  41. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  42. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  43. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  44. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  45. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  46. * SUCH DAMAGE.
  47. * ====================================================================
  48. *
  49. * This software consists of voluntary contributions made by many
  50. * individuals on behalf of the Apache Software Foundation, and was
  51. * originally based on software copyright (c) 1999, International
  52. * Business Machines, Inc., http://www.ibm.com . For more information
  53. * on the Apache Software Foundation, please see
  54. * <http://www.apache.org/>.
  55. */
  56. /**
  57. * $Id: XMLUTF8Transcoder.cpp,v 1.1.1.1 2009/03/14 06:42:28 whitegr Exp $
  58. */
  59. // ---------------------------------------------------------------------------
  60. // Includes
  61. // ---------------------------------------------------------------------------
  62. #include <xercesc/util/TranscodingException.hpp>
  63. #include <xercesc/util/XMLString.hpp>
  64. #include <xercesc/util/XMLUniDefs.hpp>
  65. #include <xercesc/util/XMLUTF8Transcoder.hpp>
  66. #include <xercesc/util/UTFDataFormatException.hpp>
  67. XERCES_CPP_NAMESPACE_BEGIN
  68. // ---------------------------------------------------------------------------
  69. // Local static data
  70. //
  71. // gUTFBytes
  72. // A list of counts of trailing bytes for each initial byte in the input.
  73. //
  74. // gUTFByteIndicator
  75. // For a UTF8 sequence of n bytes, n>=2, the first byte of the
  76. // sequence must contain n 1's followed by precisely 1 0 with the
  77. // rest of the byte containing arbitrary bits. This array stores
  78. // the required bit pattern for validity checking.
  79. // gUTFByteIndicatorTest
  80. // When bitwise and'd with the observed value, if the observed
  81. // value is correct then a result matching gUTFByteIndicator will
  82. // be produced.
  83. //
  84. // gUTFOffsets
  85. // A list of values to offset each result char type, according to how
  86. // many source bytes went into making it.
  87. //
  88. // gFirstByteMark
  89. // A list of values to mask onto the first byte of an encoded sequence,
  90. // indexed by the number of bytes used to create the sequence.
  91. // ---------------------------------------------------------------------------
  92. static const XMLByte gUTFBytes[256] =
  93. {
  94. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  95. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  96. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  97. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  98. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  99. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  100. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  101. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  102. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  103. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  104. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  105. , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  106. , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  107. , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
  108. , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  109. , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
  110. };
  111. static const XMLByte gUTFByteIndicator[6] =
  112. {
  113. 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
  114. };
  115. static const XMLByte gUTFByteIndicatorTest[6] =
  116. {
  117. 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE
  118. };
  119. static const XMLUInt32 gUTFOffsets[6] =
  120. {
  121. 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
  122. };
  123. static const XMLByte gFirstByteMark[7] =
  124. {
  125. 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
  126. };
  127. // ---------------------------------------------------------------------------
  128. // XMLUTF8Transcoder: Constructors and Destructor
  129. // ---------------------------------------------------------------------------
  130. XMLUTF8Transcoder::XMLUTF8Transcoder(const XMLCh* const encodingName
  131. , const unsigned int blockSize) :
  132. XMLTranscoder(encodingName, blockSize)
  133. {
  134. }
  135. XMLUTF8Transcoder::~XMLUTF8Transcoder()
  136. {
  137. }
  138. // ---------------------------------------------------------------------------
  139. // XMLUTF8Transcoder: Implementation of the transcoder API
  140. // ---------------------------------------------------------------------------
  141. unsigned int
  142. XMLUTF8Transcoder::transcodeFrom(const XMLByte* const srcData
  143. , const unsigned int srcCount
  144. , XMLCh* const toFill
  145. , const unsigned int maxChars
  146. , unsigned int& bytesEaten
  147. , unsigned char* const charSizes)
  148. {
  149. // Watch for pathological scenario. Shouldn't happen, but...
  150. if (!srcCount || !maxChars)
  151. return 0;
  152. // If debugging, make sure that the block size is legal
  153. #if defined(XERCES_DEBUG)
  154. checkBlockSize(maxChars);
  155. #endif
  156. //
  157. // Get pointers to our start and end points of the input and output
  158. // buffers.
  159. //
  160. const XMLByte* srcPtr = srcData;
  161. const XMLByte* srcEnd = srcPtr + srcCount;
  162. XMLCh* outPtr = toFill;
  163. XMLCh* outEnd = outPtr + maxChars;
  164. unsigned char* sizePtr = charSizes;
  165. //
  166. // We now loop until we either run out of input data, or room to store
  167. // output chars.
  168. //
  169. while ((srcPtr < srcEnd) && (outPtr < outEnd))
  170. {
  171. // Special-case ASCII, which is a leading byte value of <= 127
  172. if (*srcPtr <= 127)
  173. {
  174. *outPtr++ = XMLCh(*srcPtr++);
  175. *sizePtr++ = 1;
  176. continue;
  177. }
  178. // See how many trailing src bytes this sequence is going to require
  179. const unsigned int trailingBytes = gUTFBytes[*srcPtr];
  180. //
  181. // If there are not enough source bytes to do this one, then we
  182. // are done. Note that we done >= here because we are implicitly
  183. // counting the 1 byte we get no matter what.
  184. //
  185. // If we break out here, then there is nothing to undo since we
  186. // haven't updated any pointers yet.
  187. //
  188. if (srcPtr + trailingBytes >= srcEnd)
  189. break;
  190. // Looks ok, so lets build up the value
  191. // or at least let's try to do so--remembering that
  192. // we cannot assume the encoding to be valid:
  193. // first, test first byte
  194. if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes]) {
  195. char pos[2] = {(char)0x31, 0};
  196. char len[2] = {(char)trailingBytes+0x31, 0};
  197. char byte[2] = {*srcPtr,0};
  198. ThrowXML3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len);
  199. }
  200. XMLUInt32 tmpVal = *srcPtr++;
  201. tmpVal <<= 6;
  202. for(unsigned int i=1; i<trailingBytes; i++)
  203. {
  204. if((*srcPtr & 0xC0) == 0x80)
  205. {
  206. tmpVal += *srcPtr++;
  207. tmpVal <<= 6;
  208. }
  209. else
  210. {
  211. char len[2] = {(char)trailingBytes+0x31, 0};
  212. char pos[2]= {(char)i+0x31, 0};
  213. char byte[2] = {*srcPtr,0};
  214. ThrowXML3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len);
  215. }
  216. }
  217. if((*srcPtr & 0xC0) == 0x80)
  218. {
  219. tmpVal += *srcPtr++;
  220. }
  221. else
  222. {
  223. char len[2] = {(char)trailingBytes+0x31, 0};
  224. char byte[2] = {*srcPtr,0};
  225. ThrowXML3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, len, byte, len);
  226. }
  227. // since trailingBytes comes from an array, this logic is redundant
  228. // default :
  229. // ThrowXML(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
  230. //}
  231. tmpVal -= gUTFOffsets[trailingBytes];
  232. //
  233. // If it will fit into a single char, then put it in. Otherwise
  234. // encode it as a surrogate pair. If its not valid, use the
  235. // replacement char.
  236. //
  237. if (!(tmpVal & 0xFFFF0000))
  238. {
  239. *sizePtr++ = trailingBytes + 1;
  240. *outPtr++ = XMLCh(tmpVal);
  241. }
  242. else if (tmpVal > 0x10FFFF)
  243. {
  244. //
  245. // If we've gotten more than 32 chars so far, then just break
  246. // out for now and lets process those. When we come back in
  247. // here again, we'll get no chars and throw an exception. This
  248. // way, the error will have a line and col number closer to
  249. // the real problem area.
  250. //
  251. if ((outPtr - toFill) > 32)
  252. break;
  253. ThrowXML(TranscodingException, XMLExcepts::Trans_BadSrcSeq);
  254. }
  255. else
  256. {
  257. //
  258. // If we have enough room to store the leading and trailing
  259. // chars, then lets do it. Else, pretend this one never
  260. // happened, and leave it for the next time. Since we don't
  261. // update the bytes read until the bottom of the loop, by
  262. // breaking out here its like it never happened.
  263. //
  264. if (outPtr + 1 >= outEnd)
  265. break;
  266. // Store the leading surrogate char
  267. tmpVal -= 0x10000;
  268. *sizePtr++ = trailingBytes + 1;
  269. *outPtr++ = XMLCh((tmpVal >> 10) + 0xD800);
  270. //
  271. // And then the treailing char. This one accounts for no
  272. // bytes eaten from the source, so set the char size for this
  273. // one to be zero.
  274. //
  275. *sizePtr++ = 0;
  276. *outPtr++ = XMLCh(tmpVal & 0x3FF) + 0xDC00;
  277. }
  278. }
  279. // Update the bytes eaten
  280. bytesEaten = srcPtr - srcData;
  281. // Return the characters read
  282. return outPtr - toFill;
  283. }
  284. unsigned int
  285. XMLUTF8Transcoder::transcodeTo( const XMLCh* const srcData
  286. , const unsigned int srcCount
  287. , XMLByte* const toFill
  288. , const unsigned int maxBytes
  289. , unsigned int& charsEaten
  290. , const UnRepOpts options)
  291. {
  292. // Watch for pathological scenario. Shouldn't happen, but...
  293. if (!srcCount || !maxBytes)
  294. return 0;
  295. //
  296. // Get pointers to our start and end points of the input and output
  297. // buffers.
  298. //
  299. const XMLCh* srcPtr = srcData;
  300. const XMLCh* srcEnd = srcPtr + srcCount;
  301. XMLByte* outPtr = toFill;
  302. XMLByte* outEnd = toFill + maxBytes;
  303. while (srcPtr < srcEnd)
  304. {
  305. //
  306. // Tentatively get the next char out. We have to get it into a
  307. // 32 bit value, because it could be a surrogate pair.
  308. //
  309. XMLUInt32 curVal = *srcPtr;
  310. //
  311. // If its a leading surrogate, then lets see if we have the trailing
  312. // available. If not, then give up now and leave it for next time.
  313. //
  314. unsigned int srcUsed = 1;
  315. if ((curVal >= 0xD800) && (curVal <= 0xDBFF))
  316. {
  317. if (srcPtr + 1 >= srcEnd)
  318. break;
  319. // Create the composite surrogate pair
  320. curVal = ((curVal - 0xD800) << 10)
  321. + ((*(srcPtr + 1) - 0xDC00) + 0x10000);
  322. // And indicate that we ate another one
  323. srcUsed++;
  324. }
  325. // Figure out how many bytes we need
  326. unsigned int encodedBytes;
  327. if (curVal < 0x80)
  328. encodedBytes = 1;
  329. else if (curVal < 0x800)
  330. encodedBytes = 2;
  331. else if (curVal < 0x10000)
  332. encodedBytes = 3;
  333. else if (curVal < 0x200000)
  334. encodedBytes = 4;
  335. else if (curVal < 0x4000000)
  336. encodedBytes = 5;
  337. else if (curVal <= 0x7FFFFFFF)
  338. encodedBytes = 6;
  339. else
  340. {
  341. // If the options say to throw, then throw
  342. if (options == UnRep_Throw)
  343. {
  344. XMLCh tmpBuf[16];
  345. XMLString::binToText(curVal, tmpBuf, 16, 16);
  346. ThrowXML2
  347. (
  348. TranscodingException
  349. , XMLExcepts::Trans_Unrepresentable
  350. , tmpBuf
  351. , getEncodingName()
  352. );
  353. }
  354. // Else, use the replacement character
  355. *outPtr++ = chSpace;
  356. srcPtr += srcUsed;
  357. continue;
  358. }
  359. //
  360. // If we cannot fully get this char into the output buffer,
  361. // then leave it for the next time.
  362. //
  363. if (outPtr + encodedBytes > outEnd)
  364. break;
  365. // We can do it, so update the source index
  366. srcPtr += srcUsed;
  367. //
  368. // And spit out the bytes. We spit them out in reverse order
  369. // here, so bump up the output pointer and work down as we go.
  370. //
  371. outPtr += encodedBytes;
  372. switch(encodedBytes)
  373. {
  374. case 6 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
  375. curVal >>= 6;
  376. case 5 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
  377. curVal >>= 6;
  378. case 4 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
  379. curVal >>= 6;
  380. case 3 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
  381. curVal >>= 6;
  382. case 2 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
  383. curVal >>= 6;
  384. case 1 : *--outPtr = XMLByte
  385. (
  386. curVal | gFirstByteMark[encodedBytes]
  387. );
  388. }
  389. // Add the encoded bytes back in again to indicate we've eaten them
  390. outPtr += encodedBytes;
  391. }
  392. // Fill in the chars we ate
  393. charsEaten = (srcPtr - srcData);
  394. // And return the bytes we filled in
  395. return (outPtr - toFill);
  396. }
  397. bool XMLUTF8Transcoder::canTranscodeTo(const unsigned int toCheck) const
  398. {
  399. // We can represent anything in the Unicode (with surrogates) range
  400. return (toCheck <= 0x10FFFF);
  401. }
  402. XERCES_CPP_NAMESPACE_END