PageRenderTime 52ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/projects/poi-3.6/src/java/org/apache/poi/util/StringUtil.java

https://gitlab.com/essere.lab.public/qualitas.class-corpus
Java | 393 lines | 218 code | 20 blank | 155 comment | 47 complexity | aab06a0443fd4647a181965c85817481 MD5 | raw file
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.util;
  16. import java.io.UnsupportedEncodingException;
  17. import java.text.FieldPosition;
  18. import java.text.NumberFormat;
  19. import org.apache.poi.hssf.record.RecordInputStream;
  20. /**
  21. * Title: String Utility Description: Collection of string handling utilities<p/>
  22. *
  23. * Note - none of the methods in this class deals with {@link org.apache.poi.hssf.record.ContinueRecord}s. For such
  24. * functionality, consider using {@link RecordInputStream
  25. } *
  26. *
  27. *@author Andrew C. Oliver
  28. *@author Sergei Kozello (sergeikozello at mail.ru)
  29. *@author Toshiaki Kamoshida (kamoshida.toshiaki at future dot co dot jp)
  30. */
  31. public class StringUtil {
  32. private static final String ENCODING_ISO_8859_1 = "ISO-8859-1";
  33. private StringUtil() {
  34. // no instances of this class
  35. }
  36. /**
  37. * Given a byte array of 16-bit unicode characters in Little Endian
  38. * format (most important byte last), return a Java String representation
  39. * of it.
  40. *
  41. * { 0x16, 0x00 } -0x16
  42. *
  43. * @param string the byte array to be converted
  44. * @param offset the initial offset into the
  45. * byte array. it is assumed that string[ offset ] and string[ offset +
  46. * 1 ] contain the first 16-bit unicode character
  47. * @param len the length of the final string
  48. * @return the converted string, never <code>null</code>.
  49. * @exception ArrayIndexOutOfBoundsException if offset is out of bounds for
  50. * the byte array (i.e., is negative or is greater than or equal to
  51. * string.length)
  52. * @exception IllegalArgumentException if len is too large (i.e.,
  53. * there is not enough data in string to create a String of that
  54. * length)
  55. */
  56. public static String getFromUnicodeLE(
  57. final byte[] string,
  58. final int offset,
  59. final int len)
  60. throws ArrayIndexOutOfBoundsException, IllegalArgumentException {
  61. if ((offset < 0) || (offset >= string.length)) {
  62. throw new ArrayIndexOutOfBoundsException("Illegal offset");
  63. }
  64. if ((len < 0) || (((string.length - offset) / 2) < len)) {
  65. throw new IllegalArgumentException("Illegal length " + len);
  66. }
  67. try {
  68. return new String(string, offset, len * 2, "UTF-16LE");
  69. } catch (UnsupportedEncodingException e) {
  70. throw new RuntimeException(e);
  71. }
  72. }
  73. /**
  74. * Given a byte array of 16-bit unicode characters in little endian
  75. * format (most important byte last), return a Java String representation
  76. * of it.
  77. *
  78. * { 0x16, 0x00 } -0x16
  79. *
  80. * @param string the byte array to be converted
  81. * @return the converted string, never <code>null</code>
  82. */
  83. public static String getFromUnicodeLE(byte[] string) {
  84. if(string.length == 0) { return ""; }
  85. return getFromUnicodeLE(string, 0, string.length / 2);
  86. }
  87. /**
  88. * Read 8 bit data (in ISO-8859-1 codepage) into a (unicode) Java
  89. * String and return.
  90. * (In Excel terms, read compressed 8 bit unicode as a string)
  91. *
  92. * @param string byte array to read
  93. * @param offset offset to read byte array
  94. * @param len length to read byte array
  95. * @return String generated String instance by reading byte array
  96. */
  97. public static String getFromCompressedUnicode(
  98. final byte[] string,
  99. final int offset,
  100. final int len) {
  101. try {
  102. int len_to_use = Math.min(len, string.length - offset);
  103. return new String(string, offset, len_to_use, ENCODING_ISO_8859_1);
  104. } catch (UnsupportedEncodingException e) {
  105. throw new RuntimeException(e);
  106. }
  107. }
  108. public static String readCompressedUnicode(LittleEndianInput in, int nChars) {
  109. char[] buf = new char[nChars];
  110. for (int i = 0; i < buf.length; i++) {
  111. buf[i] = (char) in.readUByte();
  112. }
  113. return new String(buf);
  114. }
  115. /**
  116. * InputStream <tt>in</tt> is expected to contain:
  117. * <ol>
  118. * <li>ushort nChars</li>
  119. * <li>byte is16BitFlag</li>
  120. * <li>byte[]/char[] characterData</li>
  121. * </ol>
  122. * For this encoding, the is16BitFlag is always present even if nChars==0.
  123. */
  124. public static String readUnicodeString(LittleEndianInput in) {
  125. int nChars = in.readUShort();
  126. byte flag = in.readByte();
  127. if ((flag & 0x01) == 0) {
  128. return readCompressedUnicode(in, nChars);
  129. }
  130. return readUnicodeLE(in, nChars);
  131. }
  132. /**
  133. * InputStream <tt>in</tt> is expected to contain:
  134. * <ol>
  135. * <li>byte is16BitFlag</li>
  136. * <li>byte[]/char[] characterData</li>
  137. * </ol>
  138. * For this encoding, the is16BitFlag is always present even if nChars==0.
  139. * <br/>
  140. * This method should be used when the nChars field is <em>not</em> stored
  141. * as a ushort immediately before the is16BitFlag. Otherwise, {@link
  142. * #readUnicodeString(LittleEndianInput)} can be used.
  143. */
  144. public static String readUnicodeString(LittleEndianInput in, int nChars) {
  145. byte is16Bit = in.readByte();
  146. if ((is16Bit & 0x01) == 0) {
  147. return readCompressedUnicode(in, nChars);
  148. }
  149. return readUnicodeLE(in, nChars);
  150. }
  151. /**
  152. * OutputStream <tt>out</tt> will get:
  153. * <ol>
  154. * <li>ushort nChars</li>
  155. * <li>byte is16BitFlag</li>
  156. * <li>byte[]/char[] characterData</li>
  157. * </ol>
  158. * For this encoding, the is16BitFlag is always present even if nChars==0.
  159. */
  160. public static void writeUnicodeString(LittleEndianOutput out, String value) {
  161. int nChars = value.length();
  162. out.writeShort(nChars);
  163. boolean is16Bit = hasMultibyte(value);
  164. out.writeByte(is16Bit ? 0x01 : 0x00);
  165. if (is16Bit) {
  166. putUnicodeLE(value, out);
  167. } else {
  168. putCompressedUnicode(value, out);
  169. }
  170. }
  171. /**
  172. * OutputStream <tt>out</tt> will get:
  173. * <ol>
  174. * <li>byte is16BitFlag</li>
  175. * <li>byte[]/char[] characterData</li>
  176. * </ol>
  177. * For this encoding, the is16BitFlag is always present even if nChars==0.
  178. * <br/>
  179. * This method should be used when the nChars field is <em>not</em> stored
  180. * as a ushort immediately before the is16BitFlag. Otherwise, {@link
  181. * #writeUnicodeString(LittleEndianOutput, String)} can be used.
  182. */
  183. public static void writeUnicodeStringFlagAndData(LittleEndianOutput out, String value) {
  184. boolean is16Bit = hasMultibyte(value);
  185. out.writeByte(is16Bit ? 0x01 : 0x00);
  186. if (is16Bit) {
  187. putUnicodeLE(value, out);
  188. } else {
  189. putCompressedUnicode(value, out);
  190. }
  191. }
  192. /**
  193. * @return the number of bytes that would be written by {@link #writeUnicodeString(LittleEndianOutput, String)}
  194. */
  195. public static int getEncodedSize(String value) {
  196. int result = 2 + 1;
  197. result += value.length() * (StringUtil.hasMultibyte(value) ? 2 : 1);
  198. return result;
  199. }
  200. /**
  201. * Takes a unicode (java) string, and returns it as 8 bit data (in ISO-8859-1
  202. * codepage).
  203. * (In Excel terms, write compressed 8 bit unicode)
  204. *
  205. * @param input the String containing the data to be written
  206. * @param output the byte array to which the data is to be written
  207. * @param offset an offset into the byte arrat at which the data is start
  208. * when written
  209. */
  210. public static void putCompressedUnicode(String input, byte[] output, int offset) {
  211. byte[] bytes;
  212. try {
  213. bytes = input.getBytes(ENCODING_ISO_8859_1);
  214. } catch (UnsupportedEncodingException e) {
  215. throw new RuntimeException(e);
  216. }
  217. System.arraycopy(bytes, 0, output, offset, bytes.length);
  218. }
  219. public static void putCompressedUnicode(String input, LittleEndianOutput out) {
  220. byte[] bytes;
  221. try {
  222. bytes = input.getBytes(ENCODING_ISO_8859_1);
  223. } catch (UnsupportedEncodingException e) {
  224. throw new RuntimeException(e);
  225. }
  226. out.write(bytes);
  227. }
  228. /**
  229. * Takes a unicode string, and returns it as little endian (most
  230. * important byte last) bytes in the supplied byte array.
  231. * (In Excel terms, write uncompressed unicode)
  232. *
  233. * @param input the String containing the unicode data to be written
  234. * @param output the byte array to hold the uncompressed unicode, should be twice the length of the String
  235. * @param offset the offset to start writing into the byte array
  236. */
  237. public static void putUnicodeLE(String input, byte[] output, int offset) {
  238. byte[] bytes;
  239. try {
  240. bytes = input.getBytes("UTF-16LE");
  241. } catch (UnsupportedEncodingException e) {
  242. throw new RuntimeException(e);
  243. }
  244. System.arraycopy(bytes, 0, output, offset, bytes.length);
  245. }
  246. public static void putUnicodeLE(String input, LittleEndianOutput out) {
  247. byte[] bytes;
  248. try {
  249. bytes = input.getBytes("UTF-16LE");
  250. } catch (UnsupportedEncodingException e) {
  251. throw new RuntimeException(e);
  252. }
  253. out.write(bytes);
  254. }
  255. public static String readUnicodeLE(LittleEndianInput in, int nChars) {
  256. char[] buf = new char[nChars];
  257. for (int i = 0; i < buf.length; i++) {
  258. buf[i] = (char) in.readUShort();
  259. }
  260. return new String(buf);
  261. }
  262. /**
  263. * Apply printf() like formatting to a string.
  264. * Primarily used for logging.
  265. * @param message the string with embedded formatting info
  266. * eg. "This is a test %2.2"
  267. * @param params array of values to format into the string
  268. * @return The formatted string
  269. */
  270. public static String format(String message, Object[] params) {
  271. int currentParamNumber = 0;
  272. StringBuffer formattedMessage = new StringBuffer();
  273. for (int i = 0; i < message.length(); i++) {
  274. if (message.charAt(i) == '%') {
  275. if (currentParamNumber >= params.length) {
  276. formattedMessage.append("?missing data?");
  277. } else if (
  278. (params[currentParamNumber] instanceof Number)
  279. && (i + 1 < message.length())) {
  280. i
  281. += matchOptionalFormatting(
  282. (Number) params[currentParamNumber++],
  283. message.substring(i + 1),
  284. formattedMessage);
  285. } else {
  286. formattedMessage.append(
  287. params[currentParamNumber++].toString());
  288. }
  289. } else {
  290. if ((message.charAt(i) == '\\')
  291. && (i + 1 < message.length())
  292. && (message.charAt(i + 1) == '%')) {
  293. formattedMessage.append('%');
  294. i++;
  295. } else {
  296. formattedMessage.append(message.charAt(i));
  297. }
  298. }
  299. }
  300. return formattedMessage.toString();
  301. }
  302. private static int matchOptionalFormatting(
  303. Number number,
  304. String formatting,
  305. StringBuffer outputTo) {
  306. NumberFormat numberFormat = NumberFormat.getInstance();
  307. if ((0 < formatting.length())
  308. && Character.isDigit(formatting.charAt(0))) {
  309. numberFormat.setMinimumIntegerDigits(
  310. Integer.parseInt(formatting.charAt(0) + ""));
  311. if ((2 < formatting.length())
  312. && (formatting.charAt(1) == '.')
  313. && Character.isDigit(formatting.charAt(2))) {
  314. numberFormat.setMaximumFractionDigits(
  315. Integer.parseInt(formatting.charAt(2) + ""));
  316. numberFormat.format(number, outputTo, new FieldPosition(0));
  317. return 3;
  318. }
  319. numberFormat.format(number, outputTo, new FieldPosition(0));
  320. return 1;
  321. } else if (
  322. (0 < formatting.length()) && (formatting.charAt(0) == '.')) {
  323. if ((1 < formatting.length())
  324. && Character.isDigit(formatting.charAt(1))) {
  325. numberFormat.setMaximumFractionDigits(
  326. Integer.parseInt(formatting.charAt(1) + ""));
  327. numberFormat.format(number, outputTo, new FieldPosition(0));
  328. return 2;
  329. }
  330. }
  331. numberFormat.format(number, outputTo, new FieldPosition(0));
  332. return 1;
  333. }
  334. /**
  335. * @return the encoding we want to use, currently hardcoded to ISO-8859-1
  336. */
  337. public static String getPreferredEncoding() {
  338. return ENCODING_ISO_8859_1;
  339. }
  340. /**
  341. * check the parameter has multibyte character
  342. *
  343. * @param value string to check
  344. * @return boolean result true:string has at least one multibyte character
  345. */
  346. public static boolean hasMultibyte(String value) {
  347. if (value == null)
  348. return false;
  349. for (int i = 0; i < value.length(); i++) {
  350. char c = value.charAt(i);
  351. if (c > 0xFF) {
  352. return true;
  353. }
  354. }
  355. return false;
  356. }
  357. /**
  358. * Checks to see if a given String needs to be represented as Unicode
  359. *
  360. * @param value
  361. * @return true if string needs Unicode to be represented.
  362. */
  363. public static boolean isUnicodeString(final String value) {
  364. try {
  365. return !value.equals(new String(value.getBytes(ENCODING_ISO_8859_1),
  366. ENCODING_ISO_8859_1));
  367. } catch (UnsupportedEncodingException e) {
  368. return true;
  369. }
  370. }
  371. }