PageRenderTime 132ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 1ms

/java-1.7.0-openjdk/openjdk/jaxws/sources/jaxws_src/src/com/sun/xml/internal/messaging/saaj/packaging/mime/internet/MimeUtility.java

#
Java | 1508 lines | 781 code | 122 blank | 605 comment | 278 complexity | ef2eabfd2ed4c82e8b3de282259c7066 MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause-No-Nuclear-License-2014, LGPL-3.0, LGPL-2.0
  1. /*
  2. * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
  3. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4. *
  5. * This code is free software; you can redistribute it and/or modify it
  6. * under the terms of the GNU General Public License version 2 only, as
  7. * published by the Free Software Foundation. Oracle designates this
  8. * particular file as subject to the "Classpath" exception as provided
  9. * by Oracle in the LICENSE file that accompanied this code.
  10. *
  11. * This code is distributed in the hope that it will be useful, but WITHOUT
  12. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13. * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  14. * version 2 for more details (a copy is included in the LICENSE file that
  15. * accompanied this code).
  16. *
  17. * You should have received a copy of the GNU General Public License version
  18. * 2 along with this work; if not, write to the Free Software Foundation,
  19. * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20. *
  21. * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22. * or visit www.oracle.com if you need additional information or have any
  23. * questions.
  24. */
  25. /*
  26. * @(#)MimeUtility.java 1.45 03/03/10
  27. */
  28. package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
  29. import java.io.*;
  30. import java.util.*;
  31. import javax.activation.DataHandler;
  32. import javax.activation.DataSource;
  33. import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException;
  34. import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*;
  35. /**
  36. * This is a utility class that provides various MIME related
  37. * functionality. <p>
  38. *
  39. * There are a set of methods to encode and decode MIME headers as
  40. * per RFC 2047. A brief description on handling such headers is
  41. * given below: <p>
  42. *
  43. * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
  44. * characters. Headers that contain non US-ASCII characters must be
  45. * encoded so that they contain only US-ASCII characters. Basically,
  46. * this process involves using either BASE64 or QP to encode certain
  47. * characters. RFC 2047 describes this in detail. <p>
  48. *
  49. * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
  50. * subset of Unicode (and occupies the range 0 - 127). A String
  51. * that contains only ASCII characters is already mail-safe. If the
  52. * String contains non US-ASCII characters, it must be encoded. An
  53. * additional complexity in this step is that since Unicode is not
  54. * yet a widely used charset, one might want to first charset-encode
  55. * the String into another charset and then do the transfer-encoding.
  56. * <p>
  57. * Note that to get the actual bytes of a mail-safe String (say,
  58. * for sending over SMTP), one must do
  59. * <p><blockquote><pre>
  60. *
  61. * byte[] bytes = string.getBytes("iso-8859-1");
  62. *
  63. * </pre></blockquote><p>
  64. *
  65. * The <code>setHeader</code> and <code>addHeader</code> methods
  66. * on MimeMessage and MimeBodyPart assume that the given header values
  67. * are Unicode strings that contain only US-ASCII characters. Hence
  68. * the callers of those methods must ensure that the values they pass
  69. * do not contain non US-ASCII characters. The methods in this class
  70. * help do this. <p>
  71. *
  72. * The <code>getHeader</code> family of methods on MimeMessage and
  73. * MimeBodyPart return the raw header value. These might be encoded
  74. * as per RFC 2047, and if so, must be decoded into Unicode Strings.
  75. * The methods in this class help to do this. <p>
  76. *
  77. * Several System properties control strict conformance to the MIME
  78. * spec. Note that these are not session properties but must be set
  79. * globally as System properties. <p>
  80. *
  81. * The <code>mail.mime.decodetext.strict</code> property controls
  82. * decoding of MIME encoded words. The MIME spec requires that encoded
  83. * words start at the beginning of a whitespace separated word. Some
  84. * mailers incorrectly include encoded words in the middle of a word.
  85. * If the <code>mail.mime.decodetext.strict</code> System property is
  86. * set to <code>"false"</code>, an attempt will be made to decode these
  87. * illegal encoded words. The default is true. <p>
  88. *
  89. * The <code>mail.mime.encodeeol.strict</code> property controls the
  90. * choice of Content-Transfer-Encoding for MIME parts that are not of
  91. * type "text". Often such parts will contain textual data for which
  92. * an encoding that allows normal end of line conventions is appropriate.
  93. * In rare cases, such a part will appear to contain entirely textual
  94. * data, but will require an encoding that preserves CR and LF characters
  95. * without change. If the <code>mail.mime.encodeeol.strict</code>
  96. * System property is set to <code>"true"</code>, such an encoding will
  97. * be used when necessary. The default is false. <p>
  98. *
  99. * In addition, the <code>mail.mime.charset</code> System property can
  100. * be used to specify the default MIME charset to use for encoded words
  101. * and text parts that don't otherwise specify a charset. Normally, the
  102. * default MIME charset is derived from the default Java charset, as
  103. * specified in the <code>file.encoding</code> System property. Most
  104. * applications will have no need to explicitly set the default MIME
  105. * charset. In cases where the default MIME charset to be used for
  106. * mail messages is different than the charset used for files stored on
  107. * the system, this property should be set.
  108. *
  109. * @version 1.45, 03/03/10
  110. * @author John Mani
  111. * @author Bill Shannon
  112. */
  113. public class MimeUtility {
  // This class cannot be instantiated
  private MimeUtility() { }
  // Passed as the second argument to checkAscii by getEncoding(DataSource);
  // presumably means "examine all available input" -- confirm against checkAscii.
  public static final int ALL = -1;
  // Initial capacity of the ByteArrayOutputStream that doEncode writes into.
  private static final int BUFFER_SIZE = 1024;
  // Mirrors the mail.mime.decodetext.strict System property (default true).
  // When false, decodeText and decodeWord also try decodeInnerWords on text
  // that is not a stand-alone encoded-word.
  private static boolean decodeStrict = true;
  // Mirrors the mail.mime.encodeeol.strict System property (default false).
  // Handed to AsciiOutputStream when getEncoding(DataHandler) scans non-text content.
  private static boolean encodeEolStrict = false;
  // Mirrors the mail.mime.foldencodedwords System property (default false).
  // When true, doEncode separates consecutive encoded-words with "\r\n "
  // instead of a single space.
  private static boolean foldEncodedWords = false;
  // Mirrors the mail.mime.foldtext System property (default true).
  // Not read anywhere in this part of the file.
  private static boolean foldText = true;
  122. static {
  123. try {
  124. String s = System.getProperty("mail.mime.decodetext.strict");
  125. // default to true
  126. decodeStrict = s == null || !s.equalsIgnoreCase("false");
  127. s = System.getProperty("mail.mime.encodeeol.strict");
  128. // default to false
  129. encodeEolStrict = s != null && s.equalsIgnoreCase("true");
  130. s = System.getProperty("mail.mime.foldencodedwords");
  131. // default to false
  132. foldEncodedWords = s != null && s.equalsIgnoreCase("true");
  133. s = System.getProperty("mail.mime.foldtext");
  134. // default to true
  135. foldText = s == null || !s.equalsIgnoreCase("false");
  136. } catch (SecurityException sex) {
  137. // ignore it
  138. }
  139. }
  140. /**
  141. * Get the content-transfer-encoding that should be applied
  142. * to the input stream of this datasource, to make it mailsafe. <p>
  143. *
  144. * The algorithm used here is: <br>
  145. * <ul>
  146. * <li>
  147. * If the primary type of this datasource is "text" and if all
  148. * the bytes in its input stream are US-ASCII, then the encoding
  149. * is "7bit". If more than half of the bytes are non-US-ASCII, then
  150. * the encoding is "base64". If less than half of the bytes are
  151. * non-US-ASCII, then the encoding is "quoted-printable".
  152. * <li>
  153. * If the primary type of this datasource is not "text", then if
  154. * all the bytes of its input stream are US-ASCII, the encoding
  155. * is "7bit". If there is even one non-US-ASCII character, the
  156. * encoding is "base64".
  157. * </ul>
  158. *
  159. * @param ds DataSource
  160. * @return the encoding. This is either "7bit",
  161. * "quoted-printable" or "base64"
  162. */
  163. public static String getEncoding(DataSource ds) {
  164. ContentType cType = null;
  165. InputStream is = null;
  166. String encoding = null;
  167. try {
  168. cType = new ContentType(ds.getContentType());
  169. is = ds.getInputStream();
  170. } catch (Exception ex) {
  171. return "base64"; // what else ?!
  172. }
  173. boolean isText = cType.match("text/*");
  174. // if not text, stop processing when we see non-ASCII
  175. int i = checkAscii(is, ALL, !isText);
  176. switch (i) {
  177. case ALL_ASCII:
  178. encoding = "7bit"; // all ascii
  179. break;
  180. case MOSTLY_ASCII:
  181. encoding = "quoted-printable"; // mostly ascii
  182. break;
  183. default:
  184. encoding = "base64"; // mostly binary
  185. break;
  186. }
  187. // Close the input stream
  188. try {
  189. is.close();
  190. } catch (IOException ioex) { }
  191. return encoding;
  192. }
  193. /**
  194. * Same as <code>getEncoding(DataSource)</code> except that instead
  195. * of reading the data from an <code>InputStream</code> it uses the
  196. * <code>writeTo</code> method to examine the data. This is more
  197. * efficient in the common case of a <code>DataHandler</code>
  198. * created with an object and a MIME type (for example, a
  199. * "text/plain" String) because all the I/O is done in this
  200. * thread. In the case requiring an <code>InputStream</code> the
  201. * <code>DataHandler</code> uses a thread, a pair of pipe streams,
  202. * and the <code>writeTo</code> method to produce the data. <p>
  203. *
  204. * @since JavaMail 1.2
  205. */
  206. public static String getEncoding(DataHandler dh) {
  207. ContentType cType = null;
  208. String encoding = null;
  209. /*
  210. * Try to pick the most efficient means of determining the
  211. * encoding. If this DataHandler was created using a DataSource,
  212. * the getEncoding(DataSource) method is typically faster. If
  213. * the DataHandler was created with an object, this method is
  214. * much faster. To distinguish the two cases, we use a heuristic.
  215. * A DataHandler created with an object will always have a null name.
  216. * A DataHandler created with a DataSource will usually have a
  217. * non-null name.
  218. *
  219. * XXX - This is actually quite a disgusting hack, but it makes
  220. * a common case run over twice as fast.
  221. */
  222. if (dh.getName() != null)
  223. return getEncoding(dh.getDataSource());
  224. try {
  225. cType = new ContentType(dh.getContentType());
  226. } catch (Exception ex) {
  227. return "base64"; // what else ?!
  228. }
  229. if (cType.match("text/*")) {
  230. // Check all of the available bytes
  231. AsciiOutputStream aos = new AsciiOutputStream(false, false);
  232. try {
  233. dh.writeTo(aos);
  234. } catch (IOException ex) { } // ignore it
  235. switch (aos.getAscii()) {
  236. case ALL_ASCII:
  237. encoding = "7bit"; // all ascii
  238. break;
  239. case MOSTLY_ASCII:
  240. encoding = "quoted-printable"; // mostly ascii
  241. break;
  242. default:
  243. encoding = "base64"; // mostly binary
  244. break;
  245. }
  246. } else { // not "text"
  247. // Check all of available bytes, break out if we find
  248. // at least one non-US-ASCII character
  249. AsciiOutputStream aos =
  250. new AsciiOutputStream(true, encodeEolStrict);
  251. try {
  252. dh.writeTo(aos);
  253. } catch (IOException ex) { } // ignore it
  254. if (aos.getAscii() == ALL_ASCII) // all ascii
  255. encoding = "7bit";
  256. else // found atleast one non-ascii character, use b64
  257. encoding = "base64";
  258. }
  259. return encoding;
  260. }
  261. /**
  262. * Decode the given input stream. The Input stream returned is
  263. * the decoded input stream. All the encodings defined in RFC 2045
  264. * are supported here. They include "base64", "quoted-printable",
  265. * "7bit", "8bit", and "binary". In addition, "uuencode" is also
  266. * supported.
  267. *
  268. * @param is input stream
  269. * @param encoding the encoding of the stream.
  270. * @return decoded input stream.
  271. */
  272. public static InputStream decode(InputStream is, String encoding)
  273. throws MessagingException {
  274. if (encoding.equalsIgnoreCase("base64"))
  275. return new BASE64DecoderStream(is);
  276. else if (encoding.equalsIgnoreCase("quoted-printable"))
  277. return new QPDecoderStream(is);
  278. else if (encoding.equalsIgnoreCase("uuencode") ||
  279. encoding.equalsIgnoreCase("x-uuencode") ||
  280. encoding.equalsIgnoreCase("x-uue"))
  281. return new UUDecoderStream(is);
  282. else if (encoding.equalsIgnoreCase("binary") ||
  283. encoding.equalsIgnoreCase("7bit") ||
  284. encoding.equalsIgnoreCase("8bit"))
  285. return is;
  286. else
  287. throw new MessagingException("Unknown encoding: " + encoding);
  288. }
  289. /**
  290. * Wrap an encoder around the given output stream.
  291. * All the encodings defined in RFC 2045 are supported here.
  292. * They include "base64", "quoted-printable", "7bit", "8bit" and
  293. * "binary". In addition, "uuencode" is also supported.
  294. *
  295. * @param os output stream
  296. * @param encoding the encoding of the stream.
  297. * @return output stream that applies the
  298. * specified encoding.
  299. */
  300. public static OutputStream encode(OutputStream os, String encoding)
  301. throws MessagingException {
  302. if (encoding == null)
  303. return os;
  304. else if (encoding.equalsIgnoreCase("base64"))
  305. return new BASE64EncoderStream(os);
  306. else if (encoding.equalsIgnoreCase("quoted-printable"))
  307. return new QPEncoderStream(os);
  308. else if (encoding.equalsIgnoreCase("uuencode") ||
  309. encoding.equalsIgnoreCase("x-uuencode") ||
  310. encoding.equalsIgnoreCase("x-uue"))
  311. return new UUEncoderStream(os);
  312. else if (encoding.equalsIgnoreCase("binary") ||
  313. encoding.equalsIgnoreCase("7bit") ||
  314. encoding.equalsIgnoreCase("8bit"))
  315. return os;
  316. else
  317. throw new MessagingException("Unknown encoding: " +encoding);
  318. }
  319. /**
  320. * Wrap an encoder around the given output stream.
  321. * All the encodings defined in RFC 2045 are supported here.
  322. * They include "base64", "quoted-printable", "7bit", "8bit" and
  323. * "binary". In addition, "uuencode" is also supported.
  324. * The <code>filename</code> parameter is used with the "uuencode"
  325. * encoding and is included in the encoded output.
  326. *
  327. * @param os output stream
  328. * @param encoding the encoding of the stream.
  329. * @param filename name for the file being encoded (only used
  330. * with uuencode)
  331. * @return output stream that applies the
  332. * specified encoding.
  333. * @since JavaMail 1.2
  334. */
  335. public static OutputStream encode(OutputStream os, String encoding,
  336. String filename)
  337. throws MessagingException {
  338. if (encoding == null)
  339. return os;
  340. else if (encoding.equalsIgnoreCase("base64"))
  341. return new BASE64EncoderStream(os);
  342. else if (encoding.equalsIgnoreCase("quoted-printable"))
  343. return new QPEncoderStream(os);
  344. else if (encoding.equalsIgnoreCase("uuencode") ||
  345. encoding.equalsIgnoreCase("x-uuencode") ||
  346. encoding.equalsIgnoreCase("x-uue"))
  347. return new UUEncoderStream(os, filename);
  348. else if (encoding.equalsIgnoreCase("binary") ||
  349. encoding.equalsIgnoreCase("7bit") ||
  350. encoding.equalsIgnoreCase("8bit"))
  351. return os;
  352. else
  353. throw new MessagingException("Unknown encoding: " +encoding);
  354. }
  355. /**
  356. * Encode a RFC 822 "text" token into mail-safe form as per
  357. * RFC 2047. <p>
  358. *
  359. * The given Unicode string is examined for non US-ASCII
  360. * characters. If the string contains only US-ASCII characters,
  361. * it is returned as-is. If the string contains non US-ASCII
  362. * characters, it is first character-encoded using the platform's
  363. * default charset, then transfer-encoded using either the B or
  364. * Q encoding. The resulting bytes are then returned as a Unicode
  365. * string containing only ASCII characters. <p>
  366. *
  367. * Note that this method should be used to encode only
  368. * "unstructured" RFC 822 headers. <p>
  369. *
  370. * Example of usage:
  371. * <p><blockquote><pre>
  372. *
  373. * MimeBodyPart part = ...
  374. * String rawvalue = "FooBar Mailer, Japanese version 1.1"
  375. * try {
  376. * // If we know for sure that rawvalue contains only US-ASCII
  377. * // characters, we can skip the encoding part
  378. * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
  379. * } catch (UnsupportedEncodingException e) {
  380. * // encoding failure
  381. * } catch (MessagingException me) {
  382. * // setHeader() failure
  383. * }
  384. *
  385. * </pre></blockquote><p>
  386. *
  387. * @param text unicode string
  388. * @return Unicode string containing only US-ASCII characters
  389. * @exception UnsupportedEncodingException if the encoding fails
  390. */
  391. public static String encodeText(String text)
  392. throws UnsupportedEncodingException {
  393. return encodeText(text, null, null);
  394. }
  395. /**
  396. * Encode a RFC 822 "text" token into mail-safe form as per
  397. * RFC 2047. <p>
  398. *
  399. * The given Unicode string is examined for non US-ASCII
  400. * characters. If the string contains only US-ASCII characters,
  401. * it is returned as-is. If the string contains non US-ASCII
  402. * characters, it is first character-encoded using the specified
  403. * charset, then transfer-encoded using either the B or Q encoding.
  404. * The resulting bytes are then returned as a Unicode string
  405. * containing only ASCII characters. <p>
  406. *
  407. * Note that this method should be used to encode only
  408. * "unstructured" RFC 822 headers.
  409. *
  410. * @param text the header value
  411. * @param charset the charset. If this parameter is null, the
  412. * platform's default chatset is used.
  413. * @param encoding the encoding to be used. Currently supported
  414. * values are "B" and "Q". If this parameter is null, then
  415. * the "Q" encoding is used if most of characters to be
  416. * encoded are in the ASCII charset, otherwise "B" encoding
  417. * is used.
  418. * @return Unicode string containing only US-ASCII characters
  419. */
  420. public static String encodeText(String text, String charset,
  421. String encoding)
  422. throws UnsupportedEncodingException {
  423. return encodeWord(text, charset, encoding, false);
  424. }
  425. /**
  426. * Decode "unstructured" headers, that is, headers that are defined
  427. * as '*text' as per RFC 822. <p>
  428. *
  429. * The string is decoded using the algorithm specified in
  430. * RFC 2047, Section 6.1.1. If the charset-conversion fails
  431. * for any sequence, an UnsupportedEncodingException is thrown.
  432. * If the String is not an RFC 2047 style encoded header, it is
  433. * returned as-is <p>
  434. *
  435. * Example of usage:
  436. * <p><blockquote><pre>
  437. *
  438. * MimeBodyPart part = ...
  439. * String rawvalue = null;
  440. * String value = null;
  441. * try {
  442. * if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
  443. * value = MimeUtility.decodeText(rawvalue);
  444. * } catch (UnsupportedEncodingException e) {
  445. * // Don't care
  446. * value = rawvalue;
  447. * } catch (MessagingException me) { }
  448. *
  449. * return value;
  450. *
  451. * </pre></blockquote><p>
  452. *
  453. * @param etext the possibly encoded value
  454. * @exception UnsupportedEncodingException if the charset
  455. * conversion failed.
  456. */
  457. public static String decodeText(String etext)
  458. throws UnsupportedEncodingException {
  459. /*
  460. * We look for sequences separated by "linear-white-space".
  461. * (as per RFC 2047, Section 6.1.1)
  462. * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
  463. */
  464. String lwsp = " \t\n\r";
  465. StringTokenizer st;
  466. /*
  467. * First, lets do a quick run thru the string and check
  468. * whether the sequence "=?" exists at all. If none exists,
  469. * we know there are no encoded-words in here and we can just
  470. * return the string as-is, without suffering thru the later
  471. * decoding logic.
  472. * This handles the most common case of unencoded headers
  473. * efficiently.
  474. */
  475. if (etext.indexOf("=?") == -1)
  476. return etext;
  477. // Encoded words found. Start decoding ...
  478. st = new StringTokenizer(etext, lwsp, true);
  479. StringBuffer sb = new StringBuffer(); // decode buffer
  480. StringBuffer wsb = new StringBuffer(); // white space buffer
  481. boolean prevWasEncoded = false;
  482. while (st.hasMoreTokens()) {
  483. char c;
  484. String s = st.nextToken();
  485. // If whitespace, append it to the whitespace buffer
  486. if (((c = s.charAt(0)) == ' ') || (c == '\t') ||
  487. (c == '\r') || (c == '\n'))
  488. wsb.append(c);
  489. else {
  490. // Check if token is an 'encoded-word' ..
  491. String word;
  492. try {
  493. word = decodeWord(s);
  494. // Yes, this IS an 'encoded-word'.
  495. if (!prevWasEncoded && wsb.length() > 0) {
  496. // if the previous word was also encoded, we
  497. // should ignore the collected whitespace. Else
  498. // we include the whitespace as well.
  499. sb.append(wsb);
  500. }
  501. prevWasEncoded = true;
  502. } catch (ParseException pex) {
  503. // This is NOT an 'encoded-word'.
  504. word = s;
  505. // possibly decode inner encoded words
  506. if (!decodeStrict)
  507. word = decodeInnerWords(word);
  508. // include colleced whitespace ..
  509. if (wsb.length() > 0)
  510. sb.append(wsb);
  511. prevWasEncoded = false;
  512. }
  513. sb.append(word); // append the actual word
  514. wsb.setLength(0); // reset wsb for reuse
  515. }
  516. }
  517. return sb.toString();
  518. }
  519. /**
  520. * Encode a RFC 822 "word" token into mail-safe form as per
  521. * RFC 2047. <p>
  522. *
  523. * The given Unicode string is examined for non US-ASCII
  524. * characters. If the string contains only US-ASCII characters,
  525. * it is returned as-is. If the string contains non US-ASCII
  526. * characters, it is first character-encoded using the platform's
  527. * default charset, then transfer-encoded using either the B or
  528. * Q encoding. The resulting bytes are then returned as a Unicode
  529. * string containing only ASCII characters. <p>
  530. *
  531. * This method is meant to be used when creating RFC 822 "phrases".
  532. * The InternetAddress class, for example, uses this to encode
  533. * it's 'phrase' component.
  534. *
  535. * @param text unicode string
  536. * @return Array of Unicode strings containing only US-ASCII
  537. * characters.
  538. * @exception UnsupportedEncodingException if the encoding fails
  539. */
  540. public static String encodeWord(String word)
  541. throws UnsupportedEncodingException {
  542. return encodeWord(word, null, null);
  543. }
  544. /**
  545. * Encode a RFC 822 "word" token into mail-safe form as per
  546. * RFC 2047. <p>
  547. *
  548. * The given Unicode string is examined for non US-ASCII
  549. * characters. If the string contains only US-ASCII characters,
  550. * it is returned as-is. If the string contains non US-ASCII
  551. * characters, it is first character-encoded using the specified
  552. * charset, then transfer-encoded using either the B or Q encoding.
  553. * The resulting bytes are then returned as a Unicode string
  554. * containing only ASCII characters. <p>
  555. *
  556. * @param text unicode string
  557. * @param charset the MIME charset
  558. * @param encoding the encoding to be used. Currently supported
  559. * values are "B" and "Q". If this parameter is null, then
  560. * the "Q" encoding is used if most of characters to be
  561. * encoded are in the ASCII charset, otherwise "B" encoding
  562. * is used.
  563. * @return Unicode string containing only US-ASCII characters
  564. * @exception UnsupportedEncodingException if the encoding fails
  565. */
  566. public static String encodeWord(String word, String charset,
  567. String encoding)
  568. throws UnsupportedEncodingException {
  569. return encodeWord(word, charset, encoding, true);
  570. }
  571. /*
  572. * Encode the given string. The parameter 'encodingWord' should
  573. * be true if a RFC 822 "word" token is being encoded and false if a
  574. * RFC 822 "text" token is being encoded. This is because the
  575. * "Q" encoding defined in RFC 2047 has more restrictions when
  576. * encoding "word" tokens. (Sigh)
  577. */
  578. private static String encodeWord(String string, String charset,
  579. String encoding, boolean encodingWord)
  580. throws UnsupportedEncodingException {
  581. // If 'string' contains only US-ASCII characters, just
  582. // return it.
  583. int ascii = checkAscii(string);
  584. if (ascii == ALL_ASCII)
  585. return string;
  586. // Else, apply the specified charset conversion.
  587. String jcharset;
  588. if (charset == null) { // use default charset
  589. jcharset = getDefaultJavaCharset(); // the java charset
  590. charset = getDefaultMIMECharset(); // the MIME equivalent
  591. } else // MIME charset -> java charset
  592. jcharset = javaCharset(charset);
  593. // If no transfer-encoding is specified, figure one out.
  594. if (encoding == null) {
  595. if (ascii != MOSTLY_NONASCII)
  596. encoding = "Q";
  597. else
  598. encoding = "B";
  599. }
  600. boolean b64;
  601. if (encoding.equalsIgnoreCase("B"))
  602. b64 = true;
  603. else if (encoding.equalsIgnoreCase("Q"))
  604. b64 = false;
  605. else
  606. throw new UnsupportedEncodingException(
  607. "Unknown transfer encoding: " + encoding);
  608. StringBuffer outb = new StringBuffer(); // the output buffer
  609. doEncode(string, b64, jcharset,
  610. // As per RFC 2047, size of an encoded string should not
  611. // exceed 75 bytes.
  612. // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
  613. 75 - 7 - charset.length(), // the available space
  614. "=?" + charset + "?" + encoding + "?", // prefix
  615. true, encodingWord, outb);
  616. return outb.toString();
  617. }
  618. private static void doEncode(String string, boolean b64,
  619. String jcharset, int avail, String prefix,
  620. boolean first, boolean encodingWord, StringBuffer buf)
  621. throws UnsupportedEncodingException {
  622. // First find out what the length of the encoded version of
  623. // 'string' would be.
  624. byte[] bytes = string.getBytes(jcharset);
  625. int len;
  626. if (b64) // "B" encoding
  627. len = BEncoderStream.encodedLength(bytes);
  628. else // "Q"
  629. len = QEncoderStream.encodedLength(bytes, encodingWord);
  630. int size;
  631. if ((len > avail) && ((size = string.length()) > 1)) {
  632. // If the length is greater than 'avail', split 'string'
  633. // into two and recurse.
  634. doEncode(string.substring(0, size/2), b64, jcharset,
  635. avail, prefix, first, encodingWord, buf);
  636. doEncode(string.substring(size/2, size), b64, jcharset,
  637. avail, prefix, false, encodingWord, buf);
  638. } else {
  639. // length <= than 'avail'. Encode the given string
  640. ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
  641. OutputStream eos; // the encoder
  642. if (b64) // "B" encoding
  643. eos = new BEncoderStream(os);
  644. else // "Q" encoding
  645. eos = new QEncoderStream(os, encodingWord);
  646. try { // do the encoding
  647. eos.write(bytes);
  648. eos.close();
  649. } catch (IOException ioex) { }
  650. byte[] encodedBytes = os.toByteArray(); // the encoded stuff
  651. // Now write out the encoded (all ASCII) bytes into our
  652. // StringBuffer
  653. if (!first) // not the first line of this sequence
  654. if (foldEncodedWords)
  655. buf.append("\r\n "); // start a continuation line
  656. else
  657. buf.append(" "); // line will be folded later
  658. buf.append(prefix);
  659. for (int i = 0; i < encodedBytes.length; i++)
  660. buf.append((char)encodedBytes[i]);
  661. buf.append("?="); // terminate the current sequence
  662. }
  663. }
  664. /**
  665. * The string is parsed using the rules in RFC 2047 for parsing
  666. * an "encoded-word". If the parse fails, a ParseException is
  667. * thrown. Otherwise, it is transfer-decoded, and then
  668. * charset-converted into Unicode. If the charset-conversion
  669. * fails, an UnsupportedEncodingException is thrown.<p>
  670. *
  671. * @param eword the possibly encoded value
  672. * @exception ParseException if the string is not an
  673. * encoded-word as per RFC 2047.
  674. * @exception UnsupportedEncodingException if the charset
  675. * conversion failed.
  676. */
  677. public static String decodeWord(String eword)
  678. throws ParseException, UnsupportedEncodingException {
  679. if (!eword.startsWith("=?")) // not an encoded word
  680. throw new ParseException();
  681. // get charset
  682. int start = 2; int pos;
  683. if ((pos = eword.indexOf('?', start)) == -1)
  684. throw new ParseException();
  685. String charset = javaCharset(eword.substring(start, pos));
  686. // get encoding
  687. start = pos+1;
  688. if ((pos = eword.indexOf('?', start)) == -1)
  689. throw new ParseException();
  690. String encoding = eword.substring(start, pos);
  691. // get encoded-sequence
  692. start = pos+1;
  693. if ((pos = eword.indexOf("?=", start)) == -1)
  694. throw new ParseException();
  695. String word = eword.substring(start, pos);
  696. try {
  697. // Extract the bytes from word
  698. ByteArrayInputStream bis =
  699. new ByteArrayInputStream(ASCIIUtility.getBytes(word));
  700. // Get the appropriate decoder
  701. InputStream is;
  702. if (encoding.equalsIgnoreCase("B"))
  703. is = new BASE64DecoderStream(bis);
  704. else if (encoding.equalsIgnoreCase("Q"))
  705. is = new QDecoderStream(bis);
  706. else
  707. throw new UnsupportedEncodingException(
  708. "unknown encoding: " + encoding);
  709. // For b64 & q, size of decoded word <= size of word. So
  710. // the decoded bytes must fit into the 'bytes' array. This
  711. // is certainly more efficient than writing bytes into a
  712. // ByteArrayOutputStream and then pulling out the byte[]
  713. // from it.
  714. int count = bis.available();
  715. byte[] bytes = new byte[count];
  716. // count is set to the actual number of decoded bytes
  717. count = is.read(bytes, 0, count);
  718. // Finally, convert the decoded bytes into a String using
  719. // the specified charset
  720. String s = new String(bytes, 0, count, charset);
  721. if (pos + 2 < eword.length()) {
  722. // there's still more text in the string
  723. String rest = eword.substring(pos + 2);
  724. if (!decodeStrict)
  725. rest = decodeInnerWords(rest);
  726. s += rest;
  727. }
  728. return s;
  729. } catch (UnsupportedEncodingException uex) {
  730. // explicitly catch and rethrow this exception, otherwise
  731. // the below IOException catch will swallow this up!
  732. throw uex;
  733. } catch (IOException ioex) {
  734. // Shouldn't happen.
  735. throw new ParseException();
  736. } catch (IllegalArgumentException iex) {
  737. /* An unknown charset of the form ISO-XXX-XXX, will cause
  738. * the JDK to throw an IllegalArgumentException ... Since the
  739. * JDK will attempt to create a classname using this string,
  740. * but valid classnames must not contain the character '-',
  741. * and this results in an IllegalArgumentException, rather than
  742. * the expected UnsupportedEncodingException. Yikes
  743. */
  744. throw new UnsupportedEncodingException();
  745. }
  746. }
  747. /**
  748. * Look for encoded words within a word. The MIME spec doesn't
  749. * allow this, but many broken mailers, especially Japanese mailers,
  750. * produce such incorrect encodings.
  751. */
  752. private static String decodeInnerWords(String word)
  753. throws UnsupportedEncodingException {
  754. int start = 0, i;
  755. StringBuffer buf = new StringBuffer();
  756. while ((i = word.indexOf("=?", start)) >= 0) {
  757. buf.append(word.substring(start, i));
  758. int end = word.indexOf("?=", i);
  759. if (end < 0)
  760. break;
  761. String s = word.substring(i, end + 2);
  762. try {
  763. s = decodeWord(s);
  764. } catch (ParseException pex) {
  765. // ignore it, just use the original string
  766. }
  767. buf.append(s);
  768. start = end + 2;
  769. }
  770. if (start == 0)
  771. return word;
  772. if (start < word.length())
  773. buf.append(word.substring(start));
  774. return buf.toString();
  775. }
  776. /**
  777. * A utility method to quote a word, if the word contains any
  778. * characters from the specified 'specials' list.<p>
  779. *
  780. * The <code>HeaderTokenizer</code> class defines two special
  781. * sets of delimiters - MIME and RFC 822. <p>
  782. *
  783. * This method is typically used during the generation of
  784. * RFC 822 and MIME header fields.
  785. *
  786. * @param word word to be quoted
  787. * @param specials the set of special characters
  788. * @return the possibly quoted word
  789. * @see javax.mail.internet.HeaderTokenizer#MIME
  790. * @see javax.mail.internet.HeaderTokenizer#RFC822
  791. */
  792. public static String quote(String word, String specials) {
  793. int len = word.length();
  794. /*
  795. * Look for any "bad" characters, Escape and
  796. * quote the entire string if necessary.
  797. */
  798. boolean needQuoting = false;
  799. for (int i = 0; i < len; i++) {
  800. char c = word.charAt(i);
  801. if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
  802. // need to escape them and then quote the whole string
  803. StringBuffer sb = new StringBuffer(len + 3);
  804. sb.append('"');
  805. sb.append(word.substring(0, i));
  806. int lastc = 0;
  807. for (int j = i; j < len; j++) {
  808. char cc = word.charAt(j);
  809. if ((cc == '"') || (cc == '\\') ||
  810. (cc == '\r') || (cc == '\n'))
  811. if (cc == '\n' && lastc == '\r')
  812. ; // do nothing, CR was already escaped
  813. else
  814. sb.append('\\'); // Escape the character
  815. sb.append(cc);
  816. lastc = cc;
  817. }
  818. sb.append('"');
  819. return sb.toString();
  820. } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
  821. // These characters cause the string to be quoted
  822. needQuoting = true;
  823. }
  824. if (needQuoting) {
  825. StringBuffer sb = new StringBuffer(len + 2);
  826. sb.append('"').append(word).append('"');
  827. return sb.toString();
  828. } else
  829. return word;
  830. }
  831. /**
  832. * Fold a string at linear whitespace so that each line is no longer
  833. * than 76 characters, if possible. If there are more than 76
  834. * non-whitespace characters consecutively, the string is folded at
  835. * the first whitespace after that sequence. The parameter
  836. * <code>used</code> indicates how many characters have been used in
  837. * the current line; it is usually the length of the header name. <p>
  838. *
  839. * Note that line breaks in the string aren't escaped; they probably
  840. * should be.
  841. *
  842. * @param used characters used in line so far
  843. * @param s the string to fold
  844. * @return the folded string
  845. */
  846. /*public*/ static String fold(int used, String s) {
  847. if (!foldText)
  848. return s;
  849. int end;
  850. char c;
  851. // Strip trailing spaces
  852. for (end = s.length() - 1; end >= 0; end--) {
  853. c = s.charAt(end);
  854. if (c != ' ' && c != '\t')
  855. break;
  856. }
  857. if (end != s.length() - 1)
  858. s = s.substring(0, end + 1);
  859. // if the string fits now, just return it
  860. if (used + s.length() <= 76)
  861. return s;
  862. // have to actually fold the string
  863. StringBuffer sb = new StringBuffer(s.length() + 4);
  864. char lastc = 0;
  865. while (used + s.length() > 76) {
  866. int lastspace = -1;
  867. for (int i = 0; i < s.length(); i++) {
  868. if (lastspace != -1 && used + i > 76)
  869. break;
  870. c = s.charAt(i);
  871. if (c == ' ' || c == '\t')
  872. if (!(lastc == ' ' || lastc == '\t'))
  873. lastspace = i;
  874. lastc = c;
  875. }
  876. if (lastspace == -1) {
  877. // no space, use the whole thing
  878. sb.append(s);
  879. s = "";
  880. used = 0;
  881. break;
  882. }
  883. sb.append(s.substring(0, lastspace));
  884. sb.append("\r\n");
  885. lastc = s.charAt(lastspace);
  886. sb.append(lastc);
  887. s = s.substring(lastspace + 1);
  888. used = 1;
  889. }
  890. sb.append(s);
  891. return sb.toString();
  892. }
  893. /**
  894. * Unfold a folded header. Any line breaks that aren't escaped and
  895. * are followed by whitespace are removed.
  896. *
  897. * @param s the string to unfold
  898. * @return the unfolded string
  899. */
  900. /*public*/ static String unfold(String s) {
  901. if (!foldText)
  902. return s;
  903. StringBuffer sb = null;
  904. int i;
  905. while ((i = indexOfAny(s, "\r\n")) >= 0) {
  906. int start = i;
  907. int l = s.length();
  908. i++; // skip CR or NL
  909. if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n')
  910. i++; // skip LF
  911. if (start == 0 || s.charAt(start - 1) != '\\') {
  912. char c;
  913. // if next line starts with whitespace, skip all of it
  914. // XXX - always has to be true?
  915. if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) {
  916. i++; // skip whitespace
  917. while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t'))
  918. i++;
  919. if (sb == null)
  920. sb = new StringBuffer(s.length());
  921. if (start != 0) {
  922. sb.append(s.substring(0, start));
  923. sb.append(' ');
  924. }
  925. s = s.substring(i);
  926. continue;
  927. }
  928. // it's not a continuation line, just leave it in
  929. if (sb == null)
  930. sb = new StringBuffer(s.length());
  931. sb.append(s.substring(0, i));
  932. s = s.substring(i);
  933. } else {
  934. // there's a backslash at "start - 1"
  935. // strip it out, but leave in the line break
  936. if (sb == null)
  937. sb = new StringBuffer(s.length());
  938. sb.append(s.substring(0, start - 1));
  939. sb.append(s.substring(start, i));
  940. s = s.substring(i);
  941. }
  942. }
  943. if (sb != null) {
  944. sb.append(s);
  945. return sb.toString();
  946. } else
  947. return s;
  948. }
  949. /**
  950. * Return the first index of any of the characters in "any" in "s",
  951. * or -1 if none are found.
  952. *
  953. * This should be a method on String.
  954. */
  955. private static int indexOfAny(String s, String any) {
  956. return indexOfAny(s, any, 0);
  957. }
  958. private static int indexOfAny(String s, String any, int start) {
  959. try {
  960. int len = s.length();
  961. for (int i = start; i < len; i++) {
  962. if (any.indexOf(s.charAt(i)) >= 0)
  963. return i;
  964. }
  965. return -1;
  966. } catch (StringIndexOutOfBoundsException e) {
  967. return -1;
  968. }
  969. }
  970. /**
  971. * Convert a MIME charset name into a valid Java charset name. <p>
  972. *
  973. * @param charset the MIME charset name
  974. * @return the Java charset equivalent. If a suitable mapping is
  975. * not available, the passed in charset is itself returned.
  976. */
  977. public static String javaCharset(String charset) {
  978. if (mime2java == null || charset == null)
  979. // no mapping table, or charset parameter is null
  980. return charset;
  981. String alias = (String)mime2java.get(charset.toLowerCase());
  982. return alias == null ? charset : alias;
  983. }
  984. /**
  985. * Convert a java charset into its MIME charset name. <p>
  986. *
  987. * Note that a future version of JDK (post 1.2) might provide
  988. * this functionality, in which case, we may deprecate this
  989. * method then.
  990. *
  991. * @param charset the JDK charset
  992. * @return the MIME/IANA equivalent. If a mapping
  993. * is not possible, the passed in charset itself
  994. * is returned.
  995. * @since JavaMail 1.1
  996. */
  997. public static String mimeCharset(String charset) {
  998. if (java2mime == null || charset == null)
  999. // no mapping table or charset param is null
  1000. return charset;
  1001. String alias = (String)java2mime.get(charset.toLowerCase());
  1002. return alias == null ? charset : alias;
  1003. }
  1004. private static String defaultJavaCharset;
  1005. private static String defaultMIMECharset;
  1006. /**
  1007. * Get the default charset corresponding to the system's current
  1008. * default locale. If the System property <code>mail.mime.charset</code>
  1009. * is set, a system charset corresponding to this MIME charset will be
  1010. * returned. <p>
  1011. *
  1012. * @return the default charset of the system's default locale,
  1013. * as a Java charset. (NOT a MIME charset)
  1014. * @since JavaMail 1.1
  1015. */
  1016. public static String getDefaultJavaCharset() {
  1017. if (defaultJavaCharset == null) {
  1018. /*
  1019. * If mail.mime.charset is set, it controls the default
  1020. * Java charset as well.
  1021. */
  1022. String mimecs = null;
  1023. try {
  1024. mimecs = System.getProperty("mail.mime.charset");
  1025. } catch (SecurityException ex) { } // ignore it
  1026. if (mimecs != null && mimecs.length() > 0) {
  1027. defaultJavaCharset = javaCharset(mimecs);
  1028. return defaultJavaCharset;
  1029. }
  1030. try {
  1031. defaultJavaCharset = System.getProperty("file.encoding",
  1032. "8859_1");
  1033. } catch (SecurityException sex) {
  1034. class NullInputStream extends InputStream {
  1035. public int read() {
  1036. return 0;
  1037. }
  1038. }
  1039. InputStreamReader reader =
  1040. new InputStreamReader(new NullInputStream());
  1041. defaultJavaCharset = reader.getEncoding();
  1042. if (defaultJavaCharset == null)
  1043. defaultJavaCharset = "8859_1";
  1044. }
  1045. }
  1046. return defaultJavaCharset;
  1047. }
  1048. /*
  1049. * Get the default MIME charset for this locale.
  1050. */
  1051. static String getDefaultMIMECharset() {
  1052. if (defaultMIMECharset == null) {
  1053. try {
  1054. defaultMIMECharset = System.getProperty("mail.mime.charset");
  1055. } catch (SecurityException ex) { } // ignore it
  1056. }
  1057. if (defaultMIMECharset == null)
  1058. defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
  1059. return defaultMIMECharset;
  1060. }
  1061. // Tables to map MIME charset names to Java names and vice versa.
  1062. // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
  1063. private static Hashtable mime2java;
  1064. private static Hashtable java2mime;
  1065. static {
  1066. java2mime = new Hashtable(40);
  1067. mime2java = new Hashtable(10);
  1068. try {
  1069. // Use this class's classloader to load the mapping file
  1070. // XXX - we should use SecuritySupport, but it's in another package
  1071. InputStream is =
  1072. com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream(
  1073. "/META-INF/javamail.charset.map");
  1074. if (is != null) {
  1075. is = new LineInputStream(is);
  1076. // Load the JDK-to-MIME charset mapping table
  1077. loadMappings((LineInputStream)is, java2mime);
  1078. // Load the MIME-to-JDK charset mapping table
  1079. loadMappings((LineInputStream)is, mime2java);
  1080. }
  1081. } catch (Exception ex) { }
  1082. // If we didn't load the tables, e.g., because we didn't have
  1083. // permission, load them manually. The entries here should be
  1084. // the same as the default javamail.charset.map.
  1085. if (java2mime.isEmpty()) {
  1086. java2mime.put("8859_1", "ISO-8859-1");
  1087. java2mime.put("iso8859_1", "ISO-8859-1");
  1088. java2mime.put("ISO8859-1", "ISO-8859-1");
  1089. java2mime.put("8859_2", "ISO-8859-2");
  1090. java2mime.put("iso8859_2", "ISO-8859-2");
  1091. java2mime.put("ISO8859-2", "ISO-8859-2");
  1092. java2mime.put("8859_3", "ISO-8859-3");
  1093. java2mime.put("iso8859_3", "ISO-8859-3");
  1094. java2mime.put("ISO8859-3", "ISO-8859-3");
  1095. java2mime.put("8859_4", "ISO-8859-4");
  1096. java2mime.put("iso8859_4", "ISO-8859-4");
  1097. java2mime.put("ISO8859-4", "ISO-8859-4");
  1098. java2mime.put("8859_5", "ISO-8859-5");
  1099. java2mime.put("iso8859_5", "ISO-8859-5");
  1100. java2mime.put("ISO8859-5", "ISO-8859-5");
  1101. java2mime.put("8859_6", "ISO-8859-6");
  1102. java2mime.put("iso8859_6", "ISO-8859-6");
  1103. java2mime.put("ISO8859-6", "ISO-8859-6");
  1104. java2mime.put("8859_7", "ISO-8859-7");
  1105. java2mime.put("iso8859_7", "ISO-8859-7");
  1106. java2mime.put("ISO8859-7", "ISO-8859-7");
  1107. java2mime.put("8859_8", "ISO-8859-8");
  1108. java2mime.put("iso8859_8", "ISO-8859-8");
  1109. java2mime.put("ISO8859-8", "ISO-8859-8");
  1110. java2mime.put("8859_9", "ISO-8859-9");
  1111. java2mime.put("iso8859_9", "ISO-8859-9");
  1112. java2mime.put("ISO8859-9", "ISO-8859-9");
  1113. java2mime.put("SJIS", "Shift_JIS");
  1114. java2mime.put("MS932", "Shift_JIS");
  1115. java2mime.put("JIS", "ISO-2022-JP");
  1116. java2mime.put("ISO2022JP", "ISO-2022-JP");
  1117. java2mime.put("EUC_JP", "euc-jp");
  1118. java2mime.put("KOI8_R", "koi8-r");
  1119. java2mime.put("EUC_CN", "euc-cn");
  1120. java2mime.put("EUC_TW", "euc-tw");
  1121. java2mime.put("EUC_KR", "euc-kr");
  1122. }
  1123. if (mime2java.isEmpty()) {
  1124. mime2java.put("iso-2022-cn", "ISO2022CN");
  1125. mime2java.put("iso-2022-kr", "ISO2022KR");
  1126. mime2java.put("utf-8", "UTF8");
  1127. mime2java.put("utf8", "UTF8");
  1128. mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
  1129. mime2java.put("ja_jp.eucjp", "EUCJIS");
  1130. mime2java.put("euc-kr", "KSC5601");
  1131. mime2java.put("euckr", "KSC5601");
  1132. mime2java.put("us-ascii", "ISO-8859-1");
  1133. mime2java.put("x-us-ascii", "ISO-8859-1");
  1134. }
  1135. }
  1136. private static void loadMappings(LineInputStream is, Hashtable table) {
  1137. String currLine;
  1138. while (true) {
  1139. try {
  1140. currLine = is.readLine();
  1141. } catch (IOException ioex) {
  1142. break; // error in reading, stop
  1143. }
  1144. if (currLine == null) // end of file, stop
  1145. break;
  1146. if (currLine.startsWith("--") && currLine.endsWith("--"))
  1147. // end of this table
  1148. break;
  1149. // ignore empty lines and comments
  1150. if (currLine.trim().length() == 0 || currLine.startsWith("#"))
  1151. continue;
  1152. // A valid entry is of the form <key><separator><value>
  1153. // where, <separator> := SPACE | HT. Parse this
  1154. StringTokenizer tk = new StringTokenizer(currLine, " \t");
  1155. try {
  1156. String key = tk.nextToken();
  1157. String value = tk.nextToken();
  1158. table.put(key.toLowerCase(), value);
  1159. } catch (NoSuchElementException nex) { }
  1160. }
  1161. }
  1162. static final int ALL_ASCII = 1;
  1163. static final int MOSTLY_ASCII = 2;
  1164. static final int MOSTLY_NONASCII = 3;
  1165. /**
  1166. * Check if the given string contains non US-ASCII characters.
  1167. * @param s string
  1168. * @return ALL_ASCII if all characters in the string
  1169. * belong to the US-ASCII charset. MOSTLY_ASCII
  1170. * if more than half of the available characters
  1171. * are US-ASCII characters. Else MOSTLY_NONASCII.
  1172. */
  1173. static int checkAscii(String s) {
  1174. int ascii = 0, non_ascii = 0;
  1175. int l = s.length();
  1176. for (int i = 0; i < l; i++) {
  1177. if (nonascii((int)s.charAt(i))) // non-ascii
  1178. non_ascii++;
  1179. else
  1180. ascii++;
  1181. }
  1182. if (non_ascii == 0)
  1183. return ALL_ASCII;
  1184. if (ascii > non_ascii)
  1185. return MOSTLY_ASCII;
  1186. return MOSTLY_NONASCII;
  1187. }
  1188. /**
  1189. * Check if the given byte array contains non US-ASCII characters.
  1190. * @param b byte array
  1191. * @return ALL_ASCII if all characters in the string
  1192. * belong to the US-ASCII charset. MOSTLY_ASCII
  1193. * if more than half of the available characters
  1194. * are US-ASCII characters. Else MOSTLY_NONASCII.
  1195. *
  1196. * XXX - this method is no longer used
  1197. */
  1198. static int checkAscii(byte[] b) {
  1199. int ascii = 0, non_ascii = 0;
  1200. for (int i=0; i < b.length; i++) {
  1201. // The '&' operator automatically causes b[i] to be promoted
  1202. // to an int, and we mask out the higher bytes in the int
  1203. // so that the resulting value is not a negative integer.
  1204. if (nonascii(b[i] & 0xff)) // non-ascii
  1205. non_ascii++;
  1206. else
  1207. ascii++;
  1208. }
  1209. if (non_ascii == 0)
  1210. return ALL_ASCII;
  1211. if (ascii > non_ascii)
  1212. return MOSTLY_ASCII;
  1213. return MOSTLY_NONASCII;
  1214. }
  1215. /**
  1216. * Check if the given input stream contains non US-ASCII characters.
  1217. * Upto <code>max</code> bytes are checked. If <code>max</code> is
  1218. * set to <code>ALL</code>, then all the bytes available in this
  1219. * input stream are checked. If <code>breakOnNonAscii</code> is true
  1220. * the check terminates when the first non-US-ASCII character is
  1221. * found and MOSTLY_NONASCII is returned. Else, the check continues
  1222. * till <code>max</code> bytes or till the end of stream.
  1223. *
  1224. * @param is the input stream
  1225. * @param max maximum bytes to check for. The special value
  1226. * ALL indicates that all the bytes in this input
  1227. * stream must be checked.
  1228. * @param breakOnNonAscii if <code>true</code>, then terminate the
  1229. * the check when the first non-US-ASCII character
  1230. * is found.
  1231. * @return ALL_ASCII if all characters in the string
  1232. * belong to the US-ASCII charset. MOSTLY_ASCII
  1233. * if more than half of the available characters
  1234. * are US-ASCII characters. Else MOSTLY_NONASCII.
  1235. */
  1236. static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
  1237. int ascii = 0, non_ascii = 0;
  1238. int len;
  1239. int block = 4096;
  1240. int linelen = 0;
  1241. boolean longLine = false, badEOL = false;
  1242. boolean checkEOL = encodeEolStrict && breakOnNonAscii;
  1243. byte buf[] = null;
  1244. if (max != 0) {
  1245. block = (max == ALL) ? 4096 : Math.min(max, 4096);
  1246. buf = new byte[block];
  1247. }
  1248. while (max != 0) {
  1249. try {
  1250. if ((len = is.read(buf, 0, block)) == -1)
  1251. break;
  1252. int lastb = 0;
  1253. for (int i = 0; i < len; i++) {
  1254. // The '&' operator automatically causes b[i] to
  1255. // be promoted to an int, and we mask out the higher
  1256. // bytes in the int so that the resulting value is
  1257. // not a negative integer.
  1258. int b = buf[i] & 0xff;
  1259. if (checkEOL &&
  1260. ((lastb == '\r' && b != '\n') ||
  1261. (lastb != '\r' && b == '\n')))
  1262. badEOL = true;
  1263. if (b == '\r' || b == '\n')
  1264. linelen = 0;
  1265. else {
  1266. linelen++;
  1267. if (linelen > 998) // 1000 - CRLF
  1268. longLine = true;
  1269. }
  1270. if (nonascii(b)) { // non-ascii
  1271. if (breakOnNonAscii) // we are done
  1272. return MOSTLY_NONASCII;
  1273. else
  1274. non_ascii++;
  1275. } else
  1276. ascii++;
  1277. lastb = b;
  1278. }
  1279. } catch (IOException ioex) {
  1280. break;
  1281. }
  1282. if (max != ALL)
  1283. max -= len;
  1284. }
  1285. if (max == 0 && breakOnNonAscii)
  1286. // We have been told to break on the first non-ascii character.
  1287. // We haven't got any non-ascii character yet, but then we
  1288. // have not checked all of the available bytes either. So we
  1289. // cannot say for sure that this input stream is ALL_ASCII,
  1290. // and hence we must play safe and return MOSTLY_NONASCII
  1291. return MOSTLY_NONASCII;
  1292. if (non_ascii == 0) { // no non-us-ascii characters so far
  1293. // If we're looking at non-text data, and we saw CR without LF
  1294. // or vice versa, consider this mostly non-ASCII so that it
  1295. // will be base64 encoded (since the quoted-printable encoder
  1296. // doesn't encode this case properly).
  1297. if (badEOL)
  1298. return MOSTLY_NONASCII;
  1299. // if we've seen a long line, we degrade to mostly ascii
  1300. else if (longLine)
  1301. return MOSTLY_ASCII;
  1302. else
  1303. return ALL_ASCII;
  1304. }
  1305. if (ascii > non_ascii) // mostly ascii
  1306. return MOSTLY_ASCII;
  1307. return MOSTLY_NONASCII;
  1308. }
  1309. static final boolean nonascii(int b) {
  1310. return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
  1311. }
  1312. }
  1313. /**
  1314. * An OutputStream that determines whether the data written to
  1315. * it is all ASCII, mostly ASCII, or mostly non-ASCII.
  1316. */
  1317. class AsciiOutputStream extends OutputStream {
  1318. private boolean breakOnNonAscii;
  1319. private int ascii = 0, non_ascii = 0;
  1320. private int linelen = 0;
  1321. private boolean longLine = false;
  1322. private boolean badEOL = false;
  1323. private boolean checkEOL = false;
  1324. private int lastb = 0;
  1325. private int ret = 0;
  1326. public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
  1327. this.breakOnNonAscii = breakOnNonAscii;
  1328. checkEOL = encodeEolStrict && breakOnNonAscii;
  1329. }
  1330. public void write(int b) throws IOException {
  1331. check(b);
  1332. }
  1333. public void write(byte b[]) throws IOException {
  1334. write(b, 0, b.length);
  1335. }
  1336. public void write(byte b[], int off, int len) throws IOException {
  1337. len += off;
  1338. for (int i = off; i < len ; i++)
  1339. check(b[i]);
  1340. }
  1341. private final void check(int b) throws IOException {
  1342. b &= 0xff;
  1343. if (checkEOL &&
  1344. ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
  1345. badEOL = true;
  1346. if (b == '\r' || b == '\n')
  1347. linelen = 0;
  1348. else {
  1349. linelen++;
  1350. if (linelen > 998) // 1000 - CRLF
  1351. longLine = true;
  1352. }
  1353. if (MimeUtility.nonascii(b)) { // non-ascii
  1354. non_ascii++;
  1355. if (breakOnNonAscii) { // we are done
  1356. ret = MimeUtility.MOSTLY_NONASCII;
  1357. throw new EOFException();
  1358. }
  1359. } else
  1360. ascii++;
  1361. lastb = b;
  1362. }
  1363. /**
  1364. * Return ASCII-ness of data stream.
  1365. */
  1366. public int getAscii() {
  1367. if (ret != 0)
  1368. return ret;
  1369. // If we're looking at non-text data, and we saw CR without LF
  1370. // or vice versa, consider this mostly non-ASCII so that it
  1371. // will be base64 encoded (since the quoted-printable encoder
  1372. // doesn't encode this case properly).
  1373. if (badEOL)
  1374. return MimeUtility.MOSTLY_NONASCII;
  1375. else if (non_ascii == 0) { // no non-us-ascii characters so far
  1376. // if we've seen a long line, we degrade to mostly ascii
  1377. if (longLine)
  1378. return MimeUtility.MOSTLY_ASCII;
  1379. else
  1380. return MimeUtility.ALL_ASCII;
  1381. }
  1382. if (ascii > non_ascii) // mostly ascii
  1383. return MimeUtility.MOSTLY_ASCII;
  1384. return MimeUtility.MOSTLY_NONASCII;
  1385. }
  1386. }