PageRenderTime 48ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/saxonB/net/sf/saxon/functions/EscapeURI.java

https://bitbucket.org/dmwelch/phdxnat_pipeline
Java | 250 lines | 178 code | 25 blank | 47 comment | 44 complexity | d9686500711df97c985b4365eb1c5387 MD5 | raw file
  1. package net.sf.saxon.functions;
  2. import net.sf.saxon.trans.Err;
  3. import net.sf.saxon.charcode.UnicodeCharacterSet;
  4. import net.sf.saxon.event.HTMLURIEscaper;
  5. import net.sf.saxon.expr.XPathContext;
  6. import net.sf.saxon.om.FastStringBuffer;
  7. import net.sf.saxon.om.Item;
  8. import net.sf.saxon.trans.XPathException;
  9. import net.sf.saxon.value.StringValue;
  10. import java.util.Arrays;
  11. /**
  12. * This class supports the functions encode-for-uri() and iri-to-uri()
  13. */
  14. public class EscapeURI extends SystemFunction {
  15. public static final int ENCODE_FOR_URI = 1;
  16. public static final int IRI_TO_URI = 2;
  17. public static final int HTML_URI = 3;
  18. public static boolean[] allowedASCII = new boolean[128];
  19. static {
  20. Arrays.fill(allowedASCII, 0, 32, false);
  21. Arrays.fill(allowedASCII, 33, 127, true);
  22. allowedASCII[(int)'"'] = false;
  23. allowedASCII[(int)'<'] = false;
  24. allowedASCII[(int)'>'] = false;
  25. allowedASCII[(int)'\\'] = false;
  26. allowedASCII[(int)'^'] = false;
  27. allowedASCII[(int)'`'] = false;
  28. allowedASCII[(int)'{'] = false;
  29. allowedASCII[(int)'|'] = false;
  30. allowedASCII[(int)'}'] = false;
  31. }
  32. /**
  33. * Evaluate the function
  34. */
  35. public Item evaluateItem(XPathContext c) throws XPathException {
  36. Item item = argument[0].evaluateItem(c);
  37. if (item == null) {
  38. return StringValue.EMPTY_STRING;
  39. }
  40. final CharSequence s = item.getStringValueCS();
  41. switch (operation) {
  42. case ENCODE_FOR_URI:
  43. return StringValue.makeStringValue(escape(s, "-_.~"));
  44. case IRI_TO_URI:
  45. return StringValue.makeStringValue(iriToUri(s));
  46. case HTML_URI:
  47. return StringValue.makeStringValue(HTMLURIEscaper.escapeURL(s, false));
  48. default:
  49. throw new UnsupportedOperationException("Unknown escape operation");
  50. }
  51. }
  52. /**
  53. * Escape special characters in a URI. The characters that are %HH-encoded are
  54. * all non-ASCII characters
  55. * @param s the URI to be escaped
  56. * @return the %HH-encoded string
  57. */
  58. public static CharSequence iriToUri(CharSequence s) {
  59. // NOTE: implements a late spec change which says that characters that are illegal in an IRI,
  60. // for example "\", must be %-encoded.
  61. if (allAllowedAscii(s)) {
  62. // it's worth doing a prescan to avoid the cost of copying in the common all-ASCII case
  63. return s;
  64. }
  65. FastStringBuffer sb = new FastStringBuffer(s.length()+20);
  66. for (int i=0; i<s.length(); i++) {
  67. final char c = s.charAt(i);
  68. if (c>=0x7f || !allowedASCII[(int)c]) {
  69. escapeChar(c, ((i+1)<s.length() ? s.charAt(i+1) : ' '), sb);
  70. } else {
  71. sb.append(c);
  72. }
  73. }
  74. return sb;
  75. }
  76. private static boolean allAllowedAscii(CharSequence s) {
  77. for (int i=0; i<s.length(); i++) {
  78. final char c = s.charAt(i);
  79. if (c>=0x7f || !allowedASCII[(int)c]) {
  80. return false;
  81. }
  82. }
  83. return true;
  84. }
  85. /**
  86. * Escape special characters in a URI. The characters that are %HH-encoded are
  87. * all non-ASCII characters, plus all ASCII characters except (a) letter A-Z
  88. * and a-z, (b) digits 0-9, and (c) characters listed in the allowedPunctuation
  89. * argument
  90. * @param s the URI to be escaped
  91. * @param allowedPunctuation ASCII characters other than letters and digits that
  92. * should NOT be %HH-encoded
  93. * @return the %HH-encoded string
  94. */
  95. public static CharSequence escape(CharSequence s, String allowedPunctuation) {
  96. FastStringBuffer sb = new FastStringBuffer(s.length());
  97. for (int i=0; i<s.length(); i++) {
  98. char c = s.charAt(i);
  99. if ((c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9')) {
  100. sb.append(c);
  101. } else if (c<=0x20 || c>=0x7f) {
  102. escapeChar(c, ((i+1)<s.length() ? s.charAt(i+1) : ' '), sb);
  103. } else if (allowedPunctuation.indexOf(c) >= 0) {
  104. sb.append(c);
  105. } else {
  106. escapeChar(c, ' ', sb);
  107. }
  108. }
  109. return sb;
  110. }
  111. private static final String hex = "0123456789ABCDEF";
  112. /**
  113. * Escape a single character in %HH representation, or a pair of two chars representing
  114. * a surrogate pair
  115. * @param c the character to be escaped, or the first character of a surrogate pair
  116. * @param c2 the second character of a surrogate pair
  117. * @param sb the buffer to contain the escaped result
  118. */
  119. private static void escapeChar(char c, char c2, FastStringBuffer sb) {
  120. byte[] array = new byte[4];
  121. int used = UnicodeCharacterSet.getUTF8Encoding(c, c2, array);
  122. for (int b=0; b<used; b++) {
  123. int v = (int)array[b] & 0xff;
  124. sb.append('%');
  125. sb.append(hex.charAt(v/16));
  126. sb.append(hex.charAt(v%16));
  127. }
  128. }
  129. /**
  130. * Check that any percent-encoding within a URI is well-formed. The method assumes that a percent
  131. * sign followed by two hex digits represents an octet of the UTF-8 representation of a character;
  132. * any other percent sign is assumed to represent itself.
  133. * @param uri the string to be checked for validity
  134. * @throws XPathException if the string is not validly percent-encoded
  135. */
  136. public static void checkPercentEncoding(String uri) throws XPathException {
  137. for (int i=0; i<uri.length();) {
  138. char c = uri.charAt(i);
  139. byte[] bytes;
  140. // Note: we're translating the UTF-8 byte sequence but then not using the value
  141. int expectedOctets;
  142. if (c == '%') {
  143. if (i+2 >= uri.length()) {
  144. throw new XPathException("% sign in URI must be followed by two hex digits" +
  145. Err.wrap(uri));
  146. }
  147. int h1 = hexDigits.indexOf(uri.charAt(i+1));
  148. if (h1 > 15) {
  149. h1 -= 6;
  150. }
  151. int h2 = hexDigits.indexOf(uri.charAt(i+2));
  152. if (h2 > 15) {
  153. h2 -= 6;
  154. }
  155. if (h1 >= 0 && h2 >= 0) {
  156. int b = h1<<4 | h2;
  157. expectedOctets = UTF8RepresentationLength[h1];
  158. if (expectedOctets == -1) {
  159. throw new XPathException("First %-encoded octet in URI is not valid as the start of a UTF-8 " +
  160. "character: first two bits must not be '10'" +
  161. Err.wrap(uri));
  162. }
  163. bytes = new byte[expectedOctets];
  164. bytes[0] = (byte)b;
  165. i+=3;
  166. for (int q=1; q<expectedOctets; q++) {
  167. if (i+2 > uri.length() || uri.charAt(i) != '%') {
  168. throw new XPathException("Incomplete %-encoded UTF-8 octet sequence in URI " +
  169. Err.wrap(uri));
  170. }
  171. h1 = hexDigits.indexOf(uri.charAt(i+1));
  172. if (h1 > 15) {
  173. h1 -= 6;
  174. }
  175. h2 = hexDigits.indexOf(uri.charAt(i+2));
  176. if (h2 > 15) {
  177. h2 -= 6;
  178. }
  179. if (h1 < 0 || h2 < 0) {
  180. throw new XPathException("Invalid %-encoded UTF-8 octet sequence in URI" +
  181. Err.wrap(uri));
  182. }
  183. if (UTF8RepresentationLength[h1] != -1) {
  184. throw new XPathException("In a URI, a %-encoded UTF-8 octet after the first " +
  185. "must have '10' as the first two bits" +
  186. Err.wrap(uri));
  187. }
  188. b = h1<<4 | h2;
  189. bytes[q] = (byte)b;
  190. i += 3;
  191. }
  192. } else {
  193. throw new XPathException("% sign in URI must be followed by two hex digits" +
  194. Err.wrap(uri));
  195. }
  196. } else {
  197. i++;
  198. }
  199. }
  200. }
  201. private static String hexDigits = "0123456789abcdefABCDEF";
  202. // Length of a UTF8 byte sequence, as a function of the first nibble
  203. private static int[] UTF8RepresentationLength = {1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 3, 4};
  204. }
  205. //
  206. // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
  207. // you may not use this file except in compliance with the License. You may obtain a copy of the
  208. // License at http://www.mozilla.org/MPL/
  209. //
  210. // Software distributed under the License is distributed on an "AS IS" basis,
  211. // WITHOUT WARRANTY OF ANY KIND, either express or implied.
  212. // See the License for the specific language governing rights and limitations under the License.
  213. //
  214. // The Original Code is: all this file.
  215. //
  216. // The Initial Developer of the Original Code is Michael H. Kay
  217. //
  218. // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
  219. //
  220. // Contributor(s): none.
  221. //