/std/uri.d

http://github.com/jcd/phobos · D · 543 lines · 377 code · 68 blank · 98 comment · 117 complexity · 21e9a997e028aaa31aefdcfb3eb308c0 MD5 · raw file

  1. // Written in the D programming language.
  2. /**
  3. * Encode and decode Uniform Resource Identifiers (URIs).
  4. * URIs are used in internet transfer protocols.
  5. * Valid URI characters consist of letters, digits,
  6. * and the characters $(B ;/?:@&=+$,-_.!~*'())
  7. * Reserved URI characters are $(B ;/?:@&=+$,)
  8. * Escape sequences consist of $(B %) followed by two hex digits.
  9. *
  10. * See_Also:
  11. * $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
  12. * $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
  13. * Macros:
  14. * WIKI = Phobos/StdUri
  15. *
  16. * Copyright: Copyright Digital Mars 2000 - 2009.
  17. * License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
  18. * Authors: $(WEB digitalmars.com, Walter Bright)
  19. * Source: $(PHOBOSSRC std/_uri.d)
  20. */
  21. /* Copyright Digital Mars 2000 - 2009.
  22. * Distributed under the Boost Software License, Version 1.0.
  23. * (See accompanying file LICENSE_1_0.txt or copy at
  24. * http://www.boost.org/LICENSE_1_0.txt)
  25. */
  26. module std.uri;
  27. //debug=uri; // uncomment to turn on debugging writefln's
  28. debug(uri) private import std.stdio;
  29. /* ====================== URI Functions ================ */
  30. private import std.ascii;
  31. private import std.c.stdlib;
  32. private import std.utf;
  33. import core.exception : OutOfMemoryError;
  34. import std.exception : assumeUnique;
  /**
   * Exception thrown by the URI encoding and decoding routines when they
   * meet input they cannot process.
   *
   * Every message is prefixed with "URI Exception" so the origin of the
   * failure is visible to whoever catches it as a plain Exception.
   */
  class URIException : Exception
  {
      /// Creates the exception with the generic message "URI Exception".
      this() @safe pure nothrow
      {
          super("URI Exception");
      }

      /**
       * Creates the exception with a specific description.
       * Params:
       *     msg = text appended after the "URI Exception: " prefix
       */
      this(string msg) @safe pure nothrow
      {
          super("URI Exception: " ~ msg);
      }
  }
  /// Bit flags classifying an ASCII character; combined in uri_flags[].
  enum
  {
      URI_Alpha    = 0x01, /// letters A-Z and a-z
      URI_Reserved = 0x02, /// one of ;/?:@&=+$,
      URI_Mark     = 0x04, /// one of -_.!~*'()
      URI_Digit    = 0x08, /// digits 0-9
      URI_Hash     = 0x10, /// the '#' character
  }

  /// Maps a nibble value (0 .. 15) to its upper-case hexadecimal digit.
  immutable char[16] hex2ascii = "0123456789ABCDEF";

  /// Classification table indexed by ASCII character code; each entry is an
  /// OR of the URI_* flags above. Filled in once by the module constructor.
  __gshared ubyte[128] uri_flags;

  shared static this()
  {
      // ORs `flag` into the table entry of every character listed in `chars`.
      static void mark(immutable char[] chars, uint flag)
      {
          foreach (ch; chars)
              uri_flags[ch] |= flag;
      }

      uri_flags['#'] |= URI_Hash;

      // Upper- and lower-case letters are 0x20 apart in ASCII.
      foreach (offset; 0 .. 26)
      {
          uri_flags['A' + offset] |= URI_Alpha;
          uri_flags['a' + offset] |= URI_Alpha;
      }

      mark("0123456789", URI_Digit);
      mark(";/?:@&=+$,", URI_Reserved);
      mark("-_.!~*'()", URI_Mark);
  }
  /**
   * Percent-encodes a UTF-32 string.
   *
   * Every character whose uri_flags[] entry shares a bit with unescapedSet is
   * copied to the output unchanged. Every other character is converted to its
   * UTF-8 octets and each octet is written as '%' followed by two upper-case
   * hexadecimal digits.
   *
   * Params:
   *     string       = the text to encode (note: the parameter name shadows
   *                    the `string` alias inside this function)
   *     unescapedSet = OR of URI_* flags selecting the characters that are
   *                    emitted without escaping
   * Returns:
   *     the encoded text
   * Throws:
   *     URIException if a character is not a Unicode scalar value, i.e. it is
   *     a surrogate (U+D800 .. U+DFFF) or lies above U+10FFFF
   */
  private string URI_Encode(dstring string, uint unescapedSet)
  {
      import std.array : appender;

      // `string` names the parameter here, so the element type is spelled out.
      auto result = appender!(immutable(char)[])();
      // Exact when nothing needs escaping; the appender grows on demand
      // otherwise, so no manual buffer management is required.
      result.reserve(string.length);

      foreach (dchar C; string)
      {
          if (C < uri_flags.length && (uri_flags[C] & unescapedSet) != 0)
          {
              result.put(cast(char) C);
              continue;
          }

          // UTF-8 is only defined for scalar values; refuse anything else
          // instead of emitting escapes that no decoder can accept.
          if (C > 0x10FFFF || (C >= 0xD800 && C <= 0xDFFF))
              throw new URIException("Undefined UTF-32 code point");

          // Transform C into its UTF-8 octets.
          char[4] Octet;
          uint L;
          if (C <= 0x7F)
          {
              Octet[0] = cast(char) C;
              L = 1;
          }
          else if (C <= 0x7FF)
          {
              Octet[0] = cast(char)(0xC0 | (C >> 6));
              Octet[1] = cast(char)(0x80 | (C & 0x3F));
              L = 2;
          }
          else if (C <= 0xFFFF)
          {
              Octet[0] = cast(char)(0xE0 | (C >> 12));
              Octet[1] = cast(char)(0x80 | ((C >> 6) & 0x3F));
              Octet[2] = cast(char)(0x80 | (C & 0x3F));
              L = 3;
          }
          else
          {
              Octet[0] = cast(char)(0xF0 | (C >> 18));
              Octet[1] = cast(char)(0x80 | ((C >> 12) & 0x3F));
              Octet[2] = cast(char)(0x80 | ((C >> 6) & 0x3F));
              Octet[3] = cast(char)(0x80 | (C & 0x3F));
              L = 4;
          }

          // Emit each octet as %XX.
          foreach (octet; Octet[0 .. L])
          {
              immutable char high = hex2ascii[octet >> 4];
              immutable char low = hex2ascii[octet & 15];
              result.put('%');
              result.put(high);
              result.put(low);
          }
      }

      return result.data;
  }
  /**
   * Converts one hexadecimal digit character to its numeric value.
   *
   * No validation is performed: the result is only meaningful when c is one
   * of 0-9, A-F or a-f.
   *
   * Params:
   *     c = the hexadecimal digit
   * Returns:
   *     the digit's value, 0 .. 15 for valid input
   */
  uint ascii2hex(dchar c)
  {
      if (c <= '9')
          return c - '0';

      // Everything up to 'F' is treated as an upper-case digit, the rest as
      // lower-case.
      immutable char base = (c <= 'F') ? 'A' : 'a';
      return c - base + 10;
  }
  /**
   * Decodes the percent-escapes of a URI into UTF-32 text.
   *
   * Text outside escapes is copied through; unescaped non-ASCII characters
   * are decoded from UTF-8 as whole characters rather than byte by byte.
   * A "%XX" escape below 0x80 yields that character. An escape with the high
   * bit set starts a UTF-8 sequence of two to four escaped octets which is
   * combined into one character. If the resulting character has a bit of
   * reservedSet set in uri_flags[], the original escape text is kept verbatim
   * instead of being replaced.
   *
   * Params:
   *     string      = the encoded URI (note: the parameter name shadows the
   *                   `string` alias inside this function)
   *     reservedSet = OR of URI_* flags selecting characters whose escapes
   *                   must be left untouched
   * Returns:
   *     the decoded text
   * Throws:
   *     URIException on a truncated or malformed escape, on an escaped UTF-8
   *     sequence that is malformed, overlong, a surrogate or above U+10FFFF,
   *     and on invalid unescaped UTF-8
   */
  private dstring URI_Decode(string string, uint reservedSet)
  {
      import std.array : appender;

      immutable len = string.length;

      auto result = appender!dstring();
      // The output never holds more characters than the input has bytes.
      result.reserve(len);

      size_t k = 0;
      while (k < len)
      {
          immutable char unit = string[k];

          if (unit != '%')
          {
              if (unit < 0x80)
              {
                  result.put(cast(dchar) unit);
                  ++k;
              }
              else
              {
                  // Unescaped non-ASCII text: decode the complete UTF-8
                  // sequence so the character survives intact instead of
                  // being split into one bogus code point per byte.
                  dchar raw;
                  try
                  {
                      raw = std.utf.decode(string, k); // advances k
                  }
                  catch (Exception e)
                  {
                      throw new URIException("Invalid UTF-8 sequence in URI");
                  }
                  result.put(raw);
              }
              continue;
          }

          immutable start = k; // index of the '%' opening this escape

          if (len - k < 3)
              throw new URIException("Unexpected end of URI");
          if (!isHexDigit(string[k + 1]) || !isHexDigit(string[k + 2]))
              throw new URIException("Expected two hexadecimal digits after '%'");
          ubyte B = cast(ubyte)((ascii2hex(string[k + 1]) << 4) + ascii2hex(string[k + 2]));
          k += 3; // k now points just past the escape

          dchar C;
          if ((B & 0x80) == 0)
          {
              C = B;
          }
          else
          {
              // The number of leading 1 bits gives the sequence length n.
              uint n;
              for (n = 1; ; n++)
              {
                  if (n > 4)
                      throw new URIException("UTF-32 code point size too large");
                  if (((B << n) & 0x80) == 0)
                  {
                      if (n == 1)
                          throw new URIException("UTF-32 code point size too small");
                      break;
                  }
              }

              // Pick off the (7 - n) payload bits of the lead octet.
              uint V = B & ((1 << (7 - n)) - 1);

              // Each of the n - 1 continuation octets needs a full "%XX".
              if (len - k < 3 * (n - 1))
                  throw new URIException("UTF-32 unaligned String");

              foreach (j; 1 .. n)
              {
                  if (string[k] != '%')
                      throw new URIException("Expected: '%'");
                  if (!isHexDigit(string[k + 1]) || !isHexDigit(string[k + 2]))
                      throw new URIException("Expected two hexadecimal digits after '%'");
                  B = cast(ubyte)((ascii2hex(string[k + 1]) << 4) + ascii2hex(string[k + 2]));
                  if ((B & 0xC0) != 0x80)
                      throw new URIException("Incorrect UTF-32 multi-byte sequence");
                  k += 3;
                  V = (V << 6) | (B & 0x3F);
              }

              if (V > 0x10FFFF)
                  throw new URIException("Unknown UTF-32 code point");

              // Smallest value that legitimately needs an n-octet sequence;
              // anything below it is an overlong form (e.g. %C0%80 for NUL)
              // that could be used to smuggle characters past filters.
              static immutable uint[5] minForLength = [0, 0, 0x80, 0x800, 0x10000];
              if (V < minForLength[n])
                  throw new URIException("Overlong UTF-8 encoding");

              // Surrogates are not characters and cannot be re-encoded.
              if (V >= 0xD800 && V <= 0xDFFF)
                  throw new URIException("Unknown UTF-32 code point");

              C = cast(dchar) V;
          }

          if (C < uri_flags.length && (uri_flags[C] & reservedSet) != 0)
          {
              // Reserved character: keep the escape exactly as written.
              foreach (char ch; string[start .. k])
                  result.put(cast(dchar) ch);
          }
          else
          {
              result.put(C);
          }
      }

      return result.data;
  }
  /*************************************
   * Decodes the escape sequences of the URI string encodedURI and returns the
   * result as a UTF-8 string.
   *
   * Escape sequences that stand for a reserved URI character, or for the '#'
   * character, are left in place rather than replaced.
   */
  string decode(string encodedURI)
  {
      // Escapes of these character classes must survive decoding.
      enum uint keptEscaped = URI_Reserved | URI_Hash;
      return std.utf.toUTF8(URI_Decode(encodedURI, keptEscaped));
  }
  /*******************************
   * Decodes the URI component encodedURIComponent and returns the result as a
   * UTF-8 string. Unlike decode(), every escape sequence is replaced, including
   * those that stand for reserved characters.
   */
  string decodeComponent(string encodedURIComponent)
  {
      // An empty reserved set means no escape is preserved.
      enum uint keptEscaped = 0;
      return std.utf.toUTF8(URI_Decode(encodedURIComponent, keptEscaped));
  }
  /*****************************
   * Encodes the UTF-8 string uri as a URI and returns it.
   *
   * Letters, digits, the marks -_.!~*'(), the reserved characters ;/?:@&=+$,
   * and '#' are copied unchanged; every other character is escaped.
   */
  string encode(string uri)
  {
      // All character classes that may appear literally in a complete URI.
      enum uint literal = URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark;
      return URI_Encode(std.utf.toUTF32(uri), literal);
  }
  /********************************
   * Encodes the UTF-8 string uriComponent as a URI component and returns it.
   *
   * Only letters, digits and the marks -_.!~*'() are copied unchanged; every
   * other character, reserved characters included, is escaped.
   */
  string encodeComponent(string uriComponent)
  {
      // Character classes that may appear literally inside a component.
      enum uint literal = URI_Alpha | URI_Digit | URI_Mark;
      return URI_Encode(std.utf.toUTF32(uriComponent), literal);
  }
  /***************************
   * Does string s[] start with a URL?
   *
   * A URL is recognised when s begins (case-insensitively) with "http://" or
   * "https://", is followed by at least one more character, and the run of
   * URL characters after the scheme contains at least one '.'. URL characters
   * are letters, digits and any of -_?=%&/+#~$.
   *
   * Returns:
   *  -1 (that is, size_t.max) if it does not
   *  len if it does, and s[0..len] is the slice of s[] that is that URL
   */
  size_t uriLength(string s)
  {
      import std.ascii : isAlphaNum;
      import std.string : icmp;

      // Locate the end of the scheme. The strict '>' comparisons demand at
      // least one character after it, which also rejects short inputs.
      // icmp is called through the selective import; spelling it as
      // std.string.icmp would need std.string to be imported as a whole.
      size_t i;
      if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
          i = 7;
      else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
          i = 8;
      else
          return -1;

      // Walk to the first character that cannot be part of a URL, noting the
      // position of the last '.' seen on the way. Zero means "no dot", which
      // is unambiguous because scanning starts after the scheme.
      size_t lastdot;
      for (; i < s.length; i++)
      {
          immutable c = s[i];
          if (isAlphaNum(c))
              continue;
          if (c == '.')
          {
              lastdot = i;
              continue;
          }
          if (c == '-' || c == '_' || c == '?' ||
              c == '=' || c == '%' || c == '&' ||
              c == '/' || c == '+' || c == '#' ||
              c == '~' || c == '$')
              continue;
          break;
      }

      if (!lastdot)
          return -1;
      return i;
  }
  // uriLength: a URL followed by other text, and text without any URL.
  unittest
  {
      immutable withUrl = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
      immutable withoutUrl = "no uri here";

      assert(uriLength(withUrl) == 49);
      assert(uriLength(withoutUrl) == -1);
  }
  /***************************
   * Does string s[] start with an email address?
   *
   * The local part must begin with a letter and may continue with letters,
   * digits, '-', '_' and '.'. After the '@', the domain may contain letters,
   * digits, '-', '_' and '.', and its last '.' must be followed by exactly
   * two or three domain characters.
   *
   * Returns:
   *  -1 (that is, size_t.max) if it does not, including when s is empty
   *  len if it does, and s[0..len] is the slice of s[] that is that email address
   * References:
   *  RFC2822
   */
  size_t emailLength(string s)
  {
      import std.ascii : isAlpha, isAlphaNum;

      // An empty string cannot start with an address; checking first avoids
      // indexing s[0] out of bounds.
      if (s.length == 0 || !isAlpha(s[0]))
          return -1;

      // Scan the local part up to and including the '@'.
      size_t i = 1;
      for (;; i++)
      {
          if (i == s.length)
              return -1; // ran out of input before any '@'
          immutable c = s[i];
          if (isAlphaNum(c) || c == '-' || c == '_' || c == '.')
              continue;
          if (c != '@')
              return -1;
          i++; // step over the '@'
          break;
      }

      // Scan the domain, remembering where its last '.' is. Zero means
      // "no dot", which is unambiguous because the domain starts at i >= 2.
      size_t lastdot;
      for (; i < s.length; i++)
      {
          immutable c = s[i];
          if (isAlphaNum(c) || c == '-' || c == '_')
              continue;
          if (c == '.')
          {
              lastdot = i;
              continue;
          }
          break;
      }

      // The distance from the last dot to the end is the suffix length + 1.
      if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
          return -1;
      return i;
  }
  // emailLength: an address followed by other text, and text without one.
  unittest
  {
      immutable withAddress = "my.e-mail@www.example-domain.com with garbage added";
      immutable withoutAddress = "no email address here";

      assert(emailLength(withAddress) == 32);
      assert(emailLength(withoutAddress) == -1);
  }
  // Round trips through encode/decode, component encoding, and a large input.
  unittest
  {
      debug(uri) writeln("uri.encodeURI.unittest");

      string plain = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
      string escaped = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

      // Only the space needs escaping in a complete URI.
      auto r = encode(plain);
      debug(uri) writefln("r = '%s'", r);
      assert(r == escaped);

      r = decode(escaped);
      debug(uri) writefln("r = '%s'", r);
      assert(r == plain);

      // Escaped multi-byte UTF-8 survives a decode/encode round trip.
      r = encode(decode("%E3%81%82%E3%81%82"));
      assert(r == "%E3%81%82%E3%81%82");

      // '+' is reserved, so it is escaped inside a component.
      r = encodeComponent("c++");
      assert(r == "c%2B%2B");

      // A very large input of unescaped characters comes back unchanged.
      auto big = new char[10_000_000];
      big[] = 'A';
      r = encodeComponent(assumeUnique(big));
      foreach (char c; r)
          assert(c == 'A');

      r = decode("%41%42%43");
      debug(uri) writeln(r);
  }