PageRenderTime 85ms CodeModel.GetById 12ms app.highlight 68ms RepoModel.GetById 1ms app.codeStats 0ms

/std/uri.d

http://github.com/jcd/phobos
D | 543 lines | 377 code | 68 blank | 98 comment | 117 complexity | 21e9a997e028aaa31aefdcfb3eb308c0 MD5 | raw file
  1// Written in the D programming language.
  2
  3/**
  4 * Encode and decode Uniform Resource Identifiers (URIs).
  5 * URIs are used in internet transfer protocols.
  6 * Valid URI characters consist of letters, digits,
  7 * and the characters $(B ;/?:@&=+$,-_.!~*'())
  8 * Reserved URI characters are $(B ;/?:@&=+$,)
  9 * Escape sequences consist of $(B %) followed by two hex digits.
 10 *
 11 * See_Also:
 12 *  $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
 13 *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
 14 * Macros:
 15 *  WIKI = Phobos/StdUri
 16 *
 17 * Copyright: Copyright Digital Mars 2000 - 2009.
 18 * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
 19 * Authors:   $(WEB digitalmars.com, Walter Bright)
 20 * Source:    $(PHOBOSSRC std/_uri.d)
 21 */
 22/*          Copyright Digital Mars 2000 - 2009.
 23 * Distributed under the Boost Software License, Version 1.0.
 24 *    (See accompanying file LICENSE_1_0.txt or copy at
 25 *          http://www.boost.org/LICENSE_1_0.txt)
 26 */
 27module std.uri;
 28
 29//debug=uri;        // uncomment to turn on debugging writefln's
 30debug(uri) private import std.stdio;
 31
 32/* ====================== URI Functions ================ */
 33
 34private import std.ascii;
 35private import std.c.stdlib;
 36private import std.utf;
 37import core.exception : OutOfMemoryError;
 38import std.exception : assumeUnique;
 39
 40class URIException : Exception
 41{
 42    @safe pure nothrow this()
 43    {
 44        super("URI Exception");
 45    }
 46
 47    @safe pure nothrow this(string msg)
 48    {
 49        super("URI Exception: " ~ msg);
 50    }
 51}
 52
 53enum
 54{
 55    URI_Alpha = 1,
 56    URI_Reserved = 2,
 57    URI_Mark = 4,
 58    URI_Digit = 8,
 59    URI_Hash = 0x10,        // '#'
 60}
 61
 62immutable char[16] hex2ascii = "0123456789ABCDEF";
 63
 64__gshared ubyte[128] uri_flags;       // indexed by character
 65
 66shared static this()
 67{
 68    // Initialize uri_flags[]
 69    static void helper(immutable char[] p, uint flags)
 70    {
 71        for (int i = 0; i < p.length; i++)
 72            uri_flags[p[i]] |= flags;
 73    }
 74
 75    uri_flags['#'] |= URI_Hash;
 76
 77    for (int i = 'A'; i <= 'Z'; i++)
 78    {
 79        uri_flags[i] |= URI_Alpha;
 80        uri_flags[i + 0x20] |= URI_Alpha;   // lowercase letters
 81    }
 82    helper("0123456789", URI_Digit);
 83    helper(";/?:@&=+$,", URI_Reserved);
 84    helper("-_.!~*'()",  URI_Mark);
 85}
 86
 87
 88private string URI_Encode(dstring string, uint unescapedSet)
 89{
 90    uint j;
 91    uint k;
 92    dchar V;
 93    dchar C;
 94
 95    // result buffer
 96    char[50] buffer = void;
 97    char* R;
 98    uint Rlen;
 99    uint Rsize; // alloc'd size
100
101    auto len = string.length;
102
103    R = buffer.ptr;
104    Rsize = buffer.length;
105    Rlen = 0;
106
107    for (k = 0; k != len; k++)
108    {
109        C = string[k];
110        // if (C in unescapedSet)
111        if (C < uri_flags.length && uri_flags[C] & unescapedSet)
112        {
113            if (Rlen == Rsize)
114            {
115                char* R2;
116
117                Rsize *= 2;
118                if (Rsize > 1024) {
119                    R2 = (new char[Rsize]).ptr;
120                }
121                else
122                {
123                    R2 = cast(char *)alloca(Rsize * char.sizeof);
124                    if (!R2)
125                        throw new OutOfMemoryError("Alloca failure");
126                }
127                R2[0..Rlen] = R[0..Rlen];
128                R = R2;
129            }
130            R[Rlen] = cast(char)C;
131            Rlen++;
132        }
133        else
134        {
135            char[6] Octet;
136            uint L;
137
138            V = C;
139
140            // Transform V into octets
141            if (V <= 0x7F)
142            {
143                Octet[0] = cast(char) V;
144                L = 1;
145            }
146            else if (V <= 0x7FF)
147            {
148                Octet[0] = cast(char)(0xC0 | (V >> 6));
149                Octet[1] = cast(char)(0x80 | (V & 0x3F));
150                L = 2;
151            }
152            else if (V <= 0xFFFF)
153            {
154                Octet[0] = cast(char)(0xE0 | (V >> 12));
155                Octet[1] = cast(char)(0x80 | ((V >> 6) & 0x3F));
156                Octet[2] = cast(char)(0x80 | (V & 0x3F));
157                L = 3;
158            }
159            else if (V <= 0x1FFFFF)
160            {
161                Octet[0] = cast(char)(0xF0 | (V >> 18));
162                Octet[1] = cast(char)(0x80 | ((V >> 12) & 0x3F));
163                Octet[2] = cast(char)(0x80 | ((V >> 6) & 0x3F));
164                Octet[3] = cast(char)(0x80 | (V & 0x3F));
165                L = 4;
166            }
167            /+
168            else if (V <= 0x3FFFFFF)
169            {
170                Octet[0] = cast(char)(0xF8 | (V >> 24));
171                Octet[1] = cast(char)(0x80 | ((V >> 18) & 0x3F));
172                Octet[2] = cast(char)(0x80 | ((V >> 12) & 0x3F));
173                Octet[3] = cast(char)(0x80 | ((V >> 6) & 0x3F));
174                Octet[4] = cast(char)(0x80 | (V & 0x3F));
175                L = 5;
176            }
177            else if (V <= 0x7FFFFFFF)
178            {
179                Octet[0] = cast(char)(0xFC | (V >> 30));
180                Octet[1] = cast(char)(0x80 | ((V >> 24) & 0x3F));
181                Octet[2] = cast(char)(0x80 | ((V >> 18) & 0x3F));
182                Octet[3] = cast(char)(0x80 | ((V >> 12) & 0x3F));
183                Octet[4] = cast(char)(0x80 | ((V >> 6) & 0x3F));
184                Octet[5] = cast(char)(0x80 | (V & 0x3F));
185                L = 6;
186            }
187            +/
188            else
189            {
190                throw new URIException("Undefined UTF-32 code point");
191            }
192
193            if (Rlen + L * 3 > Rsize)
194            {
195                char *R2;
196
197                Rsize = 2 * (Rlen + L * 3);
198                if (Rsize > 1024) {
199                    R2 = (new char[Rsize]).ptr;
200                }
201                else
202                {
203                    R2 = cast(char *)alloca(Rsize * char.sizeof);
204                    if (!R2)
205                        throw new OutOfMemoryError("Alloca failure");
206                }
207                R2[0..Rlen] = R[0..Rlen];
208                R = R2;
209            }
210
211            for (j = 0; j < L; j++)
212            {
213                R[Rlen] = '%';
214                R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
215                R[Rlen + 2] = hex2ascii[Octet[j] & 15];
216
217                Rlen += 3;
218            }
219        }
220    }
221
222    return R[0..Rlen].idup;
223}
224
/**
 * Converts one hexadecimal digit character to its numeric value.
 *
 * No validation is performed: the result is only meaningful when c is one
 * of '0'-'9', 'A'-'F' or 'a'-'f'.
 *
 * Params:
 *  c = the digit character
 * Returns:
 *  the value 0 .. 15 for a valid hexadecimal digit
 */
uint ascii2hex(dchar c)
{
    if (c <= '9')
        return c - '0';
    if (c <= 'F')
        return c - 'A' + 10;
    return c - 'a' + 10;
}
231
/**
 * Decodes the percent-escapes of a URI string, producing UTF-32.
 *
 * Characters other than '%' are copied through one-for-one. A '%' must be
 * followed by two hexadecimal digits giving one octet. An octet with its
 * high bit set starts a 2- to 4-octet UTF-8 sequence whose remaining octets
 * must each be percent-escaped as well. If the decoded code point is below
 * uri_flags.length and its uri_flags entry shares a bit with reservedSet,
 * the original escape text is copied to the output instead of the decoded
 * character.
 *
 * Params:
 *  string      = the encoded URI text
 *  reservedSet = bitwise OR of URI_* flags selecting the character classes
 *                whose escapes are left undecoded
 * Returns:
 *  a newly allocated UTF-32 string
 * Throws:
 *  URIException if an escape is truncated or is not two hexadecimal
 *  digits, or if the escaped octets are not valid UTF-8 (bad lead or
 *  continuation octet, overlong form, surrogate, or a value above 0x10FFFF)
 */
private dstring URI_Decode(string string, uint reservedSet)
{
    // Smallest code point that legitimately needs n UTF-8 octets, indexed
    // by n. Anything smaller is an overlong form and must be rejected, as
    // it would let e.g. "%C0%AF" smuggle in a '/'.
    static immutable uint[5] minForLength = [0, 0, 0x80, 0x800, 0x10000];

    immutable len = string.length;
    auto s = string.ptr;

    // Every output element consumes at least one input character, so a
    // buffer of len elements is always large enough. Small buffers live on
    // the stack, larger ones on the GC heap.
    dchar* R;
    if (len > 1024 / dchar.sizeof)
    {
        R = (new dchar[len]).ptr;
    }
    else
    {
        R = cast(dchar*) alloca(len * dchar.sizeof);
        if (!R)
            throw new OutOfMemoryError("Alloca failure");
    }
    size_t Rlen = 0;

    for (size_t k = 0; k != len; k++)
    {
        dchar C = s[k];
        if (C != '%')
        {
            R[Rlen] = C;
            Rlen++;
            continue;
        }

        size_t start = k;   // position of the first '%' of this escape run
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        char B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
        k += 2;

        if ((B & 0x80) == 0)
        {
            C = B;
        }
        else
        {
            // Count the leading 1 bits of the lead octet: that is the
            // number of octets (n) in the UTF-8 sequence.
            uint n;
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            uint V = B & ((1 << (7 - n)) - 1);

            // The remaining n - 1 escapes ("%XX" each) must fit in the input.
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            for (uint j = 1; j != n; j++)
            {
                k++;
                if (s[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }

            if (V < minForLength[n])
                throw new URIException("Overlong UTF-8 encoding");
            if (V >= 0xD800 && V <= 0xDFFF)
                throw new URIException("Surrogate code point in UTF-8 sequence");
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");
            C = V;
        }

        if (C < uri_flags.length && (uri_flags[C] & reservedSet))
        {
            // Keep the escape text itself: R ~= s[start .. k + 1]
            foreach (size_t i; start .. k + 1)
            {
                R[Rlen] = s[i];
                Rlen++;
            }
        }
        else
        {
            R[Rlen] = C;
            Rlen++;
        }
    }
    assert(Rlen <= len);    // enforce our preallocation size guarantee

    // Copy the working buffer (possibly on the stack) into a fresh array
    return R[0 .. Rlen].idup;
}
339
/*************************************
 * Decodes the URI string encodedURI and returns the result as UTF-8.
 * Escape sequences that stand for a reserved URI character or for '#'
 * are kept in their escaped form; all other escape sequences are replaced.
 */

string decode(string encodedURI)
{
    enum uint keepEscaped = URI_Reserved | URI_Hash;
    dstring decoded = URI_Decode(encodedURI, keepEscaped);
    return std.utf.toUTF8(decoded);
}
351
/*******************************
 * Decodes the URI component encodedURIComponent and returns the result as
 * UTF-8. Every escape sequence is replaced, including those that stand for
 * reserved characters.
 */

string decodeComponent(string encodedURIComponent)
{
    enum uint keepEscaped = 0;  // nothing stays escaped
    dstring decoded = URI_Decode(encodedURIComponent, keepEscaped);
    return std.utf.toUTF8(decoded);
}
362
/*****************************
 * Encodes the UTF-8 string uri as a URI and returns it. Letters, digits,
 * mark characters, reserved characters and '#' are passed through; every
 * other character is escaped.
 */

string encode(string uri)
{
    enum uint unescaped = URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark;
    dstring wide = std.utf.toUTF32(uri);
    return URI_Encode(wide, unescaped);
}
373
/********************************
 * Encodes the UTF-8 string uriComponent as a URI component and returns it.
 * Only letters, digits and the mark characters -_.!~*'() are passed
 * through; every other character is escaped.
 */

string encodeComponent(string uriComponent)
{
    enum uint unescaped = URI_Alpha | URI_Digit | URI_Mark;
    dstring wide = std.utf.toUTF32(uriComponent);
    return URI_Encode(wide, unescaped);
}
384
/***************************
 * Does string s[] start with a URL?
 *
 * A URL here is "http://" or "https://" (compared case-insensitively)
 * followed by a run of ASCII letters, digits and the characters
 * -_?=%&/+#~$. in which at least one '.' occurs.
 *
 * Params:
 *  s = the text to examine
 * Returns:
 *  -1 (converted to size_t) if it does not;
 *  len if it does, and s[0..len] is the slice of s[] that is that URL
 */

size_t uriLength(string s)
{
    // Selective import: only the bare name icmp is brought into scope.
    import std.string : icmp;

    if (s.length <= 4)
        return -1;

    /* Must start with one of:
     *  http://
     *  https://
     */
    size_t i;
    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
        i = 7;
    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
        i = 8;
    else
        return -1;

    // Scan the characters allowed in the URL. lastdot stays 0 when no '.'
    // was seen; a real dot can never sit at index 0 because i starts past
    // the scheme prefix.
    size_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '?' ||
                c == '=' || c == '%' || c == '&' ||
                c == '/' || c == '+' || c == '#' ||
                c == '~' || c == '$')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    if (!lastdot)
        return -1;

    return i;
}

unittest
{
    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert (uriLength(s1) == 49);
    string s2 = "no uri here";
    assert (uriLength(s2) == -1);
    // the scheme is matched without regard to case
    assert (uriLength("HTTPS://a.b rest") == 11);
}
451
452
/***************************
 * Does string s[] start with an email address?
 *
 * The local part must begin with an ASCII letter and may continue with
 * letters, digits, '-', '_' and '.'. After the '@' the domain may contain
 * letters, digits, '-', '_' and '.', and the text after its last '.' must
 * be 2 or 3 characters long.
 *
 * Params:
 *  s = the text to examine; may be empty
 * Returns:
 *  -1 (converted to size_t) if it does not;
 *  len if it does, and s[0..len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
size_t emailLength(string s)
{
    size_t i;

    // An empty string cannot hold an address; check before reading s[0].
    if (s.length == 0)
        return -1;

    if (!isAlpha(s[0]))
        return -1;

    // Scan the local part up to and including the '@'.
    for (i = 1; 1; i++)
    {
        if (i == s.length)
            return -1;
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return -1;
        i++;
        break;
    }

    /* Now do the part past the '@'
     */
    size_t lastdot;     // stays 0 when the domain contains no '.'
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    // i - lastdot counts the '.' itself plus the characters that follow it.
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return -1;

    return i;
}

unittest
{
    string s1 = "my.e-mail@www.example-domain.com with garbage added";
    assert (emailLength(s1) == 32);
    string s2 = "no email address here";
    assert (emailLength(s2) == -1);
    // empty input is rejected rather than indexed
    assert (emailLength("") == -1);
}
513
514
// Round-trip checks for encode/decode/encodeComponent.
unittest
{
    debug(uri) writeln("uri.encodeURI.unittest");

    string s = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string t = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    // Only the space needs escaping; decoding restores it.
    auto r = encode(s);
    debug(uri) writefln("r = '%s'", r);
    assert(r == t);
    r = decode(t);
    debug(uri) writefln("r = '%s'", r);
    assert(r == s);

    // Multi-octet UTF-8 escapes survive a decode/encode round trip.
    r = encode( decode("%E3%81%82%E3%81%82") );
    assert(r == "%E3%81%82%E3%81%82");

    // '+' is a reserved character, so encodeComponent escapes it.
    r = encodeComponent("c++");
    assert(r == "c%2B%2B");

    // A very long input exercises the buffer-growth path of the encoder.
    auto str = new char[10_000_000];
    str[] = 'A';
    r = encodeComponent(assumeUnique(str));
    foreach (char c; r)
        assert(c == 'A');

    // Escapes of ordinary letters are decoded.
    r = decode("%41%42%43");
    debug(uri) writeln(r);
    assert(r == "ABC");
}