/hphp/runtime/base/zend-url.cpp
C++ | 424 lines | 310 code | 53 blank | 61 comment | 178 complexity | 3931cf30e36ee698f4f82250a788505a MD5 | raw file
Possible License(s): LGPL-2.1, BSD-2-Clause, BSD-3-Clause, MPL-2.0-no-copyleft-exception, MIT, LGPL-2.0, Apache-2.0
- /*
- +----------------------------------------------------------------------+
- | HipHop for PHP |
- +----------------------------------------------------------------------+
- | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
- | Copyright (c) 1997-2018 The PHP Group |
- +----------------------------------------------------------------------+
- | This source file is subject to version 3.01 of the PHP license, |
- | that is bundled with this package in the file LICENSE.PHP, and is |
- | available through the world-wide-web at the following url: |
- | http://www.php.net/license/3_01.txt |
- | If you did not receive a copy of the PHP license and are unable to |
- | obtain it through the world-wide-web, please send a note to |
- | license@php.net so we can mail you a copy immediately. |
- +----------------------------------------------------------------------+
- */
- #include "hphp/runtime/base/zend-url.h"
- #include "hphp/runtime/base/zend-string.h"
- #include "hphp/runtime/base/string-util.h"
- #include <folly/portability/String.h>
- namespace HPHP {
- ///////////////////////////////////////////////////////////////////////////////
- const StaticString s_file("file"); // scheme name that url_parse compares the parsed scheme against (via isame) to special-case file:/// URLs
- static void replace_controlchars(String& output, const char *str, int len) {
- if (!str) return;
- unsigned char *s = (unsigned char *)str;
- unsigned char *e = (unsigned char *)str + len;
- output = String(str, len, CopyString);
- char *outbuf = output.mutableData();
- while (s < e) {
- if (iscntrl(*s)) {
- *outbuf='_';
- }
- s++;
- outbuf++;
- }
- output.setSize(len);
- }
- bool url_parse(Url &output, const char *str, size_t length) {
- char port_buf[6];
- // s: full string
- // ue: end of full string
- // p: start of string slice we're looking at
- // e: index of something we searched for, e.g. ':'. usually end of string
- // slice, but not always
- // pp: start of string sub-slice
- const char *s, *e, *p, *pp, *ue;
- s = str;
- ue = s + length;
- /* parse scheme */
- if ((e = (const char *)memchr((const void *)s, ':', length)) && e != s) {
- /* validate scheme */
- p = s;
- while (p < e) {
- /* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
- if (!isalpha(*p) && !isdigit(*p) &&
- *p != '+' && *p != '.' && *p != '-') {
- if (e + 1 < ue && e < s + strcspn(s, "?#")) {
- goto parse_port;
- } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') {
- /* relative-scheme URL */
- s += 2;
- e = nullptr;
- goto parse_host;
- } else {
- goto just_path;
- }
- }
- p++;
- }
- if ((e + 1) == ue) { /* only scheme is available */
- replace_controlchars(output.scheme, s, (e - s));
- return true;
- }
- /*
- * certain schemas like mailto: and zlib: may not have any / after them
- * this check ensures we support those.
- */
- if (*(e+1) != '/') {
- /* check if the data we get is a port this allows us to
- * correctly parse things like a.com:80
- */
- p = e + 1;
- while (p < ue && isdigit(*p)) {
- p++;
- }
- if ((p == ue || *p == '/') && (p - e) < 7) {
- goto parse_port;
- }
- replace_controlchars(output.scheme, s, (e - s));
- s = e + 1;
- goto just_path;
- } else {
- replace_controlchars(output.scheme, s, (e - s));
- if (e + 2 < ue && *(e+2) == '/') {
- s = e + 3;
- if (output.scheme.get()->isame(s_file.get())) {
- if (e + 3 < ue && *(e + 3) == '/') {
- /* support windows drive letters as in:
- file:///c:/somedir/file.txt
- */
- if (e + 5 < ue && e[4] != '\0' && e[5] == ':') {
- s = e + 4;
- }
- goto just_path;
- }
- }
- } else {
- s = e + 1;
- goto just_path;
- }
- }
- } else if (e) { /* no scheme; starts with colon: look for port */
- parse_port:
- p = e + 1;
- pp = p;
- while (pp < ue && pp - p < 6 && isdigit(*pp)) {
- pp++;
- }
- if (pp - p > 0 && pp - p < 6 && (pp == ue || *pp == '/')) {
- memcpy(port_buf, p, (pp-p));
- port_buf[pp-p] = '\0';
- auto port = atoi(port_buf);
- if (port > 0 && port <= 65535) {
- output.port = port;
- if (s + 1 < ue && *s == '/' && *(s+1) == '/') {
- /* relative-scheme URL */
- s += 2;
- }
- } else {
- return false;
- }
- } else if (p == pp && pp == ue) {
- return false;
- } else if (s + 1 < ue && *s == '/' && *(s+1) == '/') {
- /* relative-scheme URL */
- s += 2;
- } else {
- goto just_path;
- }
- } else if (s + 1 < ue && *s == '/' && *(s +1 ) == '/') {
- /* relative-scheme URL */
- s += 2;
- } else {
- goto just_path;
- }
- parse_host:
- /* Binary-safe strcspn(s, "/?#") */
- e = ue;
- if ((p = (const char*)memchr(s, '/', e - s))) {
- e = p;
- }
- if ((p = (const char*)memchr(s, '?', e - s))) {
- e = p;
- }
- if ((p = (const char*)memchr(s, '#', e - s))) {
- e = p;
- }
- /* check for login and password */
- if ((p = (const char*)memrchr(s, '@', (e-s)))) {
- if ((pp = (const char*)memchr(s, ':', (p-s)))) {
- replace_controlchars(output.user, s, (pp - s));
- pp++;
- replace_controlchars(output.pass, pp, (p-pp));
- } else {
- replace_controlchars(output.user, s, (p-s));
- }
- s = p + 1;
- }
- /* check for port */
- if (s < ue && *s == '[' && *(e-1) == ']') {
- /* Short circuit portscan,
- we're dealing with an
- IPv6 embedded address */
- p = nullptr;
- } else {
- p = (const char*)memrchr(s, ':', e - s);
- }
- if (p) {
- if (!output.port) {
- p++;
- if (e-p > 5) { /* port cannot be longer then 5 characters */
- return false;
- } else if (e - p > 0) {
- memcpy(port_buf, p, (e-p));
- port_buf[e-p] = '\0';
- auto port = atoi(port_buf);
- if (port > 0 && port <= 65535) {
- output.port = port;
- } else {
- return false;
- }
- }
- p--;
- }
- } else {
- p = e;
- }
- /* check if we have a valid host, if we don't reject the string as url */
- if ((p-s) < 1) {
- return false;
- }
- replace_controlchars(output.host, s, (p - s));
- if (e == ue) {
- return true;
- }
- s = e;
- just_path:
- e = ue;
- p = (const char*)memchr(s, '#', (e - s));
- if (p) {
- p++;
- if (p < e) {
- replace_controlchars(output.fragment, p, e - p);
- }
- e = p - 1;
- }
- p = (const char*)memchr(s, '?', (e - s));
- if (p) {
- p++;
- if (p < e) {
- replace_controlchars(output.query, p, e - p);
- }
- e = p - 1;
- }
- if (s < e || s == ue) {
- replace_controlchars(output.path, s, e - s);
- }
- return true;
- }
- ///////////////////////////////////////////////////////////////////////////////
/**
 * Convert the two hexadecimal digits at s[0] and s[1] to their byte value.
 *
 * Upper- and lower-case digits are accepted.  The input is not validated:
 * callers are expected to have checked both characters with isxdigit().
 */
static int php_htoi(char *s) {
  int value = 0;

  for (int pos = 0; pos < 2; pos++) {
    int c = (unsigned char)s[pos];
    if (isupper(c)) {
      c = tolower(c);
    }

    int nibble;
    if (c >= '0' && c <= '9') {
      nibble = c - '0';
    } else {
      nibble = c - 'a' + 10;
    }

    /* the high nibble is shifted up when the low nibble is folded in */
    value = value * 16 + nibble;
  }

  return value;
}
/* rfc1738:

   ...The characters ";",
   "/", "?", ":", "@", "=" and "&" are the characters which may be
   reserved for special meaning within a scheme...

   ...Thus, only alphanumerics, the special characters "$-_.+!*'(),", and
   reserved characters used for their reserved purposes may be used
   unencoded within a URL...

   For added safety, we only leave -_. unencoded.
 */

/* Upper-case hex digits indexed by nibble value; read-only lookup table. */
static const unsigned char hexchars[] = "0123456789ABCDEF";
- String url_encode(const char *s, size_t len) {
- String retString(safe_address(len, 3, 1), ReserveString);
- register unsigned char c;
- unsigned char *to, *start;
- unsigned char const *from, *end;
- from = (unsigned char const *)s;
- end = (unsigned char const *)s + len;
- start = to = (unsigned char *)retString.mutableData();
- while (from < end) {
- c = *from++;
- if (c == ' ') {
- *to++ = '+';
- } else if ((c < '0' && c != '-' && c != '.') ||
- (c < 'A' && c > '9') ||
- (c > 'Z' && c < 'a' && c != '_') ||
- (c > 'z')) {
- to[0] = '%';
- to[1] = hexchars[c >> 4];
- to[2] = hexchars[c & 15];
- to += 3;
- } else {
- *to++ = c;
- }
- }
- retString.setSize(to - start);
- return retString;
- }
- String url_decode(const char *s, size_t len) {
- String retString(s, len, CopyString);
- char *str = retString.mutableData();
- char *dest = str;
- char *data = str;
- while (len--) {
- if (*data == '+') {
- *dest = ' ';
- }
- else if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1))
- && isxdigit((int) *(data + 2))) {
- *dest = (char) php_htoi(data + 1);
- data += 2;
- len -= 2;
- } else {
- *dest = *data;
- }
- data++;
- dest++;
- }
- retString.setSize(dest - str);
- return retString;
- }
- size_t url_decode_ex(char *value, size_t len) {
- assertx(value && *value); // check before calling this function
- if (len == 0) return 0;
- size_t i = 0, o = 0;
- unsigned char *s = (unsigned char *)value;
- unsigned char *end = s + len;
- while (s + i < end) {
- if (s[i] == '+') {
- s[o++] = ' ';
- i++;
- } else if (s[i] == '%' && isxdigit(s[i+1]) && isxdigit(s[i+2])) {
- char num;
- num = (s[i+1] >= 'A') ? ((s[i+1] & 0xdf) - 'A') + 10 : (s[i+1] - '0');
- num *= 16;
- num += (s[i+2] >= 'A') ? ((s[i+2] & 0xdf) - 'A') + 10 : (s[i+2] - '0');
- s[o++] = num;
- i+=3;
- } else {
- s[o++] = s[i++];
- }
- }
- if (i && o) s[o] = '\0';
- return o;
- }
- String url_raw_encode(const char *s, size_t len) {
- String retString(safe_address(len, 3, 1), ReserveString);
- size_t x, y;
- unsigned char *str = (unsigned char *)retString.mutableData();
- for (x = 0, y = 0; len--; x++, y++) {
- str[y] = (unsigned char) s[x];
- if ((str[y] < '0' && str[y] != '-' && str[y] != '.') ||
- (str[y] < 'A' && str[y] > '9') ||
- (str[y] > 'Z' && str[y] < 'a' && str[y] != '_') ||
- (str[y] > 'z' && str[y] != '~')) {
- str[y++] = '%';
- str[y++] = hexchars[(unsigned char) s[x] >> 4];
- str[y] = hexchars[(unsigned char) s[x] & 15];
- }
- }
- retString.setSize(y);
- return retString;
- }
- String url_raw_decode(const char *s, size_t len) {
- String retString(s, len, CopyString);
- char *str = retString.mutableData();
- char *dest = str;
- char *data = str;
- while (len--) {
- if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1))
- && isxdigit((int) *(data + 2))) {
- *dest = (char) php_htoi(data + 1);
- data += 2;
- len -= 2;
- } else {
- *dest = *data;
- }
- data++;
- dest++;
- }
- retString.setSize(dest - str);
- return retString;
- }
- ///////////////////////////////////////////////////////////////////////////////
- }