PageRenderTime 43ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 1ms

/hphp/runtime/base/zend-url.cpp

http://github.com/facebook/hiphop-php
C++ | 424 lines | 310 code | 53 blank | 61 comment | 178 complexity | 3931cf30e36ee698f4f82250a788505a MD5 | raw file
Possible License(s): LGPL-2.1, BSD-2-Clause, BSD-3-Clause, MPL-2.0-no-copyleft-exception, MIT, LGPL-2.0, Apache-2.0
  1. /*
  2. +----------------------------------------------------------------------+
  3. | HipHop for PHP |
  4. +----------------------------------------------------------------------+
  5. | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
  6. | Copyright (c) 1997-2018 The PHP Group |
  7. +----------------------------------------------------------------------+
  8. | This source file is subject to version 3.01 of the PHP license, |
  9. | that is bundled with this package in the file LICENSE.PHP, and is |
  10. | available through the world-wide-web at the following url: |
  11. | http://www.php.net/license/3_01.txt |
  12. | If you did not receive a copy of the PHP license and are unable to |
  13. | obtain it through the world-wide-web, please send a note to |
  14. | license@php.net so we can mail you a copy immediately. |
  15. +----------------------------------------------------------------------+
  16. */
  17. #include "hphp/runtime/base/zend-url.h"
  18. #include "hphp/runtime/base/zend-string.h"
  19. #include "hphp/runtime/base/string-util.h"
  20. #include <folly/portability/String.h>
  21. namespace HPHP {
  22. ///////////////////////////////////////////////////////////////////////////////
  // Scheme name "file"; url_parse() compares the parsed scheme against this
  // (via isame()) to decide whether the file:/// special-casing applies.
  const StaticString s_file("file");
  24. static void replace_controlchars(String& output, const char *str, int len) {
  25. if (!str) return;
  26. unsigned char *s = (unsigned char *)str;
  27. unsigned char *e = (unsigned char *)str + len;
  28. output = String(str, len, CopyString);
  29. char *outbuf = output.mutableData();
  30. while (s < e) {
  31. if (iscntrl(*s)) {
  32. *outbuf='_';
  33. }
  34. s++;
  35. outbuf++;
  36. }
  37. output.setSize(len);
  38. }
  39. bool url_parse(Url &output, const char *str, size_t length) {
  40. char port_buf[6];
  41. // s: full string
  42. // ue: end of full string
  43. // p: start of string slice we're looking at
  44. // e: index of something we searched for, e.g. ':'. usually end of string
  45. // slice, but not always
  46. // pp: start of string sub-slice
  47. const char *s, *e, *p, *pp, *ue;
  48. s = str;
  49. ue = s + length;
  50. /* parse scheme */
  51. if ((e = (const char *)memchr((const void *)s, ':', length)) && e != s) {
  52. /* validate scheme */
  53. p = s;
  54. while (p < e) {
  55. /* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
  56. if (!isalpha(*p) && !isdigit(*p) &&
  57. *p != '+' && *p != '.' && *p != '-') {
  58. if (e + 1 < ue && e < s + strcspn(s, "?#")) {
  59. goto parse_port;
  60. } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') {
  61. /* relative-scheme URL */
  62. s += 2;
  63. e = nullptr;
  64. goto parse_host;
  65. } else {
  66. goto just_path;
  67. }
  68. }
  69. p++;
  70. }
  71. if ((e + 1) == ue) { /* only scheme is available */
  72. replace_controlchars(output.scheme, s, (e - s));
  73. return true;
  74. }
  75. /*
  76. * certain schemas like mailto: and zlib: may not have any / after them
  77. * this check ensures we support those.
  78. */
  79. if (*(e+1) != '/') {
  80. /* check if the data we get is a port this allows us to
  81. * correctly parse things like a.com:80
  82. */
  83. p = e + 1;
  84. while (p < ue && isdigit(*p)) {
  85. p++;
  86. }
  87. if ((p == ue || *p == '/') && (p - e) < 7) {
  88. goto parse_port;
  89. }
  90. replace_controlchars(output.scheme, s, (e - s));
  91. s = e + 1;
  92. goto just_path;
  93. } else {
  94. replace_controlchars(output.scheme, s, (e - s));
  95. if (e + 2 < ue && *(e+2) == '/') {
  96. s = e + 3;
  97. if (output.scheme.get()->isame(s_file.get())) {
  98. if (e + 3 < ue && *(e + 3) == '/') {
  99. /* support windows drive letters as in:
  100. file:///c:/somedir/file.txt
  101. */
  102. if (e + 5 < ue && e[4] != '\0' && e[5] == ':') {
  103. s = e + 4;
  104. }
  105. goto just_path;
  106. }
  107. }
  108. } else {
  109. s = e + 1;
  110. goto just_path;
  111. }
  112. }
  113. } else if (e) { /* no scheme; starts with colon: look for port */
  114. parse_port:
  115. p = e + 1;
  116. pp = p;
  117. while (pp < ue && pp - p < 6 && isdigit(*pp)) {
  118. pp++;
  119. }
  120. if (pp - p > 0 && pp - p < 6 && (pp == ue || *pp == '/')) {
  121. memcpy(port_buf, p, (pp-p));
  122. port_buf[pp-p] = '\0';
  123. auto port = atoi(port_buf);
  124. if (port > 0 && port <= 65535) {
  125. output.port = port;
  126. if (s + 1 < ue && *s == '/' && *(s+1) == '/') {
  127. /* relative-scheme URL */
  128. s += 2;
  129. }
  130. } else {
  131. return false;
  132. }
  133. } else if (p == pp && pp == ue) {
  134. return false;
  135. } else if (s + 1 < ue && *s == '/' && *(s+1) == '/') {
  136. /* relative-scheme URL */
  137. s += 2;
  138. } else {
  139. goto just_path;
  140. }
  141. } else if (s + 1 < ue && *s == '/' && *(s +1 ) == '/') {
  142. /* relative-scheme URL */
  143. s += 2;
  144. } else {
  145. goto just_path;
  146. }
  147. parse_host:
  148. /* Binary-safe strcspn(s, "/?#") */
  149. e = ue;
  150. if ((p = (const char*)memchr(s, '/', e - s))) {
  151. e = p;
  152. }
  153. if ((p = (const char*)memchr(s, '?', e - s))) {
  154. e = p;
  155. }
  156. if ((p = (const char*)memchr(s, '#', e - s))) {
  157. e = p;
  158. }
  159. /* check for login and password */
  160. if ((p = (const char*)memrchr(s, '@', (e-s)))) {
  161. if ((pp = (const char*)memchr(s, ':', (p-s)))) {
  162. replace_controlchars(output.user, s, (pp - s));
  163. pp++;
  164. replace_controlchars(output.pass, pp, (p-pp));
  165. } else {
  166. replace_controlchars(output.user, s, (p-s));
  167. }
  168. s = p + 1;
  169. }
  170. /* check for port */
  171. if (s < ue && *s == '[' && *(e-1) == ']') {
  172. /* Short circuit portscan,
  173. we're dealing with an
  174. IPv6 embedded address */
  175. p = nullptr;
  176. } else {
  177. p = (const char*)memrchr(s, ':', e - s);
  178. }
  179. if (p) {
  180. if (!output.port) {
  181. p++;
  182. if (e-p > 5) { /* port cannot be longer then 5 characters */
  183. return false;
  184. } else if (e - p > 0) {
  185. memcpy(port_buf, p, (e-p));
  186. port_buf[e-p] = '\0';
  187. auto port = atoi(port_buf);
  188. if (port > 0 && port <= 65535) {
  189. output.port = port;
  190. } else {
  191. return false;
  192. }
  193. }
  194. p--;
  195. }
  196. } else {
  197. p = e;
  198. }
  199. /* check if we have a valid host, if we don't reject the string as url */
  200. if ((p-s) < 1) {
  201. return false;
  202. }
  203. replace_controlchars(output.host, s, (p - s));
  204. if (e == ue) {
  205. return true;
  206. }
  207. s = e;
  208. just_path:
  209. e = ue;
  210. p = (const char*)memchr(s, '#', (e - s));
  211. if (p) {
  212. p++;
  213. if (p < e) {
  214. replace_controlchars(output.fragment, p, e - p);
  215. }
  216. e = p - 1;
  217. }
  218. p = (const char*)memchr(s, '?', (e - s));
  219. if (p) {
  220. p++;
  221. if (p < e) {
  222. replace_controlchars(output.query, p, e - p);
  223. }
  224. e = p - 1;
  225. }
  226. if (s < e || s == ue) {
  227. replace_controlchars(output.path, s, e - s);
  228. }
  229. return true;
  230. }
  231. ///////////////////////////////////////////////////////////////////////////////
  /*
   * Convert the two hexadecimal digits at s[0] and s[1] to their integer
   * value (s[0] is the high nibble). Upper- and lower-case letters are both
   * accepted. The input is not validated here; callers check isxdigit()
   * on both characters first.
   */
  static int php_htoi(char *s) {
    int result = 0;
    for (int pos = 0; pos < 2; pos++) {
      int ch = ((unsigned char *)s)[pos];
      if (isupper(ch)) {
        ch = tolower(ch);
      }
      int nibble;
      if (ch >= '0' && ch <= '9') {
        nibble = ch - '0';
      } else {
        nibble = ch - 'a' + 10;
      }
      result = result * 16 + nibble;
    }
    return result;
  }
  /* rfc1738:
     ...The characters ";",
     "/", "?", ":", "@", "=" and "&" are the characters which may be
     reserved for special meaning within a scheme...
     ...Thus, only alphanumerics, the special characters "$-_.+!*'(),", and
     reserved characters used for their reserved purposes may be used
     unencoded within a URL...
     For added safety, we only leave -_. unencoded.
  */

  /* Upper-case hexadecimal digits indexed by nibble value (0..15); a
     read-only lookup table for producing %XX escapes. */
  static const unsigned char hexchars[] = "0123456789ABCDEF";
  255. String url_encode(const char *s, size_t len) {
  256. String retString(safe_address(len, 3, 1), ReserveString);
  257. register unsigned char c;
  258. unsigned char *to, *start;
  259. unsigned char const *from, *end;
  260. from = (unsigned char const *)s;
  261. end = (unsigned char const *)s + len;
  262. start = to = (unsigned char *)retString.mutableData();
  263. while (from < end) {
  264. c = *from++;
  265. if (c == ' ') {
  266. *to++ = '+';
  267. } else if ((c < '0' && c != '-' && c != '.') ||
  268. (c < 'A' && c > '9') ||
  269. (c > 'Z' && c < 'a' && c != '_') ||
  270. (c > 'z')) {
  271. to[0] = '%';
  272. to[1] = hexchars[c >> 4];
  273. to[2] = hexchars[c & 15];
  274. to += 3;
  275. } else {
  276. *to++ = c;
  277. }
  278. }
  279. retString.setSize(to - start);
  280. return retString;
  281. }
  282. String url_decode(const char *s, size_t len) {
  283. String retString(s, len, CopyString);
  284. char *str = retString.mutableData();
  285. char *dest = str;
  286. char *data = str;
  287. while (len--) {
  288. if (*data == '+') {
  289. *dest = ' ';
  290. }
  291. else if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1))
  292. && isxdigit((int) *(data + 2))) {
  293. *dest = (char) php_htoi(data + 1);
  294. data += 2;
  295. len -= 2;
  296. } else {
  297. *dest = *data;
  298. }
  299. data++;
  300. dest++;
  301. }
  302. retString.setSize(dest - str);
  303. return retString;
  304. }
  305. size_t url_decode_ex(char *value, size_t len) {
  306. assertx(value && *value); // check before calling this function
  307. if (len == 0) return 0;
  308. size_t i = 0, o = 0;
  309. unsigned char *s = (unsigned char *)value;
  310. unsigned char *end = s + len;
  311. while (s + i < end) {
  312. if (s[i] == '+') {
  313. s[o++] = ' ';
  314. i++;
  315. } else if (s[i] == '%' && isxdigit(s[i+1]) && isxdigit(s[i+2])) {
  316. char num;
  317. num = (s[i+1] >= 'A') ? ((s[i+1] & 0xdf) - 'A') + 10 : (s[i+1] - '0');
  318. num *= 16;
  319. num += (s[i+2] >= 'A') ? ((s[i+2] & 0xdf) - 'A') + 10 : (s[i+2] - '0');
  320. s[o++] = num;
  321. i+=3;
  322. } else {
  323. s[o++] = s[i++];
  324. }
  325. }
  326. if (i && o) s[o] = '\0';
  327. return o;
  328. }
  329. String url_raw_encode(const char *s, size_t len) {
  330. String retString(safe_address(len, 3, 1), ReserveString);
  331. size_t x, y;
  332. unsigned char *str = (unsigned char *)retString.mutableData();
  333. for (x = 0, y = 0; len--; x++, y++) {
  334. str[y] = (unsigned char) s[x];
  335. if ((str[y] < '0' && str[y] != '-' && str[y] != '.') ||
  336. (str[y] < 'A' && str[y] > '9') ||
  337. (str[y] > 'Z' && str[y] < 'a' && str[y] != '_') ||
  338. (str[y] > 'z' && str[y] != '~')) {
  339. str[y++] = '%';
  340. str[y++] = hexchars[(unsigned char) s[x] >> 4];
  341. str[y] = hexchars[(unsigned char) s[x] & 15];
  342. }
  343. }
  344. retString.setSize(y);
  345. return retString;
  346. }
  347. String url_raw_decode(const char *s, size_t len) {
  348. String retString(s, len, CopyString);
  349. char *str = retString.mutableData();
  350. char *dest = str;
  351. char *data = str;
  352. while (len--) {
  353. if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1))
  354. && isxdigit((int) *(data + 2))) {
  355. *dest = (char) php_htoi(data + 1);
  356. data += 2;
  357. len -= 2;
  358. } else {
  359. *dest = *data;
  360. }
  361. data++;
  362. dest++;
  363. }
  364. retString.setSize(dest - str);
  365. return retString;
  366. }
  367. ///////////////////////////////////////////////////////////////////////////////
  368. }