PageRenderTime 62ms CodeModel.GetById 35ms RepoModel.GetById 1ms app.codeStats 0ms

/core/Tracker/PageUrl.php

https://github.com/CodeYellowBV/piwik
PHP | 328 lines | 190 code | 35 blank | 103 comment | 29 complexity | c706890a56bf4c0defd1a98128ef344d MD5 | raw file
Possible License(s): LGPL-3.0, JSON, MIT, GPL-3.0, LGPL-2.1, GPL-2.0, AGPL-1.0, BSD-2-Clause, BSD-3-Clause
  1. <?php
  2. /**
  3. * Piwik - free/libre analytics platform
  4. *
  5. * @link http://piwik.org
  6. * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
  7. *
  8. */
  9. namespace Piwik\Tracker;
  10. use Piwik\Common;
  11. use Piwik\Config;
  12. use Piwik\UrlHelper;
  /**
   * Static helpers used by the tracker to clean up, normalize and rebuild page URLs.
   */
  class PageUrl
  {
      /**
       * Map URL prefixes to integers.
       *
       * Order matters for self::normalizeUrl(): it returns on the first matching
       * prefix, so each "www." variant must be listed before its shorter sibling.
       *
       * @see self::normalizeUrl(), self::reconstructNormalizedUrl()
       */
      public static $urlPrefixMap = array(
          'http://www.'  => 1,
          'http://'      => 0,
          'https://www.' => 3,
          'https://'     => 2
      );

      /**
       * Query parameter names that are always removed from tracked URLs,
       * in addition to the per-site and campaign parameters.
       *
       * @see self::getQueryParametersToExclude()
       */
      protected static $queryParametersToExclude = array('gclid', 'fb_xd_fragment', 'fb_comment_id',
                                                         'phpsessid', 'jsessionid', 'sessionid', 'aspsessionid',
                                                         'doing_wp_cron');

      /**
       * Given the input URL, will exclude all query parameters set for this site.
       *
       * When the URL has a query string, the exclusions are applied to the query
       * string only. When it has no query string but has a fragment, the
       * exclusions are applied to the fragment instead.
       *
       * @static
       * @param $originalUrl
       * @param $idSite
       * @return bool|string
       */
      public static function excludeQueryParametersFromUrl($originalUrl, $idSite)
      {
          $originalUrl = self::cleanupUrl($originalUrl);

          $parsedUrl = @parse_url($originalUrl);
          $parsedUrl = self::cleanupHostAndHashTag($parsedUrl, $idSite);

          $parametersToExclude = self::getQueryParametersToExclude($idSite);

          // Pick the URL component to filter: the query string wins, the hash tag
          // is only filtered when there is no query string at all.
          if (!empty($parsedUrl['query'])) {
              $component = 'query';
          } elseif (!empty($parsedUrl['fragment'])) {
              $component = 'fragment';
          } else {
              return UrlHelper::getParseUrlReverse($parsedUrl);
          }

          $queryParameters = UrlHelper::getArrayFromQueryString($parsedUrl[$component]);
          $parsedUrl[$component] = UrlHelper::getQueryStringWithExcludedParameters($queryParameters, $parametersToExclude);

          return UrlHelper::getParseUrlReverse($parsedUrl);
      }

      /**
       * Returns the array of parameters names that must be excluded from the Query String in all tracked URLs.
       *
       * The result merges the site's 'excluded_parameters' attribute (if set),
       * self::$queryParametersToExclude and the campaign name/keyword parameters,
       * all lowercased.
       *
       * @static
       * @param $idSite
       * @return array
       */
      public static function getQueryParametersToExclude($idSite)
      {
          $campaignTrackingParameters = Common::getCampaignParameters();

          $campaignTrackingParameters = array_merge(
              $campaignTrackingParameters[0], // campaign name parameters
              $campaignTrackingParameters[1] // campaign keyword parameters
          );

          $website = Cache::getCacheWebsiteAttributes($idSite);
          $excludedParameters = isset($website['excluded_parameters'])
              ? $website['excluded_parameters']
              : array();

          if (!empty($excludedParameters)) {
              Common::printDebug('Excluding parameters "' . implode(',', $excludedParameters) . '" from URL');
          }

          $parametersToExclude = array_merge(
              $excludedParameters,
              self::$queryParametersToExclude,
              $campaignTrackingParameters
          );

          return array_map('strtolower', $parametersToExclude);
      }

      /**
       * Returns true if URL fragments should be removed for a specific site,
       * false if otherwise.
       *
       * This function uses the Tracker cache and not the MySQL database.
       * A missing 'keep_url_fragment' attribute is treated as "do not keep".
       *
       * @param $idSite int The ID of the site to check for.
       * @return bool
       */
      public static function shouldRemoveURLFragmentFor($idSite)
      {
          $websiteAttributes = Cache::getCacheWebsiteAttributes($idSite);

          // empty() yields the same result as !$value but does not raise a
          // notice when the attribute is absent from the cached site data.
          return empty($websiteAttributes['keep_url_fragment']);
      }

      /**
       * Cleans and/or removes the URL fragment of a URL.
       *
       * @param $urlFragment string The URL fragment to process.
       * @param $idSite int|bool  If not false, this function will check if URL fragments
       *                          should be removed for the site w/ this ID and if so,
       *                          the returned processed fragment will be empty.
       *
       * @return string The processed URL fragment.
       */
      public static function processUrlFragment($urlFragment, $idSite = false)
      {
          // if we should discard the url fragment for this site, return an empty string as
          // the processed url fragment
          if ($idSite !== false
              && self::shouldRemoveURLFragmentFor($idSite)
          ) {
              return '';
          }

          // Remove trailing Hash tag in ?query#hash#
          if (substr($urlFragment, -1) === '#') {
              $urlFragment = substr($urlFragment, 0, strlen($urlFragment) - 1);
          }

          return $urlFragment;
      }

      /**
       * Will clean up the hostname (some browsers do not lowercase the hostname),
       * and deal with the hash tag on incoming URLs based on the website setting.
       *
       * @param $parsedUrl
       * @param $idSite int|bool  The site ID of the current visit. This parameter is
       *                          only used by the tracker to see if we should remove
       *                          the URL fragment for this site.
       * @return array
       */
      protected static function cleanupHostAndHashTag($parsedUrl, $idSite = false)
      {
          // Nothing to clean up (also covers a failed parse_url() result)
          if (empty($parsedUrl)) {
              return $parsedUrl;
          }

          if (!empty($parsedUrl['host'])) {
              $parsedUrl['host'] = mb_strtolower($parsedUrl['host'], 'UTF-8');
          }

          if (!empty($parsedUrl['fragment'])) {
              $parsedUrl['fragment'] = self::processUrlFragment($parsedUrl['fragment'], $idSite);
          }

          return $parsedUrl;
      }

      /**
       * Converts Matrix URL format
       * from http://example.org/thing;paramA=1;paramB=6542
       * to   http://example.org/thing?paramA=1&paramB=6542
       *
       * A URL whose first "?" comes before its first ";" is returned unchanged.
       *
       * @param string $originalUrl
       * @return string
       */
      public static function convertMatrixUrl($originalUrl)
      {
          $posFirstSemiColon = strpos($originalUrl, ";");
          if ($posFirstSemiColon === false) {
              return $originalUrl;
          }

          $posQuestionMark = strpos($originalUrl, "?");
          $replace = ($posQuestionMark === false);

          // A "?" located after the first ";" is first turned into ";" so that the
          // whole parameter list is rewritten uniformly below.
          if ($posQuestionMark !== false && $posQuestionMark > $posFirstSemiColon) {
              $originalUrl = substr_replace($originalUrl, ";", $posQuestionMark, 1);
              $replace = true;
          }

          if ($replace) {
              // First ";" starts the query string, every following ";" separates parameters
              $originalUrl = substr_replace($originalUrl, "?", strpos($originalUrl, ";"), 1);
              $originalUrl = str_replace(";", "&", $originalUrl);
          }

          return $originalUrl;
      }

      /**
       * Clean up string contents (filter, truncate, ...)
       *
       * Trims the string, strips "\n", "\r" and "\0", then truncates it to the
       * configured Tracker 'page_maximum_length'.
       *
       * @param string $string Dirty string
       * @return string
       */
      public static function cleanupString($string)
      {
          $string = trim($string);
          $string = str_replace(array("\n", "\r", "\0"), '', $string);

          $limit = Config::getInstance()->Tracker['page_maximum_length'];

          // NOTE(review): substr() counts bytes, so the cut may fall inside a
          // multi-byte character - confirm whether that matters for storage.
          return substr($string, 0, $limit);
      }

      /**
       * Converts a single url-encoded parameter value from $encoding to UTF-8.
       *
       * Non-string values, values that are not valid in $encoding, and values
       * given with an encoding name mbstring does not know are returned unchanged.
       *
       * @param mixed $value
       * @param string $encoding
       * @return mixed
       */
      protected static function reencodeParameterValue($value, $encoding)
      {
          if (!is_string($value)) {
              return $value;
          }

          $decoded = urldecode($value);

          try {
              $isValidInEncoding = @mb_check_encoding($decoded, $encoding);
          } catch (\ValueError $e) {
              // PHP 8+ throws for unknown encoding names instead of emitting a
              // warning; treat that the same way as "not valid in this encoding".
              $isValidInEncoding = false;
          }

          if ($isValidInEncoding) {
              $value = urlencode(mb_convert_encoding($decoded, 'UTF-8', $encoding));
          }

          return $value;
      }

      /**
       * Recursively applies self::reencodeParameterValue() to every leaf of the
       * given parameters array.
       *
       * @param array $queryParameters
       * @param string $encoding
       * @return array
       */
      protected static function reencodeParametersArray($queryParameters, $encoding)
      {
          foreach ($queryParameters as &$value) {
              if (is_array($value)) {
                  $value = self::reencodeParametersArray($value, $encoding);
              } else {
                  $value = self::reencodeParameterValue($value, $encoding);
              }
          }
          // Break the reference left behind by the by-reference foreach
          unset($value);

          return $queryParameters;
      }

      /**
       * Checks if query parameters are of a non-UTF-8 encoding and converts the values
       * from the specified encoding to UTF-8.
       * This method is used to workaround browser/webapp bugs (see #3450). When
       * browsers fail to encode query parameters in UTF-8, the tracker will send the
       * charset of the page viewed and we can sometimes work around invalid data
       * being stored.
       *
       * The array is modified in place (it is taken by reference) and also returned.
       *
       * @param array $queryParameters Name/value mapping of query parameters.
       * @param bool|string $encoding of the HTML page the URL is for. Used to workaround
       *                              browser bugs & mis-coded webapps. See #3450.
       *
       * @return array
       */
      public static function reencodeParameters(&$queryParameters, $encoding = false)
      {
          // if query params are encoded w/ non-utf8 characters (due to browser bug or whatever),
          // encode to UTF-8.
          if ($encoding !== false
              && strtolower($encoding) !== 'utf-8'
              && function_exists('mb_check_encoding')
          ) {
              $queryParameters = self::reencodeParametersArray($queryParameters, $encoding);
          }

          return $queryParameters;
      }

      /**
       * Unsanitizes the URL, cleans it up as a string and converts Matrix-style
       * parameters to a regular query string.
       *
       * @param string $url
       * @return string
       */
      public static function cleanupUrl($url)
      {
          $url = Common::unsanitizeInputValue($url);
          $url = self::cleanupString($url);
          $url = self::convertMatrixUrl($url);

          return $url;
      }

      /**
       * Build the full URL from the prefix ID and the rest.
       *
       * If the rebuilt URL is empty after host / hash tag cleanup, the plain
       * prefix + rest concatenation is returned instead.
       *
       * @param string $url
       * @param integer $prefixId
       * @return string
       */
      public static function reconstructNormalizedUrl($url, $prefixId)
      {
          $map = array_flip(self::$urlPrefixMap);

          if ($prefixId !== null && isset($map[$prefixId])) {
              $fullUrl = $map[$prefixId] . $url;
          } else {
              $fullUrl = $url;
          }

          // Clean up host & hash tags, for URLs
          $parsedUrl = @parse_url($fullUrl);
          $parsedUrl = self::cleanupHostAndHashTag($parsedUrl);
          $url = UrlHelper::getParseUrlReverse($parsedUrl);

          if (!empty($url)) {
              return $url;
          }

          return $fullUrl;
      }

      /**
       * Extract the prefix from a URL.
       * Return the prefix ID and the rest.
       *
       * The prefix is matched case-insensitively. When no known prefix matches,
       * the URL is returned as is with a null prefix ID.
       *
       * @param string $url
       * @return array  array('url' => string, 'prefixId' => int|null)
       */
      public static function normalizeUrl($url)
      {
          foreach (self::$urlPrefixMap as $prefix => $id) {
              $prefixLength = strlen($prefix);

              if (strtolower(substr($url, 0, $prefixLength)) === $prefix) {
                  return array(
                      'url'      => substr($url, $prefixLength),
                      'prefixId' => $id
                  );
              }
          }

          return array('url' => $url, 'prefixId' => null);
      }

      /**
       * Returns the cleaned-up URL if it looks like a URL, false otherwise.
       *
       * @param string $url
       * @return bool|string
       */
      public static function getUrlIfLookValid($url)
      {
          $url = self::cleanupString($url);

          if (!UrlHelper::isLookLikeUrl($url)) {
              Common::printDebug("WARNING: URL looks invalid and is discarded");
              return false;
          }

          return $url;
      }
  }