/core/Tracker/PageUrl.php
PHP | 328 lines | 190 code | 35 blank | 103 comment | 29 complexity | c706890a56bf4c0defd1a98128ef344d MD5 | raw file
Possible License(s): LGPL-3.0, JSON, MIT, GPL-3.0, LGPL-2.1, GPL-2.0, AGPL-1.0, BSD-2-Clause, BSD-3-Clause
- <?php
- /**
- * Piwik - free/libre analytics platform
- *
- * @link http://piwik.org
- * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
- *
- */
- namespace Piwik\Tracker;
- use Piwik\Common;
- use Piwik\Config;
- use Piwik\UrlHelper;
/**
 * Static helpers used to clean up, normalize and rebuild the page URLs handled
 * by the tracker.
 */
class PageUrl
{
    /**
     * Map URL prefixes to integers.
     *
     * normalizeUrl() walks this map in declaration order and stops at the first
     * match, so each "www." prefix must stay listed before its bare scheme.
     *
     * @see self::normalizeUrl(), self::reconstructNormalizedUrl()
     */
    public static $urlPrefixMap = array(
        'http://www.'  => 1,
        'http://'      => 0,
        'https://www.' => 3,
        'https://'     => 2
    );

    /**
     * Query parameter names that are always excluded, in addition to the
     * per-site exclusions and the campaign parameters
     * (see getQueryParametersToExclude()).
     */
    protected static $queryParametersToExclude = array('gclid', 'fb_xd_fragment', 'fb_comment_id',
                                                       'phpsessid', 'jsessionid', 'sessionid', 'aspsessionid',
                                                       'doing_wp_cron');

    /**
     * Given the Input URL, will exclude all query parameters set for this site.
     *
     * When the URL has a query string, the exclusions are applied to the query
     * string only. When it has no query string but has a fragment, the
     * exclusions are applied to the fragment instead.
     *
     * @static
     * @param $originalUrl
     * @param $idSite
     * @return bool|string
     */
    public static function excludeQueryParametersFromUrl($originalUrl, $idSite)
    {
        $originalUrl = self::cleanupUrl($originalUrl);

        // parse_url() returns false on seriously malformed URLs; the helpers below
        // are handed that value unchanged, exactly as before.
        $parsedUrl = @parse_url($originalUrl);
        $parsedUrl = self::cleanupHostAndHashTag($parsedUrl, $idSite);

        $parametersToExclude = self::getQueryParametersToExclude($idSite);

        // Pick the URL component to filter: the query string wins, the fragment
        // is only filtered when there is no query string.
        if (!empty($parsedUrl['query'])) {
            $component = 'query';
        } elseif (!empty($parsedUrl['fragment'])) {
            $component = 'fragment';
        } else {
            return UrlHelper::getParseUrlReverse($parsedUrl);
        }

        $queryParameters = UrlHelper::getArrayFromQueryString($parsedUrl[$component]);
        $parsedUrl[$component] = UrlHelper::getQueryStringWithExcludedParameters($queryParameters, $parametersToExclude);

        return UrlHelper::getParseUrlReverse($parsedUrl);
    }

    /**
     * Returns the array of parameters names that must be excluded from the Query String in all tracked URLs.
     *
     * The result merges, in this order: the site's 'excluded_parameters' cache
     * attribute (if set), the built-in list in self::$queryParametersToExclude,
     * and the campaign name/keyword parameters. All names are lowercased.
     *
     * @static
     * @param $idSite
     * @return array
     */
    public static function getQueryParametersToExclude($idSite)
    {
        $campaignTrackingParameters = Common::getCampaignParameters();

        $campaignTrackingParameters = array_merge(
            $campaignTrackingParameters[0], // campaign name parameters
            $campaignTrackingParameters[1] // campaign keyword parameters
        );

        $website = Cache::getCacheWebsiteAttributes($idSite);
        $excludedParameters = isset($website['excluded_parameters'])
            ? $website['excluded_parameters']
            : array();

        if (!empty($excludedParameters)) {
            Common::printDebug('Excluding parameters "' . implode(',', $excludedParameters) . '" from URL');
        }

        $parametersToExclude = array_merge(
            $excludedParameters,
            self::$queryParametersToExclude,
            $campaignTrackingParameters
        );

        return array_map('strtolower', $parametersToExclude);
    }

    /**
     * Returns true if URL fragments should be removed for a specific site,
     * false if otherwise.
     *
     * This function uses the Tracker cache and not the MySQL database.
     * A site whose cached attributes do not contain 'keep_url_fragment' is
     * treated as "do not keep" (fragments are removed).
     *
     * @param $idSite int The ID of the site to check for.
     * @return bool
     */
    public static function shouldRemoveURLFragmentFor($idSite)
    {
        $websiteAttributes = Cache::getCacheWebsiteAttributes($idSite);

        // empty() gives the same result as the negation for every set value and
        // avoids an "undefined index" notice when the attribute is missing.
        return empty($websiteAttributes['keep_url_fragment']);
    }

    /**
     * Cleans and/or removes the URL fragment of a URL.
     *
     * @param $urlFragment string The URL fragment to process.
     * @param $idSite int|bool  If not false, this function will check if URL fragments
     *                          should be removed for the site w/ this ID and if so,
     *                          the returned processed fragment will be empty.
     *
     * @return string The processed URL fragment.
     */
    public static function processUrlFragment($urlFragment, $idSite = false)
    {
        // if we should discard the url fragment for this site, return an empty string as
        // the processed url fragment
        if ($idSite !== false
            && self::shouldRemoveURLFragmentFor($idSite)
        ) {
            return '';
        }

        // Remove trailing Hash tag in ?query#hash#
        if (substr($urlFragment, -1) == '#') {
            $urlFragment = substr($urlFragment, 0, strlen($urlFragment) - 1);
        }

        return $urlFragment;
    }

    /**
     * Will cleanup the hostname (some browsers do not strtolower the hostname),
     * and deal with the hash tag on incoming URLs based on website setting.
     *
     * An empty $parsedUrl (including the false returned by a failed parse_url())
     * is returned untouched.
     *
     * @param $parsedUrl
     * @param $idSite int|bool  The site ID of the current visit. This parameter is
     *                          only used by the tracker to see if we should remove
     *                          the URL fragment for this site.
     * @return array
     */
    protected static function cleanupHostAndHashTag($parsedUrl, $idSite = false)
    {
        if (empty($parsedUrl)) {
            return $parsedUrl;
        }

        if (!empty($parsedUrl['host'])) {
            $parsedUrl['host'] = mb_strtolower($parsedUrl['host'], 'UTF-8');
        }

        if (!empty($parsedUrl['fragment'])) {
            $parsedUrl['fragment'] = self::processUrlFragment($parsedUrl['fragment'], $idSite);
        }

        return $parsedUrl;
    }

    /**
     * Converts Matrix URL format
     * from http://example.org/thing;paramA=1;paramB=6542
     * to   http://example.org/thing?paramA=1&paramB=6542
     *
     * URLs without any semicolon, and URLs whose first semicolon comes after
     * the question mark, are returned unchanged.
     *
     * @param string $originalUrl
     * @return string
     */
    public static function convertMatrixUrl($originalUrl)
    {
        $posFirstSemiColon = strpos($originalUrl, ';');
        if ($posFirstSemiColon === false) {
            return $originalUrl;
        }

        $posQuestionMark = strpos($originalUrl, '?');
        $replace = ($posQuestionMark === false);

        // A question mark located after the first semicolon is demoted to a
        // semicolon, so that the whole parameter list is converted uniformly below.
        if ($posQuestionMark !== false && $posQuestionMark > $posFirstSemiColon) {
            $originalUrl = substr_replace($originalUrl, ';', $posQuestionMark, 1);
            $replace = true;
        }

        if ($replace) {
            // The first semicolon starts the query string, all following ones
            // separate parameters. Its position is not affected by the
            // replacement above, which happens further right in the string.
            $originalUrl = substr_replace($originalUrl, '?', $posFirstSemiColon, 1);
            $originalUrl = str_replace(';', '&', $originalUrl);
        }

        return $originalUrl;
    }

    /**
     * Clean up string contents (filter, truncate, ...)
     *
     * Trims the string, strips newline, carriage return and NUL characters, then
     * truncates it to the configured [Tracker] page_maximum_length.
     *
     * @param string $string Dirty string
     * @return string
     */
    public static function cleanupString($string)
    {
        $string = trim($string);
        $string = str_replace(array("\n", "\r", "\0"), '', $string);

        $limit = Config::getInstance()->Tracker['page_maximum_length'];

        // Truncate to at most $limit bytes without cutting a multi-byte UTF-8
        // character in half; fall back to a plain byte cut without mbstring.
        if (function_exists('mb_strcut')) {
            return mb_strcut($string, 0, $limit, 'UTF-8');
        }

        return substr($string, 0, $limit);
    }

    /**
     * Re-encodes a single URL-encoded parameter value from $encoding to UTF-8.
     *
     * Non-string values, values that are not valid in $encoding, and values
     * given together with an unknown encoding name are returned unchanged.
     *
     * @param mixed $value
     * @param string $encoding
     * @return mixed
     */
    protected static function reencodeParameterValue($value, $encoding)
    {
        if (!is_string($value)) {
            return $value;
        }

        $decoded = urldecode($value);

        try {
            // "@" silences the warning older PHP versions emit for an unknown encoding
            if (@mb_check_encoding($decoded, $encoding)) {
                $value = urlencode(mb_convert_encoding($decoded, 'UTF-8', $encoding));
            }
        } catch (\ValueError $e) {
            // PHP 8+ throws instead of warning when the encoding name is unknown.
            // The charset comes from the tracked page, so keep the value as-is
            // rather than failing the whole request.
        }

        return $value;
    }

    /**
     * Recursively applies reencodeParameterValue() to every value of the array.
     *
     * @param array $queryParameters
     * @param string $encoding
     * @return array
     */
    protected static function reencodeParametersArray($queryParameters, $encoding)
    {
        foreach ($queryParameters as &$value) {
            if (is_array($value)) {
                $value = self::reencodeParametersArray($value, $encoding);
            } else {
                $value = self::reencodeParameterValue($value, $encoding);
            }
        }
        unset($value); // do not leave a dangling reference to the last element

        return $queryParameters;
    }

    /**
     * Checks if query parameters are of a non-UTF-8 encoding and converts the values
     * from the specified encoding to UTF-8.
     * This method is used to workaround browser/webapp bugs (see #3450). When
     * browsers fail to encode query parameters in UTF-8, the tracker will send the
     * charset of the page viewed and we can sometimes work around invalid data
     * being stored.
     *
     * The array is modified in place (it is taken by reference) and also returned.
     *
     * @param array $queryParameters Name/value mapping of query parameters.
     * @param bool|string $encoding of the HTML page the URL is for. Used to workaround
     *                              browser bugs & mis-coded webapps. See #3450.
     *
     * @return array
     */
    public static function reencodeParameters(&$queryParameters, $encoding = false)
    {
        // if query params are encoded w/ non-utf8 characters (due to browser bug or whatever),
        // encode to UTF-8.
        if ($encoding !== false
            && strtolower($encoding) !== 'utf-8'
            && function_exists('mb_check_encoding')
        ) {
            $queryParameters = self::reencodeParametersArray($queryParameters, $encoding);
        }

        return $queryParameters;
    }

    /**
     * Unsanitizes the URL, cleans it up as a string (see cleanupString()) and
     * converts Matrix-style parameters (see convertMatrixUrl()).
     *
     * @param string $url
     * @return string
     */
    public static function cleanupUrl($url)
    {
        $url = Common::unsanitizeInputValue($url);
        $url = self::cleanupString($url);
        $url = self::convertMatrixUrl($url);

        return $url;
    }

    /**
     * Build the full URL from the prefix ID and the rest.
     *
     * An unknown or null prefix ID leaves $url without any prefix. If the
     * rebuilt URL turns out empty, the plain concatenation is returned instead.
     *
     * @param string $url
     * @param integer $prefixId
     * @return string
     */
    public static function reconstructNormalizedUrl($url, $prefixId)
    {
        $map = array_flip(self::$urlPrefixMap);

        if ($prefixId !== null && isset($map[$prefixId])) {
            $fullUrl = $map[$prefixId] . $url;
        } else {
            $fullUrl = $url;
        }

        // Clean up host & hash tags, for URLs
        $parsedUrl = @parse_url($fullUrl);
        $parsedUrl = self::cleanupHostAndHashTag($parsedUrl);
        $rebuiltUrl = UrlHelper::getParseUrlReverse($parsedUrl);

        return !empty($rebuiltUrl) ? $rebuiltUrl : $fullUrl;
    }

    /**
     * Extract the prefix from a URL.
     * Return the prefix ID and the rest.
     *
     * The prefix is matched case-insensitively. When no known prefix matches,
     * the URL is returned whole with a null prefix ID.
     *
     * @param string $url
     * @return array  array('url' => string, 'prefixId' => int|null)
     */
    public static function normalizeUrl($url)
    {
        foreach (self::$urlPrefixMap as $prefix => $id) {
            $prefixLength = strlen($prefix);

            if (strtolower(substr($url, 0, $prefixLength)) === $prefix) {
                return array(
                    'url'      => substr($url, $prefixLength),
                    'prefixId' => $id
                );
            }
        }

        return array('url' => $url, 'prefixId' => null);
    }

    /**
     * Cleans up the given URL string and returns it, or false (after printing a
     * debug warning) when UrlHelper::isLookLikeUrl() rejects it.
     *
     * @param string $url
     * @return string|bool
     */
    public static function getUrlIfLookValid($url)
    {
        $url = self::cleanupString($url);

        if (!UrlHelper::isLookLikeUrl($url)) {
            Common::printDebug("WARNING: URL looks invalid and is discarded");
            return false;
        }

        return $url;
    }
}