PageRenderTime 62ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/plugins/Actions/ArchivingHelper.php

https://github.com/CodeYellowBV/piwik
PHP | 612 lines | 355 code | 68 blank | 189 comment | 83 complexity | 3bf3c41ef78c6bc624e5e7627f8fa4dc MD5 | raw file
Possible License(s): LGPL-3.0, JSON, MIT, GPL-3.0, LGPL-2.1, GPL-2.0, AGPL-1.0, BSD-2-Clause, BSD-3-Clause
  1. <?php
  2. /**
  3. * Piwik - free/libre analytics platform
  4. *
  5. * @link http://piwik.org
  6. * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
  7. *
  8. */
  9. namespace Piwik\Plugins\Actions;
  10. use PDOStatement;
  11. use Piwik\Config;
  12. use Piwik\DataTable\Row\DataTableSummaryRow;
  13. use Piwik\DataTable;
  14. use Piwik\DataTable\Manager;
  15. use Piwik\DataTable\Row;
  16. use Piwik\Metrics;
  17. use Piwik\Piwik;
  18. use Piwik\Tracker\Action;
  19. use Piwik\Tracker\PageUrl;
  20. use Zend_Db_Statement;
  21. /**
  22. * This static class provides:
  23. * - logic to parse/cleanup Action names,
  24. * - logic to efficiently aggregate the array data during archiving
  25. *
  26. */
  27. class ArchivingHelper
  28. {
  29. const OTHERS_ROW_KEY = '';
  /**
   * Folds the rows returned by an archiving query into the Actions tables.
   *
   * Two kinds of result rows are handled:
   * - rows carrying 'name' and 'type': the matching table row is looked up (or created)
   *   through getActionRow() and remembered in the static row cache under its idaction;
   * - rows without 'name' / 'type' (queries that do not join log_action): the row cached
   *   earlier for the idaction is reused; the result row is skipped when none was cached.
   *
   * Every remaining column of the result row is then merged into the table row.
   *
   * Ideally this should use the DataArray object instead of a custom data structure.
   *
   * @param Zend_Db_Statement|PDOStatement $query
   * @param string|bool $fieldQueried
   * @param array $actionsTablesByType
   * @return int number of result rows that were merged into a table row
   */
  static public function updateActionsTableWithRowQuery($query, $fieldQueried, & $actionsTablesByType)
  {
      $rowsProcessed = 0;

      while ($row = $query->fetch()) {
          if (empty($row['idaction'])) {
              $row['type'] = ($fieldQueried == 'idaction_url' ? Action::TYPE_PAGE_URL : Action::TYPE_PAGE_TITLE);
              // This will be replaced with 'X not defined' later
              $row['name'] = '';
              // Yes, this is kind of a hack, so we don't mix 'page url not defined' with 'page title not defined' etc.
              $row['idaction'] = -$row['type'];
          }

          // Result sets of queries that do not join log_action carry neither 'type' nor
          // 'url_prefix'. Read both once, defaulting to null, rather than indexing the row
          // directly and raising an undefined index notice on every such row.
          $actionType = isset($row['type']) ? $row['type'] : null;
          $urlPrefix = isset($row['url_prefix']) ? $row['url_prefix'] : null;

          if ($actionType != Action::TYPE_SITE_SEARCH) {
              unset($row[Metrics::INDEX_SITE_SEARCH_HAS_NO_RESULT]);
          }

          $url = self::getUrlMetadataForRow($row, $actionType, $urlPrefix);

          if (isset($row['name']) && $actionType !== null) {
              $idaction = $row['idaction'];

              // in some unknown case the type field is empty, as reported in #1082 - we ignore this
              // page view and remember (with false) that the action has to be skipped
              if (empty($actionType)) {
                  if ($idaction != DataTable::LABEL_SUMMARY_ROW) {
                      self::setCachedActionRow($idaction, $actionType, false);
                  }
                  continue;
              }

              $actionRow = self::getActionRow($row['name'], $actionType, $urlPrefix, $actionsTablesByType);
              self::setCachedActionRow($idaction, $actionType, $actionRow);
          } else {
              $actionRow = self::getCachedActionRow($row['idaction'], $actionType);

              // Action processed as "to skip" for some reasons
              if ($actionRow === false) {
                  continue;
              }
          }

          if (is_null($actionRow)) {
              continue;
          }

          if (!is_null($url) && !$actionRow->isSummaryRow()) {
              self::keepUrlOfMostHitPage($actionRow, $url, $row);
          }

          if ($actionType != Action::TYPE_PAGE_URL
              && $actionType != Action::TYPE_PAGE_TITLE
          ) {
              // only keep performance metrics when they're used (i.e. for URLs and page titles)
              unset(
                  $row[Metrics::INDEX_PAGE_SUM_TIME_GENERATION],
                  $row[Metrics::INDEX_PAGE_NB_HITS_WITH_TIME_GENERATION],
                  $row[Metrics::INDEX_PAGE_MIN_TIME_GENERATION],
                  $row[Metrics::INDEX_PAGE_MAX_TIME_GENERATION]
              );
          }

          // what is left in $row after this are the metric columns only
          unset($row['name'], $row['type'], $row['idaction'], $row['url_prefix']);

          foreach ($row as $name => $value) {
              // in some edge cases, we have twice the same action name with 2 different idaction
              // - this happens when 2 visitors visit the same new page at the same time, and 2 actions get recorded for the same name
              // - this could also happen when 2 URLs end up having the same label (eg. 2 subdomains get aggregated to the "/index" page name)
              $alreadyValue = $actionRow->getColumn($name);
              if ($alreadyValue !== false) {
                  $actionRow->setColumn($name, self::getColumnValuesMerged($name, $alreadyValue, $value));
              } else {
                  $actionRow->addColumn($name, $value);
              }
          }

          // if the exit_action was not recorded properly in the log_link_visit_action
          // there would be an error message when getting the nb_hits column
          // we must fake the record and add the columns
          if ($actionRow->getColumn(Metrics::INDEX_PAGE_NB_HITS) === false) {
              // to test this code: delete the entries in log_link_action_visit for
              // a given exit_idaction_url
              foreach (self::getDefaultRow()->getColumns() as $name => $value) {
                  $actionRow->addColumn($name, $value);
              }
          }

          $rowsProcessed++;
      }

      return $rowsProcessed;
  }

  /**
   * Computes the value of the 'url' metadata for one result row.
   *
   * Returns null for site searches and page titles (no URL metadata is written for them),
   * the reconstructed URL when the row has a real name, and an empty string otherwise.
   * The empty string will appear as <url /> in the API, which is actually very important
   * to keep: when at least one row in a report does not have a URL, not having this
   * <url /> would break HTML/PDF reports.
   *
   * @param array $row result row of the archiving query
   * @param int|null $actionType
   * @param int|null $urlPrefix
   * @return string|null
   */
  private static function getUrlMetadataForRow($row, $actionType, $urlPrefix)
  {
      if ($actionType == Action::TYPE_SITE_SEARCH
          || $actionType == Action::TYPE_PAGE_TITLE
      ) {
          return null;
      }

      if (!empty($row['name'])
          && $row['name'] != DataTable::LABEL_SUMMARY_ROW
      ) {
          return PageUrl::reconstructNormalizedUrl((string)$row['name'], $urlPrefix);
      }

      return '';
  }

  /**
   * Makes sure the 'url' metadata of a table row is the one of the result row with the most hits.
   *
   * Different URLs can end up under the same label: for example http://piwik.org and
   * http://id.piwik.org are both reported in Piwik > Actions > Pages as /index, but
   * http://piwik.org must be the one used to link & for transitions.
   * Note: this code is partly duplicated from Row->sumRowMetadata()
   *
   * @param Row $actionRow table row whose metadata is updated
   * @param string $url URL computed for the current result row
   * @param array $row current result row
   */
  private static function keepUrlOfMostHitPage($actionRow, $url, $row)
  {
      $hasHits = !empty($row[Metrics::INDEX_PAGE_NB_HITS]);

      if ($actionRow->getMetadata('url') === false) {
          // first URL seen for this row
          $actionRow->setMetadata('url', $url);
          $actionRow->maxVisitsSummed = $hasHits ? $row[Metrics::INDEX_PAGE_NB_HITS] : 0;
          return;
      }

      if ($hasHits
          && $row[Metrics::INDEX_PAGE_NB_HITS] > $actionRow->maxVisitsSummed
      ) {
          $actionRow->setMetadata('url', $url);
          $actionRow->maxVisitsSummed = $row[Metrics::INDEX_PAGE_NB_HITS];
      }
  }
  /**
   * Runs the 'ColumnDelete' filter on the given table for the
   * Metrics::INDEX_PAGE_IS_FOLLOWING_SITE_SEARCH_NB_HITS column.
   *
   * The filter receives, in this order: the columns to remove, the columns to keep
   * (none) and the "delete only if zero" flag (true).
   *
   * @param DataTable $dataTable
   */
  public static function removeEmptyColumns($dataTable)
  {
      $columnsToRemove = array(Metrics::INDEX_PAGE_IS_FOLLOWING_SITE_SEARCH_NB_HITS);
      $columnsToKeep = array();
      $deleteIfZeroOnly = true;

      $filterParameters = array($columnsToRemove, $columnsToKeep, $deleteIfZeroOnly);
      $dataTable->filter('ColumnDelete', $filterParameters);
  }
  /**
   * For rows which have subtables (eg. directories with sub pages) and for the summary row,
   * deletes the columns listed in Archiver::$columnsToDeleteAfterAggregation, i.e. the ones
   * which don't make sense when all values of sub pages are summed.
   *
   * Subtables are processed recursively before their parent row, and the table itself is
   * finally passed to removeEmptyColumns().
   *
   * @param $dataTable DataTable
   */
  public static function deleteInvalidSummedColumnsFromDataTable($dataTable)
  {
      foreach ($dataTable->getRows() as $rowId => $tableRow) {
          $subtableId = $tableRow->getIdSubDataTable();
          $hasSubtable = ($subtableId !== null);

          // plain leaf rows are left untouched
          if (!$hasSubtable && $rowId !== DataTable::ID_SUMMARY_ROW) {
              continue;
          }

          if ($hasSubtable) {
              $childTable = Manager::getInstance()->getTable($subtableId);
              self::deleteInvalidSummedColumnsFromDataTable($childTable);
          }

          if ($tableRow instanceof DataTableSummaryRow) {
              $tableRow->recalculate();
          }

          foreach (Archiver::$columnsToDeleteAfterAggregation as $columnName) {
              $tableRow->deleteColumn($columnName);
          }
      }

      // And this as well
      self::removeEmptyColumns($dataTable);
  }
  /**
   * Returns the limit to use with RankingQuery for this plugin.
   *
   * The limit is the largest of the 'archiving_ranking_query_row_limit',
   * 'datatable_archiving_maximum_rows_actions' and
   * 'datatable_archiving_maximum_rows_subtable_actions' General settings.
   * When the ranking query limit is configured as 0, 100000 is returned instead.
   *
   * @return int
   */
  public static function getRankingQueryLimit()
  {
      // FIXME: Falling back to 100000 is a quick fix for #3482. The actual cause of the bug
      //        is that the site search & performance metrics additions to
      //        ArchivingHelper::updateActionsTableWithRowQuery expect every row to have
      //        'type' data, but not all of the SQL queries that are run w/o ranking query
      //        join on the log_action table and thus do not select the log_action.type column.
      //
      // NOTES: Archiving logic can be generalized as follows:
      //  0) Do SQL query over log_link_visit_action & join on log_action to select
      //     some metrics (like visits, hits, etc.)
      //  1) For each row, cache the action row & metrics. (This is done by
      //     updateActionsTableWithRowQuery for result set rows that have
      //     name & type columns.)
      //  2) Do other SQL queries for metrics we can't put in the first query (like
      //     entry visits, exit visits, etc.) w/o joining log_action.
      //  3) For each row, find the cached row by idaction & add the new metrics to
      //     it. (This is done by updateActionsTableWithRowQuery for result set rows
      //     that DO NOT have name & type columns.)
      //
      // The site search & performance metrics additions expect a 'type' all the time
      // which breaks the original pre-rankingquery logic. Ranking query requires a
      // join, so the bug is only seen when ranking query is disabled.
      $fallbackLimit = 100000;

      $generalSettings = Config::getInstance()->General;
      $rankingLimit = $generalSettings['archiving_ranking_query_row_limit'];

      if ($rankingLimit == 0) {
          return $fallbackLimit;
      }

      $highestLimit = max(
          $rankingLimit,
          $generalSettings['datatable_archiving_maximum_rows_actions'],
          $generalSettings['datatable_archiving_maximum_rows_subtable_actions']
      );

      if ($highestLimit === 0) {
          return $fallbackLimit;
      }

      return $highestLimit;
  }
  /**
   * Combines the value already stored in a column with a new value for the same column.
   *
   * - minimum generation time: the smaller of the two values, ignoring an empty one;
   * - maximum generation time: the larger of the two values;
   * - any other column: the sum of the two values.
   *
   * @param $columnName
   * @param $alreadyValue
   * @param $value
   * @return mixed
   */
  private static function getColumnValuesMerged($columnName, $alreadyValue, $value)
  {
      if ($columnName == Metrics::INDEX_PAGE_MIN_TIME_GENERATION) {
          // an empty value means "no measurement", so it must not win as minimum
          if (empty($alreadyValue)) {
              return $value;
          }
          if (empty($value)) {
              return $alreadyValue;
          }
          return min($alreadyValue, $value);
      }

      return $columnName == Metrics::INDEX_PAGE_MAX_TIME_GENERATION
          ? max($alreadyValue, $value)
          : $alreadyValue + $value;
  }
  259. static public $maximumRowsInDataTableLevelZero;
  260. static public $maximumRowsInSubDataTable;
  261. static public $columnToSortByBeforeTruncation;
  262. static protected $actionUrlCategoryDelimiter = null;
  263. static protected $actionTitleCategoryDelimiter = null;
  264. static protected $defaultActionName = null;
  265. static protected $defaultActionNameWhenNotDefined = null;
  266. static protected $defaultActionUrlWhenNotDefined = null;
  /**
   * (Re)loads the settings used while archiving actions from the General config section:
   * the category delimiters for URLs and titles, the default action name, the column
   * used for sorting before truncation and the maximum number of rows per table level.
   *
   * Also raises the maximum DataTable depth to the sub-category level limit plus one.
   */
  static public function reloadConfig()
  {
      $general = Config::getInstance()->General;

      // for BC, we read the old style delimiter first (see #1067).
      // empty() does not raise a notice when the legacy setting is absent.
      if (empty($general['action_category_delimiter'])) {
          self::$actionUrlCategoryDelimiter = $general['action_url_category_delimiter'];
          self::$actionTitleCategoryDelimiter = $general['action_title_category_delimiter'];
      } else {
          self::$actionUrlCategoryDelimiter = $general['action_category_delimiter'];
          self::$actionTitleCategoryDelimiter = $general['action_category_delimiter'];
      }

      self::$defaultActionName = $general['action_default_name'];
      self::$columnToSortByBeforeTruncation = Metrics::INDEX_NB_VISITS;
      self::$maximumRowsInDataTableLevelZero = $general['datatable_archiving_maximum_rows_actions'];
      self::$maximumRowsInSubDataTable = $general['datatable_archiving_maximum_rows_subtable_actions'];

      DataTable::setMaximumDepthLevelAllowedAtLeast(self::getSubCategoryLevelLimit() + 1);
  }
  /**
   * The default row is used when archiving, if data is inconsistent in the DB:
   * there could be pages that have exit/entry hits, but don't yet
   * have a record in the table (or the record was truncated).
   *
   * The row is built on first use and the same instance is returned afterwards.
   *
   * @return Row
   */
  static private function getDefaultRow()
  {
      static $defaultRow = false;

      if ($defaultRow !== false) {
          return $defaultRow;
      }

      // This row is used in the case where an action is known as an exit_action
      // but this action was not properly recorded when it was hit in the first place
      // so we add this fake row information to make sure there is a nb_hits, etc. column for every action
      $fakeColumns = array(
          Metrics::INDEX_NB_VISITS        => 1,
          Metrics::INDEX_NB_UNIQ_VISITORS => 1,
          Metrics::INDEX_PAGE_NB_HITS     => 1,
      );
      $defaultRow = new Row(array(Row::COLUMNS => $fakeColumns));

      return $defaultRow;
  }
  /**
   * Given a page name and type, returns the row of the table of that type into which
   * the action has to be aggregated.
   *
   * The table is a recursive datatable where each level of the tree is a category,
   * based on the page name split by a delimiter (slash / by default). For the
   * "summary row" label (ranking query cut-off) the summary row of the root table
   * is returned, and created first when it does not exist yet.
   *
   * @param string $actionName
   * @param int $actionType
   * @param int $urlPrefix
   * @param array $actionsTablesByType
   * @return Row|null the first element of what DataTable::walkPath() returns
   *                  (presumably null when no row could be created - the caller checks for it)
   * @throws \Exception when $actionsTablesByType has no table for $actionType
   */
  private static function getActionRow($actionName, $actionType, $urlPrefix, &$actionsTablesByType)
  {
      // we work on the root table of the given TYPE (either ACTION_URL or DOWNLOAD or OUTLINK etc.)
      // isset() is used rather than a reference assignment so that a missing type
      // does not leave a null entry behind in the caller's array
      if (!isset($actionsTablesByType[$actionType])) {
          throw new \Exception("Action table for type '$actionType' was not found during Actions archiving.");
      }

      /* @var DataTable $currentTable */
      $currentTable = $actionsTablesByType[$actionType];

      // check for ranking query cut-off
      if ($actionName == DataTable::LABEL_SUMMARY_ROW) {
          $summaryRow = $currentTable->getRowFromId(DataTable::ID_SUMMARY_ROW);
          if ($summaryRow === false) {
              $summaryRow = $currentTable->addSummaryRow(self::createSummaryRow());
          }
          return $summaryRow;
      }

      // go to the level of the subcategory
      $actionExplodedNames = self::getActionExplodedNames($actionName, $actionType, $urlPrefix);
      $walkResult = $currentTable->walkPath(
          $actionExplodedNames,
          self::getDefaultRowColumns(),
          self::$maximumRowsInSubDataTable
      );

      // walkPath() returns the row first; the second element is not needed here
      return $walkResult[0];
  }
  /**
   * Returns the configured sub-category level limit, i.e. the
   * 'action_category_level_limit' setting of the General config section.
   *
   * @return int
   */
  public static function getSubCategoryLevelLimit()
  {
      $generalSettings = Config::getInstance()->General;

      return $generalSettings['action_category_level_limit'];
  }
  /**
   * Returns the default label for the action type: the translated "page name not defined"
   * label for page titles, the "page URL not defined" one for every other type.
   *
   * Both labels are translated on first use and kept in static properties.
   *
   * @param $type
   * @return string
   */
  static public function getUnknownActionName($type)
  {
      if (empty(self::$defaultActionNameWhenNotDefined)) {
          $pageNameColumn = Piwik::translate('Actions_ColumnPageName');
          self::$defaultActionNameWhenNotDefined = Piwik::translate('General_NotDefined', $pageNameColumn);

          $pageUrlColumn = Piwik::translate('Actions_ColumnPageURL');
          self::$defaultActionUrlWhenNotDefined = Piwik::translate('General_NotDefined', $pageUrlColumn);
      }

      return $type == Action::TYPE_PAGE_TITLE
          ? self::$defaultActionNameWhenNotDefined
          : self::$defaultActionUrlWhenNotDefined;
  }
  /**
   * Explodes action name into an array of elements.
   *
   * NOTE: before calling this function make sure ArchivingHelper::reloadConfig(); is called
   *
   * for downloads:
   *  we explode link http://piwik.org/some/path/piwik.zip into an array( 'piwik.org', '/some/path/piwik.zip' );
   *
   * for outlinks:
   *  we explode link http://dev.piwik.org/some/path into an array( 'dev.piwik.org', '/some/path' );
   *
   * for action urls:
   *  we explode link http://piwik.org/some/path into an array( 'some', 'path' );
   *
   * for action names:
   *  we explode name 'Piwik / Category 1 / Category 2' into an array('Piwik', 'Category 1', 'Category 2');
   *
   * for site searches:
   *  the keyword is not split and is returned as the single element of the array.
   *
   * @param string $name action name
   * @param int $type action type
   * @param int $urlPrefix url prefix (only used for TYPE_PAGE_URL)
   * @return array of exploded elements from $name
   */
  static public function getActionExplodedNames($name, $type, $urlPrefix = null)
  {
      // Site Search does not split Search keywords
      if ($type == Action::TYPE_SITE_SEARCH) {
          return array($name);
      }

      $nameWithoutNewlines = str_replace("\n", "", $name);
      $parsedName = self::parseNameFromPageUrl($nameWithoutNewlines, $type, $urlPrefix);

      // outlinks and downloads come back already exploded
      if (is_array($parsedName)) {
          return $parsedName;
      }

      $categories = self::splitNameByDelimiter($parsedName, $type);
      if (empty($categories)) {
          $unknownName = self::getUnknownActionName($type);
          return array(trim($unknownName));
      }

      // we are careful to prefix the page URL / name with some value
      // so that if a page has the same name as a category
      // we don't merge both entries
      $leafPrefix = ($type != Action::TYPE_PAGE_TITLE) ? '/' : ' ';
      $leafName = $leafPrefix . end($categories);
      $categories[count($categories) - 1] = $leafName;

      return array_values($categories);
  }
  /**
   * Gets the key for the cache of action rows from an action ID and type.
   *
   * Regular actions are keyed by their ID; an ID that equals DataTable::LABEL_SUMMARY_ROW
   * is keyed per action type as '<type>_others'.
   *
   * @param int $idAction
   * @param int $actionType
   * @return string|int
   */
  private static function getCachedActionRowKey($idAction, $actionType)
  {
      if ($idAction == DataTable::LABEL_SUMMARY_ROW) {
          return $actionType . '_others';
      }

      return $idAction;
  }
  428. /**
  429. * Static cache to store Rows during processing
  430. */
  431. static protected $cacheParsedAction = array();
  /**
   * Empties the static cache of action rows filled by setCachedActionRow().
   */
  public static function clearActionsCache()
  {
      self::$cacheParsedAction = array();
  }
  /**
   * Get cached action row by id & type. If $idAction equals DataTable::LABEL_SUMMARY_ROW,
   * the 'Others' row cached for the specific action type is looked up.
   *
   * False is returned when nothing is cached for the action. This can happen when
   * - we select an entry page ID that was only seen yesterday, so wasn't selected in the first query
   * - we count time spent on a page, when this page was only seen yesterday
   *
   * @param int $idAction
   * @param int $actionType
   * @return Row|false
   */
  private static function getCachedActionRow($idAction, $actionType)
  {
      $cacheKey = self::getCachedActionRowKey($idAction, $actionType);

      return isset(self::$cacheParsedAction[$cacheKey])
          ? self::$cacheParsedAction[$cacheKey]
          : false;
  }
  /**
   * Stores the row to use for an action id & type in the static cache.
   *
   * @param int $idAction
   * @param int $actionType
   * @param Row|false $actionRow the row, or false to mark the action as "to skip"
   */
  private static function setCachedActionRow($idAction, $actionType, $actionRow)
  {
      $cacheKey = self::getCachedActionRowKey($idAction, $actionType);

      self::$cacheParsedAction[$cacheKey] = $actionRow;
  }
  /**
   * Returns the default columns for a row in an Actions DataTable:
   * visits, unique visitors, hits and time spent, all set to zero.
   *
   * @return array
   */
  private static function getDefaultRowColumns()
  {
      $zeroedColumns = array();
      $zeroedColumns[Metrics::INDEX_NB_VISITS] = 0;
      $zeroedColumns[Metrics::INDEX_NB_UNIQ_VISITORS] = 0;
      $zeroedColumns[Metrics::INDEX_PAGE_NB_HITS] = 0;
      $zeroedColumns[Metrics::INDEX_PAGE_SUM_TIME_SPENT] = 0;

      return $zeroedColumns;
  }
  /**
   * Creates a summary row for an Actions DataTable: a row labelled
   * DataTable::LABEL_SUMMARY_ROW that carries the default (zeroed) columns.
   *
   * @return Row
   */
  private static function createSummaryRow()
  {
      $summaryColumns = array('label' => DataTable::LABEL_SUMMARY_ROW);
      // array union: the label stays first, the default columns are appended
      $summaryColumns += self::getDefaultRowColumns();

      return new Row(array(Row::COLUMNS => $summaryColumns));
  }
  /**
   * Splits an action name into its categories, using the title delimiter for page titles
   * and the URL delimiter for every other type.
   *
   * The number of elements is capped by the sub-category level limit; every element is
   * trimmed and empty elements are dropped. A name that is already an array is returned as is,
   * and when no delimiter is configured the trimmed name is the only element.
   *
   * @param string|array $name
   * @param int $type action type
   * @return array
   */
  private static function splitNameByDelimiter($name, $type)
  {
      if (is_array($name)) {
          return $name;
      }

      $delimiter = ($type == Action::TYPE_PAGE_TITLE)
          ? self::$actionTitleCategoryDelimiter
          : self::$actionUrlCategoryDelimiter;

      if (empty($delimiter)) {
          return array(trim($name));
      }

      $categories = explode($delimiter, $name, self::getSubCategoryLevelLimit());

      // trim every category and remove empty categories
      $categories = array_filter(array_map('trim', $categories), 'strlen');

      // forces array key to start at 0
      return array_values($categories);
  }
  /**
   * Extracts the action name from a URL.
   *
   * - names that do not look like a URL are returned unchanged;
   * - downloads and outlinks are returned as array(host, '/' . path);
   * - for the other types the path is returned (without the host), completed with the
   *   default action name when it is empty or ends with a slash, and followed by
   *   '#' and the processed fragment when there is one.
   *
   * @param string $name
   * @param int $type action type
   * @param int|null $urlPrefix null when $name still contains the protocol
   * @return string|array
   */
  private static function parseNameFromPageUrl($name, $type, $urlPrefix)
  {
      $urlRegexAfterDomain = '([^/]+)[/]?([^#]*)[#]?(.*)';

      // Without a prefix the url still has its protocol (used for outlinks / downloads).
      // Otherwise the name is a url that does not contain protocol and www anymore:
      // we know that normalization has been done on db level because $urlPrefix is set.
      $protocolPattern = ($urlPrefix === null) ? 'http[s]?://' : '';
      $urlRegex = '@^' . $protocolPattern . $urlRegexAfterDomain . '$@i';

      $matches = array();
      preg_match($urlRegex, $name, $matches);
      if (empty($matches)) {
          return $name;
      }

      list(, $urlHost, $urlPath, $urlFragment) = $matches;

      if ($type == Action::TYPE_DOWNLOAD || $type == Action::TYPE_OUTLINK) {
          return array(trim($urlHost), '/' . trim($urlPath));
      }

      $pageName = $urlPath;
      if ($pageName === '' || substr($pageName, -1) == '/') {
          $pageName .= self::$defaultActionName;
      }

      $processedFragment = PageUrl::processUrlFragment($urlFragment);
      if (!empty($processedFragment)) {
          $pageName .= '#' . $processedFragment;
      }

      return $pageName;
  }
  544. }