PageRenderTime 45ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/app/Libs/PHPCrawl/UrlCache/PHPCrawlerSQLiteURLCache.class.php

https://gitlab.com/vnsoftdev/sna
PHP | 301 lines | 173 code | 47 blank | 81 comment | 20 complexity | f557671608deb642544edc014e135e09 MD5 | raw file
  1. <?php
  /**
   * URL cache that keeps the crawler's queue of URLs in an SQLite database file.
   *
   * Every URL is one row of the table "urls". The flags "in_process" and
   * "processed" record whether a URL has been handed out by getNextUrl()
   * and whether it has been marked as followed.
   *
   * @package phpcrawl
   * @internal
   */
  class PHPCrawlerSQLiteURLCache extends PHPCrawlerURLCacheBase
  {
      /**
       * PDO-object for querying the SQLite-file.
       *
       * @var PDO
       */
      protected $PDO;

      /**
       * Prepared statement for inserting URLs into the db-file.
       * Created lazily by createPreparedInsertStatement().
       *
       * @var PDOStatement
       */
      protected $PreparedInsertStatement;

      /**
       * Path of the SQLite database file this cache operates on.
       *
       * @var string
       */
      protected $sqlite_db_file;

      /**
       * TRUE as soon as addURLs() has run "ANALYZE" once on this connection.
       *
       * @var bool
       */
      protected $db_analyzed = false;

      /**
       * Initiates an SQLite-URL-cache.
       *
       * @param string $file          The SQLite-file to use.
       * @param bool   $create_tables Defines whether all necessary tables should be created.
       */
      public function __construct($file, $create_tables = false)
      {
          $this->sqlite_db_file = $file;
          $this->openConnection($create_tables);
      }

      /**
       * Returns the number of URLs in the cache that are not marked as processed yet.
       *
       * @return int
       */
      public function getUrlCount()
      {
          $Result = $this->PDO->query("SELECT count(id) AS sum FROM urls WHERE processed = 0;");
          $row = $Result->fetch(PDO::FETCH_ASSOC);
          $Result->closeCursor();

          // Older PDO-versions return the count as a string, newer ones as an integer.
          return (int) $row["sum"];
      }

      /**
       * Returns the next URL from the cache that should be crawled and flags it as "in process".
       *
       * @return PHPCrawlerURLDescriptor A PHPCrawlerURLDescriptor or NULL if there is currently
       *                                 no URL to process.
       */
      public function getNextUrl()
      {
          PHPCrawlerBenchmark::start("fetching_next_url_from_sqlitecache");

          // Select and flag the row within one exclusive transaction so that no other
          // connection can pick the same URL in between.
          $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION");

          $row = $this->fetchHighestPriorityRow();

          if ($row !== null)
          {
              // Update row (set in-process-flag)
              $Update = $this->PDO->prepare("UPDATE urls SET in_process = 1 WHERE id = :id;");
              if ($Update !== false)
              {
                  $Update->execute(array(":id" => $row["id"]));
              }
          }

          $this->PDO->exec("COMMIT;");

          // Stop the benchmark on every path, also when there is nothing to return.
          PHPCrawlerBenchmark::stop("fetching_next_url_from_sqlitecache");

          if ($row === null)
          {
              return null;
          }

          return new PHPCrawlerURLDescriptor(
              $row["url_rebuild"],
              $row["link_raw"],
              $row["linkcode"],
              $row["linktext"],
              $row["refering_url"],
              $row["url_link_depth"]
          );
      }

      /**
       * Fetches a row having the highest priority-level among all URLs that are
       * neither in process nor processed.
       *
       * @return array|null The row as associative array or NULL if there is no such row.
       */
      protected function fetchHighestPriorityRow()
      {
          // Get the max priority-level
          $Result = $this->PDO->query("SELECT max(priority_level) AS max_priority_level FROM urls WHERE in_process = 0 AND processed = 0;");
          if ($Result === false)
          {
              return null;
          }

          $row = $Result->fetch(PDO::FETCH_ASSOC);
          $Result->closeCursor();

          // Strict comparison is required here: a priority-level of 0 is a valid value
          // and gets returned as integer 0 by newer PDO-versions (0 == null is TRUE).
          if ($row === false || $row["max_priority_level"] === null)
          {
              return null;
          }

          $Statement = $this->PDO->prepare("SELECT * FROM urls WHERE priority_level = :priority_level AND in_process = 0 AND processed = 0;");
          if ($Statement === false)
          {
              return null;
          }

          $Statement->execute(array(":priority_level" => $row["max_priority_level"]));
          $url_row = $Statement->fetch(PDO::FETCH_ASSOC);
          $Statement->closeCursor();

          if ($url_row === false)
          {
              return null;
          }

          return $url_row;
      }

      /**
       * Has no function in this class
       */
      public function getAllURLs()
      {
      }

      /**
       * Removes all URLs from the URL-cache and compacts the database-file.
       */
      public function clear()
      {
          $this->PDO->exec("DELETE FROM urls;");
          $this->PDO->exec("VACUUM;");
      }

      /**
       * Adds an URL to the url-cache.
       *
       * The insert is an "INSERT OR IGNORE" keyed on the MD5-hash of the rebuilt URL,
       * so an URL that is already present in the cache is skipped.
       *
       * @param PHPCrawlerURLDescriptor $UrlDescriptor
       */
      public function addURL(PHPCrawlerURLDescriptor $UrlDescriptor)
      {
          // Hash of the URL (value of the UNIQUE-column "distinct_hash")
          $map_key = md5($UrlDescriptor->url_rebuild);

          // Get priority of URL
          $priority_level = $this->getUrlPriority($UrlDescriptor->url_rebuild);

          $this->createPreparedInsertStatement();

          // Insert URL via prepared statement
          $this->PreparedInsertStatement->execute(array(
              ":priority_level" => $priority_level,
              ":distinct_hash" => $map_key,
              ":link_raw" => $UrlDescriptor->link_raw,
              ":linkcode" => $UrlDescriptor->linkcode,
              ":linktext" => $UrlDescriptor->linktext,
              ":refering_url" => $UrlDescriptor->refering_url,
              ":url_rebuild" => $UrlDescriptor->url_rebuild,
              ":is_redirect_url" => $UrlDescriptor->is_redirect_url,
              ":url_link_depth" => $UrlDescriptor->url_link_depth
          ));
      }

      /**
       * Adds a bunch of URLs to the url-cache.
       *
       * Entries that are not PHPCrawlerURLDescriptor-objects (e.g. NULL) are skipped.
       *
       * @param array $urls An array containing the URLs as PHPCrawlerURLDescriptor-objects
       */
      public function addURLs($urls)
      {
          PHPCrawlerBenchmark::start("adding_urls_to_sqlitecache");

          $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION;");

          $handled = 0;
          foreach ($urls as $UrlDescriptor)
          {
              if ($UrlDescriptor instanceof PHPCrawlerURLDescriptor)
              {
                  $this->addURL($UrlDescriptor);
              }

              $handled++;

              // Commit after 1000 URLs (reduces memory-usage)
              if ($handled % 1000 == 0)
              {
                  $this->PDO->exec("COMMIT;");
                  $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION;");
              }
          }

          $this->PDO->exec("COMMIT;");

          // The statement gets created lazily by addURL(), so it does not exist
          // if $urls did not contain a single descriptor.
          if ($this->PreparedInsertStatement instanceof PDOStatement)
          {
              $this->PreparedInsertStatement->closeCursor();
          }

          if ($this->db_analyzed == false)
          {
              $this->PDO->exec("ANALYZE;");
              $this->db_analyzed = true;
          }

          PHPCrawlerBenchmark::stop("adding_urls_to_sqlitecache");
      }

      /**
       * Marks the given URL in the cache as "followed"
       * (sets "processed" to 1 and resets "in_process" to 0).
       *
       * @param PHPCrawlerURLDescriptor $UrlDescriptor
       */
      public function markUrlAsFollowed(PHPCrawlerURLDescriptor $UrlDescriptor)
      {
          PHPCrawlerBenchmark::start("marking_url_as_followes");

          // md5() only yields hex-characters, so the hash is safe to put into the statement.
          $hash = md5($UrlDescriptor->url_rebuild);
          $this->PDO->exec("UPDATE urls SET processed = 1, in_process = 0 WHERE distinct_hash = '".$hash."';");

          PHPCrawlerBenchmark::stop("marking_url_as_followes");
      }

      /**
       * Checks whether there are URLs left in the cache that are unprocessed or still in process.
       *
       * @return bool
       */
      public function containsURLs()
      {
          PHPCrawlerBenchmark::start("checking_for_urls_in_cache");

          $Result = $this->PDO->query("SELECT id FROM urls WHERE processed = 0 OR in_process = 1 LIMIT 1;");

          // fetchColumn() returns FALSE if the result-set is empty
          $first_id = $Result->fetchColumn();
          $Result->closeCursor();

          PHPCrawlerBenchmark::stop("checking_for_urls_in_cache");

          return $first_id !== false;
      }

      /**
       * Cleans/purges the URL-cache from inconsistent entries.
       */
      public function purgeCache()
      {
          // Set "in_process" to 0 for all URLs
          $this->PDO->exec("UPDATE urls SET in_process = 0;");
      }

      /**
       * Creates the sqlite-db-file and opens the connection to it.
       *
       * @param bool $create_tables Defines whether all necessary tables should be created
       *
       * @throws Exception If the SQLite-file could not be opened. The original exception
       *                   is attached as "previous" exception.
       */
      protected function openConnection($create_tables = false)
      {
          PHPCrawlerBenchmark::start("connecting_to_sqlite_db");

          // Open sqlite-file
          try
          {
              $this->PDO = new PDO("sqlite:".$this->sqlite_db_file);
          }
          catch (Exception $e)
          {
              throw new Exception("Error creating SQLite-cache-file, ".$e->getMessage().", try installing sqlite3-extension for PHP.", 0, $e);
          }

          $this->PDO->exec("PRAGMA journal_mode = OFF");

          // Failing statements raise PHP-warnings instead of exceptions
          $this->PDO->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
          $this->PDO->setAttribute(PDO::ATTR_TIMEOUT, 100);

          if ($create_tables == true)
          {
              // Create url-table (if not exists)
              $this->PDO->exec("CREATE TABLE IF NOT EXISTS urls (id integer PRIMARY KEY AUTOINCREMENT,
                                                                in_process bool DEFAULT 0,
                                                                processed bool DEFAULT 0,
                                                                priority_level integer,
                                                                distinct_hash TEXT UNIQUE,
                                                                link_raw TEXT,
                                                                linkcode TEXT,
                                                                linktext TEXT,
                                                                refering_url TEXT,
                                                                url_rebuild TEXT,
                                                                is_redirect_url bool,
                                                                url_link_depth integer);");

              // Create indexes (seems that indexes make the whole thingy slower)
              $this->PDO->exec("CREATE INDEX IF NOT EXISTS priority_level ON urls (priority_level);");
              $this->PDO->exec("CREATE INDEX IF NOT EXISTS distinct_hash ON urls (distinct_hash);");
              $this->PDO->exec("CREATE INDEX IF NOT EXISTS in_process ON urls (in_process);");
              $this->PDO->exec("CREATE INDEX IF NOT EXISTS processed ON urls (processed);");

              $this->PDO->exec("ANALYZE;");
          }

          PHPCrawlerBenchmark::stop("connecting_to_sqlite_db");
      }

      /**
       * Creates the prepared statement for inserting URLs into the database (if not done yet).
       */
      protected function createPreparedInsertStatement()
      {
          if ($this->PreparedInsertStatement == null)
          {
              // Prepared statement for URL-inserts
              $this->PreparedInsertStatement = $this->PDO->prepare("INSERT OR IGNORE INTO urls (priority_level, distinct_hash, link_raw, linkcode, linktext, refering_url, url_rebuild, is_redirect_url, url_link_depth)
                                                                    VALUES(:priority_level,
                                                                           :distinct_hash,
                                                                           :link_raw,
                                                                           :linkcode,
                                                                           :linktext,
                                                                           :refering_url,
                                                                           :url_rebuild,
                                                                           :is_redirect_url,
                                                                           :url_link_depth);");
          }
      }

      /**
       * Cleans up the cache after it is not needed anymore (closes the connection
       * and deletes the SQLite-file).
       */
      public function cleanup()
      {
          // Has to be done, otherwise sqlite-file is locked on Windows-OS
          $this->PDO = null;
          $this->PreparedInsertStatement = null;

          // Only delete the file if it (still) exists, unlink() raises a warning otherwise
          if (is_file($this->sqlite_db_file))
          {
              unlink($this->sqlite_db_file);
          }
      }
  }
  254. ?>