/app/Libs/PHPCrawl/UrlCache/PHPCrawlerSQLiteURLCache.class.php
PHP | 301 lines | 173 code | 47 blank | 81 comment | 20 complexity | f557671608deb642544edc014e135e09 MD5 | raw file
- <?php
- /**
- * Class for caching/storing URLs/links in a SQLite-database-file.
- *
- * @package phpcrawl
- * @internal
- */
- class PHPCrawlerSQLiteURLCache extends PHPCrawlerURLCacheBase
- {
- /**
- * PDO-object for querying SQLite-file.
- *
- * @var PDO
- */
- protected $PDO;
-
- /**
- * Prepared statement for inserting URLS into the db-file as PDOStatement-object.
- *
- * @var PDOStatement
- */
- protected $PreparedInsertStatement;
-
- protected $sqlite_db_file;
-
- protected $db_analyzed = false;
-
- /**
- * Initiates an SQLite-URL-cache.
- *
- * @param string $file The SQLite-fiel to use.
- * @param bool $create_tables Defines whether all necessary tables should be created
- */
- public function __construct($file, $create_tables = false)
- {
- $this->sqlite_db_file = $file;
- $this->openConnection($create_tables);
- }
-
- public function getUrlCount()
- {
- $Result = $this->PDO->query("SELECT count(id) AS sum FROM urls WHERE processed = 0;");
- $row = $Result->fetch(PDO::FETCH_ASSOC);
- $Result->closeCursor();
- return $row["sum"];
- }
-
- /**
- * Returns the next URL from the cache that should be crawled.
- *
- * @return PhpCrawlerURLDescriptor An PhpCrawlerURLDescriptor or NULL if currently no
- * URL to process.
- */
- public function getNextUrl()
- {
- PHPCrawlerBenchmark::start("fetching_next_url_from_sqlitecache");
-
- $ok = $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION");
-
- // Get row with max priority-level
- $Result = $this->PDO->query("SELECT max(priority_level) AS max_priority_level FROM urls WHERE in_process = 0 AND processed = 0;");
- $row = $Result->fetch(PDO::FETCH_ASSOC);
-
- if ($row["max_priority_level"] == null)
- {
- $Result->closeCursor();
- $this->PDO->exec("COMMIT;");
- return null;
- }
-
- $Result = $this->PDO->query("SELECT * FROM urls WHERE priority_level = ".$row["max_priority_level"]." and in_process = 0 AND processed = 0;");
- $row = $Result->fetch(PDO::FETCH_ASSOC);
- $Result->closeCursor();
-
- // Update row (set in process-flag)
- $this->PDO->exec("UPDATE urls SET in_process = 1 WHERE id = ".$row["id"].";");
-
- $this->PDO->exec("COMMIT;");
-
- PHPCrawlerBenchmark::stop("fetching_next_url_from_sqlitecache");
-
- // Return URL
- return new PHPCrawlerURLDescriptor($row["url_rebuild"], $row["link_raw"], $row["linkcode"], $row["linktext"], $row["refering_url"], $row["url_link_depth"]);
- }
-
- /**
- * Has no function in this class
- */
- public function getAllURLs()
- {
- }
-
- /**
- * Removes all URLs and all priority-rules from the URL-cache.
- */
- public function clear()
- {
- $this->PDO->exec("DELETE FROM urls;");
- $this->PDO->exec("VACUUM;");
- }
-
- /**
- * Adds an URL to the url-cache
- *
- * @param PHPCrawlerURLDescriptor $UrlDescriptor
- */
- public function addURL(PHPCrawlerURLDescriptor $UrlDescriptor)
- {
- if ($UrlDescriptor == null) return;
-
- // Hash of the URL
- $map_key = md5($UrlDescriptor->url_rebuild);
-
- // Get priority of URL
- $priority_level = $this->getUrlPriority($UrlDescriptor->url_rebuild);
-
- $this->createPreparedInsertStatement();
-
- // Insert URL via prepared statement
- $this->PreparedInsertStatement->execute(array(":priority_level" => $priority_level,
- ":distinct_hash" => $map_key,
- ":link_raw" => $UrlDescriptor->link_raw,
- ":linkcode" => $UrlDescriptor->linkcode,
- ":linktext" => $UrlDescriptor->linktext,
- ":refering_url" => $UrlDescriptor->refering_url,
- ":url_rebuild" => $UrlDescriptor->url_rebuild,
- ":is_redirect_url" => $UrlDescriptor->is_redirect_url,
- ":url_link_depth" => $UrlDescriptor->url_link_depth));
- }
-
- /**
- * Adds an bunch of URLs to the url-cache
- *
- * @param array $urls A numeric array containing the URLs as PHPCrawlerURLDescriptor-objects
- */
- public function addURLs($urls)
- {
- PHPCrawlerBenchmark::start("adding_urls_to_sqlitecache");
-
- $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION;");
-
- $cnt = count($urls);
- for ($x=0; $x<$cnt; $x++)
- {
- if ($urls[$x] != null)
- {
- $this->addURL($urls[$x]);
- }
-
- // Commit after 1000 URLs (reduces memory-usage)
- if ($x%1000 == 0 && $x > 0)
- {
- $this->PDO->exec("COMMIT;");
- $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION;");
- }
- }
-
- $this->PDO->exec("COMMIT;");
- $this->PreparedInsertStatement->closeCursor();
-
- if ($this->db_analyzed == false)
- {
- $this->PDO->exec("ANALYZE;");
- $this->db_analyzed = true;
- }
-
- PHPCrawlerBenchmark::stop("adding_urls_to_sqlitecache");
- }
-
- /**
- * Marks the given URL in the cache as "followed"
- *
- * @param PHPCrawlerURLDescriptor $UrlDescriptor
- */
- public function markUrlAsFollowed(PHPCrawlerURLDescriptor $UrlDescriptor)
- {
- PHPCrawlerBenchmark::start("marking_url_as_followes");
- $hash = md5($UrlDescriptor->url_rebuild);
- $this->PDO->exec("UPDATE urls SET processed = 1, in_process = 0 WHERE distinct_hash = '".$hash."';");
- PHPCrawlerBenchmark::stop("marking_url_as_followes");
- }
-
- /**
- * Checks whether there are URLs left in the cache that should be processed or not.
- *
- * @return bool
- */
- public function containsURLs()
- {
- PHPCrawlerBenchmark::start("checking_for_urls_in_cache");
-
- $Result = $this->PDO->query("SELECT id FROM urls WHERE processed = 0 OR in_process = 1 LIMIT 1;");
-
- $has_columns = $Result->fetchColumn();
-
- $Result->closeCursor();
-
- PHPCrawlerBenchmark::stop("checking_for_urls_in_cache");
-
- if ($has_columns != false)
- {
- return true;
- }
- else return false;
- }
-
- /**
- * Cleans/purges the URL-cache from inconsistent entries.
- */
- public function purgeCache()
- {
- // Set "in_process" to 0 for all URLs
- $this->PDO->exec("UPDATE urls SET in_process = 0;");
- }
-
- /**
- * Creates the sqlite-db-file and opens connection to it.
- *
- * @param bool $create_tables Defines whether all necessary tables should be created
- */
- protected function openConnection($create_tables = false)
- {
- PHPCrawlerBenchmark::start("connecting_to_sqlite_db");
-
- // Open sqlite-file
- try
- {
- $this->PDO = new PDO("sqlite:".$this->sqlite_db_file);
- }
- catch (Exception $e)
- {
- throw new Exception("Error creating SQLite-cache-file, ".$e->getMessage().", try installing sqlite3-extension for PHP.");
- }
-
- $this->PDO->exec("PRAGMA journal_mode = OFF");
-
- $this->PDO->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
- $this->PDO->setAttribute(PDO::ATTR_TIMEOUT, 100);
-
- if ($create_tables == true)
- {
- // Create url-table (if not exists)
- $this->PDO->exec("CREATE TABLE IF NOT EXISTS urls (id integer PRIMARY KEY AUTOINCREMENT,
- in_process bool DEFAULT 0,
- processed bool DEFAULT 0,
- priority_level integer,
- distinct_hash TEXT UNIQUE,
- link_raw TEXT,
- linkcode TEXT,
- linktext TEXT,
- refering_url TEXT,
- url_rebuild TEXT,
- is_redirect_url bool,
- url_link_depth integer);");
-
- // Create indexes (seems that indexes make the whole thingy slower)
- $this->PDO->exec("CREATE INDEX IF NOT EXISTS priority_level ON urls (priority_level);");
- $this->PDO->exec("CREATE INDEX IF NOT EXISTS distinct_hash ON urls (distinct_hash);");
- $this->PDO->exec("CREATE INDEX IF NOT EXISTS in_process ON urls (in_process);");
- $this->PDO->exec("CREATE INDEX IF NOT EXISTS processed ON urls (processed);");
-
- $this->PDO->exec("ANALYZE;");
- }
-
- PHPCrawlerBenchmark::stop("connecting_to_sqlite_db");
- }
-
- /**
- * Creates the prepared statement for insterting URLs into database (if not done yet)
- */
- protected function createPreparedInsertStatement()
- {
- if ($this->PreparedInsertStatement == null)
- {
- // Prepared statement for URL-inserts
- $this->PreparedInsertStatement = $this->PDO->prepare("INSERT OR IGNORE INTO urls (priority_level, distinct_hash, link_raw, linkcode, linktext, refering_url, url_rebuild, is_redirect_url, url_link_depth)
- VALUES(:priority_level,
- :distinct_hash,
- :link_raw,
- :linkcode,
- :linktext,
- :refering_url,
- :url_rebuild,
- :is_redirect_url,
- :url_link_depth);");
- }
- }
-
- /**
- * Cleans up the cache after is it not needed anymore.
- */
- public function cleanup()
- {
- // Has to be done, otherwise sqlite-file is locked on Windows-OS
- $this->PDO = null;
- $this->PreparedInsertStatement = null;
-
- unlink($this->sqlite_db_file);
- }
- }
- ?>