/components/ContentParserComponent.php
https://gitlab.com/potiev/machine-learning · PHP · 265 lines · 156 code · 39 blank · 70 comment · 19 complexity · f7c86734396522432c12000b1cbfd290 MD5 · raw file
- <?php
- namespace app\components;
- use ErrorException;
- use Yii;
- use yii\helpers\ArrayHelper;
/**
 * FUNCTION
 * Parses the content RDF file (data/content.rdf.u8) and stores the rows that
 * belong to a topic containing "Top/Science" in the database.
 *
 * USAGE
 * Call startParse(). It sets the file name and the start time itself, runs
 * the parser and flushes any partially filled insert batches at the end.
 *
 * PROPERTIES
 * descriptions (@array):     buffered rows for the content_description table
 * links (@array):            buffered rows for the content_links table
 * count_rows (@int):         how many rows have been collected so far
 * count_rows_temp (@int):    counter since the last status message
 *                            (reset after ECHO_STATS_FREQUENCY rows)
 * current_tag (@string):     name of the element that is currently open,
 *                            '' when no element is collecting text
 * permitted_tags (@array):   lower-cased tag names stored as links
 *
 * CONTENT LINKS (content_links):
 * topic (@string), type (@string), resource (@string), catid (@int)
 *
 * CONTENT DESCRIPTION (content_description):
 * external_page (@string), title (@string), description (@string),
 * priority (@int), ext_topic (@string)
 *
 * METHODS
 * startParse, _startTagProcessor, _endTagProcessor, _charDataProcessor
 */
class ContentParserComponent extends ParserComponent
{
    public $descriptions = [];
    public $links = [];

    // Start at 0 so the final message prints "0" instead of an empty string
    // when nothing was inserted.
    public $count_rows = 0;
    public $count_rows_temp = 0;

    public $current_tag;
    public $permitted_tags;

    //(content_links)
    public $topic;
    public $type;
    public $resource;
    public $catid;

    //(content_description)
    public $external_page;
    public $title;
    public $description;
    public $priority;
    public $ext_topic;

    /**
     * Parses data/content.rdf.u8 and writes the collected rows to the database.
     *
     * Rows are inserted in batches of $this->batchInsertRowsCount; whatever is
     * still buffered once the parser has finished is inserted before the
     * summary is printed.
     */
    public function startParse()
    {
        $this->printToConsole("Start parsing content.rdf.u8 file");
        $this->xml_file = 'data/content.rdf.u8';
        $this->setStartTime();

        //Starts with clean properties
        //(content_links)
        $this->topic = '';
        $this->type = '';
        $this->resource = '';
        $this->catid = '';

        //(content_description)
        $this->external_page = '';
        $this->title = '';
        $this->description = '';
        $this->priority = 0;
        $this->ext_topic = '';

        $this->current_tag = '';

        //Here we specify what tags are legal
        $this->permitted_tags = ['link', 'link1'];

        $this->_startToParse();

        // The last batch is usually only partially filled; without this flush
        // those rows would be counted but never reach the database.
        $this->flushDescriptions();
        $this->flushLinks();

        //Print out that it is finished!
        $this->printToConsole("Finished processing content RDF file!");
        $this->printToConsole("Inserted rows into the database: $this->count_rows");
    }

    /**
     * Handles an opening tag.
     *
     * @param resource|object $__parser The XML parser invoking the handler
     * @param string $__tag_name The name of the tag being opened
     * @param array $__attributes Attributes of the tag
     */
    public function _startTagProcessor($__parser, $__tag_name, $__attributes)
    {
        //Sets what tag we currently are in
        $this->current_tag = $__tag_name;
        $tag = strtolower($__tag_name);

        // Case-sensitive on purpose: the lower-case <topic> element is a
        // different tag and is handled in _charDataProcessor.
        if ($__tag_name === 'Topic') {
            $this->catid = '';
            $this->topic = isset($__attributes['r:id']) ? $__attributes['r:id'] : '';
        }

        if ($tag === 'externalpage') {
            $this->external_page = isset($__attributes['about']) ? $__attributes['about'] : '';
            // Start every record with empty fields: text is appended chunk by
            // chunk, so leftovers from a skipped record must not leak in.
            $this->title = '';
            $this->description = '';
            $this->priority = 0;
            $this->ext_topic = '';
        }

        //Check if the tag is equal to some of our permitted tags
        if (in_array($tag, $this->permitted_tags, true)) {
            $this->type = $__tag_name;
            $this->resource = isset($__attributes['r:resource']) ? $__attributes['r:resource'] : '';
        }
    }

    /**
     * Handles a closing tag: buffers a finished record and inserts the buffer
     * once it holds $this->batchInsertRowsCount rows.
     *
     * @param resource|object $__parser The XML parser invoking the handler
     * @param string $__tag_name The name of the tag being closed
     */
    public function _endTagProcessor($__parser, $__tag_name)
    {
        // Text after a closing tag does not belong to that element any more.
        $this->current_tag = '';

        $tag = strtolower($__tag_name);
        $isTrackedTopic = strpos((string) $this->topic, "Top/Science") !== false;

        //Check if the end tag is external_page
        if ($tag === 'externalpage') {
            if (!$isTrackedTopic) {
                return;
            }

            $this->descriptions[] = [
                $this->external_page,
                trim($this->title),
                trim($this->description),
                $this->priority,
                $this->ext_topic,
            ];
            $this->countRow();

            $this->external_page = '';
            $this->title = '';
            $this->description = '';
            $this->priority = 0;
            $this->ext_topic = '';

            if (count($this->descriptions) >= $this->batchInsertRowsCount) {
                $this->flushDescriptions();
            }
        }

        //Check if the end tag is in the range of permitted tags
        if (in_array($tag, $this->permitted_tags, true)) {
            if (!$isTrackedTopic) {
                return;
            }

            $this->links[] = [
                $this->catid,
                $this->topic,
                $this->type,
                $this->resource,
            ];
            $this->countRow();

            $this->type = '';
            $this->resource = '';

            // The comparison has to happen outside count(); counting the
            // result of the comparison is an error on PHP 8 and defeated
            // batching on older versions.
            if (count($this->links) >= $this->batchInsertRowsCount) {
                $this->flushLinks();
            }
        }

        //Check if the stats are set and ECHO_STATS_FREQUENCY is reached
        if (ECHO_STATS && $this->count_rows_temp == ECHO_STATS_FREQUENCY) {
            $this->count_rows_temp = 0;
            $this->_echoStatus($this->start_time, $this->count_rows, 'Yet another '.ECHO_STATS_FREQUENCY.' rows reached! - content RDF document');
        }
    }

    /**
     * Collects the text content of the element that is currently open.
     *
     * The XML parser may deliver the text of one element in several calls, so
     * the data is appended to the field instead of replacing it.
     *
     * @param resource|object $__parser The XML parser invoking the handler
     * @param string $__data A piece of character data
     */
    public function _charDataProcessor($__parser, $__data)
    {
        $hasText = trim($__data) != '';

        switch ($this->current_tag) {
            case 'd:Title':
                // Whitespace-only chunks are kept: they may separate two
                // words. Leading/trailing whitespace is trimmed on insert.
                $this->title .= $__data;
                break;
            case 'd:Description':
                $this->description .= $__data;
                break;
            case 'catid':
                if ($hasText) {
                    $this->catid .= $__data;
                }
                break;
            case 'priority':
                if ($hasText) {
                    // Integer 0 is the "nothing read yet" default, not a
                    // prefix to extend.
                    $this->priority = ($this->priority === 0 ? '' : $this->priority) . $__data;
                }
                break;
            case 'topic':
                if ($hasText) {
                    $this->ext_topic .= $__data;
                }
                break;
            default:
                break;
        }
    }

    /**
     * Counts one collected row, both in total and since the last status message.
     */
    private function countRow()
    {
        $this->count_rows++;
        $this->count_rows_temp++;
    }

    /**
     * Inserts the buffered content_description rows and empties the buffer.
     */
    private function flushDescriptions()
    {
        $this->insertBatch(
            'content_description',
            ['externalpage', 'title', 'description', 'priority', 'topic'],
            $this->descriptions
        );
        $this->descriptions = [];
    }

    /**
     * Inserts the buffered content_links rows and empties the buffer.
     */
    private function flushLinks()
    {
        $this->insertBatch(
            'content_links',
            ['catid', 'topic', 'type', 'resource'],
            $this->links
        );
        $this->links = [];
    }

    /**
     * Runs one batch insert; does nothing when there are no rows.
     *
     * @param string $table Target table name
     * @param array $columns Column names, in the order used by each row
     * @param array $rows Rows to insert
     */
    private function insertBatch($table, array $columns, array $rows)
    {
        if ($rows === []) {
            return;
        }

        Yii::$app->db->createCommand()->batchInsert($table, $columns, $rows)->execute();
    }
}