/components/ContentParserComponent.php

https://gitlab.com/potiev/machine-learning · PHP · 265 lines · 156 code · 39 blank · 70 comment · 19 complexity · f7c86734396522432c12000b1cbfd290 MD5 · raw file

  1. <?php
  2. namespace app\components;
  3. use ErrorException;
  4. use Yii;
  5. use yii\helpers\ArrayHelper;
  6. /**
  7. * FUNCTION
  8. * A class that is used to parse the XML content file.
  9. * USAGE
  10. * Call startParse - it sets the XML file (data/content.rdf.u8) and the
  11. * start time itself, so no separate setStartTime/setXMLFile call is needed.
  12. * It then parses the document and inserts the data in our
  13. * MySQL database.
  14. * PROPERTIES
  15. * The basic properties to get this class going:
  16. * count_rows (@int):
  17. * How many rows we have done so far
  18. * count_rows_temp (@int):
  19. * A temporary counter (--Reset after ECHO_STATS_FREQUENCY rows)
  20. *
  21. * XML tags and their contents
  22. * current_tag (@string):
  23. * Hold what tag we currently are in
  24. * permitted_tags (@array)
  25. * An array that holds the permitted tags
  26. *
  27. * Properties for the XML structure:
  28. * -------
  29. * CONTENT LINKS (content_links):
  30. * topic (@string)
  31. * type (@string)
  32. * resource (@string)
  33. * catid (@int)
  34. *
  35. * CONTENT DESCRIPTION (content_description):
  36. * external_page (@string)
  37. * title (@string)
  38. * description (@string)
  39. * priority (@int)
  40. * METHODS
  41. * startParse, _startTagProcessor, _endTagProcessor, _charDataProcessor
  42. ****/
  43. class ContentParserComponent extends ParserComponent
  44. {
  45. public $descriptions = [];
  46. public $links = [];
  47. public $count_rows;
  48. public $count_rows_temp;
  49. public $current_tag;
  50. public $permitted_tags;
  51. //(content_links)
  52. public $topic;
  53. public $type;
  54. public $resource;
  55. public $catid;
  56. //(content_description)
  57. public $external_page;
  58. public $title;
  59. public $description;
  60. public $priority;
  61. public $ext_topic;
  62. public function startParse()
  63. {
  64. $this->printToConsole("Start parsing content.rdf.u8 file");
  65. $this->xml_file = 'data/content.rdf.u8';
  66. $this->setStartTime();
  67. //Starts with clean properties
  68. //(content_links)
  69. $this->topic = '';
  70. $this->type = '';
  71. $this->resource = '';
  72. //(content_description)
  73. $this->external_page = '';
  74. $this->title = '';
  75. $this->description = '';
  76. $this->priority = 0;
  77. $this->ext_topic = '';
  78. $this->current_tag = '';
  79. //Here we specify what tags are legal
  80. $this->permitted_tags = ['link', 'link1'];
  81. $this->_startToParse();
  82. //Print out that it is finished!
  83. $this->printToConsole("Finished processing content RDF file!");
  84. $this->printToConsole("Inserted rows into the database: $this->count_rows");
  85. }
  86. /**
  87. * Function that processes the start tags.
  88. * @param object $__parser What parser is it dude? heh
  89. * @param string $__tag_name The name of the current tagname
  90. * $param array $__attributes Attributes of the tag_name
  91. **/
  92. public function _startTagProcessor($__parser, $__tag_name, $__attributes)
  93. {
  94. //Sets what tag we currently are in
  95. $this->current_tag = $__tag_name;
  96. //Check if the current tag is topic
  97. if ($this->current_tag == 'Topic')
  98. {
  99. //Reset catid
  100. $this->catid = '';
  101. //If it's true get id
  102. $this->topic = $__attributes['r:id'];
  103. }
  104. //Check if the current tag is external page
  105. if (strtolower($this->current_tag) == 'externalpage')
  106. {
  107. //If it's true get id
  108. $this->external_page = $__attributes['about'];
  109. }
  110. //Check if the tag is equal to some of our permitted tags
  111. if (ArrayHelper::isIn(strtolower($this->current_tag), $this->permitted_tags))
  112. {
  113. //Set type to be equal with the name
  114. $this->type = $__tag_name;
  115. //Set the resource to be equal the resource found in the tag
  116. $this->resource = $__attributes['r:resource'];
  117. }
  118. }
  119. /**
  120. * This is our end tag processor.
  121. * @param object $__parser What parser is it dude? heh
  122. * @param string $__tag_name The name of the current tag name
  123. **/
  124. public function _endTagProcessor($__parser, $__tag_name)
  125. {
  126. $db = Yii::$app->db;
  127. //Check if the end tag is external_page
  128. if (strtolower($__tag_name) == 'externalpage')
  129. {
  130. if (strpos($this->topic, "Top/Science") === FALSE) return;
  131. $this->descriptions[] = [
  132. $this->external_page,
  133. trim($this->title),
  134. trim($this->description),
  135. $this->priority,
  136. $this->ext_topic,
  137. ];
  138. $this->count_rows++; //Count rows
  139. $this->count_rows_temp++; //Temporary count rows - used to make a milestone
  140. $this->external_page = '';
  141. $this->title = '';
  142. $this->description = '';
  143. $this->priority = 0;
  144. $this->ext_topic = '';
  145. $this->current_tag = '';
  146. if (count($this->descriptions) == $this->batchInsertRowsCount) {
  147. $db->createCommand()->batchInsert('content_description',
  148. [
  149. 'externalpage',
  150. 'title',
  151. 'description',
  152. 'priority',
  153. 'topic'
  154. ],
  155. $this->descriptions
  156. )->execute();
  157. $this->descriptions = [];
  158. }
  159. }
  160. //Check if the end tag is in the range of permitted tags
  161. if (ArrayHelper::isIn(strtolower($__tag_name), $this->permitted_tags))
  162. {
  163. if (strpos($this->topic, "Top/Science") === FALSE) return;
  164. $this->links[] = [
  165. $this->catid,
  166. $this->topic,
  167. $this->type,
  168. $this->resource
  169. ];
  170. $this->count_rows++; //Count rows
  171. $this->count_rows_temp++; //Temporary count rows - used to make a milestone
  172. $this->type = '';
  173. $this->resource = '';
  174. $this->current_tag = '';
  175. if (count($this->links == $this->batchInsertRowsCount)) {
  176. $db->createCommand()->batchInsert(
  177. 'content_links',
  178. [
  179. 'catid',
  180. 'topic',
  181. 'type',
  182. 'resource'
  183. ],
  184. $this->links
  185. )->execute();
  186. $this->links = [];
  187. }
  188. }
  189. //Check if the stats are set
  190. if (ECHO_STATS) {
  191. //Check if ECHO_STATS_FREQUENCY is reached
  192. if ($this->count_rows_temp == ECHO_STATS_FREQUENCY)
  193. {
  194. $this->count_rows_temp = 0;
  195. $this->_echoStatus($this->start_time, $this->count_rows, 'Yet another '.ECHO_STATS_FREQUENCY.' rows reached! - content RDF document');
  196. }
  197. }
  198. }
  199. public function _charDataProcessor($__parser, $__data)
  200. {
  201. //Checks if there is something between the tags
  202. if(trim($__data) != '')
  203. {
  204. //Finds out what kind of data it is
  205. switch($this->current_tag) {
  206. case 'catid':
  207. $this->catid = $__data;
  208. break;
  209. case 'd:Title':
  210. $this->title = $__data;
  211. break;
  212. case 'd:Description':
  213. $this->description = $__data;
  214. break;
  215. case 'priority':
  216. $this->priority = $__data;
  217. break;
  218. case 'topic':
  219. $this->ext_topic = $__data;
  220. break;
  221. default:
  222. break;
  223. }
  224. }
  225. }
  226. }