/library/PHPMorphy/utils/dict_stuff/mrd/reader.php

https://github.com/ganzalizz/proweb_cms · PHP · 315 lines · 243 code · 70 blank · 2 comment · 21 complexity · 961a5976ea542e2bfff2afc3599e1b19 MD5 · raw file

  1. <?php
  2. require_once(dirname(__FILE__) . '/../../libs/iterators.php');
  3. require_once(dirname(__FILE__) . '/../dict/model.php');
  4. class phpMorphy_Mrd_Exception extends Exception { }
  5. abstract class phpMorphy_Mrd_Section implements Iterator, Countable {
  6. const INTERNAL_ENCODING = 'utf-8';
  7. protected
  8. $file_it,
  9. $encoding,
  10. $start_line,
  11. $current_line,
  12. $section_size;
  13. function __construct(SeekableIterator $file, $encoding, $startLine) {
  14. $this->file_it = $file;
  15. $this->encoding = $this->prepareEncoding($encoding);
  16. $this->start_line = $startLine;
  17. $this->section_size = $this->readSectionSize($file);
  18. }
  19. protected function prepareEncoding($encoding) {
  20. $encoding = strtolower($encoding);
  21. if($encoding == 'utf8') {
  22. $encoding = 'utf-8';
  23. }
  24. return $encoding;
  25. }
  26. protected function openFile($fileName) {
  27. return new SplFileObject($fileName);
  28. }
  29. function getSectionLinesCount() {
  30. return $this->count() + 1;
  31. }
  32. function count() {
  33. return $this->section_size;
  34. }
  35. function key() {
  36. return $this->current_line;
  37. }
  38. function getPosition() {
  39. return $this->current_line;
  40. }
  41. function rewind() {
  42. $this->current_line = 0;
  43. $this->file_it->seek($this->start_line + 1);
  44. }
  45. function valid() {
  46. if($this->current_line >= $this->section_size) {
  47. return false;
  48. }
  49. if(!$this->file_it->valid()) {
  50. throw new phpMorphy_Mrd_Exception(
  51. "Too small section {$this->current_line} lines gathered, $this->section_size expected"
  52. );
  53. }
  54. return true;
  55. }
  56. function current() {
  57. return $this->processLine(rtrim($this->file_it->current()));
  58. }
  59. function next() {
  60. $this->file_it->next();
  61. $this->current_line++;
  62. }
  63. protected function iconv($string) {
  64. if($this->encoding == self::INTERNAL_ENCODING) {
  65. return $string;
  66. }
  67. return iconv($this->encoding, self::INTERNAL_ENCODING, $string);
  68. }
  69. protected function readSectionSize(SeekableIterator $it) {
  70. $it->seek($this->start_line);
  71. if(!$it->valid()) {
  72. throw new phpMorphy_Mrd_Exception("Can`t read section size, iterator not valid");
  73. }
  74. $size = trim($it->current());
  75. if(!preg_match('~^[0-9]+$~', $size)) {
  76. throw new phpMorphy_Mrd_Exception("Invalid section size: $size");
  77. }
  78. return (int)$size;
  79. }
  80. protected function processLine($line) {
  81. return $line;
  82. }
  83. }
  84. class phpMorphy_Mrd_Section_Flexias extends phpMorphy_Mrd_Section {
  85. const COMMENT_STRING = 'q//q';
  86. protected function processLine($line) {
  87. $line = $this->iconv($this->removeComment($line));
  88. $model = new phpMorphy_Dict_FlexiaModel($this->getPosition());
  89. foreach(explode('%', substr($line, 1)) as $token) {
  90. //$parts = array_map('trim', explode('*', $token));
  91. $parts = explode('*', $token);
  92. switch(count($parts)) {
  93. case 2:
  94. $ancode = $parts[1];
  95. $prefix = '';
  96. break;
  97. case 3:
  98. $ancode = $parts[1];
  99. $prefix = $parts[2];
  100. break;
  101. default:
  102. throw new phpMorphy_Mrd_Exception("Invalid flexia string($token) in str($line)");
  103. }
  104. $flexia = $parts[0];
  105. $model->append(
  106. new phpMorphy_Dict_Flexia(
  107. $prefix, //$this->iconv($prefix),
  108. $flexia, //$this->iconv($flexia),
  109. $ancode
  110. )
  111. );
  112. }
  113. return $model;
  114. }
  115. protected function removeComment($line) {
  116. if(false !== ($pos = strrpos($line, self::COMMENT_STRING))) {
  117. return rtrim(substr($line, 0, $pos));
  118. } else {
  119. return $line;
  120. }
  121. }
  122. }
  123. class phpMorphy_Mrd_Section_Accents extends phpMorphy_Mrd_Section {
  124. const UNKNOWN_ACCENT_VALUE = 255;
  125. protected function processLine($line) {
  126. if(substr($line, -1, 1) == ';') {
  127. $line = substr($line, 0, -1);
  128. }
  129. $result = new phpMorphy_Dict_AccentModel($this->getPosition());
  130. $result->import(
  131. new ArrayIterator(
  132. array_map(
  133. array($this, 'processAccentValue'),
  134. explode(';', $line)
  135. )
  136. )
  137. );
  138. return $result;
  139. }
  140. protected function processAccentValue($item) {
  141. $item = (int)$item;
  142. if($item == self::UNKNOWN_ACCENT_VALUE) {
  143. $item = null;
  144. }
  145. return $item;
  146. }
  147. }
  148. class phpMorphy_Mrd_Section_Sessions extends phpMorphy_Mrd_Section {
  149. }
  150. class phpMorphy_Mrd_Section_Prefixes extends phpMorphy_Mrd_Section {
  151. protected function processLine($line) {
  152. $line = $this->iconv($line);
  153. $result = new phpMorphy_Dict_PrefixSet($this->getPosition());
  154. $result->import(
  155. new ArrayIterator(
  156. array_map('trim', explode(',', $line))
  157. )
  158. );
  159. return $result;
  160. }
  161. }
  162. class phpMorphy_Mrd_Section_Lemmas extends phpMorphy_Mrd_Section {
  163. protected function processLine($line) {
  164. //if(6 != count($tokens = array_map('trim', explode(' ', $line)))) {
  165. $line = $this->iconv($line);
  166. if(6 != count($tokens = explode(' ', $line))) {
  167. throw new phpMorphy_Mrd_Exception("Invalid lemma str('$line'), too few tokens");
  168. }
  169. $base = trim($tokens[0]);
  170. if($base === '#') {
  171. $base = '';
  172. }
  173. $lemma = new phpMorphy_Dict_Lemma(
  174. $base, //$this->iconv(trim($tokens[0])), // base
  175. (int)$tokens[1], // flexia_id
  176. (int)$tokens[2] // accent_id
  177. );
  178. if('-' !== $tokens[4]) {
  179. $lemma->setAncodeId($tokens[4]);
  180. }
  181. if('-' !== $tokens[5]) {
  182. $lemma->setPrefixId((int)$tokens[5]);
  183. }
  184. return $lemma;
  185. }
  186. }
  187. class phpMorphy_Mrd_File {
  188. protected
  189. $flexias,
  190. $accents,
  191. $sessions,
  192. $prefixes,
  193. $lemmas
  194. ;
  195. function __construct($fileName, $encoding) {
  196. $line = 0;
  197. $this->initSections($line, $fileName, $encoding);
  198. }
  199. protected function initSections(&$startLine, $fileName, $encoding) {
  200. foreach($this->getSectionsNames() as $sectionName) {
  201. try {
  202. $section = $this->createNewSection(
  203. $sectionName,
  204. $fileName,
  205. $encoding,
  206. $startLine
  207. );
  208. $this->$sectionName = $section;
  209. } catch(Exception $e) {
  210. throw new phpMorphy_Mrd_Exception("Can`t init '$sectionName' section: " . $e->getMessage());
  211. }
  212. }
  213. }
  214. protected function createNewSection($sectionName, $fileName, $encoding, &$lineNo) {
  215. $sect_clazz = $this->getSectionClassName($sectionName);
  216. $section = new $sect_clazz($this->openFile($fileName), $encoding, $lineNo);
  217. $lineNo += $section->getSectionLinesCount();
  218. return $section;
  219. }
  220. protected function getSectionsNames() {
  221. return array(
  222. 'flexias',
  223. 'accents',
  224. 'sessions',
  225. 'prefixes',
  226. 'lemmas'
  227. );
  228. }
  229. protected function openFile($fileName) {
  230. return new SplFileObject($fileName);
  231. }
  232. protected function getSectionClassName($sectionName) {
  233. return 'phpMorphy_Mrd_Section_' . ucfirst(strtolower($sectionName));
  234. }
  235. function __get($propName) {
  236. if(!preg_match('/^\w+_section$/', $propName)) {
  237. throw new phpMorphy_Mrd_Exception("Unsupported prop name given $propName");
  238. }
  239. list($sect_name) = explode('_', $propName);
  240. if(!isset($this->$sect_name)) {
  241. throw new phpMorphy_Mrd_Exception("Invalid section name given $propName");
  242. }
  243. return $this->$sect_name;
  244. }
  245. }