PageRenderTime 53ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 0ms

/application/modules/domas/models/Predictive.php

https://bitbucket.org/mad3linux/bigbox
PHP | 184 lines | 112 code | 17 blank | 55 comment | 5 complexity | e0f8aa3254d3b4b52253120d7a01b04d MD5 | raw file
Possible License(s): GPL-2.0, MIT, Apache-2.0, LGPL-2.1, MPL-2.0-no-copyleft-exception
  1. <?php
  2. use NlpTools\Tokenizers\WhitespaceTokenizer;
  3. use NlpTools\Models\FeatureBasedNB;
  4. use NlpTools\Classifiers\MultinomialNBClassifier;
  5. use NlpTools\Documents\TokensDocument;
  6. use NlpTools\FeatureFactories\DataAsFeatures;
  7. use NlpTools\Documents\TrainingSet;
  /**
   * Trend classifier for the "predictive" feature of the Domas module.
   *
   * Wraps an NlpTools multinomial Naive Bayes classifier to ease its training
   * (from rows of the zpraba_predictive table), its use and its serialization.
   * Every class label has the form "<prediction> <trend>", where <trend> is
   * one of "meningkat", "menurun" or "normal" (Indonesian for increasing,
   * decreasing and normal).
   */
  class Domas_Model_Predictive extends Zend_Db_Table_Abstract
  {
      /** @var WhitespaceTokenizer Tokenizer applied to every text before classification. */
      protected $tok;

      /** @var DataAsFeatures Feature factory used while training. */
      protected $ff;

      /** @var array Class labels ("<prediction> <trend>") offered to the classifier. */
      protected $languages;

      /** @var FeatureBasedNB Naive Bayes model produced by train(). */
      protected $model;

      /** @var MultinomialNBClassifier|null Classifier built by the last call to train(). */
      protected $cls;

      /**
       * Database adapter used for the zpraba_predictive queries.
       * NOTE(review): presumably populated by Zend_Db_Table_Abstract; confirm.
       */
      protected $_db;

      /**
       * Map of the numeric trend codes stored in the "nilai" column to the
       * trend word used as the second half of a class label.
       *
       * @return array
       */
      private static function trendLabels()
      {
          return array(
              0 => "meningkat",
              1 => "menurun",
              2 => "normal",
          );
      }

      /**
       * Classify a text with the classifier previously saved for a theme.
       *
       * The candidate classes are every distinct prediction stored for the
       * theme/source type, combined with each of the three trend words. The
       * classifier itself is loaded from
       * APPLICATION_PATH/../public/predictive/<stype>/<tid>.
       *
       * @param string $code   The text to classify
       * @param array  $params Must contain the keys 'tid' (theme id) and 'stype' (source type)
       * @return string The winning class label, "<prediction> <trend>"
       * @throws \RuntimeException If the saved classifier is missing or unreadable,
       *                           or if no prediction exists for the theme/source type
       */
      public function category($code, $params)
      {
          $rows = $this->_db->fetchAll(
              "select distinct(prediksi) as pred from zpraba_predictive where tema=? and stype=?",
              array($params['tid'], $params['stype'])
          );

          // NOTE(review): the path is built from caller-supplied values; make sure
          // 'stype' and 'tid' are validated upstream so they cannot escape the
          // predictive directory.
          $classifier = $this->loadFromFile(
              APPLICATION_PATH . '/../public/predictive/' . $params['stype'] . "/" . $params['tid']
          );

          $trends = self::trendLabels();
          $labels = array();
          foreach ($rows as $row) {
              foreach ($trends as $trend) {
                  $labels[] = $row['pred'] . " " . $trend;
              }
          }

          // Without candidate classes there is nothing to choose from; fail with
          // a clear message instead of handing an undefined value to classify().
          if (!$labels) {
              throw new \RuntimeException("No prediction classes found for the given theme and source type");
          }

          $this->tok = new WhitespaceTokenizer();
          $this->languages = $labels;

          $document = new TokensDocument($this->tok->tokenize($code));

          return $classifier->classify($this->languages, $document);
      }

      /**
       * Train a classifier on the zpraba_predictive rows of a theme.
       *
       * Each row becomes one training document: its "text" column is the
       * document and "<Prediksi> <trend>" is its class, the trend word being
       * looked up from the row's "nilai" code. Rows whose "nilai" is not a
       * known trend code are skipped, because they cannot be mapped to a
       * well-formed class label.
       *
       * The resulting classifier is also kept on the instance so that
       * evaluate() can use it.
       *
       * @param mixed $tid   Theme id (matched against the "tema" column)
       * @param mixed $stype Source type (matched against the "stype" column)
       * @return MultinomialNBClassifier The trained classifier
       */
      public function train($tid, $stype)
      {
          $this->tok = new WhitespaceTokenizer();
          $this->ff = new DataAsFeatures();
          $this->model = new FeatureBasedNB();

          $rows = $this->_db->fetchAll(
              "select * from zpraba_predictive where tema=? and stype=?",
              array($tid, $stype)
          );

          $trends = self::trendLabels();
          $tset = new TrainingSet();
          $labels = array();

          foreach ($rows as $row) {
              $trendCode = isset($row['nilai']) ? $row['nilai'] : null;
              if ($trendCode === null || !isset($trends[$trendCode])) {
                  continue;
              }

              // NOTE(review): this reads the column as 'Prediksi' while the SQL in
              // category() names it 'prediksi'; the key case depends on the table
              // definition / driver settings -- confirm.
              $label = $row['Prediksi'] . " " . $trends[$trendCode];
              $labels[] = $label;

              $tset->addDocument(
                  $label, // class
                  new TokensDocument($this->tok->tokenize($row['text'])) // the actual document
              );
          }

          // Several rows usually share a class; keep each label only once.
          $this->languages = array_values(array_unique($labels));

          $this->model->train($this->ff, $tset);
          $this->cls = new MultinomialNBClassifier($this->ff, $this->model);

          return $this->cls;
      }

      /**
       * Evaluate the trained classifier on a directory with the structure
       * explained in function buildTrainingSet.
       *
       * @param string $dir
       * @return float The accuracy of the classification (0.0 for an empty set)
       * @throws \RuntimeException If the directory does not exist or train()
       *                           has not been called on this instance
       */
      public function evaluate($dir)
      {
          if (!file_exists($dir) || !is_dir($dir)) {
              throw new \RuntimeException("Can't find evaluation directory");
          }
          if (!$this->cls) {
              throw new \RuntimeException("No classification model defined");
          }

          $tset = self::buildTrainingSet($dir);
          $total = count($tset);
          if ($total === 0) {
              // Nothing to evaluate; avoid dividing by zero.
              return 0.0;
          }

          $correct = 0;
          foreach ($tset as $class => $doc) {
              if ($this->cls->classify($this->languages, $doc) == $class) {
                  $correct++;
              }
          }

          return $correct / $total;
      }

      /**
       * Build a training set from a directory using the following convention:
       * The directory should contain one subdirectory for each class. The class
       * name is the subdirectory's base name. Each subdirectory should contain
       * one file for each document.
       *
       * @param string $dir
       * @return TrainingSet
       * @throws \RuntimeException If a document file cannot be read
       */
      public static function buildTrainingSet($dir)
      {
          $tokenizer = new WhitespaceTokenizer();
          $tset = new TrainingSet();

          foreach (new DirectoryIterator($dir) as $classDir) {
              if ($classDir->isFile() || $classDir->isDot()) {
                  continue;
              }

              $class = $classDir->getBasename();
              foreach (new DirectoryIterator($classDir->getPathname()) as $file) {
                  if (!$file->isFile()) {
                      continue;
                  }

                  $contents = file_get_contents($file->getPathname());
                  if ($contents === false) {
                      throw new \RuntimeException("Can't read file");
                  }

                  $tset->addDocument($class, new TokensDocument($tokenizer->tokenize($contents)));
              }
          }

          return $tset;
      }

      /**
       * The only things to be saved are the class labels and the model.
       *
       * @return array Names of the properties to serialize
       */
      public function __sleep()
      {
          return array('languages', 'model');
      }

      /**
       * Intentionally a no-op: nothing is rebuilt on wakeup. category() and
       * train() create the tokenizer, feature factory and classifier they need.
       */
      public function __wakeup()
      {
      }

      /**
       * To ensure problem free serialization (especially with utf-8 characters)
       * we encode the serialized object in base64.
       *
       * @param mixed $instance The object to serialize (category() expects the
       *                        classifier returned by train())
       * @return string The base64-encoded serialized instance
       */
      public static function serialize($instance)
      {
          return base64_encode(serialize($instance));
      }

      /**
       * Decode from base64 and unserialize as usual.
       *
       * SECURITY: unserialize() instantiates arbitrary classes named in the
       * payload; only pass data that this application wrote itself.
       *
       * @param string $str
       * @return mixed The unserialized value, or false if $str cannot be unserialized
       */
      public static function unserialize($str)
      {
          return unserialize(base64_decode($str));
      }

      /**
       * Serialize an instance and write it to a file.
       *
       * @param mixed  $instance The object to store
       * @param string $file     Destination path
       * @throws \RuntimeException If the file cannot be written
       */
      public function saveToFile($instance, $file)
      {
          if (file_put_contents($file, self::serialize($instance)) === false) {
              throw new \RuntimeException("Can't write file");
          }
      }

      /**
       * Load an instance previously stored with saveToFile().
       *
       * @param string $file Source path
       * @return mixed The unserialized instance
       * @throws \RuntimeException If the file is missing, unreadable or does
       *                           not contain a valid serialized instance
       */
      public function loadFromFile($file)
      {
          if (!file_exists($file)) {
              throw new \RuntimeException("Can't find file");
          }

          $raw = file_get_contents($file);
          if ($raw === false) {
              throw new \RuntimeException("Can't read file");
          }

          $instance = self::unserialize($raw);
          if ($instance === false) {
              // A corrupt file would otherwise surface later as a call on false.
              throw new \RuntimeException("Can't unserialize file");
          }

          return $instance;
      }
  }