/application/modules/domas/models/Predictive.php
PHP | 184 lines | 112 code | 17 blank | 55 comment | 5 complexity | e0f8aa3254d3b4b52253120d7a01b04d MD5 | raw file
Possible License(s): GPL-2.0, MIT, Apache-2.0, LGPL-2.1, MPL-2.0-no-copyleft-exception
- <?php
- use NlpTools\Tokenizers\WhitespaceTokenizer;
- use NlpTools\Models\FeatureBasedNB;
- use NlpTools\Classifiers\MultinomialNBClassifier;
- use NlpTools\Documents\TokensDocument;
- use NlpTools\FeatureFactories\DataAsFeatures;
- use NlpTools\Documents\TrainingSet;
/**
 * Domas_Model_Predictive wraps a simple Naive Bayes classifier (NlpTools) to
 * ease its use, its training and its serialization.
 *
 * Training data is read from the zpraba_predictive table. Class labels have
 * the form "<prediction> <status>", where <status> is one of the entries of
 * self::$statusLabels.
 */
class Domas_Model_Predictive extends Zend_Db_Table_Abstract {
    /**
     * Status suffixes appended to a prediction name to build a class label,
     * indexed by the numeric value stored in the "nilai" column
     * (Indonesian: increasing / decreasing / normal).
     *
     * @var array
     */
    protected static $statusLabels = array(0 => "meningkat",
                                           1 => "menurun",
                                           2 => "normal");

    /** @var WhitespaceTokenizer Tokenizer used to split texts into tokens */
    protected $tok;

    /** @var DataAsFeatures Feature factory used for training */
    protected $ff;

    /** @var array Class labels the classifier chooses from */
    protected $languages;

    /** @var FeatureBasedNB Trained Naive Bayes model */
    protected $model;

    /** @var MultinomialNBClassifier Classifier built by train(), used by evaluate() */
    protected $cls;

    protected $_db;

    /**
     * Classify a text against the stored model of a theme.
     *
     * The candidate classes are every distinct prediction stored for the
     * theme/stype pair combined with every status label. The classifier is
     * loaded from
     * APPLICATION_PATH/../public/predictive/<stype>/<tid>.
     *
     * NOTE(review): 'stype' and 'tid' are concatenated into a filesystem path
     * without validation - confirm that callers never pass untrusted values.
     *
     * @param string $code   The text to classify
     * @param array  $params Must contain the keys 'tid' and 'stype'
     * @return string The winning class label ("<prediction> <status>")
     * @throws \RuntimeException If the model file is missing or unreadable, or
     *                           if no prediction exists for the theme/stype
     */
    public function category($code, $params) {
        $data = $this->_db->fetchAll("select distinct(prediksi) as pred from zpraba_predictive where tema=? and stype=?", array($params['tid'], $params['stype']));
        $mod = $this->loadFromFile(APPLICATION_PATH .
            '/../public/predictive/' .
            $params['stype'] .
            "/" .
            $params['tid']);
        $this->tok = new WhitespaceTokenizer();

        // Start from an empty list so that an empty result set cannot leave
        // the class list undefined.
        $classes = array();
        foreach ($data as $v) {
            foreach (self::$statusLabels as $label) {
                $classes[] = $v['pred'] . " " . $label;
            }
        }
        if (!$classes) {
            throw new \RuntimeException("No prediction classes found");
        }
        $this->languages = $classes;

        $doc = new TokensDocument($this->tok->tokenize($code));
        return $mod->classify($this->languages, $doc);
    }

    /**
     * Train a classifier on the zpraba_predictive rows of a theme.
     *
     * Each row contributes one document: its 'text' column tokenized on
     * whitespace, labelled "<Prediksi> <status label of nilai>".
     *
     * NOTE(review): the row key 'Prediksi' is capitalised here while
     * category() selects the column as 'prediksi'; whether the key matches
     * depends on the column name returned by the database - confirm.
     *
     * @param mixed $tid   Value matched against the 'tema' column
     * @param mixed $stype Value matched against the 'stype' column
     * @return MultinomialNBClassifier The trained classifier (also kept in
     *                                 $this->cls so evaluate() can use it)
     */
    public function train($tid, $stype) {
        $this->tok = new WhitespaceTokenizer();
        $this->ff = new DataAsFeatures();
        $training = $this->_db->fetchAll("select * from zpraba_predictive where tema=? and stype=?", array($tid, $stype));
        $this->model = new FeatureBasedNB();
        $tset = new TrainingSet();

        $classes = array();
        foreach ($training as $d) {
            $label = $d['Prediksi'] . " " . self::$statusLabels[$d['nilai']];
            $classes[] = $label;
            $tset->addDocument(
                $label, // class
                new TokensDocument($this->tok->tokenize($d['text'])) // The actual document
            );
        }
        // One label was collected per training row; keep each class once, in
        // order of first appearance.
        $this->languages = array_values(array_unique($classes));

        $this->model->train($this->ff, $tset);
        $this->cls = new MultinomialNBClassifier($this->ff, $this->model);
        return $this->cls;
    }

    /**
     * Evaluate on a directory with the structure explained in function
     * buildTrainingSet
     *
     * @param string $dir
     * @return float The accuracy of the classification (0.0 when the
     *               directory contains no documents)
     * @throws \RuntimeException If $dir is not a directory or no classifier
     *                           has been trained on this instance
     */
    public function evaluate($dir) {
        if (!file_exists($dir) || !is_dir($dir)) {
            throw new \RuntimeException("Can't find evaluation directory");
        }
        if (!$this->cls) {
            throw new \RuntimeException("No classification model defined");
        }

        $tset = self::buildTrainingSet($dir);
        $total = count($tset);
        if ($total === 0) {
            // Nothing to evaluate; avoid dividing by zero.
            return 0.0;
        }

        $correct = 0;
        foreach ($tset as $class => $doc) {
            if ($this->cls->classify($this->languages, $doc) === $class) {
                $correct++;
            }
        }
        return $correct / $total;
    }

    /**
     * Build a training set from a directory using the following convention:
     * The directory should contain one subdirectory for each class. The class
     * name is the subdirectory's base name. Each subdirectory should contain
     * one file for each document.
     *
     * @param string $dir
     * @return TrainingSet
     */
    public static function buildTrainingSet($dir) {
        $tok = new WhitespaceTokenizer();
        $tset = new TrainingSet();
        foreach (new DirectoryIterator($dir) as $d) {
            // Only real subdirectories name a class.
            if ($d->isFile() || $d->isDot()) {
                continue;
            }
            $class = $d->getBasename();
            foreach (new DirectoryIterator($d->getPathname()) as $f) {
                if (!$f->isFile()) {
                    continue;
                }
                $tset->addDocument($class, new TokensDocument($tok->tokenize(file_get_contents($f->getPathname()))));
            }
        }
        return $tset;
    }

    /**
     * The only things to be saved are the languages and the model
     *
     * @return array Names of the properties to serialize
     */
    public function __sleep() {
        return array('languages',
                     'model');
    }

    /**
     * Currently a no-op: the tokenizer, the feature factory and the
     * classifier are NOT rebuilt on wakeup, so they are unset on an
     * unserialized instance until train() is called.
     */
    public function __wakeup() {
    }

    /**
     * To ensure problem free serialization (especially with utf-8 characters)
     * we encode the serialized object in base64
     *
     * @param mixed $instance The value to serialize (e.g. a trained classifier)
     * @return string The base64 encoded serialized instance
     */
    public static function serialize($instance) {
        return base64_encode(serialize($instance));
    }

    /**
     * Decode from base64 and unserialize as usual
     *
     * SECURITY: PHP's unserialize() can instantiate arbitrary classes; only
     * pass data that was produced by self::serialize() and stored in a
     * trusted location.
     *
     * @param string $str
     * @return mixed The unserialized value, or false if $str is not valid
     */
    public static function unserialize($str) {
        return unserialize(base64_decode($str));
    }

    /**
     * Serialize $instance and write it to $file.
     *
     * @param mixed  $instance The value to store
     * @param string $file     Destination path
     * @throws \RuntimeException If the file cannot be written
     */
    public function saveToFile($instance, $file) {
        if (file_put_contents($file, self::serialize($instance)) === false) {
            throw new \RuntimeException("Can't write file");
        }
    }

    /**
     * Read $file and unserialize its content.
     *
     * @param string $file Path of a file written by saveToFile()
     * @return mixed The stored value
     * @throws \RuntimeException If the file is missing, unreadable or does
     *                           not contain valid serialized data
     */
    public function loadFromFile($file) {
        if (!file_exists($file)) {
            throw new \RuntimeException("Can't find file");
        }
        $contents = file_get_contents($file);
        if ($contents === false) {
            throw new \RuntimeException("Can't read file");
        }
        $instance = self::unserialize($contents);
        if ($instance === false) {
            throw new \RuntimeException("Can't unserialize file");
        }
        return $instance;
    }
}