Predictive.php | searchcode

/application/modules/domas/models/Predictive.php

https://bitbucket.org/mad3linux/bigbox
PHP | 184 lines | 112 code | 17 blank | 55 comment | 5 complexity | e0f8aa3254d3b4b52253120d7a01b04d MD5 | raw file
Possible License(s): GPL-2.0, MIT, Apache-2.0, LGPL-2.1, MPL-2.0-no-copyleft-exception

<?php
use NlpTools\Tokenizers\WhitespaceTokenizer;
use NlpTools\Models\FeatureBasedNB;
use NlpTools\Classifiers\MultinomialNBClassifier;
use NlpTools\Documents\TokensDocument;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Documents\TrainingSet;

/**
 * Domas_Model_Predictive wraps a simple Naive Bayes classifier (NlpTools)
 * to ease its use, its training and its serialization.
 *
 * Training rows are read from the zpraba_predictive table. Every class label
 * has the form "<prediction> <status>", where <status> is one of the values
 * of self::$statusLabels.
 */

class Domas_Model_Predictive extends Zend_Db_Table_Abstract {
    /** Tokenizer used to split text into tokens (WhitespaceTokenizer). */
    protected $tok;
    /** Feature factory handed to the model and the classifier (DataAsFeatures). */
    protected $ff;
    /** List of class labels the classifier chooses from. */
    protected $languages;
    /** Trained FeatureBasedNB model. */
    protected $model;
    /** Classifier built by train() or rebuilt by __wakeup(). */
    protected $cls;
    protected $_db;

    /**
     * Maps the numeric "nilai" column to the status suffix of a class label
     * (Indonesian: increasing, decreasing, normal).
     */
    protected static $statusLabels = array(0 => "meningkat",
                                           1 => "menurun",
                                           2 => "normal");

    /**
     * Classify a piece of text with the stored classifier of a theme.
     *
     * The candidate classes are every distinct prediction of the theme /
     * source type combined with every status label. The classifier itself is
     * loaded from public/predictive/<stype>/<tid>.
     *
     * @param  string $code   The text to classify
     * @param  array  $params Must contain the keys 'tid' and 'stype'
     * @return mixed  Whatever the loaded classifier's classify() returns
     *                (presumably the winning class label)
     * @throws \RuntimeException If the classifier file is missing or
     *                           unreadable, or if no candidate class exists
     */
    public function category($code, $params) {
        $data = $this->_db->fetchAll("select distinct(prediksi) as pred from zpraba_predictive where tema=? and stype=?", array($params['tid'], $params['stype']));
        // NOTE(review): 'stype' and 'tid' are concatenated into a filesystem
        // path unchanged - confirm the caller validates them (path traversal).
        $mod = $this->loadFromFile(APPLICATION_PATH . 
                                   '/../public/predictive/' . 
                                   $params['stype'] . 
                                   "/" . 
                                   $params['tid']);
        $this->tok = new WhitespaceTokenizer();

        // Start from an empty list so an empty result set cannot leave the
        // label list undefined.
        $labels = array();

        foreach($data as $row) {

            foreach(self::$statusLabels as $status) {
                $labels[] = $row['pred'] . " " . $status;
            }
        }

        if(!$labels)
            throw new \RuntimeException("No prediction classes found");
        $this->languages = $labels;
        $doc = new TokensDocument($this->tok->tokenize($code));
        return $mod->classify($this->languages, $doc);
    }

    /**
     * Train a Naive Bayes model on the zpraba_predictive rows of a theme.
     *
     * The trained classifier is both stored on the instance (so that
     * evaluate() can use it) and returned.
     *
     * @param  mixed $tid   Value matched against the "tema" column
     * @param  mixed $stype Value matched against the "stype" column
     * @return MultinomialNBClassifier
     */
    public function train($tid, $stype) {
        $this->tok = new WhitespaceTokenizer();
        $this->ff = new DataAsFeatures();
        $training = $this->_db->fetchAll("select * from zpraba_predictive where tema=? and stype=?", array($tid, $stype));
        $this->model = new FeatureBasedNB();
        $tset = new TrainingSet();
        // Labels are collected as array keys so each class is listed once,
        // however many training rows share it.
        $labels = array();

        foreach($training as $d) {
            // NOTE(review): assumes 'nilai' is always 0, 1 or 2 and that the
            // row key is spelled 'Prediksi' - confirm against the table schema.
            $label = $d['Prediksi'] . " " . self::$statusLabels[$d['nilai']];
            $labels[$label] = true;
            $tset->addDocument($label, // class
                               new TokensDocument($this->tok->tokenize($d['text']))); // the actual document
        }
        $this->languages = array_keys($labels);
        $this->model->train($this->ff, $tset);
        $this->cls = new MultinomialNBClassifier($this->ff, $this->model);
        return $this->cls;
    }

    /**
     * Evaluate the current classifier on a directory with the structure
     * explained in function buildTrainingSet.
     *
     * @param  string $dir
     * @return float|int The fraction of correctly classified documents
     *                   (0.0 when the directory holds no documents)
     * @throws \RuntimeException If the directory does not exist or no
     *                           classifier has been trained or restored
     */
    public function evaluate($dir) {
        if(!file_exists($dir)|| !is_dir($dir))
            throw new \RuntimeException("Can't find evaluation directory");
        if(!$this->cls)
            throw new \RuntimeException("No classification model defined");
        $tset = self::buildTrainingSet($dir);
        $total = count($tset);

        // Nothing to evaluate: avoid dividing by zero.
        if($total === 0)
            return 0.0;
        $correct = 0;

        foreach($tset as $class => $doc) {
            if($this->cls->classify($this->languages, $doc) === $class)
                $correct++;
        }
        return $correct / $total;
    }

    /**
     * Build a training set from a directory using the following convention:
     * The directory should contain one subdirectory for each class. The class
     * name is the subdirectory's base name. Each subdirectory should contain
     * one file for each document. Files that cannot be read are skipped.
     *
     * @param  string      $dir
     * @return TrainingSet
     */
    public static function buildTrainingSet($dir) {
        $tok = new WhitespaceTokenizer();
        $tset = new TrainingSet();

        foreach(new DirectoryIterator($dir)as $d) {
            if($d->isFile()|| $d->isDot())
                continue;
            $class = $d->getBasename();

            foreach(new DirectoryIterator($d->getPathname())as $f) {
                if(!$f->isFile())
                    continue;
                $contents = file_get_contents($f->getPathname());

                // An unreadable file would otherwise become an empty document.
                if($contents === false)
                    continue;
                $tset->addDocument($class, new TokensDocument($tok->tokenize($contents)));
            }
        }
        return $tset;
    }

    /**
     * The only things to be saved are the class labels and the model
     */
    public function __sleep() {
        return array('languages',
                     'model');
    }

    /**
     * On wakeup rebuild the tokenizer, the feature factory and - when a
     * trained model was serialized - the classifier. These are the members
     * that __sleep() leaves out.
     */
    public function __wakeup() {
        $this->tok = new WhitespaceTokenizer();
        $this->ff = new DataAsFeatures();

        // Without a model there is nothing to build a classifier from.
        if($this->model instanceof FeatureBasedNB)
            $this->cls = new MultinomialNBClassifier($this->ff, $this->model);
    }

    /**
     * To ensure problem free serialization (especially with utf-8 characters)
     * we encode the serialized object in base64
     *
     * @param  mixed  $instance The value to serialize
     * @return string The base64 encoded serialized value
     */
    public static function serialize($instance) {
        return base64_encode(serialize($instance));
    }

    /**
     * Decode from base64 and unserialize as usual
     *
     * SECURITY: unserialize() instantiates arbitrary classes; only pass data
     * that was produced by self::serialize() and stored in a trusted place.
     *
     * @param  string $str
     * @return mixed  The restored value, or false when $str cannot be
     *                unserialized
     */
    public static function unserialize($str) {
        return unserialize(base64_decode($str));
    }

    /**
     * Serialize a value and write it to a file.
     *
     * @param  mixed  $instance The value to store
     * @param  string $file     Destination path
     * @throws \RuntimeException If the file cannot be written
     */
    public function saveToFile($instance, $file) {
        if(file_put_contents($file, self::serialize($instance)) === false)
            throw new \RuntimeException("Can't write file");
    }

    /**
     * Read a file written by saveToFile() and restore the stored value.
     *
     * @param  string $file Source path
     * @return mixed  The restored value
     * @throws \RuntimeException If the file is missing, unreadable or does
     *                           not contain a valid serialized value
     */
    public function loadFromFile($file) {
        if(!file_exists($file))
            throw new \RuntimeException("Can't find file");
        $contents = file_get_contents($file);

        if($contents === false)
            throw new \RuntimeException("Can't read file");
        $instance = self::unserialize($contents);

        // unserialize() signals corrupt data by returning false.
        if($instance === false)
            throw new \RuntimeException("Can't unserialize file");
        return $instance;
    }
}