/binary-classifiers/experiments/lowd_meek.py
https://github.com/ftramer/Steal-ML
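"""
An implementation of the Lowd-Meek attack (D. Lowd and C. Meek, "Adversarial
Learning", KDD 2005) against a binary linear classifier: using only label
queries, find a sign witness, push it onto the decision boundary, and recover
the (normalized) weight of every feature by line search.
"""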
__author__ = 'Fan'

import copy
import logging

import numpy as np
from sklearn import svm
from sklearn.datasets import load_svmlight_file

from algorithms.OnlineBase import OnlineBase

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class LordMeek(OnlineBase):
    def __init__(self, target, test_xy, error=None, delta=None):
        self.X_test, self.y_test = test_xy
        super(LordMeek, self).__init__('LM', +1, -1, target, len(self.X_test[0]), 'uniform', error)
        self.e = error
        self.delta = delta

        # determine which label the test set uses for the negative class
        if 0 in self.y_test:
            self.NEG = 0
        elif -1 in self.y_test:
            self.NEG = -1
        else:
            print('Watch out for the test file! Neither 0 nor -1 appears as a label.')
    def find_starters(self):
        """
        Find a pair of instances in the test set, one positive and one negative.
        :return: (x+, x-), a pair of instances
        """
        # predict = 1 if inner(x, coef_) + intercept_ > 0 else NEG
        x_n, x_p = None, None
        x_n_found = False
        x_p_found = False
        for d in self.X_test:
            if x_n_found and x_p_found:
                break
            if self.query(d) == 1 and not x_p_found:
                x_p = d
                x_p_found = True
            elif self.query(d) == self.NEG and not x_n_found:
                x_n = d
                x_n_found = True

        return x_p, x_n
    def find_witness(self):
        """
        Find a sign witness: walk from x+ towards x- one feature at a time
        until the predicted label flips, isolating a dimension f that the
        target classifier actually uses.
        :return: (f, x+, x-) where x+ and x- differ only in feature f
        """
        x_p, x_n = self.find_starters()
        assert x_p is not None and self.query(x_p) == 1
        assert x_n is not None and self.query(x_n) == self.NEG

        dim = len(x_p)
        assert dim == len(x_n)

        for i in range(dim):
            # record the old value, then overwrite feature i with x-'s value
            last_x_p_i = x_p[i]
            x_p[i] = x_n[i]
            if self.query(x_p) == self.NEG:
                # the label flipped: restoring feature i yields a positive twin
                last_x_p = copy.copy(x_p)
                last_x_p[i] = last_x_p_i
                assert self.query(x_p) == self.NEG and self.query(last_x_p) == 1
                logger.debug('witness found for dim %d' % i)
                return i, last_x_p, x_p

        raise RuntimeError('no witness found: the label never flipped')
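    # Why a witness must exist: if every coordinate of x+ were overwritten with
    # the corresponding coordinate of x-, the result would be x- itself, which
    # queries negative. The label therefore flips at some dimension i, and the
    # loop above returns the first such dimension together with the two points
    # that differ only there.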
    def line_search(self, x, i):
        """
        Starting at x (a negative point), search along dimension i for a point
        very close to the decision boundary.
        :param x: starting point
        :param i: dimension to search
        :return: (value for x[i] near the boundary, achieved precision)
        """
        # make sure to start at a negative point
        assert self.query(x) == self.NEG

        # detach from the caller's instance
        new_x = copy.copy(x)

        # phase II: binary search between a positive value l and a negative value r
        def b(l, r):
            # invariant: c(l) = 1 and c(r) = NEG
            m = 0.5 * (l + r)
            new_x[i] = m
            if self.query(new_x) == self.NEG:
                return b(l, m)
            if abs(l - m) < self.e:
                return m, abs(l - m)
            return b(m, r)

        # phase I: exponential exploration, doubling the step in both directions
        init_xi = x[i]
        step = 1.0 / 100
        # TODO not float64 yet
        while step < np.finfo('f').max:
            new_x[i] = init_xi + step
            if self.query(new_x) == 1:
                return b(new_x[i], init_xi)

            new_x[i] = init_xi - step
            if self.query(new_x) == 1:
                return b(new_x[i], init_xi)
            step *= 2
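    # Query cost of line_search: with a doubling step, phase I overshoots a
    # boundary at distance d from the start after O(log(d / 0.01)) queries,
    # and the phase-II bisection then narrows the bracket down to self.e in
    # O(log(d / self.e)) more.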
    def do(self):
        f, sp, sn = self.find_witness()
        sp_f = sp[f]
        sn_f = sn[f]

        # sign of the target's weight along the witness feature f
        w_f = 1.0 * (sp_f - sn_f) / abs(sp_f - sn_f)

        # push the witness pair onto the decision boundary (within self.e)
        x0, _ = self.push_to_b(sn, sp, self.e)

        # get an x1 with gap(x0, x1) = 1 and c(x1) = NEG
        x1 = copy.copy(x0)
        x1[f] -= w_f

        u = np.zeros(len(x0))
        w = np.zeros(len(x0))  # the extracted weight vector, normalized so w[f] = +/-1
        w[f] = w_f
        for i in range(len(x0)):
            if i == f:
                continue
            # unit vector along the i-th dimension
            u[i] = 1.0
            a = np.add(x1, u / self.delta)
            b = np.add(x1, -u / self.delta)
            if self.query(a) == self.query(b):
                # perturbing feature i does not change the label: treat w[i] as 0
                w[i] = 0
            else:
                logger.debug('Line search for dim %d', i)
                new_x_i, _ = self.line_search(x1, i)
                w[i] = 1.0 / (new_x_i - x1[i])
            u[i] = 0.0
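        # Why w[i] = 1 / (new_x_i - x1[i]): x0 lies on the boundary, so in the
        # normalization where w[f] = +/-1 we have w . x0 + b = 0 and hence
        # w . x1 + b = -1. Reaching the boundary again by moving only along
        # dimension i therefore requires w[i] * (new_x_i - x1[i]) = 1.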
        # the intercept is read directly off the target (normalized the same
        # way as w) rather than extracted
        b = self.clf1.intercept_ / self.clf1.coef_[0][f]

        # compare the target and the extracted model on the test set
        error_clf = 0.0
        error_lrn = 0.0
        for test_x, test_y in zip(self.X_test, self.y_test):
            t = 1 if np.inner(w, test_x) + b > 0 else self.NEG
            if t != test_y:
                error_lrn += 1
            if self.clf1.predict([test_x])[0] != test_y:
                error_clf += 1

        pe_clf = 1 - error_clf / len(self.y_test)
        pe_lrn = 1 - error_lrn / len(self.y_test)

        # L_test: accuracy the extracted model loses relative to the target
        print('L_test = %f' % max(pe_clf - pe_lrn, .0))
        print('L_unif = %f' % (0.0,))
if __name__ == '__main__':
    # NOTE: train and test are both loaded from the same file
    X_train, y_train = load_svmlight_file('../targets/diabetes/test.scale', n_features=8)
    X_test, y_test = load_svmlight_file('../targets/diabetes/test.scale', n_features=8)
    X_train = X_train.todense().tolist()
    X_test = X_test.todense().tolist()

    clf = svm.LinearSVC()
    clf.fit(X_train, y_train)
    n_features = len(X_train[0])

    # sweep over error bounds; the perturbation width delta stays fixed
    error_bounds = (1, .1, .01, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7)
    for e in error_bounds:
        delta = 1.0 / 10000
        print('error bound=%f' % e)
        ex = LordMeek(clf, (X_test, y_test), error=e, delta=delta)
        ex.do()
        print('nq=%d' % ex.get_n_query())
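
The weight-recovery loop in do() can be sanity-checked in isolation. Below is a minimal, self-contained sketch of the same idea against a sign oracle for a known hyperplane; it does not use OnlineBase, and the helper names (sign_oracle, search_boundary) plus all constants are illustrative, not part of the repository.

import numpy as np

def sign_oracle(w_true, b_true):
    """Membership oracle: +1 on the positive side of the hyperplane, else -1."""
    return lambda x: 1 if np.inner(w_true, x) + b_true > 0 else -1

def search_boundary(query, x, i, eps=1e-9):
    """Along dimension i, find a value for x[i] within eps of the boundary.
    Phase I doubles a step in both directions; phase II bisects."""
    x = np.array(x, dtype=float)
    start, step = x[i], 0.01
    base = query(x)
    while True:
        for s in (step, -step):
            x[i] = start + s
            if query(x) != base:
                lo, hi = start, start + s  # label(lo) == base, label(hi) flipped
                while abs(hi - lo) > eps:
                    mid = 0.5 * (lo + hi)
                    x[i] = mid
                    if query(x) == base:
                        lo = mid
                    else:
                        hi = mid
                return 0.5 * (lo + hi)
        step *= 2.0

# hidden ground truth; the "attacker" below only ever calls query()
w_true = np.array([2.0, -3.0, 0.5])
b_true = 0.7
query = sign_oracle(w_true, b_true)
f = 0                                    # witness feature, assumed known here

x0 = np.zeros(3)
x0[f] = search_boundary(query, x0, f)    # push x0 onto the boundary
w_f = 1.0 if query(x0 + 1e-6 * np.eye(3)[f]) == 1 else -1.0  # sign of w[f]

x1 = x0.copy()
x1[f] -= w_f                             # now (w / |w[f]|) . x1 + b / |w[f]| = -1

w_hat = np.zeros(3)
w_hat[f] = w_f
for i in range(3):
    if i == f:
        continue
    # the repository code first checks query(x1 + u/delta) != query(x1 - u/delta)
    # to skip zero weights; all weights are nonzero in this toy example
    v = search_boundary(query, x1, i)
    w_hat[i] = 1.0 / (v - x1[i])

print('extracted :', w_hat)
print('normalized:', w_true / abs(w_true[f]))   # should match w_hat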