# /binary-classifiers/experiments/lowd_meek.py
# Source: https://github.com/ftramer/Steal-ML -- Lowd-Meek extraction attack
# against a linear binary classifier.
  1. __author__ = 'Fan'
  2. import copy
  3. import logging
  4. import numpy as np
  5. from sklearn import svm
  6. from sklearn.datasets import load_svmlight_file
  7. logger = logging.getLogger(__name__)
  8. logger.setLevel(logging.INFO)
  9. from algorithms.OnlineBase import OnlineBase
  10. class LordMeek(OnlineBase):
  11. def __init__(self, target, test_xy, error=None, delta=None):
  12. self.X_test, self.y_test = test_xy
  13. super(self.__class__, self).__init__('LM', +1, -1, target, len(self.X_test[0]), 'uniform', error)
  14. self.e = error
  15. self.delta = delta
  16. if 0 in self.y_test:
  17. self.NEG = 0
  18. elif -1 in self.y_test:
  19. self.NEG = -1
  20. else:
  21. print 'Watch out for test file! Neither 0 nor 1 is included!'
  22. def find_starters(self):
  23. """
  24. This function finds a pair of instances. One positive and one negative
  25. :param clf: classifier being extracted
  26. :return: (x+, x-) a pair of instances
  27. """
  28. # perdict = 1 ? inner(x, coef) + intercept_ > 0 : 0
  29. x_n, x_p = (None, None)
  30. x_n_found = False
  31. x_p_found = False
  32. for d in self.X_test:
  33. if x_n_found and x_p_found:
  34. break
  35. if self.query(d) == 1 and (not x_p_found):
  36. x_p = d
  37. x_p_found = True
  38. elif self.query(d) == self.NEG and (not x_n_found):
  39. x_n = d
  40. x_n_found = True
  41. return x_p, x_n
  42. def find_witness(self):
  43. x_p, x_n = self.find_starters()
  44. assert x_p is not None and self.query(x_p) == 1
  45. assert x_n is not None and self.query(x_n) == self.NEG
  46. dim = len(x_p)
  47. assert dim == len(x_n)
  48. last_p = -1
  49. for i in xrange(0, dim):
  50. # record the old value
  51. last_x_p_i = x_p[i]
  52. # change the value
  53. x_p[i] = x_n[i]
  54. if self.query(x_p) == self.NEG:
  55. # if flips
  56. last_x_p = copy.copy(x_p)
  57. last_x_p[i] = last_x_p_i
  58. assert self.query(x_p) == self.NEG and self.query(last_x_p) == 1
  59. logger.debug('witness found for dim %d' % i)
  60. return i, last_x_p, x_p
  61. return None
  62. def line_search(self, x, i):
  63. """
  64. starting at x (a negative point), search along dimension i, find a point very close to boundary
  65. :param x: starting point
  66. :param i: dimension to search
  67. :return: return the point near boundary
  68. """
  69. # make sure to start at a negative point
  70. assert self.query(x) == self.NEG
  71. # detach
  72. new_x = copy.copy(x)
  73. # phase II: binary search between init and x[i]
  74. def b(l, r):
  75. # print 'binary search [%f, %f]' % (l, r)
  76. # c(l) = 1 && c(r) = 0
  77. m = 0.5 * (l + r)
  78. new_x[i] = m
  79. if self.query(new_x) == self.NEG:
  80. return b(l, m)
  81. else:
  82. if abs(l - m) < self.e:
  83. return m, abs(l - m)
  84. return b(m, r)
  85. # phase I: exponential explore
  86. init_xi = x[i]
  87. step = 1.0 / 100
  88. # TODO not float64 yet
  89. while new_x[i] < np.finfo('f').max:
  90. new_x[i] += step
  91. if self.query(new_x) == 1:
  92. return b(new_x[i], init_xi)
  93. new_x[i] = init_xi
  94. new_x[i] -= step
  95. if self.query(new_x) == 1:
  96. return b(new_x[i], init_xi)
  97. step *= 2
  98. def do(self):
  99. f, sp, sn = self.find_witness()
  100. sp_f = sp[f]
  101. sn_f = sn[f]
  102. w_f = 1.0 * (sp_f - sn_f) / abs(sp_f - sn_f)
  103. x0, _ = self.push_to_b(sn, sp, self.e)
  104. # get a x1 with gap(x0,x1) = 1 & c(x1) = 0
  105. x1 = copy.copy(x0)
  106. x1[f] -= w_f
  107. u = np.zeros(len(x0))
  108. w = np.zeros(len(x0)) # target
  109. w[f] = w_f
  110. for i in xrange(0, len(x0)):
  111. if i == f:
  112. continue
  113. # unit vector along the ith dimension
  114. u[i] = 1.0
  115. a = np.add(x1, u / self.delta)
  116. b = np.add(x1, -u / self.delta)
  117. if self.query(a) == self.query(b):
  118. w[i] = 0
  119. else:
  120. logger.debug('Line search for dim %d', i)
  121. new_x_i, err = self.line_search(x1, i)
  122. w[i] = 1.0 / (new_x_i - x1[i])
  123. u[i] = 0.0
  124. b = self.clf1.intercept_ / self.clf1.coef_[0][f]
  125. # print w, b
  126. # print self.clf1.coef_ / self.clf1.coef_[0][f]
  127. # test
  128. error_clf = 0.0
  129. error_lrn = 0.0
  130. for test_x, test_y in zip(self.X_test, self.y_test):
  131. t = 1 if np.inner(w, test_x) + b > 0 else self.NEG
  132. if t != test_y:
  133. error_lrn += 1
  134. if self.clf1.predict(test_x) != test_y:
  135. error_clf += 1
  136. pe_clf = 1 - error_clf/ len(self.y_test)
  137. pe_lrn = 1 - error_lrn/ len(self.y_test)
  138. print 'L_test = %f' % max(pe_clf - pe_lrn, .0)
  139. print 'L_unif = %f' % (0.0,)
  140. if __name__ == '__main__':
  141. X_train, y_train = load_svmlight_file('../targets/diabetes/test.scale', n_features=8)
  142. X_test, y_test = load_svmlight_file('../targets/diabetes/test.scale', n_features=8)
  143. X_train = X_train.todense().tolist()
  144. X_test = X_test.todense().tolist()
  145. clf = svm.LinearSVC()
  146. clf.fit(X_train, y_train)
  147. n_features = len(X_train[0])
  148. deltas = (1, .1, .01, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7)
  149. for e in deltas:
  150. delta = 1.0 / 10000
  151. print 'error bound=%f' % e
  152. ex = LordMeek(clf, (X_test, y_test), error=e, delta=delta)
  153. ex.do()
  154. print 'nq=%d' % (ex.get_n_query())