/src/R/model/util.R

http://github.com/beechung/Latent-Factor-Models

### Copyright (c) 2011, Yahoo! Inc. All rights reserved.
### Copyrights licensed under the New BSD License. See the accompanying LICENSE file for terms.
###
### Author: Bee-Chung Chen
###
### Find (theta1, theta2) that minimize the following loss
###
###   loss = sum_i E_{y,z}[ (y[i] - theta1' z[i,] - theta2' x[i,])^2 ]
###        = E[sum_i y[i]^2] - 2 theta' a + theta' A theta,
###
### where theta = c( theta1, theta2 )
###       a = c( E[ sum_i z[i,]*y[i] ],  sum_i x[i,]*E[y[i]] );
###       A = rbind(cbind( E[ sum_i z[i,] z[i,]' ],  sum_i E[z[i,]] x[i,]' ),
###                 cbind( sum_i x[i,] E[z[i,]]',    sum_i x[i,] x[i,]'   ))
###
### Log-likelihood = -(1/2) * ( N log(var_y) + loss / var_y )
###
### INPUT:
###   y[i]     = E[y[i]]
###   sum.y.sq = E[sum_i y[i]^2]
###   z[i,]    = E[z[i,]]
###   sum.zz   = E[ sum_i z[i,] z[i,]' ]
###   x[i,]    = x[i,]
###   sum.zy   = E[ sum_i z[i,]*y[i] ]
###   num      = number of cases involved in sum_i
###
###   subset: A vector of indices for i
###
###   algo: the fitting algorithm to be used to fit theta2.
###         In this case, theta2' x[i,] = theta2(x[i,]); and
###         theta2 will be fitted first (then treated as a constant when fitting theta1).
###
### OUTPUT: coeff.z=theta1, coeff.x=theta2, loss
###
fit.E.gaussin.loglik <- function(
    y, sum.y.sq, x, z=NULL, sum.zz=NULL, sum.zy=NULL, num, subset=NULL,
    lambda=0.1, algo=NULL, algo.control=NULL
){
    nCases = length(y);  nFeatures = ncol(x);
    if(!is.null(z)){
        if(is.matrix(z)) nFactors = ncol(z)
        else if(is.vector(z)) nFactors = 1
        else stop("z is not matrix or vector");
        if(nFactors > 1 && nrow(z) != nCases) stop("nrow(z) != nCases");
        if(nFactors == 1 && length(z) != nCases) stop("length(z) != nCases");
        if(length(sum.zy) != nFactors) stop("length(sum.zy) != nFactors");
        if(nFactors > 1 && any(dim(sum.zz) != c(nFactors, nFactors))) stop("any(dim(sum.zz) != c(nFactors, nFactors))");
        if(nFactors == 1 && length(sum.zz) != 1) stop("length(sum.zz) != 1");
    }else{
        nFactors = 0;
        if(!is.null(sum.zz)) stop("!is.null(sum.zz)");
        if(!is.null(sum.zy)) stop("!is.null(sum.zy)");
    }
    if(nrow(x) != nCases) stop("nrow(x) != nCases");
    if(length(sum.y.sq) != 1) stop("length(sum.y.sq) != 1");
    if(!is.null(subset)){
        if(is.logical(subset)){
            if(length(subset) != nCases) stop("is.logical(subset) && length(subset) != nCases");
            if(all(subset)) subset = NULL; # no need to select
        }else if(length(subset) == nCases){
            if(all(subset == 1:nCases)) subset = NULL; # no need to select
        }
    }
    if(!is.null(subset)){
        y = y[subset];  x = x[subset,,drop=FALSE];
        # z may be a plain vector (nFactors == 1); subset it accordingly
        if(!is.null(z)) z = if(is.matrix(z)) z[subset,,drop=FALSE] else z[subset];
    }
    if(length(y) != num) stop("length(y) != num");
    if(is.null(algo)){
        a = c(drop(sum.zy), drop(t(x)%*%y));
        A = matrix(NA, nrow=nFactors+nFeatures, ncol=nFactors+nFeatures);
        if(!is.null(z)){
            A[1:nFactors,1:nFactors] = sum.zz;
            zx = t(z) %*% x;
            A[1:nFactors,nFactors+(1:nFeatures)] = zx;
            A[nFactors+(1:nFeatures),1:nFactors] = t(zx);
        }
        A[nFactors+(1:nFeatures),nFactors+(1:nFeatures)] = t(x) %*% x;
        A.inv = solve(A + lambda*diag(nFactors+nFeatures));
        theta = drop(A.inv %*% a);
        out = list(
            coeff.z = if(nFactors>0) theta[1:nFactors] else NULL,
            coeff.x = theta[nFactors+(1:nFeatures)],
            loss    = sum.y.sq - 2 * t(theta) %*% a + t(theta) %*% A %*% theta
        );
    }else{
        # reg.train() returns list(coeff, loss, num, var); keep only the fitted
        # model so that reg.predict() can recognize it
        theta2   = reg.train(x=x, y=y, algo=algo, control=algo.control)$coeff;
        theta2.x = reg.predict(model=theta2, x=x, algo=algo);
        if(!is.null(z)){
            E.diff = sum.zy - t(z)%*%theta2.x;
            theta1 = solve(sum.zz + lambda*diag(nFactors)) %*% E.diff;
            loss   = sum.y.sq - 2*sum(y*theta2.x) + sum(theta2.x^2) - 2*t(theta1)%*%E.diff + t(theta1)%*%sum.zz%*%theta1;
        }else{
            theta1 = NULL;
            loss   = sum.y.sq - 2*sum(y*theta2.x) + sum(theta2.x^2);
        }
        out = list(coeff.z=theta1, coeff.x=theta2, loss=loss);
    }
    return(out);
}
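
### A minimal usage sketch (not part of the original file): when z is fully
### observed, the posterior moments reduce to plain sums, and the call amounts
### to ridge regression of y on cbind(z, x). Wrapped in if(FALSE) so sourcing
### this file does not run it.
if(FALSE){
    set.seed(1);
    n = 100;
    x = cbind(1, matrix(rnorm(n*2), n, 2));
    z = matrix(rnorm(n), n, 1);
    y = drop(z %*% 0.5 + x %*% c(1, -1, 2) + rnorm(n, sd=0.1));
    fit = fit.E.gaussin.loglik(
        y=y, sum.y.sq=sum(y^2), x=x, z=z,
        sum.zz=t(z)%*%z, sum.zy=drop(t(z)%*%y), num=n, lambda=0.1
    );
    fit$coeff.z;  # approximately 0.5
    fit$coeff.x;  # approximately (1, -1, 2)
}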
loss.E.gaussin.loglik <- function(
    y, sum.y.sq, x, coeff.x, z=NULL, coeff.z=NULL,
    sum.zz=NULL, sum.zy=NULL, num, subset=NULL, algo=NULL
){
    nCases = length(y);  nFeatures = ncol(x);
    if(!is.null(z)){
        if(is.matrix(z)) nFactors = ncol(z)
        else if(is.vector(z)) nFactors = 1
        else stop("z is not matrix or vector");
        if(nFactors > 1 && nrow(z) != nCases) stop("nrow(z) != nCases");
        if(nFactors == 1 && length(z) != nCases) stop("length(z) != nCases");
        if(nFactors > 1 && any(dim(sum.zz) != c(nFactors, nFactors))) stop("any(dim(sum.zz) != c(nFactors, nFactors))");
        if(nFactors == 1 && length(sum.zz) != 1) stop("length(sum.zz) != 1");
        if(length(sum.zy) != nFactors) stop("length(sum.zy) != nFactors");
    }else{
        nFactors = 0;
        if(!is.null(sum.zz)) stop("!is.null(sum.zz)");
        if(!is.null(sum.zy)) stop("!is.null(sum.zy)");
        if(!is.null(coeff.z)) stop("!is.null(coeff.z)");
    }
    if(nrow(x) != nCases) stop("nrow(x) != nCases");
    if(length(sum.y.sq) != 1) stop("length(sum.y.sq) != 1");
    if(!is.null(subset)){
        if(is.logical(subset)){
            if(length(subset) != nCases) stop("is.logical(subset) && length(subset) != nCases");
            if(all(subset)) subset = NULL; # no need to select
        }else if(length(subset) == nCases){
            if(all(subset == 1:nCases)) subset = NULL; # no need to select
        }
    }
    if(!is.null(subset)){
        y = y[subset];  x = x[subset,,drop=FALSE];
        # z may be a plain vector (nFactors == 1); subset it accordingly
        if(!is.null(z)) z = if(is.matrix(z)) z[subset,,drop=FALSE] else z[subset];
    }
    if(length(y) != num) stop("length(y) != num");
    pred.x = reg.predict(model=coeff.x, x=x, algo=algo);
    ans = sum.y.sq - 2*sum(pred.x * y);
    if(!is.null(z)){
        ans = ans - 2*t(coeff.z)%*%sum.zy;
        ans = ans + t(coeff.z) %*% sum.zz %*% coeff.z;
        ans = ans + 2 * t(coeff.z) %*% t(z) %*% pred.x;
    }
    ans = ans + sum(pred.x^2);
    return(ans);
}
E.gaussin.loglik <- function(
    y, var_y, sum.y.sq, x, coeff.x, z=NULL, coeff.z=NULL,
    sum.zz=NULL, sum.zy=NULL, num, subset=NULL, algo=NULL
){
    # sanity checks are performed inside loss.E.gaussin.loglik
    loss = loss.E.gaussin.loglik(
        coeff.z=coeff.z, coeff.x=coeff.x, y=y, z=z, x=x,
        sum.y.sq=sum.y.sq, sum.zz=sum.zz, sum.zy=sum.zy, num=num, subset=subset, algo=algo
    );
    loglik = -(1/2)*(num*log(var_y) + loss / var_y);
    return(loglik);
}
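
### Continuing the sketch after fit.E.gaussin.loglik (hypothetical data from
### that block): plug the fitted coefficients back in to get the expected
### log-likelihood -(1/2)*(N*log(var_y) + loss/var_y), using the plug-in
### variance estimate loss/N.
if(FALSE){
    var_y = drop(fit$loss) / n;
    ll = E.gaussin.loglik(
        y=y, var_y=var_y, sum.y.sq=sum(y^2), x=x, coeff.x=fit$coeff.x,
        z=z, coeff.z=fit$coeff.z, sum.zz=t(z)%*%z, sum.zy=drop(t(z)%*%y), num=n
    );
}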
###
### Find the beta and var_y that maximize
###   -(1/2) * sum_i E[ log(var_y) + (w_i/var_y)*(y_i - beta'x_i)^2 ]
### i.e., minimize
###   sum(w_i > 0) * log(var_y) + (1/var_y) * loss,
### where loss = sum_i w_i*{ (E[y_i] - beta'E[x_i])^2 + beta'Var[x_i]beta - 2beta'Cov[y_i,x_i] + Var[y_i] }
###
### Let XX = sum_i { w_i * (Var[x_i] + E[x_i]E[x_i]') }
###     XY = sum_i { w_i * (E[x_i]E[y_i] + Cov[y_i,x_i]) }
### The solution is
###   beta  = XX^-1 XY
###   var_y = loss / sum(w_i > 0)
###         = (beta' XX beta - 2 beta' XY + sum_i w_i*{E[y_i]^2 + Var[y_i]}) / sum(w_i > 0)
###
### INPUT:
###   response.mean[i] = E[y_i]
###   response.var[i]  = Var[y_i]
###   feature.mean[i,] = E[x_i]
###   feature.var[i,,] = Var[x_i]  or  diag(feature.var[i,]) = Var[x_i]
###   feature.cov[i,]  = Cov[y_i,x_i]
###   weight[i]        = w_i
### NOTE:
###   * Let feature.var[i,,] be a d x d matrix and let feature.mean[i,] have length n.
###     If n > d, then feature.var corresponds to the first d features, and the other
###     features have 0 variance.
###   * Let feature.cov[i,] have length d and feature.mean[i,] have length n.
###     If n > d, then feature.cov corresponds to the first d features, and the other
###     features have 0 covariance.
###
### OUTPUT:
###   out$coeff = beta
###   out$loss  = loss
###   out$num   = sum(w_i > 0)
###   out$var   = var_y = loss / num
###
fit.lm.random.effect <- function(
    response.mean, feature.mean,
    response.var=NULL, feature.var=NULL, feature.cov=NULL, weight=NULL,
    lambda=0 # lambda for ridge regression
){
    nFeatures = ncol(feature.mean);
    if(length(response.mean) != nrow(feature.mean)) stop("length(response.mean) != nrow(feature.mean)");
    if(!is.null(response.var) && length(response.mean) != length(response.var)) stop("length(response.mean) != length(response.var)");
    if(!is.null(feature.cov) && length(response.mean) != nrow(feature.cov)) stop("length(response.mean) != nrow(feature.cov)");
    if(!is.null(feature.cov) && ncol(feature.cov) > nFeatures) stop("ncol(feature.cov) > nFeatures");
    if(!is.null(response.var) && any(response.var < 0)) stop("Some response.var < 0");
    if(!is.null(feature.var) && any(feature.var < 0)) stop("Some feature.var < 0");
    if(is.null(lambda) || is.na(lambda)) lambda = 0;
    if(!is.null(weight)){
        if(length(weight) != length(response.mean)) stop("length(weight) != length(response.mean)");
    }else{
        weight = rep(1, length(response.mean));
    }
    if(any(weight < 0)) stop("any(weight < 0)");
    Var.X = matrix(0, nrow=nFeatures, ncol=nFeatures);
    if(is.null(feature.var)){
        # do nothing
    }else if(is.vector(feature.var)){
        if(length(feature.var) != nrow(feature.mean)) stop("length(feature.var) != nrow(feature.mean)");
        Var.X = sum(feature.var * weight) * diag(nFeatures);
    }else if(length(dim(feature.var)) == 2){
        if(nrow(feature.var) != nrow(feature.mean)) stop("nrow(feature.var) != nrow(feature.mean)");
        if(ncol(feature.var) > ncol(feature.mean)) stop("ncol(feature.var) > ncol(feature.mean)")
        else if(!is.null(feature.cov) && ncol(feature.var) != ncol(feature.cov)) stop("ncol(feature.var) != ncol(feature.cov)");
        v = apply(feature.var * weight, 2, sum);
        d = ncol(feature.var);
        temp = diag(v, nrow=d, ncol=d);
        Var.X[1:d,1:d] = temp;
    }else if(length(dim(feature.var)) == 3){
        if(dim(feature.var)[1] != nrow(feature.mean)) stop("dim(feature.var)[1] != nrow(feature.mean)");
        if(dim(feature.var)[2] > ncol(feature.mean)) stop("dim(feature.var)[2] > ncol(feature.mean)")
        else if(!is.null(feature.cov) && dim(feature.var)[2] != ncol(feature.cov)) stop("dim(feature.var)[2] != ncol(feature.cov)");
        if(dim(feature.var)[3] != dim(feature.var)[2]) stop("dim(feature.var)[3] != dim(feature.var)[2]");
        d = dim(feature.var)[2];
        temp = apply(feature.var * weight, c(2,3), sum);
        Var.X[1:d,1:d] = temp;
    }else stop("feature.var error");
    feature.mean.t.W = t(feature.mean * weight);
    E.X.E.X = feature.mean.t.W %*% feature.mean;
    XX = Var.X + E.X.E.X + lambda * diag(nFeatures);
    Cov.Y.X = rep(0, nFeatures);
    if(!is.null(feature.cov)){
        d = ncol(feature.cov);
        temp = apply(feature.cov * weight, 2, sum);
        Cov.Y.X[1:d] = temp;
    }
    E.X.E.Y = feature.mean.t.W %*% response.mean;
    XY = drop(E.X.E.Y + Cov.Y.X);
    beta = drop(solve(XX) %*% XY);
    if(is.null(response.var)){
        E.y2 = sum(response.mean^2 * weight);
    }else{
        E.y2 = sum((response.mean^2 + response.var) * weight);
    }
    loss = drop(t(beta) %*% (Var.X + E.X.E.X) %*% beta - 2 * t(beta) %*% XY + E.y2);
    num = sum(weight > 0);
    var_y = loss / num;
    if(var_y < 0){
        loss.2 = loss.lm.random.effect(
            coeff=beta, response.mean=response.mean, feature.mean=feature.mean,
            response.var=response.var, feature.var=feature.var, feature.cov=feature.cov, weight=weight
        );
        stop("Variance = ",var_y," < 0 (loss: ",loss," vs ",loss.2,")");
    }
    out = list(coeff=beta, loss=loss, num=num, var=var_y);
    return(out);
}
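
### A minimal usage sketch (not part of the original file): features observed
### with known per-coordinate noise variance, passed through the 2-d (diagonal)
### form of feature.var. With feature.var = NULL this reduces to (weighted)
### least squares.
if(FALSE){
    set.seed(2);
    n = 200;
    X = cbind(1, rnorm(n));
    y = drop(X %*% c(0.5, 2)) + rnorm(n, sd=0.2);
    fv = cbind(rep(0, n), rep(0.01, n));  # Var[x_i1]=0 (intercept), Var[x_i2]=0.01
    fit = fit.lm.random.effect(response.mean=y, feature.mean=X, feature.var=fv);
    fit$coeff;  # shrunk relative to lm(y ~ 0 + X) because of the feature noise
    fit$var;    # plug-in estimate of var_y
}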
### loss = sum_i w_i*{ (E[y_i] - beta'E[x_i])^2 + beta'Var[x_i]beta - 2beta'Cov[y_i,x_i] + Var[y_i] }
loss.lm.random.effect <- function(
    coeff, response.mean, feature.mean,
    response.var=NULL, feature.var=NULL, feature.cov=NULL, weight=NULL
){
    nFeatures = ncol(feature.mean);
    nObs = length(response.mean);
    if(nObs != nrow(feature.mean)) stop("nObs != nrow(feature.mean)");
    if(!is.null(response.var) && nObs != length(response.var)) stop("nObs != length(response.var)");
    if(!is.null(feature.cov) && nObs != nrow(feature.cov)) stop("nObs != nrow(feature.cov)");
    if(!is.null(feature.cov) && ncol(feature.cov) > nFeatures) stop("ncol(feature.cov) > nFeatures (",ncol(feature.cov)," vs ",nFeatures,")");
    if(length(coeff) != nFeatures) stop("length(coeff) != nFeatures");
    if(!is.null(weight)){
        if(length(weight) != nObs) stop("length(weight) != nObs");
    }else weight = rep(1, nObs);
    if(any(weight < 0)) stop("any(weight < 0)");
    if(is.null(response.var)) response.var = 0;
    err.square = drop((response.mean - feature.mean %*% coeff)^2);
    if(is.null(feature.var)){
        beta.V.beta = 0;
    }else if(is.vector(feature.var)){
        if(length(feature.var) != nObs) stop("length(feature.var) != nObs");
        beta.V.beta = feature.var * sum(coeff^2);
    }else if(length(dim(feature.var)) == 2){
        if(nrow(feature.var) != nObs) stop("nrow(feature.var) != nObs");
        if(ncol(feature.var) > nFeatures) stop("ncol(feature.var) > nFeatures");
        d = ncol(feature.var);
        beta.V.beta = drop(coeff[1:d] %*% (t(feature.var) * coeff[1:d]));
    }else if(length(dim(feature.var)) == 3){
        if(dim(feature.var)[1] != nObs) stop("dim(feature.var)[1] != nObs");
        if(dim(feature.var)[2] > nFeatures) stop("dim(feature.var)[2] > nFeatures");
        if(dim(feature.var)[3] != dim(feature.var)[2]) stop("dim(feature.var)[3] != dim(feature.var)[2]");
        d = dim(feature.var)[2];
        temp = rep(coeff[1:d], each=nObs*d);            # coeff[k] spread over the first two array dims
        V.beta = apply(feature.var*temp, c(1,2), sum);  # V.beta[i,j] = sum_k Var[x_i][j,k]*coeff[k]
        beta.V.beta = drop(V.beta %*% coeff[1:d]);
    }else stop("feature.var error");
    beta.Cov = 0;
    if(!is.null(feature.cov)) beta.Cov = drop(feature.cov %*% coeff[1:ncol(feature.cov)]);
    loss = sum( weight * (err.square + beta.V.beta - 2*beta.Cov + response.var) );
    return(loss);
}
### E[loglikelihood]
###  = -(1/2) * sum_i E[ log(var_y) + (w_i/var_y)*(y_i - beta'x_i)^2 ]
###  = -(1/2) * ( sum(w_i > 0) * log(var_y) + (1/var_y) * loss ),
### where loss = sum_i w_i*{ (E[y_i] - beta'E[x_i])^2 + beta'Var[x_i]beta - 2beta'Cov[y_i,x_i] + Var[y_i] }
Eloglik.lm.random.effect <- function(
    coeff, var, response.mean, feature.mean,
    response.var=NULL, feature.var=NULL, feature.cov=NULL, weight=NULL,
    algo=NULL
){
    nObs = length(response.mean);
    if(length(var) != 1){
        if(!is.null(weight)) stop("length(var) != 1");
        sum_log_var = sum(log(var));
        weight = 1/var;
        var = 1;
    }else{
        if(is.null(weight)) weight = rep(1, nObs);
        sum_log_var = sum(weight > 0) * log(var);
    }
    if(any(weight < 0)) stop("any(weight < 0)");
    if(length(weight) != nObs) stop("length(weight) != nObs");
    if(is.null(algo)){
        loss = loss.lm.random.effect(
            coeff=coeff, response.mean=response.mean, feature.mean=feature.mean,
            response.var=response.var, feature.var=feature.var, feature.cov=feature.cov, weight=weight
        );
    }else{
        if(!is.null(feature.var) || !is.null(feature.cov)) stop("When algo is not NULL, you cannot specify feature.var or feature.cov");
        loss = reg.loss(model=coeff, x=feature.mean, y=response.mean, algo=algo, y.var=response.var, weight=weight);
    }
    loglik = -(1/2)*(sum_log_var + loss / var);
    return(loglik);
}
###
### model = NA would predict 0 for any input
###
reg.predict <- function(model, x, algo, ncol=NULL){
    if(is.null(nrow(x))) stop("is.null(nrow(x))");
    if(is.null(ncol)){
        if(length(model) == 1 && is.na(model)){
            return(rep(0.0, nrow(x)));
        }else if(is.list(model) && !is.null(model$name)){
            return(drop(algo$predict(model=model, x=x)));
        }else if(is.vector(model)){
            if(length(model)==0 && ncol(x)==0) return(rep(0.0, nrow(x)));
            return(drop(x %*% model));
        }else{
            cat("str(model) = \n");
            str(model);
            stop("Unknown input model");
        }
    }else if(is.matrix(model)){
        if(ncol(model) != ncol) stop("ncol(model) != ncol");
        if(ncol(x)==0 && nrow(model)==0) return(array(0.0, dim=c(nrow(x), ncol)));
        return(as.matrix(x %*% model));
    }else{
        if(ncol != length(model)) stop("ncol != length(model)");
        out = matrix(NA, nrow=nrow(x), ncol=ncol);
        for(i in 1:ncol){
            out[,i] = reg.predict(model=model[[i]], x=x, algo=algo, ncol=NULL);
        }
        return(out);
    }
}
###
### Model:
###   y[i] ~ N(mean = h(x[i,]),  var = var_y/weight[i])
### OUTPUT:
###   out$coeff = h (the regression function)
###   out$loss  = sum_i weight[i] * { (E[y[i]] - h(x[i,]))^2 + Var[y[i]] }
###   out$num   = sum(weight > 0)
###   out$var   = var_y = loss / num
###
reg.train <- function(x, y, algo, y.var=0, weight=NULL, control=NULL, max.nObs=NULL){
    if(length(y) != nrow(x)) stop("length(y) != nrow(x)");
    if(length(y.var) != 1 && length(y.var) != length(y)) stop("length(y.var) != length(y)");
    if(!is.null(weight) && any(weight == 0)){
        select = weight != 0;
        y = y[select];
        x = x[select,,drop=FALSE];
        # keep weight (and per-case y.var) aligned with the selected cases
        weight = weight[select];
        if(length(y.var) > 1) y.var = y.var[select];
    }
    if(!is.null(max.nObs) && length(y) > max.nObs){
        select = sample(length(y), max.nObs);
        y = y[select];
        x = x[select,,drop=FALSE];
        if(!is.null(weight)) weight = weight[select];
        if(length(y.var) > 1) y.var = y.var[select];
    }
    if(is.null(control)) control = algo$control;
    if(is.null(weight)) weight = rep(1, length(y));
    if(any(weight <= 0)) stop("any(weight <= 0)");
    model = algo$train(x=x, y=y, weight=weight, control=control);
    pred = drop(algo$predict(model=model, x=x));
    loss = sum( weight * ((y - pred)^2 + y.var) );
    out = list(coeff=model, loss=loss, num=length(y), var=loss/length(y));
    return(out);
}
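
### The 'algo' argument is a list providing $train and $predict (and optionally
### a default $control). A hypothetical sketch wrapping weighted ridge
### regression; the trained model carries a $name field, which is how
### reg.predict() recognizes an algo-based model.
if(FALSE){
    ridge.algo = list(
        control = list(lambda=1),
        train = function(x, y, weight, control){
            XtW = t(x * weight);  # t(X) %*% diag(weight)
            beta = solve(XtW %*% x + control$lambda * diag(ncol(x)), XtW %*% y);
            list(name="ridge", beta=drop(beta));
        },
        predict = function(model, x){ drop(x %*% model$beta); }
    );
    set.seed(3);
    X = cbind(1, rnorm(50));
    y = drop(X %*% c(1, 2)) + rnorm(50, sd=0.1);
    fit  = reg.train(x=X, y=y, algo=ridge.algo);
    yhat = reg.predict(model=fit$coeff, x=X, algo=ridge.algo);
}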
reg.loss <- function(model, x, y, algo, y.var=0, weight=NULL){
    pred = reg.predict(model=model, x=x, algo=algo);
    if(is.null(y.var))  y.var  = 0;  # e.g., when a NULL response.var is passed through
    if(is.null(weight)) weight = 1;
    loss = sum( weight * ((y - pred)^2 + y.var) );
    return(loss);
}
###
### Gaussian log-likelihood (up to the additive constant -(N/2)*log(2*pi))
###
loglik.gaussian <- function(pred.x, x, var_x){
    if(length(var_x) != 1) stop("length(var_x) != 1");
    loglik = -(1/2) * ( sum((x - pred.x)^2 / var_x) + length(x) * log(var_x) );
    return(loglik);
}
###
### Get feature matrix
### The input data x is a data frame in one of two formats:
### (1) Dense format:
###     x = data.frame(id, feature1, feature2, ...)
### (2) Sparse format:
###     x = data.frame(id, index, value)
###     id    is the case ID
###     index is the feature index
###     value is the feature value
### Output a matrix (possibly in the sparseMatrix format from the Matrix
### package), in which the nth row corresponds to selected.id[n]
get.feature.matrix <- function(
    x,                    # input data
    id.colname,           # name of the ID column in x
    selected.id,          # a list of IDs to select
    add.intercept=FALSE,  # whether to add a column of all ones
    err.prefix="", err.x.name=NULL, err.select.name=NULL
){
    if(add.intercept){ regFormula = formula(~.);   default = 1.0; }
    else{              regFormula = formula(~.-1); default = 0.0; }
    if(is.null(x)) return(matrix(default, nrow=length(selected.id), ncol=1));
    x.name = if(is.null(err.x.name)) "input table x" else err.x.name;
    if(!(id.colname %in% names(x))) stop(err.prefix,"Cannot find ID column '",id.colname,"' in ",x.name);
    if(ncol(x) == 3 && all(c(id.colname, "index", "value") %in% names(x))){
        # Sparse format
        nCases = length(selected.id);
        nFeatures = max(x$index);
        x$row = match(x[[id.colname]], selected.id);
        x = x[!is.na(x$row),];
        if(add.intercept){
            nFeatures = nFeatures + 1;
            intercept = data.frame(row=1:nCases, index=nFeatures, value=1.0);
            x = rbind(x[,c("row","index","value")], intercept);
        }
        # sparseMatrix() is from the Matrix package
        out = sparseMatrix(i=x$row, j=x$index, x=x$value, dims=c(nCases, nFeatures));
    }else{
        # Dense format
        select = match(selected.id, x[[id.colname]]);
        if(any(is.na(select))){
            temp = if(is.null(err.select.name)) "" else paste(" in ",err.select.name,sep="");
            stop(err.prefix,"Some IDs",temp," cannot be found in ",x.name,"$",id.colname);
        }
        out = model.matrix(regFormula, x[select,-match(id.colname, names(x)),drop=FALSE]);
    }
    return(out);
}
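
### Usage sketch (hypothetical data): the same interface for dense and sparse
### input. The sparse branch needs the Matrix package for sparseMatrix().
if(FALSE){
    library(Matrix);
    dense = data.frame(user=c("a","b","c"), f1=c(1,2,3), f2=c(4,5,6));
    get.feature.matrix(dense, id.colname="user", selected.id=c("c","a"));
    # row 1 is user "c", row 2 is user "a" (rows follow selected.id)
    sparse = data.frame(user=c("a","a","b"), index=c(1,2,2), value=c(1.5,-1,3));
    get.feature.matrix(sparse, id.colname="user", selected.id=c("a","b"),
                       add.intercept=TRUE);  # 2 x 3; last column is the intercept
}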
is.sparse.feature <- function(x){
    if(is.data.frame(x) && ncol(x) == 3 && all(c("index", "value") %in% names(x))) return(TRUE);
    return(FALSE);
}
###
### For non-Gaussian response
###
### Initialize the observation table
###   Observation table: obs = data.frame(user, item, y)
###   Add the following to the code before the EM procedure:
###       obs = init.obs(obs, is.logistic);
###   After the call, obs$response will contain the binary response.
###   Note that obs$y will be used to store the Gaussian response values
###   for variational approximation.
init.obs <- function(obs, is.logistic){
    if(is.null(obs)) return(NULL);
    obs$response = obs$y; # Make a copy of the original response,
                          # since obs$y may be changed for variational approx.
    if(is.logistic){
        labels = unique(obs$response);
        if(length(labels) != 2) stop("The response is not binary: ",paste(labels[1:min(10,length(labels))],collapse=", "));
        labels = sort(labels);
        if(any(labels != c(0,1)) && any(labels != c(-1,1))) stop("Binary response must be {0,1} or {-1,1}");
    }
    return(obs);
}
### Generate Gaussian response
### Add the following code in the EM loop before the E-step:
###     response = generate.response(obs=obs, param=param, is.logistic=is.logistic, verbose=verbose);
###     obs$y = response$y;
###     param$var_y = response$var_y;
generate.response <- function(obs, param, is.logistic, verbose=0){
    if(is.logistic){
        # variational approximation
        if(verbose >= 1) cat("generate gaussian response for logistic\n");
        if(length(param$xi) != nrow(obs)) stop("length(param$xi) != nObs");
        if(length(obs$response) != nrow(obs)) stop("length(obs$response) != nObs");
        response = obs$response;
        if(all(response %in% c(0,1))) response[response == 0] = -1;
        if(!all(response %in% c(-1,1))) stop("Binary response must be {-1,1} at this point");
        var_y = 1/(2 * logistic.lambda(param$xi));
        y = response * var_y / 2;
    }else{
        var_y = param$var_y;
        y = obs$y;
    }
    return(list(y=y, var_y=var_y));
}
### Update the parameters
### Add the following code in the EM loop after the M-step:
###     param = update.param(factor.mean=mc_e$mean, param=param.new, obs=obs, is.logistic=is.logistic);
update.param <- function(factor.mean, param, obs, is.logistic, factor.var=NULL, x_obs=NULL){
    if(is.logistic){
        if(!is.null(factor.mean$pred.y.square)){
            if(length(factor.mean$pred.y.square) != nrow(obs)) stop("length(factor.mean$pred.y.square) != nrow(obs)");
            param$xi = sqrt(factor.mean$pred.y.square);
        }else if(!is.null(factor.mean$fScore) && !is.null(factor.var$fScore)){
            if(length(factor.mean$fScore) != nrow(obs)) stop("length(factor.mean$fScore) != nrow(obs)");
            if(length(factor.var$fScore) != nrow(obs)) stop("length(factor.var$fScore) != nrow(obs)");
            xb = reg.predict(model=param$b, x=x_obs, algo=param$reg.algo);
            pred = xb + factor.mean$fScore;
            param$xi = sqrt(pred^2 + factor.var$fScore);
        }else stop("Cannot compute xi for variational logistic");
    }
    return(param);
}
# input pred.y is based on the Gaussian model
obsLoglik.from.gaussian <- function(pred.y, y, var_y, is.logistic){
    if(length(y) != length(pred.y)) stop("length(y) != length(pred.y)");
    if(is.logistic){
        if(all(y %in% c(0,1))) y[y == 0] = -1;
        if(!all(y %in% c(-1,1))) stop("Binary response must be {-1,1} at this point");
        loglik = sum( -log1p(exp(-y * pred.y)) );
        attr(loglik, "loss") = -loglik / length(y);
    }else{
        loglik = loglik.gaussian(pred.x=pred.y, x=y, var_x=var_y);
        attr(loglik, "loss") = sqrt(mean( (y - pred.y)^2 ));
    }
    return(loglik);
}
# input pred.y is based on the Gaussian model
# output$pred.y is the input pred.y for the Gaussian model,
# or the predicted probability for the logistic model
predict.response.from.gaussian <- function(pred.y, y, param, is.logistic){
    if(!is.null(y)) loglik = obsLoglik.from.gaussian(pred.y=pred.y, y=y, var_y=param$var_y, is.logistic=is.logistic)
    else loglik = NULL;
    if(is.logistic){
        pred.y = 1/(1+exp(-pred.y));
        if(!is.null(y)){
            # put the predicted probability on the same scale as y for the error metrics
            if(min(y) == -1) temp.y = 2*pred.y - 1
            else temp.y = pred.y;
        }
    }else{
        temp.y = pred.y;
    }
    if(!is.null(y)){
        rmse = sqrt(mean( (y - temp.y)^2 ));
        mae  = mean( abs(y - temp.y) );
    }else{
        rmse = NULL;
        mae  = NULL;
    }
    return(list(pred.y=pred.y, true.y=y, rmse=rmse, mae=mae, loglik=loglik, test.loss=attr(loglik, "loss")));
}
logistic.lambda <- function(xi){
    # tanh(xi/2)/(4*xi) -> 1/8 as xi -> 0; guard against the 0/0 at xi = 0
    return(ifelse(xi == 0, 1/8, tanh(xi/2) / (4*xi)));
}
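
### Sanity-check sketch for the variational construction in generate.response():
### logistic.lambda() is the Jaakkola-Jordan variational parameter lambda(xi);
### it decreases from its limit 1/8 at xi = 0, so the working Gaussian variance
### var_y = 1/(2*lambda(xi)) is always >= 4.
if(FALSE){
    xi  = c(1e-8, 0.5, 1, 2);
    lam = logistic.lambda(xi);   # ~0.125, then decreasing
    var_y  = 1/(2*lam);          # working response variance per observation
    y.work = (+1) * var_y / 2;   # working Gaussian response for a +1 label
}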
###
### Output to the output directory
###   IDs: a list of userIDs, itemIDs, etc. (can be null)
###   prediction$test.loss must exist if prediction is not null
###   loglik is the complete-data training log likelihood
###
output.to.dir <- function(
    out.dir, factor, param, IDs, prediction, loglik,
    minTestLoss, nSamples, iter, out.level, out.overwrite,
    TimeEStep, TimeMStep, TimeTest, verbose,
    other=NULL, name="est", data.train=NULL
){
    if(!is.null(prediction) && is.null(prediction$test.loss)) stop("prediction$test.loss does not exist");
    if(!is.null(prediction) && is.null(prediction$rmse)) stop("prediction$rmse does not exist");
    if(is.null(attr(loglik, "loss"))) stop("is.null(attr(loglik, 'loss'))");
    if(out.level <= 0) return(NULL);
    b.time.write = proc.time();
    if(is.null(out.dir)) stop("Please specify out.dir");
    if(iter == 0){
        if(file.exists(paste(out.dir,"/",name,".last",sep="")) && !out.overwrite){
            stop("Output file '",out.dir,"/",name,".last' EXISTS!!");
        }else if(!file.exists(out.dir)){
            dir.create(out.dir, recursive=TRUE);
        }
        smry.file = paste(out.dir,"/summary",sep="");
        if(file.exists(smry.file)) file.remove(smry.file);
    }
    thisTestLoss = if(is.null(prediction)) -1 else prediction$test.loss;
    TestRMSE     = if(is.null(prediction)) -1 else prediction$rmse;
    if(iter == 0){
        if(out.level >= 2) save(file=paste(out.dir,"/",name,".0",sep=""), list=c("factor", "param"));
    }else{
        file = paste(out.dir,"/",name,".last",sep="");
        if(out.level >= 2 && iter >= 2){
            file.prev = paste(out.dir,"/",name,".",(iter-1),sep="");
            if(file.exists(file.prev)) file.remove(file.prev);
            file.rename(file, file.prev);
        }
        save(file=file, list=c("factor", "param", "prediction", "IDs", "other", "data.train"));
        if(!is.null(prediction)){
            if(thisTestLoss == minTestLoss) file.copy(file, paste(out.dir,"/",name,".minTestLoss",sep=""), overwrite=TRUE);
        }
        save(file=paste(out.dir,"/param.",iter,sep=""), list=c("param"));
    }
    if(length(nSamples) != 1){
        nSamples = if(iter > 0) nSamples[iter] else 0;
    }
    summary = data.frame(
        Method="MCEM", Iter=iter, nSteps=nSamples, CDlogL=loglik,
        TestLoss=thisTestLoss, LossInTrain=attr(loglik,"loss"), TestRMSE=TestRMSE,
        TimeEStep=TimeEStep, TimeMStep=TimeMStep, TimeTest=TimeTest
    );
    file = paste(out.dir,"/summary",sep="");
    if(file.exists(file)) write.table(summary, file=file, append=TRUE, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
    else write.table(summary, file=file, append=FALSE, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE);
    if(verbose > 0){
        time.used.write = proc.time() - b.time.write;
        cat("wrote model & summary info to disk (used ",time.used.write[3]," sec)\n",sep="");
    }
}