
/src/R/model/util.R

http://github.com/beechung/Latent-Factor-Models
### Copyright (c) 2011, Yahoo! Inc.  All rights reserved.
### Copyrights licensed under the New BSD License. See the accompanying LICENSE file for terms.
### 
### Author: Bee-Chung Chen

###
### Find (theta1, theta2) that minimize the following loss
###
###    loss = sum_i E_{y,z}[ (y[i] - theta1' z[i,] - theta2' x[i,])^2 ]
###         = E[sum_i y[i]^2] - 2 theta' a + theta' A theta,
###
### where theta = c( theta1,  theta2 )
###           a = c( E[ sum_i z[i,]*y[i] ],  sum_i x[i,]*E[y[i]] );
###           A = rbind(cbind( E[ sum_i z[i,] z[i,]' ],  sum_i E[z[i,]] x[i,]' ),
###                     cbind( sum_i x[i,] E[z[i,]]',    sum_i x[i,] x[i,]'    ))
###
### Log-likelihood = -(1/2) * ( N log(var_y) + loss / var_y )
###
### INPUT:
###    y[i]     = E[y[i]]
###    sum.y.sq = E[sum_i y[i]^2]
###    z[i,]    = E[z[i,]]
###    sum.zz   = E[ sum_i z[i,] z[i,]' ]
###    x[i,]    = x[i,]
###    sum.zy   = E[ sum_i z[i,]*y[i] ]
###    num      = number of cases involved in sum_i
###
###    subset: A vector of indices for i
###
###    algo: the fitting algorithm used to fit theta2.
###          In this case, theta2' x[i,] = theta2(x[i,]), and theta2 is
###          fitted first (then treated as a constant when fitting theta1).
###
### OUTPUT: coeff.z=theta1, coeff.x=theta2, loss
###
fit.E.gaussin.loglik <- function(
	y, sum.y.sq, x, z=NULL, sum.zz=NULL, sum.zy=NULL, num, subset=NULL, 
	lambda=0.1, algo=NULL, algo.control=NULL
){
	nCases = length(y);  nFeatures = ncol(x);
	if(!is.null(z)){
		if(is.matrix(z))      nFactors = ncol(z)
		else if(is.vector(z)) nFactors = 1
		else stop("z is not matrix or vector");
		if(nFactors >  1 && nrow(z)   != nCases) stop("nrow(z) != nCases");
		if(nFactors == 1 && length(z) != nCases) stop("length(z) != nCases");
		if(length(sum.zy) != nFactors) stop("length(sum.zy) != nFactors");
		if(nFactors >  1 && any(dim(sum.zz) != c(nFactors, nFactors))) stop("any(dim(sum.zz) != c(nFactors, nFactors))");
		if(nFactors == 1 && length(sum.zz) != 1) stop("length(sum.zz) != 1");
	}else{
		nFactors = 0;
		if(!is.null(sum.zz)) stop("!is.null(sum.zz)");
		if(!is.null(sum.zy)) stop("!is.null(sum.zy)");
	}
	if(nrow(x) != nCases) stop("nrow(x) != nCases");
	if(length(sum.y.sq) != 1) stop("length(sum.y.sq) != 1");
	if(!is.null(subset)){
		if(is.logical(subset)){
			if(length(subset) != nCases) stop("is.logical(subset) && length(subset) != nCases");
			if(all(subset)) subset = NULL; # no need to select
		}else if(length(subset) == nCases){
			if(all(subset == 1:nCases)) subset = NULL; # no need to select
		}
	}
	if(!is.null(subset)){
		y = y[subset];  x = x[subset,,drop=FALSE];
		# z may be a vector (nFactors == 1); subset it accordingly
		if(!is.null(z)) z = if(is.matrix(z)) z[subset,,drop=FALSE] else z[subset];
	}
	if(length(y) != num) stop("length(y) != num");
	if(is.null(algo)){
		a = c(drop(sum.zy), drop(t(x)%*%y));
		A = matrix(NA, nrow=nFactors+nFeatures, ncol=nFactors+nFeatures);
		if(!is.null(z)){
			A[1:nFactors,1:nFactors] = sum.zz;
			zx = t(z) %*% x;
			A[1:nFactors,nFactors+(1:nFeatures)] = zx;
			A[nFactors+(1:nFeatures),1:nFactors] = t(zx);
		}
		A[nFactors+(1:nFeatures),nFactors+(1:nFeatures)] = t(x) %*% x;
		A.inv = solve(A + lambda*diag(nFactors+nFeatures));
		theta = drop(A.inv %*% a);
		out = list(
			coeff.z = if(nFactors>0) theta[1:nFactors] else NULL,
			coeff.x = theta[nFactors+(1:nFeatures)],
			loss = sum.y.sq - 2 * t(theta) %*% a + t(theta) %*% A %*% theta
		);
	}else{
		# reg.train returns list(coeff, loss, num, var); keep only the fitted
		# model, which is what reg.predict (and loss.E.gaussin.loglik) expects
		theta2 = reg.train(x=x, y=y, algo=algo, control=algo.control)$coeff;
		theta2.x = reg.predict(model=theta2, x=x, algo=algo);
		if(!is.null(z)){
			E.diff = sum.zy - t(z)%*%theta2.x;
			theta1 = solve(sum.zz + lambda*diag(nFactors)) %*% E.diff;
			loss = sum.y.sq - 2*sum(y*theta2.x) + sum(theta2.x^2) - 2*t(theta1)%*%E.diff + t(theta1)%*%sum.zz%*%theta1;
		}else{
			theta1 = NULL;
			loss = sum.y.sq - 2*sum(y*theta2.x) + sum(theta2.x^2);
		}
		out = list(coeff.z=theta1, coeff.x=theta2, loss=loss);
	}
	return(out);
}
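###
### Example (added sketch, not part of the original code): when z is fully
### observed, the expected sufficient statistics reduce to plain sums and
### fit.E.gaussin.loglik is just ridge regression on cbind(z, x).
### All names below are illustrative.
if(FALSE){
	set.seed(1);
	n = 100;  x = matrix(rnorm(n*3), n, 3);  z = matrix(rnorm(n*2), n, 2);
	y = drop(z %*% c(1,-1) + x %*% c(0.5,0,2) + 0.1*rnorm(n));
	fit = fit.E.gaussin.loglik(
		y=y, sum.y.sq=sum(y^2), x=x, z=z,
		sum.zz=t(z)%*%z, sum.zy=drop(t(z)%*%y), num=n
	);
	fit$coeff.z;  # approximately c(1, -1)
	fit$coeff.x;  # approximately c(0.5, 0, 2)
}
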
loss.E.gaussin.loglik <- function(
	y, sum.y.sq, x, coeff.x, z=NULL, coeff.z=NULL,
	sum.zz=NULL, sum.zy=NULL, num, subset=NULL, algo=NULL
){
	nCases = length(y);  nFeatures = ncol(x);
	if(!is.null(z)){
		if(is.matrix(z))      nFactors = ncol(z)
		else if(is.vector(z)) nFactors = 1
		else stop("z is not matrix or vector");
		if(nFactors >  1 && nrow(z)   != nCases) stop("nrow(z) != nCases");
		if(nFactors == 1 && length(z) != nCases) stop("length(z) != nCases");
		if(nFactors >  1 && any(dim(sum.zz) != c(nFactors, nFactors))) stop("any(dim(sum.zz) != c(nFactors, nFactors))");
		if(nFactors == 1 && length(sum.zz) != 1) stop("length(sum.zz) != 1");
		if(length(sum.zy) != nFactors) stop("length(sum.zy) != nFactors");
	}else{
		nFactors = 0;
		if(!is.null(sum.zz)) stop("!is.null(sum.zz)");
		if(!is.null(sum.zy)) stop("!is.null(sum.zy)");
		if(!is.null(coeff.z)) stop("!is.null(coeff.z)");
	}
	if(nrow(x) != nCases) stop("nrow(x) != nCases");
	if(length(sum.y.sq) != 1) stop("length(sum.y.sq) != 1");
	if(!is.null(subset)){
		if(is.logical(subset)){
			if(length(subset) != nCases) stop("is.logical(subset) && length(subset) != nCases");
			if(all(subset)) subset = NULL; # no need to select
		}else if(length(subset) == nCases){
			if(all(subset == 1:nCases)) subset = NULL; # no need to select
		}
	}
	if(!is.null(subset)){
		y = y[subset];  x = x[subset,,drop=FALSE];
		# z may be a vector (nFactors == 1); subset it accordingly
		if(!is.null(z)) z = if(is.matrix(z)) z[subset,,drop=FALSE] else z[subset];
	}
	if(length(y) != num) stop("length(y) != num");
	pred.x = reg.predict(model=coeff.x, x=x, algo=algo);
	ans = sum.y.sq - 2*sum(pred.x * y);
	if(!is.null(z)){
		ans = ans - 2*t(coeff.z)%*%sum.zy;
		ans = ans + t(coeff.z) %*% sum.zz %*% coeff.z;
		ans = ans + 2 * t(coeff.z) %*% t(z) %*% pred.x;
	}
	ans = ans + sum(pred.x^2);
	return(ans);
}
E.gaussin.loglik <- function(
	y, var_y, sum.y.sq, x, coeff.x, z=NULL, coeff.z=NULL, 
	sum.zz=NULL, sum.zy=NULL, num, subset=NULL, algo=NULL
){
	# sanity check is performed inside loss.E.gaussin.loglik
	loss = loss.E.gaussin.loglik(
			coeff.z=coeff.z, coeff.x=coeff.x, y=y, z=z, x=x, 
			sum.y.sq=sum.y.sq, sum.zz=sum.zz, sum.zy=sum.zy, num=num, subset=subset, algo=algo
	);
	loglik = -(1/2)*(num*log(var_y) + loss / var_y);
	return(loglik);
}

###
### Find the beta and var_y that maximize
###    -(1/2) * sum_i E[ log(var_y) + (w_i/var_y)*(y_i - beta'x_i)^2 ]
### i.e., minimize
###    sum(w_i > 0) * log(var_y) + (1/var_y) * loss,
### where loss = sum_i w_i*{ (E[y_i] - beta'E[x_i])^2 + beta'Var[x_i]beta - 2beta'Cov[y_i,x_i] + Var[y_i] }
###
### Let XX = sum_i { w_i * (Var[x_i] + E[x_i]E[x_i]') }
###     XY = sum_i { w_i * (E[x_i]E[y_i] + Cov[y_i,x_i]) }
### The solution is
###     beta = XX^-1 XY
###    var_y = loss / sum(w_i > 0)
###          = (beta' XX beta - 2 beta' XY + sum_i w_i*{E[y_i]^2 + Var[y_i]}) / sum(w_i > 0)
###
### INPUT:
###    response.mean[i] = E[y_i]
###    response.var[i]  = Var[y_i]
###    feature.mean[i,] = E[x_i]
###    feature.var[i,,] = Var[x_i], or diag(feature.var[i,]) = Var[x_i]
###    feature.cov[i,]  = Cov[y_i,x_i]
###    weight[i]        = w_i
### NOTE:
###    * Let feature.var[i,,] be a d x d matrix and let feature.mean[i,] have length n.
###      If n > d, then feature.var corresponds to the first d features, and the
###      other features have 0 variance.
###    * Let feature.cov[i,] have length d and feature.mean[i,] have length n.
###      If n > d, then feature.cov corresponds to the first d features, and the
###      other features have 0 covariance.
###
### OUTPUT:
###    out$coeff = beta
###    out$loss  = loss
###    out$num   = sum(w_i > 0)
###    out$var   = var_y = loss / num
###
fit.lm.random.effect <- function(
	response.mean, feature.mean,
	response.var=NULL, feature.var=NULL, feature.cov=NULL, weight=NULL,
	lambda=0 # lambda for ridge regression
){
	nFeatures = ncol(feature.mean);
	if(length(response.mean) != nrow(feature.mean)) stop("length(response.mean) != nrow(feature.mean)");
	if(!is.null(response.var) && length(response.mean) != length(response.var)) stop("length(response.mean) != length(response.var)");
	if(!is.null(feature.cov)  && length(response.mean) != nrow(feature.cov)) stop("length(response.mean) != nrow(feature.cov)");
	if(!is.null(feature.cov)  && ncol(feature.cov) > nFeatures) stop("ncol(feature.cov) > nFeatures");

	if(!is.null(response.var) && any(response.var < 0)) stop("Some response.var < 0");
	if(!is.null(feature.var)  && any(feature.var < 0))  stop("Some feature.var < 0");
	
	if(is.null(lambda) || is.na(lambda)) lambda = 0;
	
	if(!is.null(weight)){
		if(length(weight) != length(response.mean)) stop("length(weight) != length(response.mean)");
	}else{
		weight = rep(1, length(response.mean));
	}
	if(any(weight < 0)) stop("any(weight < 0)");

	Var.X = matrix(0, nrow=nFeatures, ncol=nFeatures);
	if(is.null(feature.var)){
		# do nothing
	}else if(is.vector(feature.var)){
		if(length(feature.var) != nrow(feature.mean)) stop("length(feature.var) != nrow(feature.mean)");
		Var.X = sum(feature.var * weight) * diag(nFeatures);
	}else if(length(dim(feature.var)) == 2){
		if(nrow(feature.var) != nrow(feature.mean)) stop("nrow(feature.var) != nrow(feature.mean)");
		if(ncol(feature.var) >  ncol(feature.mean)) stop("ncol(feature.var) > ncol(feature.mean)")
		else if(!is.null(feature.cov) && ncol(feature.var) != ncol(feature.cov)) stop("ncol(feature.var) != ncol(feature.cov)");
		v = apply(feature.var * weight, 2, sum);
		d = ncol(feature.var);
		temp = diag(v, nrow=d, ncol=d);
		Var.X[1:d,1:d] = temp;
	}else if(length(dim(feature.var)) == 3){
		if(dim(feature.var)[1] != nrow(feature.mean)) stop("dim(feature.var)[1] != nrow(feature.mean)");
		if(dim(feature.var)[2] >  ncol(feature.mean)) stop("dim(feature.var)[2] > ncol(feature.mean)")
		else if(!is.null(feature.cov) && dim(feature.var)[2] != ncol(feature.cov)) stop("dim(feature.var)[2] != ncol(feature.cov)");
		if(dim(feature.var)[3] != dim(feature.var)[2]) stop("dim(feature.var)[3] != dim(feature.var)[2]");
		d = dim(feature.var)[2];
		temp = apply(feature.var * weight, c(2,3), sum);
		Var.X[1:d,1:d] = temp;
	}else stop("feature.var error");
	feature.mean.t.W = t(feature.mean * weight);
	E.X.E.X = feature.mean.t.W %*% feature.mean;
	XX = Var.X + E.X.E.X + lambda * diag(nFeatures);
	
	Cov.Y.X = rep(0, nFeatures);
	if(!is.null(feature.cov)){
		d = ncol(feature.cov);
		temp = apply(feature.cov * weight, 2, sum);
		Cov.Y.X[1:d] = temp;
	}
	E.X.E.Y = feature.mean.t.W %*% response.mean;
	XY = drop(E.X.E.Y + Cov.Y.X);
	
	beta = drop(solve(XX) %*% XY);

	if(is.null(response.var)){
		E.y2 = sum(response.mean^2 * weight);
	}else{
		E.y2 = sum((response.mean^2 + response.var) * weight);
	}
	loss = drop(t(beta) %*% (Var.X + E.X.E.X) %*% beta - 2 * t(beta) %*% XY + E.y2);
	num  = sum(weight > 0);
	var_y = loss / num;
	if(var_y < 0){
		loss.2 = loss.lm.random.effect(
			coeff=beta, response.mean=response.mean, feature.mean=feature.mean, 
			response.var=response.var, feature.var=feature.var, feature.cov=feature.cov, weight=weight
		);
		stop("Variance = ",var_y," < 0 (loss: ",loss," vs ",loss.2,")");
	}
	out = list(coeff=beta, loss=loss, num=num, var=var_y);
	return(out);
}
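###
### Example (added sketch, not part of the original code): with no feature
### uncertainty (response.var, feature.var, feature.cov all NULL) and lambda=0,
### the fit reduces to ordinary least squares; the variance terms only enter
### through Var.X, Cov.Y.X, and E.y2.  All names below are illustrative.
if(FALSE){
	set.seed(2);
	n = 200;  X = cbind(1, matrix(rnorm(n*2), n, 2));
	y = drop(X %*% c(0.3, 1, -2)) + 0.5*rnorm(n);
	fit = fit.lm.random.effect(response.mean=y, feature.mean=X);
	fit$coeff;            # approximately c(0.3, 1, -2)
	fit$var;              # approximately 0.25 (= 0.5^2)
	coef(lm(y ~ X - 1));  # matches fit$coeff
}
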
### loss = sum_i w_i*{ (E[y_i] - beta'E[x_i])^2 + beta'Var[x_i]beta - 2beta'Cov[y_i,x_i] + Var[y_i] }
loss.lm.random.effect <- function(
	coeff, response.mean, feature.mean,
	response.var=NULL, feature.var=NULL, feature.cov=NULL, weight=NULL
){
	nFeatures = ncol(feature.mean);
	nObs      = length(response.mean);
	if(nObs != nrow(feature.mean)) stop("nObs != nrow(feature.mean)");
	if(!is.null(response.var) && nObs != length(response.var)) stop("nObs != length(response.var)");
	if(!is.null(feature.cov)  && nObs != nrow(feature.cov)) stop("nObs != nrow(feature.cov)");
	if(!is.null(feature.cov)  && ncol(feature.cov) > nFeatures) stop("ncol(feature.cov) > nFeatures (",ncol(feature.cov)," vs ",nFeatures,")");
	if(length(coeff) != nFeatures) stop("length(coeff) != nFeatures");
	
	if(!is.null(weight)){
		if(length(weight) != nObs) stop("length(weight) != nObs");
	}else weight = rep(1, nObs);
	if(any(weight < 0)) stop("any(weight < 0)");
	if(is.null(response.var)) response.var = 0;
	
	err.square = drop((response.mean - feature.mean %*% coeff)^2);
	
	if(is.null(feature.var)){
		beta.V.beta = 0;
	}else if(is.vector(feature.var)){
		if(length(feature.var) != nObs) stop("length(feature.var) != nObs");
		beta.V.beta = feature.var * sum(coeff^2);
	}else if(length(dim(feature.var)) == 2){
		if(nrow(feature.var) != nObs) stop("nrow(feature.var) != nObs");
		if(ncol(feature.var) > nFeatures) stop("ncol(feature.var) > nFeatures");
		d = ncol(feature.var);
		beta.V.beta = drop(coeff[1:d] %*% (t(feature.var) * coeff[1:d]));
	}else if(length(dim(feature.var)) == 3){
		if(dim(feature.var)[1] != nObs) stop("dim(feature.var)[1] != nObs");
		if(dim(feature.var)[2] > nFeatures) stop("dim(feature.var)[2] > nFeatures");
		if(dim(feature.var)[3] != dim(feature.var)[2]) stop("dim(feature.var)[3] != dim(feature.var)[2]");
		d = dim(feature.var)[2];
		temp = rep(coeff[1:d], each=nObs*d);
		V.beta = apply(feature.var*temp, c(1,2), sum);
		beta.V.beta = drop(V.beta %*% coeff[1:d]);
	}else stop("feature.var error");
	
	beta.Cov = 0;
	if(!is.null(feature.cov)) beta.Cov = drop(feature.cov %*% coeff[1:ncol(feature.cov)]);
	loss = sum( weight * (err.square + beta.V.beta - 2*beta.Cov + response.var) );
	return(loss);
}

### E[loglikelihood]
###   = -(1/2) * sum_i E[ log(var_y) + (w_i/var_y)*(y_i - beta'x_i)^2 ]
###   = -(1/2) * ( sum(w_i > 0) * log(var_y) + (1/var_y) * loss ),
### where loss = sum_i w_i*{ (E[y_i] - beta'E[x_i])^2 + beta'Var[x_i]beta - 2beta'Cov[y_i,x_i] + Var[y_i] }
Eloglik.lm.random.effect <- function(
	coeff, var, response.mean, feature.mean,
	response.var=NULL, feature.var=NULL, feature.cov=NULL, weight=NULL,
	algo=NULL
){
	nObs = length(response.mean);
	if(length(var) != 1){
		# a per-observation variance vector cannot be combined with weight;
		# it is rewritten as weight = 1/var with var = 1
		if(!is.null(weight)) stop("cannot specify both weight and a vector var");
		sum_log_var = sum(log(var));
		weight = 1/var;
		var = 1;
	}else{
		if(is.null(weight)) weight = rep(1, nObs);
		sum_log_var = sum(weight > 0) * log(var);
	}
	if(any(weight < 0)) stop("any(weight < 0)");
	if(length(weight) != nObs) stop("length(weight) != nObs");
	if(is.null(algo)){
		loss = loss.lm.random.effect(
			coeff=coeff, response.mean=response.mean, feature.mean=feature.mean, 
			response.var=response.var, feature.var=feature.var, feature.cov=feature.cov, weight=weight
		);
	}else{
		if(!is.null(feature.var) || !is.null(feature.cov)) stop("When algo is not NULL, you cannot specify feature.var or feature.cov");
		loss = reg.loss(model=coeff, x=feature.mean, y=response.mean, algo=algo, y.var=response.var, weight=weight);
	}
	loglik = -(1/2)*(sum_log_var + loss / var);
	return(loglik);
}
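###
### Example (added sketch, not part of the original code): passing a
### per-observation variance vector in `var` is equivalent to passing
### var = 1 with weight = 1/var, up to the log-variance bookkeeping term,
### which is exactly how the function rewrites it internally.
if(FALSE){
	set.seed(3);
	n = 50;  X = matrix(rnorm(n*2), n, 2);  beta = c(1, -1);
	y = drop(X %*% beta) + rnorm(n);
	v = runif(n, 0.5, 2);
	a = Eloglik.lm.random.effect(coeff=beta, var=v, response.mean=y, feature.mean=X);
	b = Eloglik.lm.random.effect(coeff=beta, var=1, response.mean=y, feature.mean=X, weight=1/v);
	all.equal(a, b - (1/2)*sum(log(v)));  # TRUE
}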

###
### model = NA would predict 0 for any input
###
reg.predict <- function(model, x, algo, ncol=NULL){
	if(is.null(nrow(x))) stop("is.null(nrow(x))");
	if(is.null(ncol)){
		if(length(model) == 1 && is.na(model)){
			return(rep(0.0,nrow(x)));
		}else if(is.list(model) && !is.null(model$name)){
			return(drop(algo$predict(model=model, x=x)));
		}else if(is.vector(model)){
			if(length(model)==0 && ncol(x)==0) return(rep(0.0,nrow(x)));
			return(drop(x %*% model));
		}else{
			cat("str(model) = \n");
			str(model);
			stop("Unknown input model");
		}
	}else if(is.matrix(model)){
		if(ncol(model) != ncol) stop("ncol(model) != ncol");
		if(ncol(x)==0 && nrow(model)==0) return(array(0.0, dim=c(nrow(x),ncol)));
		return(as.matrix(x %*% model));
	}else{
		if(ncol != length(model)) stop("ncol != length(model)");
		out = matrix(NA, nrow=nrow(x), ncol=ncol);
		for(i in 1:ncol){
			out[,i] = reg.predict(model=model[[i]], x=x, algo=algo, ncol=NULL);
		}
		return(out);
	}
}

###
### Model:
###    y[i] ~ N(mean = h(x[i,]),  var = var_y/weight[i])
### OUTPUT:
###    out$coeff = h (the regression function)
###    out$loss  = sum_i weight[i] * { (E[y[i]] - h(x[i,]))^2 + Var[y[i]] }
###    out$num   = sum(weight > 0)
###    out$var   = var_y = loss / num
###
reg.train <- function(x, y, algo, y.var=0, weight=NULL, control=NULL, max.nObs=NULL){
	if(length(y) != nrow(x)) stop("length(y) != nrow(x)");
	if(length(y.var) != 1 && length(y.var) != length(y)) stop("length(y.var) != length(y)");
	if(!is.null(weight) && any(weight == 0)){
		select = weight != 0;
		y = y[select];
		x = x[select,,drop=FALSE];
		# also subset weight and y.var, so they stay aligned with y and x
		weight = weight[select];
		if(length(y.var) > 1) y.var = y.var[select];
	}
	if(!is.null(max.nObs) && length(y) > max.nObs){
		select = sample(length(y), max.nObs);
		y = y[select];
		x = x[select,,drop=FALSE];
		if(!is.null(weight)) weight = weight[select];
		if(length(y.var) > 1) y.var = y.var[select];
	}
	if(is.null(control)) control = algo$control;
	if(is.null(weight))  weight = rep(1,length(y));
	if(any(weight <= 0)) stop("any(weight <= 0)");
	model = algo$train(x=x, y=y, weight=weight, control=control);
	pred  = drop(algo$predict(model=model, x=x));
	loss  = sum( weight * ((y - pred)^2 + y.var) );
	out   = list(coeff=model, loss=loss, num=length(y), var=loss/length(y));
	return(out);
}
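###
### Example (added sketch, not part of the original code): as used above, an
### `algo` object is assumed to provide $train(x, y, weight, control) and
### $predict(model, x), plus an optional $control; reg.predict also expects
### the trained model to be a list with a non-NULL $name.  A minimal
### weighted-least-squares algo, with all names illustrative:
if(FALSE){
	lm.algo = list(
		control = NULL,
		train   = function(x, y, weight, control){
			fit = lm.wfit(x=x, y=y, w=weight);  # weighted least squares
			list(name="lm", coeff=fit$coefficients);
		},
		predict = function(model, x) drop(x %*% model$coeff)
	);
	set.seed(4);
	n = 100;  x = cbind(1, rnorm(n));  y = drop(x %*% c(2,3)) + 0.1*rnorm(n);
	out = reg.train(x=x, y=y, algo=lm.algo);
	reg.predict(model=out$coeff, x=x, algo=lm.algo)[1:5];
}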

reg.loss <- function(model, x, y, algo, y.var=0, weight=NULL){
	pred  = reg.predict(model=model, x=x, algo=algo);
	if(is.null(weight)) weight = 1;
	if(is.null(y.var))  y.var  = 0; # callers may pass y.var=NULL (e.g., Eloglik.lm.random.effect with response.var=NULL)
	loss  = sum( weight * ((y - pred)^2 + y.var) );
	return(loss);
}

###
### Gaussian log-likelihood
###
loglik.gaussian <- function(pred.x, x, var_x){
	if(length(var_x) != 1) stop("length(var_x) != 1");
	loglik = -(1/2) * ( sum((x - pred.x)^2 / var_x) + length(x) * log(var_x) );
	return(loglik);
}

###
### Get feature matrix
###    The input data x is a data frame in one of two formats:
### (1) Dense format:
###     x = data.frame(id, feature1, feature2, ...)
### (2) Sparse format:
###     x = data.frame(id, index, value)
###           id is the case ID
###           index is the feature index
###           value is the feature value
### Output a matrix (may be in the sparseMatrix format of the Matrix package),
### in which the nth row corresponds to selected.id[n]
get.feature.matrix <- function(
	x, # input data
	id.colname,  # name of the ID column in x
	selected.id, # a list of IDs to select
	add.intercept=FALSE, # whether to add a column of all ones
	err.prefix="", err.x.name=NULL, err.select.name=NULL
){
	if(add.intercept){ regFormula = formula(~.);   default = 1.0; }
	else{              regFormula = formula(~.-1); default = 0.0; }
	
	if(is.null(x)) return(matrix(default, nrow=length(selected.id), ncol=1));
	
	x.name = if(is.null(err.x.name)) "input table x" else err.x.name;
	if(!(id.colname %in% names(x))) stop(err.prefix,"Cannot find ID column '", id.colname,"' in ",x.name);
	if(ncol(x) == 3 && all(c(id.colname, "index", "value") %in% names(x))){
		# Sparse format
		nCases    = length(selected.id);
		nFeatures = max(x$index);
		x$row = match(x[[id.colname]], selected.id);
		x = x[!is.na(x$row),];
		if(add.intercept){
			nFeatures = nFeatures + 1;
			intercept = data.frame(row=1:nCases, index=nFeatures, value=1.0);
			x = rbind(x[,c("row","index","value")], intercept);
		}
		out = sparseMatrix(i=x$row, j=x$index, x=x$value, dims=c(nCases, nFeatures));
	}else{
		# Dense format
		select = match(selected.id, x[[id.colname]]);
		if(any(is.na(select))){
			temp = if(is.null(err.select.name)) "" else paste(" in ",err.select.name,sep="");
			stop(err.prefix,"Some IDs",temp," cannot be found in ",x.name,"$",id.colname);
		}
		out = model.matrix(regFormula, x[select,-match(id.colname, names(x)),drop=FALSE]);
	}
	return(out);
}
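###
### Example (added sketch, not part of the original code): the same features
### in dense and sparse form; the sparse path requires the Matrix package.
### All names below are illustrative.
if(FALSE){
	library(Matrix);
	dense  = data.frame(user=c(10,20,30), f1=c(1,0,2), f2=c(0,5,0));
	sparse = data.frame(user =c(10, 10, 20, 30),
	                    index=c( 1,  2,  2,  1),
	                    value=c( 1,  0,  5,  2));
	m1 = get.feature.matrix(dense,  id.colname="user", selected.id=c(30,10));
	m2 = get.feature.matrix(sparse, id.colname="user", selected.id=c(30,10));
	# both: row 1 = features of ID 30, row 2 = features of ID 10
	all(as.matrix(m2) == m1);
}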

is.sparse.feature <- function(x){
	if(is.data.frame(x) && ncol(x) == 3 && all(c("index", "value") %in% names(x))) return(TRUE);
	return(FALSE);
}

###
### For non-Gaussian response
### 
### Initialize the observation table
###   Observation table: obs = data.frame(user, item, y)
###   Add the following to the code before the EM procedure:
###      obs = init.obs(obs, is.logistic);
###   After the call, obs$response will contain the binary response.
###   Note obs$y will be used to store the Gaussian response values
###   for variational approximation
init.obs <- function(obs, is.logistic){
	if(is.null(obs)) return(NULL);
	obs$response = obs$y; # Make a copy of the original response
	                      # since obs$y may be changed for variational approx.
	if(is.logistic){
		labels = unique(obs$response);
		if(length(labels) != 2) stop("The response is not binary: ",paste(labels[1:min(10,length(labels))],collapse=", "));
		labels = sort(labels);
		if(any(labels != c(0,1)) && any(labels != c(-1,1))) stop("Binary response must be {0,1} or {-1,1}");
	}
	return(obs);
}
### Generate Gaussian response
### Add the following code in the EM loop before the E-step:
###    response = generate.response(obs=obs, param=param, is.logistic=is.logistic, verbose=verbose);
###    obs$y = response$y;
###    param$var_y = response$var_y;
generate.response <- function(obs, param, is.logistic, verbose=0){
	if(is.logistic){
		# variational approximation
		if(verbose >= 1) cat("generate gaussian response for logistic\n");
		if(length(param$xi) != nrow(obs)) stop("length(param$xi) != nObs");
		if(length(obs$response) != nrow(obs)) stop("length(obs$response) != nObs");
		response = obs$response;
		if(all(response %in% c(0,1))) response[response == 0] = -1;
		if(!all(response %in% c(-1,1))) stop("Binary response must be {-1,1} at this point");
		var_y = 1/(2 * logistic.lambda(param$xi));
		y = response * var_y / 2;
	}else{
		var_y = param$var_y;
		y     = obs$y;
	}
	return(list(y=y, var_y=var_y));
}
### Update the parameters
### Add the following code in the EM loop after the M-step:
###    param = update.param(factor.mean=mc_e$mean, param=param.new, obs=obs, is.logistic=is.logistic);
update.param <- function(factor.mean, param, obs, is.logistic, factor.var=NULL, x_obs=NULL){
	if(is.logistic){
		if(!is.null(factor.mean$pred.y.square)){
			if(length(factor.mean$pred.y.square) != nrow(obs)) stop("length(factor.mean$pred.y.square) != nrow(obs)");
			param$xi = sqrt(factor.mean$pred.y.square);
		}else if(!is.null(factor.mean$fScore) && !is.null(factor.var$fScore)){
			if(length(factor.mean$fScore) != nrow(obs)) stop("length(factor.mean$fScore) != nrow(obs)");
			if(length(factor.var$fScore) != nrow(obs)) stop("length(factor.var$fScore) != nrow(obs)");
			xb = reg.predict(model=param$b, x=x_obs, algo=param$reg.algo);
			pred = xb + factor.mean$fScore;
			param$xi = sqrt(pred^2 + factor.var$fScore);
		}else stop("Cannot compute xi for variational logistic");
	}
	return(param);
}
# input pred.y is based on the Gaussian model
obsLoglik.from.gaussian <- function(pred.y, y, var_y, is.logistic){
	if(length(y) != length(pred.y)) stop("length(y) != length(pred.y)");
	if(is.logistic){
		if(all(y %in% c(0,1))) y[y == 0] = -1;
		if(!all(y %in% c(-1,1))) stop("Binary response must be {-1,1} at this point");
		loglik = sum( -log1p(exp(-y * pred.y)) );
		attr(loglik, "loss") = -loglik / length(y);
	}else{
		loglik = loglik.gaussian(pred.x=pred.y, x=y, var_x=var_y);
		attr(loglik, "loss") = sqrt(mean( (y - pred.y)^2 ));
	}
	return(loglik);
}
# input pred.y is based on the Gaussian model
# output$pred.y is the input for the Gaussian model
#               is the predicted probability for the logistic model
predict.response.from.gaussian <- function(pred.y, y, param, is.logistic){
	if(!is.null(y)) loglik = obsLoglik.from.gaussian(pred.y=pred.y, y=y, var_y=param$var_y, is.logistic=is.logistic)
	else            loglik = NULL;
	if(is.logistic){
		pred.y = 1/(1+exp(-pred.y));
		if(!is.null(y)){
			if(min(y) == -1) temp.y = 2*pred.y - 1
			else             temp.y = pred.y;
		}
	}else{
		temp.y = pred.y;
	}
	if(!is.null(y)){
		rmse = sqrt(mean( (y - temp.y)^2 ));
		mae  = mean( abs(y - temp.y) );
	}else{
		rmse = NULL;
		mae  = NULL;
	}
	return(list(pred.y=pred.y, true.y=y, rmse=rmse, mae=mae, loglik=loglik, test.loss=attr(loglik, "loss")));
}
### Jaakkola-Jordan variational bound coefficient for logistic regression:
### lambda(xi) = tanh(xi/2) / (4*xi); the limit as xi -> 0 is 1/8, but
### xi = 0 itself yields NaN (0/0), so callers must keep xi > 0.
logistic.lambda <- function(xi){
	return(tanh(xi/2) / (4*xi));
}
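###
### Example (added sketch, not part of the original code): lambda(xi) falls
### from its xi -> 0 limit of 1/8, and generate.response above turns it into
### the variational Gaussian variance var_y = 1/(2*lambda(xi)).
if(FALSE){
	xi = c(1e-6, 0.5, 1, 2, 4);
	cbind(xi, lambda=logistic.lambda(xi), var_y=1/(2*logistic.lambda(xi)));
	logistic.lambda(1e-8);  # approximately 1/8; xi = 0 itself gives NaN (0/0)
}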

###
### Output to the output directory
###    IDs: a list of userIDs, itemIDs, etc. (can be NULL)
###    prediction$test.loss must exist if prediction is not NULL
###    loglik is the complete-data training log-likelihood
###
output.to.dir <- function(
	out.dir, factor, param, IDs, prediction, loglik, 
	minTestLoss, nSamples, iter, out.level, out.overwrite, 
	TimeEStep, TimeMStep, TimeTest, verbose,
	other=NULL, name="est", data.train=NULL
){
	if(!is.null(prediction) && is.null(prediction$test.loss)) stop("prediction$test.loss does not exist");
	if(!is.null(prediction) && is.null(prediction$rmse))      stop("prediction$rmse does not exist");
	if(is.null(attr(loglik, "loss"))) stop("is.null(attr(loglik, 'loss'))");
	
	if(out.level <= 0) return(NULL);
	b.time.write = proc.time();
	if(is.null(out.dir)) stop("Please specify out.dir");
	if(iter == 0){
		if(file.exists(paste(out.dir,"/",name,".last",sep="")) && !out.overwrite){
			stop("Output file '",out.dir,"/",name,".last' EXISTS!!");
		}else if(!file.exists(out.dir)){
			dir.create(out.dir, recursive=TRUE);
		}
		smry.file = paste(out.dir,"/summary",sep="");
		if(file.exists(smry.file)) file.remove(smry.file);
	}
	
	thisTestLoss = if(is.null(prediction)) -1 else prediction$test.loss;
	TestRMSE     = if(is.null(prediction)) -1 else prediction$rmse;
	
	if(iter == 0){
		if(out.level >= 2) save(file=paste(out.dir,"/",name,".0",sep=""), list=c("factor", "param"));
	}else{
		file = paste(out.dir,"/",name,".last",sep="");
		if(out.level >= 2 && iter >= 2){
			file.prev = paste(out.dir,"/",name,".",(iter-1),sep="");
			if(file.exists(file.prev)) file.remove(file.prev);
			file.rename(file, file.prev);
		}
		save(file=file, list=c("factor", "param", "prediction", "IDs", "other", "data.train"));
		if(!is.null(prediction)){
			if(thisTestLoss == minTestLoss) file.copy(file, paste(out.dir,"/",name,".minTestLoss",sep=""), overwrite=TRUE);
		}
		save(file=paste(out.dir,"/param.",iter,sep=""), list=c("param"));
	}
	if(length(nSamples) != 1){
		nSamples = if(iter > 0) nSamples[iter] else 0;
	}
	
	summary = data.frame(
		Method="MCEM", Iter=iter, nSteps=nSamples, CDlogL=loglik,
		TestLoss=thisTestLoss, LossInTrain=attr(loglik,"loss"), TestRMSE=TestRMSE,
		TimeEStep=TimeEStep, TimeMStep=TimeMStep, TimeTest=TimeTest
	);
	file = paste(out.dir,"/summary",sep="");
	if(file.exists(file)) write.table(summary, file=file, append=TRUE,  quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
	else                  write.table(summary, file=file, append=FALSE, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE);
	if(verbose > 0){
		time.used.write = proc.time() - b.time.write;
		cat("wrote the model & summary info to disk (used ",time.used.write[3]," sec)\n",sep="");
	}
}