/src/multi-app/R/model/fit-EM.R

http://github.com/beechung/Latent-Factor-Models

### Copyright (c) 2011, Yahoo! Inc. All rights reserved.
### Copyrights licensed under the New BSD License. See the accompanying LICENSE file for terms.
###
### Author: Bee-Chung Chen
###
### Use the EM algorithm to fit the model
### NOTE:
###  * ridge.lambda[c("A", "B", "beta")] are the numbers to be added to the
###    diagonal when fitting the regression to get A, B, and beta
###
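
### --------------------------------------------------------------------------
### Illustration (not part of the original code): what adding ridge.lambda to
### the diagonal means.  A minimal, hypothetical sketch of a single ridge-
### regression solve; X, y and lambda below are made-up stand-ins for one of
### the regression sub-problems used to estimate A, B, or beta.
### --------------------------------------------------------------------------
if(FALSE){
    set.seed(1);
    X = matrix(rnorm(50*3), nrow=50, ncol=3);    # design matrix of one sub-problem
    y = X %*% c(1, -2, 0.5) + rnorm(50, sd=0.1); # observed targets
    lambda = 0.1;                                # e.g., ridge.lambda["A"]
    # Ridge solution: (X'X + lambda*I)^{-1} X'y
    coef.ridge = solve(crossprod(X) + lambda*diag(ncol(X)), crossprod(X, y));
    print(coef.ridge);
}
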
fit.EM <- function(
    # Input data
    feature,             # data.frame(user, app, index, x, w); see the usage sketch after this function
    response,            # data.frame(user, app, item, y, w)
    # Model setup
    nGlobalFactors,      # num of global factors per user
    nLocalFactors=NULL,  # num of local factors per user (may be a vector of length: #app)
    identity.A=FALSE, identity.B=FALSE, # whether A/B is an identity matrix
    is.y.logistic=FALSE, # whether to use a logistic model for y
    is.x.logistic=FALSE, # whether to use a logistic model for x
    # Test data (optional)
    test.feature=NULL,
    test.response=NULL,
    # Model-fitting parameters
    nIter=30, # num of EM iterations
    ridge.lambda=c(A=0, B=0, beta=0), # lambda values for ridge regression for A, B, beta
    keep.users.without.obs.in.E.step=FALSE,
    keep.users.without.obs.in.M.step=FALSE,
    fix.var_u=1, # fix var_u to some number (set to NULL if you do not want to fix it)
    # Initialization parameters
    param=NULL, # directly set all the parameters (if not NULL, the following will not be used)
    var_x=1, var_y=1, var_z=1, # initial var (may be vectors of length: #app)
    var_u=1, # initial var (length: 1)
    A.sd=1, B.sd=1, beta.sd=1, # A_k ~ N(mean=0, sd=A.sd), and so on.
    # Output options
    out.level=0,  # out.level=1: Save the model in out.dir/model.last and out.dir/model.minTestLoss
    out.dir=NULL, # out.level=2: Save the model after each iteration i in out.dir/model.i
    out.overwrite=FALSE, # whether to allow overwriting existing files
    # Debugging options
    debug=0, verbose=0, use.C=TRUE,
    show.marginal.loglik=FALSE
){
    option = 0;
    if(keep.users.without.obs.in.E.step) option = option + 1;
    if(keep.users.without.obs.in.M.step) option = option + 2;
    option = as.integer(option);
    if(!is.null(test.response) && out.level <= 0 && verbose <= 0) stop("!is.null(test.response) && out.level <= 0 && verbose <= 0");
    if(out.level > 0){
        if(is.null(out.dir)) stop("out.dir = NULL");
        if(file.exists(paste(out.dir,"/model.last",sep="")) && !out.overwrite) stop(out.dir," already exists!!");
        if(!file.exists(out.dir)) dir.create(out.dir, recursive=TRUE, mode="0755");
    }
    if(is.x.logistic){
        if(!is.null(feature$w)) stop("When is.x.logistic is TRUE, you cannot specify feature$w")
        feature = init.obs.logistic(feature, target="x");
    }
    if(is.y.logistic){
        if(!is.null(response$w)) stop("When is.y.logistic is TRUE, you cannot specify response$w")
        response = init.obs.logistic(response, target="y");
    }
    if(verbose > 0) cat("INITIALIZE THE PARAMETERS\n");
    begin.time.entire = proc.time();
    begin.time = proc.time();
    model = init.simple(
        feature=feature, response=response, nGlobalFactors=nGlobalFactors,
        param=param, nLocalFactors=nLocalFactors,
        var_x=var_x, var_y=var_y, var_z=var_z, var_u=var_u,
        identity.A=identity.A, identity.B=identity.B,
        is.x.logistic=is.x.logistic, is.y.logistic=is.y.logistic,
        A.sd=A.sd, B.sd=B.sd, beta.sd=beta.sd
    );
    if(!is.null(fix.var_u)) model$param$var_u = as.double(fix.var_u);
    time.used = proc.time() - begin.time;
    if(verbose > 0) cat("time used: ",time.used[3]," sec\n",sep="");
    size = check.syntax.all(feature=feature, response=response, param=model$param, factor=model$factor, check.indices=TRUE);
    if(!is.null(test.response)){
        check.syntax.all(feature=test.feature, response=test.response, param=model$param, factor=model$factor, check.indices=TRUE, test.data=TRUE);
    }
    if(verbose >= 2){
        cat("--------------------------------------------------------\n",
            " Problem Dimensionality:\n",
            "--------------------------------------------------------\n",sep="");
        print(size);
    }
    if(verbose > 0) cat("START THE EM-PROCEDURE\n");
    buffer = NULL;  buffer.addr = NULL;
    prediction = NULL;
    best.model = NULL;  best.testLoss = Inf;
    time.pred = NA;
    test.loss.y = rep(NA, nIter+1);  test.loss.x = rep(NA, nIter+1);
    marginal.loglik = rep(NA, nIter+1);
    if(!is.null(test.response)){
        begin.time = proc.time();
        prediction = predict.x.and.y(feature=test.feature, response=test.response, param=model$param, factor=model$factor);
        time.pred = proc.time() - begin.time;
        test.loss.y[1] = prediction$loss.y;
        test.loss.x[1] = prediction$loss.x;
        if(prediction$loss.y < best.testLoss){
            best.testLoss = prediction$loss.y;
            best.model = deepCopy(model);
        }
    }
    ans = output.results(
        method="EM", feature=feature, response=response, model=model, prediction=prediction,
        out.dir=out.dir, out.level=out.level,
        minTestLoss=best.testLoss, iter=0, show.marginal.loglik=show.marginal.loglik,
        TimeEM=time.used, TimeTest=time.pred, verbose=verbose,
        other=NULL, name="model"
    );
    marginal.loglik[1] = ans$marginal.loglik;

    for(iter in seq_len(nIter)){
        if(verbose > 0){
            cat("--------------------------------------------------------\n",
                " EM-Iteration: ",iter,"\n",
                "--------------------------------------------------------\n",sep="");
        }
        begin.time = proc.time();
        # variational approx for logistic model
        if(is.x.logistic){
            feature$w = get.var.logistic(     obs=feature, param=model$param, target="x", verbose=verbose);
            feature$x = get.response.logistic(obs=feature, param=model$param, target="x", verbose=verbose);
            model$param$var_x[] = 1;
        }
        if(is.y.logistic){
            response$w = get.var.logistic(     obs=response, param=model$param, target="y", verbose=verbose);
            response$y = get.response.logistic(obs=response, param=model$param, target="y", verbose=verbose);
            model$param$var_y[] = 1;
        }
        if(use.C){
            model.addr = get.address(model);
            buffer = fit.EM.one.iteration.C(
                feature=feature, response=response, param=model$param, factor=model$factor,
                ridge.lambda=ridge.lambda,
                option=option, debug=debug, verbose=verbose, buffer=buffer
            );
            # Now, model$param and model$factor contain the result after one EM iteration
            # (they are updated by reference, which is not the behavior of regular R functions)
            # sanity check
            temp = get.address(model);
            if(is.diff(model.addr,temp,precision=0)) stop("model address changed!!");
            if(is.null(buffer.addr)) buffer.addr = get.address(buffer);
            temp = get.address(buffer);
            if(is.diff(buffer.addr,temp,precision=0)) stop("buffer address changed!!");
        }else{
            model = fit.EM.one.iteration.R(feature=feature, response=response, param=model$param)
        }
        if(!is.null(fix.var_u)) model$param$var_u = as.double(fix.var_u);
        # variational approx for logistic model
        if(is.x.logistic){
            mean.score = predict.x.from.z(feature=feature, param=model$param, z=model$factor$z, add.noise=FALSE);
            model$param = update.param.logistic(param=model$param, mean.score=mean.score, var.score=model$factor$var.x.score, target="x");
        }
        if(is.y.logistic){
            mean.score = predict.y.from.z(response=response, param=model$param, z=model$factor$z, add.noise=FALSE);
            model$param = update.param.logistic(param=model$param, mean.score=mean.score, var.score=model$factor$var.y.score, target="y");
        }
        time.used = proc.time() - begin.time;
        if(verbose > 0) cat("time used: ",time.used[3]," sec\n",sep="");
        if(!is.null(test.response)){
            begin.time = proc.time();
            prediction = predict.x.and.y(feature=test.feature, response=test.response, param=model$param, factor=model$factor);
            time.pred = proc.time() - begin.time;
            test.loss.y[iter+1] = prediction$loss.y;
            test.loss.x[iter+1] = prediction$loss.x;
            if(prediction$loss.y < best.testLoss){
                best.testLoss = prediction$loss.y;
                best.model = deepCopy(model);
            }
        }
        ans = output.results(
            method="EM", feature=feature, response=response, model=model, prediction=prediction,
            out.dir=out.dir, out.level=out.level,
            minTestLoss=best.testLoss, iter=iter, show.marginal.loglik=show.marginal.loglik,
            TimeEM=time.used, TimeTest=time.pred, verbose=verbose,
            other=NULL, name="model"
        );
        marginal.loglik[iter+1] = ans$marginal.loglik;
    }
    time.used = proc.time() - begin.time.entire;
    if(verbose > 0) cat("END OF THE EM-PROCEDURE\nTotal time used: ",time.used[3]," sec\n",sep="");
    out = list(model=model, model.min.test.loss=best.model, test.loss.x=test.loss.x, test.loss.y=test.loss.y, marginal.loglik=marginal.loglik);
    return(out);
}
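
### --------------------------------------------------------------------------
### Illustration (not part of the original code): a hypothetical call to fit.EM.
### A minimal sketch only; the toy data below merely follow the documented
### schema (feature = data.frame(user, app, index, x, w) with w optional;
### response = data.frame(user, app, item, y, w)), and all sizes, values and
### settings are made up for illustration.
### --------------------------------------------------------------------------
if(FALSE){
    set.seed(0);
    nUsers = 20;  nApps = 2;
    # Toy feature table: one row per observed feature value x
    feature = data.frame(
        user  = as.integer(sample(1:nUsers, 100, replace=TRUE)),
        app   = as.integer(sample(1:nApps,  100, replace=TRUE)),
        index = as.integer(sample(1:5,      100, replace=TRUE)),
        x     = rnorm(100)
    );
    # Toy response table: one row per observed response y
    response = data.frame(
        user = as.integer(sample(1:nUsers, 200, replace=TRUE)),
        app  = as.integer(sample(1:nApps,  200, replace=TRUE)),
        item = as.integer(sample(1:10,     200, replace=TRUE)),
        y    = rnorm(200)
    );
    fit = fit.EM(
        feature=feature, response=response,
        nGlobalFactors=3, nLocalFactors=2,
        nIter=10, ridge.lambda=c(A=0.1, B=0.1, beta=0.1),
        verbose=1
    );
    str(fit$model$param);          # fitted parameters
    str(fit$model.min.test.loss);  # NULL here, since no test data was given
}
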
###
### Initialization
### (the structure of the returned list(param, factor) is sketched after this function)
###
init.simple <- function(
    feature, response, nGlobalFactors,
    param=NULL,          # directly set all the parameters
    nLocalFactors=NULL,  # which may be an array of length: nApps
    var_x=1, var_y=1, var_z=1, # which may be arrays of length: nApps
    var_u=1, identity.A=FALSE, identity.B=FALSE,
    is.y.logistic=FALSE, # whether to use a logistic model for y
    is.x.logistic=FALSE, # whether to use a logistic model for x
    A.sd=1, B.sd=1, beta.sd=1
){
    check.syntax.obs(feature=feature, response=response);
    nApps  = max(feature$app,  response$app);
    nUsers = max(feature$user, response$user);
    if(!is.null(param)){
        size = check.syntax.param(param);
        if(nApps != size$nApps) stop("Input param has ",size$nApps," applications, but the input data has ",nApps);
        if(nGlobalFactors != size$nGlobalFactors) stop("Input param has ",size$nGlobalFactors," global factors per user, but you specify ",nGlobalFactors);
        if(is.null(nLocalFactors)){
            nLocalFactors = size$nLocalFactors;
        }else{
            if(length(nLocalFactors) == 1) nLocalFactors = rep(nLocalFactors, nApps);
            if(length(nLocalFactors) != nApps) stop("length(nLocalFactors) != nApps");
            if(any(nLocalFactors != size$nLocalFactors)) stop("Input param has different nLocalFactors than your specification");
        }
        z = list();
        for(k in seq_len(nApps)){
            z[[k]] = matrix(0.0, nrow=nUsers, ncol=nLocalFactors[k]);
        }
        factor = list(u=matrix(0.0, nrow=nUsers, ncol=nGlobalFactors), z=z);
        out = list(param=param, factor=factor);
        return(out);
    }
    if(!identity.B && is.null(nLocalFactors)) stop("Please specify nLocalFactors");
    if(is.null(nLocalFactors)) nLocalFactors = rep(NA, nApps);
    if(length(nLocalFactors) == 1) nLocalFactors = rep(nLocalFactors, nApps);
    if(length(var_x) == 1) var_x = rep(var_x, nApps);
    if(length(var_y) == 1) var_y = rep(var_y, nApps);
    if(length(var_z) == 1) var_z = rep(var_z, nApps);
    if(length(nLocalFactors) != nApps) stop("length(nLocalFactors) != nApps");
    if(length(var_x) != nApps) stop("length(var_x) != nApps");
    if(length(var_y) != nApps) stop("length(var_y) != nApps");
    if(length(var_z) != nApps) stop("length(var_z) != nApps");
    A = list();  B = list();  b = list();  alpha = list();  beta = list();
    temp.f = tapply(seq_len(length(feature$app)),  list(feature$app),  FUN=c, simplify=FALSE);
    temp.r = tapply(seq_len(length(response$app)), list(response$app), FUN=c, simplify=FALSE);
    for(k in seq_len(nApps)){
        m = as.character(k); # IMPORTANT: use a string to access temp.f and temp.r
        select = temp.f[[m]];
        obs = feature[select,];
        if(length(select) == 0){
            if(identity.B) nLocalFactors[k] = 0;
            B[[k]] = matrix(0.0, nrow=0, ncol=nLocalFactors[k]);
            b[[k]] = rep(0.0, 0);
        }else if(identity.B){
            nFeatures = max(obs$index);
            if(is.na(nLocalFactors[k])) nLocalFactors[k] = nFeatures
            else if(nLocalFactors[k] != nFeatures) stop("Misspecification of nLocalFactors[",k,"] = ",nLocalFactors[k]," with identity.B");
            B[[k]] = 1.0
            b[[k]] = rep(0.0, nFeatures);
        }else{
            nFeatures = max(obs$index);
            B[[k]] = matrix(rnorm(nFeatures*nLocalFactors[k],sd=B.sd), nrow=nFeatures, ncol=nLocalFactors[k]);
            b[[k]] = rep(0.0, nFeatures);
            agg = aggregate(obs$x, by=list(by=obs$index), FUN=mean);
            b[[k]][agg$by] = agg$x;
        }
        if(identity.A){
            A[[k]] = 1;
            if(nLocalFactors[k] != nGlobalFactors) stop("Misspecification of nLocalFactors[",k,"] = ",nLocalFactors[k]," with identity.A");
        }else{
            A[[k]] = matrix(rnorm(nLocalFactors[k]*nGlobalFactors,sd=A.sd), nrow=nLocalFactors[k], ncol=nGlobalFactors);
        }
        select = temp.r[[m]];
        obs = response[select,];
        if(length(select) == 0){
            beta[[k]]  = matrix(0.0, nrow=0, ncol=nLocalFactors[k]);
            alpha[[k]] = rep(0.0, 0);
        }else{
            nItems = max(obs$item);
            beta[[k]]  = matrix(rnorm(nItems*nLocalFactors[k],sd=beta.sd), nrow=nItems, ncol=nLocalFactors[k]);
            alpha[[k]] = rep(0.0, nItems);
            agg = aggregate(obs$y, by=list(by=obs$item), FUN=mean); # note: aggregate() names the value column "x"
            alpha[[k]][agg$by] = agg$x;
        }
    }
    param = list(A=A, B=B, b=b, alpha=alpha, beta=beta, var_x=var_x, var_y=var_y, var_z=var_z, var_u=var_u,
                 is.x.logistic=is.x.logistic, is.y.logistic=is.y.logistic);
    z = list();
    for(k in seq_len(nApps)){
        z[[k]] = matrix(0.0, nrow=nUsers, ncol=nLocalFactors[k]);
    }
    factor = list(u=matrix(0.0, nrow=nUsers, ncol=nGlobalFactors), z=z);
    if(is.x.logistic){
        param = init.param.logistic(param, feature, value=1.0, target="x");
        factor$var.x.score = rep(1.0, nrow(feature));
    }
    if(is.y.logistic){
        param = init.param.logistic(param, response, value=1.0, target="y");
        factor$var.y.score = rep(1.0, nrow(response));
    }
    out = list(param=param, factor=factor);
    return(out);
}
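
### --------------------------------------------------------------------------
### Illustration (not part of the original code): the structure returned by
### init.simple.  A minimal sketch, assuming `feature` and `response` follow
### the documented schema (e.g., the toy tables sketched after fit.EM above);
### the factor sizes are made up for illustration.
### --------------------------------------------------------------------------
if(FALSE){
    model = init.simple(feature=feature, response=response,
                        nGlobalFactors=3, nLocalFactors=2);
    # model$param:  A, B, b, alpha, beta are lists with one element per application;
    #               var_x, var_y, var_z have length nApps; var_u has length 1
    # model$factor: u is nUsers x nGlobalFactors; z[[k]] is nUsers x nLocalFactors[k]
    str(model$param);
    str(model$factor);
}
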
###
### One EM iteration
### Run the E-step once, then the M-step once
### Input:
###   feature  = data.frame(user, app, index, x, w)
###   response = data.frame(user, app, item, y, w)
###   param    = list(A, B, b, alpha, beta, var_x, var_y, var_z, var_u)
### Output: list(param, factor)
###   param: The parameter values after the M-step
###   factor=list(u, z): The posterior mean of u and z
###   if(is.x.logistic) factor$var.x.score[ikm] = Var[B_{k,m} z_{ik}]
###   if(is.y.logistic) factor$var.y.score[ijk] = Var[beta_{jk} z_{ik}]
fit.EM.one.iteration.R <- function(feature, response, param){
    stop("Please implement this function");
}
### IMPORTANT NOTE:
###  * In the C version, to reduce the memory footprint, the content of the input
###    param and factor will be changed to the updated values.
###    This is call-by-reference semantics, which is NOT the behavior of regular R functions.
###    buffer is the temp space (which can be NULL) and is also passed by reference.
###  * ridge.lambda[c("A", "B", "beta")] are the numbers to be added to the
###    diagonal when fitting the regression to get A, B, and beta
###  * buffer = list(A, B, b, alpha, beta, z)
###    contains the packed version of A, B, b, ... for the C/C++ functions
###    (see the usage sketch at the end of this file)
fit.EM.one.iteration.C <- function(
    feature, response, param, factor,
    ridge.lambda=c(A=0, B=0, beta=0),
    buffer=NULL, option=0, debug=0, verbose=0
){
    size = check.syntax.all(feature=feature, response=response, param=param, factor=factor);
    if(is.null(buffer)) buffer = list()
    else check_names(buffer, "buffer", required=c("A", "B", "b", "alpha", "beta", "z"));
    if(length(ridge.lambda) != 3) stop("length(ridge.lambda) != 3");
    if(any(c("A", "B", "beta") != names(ridge.lambda))) stop("ridge.lambda must be a named vector with names: A, B, beta (in this order)");
    for(name in c("A", "B", "b", "alpha", "beta")){
        if(is.null(buffer[[name]])) buffer[[name]] = pack.list.of.matrices(param[[name]])
        else pack.list.of.matrices(param[[name]], output=buffer[[name]]);
        check_type_size(buffer[[name]]$data, "double", NA);
        check_type_size(buffer[[name]]$dim,  "int",    NA);
    }
    for(name in c("z")){
        if(is.null(buffer[[name]])) buffer[[name]] = pack.list.of.matrices(factor[[name]])
        else pack.list.of.matrices(factor[[name]], output=buffer[[name]]);
        check_type_size(buffer[[name]]$data, "double", NA);
        check_type_size(buffer[[name]]$dim,  "int",    NA);
    }
    for(name in c("var_x", "var_y", "var_z")){
        check_type_size(param[[name]], "double", size$nApps);
    }
    check_type_size(param[["var_u"]], "double", 1);
    check_type_size(factor[["u"]], "double", c(size$nUsers, size$nGlobalFactors));
    for(name in c("user", "app", "index")){
        check_type_size(feature[[name]], "int", nrow(feature));
    }
    check_type_size(feature[["x"]], "double", nrow(feature));
    if(is.null(feature[["w"]])){
        feature_has_w = as.integer(0);
    }else{
        feature_has_w = as.integer(1);
        check_type_size(feature[["w"]], "double", nrow(feature));
    }
    for(name in c("user", "app", "item")){
        check_type_size(response[[name]], "int", nrow(response));
    }
    check_type_size(response[["y"]], "double", nrow(response));
    if(is.null(response[["w"]])){
        response_has_w = as.integer(0);
    }else{
        response_has_w = as.integer(1);
        check_type_size(response[["w"]], "double", nrow(response));
    }
    for(name in c("nFeatures", "nItems", "nLocalFactors")){
        check_type_size(size[[name]], "int", size$nApps)
    }
    check_type_size(factor[["var.y.score"]], "double", length(factor[["var.y.score"]]));
    check_type_size(factor[["var.x.score"]], "double", length(factor[["var.x.score"]]));

    .C("EM_one_iteration",
        # Parameters: INPUT and OUTPUT
        buffer[["A"]]$data, buffer[["A"]]$dim, buffer[["B"]]$data, buffer[["B"]]$dim, buffer[["b"]]$data, buffer[["b"]]$dim,
        buffer[["alpha"]]$data, buffer[["alpha"]]$dim, buffer[["beta"]]$data, buffer[["beta"]]$dim,
        param[["var_x"]], param[["var_y"]], param[["var_z"]], param[["var_u"]],
        # Posterior mean of factors: OUTPUT
        factor[["u"]], buffer[["z"]]$data, buffer[["z"]]$dim,
        # Posterior variance for logistic regression: OUTPUT
        factor[["var.x.score"]], as.integer(length(factor[["var.x.score"]])),
        factor[["var.y.score"]], as.integer(length(factor[["var.y.score"]])),
        # Feature table: INPUT
        feature[["user"]], feature[["app"]], feature[["index"]], feature[["x"]], feature[["w"]],
        as.integer(nrow(feature)), feature_has_w,
        # Response table: INPUT
        response[["user"]], response[["app"]], response[["item"]], response[["y"]], response[["w"]],
        as.integer(nrow(response)), response_has_w,
        # Ridge regression parameters: INPUT
        as.double(ridge.lambda), as.integer(length(ridge.lambda)),
        # Size information: INPUT
        as.integer(size$nApps), as.integer(size$nUsers), as.integer(size$nGlobalFactors),
        size$nFeatures, size$nItems, size$nLocalFactors,
        # Others
        as.integer(option), as.integer(verbose), as.integer(debug),
        DUP=FALSE
    );
    # Unpack the updated parameters and factors back into param and factor (in place)
    for(name in c("A", "B", "b", "alpha", "beta")){
        unpack.list.of.matrices(buffer[[name]], output=param[[name]]);
    }
    unpack.list.of.matrices(buffer[["z"]], output=factor[["z"]]);
    return(buffer);
}
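
### --------------------------------------------------------------------------
### Illustration (not part of the original code): driving fit.EM.one.iteration.C
### directly, mirroring the loop inside fit.EM.  A minimal sketch, assuming
### `model` came from init.simple() and `feature`/`response` follow the
### documented schema; the ten iterations below are an arbitrary choice.
### Remember that model$param and model$factor are updated IN PLACE, and that
### the buffer returned by the first call should be passed back in to reuse
### the packed temp space.
### --------------------------------------------------------------------------
if(FALSE){
    buffer = NULL;
    for(iter in 1:10){
        buffer = fit.EM.one.iteration.C(
            feature=feature, response=response,
            param=model$param, factor=model$factor,
            ridge.lambda=c(A=0, B=0, beta=0),
            buffer=buffer, option=0, debug=0, verbose=1
        );
        # model$param and model$factor now hold the values after this iteration
    }
}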