multicontext_model.R - The R code runs a multicontext model…

/src/R/examples/multicontext_model.R

http://github.com/beechung/Latent-Factor-Models · R · 270 lines · 164 code · 11 blank · 95 comment · 0 complexity · cc1b1b8d1299df22f682c83a8559de94 MD5 · raw file

### Copyright (c) 2011, Yahoo! Inc.  All rights reserved.
### Copyrights licensed under the New BSD License. See the accompanying LICENSE file for terms.
### 
### Author: Bee-Chung Chen

###
### Preparation:
###    (1) Set your path/alias to run the right version of R
###    (2) make  (in public-factor-models/, not in any subdirectory)
###    (3) Take a look at public-factor-models/src/R/model/Notation-multicontext.txt
###        for the specification of the model.
###    (4) Run R (in public-factor-models/, not in any subdirectory)
###

###
### Example 1: Run the fitting code with synthetic data
###
# (1) Generate some data
#     The generator is defined in src/R/model/multicontext_model_genData.R
library(Matrix)
dyn.load("lib/c_funcs.so")
# Source the helper scripts, one after another, in the listed order.
invisible(lapply(
  c(
    "src/R/c_funcs.R",
    "src/R/util.R",
    "src/R/model/util.R",
    "src/R/model/multicontext_model_genData.R",
    "src/R/model/multicontext_model_utils.R"
  ),
  source
))
# Fix the random seed before generating the data.
set.seed(0)
d <- generate.GaussianData(
  nSrcNodes = 203, nDstNodes = 203, nObs = 10003,
  nSrcContexts = 4, nDstContexts = 5, nEdgeContexts = 3,
  nFactors = 2, has.gamma = FALSE, has.u = TRUE,
  nObsFeatures = 2, nSrcFeatures = 3, nDstFeatures = 3, nCtxFeatures = 1,
  b.sd = 1, g0.sd = 1, d0.sd = 1, h0.sd = 0,
  G.sd = 1, D.sd = 1, H.sd = 0, q.sd = 1, r.sd = 1,
  q.mean = 5, r.mean = 5,
  var_y = 0.1, var_alpha = 0.5, var_beta = 0.5,
  var_gamma = 1, var_v = 1, var_u = 1, var_w = 1,
  var_alpha_global = 0.2, var_beta_global = 0.2,
  has.intercept = FALSE,
  sparse.matrices = TRUE, frac.zeroFeatures = 0.2
)
# (2) Create training/test split
#     Each observation is assigned to the training set when its uniform draw
#     on [0, 1] falls below 0.75; all remaining observations form the test set.
select.train <- runif(nrow(d$obs), min = 0, max = 1) < 0.75
obs <- d$obs
names(obs) <- c("src_id", "dst_id", "src_context", "dst_context", "ctx_id", "y")
# The observation features are converted to a plain matrix and then to a
# data frame; drop=FALSE keeps the matrix shape when subsetting rows.
obs.train   <- obs[select.train, ]
x_obs.train <- data.frame(as.matrix(d$feature$x_obs)[select.train, , drop = FALSE])
obs.test    <- obs[!select.train, ]
x_obs.test  <- data.frame(as.matrix(d$feature$x_obs)[!select.train, , drop = FALSE])
# Node feature tables: the first column is the node ID (row number), followed
# by the feature columns.  seq_len() is used instead of 1:nrow(...) so that an
# empty feature matrix yields an empty ID vector rather than c(1, 0).
x_src <- data.frame(src_id = seq_len(nrow(d$feature$x_src)), as.matrix(d$feature$x_src))
x_dst <- data.frame(dst_id = seq_len(nrow(d$feature$x_dst)), as.matrix(d$feature$x_dst))
# Input data tables prepared above: obs.train, obs.test, x_src, x_dst
#
# obs.train and obs.test hold the training and test rating data.
#      Their columns are:
#      1. src_id: e.g., user_id
#      2. dst_id: e.g., item_id
#      3. src_context: (optional) the context in which the source node gives the rating
#      4. dst_context: (optional) the context in which the destination node receives the rating
#      5. ctx_id:      (optional) the context of this (src_id, dst_id) pair
#      6. y: the rating that the source node gives the destination node
# Note: Any or all of src_context, dst_context, ctx_id may be set to NULL if there
#       is no context info, or all of them may be set to the same vector.
#       Keep the number of contexts small; otherwise, the program will be very slow.
str(obs.train)  # inspect the data structure
str(obs.test)   # inspect the data structure
# x_src is the source node (e.g., user) feature table.
#       Its first column, src_id, specifies the source node ID.
str(x_src)      # inspect the data structure
# x_dst is the destination node (e.g., item) feature table.
#       Its first column, dst_id, specifies the destination node ID.
str(x_dst)      # inspect the data structure

# (3) Index training data
#     See src/R/model/multicontext_model_utils.R: indexData() for details
#     Note: the original call ended with a trailing comma, which passes an
#     extra empty argument to the function; it has been removed.
data.train <- indexData(
  obs = obs.train, src.dst.same = TRUE, rm.self.link = TRUE,
  x_obs = x_obs.train, x_src = x_src, x_dst = x_dst,
  add.intercept = FALSE
)
# (4) Index test data
#     See src/R/model/multicontext_model_utils.R: indexTestData() for details
#     The indexed training data is passed in together with the test tables.
#     (Trailing comma removed here as well.)
data.test <- indexTestData(
  data.train = data.train, obs = obs.test,
  x_obs = x_obs.test, x_src = x_src, x_dst = x_dst
)
# (5) Setup the model(s) to be fitted
#     See src/R/model/multicontext_model_EM.R: run.multicontext(), fit.multicontext()
#     Note run.multicontext() is a wrapper to fit multiple models using fit.multicontext().
#     Each row of this table describes one model; TRUE/FALSE are spelled out
#     because T and F are ordinary variables that can be reassigned.
setting <- data.frame(
  name          = c("wuv", "wvv"),
  nFactors      = c(    2,     2), # number of interaction factors
  has.u         = c( TRUE, FALSE), # whether to use u_i' v_j or v_i' v_j
  has.gamma     = c(FALSE, FALSE), # just set to FALSE
  nLocalFactors = c(    0,     0), # just set to 0
  is.logistic   = c(FALSE, FALSE)  # whether to use the logistic model for binary rating
)
# Load the compiled library and source the R code used for fitting.
# (Several of these files were already loaded in step (1); they are loaded
# again here together with the M-step and EM code.)
dyn.load("lib/c_funcs.so")
invisible(lapply(
  c(
    "src/R/c_funcs.R",
    "src/R/util.R",
    "src/R/model/util.R",
    "src/R/model/multicontext_model_genData.R",
    "src/R/model/multicontext_model_utils.R",
    "src/R/model/multicontext_model_MStep.R",
    "src/R/model/multicontext_model_EM.R"
  ),
  source
))
# Fix the random seed before running the fitting code.
set.seed(2)
# (6) Run the fitting code
#     See src/R/model/multicontext_model_EM.R: run.multicontext(), fit.multicontext()
#     run.multicontext() is a wrapper that fits multiple models using fit.multicontext().
#
#     Output options:
#       out.level=1: Save the factor & parameter values to out.dir/model.last and out.dir/model.minTestLoss
#       out.level=2: Save the factor & parameter values of each iteration i to out.dir/model.i
#     debug:   Set to 0 to disable internal sanity checking; Set to 100 for most detailed sanity checking
#     verbose: Set to 0 to disable console output; Set to 100 to print everything to the console
ans <- run.multicontext(
  obs = data.train$obs,              # Observation table
  feature = data.train$feature,      # Features
  setting = setting,                 # Model setting
  nSamples = 200,                    # Number of samples drawn in each E-step: could be a vector of size nIter.
  nBurnIn = 20,                      # Number of burn-in draws before taking samples for the E-step: could be a vector of size nIter.
  nIter = 20,                        # Number of EM iterations
  test.obs = data.test$obs,          # Test data: Observations for testing (optional)
  test.feature = data.test$feature,  #            Features for testing     (optional)
  ridge.lambda = 1,
  IDs = data.test$IDs,
  out.level = 1,
  out.dir = "/tmp/test",
  out.overwrite = TRUE,              # whether to overwrite the output directory if it exists
  debug = 0,
  verbose = 1,
  verbose.M = 2
)
# Some warning messages may show up; they are mostly debugging messages and do not mean real problems.

# (7) Check the model summary
ans$summary[, c(
  "name", "nFactors", "has.u", "has.gamma", "nLocalFactors",
  "is.logistic", "best.test.loss", "last.test.loss"
)]

# (8) Load the fitted model(s)
#     Only the "wuv" model is used here as an example
# (8.1) Check the summary file
read.table("/tmp/test_wuv/summary", header = TRUE, sep = "\t", as.is = TRUE)
# (8.2) Load the model
load("/tmp/test_wuv/model.last")
#       After loading, factor and param contain the fitted model
str(factor)
str(param)
# (8.3) Make prediction
prediction <- predict.multicontext(
  model = list(factor = factor, param = param),
  obs = data.test$obs,
  feature = data.test$feature,
  is.logistic = FALSE
)
# prediction$pred.y now contains the predicted rating for data.test$obs
str(prediction)


###
### Example 2: Run the fitting code with synthetic data using SPARSE feature matrix
###
# (1) Generate some data
#     The generator is defined in src/R/model/multicontext_model_genData.R
library(Matrix)
dyn.load("lib/c_funcs.so")
# Source the helper scripts, one after another, in the listed order.
invisible(lapply(
  c(
    "src/R/c_funcs.R",
    "src/R/util.R",
    "src/R/model/util.R",
    "src/R/model/multicontext_model_genData.R",
    "src/R/model/multicontext_model_utils.R"
  ),
  source
))
# Fix the random seed before generating the data.
set.seed(0)
d <- generate.GaussianData(
  nSrcNodes = 1003, nDstNodes = 1003, nObs = 100003,
  nSrcContexts = 3, nDstContexts = 3, nEdgeContexts = 1,
  nFactors = 3, has.gamma = FALSE, has.u = FALSE,
  nObsFeatures = 13, nSrcFeatures = 19, nDstFeatures = 23, nCtxFeatures = 1,
  b.sd = 1, g0.sd = 1, d0.sd = 1, h0.sd = 0,
  G.sd = 1, D.sd = 1, H.sd = 0, q.sd = 1, r.sd = 1,
  q.mean = 5, r.mean = 5,
  var_y = 0.1, var_alpha = 0.5, var_beta = 0.5,
  var_gamma = 1, var_v = 1, var_u = 1, var_w = 1,
  var_alpha_global = 0.2, var_beta_global = 0.2,
  has.intercept = FALSE,
  sparse.matrices = TRUE, index.value.format = TRUE, frac.zeroFeatures = 0.5
)
# Rename the observation columns, drop the edge context column (ctx_id),
# and keep the result as the rating table.
names(d$obs) <- c("src_id", "dst_id", "src_context", "dst_context", "ctx_id", "y")
d$obs$ctx_id <- NULL
rating.data <- d$obs

# Each feature table is sorted by its row and col columns and then renamed
# to (<id column>, index, value).
x_obs <- d$feature$x_obs
x_obs <- x_obs[order(x_obs$row, x_obs$col), ]
names(x_obs) <- c("obs_id", "index", "value")

x_src <- d$feature$x_src
x_src <- x_src[order(x_src$row, x_src$col), ]
names(x_src) <- c("src_id", "index", "value")

x_dst <- d$feature$x_dst
x_dst <- x_dst[order(x_dst$row, x_dst$col), ]
names(x_dst) <- c("dst_id", "index", "value")

#
# Input data: rating.data, x_obs, x_src, x_dst (prepare these four tables for your own data)
# Note: All ID numbers start from 1 (not 0)
#
str(rating.data)  # inspect the data structure
# rating.data is the rating data table with the following columns:
#      1. src_id: e.g., user_id or voter_id
#      2. dst_id: e.g., item_id or author_id
#      3. src_context: (optional) the context in which the source node gives the rating
#      4. dst_context: (optional) the context in which the destination node receives the rating
#      5. y: the rating that the source node gives the destination node
#      6. ctx_id: (optional) the context of this (src_id, dst_id) pair
# Note: Any or all of src_context, dst_context, ctx_id may be set to NULL if there is no context info
#       Keep the number of contexts small; otherwise, the program will be very slow
str(x_obs)
# x_obs is the feature table for observations with the following columns:
#      1. obs_id: observation ID (obs_id=n corresponds to the nth row of rating.data)
#      2. index:  feature index
#      3. value:  feature value
str(x_src)
# x_src is the feature table for source nodes with the following columns:
#      1. src_id: source node ID (this corresponds to the src_id column in rating.data)
#      2. index:  feature index
#      3. value:  feature value
str(x_dst)
# x_dst is the feature table for destination nodes with the following columns:
#      1. dst_id: destination node ID (this corresponds to the dst_id column in rating.data)
#      2. index:  feature index
#      3. value:  feature value

# (2) Create training/test split
#     75% of the row numbers of rating.data (rounded down) are sampled for
#     training; the remaining row numbers form the test set.
set.seed(1)
select.train <- sample(nrow(rating.data), floor(nrow(rating.data) * 0.75))
# seq_len() is used instead of 1:nrow(...) so that an empty rating table
# gives an empty test selection rather than the row numbers c(1, 0).
select.test <- setdiff(seq_len(nrow(rating.data)), select.train)

# Training part: obs_id is re-mapped with match() to the position of the
# observation inside select.train, i.e., its row number in obs.train.
obs.train <- rating.data[select.train, ]
x_obs.train <- x_obs[x_obs$obs_id %in% select.train, ]
x_obs.train$obs_id <- match(x_obs.train$obs_id, select.train)

# Test part: same re-mapping, relative to select.test / obs.test.
obs.test <- rating.data[select.test, ]
x_obs.test <- x_obs[x_obs$obs_id %in% select.test, ]
x_obs.test$obs_id <- match(x_obs.test$obs_id, select.test)

# (3) Index training data
#     See src/R/model/multicontext_model_utils.R: indexData() for details
#     Note: the original call ended with a trailing comma, which passes an
#     extra empty argument to the function; it has been removed.
data.train <- indexData(
  obs = obs.train, src.dst.same = TRUE, rm.self.link = TRUE,
  x_obs = x_obs.train, x_src = x_src, x_dst = x_dst,
  add.intercept = FALSE
)
# (4) Index test data
#     See src/R/model/multicontext_model_utils.R: indexTestData() for details
#     The indexed training data is passed in together with the test tables.
#     (Trailing comma removed here as well.)
data.test <- indexTestData(
  data.train = data.train, obs = obs.test,
  x_obs = x_obs.test, x_src = x_src, x_dst = x_dst
)
# (5) Setup the model(s) to be fitted
#     See src/R/model/multicontext_model_EM.R: run.multicontext(), fit.multicontext()
#     Note run.multicontext() is a wrapper to fit multiple models using fit.multicontext().
#     Each row of this table describes one model; TRUE/FALSE are spelled out
#     because T and F are ordinary variables that can be reassigned.
setting <- data.frame(
  name          = c( "uv",  "vv"),
  nFactors      = c(    3,     3), # number of interaction factors
  has.u         = c( TRUE, FALSE), # whether to use u_i' v_j or v_i' v_j
  has.gamma     = c(FALSE, FALSE), # just set to FALSE
  nLocalFactors = c(    0,     0), # just set to 0
  is.logistic   = c(FALSE, FALSE)  # whether to use the logistic model for binary rating
)
# (6) Run the fitting code
#     See src/R/model/multicontext_model_EM.R: run.multicontext(), fit.multicontext()
#     run.multicontext() is a wrapper that fits multiple models using fit.multicontext().
#
# Load the compiled library and source the R code used for fitting,
# including the GLMNet code that is passed as reg.algo below.
dyn.load("lib/c_funcs.so")
invisible(lapply(
  c(
    "src/R/c_funcs.R",
    "src/R/util.R",
    "src/R/model/util.R",
    "src/R/model/multicontext_model_genData.R",
    "src/R/model/multicontext_model_utils.R",
    "src/R/model/multicontext_model_MStep.R",
    "src/R/model/multicontext_model_EM.R",
    "src/R/model/GLMNet.R"
  ),
  source
))
# Seed value handed to run.multicontext() via rnd.seed.init and rnd.seed.fit.
rnd.seed <- 1
# Fit the models listed in setting.
#   Output options:
#     out.level=1: Save the factor & parameter values to out.dir/model.last and out.dir/model.minTestLoss
#     out.level=2: Save the factor & parameter values of each iteration i to out.dir/model.i
#   debug:   Set to 0 to disable internal sanity checking; Set to 100 for most detailed sanity checking
#   verbose: Set to 0 to disable console output; Set to 100 to print everything to the console
ans <- run.multicontext(
  obs = data.train$obs,              # Observation table
  feature = data.train$feature,      # Features
  setting = setting,                 # Model setting
  nSamples = 200,                    # Number of samples drawn in each E-step: could be a vector of size nIter.
  nBurnIn = 20,                      # Number of burn-in draws before taking samples for the E-step: could be a vector of size nIter.
  nIter = 10,                        # Number of EM iterations
  test.obs = data.test$obs,          # Test data: Observations for testing (optional)
  test.feature = data.test$feature,  #            Features for testing     (optional)
  reg.algo = GLMNet,
  IDs = data.test$IDs,
  rnd.seed.init = rnd.seed, rnd.seed.fit = rnd.seed,
  out.level = 1,
  out.dir = "/tmp/test",
  out.overwrite = TRUE,              # whether to overwrite the output directory if it exists
  debug = 0,
  verbose = 1,
  verbose.M = 2
)
# Show the per-model summary columns of interest.
ans$summary[, c(
  "name", "nFactors", "has.u", "has.gamma", "nLocalFactors",
  "is.logistic", "best.test.loss", "last.test.loss"
)]