# Pairwise ranking (LambdaMart-style) example for the gbm package.
# RANKING EXAMPLE
cat("Running ranking (LambdaMart) example.\n")
# Create synthetic data that shows how pairwise training can be better
# Note: no claim to represent 'real world' data!
generate.data <- function(N) {
  # Create a synthetic ranking dataset of N items partitioned into
  # roughly N/25 query groups. The target is Y = X1 + X2 + noise, where
  # X1 varies only between groups, X2 varies per item (but is then
  # contaminated with group-level noise), and X3 is unrelated to Y.
  # Note: no claim to represent 'real world' data!
  n.groups <- floor(N / 25)
  group.id <- sample(1:n.groups, N, replace = TRUE)
  # X1: one level per group, shared by all items in that group.
  group.effect <- runif(n.groups)
  X1 <- group.effect[group.id]
  # X2: the only item-level feature carrying signal.
  X2 <- runif(N)
  # X3: a feature with no relation to the target.
  X3 <- runif(N)
  # Noise-free target.
  Y <- X1 + X2
  # Inject group-correlated noise into X2 (uncorrelated across items),
  # giving X2 a large between-group variance.
  X2 <- X2 + scale(runif(n.groups))[group.id]
  # Perturb the target with uniform noise scaled for a fixed
  # signal-to-noise ratio.
  snr <- 5
  noise.scale <- sqrt(var(Y) / snr)
  Y <- Y + runif(N, 0, noise.scale)
  data.frame(Y, query = group.id, X1, X2, X3)
}
cat('Generating data\n')
# Number of training observations (about 40 query groups of ~25 items).
# Use `<-` for assignment, per R convention.
N <- 1000
data.train <- generate.data(N)
# Now we fit 3 different models to the same data:
# * Gaussian
# * Pairwise with NDCG ranking metric
# * Pairwise with CONC (fraction of concordant pairs) ranking metric
# Baseline model: ordinary boosted regression with squared-error loss,
# ignoring the query-group structure entirely.
cat('Fitting a model with gaussian loss function\n')
gbm.gaussian <- gbm(
  Y ~ X1 + X2 + X3,              # formula
  data = data.train,             # dataset
  distribution = 'gaussian',     # squared-error loss
  n.trees = 2000,                # number of boosting iterations
  shrinkage = 0.005,             # learning rate
  interaction.depth = 3,         # number of splits per tree
  bag.fraction = 0.5,            # subsampling fraction per tree
  train.fraction = 1,            # use all rows for training
  n.minobsinnode = 10,           # minimum observations per node
  keep.data = TRUE,              # store a copy of the input data
  cv.folds = 5,                  # cross-validation folds
  verbose = FALSE,               # don't print progress
  n.cores = 1                    # single core: avoids problems from
)                                # wrongly detecting available cores
# Pick the iteration count with the best cross-validated error.
best.iter.gaussian <- gbm.perf(gbm.gaussian, method = "cv")
title('Training of gaussian model')
# Pairwise (LambdaMart) model optimizing normalized discounted
# cumulative gain within each query group.
cat('Fitting a model with pairwise loss function (ranking metric: normalized discounted cumulative gain)\n')
gbm.ndcg <- gbm(
  Y ~ X1 + X2 + X3,              # formula
  data = data.train,             # dataset
  distribution = list(           # pairwise loss with:
    name = 'pairwise',
    metric = "ndcg",             #   NDCG ranking metric
    group = 'query'              #   column identifying query groups
  ),
  n.trees = 2000,                # number of boosting iterations
  shrinkage = 0.005,             # learning rate
  interaction.depth = 3,         # number of splits per tree
  bag.fraction = 0.5,            # subsampling fraction per tree
  train.fraction = 1,            # use all rows for training
  n.minobsinnode = 10,           # minimum observations per node
  keep.data = TRUE,              # store a copy of the input data
  cv.folds = 5,                  # cross-validation folds
  verbose = FALSE,               # don't print progress
  n.cores = 1                    # use a single core
)
# Pick the iteration count with the best cross-validated error.
best.iter.ndcg <- gbm.perf(gbm.ndcg, method = 'cv')
title('Training of pairwise model with ndcg metric')
# Pairwise (LambdaMart) model optimizing the fraction of concordant
# pairs within each query group.
cat('Fit a model with pairwise loss function (ranking metric: fraction of concordant pairs)\n')
gbm.conc <- gbm(
  Y ~ X1 + X2 + X3,              # formula
  data = data.train,             # dataset
  distribution = list(           # pairwise loss with:
    name = 'pairwise',
    metric = "conc",             #   concordant-pairs ranking metric
    group = 'query'              #   column identifying query groups
  ),
  n.trees = 2000,                # number of boosting iterations
  shrinkage = 0.005,             # learning rate
  interaction.depth = 3,         # number of splits per tree
  bag.fraction = 0.5,            # subsampling fraction per tree
  train.fraction = 1,            # use all rows for training
  n.minobsinnode = 10,           # minimum observations per node
  keep.data = TRUE,              # store a copy of the input data
  cv.folds = 5,                  # cross-validation folds
  verbose = FALSE,               # don't print progress
  n.cores = 1                    # use a single core
)
# Pick the iteration count with the best cross-validated error.
best.iter.conc <- gbm.perf(gbm.conc, method = 'cv')
title('Training of pairwise model with conc metric')
# Draw the three models' variable-importance summaries side by side,
# then restore the previous plotting layout.
old.par <- par(mfrow = c(1, 3))
summary(gbm.gaussian, n.trees = best.iter.gaussian, main = 'gaussian')
summary(gbm.ndcg, n.trees = best.iter.ndcg, main = 'pairwise (ndcg)')
summary(gbm.conc, n.trees = best.iter.conc, main = 'pairwise (conc)')
par(old.par)
cat("Generating some new data\n")
data.test <- generate.data(N)
cat("Calculating predictions\n")
predictions <- data.frame(random=runif(N),
X2=data.test$X2,
gaussian=predict(gbm.gaussian, data.test, best.iter.gaussian),
pairwise.ndcg=predict(gbm.ndcg, data.test, best.iter.ndcg),
pairwise.conc=predict(gbm.conc, data.test, best.iter.conc))
cat("Computing loss metrics\n")
result.table <- data.frame(measure=c('random', 'X2 only', 'gaussian', 'pairwise (ndcg)', 'pairwise (conc)'),
squared.loss=sapply(1:length(predictions), FUN=function(i) {
gbm.loss(y=data.test$Y, predictions[[i]], w=rep(1,N), offset=NA, dist=list(name="gaussian"), baseline=0) }),
ndcg5.loss=sapply(1:length(predictions), FUN=function(i) {
gbm.loss(y=data.test$Y, predictions[[i]], w=rep(1,N), offset=NA, dist=list(name='pairwise', metric="ndcg"),
baseline=0, group=data.test$query, max.rank=5) }),
concordant.pairs.loss=sapply(1:length(predictions), FUN=function(i) {
gbm.loss(y=data.test$Y, predictions[[i]], w=rep(1,N), offset=NA, dist=list(name='pairwise', metric="conc"),
baseline=0, group=data.test$query, max.rank=0) }),
row.names=NULL)
cat('Performance measures for the different models on the test set (smaller is better):\n')
print(result.table,digits=2)
# Brief explanation: Variable X1 is not correlated with the order of items, only
# with queries. Variable X2 is the only one that is correlated with the order of
# items within queries. However, it has a high query-correlated variance.
# Therefore, the 'optimal' possible ranking is just by X2. Of course, the
# pairwise models don't know this and don't completely achieve the same
# accuracy, due to noise and data limitation.
#
# The Gaussian model uses mostly X1, due to the high variance of X2; on the
# contrary, the pairwise models rely mainly on X2. The loss table shows that
# both pairwise models are better in terms of the ranking metrics, but worse in
# terms of squared loss.
# --- Site boilerplate (not part of the executable example), kept as comments:
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.