#' @useDynLib rankdistext
#' @importFrom Rcpp sourceCpp
#' @import stats
NULL
#' Calculate Kendall distance matrix between rankings
#'
#' @param ranking a matrix of rankings
#' @return Kendall distance matrix between rankings. The value in ith row and jth column is the
#' Kendall distance between the ith and jth rankings.
#' @export
DistanceMatrix <- function(ranking){
n = ncol(ranking)
w = c(rep(0,n-2),1)
distcalc = function(x){
fai.coeff = matrix(CWeightGivenPi(ranking,x),nrow=nrow(ranking))
as.numeric(fai.coeff %*% w)
}
apply(ranking,1,distcalc)
}
#' Calculate Kendall distance
#' #' Calculate Kendall distance matrix between one ranking and a matrix of rankings
#' @param mat a matrix of rankings
#' @param r a single ranking
#' @return a vector of Kendall distance
#' @export
DistanceBlock <- function(mat,r){
n = ncol(mat)
w = c(rep(0,n-2),1)
fai.coeff = matrix(CWeightGivenPi(mat,r),nrow=nrow(mat))
as.numeric(fai.coeff %*% w)
}
#' Calculate Kendall distance between a pair of rankings
#' #' Calculate Kendall distance matrix between a pair of rankings
#' @param r1 a single ranking
#' @param r2 a single ranking
#' @return Kendall distance value
#' @export
DistancePair <- function(r1,r2){
n = length(r1)
w = c(rep(0,n-2),1)
fai.coeff = matrix(CWeightGivenPi(r1,r2),nrow=1)
as.numeric(fai.coeff %*% w)
}
#' Print a brief summary of the fitted model
#' Print a brief summary of the fitted model. This includes information about goodness
#' of fit as well as parameter estimation.
#' @param model a ranking model returned by a call to RankDistanceModel function.
#' @export
ModelSummary <- function(model){
clus = rev(order(model$p))
nobj = length(model$modal_ranking.est[[1]])
mod_name = deparse(substitute(model))
cat("Summary of",mod_name,"\n")
cat(rep("=",nchar(mod_name)+10),"\n",sep="")
cat("Goodness of Fit\n")
cat("SSR:\t",model$SSR,"\n")
cat("BIC:\t",model$BIC,"\n")
cat("dof:\t",model$free_params,"\n")
cat("Parameter Estimation\n")
cat("Cluster",LETTERS[1:nobj],"p","Parameters\n",sep="\t")
for (i in 1:length(clus)){
cat(i,model$modal_ranking.est[[clus[i]]],round(model$p[clus[i]],digits=2),round(model$w.est[[clus[i]]],digits=2),"\n",sep="\t")
}
}
#' Generate simple examples
#'
#' This function generates simple examples for illustrative proposes.
#' The sample contains the rankings of five objects and the underlying model is a Mallow's phi
#' model with dispersion parameter set to 0.2
#' and modal ranking set to (1,2,3,4,5)
#' @param ranking TRUE if "ranking" representation is used in the output data; otherwise "ordering" representation is used.
#' @param central The modal ranking.
#' @param lambda The parameter in the model.
#' @export
GenerateExample <- function(ranking=TRUE, central = 1:5, lambda = 0.2){
rankings <- rbind(1:5,permute::allPerms(5))
Kdist <- DistanceBlock(rankings,central)
prob <- exp(-lambda*Kdist)
prob <- prob/sum(prob)
indx <- sample(1:120, 2000,replace=TRUE,prob=prob)
count <- as.numeric(table(indx))
if (ranking){
return(list(ranking=rankings,count=count))
} else {
return(list(ordering=OrderingToRanking(rankings),count=count))
}
}
#' Generate simple examples of top-q rankings
#'
#' This function generates simple examples for illustrative proposes.
#' The sample contains the top-3 rankings of five objects and the underlying model is a weighted Kendall distance model
#' with default weights set to (0.7,0.5,0.3,0)
#' and modal ranking set to (1,2,3,4,4)
#' @param central The modal ranking.
#' @param w The weights in the model.
#' @export
GenerateExampleTopQ <- function(central=c(1,2,3,4,4),w=c(0.7,0.5,0.3,0)){
prankings <- rbind(1:5,permute::allPerms(5))
prankings[prankings>3] <- 4
prankings <- prankings[!duplicated(prankings),]
fai <- wToparam(w)
distmat <- matrix(CWeightGivenPi(prankings,central),ncol=nrow(prankings),byrow=TRUE)
distvec <- as.numeric(fai%*%distmat)
probs <- exp(-distvec)/sum(exp(-distvec))
samples <- sample(1:nrow(prankings),10000,replace=TRUE,prob=probs)
count <- table(samples)
count <- as.numeric(count)
return(list(ranking=prankings,count=count))
}
#' Create Hash Value for Rank
#'
#' Sometimes it is handier to deal with rankings into a hash value. \code{RanktoHash} returns hash values for ranks. Maximum 52 objects are supported.
#' @param r a vector or matrix of rankings. Each row of the matrix represents a ranking.
#' The ranking should be a integer from one to number of objects. No NA is allowed
#' @return a vector of character strings representing the hash values.
#' @seealso \code{\link{HashtoRank}} for a reverse operation.
#' @export
RanktoHash <- function(r){
char <- c(letters,LETTERS)
if (!is.matrix(r)){
a <- paste(char[r],collapse="")
} else {
a <- apply(r,1,function(x)paste(char[x],collapse=""))
}
as.vector(a,mode="character")
}
#' Obtain Ranking from Hash Value
#'
#' \code{HashToRank} returns rankings from given hash values. Maximum 52 objects are supported.
#' @param h A vector of hash values.
#' @return a matrix of rankings if input has more than one element or a vector of rankings if input has only one element
#' @seealso \code{\link{RanktoHash}} for a reverse operation.
#' @export
HashtoRank <- function(h){
char <- c(letters,LETTERS)
transvec <- seq_along(char)
names(transvec) <- char
nobj <- nchar(h[1])
numrank <- transvec[unlist(strsplit(h,split=""))]
if (length(h)==1){
as.vector(numrank,mode="numeric")
} else {
matrix(data=numrank,ncol=nobj,byrow=TRUE)
}
}
#' Transformation between Rankings and Orderings
#'
#' \code{OrderingToRanking} transforms between ranking representation and ordering representation.
#'
#' Ranking representation encodes the position of objects. Ordering representation is an ordered sequence of objects.
#' For example ranking (2 3 1 4) is equivalent to ordering (3 1 2 4), which means object 3 is first, object 1 is second, followed by object 2 and 4.
#' Also note that we can use this function to transform rankings into orderings, and applying this function twice will not change the input value.
#' @param ordering a matrix of orderings or rankings. Each row contains an observation.
#' @return a matrix of transformed rankings or orderings. Each row contains an observation.
#' @export
OrderingToRanking <- function(ordering){
if (is.matrix(ordering)){
ranking <- t(apply(ordering,1,order))
} else {
ranking <- order(ordering)
}
ranking
}
#' Fit A Mixture of Distance-based Models
#'
#' \code{RankDistModel} fits a mixture of ranking models based on weighted Kendall distance.
#'
#' The procedure will estimate central rankings, the probability of each cluster and weights.
#'
#' @param dat A \linkS4class{RankData} object.
#' @param init A \linkS4class{RankInit} object.
#' @param ctrl A \linkS4class{RankControl} object.
#'
#' @return A list containing the following components:
#' \describe{
#' \item{\code{modal_ranking.est}}{the estimated pi0 for each cluster.}
#' \item{\code{p}}{the probability of each cluster.}
#' \item{\code{w.est}}{the estimated weights of each cluster.}
#' \item{\code{alpha}}{the estimated alpha for each cluster.}
#' \item{\code{param.est}}{the param parametrisation of weights of each cluster.}
#' \item{\code{SSR}}{the sum of squares of Pearson residuals}
#' \item{\code{log_likelihood}}{the fitted log_likelihood}
#' \item{\code{BIC}}{the fitted Bayesian Information Criterion value}
#' \item{\code{free_params}}{the number of free parameters in the model}
#' \item{\code{expectation}}{the expected value of each observation given by the model}
#' \item{\code{iteration}}{the number of EM iteration}
#' \item{\code{model.call}}{the function call}
#' }
#' @export
RankDistanceModel <- function(dat,init,ctrl){
flag_transform = (min(dat@topq) != dat@nobj - 1)
if (class(ctrl) == "RankControlWeightedKendall"){
flag_transform = (ctrl@assumption == "equal-probability")
}
if (flag_transform){
dat_org = dat
dat = BreakTieEqualProb(dat)
}
# get parameters
tt <- dat@nobj # number of objects
n <- dat@nobs # total observations
distinctn <- dat@ndistinct # total distinct observations
clu <- init@clu
count <- dat@count
p_clu <- matrix(ncol = distinctn, nrow = clu)
# setting up return values
modal_ranking.est <- list()
modal_ranking.est.last <- list()
#alpha is added to incorporate randomness.
alpha <- numeric(clu)
p <- rep(1/clu,clu)
p.last <- numeric(clu)
param <- list()
param.last <- list()
func.call <- match.call()
loopind <- 0
if (clu > 1L){ # use EM to fit multi-cluster model
# further initialization
modal_ranking.est <- init@modal_ranking.init
param <- init@param.init
alpha <- init@alpha.init
init.clu = list()
for (i in 1:clu){
init.clu[[i]] <- new("RankInit", param.init=list(param[[i]]), alpha.init = alpha[[i]],
modal_ranking.init=list(modal_ranking.est[[i]]), clu=1L)
}
while(TRUE){
loopind <- loopind + 1
# E step
z <- matrix(nrow = distinctn, ncol = clu)
# z is a matrix with rows corresponding to distinct rankings and columns corresponding
# to clusters.
for (i in 1:clu){
z[,i] <- p[i]*FindProb(dat, ctrl, modal_ranking.est[[i]], c(param[[i]], alpha[[i]]))
}
sums <- rowSums(z)
z <- z/sums #the dimensions of z remain unchanged.
# M step
p <- t(z) %*% count
p <- p/sum(p)
if (any(p<0.03)){
warning("One cluster has probability smaller than 3%; Try fewer clusters")
return (list(p=p, modal_ranking.est=modal_ranking.est, param=param,
alpha = alpha, iteration=loopind))
}
for ( i in 1:clu){
dat.clu <- UpdateCount(dat, z[,i] * count)
if (loopind > 1){
init.clu[[i]]@param.init <- list(param[[i]])
init.clu[[i]]@modal_ranking.init <- list(modal_ranking.est[[i]])
}
#Might need to change EMSolver() function in file utils.R, as it does not work
#for models other than WeightedKendall. Will do it when the majority
#of work is finished, as it is not directly related to this FYP.
solve.clu <- SearchPi0Ext(dat.clu, init.clu[[i]], ctrl)
modal_ranking.est[[i]] <- solve.clu$pi0.ranking
param[[i]] <- solve.clu$param.est
alpha[[i]] <- solve.clu$alpha.est
p_clu[i,] <- FindProb(dat,ctrl,modal_ranking.est[[i]],c(param[[i]],
alpha[[i]]))*p[i]
}
#Observed data log likelihood.
log_likelihood_clu <- sum(log(colSums(p_clu))%*%dat@count)
# break?
if(loopind != 1){
if (loopind < ctrl@EM_limit) {
cond1 <- all( abs(p.last - p) < ctrl@EM_epsilon)
cond2 <- all( abs(unlist(modal_ranking.est.last) -
unlist(modal_ranking.est)) < ctrl@EM_epsilon)
cond3 <- all( abs(unlist(param.last) - unlist(param))
< ctrl@EM_epsilon)
if (cond1 && cond2 && cond3){
break
}
#TODO: need to change the stopping criterion. Mar 29 21:17
#Experiment with the old stopping rule for the time being. Mar 29 21:51
else if( log_likelihood_clu - log_likelihood_clu.last <
abs(log_likelihood_clu)*1e-5 ){
print("Log likelihood failed to increase.")
break
}
}
else {
print(paste("Algorithm did not converge in",ctrl@EM_limit,"iterations"))
return(NULL)
}
}
p.last <- p
modal_ranking.est.last <- modal_ranking.est
param.last <- param
log_likelihood_clu.last <- log_likelihood_clu
} # inf loop
est_prob <- colSums(p_clu)
}
else {
# fit single cluster model
single_cluster_mod <- SearchPi0Ext(dat,init,ctrl)
modal_ranking.est <- list(single_cluster_mod$pi0.ranking)
p <- 1
alpha <- single_cluster_mod$alpha.est
param <- list(single_cluster_mod$param.est)
log_likelihood_clu.last <- single_cluster_mod$log_likelihood
est_prob <- FindProb(dat,ctrl,modal_ranking.est[[1]],c(param[[1]], alpha))*p[1]
}
# finishing up with parameter estimation and summary statistics
p <- as.numeric(p)
v <- length(p) - 1 + sum(unlist(param)!=0) + length(alpha)
if (flag_transform){
full_prob = rep(0, length(dat_org@count))
# aggregate probability
# iterate through different q
for (i in 1:length(dat_org@topq)){
ind_start <- dat_org@q_ind[i]
ind_end <- dat_org@q_ind[i+1] - 1
this_q <- dat_org@topq[i]
ranking_q <- dat@ranking
ranking_q[ranking_q > this_q] <- this_q + 1
hash_q = RanktoHash(ranking_q)
# iterate through partial rankings
for (ind_partial in ind_start:ind_end){
this_partial <- dat_org@ranking[ind_partial, ]
this_hash <- RanktoHash(this_partial)
ind_agg<- which(hash_q == this_hash)
prob_agg<- sum(est_prob[ind_agg])
full_prob[ind_partial] <- prob_agg
}
# conditional
p_q = dat_org@subobs[i] / dat_org@nobs
full_prob[ind_start:ind_end] = full_prob[ind_start:ind_end]*p_q
}
log_like<- dat_org@count %*% log(full_prob)
bic <- -2*log_like + v*log(n)
expectation <- as.numeric(full_prob*n)
SSR <- sum((expectation-dat_org@count)^2/expectation)
}
else {
log_like<- dat@count %*% as.numeric(log(est_prob))
#I think the following line of code is incorrect. Leave for further checking.
#log_likelihood_clu.last <- log_like
bic <- -2*log_like + v*log(n)
expectation <- as.numeric(est_prob * n)
SSR <- sum((expectation-dat@count)^2/expectation)
}
#I think that in the following code, log_likelihood_clu.last is in doubt.
ret <- list(p=p, modal_ranking.est=modal_ranking.est, w.est=lapply(param,paramTow),
alpha = alpha, param.est=param,SSR=SSR,log_likelihood=log_like,
free_params=v, BIC=bic,expectation=expectation,iteration=loopind,
model.call=func.call)
#bic <- 2*log_likelihood_clu.last - v*log(n)
#ret <- list(p=p, modal_ranking.est=modal_ranking.est,
# w.est=lapply(param,paramTow), alpha = alpha,
# param.est=param, SSR=SSR,
# log_likelihood=log_likelihood_clu.last,
# free_params=v, BIC=bic, expectation=expectation,
# iteration=loopind,
# model.call=func.call)
ret
}
#' Find Initial Values of param
#'
#' \code{MomentEst} finds the initial values of param which can be used in the subsequent optimization problems.
#' Linear model is fitted to the log odds of rankings.
#'
#' @param dat a RankData object
#' @param size the number of samples to take in the linear model
#' @param pi0 an optional argument showing the location of central ranking.
#' If not provided, Borda Count method is used to estimate the central ranking.
#' @return A list containing following components:
#' \describe{
#' \item{\code{param.est}}{the estimated parameter values.}
#' \item{\code{modal_ranking}}{the supplied modal ranking or the modal ranking estimated by Borda count method.}
#' }
#' @examples MomentsEst(apa_obj,40)
#' @export
MomentsEst <- function(dat,size,pi0=NULL){
# estimating pi0
avg_rank <- dat@count %*% dat@ranking
if (is.null(pi0)){
modal_ranking <- sort(OrderingToRanking(dat@ranking[1,]))[order(avg_rank)]
} else {
modal_ranking <- pi0
}
nparam <- dat@nobj - 1
# construct equation set
prob_obs <- dat@count/dat@nobs
# row-wise
distance_mat <- matrix(CWeightGivenPi(dat@ranking,modal_ranking),nrow = dat@ndistinct)
pair <- sample(1:nrow(distance_mat), 2*size, replace=TRUE)
pair_mat <- matrix(pair,nrow=2)
logodd <- numeric(size)
design_mat <- matrix(ncol=nparam, nrow=size)
for (i in 1:size){
logodd[i] <- log(prob_obs[pair_mat[1,i]]/prob_obs[pair_mat[2,i]])
design_mat[i,] <- distance_mat[pair_mat[2,i],]-distance_mat[pair_mat[1,i],]
}
param.est <- stats::lm(logodd~design_mat-1)
param.est <- param.est$coefficients
param.est[param.est<0] <- 0
names(param.est) <- NULL
list(param.est = param.est, modal_ranking = modal_ranking)
}
if (FALSE){
param <- numeric()
paramMat <- NULL
i <- 1
while (i <= 100){
param <- MomentsEst(dat2, 10000)
if (!any(is.na(param))){
paramMat <- rbind(paramMat, param)
i <- i + 1
}
}
mean <- colMeans(paramMat)
ratio <- matrix(nrow = 100, ncol= 4)
for (i in 1:100){
ratio[i,] <- paramMat[i,]/mean
}
}
#' American Psychological Association (APA) election data
#'
#' A dataset containing 5738 complete votes in APA election. There are 5 candidates in total.
#'
#' @format a RankData object
#'
#' @references Marden, J. I. (1995). Analyzing and Modeling Rank Data (94-96). Chapman Hall, New York.
"apa_obj"
#' American Psychological Association (APA) election data (partial rankings included)
#'
#' A dataset containing 5738 complete votes and 9711 partial votes in APA election. There are 5 candidates in total.
#'
#' @format a RankData object
#'
#' @references Marden, J. I. (1995). Analyzing and Modeling Rank Data (94-96). Chapman Hall, New York.
"apa_partial_obj"
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.