# R/rankdistext.R In simonfqy/rankdistext:

#### Documented in: DistanceBlock, DistanceMatrix, DistancePair, GenerateExample,
#### GenerateExampleTopQ, HashtoRank, ModelSummary, MomentsEst,
#### OrderingToRanking, RankDistanceModel, RanktoHash

#' @useDynLib rankdistext
#' @importFrom Rcpp sourceCpp
#' @import stats
NULL

#' Calculate Kendall distance matrix between rankings
#'
#' @param ranking a matrix of rankings, one ranking per row.
#' @return Kendall distance matrix between rankings. The value in the ith row
#'   and jth column is the Kendall distance between the ith and jth rankings.
#' @export
DistanceMatrix <- function(ranking){
  nobj <- ncol(ranking)
  # Weight vector that extracts the plain Kendall distance component.
  kendall_w <- c(rep(0, nobj - 2), 1)
  dist_to <- function(r){
    coeff <- matrix(CWeightGivenPi(ranking, r), nrow = nrow(ranking))
    as.numeric(coeff %*% kendall_w)
  }
  apply(ranking, 1, dist_to)
}

#' Calculate Kendall distance between one ranking and a matrix of rankings
#'
#' @param mat a matrix of rankings, one ranking per row.
#' @param r a single ranking.
#' @return a vector of Kendall distances between \code{r} and each row of \code{mat}.
#' @export
DistanceBlock <- function(mat, r){
  nobj <- ncol(mat)
  # Weight vector that extracts the plain Kendall distance component.
  kendall_w <- c(rep(0, nobj - 2), 1)
  coeff <- matrix(CWeightGivenPi(mat, r), nrow = nrow(mat))
  as.numeric(coeff %*% kendall_w)
}

#' Calculate Kendall distance between a pair of rankings
#'
#' @param r1 a single ranking.
#' @param r2 a single ranking.
#' @return the Kendall distance value between \code{r1} and \code{r2}.
#' @export
DistancePair <- function(r1, r2){
  nobj <- length(r1)
  # Weight vector that extracts the plain Kendall distance component.
  kendall_w <- c(rep(0, nobj - 2), 1)
  coeff <- matrix(CWeightGivenPi(r1, r2), nrow = 1)
  as.numeric(coeff %*% kendall_w)
}

#' Print a brief summary of the fitted model
#'
#' Prints information about goodness of fit as well as parameter estimation
#' for a model fitted by \code{RankDistanceModel}.
#' @param model a ranking model returned by a call to the RankDistanceModel function.
#' @return invisibly returns \code{NULL}; called for its printed output.
#' @export
ModelSummary <- function(model){
  # Report clusters in decreasing order of estimated cluster probability.
  clus <- rev(order(model$p))
  nobj <- length(model$modal_ranking.est[[1]])
  mod_name <- deparse(substitute(model))
  cat("Summary of", mod_name, "\n")
  cat(rep("=", nchar(mod_name) + 10), "\n", sep = "")
  cat("Goodness of Fit\n")
  cat("SSR:\t", model$SSR, "\n")
  cat("BIC:\t", model$BIC, "\n")
  cat("dof:\t", model$free_params, "\n")
  cat("Parameter Estimation\n")
  # One column per object position (labelled A, B, ...) plus p and weights.
  cat("Cluster", LETTERS[1:nobj], "p", "Parameters\n", sep = "\t")
  for (i in seq_along(clus)){
    cat(i, model$modal_ranking.est[[clus[i]]],
        round(model$p[clus[i]], digits = 2),
        round(model$w.est[[clus[i]]], digits = 2), "\n", sep = "\t")
  }
  invisible(NULL)
}

#' Generate simple examples
#'
#' This function generates simple examples for illustrative purposes.
#' The sample contains rankings of five objects drawn from a Mallows' phi
#' model with dispersion parameter \code{lambda} (default 0.2) and modal
#' ranking \code{central} (default (1,2,3,4,5)).
#' @param ranking TRUE if "ranking" representation is used in the output data; otherwise "ordering" representation is used.
#' @param central The modal ranking.
#' @param lambda The dispersion parameter of the model.
#' @return a list holding the 120 rankings (or orderings) and the sampled
#'   count of each one; zero counts are retained so the vectors stay aligned.
#' @export
GenerateExample <- function(ranking=TRUE, central = 1:5, lambda = 0.2){
  rankings <- rbind(1:5, permute::allPerms(5))
  Kdist <- DistanceBlock(rankings, central)
  prob <- exp(-lambda * Kdist)
  prob <- prob / sum(prob)
  indx <- sample(seq_len(nrow(rankings)), 2000, replace = TRUE, prob = prob)
  # BUGFIX: tabulate() keeps zero counts so count[i] always matches
  # rankings[i, ]. The previous as.numeric(table(indx)) silently dropped
  # rankings that were never sampled, misaligning counts and rankings.
  count <- as.numeric(tabulate(indx, nbins = nrow(rankings)))
  if (ranking){
    return(list(ranking = rankings, count = count))
  } else {
    return(list(ordering = OrderingToRanking(rankings), count = count))
  }
}

#' Generate simple examples of top-q rankings
#'
#' This function generates simple examples for illustrative purposes.
#' The sample contains the top-3 rankings of five objects and the underlying model is a weighted Kendall distance model
#' with default weights set to (0.7,0.5,0.3,0)
#' and modal ranking set to (1,2,3,4,4)
#' @param central The modal ranking.
#' @param w The weights in the model.
#' @return a list holding the distinct top-3 rankings and the sampled count
#'   of each one; zero counts are retained so the vectors stay aligned.
#' @export
GenerateExampleTopQ <- function(central=c(1,2,3,4,4), w=c(0.7,0.5,0.3,0)){
  prankings <- rbind(1:5, permute::allPerms(5))
  # Truncate to top-3: every object ranked below 3 is tied at position 4.
  prankings[prankings > 3] <- 4
  prankings <- prankings[!duplicated(prankings), ]
  fai <- wToparam(w)
  distmat <- matrix(CWeightGivenPi(prankings, central),
                    ncol = nrow(prankings), byrow = TRUE)
  distvec <- as.numeric(fai %*% distmat)
  probs <- exp(-distvec) / sum(exp(-distvec))
  samples <- sample(seq_len(nrow(prankings)), 10000, replace = TRUE, prob = probs)
  # BUGFIX: tabulate() keeps zero counts so count[i] always matches
  # prankings[i, ]. The previous table(samples) silently dropped rankings
  # that were never sampled, misaligning counts and rankings.
  count <- as.numeric(tabulate(samples, nbins = nrow(prankings)))
  return(list(ranking = prankings, count = count))
}

#' Create Hash Value for Rank
#'
#' Sometimes it is handier to deal with rankings as a hash value.
#' \code{RanktoHash} returns hash values for ranks. A maximum of 52 objects
#' is supported (ranks 1-26 map to a-z, 27-52 map to A-Z).
#' @param r  a vector or matrix of rankings. Each row of the matrix represents a ranking.
#'   The ranking should be a integer from one to number of objects. No NA is allowed
#' @return a vector of character strings representing the hash values.
#' @seealso \code{\link{HashtoRank}} for a reverse operation.
#' @export
RanktoHash <- function(r){
  alphabet <- c(letters, LETTERS)
  encode_one <- function(rank_vec) paste(alphabet[rank_vec], collapse = "")
  if (is.matrix(r)){
    hashes <- apply(r, 1, encode_one)
  } else {
    hashes <- encode_one(r)
  }
  as.vector(hashes, mode = "character")
}

#' Obtain Ranking from Hash Value
#'
#' \code{HashtoRank} returns rankings from given hash values. A maximum of 52
#' objects is supported (a-z decode to 1-26, A-Z decode to 27-52).
#' @param h  A vector of hash values.
#' @return a matrix of rankings if input has more than one element or a vector of rankings if input has only one element
#' @seealso \code{\link{RanktoHash}} for a reverse operation.
#' @export
HashtoRank <- function(h){
  alphabet <- c(letters, LETTERS)
  # All hashes are assumed to encode the same number of objects.
  chars <- unlist(strsplit(h, split = ""))
  ranks <- match(chars, alphabet)
  if (length(h) == 1){
    as.vector(ranks, mode = "numeric")
  } else {
    matrix(data = ranks, ncol = nchar(h[1]), byrow = TRUE)
  }
}

#' Transformation between Rankings and Orderings
#'
#' \code{OrderingToRanking} transforms between ranking representation and ordering representation.
#'
#'    Ranking representation encodes the position of objects. Ordering representation is an ordered sequence of objects.
#'    For example ranking (2 3 1 4) is equivalent to ordering (3 1 2 4), which means object 3 is first, object 1 is second, followed by object 2 and 4.
#'    Also note that we can use this function to transform rankings into orderings, and applying this function twice will not change the input value.
#' @param ordering  a matrix of orderings or rankings. Each row contains an observation.
#' @return  a matrix of transformed rankings or orderings. Each row contains an observation.
#' @export
OrderingToRanking <- function(ordering){
  # order() is its own inverse on permutations, so the same call converts
  # in either direction (row-wise for matrix input).
  if (is.matrix(ordering)){
    t(apply(ordering, 1, order))
  } else {
    order(ordering)
  }
}

#' Fit A Mixture of Distance-based Models
#'
#' \code{RankDistanceModel} fits a mixture of ranking models based on weighted Kendall distance.
#'
#' The procedure will estimate central rankings, the probability of each
#' cluster and weights. Multi-cluster models are fitted with an EM algorithm;
#' a single-cluster model is fitted directly.
#'
#' @param dat A \linkS4class{RankData} object.
#' @param init A \linkS4class{RankInit} object.
#' @param ctrl A \linkS4class{RankControl} object.
#'
#' @return A list containing the following components:
#' \describe{
#' \item{\code{modal_ranking.est}}{the estimated pi0 for each cluster.}
#' \item{\code{p}}{the probability of each cluster.}
#' \item{\code{w.est}}{the estimated weights of each cluster.}
#' \item{\code{alpha}}{the estimated alpha for each cluster.}
#' \item{\code{param.est}}{the param parametrisation of weights of each cluster.}
#' \item{\code{SSR}}{the sum of squares of Pearson residuals}
#' \item{\code{log_likelihood}}{the fitted log_likelihood}
#' \item{\code{BIC}}{the fitted Bayesian Information Criterion value}
#' \item{\code{free_params}}{the number of free parameters in the model}
#' \item{\code{expectation}}{the expected value of each observation given by the model}
#' \item{\code{iteration}}{the number of EM iteration}
#' \item{\code{model.call}}{the function call}
#' }
#' @export
RankDistanceModel <- function(dat, init, ctrl){
  # Partial (top-q) rankings are handled by breaking ties with equal
  # probability, then aggregating the fitted probabilities back at the end.
  flag_transform <- (min(dat@topq) != dat@nobj - 1)
  if (class(ctrl) == "RankControlWeightedKendall"){
    flag_transform <- (ctrl@assumption == "equal-probability")
  }
  if (flag_transform){
    dat_org <- dat
    dat <- BreakTieEqualProb(dat)
  }
  # get parameters
  tt <- dat@nobj             # number of objects
  n <- dat@nobs              # total observations
  distinctn <- dat@ndistinct # total distinct observations
  clu <- init@clu
  count <- dat@count
  p_clu <- matrix(ncol = distinctn, nrow = clu)
  # set up return values
  modal_ranking.est <- list()
  modal_ranking.est.last <- list()
  # alpha is added to incorporate randomness
  alpha <- numeric(clu)
  p <- rep(1/clu, clu)
  p.last <- numeric(clu)
  param <- list()
  param.last <- list()
  func.call <- match.call()
  loopind <- 0

  if (clu > 1L){  # use EM to fit the multi-cluster model
    # further initialisation
    modal_ranking.est <- init@modal_ranking.init
    param <- init@param.init
    alpha <- init@alpha.init
    init.clu <- list()
    for (i in 1:clu){
      init.clu[[i]] <- new("RankInit", param.init = list(param[[i]]),
                           alpha.init = alpha[[i]],
                           modal_ranking.init = list(modal_ranking.est[[i]]),
                           clu = 1L)
    }
    while (TRUE){
      loopind <- loopind + 1
      # E step: z has one row per distinct ranking, one column per cluster.
      z <- matrix(nrow = distinctn, ncol = clu)
      for (i in 1:clu){
        z[, i] <- p[i] * FindProb(dat, ctrl, modal_ranking.est[[i]],
                                  c(param[[i]], alpha[[i]]))
      }
      sums <- rowSums(z)
      z <- z / sums  # normalise responsibilities; dimensions unchanged

      # M step
      p <- t(z) %*% count
      p <- p / sum(p)
      if (any(p < 0.03)){
        warning("One cluster has probability smaller than 3%; Try fewer clusters")
        return(list(p = p, modal_ranking.est = modal_ranking.est, param = param,
                    alpha = alpha, iteration = loopind))
      }
      for (i in 1:clu){
        dat.clu <- UpdateCount(dat, z[, i] * count)
        if (loopind > 1){
          init.clu[[i]]@param.init <- list(param[[i]])
          init.clu[[i]]@modal_ranking.init <- list(modal_ranking.est[[i]])
        }
        # NOTE(review): EMSolver() in utils.R may need changes for models other
        # than WeightedKendall (original author's TODO, kept for reference).
        solve.clu <- SearchPi0Ext(dat.clu, init.clu[[i]], ctrl)
        modal_ranking.est[[i]] <- solve.clu$pi0.ranking
        param[[i]] <- solve.clu$param.est
        alpha[[i]] <- solve.clu$alpha.est
        p_clu[i, ] <- FindProb(dat, ctrl, modal_ranking.est[[i]],
                               c(param[[i]], alpha[[i]])) * p[i]
      }
      # Observed-data log likelihood.
      log_likelihood_clu <- sum(log(colSums(p_clu)) %*% dat@count)
      # Stopping rules (skipped on the first iteration, when no previous
      # estimates exist yet).
      if (loopind != 1){
        if (loopind < ctrl@EM_limit) {
          cond1 <- all(abs(p.last - p) < ctrl@EM_epsilon)
          cond2 <- all(abs(unlist(modal_ranking.est.last) -
                           unlist(modal_ranking.est)) < ctrl@EM_epsilon)
          cond3 <- all(abs(unlist(param.last) - unlist(param)) < ctrl@EM_epsilon)
          if (cond1 && cond2 && cond3){
            break
          } else if (log_likelihood_clu - log_likelihood_clu.last <
                     abs(log_likelihood_clu) * 1e-5){
            # TODO: revisit this stopping criterion (original author's note).
            print("Log likelihood failed to increase.")
            break
          }
        } else {
          print(paste("Algorithm did not converge in", ctrl@EM_limit, "iterations"))
          return(NULL)
        }
      }
      p.last <- p
      modal_ranking.est.last <- modal_ranking.est
      param.last <- param
      log_likelihood_clu.last <- log_likelihood_clu
    }  # end EM loop
    est_prob <- colSums(p_clu)
  } else {  # fit single cluster model directly
    single_cluster_mod <- SearchPi0Ext(dat, init, ctrl)
    modal_ranking.est <- list(single_cluster_mod$pi0.ranking)
    p <- 1
    alpha <- single_cluster_mod$alpha.est
    param <- list(single_cluster_mod$param.est)
    log_likelihood_clu.last <- single_cluster_mod$log_likelihood
    est_prob <- FindProb(dat, ctrl, modal_ranking.est[[1]],
                         c(param[[1]], alpha)) * p[1]
  }
  # finishing up with parameter estimation and summary statistics
  p <- as.numeric(p)
  # Free parameters: cluster probabilities + non-zero weights + alphas.
  v <- length(p) - 1 + sum(unlist(param) != 0) + length(alpha)
  if (flag_transform){
    full_prob <- rep(0, length(dat_org@count))
    # Aggregate fitted probabilities back onto the original partial rankings,
    # iterating through the different values of q.
    for (i in 1:length(dat_org@topq)){
      ind_start <- dat_org@q_ind[i]
      ind_end <- dat_org@q_ind[i + 1] - 1
      this_q <- dat_org@topq[i]
      ranking_q <- dat@ranking
      ranking_q[ranking_q > this_q] <- this_q + 1
      hash_q <- RanktoHash(ranking_q)
      # iterate through partial rankings
      for (ind_partial in ind_start:ind_end){
        this_partial <- dat_org@ranking[ind_partial, ]
        this_hash <- RanktoHash(this_partial)
        ind_agg <- which(hash_q == this_hash)
        full_prob[ind_partial] <- sum(est_prob[ind_agg])
      }
      # Condition on the probability of observing a top-q ranking.
      p_q <- dat_org@subobs[i] / dat_org@nobs
      full_prob[ind_start:ind_end] <- full_prob[ind_start:ind_end] * p_q
    }
    log_like <- dat_org@count %*% log(full_prob)
    bic <- -2 * log_like + v * log(n)
    expectation <- as.numeric(full_prob * n)
    SSR <- sum((expectation - dat_org@count)^2 / expectation)
  } else {
    log_like <- dat@count %*% as.numeric(log(est_prob))
    bic <- -2 * log_like + v * log(n)
    expectation <- as.numeric(est_prob * n)
    SSR <- sum((expectation - dat@count)^2 / expectation)
  }
  # NOTE(review): the original author doubted whether log_likelihood_clu.last
  # should be reported instead of log_like; log_like is used here, matching
  # the behaviour of the last working version.
  ret <- list(p = p, modal_ranking.est = modal_ranking.est,
              w.est = lapply(param, paramTow), alpha = alpha,
              param.est = param, SSR = SSR, log_likelihood = log_like,
              free_params = v, BIC = bic, expectation = expectation,
              iteration = loopind, model.call = func.call)
  ret
}

#' Find Initial Values of param
#'
#' \code{MomentsEst} finds the initial values of param which can be used in the subsequent optimization problems.
#' Linear model is fitted to the log odds of rankings.
#'
#' @param dat a RankData object
#' @param size the number of samples to take in the linear model
#' @param pi0 an optional argument showing the location of central ranking.
#' If not provided, Borda Count method is used to estimate the central ranking.
#' @return A list containing following components:
#' \describe{
#' \item{\code{param.est}}{the estimated parameter values.}
#' \item{\code{modal_ranking}}{the supplied modal ranking or the modal ranking estimated by Borda count method.}
#' }
#' @examples MomentsEst(apa_obj,40)
#' @export
MomentsEst <- function(dat, size, pi0 = NULL){
  # Estimate pi0 by Borda count (average rank) unless one was supplied.
  avg_rank <- dat@count %*% dat@ranking
  if (is.null(pi0)){
    modal_ranking <- sort(OrderingToRanking(dat@ranking[1, ]))[order(avg_rank)]
  } else {
    modal_ranking <- pi0
  }
  nparam <- dat@nobj - 1
  # construct equation set
  prob_obs <- dat@count / dat@nobs
  # Row-wise distances of each distinct ranking to the modal ranking.
  distance_mat <- matrix(CWeightGivenPi(dat@ranking, modal_ranking),
                         nrow = dat@ndistinct)
  # Sample `size` random pairs of rankings and regress the log odds of their
  # observed probabilities on their distance differences (no intercept).
  pair <- sample(seq_len(nrow(distance_mat)), 2 * size, replace = TRUE)
  pair_mat <- matrix(pair, nrow = 2)
  logodd <- numeric(size)
  design_mat <- matrix(ncol = nparam, nrow = size)
  for (i in seq_len(size)){
    logodd[i] <- log(prob_obs[pair_mat[1, i]] / prob_obs[pair_mat[2, i]])
    design_mat[i, ] <- distance_mat[pair_mat[2, i], ] - distance_mat[pair_mat[1, i], ]
  }
  param.est <- stats::lm(logodd ~ design_mat - 1)
  param.est <- param.est$coefficients
  # Negative estimates are truncated to zero (weights must be non-negative).
  param.est[param.est < 0] <- 0
  names(param.est) <- NULL
  list(param.est = param.est, modal_ranking = modal_ranking)
}

# Developer scratch code, deliberately disabled with `if (FALSE)`.
# It repeatedly runs MomentsEst on an object `dat2` (not defined in this
# file) and inspects the stability of the estimates across 100 replications.
# NOTE(review): `param` here is the full list returned by MomentsEst, so the
# is.na/rbind usage looks dubious — presumably param$param.est was intended;
# confirm before ever re-enabling this block.
if (FALSE){
param <- numeric()
paramMat <- NULL
i <- 1
while (i <= 100){
param <- MomentsEst(dat2, 10000)
if (!any(is.na(param))){
paramMat <- rbind(paramMat, param)
i <- i + 1
}
}
# Ratio of each replication's estimates to the column means.
mean <- colMeans(paramMat)
ratio <- matrix(nrow = 100, ncol= 4)
for (i in 1:100){
ratio[i,] <- paramMat[i,]/mean
}
}

#' American Psychological Association (APA) election data
#'
#' A dataset containing 5738 complete votes in APA election. There are 5 candidates in total.
#'
#' @format a RankData object
#'
#' @references Marden, J. I. (1995). Analyzing and Modeling Rank Data (94-96). Chapman Hall, New York.
"apa_obj"

#' American Psychological Association (APA) election data (partial rankings included)
#'
#' A dataset containing 5738 complete votes and 9711 partial votes in APA election. There are 5 candidates in total.
#'
#' @format a RankData object
#'
#' @references Marden, J. I. (1995). Analyzing and Modeling Rank Data (94-96). Chapman Hall, New York.
"apa_partial_obj"

# simonfqy/rankdistext documentation built on May 26, 2017, 7:04 a.m.