R/rankdistext.R

#' @useDynLib rankdistext
#' @importFrom Rcpp sourceCpp
#' @import stats
NULL


#' Calculate Kendall distance matrix between rankings
#'
#' @param ranking a matrix of rankings
#' @return  Kendall distance matrix between rankings. The value in ith row and jth column is the
#' Kendall distance between the ith and jth rankings.
#' @export
DistanceMatrix <- function(ranking){
  n = ncol(ranking)
  w = c(rep(0,n-2),1)
  distcalc = function(x){
    fai.coeff = matrix(CWeightGivenPi(ranking,x),nrow=nrow(ranking))
    as.numeric(fai.coeff %*% w)
  }
  apply(ranking,1,distcalc)
}

#' Calculate Kendall distance
#' #' Calculate Kendall distance matrix between one ranking and a matrix of rankings
#' @param mat a matrix of rankings
#' @param r a single ranking
#' @return a vector of Kendall distance
#' @export
DistanceBlock <- function(mat,r){
  n = ncol(mat)
  w = c(rep(0,n-2),1)
  fai.coeff = matrix(CWeightGivenPi(mat,r),nrow=nrow(mat))
  as.numeric(fai.coeff %*% w)
}

#' Calculate Kendall distance between a pair of rankings
#' #' Calculate Kendall distance matrix between a pair of rankings
#' @param r1 a single ranking
#' @param r2 a single ranking
#' @return Kendall distance value
#' @export
DistancePair <- function(r1,r2){
  n = length(r1)
  w = c(rep(0,n-2),1)
  fai.coeff = matrix(CWeightGivenPi(r1,r2),nrow=1)
  as.numeric(fai.coeff %*% w)
}

#' Print a brief summary of the fitted model
#' Print a brief summary of the fitted model. This includes information about goodness
#' of fit as well as parameter estimation.
#' @param model a ranking model returned by a call to RankDistanceModel function.
#' @export
ModelSummary <- function(model){
  clus = rev(order(model$p))
  nobj = length(model$modal_ranking.est[[1]])
  mod_name = deparse(substitute(model))
  cat("Summary of",mod_name,"\n")
  cat(rep("=",nchar(mod_name)+10),"\n",sep="")
  cat("Goodness of Fit\n")
  cat("SSR:\t",model$SSR,"\n")
  cat("BIC:\t",model$BIC,"\n")
  cat("dof:\t",model$free_params,"\n")
  cat("Parameter Estimation\n")
  cat("Cluster",LETTERS[1:nobj],"p","Parameters\n",sep="\t")
  for (i in 1:length(clus)){
    cat(i,model$modal_ranking.est[[clus[i]]],round(model$p[clus[i]],digits=2),round(model$w.est[[clus[i]]],digits=2),"\n",sep="\t")
  }
}

#' Generate simple examples
#'
#' This function generates simple examples for illustrative proposes.
#' The sample contains the rankings of five objects and the underlying model is a Mallow's phi
#' model with dispersion parameter set to 0.2
#' and modal ranking set to (1,2,3,4,5)
#' @param ranking TRUE if "ranking" representation is used in the output data; otherwise "ordering" representation is used.
#' @param central The modal ranking.
#' @param lambda The parameter in the model.
#' @export
GenerateExample <- function(ranking=TRUE, central = 1:5, lambda = 0.2){
  rankings <- rbind(1:5,permute::allPerms(5))
  Kdist <- DistanceBlock(rankings,central)
  prob <- exp(-lambda*Kdist)
  prob <- prob/sum(prob)
  indx <- sample(1:120, 2000,replace=TRUE,prob=prob)
  count <- as.numeric(table(indx))
  if (ranking){
    return(list(ranking=rankings,count=count))
  } else {
    return(list(ordering=OrderingToRanking(rankings),count=count))
  }
}

#' Generate simple examples of top-q rankings
#'
#' This function generates simple examples for illustrative proposes.
#' The sample contains the top-3 rankings of five objects and the underlying model is a weighted Kendall distance model
#' with default weights set to (0.7,0.5,0.3,0)
#' and modal ranking set to (1,2,3,4,4)
#' @param central The modal ranking.
#' @param w The weights in the model.
#' @export
GenerateExampleTopQ <- function(central=c(1,2,3,4,4),w=c(0.7,0.5,0.3,0)){
  prankings <- rbind(1:5,permute::allPerms(5))
  prankings[prankings>3] <- 4
  prankings <- prankings[!duplicated(prankings),]
  fai <- wToparam(w)
  distmat <- matrix(CWeightGivenPi(prankings,central),ncol=nrow(prankings),byrow=TRUE)
  distvec <- as.numeric(fai%*%distmat)
  probs <- exp(-distvec)/sum(exp(-distvec))
  samples <- sample(1:nrow(prankings),10000,replace=TRUE,prob=probs)
  count <- table(samples)
  count <- as.numeric(count)
  return(list(ranking=prankings,count=count))
}

#' Create Hash Value for Rank
#'
#' Sometimes it is handier to deal with rankings into a hash value. \code{RanktoHash} returns hash values for ranks. Maximum 52 objects are supported.
#' @param r  a vector or matrix of rankings. Each row of the matrix represents a ranking.
#'   The ranking should be a integer from one to number of objects. No NA is allowed
#' @return a vector of character strings representing the hash values.
#' @seealso \code{\link{HashtoRank}} for a reverse operation.
#' @export
RanktoHash <- function(r){
  char <- c(letters,LETTERS)
  if (!is.matrix(r)){
    a <- paste(char[r],collapse="")
  } else {
    a <- apply(r,1,function(x)paste(char[x],collapse=""))
  }
  as.vector(a,mode="character")
}


#' Obtain Ranking from Hash Value
#'
#' \code{HashToRank} returns rankings from given hash values. Maximum 52 objects are supported.
#' @param h  A vector of hash values.
#' @return a matrix of rankings if input has more than one element or a vector of rankings if input has only one element
#' @seealso \code{\link{RanktoHash}} for a reverse operation.
#' @export
HashtoRank <- function(h){
  char <- c(letters,LETTERS)
  transvec <- seq_along(char)
  names(transvec) <- char
  nobj <- nchar(h[1])
  numrank <- transvec[unlist(strsplit(h,split=""))]
  if (length(h)==1){
    as.vector(numrank,mode="numeric")
  } else {
    matrix(data=numrank,ncol=nobj,byrow=TRUE)
  }

}


#' Transformation between Rankings and Orderings
#'
#' \code{OrderingToRanking} transforms between ranking representation and ordering representation.
#'
#'    Ranking representation encodes the position of objects. Ordering representation is an ordered sequence of objects.
#'    For example ranking (2 3 1 4) is equivalent to ordering (3 1 2 4), which means object 3 is first, object 1 is second, followed by object 2 and 4.
#'    Also note that we can use this function to transform rankings into orderings, and applying this function twice will not change the input value.
#' @param ordering  a matrix of orderings or rankings. Each row contains an observation.
#' @return  a matrix of transformed rankings or orderings. Each row contains an observation.
#' @export
OrderingToRanking <- function(ordering){
  if (is.matrix(ordering)){
    ranking <- t(apply(ordering,1,order))
  } else {
    ranking <- order(ordering)
  }
  ranking
}



#' Fit A Mixture of Distance-based Models
#'
#' \code{RankDistModel} fits a mixture of ranking models based on weighted Kendall distance.
#'
#' The procedure will estimate central rankings, the probability of each cluster and weights.
#'
#' @param dat A \linkS4class{RankData} object.
#' @param init A \linkS4class{RankInit} object.
#' @param ctrl A \linkS4class{RankControl} object.
#'
#' @return A list containing the following components:
#' \describe{
#' \item{\code{modal_ranking.est}}{the estimated pi0 for each cluster.}
#' \item{\code{p}}{the probability of each cluster.}
#' \item{\code{w.est}}{the estimated weights of each cluster.}
#' \item{\code{alpha}}{the estimated alpha for each cluster.}
#' \item{\code{param.est}}{the param parametrisation of weights of each cluster.}
#' \item{\code{SSR}}{the sum of squares of Pearson residuals}
#' \item{\code{log_likelihood}}{the fitted log_likelihood}
#' \item{\code{BIC}}{the fitted Bayesian Information Criterion value}
#' \item{\code{free_params}}{the number of free parameters in the model}
#' \item{\code{expectation}}{the expected value of each observation given by the model}
#' \item{\code{iteration}}{the number of EM iteration}
#' \item{\code{model.call}}{the function call}
#' }
#' @export
RankDistanceModel <- function(dat,init,ctrl){
  flag_transform = (min(dat@topq) != dat@nobj - 1)
  if (class(ctrl) == "RankControlWeightedKendall"){
    flag_transform = (ctrl@assumption == "equal-probability")
  }
  if (flag_transform){
    dat_org = dat
    dat = BreakTieEqualProb(dat)
  }
  # get parameters
  tt <- dat@nobj # number of objects
  n <- dat@nobs    # total observations
  distinctn <- dat@ndistinct	# total distinct observations
  clu <- init@clu
  count <- dat@count
  p_clu <- matrix(ncol = distinctn, nrow = clu)
  # setting up return values
  modal_ranking.est <- list()
  modal_ranking.est.last <- list()
  #alpha is added to incorporate randomness.
  alpha <- numeric(clu)
  p <- rep(1/clu,clu)
  p.last <- numeric(clu)
  param <- list()
  param.last <- list()
  func.call <- match.call()
  loopind <- 0

  if (clu > 1L){  # use EM to fit multi-cluster model
    # further initialization
    modal_ranking.est <- init@modal_ranking.init
    param <- init@param.init
    alpha <- init@alpha.init
    init.clu = list()
    for (i in 1:clu){
      init.clu[[i]] <- new("RankInit", param.init=list(param[[i]]), alpha.init = alpha[[i]],
                           modal_ranking.init=list(modal_ranking.est[[i]]), clu=1L)
    }
    while(TRUE){
      loopind <- loopind + 1
      # E step
      z <- matrix(nrow = distinctn, ncol = clu)
      # z is a matrix with rows corresponding to distinct rankings and columns corresponding
      # to clusters.
      for (i in 1:clu){
        z[,i] <- p[i]*FindProb(dat, ctrl, modal_ranking.est[[i]], c(param[[i]], alpha[[i]]))
      }
      sums <- rowSums(z)
      z <- z/sums #the dimensions of z remain unchanged.

      # M step
      p <- t(z) %*% count
      p <- p/sum(p)
      if (any(p<0.03)){
        warning("One cluster has probability smaller than 3%; Try fewer clusters")
        return (list(p=p, modal_ranking.est=modal_ranking.est, param=param,
                     alpha = alpha, iteration=loopind))
      }
      for ( i in 1:clu){
        dat.clu <- UpdateCount(dat, z[,i] * count)
        if (loopind > 1){
          init.clu[[i]]@param.init <- list(param[[i]])
          init.clu[[i]]@modal_ranking.init <- list(modal_ranking.est[[i]])
        }
        #Might need to change EMSolver() function in file utils.R, as it does not work
        #for models other than WeightedKendall. Will do it when the majority
        #of work is finished, as it is not directly related to this FYP.
        solve.clu <- SearchPi0Ext(dat.clu, init.clu[[i]], ctrl)
        modal_ranking.est[[i]] <- solve.clu$pi0.ranking
        param[[i]] <- solve.clu$param.est
        alpha[[i]] <- solve.clu$alpha.est
        p_clu[i,] <- FindProb(dat,ctrl,modal_ranking.est[[i]],c(param[[i]],
                                                                alpha[[i]]))*p[i]
      }
      #Observed data log likelihood.
      log_likelihood_clu <- sum(log(colSums(p_clu))%*%dat@count)
      # break?
      if(loopind != 1){
        if (loopind < ctrl@EM_limit) {
          cond1 <- all( abs(p.last - p) < ctrl@EM_epsilon)
          cond2 <- all( abs(unlist(modal_ranking.est.last) -
                              unlist(modal_ranking.est)) < ctrl@EM_epsilon)
          cond3 <- all( abs(unlist(param.last) - unlist(param))
                        < ctrl@EM_epsilon)
          if (cond1 && cond2 && cond3){
            break
          }
          #TODO: need to change the stopping criterion. Mar 29 21:17
          #Experiment with the old stopping rule for the time being. Mar 29 21:51
          else if( log_likelihood_clu - log_likelihood_clu.last  <
                   abs(log_likelihood_clu)*1e-5 ){
            print("Log likelihood failed to increase.")
            break
          }
        }
        else {
          print(paste("Algorithm did not converge in",ctrl@EM_limit,"iterations"))
          return(NULL)
        }
      }
      p.last <- p
      modal_ranking.est.last <- modal_ranking.est
      param.last <- param
      log_likelihood_clu.last <- log_likelihood_clu
    } # inf loop
    est_prob <- colSums(p_clu)
  }
  else {
    # fit single cluster model
    single_cluster_mod <- SearchPi0Ext(dat,init,ctrl)
    modal_ranking.est <- list(single_cluster_mod$pi0.ranking)
    p <- 1
    alpha <- single_cluster_mod$alpha.est
    param <- list(single_cluster_mod$param.est)
    log_likelihood_clu.last <- single_cluster_mod$log_likelihood
    est_prob <- FindProb(dat,ctrl,modal_ranking.est[[1]],c(param[[1]], alpha))*p[1]
  }
  # finishing up with parameter estimation and summary statistics
  p <- as.numeric(p)
  v <- length(p) - 1 + sum(unlist(param)!=0) + length(alpha)
  if (flag_transform){
    full_prob = rep(0, length(dat_org@count))
    # aggregate probability
    # iterate through different q
    for (i in 1:length(dat_org@topq)){
      ind_start <- dat_org@q_ind[i]
      ind_end <- dat_org@q_ind[i+1] - 1
      this_q <- dat_org@topq[i]
      ranking_q <- dat@ranking
      ranking_q[ranking_q > this_q] <- this_q + 1
      hash_q = RanktoHash(ranking_q)
      # iterate through partial rankings
      for (ind_partial in ind_start:ind_end){
        this_partial <- dat_org@ranking[ind_partial, ]
        this_hash <- RanktoHash(this_partial)
        ind_agg<- which(hash_q == this_hash)
        prob_agg<- sum(est_prob[ind_agg])
        full_prob[ind_partial] <- prob_agg
      }
      # conditional
      p_q = dat_org@subobs[i] / dat_org@nobs
      full_prob[ind_start:ind_end] = full_prob[ind_start:ind_end]*p_q
    }
    log_like<- dat_org@count %*% log(full_prob)
    bic <- -2*log_like + v*log(n)
    expectation <- as.numeric(full_prob*n)
    SSR <- sum((expectation-dat_org@count)^2/expectation)
  }
  else {
    log_like<- dat@count %*% as.numeric(log(est_prob))
    #I think the following line of code is incorrect. Leave for further checking.
    #log_likelihood_clu.last <- log_like

    bic <- -2*log_like + v*log(n)
    expectation <- as.numeric(est_prob * n)
    SSR <- sum((expectation-dat@count)^2/expectation)
  }
  #I think that in the following code, log_likelihood_clu.last is in doubt.
  ret <- list(p=p, modal_ranking.est=modal_ranking.est, w.est=lapply(param,paramTow),
              alpha = alpha, param.est=param,SSR=SSR,log_likelihood=log_like,
              free_params=v, BIC=bic,expectation=expectation,iteration=loopind,
              model.call=func.call)

  #bic <- 2*log_likelihood_clu.last - v*log(n)
  #ret <- list(p=p,       modal_ranking.est=modal_ranking.est,
  #            w.est=lapply(param,paramTow), alpha = alpha,
  #            param.est=param,     SSR=SSR,
  #            log_likelihood=log_likelihood_clu.last,
  #            free_params=v,     BIC=bic,     expectation=expectation,
  #            iteration=loopind,
  #            model.call=func.call)
  ret
}


#' Find Initial Values of param
#'
#' \code{MomentEst} finds the initial values of param which can be used in the subsequent optimization problems.
#' Linear model is fitted to the log odds of rankings.
#'
#' @param dat a RankData object
#' @param size the number of samples to take in the linear model
#' @param pi0 an optional argument showing the location of central ranking.
#' If not provided, Borda Count method is used to estimate the central ranking.
#' @return A list containing following components:
#' \describe{
#' \item{\code{param.est}}{the estimated parameter values.}
#' \item{\code{modal_ranking}}{the supplied modal ranking or the modal ranking estimated by Borda count method.}
#' }
#' @examples MomentsEst(apa_obj,40)
#' @export
MomentsEst <- function(dat,size,pi0=NULL){
  # estimating pi0
  avg_rank <- dat@count %*% dat@ranking
  if (is.null(pi0)){
    modal_ranking <- sort(OrderingToRanking(dat@ranking[1,]))[order(avg_rank)]
  } else {
    modal_ranking <- pi0
  }
  nparam <- dat@nobj - 1
  # construct equation set
  prob_obs <- dat@count/dat@nobs
  # row-wise
  distance_mat <- matrix(CWeightGivenPi(dat@ranking,modal_ranking),nrow = dat@ndistinct)
  pair <- sample(1:nrow(distance_mat), 2*size, replace=TRUE)
  pair_mat <- matrix(pair,nrow=2)
  logodd <- numeric(size)
  design_mat <- matrix(ncol=nparam, nrow=size)
  for (i in 1:size){
    logodd[i] <- log(prob_obs[pair_mat[1,i]]/prob_obs[pair_mat[2,i]])
    design_mat[i,] <- distance_mat[pair_mat[2,i],]-distance_mat[pair_mat[1,i],]
  }
  param.est <- stats::lm(logodd~design_mat-1)
  param.est <- param.est$coefficients
  param.est[param.est<0] <- 0
  names(param.est) <- NULL
  list(param.est = param.est, modal_ranking = modal_ranking)
}

if (FALSE){
  param <- numeric()
  paramMat <- NULL
  i <- 1
  while (i <= 100){
    param <- MomentsEst(dat2, 10000)
    if (!any(is.na(param))){
      paramMat <- rbind(paramMat, param)
      i <- i + 1
    }
  }
  mean <- colMeans(paramMat)
  ratio <- matrix(nrow = 100, ncol= 4)
  for (i in 1:100){
    ratio[i,] <- paramMat[i,]/mean
  }
}

#' American Psychological Association (APA) election data
#'
#' A dataset containing 5738 complete votes in APA election. There are 5 candidates in total.
#'
#' @format a RankData object
#'
#' @references Marden, J. I. (1995). Analyzing and Modeling Rank Data (94-96). Chapman Hall, New York.
"apa_obj"

#' American Psychological Association (APA) election data (partial rankings included)
#'
#' A dataset containing 5738 complete votes and 9711 partial votes in APA election. There are 5 candidates in total.
#'
#' @format a RankData object
#'
#' @references Marden, J. I. (1995). Analyzing and Modeling Rank Data (94-96). Chapman Hall, New York.
"apa_partial_obj"
simonfqy/rankdistext documentation built on May 29, 2019, 8:19 p.m.