R/cata.R

Defines functions topc code.topk salton rv.coef toMatrix toWideMatrix barray mcnemarQ cochranQ

Documented in barray cochranQ code.topk mcnemarQ rv.coef salton toMatrix topc toWideMatrix

#' Cochran's Q test
#'
#' Conduct Cochran's Q test assuming equal columns proportions for matched binary
#' responses versus the alternative hypothesis of unequal column proportions.
#' @name cochranQ
#' @usage cochranQ(X, quiet = FALSE, digits = getOption("digits"))
#' @param X matrix of \eqn{I} assessors (rows) and \eqn{J} products (columns)
#' where values are \code{0} (not checked) or \code{1} (checked)
#' @param quiet if \code{FALSE} (default) then it prints information related to 
#' the test; if \code{TRUE} it returns only the test statistic (\code{Q})
#' @param digits for rounding
#' @return Cochran's Q test results (statistic, degrees of freedom, p-value)
#' @export
#' @details
#' Method returns test statistic, degrees of freedom, and p value from Cochran's
#' Q test.
#' 
#' @encoding UTF-8
#' @seealso \code{\link[cata]{mcnemarQ}}
#' @author J.C. Castura
#' @references  
#' Cochran, W.G. (1950). The comparison of percentages in matched samples. 
#' \emph{Biometrika}, 37, 256-266, \doi{10.2307/2332378} 
#' 
#' Meyners, M., Castura, J.C., & Carr, B.T. (2013). Existing and  
#' new approaches for the analysis of CATA data. \emph{Food Quality and Preference}, 
#' 30, 309-319, \doi{10.1016/j.foodqual.2013.06.010}
#' @examples
#' # Cochran's Q test on the first 50 consumers on the first attribute ("Fresh")
#' cochranQ(bread$cata[1:50, , 1], digits=3)
#' 
#' # Same, returning only test statistics for the first 4 attributes
#' t(res <- apply(bread$cata[1:50, , 1:4], 3, cochranQ, quiet=TRUE, digits=3))
cochranQ <- function(X, quiet = FALSE, digits = getOption("digits")){
  if(is.vector(X)){
    X <- matrix(X, nrow = 1)
  }
  X <- matrix(as.integer(X), nrow = nrow(X), ncol = ncol(X), dimnames=dimnames(X))
  Jn <- ncol(X)
  X <- X[which((rowSums(X) < Jn) & (rowSums(X) > 0)), , drop=FALSE] 
  if(sum(X) == 0){
    if(!quiet){
      print("No variability in X")
    }
    return(c(Q = NA, df = Jn - 1, p.value = NA, effect.size = NA))
  }
  Jn <- ncol(X)
  # Cochran's Q test
  numQ <- Jn * (Jn-1) * (sum(colSums(X)^2) - (sum(X)^2)/Jn)
  denomQ <-  Jn * sum(X) - sum(rowSums(X)^2)
  Q = numQ / denomQ
  # chance-corrected effect size
  # In <- nrow(X) # blocks
  # if(In == 1){ 
  #   udelta <- delta <- 0
  #   R = NA
  # } else { # this condition is needed to calculate delta
  #   Ipairs <- utils::combn(In, 2)
  #   dnum <- sum(apply(Ipairs, 2, function(pindx) {
  #     sum(abs(X[pindx[1], ] - X[pindx[2], ]))
  #   }))
  #   delta <- dnum/(Jn * ncol(Ipairs))
  #   p_i <- apply(X, 1, mean)
  #   udelta <- 1/ncol(Ipairs) * ((sum(p_i)*(In - sum(p_i))) - sum((p_i*(1-p_i))))
  #   R = 1 - delta/udelta
  # }
  # report output
  o <- c(Q  = round(Q, digits = digits), 
         df = Jn - 1,
         p.value = round(stats::pchisq(Q, Jn - 1, lower.tail = FALSE),
                          digits = digits))#, effect.size=round(R, digits = digits))
  if(!quiet){
    cat(paste(c("",
                "Cochran's Q test", 
                "----------------",
                "H0: Citation rates equal for all products (columns)",
                "H1: Citation rates not equal for all products",
                ""), 
              collapse = '\n'))
    print(o[1:3])
    # cat(paste(c("",
    #             "Chance-corrected effect size: ", o["effect.size"], ""), 
    #           collapse = '\n'))
  }
  invisible(o)
}

#' McNemar's test
#'
#' Pairwise tests are conducted using the two-tailed binomial test. These tests
#' can be conducted after Cochran's Q test. 
#' @name mcnemarQ
#' @usage mcnemarQ(X, quiet = FALSE, digits = getOption("digits"))
#' @param X matrix of \eqn{I} assessors (rows) and \eqn{J} products (columns)
#' where values are \code{0} (not checked) or \code{1} (checked)
#' @param quiet if \code{FALSE} (default) then it prints information related to 
#' the test; if \code{TRUE} it returns only the test statistic (\code{Q})
#' @param digits for rounding
#' @return Test results for all McNemar pairwise tests conducted via the 
#' binomial test
#' @export
#' @encoding UTF-8
#' @seealso \code{\link[cata]{cochranQ}}
#' @author J.C. Castura
#' @references  
#' Cochran, W.G. (1950). The comparison of percentages in matched samples. 
#' \emph{Biometrika}, 37, 256-266. \doi{10.2307/2332378}
#' 
#' McNemar, Q. (1947). Note on the sampling error of the difference between 
#' correlated proportions or percentages. \emph{Psychometrika}, 12(2), 153-157.
#' \doi{10.1007/BF02295996}
#' 
#' Meyners, M., Castura, J.C., & Carr, B.T. (2013). Existing and  
#' new approaches for the analysis of CATA data. \emph{Food Quality and 
#' Preference}, 30, 309-319, \doi{10.1016/j.foodqual.2013.06.010}
#' 
#' @examples
#' # McNemar's exact pairwise test for all product pairs
#' # on the first 50 consumers and the first attribute ("Fresh")
#' mcnemarQ(bread$cata[1:50, , 1])
#' 
#' # Same, returning only results for the first 4 attributes
#' (res <- apply(bread$cata[1:50, , 1:4], 3, mcnemarQ, quiet=TRUE, simplify=FALSE))
mcnemarQ <- function(X, quiet = FALSE, digits = getOption("digits")){
  if(is.vector(X)){
    if(!quiet){
      print("X must be a matrix with at least 2 rows and 2 columns")
    }
    return(NA)
  }
  X <- matrix(as.integer(X), nrow = nrow(X), ncol = ncol(X), dimnames=dimnames(X))
  Jn <- ncol(X)
  X <- X[which((rowSums(X) < Jn) & (rowSums(X) > 0)), , drop=FALSE] # this also deals with missings!
  if(sum(X) == 0){
    if(!quiet){
      print("No variability in X")
    }
    return(NA)
  }
  res <- matrix(NA, nrow = choose(Jn, 2), ncol = 6,
                dimnames = list(NULL, c("index.1", "index.2", "b", "c", 
                                        "proportion", "p.value")))
  res[,1:2] <- t(utils::combn(Jn, 2))
  if(Jn == 2){
    res[,3:4] <- colSums(X)
    res[,5] <- res[,3] / sum(res[,3:4])
  } else {
    res[,3:4] <- apply(barray(X), 2:3, sum)
    res[,5] <- res[,3] / rowSums(res[,3:4])
  }
  res[,6] <- mapply(function(x, y){ 
    2*sum(stats::dbinom(0:min(x, sum(x,y)-x), size = sum(x,y), 
                        prob = .5))}, res[,3], res[,4])
  res[,6] <- mapply(min, res[,6], 1)
  if(!quiet){
    cat(paste(c("",
                "McNemar's pairwise test", 
                "-----------------------", "",
                "H0: Citation rates equal for both products",
                "H1: Citation rates not equal for both products",
                "", ""), 
              collapse = '\n'))
    res.print <- res
    res.print[,5:6] <- round(res.print[,5:6], digits = digits)
    print(as.data.frame(res.print, row.names = ""))
  }
  invisible(res)
}

#' Convert 3d array of CATA data to 4d array of CATA differences
#'
#' Converts a three-dimensional array (\eqn{I} assessors, \eqn{J}
#' products, \eqn{M} attributes) to a four-dimensional array of product
#' comparisons (\eqn{I} assessors, \eqn{J(J-1)/2}
#' product comparisons, two outcomes (of type \code{b} or \code{c}), \eqn{M} 
#' attributes)
#'  
#' @name barray
#' @usage barray(X, values = "bc", type.in = "binary", type.out = "binary")
#' @param X three-dimensional array (\eqn{I} assessors, \eqn{J}
#' products, \eqn{M} attributes) where values are \code{0} (not checked) 
#' or \code{1} (checked)
#' @param values \code{"bc"} (default) returns two outcomes: \code{b} and 
#' \code{c}; otherwise \code{"abcd"} returns four outcomes: \code{a}, \code{b}, 
#' \code{c}, \code{d}.
#' @param type.in type of data submitted; default (\code{binary}) may be set to
#' \code{ordinal} or \code{scale}.
#' @param type.out currently only \code{binary} is implemented
#' @return A four-dimensional array of product comparisons having \eqn{I} 
#' assessors, \eqn{J(J-1)/2} product comparisons, outcomes (see \code{values}
#' parameter), \eqn{M} attributes
#' @export
#' @encoding UTF-8
#' @author J.C. Castura
#' @references Castura, J.C., Meyners, M., Varela, P., & Næs, T. (2022). 
#' Clustering consumers based on product discrimination in check-all-that-apply 
#' (CATA) data. \emph{Food Quality and Preference}, 104564. 
#' \doi{10.1016/j.foodqual.2022.104564}.
#' 
#' @examples
#' # Get the 4d array of CATA differences for the first 8 consumers
#' b <- barray(bread$cata[1:8,,])
barray <- function(X, values = "bc", type.in = "binary", type.out = "binary"){
  abcd_1attribute <- function(X, type.in = "binary"){
    .abcd_1pair <- function(x, y, type.in = "binary"){
      x <- c(x) # product 'x' (1 response per consumer)
      y <- c(y) # product 'y' (1 response per consumer)
      if(type.in == "binary"){
        return(list(a = sum(x+y == 2), b = sum(x-y == 1),
                    c = sum(x-y == -1), d = sum(x+y == 0)))
      }
      if(type.in %in% c("scale", "ordinal")){
        return(list(a = sum(all(x == y, x != 0)), 
                    b = sum(x > y),
                    c = sum(x < y), 
                    d = sum(all(x == y, x == 0))))
      }
    }
    if(is.vector(X)){
      X <- matrix(X, nrow = 1)
    } else {
      X <- as.matrix(X) # assessors x products
    }
    pair.items <- utils::combn(ncol(X),2)
    this.pair <- matrix(0, nrow = ncol(pair.items), ncol = 4, 
                        dimnames = list(NULL, letters[1:4]))
    for(i in 1:ncol(pair.items)){
      this.pair[i, ] <- unlist(.abcd_1pair(X[,pair.items[1,i]], 
                                           X[,pair.items[2,i]], type.in = type.in))
    }
    return(this.pair)
  }
  nI <- dim(X)[1]
  nJ <- dim(X)[2]
  if(length(dim(X))==2){
    oX <- X
    X <- array(NA, c(nI, nJ, 1), dimnames = list(dimnames(X)[[1]],
                                                 dimnames(X)[[2]], 1))
    X[,,1] <- oX
  }
  nM <- dim(X)[3]
  out <- array(0, c(nI, nJ*(nJ-1)/2, 4, nM), 
               dimnames = list(dimnames(X)[[1]], 
                               apply(t(utils::combn(nJ,2)), 1, paste0, collapse = "_"), 
                               letters[1:4], 
                               dimnames(X)[[3]]#colnames(X[1,,])
                               ))
  for (i in 1:nI){
    for(m in 1:nM){
      this.data <- X[i,,m]
      out[i,,1:4,m] <- abcd_1attribute(this.data, type.in = type.in)
    }
  }
  if (values == "bc") {
    return(out[,, 2:3,, drop=FALSE])
  } else {
    if (values == "abcd") {
      return(out)
    } else {
      return(print(paste("Invalid option: values =", values), quote = FALSE))
    }
  }
}

#' Converts 3d array of CATA data to a wide 2d matrix format
#'
#' Converts a three-dimensional array (\eqn{I} assessors, \eqn{J}
#' products, \eqn{M} attributes) to a two-dimensional matrix
#' (\eqn{J} products, (\eqn{I} assessors, \eqn{M} attributes))
#'  
#' @name toWideMatrix
#' @usage toWideMatrix(X)
#' @param X three-dimensional array (\eqn{I} assessors, \eqn{J}
#' products, \eqn{M} attributes) where values are \code{0} (not checked) 
#' or \code{1} (checked)
#' @return A matrix with \code{J} products in rows and \eqn{I} 
#' assessors \eqn{\times M} attributes in columns
#' @export
#' @encoding UTF-8
#' @author J.C. Castura
#' @examples
#' # convert CATA results from the first 8 consumers and the first 4 attributes
#' # to a wide matrix
#' toWideMatrix(bread$cata[1:8,,1:4])
toWideMatrix <- function(X){
  out <- X[1,,]
  for(i in 2:dim(X)[[1L]]){
    out <- cbind(out, X[i,,])
  }
  return(out)
}

#' Converts 3d array of CATA data to a tall 2d matrix format
#'
#' Converts a three-dimensional array (\eqn{I} assessors, \eqn{J}
#' products, \eqn{M} attributes) to a two-dimensional matrix with
#' (\eqn{I} assessors, \eqn{J} products) rows and (\eqn{M} 
#' attributes) columns, optionally preceded by two columns of row headers.
#'  
#' @name toMatrix
#' @usage toMatrix(X, header.rows = TRUE, oneI = FALSE, oneM = FALSE)
#' @param X three-dimensional array (\eqn{I} assessors, \eqn{J}
#' products, \eqn{M} attributes) where values are \code{0} (not checked) 
#' or \code{1} (checked)
#' @param header.rows \code{TRUE} (default) includes row headers; set to
#' \code{FALSE} to exclude these headers
#' @param oneI indicates whether calculation is for one assessor (default: 
#' \code{FALSE})
#' @param oneM  indicates whether calculation is for one attribute (default: 
#' \code{FALSE})
#' @return A matrix with \eqn{I} assessors \eqn{\times J} products in rows
#' and \eqn{M} attributes in columns (preceded by 2 columns)
#' of headers if \code{header.rows = TRUE}
#' @export
#' @encoding UTF-8
#' @examples
#' # convert CATA results from the first 8 consumers and the first 4 attributes
#' # to a tall matrix
#' toMatrix(bread$cata[1:8,,1:4])
toMatrix <- function(X, header.rows = TRUE, oneI = FALSE, oneM = FALSE){
  if(any(oneI, oneM)) {
    if(all(oneI, oneM)){
      dims <- c(1, length(X), 1)
    } else {
      dims <- dim(X) # 2D
      if(oneI){
        dims <- c(1, dims) # nI was 1
      }
      if(oneM){
        dims <- c(dims, 1) # nM was 1
      }
    }
    X <- array(X, dim = dims)
  }
  if(length(dim(X)) == 2){
    X <- array(X, c(dim(X), 1), dimnames = list(
      dimnames(X)[[1]], dimnames(X)[[2]], "response"))
  }
  dims <- dim(X)
  if(length(dims) != 3){
    return("Function is to convert an array to a matrix")
  } 
  nI <- dims[1]
  nJ <- dims[2]
  nM <- dims[3]
  if(is.null(dimnames(X)[[1]])[1]) dimnames(X)[[1]] <- 1:nI
  if(is.null(dimnames(X)[[2]])[1]) dimnames(X)[[2]] <- 1:nJ
  if(is.null(dimnames(X)[[3]])[1]) dimnames(X)[[3]] <- 1:nM
  
  this.rownames <- paste0(rep(dimnames(X)[[1]], each = nJ), "_",
                          rep(dimnames(X)[[2]], times = nI))
  Xout <- matrix(NA, ncol = dim(X)[[3]], nrow = prod(dim(X)[-3]),
                 dimnames = list(this.rownames, dimnames(X)[[3]]))
  for(cc in 1:nM){
    Xout[,cc] <- c(aperm(X[,,cc], 2:1))
  }
  if(header.rows){
    Xout <- data.frame(subject = as.factor(rep(dimnames(X)[[1]], each = nJ)),
                       product = as.factor(rep(dimnames(X)[[2]], times = nI)),
                       Xout)
  }
  return(Xout)
}

#' Calculate \eqn{RV} Coefficient
#'
#' Calculate \eqn{RV} coefficient
#' @name rv.coef
#' @usage rv.coef(X, Y, method = 1)
#' @param X input matrix (same dimensions as \code{Y})
#' @param Y input matrix (same dimensions as \code{X})
#' @param method \code{1} (default) and \code{2} give identical \eqn{RV} coefficients
#' @return \eqn{RV} coefficient
#' @export
#' @encoding UTF-8
#' @author J.C. Castura
#' @references Robert, P., & Escoufier, Y. (1976). A unifying tool for linear 
#' multivariate statistical methods: the RV-coefficient. \emph{Journal of the Royal 
#' Statistical Society: Series C (Applied Statistics)}, 25, 257-265.
#' \doi{10.2307/2347233}
#' @examples
#' # Generate some data
#' set.seed(123)
#' X <- matrix(rnorm(8), nrow = 4)
#' Y <- matrix(rnorm(8), nrow = 4)
#' 
#' # get the RV coefficient
#' rv.coef(X, Y)
rv.coef <- function(X, Y, method = 1){
  tr <- function(X){
    sum(diag(X), na.rm = TRUE)
  }
  X <- sweep(X, 2, colMeans(X))#apply(X, 2, mean), "-")
  Y <- sweep(Y, 2, colMeans(Y))#apply(Y, 2, mean), "-")
  XX <- tcrossprod(X,X)
  YY <- tcrossprod(Y,Y)
  if(method==1){
    out <- tr(XX %*% YY) /
      sqrt(tr(XX %*% XX) %*% tr(YY %*% YY) )
  }
  if(method==2){
    out <- sum(c(XX)*c(YY), na.rm=TRUE) /
      sqrt(sum(XX^2, na.rm=TRUE)*sum(YY^2, na.rm=TRUE))
  }
  return(c(out))
}

#' Salton's cosine measure
#'
#' Calculate Salton's cosine measure
#' @name salton
#' @usage salton(X, Y)
#' @param X input matrix (same dimensions as \code{Y})
#' @param Y input matrix (same dimensions as \code{X})
#' @return Salton's cosine measure
#' @export
#' @encoding UTF-8
#' @author J.C. Castura
#' @references Salton, G., & McGill, M.J. (1983). \emph{Introduction to Modern 
#' Information Retrieval}. Toronto: McGraw-Hill.
#' @examples
#' # Generate some data
#' set.seed(123)
#' X <- matrix(rnorm(8), nrow = 4)
#' Y <- matrix(rnorm(8), nrow = 4)
#' 
#' # get Salton's cosine measure
#' salton(X, Y)
salton <- function(X, Y){
  out <- sum(c(X)*c(Y)) / sqrt(sum(X^2)*sum(Y^2))
  return(out)
}


#' Apply top-k box coding to scale data
#'
#' Apply top-k box coding to scale data. Using defaults give top-2 box (T2B) coding.
#' @name code.topk
#' @usage code.topk(X, zero.below = 8, one.above = 7)
#' @param X input matrix
#' @param zero.below default is \code{8}; values below this numeric threshold 
#' will be coded \code{0}; use \code{NULL} if there is no such threshold
#' @param one.above default is \code{7}; values above this numeric threshold 
#' will be coded \code{1}; use \code{NULL} if there is no such threshold
#' @return matrix \code{X} with top-k coding applied
#' @export
#' @encoding UTF-8
#' @author J.C. Castura
#' @references Castura, J.C., Meyners, M., Pohjanheimo, T., Varela, P., & Næs, T. (2023). 
#' An approach for clustering consumers by their top-box and top-choice responses. 
#' \emph{Journal of Sensory Studies}, e12860. \doi{10.1111/joss.12860}
#' @examples
#' # Generate some data
#' set.seed(123)
#' X <- matrix(sample(1:9, 100, replace = TRUE), nrow = 5)
#' 
#' # apply top-2 box (T2B) coding
#' code.topk(X, zero.below = 8, one.above = 7)
code.topk <- function(X, zero.below = 8, one.above = 7){
  Y <- X
  if(!is.null(zero.below) && !is.null(one.above)){
    if(zero.below != one.above + 1){
      return(
        print("zero.below should be equal to one.above+1 if both specified"))
    }
  }
  if(!is.null(zero.below)){
    Y[Y < zero.below] <- 0
  }
  if(!is.null(one.above)){
    Y[Y > one.above] <- 1
  }
  return(matrix(as.integer(Y), nrow = nrow(Y), ncol = ncol(Y), dimnames = dimnames(Y)) )
  #return(Y)
}

#' Apply top-c choices coding to a vector of scale data from a respondent
#'
#' Apply top-c choices coding to a vector of scale data from a respondent
#' @name topc
#' @usage topc(x, c = 2, coding = "B")
#' @param x input matrix
#' @param c number of top choices considered to be 'success'; other choices are 
#' considered to be 'failure' and are coded \code{0}
#' @param coding \code{"B"} (default) codes all successes as \code{1}; 
#' \code{"N"} codes all successes with their numeric coding
#' @return matrix \code{X} with top-k coding applied
#' @export
#' @encoding UTF-8
#' @author J.C. Castura
#' @references Castura, J.C., Meyners, M., Pohjanheimo, T., Varela, P., & Næs, T. (2023). 
#' An approach for clustering consumers by their top-box and top-choice responses. 
#' \emph{Journal of Sensory Studies}, e12860. \doi{10.1111/joss.12860}
#' @examples
#' # Generate some data
#' set.seed(123)
#' X <- matrix(sample(1:9, 100, replace = TRUE), nrow = 5)
#' 
#' # apply top-2 choice (T2C) coding
#' apply(X, 1, topc)
topc <- function(x, c = 2, coding = "B"){
  #anything with a value of 0 is special, so give in the max value
  y <- max(x)-x+1
  ranky <- rank(y, ties.method = "average")
  ranky.u <- sort(unique(ranky))
  it <- 0
  indx <- c()
  nc <- 0
  while(nc < c){
    it <- it + 1
    indx <- c(indx, which(ranky == ranky.u[it]))
    nc <- length(indx)
  }
  out <- x*0
  if(coding %in% "B"){
    out[indx] <- 1
  }
  if(coding %in% "N"){
    out[indx] <- x[indx]
  }
  return(out)
}

Try the cata package in your browser

Any scripts or data that you put into this service are public.

cata documentation built on Sept. 12, 2025, 1:08 a.m.