R/distance.R

Defines functions distanceNormalized print.hdistance distanceHartmann getDistributionParameters simulateSlaterAndHartmannDistribution getSlaterPaperPars quasiDistributionDistanceSlater get_upper_triangle generate_quasis generate_quasi distanceSlater slaterStandardization print.distance dist_minmax distance

Documented in distance distanceHartmann distanceNormalized distanceSlater dist_minmax print.distance print.hdistance quasiDistributionDistanceSlater slaterStandardization

#### Distances ####


#' Distance measures (between constructs or elements).
#'
#' Various distance measures between elements or constructs are calculated.
#'
#' @param x           `repgrid` object.
#' @param along       Whether to calculate distance for 1 = constructs (default)
#'                    or for 2= elements.
#' @param dmethod     The distance measure to be used. This must be one of
#'                    "euclidean", "maximum", "manhattan", "canberra", "binary"
#'                    or "minkowski". Any unambiguous substring can be given.
#'                    For additional information on the different types type
#'                    `?dist`.
#' @param  p          The power of the Minkowski distance, in case `"minkowski"`
#'                    is used as argument for `dmethod`.
#' @param normalize   Use normalized distances. The distances are divided by the
#'                    highest possible value given the rating scale fo the grid,
#'                    so all distances are in the interval `[0,1]`.
#' @param trim        The number of characters a construct or element is trimmed to (default is
#'                    `20`). If `NA` no trimming occurs. Trimming
#'                    simply saves space when displaying correlation of constructs
#'                    with long names.
#' @param index       Whether to print the number of the construct or element
#'                    in front of the name (default is `TRUE`). This is useful to avoid
#'                    identical row names, which may cause an error.
#' @param ...         Additional parameters to be passed to function `dist`.
#'                    Type `dist` for further information.
#' @return  `matrix` object.
#' @export
#' @examples
#'
#' # between constructs
#' distance(bell2010, along = 1)
#' distance(bell2010, along = 1, normalize = TRUE)
#'
#' # between elements
#' distance(bell2010, along = 2)
#'
#' # several distance methods
#' distance(bell2010, dm = "man") # manhattan distance
#' distance(bell2010, dm = "mink", p = 3) # minkowski metric to the power of 3
#'
#' # to save the results without printing to the console
#' d <- distance(bell2010, trim = 7)
#' d
#'
#' # some more options when printing the distance matrix
#' print(d, digits = 5)
#' print(d, col.index = FALSE)
#' print(d, upper = FALSE)
#'
#' # accessing entries from the matrix
#' d[1, 3]
#'
distance <- function(x, along = 1, dmethod = "euclidean",
                     p = 2, normalize = FALSE, trim = 20, index = TRUE, ...) {
  dmethods <- c(
    "euclidean", "maximum", "manhattan", # possible distance methods
    "canberra", "binary", "minkowski"
  )
  dmethod <- match.arg(dmethod, dmethods) # match method
  if (!inherits(x, "repgrid")) { # check if x is repgrid object
    stop("Object x must be of class 'repgrid'")
  }

  r <- getRatingLayer(x, trim = trim)
  if (along == 2) {
    r <- t(r)
  }
  d <- dist(r, method = dmethod, p = p, ...)
  d <- as.matrix(d)
  d <- addNamesToMatrix2(x, d, index = index, trim = trim, along = along)
  class(d) <- c("distance", "matrix")
  attr(d, "arguments") <- list(
    along = along, dmethod = dmethod, p = p,
    notes = NULL, cutoff = NULL,
    normalize = normalize
  )
  # normalize distances
  if (normalize) {
    mx <- dist_minmax(x, along = along, dmethod = dmethod, p = p, max.only = TRUE)
    d <- d / mx
  }

  return(d)
}


#' Calculate minimal and maximal possible distance
#'
#' While the minimal distance will usually be zero, the maximal distance can be used to normalize arbitrary distances.
#' @keywords internal
#'
dist_minmax <- function(x, along = 1, dmethod = "euclidean", p = 2, max.only = FALSE) {
  R <- ratings(x)
  # constructs = 1, elements = 2
  if (along == 2) {
    R <- t(R)
  }
  r3 <- r2 <- r1 <- R[1, , drop = FALSE] # make it work with single constructs or element
  sc <- getScale(x)
  r1[, ] <- sc["min"]
  r2[, ] <- sc["min"]
  r3[, ] <- sc["max"]
  r <- rbind(r1, r2, r3)
  d <- dist(r, method = dmethod, p = p)
  minmax <- range(as.vector(d))
  if (max.only) {
    return(minmax[2])
  }
  minmax
}


#' Print method for class distance.
#'
#' @param x           Object of class distance.
#' @param digits      Numeric. Number of digits to round to (default is
#'                    `2`).
#' @param col.index   Logical. Whether to add an extra index column so the
#'                    column names are indexes instead of construct names. This option
#'                    renders a neater output as long construct names will stretch
#'                    the output (default is `TRUE`).
#' @param upper       Whether to display upper triangle of correlation matrix only
#'                    (default is `TRUE`).
#' @param cutoffs     Cutoff values. Values below or above this interval are not
#'                    printed. For Slater distances `c(.8, 1.2)` are common
#'                    values.
#' @param diag        Whether to show the matrix diagonal.
#' @param ...         Not evaluated.
#' @export
#' @method            print distance
#' @keywords          internal
#'
print.distance <- function(x, digits = 2, col.index = TRUE,
                           upper = TRUE, diag = FALSE, cutoffs = NA, ...) {
  diag <- !diag # convert as used in upper.tri
  args <- attr(x, "arguments")
  d <- x
  class(d) <- "matrix"
  d <- round(d, digits)
  e <- format(d, nsmall = digits) # convert to characters for printing

  ## console output ##
  blank <- paste(rep(" ", max(nchar(as.vector(e)))), collapse = "", sep = "")

  # remove values above or below explicit cutoff
  if (!is.na(cutoffs[1])) {
    n.s. <- min(cutoffs) < d & d < max(cutoffs)
    e[n.s.] <- blank
  }

  if (upper) {
    e[lower.tri(e, diag = diag)] <- blank
  }
  # make index column for neater colnames
  if (col.index) {
    e <- addIndexColumnToMatrix(e)
  } else {
    colnames(e) <- seq_len(ncol(e))
  }
  e <- as.data.frame(e)
  if (args$along == 1) {
    cat("\n############################")
    cat("\nDistances between constructs")
    cat("\n############################")
  } else if (args$along == 2) {
    cat("\n##########################")
    cat("\nDistances between elements")
    cat("\n##########################")
  }
  cat("\n\nDistance method: ", args$dmethod)
  if (args$dmethod == "minkowski") {
    cat("\nPower p:", args$p)
  }
  cat("\nNormalized:", args$normalize)
  cat("\n")
  print(e)
  if (!is.null(args$notes)) {
    cat(args$notes)
  }
  cat("\n")
}


#### Slater distance ####

#' Internal workhorse for Slater standardization.
#'
#' Function uses a matrix as input. All overhead
#' of `repgrid` class is avoided. Needed for speedy simulations.
#'
#' @param x   A matrix.
#' @keywords internal
#' @export
#'
slaterStandardization <- function(x) {
  E <- dist(t(x), diag = TRUE, upper = TRUE) # euclidean distance
  E <- as.matrix(E)
  m <- ncol(x) # number of elements
  D <- sweep(x, 1, apply(x, 1, mean)) # row-center data
  S <- sum(diag(t(D) %*% D))
  U <- (2 * S / (m - 1))^0.5
  E / U # divide by expected distance unit
}


#' Slater distances (standardized Euclidean distances).
#'
#' The euclidean distance is often used as a measure of similarity between elements (see [distance()]. A drawback of
#' this measure is that it depends on the range of the rating scale and the number of constructs used, i. e. on the
#' size of a grid. \cr An approach to standardize the euclidean distance to make it independent from size and range of
#' ratings and was proposed by Slater (1977, pp. 94). The 'Slater distance' is the Euclidean distance divided by the
#' expected distance. Slater distances bigger than 1 are greater than expected, lesser than 1 are smaller than
#' expected. The minimum value is 0 and values bigger than 2 are rarely found. Slater distances have been be used to
#' compare inter-element distances between different grids, where the grids do not need to have the same constructs or
#' elements. Hartmann (1992) showed that Slater distance is not independent of grid size. Also the distribution of the
#' Slater distances is asymmetric. Hence, the upper and lower limit to infer 'significance' of distance is not
#' symmetric. The practical relevance of Hartmann's findings have been demonstrated by Schoeneich and Klapp (1998). To
#' calculate Hartmann's version of the standardized distances see [distanceHartmann()]
#'
#' @section Calculation: The Slater distance is calculated as follows.
#' For a derivation see Slater (1977, p.94).       \cr
#' Let matrix \eqn{D}{D} contain the row centered ratings. Then
#'    \deqn{P = D^TD}{P = D^TD} and
#'    \deqn{S = tr(P)}{S = tr(P)}
#' The expected 'unit of expected distance' results as     \cr
#' \deqn{U = (2S/(m-1))^{1/2}}{U = (2S/(m-1))^.5}
#' where \eqn{m}{m} denotes the number of elements of the grid.
#' The standardized Slater distances is the matrix of Euclidean distances
#' \eqn{E}{E} divided by the expected distance \eqn{U}{U}.
#' \deqn{E/U}{E/U}
#'
#' @inheritParams     distance
#' @return  A matrix with Slater distances.
#'
#' @references   Hartmann, A. (1992). Element comparisons in repertory grid technique: Results and consequences of a
#'   Monte Carlo study. *International Journal of Personal Construct Psychology, 5*(1), 41-56.
#'
#'   Schoeneich, F., & Klapp, B. F. (1998). Standardization of interelement distances in repertory grid technique and
#'   its consequences for psychological interpretation of self-identity plots: An empirical study.
#'  *Journal of Constructivist Psychology, 11*(1), 49-58.
#'
#'   Slater, P. (1977). *The measurement of intrapersonal space by Grid technique.* Vol. II. London: Wiley.
#' @export
#' @seealso [distanceHartmann()]
#' @examples
#'
#' distanceSlater(bell2010)
#' distanceSlater(bell2010, trim = 40)
#'
#' d <- distanceSlater(bell2010)
#' print(d)
#' print(d, digits = 4)
#'
#' # using Norris and Makhlouf-Norris (problematic) cutoffs
#' print(d, cutoffs = c(.8, 1.2))
#'
distanceSlater <- function(x, trim = 20, index = TRUE) {
  if (!inherits(x, "repgrid")) {
    stop("Object must be of class 'repgrid'")
  }
  E <- distance(x, along = 2, index = index)
  E.sl <- slaterStandardization(E)
  notes <- c(
    "\nNote that Slater distances cannot be compared across grids",
    "with a different number of constructs (see Hartmann, 1992).\n"
  )
  attr(E.sl, "arguments") <- list(
    along = 2, dmethod = "Slater (standardized Euclidean)",
    p = 2, notes = notes
  )
  class(E.sl) <- c("distance", "matrix")
  E.sl
}


#### Hartmann distance ####

# helper functions for Slater distribution simulation
#
generate_quasi <- function(nc = 5, ne = 10, r = 1:5, prob = rep(1, length(r))) {
  matrix(sample(r, size = nc * ne, replace = T, prob = prob), ncol = ne)
}


generate_quasis <- function(n, nc = 5, ne = 10, r = 1:5, prob = rep(1, length(r))) {
  replicate(n, generate_quasi(nc = nc, ne = ne, r = r, prob = prob), simplify = FALSE)
}


get_upper_triangle <- function(x) {
  x[upper.tri(x, diag = FALSE)]
}


quasiDistributionDistanceSlater <- function(reps, nc, ne, range,
                                            prob = NULL, progress = TRUE) {
  q <- generate_quasis(reps, nc = nc, ne = ne, r = range, prob = NULL)
  fun <- lapply
  if (progress) {
    fun <- lapply_pb
  }
  dist.sl <- fun(q, slaterStandardization)
  dist.sl <- lapply(dist.sl, get_upper_triangle)
  unlist(dist.sl)
}


# quasiDistributionDistanceSlater <- function(rep, nc, ne, range, prob=NULL, progress=TRUE)
# {
#   quasis <- randomGrids(rep, nc=nc, ne=ne, range=range, prob=prob, options=0)
#   if (progress)                 # whether to show progress bar
#     lapply_fun <- lapply_pb else
#       lapply_fun <- lapply
#   quasis.sd <- lapply_fun(quasis, function(x){
#     ds <- distanceSlater(x)
#     ds[lower.tri(ds, diag=FALSE)]
#   })
#   unlist(quasis.sd)         # return as vector
# }



# Return a list with the mean and sd as indicated in Hartmann's (1992) paper.
#
getSlaterPaperPars <- function(nc) {
  constructs <- NULL # dummy to avoid R CMD CHECK non-visible variable binding NOTE

  # hartmann only provides values for a number of constructs between 7 and 21.
  # Smaller and bigger grids use the parameters with the next best number of
  # constructs from Hartmann
  if (nc < 7) {
    nc <- 7
  }
  if (nc > 21) {
    nc <- 21
  }

  ## constants ##
  # parameters for Slater distance distributions used to calculated Hartmann
  # distances as supplied by Hartmann, 1992,  p. 51. (only defined for 7 to 21
  # constructs)
  #
  hartmann.pars <- data.frame(
    constructs = 7:21,
    mean = c(
      .97596, .97902, 0.98236, .98322, .98470,
      .98643, .98749, .98811, .98908, .98972,
      .99034, .99092, .99135, .99193, .99228
    ),
    sd = c(
      .21910, .20376, .19211, .18240, .17396, .16416, .15860, .15374, .14700,
      .14303, .13832, .13444, .13082, .12676, .12365
    )
  )

  # "Therefore the means of all Z-transformed percentiles [avering three scale
  # range, MH] are suggested to be used as cutoffs for distance interpretation.
  # The use of the 5th and the 95th percentiles is recommended (see Table 5)"
  # (Hartmann, 1992, p.52).
  #
  hartmann.cutoffs <- c(
    p01 = 2.492, p05 = 1.777, p10 = 1.387,
    p90 = -1.186, p95 = -1.519, p99 = -2.129
  )
  slater.mean <- unlist(subset(hartmann.pars, constructs == nc, mean))
  slater.sd <- unlist(subset(hartmann.pars, constructs == nc, sd))
  list(mean = slater.mean, sd = slater.sd)
}


simulateSlaterAndHartmannDistribution <- function(reps = 1000, nc, ne, range,
                                                  prob = NULL,
                                                  progress = TRUE) {
  # this operation takes some time
  slater.vals <- quasiDistributionDistanceSlater(
    reps = reps, nc = nc,
    ne = ne, range = range,
    prob = prob,
    progress = progress
  )
  # mean and sd of Slater distribution
  mean.slater <- mean(slater.vals, na.rm = TRUE)
  sd.slater <- sd(slater.vals, na.rm = TRUE)

  # conversion to Hartmann values
  hartmann.vals <- -1 * (slater.vals - mean.slater) / sd.slater
  list(
    slater = slater.vals,
    hartmann = hartmann.vals
  )
}


# NOT USED
# # caclulate coverage probability_
# coverageProbability <- function(x, cutoffs)
# {
#   l  <- length(x)
#   oneCoverProb <- function(cutoff)
#     sum(x < cutoff) / l
#   sapply(cutoffs, oneCoverProb)
# }


getDistributionParameters <- function(x, probs = c(.01, .025, .05, .1, .9, .95, .975, .99),
                                      na.rm = TRUE) {
  pars <- describe(x)
  qs <- quantile(x, probs = probs, na.rm = na.rm)
  # cover.probs <- coverageProbability(x, cutoffs)       # get coverage probabalities for cutoffs
  list(pars = pars, quantiles = qs)
}


#' Calculate Hartmann distance
#'
#' Hartmann (1992) showed in a simulation study that Slater distances (see [distanceSlater()]) based on random grids,
#' for which Slater coined the expression quasis, have a skewed distribution, a mean and a standard deviation depending
#' on the number of constructs elicited. He suggested a linear transformation (z-transformation) which takes into
#' account the estimated (or expected) mean and the standard deviation of the derived distribution to standardize
#' Slater distance scores across different grid sizes. 'Hartmann distances' represent a more accurate version of
#' 'Slater distances'. Note that Hartmann distances are multiplied by -1. Hence, negative Hartmann values represent
#' dissimilarity, i.e. a big Slater distance. \cr
#'
#' There are two ways to use this function. Hartmann distances can either be calculated based on the reference values
#' (i.e. means and standard deviations of Slater distance distributions) as given by Hartmann in his paper. The second
#' option is to conduct an instant simulation for the supplied grid size for each calculation. The second option will
#' be more accurate when a big number of quasis is used in the simulation. \cr
#'
#' It is also possible to return the quantiles of the sample distribution and only the element distances considered
#' 'significant' according to the quantiles defined.
#'
#'
#' @section Calculation:
#'
#' The 'Hartmann distance' is calculated as follows (Hartmann 1992, p. 49).  \cr
#' \deqn{D = -1 (\frac{D_{slater} - M_c}{sd_c})}{D = -1 (D_slater - M_c / sd_c)}
#' Where \eqn{D_{slater}}{D_slater} denotes the Slater distances of the grid,
#' \eqn{M_c}{M_c} the sample distribution's mean value and
#' \eqn{sd_c}{sd_c} the sample distribution's standard deviation.
#'
#' @title 'Hartmann distance' (standardized Slater distances).
#' @param x           `repgrid` object.
#' @param method      The method used for distance calculation, on of
#'                    `"paper", "simulate", "new"`. `"paper"` uses the
#'                    parameters as given in Hartmann (1992) for calculation.
#'                    `"simulate"` (default) simulates a Slater distribution
#'                    for the calculation. In a future version the time consuming
#'                    simulation will be replaced by more accurate parameters for
#'                    Hartmann distances than used in Hartmann (1992).
#' @param reps        Number of random grids to generate sample distribution for
#'                    Slater distances (default is `10000`). Note that
#'                    a lot of samples may take a while to calculate.
#' @param prob        The probability of each rating value to occur.
#'                    If `NULL` (default) the distribution is uniform.
#'                    The number of values must match the length of the rating scale.
#' @param progress    Whether to show a progress bar during simulation
#'                    (default is `TRUE`) (for `method="simulate"`).
#'                    May be useful when the distribution is estimated on the basis
#'                    of many quasis.
#' @param distributions Whether to additionally return the values of the simulated
#'                    distributions (Slater etc.) The default is `FALSE`
#'                    as it will quickly boost the object size.
#' @return  A matrix containing Hartmann distances. In the attributes several additional parameters can be found:
#'
#' - `arguments`: A list of several parameters including `mean` and `sd` of Slater distribution.
#' - `quantiles`: Quantiles for Slater and Hartmann distance distribution.
#' - `distributions`: List with values of the simulated distributions.
#'
#' @references   Hartmann, A. (1992). Element comparisons in repertory  grid technique: Results and consequences of a
#'   Monte  Carlo study. *International Journal of Personal Construct Psychology, 5*(1), 41-56.
#' @export
#' @seealso [distanceSlater()]
#' @examples \dontrun{
#'
#' ### basics  ###
#'
#' distanceHartmann(bell2010)
#' distanceHartmann(bell2010, method = "simulate")
#' h <- distanceHartmann(bell2010, method = "simulate")
#' h
#'
#' # printing options
#' print(h)
#' print(h, digits = 6)
#' # 'significant' distances only
#' print(h, p = c(.05, .95))
#'
#' # access cells of distance matrix
#' h[1, 2]
#'
#' ### advanced  ###
#'
#' # histogram of Slater distances and indifference region
#' h <- distanceHartmann(bell2010, distributions = TRUE)
#' l <- attr(h, "distributions")
#' hist(l$slater, breaks = 100)
#' hist(l$hartmann, breaks = 100)
#' }
#'
distanceHartmann <- function(x, method = "paper", reps = 10000,
                             prob = NULL, progress = TRUE, distributions = FALSE) {
  if (distributions == TRUE & method != "simulate") {
    method <- "simulate"
    warning("'method' set to 'simulate' to return distributions", call. = FALSE)
  }
  if (!inherits(x, "repgrid")) {
    stop("Object must be of class 'repgrid'")
  }
  ps <- seq(0, 1, .001) # probabilty for quantiles to return

  # select parameter derivation for transformation
  method <- match.arg(method, c("paper", "simulate", "new"))

  ## get grid parameters ##
  mm <- getScale(x) # get min and max scale value
  range <- mm[1]:mm[2]
  nc <- getNoOfConstructs(x)
  ne <- getNoOfElements(x)

  # derive parameters mean and sd by simulation of Slater distance distributions
  # for given grid size
  if (method == "simulate") {
    v <- simulateSlaterAndHartmannDistribution(
      reps = reps, nc = nc, ne = ne, range = range,
      prob = prob, progress = progress
    )
    sl <- getDistributionParameters(v$slater)
    hm <- getDistributionParameters(v$hartmann)
  }

  # use parameters mean and sd from Hartmann paper or simulated
  # TODO: use my own simulated parameters with bigger sample size
  #       for more accuracy than Hartmann. This will give accurate results
  #       while still avoiding time-consuming simulations.
  notes <- NULL
  if (method == "paper") {
    p <- getSlaterPaperPars(nc)
    notes <- c(
      "\nFor calculation the parameters from Hartmann (1992) were used.",
      "Use 'method=new' or method='simulate' for a more accurate version.\n"
    )
  } else if (method == "simulate") {
    p <- list(mean = sl$pars$mean, sd = sl$pars$sd)
  } else if (method == "new") {
    stop("method 'new' has not yet been implemented", call. = FALSE)
  } else {
    stop("'method' must be 'paper', 'simulate' or 'new'", call. = FALSE)
  }

  # linear transformation to derive Hartmann distance from Slater distances
  # (Hartmman, 1992, p. 49)
  D <- distanceSlater(x)
  H <- -1 * (D - p$mean) / p$sd
  # diag(H) <- 0      # replace diagonal as zeros (Hmmm?)

  # prepare output
  attr(H, "arguments") <- list(
    along = 2, dmethod = "Hartmann (standardized Slater distances)", p = 2,
    notes = notes,
    parameters = p,
    method = method
  )
  if (method == "simulate") {
    quantiles.slater <- quantile(v$slater, probs = ps)
    quantiles.hartmann <- quantile(v$hartmann, probs = ps)
  } else {
    quantiles.slater <- NULL
    quantiles.hartmann <- NULL
  }

  attr(H, "quantiles") <- list(
    slater = quantiles.slater,
    hartmann = quantiles.hartmann
  )
  # return Slater and Hartmann simulated distribution values if requested
  # caution: objects get quite big ~ 6mb with 10000 reps
  if (distributions) {
    attr(H, "distributions") <- v
  }
  class(H) <- c("hdistance", "distance", "matrix")
  H
}


#' Print method for class hdistance (Hartmann distance objects).
#'
#' Additionally allows the specification of p-values. The corresponding quantiles are calculated.  These are contained
#' as attributes in the `hdistance` object as returned by distanceHartmann. The lowest and highest values are used as
#' cutoffs in `print.distance`.
#'
#' @param x Object of class hdistance.
#' @inheritParams print.distance
#' @param p Quantiles corresponding to probabilities are used as cutoffs. Currently only works for Hartmann
#'   distances. If used `cutoffs` is overwritten.
#' @param ... Not evaluated.
#'
#' @export
#' @method print hdistance
#' @keywords internal
print.hdistance <- function(x, digits = 2, col.index = TRUE,
                            upper = TRUE, diag = FALSE, cutoffs = NA,
                            p = NA, ...) {
  # only calculate quantiles when they are supplied in the object.
  # this is currently only the case for method="simulate". Will change
  # when bigger simulations are finished.
  do.quantiles <- FALSE
  if (attr(x, "arguments")$method == "simulate") {
    do.quantiles <- TRUE
  }

  # select quantiles to use (Hartmann or Power-transformed Hartmann distances)
  dmethod <- attr(x, "arguments")$dmethod
  if (dmethod == "Hartmann (standardized Slater distances)") {
    dm <- "hartmann"
  } else {
    dm <- "bc"
  }

  # calculate quantiles (cutoff values) from lowest and biggest p value. get
  # quantiles from distance object (only for Hartmann distance). not the best
  # approach, maybe change this in the future to use a lookup table when solid
  # cutoffs are available.
  if (!is.na(p[1]) & do.quantiles) {
    which.p <- round(seq(0, 1, .001), 6) %in% round(p, 6) # to avoid floating number representation inequalities
    qs <- attr(x, "quantiles")[[dm]][which.p]
    cutoffs <- qs
  }

  # call standard printing function for distance objects
  print.distance(
    x = x, digits = digits, col.index = col.index, upper = upper,
    diag = diag, cutoffs = cutoffs
  )

  # add quantiles used as cutoffs
  if (!is.na(p[1]) & do.quantiles) {
    cat("Quantiles:\n")
    print(round(qs, 4))
    cat("\nThe smallest and biggest quantiles are used as cutoffs.\n")
  }
}


#### Power transformed Hartmann distance ####


#' Calculate power-transformed Hartmann distances.
#'
#' Hartmann (1992) suggested a transformation of Slater (1977) distances to make
#' them independent from the size of a grid. Hartmann distances are supposed to
#' yield stable cutoff values used to determine 'significance' of inter-element
#' distances. It can be shown that Hartmann distances are still affected by grid
#' parameters like size and the range of the rating scale used (Heckmann, 2012).
#' The function `distanceNormalize` applies a Box-Cox (1964) transformation to the
#' Hartmann distances in order to remove the skew of the Hartmann distance
#' distribution. The normalized values show to have more stable cutoffs
#' (quantiles) and better properties for comparison across grids of different
#' size and scale range.  \cr
#'
#' The function `distanceNormalize` can also return
#' the quantiles of the sample distribution and only the element distances
#' considered 'significant' according to the quantiles defined.
#'
#' @section Calculations:
#' The 'power transformed Hartmann distance' are calculated as
#' follows: The simulated Hartmann distribution is added a constant as the
#' Box-Cox transformation can only be applied to positive values. Then a range
#' of values for lambda in the Box-Cox transformation (Box & Cox, 1964) are
#' tried out. The best lambda is the one maximizing the correlation of the
#' quantiles with the standard normal distribution. The lambda value maximizing
#' normality is used to transform Hartmann distances. As the resulting scale of
#' the power transformation depends on lambda, the resulting values are
#' z-transformed to derive a common scaling.
#'
#' The code for the calculation of the optimal lambda was written by Ioannis
#' Kosmidis.
#'
#' @param x           `repgrid` object.
#' @param reps        Number of random grids to generate to produce
#'                    sample distribution for Hartmann distances
#'                    (default is `1000`). Note that
#'                    a lot of samples may take a while to calculate.
#' @inheritParams     distanceHartmann
#'
#' @return  A matrix containing the standardized distances. \cr
#'                    Further data is contained in the object's attributes: \cr
#'                    \item{`"arguments"`}{A list of several parameters
#'                           including `mean` and `sd` of Slater distribution.}
#'                    \item{`"quantiles"`}{Quantiles for Slater, Hartmann
#'                          and power transformed distance distributions.}
#'                    \item{`"distributions"`}{List with values of the
#'                          simulated distributions, if `distributions=TRUE`.}
#'
#' @references   Box, G. E. P., & Cox, D. R. (1964). An Analysis of Transformations.
#'  *Journal of the Royal Statistical Society. Series B (Methodological), 26*(2), 211-252.
#'
#'   Hartmann, A. (1992). Element comparisons in repertory grid technique: Results and consequences of a Monte Carlo
#'   study. *International Journal of Personal Construct Psychology, 5*(1), 41-56.
#'
#'   Heckmann, M. (2012). *Standardizing inter-element distances in grids - A revision of Hartmann's distances*, 11th
#'   Biennal Conference of the European Personal Construct Association (EPCA), Dublin, Ireland, Paper presentation,
#'   July 2012.
#'
#'   Slater, P. (1977). *The measurement of intrapersonal space by Grid technique*. London: Wiley.
#'
#' @export
#' @seealso  [distanceHartmann()] and [distanceSlater()].
#' @examples \dontrun{
#'
#' ### basics  ###
#'
#' distanceNormalized(bell2010)
#' n <- distanceNormalized(bell2010)
#' n
#'
#' # printing options
#' print(n)
#' print(n, digits = 4)
#' # 'significant' distances only
#' print(n, p = c(.05, .95))
#'
#' # access cells of distance matrix
#' n[1, 2]
#'
#' ### advanced  ###
#'
#' # histogram of Slater distances and indifference region
#' n <- distanceNormalized(bell2010, distributions = TRUE)
#' l <- attr(n, "distributions")
#' hist(l$bc, breaks = 100)
#' }
#'
distanceNormalized <- function(x, reps = 1000, prob = NULL, progress = TRUE,
                               distributions = TRUE) {
  if (!inherits(x, "repgrid")) {
    stop("Object must be of class 'repgrid'")
  }

  ps <- seq(0, 1, .001) # probabilty for quantiles to return

  # calculate Hartmann and Slater distances
  h <- distanceHartmann(x,
    reps = reps, prob = prob, progress = progress,
    method = "simulate", distributions = distributions
  )

  # optimal lambda for Box-Cox transformation. Add constant as only defined
  # for positive values. Use offest 1 for same treatment of tails.
  d <- attr(h, "distributions")
  constant <- abs(min(c(d$hartmann, h))) + 1.00001
  bc <- optimal.boxcox(d$hartmann + constant)

  # parameters to standardize power transformed Hartmann values
  lambda.max <- bc$lambda
  sd.bc <- sd(bc$x)
  mean.bc <- mean(bc$x)

  # function to perform Box-Cox tranformation plus standardization
  bc.transform <- function(x) { # , constant, lambda.max, sd.bc, mean.bc){
    res <- ((x + constant)^lambda.max - 1) / lambda.max # power transformation
    (res - mean.bc) / (sd.bc) # z-transformation
  }

  # make bc transformations for all Hartmann data
  bc.dist <- bc.transform(d$hartmann) # transform simulated Hartmann distribution
  bc.vals <- bc.transform(h) # transform grid data
  bc.qs <- quantile(bc.dist, ps, na.rm = TRUE)

  # prepare output
  notes <- NULL
  l <- list(dmethod = "Power transformed Hartmann distances", notes = notes)
  attr(h, "arguments") <- modifyList(attr(h, "arguments"), l)

  # add distribution of power-transformed values
  if (distributions) {
    attr(h, "distributions")$bc <- bc.dist
  }
  # add quantiles of power-transformed values
  attr(h, "quantiles")$bc <- bc.qs
  h
}
markheckmann/OpenRepGrid documentation built on Sept. 17, 2024, 3:49 a.m.