R/basis.R

Defines functions pairwise_basis cdp_basis_ cdp_basis pb_basis sbp_basis cbalance_approx cc_basis pc_basis alr_basis clr_basis olr_basis ilr_basis basis check_dim

Documented in alr_basis basis cbalance_approx cc_basis cdp_basis clr_basis ilr_basis olr_basis pairwise_basis pb_basis pc_basis sbp_basis

check_dim = function(dim){
  if(!is.numeric(dim)){
    stop("Dimension should be a number", call. = FALSE)
  }
  if(!dim>0){
    stop("dimension should be positive", call. = FALSE)
  }
}


#' @title Coordinates basis
#'
#' @description
#' Obtain coordinates basis
#' @param H coordinates for which basis should be shown
#' @return basis used to create coordinates H
#' @export
basis = function(H){
  if(is.null(attr(H, 'basis'))) return(message('No basis specified'))
  attr(H, 'basis')
}

#' Isometric/Orthonormal log-ratio basis for log-transformed compositions.
#'
#' By default the basis of the clr-given by Egozcue et al., 2013
#' Build an isometric log-ratio basis for a composition with k+1 parts
#' \deqn{h_i = \sqrt{\frac{i}{i+1}} \log\frac{\sqrt[i]{\prod_{j=1}^i x_j}}{x_{i+1}}}{%
#' h[i] = \sqrt(i/(i+1)) ( log(x[1] \ldots x[i])/i - log(x[i+1]) )}
#' for \eqn{i \in 1\ldots k}.
#'
#'Modifying parameter type (pivot or cdp) other ilr/olr basis can be generated
#'
#' @param dim number of components
#' @param type if different than `pivot` (pivot balances) or `cdp` (codapack balances) default balances are returned, which computes a triangular Helmert matrix as defined by Egozcue et al., 2013.
#' @return matrix
#' @references
#' Egozcue, J.J., Pawlowsky-Glahn, V., Mateu-Figueras, G. and Barceló-Vidal C. (2003).
#' \emph{Isometric logratio transformations for compositional data analysis}.
#' Mathematical Geology, \strong{35}(3) 279-300
#' @examples
#' ilr_basis(5)
#' @export
ilr_basis = function(dim, type = 'default'){
  check_dim(dim)
  if(type == 'cdp'){
    B = cdp_basis_(dim)
  }else{
    B = ilr_basis_default(dim)
    if(type == 'pivot'){
      B = (-B)[,ncol(B):1, drop = FALSE][nrow(B):1,]
    }
  }
  colnames(B) = sprintf("ilr%d", 1:ncol(B))
  rownames(B) = sprintf("c%d", 1:nrow(B))
  B
}

#' @rdname ilr_basis
#' @export
olr_basis = function(dim, type = 'default'){
  check_dim(dim)
  if(type == 'cdp'){
    B = cdp_basis_(dim)
  }else{
    B = ilr_basis_default(dim)
    if(type == 'pivot'){
      B = (-B)[,ncol(B):1, drop = FALSE][nrow(B):1,]
    }
  }
  colnames(B) = sprintf("olr%d", 1:ncol(B))
  rownames(B) = sprintf("c%d", 1:nrow(B))
  B
}

#' Centered log-ratio basis
#'
#' Compute the transformation matrix to express a composition using
#' the linearly dependant centered log-ratio coordinates.
#'
#' @param dim number of parts
#' @return matrix
#' @references
#' Aitchison, J. (1986)
#' \emph{The Statistical Analysis of Compositional Data}.
#' Monographs on Statistics and Applied Probability. Chapman & Hall Ltd., London (UK). 416p.
#' @examples
#' (B <- clr_basis(5))
#' # CLR coordinates are linearly dependant coordinates.
#' (clr_coordinates <- coordinates(c(1,2,3,4,5), B))
#' # The sum of all coordinates equal to zero
#' sum(clr_coordinates) < 1e-15
#' @export
clr_basis = function(dim){
  check_dim(dim)
  B = clr_basis_default(dim)
  colnames(B) = sprintf("clr%d", 1:ncol(B))
  rownames(B) = sprintf("c%d", 1:nrow(B))
  B
}


#' Additive log-ratio basis
#'
#' Compute the transformation matrix to express a composition using the oblique additive log-ratio
#' coordinates.
#'
#' @param dim number of parts
#' @param denominator part used as denominator (default behaviour is to use last part)
#' @param numerator parts to be used as numerator. By default all except the denominator parts are chosen following original order.
#' @return matrix
#' @examples
#' alr_basis(5)
#' # Third part is used as denominator
#' alr_basis(5, 3)
#' # Third part is used as denominator, and
#' # other parts are rearranged
#' alr_basis(5, 3, c(1,5,2,4))
#' @references
#' Aitchison, J. (1986)
#' \emph{The Statistical Analysis of Compositional Data}.
#' Monographs on Statistics and Applied Probability. Chapman & Hall Ltd., London (UK). 416p.
#' @export
alr_basis = function(dim, denominator = dim, numerator = which(denominator != 1:dim)){
  check_dim(dim)
  res = alr_basis_default(dim)
  res = cbind(res, 0)
  if(dim != denominator){
    res[c(denominator, dim),] = res[c(dim, denominator),, drop = FALSE]
    res[,c(denominator, dim)] = res[,c(dim, denominator), drop = FALSE]
  }
  B = res[,numerator, drop = FALSE]
  colnames(B) = sprintf("alr%d", 1:ncol(B))
  rownames(B) = sprintf("c%d", 1:nrow(B))
  B
}

#' Isometric log-ratio basis based on Principal Components.
#'
#' Different approximations to approximate the principal balances of a compositional dataset.
#'
#' @param X compositional dataset
#' @return matrix
#'
#' @export
pc_basis = function(X){
  B = ilr_basis(ncol(X))
  B = B %*% svd(scale(log(as.matrix(X)) %*% B, scale=FALSE))$v

  parts = colnames(X)
  if(is.null(parts)){
    parts = paste0('c', 1:nrow(B))
  }
  rownames(B) = parts
  colnames(B) = paste0('pc', 1:ncol(B))
  as.matrix(B)
}

#' Isometric log-ratio basis based on canonical correlations
#'
#'
#' @param Y compositional dataset
#' @param X explanatory dataset
#' @return matrix
#'
cc_basis = function(Y, X){
  Y = as.matrix(Y)
  X = cbind(X)
  B = ilr_basis_default(ncol(Y))
  cc = stats::cancor(coordinates(Y), X)
  B = B %*% cc$xcoef
  parts = colnames(Y)
  if(is.null(parts)){
    parts = paste0('c', 1:nrow(B))
  }
  rownames(B) = parts
  colnames(B) = paste0('cc', 1:ncol(B))
  B
}

#' Balance generated from the first canonical correlation component
#'
#'
#' @param Y compositional dataset
#' @param X explanatory dataset
#' @return matrix
#'
#' @export
cbalance_approx = function(Y,X){
  Y = as.matrix(Y)
  X = cbind(X)
  B = ilr_basis(ncol(Y))
  cc1 = B %*% stats::cancor(coordinates(Y), X)$xcoef[,1,drop=F]
  ord = order(abs(cc1))
  cb1_ = sign(cc1)
  cb1 = cb1_
  cor1 = abs(suppressWarnings(stats::cancor(coordinates(Y,sbp_basis(cb1_)), X)$cor))
  for(i in 1:(ncol(Y)-2)){
    cb1_[ord[i]] = 0
    cor1_ = abs(suppressWarnings(stats::cancor(coordinates(Y,sbp_basis(cb1_)), X)$cor))
    if(cor1_ > cor1){
      cb1 = cb1_
      cor1 = cor1_
    }
  }
  suppressWarnings(sbp_basis(cb1))
}

#' Isometric log-ratio basis based on Balances
#'
#' Build an \code{\link{ilr_basis}} using a sequential binary partition or
#' a generic coordinate system based on balances.
#'
#' @param sbp parts to consider in the numerator and the denominator. Can be
#' defined either using a list of formulas setting parts (see examples) or using
#' a matrix where each column define a balance. Positive values are parts in
#' the numerator, negative values are parts in the denominator, zeros are parts
#' not used to build the balance.
#' @param data composition from where name parts are extracted
#' @param fill should the balances be completed to become an orthonormal basis?
#'  if the given balances are not orthonormal, the function will complete the
#'  balance to become a basis.
#' @param silent inform about orthogonality
#' @return matrix
#' @examples
#' X = data.frame(a=1:2, b=2:3, c=4:5, d=5:6, e=10:11, f=100:101, g=1:2)
#' sbp_basis(list(b1 = a~b+c+d+e+f+g,
#'                b2 = b~c+d+e+f+g,
#'                b3 = c~d+e+f+g,
#'                b4 = d~e+f+g,
#'                b5 = e~f+g,
#'                b6 = f~g), data = X)
#' sbp_basis(list(b1 = a~b,
#'                b2 = b1~c,
#'                b3 = b2~d,
#'                b4 = b3~e,
#'                b5 = b4~f,
#'                b6 = b5~g), data = X)
#' # A non-orthogonal basis can also be calculated.
#' sbp_basis(list(b1 = a+b+c~e+f+g,
#'               b2 = d~a+b+c,
#'               b3 = d~e+g,
#'               b4 = a~e+b,
#'               b5 = b~f,
#'               b6 = c~g), data = X)
#' @export
sbp_basis = function(sbp, data = NULL, fill = FALSE, silent=FALSE){
  if(is.null(data) & is.matrix(sbp)){
    # P = t(sbp)
    df = as.data.frame(matrix(1, nrow(sbp), nrow = 1))
    if(!is.null(rownames(sbp))){
      colnames(df) = rownames(sbp)
    }
    str_to_frm = function(vec){
      frm = paste(stats::aggregate(nm ~ vec, subset(data.frame(nm = paste0('`',names(df), '`'), vec = -1 * vec,
                                                               stringsAsFactors = FALSE), vec != 0),
                                   FUN = paste, collapse= ' + ')[['nm']], collapse=' ~ ')
      stats::as.formula(frm)
    }
    return(sbp_basis(apply(sbp, 2, str_to_frm),
                     data = df,
                     fill = fill,
                     silent = silent))
    # return(do.call('sbp_basis', c(apply(P, 1, str_to_frm), list(data=df,
                                                                # fill = fill,
                                                                # silent = silent)))) #, envir = as.environment('package:coda.base')
  }

  if (!is.data.frame(data) && !is.environment(data) && ( (is.matrix(data) && !is.null(colnames(data))) | !is.null(attr(data, "class"))))
    data <- as.data.frame(data)
  else if (is.array(data))
    stop("'data' must be a data.frame or a matrix with column names")


  if(!all(unlist(lapply(sbp, all.vars)) %in% c(names(data), names(sbp)))){
    stop("Balances should be columns of 'data'")
  }
  nms = setdiff(names(sbp), "")
  if(length(nms) > 0){
    substitutions = lapply(sbp, all.vars)
    substitutions = substitutions[nms]
    while(!all(is.na(substitutions)) &
          !all(unlist(substitutions) %in% names(data))){
      for(nm in nms){
        substitutions = lapply(substitutions, function(subs){
          I = match(nm, subs)
          if(!is.na(I)){
            c(subs[-I], substitutions[[nm]])
          }else{
            subs
          }
        })
      }
    }
  }

  sbp_split = function(part){
    RIGHT = attr(stats::terms(part), 'term.labels')
    LEFT = setdiff(all.vars(part), RIGHT)
    if(length(nms) > 0){
      for(nm in nms){
        I = match(nm, RIGHT)
        if(!is.na(I)){
          RIGHT = c(RIGHT[-I], substitutions[[nm]])
        }
        I = match(nm, LEFT)
        if(!is.na(I)){
          LEFT = c(LEFT[-I], substitutions[[nm]])
        }
      }
    }
    list(LEFT, RIGHT)
  }

  sbp_clean = lapply(sbp, sbp_split)
  RES = sapply(sbp_clean, function(balance){
    I1 = length(balance[[1]])
    I2 = length(balance[[2]])
    l = +1/I1 * sqrt(I1*I2/(I1+I2))
    r = -1/I2 * sqrt(I1*I2/(I1+I2))
    bal = stats::setNames(rep(0, length(names(data))), names(data))
    bal[balance[[1]]] = bal[balance[[1]]] + l
    bal[balance[[2]]] = bal[balance[[2]]] + r
    bal
  })
  if(fill){
    return(Recall(fill_sbp(sign(RES))))
  }
  if(!silent){
    if(qr(RES)$rank != NCOL(data)-1){
      warning('Given partition is not a basis')
    }else{
      Z = t(RES) %*% RES
      if( !all( Z - diag(diag(Z), nrow=NROW(Z), ncol=NCOL(Z)) < 10e-10 ) ){
        warning('Given basis is not orthogonal')
      }else{
        if(!all( Z - diag(1, nrow=NROW(Z), ncol=NCOL(Z)) < 10e-10 )){
          warning('Given basis is not orthonormal')
        }
      }
    }
  }
  RES
}



#' Isometric log-ratio basis based on Principal Balances.
#'
#' Exact method to calculate the principal balances of a compositional dataset. Different methods to approximate the principal balances of a compositional dataset are also included.
#'
#' @param X compositional dataset
#' @param method method to be used with Principal Balances. Methods available are: 'exact', 'constrained' or 'cluster'.
#' @param constrained.complete_up When searching up, should the algorithm try to find possible siblings for the current balance (TRUE) or build a parent directly forcing current balance to be part of the next balance (default: FALSE). While the first is more exhaustive and given better results the second is faster and can be used with highe dimensional datasets.
#' @param cluster.method Method to be used with the hclust function (default: `ward.D2`) or any other method available in  hclust function
#' @param ordering should the principal balances found be returned ordered? (first column, first
#' principal balance and so on)
#' @param ... parameters passed to hclust function
#' @return matrix
#' @references
#' Martín-Fernández, J.A., Pawlowsky-Glahn, V., Egozcue, J.J., Tolosana-Delgado R. (2018).
#' Advances in Principal Balances for Compositional Data.
#' \emph{Mathematical Geosciencies}, 50, 273-298.
#' @examples
#' set.seed(1)
#' X = matrix(exp(rnorm(5*100)), nrow=100, ncol=5)
#'
#' # Optimal variance obtained with Principal components
#' (v1 <- apply(coordinates(X, 'pc'), 2, var))
#' # Optimal variance obtained with Principal balances
#' (v2 <- apply(coordinates(X,pb_basis(X, method='exact')), 2, var))
#' # Solution obtained using constrained method
#' (v3 <- apply(coordinates(X,pb_basis(X, method='constrained')), 2, var))
#' # Solution obtained using Ward method
#' (v4 <- apply(coordinates(X,pb_basis(X, method='cluster')), 2, var))
#'
#' # Plotting the variances
#' barplot(rbind(v1,v2,v3,v4), beside = TRUE, ylim = c(0,2),
#'         legend = c('Principal Components','PB (Exact method)',
#'                    'PB (Constrained)','PB (Ward approximation)'),
#'         names = paste0('Comp.', 1:4), args.legend = list(cex = 0.8), ylab = 'Variance')
#'
#' @export
pb_basis = function(X, method, constrained.complete_up = FALSE, cluster.method = 'ward.D2',
                    ordering = TRUE, ...){
  X = as.matrix(X)
  if(!(all(X > 0))){
    stop("All components must be strictly positive.", call. = FALSE)
  }
  if(method %in% c('constrained', 'exact')){
    if(method == 'exact'){
      M = 'PB'
      B = find_PB(X)
    }
    if(method == 'constrained'){
      M = 'CS'
      # B = t(fBalChip(X)$bal)
      B = constrained_pb(as.matrix(X))
    }
    if(method == 'constrained2'){
      M = 'CS'
      B = find_PB_using_pc(X)
    }
    # if(method == 'lsearch'){
    #   if(rep == 0){
    #     B = find_PB_pc_local_search(X)
    #   }else{
    #     B = find_PB_rnd_local_search(stats::cov(log(X)), rep=rep)
    #   }
    # }
  }else if(method == 'cluster'){
    M = 'CL'
    # Passing arguments to hclust function
    hh = stats::hclust(stats::as.dist(variation_array(X)), method=cluster.method, ...)
    B = matrix(0, ncol = nrow(hh$merge), nrow = ncol(X))
    for(i in 1:nrow(hh$merge)){
      if(hh$merge[i,1] < 0 & hh$merge[i,2] < 0){
        B[-hh$merge[i,],i] = c(-1,+1)
      }else{
        if(hh$merge[i,1] > 0){
          B[B[,hh$merge[i,1]] != 0,i] = -1
        }else{
          B[-hh$merge[i,1],i] = -1
        }
        if(hh$merge[i,2] > 0){
          B[B[,hh$merge[i,2]] != 0,i] = +1
        }else{
          B[-hh$merge[i,2],i] = +1
        }
      }
    }
    B = sbp_basis(B[,nrow(hh$merge):1, drop = FALSE])
  } else{
    stop(sprintf("Method %s does not exist", method))
  }
  if(ordering){
    B = B[,order(apply(coordinates(X, B, basis_return = FALSE), 2, stats::var), decreasing = TRUE), drop = FALSE]
  }
  parts = colnames(X)
  if(is.null(parts)){
    parts = paste0('c', 1:nrow(B))
  }
  rownames(B) = parts
  colnames(B) = paste0('pb', 1:ncol(B))
  B
}

#' Isometric log-ratio basis based on Balances.
#'
#' The function return default balances used in CoDaPack software.
#'
#' @param dim dimension to build the ILR basis based on balanced balances
#' @return matrix
#' @export
cdp_basis = function(dim){
  check_dim(dim)
  B = cdp_basis_(dim)
  rownames(B) = paste0("c", 1:dim)
  colnames(B) = paste0("ilr", 1:ncol(B))
  B
}

cdp_basis_ = function(dim, wR = 1:ceiling(dim/2), wL = ceiling(dim/2) + 1:floor(dim/2)){
  R = length(wR)
  L = length(wL)
  D = R + L
  v = rep(0, dim)
  v[wR] = +sqrt(L/R/D)
  v[wL] = -sqrt(R/L/D)
  if(R == 1 & L == 1){
    return(v)
  }
  if(R == 1){
    return(cbind(v,
                 Recall(dim, wR = wL[1:ceiling(L/2)], wL = wL[ceiling(L/2) + 1:floor(L/2)])))
  }
  if(L == 1){
    return(cbind(v,
                 Recall(dim, wR = wR[1:ceiling(R/2)], wL = wR[ceiling(R/2) + 1:floor(R/2)])))
  }
  cbind(v,
        Recall(dim, wR = wR[1:ceiling(R/2)], wL = wR[ceiling(R/2) + 1:floor(R/2)]),
        Recall(dim, wR = wL[1:ceiling(L/2)], wL = wL[ceiling(L/2) + 1:floor(L/2)]))
}

#' Pairwise log-ratio generator system
#'
#' The function returns all combinations of pairs of log-ratios.
#'
#' @param dim dimension to build the pairwise log-ratio generator system
#' @return matrix
#' @export
pairwise_basis = function(dim){
  check_dim(dim)
  I = utils::combn(dim,2)
  B = apply(I, 2, function(i){
    b = rep(0, dim)
    b[i] = c(1,-1)
    b
  })
  colnames(B) = paste0('pw', apply(I, 2, paste, collapse = '_'))
  rownames(B) = paste0("c", 1:dim)
  B
}

Try the coda.base package in your browser

Any scripts or data that you put into this service are public.

coda.base documentation built on Nov. 26, 2023, 1:07 a.m.