R/linalg.R

Defines functions treat_outliers vect.treat_outliers magnitude geomean mat.profile vect.dist mat.extend vect.extend spherical.dist column.delta.down vect.normalise mat.normalize vect.normalize vect.standardize vect.centralize mat.standardize column.average.up column.delta.up column.regslope.down regslope column.cumulative.backward column.cumulative.forward vect.cumulative.backward vect.cumulative.forward column.feed.backward vect.feed.backward column.feed.forward vect.feed.forward column.average.down vect.shift.up vect.shift.down column.shift.down column.shift.up

Documented in column.feed.backward column.feed.forward vect.extend vect.feed.backward vect.feed.forward vect.normalize

# Header
# Filename:      linalg.R
# Version History:

# Version   Date                 Action
# ----------------------------------
# 1.0.0     11 September 2013    Initial Issue.
# 1.1.0     24 May 2016          Some function names changed.
# 1.2.0     28 October 2016      Function vect.shift.up() added
# 1.2.1     16 March 2017        Function mat.profile() added
# 1.2.3     05 July 2018         Function column.shift.up() column.shift.down() modified
# 1.3.0     14 May 2020          Functions column.feed.forward(), column.feed.backward(), vect.feed.forward(), vect.feed.backward() added
# 1.3.1     15 May 2020          Functions column.cumulative.forward(), column.cumulative.backward(), vect.cumulative.forward(), vect.cumulative.backward() added


#' @export
column.shift.up <- function(A, col, k=1, keep.rows=FALSE){
  N = nrow(A)
  if(N == 0){return(A)}
  if(N > k){
    A[1:(N-k), col] = A[(k + 1):N, col]
  }
  if (keep.rows){
    A[(N-k+1):N,col] = NA
    return(A)
  } else {
    return(A[(-N+k-1):(-N),])
  }
}

#' @export
column.shift.down <- function(A, col, k=1, keep.rows=FALSE){
  N = nrow(A)
  if(N == 0){return(A)}
  if(N > 1){
    A[(k + 1):N, col] = A[1:(N-k), col]
  }
  if (keep.rows){
    A[1:k, col] = NA
    return(A)
  } else {
    return(A[(-1):(-k),])
  }
}

#' @export
vect.shift.down <- function(v, k=1, keep.rows=FALSE){
  n = length(v)
  v[(k + 1):n] = v[1:(n-k)]
  if (keep.rows){
    v[1:k] = NA
    return(v)
  } else {
    return(v[(-1):(-k)])
  }
}

#' @export
vect.shift.up <- function(v, k=1, keep.rows=FALSE){
  n = length(v)
  v[1:(n-k)] = v[(k + 1):n]
  if (keep.rows){
    v[(n-k+1):n] = NA
    return(v)
  } else {
    return(v[(-n+k-1):(-n)])
  }
}


#' @export
column.average.down <- function(A, col=1:dim(A)[2], k=1, keep.rows=FALSE){
  # For column(s) given as "col", each row is replaced by the moving average among
  # that row and "K" rows before that.
  d=dim(A)
  n = d[1]
  m = d[2]
  V = A
  for (j in col){
    for (i in (k+1):n){
      V[i, j] = mean(A[(i-k):i, j])
    }
  }
  
  if (keep.rows){
    V[1:k, col] = NA
    return(V)
  } else {
    return(V[(-1):(-k),])
  }
}


#' Feeds each missing value with the value of its previous element. Order input vector appropriately brfore calling this function.
#' @export
vect.feed.forward = function(v, index = sequence(length(v))){
  wna = intersect(index, which(is.na(v))) %-% 1
  for(i in wna){
    v[i] <- v[i - 1]
  }
  v
}

#' Feeds missing values of each row with the values of its previous row. Order columns appropriately brfore calling this function.
#' If id_col is specified, then the table is grouped by that column before 
#' Input tbl is a table: data.frame, tibble or matrix
#' @export
column.feed.forward = function(tbl, col = 1, id_col = NULL){
  if(is.null(id_col)){dup = tbl %>% nrow %>% sequence} else {dup = tbl %>% pull(id_col) %>% duplicated %>% which}
  for(j in col){
    tbl[, j] <- tbl %>% pull(j) %>% vect.feed.forward(index = dup)
  }
  tbl
}  

#' Feeds each missing value with the value of its next element. Order input vector appropriately brfore calling this function.
#' @export
vect.feed.backward = function(v, index = sequence(length(v)) %>% rev){
  wna = intersect(index, which(is.na(v))) %-% length(v)
  for(i in wna){
    v[i] <- v[i + 1]
  }
  v
}

#' Feeds missing values of each row with the values of its next row. Order columns appropriately brfore calling this function.
#' If id_col is specified, then the table is grouped by that column before 
#' Input tbl is a table: data.frame, tibble or matrix
#' @export
column.feed.backward = function(tbl, col = 1, id_col = NULL){
  if(is.null(id_col)){dup = tbl %>% nrow %>% sequence %>% rev} else {dup = tbl %>% pull(id_col) %>% rev %>% duplicated %>% which; dup = nrow(tbl) - dup + 1}
  for(j in col){
    tbl[, j] <- tbl %>% pull(j) %>% vect.feed.backward(index = dup)
  }
  tbl
}  

#' @export
vect.cumulative.forward = function(v, index = sequence(length(v))){
  for(i in (index %-% 1)){
    v[i] <- v[i] + v[i - 1]
  }
  v
}

#' @export
vect.cumulative.backward = function(v, index = sequence(length(v)) %>% rev){
  for(i in (index %-% length(v))){
    v[i] <- v[i] + v[i + 1]
  }
  v
}


#' @export
column.cumulative.forward = function(tbl, col = 1, id_col = NULL, aggregator = cumsum){
  # if(is.null(id_col)){dup = tbl %>% nrow %>% sequence %>% setdiff(1)} else {dup = tbl %>% pull(id_col) %>% duplicated %>% which}
  for(j in col){
    # tbl[, j] <- tbl %>% pull(j) %>% vect.cumulative.forward(index = dup)
    tbl[, j] <- tbl %>% pull(j) %>% ave(id = tbl %>% pull(id_col), FUN = aggregator)
  }
  tbl
}  

#' @export
column.cumulative.backward = function(tbl, col = 1, id_col = NULL, aggregator = cumsum){
  # if(is.null(id_col)){dup = tbl %>% nrow %>% sequence %>% rev} else {dup = tbl %>% pull(id_col) %>% rev %>% duplicated %>% which; dup = nrow(tbl) - dup + 1}
  for(j in col){
    # tbl[, j] <- tbl %>% pull(j) %>% vect.cumulatiive.backward(index = dup)
    tbl[, j] <- tbl %>% pull(j) %>% rev %>% ave(id = tbl %>% pull(id_col) %>% rev, FUN = aggregator) %>% rev
    
  }
  tbl
}  

#' @export
regslope <- function(y){
  # Fits a line over the vector data and returns the slope of the regression line
  N      = length(y)
  x      = sequence(N)
  x.bar  = (N+1)/2
  x2.bar = x.bar*(2*N+1)/3
  y.bar  = mean(y)
  xy.bar = mean(x*y)
  return ((xy.bar - x.bar*y.bar)/(x2.bar - x.bar^2))
}

#' @export
column.regslope.down <- function(A, col=1:dim(A)[2], k=1, keep.rows=FALSE){
  # For column(s) given as "col", each row is replaced by the slope of the lease squares (regression) line among
  # that row and "K" rows before that.
  d=dim(A)
  n = d[1]
  m = d[2]
  V = A
  for (j in col){
    for (i in (k+1):n){
      V[i, j] = regslope(A[(i-k):i, j])
    }
  }
  
  if (keep.rows){
    V[1:k, col] = NA
    return(V)
  } else {
    return(V[(-1):(-k),])
  }
}

#' @export
column.delta.up <- function(A, col=1:dim(A)[2], k=1, keep.rows=FALSE){
  d=dim(A)
  n = d[1]
  m = d[2]
  
  V=A[,col]
  if (length(col)==1){V[1:(n-k)] = A[(k + 1):n, col]}
  else{V[1:(n-k),] = A[(k + 1):n, col]}
  A[,col] = A[,col] - V
  if (keep.rows){
    A[(n-k+1):n,col]=NA
    return(A)
  } else {
    return(A[(-n+k-1):(-n),])
  }
}

#' @export
column.average.up <- function(A, col=1:dim(A)[2], k=1, keep.rows=FALSE){
  # For column(s) given as "col", each row is replaced by the moving average among
  # that row and "K" rows after that.
  
  d=dim(A)
  n = d[1]
  m = d[2]
  
  V <- A
  for (j in col){
    for (i in 1:(n-k)){
      V[i,j] = mean(A[i:(k + i), j])
    }
  }
  if (keep.rows){
    V[(n-k+1):n,col]=NA
    return(V)
  } else {
    return(V[(-n+k-1):(-n),])
  }
}

#' @export
mat.standardize <- function(M, byrow = FALSE){
  # Standardizes each column and returns a matrix of the same dimention as the given matrix where
  # all the columns are standardized (centralized and divided by standard deviation)
  # if byrow is TRUE, then each row is considered as a vector and will be standardized
  if (byrow){return(t(scale(t(M))))} else {return(scale(M))}
}

#' @export
vect.centralize <- function(v){
  # This function centralizes the given vector
  # Each element is subtracted by the mean of vector
  # x_i <- x_i - mean(x)
  v_bar=mean(v)
  return(v - v_bar)
}

#' @export
vect.standardize <- function(v, cent = TRUE){
  # This function returns the z factors of given vector v
  return(v %>% scale(center = cent) %>% as.numeric)
}

#' Normalizes the given vector by dividing each element by the vector magnitude
#'
#' @param v a vector of numerics to be normalized
#' @param cent A logical. Should the vector be also centralized?
#' @return A numeric vector containing the normalized values. Sum of squares of elements of the output vector will be 1.0.
#' @examples
#' vect.normalize(c(3, 4))
#' [1] 0.6 0.8
#'
#' @export
vect.normalize <- function(v, cent = FALSE){
  # This function normalizes the given vector
  # and returns a unit vector parallel to the given vector
  if (cent){v=vect.centralize(v)}
  l = sqrt(sum(v*v))
  if (l==0.0){return(v)}
  else {return(v/l)}
}

#' @export
mat.normalize <- function(M, dimension = 2){
  assert(dimension %in% c(1,2), 'Error: dimension must be 1 or 2')
  S = apply(M, dimension, vect.normalize)
  if (dimension == 1){S = t(S)}
  return (S)
  
}

#' @export
vect.normalise <- function(v){
  # This function normalises the given vector by dividing all its elements by sum of elements
  # and returns a vector parallel to the given vector so that sum of all its elements is one.
  l = sum(v)
  if (l==0.0){
    print("vect.normalise Error: Sum of elements of the given matrix is zero. Given argument returned")
    return(v)}
  else {return(v/l)}
}

#' @export
column.delta.down <- function(A, col=1:dim(A)[2], k=1, keep.rows=FALSE){
  # This function gets matrix "A" as input and finds the difference between its column(s) given
  # by argument "col" and its shifted down column(s)
  
  d=dim(A)
  n = d[1]
  m = d[2]
  
  V=A[,col]
  if (length(col)==1){V[(k + 1):n] = A[1:(n-k), col]}
  else{V[(k + 1):n, ] = A[1:(n-k), col]}
  
  A[,col] = A[,col] - V
  if (keep.rows){
    A[1:k, col] = NA
    return(A)
  } else {
    return(A[(-1):(-k),])
  }
}

# spherical.dist = cosine.dissimilarity (Reflects the angle between the vectors)
#' @export
spherical.dist <- function(u,v){
  # Finds the cosine of the angle between two vectors and Returns the dissimilarity
  # of two vectors as 1-cos(u,v). The output is between 0 and 2
  u.mod   = sqrt(sum(u^2))
  v.mod   = sqrt(sum(v^2))
  uv.prod = sum(u*v)
  
  cos.theta = uv.prod/(u.mod*v.mod)
  return(1 - cos.theta)
}

#' @export
difference <- function (v1, v2) {
  # Returns the euclidean distance between two given vectors v1 & v2
  d = v1 - v2
  return (sqrt(sum(d^2)))
}

#' @export
vect.extend <- function(v, n){
  m = length(v)
  if (m >= n){
    return(v[sequence(n)])
  }
  ve = v
  while (n > m + length(ve)){ve = c(ve, v)}
  return (c(ve, v[sequence(n - length(ve))]))
}

#' @export
mat.extend <- function(M, n, on_rows = T){
  if (on_rows){m = nrow(M)}else{m = ncol(M)}
  if (m >= n){
    if (on_rows){return(M[sequence(n),, drop = F])}else{return(M[,sequence(n), drop = F])}
  }
  Me = M
  if (on_rows){
    while (n > m + nrow(Me)){Me = rbind(Me, M)}
    return (rbind(Me, M[sequence(n - nrow(Me)),, drop = F]))
  } else{
    while (n > m + ncol(Me)){Me = cbind(Me, M)}
    return (cbind(Me, M[,sequence(n - ncol(Me)), drop = F]))
  }
}

#' @export
vect.dist <- function(v1, v2, metric = 'euclidean'){
  n1 = length(v1)
  n2 = length(v2)
  assert(n1 == n2, 'Error: Given vectors must have the same length')
  if      (metric == 'euclidean'){return(difference(v1, v2))}
  else if (metric == 'spherical'){return(spherical.dist(v1, v2))}
  else if (metric == 'binary'){return(mean(xor(v1, v2)))}
  else if (metric == 'manhattan'){return(sum(abs(v1 - v2)))}
  else if (metric == 'canberra'){return(sum(abs(v1 - v2)/abs(v1 + v2), na.rm = TRUE))}
  else if (metric == 'maximum'){return(max(abs(v1 - v2)))}
  else {print('Error: Given metric is not recognized!')}
}

# Test this function with a list of square matrices
#' @export
mat.profile = function(M, cn = NULL, func = sum, on_rows = T, remove_na = T){
  if (!inherits(M, 'list')){M = list(M = M)}
  if (!inherits(func, 'list')){func = list(func)}
  
  dfcn = chooseif(inherits(cn, 'character'), chooseif(on_rows, colnames(M[[1]]), rownames(M[[1]])), chooseif(on_rows, sequence(ncol(M[[1]])),sequence(nrow(M[[1]]))))
  cn %<>% verify(c('character', 'integer', 'factor'), domain = dfcn, default = dfcn, varname = 'cn')
  
  if ((length(func) == 1) & (length(M) > 1)) (func = rep(func, length(M)))
  
  if (is.empty(M)) {return(matrix())}
  res = NULL
  for (i in sequence(length(M))){
    verify(func[[i]], 'function', varname = 'func')
    if(inherits(M[[i]], 'data.frame')){M[[i]] = as.matrix(M[[i]])}
    verify(M[[i]], 'matrix', dims = dim(M[[1]]), varname = 'M[[' %++% i  %++% ']]')
    
    column = chooseif(on_rows,M[[i]][, cn, drop = F], M[[i]][cn, , drop = F]) %>% apply(2 - on_rows, func[[i]])
    
    res %<>% cbind(column)
  }
  
  if (is.null(names(M))){colnames(res) <- paste('M', sequence(ncol(res)), sep = '.')} else {colnames(res) <- names(M)}
  if (on_rows){rownames(res) <- rownames(M[[1]])} else {rownames(res) <- colnames(M[[1]])}
  if (remove_na)  {res = res[!is.na(rowSums(res)),, drop = F]}
  
  return(res)
}

# Returns the geometric mean of vector v
#' @export
geomean = function(v, na.rm = T){
  v %>% log(base = exp(1)) %>% mean(na.rm = na.rm) %>% exp
}

#' @export
magnitude = function(v, na.rm = T){
  v^2 %>% as.numeric %>% sum(na.rm = na.rm) %>% sqrt
}

# Only works on numeric or integer vectors
# Argument 'quant_cut' must be between 0 and 1,
# Argument 'sd_cut' is usually around 3 or 4
vect.treat_outliers = function(v, sd_cut = NULL, quant_cut = NULL, right = T, left = T, action = c('trim', 'remove', 'na', 'adapt'), numerics_only = F){
  action = match.arg(action)
  vclass = chif(numerics_only, 'numeric', c('numeric', 'integer'))
  if(!inherits(v, vclass)) return(v)
  
  if(!is.null(sd_cut)){
    mu = mean(v, na.rm = T)
    sg = sd(v, na.rm = T)
    ul = mu + sd_cut*sg
    ll = mu - sd_cut*sg
  } else if(!is.null(quant_cut)){
    ul = quantile(v, probs = 1.0 - quant_cut)
    ll = quantile(v, probs = quant_cut)
  } else {
    ul = max(v, na.rm = T)
    ll = min(v, na.rm = T)
    cat( '\n', 'No quant_cut or sd_cut specified, no outliers detected!', '\n')
  }
  
  too_high = which(v > ul)
  too_low  = which(v < ll)
  
  if(right & (length(too_high) > 0)){
    switch(action,
           'trim' = {v[too_high] <- ul},
           'remove' = {v[too_high] = NA},
           'na'     = {v[too_high] = NA},
           'adapt'  = {v[too_high] <- ul + log(1.0 + v[too_high] - ul)})
  }
  
  if(left & (length(too_low) > 0)){
    switch(action,
           'trim' = {v[too_low] <- ll},
           'remove' = {v[too_low] = NA},
           'na'     = {v[too_low] = NA},
           'adapt'  = {v[too_low] <- ll - log(1.0 + ll - v[too_low])})
  }
  
  if(action == 'remove') v %<>% na.omit
  
  return(v)
}

treat_outliers = function(X, ...){
  X %>% ncol %>% sequence %>% sapply(function(i) {class(X[,i])[1]}) -> clss
  wn = which(clss %in% c('numeric', 'integer'))
  X[, wn] = X[, wn, drop = F] %>% as.matrix %>% apply(2, vect.treat_outliers, ...)
  X
}
genpack/gener documentation built on March 14, 2023, 9:52 a.m.