R/data_preprocessing.R
In SOMbrero: SOM Bound to Realize Euclidean and Relational Outputs

# Various preprocessings for data #############################################

## Data preprocessing ####

preprocessData <- function(x.data, scaling, tolerance = 10^(-10)) {
  ## note: NA in 'x.data' seems to be properly handled for 'unitvar' and 'center'
  switch(scaling,
         "unitvar" = unitvarPreprocess(x.data),
         "center"  = centerPreprocessing(x.data),
         "none"    = as.matrix(x.data),
         "chi2"    = korrespPreprocess(x.data),
         "cosine"  = cosinePreprocess(x.data, tolerance = tolerance))
}


### 'unitvar' preprocessing

unitvarPreprocess <- function(x.data) {
  all.variances <- apply(x.data, 2, sd, na.rm = TRUE)
  all.variances[is.na(all.variances)] <- 0
  if (any(all.variances == 0)) {
    ### remove data with null variance
    to.remove <- which(all.variances == 0)
    name.remove <- paste0(colnames(x.data)[to.remove], collapse = ", ")
    out.message <- "Removing constant variable(s) (required by 'unit variance' scaling): columns"
    out.message <- paste(out.message, name.remove)
    warning(out.message, call. = TRUE, immediate. = TRUE)
    x.data <- x.data[, -to.remove]
  }

  return(scale(x.data, center = TRUE, scale = TRUE))
}


### 'center' preprocessing

centerPreprocessing <- function(x.data) {
  return(scale(x.data, center = TRUE, scale = FALSE))
}


### 'cosine' preprocessing (for "relational" case)

### TODO: code cleaning is needed
cosinePreprocess <- function(diss.matrix, x.new = NULL, tolerance = 10^(-10)) {
  ## TODO: remove x.new that is not used anymore
  sum.posdata <- rowSums(abs(diss.matrix))
  if (any(sum.posdata == 0)) {
    stop("One of the rows is 0. Cosine preprocessing can not be performed.",
         call. = TRUE, immediate = TRUE)
  }
  
  # tolerance is used to solve numeric instabilities
  # similarity matrix by double centering
  sim.matrix <- - 0.5* (diag(1, nrow(diss.matrix)) - 1/nrow(diss.matrix)) %*%
    diss.matrix %*% (diag(1, nrow(diss.matrix)) - 1/nrow(diss.matrix))
  sim.matrix <- round(sim.matrix, - log10(tolerance))
  
  if (is.null(x.new)) {
    # normalize the original dissimilarity matrix
    scaled.ker <- sweep(sweep(sim.matrix, 2, sqrt(diag(sim.matrix)), "/"),
                        1, sqrt(diag(sim.matrix)), "/")
    # came back to dissimilarity
    # if cosine preprocess is useless it is the original dissimilarity
    scaled.diss <- 2 - 2 * scaled.ker
    rownames(scaled.diss) <- rownames(diss.matrix)
    colnames(scaled.diss) <- colnames(diss.matrix)
    scaled.diss <- round(scaled.diss, - log10(tolerance))
  } else {
    # normalize additional dissimilarity lines like the similarity matrix
    sim.x <- t(apply(x.new, 1, function(x) 
      - .5 * (x - mean(x) - colMeans(diss.matrix) + mean(diss.matrix))))
    sim.x <- round(sim.x, -log10(tolerance))
    auto.sim <- round(apply(x.new,1,mean) - mean(diss.matrix) / 2, 
                      -log10(tolerance))
    if (nrow(x.new) > 1) {
      scaled.ker <- sweep(sweep(sim.x, 2, sqrt(diag(sim.matrix)), "/"), 1,
                          sqrt(auto.sim), "/")
    } else {
      scaled.ker <- sweep(sim.x, 2, sqrt(diag(sim.matrix)), "/") / sqrt(auto.sim)
    }
    scaled.diss <- 2 - 2 * scaled.ker
    colnames(scaled.diss) <- colnames(diss.matrix)
    rownames(scaled.diss) <- rownames(x.new)
    scaled.diss <- round(scaled.diss, -log10(tolerance))
  }
  
  return(scaled.diss)
}


### 'korresp' preprocessing (for "relational" case)

### TODO: code cleaning is needed
korrespPreprocess <- function(cont.table) {
  both.profiles <- matrix(0, nrow = nrow(cont.table) + ncol(cont.table),
                          ncol = ncol(cont.table) + nrow(cont.table))
  # row profiles
  both.profiles[1:nrow(cont.table), 1:ncol(cont.table)] <-
    cont.table / 
    outer(sqrt(rowSums(cont.table)), sqrt(colSums(cont.table) / sum(cont.table)))  
  # column profiles
  both.profiles[(nrow(cont.table)+1):(nrow(cont.table) + ncol(cont.table)),
                (ncol(cont.table)+1):(ncol(cont.table) + nrow(cont.table))] <- 
    t(cont.table) / outer(sqrt(colSums(cont.table)), 
                          sqrt(rowSums(cont.table)/sum(cont.table)))
  # best column to complete row profiles
  best.col <- apply(both.profiles[1:nrow(cont.table), 1:ncol(cont.table)],
                    1, which.max)
  both.profiles[1:nrow(cont.table), (ncol(cont.table) + 1):ncol(both.profiles)] <- 
    both.profiles[best.col + nrow(cont.table),
                  (ncol(cont.table) + 1):ncol(both.profiles)]
  # best row to complete col profiles
  best.row <- apply(both.profiles[(nrow(cont.table)+1):
                                    (nrow(cont.table) + ncol(cont.table)),
                                  (ncol(cont.table)+1):
                                    (ncol(cont.table) + nrow(cont.table))],
                    1, which.max)
  both.profiles[(nrow(cont.table)+1):(nrow(cont.table) + ncol(cont.table)),
                1:ncol(cont.table)] <-
    both.profiles[best.row, 1:ncol(cont.table)]
  # names
  rownames(both.profiles) <- c(rownames(cont.table),colnames(cont.table))
  colnames(both.profiles) <- c(colnames(cont.table),rownames(cont.table))
  return(both.profiles)
}

## Prototype preprocessing ####

preprocessProto <- function(prototypes, scaling, x.data) {
  if (scaling %in% c("unitvar", "center")) 
    x.mean <- colMeans(x.data, na.rm = TRUE)
  
  if (scaling == "unitvar")
    x.var <- apply(x.data, 2, sd, na.rm = TRUE)
  
  switch(scaling,
         "unitvar" = scale(prototypes, center = x.mean, scale = x.var),
         "center"  = scale(prototypes, center = x.mean, scale = FALSE),
         "none"    = prototypes,
         "chi2"    = prototypes,
         "cosine"  = prototypes)
}

dePreprocessProto <- function(prototypes, scaling, x.data) {
  if (scaling %in% c("unitvar", "center")) 
    x.mean <- - colMeans(x.data, na.rm = TRUE)
  
  if (scaling == "unitvar") {
    x.var <- 1 / apply(x.data, 2, sd, na.rm = TRUE)
    x.mean <- x.mean * x.var
  }
  
  switch(scaling,
         "unitvar" = scale(prototypes, center = x.mean, scale = x.var),
         "center"  = scale(prototypes, center = x.mean, scale = FALSE),
         "none"    = prototypes,
         "chi2"    = prototypes,
         "cosine"  = prototypes)
}

Any scripts or data that you put into this service are public.

SOMbrero documentation built on Nov. 5, 2025, 6:36 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

SOMbrero
SOM Bound to Realize Euclidean and Relational Outputs

R/data_preprocessing.R
In SOMbrero: SOM Bound to Realize Euclidean and Relational Outputs

Defines functions dePreprocessProto preprocessProto korrespPreprocess cosinePreprocess centerPreprocessing unitvarPreprocess preprocessData

Try the SOMbrero package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SOMbrero SOM Bound to Realize Euclidean and Relational Outputs

R/data_preprocessing.R In SOMbrero: SOM Bound to Realize Euclidean and Relational Outputs

Defines functions dePreprocessProto preprocessProto korrespPreprocess cosinePreprocess centerPreprocessing unitvarPreprocess preprocessData

Try the SOMbrero package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SOMbrero
SOM Bound to Realize Euclidean and Relational Outputs

R/data_preprocessing.R
In SOMbrero: SOM Bound to Realize Euclidean and Relational Outputs