R/nroPreprocess.R
In Numero: Statistical Framework to Define Subgroups in Complex Datasets

Documented in nroPreprocess

# Standardize the columns of a dataset and record the mapping between
# the original and the preprocessed values.
#
# Arguments:
#   data        Input dataset; converted with nroRcppMatrix().
#   method      One of "", "standard", "uniform", "tapered" or "normal";
#               only the first element is used. An empty string skips
#               the standardization and the clipping.
#   clip        Clipping factor passed to nroPreprocess.clip(); only the
#               first element is used and an empty value disables clipping.
#   resolution  Maximum number of rows in the mapping model, at least 20;
#               only the first element is used.
#   trim        Passed to nroRcppMatrix(); only the first element is used.
#
# Returns the preprocessed matrix with the attribute "mapping" that holds
# the output from nroPreprocess.down(). If the method is "", the attribute
# "binary" is set to the binary columns that are present in the output.
# Returns NULL with a warning if the converted data have no rows.
nroPreprocess <- function(
    data,
    method="standard",
    clip=5.0,
    resolution=100,
    trim=FALSE) {

    # Convert input to numeric matrix.
    data <- nroRcppMatrix(data, trim=trim[[1]])
    binary <- attr(data, "binary")

    # Check if any rows or columns were excluded.
    if(length(attr(data, "excl.rows")) > 0)
        warning("Unusable rows excluded.")
    if(length(attr(data, "excl.columns")) > 0)
        warning("Unusable columns excluded.")

    # Check input size.
    if(nrow(data) < 1) {
        warning("No usable data.")
        return(NULL)
    }

    # Check method.
    method <- as.character(method[[1]])
    method <- intersect(method,
        c("","standard","uniform","tapered","normal"))
    if(length(method) < 1) stop("Unknown method.")

    # Check resolution. A value that cannot be converted becomes NA and
    # must be rejected explicitly, otherwise the comparison itself fails.
    resolution <- as.integer(resolution[[1]])
    if(is.na(resolution) || (resolution < 20)) # see downsampling
        stop("Unusable resolution.")

    # Only the first clipping factor is used, in line with the other
    # arguments; the clipping helper tests it in a scalar condition.
    if(length(clip) > 1) clip <- clip[[1]]

    # Standardize location and scale.
    ds.in <- data
    ds.out <- NA*ds.in
    for(vn in colnames(ds.out))
        ds.out[,vn] <- nroPreprocess.std(ds.in[,vn], method)

    # Downsample data model. This is done before clipping, hence the
    # model is based on the untruncated output values.
    model <- nroPreprocess.down(ds.in, ds.out, resolution, method)

    # Truncate extreme values.
    if((length(clip) > 0) && (method != "")) {

        # Fixed center and scale for methods that aim at zero mean and
        # unit variance; otherwise estimated from each column.
        mu <- NULL; sigma <- NULL
        if((method == "standard") || (method == "normal")) {
            mu <- 0; sigma <- 1
        }
        for(vn in colnames(ds.out)) {
            ds.out[,vn] <- nroPreprocess.clip(
                ds.out[,vn], clip, mu=mu, sigma=sigma)
        }
    }

    # If no preprocessing, binary variables remain binary.
    if(method == "") {
        binary <- intersect(binary, colnames(ds.out))
        attr(ds.out, "binary") <- binary
    }

    # Return results.
    attr(ds.out, "mapping") <- model
    return(ds.out)
}

#---------------------------------------------------------------------------

# Standardize a single numeric vector.
#
# Arguments:
#   x       Numeric vector; missing values are allowed.
#   method  Empty or "" for no standardization, "uniform" or "tapered"
#           for rank-based scaling between -1 and 1, "normal" for
#           rank-based normal scores, or "standard" for centering and
#           scaling with an automatic logarithm if it improves normality.
#
# Returns a vector of the same length as x. If the standard deviation
# of x is not finite or is practically zero, 0*x is returned for any
# method. Stops with an error if the method is not recognized.
nroPreprocess.std <- function(x, method) {

    # Check variance.
    sigma <- stats::sd(x, na.rm=TRUE)
    if(!is.finite(sigma)) return(0*x)
    if(sigma <= .Machine$double.eps) return(0*x)

    # No standardization.
    if(length(method) < 1) return(x)
    if(nchar(method) < 1) return(x)

    # Rank-based standardization.
    if((method == "uniform") || (method == "tapered")) {
        z <- rank(x, na.last="keep")
        z <- (z - min(z, na.rm=TRUE))
        z <- (2*z/max(z, na.rm=TRUE) - 1)

        # Cubic blend that pulls mid-range values towards zero while
        # the end points stay at -1 and 1.
        if(method == "tapered") z <- (z + 2*(z^3))/3
        return(z)
    }
    if(method == "normal") {
        nvals <- sum(is.finite(x), na.rm=TRUE)
        z <- rank(x, na.last="keep")

        # Dividing by (nvals + 1) keeps the probabilities strictly
        # between zero and one.
        z <- stats::qnorm(z/(nvals + 1))
        return(z)
    }

    # Default method left.
    if(method != "standard") stop("Unknown method.")

    # Protect against extreme outliers. Statistics are estimated from
    # a copy that is clipped around the median; the scale is the distance
    # between the quantiles at normal probabilities for -1 and +1.
    tx <- stats::na.omit(x)
    nuniq <- length(unique(tx))
    if(nuniq >= 10) {
        q <- stats::pnorm(c(-1,0,1))
        q <- stats::quantile(tx, q)
        tx <- nroPreprocess.clip(tx, clip=8.0,
            mu=q[2], sigma=as.double(q[3] - q[1] + 1e-9))
    }

    # Check if logarithm is useful. Non-negativity is tested on the
    # original values since the clipping above can lift a negative
    # outlier above zero in the copy while it still remains in x, and
    # the logarithm of x would then produce NaN.
    xmin <- min(x, na.rm=TRUE)
    if((xmin >= 0) && (sum(is.finite(tx)) >= 10)) {
        t.log <- log(tx + 1e-20)

        # Downsample for Shapiro test.
        mask <- which(0*t.log == 0)
        if(length(mask) > 5000)
            mask <- sample(mask, size=5000)

        # Test for normality. The logarithm is applied only if the
        # original values deviate from normality and the transformed
        # values produce a higher test statistic.
        suppressWarnings(w <- stats::shapiro.test(tx[mask]))
        suppressWarnings(w.log <- stats::shapiro.test(t.log[mask]))
        if((w$p.value < 0.05) && (w$statistic < w.log$statistic)) {
            x <- log(x + 1e-20)
            tx <- t.log
        }
    }

    # Basic statistics.
    mu <- mean(tx, na.rm=TRUE)
    sigma <- stats::sd(tx, na.rm=TRUE)

    # Standardize scale and location.
    z <- (x - mu)/max(sigma, 1e-20)
    return(z)
}

#---------------------------------------------------------------------------

# Truncate values that lie too far from a center point.
#
# Arguments:
#   x      Numeric vector.
#   clip   Half-width of the allowed range in units of sigma; if empty
#          or not finite, x is returned as it is.
#   mu     Center of the allowed range; if empty, the mean of x.
#   sigma  Scale of the allowed range; if empty, the standard
#          deviation of x.
#
# Returns x where values below (mu - clip*sigma) or above
# (mu + clip*sigma) are replaced by the respective limit. Missing
# values are not touched.
nroPreprocess.clip <- function(x, clip, mu, sigma) {

    # Without a finite clipping factor there is nothing to do.
    if(length(clip) < 1) return(x)
    if(!is.finite(clip)) return(x)

    # Fall back to sample statistics if center or scale is not given.
    center <- if(length(mu) < 1) mean(x, na.rm=TRUE) else mu
    spread <- if(length(sigma) < 1) stats::sd(x, na.rm=TRUE) else sigma

    # Limits of the allowed range.
    lower <- (center - clip*spread)
    upper <- (center + clip*spread)

    # Replace values outside the range by the nearest limit.
    too.low <- which(x < lower)
    x[too.low] <- lower
    too.high <- which(x > upper)
    x[too.high] <- upper
    return(x)
}
    
#---------------------------------------------------------------------------

# Build a reduced lookup table between original and preprocessed values.
#
# Arguments:
#   x       Matrix of original values with named columns.
#   y       Matrix of preprocessed values; accessed with the column
#           names of x and assumed to have matching rows.
#   resol   Number of rows in the downsampled tables.
#   method  Preprocessing method; "" means that no model is created.
#
# Returns NULL if the method is "", otherwise a list with the matrices
# 'input' and 'output'. If x has at most resol rows, x and y are returned
# as they are. Otherwise both matrices have resol rows, and a column with
# less than two distinct usable values contains only missing values.
nroPreprocess.down <- function(x, y, resol, method) {
    if(method == "") return(NULL)

    # Nothing to do.
    results <- list()
    results$input <- x
    results$output <- y
    if(nrow(x) <= resol) return(results)

    # Prepare result matrices.
    results$input <- matrix(NA, nrow=resol, ncol=ncol(x))
    results$output <- matrix(NA, nrow=resol, ncol=ncol(x))
    colnames(results$input) <- colnames(x)
    colnames(results$output) <- colnames(x)

    # Reduce resolution.
    ranked <- (method == "uniform") || (method == "tapered")
    for(vn in colnames(x)) {

        # Use only rows where both values are finite.
        rows <- is.finite(x[,vn]*y[,vn])
        u <- x[rows,vn]
        v <- y[rows,vn]

        # Remove duplicates.
        mask <- which(!duplicated(u))
        u <- u[mask]
        v <- v[mask]
        n <- length(u)
        if(n < 2) next

        # Sort by input value.
        sorted <- order(u)
        u <- u[sorted]
        v <- v[sorted]

        # Set sentinel points. One extra point is added to each end,
        # three times the distance between the 1% and 10% quantiles
        # (90% and 99% quantiles) beyond the smallest (largest) value.
        # Rank-based methods have bounded output and get no sentinels.
        if(!ranked) {
            probs <- c(0.01, 0.1, 0.9, 0.99)
            sigma.u <- stats::quantile(u, probs, na.rm=TRUE)
            sigma.v <- stats::quantile(v, probs, na.rm=TRUE)
            delta.u <- diff(sigma.u)
            delta.v <- diff(sigma.v)
            u <- c((u[1] - 3*delta.u[1]), u, (u[n] + 3*delta.u[3]))
            v <- c((v[1] - 3*delta.v[1]), v, (v[n] + 3*delta.v[3]))
        }

        # Select sampling points. The end points are always included
        # and the rest are spread evenly over the sorted positions.
        n <- length(u)
        pivots <- seq(from=2, to=(n-1), length.out=(resol-2))
        pivots <- c(1, pivots, n)
        u.pivots <- stats::approx(x=seq_len(n), y=u, xout=pivots)$y

        # Interpolate output values.
        results$input[,vn] <- u.pivots
        results$output[,vn] <- stats::approx(x=u, y=v, xout=u.pivots)$y
    }
    return(results)
}

Any scripts or data that you put into this service are public.

Numero documentation built on Sept. 13, 2025, 1:09 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Numero
Statistical Framework to Define Subgroups in Complex Datasets

R/nroPreprocess.R
In Numero: Statistical Framework to Define Subgroups in Complex Datasets

Defines functions nroPreprocess.down nroPreprocess.clip nroPreprocess.std nroPreprocess

Documented in nroPreprocess

Try the Numero package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Numero Statistical Framework to Define Subgroups in Complex Datasets

R/nroPreprocess.R In Numero: Statistical Framework to Define Subgroups in Complex Datasets

Defines functions nroPreprocess.down nroPreprocess.clip nroPreprocess.std nroPreprocess

Documented in nroPreprocess

Try the Numero package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Numero
Statistical Framework to Define Subgroups in Complex Datasets

R/nroPreprocess.R
In Numero: Statistical Framework to Define Subgroups in Complex Datasets