R/CCDF.R

Defines functions CCDF

Documented in CCDF

#' Function to compute (un)conditional cumulative distribution function (CDF), used by plot_CCDF function.
#'
#'@param Y a numeric vector of size \code{n} containing the
#'preprocessed expressions from \code{n} samples (or cells).
#'
#'@param X a data frame containing numeric or factor vector(s) of size \code{n}
#'containing the variable(s) to be tested (the condition(s) to be tested). 
#' 
#'@param Z a data frame containing numeric or factor vector(s) of size \code{n}
#'containing the covariate(s).
#'
#'@param method a character string indicating which method to use to
#'compute the CCDF, either \code{'linear regression'}, \code{'logistic regression'}
#' and  \code{'permutations'} or \code{'RF'} for Random Forests.
#'Default is \code{'linear regression'} since it is the method used in the test.
#'
#'@param fast a logical flag indicating whether the fast implementation of
#'logistic regression should be used. Only if \code{'dist_permutations'} is specified.
#'Default is \code{TRUE}.
#'
#'@param space_y a logical flag indicating whether the y thresholds are spaced. 
#'When \code{space_y} is \code{TRUE}, a regular sequence between the minimum and 
#'the maximum of the observations is used. Default is \code{FALSE}.
#'
#'@param number_y an integer value indicating the number of y thresholds (and therefore
#'the number of regressions) to perform the test. Default is \code{length(Y)}.
#'
#'@importFrom stats model.matrix
#' 
#'@export
#' 

#'@return A list with the following elements:\itemize{
#'   \item \code{cdf}: a vector of the cumulative distribution function of a given gene.
#'   \item \code{ccdf}: a vector of the conditional cumulative distribution function of a given gene, computed
#'   given \code{X}. Only if \code{Z} is \code{NULL}.
#'   \item \code{ccdf_nox}: a vector of the conditional cumulative distribution function of a given gene, computed
#'   given \code{Z} only (i.e. \code{X} is ignored.). Only if \code{Z} is not \code{NULL}.
#'   \item \code{ccdf_x}: a vector of the conditional cumulative distribution function of a given gene, computed
#'   given \code{X} and \code{Z}. Only if \code{Z} is not \code{NULL}.
#'   \item \code{y_sort}: a vector of the sorted expression points at which the CDF and the CCDFs are calculated.
#'   \item \code{x_sort}: a vector of the variables associated with \code{y_sort}.
#'   \item \code{z_sort}: a vector of the covariates associated with \code{y_sort}. Only if \code{Z} is not \code{NULL}.
#' }
#' 
#'@examples
#' 
#'X <- as.factor(rbinom(n=100, size = 1, prob = 0.5))
#'Y <- ((X==1)*rnorm(n = 50,0,1)) + ((X==0)*rnorm(n = 50,0.5,1))
#'res <- CCDF(Y,data.frame(X=X),method="linear regression")

CCDF <- function(Y,X,Z=NULL,method=c("linear regression","logistic regression","RF"), fast=TRUE, space_y=FALSE, number_y=length(Y)){
  
  if (space_y){
   y <- seq(min(Y[-which(Y==min(Y))]),max(Y),length.out=number_y)
  }
  else{
    y <- sort(unique(Y))
  }
  
  if (class(Z)=="NULL"){
    n_Y <- length(Y)
    # temp_order <- sort(Y,index.return=TRUE)$ix
    # y <- sort(unique(Y))
    # y_sort <- sort(Y)
    # x_sort <- X[temp_order]
    #modelmat <- model.matrix(Y~X)
    

    colnames(X) <- sapply(1:ncol(X), function(i){paste0('X',i)})
    modelmat <- model.matrix(~.,data=X)
  
    
    ind_X <- which(substring(colnames(modelmat),1,1)=="X")
    
    cdf <- list()
    ccdf <- list()
    x_sort <- NULL
    y_sort <- NULL
    
    for (i in 1:(length(y)-1)){
      #w <- which(Y==y[i])
      if (i==1){
        w <- which(Y<=y[i])
      }
      else{
        w <- which((Y<=y[i])&(Y>y[i-1]))
      }
      x_sort <- c(x_sort,X[w,])
      y_sort <- c(y_sort,Y[w])
      indi_Y <- 1*(Y<=y[i])
      
      # unCDF
      cdf[[i]] <- rep(sum(indi_Y)/n_Y, length(w))
      
      if (length(unique(indi_Y))==1){
        ccdf[[i]] <- rep(1,length(w))
      }
      
      else{
        
        if (method=="RF"){
          # CCDF
          rf <- randomForest(data.frame(X=X, row.names = NULL), indi_Y,ntree=100)
          ccdf[[i]] <- predict(rf)[w]
        }
        
        
        else if (method=="logistic regression"){
          # CDF
          if(fast){
            #fast
            glm_coef <- RcppNumerical::fastLR(x=modelmat, y=indi_Y,
                                              eps_f = 1e-08, eps_g = 1e-08)$coefficients
          }else{
            #safe
            glm_coef <- glm(indi_Y ~ 1 + X, family = binomial(link = "logit"))$coefficients
          }
          if (is.null(dim(modelmat[w,]))){
            exp_predlin <- exp(-sum(t(glm_coef*t(modelmat[w, ]))))
          }
          else{
            exp_predlin <- exp(-apply(t(glm_coef*t(modelmat[w, ])),1,sum))
          }
          ccdf[[i]] <- 1/(1+exp_predlin) # exp_predlin/(1+exp_predlin)
        }
        
        else if (method=="linear regression"){
          reg <- lm(indi_Y ~ 1 + modelmat[,-1])
          ccdf[[i]] <- predict(reg)[w]
        }
      }
    }
    
    if (is.factor(X[,1])){
      x_sort <- as.factor(x_sort)
    }
    ccdf <- unlist(ccdf, use.names = FALSE)
    cdf <- unlist(cdf, use.names = FALSE)
    return(list(cdf=cdf, ccdf=ccdf, y=y_sort, x=x_sort))
  }
  
  else{
    
    n_Y <- length(Y)

    colnames(X) <- sapply(1:ncol(X), function(i){paste0('X',i)})
    colnames(Z) <- sapply(1:ncol(Z), function(i){paste0('Z',i)})
    modelmat <- model.matrix(~.,data=cbind(X,Z))
    
    ind_X <- which(substring(colnames(modelmat),1,1)=="X")
    
    x_sort <- NULL
    y_sort <- NULL
    z_sort <- NULL
    
    ccdf_x <- list()
    ccdf_nox <- list()
    cdf <- list()
    
    for (i in 1:(length(y)-1)){

      #new_data <- data.frame(X[w],Z[w])
      #names(new_data) <- c("X","Z")
      
      #cdf[[i]] <- rep(sum(indi_Y)/n_Y, length(w))
      
      if (i==1){
        w <- which(Y<=y[i])
      }
      else{
        w <- which((Y<=y[i])&(Y>y[i-1]))
      }
      
      x_sort <- c(x_sort,X[w,])
      y_sort <- c(y_sort,Y[w])
      z_sort <- c(z_sort,Z[w,])
      indi_Y <- 1*(Y<=y[i])
      
      # unCDF
      cdf[[i]] <- rep(sum(indi_Y)/n_Y, length(w))
      
      if (length(unique(indi_Y))==1){
        ccdf[[i]] <- rep(1,length(w))
      }
      
      if (method=="RF"){
        # CDF
        rf_x <- randomForest(data.frame(X=X, Z=Z, row.names = NULL), indi_Y,ntree=50)
        ccdf_x[[i]] <- predict(rf_x)[w]
        # CCDF
        rf_nox <- randomForest(data.frame(Z=Z, row.names = NULL), indi_Y,ntree=50)
        ccdf_nox[[i]] <-  predict(rf_nox)[w]
        
      }
      
      
      else if (method=="logistic regression"){
        if (fast){
          glm_coef_x <- RcppNumerical::fastLR(x=modelmat, y=indi_Y)$coefficients
          glm_coef_nox <- RcppNumerical::fastLR(x= modelmat[,-ind_X], y=indi_Y)$coefficients
          
        }
        else{
          glm_coef_x <- glm.fit(x=modelmat,y=indi_Y, family = binomial())$coefficients
          glm_coef_nox <- glm.fit(x=modelmat[,-ind_X],y=indi_Y, family = binomial())$coefficients
        }
        if (is.null(dim(modelmat[w,]))){
          exp_predlin_x <- exp(-sum(t(glm_coef_x*t(modelmat[w, ]))))
          exp_predlin_nox <- exp(-sum(t(glm_coef_nox*t(modelmat[w,-ind_X]))))
        }
        else{
          exp_predlin_x <- exp(-apply(t(glm_coef_x*t(modelmat[w, ])),1,sum))
          exp_predlin_nox <- exp(-apply(t(glm_coef_nox*t(modelmat[w,-ind_X])),1,sum))
        }
        ccdf_x[[i]] <- 1/(1+exp_predlin_x) # exp_predlin_x/(1+exp_predlin_x)
        ccdf_nox[[i]] <- 1/(1+exp_predlin_nox) # exp_predlin_nox/(1+exp_predlin_nox)
        
      }
      
      else if (method=="linear regression"){
        reg_x <- lm(indi_Y ~ 1 + modelmat[,-1])
        ccdf_x[[i]] <- predict(reg_x)[w]
        reg_nox <- lm(indi_Y ~ 1 + modelmat[,-c(1,ind_X)])
        ccdf_nox[[i]] <- predict(reg_nox)[w]
      }
    }
    
    if (is.factor(X[,1])){
      x_sort <- as.factor(x_sort)
    }
    if (is.factor(Z[,1])){
      z_sort <- as.factor(z_sort)
    }
    ccdf_x <- unlist(ccdf_x, use.names = FALSE)
    ccdf_nox <- unlist(ccdf_nox, use.names = FALSE)
    cdf <- unlist(cdf, use.names = FALSE)
    
    return(list(cdf=cdf, ccdf_nox=ccdf_nox, ccdf_x=ccdf_x, y=y_sort, x=x_sort, z=z_sort))
  }
}

Try the ccdf package in your browser

Any scripts or data that you put into this service are public.

ccdf documentation built on Sept. 24, 2021, 9:07 a.m.