R/predicteff.R

Defines functions predicteff

Documented in predicteff

#' Predicts treatment effects on an outcome for individuals  randomly sampled from the entire dataset (default 20%).
#'
#' @details This function sets up feature and treatment-effects data, fits
#' random causal forest and identify the individuals with significant
#' treatment effects. Each individual is characterized by a set of feature data and the
#' the effect of treatment on the individual is given by the estimated difference of an
#' outcome between treating and and not treating when the feature data are kept constant.
#' Individuals with significant treatment effects
#' are considered for those whose confidence intervals for the treatment
#' estimate do not overlap 0. Consensus profiles of individuals with
#' positive, and negative, treatment effects are obtained from majority votes of
#' adjusted features, binarized over the population means.
#'
#' The result is two profiles, associated with positive and negative
#' treatment effects, given by logical vectors across the
#' features. The logical value of a given profile at feature indicates whether the
#' adjusted feature of a new individual should be higher than the feature population
#' mean if the individual is successfully targeted by the profile. See \link[teff]{targetprofile}.
#'
#' @export
#' @param x a \code{list} with a fields \code{teffdata} and \code{features}.
#' \code{teffdata} is a \code{data.frame} (or \code{matrix}) with the treatment
#' \code{$t} and the outcome \code{$eff} variables, and covariates across
#' subjects. \code{features} is a \code{matrix} which the profiling
#' is of subjects if performed. The features are adjusted for the covariates
#' on the \code{teffdata} before fitting the causal random forest forest.
#'
#' @param featuresinf a \code{vector} of characters with the names of the
#' features to be used; a \code{matrix} of characters whose columns are
#' names of features whose values will be averaged (Default:NULL).
#' @param cores an integer with the number of cores for parallel
#' computation (Default:1)
#' @param seed and integer with the random seed for splitting data into
#' train (80%) and test (20%) sets.
#' @param plot.overlap a logical. If \code{TRUE} then it plots the overlap of
#' adjusted feature data across treatments. Parameter available for less
#' than 20 features (Default: \code{FALSE}).
#' @param quant a number from 0 to 1 with the quantile of features to be selected
#' with top information score from the causal forest.
#' By default it selects all the features (Default: Inf).
#' @param profile a logical. If \code{TRUE} then it estimates a profile of binarized
#' feature data for the individuals with significantly positive and negative treatment effects, respectively.
#' @param dup a logical that indicates whether the feature and teff data should
#' be duplicated in case of small datasets.
#' @param resplevel a number indicating the level of response for assessing positive ore negative treatment effects (default 0).
#' @return  a \code{list} of class \code{pteff} with fields:
#' \describe{
#' \item{predictions:}{a \code{vector} with the estimated treatment effect
#' of the individuals in the test set.}
#' \item{featurenames:}{a \code{vector} with the names of the features used.}
#' \item{cl:}{a \code{vector} with the lower limit of the 95% confidence
#' intervals for the estimated treatment effect.}
#' \item{cu:}{a \code{vector} with the upper limit of the 95% confidence
#' intervals for the estimated treatment effect.}
#' \item{subsids:}{a \code{vector} with ids of subjects in the test set.}
#' \item{treatment:}{a \code{vector} with treatment effect in the test set.}
#' \item{profile:}{a \code{list} with fields \code{profpositive} and \code{profnegative}
#' that are matrices with binarized feature data for the individuals with
#' significantly positive and negative treatment effects, respectively.}
#' }
#' @export
#'
#' @examples
#' data(tcell)
#' homologous<- matrix(c("DDX3Y","DDX3X","KDM5D","KDM5C","PRKY","PRKX","RPS4Y1",
#' "RPS4X","TXLNGY", "TXLNG",
#' "USP9Y", "USP9X", "XIST", "XIST", "TSIX", "TSIX"), nrow=2)
#' predicteff(tcell, featuresinf=homologous, profile=TRUE)
#'
predicteff <- function(x,
                    featuresinf=NULL,
                    cores=1,
                    seed=1234,
                    plot.overlap=FALSE,
                    quant=Inf,
                    dup=FALSE,
                    profile=FALSE,
                    resplevel=0){

  ##Data set up
  ########################

  teffdata <- x$teffdata
  features <- t(x$features)
  subsids <- colnames(features)

  #redefine features on X by taking the average across featuresinf rows
  if(is.matrix(featuresinf)){

    X <- lapply((1:ncol(featuresinf)), function(i)
      colMeans(features[featuresinf[,i], ]))

    X <- do.call(cbind,X)
    colnames(X) <-  sapply(1:ncol(featuresinf), function(i) paste0(featuresinf[,i], collapse="-"))

    if(plot.overlap){
      wh <- which(colSums(sapply(data.frame(featuresinf), duplicated))==0)
      colnames(X) <-  paste(featuresinf[1,],c(featuresinf[2,wh]," "," "), sep="-")
    }

    nms <- colnames(X)
    featuresinf <- nms

  }else{
    if(is.null(featuresinf)) featuresinf <- rownames(features)
    selfeatures <- which(rownames(features)%in%featuresinf)
    X <- t(features[as.numeric(selfeatures), ])
    nms <- colnames(X)
  }

  #obtain residuals on XX of features data after adjusting for covariates in teffdata
  XX <- parallel::mclapply(1:ncol(X), function(i){
    covvars <- colnames(teffdata)
    xi <- X[,i]
    fla <- paste("xi ~", paste(covvars, collapse="+"))

    lm(as.formula(fla) , data=data.frame(teffdata))$residuals
  }, mc.cores = cores)


  #redefine feature data
  X <- do.call(cbind, XX)
  colnames(X) <- nms


  #Formating data for RCF
  ########################

  #effect
  Y <- teffdata[,"eff"]

  #treatment
  W <- as.numeric(as.factor(teffdata[,"t"]))==2

  #randomly selection of a test comprising 20% of subjects
  set.seed(seed)
  sm <- sample(1:nrow(X),floor(nrow(X)*0.2))
  smt <- (1:nrow(X))[-sm]

  if(dup){
    sm <- c(sm,sm)
    smt <- c(smt,smt)
  }

  X.test <- X[sm,]
  Y.test <- Y[sm]
  W.test <- W[sm]

  #randomly select a training
  X.train <- X[smt,]
  Y.train <- Y[smt]
  W.train <- W[smt]

  #plots covariate overlap
  ########################

  if(plot.overlap){
    if(ncol(X)>20) stop("plot.overlap not allowed for number of features > 20")

    print("... plots of t:eff interactions on the outcome in interactions.pdf \n")

    pdf("./interactions.pdf")
      cc <- W.train
      cc[W.train] <- "orange"
      cc[!W.train] <- "blue"

      for(ii in 1:ncol(X.train)){

        plot(X.train[,ii], log2(Y.train+1), col=cc, main=colnames(X.train)[ii],pch=20, xlab="Mean expression residual", ylab="Outcome")
        abline(lm(log2(Y.train+1)[W.train] ~ X.train[W.train,ii]),lwd=1.5, lty=2, col="blue")
        abline(lm(log2(Y.train+1)[!W.train] ~ X.train[!W.train,ii]),lwd=1.5, lty=2, col="orange")
        legend("topleft",legend=c("Male", "Female"), col=c("orange", "blue"), pch=20, cex=0.7)
      }


    dev.off()


    pdf("./overlap.pdf")
      print("... plots of covariate overlap across tretments in overlap.pdf \n")
      for(ii in 1:ncol(X.train)){

        int <- seq(min(X.train)-0.5,max(X.train)+0.5, 0.5)

        hist(X.train[W.train,ii], freq=FALSE, br=int, main=gsub("\n ","-",colnames(X.train)[ii]),
             xlab="Mean expression residual", ylab="Density", ylim=c(0,0.9), col="blue", cex.lab=1.3 )

        hist(X.train[!W.train,ii], add=TRUE, freq=FALSE, br=int, col="orange", cex.lab=1.3)

        legend("topright", legend=c("Male", "Female"), col=c("orange", "blue"),
               lty=1, bty = "n", cex=1.3)
      }

    dev.off()
  }

  #fit RCF
  ########################

  tau.forest <- grf::causal_forest(X.train, Y.train, W.train, num.trees = 4000)
  tau.hat <- predict(tau.forest, X.test, estimate.variance = TRUE)
  names(tau.hat$predictions) <- subsids[sm]

  sigma.hat <- sqrt(tau.hat$variance.estimates)

  v1 <- grf::variable_importance(tau.forest)
  o.imp <- order(-v1)
  imp <- o.imp[1]
  featurecommon <- data.frame(featurenames=colnames(X), imp=v1)[o.imp,]

  featureimp <- lapply(featuresinf, function(ll){
    #select features results in each study for common feature symbols
    out <- featurecommon[which(as.character(featurecommon$featurenames)%in%ll), ]
    #select probe with highest score
    if(length(nrow(out))!=0)
      out <- out[order(-out[,2])[1], ]
    out
  })

  featureimp <- do.call(rbind,featureimp)
  featureimp <- featureimp[order(featureimp$imp, decreasing = TRUE),]

  #extract confidence intervals
  cl <- tau.hat$predictions - 1.96*sigma.hat
  cu <- tau.hat$predictions + 1.96*sigma.hat
  #output
  res <- list(predictions=tau.hat$predictions,
              featurenames=featureimp,
              cl=cl, cu=cu, subsids=subsids[sm],
              treatment = W.test*1,
              resplevel=resplevel)

  if(dup==TRUE){
    selnotdup <- duplicated(subsids[sm])
    res <- list(predictions=tau.hat$predictions[selnotdup],
                featurenames=featureimp,
                cl=cl[selnotdup], cu=cu[selnotdup],
                subsids=subsids[sm][selnotdup],
                treatment = (W.test*1)[selnotdup],
                resplevel=resplevel)
  }
  #add profiling to output
  if (profile){

    sighetpositive <- (cl > resplevel) + 1
    sighetnegative <-  (cu < resplevel) + 1


    #get profile for top featurenames define by quant
    cutoff <- max(which(cumsum(featureimp$imp) < quant))
    colimp <- featureimp$imp[1:cutoff]

    #select trasncription profiles (residuals) for featurenames passing the cutoff
    whimp <- as.numeric(rownames(featureimp))
    Xscale <- scale(X.test)[, whimp[1:cutoff]]

    #Extract a binarized profiles across important featurenames, where featurenames up or
    #downregulated in their study
    profall <- lapply(1:ncol(Xscale), function(i) (Xscale[,i] > 0))
    profall <- do.call(cbind, profall)

    #Profiles of individuals with significant heterogeneous treatment-effects
    profpositive <- profall[sighetpositive==2,]
    profnegative <- profall[sighetnegative==2,]


    if(is.matrix(profpositive)==FALSE)
      profpositive <- matrix(profpositive, ncol=ncol(profall))

    if(is.matrix(profnegative)==FALSE)
      profnegative <- matrix(profnegative, ncol=ncol(profall))

    #summarize subject profiles into a single one representing individuals with
    #positive treatment effects
    profpositive <- matrix(colMeans(profpositive, na.rm=TRUE)>0.5, nrow=1)

    #and negative treatment effects
    profnegative <- matrix(colMeans(profnegative, na.rm=TRUE)>0.5, nrow=1)

    colnames(profpositive) <- colnames(Xscale)
    colnames(profnegative) <- colnames(Xscale)

    ##########
    #gather output
#    res <- list(predictions=tau.hat$predictions, featurenames=featureimp,  cl=cl, cu=cu, subsids=subsids[sm], treatment = W.test*1, profile=list(profpositive=profpositive,profnegative=profnegative), resplevel=resplevel)

    res$profile <-  list(profpositive=profpositive,profnegative=profnegative)
    res$resplevel <-  resplevel

  }


  attr(res, "class") <- "pteff"

  return(res)
}
teff-package/teff documentation built on March 20, 2022, 8:25 p.m.