R/synthesize.R
In riskRegression: Risk Regression Models and Prediction Scores for Survival Analysis with Competing Risks

Documented in synthesize synthesize.formula synthesize.lvm

### synthesize.R ---
#----------------------------------------------------------------------
## Author: Johan Sebastian Ohlendorff & Vilde Hansteen Ung & Thomas Alexander Gerds
## Created: Apr 28 2021 (09:04)
## Version:
## Last-Updated: Jul 12 2022 (11:19) 
##           By: Thomas Alexander Gerds
##     Update #: 95
#----------------------------------------------------------------------
##
### Commentary:
##
### Change Log:
#----------------------------------------------------------------------
##
### Code:
##' Synthesizes survival data (also works for linear models and generalized linear models).
##' The idea is to be able to simulate new data sets that mimic the original data.
##' See the vignette \code{vignette("synthesize",package = "riskRegression")} for more details.
##'
##' The simulation engine is: lava.
##' @title Cooking and synthesizing survival data
##' @description Fit parametric regression models to the outcome distribution and optionally
##' also parametric regression models for the joint distribution of the predictors
##' structural equation models. 
##' Then the function \code{sim.synth} can be called on the  resulting object to
##' to simulate from the parametric model based on the machinery of the \code{lava} package
##' @aliases synthesize.formula synthesize.lvm 
##' @param object Specification of the synthesizing model structures. Either a \code{formula} or a \code{lvm} object. See examples.
##' @param data Data to be synthesized.
##' @param recursive Let covariates recursively depend on each other.
##' @param max.levels Integer used to guess which variables are categorical. When set to \code{10}, the default,
##'                   variables with less than 10 unique values in data are treated as categorical.
##' @param verbose Logical. If \code{TRUE} then more messages and warnings are provided.
##' @param logtrans Vector of covariate names that should be log-transformed. This is primarily for internal use.
##' @param fix.names Fix possible problematic covariate names. 
##' @param ... Not used yet.
##' @return lava object
##' @seealso lvm
##' @examples
##' # pbc data
##' library(survival)
##' library(lava)
##' data(pbc)
##' pbc <- na.omit(pbc[,c("time","status","sex","age","bili")])
##' pbc$logbili <- log(pbc$bili)
##' v_synt <- synthesize(object=Surv(time,status)~sex+age+logbili,data=pbc)
##' d <- simsynth(v_synt,1000)
##' fit_sim <- coxph(Surv(time,status==1)~age+sex+logbili,data=d)
##' fit_real <- coxph(Surv(time,status==1)~age+sex+logbili,data=pbc)
##' # compare estimated log-hazard ratios between simulated and real data
##' cbind(coef(fit_sim),coef(fit_real))
##'
##' u <- lvm()
##' distribution(u,~sex) <- binomial.lvm()
##' distribution(u,~age) <- normal.lvm()
##' distribution(u,~trt) <- binomial.lvm()
##' distribution(u,~logbili) <- normal.lvm()
##' u <-eventTime(u,time~min(time.cens=0,time.transplant=1,time.death=2), "status")
##' lava::regression(u,logbili~age+sex) <- 1
##' lava::regression(u,time.transplant~sex+age+logbili) <- 1
##' lava::regression(u,time.death~sex+age+logbili) <- 1
##' lava::regression(u,time.cens~1) <- 1
##' transform(u,logbili~bili) <- function(x){log(x)}
##' u_synt <- synthesize(object=u, data=na.omit(pbc))
##' set.seed(8)
##' d <- simsynth(u_synt,n=1000)
##' # note: synthesize may relabel status variable
##' fit_sim <- coxph(Surv(time,status==1)~age+sex+logbili,data=d)
##' fit_real <- coxph(Surv(time,status==1)~age+sex+log(bili),data=pbc)
##' # compare estimated log-hazard ratios between simulated and real data
##' cbind(coef(fit_sim),coef(fit_real))
##'
##' #
##' # Cancer data
##' #
##' data(cancer)
##' b <- lvm()
##' distribution(b,~rx) <- binomial.lvm()
##' distribution(b,~age) <- normal.lvm()
##' distribution(b,~resid.ds) <- binomial.lvm()
##' distribution(b,~ecog.ps) <- binomial.lvm()
##' lava::regression(b,time.death~age+rx+resid.ds) <- 1
##' b<-eventTime(b,futime~min(time.cens=0,time.death=1), "fustat")
##' b_synt <- synthesize(object = b, data = ovarian)
##' D <- simsynth(b_synt,1000)
##' fit_real <- coxph(Surv(futime,fustat)~age+rx+resid.ds, data=ovarian)
##' fit_sim <- coxph(Surv(futime,fustat)~age+rx+resid.ds, data=D)
##' cbind(coef(fit_sim),coef(fit_real))
##' w_synt <- synthesize(object=Surv(futime,fustat)~age+rx+resid.ds, data=ovarian)
##' D <- simsynth(w_synt,1000)
##' fit_sim <- coxph(Surv(futime,fustat==1)~age+rx+resid.ds,data=D)
##' fit_real <- coxph(Surv(futime,fustat==1)~age+rx+resid.ds,data=ovarian)
##' # compare estimated log-hazard ratios between simulated and real data
##' cbind(coef(fit_sim),coef(fit_real))
##'
##'
##' @export
##' @author Thomas A. Gerds <tag@@biostat.ku.dk>
synthesize <- function(object, data,...){
    requireNamespace("lava")
    UseMethod("synthesize",object=object)
}

##' @export synthesize.formula
##' @export
#' @rdname synthesize
#' @method synthesize formula
synthesize.formula <- function(object, # a formula object Surv(time,event) or Hist(time,event)
                               data,
                               recursive=FALSE,
                               max.levels=10,
                               verbose=FALSE,
                               ...){
    requireNamespace("lava",quietly=TRUE)

    # time to event outcome
    tt <- all.vars(update(object,".~1"))

    # covariates
    vv <- all.vars(formula(delete.response(terms(object))))
    fml <- paste(vv,collapse = "+")
    vv.withtrans <- attr(terms(formula(delete.response(terms(object)))),"term.labels")
    logtrans <- vv[vv!=vv.withtrans]

    # could be done more easily if the order of the covariates is not important,
    # but it might be due to recursive
    # potentially being set to TRUE
    # this changes the formula to be logtransformed
    hasLog <- length(logtrans)>0
    for (v in logtrans){
      if (is.null(data[[v]])){
        if (verbose) warning(paste0("Covariate ",v," is not in data set. Removing it from the formula."))
        fml <- gsub(paste0("[+]",v),"",fml)
        vv <- logtrans[logtrans!=v]
      }
      else {
        data[[paste0("log",v)]] <- log(data[[v]])
        s<-paste0("log",v)
        fml <- gsub(v,s,fml)
      }
    }

    # check if covariates are categorical
    if (hasLog){
      #include transformed variables in model
      vv1 <- vv
      vv <-gsub("[(]|[)]","",vv.withtrans)
    }

    object <- update(object,as.formula(paste0("~",fml)))
    object <- lava::lvm(object)

    # specify distributions of covariates in the lava object
    for (v in vv){
      if (is.null(data[[v]])){
        if(verbose) warning(paste0("Covariate ",v," is not in data set. Removing it from the formula"))
        fml <- gsub(paste0("[+]",v),"",fml)
        vv <- vv[vv!=v]
        object <- lava::rmvar(object,v)
      }
      else if (categorize(v,max.levels,data) == 2){
        data[[v]] <- factor(data[[v]])
        lava::distribution(object,v) <- lava::binomial.lvm()
      }
      else if (categorize(v,max.levels,data) == 1){
        data[[v]] <- factor(data[[v]])
        object <- lava::categorical(object, v, K=length(unique(data[[v]])))
      }
      else {
        lava::distribution(object,v) <- lava::normal.lvm()
      }
    }

    # include recursive structure of covariates if requested
    if (recursive){
      cv <- vv
      while (length(cv) > 1) {
        cv2 <- cv[2:length(cv)]
        #lacks implementation in case it is categorical
        #needs multinomial logistic regression to be implemented
        if (categorize(cv[1],max.levels,data) == 1){
          warning("Categorical variables not fully tested for recursive")
          object <- lava::categorical(object, cv[1], K=length(unique(data[[cv[1]]])))
        }
        else if (categorize(cv[1],max.levels,data) == 2){
          lava::distribution(object,cv[1]) <- lava::binomial.lvm()
        }
        regression(object) <- as.formula(paste0(cv[1],"~",paste(cv2,collapse = "+")))
        cv <- cv2
      }
    }
    # outcome
    # binary/continuous outcome
    if (length(tt)==1){
        if (is.null(data[[tt[1]]])){stop("Response variable not in data set. Add it to the data, then try again.")}
        len.tt <- length(unique(data[[tt[[1]]]]))
        is.binary <- length(unique(data[[tt[[1]]]]))==2
        if (len.tt == 2){
            lava::distribution(object,tt[[1]]) <- lava::binomial.lvm()
        }
        else if (len.tt > 2 && len.tt < max.levels){
          warning("Categorical responses are not supported (yet).")
          object <- lava::categorical(object, tt[1], K=len.tt)
        }
        else {
          lava::distribution(object,tt[[1]]) <- lava::normal.lvm()
        }
    }else{
        # event time outcome
        if(is.null(data[[tt[1]]]) || is.null(data[[tt[2]]])) {
          stop("Response variable not in data set. Add it to the data, then try again.")
        }
        events <- sort(unique(data[[tt[[2]]]]))
        et.formula <- formula(paste0(tt[[1]]," ~ min(",paste(paste0("time.event.",events,"=",events),collapse=", "),")"))
        object <- lava::eventTime(object,et.formula, tt[[2]])
    }
    # call the next method, now the class of object is 'lvm' :)
    attr(x=object,which="from.formula") <- TRUE
    out <- synthesize(object=object,data=data,verbose=verbose,logtrans=logtrans,...)
    out
}


#' @export synthesize.lvm
#' @export
#' @rdname synthesize
#' @method synthesize lvm
synthesize.lvm <- function(object,
                           data,
                           max.levels = 10,
                           logtrans = NULL,
                           verbose=FALSE,
                           fix.names = FALSE,
                           ...){
    from.formula <- length(attr(object,"from.formula"))>0
    if (from.formula && verbose){
      warning("from.formula should be set to false if you did not specify a formula")
    }
    # check whether variables in model are in data set
    if (!from.formula && !all(object$attributes$eventHistory$time$names %in% names(data))) {
      stop("Time or status variable could not be found in data set.")
    }
    var.model <- colnames(object$M)
    # should deal with transformations check if non transformed are in data and if not add them
    # use grep to find transformed variables
    has.eventTime <- length(object$attributes$eventHistory)>0
    if (has.eventTime){
      timename <- names(object$attributes$eventHistory)
      timeplusevent <- object$attributes$eventHistory[[timename]]$names
      if (!(timeplusevent[1] %in% names(data)) || !(timeplusevent[2] %in% names(data))) {
        if (verbose) warning("time/status not found. Removing it")
        object <- lava::rmvar(object,timeplusevent[1])
        object <- lava::rmvar(object,timeplusevent[2])
      }
    }
    else {
      timename <- NULL
      timeplusevent <- NULL
    }
    # should check group transform
    # if (any(grepl("grp|group",var.model))) {
    #   stop("")
    # }
    # should check logtransform in data and add if necessary
    if (!from.formula && is.null(logtrans) && any(grepl("log",var.model))){
      #these have log in front
      trans <- var.model[grepl("log",var.model)]
      #log trans dont have log in front
      logtrans <- sub("log","",trans)
      for (v in logtrans){
        if (is.null(data[[v]]) && is.null(data[[paste0("log",v)]]) ){
          if(verbose) warning(paste0("Could not find the variable: ", v, "in the data"))
          object <- lava::rmvar(object,v)
          object <- lava::rmvar(object,paste0("log",v))
          trans <- trans[trans!=paste0("log",v)]
          logtrans <- logtrans[logtrans!=v]
        }
        else {
          data[[paste0("log",v)]] <- log(data[[v]])
        }
      }
    }

    # Check if all variables in data also occur in object
    # here we need to remove them if they are not
    if(!all(others <- names(data) %in% dimnames(object$M)[[1]])){
      if (verbose)
        warning("Some variables in dataset are not in object (or the names don't match).\n These variables are not synthesized:\n",
                paste0(names(data)[!others],collapse="\n"))
      # if(data.table::is.data.table(object))
      #     data <- data[,dimnames(object$M)[[1]],with=FALSE]
      # elsesub("log","",logtrans)
      #     data <- data[,dimnames(object$M)[[1]],drop=FALSE]
    }
    #check if variables in model are in data

    if (!all(others <- dimnames(object$M)[[1]] %in% names(data))){
      if (verbose) warning("Some variables in object are not in dataset (or the names don't match).\n These variables are not synthesized:\n",paste0(dimnames(object$M)[[1]][!others],collapse="\n"))
      #we have checked the logtransformed
      miss <- dimnames(object$M)[[1]][!others]
      if (length(logtrans) > 0) miss <- intersect(trans,intersect(logtrans,miss))
      if (has.eventTime) miss <- setdiff(miss, object$attributes$eventHistory[[timename]]$latentTimes)
      for (v in miss){
        object <- lava::rmvar(object,v)
      }
    }

    # note: will be a problem if there are NAs in data, should check at beginning of function

    # find intersection between variables in model with variables in data (these are the actual variables of the lava object)
    var.model <- colnames(object$M)
    var.model <- intersect(var.model,names(data))
    ismissingvar <- sapply(var.model, function(x) {anyNA(data[[x]])})
    if (any(ismissingvar )) {
      missing.var <- names(ismissingvar[ismissingvar])
      printvar <- paste(missing.var[1:min(length(missing.var),20)],collapse = ", ")
      stop(paste0("There should not be NAs for the variables in the model. The following variables have NAs: \n ",printvar))
    }

    sim_model <- lava::lvm()

    latent_vars <- endogenous(object)
    # should check factors in object are factors in data. If not, transform them
    if (!from.formula){
      for (v in var.model){
        #ignore transformed variables do not take the variables from the event history model
        if (!(v %in% object$attributes$eventHistory[[timename]]$names) && categorize(v,max.levels,data) != 0){
          data[[v]] <- factor(data[[v]])
        }
      }
    }

    # Set distributions:
    # 1. normal
    # 2. binomial
    # 3. categorical
    # deal with exogenous variables (which are without covariates)
    labels <- list()
    categorical.vars <- c()
    for(var in exogenous(object)){
        var_formula <- as.formula(paste0("~", var))
        #case: gaussian

        if("gaussian" %in% attributes(object$attributes$distribution[[var]])$family){
            lava::distribution(sim_model,var_formula) <- lava::normal.lvm(mean=mean(data[[var]]),
                                                                          sd=sd(data[[var]]))
        }
        #case: binary
        else if("binomial" %in% attributes(object$attributes$distribution[[var]])$family){
            labels[[var]] <- levels(data[[var]])
            lava::distribution(sim_model, var_formula) <- lava::binomial.lvm(p=mean(factor(data[[var]])==levels(factor(data[[var]]))[2]))
        }
        #case: categorical
        else if(var %in% names(object$attributes$nordinal)){
            categorical.vars <- c(categorical.vars, var)
            num_cat <- object$attributes$nordinal[[var]] #what if levels is NULL?
            cat_probs <- vector(mode="numeric", length=num_cat-1)
            for(i in 1:(num_cat-1)){
                cat_probs[i] <- mean(data[[var]] == levels(data[[var]])[i])
            }
            sim_model <- lava::categorical(sim_model,
                                           var_formula,
                                           labels=levels(data[[var]]),
                                           K=num_cat,
                                           p=cat_probs
                                           )
        }
        else {
            stop("Error. Distribution not supported!")
        }
    }

    
    # dichotomize categorical variables
    # should not do this if the cat. variable is not in lvm.object
    # note: name of dich. variables must correspond with names used in lava::regression formulas outside synthesize-func.
    dichotomized_variables <- c()
    for(var in colnames(data)[grepl('factor|character', sapply(data, class))]){
        #fix factors with weird characters including binary
        #note: we do not fix formulas and weirdly named variables
        #  can give problems if synthesized directly from lvm
        # don't change 0/1 variables even if they are factors
        if (from.formula || fix.names){
          # remove warning, i.e. don't compare if the number of levels is greater than 2
          if (length(levels(data[[var]])) ==2 && all(levels(data[[var]]) == c("0","1"))){
          }
          else {
            lvls <- make.names(levels(data[[var]]))
            levels(data[[var]]) <- lvls
          }
        }
        if(!any(grepl(var,  dimnames(object$M)[[1]]))){
            #variable not in lvm object
        } else if (length(levels(data[[var]]))>2){
            dichotomized_variables <- c(dichotomized_variables, var)
            for(lvl in levels(data[[var]])[-1]){
                data[[paste0(var, lvl)]] <- 1*(data[[var]] == lvl)
                formula <- as.formula(paste0(var, lvl,"~",var))
                # updates the formula in a local environment; otherwise the lvl will be tied to the last value of lvl in the for loop
                sim_model <- local({
                    l <- lvl
                    lava::transform(sim_model, formula) <- function(x){1*(x==l)}
                    return(sim_model)})
            }
        }
    }
    # get covariates as string and include dichotomized variables instead of the original ones
    get_covariates <- function(obj, v, dichotomized_variables) {
        if (!(v %in% dimnames(obj$M)[[2]])){
          return("1")
        }
        covariates <- dimnames(obj$M)[[2]][obj$M[,v] == 1]
        if (length(covariates) == 0){
            return("1")
        }
        else {
            fml <- paste(covariates,collapse = "+")
            for (var in dichotomized_variables){
                lvl <- paste0(var,levels(data[[var]])[-1])
                cv <- paste0(lvl,collapse = "+")
                fml <- gsub(var,cv,fml)
            }
            return(fml)
        }
    }

    # define latent event time variables
    # note: will be a problem if there are several eventTime variables. Should loop through all these
    has.eventTime <- length(object$attributes$eventHistory)>0
    if (has.eventTime){
        covariates <- get_covariates(object,timename,dichotomized_variables)
        response1 <- paste(object$attributes$eventHistory[[timename]]$names, collapse = ",")

        #include event time variables and fit the regression model
        for (latvar in object$attributes$eventHistory[[timename]]$latentTime){
            if (!from.formula){
              covariates <- get_covariates(object,latvar,dichotomized_variables)
            }
            latvar_formula <- as.formula(paste0("~", latvar))
            status_ind <- object$attributes$eventHistory[[timename]]$events[which(object$attributes$eventHistory[[timename]]$latentTimes %in% latvar)]
            response <- paste0(response1, "==", status_ind)
            surv_formula <- as.formula(paste0("Surv(", response, ")~", covariates))
            if (!(inherits(status_ind,"numeric"))) {stop("event or status variable has to be numeric")}
            G <- survreg(surv_formula, data = data)
            reg_formula <- as.formula(paste0(latvar, "~", covariates))
            lava::distribution(sim_model, latvar_formula) <- lava::coxWeibull.lvm(scale=exp(-coef(G)["(Intercept)"]/G$scale),shape=1/G$scale)
            lava::regression(sim_model, reg_formula) <- -coef(G)[-1]/G$scale

        }
        #calculate time and status afterwards
        events <- object$attributes$eventHistory[[timename]]$events
        if (!from.formula){
          cens <- object$attributes$eventHistory[[timename]]$latentTime
          et.formula <- formula(paste0(timename," ~ min(",paste0(cens,"=",events,collapse=", "),")"))
          sim_model <- lava::eventTime(sim_model,et.formula, object$attributes$eventHistory[[timename]]$names[2])
        }
        else {
          et.formula <- formula(paste0(timename," ~ min(",paste(paste0("time.event.",events,"=",events),collapse=", "),")"))
          sim_model <- lava::eventTime(sim_model,et.formula, object$attributes$eventHistory[[timename]]$names[2])
        }
    }

    # Estimate regression coefficients in real data
    # and add them to the lvm object using lava::regression
    for(var in latent_vars[!(latent_vars %in% object$attributes$eventHistory$time$names)]){
        covariates <- get_covariates(object,var,dichotomized_variables)
        reg_formula <- as.formula(paste0(var, "~", covariates))
        # we have three types of regression to deal with now. Either
        # 1. linear regression
        # 2. logistic regression
        # 3. multinomial logistic regression
        if ("binomial" %in% attributes(object$attributes$distribution[[var]])$family){
            # might also lack levels here
            labels[[var]] <- levels(data[[var]])
            fit <- glm(reg_formula,data=data,family="binomial")
            p0<-exp(coef(fit)[1])/(1+exp(coef(fit)[1]))
            lava::distribution(sim_model,as.formula(paste0("~", var))) <- lava::binomial.lvm(p=p0)
            lava::regression(sim_model,reg_formula)<-coef(fit)[-1]
        }
        #case: categorical
        else if (var %in% dichotomized_variables){
          categorical.vars <- c(categorical.vars, var)
          if (verbose) warning("Synthesize untested for categorical variables")
          for(lvl in levels(data[[var]])[-1]){
            reg_formula <- as.formula(paste0(var, lvl, "~", covariates))
            fit <- glm(reg_formula,data=data,family="binomial")
            p0<-exp(coef(fit)[1])/(1+exp(coef(fit)[1]))
            lava::distribution(sim_model,as.formula(paste0("~", var))) <- lava::binomial.lvm(p=p0)
            lava::regression(sim_model,reg_formula)<-coef(fit)[-1]
          }
        }
        # ignore them
        else if (has.eventTime && var %in% object$attributes$eventHistory[[timename]]$latentTime){}
        # case: gaussian
        else {
            fit <- lm(reg_formula,data=data)
            lava::regression(sim_model,reg_formula)<-coef(fit)[-1]
            lava::distribution(sim_model,as.formula(paste0("~", var))) <- lava::normal.lvm(mean = coef(fit)[1], sd = summary(fit)$sigma)
        }

        #else {if ("gaussian"%in% attributes(object$attributes$distribution[[var]])$family){
        #    stop("Distribution is not supported.")
        #}
    }

    #transform logtransformed covariates back
    for (v in logtrans){
        transform(sim_model,as.formula(paste0(v,"~log",v))) <- function(x){exp(x)}
    }
    res <- list(lava.object = sim_model,labels = labels,categories = categorical.vars)
    class(res) <- c("synth",class(res))
    return(res)
}

#returns categorical values
# 0=numeric, 1=categorical with more levels, 2=binary
categorize <- function(v,max.levels,data){
  is.cat <- is.factor(data[[v]])
  nu <- length(unique(data[[v]]))
  if (!is.cat) {is.cat <- (nu < max.levels)}
  is.bin <- nu==2
  return(is.bin+is.cat)
}


#----------------------------------------------------------------------
### synthesize.R ends here
Any scripts or data that you put into this service are public.
riskRegression documentation built on May 29, 2024, 10:59 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
riskRegression
Risk Regression Models and Prediction Scores for Survival Analysis with Competing Risks

R/synthesize.R
In riskRegression: Risk Regression Models and Prediction Scores for Survival Analysis with Competing Risks

Defines functions categorize synthesize.lvm synthesize.formula synthesize

Documented in synthesize synthesize.formula synthesize.lvm

Try the riskRegression package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

riskRegression Risk Regression Models and Prediction Scores for Survival Analysis with Competing Risks

R/synthesize.R In riskRegression: Risk Regression Models and Prediction Scores for Survival Analysis with Competing Risks

Defines functions categorize synthesize.lvm synthesize.formula synthesize

Documented in synthesize synthesize.formula synthesize.lvm

Try the riskRegression package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

riskRegression
Risk Regression Models and Prediction Scores for Survival Analysis with Competing Risks

R/synthesize.R
In riskRegression: Risk Regression Models and Prediction Scores for Survival Analysis with Competing Risks