R/bayesian.R

Defines functions bn.results.samples bn.prep.data bn.fit.dag bn.prep.dag

Documented in bn.fit.dag bn.prep.dag bn.prep.data bn.results.samples

##############################################################
# Bayesian Networks
# 4/4/2021
# created by T. Gärtner (thomas.gaertner@student.hpi.de)
##############################################################



#' Prepare Dag for BN Methods
#' @description This function reformat a `DAGitty` to `bn.learn` DAG. This is required for further methods.
#' @param dag DAG in a DAGitty format
#' @return bnlearn graph based on DAGitty graph
#' @examples
#' bn.prep.dag(simpatdag)
#' @export
bn.prep.dag <- function(dag){
  exposure <- dagitty::exposures(dag)
  outcome <- dagitty::outcomes(dag)
  names <- names(dag)
  bn.dag <- bnlearn::empty.graph(nodes = names(dag))
  edges <- dagitty::edges(dag)
  edges$from <- edges$v
  edges$to <- edges$w
  edges[] <- lapply(edges,as.character)
  bnlearn::arcs(bn.dag) = edges[,c("from","to")]
  plot(bn.dag, main = "Transformed Bn DAG")
  return(list(dag = bn.dag, exposure = exposure, outcome = outcome, variables=names))
}


#' Bayesian Network Analysis
#' @description This function is an Interface to the fitting function for Bayesian Networks from bnlearn
#' @param dag DAG generated by
#' @param data R data frame to fit the Bayesian network
#' @param method Method used to analyze. Default is `bayes`. Alternative is `mle`
#' @return bn.fit object
#' @examples
#' # specify column names
#' id <- "patient_id"
#' time_col <- "day"
#'
#' # Load data
#' load("data/simpatdag.rda")
#' load("data/simpatdat.rda")
#'
#' # Dag preprocessing
#' bn.dag <- bn.prep.dag(simpatdag)
#'
#' # Data Preprocessing (Factorization)
#' simpatdat$Uncertain_Low_Back_Pain <- as.factor(simpatdat$Uncertain_Low_Back_Pain)
#' simpatdat$Activity <- cut(simpatdat$Activity, 3, labels=c("low Activity", "middle Activity", "high Activity"))
#' bn.data <- bn.prep.data(bn.dag, simpatdat, id, time_col)
#' bn.fit.dag(bn.data, bn.dag, method="bayes")
#' @export
bn.fit.dag <- function(data, dag, method = "bayes"){
  fit <- bnlearn::bn.fit(dag$dag, data, method = method)
  return(fit)
}


#' Preprocess data
#' @description This function is preprocessing a dataframe to be used for bn.fit. For that, columns were transformed into factors and lags were created if required.
#' @param dag result from bn.prep.dag
#' @param data R data frame to fit the Bayesian network
#' @param id column name of patient identifier (only needed when lag column)
#' @param time_col column name of time (integer expected, only needed when lag column)
#' @param factorize specify weather all columns should be factors or not
#' @return data frame
#' @examples
#' # specify column names
#' id <- "patient_id"
#' time_col <- "day"
#'
#' # Load data
#' load("data/simpatdag.rda")
#' load("data/simpatdat.rda")
#'
#' # Dag preprocessing
#' bn.dag <- bn.prep.dag(simpatdag)
#'
#' # Data Preprocessing (Factorization)
#' simpatdat$Uncertain_Low_Back_Pain <- as.factor(simpatdat$Uncertain_Low_Back_Pain)
#' simpatdat$Activity <- cut(simpatdat$Activity, 3, labels=c("low Activity", "middle Activity", "high Activity"))
#' bn.data <- bn.prep.data(bn.dag, simpatdat, id, time_col)
#' @export
bn.prep.data <- function(dag, data, id="patient_id", time_col="day", factorize=FALSE, normalize=FALSE){
  library(doParallel)
  # get relevant columns
  names <- dag$variables
  # calculate lag
  for(name in names){
    # test, if lag is included
    if(grepl(".lag_", name, fixed=TRUE)){
      # extract lag and col information
      res <- stringr::str_split(name, ".lag_", n = 2, simplify = TRUE)
      var_name <- res[1]
      lag <- as.numeric(res[2])
      # create lag column
      
      no_cores <- detectCores() - 1
      registerDoParallel(cores=no_cores)
      cl <- makeCluster(no_cores, type="FORK")
      data <- foreach(id=unique(data[,"patient_id"]), .combine=rbind) %dopar% {
        dat <- data[data[,"patient_id"]==id,]
        dat[,name] <- c(rep(NA, lag), dat[,var_name])[1:nrow(dat)]
        dat}
      stopCluster(cl)
      #for(row in c(1:nrow(data))){
      #  data_point <- data[data[,id] == data[row,id] & data[,time_col] == data[row,time_col]-lag,]
      #  if (!nrow(data_point)==0){
      #    data[row,name] <- data_point[1,var_name]
      #  }else{
      #    data[row,name] <- NA
      #  }
      #}
      if(lapply(data, class)[[var_name]]=="factor"){
        data[,name] <- as.factor(data[,name])
        levels(data[,name]) <- levels(data[,var_name])
      }
    }
    # factorize
    if(normalize){
      data[,name] <- prep.normalize(data = data[,name], scale="uni")
    }
    if(factorize & name != dag$outcome){
      data[,name] <- as.factor(data[,name])
    }
  }
  # filter column names
  data <- data[,names]
  # return dataframe
  return(data)
}


#' Calculate Results
#' @description This function calculates for each exposure the mean outcome on a sample and was implemented to deliver fast results. Nevertheless, the function should not used extensively for giving results as they are not reproducable.
#' @param dag result from bn.prep.dag
#' @param fitted.model bn.fitted model
#' @param exposure_levels levels to test exposure for
#' @param verbose Identify if the function should print the results.
#' @return data frame
#' @examples
#' bn.results.samples(fitted.bn, bn.dag, exposure_levels, FALSE)
#' @export
bn.results.samples <- function(fitted.model, dag, exposure_levels, verbose=TRUE){
  # Function to calculate standard error
  standard.error <- function(x){
    return(sd(x)/sqrt(length(x)))
  }

  results <- data.frame(Exposure = rep(NA,0), Mean = rep(NA,0), Std.Error = rep(NA,0))

  data <- list()

  # Loop over all exposures
  for(exp_level in exposure_levels){
    exp <- list(exp = exp_level)
    names(exp) <- dag$exposure

    sim.outcome <- na.omit(bnlearn::cpdist(fitted.model, node = c(dag$outcome), evidence = exp, method = "lw"))

    sim.mean <- mean(sim.outcome[,dag$outcome])
    sim.err <- standard.error(sim.outcome[,dag$outcome])

    results[nrow(results)+1,] <- c(exp, sim.mean, sim.err)

    if(verbose){
      lines <- c(exp_level,
                 paste("Mean", sim.mean),
                 paste("Std.Error", sim.err))

      writeLines(paste(lines, sep="\n", collapse = "\n"))
    }
    data[exp_level] <- sim.outcome
  }
  return(results)
}
thogaertner/cinof1 documentation built on Jan. 8, 2022, 10:37 a.m.