R/simulation.r

Defines functions DF.to.longDT DF.to.long doLTCF sim simfull simobs simFromDAG getactions get_gennames get_allts CopyAttributes FormAttributes

Documented in DF.to.long DF.to.longDT doLTCF sim simfull simobs

FormAttributes <- function(DAG, parents, dataform, LTCF, tvals, netind_cl) {
  list(DAG = DAG,
      parents = parents,
      dataform = dataform,
      LTCF = LTCF,
      tvals = tvals,
      netind_cl = netind_cl,
      node_nms = sapply(N(DAG), '[[', "name"),	# node names from the DAG name attributes
      actnodes = attr(DAG, "actnodes"),	# save the action nodes (if defined)
      acttimes = attr(DAG, "acttimes"),	# action time points (if defined)
      attnames = attr(DAG, "attnames"),	# save the attribute names (if defined)
      attrs = attr(DAG, "attrs")			# save the attribute values (if defined)
      )
}
# copy all DAG related attributes from a full data.frame attribute list (skipping row names and such)
# can modify some of the attributes, s.a., dataform, tvals & LTCF if provided
CopyAttributes <- function(attrslist, dataform=NULL, tvals=NULL, LTCF=NULL) {
  list(DAG=attrslist[["DAG"]],
      parents=attrslist[["parents"]],
      dataform = if (!is.null(dataform)) dataform else attrslist[["dataform"]],
      LTCF = if (!is.null(LTCF)) LTCF else attrslist[["LTCF"]],
      tvals = if (!is.null(tvals)) tvals else attrslist[["tvals"]],
      node_nms = attrslist[["node_nms"]],
      actnodes = attrslist[["actnodes"]],
      acttimes = attrslist[["acttimes"]],
      attnames = attrslist[["attnames"]],
      attrs = attrslist[["attrs"]],
      target = attrslist[["target"]]
      )
}

get_allts <- function(obs.df) { # calculate actual t values used in the simulated data (wide format)
  node_nms <- attributes(obs.df)$names[-1]
  all_ts <- as.integer(unique(sapply(strsplit(node_nms, "_"), '[', 2)))
  all_ts <- sort(all_ts[!is.na(all_ts)]) #remove NA/NULL cases and sort
}

get_gennames <- function(obs.df) { # grab generic names (without time points) from simulated data (wide format)
  node_nms <- attributes(obs.df)$names[-1]
  gennames <- as.character(unique(sapply(strsplit(node_nms, "_"), '[', 1)))
  # gennames <- sort(gennames[!is.na(gennames)]) #remove NA/NULL cases and sort
}

# given names of actions, extract the corresponding DAG.action objects from DAG
getactions <- function(DAG, actions) {
  if (!is.character(actions)) stop("actions must be a character vector of action names")
  # check the actions actually exist in the DAG, then grab them
  all_action_DAGs <- A(DAG)
  if (length(all_action_DAGs)<1) stop("DAG objects contains no actions, define actions with DAG+action before simulating action-specific data...")
  all_action_names <- sapply(all_action_DAGs, function(DAG) attr(DAG, "actname"))
  # verify that each action name in actions has only 1 unique match
  action_idx_sel <- NULL
  for (action in actions) {
    action_idx <- which(all_action_names %in% action)
    if (length(action_idx)<1) {
      stop("Couldn't locate action: "%+% action %+% " , first define action by adding it to the DAG object with DAG+action")
    } else if (length(action_idx)>1) {
      stop("Found more than one action matching action name: " %+% action)
    }
    action_idx_sel <- c(action_idx_sel, action_idx)
  }
  actions <- all_action_DAGs[action_idx_sel]
}

# Internal function for simulation data from any DAG
# If prev.data is not NULL all the nodes in DAG will be evaluated in the environment of prev.data alone
simFromDAG <- function(DAG, Nsamp, wide = TRUE, LTCF = NULL, rndseed = NULL, rndseed.reset.node = NULL, prev.data = NULL, verbose = getOption("simcausal.verbose")) {
  assertthat::assert_that(assertthat::is.count(Nsamp) || as.integer(Nsamp)==0L)
  if (!is.null(rndseed)) {
    set.seed(rndseed)
  }
  if (!is.DAG(DAG)) stop("DAG argument must be an object of class DAG")

  # SAMPLING FROM ARBITRARY NODE DISTR FUN:
  sampleNodeDistr <- function(newNodeParams, distr, EFUP.prev, cur.node, expr_str, asis.samp = FALSE) {
    N_notNA_samp <- sum(!EFUP.prev)
    user.env <- cur.node$node.env
    asis.flags <- attr(newNodeParams$dist_params, "asis.flags")
    # print("asis.flags: "); print(asis.flags)
    # if (all(asis.flags)) asis.samp <- TRUE

    standardize_param <- function(distparam) {
      check_len <- function(param) {
        len <- length(param)
        if (len==1) {
          param <- rep.int(param, N_notNA_samp)
        } else if (len==Nsamp) {
          param <- param[!EFUP.prev]
        } else {
          # stop("error while evaluating node "%+% cur.node$name %+%" expression(s): "%+%expr_str%+%".\n One of the distribution parameters evaluated to an incorrect vector length, check syntax.")
          if (verbose) message("evaluating node "%+% cur.node$name %+%" expression(s): "%+%expr_str%+%
              ".\n One of the distribution parameters evaluated to non-standard vector length (its neither 1 nor n), make sure the distribution function knows how to handle it.")
        }
        param
      }

      # print("distparam: "); print(distparam)
      # print("class(distparam): "); print(class(distparam))

      if (is(distparam,"list")) {
        distparam <- lapply(distparam, check_len)
        distparam <- do.call("cbind", distparam)
      } else if (is.vector(distparam)) {
        distparam <- check_len(distparam)
      } else if (is.matrix(distparam)) {
        if (nrow(distparam)==1) { # for mtx with 1 row -> repeat N_notNA_samp times:
          # distparam <- matrix(distparam, nrow = N_notNA_samp, ncol = ncol(distparam), byrow = TRUE)
        } else if (nrow(distparam)==Nsamp) { # for mtx with Nsamp rows -> subset mtx[!EFUP.prev,]
          distparam <- distparam[!EFUP.prev, , drop=FALSE]
        } else {
          # stop("error while evaluating node "%+% cur.node$name %+%" expression(s): "%+%expr_str%+%".\n One of the distribution parameters evaluated to an incorrect vector length, check syntax.")
          if (verbose) message("evaluating node "%+% cur.node$name %+%
              " expression(s): One of the distribution parameters evaluated to a matrix of non-standard dimensions (its neither 1 nor n rows), make sure the distribution function knows how to handle it.")
        }
      } else {
        # print("expr_str"); print(expr_str)
        # print(class(expr_str))
        # stop("...one of the distribution parameters evaluated to unsported data type...")
        # stop("error while evaluating node "%+% cur.node$name %+%" expression(s): "%+%expr_str%+%".\n One of the formulas evaluated to an unsported data type, check syntax.")
        if (verbose) message("The node " %+%  cur.node$name %+% " expression(s): "%+%expr_str%+%
            ".\n One of the formulas evaluated to unknown data type. Make sure the distribution function knowns how to handle it...")
      }
      distparam
    }

    # 1) Check & get the distribution function from the user-calling environment
    # 2) If it doesn't exist, search the package namespace, if not found throw an exception
    # More efficient version:
    if (is.null(distr.fun <- get0(distr, mode = "function"))) {
      if (!is.null(distr.fun <- get0(distr, envir = user.env, mode = "function"))) {
        if (verbose) message(distr %+% ": note this distribution could not be located in package namespace, simulating from user-defined distribution found under the same name")
      } else {
        namespace_distr <- unlist(strsplit(distr, "::"))
        distr.fun <- tryCatch(getFromNamespace(x = namespace_distr[2], ns = namespace_distr[1], envir = user.env))
        if (inherits(distr.fun, "try-error")) stop(distr %+% ": this node distribution function could not be located")
        # stop(distr %+%": this distribution function can\'t be found")
      }
    }

    # *) Go over distribution parameters and for each parameter:
    # check for consistency; make it an appropriate length; turn each list parameter into matrix;
    dprint("before standardize newNodeParams$dist_params:"); dprint(newNodeParams$dist_params)
    if (!asis.samp) {
      for (idx in seq_along(newNodeParams$dist_params)) {
        if (!asis.flags[[idx]] || (is.vector(newNodeParams$dist_params[[idx]]) && length(newNodeParams$dist_params[[idx]]) == Nsamp))
          newNodeParams$dist_params[[idx]] <- standardize_param(newNodeParams$dist_params[[idx]])
      }
    }
    dprint("after standardize newNodeParams$dist_params:"); dprint(newNodeParams$dist_params)

    # *) Add n argument to dist_params
    newNodeParams$dist_params <-  append(list(n=N_notNA_samp), newNodeParams$dist_params)

    # *) Call the distribution function, passing the distribution parameters:
    newVar_Samp <- try(do.call(distr.fun, newNodeParams$dist_params))

    if(inherits(newVar_Samp, "try-error")) {
      stop("failed while sampling node '" %+% cur.node$name %+% "', from distribution '" %+% distr %+% "'")
    }

    if (asis.samp) {
      return(newVar_Samp)
    # output is a vector
    } else if (is.null(dim(newVar_Samp))||is.vector(newVar_Samp)) {
      newVar <- rep.int(NA, Nsamp)
      if (is.factor(newVar_Samp)) {
        newVar[!EFUP.prev] <- as.numeric(levels(newVar_Samp))[newVar_Samp]
        newVar <- factor(newVar)
      } else {
        newVar[!EFUP.prev] <- newVar_Samp
      }
      return(newVar)
    # output is a matrix -> test dimensionality is correct (equal length(names)), but do nothing else
    } else if (is.matrix(newVar_Samp)) {
      if (ncol(newVar_Samp)!=length(cur.node$mv.names)) stop("dimensionality (number of columns) sampled by multivariate node is not equal to length of `name` argument for node: " %+% cur.node$name)
      colnames(newVar_Samp) <- cur.node$mv.names
      return(newVar_Samp)
    } else {
      stop("unrecognized sampled variable output type, only vectors or matrices are allowed, node: " %+% cur.node$name)
    }
  }

  #---------------------------------------------------------------------------------
  # Iteratively create observed DAG nodes (wide format data.fame of Nsamp observations)
  #---------------------------------------------------------------------------------
  EFUP.prev <- rep(FALSE, Nsamp)
  LTCF.prev <- rep(FALSE, Nsamp)

  obs.df <- data.frame(ID = seq(vector(length = Nsamp))) # obs.df <- data.frame(ID = seq(1:Nsamp))
  obs.dt <- data.table(obs.df)

  # for efficiency, allocate columns in DT to maximum number of variables, if it exceeds 1025 (default)
  DT.cols <- (length(DAG)+1)
  if (DT.cols > 1025) {
    alloc.col(obs.dt, DT.cols)
  }
  # alloc.col(obs.dt, max(min(length(DAG)+1, 1000), 200))

  #---------------------------------------------------------------------------------
  # CHECKS PERFORMED DURING EVALUTION:
  #---------------------------------------------------------------------------------
  # at each node check:
  # *) (formulas) check all formulas are evaluable R expressions (try(eval))
  # *) (formulas) check each cur.node formula references:
  #	node names that already exist, and all node names must have order < cur.node$order

  #---------------------------------------------------------------------------------
  # consider pre-allocating memory for newNodeParams and newVar
  #---------------------------------------------------------------------------------
  newVar <- vector(length = Nsamp)
  NvarsPerNode <- sapply(DAG, function(node) length(node[["mv.names"]]))
  totalNvars <- sum(NvarsPerNode)
  MVvars <- unlist(lapply(DAG, function(node) node[["mv.names"]]))
  MVvarMapNode <- NULL
  for (node in DAG) {
    for (mv.name in node[["mv.names"]]) MVvarMapNode <- append(MVvarMapNode, list(list(MVname = mv.name, Node = node$name)))
  }
  names(MVvarMapNode) <- MVvars

  NodeParentsNms <- vector("list", length = length(DAG))	# list that will have parents names for each node
  names(NodeParentsNms) <- names(DAG)

  # initialize the node formula evaluator class (Define_sVar):
  node_evaluator <- Define_sVar$new() # netind_cl = netind_cl
  t_pts <- NULL
  netind_cl <- NULL

  for (cur.node in DAG) {
    t <- cur.node$t # define variable for a current time point t
    t_new <- !(t %in% t_pts) # set to TRUE when switch to new time point t occurs otherwise FALSE
    t_pts <- c(t_pts, t)
    t_pts <- as.integer(unique(t_pts)) # vector that keeps track of unique timepoints t
    gnodename <- as.character(unlist(strsplit(cur.node$name, "_"))[1])

    #------------------------------------------------------------------------
    # CHECKS NODE DISTRIBUTIONS & EVALUATE NODE DISTRIBUTION PARAMS PROVIDED IN NODE FORMULA(S)
    #------------------------------------------------------------------------
    # TO DO:
    # Need to make sure that action attributes never get separate columns in df, since they are just constants
    # The attributes should be added to the eval env as variables
    # See if current expr naming strucutre in Define_sVar can be recycled for sampling from multivariate densities
    # NEED TO ADDRESS: WHEN A CONSTANT (length==1) IS PASSED AS A PARAMETER, DON'T ALWAYS WANT TO TURN IT INTO A VECTOR OF LENGTH n
    #------------------------------------------------------------------------

    # setting the node-specific user calling environment for the evaluator:
    # node_evaluator$set.user.env(cur.node$node.env)
    eval_expr_res <- node_evaluator$eval.nodeforms(cur.node = cur.node, data.df = if (!is.null(prev.data)) prev.data else obs.dt)
    par.names <- unique(unlist(lapply(eval_expr_res, '[[', "par.nodes")))
    eval_dist_params <- lapply(eval_expr_res, '[[' ,"evaled_expr")
    attr(eval_dist_params, "asis.flags") <- attr(cur.node$dist_params, "asis.flags")
    newNodeParams <- list(dist_params = eval_dist_params, par.names = par.names)

    # dprint("cur.node t:" %+% cur.node$t)
    # dprint("original nodeform expr:"); dprint(lapply(cur.node$dist_params, function(expr) as.character(expr)))
    # dprint("full eval_expr_res:"); dprint(eval_expr_res)
    # dprint("eval_dist_params:"); dprint(eval_dist_params)

    #------------------------------------------------------------------------
    # SAMPLE NEW COVARIATE BASED ON DISTR PARAMS
    #------------------------------------------------------------------------

    # reset the seed if the current node matches argument rndseed.reset.node
    if (!is.null(rndseed) && !is.null(rndseed.reset.node)) {
      if (rndseed.reset.node %in% cur.node$name) set.seed(NULL)
    }

    distr <- cur.node$distr
    if ("DAG.net" %in% class(cur.node)) {

      if (!is.null(netind_cl) && verbose) message("Network is being re-sampled during the same simulation!")
      distr <- cur.node$netfun

      # ***************************************************************
      # USING AS-IS FOR SAMPLING NETWORK BECAUSE THE OUTPUT IS A MATRIX. THIS IS A BAD WAY TO SOLVE THIS.
      # A BETTER SOLUTION IS TO ALLOW THE distr RESULT TO BE A VECTOR OR MATRIX (for multivariate RVs)
      # *****************************************************************
      NetIndMat <- sampleNodeDistr(newNodeParams = newNodeParams, distr = distr, EFUP.prev = EFUP.prev,
                                  cur.node = cur.node, expr_str = cur.node$dist_params, asis.samp = TRUE)
      cur.node$Kmax <- ncol(NetIndMat)
      netind_cl <- NetIndClass$new(nobs = Nsamp, Kmax = cur.node$Kmax)
      netind_cl$NetInd <- NetIndMat
      node_evaluator$set.net(netind_cl) # set the network for the evaluator:

      # Kmax.new <- ncol(NetIndMat)
      # if (as.integer(Kmax.new) != as.integer(cur.node$Kmax)) {
      #   if (verbose) message("Kmax is reset to a new value; old Kmax: " %+% cur.node$Kmax %+% "; new Kmax: " %+% Kmax.new)
      #   cur.node$Kmax <- Kmax.new
      # }
      # node_evaluator$Kmax <- netind_cl$Kmax
      newVar <- NULL
      NodeParentsNms <- NodeParentsNms[-which(names(NodeParentsNms) %in% cur.node$name)]

    } else {
      # print("cur.node: "); print(cur.node$name)
      # print("newNodeParams$par.names: "); print(newNodeParams$par.names)
      # print("MVvarMapNode"); print(MVvarMapNode)
      if (!is.null(newNodeParams$par.names)) {
        for (idx in seq_along(newNodeParams$par.names)) {
          choose_node_idx <- names(MVvarMapNode) %in% newNodeParams$par.names[idx]
          if (sum(choose_node_idx)>0) newNodeParams$par.names[idx] <- MVvarMapNode[choose_node_idx][[1]][["Node"]]
        }
        NodeParentsNms[[cur.node$name]] <- c(NodeParentsNms[[cur.node$name]], newNodeParams$par.names)
      }

      newVar <- sampleNodeDistr(newNodeParams = newNodeParams, distr = distr, EFUP.prev = EFUP.prev,
                                cur.node = cur.node, expr_str = cur.node$dist_params)
    }

    if (!is.null(newVar)) {
      # already performed inside sampleNodeDistr, so commented out:
      # newVar[EFUP.prev] <- NA
      # uncomment to assign 0 instead of NA for missing (after EOF=TRUE node evals to 1)
      # newVar[EFUP.prev] <- 0
      #------------------------------------------------------------------------
      # LTCF is a generic name for a failure/censoring node, after failure occurred (node=1), carry forward (impute) all future covariate values (for example, survival outcome with EFUP=TRUE)
      #------------------------------------------------------------------------
      if (!is.null(t)) {	# check time points were specified by user
        # check current t is past the first time-point & we want to last time-point carried forward (LTCF)
        if ((sum(LTCF.prev) > 0) & (t > t_pts[1]) & !is.null(LTCF)) {
          # Carry last observation forward for those who reached EFUP (failed or but not censored)
          prevtVarnm <- gnodename %+% "_" %+% (t-1)
          # Check first that the variable exists (has been defined at the previous time point), if it doesn't exist, just assign NA
          # if(!any(names(obs.df)%in%prevtVarnm)) {
          if(!any(names(obs.dt)%in%prevtVarnm)) {
            warning(gnodename%+%": is undefined at t="%+%(t-1)%+%", hence cannot be carried forward to t="%+%t)
            newVar[LTCF.prev] <- NA
          } else {
            # newVar[LTCF.prev] <- obs.df[LTCF.prev, (gnodename %+% "_" %+% (t-1))]
            # newVar[LTCF.prev] <- obs.dt[LTCF.prev, (gnodename %+% "_" %+% (t-1)), with = FALSE]
            newVar[LTCF.prev] <- obs.dt[LTCF.prev, ][[(gnodename %+% "_" %+% (t-1))]]
          }
        }
        # carry forward the censoring indicator as well (after evaluating to censored=1) - disabled call doLTCF() with censoring node name instead
        # if ((!is.null(LTCF))) {
        # 	if (is.EFUP(cur.node) & (!(LTCF%in%gnodename))) {
        # 		newVar[EFUP.prev & !(LTCF.prev)] <- 1
        # 	}
        # }
      }
      #------------------------------------------------------------------------
      # modify the observed data by adding new sampled covariate
      # *** TODO: need to internally save the evaluation results EFU.flag for the following lines to work correctly:
        # in eval.target():
        # vec_EFUP <- sapply(N(DAG)[outnode_nms], is.EFUP)
        # in eval.MSM():
        # vec_EFUP <- sapply(DAG[outnode_nms], is.EFUP)
        # in doLTCF():
        # if  (is.EFUP(cur.node)&(cur.outcome))
      #------------------------------------------------------------------------

      # 2 ways to assign several new vars with data.table (1st is the fastest)
      if (is.matrix(newVar)) {
        for (col in cur.node$mv.names) {
          obs.dt[!EFUP.prev, (col) := newVar[, col]]
          # obs.df <- within(obs.df, {assign(col, newVar[, col])})
        }
      } else {
        obs.dt[, (cur.node$name) := newVar]
        # obs.df <- within(obs.df, {assign(cur.node$name, newVar)})
      }
      # another version, requires making a copy of newVar
      # obs.dt[,(cur.node$name):=as.data.table(newVar)]

      if (!is.null(cur.node$EFU)) {
        # EFU is now evaluated separately for each observation, only when evaluates to TRUE then the current node starts acting as a censoring variable
        # EFU.flag <- node_evaluator$eval.EFU(cur.node = cur.node, data.df = if (!is.null(prev.data)) prev.data else obs.df)
        EFU.flag <- node_evaluator$eval.EFU(cur.node = cur.node, data.df = if (!is.null(prev.data)) prev.data else obs.dt)
        EFU.TRUE <- EFU.flag %in% TRUE
        # EFU.TRUE <- which(EFU.flag %in% TRUE)
        # print("EFU.flag: "); print(EFU.flag); print(EFU.flag %in% TRUE)
        # check for censoring only when at least one observation had EFU=TRUE
        if (sum(EFU.TRUE)>0) {
          # EFUP.now <- (obs.df[,ncol(obs.df)]%in%1) & (EFU.TRUE)
          EFUP.now <- (obs.dt[[cur.node$name[1]]]%in%1) & (EFU.TRUE)
          EFUP.prev <- (EFUP.prev | EFUP.now)
          if ((!is.null(LTCF)) && (LTCF%in%gnodename[1]) && is.null(prev.data)) { # is this node the one to be carried forward (LTCF node)? mark the observations with value = 1 (carry forward)
            LTCF.prev <- (LTCF.prev | EFUP.now)
          }
        }
      }
    }
  }

  # Collect all attributes to be assigned to the obs data
  # all_ts <- get_allts(obs.df) # calculate actual time values used in the simulated data
  all_ts <- get_allts(obs.dt) # calculate actual time values used in the simulated data

  if (sum(LTCF.prev)>0) {	# only change the LTCF attribute if outcome carry forward imputation was really carried out for at least one obs
    LTCF_flag <- LTCF
  } else {
    LTCF_flag <- NULL
  }

  # -----------------------------------------------------------------------------------
  # Remove latent variables from the simulated dataset (if any were declared as latent)
  # -----------------------------------------------------------------------------------
  latent.v <- attr(DAG, "latent.v")
  if (!is.null(latent.v)) {
    # excl.cols <- colnames(obs.df)%in%latent.v
    excl.cols <- colnames(obs.dt)%in%latent.v
    if (sum(excl.cols) > 0) {
      # obs.df <- obs.df[,!excl.cols]
      obs.dt[,(colnames(obs.dt)[excl.cols]) := NULL]
    }
  }

  newattrs <- FormAttributes(DAG = DAG, parents = NodeParentsNms, dataform = "wide", LTCF = LTCF_flag, tvals = all_ts, netind_cl = netind_cl)
  newattrs_names <- names(newattrs)
  for (attr_name in newattrs_names) {
    setattr(x = obs.dt, name = attr_name, value = newattrs[[attr_name]])
  }

  if (!wide) {
    if (length(all_ts)>1) {		# only perform conversion when there is more than one t in the simulated data
      # obs.df <- DF.to.long(obs.df)
      obs.dt <- DF.to.longDT(obs.dt)
    } else {
      warning("Simulated data returned in wide format. Can't convert to long format, since only one time-point was detected.")
    }
  }
  class(obs.dt) <- "data.frame"
  # reset the seed to NULL if it was previously set
  if (!is.null(rndseed)) {
    set.seed(NULL)
  }
  # return(obs.df)
  return(obs.dt)
}

#' Simulate Observed Data
#'
#' This function simulates observed data from a DAG object.
#' @param DAG A DAG objects that has been locked with set.DAG(DAG). Observed data from this DAG will be simulated.
#' @param n Number of observations to sample.
#' @param wide A logical, if TRUE the output data is generated in wide format, if FALSE, the output longitudinal data in generated in long format
#' @param LTCF If forward imputation is desired for the missing variable values, this argument should be set to the name of the node that indicates the end of follow-up event. See the vignette, \code{\link{sim}} and \code{\link{doLTCF}} for additional details.
#' @param rndseed Seed for the random number generator.
#' @param rndseed.reset.node When \code{rndseed} is specified, use this argument to specify the name of the \code{DAG} node at which the random number generator seed is reset back to \code{NULL} (simulation function will call \code{set.seed(NULL)}).
#' Can be useful if one wishes to simulate data using the set seed \code{rndseed} only for the first K nodes of the DAG and use an entirely random sample when simulating the rest of the nodes starting at K+1 and on.
#' The name of such (K+1)th order \code{DAG} node should be then specified with this argument.
#' @param verbose Set to \code{TRUE} to print messages on status and information to the console.
#'  Turn this off by default using options(simcausal.verbose=FALSE).
#' @return A \code{data.frame} where each column is sampled from the conditional distribution specified by the corresponding \code{DAG} object node.
#' @family simulation functions
#' @seealso \code{\link{simfull}} - a wrapper function for simulating full data only; \code{\link{sim}} - a wrapper function for simulating both types of data; \code{\link{doLTCF}} for forward imputation of the missing values in already simulating data; \code{\link{DF.to.long}}, \code{\link{DF.to.longDT}} - converting longitudinal data from wide to long formats.
#' @export
simobs <- function(DAG, n, wide = TRUE, LTCF = NULL, rndseed = NULL, rndseed.reset.node = NULL, verbose = getOption("simcausal.verbose")) {
  if (!is.DAG(DAG)) stop("DAG argument must be an object of class DAG")
  simFromDAG(DAG=DAG, Nsamp=n, wide=wide, LTCF=LTCF, rndseed=rndseed, rndseed.reset.node=rndseed.reset.node, verbose=verbose)
}

#' Simulate Full Data (From Action DAG(s))
#'
#' This function simulates full data based on a list of intervention DAGs, returning a list of \code{data.frame}s.
#' @param actions Actions specifying the counterfactual DAG. This argument must be either an object of class DAG.action or a list of DAG.action objects.
#' @param n Number of observations to sample.
#' @param wide A logical, if TRUE the output data is generated in wide format, if FALSE, the output longitudinal data in generated in long format
#' @param LTCF If forward imputation is desired for the missing variable values, this argument should be set to the name of the node that indicates the end of follow-up event. See the vignette, \code{\link{sim}} and \code{\link{doLTCF}} for additional details.
#' @param rndseed Seed for the random number generator.
#' @param rndseed.reset.node When \code{rndseed} is specified, use this argument to specify the name of the \code{DAG} node at which the random number generator seed is reset back to \code{NULL} (simulation function will call \code{set.seed(NULL)}).
#' Can be useful if one wishes to simulate data using the set seed \code{rndseed} only for the first K nodes of the DAG and use an entirely random sample when simulating the rest of the nodes starting at K+1 and on.
#' The name of such (K+1)th order \code{DAG} node should be then specified with this argument.
#' @param verbose Set to \code{TRUE} to print messages on status and information to the console.
#'  Turn this off by default using options(simcausal.verbose=FALSE).
#' @return A named list, each item is a \code{data.frame} corresponding to an action specified by the actions argument, action names are used for naming these list items.
#' @family simulation functions
#' @seealso \code{\link{simobs}} - a wrapper function for simulating observed data only; \code{\link{sim}} - a wrapper function for simulating both types of data; \code{\link{doLTCF}} for forward imputation of the missing values in already simulating data; \code{\link{DF.to.long}}, \code{\link{DF.to.longDT}} - converting longitudinal data from wide to long formats.
#' @export
simfull <- function(actions, n, wide = TRUE, LTCF = NULL, rndseed = NULL, rndseed.reset.node = NULL, verbose = getOption("simcausal.verbose")) {
  if (!is.null(rndseed)) {
    set.seed(rndseed)
  }
  if (("DAG.action" %in% class(actions))) {
    actname <- get.actname(actions)
    actions <- list(actions)
    names(actions) <- actname
  } else if ("list" %in% class(actions)) {
    checkDAGs <- sapply(actions, is.DAG.action)
    if (!all(checkDAGs)) stop("argument 'actions': all elements of the list must be DAG.action objects")
  } else {
    stop("argument 'actions' must be an object of class 'DAG.action' or a list of 'DAG.action' objects")
  }

  fulldf.list <- list()	# list that will contain full data (each item = one dataframe per intervention)
  for (idx_act in c(1:length(actions))) {	# loop over each action/intervention/regimen DAG
    actname <- names(actions)[idx_act]
    actDAG <- actions[[idx_act]]
    fulldf <- try(simFromDAG(DAG=actDAG, Nsamp=n, wide=wide, LTCF=LTCF, rndseed=rndseed, rndseed.reset.node=rndseed.reset.node, verbose=verbose))
    if(inherits(fulldf, "try-error")) {
    	stop("simulating full data for action '"%+%actname%+%"' resulted in error...", call. = FALSE)
    }
    # adding action indicator column to the data.frame
    # newattrs <- CopyAttributes(attrslist=attributes(fulldf)) 	# save attributes
    # actindvar <- rep.int(actname, times=nrow(fulldf))
    # fulldf <- cbind(action=actindvar, fulldf)
    # attributes(fulldf) <- c(attributes(fulldf), newattrs)
    # attr(fulldf, "node_nms") <- c(actname, attr(fulldf, "node_nms"))
    # names(attr(fulldf, "node_nms"))[1] <- actname
    fulldf.list <- c(fulldf.list, list(fulldf))
  }
  if (length(actions)!=length(fulldf.list)) stop("simfull(): couldn't create separate df for each action DAG")
  names(fulldf.list) <- names(actions)
  return(fulldf.list)
}

#' Simulate Observed or Full Data from \code{DAG} Object
#'
#' This function simulates full data based on a list of intervention DAGs, returning a list of \code{data.frame}s. See the vignette for examples and detailed description.
#' @section Forward Imputation:
#' By default, when LTCF is left unspecified, all variables that follow after any end of follow-up (EFU) event are set to missing (NA).
#' The end of follow-up event occurs when a binary node of type \code{EFU=TRUE} is equal to 1, indicating a failing or right-censoring event.
#' To forward impute the values of the time-varying nodes after the occurrence of the \code{EFU} event, set the LTCF argument to a name of the EFU node representing this event.
#' For additional details and examples see the vignette and \code{\link{doLTCF}} function.
#' @param DAG A DAG objects that has been locked with set.DAG(DAG). Observed data from this DAG will be simulated if actions argument is omitted.
#' @param actions Character vector of action names which will be extracted from the DAG object. Alternatively, this can be a list of action DAGs selected with \code{A(DAG)} function, in which case the argument \code{DAG} is unused. When \code{actions} is omitted, the function returns simulated observed data (see \code{simobs}).
#' @param n Number of observations to sample.
#' @param wide A logical, if TRUE the output data is generated in wide format, if FALSE, the output longitudinal data in generated in long format
#' @param LTCF If forward imputation is desired for the missing variable values, this argument should be set to the name of the node that indicates the end of follow-up event.
#' @param rndseed Seed for the random number generator.
#' @param rndseed.reset.node When \code{rndseed} is specified, use this argument to specify the name of the \code{DAG} node at which the random number generator seed is reset back to \code{NULL} (simulation function will call \code{set.seed(NULL)}).
#' Can be useful if one wishes to simulate data using the set seed \code{rndseed} only for the first K nodes of the DAG and use an entirely random sample when simulating the rest of the nodes starting at K+1 and on.
#' The name of such (K+1)th order \code{DAG} node should be then specified with this argument.
#' @param verbose Set to \code{TRUE} to print messages on status and information to the console.
#'  Turn this off by default using options(simcausal.verbose=FALSE).
#' @return If actions argument is missing a simulated data.frame is returned, otherwise the function returns a named list of action-specific simulated data.frames with action names giving names to corresponding list items.
#' @family simulation functions
#' @seealso \code{\link{simobs}} - a wrapper function for simulating observed data only; \code{\link{simfull}} - a wrapper function for simulating full data only; \code{\link{doLTCF}} - forward imputation of the missing values in already simulating data; \code{\link{DF.to.long}}, \code{\link{DF.to.longDT}} - converting longitudinal data from wide to long formats.
#' @example tests/examples/sim.impute.examples12.R
#' @references Sofrygin O, van der Laan MJ, Neugebauer R (2017).
#' "simcausal R Package: Conducting Transparent and Reproducible Simulation Studies of Causal Effect Estimation with Complex Longitudinal Data."
#' Journal of Statistical Software, 81(2), 1-47. doi: 10.18637/jss.v081.i02.
#' @export
sim <- function(DAG, actions, n, wide = TRUE, LTCF = NULL, rndseed = NULL, rndseed.reset.node = NULL, verbose = getOption("simcausal.verbose")) {
  # *) check if actions argument is missing -> simulate observed data from the DAG
  # *) if actions consist of characters, try to extract those actions from the DAG and simulate full data
  if (!is.integerish(n)) stop("n must be an integer")
  # if (!(n >= 1)) stop("n must be a positive integer")

  # SIMULATE OBSERVED DATA FROM DAG (if no actions)
  if (missing(actions)) {
    if (!is.DAG(DAG)) stop("DAG argument must be an object of class DAG")
    if (!is.DAGlocked(DAG)) stop("call set.DAG() before attempting to simulate data from DAG")
    if (verbose) message("simulating observed dataset from the DAG object")
    return(simobs(DAG = DAG, n = n, wide = wide, LTCF = LTCF, rndseed = rndseed, rndseed.reset.node = rndseed.reset.node, verbose = verbose))
  # SIMULATE FULL DATA FROM ACTION DAGs (when actions are present)
  } else {
    if (is.character(actions)) { # grab appropriate actions from the DAG by name
      if (!is.DAG(DAG)) stop("DAG argument must be an object of class DAG")
      actions <- getactions(DAG, actions)
    } else if (is.list(actions)) {
      checkDAGs <- sapply(actions, is.DAG)
      if (!all(checkDAGs)) stop("actions must be either a list of DAGs or a vector of action names")
    } else {
      stop("argument actions must be a character vector of action names or a list of action DAGs")
    }
    if (verbose) message("simulating action-specific datasets for action(s): "%+%paste(names(actions), collapse = " "))
    return(simfull(actions = actions, n = n, wide = wide, LTCF = LTCF, rndseed = rndseed, rndseed.reset.node = rndseed.reset.node, verbose = verbose))
  }
}

#' Missing Variable Imputation with Last Time Point Value Carried Forward (LTCF)
#'
#' Forward imputation for missing variable values in simulated data after a particular end of the follow-up event. The end of follow-up event is defined by the node of type \code{EOF=TRUE} being equal to 1.
#' @section Details:
#' The default behavior of the \code{sim} function consists in setting all nodes that temporally follow an \code{EFU} node whose simulated value is 1 to missing (i.e., \code{NA}).
#' The argument \code{LTCF} of the \code{sim} function can however be used to change this default behavior and impute some of these missing values with \emph{last time point value carried forward} (LTCF).
#' More specifically, only the missing values of time-varying nodes (i.e., those with non-missing \code{t} argument) that follow the end of follow-up event encoded by the \code{EFU} node specified by the \code{LTCF} argument will be imputed.
#' One can use the function \code{doLTCF} to apply the \emph{last time point value carried forward} (LTCF) imputation to an existing simulated dataset obtained from the function \code{sim} that was called with its default imputation setting (i.e., with no \code{LTCF} argument).
#' Illustration of the use of the LTCF imputation functionality are provided in the package vignette.
#'
#' The first example below shows the default data format of the \code{sim} function after an end of the follow-up event and how this behavior can be modified to generate data with LTCF imputation by either using the \code{LTCF} argument of the
#' \code{sim} function or by calling the \code{doLTCF} function. The second example demonstrates how to use the \code{doLTCF} function to perform LTCF imputation on already existing data simulated with the \code{sim} function based on its default non-imputation behavior.
#' @param data Simulated \code{data.frame} in wide format
#' @param LTCF Character string specifying the outcome node that is the indicator of the end of follow-up (observations with value of the outcome variable being 1 indicate that the end of follow-up has been reached). The outcome variable must be a binary node that was declared with \code{EFU=TRUE}.
#' @return Modified \code{data.frame}, all time-varying missing variables after the \code{EFU} outcome specified in \code{LTCF} are forward imputed with their last available non-missing value.
#' @family data manipulation functions
#' @seealso \code{\link{sim}}, \code{\link{simobs}} and \code{\link{simfull}} for simulating data with and without carry forward imputation.
#' @example tests/examples/sim.impute.examples12.R
#' @export
doLTCF <- function(data, LTCF) {
  DAG <- attr(data, "DAG")
  Nsamp <- nrow(data)
  LTCF.prev <- rep(FALSE,Nsamp)
  t_pts <- NULL
  cur.outcome <- FALSE
  if (is.longfmt(data)) stop("Last Timepoint Carry Forward (LTCF) can only be executed for data in wide format")
  #------------------------------------------------------------------------
  # Loop over nodes in DAG and keep track of EFUP observations in the data (already sampled)
  #------------------------------------------------------------------------
  # all_node_names <- as.character(unlist(lapply(DAG, "[[", "mv.names")))
  # [[1]][["mv.names"]]
  # for (cur.node in DAG) {
  #   print(cur.node[["name"]])
  # }
  for (cur.node in DAG) {
  # for (cur.node %in% all_node_names)
    # gnodename <- as.character(unlist(strsplit(cur.node$name, "_"))[1])
    # cur.node.name <- cur.node[["mv.names"]]
    gnodename <- as.character(unlist(lapply(strsplit(cur.node[["mv.names"]], "_"), "[[", 1)))
    # gnodename <- as.character(unlist(strsplit(cur.node[["mv.names"]], "_"))[1])
    # gnodename <- as.character(unlist(strsplit(cur.node, "_"))[1])

    t <- cur.node$t # define variable for a current time point t
    t_new <- !(t%in%t_pts) # set to TRUE when switch to new time point t occurs otherwise FALSE
    t_pts <- c(t_pts,t); t_pts <- as.integer(unique(t_pts)) # vector that keeps track of unique timepoints t
    if (gnodename[1]%in%LTCF) { # if this generic node name is the outcome we need to carry forward this observation when it evaluates to 1
      cur.outcome <- TRUE
    } else {
      cur.outcome <- FALSE
    }
    if (!is.null(t)) {  # check time points were specified by user
      # check current t is past the first time-point & we want to last time-point carried forward (LTCF)
      if ((sum(LTCF.prev) > 0) & (t > t_pts[1])) { # Carry last observation forward for those who reached EFUP (outcome value = 1)
        if (!gnodename[1]%+%"_"%+%(t-1) %in% names(data)){
          stop("Cannot perform last time point carry forward (LTCF) imputation. No node name '" %+% gnodename%+%"_"%+%(t-1) %+% "' found. Please make sure to define appropriate time-point for this node.")
        }
        for (name in gnodename)
          data[LTCF.prev, name%+%"_"%+%(t)] <- data[LTCF.prev, (name%+%"_"%+%(t-1))]
      }
    }
    if  (is.EFUP(cur.node)&(cur.outcome)) { # if cur.node is EFUP=TRUE type set all observations that had value=1 to LTCF.prev[indx]=TRUE
      LTCF.prev <- (LTCF.prev | (data[,cur.node$name]%in%1))
    }
    # browser()
    # if (!is.null(cur.node[["mv.names"]])) {
    #   for (cur.node in cur.node[["mv.names"]]) {
    #   }
    # }
  }



  if (sum(LTCF.prev)>0) {	# only change the LTCF attribute if outcome carry forward imputation was carried out for at least one obs
    LTCF_flag <- LTCF
  } else {
    LTCF_flag <- NULL
  }
  attr(data, "LTCF") <- LTCF_flag
  # newattrs <- CopyAttributes(attrslist=attrslist, LTCF=LTCF_flag) 	# save attributes from input format data
  # attributes(simdf_long) <- c(attributes(simdf_long), newattrs)
  return(data)
}

#' Convert Data from Wide to Long Format Using \code{reshape}
#'
#' This utility function takes a simulated data.frame in wide format as an input and converts it into a long format (slower compared to \code{\link{DF.to.longDT}}).
#'
#' Keeps all covariates that appear only once and at the first time-point constant (carry-forward).
#'
#' All covariates that appear fewer than range(t) times are imputed with NA for missing time-points.
#'
#' Observations with all NA's for all time-varying covariates are removed.
#'
#' When removing NA's the time-varying covariates that are attributes (attnames) are not considered.
#'
#' @param df_wide A \code{data.frame} in wide format
#' @return A \code{data.frame} object in long format
#' @seealso \code{\link{DF.to.longDT}} - a faster version of \code{DF.to.long} that uses \code{data.table} package
#' @family data manipulation functions
#' @export
DF.to.long <- function(df_wide) {
  if (is.data.table(df_wide)) class(df_wide) <- "data.frame"

  Nsamp <- nrow(df_wide)
  all_ts <- get_allts(df_wide) # calculate actual time values used in the simulated data
  attnames <- attr(df_wide, "attnames")
  node_nms <- attr(df_wide, "node_nms")
  node_nms_split <- strsplit(node_nms, "_")
  node_nms_vec <- sapply(node_nms_split, '[', 1)
  node_nms_unq <- unique(node_nms_vec)

  # if there are no time-points (t) attributes, then the long vs. wide format is undefined.
  if (length(all_ts)==0) return(df_wide)

  varying <- list()
  v.names <- NULL
  bsl.names <- NULL
  for (cur_node_name in node_nms_unq) {
    idx_node <- sapply(node_nms_split, function(t_nodename) t_nodename[1]%in%cur_node_name)
    count_ts <- sum(idx_node)
    node_t_vals <- unlist(sapply(node_nms_split, function(t_nodename) if(t_nodename[1]%in%cur_node_name) as.integer(t_nodename[2])))
    # if node name has no "_", assume its bsl and assign first time-point:
    if (all(is.na(node_t_vals))) node_t_vals <- all_ts[1]

    dprint("cur_node_name: "%+%cur_node_name);
    dprint("all_ts"); dprint(all_ts)
    # dprint("idx_node"); dprint(idx_node)
    dprint("count_ts"); dprint(count_ts)
    dprint("node_t_vals"); dprint(node_t_vals)

    if (count_ts==length(all_ts)) { # the number of time-points is equal to all time-points => # everything is well defined
      varying <- c(varying, list(node_nms[idx_node]))
      v.names <- c(v.names, cur_node_name)
    } else if ((count_ts==1) & (all_ts[all_ts%in%node_t_vals][1]==all_ts[1])) { # only one time-point and its the first-one
      # node is baseline: everything is fine, rename baseline var removing the _t
      bsl.dat.name <- node_nms[which(node_nms_vec%in%cur_node_name)][1]
      bsl.names <- c(bsl.names, cur_node_name)
      if (bsl.dat.name!=cur_node_name) {
        names(df_wide)[names(df_wide) %in% bsl.dat.name] <- cur_node_name
      }
    } else { # in between 1 & 2 => need to impute to length(all_ts) filling up with NAs
      ts_to_NAs <- all_ts[!all_ts%in%node_t_vals]
      ts_not_NAs <- all_ts[all_ts%in%node_t_vals]
      NA_nodenames <- cur_node_name%+%"_"%+%ts_to_NAs # define these node names as missing and add them to df
      df_NAts <- matrix(NA, nrow=Nsamp, ncol=length(ts_to_NAs))
      colnames(df_NAts) <- NA_nodenames
      df_wide <- cbind(df_wide, df_NAts)
      all_vnames <- cur_node_name%+%"_"%+%all_ts
      varying <- c(varying, list(all_vnames))
      v.names <- c(v.names, cur_node_name)
    }
  }
  simdf_long <- reshape(data=df_wide, varying=varying, v.names=v.names, timevar = "t", times = all_ts, sep="_", direction="long")
  simdf_long <- simdf_long[order(simdf_long$ID, simdf_long$t),] # 1) long format is sorted by t first, then ID -> needs to be sorted by ID then time
  simdf_long <- simdf_long[,-ncol(simdf_long)]  # 2) remove last column and reorder columns ID, t, ...
  if (length(attnames)>0) v.names <- v.names[!v.names%in%attnames]
  dprint("v.names"); dprint(v.names)
  # dprint("simdf_long after reshape"); dprint(simdf_long)
  simdf_long <- simdf_long[rowSums(is.na(simdf_long[,v.names]))<length(v.names),] # 3) remove last NA rows (after censoring or EFUP)
  newattrs <- CopyAttributes(attrslist=attributes(df_wide), dataform="long") 	# save attributes from input wide format data
  attributes(simdf_long) <- c(attributes(simdf_long), newattrs)
  attr(simdf_long, "v.names") <- v.names
  simdf_long
}

#' @import data.table
NULL
#' Faster Conversion of Data from Wide to Long Format Using \code{dcast.data.table}
#'
#' Faster utility function for converting wide-format \code{data.frame} into a long format.
#' Internally uses \pkg{data.table} package functions \code{melt.data.table} and \code{dcast.data.table}.
#'
#' Keeps all covariates that appear only once and at the first time-point constant (carry-forward).
#'
#' All covariates that appear fewer than range(t) times are imputed with NA for missing time-points.
#'
#' Observations with all NA's for all time-varying covariates are removed.
#'
#' When removing NA's the time-varying covariates that are attributes (attnames) are not considered.
#'
#' @param df_wide A \code{data.frame} or \code{data.table} in wide format
#' @param return_DF \code{TRUE} (default) to return a \code{data.frame}, \code{FALSE} returns a \code{data.table}
#' @return A \code{data.frame} in long format
#' @family data manipulation functions
#' @export
DF.to.longDT <- function(df_wide, return_DF = TRUE) {
  assertthat::assert_that(assertthat::is.flag(return_DF))
  assertthat::assert_that(data.table::is.data.table(df_wide)||is.data.frame(df_wide))
  # wrapping any call inside this function call will suppress all warnings:
  SuppressAllWarnings <- function(expr) {
    h <- function (w) {
      if (!is.null(w$message)) invokeRestart("muffleWarning")
    }
    withCallingHandlers(expr, warning = h)
  }
  # old attributes saved:
  newattrs <- CopyAttributes(attrslist=attributes(df_wide), dataform="long") 	# save attributes from input wide format data
  Nsamp <- nrow(df_wide)
  all_ts <- get_allts(df_wide) # calculate actual time values used in the simulated data
  attnames <- attr(df_wide, "attnames")
  # node_nms <- attr(df_wide, "node_nms")
  node_nms <- colnames(df_wide)
  # browser()
  node_nms_split <- strsplit(node_nms, "_")
  # node_nms_split2 <- strsplit(node_nms, '.', fixed = TRUE)
  node_nms_vec <- sapply(node_nms_split, '[', 1)
  # sapply(node_nms_split2, '[', 1)

  node_nms_unq <- unique(node_nms_vec)
  varying <- list()
  v.names <- NULL
  bsl.names <- NULL

  dprint("all_ts"); dprint(all_ts)
  dprint("node_nms"); dprint(node_nms)

  if (exists("setDTthreads")) setDTthreads(1)

  # if there are no time-points (t) attributes, then the long vs. wide format is undefined.
  if (length(all_ts)==0) return(df_wide)
  #******************************************************************************************************
  # this will create a copy of the object, wont modify the original object in the calling environment
  if (!is.data.table(df_wide)) df_wide <- data.table(df_wide)
  # !!!!! NOTE: THIS WILL convert the data.frame in the calling environment to data.table!!!!!!
  # To cancel that run class(df_wide) <- "data.frame" afterwards
  # data.table::setDT(df_wide, giveNames=TRUE, keep.rownames=FALSE)  # convert data.frame to data.table by reference (without copying)
  #******************************************************************************************************
  DF.to.longDT_run <- function(dat_df, t_pts, lvars, bslvars=NULL) {
    t_vec <- "_"%+%(t_pts)

    for (var_nm in lvars) { # one TV var at a time approach
      value_vars <- var_nm%+%t_vec
      SuppressAllWarnings(
        DT_melt <- melt(dat_df, id.vars = "ID", measure.vars = value_vars, variable.factor = TRUE, na.rm = FALSE)
      )
      # DT_melt <- melt.data.table(dat_df, id.vars="ID", measure.vars=value_vars, variable.factor=TRUE, na.rm=FALSE)
      # DT_melt <- data.table::melt(dat_df, id.vars="ID", measure.vars=value_vars, variable.factor=TRUE, na.rm=FALSE)
      var_nm_rep <- rep.int(var_nm, nrow(DT_melt))
      t_rep <- rep(t_pts, each=nrow(dat_df))
      DT_melt[, c("LVname","t") := list(var_nm_rep,t_rep)]
      DT_l <- data.table::dcast.data.table(DT_melt, t + ID ~ LVname)
      data.table::setkeyv(DT_l, c("ID", "t"))
      # data.table::setkey(DT_l, ID, t)
      if (which(lvars%in%var_nm)==1) {
        DT_l_fin <- DT_l
      } else {
        DT_l_fin[, (var_nm) := DT_l[[var_nm]]]
      }
    }
    if (!is.null(bslvars)) {
      DT_l_bsl <- dat_df[,unique(c("ID",bslvars)), with=FALSE]
      # data.table::setkey(DT_l_bsl, ID)  # needs to be sorted
      data.table::setkeyv(DT_l_bsl, "ID")  # needs to be sorted
      DT_l_fin <- DT_l_bsl[DT_l_fin]
    }
    DT_l_fin
  }
  # for each node, determinine if its baseline or time-varying
  # if time-varying and columns are missing for some of its time-points => impute those time-points as NA (last ifelse)
  for (cur_node_name in node_nms_unq) {
    idx_node <- sapply(node_nms_split, function(t_nodename) t_nodename[1]%in%cur_node_name)
    count_ts <- sum(idx_node)
    node_t_vals <- unlist(sapply(node_nms_split, function(t_nodename) if(t_nodename[1]%in%cur_node_name) as.integer(t_nodename[2])))
    # if node name has no "_", assume its bsl and assign first time-point:
    if (all(is.na(node_t_vals))) node_t_vals <- all_ts[1]
    dprint("cur_node_name: "%+%cur_node_name);
    dprint("all_ts"); dprint(all_ts)
    # dprint("idx_node"); dprint(idx_node)
    dprint("count_ts"); dprint(count_ts)
    dprint("node_t_vals"); dprint(node_t_vals)

    if (count_ts==length(all_ts)) { # the number of time-points is equal to all time-points =>  everything is well defined
      varying <- c(varying, list(node_nms[idx_node]))
      v.names <- c(v.names, cur_node_name)
    } else if ((count_ts==1) & (all_ts[all_ts%in%node_t_vals][1]==all_ts[1])) { # only one time-point and its the first-one
      # node is baseline: everything is fine, do nothing, just save its name
      bsl.dat.name <- node_nms[which(node_nms_vec%in%cur_node_name)][1]
      bsl.names <- c(bsl.names, cur_node_name)
      if (bsl.dat.name!=cur_node_name) {
        setnames(df_wide, bsl.dat.name, cur_node_name)
      }
    } else { # in between 1 & 2 => need to impute to length(all_ts) filling up with NAs
      ts_to_NAs <- all_ts[!all_ts%in%node_t_vals]
      ts_not_NAs <- all_ts[all_ts%in%node_t_vals]
      NA_nodenames <- cur_node_name%+%"_"%+%ts_to_NAs # define these node names as missing and add them to df
      df_wide[, (NA_nodenames) := NA]
      all_vnames <- cur_node_name%+%"_"%+%all_ts
      varying <- c(varying, list(all_vnames))
      v.names <- c(v.names, cur_node_name)
    }
  }
  dprint("v.names"); dprint(v.names)
  dprint("bsl.names"); dprint(bsl.names)
  #*** the baseline variable names are also passed as last arg
  simDT_long <- DF.to.longDT_run(dat_df=df_wide, t_pts=all_ts, lvars=v.names, bslvars=bsl.names)
  if (length(attnames)>0) v.names <- v.names[!v.names%in%attnames]
  dprint("v.names after rem attrs"); dprint(v.names)
  simDT_long <- simDT_long[rowSums(is.na(simDT_long[,v.names, with=FALSE]))<length(v.names),] # 3) remove last NA rows (after censoring or EFUP)

  # attributes(simDT_long) <- c(attributes(simDT_long), newattrs)
  newattrs_names <- names(newattrs)
  for (attr_name in newattrs_names) {
    setattr(x = simDT_long, name = attr_name, value = newattrs[[attr_name]])
  }
  # attr(simDT_long, "v.names") <- v.names
  setattr(x = simDT_long, name = "v.names", value = v.names)

  if (return_DF) class(simDT_long) <- "data.frame"

  return(simDT_long)
}

# NOT IMPLEMENTED
# will convert the dataset back to wide format from long, get the necessary attributes from simdf_long
# convert_to_wide <- function(simdf_long) {
# 	# Nsamp <- nrow(df_wide)
#     # all_ts <- get_allts(simdf_long) # calculate actual time values used in the simulated data
#     attnames <- attr(simdf_long, "attnames")
#     node_nms <- attributes(simdf_long)$names[-1]
#     # node_nms_split <- strsplit(node_nms, "_")
#     # node_nms_unq <- unique(sapply(node_nms_split, '[', 1))
# 	varying <- list()
# 	v.names <- NULL
# 	simdf_wide <- reshape(data=simdf_long, idvar=, varying=varying, v.names=v.names, timevar = "t", times = all_ts, sep="_", direction="wide")
# 	simdf_wide
# }
osofr/simcausal documentation built on Oct. 21, 2022, 3:09 a.m.