R/grur_imputations.R

Defines functions grur_imputations

Documented in grur_imputations

# grur imputations module

#' @name grur_imputations
#' @title Map-independent imputations of missing genotypes
#'
#' @description The goal of this module is to provide a simple solution for
#' a complicated problem: missing genotypes in RADseq genomic datasets.
#' This function will performed \strong{map-independent imputations} of missing
#' genotypes.
#'
#' \strong{Key features:}
#'
#' \itemize{
#' \item \strong{Imputation methods: } several machine learning algorithms;
#' Random forests (predictive and on-the-fly-imputations); 
#' Extreme gradient tree boosting (XGBoost);
#' Fast, distributed, high performance gradient
#' boosting (LightGBM). Furthermore, the module allows to
#' compare these algorithms with the classic Strawman imputation
#' (~ max/mean/mode: the most frequently observed, i.e. non-missing, genotypes is used).
#' \item \strong{Hierarchical level: } Imputations conducted by strata or globally.
#' \item \strong{SNP linkage/Haplotypes: } Correlation among SNPs is accounted for during
#' rf and tree boosting imputations, i.e. the imputations are 
#' automatically conducted by haplotypes
#' when marker meta-information is available (chromosome, locus and position,
#' usually taken from VCF files). The alternative, considers all the markers 
#' independent and imputation is conducted by SNPs/markers.
#' \item \strong{Genotype likelihood (GL): } The GL info is detected automatically
#' (GL column in FORMAT field of VCF files). Genotypes with higher likelihood
#' will have higher probability during bootstrap samples of trees in Random
#' forests (under devel).
#' \item \strong{Predictive mean matching: } random forest in prediction mode
#' uses a fast k-nearest neighbor (KNN) searching algorithms 
#' (see argument documentation and details below).
#' \item \strong{Optimized for speed: } There's a
#' \href{https://github.com/thierrygosselin/grur#troubleshooting}{tutorial} and a
#' \href{https://github.com/thierrygosselin/grur#vignettes-and-examples}{vignette}
#' detailing the procedure to install the packages from source to enable OpenMP.
#' Depending on algorithm used, a progress bar is sometimes available to see 
#' if you have time for a coffee break!
#' }
#'
#'
#' Before using this function to populate the original dataset with synthetic
#' data, grur assumes:
#' \enumerate{
#' \item \strong{filter your data: } please check out \code{\link[radiator]{filter_rad}}.
#' \item \strong{correlations: } machine learning algorithms will work better and 
#' faster if correlation are reduced. Check your dataset for short and long LD 
#' \code{\link[radiator]{filter_rad}} and \code{\link[radiator]{filter_ld}}.
#' \item \strong{pattern of individual heterozygosity: } If you have individual 
#' heterozygosity patterns and/or correlation of individual heterozygosity with 
#' missingness, you might want to skip imputation and go back in the wet-lab.
#' Please check out \code{\link[radiator]{detect_mixed_genomes}} and 
#' \code{\link[radiator]{detect_het_outliers}}.
#' \item \strong{patterns of missingness: } look for patterns of missingness 
#' with \code{\link[grur]{missing_visualization}} to better
#' tailor the arguments inside this function and explore the reasons for their presence 
#' (see \href{https://www.dropbox.com/s/4zf032g6yjatj0a/vignette_missing_data_analysis.nb.html?dl=0}{vignette}).
#' \item \strong{default arguments: } Use for testing only.
#' Please, please, please, don't use defaults for publications!
#' }

#' @param data (4 options) A file or object generated by \code{radiator}:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}}

#' @param strata (optional)
#' The strata file is a tab delimited file with a minimum of 2 columns headers:
#' \code{INDIVIDUALS} and \code{STRATA}. Documented in \code{\link[radiator]{read_strata}}.
#' Default: \code{strata = NULL}.

#' @param imputation.method (character, optional)
#' Methods available for map-independent imputations of missing genotypes
#' (see details for more info):
#'
#' \enumerate{
#' \item \code{imputation.method = "max"} Strawman imputation,
#' the most frequently observed genotypes (ties are broken at random).
#'
#' \item \code{imputation.method = "rf"} On-the-fly-imputations using
#' Random Forests algorithm.
#'
#' \item \code{imputation.method = "rf_pred"} Random Forests algorithm is used
#' as a prediction problem.
#'
#' \item \code{imputation.method = "xgboost"} extreme gradient boosting trees
#' using depth-wise tree growth.
#' 
#' \item \code{imputation.method = "lightgbm"} for a light and fast 
#' leaf-wise tree growth gradient boosting algorithm (in devel).
#'
#' \code{imputation.method = NULL} the function will stop.
#' Default: \code{imputation.method = NULL}.
#' }

#' @param hierarchical.levels (character, optional) \code{c("global", "strata")}.
#' Should the imputations be computed by markers globally or by strata.
#' Historically, this was \code{"populations"}.
#'
#' Note that imputing genotype globally in conjunction with
#' \code{imputation.method = "max" or "strata"} can potentially create huge bias.
#' e.g. by introducing foreign genotypes/haplotypes in some strata/populations
#' (see note for more info).
#' Default: \code{hierarchical.levels = "strata"}.

#' @inheritParams radiator::genomic_converter


#' @param verbose (optional, logical) When \code{verbose = TRUE}
#' the function is a little more chatty during execution.
#' Default: \code{verbose = TRUE}.

#' @param parallel.core (optional, integer) The number of core used for parallel.
#' For some algorithms, markers will be imputed in parallel and 
#' strata are processed sequentially.
#' Default: \code{parallel.core = parallel::detectCores() - 1}.

#' @param filename (optional) The file extension appended to
#' the \code{filename} provided is \code{.rad}.
#' With default: \code{filename = NULL}, the imputed tidy data frame is
#' in the global environment only (i.e. not written in the working directory...).


#' @param ... (optional) Advance mode that allows to pass further arguments
#' for fine-tuning the function. Also used for legacy arguments (see advance mode or
#' special sections below).


#' @return The output in your global environment is the imputed tidy data frame.
#' If \code{filename} is provided, the imputed tidy data frame is also
#' written to the working directory.

#' @details
#' \strong{Predictive mean matching:}
#'
#' Random Forests already behave like a nearest neighbor
#' classifier, with adaptive metric. Now we have the option to conduct
#' predictive mean matching on top of the prediction based missing value
#' imputation. PMM tries to raise the variance in the resulting conditional
#' distributions to a realistic level.
#' The closest k predicted values are identified by a fast
#' k-nearest neighbour approach wrapped in the package
#' \href{https://github.com/mayer79/missRanger}{missRanger}
#' Returned value correspond to the mean value.
#'
#'
#' \strong{haplotype/SNP approach:}
#'
#' The \emph{haplotype approach} is automatically used when markers meta-information
#' is detected (chromosome/CHROM, locus/ID and SNP/POS columns, usually from a VCF file).
#' Missing genotypes from SNPs on the same locus or same RADseq read is undertaken
#' simulteneously to account for the correlation of the linked SNPs. When one or
#' more SNPs on the same read/haplotype is missing, the haplotype is deleted and
#' consequently, imputation might results in different genotype for those SNPs
#' that were not missing. This approach is much safer than potentially creating
#' weird chimeras during haplotype imputations.
#' Alternatively, a \emph{snp approach} is used, and the SNP are considered
#' independent. Imputations of genotypes is then conducted for each marker separately.
#'
#'
#' \strong{Imputing globally or by strata ?}
#' \code{hierarchical.levels = "global"} argument will act differently depending
#' on the \code{imputation.method} selected.
#'
#' \strong{Strawman imputations (~ max/mean/mode) considerations: }
#'
#' With \code{imputation.method = "max"} and \code{hierarchical.levels = "global"}
#' \emph{will likely create bias}.
#'
#' \emph{Example 1 (unbalanced sample size):} Consider 2 populations evolving more
#' by drift than selection: pop1 (n = 36) and pop2 (n = 50).
#' You'll likely have a few polymorphic marker, where pop1 and pop2 are
#' monomorphic for different alleles (pop1 is fixed for the minor/ALT allele and
#' pop2 is fixed for the major/REF allele). Missing genotypes in pop1
#' using the most common filling technique in the literature (using mean/mode/max),
#' will result in pop1 having individuals with the REF allele.
#' Not something you want... unless your population membership is not 100% accurate,
#' (e.g. you might have migrants or wrong assignation),
#' which in this case you still don't want to impute with
#' \code{imputation.method = "max"} (see alternative below).
#'
#' \emph{Example 2 (balanced sample size):} pop1 (n = 100) and pop2 (n = 100).
#' For a particular marker, pop1 as 85 individuals genotyped and pop2 100.
#' Again, if the populations are fixed for different alleles
#' (pop1 = ALT and pop2 = REF), you will end up having REF allele in your pop1,
#' not something you want... unless your population membership is not 100% accurate,
#' (e.g. you might have migrants or wrong assignation),
#' which in this case you still don't want to impute with
#' \code{imputation.method = "max"} (see alternative below).
#'
#' \strong{Random Forests imputations: }
#'
#' Random Forests use machine learning and you can take this into account while
#' choosing argument values. Uncertain of the groupings ? Use random forests with
#' \code{hierarchical.levels = "global"}. Random forests will account for the
#' potential linkage and correlation between
#' markers and genotypes to make the best imputations available. This can potentially
#' results in genotypes for a certain combo population/marker with new groupings
#' (e.g. a new allele). This is much more accurate and not the same thing as
#' the \code{imputation.method = "max"} because the imputed genotype is validated
#' by considering all other variables (all the other markers genotyped for the individual).
#' \emph{Test the option and report bug if you find one.}
#'
#' \emph{random forest with on-the-fly-imputation (rf): }the technique is described
#' in Tang and Ishwaran (2017). Non-missing genotypes are used for
#' the split-statistics. Daughter node assignation membership use random
#' non-missing genotypes from the inbag data. Missing genotypes are imputed at
#' terminal nodes using maximal class rule with out-of-bag non-missing genotypes.
#' Example of computation time: for 1500 individuals, 20 000 markers and
#' 2% of overall missing genotypes using the hierarchical levels globally
#' takes around 10 hours on 6 CPUs to complete the imputations.
#'
#' \emph{random forest as a prediction problem (rf_pred): }markers with
#' missing genotypes are imputed one at a time. The fitted forest is used to
#' predict missing genotypes. Missingness in the response variables are
#' incorporated as attributes for growing the forest.
#' 
#' \emph{boosting trees:} Prediction method is used for both XGBoost and LightGBM.
#' 
#' \href{https://lightgbm.readthedocs.io/en/latest/index.html}{LightGBM documentation}
#' 
#' \href{http://xgboost.readthedocs.io/en/latest/}{XGBoost documentation}
#' 
#' \href{https://sites.google.com/view/lauraepp/parameters}{LightGBM and XGBoost parameters optimization}
#' 
#' 
#' 
#' @section Advance mode:
#' 
#' \emph{dots-dots-dots ...} allows to pass several arguments for fine-tuning the imputations:
#'
#' \code{subsample.markers} (optional, integer) To speed up computation and rapidly
#' test the function's arguments (e.g. using 200 markers).
#' Default: \code{subsample.markers = NULL}.
#'
#' \code{random.seed} (integer, optional) For reproducibility, set an integer
#' that will be used to initialize the random generator. With default,
#' a random number is generated. Currently not implemented for XGBoost and LightGBM.
#' Default: \code{random.seed = NULL}.
#' 
#' \code{pmm} (integer, optional) Predictive mean matching used in conjunction with
#' random Forests and lightgbm (\code{imputation.method = "rf_pred"} or
#' \code{imputation.method = "lightgbm"}).
#' Number of candidate non-missing
#' value to sample from during the predictive mean matching step.
#' A fast k-nearest neighbor searching algorithms is used with this approach.
#' \code{pmm = 2} will use 2 neighbors.
#' Default: \code{pmm = 0}, will avoids this step.
#' 
#' 
#' \code{cpu.boost} (optional, integer). Number of core for XGBoost and LightGBM.
#' For the best speed, set this to the number of real CPU cores,
#' not the number of threads. 
#' Most CPU using hyper-threading to generate 2 threads per CPU core.
#' Be aware that task manager or any CPU monitoring tool might report cores
#' not being fully utilized. This is normal.
#' Default: \code{cpu_boost = parallel::detectCores() / 2}.
#' 
#' 
#' \strong{eXtreme Gradient Boosting tree method:}
#' \enumerate{
#' \item \code{eta}
#' \item \code{gamma}
#' \item \code{max_depth}
#' \item \code{min_child_weight}
#' \item \code{subsample}
#' \item \code{colsample_bytree}
#' \item \code{num_parallel_tree}
#' \item \code{nrounds}
#' \item \code{save_name}
#' \item \code{early_stopping_rounds}
#' }
#' Documented in \href{https://xgboost.readthedocs.io/en/latest/R-package/index.html}{xgboost}.
#'
#' \strong{LightGBM:}
#' \enumerate{
#' \item \code{boosting}
#' \item \code{objective}
#' \item \code{learning_rate}
#' \item \code{feature_fraction}
#' \item \code{bagging_fraction}
#' \item \code{bagging_freq}
#' \item \code{max_depth}
#' \item \code{min_data_in_leaf}
#' \item \code{num_leaves}
#' \item \code{early_stopping_rounds}
#' \item \code{nrounds}
#' \item \code{max_depth}
#' \item \code{iteration.subsample: } is the number of iteration
#' to conduct training of the model with new subsamples.
#' Default: \code{iteration.subsample = 2}.
#' }
#' Documented in \href{https://lightgbm.readthedocs.io/en/latest/index.html}{LightGBM}
#' 
#' \strong{Random forests methods:}
#' \enumerate{
#' \item \code{num.tree}
#' \item \code{nodesize}
#' \item \code{nsplit}
#' \item \code{nimpute}
#' }
#' Documented in \href{https://kogalur.github.io/randomForestSRC/index.html}{randomForestSRC}
#' 
# Multiple Correspondence Analysis option available (upcomming):
# \emph{ncp}.
# Refer to \code{\link[missMDA]{imputeMCA}} for argument documentation.

#' @note
#'
#' \strong{Reference genome or linkage map available ?}
#' Numerous approaches are available and more appropriate, please search
#' the literature
#' (\href{https://online.papersapp.com/collections/05d6e65a-73c9-49e6-9c75-289a818f76f3/share}{references}).
#'
#'
#' \strong{What's the simple imputation message when running the function ?}
#'
#' Before conducting the imputations by strata with the machine learning algorithms, 
#' the data is first screened for markers that are
#' monomorphic WITHIN the strata/populations.
#' Because for those cases, it's clear what the
#' missing genotypes should be, the imputations is very \emph{simple} and missing
#' genotypes are imputed with the only genotype found for the particular strata/populations.
#' There's no need for a fancy method here.
#' The small cost in time is worth it, because the model inside 
#' the machine learning algorithms will benefit from having a more complete and
#' reliable genotype matrix.
#'
#' @section Life cycle:
#' \strong{Deprecated arguments:}
#'
#' \itemize{
#' \item \code{hierarchical.levels = "populations"} update your codes with "strata".
#' \item \code{imputations.group} is now replaced by \code{hierarchical.levels}
#' \item \code{impute} is no longer available.
#' Imputing using \code{impute = "allele"} option was wrong because it
#' was using F1 genotypes for imputations. Now imputation is only conducted at
#' the genotype level.
#' \item \code{iteration.rf} is no longer used. This argument is now available
#' inside the \code{...} for on-the-fly-imputations (see details). The default
#' is now set to 10 iterations.
#' \item \code{split.number} is automatically set.
#' }

#' @seealso
#' The package \href{https://github.com/imbs-hl/ranger}{ranger}
#' (see Wright and Ziegler, 2016) provides a fast C++ version
#' of the original implementation of rf from Breiman (2001).
#' 
#' \href{https://github.com/mayer79/missRanger}{missRanger}
#' 
#' The package \href{https://github.com/dmlc/xgboost}{randomForestSRC} 
#' (see Tang and Ishwaran, 2017) provides
#' a fast on-the-fly imputation method.
#' 
#' \href{https://github.com/stekhoven/missForest}{missForest}
#' 
#' The package \href{https://github.com/dmlc/xgboost}{XGBoost} 
#' (Chen and Guestrin, 2016) provides
#' a fast C++ implementation for the extreme gradient tree boosting algorithm.
#' 
#' The package \href{https://github.com/Microsoft/LightGBM}{LightGBM} is an
#' exciting new algorithm to conduct tree boosting.




#' @export
#' @rdname grur_imputations

#' @examples
#' \dontrun{
#' # The simplest way to run when you have a tidy dataset:
#'
#' wolf.imputed <- grur::grur_imputations(
#' data = "wolf.tidy.dataset.rad",
#' imputation.method = "lightgbm")
#'
#' # This will impute the missing genotypes by strata (in this case the strata,
#' is the population), population using lightgbm.
#' # The remaining arguments will be the defaults.
#'
#' # When you start with a vcf file you can use magrittr %>% to `pipe` the
#' # result. Below, an example with more arguments offered by the functions:
#'
#' wolf.imp <- radiator::tidy_genomic_data(
#'     data = "batch_1.vcf",
#'     strata = "strata.wolf.10pop.tsv",
#'     vcf.metadata = TRUE,
#'     whitelist.markers = "whitelist.loci.txt",
#'     verbose = TRUE) %>%
#' grur::grur_imputations(
#'     data = ., imputation.method = "xgboost", parallel.core = 32)
#' }

#' @references Wright, M. N. & Ziegler, A. (2016).
#' ranger: A Fast Implementation of Random Forests for High Dimensional Data
#' in C++ and R.
#' Journal of Statistical Software, in press. http://arxiv.org/abs/1508.04409.
#' @references Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32.
#' @references Chen T, Guestrin C. (2016).
#' XGBoost: A scalable tree boosting system. arXivorg. 2016.
#' doi:10.1145/2939672.2939785
#' @references Tang F, Ishwaran H. (2017) Random Forest Missing Data Algorithms.
#' arXiorg: 1–24.

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

grur_imputations <- function(
  data,
  output = NULL,
  strata = NULL,
  imputation.method = NULL,
  hierarchical.levels = "strata",
  verbose = TRUE,
  parallel.core = parallel::detectCores() - 1,
  filename = NULL,
  ...
) {
  if (is.null(imputation.method)) {
    message("Imputation method: NULL")
    message("Returning the original data")
    return(data)
  }
  if (verbose) {
    cat("################################################################################\n")
    cat("############################ grur::grur_imputations ############################\n")
    cat("################################################################################\n")
  }
  
  # Cleanup---------------------------------------------------------------------
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date/time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- proc.time()# for timing
  res <- list() # store imputed data and output...
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit((timing <- proc.time() - timing), add = TRUE)
  on.exit(if (verbose) message("\nComputation time, overall: ", round(timing[[3]]), " sec"), add = TRUE)
  on.exit(if (verbose) cat("############################## grur_imputations ################################\n"), add = TRUE)
  
  
  # Checking for missing and/or default arguments & package required -----------
  if (missing(data)) rlang::abort("data file missing")
  
  if (imputation.method == "lightgbm") {
    # if (!requireNamespace("lightgbm", quietly = TRUE)) {
      rlang::abort("lightgbm needed for this imputation option to work.
Please follow the vignette for install instructions")
    # }
  }
  
  if (imputation.method == "xgboost") {
    if (!requireNamespace("xgboost", quietly = TRUE)) {
      rlang::abort("xgboost needed for this imputation option to work.
Please follow the vignette for install instructions")
    }
  }
  
  if (imputation.method == "rf") {
    if (!requireNamespace("randomForestSRC", quietly = TRUE)) {
      rlang::abort("randomForestSRC needed for this imputation option to work.
Please follow the vignette for install instructions")
    }
  }
  
  if (imputation.method == "rf_pred") {
    if (!requireNamespace("ranger", quietly = TRUE)) {
      rlang::abort("ranger needed for this imputation option to work.
           Please follow the vignette for install instructions")
    }
  }
  
  if (imputation.method == "bpca") rlang::abort("Imputations with bpca is under heavy testing... try again soon!")
  # if (imputation.method == "bpca") {
  #   if (!requireNamespace("pcaMethods", quietly = TRUE)) {
  #     rlang::abort("pcaMethods needed for this imputation option to work.
  #          Please follow the vignette for install instructions")
  #   }
  # }
  
  
  
  
  # Options selected & Messages ------------------------------------------------
  message("Imputation method: ", imputation.method)
  message("Hierarchical levels: ", hierarchical.levels)
  
  # Function call and dotslist -------------------------------------------------
  rad.dots <- radiator::radiator_dots(
    func.name = as.list(sys.call())[[1]],
    fd = rlang::fn_fmls_names(),
    args.list = as.list(environment()),
    dotslist = rlang::dots_list(..., .homonyms = "error", .check_assign = TRUE), 
    keepers = c("eta", "gamma", "max_depth", "min_child_weight", "subsample", "colsample_bytree",
                "num_parallel_tree", "nrounds", "save_name", "early_stopping_rounds",
                "num.tree", "nodesize", "nsplit", "nimpute", "ncp", "boosting",
                "objective", "learning_rate",
                "feature_fraction", "bagging_fraction", "bagging_freq",
                "min_data_in_leaf", "num_leaves",
                "early_stopping_rounds", "iteration.subsample",
                "subsample.markers", "random.seed", "pmm", "cpu.boost"
    ),
    verbose = verbose
  )
  if (is.null(pmm)) pmm <- 0
  if (is.null(cpu.boost)) cpu.boost <- parallel::detectCores() / 2
  # if (is.null(subsample.markers)) subsample.markers <- 1
  if (pmm > 0) {
    if (!requireNamespace("missRanger", quietly = TRUE)) {
      rlang::abort("missRanger needed for this imputation option to work.
           Please follow the vignette for install instructions")
    }
  }
  # randomForestSRC arguments ------------------------------------------------
  
  # @param num.tree (integer, optional) The number of trees to grow
  # when \code{imputation.method = "rf"} or \code{imputation.method = "rf_pred"}.
  # Default: \code{num.tree = 50}.
  
  if (is.null(num.tree)) num.tree = 50
  if (is.null(nodesize)) nodesize = 1
  if (is.null(nsplit)) nsplit = 10
  if (is.null(nimpute)) nimpute = 10
  
  # MCA ----------------------------------------------------------------------
  # if (!is.null(ncp)) ncp = 2
  
  # XGBoost arguments --------------------------------------------------------
  # learning rate
  if (is.null(eta)) eta = 0.1
  
  # gamma for minimum loss reduction (larger the number, more conservative the algorithm)
  # prevent overfitting through regularization
  if (is.null(gamma)) gamma = 0
  
  # min_child_weight: minimum sum of instance weight (hessian) needed in a child.
  if (is.null(min_child_weight)) min_child_weight = 1
  
  # subsample: subsample ratio of the training instance for growing tree and prevent overfitting
  if (is.null(subsample)) subsample = 0.5
  
  # colsample_bytree: subsample ratio of columns when growing each tree
  if (is.null(colsample_bytree)) colsample_bytree = 1
  
  # num_parallel_tree: Experimental parameter.
  # number of trees to grow per round.
  if (is.null(num_parallel_tree)) num_parallel_tree = 1
  
  # save_name: the name or path for periodically saved model file
  if (is.null(save_name)) save_name = "imputation.model.temp"
  
  # Common to XGBoost and LightGBM arguments ---------------------------------
  # early_stopping_rounds: If NULL, the early stopping function is not triggered.
  # If set to an integer k, training with a validation set will stop
  # if the performance doesn't improve for k rounds.
  if (is.null(early_stopping_rounds)) early_stopping_rounds = 20
  
  # nrounds: the max number of iterations
  if (is.null(nrounds)) nrounds = 1000
  
  # max_depth
  if (is.null(max_depth)) {
    if (imputation.method == "xgboost") max_depth = 6
    if (imputation.method == "lightgbm") max_depth = -1# infinite depth
  }
  
  
  # LightGBM arguments -------------------------------------------------------
  if (is.null(boosting)) boosting = "dart"
  if (is.null(objective)) objective = "multiclass"
  if (is.null(learning_rate)) learning_rate = 0.1
  if (is.null(feature_fraction)) feature_fraction = 0.9
  if (is.null(bagging_fraction)) bagging_fraction = 0.9
  if (is.null(bagging_freq)) bagging_freq = 1
  
  # minimum dat per leaf
  # It is a LightGBM specific feature allowing to choose how many observations
  # must exist before putting them in a leaf during each tree split consideration.
  if (!is.null(min_data_in_leaf)) min_data_in_leaf = 20 # dataset with 100 obs, needs to lower this (in XGBoost = 1)
  # number of leaves in one tree
  if (!is.null(num_leaves)) num_leaves = 31
  
  # iteration.subsample
  # number of leaves in one tree
  if (!is.null(iteration.subsample)) iteration.subsample = 2
  
  if (verbose) {
    if (imputation.method == "xgboost") {
      message("Extreme gradient tree boosting options:")
      message("    learning rate: ", eta)
      message("    regularization, minimum loss reduction (gamma): ", gamma)
      message("    maximum depth of tree: ", max_depth)
      message("    minimum sum of instance weight: ", min_child_weight)
      message("    subsample ratio for training when growing trees (prevent overfitting): ", subsample)
      message("    subsample ratio of columns when growing trees: ", colsample_bytree)
      message("    number of trees to grow per round: ", num_parallel_tree)
      message("    max number of iterations: ", nrounds)
      message("    filename for the model for periodical saving: ", save_name)
      message("    early stopping round integer: ", early_stopping_rounds, "\n")
      message("    number of threads (cpu.boost): ", cpu.boost)
    }
    
    if (imputation.method == "lightgbm") {
      message("Light Gradient Boosting Machine options:")
      message("    boosting: ", boosting)
      message("    objective: ", objective)
      message("    learning rate: ", learning_rate)
      message("    feature_fraction: ", feature_fraction)
      message("    bagging_fraction: ", bagging_fraction)
      message("    max_depth: ", max_depth)
      message("    min_data_in_leaf: ", min_data_in_leaf)
      message("    num_leaves: ", num_leaves)
      message("    early_stopping_rounds: ", early_stopping_rounds)
      message("    iteration.subsample: ", iteration.subsample)
      message("    Predictive Mean Matching: ", pmm)
      message("    max number of iterations: ", nrounds)
      message("    number of threads (cpu.boost): ", cpu.boost)
    }
    
    
    if (imputation.method == "rf") {
      message("On-the-fly-imputations options:")
      message("    number of trees to grow: ", num.tree)
      message("    minimum terminal node size: ", nodesize)
      message("    non-negative integer value used to specify random splitting: ", nsplit)
      message("    number of iterations: ", nimpute)
    }
    
    if (imputation.method == "rf_pred") {
      message("Random Forests options:")
      message("    number of trees: ", num.tree)
      message("    minimum terminal node size: ", nodesize)
      message("    non-negative integer value used to specify random splitting: ", nsplit)
      message("    number of iterations: ", nimpute)
      message("    predictive mean matching: ", pmm, "\n")
    }
    
    # if (imputation.method == "bpca") {
    #   # message("Multiple Correspondence Analysis options:")
    #   # message("    number of dimentions used to predict the missing values: ", ncp)
    #   # message("    MCA algorithm: Regularized")
    # }
    
    message("Number of CPUs used during grur's operations: ", parallel.core)
    if (!is.null(filename)) message("Filename: ", filename)
  }
  
  # Set seed for sampling reproducibility ------------------------------------
  if (is.null(random.seed)) {
    random.seed <- sample(x = 1:1000000, size = 1)
    set.seed(random.seed)
  } else {
    set.seed("num.tree")
  }
  message("\nSeed: ", random.seed)
  
  # Folders---------------------------------------------------------------------
  path.folder <- radiator::generate_folder(
    f = getwd(),
    rad.folder = "radiator_genomic_converter",
    internal = FALSE,
    file.date = file.date,
    verbose = verbose)
  
  # write the dots file
  radiator::write_rad(
    data = rad.dots,
    path = path.folder,
    filename = stringi::stri_join("grur_imputations_args_", file.date, ".tsv"),
    tsv = TRUE,
    internal = FALSE,
    verbose = verbose
  )    
  
  
  # Detect format --------------------------------------------------------------
  data.type <- radiator::detect_genomic_format(data)
  
  if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
    rlang::abort("Input not supported for this function: read function documentation")
  }
  
  
  # Import data --------------------------------------------------------------
  if (data.type %in% c("tbl_df", "fst.file")) {
    if (is.vector(data)) data %<>% radiator::tidy_wide(data = ., import.metadata = TRUE)
    data.type <- "tbl_df"
  }
  # GDS
  if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
    if (!"SeqVarTools" %in% utils::installed.packages()[,"Package"]) {
      rlang::abort('Please install SeqVarTools for this option:\n
                     install.packages("BiocManager")
                     BiocManager::install("SeqVarTools")')
    }
    
    if (data.type == "gds.file") {
      data <- radiator::read_rad(data, verbose = verbose)
      data.type <- "SeqVarGDSClass"
    }
    
    data <- radiator::gds2tidy(gds = data, parallel.core = parallel.core)
    data.type <- "tbl_df"
  }
  # strata--------------------------------------------------------------------
  if (!is.null(strata)) {
    strata.df <- radiator::read_strata(strata = strata, verbose = verbose) %$% strata
    if (!rlang::has_name(strata.df, "STRATA")) {
      rlang::abort("strata file/object requires a columns named: STRATA")
    }
    
    if (!rlang::has_name(strata.df, "INDIVIDUALS")) {
      rlang::abort("strata file/object requires a columns named: INDIVIDUALS")
    }
    data %<>% radiator::join_strata(data = ., strata = strata.df, verbose = verbose)
  }
  strata.df <- NULL
  
  if (rlang::has_name(data, "POP_ID") && !rlang::has_name(data, "STRATA")) {
    data %<>% dplyr::rename(STRATA = POP_ID)
  }
  
  # Subsampling markers ------------------------------------------------------
  if (!is.null(subsample.markers)) {
    # subsample.markers <- 500
    sample.markers <- dplyr::distinct(data, MARKERS) %>%
      dplyr::sample_n(tbl = ., size = subsample.markers)
    
    readr::write_tsv(
      x = sample.markers,
      file = stringi::stri_join("subsampled.markers_",
                                subsample.markers,
                                "_random.seed_", 
                                random.seed, ".tsv"))
    sample.markers %<>% purrr::flatten_chr(.)
    data <- dplyr::filter(data, MARKERS %in% sample.markers)
    sample.markers <- NULL
  }
  
  # stats about the dataset --------------------------------------------------
  message("\nNumber of populations: ", dplyr::n_distinct(data$STRATA))
  message("Number of individuals: ", dplyr::n_distinct(data$INDIVIDUALS))
  message("Number of markers: ", dplyr::n_distinct(data$MARKERS))
  
  if (imputation.method == "rf") package.used <- "randomForestSRC"
  if (imputation.method == "xgboost") package.used <- "xgboost"
  if (imputation.method == "lightgbm") package.used <- "lightgbm"
  if (imputation.method %in% c("max", "rf_pred", "bpca")) package.used <- NULL
  
  if (!is.null(package.used)) {
    message("\nNote: If you have speed issues:")
    message("    1. Have you installed, ", package.used," package with OpenMP enabled ?")
    message("    2. Have you followed grur's vignette ?")
  }
  
  # GT FORMAT and ALLELES CALIBRATIONS -----------------------------------------
  # Note to myself: is working with GT format required or another one could be used ?
  
  if (!rlang::has_name(data, "GT")) {
    message("\nGT format not found, generating the format...")
    data <- radiator::calibrate_alleles(
      data = data,
      parallel.core = parallel.core,
      verbose = TRUE
      ) %$%
      input %>% 
      dplyr::rename(STRATA = POP_ID)
  }
  
  
  # output the proportion of missing genotypes BEFORE imputations
  na.before <- dplyr::summarise(.data = data, MISSING = round(length(GT[GT == "000000"])/length(GT), 6)) %>%
    purrr::flatten_dbl(.) %>% format(., scientific = FALSE)
  message("\nProportion of missing genotypes before imputations: ", na.before)
  
  # necessary steps to make sure we work with unique markers and not duplicated LOCUS
  if (!rlang::has_name(data, "MARKERS") && rlang::has_name(data, "LOCUS")) {
    data <- dplyr::rename(.data = data, MARKERS = LOCUS)
  }
  
  # Generate simple markers id -----------------------------------------------
  # formula in RF difficult with markers containing separators and numbers
  if (imputation.method %in% c("rf", "xgboost", "lightgbm")) {
    markers.meta <- dplyr::distinct(.data = data, MARKERS) %>%
      dplyr::mutate(NEW_MARKERS = stringi::stri_join("M", seq(1, nrow(.))))
    data <- dplyr::full_join(markers.meta, data, by = "MARKERS")
    
    markers.meta <- suppressWarnings(
      dplyr::select(
        data,
        dplyr::one_of(c("NEW_MARKERS", "MARKERS", "CHROM", "LOCUS", "POS"))) %>% 
        dplyr::distinct(NEW_MARKERS, .keep_all = TRUE))
    
    data <- dplyr::select(.data = data, -MARKERS) %>%
      dplyr::rename(MARKERS = NEW_MARKERS)
  } else {
    markers.meta <- suppressWarnings(
      dplyr::select(
        data,
        dplyr::one_of(c("MARKERS", "CHROM", "LOCUS", "POS"))) %>% 
        dplyr::distinct(MARKERS, .keep_all = TRUE))
  }
  
  # scan for REF allele column -----------------------------------------------
  ref.column <- FALSE
  if (rlang::has_name(data, "REF")) ref.column <- TRUE
  
  # Check if biallelic  ------------------------------------------------------
  biallelic <- radiator::detect_biallelic_markers(data = data)
  
  # Manage Genotype Likelihood -----------------------------------------------
  if (rlang::has_name(data, "GL")) {
    if (verbose) message("\nGenotype likelihood (GL) column detected")
  }
  have <- colnames(data)
  
  # For haplotype VCF
  if (!biallelic && ref.column && rlang::has_name(data, "GT_VCF_NUC")) {
    haplo.vcf.check <- TRUE
    
    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "STRATA", "INDIVIDUALS", "GT_VCF_NUC", "GL")
    selected.columns <- purrr::keep(.x = have, .p = have %in% want)
    
    data <- dplyr::select(.data = data,
                          dplyr::one_of(selected.columns)) %>%
      dplyr::mutate(GT = replace(GT_VCF_NUC, which(GT_VCF_NUC == "./."), NA)) %>%
      dplyr::select(-GT_VCF_NUC)
  } else {
    haplo.vcf.check <- FALSE
    
    want <- c("MARKERS", "CHROM", "LOCUS", "POS", "STRATA", "INDIVIDUALS", "GT", "GL")
    selected.columns <- purrr::keep(.x = have, .p = have %in% want)
    
    data <- dplyr::select(.data = data,
                          dplyr::one_of(selected.columns)) %>%
      dplyr::mutate(GT = replace(GT, which(GT == "000000"), NA))
  }
  have <- want <- selected.columns <- NULL
  
  # keep bk of stratification ------------------------------------------------
  strata.before <- dplyr::distinct(.data = data, INDIVIDUALS, STRATA)
  
  # SNP/haplotype approach -----------------------------------------------------
  
  # detect the presence of SNP/LOCUS info and combine SNPs on the same RADseq locus together
  if (rlang::has_name(data, "CHROM") && rlang::has_name(data, "LOCUS") && imputation.method != "max") {
    data <- tidyr::unite(data = data, col = CHROM_LOCUS, CHROM, LOCUS) %>%
      dplyr::select(-POS) # no longer necessary info in MARKERS
    
    # Locus with > 1 SNP
    snp.number <- dplyr::distinct(.data = data, MARKERS, CHROM_LOCUS) %>%
      dplyr::count(CHROM_LOCUS)
    
    if (max(snp.number$n) > 1) {
      # Encoding SNPs into haplotypes
      # combining SNPs into groups defined by chromosomes and locus info
      if (verbose) message("Encoding SNPs into haplotypes")
      
      if (rlang::has_name(data, "GL")) {
        keep.gl <- dplyr::ungroup(data) %>%
          dplyr::filter(!is.na(GL)) %>%
          dplyr::distinct(CHROM_LOCUS, STRATA, INDIVIDUALS, GL) %>%
          dplyr::group_by(CHROM_LOCUS, STRATA, INDIVIDUALS) %>%
          dplyr::summarise(GL = mean(GL))
        data <- dplyr::select(.data = data, -GL)
      } else {
        keep.gl <- NULL
      }
      
      locus.multiple.snp <- dplyr::filter(.data = snp.number, n > 1) %>%
        dplyr::select(CHROM_LOCUS) %>%
        purrr::flatten_chr(.)
      data.multiple.snp <- dplyr::filter(.data = data, CHROM_LOCUS %in% locus.multiple.snp)
      data.one.snp <- dplyr::filter(.data = data, !CHROM_LOCUS %in% locus.multiple.snp)
      if (length(locus.multiple.snp) > 100) {
        # parallel
        data <- list()
        data <- grur::grur_future(
          .x = locus.multiple.snp,
          .f = encoding_snp,
          flat.future = "dfr",
          split.vec = FALSE,
          split.with = NULL,
          parallel.core = parallel.core,
          data = data.multiple.snp
        ) %>% 
          dplyr::bind_rows(data.one.snp)
          
        
        # data <- .grur_parallel_mc(
        #   X = locus.multiple.snp,
        #   FUN = encoding_snp,
        #   mc.cores = parallel.core,
        #   data = data.multiple.snp
        # ) %>% dplyr::bind_rows(.) %>%
        #   dplyr::bind_rows(data.one.snp)
      } else {
        data <- purrr::map_dfr(
          .x = locus.multiple.snp,
          .f = encoding_snp,
          data = data.multiple.snp) %>% 
          dplyr::bind_rows(data.one.snp)
      }
      
      # include GL back and use relative measure group_by locus
      if (!is.null(keep.gl)) {
        data <- dplyr::filter(.data = data, !is.na(GT)) %>%
          dplyr::select(CHROM_LOCUS, STRATA, INDIVIDUALS) %>%
          dplyr::left_join(keep.gl, by = c("CHROM_LOCUS", "STRATA", "INDIVIDUALS")) %>%
          dplyr::right_join(data, by = c("CHROM_LOCUS", "STRATA", "INDIVIDUALS")) %>%
          dplyr::select(MARKERS, CHROM_LOCUS, STRATA, INDIVIDUALS, GT, GL) %>%
          dplyr::group_by(CHROM_LOCUS) %>%
          dplyr::mutate(GL = GL/max(GL, na.rm = TRUE)) %>%
          dplyr::ungroup(.)
      }
      separate.haplo <- TRUE
      data.multiple.snp <- data.one.snp <- snp.number <- keep.gl <- NULL
    } else {
      separate.haplo <- FALSE
    }
    # End of grouping SNPs
    
    # remove unnecessary column
    data <- dplyr::select(.data = data, -CHROM_LOCUS)
  } else {
    separate.haplo <- FALSE
  }#End preparing SNP/haplo approach
  
  # Strawman imputations (max) -------------------------------------------------
  if (imputation.method == "max") {
    if (hierarchical.levels == "strata") {
      if (verbose) message("Using the most observed genotype per marker/strata for imputations")
      if (rlang::has_name(data, "GL")) {
        data.imp <- dplyr::select(data, MARKERS, STRATA, INDIVIDUALS, GT, GL) %>%
          dplyr::group_by(MARKERS, STRATA) %>%
          dplyr::mutate(
            GT = stringi::stri_replace_na(GT, replacement = max(GT, na.rm = TRUE)),
            GT = replace(GT, which(GT == "NA"), NA),
            GL = stringi::stri_replace_na(GL, replacement = mean(GL, na.rm = TRUE)),
            GL = replace(GL, which(GL == "NA"), NA),
            GL = as.numeric(GL)) %>%
          dplyr::ungroup(.)
      } else {
        data.imp <- dplyr::select(data, MARKERS, STRATA, INDIVIDUALS, GT) %>%
          dplyr::group_by(MARKERS, STRATA) %>%
          dplyr::mutate(GT = stringi::stri_replace_na(GT, replacement = max(GT, na.rm = TRUE)),
                        GT = replace(GT, which(GT == "NA"), NA)) %>%
          dplyr::ungroup(.)
      }
      data <- NULL
      # # detect remaining NA
      # # e.g. if one strata is missing all GT... when not using common markers
      # if (anyNA(data.one.snp)) {
      #   warning("Missing data is still present in the dataset",
      #           "\n    2 options:",
      #           "\n    run the function again with hierarchical.levels = 'global'",
      #           "\n    use common.markers = TRUE when using hierarchical.levels = 'strata'")
      # }
    }# End imputation max populations
    
    # global
    if (hierarchical.levels == "global") {
      if (verbose) message("Using the most observed genotype per marker for imputations")
      if (rlang::has_name(data, "GL")) {
        data.imp <- dplyr::select(data, MARKERS, STRATA, INDIVIDUALS, GT, GL) %>%
          dplyr::group_by(MARKERS) %>%
          dplyr::mutate(
            GT = stringi::stri_replace_na(str = GT, replacement = max(GT, na.rm = TRUE)),
            GL = as.numeric(stringi::stri_replace_na(str = GL, replacement = mean(GL, na.rm = TRUE)))
          ) %>%
          dplyr::ungroup(.)
      } else {
        data.imp <- dplyr::select(data, MARKERS, STRATA, INDIVIDUALS, GT) %>%
          dplyr::group_by(MARKERS) %>%
          dplyr::mutate(GT = stringi::stri_replace_na(str = GT, replacement = max(GT, na.rm = TRUE))) %>%
          dplyr::ungroup(.)
      }
      data <- NULL
    } # End imputation max global
  }#End imputation max
  
  # Imputation with Random Forests and tree boosting ---------------------------
  ### Note to myself: Need to add CHROM hierarchy
  ### to impute one chromosome at a time
  
  if (imputation.method %in% c("rf", "rf_pred", "xgboost", "lightgbm", "bpca")) {
    # Vector of markers
    markers.list <- unique(data$MARKERS)
    
    # Simple imputation for monomorphic markers in some pop ------------------
    
    # Problem encountered:
    # Merged SNPs might end up be missing if one of the SNP on the read is missing
    # It's usually safer to delete the whole genotype and impute it back,
    # instead of creating weird chimeras
    
    # In some dataset I've seen markers becomming monomorphic after this change
    # These markers have very very very low polymorphism and further test are
    # required to validate this technique. 
    # remedy : better MAF filtering can remove those problem...
    
    if (hierarchical.levels == "strata") {
      # 1) dont' waist time imputing
      # 2) do some screening first
      # 3) the small cost in time is worth it,
      # 4) because machine learning algorithm  in RF and xgboost will benefit having more complete 
      #    and reliable genotypes
      
      if (verbose) message("Scanning dataset for strata(s) with monomorphic marker(s)")
      # scanning for populations with one genotype group
      scan.pop <- dplyr::group_by(.data = data, MARKERS, STRATA, GT) %>%
        dplyr::tally(.)
      
      markers.pop.na <- dplyr::filter(.data = scan.pop, is.na(GT)) %>%
        dplyr::select(MARKERS, STRATA)
      
      if (nrow(markers.pop.na) > 1) {
        simple.imputation <- dplyr::filter(.data = scan.pop, !is.na(GT)) %>%
          dplyr::select(-n) %>%
          dplyr::group_by(MARKERS, STRATA) %>%
          dplyr::tally(.) %>%
          dplyr::filter(n == 1) %>%
          dplyr::select(MARKERS, STRATA) %>%
          dplyr::semi_join(markers.pop.na, by = c("MARKERS", "STRATA"))
        
        simple.imputation.number <- nrow(simple.imputation)
        
        if (simple.imputation.number > 1) {
          if (verbose) message("    Simple strawman imputations conducted on ", simple.imputation.number, " markers/pops combo")
          # update markers.list
          markers.list <- dplyr::anti_join(markers.pop.na, simple.imputation, by = c("MARKERS", "STRATA")) %>%
            dplyr::ungroup(.) %>%
            dplyr::distinct(MARKERS) %>%
            purrr::flatten_chr(.)
          
          data.imp <- dplyr::inner_join(
            data, simple.imputation, by = c("MARKERS", "STRATA")) %>%
            dplyr::group_by(MARKERS, STRATA) %>%
            dplyr::mutate(
              GT = stringi::stri_replace_na(
                GT, replacement = max(GT, na.rm = TRUE))) %>%
            dplyr::ungroup(.)
          
          # if GL is present give the mean value for the imputed genotype
          # note to myself: update doc to say what you're doing here...
          if (rlang::has_name(data.imp, "GL")) {
            data.imp <- data.imp %>%
              dplyr::group_by(MARKERS, STRATA) %>%
              dplyr::mutate(GL = as.numeric(stringi::stri_replace_na(GL, replacement = mean(GL, na.rm = TRUE)))) %>%
              dplyr::ungroup(.)
          }
          # update dataframe
          data <- dplyr::anti_join(
            data, simple.imputation, by = c("MARKERS", "STRATA")) %>%
            dplyr::bind_rows(data.imp) %>%
            dplyr::arrange(STRATA, INDIVIDUALS, MARKERS)
        }
      }
      
      # removed unused object
      data.imp <- simple.imputation <- simple.imputation.number <- NULL
      scan.pop <- markers.pop.na <- NULL
    }# End imputation prep by pop
    
    
    # after some testing the next step is not warranted 
    # prefer to let the algorithm decide how to impute those one. 
    if (hierarchical.levels == "not_needed") {
      # if (hierarchical.levels == "global") {
      scan.markers.na <- dplyr::filter(data, is.na(GT)) %>%
        dplyr::distinct(MARKERS) %>%
        purrr::flatten_chr(.)
      
      if (length(scan.markers.na) < length(markers.list)) {
        
        simple.imputation <- dplyr::group_by(.data = data, MARKERS, GT) %>%
          dplyr::tally(.) %>%
          dplyr::filter(!is.na(GT)) %>%
          dplyr::select(-n) %>%
          dplyr::group_by(MARKERS) %>%
          dplyr::tally(.) %>%
          dplyr::filter(n == 1) %>%
          dplyr::select(MARKERS) %>%
          purrr::flatten_chr(.)
        
        simple.imputation.number <- length(simple.imputation)
        if (simple.imputation.number >= 1) {
          if (verbose) message("Simple imputations conducted on ", simple.imputation.number, " markers")
          
          data.imp <- dplyr::filter(data, MARKERS %in% simple.imputation) %>%
            dplyr::group_by(MARKERS) %>%
            dplyr::mutate(
              GT = stringi::stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
            dplyr::ungroup(.)
          
          # if GL is present give the mean value for the imputed genotype
          if (rlang::has_name(data.imp, "GL")) {
            data.imp <- data.imp %>%
              dplyr::group_by(MARKERS) %>%
              dplyr::mutate(
                GL = as.numeric(stringi::stri_replace_na(GL, replacement = mean(GL, na.rm = TRUE)))) %>%
              dplyr::ungroup(.)
          }
          
          data <- dplyr::filter(data, !MARKERS %in% simple.imputation) %>%
            dplyr::bind_rows(data.imp)
          
          data.imp <- NULL
          
          markers.list <- dplyr::filter(data, is.na(GT)) %>%
            dplyr::distinct(MARKERS) %>%
            purrr::flatten_chr(.)
          
          # data.imp.bk <- dplyr::filter(data, !MARKERS %in% markers.list)
        } else {
          markers.list <- scan.markers.na
          # data.imp.bk <- dplyr::filter(data, !MARKERS %in% markers.list)
        }
      }
      # else {
      # data.imp.bk <- NULL
      # }
      scan.markers.na <- NULL
    }
    
    
    
    # On-the-fly-imputations using Random Forests ------------------------------
    if (imputation.method == "rf") {
      if (verbose) message("On-the-fly-imputations using Random Forests algorithm")
      
      # Parallel computations options for randomForestSRC
      rf.option <- getOption("rf.cores")
      mc.option <- getOption("mc.cores")
      options(rf.cores = parallel.core, mc.cores = parallel.core)
      
      # prepare data
      data.imp <- dplyr::select(data, MARKERS, STRATA, INDIVIDUALS, GT) %>%
        grur::rad_wide(x = ., formula = "INDIVIDUALS + STRATA ~ MARKERS",values_from = "GT") %>%
        dplyr::mutate_all(.tbl = ., .funs = factor)
      
      # Random Forest by pop
      if (hierarchical.levels == "strata") {
        if (verbose) message("    Imputations computed by strata, take a break...")
        data.imp <- split(x = data.imp, f = data.imp$STRATA) %>%
          purrr::map_df(
            .x = ., .f = impute_rf,
            num.tree = num.tree, nodesize = nodesize, nsplit = nsplit,
            nimpute = nimpute,
            verbose = FALSE,
            hierarchical.levels = "strata") %>%
          dplyr::mutate_all(.tbl = ., .funs = as.character) %>%
          grur::rad_long(x = ., cols = "INDIVIDUALS", names_to = "MARKERS", values_to = "GT", variable_factor = FALSE) %>%
          dplyr::right_join(strata.before, by = "INDIVIDUALS") %>%
          dplyr::arrange(STRATA, INDIVIDUALS, MARKERS)
      }#End RF by pop
      
      # Random Forests global
      if (hierarchical.levels == "global") { # Globally/overall
        if (verbose) message("    Imputations computed globally, take a break...")
        data.imp <- impute_rf(
          x = data.imp,
          num.tree = num.tree, nodesize = nodesize, nsplit = nsplit,
          nimpute = nimpute, verbose = FALSE,
          hierarchical.levels = "global") %>%
          dplyr::mutate_all(.tbl = ., .funs = as.character) %>%
          grur::rad_long(x = ., cols = c("STRATA", "INDIVIDUALS"), names_to = "MARKERS", values_to = "GT", variable_factor = FALSE) %>%
          dplyr::mutate(STRATA = factor(STRATA)) %>% 
          dplyr::arrange(STRATA, INDIVIDUALS, MARKERS)
      } #End RF global
      
      
      # separate the haplotypes/snp group
      # separating SNPs on the same locus and chromosome
      # back to original data format
      if (separate.haplo) {
        if (verbose) message("Decoding haplotypes")
        data.imp <- decoding_haplotypes(
          data = data.imp, parallel.core = parallel.core)
      }
      
      
      # Restore original Parallel computations options
      options(rf.cores = rf.option, mc.cores = mc.option)
    }# End on-the-fly-imputations using RF
    
    # Random Forest imputation as a prediction problem -------------------------
    if (imputation.method == "rf_pred") {
      if (verbose) message("Using Random Forests algorithm as a prediction problem, take a break...")
      
      if (hierarchical.levels == "strata") {
        data.imp <- purrr::map_dfr(
          .x = data,
          .f = grur_imputer,
          hierarchical.levels = hierarchical.levels,
          num.tree = num.tree,
          pmm = pmm,
          random.seed = random.seed,
          parallel.core = parallel.core) 
      }
      
      # Random Forests global
      if (hierarchical.levels == "global") { # Globally/overall
        # if (verbose) message("Imputations computed globally, take a break...")
        data.rf.imp <- list() # to store results
        data.rf.imp <- grur_imputer(
          data = data,
          hierarchical.levels = hierarchical.levels,
          num.tree = num.tree,
          pmm = pmm,
          random.seed = random.seed,
          parallel.core = parallel.core)
      } # End imputation RF global
      
      
    }# End rf_pred
    
    # XGBoost & LightGBM ------------------------------------------------------
    if (imputation.method %in% c("xgboost", "lightgbm")) {
      # prep data
      if (hierarchical.levels == "strata") {
        data %<>% 
          dplyr::ungroup(.) %>%
          dplyr::arrange(STRATA, INDIVIDUALS, MARKERS) %>%
          dplyr::mutate(
            # GT_N = as.numeric(factor(replace(GT, which(is.na(GT)), 0))),
            POP_ID_N = as.numeric(factor(STRATA)),
            INDIVIDUALS_N = as.numeric(factor(INDIVIDUALS))
          ) %>%
          dplyr::group_by(MARKERS) %>%
          dplyr::mutate(GT_N = factorize_gt(GT)) %>%
          dplyr::ungroup(.)
        readr::write_tsv(x = data, file = "imputation_factor_dictionary.rad")
        data.boost <- data %>%
          dplyr::select(MARKERS, STRATA = POP_ID_N, INDIVIDUALS = INDIVIDUALS_N, GT = GT_N) %>%
          dplyr::arrange(STRATA, INDIVIDUALS, MARKERS) %>%
          grur::rad_wide(x = ., formula = "INDIVIDUALS + STRATA ~ MARKERS", values_from = "GT") %>% 
          as.matrix(.) %>% 
          Matrix::Matrix(., sparse = TRUE)
      }
      
      if (hierarchical.levels == "global") {
        data %<>% 
          dplyr::ungroup(.) %>%
          dplyr::arrange(STRATA, INDIVIDUALS, MARKERS) %>%
          dplyr::select(-STRATA) %>%
          dplyr::mutate(INDIVIDUALS_N = as.numeric(factor(INDIVIDUALS))) %>%
          dplyr::group_by(MARKERS) %>%
          dplyr::mutate(GT_N = factorize_gt(GT)) %>%
          dplyr::ungroup(.)
        #fst::write.fst(x = data, path = "imputation_factor_dictionary.rad")
        
        data.boost <- data %>%
          dplyr::select(MARKERS, INDIVIDUALS = INDIVIDUALS_N, GT = GT_N) %>%
          dplyr::arrange(MARKERS, INDIVIDUALS) %>%
          grur::rad_wide(x = ., formula = "INDIVIDUALS ~ MARKERS", values_from = "GT") %>% 
          as.matrix(.) %>% 
          Matrix::Matrix(., sparse = TRUE)
      }
      data <- gl.wide <- NULL
      
      if (imputation.method == "xgboost") {
        if (verbose) message("Using extreme gradient tree boosting algorithm, take a break...")
        markers.df <- tibble::tibble(MARKERS = markers.list) %>% 
          dplyr::mutate(SPLIT_VEC = split_imp(x = ., cpu.rounds = 20, parallel.core = cpu.boost))
        boost.split <- unique(markers.df$SPLIT_VEC)
        # single thread test e.g.
        # cpu.boost <- 1
        # cpu.boost <- 2
        # cpu.boost <- 4
        # cpu.boost <- 8
        # system.time(data.imp <- purrr::map_df(
        #   .x = boost.split,
        #   .f = grur_boost_imputer,
        #   markers.df = markers.df,
        #   data.xgb = data.boost,
        #   gl.wide = gl.wide,
        #   eta = eta,
        #   gamma = gamma,
        #   max_depth = max_depth,
        #   min_child_weight = min_child_weight,
        #   subsample = subsample,
        #   colsample_bytree = colsample_bytree,
        #   num_parallel_tree = num_parallel_tree,
        #   cpu.boost = cpu.boost,
        #   nrounds = nrounds,
        #   early_stopping_rounds = early_stopping_rounds,
        #   save_name = save_name))
        
        # parallel testing core option
        # cpu.boost <- 1
        # cpu.boost <- 2
        # cpu.boost <- 4
        # cpu.boost <- 8
        # parallel.core <- 2
        # parallel.core <- 4
        # parallel.core <- 8
        
        markers.df <- tibble::tibble(MARKERS = markers.list) %>%
          dplyr::mutate(SPLIT_VEC = split_imp(x = ., cpu.rounds = 2, 
                                              parallel.core = parallel.core))
        boost.split <- unique(markers.df$SPLIT_VEC)
        
        data.imp <- list()
        data.imp <- grur::grur_future(
          .x = boost.split,
          .f = grur_boost_imputer,
          flat.future = "dfr",
          parallel.core = parallel.core,
          markers.df = markers.df,
          data.xgb = data.boost,
          gl.wide = gl.wide,
          eta = eta,
          gamma = gamma,
          max_depth = max_depth,
          min_child_weight = min_child_weight,
          subsample = subsample,
          colsample_bytree = colsample_bytree,
          num_parallel_tree = num_parallel_tree,
          cpu.boost = cpu.boost,
          nrounds = nrounds,
          early_stopping_rounds = early_stopping_rounds,
          save_name = save_name
          )
        
        # data.imp <- .grur_parallel_mc(
        #   X = boost.split,
        #   FUN = grur_boost_imputer,
        #   mc.cores = parallel.core,
        #   markers.df = markers.df,
        #   data.xgb = data.boost,
        #   gl.wide = gl.wide,
        #   eta = eta,
        #   gamma = gamma,
        #   max_depth = max_depth,
        #   min_child_weight = min_child_weight,
        #   subsample = subsample,
        #   colsample_bytree = colsample_bytree,
        #   num_parallel_tree = num_parallel_tree,
        #   cpu.boost = cpu.boost,
        #   nrounds = nrounds,
        #   early_stopping_rounds = early_stopping_rounds,
        #   save_name = save_name) %>% 
        #   dplyr::bind_rows(.)
      }
      if (imputation.method == "lightgbm") {
        if (verbose) message("Using Light Gradient Boosting Machine algorithm, take a break...")
        markers.df <- tibble::tibble(MARKERS = markers.list) %>%
          dplyr::mutate(
            SPLIT_VEC = as.integer(floor((10 * (1:nrow(.) - 1) / 
                                            nrow(.)) + 1)))
        boost.split <- unique(markers.df$SPLIT_VEC)
        
        message("    Imputations conducted in 10 rounds")
        # testing serial
        # cpu.boost <- 4
        # boosting <- "dart"
        # learning_rate <- 0.1
        # bagging_freq <- 1
        # max_depth <- 9
        # num_leaves <- 512
        
        data.imp <- purrr::map_df(
          .x = boost.split,
          .f = grur_lgbm_imputer,
          markers.df = markers.df,
          data.gbm = data.boost,
          boosting = boosting,
          objective = objective,
          learning_rate = learning_rate,
          feature_fraction = feature_fraction,
          bagging_fraction = bagging_fraction,
          bagging_freq = bagging_freq,
          max_depth = max_depth,
          min_data_in_leaf = min_data_in_leaf,
          num_leaves = num_leaves,
          cpu.boost = cpu.boost,
          early_stopping_rounds = early_stopping_rounds,
          nrounds = nrounds,
          iteration.subsample = iteration.subsample,
          pmm = pmm)
        
        
        # LightGBM doesn't work well in parallel for features
        # data.imp <- list()
        # data.imp <- .grur_parallel_mc(
        #   X = boost.split,
        #   FUN = grur_lgbm_imputer,
        #   mc.cores = parallel.core,
        #   markers.df = markers.df,
        #   data.gbm = data.boost,
        #   boosting = boosting,
        #   objective = objective,
        #   learning_rate = learning_rate,
        #   feature_fraction = feature_fraction,
        #   bagging_fraction = bagging_fraction,
        #   max_depth = max_depth,
        #   min_data_in_leaf = min_data_in_leaf,
        #   num_leaves = num_leaves,
        #   cpu.boost = 2,
        #   early_stopping_rounds = early_stopping_rounds,
        #   nrounds = nrounds) %>% 
        #   dplyr::bind_rows(.)
      }
      
      data.boost <- NULL# remove unused objects
      
      # save.image("imputation.test.RData")
      #Note to myself: if you want to further explore error
      # the column need parsing...
      # error.lightgbm <-  dplyr::select(data.imp, MARKERS, ERROR)
      data.imp <- data.imp %>% dplyr::select(IMPUTED_DATA) %>% tidyr::unnest(.)
      
      # Remove factor/integer type genotype (revert back to original)
      data.imp <- defactorize_gt(data.to.change = data.imp)
      
      # Reintroduce the stratification (check if required)
      data.imp <- dplyr::left_join(strata.before, data.imp, by = "INDIVIDUALS") %>%
        dplyr::arrange(STRATA, INDIVIDUALS, MARKERS)
      strata.before <- NULL# remove unused objects
      
      if (separate.haplo) {
        # separate the haplotypes/snp group
        # Decoding haplotypes
        # separating SNPs on the same locus and chromosome, back to original data format
        if (verbose) message("Decoding haplotypes")
        data.imp <- decoding_haplotypes(
          data = data.imp, parallel.core = parallel.core)
      }
      
      
      # Impute GL
      #Note: will no longer work with stacks
      # if (rlang::has_name(data.imp, "GL") && hierarchical.levels == "strata") {
      #   message("Imputing GL with mean value per populations")
      #   data.imp <- dplyr::group_by(.data = data.imp, MARKERS, STRATA, GT) %>%
      #     dplyr::mutate(
      #       GL = stringi::stri_replace_na(GL, replacement = mean(GL, na.rm = TRUE)),
      #       GL = replace(GL, which(GL %in% c("NA", "NaN")), NA),
      #       GL = as.numeric(GL)) %>%
      #     dplyr::group_by(MARKERS, GT) %>%
      #     dplyr::mutate(
      #       GL = as.numeric(stringi::stri_replace_na(GL, replacement = mean(GL, na.rm = TRUE)))
      #     ) %>%
      #     dplyr::ungroup(.)
      # }
      # if (rlang::has_name(data.imp, "GL") && hierarchical.levels == "global") {
      #   message("Imputing GL with mean overall value")
      #   data.imp <- dplyr::group_by(.data = data.imp, MARKERS, GT) %>%
      #     dplyr::mutate(
      #       GL = as.numeric(stringi::stri_replace_na(GL, replacement = mean(GL, na.rm = TRUE)))
      #     ) %>%
      #     dplyr::ungroup(.)
      # }
    }# End boost
    
    
  } # End imputation RF, xgboost lightgbm
  
  # prep results ---------------------------------------------------------------
  
  # Replace NA by 000000 in GT column if found
  if (anyNA(data.imp)) {
    warning("Missing data is still present in the dataset",
            "\n    2 options:",
            "\n    run the function again with hierarchical.levels = 'global'",
            "\n    use common.markers = TRUE when using hierarchical.levels = 'strata'")
    if (haplo.vcf.check) {
      data.imp$GT <- stringi::stri_replace_na(str = data.imp$GT, replacement = "./.")
    } else {
      data.imp$GT <- stringi::stri_replace_na(str = data.imp$GT, replacement = "000000")
    }
  }
  
  if (haplo.vcf.check) {
    data.imp %<>% dplyr::rename(GT_VCF_NUC = GT)
  }
  
  # Compute REF/ALT allele... might have change depending on prop of missing values
  if (verbose) message("Adjusting REF/ALT alleles to account for imputations...")
  data.imp <- radiator::calibrate_alleles(
    data = dplyr::rename(data.imp, POP_ID = STRATA),
    biallelic = biallelic,
    parallel.core = parallel.core,
    verbose = verbose) %$% input
  
  if (rlang::has_name(data.imp, "POLYMORPHIC.x")) data.imp %<>% dplyr::select(-POLYMORPHIC.x)
  if (rlang::has_name(data.imp, "POLYMORPHIC.y")) data.imp %<>% dplyr::select(-POLYMORPHIC.y)
  
  data.imp %<>% dplyr::rename(STRATA = POP_ID)
  # Integrate markers.meta columns and sort
  
  if (!is.null(markers.meta)) {
    want <- c( "MARKERS", "CHROM", "LOCUS", "POS", "STRATA",
               "INDIVIDUALS", "REF", "ALT", "GT", "GT_VCF",
               "GT_VCF_NUC", "GT_BIN", "GL")
    
    if (rlang::has_name(markers.meta, "NEW_MARKERS")) {
      data.imp <- suppressWarnings(
        dplyr::left_join(
          dplyr::rename(data.imp, NEW_MARKERS = MARKERS),
          markers.meta, by = "NEW_MARKERS") %>%
          dplyr::select(dplyr::one_of(want))
      )
    } else {
      data.imp <- suppressWarnings(
        dplyr::left_join(
          data.imp, markers.meta, by = "MARKERS") %>%
          dplyr::select(dplyr::one_of(want))
      )
    }
    want <- markers.meta <- NULL
  } else {
    data.imp %<>% dplyr::arrange(STRATA, INDIVIDUALS, MARKERS)
  }
  
  # Write to working directory
  if (!is.null(filename)) {
    if (is.null(subsample.markers)) {
      tidy.name <- stringi::stri_join(filename, ".rad")
    } else {
      tidy.name <- stringi::stri_join(
        filename, "_subsample.markers_", subsample.markers, ".rad")
    }
    # if (verbose) message("Writing the imputed data: \n", tidy.name)
    radiator::write_rad(data = data.imp, path = tidy.name, filename = tidy.name, verbose = TRUE)
  }
  
  # Missing after imputation:
  na.after <- dplyr::summarise(.data = data.imp, MISSING = round(length(GT[GT == "000000"])/length(GT), 6)) %>%
    purrr::flatten_dbl(.) %>% format(., scientific = FALSE)
  message("\nProportion of missing genotypes after imputations: ", na.after)
  
  # Error notices
  if (imputation.method == "xgboost" && file.exists("grur_imputations_error.txt")) {
    message("\n\nError notice: Tree boosting imputations encountered error(s),
              please look in the working directory for a file:
              grur_imputations_error.txt
              email the problem to the author: thierrygosselin@icloud.com")
  }
  # make sure that the class of the imputed dataset is a tibble
  # return(data.imp)
  res$data.imp <- tibble::as_tibble(data.imp)
  data.imp <- NULL
  # genomic_converter-----------------------------------------------------------
  if (!is.null(output)) {
    message("\nTransferring data to genomic converter...")
    res$output <- radiator::genomic_converter(
      data = res$data.imp,
      output = output,
      parallel.core = parallel.core,
      filename = filename,
      verbose = FALSE,
      filter.common.markers = FALSE,
      filter.monomorphic = FALSE,
      internal = TRUE)
  }
  
  
  # RESULTS---------------------------------------------------------------------
  return(res)
} # End imputations

# Internal functions -------------------------  --------------------------------
# on-the-fly-imputations with randomForestSRC package --------------------------

#' @title impute_rf
#' @description on-the-fly-imputations using randomForestSRC package
#' @rdname impute_rf
#' @keywords internal
#' @export

impute_rf <- function(
  x,
  num.tree = 10,
  nodesize = 1,
  # splitrule = "random",
  nsplit = 10,
  nimpute = 10,
  verbose = FALSE,
  hierarchical.levels = "strata") {
  
  if (hierarchical.levels == "strata") {
    message("        Imputations for strata: ", unique(x$STRATA))
    x %<>% dplyr::select(-STRATA)
  }
  
  res <- randomForestSRC::impute.rfsrc(
    data = data.frame(x),
    ntree = num.tree,
    nodesize = nodesize,
    # splitrule = "random",
    nsplit = nsplit, #split.number,
    nimpute = nimpute, #iteration.rf,
    do.trace = verbose)
  return(res)
} # End on-the-fly imputation function


# grur_imputer ---------------------------------------------------------------
#' @title grur_imputer
#' @description imputations using Ranger package and predictive mean matching
#' @rdname grur_imputer
#' @keywords internal
#' @export

grur_imputer <- function(
  data,
  num.tree = 100,
  pmm = 0,
  random.seed = NULL,
  parallel.core = parallel::detectCores() - 1,
  hierarchical.levels = "strata",
  markers.list = markers.list,
  verbose = verbose
) {
  # data <- data #test
  
  data %<>% 
    dplyr::select(STRATA, INDIVIDUALS, MARKERS, GT) %>%
    dplyr::mutate(GT = replace(GT, which(is.na(GT)), "missing")) %>%
    grur::rad_wide(x = ., formula = "INDIVIDUALS + STRATA ~ MARKERS", values_from = "GT")
  
  # data[is.na(data.model)] <- "missing"
  data.gl <- data.na <- NULL # remove after test
  
  
  # initiate while loop
  i <- 1
  pred.error <- rep(1, length(markers.list))
  names(pred.error) <- markers.list
  oob.error <- TRUE
  maxiter <- 10000
  data.imp <- tibble::tibble(character(0))
  
  # imp <- list()
  while (oob.error && i <= maxiter) {
    data.last <- data
    pred.error.last <- pred.error
    
    data.rf <- list()
    data.rf <- grur::grur_future(
      .x = markers.list,
      .f = impute_genotypes,
      flat.future = "dfr",
      parallel.core = parallel.core,
      data = data,
      data.na = data.na,
      data.gl = data.gl,
      num.tree = num.tree,
      pmm = pmm,
      random.seed = random.seed,
      hierarchical.levels = hierarchical.levels,
      pred.error = pred.error
    )
    # data.rf <- .grur_parallel_mc(
    #   X = markers.list,
    #   FUN = impute_genotypes,
    #   mc.cores = parallel.core,
    #   data = data,
    #   data.na = data.na,
    #   data.gl = data.gl,
    #   num.tree = num.tree,
    #   pmm = pmm,
    #   random.seed = random.seed,
    #   parallel.core = parallel.core,
    #   hierarchical.levels = hierarchical.levels,
    #   # markers.linkage = markers.linkage,
    #   pred.error = pred.error
    # ) %>%
    # dplyr::bind_rows(.)
    
    # system.time(test <- purrr::map(
    #   .x = markers.list, .f = impute_genotypes,
    #   data = data,
    #   data.na = data.na,
    #   data.gl = data.gl,
    #   data.imp = data.imp,
    #   num.tree = num.tree,
    #   pmm = pmm,
    #   random.seed = random.seed,
    #   parallel.core = parallel.core,
    #   hierarchical.levels = hierarchical.levels,
    #   # markers.linkage = markers.linkage,
    #   pred.error = pred.error))
    # 
    # 
    # test <- dplyr::bind_rows(data.imp)
    
    # update error
    oob.error <- mean(pred.error) < mean(pred.error.last)
    i <- i + 1 # update iteration
  } # End of loop
  
  if (i == maxiter && oob.error || i == 2) {
    imputed.dataset <- data
  } else {
    imputed.dataset <- data.last
  }
  
  imputed.dataset <- dplyr::mutate_all(.tbl = imputed.dataset,
                                       .funs = as.character, exclude = NA)
  
  
  # results
  return(data.imp)
} #End grur_imputer

# impute_genotypes -------------------------------------------------------------
#' @title impute_genotypes
#' @description imputations using Ranger package and predictive mean matching of missRanger
#' @rdname impute_genotypes
#' @keywords internal
#' @export

impute_genotypes <- carrier::crate(function(
  markers.list,
  data,
  data.na,
  data.gl = NULL,
  data.imp,
  num.tree = 100,
  pmm = 0,
  random.seed = NULL,
  parallel.core = parallel::detectCores() - 1,
  hierarchical.levels = "strata",
  # markers.linkage = "multivariate",
  pred.error = pred.error
) {
  # m <- "BINDED_M7114_M7115_M7116_M7117"
  # m <- "BINDED_M1_M2_M3_M4_M5"
  # m <- "BINDED_M101_M102"
  # m <- "M993"
  `%>%` <- magrittr::`%>%`
  `%<>%` <- magrittr::`%<>%`
  
  m <- markers.list
  message("Marker: ", m)# for diagnostic
  
  # Handling complete and missing data ---------------------------------------
  data.model <- dplyr::filter(.data = data, rlang::.data[[m]] != "missing") %>%
    dplyr::mutate_all(.tbl = ., .funs = factor)
  data.missing <- dplyr::filter(.data = data, rlang::.data[[m]] == "missing") %>%
    dplyr::select(-dplyr::one_of(m))
  
  # If all missing screening # this should be done with markers.list before all this
  # if (nrow(data.missing) > 0) {
  
  # GL
  data.gl <- data.complete <- NULL # remove after test
  
  if (!is.null(data.gl)) {
    # mean GL per sample
    # case.weights <- suppressWarnings(
    #   dplyr::select(.data = data.complete, INDIVIDUALS) %>%
    #     dplyr::left_join(data.gl, by = "INDIVIDUALS") %>%
    #     dplyr::ungroup(.) %>%
    #     dplyr::select(-dplyr::one_of(c("STRATA", "INDIVIDUALS"))) %>%
    #     purrrlyr::invoke_rows(.f = purrr::lift_vd(mean), .to = "GL", .collate = "cols") %>%
    #     dplyr::select(GL) %>%
    #     purrr::flatten_dbl(.)
    # )
    # mean GL per markers
    # split.select.weights <- suppressWarnings(
    #   dplyr::select(.data = data.complete, INDIVIDUALS) %>%
    #     dplyr::left_join(data.gl, by = "INDIVIDUALS") %>%
    #     dplyr::ungroup(.) %>%
    #     dplyr::select(-dplyr::one_of(c(m, "STRATA", "INDIVIDUALS"))) %>%
    #     dplyr::summarise_all(.tbl = ., .funs = mean) %>%
    #     purrr::flatten_dbl(.)
    # )
  } else {
    case.weights <- NULL
    split.select.weights <- NULL
  }
  message("Data preparation: ok")# for diagnostic
  
  # Formula ------------------------------------------------------------------
  # if (markers.linkage == "multivariate") {
  # if (hierarchical.levels == "strata") {
  # discard.columns <- c(m, "STRATA", "INDIVIDUALS")
  # discard.columns <- c(m, "STRATA")
  # model.columns <- setdiff(colnames(data.complete), discard.columns)
  model.columns <- setdiff(colnames(data.model), m)
  rf.formula <- stats::as.formula(
    stringi::stri_join(m, " ~ ",
                       stringi::stri_join(model.columns, collapse = "+")))
  always.split.variables <- NULL
  always.split.variables <- "STRATA"
  # } else {
  #   discard.columns <- c(m, "INDIVIDUALS")
  #   model.columns <- setdiff(colnames(data.complete), discard.columns)
  #   model.columns <- setdiff(colnames(data.complete), m)
  #   rf.formula <- stats::as.formula(
  #     stringi::stri_join(m, " ~ ",
  #                        stringi::stri_join(model.columns, collapse = "+")))
  #   # rf.formula <- stats::reformulate(termlabels = "STRATA", response = m)
  #   always.split.variables <- c("STRATA")
  # }
  # } else {#univariate (one marker at a time)
  #   if (hierarchical.levels == "strata") {
  #     rf.formula <- stats::reformulate(termlabels = ".", response = m)
  #     always.split.variables <- NULL
  #   } else {
  #     rf.formula <- stats::reformulate(termlabels = "STRATA", response = m)
  #     always.split.variables <- c("STRATA")
  #   }
  # }#End composing formula
  message("Formula: ok")# for diagnostic
  # RF -----------------------------------------------------------------------
  ranger.res <- ranger::ranger(
    formula = rf.formula,
    data = data.model,
    # num.trees = 1000,
    num.trees = num.tree,
    case.weights = case.weights,
    split.select.weights = split.select.weights,
    always.split.variables = always.split.variables,
    num.threads = 1,
    # num.threads = parallel.core,
    seed = random.seed)
  
  # ranger.res
  predicted <- stats::predict(ranger.res$forest, data.missing)$predictions
  
  message("RF: ok")# for diagnostic
  # predictive mean matching ---------------------------------------------
  if (pmm > 0) {
    
    # ytrain <- dplyr::select(data.complete, GT) %>%
    #   dplyr::mutate(GT = as.character(GT)) %>%
    #   purrr::flatten_chr(.)
    
    # wide format
    # ytrain <- dplyr::select(.data = data.complete, dplyr::one_of(m)) %>%
    #   dplyr::ungroup(.) %>%
    #   dplyr::mutate_all(.tbl = ., .funs = as.character) %>%
    #   purrr::flatten_chr(.)
    
    # long format (doesn't give reliable results with missRanger...)
    # ytrain <- dplyr::select(.data = data.complete, GT_IMP) %>%
    #   dplyr::ungroup(.) %>%
    #   dplyr::mutate_all(.tbl = ., .funs = as.character) %>%
    #   purrr::flatten_chr(.)
    
    # To pass Travis
    # predicted <- missRanger::pmm(
    #   xtrain = ranger.res$predictions,
    #   xtest = predicted,
    #   ytrain = ytrain,
    #   k = pmm)
    predicted <- NULL
  }
  
  message("pred.mean.matching: ok")# for diagnostic
  
  # data.missing[,m] <- predicted
  # data <- suppressWarnings(dplyr::bind_rows(data.complete, data.missing) %>% dplyr::arrange(INDIVIDUALS))
  # data.imp <- data[,m]
  
  
  # imp <- dplyr::select(.data = data2, dplyr::one_of(c("STRATA", "INDIVIDUALS", m))) %>%
  #   decoding_haplotypes(parallel.core = parallel.core)
  # imp[[m]] <- decoding_haplotypes(data = data.imp, parallel.core = parallel.core)
  
  data.imp <- dplyr::select(.data = data.missing, INDIVIDUALS) %>%
    dplyr::mutate(
      MARKERS = rep(m, nrow(data.missing)),
      GT = predicted
    )
  
  
  
  pred.error[[m]] <- ranger.res$prediction.error
  if (is.nan(pred.error[[m]])) pred.error[[m]] <- 0
  
  res <- list(data = data, pred.error = pred.error, data.imp = data.imp)
  # } else {
  #   pred.error[[m]] <- 0
  #   data.imp <- data[,m]
  #   res <- list(data = data, pred.error = pred.error, data.imp = data.imp)
  # }
  # difference with missForest, missRanger and randomForestSRC:
  # other package are updating the dataset with the imputed value and continue
  # looping through the columns, creating a bias or differences between columns
  # imputed first through last as none have the same predictor columns missigness
  # e.g. missRanger: completed <- union(completed, v)
  # I think it's preferable to leave the data frame as is and merge columns in the end
  
  # if (separate.haplo) {
  #   imputed.dataset <- imputed.dataset %>%
  #     tidyr::separate(data = ., col = GT, into = haplo.meta, sep = "-", remove = TRUE) %>%
  #     tidyr::gather(data = ., key = MARKERS, value = GT, -c(CHROM_LOCUS, STRATA, INDIVIDUALS))
  # }
  return(res)
  message("results: ok")# for diagnostic
}) #End impute_genotypes

# grur_boost_imputer ---------------------------------------------------------------
#' @title grur_boost_imputer
#' @description imputations using eXtreme Gradient Boosting Tree
#' @rdname grur_boost_imputer
#' @keywords internal
#' @export
grur_boost_imputer <- carrier::crate(function(
  boost.split = NULL,
  markers.df = NULL,
  data.xgb = NULL,
  gl.wide = NULL,
  eta = 0.2,
  gamma = 0,
  max_depth = 6,
  min_child_weight = 1,
  subsample = 0.8,
  colsample_bytree = 1,
  num_parallel_tree = 1,
  cpu.boost = 1,
  nrounds = 200,
  early_stopping_rounds = 20,
  save_name = "imputation.model.temp"
) {
  `%>%` <- magrittr::`%>%`
  `%<>%` <- magrittr::`%<>%`
  
  # boost.split <- 3
  # markers.list <- dplyr::distinct(markers.df, MARKERS) %>% purrr::flatten_chr(.)
  markers.list <- dplyr::filter(markers.df, SPLIT_VEC == boost.split) %>%
    dplyr::select(MARKERS) %>% purrr::flatten_chr(.)
  
  xgb_imp <- function(
    markers.list = NULL, data.xgb = NULL, gl.wide = NULL,
    eta = 0.2, gamma = 0, max_depth = 6, min_child_weight = 1,
    subsample = 0.8, colsample_bytree = 1, num_parallel_tree = 1,
    nthread = 1, nrounds = 200, early_stopping_rounds = 20,
    save_name = "imputation.model.temp"
  ) {
    m <- markers.list
    message("Imputation of marker: ", m)
    # preparing data
    all.var <- colnames(data.xgb)
    train.var <- !all.var %in% c(m)
    data.na <- is.na(data.xgb[, all.var, drop = FALSE])
    select.na <- data.na[, m]
    all.var <- data.na <- NULL
    train.data <- data.xgb[!select.na, train.var]
    train.label <- data.xgb[!select.na, m]
    data.xgb <- xgboost::xgb.DMatrix(data = train.data, label = train.label, missing = NA)
    train.data <- NULL
    train.missing <- data.xgb[select.na, train.var, drop = FALSE]
    if (nrow(train.missing) == 0) rlang::abort("code error: email author")
    train.missing.label <- data.xgb[select.na, m]
    train.var <- select.na <- NULL
    id.string <- train.missing[, "INDIVIDUALS"]
    train.missing <- xgboost::xgb.DMatrix(
      data = train.missing, label = train.missing.label, missing = NA)
    train.missing.label <- NULL
    # prepare table
    res <- tibble::tibble(
      INDIVIDUALS = id.string,
      MARKERS = rep(m, length(id.string)))
    
    params <- list(
      booster = "dart",
      silent = 0,
      eta = eta,
      gamma = gamma,
      max_depth = max_depth,
      min_child_weight = min_child_weight,
      subsample = subsample,
      colsample_bytree = colsample_bytree,
      num_parallel_tree = num_parallel_tree,
      objective = "multi:softmax",
      num_class = length(unique(train.label)), #max(data.label) + 1,
      nthread = nthread #XGBoost is actually faster when set to 1 for most dataset...
    )
    
    watchlist <- list(train = data.xgb) # not an argument of xgboost::xgboost
    
    callbacks <- list(xgboost::cb.early.stop(
      stopping_rounds = early_stopping_rounds,
      metric_name = "train_merror", verbose = FALSE))
    
    # Catch error while tree boosting
    safe_boost <- purrr::safely(.f = xgboost::xgb.train)
    
    model <- safe_boost(
      data = data.xgb,
      missing = NA,
      weight = NULL,
      params = params,
      nrounds = nrounds,
      verbose = 0,
      watchlist = watchlist,
      callbacks = callbacks,
      save_period = 0,
      save_name = save_name)
    
    if (is.null(model$error)) {
      res$GT <- stats::predict(
        model$result, train.missing,
        ntreelimit = model$result$best_ntreelimit,
        missing = NA)
    } else {
      readr::write_lines(x = boost.res$error, file = "grur_imputations_error.txt", append = TRUE)
      res$GT <- as.numeric(rep(NA, nrow(res)))
    }
    
    # Unused arguments
    m <- params <- watchlist <- callbacks <- model <- train.missing <- model <- NULL
    return(res)
  }#End xgb_imp
  
  boost.res <- purrr::map_df(
    .x = markers.list, .f = xgb_imp,
    data.xgb = data.xgb, gl.wide = gl.wide,
    eta = eta, gamma = gamma,
    max_depth = max_depth, min_child_weight = min_child_weight,
    subsample = subsample, colsample_bytree = colsample_bytree,
    num_parallel_tree = num_parallel_tree,
    nthread = cpu.boost, nrounds = nrounds,
    early_stopping_rounds = early_stopping_rounds,
    save_name = save_name)
  
  return(boost.res)
  
})#End xgboost


# grur_lgbm_imputer ------------------------------------------------------------
#' @title grur_lgbm_imputer
#' @description imputations using Ligh Gradient Boosting Machine
#' @rdname grur_lgbm_imputer
#' @keywords internal
#' @export

grur_lgbm_imputer <- function(
  boost.split = NULL,
  markers.df = NULL,
  data.gbm = NULL,
  boosting = "dart",
  objective = "multiclass",
  learning_rate = 0.1,
  feature_fraction = 0.9,
  bagging_fraction = 0.9,
  bagging_freq = 1,
  max_depth = -1,
  min_data_in_leaf = 20,#default
  num_leaves = 31,#default
  cpu.boost = parallel::detectCores() / 2,
  early_stopping_rounds = 10,
  nrounds = 200,
  iteration.subsample = 2,
  pmm = 2
) {
  timing.imp <- proc.time() #for timing
  # boost.split <- 1 #test
  # data.gbm <- data.boost #test
  markers.list <- dplyr::filter(markers.df, SPLIT_VEC == boost.split) %>%
    dplyr::select(MARKERS) %>% purrr::flatten_chr(.)
  
  lightbgm_imp <- function(
    markers.list = NULL,
    data.gbm = NULL,
    boosting = "dart",
    objective = "multiclass",
    learning_rate = 0.1,
    feature_fraction = 0.9,
    bagging_fraction = 0.9,
    bagging_freq = 1,
    max_depth = -1,
    min_data_in_leaf = 20,#default
    num_leaves = 31,#default
    cpu.boost = parallel::detectCores() / 2,
    early_stopping_rounds = 10,
    nrounds = 200,
    iteration.subsample = 2,
    pmm = 2
  ) {
    # message("Imputations markers: ", markers.list)
    # preparing data
    # markers.list <- "BINDED_M667_M668"
    all.var<- colnames(data.gbm)
    data.var <- !all.var %in% c(markers.list)
    data.na <- is.na(data.gbm[, all.var, drop = FALSE])
    select.na <- data.na[, markers.list]
    
    # use train.gbm but here it's just to recycle object name below
    # here it's all the data
    xtrain <- data.gbm <- data.gbm[!select.na, data.var] #xtrain for PMM
    data.label <- data.gbm[!select.na, markers.list]
    ytrain <- unname(data.label)#ytrain for PMM
    
    # Note to myself: sampling for training and test set was not a good choice
    # with genomic data. Some training sets didn't contain all the potential genotypes.
    # tried sample with prob but trying a stratified sampling below
    
    # before
    # sample 90% of rows (samples) for training
    # train.row <- sort(sample(x = 1:nrow(train.gbm), size = ceiling(0.9 * nrow(train.gbm)), replace = FALSE))
    # test.row <- purrr::discard(.x = 1:nrow(train.gbm), .p = 1:nrow(train.gbm) %in% train.row)
    
    subsampling_gbm <- function(
      iteration.subsample, data.label, data.gbm,
      boosting = "dart",
      objective = "multiclass",
      learning_rate = 0.1,
      feature_fraction = 0.9,
      bagging_fraction = 0.9,
      bagging_freq = 1,
      max_depth = -1,
      min_data_in_leaf = 20,#default
      num_leaves = 31,#default
      cpu.boost = parallel::detectCores() / 2,
      early_stopping_rounds = 10,
      nrounds = 200) {
      # now using this function to make sure that label as all the potential num_classes
      stratified_sampling <- function(x, size = 0.9) {
        # x <- train.label
        # size <-  0.9
        data <- tibble::tibble(LABEL = x) %>%
          dplyr::mutate(ROWS = seq(1, n(), by = 1))
        
        sampled <- data %>% 
          dplyr::group_by(LABEL) %>% 
          dplyr::sample_frac(tbl = ., size = size) %>% 
          dplyr::arrange(ROWS)
        
        train.row <- sampled$ROWS
        train.label <- sampled$LABEL
        test.label <- data %>% dplyr::filter(!ROWS %in% sampled$ROWS) %>% 
          dplyr::select(LABEL) %>% purrr::flatten_dbl(.)
        return(res = list(
          train.row = train.row, 
          train.label = train.label,
          test.label = test.label))
      }#End stratified_sampling
      
      sampled <- stratified_sampling(data.label)
      
      train.row <- sampled$train.row
      test.gbm <- data.gbm[-train.row,]
      test.label <- sampled$test.label
      train.gbm <- data.gbm[train.row,]
      train.label <- sampled$train.label
      sampled <- NULL
      # train.gbm <- lightgbm::lgb.Dataset(data = train.gbm, label = train.label)
      
      # test.gbm <- lightgbm::lgb.Dataset.create.valid(dataset = train.gbm,
      # data = test.gbm, label = test.label)
      valids <- list(test = test.gbm)
      
      # parameters
      params <- list(
        boosting = boosting, #"gbdt"#"dart",
        # boosting = "gbdt",
        objective = objective,
        num_classes = length(unique(train.label)),
        learning_rate = learning_rate,
        feature_fraction = feature_fraction,
        bagging_fraction = bagging_fraction,
        bagging_freq = bagging_freq,
        max_depth = max_depth,
        min_data_in_leaf = min_data_in_leaf,
        num_leaves = num_leaves,
        device = "cpu",
        num_threads = cpu.boost,
        early_stopping_rounds = early_stopping_rounds,
        # metric = "multi_logloss")
        metric = "multi_error")
      
      
      # model <- lightgbm::lgb.train(
      #   # init_model = model,
      #   params = params,
      #   data = train.gbm,
      #   nrounds = nrounds,
      #   valids = valids,
      #   pred_early_stop = TRUE,
      #   verbose = -1,
      #   reset_data = TRUE
      # )
      
      model.df <- tibble::tibble(ITERATIONS = iteration.subsample, MODEL = list(model), SCORE = model$best_score)
      gc(verbose = FALSE)
      return(model.df)
    } # End of loop
    
    # #testing
    # learning_rate <- 0.05
    # feature_fraction <- 0.5
    # bagging_fraction <- 0.5
    # bagging_freq <- 1
    # min_data_in_leaf <- 20
    # num_leaves <- 31
    # max_depth <- -1
    # early_stopping_rounds <- 20
    # iteration.subsample <- 20
    
    model <- purrr::map_df(
      .x = 1:iteration.subsample,
      .f = subsampling_gbm,
      data.label = data.label, 
      data.gbm = data.gbm,
      boosting = boosting,
      objective = objective,
      learning_rate = learning_rate,
      feature_fraction = feature_fraction,
      bagging_fraction = bagging_fraction,
      bagging_freq = bagging_freq,
      max_depth = max_depth,
      min_data_in_leaf = min_data_in_leaf,
      num_leaves = num_leaves,
      cpu.boost = cpu.boost,
      early_stopping_rounds = early_stopping_rounds,
      nrounds = nrounds
    ) %>% 
      dplyr::filter(SCORE == min(SCORE)) %>% 
      dplyr::distinct(SCORE, .keep_all = TRUE) %>% 
      dplyr::select(MODEL) %>% 
      purrr::flatten(.)
    model <- model$MODEL
    error <- model$best_score
    
    train.missing <- data.gbm[select.na, data.var, drop = FALSE]
    id.string <- train.missing[, "INDIVIDUALS"]
    
    all.var <- data.na <- train.gbm <- data.var <- select.na <- data.gbm <- data.gbm <- NULL
    
    res <- model$predict(train.missing, reshape = TRUE) %>% #using best_iter by default
      tibble::as_tibble(.) %>% 
      `colnames<-`(sort(unique(data.label))) %>%
      dplyr::mutate(
        INDIVIDUALS = id.string,
        MARKERS = rep(markers.list, n())) %>% 
      grur::rad_long(x = ., cols = c("MARKERS", "INDIVIDUALS"), names_to = "GT", values_to = "SCORE", variable_factor = FALSE) %>%
      dplyr::group_by(INDIVIDUALS, MARKERS) %>% 
      dplyr::filter(SCORE == max(SCORE)) %>%
      dplyr::distinct(INDIVIDUALS, MARKERS, .keep_all = TRUE) %>% 
      dplyr::arrange(INDIVIDUALS) %>%
      dplyr::mutate(GT = as.numeric(GT)) %>% 
      dplyr::select(-SCORE) %>% dplyr::ungroup(.)
    
    train.missing <- NULL
    
    # Predictive Mean Matching
    # pmm = 2 # test
    if (pmm > 0) {
      xtrain <- tibble::as_tibble(model$predict(xtrain, reshape = TRUE)) %>% 
        `colnames<-`(sort(unique(data.label))) %>%
        dplyr::mutate(INDIVIDUALS = unique(xtrain[, "INDIVIDUALS"])) %>% 
        grur::rad_long(x = ., cols = "INDIVIDUALS", names_to = "GT", values_to = "SCORE", variable_factor = FALSE) %>%
        dplyr::group_by(INDIVIDUALS) %>% 
        dplyr::filter(SCORE == max(SCORE)) %>%
        dplyr::distinct(INDIVIDUALS, .keep_all = TRUE) %>% 
        dplyr::arrange(INDIVIDUALS) %>%
        dplyr::mutate(GT = as.numeric(GT)) %>% 
        dplyr::select(-SCORE) %>%
        dplyr::ungroup(.) %>% 
        dplyr::select(GT) %>% 
        purrr::flatten_dbl(.)
      res$GT <- missRanger::pmm(xtrain, xtest = res$GT, ytrain, k = pmm, seed = NULL)
    }
    res <- tibble::tibble(IMPUTED_DATA = list(res), MARKERS = markers.list, ERROR = error)
    return(res)
  }# End lightbgm_imp
  
  #testing
  # learning_rate <- 0.1
  # feature_fraction <- 0.9
  # bagging_fraction <- 0.9
  # bagging_freq <- 1
  # min_data_in_leaf <- 20
  # num_leaves <- 31
  # max_depth <- -1
  # early_stopping_rounds <- 20
  # iteration.subsample <- 2
  # pmm <- 2
  
  res <- purrr::map_df(
    .x = markers.list, .f = lightbgm_imp,
    data.gbm = data.gbm,
    boosting = boosting, objective = objective,
    learning_rate = learning_rate,
    feature_fraction = feature_fraction,
    bagging_fraction = bagging_fraction,
    max_depth = max_depth,
    min_data_in_leaf = min_data_in_leaf,
    num_leaves = num_leaves,
    cpu.boost = cpu.boost,
    early_stopping_rounds = early_stopping_rounds,
    nrounds = nrounds,
    iteration.subsample = 2, pmm = pmm)
  timing.imp <- proc.time() - timing.imp
  message("    Imputations round ", boost.split, "/10 conducted in ", round(timing.imp[[3]]), " sec")
  return(res)
}#End grur_lgbm_imputer

# encoding_snp --------------------------------------------------------------------
#' @title encoding_snp
#' @description bind snp found on the same locus
#' @rdname encoding_snp
#' @keywords internal
#' @export
encoding_snp <- carrier::crate(function(locus.list = NULL, data = NULL) {
  `%>%` <- magrittr::`%>%`
  `%<>%` <- magrittr::`%<>%`
  
  # locus.list <- "1_135"
  res <- dplyr::filter(.data = data, CHROM_LOCUS %in% locus.list)
  binded.markers <- dplyr::distinct(.data = res, MARKERS) %>%
    purrr::flatten_chr(.) %>%
    stringi::stri_join(., collapse = "_")
  binded.markers <- stringi::stri_join("BINDED_", binded.markers)
  
  res %<>% 
    grur::rad_wide(x = ., formula = "CHROM_LOCUS + INDIVIDUALS + STRATA ~ MARKERS", values_from = "GT") %>% 
    dplyr::group_by(CHROM_LOCUS, STRATA, INDIVIDUALS) %>%
    tidyr::unite(data = ., col = GT, -CHROM_LOCUS, -STRATA, -INDIVIDUALS, sep = "_", remove = TRUE) %>%
    dplyr::mutate(#Haplotype with combination of SNP and NA = NA (up for an argument?)
      GT = stringi::stri_replace_all_fixed(
        str = GT, pattern = "NA", replacement = NA, vectorize_all = FALSE),
      MARKERS = rep(binded.markers, dplyr::n())
    ) %>%
    dplyr::ungroup(.) %>%
    dplyr::select(MARKERS, CHROM_LOCUS, STRATA, INDIVIDUALS, GT)
  
  return(res)
})#End encoding_snp

# decoding_haplotypes------------------------------------------------------------
#' @title decoding_haplotypes
#' @description separate snp group merged with encoding_snp
#' @rdname decoding_haplotypes
#' @keywords internal
#' @export

decoding_haplotypes <- function(data = NULL, parallel.core = parallel::detectCores() - 1) {
  # data <- data.imp.bk#test
  # data <- data.imp#test
  
  # find the markers that need sep.
  if (rlang::has_name(data, "MARKERS")) {
    markers.sep <- dplyr::distinct(data, MARKERS) %>%
      dplyr::filter(stringi::stri_detect_regex(str = MARKERS, pattern = "^BINDED")) %>%
      purrr::flatten_chr(.)
  } else {
    col.names.data <- colnames(data)
    markers.sep <- purrr::keep(
      .x = col.names.data,
      .p = stringi::stri_detect_regex(str = col.names.data, pattern = "^BINDED"))
    col.names.data <- NULL
  }
  # nested function required ---------------------------------------------------
  separate_locus <- carrier::crate(function(binded.markers = NULL, data = NULL) {
    # binded.markers <- markers.sep[[1]]
    `%>%` <- magrittr::`%>%`
    `%<>%` <- magrittr::`%<>%`
    
    col.replace <- stringi::stri_replace_all_fixed(
      str = binded.markers, pattern = "BINDED_", replacement = "", vectorize_all = FALSE)
    
    if (rlang::has_name(data, "MARKERS")) {
      data.sep <- dplyr::filter(.data = data, MARKERS %in% binded.markers) %>%
        dplyr::select(-MARKERS) %>%
        tidyr::separate_(
          data = .,
          col = "GT",
          into = stringi::stri_split_fixed(str = col.replace, pattern = "_", simplify = TRUE),
          sep = "_", extra = "drop")
      
      if (rlang::has_name(data.sep, "GL")) {
        data.sep %<>% 
          grur::rad_long(x = ., cols = c("STRATA", "INDIVIDUALS", "GL"), names_to = "MARKERS", values_to = "GT", variable_factor = FALSE) %>%
          dplyr::select(STRATA, INDIVIDUALS, MARKERS, GT, GL) %>%
          dplyr::arrange(STRATA, INDIVIDUALS, MARKERS, GT, GL)
      } else {
        data.sep %<>% 
          grur::rad_long(x = ., cols = c("STRATA", "INDIVIDUALS"), names_to = "MARKERS", values_to = "GT", variable_factor = FALSE) %>%
          dplyr::select(STRATA, INDIVIDUALS, MARKERS, GT) %>%
          dplyr::arrange(STRATA, INDIVIDUALS, MARKERS, GT)
      }
    } else {
      data.sep <- dplyr::select(.data = data, dplyr::one_of(c("STRATA", "INDIVIDUALS", binded.markers)))
      colnames(data.sep) <- c("STRATA", "INDIVIDUALS", col.replace)
      data.sep <- tidyr::separate_(
        data = data.sep,
        col = col.replace,
        into = stringi::stri_split_fixed(str = col.replace, pattern = "_", simplify = TRUE),
        sep = "_", extra = "drop"
      ) %>%
        grur::rad_long(x = ., cols = c("STRATA", "INDIVIDUALS"), names_to = "MARKERS", values_to = "GT", variable_factor = FALSE)
    }
    
    return(data.sep)
  })#End separate_locus
  
  if (length(markers.sep) > 0) {
    if (length(markers.sep) > 100) {
      data.sep <- list()
      data.sep <- grur::grur_future(
        .x = markers.sep,
        .f = separate_locus,
        flat.future = "dfr",
        split.vec = FALSE,
        split.with = NULL,
        parallel.core = parallel.core,
        data = data
      )
      
      # data.sep <- .grur_parallel_mc(
      #   X = markers.sep,
      #   FUN = separate_locus,
      #   mc.cores = parallel.core,
      #   data = data
      # ) %>% dplyr::bind_rows(.)
    } else {
      data.sep <- purrr::map_dfr(.x = markers.sep, .f = separate_locus, data = data)
    }
    
    # Include markers 1 snp/read
    if (rlang::has_name(data, "MARKERS")) {
      markers.no.sep <- dplyr::distinct(data, MARKERS) %>%
        dplyr::filter(!stringi::stri_detect_regex(str = MARKERS, pattern = "^BINDED")) %>%
        purrr::flatten_chr(.)
    } else {
      col.names.data <- colnames(data)
      markers.no.sep <- purrr::discard(
        .x = col.names.data,
        .p = stringi::stri_detect_regex(str = col.names.data, pattern = "^BINDED"))
      markers.no.sep <- purrr::discard(
        .x = markers.no.sep,
        .p = markers.no.sep %in% c("STRATA", "INDIVIDUALS"))
      col.names.data <- NULL
    }
    
    if (length(markers.no.sep) > 0) {
      if (rlang::has_name(data, "MARKERS")) {
        data.no.sep <- suppressWarnings(
          dplyr::filter(.data = data, MARKERS %in% markers.no.sep) %>%
            dplyr::select(dplyr::one_of(c("STRATA", "INDIVIDUALS", "MARKERS", "GT", "GL")))
        )
      } else {
        data.no.sep <- suppressWarnings(
          dplyr::select(
            .data = data,
            dplyr::one_of(c("STRATA", "INDIVIDUALS", markers.no.sep))) %>%
            grur::rad_long(x = ., cols = c("STRATA", "INDIVIDUALS"), names_to = "MARKERS", values_to = "GT", variable_factor = FALSE)
        )
      }
    } else {
      data.no.sep <- NULL
    }
    
    # combined data
    if (!is.null(data.no.sep)) {
      data.sep <- dplyr::bind_rows(data.sep, data.no.sep) %>%
        dplyr::arrange(STRATA, INDIVIDUALS, MARKERS)
    }
  } else {
    data.sep <- data
  }
  return(data.sep)
}#End decoding_haplotypes

# factorize_gt------------------------------------------------------------
#' @title factorize_gt
#' @description Necessary to factorize by markers in tidy format.
#' XGBoost needs numbering to start at , hence the codes below.
#' @rdname factorize_gt
#' @keywords internal
#' @export

factorize_gt <- function(x) {
  x <- as.numeric(factor(x)) - 1
}#End factorize_gt


# defactorize_gt------------------------------------------------------------
#' @title defactorize_gt
#' @description Function to "defactorize/decode" the imputed data back to original.
#' @rdname defactorize_gt
#' @keywords internal
#' @export

defactorize_gt <- function(data.to.change,
                           data.with.info = "imputation_factor_dictionary.rad") {
  #data.with.info <- fst::read.fst(path = data.with.info)
  data.with.info <- readr::read_tsv(data.with.info)
  
  clean.id <- dplyr::distinct(.data = data.with.info, INDIVIDUALS, INDIVIDUALS_N)
  clean.gt <- dplyr::distinct(.data = data.with.info, MARKERS, GT, GT_N) %>%
    tidyr::drop_na(.)
  
  if (rlang::has_name(data.to.change, "STRATA")) {
    clean.pop <- dplyr::distinct(.data = data.with.info, STRATA, POP_ID_N)
    res <- suppressWarnings(
      dplyr::arrange(.data = data.to.change, STRATA, INDIVIDUALS) %>%
        dplyr::rename(POP_ID_N = STRATA, INDIVIDUALS_N = INDIVIDUALS, GT_N = GT) %>%
        dplyr::inner_join(clean.id, by = "INDIVIDUALS_N") %>%
        dplyr::select(-INDIVIDUALS_N) %>%
        dplyr::inner_join(clean.pop, by = "POP_ID_N") %>%
        dplyr::select(-POP_ID_N) %>%
        dplyr::left_join(clean.gt, by = c("MARKERS", "GT_N")) %>%
        dplyr::select(STRATA, INDIVIDUALS, MARKERS, GT) %>%
        dplyr::bind_rows(
          tidyr::drop_na(
            data = dplyr::select(
              .data = data.with.info,
              dplyr::one_of(c("STRATA", "INDIVIDUALS", "MARKERS", "GT", "GL"))
            ))))
  } else {
    res <- suppressWarnings(
      dplyr::arrange(.data = data.to.change, INDIVIDUALS) %>%
        dplyr::rename(INDIVIDUALS_N = INDIVIDUALS, GT_N = GT) %>%
        dplyr::inner_join(clean.id, by = "INDIVIDUALS_N") %>%
        dplyr::select(-INDIVIDUALS_N) %>%
        dplyr::left_join(clean.gt, by = c("MARKERS", "GT_N")) %>%
        dplyr::select(INDIVIDUALS, MARKERS, GT) %>%
        dplyr::bind_rows(
          tidyr::drop_na(
            data = dplyr::select(
              .data = data.with.info,
              dplyr::one_of(c("INDIVIDUALS", "MARKERS", "GT", "GL"))
            ))))
  }
  return(res)
}#End defactorize_gt


#' @title split_imp
#' @description Split markers into chunk for parallel imputations processing
#' @rdname split_imp
#' @keywords internal
#' @export
split_imp <- function(x, cpu.rounds, parallel.core = parallel::detectCores() - 1) {
  n.row <- nrow(x)
  split.vec <- as.integer(floor((parallel.core * cpu.rounds * (1:n.row - 1) / n.row) + 1))
  return(split.vec)
}#End split_imp
thierrygosselin/grur documentation built on Oct. 28, 2020, 5:48 p.m.