R/assignment_ngs.R

# Write a gsi_sim file from STACKS VCF file

#' @name assignment_ngs
#' @title Assignment analysis in gsi_sim with GBS data produced by STACKS workflow
#' @description \code{gsi_sim} is a tool for doing and simulating genetic stock
#' identification, developed by Eric C. Anderson.
#' The arguments in the \code{assignment_ngs} function were tailored for the
#' reality of GBS data for assignment analysis while
#' maintaining a reproducible workflow.
#' The input data can be a VCF or haplotypes file produced by STACKS, a PLINK
#' \code{tped/tfam} file or a data frame of genotypes. Individuals, populations and
#' markers can be filtered and/or selected in several ways using blacklist,
#' whitelist and other arguments. Map-independent imputation of missing genotypes
#' using Random Forest or the most frequent category is also available.
#' Markers can be randomly selected for a classic LOO (Leave-One-Out)
#' assignment or chosen based on ranked Fst for a thl
#' (Training, Holdout, Leave-one-out) assignment analysis.

#' @param data Options include the VCF (1) or the haplotypes file (2) created in STACKS 
#' (\code{data = "batch_1.vcf"} and \code{data = "batch_1.haplotypes.tsv"}, 
#' respectively) or a data frame (3) with tab-separated 
#' genotypes in columns (\code{data = "data.assignment.tsv"}). 
#' The 1st column is the \code{POP_ID}, the 2nd column 
#' the \code{INDIVIDUALS} and the remaining columns are the marker IDs
#' containing genotypes in the format: 3 digits per allele
#' and no space between alleles (e.g. 235240: allele1 = 235 and allele2 = 240).
#' Missing genotypes are coded \code{0} or \code{000000}. 
#' Note that the \code{POP_ID} column can be any hierarchical grouping. 
#' See the argument \code{strata} for other means of controlling grouping used 
#' in the assignment. The last option for data input is a PLINK file in 
#' \code{tped/tfam} format (e.g. \code{data =  "data.assignment.tped"}). 
#' The first 2 columns of the \code{tfam} file will be used for the 
#' \code{strata} argument below, unless a new one is provided. 
#' Columns 1, 3 and 4 of the \code{tped} are discarded. The remaining columns 
#' correspond to the genotype in the format \code{01/04} 
#' where \code{A = 01, C = 02, G = 03 and T = 04}. For \code{A/T} format, use 
#' PLINK or bash to convert.
#' Use VCFTOOLS \url{http://vcftools.sourceforge.net/} with \code{--plink-tped} 
#' to convert very large VCF files. For \code{.ped} file conversion to 
#' \code{.tped} use PLINK \url{http://pngu.mgh.harvard.edu/~purcell/plink/} 
#' with \code{--recode transpose}.
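#'
#' For example, a minimal data frame of genotypes (hypothetical marker names and
#' sample IDs) would look like this, with columns tab-separated in the file:
#' \preformatted{
#' POP_ID INDIVIDUALS          LOCUS_10 LOCUS_11
#' QUE    CHI-QUE-ADU-2014-020 235240   000000
#' ONT    CHI-ONT-ADU-2014-101 235235   110112
#' }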
#' 
#' @param assignment.analysis Assignment analysis conducted with 
#' \code{assignment.analysis = "gsi_sim"} or 
#' \code{assignment.analysis = "adegenet"}.
#' 
#' @param whitelist.markers (optional) A whitelist containing CHROM (character
#' or integer) and/or LOCUS (integer) and/or
#' POS (integer) column headers. Used to filter by chromosome, locus and/or SNP.
#' The whitelist is in the working directory (e.g. "whitelist.txt").
#' For de novo data, the CHROM column value 'un' needs to be changed to 1. 
#' Default: \code{NULL} for no whitelist of markers. In the VCF, the column ID is
#' the LOCUS identification.
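#'
#' For example, a whitelist filtering by locus and SNP (hypothetical values)
#' might look like:
#' \preformatted{
#' CHROM LOCUS POS
#' 1     317   45
#' 1     562   12
#' }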

#' @param monomorphic.out (optional) For PLINK files, should the monomorphic 
#' markers present in the dataset be filtered out? 
#' Default: \code{monomorphic.out = TRUE}.

#' @param blacklist.genotype (optional) Useful to erase genotypes of below-average
#' quality, e.g. genotypes with more than 2 alleles in a diploid (likely 
#' sequencing errors) or genotypes with poor genotype likelihood or coverage. 
#' The blacklist has a minimum of 2 column headers (markers and individuals). 
#' Markers can be 1 column (CHROM or LOCUS or POS), 
#' a combination of 2 (e.g. CHROM and POS or CHROM and LOCUS or LOCUS and POS) or 
#' all 3 (CHROM, LOCUS, POS). The marker columns must be designated: CHROM (character
#' or integer) and/or LOCUS (integer) and/or POS (integer). The id column header 
#' must be designated INDIVIDUALS (character). The blacklist must be in the working 
#' directory (e.g. "blacklist.genotype.txt"). For de novo VCF, the CHROM column 
#' value 'un' needs to be changed to 1. Default: \code{NULL} for no blacklist of 
#' genotypes to erase.
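#'
#' For example, a blacklist of genotypes identified by locus (hypothetical values)
#' might look like:
#' \preformatted{
#' LOCUS INDIVIDUALS
#' 317   CHI-QUE-ADU-2014-020
#' 562   CHI-ONT-ADU-2014-101
#' }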

#' @param snp.ld (optional) For VCF file only. With anonymous markers from
#' RADseq/GBS de novo discovery, you can minimize linkage disequilibrium (LD) by
#' choosing among these 3 options: \code{"random"} selection, \code{"first"} or
#' \code{"last"} SNP on the same short read/haplotype. 
#' Default: \code{snp.ld = NULL}.
#' Note that for other file types: for the haplotypes file, use the stackr package 
#' to create a whitelist; for PLINK files and data frames, use PLINK's linkage 
#' disequilibrium based SNP pruning option.
#' @param common.markers (optional) Logical. Default = \code{FALSE}.
#' With \code{TRUE}, will keep markers genotyped in all the populations.


#' @param maf.thresholds (numeric, optional) A vector of 2 values with the 
#' local/populations and global/overall MAF thresholds, respectively.
#' Default: \code{maf.thresholds = NULL}. 
#' e.g. \code{maf.thresholds = c(0.05, 0.1)} for a local MAF threshold 
#' of 0.05 and a global threshold of 0.1. Available for VCF, PLINK and data frame 
#' files. Use stackr for haplotypes files.
#' @param maf.pop.num.threshold (integer, optional) When MAF thresholds are used,
#' the number of populations required to pass the MAF thresholds
#' to keep the locus. Default: \code{maf.pop.num.threshold = 1}.
#' @param maf.approach (character, optional) Either \code{maf.approach = "SNP"} or 
#' \code{maf.approach = "haplotype"}.
#' The function will consider the SNP or the ID/LOCUS/haplotype/read MAF statistics 
#' to filter the markers.
#' Default: \code{maf.approach = "SNP"}. The \code{haplotype} approach is 
#' restricted to VCF files.
#' @param maf.operator (character, optional) \code{maf.operator = "AND"} or default \code{maf.operator = "OR"}.
#' When filtering over LOCUS or SNP, should both the local \code{"AND"}
#' global MAF pass the thresholds, or should the local \code{"OR"}
#' global MAF pass the thresholds, to keep the marker?
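#' For example, \code{maf.thresholds = c(0.05, 0.1)}, \code{maf.pop.num.threshold = 2}
#' and \code{maf.operator = "OR"} keep a marker when its global MAF is >= 0.1 or
#' when at least 2 populations have a local MAF >= 0.05.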

#' @param max.marker An optional integer useful to subsample the marker number in 
#' large PLINK files. Default: \code{max.marker = NULL}. e.g. if the data set 
#' contains 200 000 markers and \code{max.marker = 10000}, 10 000 markers are
#' randomly subsampled from the 200 000 markers. Use \code{whitelist.markers} to
#' keep specific markers.
#' @param marker.number (integer, string of numbers or "all") Calculations with
#' a fixed number or subsamples of your markers. Default: \code{"all"}.
#' e.g. To test 500, 1000, 2000 and all the markers:
#' \code{marker.number = c(500, 1000, 2000, "all")}.
#' To use only 500 markers: \code{marker.number = 500}.
#' @param blacklist.id (optional) A blacklist with individual ID and
#' a column header 'INDIVIDUALS'. The blacklist is in the working directory
#' (e.g. "blacklist.txt").

#' @param sampling.method (character) Should the markers be randomly selected
#' (\code{"random"}) for a classic Leave-One-Out (LOO) assignment or
#' chosen based on ranked Fst (\code{"ranked"}), used in a
#' Training-Holdout-Leave One Out (thl) assignment?
#' @param thl (character, integer or proportion) For \code{sampling.method = "ranked"} only.
#' Default: \code{1}, 1 individual sample is used as holdout. This individual does not
#' participate in the marker ranking. For each marker number,
#' the analysis will be repeated with all the individuals in the data set
#' (e.g. with 500 individuals, the analysis is repeated 500 times for each of the
#' 500, 1000 and 2000 marker numbers).
#' If a proportion is used, e.g. \code{thl = 0.15}, 15\% of individuals in each
#' population are chosen randomly as holdout individuals.
#' With \code{thl = "all"} all individuals are used for ranking (not recommended) and
#' the \code{iteration.method} argument below is set to \code{1} by default.
#' For the other thl values, you can create different holdout individual lists
#' with the \code{iteration.method} argument below.
#' @param iteration.method With random marker selection, this argument is
#' the number of iterations to repeat marker resampling; default is \code{10}.
#' With \code{marker.number = c(500, 1000)} and the default iterations setting,
#' 500 markers will be randomly chosen 10 times and 1000 markers will be randomly
#' chosen 10 times. For the ranked method, using \code{thl = 1}, the analysis
#' will be repeated for each individual in the data set for every
#' \code{marker.number} selected. With a proportion argument \code{thl = 0.15},
#' 15\% of individuals in each population are chosen randomly as holdout
#' individuals and this process is repeated the number of times chosen by the
#' \code{iteration.method} value.
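#' For example, \code{sampling.method = "ranked"}, \code{thl = 0.15} and
#' \code{iteration.method = 20} will draw 20 different lists of holdout individuals
#' (15\% of each population) and repeat the ranking and assignment for each list
#' and each \code{marker.number} value.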


#' @param folder (optional) The name of the folder created in the working directory to save the files/results.
#' @param gsi_sim.filename (optional) The name of the file written to the directory.
#' Use the extension ".txt" at the end. Default \code{assignment_data.txt}.
#' The number of markers used will be appended to the name of the file.
#' @param keep.gsi.files (logical) Default: \code{FALSE}, the input and output gsi_sim files
#' will be deleted from the directory when processing is finished.
#' With \code{TRUE}, remember to allocate a large chunk of disk space for the analysis.
#' @param pop.levels (required) A character string with your populations ordered.
#' @param pop.labels (optional) A character string for your population labels.
#' If you need to rename sampling sites in \code{pop.levels} or combine sites/pops
#' under different names, here is the place.
#' @param pop.id.start The start of your population id
#' in the name of your individual sample. If your individuals are identified 
#' in this form: SPECIES-POPULATION-MATURITY-YEAR-ID = CHI-QUE-ADU-2014-020,
#' then \code{pop.id.start} = 5. If you didn't name your individuals
#' with the pop id in it, use the \code{strata} argument. 
#' @param pop.id.end The end of your population id
#' in the name of your individual sample. If your individuals are identified 
#' in this form: SPECIES-POPULATION-MATURITY-YEAR-ID = CHI-QUE-ADU-2014-020,
#' then \code{pop.id.end} = 7. If you didn't name your individuals
#' with the pop id in it, use the \code{strata} argument.
#' @param strata (optional) A tab-delimited file with 2 columns with header:
#' \code{INDIVIDUALS} and \code{STRATA}. Default: \code{strata = NULL}. With a 
#' data frame of genotypes, the strata is built from the INDIVIDUALS and POP_ID 
#' columns; with PLINK files, the first 2 columns of the \code{tfam} are used. 
#' If a \code{strata} file is specified, the strata file will have
#' precedence. The \code{STRATA} column can be any hierarchical grouping.
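#'
#' For example, a strata file (hypothetical IDs) might look like:
#' \preformatted{
#' INDIVIDUALS          STRATA
#' CHI-QUE-ADU-2014-020 QUE
#' CHI-ONT-ADU-2014-101 ONT
#' }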

#' @param pop.select (string) Conduct the assignment analysis on a
#' selected list of populations. Default: \code{NULL} for no selection; all
#' populations are kept.
#' e.g. \code{pop.select = "QUE"} to select QUE population samples.
#' \code{pop.select = c("QUE", "ONT")} to select QUE and ONT population samples.

#' @param subsample (integer or proportion) Default is no subsampling, \code{subsample = NULL}.
#' With a proportion argument, e.g. \code{subsample = 0.15}, 15 percent of individuals
#' in each population are chosen randomly to represent the dataset.
#' With \code{subsample = 36}, 36 individuals in each population are chosen
#' randomly to represent the dataset.
#' @param iteration.subsample (integer) The number of iterations to repeat 
#' subsampling, default: \code{iteration.subsample = 1}.
#' With \code{subsample = 20} and \code{iteration.subsample = 10},
#' 20 individuals per population will be randomly chosen 10 times.


#' @param imputation.method Should map-independent imputations of markers be
#' computed? Available choices are: (1) \code{FALSE} for no imputation,
#' (2) \code{"max"} to use the most frequent category for imputations and
#' (3) \code{"rf"} to use the Random Forest algorithm. 
#' Default: \code{imputation.method = FALSE}.
#' @param impute (character) Imputation on missing genotypes, 
#' \code{impute = "genotype"}, or on alleles, \code{impute = "allele"}.
#' @param imputations.group \code{"global"} or \code{"populations"}.
#' Should the imputations be computed globally or by populations? If you choose
#' global, set verbose to \code{TRUE} to see progress.
#' Default: \code{"populations"}.
#' @param num.tree The number of trees to grow in Random Forest. Default is 100.
#' @param iteration.rf The number of iterations of missing data algorithm
#' in Random Forest. Default is 10.
#' @param split.number Non-negative integer value used to specify
#' random splitting in Random Forest. Default is 100.
#' @param verbose Logical. Should trace output be enabled on each iteration
#' in Random Forest ? Default is \code{FALSE}.
#' @param parallel.core (optional) The number of cores for OpenMP shared-memory parallel
#' programming of Random Forest imputations. For more info on how to install the
#' OpenMP version see \code{\link[randomForestSRC]{randomForestSRC-package}}.
#' If not selected, \code{detectCores()-1} is used as default.
#' @details You need to have either the \code{pop.id.start} and \code{pop.id.end}
#' or the \code{strata} argument, to identify your populations.
#' 
#' The imputations using Random Forest require more time to compute
#' and can take several
#' minutes to hours depending on the size of the dataset and the polymorphism of
#' the species used, e.g. with a low-polymorphism taxon and a data set
#' containing 30\% missing data, 5 000 haplotype loci and 500 individuals
#' will require about 15 min.
#' The Fst is based on Weir and Cockerham 1984 equations.
#' @return Depending on arguments selected, several files are written to your
#' working directory or \code{folder}.
#' The output in your global environment is a list. To view the assignment results,
#' use \code{$assignment}; to view the ggplot2 figure, use \code{$plot.assignment}. 
#' See example below.

#' @note \code{assignment_ngs} assumes that the command line version of gsi_sim 
#' is properly installed into \code{file.path(system.file(package = "assigner"), "bin", "gsi_sim")}.
#' Things are set up so that it will try running gsi_sim, and if it does not find it, the 
#' program will throw an error and ask the user to run \code{\link{install_gsi_sim}}
#' which will do its best to put a usable copy of gsi_sim where it is needed.  To do 
#' so, you must be connected to the internet.  If that doesn't work, you will
#' need to compile the program yourself, or get it yourself, and then manually copy
#' it to \code{file.path(system.file(package = "assigner"), "bin", "gsi_sim")}.
#' To compile gsi_sim, follow the 
#' instructions here: \url{https://github.com/eriqande/gsi_sim}.

#' @export
#' @rdname assignment_ngs
#' @import dplyr
#' @import parallel
#' @import stringi
#' @import adegenet
#' @importFrom purrr map
#' @importFrom purrr flatten
#' @importFrom purrr keep
#' @importFrom purrr discard
#' @importFrom data.table fread
#' @importFrom readr read_tsv write_tsv

#' @examples
#' \dontrun{
#' assignment.treefrog <- assignment_ngs(
#' data = "batch_1.vcf",
#' assignment.analysis = "gsi_sim",
#' whitelist.markers = "whitelist.vcf.txt",
#' snp.ld = NULL,
#' common.markers = TRUE,
#' marker.number = c(500, 5000, "all"),
#' sampling.method = "ranked",
#' thl = 0.3,
#' blacklist.id = "blacklist.id.lobster.tsv",
#' subsample = 25,
#' iteration.subsample = 10,
#' gsi_sim.filename = "treefrog.txt",
#' keep.gsi.files = FALSE,
#' pop.levels = c("PAN", "COS"),
#' pop.id.start = 5, pop.id.end = 7,
#' imputation.method = FALSE,
#' parallel.core = 12
#' )
#' 
#' # Since the 'folder' argument is missing, it will be created automatically
#' # inside your working directory.
#' 
#' # To create a data frame with the assignment results: 
#' assignment <- assignment.treefrog$assignment
#' 
#' # To plot the assignment using ggplot2 and facet 
#' # (with subsample by current pop):
#' assignment.treefrog$plot.assignment + facet_grid(SUBSAMPLE~CURRENT)
#' 
#' # To save the plot:
#' ggsave("assignment.treefrog.THL.subsample.pdf", height = 35, 
#' width = 60, dpi = 600, units = "cm", useDingbats = FALSE)
#' }


#' @references Anderson, Eric C., Robin S. Waples, and Steven T. Kalinowski. (2008)
#' An improved method for predicting the accuracy of genetic stock identification.
#' Canadian Journal of Fisheries and Aquatic Sciences 65, 7:1475-1486.
#' @references Anderson, E. C. (2010) Assessing the power of informative subsets of
#' loci for population assignment: standard methods are upwardly biased.
#' Molecular ecology resources 10, 4:701-710.
#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.
#' @references Weir BS, Cockerham CC (1984) Estimating F-Statistics for the
#' Analysis of Population Structure. Evolution, 38, 1358–1370.
#' @references Ishwaran H. and Kogalur U.B. (2015). Random Forests for Survival,
#'  Regression and Classification (RF-SRC), R package version 1.6.1.
#' @references Ishwaran H. and Kogalur U.B. (2007). Random survival forests
#' for R. R News 7(2), 25-31.
#' @references Ishwaran H., Kogalur U.B., Blackstone E.H. and Lauer M.S. (2008).
#' Random survival forests. Ann. Appl. Statist. 2(3), 841--860.
#' @references Danecek P, Auton A, Abecasis G et al. (2011)
#' The variant call format and VCFtools.
#' Bioinformatics, 27, 2156-2158.
#' @references Purcell S, Neale B, Todd-Brown K, Thomas L, Ferreira MAR, 
#' Bender D, et al. 
#' PLINK: a tool set for whole-genome association and population-based linkage 
#' analyses. 
#' American Journal of Human Genetics. 2007; 81: 559–575. doi:10.1086/519795
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

# required to pass the R CMD check and have 'no visible binding for global variable'
if (getRversion() >= "2.15.1") {
  utils::globalVariables(
    c("ID", "#CHROM", "CHROM", "FORMAT", "INDIVIDUALS", "FORMAT_ID", "LOCUS",
      "POS", "REF", "ALT", "POP_ID", "READ_DEPTH", "ALLELE_DEPTH", "GL",
      "ERASE", "GT", "MARKERS", "QQ", "PQ", "N", "MAF_GLOBAL", "MAF_LOCAL",
      "ALLELES", "POP_ID", "GT", "INDIVIDUALS", "MARKERS", "POP_ID", "nal",
      "ALLELES_GROUP", "ALLELES", "N_IND_GENE", "P", "N", "nal_sq",
      "nal_sq_sum", "nal_sq_sum_nt", "npl", "het", "mho", "mhom", "dum",
      "dum1", "SSG", "ntal", "SSP", "ntalb", "SSi", "MSI", "sigw", "MSP",
      "siga", "sigb", "lsiga", "lsigb", "lsigw", "FST", "MARKERS",
      "MARKERS_ALLELES", "ALLELES", "POP_ID", "INDIVIDUALS", "filename",
      "ID", "KEEPER", "ASSIGN", "OTHERS", "CURRENT", "INFERRED",
      "SECOND_BEST_POP", "SCORE", "SECOND_BEST_SCORE", "NUMBER", "INDIVIDUALS_ALLELES",
      "MARKER_NUMBER", "MISSING_DATA", "TOTAL", "ASSIGNMENT_PERC",
      "MARKERS", "CURRENT", "INFERRED", "MISSING_DATA",
      "ITERATIONS", "METHOD", "TOTAL", "MEAN_i", "MEAN", "ASSIGNMENT_PERC",
      "SE", "MEDIAN", "MIN", "MAX", "QUANTILE25", "QUANTILE75", "SE_MIN",
      "SE_MAX", ".", "QUAL", "FILTER", "INFO", "pb", "SUBSAMPLE", "STRATA", 
      "sum.pop", "A1", "A2", "INDIVIDUALS_2", "Cnt", "Catalog ID", "GROUP",
      "COUNT", "MAX_COUNT_MARKERS", "hierarchy"
    )
  )
}

assignment_ngs <- function(data,
                           assignment.analysis,
                           whitelist.markers = NULL,
                           monomorphic.out = TRUE,
                           blacklist.genotype = NULL,
                           snp.ld = NULL,
                           common.markers = NULL,
                           maf.thresholds = NULL,
                           maf.pop.num.threshold = 1,
                           maf.approach = "SNP",
                           maf.operator = "OR",
                           max.marker = NULL,
                           marker.number = "all",
                           blacklist.id = NULL,
                           pop.levels,
                           pop.labels,
                           pop.id.start, 
                           pop.id.end,
                           strata = NULL,
                           pop.select = NULL,
                           subsample = NULL,
                           iteration.subsample = 1,
                           sampling.method,
                           thl = 1,
                           iteration.method = 10,
                           folder,
                           gsi_sim.filename = "assignment_data.txt",
                           keep.gsi.files,
                           imputation.method = FALSE,
                           impute = "genotypes",
                           imputations.group = "populations",
                           num.tree = 100,
                           iteration.rf = 10,
                           split.number = 100,
                           verbose = FALSE,
                           parallel.core = NULL,
                           ...) {
  
  message("Assignment analysis using stackr and gsi_sim")
  
  # Checking for missing and/or default arguments ******************************
  if (missing(data)) stop("Input file missing")
  if (missing(assignment.analysis)) stop("assignment.analysis argument missing")
  if (missing(whitelist.markers)) whitelist.markers <- NULL # no Whitelist
  if (missing(monomorphic.out)) monomorphic.out <- TRUE # remove monomorphic
  if (missing(blacklist.genotype)) blacklist.genotype <- NULL # no genotype to erase
  if (missing(snp.ld)) snp.ld <- NULL
  if (missing(common.markers)) common.markers <- FALSE
  if (missing(maf.thresholds)) maf.thresholds <- NULL
  if (missing(maf.pop.num.threshold)) maf.pop.num.threshold <- 1
  if (missing(maf.approach)) maf.approach <- "SNP"
  if (missing(maf.operator)) maf.operator <- "OR"
  if (missing(max.marker)) max.marker <- NULL
  if (missing(marker.number)) marker.number <- "all"
  if (missing(blacklist.id)) blacklist.id <- NULL # No blacklist of ID
  if (missing(pop.levels)) stop("pop.levels required")
  if (missing(pop.labels)) pop.labels <- pop.levels # pop.labels
  if (missing(pop.id.start)) pop.id.start <- NULL
  if (missing(pop.id.end)) pop.id.end <- NULL
  if (missing(strata)) strata <- NULL
  if (missing(pop.select)) pop.select <- NULL
  if (missing(subsample)) subsample <- NULL
  if (missing(iteration.subsample)) iteration.subsample <- 1
  if (missing(sampling.method)) stop("Sampling method required")
  if (sampling.method == "ranked" & missing(thl)) thl <- 1 # thl
  if (missing(iteration.method)) iteration.method <- 10
  if (sampling.method  == "ranked" & thl == "all") iteration.method <- 1
  if (sampling.method == "ranked" & thl == 1) iteration.method <- 1
  if (missing(gsi_sim.filename)) gsi_sim.filename <- "assignment_data.txt"
  if (missing(keep.gsi.files)) keep.gsi.files <- FALSE
  if (missing(imputation.method)) imputation.method <- FALSE
  if (missing(imputations.group)) imputations.group <- "populations"
  if (imputation.method != FALSE & missing(impute)) stop("impute argument is necessary")
  if (imputation.method == FALSE & missing(impute)) impute <- NULL
  if (missing(num.tree)) num.tree <- 100
  if (missing(iteration.rf)) iteration.rf <- 10
  if (missing(split.number)) split.number <- 100
  if (missing(verbose)) verbose <- FALSE
  if (missing(parallel.core) | is.null(parallel.core)) parallel.core <- detectCores()-1
  if (missing(folder)) folder <- NULL

  # File type detection ********************************************************
  data.type <- readChar(con = data, nchars = 16L, useBytes = TRUE)
  
  if (identical(data.type, "##fileformat=VCF") | stri_detect_fixed(str = data, pattern = ".vcf")) {
    data.type <- "vcf.file"
    message("File type: VCF")
  }
  
  if (stri_detect_fixed(str = data, pattern = ".tped")) {
    data.type <- "plink.file"
    message("File type: PLINK")
    if (!file.exists(stri_replace_all_fixed(str = data, pattern = ".tped", replacement = ".tfam", vectorize_all = FALSE))) {
      stop("Missing tfam file with the same prefix as your tped")
    }
  } 
  
  if (stri_detect_fixed(str = data.type, pattern = "POP_ID") | stri_detect_fixed(str = data.type, pattern = "INDIVIDUALS")) {
    data.type <- "df.file"
    message("File type: data frame of genotype")
  }
  
  if (stri_detect_fixed(str = data.type, pattern = "Catalog")) {
    data.type <- "haplo.file"
    message("File type: haplotypes from stacks")
    if (is.null(blacklist.genotype)) {
      stop("blacklist.genotype file missing. 
              Use stackr's missing_genotypes function to create this blacklist")
    }
  }
  
  # Create a folder based on filename to save the output files *****************
  if (is.null(folder)) {
    # Get date and time to have unique filenaming
    file.date <- stri_replace_all_fixed(Sys.time(), pattern = " EDT", replacement = "")
    file.date <- stri_replace_all_fixed(file.date, pattern = c("-", " ", ":"), replacement = c("", "@", ""), vectorize_all = FALSE)
    file.date <- stri_sub(file.date, from = 1, to = 13)
    
    if (imputation.method == "FALSE") {
      message("Map-imputation: no")
      directory <- stri_join(getwd(),"/", "assignment_analysis_", "method_", sampling.method, "_no_imputations_", file.date, "/", sep = "")
      dir.create(file.path(directory))
    } else {
      message("Map-imputation: yes")
      directory <- stri_join(getwd(),"/","assignment_analysis_", "method_", sampling.method, "_imputations_", imputation.method,"_", imputations.group, "_", file.date, "/", sep = "")
      dir.create(file.path(directory))
    }
    message(stri_join("Folder: ", directory))
    file.data <- NULL #unused object
  } else {
    directory <- stri_join(getwd(), "/", folder, "/", sep = "")
    dir.create(file.path(directory))
    message(stri_join("Folder: ", directory))
  }
  
  # Import whitelist of markers ************************************************
  if (is.null(whitelist.markers)) { # no Whitelist
    message("Whitelist of markers: no")
  } else { # with Whitelist of markers
    message("Whitelist of markers: yes")
    whitelist.markers <- read_tsv(whitelist.markers, col_names = TRUE)
    columns.names.whitelist <- colnames(whitelist.markers)
    if ("CHROM" %in% columns.names.whitelist) {
      whitelist.markers$CHROM <- as.character(whitelist.markers$CHROM)
    }
  }
  
  if (data.type == "haplo.file") {
    whitelist.markers <- select(.data = whitelist.markers, LOCUS)
    columns.names.whitelist <- colnames(whitelist.markers)
  }
  
  # Import blacklist id ********************************************************
  if (is.null(blacklist.id)) { # No blacklist of ID
    message("Blacklisted individuals: no")
  } else { # With blacklist of ID
    message("Blacklisted individuals: yes")
    blacklist.id <- read_tsv(blacklist.id, col_names = TRUE)
  }
  
  # Import data ****************************************************************
  if (data.type == "vcf.file") { # VCF
    message("Importing the VCF...")
    
    if (is.null(strata) & is.null(pop.id.start) & is.null(pop.id.end)) {
      stop("pop.id.start and pop.id.end or strata arguments are required to 
           identify your populations with a VCF file")
    }
    
    input <- data.table::fread(
      input = data,
      sep = "\t",
      stringsAsFactors = FALSE, 
      header = TRUE,
      # Automatically filter with blacklist of id
      drop = c("QUAL", "FILTER", "INFO", blacklist.id$INDIVIDUALS),
      skip = "CHROM",
      showProgress = TRUE,
      verbose = FALSE
    ) %>% 
      as_data_frame() %>% 
      rename(LOCUS = ID, CHROM = `#CHROM`) %>%
      mutate(
        CHROM = stri_replace_all_fixed(CHROM, pattern = "un", replacement = "1")
      )
    
    # Filter with whitelist of markers
    if (!is.null(whitelist.markers)) {
      input <- suppressWarnings(semi_join(input, whitelist.markers, by = columns.names.whitelist))
    }
    
    # Detect STACKS version
    if (stri_detect_fixed(input$FORMAT[1], "AD")) {
      stacks.version <- "new"
    } else {
      stacks.version <- "old"
    }
    input <- input %>% select(-FORMAT)
    
    # Tidying the VCF to make it easy to work on the data for conversion
    message("Making the VCF population wise")
    input <- input %>%
      tidyr::gather(INDIVIDUALS, FORMAT_ID, -c(CHROM, LOCUS, POS, REF, ALT)) # Gather individuals in 1 colummn
    
    if (is.null(strata)){
      input <- input %>%
        mutate( # Make population ready
          POP_ID = substr(INDIVIDUALS, pop.id.start, pop.id.end),
          POP_ID = factor(stri_replace_all_fixed(POP_ID, pop.levels, pop.labels, vectorize_all = FALSE), levels = unique(pop.labels), ordered = TRUE),
          INDIVIDUALS =  as.character(INDIVIDUALS)
        )
    } else { # Make population ready with the strata provided
      strata.df <- read_tsv(file = strata, col_names = TRUE, col_types = "cc") %>% 
        rename(POP_ID = STRATA)
      
      input <- input %>%
        mutate(INDIVIDUALS =  as.character(INDIVIDUALS)) %>% 
        left_join(strata.df, by = "INDIVIDUALS") %>% 
        mutate(POP_ID = factor(POP_ID, levels = unique(pop.labels), ordered =TRUE))
    }
    
    # Pop select
    if (!is.null(pop.select)) {
      message(stri_join(length(pop.select), "population(s) selected", sep = " "))
      input <- suppressWarnings(input %>% filter(POP_ID %in% pop.select))
    }
    
    # Tidy VCF
    message("Tidying the vcf...")
    if (stacks.version == "new") { # with new version of stacks > v.1.29
      input <- input %>%
        tidyr::separate(FORMAT_ID, c("GT", "READ_DEPTH", "ALLELE_DEPTH", "GL"),
                        sep = ":", extra = "warn") %>%
        select(-c(READ_DEPTH, ALLELE_DEPTH, GL))
    } else { # stacks version prior to v.1.29 had no Allele Depth field...
      input <- input %>%
        tidyr::separate(FORMAT_ID, c("GT", "READ_DEPTH", "GL"),
                        sep = ":", extra = "warn") %>%
        select(-c(READ_DEPTH, GL))
    }
  } # End import VCF
  
  if (data.type == "plink.file") { # PLINK
    message("Importing the PLINK files...")
    strata.df <- data.table::fread(
      input = stri_replace_all_fixed(str = data, pattern = ".tped", replacement = ".tfam", vectorize_all = FALSE),
      sep = " ", 
      header = FALSE, 
      stringsAsFactors = FALSE,
      verbose = FALSE,
      select = c(1,2),
      colClasses=list(character = c(1,2)),
      col.names = c("POP_ID", "INDIVIDUALS"),
      showProgress = TRUE, 
      data.table = FALSE)
    
    # remove "_" in individual name and replace with "-"
    strata.df$INDIVIDUALS <- stri_replace_all_fixed(str = strata.df$INDIVIDUALS, pattern = "_", replacement = "-", vectorize_all = TRUE)
    
    tped.header.prep <- strata.df %>% 
      select(INDIVIDUALS) %>%
      mutate(NUMBER = seq(1,n())) %>%
      mutate(ALLELE1 = rep("A1", n()), ALLELE2 = rep("A2", n())) %>%
      tidyr::gather(ALLELES_GROUP, ALLELES, -c(INDIVIDUALS, NUMBER)) %>%
      arrange(NUMBER) %>% 
      select(-ALLELES_GROUP) %>% 
      tidyr::unite(INDIVIDUALS_ALLELES, c(INDIVIDUALS, ALLELES), sep = "_", remove = FALSE) %>% 
      arrange(NUMBER) %>% 
      mutate(NUMBER = seq(from = (1 + 4), to = n() + 4)) %>% 
      select(-ALLELES)
    
    tped.header.names <- c("LOCUS", tped.header.prep$INDIVIDUALS_ALLELES)
    tped.header.integer <- c(2, tped.header.prep$NUMBER)
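    # The resulting tped.header.integer keeps column 2 of the tped (the LOCUS id)
    # plus 2 allele columns per individual listed in the tfam (columns 5, 6, 7, ...),
    # matching the INDIVIDUALS_ALLELES names in tped.header.names.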
    
    if (!is.null(blacklist.id)) { # using the blacklist of individuals
      whitelist.id <- tped.header.prep %>% 
        anti_join(blacklist.id, by = "INDIVIDUALS") %>% 
        arrange(NUMBER)
      tped.header.names <- c("LOCUS", whitelist.id$INDIVIDUALS_ALLELES)
      tped.header.integer <- c(2, whitelist.id$NUMBER)
    }
    
    input <- data.table::fread( # import PLINK
      input = data, 
      sep = " ", 
      header = FALSE, 
      stringsAsFactors = FALSE, 
      verbose = FALSE,
      select = tped.header.integer,
      col.names = tped.header.names,
      showProgress = TRUE,
      data.table = FALSE)
    
    # Filter with whitelist of markers
    if (!is.null(whitelist.markers)) {
      message("Filtering with whitelist of markers")
      input <- suppressWarnings(semi_join(input, whitelist.markers, by = columns.names.whitelist))
    }
    
    # To reduce the size of the dataset we subsample the markers with max.marker
    if (!is.null(max.marker)) {
      message("Using the max.marker to reduce the size of the dataset")
      input <- sample_n(tbl = input, size = max(as.numeric(max.marker)), replace = FALSE)
      
      max.marker.subsample.select <- input %>% 
        select(LOCUS) %>% 
        distinct(LOCUS) %>% 
        arrange(LOCUS)
      
      write_tsv(# save results
        x = max.marker.subsample.select, 
        path = paste0(directory,"max.marker.subsample.select.tsv")
      )
    }
    
    # Using the argument strata if provided to replace the current one
    if (!is.null(strata)) {
      strata.df <- read_tsv(file = strata, col_names = TRUE, col_types = "cc") %>% 
        rename(POP_ID = STRATA)
    }
    
    # Make tidy
    message("Tidying the PLINK file and integrating the tfam/strata file, for large dataset this may take several minutes...")
    input <- input %>% 
      tidyr::gather(key = INDIVIDUALS_ALLELES, value = GT, -LOCUS) %>%
      mutate(INDIVIDUALS = stri_replace_all_fixed(str = INDIVIDUALS_ALLELES, pattern = c("_A1", "_A2"), replacement = "", vectorize_all = FALSE)) %>% 
      left_join(strata.df, by = "INDIVIDUALS") %>% 
      mutate(
        POP_ID = factor(POP_ID, levels = pop.levels, ordered =TRUE),
        GT = stri_pad_left(str = GT, width = 3, pad = "0")
      )
    
    # Pop select
    if (!is.null(pop.select)) {
      message(stri_join(length(pop.select), "population(s) selected", sep = " "))
      input <- suppressWarnings(input %>% filter(POP_ID %in% pop.select))
    }
    
    # removing untyped markers across all-pop
    remove.missing.gt <- input %>%
      select(LOCUS, GT) %>%
      filter(GT != "000")
    
    untyped.markers <- n_distinct(input$LOCUS) - n_distinct(remove.missing.gt$LOCUS)
    if (untyped.markers > 0) {
      message(paste0("Number of marker with 100 % missing genotypes: ", untyped.markers))
      input <- suppressWarnings(
        semi_join(input, 
                  remove.missing.gt %>% 
                    select(LOCUS) %>% 
                    distinct(LOCUS), 
                  by = "LOCUS")
      )
    }
    
    # Removing monomorphic markers
    if (monomorphic.out == TRUE) {
      message("Removing monomorphic markers...")
      mono.markers <- remove.missing.gt %>%
        group_by(LOCUS, GT) %>%
        distinct %>% 
        group_by(LOCUS) %>%
        tally %>% 
        filter(n == 1) %>% 
        select(LOCUS) %>% 
        arrange(LOCUS)
      
      # Remove the markers from the dataset
      input <- anti_join(input, mono.markers, by = "LOCUS")
      message(paste0("Number of monomorphic markers removed: ", n_distinct(mono.markers$LOCUS)))
    }
    # Unused objects
    tped.header.prep <- NULL
    tped.header.integer <- NULL
    tped.header.names <- NULL
    remove.missing.gt <- NULL
    mono.markers <- NULL
  } # End import PLINK
  
  if (data.type == "df.file") { # DATA FRAME OF GENOTYPES
    message("Importing the data frame")
    input <- data.table::fread(
      input = data, 
      sep = "\t", 
      header = TRUE, 
      stringsAsFactors = FALSE, 
      verbose = FALSE,
      showProgress = TRUE,
      data.table = FALSE
    ) %>% 
      as_data_frame() %>% 
      tidyr::gather(key = LOCUS, value = GT, -c(INDIVIDUALS, POP_ID)) %>% 
      mutate(
        POP_ID = factor(stri_replace_all_fixed(POP_ID, pop.levels, pop.labels, vectorize_all = FALSE), levels = unique(pop.labels), ordered =T),
        GT = stri_pad_left(str = GT, width = 6, pad = "0")
      )
    
    # Filter with whitelist of markers
    if (!is.null(whitelist.markers)) {
      message("Filtering with whitelist of markers")
      input <- suppressWarnings(semi_join(input, whitelist.markers, by = columns.names.whitelist))
    }
    
    # Filter with blacklist of individuals
    if (!is.null(blacklist.id)) {
      message("Filtering with blacklist of individuals")
      input <- suppressWarnings(anti_join(input, blacklist.id, by = "INDIVIDUALS"))
    }
    
    # Using the argument strata if provided to replace the current one
    if (is.null(strata)) {
      strata.df <- input %>% select(INDIVIDUALS, POP_ID) %>% distinct(INDIVIDUALS, POP_ID)
    } else {
      strata.df <- read_tsv(file = strata, col_names = TRUE, col_types = "cc") %>% 
        rename(POP_ID = STRATA)
      input <- left_join(input, strata.df, by = "INDIVIDUALS")
    }
    
    # Pop select
    if (!is.null(pop.select)) {
      message(stri_join(length(pop.select), "population(s) selected", sep = " "))
      input <- suppressWarnings(input %>% filter(POP_ID %in% pop.select))
    }
  } # End import data frame of genotypes
  
  if (data.type == "haplo.file") { # Haplotype file
    message("Importing the stacks haplotype file")
    input <- data.table::fread(
      input = data, 
      sep = "\t", 
      header = TRUE, 
      stringsAsFactors = FALSE, 
      verbose = FALSE,
      showProgress = TRUE,
      data.table = FALSE
    ) %>% 
      as_data_frame() %>% 
      select(-Cnt) %>% 
      rename(LOCUS = `Catalog ID`) %>%
      tidyr::gather(INDIVIDUALS, GT, -LOCUS)
    
    # Filter with whitelist of markers
    if (!is.null(whitelist.markers)) {
      message("Filtering with whitelist of markers")
      input <- suppressWarnings(semi_join(input, whitelist.markers, by = columns.names.whitelist))
    }
    
    # Filter with blacklist of individuals
    if (!is.null(blacklist.id)) {
      message("Filtering with blacklist of individuals")
      input <- suppressWarnings(anti_join(input, blacklist.id, by = "INDIVIDUALS"))
    }
    
    if (is.null(strata)){
      input <- input %>%
        mutate( # Make population ready
          POP_ID = substr(INDIVIDUALS, pop.id.start, pop.id.end),
          POP_ID = factor(stri_replace_all_fixed(POP_ID, pop.levels, pop.labels, vectorize_all = FALSE), levels = unique(pop.labels), ordered = TRUE),
          INDIVIDUALS =  as.character(INDIVIDUALS)
        )
    } else { # Make population ready with the strata provided
      strata.df <- read_tsv(file = strata, col_names = TRUE, col_types = "cc") %>% 
        rename(POP_ID = STRATA)
      
      input <- input %>%
        mutate(INDIVIDUALS =  as.character(INDIVIDUALS)) %>% 
        left_join(strata.df, by = "INDIVIDUALS") %>% 
        mutate(POP_ID = factor(POP_ID, levels = unique(pop.labels), ordered =TRUE))
    }
    
    # Pop select
    if (!is.null(pop.select)) {
      message(stri_join(length(pop.select), "population(s) selected", sep = " "))
      input <- suppressWarnings(input %>% filter(POP_ID %in% pop.select))
    }
  } # End import haplotypes file
  
  # Blacklist genotypes ********************************************************
  if (is.null(blacklist.genotype)) { # no Whitelist
    message("Erasing genotype: no")
  } else {
    message("Erasing genotype: yes")
    blacklist.genotype <- read_tsv(blacklist.genotype, col_names = TRUE)
    columns.names.blacklist.genotype <- colnames(blacklist.genotype)
    if ("CHROM" %in% columns.names.blacklist.genotype) {
      columns.names.blacklist.genotype$CHROM <- as.character(columns.names.blacklist.genotype$CHROM)
    }
    
    if (data.type == "haplo.file") {
      blacklist.genotype <- select(.data = blacklist.genotype, INDIVIDUALS, LOCUS)
      columns.names.blacklist.genotype <- colnames(blacklist.genotype)
    }
    
    # control check to keep only individuals in pop.select
    if (!is.null(pop.select)) {
      message("Control check to keep only individuals present in pop.select")
      # updating the blacklist.genotype
      if (is.null(strata)){
        blacklist.genotype <- suppressWarnings(
          blacklist.genotype  %>% 
            mutate( # Make population ready
              POP_ID = substr(INDIVIDUALS, pop.id.start, pop.id.end),
              POP_ID = factor(stri_replace_all_fixed(POP_ID, pop.levels, pop.labels, vectorize_all = F), levels = unique(pop.labels), ordered =T),
              INDIVIDUALS =  as.character(INDIVIDUALS) 
            ) %>% 
            filter(POP_ID %in% pop.select) %>% 
            select(-POP_ID)
        )
      } else {
        blacklist.genotype <- suppressWarnings(
          blacklist.genotype %>%
            mutate(INDIVIDUALS =  as.character(INDIVIDUALS)) %>% 
            left_join(strata.df, by = "INDIVIDUALS") %>% 
            filter(POP_ID %in% pop.select) %>% 
            select(-POP_ID)
        )
      }
    }
    
    # control check to keep only whitelisted markers from the blacklist of genotypes
    if (!is.null(whitelist.markers)) {
      message("Control check to keep only whitelisted markers present in the blacklist of genotypes to erase.")
      # updating the whitelist of markers to have all columns that id markers
      if (data.type == "vcf.file"){
        whitelist.markers.ind <- input %>% select(CHROM, LOCUS, POS, INDIVIDUALS) %>% distinct(CHROM, LOCUS, POS, INDIVIDUALS)
      } else {
        whitelist.markers.ind <- input %>% select(LOCUS, INDIVIDUALS) %>% distinct(LOCUS, INDIVIDUALS)
      }
      
      # updating the blacklist.genotype
      blacklist.genotype <- suppressWarnings(semi_join(whitelist.markers.ind, blacklist.genotype, by = columns.names.blacklist.genotype))
      columns.names.blacklist.genotype <- colnames(blacklist.genotype)
    }
    
    # control check to remove blacklisted individuals from the blacklist of genotypes
    if (!is.null(blacklist.id)) {
      message("Control check to remove blacklisted individuals present in the blacklist of genotypes to erase.")
      blacklist.genotype <- suppressWarnings(anti_join(blacklist.genotype, blacklist.id, by = "INDIVIDUALS"))
      columns.names.blacklist.genotype <- colnames(blacklist.genotype)
    }
    
    # Add one column that will allow to include the blacklist in the dataset 
    # by x column(s) of markers
    blacklist.genotype <- mutate(.data = blacklist.genotype, ERASE = rep("erase", n()))
    
    input <- suppressWarnings(
      input %>%
        full_join(blacklist.genotype, by = columns.names.blacklist.genotype) %>%
        mutate(ERASE = stri_replace_na(str = ERASE, replacement = "ok"))
    )
    
    if (data.type == "vcf.file") {
      input <- input %>% 
        mutate(GT = ifelse(ERASE == "erase", "./.", GT)) %>% 
        select(-ERASE)
    }
    
    if (data.type == "plink.file") {
      input <- input %>% 
        mutate(GT = ifelse(ERASE == "erase", "000", GT)) %>% 
        select(-ERASE)
    }
    
    if (data.type == "df.file") {
      input <- input %>% 
        mutate(GT = ifelse(ERASE == "erase", "000000", GT)) %>% 
        select(-ERASE)
    }
    
    if (data.type == "haplo.file") {
      input <- input %>% 
        mutate(GT = ifelse(ERASE == "erase", "-", GT)) %>% 
        select(-ERASE)
    }
    
  } # End erase genotypes
  
  # dump unused object
  blacklist.id <- NULL
  whitelist.markers <- NULL
  whitelist.markers.ind <- NULL
  blacklist.genotype <- NULL
  
  # subsampling data ***********************************************************
  # Function:
  subsampling_data <- function(iteration.subsample, ...) {
    # message(paste0("Creating data subsample: ", iteration.subsample))
    if (is.null(subsample)) {
      subsample.select <- ind.pop.df %>% 
        mutate(SUBSAMPLE = rep(iteration.subsample, n()))
    } else {
      if (subsample > 1) { # integer
        subsample.select <- ind.pop.df %>%
          group_by(POP_ID) %>%
          sample_n(subsample, replace = FALSE) %>% # sampling individuals for each pop
          arrange(POP_ID, INDIVIDUALS) %>% 
          mutate(SUBSAMPLE = rep(iteration.subsample, n())) %>% 
          ungroup()
      }
      if (subsample < 1) { # proportion
        subsample.select <- ind.pop.df %>%
          group_by(POP_ID) %>%
          sample_frac(subsample, replace = FALSE) %>% # sampling individuals for each pop
          arrange(POP_ID, INDIVIDUALS) %>% 
          mutate(SUBSAMPLE = rep(iteration.subsample, n())) %>% 
          ungroup()
      }
    }
    return(subsample.select)
  } # End subsampling function
  # create the subsampling list
  ind.pop.df <- input %>% select(POP_ID, INDIVIDUALS) %>% distinct(POP_ID, INDIVIDUALS)
  subsample.list <- map(.x = 1:iteration.subsample, .f = subsampling_data, subsample = subsample)
  
  # keep track of subsampling individuals and write to directory
  if (is.null(subsample)) {
    message("Subsampling: not selected")
  } else {
    message("Subsampling: selected")
    subsampling.individuals <- bind_rows(subsample.list)
    write_tsv(x = subsampling.individuals, path = paste0(directory, "subsampling.individuals.tsv"), col_names = TRUE, append = FALSE)
  } # End subsampling
  
  # unused objects
  subsampling.individuals <- NULL
  ind.pop.df <- NULL
  
  # assignment analysis ********************************************************
  # Function:
  assignment_function <- function(data, ...) {
    # data <- subsample.list[[1]] # test
    subsampling.individuals <- data
    subsample.id <- unique(subsampling.individuals$SUBSAMPLE)
    
    if (!is.null(subsample)) {
      message(paste("Analyzing subsample: ", subsample.id))
    }
    
    # Updating directories for subsampling
    if (is.null(subsample)) {
      directory.subsample <- directory
    } else {
      directory.subsample <- paste0(directory, "subsample_", subsample.id, "/")
      dir.create(file.path(directory.subsample))
    }
    
    # Keep only the subsample
    input <- input %>%
      semi_join(subsampling.individuals, by = c("POP_ID", "INDIVIDUALS"))
    
    # unused object
    data <- NULL
    subsampling.individuals <- NULL
    
    # LD control... keep only 1 SNP per haplotypes/reads (optional) ************
    if (!is.null(snp.ld)) {
      if (data.type != "vcf.file") {
        stop("snp.ld is only available for VCF file, use stackr package for 
haplotype file and create a whitelist, for other file type, use 
             PLINK linkage disequilibrium based SNP pruning option")
      }
      message("Minimizing LD...")
      snp.locus <- input %>% select(LOCUS, POS) %>% distinct(LOCUS, POS)
      # Random selection
      if (snp.ld == "random") {
        snp.select <- snp.locus %>%
          group_by(LOCUS) %>%
          sample_n(size = 1, replace = FALSE)
        message(stri_join("Number of original SNP = ", n_distinct(snp.locus$POS), "\n", "Number of SNP randomly selected to keep 1 SNP per read/haplotype = ", n_distinct(snp.select$POS), "\n", "Number of SNP removed = ", n_distinct(snp.locus$POS) - n_distinct(snp.select$POS)))
      }
      
      # First SNP on the read
      if (snp.ld == "first") {
        snp.select <- snp.locus %>%
          group_by(LOCUS) %>%
          summarise(POS = min(POS))
        message(stri_join("Number of original SNP = ", n_distinct(snp.locus$POS), "\n", "Number of SNP after keeping the first SNP on the read/haplotype = ", n_distinct(snp.select$POS), "\n", "Number of SNP removed = ", n_distinct(snp.locus$POS) - n_distinct(snp.select$POS)))
      }
      
      # Last SNP on the read
      if (snp.ld == "last") {
        snp.select <- snp.locus %>%
          group_by(LOCUS) %>%
          summarise(POS = max(POS))
        message(stri_join("Number of original SNP = ", n_distinct(snp.locus$POS), "\n", "Number of SNP after keeping the first SNP on the read/haplotype = ", n_distinct(snp.select$POS), "\n", "Number of SNP removed = ", n_distinct(snp.locus$POS) - n_distinct(snp.select$POS)))
      }
      
      # filtering the VCF to minimize LD
      input <- input %>% semi_join(snp.select, by = c("LOCUS", "POS"))
      message("Filtering the tidy VCF to minimize LD by keeping only 1 SNP per short read/haplotype")
    } # End of snp.ld control
    
    # Unique markers id: for VCF combine CHROM, LOCUS and POS into MARKERS *****
    # For other type of file change LOCUS to MARKERS
    if (data.type != "vcf.file") {
      input <- input %>% rename(MARKERS = LOCUS)
    } else {
      input <- input %>%
        mutate(
          POS = stri_pad_left(str = POS, width = 8, pad = "0"),
          LOCUS = stri_pad_left(str = LOCUS, width = 8, pad = "0")
        ) %>%
        arrange(CHROM, LOCUS, POS) %>%
        tidyr::unite(MARKERS, c(CHROM, LOCUS, POS), sep = "_")
    } # End Unique markers id
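    # e.g. a VCF marker with CHROM "1", LOCUS "317" and POS "45" is now identified
    # as MARKERS "1_00000317_00000045" (LOCUS and POS are zero-padded to 8 characters).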
    
    # Markers in common between all populations (optional) *********************
    if (common.markers == TRUE) { # keep only markers present in all pop
      message("Using markers common in all populations:")
      pop.number <- n_distinct(input$POP_ID)
      
      if (data.type == "vcf.file") pop.filter <- input %>% filter(GT != "./.")
      if (data.type == "plink.file") pop.filter <- input %>% filter(GT != "000")
      if (data.type == "df.file") pop.filter <- input %>% filter(GT != "000000")
      if (data.type == "haplo.file") pop.filter <- input %>% filter(GT != "-")
      
      pop.filter <- pop.filter %>% 
        group_by(MARKERS) %>%
        filter(n_distinct(POP_ID) == pop.number) %>%
        arrange(MARKERS) %>%
        select(MARKERS) %>%
        distinct(MARKERS)
      
      message(stri_join("Number of original markers = ", n_distinct(input$MARKERS), 
                        "\n", "Number of markers present in all the populations = ", 
                        n_distinct(pop.filter$MARKERS), "\n", 
                        "Number of markers removed = ", 
                        n_distinct(input$MARKERS) - n_distinct(pop.filter$MARKERS))
      )
      input <- suppressWarnings(input %>% semi_join(pop.filter, by = "MARKERS"))
      pop.filter <- NULL # ununsed object
    } # End common markers
    
    # Minor Allele Frequency filter ********************************************
    # maf.thresholds <- c(0.05, 0.1) # test
    if (!is.null(maf.thresholds)) { # with MAF
      maf.local.threshold <- maf.thresholds[1]
      maf.global.threshold <- maf.thresholds[2]
      message("MAF filter: yes")
      
      if (data.type == "vcf.file") {
        maf.local <- input %>%
          filter(GT != "./.") %>%
          group_by(MARKERS, POP_ID, REF, ALT) %>%
          summarise(
            N = as.numeric(n()),
            PQ = as.numeric(length(GT[GT == "1/0" | GT == "0/1"])),
            QQ = as.numeric(length(GT[GT == "1/1"]))
          ) %>%
          mutate(MAF_LOCAL = ((QQ * 2) + PQ) / (2 * N))
        
        maf.global <- maf.local %>%
          group_by(MARKERS) %>%
          summarise_each_(funs(sum), vars = c("N", "PQ", "QQ")) %>%
          mutate(MAF_GLOBAL = ((QQ * 2) + PQ) / (2 * N)) %>%
          select(MARKERS, MAF_GLOBAL)
        
        maf.data <- maf.global %>%
          left_join(maf.local, by = c("MARKERS")) %>%
          select(MARKERS, POP_ID, MAF_LOCAL, MAF_GLOBAL)
        
        maf.local <- NULL
        maf.global <- NULL
      } # end maf calculations with vcf
      
      if (data.type == "plink.file" | data.type == "df.file") {
        message("Calculating global and local MAF, this may take some time on large data set")
        
        # For data frame we split the alleles here to prep for MAF
        if (data.type == "df.file") { # for data frame of genotypes
          maf.data <- input %>% 
            tidyr::separate(data = ., col = GT, into = c("A1", "A2"), sep = 3, remove = TRUE) %>% 
            tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>%
            select(MARKERS, GT, POP_ID) %>% 
            filter(GT != "000")
        }
        
        if (data.type == "plink.file") { # For PLINK and common code below
          maf.data <- input %>%
            select(MARKERS, GT, POP_ID) %>% 
            filter(GT != "000")
        }
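        # MAF from allele counts: per marker, MAF_GLOBAL is the total count of the
        # least frequent allele (summed over populations) divided by the total number
        # of alleles; MAF_LOCAL is the within-population frequency of that
        # population's least frequent allele (the row kept by filter(n == min(n))).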
        
        maf.data <- maf.data %>%
          group_by(MARKERS, GT, POP_ID) %>%
          tally %>%
          arrange(MARKERS, GT) %>% 
          group_by(MARKERS, GT) %>%
          mutate(sum.pop = sum(n)) %>% 
          group_by(MARKERS) %>%
          mutate(MAF_GLOBAL = min(sum.pop)/sum(n)) %>% 
          group_by(MARKERS, POP_ID) %>%
          mutate(MAF_LOCAL = n/sum(n)) %>% 
          arrange(MARKERS, POP_ID, GT) %>% 
          group_by(MARKERS, POP_ID) %>% 
          filter(n == min(n)) %>% 
          distinct(MARKERS, POP_ID) %>% 
          select(MARKERS, POP_ID, MAF_LOCAL, MAF_GLOBAL)
      }# end maf calculations with PLINK or data frame of genotypes
      
      if (data.type == "haplo.file") {
        stop("MAF filtering is only available for haplotype file, use stackr
package and update your whitelist")
      }
      
      write_tsv(x = maf.data, 
                path = paste0(directory.subsample,"maf.data.tsv"), 
                col_names = TRUE, 
                append = FALSE
      )
      message("The MAF table was written in your folder")
      
      # # update the vcf with the maf info
      # input <- full_join(input, maf.data, by = c("MARKERS", "POP_ID"))
      if (maf.approach == "haplotype") {
        if (data.type != "vcf.file") {
          stop("The haplotype approach during MAF filtering is for VCF files only")
        }
        vcf.maf <- tidyr::separate(data = maf.data, 
                                   col = MARKERS, 
                                   into = c("CHROM", "LOCUS", "POS"), 
                                   sep = "_", 
                                   remove = FALSE, 
                                   extra = "warn"
        )
        
        if (maf.operator == "OR") {
          vcf.maf <- vcf.maf %>%
            group_by(LOCUS, POP_ID) %>%
            summarise(
              MAF_GLOBAL = mean(MAF_GLOBAL, na.rm = TRUE),
              MAF_LOCAL = min(MAF_LOCAL, na.rm = TRUE)
            ) %>%
            filter(MAF_LOCAL >= maf.local.threshold | MAF_GLOBAL >= maf.global.threshold) %>%
            group_by(LOCUS) %>%
            tally() %>%
            filter(n >= maf.pop.num.threshold) %>%
            select(LOCUS) %>%
            left_join(input, by = "LOCUS") %>%
            arrange(LOCUS, POP_ID)
        } else { # AND operator between local and global maf
          vcf.maf <- vcf.maf %>%
            group_by(LOCUS, POP_ID) %>%
            summarise(
              MAF_GLOBAL = mean(MAF_GLOBAL, na.rm = TRUE),
              MAF_LOCAL = min(MAF_LOCAL, na.rm = TRUE)
            ) %>%
            filter(MAF_LOCAL >= maf.local.threshold & MAF_GLOBAL >= maf.global.threshold) %>%
            group_by(LOCUS) %>%
            tally() %>%
            filter(n >= maf.pop.num.threshold) %>%
            select(LOCUS) %>%
            left_join(input, by = "LOCUS") %>%
            arrange(LOCUS, POP_ID)
        }
        vcf.maf <- vcf.maf %>% select(-c(CHROM, LOCUS, POS))
      } # end maf haplotype approach
      
      if (maf.approach == "SNP") { # SNP approach
        if (maf.operator == "OR") {
          vcf.maf <- maf.data %>%
            group_by(MARKERS, POP_ID) %>%
            summarise(
              MAF_GLOBAL = mean(MAF_GLOBAL, na.rm = TRUE),
              MAF_LOCAL = min(MAF_LOCAL, na.rm = TRUE)
            ) %>%
            filter(MAF_LOCAL >= maf.local.threshold | MAF_GLOBAL >= maf.global.threshold) %>%
            group_by(MARKERS) %>%
            tally() %>%
            filter(n >= maf.pop.num.threshold) %>%
            select(MARKERS) %>%
            left_join(input, by = "MARKERS") %>%
            arrange(MARKERS, POP_ID)
        } else { # AND operator between local and global maf
          vcf.maf <- maf.data %>%
            group_by(MARKERS, POP_ID) %>%
            summarise(
              MAF_GLOBAL = mean(MAF_GLOBAL, na.rm = TRUE),
              MAF_LOCAL = min(MAF_LOCAL, na.rm = TRUE)
            ) %>%
            filter(MAF_LOCAL >= maf.local.threshold & MAF_GLOBAL >= maf.global.threshold) %>%
            group_by(MARKERS) %>%
            tally() %>%
            filter(n >= maf.pop.num.threshold) %>%
            select(MARKERS) %>%
            left_join(input, by = "MARKERS") %>%
            arrange(MARKERS, POP_ID)
        }
      } # end maf snp approach
      
      
      message(stri_join("The number of MARKERS removed by the MAF filters = ", 
                        n_distinct(input$MARKERS)-n_distinct(vcf.maf$MARKERS), "\n", 
                        "The number of MARKERS before -> after the MAF filters: ", 
                        n_distinct(input$MARKERS)," -> ", n_distinct(vcf.maf$MARKERS), 
                        " MARKERS"))
      
      input <- vcf.maf
      
      # unused object
      vcf.maf <- NULL 
      maf.data <- NULL
    } # End of MAF filters
    
    # Change the genotype coding  **********************************************
    message("Recoding genotypes")
    # adegenet & gsi_sim
    
    if (data.type == "vcf.file" & assignment.analysis == "gsi_sim") { # for VCF input
      input <- input %>%
        mutate(
          REF= stri_replace_all_fixed(str = REF, pattern = c("A", "C", "G", "T"), replacement = c("1", "2", "3", "4"), vectorize_all = FALSE), # replace nucleotide with numbers
          ALT = stri_replace_all_fixed(str = ALT, pattern = c("A", "C", "G", "T"), replacement = c("1", "2", "3", "4"), vectorize_all = FALSE),# replace nucleotide with numbers
          GT = ifelse(GT == "0/0", stri_join(REF, REF, sep = "_"),
                      ifelse(GT == "1/1",  stri_join(ALT, ALT, sep = "_"),
                             ifelse(GT == "0/1", stri_join(REF, ALT, sep = "_"),
                                    ifelse(GT == "1/0", stri_join(ALT, REF, sep = "_"), "0_0")
                             )
                      )
          )
        ) %>%
        arrange(MARKERS, POP_ID) %>%
        select(-c(REF, ALT))
      
      gsim.prep <- input %>%
        tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_") %>%  # separate the genotypes into alleles
        tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID))
    }
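    # For adegenet, VCF genotypes are converted to allele counts per individual:
    # "0/0" -> 2 copies of REF and 0 of ALT ("2_0"), "1/1" -> "0_2",
    # heterozygotes -> "1_1" and missing "./." -> NA. Counts are then spread
    # into one MARKERS.A1/MARKERS.A2 column pair per marker, matching the
    # allele-count layout expected by the genind constructor below.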
    if (data.type == "vcf.file" & assignment.analysis == "adegenet") {
      genind.prep <- input %>%
        mutate(GT = stri_replace_all_fixed(str = GT, pattern = "/", replacement = ":", vectorize_all = FALSE)) %>%
        mutate(GT = stri_replace_all_fixed(str = GT, pattern = c("0:0", "1:1", "0:1", "1:0", ".:."), replacement = c("2_0", "0_2", "1_1", "1_1", "NA_NA"), vectorize_all = FALSE)) %>%
        select(-REF, -ALT) %>% 
        arrange(MARKERS, POP_ID) %>%
        tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>%
        tidyr::gather(key = ALLELES, value = COUNT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% # make tidy
        tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
        mutate(COUNT = replace(COUNT, which(COUNT == "NA"), NA)) %>% 
        group_by(POP_ID, INDIVIDUALS) %>%
        tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>%
        arrange(POP_ID, INDIVIDUALS)
    }
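    # STACKS haplotypes: "-" is recoded as missing ("000/000") and a single
    # haplotype (no "/") is treated as a homozygote, so A2 is filled with A1.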
    
    if (data.type == "haplo.file") { # for haplotypes
      gsim.prep <- suppressWarnings(
        input %>%
          mutate(GT = stri_replace_all_fixed(GT, "-", "000/000", vectorize_all = FALSE)) %>%
          arrange(MARKERS) %>% 
          tidyr::separate(
            col = GT, into = c("A1", "A2"), 
            sep = "/", extra = "drop", remove = TRUE
          ) %>%
          mutate(
            A2 = stri_replace_na(str = A2, replacement = "000"),
            A2 = ifelse(A2 == "000", A1, A2)
            ) %>%
          tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>%
          ungroup()
      )
    }
    if (data.type == "haplo.file" & assignment.analysis == "gsi_sim") {
      input <- suppressWarnings(
        gsim.prep %>%
          filter(GT != "000") %>% 
          tidyr::spread(data = ., key = MARKERS, value = GT) %>% # this reintroduces the missing genotypes, but as NA
          ungroup() %>% 
          plyr::colwise(.fun = factor, exclude = NA)(.)
      )
      
      input <- suppressWarnings(
        input %>%
          ungroup() %>% 
          mutate_each(funs(as.integer), -c(INDIVIDUALS, POP_ID, ALLELES)) %>%
          ungroup() %>% 
          tidyr::gather(data = ., key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
          mutate(
            GT = stri_replace_na(str = GT, replacement = "000"),
            GT = stri_pad_left(str = GT, width = 3, pad = "0")
            ) %>% 
          tidyr::spread(data = ., key = ALLELES, value = GT) %>% 
          tidyr::unite(data = ., GT, A1, A2, sep = "", remove = TRUE)
      )
      
      # for haplo.file, rebuild gsim.prep from the recoded genotypes above
      gsim.prep <- input %>% 
        tidyr::separate(data = ., col = GT, into = c("A1", "A2"), sep = 3, remove = TRUE) %>% 
        tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) 
    }
    if (data.type == "df.file") { # For data frame of genotypes
      gsim.prep <- input %>% 
        tidyr::separate(data = ., col = GT, into = c("A1", "A2"), sep = 3, remove = TRUE) %>% 
        tidyr::gather(data = ., key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID)) 
    }
    if (data.type == "plink.file") { # for PLINK
      message("Recoding genotypes for gsi_sim")
      gsim.prep <- input %>% 
        tidyr::separate(col = INDIVIDUALS_ALLELES, into = c("INDIVIDUALS_2", "ALLELES"), sep = "_", remove = TRUE) %>% 
        select(-INDIVIDUALS_2)
    }
    if (data.type == "df.file" | data.type == "plink.file" | data.type == "haplo.file") {
      if (assignment.analysis == "adegenet" ) {
        genind.prep <- suppressWarnings(
          gsim.prep %>%
            filter(GT != "000") %>% 
            tidyr::spread(data = ., key = MARKERS, value = GT) %>% # this reintroduces the missing genotypes, but as NA
            ungroup() %>% 
            plyr::colwise(.fun = factor, exclude = NA)(.)
        )
        
        genind.prep <- suppressWarnings(
          genind.prep %>%
            ungroup() %>% 
            mutate_each(funs(as.integer), -c(INDIVIDUALS, POP_ID, ALLELES)) %>%
            ungroup() %>% 
            tidyr::gather(data = ., key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
            mutate(GT = stri_replace_na(str = GT, replacement = "000")) %>%
            filter(GT != "000") %>%
            select(-ALLELES) %>%
            group_by(POP_ID, INDIVIDUALS, MARKERS, GT) %>% 
            tally %>%
            ungroup() %>%
            tidyr::unite(MARKERS_ALLELES, MARKERS, GT, sep = ":", remove = TRUE) %>%
            arrange(POP_ID, INDIVIDUALS, MARKERS_ALLELES) %>% 
            group_by(POP_ID, INDIVIDUALS) %>% 
            tidyr::spread(data = ., key = MARKERS_ALLELES, value = n) %>%
            ungroup() %>%
            tidyr::gather(data = ., key = MARKERS_ALLELES, value = COUNT, -c(INDIVIDUALS, POP_ID)) %>% 
            tidyr::separate(data = ., col = MARKERS_ALLELES, into = c("MARKERS", "ALLELES"), sep = ":", remove = TRUE) %>% 
            mutate(COUNT = as.numeric(stri_replace_na(str = COUNT, replacement = "0"))) %>% 
            group_by(INDIVIDUALS, MARKERS) %>%
            mutate(MAX_COUNT_MARKERS = max(COUNT, na.rm = TRUE)) %>%
            ungroup() %>% 
            mutate(COUNT = ifelse(MAX_COUNT_MARKERS == 0, "erase", COUNT)) %>%
            select(-MAX_COUNT_MARKERS) %>% 
            mutate(COUNT = replace(COUNT, which(COUNT == "erase"), NA)) %>% 
            arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES) %>% 
            tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
            tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>% 
            arrange(POP_ID, INDIVIDUALS)
        )
      }
    }
    
    # only adegenet
    if (assignment.analysis == "adegenet") {
      # genind arguments common to all data.type
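      # adegenet::genind() expects a table of allele counts (rows = individuals,
      # columns = MARKER.ALLELE) with individuals as rownames; POP_ID provides
      # the pop factor and the strata data frame.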
      ind <- as.character(genind.prep$INDIVIDUALS)
      pop <- genind.prep$POP_ID
      genind.df <- genind.prep %>% ungroup() %>% 
        select(-c(INDIVIDUALS, POP_ID))
      rownames(genind.df) <- ind
      loc.names <- colnames(genind.df)
      strata <- genind.prep %>% ungroup() %>% select(INDIVIDUALS, POP_ID) %>% distinct(INDIVIDUALS, POP_ID)
      
      # genind constructor
      prevcall <- match.call()
      genind.object <- genind(tab = genind.df, pop = pop, prevcall = prevcall, ploidy = 2, type = "codom", strata = strata, hierarchy = NULL)
      
      # sum <- summary(genind.object) # test
      # sum$NA.perc # test
      
      ind <- NULL
      pop <- NULL
      genind.df <- NULL
      genind.prep <- NULL
    }
    
    # Imputations **************************************************************
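    # Two imputation units are supported: impute = "genotype" treats each
    # genotype as a single category, while impute = "allele" splits genotypes
    # into A1/A2 and imputes each allele separately. In both cases the data is
    # reshaped to wide format (individuals x markers) with missing values
    # coded NA before imputation.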
    if (imputation.method != "FALSE") {
      message("Preparing the data for imputations")
      
      if (data.type == "vcf.file") { # for VCF input
        if (impute == "genotype") {
          input.prep <- input %>%
            select(-REF, -ALT) %>%
            mutate(GT = stri_replace_all_fixed(str = GT, pattern = "/", replacement = ":", vectorize_all = FALSE)) %>%
            mutate(
              GT = stri_replace_all_fixed(GT, pattern = ".:.", replacement = "NA", vectorize_all = FALSE),
              GT = replace(GT, which(GT == "NA"), NA)
            ) %>%
            group_by(INDIVIDUALS, POP_ID) %>% 
            tidyr::spread(data = ., key = MARKERS, value = GT) %>%
            ungroup() %>% 
            arrange(POP_ID, INDIVIDUALS)
        }
        
        if (impute == "allele") {
          input.prep <- input %>%
            select(-REF, -ALT) %>%
            mutate(GT = stri_replace_all_fixed(str = GT, pattern = "/", replacement = ":", vectorize_all = FALSE)) %>%
            mutate(
              GT = stri_replace_all_fixed(GT, pattern = ".:.", replacement = "NA:NA", vectorize_all = FALSE)
            ) %>% 
            tidyr::separate(col = GT, into = c("A1", "A2"), sep = ":") %>%  # separate the genotypes into alleles
            tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>%
            mutate(GT = replace(GT, which(GT == "NA"), NA)) %>%
            group_by(INDIVIDUALS, POP_ID, ALLELES) %>% 
            tidyr::spread(data = ., key = MARKERS, value = GT) %>% 
            ungroup() %>% 
            arrange(POP_ID, INDIVIDUALS)
        }
        
      } # End VCF prep file for imputation
      
      if (data.type == "plink.file") {
        
        if (impute == "genotype"){
          input.prep <- gsim.prep %>% 
            group_by(MARKERS, INDIVIDUALS, POP_ID) %>% 
            tidyr::spread(data = ., key = ALLELES, value = GT) %>% 
            tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE) %>% 
            mutate(
              GT = stri_replace_all_fixed(GT, pattern = "000000", replacement = "NA", vectorize_all = FALSE),
              GT = replace(GT, which(GT == "NA"), NA)
            ) %>%
            group_by(INDIVIDUALS, POP_ID) %>% 
            tidyr::spread(data = ., key = MARKERS, value = GT) %>%
            ungroup() %>% 
            arrange(POP_ID, INDIVIDUALS)
        }
        
        if (impute == "allele"){
          input.prep <- gsim.prep %>%
            mutate(
              GT = stri_replace_all_fixed(GT, pattern = "000", replacement = "NA", vectorize_all = FALSE),
              GT = replace(GT, which(GT == "NA"), NA)
            ) %>%
            group_by(INDIVIDUALS, POP_ID, ALLELES) %>% 
            tidyr::spread(data = ., key = MARKERS, value = GT) %>%
            ungroup() %>% 
            arrange(POP_ID, INDIVIDUALS)
        }
        
        # glimpse(test)
      } # End PLINK prep file for imputation
      
      if (data.type == "df.file" | data.type == "haplo.file") { # for df input
        if (impute == "genotype") {
          input.prep <- input %>%
            mutate(
              GT = stri_replace_all_fixed(GT, pattern = "000000", replacement = "NA", vectorize_all = FALSE),
              GT = replace(GT, which(GT == "NA"), NA)
            ) %>%
            group_by(INDIVIDUALS, POP_ID) %>% 
            tidyr::spread(data = ., key = MARKERS, value = GT) %>%
            ungroup() %>% 
            arrange(POP_ID, INDIVIDUALS)
        }
        if (impute == "allele") {
          input.prep <- gsim.prep %>%
            mutate(
              GT = stri_replace_all_fixed(GT, pattern = "000", replacement = "NA", vectorize_all = FALSE),
              GT = replace(GT, which(GT == "NA"), NA)
            ) %>%
            group_by(INDIVIDUALS, POP_ID, ALLELES) %>% 
            tidyr::spread(data = ., key = MARKERS, value = GT) %>% 
            ungroup() %>% 
            arrange(POP_ID, INDIVIDUALS)
        }
      } # End data frame prep file for imputation
      
      # Imputation with Random Forest
      if (imputation.method == "rf") {
        # Parallel computations options
        options(rf.cores = parallel.core, mc.cores = parallel.core)
        
        # imputations using Random Forest with the package randomForestSRC
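        # impute.rfsrc grows ntree trees per pass (nimpute passes) and fills
        # missing cells with values predicted from the other markers;
        # nsplit controls the number of random split points tried at each node.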
        impute_genotype_rf <- function(x) {
          randomForestSRC::impute.rfsrc(data = x,
                                        ntree = num.tree,
                                        nodesize = 1,
                                        nsplit = split.number,
                                        nimpute = iteration.rf,
                                        do.trace = verbose)
        } # End of imputation function
        
        # Random Forest by pop
        if (imputations.group == "populations") {
          message("Imputations computed by populations, take a break...")
          df.split.pop <- split(x = input.prep, f = input.prep$POP_ID) # slip data frame by population
          pop.list <- names(df.split.pop) # list the pop
          imputed.dataset <-list() # create empty list
          
          # Function to go through the populations
          impute_rf_pop <- function(pop.list, ...){
            sep.pop <- df.split.pop[[pop.list]]
            sep.pop <- suppressWarnings(
              plyr::colwise(factor, exclude = NA)(sep.pop)
            )
            # imputed.dataset[[i]] <- impute_markers_rf(sep.pop) # test with foreach
            imputed.dataset <- impute_genotype_rf(sep.pop)
            # message of progress for imputations by population
            message(paste("Completed imputations for pop ", pop.list, sep = ""))
            return(imputed.dataset)
          } # End impute_rf_pop
          
          input.imp <- list()
          input.imp <- parallel::mclapply(
            X = pop.list, 
            FUN = impute_rf_pop, 
            mc.preschedule = FALSE, 
            mc.silent = FALSE, 
            mc.cores = parallel.core
          )
          
          # Compiling the results
          message("Compiling imputations results")
          input.imp <- suppressWarnings(bind_rows(input.imp))
          
          # Second round of imputations (conducted globally) to remove the NA 
          # introduced when some populations lack the markers
          input.imp <- suppressWarnings(plyr::colwise(factor, exclude = NA)(input.imp)) # Make the columns factor
          input.imp <- impute_genotype_rf(input.imp) # impute globally
          
          # dump unused objects
          df.split.pop <- NULL
          pop.list <- NULL
          sep.pop <- NULL
          imputed.dataset <- NULL
          input.prep <- NULL
          
        } # End imputation RF populations
        # Random Forest global
        if (imputations.group == "global") { # Globally (not by pop_id)
          message("Imputations computed globally, take a break...")
          input.prep <- plyr::colwise(factor, exclude = NA)(input.prep)
          input.imp <- impute_genotype_rf(input.prep)
          
          input.prep <- NULL # remove unused object
        } # End imputation RF global
      } # End imputation RF
      # Imputation using the most common genotype
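      # Note: max(GT, na.rm = TRUE) on the character genotypes returns the
      # lexicographically largest observed genotype of the group, and that
      # value is used as the replacement. If the modal (most frequent)
      # genotype is wanted instead, a helper along these lines could be
      # substituted (hypothetical sketch, not used below):
      # modal_gt <- function(x) names(which.max(table(x)))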
      if (imputation.method == "max") { # End imputation max
        if (imputations.group == "populations") {
          message("Imputations computed by populations")
          
          if (impute == "genotype"){
            input.imp <- suppressWarnings(
              input.prep %>%
                tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>%
                group_by(MARKERS, POP_ID) %>%
                mutate(
                  GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE)),
                  GT = replace(GT, which(GT == "NA"), NA)
                ) %>%
                # the next 2 steps remove the NA introduced when some populations lack the markers:
                # the globally observed value of the marker is used for those cases.
                group_by(MARKERS) %>%
                mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
                group_by(INDIVIDUALS, POP_ID) %>% 
                tidyr::spread(data = ., key = MARKERS, value = GT) %>%
                ungroup()
            )
          }
          
          if (impute == "allele"){
            input.imp <- suppressWarnings(
              input.prep %>%
                tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>%
                group_by(MARKERS, POP_ID) %>%
                mutate(
                  GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE)),
                  GT = replace(GT, which(GT == "NA"), NA)
                ) %>%
                # the next 2 steps remove the NA introduced when some populations lack the markers:
                # the globally observed value of the marker is used for those cases.
                group_by(MARKERS) %>%
                mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
                group_by(INDIVIDUALS, POP_ID, ALLELES) %>% 
                tidyr::spread(data = ., key = MARKERS, value = GT) %>%
                ungroup()
            )
          }
          
          input.prep <- NULL # remove unused object
          
        } # End imputation max populations 
        if (imputations.group == "global") {
          # Globally (not by pop_id)
          message("Imputations computed globally")
          if (impute == "genotype"){
            input.imp <- suppressWarnings(
              input.prep %>%
                tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>%
                group_by(MARKERS) %>%
                mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
                group_by(INDIVIDUALS, POP_ID) %>% 
                tidyr::spread(data = ., key = MARKERS, value = GT) %>%
                ungroup()
            )
          }
          
          if (impute == "allele"){
            input.imp <- suppressWarnings(
              input.prep %>%
                tidyr::gather(MARKERS, GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>%
                group_by(MARKERS) %>%
                mutate(GT = stri_replace_na(GT, replacement = max(GT, na.rm = TRUE))) %>%
                group_by(INDIVIDUALS, POP_ID, ALLELES) %>% 
                tidyr::spread(data = ., key = MARKERS, value = GT) %>%
                ungroup()
            )
          }
          
          input.prep <- NULL # remove unused object
        } # End imputation max global 
      } # End imputations max
      
      # prepare the imputed dataset for gsi_sim or adegenet
      message("Preparing imputed data set for gsi_sim...")
      if (assignment.analysis == "gsi_sim") {
        if (impute == "genotype") {
          if (data.type == "vcf.file") { # for VCF input
            gsi.prep.imp <- suppressWarnings(
              input.imp %>%
                tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID)) %>% # make tidy
                tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>%
                tidyr::gather(key = ALLELES, value = GT, -c(MARKERS, INDIVIDUALS, POP_ID))
            )
          }
          if (data.type == "plink.file" | data.type == "df.file" | data.type == "haplo.file") {
            gsi.prep.imp <- input.imp %>%
              tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID)) %>% 
              tidyr::separate(col = GT, into = c("A1", "A2"), sep = 3) %>%  # separate the genotypes into alleles
              tidyr::gather(key = ALLELES, GT, -c(MARKERS, INDIVIDUALS, POP_ID))
          }
        }
        if (impute == "allele") {
          gsi.prep.imp <- suppressWarnings(
            input.imp %>%
              tidyr::gather(key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES))
          )
        }
      }

      # adegenet
      if (data.type == "vcf.file" & assignment.analysis == "adegenet" ) {
        genind.prep.imp <- input.imp %>% 
          tidyr::gather(key = MARKERS, value = GT, -c(POP_ID, INDIVIDUALS, ALLELES)) %>% # make tidy
          tidyr::spread(data = ., key = ALLELES, value = GT) %>%
          tidyr::unite(GT, A1, A2, sep = ":", remove = TRUE) %>%
          mutate(GT = stri_replace_all_fixed(str = GT, pattern = c("0:0", "1:1", "0:1", "1:0"), replacement = c("2_0", "0_2", "1_1", "1_1"), vectorize_all = FALSE)) %>%
          arrange(MARKERS, POP_ID) %>%
          tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_", extra = "drop", remove = TRUE) %>%
          tidyr::gather(key = ALLELES, value = COUNT, -c(MARKERS, INDIVIDUALS, POP_ID)) %>% # make tidy
          tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
          group_by(POP_ID, INDIVIDUALS) %>%
          tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>%
          arrange(POP_ID, INDIVIDUALS)
      }
      if (data.type == "df.file" | data.type == "plink.file" | data.type == "haplo.file") {
        if (assignment.analysis == "adegenet" ) {
          genind.prep.imp <- suppressWarnings(
            input.imp %>%
              ungroup() %>% 
              plyr::colwise(.fun = factor, exclude = NA)(.)
          )
          
          genind.prep.imp <- suppressWarnings(
            genind.prep.imp %>%
              ungroup() %>% 
              mutate_each(funs(as.integer), -c(INDIVIDUALS, POP_ID, ALLELES)) %>%
              ungroup() %>% 
              tidyr::gather(data = ., key = MARKERS, value = GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
              select(-ALLELES) %>%
              group_by(POP_ID, INDIVIDUALS, MARKERS, GT) %>% 
              tally %>%
              ungroup() %>%
              tidyr::unite(MARKERS_ALLELES, MARKERS, GT, sep = ":", remove = TRUE) %>%
              arrange(POP_ID, INDIVIDUALS, MARKERS_ALLELES) %>% 
              group_by(POP_ID, INDIVIDUALS) %>% 
              tidyr::spread(data = ., key = MARKERS_ALLELES, value = n) %>%
              ungroup() %>%
              tidyr::gather(data = ., key = MARKERS_ALLELES, value = COUNT, -c(INDIVIDUALS, POP_ID)) %>% 
              tidyr::separate(data = ., col = MARKERS_ALLELES, into = c("MARKERS", "ALLELES"), sep = ":", remove = TRUE) %>% 
              mutate(COUNT = as.numeric(stri_replace_na(str = COUNT, replacement = "0"))) %>% 
              ungroup() %>%
              arrange(POP_ID, INDIVIDUALS, MARKERS, ALLELES) %>% 
              tidyr::unite(MARKERS_ALLELES, MARKERS, ALLELES, sep = ".", remove = TRUE) %>%
              tidyr::spread(data = ., key = MARKERS_ALLELES, value = COUNT) %>% 
              arrange(POP_ID, INDIVIDUALS)
          )
        }
      }
      
      # only adegenet
      if (assignment.analysis == "adegenet") {
        # genind arguments common to all data.type
        ind <- as.character(genind.prep.imp$INDIVIDUALS)
        pop <- genind.prep.imp$POP_ID
        genind.df <- genind.prep.imp %>% ungroup() %>% 
          select(-c(INDIVIDUALS, POP_ID))
        rownames(genind.df) <- ind
        loc.names <- colnames(genind.df)
        strata <- genind.prep.imp %>% ungroup() %>% select(INDIVIDUALS, POP_ID) %>% distinct(INDIVIDUALS, POP_ID)
        
        # genind constructor
        prevcall <- match.call()
        genind.object.imp <- genind(tab = genind.df, pop = pop, prevcall = prevcall, ploidy = 2, type = "codom", strata = strata, hierarchy = NULL)
        
        # sum <- summary(genind.object.imp) # test
        # sum$NA.perc # test
        
        ind <- NULL
        pop <- NULL
        genind.df <- NULL
        # genind.prep <- NULL
        # genind.prep.imp <- NULL
      }
      
      input.imp <- NULL # remove unused object
    } # End imputations
    
    # Sampling of markers ******************************************************
    # unique list of markers after all the filtering
    # if "all" is present in the list, change to the maximum number of markers
    unique.markers <- input %>% 
      select(MARKERS) %>% 
      distinct(MARKERS) %>% 
      arrange(MARKERS)
    

    marker.number <- stri_replace_all_fixed(str = marker.number, pattern = "all", 
                                            replacement = nrow(unique.markers), 
                                            vectorize_all = TRUE)
    marker.number <- as.numeric(marker.number)
    
    # In marker.number, remove marker group higher than the max number of markers
    removing.marker <- purrr::keep(.x = marker.number, .p = marker.number > nrow(unique.markers))
    
    if (length(removing.marker) > 0) {
      message.marker <- stri_c(removing.marker, collapse = ", ")
      message("Removing marker.number higher than the max number of markers: ", message.marker)
    }
    
    marker.number <- purrr::discard(.x = marker.number, .p = marker.number > nrow(unique.markers))
    
    
    # Functions ******************************************************************
    # Fst function: Weir & Cockerham 1984
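    # Weir & Cockerham (1984) variance components, computed per marker and
    # summed over alleles: siga = among populations, sigb = among individuals
    # within populations, sigw = within individuals.
    # FST = siga / (siga + sigb + sigw); markers are then ranked by decreasing FST.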
    fst_WC84 <- function(data, holdout.samples, ...) {
      # data <- input # test
      # holdout.samples <- holdout$INDIVIDUALS # test
      
      # data.type <- data$GT[[1]] # VCF vs DF # test
      pop.number <- n_distinct(data$POP_ID)
      
      if (is.null(holdout.samples)) { # use all the individuals
        if (data.type == "vcf.file") { # for VCF input
          # if (stri_detect_fixed(data.type, "_")){ # for VCF file input # test
          data.genotyped <- data %>%
            filter(GT != "0_0")
        } else { # for df and haplotype files
          data.genotyped <- data %>%
            filter(GT != "000000") # Check for df and plink...
        }
      } else { # with holdout set
        if (data.type == "vcf.file") { # for VCF input
          # if (stri_detect_fixed(data.type, "_")){ # for VCF file input
          data.genotyped <- data %>%
            filter(GT != "0_0") %>% # remove missing genotypes
            filter(!INDIVIDUALS %in% holdout.samples) # remove supplementary individual before ranking markers with Fst
        } else { # for df and haplotype files
          data.genotyped <- data %>%
            filter(GT != "000000") %>% # remove missing genotypes
            filter(!INDIVIDUALS %in% holdout.samples) # remove supplementary individual before ranking markers with Fst
        }
      }
      
      n.pop.locus <- data.genotyped %>%
        select(MARKERS, POP_ID) %>%
        group_by(MARKERS) %>%
        distinct(POP_ID) %>%
        tally %>%
        rename(npl = n)
      
      ind.count.locus <- data.genotyped %>%
        group_by(MARKERS) %>%
        tally
      
      ind.count.locus.pop <- data.genotyped %>%
        group_by(POP_ID, MARKERS) %>%
        tally %>%
        rename(nal = n) %>%
        mutate(
          nal_sq = nal^2,
          N_IND_GENE = nal*2
        )
      
      #common
      if (data.type == "vcf.file") { # for VCF input
        # if (stri_detect_fixed(data.type, "_")){ # for VCF file input
        freq.al.locus <- data.genotyped %>%
          mutate(A1 = stri_sub(GT, 1, 1), A2 = stri_sub(GT, 3, 3)) %>% 
          select(-GT) %>%
          tidyr::gather(key = ALLELES_GROUP, ALLELES, -c(INDIVIDUALS, POP_ID, MARKERS))
        # tidyr::separate(col = GT, into = c("A1", "A2"), sep = "_") %>%  # test takes longer
      } else { # for DF, haplo and plink file input
        freq.al.locus <- data.genotyped %>%
          tidyr::separate(col = GT, into = c("A1", "A2"), sep = 3) %>%  # separate the genotypes into alleles
          tidyr::gather(key = ALLELES_GROUP, ALLELES, -c(INDIVIDUALS, POP_ID, MARKERS))
      }
      
      #pop
      freq.al.locus.pop <- suppressWarnings(
        freq.al.locus %>%
          group_by(POP_ID, MARKERS, ALLELES) %>%
          tally %>%
          full_join(ind.count.locus.pop, by = c("POP_ID", "MARKERS")) %>%
          mutate(P = n/N_IND_GENE) %>% # Freq. Allele per pop
          select(POP_ID, MARKERS, ALLELES, P) %>%
          group_by(MARKERS, ALLELES) %>%
          tidyr::spread(data = ., key = POP_ID, value = P) %>%
          tidyr::gather(key = POP_ID, value = P, -c(MARKERS, ALLELES)) %>%
          mutate(P = as.numeric(stri_replace_na(str = P, replacement = 0))) %>%
          full_join(ind.count.locus.pop, by = c("POP_ID", "MARKERS"))
      )    
      
      freq.al.locus.global <- suppressWarnings(
        freq.al.locus %>%
          group_by(MARKERS, ALLELES) %>%
          tally %>%
          full_join(
            ind.count.locus %>%
              rename(N = n), 
            by = "MARKERS"
          ) %>%
          mutate(pb = n/(2*N)) %>% # Global Freq. Allele
          select(MARKERS, ALLELES, pb)
      )    
      
      mean.n.pop.corrected.per.locus <- suppressWarnings(
        ind.count.locus.pop %>%
          group_by(MARKERS) %>%
          summarise(nal_sq_sum = sum(nal_sq, na.rm = TRUE)) %>%
          full_join(ind.count.locus, by = "MARKERS") %>%
          mutate(nal_sq_sum_nt = (n - nal_sq_sum/n)) %>%
          full_join(n.pop.locus, by = "MARKERS") %>%
          mutate(ncal = nal_sq_sum_nt/(npl-1)) %>%
          select(MARKERS, ncal)
      )
      
      ncal <- suppressWarnings(
        freq.al.locus %>%
          select(MARKERS, ALLELES) %>%
          group_by(MARKERS, ALLELES) %>%
          distinct(MARKERS, ALLELES) %>%
          full_join(ind.count.locus, by = "MARKERS") %>%
          rename(ntal = n) %>%
          full_join(mean.n.pop.corrected.per.locus, by = "MARKERS")
      )
      if (data.type == "vcf.file") { # for VCF input
        # if (stri_detect_fixed(data.type, "_")){ # for VCF file input
        data.genotyped.het <- data.genotyped %>%
          mutate(het = ifelse(stri_sub(GT, 1, 1) != stri_sub(GT, 3, 3), 1, 0))
      } else { # for DF file input
        data.genotyped.het <- data.genotyped %>%
          mutate(het = ifelse(stri_sub(GT, 1, 3) != stri_sub(GT, 4, 6), 1, 0))
      }
      
      fst.ranked <- suppressWarnings(
        data.genotyped.het %>%
          group_by(MARKERS, POP_ID) %>%
          summarise(mho = sum(het, na.rm = TRUE)) %>%  # = the number of heterozygote individuals per pop and markers
          group_by(MARKERS) %>%
          tidyr::spread(data = ., key = POP_ID, value = mho) %>%
          tidyr::gather(key = POP_ID, value = mho, -MARKERS) %>%
          mutate(mho = as.numeric(stri_replace_na(str = mho, replacement = 0))) %>%
          full_join(freq.al.locus.pop, by = c("POP_ID", "MARKERS")) %>%
          mutate(
            mhom = round(((2 * nal * P - mho)/2), 0),
            dum = nal * (P - 2 * P^2) + mhom
          ) %>%
          group_by(MARKERS, ALLELES) %>%
          full_join(freq.al.locus.global, by = c("MARKERS", "ALLELES")) %>%
          mutate(
            SSi = sum(dum, na.rm = TRUE),
            dum1 = nal * (P - pb)^2
          ) %>%
          group_by(MARKERS, ALLELES) %>%
          mutate(SSP = 2 * sum(dum1, na.rm = TRUE)) %>%
          group_by(MARKERS, POP_ID) %>%
          mutate(SSG = nal * P - mhom) %>%
          group_by(MARKERS, ALLELES) %>%
          full_join(ncal, by = c("MARKERS", "ALLELES")) %>%
          full_join(n.pop.locus, by = "MARKERS") %>%
          rename(ntalb = npl) %>%
          mutate(
            sigw = round(sum(SSG, na.rm = TRUE), 2)/ntal,
            MSP = SSP/(ntalb - 1),
            MSI = SSi/(ntal - ntalb),
            sigb = 0.5 * (MSI - sigw),
            siga = 1/2/ncal * (MSP - MSI)
          ) %>%
          group_by(MARKERS) %>%
          summarise(
            lsiga = sum(siga, na.rm = TRUE),
            lsigb = sum(sigb, na.rm = TRUE),
            lsigw = sum(sigw, na.rm = TRUE),
            FST = round(lsiga/(lsiga + lsigb + lsigw), 6),
            FIS = round(lsigb/(lsigb + lsigw), 6)
          ) %>%
          arrange(desc(FST)) %>%
          select(MARKERS, FST) %>%
          mutate(RANKING = seq(from = 1, to = n()))
      )
      
      # select(MARKERS, FIS, FST)
      # remove unused objects
      data <- NULL
      data.genotyped <- NULL
      data.genotyped.het <- NULL
      n.pop.locus <- NULL
      ind.count.locus <- NULL
      ind.count.locus.pop <- NULL
      freq.al.locus <- NULL
      freq.al.locus.pop <- NULL
      freq.al.locus.global <- NULL
      mean.n.pop.corrected.per.locus <- NULL
      ncal <- NULL
      
      return(fst.ranked)
    } # End fst_WC84 function
    
    # Write the files
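    # gsi_sim input layout written below: line 1 = number of individuals and
    # number of loci; then one locus name per line; then, for each population,
    # a "pop <name>" line followed by one row per individual with its
    # genotypes (two allele columns per locus).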
    write_gsi <- function (data, markers.names, filename, i, m, ...) {
      
      data$POP_ID <- droplevels(x = data$POP_ID)
      n.individuals <- n_distinct(data$INDIVIDUALS)  # number of individuals
      pop <- data$POP_ID  # Create a vector with the population ordered by levels
      data <- suppressWarnings(data %>% select(-POP_ID))  # remove pop id
      gsi_sim.split <- split(data, pop)  # split gsi_sim by populations
      filename <- filename  # gsi_sim filename
      
      # directory <- getwd() # test
      # Line 1: number of individuals and the number of markers
      line1_gsi_sim <- as.data.frame(stri_join(n.individuals, m, sep = " "))
      write.table(line1_gsi_sim, file = paste0(directory.subsample, filename), col.names = FALSE, row.names = FALSE, quote = FALSE)
      
      # Markers names
      loci.table <- as.data.frame(markers.names)
      write_delim(x = loci.table, path = paste0(directory.subsample, filename), delim = "\n", append = TRUE, col_names = FALSE)
      
      # remaining lines, individuals and genotypes
      for (k in levels(pop)) {
        pop.line <- as.data.frame(stri_join("pop", k, sep = " "))
        write_delim(x = pop.line, path = paste0(directory.subsample, filename), delim = "\n", append = TRUE, col_names = FALSE)
        write_delim(x = gsi_sim.split[[k]], path = paste0(directory.subsample, filename), delim = " ", append = TRUE, col_names = FALSE)
      }
      return(filename)
    } # End write_gsi function
    
    # Assignment with gsi_sim
    if (assignment.analysis == "gsi_sim") {
      assignment_analysis <- function(data, select.markers, markers.names, missing.data, i, m, holdout, filename, ...) {
        # data <- gsim.prep #test
        # data <- genind.prep #test
        # missing.data <- "no.imputation" #test
        data.select <- suppressWarnings(
          data %>%
            semi_join(select.markers, by = "MARKERS") %>%
            arrange(MARKERS) %>%  # make tidy
            tidyr::unite(col = MARKERS_ALLELES, MARKERS , ALLELES, sep = "_") %>%
            arrange(POP_ID, INDIVIDUALS, MARKERS_ALLELES) %>%
            tidyr::spread(data = ., key = MARKERS_ALLELES, value = GT) %>%
            arrange(POP_ID, INDIVIDUALS)
        )
        
        # Write gsi_sim input file to directory
        input.gsi <- write_gsi(data = data.select, markers.names = markers.names, filename = filename, i = i, m = m)
        
        # Run gsi_sim ------------------------------------------------------------
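        # The system() call below is equivalent to running, from directory.subsample:
        # gsi_sim -b <input file> --self-assign > <output file>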
        input.gsi <- stri_join(directory.subsample, input.gsi)
        output.gsi <- stri_replace_all_fixed(input.gsi, pattern = "txt", replacement = "output.txt")
        setwd(directory.subsample)
        system(paste(gsi_sim_binary(), "-b", input.gsi, "--self-assign > ", output.gsi))
        
        # Option remove the input file from directory to save space
        if (keep.gsi.files == FALSE) file.remove(input.gsi)
        
        # Get Assignment results -------------------------------------------------
        # Keep track of the holdout individual
        if (sampling.method == "ranked") {
          if (thl == "all") {
            holdout.id <- NULL
          } else {
            holdout.id <- holdout$INDIVIDUALS
          }
        }
        
        # Number of markers
        n.locus <- m
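        # gsi_sim self-assignment results are read from the lines tagged
        # SELF_ASSIGN_A_LA_GC_CSV: semicolon-separated fields holding the
        # individual, the inferred population and its score, then the second
        # best population and score (layout inferred from the separators used below).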
        
        assignment <- suppressWarnings(
          read_delim(output.gsi, col_names = "ID", delim = "\t") %>%
            tidyr::separate(ID, c("KEEPER", "ASSIGN"), sep = ":/", extra = "warn") %>%
            filter(KEEPER == "SELF_ASSIGN_A_LA_GC_CSV") %>%
            tidyr::separate(ASSIGN, c("INDIVIDUALS", "ASSIGN"), sep = ";", extra = "merge") %>%
            tidyr::separate(ASSIGN, c("INFERRED", "OTHERS"), sep = ";", convert = TRUE, numerals = "no.loss", extra = "merge") %>%
            tidyr::separate(OTHERS, c("SCORE", "OTHERS"), sep = ";;", convert = TRUE, numerals = "no.loss", extra = "merge") %>%
            tidyr::separate(OTHERS, c("SECOND_BEST_POP", "OTHERS"), sep = ";", convert = TRUE, numerals = "no.loss", extra = "merge") %>%
            tidyr::separate(OTHERS, c("SECOND_BEST_SCORE", "OTHERS"), sep = ";;", convert = TRUE, numerals = "no.loss")
        )
        
        if (!is.null(pop.id.start)){
          assignment <- suppressWarnings(
            assignment %>%
              mutate(
                CURRENT = factor(stri_sub(INDIVIDUALS, pop.id.start, pop.id.end), levels = pop.levels, labels = pop.labels, ordered = TRUE),
                CURRENT = droplevels(CURRENT),
                INFERRED = factor(INFERRED, levels = unique(pop.labels), ordered = TRUE),
                INFERRED = droplevels(INFERRED),
                SECOND_BEST_POP = factor(SECOND_BEST_POP, levels = unique(pop.labels), ordered = TRUE),
                SECOND_BEST_POP = droplevels(SECOND_BEST_POP),
                SCORE = round(SCORE, 2),
                SECOND_BEST_SCORE = round(SECOND_BEST_SCORE, 2),
                MARKER_NUMBER = as.numeric(rep(n.locus, n())),
                MISSING_DATA = rep(missing.data, n())
              ) %>%
              select(INDIVIDUALS, CURRENT, INFERRED, SCORE, SECOND_BEST_POP, SECOND_BEST_SCORE, MARKER_NUMBER, MISSING_DATA) %>%
              arrange(CURRENT) %>%
              select(INDIVIDUALS, CURRENT, INFERRED, SCORE, MARKER_NUMBER, MISSING_DATA)
          )
        } else {
          assignment <- suppressWarnings(
            assignment %>%
              mutate(INDIVIDUALS = as.character(INDIVIDUALS)) %>% 
              left_join(strata.df, by = "INDIVIDUALS") %>%
              rename(CURRENT = POP_ID) %>% 
              mutate(
                CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered =TRUE),
                # CURRENT = factor(CURRENT, levels = pop.levels, labels = pop.labels, ordered = TRUE),
                CURRENT = droplevels(CURRENT),
                INFERRED = factor(INFERRED, levels = unique(pop.labels), ordered = TRUE),
                INFERRED = droplevels(INFERRED),
                SECOND_BEST_POP = factor(SECOND_BEST_POP, levels = unique(pop.labels), ordered = TRUE),
                SECOND_BEST_POP = droplevels(SECOND_BEST_POP),
                SCORE = round(SCORE, 2),
                SECOND_BEST_SCORE = round(SECOND_BEST_SCORE, 2),
                MARKER_NUMBER = as.numeric(rep(n.locus, n())),
                MISSING_DATA = rep(missing.data, n())
              ) %>%
              select(INDIVIDUALS, CURRENT, INFERRED, SCORE, SECOND_BEST_POP, SECOND_BEST_SCORE, MARKER_NUMBER, MISSING_DATA) %>%
              arrange(CURRENT) %>%
              select(INDIVIDUALS, CURRENT, INFERRED, SCORE, MARKER_NUMBER, MISSING_DATA)
          )
        }
        if (sampling.method == "ranked") {
          if (thl == "all") {
            assignment <- assignment
          } else {
            assignment <- filter(.data = assignment, INDIVIDUALS %in% holdout.id)
          }
        }
        
        # Option remove the output file from directory to save space
        if (keep.gsi.files == FALSE) file.remove(output.gsi)
        
        # saving preliminary results
        if (sampling.method == "random") {
          assignment <- mutate(.data = assignment, ITERATIONS = rep(i, n()))
        }
        
        if (sampling.method == "ranked") {
          if (thl != 1 & thl != "all") {
            assignment <- assignment %>%
              mutate(
                CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
                CURRENT = droplevels(CURRENT)
              ) %>% 
              group_by(CURRENT, MARKER_NUMBER, MISSING_DATA) %>%
              summarise(
                n = length(CURRENT[as.character(CURRENT) == as.character(INFERRED)]),
                TOTAL = length(CURRENT)
              ) %>%
              ungroup() %>% 
              mutate(
                ASSIGNMENT_PERC = round(n/TOTAL*100, 0),
                ITERATIONS = rep(i, n())
              ) %>% 
              select(-n, -TOTAL)
          }
        }
        return(assignment)
      } # End assignment_analysis function
    }
    
    # Assignment with adegenet
    if (assignment.analysis == "adegenet") {
      assignment_analysis_adegenet <- function(data, select.markers, markers.names, missing.data, i, m, holdout, ...) {
        # data <- genind.object #test
        # missing.data <- "no.imputation" #test
        data.select <- data[loc = select.markers$MARKERS]
        
        # Run adegenet *********************************************************
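        # DAPC assignment: with the random method a single DAPC is fitted on
        # all individuals (n.pca chosen with optim.a.score); with the ranked
        # method the DAPC is trained without the holdout individuals, which
        # are then assigned with predict.dapc().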
        pop.data <- data.select@pop
        pop.data <- droplevels(pop.data)
        
        
        if (sampling.method == "random") {
          # Alpha-Score DAPC
          # When all the individuals are accounted for in the model construction
          dapc.best.optim.a.score <- optim.a.score(dapc(data.select, n.da = length(levels(pop.data)), n.pca = round((length(indNames(data.select))/3)-1, 0)), pop = pop.data, plot = FALSE)$best
          message(stri_paste("a-score optimisation for iteration:", i, sep = " ")) # message not working in parallel...
          
          # DAPC with all the data
          dapc.all <- dapc(data.select, n.da = length(levels(pop.data)), n.pca = dapc.best.optim.a.score, pop = pop.data)
          message(stri_paste("DAPC iteration:", i, sep = " "))
          message(stri_paste("DAPC marker group:", m, sep = " "))
        }
        
        if (sampling.method == "ranked") {

          # Alpha-Score DAPC training data
          training.data <- data.select[!indNames(data.select) %in% holdout$INDIVIDUALS] # training dataset
          pop.training <- training.data@pop
          pop.training <- droplevels(pop.training)
          
          dapc.best.optim.a.score <- optim.a.score(dapc(training.data, n.da = length(levels(pop.training)), n.pca = round(((length(indNames(training.data))/3)-1), 0)), pop = pop.training, plot = FALSE)$best
          message(stri_paste("a-score optimisation for iteration:", i, sep = " "))
          
          dapc.training <- dapc(training.data, n.da = length(levels(pop.training)), n.pca = dapc.best.optim.a.score, pop = pop.training)
          message(stri_paste("DAPC of training data set for iteration:", i, sep = " "))
          
          # DAPC holdout individuals
          holdout.data <- data.select[indNames(data.select) %in% holdout$INDIVIDUALS] # holdout dataset
          pop.holdout <- holdout.data@pop
          pop.holdout <- droplevels(pop.holdout)
          assignment.levels <- levels(pop.holdout) # for figure
          rev.assignment.levels <- rev(assignment.levels)  # for figure 
        
          dapc.predict.holdout <- predict.dapc(dapc.training, newdata = holdout.data)
          message(stri_paste("Assigning holdout data for iteration:", i, sep = " "))
        }
        
        
        # Get Assignment results -------------------------------------------------
        # Keep track of the holdout individual
        if (sampling.method == "ranked") {
          if (thl == "all") {
            holdout.id <- NULL
          } else {
            holdout.id <- holdout$INDIVIDUALS
          }
        }
        
        # Number of markers
        n.locus <- m
        if (sampling.method == "random") {
          assignment <- data_frame(ASSIGNMENT_PERC = summary(dapc.all)$assign.per.pop*100) %>% 
            bind_cols(data_frame(POP_ID = levels(pop.data))) %>%
            mutate(ASSIGNMENT_PERC = round(ASSIGNMENT_PERC, 2)) %>% 
            select(POP_ID, ASSIGNMENT_PERC)
        }        
      
        if (sampling.method == "ranked") {
          assignment <- data.frame(INDIVIDUALS = indNames(holdout.data), POP_ID = pop.holdout, ASSIGN = dapc.predict.holdout$assign, dapc.predict.holdout$posterior) %>% 
            rename(CURRENT = POP_ID, INFERRED = ASSIGN) %>%
            mutate(
              CURRENT = factor(CURRENT, levels = rev.assignment.levels, ordered = TRUE),
              INFERRED = factor(INFERRED, levels = assignment.levels, ordered = TRUE)
            )
          
          # assignment <- data.frame(INDIVIDUALS = indNames(holdout.data), POP_ID = pop.holdout, ASSIGN = dapc.predict.holdout$assign, dapc.predict.holdout$posterior) %>% 
          #   select(CURRENT = POP_ID, INFERRED = ASSIGN) %>%
          #   mutate(
          #     CURRENT = factor(CURRENT, levels = rev.assignment.levels, ordered = TRUE),
          #     INFERRED = factor(INFERRED, levels = assignment.levels, ordered = TRUE)
          #   ) %>% 
          #   group_by(CURRENT, INFERRED) %>% 
          #   tally %>% 
          #   group_by(CURRENT) %>% 
          #   mutate(TOTAL = sum(n)) %>% 
          #   ungroup() %>% 
          #   mutate(ASSIGNMENT_PERC = round(n/TOTAL*100, 0)) %>% 
          #   filter(CURRENT == INFERRED)
        }
        
        assignment <- assignment %>% 
          mutate(
            ITERATIONS = rep(i, n()),
            MARKER_NUMBER = as.numeric(rep(n.locus, n())),
            MISSING_DATA = rep(missing.data, n())
          )
        
        # if (sampling.method == "ranked") {
        #   if (thl != 1 & thl != "all") {
        #     assignment <- assignment %>%
        #       mutate(
        #         CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
        #         CURRENT = droplevels(CURRENT)
        #       ) %>% 
        #       group_by(CURRENT, MARKER_NUMBER, MISSING_DATA) %>%
        #       summarise(
        #         n = length(CURRENT[as.character(CURRENT) == as.character(INFERRED)]),
        #         TOTAL = length(CURRENT)
        #       ) %>%
        #       ungroup() %>% 
        #       mutate(
        #         ASSIGNMENT_PERC = round(n/TOTAL*100, 0),
        #         ITERATIONS = rep(i, n())
        #       ) %>% 
        #       select(-n, -TOTAL)
        #   }
        # }
        return(assignment)
      } # End assignment_analysis_adegenet function
    } # End assignment_function
    
    # Random method ************************************************************
    if (sampling.method == "random") {
      message("Conducting Assignment analysis with markers selected randomly")
      # Number of times to repeat the sampling of markers
      iterations.list <- 1:iteration.method
      # iterations.list <- 1:200 # test
      
      # Plan A: use a list containing all the lists of marker combinations
      # Plan B: use nesting foreach loop (%:%)
      # Plan A is faster
      
      # Function: Random selection of marker function + iteration.method
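      # marker_selection() draws, for the current marker number m, a random
      # sample of m markers without replacement and tags it with the iteration
      # and marker number, so all combinations can be written to
      # "markers.random.tsv" and traced back later.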
      marker_selection <- function(iteration.method) {
        m <- as.numeric(m)
        select.markers <- sample_n(tbl = unique.markers, size = m, replace = FALSE) %>%
          arrange(MARKERS) %>%
          mutate(
            ITERATIONS = rep(iteration.method, n()),
            MARKER_NUMBER = rep(m, n())
          )
      }
      markers.random.lists <- list()
      
      message("Making a list containing all the markers combinations")
      # Go through the function with the marker number selected
      for (m in marker.number) {
        res <- purrr::map(.x = iterations.list, .f = marker_selection)
        markers.random.lists[[m]] <- res
      }
      markers.random.lists <- purrr::flatten(markers.random.lists)
      # test <- markers.random.selection.list[[101]]
      
      markers.random.lists.table <- as_data_frame(bind_rows(markers.random.lists))
      write_tsv(x = markers.random.lists.table, path = paste0(directory.subsample, "markers.random.tsv"), col_names = TRUE, append = FALSE)
      
      # Set seed for random sampling
      random.seed <- sample(x = 1:1000000, size = 1)
      # set.seed(random.seed)
      # parallel::clusterSetRNGStream(cl = cl, iseed = random.seed)
      random.seed <- data.frame(RANDOM_SEED_NUMBER = random.seed)
      write_tsv(x = random.seed, path = paste0(directory.subsample, "random_seed_assignment_ngs.tsv"), col_names = TRUE, append = FALSE)
      
      message("Starting parallel computations for the assignment analysis
First sign of progress may take some time
Progress can be monitored with activity in the folder...")
      mrl <- NULL
      holdout <- NULL
      assignment.random <- list()
      assignment_random <- function(markers.random.lists, ...) {
        mrl <- markers.random.lists
        # mrl <- markers.random.lists[1] # test
        mrl <- data.frame(mrl)                      # marker random list
        i <- as.numeric(unique(mrl$ITERATIONS))     # iteration
        m <- as.numeric(unique(mrl$MARKER_NUMBER))  # number of marker selected
        
        select.markers <- mrl %>%                   # markers
          ungroup() %>% 
          select(MARKERS) %>% 
          arrange(MARKERS)
        
        # get the list of loci after filter
        markers.names <- unique(select.markers$MARKERS)
        
        # Assignment analysis without imputations
        filename <- stri_replace_all_fixed(gsi_sim.filename,
                                           pattern = ".txt",
                                           replacement = stri_join(
                                             i, m, 
                                             "no_imputation.txt", sep = "_"
                                           )
        )
        if (assignment.analysis == "gsi_sim") {
          assignment.no.imp <- assignment_analysis(data = gsim.prep,
                                                   select.markers = select.markers,
                                                   markers.names = markers.names,
                                                   missing.data = "no.imputation", 
                                                   i = i, 
                                                   m = m,
                                                   holdout = NULL,
                                                   filename = filename
          )
        }
        if (assignment.analysis == "adegenet") {
          assignment.no.imp <- assignment_analysis_adegenet(data = genind.object,
                                                            select.markers = select.markers,
                                                            markers.names = markers.names,
                                                            missing.data = "no.imputation", 
                                                            i = i, 
                                                            m = m,
                                                            holdout = NULL
          )
        }
        
        # unused objects
        filename <- NULL
        
        # With imputations
        if (imputation.method != FALSE) {# with imputations
          if (imputation.method == "rf") {
            if (imputations.group == "populations") {
              missing.data <- "imputed RF populations"
            } else {
              missing.data <- "imputed RF global"
            }
          } else {
            if (imputations.group == "populations") {
              missing.data <- "imputed max populations"
            } else {
              missing.data <- "imputed max global"
            }
          }
          # Assignment analysis WITH imputations
          filename <- stri_replace_all_fixed(gsi_sim.filename,
                                             pattern = "txt",
                                             replacement = stri_join(
                                               i, m, 
                                               "imputed.txt", sep = "_"
                                             )
          )
          if (assignment.analysis == "gsi_sim") {
            assignment.imp <- assignment_analysis(data = gsi.prep.imp,
                                                  select.markers = select.markers,
                                                  markers.names = markers.names,
                                                  missing.data = missing.data, 
                                                  i = i,
                                                  m = m,
                                                  holdout = holdout,
                                                  filename = filename
            )
          }
          if (assignment.analysis == "adegenet") {
            assignment.imp <- assignment_analysis_adegenet(data = genind.object.imp,
                                                           select.markers = select.markers,
                                                           markers.names = markers.names,
                                                           missing.data = missing.data, 
                                                           i = i,
                                                           m = m,
                                                           holdout = holdout
            )
          }
          
          # unused objects
          select.markers <- NULL
          markers.names <- NULL
        } # End with imputations
        
        #compile assignment results each marker number for the iteration
        if (imputation.method == FALSE) {
          assignment <- assignment.no.imp
          gsi.prep.imp <- NULL
          input.imp <- NULL
        } else {
          assignment <- bind_rows(assignment.no.imp, assignment.imp)
        }
        assignment <- mutate(.data = assignment, ITERATIONS = rep(i, n()))
        return(assignment)
      } # End assignment_random function
      
      assignment.res <- NULL
      assignment.res <- parallel::mclapply(
        X = markers.random.lists, 
        FUN = assignment_random, 
        mc.preschedule = FALSE, 
        mc.silent = FALSE, 
        mc.cores = parallel.core
      )
      
      # Compiling the results
      message("Compiling results")
      
      assignment.res <- suppressWarnings(
        bind_rows(assignment.res) %>%
          mutate(METHOD = rep("LOO", n()))
      )
      
      if (assignment.analysis == "gsi_sim") {
        assignment.stats.pop <- suppressWarnings(
          assignment.res %>%
            group_by(CURRENT, INFERRED, MARKER_NUMBER, MISSING_DATA, ITERATIONS, METHOD) %>%
            tally %>%
            group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, ITERATIONS, METHOD) %>%
            mutate(TOTAL = sum(n)) %>%
            ungroup() %>%
            mutate(MEAN_i = round(n/TOTAL*100, 0)) %>%
            filter(as.character(CURRENT) == as.character(INFERRED)) %>%
            select(CURRENT, MEAN_i, MARKER_NUMBER, MISSING_DATA, ITERATIONS, METHOD) %>%
            mutate(
              CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
              CURRENT = droplevels(CURRENT)
            ) %>%
            group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, METHOD) %>%
            summarise(
              MEAN = round(mean(MEAN_i), 2),
              SE = round(sqrt(var(MEAN_i)/length(MEAN_i)), 2),
              MIN = round(min(MEAN_i), 2),
              MAX = round(max(MEAN_i), 2),
              MEDIAN = round(median(MEAN_i), 2),
              QUANTILE25 = round(quantile(MEAN_i, 0.25), 2),
              QUANTILE75 = round(quantile(MEAN_i, 0.75), 2)
            ) %>%
            arrange(CURRENT, MARKER_NUMBER)
        )
      }
      
      if (assignment.analysis == "adegenet") {
        assignment.stats.pop <- suppressWarnings(
          assignment.res %>%
            rename(CURRENT = POP_ID) %>% 
            group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, METHOD) %>%
            summarise(
              MEAN = round(mean(ASSIGNMENT_PERC), 2),
              SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
              MIN = round(min(ASSIGNMENT_PERC), 2),
              MAX = round(max(ASSIGNMENT_PERC), 2),
              MEDIAN = round(median(ASSIGNMENT_PERC), 2),
              QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
              QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2)
            ) %>%
            ungroup %>% 
            mutate(
              CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
              CURRENT = droplevels(CURRENT)
            ) %>%
            arrange(CURRENT, MARKER_NUMBER)
        )
      }
      
      # Next step is common for gsi_sim and adegenet
      pop.levels.assignment.stats.overall <- c(levels(assignment.stats.pop$CURRENT), "OVERALL")
      
      assignment.stats.overall <- assignment.stats.pop %>%
        group_by(MARKER_NUMBER, MISSING_DATA, METHOD) %>%
        rename(ASSIGNMENT_PERC = MEAN) %>%
        summarise(
          MEAN = round(mean(ASSIGNMENT_PERC), 2),
          SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
          MIN = round(min(ASSIGNMENT_PERC), 2),
          MAX = round(max(ASSIGNMENT_PERC), 2),
          MEDIAN = round(median(ASSIGNMENT_PERC), 2),
          QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
          QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2)
        ) %>%
        mutate(CURRENT = rep("OVERALL", n())) %>%
        arrange(CURRENT, MARKER_NUMBER)
      
      assignment.summary.stats <- suppressWarnings(
        bind_rows(assignment.stats.pop, assignment.stats.overall) %>%
          mutate(CURRENT = factor(CURRENT, levels = pop.levels.assignment.stats.overall, ordered = TRUE)) %>%
          arrange(CURRENT, MARKER_NUMBER) %>%
          mutate(
            SE_MIN = MEAN - SE,
            SE_MAX = MEAN + SE,
            ITERATIONS = rep(iteration.method, n())
          ) %>%
          select(CURRENT, MARKER_NUMBER, MEAN, MEDIAN, SE, MIN, MAX, QUANTILE25, QUANTILE75, SE_MIN, SE_MAX, METHOD, MISSING_DATA, ITERATIONS)
      )
      
      
      # Write the tables to directory
      # assignment results
      if (imputation.method == FALSE) {
        filename.assignment.res <- stri_join("assignment.res", "no.imputation", sampling.method, "tsv", sep = ".")
      } else { # with imputations
        filename.assignment.res <- stri_join("assignment.res", "imputed", sampling.method, "tsv", sep = ".")
      }
      write_tsv(x = assignment.res, path = paste0(directory.subsample,filename.assignment.res), col_names = TRUE, append = FALSE)
      
      # assignment summary stats
      if (imputation.method == FALSE) {
        filename.assignment.sum <- stri_join("assignment.summary.stats", "no.imputation", sampling.method, "tsv", sep = ".")
      } else { # with imputations
        filename.assignment.sum <- stri_join("assignment.summary.stats", "imputed", sampling.method, "tsv", sep = ".")
      }
      write_tsv(x = assignment.summary.stats, path = paste0(directory.subsample,filename.assignment.sum), col_names = TRUE, append = FALSE)
      
    } # End method random
    
    # Ranked method ************************************************************
    if (sampling.method == "ranked") {
      message("Conducting Assignment analysis with ranked markers")
      
      # if (data.type == "haplo.file") {
      #   input <- gsim.prep %>%
      #     mutate(GT = stri_pad_left(str = GT, width = 3, pad = "0")) %>% 
      #     group_by(POP_ID, INDIVIDUALS, MARKERS) %>% 
      #     tidyr::spread(data = ., key = ALLELES, value = GT) %>% 
      #     tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE)
      #   if (imputation.method != FALSE) {
      #     input.imp <- input.imp %>%
      #       tidyr::gather(key = MARKERS, GT, -c(INDIVIDUALS, POP_ID, ALLELES)) %>% 
      #       mutate(GT = stri_pad_left(str = GT, width = 3, pad = "0")) %>% 
      #       group_by(POP_ID, INDIVIDUALS, MARKERS) %>% 
      #       tidyr::spread(data = ., key = ALLELES, value = GT) %>% 
      #       tidyr::unite(col = GT, A1, A2, sep = "", remove = TRUE)
      #   }
      # }
      
      # List of all individuals
      ind.pop.df <- input %>% 
        ungroup %>% 
        select(POP_ID, INDIVIDUALS) %>% 
        distinct(POP_ID, INDIVIDUALS)
      
      # thl selection
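      # Three ways to define the training/holdout partition:
      #   thl = 1     : leave one individual out at a time (one iteration per individual)
      #   thl = "all" : no holdout, all individuals are used to rank the markers
      #   thl < 1 or integer > 1 : random holdout of a proportion (sample_frac) or a
      #                 fixed number (sample_n) of individuals per population,
      #                 repeated iteration.method times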
      message("Using thl method, ranking Fst with training samples...")
      if (thl == 1) {
        # Will go through the individuals in the list one by one.
        iterations.list <- ind.pop.df$INDIVIDUALS
        # Keep track of holdout individuals
        holdout.individuals <- ind.pop.df %>%
          mutate(ITERATIONS = stri_join("HOLDOUT", seq_len(n()), sep = "_"))
      } else if (thl == "all") { # no holdout for that one
        iterations.list <- iteration.method
        holdout.individuals <- NULL
        message("Warning: using all the individuals for ranking markers based on Fst\nNo holdout samples")
        message("Recommended reading: \nAnderson, E. C. (2010) Assessing the power of informative subsets of
                loci for population assignment: standard methods are upwardly biased.\nMolecular ecology resources 10, 4:701-710.")
      } else {
        # Create x (iterations) list of y (thl) proportion of individuals per pop.
        if (stri_detect_fixed(thl, ".") & thl < 1) {
          # iteration.method <- 5 # test
          # thl <- 0.4 # test
          holdout.individuals.list <- list()
          iterations.list <- 1:iteration.method
          for (x in 1:iteration.method) {
            holdout.individuals <- ind.pop.df %>%
              group_by(POP_ID) %>%
              sample_frac(thl, replace = FALSE) %>%  # sampling fraction for each pop
              arrange(POP_ID, INDIVIDUALS) %>%
              ungroup() %>%
              select(INDIVIDUALS) %>%
              mutate(ITERATIONS = rep(x, n()))
            holdout.individuals.list[[x]] <- holdout.individuals
          }
          holdout.individuals <- as.data.frame(bind_rows(holdout.individuals.list))
        }
        
        # Create x (iterations) list of y (thl) individuals per pop.
        if (thl > 1) {
          holdout.individuals.list <- list()
          iterations.list <- 1:iteration.method
          for (x in 1:iteration.method) {
            holdout.individuals <- ind.pop.df %>%
              group_by(POP_ID) %>%
              sample_n(thl, replace = FALSE) %>% # sampling individuals for each pop
              arrange(POP_ID, INDIVIDUALS) %>%
              ungroup() %>%
              select(INDIVIDUALS) %>%
              mutate(ITERATIONS = rep(x, n()))
            holdout.individuals.list[[x]] <- holdout.individuals
          }
          holdout.individuals <- as.data.frame(bind_rows(holdout.individuals.list))
        }
      } # End tracking holdout individuals
      # holdout.individuals is NULL when thl == "all" (no holdout samples to write)
      if (!is.null(holdout.individuals)) {
        write_tsv(x = holdout.individuals, 
                  path = paste0(directory.subsample, "holdout.individuals.tsv"), 
                  col_names = TRUE, 
                  append = FALSE
        )
        message("Holdout samples saved in your folder")
      }
      
      # Going through the loop of holdout individuals
      message("Starting parallel computations for the assignment analysis
First sign of progress may take some time
Progress can be monitored with activity in the folder...")
      
      assignment_ranking <- function(iterations.list, ...) {
        # i <- "TRI_09" #test
        # i <- "CAR_01" #test
        # i <- 1
        # i <- 5
        i <- iterations.list
        
        # Ranking Fst with training dataset (keep holdout individuals out)
        message("Ranking markers based on Fst")
        if (thl == "all") {
          holdout <- NULL
          fst.ranked <- fst_WC84(data = input, holdout.samples = NULL)
          if (imputation.method != FALSE) {
            fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = NULL)
          }
        } else if (thl == 1) {
          holdout <- data.frame(INDIVIDUALS = i)
          fst.ranked <- fst_WC84(data = input, holdout.samples = holdout$INDIVIDUALS)
          if (imputation.method != FALSE) {
            fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = holdout$INDIVIDUALS)
          }
        } else {
          holdout <- data.frame(holdout.individuals.list[i])
          fst.ranked <- fst_WC84(data = input, holdout.samples = holdout$INDIVIDUALS)
          if (imputation.method != FALSE) {
            fst.ranked.imp <- fst_WC84(data = input.imp, holdout.samples = holdout$INDIVIDUALS)
          }
        }
        
        fst.ranked.filename <- stri_join("fst.ranked_", i, ".tsv", sep = "") # No imputation
        write_tsv(
          x = fst.ranked, 
          path = paste0(directory.subsample, fst.ranked.filename), 
          col_names = TRUE, 
          append = FALSE
        )
        
        if (imputation.method != FALSE) {  # With imputations
          fst.ranked.filename.imp <- stri_join("fst.ranked_", i,
                                               "_imputed",
                                               ".tsv", 
                                               sep = ""
          ) 
          write_tsv(x = fst.ranked.imp, 
                    path = paste0(directory.subsample, fst.ranked.filename.imp), 
                    col_names = TRUE, 
                    append = FALSE
          )
        }
        
        # Markers numbers loop function
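        # For each requested marker number m, select the markers with RANKING <= m
        # from the Fst table and run the assignment analysis (gsi_sim or adegenet),
        # with and without imputations.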
        message("Going throught the marker.number")
        assignment.marker <- list() # Create empty lists to feed the results
        assignment_marker_loop <- function(m, ...) {
          message("Marker number: ", m)
          # m <- 200 # test
          # m <- 400 # test
          m <- as.numeric(m)
          RANKING <- NULL
          select.markers <- filter(.data = fst.ranked, RANKING <= m) %>%
            select(MARKERS)
          
          # get the list of markers after filter
          markers.names <- unique(select.markers$MARKERS)
          
          # Assignment analysis without imputations
          filename <- stri_replace_all_fixed(gsi_sim.filename,
                                             pattern = "txt",
                                             replacement = stri_join(
                                               i, m, 
                                               "no.imputation", "txt", sep = "."
                                             )
          )
          if (assignment.analysis == "gsi_sim") {
            assignment.no.imp <- assignment_analysis(data = gsim.prep,
                                                     select.markers = select.markers,
                                                     markers.names = markers.names,
                                                     missing.data = "no.imputation", 
                                                     i = i, 
                                                     m = m,
                                                     holdout = holdout,
                                                     filename = filename
            )
          }
          
          if (assignment.analysis == "adegenet") {
            assignment.no.imp <- assignment_analysis_adegenet(data = genind.object,
                                                              select.markers = select.markers,
                                                              markers.names = markers.names,
                                                              missing.data = "no.imputation", 
                                                              i = i, 
                                                              m = m,
                                                              holdout = holdout
            )
          }
          # unused objects
          select.markers <- NULL
          markers.names <- NULL
          RANKING <- NULL
          filename <- NULL
          
          # With imputations
          if (imputation.method != FALSE) {  # with imputations
            
            select.markers <- filter(.data = fst.ranked.imp, RANKING <= m) %>%
              select(MARKERS)
            
            # get the list of markers after filter
            markers.names <- unique(select.markers$MARKERS)  # may differ from the non-imputed selection
            
            if (imputation.method == "rf") {
              if (imputations.group == "populations") {
                missing.data <- "imputed RF populations"
              } else {
                missing.data <- "imputed RF global"
              }
            } else {
              if (imputations.group == "populations") {
                missing.data <- "imputed max populations"
              } else {
                missing.data <- "imputed max global"
              }
            }
            
            # Assignment analysis WITH imputations
            filename <- stri_replace_all_fixed(gsi_sim.filename,
                                               pattern = "txt",
                                               replacement = stri_join(
                                                 i, m, 
                                                 "imputed", "txt", sep = "."
                                               )
            )
            if (assignment.analysis == "gsi_sim") {
              assignment.imp <- assignment_analysis(data = gsi.prep.imp,
                                                    select.markers = select.markers,
                                                    markers.names = markers.names,
                                                    missing.data = missing.data, 
                                                    i = i,
                                                    m = m,
                                                    holdout = holdout,
                                                    filename = filename
              )
            }
            if (assignment.analysis == "adegenet") {
              assignment.imp <- assignment_analysis_adegenet(data = gsi.prep.imp,
                                                             select.markers = select.markers,
                                                             markers.names = markers.names,
                                                             missing.data = missing.data, 
                                                             i = i,
                                                             m = m,
                                                             holdout = holdout
              )
            }
            
            # unused objects
            select.markers <- NULL
            markers.names <- NULL
            RANKING <- NULL
          } # End with imputations
          
          # Compile assignment results for each marker number of this iteration
          if (imputation.method == FALSE) { # no imputations
            assignment <- assignment.no.imp
            fst.ranked.imp <- NULL
            gsi.prep.imp <- NULL
            input.imp <- NULL
          } else {
            assignment <- bind_rows(assignment.no.imp, assignment.imp)
          }
          m <- as.character(m)
          assignment.marker[[m]] <- assignment
          return(assignment.marker)
        }  # End marker number loop for both with and without imputations
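        # Loop over marker.number sequentially within this fork; the extra
        # arguments are passed through to assignment_marker_loop via `...`.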
        
        assignment.marker <- map(
          .x = marker.number, 
          .f = assignment_marker_loop,
          fst.ranked = fst.ranked,
          fst.ranked.imp = fst.ranked.imp,
          i = i,
          holdout = holdout,
          input = input,
          gsim.prep = gsim.prep,
          input.imp = input.imp,
          # vcf = vcf.imp, # was an error before, double check...
          gsi.prep.imp = gsi.prep.imp,
          pop.levels = pop.levels,
          pop.labels = pop.labels,
          pop.id.start =  pop.id.start,
          pop.id.end = pop.id.end,
          sampling.method = sampling.method,
          thl = thl,
          iteration.method = iteration.method,
          gsi_sim.filename = gsi_sim.filename,
          keep.gsi.files = keep.gsi.files,
          imputation.method = imputation.method,
          parallel.core = parallel.core
        )
        
        message("Summarizing the assignment analysis results by iterations and marker group")
        assignment.res.summary <- suppressWarnings(
          bind_rows(purrr::flatten(assignment.marker)) %>%
            mutate(METHOD = rep(stri_join("thl_", thl) , n()))
        )
        
        res.filename <- stri_join("assignment_", i, ".tsv", sep = "") # results for this iteration
        write_tsv(x = assignment.res.summary, path = paste0(directory.subsample, res.filename), 
                  col_names = TRUE, 
                  append = FALSE
        )
        return(assignment.res.summary)
      }  # End assignment ranking function
      
      # using mclapply
      assignment.res <- list()
      assignment.res <- parallel::mclapply(
        X = iterations.list, 
        FUN = assignment_ranking, 
        mc.preschedule = FALSE, 
        mc.silent = FALSE, 
        mc.cores = parallel.core,
        marker.number = marker.number
      )
      
      # Compiling the results
      message("Compiling results")
      assignment.res.summary <- suppressWarnings(
        bind_rows(assignment.res) %>%
          mutate(METHOD = rep(stri_join("thl_", thl) , n()))
      )
      
      if (thl == 1 | thl == "all") {
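        # With thl = 1 or "all", assignment success is computed directly as the
        # proportion of correctly assigned individuals per population and marker
        # number (no per-iteration mean/SE at the population level).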
        assignment.stats.pop <- assignment.res.summary %>%
          mutate(
            CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
            CURRENT = droplevels(CURRENT)
          ) %>% 
          group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, METHOD) %>%
          summarise(
            n = length(CURRENT[as.character(CURRENT) == as.character(INFERRED)]),
            TOTAL = length(CURRENT)
          ) %>%
          ungroup() %>% 
          mutate(MEAN = round(n/TOTAL*100, 0)) %>% 
          select(-n, -TOTAL)
        
        pop.levels.assignment.stats.overall <- c(levels(assignment.stats.pop$CURRENT), "OVERALL")
        
        assignment.stats.overall <- assignment.stats.pop %>% 
          group_by(MARKER_NUMBER, MISSING_DATA, METHOD) %>%
          rename(ASSIGNMENT_PERC = MEAN) %>%
          summarise(
            MEAN = round(mean(ASSIGNMENT_PERC), 2),
            SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
            MIN = round(min(ASSIGNMENT_PERC), 2),
            MAX = round(max(ASSIGNMENT_PERC), 2),
            MEDIAN = round(median(ASSIGNMENT_PERC), 2),
            QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
            QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2)
          ) %>% 
          mutate(CURRENT = rep("OVERALL", n())) %>% 
          arrange(CURRENT, MARKER_NUMBER)
        
        assignment.summary.stats <- suppressWarnings(
          bind_rows(assignment.stats.pop, assignment.stats.overall) %>%
            mutate(CURRENT = factor(CURRENT, levels = pop.levels.assignment.stats.overall, ordered = TRUE)) %>%
            arrange(CURRENT, MARKER_NUMBER) %>%
            mutate(
              SE_MIN = MEAN - SE,
              SE_MAX = MEAN + SE
            )
        )
        
      } else {
        # thl != 1 or "all"
        # summary stats
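        # adegenet returns one inferred population per holdout individual:
        # convert those calls to per-population assignment percentages before
        # summarising across iterations.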
        if (assignment.analysis == "adegenet") {
          assignment.res.summary <- assignment.res.summary %>% 
            group_by(CURRENT, INFERRED, ITERATIONS, MARKER_NUMBER, MISSING_DATA, METHOD) %>% 
            tally %>% 
            group_by(CURRENT, ITERATIONS, MARKER_NUMBER, MISSING_DATA, METHOD) %>% # totals per iteration and marker number
            mutate(TOTAL = sum(n)) %>% 
            ungroup() %>% 
            mutate(ASSIGNMENT_PERC = round(n/TOTAL*100, 0)) %>% 
            filter(CURRENT == INFERRED) %>% 
            select(-n, -TOTAL)
        }

        assignment.stats.pop <- assignment.res.summary %>%
          mutate(
            CURRENT = factor(CURRENT, levels = unique(pop.labels), ordered = TRUE),
            CURRENT = droplevels(CURRENT)
          ) %>%
          group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, METHOD) %>%
          summarise(
            MEAN = round(mean(ASSIGNMENT_PERC), 2),
            SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
            MIN = round(min(ASSIGNMENT_PERC), 2),
            MAX = round(max(ASSIGNMENT_PERC), 2),
            MEDIAN = round(median(ASSIGNMENT_PERC), 2),
            QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
            QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2),
            SE_MIN = MEAN - SE,
            SE_MAX = MEAN + SE
          ) %>%
          arrange(CURRENT, MARKER_NUMBER)
        
        pop.levels.assignment.stats.overall <- c(levels(assignment.stats.pop$CURRENT), "OVERALL")
        
        assignment.stats.overall <- assignment.stats.pop %>%
          group_by(MARKER_NUMBER, MISSING_DATA, METHOD) %>%
          rename(ASSIGNMENT_PERC = MEAN) %>%
          summarise(
            MEAN = round(mean(ASSIGNMENT_PERC), 2),
            SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
            MIN = round(min(ASSIGNMENT_PERC), 2),
            MAX = round(max(ASSIGNMENT_PERC), 2),
            MEDIAN = round(median(ASSIGNMENT_PERC), 2),
            QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
            QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2),
            SE_MIN = MEAN - SE,
            SE_MAX = MEAN + SE
          ) %>%
          mutate(CURRENT = rep("OVERALL", n())) %>%
          arrange(CURRENT, MARKER_NUMBER)
        
        assignment.summary.stats <- suppressWarnings(
          bind_rows(assignment.stats.pop, assignment.stats.overall) %>%
            mutate(CURRENT = factor(CURRENT, levels = pop.levels.assignment.stats.overall, ordered = TRUE)) %>%
            arrange(CURRENT, MARKER_NUMBER)
        )
      } # End thl != 1
      
      # Write the tables to directory
      # assignment results
      if (imputation.method == FALSE) {
        filename.assignment.res <- stri_join("assignment.res", "no.imputation", sampling.method, "tsv", sep = ".")
      } else { # with imputations
        filename.assignment.res <- stri_join("assignment.res", "imputed", sampling.method, "tsv", sep = ".")
      }
      write_tsv(x = assignment.res.summary, path = paste0(directory.subsample,filename.assignment.res), col_names = TRUE, append = FALSE)
      
      # assignment summary stats
      if (imputation.method == FALSE) {
        filename.assignment.sum <- stri_join("assignment.summary.stats", "no.imputation", sampling.method, "tsv", sep = ".")
      } else { # with imputations
        filename.assignment.sum <- stri_join("assignment.summary.stats", "imputed", sampling.method, "tsv", sep = ".")
      }
      write_tsv(x = assignment.summary.stats, path = paste0(directory.subsample,filename.assignment.sum), col_names = TRUE, append = FALSE)
    } # End of ranked thl method
    
    # update the assignment with subsampling iterations id
    assignment.summary.stats <- assignment.summary.stats %>% 
      mutate(SUBSAMPLE = rep(subsample.id, n()))
    return(assignment.summary.stats)
  } # End assignment_function
  
  res <- map(.x = subsample.list, .f = assignment_function,
             assignment.analysis = assignment.analysis,
             input = input,
             snp.ld = snp.ld,
             common.markers = common.markers,
             maf.thresholds = maf.thresholds,
             maf.pop.num.threshold = maf.pop.num.threshold,
             maf.approach = maf.approach,
             maf.operator = maf.operator,
             marker.number = marker.number,
             pop.levels = pop.levels,
             pop.labels = pop.labels,
             pop.id.start =  pop.id.start,
             pop.id.end = pop.id.end,
             sampling.method = sampling.method,
             thl = thl,
             iteration.method = iteration.method,
             gsi_sim.filename = gsi_sim.filename,
             keep.gsi.files = keep.gsi.files,
             imputation.method = imputation.method,
             impute = impute,
             imputations.group = imputations.group,
             num.tree = num.tree,
             iteration.rf = iteration.rf,
             split.number = split.number,
             verbose = verbose,
             parallel.core = parallel.core
  )
  res <- bind_rows(res)
  write_tsv(x = res, path = paste0(directory, "assignment.results.tsv"), col_names = TRUE, append = FALSE)
  
  # Summary of the subsampling iterations
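  # When more than one subsample was drawn, average the per-population results
  # across subsamples and append "OVERALL" rows (across subsamples and populations).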
  if (iteration.subsample > 1) {
    res.pop <- res %>%
      filter(CURRENT != "OVERALL") %>%
      group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, METHOD) %>%
      rename(ASSIGNMENT_PERC = MEAN) %>%
      summarise(
        MEAN = round(mean(ASSIGNMENT_PERC), 2),
        SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
        MIN = round(min(ASSIGNMENT_PERC), 2),
        MAX = round(max(ASSIGNMENT_PERC), 2),
        MEDIAN = round(median(ASSIGNMENT_PERC), 2),
        QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
        QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2)
      ) %>% 
      mutate(SUBSAMPLE = rep("OVERALL", n())) %>%
      arrange(CURRENT, MARKER_NUMBER)
    
    res.overall <- res.pop %>% 
      group_by(MARKER_NUMBER, MISSING_DATA, METHOD) %>%
      rename(ASSIGNMENT_PERC = MEAN) %>%
      summarise(
        MEAN = round(mean(ASSIGNMENT_PERC), 2),
        SE = round(sqrt(var(ASSIGNMENT_PERC)/length(ASSIGNMENT_PERC)), 2),
        MIN = round(min(ASSIGNMENT_PERC), 2),
        MAX = round(max(ASSIGNMENT_PERC), 2),
        MEDIAN = round(median(ASSIGNMENT_PERC), 2),
        QUANTILE25 = round(quantile(ASSIGNMENT_PERC, 0.25), 2),
        QUANTILE75 = round(quantile(ASSIGNMENT_PERC, 0.75), 2)
      ) %>%
      mutate(
        CURRENT = rep("OVERALL", n()),
        SUBSAMPLE = rep("OVERALL", n())
      ) %>% 
      arrange(CURRENT, MARKER_NUMBER)
    
    # Include "OVERALL" in the factor levels so the overall rows are kept
    pop.levels.res.overall <- c(unique(as.character(res.pop$CURRENT)), "OVERALL")
    
    res.pop.overall <- suppressWarnings(
      bind_rows(res.pop, res.overall) %>%
        mutate(CURRENT = factor(CURRENT, levels = pop.levels.res.overall, ordered = TRUE)) %>%
        arrange(CURRENT, MARKER_NUMBER) %>%
        mutate(
          SE_MIN = MEAN - SE,
          SE_MAX = MEAN + SE
        ) %>%
        select(CURRENT, MARKER_NUMBER, MEAN, MEDIAN, SE, MIN, MAX, QUANTILE25, QUANTILE75, SE_MIN, SE_MAX, METHOD, MISSING_DATA, SUBSAMPLE)
    )
    
    res <- bind_rows(
      res %>% 
        mutate(SUBSAMPLE = as.character(SUBSAMPLE)), 
      res.pop.overall) %>% 
      mutate(SUBSAMPLE = factor(SUBSAMPLE, levels = c(1:iteration.subsample, "OVERALL"), ordered = TRUE)) %>% 
      arrange(CURRENT, MARKER_NUMBER, SUBSAMPLE)
    
    # unused objects
    res.pop.overall <- NULL
    res.overall <- NULL
    res.pop <- NULL
  } # End summary of the subsampling iterations
  
  # Assignment plot
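  # Mean assignment success (error bars = mean +/- SE) against the number of markers;
  # with imputations, points are coloured by the MISSING_DATA treatment.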
  if (imputation.method == FALSE) { # no imputation
    plot.assignment <- ggplot(res, aes(x = factor(MARKER_NUMBER), y = MEAN))+
      geom_point(size = 2, alpha = 0.5) +
      geom_errorbar(aes(ymin = SE_MIN, ymax = SE_MAX), width = 0.3) +
      scale_y_continuous(breaks = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100))+
      labs(x = "Marker number")+
      labs(y = "Assignment success (%)")+
      theme_bw()+
      theme(
        legend.position = "bottom",      
        panel.grid.minor.x = element_blank(), 
        panel.grid.major.y = element_line(colour = "grey60", linetype = "dashed"), 
        axis.title.x = element_text(size = 10, family = "Helvetica", face = "bold"), 
        axis.text.x = element_text(size = 8, family = "Helvetica", face = "bold", angle = 90, hjust = 1, vjust = 0.5), 
        axis.title.y = element_text(size = 10, family = "Helvetica", face = "bold"), 
        axis.text.y = element_text(size = 10, family = "Helvetica", face = "bold")
      )
  } else { #with imputations
    
    plot.assignment <- ggplot(res, aes(x = factor(MARKER_NUMBER), y = MEAN))+
      geom_point(aes(colour = MISSING_DATA), size = 2, alpha = 0.8) +
      geom_errorbar(aes(ymin = SE_MIN, ymax = SE_MAX), width = 0.3) +
      scale_colour_manual(name = "Missing data", values = c("gray33", "dodgerblue"))+
      scale_y_continuous(breaks = c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100))+
      labs(x = "Marker number")+
      labs(y = "Assignment success (%)")+
      theme_bw()+
      theme(
        legend.position = "bottom",      
        panel.grid.minor.x = element_blank(), 
        panel.grid.major.y = element_line(colour = "grey60", linetype = "dashed"), 
        axis.title.x = element_text(size = 10, family = "Helvetica", face = "bold"), 
        axis.text.x = element_text(size = 8, family = "Helvetica", face = "bold", angle = 90, hjust = 1, vjust = 0.5), 
        axis.title.y = element_text(size = 10, family = "Helvetica", face = "bold"), 
        axis.text.y = element_text(size = 10, family = "Helvetica", face = "bold")
      )
  } # end plot
  
  # results
  res.list <- list(assignment = res, plot.assignment = plot.assignment)
  return(res.list)
} # End assignment_ngs
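
# Example call (a minimal, untested sketch; the file name and parameter values
# below are illustrative only, and the remaining arguments are assumed to keep
# their defaults):
# res.example <- assignment_ngs(
#   data = "my.data.vcf",             # input file (illustrative name)
#   assignment.analysis = "gsi_sim",
#   sampling.method = "ranked",       # or "random"
#   thl = 0.3,                        # 30% of individuals per pop held out
#   iteration.method = 10,
#   marker.number = c(100, 200, 500),
#   imputation.method = FALSE,        # no imputation of missing genotypes
#   parallel.core = 4
# )
# res.example$assignment       # summary table (also written to assignment.results.tsv)
# res.example$plot.assignment  # ggplot of assignment success vs marker number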