R/genepop.utilities.R

Defines functions m.gen.dist multi.gen.dist gen.dist w.genepop.batch multi.clean.genepop clean.genepop

Documented in clean.genepop gen.dist m.gen.dist multi.clean.genepop multi.gen.dist w.genepop.batch

#' Remove illegal characters from HexSim generated genepop input file
#' 
#' HexSim includes a 'structure block' at the end of its genepop input file
#'   that are not recognised by genepop. This block is removed by \code{clean.genepop}
#'   Also, while the animal IDs between brackets should not create a problem in
#'   genepop, these may be an 'unexpected column' for other applications that 
#'   reads genepop input files (e.g. the R package diveRsity). Similarly, the 
#'   space between the word 'Trait' and the trait number, and the equal and column
#'   symbols in the first line of the file can cause unexpected 
#'   behaviours in some applications. All these unusual features are removed from
#'   the HexSim generated files, which are then re-saved with the same name,  
#'   a suffix "cleaned" and an extensin ".gen".
#'    
#' @param fname A character vector with the name of the input file including the 
#'   path
#' @param title A character vector to be used to replace the first line in the file
#' @return Save to disk a cleaned up genepop input file and a character vector
#' @export      
  clean.genepop  <-  function(fname, title=NULL) {
    if(is.null(title)) title <- "Title missing"
    rl <- readLines(fname)
    infile <- gsub(pattern=" \\(.*\\)", replacement="", rl)
    infile <- gsub(pattern="Trait ", replacement="Trait", infile)
    cut.off <- grep(pattern="^\\[\\[Structure", infile)
    if(length(cut.off) > 0) infile <- infile[1:cut.off - 1]
    
    infile[1] <- title
    writeLines(infile, con=paste0(dirname(fname),
                                  "/",
                                  sub(".txt", "", basename(fname)), 
                                  "_cleaned", ".gen"))
  }

#' Remove illegal characters from several HexSim generated genepop input files
#' 
#' \code{multi.clean.genepop} is a wrapper for \code{clean.genepop} that processes
#'   all HexSim generated genepop files within a given scenario(s). It assumes 
#'   that the names of the report files were 
#'   not changed from defaults. It reads the files in 
#'   each of the scenario's iteration subfolder and processes them   
#'   using \code{clean.genepop}.
#'   
#' \code{multi.clean.genepop} identifies the files searching for a text file 
#'   whose root's name is the \emph{scenario_name} of the file and the text 
#'   'REPORT_genepop', so if you have modified the name of the files, this 
#'   function won't work. Similarly, if you have added files named with a pattern
#'   consistent with what described above, the function will try to process these
#'   as well.   
#'   
#' When using \code{multi.clean.genepop} there is no option to choose the title 
#'   of the genepop files 
#'   (i.e. the first line of the genepop input file).
#'   
#' @inheritParams multi.reports
#' @return Save to disk a genepop file with a suffix 'cleaned' and an extensin
#'      ".gen" for each iteration within eachscenario
#' @seealso clean.genepop
#' @importFrom tcltk tk_choose.dir
#' @export
multi.clean.genepop <- function(path.results=NULL, scenarios="all", 
                                pop.name=NULL) {
  #----------------------------------------------------------------------------#
  # Helper functions
  #----------------------------------------------------------------------------#
  
  apply.clean.gen <- function(nscen, l.iter.folders, scenarios, pop.name) {
    message(paste("Processing scenario:", scenarios[nscen]))
    iters <- seq_along(l.iter.folders[[nscen]])
    gen.names <- lapply(iters, file.names, nscen, l.iter.folders, scenarios, 
                        pop.name)
    lapply(gen.names, byTS)
  }
  
  byTS <- function (gen.name) {
    m<-regexpr("\\[[[:digit:]]+\\]", text = gen.name[1])
    txt <- regmatches(gen.name[1], m)
    message(paste("Replicate:", txt))
    lapply(gen.name, clean.genepop)
  }
  
  file.names <- function(iter, nscen, l.iter.folders, scenarios, pop.name) {
    n <- list.files(l.iter.folders[[nscen]][iter], 
                    pattern=paste0(scenarios[nscen], "_REPORT_genepop_", 
                                   pop.name, ".*", "txt$"), full.names=TRUE)
    return(n)
  }
  #----------------------------------------------------------------------------#
  
  if(is.null(pop.name)) stop("Please, provide the name of the population")
  txt <- "Please, select the 'Results' folder within the workspace"
  if(is.null(path.results)) path.results <- tk_choose.dir(caption = txt)
  if(length(scenarios) == 1) {
    if(scenarios == "all") 
      scenarios <- list.dirs(path=path.results, full.names=FALSE, recursive=FALSE)
  }  
  l.iter.folders <- lapply(scenarios, iter.folders, dir.path=path.results)
  nscens <- seq_along(scenarios)
  
  lapply(nscens, apply.clean.gen, l.iter.folders, scenarios, pop.name)
}


#' Write a .xml file to generate genepop input files in batch mode for all 
#' replicates of given scenario(s)
#' 
#' Generate a batch .xml file in the workspace directory that will instruct 
#'   OutputTransformer.exe to generate genepop input files for all replicates of 
#'   the scenario(s) passed with \code{scenarios}. If \code{scenarios="all"} is
#'   used, then all the scenarios will be included.
#' 
#' @param time.steps A numeric vector to indicate the time step to be included
#' @param traits A character vector with the name of the traits to be included 
#' @inheritParams collate.census 
#' @inheritParams multi.reports 
#' @return A .xml file named batchFile_genepop_Reports.xml
#' @importFrom tcltk tk_choose.dir
#' @export
w.genepop.batch <- function(path.results=NULL, scenarios="all", time.steps=1, 
                              pop.name, traits) {
  #----------------------------------------------------------------------------#
  # Helper functions
  #----------------------------------------------------------------------------#
  gen.input <- function(nscen, l.iter.folders, time.steps, pop.name, traits, 
                        scenarios) {
    iters <- seq_along(l.iter.folders[[nscen]])
    scenario <- scenarios[nscen]
    iter_TS <- lapply(iters, byiter, paths=l.iter.folders[[nscen]], time.steps, 
                      pop.name, traits, scenario)
    return(iter_TS)
  }
  
  byiter <- function(iter, paths, time.steps, pop.name, traits, scenario) {
    path <- paths[iter]
    TS <- lapply(time.steps, gen.block, pop.name, traits, scenario, path)
    return(TS)
  }
  
  gen.block <- function(time.step, pop.name, traits, scenario, path) {
    open.args <- "  <args>" 
    open.arg <- "    <arg>"
    close.arg <- "</arg>"
    close.args <- "  </args>"
    arg1 <- paste0("-genepop:", time.step, ":\"", pop.name, "\":\"", traits, "\"")
    log.file <- paste0(scenario, ".log")
    arg2 <- paste(path, log.file, sep="\\")
    block <- c(open.args, 
               paste0(open.arg, arg1, close.arg),
               paste0(open.arg, arg2, close.arg),
               close.args)
    return(block)
  }
  
  #----------------------------------------------------------------------------#
  
  if(is.null(path.results)) path.results <- tk_choose.dir(caption = txt)
  if(length(scenarios) == 1) {
    if(scenarios == "all") 
      scenarios <- list.dirs(path=path.results, full.names=FALSE, recursive=FALSE)
  }
  l.iter.folders <- lapply(scenarios, iter.folders, dir.path=path.results)
  nscens <- seq_along(scenarios)
  traits <- paste0(traits, collapse="\":\"" )
  
  fl <- "<?xml version=\"1.0\"?>"
  open.block <- "<OutputTransform>"
  close.block <- "</OutputTransform>"
  scen_iter_TS <- lapply(nscens, gen.input, l.iter.folders, time.steps, pop.name, 
                         traits, scenarios)
  
  wspace <- sub(pattern = "Results", replacement = "", path.results)
  writeLines(c(fl, open.block, unlist(scen_iter_TS), close.block),
             con=paste0(wspace, "batchFile_genepop_Reports.xml")) 
}
#' Genetic distance
#' 
#' \code{gen.dist} calculates the Jost's D between all possible population pairs
#'    using the R package \code{mmod}.
#'    
#' This function takes the output from the function \code{clean.genepop} 
#'   (or \code{multi.clean.genepop}) as input and uses the R package \code{adegenet}
#'   to convert the input in a genind object. 
#'     
#' A text file with extension .pairD that contains a pairwise distance  matrix 
#'   is saved to disk.
#'   
#' \code{mean.type} can be set to either "arithmetic" or "harmonic" (default), 
#'   which is used to calculate the global estimates of Hs and Ht (see 
#'   \code{mmod}'s manual for details).
#' @references Jost, L.O.U., 2008. GST and its relatives do not measure 
#'   differentiation. Molecular Ecology 17, 4015-4026.
#' @references Winter, D.J., 2012. mmod: an R library for the calculation of 
#'   population differentiation statistics. Molecular Ecology Resources 12, 
#'   1158-1160.
#'   
#' @param mean.type The type of mean to be calculated over multiple loci
#' @inheritParams clean.genepop
#' @import adegenet
#' @import mmod
#' @return Save to disk a file with same root of the input file with extension 
#'   .pairD
#' @export
gen.dist <- function(fname, mean.type="harmonic") {
  adegen.data <- adegenet::read.genepop(fname, ncode=3L, quiet=TRUE)
  gen.dist <- pairwise_D(adegen.data, hsht_mean=mean.type)
  dir.out <- dirname(fname)
  dist.name <- basename(fname)
  dist.name <- sub("REPORT_genepop_", "", dist.name)
  dist.name <- sub("_cleaned.gen", "", dist.name)
  dist.name <- paste(dist.name, "pairD", sep=".")
  write.csv(as.matrix(gen.dist), file=paste0(dir.out, "/", dist.name))
}

#' Genetic distance for multiple scenarios
#' 
#' \code{multi.gen.dist} is a wrapper for \code{gen.dist} that generates
#'   a matrix of genetic distances for multiple scenarios. 
#'   
#' \code{multi.gen.dist} will generate a matrix for each replicate within the scenario's folder.
#'   
#' @inheritParams multi.reports
#' @inheritParams gen.dist
#' @return See \code{gen.dist} for each scenario
#' @seealso gen.dist
#' @importFrom tcltk tk_choose.dir
#' @export
multi.gen.dist <- function(path.results=NULL, scenarios="all", pop.name=NULL,
                           mean.type="harmonic") {
  #----------------------------------------------------------------------------#
  # Helper functions
  #----------------------------------------------------------------------------#
  
  apply.gen.dist <- function(nscen, l.iter.folders, scenarios, pop.name) {
    message(paste("Processing scenario:", scenarios[nscen]))
    iters <- seq_along(l.iter.folders[[nscen]])
    gen.names <- lapply(iters, file.names, nscen, l.iter.folders, scenarios, 
                        pop.name)
    distances <- lapply(gen.names, byTS)
    return(distances)
  }
  
  byTS <- function (gen.name) {
    l.infiles <- lapply(gen.name, gen.dist, mean.type)
    return(l.infiles)
  }
  
  file.names <- function(iter, nscen, l.iter.folders, scenarios, pop.name) {
    fn <- list.files(l.iter.folders[[nscen]][iter], 
                    pattern=paste0(scenarios[nscen], "_REPORT_genepop_", pop.name,
                                    ".*", "_cleaned.gen$"), full.names=TRUE)
    return(fn)
  }
  #----------------------------------------------------------------------------#
  
  if(is.null(pop.name)) stop("Please, provide the name of the population")
  txt <- "Please, select the 'Results' folder within the workspace"
  if(length(scenarios) == 1) {
    if(scenarios == "all") 
      scenarios <- list.dirs(path=path.results, full.names=FALSE, recursive=FALSE)
  }
  
  l.iter.folders <- lapply(scenarios, iter.folders, dir.path=path.results)
  nscens <- seq_along(scenarios)
  
  lapply(nscens, apply.gen.dist, l.iter.folders, scenarios, pop.name)
  }

#' Mean genetic distance
#' 
#' This function calculates the mean and standard deviation of the genetic 
#'   distance (calculated with \code{gen.dist}) across all replicates for the same
#'   scenario.
#' 
#' @inheritParams collate.census 
#' @inheritParams multi.reports
#' @inheritParams invasion.front
#' @inheritParams w.genepop.batch
#' @return A list with three elements: the mean and standard
#'   deviation for each time step and overall. \code{m.gen.dist} also saves to 
#'   disk a .xlsx with the same name of the input file with the suffix "_means". 
#' @import XLConnect
#' @importFrom tcltk tk_choose.dir
#' @export
m.gen.dist <- function(path.results=NULL, scenarios="all", pop.name, 
                          traits) {
  #----------------------------------------------------------------------------#
  # Helper functions
  #----------------------------------------------------------------------------#
  # Return a list where each element is one scenario
  byscen <- function (nscen, scenarios, l.iter.folders, pop.name, traits, path.results) {
    fnames <- list.files(path=l.iter.folders[[nscen]][1], 
                         pattern=paste0(scenarios[[nscen]], "_", pop.name,
                                        "_[0-9]+_", traits, ".pairD"))
    iters <- seq_along(l.iter.folders[[nscen]]) 
    l.TS.i <- lapply(fnames, byTS, iters, l.iter.folders, nscen, path.results,
                     scenarios)
    return(l.TS.i)
  }
  
  byTS <- function (fname, iters, l.iter.folders, nscen, path.results, scenarios) {
    l.data <- lapply(iters, read.data, fname, l.iter.folders, nscen)
    outer.len <- length(l.data)
    inner.len <-  dim(l.data[[1]])
    arr <- array( unlist(l.data) , c(inner.len, outer.len) )
    dist.means <- apply(arr, 1:2, mean, na.rm=TRUE)
    dist.sds <- apply(arr, 1:2, sd, na.rm=TRUE)
    
    means.dist.name <- sub(pattern=".pairD", replacement="_means", fname)
    wb.name <- paste0(path.results, "/", scenarios[nscen], "/", 
                      means.dist.name, ".xlsx")
    if(file.exists(wb.name)) file.remove(wb.name)
    wb <- loadWorkbook(wb.name, create=TRUE)
    createSheet(wb, name="means")
    writeWorksheet(wb, dist.means, sheet="means")
    createSheet(wb, name="sd")
    writeWorksheet(wb, dist.sds, sheet="sd")
    saveWorkbook(wb)    
    return(list(dist.means=dist.means, dist.means=dist.sds))
  }
  
  read.data <- function(iter, fname, l.iter.folders, nscen) {
    f <- paste(l.iter.folders[[nscen]][iter], fname, sep="/")
    dist.data <- as.matrix(read.csv(f, row.names=1)) 
    return(dist.data)
  }
  
  trait.assembler <- function(traits) {
    comb <- unlist(lapply(traits, function(trait) paste0("\\[", trait, "\\]")))
    comb <- paste0(comb, collapse="")
  }
  #----------------------------------------------------------------------------#
  txt <- "Please, select the 'Results' folder within the workspace"
  if(is.null(path.results)) path.results <- tk_choose.dir(caption = txt)
  if(length(scenarios) == 1) {
    if(scenarios == "all") 
      scenarios <- list.dirs(path=path.results, full.names=FALSE, recursive=FALSE)
  }
  traits <- trait.assembler(traits)
  
  l.iter.folders <- lapply(scenarios, iter.folders, dir.path=path.results)
  nscens <- seq_along(scenarios)
  means.gen.dist <- lapply(nscens, byscen, scenarios=scenarios, pop.name, 
                           traits=traits, l.iter.folders=l.iter.folders, 
                           path.results)
  
  return(means.gen.dist)
}
carlopacioni/HexSimR documentation built on Nov. 28, 2020, 4:12 p.m.