R/eco.format.R
In EcoGenetics: Management and Exploratory Analysis of Spatial Data in Landscape Genetics

#' Format tool for genetic data
#' 
#' @details The function can format data with different ploidy levels. 
#' It allows to:
#' - add/remove zeros at the beginning/end of each allele
#' - separate alleles with a character
#' - divide alleles into columns - bind alleles from separate columns
#' - transform character data into numeric data
#' 
#' "NA" is considered special character (not available data).
#' 
#' Note that the function can also work with other type of data as well, 
#' where the "alleles" represent the different states of the variables.
#'
#' @param data Genetic data frame.
#' @param ncod Number of digits coding each allele in the input file.
#' @param nout Number of digits in the output.
#' @param ploidy Ploidy of the data.
#' @param sep.in Character separating alleles in the input data if present.
#' @param sep.out Character separating alleles in the output data. Default 
#' @param fill.mode Add zeros at the beggining ("fist") or the end ("last")
#' of each allele. Default = "last". 
#' @param recode Recode mode: "none" for no recoding (defalut), "all" for recoding
#' the data considering all the individuals values at once (e.g., protein data), 
#' "column" for recoding the values by column (e.g., microsatellite data), "paired" 
#' for passing the values of allelic states and corresponding replacement values, using 
#' the replace_in and replace_out arguments (e.g. replace_in = c("A", "T", "C", "G"),
#' replace_out = c(1,2,3,4)).
#' @param replace_in vector with states of the data matrix to be replaced, when recode = "paired".
#' This argument must be used in conjunction with the argument "replace_out".
#' @param replace_out vector with states of the data matrix used for replacement, when recode = "paired".
#' This argument must be used in conjunction with the argument "replace_in".
#' @param show.codes May we returned tables with the equivalence between the old 
#' and new codes when recode = "all" or recode = "column"?
#' 
#' @examples
#' \dontrun{
#' 
#' data(eco.test)
#' 
#' # Adding zeros
#' 
#' example <- as.matrix(genotype[1:10,])
#' mode(example) <- "character"
#' # example data
#' example                
#' recoded <- eco.format(example, ncod = 1, ploidy = 2, nout = 3)
#' # recoded data
#' recoded
#' 
#' 
#' # Tetraploid data, separating alleles with a "/"
#' tetrap <- as.matrix(example)
#' # simulated tetraploid example data
#' tetrap <- matrix(paste(example,example, sep = ""), ncol = ncol(example)) 
#' recoded <- eco.format(tetrap, ncod = 1, ploidy = 4, sep.out = "/")
#' # recoded data
#' recoded
#' 
#' # Example with a single character
#' ex <- c("A","T","G","C")
#' ex <- sample(ex, 100, rep= T)
#' ex <- matrix(ex, 10, 10)
#' colnames(ex) <- letters[1:10]
#' rownames(ex) <- LETTERS[1:10]
#' # example data
#' ex  
#' recoded <- eco.format(ex, ploidy = 1, nout = 1,  recode = "all", show.codes = TRUE) 
#' # recoded data 
#' recoded
#' 
#' # Example using values-replacement pairs
#' samp1 <- sample(c("A","T","G","C"), 100, replace = TRUE)
#' samp2 <- sample(c("A","T","G","C"), 100, replace = TRUE)
#' paired <- matrix(paste0(samp1, samp2), 10, 10)
#' # Generate some NAs
#' paired[sample(1:100, 10)]<-NA
#' out <- eco.format(paired, recode = "paired", replace_in = c("A", "T", "G", "C"),
#' replace_out = c(1, 2, 3, 4))
#' out
#' 
#' 
#' # Example with two strings per cell and missing values:
#' ex <- c("Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", 
#' "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", 
#' "Val", "Trp")
#' ex1 <- sample(ex, 100, rep= T)
#' ex2 <- paste(ex1, ex1, sep="")
#' missing.ex2 <- sample(1:100, 20)
#' ex2[missing.ex2] <-NA
#' ex2 <- matrix(ex2, 10, 10)
#' colnames(ex2) <- letters[1:10]
#' rownames(ex2) <- LETTERS[1:10]
#' # example data
#' ex2
#' recoded <- eco.format(ex2, ncod = 3, ploidy = 2, 
#'                       nout = 2, recode = "column")
#' # recoded data
#' recoded
#' 
#' # Example with a vector, following the latter example:
#' ex1 <- as.data.frame(ex1)
#' # example data
#' ex1
#' recoded <- eco.format(ex1, ploidy = 1, recode = "all")
#' # recoded data
#' recoded
#' 
#' }
#' 
#' @author Leandro Roser \email{learoser@@gmail.com}
#' 
#' @export


setGeneric("eco.format", function(data, 
                                  ncod = NULL, 
                                  nout = 3, 
                                  ploidy = 2, 
                                  sep.in = "",
                                  sep.out = "",
                                  fill.mode = c("last", "first", "none"),
                                  recode = c("none", "all", "column", "paired"),
                                  replace_in = NULL,
                                  replace_out =NULL,
                                  show.codes = FALSE) {
  
  
  fill.mode <- match.arg(fill.mode)
  recode <- match.arg(recode)
  
  rep1 <- !is.null(replace_in)
  rep2 <- !is.null(replace_out)

  # set arguments for paired case
  if((rep1 || rep2)) {
    if(!(rep1 && rep2) || length(rep1) != length(rep2)) {
    stop("both replace_in and replace_out must be non null and of same length
         if one of these arguments is used")
    }
    paired_replacement <- TRUE
    replace_in <- as.character(replace_in)
    replace_out <- as.character(replace_out)
  } else {
    paired_replacement <- FALSE
  }

  data <- as.matrix(data, rownames.force = TRUE)
  
  mode(data) <- "character"
  data <- aue.rmspaces(data)
  data <- int.check.colnames(data)
  data <- int.check.rownames(data)
  indnames <- rownames(data)
  locnames <- colnames(data)
  
  #----control---------------------------------------------------------#  
  
  # recode check
  if(any(grep("[^[:digit:]]", data)) && recode == "none") {
    stop("Non numeric characters found. Use the argument 'recode'")
  }
  
  # ploidy check
  if(is.null(ncod)) {
    ncode_in <- FALSE
  } else {
    ncode_in <- TRUE
  }
  ncod <- int.check.ncod(data, ploidy = ploidy, ncod = ncod, sep= sep.in)
  
  if(recode == "none" && nout < ncod) {
    stop("nout (output number of digits) < ncod 
         (input number of digits per allele) is not valid")
  }
  
  
  ###########------RECODING CASE-----------####################################
  if(recode != "none") {
    
    #----------- function for recoding-----------------#
    singlechar <- function(x) {
      
      y <- as.vector(as.matrix(x))
      y <- as.factor(y)
      y.lev <- levels(y)
      num_states <- length(y.lev)
      original.code <- y.lev[!is.na(y.lev)]
      y <- as.numeric(y)
      ncod.y <- nchar(y, keepNA = TRUE)
      
      if(nout < nchar(num_states)) {
        stop("nout (output number of digits): ", nout, ", but the number of allelic states 
             found are: ", num_states)
      }
      
      #ncod.y[is.na(y)] <- NA
      ncl <- length(y)
      # recoding data
      
      if(fill.mode ==  "last") {
        if(nout>1) {
        add.dig <- paste(rep(1, ncl), paste(rep(0, nout-1), collapse = ""), sep = "")
        add.dig <- as.numeric(add.dig)
        y <- y + add.dig
        }
        y <- as.character(y)
        
      } else {
        y <- as.character(y)
        if(nout>1) {
        add.zeros <- nout-ncod.y
        add.zeros[is.na(add.zeros)] <- 0
        for(i in seq_len(ncl)) {
          zero.init <- paste(rep("0", add.zeros[i]), collapse = "")
          y[i] <- paste(zero.init, y[i],  sep = "")
        }
        }
      }
      
      y <- matrix(y, ncol = ncol(x))
      # codes
      y.cod <- as.factor(y)
      new.code <- levels(y.cod)[which(levels(y.cod)!= "NA")]
      y.tab <- data.frame(original.code, new.code)
      
      list(y, y.tab)
      
      }
    #--------------------------------------------------#
    
    #alleles in columns
    X <- int.loc2al(X = data, ncod = ncod, ploidy = ploidy, 
                    sep.in = sep.in, chk.plocod = FALSE,
                    chk.names = FALSE)
    X <- gsub("(NA)+", NA, X)
    # for recoding all data
    
    if(paired_replacement) {
      nlev <- levels(as.factor(X))
      nlev_len <- length(nlev)
      nin_len <-  length(replace_in)
      if(nlev_len != nin_len) {
        stop(paste0("The number of arguments used for replacement (", nin_len, ") differ of the number of 
           data states (", nlev_len, ")"))
      }
      
      cat(paste0("Levels found in the data: ", paste(nlev, collapse = ", ")), "\n")
      cat(paste0("Replacements: ", paste(replace_in, replace_out, sep = "=", collapse=", ")), "\n")
      conv_list <- matrix( c(replace_in, replace_out), 
                           nrow= length(replace_in),
                           dimnames = list(seq_len(nin_len), c("original_code", "new_code")))
      
      
      X[] <- replace_out[match(X, replace_in)]
    
    } else if(recode == "all") {
      recoding <- singlechar(X)
      # create objects with the recoded data
      X <- recoding[[1]]
      conv.list <- recoding[[2]]
      
    } else if(recode == "column") {
      X <- int.al2listal(X, ncod = ncod, ploidy = ploidy)
      recoding <- lapply(X, singlechar)
      X <- lapply(recoding, function(y) y[[1]])
      X <- do.call(cbind, X)
      conv.list <- lapply(recoding, function(y) y[[2]])
    }
    
    ###########------NO RECODING CASE-----------####################################
    
  } else  {
    
    X.list <- int.loc2listal(X = data, ncod = ncod, ploidy = ploidy)
    
    X <- lapply(X.list,     
                function(x) {
                  if(fill.mode ==  "last") {
                    x <-  paste(x, paste(rep("0", nout - ncod), collapse = ""), sep = "")
                    x<- matrix(x, ncol = ploidy)
                  } else {
                    x <-  paste(paste(rep("0", nout - ncod), collapse = ""), x, sep = "")
                    x <- matrix(x, ncol = ploidy)
                  }
                })
    
    
    X <- do.call(cbind, X)
  }
  
  # collapse alleles
  X <- suppressMessages(int.al2loc(X, ploidy = ploidy, sep.out = sep.out))
  
  
  # correction for NA
  X <- gsub("(^NA)(0*$)|(^0*)(NA$)", "NA", X)
  
  rownames(X) <- indnames
  colnames(X) <- locnames
  
  if(recode != "none" && show.codes == TRUE) { 
    X <- list(out = X, codes = conv.list)
  }
  
  X
})

Any scripts or data that you put into this service are public.

EcoGenetics documentation built on July 8, 2020, 5:46 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

EcoGenetics
Management and Exploratory Analysis of Spatial Data in Landscape Genetics

R/eco.format.R
In EcoGenetics: Management and Exploratory Analysis of Spatial Data in Landscape Genetics

Try the EcoGenetics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

EcoGenetics Management and Exploratory Analysis of Spatial Data in Landscape Genetics

R/eco.format.R In EcoGenetics: Management and Exploratory Analysis of Spatial Data in Landscape Genetics

Try the EcoGenetics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

EcoGenetics
Management and Exploratory Analysis of Spatial Data in Landscape Genetics

R/eco.format.R
In EcoGenetics: Management and Exploratory Analysis of Spatial Data in Landscape Genetics