R/genemapper2df.R

Defines functions genemapper2df

Documented in genemapper2df

#' Combine txt files generated by GeneMapper
#'
#' @param folder_name a folder where all the .txt files
#'   generated by GeneMapper are stored.
#' @param export_name the name of the exported csv file.
#' @param export_option whether to export a csv file (option "csv")
#' or a data.frame in the R Environment (option "R").
#'
#' @return a single csv file or a data.frame
#' @export
#'
#' @author Nikolaos Tourvas
#' @description This function allows the user to quickly convert multiple txt
#' files of microsatellite genotyping data from GeneMapper to a csv or a data.frame.
#' It was tested with txt files produced by GeneMapper v4.0 up to v6.0.
#'
#' @examples
#' \dontrun{
#'
#' # Export csv file for use in GenAlEx
#' genemapper2df(folder_name = "...", export_name = "my_data", export_option = "csv")
#'
#' # Create genind object
#' df <- genemapper2df(folder_name = "...", export_option = "R")
#' obj <- loci2genind(as.loci(df))
#'
#' # For genind objects you will probably want to import population data as well.
#' # This can be done in the following way.
#'
#' strata(obj) <- strata_df    #provide a data.frame with population strata (e.g. Year, Plot)
#' setPop(obj) <- ~Year/Plot    #set the appropriate population hierachy for the analysis
#' ## Note that you can change population hierarchy between analyses using the previous command.
#' }
#' @import dplyr
#' @import tidyr
#' @importFrom magrittr %>%
genemapper2df <- function(folder_name,
                          export_name = "genemapper_export.csv",
                          export_option = "csv") {

# function
# if Allele.2 spot is empty but Allele.1 is not,
# then copy Allele.1 to Allele.2
replace.empty <- function(x) {
  x %>%
    mutate(Allele.2.edit =
             case_when(
               is.na(Allele.2) ~ Allele.1,
               is.numeric(Allele.2) ~ Allele.2
             )
    )
}

temp <- list.files(path = folder_name,
                   pattern = "*.txt",
                   full.names = TRUE)
df <- lapply(temp, read.csv, sep = "\t",
               na.strings = "")

# Check if all input files are tab delimited
for (i in 1:length(df)){
  if(ncol(df[[i]]) < 3) {
    stop(call. = FALSE,
         "The input file is probably not tab delimited for marker",
         unique(df[[i]]$Marker))
  }
}

# Check if all alleles are coded as numbers
for (i in 1:length(df)){
  if(!is.numeric(df[[i]]$Allele.1)) {
    stop(call. = FALSE,
         "An allele is not coded as a number in the txt file of marker ",
         unique(df[[i]]$Marker))
  } else if (!is.numeric(df[[i]]$Allele.2)) {
    stop(call. = FALSE,
         "An allele is not coded as a number in the txt file of marker ",
         unique(df[[i]]$Marker))
  }
}

## TODO: make check that marker names do not have spaces

# sort data.frame based on Sample.File column
df <- lapply(df, arrange, Sample.File)
df <- lapply(df, select,
               Sample.File, Marker, Allele.1, Allele.2)

df_edit <- lapply(df, replace.empty)
df_edit <- lapply(df_edit, select,
               Allele.1, Allele.2.edit)

# vector of marker names
markers <- list()
for (i in 1:length(df)){
  markers[[i]] <- unique(df[[i]]$Marker)
}
markers <- unlist(markers)
markers <- rep(markers, each=2)

# merge multiple data.frames inside the list
# into one data.frame
df_2col <- do.call(cbind, df_edit)
colnames(df_2col) <- markers # add marker names

# add ind names column
ind_names <- df[[1]]$Sample.File
ind_names <- gsub(pattern = ".fsa",
                  replacement = "",
                  x = ind_names)

# df_final <- df_final %>%
#   add_column(Genotype = ind_names,
#              .before = 1)
rownames(df_2col) <- ind_names
df_2col[is.na(df_2col)] <- 0

if (export_option == "csv") {
  write.csv(x = df_2col, file = export_name)
} else if (export_option == "R") {

  N<-ncol(df_2col)
  n=N/2
  A<-seq(1,N, by=2)
  B<-seq(2,N, by=2)
  for (i in 1:n){
    locus<-(paste(df_2col[,A[i]],"/",df_2col[,B[i]],sep=""))
    if (exists("geno")) geno<-data.frame(geno,locus) else
      geno<-as.data.frame(locus)
  }

  rownames(geno) <- rownames(df_2col) # ind names
  markers <- colnames(df_2col)[c(TRUE, FALSE)]
  colnames(geno) <- markers
  return(geno)
} else {
  stop(call. = FALSE,
  'The export_option parameter should take one of the following values
        "csv" or "R" ')
}
}
nikostourvas/PopGenUtils documentation built on April 23, 2022, 1:14 p.m.