R/gl.report.replicates.r
In dartR: Importing and Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

Documented in gl.report.replicates

#' @name gl.report.replicates
#' @title Identify replicated individuals 
#' @description
#' Identify replicated individuals 
#' @param x Name of the genlight object containing the SNP data [required].
#' @param loc_threshold Minimum number of loci required to asses that two 
#' individuals are replicates [default 100].
#' @param perc_geno Mimimum percentage of genotypes in which two individuals 
#' should be the same [default 0.99]. 
#' @param plot.out Specify if plot is to be produced [default TRUE].
#' @param plot_theme User specified theme [default theme_dartR()].
#' @param plot_colors Vector with two color names for the borders and fill
#' [default two_colors].
#' @param bins Number of bins to display in histograms [default 100].
#' @param verbose Verbosity: 0, silent or fatal errors; 1, begin and end; 2,
#' progress log; 3, progress and results summary; 5, full report
#' [default 2, unless specified using gl.set.verbosity].
#' @details
#' This function uses an C++ implementation, so package Rcpp needs to be 
#' installed and it is therefore fast (once it has compiled the function after 
#' the first run).
#' 
#' Ideally, in a large dataset with related and unrelated individuals and 
#' several replicated individuals, such as in a capture/mark/recapture study, 
#' the first histogram should have four "peaks". The first peak should represent
#'  unrelated individuals, the second peak should correspond to second-degree 
#'  relationships (such as cousins), the third peak should represent 
#'  first-degree relationships (like parent/offspring and full siblings), and
#'   the fourth peak should represent replicated individuals. 
#'   
#' In order to ensure that replicated individuals are properly identified, it's
#'  important to have a clear separation between the third and fourth peaks in 
#'  the second histogram. This means that there should be bins with zero counts 
#'  between these two peaks.
#' @return A list with three elements:
#'\itemize{
#'\item table.rep: A dataframe with pairwise results of percentage of same 
#'genotypes between two individuals, the number of loci used in the comparison 
#'and the missing data for each individual.
#'\item ind.list.drop: A vector of replicated individuals to be dropped.
#' Replicated individual with the least missing data is reported.
#'\item ind.list.rep: A list of of each individual that has replicates in the 
#'dataset, the name of the replicates and the percentage of the same genotype.
#'  }
#' @author Custodian: Luis Mijangos -- Post to
#' \url{https://groups.google.com/d/forum/dartr}
#' @examples
#' \donttest{
#' res_rep <- gl.report.replicates(platypus.gl, loc_threshold = 500, 
#' perc_geno = 0.85)
#' }
#' @family report functions
#' @export

gl.report.replicates <- function(x,
                            loc_threshold = 100,
                            perc_geno = 0.99,
                            plot.out = TRUE,
                            plot_theme = theme_dartR(),
                            plot_colors = two_colors,
                            bins = 100,
                            verbose = NULL
){
  # SET VERBOSITY
  verbose <- gl.check.verbosity(verbose)
  
  # FLAG SCRIPT START
  funname <- match.call()[[1]]
  utils.flag.start(func = funname,
                   build = "Jody",
                   verbose = verbose)
  
  # CHECK DATATYPE
  datatype <- utils.check.datatype(x, verbose = verbose)
  
  # DO THE JOB
  xx <- as.matrix(x)
  
  # number of same genotypes pairwise
  #to hack package checking...
  SameGeno <- function() {}  
  Rcpp::cppFunction(
    "NumericMatrix SameGeno(NumericMatrix x) {
  int nrow = x.nrow();
  NumericMatrix out(nrow,nrow);
  for (int i=0; i<(nrow-1); i++) {
     for (int j=(i+1); j<nrow; j++) {
     out(j,i) = sum(na_omit(x(i,_)==x(j,_)) );
     }
  }
  return out;
}"
  )
  
  SameGeno_tmp <- SameGeno(xx)
  # number of no NAs in either or both genotypes pairwise
  #to hack package checking...
  SameGenoNA <- function() {}  
  Rcpp::cppFunction(
    "NumericMatrix SameGenoNA(NumericMatrix x) {
  int nrow = x.nrow();
  NumericMatrix out(nrow,nrow);
  for (int i=0; i<(nrow-1); i++) {
     for (int j=(i+1); j<nrow; j++) {
     out(j,i) = sum( !is_na( x(i,_) == x(j,_) ));
     }
  }
  return out;
}"
  )
  
  SameGenoNA_tmp <- SameGenoNA(xx)
  colnames(SameGenoNA_tmp) <- indNames(x)
  rownames(SameGenoNA_tmp) <- indNames(x)
  
  mat_same <- SameGeno_tmp/SameGenoNA_tmp
  
  colnames(mat_same) <- indNames(x)
  rownames(mat_same) <- indNames(x)
  
  col_same <- as.data.frame(as.table(mat_same))
  colnames(col_same) <- c("ind1","ind2","perc")
  perc <- NULL
  col_noNas <- as.data.frame(as.table(SameGenoNA_tmp))
  
  col_same$nloc <- col_noNas$Freq
  
  col_same <- col_same[complete.cases(col_same$perc),]
  # holding complete dataset for plotting 
  col_same_hold <- col_same
  
  col_same <- col_same[which(col_same$nloc > loc_threshold),]
  col_same <- col_same[which(col_same$perc > perc_geno),]
  
  col_same <- col_same[order(col_same$perc,decreasing = TRUE),]
  
  unique_ind <- col_same
  unique_ind$ind1 <- as.character(unique_ind$ind1)
  unique_ind$ind2 <- as.character(unique_ind$ind2)
  
  ind_NA <- 1 - rowSums(is.na(as.matrix(x))) / nLoc(x)
  ind1_NA <- data.frame(ind1 = indNames(x), ind1_NA = ind_NA)
  ind2_NA <- data.frame(ind2 = indNames(x), ind2_NA = ind_NA)

  unique_ind <- merge(unique_ind,ind1_NA,by="ind1")
  unique_ind <- merge(unique_ind,ind2_NA,by="ind2")
  unique_ind <- cbind(ind1=unique_ind[,2],unique_ind[,-2])
  
  # getting vector of individuals (replicates) to drop based on missing data
  
  unique_ind_tmp <- unique_ind
  unique_ind_tmp$test_stat <- unique_ind_tmp$ind1_NA > unique_ind_tmp$ind2_NA
  
  ind_list <- NULL
  
    for (y in 1:nrow(unique_ind_tmp)) {
      if (unique_ind_tmp[y, "test_stat"] == FALSE) {
        ind_tmp <- unique_ind_tmp[y, "ind1"]
      } else{
        ind_tmp <- unique_ind_tmp[y, "ind2"]
      }
      
      if (ind_tmp %in% ind_list) {
        next
      } else{
        ind_list <- c(ind_list, ind_tmp)
      }
      
    }
  
  # getting list of replicated individuals
  
  ind_mat <- as.matrix(reshape2::acast(unique_ind, ind1~ind2, value.var="perc"))
  # ind_mat[lower.tri(ind_mat,diag = TRUE)] <- NA
  
  ind_list_rep <- apply(ind_mat,2,function(x){
    names(x[which(!is.na(x))])
  })
  
  # Histograms
  p1_col_same_hold <- col_same_hold
  p1 <-
    ggplot(p1_col_same_hold, aes(x = perc)) + 
    geom_histogram(bins = bins, color = plot_colors[1],fill = plot_colors[2]) +
    xlab(" ") +
    ylab("Count") +
    plot_theme
  
  p2_col_same_hold <- col_same_hold[which(col_same_hold$perc>0.8),]
  p2 <-
    ggplot(p2_col_same_hold, aes(x = perc)) + 
    geom_histogram(bins = bins, color = plot_colors[1],fill = plot_colors[2]) +
    xlab("Percentage of same genotype") +
    ylab("Count") +
    plot_theme
  
  # PRINTING OUTPUTS
  if (plot.out) {
    # using package patchwork
    p3 <- (p1 / p2) 
    print(p3)
  }
  
  # FLAG SCRIPT END
  
  if (verbose >= 1) {
    cat(report("Completed:", funname, "\n"))
  }
  
  # RETURN
  
  return(list(table.rep = unique_ind,
              ind.list.drop = ind_list,
              ind.list.rep = ind_list_rep))
  
}

Any scripts or data that you put into this service are public.

dartR documentation built on April 4, 2025, 12:23 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dartR
Importing and Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

R/gl.report.replicates.r
In dartR: Importing and Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

Defines functions gl.report.replicates

Documented in gl.report.replicates

Try the dartR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dartR Importing and Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

R/gl.report.replicates.r In dartR: Importing and Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

Defines functions gl.report.replicates

Documented in gl.report.replicates

Try the dartR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

dartR
Importing and Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis

R/gl.report.replicates.r
In dartR: Importing and Analysing 'SNP' and 'Silicodart' Data Generated by Genome-Wide Restriction Fragment Analysis