R/gl.report.sexlinked.r

Defines functions gl.report.sexlinked

Documented in gl.report.sexlinked

#' @name gl.report.sexlinked
#' @title Identifies loci that are sex linked
#' @description
#' Alleles unique to the Y or W chromosome and monomorphic on the X chromosomes
#' will appear in the SNP dataset as genotypes that are heterozygotic in all
#' individuals of the heterogametic sex and homozygous in all individuals of the
#' homogametic sex. This function identifies loci with alleles that behave in
#' this way, as putative sex specific SNP markers.
#' @param x Name of the genlight object containing the SNP or presence/absence
#'  (SilicoDArT) data [required].
#' @param sex Factor that defines the sex of individuals. See explanation in
#' details [default NULL].
#' @param t.het Tolerance in the heterogametic sex, that is t.het=0.05 means
#' that 5\% of the heterogametic sex can be homozygous and still be regarded as
#' consistent with a sex specific marker [default 0.1].
#' @param t.hom Tolerance in the homogametic sex, that is t.hom=0.05 means that
#'  5\% of the homogametic sex can be heterozygous and still be regarded as
#'  consistent with a sex specific marker [default 0.1].
#' @param t.pres Tolerance in presence, that is t.pres=0.05 means that a
#' silicodart marker can be present in either of the sexes and still be regarded
#' as a sex-linked marker [default 0.1].
#' @param plot.out Creates a plot that shows the heterozygosity of males and
#' females at each loci and shaded area in which loci can be regarded as 
#' consistent with a sex specific marker [default TRUE].
#' @param plot_theme Theme for the plot. See Details for options
#' [default theme_dartR()].
#' @param plot_colors List of three color names for the not sex-linked loci, for
#'  the sex-linked loci and for the area in which sex-linked loci appear
#'  [default three_colors].
#' @param verbose Verbosity: 0, silent or fatal errors; 1, begin and end; 2,
#' progress log; 3, progress and results summary; 5, full report
#' [default NULL, unless specified using gl.set.verbosity].
#' @details
#' Sex of the individuals for which sex is known with certainty can be provided
#' via a factor (equal to the length of the number of individuals) or to be held
#' in the variable \code{x@other$ind.metrics$sex}.
#' Coding is: M for male, F for female, U or NA for unknown/missing.
#' The script abbreviates the entries here to the first character. So, coding of
#' 'Female' and 'Male' works as well. Character are also converted to upper
#' cases.
#'
#''\strong{ Function's output }
#'
#' This function creates a plot that shows the heterozygosity of males and
#' females at each loci or SNP data or percentage of present/absent in the case
#' of SilicoDArT data.
#'
#'  Examples of other themes that can be used can be consulted in \itemize{
#'  \item \url{https://ggplot2.tidyverse.org/reference/ggtheme.html} and \item
#'  \url{https://yutannihilation.github.io/allYourFigureAreBelongToUs/ggthemes/}
#'  }
#' @return Two lists of sex-linked loci, one for XX/XY and one for ZZ/ZW systems
#' and a plot.
#' @author Arthur Georges, Bernd Gruber & Floriaan Devloo-Delva
#' (Post to \url{https://groups.google.com/d/forum/dartr})
#' @examples
#'  \donttest{
#' out <- gl.report.sexlinked(testset.gl)
#' out <- gl.report.sexlinked(testset.gs)
#' }
#' test <- gl.filter.callrate(platypus.gl)
#' test <- gl.filter.monomorphs(test)
#' out <- gl.report.sexlinked(test)
#' @family report functions
#' @export

gl.report.sexlinked <- function(x,
                                sex = NULL,
                                t.het = 0.1,
                                t.hom = 0.1,
                                t.pres = 0.1,
                                plot.out = TRUE,
                                plot_theme = theme_dartR(),
                                plot_colors = three_colors,
                                verbose = NULL) {
    # SET VERBOSITY
    verbose <- gl.check.verbosity(verbose)
    
    # FLAG SCRIPT START
    funname <- match.call()[[1]]
    utils.flag.start(func = funname,
                     build = "Jackson",
                     verbosity = verbose)
    
    # CHECK DATATYPE
    datatype <-
        utils.check.datatype(x,
                             accept = c("SNP", "SilicoDArT"),
                             verbose = verbose)
    
    # FUNCTION SPECIFIC ERROR CHECKING
    
    # Check for monomorphic loci
    tmp <- gl.filter.monomorphs(x, verbose = 0)
    if ((nLoc(tmp) < nLoc(x)) & verbose >= 2) {
        cat(warn("  Warning: genlight object contains monomorphic loci\n"))
    }
    
    # FLAG SCRIPT START
    
    if (verbose >= 1) {
        if (verbose == 5) {
            cat(report("\n\nStarting", funname, "[ Build =", build, "]\n\n"))
        } else {
            cat(report("\n\nStarting", funname, "\n\n"))
        }
    }
    
    # DO THE JOB
    
    # sex should be provided as it is not a default setting, if not provided it
    #will be searched here:
    if (is.null(sex)) {
        sex <- x@other$ind.metrics$sex
    }
    
    if (is.null(sex)) {
        stop(
            error(
                "No definition for the sex of individuals is provided. If not
               provided via the function call it needs to be at 
               gl@other$ind.metrics$sex.
               Please refer to the help pages how you can define sex of
                individuals."
            )
        )
    }
    
    if (length(sex) != nInd(x)) {
        stop(
            error(
                "The number of individuals and the number of entries defining 
                the sex do not match. Check your genlight object and your sex 
                defining column."
            )
        )
    }
    
    sex <- as.character(sex)
    UP <- toupper(sex)
    UP <- substring(UP, 1, 1)
    sex[UP == "F"] <- "F"
    sex[UP == "M"] <- "M"
    sex <- ifelse(sex == "F" | sex == "M", sex, "U")
    sex[is.na(sex)] <- "U"
    
    ########### FOR SNP data
    
    if (datatype == "SNP") {
        # Extract the data for the females
        matf <- as.matrix(x[sex == "F",])
        # For each individual
        f <- array(data = NA, dim = c(ncol(matf), 3))
        for (i in 1:ncol(matf)) {
            for (j in 1:3) {
                dummy <- sum(matf[, i] == (j - 1), na.rm = T)
                if (is.na(dummy))
                    dummy <- 0
                f[i, j] <- dummy
            }
        }
        dff <- data.frame(f)
        row.names(dff) <- locNames(x)
        # genotypes 0 for homozygous for reference allele is in F0, 1 for
        #heterozygous is in F1 and 2 for homozygous for alternative
        # allele is in F2
        colnames(dff) <- c("F0", "F1", "F2")
        
        # Extract the data for the males
        matm <- as.matrix(x[sex == "M",])
        # For each individual
        m <- array(data = NA, dim = c(ncol(matm), 3))
        for (i in 1:ncol(matm)) {
            for (j in 1:3) {
                dummy <- sum(matm[, i] == (j - 1), na.rm = T)
                if (is.na(dummy))
                    dummy <- 0
                m[i, j] <- dummy
            }
        }
        dfm <- data.frame(m)
        row.names(dfm) <- locNames(x)
        # genotypes 0 for homozygous for reference allele is in M0, 1 for
        #heterozygous is in M1 and 2 for homozygous for alternative
        # allele is in M2
        colnames(dfm) <- c("M0", "M1", "M2")
        
        # Combine the two files
        
        df <- cbind(dff, dfm)
        
        df$read.depth <- x@other$loc.metrics$rdepth
        
        # Check for hets in all males, homs in all females (XY); ditto for ZW
        sumf <- df$F0 + df$F1 + df$F2
        summ <- df$M0 + df$M1 + df$M2
        
        # Pull loci that are 100% homozygous for females and 100% heterozygous
        #for males
        indexxy <-
            ((df$F0 / (sumf) >= (1 - t.hom) |
                  df$F2 / (sumf) >= (1 - t.hom)) &
                 df$M1 / (summ) >= (1 - t.het))
        # when all loci are homozygous for the reference allele e.g.
        #df$F0/(sumf), the division is NaN. So, all the NaN's are set as TRUE
        indexxy[is.na(indexxy)] <- TRUE
        
        if (sum(indexxy, na.rm = T) > 0) {
            xy <- cbind(locnr = which(indexxy == TRUE), df[indexxy,])
            # when F0, F1, F2 or M0, M1, M2 are all 0 due to NAs heterozygosity
            #is NaN. these cases are removed
            xy$fhet <- xy$F1 / (xy$F0 + xy$F1 + xy$F2)
            xy$mhet <- xy$M1 / (xy$M0 + xy$M1 + xy$M2)
            xy <- xy[complete.cases(xy),]
        }
        
        if (sum(indexxy, na.rm = T) == 0) {
            xy <- data.frame()
        }
        
        # Pull loci that are 100% homozygous for males and 100% heterozygous for
        #females
        indexzw <-
            ((df$M0 / (summ) >= (1 - t.hom) |
                  df$M2 / (summ) >= (1 - t.hom)) &
                 df$F1 / (sumf) >= (1 - t.het))
        # when all loci are homozygous for the reference allele e.g.
        #df$M0/(summ), the division is NaN. So all the NAN's are set as TRUE
        indexzw[is.na(indexzw)] <- TRUE
        
        if (sum(indexzw, na.rm = T) > 0) {
            zw <- cbind(locnr = which(indexzw == TRUE), df[indexzw,])
            # when F0, F1, F2 or M0, M1, M2 are all 0 due to NAs heterozygosity
            #is NaN. these cases are removed
            zw$fhet <- zw$F1 / (zw$F0 + zw$F1 + zw$F2)
            zw$mhet <- zw$M1 / (zw$M0 + zw$M1 + zw$M2)
            zw <- zw[complete.cases(zw),]
        }
        
        if (sum(indexzw, na.rm = T) == 0) {
            zw <- data.frame()
        }
        
        if (verbose > 0) {
            cat("Number of females:", sum(sex == "F"), "\n")
            cat("Number of males:", sum(sex == "M"), "\n")
            cat("Sex ratio females:(males+females):",
                round(sum(sex == "F") / (
                    sum(sex == "F") + sum(sex == "M")
                ), 2),
                "\n")
        }
        if (nrow(zw) == 0 & verbose > 0) {
            cat(
                important(
                    "  No sex linked markers consistent with female 
                    heterogamety (ZZ/ZW)\n"
                )
            )
        } else {
            if (verbose > 0) {
                cat("\n  Sex linked loci consistent with female heterogamety 
                    (ZZ/ZW)\n\n")
                cat(
                    paste(
                        "    Threshold proportion for homozygotes in the
                        heterozygotic sex (ZW) is",
                        t.hom,
                        "\n"
                    )
                )
                cat(
                    paste(
                        "    Threshold proportion for heterozygotes in the 
                        homozygotic sex (ZZ) is",
                        t.het,
                        "\n\n"
                    )
                )
cat("- locnr is the location of the locus in the input genlight object\n")
cat("- F0 is the number of homozygous loci for the reference allele in 
    females\n")
cat("- F1 is the number of heterozygous loci in females\n")
cat("- F2 is the number of homozygous loci for the alternative allele in 
    females\n")
cat("- M0 is the number of homozygous loci for the reference allele in males\n")
cat("- M1 is the number of heterozygous loci in males\n")
cat("- M2 is the number of homozygous loci for the alternative allele in 
    males\n")
cat("- fhet is heterozygosity in females\n")
cat("- mhet is heterozygosity in males\n\n")
                print(zw)
                cat(
                    important(
                        "  \nNote: The most reliable putative markers will have
                        a read depth > 10.\n\n"
                    )
                )
            }
        }
        
        if (nrow(xy) == 0 & verbose > 0) {
            cat(
                important(
                    "  No sex linked markers consistent with male heterogamety 
                    (XX/XY)\n"
                )
            )
        } else {
            if (verbose > 0) {
                cat("\n  Sex linked loci consistent with male heterogamety 
                    (XX/XY)\n\n")
                cat(
                    paste(
                        "    Threshold proportion for homozygotes in the 
                        heterozygotic sex (XY) is",
                        t.hom,
                        "\n"
                    )
                )
                cat(
                    paste(
 "    Threshold proportion for heterozygotes in the homozygotic sex (XX) is",
                        t.het,
                        "\n\n"
                    )
                )
cat("- locnr is the location of the locus in the input genlight object\n")
cat("- F0 is the number of homozygous loci for the reference allele in 
    females\n")
cat("- F1 is the number of heterozygous loci in females\n")
cat("- F2 is the number of homozygous loci for the alternative allele in 
    females\n")
cat("- M0 is the number of homozygous loci for the reference allele in males\n")
cat("- M1 is the number of heterozygous loci in males\n")
cat("- M2 is the number of homozygous loci for the alternative allele in 
    males\n"
                )
cat("- fhet is heterozygosity in females\n")
cat("- mhet is heterozygosity in males\n\n")
                print(xy)
                cat(
                    important(
"  \nNote: The most reliable putative markers will have a read depth > 10.\n\n"
                    )
                )
            }
        }
        
        if (plot.out) {
            df$fhet <- dff$F1 / (dff$F0 + dff$F1 + dff$F2)
            df$mhet <- dfm$M1 / (dfm$M0 + dfm$M1 + dfm$M2)
            df$xy <- indexxy
            df$zw <- indexzw
            df$test <- df$xy + df$zw
            df_sex_linked <- df[which(df$test > 0),]
            df_no_sex_linked <- df[which(df$test == 0),]
            
            
            gg <-
                ggplot() + geom_rect(
                    aes(
                        xmin = 0,
                        xmax = t.hom,
                        ymin = 1 - t.het,
                        ymax = 1
                    ),
                    fill = three_colors[3],
                    alpha = 1 / 2,
                    color = "black"
                ) +
                geom_text(x = 0, y = 1.03, aes(label = "XX/XY")) + geom_rect(
                    aes(
                        xmin = 1,
                        xmax = 1 - t.het,
                        ymin = 0,
                        ymax = t.hom
                    ),
                    fill = three_colors[3],
                    alpha = 1 / 2,
                    color = "black"
                ) + geom_text(x = 1,
                              y = -0.02,
                              aes(label = "ZZ/ZW")) + geom_point(
                                  data = df_no_sex_linked,
                                  aes(x = fhet,
                                      y = mhet),
                                  alpha = 1 / 3,
                                  size = 2,
                                  color = three_colors[1]
                              ) + geom_point(
                                  data = df_sex_linked,
                                  aes(x = fhet, y = mhet),
                                  alpha = 1 / 3,
                                  size = 3,
                                  color = three_colors[2]
                              ) + xlab("Female Heterozygosity") + 
              ylab("Male Heterozygosity") + 
              xlim(0, 1) + 
              ylim(0, 1) +
                plot_theme
            
            suppressWarnings(print(gg))
        }
        
        l <- list(xxxy = xy,
                  zzzw = zw,
                  plot = gg)
        
    }
    
    ########### FOR SilicoDArT data
    
    if (datatype == "SilicoDArT") {
        matf <- as.matrix(x[sex == "F"])
        # For each individual
        f <- array(data = NA, dim = c(ncol(matf), 2))
        for (i in 1:ncol(matf)) {
            for (j in 1:2) {
                dummy <- sum(matf[, i] == (j - 1), na.rm = T)
                if (is.na(dummy))
                    dummy <- 0
                f[i, j] <- dummy
            }
        }
        dff <- data.frame(f)
        row.names(dff) <- locNames(x)
        colnames(dff) <- c("F0", "F1")
        
        # Extract the data for the males
        matm <- as.matrix(x[sex == "M"])
        # For each individual
        m <- array(data = NA, dim = c(ncol(matm), 2))
        for (i in 1:ncol(matm)) {
            for (j in 1:2) {
                dummy <- sum(matm[, i] == (j - 1), na.rm = T)
                if (is.na(dummy))
                    dummy <- 0
                m[i, j] <- dummy
            }
        }
        dfm <- data.frame(m)
        row.names(dfm) <- locNames(x)
        colnames(dfm) <- c("M0", "M1")
        
        # Combine the two files
        
        df <- cbind(dff, dfm)
        
        df$read.depth <- x@other$loc.metrics$AvgReadDepth
        
        # Check for hets in all males, homs in all females (XY); ditto for ZW
        sumf <- df$F0 + df$F1
        summ <- df$M0 + df$M1
        
        # Pull loci that are 100% present in females and 0% in males
        indexzw <-
            (df$F1 / (sumf) >= (1 - t.pres) &
                 df$M0 / (summ) >= (1 - t.pres))
        # when all loci are homozygous for the reference allele e.g.
        #df$M0/(summ), the division is NaN. So all the NAN's are set as TRUE
        indexzw[is.na(indexzw)] <- TRUE
        
        if (sum(indexzw, na.rm = T) > 0) {
            zw <- cbind(locnr = which(indexzw == TRUE), df[indexzw,])
        }
        
        if (sum(indexzw, na.rm = T) == 0) {
            zw <- data.frame()
        }
        
        # Pull loci that are 100% present in males and 0% in females
        indexxy <-
            (df$M1 / (summ) >= (1 - t.pres) &
                 df$F0 / (sumf) >= (1 - t.pres))  # when all loci are homozygous
        #for the reference allele e.g. df$F0/(sumf), the
        # division is NaN. So, all the NaN's are set as TRUE
        indexxy[is.na(indexxy)] <- TRUE
        
        if (sum(indexxy, na.rm = T) > 0) {
            xy <- cbind(locnr = which(indexxy == TRUE), df[indexxy,])
        }
        
        if (sum(indexxy, na.rm = T) == 0) {
            xy <- data.frame()
        }
        
        if (nrow(zw) == 0 & verbose > 0) {
            cat(
                important(
                    "  No sex linked markers consistent with female 
                    heterogamety (ZZ/ZW)\n"
                )
            )
        } else {
            if (verbose > 0) {
                cat("\n  Sex linked loci consistent with female heterogamety 
                    (ZZ/ZW)\n\n")
                cat(
                    paste(
                        "    Threshold proportion for presence/absence is",
                        t.pres,
                        "\n\n"
                    )
                )
cat("- locnr is the location of the locus in the input genlight object\n")
cat("- F0 is the number of loci absent in females\n")
cat("- F1 is the number of loci present in females\n")
cat("- M0 is the number of loci absent in males\n")
cat("- M1 is the number of loci present in males\n\n")
                print(zw)
                cat(
                    important(
                        "  \nNote: The most reliable putative markers will have
                        a read depth > 10.\n\n"
                    )
                )
            }
        }
        if (nrow(xy) == 0) {
            if (verbose > 0)
                cat(
                    important(
                        "  No sex linked markers consistent with male 
                        heterogamety (XX/XY)\n"
                    )
                )
        } else {
            if (verbose > 0) {
                cat("\n  Sex linked loci consistent with male heterogamety 
                    (XX/XY)\n\n")
                cat(
                    paste(
                        "    Threshold proportion for presence/absence is",
                        t.pres,
                        "\n\n"
                    )
                )
cat("- locnr is the location of the locus in the input genlight object\n")
cat("- F0 is the number of loci absent in females\n")
cat("- F1 is the number of loci present in females\n")
cat("- M0 is the number of loci absent in males\n")
cat("- M1 is the number of loci present in males\n\n")
                print(xy)
                cat(
                    important(
                        "  \nNote: The most reliable putative markers will have 
                        a read depth > 10.\n\n"
                    )
                )
            }
        }
        
        if (plot.out) {
            fhet <- mhet <- NULL
            df$fhet <- dff$F1 / (dff$F0 + dff$F1)
            df$mhet <- dfm$M1 / (dfm$M0 + dfm$M1)
            df$xy <- indexxy
            df$zw <- indexzw
            df$test <- df$xy + df$zw
            df_sex_linked <- df[which(df$test > 0),]
            df_no_sex_linked <- df[which(df$test == 0),]
            
            gg <-
                ggplot() + geom_rect(
                    aes(
                        xmin = 0,
                        xmax = t.pres,
                        ymin = 1 - t.pres,
                        ymax = 1
                    ),
                    fill = three_colors[3],
                    alpha = 1 / 2,
                    color = "black"
                ) +
                geom_text(x = 0, y = 1.03, aes(label = "XX/XY")) + geom_rect(
                    aes(
                        xmin = 1,
                        xmax = 1 - t.pres,
                        ymin = 0,
                        ymax = t.pres
                    ),
                    fill = three_colors[3],
                    alpha = 1 / 2,
                    color = "black"
                ) + geom_text(x = 1,
                              y = -0.02,
                              aes(label = "ZZ/ZW")) + geom_point(
                                  data = df_no_sex_linked,
                                  aes(x = fhet,
                                      y = mhet),
                                  alpha = 1 / 3,
                                  size = 2,
                                  color = three_colors[1]
                              ) + geom_point(
                                  data = df_sex_linked,
                                  aes(x = fhet, y = mhet),
                                  alpha = 1 / 3,
                                  size = 3,
                                  color = three_colors[2]
                              ) + xlab("% present in females") + 
              ylab("% present in males") + 
              xlim(0, 1) + 
              ylim(0, 1) + 
              plot_theme
            
            suppressWarnings(print(gg))
        }
        
        l <- list(xxxy = xy,
                  zzzw = zw,
                  plot = gg)
    }
    
    # FLAG SCRIPT END
    
    if (verbose >= 1) {
        cat(report("Completed:", funname, "\n"))
    }
    
    # RETURN
    
    invisible(l)
    
}

Try the dartR package in your browser

Any scripts or data that you put into this service are public.

dartR documentation built on June 8, 2023, 6:48 a.m.