Nothing
#'@name gl.infer.sex
#'@title Uses sex-linked loci to infer sex of individuals
#'@description
#' This function uses the output of function gl.keep.sexlinked (list of 5
#' elements) to infer the sex of all individuals in the dataset.
#' It uses 3 types of sex-linked loci (W-/Y-linked, Z-/X-linked, and
#' gametologs), assigns a preliminary genetic sex for each type of sex-linked
#' loci available, and outputs an agreed sex.
#'
#' This function produces as output a dataframe with individuals in rows and 11
#' columns.
#'
#' @param gl_sexlinked The output of function gl.keep.sexlinked (complete
#' list with 5 elements). See explanation in "Details" section [required].
#' @param system String that declares the sex-determination system of the
#' species: 'zw' or 'xy' [required].
#' @param seed User-defined integer for repeatability purposes. If not provided
#' by user, it is chosen randomly by default. See "Details" section.
#'
#' @details
#' Parameter \code{gl_sexlinked} must be the name of the output object (a
#' list of 5 elements) produced by function \code{gl.keep.sexlinked}. Parameter
#' \code{seed} must be an integer that will be used on the KMeans algorithm used
#' by the function. We highly recommend choosing the seed to guarantee
#' repeatability.
#'
#' Note that this function was created with the explicit intent that a human
#' checks the evidence for the sex assignments that do NOT agree for all
#' types of sex-linked loci (called "indefinite sex assignments" and denoted
#' as "*M" or "*F" in the last column of dataframe output). This human can then
#' use their criterion to validate these assignments.
#'
#'
#'\strong{ Function's output }
#'
#' This function creates a dataframe with one row per individual and 11 columns: \itemize{
#' \item {id > Individuals' ID.}
#'
#' \item {w.linked.sex or y.linked.sex > Sex inferred using w-linked or y-linked
#' loci.}
#' \item {#called > Number of W-linked or Y-linked loci for which the individual
#' had a called genotype (cf. missing genotype).}
#' \item {#missing > Number of W-linked or Y-linked loci for which the
#' individual had a missing genotype (cf. called genotype).}
#'
#' \item {z.linked.sex or x.linked.sex > Sex inferred using z-linked or x-linked
#' loci.}
#' \item {#Hom.z or #Hom.x > Number of z-linked or x-linked loci for which the
#' individual is homozygous.}
#' \item {#Het.z or #Het.x > Number of z-linked or x-linked loci for which the
#' individual is heterozygous.}
#'
#' \item {gametolog.sex > Sex inferred using gametologs.}
#' \item {#Hom.g > Number of gametologous loci for which the individual is
#' homozygous.}
#' \item {#Het.g > Number of gametologous loci for which the individual is
#' heterozygous.}
#'
#' \item {agreed.sex > Agreed sex: 'F' or 'M' if all preliminary sex-assignments
#' match (i.e., definite sex assignment), and '*F' or '*M' if NOT all
#' preliminary sex-assignments match (i.e., indefinite sex assignment).}
#' }
#'
#' @return A dataframe.
#' @author Custodian: Diana Robledo-Ruiz -- Post to
#' \url{https://groups.google.com/d/forum/dartr}
#'
#' @examples
#' LBP_sexLinked <- gl.keep.sexlinked(x = LBP, system = "xy", plot.display = TRUE, ncores = 1)
#' inferred.sexes <- gl.infer.sex(gl_sexlinked = LBP_sexLinked, system = "xy", seed = 100)
#' inferred.sexes
#'
#' @references
#' \itemize{
#' \item Robledo‐Ruiz, D. A., Austin, L., Amos, J. N., Castrejón‐Figueroa, J.,
#' Harley, D. K., Magrath, M. J., Sunnucks, P., & Pavlova, A. (2023).
#' Easy‐to‐use R functions to separate reduced‐representation genomic datasets
#' into sex‐linked and autosomal loci, and conduct sex assignment. Molecular
#' Ecology Resources, 00, 1-21.
#' }
#'
#' @importFrom stats kmeans
#' @importFrom stats na.omit
#'
#' @export
gl.infer.sex <- function(gl_sexlinked,
system = NULL,
seed = NULL) {
# Parameters check
if(is.null(system)){
stop("You must specify the sex-determination system with the parameter 'system' ('zw' or 'xy').")
} else {
if(!(system == 'zw' | system == 'xy')){
stop("Parameter 'system' must be 'zw' or 'xy'.")
}
}
# Random seed if not specified by user
if(is.null(seed)) {
seed <- sample.int(65535, 1)
}
if(system == "xy") {
gl1 <- gl_sexlinked$y.linked
gl2 <- gl_sexlinked$x.linked
}
if(system == "zw") {
gl1 <- gl_sexlinked$w.linked
gl2 <- gl_sexlinked$z.linked
}
# Gametologs
gl3 <- gl_sexlinked$gametolog
table <- gl_sexlinked$results.table # Retrieve table
if(system == "zw") {
all <- table[table$zw.gametolog == TRUE, ] # Gametologs
} else {
all <- table[table$xy.gametolog == TRUE, ] # Gametologs
}
all <- all[order(all$stat.p.adjusted), ] # Order from smallest p-value
useful <- row.names(all[1:5, ]) # Keep name of only top 5 gametologs
# Make sex assignment per type of sex-linked loci (Functions declared below)
# W/Y-linked
if (gl1@n.loc >= 1){
w <- W.sex(gl1, system = system)
} else {
message("Not enough W-linked/Y-linked loci (need at least 1). Assigning NA...")
w <- data.frame(W.sex = rep(NA, length(gl1@ind.names)),
n0.w = rep(NA, length(gl1@ind.names)),
n1.w = rep(NA, length(gl1@ind.names)))
}
# Z/X-linked
if (gl2@n.loc >= 2){
z <- Z.sex(gl2, system = system, seed = seed)
} else {
message("Not enough Z-linked/X-linked loci (need at least 2). Assigning NA...")
z <- data.frame(Z.sex = rep(NA, length(gl2@ind.names)),
n1.z = rep(NA, length(gl2@ind.names)),
n0.z = rep(NA, length(gl2@ind.names)))
}
# Gametologs
if (gl3@n.loc >= 5){
g <- g.sex(gl3, system = system, seed = seed, useful = useful)
} else {
message("Not enough gametologs (need at least 5). Assigning NA...")
g <- data.frame(g.sex = rep(NA, length(gl3@ind.names)),
n1.g = rep(NA, length(gl3@ind.names)),
n0.g = rep(NA, length(gl3@ind.names)))
}
# Put them all together
A <- data.frame(w, z, g)
# Function to conciliate assignments
Fun <- function(x, y, z){
d <- c(x, y, z)
yy <- data.frame(x = c("F", "M"),
freq = c(sum(d == "F", na.rm = TRUE),
sum(d == "M", na.rm = TRUE)))
yy <- yy[order(-yy$freq), ]
value <- if(length(unique(na.omit(d))) == 1) unique(na.omit(d)) else sprintf('*%s', yy[1, 1])
return(value)
}
# Add last column of agreed sexes
A$agreed.sex <- mapply(Fun, A$W.sex, A$Z.sex, A$g.sex)
if(system == 'xy'){
names <- c('y.linked.sex', '#called', '#missing',
'x.linked.sex', '#Het.x', '#Hom.x',
'gametolog.sex', '#Het.g', '#Hom.g', 'agreed.sex')
} else {
names <- c('w.linked.sex', '#called', '#missing',
'z.linked.sex', '#Het.z', '#Hom.z',
'gametolog.sex', '#Het.g', '#Hom.g', 'agreed.sex')
}
colnames(A) <- names
A <- cbind(row.names(A), A)
colnames(A)[1] <- "id"
message("***FINISHED***")
return(A)
}
############################### 1. W.sex function
### Map NAs (missing) and scored (called) to 1Dim in [-1,1], IF x<0, F else M
W.sex <- function(gl, system = NULL){
w <- as.matrix(gl)
w[is.na(w)] <- 3
n0.w <- rowSums(w == 0 | w == 2 | w == 1, na.rm = TRUE)
n1.w <- rowSums(w == 3, na.rm = TRUE)
# Calculate proportion
sex.score <- function(f, m){
return( (-f+m)/(f+m) )
}
c2 <- sex.score(n0.w, n1.w)
if(system == 'xy'){
lab0 <- 'M'
lab1 <-'F'
} else {
lab0 <- 'F'
lab1 <- 'M'
}
W.sex <- ifelse(c2 < 0, lab0, lab1)
Y <- data.frame(W.sex, n0.w, n1.w)
return(Y)
}
############################### 2. Z.sex function
### Map Hom and Het to 2Dim and apply kmeans. Choose the label from maximum Hom
Z.sex <- function(gl, system = NULL, seed = 42){
z <- as.matrix(gl)
n0.z = rowSums(z == 0 | z == 2, na.rm = TRUE)
n1.z = rowSums(z == 1, na.rm = TRUE)
Z_unclean <- t(apply(data.frame(n0.z, n1.z), 1, function(x) x / sum(x) ))
Z <- na.omit(Z_unclean)
Zna <- Z_unclean[rowSums(is.na(Z_unclean)) > 0,]
# Apply k-means
set.seed(seed)
km <- kmeans(Z, 2)
i <- names(which.max(n1.z)) # Largest proportion of '1'
label <- km$cluster[i]
if(system == 'xy'){
lab0 <- 'M'
lab1 <- 'F'
} else {
lab0 <- 'F'
lab1 <- 'M'
}
# Assign sex
Z.sex <- ifelse( km$cluster == label, lab1, lab0)
# Assign NA to inds with NA
if (!is.null(nrow(Zna)) && nrow(Zna) > 0 ){
for (i in 1:nrow(Zna)) {
Z.sex[length(Z.sex)+1] <- NA
names(Z.sex)[length(names(Z.sex))] <- rownames(Zna)[i]
}
}
# Fuse them
Y <- data.frame(n1.z, n0.z)
# Add NAs in appropriate row
Y$Z.sex <- "STOP"
for (i in 1:nrow(Y)){
Y[i, "Z.sex"] <- Z.sex[rownames(Y)[i]]
}
Y <- Y[, c("Z.sex", "n1.z", "n0.z")]
return(Y)
}
############################### 3. ZWg.sex function
### Map Hom and Het to 2Dim and apply kmeans. Choose the label from maximum Het
g.sex <- function(gl, system = NULL, seed = 42, useful = useful) {
z <- as.matrix(gl)
z <- z[, useful]
n0.g = rowSums(z == 0 | z == 2, na.rm = TRUE)
n1.g = rowSums(z == 1, na.rm = TRUE)
Z_unclean <- t(apply(data.frame(n0.g, n1.g), 1, function(x) x / sum(x) ))
Z <- na.omit(Z_unclean)
Zna <- Z_unclean[rowSums(is.na(Z_unclean)) > 0,]
# Apply k-means
set.seed(seed)
km <- kmeans(Z, 2)
i <- names(which.max(n1.g)) # Largest proportion of '1'
label <- km$cluster[i]
if(system == 'xy'){
lab0 <- 'M'
lab1 <- 'F'
} else {
lab0 <- 'F'
lab1 <- 'M'
}
# Assign sex (HERE IS THE OPPOSITE)
Z.sex <- ifelse( km$cluster == label, lab0, lab1)
# Assign NA to inds with NA
if (!is.null(nrow(Zna)) && nrow(Zna) > 0 ){
for (i in 1:nrow(Zna)) {
Z.sex[length(Z.sex)+1] <- NA
names(Z.sex)[length(names(Z.sex))] <- rownames(Zna)[i]
}
}
# Fuse them
Y <- data.frame(n1.g, n0.g)
# Add NAs in appropriate row
Y$g.sex <- "STOP"
for (i in 1:nrow(Y)){
Y[i, "g.sex"] <- Z.sex[rownames(Y)[i]]
}
Y <- Y[, c("g.sex", "n1.g", "n0.g")]
return(Y)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.