#' @title iMKT using PopHuman data
#'
#' @description Perform any MKT method using a subset of PopHuman data defined by custom genes and populations lists
#'
#' @details Execute any MKT method (standardMKT, FWW, DGRP, asymptoticMKT, iMKT) using a subset of PopHuman data defined by custom genes and populations lists. It uses the dataframe PopHumanData, which can be already loaded in the workspace (using loadPopHuman()) or is directly loaded when executing this function. It also allows deciding whether to analyze genes groupped by recombination bins or not, using recombination rate values corresponding to the sex average estimates from Bhérer et al. 2017 Nature Commun.
#'
#' @param genes list of genes to analyze
#' @param pops list of populations to analyze
#' @param recomb group genes according to recombination values (TRUE/FALSE)
#' @param cutoffs list of cutofs to perform FWW and/or DGRP
#' @param bins number of recombination bins to compute (mandatory if recomb=TRUE)
#' @param test which test to perform. Options include: standardMKT (default), DGRP, FWW, asymptoticMKT, iMKT
#' @param xlow lower limit for asymptotic alpha fit (default=0)
#' @param xhigh higher limit for asymptotic alpha fit (default=1)
#' @param plot report plot (optional). Default is FALSE
#'
#' @return List of lists with the default test output for each selected population (and recombination bin when defined)
#'
#' @examples
#' ## List of genes
#' mygenes <- c("ENSG00000011021.21_3","ENSG00000091483.6_3","ENSG00000116191.17_3",
#' "ENSG00000116337.15_4","ENSG00000116584.17_3","ENSG00000116745.6_3",
#' "ENSG00000116852.14_3","ENSG00000116898.11_3","ENSG00000117010.15_3",
#' "ENSG00000117090.14_3","ENSG00000117222.13_3","ENSG00000117394.20_3")
#' ## Perform analyses
#' PopHumanAnalysis(genes=mygenes , pops=c("CEU","YRI"), recomb=FALSE, test="standardMKT")
#' PopHumanAnalysis(genes=mygenes , pops=c("CEU"), recomb=TRUE, bins=3, test="DGRP")
#'
#' @import utils
#' @import stats
#'
#' @keywords PopData
#' @export
PopHumanAnalysis <- function(genes=c("gene1","gene2","..."), pops=c("pop1","pop2","..."), cutoffs=c(0,0.05,0.1), recomb=TRUE/FALSE, bins=0, test=c("standardMKT","DGRP","FWW","asymptoticMKT","iMKT"), xlow=0, xhigh=1, plot=FALSE) {
## Get PopHuman data
if (exists("PopHumanData") == TRUE) {
data <- get("PopHumanData")
} else {
loadPopHuman()
data <- get("PopHumanData") }
## Check input variables
## Numer of arguments
if (nargs() < 3 && nargs()) {
stop("You must specify 3 arguments at least: genes, pops, recomb (T/F).\nIf test = asymptoticMKT or test = iMKT, you must specify xlow and xhigh values.") }
## Argument genes
if (length(genes) == 0 || genes == "" || !is.character(genes)) {
stop("You must specify at least one gene.") }
if (!all(genes %in% data$ID) == TRUE) {
difGenes <- setdiff(genes, data$ID)
difGenes <- paste(difGenes, collapse=", ")
stopMssg <- paste0("MKT data is not available for the requested gene(s).\nRemember to use gene IDs from Ensembl (ENSG...).\nThe genes that caused the error are: ", difGenes, ".")
stop(stopMssg) }
## Argument pops
if (length(pops) == 0 || pops == "" || !is.character(pops)) {
stop("You must specify at least one population.") }
if (!all(pops %in% data$Population) == TRUE) {
correctPops <- c("ACB","ASW","BEB","CDX","CEU","CHB","CHS","CLM","ESN","FIN","GBR","GIH","GWD","IBS","ITU","JPT","KHV","LWK","MSL","MXL","PEL","PJL","PUR","STU","TSI","YRI")
difPops <- setdiff(pops, correctPops)
difPops <- paste(difPops, collapse=", ")
stopMssg <- paste0("MKT data is not available for the sequested populations(s).\nSelect among the following populations:\nACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, YRI!.\nThe populations that caused the error are: ", difPops, ".")
stop(stopMssg) }
## Argument recomb
if (recomb != TRUE && recomb != FALSE) {
stop("Parameter recomb must be TRUE or FALSE.") }
## Argument bins
if (recomb == TRUE) {
if (!is.numeric(bins) || bins == 0 || bins == 1) {
stop("If recomb = TRUE, you must specify the number of bins to use (> 1).") }
if (bins > round(length(genes)/2)) {
stop("Parameter bins > (genes/2). At least 2 genes for each bin are required.") }
}
if (recomb == FALSE && bins != 0) {
warning("Parameter bins not used! (recomb=F selected)") }
## Argument test and xlow + xhigh (when necessary)
if(missing(test)) {
test <- "standardMKT"
}
else if (test != "standardMKT" && test != "DGRP" && test != "FWW" && test != "asymptoticMKT" && test != "iMKT") {
stop("Parameter test must be one of the following: standardMKT, DGRP, FWW, asymptoticMKT, iMKT")
}
if (length(test) > 1) {
stop("Select only one of the following tests to perform: standardMKT, DGRP, FWW, asymptoticMKT, iMKT") }
if ((test == "standardMKT" || test == "DGRP" || test == "FWW") && (xlow != 0 || xhigh != 1)) {
warningMssgTest <- paste0("Parameters xlow and xhigh not used! (test = ",test," selected)")
warning(warningMssgTest) }
## Arguments xlow, xhigh features (numeric, bounds...) checked in checkInput()
## Perform subset
subsetGenes <- data[(data$ID %in% genes & data$Population %in% pops), ]
subsetGenes$ID <- as.factor(subsetGenes$ID)
subsetGenes <- droplevels(subsetGenes)
## If recomb analysis is selected
if (recomb == TRUE) {
## Declare output list (each element 1 pop)
outputList <- list()
for (k in levels(subsetGenes$Population)) {
print(paste0("Population = ", k))
## Declare bins output list (each element 1 bin)
outputListBins <- list()
x <- subsetGenes[subsetGenes$Population == k, ]
x <- x[order(x$recomb), ]
## create bins
binsize <- round(nrow(x)/bins) ## Number of genes for each bin
count <- 1
x$Group <- ""
dat <- x[FALSE, ] ## Create df with colnames
for (i in 0:nrow(x)) {
if (i%%binsize == 0) { ## Only if reminder of division = 0 (equally sized bins)
i1 <- i + binsize
if (i == 0) {
g1 <- x[i:binsize,]
group <- count
g1$Group <- group
dat[i:binsize,] <- g1
count <- count+1 }
else if (i1 <= nrow(x)) {
ii <- i+1
g1 <- x[ii:i1,]
group <- count
g1$Group <- group
dat[ii:i1,] <- g1
count <- count+1 }
}
}
dat$Group <- as.factor(dat$Group)
## Iterate through each recomb bin
for (j in levels(dat$Group)) {
print(paste0("Recombination bin = ", j))
x1 <- dat[dat$Group == j, ]
## Recomb stats from bin j
numGenes <- nrow(x1)
minRecomb <- min(x1$recomb, na.rm=T)
medianRecomb <- median(x1$recomb, na.rm=T)
meanRecomb <- mean(x1$recomb, na.rm=T)
maxRecomb <- max(x1$recomb, na.rm=T)
recStats <- cbind(numGenes,minRecomb,medianRecomb,meanRecomb,maxRecomb)
recStats <- as.data.frame(recStats)
recStats <- list("Recombination bin Summary"=recStats)
## Set counters to 0
Pi <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
P0 <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
f <- seq(0.025,0.975,0.05)
mi <- 0; m0 <- 0
Di <- 0; D0 <- 0
## Group genes
x1 <- droplevels(x1)
for (l in levels(x1$ID)) {
x2 <- x1[x1$ID == l, ]
## DAF
x2$DAF0f <- as.character(x2$DAF0f); x2$DAF4f <- as.character(x2$DAF4f)
daf0f <- unlist(strsplit(x2$DAF0f, split=";"))
daf4f <- unlist(strsplit(x2$DAF4f, split=";"))
daf0f <- as.numeric(daf0f); daf4f <- as.numeric(daf4f)
Pi <- Pi + daf0f; P0 <- P0 + daf4f
## Divergence
mi <- mi + x2$mi; m0 <- m0 + x2$m0
Di <- Di + x2$di; D0 <- D0 + x2$d0
}
## Proper formats
daf <- cbind(f, Pi, P0); daf <- as.data.frame(daf)
names(daf) <- c("daf","Pi","P0")
div <- cbind(mi, Di, m0, D0); div <- as.data.frame(div)
names(div) <- c("mi","Di","m0","D0")
## Check data inside each test!
## Transform daf20 to daf10 (faster fitting) for asymptoticMKT and iMKT
if (nrow(daf) == 20) {
daf1 <- daf
daf1$daf10 <- sort(rep(seq(0.05,0.95,0.1),2)) ## Add column with the daf10 frequencies
daf1 <- daf1[c("daf10","Pi","P0")] ## Keep new frequencies, Pi and P0
daf1 <- aggregate(. ~ daf10, data=daf1, FUN=sum) ## Sum Pi and P0 two by two based on daf
colnames(daf1)<-c("daf","Pi","P0") ## Final daf columns name
}
## Perform test
if(test == "standardMKT") {
output <- standardMKT(daf, div)
output <- c(output, recStats) } ## Add recomb summary for bin j
else if(test == "DGRP" && plot == FALSE) {
output <- DGRP(daf, div, listCutoffs=cutoffs)
output <- c(output, recStats) }
else if(test == "DGRP" && plot == TRUE) {
output <- DGRP(daf, div, listCutoffs=cutoffs, plot=TRUE)
output <- c(output, recStats) }
else if(test == "FWW" && plot == FALSE) {
output <- FWW(daf, div, listCutoffs=cutoffs)
output <- c(output, recStats) }
else if(test == "FWW" && plot == TRUE) {
output <- FWW(daf, div, listCutoffs=cutoffs, plot=TRUE)
output <- c(output, recStats) }
else if(test == "asymptoticMKT") {
output <- asymptoticMKT(daf1, div, xlow, xhigh)
output <- c(output, recStats) }
else if(test == "iMKT" && plot == FALSE) {
output <- iMKT(daf1, div, xlow, xhigh)
output <- c(output, recStats) }
else if(test == "iMKT" && plot == TRUE) {
output <- iMKT(daf1, div, xlow, xhigh, plot=TRUE)
output <- c(output, recStats) }
## Fill list with each bin
outputListBins[[paste("Recombination bin = ",j)]] <- output
}
## Fill list with each pop
outputList[[paste("Population = ",k)]] <- outputListBins
}
## Warning if some genes are lost. Bins must be equally sized.
if (nrow(dat) != length(genes)) {
missingGenes <- round(length(genes) - nrow(dat))
genesNames <- as.vector(tail(x, missingGenes)$Name)
genesNames <- paste(genesNames, collapse=", ")
warningMssg <- paste0("The ",missingGenes," gene(s) with highest recombination rate estimates (", genesNames, ") was/were excluded from the analysis in order to get equally sized bins.\n")
warning(warningMssg) }
## Return output
cat("\n")
return(outputList)
}
## If NO recombination analysis selected
else if (recomb == FALSE) {
## Declare output list (each element 1 pop)
outputList <- list()
for (i in levels(subsetGenes$Population)) {
print(paste0("Population = ", i))
x <- subsetGenes[subsetGenes$Population == i, ]
## Set counters to 0
Pi <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
P0 <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
f <- seq(0.025,0.975,0.05)
mi <- 0; m0 <- 0
Di <- 0; D0 <- 0
## Group genes
for (j in levels(x$ID)) {
x1 <- x[x$ID == j, ]
## DAF
x1$DAF0f <- as.character(x1$DAF0f); x1$DAF4f <- as.character(x1$DAF4f)
daf0f <- unlist(strsplit(x1$DAF0f, split=";"))
daf4f <- unlist(strsplit(x1$DAF4f, split=";"))
daf0f <- as.numeric(daf0f); daf4f <- as.numeric(daf4f)
Pi <- Pi + daf0f; P0 <- P0 + daf4f
## Divergence
mi <- mi + x1$mi; m0 <- m0 + x1$m0
Di <- Di + x1$di; D0 <- D0 + x1$d0
}
## Proper formats
daf <- cbind(f, Pi, P0); daf <- as.data.frame(daf)
names(daf) <- c("daf","Pi","P0")
div <- cbind(mi, Di, m0, D0); div <- as.data.frame(div)
names(div) <- c("mi","Di","m0","D0")
## Check data inside each test!
## Transform daf20 to daf10 (faster fitting) for asymptoticMKT and iMKT
if (nrow(daf) == 20) {
daf1 <- daf
daf1$daf10 <- sort(rep(seq(0.05,0.95,0.1),2)) ## Add column with the daf10 frequencies
daf1 <- daf1[c("daf10","Pi","P0")] ## Keep new frequencies, Pi and P0
daf1 <- aggregate(. ~ daf10, data=daf1, FUN=sum) ## Sum Pi and P0 two by two based on daf
colnames(daf1)<-c("daf","Pi","P0") ## Final daf columns name
}
## Perform test
if(test == "standardMKT") {
output <- standardMKT(daf, div) }
else if(test == "DGRP" && plot == FALSE) {
output <- DGRP(daf, div, listCutoffs=cutoffs) }
else if(test == "DGRP" && plot == TRUE) {
output <- DGRP(daf, div, listCutoffs=cutoffs, plot=TRUE) }
else if(test == "FWW" && plot == FALSE) {
output <- FWW(daf, div, listCutoffs=cutoffs) }
else if(test == "FWW" && plot == TRUE) {
output <- FWW(daf, div, listCutoffs=cutoffs, plot=TRUE) }
else if(test == "asymptoticMKT") {
output <- asymptoticMKT(daf1, div, xlow, xhigh) }
else if(test == "iMKT" && plot == FALSE) {
output <- iMKT(daf1, div, xlow, xhigh) }
else if(test == "iMKT" && plot == TRUE) {
output <- iMKT(daf1, div, xlow, xhigh, plot=TRUE) }
## Fill list with each pop
outputList[[paste("Population = ",i)]] <- output
}
## Return output
cat("\n")
return(outputList)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.