Nothing
#' Simulate a single population
#'
#' Simulates the evolution of biological sequences for a single population with
#' variable theta values.
#'
#' @param nDip an integer representing the total number of diploid individuals
#' to simulate. Note that [scrm::scrm()] actually simulates haplotypes, so the
#' number of simulated haplotypes is double of this.
#' @param nloci is an integer that represents how many independent loci should
#' be simulated.
#' @param theta a value for the mutation rate assuming theta = 4Nu, where u is
#' the neutral mutation rate per locus.
#'
#' @return a list with genotypes. Each entry of the list corresponds to a
#' different locus. For each locus, the genotypes are in a matrix, with each
#' row representing a different individual and each column a different site.
#'
#' @examples
#' run_scrm(nDip = 100, nloci = 10)
#' run_scrm(nDip = 100, nloci = 10, theta = 5)
#'
#' @export
run_scrm <- function(nDip, nloci, theta = 10) {
# run scrm to obtain haplotypes
haplo <- scrm::scrm(paste(nDip*2, nloci, "-t", theta))
# combine those haplotypes into genotypes
genotypes <- GetGenotypes(haplotypes = haplo$seg_sites, nDip = nDip)
# output the genotypes
genotypes
}
#' Compute allele frequencies from genotypes
#'
#' Computes alternative allele frequencies from genotypes by dividing the total
#' number of alternative alleles by the total number of gene copies.
#'
#' @param nDip an integer representing the total number of diploid individuals
#' to simulate. Note that [scrm::scrm()] actually simulates haplotypes, so the
#' number of simulated haplotypes is double of this.
#' @param genotypes a list of simulated genotypes, where each entry is a matrix
#' corresponding to a different locus. At each matrix, each column is a
#' different SNP and each row is a different individual.
#'
#' @return a list of allele frequencies. Each entry of the list corresponds to a
#' different locus.
#'
#' @examples
#' genotypes <- run_scrm(nDip = 10, nloci = 10)
#' Ifreqs(nDip = 10, genotypes)
#'
#' @export
Ifreqs <- function(nDip, genotypes) {
# perform a sum over all columns to obtain the number of alternative alleles per site
alternative <- lapply(genotypes, colSums)
# compute allele frequencies by dividing the number of alternative alleles by the total number of alleles
ifreqs <- lapply(alternative, function(l) l/(nDip*2))
# output the allele frequencies computed directly from genotypes
ifreqs
}
#' Apply a minor allele reads threshold
#'
#' Removes sites where the total number of minor-allele reads is below a certain
#' threshold.
#'
#' If a site has less minor-allele reads than \code{min.minor} across all
#' populations, that site is removed from the data.
#'
#' @param freqs a vector of allele frequencies where each entry corresponds to a
#' different site.
#' @param alternative a matrix with the number of reads with the alternative
#' allele. Each row should be a different population and each column a
#' different site.
#' @param coverage a matrix with the total coverage. Each row should be a
#' different population and each column a different site.
#' @param minor a matrix with the number of minor-allele reads. Each row should
#' be a different population and each column a different site.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#'
#' @return a list with three named entries:
#'
#' \item{freqs}{is a vector with the allele frequencies minus the frequency of
#' the removed sites.}
#'
#' \item{alternative}{is a matrix with the number of alternative-allele reads
#' per site, minus any removed sites.}
#'
#' \item{coverage}{is a matrix with the depth of coverage minus the coverage
#' of the removed sites.}
#'
#' @examples
#' # create a vector of allele frequencies
#' freqs <- runif(20)
#'
#' set.seed(10)
#' # create a matrix with the number of reads with the alternative allele
#' alternative <- matrix(sample(x = c(0,5,10), size = 20, replace = TRUE), nrow = 1)
#' # create a matrix with the depth of coverage
#' coverage <- matrix(sample(100:150, size = 20), nrow = 1)
#' # the number of reads with the reference allele is obtained by subtracting
#' # the number of alternative allele reads from the depth of coverage
#' reference <- coverage - alternative
#'
#' # find the minor allele at each site
#' minor <- findMinor(reference = reference, alternative = alternative, coverage = coverage)
#' # keep only the matrix with the minor allele reads
#' minor <- minor[["minor"]]
#'
#' # remove sites where the number of minor-allele reads is below the threshold
#' removeSites(freqs = freqs, alternative = alternative, coverage = coverage,
#' minor = minor, min.minor = 2)
#'
#' @export
removeSites <- function(freqs, alternative, coverage, minor, min.minor) {
# check if the input is correct - freqs should always be supplied as a vector
if(!inherits(freqs, "numeric"))
stop(paste("freqs should be supplied on a numeric vector, with each entry corresponding to a site. Please check"))
# get the total number of minor allele reads in the data
tminor <- colSums(minor)
# find out in which columns the total sum of the reads with the minor allele is below the threshold
toremove <- tminor < min.minor
# if there are sites where the sum of the reads with the minor allele is below the threshold
if(sum(toremove) != 0) {
# remove those columns from the matrix containing the depth of coverage
coverage <- coverage[, !toremove, drop = FALSE]
# remove those columns from the matrix containing the number of reads with the alternative allele
alternative <- alternative[, !toremove, drop = FALSE]
# remove those entries from the vector containing the allele frequencies computed directly from genotypes
freqs <- freqs[!toremove, drop = FALSE]
}
# create the output containing the frequencies computed from the genotypes
# and the number of Pool-seq alternative allele reads and total coverage
out <- list(freqs = freqs, alternative = alternative, coverage = coverage)
# output the results of the function
out
}
#' Compute allele frequencies from pooled sequencing data
#'
#' Computes the frequency of the alternative allele in Pool-seq data and removes
#' any site with too few minor-allele reads from both the pool frequencies and
#' the frequencies computed directly from genotypes.
#'
#' The frequency at a given SNP is calculated according to: `pi = c/r`, where c
#' = number of alternative allele reads and r = total number of observed reads.
#' Additionally, if a site has less minor-allele reads than \code{min.minor}
#' across all populations, that site is removed from the data.
#'
#' @param reference a matrix with the number of reference allele reads. Each row
#' should be a different population and each column a different site.
#' @param alternative a matrix with the number of alternative allele reads. Each
#' row should be a different population and each column a different site.
#' @param coverage a matrix with the total coverage. Each row should be a
#' different population and each column a different site.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#' @param ifreqs a vector of allele frequencies computed directly from the
#' genotypes where each entry corresponds to a different site.
#'
#'
#' @return a list with two entries. The \code{ifreqs} entry contains the allele
#' frequencies computed directly from genotypes and \code{pfreqs} the allele
#' frequencies computed from pooled sequencing data.
#'
#' @examples
#' set.seed(10)
#' # create a vector of allele frequencies
#' freqs <- runif(20)
#' set.seed(10)
#' # create a matrix with the number of reads with the alternative allele
#' alternative <- matrix(sample(x = c(0,5,10), size = 20, replace = TRUE), nrow = 1)
#' # create a matrix with the depth of coverage
#' coverage <- matrix(sample(100:150, size = 20), nrow = 1)
#' # the number of reads with the reference allele is obtained by subtracting
#' # the number of alternative allele reads from the depth of coverage
#' reference <- coverage - alternative
#' # compute allele frequencies from pooled sequencing data
#' Pfreqs(reference = reference, alternative = alternative, coverage = coverage,
#' min.minor = 2, ifreqs = freqs)
#'
#' @export
Pfreqs <- function(reference, alternative, coverage, min.minor, ifreqs) {
# if the minimum number of minor allele reads is not set to zero
if(min.minor != 0) {
# check which of the two simulated alleles (reference or alternative) corresponds to the minor allele
minor <- findMinor(reference = reference, alternative = alternative, coverage = coverage)
# keep only the matrix with the number of minor allele reads
minor <- minor[["minor"]]
# use the removeSites function to remove sites with less than `min.minor` minor allele reads
temp <- removeSites(freqs = ifreqs, alternative = alternative, coverage = coverage,
minor = minor, min.minor = min.minor)
# get the number of reads with the alternative allele
alternative <- temp[["alternative"]]
# and the total coverage
coverage <- temp[["coverage"]]
# get the frequencies computed from genotypes
ifreqs <- temp[["freqs"]]
}
# compute the allele frequencies for Pool-seq data
freqs <- alternative/coverage
# create the output in this instance - with the allele frequencies computed from genotypes and from Pool-seq data
freqs <- list(ifreqs = ifreqs, pfreqs = freqs)
# output the allele frequencies
freqs
}
#' Average absolute difference between allele frequencies
#'
#' Calculates the average absolute difference between the allele frequencies
#' computed directly from genotypes and from pooled sequencing data.
#'
#' Different combinations of parameters can be tested to check the effect of the
#' various parameters. The average absolute difference is computed with the
#' \link[Metrics]{mae} function, assuming the frequencies computed directly from
#' the genotypes as the \code{actual} input argument and the frequencies from
#' pooled data as the \code{predicted} input argument.
#'
#' @param nDip an integer representing the total number of diploid individuals
#' to simulate. Note that [scrm::scrm()] actually simulates haplotypes, so the
#' number of simulated haplotypes is double of this.
#' @param nloci is an integer that represents how many independent loci should
#' be simulated.
#' @param pools a list with a vector containing the size (in number of diploid
#' individuals) of each pool. Thus, if a population was sequenced using a
#' single pool, the vector should contain only one entry. If a population was
#' sequenced using two pools, each with 10 individuals, this vector should
#' contain two entries and both will be 10.
#' @param pError an integer representing the value of the error associated with
#' DNA pooling. This value is related with the unequal contribution of both
#' individuals and pools towards the total number of reads observed for a
#' given population - the higher the value the more unequal are the individual
#' and pool contributions.
#' @param sError a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#' @param mCov an integer that defines the mean depth of coverage to simulate.
#' Please note that this represents the mean coverage across all sites.
#' @param vCov an integer that defines the variance of the depth of coverage
#' across all sites.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#' @param minimum an optional integer representing the minimum coverage allowed.
#' Sites where the population has a depth of coverage below this threshold are
#' removed from the data.
#' @param maximum an optional integer representing the maximum coverage allowed.
#' Sites where the population has a depth of coverage above this threshold are
#' removed from the data.
#' @param theta a value for the mutation rate assuming theta = 4Nu, where u is
#' the neutral mutation rate per locus.
#'
#' @return a data.frame with columns detailing the number of diploid
#' individuals, the pool error, the number of pools, the number of individuals
#' per pool, the mean coverage, the variance of the coverage and the average
#' absolute difference between the frequencies computed from genotypes and
#' from pooled data.
#'
#' @examples
#' # single population sequenced with a single pool of 100 individuals
#' maePool(nDip = 100, nloci = 10, pools = list(100), pError = 100, sError = 0.01,
#' mCov = 100, vCov = 250, min.minor = 2)
#'
#' # single population sequenced with two pools, each with 50 individuals
#' maePool(nDip = 100, nloci = 10, pools = list(c(50, 50)), pError = 100, sError = 0.01,
#' mCov = 100, vCov = 250, min.minor = 2)
#'
#' # single population sequenced with two pools, each with 50 individuals
#' # removing sites with coverage below 10x or above 180x
#' maePool(nDip = 100, nloci = 10, pools = list(c(50, 50)), pError = 100, sError = 0.01,
#' mCov = 100, vCov = 250, min.minor = 2, minimum = 10, maximum = 180)
#'
#' @export
maePool <- function(nDip, nloci, pools, pError, sError, mCov, vCov, min.minor, minimum = NA, maximum = NA, theta = 10) {
# run SCRM and obtain genotypes for a single population
genotypes <- run_scrm(nDip = nDip, nloci = nloci, theta = theta)
# simulate number of reads
reads <- simulateCoverage(mean = mCov, variance = vCov, genotypes = genotypes)
# if the minimum coverage is defined
if(!is.na(minimum)) {
# check if the maximum coverage is also defined
if(is.na(maximum))
stop("please define the maximum coverage")
# remove sites with a depth of coverage above or below the defined threshold
reads <- remove_by_reads(nLoci = nloci, reads, minimum = minimum, maximum = maximum, genotypes = genotypes)
# get the genotypes - without sites simulated with a coverage below or above the threshold
genotypes <- lapply(reads, function(locus) locus[[2]])
# check the dimensions of the matrices with the genotypes
dimensions <- matrix(unlist(lapply(genotypes, dim)), ncol = 2, byrow = TRUE)
# we only wish to keep the locus where we have at least one polymorphic site
tokeep <- dimensions[, 2] != 0
# remove all loci without polymorphic sites
genotypes <- genotypes[tokeep]
# get the reads - without sites simulated with a coverage below or above the threshold
reads <- lapply(reads, function(locus) locus[[1]])
# use the same index to remove entries of the reads list that correspond to locus without polymorphic sites
reads <- reads[tokeep]
# ensure that each entry is a matrix
reads <- lapply(reads, function(locus) matrix(locus, nrow = 1))
}
# compute allele frequencies directly from the genotypes
ifreqs <- Ifreqs(nDip = nDip, genotypes = genotypes)
# simulate individual contribution to the total number of reads
indContribution <- lapply(1:nloci, function(locus)
popsReads(list_np = pools, coverage = reads[[locus]], pError = pError))
# simulate the number of reference reads
reference <- lapply(1:nloci, function(locus)
numberReferencePop(genotypes = genotypes[[locus]], indContribution = indContribution[[locus]],
size = pools, error = sError))
# simulate pooled sequencing data
pool <- poolPops(nPops = 1, nLoci = nloci, indContribution = indContribution, readsReference = reference)
# compute the allele frequencies obtained with pooled sequencing
pfreqs <- lapply(1:nloci, function(locus)
Pfreqs(reference = pool[["reference"]][[locus]], alternative = pool[["alternative"]][[locus]],
coverage = pool[["total"]][[locus]], min.minor = min.minor, ifreqs = ifreqs[[locus]]))
# get the allele frequencies computed directly from genotypes after removing sites that did not pass the threshold
ifreqs <- lapply(pfreqs, `[[`, 1)
# get the allele frequencies computed from Pool-seq after removing sites that did not pass the threshold
pfreqs <- lapply(pfreqs, `[[`, 2)
# compute the mean absolute error for each locus
abs_error <- lapply(1:nloci, function(locus) Metrics::mae(actual = ifreqs[[locus]], predicted = pfreqs[[locus]]))
# replace any NaN values with the mean of the remaining values
abs_error[is.na(abs_error)] <- mean(unlist(abs_error), na.rm = TRUE)
# get the number of pools used to sequence a population
nPools <- length(pools[[1]])
# get the number of individuals per pool
indsPool <- unique(pools[[1]])
# create a dataframe with the values for this particular combination of parameters
out <- data.frame(nDip = nDip, PoolError = pError, nPools = nPools, indsPool = indsPool, mean = mCov, var = vCov,
absError = unlist(abs_error))
# output the results of the function
out
}
#' Average absolute difference between allele frequencies computed from
#' genotypes and from Pool-seq data
#'
#' Calculates the average absolute difference between the allele frequencies
#' computed directly from genotypes and from pooled sequencing data.
#'
#' The average absolute difference is computed with the \link[Metrics]{mae}
#' function, assuming the frequencies computed directly from the genotypes as
#' the \code{actual} input argument and the frequencies from pooled data as the
#' \code{predicted} input argument.
#'
#' Note that this functions allows for different combinations of parameters.
#' Thus, the effect of different combinations of parameters on the average
#' absolute difference can be tested. For instance, it is possible to check what
#' is the effect of different coverages by including more than one value in the
#' \code{mCov} input argument. This function will run and compute the average
#' absolute difference for all combinations of the \code{nDip}, \code{pError}
#' and \code{mCov} input arguments. This function assumes that a single pool of
#' size \code{nDip} was used to sequence the population.
#'
#' @param nDip is an integer or a vector representing the total number of
#' diploid individuals to simulate. Note that [scrm::scrm()] actually
#' simulates haplotypes, so the number of simulated haplotypes is double of
#' this. If it is a vector, then each vector entry will be simulated
#' independently. For instance, if \code{nDip = c(100, 200)}, simulations will
#' be carried out for samples of 100 and 200 individuals.
#' @param nloci is an integer that represents how many independent loci should
#' be simulated.
#' @param pError an integer or a vector representing the value of the error
#' associated with DNA pooling. This value is related with the unequal
#' contribution of both individuals and pools towards the total number of
#' reads observed for a given population - the higher the value the more
#' unequal are the individual and pool contributions. If it is a vector, then
#' each vector entry will be simulated independently.
#' @param sError a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#' @param mCov an integer or a vector that defines the mean depth of coverage to
#' simulate. Please note that this represents the mean coverage across all
#' sites. If it is a vector, then each vector entry will be simulated
#' independently.
#' @param vCov an integer or a vector that defines the variance of the depth of
#' coverage across all sites. If the \code{mCov} is a vector, then \code{vCov}
#' should also be a vector, with each entry corresponding to the variance of
#' the respective entry in the \code{mCov} vector. Thus, the first entry of
#' the \code{vCov} vector will be the variance associated with the first entry
#' of the \code{mCov} vector.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#' @param minimum an optional integer representing the minimum coverage allowed.
#' Sites where the population has a depth of coverage below this threshold are
#' removed from the data.
#' @param maximum an optional integer representing the maximum coverage allowed.
#' Sites where the population has a depth of coverage above this threshold are
#' removed from the data.
#' @param theta a value for the mutation rate assuming theta = 4Nu, where u is
#' the neutral mutation rate per locus.
#'
#' @return a data.frame with columns detailing the number of diploid
#' individuals, the pool error, the number of pools, the number of individuals
#' per pool, the mean coverage, the variance of the coverage and the average
#' absolute difference between the frequencies computed from genotypes and
#' from pooled data.
#'
#' @examples
#' # a simple test with a simple combination of parameters
#' maeFreqs(nDip = 100, nloci = 10, pError = 100, sError = 0.01, mCov = 100, vCov = 200, min.minor = 1)
#'
#' # effect of two different pool error values in conjugation with a fixed coverage and pool size
#' maeFreqs(nDip = 100, nloci = 10, pError = c(100, 200), sError = 0.01,
#' mCov = 100, vCov = 200, min.minor = 1)
#'
#' # effect of two different pool error values in conjugation with a fixed pool size
#' # and two different coverages
#' maeFreqs(nDip = 100, nloci = 10, pError = c(100, 200), sError = 0.01,
#' mCov = c(100, 200), vCov = c(200, 500), min.minor = 1)
#'
#' @export
maeFreqs <- function(nDip, nloci, pError, sError, mCov, vCov, min.minor, minimum = NA, maximum = NA, theta = 10) {
# create a matrix to save the values of the mean absolute error for the various conditions
final <- matrix(data = NA, nrow = 1, ncol = 7)
# add names to the columns of the matrix
colnames(final) <- c("nDip", "PoolError", "nPools", "indsPool", "mean", "var", "absError")
# create a matrix containing all combinations of factor variables
# each row will contain one combination of number of diploids, pool error and mean depth of coverage
combinations <- expand.grid(nDip, pError, mCov, stringsAsFactors = FALSE)
# do a loop over all the possible combinations
for (i in 1:nrow(combinations)) {
# get the number of diploid individuals for this combination
dip <- combinations[i, 1]
# get the pooling error for this combination
pError <- combinations[i, 2]
# get the mean depth of coverage for this combination
meanCov <- combinations[i, 3]
# get the variance of the depth of coverage associated with that particular mean coverage
varCov <- vCov[which(mCov == combinations[i, 3])]
# compute the average absolute difference between the allele frequencies from genotypes and from Pool-seq data
temp <- maePool(nDip = dip, nloci = nloci, pError = pError, pools = list(dip), sError = sError, mCov = meanCov,
vCov = varCov, min.minor = min.minor, minimum = minimum, maximum = maximum, theta = theta)
# add those values to the dataframe containing all the results
final <- rbind(final, temp)
}
# remove the first row of the final matrix - this row contains NAs
final <- final[-1 ,]
# output the final dataframe with the MAE values for the different parameter combinations
final
}
#' Average absolute difference between allele frequencies computed from
#' genotypes supplied by the user and from Pool-seq data
#'
#' Calculates the average absolute difference between the allele frequencies
#' computed directly from genotypes and from pooled sequencing data. The
#' genotypes used should be supplied by the user and can be simulated using
#' different software and under the demographic model of choice.
#'
#' The average absolute difference is computed with the \link[Metrics]{mae}
#' function, assuming the frequencies computed directly from the genotypes as
#' the \code{actual} input argument and the frequencies from pooled data as the
#' \code{predicted} input argument.
#'
#' Note that this functions allows for different combinations of parameters.
#' Thus, the effect of different combinations of parameters on the average
#' absolute difference can be tested. For instance, it is possible to check what
#' is the effect of different coverages by including more than one value in the
#' \code{mCov} input argument. This function will run and compute the average
#' absolute difference for all combinations of the \code{pools}, \code{pError}
#' and \code{mCov} input arguments.
#'
#' @param genotypes a list of genotypes, where each entry is a matrix
#' corresponding to a different locus. At each matrix, each column is a
#' different SNP and each row is a different individual. Genotypes should be
#' coded as 0, 1 or 2.
#' @param pools a list with a vector containing the size (in number of diploid
#' individuals) of each pool. Thus, if a population was sequenced using a
#' single pool, the vector should contain only one entry. If a population was
#' sequenced using two pools, each with 10 individuals, this vector should
#' contain two entries and both will be 10.
#' @param pError an integer representing the value of the error associated with
#' DNA pooling. This value is related with the unequal contribution of both
#' individuals and pools towards the total number of reads observed for a
#' given population - the higher the value the more unequal are the individual
#' and pool contributions.
#' @param sError a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#' @param mCov an integer that defines the mean depth of coverage to simulate.
#' Please note that this represents the mean coverage across all sites.
#' @param vCov an integer that defines the variance of the depth of coverage
#' across all sites.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#' @param minimum an optional integer representing the minimum coverage allowed.
#' Sites where the population has a depth of coverage below this threshold are
#' removed from the data.
#' @param maximum an optional integer representing the maximum coverage allowed.
#' Sites where the population has a depth of coverage above this threshold are
#' removed from the data.
#'
#' @return a data.frame with columns detailing the number of diploid
#' individuals, the pool error, the number of pools, the number of individuals
#' per pool, the mean coverage, the variance of the coverage and the average
#' absolute difference between the frequencies computed from genotypes and
#' from pooled data.
#'
#' @examples
#' # 100 individuals sampled at a single locus
#' genotypes <- run_scrm(nDip = 100, nloci = 1, theta = 5)
#' # compute the mean absolute error assuming a coverage of 100x and two pools of 50 individuals each
#' mymae(genotypes = genotypes, pools = list(c(50, 50)), pError = 100, sError = 0.001,
#' mCov = 100, vCov = 250, min.minor = 0)
#'
#' # 10 individuals sampled at 5 different loci
#' genotypes <- run_scrm(nDip = 10, nloci = 5, theta = 5)
#' # compute the mean absolute error assuming a coverage of 100x and one pool of 10 individuals
#' mymae(genotypes = genotypes, pools = list(10), pError = 100, sError = 0.001,
#' mCov = 100, vCov = 250, min.minor = 0)
#'
#' @export
mymae <- function(genotypes, pools, pError, sError, mCov, vCov, min.minor, minimum = NA, maximum = NA) {
# check if the input is correct - genotypes should always be supplied as a list
if(any(class(genotypes) != "list"))
stop(paste("genotypes should be supplied on a list format, with each entry corresponding to a locus. Please check"))
# get the number of loci
nloci <- length(genotypes)
# get the number of individuals
nDip <- sum(unlist(pools))
# simulate number of reads
reads <- simulateCoverage(mean = mCov, variance = vCov, genotypes = genotypes)
# if the minimum coverage is defined
if(!is.na(minimum)) {
# check if the maximum coverage is also defined
if(is.na(maximum))
stop("please define the maximum coverage")
# remove sites with a depth of coverage above or below the defined threshold
reads <- remove_by_reads(nLoci = nloci, reads, minimum = minimum, maximum = maximum, genotypes = genotypes)
# get the genotypes - without sites simulated with a coverage below or above the threshold
genotypes <- lapply(reads, function(locus) locus[[2]])
# check the dimensions of the matrices with the genotypes
dimensions <- matrix(unlist(lapply(genotypes, dim)), ncol = 2, byrow = TRUE)
# we only wish to keep the locus where we have at least one polymorphic site
tokeep <- dimensions[, 2] != 0
# remove all loci without polymorphic sites
genotypes <- genotypes[tokeep]
# get the reads - without sites simulated with a coverage below or above the threshold
reads <- lapply(reads, function(locus) locus[[1]])
# use the same index to remove entries of the reads list that correspond to locus without polymorphic sites
reads <- reads[tokeep]
# ensure that each entry is a matrix
reads <- lapply(reads, function(locus) matrix(locus, nrow = 1))
}
# compute allele frequencies directly from the genotypes
ifreqs <- Ifreqs(nDip = nDip, genotypes = genotypes)
# simulate individual contribution to the total number of reads
indContribution <- lapply(1:nloci, function(locus)
popsReads(list_np = pools, coverage = reads[[locus]], pError = pError))
# simulate the number of reference reads
reference <- lapply(1:nloci, function(locus)
numberReferencePop(genotypes = genotypes[[locus]], indContribution = indContribution[[locus]],
size = pools, error = sError))
# simulate pooled sequencing data
pool <- poolPops(nPops = 1, nLoci = nloci, indContribution = indContribution, readsReference = reference)
# compute the allele frequencies obtained with pooled sequencing
pfreqs <- lapply(1:nloci, function(locus)
Pfreqs(reference = pool[["reference"]][[locus]], alternative = pool[["alternative"]][[locus]],
coverage = pool[["total"]][[locus]], min.minor = min.minor, ifreqs = ifreqs[[locus]]))
# get the allele frequencies computed directly from genotypes after removing sites that did not pass the threshold
ifreqs <- lapply(pfreqs, `[[`, 1)
# get the allele frequencies computed from Pool-seq after removing sites that did not pass the threshold
pfreqs <- lapply(pfreqs, `[[`, 2)
# compute the mean absolute error for each locus
abs_error <- lapply(1:nloci, function(locus) Metrics::mae(actual = ifreqs[[locus]], predicted = pfreqs[[locus]]))
# replace any NaN values with the mean of the remaining values
abs_error[is.na(abs_error)] <- mean(unlist(abs_error), na.rm = TRUE)
# get the number of pools used to sequence a population
nPools <- length(pools[[1]])
# get the number of individuals per pool
indsPool <- unique(pools[[1]])
# create a dataframe with the values for this particular combination of parameters
out <- data.frame(nDip = nDip, PoolError = pError, nPools = nPools, indsPool = indsPool, mean = mCov, var = vCov,
absError = unlist(abs_error))
# output the results of the function
out
}
#' Compute expected heterozygosity per site
#'
#' Computes the expected heterozygosity for a given site.
#'
#' @param geno_site is a vector where each entry contains the genotype for a
#' single individual, coded as 0,1,2 and using NA for the missing data.
#'
#' @return a numerical value corresponding to the expected heterozygosity at
#' that site.
#'
#' @keywords internal
#'
#' @export
ExpHet_site <- function(geno_site) {
# get the number of individuals with data
ngenecopies <- 2*sum(!is.na(geno_site))
# get the frequency of the alternative allele
freq <- sum(geno_site, na.rm = T)/ngenecopies
# compute the expected heterozygosity
he <- (ngenecopies/(ngenecopies-1))*2*freq*(1-freq)
# output the expected heterozygosity
he
}
#' Compute expected heterozygosity within a population
#'
#' This functions calculates the value of the expected heterozygosity for each
#' SNP.
#'
#' @param Pop_Pi is a matrix or list of allele frequencies. When dealing with a
#' single locus, this input is a matrix and when dealing with multiple loci it
#' is a list. Each entry of that list is a matrix representing a different
#' locus. Each row of that matrix should correspond to a different population
#' and each column to a different SNP.
#'
#' @return if the input is a single matrix, the output will be a matrix where
#' each row represents a different population and each column is the expected
#' heterozygosity of a population at that site. If the input is a list, the
#' output will also be a list, with each entry corresponding to a different
#' locus. Each of those entries will be a matrix with different populations in
#' different rows and the expected heterozygosity of different sites at
#' different columns.
#'
#' @keywords internal
#'
#' @export
Expected_Het <- function(Pop_Pi) {
# dealing with a single matrix of population allelic frequencies - a single locus or simulation
if(class(Pop_Pi)[1] == "matrix") {
# compute the expected heterozygosity for a site - this code goes across all sites
het <- apply(Pop_Pi, c(1,2), function (frequency) 2*frequency*(1 - frequency))
} else { # dealing with more than one locus or simulation
het <- lapply (Pop_Pi, FUN = function(x) {
apply(x, c(1,2), function (frequency) 2*frequency*(1 - frequency))})
}
# output the expected heterozygosity
het
}
#' Average absolute difference between expected heterozygosity
#'
#' Calculates the average absolute difference between the expected
#' heterozygosity computed directly from genotypes and from pooled sequencing
#' data.
#'
#' Different combinations of parameters can be tested to check the effect of the
#' various parameters. The average absolute difference is computed with the
#' \link[Metrics]{mae} function, assuming the expected heterozygosity computed
#' directly from the genotypes as the \code{actual} input argument and the
#' expected heterozygosity from pooled data as the \code{predicted} input
#' argument.
#'
#' @param nDip an integer representing the total number of diploid individuals
#' to simulate. Note that [scrm::scrm()] actually simulates haplotypes, so the
#' number of simulated haplotypes is double of this.
#' @param nloci is an integer that represents how many independent loci should
#' be simulated.
#' @param pools a list with a vector containing the size (in number of diploid
#' individuals) of each pool. Thus, if a population was sequenced using a
#' single pool, the vector should contain only one entry. If a population was
#' sequenced using two pools, each with 10 individuals, this vector should
#' contain two entries and both will be 10.
#' @param pError an integer representing the value of the error associated with
#' DNA pooling. This value is related with the unequal contribution of both
#' individuals and pools towards the total number of reads observed for a
#' given population - the higher the value the more unequal are the individual
#' and pool contributions.
#' @param sError a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#' @param mCov an integer that defines the mean depth of coverage to simulate.
#' Please note that this represents the mean coverage across all sites.
#' @param vCov an integer that defines the variance of the depth of coverage
#' across all sites.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#' @param minimum an optional integer representing the minimum coverage allowed.
#' Sites where the population has a depth of coverage below this threshold are
#' removed from the data.
#' @param maximum an optional integer representing the maximum coverage allowed.
#' Sites where the population has a depth of coverage above this threshold are
#' removed from the data.
#' @param theta a value for the mutation rate assuming theta = 4Nu, where u is
#' the neutral mutation rate per locus.
#'
#' @return a data.frame with columns detailing the number of diploid
#' individuals, the pool error, the number of pools, the number of individuals
#' per pool, the mean coverage, the variance of the coverage and the average
#' absolute difference between the expected heterozygosity computed from
#' genotypes and from pooled data.
#'
#' @examples
#' # single population sequenced with a single pool of 100 individuals
#' errorHet(nDip = 100, nloci = 10, pools = list(100), pError = 100, sError = 0.01,
#' mCov = 100, vCov = 250, min.minor = 2)
#'
#' # single population sequenced with two pools, each with 50 individuals
#' errorHet(nDip = 100, nloci = 10, pools = list(c(50, 50)), pError = 100, sError = 0.01,
#' mCov = 100, vCov = 250, min.minor = 2)
#'
#' # single population sequenced with two pools, each with 50 individuals
#' # removing sites with coverage below 10x or above 180x
#' errorHet(nDip = 100, nloci = 10, pools = list(c(50, 50)), pError = 100, sError = 0.01,
#' mCov = 100, vCov = 250, min.minor = 2, minimum = 10, maximum = 180)
#'
#' @export
errorHet <- function(nDip, nloci, pools, pError, sError, mCov, vCov, min.minor, minimum = NA, maximum = NA, theta = 10) {
# run SCRM and obtain genotypes for a single population
genotypes <- run_scrm(nDip = nDip, nloci = nloci, theta = theta)
# simulate number of reads
reads <- simulateCoverage(mean = mCov, variance = vCov, genotypes = genotypes)
# if the minimum coverage is defined
if(!is.na(minimum)) {
# check if the maximum coverage is also defined
if(is.na(maximum))
stop("please define the maximum coverage")
# remove sites with a depth of coverage above or below the defined threshold
reads <- remove_by_reads(nLoci = nloci, reads, minimum = minimum, maximum = maximum, genotypes = genotypes)
# get the genotypes - without sites simulated with a coverage below or above the threshold
genotypes <- lapply(reads, function(locus) locus[[2]])
# check the dimensions of the matrices with the genotypes
dimensions <- matrix(unlist(lapply(genotypes, dim)), ncol = 2, byrow = TRUE)
# we only wish to keep the locus where we have at least one polymorphic site
tokeep <- dimensions[, 2] != 0
# remove all loci without polymorphic sites
genotypes <- genotypes[tokeep]
# get the reads - without sites simulated with a coverage below or above the threshold
reads <- lapply(reads, function(locus) locus[[1]])
# use the same index to remove entries of the reads list that correspond to locus without polymorphic sites
reads <- reads[tokeep]
# ensure that each entry is a matrix
reads <- lapply(reads, function(locus) matrix(locus, nrow = 1))
}
# compute the expected heterozygosity directly from genotypes
indHets <- lapply(genotypes, function(locus) apply(X = locus, MARGIN = 2, FUN = function(site)
ExpHet_site(geno_site = site)))
# simulate individual contribution to the total number of reads
indContribution <- lapply(1:nloci, function(locus)
popsReads(list_np = pools, coverage = reads[[locus]], pError = pError))
# simulate the number of reference reads
reference <- lapply(1:nloci, function(locus)
numberReferencePop(genotypes = genotypes[[locus]], indContribution = indContribution[[locus]],
size = pools, error = sError))
# simulate pooled sequencing data
pool <- poolPops(nPops = 1, nLoci = nloci, indContribution = indContribution, readsReference = reference)
# compute the allele frequencies obtained with pooled sequencing
pfreqs <- lapply(1:nloci, function(locus)
Pfreqs(reference = pool[["reference"]][[locus]], alternative = pool[["alternative"]][[locus]],
coverage = pool[["total"]][[locus]], min.minor = min.minor, ifreqs = indHets[[locus]]))
# get the expected heterozygosity computed directly from genotypes after removing sites that did not pass the threshold
indHets <- lapply(pfreqs, `[[`, 1)
# get the allele frequencies computed from Pool-seq after removing sites that did not pass the threshold
pfreqs <- lapply(pfreqs, `[[`, 2)
# compute the mean expected heterozygosity for each population and locus - using the Pool-seq data
poolHets <- Expected_Het(pfreqs)
# compute the mean absolute error for each locus
abs_error <- lapply(1:nloci, function(locus) Metrics::mae(actual = indHets[[locus]], predicted = poolHets[[locus]]))
# replace any NaN values with the mean of the remaining values
abs_error[is.na(abs_error)] <- mean(unlist(abs_error), na.rm = TRUE)
# get the number of pools used to sequence a population
nPools <- length(pools[[1]])
# get the number of individuals per pool
indsPool <- unique(pools[[1]])
# create a dataframe with the values for this particular combination of parameters
out <- data.frame(nDip=nDip, PoolError=pError, nPools=nPools, indsPool=indsPool, mean=mCov, var=vCov,
absError=unlist(abs_error))
# output the results of the function
out
}
#' Average absolute difference between the expected heterozygosity computed from
#' genotypes and from Pool-seq data
#'
#' Calculates the average absolute difference between the expected
#' heterozygosity computed directly from genotypes and from pooled sequencing
#' data.
#'
#' The average absolute difference is computed with the \link[Metrics]{mae}
#' function, assuming the expected heterozygosity computed directly from the
#' genotypes as the \code{actual} input argument and the expected heterozygosity
#' from pooled data as the \code{predicted} input argument.
#'
#' Note that this functions allows for different combinations of parameters.
#' Thus, the effect of different combinations of parameters on the average
#' absolute difference can be tested. For instance, it is possible to check what
#' is the effect of different coverages by including more than one value in the
#' \code{mCov} input argument. This function will run and compute the average
#' absolute difference for all combinations of the \code{nDip}, \code{pError}
#' and \code{mCov} input arguments. This function assumes that a single pool of
#' size \code{nDip} was used to sequence the population.
#'
#' @param nDip is an integer or a vector representing the total number of
#' diploid individuals to simulate. Note that [scrm::scrm()] actually
#' simulates haplotypes, so the number of simulated haplotypes is double of
#' this. If it is a vector, then each vector entry will be simulated
#' independently. For instance, if \code{nDip = c(100, 200)}, simulations will
#' be carried out for samples of 100 and 200 individuals.
#' @param nloci is an integer that represents how many independent loci should
#' be simulated.
#' @param pError an integer or a vector representing the value of the error
#' associated with DNA pooling. This value is related with the unequal
#' contribution of both individuals and pools towards the total number of
#' reads observed for a given population - the higher the value the more
#' unequal are the individual and pool contributions. If it is a vector, then
#' each vector entry will be simulated independently.
#' @param sError a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#' @param mCov an integer or a vector that defines the mean depth of coverage to
#' simulate. Please note that this represents the mean coverage across all
#' sites. If it is a vector, then each vector entry will be simulated
#' independently.
#' @param vCov an integer or a vector that defines the variance of the depth of
#' coverage across all sites. If the \code{mCov} is a vector, then \code{vCov}
#' should also be a vector, with each entry corresponding to the variance of
#' the respective entry in the \code{mCov} vector. Thus, the first entry of
#' the \code{vCov} vector will be the variance associated with the first entry
#' of the \code{mCov} vector.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#' @param minimum an optional integer representing the minimum coverage allowed.
#' Sites where the population has a depth of coverage below this threshold are
#' removed from the data.
#' @param maximum an optional integer representing the maximum coverage allowed.
#' Sites where the population has a depth of coverage above this threshold are
#' removed from the data.
#' @param theta a value for the mutation rate assuming theta = 4Nu, where u is
#' the neutral mutation rate per locus.
#'
#' @return a data.frame with columns detailing the number of diploid
#' individuals, the pool error, the number of pools, the number of individuals
#' per pool, the mean coverage, the variance of the coverage and the average
#' absolute difference between the expected heterozygosity computed from
#' genotypes and from pooled data.
#'
#' @examples
#' # a simple test with a simple combination of parameters
#' maeHet(nDip = 100, nloci = 10, pError = 100, sError = 0.01, mCov = 100, vCov = 200, min.minor = 1)
#'
#' # effect of two different pool error values in conjugation with a fixed coverage and pool size
#' maeHet(nDip = 100, nloci = 10, pError = c(100, 200), sError = 0.01,
#' mCov = 100, vCov = 200, min.minor = 1)
#'
#' # effect of two different pool error values in conjugation with a fixed pool size
#' # and two different coverages
#' maeHet(nDip = 100, nloci = 10, pError = c(100, 200), sError = 0.01,
#' mCov = c(100, 200), vCov = c(200, 500), min.minor = 1)
#'
#' @export
maeHet <- function(nDip, nloci, pError, sError, mCov, vCov, min.minor, minimum = NA, maximum = NA, theta = 10) {
# create a matrix to save the values of the mean absolute error for the various conditions
final <- matrix(data = NA, nrow = 1, ncol = 7)
# add names to the columns of the matrix
colnames(final) <- c("nDip", "PoolError", "nPools", "indsPool", "mean", "var", "absError")
# create a matrix containing all combinations of factor variables
# each row will contain one combination of number of diploids, pool error and mean depth of coverage
combinations <- expand.grid(nDip, pError, mCov, stringsAsFactors = FALSE)
# do a loop over all the possible combinations
for (i in 1:nrow(combinations)) {
# get the number of diploid individuals for this combination
dip <- combinations[i, 1]
# get the pooling error for this combination
pError <- combinations[i, 2]
# get the mean depth of coverage for this combination
meanCov <- combinations[i, 3]
# get the variance of the depth of coverage associated with that particular mean coverage
varCov <- vCov[which(mCov == combinations[i, 3])]
# compute the average absolute difference between the allele frequencies from genotypes and from Pool-seq data
temp <- errorHet(nDip = dip, nloci = nloci, pError = pError, pools = list(dip), sError = sError, mCov = meanCov,
vCov = varCov, min.minor = min.minor, minimum = minimum, maximum = maximum, theta = theta)
# add those values to the dataframe containing all the results
final <- rbind(final, temp)
}
# remove the first row of the final matrix - this row contains NAs
final <- final[-1 ,]
# output the final dataframe with the MAE values for the different parameter combinations
final
}
#' Create invariable sites
#'
#' This function applies a correction for the situations where [scrm::scrm()]
#' does not produce a single polymorphic site for a given locus. In this
#' situation, two artificial sites are created at that locus. All individuals
#' are assumed to be homozygous for the reference allele at those sites.
#'
#' @param haplotypes a list of haplotypes obtained from the simulations done
#' with [scrm::scrm()]. Each entry of the list is a matrix that corresponds to
#' a given locus. At each matrix, each column is a different site and each row
#' is a different haplotype.
#' @param nHap an integer representing the total number of haplotypes simulated.
#'
#' @return a list of haplotypes identical to `haplotypes`, but without empty
#' loci.
#'
#' @keywords internal
#'
#' @export
haplo.fix <- function(haplotypes, nHap) {
# get the dimensions of each list entry - organized into a matrix with number of rows in the first column
# and number of columns in the second
size <- matrix(unlist(lapply(haplotypes, dim)), ncol = 2, byrow = TRUE)
# get the index of list entries with zero columns
index <- size[, 2] == 0
# replace those entries with a matrix containing a single column of zeros
haplotypes[index] <- list(matrix(rep(0, times = nHap), ncol = 1))
# get the dimensions of each list entry - organized in the same manner as before
size <- matrix(unlist(lapply(haplotypes, dim)), ncol = 2, byrow = TRUE)
# get the index of list entries with a single column
index <- size[, 2] == 1
# if there are any columns with a single entry
if(sum(index) != 0) {
# add another column with only zeros to those entries
haplotypes[index] <- sapply(haplotypes[index], function(x)
list(cbind(as.matrix(unlist(x), byrow = TRUE), matrix(rep(0, times = nHap)))))
}
# output the haplotypes
haplotypes
}
#' Convert haplotypes to genotypes
#'
#' This function converts haplotypes simulated with [scrm::scrm()] into
#' genotypes by adding the entries on one row with the entries of the subsequent
#' row.
#'
#' @param haplo a matrix of haplotypes obtained from the simulations done with
#' [scrm::scrm()]. Each column of the matrix is a different site and each row
#' is a different haplotype.
#'
#' @return a matrix of genotypes with half the rows of the `haplo` matrix. Each
#' column of this matrix is a different site and each row is a different
#' genotype.
#'
#' @keywords internal
#'
#' @export
hap2geno <- function(haplo) {
# add each row of the matrix to the row that is immediately below it
genotypes <- haplo[seq(1, by = 2, to = nrow(haplo)),] + haplo[seq(2, by = 2, to = nrow(haplo)), , drop = FALSE]
# output the matrix of genotypes
genotypes
}
#' Create genotypes from a output with haplotypes
#'
#' This function applies the \code{\link{hap2geno}} function to all entries of a
#' list. Each entry of that list is a different locus simulated with
#' [scrm::scrm()]. Thus, this function converts the haplotypes of all simulated
#' loci into genotypes.
#'
#' @param haplotypes a list of haplotypes obtained from the simulations done
#' with [scrm::scrm()]. Each entry of the list is a matrix that corresponds to
#' a given locus. At each matrix, each column is a different site and each row
#' is a different haplotype.
#' @param nDip an integer representing the total number of diploid individuals
#' to simulate. Note that this is the total number of diploid individuals and
#' not the number of individuals per population.
#'
#' @return a list of genotypes. Each entry of the list is a matrix corresponding
#' to a different locus. locus. At each matrix, each column is a different
#' site and each row is a different genotype
#'
#' @keywords internal
#'
#' @export
GetGenotypes <- function(haplotypes, nDip) {
# apply the hap2geno function across all entries of the list - sum haplotypes to get genotypes
genotypes <- lapply(haplotypes, function(x) hap2geno(x))
# remove the name (position) of each site - this is something that scrm creates
genotypes <- lapply(genotypes, function(x) unname(x))
# output the genotypes
genotypes
}
#' Simulate coverage at a single locus
#'
#' Simulates the total number of reads, for each polymorphic site of a given
#' locus using a negative binomial distribution.
#'
#' The total number of reads is simulated with a negative binomial and according
#' to a user-defined mean depth of coverage and variance. This function is
#' intended to work with a matrix of genotypes, simulating the depth of coverage
#' for each site present in the genotypes. However, it can also be used to
#' simulate coverage distributions independent of genotypes, by choosing how
#' many sites should be simulated (with the `nSNPs` option).
#'
#' @param mean an integer that defines the mean depth of coverage to simulate.
#' Please note that this represents the mean coverage across all sites. If a
#' vector is supplied instead, the function assumes that each entry of the
#' vector is the mean for a different population.
#' @param variance an integer that defines the variance of the depth of coverage
#' across all sites. If a vector is supplied instead, the function assumes
#' that each entry of the vector is the variance for a different population.
#' @param nSNPs an integer representing the number of polymorphic sites per
#' locus to simulate. This is an optional input but either this or the
#' `genotypes` matrix must be supplied.
#' @param genotypes a matrix of simulated genotypes, where each column is a
#' different SNP and each row is a different individual. This is an optional
#' input but either this or the `nSNPs` must be supplied.
#'
#' @return a matrix with the total coverage per population and per site.
#' Different rows represent different populations and each column is a
#' different site.
#'
#' @examples
#' # coverage for one population at 10 sites
#' simReads(mean = 20, variance = 100, nSNPs = 10)
#'
#' # simulate coverage at one locus with 10 SNPs for two populations:
#' # the first with 100x and the second with 50x
#' simReads(mean = c(100, 50), variance = c(250, 150), nSNPs = 10)
#'
#' @export
simReads <- function(mean, variance, nSNPs = NA, genotypes = NA) {
# check if either the number of SNPs or the genotypes were supplied as input
if(all(is.na(nSNPs), is.na(genotypes)))
stop("You should define the number of SNPs to simulate or supply a matrix of genotypes. Please check")
# check if the variance and mean are reasonable
if(any(variance - mean > 0) == FALSE)
stop("Error: variance equal to mean, or variance smaller than mean")
# if genotypes are supplied as input argument, assume that the number of SNPs
# is equal to the number of columns of the genotypes matrix
if(!all(is.na(genotypes)))
nSNPs <- ncol(genotypes)
# calculate the parameters for the negative binomial
pnb <- mean/variance
rnb <- (mean^2)/(variance - mean)
# use a negative binomial to draw random values, per site and per population, for the total number of observed reads
readnumbers <- t(mapply(FUN = function(size, prob) stats::rnbinom(n = nSNPs, size = size, prob = prob), rnb, pnb))
# if there is only a single SNP - we need to transpose the previous result to get each population in a different row
if(nSNPs == 1)
readnumbers <- t(readnumbers)
# get the output - number of reads per site and per population
readnumbers
}
#' Simulate total number of reads per site
#'
#' This function simulates the total number of reads, for each polymorphic site
#' using a negative binomial distribution.
#'
#' The total number of reads is simulated with a negative binomial and according
#' to a user-defined mean depth of coverage and variance. This function is
#' intended to work with a list of genotypes, simulating the depth of coverage
#' for each site present in the genotypes. However, it can also be used to
#' simulate coverage distributions independent of genotypes, by choosing how
#' many loci to simulate (with the `nLoci` option) and choosing how many sites
#' per locus should be simulated (with the `nSNPs` option).
#'
#' @param mean an integer that defines the mean depth of coverage to simulate.
#' Please note that this represents the mean coverage across all sites. If a
#' vector is supplied instead, the function assumes that each entry of the
#' vector is the mean for a different population.
#' @param variance an integer that defines the variance of the depth of coverage
#' across all sites. If a vector is supplied instead, the function assumes
#' that each entry of the vector is the variance for a different population.
#' @param genotypes a list of simulated genotypes, where each entry is a matrix
#' corresponding to a different locus. At each matrix, each column is a
#' different SNP and each row is a different individual. This is an optional
#' input but either this or the `nSNPs` must be supplied.
#' @param nSNPs an integer representing the number of polymorphic sites per
#' locus to simulate. This is an optional input but either this or the
#' `genotypes` list must be supplied.
#' @param nLoci an optional integer that represents how many independent loci
#' should be simulated.
#'
#' @return a list with the total coverage per population and per site. Each list
#' entry is a matrix corresponding to a different locus. For each matrix,
#' different rows represent different populations and each column is a
#' different site.
#'
#' @examples
#' # simulate 10 loci, each with 10 SNPs for a single population
#' simulateCoverage(mean = 100, variance = 250, nSNPs = 10, nLoci = 10)
#'
#' # simulate 10 loci, each with 10 SNPs for two populations:
#' # the first with 100x and the second with 50x
#' simulateCoverage(mean = c(100, 50), variance = c(250, 150), nSNPs = 10, nLoci = 10)
#'
#' # simulate coverage given a set of genotypes
#' # run scrm and obtain genotypes
#' genotypes <- run_scrm(nDip = 100, nloci = 10)
#' # simulate coverage
#' simulateCoverage(mean = 50, variance = 200, genotypes = genotypes)
#'
#' @export
simulateCoverage <- function(mean, variance, nSNPs = NA, nLoci = NA, genotypes = NA) {
# check if either the number of SNPs or the genotypes were supplied as input
if(all(is.na(nSNPs), is.na(genotypes)))
stop("You should define the number of SNPs to simulate or supply a list of genotypes. Please check")
# if the genotypes are supplied as input to the function
if(!all(is.na(genotypes))) {
# check if the input is correct - genotypes should always be supplied as a list
if(any(class(genotypes) != "list"))
stop(paste("genotypes should be supplied on a list format, with each entry corresponding to a locus. Please check"))
# use a negative binomial to draw random values, per site and per population, for the total number of observed reads
# this outputs a list where each entry corresponds to a locus
readnumbers <- lapply(genotypes, FUN = function(geno) simReads(mean, variance, genotypes = geno))
} else {
# check if the input is correct - when genotypes are not supplied as input, the number of loci should be defined
if(is.na(nLoci))
stop(paste("Please define the number of loci to simulate"))
# use a negative binomial to draw random values, per site and per population, for the total number of observed reads
# this outputs a list where each entry corresponds to a locus
readnumbers <- lapply(1:nLoci, FUN = function(geno) simReads(mean, variance, nSNPs))
}
# get the output - number of reads per site and per population
readnumbers
}
#' Apply a coverage-based filter to a matrix
#'
#' This function removes sites that have a coverage below a `minimum` value and
#' sites with a coverage above a `maximum` value. If a matrix of genotypes is
#' also supplied, then those same sites are also removed from that matrix.
#'
#' @param reads a matrix with the total depth of coverage. Each row of the
#' matrix should be the coverage of a different population and each column a
#' different site.
#' @param minimum an integer representing the minimum coverage allowed. Sites
#' where any population has a depth of coverage below this threshold are
#' removed from the data.
#' @param maximum an integer representing the maximum coverage allowed. Sites
#' where any population has a depth of coverage above this threshold are
#' removed from the data.
#' @param genotypes an optional matrix input with the genotypes. Each column of
#' the matrix should be a different site and each row a different individual.
#'
#' @return a matrix with the total depth of coverage minus the sites (i.e.
#' columns) where the coverage for any of the populations was below the
#' minimum or above the maximum. If genotypes were supplied, then the output
#' will be a list, with one entry per locus. Each entry will contain the
#' filtered coverage in the first entry and the genotypes, minus the removed
#' sites, in the second entry.
#'
#' @examples
#' set.seed(10)
#'
#' # simulate coverage for a single locus - select the first entry to obtain a matrix
#' reads <- simulateCoverage(mean = c(25, 25), variance = c(200, 200), nSNPs = 10, nLoci = 1)[[1]]
#'
#' # check the coverage matrix
#' reads
#'
#' # remove sites with coverage below 10x or above 100x
#' remove_by_reads_matrix(reads = reads, minimum = 10, maximum = 100)
#'
#' @export
remove_by_reads_matrix <- function(reads, minimum, maximum, genotypes = NA) {
# check which sites, if any, have a coverage below or above the threshold
toremove <- apply(X = reads, MARGIN = 2, function(col) any(col < minimum | col > maximum))
# if there are any sites with a coverage below or above the required value - remove those sites from the matrix
if(length(toremove) != 0)
reads <- reads[, !toremove, drop = FALSE]
# set the output if the only input were the number of reads
output <- reads
# when genotypes were also supplied as input
if(!all(is.na(genotypes))) {
# remove the same sites from the matrix containing the genotypes
genotypes <- genotypes[, !toremove, drop = FALSE]
# create the new output, containing both the reads and the genotypes
output <- list(reads, genotypes)
}
# output the number of reads
output
}
#' Apply a coverage-based filter over a list
#'
#' This function removes sites that have a coverage below a `minimum` value and
#' sites with a coverage above a `maximum` value. This is done over multiple
#' loci, assuming that each entry of the `reads` list is a different locus. If a
#' list of genotypes is also supplied, then those same sites are also removed
#' from each locus of the genotypes.
#'
#' @param nLoci an integer that represents how many independent loci were
#' simulated.
#' @param reads a list with the total depth of coverage. Each entry of the list
#' should be a matrix corresponding to a different locus. Each row of that
#' matrix should be the coverage of a different population and each column a
#' different site.
#' @param minimum an integer representing the minimum coverage allowed. Sites
#' where any population has a depth of coverage below this threshold are
#' removed from the data.
#' @param maximum an integer representing the maximum coverage allowed. Sites
#' where any population has a depth of coverage above this threshold are
#' removed from the data.
#' @param genotypes an optional list input with the genotypes. Each entry of the
#' list should be a matrix corresponding to a different locus. Each column of
#' the matrix should be a different site and each row a different individual.
#'
#' @return a list with the total depth of coverage similar to the `reads` input
#' argument but without sites where the coverage was below the `minimum` or
#' above the `maximum`. If the genotypes were included, a second list entry
#' will also be included in the output, containing the genotypes minus the
#' sites that were removed.
#'
#' @examples
#' set.seed(10)
#'
#' # simulate coverage for 10 locus
#' reads <- simulateCoverage(mean = c(25, 25), variance = c(200, 200), nSNPs = 10, nLoci = 10)
#'
#' # remove sites with coverage below 10x or above 100x
#' reads <- remove_by_reads(nLoci = 10, reads = reads, minimum = 5, maximum = 100)
#' # notice that some locus no longer have 10 SNPs - those sites were removed
#' reads
#'
#' @export
remove_by_reads <- function(nLoci, reads, minimum, maximum, genotypes = NA) {
# check if the input is correct - reads should always be supplied as a list
if(!inherits(reads, "list"))
stop(paste("reads should be supplied on a list format, with each entry corresponding to a locus. Please check"))
# this applies the remove_by_reads_matrix function to all the list entries - i.e. to all the different loci
# note that this will also remove those sites from the genotypes list - if genotypes are supplied as input
# if genotypes were not supplied as input
if(any(is.na(genotypes))) {
# remove sites with a depth of coverage above or below the defined threshold
out <- lapply(1:nLoci, function(locus) remove_by_reads_matrix(reads = reads[[locus]], minimum, maximum))
} else { # if genotypes are supplied as input
# remove sites with a depth of coverage above or below the defined threshold
out <- lapply(1:nLoci, function(locus)
remove_by_reads_matrix(reads = reads[[locus]], minimum, maximum, genotypes = genotypes[[locus]]))
}
# output the number of reads - and genotypes if relevant
out
}
#' Probability of contribution of each pool
#'
#' This function computes the probability of contribution of each pool towards
#' the total depth of coverage of a single population. If multiple pools where
#' used to sequence a single population, it is possible that some pools
#' contribute more than others.
#'
#' @param nPools an integer indicating how many pools were used to sequence the
#' population.
#' @param vector_np is a vector where each entry contains the number of diploid
#' individuals of a given pool. Thus, if a population was sequenced using two
#' pools, each with 10 individuals, this vector would contain two entries and
#' both will be 10.
#' @param nSNPs an integer indicating how many SNPs exist in the data.
#' @param pError an integer representing the value of the error associated with
#' DNA pooling. This value is related with the unequal pool contribution
#' towards the total number of reads of a population - the higher the value
#' the more unequal are the pool contributions.
#'
#' @return a matrix with the probabilities of contribution for each pool. Each
#' row represents a different pool and each column is a different site.
#'
#' @examples
#' # probability of contribution at 8 SNPs for 5 pools, each with 10 individuals
#' poolProbs(nPools = 5, vector_np = rep(10, 5), nSNPs = 8, pError = 50)
#'
#' @export
poolProbs <- function(nPools, vector_np, nSNPs, pError) {
# calculate the probability of contributing for each individual - in each population
# set the minimum threshold value of alpha
min.alpha <- 0.01
# check if we are dealing with a single population - this function should be used on a single population
# also check if the input is on the correct format
if(!inherits(vector_np, "numeric") | length(vector_np) == 1)
stop(paste("The vector_np input should be a vector. It should also contain more than one entry. Please check"))
# check if we are dealing with a single population - this function should be used on a single population
# also check if the input is on the correct format
if(nPools != length(vector_np))
stop(paste("The nPools input is", paste(nPools), "and so, the length of the size vector should also be",
paste(nPools, ".", sep = ""), "Please check"))
# set k - total number of pools
k <- nPools
# the total number of individuals in the population (n) can be obtained by adding the individuals in all pools
n <- sum(vector_np)
# pooling error is defined in % - change this to a proportion
pError <- pError/100
# if pools have different sizes
if(length(unique(vector_np)) != 1) {
# compute rho for the smallest pool using ns - n of the smallest pool
ns <- min(vector_np)
# check if the error is too high for this number of pools
if(sqrt((n/ns)-1) < pError) {
# if the error is too high, replace it by a maximum pool error
tmp <- round((sqrt((n/ns)-1)*100) - 10)
# replace the error
pError <- sqrt((n/ns)-1) - 0.1
# output a warning with the new error
warning(paste("pError was too high. It was replaced by", tmp), call. = FALSE)
}
# compute rho for that pool
rho <- ((n/ns)-1-pError^2)/pError^2
# if we use Dir(ρ*np/n), then the alpha_i for Dirichlet can be written as
alpha <- rho*(vector_np/n)
# check if any alpha is negative
if(any(alpha < 0))
stop("The sum(vector_np) divided by vector_np should be larger than 1. Please check!",
call. = FALSE)
} else { # if all pools have the same size
# check if the error is too high for this number of pools
if(sqrt(k-1) < pError) {
# if the error is too high, replace it by a maximum pool error
tmp <- (sqrt(k-1)*100) - 10
# replace the error
pError <- sqrt(k-1) - 0.1
# output a warning with the new error
warning(paste("pError was too high. It was replaced by", tmp), call. = FALSE)
}
# calculate rho
rho <- ((k-1)/pError^2) - 1
# if we use Dir(ρ*np/n), then the alpha_i for Dirichlet can be written as
alpha <- rho*(vector_np/n)
}
# if alpha is smaller than the minimum threshold - this mainly happens with high pool errors
if(all(alpha < min.alpha)) {
# alternatively, use a multinomial to sample a contribution of 1 for a random pool per site
probs <- stats::rmultinom(n = nSNPs, size = 1, prob = rep(1, k))
} else {
# use a Dirichlet distribution to get the probability of contribution for each pool across all sites
probs <- t(MCMCpack::rdirichlet(n = nSNPs, alpha = alpha))
}
# output the probability of contributing for each pool
probs
}
#' Reads contributed by each pool
#'
#' This function simulates the contribution, in terms of reads, of each pool.
#' The number of reads contributed from all pools is equal to the total coverage
#' of the population.
#'
#' @param nPools an integer indicating how many pools were used to sequence the
#' population.
#' @param coverage a vector containing the total depth of coverage of the
#' population. Each entry of the vector represents a different site.
#' @param probs a matrix containing the probability of contribution of each pool
#' used to sequence the population. This matrix can be obtained with the
#' `poolProbs` function.
#'
#' @return a matrix with the number of reads contributed by each pool towards
#' the total coverage of the population. Each row of the matrix is a different
#' pool and each column a different site.
#'
#' @examples
#' # simulate the probability of contribution of each pool
#' probs <- poolProbs(nPools = 5, vector_np = rep(10, 5), nSNPs = 8, pError = 50)
#'
#' # simulate the number of reads contributed, assuming 10x coverage for each site
#' poolReads(nPools = 5, coverage = rep(10, 8), probs = probs)
#'
#' @export
poolReads <- function(nPools, coverage, probs) {
# set the output of this function if a given locus has no sites
if(length(coverage) == 0) {
# set the contribution to NA
contribution <- NA
} else {
# check if the dimensions make sense
if(length(coverage) != ncol(probs))
stop("different number of sites in the coverage and probs input arguments")
# simulate the contribution of each pool towards the total depth of coverage
contribution <- vapply(1:length(coverage), FUN = function(i) {
stats::rmultinom(1, size = coverage[i], prob = probs[, i])
}, FUN.VALUE = numeric(nPools))
}
# output a matrix containing the contribution (in number of reads) for each pool and across all sites
contribution
}
#' Probability of contribution of each individual
#'
#' This function computes the probability of contribution for each individual of
#' a given pool. Please note that this function works for a single pool and
#' should not be directly applied to situations where multiple pools were used.
#'
#' @param np an integer specifying how many individuals were pooled.
#' @param nSNPs an integer indicating how many SNPs exist in the data.
#' @param pError an integer representing the value of the error associated with
#' DNA pooling. This value is related with the unequal individual contribution
#' towards the total number of reads contributed by a single pool - the higher
#' the value the more unequal are the individual contributions.
#'
#' @return a matrix with the probabilities of contribution for each individual.
#' Each row represents a different individual and each column is a different
#' site.
#'
#' @examples
#' # probability of contribution for 10 individuals at 5 sites
#' indProbs(np = 10, nSNPs = 5, pError = 100)
#'
#' @export
indProbs <- function(np, nSNPs, pError) {
# set the minimum threshold value of alpha
min.alpha <- 0.01
# pooling error is defined in % - change this to a proportion
pError <- pError/100
# check if the error is too high for this number of individuals
if(sqrt(np-1) <= pError) {
# if the error is too high, replace it by a maximum pool error
tmp <- (sqrt(np-1)*100) - 10
# replace the error
pError <- sqrt(np-1) - 0.1
# output a warning with the new error
warning(paste("pError was too high. It was replaced by", tmp), call. = FALSE)
}
# compute rho for individuals inside a single pool
rho <- ((np-1)/(pError^2)) - 1
# if we use Dir(rho/np), then the alpha_i for Dirichlet can be written as
alpha <- rho/np
# use a dirichlet distribution to get the probability of contribution for each individual across all sites
probs <- t(MCMCpack::rdirichlet(n = nSNPs, alpha = rep(alpha, times = np)))
# output the probability of contributing for each individual
probs
}
#' Reads contributed by each individual
#'
#' This function simulates the contribution, in terms of reads, of each
#' individual of a given pool. Please note that this function works for a single
#' pool and should not be directly applied to situations where multiple pools
#' were used.
#'
#' @param np an integer specifying how many individuals were pooled.
#' @param coverage a vector containing the total depth of coverage of a given
#' pool. Each entry of the vector represents a different site.
#' @param probs a matrix containing the probability of contribution of each
#' individual. This matrix can be obtained with the `indProbs` function.
#'
#' @return a matrix with the number of reads contributed by each individual
#' towards the coverage of its pool. Each row of the matrix is a different
#' individual and each column a different site.
#'
#' @examples
#' # probability of contribution for 10 individuals at 5 sites
#' probs <- indProbs(np = 10, nSNPs = 5, pError = 100)
#'
#' # simulate the number of reads contributed, assuming 10x coverage for each site
#' indReads(np = 10, coverage = rep(10, 5), probs = probs)
#'
#' @export
indReads <- function(np, coverage, probs) {
# set the output of this function if a given locus has no sites
if(length(coverage) == 0) {
# set the contribution to NA
contribution <- NA
} else {
# check if the dimensions make sense
if(length(coverage) != ncol(probs))
stop("different number of sites in the coverage and probs input arguments")
# simulate the contribution of each individual towards the total depth of coverage
contribution <- vapply(1:length(coverage), FUN = function(i) {
stats::rmultinom(1, size = coverage[i], prob = probs[, i])
}, FUN.VALUE = numeric(np))
}
# output a matrix containing the contribution (in number of reads) for each individual and across all sites
contribution
}
#' Compute number of reads for each individual and across all sites
#'
#' This function computes the contribution of each individual towards the total
#' coverage of a given population.
#'
#' If multiple pools were used to sequence a population, this will compute the
#' contribution of each pool and then use that to calculate how many reads does
#' that pool contribute. Next, the probability of contribution of each
#' individual is computed and utilized to calculate the number of reads that
#' each individual contributes towards the total number of reads observed in the
#' corresponding pool.
#'
#' @param vector_np is a vector where each entry contains the number of diploid
#' individuals of a given pool. Thus, if a population was sequenced using two
#' pools, each with 10 individuals, this vector would contain two entries and
#' both will be 10.
#' @param coverage a vector containing the total depth of coverage of the
#' population. Each entry of the vector represents a different site.
#' @param pError an integer representing the value of the error associated with
#' DNA pooling. This value is related with the unequal contribution of both
#' individuals and pools towards the total number of reads observed for a
#' given population - the higher the value the more unequal are the individual
#' and pool contributions.
#'
#' @return a matrix with the number of reads contributed by each individual.
#' Each row of the matrix corresponds to a different individual and each
#' column to a different site.
#'
#' @examples
#' # simulate number of reads contributed by each individual towards the total population coverage
#' # assuming a coverage of 10x at 5 sites and two pools, each with 5 individuals
#' popReads(vector_np = c(5, 5), coverage = rep(10, 5), pError = 100)
#'
#' @export
popReads <- function(vector_np, coverage, pError) {
# when the coverage input is a list, convert it to a vector
if(inherits(coverage, "list"))
coverage <- unlist(coverage)
# get the number of diploid individuals
nDip <- sum(vector_np)
# get the number of pools used to sequence the population
nPools <- length(vector_np)
# get the number of polymorphic sites
nSNPs <- length(coverage)
# if no polymorphic sites exist at any given locus - set the output to NA
if(nSNPs == 0) {
# set the output
iReads <- matrix(data = NA, nrow = 1)
} else {
# when the population was sequenced using a single pool of individuals
if (nPools == 1) {
# calculate the probability of contribution for each individual
probs <- indProbs(np = nDip, nSNPs = nSNPs, pError = pError)
# compute the contribution of each individual and across all sites
# towards the total depth of coverage of the population
iReads <- indReads(np = nDip, coverage = coverage, probs = probs)
} else { # When more than one pool of individuals was used to sequence the population
# calculate the probability of contribution for each pool used to sequence the population
probs_pool <- poolProbs(nPools = nPools, vector_np = vector_np, nSNPs = nSNPs, pError = pError)
# compute the number of reads that each pool contributes per site
# This takes into account the total depth of coverage of the population for a given site
# and divides that coverage amongst all the pools - considering also the probability of contribution of each pool
pool_reads <- poolReads(nPools = nPools, coverage, probs = probs_pool)
# calculate individual probabilities for each pool
probs_ind <- lapply(vector_np, FUN = function(pool) indProbs(np = pool, nSNPs = nSNPs, pError = pError))
# Taking into account the coverage of each pool, simulate how many reads does each individual inside a pool
# contributes towards that number i.e if a pool has 25 reads at a given site:
# simulate how many of these reads come from the first individual, how many from the second, etc
iReads <- lapply(1:nPools, function(i)
indReads(np = vector_np[i], coverage = pool_reads[i, ], probs = probs_ind[[i]]))
# combine the entries from the various pools to obtain how many reads does a given individual
# contributes to the total depth of coverage of the population
iReads <- do.call(rbind, iReads)
}
}
# output a matrix containing the contribution (in number of reads) for each individual and across all sites
iReads
}
#' Simulate total number of reads for multiple populations
#'
#' Simulates the contribution of each individual towards the total coverage of
#' its population.
#'
#' If multiple pools were used to sequence a population, this will compute the
#' contribution of each pool and then use that to calculate how many reads does
#' that pool contribute. Next, the probability of contribution of each
#' individual is computed and utilized to calculate the number of reads that
#' each individual contributes towards the total number of reads observed in the
#' corresponding pool. These steps will be performed for each population, thus
#' obtaining the number of reads contributed by each individual for each
#' population.
#'
#' @param list_np is a list where each entry corresponds to a different
#' population. Each entry is a vector and each vector entry contains the
#' number of diploid individuals of a given pool. Thus, if a population was
#' sequenced using two pools, each with 10 individuals, this vector would
#' contain two entries and both will be 10.
#' @param coverage a matrix containing the total depth of coverage of all
#' populations. Each row corresponds to a different population and each column
#' to a different site.
#' @param pError an integer representing the value of the error associated with
#' DNA pooling. This value is related with the unequal contribution of both
#' individuals and pools towards the total number of reads observed for a
#' given population - the higher the value the more unequal are the individual
#' and pool contributions.
#'
#' @return a list with one entry per population. Each entry represents the
#' number of reads contributed by each individual towards the total coverage
#' of its population. Different individuals correspond to different rows and
#' different sites to different columns.
#'
#' @examples
#' # simulate coverage for two populations sequenced at 10x at 5 sites
#' reads <- simulateCoverage(mean = c(10, 10), variance = c(20, 20), nSNPs = 5, nLoci = 1)
#'
#' # simulate the individual contribution towards that coverage
#' # assuming that the first population was sequenced using two pools of 5 individuals
#' # and the second using a single pool with 10 individuals
#' popsReads(list_np = list(c(5, 5), 10), coverage = reads, pError = 5)
#'
#' @export
popsReads <- function(list_np, coverage, pError) {
# get the number of populations
nPops <- length(list_np)
# when the coverage input is a list, convert with to a matrix, with each row corresponding to a population
if(any(class(coverage) == "list"))
coverage <- matrix(unlist(coverage), nrow = nPops, byrow = FALSE)
# Taking into account the depth of coverage for each population, simulate how many reads does each individual contributes
# towards the total coverage of the population
indCoverage <- lapply(1:nPops, function(pop) popReads(vector_np = list_np[[pop]], coverage[pop, ], pError))
# output the contribution (in number of reads) for each individual and across all sites
indCoverage
}
#' Split matrix of genotypes
#'
#' This function splits a matrix into different list entries. The matrix is
#' split according to a set of row indexes defined by the `size` input.
#'
#' The `size` input is utilized to create the index of the rows that go into the
#' different list entries. It specifies the size, in terms of number of
#' individuals, of each population.
#'
#' @param matrix is obviously a matrix, ideally one containing genotypes. Each
#' column of the matrix should be a different site and each row a different
#' individual.
#' @param size a list with one entry per population. Each entry should be a
#' vector containing the size (in number of diploid individuals) of each pool.
#' Thus, if a population was sequenced using a single pool, the vector should
#' contain only one entry. If a population was sequenced using two pools, each
#' with 10 individuals, this vector should contain two entries and both will
#' be 10.
#'
#' @return a list with one entry per entry of the size input argument. Each
#' entry contains the information of the original matrix for the number of
#' individuals specified by the corresponding entry of the size input
#' argument.
#'
#' @keywords internal
#'
#' @examples
#' set.seed(10)
#'
#' # create a random matrix
#' mymatrix <- matrix(round(runif(n = 50, min = 10, max = 25)), nrow = 10)
#'
#' # split that matrix assuming 8 individuals in the first population and two in the second
#' splitMatrix(matrix = mymatrix, size = list(8, 2))
#'
#' @export
splitMatrix <- function(matrix, size) {
# general check to see if the input is correctly supplied
if(!inherits(size, "list"))
stop(paste("The size input should be a list. Please check"))
# get the number of populations - it's the number of entries in the size input
nPops <- length(size)
# Perform a cumulative sum - this will create a vector, starting at the number one
# each subsequent entry on the vector is the index of the last individual of a population
popsize <- c(0, cumsum(lapply(size, sum)))
# Use the previous index to create vectors containing the index of all individuals per population.
# For example, if you have a population starting at index 30 and ending at 50, this will create a vector
# containing the numbers 30, 31, 32... etc until 50. This creates a list, where each entry is a vector with
# the indices of a single population
index <- lapply(1:nPops, function(i) seq(popsize[i]+1, popsize[i+1]))
# use the vectors containing the index of all individuals of a given population to subset the matrix
# The sub-setting is done by rows and it creates a list, where each entry contains the information for one population
output <- lapply(1:nPops, function(pop) matrix[index[[pop]], , drop = FALSE])
# Output the list containing, in each entry, the information for each population
output
}
#' Compute the number of reference reads
#'
#' This function takes as input the total depth of coverage and computes how
#' many of those reads are reference allele reads.
#'
#' More precisely, this function computes the number of reference reads per site
#' for one individual, given the genotype of the individual at each site, the
#' total number of reads observed for the individual at that site and an error
#' rate.
#'
#' @param genotype_v is a vector with the genotype of a given individual. Each
#' entry of the vector should be a different site. Genotypes should be encoded
#' as 0: reference homozygote, 1: heterozygote and 2: alternative homozygote.
#' @param readCount_v is a vector with the number of reads contributed by the
#' same given individual. Each entry of that vector should be a different
#' site.
#' @param error a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#'
#' @return a vector with the number of reference allele reads. Each entry of the
#' vector corresponds to a different individual.
#'
#' @examples
#' # number of reference allele reads for three individuals, each with 10x coverage
#' # one individual is homozygote for the reference allele (0), other is heterozygote (1)
#' # and the last is homozygote for the alternative allele (2)
#' getNumReadsR_vector(genotype_v = c(0,1,2), readCount_v = c(10, 10, 10), error = 0.01)
#'
#' @export
getNumReadsR_vector <- function(genotype_v, readCount_v, error) {
# ensure that both vectors have the same length
# you should have one genotype and one value of depth of coverage per site
if(length(genotype_v) != length(readCount_v))
stop(paste("The lengths of the genotype_v and the readCount_v inputs should be the same. Please check"))
# initialize the number of A reads as a vector of 0 with same size as Genotype_v
numAreads <- numeric(length(genotype_v))
# Get the entries of genotype for hom reference
hom_anc <- genotype_v == 0
# Get the entries of heterozygotes (1)
het <- genotype_v == 1
# Get the entries for hom alternative (2)
hom_der <- genotype_v == 2
# Call the rbinom in a vector way, using size as the vector
# homozygote reference
numAreads[hom_anc] <- stats::rbinom(n = sum(hom_anc), size = readCount_v[hom_anc], prob = 1-error)
# hets
numAreads[het] <- stats::rbinom(n = sum(het), size = readCount_v[het], prob = 0.5)
# homozygote alternative
numAreads[hom_der] <- stats::rbinom(n = sum(hom_der), size = readCount_v[hom_der], prob = error)
# return the number of reads
numAreads
}
#' Compute the number of reference reads over a matrix
#'
#' This function works over all the rows and columns of a matrix and computes
#' the number of reads containing the reference allele at each site and for each
#' individual.
#'
#' @param genotypes is a matrix of genotypes. Each column of the matrix should
#' be a different site and each row a different individual. Genotypes should
#' be encoded as 0: reference homozygote, 1: heterozygote and 2: alternative
#' homozygote.
#' @param indContribution is a matrix of individual contributions. Each row of
#' that matrix is a different individual and each column is a different site.
#' Thus, each entry of the matrix should contain the number of reads
#' contributed by that individual at that particular site.
#' @param error a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#'
#' @return a matrix with the number of reference allele reads contributed by
#' each individual. Each row of the matrix represents a different individual
#' and each column is a different site.
#'
#' @examples
#' # probability of contribution for 10 individuals at 5 sites
#' probs <- indProbs(np = 10, nSNPs = 5, pError = 5)
#'
#' # simulate the number of reads contributed, assuming 20 coverage for each site
#' indContribution <- indReads(np = 10, coverage = rep(20, 5), probs = probs)
#'
#' # set seed and create a random matrix of genotypes
#' set.seed(10)
#' genotypes <- matrix(rpois(50, 0.5), nrow = 10)
#'
#' # simulate the number of reads with the reference allele
#' computeReference(genotypes = genotypes, indContribution = indContribution, error = 0.01)
#'
#' @export
computeReference <- function(genotypes, indContribution, error) {
# ensure that both matrices have the same dimension
# you should have one genotype and one value of depth of coverage per site - both matrices should have the same dimension
if(identical(dim(genotypes), dim(indContribution)) == FALSE)
stop(paste("The dimensions of the genotypes and indContribution matrices should be the same. Please check"))
# the function getNumReadsR_vector gets as input vectors
# hence, I can apply it to each column of genotypes and individual contribution
tempReference <- sapply(1:ncol(genotypes), function(i) {
getNumReadsR_vector(genotype_v = genotypes[,i], readCount_v = indContribution[,i], error = error)})
# check that indContribution-tmp does not give negative values
if(sum( (indContribution - tempReference) < 0 ) > 0 ) {
stop("Error in creating number of reads!")
}
# output the number of reference reads
tempReference
}
#' Compute the number of reference reads at multiple loci
#'
#' This function computes the number of reference reads over multiple loci and
#' for a single population.
#'
#' Note that this function will also work on a single locus, provided that the
#' input is in the list format.
#'
#' @param genotypes is a list, where each entry corresponds to a different
#' locus. Each entry should be a matrix containing the genotypes (coded as 0,
#' 1 or 2). Each column of that matrix should be a different site and each row
#' a different individual.
#' @param indContribution either a list or a matrix (that the function will
#' convert to a list). Each list entry should be a matrix of individual
#' contributions. Each row of that matrix is a different individual and each
#' column is a different site. Thus, each entry of the matrix should contain
#' the number of reads contributed by that individual at that particular site.
#' @param error a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#'
#' @return a list with one entry per locus. Each of those entries is a matrix
#' with the number of reference allele reads contributed by each individual.
#' Each matrix row represents a different individual and each column is a
#' different site.
#'
#' @keywords internal
#'
#' @export
numberReference <- function(genotypes, indContribution, error) {
if(!inherits(genotypes, "list"))
stop(paste("The genotypes input should be on a list format. Please check"))
if(any(class(indContribution) != "list"))
indContribution <- list(indContribution)
if(length(genotypes) != length(indContribution))
stop(paste("The genotypes and indContribution lists should have the same number of entries. Please check"))
# get the number of loci
nLoci <- length(genotypes)
# When dealing with a single population - all the genotypes in the matrix correspond to a single population
readsReference <- lapply(1:nLoci, function(locus) computeReference(genotypes[[locus]], indContribution[[locus]], error))
# output the number of reads with the reference allele per individual and per site
readsReference
}
#' Compute the number of reference reads for multiple populations
#'
#' This function computes the number of reference reads over a single locus for
#' multiple populations.
#'
#' Note that this function will not work as intended if the input consists of
#' multiple loci.
#'
#' @param genotypes either a list with a single entry (one locus) or a matrix
#' (that the function will convert to a list) containing the genotypes (coded
#' as 0, 1 or 2). Each column of that matrix should be a different site and
#' each row a different individual.
#' @param indContribution a list where each entry contains the information for a
#' single population. Each entry should be a matrix, with as many rows as the
#' number of individuals of that population. Each row contains the number of
#' contributed reads for a given individual and across all sites.
#' @param size a list with one entry per population. Each entry should be a
#' vector containing the size (in number of diploid individuals) of each pool.
#' Thus, if a population was sequenced using a single pool, the vector should
#' contain only one entry. If a population was sequenced using two pools, each
#' with 10 individuals, this vector should contain two entries and both will
#' be 10.
#' @param error a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#'
#' @return a list with one entry per population. Each entry contains the number
#' of reference allele reads for the individuals of that population and for
#' that locus. Different individuals are in different rows and each columns
#' represents a different site.
#'
#' @examples
#' # simulate coverage at 5 SNPs for two populations, assuming 20x mean coverage
#' reads <- simulateCoverage(mean = c(20, 20), variance = c(100, 100), nSNPs = 5, nLoci = 1)
#'
#' # simulate the number of reads contributed by each individual
#' # for each population there are two pools, each with 5 individuals
#' indContribution <- popsReads(list_np = rep(list(rep(5, 2)), 2), coverage = reads, pError = 5)
#'
#' # set seed and create a random matrix of genotypes for the 20 individuals - 10 per population
#' set.seed(10)
#' genotypes <- matrix(rpois(100, 0.5), nrow = 20)
#'
#' # simulate the number of reference reads for the two populations
#' numberReferencePop(genotypes = genotypes, indContribution = indContribution,
#' size = rep(list(rep(5, 2)), 2), error = 0.01)
#'
#' @export
numberReferencePop <- function(genotypes, indContribution, size, error) {
# check if the indContribution input is the right format
if(any(class(indContribution) != "list"))
stop(paste("The indContribution input should be on a list format. Please check"))
# get the number of populations
nPops <- length(size)
# set the output when a given locus has no SNPs
if(any(is.na(unlist(indContribution)))) {
# set the output to NA
reference <- list(matrix(data = NA, nrow = nPops), matrix(data = NA, nrow = nPops))
} else {
# when the genotypes input is not a list, convert with to a list
if(any(class(genotypes) != "list"))
genotypes <- list(genotypes)
# check if the indContribution list contains one list entry per population
if(length(indContribution) != nPops)
stop(paste("The indContribution input should have one list entry per population. Please check"))
# use the splitMatrix function to split the genotypes into different list entries for each population
tempGeno <- sapply(genotypes, FUN = function(geno) splitMatrix(geno, size))
# compute the number of reads with the reference allele, for each individual and across all sites and populations
reference <- sapply(1:nPops, function(pop)
numberReference(genotypes = list(tempGeno[[pop]]), indContribution = indContribution[[pop]], error))
}
# output the number of reference reads
reference
}
#' Create Pooled DNA sequencing data for multiple populations
#'
#' This function combines the information for each individual of each population
#' into information at the population level.
#'
#' In other words, the information of all individuals in a given population is
#' combined into a single population value and this is done for the various
#' populations. In this situation, each entry of the `indContribution` and
#' `readsReference` lists should contain one entry per population - being, in
#' essence, a list within a list. Please note that this function is intended to
#' work for multiple populations and should not be used with a single
#' population.
#'
#' @param nPops An integer representing the total number of populations in the
#' dataset.
#' @param nLoci An integer that represents the total number of independent loci
#' in the dataset.
#' @param indContribution Either a list or a matrix (when dealing with a single
#' locus).
#' @param readsReference A list, where each entry contains the information for a
#' single locus. Each list entry should then have one separate entry per
#' population. Each of these entries should be a matrix, with each row
#' corresponding to a single individual and each column a different site.
#' Thus, each entry of the matrix contains the number of observed reads with
#' the reference allele for that individual at a given site. The output of the
#' `numberReference` or `numberReferencePop` functions should be the input
#' here.
#'
#' @return a list with three names entries
#'
#' \item{reference}{a list with one entry per locus. Each entry is a matrix
#' with the number of reference allele reads for each population. Each column
#' represents a different site and each row a different population.}
#'
#' \item{alternative}{a list with one entry per locus. Each entry is a matrix
#' with the number of alternative allele reads for each population. Each
#' column represents a different site and each row a different population.}
#'
#' \item{total}{a list with one entry per locus. Each entry is a matrix with
#' the coverage of each population. Each column represents a different site
#' and each row a different population.}
#'
#' @examples
#' # simulate coverage at 5 SNPs for two populations, assuming 20x mean coverage
#' reads <- simulateCoverage(mean = c(20, 20), variance = c(100, 100), nSNPs = 5, nLoci = 1)
#'
#' # simulate the number of reads contributed by each individual
#' # for each population there are two pools, each with 5 individuals
#' indContribution <- popsReads(list_np = rep(list(rep(5, 2)), 2), coverage = reads, pError = 5)
#'
#' # set seed and create a random matrix of genotypes for the 20 individuals - 10 per population
#' set.seed(10)
#' genotypes <- matrix(rpois(100, 0.5), nrow = 20)
#'
#' # simulate the number of reference reads for the two populations
#' readsReference <- numberReferencePop(genotypes = genotypes, indContribution = indContribution,
#' size = rep(list(rep(5, 2)), 2), error = 0.01)
#'
#' # create Pooled DNA sequencing data for these two populations and for a single locus
#' poolPops(nPops = 2, nLoci = 1, indContribution = indContribution, readsReference = readsReference)
#'
#' @export
poolPops <- function(nPops, nLoci, indContribution, readsReference) {
# when dealing with a single population and a single locus
if(nPops == 1 && nLoci == 1) {
# number of reads with the alternative allele is simply the total number of reads per individual minus the number of reads
# with the reference allele for that individual
readsAlternative <- lapply(1:nLoci, function(locus)
mapply(function(individual, reference) FUN = individual - reference, SIMPLIFY = FALSE,
individual = indContribution[[locus]], reference = readsReference[[locus]]))
} else {
# when dealing with a single locus it's possible that the indContribution input is not on the correct format
if(nLoci == 1 && length(indContribution) == nPops | any(class(indContribution) != "list"))
indContribution <- list(indContribution)
# the same is true for the readsReference input
if(nLoci == 1 && length(readsReference) == nPops | any(class(readsReference) != "list"))
readsReference <- list(readsReference)
# number of reads with the alternative allele is simply the total number of reads per individual minus the number of reads
# with the reference allele for that individual
readsAlternative <- lapply(1:nLoci, function(locus)
mapply(function(individual, reference) FUN = individual - reference, SIMPLIFY = FALSE,
individual = indContribution[[locus]], reference = readsReference[[locus]]))
}
# now, since each entry (each locus) has independent entries for each population, we can simple perform colSums across the
# various entries. This will sum the number of reads (with the reference, the alternative and the total number of reads)
# across all individuals of a population
# in this way, you get the total number of reads with each allele for each population
referencePool <- lapply(readsReference, function(reference) matrix(t(sapply(reference, colSums)), nrow = nPops))
alternativePool <- lapply(readsAlternative, function(alternative) matrix(t(sapply(alternative, colSums)), nrow = nPops))
totalPool <- lapply(indContribution, function(total) matrix(t(sapply(total, colSums)), nrow = nPops))
# combine the information about the different read types into a list, create names for each entry and output that list
final_list <- list(referencePool, alternativePool, totalPool)
names(final_list)=c("reference", "alternative", "total")
return(final_list)
}
#' Define major and minor alleles
#'
#' This function checks which of the two simulated alleles (reference or
#' alternative) corresponds to the minor allele. This function can also be used
#' to remove sites according to a minor-allele reads threshold.
#'
#' More precisely, this function counts the number of reads with the reference
#' or alternative allele at each site and then sets the minor allele as the
#' least frequent of the two. This is done across all populations and so the
#' major and minor alleles are defined at a global level. Then if the
#' `min.minor` input is not NA, sites where the number of minor allele reads,
#' across all populations, is below the user-defined threshold are removed.
#'
#' @param reference is a matrix of reference allele reads. Each row of the
#' matrix should be a different population and each column a different site.
#' Thus, each entry of the matrix contains the number of observed reads with
#' the reference allele for that population at a given site.
#' @param alternative is a matrix of alternative allele reads. Each row of the
#' matrix should be a different population and each column a different site.
#' Thus, each entry of the matrix contains the number of observed reads with
#' the alternative allele for that population at a given site.
#' @param coverage is a matrix of total coverage. Each row of the matrix should
#' be a different population and each column a different site. Thus, each
#' entry of the matrix contains the total number of observed reads for that
#' population at a given site.
#'
#' @return a list with three names entries
#'
#' \item{major}{a list with one entry per locus. Each entry is a matrix with
#' the number of major allele reads for each population. Each column
#' represents a different site and each row a different population.}
#'
#' \item{minor}{a list with one entry per locus. Each entry is a matrix with
#' the number of minor allele reads for each population. Each column
#' represents a different site and each row a different population.}
#'
#' \item{total}{a list with one entry per locus. Each entry is a matrix with
#' the coverage of each population. Each column represents a different site
#' and each row a different population.}
#'
#' @examples
#' # simulate coverage at 5 SNPs for two populations, assuming 20x mean coverage
#' reads <- simulateCoverage(mean = c(20, 20), variance = c(100, 100), nSNPs = 5, nLoci = 1)
#'
#' # simulate the number of reads contributed by each individual
#' # for each population there are two pools, each with 5 individuals
#' indContribution <- popsReads(list_np = rep(list(rep(5, 2)), 2), coverage = reads, pError = 5)
#'
#' # set seed and create a random matrix of genotypes for the 20 individuals - 10 per population
#' set.seed(10)
#' genotypes <- matrix(rpois(100, 0.5), nrow = 20)
#'
#' # simulate the number of reference reads for the two populations
#' readsReference <- numberReferencePop(genotypes = genotypes, indContribution = indContribution,
#' size = rep(list(rep(5, 2)), 2), error = 0.01)
#'
#' # create Pooled DNA sequencing data for these two populations and for a single locus
#' pools <- poolPops(nPops = 2, nLoci = 1, indContribution = indContribution,
#' readsReference = readsReference)
#'
#' # define the major and minor alleles for this Pool-seq data
#' # we have to select the first entry of the pools list because this function works for matrices
#' findMinor(reference = pools$reference[[1]], alternative = pools$alternative[[1]],
#' coverage = pools$total[[1]])
#'
#' @export
findMinor <- function(reference, alternative, coverage) {
# set the output for the situations where there is no SNP at the locus
# check for NAs in one of the matrices
if(any(is.na(unlist(reference)))) {
# set the output to NA
out <- list(reference = NA, alternative = NA, total = NA)
} else {
# create two temporary matrices - one containing the reads with the reference allele for each population and across all sites
# and another with the reads with the alternative allele
Ranc <- reference; Rder <- alternative
# perform an evaluation to check if the total number or reads for each site
# is bigger in the matrix containing the "reference" allele reads or in the matrix containing the "alternative" allele
eval <- colSums(Ranc) < colSums(Rder)
# at the sites where the number of reads for the "alternative" allele is bigger than the number of reads for the "reference" allele
# replace those columns with the rows from the matrix of the "alternative" allele
reference[, eval] <- Rder[, eval]
# and then replace those same columns in the matrix with the reads of the "alternative" allele
alternative[, eval] <- Ranc[, eval]
# output the matrices with the various types of read numbers
out <- list(major = reference, minor = alternative, total = coverage)
}
# output the matrices with the various types of read numbers
out
}
#' Filter sites according to a minor-allele reads threshold
#'
#' Removes sites from matrices with counts of reads. If a site has less
#' minor-allele reads than \code{min.minor} across all populations, that site is
#' removed from the data.
#'
#' @param reference a matrix with the number of reads with the reference allele.
#' Each row should be a different population and each column a different site.
#' @param alternative a matrix with the number of reads with the alternative
#' allele. Each row should be a different population and each column a
#' different site.
#' @param coverage is a matrix of total coverage. Each row of the matrix should
#' be a different population and each column a different site. Thus, each
#' entry of the matrix contains the total number of observed reads for that
#' population at a given site.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#'
#' @return a list with three named entries:
#'
#' \item{reference}{a list with one entry per locus. Each entry is a matrix with
#' the number of reference allele reads. Each column represents a different site.}
#'
#' \item{alternative}{a list with one entry per locus. Each entry is a matrix with
#' the number of alternative allele reads. Each column represents a different site.}
#'
#' \item{total}{a list with one entry per locus. Each entry is a matrix with
#' the total depth of coverage. Each column represents a different site.}
#'
#' @keywords internal
#'
#' @export
filterMinor <- function(reference, alternative, coverage, min.minor) {
# check which of the two simulated alleles (reference or alternative) corresponds to the minor allele
tminor <- findMinor(reference = reference, alternative = alternative, coverage = coverage)
# get the total number of minor allele reads in the data
tminor <- colSums(tminor[["minor"]])
# find out in which columns the total sum of the reads with the minor allele is below the threshold
toremove <- tminor < min.minor
# if there are sites where the sum of the reads with the minor allele is below the threshold
if(sum(toremove) != 0) {
# remove those columns from the matrix containing the depth of coverage
coverage <- coverage[, !toremove, drop = FALSE]
# remove those columns from the matrix containing the number of reads with the alternative allele
alternative <- alternative[, !toremove, drop = FALSE]
# and remove those columns from the matrix containing the number of reads with the reference allele
reference <- reference[, !toremove, drop = FALSE]
}
# create the output containing the sites with reads above the min.minor threshold
out <- list(reference = reference, alternative = alternative, total = coverage)
# output the results of the function
out
}
#' Filter Pool-seq data according to a minor-allele reads threshold
#'
#' Removes sites from Pool-seq data. If a site has less minor-allele reads than
#' \code{min.minor} across all populations, that site is removed from the data.
#'
#' @param pool a list containing the "reference" element, representing the
#' number of reads with the reference allele, the "alternative" element
#' representing the number of reads with the alternative allele and the
#' "total" element that contains information about the total number of reads.
#' @param nloci an integer that represents the total number of independent loci
#' in the dataset.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#'
#' @return a list with three named entries:
#'
#' \item{reference}{a list with one entry per locus. Each entry is a matrix with
#' the number of reference allele reads. Each column represents a different site.}
#'
#' \item{alternative}{a list with one entry per locus. Each entry is a matrix with
#' the number of alternative allele reads. Each column represents a different site.}
#'
#' \item{total}{a list with one entry per locus. Each entry is a matrix with
#' the total depth of coverage. Each column represents a different site.}
#'
#' @keywords internal
#'
#' @export
filterPool <- function(pool, nloci, min.minor) {
# check if we are using the correct input
if(any(names(pool) != c("reference", "alternative", "total")))
stop("Using an incorrect Pool-seq data list. Please check!")
# filter sites according to the number of minor allele reads
# keep only sites with reads above the min.minor threshold
pool <- lapply(1:nloci, function(locus)
filterMinor(reference = pool$reference[[locus]], alternative = pool$alternative[[locus]],
coverage = pool$total[[locus]], min.minor = min.minor))
# convert the pool list back to the correct format:
# one entry for reference reads, one for alternative reads and a final one for total coverage
pool <- list(reference = lapply(pool, function(locus) locus[["reference"]]),
alternative = lapply(pool, function(locus) locus[["alternative"]]),
total = lapply(pool, function(locus) locus[["total"]]))
# output the results of the function
pool
}
#' Calculate population frequency at each SNP
#'
#' The frequency at a given SNP is calculated according to: `pi = c/r`, where c
#' = number of minor-allele reads and r = total number of observed reads.
#'
#' This function takes as input a list that contains the number of reads with
#' the minor allele and the number of total reads per population at a given
#' site. The names of the respective elements of the list should be minor and
#' total. It works with lists containing just one set of minor and total reads,
#' corresponding to a single locus, and with lists where each entry contains a
#' different set of minor and total number of reads, corresponding to different
#' loci.
#'
#' @param listPool a list containing the "minor" element, representing the
#' number of reads with the minor-allele and the "total" element that contains
#' information about the total number of reads. The list should also contain a
#' "major" entry with the information about reads containing the major-allele.
#' The output of the `poolPops` function should be used as input here.
#' @param nLoci an integer that represents the total number of independent loci
#' in the dataset.
#'
#' @return a list with two named entries
#'
#' \item{pi}{a list with the allele frequencies of each population. Each list
#' entry is a matrix, corresponding to a different locus. Each row of a matrix
#' corresponds to a different population and each column to a different site.}
#'
#' \item{pool}{a list with three different entries: major, minor and total.
#' This list is similar to the one obtained with the \code{\link{findMinor}}
#' function.}
#'
#' @examples
#' # simulate coverage at 5 SNPs for two populations, assuming 20x mean coverage
#' reads <- simulateCoverage(mean = c(20, 20), variance = c(100, 100), nSNPs = 5, nLoci = 1)
#'
#' # simulate the number of reads contributed by each individual
#' # for each population there are two pools, each with 5 individuals
#' indContribution <- popsReads(list_np = rep(list(rep(5, 2)), 2), coverage = reads, pError = 5)
#'
#' # set seed and create a random matrix of genotypes for the 20 individuals - 10 per population
#' set.seed(10)
#' genotypes <- matrix(rpois(100, 0.5), nrow = 20)
#'
#' # simulate the number of reference reads for the two populations
#' readsReference <- numberReferencePop(genotypes = genotypes, indContribution = indContribution,
#' size = rep(list(rep(5, 2)), 2), error = 0.01)
#'
#' # create Pooled DNA sequencing data for these two populations and for a single locus
#' pools <- poolPops(nPops = 2, nLoci = 1, indContribution = indContribution,
#' readsReference = readsReference)
#'
#' # define the major and minor alleles for this pool-seq data
#' # note that we have to select the first entry of the pools list
#' # because this function works for matrices
#' pools <- findMinor(reference = pools$reference[[1]], alternative = pools$alternative[[1]],
#' coverage = pools$total[[1]])
#'
#' # calculate population frequency at each SNP of this locus
#' calculatePi(listPool = pools, nLoci = 1)
#'
#' @export
calculatePi <- function(listPool, nLoci) {
# the input should be the list containing information about reads per pool that was created by the previous function
# a failsafe to detect if you're using the right list
if(("minor" %in% names(listPool) | "total" %in% names(listPool)) == FALSE)
stop(paste("Using an incorrect list"))
# it is possible that each entry of the list is a single matrix - particularly when dealing with a single locus
# if this is the case, then this step will convert those entries into lists
if(any(sapply(listPool, class) == "matrix") == TRUE)
listPool <- lapply(listPool, list)
# by doing that transformation, we can use an lapply, whether we have one locus or multiple loci
# divide the number of reads with the minor allele by the total number of reads - to obtain allelic frequencies
Site_Pi <- lapply(1:nLoci, function(locus) listPool[["minor"]][[locus]]/listPool[["total"]][[locus]])
# remove sites without information from the various categories of reads
listPool[["minor"]] <- lapply(1:nLoci, function(locus)
listPool[["minor"]][[locus]][, colSums(is.na(Site_Pi[[locus]])) == 0, drop = FALSE])
listPool[["major"]] <- lapply(1:nLoci, function(locus)
listPool[["major"]][[locus]][, colSums(is.na(Site_Pi[[locus]])) == 0, drop = FALSE])
listPool[["total"]] <- lapply(1:nLoci, function(locus)
listPool[["total"]][[locus]][, colSums(is.na(Site_Pi[[locus]])) == 0, drop = FALSE])
# remove sites without information from the matrix containing the allelic frequencies
Site_Pi <- lapply(1:nLoci, function(locus)
Site_Pi[[locus]][, colSums(is.na(Site_Pi[[locus]])) == 0, drop = FALSE])
# the output is a list that contains the information about the site frequencies
# but also information about the total number of reads and the number of reads with the major/minor alleles
list(pi = Site_Pi, pool = listPool)
}
#' Randomly select the required number of loci from the pooled sequencing data
#'
#' This function removes loci without polymorphic sites and then randomly
#' selects the required number of loci from a larger dataset.
#'
#' If extra loci were simulated to try to have sufficient loci to keep the
#' required number of loci after filtering, then this function is used to remove
#' extra loci. This is done by randomly selecting the required number of loci
#' from the full contingent of extra simulated loci.
#'
#' @param nSims is an integer representing how many types of simulations were
#' performed. The possible types of simulations include loci without barriers
#' against migration between divergent ecotypes, loci without migration from
#' the C towards the W ecotype, loci without migration from the W towards the
#' C ecotypes and loci where no migration occurs between divergent ecotypes.
#' @param pool a list containing the "minor" element, representing the number of
#' reads with the minor-allele and the "total" element that contains
#' information about the total number of Reads. The list should also contain a
#' "major" entry with the information about reads containing the major-allele.
#' @param target is a vector with the required number of loci per category. If
#' extra loci were simulated, this vector informs how many loci of each
#' simulation type should be randomly selected.
#'
#' @return a list with three named entries
#'
#' \item{major}{a list with one entry per locus. Each entry is a matrix with
#' the number of major allele reads for each population. Each column
#' represents a different site and each row a different population.}
#'
#' \item{minor}{a list with one entry per locus. Each entry is a matrix with
#' the number of minor allele reads for each population. Each column
#' represents a different site and each row a different population.}
#'
#' \item{total}{a list with one entry per locus. Each entry is a matrix with
#' the coverage of each population. Each column represents a different site
#' and each row a different population.}
#'
#' @keywords internal
#'
#' @export
forcePool <- function(nSims, pool, target) {
# check the number of columns of the matrices with the number of minor-allele reads
dimensions <- lapply(1:nSims, function(sim) sapply(pool[[sim]][["minor"]], ncol))
# we only wish to keep loci where we have at least one polymorphic site
# with this we obtain a vector that marks the loci without polymorphic sites with a FALSE
tokeep <- lapply(dimensions, function(sim) sim != 0)
# remove those loci from the data
pool <- lapply(1:nSims, function(sim) lapply(pool[[sim]], function(i) i[tokeep[[sim]]]))
# create a random vector of TRUE/FALSE to keep only the required number of loci per category
tokeep <- lapply(1:nSims, function(sim)
sample(x = c(rep(TRUE, target[sim]), rep(FALSE, length(pool[[sim]][["minor"]]) - target[sim])),
size = length(pool[[sim]][["minor"]]), replace = F))
# keep only the randomly selected loci
pool <- lapply(1:nSims, function(sim) lapply(pool[[sim]], function(i) i[tokeep[[sim]]]))
# convert the pool list back to the previous format:
# one entry for major allele, one for minor allele and a final one for total coverage
pool <- list(major = unlist(lapply(pool, function(x) x[["major"]]), recursive = FALSE),
minor = unlist(lapply(pool, function(x) x[["minor"]]), recursive = FALSE),
total = unlist(lapply(pool, function(x) x[["total"]]), recursive = FALSE))
# output the pooled sequencing data
pool
}
#' Simulate Pool-seq data
#'
#' Simulates pooled sequencing data given a set of parameters and individual
#' genotypes.
#'
#' Note that this functions allows for different combinations of parameters.
#' Thus, Pool-seq data can be simulated for a variety of parameters. For
#' instance, different mean depths of coverage can be used to simulate Pool-seq
#' data. It is also possible to simulate Pool-seq data using different pool
#' sizes (by changing the \code{pools} input) and different values of the
#' Pool-seq error parameter (\code{pError}).
#'
#' @param genotypes a list of genotypes, where each entry is a matrix
#' corresponding to a different locus. At each matrix, each column is a
#' different SNP and each row is a different individual. Genotypes should be
#' coded as 0, 1 or 2.
#' @param pools a list with a vector containing the size (in number of diploid
#' individuals) of each pool. Thus, if a population was sequenced using a
#' single pool, the vector should contain only one entry. If a population was
#' sequenced using two pools, each with 10 individuals, this vector should
#' contain two entries and both will be 10.
#' @param pError an integer representing the value of the error associated with
#' DNA pooling. This value is related with the unequal contribution of both
#' individuals and pools towards the total number of reads observed for a
#' given population - the higher the value the more unequal are the individual
#' and pool contributions.
#' @param sError a numeric value with error rate associated with the sequencing
#' and mapping process. This error rate is assumed to be symmetric:
#' error(reference -> alternative) = error(alternative -> reference). This
#' number should be between 0 and 1.
#' @param mCov an integer that defines the mean depth of coverage to simulate.
#' Please note that this represents the mean coverage across all sites.
#' @param vCov an integer that defines the mean depth of coverage to simulate.
#' Please note that this represents the mean coverage across all sites.
#' @param min.minor is an integer representing the minimum allowed number of
#' minor-allele reads. Sites that, across all populations, have less
#' minor-allele reads than this threshold will be removed from the data.
#' @param minimum an optional integer representing the minimum coverage allowed.
#' Sites where the population has a depth of coverage below this threshold are
#' removed from the data.
#' @param maximum an optional integer representing the maximum coverage allowed.
#' Sites where the population has a depth of coverage above this threshold are
#' removed from the data.
#'
#' @return a list with three named entries:
#'
#' \item{reference}{a list with one entry per locus. Each entry is a matrix with
#' the number of reference allele reads. Each column represents a different site.}
#'
#' \item{alternative}{a list with one entry per locus. Each entry is a matrix with
#' the number of alternative allele reads. Each column represents a different site.}
#'
#' \item{total}{a list with one entry per locus. Each entry is a matrix with
#' the total depth of coverage. Each column represents a different site.}
#'
#' @examples
#' # simulate Pool-seq data for 100 individuals sampled at a single locus
#' genotypes <- run_scrm(nDip = 100, nloci = 1, theta = 5)
#' # simulate Pool-seq data assuming a coverage of 100x and two pools of 50 individuals each
#' simPoolseq(genotypes = genotypes, pools = c(50, 50), pError = 100, sError = 0.001,
#' mCov = 100, vCov = 250, min.minor = 0)
#'
#' # simulate Pool-seq data for 10 individuals sampled at 5 loci
#' genotypes <- run_scrm(nDip = 10, nloci = 5, theta = 5)
#' # simulate Pool-seq data assuming a coverage of 100x and a single pool of 10 individuals
#' simPoolseq(genotypes = genotypes, pools = 10, pError = 100, sError = 0.001,
#' mCov = 100, vCov = 250, min.minor = 0)
#'
#' @export
simPoolseq <- function(genotypes, pools, pError, sError, mCov, vCov, min.minor, minimum = NA, maximum = NA) {
# check if the input is correct - genotypes should always be supplied as a list
if(any(class(genotypes) != "list"))
stop(paste("genotypes should be supplied on a list format, with each entry corresponding to a locus. Please check"))
# check if the input is correct - pools should be a list
if(any(class(pools) != "list"))
pools <- list(pools)
# get the number of loci
nloci <- length(genotypes)
# get the number of individuals
nDip <- sum(unlist(pools))
# simulate number of reads
reads <- simulateCoverage(mean = mCov, variance = vCov, genotypes = genotypes)
# if the minimum coverage is defined
if(!is.na(minimum)) {
# check if the maximum coverage is also defined
if(is.na(maximum))
stop("please define the maximum coverage")
# remove sites with a depth of coverage above or below the defined threshold
reads <- remove_by_reads(nLoci = nloci, reads, minimum = minimum, maximum = maximum, genotypes = genotypes)
# get the genotypes - without sites simulated with a coverage below or above the threshold
genotypes <- lapply(reads, function(locus) locus[[2]])
# check the dimensions of the matrices with the genotypes
dimensions <- matrix(unlist(lapply(genotypes, dim)), ncol = 2, byrow = TRUE)
# we only wish to keep the locus where we have at least one polymorphic site
tokeep <- dimensions[, 2] != 0
# remove all loci without polymorphic sites
genotypes <- genotypes[tokeep]
# get the reads - without sites simulated with a coverage below or above the threshold
reads <- lapply(reads, function(locus) locus[[1]])
# use the same index to remove entries of the reads list that correspond to locus without polymorphic sites
reads <- reads[tokeep]
# ensure that each entry is a matrix
reads <- lapply(reads, function(locus) matrix(locus, nrow = 1))
}
# simulate individual contribution to the total number of reads
indContribution <- lapply(1:nloci, function(locus)
popsReads(list_np = pools, coverage = reads[[locus]], pError = pError))
# simulate the number of reference reads
reference <- lapply(1:nloci, function(locus)
numberReferencePop(genotypes = genotypes[[locus]], indContribution = indContribution[[locus]],
size = pools, error = sError))
# simulate pooled sequencing data
pool <- poolPops(nPops = 1, nLoci = nloci, indContribution = indContribution, readsReference = reference)
# if a minimum minor-allele reads threshold is defined
if(min.minor != 0)
pool <- filterPool(pool = pool, nloci = nloci, min.minor = min.minor)
# output Pool-seq data
pool
}
#' Create vcf string for a single SNP
#'
#' Creates a string with the information for a single SNP. The information is
#' coded as R,A:DP. R is the number of reads of the reference allele, A is the
#' number of reads of the alternative allele and DP is the total depth of
#' coverage.
#'
#' @param reference an integer representing the number of reads with the
#' reference allele
#' @param alternative an integer representing the number of reads with the
#' alternative allele
#' @param total an integer representing the total number of reads observed at
#' this SNP.
#'
#' @return a character string coded as R,A:DP.
#'
#' @keywords internal
#'
#' @export
strg2vcf <- function(reference, alternative, total) {
# create a string with the information for the vcf file
paste(paste(reference, alternative, sep = ","), ":", total, sep = "")
}
#' Create vcf string for all SNPs in a single locus
#'
#' Creates a string with the information for all SNPs. The information is coded
#' as R,A:DP. R is the number of reads of the reference allele, A is the number
#' of reads of the alternative allele and DP is the total depth of coverage.
#' Each entry of the character string corresponds to a different SNP.
#'
#' @param reference is a vector with the number of reads with the reference
#' allele. Each entry of the vector corresponds to a different SNP.
#' @param alternative is a vector with the number of reads with the alternative
#' allele. Each entry of the vector corresponds to a different SNP.
#' @param total is a vector with the total number of reads observed at each SNP.
#' Each entry of the vector corresponds to a different SNP.
#'
#' @return is a character vector with as many entries as the number of SNPs in
#' the locus. Each entry of this character vector contains the information for
#' a single SNP coded as R,A:DP.
#'
#' @keywords internal
#'
#' @export
vcflocus <- function(reference, alternative, total) {
# check if the input is a single locus or not
if(any(class(reference) == "list") && length(reference) != 1)
stop("Data should be a single locus. Please check")
# check if the reference input is a list and unlist it if it is
if(any(class(reference) == "list"))
reference <- unlist(reference)
# do the same for the alternative input
if(any(class(alternative) == "list"))
alternative <- unlist(alternative)
# and for the total input
if(any(class(total) == "list"))
total <- unlist(total)
# get the number of SNPs in this locus
nSNP <- length(reference)
# create a string with the information in the format for the vcf file for each SNP
out <- sapply(1:nSNP, FUN = function(i)
strg2vcf(reference = reference[i], alternative = alternative[i], total = total[i]))
# output the string for each SNP
out
}
#' Create vcf string for all SNPs in multiple loci
#'
#' Creates a string with the information for all SNPs across multiple loci. The
#' information is coded as R,A:DP. R is the number of reads of the reference
#' allele, A is the number of reads of the alternative allele and DP is the
#' total depth of coverage. Each entry of the character string corresponds to a
#' different SNP and each entry of the list to a different locus.
#'
#' @param reference is a list where each entry corresponds to a different locus.
#' Each list entry is a vector with the number of reads with the reference
#' allele. Each entry of the vector corresponds to a different SNP.
#' @param alternative is a list where each entry corresponds to a different
#' locus. Each list entry is a vector with the number of reads with the
#' alternative allele. Each entry of the vector corresponds to a different
#' SNP.
#' @param total is a list where each entry corresponds to a different locus.
#' Each list entry is a vector with the total number of reads observed at each
#' SNP. Each entry of the vector corresponds to a different SNP.
#'
#' @return is a list where each entry corresponds to a different locus. Each
#' entry of the list is a character vector with as many entries as the number
#' of SNPs in the locus. Each entry of this character vector contains the
#' information for a single SNP coded as R,A:DP.
#'
#' @keywords internal
#'
#' @export
vcfloci <- function(reference, alternative, total) {
# check if the input contains more than one locus or not
if(any(class(reference) == "list") && length(reference) == 1)
stop("Data should include more than one locus. Please check")
# get the number of loci in the input
nloci <- length(reference)
# create a string with the information in the format for the vcf file for each loci and for each SNP
out <- sapply(1:nloci, FUN = function(i)
vcflocus(reference = reference[i], alternative = alternative[i], total = total[i]))
# output the string for each loci and each SNP
out
}
#' Create vcf table with relevant information
#'
#' Creates a data frame in the VCF format for all SNPs and across all loci in
#' the data set.
#'
#' This function combines the information coded as R,A:DP with other necessary
#' information such as the chromosome of each SNP, the position of the SNP and
#' the quality of the genotype among others. Note that in the character string,
#' R is the number of reads of the reference allele, A is the number of reads of
#' the alternative allele and DP is the total depth of coverage. Each row of the
#' data frame corresponds to a different SNP.
#'
#' @param string is a character vector or a list where each entry contains a
#' character vector for a different locus. Each entry of this character vector
#' contains the information for a single SNP coded as R,A:DP. The output of
#' the \code{\link{vcflocus}} or \code{\link{vcfloci}} is the intended input
#' here.
#' @param pos is an optional input (default is NULL). If the actual position of
#' the SNPs are known, they can be used as input here. When working with a
#' single locus, this should be a numeric vector with each entry corresponding
#' to the position of each SNP. If the data has multiple loci, this should be
#' a list where each entry is a numeric vector with the position of the SNPs
#' for a different locus.
#'
#' @return a data frame with 10 different columns
#'
#' \item{chr}{Chromosome. Each locus is treated as different linkage group.}
#'
#' \item{pos}{Co-ordinate. The coordinate of the SNP.}
#'
#' \item{ID}{Identifier.}
#'
#' \item{REF}{Reference allele. We assume that the reference allele is always
#' an A. Note that this is not necessarily the major allele.}
#'
#' \item{ALT}{Alternative allele. We assume that the alternative allele is always
#' a T.}
#'
#' \item{QUAL}{Quality score out of 100. We assume that this score is always
#' 100.}
#'
#' \item{FILTER}{If this SNP passed quality filters.}
#'
#' \item{INFO}{Further information. Provides further information on the
#' variants.}
#'
#' \item{FORMAT}{Information about the following columns. This column tells us
#' how the number of reads is coded in the next column.}
#'
#' \item{pop1}{Number of reference-allele reads, alternative-allele reads and
#' total depth of coverage observed for this population at this SNP.}
#'
#' @keywords internal
#'
#' @export
vcfinfo <- function(string, pos = NULL) {
# check if the string input is not a list - this should mean that we are dealing with a single locus
if(any(class(string) != "list")) {
# create the string with the chromosome information - this single locus corresponds to linkage group (LG) 1
chr <- rep("LG1", length(string))
# if no SNP positions are supplied as input
if(is.null(pos))
pos <- 1:length(string) # create a string with the information about the position of each SNP
# create a string with the ID information
ID <- rep(".", length(string))
# create a string with the base of the reference allele
REF <- rep("A", length(string))
# create a string with the base of the alternative allele
ALT <- rep("T", length(string))
# create a string with the quality information
QUAL <- rep(100, length(string))
# create a string with the filter information
FILTER <- rep(".", length(string))
# create a string with the INFO
INFO <- rep(".", length(string))
# create a string with the FORMAT
FORMAT <- rep("AD:DP", length(string))
# create a data frame with the information
out <- data.frame(chr = chr, pos = pos, ID = ID, REF = REF, ALT = ALT, QUAL = QUAL, FILTER = FILTER,
INFO = INFO, FORMAT = FORMAT, pop1 = string)
}
# check if the string input is a list - this should mean that we are dealing with multiple loci
if(any(class(string) == "list")) {
# get the number of loci in the input
nloci <- length(string)
# create the string with the chromosome information - each locus corresponds to a linkage group (LG)
chr <- sapply(1:nloci, function(l) paste(rep("LG", length(string[[l]])), l, sep = ""))
# if no SNP positions are supplied as input
if(is.null(pos))
pos <- sapply(1:nloci, function(l) 1:sapply(string, length)[l])
# create a string with the ID information
ID <- sapply(1:nloci, function(l) rep(".", length(string[[l]])))
# create a string with the base of the reference allele
REF <- sapply(1:nloci, function(l) rep("A", length(string[[l]])))
# create a string with the base of the alternative allele
ALT <- sapply(1:nloci, function(l) rep("T", length(string[[l]])))
# create a string with the quality information
QUAL <- sapply(1:nloci, function(l) rep(100, length(string[[l]])))
# create a string with the filter information
FILTER <- sapply(1:nloci, function(l) rep(".", length(string[[l]])))
# create a string with the INFO
INFO <- sapply(1:nloci, function(l) rep(".", length(string[[l]])))
# create a string with the FORMAT
FORMAT <- sapply(1:nloci, function(l) rep("AD:DP", length(string[[l]])))
# create a data frame with the information
out <- data.frame(chr = unlist(chr), pos = unlist(pos), ID = unlist(ID), REF = unlist(REF), ALT = unlist(ALT), QUAL = unlist(QUAL),
FILTER = unlist(FILTER), INFO = unlist(INFO), FORMAT = unlist(FORMAT), pop1 = unlist(string))
}
# output the vcf string together with the remaining information
out
}
#' Create VCF file from Pool-seq data
#'
#' Creates and saves a file with the information from Pool-seq data coded in the
#' VCF format.
#'
#' It starts by converting the number of reads with the \code{reference} allele,
#' the \code{alternative} allele and the \code{total} depth of coverage to a
#' R,A:DP string. R is the number of reads of the reference allele, A is the
#' number of reads of the alternative allele and DP is the total depth of
#' coverage.
#'
#' Then, this information coded as R,A:DP is combined with other necessary
#' information such as the chromosome of each SNP, the position of the SNP and
#' the quality of the genotype among others. This creates a data frame where
#' each row corresponds to a different SNP.
#'
#' A \code{file} is then created and saved in the current working directory,
#' with the header lines that go above the table in a VCF file. Finally, the
#' data frame is appended to that file.
#'
#' @param reference is a list where each entry corresponds to a different locus.
#' Each list entry is a vector with the number of reads with the reference
#' allele. Each entry of the vector corresponds to a different SNP. This list
#' can have a single entry if the data is comprised of a single locus.
#' @param alternative is a list where each entry corresponds to a different
#' locus. Each list entry is a vector with the number of reads with the
#' alternative allele. Each entry of the vector corresponds to a different
#' SNP. This list can have a single entry if the data is comprised of a single
#' locus.
#' @param total is a list where each entry corresponds to a different locus.
#' Each list entry is a vector with the total number of reads observed at each
#' SNP. Each entry of the vector corresponds to a different SNP. This list can
#' have a single entry if the data is comprised of a single locus.
#' @param file is a character string naming the file to write to.
#' @param pos is an optional input (default is NULL). If the actual position of
#' the SNPs are known, they can be used as input here. When working with a
#' single locus, this should be a numeric vector with each entry corresponding
#' to the position of each SNP. If the data has multiple loci, this should be
#' a list where each entry is a numeric vector with the position of the SNPs
#' for a different locus.
#'
#' @return a file in the current working directory containing Pool-seq data in
#' the VCF format.
#'
#' @examples
#' # simulate Pool-seq data for 100 individuals sampled at a single locus
#' genotypes <- run_scrm(nDip = 100, nloci = 1, theta = 5)
#' # simulate Pool-seq data assuming a coverage of 100x and two pools of 50 individuals each
#' pool <- simPoolseq(genotypes = genotypes, pools = c(50, 50), pError = 100, sError = 0.001,
#' mCov = 100, vCov = 250, min.minor = 0)
#' # create a vcf file of the simulated data - this will create a txt file
#' # pool2vcf(reference = pool$reference, alternative = pool$alternative,
#' # total = pool$total, file = "myvcf.txt")
#'
#' # simulate Pool-seq data for 10 individuals sampled at 5 loci
#' genotypes <- run_scrm(nDip = 10, nloci = 5, theta = 5)
#' # simulate Pool-seq data assuming a coverage of 100x and a single pool of 10 individuals
#' pool <- simPoolseq(genotypes = genotypes, pools = 10, pError = 100, sError = 0.001,
#' mCov = 100, vCov = 250, min.minor = 0)
#'
#' # create a vcf file of the simulated data - this will create a txt file
#' # pool2vcf(reference = pool$reference, alternative = pool$alternative,
#' # total = pool$total, file = "myvcf.txt")
#'
#' @export
pool2vcf <- function(reference, alternative, total, file, pos = NULL) {
# check if the input has a single locus and use the appropriate function
if(any(class(reference) == "list") && length(reference) == 1)
vcftemp <- vcflocus(reference, alternative, total)
# check if the input has multiple loci and use the appropriate function
if(any(class(reference) == "list") && length(reference) != 1)
vcftemp <- vcfloci(reference, alternative, total)
# create a VCF table with relevant information
vcftemp <- vcfinfo(string = vcftemp, pos = pos)
# create the header lines to be placed above the table
myformat <- paste("##fileformat=VCFv4.1
##source=\"PoolHelper R package simulation of PoolSeq data - reference coded always as A, alternative as T. QUAL set to 100\"
##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tpop1", sep = "")
# write the header of vcf file
write(x = myformat, file = file)
# append the info about each site for vcf file
utils::write.table(x = vcftemp, file = file, append = TRUE, quote = FALSE, row.names = FALSE, col.names = FALSE)
}
#' Create sync string for a single SNP
#'
#' Creates a string with the information for a single SNP. The information is
#' coded as A-count:T-count:C-count:G-count:N-count:deletion-count. Note that we
#' assume that the reference allele is always A and the alternative is always T.
#'
#' @param reference an integer representing the number of reads with the
#' reference allele.
#' @param alternative an integer representing the number of reads with the
#' alternative allele.
#'
#' @return a character string coded as
#' A-count:T-count:C-count:G-count:N-count:deletion-count.
#'
#' @keywords internal
#'
#' @export
strg2sync <- function(reference, alternative) {
# create a string with the information for the sync file
paste(reference, alternative, 0, 0, 0, 0, sep = ":")
}
#' Create 'synchronized' file from Pool-seq data
#'
#' Creates and saves a file with the information from Pool-seq data coded in the
#' 'synchronized' format.
#'
#' It starts by converting the number of reads with the \code{reference} allele
#' and the \code{alternative} allele to a
#' A-count:T-count:C-count:G-count:N-count:deletion-count string. Here, we
#' assume that the reference allele is always A and the alternative is always T.
#'
#' Then, this A-count:T-count:C-count:G-count:N-count:deletion-count string is
#' combined with other necessary information such as the chromosome of each SNP,
#' the position of the SNP and the reference character. This step creates a data
#' frame where each row corresponds to a different SNP.
#'
#' A \code{file} is then created and saved in the current working directory,
#' with the Pool-seq data coded in the 'synchronized' file format.
#'
#' @param reference is a list where each entry corresponds to a different locus.
#' Each list entry is a vector with the number of reads with the reference
#' allele. Each entry of the vector corresponds to a different SNP. This list
#' can have a single entry if the data is comprised of a single locus.
#' @param alternative is a list where each entry corresponds to a different
#' locus. Each list entry is a vector with the number of reads with the
#' alternative allele. Each entry of the vector corresponds to a different
#' SNP. This list can have a single entry if the data is comprised of a single
#' locus.
#' @param file is a character string naming the file to write to.
#' @param pos is an optional input (default is NULL). If the actual position of
#' the SNPs are known, they can be used as input here. When working with a
#' single locus, this should be a numeric vector with each entry corresponding
#' to the position of each SNP. If the data has multiple loci, this should be
#' a list where each entry is a numeric vector with the position of the SNPs
#' for a different locus.
#'
#' @return a file in the current working directory containing Pool-seq data in
#' the 'synchronized' format.
#'
#' @examples
#' # simulate Pool-seq data for 100 individuals sampled at a single locus
#' genotypes <- run_scrm(nDip = 100, nloci = 1, theta = 5)
#' # simulate Pool-seq data assuming a coverage of 100x and two pools of 50 individuals each
#' pool <- simPoolseq(genotypes = genotypes, pools = c(50, 50), pError = 100, sError = 0.001,
#' mCov = 100, vCov = 250, min.minor = 0)
#' # create a 'synchronized' file of the simulated data - this will create a txt file
#' # pool2sync(reference = pool$reference, alternative = pool$alternative, file = "mysync.txt")
#'
#' # simulate Pool-seq data for 10 individuals sampled at 5 loci
#' genotypes <- run_scrm(nDip = 10, nloci = 5, theta = 5)
#' # simulate Pool-seq data assuming a coverage of 100x and a single pool of 10 individuals
#' pool <- simPoolseq(genotypes = genotypes, pools = 10, pError = 100, sError = 0.001,
#' mCov = 100, vCov = 250, min.minor = 0)
#'
#' # create a 'synchronized' file of the simulated data - this will create a txt file
#' # pool2sync(reference = pool$reference, alternative = pool$alternative, file = "mysync.txt")
#'
#' @export
pool2sync <- function(reference, alternative, file, pos = NULL) {
# check if the input is a single locus or not
if(any(class(reference) == "list") && length(reference) == 1) {
# check if the reference input is a list and unlist it if it is
if(any(class(reference) == "list"))
reference <- unlist(reference)
# do the same for the alternative input
if(any(class(alternative) == "list"))
alternative <- unlist(alternative)
# create the string with the chromosome information - this single locus corresponds to linkage group (LG) 1
chr <- rep("LG1", length(reference))
# if no SNP positions are supplied as input
if(is.null(pos))
pos <- 1:length(reference) # create a string with the information about the position of each SNP
# create a string with the base of the reference allele
REF <- rep("A", length(reference))
# code the allele frequencies in the sync format
freqs <- sapply(1:length(reference), function(i) strg2sync(reference = reference[i], alternative = alternative[i]))
# create a data frame with the reference contig, position in the reference contig, reference character and allele frequencies
mydf <- data.frame(chr, pos, REF, freqs)
}
# if the input has more than one locus
if(any(class(reference) == "list") && length(reference) != 1) {
# get the number of loci in the input
nloci <- length(reference)
# create the string with the chromosome information - each locus corresponds to a linkage group (LG)
chr <- unlist(sapply(1:nloci, function(l) paste(rep("LG", length(reference[[l]])), l, sep = "")))
# if no SNP positions are supplied as input
if(is.null(pos))
pos <- unlist(sapply(1:nloci, function(l) 1:sapply(reference, length)[l]))
# create a string with the base of the reference allele
REF <- unlist(sapply(1:nloci, function(l) rep("A", length(reference[[l]]))))
# code the allele frequencies in the sync format
freqs <- unlist(sapply(1:nloci, function(l) sapply(1:length(reference[[l]]), function(i)
strg2sync(reference = reference[[l]][i], alternative = alternative[[l]][i]))))
# create a data frame with the reference contig, position in the reference contig,
# reference character and allele frequencies
mydf <- data.frame(chr, pos, REF, freqs)
}
# write the Pool-seq data in the 'synchronized' format
utils::write.table(x = mydf, file = file, quote = FALSE, row.names = FALSE, col.names = FALSE)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.