#' Read in arp files of single DNA Loci
#'
#' This function reads in .arp files from fsc26 and creates DNAbin
#' objects for each simulated population
#'
#' @param file PATH to .arp file
#'
#' @export
read.arp <- function(file) {
arp <- scan(file, what = "raw", sep = "\n", strip.white = T, quiet = T) # Read in file, file becomes a vector where each line is an entry as a string
Nsamp <- as.numeric(gsub("NbSamples=", "", arp[grep("NbSamples", arp)])) # Calculate the number of samples by finding the line in the .arp file and getting rid of everything but the number
#print(Nsamp)
Nsite <- as.numeric(gsub(".+polymorphic sites: ", "", arp[grep("polymorphic sites", arp)])) # Same as above but to find the number of polymorphic sites
#print(paste("Nsite = ", Nsite))
if(Nsamp == 1) {
linenum <- grep(paste0("SampleName=\"Sample ", 1), arp) # Find the line number to start on for the sample
N <- as.numeric(gsub("SampleSize=", "", arp[linenum+1])) # Find out how many sequences are in the sample
out <- matrix(nrow = N, ncol = Nsite)
for(j in 1:N) { # Loop through the number of sequences in the sample
index <- grep(paste0(1,"_",j,"\\t"), arp) # Find the line with the desired sequence
#print(paste("Index = ", index))
seq <- gsub(paste0(".+\\t"), "", arp[index]) # Extract the sequence as a single character string
#print(seq)
seq <- unlist(strsplit(seq, "")) # Split the sequence into a vector where each element is a nucleotide
#print(length(seq))
out[j,] <- seq # Add that sequence to the matrix
}
out <- ape::as.DNAbin(out)
return(out)
} else {
out.list <- list() # Initialize listfor output
for(i in 1:Nsamp) { # Loop through the number of samples
linenum <- grep(paste0("SampleName=\"Sample ", i), arp) # Find the line number to start on for the sample
N <- as.numeric(gsub("SampleSize=", "", arp[linenum+1])) # Find out how many sequences are in the sample
out.list[[i]] <- matrix(nrow = N, ncol = Nsite) # Initialize matrix for sample sequences
#print(out.list)
for(j in 1:N) { # Loop through the number of sequences in the sample
index <- grep(paste0(i,"_",j,"\\t"), arp) # Find the line with the desired sequence
#print(paste("Index = ", index))
seq <- gsub(paste0(".+\\t"), "", arp[index]) # Extract the sequence as a single character string
#print(seq)
seq <- unlist(strsplit(seq, "")) # Split the sequence into a vector where each element is a nucleotide
#print(length(seq))
out.list[[i]][j,] <- seq # Add that sequence to the matrix
}
rownames(out.list[[i]]) <- paste0(i,"-",1:N) # Add sequence names (following that of the fastsimcoal2 output)
}
samples <- paste0(rep("sample",Nsamp),".",1:Nsamp) # Make vector of names for list
#nam <- c(samples, "full") # Same as above
#out.list[[Nsamp+1]] <- do.call(rbind, out.list) # Add another matrix including all sequences
#names(out.list) <- nam # Give the matrix objects names
#out <- lapply(out.list, as.DNAbin) # Convert the matrices into DNABIN objects
names(out.list) <- samples
return(out.list)
#names(out) <- samples
#return(out)
}
}
#' Read in a .param file
#'
#' @param path PATH to .param file
#'
#' @export
read.param <- function(path) {
input <- readLines(path)
for(i in 1:length(input)) {
if(i == 1) {
header <- unlist(strsplit(input[i], "\t"))
print(header)
rmv <- grep("Max", header)
header <- header[-rmv]
out <- matrix(nrow = length(input) - 1, ncol = length(header))
} else {
inline <- as.numeric(unlist(strsplit(input[i], "\t")))
out[i - 1, ] <- inline
}
}
colnames(out) <- header
out <- as.data.frame(out)
out
}
#' Concatenate sequences into a vector from matrix
#'
#' @param x A matrix of a multiple sequence alignment where each column is a base
#' pair and each row is a sequence
#'
#' @export
concat_seqs <- function(x) {
y <- apply(x, 1, function(x) {paste(x, collapse = "")})
return(y)
}
#' Function to calculate the number of shared haplotypes between populations
#'
#' @param x A list of matrices of multiple sequence alignments or DNAbins
#'
#' @export
sharedhaps <- function(x) {
t <- lapply(x, fscSS::concat_seqs) # Turn matrices into vectors where each element is a single sequence and put all in a list
combs <- utils::combn(t, 2, simplify = F) # Get all the pairwise combinations between populations
all.inter <- lapply(combs, function(x) intersect(x[[1]], x[[2]])) # Find the intersection between all pairwise combinations
out.un <- unlist(all.inter) # Make all shared haplotypes into a single vector
shared.haps <- intersect(out.un, out.un) # Find which haplotypes are shared
#print(paste0("Number of Shared Haplotypes = ", length(shared.haps)), quote = F)
seqs <- unlist(t) # Put all sequences into one vector
counts <- table(seqs) # Count how many times each haplotype occurs
shared <- match(shared.haps, names(counts)) # Get counts of only shared haplotypes
freqs <- as.vector(counts[shared]) # Same as above but convert to normal vector
freqs <- freqs/length(seqs) # Calculate the percentage of all sequences the shared haplotypes exhibit
#print(paste0("Percent Sequences Shared = ", sum(freqs)), quote = F)
return(list(length(shared.haps), sum(freqs)))
}
#' Calculate the average number of nucleotide differences (Tajima 1983 eq. A3, Nei 1987 eq. 10.6)
#'
#' @param x An object of class DNAbin or a matrix where rows are sequences and columns are nucleotide positions (assumed to be already aligned)
#'
#' @export
calc_pi <- function(x) {
differ <- function(x, y) {
n <- which(x != y)
return(length(n))
}
l <- split(x, row(x))
d <- sapply(l, function(m) sapply(l, function(n) differ(m, n)))
d[upper.tri(d)] <- 0
if(length(l) == 0) {
ave.pi <- 0
} else {
ave.pi <- sum(d/choose(length(l), 2))
}
return(ave.pi)
}
#' Function to calculate the sampling variance of pi (Nei 1987, eq. 10.7; from source code of DNAsp)
#'
#' @param x An object of class DNAbin or a matrix where rows are sequences and columns are nucleotide positions (assumed to be already aligned)
#' @param nsites Optional integer specifying the total number of nucleotide sites in the alignment if fastsimcoal2 does not output all sites
#' @export
calc_var_pi <- function(x, nsites = NULL) {
differ <- function(x, y) {
n <- which(x != y)
return(length(n))
}
n <- nrow(x)
if(is.null(nsites)) {
nsite <- ncol(x)
} else {
nsite <- nsites
}
kba <- 0
for(i in 1:(n-1)) {
for(j in (i+1):n) {
count <- differ(x[i,], x[j,])
kba <- kba + count
}
}
kba <- kba / (n * (n - 1)/2)
var1 = 1
var1 <- (kba * kba * (n-1) * (n-1)) / (var1 * 4 * nsite * nsite * n * n)
var2 = 0
for(i in 1:n) {
for(j in 1:n) {
if(i != j) {
for(k in j:n) {
count_ij <- differ(x[i,], x[j,])
count_ik <- differ(x[i,], x[k,])
count <- count_ij*count_ik
if(j != k) {
count <- count * 2
}
var2 <- var2 + count
}
}
}
}
var2 <- var2 * (1 / (nsite * nsite))
var2 <- var2 * (1 / (n * n * n))
var2 <- var2 * (n - 2)
var3 <- 0
for(i in 1:(n-1)) {
for(j in (i+1):n) {
count <- differ(x[i,], x[j,])
count <- count * count
var3 <- var3 + count
}
}
var3 <- var3 / (nsite * nsite)
var3 <- var3 * (1 / (n * n))
var_pi = (4 / ((n) * (n - 1))) * (((6 - (4 * n)) * var1) + var2 + var3)
return(var_pi)
}
#' Function to calculate the sampling variance of pi (Nei 1987, eq. 10.7)
#'
#' @param x An object of class DNAbin or a matrix where rows are sequences and columns are nucleotide positions (assumed to be already aligned)
#' @param nsites Optional integer specifying the total number of nucleotide sites in the alignment if fastsimcoal2 does not output all sites
#' @export
calc_var_pi2 <- function(x, nsites = NULL) { # New Var_pi function
if(is.null(nsites)) {
nsites <- ncol(x)
} else {
nsites <- nsites
}
# Get the unique sequences and the number of unique sequences
haps <- table(apply(x, 1, paste, collapse = ''))
# The line above creates a named vector, where the names are the sequences themselves
# and the values are the number of times that sequence occurs
# for this example it looks like:
# AATCCG AATCGG ATTCCG
# 2 2 2
# This named vector can act like a dictionary where the name is the key and
# the values are the values
# Calculate the frequencies of each unique sequence
# The output of this is the same as above but instead of '2's as values
# it is the frequencies (in this case 1/3)
freqs <- haps/nrow(x)
calcs <- 0 # initialize a count for var1
var3 <- 0 # initialize a count for var3
# The following loops calculate var1 and var3 for these loops the equation specifies
# that i<j, so we only run this loop through the combinations where i<j
# in our example those permutations are (2,1) (3,1) (3,2)
for(i in 2:length(haps)) {
for(j in 1:(i-1)) {
#print(paste(i, j))
# calculates pi_ij in the equation
pij <- length(which(unlist(strsplit(names(haps)[i], '')) != unlist(strsplit(names(haps)[j], ''))))/nsites
#freq[i] = x_; freqs[j] = x_j; add the sum to the total sum
calcs <- calcs + (freqs[i]*freqs[j]*pij)
# For var3 pij needs to be squared, but is otherwise the same as var1
var3 <- var3 + (freqs[i]*freqs[j]*pij^2)
}
}
# var1 needs to be squared as in the equation
var1 <- calcs^2
# This next line creates a matrix listing all the permutations that need to be
# calculated for var2 (i.e. the list of i's, j's, and k's). This looks like:
# 1 1 1
# 1 1 2
# 1 1 3
# ...
# 3 3 2
# 3 3 3
perms <- gtools::permutations(length(haps), 3, repeats.allowed = TRUE)
var2 = 0 # initialize the count for var2
# I loop through the matrix of permutations and use them as indices
# for the i's, j's, k's
for(i in 1:nrow(perms)) {
# Calculate the number of differences between x_i and x_j
countij <- length(which(unlist(strsplit(names(haps)[perms[i,1]], '')) != unlist(strsplit(names(haps)[perms[i,2]], ''))))
# Calculate the number of differences between x_i and x_k
countik <- length(which(unlist(strsplit(names(haps)[perms[i,1]], '')) != unlist(strsplit(names(haps)[perms[i,3]], ''))))
# Divide these counts by the length of the sequences
countij <- countij/nsites
countik <- countik/nsites
# freqs[perms[i,1]] = x_i; freqs[perms[i,2]] = x_j; freqs[perms[i,3]] = x_k
var2 <- var2 + freqs[perms[i,1]]*freqs[perms[i,2]]*freqs[perms[i,3]]*countij*countik
}
# Multiply by n-2 per the equation
var2 <- var2 * (nrow(x) - 2)
# Remove the names that R gave the variables
names(var1) <- NULL
names(var2) <- NULL
names(var3) <- NULL
#print(paste("var1 =", var1))
#print(paste("var2 =", var2))
#print(paste("var3 =", var3))
# Put the equation all together
var_pi <- (4/(nrow(x)*(nrow(x)-1)))*((6-4*nrow(x))*var1+var2+var3)
return(var_pi)
}
#' Function to calculate the average number of nucleotide differences (dxy) between two separate populations (wakeley)
#'
#' @param l List with objects of class DNAbin or matrices where rows are sequences and columns are nucleotide positions from two different populations
#'
#' @export
calc_dxy <- function (l) {
differ <- function(x, y) {
n <- which(x != y)
return(length(n))
}
nPOP <- length(l)
combs <- utils::combn(nPOP, 2)
seqcheck <- sum(unlist(lapply(l, length)))
nms <- c()
if(seqcheck == 0) {
for(i in 1:ncol(combs)) {
nms[i] <- paste0("d", combs[1, i], combs[2, i])
}
out <- rep(0, nPOP)
names(out) <- nms
return(out)
} else {
out <- c(length = ncol(combs))
nms <- c()
for (i in 1:ncol(combs)) {
pop1 <- l[[combs[1, i]]]
pop2 <- l[[combs[2, i]]]
pop1 <- split(pop1, row(pop1))
pop2 <- split(pop2, row(pop2))
d <- sapply(pop1, function(m) sapply(pop2, function(n) differ(m,n)))
out[i] <- sum(d)/length(d)
nms[i] <- paste0("d", combs[1, i], combs[2, i])
}
names(out) <- nms
return(out)
}
}
#calc_pi_pops <- function(x, y) {
#
# differ <- function(x, y) {
# n <- which(x != y)
# return(length(n))
# }
#
# l.1 <- split(x, row(x))
# l.2 <- split(y, row(y))
# d <- sapply(l.1, function(m) sapply(l.2, function(n) differ(m, n)))
# ave.pi <- sum(d)/length(d)
# return(ave.pi)
#}
#' Function to calculate the Watterson estimator for theta
#'
#' @param x Object of class DNAbin or a matrix of multiple sequence alignment where
#' rows are sequences and columns are nucleotide positions
#'
#' @export
theta_watt <- function(x) {
sites <- apply(x, 2, unique) # Find which sites have multiple nucleotides present
if(is.matrix(sites)) {
s <- ncol(sites)
} else {
s <- length(which(sapply(sites, length) != 1)) # Calculate the total number of segregating sites
}
theta <- s/sum(1/1:(nrow(x)-1))
theta
}
#' Function to calculate Tajima's D
#'
#' @param x Object of class DNAbin or a matrix of multiple sequence alignment where
#' rows are sequences and columns are nucleotide positions
#'
#' @export
tajimaD <- function(x) {
n <- nrow(x)
pi <- fscSS::calc_pi(x)
sites <- apply(x, 2, unique) # Find which sites have multiple nucleotides present
if(is.matrix(sites)) {
s <- ncol(sites)
} else {
s <- length(which(sapply(sites, length) != 1)) # Calculate the total number of segregating sites
}
tmp <- 1:(n - 1)
a1 <- sum(1/tmp)
a2 <- sum(1/tmp^2)
b1 <- (n + 1)/(3*(n - 1))
b2 <- 2*(n^2 + n + 3)/(9*n*(n - 1))
c1 <- b1 - 1/a1
c2 <- b2 - (n + 2)/(a1*n) + a2/a1^2
e1 <- c1/a1
e2 <- c2/(a1^2 + a2)
D <- (pi - s/a1)/sqrt(e1*s + e2*s*(s-1))
D
}
#' Function to calculate the number of haplotypes, number of segregating sites, haplotype diversity, and pi of a matrix of sequences
#'
#' @param x Object of class DNAbin or a matrix of multiple sequence alignment where
#' rows are sequences and columns are nucleotide positions
#' @param nsites Optional integer specifying the total number of nucleotide sites in the loci examined
#'
#' @export
sumstats <- function(x, nsites = NULL) {
n <- nrow(x)
if(length(x) == 0) {
num.haps <- 1
hap.freqs <- 1
} else {
haps <- pegas::haplotype(x) # Find haplotypes using Pegas's haplotype function
num.haps <- length(attr(haps, "index")) # Find the total number of haplotypes
hap.freqs <- lengths(attr(haps, "index"))/n # Make a vector containing the frequencies of each haplotype
}
h <- (n*(1-sum(hap.freqs^2)))/(n-1) # Calculate haplotype diversity (Nei 1987, eq. 8.4)
var_h <- (2/(n*(n-1)))*(2*(n-2)*(sum(hap.freqs^3)-sum(hap.freqs^2)^2)+sum(hap.freqs^2)-(sum(hap.freqs^2)^2)) # Calculate variance of haplotype diversity (Nei 1987, eq. 8.12)
sites <- apply(x, 2, unique) # Find which sites have multiple nucleotides present
if(is.matrix(sites)) { # Fixes an error where the output of sites is a matrix if all segregating sites have the same number of mutations
s <- ncol(sites)
} else if(is.list(sites)) {
s <- length(which(sapply(sites, length) != 1)) # Calculate the total number of segregating sites
} else if(num.haps == 1) {
s <- 0
}
if(length(x[1,]) == 0) { # Add if statement to check if sequences are different so Pi can be calculated
pi <- 0
var_pi <- 0
} else {
pi <- fscSS::calc_pi(x) # Calculate the Average number of nucleotide differences (π)
if(is.null(nsites)) {
var_pi <- fscSS::calc_var_pi2(x)
} else {
var_pi <- fscSS::calc_var_pi2(x, nsites = nsites)
}
}
theta <- fscSS::theta_watt(x)
D <- fscSS::tajimaD(x)
out <- c(num.haps, h, var_h, s, pi, var_pi, theta, D)
names(out) <- c("K", "H", "VarH", "S", "Pi", "Varpi", "ThetaW", "TajimaD")
return(out)
}
#' Function to calculate sumstats on samples with multiple populations
#'
#' @param x List of objects of class DNAbin or matrices of multiple sequence alignment where
#' rows are sequences and columns are nucleotide positions. Each object in list is a separate
#' population.
#' @param nsites Optional integer indicating the total number of nucleotide sites in the analyzed locus
#'
#' @export
sumstats_pops <- function(x, nsites = NULL) {
npops <- length(x)
if(is.null(nsites)) {
y <- lapply(x, fscSS::sumstats) # Calculate sumstats within populations
} else {
y <- lapply(x, fscSS::sumstats, nsites = nsites)
}
y <- unlist(y) # make a vector of sumstats from populations
#ind.K <- grep("K", names(y)) # Find which vector elements correspond to pi
#mean.K <- mean(y[ind.K]) # Calculate the mean pi across populations
#sd.K <- stats::sd(y[ind.K]) # Calculate the standard deviation of pi across populations
#ind.S <- grep("S", names(y)) # Find which vector elements correspond to pi
#mean.S <- mean(y[ind.S]) # Calculate the mean pi across populations
#sd.S <- stats::sd(y[ind.S]) # Calculate the standard deviation of pi across populations
#ind <- grep("Pi", names(y)) # Find which vector elements correspond to pi
#mean.pi <- mean(y[ind]) # Calculate the mean pi across populations
#sd.pi <- stats::sd(y[ind]) # Calculate the standard deviation of pi across populations
#ind.h <- grep("H", names(y))
#mean.H <- mean(y[ind.h])
#sd.H <- stats::sd(y[ind.h])
#ind.theta <- grep("ThetaW", names(y))
#mean.theta <- mean(y[ind.theta])
#sd.theta <- stats::sd(y[ind.theta])
#ind.D <- grep("TajimaD", names(y))
#mean.D <- mean(y[ind.D])
#var.D <- stats::var(y[ind.D])
#all.samps <- do.call(rbind, x) # Make a new matrix pooling the populations together
#n <- nrow(all.samps)
#all.stats <- fscSS::sumstats(all.samps)
#shared.haps <- fscSS::sharedhaps(x)
#tot <- c(all.stats["K"], shared.haps[[1]], shared.haps[[2]], all.stats["S"], all.stats["H"], all.stats["Pi"], all.stats["ThetaW"], all.stats["TajimaD"],mean.K, sd.K, mean.S, sd.S, mean.H, sd.H, mean.pi, sd.pi, mean.theta, sd.theta, mean.D, var.D) # Output pooled summary statistics
#out <- append(y, tot)
dxy <- calc_dxy(x)
mean_pis <- mean(y[grep("Pi", names(y))])
Fst <- 1 - (mean_pis/mean(dxy)) # Hudson, Slatkin, Maddison 1992, eq. 3
popshead <- vector()
for(i in 1:npops) {
popshead <- append(popshead, paste0(c("K", "H", "VarH", "S", "Pi", "Varpi", "ThetaW", "TajimaD"), i))
}
names(y) <- popshead
out <- c(y, dxy, Fst = Fst)
return(out)
}
#' Function to calculate simple transition/transversion ratio and transition rate
#'
#' @param x Object of class DNAbin
#'
#' @export
count_tstv <- function(x) {
ts <- 0
tv <- 0
for(i in 1:ncol(x)) {
alleles <- unique(as.vector(as.character(x[,i])))
if(length(alleles) == 1){
next
} else if(length(alleles) == 2) {
if("a" %in% alleles & "g" %in% alleles) {
ts <- ts + 1
} else if("c" %in% alleles & "t" %in% alleles) {
ts <- ts + 1
} else {
tv <- tv + 1
}
} else if(length(alleles) > 2) {
warning(paste0("Multiple hits at position ", i, ", position skipped"))
next
}
}
cat(paste("Transitions:", ts, "\n"))
cat(paste("Transversions:", tv, "\n"))
cat(paste("ts/tv:", ts/tv, "\n"))
cat(paste("ts rate:", ts/sum(ts, tv), "\n"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.