Nothing
# Non-iterative function for calculating allele frequencies
# under polysomic inheritance. For allele copy number ambiguity,
# assumes that all alleles have an equal chance of being present in more
# than one copy.
simpleFreq <- function(object, samples=Samples(object), loci=Loci(object)){
# subset the object to avoid a lot of indexing later in the function
object <- object[samples,loci]
# we need PopInfo and Ploidies; check for these
if(!all(!is.na(PopInfo(object)))){
cat("PopInfo needed for samples:",
samples[is.na(PopInfo(object))], sep="\n")
stop("PopInfo needed for this function.")
}
if(!all(!is.na(Ploidies(object)))){
stop("Ploidies needed for this function.")
}
# we need a genbinary object for this function; make one if necessary
if(is(object, "genambig")){
object <- genambig.to.genbinary(object)
}
Present(object) <- as.integer(1)
Absent(object) <- as.integer(0)
# get the populations that will be evaluated
pops <- PopNames(object)[unique(PopInfo(object))]
# Get ploidies into a format where you can total over the populations
# (and don't index by locus if you don't need to)
if(is(object@Ploidies, "ploidymatrix")){
object <- reformatPloidies(object, output="collapse", erase=FALSE)
}
if(!is(object@Ploidies, "ploidysample") &&
length(unique(Ploidies(object, samples, loci)))==1){
object <- reformatPloidies(object, output="sample", erase=FALSE)
}
if(is(object@Ploidies, "ploidylocus")){
object <- reformatPloidies(object, output="matrix", erase=FALSE)
}
# Get total genomes per population if loci are uniform ploidy
if(is(object@Ploidies, "ploidysample")){
# get the total number of genomes per population
totgenomes <- rep(0, length(pops))
names(totgenomes) <- pops
for(p in pops){
totgenomes[p] <- sum(Ploidies(object)[Samples(object,populations=p)])
}
# set up data frame to contain allele frequencies
freqtable <- data.frame(Genomes=totgenomes,row.names=pops)
} else { # or if loci have different ploidies
freqtable <- data.frame(row.names=pops)
}
# loop to get frequency data
for(L in loci){
# get all samples without missing data at this locus
xsamples <- samples[!isMissing(object, samples, L)]
# get the total number of genomes per population
totgenomes <- rep(0, length(pops))
names(totgenomes) <- pops
for(p in pops){
totgenomes[p] <- sum(Ploidies(object, xsamples[xsamples %in%
Samples(object,
populations=p)], L))
}
# write genomes to the table if necessary
if(is(object@Ploidies, "ploidymatrix")){
totg <- data.frame(totgenomes)
names(totg) <- paste(L, "Genomes", sep=".")
freqtable <- cbind(freqtable, totg)
}
# make a conversion factor to weight allele presence based on ploidy
# of each individual and number of alleles at this locus
numalleles <- rep(0, length(xsamples))
names(numalleles) <- xsamples
for(s in xsamples){
numalleles[s] <- sum(Genotype(object, s , L))
}
convf <- as.vector(Ploidies(object,xsamples,L))/numalleles
# make table of weighted allele presence
loctable <- Genotypes(object, xsamples, L) * convf
# loop through all alleles at this locus
for(al in dimnames(loctable)[[2]]){
theseallelefreqs <- rep(0, length(pops))
names(theseallelefreqs) <- pops
# loop through populations
for(p in pops){
theseallelefreqs[p]<-sum(loctable[xsamples[xsamples %in%
Samples(object, populations=p)],
al])/totgenomes[p]
}
freqtable <- cbind(freqtable,theseallelefreqs)
names(freqtable)[length(freqtable)] <- al
}
}
# return data frame
return(freqtable)
}
# Iterative estimation of allele frequencies under polysomic inheritance,
# with a uniform, even-numbered ploidy and a known selfing rate.
# Much of this code is translated directly from:
# De Silva HN, Hall AJ, Rikkerink E, McNeilage MA, and LG Fraser (2005).
# Estimation of allele frequencies in polyploids under certain patterns
# of inheritance. Heredity 95:327-334.
deSilvaFreq <- function(object, self,
samples=Samples(object),
loci=Loci(object), initNull = 0.15,
initFreq=simpleFreq(object[samples,loci]),
tol = 0.00000001, maxiter = 1e4){
# make sure self argument has been given
if(missing(self)) stop("Selfing rate required.")
# make sure initFreq is in right format
if(!"Genomes" %in% names(initFreq))
stop("initFreq must have single Genomes column.")
# convert object to genambig if necessary
if("genbinary" %in% class(object)) object <- genbinary.to.genambig(object,
samples,
loci)
# get the ploidy (m2), and check that there is only one and that it is even
m2 <- unique(as.vector(Ploidies(object,samples,loci)))
if(length(m2) != 1)
stop("Only one ploidy allowed. Try running subsets of data one ploidy at a time.")
if(is.na(m2)) stop("Function requires information in Ploidies slot.")
if(m2 %% 2 != 0) stop("Ploidy must be even.")
# check and set up initNull
if(!length(initNull) %in% c(1,length(loci)))
stop("Need single value for initNull or one value per locus.")
if(length(initNull) == 1)
initNull <- rep(initNull, times=length(loci))
if(is.null(names(initNull)))
names(initNull) <- loci
# calculate the number of alleles from one gamete
# (from de Silva et al's INIT subroutine)
m <- m2 %/% 2L
# get the populations that will be used
pops <- PopNames(object)[unique(PopInfo(object)[samples])]
if(!identical(pops, row.names(initFreq)))
stop("Population names in initFreq don't match those in object.")
# set up the data frame where final allele frequencies will be stored
# has columns from initFreq, plus columns for nulls
finalfreq <- data.frame(row.names=pops, Genomes=initFreq$Genomes,
matrix(0, nrow=length(pops),
ncol=dim(initFreq)[2]-1+length(loci),
dimnames=list(NULL, sort(c(
paste(loci, ".null", sep=""),
names(initFreq)[2:dim(initFreq)[2]])))))
# get the divisor for the selfing matrices
smatdiv <- (G(m-1,m+1))^2
# INDEXF function from de Silva et al
# af1 = vector of alleles in phenotype
# m = number of observable alleles in phenotype
# na = number of alleles, calculated for each locus
indexf <- function(m, af1, na){
x <- 1L
if(m == 1){
x <- x + af1[1]
} else {
if(m > 1){
for(q in 1:(m-1)){
x <- x + G(q-1,na-q+1)
}
x <- x + G(m-1,na+1-m) - G(m-1,na+2-m-af1[1])
if(m > 2){
for(q in 1:(m-2)){
x <- x + G(q,na-q-af1[m-q-1]) -
G(q,na+1-q-af1[m-q])
}
}
x <- x + af1[m] - af1[m-1]
}
}
return(x)
}
# FENLIST function
fenlist <- function(na){
# set up temporary holding vector for phenotpes (FENLIST)
af1 <- rep(0L, m2)
# set up array to contain all phenotypes (FENLIST)
af <- array(0L, dim=c(1, m2))
# set up vector for number of alleles in each phenotype (FENLIST)
naf <- integer(0)
# fill in af and naf (FENLIST)
# This is done with rbind rather than setting up whole array,
# because the method for calculating the number of phenotypes
# doesn't seem to work for certain numbers of alleles.
f <- 1L
naf <- 0L
for(m in 1:min(m2, na)){
af1[1] <- 1L
if(m > 1){
for(j in 2:m){
af1[j] <- af1[j-1] + 1L
}
}
f <- f + 1L
naf[f] <- m
af <- rbind(af, af1)
a <- m
while(a > 0){
if(af1[a] == (na+a-m)){
a <- a - 1L
} else {
if(a > 0){
af1[a] <- af1[a] + 1L
if(a < m){
for(a1 in (a+1):m) af1[a1] <- af1[a1-1] + 1L
}
f <- f + 1L
naf[f] <- m
af <- rbind(af, af1)
a <- m
}
}
}
}
return(list(af, naf))
}
# CONVMAT function
convmat <- function(ng, nf, na1, ag){
na <- na1 - 1L
af1 <- rep(0L, m2)
# CONVMAT subroutine - make a matrix for conversion from
# genotypes to phenotypes
cmat <- matrix(0L, nrow=nf, ncol=ng)
for(g in 1:ng){
ag1 <- ag[g,]
# find the phenotype (af1) that matches this genotype (ag1)
if(ag1[1] == na1){
naf1 <- 0L # this is the homozygous null genotype
} else {
naf1 <- 1L
af1[naf1] <- ag1[1]
for(a in 2:m2){
if(ag1[a] == na1) break # exit loop if null allele
if(ag1[a] > af1[naf1]){
naf1 <- naf1 + 1L
af1[naf1] <- ag1[a]
}
}
}
# fill in the extra alleles with zeros
if(naf1 < m2){
for(j in (naf1 + 1):m2){
af1[j] <- 0L
}
}
f <- indexf(naf1, af1, na) # This is the one phenotype to match
# this genotype.
cmat[f,g] <- 1L
}
return(cmat)
}
# Loop to do calculations one locus and population at a time
for(L in loci){
cat(paste("Starting", L), sep="\n")
## Begin looping through populations
for(pop in pops){
cat(paste("Starting", L, pop), sep="\n")
# get a list of samples in this pop being analyzed
psamples <- Samples(object, populations=pop)[!isMissing(object,
Samples(object, populations=pop),
L)]
psamples <- psamples[psamples %in% samples]
# extract the initial allele frequencies and add a null
subInitFreq <- initFreq[pop, grep(paste("^", L,"\\.",sep=""),
names(initFreq))]
# set up matrices if this has not already been done
templist <- names(subInitFreq)[subInitFreq !=0]
templist <- strsplit(templist, split=".", fixed=TRUE)
alleles <- rep(0L, length(templist))
for(i in 1:length(alleles)){
alleles[i] <- as.integer(templist[[i]][2])
}
alleles <- sort(alleles)
na <- length(alleles)
# get the number of alleles with a null
na1 <- na + 1L
# get the number of genotypes (from the GENLIST function)
ng <- na1
for(j in 2:m2){
ng <- ng * (na1 + j - 1L) / j
}
ag <- GENLIST(ng, na1, m2)
temp <- fenlist(na)
af <- temp[[1]]
naf <- temp[[2]]
nf <- length(naf)
temp <- RANMUL(ng, na1, ag, m2)
rmul <- temp[[1]]
arep <- temp[[2]]
smatt <- SELFMAT(ng, na1, ag, m2)/smatdiv
cmat <- convmat(ng, nf, na1, ag)
rm(temp)
# calculate pp, the frequency of each phenotype in this population
pp <- rep(0, nf)
for(s in psamples){
phenotype <- sort(unique(Genotype(object, s, L)))
phenotype <- match(phenotype, alleles)
f <- indexf(length(phenotype), phenotype, na)
pp[f] <- pp[f] + 1
}
pp <- pp/sum(pp)
# Get initial allele frequencies
p1 <- rep(0, na1) # vector to hold frequencies
p1[na1] <- initNull[L] # add null freq to the last position
# make sure everything will sum to 1
subInitFreq <- subInitFreq * (1 - initNull[L])/sum(subInitFreq)
# get each allele frequency
for(a in alleles){
p1[match(a, alleles)] <- subInitFreq[1,paste(L, a, sep = ".")]
}
## Begin the EM algorithm
converge <- 0
niter <- 1L
oneg <- rep(1, ng)
while(converge == 0){
# Expectation step
# GPROBS subroutine
pa <- rep(0, na1)
pa[na1] <- 1
for(j in 1:na){
pa[j] <- p1[j]
pa[na1] <- pa[na1]-p1[j]
# it seems like this is the same as pa <- p1
# unless p1 is not supposed to have the null allele
# (conflicting information in orignal code comments)
}
rvec <- rep(0, ng)
for(g in 1:ng){
rvec[g] <- rmul[g]
for(j in 1:m2){
rvec[g] <- rvec[g]*pa[ag[g,j]]
}
# This gives prob of genotype by multiplying
# probs of alleles and coefficients for a multinomial
# distribution.
}
# make an identity matrix
id <- diag(nrow=ng)
# calculate gprob using selfing and outcrossing matrices
s3 <- id - self * smatt
gprob <- (1-self) * solve(s3, rvec)
# end GPROBS
# equation (12) from the paper
xx1 <- matrix(0, nrow=nf, ncol=ng)
for(i in 1:nf){
xx1[i,] <- cmat[i,] * gprob
}
xx2 <- xx1 %*% oneg
xx3 <- matrix(0, nrow=nf, ncol=ng)
for(i in 1:ng){
xx3[,i] <- xx1[,i] * (xx2^(-1))
}
EP <- t(xx3) %*% pp
# Maximization step
p2 <- t(arep) %*% EP/m2
# check for convergence
pB <- p1 + p2
pT <- p1 - p2
pT <- pT[abs(pB) > 1e-14]
pB <- pB[abs(pB) > 1e-14]
if(length(pB) == 0){
converge <- 1
} else {
if(sum(abs(pT)/pB) <= tol){
converge <- 1
}
}
if(niter >= maxiter){
converge <- 1
}
niter <- niter + 1L
p1 <- p2
}
# write frequencies in p2 to finalfreqs
for(a in alleles){
finalfreq[pop, match(paste(L, a, sep="."), names(finalfreq))]<-
p2[match(a, alleles)]
}
finalfreq[pop, match(paste(L, "null", sep="."), names(finalfreq))]<-
p2[na1]
# print the number of reps
cat(paste(niter-1, "repetitions for", L, pop),
sep="\n")
}
}
return(finalfreq)
}
calcPopDiff<-function(freqs, metric, pops=row.names(freqs),
loci=unique(gsub("\\..*$", "", names(freqs))),
global = FALSE, bootstrap = FALSE, n.bootstraps = 1000,
object = NULL){
# check metric
if(!metric %in% c("Fst", "Gst", "Jost's D", "Rst")){
stop("metric must be Fst, Gst, Rst, or Jost's D")
}
# check pop names
if(!all(pops %in% row.names(freqs))){
stop("pops must all be in row names of freqs.")
}
freqs <- freqs[pops,]
# Clean up loci
loci<-loci[loci!="Genomes"]
if(metric == "Rst" && (is.null(object) || any(is.na(Usatnts(object)[loci])))){
stop("gendata object required with Usatnts for Rst metric")
}
# internal function to get differentiation statistic from any set of two or more pops
cpd <- function(freqs, metric, loci, bootstrap, n.boostraps){
# Get genome number from the table
if("Genomes" %in% names(freqs)){
genomes<-freqs$Genomes
names(genomes)<-row.names(freqs)
GbyL <- FALSE
} else {
GbyL <- TRUE
}
# set up array for HT and HS values
hets<-array(0,dim=c(length(loci),4),
dimnames=list(loci,c("HT","HS","HTest","HSest")))
for(L in loci){
# population sizes for this locus
if(!GbyL){
thesegenomes <- genomes
} else {
thesegenomes <- freqs[,paste(L, "Genomes", sep=".")]
}
nsubpop <- length(thesegenomes)
# allele frequencies for this locus
thesefreqs<-freqs[,grep(paste("^", L,"\\.",sep=""), names(freqs)), drop = FALSE]
thesefreqs <- thesefreqs[,names(thesefreqs)!=paste(L,"Genomes",sep="."), drop = FALSE]
# expected heterozygosity by pop
hsByPop <- apply(as.matrix(thesefreqs), 1, function(x) 1 - sum(x^2))
if(metric == "Fst"){
# weighted mean frequencies for each allele
avgfreq <- unlist(lapply(thesefreqs, weighted.mean, w = thesegenomes))
# weighted mean Hs
hets[L, "HS"] <- weighted.mean(hsByPop, thesegenomes)
}
if(metric %in% c("Jost's D", "Gst")){
# unweighted mean frequencies for each allele
avgfreq <- colMeans(thesefreqs)
# unweighted mean Hs
hets[L, "HS"] <- mean(hsByPop)
}
if(metric == "Rst"){
replen <- Usatnts(object)[L] # microsatellite repeat length
alleles <- sapply(strsplit(grep(paste("^", L,"\\.", sep = ""), names(freqs), value = TRUE), ".",
fixed = TRUE),
function(x) x[2])
alleles <- alleles[alleles != "Genomes"]
alleles[alleles == "null"] <- 0
alleles <- as.integer(alleles)
if(0 %in% alleles){ # remove null alleles if they are there
nullfreqs <- thesefreqs[,match(0, alleles)] # frequency of null allele in each pop
for(pop in 1:dim(thesefreqs)[1]){
thesefreqs[pop,] <- thesefreqs[pop,]/(1-nullfreqs[pop]) # scale so non-null allele frequencies sum to 1
}
thesegenomes <- round(thesegenomes * (1-nullfreqs)) # remove null allele copies from total genomes
}
totgenomes <- sum(thesegenomes)
avgfreq <- colMeans(thesefreqs)
SSalleledistS <- numeric(dim(freqs)[1]) # for totaling sums of squares of allele differences for each pop
SSalleledistT <- 0 # for totaling sums of squares of allele differences across all pops
for(i in seq_len(length(alleles)-1)){ # loop through all pairs of different alleles
for(j in (i+1):length(alleles)){
if(alleles[i] == 0 || alleles[j] == 0) next
sqdiff <- (abs(alleles[i] - alleles[j])/replen)^2 # squared difference in repeat number
nocc <- thesefreqs[,i] * thesefreqs[,j] * thesegenomes^2 # number of times these alleles would be compared
SSalleledistS <- SSalleledistS + sqdiff * nocc
SSalleledistT <- SSalleledistT + sqdiff * totgenomes^2 * avgfreq[i] * avgfreq[j]
}
}
hets[L, "HS"] <- mean(SSalleledistS / (thesegenomes * (thesegenomes - 1)))
hets[L, "HT"] <- SSalleledistT/(totgenomes * (totgenomes - 1))
}
# estimate expected heterozygosity for panmictic pop
if(metric %in% c("Fst", "Gst", "Jost's D")){
hets[L,"HT"]<-1-sum(avgfreq^2)
}
if(metric %in% c("Jost's D","Gst")){
# harmonic mean of the sample size
meanGenomes <- 1/mean(1/thesegenomes)
# Nei and Chesser's (1983) estimates of expected heterozygosity
hets[L, "HSest"] <- hets[L, "HS"] * meanGenomes/(meanGenomes - 1)
hets[L, "HTest"] <- hets[L, "HT"] + hets[L, "HSest"] / (nsubpop * meanGenomes)
}
}
# set up vector to contain results, and bootstrapping
if(!bootstrap){
n.bootstraps <- 1
}
result <- numeric(n.bootstraps)
for(b in seq_len(n.bootstraps)){ # loop through bootstraps (just one if not bootstrapping)
if(bootstrap){
thishets <- hets[sample(loci, replace = TRUE),] # H matrix with bootstrapped set of loci
} else {
thishets <- hets
}
if(metric == "Fst"){ # calculate Fst
HT<-mean(thishets[,"HT"])
HS<-mean(thishets[,"HS"])
result[b] <- (HT-HS)/HT
}
if(metric == "Rst"){
R <- (thishets[,"HT"] - thishets[,"HS"])/thishets[,"HT"]
if(any(is.na(R))){
warning(paste("Monomorphic marker", paste(names(R)[is.na(R)], collapse = " "),
"ignored in comparison of", paste(rownames(freqs), collapse = " ")))
R <- R[!is.na(R)]
}
result[b] <- mean(R)
}
if(metric == "Gst"){ # calculate Gst and average across loci
G <- (thishets[,"HTest"] - thishets[,"HSest"])/thishets[,"HTest"]
if(any(is.na(G))){
warning(paste("Monomorphic marker", paste(names(G)[is.na(G)], collapse = " "),
"ignored in comparison of", paste(rownames(freqs), collapse = " ")))
G <- G[!is.na(G)]
}
result[b] <- mean(G)
}
if(metric == "Jost's D"){ # calculate Jost's D and average across loci
D <- nsubpop / (nsubpop - 1) * (thishets[,"HTest"] - thishets[,"HSest"]) / (1 - thishets[,"HSest"])
result[b] <- mean(D)
if(nsubpop == 1) result[b] <- 0 # prevent NaN
}
}
return(result)
} # end internal function
# wrappers for internal function (global or pairwise)
if(global){ # estimate single global statistic
result <- cpd(freqs, metric, loci, bootstrap, n.bootstraps)
} else { # estimate pairwise statistics
# Set up matrix for pairwise diffentiation statistics
if(bootstrap){
result <- array(list(), dim = c(length(pops), length(pops)), dimnames = list(pops,pops)) # needs to be array-list if each item will be vector of bootstraps
} else {
result<-matrix(0,nrow=length(pops),ncol=length(pops),dimnames=list(pops,pops)) # matrix for single non-bootstrapped values
}
for(m in 1:length(pops)){
for(n in m:length(pops)){
thisres <- cpd(freqs[unique(c(m,n)),], metric, loci, bootstrap, n.bootstraps)
if(bootstrap){
result[[m, n]] <- result[[n, m]] <- thisres
} else {
result[m, n] <- result[n, m] <- thisres
}
}
}
}
# return matrix of differentiation statistics (or single global value)
return(result)
}
# wrapper function for backwards compatibility
calcFst<-function(freqs, pops=row.names(freqs),
loci=unique(gsub("\\..*$", "", names(freqs))), ...){
return(calcPopDiff(freqs = freqs, metric = "Fst", pops = pops, loci = loci, ...))
}
# put allele frequencies into a format for SPAGeDi for it to use in estimates
# of kinship and relatedness coefficients
write.freq.SPAGeDi <- function(freqs, usatnts, file="", digits=2,
pops=row.names(freqs),
loci=unique(as.matrix(as.data.frame(strsplit(names(freqs),
split=".", fixed=TRUE), stringsAsFactors=FALSE))[1,])){
if(is.null(names(usatnts))) names(usatnts) <- loci
loci <- loci[loci != "Genomes"]
# subset the data frame
freqs <- freqs[pops,]
# make a list to contain the columns to write
datalist <- list(0)
length(datalist) <- ( length(loci) * 2 )
item <- 1
genbyloc <- !"Genomes" %in% names(freqs)
if(!genbyloc) genomes <- freqs$Genomes
for(L in loci){
# find the alleles
alleles <- as.matrix(as.data.frame(strsplit(names(freqs)[grep(paste("^", L,"\\.",sep=""),
names(freqs))],
split=".", fixed=TRUE))[2,])
alleles <- as.vector(alleles)
if(genbyloc){
alleles <- alleles[alleles!="Genomes"]
}
# convert alleles to numbers used in SPAGeDi file
alleles[alleles == "null"] <- 0
alleles <- as.integer(alleles)
alleles <- floor(alleles/usatnts[L])
suballele<-function(value){if(value[1]!=0){
value-(10^(digits-1))} else{
0}}
while(max(alleles) >= 10^digits){
alleles<-mapply(suballele, alleles)
}
# add alleles to datalist
datalist[[item]] <- alleles
names(datalist)[item] <- L
item <- item + 1
# make a weighted average of allele frequencies across populations
if(genbyloc){
genomes <- freqs[[paste(L, "Genomes", sep=".")]]
locindex <- grep(paste("^", L, "\\.", sep=""), names(freqs))
locindex <- locindex[!locindex %in% grep("Genomes", names(freqs))]
} else {
locindex <- grep(paste("^", L, "\\.", sep=""), names(freqs))
}
avgfreq <- (genomes %*% as.matrix(freqs[,locindex])) / sum(genomes)
# add frequencies to list
datalist[[item]] <- as.vector(avgfreq)
names(datalist)[item] <- length(avgfreq)
item <- item + 1
}
# find the maximum number of alleles
maxal <- max(mapply(length, datalist))
# set up data frame to write
fr <- data.frame(row.names=1:maxal)
# put list elements into the data frame
for(i in datalist){
fr <- data.frame(fr, c(i, rep("", maxal-length(i))))
}
names(fr) <- names(datalist)
# write the file
write.table(fr, file=file, sep="\t", col.names=TRUE, row.names=FALSE,
quote=FALSE)
}
freq.to.genpop <- function(freqs, pops=row.names(freqs),
loci=unique(as.matrix(as.data.frame(strsplit(names(freqs),
split=".",fixed=TRUE),stringsAsFactors=FALSE))[1,])){
# clean up loci
loci <- loci[loci!="Genomes"]
# errors
if(!"Genomes" %in% names(freqs))
stop("Only one Genomes column allowed.")
# get population sizes
popsizes <- freqs[pops, "Genomes"]
# get an index of columns containing these loci
loccol <- integer(0)
for(L in loci){
loccol <- c(loccol, grep(paste("^", L, "\\.",sep=""), names(freqs)))
}
# get a table just for these populations and loci
subfreq <- freqs[pops, loccol]
# convert allele frequencies to allele counts
for(i in 1:length(subfreq)){
subfreq[[i]] <- round(popsizes * subfreq[[i]])
}
return(subfreq)
}
## Genotype diversity functions
# p is a vector of genotype counts
# base = exp(1) for natural log, or base = 2 for log base 2
Shannon <- function(p, base=exp(1)){
N <- sum(p)
return(-sum(p/N * log(p/N, base=base)))
}
Simpson <- function(p){
N <- sum(p)
return(sum(p*(p-1)/(N*(N-1))))
}
Simpson.var <- function(p){
N <- sum(p)
p <- p/N
v <- (4*N*(N-1)*(N-2)*sum(p^3) + 2*N*(N-1)*sum(p^2) -
2*N*(N-1)*(2*N-3)*(sum(p^2))^2)/((N*(N-1))^2)
return(v)
}
# ... is passed to index. (So you can adjust base of Shannon index.)
# threshold is highest distance between two individuals that can be considered
# to be one clone.
genotypeDiversity <- function(genobject,
samples = Samples(genobject), loci = Loci(genobject),
d = meandistance.matrix(genobject,samples,loci,
all.distances=TRUE,distmetric=Lynch.distance),
threshold = 0, index = Shannon,
...){
# get populations
if(all(is.na(PopInfo(genobject)[samples]))){
PopInfo(genobject)[samples] <- rep(1, length(samples))
}
if(!all(!is.na(PopInfo(genobject)[samples]))){
stop("PopInfo needed.")
}
pops <- PopNames(genobject)[unique(PopInfo(genobject)[samples])]
# set up results matrix
results <- matrix(NA, ncol=length(loci)+1, nrow=length(pops),
dimnames=list(pops, c(loci,"overall")))
# statistics for individual loci
for(L in loci){
for(p in pops){
# get all samples for this pop with data for this locus
psamples <- samples[samples %in% Samples(genobject, populations=p)]
psamples <- psamples[!isMissing(genobject, psamples, L)]
# assign to genotype groups
dsub <- d[[1]][L, psamples, psamples]
if(is.vector(dsub)){ # fix for if there was only one sample
dsub <- array(dsub, dim=c(1,1),
dimnames=list(psamples, psamples))
}
clones <- assignClones(dsub,
threshold=threshold)
# get genotype frequencies
# n <- length(clones) # number of individuals
cl <- length(unique(clones)) # number of groups
counts <- rep(NA, cl) # vector to hold counts of individuals
for(i in 1:cl){
counts[i] <- length(clones[clones == i])
}
# get diversity index
results[p,L] <- index(counts, ...)
}
}
# get a mean array for only the loci being used here
if(length(loci) > dim(d[[1]])[1]){
d2 <- meandist.from.array(d[[1]], samples=samples, loci=loci)
} else {
d2 <- d[[2]]
}
# statistics for multilocus genotypes
for(p in pops){
psamples <- samples[samples %in% Samples(genobject, populations=p)]
clones <- assignClones(d2, psamples, threshold)
# n <- length(clones) # number of individuals
cl <- length(unique(clones)) # number of groups
counts <- rep(NA, cl) # vector to hold counts of individuals
for(i in 1:cl){
counts[i] <- length(clones[clones == i])
}
# get diversity index
results[p,"overall"] <- index(counts, ...)
}
return(results)
}
# function to get unique alleles and counts
alleleDiversity <- function(genobject, samples=Samples(genobject),
loci=Loci(genobject),
alleles=TRUE, counts=TRUE){
if(!alleles && !counts)
stop("At least one of alleles and counts must be set to TRUE")
# get populations
if(all(is.na(PopInfo(genobject)[samples]))){
PopInfo(genobject)[samples] <- rep(1, length(samples))
}
if(!all(!is.na(PopInfo(genobject)[samples]))){
stop("PopInfo needed.")
}
pops <- PopNames(genobject)[unique(PopInfo(genobject)[samples])]
# set up results tables
rcounts <- matrix(NA, nrow=length(pops)+1, ncol=length(loci),
dimnames=list(c(pops,"overall"),loci))
ralleles <- array(list(NA), dim=c(length(pops)+1,length(loci)),
dimnames=dimnames(rcounts))
# get unique alleles
for(L in loci){
for(p in pops){
psamples <- samples[samples %in% Samples(genobject, populations=p)]
xalleles <- .unal1loc(genobject, psamples, L)
ralleles[[p,L]] <- xalleles
rcounts[p,L] <- length(xalleles)
}
xalleles <- .unal1loc(genobject, samples, L)
ralleles[["overall",L]] <- xalleles
rcounts["overall",L] <- length(xalleles)
}
if(alleles && counts)
return(list(alleles=ralleles, counts=rcounts))
if(alleles && !counts)
return(ralleles)
if(!alleles && counts)
return(rcounts)
}
PIC <- function(freqs, pops=row.names(freqs),
loci=unique(as.matrix(as.data.frame(strsplit(names(freqs),
split=".",
fixed=TRUE),
stringsAsFactors=FALSE))[1,]),
bypop = TRUE, overall = TRUE){
if(!bypop && !overall){
stop("At least one of bypop and overall must be TRUE.")
}
# check pop names
if(!all(pops %in% row.names(freqs))){
stop("pops must all be in row names of freqs.")
}
freqs <- freqs[pops,]
# Clean up loci
loci<-loci[loci!="Genomes"]
# is number of genomes (pop size) specified by locus?
GbL <- !"Genomes" %in% names(freqs)
# convert freqs to matrix for math
freqs <- as.matrix(freqs)
# set up results matrix
if(bypop){
results <- matrix(NA, nrow = length(pops), ncol = length(loci), dimnames = list(pops, loci))
} else {
results <- matrix(NA, nrow = 0, ncol = length(loci), dimnames = list(character(0), loci))
}
if(overall){
results <- rbind(results, matrix(NA, nrow = 1, ncol = length(loci), dimnames = list("Overall", loci)))
}
# function to get PIC for one locus and pop
pic <- function(fr){
if(any(is.na(fr))) return(NA)
if(!isTRUE(all.equal(sum(fr), 1))) stop("Allele frequencies don't sum to 1.")
sq <- fr^2 # square each frequncy to get p_i^2 and p_j^2
nAl <- length(fr) # number of alleles
# matrix of the squares multiplied by each other
mt <- matrix(sq, nrow = nAl, ncol = 1) %*% matrix(sq, nrow = 1, ncol = nAl)
# sum of the squared allele freqs
sum1 <- sum(sq)
# twice the sum of the product of allele freqs (only between different alleles)
sum2 <- sum(mt) - sum(diag(mt))
return(1 - sum1 - sum2)
}
# loop through loci
for(L in loci){
lcol <- grep(paste("^", L, "\\.", sep = ""), dimnames(freqs)[[2]]) # columns for this locus
if(length(lcol) == 0) stop(paste("Locus", L, "not found in freqs."))
if(GbL){
lgencol <- grep(paste("^", L, "\\.Genomes", sep = ""), dimnames(freqs)[[2]])
lcol <- lcol[lcol != lgencol]
}
if(bypop){ # get PIC for each population for this locus
for(p in pops){
results[p,L] <- pic(freqs[p,lcol])
}
}
if(overall){ # get PIC for mean allele frequency
if(GbL){
genomes <- freqs[,lgencol]
} else {
genomes <- freqs[,"Genomes"]
}
meanfreq <- apply(freqs[,lcol], 2, function(x) weighted.mean(x, w = genomes))
results["Overall",L] <- pic(meanfreq)
}
}
return(results)
} # end of PIc function
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.