This is a simple package with the specific purpose of selecting an optimal set of a specified number of CC lines from some specified subset of the 73 "available" lines, based on several criteria. For example, selecting the best 5 lines from 11 that are "readily available" while maximizing genetic diversity and cumulative wild-derived allele frequencies.
An example data set is included in the package - it is quite large, sorry. For now, the following code serves as a simple vignette for using the package.
# Setup: attach devtools, install CCexplore from the GitHub repository
# gkeele/CCexplore, then attach the package itself
library(devtools)
install_github("gkeele/CCexplore")
library(CCexplore)
# Load the CC.probs example data set; it is passed to the functions below as
# allele.props.array / probs.3D
data(CC.probs)
# Large set: the full pool of candidate CC lines
these.lines <- c(
  "CC065.Unc", "CC011.Unc", "CC004.TauUnc", "CC003.Unc",
  "CC039.Unc", "CC053.Unc", "CC071.TauUnc", "CC017.Unc",
  "CC022.GeniUnc", "CC021.Unc", "CC058.Unc"
)
# Allele frequencies over the whole candidate pool, followed by their plot
candidate.allele.freq <- get.allele.freq(
  allele.props.array = CC.probs,
  these.individuals = these.lines
)
allele.freq.plot(allele.freq = candidate.allele.freq)
# Include lines: passed to eval.criteria as the fixed set
include.lines <- c("CC065.Unc", "CC011.Unc", "CC004.TauUnc", "CC058.Unc")
# Choice lines: the pool passed to eval.criteria as the choice set
choice.lines <- c(
  "CC039.Unc", "CC053.Unc", "CC071.TauUnc",
  "CC017.Unc", "CC022.GeniUnc", "CC021.Unc"
)
# Evaluate the selection criteria with choice.select = 3
# (presumably each candidate set is fixed.set plus 3 lines drawn from
# choice.set -- confirm against eval.criteria)
all.eval <- eval.criteria(
  allele.props.array = CC.probs,
  fixed.set = include.lines,
  choice.set = choice.lines,
  choice.select = 3
)
# Sets with any extreme imbalance (their negation is what gets counted below)
extreme.sets <- all.eval$extreme.imbalance > 0
# Sets in the bottom 20% for imbalance
balanced.sets <- all.eval$imbalance <
  quantile(all.eval$imbalance, probs = 0.2)
# Sets in the top 20% for wild-derived alleles
wild.sets <- all.eval$mean.wild >
  quantile(all.eval$mean.wild, probs = 0.8)
# Sets in the top 20% for PWK & CAST alleles
pwk_cast.sets <- all.eval$mean.pwk_cast >
  quantile(all.eval$mean.pwk_cast, probs = 0.8)
# Sets in the top 20% for mean L2norm
L2.sets <- all.eval$mean.L2norm >
  quantile(all.eval$mean.L2norm, probs = 0.8)
# Tally how many criteria each set satisfies. Every criterion is weighted
# equally for now (weights could be added to reflect importance).
all.checks <- Reduce(
  `+`,
  lapply(
    list(!extreme.sets, balanced.sets, wild.sets, pwk_cast.sets, L2.sets),
    as.numeric
  )
)
table(all.checks)
# Indices of the sets that satisfy the largest number of criteria
top.sets <- which(all.checks == max(all.checks))
all.eval$possible[top.sets, ]
# Allele frequencies for the first of the top-scoring sets, then plot them
top.allele.freq <- get.allele.freq(
  allele.props.array = CC.probs,
  these.individuals = all.eval$possible[top.sets[1], ]
)
allele.freq.plot(allele.freq = top.allele.freq)
# Inspect the kind of imbalance that still arises in a good set
problem.sites <- flag_allele.imbalance(top.allele.freq)
allele.freq.plot(
  allele.freq = top.allele.freq[problem.sites, ],
  title = "Imbalanced Sites"
)
## Sample realizations of haplotype probabilities
# NOTE(review): `set` receives every row indexed by top.sets, which may hold
# several tied sets -- confirm the haplotype helpers accept more than one set.
# Realization taken at the maximum probability
maxprobs.haps <- haplotypes.maxprob(
  probs.3D = CC.probs,
  set = all.eval$possible[top.sets, ]
)
allele.freq.plot(allele.freq = maxprobs.haps)
# Realization drawn by random sampling from the probabilities (seed fixed at 1)
sample.haps <- haplotypes.sample(
  probs.3D = CC.probs,
  set = all.eval$possible[top.sets, ],
  seed = 1
)
allele.freq.plot(allele.freq = sample.haps)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.