Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, fig.width = 6, fig.height = 6)
## -----------------------------------------------------------------------------
library(polyRAD)
maphmcfile <- system.file("extdata", "ClareMap_HapMap.hmc.txt",
package = "polyRAD")
maphmcfile
mydata <- readHMC(maphmcfile,
possiblePloidies = list(2, c(2, 2)),
taxaPloidy = 2)
mydata
## -----------------------------------------------------------------------------
GetTaxa(mydata)[c(1:10,293:299)]
## -----------------------------------------------------------------------------
mydata <- SetDonorParent(mydata, "Kaskade-Justin")
mydata <- SetRecurrentParent(mydata, "Zebrinus-Justin")
## -----------------------------------------------------------------------------
mydata$taxaPloidy[c("IGR-2011-001", "p196-150A-c", "p877-348-b")] <- 1L
mydata
## -----------------------------------------------------------------------------
alignfile <- system.file("extdata", "ClareMap_alignments.csv",
package = "polyRAD")
aligndata <- read.csv(alignfile, row.names = 1)
head(aligndata)
mydata$locTable$Chr <- aligndata[GetLoci(mydata), 1]
mydata$locTable$Pos <- aligndata[GetLoci(mydata), 2]
head(mydata$locTable)
## ----eval = FALSE-------------------------------------------------------------
# mydata <- AddPCA(mydata)
## -----------------------------------------------------------------------------
load(system.file("extdata", "examplePCA.RData", package = "polyRAD"))
mydata$PCA <- examplePCA
## -----------------------------------------------------------------------------
plot(mydata)
## -----------------------------------------------------------------------------
realprogeny <- GetTaxa(mydata)[mydata$PCA[,"PC1"] > -10 &
mydata$PCA[,"PC1"] < 10]
# eliminate the one doubled haploid line in this group
realprogeny <- realprogeny[!realprogeny %in% c("IGR-2011-001", "p196-150A-c",
"p877-348-b")]
# also retain parents
keeptaxa <- c(realprogeny, GetDonorParent(mydata), GetRecurrentParent(mydata))
mydata <- SubsetByTaxon(mydata, taxa = keeptaxa)
plot(mydata)
## -----------------------------------------------------------------------------
mydata2 <- PipelineMapping2Parents(mydata,
freqAllowedDeviation = 0.06,
useLinkage = FALSE,
minLikelihoodRatio = 2)
## -----------------------------------------------------------------------------
overdispersionP <- TestOverdispersion(mydata2, to_test = 8:15)
sapply(overdispersionP[names(overdispersionP) != "optimal"],
quantile, probs = c(0.01, 0.25, 0.5, 0.75, 0.99))
## -----------------------------------------------------------------------------
my_ovdisp <- overdispersionP$optimal
## ----message = FALSE----------------------------------------------------------
myhindhe <- HindHeMapping(mydata, ploidy = 2L)
hist(colMeans(myhindhe, na.rm = TRUE), col = "lightgrey",
xlab = "Hind/He", main = "Histogram of Hind/He by locus")
## -----------------------------------------------------------------------------
set.seed(720)
ExpectedHindHeMapping(mydata, ploidy = 2, overdispersion = my_ovdisp, reps = 2,
contamRate = 0.001, errorRate = 0.001)
## -----------------------------------------------------------------------------
goodMarkers <- colnames(myhindhe)[which(colMeans(myhindhe, na.rm = TRUE) < 0.53 &
colMeans(myhindhe, na.rm = TRUE) > 0.43)]
mydata <- SubsetByLocus(mydata, goodMarkers)
## -----------------------------------------------------------------------------
mydata <- PipelineMapping2Parents(mydata,
freqAllowedDeviation = 0.06,
useLinkage = TRUE, overdispersion = my_ovdisp,
minLikelihoodRatio = 2)
## -----------------------------------------------------------------------------
table(mydata$alleleFreq)
## -----------------------------------------------------------------------------
mydata$alleleDepth["Map1-089",1:8]
mydata$genotypeLikelihood[[1,"2"]][,"Map1-089",1:8]
mydata$genotypeLikelihood[[2,"2"]][,"Map1-089",1:8]
## -----------------------------------------------------------------------------
mydata$priorProb[[1,"2"]][,1:8]
mydata$priorProb[[2,"2"]][,1:8]
## -----------------------------------------------------------------------------
mydata$ploidyChiSq[,1:8]
## -----------------------------------------------------------------------------
plot(mydata$ploidyChiSq[1,], mydata$ploidyChiSq[2,],
xlab = "Chi-squared for diploid model",
ylab = "Chi-squared for tetraploid model")
## -----------------------------------------------------------------------------
mydata$posteriorProb[[1,"2"]][,"Map1-089",1:8]
mydata$posteriorProb[[2,"2"]][,"Map1-089",1:8]
## -----------------------------------------------------------------------------
mydata <- SubsetByPloidy(mydata, ploidies = list(2))
## -----------------------------------------------------------------------------
mydata <- RemoveUngenotypedLoci(mydata)
## -----------------------------------------------------------------------------
mywm <- GetWeightedMeanGenotypes(mydata)
round(mywm[c(276, 277, 1:5), 9:12], 3)
## -----------------------------------------------------------------------------
mydata$likelyGeno_donor[,1:8]
mydata$likelyGeno_recurrent[,1:8]
## ----echo = FALSE-------------------------------------------------------------
# Determine if VariantAnnotation is installed, so we know whether to
# execute the rest of the vignette.
haveVA <- requireNamespace("VariantAnnotation", quietly = TRUE)
## ----message=FALSE, warning=FALSE, eval = haveVA------------------------------
library(VariantAnnotation)
myVCF <- system.file("extdata", "Msi01genes.vcf", package = "polyRAD")
## ----eval=FALSE---------------------------------------------------------------
# mybg <- bgzip(myVCF)
# indexTabix(mybg, format = "vcf")
## -----------------------------------------------------------------------------
pldfile <- system.file("extdata", "Msi_ploidies.txt", package = "polyRAD")
msi_ploidies <- read.table(pldfile, sep = "\t", header = FALSE)
head(msi_ploidies)
table(msi_ploidies$V2)
pld_vect <- msi_ploidies$V2
names(pld_vect) <- msi_ploidies$V1
## ----eval = haveVA------------------------------------------------------------
mydata <- VCF2RADdata(myVCF, possiblePloidies = list(2, c(2,2)),
expectedLoci = 100, expectedAlleles = 500,
taxaPloidy = pld_vect)
mydata
## ----echo = FALSE, eval = !haveVA---------------------------------------------
# # If we don't have VariantAnnotation, load in the dataset
# load(system.file("extdata", "vcfdata.RData", package = "polyRAD"))
## -----------------------------------------------------------------------------
overdispersionP <- TestOverdispersion(mydata, to_test = 8:14)
sapply(overdispersionP[names(overdispersionP) != "optimal"],
quantile, probs = c(0.01, 0.25, 0.5, 0.75, 0.99))
## -----------------------------------------------------------------------------
my_ovdisp <- overdispersionP$optimal
## -----------------------------------------------------------------------------
myhindhe <- HindHe(mydata)
myhindheByLoc <- colMeans(myhindhe, na.rm = TRUE)
hist(myhindheByLoc, col = "lightgrey",
xlab = "Hind/He", main = "Histogram of Hind/He by locus")
abline(v = 0.5, col = "blue", lwd = 2)
## -----------------------------------------------------------------------------
mydata <- AddAlleleFreqHWE(mydata)
theseloci <- GetLoci(mydata)[mydata$alleles2loc[mydata$alleleFreq >= 0.05 & mydata$alleleFreq < 0.5]]
theseloci <- unique(theseloci)
myhindheByLoc2 <- colMeans(myhindhe[mydata$taxaPloidy == 2L, theseloci], na.rm = TRUE)
hist(myhindheByLoc2, col = "lightgrey",
xlab = "Hind/He", main = "Histogram of Hind/He by locus, MAF >= 0.05")
abline(v = 0.5, col = "blue", lwd = 2)
## -----------------------------------------------------------------------------
set.seed(803)
ExpectedHindHe(mydata, inbreeding = 0.25, ploidy = 2, overdispersion = my_ovdisp,
reps = 10, contamRate = 0.001, errorRate = 0.001)
## -----------------------------------------------------------------------------
mean(myhindheByLoc < 0.24) # about 29% of markers would be removed
keeploci <- names(myhindheByLoc)[myhindheByLoc >= 0.24]
mydata <- SubsetByLocus(mydata, keeploci)
## ----message = FALSE----------------------------------------------------------
mydataHWE <- IterateHWE(mydata, tol = 1e-3, overdispersion = 10)
## -----------------------------------------------------------------------------
hist(mydataHWE$alleleFreq, breaks = 20, col = "lightgrey")
## ----message = FALSE----------------------------------------------------------
set.seed(3908)
mydataPopStruct <- IteratePopStruct(mydata, nPcsInit = 8, tol = 5e-03,
overdispersion = 10)
## -----------------------------------------------------------------------------
hist(mydataPopStruct$alleleFreq, breaks = 20, col = "lightgrey")
## -----------------------------------------------------------------------------
plot(mydataPopStruct)
## -----------------------------------------------------------------------------
myallele <- 1
freqcol <- heat.colors(101)[round(mydataPopStruct$alleleFreqByTaxa[,myallele] * 100) + 1]
plot(mydataPopStruct, pch = 21, bg = freqcol)
## -----------------------------------------------------------------------------
plot(mydataPopStruct$ploidyChiSq[1,], mydataPopStruct$ploidyChiSq[2,],
xlab = "Chi-squared for diploid model",
ylab = "Chi-squared for allotetraploid model", log = "xy")
abline(a = 0, b = 1, col = "blue", lwd = 2)
## ----message = FALSE, eval = requireNamespace("ggplot2", quietly = TRUE)------
myChiSqRat <- mydataPopStruct$ploidyChiSq[1,] / mydataPopStruct$ploidyChiSq[2,]
myChiSqRat <- tapply(myChiSqRat, mydataPopStruct$alleles2loc, mean)
allelesPerLoc <- as.vector(table(mydataPopStruct$alleles2loc))
library(ggplot2)
ggplot(mapping = aes(x = myhindheByLoc[GetLoci(mydata)], y = myChiSqRat, fill = as.factor(allelesPerLoc))) +
geom_point(shape = 21, size = 3) +
labs(x = "Hind/He", y = "Ratio of Chi-squared values, diploid to allotetraploid",
fill = "Alleles per locus") +
geom_hline(yintercept = 1) +
geom_vline(xintercept = 0.5) +
scale_fill_brewer(palette = "YlOrRd")
## -----------------------------------------------------------------------------
wmgenoPopStruct <- GetWeightedMeanGenotypes(mydataPopStruct)
wmgenoPopStruct[1:10,1:5]
## ----eval = FALSE-------------------------------------------------------------
# myHindHe <- HindHe(mydata)
# TotDepthT <- rowSums(mydata$locDepth)
## -----------------------------------------------------------------------------
print(load(system.file("extdata", "MsaHindHe0.RData", package = "polyRAD")))
## -----------------------------------------------------------------------------
myHindHeByInd <- rowMeans(myHindHe, na.rm = TRUE)
## ----eval = requireNamespace("ggplot2", quietly = TRUE)-----------------------
ggplot(data.frame(Depth = TotDepthT, HindHe = myHindHeByInd,
Ploidy = ploidies),
mapping = aes(x = Depth, y = HindHe, color = Ploidy)) +
geom_point() +
scale_x_log10() +
facet_wrap(~ Ploidy) +
geom_hline(data = data.frame(Ploidy = c("2x", "3x", "4x"),
ExpHindHe = c(1/2, 2/3, 3/4)),
mapping = aes(yintercept = ExpHindHe), lty = 2) +
labs(x = "Read Depth", y = "Hind/He", color = "Ploidy")
## -----------------------------------------------------------------------------
myHindHe2x <- myHindHe[ploidies == "2x",]
myHindHe4x <- myHindHe[ploidies == "4x",]
## -----------------------------------------------------------------------------
myHindHeByLoc2x <- colMeans(myHindHe2x, na.rm = TRUE)
hist(myHindHeByLoc2x, breaks = 50, xlab = "Hind/He",
main = "Distribution of Hind/He among loci in diploids",
col = "lightgrey")
abline(v = 0.5, col = "blue", lwd = 2)
myHindHeByLoc4x <- colMeans(myHindHe4x, na.rm = TRUE)
hist(myHindHeByLoc4x, breaks = 50, xlab = "Hind/He",
main = "Distribution of Hind/He among loci in tetraploids",
col = "lightgrey")
abline(v = 0.75, col = "blue", lwd = 2)
## -----------------------------------------------------------------------------
goodLoci <- colnames(myHindHe)[myHindHeByLoc2x < 0.5 & myHindHeByLoc4x < 0.75]
length(goodLoci) # 611 out of 1000 markers retained
head(goodLoci)
## ----eval = FALSE-------------------------------------------------------------
# library(polyRAD)
# library(VariantAnnotation)
#
# # Two files produced by the TASSEL-GBSv2 pipeline using two different
# # enzyme systems.
# NsiI_file <- "170705Msi_NsiI_genotypes.vcf.bgz"
# PstI_file <- "170608Msi_PstI_genotypes.vcf.bgz"
#
# # The vector allSam was defined outside of this script, and contains the
# # names of all samples that I wanted to import. Below I find sample names
# # within the VCF files that match those samples.
# NsiI_sam <- allSam[allSam %in% samples(scanVcfHeader(NsiI_file))]
# PstI_sam <- allSam[allSam %in% samples(scanVcfHeader(PstI_file))]
#
# # Import two RADdata objects, assuming diploidy. A large yield size was
# # used due to the computer having 64 Gb RAM; on a typical laptop you
# # would probably want to keep the default of 5000.
# PstI_RAD <- VCF2RADdata(PstI_file, samples = PstI_sam, yieldSize = 5e4,
# expectedAlleles = 1e6, expectedLoci = 2e5)
# NsiI_RAD <- VCF2RADdata(NsiI_file, samples = NsiI_sam, yieldSize = 5e4,
# expectedAlleles = 1e6, expectedLoci = 2e5)
#
# # remove any loci duplicated across the two sets
# nLoci(PstI_RAD) # 116757
# nLoci(NsiI_RAD) # 187434
# nAlleles(PstI_RAD) # 478210
# nAlleles(NsiI_RAD) # 952511
# NsiI_keeploci <- which(!GetLoci(NsiI_RAD) %in% GetLoci(PstI_RAD))
# cat(nLoci(NsiI_RAD) - length(NsiI_keeploci),
# file = "180522Num_duplicate_loci.txt") #992 duplicate
# NsiI_RAD <- SubsetByLocus(NsiI_RAD, NsiI_keeploci)
#
# # combine allele depth into one matrix
# PstI_depth <- PstI_RAD$alleleDepth
# NsiI_depth <- NsiI_RAD$alleleDepth
# total_depth <- matrix(0L, nrow = length(allSam),
# ncol = ncol(PstI_depth) + ncol(NsiI_depth),
# dimnames = list(allSam,
# c(colnames(PstI_depth),
# colnames(NsiI_depth))))
# total_depth[,colnames(PstI_depth)] <- PstI_depth[allSam,]
# total_depth[rownames(NsiI_depth),colnames(NsiI_depth)] <- NsiI_depth
#
# # combine other slots
# total_alleles2loc <- c(PstI_RAD$alleles2loc,
# NsiI_RAD$alleles2loc + nLoci(PstI_RAD))
# total_locTable <- rbind(PstI_RAD$locTable, NsiI_RAD$locTable)
# total_alleleNucleotides <- c(PstI_RAD$alleleNucleotides,
# NsiI_RAD$alleleNucleotides)
#
# # build new RADdata object and save
# total_RAD <- RADdata(total_depth, total_alleles2loc, total_locTable,
# list(2L), 0.001, total_alleleNucleotides)
# #save(total_RAD, file = "180524_RADdata_NsiIPstI.RData")
#
# # Make groups representing pairs of chromosomes, and one group for all
# # non-assembled scaffolds.
# splitlist <- list(c("^01$", "^02$"),
# c("^03$", "^04$"),
# c("^05$", "^06$"),
# c("^07$", "^08$"),
# c("^09$", "^10$"),
# c("^11$", "^12$"),
# c("^13$", "^14$", "^15$"),
# c("^16$", "^17$"),
# c("^18$", "^194"), "^SCAFFOLD")
# # split by chromosome and save seperate objects
# SplitByChromosome(total_RAD, chromlist = splitlist,
# chromlist.use.regex = TRUE, fileprefix = "180524splitRAD")
#
# # files with RADdata objects
# splitfiles <- grep("^180524splitRAD", list.files("."), value = TRUE)
#
# # list to hold markers formatted for GAPIT/FarmCPU
# GAPITlist <- list()
# length(GAPITlist) <- length(splitfiles)
#
# # loop through RADdata objects
# for(i in 1:length(splitfiles)){
# load(splitfiles[i])
# splitRADdata <- IteratePopStructLD(splitRADdata)
# GAPITlist[[i]] <- ExportGAPIT(splitRADdata)
# }
# #save(GAPITlist, file = "180524GAPITlist.RData")
#
# # put together into one dataset for FarmCPU
# GM.all <- rbind(GAPITlist[[1]]$GM, GAPITlist[[2]]$GM, GAPITlist[[3]]$GM,
# GAPITlist[[4]]$GM, GAPITlist[[5]]$GM, GAPITlist[[6]]$GM,
# GAPITlist[[7]]$GM, GAPITlist[[8]]$GM,
# GAPITlist[[9]]$GM, GAPITlist[[10]]$GM)
# GD.all <- cbind(GAPITlist[[1]]$GD, GAPITlist[[2]]$GD[,-1],
# GAPITlist[[3]]$GD[,-1], GAPITlist[[4]]$GD[,-1],
# GAPITlist[[5]]$GD[,-1], GAPITlist[[6]]$GD[,-1],
# GAPITlist[[7]]$GD[,-1], GAPITlist[[8]]$GD[,-1],
# GAPITlist[[9]]$GD[,-1], GAPITlist[[10]]$GD[,-1])
# #save(GD.all, GM.all, file = "180525GM_GD_all_polyRAD.RData") # 1076888 markers
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.