polyRADtutorial.R
In polyRAD: Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, fig.width = 6, fig.height = 6)

## -----------------------------------------------------------------------------
library(polyRAD)
maphmcfile <- system.file("extdata", "ClareMap_HapMap.hmc.txt", 
                          package = "polyRAD")
maphmcfile

mydata <- readHMC(maphmcfile,
                  possiblePloidies = list(2, c(2, 2)),
                  taxaPloidy = 2)
mydata

## -----------------------------------------------------------------------------
GetTaxa(mydata)[c(1:10,293:299)]

## -----------------------------------------------------------------------------
mydata <- SetDonorParent(mydata, "Kaskade-Justin")
mydata <- SetRecurrentParent(mydata, "Zebrinus-Justin")

## -----------------------------------------------------------------------------
mydata$taxaPloidy[c("IGR-2011-001", "p196-150A-c", "p877-348-b")] <- 1L
mydata

## -----------------------------------------------------------------------------
alignfile <- system.file("extdata", "ClareMap_alignments.csv", 
                         package = "polyRAD")

aligndata <- read.csv(alignfile, row.names = 1)
head(aligndata)

mydata$locTable$Chr <- aligndata[GetLoci(mydata), 1]
mydata$locTable$Pos <- aligndata[GetLoci(mydata), 2]
head(mydata$locTable)

## ----eval = FALSE-------------------------------------------------------------
#  mydata <- AddPCA(mydata)

## -----------------------------------------------------------------------------
load(system.file("extdata", "examplePCA.RData", package = "polyRAD"))
mydata$PCA <- examplePCA

## -----------------------------------------------------------------------------
plot(mydata)

## -----------------------------------------------------------------------------
realprogeny <- GetTaxa(mydata)[mydata$PCA[,"PC1"] > -10 &
                                 mydata$PCA[,"PC1"] < 10]
# eliminate the one doubled haploid line in this group
realprogeny <- realprogeny[!realprogeny %in% c("IGR-2011-001", "p196-150A-c",
                                               "p877-348-b")]
# also retain parents
keeptaxa <- c(realprogeny, GetDonorParent(mydata), GetRecurrentParent(mydata))

mydata <- SubsetByTaxon(mydata, taxa = keeptaxa)
plot(mydata)

## -----------------------------------------------------------------------------
mydata2 <- PipelineMapping2Parents(mydata, 
                                   freqAllowedDeviation = 0.06,
                                   useLinkage = FALSE,
                                   minLikelihoodRatio = 2)

## -----------------------------------------------------------------------------
overdispersionP <- TestOverdispersion(mydata2, to_test = 8:15)

sapply(overdispersionP[names(overdispersionP) != "optimal"],
       quantile, probs = c(0.01, 0.25, 0.5, 0.75, 0.99))

## -----------------------------------------------------------------------------
my_ovdisp <- overdispersionP$optimal

## ----message = FALSE----------------------------------------------------------
myhindhe <- HindHeMapping(mydata, ploidy = 2L)
hist(colMeans(myhindhe, na.rm = TRUE), col = "lightgrey",
     xlab = "Hind/He", main = "Histogram of Hind/He by locus")

## -----------------------------------------------------------------------------
set.seed(720)
ExpectedHindHeMapping(mydata, ploidy = 2, overdispersion = my_ovdisp, reps = 2,
                      contamRate = 0.001, errorRate = 0.001)

## -----------------------------------------------------------------------------
goodMarkers <- colnames(myhindhe)[which(colMeans(myhindhe, na.rm = TRUE) < 0.53 &
                                          colMeans(myhindhe, na.rm = TRUE) > 0.43)]
mydata <- SubsetByLocus(mydata, goodMarkers)

## -----------------------------------------------------------------------------
mydata <- PipelineMapping2Parents(mydata, 
                                  freqAllowedDeviation = 0.06,
                                  useLinkage = TRUE, overdispersion = my_ovdisp,
                                  minLikelihoodRatio = 2)

## -----------------------------------------------------------------------------
table(mydata$alleleFreq)

## -----------------------------------------------------------------------------
mydata$alleleDepth["Map1-089",1:8]
mydata$genotypeLikelihood[[1,"2"]][,"Map1-089",1:8]
mydata$genotypeLikelihood[[2,"2"]][,"Map1-089",1:8]

## -----------------------------------------------------------------------------
mydata$priorProb[[1,"2"]][,1:8]
mydata$priorProb[[2,"2"]][,1:8]

## -----------------------------------------------------------------------------
mydata$ploidyChiSq[,1:8]

## -----------------------------------------------------------------------------
plot(mydata$ploidyChiSq[1,], mydata$ploidyChiSq[2,], 
     xlab = "Chi-squared for diploid model",
     ylab = "Chi-squared for tetraploid model")

## -----------------------------------------------------------------------------
mydata$posteriorProb[[1,"2"]][,"Map1-089",1:8]
mydata$posteriorProb[[2,"2"]][,"Map1-089",1:8]

## -----------------------------------------------------------------------------
mydata <- SubsetByPloidy(mydata, ploidies = list(2))

## -----------------------------------------------------------------------------
mydata <- RemoveUngenotypedLoci(mydata)

## -----------------------------------------------------------------------------
mywm <- GetWeightedMeanGenotypes(mydata)
round(mywm[c(276, 277, 1:5), 9:12], 3)

## -----------------------------------------------------------------------------
mydata$likelyGeno_donor[,1:8]
mydata$likelyGeno_recurrent[,1:8]

## ----echo = FALSE-------------------------------------------------------------
# Determine if VariantAnnotation is installed, so we know whether to
# execute the rest of the vignette.
haveVA <- requireNamespace("VariantAnnotation", quietly = TRUE)

## ----message=FALSE, warning=FALSE, eval = haveVA------------------------------
library(VariantAnnotation)

myVCF <- system.file("extdata", "Msi01genes.vcf", package = "polyRAD")

## ----eval=FALSE---------------------------------------------------------------
#  mybg <- bgzip(myVCF)
#  indexTabix(mybg, format = "vcf")

## -----------------------------------------------------------------------------
pldfile <- system.file("extdata", "Msi_ploidies.txt", package = "polyRAD")
msi_ploidies <- read.table(pldfile, sep = "\t", header = FALSE)
head(msi_ploidies)
table(msi_ploidies$V2)
pld_vect <- msi_ploidies$V2
names(pld_vect) <- msi_ploidies$V1

## ----eval = haveVA------------------------------------------------------------
mydata <- VCF2RADdata(myVCF, possiblePloidies = list(2, c(2,2)),
                      expectedLoci = 100, expectedAlleles = 500,
                      taxaPloidy = pld_vect)
mydata

## ----echo = FALSE, eval = !haveVA---------------------------------------------
#  # If we don't have VariantAnnotation, load in the dataset
#  load(system.file("extdata", "vcfdata.RData", package = "polyRAD"))

## -----------------------------------------------------------------------------
overdispersionP <- TestOverdispersion(mydata, to_test = 8:14)

sapply(overdispersionP[names(overdispersionP) != "optimal"],
       quantile, probs = c(0.01, 0.25, 0.5, 0.75, 0.99))

## -----------------------------------------------------------------------------
my_ovdisp <- overdispersionP$optimal

## -----------------------------------------------------------------------------
myhindhe <- HindHe(mydata)
myhindheByLoc <- colMeans(myhindhe, na.rm = TRUE)
hist(myhindheByLoc, col = "lightgrey",
     xlab = "Hind/He", main = "Histogram of Hind/He by locus")
abline(v = 0.5, col = "blue", lwd = 2)

## -----------------------------------------------------------------------------
mydata <- AddAlleleFreqHWE(mydata)
theseloci <- GetLoci(mydata)[mydata$alleles2loc[mydata$alleleFreq >= 0.05 & mydata$alleleFreq < 0.5]]
theseloci <- unique(theseloci)
myhindheByLoc2 <- colMeans(myhindhe[mydata$taxaPloidy == 2L, theseloci], na.rm = TRUE)
hist(myhindheByLoc2, col = "lightgrey",
     xlab = "Hind/He", main = "Histogram of Hind/He by locus, MAF >= 0.05")
abline(v = 0.5, col = "blue", lwd = 2)

## -----------------------------------------------------------------------------
set.seed(803)
ExpectedHindHe(mydata, inbreeding = 0.25, ploidy = 2, overdispersion = my_ovdisp,
               reps = 10, contamRate = 0.001, errorRate = 0.001)

## -----------------------------------------------------------------------------
mean(myhindheByLoc < 0.24) # about 29% of markers would be removed
keeploci <- names(myhindheByLoc)[myhindheByLoc >= 0.24]
mydata <- SubsetByLocus(mydata, keeploci)

## ----message = FALSE----------------------------------------------------------
mydataHWE <- IterateHWE(mydata, tol = 1e-3, overdispersion = 10)

## -----------------------------------------------------------------------------
hist(mydataHWE$alleleFreq, breaks = 20, col = "lightgrey")

## ----message = FALSE----------------------------------------------------------
set.seed(3908)
mydataPopStruct <- IteratePopStruct(mydata, nPcsInit = 8, tol = 5e-03,
                                    overdispersion = 10)

## -----------------------------------------------------------------------------
hist(mydataPopStruct$alleleFreq, breaks = 20, col = "lightgrey")

## -----------------------------------------------------------------------------
plot(mydataPopStruct)

## -----------------------------------------------------------------------------
myallele <- 1
freqcol <- heat.colors(101)[round(mydataPopStruct$alleleFreqByTaxa[,myallele] * 100) + 1]
plot(mydataPopStruct, pch = 21, bg = freqcol)

## -----------------------------------------------------------------------------
plot(mydataPopStruct$ploidyChiSq[1,], mydataPopStruct$ploidyChiSq[2,], 
     xlab = "Chi-squared for diploid model",
     ylab = "Chi-squared for allotetraploid model", log = "xy")
abline(a = 0, b = 1, col = "blue", lwd = 2)

## ----message = FALSE, eval = requireNamespace("ggplot2", quietly = TRUE)------
myChiSqRat <- mydataPopStruct$ploidyChiSq[1,] / mydataPopStruct$ploidyChiSq[2,]
myChiSqRat <- tapply(myChiSqRat, mydataPopStruct$alleles2loc, mean)
allelesPerLoc <- as.vector(table(mydataPopStruct$alleles2loc))

library(ggplot2)
ggplot(mapping = aes(x = myhindheByLoc[GetLoci(mydata)], y = myChiSqRat, fill = as.factor(allelesPerLoc))) +
  geom_point(shape = 21, size = 3) +
  labs(x = "Hind/He", y = "Ratio of Chi-squared values, diploid to allotetraploid",
       fill = "Alleles per locus") +
  geom_hline(yintercept = 1) +
  geom_vline(xintercept = 0.5) +
  scale_fill_brewer(palette = "YlOrRd")

## -----------------------------------------------------------------------------
wmgenoPopStruct <- GetWeightedMeanGenotypes(mydataPopStruct)
wmgenoPopStruct[1:10,1:5]

## ----eval = FALSE-------------------------------------------------------------
#  myHindHe <- HindHe(mydata)
#  TotDepthT <- rowSums(mydata$locDepth)

## -----------------------------------------------------------------------------
print(load(system.file("extdata", "MsaHindHe0.RData", package = "polyRAD")))

## -----------------------------------------------------------------------------
myHindHeByInd <- rowMeans(myHindHe, na.rm = TRUE)

## ----eval = requireNamespace("ggplot2", quietly = TRUE)-----------------------
ggplot(data.frame(Depth = TotDepthT, HindHe = myHindHeByInd,
                  Ploidy = ploidies),
  mapping = aes(x = Depth, y = HindHe, color = Ploidy)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ Ploidy) +
  geom_hline(data = data.frame(Ploidy = c("2x", "3x", "4x"),
                               ExpHindHe = c(1/2, 2/3, 3/4)),
             mapping = aes(yintercept = ExpHindHe), lty = 2) +
  labs(x = "Read Depth", y = "Hind/He", color = "Ploidy")

## -----------------------------------------------------------------------------
myHindHe2x <- myHindHe[ploidies == "2x",]
myHindHe4x <- myHindHe[ploidies == "4x",]

## -----------------------------------------------------------------------------
myHindHeByLoc2x <- colMeans(myHindHe2x, na.rm = TRUE)
hist(myHindHeByLoc2x, breaks = 50, xlab = "Hind/He",
     main = "Distribution of Hind/He among loci in diploids",
     col = "lightgrey")
abline(v = 0.5, col = "blue", lwd = 2)

myHindHeByLoc4x <- colMeans(myHindHe4x, na.rm = TRUE)
hist(myHindHeByLoc4x, breaks = 50, xlab = "Hind/He",
     main = "Distribution of Hind/He among loci in tetraploids",
     col = "lightgrey")
abline(v = 0.75, col = "blue", lwd = 2)

## -----------------------------------------------------------------------------
goodLoci <- colnames(myHindHe)[myHindHeByLoc2x < 0.5 & myHindHeByLoc4x < 0.75]
length(goodLoci) # 611 out of 1000 markers retained
head(goodLoci)

## ----eval = FALSE-------------------------------------------------------------
#  library(polyRAD)
#  library(VariantAnnotation)
#  
#  # Two files produced by the TASSEL-GBSv2 pipeline using two different
#  # enzyme systems.
#  NsiI_file <- "170705Msi_NsiI_genotypes.vcf.bgz"
#  PstI_file <- "170608Msi_PstI_genotypes.vcf.bgz"
#  
#  # The vector allSam was defined outside of this script, and contains the
#  # names of all samples that I wanted to import.  Below I find sample names
#  # within the VCF files that match those samples.
#  NsiI_sam <- allSam[allSam %in% samples(scanVcfHeader(NsiI_file))]
#  PstI_sam <- allSam[allSam %in% samples(scanVcfHeader(PstI_file))]
#  
#  # Import two RADdata objects, assuming diploidy.  A large yield size was
#  # used due to the computer having 64 Gb RAM; on a typical laptop you
#  # would probably want to keep the default of 5000.
#  PstI_RAD <- VCF2RADdata(PstI_file, samples = PstI_sam, yieldSize = 5e4,
#                          expectedAlleles = 1e6, expectedLoci = 2e5)
#  NsiI_RAD <- VCF2RADdata(NsiI_file, samples = NsiI_sam, yieldSize = 5e4,
#                          expectedAlleles = 1e6, expectedLoci = 2e5)
#  
#  # remove any loci duplicated across the two sets
#  nLoci(PstI_RAD)    # 116757
#  nLoci(NsiI_RAD)    # 187434
#  nAlleles(PstI_RAD) # 478210
#  nAlleles(NsiI_RAD) # 952511
#  NsiI_keeploci <- which(!GetLoci(NsiI_RAD) %in% GetLoci(PstI_RAD))
#  cat(nLoci(NsiI_RAD) - length(NsiI_keeploci),
#      file = "180522Num_duplicate_loci.txt") #992 duplicate
#  NsiI_RAD <- SubsetByLocus(NsiI_RAD, NsiI_keeploci)
#  
#  # combine allele depth into one matrix
#  PstI_depth <- PstI_RAD$alleleDepth
#  NsiI_depth <- NsiI_RAD$alleleDepth
#  total_depth <- matrix(0L, nrow = length(allSam),
#                        ncol = ncol(PstI_depth) + ncol(NsiI_depth),
#                        dimnames = list(allSam,
#                                        c(colnames(PstI_depth),
#                                          colnames(NsiI_depth))))
#  total_depth[,colnames(PstI_depth)] <- PstI_depth[allSam,]
#  total_depth[rownames(NsiI_depth),colnames(NsiI_depth)] <- NsiI_depth
#  
#  # combine other slots
#  total_alleles2loc <- c(PstI_RAD$alleles2loc,
#                         NsiI_RAD$alleles2loc + nLoci(PstI_RAD))
#  total_locTable <- rbind(PstI_RAD$locTable, NsiI_RAD$locTable)
#  total_alleleNucleotides <- c(PstI_RAD$alleleNucleotides,
#                               NsiI_RAD$alleleNucleotides)
#  
#  # build new RADdata object and save
#  total_RAD <- RADdata(total_depth, total_alleles2loc, total_locTable,
#                       list(2L), 0.001, total_alleleNucleotides)
#  #save(total_RAD, file = "180524_RADdata_NsiIPstI.RData")
#  
#  # Make groups representing pairs of chromosomes, and one group for all
#  # non-assembled scaffolds.
#  splitlist <- list(c("^01$", "^02$"),
#                    c("^03$", "^04$"),
#                    c("^05$", "^06$"),
#                    c("^07$", "^08$"),
#                    c("^09$", "^10$"),
#                    c("^11$", "^12$"),
#                    c("^13$", "^14$", "^15$"),
#                    c("^16$", "^17$"),
#                    c("^18$", "^194"), "^SCAFFOLD")
#  # split by chromosome and save seperate objects
#  SplitByChromosome(total_RAD, chromlist = splitlist,
#                    chromlist.use.regex = TRUE, fileprefix = "180524splitRAD")
#  
#  # files with RADdata objects
#  splitfiles <- grep("^180524splitRAD", list.files("."), value = TRUE)
#  
#  # list to hold markers formatted for GAPIT/FarmCPU
#  GAPITlist <- list()
#  length(GAPITlist) <- length(splitfiles)
#  
#  # loop through RADdata objects
#  for(i in 1:length(splitfiles)){
#    load(splitfiles[i])
#    splitRADdata <- IteratePopStructLD(splitRADdata)
#    GAPITlist[[i]] <- ExportGAPIT(splitRADdata)
#  }
#  #save(GAPITlist, file = "180524GAPITlist.RData")
#  
#  # put together into one dataset for FarmCPU
#  GM.all <- rbind(GAPITlist[[1]]$GM, GAPITlist[[2]]$GM, GAPITlist[[3]]$GM,
#                  GAPITlist[[4]]$GM, GAPITlist[[5]]$GM, GAPITlist[[6]]$GM,
#                  GAPITlist[[7]]$GM, GAPITlist[[8]]$GM,
#                  GAPITlist[[9]]$GM, GAPITlist[[10]]$GM)
#  GD.all <- cbind(GAPITlist[[1]]$GD, GAPITlist[[2]]$GD[,-1],
#                  GAPITlist[[3]]$GD[,-1], GAPITlist[[4]]$GD[,-1],
#                  GAPITlist[[5]]$GD[,-1], GAPITlist[[6]]$GD[,-1],
#                  GAPITlist[[7]]$GD[,-1], GAPITlist[[8]]$GD[,-1],
#                  GAPITlist[[9]]$GD[,-1], GAPITlist[[10]]$GD[,-1])
#  #save(GD.all, GM.all, file = "180525GM_GD_all_polyRAD.RData") # 1076888 markers
Any scripts or data that you put into this service are public.
polyRAD documentation built on Nov. 10, 2022, 5:14 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
polyRAD
Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

inst/doc/polyRADtutorial.R
In polyRAD: Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

Try the polyRAD package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

polyRAD Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

inst/doc/polyRADtutorial.R In polyRAD: Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

Try the polyRAD package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

polyRAD
Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids

inst/doc/polyRADtutorial.R
In polyRAD: Genotype Calling with Uncertainty from Sequencing Data in Polyploids and Diploids