knitr::opts_chunk$set(echo = TRUE)
To test the utility of this approach, we will use simulations.
Terminology used in this package and markdown:
# Load some packages library(fsimpute) library(qtl) library(stringr) library(tidyverse) library(broom)
# Load the data load("simulations/fsimpute_simulation_results.RData") # Gather the results and summarize results.sum <- results.full %>% mutate(fam_size = as.factor(fam_size)) %>% # Rename the factors rename(`Selfing Generations` = self_gen, `Maximum Missingness` = max_miss, `Founder Accuracy` = fou_acc, `Founder Missingness` = fou_miss, `Final Accuracy` = fin_acc, `Final Missingness` = fin_miss, `Family Size` = fam_size) %>% gather(param, value, -`Selfing Generations`:-rep) %>% group_by(`Selfing Generations`, `Family Size`, `Maximum Missingness`, param) %>% summarize(mean = mean(value), se = sd(value) / sqrt(n())) # Plot # Imputation Accuracy gp <- results.sum %>% filter(param %in% c("Founder Accuracy", "Final Accuracy")) %>% ggplot(aes(x = `Maximum Missingness`, y = mean, col = `Family Size`)) + geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.05) + geom_point(size = 2) + geom_line() + ylab("Imputation Accuracy") + ggtitle("Imputation Accuracy in De Novo Simulations") gp + facet_wrap(c("param", "`Selfing Generations`"), labeller = labeller(param = label_value, `Selfing Generations` = label_both)) # Save plot ggsave(filename = "simulations/de_novo_accuracy.jpg", height = 5, width = 7) # Remaining missingness gp <- results.sum %>% filter(param %in% c("Founder Missingness", "Final Missingness")) %>% ggplot(aes(x = `Maximum Missingness`, y = mean, col = `Family Size`)) + geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.05) + geom_point(size = 2) + geom_line() + ylab("Remaining Missing Data") + ggtitle("Missing Data After Imputation in De Novo Simulations") gp + facet_wrap(c("param", "`Selfing Generations`"), labeller = labeller(param = label_value, `Selfing Generations` = label_both)) # Save plot ggsave(filename = "simulations/de_novo_missingness.jpg", height = 5, width = 7)
These simulations will use real barley data for the founder lines
library(GSSimTPUpdate) data("CAP.markers") data("CAP.haploids") # Change the genetic map ## Select only markers on chromosomes 1 and 2 map <- CAP.markers %>% split(.$chrom) %>% map(function(chr.df) { chr <- chr.df$pos * 100 names(chr) <- chr.df$rs class(chr) <- "A" return(chr) }) # Sample those markers in the haploid matrix haploids <- CAP.haploids[,unlist(lapply(map, names))] # Combine to form diploid genotypes genos <- as.data.frame(haploids) %>% split(rep(1:(nrow(.) / 2), each = 2)) %>% # Apply a sum function over each set of haploids map(colSums) %>% do.call("rbind", .)
# Apply a function over the results data.frame results.full <- design %>% by_row(function(iter) { # Separate parameters selfing.gen <- as.numeric(iter[1]) size <- as.numeric(iter[2]) miss <- as.numeric(iter[3]) rep <- as.numeric(iter[4]) # Simulate the family fam.cross <- sim.cross(map = map, n.ind = size, type = "bcsft", cross.scheme = c(0, (selfing.gen + 1))) # Generate founder genotypes by randomly sampling the CAP genotypes founders.recode <- genos[sample(nrow(genos), 2),] founders.recode <- lapply(X = map, FUN = function(chr) founders.recode[,names(chr)]) # Extract the progeny genotypes from the cross finals <- lapply(X = fam.cross$geno, FUN = function(chr) chr$data) # Recode the finals to z {0, 1, 2} format finals.recode <- mapply(founders.recode, finals, FUN = function(p.chr, f.chr) { # Iterate over markers new.genos <- sapply(X = seq_len(ncol(p.chr)), FUN = function(i) { # Pull out parental genotypes pars <- p.chr[,i] names(pars) <- c(1,3) # If the parents are the same, the het is the same as the homs if (all(pars[1] == pars)) { het <- pars[1] } else { het <- 1 } # Convert the genotypes ifelse(f.chr[,i] == 1, pars["1"], ifelse(f.chr[,i] == 2, het, ifelse(f.chr[,i] == 3, pars["3"], NA))) }) colnames(new.genos) <- colnames(p.chr) return(list(new.genos)) }) # Combine matricies and simulate missing data missing.recode <- list(founders.recode, finals.recode) %>% pmap(function(fo.chr, fi.chr) { # Combine matrices chr <- rbind(fo.chr, fi.chr) # First generate the missing data proportions from a uniform distribution miss.prob <- runif(n = ncol(chr), min = 0, max = miss) miss.mat <- sapply(X = miss.prob, FUN = function(prob) sample(x = c(T, F), size = nrow(chr), replace = T, prob = c(prob, 1 - prob))) chr[miss.mat] <- NA fo.chr <- chr[1:2,] fi.chr <- chr[-1:-2,] list(founders = fo.chr, finals = fi.chr) }) # Grab the founders and finals matrices founders.missing <- lapply(X = missing.recode, FUN = function(chr) chr$founders) finals.missing <- lapply(X = missing.recode, FUN = function(chr) chr$finals) # Create the cross object new.cross <- prep_cross(map = map, founders = founders.missing, finals = finals.missing, selfing.gen = selfing.gen) # Impute impute.cross <- fsimpute(cross = new.cross, prob.threshold = 0.7) # Founder accuracy fi.combined <- do.call("cbind", impute.cross$imputed$founders) f.combined <- do.call("cbind", founders.recode) tab <- table(fi.combined, f.combined, dnn = list("imputed", "original")) fou_acc <- sum(diag(tab)) / sum(tab) # Finals accuracy fi.combined <- do.call("cbind", impute.cross$imputed$finals) f.combined <- do.call("cbind", finals.recode) tab <- table(fi.combined, f.combined, dnn = list("imputed", "original")) fin_acc <- sum(diag(tab)) / sum(tab) # Missingness fou_miss <- impute.cross$imputed$stats$missingness["founders.missing","mean"] fin_miss <- impute.cross$imputed$stats$missingness["finals.missing","mean"] # Return results c(fou_acc, fin_acc, fou_miss, fin_miss) }, .collate = "cols") %>% # Rename the output rename(fou_acc = .out1, fin_acc = .out2, fou_miss = .out3, fin_miss = .out4) # Save save("results.full", file = "fsimpute_realData_simulation_results.RData")
# Load the data load("simulations/fsimpute_realData_simulation_results.RData") # Plot # Imputation Accuracy gp <- results.sum %>% filter(param %in% c("Founder Accuracy", "Final Accuracy")) %>% ggplot(aes(x = `Maximum Missingness`, y = mean, col = `Family Size`)) + geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.05) + geom_point(size = 2) + geom_line() + ylab("Imputation Accuracy") + ggtitle("Imputation Accuracy in Real-Data Simulations") gp + facet_wrap(c("param", "`Selfing Generations`"), labeller = labeller(param = label_value, `Selfing Generations` = label_both)) # Save plot ggsave(filename = "simulations/read_data_accuracy.jpg", height = 5, width = 7) # Remaining missingness gp <- results.sum %>% filter(param %in% c("Founder Missingness", "Final Missingness")) %>% ggplot(aes(x = `Maximum Missingness`, y = mean, col = `Family Size`)) + geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.05) + geom_point(size = 2) + geom_line() + ylab("Remaining Missing Data") + ggtitle("Missing Data After Imputation in Real-Data Simulations") gp + facet_wrap(c("param", "`Selfing Generations`"), labeller = labeller(param = label_value, `Selfing Generations` = label_both)) # Save plot ggsave(filename = "simulations/read_data_missingness.jpg", height = 5, width = 7)
## Extra testing ```r # Function for exact test segregation_test <- function(observed, expected = "hom0-hom2", n, selfing.gen) { # Define the expected segregation ratios if (expected == "hom0-hom2") { het1 <- 0.5 ^ selfing.gen hom0 <- hom2 <- (1 - het1) / 2 } # Hom = 0 if (expected == "hom0-het") { het1 <- 0.5 ^ (selfing.gen + 1) hom0 <- 0.5 + ((0.5 - het1) / 2) hom2 <- 1 - het1 - hom0 } # Hom = 2 if (expected == "het-hom2") { het1 <- 0.5 ^ (selfing.gen + 1) hom2 <- 0.5 + ((0.5 - het1) / 2) hom0 <- 1 - het1 - hom0 } if (expected == "het-het") { het1 <- 0.5 ^ (selfing.gen + 1) hom0 <- hom2 <- (1 - het1) / 2 } exp_ratio <- c(`0` = hom0, `1` = het1, `2` = hom2) * n # Calculate chi-square stat Xsq <- sum( (observed - exp_ratio) ^ 2 ) # return the p-value pchisq(Xsq, df = 2) } ## Another option for detecting parental states is to look at those markers where ## at least one of the founders is observed, then look at the progeny to determine ## the other genotype list(founders.missing, finals.missing) %>% map(function(fou.chr, fin.chr) { # Combine chr <- rbind(fou.chr, fin.chr) # Apply a function over sites apply(X = chr, MARGIN = 2, FUN = function(snp) { # Is only one site missing in the founders? fou <- snp[1:2] if ( sum(is.na(fou)) == 1 ) { # Is the observed founder het or hom? obs.fou <- fou[!is.na(fou)] names(obs.fou) <- ifelse( fou[!is.na(fou)] %in% c(0,2), "hom", "het" ) # Survey the progeny fin <- snp[-1:-2] # Are the progeny polymorphic? if (length( unique( na.omit(fin) ) ) > 1) { # Find the observed observed <- c(sum(fin == 0, na.rm = T), sum(fin == 1, na.rm = T), sum(fin == 2, na.rm = T)) # If the observed founder is homozygote, first test for another hom if (names(obs.fou) == "hom") { pval <- segregation_test(observed, "hom-hom", length(fin), selfing.gen) # If the pvalus is greather than 0.05, break if (pval > alpha) { fou[is.na(fou)] <- setdiff(c(0,2), obs.fou) return(fou) } else { pval <- ifelse(obs.fou == 0, segregation_test(observed, "hom-het", length(fin), selfing.gen), segregation_test(observed, "het-hom", length(fin), selfing.gen)) if (pval > 0.05) { fou[is.na(fou)] <- 1 return(fou) }} # If the observed value is not homozygous, test for either hom } else { pval0 <- segregation_test(observed, "hom-het", length(fin), selfing.gen) # Now test for a het # First test for another homozygote
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.