In neyhartj/fsimpute: Imputing genotype data in bi-parental populations of finite selfing

knitr::opts_chunk$set(echo = TRUE)

Introduction

To test the utility of this approach, we will use simulations.

Terminology used in this package and markdown:

founder(s) - the parents of a family. In the case of a bi-parental population, the number of founders is 2
final(s) - the progeny of a cross between the founders. The finals are the "observed" progeny. That is, through inbreeding, different generations of progeny are created, but presumably only one generation is genotyped. These are the "observed" progeny.

# Load some packages
library(fsimpute)
library(qtl)
library(stringr)
library(tidyverse)
library(broom)

De novo Simulations

Plot

# Load the data
load("simulations/fsimpute_simulation_results.RData")

# Gather the results and summarize
results.sum <- results.full %>% 
  mutate(fam_size = as.factor(fam_size)) %>%
  # Rename the factors
  rename(`Selfing Generations` = self_gen,
         `Maximum Missingness` = max_miss,
         `Founder Accuracy` = fou_acc,
         `Founder Missingness` = fou_miss,
         `Final Accuracy` = fin_acc,
         `Final Missingness` = fin_miss,
         `Family Size` = fam_size) %>%
  gather(param, value, -`Selfing Generations`:-rep) %>%
  group_by(`Selfing Generations`, `Family Size`, `Maximum Missingness`, param) %>%
  summarize(mean = mean(value),
            se = sd(value) / sqrt(n()))


# Plot
# Imputation Accuracy
gp <- results.sum %>%
  filter(param %in% c("Founder Accuracy", "Final Accuracy")) %>%
  ggplot(aes(x = `Maximum Missingness`, y = mean, col = `Family Size`)) + 
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.05) +
  geom_point(size = 2) + 
  geom_line() +
  ylab("Imputation Accuracy") +
  ggtitle("Imputation Accuracy in De Novo Simulations")
gp + facet_wrap(c("param", "`Selfing Generations`"), 
                labeller = labeller(param = label_value, `Selfing Generations` = label_both))
# Save plot
ggsave(filename = "simulations/de_novo_accuracy.jpg", height = 5, width = 7)



# Remaining missingness
gp <- results.sum %>%
  filter(param %in% c("Founder Missingness", "Final Missingness")) %>%
  ggplot(aes(x = `Maximum Missingness`, y = mean, col = `Family Size`)) + 
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.05) +
  geom_point(size = 2) + 
  geom_line() +
  ylab("Remaining Missing Data") +
  ggtitle("Missing Data After Imputation in De Novo Simulations")
gp + facet_wrap(c("param", "`Selfing Generations`"), 
                labeller = labeller(param = label_value, `Selfing Generations` = label_both))
# Save plot
ggsave(filename = "simulations/de_novo_missingness.jpg", height = 5, width = 7)

Using Real Data in the Simulation

These simulations will use real barley data for the founder lines

library(GSSimTPUpdate)

data("CAP.markers")
data("CAP.haploids")

# Change the genetic map
## Select only markers on chromosomes 1 and 2
map <- CAP.markers %>%
  split(.$chrom) %>%

  map(function(chr.df) {

    chr <- chr.df$pos * 100
    names(chr) <- chr.df$rs
    class(chr) <- "A"
    return(chr) })

# Sample those markers in the haploid matrix
haploids <- CAP.haploids[,unlist(lapply(map, names))]

# Combine to form diploid genotypes
genos <- as.data.frame(haploids) %>%
  split(rep(1:(nrow(.) / 2), each = 2)) %>%

  # Apply a sum function over each set of haploids
  map(colSums) %>%
  do.call("rbind", .)

Start the Simulation

# Apply a function over the results data.frame
results.full <- design %>%
  by_row(function(iter) {

    # Separate parameters
    selfing.gen <- as.numeric(iter[1])
    size <- as.numeric(iter[2])
    miss <- as.numeric(iter[3])
    rep <- as.numeric(iter[4])

    # Simulate the family
    fam.cross <- sim.cross(map = map, n.ind = size, type = "bcsft", 
                           cross.scheme = c(0, (selfing.gen + 1)))

    # Generate founder genotypes by randomly sampling the CAP genotypes
    founders.recode <- genos[sample(nrow(genos), 2),]
    founders.recode <- lapply(X = map, FUN = function(chr) founders.recode[,names(chr)])


    # Extract the progeny genotypes from the cross
    finals <- lapply(X = fam.cross$geno, FUN = function(chr) chr$data)

    # Recode the finals to z {0, 1, 2} format
    finals.recode <- mapply(founders.recode, finals, FUN = function(p.chr, f.chr) {

        # Iterate over markers
        new.genos <- sapply(X = seq_len(ncol(p.chr)), FUN = function(i) {
          # Pull out parental genotypes
          pars <- p.chr[,i]
          names(pars) <- c(1,3)

          # If the parents are the same, the het is the same as the homs
          if (all(pars[1] == pars)) {
            het <- pars[1]
          } else {
            het <- 1
          }

          # Convert the genotypes
          ifelse(f.chr[,i] == 1, pars["1"], 
                 ifelse(f.chr[,i] == 2, het, 
                        ifelse(f.chr[,i] == 3, pars["3"], NA))) })

        colnames(new.genos) <- colnames(p.chr)

        return(list(new.genos)) })

    # Combine matricies and simulate missing data
    missing.recode <- list(founders.recode, finals.recode) %>%
      pmap(function(fo.chr, fi.chr) {

        # Combine matrices
        chr <- rbind(fo.chr, fi.chr)

        # First generate the missing data proportions from a uniform distribution
        miss.prob <- runif(n = ncol(chr), min = 0, max = miss)

        miss.mat <- sapply(X = miss.prob, FUN = function(prob) 
          sample(x = c(T, F), size = nrow(chr), replace = T, prob = c(prob, 1 - prob)))

        chr[miss.mat] <- NA

        fo.chr <- chr[1:2,]
        fi.chr <- chr[-1:-2,]

        list(founders = fo.chr, finals = fi.chr) })

    # Grab the founders and finals matrices
    founders.missing <- lapply(X = missing.recode, FUN = function(chr) chr$founders)
    finals.missing <- lapply(X = missing.recode, FUN = function(chr) chr$finals)

    # Create the cross object
    new.cross <- prep_cross(map = map, founders = founders.missing, 
                            finals = finals.missing, selfing.gen = selfing.gen)


    # Impute
    impute.cross <- fsimpute(cross = new.cross, prob.threshold = 0.7)


    # Founder accuracy
    fi.combined <- do.call("cbind", impute.cross$imputed$founders)
    f.combined <- do.call("cbind", founders.recode)

    tab <- table(fi.combined, f.combined, dnn = list("imputed", "original"))

    fou_acc <- sum(diag(tab)) / sum(tab)


    # Finals accuracy
    fi.combined <- do.call("cbind", impute.cross$imputed$finals)
    f.combined <- do.call("cbind", finals.recode)

    tab <- table(fi.combined, f.combined, dnn = list("imputed", "original"))

    fin_acc <- sum(diag(tab)) / sum(tab)

    # Missingness
    fou_miss <- impute.cross$imputed$stats$missingness["founders.missing","mean"]
    fin_miss <- impute.cross$imputed$stats$missingness["finals.missing","mean"]

    # Return results
    c(fou_acc, fin_acc, fou_miss, fin_miss) }, .collate = "cols") %>%

  # Rename the output
  rename(fou_acc = .out1, fin_acc = .out2, fou_miss = .out3, fin_miss = .out4)

  # Save
  save("results.full", file = "fsimpute_realData_simulation_results.RData")

Plot

# Load the data
load("simulations/fsimpute_realData_simulation_results.RData")




# Plot
# Imputation Accuracy
gp <- results.sum %>%
  filter(param %in% c("Founder Accuracy", "Final Accuracy")) %>%
  ggplot(aes(x = `Maximum Missingness`, y = mean, col = `Family Size`)) + 
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.05) +
  geom_point(size = 2) + 
  geom_line() +
  ylab("Imputation Accuracy") +
  ggtitle("Imputation Accuracy in Real-Data Simulations")
gp + facet_wrap(c("param", "`Selfing Generations`"), 
                labeller = labeller(param = label_value, `Selfing Generations` = label_both))
# Save plot
ggsave(filename = "simulations/read_data_accuracy.jpg", height = 5, width = 7)



# Remaining missingness
gp <- results.sum %>%
  filter(param %in% c("Founder Missingness", "Final Missingness")) %>%
  ggplot(aes(x = `Maximum Missingness`, y = mean, col = `Family Size`)) + 
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.05) +
  geom_point(size = 2) + 
  geom_line() +
  ylab("Remaining Missing Data") +
  ggtitle("Missing Data After Imputation in Real-Data Simulations")
gp + facet_wrap(c("param", "`Selfing Generations`"), 
                labeller = labeller(param = label_value, `Selfing Generations` = label_both))
# Save plot
ggsave(filename = "simulations/read_data_missingness.jpg", height = 5, width = 7)

## Extra testing


```r


 # Function for exact test
    segregation_test <- function(observed, expected = "hom0-hom2", n, selfing.gen) {


      # Define the expected segregation ratios
      if (expected == "hom0-hom2") {
        het1 <- 0.5 ^ selfing.gen
        hom0 <- hom2 <- (1 - het1) / 2
      }
      # Hom = 0
      if (expected == "hom0-het") {
        het1 <- 0.5 ^ (selfing.gen + 1)
        hom0 <- 0.5 + ((0.5 - het1) / 2)
        hom2 <- 1 - het1 - hom0
      }
      # Hom = 2
      if (expected == "het-hom2") {
        het1 <- 0.5 ^ (selfing.gen + 1)
        hom2 <- 0.5 + ((0.5 - het1) / 2)
        hom0 <- 1 - het1 - hom0
      }
      if (expected == "het-het") {
        het1 <- 0.5 ^ (selfing.gen + 1)
        hom0 <- hom2 <- (1 - het1) / 2
      }

      exp_ratio <- c(`0` = hom0, `1` = het1, `2` = hom2) * n

      # Calculate chi-square stat
      Xsq <- sum( (observed - exp_ratio) ^ 2 )
      # return the p-value
      pchisq(Xsq, df = 2)
    }


    ## Another option for detecting parental states is to look at those markers where
    ## at least one of the founders is observed, then look at the progeny to determine 
    ## the other genotype
    list(founders.missing, finals.missing) %>%
      map(function(fou.chr, fin.chr) {

        # Combine
        chr <- rbind(fou.chr, fin.chr)
        # Apply a function over sites
        apply(X = chr, MARGIN = 2, FUN = function(snp) {
          # Is only one site missing in the founders?
          fou <- snp[1:2]

          if ( sum(is.na(fou)) == 1 ) {
            # Is the observed founder het or hom?
            obs.fou <- fou[!is.na(fou)]
            names(obs.fou) <- ifelse( fou[!is.na(fou)] %in% c(0,2), "hom", "het" )

            # Survey the progeny
            fin <- snp[-1:-2]

            # Are the progeny polymorphic?
            if (length( unique( na.omit(fin) ) ) > 1) {
              # Find the observed
              observed <- c(sum(fin == 0, na.rm = T), 
                            sum(fin == 1, na.rm = T), 
                            sum(fin == 2, na.rm = T))

              # If the observed founder is homozygote, first test for another hom
              if (names(obs.fou) == "hom") {
                pval <- segregation_test(observed, "hom-hom", length(fin), selfing.gen)

                # If the pvalus is greather than 0.05, break
                if (pval > alpha) {
                  fou[is.na(fou)] <- setdiff(c(0,2), obs.fou)
                  return(fou)

                } else {

                  pval <- ifelse(obs.fou == 0,
                         segregation_test(observed, "hom-het", length(fin), selfing.gen),
                         segregation_test(observed, "het-hom", length(fin), selfing.gen))

                  if (pval > 0.05) {
                    fou[is.na(fou)] <- 1
                    return(fou)
                  }}

              # If the observed value is not homozygous, test for either hom
              } else {

                pval0 <- segregation_test(observed, "hom-het", length(fin), selfing.gen)


              # Now test for a het





              # First test for another homozygote

neyhartj/fsimpute documentation built on May 23, 2019, 4:29 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

neyhartj/fsimpute
Imputing genotype data in bi-parental populations of finite selfing

In neyhartj/fsimpute: Imputing genotype data in bi-parental populations of finite selfing

Introduction

De novo Simulations

Plot

Using Real Data in the Simulation

Start the Simulation

Plot

R Package Documentation

Browse R Packages

We want your feedback!

neyhartj/fsimpute Imputing genotype data in bi-parental populations of finite selfing

In neyhartj/fsimpute: Imputing genotype data in bi-parental populations of finite selfing

Introduction

De novo Simulations

Plot

Using Real Data in the Simulation

Start the Simulation

Plot

R Package Documentation

Browse R Packages

We want your feedback!

neyhartj/fsimpute
Imputing genotype data in bi-parental populations of finite selfing