tests/testthat/test-autosomal.R

context("Autosomal")

ESTIMATION_TOL_DUE_TO_SAMPLING <- 0.01



set.seed(1)
sim_res_fixed <- sample_geneology(population_size = 1e3, 
                                  generations = 100, 
                                  generations_full = 3,
                                  generations_return = 3, # default value
                                  progress = FALSE)

peds <- build_pedigrees(sim_res_fixed$population, progress = FALSE)

# alleles         A    B    C
allele_prob <- c(0.7, 0.2, 0.1)
theta <- 0.1
L <- length(allele_prob)

x <- replicate(10000, sample_autosomal_genotype(allele_prob, theta))
#prop.table(table(apply(x, 2, paste0, collapse = ";")))

x_p_tab <- prop.table(table(c(x)))
x_p <- as.numeric(x_p_tab)

test_that("sample_autosomal_genotype works", {
  expect_equal(x_p, allele_prob, tol = ESTIMATION_TOL_DUE_TO_SAMPLING)
})


geno_probs_R_mat <- matrix(0, nrow = L, ncol = L)  
for (i in 1L:L) {
  for (j in i:L) {
    #P_AA = F*p_A + (1 - F) * p_A^2
    #X != A: P_AX = (1 - F) * 2 * p_A*p_X
    prob <- if (i == j) # homzyg:
      theta*allele_prob[i] + (1-theta)*allele_prob[i]^2
    else
      (1-theta)*2*allele_prob[i]*allele_prob[j]
    
    # only assign (i, j);
    # if (j, i) should also be assigned, the factor 2 above in
    # the else case must be removed
    geno_probs_R_mat[i, j] <- prob
  }
}
geno_probs_rcpp <- calc_autosomal_genotype_probs(allele_prob, theta)
geno_probs_R <- geno_probs_R_mat[upper.tri(geno_probs_R_mat, diag = TRUE)]
test_that("calc_autosomal_genotype_probs works", {
  expect_equal(geno_probs_R, geno_probs_rcpp)
})

pedigrees_all_populate_autosomal(peds, allele_prob, theta, mutation_rate = 0, FALSE)

geno_mat <- do.call(rbind, lapply(seq_len(pedigrees_count(peds)), function(i) {
  hs <- do.call(rbind, get_haplotypes_in_pedigree(peds[[i]]))
  hs
}))
geno_vec <- c(geno_mat)
geno_vec_p <- as.numeric(prop.table(table(geno_vec)))

test_that("pedigrees_all_populate_autosomal works", {
  expect_equal(geno_vec_p, allele_prob, tol = ESTIMATION_TOL_DUE_TO_SAMPLING)
})



get_ols_quantities <- function(y) {
  x_unique_geno <- y[!duplicated(y), ]
  ols_quantities <- apply(x_unique_geno, 1,
                          function(as) {
                            geno_prob <- mean(apply(y, 1, function(as2) all(as == as2)))
                            
                            if (length(unique(as)) == 1L) {
                              p <- x_p_tab[as.character(as[1])]
                              
                              x <- p - p^2
                              y <- geno_prob - p^2
                              return(c(x, y))
                            } else {
                              p <- x_p_tab[as.character(as[1])]
                              q <- x_p_tab[as.character(as[2])]
                              
                              x <- -2*p*q
                              y <- geno_prob - 2*p*q
                              return(c(x, y))
                            }
                          })
  ols_x <- as.matrix(ols_quantities[1L, ])
  ols_y <- ols_quantities[2L, ]
  return(list(x = ols_x, y = ols_y))
}

y <- t(x)
ols_res_genotypes <- get_ols_quantities(y)
theta_res <- estimate_autotheta_1subpop_genotypes(y)
test_that("estimate_autotheta_1subpop_genotypes works", {
  expect_true(theta_res$error == FALSE)
  expect_equal(qr.solve(ols_res_genotypes$x, ols_res_genotypes$y), 
               theta_res$estimate)
})

theta_res_info <- estimate_autotheta_1subpop_genotypes(y, return_estimation_info = TRUE)
test_that("estimate_autotheta_1subpop_genotypes return_estimation_info works", {
  expect_true(theta_res_info$error == FALSE)
  expect_equal(qr.solve(ols_res_genotypes$x, ols_res_genotypes$y), 
               theta_res_info$estimate)
  expect_equal(sort(ols_res_genotypes$x), sort(theta_res_info$estimation_info$X))
})

# bootstrap:
theta_boot <- replicate(100, {
  yboot <- y[sample(nrow(y), replace = TRUE), ]
  estimate_autotheta_1subpop_genotypes(yboot)$estimate
})
# Expanding a bit...
theta_boot_rng <- c(0.9, 1.1) * range(theta_boot)
#theta_boot_rng
test_that("estimate_autotheta_1subpop_genotypes boot contains true", {
  expect_true(all(theta_boot >= 0 & theta_boot <= 1))
  expect_true(theta >= theta_boot_rng[1L] & theta <= theta_boot_rng[2L])
})


p_cumdists <- calc_autosomal_genotype_conditional_cumdist(allele_dist = allele_prob, 
                                                          theta = theta)

cumdist_r <- geno_probs_R_mat
cumdist_r[lower.tri(cumdist_r)] <- cumdist_r[upper.tri(cumdist_r)] / 2
cumdist_r[upper.tri(cumdist_r)] <- cumdist_r[upper.tri(cumdist_r)] / 2
cumdist_r <- t(apply(cumdist_r / rowSums(cumdist_r), 1, cumsum))
#cumdist_r

test_that("calc_autosomal_genotype_conditional_cumdist works", {
  expect_equal(p_cumdists, cumdist_r)
})


livepop <- sim_res_fixed$individuals_generations
y_livepop <- get_haplotypes_individuals(livepop)
ols_res_livepop <- get_ols_quantities(y_livepop)

test_that("estimate_autotheta_1subpop_individuals works", {
  expect_equal(qr.solve(ols_res_livepop$x, ols_res_livepop$y),
               estimate_autotheta_1subpop_individuals(livepop)$estimate, 
               tol = 10*ESTIMATION_TOL_DUE_TO_SAMPLING
  )
})

test_that("livepop: haplotypes/individual same theta", {
  expect_equal(estimate_autotheta_1subpop_genotypes(y_livepop)$estimate,
               estimate_autotheta_1subpop_individuals(livepop)$estimate)
})


test_that("geneaology independent sample and population same theta", {
  expect_equal(estimate_autotheta_1subpop_genotypes(y)$estimate,
               estimate_autotheta_1subpop_genotypes(y_livepop)$estimate,
               tol = ESTIMATION_TOL_DUE_TO_SAMPLING)
  
  expect_equal(estimate_autotheta_1subpop_genotypes(y)$estimate,
               estimate_autotheta_1subpop_individuals(livepop)$estimate, 
               tol = ESTIMATION_TOL_DUE_TO_SAMPLING)
})


if (FALSE) {
  prop.table(table(c(x)))
  prop.table(table(c(y_livepop)))
  
  prop.table(table(apply(x, 2, paste0, collapse = ";")))
  prop.table(table(apply(y_livepop, 1, paste0, collapse = ";")))
  
  estimate_autotheta_1subpop_genotypes(y)
  qr.solve(ols_res_genotypes$x, ols_res_genotypes$y)
  
  estimate_autotheta_1subpop_genotypes(y_livepop)
  estimate_autotheta_1subpop_individuals(livepop)
  qr.solve(ols_res_livepop$x, ols_res_livepop$y)
}


#######################################################################



##########################################
# estimate_autotheta_subpops_individuals
##########################################
l_nonindv <- list(1:10, 1:20)
l_nonindv_sizes <- sapply(l_nonindv, length)
test_that("estimate_autotheta_subpops_individuals not list of list of individuals", {
  expect_error(estimate_autotheta_subpops_individuals(l_nonindv, l_nonindv_sizes))
})


set.seed(2)
grps <- sample(c(1, 2), length(livepop), replace = TRUE)

subpops <- split(livepop, grps)
subpops_sizes <- sapply(subpops, length)
res_indv <- estimate_autotheta_subpops_individuals(subpops, subpops_sizes)

subpops_haps <- lapply(split(seq_len(nrow(y_livepop)), grps), function(is) y_livepop[is, ])
subpops_haps_sizes <- sapply(subpops_haps, nrow)
res_geno <- estimate_autotheta_subpops_genotypes(subpops_haps, subpops_haps_sizes)

test_that("estimate_autotheta_subpops_individuals = estimate_autotheta_subpops_genotypes", {
  expect_equal(res_indv, res_geno)
})


# By pid:
pids_livepop <- sapply(livepop, get_pid)
subpops_pids <- split(pids_livepop, grps)
subpops_pids_sizes <- sapply(subpops_pids, length)
res_pids <- estimate_autotheta_subpops_pids(sim_res_fixed$population, subpops_pids, subpops_pids_sizes)

test_that("estimate_autotheta_subpops_pids", {
  expect_equal(res_indv, res_pids)
  expect_equal(res_geno, res_pids)
})

#####################################################################################
# theta
#####################################################################################

# Known answers, GDA2
two2cols <- function(x) {
  do.call(rbind, lapply(x, function(as) as.integer(strsplit(as.character(as), "")[[1]])))
}

# GDA2, Exercise 5.4, p. 200:
loc4_g <- c(rep(11, 24), rep(12, 16))
loc4_n <- c(rep(11, 44), rep(12, 5), rep(22, 1))
loc4 <- list(two2cols(loc4_g), two2cols(loc4_n))
loc4_ni <- sapply(loc4, nrow)

# GDA2, Exercise 5.4 solution, p. 401
# loc4_sol_F <- -0.0082
# loc4_sol_theta <- 0.0633
# loc4_sol_f <- -0.0764
# From GDA1.1 program, too
loc4_sol_F <- -0.008234
loc4_sol_theta <- 0.063292
loc4_sol_f <- -0.076359

loc4_allele1_sigmasqP <- 0.007324
loc4_allele1_sigmasqI <- -0.008277
loc4_allele1_sigmasqG <- 0.116667
loc4_allele2_sigmasqP <- loc4_allele1_sigmasqP
loc4_allele2_sigmasqI <- loc4_allele1_sigmasqI
loc4_allele2_sigmasqG <- loc4_allele1_sigmasqG

res_loc4 <- estimate_autotheta_subpops_genotypes(loc4, loc4_ni)

test_that("estimate_autotheta_subpops_genotypes GDA2, Exercise 5.4, locus 4", {
  expect_equal(res_loc4$F, loc4_sol_F, tol = 1e-5)
  expect_equal(res_loc4$theta, loc4_sol_theta, tol = 1e-6)
  expect_equal(res_loc4$f, loc4_sol_f, tol = 1e-5)
  
  expect_equal(res_loc4$extra$allele_values[["1"]]$sigmasq_P, 
               res_loc4$extra$allele_values[["2"]]$sigmasq_P)
  expect_equal(res_loc4$extra$allele_values[["1"]]$sigmasq_I, 
               res_loc4$extra$allele_values[["2"]]$sigmasq_I)
  expect_equal(res_loc4$extra$allele_values[["1"]]$sigmasq_G, 
               res_loc4$extra$allele_values[["2"]]$sigmasq_G)
  
  expect_equal(res_loc4$extra$allele_values[["1"]]$sigmasq_P, 
               res_loc4$extra$allele_values[["2"]]$sigmasq_P)
  
  expect_equal(loc4_allele1_sigmasqP, 
               res_loc4$extra$allele_values[["1"]]$sigmasq_P, tol = 1e-6)
  expect_equal(loc4_allele1_sigmasqI, 
               res_loc4$extra$allele_values[["1"]]$sigmasq_I, tol = 1e-6)
  expect_equal(loc4_allele1_sigmasqG, 
               res_loc4$extra$allele_values[["1"]]$sigmasq_G, tol = 1e-6)

  # Allele wise theta/F should be the same, GDA2, p. 178 mid.
  # theta
  for (allele_info in res_loc4$extra$allele_values) {
    # allele_info <- res_loc4$extra$allele_values[["1"]]
    expect_equal(with(allele_info, sigmasq_P / (sigmasq_P + sigmasq_I + sigmasq_G)),
                 with(allele_info, S1  / S2), tol = 1e-6)
  }
  
  # F
  for (allele_info in res_loc4$extra$allele_values) {
    # allele_info <- res_loc4$extra$allele_values[["1"]]
    expect_equal(with(allele_info, (sigmasq_P + sigmasq_I) / (sigmasq_P + sigmasq_I + sigmasq_G)),
                 with(allele_info, 1 - (S3  / S2)), tol = 1e-5)
  }
})

# Same as above, now locus 6
loc6_g <- c(rep(11, 9), rep(12, 1), rep(22, 5), rep(23, 7), rep(24, 8), rep(34, 10))
loc6_n <- c(rep(11, 22), rep(12, 14), rep(13, 2), rep(14, 6), rep(23, 1), rep(24, 3), rep(33, 1), rep(34, 1))

test_that("loc4/loc6 length", {
  expect_equal(length(loc4_g), length(loc6_g))
  expect_equal(length(loc4_n), length(loc6_n))
})

loc6 <- list(two2cols(loc6_g), two2cols(loc6_n))
loc6_ni <- sapply(loc6, nrow)
res_loc6 <- estimate_autotheta_subpops_genotypes(loc6, loc6_ni)

# loc6_sol_F <- 0.2009
# loc6_sol_theta <- 0.1517
# loc6_sol_f <- 0.0581
# From GDA1.1 program, too
loc6_sol_F <- 0.200938
loc6_sol_theta <- 0.151652
loc6_sol_f <- 0.058097

loc6_allele1_sigmasqP <- 0.086002
loc6_allele1_sigmasqI <- 0.080586
loc6_allele1_sigmasqG <- 0.127778

loc6_allele2_sigmasqP <- 0.008555
loc6_allele2_sigmasqI <- -0.007456
loc6_allele2_sigmasqG <- 0.188889

loc6_allele3_sigmasqP <- 0.010538
loc6_allele3_sigmasqI <- -0.009882
loc6_allele3_sigmasqG <- 0.116667

loc6_allele4_sigmasqP <- 0.006668
loc6_allele4_sigmasqI <- -0.026926
loc6_allele4_sigmasqG <- 0.155556


test_that("estimate_autotheta_subpops_genotypes GDA2, Exercise 5.4, locus 6", {
  expect_equal(res_loc6$F, loc6_sol_F, tol = 1e-5) 
  expect_equal(res_loc6$theta, loc6_sol_theta, tol = 1e-6) 
  expect_equal(res_loc6$f, loc6_sol_f, tol = 1e-5) 
  
  # Allele 1
  expect_equal(loc6_allele1_sigmasqP, 
               res_loc6$extra$allele_values[["1"]]$sigmasq_P, tol = 1e-6)
  expect_equal(loc6_allele1_sigmasqI, 
               res_loc6$extra$allele_values[["1"]]$sigmasq_I, tol = 1e-6)
  expect_equal(loc6_allele1_sigmasqG, 
               res_loc6$extra$allele_values[["1"]]$sigmasq_G, tol = 1e-6)
  
  # Allele 2
  expect_equal(loc6_allele2_sigmasqP, 
               res_loc6$extra$allele_values[["2"]]$sigmasq_P, tol = 1e-6)
  expect_equal(loc6_allele2_sigmasqI, 
               res_loc6$extra$allele_values[["2"]]$sigmasq_I, tol = 1e-6)
  expect_equal(loc6_allele2_sigmasqG, 
               res_loc6$extra$allele_values[["2"]]$sigmasq_G, tol = 1e-6)
  
  # Allele 3
  expect_equal(loc6_allele3_sigmasqP, 
               res_loc6$extra$allele_values[["3"]]$sigmasq_P, tol = 1e-6)
  expect_equal(loc6_allele3_sigmasqI, 
               res_loc6$extra$allele_values[["3"]]$sigmasq_I, tol = 1e-6)
  expect_equal(loc6_allele3_sigmasqG, 
               res_loc6$extra$allele_values[["3"]]$sigmasq_G, tol = 1e-6)
  
  # Allele 4
  expect_equal(loc6_allele4_sigmasqP, 
               res_loc6$extra$allele_values[["4"]]$sigmasq_P, tol = 1e-6)
  expect_equal(loc6_allele4_sigmasqI, 
               res_loc6$extra$allele_values[["4"]]$sigmasq_I, tol = 1e-6)
  expect_equal(loc6_allele4_sigmasqG, 
               res_loc6$extra$allele_values[["4"]]$sigmasq_G, tol = 1e-6)
  
  # Allele wise theta/F should be the same, GDA2, p. 178 mid.
  # theta
  for (allele_info in res_loc6$extra$allele_values) {
    # allele_info <- res_loc4$extra$allele_values[["1"]]
    expect_equal(with(allele_info, sigmasq_P / (sigmasq_P + sigmasq_I + sigmasq_G)),
                 with(allele_info, S1  / S2), tol = 1e-6)
  }
  
  # F
  for (allele_info in res_loc6$extra$allele_values) {
    # allele_info <- res_loc4$extra$allele_values[["1"]]
    expect_equal(with(allele_info, (sigmasq_P + sigmasq_I) / (sigmasq_P + sigmasq_I + sigmasq_G)),
                 with(allele_info, 1 - (S3  / S2)), tol = 1e-5)
  }
})


#########################################################################




#####################################################################################
# GDA program
# https://phylogeny.uconn.edu/software/
# v. 1.1
# diploid.nex output
#####################################################################################

if (FALSE) {
  diploid_nex_str <- "Pop_1:
  _1_ 4/4 4/3 4/3 3/3 4/4
  _2_ 4/4 4/4 4/3 3/3 4/4
  _3_ 4/4 4/4 4/3 4/3 4/4
  _4_ 4/4 4/4 ?/? 3/3 4/4
  _5_ 4/4 4/4 2/4 3/4 4/4
  _6_ 4/4 4/4 ?/? 4/3 4/4 
  _7_ 4/4 4/4 4/3 4/3 4/4 
  _8_ 4/4 4/4 ?/? 4/3 4/4,
  Pop_2:
  _1_ 4/4 4/4 3/3 3/2 4/4 
  _2_ 4/4 3/3 4/4 4/3 4/4 
  _3_ 4/4 4/3 4/4 4/3 4/4
  _4_ 4/4 4/4 3/3 3/3 4/4 
  _5_ 4/4 4/3 4/4 4/4 4/4 
  _6_ 4/4 4/4 4/4 2/2 4/4 
  _7_ 4/4 4/4 4/3 4/3 4/4 
  _8_ 4/4 4/4 4/4 4/4 4/4,
  Pop_3:
  _1_ 4/4 4/4 4/4 4/3 4/4 
  _2_ 4/4 4/4 4/4 4/4 4/4 
  _3_ 4/4 4/4 4/3 2/1 4/4 
  _4_ 4/4 4/4 3/3 4/3 4/4 
  _5_ 4/4 4/4 4/3 2/1 4/4,
  Pop_4:
  _1_ 4/4 4/4 4/3 4/4 4/4 
  _2_ 4/4 4/4 4/3 4/3 4/4
  _3_ 4/4 4/4 4/3 4/3 4/4 
  _4_ 4/4 4/4 4/3 4/4 4/4 
  _5_ 4/4 4/4 4/3 4/4 4/4 
  _6_ 4/4 4/4 4/4 3/3 4/4 
  _7_ 4/4 4/4 4/4 4/4 4/4,
  Pop_5:
  _1_ 4/4 4/4 4/4 2/1 4/4 
  _2_ 4/4 4/4 4/4 3/3 4/4 
  _3_ 4/4 4/4 4/3 4/3 4/4 
  _4_ 4/4 4/4 4/3 4/3 4/4
  _5_ 4/4 4/4 4/4 4/4 4/4 
  _6_ 4/4 4/4 4/4 4/3 4/4 
  _7_ 4/4 4/4 4/3 4/3 4/4 
  _8_ 4/4 4/4 4/4 ?/? 4/4 
  _9_ 4/4 4/3 4/4 4/3 4/4,
  Pop_6:
  _1_ 4/4 4/4 4/4 4/3 4/4 
  _2_ 4/4 4/4 4/3 3/3 4/4 
  _3_ 4/4 4/4 4/4 3/2 4/4 
  _4_ 4/4 4/4 4/3 4/1 4/4 
  _5_ 4/4 4/4 4/4 4/4 4/4
  _6_ 4/4 4/4 4/4 4/2 4/4
  _7_ 4/4 4/4 4/4 4/3 4/4"
  
  diploid_nex_con <- textConnection(diploid_nex_str)
  lns <- readLines(diploid_nex_con)
  is <- c(which(grepl("Pop", lns)), length(lns)+1)
  grp <- unlist(lapply(1L:(length(is)-1), function(i) rep(i, is[i+1]-is[i])))
  length(lns)
  length(grp)
  
  pops <- split(lns, grp)
  prof <- lapply(pops, function(x) {
    x <- x[-1]
    x <- gsub("^[ ]*_[1-9]+_[ ]*(.*)$", "\\1", x)
    x <- gsub(",", "", x, fixed = TRUE)
    x <- strsplit(x, " ")
    #x <- lapply(x, function(y) do.call(rbind, strsplit(y, "/")))
    x <- lapply(x, function(y) strsplit(y, "/"))
    #x <- do.call(rbind, lapply(x, function(y) strsplit(y, "/")))
    x
  })
  prof
  loci <- 2:4
  prof_loc <- lapply(loci, function(i) {
    lapply(prof, function(x) {
      d <- do.call(rbind, lapply(x, function(y) y[[i]]))
      d[d == "?"] <- NA
      d <- na.omit(d)
      d <- apply(d, 2, as.integer)
      d
    })
  })
  names(prof_loc) <- loci
  prof_loc
  diploid_nex_data <- prof_loc
  dput(prof_loc)
  
}

diploid_nex_data <- structure(list(`2` = structure(list(`1` = structure(c(4L, 4L, 
                                                                          4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Dim = c(8L, 
                                                                                                                                            2L)), 
                                                        `2` = structure(c(4L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 
                                                                          3L, 4L, 3L, 4L, 4L, 4L), .Dim = c(8L, 2L)), 
                                                        `3` = structure(c(4L, 
                                                                          4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Dim = c(5L, 2L)), 
                                                        `4` = structure(c(4L, 
                                                                          4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Dim = c(7L, 
                                                                                                                                        2L)), 
                                                        `5` = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
                                                                          4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L), .Dim = c(9L, 2L)), 
                                                        `6` = structure(c(4L, 
                                                                          4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Dim = c(7L, 
                                                                                                                                        2L))), .Names = c("1", "2", "3", "4", "5", "6")), 
                                   `3` = structure(list(
                                     `1` = structure(c(4L, 4L, 4L, 2L, 4L, 3L, 3L, 3L, 4L, 3L), .Dim = c(5L, 
                                                                                                         2L)), 
                                     `2` = structure(c(3L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 3L, 
                                                       4L, 4L, 3L, 4L, 4L, 3L, 4L), .Dim = c(8L, 2L)), 
                                     `3` = structure(c(4L, 
                                                       4L, 4L, 3L, 4L, 4L, 4L, 3L, 3L, 3L), .Dim = c(5L, 2L)), 
                                     `4` = structure(c(4L, 
                                                       4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 4L, 4L), .Dim = c(7L, 
                                                                                                                     2L)), 
                                     `5` = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
                                                       4L, 4L, 3L, 3L, 4L, 4L, 3L, 4L, 4L), .Dim = c(9L, 2L)), 
                                     `6` = structure(c(4L, 
                                                       4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 3L, 4L, 4L, 4L), .Dim = c(7L, 
                                                                                                                     2L))), .Names = c("1", "2", "3", "4", "5", "6")), 
                                   `4` = structure(list(
                                     `1` = structure(c(3L, 3L, 4L, 3L, 3L, 4L, 4L, 4L, 3L, 3L, 
                                                       3L, 3L, 4L, 3L, 3L, 3L), .Dim = c(8L, 2L)), 
                                     `2` = structure(c(3L, 
                                                       4L, 4L, 3L, 4L, 2L, 4L, 4L, 2L, 3L, 3L, 3L, 4L, 2L, 3L, 4L
                                     ), .Dim = c(8L, 2L)), 
                                     `3` = structure(c(4L, 4L, 2L, 4L, 2L, 
                                                       3L, 4L, 1L, 3L, 1L), .Dim = c(5L, 2L)), 
                                     `4` = structure(c(4L, 
                                                       4L, 4L, 4L, 4L, 3L, 4L, 4L, 3L, 3L, 4L, 4L, 3L, 4L), .Dim = c(7L, 
                                                                                                                     2L)), 
                                     `5` = structure(c(2L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 
                                                       3L, 3L, 3L, 4L, 3L, 3L, 3L), .Dim = c(8L, 2L)), 
                                     `6` = structure(c(4L, 
                                                       3L, 3L, 4L, 4L, 4L, 4L, 3L, 3L, 2L, 1L, 4L, 2L, 3L), .Dim = c(7L, 
                                                                                                                     2L))), .Names = c("1", "2", "3", "4", "5", "6"))), .Names = c("2", 
                                                                                                                                                                                   "3", "4"))


# Excerpt:
#        Locus      Allele           f           F     Theta-P
#   ----------  ----------  ----------  ----------  ----------
#      locus-2         All    0.250514    0.302529    0.069401
#      locus-3         All   -0.034807   -0.030052    0.004595
#      locus-4         All    0.012751    0.036738    0.024297

#       Locus      Allele     Sigma-G     Sigma-I     Sigma-P
#   ----------  ----------  ----------  ----------  ----------
#      locus-2         All    0.090909    0.030386    0.009046
#                        3    0.045455    0.015193    0.004523
#                        4    0.045455    0.015193    0.004523
#                                                             
#      locus-3         All    0.439024   -0.014767    0.001958
#                        2    0.012195   -0.000383    0.000453
#                        3    0.207317   -0.001286   -0.002083
#                        4    0.219512   -0.013098    0.003588
#                                                             
#      locus-4         All    0.604651    0.007810    0.015252
#                        1    0.046512   -0.003444    0.002099
#                        2    0.069767    0.014971    0.000953
#                        4    0.244186    0.006261    0.000451
#                        3    0.244186   -0.009979    0.011749

gda_sigma_lns <- readLines(textConnection("Locus      Allele     Sigma-G     Sigma-I     Sigma-P
     locus-2         All    0.090909    0.030386    0.009046
                       3    0.045455    0.015193    0.004523
                       4    0.045455    0.015193    0.004523
                                                            
     locus-3         All    0.439024   -0.014767    0.001958
                       2    0.012195   -0.000383    0.000453
                       3    0.207317   -0.001286   -0.002083
                       4    0.219512   -0.013098    0.003588
                                                            
     locus-4         All    0.604651    0.007810    0.015252
                       1    0.046512   -0.003444    0.002099
                       2    0.069767    0.014971    0.000953
                       4    0.244186    0.006261    0.000451
                       3    0.244186   -0.009979    0.011749"))
gda_sigma_lns <- gda_sigma_lns[!grepl("locus", gda_sigma_lns)]
gda_sigma_lns <- gsub("[ ]+", " ", gda_sigma_lns)
gda_sigma_lns <- gsub("^ ", "", gda_sigma_lns)
gda_sigma_lns <- gda_sigma_lns[gda_sigma_lns != " "]
gda_sigma_lns <- gda_sigma_lns[gda_sigma_lns != ""]
gda_sigma_lns <- gda_sigma_lns[-1]
gda_sigma_d <- read.table(text = gda_sigma_lns, stringsAsFactors = FALSE, 
                             sep = " ", row.names = NULL,
                             col.names = c("Allele", "sigmasq_G", "sigmasq_I", "sigmasq_P"))
gda_sigma <- list(
  "2" = gda_sigma_d[1:2, ],
  "3" = gda_sigma_d[3:5, ],
  "4" = gda_sigma_d[6:9, ]
)

gda_thetaFf <- list(
  "2" = list(f =  0.250514, F =  0.302529, theta = 0.069401),
  "3" = list(f = -0.034807, F = -0.030052, theta = 0.004595),
  "4" = list(f =  0.012751, F =  0.036738, theta = 0.024297)
)

test_that("GDA 1.1 diploid.nex ", {
  for (loc in seq_along(diploid_nex_data)) {
    #loc <- 1
    expect_equal(names(gda_thetaFf)[loc], names(diploid_nex_data)[loc])
    
    gda_loc <- gda_thetaFf[[loc]]
    
    loc_subpops <- diploid_nex_data[[loc]]
    loc_subpops_ni <- sapply(loc_subpops, nrow)
    res_loc <- estimate_autotheta_subpops_genotypes(loc_subpops, loc_subpops_ni)
    
    expect_equal(res_loc$theta, gda_loc$theta, tol = 1e-5)
    expect_equal(res_loc$F, gda_loc$F, tol = 1e-5)
    expect_equal(res_loc$f, gda_loc$f, tol = 1e-5)
    
    # Sigma
    sigma_tmp <- gda_sigma[[ names(gda_thetaFf)[loc] ]]
    
    for (a_i in seq_along(sigma_tmp$Allele)) {
      #a_i <- 1
      a <- as.character(sigma_tmp$Allele[a_i])
      
      expect_equal(sigma_tmp$sigmasq_P[a_i], 
                   res_loc$extra$allele_values[[a]]$sigmasq_P, tol = 1e-6)
      expect_equal(sigma_tmp$sigmasq_I[a_i], 
                   res_loc$extra$allele_values[[a]]$sigmasq_I, tol = 1e-6)
      expect_equal(sigma_tmp$sigmasq_G[a_i], 
                   res_loc$extra$allele_values[[a]]$sigmasq_G, tol = 1e-6)
      
    }
    
    # Allele wise theta/F should be the same, GDA2, p. 178 mid.
    # theta
    for (allele_info in res_loc$extra$allele_values) {
      # allele_info <- res_loc4$extra$allele_values[["1"]]
      expect_equal(with(allele_info, sigmasq_P / (sigmasq_P + sigmasq_I + sigmasq_G)),
                   with(allele_info, S1  / S2), tol = 1e-6)
    }
    
    # F
    for (allele_info in res_loc$extra$allele_values) {
      # allele_info <- res_loc4$extra$allele_values[["1"]]
      expect_equal(with(allele_info, (sigmasq_P + sigmasq_I) / (sigmasq_P + sigmasq_I + sigmasq_G)),
                   with(allele_info, 1 - (S3  / S2)), tol = 1e-5)
    }
  }
})

######################################
# Unweighted
######################################



set.seed(2000)
grps <- sample(c(1, 2), length(livepop), replace = TRUE)
subpops <- split(livepop, grps)
subpops_haps <- lapply(split(seq_len(nrow(y_livepop)), grps), function(is) y_livepop[is, ])
pids_livepop <- sapply(livepop, get_pid)
subpops_pids <- split(pids_livepop, grps)

test_that("get_allele_counts_pids/get_allele_counts_genotypes", {
  expect_equal(get_allele_counts_pids(sim_res_fixed$population, subpops_pids)[, c("0", "1", "2")],
               get_allele_counts_genotypes(subpops_haps)[, c("0", "1", "2")])
})

test_that("Theta unweighted", {
  expect_equal(estimate_autotheta_subpops_unweighted_pids(sim_res_fixed$population, subpops_pids, assume_HWE = TRUE),
               estimate_autotheta_subpops_unweighted_genotypes(subpops_haps, assume_HWE = TRUE))
  # Not yet implemented
  # expect_equal(estimate_autotheta_subpops_unweighted_pids(sim_res_fixed$population, subpops_pids, assume_HWE = FALSE),
  #              estimate_autotheta_subpops_unweighted_genotypes(subpops_haps, assume_HWE = FALSE))
})



##########################

if (requireNamespace("dirmult", quietly = TRUE)) {
  library(dirmult)
  
  dirmult_weir1 <- dirmult::weirMoM(get_allele_counts_pids(sim_res_fixed$population, subpops_pids))
  dirmult_weir2 <- dirmult::weirMoM(get_allele_counts_genotypes(subpops_haps))
  
  test_that("Theta unweighted with dirmult::weirMoM", {
    expect_equal(dirmult_weir1, dirmult_weir2)
    
    expect_equal(estimate_autotheta_subpops_unweighted_pids(sim_res_fixed$population, subpops_pids, assume_HWE = TRUE),
                 estimate_autotheta_subpops_unweighted_genotypes(subpops_haps, assume_HWE = TRUE))
  })
  
  ralleleprob <- function(n, ps, theta) {
    #n <- 2; ps <- ALLELE_PROB; theta <- 0.1
    #n <- 2; ps <- ALLELE_PROB; theta <- 0.001
    #
    stopifnot(length(ps) > 1)
    
    stopifnot(length(theta) == 1L)
    stopifnot(theta >= 0)
    stopifnot(theta <= 1)
    
    # FIXME: theta == 0 problematic...
    alpha <- (1-theta)*ps/theta
    
    x <- rdirichlet(n, alpha)
    return(x)
  }
  
  set.seed(100)
  for (theta in c(0.01, 0.02)) {
    #theta <- 0.02
    subpop_ps <- ralleleprob(10, ps = allele_prob, theta = theta)
    
    subpop_geno <- lapply(seq_len(nrow(subpop_ps)), function(i) {
      geno <- t(replicate(1000, sample_autosomal_genotype(subpop_ps[i, ], 0))) # 0 for HWE
      return(geno)
    })
    
    # Missing alleles a technical issue, but not here because all are observed in all subpops
    allele_counts <- do.call(rbind, lapply(subpop_geno, function(x) table(x)))
    
    #get_allele_counts_genotypes()
    
    if (FALSE) {
      theta
      fit <- dirmult(allele_counts, trace = FALSE)
      fit$theta
      dirmult::weirMoM(allele_counts)
      estimate_autotheta_subpops_genotypes(subpop_geno, sapply(subpop_geno, function(x) 1000))$theta
      estimate_autotheta_subpops_unweighted_genotypes(subpop_geno, assume_HWE = TRUE)
    }
    
    test_that(paste0("Theta unweighted with dirmult: theta = ", theta), {
      expect_equal(estimate_autotheta_subpops_unweighted_genotypes(subpop_geno, assume_HWE = TRUE), 
                   dirmult::weirMoM(allele_counts))
    })
    
    test_that(paste0("get_allele_counts_genotypes with dirmult: theta = ", theta), {
      expect_equal(allele_counts,
                   get_allele_counts_genotypes(subpop_geno)[, colnames(allele_counts)])
    })
  }
}

####################################
# Weighted vs unweighted
####################################

if (requireNamespace("dirmult", quietly = TRUE)) {
  library(dirmult)
  
  ralleleprob <- function(n, ps, theta) {
    #n <- 2; ps <- ALLELE_PROB; theta <- 0.1
    #n <- 2; ps <- ALLELE_PROB; theta <- 0.001
    #
    stopifnot(length(ps) > 1)
    
    stopifnot(length(theta) == 1L)
    stopifnot(theta >= 0)
    stopifnot(theta <= 1)
    
    # FIXME: theta == 0 problematic...
    alpha <- (1-theta)*ps/theta
    
    x <- rdirichlet(n, alpha)
    return(x)
  }
  
  set.seed(100)
  for (theta in c(0.01, 0.02)) {
    #theta <- 0.02
    subpop_ps <- ralleleprob(10, ps = allele_prob, theta = theta)
    
    subpop_geno_equal <- lapply(seq_len(nrow(subpop_ps)), function(i) {
      geno <- t(replicate(1000, sample_autosomal_genotype(subpop_ps[i, ], 0))) # 0 for HWE
      return(geno)
    })
    
    test_that(paste0("Theta unweighted vs weighted for equal sizes: theta = ", theta), {
      expect_equal(
        estimate_autotheta_subpops_unweighted_genotypes(
          subpops = subpop_geno_equal, 
          assume_HWE = TRUE), 
        estimate_autotheta_subpops_genotypes(
          subpops = subpop_geno_equal, 
          subpops_sizes = sapply(subpop_geno_equal, nrow))$theta, 
        tol = 1e-4
        )
    })

    subpop_geno_unequal <- lapply(seq_len(nrow(subpop_ps)), function(i) {
      geno <- t(replicate(i*100, sample_autosomal_genotype(subpop_ps[i, ], 0))) # 0 for HWE
      return(geno)
    })
    
    theta_unw <- estimate_autotheta_subpops_unweighted_genotypes(
      subpops = subpop_geno_unequal, 
      assume_HWE = TRUE)
    theta_w <- estimate_autotheta_subpops_genotypes(
      subpops = subpop_geno_unequal, 
      subpops_sizes = sapply(subpop_geno_unequal, nrow))$theta
    
    test_that(paste0("Theta unweighted vs weighted for equal sizes: theta = ", theta), {
      expect_true(!isTRUE(all.equal(theta_unw, theta_w)))
    })
  }
}
########################################


############ Others

test_that("hash_colisions", {
  expect_true(all(unique(as.integer(hash_colisions(20))) == 1L))
})


############################################

# Infinite alleles


set.seed(1)
sim_res_fixed <- sample_geneology(population_size = 1e3, 
                                  generations = 100, 
                                  generations_full = 3,
                                  generations_return = 3, # default value
                                  progress = FALSE)
pop <- sim_res_fixed$population
livepop <- sim_res_fixed$individuals_generations
#peds <- build_pedigrees(sim_res_fixed$population, progress = FALSE)

get_generation(get_individual(pop, get_pid(livepop[[1]])))

set.seed(1)
population_populate_autosomal_infinite_alleles(pop, mutation_rate = 0, FALSE)

y_livepop <- get_haplotypes_individuals(livepop)

test_that("infinite alleles -- no mutations", {
  expect_true(all.equal(unique(c(y_livepop)), 0L))
})

set.seed(1)
population_populate_autosomal_infinite_alleles(pop, mutation_rate = 1, FALSE)

y_livepop <- get_haplotypes_individuals(livepop)

test_that("infinite alleles -- always mutations", {
  expect_equal(length(unique(c(y_livepop))), 2L*length(livepop))
})


set.seed(1)
population_populate_autosomal_infinite_alleles(pop, mutation_rate = 0.003, FALSE)

y_livepop <- get_haplotypes_individuals(livepop)
c(y_livepop) %>% table() %>% sort(decreasing = TRUE) %>% head()

test_that("infinite alleles -- sometimes mutations", {
  expect_true(length(unique(c(y_livepop))) > 1)
  expect_true(length(unique(c(y_livepop))) < 2L*length(livepop))
})

Try the malan package in your browser

Any scripts or data that you put into this service are public.

malan documentation built on July 4, 2024, 9:09 a.m.