#' Preparation of input files for imputation
#'
#' This function is used as a nested function in phcp_impute()
#' @param haps Phased reference genomes in .haps format, which is the output of SHAPEIT2.
#' @param genetic_map genetic map for the reference genome with 3 columns: chromosome number, genetic distance in Morgans, physical position. The columns header should be: chr, genetic_distance, bp
#' @param ancient_tped pseudo-haplotype to impute in Plink tped format
#' @param chr the number of chromosome to impute
#' @return an input for the imputation
#' @seealso [PHCPImpute::phcp_impute] which is the imputation function that uses this function
#' @export
#' @examples
input = function(haps, genetic_map, ancient_tped, chr) {
colnames(haps)[1:5] = c("chr","id","bp","a0","a1")
colnames(ancient_tped) = c("chr","id","genetic_distance","bp","ph1","ph2")
all_genomes <- inner_join(haps, ancient_tped, by = c("chr", "bp")) %>%
filter( ph1 == a0 | ph1 == a1 | ph1 == 0) %>%
select(!c("id.x","id.y","genetic_distance","ph2")) %>%
mutate(ancient_binary = ifelse(ph1 == 0, NA, ifelse(ph1 == a0, 0, 1)), .after = 'a1') %>%
select(-ph1) %>%
inner_join(genetic_map,.) %>%
arrange(chr, bp)
return(all_genomes)
}
#' Choose most informative donors for imputation
#'
#' The donors are chosen based on K. Wasik et al, 2021
#' This function is used as a nested function in phcp_impute()
#' @param data the input file that was generated with [PHCPImpute::input]
#' @param n_donors the number of donors' haplotypes to choose
#' @param n_info the number of columns in the data file before the columns of the haplotypes
#' @param freq the maximal minor allele frequency in variants to consider when choosing the donors
#' @return the data file with the chosen haplotypes
#' @seealso [PHCPImpute::phcp_impute] which is the imputation function that uses this function
#' @seealso [PHCPImpute::input] that generates the data input for this function
#' @export
#' @examples
#' @references K. Wasik et al., Comparing low-pass sequencing and genotyping for trait mapping in pharmacogenetics. BMC genomics 22, 197 (2021).
filter_donors = function(data, n_donors, n_info, freq){
data = data %>% mutate(ratio = rowMeans(.[ ,(all_of(n_info) + 1):ncol(.)]), .after = all_of(n_info))
n_info = n_info + 1
fT1 = (data$ratio > freq) & (data == 0)
fT2 = (data$ratio < 1-freq) & (data == 1)
fT = fT1 + fT2
fM1 = (data$ratio > freq) & (data == 0) & (!is.na(data$ancient_binary) & data$ancient_binary== 0)
fM2 = (data$ratio < 1-freq) & (data == 1) & (!is.na(data$ancient_binary) & data$ancient_binary == 1)
fM = fM1 + fM2
fT = colSums(fT)
fM = colSums(fM)
ranks = fM/fT
ranks[1:n_info] = 1
ranks[is.nan(ranks)] = 0
top_donors = order(ranks, decreasing=TRUE)[1:(n_info+n_donors)]
data <- data[, top_donors]
output = list(data = data, donors = top_donors[-(1:n_info)]-n_info)
return(output)
}
#' Imputation for pseudo-haplotypes
#'
#' This function perform imputation of pseudo-haplotypes. The imputation is based on Lee and Li & Stephens hidden Markov model, as implemented in ChromoPainter, but with the hidden state representing a pair of haplotypes from the reference panel.
#' @param haps a data frame of Phased reference genomes in .haps format, which is the output of SHAPEIT2.
#' @param ancient_tped a data frame of pseudo-haplotype to impute in Plink tped format.
#' @param genetic_map a data frame of genetic map for the reference genome with 3 columns: chromosome number, genetic distance in Morgans, physical position. The columns header should be: chr, genetic_distance, bp
#' @param chr the number of chromosome to impute (integer).
#' @param choose_donors the number of haplotypes to choose from the reference data for the imputation. If FALSE, all haplotypes in the data will be used.
#' @param freq_donors the maximal minor allele frequency in variants to consider when choosing the donors
#' @param variants a data frame with variants to impute . If the arguments is missing, all variants of the reference panel will be be imputed.
#' @return A data frames with the posterior probabilities of the possible genotypes in the imputed variants.
#' @export
#' @example
#' @references N. Li, M. Stephens, Modeling linkage disequilibrium and identifying recombination hotspots using single-nucleotide polymorphism data. Genetics 165, 2213-2233 (2003)
#' @references D. J. Lawson, G. Hellenthal, S. Myers, D. Falush, Inference of population structure using dense haplotype data. PLoS Genet 8, e1002453 (2012)
phcp_impute = function(haps, ancient_tped, genetic_map, chr, choose_donors = F, freq_donors = 0.5, variants) {
colnames(haps)[1:5] = c("chr","id","bp","a0","a1")
all_genomes = input(haps, genetic_map, ancient_tped, chr)
n_info = all_genomes %>% select(!starts_with("V")) %>% ncol(.)
if(!choose_donors){
data = all_genomes
} else{
filtered_input = filter_donors(data = all_genomes, n_donors = choose_donors, n_info, freq_donors)
data = filtered_input[["data"]]
}
theta <- 0.00138488 # The mutation rate
N <- 64.5698 # The effective population size
reference_genomes <- as.matrix(data %>% select(starts_with("V")))
mode(reference_genomes) <- "logical"
J <- ncol(reference_genomes) #number of donors haplotypes
prior <- rep(1/J,J) #vector of a-priori copying probabilities from one haplotype
prior.prior <- outer(prior,prior)
ancient <- as.matrix(data$ancient_binary)
informative_snps <- !is.na(ancient)
#remove variants that are missing in the ancient genome from both donors data and ancient genome
df <- reference_genomes[informative_snps,]
ancient <- ancient[informative_snps,]
L <- nrow(df) #number of variants
## calculate the gentic distance between each pair of following variants
rho <- N*c(diff(data$genetic_distance[informative_snps]), Inf)
#forward algorithm (with scaling)
time = Sys.time()
Theta <- matrix(theta,J,J)
df.snp <- df[1,1:J]
ancient.equal.df <- (ancient[1] == df.snp)*(1-2*theta)
A.ne.B <- outer(df.snp, df.snp, function(x,y) x != y)
emission_l <- sweep(Theta, 1, ancient.equal.df, "+")
emission_l[A.ne.B] <- 0.5
alpha_hat <- emission_l*prior.prior
ct <- numeric(L) #scaling vector
ct[1] <- 1/sum(alpha_hat)
alpha_hat <- alpha_hat*ct[1]
prob_array <- array(dim = c(J,J,L))
prob_array[ , ,1] <- alpha_hat
for(l in 2:L) {
expr <- exp(-rho[l-1])
recomb2 <- (1-expr)^2*(1/(J^2))
recomb1 <- expr*(1-expr)*(1/J)*rowSums(alpha_hat)
recomb02 <- exp(-2*rho[l-1])*alpha_hat + recomb2
recomb021 <- sweep(recomb02, 1, recomb1, "+")
tot_rec <- sweep(recomb021, 2, recomb1, "+")
df.snp <- df[l,1:J]
ancient.equal.df <- (ancient[l] == df.snp)*(1-2*theta)
A.ne.B <- outer(df.snp, df.snp, function(x,y) x != y)
emission_l <- sweep(Theta, 1, ancient.equal.df, "+")
emission_l[A.ne.B] <- 0.5
alpha_hat <- emission_l * tot_rec
ct[l] <- 1/sum(alpha_hat)
alpha_hat <- ct[l]*alpha_hat
prob_array[ , ,l] <- alpha_hat
}
print("finish forward")
print(Sys.time() - time)
mutations_prob = array(dim = c(J,J,L))
beta_hat <- ct[L]*matrix(1, nrow = J, ncol = J)
genotype_prob = matrix(nrow = L, ncol = 3)
colnames(genotype_prob) = c("0/0","0/1","1/1")
states_probs = beta_hat * prob_array[ , ,L] / sum(beta_hat * prob_array[ , ,L])
alleles <- outer(df[L,1:J], df[L,1:J], "+")
genotype_prob[L, ] = c(sum(states_probs[alleles == 0]), sum(states_probs[alleles == 1]), sum(states_probs[alleles == 2]))
mutations_prob[ , ,L] = states_probs
for(l in (L-1):1) {
expr <- exp(-rho[l])
df.snp <- df[l+1,1:J]
ancient.equal.df <- (ancient[l+1] == df.snp)*(1-2*theta)
A.ne.B <- outer(df.snp, df.snp, function(x,y) x != y)
emission_l <- sweep(Theta, 1, ancient.equal.df, "+")
emission_l[A.ne.B] <- 0.5
b <- beta_hat*emission_l
recomb2 <- (1-expr)^2*(1/(J^2))*sum(b)
recomb1 <- expr*(1-expr)*(1/J)*rowSums(b)
recomb02 <- exp(-2*rho[l])*b + recomb2
recomb021 <- sweep(recomb02, 1, recomb1, "+")
beta_hat <- sweep(recomb021, 2, recomb1, "+")
beta_hat <- beta_hat*ct[l]
#####new lines####
states_probs = beta_hat * prob_array[ , ,l] / sum(beta_hat * prob_array[ , ,l])
alleles <- outer(df[l,1:J], df[l,1:J], "+")
genotype_prob[l, ] = c(sum(states_probs[alleles == 0]), sum(states_probs[alleles == 1]), sum(states_probs[alleles == 2]))
mutations_prob[ , ,l] = states_probs
}
print("finished backward")
mutations_info = haps %>%
inner_join(., genetic_map) %>%
arrange(bp)
if(!missing("variants")){
mutations_info = mutations_info %>%
inner_join(y = variants, by = c("chr", "bp")) %>%
filter((Ref == a0 & Alt == a1) | (Ref == a1 & Alt == a0)) %>%
arrange(bp)
}
mutations_genotype = mutations_info %>% select(starts_with("V"))
if (choose_donors){
mutations_genotype = mutations_genotype %>% select(filtered_input[["donors"]])
}
mutations_genotype = as.matrix(mutations_genotype)
mutations_output = matrix(nrow = nrow(mutations_info), ncol = 3)
colnames(mutations_output) = c("0/0","0/1","1/1")
for (i in 1:nrow(mutations_info)){
alleles <- outer(mutations_genotype[i, ], mutations_genotype[i, ], "+")
closest_snps = which.min(abs(mutations_info$genetic_distance[i] - data$genetic_distance[informative_snps]))
probs = mutations_prob[,,closest_snps]
mutations_output[i, ] = c(sum(probs[alleles == 0]), sum(probs[alleles == 1]), sum(probs[alleles == 2]))
}
mutations_output = data.frame(as.data.frame(mutations_info)[,c("chr","bp","a0","a1")], mutations_output)
print("finished imputation")
print(Sys.time() - start_time)
return(mutations_output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.