Nothing
# Translates a genotype vec into a format that haplo.stats understands:
# 0 -> 1, 1
# 1 -> 1, 2
# 2 -> 2, 2
haplo.g = function(g) {
a1 = (g > 1) + 1
a2 = (g > 0) + 1
data.frame(a1, a2)
}
running.bool = function(true_id, len) {
stopifnot(all(true_id >= 1 & true_id <= len))
res = rep(FALSE, len)
res[true_id] = TRUE
res
}
# given a vector of maxid and a vector of lengths,
# return a bool vector
# for example:
# maxid = c(1 2 1 3)
# l = c(2 3 4 5), then
# ret = c(T, F; # 2 bools, 1st chosen
# F, T, F; # 3 bools, 2nd chosen
# T, F, F, F; # 4 bools, 1st chosen
# F, F, T, F, F) # 5 bools, 3rd chosen
running.bools = function(maxid, len) {
stopifnot(length(maxid) == length(len))
res = lapply(1:length(len), function(i) running.bool(maxid[i], len[i]))
do.call(c, res)
}
haploFormat = function(geno) {
haplo = data.frame(matrix(NA, nrow(geno), 0))
geno_ncol = ncol(geno)
for(i in 1:geno_ncol) {
# each SNP gets two alleles
haplo = cbind(haplo, haplo.g(geno[, i]))
}
old_cnames = rep(colnames(geno), each = 2)
cnames = paste0(old_cnames, ".a", rep(1:2, geno_ncol))
colnames(haplo) = cnames
haplo
}
#' Infer haplotypes for a pair of SNPs
#'
#' @param geno Genotype data frame. Must have 4 columns, the first two being "FID" and "IID", the last two being the genotypes.
#' @param format_idx Column indices used for formatting haplotype string.
#' @return A data frame of haplotypes
#' @importFrom haplo.stats haplo.em
#' @importFrom dplyr group_by summarise id
#'
#' @author kaiyin
getHaplo = function(geno, format_idx = NULL) {
stopifnot(all(c("FID", "IID") %in% colnames(geno)))
geno = geno[complete.cases(geno), ]
nsnp = ncol(geno)
labels = colnames(geno)[3:nsnp]
geno$id = 1:nrow(geno)
save.em = haplo.em(haploFormat(geno[, 3:nsnp]), labels)
hapdat = data.frame(id = save.em$subj.id, h1 = save.em$hap1code, h2 = save.em$hap2code, p = save.em$post)
id_groups = group_by(hapdat, id)
idx = summarise(id_groups, maxid = which.max(p))
len = summarise(id_groups, len = length(p))
select = running.bools(idx$maxid, len$len)
hapdat = hapdat[select, ]
haptab = save.em$haplotype
haptab$h = 1:nrow(haptab)
hapdat = dplyr::left_join(hapdat, haptab, by = c("h1" = "h"))
# geno.hapdat = renameLoci(geno.hapdat, 1)
hapdat = dplyr::left_join(hapdat, haptab, by = c("h2" = "h"))
h = haploString(hapdat, snp_idx = 5:ncol(hapdat), format_idx)
dplyr::left_join(geno[, c("FID", "IID", "id")], h[, c("id", "diplo")], by = "id")
}
haploString = function(hapdat, snp_idx = NULL, format_idx = NULL, generic_diplo_name = TRUE) {
if(is.null(snp_idx)) {
snp_idx = 1:nc
}
hapdat1 = hapdat[, snp_idx]
nc = ncol(hapdat1)
stopifnot(nc %% 2 == 0)
half = nc / 2
stopifnot(length(format_idx) <= half)
stopifnot(all(format_idx > 0))
format1 = function(x, idx1, idx2) {
s1 = paste(x[idx1], collapse = "-")
s2 = paste(x[idx2], collapse = "-")
s1 = paste0("-", s1, "-")
s2 = paste0("-", s2, "-")
paste0(s1, "+", s2)
}
cnames = gsub(x = colnames(hapdat1), pattern = "\\.[xy]$", replacement = "")
newcolname = function(idx) {
paste(cnames[idx], collapse = "_")
}
if(is.null(format_idx)) {
idx1 = 1:half
idx2 = idx1 + half
f = function(x) {
format1(x, idx1, idx2)
}
if(generic_diplo_name) {
hapdat[, "diplo"] = apply(hapdat1, 1, f)
} else {
hapdat[, newcolname(idx2)] = apply(hapdat1, 1, f)
}
hapdat
} else {
stopifnot(max(format_idx) <= half)
idx2 = format_idx + half
f = function(x) {
format1(x, format_idx, idx2)
}
if(generic_diplo_name) {
hapdat[, "diplo"] = apply(hapdat1, 1, f)
} else {
hapdat[, newcolname(idx2)] = apply(hapdat1, 1, f)
}
hapdat
}
}
#' Inferring haplotypes from two genotype data frames, and join with phenotypes
#' @param g1 First genotype data frame
#' @param g2 Second genotype data frame, must be of the same dimension as the first. The first two column must be FID and IID.
#' @param phe Phenotype data frame, the first two columns must be FID and IID
#' @param pool A genotype data frame, assumed to be different from g1 and g2, used for pooling.
#' @return A data frame containing phenotype and haplotype for each individual.
#'
#' @author kaiyin
#' @export
getHaplos = function(g1, g2, phe, pool = NULL) {
stopifnot(all(dim(g1) == dim(g2)))
stopifnot(all(g1$FID == g2$FID))
stopifnot(all(g1$IID == g2$IID))
nc = ncol(g1)
stopifnot(nc > 2)
stopifnot(ncol(phe) > 2)
stopifnot(all(c("FID", "IID") %in% colnames(phe)))
stopifnot(all(c("FID", "IID") %in% colnames(g1)))
stopifnot(all(c("FID", "IID") %in% colnames(g2)))
ret = list()
for (i in 3:nc) {
x = dplyr::left_join(g1[, c(1, 2, i)], g2[, c(1, 2, i)], by = c("FID", "IID"))
if(!is.null(pool)) {
ncpool = ncol(pool)
stopifnot(ncpool > 4) # fid and iid should be there
stopifnot(all(c("FID", "IID") %in% colnames(geno_pool)))
idx = c(1, 2, sample(3:ncpool, 3))
x = dplyr::left_join(x, pool[, idx], by = c("FID", "IID"))
}
h = getHaplo(x, 1:2)
dat = dplyr::inner_join(phe, h[, c("FID", "IID", "diplo")], by = c("FID", "IID"))
name = paste0(names(g1)[i], "_", names(g2)[i])
ret[[name]] = dat
}
ret
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.