#' @title get compound heterozygous knockouts
#' @description manually open the phased GT of each individual in the context of a
#' gene, to figure out which one are compound hetz.
#' todo: rework for smarter, less laborious.
#' todo: also allow single homozygous (trans) variants
#' @param vcf a phased vcf file.
#' @param ref a reference file with at least the follwoing columns:
#' chr, bp, rsid, gene
#' @param vcf.col.chr integer. column for chromosome.
#' @param vcf.col.rsid integer.
#' @param ref.col.chr integer.
#' @param ref.col.rsid integer.
#' @param ref.col.gene gene.
#' @param ref.col.info info column wheree entries are seperated by '|'
#' @param verbose boolean.
#' @param verbose.iter how often should iter be printed.
#'
#' @note VCF is expected to have a header. Thus, use fwrite(.., header = T).
#' @export
get_compound_hetz_ko <- function(vcf, vcf.col.chr = NULL, vcf.col.rsid = NULL,
ref, ref.col.chr = NULL, ref.col.rsid = NULL, ref.col.gene = NULL, ref.col.info = NULL,
verbose = T, verbose.iter = 10000){
require(data.table)
# ensure that cols are integers
stopifnot(is.numeric(vcf.col.chr))
stopifnot(is.numeric(vcf.col.rsid))
stopifnot(is.numeric(ref.col.chr))
stopifnot(is.numeric(ref.col.rsid))
stopifnot(is.numeric(ref.col.gene))
stopifnot(is.numeric(ref.col.info))
# check that dimensions work out
stopifnot(all(dim(vcf) > 1))
stopifnot(all(dim(ref) > 1))
stopifnot(ncol(ref) == 4)
#######
# ref #
#######
# check reference input
ref.cols = c(ref.col.chr, ref.col.rsid, ref.col.gene, ref.col.info)
if (any(ref.cols > ncol(ref))) stop('ref.cols specified exceed columns in ref file!')
# for reference keep only the three columns
ref = setDT(ref)
ref = ref[,c(ref.col.chr, ref.col.rsid, ref.col.gene, ref.col.info), with = F]
colnames(ref) <- c('chr','rsid','gene','info')
ref$chr <- as.character(ref$chr)
ref = ref[!duplicated(ref),]
#######
# vcf #
#######
# Get individuals, GT, and rsid only
vcf = setDT(vcf)
colnames(vcf)[c(vcf.col.chr, vcf.col.rsid)] <- c('chr','rsid')
vcf$chr <- as.character(vcf$chr)
cnames = colnames(vcf)
# convert into seperate data.table
bool_ind = grepl("[0-9]+$", cnames, perl = T)
ind_id = cnames[bool_ind]
vcf_ind = cbind(vcf[,c('chr','rsid')], vcf[, bool_ind, with = F])
# merge by rsid to find trans compound hetz
refvcf = merge.data.table(ref, vcf_ind, by = c('chr','rsid'))
recurring_genes = unique(refvcf$gene[duplicated(refvcf$gene)])
if (nrow(refvcf) == 0) stop('No chr and variant match between vcf and ref file.')
if (length(recurring_genes) == 0) stop(paste('No recurring genes were found. Stopping.'))
refvcf = refvcf[refvcf$gene %in% recurring_genes, ]
######################
# find compound hetz #
######################
# restrict analysis to people with at least one alternate variant
alt_allele = apply(refvcf[,-c('chr','rsid','gene','info')], 2, function(y) grepl('1',y))
alt_allele_colsums = colSums(alt_allele)
ind_id_ok = names(alt_allele_colsums)[alt_allele_colsums > 0]
n_ind = length(ind_id_ok)
if (verbose) write(paste('[MERGED REF/VCF]', length(ind_id_ok),'individuals with alt allele.'),stdout())
#note: 2409598 is a compound hetz
result_gene <- lapply(recurring_genes, function(g){
# gene-wise iteration
cur_gene = refvcf[refvcf$gene == g,]
count = 0
# sample id-wise iteration
result_id <- lapply(ind_id_ok, function(id){
# counts
if (verbose & (count %% verbose.iter == 0)) write(paste(g, '-', count,'/',n_ind), stdout())
count <<- count + 1
# only continue if individual have
# at least one heterozygous mutation
gt_tmp = cur_gene[,id, with = F]
if (any(grepl('1',gt_tmp))){
# combine meta data alongside individual phased genotype
tmp = cbind(cur_gene[,c('chr','rsid','gene','info')], gt_tmp)
colnames(tmp) = c('chr','rsid','gene','info', 'gt')
tmp$ind = id
# seperate phased genotype and convert it to numeric
gt = data.frame(do.call(rbind, strsplit(tmp$gt, '\\|')), stringsAsFactors = F)
gt = data.frame(apply(gt, 2, as.numeric))
colnames(gt) <- c('S1', 'S2')
tmp <- cbind(tmp, gt)
# only keep rows with potential hetz
bool_heterozygous = as.vector(rowSums(gt) > 0)
tmp = tmp[bool_heterozygous,]
if (nrow(tmp) > 1){
# check for type of human knockout
ko_type = ifelse(all(colSums(gt) > 0), 'TRANS', 'CIS')
ko = ifelse(any(rowSums(gt) > 1), 'HOMOZYGOUS','COMPOUND_HETEROZYGOUS')
impact = ko_type == 'TRANS'
# save in data.frame
result = data.frame(
sample_id = id,
snp_chr = unique(tmp$chr),
snp_gene = g,
knockout = ko,
knockout_how = ko_type,
knocout_impact = impact,
genotype = paste(tmp$rsid, tmp$gt, collapse = ';', sep = '='),
geno = paste(tmp$rsid, tmp$gt, collapse = ';', sep = '='),
info = paste(tmp$rsid, tmp$info, collapse = ';', sep = '=')
)
# verbose output
if (verbose) write(paste0(id, ' = ',ko,' LOF (',ko_type,')'), stdout())
return(result)
}
}
})
return(do.call(rbind, result_id))
})
# finally merge list of data.frames
result = setDT(as.data.frame(do.call(rbind, result_gene)))
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.