#' @title get compound heterozygous knockouts
#' @param vcf a phased vcf file.
#' @param ref a reference file with at least the follwoing columns:
#' chr, bp, rsid, gene
#' @param vcf.col.chr integer.
#' @param vcf.col.rsid integer.
#' @param ref.col.chr integer.
#' @param ref.col.rsid integer.
#' @param ref.col.gene gene.
#' @param verbose boolean.
#'
#' @note VCF is expected to have a header. Thus, use fwrite(.., header = T).
#' @export
get_compound_hetz_ko <- function(vcf, vcf.col.chr = 1, vcf.col.rsid = 3,
ref, ref.col.chr = 1, ref.col.rsid = 2, ref.col.gene = 3,
verbose = T){
if (F){
### debugging qsub
require(data.table)
vcf.col.chr = 1
vcf.col.rsid = 3
ref.col.chr = 1
ref.col.rsid = 2
ref.col.gene = 3
ref.col.info = 4
verbose = T
ref = '/well/lindgren/UKBIOBANK/flassen/projects/KO/IMPUTATION/derived/plof/vep_compound_hetz21.tmp'
vcf = '/well/lindgren/UKBIOBANK/flassen/projects/KO/IMPUTATION/derived/phased/phased_ukb_plof_regions_maf_info_500k_1206v_chr21.vcf.gz'
vcf <- fread(vcf, sep = '\t', skip = "#", header = T)
ref <- fread(ref, header = F, sep = ' ')
#####
# for debugging on server
require(data.table)
path = "phased_ukb_plof_regions_maf_info_500k_1206v_chr1.vcf.gz"
vcf <- fread(path, sep = '\t', skip = "#")
ref.path = "../ukbb_mfi_vep_all_HIGH_maf_info.txt"
# args
vcf.col.chr = 1
vcf.col.rsid = 3
ref.col.chr = 1
ref.col.rsid = 4
ref.col.gene = 6
# normally ref should be handled in unix
ref1 <- fread(ref.path, sep = ' ')
ref2 <- fread(ref.path, sep = '|')
x1 <- ref1[,c(1,4), with = F]
x2 <- ref2[,c(6), with = F]
ref <- cbind(x1,x2)
verbose = T
}
#########
# input #
#########
# Get individuals, GT, and rsid only
vcf = setDT(vcf)
colnames(vcf)[c(vcf.col.chr, vcf.col.rsid)] <- c('chr','rsid')
vcf$chr <- as.character(vcf$chr)
cnames = colnames(vcf)
# convert into seperate data.table
bool_ind = grepl("^[0-9]+$", cnames, perl = T)
ind_id = cnames[bool_ind]
vcf_ind = cbind(vcf[,c('chr','rsid')], vcf[, bool_ind, with = F])
# print some stats
#if (verbose) write(paste('[VCF] data.table shape:', paste(dim(vcf_ind),collapse =','),'with',length((unique(ind_id))),'unique individuals.'),stdout())
#if (verbose) write(paste('[VCF] data.table shape:', paste(dim(vcf_ind),collapse =','),'with',length((unique(vcf$rsid))),'unique SNPs.'),stdout())
# check input
ref.cols = c(ref.col.chr, ref.col.rsid, ref.col.gene)
#if (any(ref.cols > ncol(ref))) stop('ref.cols specified exceed columns in ref file!')
ref = setDT(ref)
ref = ref[,c(ref.col.chr, ref.col.rsid, ref.col.gene), with = F]
colnames(ref) <- c('chr','rsid','gene')
ref$chr <- as.character(ref$chr)
ref = ref[!duplicated(ref),]
# print some stats
#if (verbose) write(paste('[REF] data.table shape:', paste(dim(ref),collapse =','),'with',length((unique(ref$rsid))),'unique SNPs.'),stdout())
#if (verbose) write(paste('[REF] data.table shape:', paste(dim(ref),collapse =','),'with',length((unique(ref$gene))),'unique Genes.'),stdout())
#########
# merge #
#########
# merge by rsid to find trans compound hetz
refvcf = merge.data.table(ref, vcf_ind, by = c('chr','rsid'))
recurring_genes = refvcf$gene[duplicated(refvcf$gene)]
#if (verbose) write(paste('[MERGED REF/VCF] data.table shape:', paste(dim(refvcf),collapse =','),'with',length((unique(refvcf$gene))),
# 'unique genes [', length(recurring_genes), 'recurring genes ]'),stdout())
# restrict analysis to recurring genes
refvcf = refvcf[refvcf$gene %in% recurring_genes, ]
#if (verbose) write(paste('[MERGED REF/VCF] data.table shape:', paste(dim(refvcf),collapse =',')),stdout())
#if (verbose) write(paste('[MERGED REF/VCF] recurring genes:', paste(recurring_genes,collapse =';')),stdout())
# ensure that at least two genes are
# implicated by at least one mutation.
if (length(recurring_genes) > 0){
# restrict analysis to people with at least one alternate variant
alt_allele = apply(refvcf[,-c('chr','rsid','gene')], 2, function(y) grepl('1',y))
alt_allele_colsums = colSums(alt_allele)
ind_id_ok = names(alt_allele_colsums)[alt_allele_colsums > 0]
if (verbose) write(paste('[MERGED REF/VCF]', length(ind_id_ok),'individuals with alt allele.'),stdout())
#note: 2409598 is a compound hetz
n_ind = length(ind_id_ok)
######################
# find compound hetz #
######################
if (verbose) write(paste('[Finding knockouts]: time started:', Sys.time()),stdout())
result_gene <- lapply(refvcf$gene[1], function(g){
# gene-wise iteration
cur_gene = refvcf[refvcf$gene == g,]
count = 0
# sample id-wise iteration
result_id <- lapply(ind_id_ok, function(id){
# counts
if (verbose & (count %% 1000 == 0)) write(paste(g, '-', count,'/',n_ind), stdout())
count <<- count + 1
# only continue if individual have
# at least one heterozygous mutation
gt_tmp = cur_gene[,id, with = F]
if (any(grepl('1',gt_tmp))){
# combine meta data alongside individual phased genotype
tmp = cbind(cur_gene[,c('chr','rsid','gene')], gt_tmp)
colnames(tmp) = c('chr','rsid','gene', 'gt')
tmp$ind = id
# seperate phased genotype and convert it to numeric
gt = data.frame(do.call(rbind, strsplit(tmp$gt, '\\|')), stringsAsFactors = F)
gt = data.frame(apply(gt, 2, as.numeric))
colnames(gt) <- c('S1', 'S2')
tmp <- cbind(tmp, gt)
# only keep rows with potential hetz
bool_heterozygous = as.vector(rowSums(gt) > 0)
tmp = tmp[bool_heterozygous,]
if (nrow(tmp) > 1){
ko_type = ifelse(all(colSums(gt) > 0), 'TRANS', 'CIS')
write(paste0(id, ' = compound heterozygous KO (',ko_type,')'), stdout())
# save in data.frame
result = data.frame(
sample_id = id,
snp_chr = unique(tmp$chr),
gene = g,
compound_hetz_type = ko_type,
gene_knockout = as.numeric(ko_type == 'TRANS'),
genotype = paste(tmp$rsid, tmp$gt, collapse = ';', sep = '=')
)
}
}
})
return(do.call(rbind, result_id))
})
# finally merge data
if (verbose) write(paste('[Finding knockouts]: time ended:', Sys.time()),stdout())
dt_result = setDT(as.data.frame(do.call(rbind, result_gene)))
return(dt_result)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.