R/get_compound_hetz_ko.R

Defines functions get_compound_hetz_ko

Documented in get_compound_hetz_ko

#' @title get compound heterozygous knockouts
#' @description manually open the phased GT of each individual in the context of a
#' gene, to figure out which one are compound hetz.
#' todo: rework for smarter, less laborious.
#' todo: also allow single homozygous (trans) variants
#' @param vcf a phased vcf file.
#' @param ref a reference file with at least the follwoing columns:
#' chr, bp, rsid, gene
#' @param vcf.col.chr integer. column for chromosome.
#' @param vcf.col.rsid integer.
#' @param ref.col.chr integer.
#' @param ref.col.rsid integer.
#' @param ref.col.gene gene.
#' @param ref.col.info info column wheree entries are seperated by '|'
#' @param verbose boolean.
#' @param verbose.iter how often should iter be printed.
#'
#' @note VCF is expected to have a header. Thus, use fwrite(.., header = T).
#' @export


get_compound_hetz_ko <- function(vcf, vcf.col.chr = NULL, vcf.col.rsid = NULL,
                                 ref, ref.col.chr = NULL, ref.col.rsid = NULL, ref.col.gene = NULL, ref.col.info = NULL,
                                 verbose = T, verbose.iter = 10000){

  require(data.table)

  # ensure that cols are integers
  stopifnot(is.numeric(vcf.col.chr))
  stopifnot(is.numeric(vcf.col.rsid))
  stopifnot(is.numeric(ref.col.chr))
  stopifnot(is.numeric(ref.col.rsid))
  stopifnot(is.numeric(ref.col.gene))
  stopifnot(is.numeric(ref.col.info))

  # check that dimensions work out
  stopifnot(all(dim(vcf) > 1))
  stopifnot(all(dim(ref) > 1))
  stopifnot(ncol(ref) == 4)

  #######
  # ref #
  #######

  # check reference input
  ref.cols = c(ref.col.chr, ref.col.rsid, ref.col.gene, ref.col.info)
  if (any(ref.cols > ncol(ref))) stop('ref.cols specified exceed columns in ref file!')

  # for reference keep only the three columns
  ref = setDT(ref)
  ref = ref[,c(ref.col.chr, ref.col.rsid, ref.col.gene, ref.col.info), with = F]
  colnames(ref) <- c('chr','rsid','gene','info')
  ref$chr <- as.character(ref$chr)
  ref = ref[!duplicated(ref),]

  #######
  # vcf #
  #######

  # Get individuals, GT, and rsid only
  vcf = setDT(vcf)
  colnames(vcf)[c(vcf.col.chr, vcf.col.rsid)] <- c('chr','rsid')
  vcf$chr <- as.character(vcf$chr)
  cnames = colnames(vcf)

  # convert into seperate data.table
  bool_ind = grepl("[0-9]+$", cnames, perl = T)
  ind_id = cnames[bool_ind]
  vcf_ind = cbind(vcf[,c('chr','rsid')], vcf[, bool_ind, with = F])

  # merge by rsid to find trans compound hetz
  refvcf = merge.data.table(ref, vcf_ind, by = c('chr','rsid'))
  recurring_genes = unique(refvcf$gene[duplicated(refvcf$gene)])
  if (nrow(refvcf) == 0) stop('No chr and variant match between vcf and ref file.')
  if (length(recurring_genes) == 0) stop(paste('No recurring genes were found. Stopping.'))
  refvcf = refvcf[refvcf$gene %in% recurring_genes, ]

  ######################
  # find compound hetz #
  ######################

  # restrict analysis to people with at least one alternate variant
  alt_allele = apply(refvcf[,-c('chr','rsid','gene','info')], 2, function(y) grepl('1',y))
  alt_allele_colsums = colSums(alt_allele)
  ind_id_ok = names(alt_allele_colsums)[alt_allele_colsums > 0]
  n_ind = length(ind_id_ok)
  if (verbose) write(paste('[MERGED REF/VCF]', length(ind_id_ok),'individuals with alt allele.'),stdout())

  #note: 2409598 is a compound hetz
  result_gene <- lapply(recurring_genes, function(g){

    # gene-wise iteration
    cur_gene = refvcf[refvcf$gene == g,]
    count = 0

    # sample id-wise iteration
    result_id <- lapply(ind_id_ok, function(id){

      # counts
      if (verbose & (count %% verbose.iter == 0)) write(paste(g, '-', count,'/',n_ind), stdout())
      count <<- count + 1

      # only continue if individual have
      # at least one heterozygous mutation

      gt_tmp = cur_gene[,id, with = F]

      if (any(grepl('1',gt_tmp))){

        # combine meta data alongside individual phased genotype
        tmp = cbind(cur_gene[,c('chr','rsid','gene','info')], gt_tmp)
        colnames(tmp) = c('chr','rsid','gene','info', 'gt')
        tmp$ind = id

        # seperate phased genotype and convert it to numeric
        gt = data.frame(do.call(rbind, strsplit(tmp$gt, '\\|')), stringsAsFactors = F)
        gt = data.frame(apply(gt, 2, as.numeric))
        colnames(gt) <- c('S1', 'S2')
        tmp <- cbind(tmp, gt)

        # only  keep rows with potential hetz
        bool_heterozygous = as.vector(rowSums(gt) > 0)
        tmp = tmp[bool_heterozygous,]

        if (nrow(tmp) > 1){

          # check for type of human knockout
          ko_type = ifelse(all(colSums(gt) > 0), 'TRANS', 'CIS')
          ko = ifelse(any(rowSums(gt) > 1), 'HOMOZYGOUS','COMPOUND_HETEROZYGOUS')
          impact = ko_type == 'TRANS'

          # save in data.frame
          result = data.frame(
              sample_id = id,
              snp_chr = unique(tmp$chr),
              snp_gene = g,
              knockout = ko,
              knockout_how = ko_type,
              knocout_impact = impact,
              genotype = paste(tmp$rsid, tmp$gt, collapse = ';', sep = '='),
              geno = paste(tmp$rsid, tmp$gt, collapse = ';', sep = '='),
              info = paste(tmp$rsid, tmp$info, collapse = ';', sep = '=')
          )

          # verbose output
          if (verbose) write(paste0(id, ' = ',ko,' LOF (',ko_type,')'), stdout())
          return(result)

        }
      }
    })

    return(do.call(rbind, result_id))
  })

  # finally merge list of data.frames
  result = setDT(as.data.frame(do.call(rbind, result_gene)))
  return(result)
}
frhl/our documentation built on Feb. 5, 2021, 7:30 p.m.