workflows/210202_get_compound_hetz_old.R

#' @title get compound heterozygous knockouts
#' @param vcf a phased vcf file.
#' @param ref a reference file with at least the follwoing columns:
#' chr, bp, rsid, gene
#' @param vcf.col.chr integer.
#' @param vcf.col.rsid integer.
#' @param ref.col.chr integer.
#' @param ref.col.rsid integer.
#' @param ref.col.gene gene.
#' @param verbose boolean.
#'
#' @note VCF is expected to have a header. Thus, use fwrite(.., header = T).
#' @export


get_compound_hetz_ko <- function(vcf, vcf.col.chr = 1, vcf.col.rsid = 3,
                                 ref, ref.col.chr = 1, ref.col.rsid = 2, ref.col.gene = 3,
                                 verbose = T){

  if (F){

    ### debugging qsub

    require(data.table)
    vcf.col.chr = 1
    vcf.col.rsid = 3
    ref.col.chr = 1
    ref.col.rsid = 2
    ref.col.gene = 3
    ref.col.info = 4
    verbose = T

    ref = '/well/lindgren/UKBIOBANK/flassen/projects/KO/IMPUTATION/derived/plof/vep_compound_hetz21.tmp'
    vcf = '/well/lindgren/UKBIOBANK/flassen/projects/KO/IMPUTATION/derived/phased/phased_ukb_plof_regions_maf_info_500k_1206v_chr21.vcf.gz'
    vcf <- fread(vcf, sep = '\t', skip = "#", header = T)
    ref <- fread(ref, header = F, sep = ' ')


    #####

    # for debugging on server
    require(data.table)
    path = "phased_ukb_plof_regions_maf_info_500k_1206v_chr1.vcf.gz"
    vcf <- fread(path, sep = '\t', skip = "#")
    ref.path = "../ukbb_mfi_vep_all_HIGH_maf_info.txt"
    # args
    vcf.col.chr = 1
    vcf.col.rsid = 3
    ref.col.chr = 1
    ref.col.rsid = 4
    ref.col.gene = 6
    # normally ref should be handled in unix
    ref1 <- fread(ref.path, sep = ' ')
    ref2 <- fread(ref.path, sep = '|')
    x1 <- ref1[,c(1,4), with = F]
    x2 <- ref2[,c(6), with = F]
    ref <- cbind(x1,x2)
    verbose = T
  }


  #########
  # input #
  #########

  # Get individuals, GT, and rsid only
  vcf = setDT(vcf)
  colnames(vcf)[c(vcf.col.chr, vcf.col.rsid)] <- c('chr','rsid')
  vcf$chr <- as.character(vcf$chr)
  cnames = colnames(vcf)

  # convert into seperate data.table
  bool_ind = grepl("^[0-9]+$", cnames, perl = T)
  ind_id = cnames[bool_ind]
  vcf_ind = cbind(vcf[,c('chr','rsid')], vcf[, bool_ind, with = F])

  # print some stats
  #if (verbose) write(paste('[VCF] data.table shape:', paste(dim(vcf_ind),collapse =','),'with',length((unique(ind_id))),'unique individuals.'),stdout())
  #if (verbose) write(paste('[VCF] data.table shape:', paste(dim(vcf_ind),collapse =','),'with',length((unique(vcf$rsid))),'unique SNPs.'),stdout())

  # check input
  ref.cols = c(ref.col.chr, ref.col.rsid, ref.col.gene)
  #if (any(ref.cols > ncol(ref))) stop('ref.cols specified exceed columns in ref file!')

  ref = setDT(ref)
  ref = ref[,c(ref.col.chr, ref.col.rsid, ref.col.gene), with = F]
  colnames(ref) <- c('chr','rsid','gene')
  ref$chr <- as.character(ref$chr)
  ref = ref[!duplicated(ref),]

  # print some stats
  #if (verbose) write(paste('[REF] data.table shape:', paste(dim(ref),collapse =','),'with',length((unique(ref$rsid))),'unique SNPs.'),stdout())
  #if (verbose) write(paste('[REF] data.table shape:', paste(dim(ref),collapse =','),'with',length((unique(ref$gene))),'unique Genes.'),stdout())

  #########
  # merge #
  #########

  # merge by rsid to find trans compound hetz
  refvcf = merge.data.table(ref, vcf_ind, by = c('chr','rsid'))
  recurring_genes = refvcf$gene[duplicated(refvcf$gene)]
  #if (verbose) write(paste('[MERGED REF/VCF] data.table shape:', paste(dim(refvcf),collapse =','),'with',length((unique(refvcf$gene))),
  #                         'unique genes [', length(recurring_genes), 'recurring genes ]'),stdout())

  # restrict analysis to recurring genes
  refvcf = refvcf[refvcf$gene %in% recurring_genes, ]
  #if (verbose) write(paste('[MERGED REF/VCF] data.table shape:', paste(dim(refvcf),collapse =',')),stdout())
  #if (verbose) write(paste('[MERGED REF/VCF] recurring genes:', paste(recurring_genes,collapse =';')),stdout())

  # ensure that at least two genes are
  # implicated by at least one mutation.
  if (length(recurring_genes) > 0){

    # restrict analysis to people with at least one alternate variant
    alt_allele = apply(refvcf[,-c('chr','rsid','gene')], 2, function(y) grepl('1',y))
    alt_allele_colsums = colSums(alt_allele)
    ind_id_ok = names(alt_allele_colsums)[alt_allele_colsums > 0]
    if (verbose) write(paste('[MERGED REF/VCF]', length(ind_id_ok),'individuals with alt allele.'),stdout())

    #note: 2409598 is a compound hetz
    n_ind = length(ind_id_ok)


    ######################
    # find compound hetz #
    ######################


    if (verbose) write(paste('[Finding knockouts]: time started:', Sys.time()),stdout())

    result_gene <- lapply(refvcf$gene[1], function(g){

      # gene-wise iteration
      cur_gene = refvcf[refvcf$gene == g,]
      count = 0

      # sample id-wise iteration
      result_id <- lapply(ind_id_ok, function(id){

        # counts
        if (verbose & (count %% 1000 == 0)) write(paste(g, '-', count,'/',n_ind), stdout())
        count <<- count + 1

        # only continue if individual have
        # at least one heterozygous mutation
        gt_tmp = cur_gene[,id, with = F]

        if (any(grepl('1',gt_tmp))){

          # combine meta data alongside individual phased genotype
          tmp = cbind(cur_gene[,c('chr','rsid','gene')], gt_tmp)
          colnames(tmp) = c('chr','rsid','gene', 'gt')
          tmp$ind = id

          # seperate phased genotype and convert it to numeric
          gt = data.frame(do.call(rbind, strsplit(tmp$gt, '\\|')), stringsAsFactors = F)
          gt = data.frame(apply(gt, 2, as.numeric))
          colnames(gt) <- c('S1', 'S2')
          tmp <- cbind(tmp, gt)

          # only  keep rows with potential hetz
          bool_heterozygous = as.vector(rowSums(gt) > 0)
          tmp = tmp[bool_heterozygous,]

          if (nrow(tmp) > 1){

            ko_type = ifelse(all(colSums(gt) > 0), 'TRANS', 'CIS')
            write(paste0(id, ' = compound heterozygous KO (',ko_type,')'), stdout())

            # save in data.frame
            result = data.frame(
              sample_id = id,
              snp_chr = unique(tmp$chr),
              gene = g,
              compound_hetz_type = ko_type,
              gene_knockout = as.numeric(ko_type == 'TRANS'),
              genotype = paste(tmp$rsid, tmp$gt, collapse = ';', sep = '=')
            )

          }
        }


      })

      return(do.call(rbind, result_id))
    })

    # finally merge data
    if (verbose) write(paste('[Finding knockouts]: time ended:', Sys.time()),stdout())
    dt_result = setDT(as.data.frame(do.call(rbind, result_gene)))
    return(dt_result)
  }
}
frhl/our documentation built on Feb. 5, 2021, 7:30 p.m.