R/~old/dev/5_annovar/annotateSnps.R

Defines functions annotateSnps

#' Annotate functionality of SNPs using AnnoVar
#' (http://annovar.openbioinformatics.org/en/latest/#annovar-documentation)

#' @param snpList (char) path to file with list of pathway SNPs from GSEA
#' @param dbsnpFile (char) path to AnnoVar SNP annotation file
#' @param TABLE_AV (char) path to table_annovar.pl
#' @param ANNO_VAR (char) path to annotate_variation.pl
#' @param annodb (char) path to AnnoVar annotation directory
#' (eg. for purposes of this function, using hg19 build annotation)
#' @param annoBuild (char) string indicating genome build
#' @param dbString (char) string indicating the databases that will be used
#' to annotate each SNP provided in snpList
#' @param opString (char) string indicating which operation to use for each
#' database in dbString (eg. g=gene-based, r=region-based, f=filter-based)
#' @return (char) files containing AnnoVar annotations
#' @export

annotateSnps <- function(snpList, snpGroup=c("top","bottom","random","all"),
                         dbsnpFile, TABLE_AV, ANNO_VAR, annodb,
                         annoBuild="hg19",
                         dbString=paste0("refGene,cytoBand,genomicSuperDups,",
                                         "avsnp144,ljb26_all,1000g2015aug_all"),
                         opString="g,r,r,f,f,f") {
out <- tryCatch({

  sink(sprintf("%s/annotateSnps.log", outDir))
  x <- Sys.info()["nodename"]
  if (any(grep("^gpc", x))) cat("on scinet\n") else cat("on workstation\n")

  for (i in 1:length(snpGroup)) {
    message(sprintf("Creating AnnoVar input file for %s SNPs...", snpGroup[i]))
    system(sprintf("fgrep -w -f %s_%s %s > %s/%s_snplist.avinput",
                   snpList, snpGroup[i], dbsnpFile, outDir, snpGroup[i]))
    message(" done.\n")

    # Annotate variants in the avinput file
    message("AnnoVar processes beginnning...")
    str1 <- sprintf("perl %s %s/%s_snplist.avinput %s",
                    TABLE_AV, outDir, snpGroup[i], annodb)
    str2 <- sprintf("-buildver %s -out %s/%s_snp_anno",
                    annoBuild, outDir, snpGroup[i])
    str3 <- sprintf("-remove -protocol %s -operation %s", dbString, opString)
    str4 <- sprintf("-nastring .")
    cmd  <- sprintf("%s %s %s %s", str1, str2, str3, str4)
    system(cmd)

    # Run gene-based annotations (ie. classify them as intergenic, intronic,
    # non-synonymous SNP, frameshift deletion, large-scale duplication, etc.)
    str1 <- sprintf("perl %s -geneanno", ANNO_VAR)
    str2 <- sprintf("-buildver %s %s/%s_snplist.avinput %s",
                    annoBuild, outDir, snpGroup[i], annodb)
    cmd <- sprintf("%s %s", str1, str2)
    system(cmd)

    # Run region-based annotations (ie. annotate variants that fall within
    # conserved regions)
    str1 <- sprintf("perl %s -regionanno", ANNO_VAR)
    str2 <- sprintf("-build %s -out %s/%s_phastCons100way",
                    annoBuild, outDir, snpGroup[i])
    str3 <- sprintf("-dbtype phastConsElements100way %s/%s_snplist.avinput %s",
                    outDir, snpGroup[i], annodb)
    cmd <- sprintf("%s %s %s", str1, str2, str3)
    system(cmd)

    # Run filter-based annotations (ie. identify a subset of variants input file
    # that are not observed in 1000G version 2014 Oct and those that are observed
    # with allele frequencies)
    #cat("Running filter-based annotations...")
    #str1 <- sprintf("perl %s -filter", ANNO_VAR)
    #str2 <- sprintf("-dbtype 1000g2014oct_all -buildver %s %s/snplist.avinput %s",
    #                annoBuild, outDir, annodb)
    #cmd <- sprintf("%s %s", str1, str2)
    #system(cmd)
    #cat(" done.\n")

    # Print summary of SNP annotations
    cat(sprintf("\n------Annotation summary for %s SNPs------\n", snpGroup[i]))
    dat <- read.delim(sprintf("%s/%s_snp_anno.hg19_multianno.txt",
                      outDir, snpGroup[i]), h=T, as.is=T)

    # Write output to excel format
    write.xlsx(dat, file=sprintf("%s/%s_snp_anno.hg19_multianno.xlsx",
               outDir, snpGroup[i]), col=T, row=F)

     # Pie chart of SNP functional annotations (all annotated SNPs)
     annoTable <- table(dat$Func.refGene)
     annoTabledf <- as.data.frame(annoTable)
     pred_anno <- annoTabledf$Var1
     freq_anno <- annoTabledf$Freq
       freq_anno_pct <- round(freq_anno/sum(freq_anno) * 100)
       freq_anno_pct <- paste(freq_anno_pct, "%", sep="")
       freq_anno_pct <- sprintf("(%s)", freq_anno_pct)
     plot_anno_lbls <- paste(pred_anno, freq_anno_pct, sep=" ")

     png(file=sprintf("%s/%s_function_piechart.png",
         outDir, snpGroup[i]))
     pie(annoTable, labels=plot_anno_lbls,
         col=rainbow_hcl(length(annoTable)),
         main=paste0(sprintf("Proportion of %s variant function", snpGroup[i])))
     dev.off()

    # Rearrange data to get columns in the following order: "avsnp144 (rsID)",
    # "Func.refGene", "ExonicFunc.refGene", "SIFT_score", "SIFT_pred",
    # "Polyphen2_HDIV_score", "Polyphen2_HDIV_pred", "Polyphen2_HVAR_score",
    # "Polyphen2_HVAR_pred", etc..
    dat2 <- dat[c(13,6,9,14:ncol(dat))]
    cat(sprintf("Total number of annotated SNPs: %i\n", nrow(dat2)))
    cat("Function of all annotated SNPs in refGene database:")
    print(table(dat2[2]))

    # Determining predictions
    cat("Functional predictions per annotation database:\n")
    # PolyPhen2 HDIV prediction
    type_of_change <- data.frame()    # Initialize empty data frame
    exonic <- dat2[which(dat2$Func.refGene == "exonic"),]

    polyphenPred <- exonic$Polyphen2_HDIV_pred
    for (i in 1:nrow(exonic)) {
      if(polyphenPred[i] == ".") {
        type_of_change[i, "Polyphen2_HDIV_pred"] <- "missing"
      }
      else if(polyphenPred[i] %in% "D") {
        type_of_change[i, "Polyphen2_HDIV_pred"] <- "probably_damaging"
      }
      else if(polyphenPred[i] %in% "P") {
        type_of_change[i, "Polyphen2_HDIV_pred"] <- "possibly_damaging"
      }
      else if(polyphenPred[i] %in% "B") {
        type_of_change[i, "Polyphen2_HDIV_pred"] <- "benign"
      }
      else {
        type_of_change[i, "Polyphen2_HDIV_pred"] <- "other"
      }
    }

    # PolyPhen2 HVAR prediction
    polyphenPred2 <- exonic$Polyphen2_HVAR_pred
    for (i in 1:nrow(exonic)) {
      if(polyphenPred2[i] == ".") {
        type_of_change[i, "Polyphen2_HVAR_pred"] <- "missing"
      }
      else if(polyphenPred2[i] %in% "D") {
        type_of_change[i, "Polyphen2_HVAR_pred"] <- "probably_damaging"
      }
      else if(polyphenPred2[i] %in% "P") {
        type_of_change[i, "Polyphen2_HVAR_pred"] <- "possibly_damaging"
      }
      else if(polyphenPred2[i] %in% "B") {
        type_of_change[i, "Polyphen2_HVAR_pred"] <- "benign"
      }
      else {
        type_of_change[i, "Polyphen2_HVAR_pred"] <- "other"
      }
    }

    # SIFT prediction
    SIFTpred <- exonic$SIFT_pred
    for (i in 1:nrow(exonic)) {
     if(SIFTpred[i] == ".") {
       type_of_change[i, "SIFT_pred"] <- "missing"
     }
     else if(SIFTpred[i] == "D") {
       type_of_change[i, "SIFT_pred"] <- "damaging"
     }
     else if(SIFTpred[i] == "T") {
       type_of_change[i, "SIFT_pred"] <- "tolerated"
     }
     else {
       type_of_change[i, "SIFT_pred"] <- "other"
     }
    }

    # LRT prediction
    LRTpred <- exonic$LRT_pred
    for (i in 1:nrow(exonic)) {
      if(LRTpred[i] == ".") {
        type_of_change[i, "LRT_pred"] <- "missing"
      }
      else if(LRTpred[i] == "D") {
        type_of_change[i, "LRT_pred"] <- "deleterious"
      }
      else if(LRTpred[i] == "N") {
        type_of_change[i, "LRT_pred"] <- "neutral"
      }
      else if(LRTpred[i] == "U") {
        type_of_change[i, "LRT_pred"] <- "unknown"
      }
    }

     # MutationTaster prediction
     MTpred <- exonic$MutationTaster_pred
     for (i in 1:nrow(exonic)) {
       if(MTpred[i] == ".") {
         type_of_change[i, "MutationTaster_pred"] <- "missing"
       }
       else if(MTpred[i] == "A") {
         type_of_change[i, "MutationTaster_pred"] <- "disease_causing_automatic"
       }
       else if(MTpred[i] == "D") {
         type_of_change[i, "MutationTaster_pred"] <- "disease_causing"
       }
       else if(MTpred[i] == "N") {
         type_of_change[i, "MutationTaster_pred"] <- "polymorphism"
       }
       else if(MTpred[i] == "P") {
         type_of_change[i, "MutationTaster_pred"] <- "polymorphism_automatic"
       }
       else {
         type_of_change[i, "MutationTaster_pred"] <- "other"
       }
     }

     # MutationAssessor prediction
     MApred <- exonic$MutationAssessor_pred
     for (i in 1:nrow(exonic)) {
       if(MApred[i] == ".") {
         type_of_change[i, "MutationAssessor_pred"] <- "missing"
       }
       else if(MApred[i] == "H") {
         type_of_change[i, "MutationAssessor_pred"] <- "high_functional"
       }
       else if(MApred[i] == "M") {
         type_of_change[i, "MutationAssessor_pred"] <- "medium_functional"
       }
       else if(MApred[i] == "L") {
         type_of_change[i, "MutationAssessor_pred"] <- "low_non-functional"
       }
       else if(MApred[i] == "N") {
         type_of_change[i, "MutationAssessor_pred"] <- "neutral_non-functional"
       }
       else {
         type_of_change[i, "MutationAssessor_pred"] <- "other"
       }
     }

     # FATHMM prediction
     FATHMMpred <- exonic$FATHMM_pred
     for (i in 1:nrow(exonic)) {
       if(FATHMMpred[i] == ".") {
         type_of_change[i, "FATHMM_pred"] <- "missing"
        }
       else if(FATHMMpred[i] == "D") {
          type_of_change[i, "FATHMM_pred"] <- "damaging"
        }
       else if(FATHMMpred[i] == "T") {
         type_of_change[i, "FATHMM_pred"] <- "tolerated"
       }
       else {
         type_of_change[i, "FATHMM_pred"] <- "other"
       }
     }

     # GERP++_RS prediction
     GERPpred <- as.numeric(exonic$GERP)
     for (i in 1:nrow(exonic)) {
       if(GERPpred[i] %in% NA) {
         type_of_change[i, "GERP"] <- "missing"
        }
       else if(GERPpred[i] > 0) {
          type_of_change[i, "GERP"] <- "conserved"
         }
       else if(GERPpred[i] < 0) {
          type_of_change[i, "GERP"] <- "not_conserved"
         }
       else {
          type_of_change[i, "GERP"] <- "other"
         }
      }

    # Print table of AnnoVar predictions for each exonic SNV to log
    exonic_SNVfunc <- exonic[c(1,3)]
    colnames(exonic_SNVfunc) <- c("Exonic_SNV", "Function")
    exonic_predTable <- cbind(exonic_SNVfunc, type_of_change)
    print(exonic_predTable)
    }
  })
  return(out); gc(out)
  finally = {
   cat("...closing log.\n")
    sink(NULL)
  }
}
BaderLab/POPPATHR documentation built on Dec. 17, 2021, 9:53 a.m.