R/runPipelineOnlist.R
In SNPannotator: Investigating the Functional Characteristics of Selected SNPs and Their Vicinity Genomic Region

Documented in annotate

#' Run the annotation pipeline on a list of variants
#'
#' This function receives a list of variants and checks their information on Ensembl website via
#' the Ensembl REST API server.
#'
#' @param rslist A vector of rs numbers.
#' @param server Name of the server. "https://rest.ensembl.org" can be used for GRCh38
#' and "https://grch37.rest.ensembl.org" for GRCh37.
#' @param db The population database for calculating LD scores. This can be found using `listDatabases` function.
#' @param outputPath The report file will be saved in this path as an Excel file (.xlsx)
#' @param window_size Number of base pairs around the variant for checking LD scores (max = 500kb)
#' @param r2 The LD threshold for selecting variants around the target SNP.
#' @param LDlist If set to TRUE, variants in high LD will be found and added to the output.
#' @param cadd If set to TRUE, the CADD scores will be added to variant information.
#' @param geneNames.file path the gene information file (*.rds). Default value is NULL and ENSEMBL website will be checked if no file is provided.
#' @param regulatoryType.file path the variants regulatory type information file (*.rds). Default value is NULL and this step will be skipped if no file is provided.
#' @param cores set to a value above 0 for parallel processing.
#' @return a data table with all variant information is returned.
#' @examples
#' \dontrun{
#' # select the required server
#' server <- "https://grch37.rest.ensembl.org"
#'
#' # select the database for population data
#' # this can be selected from listDatabases() function
#' db <- "1000GENOMES:phase_3:EUR"
#'
#' # create a vector of required SNPs
#' rslist=c('rs236349')
#'
#' output <- annotate(rslist,server,db,
#'   outputPath = paste(tempdir(),'sampleOutput.xlsx',sep="/"),
#'   window_size = 500,
#'   r2 = .9,
#'   cadd = FALSE)
#'   }
#'
annotate <- function(rslist,
                              server,
                              db,
                              outputPath,
                              window_size=500,
                              r2=0.5,
                              LDlist = TRUE,
                              cadd = FALSE,
                              geneNames.file=NULL,
                              regulatoryType.file=NULL,
                              cores=0) {


  if(file.exists(outputPath))
  {
    outputPath = sprintf('%s_%s',createRandString(),outputPath)
  }


  stopifnot('RS list must be a vector.' = is.vector(rslist),
            'Range for r2 is 0-1' = (r2 >0 & r2<=1),
            'Range for window size is 100-500' = (window_size > 100 & window_size<=500)
  )

  # check if system has the number of selected cores
  check.core.count(cores)


  # load gene name file - to add gene information for intergenic variations
  # based on grch37 or 38
  g.set <- NULL
  get.gene.info.from.API <- TRUE

  if(!is.null(geneNames.file))
  {
    g.set <- getGeneFile.external(geneNames.file)
    get.gene.info.from.API <- FALSE
  }



  # load regulatory file
  # based on grch37
  r.set <- NULL
  if(!is.null(regulatoryType.file))
    r.set <- getRegulatoryFile.external(regulatoryType.file)

  # load cytoband file
  # based on grch37
  c.set <- getCytobandFile(server)

  # set start time
  start.time <-  proc.time()

  on.exit({
    if(!is.null(start.time))
      cat(sprintf("\nRun time: %s",timetaken(start.time)),fill = TRUE)# END LOG
  })

  ######################
  ######################

  pingServer(server)

  #wb <- createWorkbook()
  output <- data.table()

  cat("Fetching variant information ...", fill = TRUE)

  for(i in 1:length(rslist))
  {
    rs= rslist[i]

    # get variant info + variants in high LD
    varInfo <- tryCatch(
      {
        inSilicoSeqPipeline(rs, server, db , window_size , r2 , i, length(rslist), LDlist, get.gene.info.from.API)
      },
      error = function(err) {
        message(paste0("Error in fetching variant information: ", err$message))
        NULL
      }
    )

    # rsid is checked vs synonym id
    if(!is.null(varInfo$LDList))
      varInfo$LDList <- checkReturnedVariantData(varInfo)

    if(!is.null(varInfo))
    {

      tab <- returnVariantDatatable(i, varInfo,db)

      # fill missing genes
      if(!is.null(g.set))
      {
        cat("Checking gene information ...", fill = TRUE)

        tab[Gene=='',
            c('GeneId','Gene') := find.nearest.gene(Chr,Pos_37,g.set),
            by=list(Chr,Pos_37)]

        tab[Cytoband=='',
            Cytoband := find.band(Chr,Pos_37,c.set),
            by=list(Chr,Pos_37)]
      }
      # add regulatory features

      if(!is.null(r.set))
      {

        cat("Checking regulatory feature information ...", fill = TRUE)

        tab[,c('RegId','RegType') := find.regulatory(Chr,Pos_37,r.set),
            by=list(Chr,Pos_37)]
      }

      ## check if this variants is already present in the output
      ## maybe the provided variants are not independent with each other
      checkifOutputIncludesRS(rs,output)

      output <- rbind(output,tab)

      # add a sheet for each variant
      # appendXLSXfile(tab,sheetName = rs,fileName = outputPath)
    }
    else
    {
      cat("Variant not found.\n")
    }
  }
  cat("done\n", fill = TRUE)


  if(cadd & nrow(output) > 0)
  {
    # cat("=============================================\n")
    cat('Fetching CADD scores ... ', fill = TRUE)
    # cat("=============================================\n")

    ## for progress bar 2
    pb <- progress::progress_bar$new(format = "[:bar] :current/:total (:percent)", total = length(output$Linked_SNP))
    pb$tick(0)
    ##

    conseq.dt <- tryCatch(
      {
        if(cores == 0)
          getConseqOnList(output$Linked_SNP,server,pb)
        else
          getConseqOnList.parallel(output$Linked_SNP,server,cores)
      },
      error =function(err)
      {
        message(paste0("Error in fetching CADD information: ",err$message))
        NULL
      }
    )

    if(!is.null(conseq.dt))
    {
     # output = merge(x=output,y=conseq.dt,by.x = 'Linked_SNP',by.y = 'rs',all.x = TRUE,sort = FALSE)
      output[is.element(Linked_SNP,conseq.dt$rs) , cadd := conseq.dt[conseq.dt$rs == Linked_SNP]$cadd ]

      if(any(grepl(x = output$cadd,pattern = ',')))
        output[grepl(x = cadd,pattern = ','), cadd := select.CADD.scores(Alleles,cadd),by = Linked_SNP]

      # convert scores to percentile
      output[,Deleteriousness := convert.CADD(cadd), by =list(gSNP,Linked_SNP)]
    }

  }



  # save excel file
  if(nrow(output) > 0)
    appendXLSXfile(output,sheetName = 'All Variants',fileName = outputPath, addFirst = FALSE)


  # save html file
  if(rmarkdown::pandoc_available())
  {
    if(nrow(output) > 0 & LDlist== TRUE)
    {
      html.file = gsub(x = outputPath, pattern = 'xlsx',replacement = 'html')

      if(generate.report.file(output, html.file,server,db,window_size,r2))
        cat(sprintf('File saved: %s',html.file),fill = TRUE)
    }
  }else {
    cat('Pandoc is not available. Skipping HTML report.',fill = TRUE)
  }



  invisible(output)
}

Any scripts or data that you put into this service are public.

SNPannotator documentation built on Jan. 12, 2023, 5:15 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

SNPannotator
Investigating the Functional Characteristics of Selected SNPs and Their Vicinity Genomic Region

R/runPipelineOnlist.R
In SNPannotator: Investigating the Functional Characteristics of Selected SNPs and Their Vicinity Genomic Region

Defines functions annotate

Documented in annotate

Try the SNPannotator package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SNPannotator Investigating the Functional Characteristics of Selected SNPs and Their Vicinity Genomic Region

R/runPipelineOnlist.R In SNPannotator: Investigating the Functional Characteristics of Selected SNPs and Their Vicinity Genomic Region

Defines functions annotate

Documented in annotate

Try the SNPannotator package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SNPannotator
Investigating the Functional Characteristics of Selected SNPs and Their Vicinity Genomic Region

R/runPipelineOnlist.R
In SNPannotator: Investigating the Functional Characteristics of Selected SNPs and Their Vicinity Genomic Region