rsnps: Get 'SNP' ('Single-Nucleotide' 'Polymorphism') Data on the Web

Documented in get_frequency get_gene_names get_placements ncbi_snp_query

#' Internal function to get the position, alleles, assembly, hgvs notation
#'
#' @param primary_info refsnp entry read in JSON format
#'
get_placements <- function(primary_info) {
  for (record in primary_info$primary_snapshot_data$placements_with_allele) {
    if (record$is_ptlp & length(record$placement_annot$seq_id_traits_by_assembly) > 0) {
      assembly_name <- record$placement_annot$seq_id_traits_by_assembly[[1]]$assembly_name
      chrom_name <- gsub("NC_[0]+([1-9]+[0-9]?)\\.[[:digit:]]+", "\\1", record$seq_id)
      df_list <- lapply(
        record$alleles,
        function(one_allele) {
          if (one_allele$allele$spdi$deleted_sequence != one_allele$allele$spdi$inserted_sequence) {
            alleles <- paste(one_allele$allele$spdi$deleted_sequence,
                             one_allele$allele$spdi$inserted_sequence,
                             sep = ","
            )
            if (one_allele$allele$spdi$inserted_sequence == "") {
              VariationAllele <- paste0(
                "del",
                one_allele$allele$spdi$deleted_sequence
              )
              alleles <- paste(alleles, VariationAllele)
            } else {
              VariationAllele <- one_allele$allele$spdi$inserted_sequence
            }
            
            df_snp <- data.frame(
              Alleles = alleles,
              AncestralAllele = one_allele$allele$spdi$deleted_sequence,
              VariationAllele = VariationAllele,
              BP = one_allele$allele$spdi$position + 1,
              seqname = one_allele$allele$spdi$seq_id,
              hgvs = one_allele$hgvs,
              assembly = assembly_name,
              Chromosome = chrom_name,
              stringsAsFactors = FALSE
            )
            return(df_snp)
          }
        }
      )
      
      ## remove the NULLs
      df_list <- df_list[lengths(df_list) != 0]
      
      ## if more than one we need to merge the ancestral and variation alleles
      df_all <- do.call(rbind, df_list)
      new_df <- stats::aggregate(. ~ BP,
                                 data = df_all,
                                 FUN = function(x) {
                                   paste(unique(x[x != ""]),
                                         collapse = ","
                                   )
                                 }
      )
      new_df$Alleles <- paste(unique(unlist(strsplit(new_df$Alleles, ","))),
                              collapse = ","
      )
      return(new_df)
    }
  }
}


#' Internal function to get the frequency of the variants from
#' different studies.
#'
#' @param Class What kind of variant is the rsid. Accepted options are "snv", "snp" and "delins".
#' @param primary_info refsnp entry read in JSON format
#'
get_frequency <- function(Class, primary_info) {
  if (Class %in% c("snv", "snp", "delins")) {
    df_freq <- NULL
    for (record in primary_info$primary_snapshot_data$allele_annotations) {
      for (freq_record in record$frequency) {
        if (freq_record$observation$deleted_sequence != freq_record$observation$inserted_sequence) {
          if (freq_record$observation$inserted_sequence == "") {
            MAF <- freq_record$allele_count / freq_record$total_count
            df_freq_study <- data.frame(
              study = freq_record$study_name,
              ref_seq = freq_record$observation$deleted_sequence,
              Minor = paste0("del", freq_record$observation$deleted),
              MAF = MAF,
              stringsAsFactors = FALSE
            )
          } else if (freq_record$observation$deleted_sequence == "") {
            MAF <- freq_record$allele_count / freq_record$total_count
            df_freq_study <- data.frame(
              study = freq_record$study_name,
              ref_seq = freq_record$observation$deleted_sequence,
              Minor = paste0("dup", freq_record$observation$inserted_sequence),
              MAF = MAF,
              stringsAsFactors = FALSE
            )
          } else {
            MAF <- freq_record$allele_count / freq_record$total_count
            df_freq_study <- data.frame(
              study = freq_record$study_name,
              ref_seq = freq_record$observation$deleted_sequence,
              Minor = freq_record$observation$inserted_sequence,
              MAF = MAF,
              stringsAsFactors = FALSE
            )
          }
          df_freq <- rbind(df_freq, df_freq_study)
        }
      }
    }
    
    if (!is.null(df_freq)) {
      return(df_freq)
    } else {
      df_freq <- data.frame(
        study = "",
        ref_seq = NA,
        Minor = NA,
        MAF = NA,
        stringsAsFactors = FALSE
      )
    }
  } else {
    df_freq <- data.frame(
      study = "",
      ref_seq = NA,
      Minor = NA,
      MAF = NA,
      stringsAsFactors = FALSE
    )
  }
  
  ## sort study names so studies grouped together
  df_freq <- df_freq[order(df_freq$study), ]
}

#' Internal function to get gene names.
#'
#' If multiple gene names are encountered they are collapsed with a
#' "/".
#' @param primary_info refsnp entry read in JSON format
#'
get_gene_names <- function(primary_info) {
  gene_list <- c()
  for (record in primary_info$primary_snapshot_data$allele_annotations) {
    for (annotations in record$assembly_annotation) {
      for (genes in annotations$genes) {
        gene <- genes$locus
        gene_list <- c(gene_list, gene)
      }
    }
  }
  gene_list <- paste(unique(gene_list), collapse = "/")
  return(gene_list)
}


#' Query NCBI's refSNP for information on a set of SNPs via the API
#'
#' This function queries NCBI's refSNP for information related to the latest
#' dbSNP build and latest reference genome for information on the vector
#' of snps submitted.
#'
#' This function currently pulling data for Assembly 38 - in particular
#' note that if you think the BP position is wrong, that you may be
#' hoping for the BP position for a different Assembly.
#'
#' @export
#' @param snps (character) A vector of SNPs (rs numbers).
#' @return A dataframe with columns:
#'
#' - query: The rs ID that was queried.
#' - chromosome: The chromosome that the marker lies on.
#' - bp: The chromosomal position, in base pairs, of the marker,
#' as aligned with the current genome used by dbSNP. we add 1 to the base
#' pair position in the BP column in the output data.frame to agree with
#' what the dbSNP website has.
#' - rsid: Reference SNP cluster ID. If the rs ID queried
#' has been merged, the up-to-date name of the ID is returned here, and
#' a warning is issued.
#' - class: The rsid's 'class'. See
#' <https://www.ncbi.nlm.nih.gov/projects/SNP/snp_legend.cgi?legend=snpClass>
#' for more details.
#' - gene: If the rsid lies within a gene (either within the exon
#' or introns of a gene), the name of that gene is returned here; otherwise,
#' `NA`. Note that
#' the gene may not be returned if the rsid lies too far upstream or downstream
#' of the particular gene of interest.
#' - alleles: The alleles associated with the SNP if it is a
#' SNV; otherwise, if it is an INDEL, microsatellite, or other kind of
#' polymorphism the relevant information will be available here.
#' - minor: The allele for which the MAF is computed,
#' given it is an SNV; otherwise, `NA`.
#' - maf: The minor allele frequency of the SNP, given it is an SNV.
#' This is drawn from the current global reference population used by NCBI (GnomAD).
#' - ancestral_allele: allele as described in the current assembly
#' - variation_allele: difference to the current assembly
#' - seqname - Chromosome RefSeq reference.
#' - hgvs -  full hgvs notation for variant
#' - assembly - which assembly was used for the annotations
#' - ref_seq - sequence in reference assembly
#' - maf_population - dataframe of all minor allele frequencies reported, with columns study,
#' reference allele, alternative allele (minor) and minor allele frequency.
#'
#'
#' @references <https://www.ncbi.nlm.nih.gov/projects/SNP/>
#' @references <https://pubmed.ncbi.nlm.nih.gov/31738401/> SPDI model
#'
#' @details Note that you are limited in the to a max of one query per second
#' and concurrent queries are not allowed.
#' If users want to set curl options when querying for the SNPs they can do so by using
#'  httr::set_config/httr::with_config
#'
#' @examples \dontrun{
#' ## an example with both merged SNPs, non-SNV SNPs, regular SNPs,
#' ## SNPs not found, microsatellite
#' SNPs <- c("rs332", "rs420358", "rs1837253", "rs1209415715", "rs111068718")
#' ncbi_snp_query(SNPs)
#' # ncbi_snp_query("123456") ##invalid: must prefix with 'rs'
#' ncbi_snp_query("rs420358")
#' ncbi_snp_query("rs332") # warning that its merged into another, try that
#' ncbi_snp_query("rs121909001")
#' ncbi_snp_query("rs1837253")
#' ncbi_snp_query("rs1209415715")
#' ncbi_snp_query("rs111068718")
#' ncbi_snp_query(snps = "rs9970807")
#'
#' ncbi_snp_query("rs121909001")
#' ncbi_snp_query("rs121909001", verbose = TRUE)
#' }
ncbi_snp_query <- function(snps) {
  
  ## NCBI moved to https but not using http v.2. The setting of the version
  ## used with curl is based on
  ##  https://github.com/ropensci/rentrez/issues/127#issuecomment-488838967
  ## in the rentrez package
  httr::set_config(httr::config(http_version = 2)) ## value 2 corresponds to CURL_HTTP_VERSION_1_1
  
  ## ensure these are rs numbers of the form rs[0-9]+
  tmp <- sapply(snps, function(x) {
    grep("^rs[0-9]+$", x)
  })
  if (any(sapply(tmp, length) == 0)) {
    stop("not all items supplied are prefixed with 'rs';\n",
         "you must supply rs numbers and they should be prefixed with ",
         "'rs', e.g. rs420358",
         call. = FALSE
    )
  }
  
  message(paste0(
    "Getting info about the following rsIDs: ",
    paste(snps,
          collapse = ", "
    )
  ))
  ## transform all SNPs into numbers (rsid)
  snps_num <- gsub("rs", "", snps)
  
  out <- as.data.frame(matrix(NA, nrow = length(snps_num), ncol = 15))
  names(out) <- c("query", "chromosome", "bp", "class", "rsid", "gene", "alleles", "ancestral_allele", "variation_allele", "seqname", "hgvs", "assembly", "ref_seq", "minor", "maf")
  
  out_maf <- list(NULL)
  
  ## as far as I understand from https://api.ncbi.nlm.nih.gov/variation/v0/#/RefSNP/ we
  ## can only send one query at a time and max 1 per second.
  for (i in seq_along(snps_num)) {
    variant.url <- paste0("https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/", snps_num[i])
    variant.response <- httr::GET(variant.url)
    
    ## handle sporadic status 500 from NCBI that have
    ## been occurring starting in Dec 2022.
    if (variant.response$status_code == 500){
      ## try to redo the call
      variant.response <- httr::GET(variant.url)
    }
    
    variant.response.content <- jsonlite::fromJSON(rawToChar(variant.response$content),
                                                   simplifyVector = FALSE
    )
    
    if ("error" %in% names(variant.response.content)) {
      if (variant.response.content$error$message == "RefSNP not found") {
        warning("The following rsId had no information available on NCBI:\n  ",
                paste0("rs", snps_num[i]),
                call. = FALSE
        )
        next()
      } else {
        warning("The following error was received from NCBI:\n  ",
                variant.response.content$error$message,
                call. = FALSE
        )
        next()
      }
    }
    
    if ("withdrawn_snapshot_data" %in% names(variant.response.content) & length(variant.response.content$present_obs_movements) == 0) {
      warning("The following rsId has been withdrawn from NCBI:\n  ",
              paste0("rs", snps_num[i]),
              call. = FALSE
      )
      next()
    }
    
    if ("unsupported_snapshot_data" %in% names(variant.response.content) & length(variant.response.content$present_obs_movements) == 0) {
      warning("The following rsId no longer has any supporting observations in NCBI:\n  ",
              paste0("rs", snps_num[i]),
              call. = FALSE
      )
      next()
    }
    
    Query <- as.character(paste0("rs", variant.response.content$refsnp_id))
    
    ## if merged into another id
    if (is.null(variant.response.content$primary_snapshot_data)) {
      rsid <- as.character(paste0("rs", variant.response.content$merged_snapshot_data$merged_into))
      no_rsid <- as.character(variant.response.content$merged_snapshot_data$merged_into)
      
      warning(Query, " has been merged into ", rsid, call. = FALSE)
      
      variant.url <- paste0("https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/", no_rsid)
      variant.response <- httr::GET(variant.url)
      variant.response.content <- jsonlite::fromJSON(rawToChar(variant.response$content),
                                                     simplifyVector = FALSE
      )
      
      ## if merged into a second snp
      if (is.null(variant.response.content$primary_snapshot_data)) {
        rsid2 <- as.character(paste0("rs", variant.response.content$merged_snapshot_data$merged_into))
        no_rsid2 <- as.character(variant.response.content$merged_snapshot_data$merged_into)
        
        warning(Query, " has been merged into ", rsid, " which has been merged into ", rsid2, call. = FALSE)
        
        variant.url <- paste0("https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/", no_rsid2)
        variant.response <- httr::GET(variant.url)
        variant.response.content <- jsonlite::fromJSON(rawToChar(variant.response$content),
                                                       simplifyVector = FALSE
        )
        
      } else {
        rsid <- as.character(paste0(
          "rs",
          variant.response.content$refsnp_id
        ))
      }
      
    } else {
      rsid <- as.character(paste0(
        "rs",
        variant.response.content$refsnp_id
      ))
    }
    
    Class <- as.character(variant.response.content$primary_snapshot_data$variant_type)
    placement_SNP <- get_placements(variant.response.content)
    
    ## frequency of minor allele for all studies
    frequency_SNP <- get_frequency(Class, variant.response.content)
    Gene <- get_gene_names(variant.response.content)
    
    out[i, ] <- c(
      Query,
      placement_SNP$Chromosome,
      placement_SNP$BP,
      Class,
      rsid,
      Gene,
      placement_SNP$Alleles,
      placement_SNP$AncestralAllele,
      placement_SNP$VariationAllele,
      placement_SNP$seqname,
      placement_SNP$hgvs,
      placement_SNP$assembly,
      # if GnomAD maf available, pick that
      ifelse(any(frequency_SNP$study == "GnomAD"), frequency_SNP$ref_seq[frequency_SNP$study == "GnomAD"], NA),
      ifelse(any(frequency_SNP$study == "GnomAD"), frequency_SNP$Minor[frequency_SNP$study == "GnomAD"], NA),
      ifelse(any(frequency_SNP$study == "GnomAD"), frequency_SNP$MAF[frequency_SNP$study == "GnomAD"], NA)
    )
    out_maf[[i]] <- frequency_SNP
  }
  Sys.sleep(1)
  
  ## remove missing rsnumbers
  ind <- which(!is.na(out$query))
  out <- out[ind, ]
  out_maf <- out_maf[ind]
  
  ## ensure maf and bp are numeric
  for (nm in c("maf", "bp")) {
    out[, nm] <- as.numeric(out[, nm])
  }
  
  ## adding maf_population
  out <- tibble::tibble(out, maf_population = out_maf)
  
  return(out)
}