R/locus.from.rsid.R

Defines functions variants.from.file snps.from.file formatVcfOut unlistColumn strSort change.to.search.genome determine.allele.from.ambiguous snps.from.rsid

Documented in snps.from.file snps.from.rsid

#' Import SNPs from rsid for use in motifbreakR
#'
#' @param rsid Character; a character vector of rsid values from dbSNP
#' @param dbSNP an object of class SNPlocs to lookup rsids; see \code{availible.SNPs} in
#'   \code{\link[BSgenome]{injectSNPs}} to check for availible SNPlocs
#' @param search.genome an object of class BSgenome for the species you are interrogating;
#'  see \code{\link[BSgenome]{available.genomes}} for a list of species
#' @seealso See \code{\link{motifbreakR}} for analysis; See \code{\link{snps.from.file}}
#'   for an alternate method for generating a list of variants.
#' @details \code{snps.from.rsid} take an rsid, or character vector of rsids and
#'  generates the required object to input into \code{motifbreakR}
#' @return a GRanges object containing:
#'  \item{SNP_id}{The rsid of the snp with the "rs" portion stripped}
#'  \item{alleles_as_ambig}{THE IUPAC ambiguity code between the reference and
#'  alternate allele for this SNP}
#'  \item{REF}{The reference allele for the SNP}
#'  \item{ALT}{The alternate allele for the SNP}
#' @examples
#'  library(BSgenome.Hsapiens.UCSC.hg19)
#'  library(SNPlocs.Hsapiens.dbSNP142.GRCh37)
#'  snps.file <- system.file("extdata", "pca.enhancer.snps", package = "motifbreakR")
#'  snps <- as.character(read.table(snps.file)[,1])
#'  snps.mb <- snps.from.rsid(snps[1],
#'                            dbSNP = SNPlocs.Hsapiens.dbSNP142.GRCh37,
#'                            search.genome = BSgenome.Hsapiens.UCSC.hg19)
#'
#' @importFrom BSgenome snpsById snplocs
#' @importFrom Biostrings DNAStringSet DNAString DNAStringSetList
#' @export
snps.from.rsid <- function(rsid = NULL, dbSNP = NULL,
                           search.genome = NULL) {
  if (is.null(rsid)) {
    stop("no RefSNP IDs have been found, please include RefSNP ID numbers")
  }
  if (!inherits(dbSNP, "SNPlocs")) {
    stop(paste0("dbSNP argument was not provided with a valid SNPlocs object.\n",
                "Please run availible.SNPs() to check for availible SNPlocs"))
  }
  if (!inherits(search.genome, "BSgenome")) {
    stop(paste0("search.genome argument was not provided with a valid BSgenome object.\n",
                "Run availible.genomes() and choose the appropriate BSgenome object"))
  }
  if (all(!grepl("rs", rsid))) {
    bad.names <- rsid[!grepl("rs", rsid)]
    stop(paste(paste(bad.names, collapse = " "), "are not rsids, perhaps you want to import your snps from a bed or vcf file with snps.from.file()?"))
  }
  rsid <- unique(rsid)
  rsid.grange <- as(snpsById(dbSNP, rsid, ifnotfound = "warning"), "GRanges")
  rsid.grange <- change.to.search.genome(rsid.grange, search.genome)
  rsid.grange <- GRanges(rsid.grange)
  rsid.refseq <- getSeq(search.genome, rsid.grange)
  rsid.grange$UCSC.reference <- as.character(rsid.refseq)
  rsid.grange <- sapply(split(rsid.grange, rsid.grange$RefSNP_id), function(snp) {
    alt.allele <- determine.allele.from.ambiguous(snp$alleles_as_ambig, snp$UCSC.reference)
    if (length(alt.allele) > 1L) {
      snp <- do.call("c", replicate(length(alt.allele), snp))
      snp$UCSC.alternate <- alt.allele
      names(snp) <- paste(snp$RefSNP_id, alt.allele, sep = ":")
    } else {
      snp$UCSC.alternate <- alt.allele
      names(snp) <- snp$RefSNP_id
    }
    return(snp)
  })
  rsid.grange <- unlist(do.call("GRangesList", rsid.grange), use.names = FALSE)
  colnames(mcols(rsid.grange)) <- c("RefSNP_id", "alleles_as_ambig", "REF", "ALT")
  rsid.grange$REF <- DNAStringSet(rsid.grange$REF)
  rsid.grange$ALT <- DNAStringSet(rsid.grange$ALT)
  # rsid.grange$alleles_as_ambig <- DNAStringSet(rsid.grange$alleles_as_ambig)
  rsid.grange$alleles_as_ambig <- NULL
  colnames(mcols(rsid.grange))[1] <- "SNP_id"
  attributes(rsid.grange)$genome.package <- attributes(search.genome)$pkgname
  return(rsid.grange)
}

determine.allele.from.ambiguous <- function(ambiguous.allele, known.allele) {
  neucleotide.ambiguity.code <- list(Y = c("C", "T"), R = c("A", "G"), W = c("A", "T"),
                                     S = c("G", "C"), K = c("T", "G"), M = c("C", "A"),
                                     D = c("A", "G", "T"), V = c("A", "C", "G"),
                                     H = c("A", "C", "T"), B = c("C", "G", "T"),
                                     N = c("A", "C", "G", "T"))
  specnac <- neucleotide.ambiguity.code[[ambiguous.allele]]
  unknown.allele <- specnac[-grep(known.allele, specnac)]
  return(unknown.allele)
}

#' @import GenomeInfoDb
change.to.search.genome <- function(granges.object, search.genome) {
  sequence <- seqlevels(granges.object)
  ## sequence is in UCSC format and we want NCBI style
  newStyle <- mapSeqlevels(sequence,seqlevelsStyle(search.genome))
  newStyle <- newStyle[complete.cases(newStyle)] # removing NA cases.
  ## rename the seqlevels
  granges.object <- renameSeqlevels(granges.object,newStyle)
  seqlevels(granges.object) <- seqlevelsInUse(granges.object)
  seqinfo(granges.object) <- keepSeqlevels(seqinfo(search.genome),
                                           value = seqlevelsInUse(granges.object))
  return(granges.object)
}


strSort <- function(x) {
  sapply(lapply(strsplit(x, NULL), sort), paste, collapse = "")
}

unlistColumn <- function(x, column = NULL) {
  if (is.null(column)) {
    stop("select column to unlist from x")
  }
  if (is(mcols(x)[[column]], "DNAStringSet")) {
    mcols(x)[[column]] <- as.character(mcols(x)[[column]])
  }
  if (any(lengths(mcols(x)[[column]]) > 1)) {
    columnvals <- unlist(mcols(x)[[column]])
    numsplits <- lengths(mcols(x)[[column]])
    x <- rep(x, times = numsplits)
    mcols(x)[[column]] <- columnvals
    return(x)
  } else {
    return(x)
  }
}

formatVcfOut <- function(x, gseq = search.genome) {
  x$SNP_id <- names(x)
  mcols(x) <- mcols(x)[, c("SNP_id", "REF", "ALT")]
  x$REF <- unlist(DNAStringSetList(x$REF))
  x$ALT <- unlist(DNAStringSetList(x$ALT))
  x <- x[!grepl("MT", seqnames(x))]
  if (!any(grepl("chr", seqlevels(x)))) {
    seqlevels(x) <- paste0("chr", seqlevels(x))
  }
  x <- change.to.search.genome(x, gseq)
  can.ref <- getSeq(gseq, x)
  names(can.ref) <- NULL
  if (all(can.ref == x$REF)) {
    rm(can.ref)
  } else {
    warning(paste0("User selected reference allele differs from the sequence in ",
                   attributes(gseq)$pkgname, " continuing with genome specified",
                   " reference allels\n", "there are ", sum(x$REF != can.ref),
                   " differences"))
  }
  attributes(x)$genome.package <- attributes(gseq)$pkgname
  return(x)
}
#' Import SNPs from a BED file or VCF file for use in motifbreakR
#'
#' @param file Character; a character containing the path to a bed file or a vcf file
#'   see Details for a description of the required format
#' @param dbSNP OPTIONAL; an object of class SNPlocs to lookup rsids; see \code{availible.SNPs} in
#'   \code{\link[BSgenome]{injectSNPs}} to check for availible SNPlocs
#' @param search.genome an object of class BSgenome for the species you are interrogating;
#'  see \code{\link[BSgenome]{available.genomes}} for a list of species
#' @param format Character; one of \code{bed} or \code{vcf}
#' @seealso See \code{\link{motifbreakR}} for analysis; See \code{\link{snps.from.rsid}}
#'   for an alternate method for generating a list of variants.
#' @details \code{snps.from.file} takes a character vector describing the file path
#'  to a bed file that contains the necissary information to generate the input for
#'  \code{motifbreakR} see \url{http://www.genome.ucsc.edu/FAQ/FAQformat.html#format1}
#'  for a complete description of the BED format.  Our convention deviates in that there
#'  is a required format for the name field.  \code{name} is defined as chromosome:start:REF:ALT
#'  or the rsid from dbSNP (if you've included the optional SNPlocs argument).
#'  For example if you were to include rs123 in it's alternate
#'  format it would be entered as chr7:24966446:C:A
#' @return a GRanges object containing:
#'  \item{SNP_id}{The rsid of the snp with the "rs" portion stripped}
#'  \item{alleles_as_ambig}{THE IUPAC ambiguity code between the reference and
#'  alternate allele for this SNP}
#'  \item{REF}{The reference allele for the SNP}
#'  \item{ALT}{The alternate allele for the SNP}
#' @examples
#'  library(BSgenome.Drerio.UCSC.danRer7)
#'  library(SNPlocs.Hsapiens.dbSNP142.GRCh37)
#'  snps.bed.file <- system.file("extdata", "danRer.bed", package = "motifbreakR")
#'  # see the contents
#'  read.table(snps.bed.file, header = FALSE)
#'  #import the BED file
#'  snps.mb <- snps.from.file(snps.bed.file,
#'                            search.genome = BSgenome.Drerio.UCSC.danRer7,
#'                            format = "bed")
#'
#' @importFrom rtracklayer import
#' @importFrom Biostrings IUPAC_CODE_MAP uniqueLetters BStringSetList
#' @importFrom VariantAnnotation readVcf ref alt isSNV VcfFile ScanVcfParam
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom stringr str_sort str_split
#' @export
snps.from.file <- function(file = NULL, dbSNP = NULL, search.genome = NULL, format = "bed", indels = FALSE) {
  if (format == "vcf") {
    if (!inherits(search.genome, "BSgenome")) {
      stop(paste0(search.genome, " is not a BSgenome object.\n", "Run availible.genomes() and choose the appropriate BSgenome object"))
    }
    genome.name <- genome(search.genome)[[1]]
    vcfparam <- ScanVcfParam(info = NA, geno = NA)
    vcffile = open(VcfFile(file))
    vcf = readVcf(vcffile, genome = genome.name, param = vcfparam)
    close(vcffile)
    vcf_ranges <- rowRanges(vcf)
    vcf_ranges <- unlistColumn(vcf_ranges, "ALT")
    vcf_ranges <- unlistColumn(vcf_ranges, "REF")
    vcf_ranges$index <- seq_along(vcf_ranges)
    complex.variants <- vcf_ranges[nchar(vcf_ranges$REF) > 1 | nchar(vcf_ranges$ALT) > 1]
    snps <- vcf_ranges[!(nchar(vcf_ranges$REF) > 1 | nchar(vcf_ranges$ALT) > 1)]
    if (indels) {
      if (length(complex.variants) > 0) {
        alt_letters <- uniqueLetters(unlist(BStringSetList(complex.variants$ALT)))
        ref_letters <- uniqueLetters(unlist(BStringSetList(complex.variants$REF)))
        alt_letters_remove <- alt_letters[!alt_letters %in% DNA_ALPHABET]
        ref_letters_remove <- ref_letters[!ref_letters %in% DNA_ALPHABET]
        if (length(alt_letters_remove) > 0) {
          search.pattern <- paste0(alt_letters_remove, collapse = "|")
          drop.variants.alt <- vapply(complex.variants$ALT,
                                      function(x,
                                               remove_letters = search.pattern) {
                                        any(grepl(remove_letters, x))
                                      }, logical(1))
        } else {
          drop.variants.alt <- as.logical(rep.int(0, length(complex.variants)))
        }
        if (length(ref_letters_remove) > 0) {
          search.pattern <- paste0(ref_letters_remove, collapse = "|")
          drop.variants.ref <- vapply(complex.variants$REF,
                                      function(x,
                                               remove_letters = search.pattern) {
                                        any(grepl(remove_letters, x))
                                      }, logical(1))
        } else {
          drop.variants.ref <- as.logical(rep.int(0, length(complex.variants)))
        }
        complex.variants <- complex.variants[!(drop.variants.alt | drop.variants.ref)]
        complex.variants <- unlistColumn(complex.variants, "ALT")
        complex.variants <- unlistColumn(complex.variants, "REF")
      }
      all.variants <- c(complex.variants, snps)
      all.variants <- formatVcfOut(all.variants[order(all.variants$index), ], search.genome)
      return(all.variants)
    } else {
      snps <- formatVcfOut(snps, search.genome)
      return(snps)
    }
  } else {
    if (format == "bed") {
      snps <- import(file, format = "bed")
      if (!indels) {
        if (any(grepl("rs", snps$name)) & (!inherits(dbSNP, "SNPlocs"))) {
          stop(paste0(file, " contains at least one variant with an rsID and no SNPlocs has been indicated\n",
                      "Please run availible.SNPs() to check for availble SNPlocs"))
        }
      } else if (inherits(dbSNP, "SNPlocs")) {
        warning("Variants are not compared to nor extracted from SNPlocs objects when indels are included.",
                " SNPlocs will not be used.")
      }
      if (!inherits(search.genome, "BSgenome")) {
        stop(paste0(search.genome, " is not a BSgenome object.\n", "Run availible.genomes() and choose the appropriate BSgenome object"))
      }
      ## spit snps into named and unnamed snps
      snps.noid <- snps[!grepl("rs", snps$name), ]
      ## get ref for unnamed snps
      snps.ref <- getSeq(search.genome, snps)
      snps.ref <- as.character(snps.ref)
      ## get alt for unnamed snps
      snps.alt <- snps$name
      snps.alt <- unlist(lapply(snps.alt, strsplit, split = ":"), recursive = FALSE)
      snps.ref.user <- sapply(snps.alt, "[", 3)
      if (isTRUE(all.equal(snps.ref, snps.ref.user))) {
        rm(snps.ref.user)
      } else {
        warning(paste0("User selected reference allele differs from the sequence in ",
                       attributes(search.genome)$pkgname, " continuing with genome specified",
                       " reference allels\n", " there are ", sum(snps.ref != snps.ref.user),
                       " differences"))
      }
      snps.alt <- sapply(snps.alt, "[", 4)
      snps.alt.split <- str_split(snps.alt, ",")
      rep.vars <- vapply(snps.alt.split, length, integer(1))
      snps <- rep(snps, rep.vars)
      snps.ref <- rep(snps.ref, rep.vars)
      snps.alt <- unlist(snps.alt.split)
      is.indel <- nchar(snps.alt) > 1 | nchar(snps.ref) > 1
      if (!indels) {
        snps <- snps[!is.indel]
        snps.ref <- snps.ref[!is.indel]
        snps.alt <- snps.alt[!is.indel]
      }
      alt.letters <- uniqueLetters(unlist(BStringSetList(snps.alt)))
      alt_letters_remove <- alt.letters[!alt.letters %in% DNA_ALPHABET]
      if (length(alt_letters_remove) > 0) {
        search.pattern <- paste0(alt_letters_remove, collapse = "|")
        drop.variants.alt <- vapply(complex.variants$ALT,
                                    function(x,
                                             remove_letters = search.pattern) {
                                      any(grepl(remove_letters, x))
                                    }, logical(1))
      } else {
        drop.variants.alt <- as.logical(rep.int(0, length(snps.alt)))
      }
      ## check if alt was given for unnamed snps
      alt.allele.is.valid <- !drop.variants.alt & (snps.alt != snps.ref)
      if (!all(alt.allele.is.valid)) {
        snpnames <- snps$name[drop.variants.alt]
        if (length(snpnames) < 50 && length(snpnames) > 0) {
          warning(paste("User variant", snpnames, "alternate allele is not one of \"A\", \"T\", \"G\", or \"C\""))
        } else {
          if (length(snpnames) > 0) {
            warning(paste0(length(snpnames), " user variants contain an alternate allele that is not one of \"A\", \"T\", \"G\", \"C\"\n",
                           " These variants were excluded"))
          }
        }
        equal.to.ref <- snps.alt == snps.ref
        if (sum(equal.to.ref) > 0) {
          warning(paste0(sum(equal.to.ref), " user variants are the same as the reference genome ",
                         search.genome@provider_version, " for ", search.genome@common_name, "\n These variants were excluded"))
        }
        snps <- snps[alt.allele.is.valid]
        snps.ref <- snps.ref[alt.allele.is.valid]
        snps.alt <- snps.alt[alt.allele.is.valid]
      }
      snps$REF <- snps.ref
      snps$ALT <- snps.alt
      strand(snps) <- "*"
      names(snps) <- paste(as.character(snps), snps.ref, snps.alt, sep = ":")
      snps <- formatVcfOut(snps, search.genome)
      if (!indels) {
        snps.rsid <- snps[grepl("rs", snps$SNP_id), ]
        snps.noid <- snps[!grepl("rs", snps$SNP_id), ]
      ## get object for named snps
        if (length(snps.rsid) > 0) {
          snps.rsid.out <- snps.from.rsid(snps.rsid$name, dbSNP = dbSNP, search.genome = search.genome)
          colnames(mcols(snps.rsid.out))[1] <- "SNP_id"
          names(snps.rsid.out) <- snps.rsid.out$SNP_id
          snps.out <- c(snps.rsid.out, snps.noid)
        } else {
          snps.out <- snps.noid
        }
      } else {
        snps.out <- snps
      }
      attributes(snps.out)$genome.package <- attributes(search.genome)$pkgname
      return(snps.out)
    } else {
      stop("format must be one of 'vcf' or 'bed'; currently set as ", format)
    }
  }
}

#' @export
variants.from.file <- function(file = NULL, dbSNP = NULL, search.genome = NULL, format = "bed") {
  return(snps.from.file(file = file, dbSNP = dbSNP, search.genome = search.genome, format = format, indels = TRUE))
}

Try the motifbreakR package in your browser

Any scripts or data that you put into this service are public.

motifbreakR documentation built on Nov. 8, 2020, 5:31 p.m.