Nothing
#' Search for SNPs in Linkage Disequilibrium with a set of SNPs
#'
#' This function queries the SNP Annotation and Proxy tool (SNAP) for SNPs
#' in high linkage disequilibrium with a set of SNPs, and also merges in
#' up-to-date SNP annotation information available from NCBI.
#'
#' For more details, please see
#' \url{http://archive.broadinstitute.org/mpg/snap/ldsearch.php}.
#'
#' Information on the HapMap populations:
#' \url{https://catalog.coriell.org/0/Sections/Collections/NHGRI/hapmap.aspx?PgId=266&coll=HG}
#'
#' Information on the 1000 Genomes populations:
#' \url{http://www.internationalgenome.org/category/population}
#'
#' @note \code{ld_search} is a synonym of \code{LDSearch} - we'll make
#' \code{LDSearch} defunct in the next version
#'
#' @export
#' @param SNPs A vector of SNPs (rs numbers).
#' @param dataset The dataset to query. Must be one of: \itemize{
#' \item{\code{rel21: }}{HapMap Release 21}
#' \item{\code{rel22: }}{HapMap Release 22}
#' \item{\code{hapmap3r2: }}{HapMap 3 (release 2)}
#' \item{\code{onekgpilot: }}{1000 Genomes Pilot 1}
#' }
#' @param panel The panel to use from the queried data set.
#' Must be one of: \itemize{
#' \item{\code{CEU}}
#' \item{\code{YRI}}
#' \item{\code{JPT+CHB}}
#' }
#' If you are working with \code{hapmap3r2}, you can choose
#' the additional panels: \itemize{
#' \item{\code{ASW}}
#' \item{\code{CHD}}
#' \item{\code{GIH}}
#' \item{\code{LWK}}
#' \item{\code{MEK}}
#' \item{\code{MKK}}
#' \item{\code{TSI}}
#' \item{\code{CEU+TSI}}
#' \item{\code{JPT+CHB+CHD}}
#' }
#' @param RSquaredLimit The R Squared limit to specify as a filter for returned
#' SNPs; that is, only SNP pairs with R-squared greater than \code{RSquaredLimit}
#' will be returned.
#' @param distanceLimit The distance (in kilobases) upstream and downstream
#' to search for SNPs in LD with each set of SNPs.
#' @param GeneCruiser boolean; if \code{TRUE} we attempt to get gene info through
#' GeneCruiser for each SNP. This can slow the query down substantially.
#' @param quiet boolean; if \code{TRUE} progress updates are written to the
#' console.
#' @param ... Curl options passed on to \code{\link[httr]{GET}}
#' @return A list of data frames, one for each SNP queried, containing
#' information about the SNPs found to be in LD with that SNP. A description
#' of the columns follows:
#' \itemize{
#' \item{\code{Proxy:} The proxy SNP matched to the queried SNP.}
#' \item{\code{SNP:} The SNP queried.}
#' \item{\code{Distance:} The distance, in base pairs, between the queried SNP
#' and the proxy SNP. This distance is calculated according to up-to-date position
#' information returned from NCBI.}
#' \item{\code{RSquared:} The measure of LD between the SNP and the proxy.}
#' \item{\code{DPrime:} Another measure of LD between the SNP and the proxy.}
#' \item{\code{GeneVariant:} Present if \code{GeneCruiser} is \code{TRUE}.
#' This will identify where the SNP lies relative to its 'parent' SNP.}
#' \item{\code{GeneName:} Present if \code{GeneCruiser} is \code{TRUE}.
#' If the proxy SNP found lies within a gene, the name of that
#' gene will be returned here. Otherwise, the field is \code{N/A}.}
#' \item{\code{GeneDescription:} Present if \code{GeneCruiser} is \code{TRUE}. If
#' the proxy SNP lies within a gene, information about that gene (as
#' obtained from GeneCruiser) will be available here.}
#' \item \code{Major:} The major allele, as reported by SNAP.
#' \item \code{Minor:} The minor allele, as reported by SNAP.
#' \item{\code{MAF:} The minor allele frequency corresponding to the reference
#' panel queried, as obtained through SNAP.}
#' \item{\code{NObserved:} The number of individuals from which the MAF
#' information is generated, for column \code{MAF}.}
#' \item \code{Chromosome_NCBI:} The chromosome that the marker lies on.
#' \item \code{Marker_NCBI:} The name of the marker. If the rs ID queried
#' has been merged, the up-to-date name of the marker is returned here, and
#' a warning is issued.
#' \item \code{Class_NCBI:} The marker's 'class'. See
#' \url{https://www.ncbi.nlm.nih.gov/projects/SNP/snp_legend.cgi?legend=snpClass}
#' for more details.
#' \item \code{Gene_NCBI:} If the marker lies within a gene (either within the exon
#' or introns of a gene), the name of that gene is returned here; otherwise,
#' \code{NA}. Note that
#' the gene may not be returned if the marker lies too far upstream or downstream
#' of the particular gene of interest.
#' \item \code{Alleles_NCBI:} The alleles associated with the SNP if it is a
#' SNV; otherwise, if it is an INDEL, microsatellite, or other kind of
#' polymorphism the relevant information will be available here.
#' \item \code{Major_NCBI:} The major allele of the SNP, on the forward strand,
#' given it is an SNV; otherwise, \code{NA}.
#' \item \code{Minor_NCBI:} The minor allele of the SNP, on the forward strand,
#' given it is an SNV; otherwise, \code{NA}.
#' \item \code{MAF_NCBI:} The minor allele frequency of the SNP, given it is an SNV.
#' This is drawn from the current global reference population used by NCBI.
#' \item \code{BP_NCBI:} The chromosomal position, in base pairs, of the marker,
#' as aligned with the current genome used by dbSNP.
#' }
#' @examples \dontrun{
#' ld_search("rs420358")
#' ld_search('rs2836443')
#' ld_search('rs113196607')
#' }
LDSearch <- function(SNPs,
dataset="onekgpilot",
panel="CEU",
RSquaredLimit=0.8,
distanceLimit=500,
GeneCruiser=TRUE,
quiet=FALSE, add_ncbi=TRUE, ...) {
if (grepl("LDSearch", paste(deparse(sys.call()), collapse = ""))) {
.Deprecated("ld_search", package = "rsnps",
"use ld_search instead - LDSearch removed in next version")
}
## ensure these are rs numbers of the form rs[0-9]+
tmp <- sapply( SNPs, function(x) { grep( "^rs[0-9]+$", x) } )
if ( any( sapply( tmp, length ) == 0 ) ) {
stop("not all items supplied are prefixed with 'rs';\n",
"you must supply rs numbers and they should be prefixed with ",
"'rs', e.g. rs420358", call. = FALSE)
}
## RSquaredLimit
if ( RSquaredLimit < 0 || RSquaredLimit > 1 ) {
stop("RSquaredLimit must be between 0 and 1", call. = FALSE)
}
## distanceLimit
if ( is.character(distanceLimit) ) {
n <- nchar(distanceLimit)
stopifnot( substring( distanceLimit, n - 1, n ) == "kb" )
distanceLimit <- as.integer( gsub("kb", "", distanceLimit) )
}
valid_distances <- c(0, 10, 25, 50, 100, 250, 500)
if ( !(distanceLimit %in% valid_distances) ) {
stop("invalid distanceLimit. distanceLimit must be one of: ",
paste( valid_distances, collapse = ", "), call. = FALSE)
}
url <- "http://archive.broadinstitute.org/mpg/snap/ldsearch.php"
columnList_query <- if (GeneCruiser) "DP,GA,MAF" else "DP,MAF"
args <- rsnps_comp(list(snpList = paste(SNPs, collapse = ","), hapMapRelease = dataset,
hapMapPanel = panel, RSquaredLimit = RSquaredLimit,
distanceLimit = as.integer( distanceLimit * 1E3 ), downloadType = "file",
includeQuerySnp = "on", submit = "search", `columnList[]` = columnList_query))
if ( !quiet ) cat("Querying SNAP...\n")
dat_tmp <- GET(url, query = args, ...)
stop_for_status(dat_tmp)
dat <- cuf8(dat_tmp)
## check for validation error
if ( length( grep( "validation error", dat ) ) > 0 ) {
stop(dat, call. = FALSE)
}
## search through for missing SNPs and remove them from output
tmp <- unlist( strsplit( dat, "\r\n", fixed = TRUE ) )
warning_SNPs <- grep( "WARNING", tmp, value = TRUE )
for (line in warning_SNPs ) {
warning( line, call. = FALSE )
}
bad_lines <- grep( "WARNING", tmp )
if ( length( bad_lines ) > 0 ) {
tmp <- tmp[ -bad_lines ]
}
out <- split_to_df( tmp, sep = "\t", fixed = TRUE )
names( out ) <- unlist( unclass( out[1,] ) )
out <- out[2:nrow(out),]
out_split <- split(out, out$SNP)
for (i in 1:length(out_split) ) {
rownames( out_split[[i]] ) <- 1:nrow( out_split[[i]] )
}
# check whether any real data or not
if (all(unlist(out_split$SNP[1,], use.names = FALSE) %in% names(out_split$SNP)) &&
NROW(out_split$SNP) == 1) {
stop("no valid data found", call. = FALSE)
}
if ( !quiet ) {
cat("Querying NCBI for up-to-date SNP annotation information...\n")
}
## query NCBI for additional SNP information
SNP_info <- vector("list", length(out_split))
## get all the proxy SNP information in one query
if (add_ncbi) {
proxy_SNPs <- unique( unname( unlist( sapply( out_split, "[", "Proxy" ) ) ) )
ncbi_info <- ncbi_snp_query2(proxy_SNPs)$summary
names(ncbi_info) <- paste(sep = '', names(ncbi_info), "_NCBI")
}
## quick function for adding NCBI info to SNPs queried
add_ncbi_info <- function(x) {
x$ORDER <- 1:nrow(x)
x <- merge( x, ncbi_info,
by.x = "Proxy", by.y = "query_NCBI",
all.x = TRUE
)
x <- x[ order( x$ORDER ), ]
x <- x[ !(names(x) %in% "ORDER") ]
x$Distance[ x$Distance == 0 ] <- NA
x$Distance <- x$bp_NCBI - rep( x$bp_NCBI[1], nrow(x) )
x[ order( x$RSquared, decreasing = TRUE ), ]
}
if (add_ncbi) out <- lapply(out_split, add_ncbi_info) else out <- out_split
if (!quiet ) {
on.exit( cat("Done!\n") )
}
return( out )
}
#' @export
#' @rdname LDSearch
ld_search <- LDSearch
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.