#' @importFrom dplyr rename
#' @importFrom magrittr %>%
#' @importFrom stats predict
bigsnp2anno <- function(df, markers, FDRalpha){
if(attr(df, "class") == "tbl_df" & "bigsnpscore" %in% names(df) &
"pvalue" %in% names(df) & "FDR_adj" %in% names(df) &
"CHR" %in% names(df) & "POS" %in% names(df)
& "estim" %in% names(df) & "std_err" %in% names(df)){
input_df <- df %>%
as_tibble() %>%
dplyr::rename(`p value` = .data$pvalue,
`FDR Adjusted p value` = .data$`FDR_adj`,
`SNP Effect` = .data$estim,
`SNP standard error` = .data$std_err)
} else { # else this is an object from pvdiv_gwas and needs additional vals
input_df <- df %>%
mutate(p.value = predict(df, log10 = FALSE))
if(!is.na(FDRalpha)[1]){
res <- mt.rawp2adjp(input_df$p.value, alpha = FDRalpha,
proc = "BH")
adj_p <- res$adjp[order(res$index), ]
input_df <- cbind(markers, input_df, adj_p) %>%
as_tibble() %>%
dplyr::rename(`p value` = .data$p.value,
`FDR Adjusted p value` = .data$`BH`,
`SNP Effect` = .data$estim,
`SNP standard error` = .data$std.err)
} else {
input_df <- cbind(markers, input_df) %>%
as_tibble() %>%
dplyr::rename(`p value` = .data$p.value,
`SNP Effect` = .data$estim,
`SNP standard error` = .data$std.err)
}
}
return(input_df)
}
#' @importFrom ashr get_lfsr
#' @importFrom dplyr rename select left_join mutate
#' @importFrom magrittr %>%
#' @importFrom tidyr separate
#' @importFrom tibble as_tibble rownames_to_column
mash2anno <- function(df, markers){
input_df <- get_log10bf(m = df) %>%
as.data.frame() %>%
rownames_to_column(var = "value") %>%
mutate(value = as.integer(.data$value)) %>%
as_tibble() %>%
left_join(get_marker_df(m = df), by = "value") %>%
dplyr::rename(log10BayesFactor = .data$V1) %>%
dplyr::select(-.data$value) %>%
separate(.data$Marker, into = c("CHR", "POS"), sep = "_") %>%
mutate(POS = as.integer(.data$POS))
return(input_df)
}
#' @importFrom dplyr select mutate
#' @importFrom magrittr %>%
#' @importFrom readr read_csv
#' @importFrom tidyr separate
rqtl2anno <- function(df){
input <- read_csv(file = df)
input_df <- input %>%
separate(.data$marker, into = c("CHR", "marker_pos"), sep = "_") %>%
separate(.data$flank_lo, into = c("cl", "flank_lo_pos"), sep = "_") %>%
separate(.data$flank_hi, into = c("ch", "flank_hi_pos"), sep = "_") %>%
dplyr::select(-.data$chr, -.data$cl, -.data$ch) %>%
mutate(POS = as.numeric(.data$marker_pos)*1E6,
start = as.numeric(.data$flank_lo_pos)*1E6,
end = as.numeric(.data$flank_hi_pos)*1E6,
POS = as.integer(.data$POS),
start = as.integer(.data$start),
end = as.integer(.data$end)
) %>%
dplyr::select(-(.data$marker_pos:.data$flank_hi_pos))
return(input_df)
}
#' @importFrom dplyr filter top_n
#' @importFrom rlist list.append
#' @importFrom magrittr %>%
get_top_snps <- function(input_df, n, FDRalpha, type = c("bigsnp", "mash")){
topsnp_inputlist <- list()
if(!is.na(n[1]) & !is.na(FDRalpha[1])){
for(i in seq_along(n)){
if(type == "bigsnp"){
topsnp_inputlist[[i]] <- input_df %>%
top_n( -n[i], .data$`p value`)
} else {
topsnp_inputlist[[i]] <- input_df %>%
top_n( n[i], .data$log10BayesFactor)
}
}
for(i in seq_along(FDRalpha)){
if(type == "bigsnp"){
FDR1 <- input_df %>%
filter(.data$`FDR Adjusted p value` <= FDRalpha[i])
} else {
BFalpha <- -log10(FDRalpha[i])+1
FDR1 <- input_df %>%
filter(.data$log10BayesFactor >= BFalpha)
}
topsnp_inputlist <- list.append(topsnp_inputlist, FDR1)
}
} else if(!is.na(n[1]) & is.na(FDRalpha[1])){
for(i in seq_along(n)){
if(type == "bigsnp"){
topsnp_inputlist[[i]] <- input_df %>%
top_n( -n[i], .data$`p value`)
} else {
topsnp_inputlist[[i]] <- input_df %>%
top_n( n[i], .data$log10BayesFactor)
}
}
} else if (is.na(n[1]) & !is.na(FDRalpha[1])){
for(i in seq_along(FDRalpha)){
if(type == "bigsnp"){
FDR1 <- input_df %>%
filter(.data$`FDR Adjusted p value` <= FDRalpha[i])
} else {
BFalpha <- -log10(FDRalpha[i])+1
FDR1 <- input_df %>%
filter(.data$log10BayesFactor >= BFalpha)
}
topsnp_inputlist <- list.append(topsnp_inputlist, FDR1)
}
} else {
stop(paste0("Need to specify at least one of n (as an integer) or FDR",
" (between 0 and 1)."))
}
return(topsnp_inputlist)
}
#' @importFrom dplyr select mutate left_join rename
#' @importFrom magrittr %>%
#' @importFrom tibble as_tibble
get_tidy_annos <- function(df, input, anno_info, txdb){
out <- as_tibble(df)
requireNamespace("GenomicFeatures")
filter1 <- list(gene_id = out$GENEID)
genedf <- as_tibble(GenomicFeatures::genes(txdb, filter = filter1)) %>%
dplyr::rename(gene_width = .data$width, gene_start = .data$start,
gene_end = .data$end, gene_strand = .data$strand) %>%
mutate(seqnames = as.character(.data$seqnames))
outdf <- out %>%
mutate(seqnames = as.character(.data$seqnames)) %>%
dplyr::select(.data$seqnames:.data$TXID, .data$GENEID) %>%
left_join(input, by = c("seqnames" = "CHR", "start", "end")) %>%
left_join(genedf, by = c("seqnames", "GENEID" = "gene_id")) %>%
left_join(anno_info, by = c("GENEID" = "locusName")) %>%
dplyr::rename(region_start = .data$start,
region_end = .data$end,
region_strand = .data$strand) %>%
mutate(d2_start = .data$POS - .data$gene_start,
d2_end = .data$POS - .data$gene_end,
distance = ifelse(abs(.data$d2_start) < abs(.data$d2_end),
abs(.data$d2_start),
abs(.data$d2_end))) %>%
dplyr::rename(CHR = .data$seqnames,
`Annotation` = .data$LOCATION,
`Gene ID` = .data$GENEID,
`Arabidopsis thaliana homolog` = .data$`Best-hit-arabi-name`,
`A. thaliana gene name` = .data$`arabi-symbol`,
`A. thaliana gene annotation` = .data$`arabi-defline`,
`Rice homolog` = .data$`Best-hit-rice-name`,
`Rice gene name` = .data$`rice-symbol`,
`Rice gene annotation` = .data$`rice-defline`,
`Panther Categories` = .data$Panther,
`distance from gene` = .data$distance) %>%
dplyr::select(-.data$LOCSTART, -.data$LOCEND)
return(outdf)
}
#' @title Create a Table of Annotated Top SNPs for Panicum virgatum
#'
#' @description Loads in one set of GWAS results from a bigsnp GWAS, a mash
#' object, RQTL2 output, or a dataframe with 'CHR', 'start', and 'end'
#' columns for the genome of Panicum virgatum. Then, it constructs SNP
#' tables meeting different criteria using these genomic intervals.
#'
#' @param df a data frame or tbl_df. Can be bigsnpr output, mashr output
#' (loaded into R), r/qtl2 output (specify the path to a saved .csv file),
#' or, for Panicum virgatum intervals in another format, a
#' data frame containing columns 'CHR', 'start', and 'end'.
#' @param type Type of Panicum virgatum genomic marker input specified by the
#' df parameter. Options are "bigsnp", "mash", "rqtl2", and "table". Defaults
#' to 'bigsnp'.
#' @param n An integer or integer vector The numberof most significant SNPs to
#' select (by p-value). Set to NA to omit this table. Default is 10.
#' @param FDRalpha The false discovery rate. Numeric, a number or vector of
#' numbers between 0 and 1. Set to NA to omit this table. Default is 0.1.
#' @param rangevector How far from the significant SNP should annotations be
#' pulled? Can be an integer or a vector of integers. Default is 0 (the SNP
#' itself) and a 10 kbp window around the SNP.
#' @param snp For data frames of type "mash" or "bigsnp", the bigSNP object used
#' to create your data. Load into R with bigsnpr::snp_attach().
#' @param anno_info Gene information from
#' Pvirgatum_516_v5.1.annotation_info.txt
#' @param txdb Annotation information from Pvirgatum_516_v5.1.gene.txdb.sqlite
#'
#' @return A list containing dataframes of SNPs. If more than one dataframe is
#' returned, they are named using the criteria used to select the SNPs in the dataframe.
#'
#' @importFrom dplyr mutate
#' @importFrom magrittr %>%
#' @importFrom rlang is_null
#' @importFrom tibble as_tibble
#'
#' @export
pvdiv_table_topsnps <- function(df, type = c("bigsnp", "mash", "rqtl2", "table"),
n = 10, FDRalpha = 0.1,
rangevector = c(0, 10000), snp = NULL,
txdb = NULL,
anno_info = switchgrassGWAS::anno_info){
requireNamespace("VariantAnnotation")
n <- as.integer(n)
FDRalpha <- as.numeric(FDRalpha)
rangevector <- as.integer(rangevector)
stopifnot(type %in% c("bigsnp", "mash", "rqtl2", "table"),
!is_null(anno_info),
!is_null(txdb))
if(type == "table" & !("CHR" %in% names(df)) & !("start" %in% names(df)) &
!("end" %in% names(df))){
stop(paste0("For 'table' type, need to have columns 'CHR', 'start', and ",
"'end' in your data frame."))
}
if(type %in% c("bigsnp", "mash") & is.na(n)[1] & is.na(FDRalpha)[1]){
stop(paste0("For 'mash' and 'bigsnp' types, need to specify at least one",
"of n (as an integer) or FDR (between 0 and 1)."))
}
if(type %in% c("bigsnp", "mash")){
if(attr(snp, "class") != "bigSNP"){
stop("snp needs to be a bigSNP object, produced by the bigsnpr package.")
}
markers <- tibble(CHR = snp$map$chromosome,
POS = snp$map$physical.pos)
}
topsnp_inputlist <- list()
topsnp_outputlist <- list()
## Prepare a dataframe for each type for further analysis.
if(type == "bigsnp"){
input_df <- bigsnp2anno(df = df, markers = markers, FDRalpha = FDRalpha)
}
if(type == "mash"){
input_df <- mash2anno(df = df, markers = markers)
}
if(type == "rqtl2"){
topsnp_inputlist[[1]] <- rqtl2anno(df = df)
}
if(type == "table"){
topsnp_inputlist[[1]] <- df %>%
mutate(start = as.integer(.data$start),
end = as.integer(.data$end),
POS = (.data$end + .data$start)/2)
}
if(type %in% c("bigsnp", "mash")){
## Now make multiple dataframes for different categories of top SNPs.
topsnp_inputlist <- get_top_snps(input_df = input_df, n = n,
FDRalpha = FDRalpha, type = type)
}
#### Find annotations for each tbl_df found for each SNP criterion specified.
if(type %in% c("bigsnp", "mash")){
for(k in seq_along(rangevector)){
range <- rangevector[k]
for(i in seq_along(topsnp_inputlist)){
loop_input <- topsnp_inputlist[[i]]
if(nrow(loop_input) > 0){
input <- loop_input %>%
mutate(start = .data$POS - (range/2),
end = .data$POS + (range/2))
## Make input into a GRanges object for querying with locateVariants
target <- with(input, GRanges(seqnames = Rle(CHR),
ranges = IRanges(start = start,
end = end,
names = NULL),
strand = Rle(strand("*"))))
### Find genes that overlap input SNPs with VariantAnnotation
loc <-
VariantAnnotation::locateVariants(target, txdb,
VariantAnnotation::AllVariants(promoter =
VariantAnnotation::PromoterVariants(upstream = 2000L,
downstream = 2000L),
intergenic =
VariantAnnotation::IntergenicVariants(upstream = 20000L,
downstream = 20000L,
idType = "gene")))
#### Convert a GRanges object to an output data frame
topsnp_outputlist[[(i + (k-1)*length(topsnp_inputlist))]] <-
get_tidy_annos(df = loc, input = input, anno_info = anno_info,
txdb = txdb)
} else {
topsnp_outputlist[[(i + (k-1)*length(topsnp_inputlist))]] <-
as_tibble(NA)
}
}
}
} else if(type %in% c("rqtl2", "table")){
loop_input <- topsnp_inputlist[[1]]
if(nrow(loop_input) > 0){
input <- loop_input # Other types have start and end already
## Make input into a GRanges object for querying with locateVariants
target <- with(input, GRanges(seqnames = Rle(CHR),
ranges = IRanges(start = start,
end = end,
names = NULL),
strand = Rle(strand("*"))))
### Find genes that overlap input SNPs with VariantAnnotation
loc <-
VariantAnnotation::locateVariants(target, txdb,
VariantAnnotation::AllVariants(promoter =
VariantAnnotation::PromoterVariants(upstream = 2000L,
downstream = 2000L),
intergenic =
VariantAnnotation::IntergenicVariants(upstream = 20000L,
downstream = 20000L,
idType = "gene")))
#### Convert a GRanges object to an output data frame
topsnp_outputlist[[1]] <-
get_tidy_annos(df = loc, input = input, anno_info = anno_info,
txdb = txdb)
} else {
topsnp_outputlist[[1]] <-
as_tibble(NA)
}
}
if(type %in% c("bigsnp", "mash")){
if(!is.na(n)[1] & !is.na(FDRalpha)[1]){
names1 <- c(paste0("top", n, "SNPs_"), paste0("FDR", FDRalpha, "_"))
} else if(!is.na(n)[1]){
names1 <- c(paste0("top", n, "SNPs_"))
} else {
names1 <- c(paste0("FDR", FDRalpha, "_"))
names(topsnp_outputlist) <- paste0(rep(names1, length(rangevector)),
"within",
rep(rangevector, each = length(names1)),
"bp")
}
} else {
topsnp_outputlist <- topsnp_outputlist[[1]]
}
return(topsnp_outputlist)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.