CDBNgenomics: Genomics Using the Cooperative Dry Bean Nursery Dataset

Documented in get_annotations

#' @importFrom dplyr rename
#' @importFrom magrittr %>%
#' @importFrom multtest mt.rawp2adjp
#' @importFrom stats predict
bigsnp2anno <- function(df, markers, FDRalpha){
  input_df <- df %>%
    mutate(p.value = predict(df, log10 = FALSE))
  if(!is.na(FDRalpha)[1]){
    res <- mt.rawp2adjp(input_df$p.value, alpha = FDRalpha,
                        proc = "BH")
    adj_p <- res$adjp[order(res$index), ]
    input_df <- cbind(markers, input_df, adj_p) %>%
      as_tibble() %>%
      dplyr::rename(`p value` = .data$p.value,
                    `FDR Adjusted p value` = .data$`BH`,
                    `SNP Effect` = .data$estim,
                    `SNP standard error` = .data$std.err)
  } else {
    input_df <- cbind(markers, input_df) %>%
      as_tibble() %>%
      dplyr::rename(`p value` = .data$p.value,
                    `SNP Effect` = .data$estim,
                    `SNP standard error` = .data$std.err)
  }
  return(input_df)
}

#' @importFrom readr read_csv
#' @importFrom dplyr rename mutate
#' @importFrom tibble as_tibble
gapit2anno <- function(df){
  input <- read_csv(file = df)
  inputdf <- input %>%
    as_tibble() %>%
    dplyr::rename(`p value` = .data$P.value,
                  `FDR Adjusted p value` = .data$`FDR_Adjusted_P-values`,
                  `SNP Effect` = .data$effect,
                  POS = .data$Position,
                  `Model R^2 without SNP` = .data$Rsquare.of.Model.without.SNP,
                  `Model R^2 with SNP` = .data$Rsquare.of.Model.with.SNP,
                  MAF = .data$maf,
                  `Number of Observations` = .data$nobs,
                  ) %>%
    mutate(CHR = ifelse(.data$Chromosome <= 9,
                        paste0("Chr0", .data$Chromosome),
                        paste0("Chr", .data$Chromosome
                        )))
}

#' @importFrom ashr get_lfsr
#' @importFrom dplyr rename select left_join mutate
#' @importFrom magrittr %>%
#' @importFrom tidyr separate
#' @importFrom tibble as_tibble rownames_to_column
#' @importFrom stringr str_replace
mash2anno <- function(df, markers){
  input_df <- get_log10bf(m = df) %>%
    as.data.frame() %>%
    rownames_to_column(var = "value") %>%
    mutate(value = as.integer(.data$value)) %>%
    as_tibble() %>%
    left_join(get_marker_df(m = df), by = "value") %>%
    dplyr::rename(log10BayesFactor = .data$V1) %>%
    dplyr::select(-.data$value) %>%
    separate(.data$Marker, into = c("CHR", "POS"), sep = "_") %>%
    mutate(POS = as.integer(.data$POS),
           CHR = str_replace(.data$CHR, "^S", "Chr")
           )
  return(input_df)
}

#' @importFrom dplyr select mutate
#' @importFrom magrittr %>%
#' @importFrom readr read_csv
#' @importFrom tidyr separate
#' @importFrom stringr str_replace
rqtl2anno <- function(df){
  input <- read_csv(file = df)
  input_df <- input %>%
    separate(.data$marker, into = c("CHR", "marker_pos"), sep = "_") %>%
    separate(.data$flank_lo, into = c("cl", "flank_lo_pos"), sep = "_") %>%
    separate(.data$flank_hi, into = c("ch", "flank_hi_pos"), sep = "_") %>%
    dplyr::select(-.data$chr, -.data$cl, -.data$ch) %>%
    mutate(POS = as.numeric(.data$marker_pos)*1E6,
           start = as.numeric(.data$flank_lo_pos)*1E6,
           end = as.numeric(.data$flank_hi_pos)*1E6,
           POS = as.integer(.data$POS),
           start = as.integer(.data$start),
           end = as.integer(.data$end),
           CHR = str_replace(.data$CHR, "^S", "Chr")
    ) %>%
    dplyr::select(-(.data$marker_pos:.data$flank_hi_pos))
  return(input_df)
}

#' @importFrom dplyr filter top_n
#' @importFrom rlist list.append
#' @importFrom magrittr %>%
get_top_snps <- function(input_df, n, FDRalpha, type = c("bigsnp", "gapit",
                                                         "mash")){
  topsnp_inputlist <- list()
  if(!is.na(n[1]) & !is.na(FDRalpha[1])){
    for(i in seq_along(n)){
      if(type %in% c("bigsnp", "gapit")){
        topsnp_inputlist[[i]] <- input_df %>%
          top_n( -n[i], .data$`p value`)
      } else {
        topsnp_inputlist[[i]] <- input_df %>%
          top_n( n[i], .data$log10BayesFactor)
      }
    }
    for(i in seq_along(FDRalpha)){
      if(type %in% c("bigsnp", "gapit")){
        FDR1 <- input_df %>%
          filter(.data$`FDR Adjusted p value` <= FDRalpha[i])
      } else {
        BFalpha <- -log10(FDRalpha[i])+1
        FDR1 <- input_df %>%
          filter(.data$log10BayesFactor >= BFalpha)
      }
      topsnp_inputlist <- list.append(topsnp_inputlist, FDR1)
    }
  } else if(!is.na(n[1]) & is.na(FDRalpha[1])){
    for(i in seq_along(n)){
      if(type %in% c("bigsnp", "gapit")){
        topsnp_inputlist[[i]] <- input_df %>%
          top_n( -n[i], .data$`p value`)
      } else {
        topsnp_inputlist[[i]] <- input_df %>%
          top_n( n[i], .data$log10BayesFactor)
      }
    }
  } else if (is.na(n[1]) & !is.na(FDRalpha[1])){
    for(i in seq_along(FDRalpha)){
      if(type %in% c("bigsnp", "gapit")){
        FDR1 <- input_df %>%
          filter(.data$`FDR Adjusted p value` <= FDRalpha[i])
      } else {
        BFalpha <- -log10(FDRalpha[i])+1
        FDR1 <- input_df %>%
          filter(.data$log10BayesFactor >= BFalpha)
      }
      topsnp_inputlist <- list.append(topsnp_inputlist, FDR1)
    }
  } else {
    stop(paste0("Need to specify at least one of n (as an integer) or FDR",
                " (between 0 and 1)."))
  }
  return(topsnp_inputlist)
}

#' @importFrom dplyr select mutate left_join rename
#' @importFrom magrittr %>%
#' @importFrom tibble as_tibble
get_tidy_annos <- function(df, input, anno_info, txdb){
  out <- as_tibble(df)
  requireNamespace("GenomicFeatures")

  filter1 <- list(gene_id = out$GENEID)
  genedf <- as_tibble(GenomicFeatures::genes(txdb, filter = filter1)) %>%
    dplyr::rename(gene_width = .data$width, gene_start = .data$start,
                  gene_end = .data$end, gene_strand = .data$strand) %>%
    mutate(seqnames = as.character(.data$seqnames))

  outdf <- out %>%
    mutate(seqnames = as.character(.data$seqnames)) %>%
    dplyr::select(.data$seqnames:.data$TXID, .data$GENEID) %>%
    left_join(input, by = c("seqnames" = "CHR", "start", "end")) %>%
    left_join(genedf, by = c("seqnames", "GENEID" = "gene_id")) %>%
    left_join(anno_info, by = c("GENEID" = "locusName")) %>%
    dplyr::rename(region_start = .data$start,
                  region_end = .data$end,
                  region_strand = .data$strand) %>%
    mutate(d2_start = .data$POS - .data$gene_start,
           d2_end = .data$POS - .data$gene_end,
           distance = ifelse(abs(.data$d2_start) < abs(.data$d2_end),
                             abs(.data$d2_start),
                             abs(.data$d2_end))) %>%
    left_join(CDBNgenomics::Pv_GO, by = "GENEID") %>%
    left_join(CDBNgenomics::Pv_kegg, by = "GENEID") %>%
    dplyr::rename(CHR = .data$seqnames,
                  `Annotation` = .data$LOCATION,
                  `Gene ID` = .data$GENEID,
                  `Arabidopsis thaliana homolog` = .data$Best.hit.arabi.name,
                  `A. thaliana gene name` = .data$arabi.symbol,
                  `A. thaliana gene annotation` = .data$arabi.defline,
                  `KEGG Info` = .data$KEGGInfo,
                  `GO Categories` = .data$GOCategories,
                  `GO Info` = .data$GOInfo,
                  `Panther Categories` = .data$Panther,
                  `distance from gene` = .data$distance) %>%
    dplyr::select(-.data$LOCSTART, -.data$LOCEND)
  return(outdf)
}




#' @title Create a Table of Annotated Top SNPs for Phaseolus vulgaris
#'
#' @description Loads in one set of GWAS results from a bigsnp GWAS, a GAPIT
#'    GWAS, a mash object, RQTL2 output, or a dataframe with 'CHR',
#'    'start', and 'end' columns for the genome of Phaseolus_vulgaris. Then, it
#'    constructs SNP tables meeting different criteria using these genomic
#'    intervals.
#'
#' @param df a data frame or tbl_df. Can be bigsnpr output, mashr output
#'    (loaded into R), gapit or r/qtl2 output (specify the path to a saved
#'    .csv file), or, for Phaseolus vulgaris intervals in another format, a
#'    data frame containing columns 'CHR', 'start', and 'end'.
#' @param type Type of Phaseolus vulgaris genomic marker input specified by the
#'    df parameter. Options are "bigsnp", "gapit", "mash", "rqtl2", and "table".
#'     Defaults to 'bigsnp'.
#' @param n An integer or integer vector The numberof most significant SNPs to
#'     select (by p-value). Set to NA to omit this table. Default is 10.
#' @param FDRalpha The false discovery rate. Numeric, a number or vector of
#'     numbers between 0 and 1. Set to NA to omit this table. Default is 0.1.
#' @param rangevector How far from the significant SNP should annotations be
#'    pulled? Can be an integer or a vector of integers. Default is 0 (the SNP
#'    itself) and a 10 kbp window around the SNP.
#' @param markers For data frames of type "mash" or "bigsnp", the same set of
#'    markers (with CHR and POS columns) as your df object.
#' @param anno_info Gene information from
#'    Pvulgaris_442_v2.1.annotation_info.txt
#' @param txdb Annotation information from Pvulgaris_442_v2.1.gene.txdb.sqlite
#'
#' @return A list containing dataframes of SNPs. If more than one dataframe is
#'     returned, they are named using the criteria used to select the SNPs in
#'     the dataframe.
#'
#' @importFrom dplyr mutate
#' @importFrom magrittr %>%
#' @importFrom multtest mt.rawp2adjp
#' @importFrom rlang is_null
#' @importFrom tibble as_tibble
#'
#' @export
get_annotations <- function(df, type = c("bigsnp", "gapit", "mash", "rqtl2",
                                         "table"), n = 10, FDRalpha = NA,
                            rangevector = c(0, 10000), markers = NULL,
                            anno_info = NULL, txdb = NULL){
  requireNamespace("VariantAnnotation")

  n <- as.integer(n)
  FDRalpha <- as.numeric(FDRalpha)
  rangevector <- as.integer(rangevector)
  stopifnot(type %in% c("bigsnp", "gapit", "mash", "rqtl2", "table"),
            !is_null(anno_info), !is_null(txdb))
  if(type == "table" & !("CHR" %in% names(df)) & !("start" %in% names(df)) &
     !("end" %in% names(df))){
    stop(paste0("For 'table' type, need to have columns 'CHR', 'start', and ",
                "'end' in your data frame."))
  }
  if(type %in% c("bigsnp", "mash") & is.na(n)[1] & is.na(FDRalpha)[1]){
    stop(paste0("For 'mash' and 'bigsnp' types, need to specify at least one",
                "of n (as an integer) or FDR (between 0 and 1)."))
  }
  if(type %in% c("bigsnp", "mash")){
    stopifnot(!is_null(markers[1]))
  }
  topsnp_inputlist <- list()
  topsnp_outputlist <- list()
  ## Prepare a dataframe for each type for further analysis.
  if(type == "bigsnp"){
    input_df <- bigsnp2anno(df = df, markers = markers, FDRalpha = FDRalpha)
  }
  if(type == "gapit"){
    input_df <- gapit2anno(df = df)
  }
  if(type == "mash"){
    input_df <- mash2anno(df = df, markers = markers)
  }
  if(type == "rqtl2"){
    topsnp_inputlist[[1]] <- rqtl2anno(df = df)
  }
  if(type == "table"){
    topsnp_inputlist[[1]] <- df %>%
      mutate(CHR = ifelse(.data$CHR <= 9,
                    paste0("Chr0", .data$CHR),
                    paste0("Chr", .data$CHR
                           )),
             start = as.integer(.data$start),
             end = as.integer(.data$end),
             POS = as.integer(.data$end))
  }
  if(type %in% c("bigsnp", "gapit", "mash")){
    ## Now make multiple dataframes for different categories of top SNPs.
    topsnp_inputlist <- get_top_snps(input_df = input_df, n = n,
                                     FDRalpha = FDRalpha, type = type)
  }

  #### Find annotations for each tbl_df found for each SNP criterion specified.
  for(k in seq_along(rangevector)){
    range <- rangevector[k]
    for(i in seq_along(topsnp_inputlist)){
      loop_input <- topsnp_inputlist[[i]]
      if(nrow(loop_input) > 0){

        if(type %in% c("bigsnp", "gapit", "mash")){
          # Prepare input dataframe
          input <- loop_input %>%
            mutate(CHR = ifelse(is.integer(.data$CHR),
                                case_when(.data$CHR >= 10 ~ paste0("Chr",
                                                             as.character(.data$CHR)),
                                          .data$CHR <= 9 ~ paste0("Chr0",
                                                            as.character(.data$CHR)),
                                          TRUE ~ as.character(.data$CHR)), .data$CHR)) %>%
            mutate(start = .data$POS - (range/2),
                   end = .data$POS + (range/2))
        } else if(type == "table"){
          input <- loop_input %>%
            mutate(CHR = ifelse(is.integer(.data$CHR),
                                case_when(.data$CHR >= 10 ~ paste0("Chr",
                                                             as.character(.data$CHR)),
                                          .data$CHR <= 9 ~ paste0("Chr0",
                                                            as.character(.data$CHR)),
                                          TRUE ~ as.character(.data$CHR)), .data$CHR)) %>%
            mutate(start = .data$start - (range/2),
                   end = .data$end + (range/2))
        } else{
          input <- loop_input # Other types have start and end already
        }

        ## Make input into a GRanges object for querying with locateVariants
        target <- with(input, GRanges(seqnames = Rle(CHR),
                                      ranges = IRanges(start = start,
                                                       end = end,
                                                       names = NULL),
                                      strand = Rle(strand("*"))))

        ### Find genes that overlap input SNPs with VariantAnnotation
        loc <-
          VariantAnnotation::locateVariants(target, txdb,
                                            VariantAnnotation::AllVariants(promoter =
                                                                             VariantAnnotation::PromoterVariants(upstream = 2000L,
                                                                                                                 downstream = 2000L),
                                                                           intergenic =
                                                                             VariantAnnotation::IntergenicVariants(upstream = 20000L,
                                                                                                                   downstream = 20000L,
                                                                                                                   idType = "gene")))

        #### Convert a GRanges object to an output data frame
        topsnp_outputlist[[(i + (k-1)*length(topsnp_inputlist))]] <-
          get_tidy_annos(df = loc, input = input, anno_info = anno_info,
                         txdb = txdb)
      } else {
        topsnp_outputlist[[(i + (k-1)*length(topsnp_inputlist))]] <-
          as_tibble(NA)
      }
    }
  }
  if(type %in% c("bigsnp", "gapit", "mash")){
    if(!is.na(n)[1] & !is.na(FDRalpha)[1]){
      names1 <- c(paste0("top", n, "SNPs_"), paste0("FDR", FDRalpha, "_"))
    } else if(!is.na(n)[1]){
      names1 <- c(paste0("top", n, "SNPs_"))
    } else {
      names1 <- c(paste0("FDR", FDRalpha, "_"))
    }
    names(topsnp_outputlist) <- paste0(rep(names1, length(rangevector)),
                                       "within",
                                       rep(rangevector, each = length(names1)),
                                       "bp")
  }
  return(topsnp_outputlist)
}
Alice-MacQueen/CDBNgenomics documentation built on Aug. 18, 2020, 4:39 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Alice-MacQueen/CDBNgenomics
Genomics Using the Cooperative Dry Bean Nursery Dataset

R/get_annotations.R
In Alice-MacQueen/CDBNgenomics: Genomics Using the Cooperative Dry Bean Nursery Dataset

Defines functions get_annotations get_tidy_annos get_top_snps rqtl2anno mash2anno gapit2anno bigsnp2anno

Documented in get_annotations

R Package Documentation

Browse R Packages

We want your feedback!

Alice-MacQueen/CDBNgenomics Genomics Using the Cooperative Dry Bean Nursery Dataset

R/get_annotations.R In Alice-MacQueen/CDBNgenomics: Genomics Using the Cooperative Dry Bean Nursery Dataset

Defines functions get_annotations get_tidy_annos get_top_snps rqtl2anno mash2anno gapit2anno bigsnp2anno

Documented in get_annotations

R Package Documentation

Browse R Packages

We want your feedback!

Alice-MacQueen/CDBNgenomics
Genomics Using the Cooperative Dry Bean Nursery Dataset

R/get_annotations.R
In Alice-MacQueen/CDBNgenomics: Genomics Using the Cooperative Dry Bean Nursery Dataset