R/setters_and_getters.R

Defines functions get_snps_within_region set_genes_pos_adj get_topr_colors get_top_hit get_top_snp get_legend get_gene get_genes_by_Gene_Symbol get_best_snp_per_MB get_pos_with_offset get_pos_with_offset4list get_ticks get_ticknames get_chr_offsets get_ymin get_ymax get_shades get_chr_from_df get_exons get_genes set_log10p set_shape set_alpha set_size set_color set_annotate set_shape_by_impact set_size_shape_alpha

Documented in get_best_snp_per_MB get_gene get_genes_by_Gene_Symbol get_snps_within_region get_topr_colors get_top_snp

set_size_shape_alpha <- function(df,size,shape,alpha,locuszoomplot=F){
  df <- set_size(df,size, locuszoomplot = locuszoomplot)
  df <- set_shape(df,shape,locuszoomplot = locuszoomplot)
  df <- set_alpha(df,alpha)
  if(!locuszoomplot){
    df <- set_shape_by_impact(df)
  }
  return(df)
}

set_shape_by_impact <- function(dat){
  for(i in seq_along(dat)){
    if("Max_Impact" %in% colnames(dat[[i]])){
      dat[[i]]$shape0 <- ifelse(dat[[i]]$Max_Impact == "MODERATE", 17, dat[[i]]$shape)
      dat[[i]]$shape <- ifelse(dat[[i]]$Max_Impact =="HIGH", 8, dat[[i]]$shape0)
    }
  }
  return(dat)
}

set_annotate <- function(dat,annotate){
  if(! is.null(annotate)){
    for(i in seq_along(dat)){
      if(!is.null(annotate) & nrow(dat[[i]]>0)){
        if(is.vector(annotate) & (length(annotate) >= i))
          dat[[i]]$annotate <- annotate[i]
        else
          dat[[i]]$annotate <- annotate
      }
    }
  }
  return(dat)
}

set_color <- function(dat,color){
  if(length(color) < length(dat))
    stop(paste("There are ",length(dat), " datasets, but only ",length(color), " color. Add more colors with the color argument, eg. color=c(\"blue\",\"yellow\",\"green\", etc)"))
  for(i in seq_along(dat)){
    if(!is.null(color) & nrow(dat[[i]]>0)){
      if(is.vector(color) & (length(color) >= i))
        dat[[i]]$color <- color[i]
      else
        dat[[i]]$color <- color
    }
  }
  return(dat)
}

set_size <- function(dat,size, locuszoomplot = locuszoomplot){
  if(is.null(size))
    size <- 1
  for(i in seq_along(dat)){
    if(!is.null(size) & nrow(dat[[i]]>0)){
      if(is.vector(size) &  length(size) >= i )
        dat[[i]]$size <- size[i]
      else
        dat[[i]]$size <- size
    }
    if(locuszoomplot & ("R2" %in% colnames(dat[[i]]))){
      dat[[i]]$size <- ifelse(dat[[i]]$R2 == 1, dat[[i]]$size*1.5, dat[[i]]$size)
    }
  }
  return(dat)
}

set_alpha <- function(dat,alpha){
  if(is.null(alpha))
    alpha <- 0.7
  for(i in seq_along(dat)){
    if(!is.null(alpha) & nrow(dat[[i]]>0)){
      if(is.vector(alpha) & (length(alpha) >= i))
        dat[[i]]$alpha <- alpha[i]
      else
        dat[[i]]$alpha <- alpha
    }
  }
  return(dat)
}

set_shape <- function(dat,shape,locuszoomplot=F){
  if(is.null(shape))
    shape <- 19
  for(i in seq_along(dat)){
    if(!is.null(shape) & nrow(dat[[i]]>0)){
      if(is.vector(shape) & (length(shape) >= i))
        dat[[i]]$shape <- shape[i]
      else
        dat[[i]]$shape <- shape
    }
    if(locuszoomplot & ("R2" %in% colnames(dat[[i]]))){
        dat[[i]]$shape <- ifelse(dat[[i]]$R2 == 1, 18, dat[[i]]$shape)
    }
  }
  return(dat)
}

set_log10p <- function(dat, ntop){
  for(i in seq_along(dat)){
    df <- dat[[i]]
    if(ntop > length(dat)){ ntop <- length(dat)}
    if(i <= ntop){
      dat[[i]] <- df %>% dplyr::mutate(log10p=-log10(P))

    }else{
      dat[[i]] <- df %>% dplyr::mutate(log10p=log10(P))
    }
  }
  return(dat)
}

## Getters

get_genes <- function(chr,xmin=0,xmax=NULL,protein_coding_only=FALSE){
  chr <- gsub("chr", "", chr)
  chr <- paste("chr",chr,sep="")
  if(is.null(xmax)){
    xmax <- chr_lengths[chr_lengths$V1 == chr,]$V2
  }
  if(protein_coding_only)
    genes <- topr::ENSGENES %>% dplyr::filter(chrom==chr) %>% dplyr::filter(biotype=="protein_coding") %>% dplyr::filter(gene_start>xmin & gene_start<xmax | gene_end > xmin & gene_end < xmax | gene_start < xmin & gene_end>xmax | gene_start == xmin | gene_end == xmax)
  else
    genes <- topr::ENSGENES %>% dplyr::filter(chrom==chr) %>% dplyr::filter(gene_start>xmin & gene_start<xmax | gene_end > xmin & gene_end < xmax | gene_start < xmin & gene_end>xmax | gene_start == xmin | gene_end == xmax)
  return(genes)
}

get_exons <- function(chr,xmin=0,xmax=NULL,protein_coding_only=FALSE){
  genes <- get_genes(chr,xmin,xmax)
  if(protein_coding_only)
    exons <- merge(topr::ENSEXONS, genes %>% dplyr::select(gene_symbol,biotype,chrom,gene_start,gene_end) %>% dplyr::filter(biotype=="protein_coding"), by=c("gene_symbol","chrom","gene_start","gene_end"))
  else
    exons <- merge(topr::ENSEXONS, genes %>% dplyr::select(gene_symbol,biotype,chrom,gene_start,gene_end), by=c("gene_symbol","chrom","gene_start","gene_end"))
    #rename chromstart exon_chromstart | rename chromend exon_chromend)",sep=""))
  return(exons)
}


get_chr_from_df <- function(df){
  #take the chromosome from the dataframe
  if(length(unique(df$CHROM)) == 1)
    chr <- unique(df$CHROM)
  else
    stop("More than one chromosome in the input data, please select which chromosome to display by setting the chr argument (e.g. chr='chr10') or use input data with just one chromosome")
  return(chr)
}


get_shades <- function(offsets,dat,ntop=ntop,include_chrX=FALSE,ymin=NULL,ymax=NULL){
  n_offsets <- 11
  if(!(include_chrX)) n_offsets <-10
  y1 <- c(rep(0, n_offsets))  #if there is no bottom plot
  if(is.null(ymin)){
    ymin <- get_ymin(dat)
  }
  if(length(dat) > ntop){
    y1 <- c(rep(ymin, n_offsets))
  }
  if(is.null(ymax)){
    ymax <- get_ymax(dat)
  }
  ##### here we need to accoutn for whether we have chr X or not
  if(include_chrX){
    shades <- data.frame(x1=c(offsets[[2]],offsets[[4]],offsets[[6]],offsets[[8]],offsets[[10]],offsets[[12]],offsets[[14]],offsets[[16]],offsets[[18]],offsets[[20]],offsets[[22]]),
                      x2=c(offsets[[3]],offsets[[5]],offsets[[7]], offsets[[9]],offsets[[11]],offsets[[13]],offsets[[15]],offsets[[17]],offsets[[19]],offsets[[21]],offsets[[23]]),
                      y1=y1,
                      y2=c(rep(ymax, n_offsets)))
  }
  else{
    shades <- data.frame(x1=c(offsets[[2]],offsets[[4]],offsets[[6]],offsets[[8]],offsets[[10]],offsets[[12]],offsets[[14]],offsets[[16]],offsets[[18]],offsets[[20]]),
                      x2=c(offsets[[3]],offsets[[5]],offsets[[7]], offsets[[9]],offsets[[11]],offsets[[13]],offsets[[15]],offsets[[17]],offsets[[19]],offsets[[21]]),
                      y1=y1,
                      y2=c(rep(ymax, n_offsets)))
  }
  return(shades)
}
get_ymax <- function(dat){
  ymax <- 0
  if(is.data.frame(dat)) dat <- list(dat)
  for(i in seq_along(dat)){
    tmp <- max(dat[[i]]$log10p)
    if(tmp>ymax){
      ymax <- tmp
    }
  }
  return(ymax)
}

get_ymin <- function(dat){
  ymin <- 5
  if(is.data.frame(dat)) dat <- list(dat)
    for(i in seq_along(dat)){
      tmp <- min(dat[[i]]$log10p)
      if(tmp<ymin){
        ymin <- tmp
      }
    }
  return(ymin)
}

get_chr_offsets <- function(include_chrX=F){
  #create the offsets from the internal chr_lengths data
  chr_lengths$CHROM <- gsub("chr","", chr_lengths$V1)
  chr_lengths <- chr_lengths %>% dplyr::filter(! V1 %in% c("chrM") )
  chr_lengths[chr_lengths$CHROM=="X",'CHROM'] <- "23"
  chr_lengths[chr_lengths$CHROM=="Y",'CHROM'] <- "24"
  chr_lengths$CHROM <- as.integer(chr_lengths$CHROM)
  no_chrs <- 22
  if(include_chrX)
    no_chrs <- 23
  chr_lengths <- chr_lengths %>% dplyr::filter(CHROM< no_chrs +1 )
  tmp <- chr_lengths %>% dplyr::group_by(CHROM) %>% dplyr::summarize(m=V2) %>% dplyr::mutate(offset=cumsum(lag(m, default=0)))
  offsets <- stats::setNames(tmp$offset,tmp$CHROM)
  return(offsets)
}

get_ticknames <- function(df){
  no_chrs <- ifelse("chrX" %in% df$CHROM || "X" %in% df$CHROM || "chr23" %in% df$CHROM || "23" %in% df$CHROM, 23, 22)
  if(no_chrs == 23){
    ticknames <- c(1:16, '',18, '',20, '',22, 'X')
  }else{
    ticknames <- c(1:16, '',18, '',20, '',22)
  }
  tickpos <-df %>% dplyr::group_by(CHROM)%>%
                                dplyr::summarize(pm=mean(POS))%>%
                                dplyr::pull(pm)

  names(tickpos) <- NULL
  return(list(names=ticknames, pos=tickpos))
}

get_ticks <- function(dat){
  df <- dat[[1]]
  for(i in seq_along(dat)){ if(length(unique(dat[[i]]$CHROM))  > length(unique(df$CHROM))){ df <- dat[[i]] } }
  no_chrs <- ifelse("chrX" %in% df$CHROM || "X" %in% df$CHROM || "chr23" %in% df$CHROM || "23" %in% df$CHROM, 23, 22)
  if(no_chrs == 23){
    ticknames <- c(1:16, '',18, '',20, '',22, 'X')
  }else{
    ticknames <- c(1:16, '',18, '',20, '',22)
  }
  tickpos <- df %>%
                                dplyr::group_by(CHROM)%>%
                                dplyr::summarize(pm=mean(POS))%>%
                                dplyr::pull(pm)

  names(tickpos) <- NULL
  return(list(names=ticknames, pos=tickpos))
}

get_pos_with_offset4list <- function(dat, offsets){
  for(i in seq_along(dat)){
    dat[[i]] <- get_pos_with_offset(dat[[i]], offsets)
  }
  return(dat)
}

get_pos_with_offset <- function(df,offsets){
  df$POS_orig <- df$POS
  df <- df %>% dplyr::mutate(POS=POS_orig+offsets[CHROM])
  return(df)
}


#' Get the top SNP per 10 MB
#'
#' @description
#'
#' \code{get_best_snp_per_MB()} Get the top SNP per 1 MB region
#' All other input parameters are optional
#'
#' @param df Dataframe
#' @param thresh P-value threshold, only consider variants with p-values below this threshold (1e-09 by default)
#' @param region Get the top/best variant (with p-value below thresh) within this region (region=1000000 by default)
#' @param protein_coding_only Set this variable to TRUE to only use protein_coding genes for annotation
#' @param chr Use this argument to get the top variants from one chromosome only
#' @param .checked If the input data has already been checked, this can be set to TRUE so it wont be checked again (FALSE by default)
#' @return Dataframe of lead variants. Returns the best variant per MB (by default, change the region size with the region argument) with p-values below the input threshold (thresh=1e-09 by default)
#' @export
#'
#' @examples
#' \dontrun{
#' data(gwas_CD)
#' get_best_snp_per_MB(gwas_CD, thresh = 1e-09, region_size = 10000000)
#' }
get_best_snp_per_MB <- function(df, thresh=1e-09,region_size=1000000,protein_coding_only=FALSE,chr=NULL, .checked=FALSE, verbose=F){
  variants <- df[0,]
  dat <- df
  if(is.data.frame(dat)) dat <- list(dat)

  if(! .checked){
    dat <- dat_check(dat)
  }
  df <- dat[[1]]
  if(! is.null(chr)){
    chr <- gsub("chr","", chr)
    df <- df %>% dplyr::filter(CHROM == chr)
  }

  df <- df %>% dplyr::filter(P<thresh)
  if(protein_coding_only & ("biotype" %in% colnames(df))){
       df <- df %>% dplyr::filter(biotype == "protein_coding")
  }
  if(nrow(df) > 0){
    df$tmp <- NA
    if(max(df$POS)-min(df$POS) < region_size){
      df$tmp<- 1
    }else{
      for(row in seq_len(nrow(df))){
        df$tmp <- base::round(df$POS/region_size)
      }
    }
    lead_snps <- df %>% dplyr::group_by(CHROM,tmp) %>% dplyr::arrange(P) %>% distinct(P, .keep_all=T) %>% dplyr::filter(P == min(P))
    variants <- dplyr::ungroup(lead_snps)%>% dplyr::distinct(CHROM,POS, .keep_all = T)
    if(verbose){
      print(paste("Found ",length(variants$POS), " index/lead variants with a p-value below ", thresh, sep=""))
      print(paste("Index/lead variant: the top variant within a",format(region_size, scientific = F),"bp region_size"), sep="")
    }
  }else{
    print(paste("There are no SNPs with p-values below ",thresh, " in the input dataset. Use the [thresh] argument to lower the threshold.",sep=""))
  }
  if("tmp" %in% colnames(variants)){
    variants <- variants %>% dplyr::select(-tmp)
  }
  return(variants)
}

#' Get the genetic position of a gene or genes by their gene name
#'
#' @description
#'
#' \code{get_genes_by_Gene_Symbol()} Get genes by their gene symbol/name
#' Required parameters is on gene name or a vector of gene names
#'
#' @param genes A gene name (e.g. "FTO") or a vector of gene names ( c("FTO","NOD2"))
#' @param chr Search for the genes on this chromsome only
#' @return Dataframe of genes
#' @export
#'
#' @examples
#' \dontrun{
#' get_genes_by_Gene_Symbol(c("FTO","THADA"))
#' }
#'
get_genes_by_Gene_Symbol <- function(genes, chr=NULL){
    genes_df <- topr::ENSGENES %>% dplyr::filter(gene_symbol %in% genes)
    if(! is.null(chr)){
      chr <- gsub('chr','',chr)
      genes_df <- genes_df %>% dplyr::filter(chrom == paste("chr",chr,sep=""))
  }
  genes_df <- genes_df %>% dplyr::mutate(POS = gene_start + round((gene_end-gene_start)/2)) %>% dplyr::rename(Gene_Symbol=gene_symbol,CHROM=chrom)
  dist_genes <- genes_df %>% dplyr::distinct(CHROM,gene_start,gene_end, .keep_all = T)
  return(dist_genes)
}

#' Get the genetic position of a gene or genes by their gene name
#'
#' @description
#'
#' \code{get_gene()} Get gene coordinates by its gene symbol/name
#' Required parameters is on gene name or a vector of gene names
#'
#' @param gene_name string A gene name (e.g. "FTO")
#' @param chr Search for the gene on this chromosome
#' @return Dataframe of genes
#' @export
#'
#' @examples
#' \dontrun{
#' get_gene("FTO")
#' }
#'

get_gene  <- function(gene_name,chr=NULL){
  if(! is.null(chr)){
    chr <- gsub("chr", "", chr)
    chr <- paste("chr",chr,sep="")
    gene <- topr::ENSGENES %>% dplyr::filter(chrom == chr) %>% dplyr::filter(gene_symbol == gene_name)
  }
  else{
    gene <- topr::ENSGENES %>% dplyr::filter(gene_symbol == gene_name)
  }
  if(is.null(gene)){
    print(paste("Could not find a gene with the gene name: [", gene_name, "]", sep=""))
  }
  return(gene)
}

get_legend<-function(p1){
  tmp <- ggplot_gtable(ggplot_build(p1))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}



#' Get the top hit from the dataframe
#'
#' @description
#'
#' \code{get_top_snp()} Get the top hit from the dataframe
#' All other input parameters are optional
#'
#' @param df Dataframe containing association results
#' @param chr Get the top hit in the data frame for this chromosome. If chromosome is not provided, the top hit from the entire dataset is returned.

#' @return Dataframe containing the top hit
#' @export
#'
#' @examples
#' \dontrun{
#' data(gwas_CD)
#' get_top_snp(gwas_CD, chr="chr1")
#' }
#'
get_top_snp <- function(df, chr=NULL){
  if (!is.null(chr)) {
    chr_tmp <- gsub('chr','',chr)
    chr <- paste("chr",chr_tmp, sep="")
    df <- df %>%
      filter(CHROM == chr)
  } else {
    print("Returning the top hit over the entire genome, since chromosome was not provided as an argument")
  }
  top_hit <- df %>%
    dplyr::arrange(P) %>%
    head(n = 1)

  return(top_hit)
}

get_top_hit <- function(df, chr=NULL){
  get_top_snp(df, chr=chr)
}


#' Get the top hit from the dataframe
#'
#' @description
#'
#' \code{get_topr_colors()} Get the top hit from the dataframe
#' All other input parameters are optional
#'
#' @return Vector of colors used for plotting
#' @export
#'
#' @examples
#' \dontrun{
#' get_topr_colors()
#' }
#'
get_topr_colors <- function(){
  return(c("darkblue","#E69F00","#00AFBB","#999999","#FC4E07","darkorange1","darkgreen","blue","red"))
}

set_genes_pos_adj <- function(genes, offsets){
  #CHROM,POS,Gene_Symbol
  genes <- genes %>% dplyr::mutate(CHROM=gsub('chr','',CHROM))
  genes[genes$CHROM=='X', 'CHROM'] <- "23"
  genes$CHROM <- as.integer(genes$CHROM)
  genes <- genes %>% dplyr::mutate(pos_adj=POS+offsets[CHROM])
  return(genes)
}

#' Get SNPs/variants within region
#'
#' @description
#'
#' \code{get_snps_within_region()}
#'
#' @param df data frame of association results with the columns CHR and POS
#' @param region string representing the genetic region (e.g chr16:50693587-50734041)
#' @param chr chromosome (e.g. chr16)
#' @param xmin include variants with POS larger than xmin
#' @param xmax include variants with POS smaller than xmax
#' @return the variants within the requested region
#' @export
#'
#' @examples
#' \dontrun{
#' get_snps_within_region(dat, "chr16:50593587-50834041")
#' }
#'

get_snps_within_region <- function(df, region, chr=NULL, xmin=NULL, xmax=NULL){
  snps <- NULL
  dat <- dat_check(df)
  if(!is.null(region)){
    tmp <- unlist(stringr::str_split(region, ":"))
    chr <- tmp[1]
    tmp_pos <- unlist(stringr::str_split(tmp[2], "-"))
    xmin <- tmp_pos[1]
    xmax <- tmp_pos[2]
  }

  chr <- gsub("chr", "", chr)
  chr <- gsub("X", "23", chr)
  if(!is.null(chr) & !is.null(xmin) & !is.null(xmax)){
    snps <- dat[[1]] %>% filter(CHROM == chr & POS >= xmin & POS <= xmax)
    if(length(snps$POS) < 1){
      print(paste("There are no SNPs within the region: ", chr, ":", xmin, "-", xmax, sep=""))
    }
  }
  return(snps)
}
wuxi-nextcode/topR documentation built on Dec. 23, 2021, 6:13 p.m.