R/fantom.R

Defines functions get_fantom5_variants get_fantom5_enhancers_from_hgnc prepare_fantom

Documented in get_fantom5_enhancers_from_hgnc get_fantom5_variants prepare_fantom

# ---- FANTOM ----

#' @title Generate data.frame from FANTOM5 enhancer file
#' @description Prepare FANTOM5 for \code{\link{get_fantom5_enhancers_from_hgnc}}
#'
#' @param enhancer_tss_association the "enhancer_tss_associations.bed" file from FANTOM5.
#' The file can be downloaded here:
#' https://slidebase.binf.ku.dk/human_enhancers/presets/serve/enhancer_tss_associations
#' @return a data.frame containing the file information
prepare_fantom <- function(enhancer_tss_association) {
  # TSS = Transcripton Start Site
  fantom <- utils::read.delim(enhancer_tss_association, header = F,
                              stringsAsFactors = FALSE)

  colnames(fantom) <- c("chrom", "chromStart", "chromEnd", "name", "score",
                        "strand", "thickStart", "thickEnd", "itemRgb",
                        "blockCount", "blockSizes", "chromStarts")

  # The enhancer position correspond to the 4th column of the bed file
  # SO we split it by ";" and get the chr, start, stop and gene symbol.
  fantom_df <- as.data.frame(splitstackshape::cSplit(fantom, splitCols="name",
                                                     sep=";", direction="wide"))

  locs <- strsplit(as.character(fantom_df$name_1),"[:-]")
  fantom_df$chr <- sapply(locs,"[",1)
  fantom_df$start <- as.numeric(sapply(locs,"[",2))
  fantom_df$end <- as.numeric(sapply(locs,"[",3))
  fantom_df$symbol <- fantom_df$name_3
  fantom_df$corr <- sub("R:","",fantom_df$name_4)
  fantom_df$fdr <- sub("FDR:","",fantom_df$name_5)

  return(fantom_df)
}


#' @title Get the enhancers associated to certain genes from FANTOM5
#' @description from the FANTOM5 dataset, get the enhancers associated to the
#' genes using the HGNC symbols, the association must pass the correlation threshold given
#' by the user. Is used internally by \code{\link{get_fantom5_variants}}
#'
#' @param fantom_df the output of \code{\link{prepare_fantom}}
#' @param hgnc_symbols vector of HUGO ids for the genes of interest
#' @param corr_threshold the minimum correlation (z-score) to consider a
#' enhancer/gene assocation valid (default: 0.25).
#' A z-score greater than 0 represents an element greater than the mean, this means
#' that this association has more correlation than random motifs.
#' @return a subset of the FANTOM5 data.frame containing the information about the
#' enhancers. The data.frame contains the following columns
#' \itemize{
#'   \item chr (chromosome)
#'   \item start (start of the enhancers)
#'   \item end (end of the enhancers)
#'   \item symbol (HGNC symbol of the gene associated to the enhancer)
#'   \item corr (correlation z-score from FANTOM5)
#'   \item fdr (False Discovery Rate)
#' }
#'

# vargen_install(install_dir = "./vargen_data/")
# fantom_df <- prepare_fantom("./vargen_data/enhancer_tss_associations.bed")
# get_fantom5_enhancers_from_hgnc(fantom_df = fantom_df,
#                                 hgnc_symbols = "FOXP3",
#                                 corr_threshold = 0.30)
get_fantom5_enhancers_from_hgnc <- function(fantom_df, hgnc_symbols,
                                            corr_threshold = 0.25){
  # Get the enhancers with a decent level of correlations & significance
  # Correlation is Pearson as a z-score:
  # "(pearson corr - (mean of random motifs)) / std(pearson of random motifs)"
  # https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0560-6
  # A z-score greater than 0 represents an element greater than the mean, this
  # means "more correlation than random motifs"
  enhancers_corr <- unique(subset(fantom_df,
                                  fantom_df$corr >= corr_threshold,# & fdr < fdr_threshold,
                                  select = c("chr", "start", "end", "symbol", "corr", "fdr"))
  )
  # Get list of enhancer related to list of genes
  return(enhancers_corr[(enhancers_corr$symbol %in% hgnc_symbols),])
}


#' @title Get variants on the enhancers of the list of genes given as input
#' @description FANTOM5 is used to get the enhancers of the genes, then the variants
#' located on the enhancers are fetched with \code{\link{get_variants_from_locations}}
#' It is also possible to specify a correlation threshold to limit the number
#' of association to output (default: 0.25). The enhancer tss association file
#' is based on hg19, so liftOver is needed.
#'
#' @param fantom_df the output of \code{\link{prepare_fantom}}
#' @param omim_genes output from \code{\link{get_omim_genes}}
#' @param corr_threshold the minimum correlation (z-score) to consider a
#' enhancer/gene association valid (default: 0.25). A z-score greater than 0
#' represents an element greater than the mean, this means that this association
#'  has more correlation than random motifs.
#' @param hg19ToHg38.over.chain the chain file to liftOver locations from
#' hg19 to hg38.
#' @param verbose if true, will print progress information (default: FALSE)
#'
#' @return a data.frame with information about the variants located on the enhancers.
#' The data.frame will contain the following columns:
#' \itemize{
#'   \item chr (chromosome)
#'   \item pos (position of the variant)
#'   \item rsid (variant ID)
#'   \item ensembl_gene_id ("gene id" of the gene associated with the variant)
#'   \item hgnc_symbol ("hgnc symbol" of the gene associated with the variant)
#'   \item source (here the value will be "fantom5")
#' }
#
# gene_mart <- connect_to_gene_ensembl()
# DM1_genes <- get_omim_genes(omim_ids = "222100", gene_mart = gene_mart)
#
# vargen_install(install_dir = "./vargen_data/")
# fantom_df <- prepare_fantom("./vargen_data/enhancer_tss_associations.bed")
# get_fantom5_variants(fantom_df, DM1_genes, 0.25, "hg19ToHg38.over.chain")
get_fantom5_variants <- function(fantom_df, omim_genes, corr_threshold = 0.25,
                                 hg19ToHg38.over.chain, verbose = FALSE) {
  fantom_variants <- data.frame()
  list.variants <- vector('list', nrow(omim_genes))

  for(gene in 1:nrow(omim_genes)){
    enhancers_df <- get_fantom5_enhancers_from_hgnc(fantom_df = fantom_df,
                                                    hgnc_symbols = omim_genes[gene, "hgnc_symbol"],
                                                    corr_threshold = corr_threshold)
    if(nrow(enhancers_df) != 0) {
      enhancers_df <- GenomicRanges::makeGRangesFromDataFrame(enhancers_df,
                                                              keep.extra.columns = TRUE)
      enhancers_df <- unlist(rtracklayer::liftOver(enhancers_df, rtracklayer::import.chain(hg19ToHg38.over.chain)))
      fantom_locs <- paste0(GenomeInfoDb::seqnames(enhancers_df), ":",
                            BiocGenerics::start(enhancers_df)-1, ":",
                            BiocGenerics::end(enhancers_df))
      fantom_locs <- sub("^chr", "", fantom_locs)


      variants_loc <- get_variants_from_locations(fantom_locs,
                                                  verbose = verbose)
      if(length(variants_loc) != 0){
        enhancer_variants <- cbind(ensembl_gene_id = omim_genes[gene, "ensembl_gene_id"],
                                   hgnc_symbol = omim_genes[gene, "hgnc_symbol"],
                                   variants_loc)

        enhancer_variants_df <- format_output(chr = unlist(enhancer_variants$seq_region_name),
                                              pos =  unlist(enhancer_variants$start),
                                              rsid = unlist(enhancer_variants$id),
                                              ensembl_gene_id = unique(enhancer_variants$ensembl_gene_id),
                                              hgnc_symbol = unique(enhancer_variants$hgnc_symbol))

        list.variants[[gene]] <- enhancer_variants_df
      }
    }
  }

  # Removing the NULL elements of the list if exists
  list.variants <- list.variants[!sapply(list.variants, is.null)]
  # Then concatenate the list into a data.frame
  fantom_variants <- do.call('rbind', list.variants)
  if(length(fantom_variants) != 0){
    fantom_variants$source <- "fantom5"
  }

  return(fantom_variants)
}
MCorentin/vargen documentation built on Feb. 6, 2024, 2:32 p.m.