R/GetNearbyGenes.R

Defines functions getRegionNearGenes calcDistNearestTSS addDistNearestTSS GetNearGenes

Documented in addDistNearestTSS calcDistNearestTSS GetNearGenes getRegionNearGenes

# NearGenes
# @param Target A charactor which is name of TRange or one of rownames of TBed.
# @param Gene A GRange object contains coordinates of promoters for human genome.
# @param geneNum A number determine how many gene will be collected from each 
# side of target (number shoule be even).
# @param TRange A GRange object contains coordinate of targets.
# @return A data frame of nearby genes and information: genes' IDs, genes' symbols, 
# distance with target and side to which the gene locate to the target.
#'@importFrom GenomicRanges strand<-
NearGenes <- function (Target = NULL,
                       Gene = NULL,
                       geneNum = 20,
                       TRange = NULL){
  # Algorithm:
  # 1) get the follow gene (overlapping genes are diconsidered) to be the first in L1 (index variable)
  #                 probe
  #     -------       O      ------
  #    |      |      ||     |     |
  #    -------       ||     ------
  #  follow gene          precede gene
  # 2) Sort genes (by start) 
  # 3.1) Get 9 genes before index (L1)
  #      If we only have l genes (l < 9) due to end of genomic region, get the l ones and get more 10-l to the right
  # 3.2) Get 10 after index (L1)
  #      If we only have r genes (r < 10) due to end of genomic region, get the r ones and get more 10-r to the left 
  # Where 10 is genum/2
  Gene$GENEID <- Gene$ensembl_gene_id
  if("external_gene_name" %in% colnames(S4Vectors::mcols(Gene))){
    Gene$SYMBOL <- Gene$external_gene_name
  } else  if("external_gene_id" %in% colnames(S4Vectors::mcols(Gene))){
    Gene$SYMBOL <- Gene$external_gene_id
  } else {
    stop("No gene symbol column found (expected external_gene_id or external_gene_name")
  }
  if(is.null(Gene) | is.null(Target)){
    stop ("Target and Genes should both be defined")
  }
  if(is.null(TRange)){
    stop( "TRange must be defined")
  }else{
    # Just to be sure we have only one probe. To be removed ?
    regionInfo <- TRange[names(TRange) %in% Target]
  }
  GeneIDs <- c()
  Distances <- c()
  strand(Gene) <- "*"
  # We will get only genes in the same same chromossome
  Gene <- Gene[as.character(seqnames(Gene)) %in% as.character(seqnames(regionInfo))]
  if(length(Gene)==0){
    warning(paste0(Target," don't have any nearby gene in the given gene list."))
    Final <- NA
  } else {
    Gene <- sort(Gene)
    index <- follow(regionInfo,Gene)
    #left side
    Leftlimit <- geneNum/2
    Rightlimit <- geneNum/2
    n <- 1
    if(is.na(index)){
      index<- 0
      Leftlimit <- 0
      Left <- c()
    }else if(index==1){
      Left <- index
      Leftlimit <- length(Left)
    }else{
      Left <- index
      while(length(Left) < Leftlimit){
        # If the gene is not in the list already add it, otherwise go to the next
        if(!as.character(Gene$GENEID[index-n]) %in% as.character(Gene$GENEID[Left])) Left <- c((index-n),Left) 
        
        # Is it the first gene? If so there is nothing in the left anymore
        if((index-n)==1) Leftlimit <- length(Left)
        n <- n + 1
      }
    }
    
    Right <- c()
    n <- 1
    if(index==length(Gene) || 
       all(unique(Gene$GENEID[(index+1):length(Gene)]) %in% as.character(Gene$GENEID[index]))){
      Rightlimit <- length(Right)
    }else{
      while(length(Right) < Rightlimit){
        if(!as.character(Gene$GENEID[index+n])%in% as.character(Gene$GENEID[c(Right,Left)])) 
          Right <- c(Right,(index+n))
        
        if(index+n==length(Gene)){
          Rightlimit <- length(Right)
        } else{
          n <- n+1
        }     
      }
    }
    
    if(Rightlimit < geneNum/2){
      n <- 1
      if(Left[1]-n > 0){
        while((length(Left)+length(Right)) < geneNum){
          if(!as.character(Gene$GENEID[Left[1]-n])%in%as.character(Gene$GENEID[c(Left,Right)])) 
            Left <- c((Left[1]-n),Left) 
          n <- n+1
        }
      }  
    }
    
    if(Leftlimit < geneNum/2){
      n <- 1
      m <- length(Right)
      if(Right[m]+n < length(Gene)+1)
        while((length(Left)+length(Right)) < geneNum){
          if(!as.character(Gene$GENEID[Right[m]+n])%in%as.character(Gene$GENEID[c(Left,Right)])) 
            Right <- c(Right,(Right[m]+n)) 
          n <- n+1
        } 
    }
    Whole <- c(Left,Right)
    GeneIDs <- Gene$GENEID[Whole]
    Symbols <- Gene$SYMBOL[Whole]
    Distances <-  suppressWarnings(distance(Gene[Whole],regionInfo))
    if(Rightlimit < 1){
      Sides <- paste0("L",length(Left):1)
    } else if( Leftlimit < 1){
      Sides <- paste0("R",1:length(Right))
    } else{
      Sides <- c(paste0("L",length(Left):1),paste0("R",1:length(Right)))
    }
    
    Final <- data.frame(Target=rep(Target,length(GeneIDs)),GeneID=GeneIDs,
                        Symbol=Symbols,Distance=Distances, Side=Sides, 
                        stringsAsFactors = FALSE)
    Final <- Final[order(Final$Side,Final$Distance),]
  }
  return(Final)
}

#' GetNearGenes to collect nearby genes for one locus.
#' @description 
#' GetNearGenes is a function to collect equal number of gene on each side of one locus.
#' It can receite either multi Assay Experiment with both DNA methylation and gene Expression matrix
#' and the names of probes to select nearby genes, or it can receive two granges objects TRange and geneAnnot.
#' @param data A multi Assay Experiment with both DNA methylation and gene Expression objects
#' @param probes Name of probes to get nearby genes (it should be rownames of the DNA methylation 
#' object in the data argument object)
#' @param geneAnnot A GRange object  or Summarized Experiment object that contains coordinates of promoters for 
#' human genome.
#' @param numFlankingGenes A number determines how many gene will be collected totally. 
#' Then the number devided by 2 is the number of genes collected from 
#' each side of targets (number shoule be even) Default to 20.
#' @param TRange A GRange object or Summarized Experiment object that contains coordinates of a list of targets loci.
#' @return A data frame of nearby genes and information: genes' IDs, genes' symbols, 
#' distance with target and side to which the gene locate to the target.
#' @export
#' @importFrom GenomicRanges strand follow distance 
#' @importFrom plyr alply
#' @importFrom doParallel registerDoParallel
#' @importFrom SummarizedExperiment rowRanges
#' @references 
#' Yao, Lijing, et al. "Inferring regulatory element landscapes and transcription 
#' factor networks from cancer methylomes." Genome biology 16.1 (2015): 1.
#' @examples
#' geneAnnot <- getTSS(genome = "hg38")
#' probe <- GenomicRanges::GRanges(seqnames = c("chr1","chr2"), 
#' range=IRanges::IRanges(start = c(16058489,236417627), end= c(16058489,236417627)), 
#' name= c("cg18108049","cg17125141"))
#' names(probe) <- c("cg18108049","cg17125141")
#' NearbyGenes <- GetNearGenes(numFlankingGenes = 20,geneAnnot=geneAnnot,TRange=probe)
GetNearGenes <- function(data = NULL,
                         probes = NULL,
                         geneAnnot = NULL,
                         TRange = NULL,
                         numFlankingGenes = 20){
  message("Searching for the ", numFlankingGenes, " near genes")
  if(!is.null(data)){
    if(is.null(probes)) stop("Please set the probes argument (names of probes to select nearby genes)")
    TRange <- subset(getMet(data), rownames(getMet(data)) %in% probes)
    geneAnnot <- getExp(data)
  }    
  if(is.null(TRange)){
    stop("TRange must be defined")
  }
  tssAnnot <- NULL
  if(is.null(geneAnnot)){
    if("genome" %in% names(metadata(data))){
      genome <- metadata(data)$genome
      tssAnnot <- getTSS(genome = genome)
      geneAnnot <- get.GRCh(genome = genome,as.granges = TRUE)
    }
  }
  
  if(class(TRange) == class(as(SummarizedExperiment(),"RangedSummarizedExperiment"))){
    TRange <- rowRanges(TRange)
  }
  if(class(geneAnnot) == class(as(SummarizedExperiment(),"RangedSummarizedExperiment"))){
    geneAnnot <- rowRanges(geneAnnot)
  }
  
  if(is.null(names(TRange))) {
    if(is.null(TRange$name)) stop("No probe names found in TRange")
    names(TRange) <- TRange$name
  }
  
  NearGenes <-
    getRegionNearGenes(
      numFlankingGenes = numFlankingGenes,
      geneAnnot = geneAnnot,
      tssAnnot = tssAnnot,
      TRange = TRange
    )
  return(NearGenes)
}


#' @title Calculate the distance between probe and gene TSS
#' @description Calculate the distance between probe and gene TSS
#' @param data A multi Assay Experiment with both DNA methylation and gene Expression objects
#' @param NearGenes A list or a data frame with the pairs gene probes
#' @param cores Number fo cores to be used. Deafult: 1
#' @param met.platform DNA methyaltion platform to retrieve data from: EPIC or 450K (default)
#' @param genome Which genome build will be used: hg38 (default) or hg19.
#' @export
#' @examples 
#' \dontrun{
#'  data <- ELMER:::getdata("elmer.data.example")
#'  NearbyGenes <- GetNearGenes(
#'   data = data, 
#'   probes = c("cg15924102", "cg24741609"),  
#'   numFlankingGenes = 20
#'  )
#'  NearbyGenes <- addDistNearestTSS(data = data, NearGenes = NearbyGenes)
#' }
addDistNearestTSS <- function(
  data,
  NearGenes,
  genome,
  met.platform,
  cores = 1
) {
  
  if(missing(NearGenes)) stop("Please set NearGenes argument")
  
  # used to recover TSS information
  if(missing(data) & missing(genome)) {
    stop("Please set data argument or genome arguments")
  }
  
  # For a given probe/region and gene find nearest TSS distance
  if(!missing(data)){
    tss <- getTSS(metadata(data)$genome)
  } else {
    tss <- getTSS(genome = genome)
  }
  
  message("Update the distance to gene to distance to the nearest TSS of the gene")
  
  # If our input has the probe names we will have to recover the probe metadata to map.
  region <- FALSE
  if(!missing(data)){
    met <- rowRanges(getMet(data))
  } else if(!missing(met.platform)){
    met <- getInfiniumAnnotation(plat = met.platform, genome = genome)
  } else {
    region <- TRUE
    met <- NearGenes %>% tidyr::separate("Target", c("seqnames","start","end"), 
                                         sep = ":|-", remove = FALSE,
                                         convert = FALSE, extra = "warn", fill = "warn") %>% 
      makeGRangesFromDataFrame(keep.extra.columns = TRUE)
  }
  
  NearGenes <- calcDistNearestTSS(links = NearGenes,TRange = met,tssAnnot = tss)
  
  return(NearGenes)
}

#' @title Calculate distance from region to nearest TSS
#' @description 
#' Idea 
#' For a given region R linked to X genes G
#' merge R with nearest TSS for G (multiple)
#' this will increse nb of lines 
#' i.e R1 - G1 - TSS1 - DIST1
#'    R1 - G1 - TSS2 - DIST2
#'    To vectorize the code:
#' make a granges from left and onde from right 
#' and find distance
#' collapse the results keeping min distance for equals values
#' @param links Links to calculate the distance
#' @param TRange Genomic coordinates for Tartget region
#' @param tssAnnot TSS annotation
#' @importFrom dplyr slice left_join group_by_
#' @examples 
#' \dontrun{
#'  data <- ELMER:::getdata("elmer.data.example")
#'   NearbyGenes <- GetNearGenes(
#'     data = data, 
#'     probes = c("cg15924102", "cg24741609"),  
#'     numFlankingGenes = 20
#'  )
#'  
#'   NearbyGenes <- ELMER:::calcDistNearestTSS(
#'        links = NearbyGenes,
#'        tssAnnot =  getTSS(genome = "hg38"),
#'        TRange = rowRanges(getMet(data))
#'   )
#' }
#' @author Tiago C. Silva
calcDistNearestTSS <- function(
  links,
  TRange,
  tssAnnot
){
  
  message("calculating Distance to nearest TSS")
  if(!is(tssAnnot,"GenomicRanges")){
    stop("tssAnnot is not a GenomicRanges")
  }
  
  if(!is(TRange,"GenomicRanges")){
    stop("tssAnnot is not a GenomicRanges")
  }
  
  if(!"ID" %in% colnames(values(TRange))){
    TRange$ID <- names(TRange)
  }
  
  if(!"ensembl_gene_id" %in% colnames(links)){
    colnames(links)[grep("GeneID", colnames(links))] <- "ensembl_gene_id"
  }
  
  merged <- dplyr::left_join(
    links,
    suppressWarnings(tibble::as_tibble(tssAnnot)),
    by = c("ensembl_gene_id")
  )
  
  merged <- dplyr::left_join(
    merged,
    suppressWarnings(tibble::as_tibble(TRange)),
    by = c("ID")
  )
  
  # In case a gene was removed from newer versions and not mapped
  merged <- merged[!is.na(merged$transcription_start_site),]
  
  left <- makeGRangesFromDataFrame(
    merged,
    start.field = "transcription_start_site",
    end.field = "transcription_start_site",
    seqnames.field = "seqnames.x",
    strand.field = "strand.x",
    ignore.strand = FALSE
  )
  
  right <- makeGRangesFromDataFrame(
    merged,
    start.field = "start.y",
    end.field = "end.y",
    strand.field = "strand.y",
    seqnames.field = "seqnames.y",
    ignore.strand = FALSE
  )
  
  merged$DistanceTSS <- distance(left,right,ignore.strand = TRUE)
  merged <- unique(merged[,c("ID","ensembl_gene_id","DistanceTSS")])
  
  ret <- merged %>% 
    dplyr::group_by(.data$ID,.data$ensembl_gene_id) %>% 
    dplyr::slice(which.min(DistanceTSS))
  
  #ret <- ret[match(links %>% tidyr::unite(ID,Target,GeneID) %>% pull(ID),
  #          ret %>% tidyr::unite(ID,Target,GeneID) %>% pull(ID)),]
  links <- dplyr::full_join(links,ret)
  colnames(links)[1:3] <- c("ID","GeneID","Symbol")
  return(links)
}

#' @title  Identifies nearest genes to a region
#' @description 
#'  Auxiliary function for GetNearGenes
#'  This will get the closest genes (n=numFlankingGenes) for a target region (TRange)
#'  based on a genome of refenrece gene annotation (geneAnnot). If the
#'  transcript level annotation (tssAnnot) is provided the Distance will be updated to
#'  the distance to the nearest TSS.
#' @param geneAnnot A GRange object contains gene coordinates of for human genome.
#' @param tssAnnot A GRange object contains tss coordinates of for human genome.
#' @param numFlankingGenes A number determine how many gene will be collected from each 
# side of target (number shoule be even).
#' @param TRange A GRange object contains coordinate of targets.
#' @return A data frame of nearby genes and information: genes' IDs, genes' symbols, 
# distance with target and side to which the gene locate to the target.
#' @examples
#' geneAnnot <-  ELMER:::get.GRCh("hg38",as.granges = TRUE)
#' tssAnnot <-  getTSS(genome = "hg38")
#' probe <- GenomicRanges::GRanges(seqnames = c("chr1","chr2"), 
#' range=IRanges::IRanges(start = c(16058489,236417627), end= c(16058489,236417627)), 
#' name= c("cg18108049","cg17125141"))
#' names(probe) <- c("cg18108049","cg17125141")
#' NearbyGenes <- getRegionNearGenes(numFlankingGenes = 20,
#'                              geneAnnot = geneAnnot,
#'                              TRange = probe,
#'                              tssAnnot = tssAnnot)
#' @importFrom GenomicRanges nearest precede follow
#' @importFrom tibble as_tibble                              
#' @importFrom dplyr group_by do   
#' @importFrom progress progress_bar             
#' @author 
#' Tiago C Silva (maintainer: tiagochst@usp.br)
#' @export
getRegionNearGenes <- function(TRange = NULL,
                               numFlankingGenes = 20,
                               geneAnnot = NULL,
                               tssAnnot = NULL){
  
  
  pb <- progress::progress_bar$new(total = numFlankingGenes * 2)
  
  TRange$ID <- names(TRange)
  
  # We will consider the input at a gene level only
  if(! "ensembl_gene_id" %in%  colnames(S4Vectors::mcols(geneAnnot))){
    message("geneAnnot needs the following column ensembl_gene_id")
  }
  geneAnnot <- geneAnnot[!duplicated(geneAnnot$ensembl_gene_id)]
  
  
  # Optimized version
  # Idea: vectorize search
  # 1) For all regions, get nearest gene
  # 2) check follow and overlapping genes recursively
  # 3) check precede and overlapping genes recursively
  # 4) map the positions based on min distance (L1)
  # The input data has to be at gene level and not transcript which would broke
  # some of the optimizations for which we remove the genes already evaluated
  all <- 1:length(TRange)
  nearest.idx <-
    nearest(TRange,
            geneAnnot,
            select = "all",
            ignore.strand = TRUE)
  idx <- suppressWarnings(tibble::as_tibble(nearest.idx))
  evaluating <- idx$queryHits
  suppressWarnings({
    ret <-
      cbind(
        suppressWarnings(tibble::as_tibble(geneAnnot[idx$subjectHits])),
        tibble::tibble(
          "ID" = names(TRange)[idx$queryHits],
          "Distance" = distance(TRange[idx$queryHits],
                                geneAnnot[idx$subjectHits], select = "all", ignore.strand = TRUE) *
            ifelse(start(TRange[evaluating]) < start(geneAnnot[idx$subjectHits]), 1,-1) 
          
        )
      )
  })
  for (i in 1:(numFlankingGenes)) {
    idx <-
      unique(rbind(
        suppressWarnings(tibble::as_tibble(
          findOverlaps(
            geneAnnot[idx$subjectHits],
            geneAnnot,
            ignore.strand = TRUE,
            type = "any",
            select = "all"
          )
        )),
        suppressWarnings(tibble::as_tibble(
          precede(
            geneAnnot[idx$subjectHits],
            geneAnnot,
            select = "all",
            ignore.strand = TRUE
          )
        ))
      ))
    idx$evaluating <-  evaluating[idx$queryHits]
    # remove same target gene and probe if counted twice
    idx <- idx[!duplicated(idx[, 2:3]), ]
    
    # todo remove already evaluated previously (we don't wanna do it again)
    idx <-
      idx[!paste0(geneAnnot[idx$subjectHits]$ensembl_gene_id, names(TRange)[idx$evaluating]) %in% paste0(ret$ensembl_gene_id, ret$ID), ]
    evaluating <- evaluating[idx$queryHits]
    ret <-
      rbind(ret, # keep old results
            cbind(
              suppressWarnings(
                tibble::as_tibble(geneAnnot[idx$subjectHits])),
              tibble::tibble(
                "ID" = names(TRange)[evaluating],
                "Distance" = ifelse(start(TRange[evaluating]) < start(geneAnnot[idx$subjectHits]), 1,-1) * 
                  distance(TRange[evaluating],
                           geneAnnot[idx$subjectHits], select = "all",
                           ignore.strand = TRUE)
              )
            ))
    pb$tick()
  }
  ret <- ret[!duplicated(ret[,c("ensembl_gene_id","ID")]),]
  
  idx <- suppressWarnings(tibble::as_tibble(nearest.idx))
  evaluating <- idx$queryHits
  for (i in 1:(numFlankingGenes)) {
    idx <-
      unique(rbind(
        suppressWarnings(tibble::as_tibble(
          findOverlaps(
            geneAnnot[idx$subjectHits],
            geneAnnot,
            ignore.strand = TRUE,
            type = "any",
            select = "all"
          )
        )),
        suppressWarnings(tibble::as_tibble(
          follow(
            geneAnnot[idx$subjectHits],
            geneAnnot,
            select = "all",
            ignore.strand = TRUE
          )
        ))
      ))
    idx$evaluating <-  evaluating[idx$queryHits]
    idx <- idx[!duplicated(idx[, 2:3]), ]
    idx <-
      idx[!paste0(geneAnnot[idx$subjectHits]$ensembl_gene_id, names(TRange)[idx$evaluating]) %in% paste0(ret$ensembl_gene_id, ret$ID), ]
    evaluating <- evaluating[idx$queryHits]
    ret <-
      rbind(ret, cbind(
        suppressWarnings(tibble::as_tibble(geneAnnot[idx$subjectHits])),
        suppressWarnings(tibble::tibble(
          "ID" = names(TRange)[evaluating],
          "Distance" = ifelse(start(TRange[evaluating]) < start(geneAnnot[idx$subjectHits]), 1,-1) * 
            distance(TRange[evaluating],
                     geneAnnot[idx$subjectHits], select = "all",ignore.strand = TRUE)
        ))
      ))
    pb$tick()
  }
  ret <- ret[!duplicated(ret[,c("ensembl_gene_id","ID")]),]
  ret <- ret[order(ret$Distance),]
  
  ret <- ret[, c("ID",
                 "ensembl_gene_id", 
                 grep("external_gene_", colnames(ret), value = TRUE),
                 "Distance")]
  
  message("Identifying gene position for each probe") 
  f <- function(x) {
    center <- which(abs(x$Distance) == min(abs(x$Distance)))[1]
    pos <- setdiff(-center:(nrow(x) - center), 0)
    x$Side <- ifelse(pos > 0, paste0("R", abs(pos)), paste0("L", abs(pos)))
    out <-  x %>% dplyr::filter(x$Side %in% c(paste0("R", 1:(numFlankingGenes / 2)), 
                                              paste0("L", 1:(numFlankingGenes / 2))
    ))
    if (nrow(out) < numFlankingGenes) {
      if (paste0("R", floor(numFlankingGenes / 2)) %in% out$Side) {
        cts <- length(grep("L", sort(x$Side), value = TRUE))
        out <- x %>% dplyr::filter(Side %in% c(paste0("R", 1:(numFlankingGenes - cts)),
                                               grep("L", sort(out$Side), value = TRUE)))
      } else {
        cts <- length(grep("R", sort(x$Side), value = TRUE))
        out <- x %>% dplyr::filter(x$Side %in% 
                                     c(paste0("L", 1:(numFlankingGenes - cts)),
                                       grep("R", sort(out$Side), value = TRUE))
        )
      }
    }
    out <- out[order(out$Distance), ]
    return(out)
  }
  ret <- ret %>% group_by(ID) %>% do(f(.))
  
  if (!is.null(tssAnnot)) {
    message("Calculating distance to nearest TSS")
    ret <- calcDistNearestTSS(
      links = ret,
      TRange = TRange,
      tssAnnot = tssAnnot
    )
  }
  
  if(any(grepl("external_gene_", colnames(ret)))){
    colnames(ret)[1:3] <- c("ID", "GeneID", "Symbol")
  } else {
    colnames(ret)[1:2] <- c("ID", "GeneID")
  }
  pb$terminate()
  return(ret)
}

Try the ELMER package in your browser

Any scripts or data that you put into this service are public.

ELMER documentation built on Nov. 8, 2020, 4:59 p.m.