R/cna2gene.R

Defines functions cna2gene

Documented in cna2gene

#' @title cna2gene
#'  
#' @param seg seg object generated by \code{\link{readSegment}} function.
#' @param txdb A TxDb object. i.e., TxDb.Hsapiens.UCSC.hg19.knownGene. Default NULL.
#' @param min.overlap.len The minimum insertion size of segment and gene. Default 50.
#' @param geneList The list of genes used to limit the annotation.Default NULL. 
#' 
#' @examples
#' segFile <- system.file("extdata", "CRC_HZ.seg.txt", package = "MesKit")
#' gisticAmpGenesFile <- system.file("extdata", "COREAD_amp_genes.conf_99.txt", package = "MesKit")
#' gisticDelGenesFile <- system.file("extdata", "COREAD_del_genes.conf_99.txt", package = "MesKit")
#' gisticAllLesionsFile <- system.file("extdata", "COREAD_all_lesions.conf_99.txt", package = "MesKit")
#' seg <- readSegment(segFile = segFile,
#'                    gisticAmpGenesFile = gisticAmpGenesFile,
#'                     gisticDelGenesFile = gisticDelGenesFile, 
#'                    gisticAllLesionsFile = gisticAllLesionsFile)
#'                    
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' library(org.Hs.eg.db)
#' cna2gene(seg, txdb = TxDb.Hsapiens.UCSC.hg19.knownGene)
#' @return seg object
#' @export cna2gene

cna2gene <- function(seg, txdb, min.overlap.len = 50, geneList = NULL){
  
  ## combine data frame
  if(is(seg, "list")){
    seg <- dplyr::bind_rows(seg) %>% as.data.table()
  }
  seg$Chromosome = gsub(pattern = 'X', replacement = '23', x = seg$Chromosome, fixed = TRUE)
  seg$Chromosome = gsub(pattern = 'Y', replacement = '24', x = seg$Chromosome, fixed = TRUE)
  seg$Chromosome <- as.numeric(seg$Chromosome)
  # annotate seg with gene

  allgenes <-  suppressMessages(as.data.table(genes(txdb)))
  allgenes$seqnames = gsub(pattern = 'X', replacement = '23', x = allgenes$seqnames, fixed = TRUE)
  allgenes$seqnames = gsub(pattern = 'Y', replacement = '24', x = allgenes$seqnames, fixed = TRUE)
  gene_symbols <- suppressMessages(AnnotationDbi::select(org.Hs.eg.db,keys=allgenes$gene_id,columns="SYMBOL",keytype="ENTREZID")$SYMBOL)
  allgenes <- allgenes %>%
      dplyr::mutate(Hugo_Symbol = gene_symbols) %>%
      dplyr::rename(Chromosome = seqnames,
             Start_Position = start,
             End_Position = end) %>%
      dplyr::mutate(Chromosome = gsub("chr","",Chromosome)) %>%  
    dplyr::filter(Chromosome %in% seq_len(24)) %>% 
    dplyr::mutate(Chromosome = as.numeric(Chromosome))

  if(!is.null(geneList)){
     gene.setdiff <- setdiff(geneList, gene_symbols)
     if(length(gene.setdiff) > 0){
        stop(paste0("Gene: ", gene.setdiff, " can not be found in Entrez Gene"))
     }
    allgenes <- allgenes[Hugo_Symbol %in% geneList]
  }
  
  data.table::setkey(x = seg, Chromosome, Start_Position, End_Position)
  data.table::setkey(x = allgenes,Chromosome, Start_Position, End_Position)
  ## map gene to seg
  anno_result <-  data.table::foverlaps(seg,allgenes,
                                        by.x = c("Chromosome","Start_Position","End_Position")) %>%
    na.omit()
  if(nrow(anno_result) == 0){
    stop("No genes mapped to seg.")
  } 
  anno_result$id <- seq_len(nrow(anno_result))
  
  anno_result$seg_id <- paste0(
    anno_result$Patient_ID,"&",
    anno_result$Tumor_Sample_Barcode,"&",
    anno_result$Chromosome,"&",
    anno_result$i.Start_Position,"&",
    anno_result$i.End_Position
    )
  
  anno_result$gene_id <- paste0(
    anno_result$Hugo_Symbol,"&",
    anno_result$Chromosome,"&",
    anno_result$i.Start_Position,"&",
    anno_result$i.End_Position
  )
  
  ## seg end position > gene start position, 
  ## seg end position < gene end position
  ar1 <- anno_result %>%
    dplyr::filter(
      (i.End_Position - Start_Position) > min.overlap.len,
      i.End_Position <= End_Position,
      i.Start_Position <= Start_Position
    ) %>% 
      dplyr::mutate(
        overlap_width = i.End_Position - Start_Position
      )
  ## gene end position > seg start position, 
  ## gene end position < seg end position
  ar2 <- anno_result %>% 
    dplyr::filter(
      (End_Position - i.Start_Position) > min.overlap.len,
      End_Position <= i.End_Position,
      Start_Position <= i.Start_Position
    ) %>% 
    dplyr::mutate(
      overlap_width = End_Position - i.Start_Position
    )
  ## gene in seg
  ar3 <- anno_result %>% 
    dplyr::filter(
      i.End_Position >= End_Position,
      i.Start_Position <= Start_Position
    ) %>% 
    dplyr::mutate(
      overlap_width = End_Position - Start_Position
    )
  ## seg in gene
  ar4 <- anno_result %>% 
    dplyr::filter(
      End_Position >= i.End_Position,
      Start_Position <= i.Start_Position
    )%>% 
    dplyr::mutate(
      overlap_width = i.End_Position - i.Start_Position
    )
  result <- rbind(ar1,ar2,ar3,ar4) %>% 
    dplyr::distinct(id, .keep_all = TRUE) %>% 
    dplyr::group_by(Patient_ID, Tumor_Sample_Barcode, seg_id) %>%
    dplyr::filter(overlap_width == max(overlap_width)) %>%
    dplyr::ungroup() %>%
    as.data.frame()
  if("Gistic_type" %in% colnames(result)){
    if("Tumor_Sample_Label" %in% colnames(result)){
      result <- result %>% 
        dplyr::select(
          "Patient_ID",
          "Hugo_Symbol",
          "Tumor_Sample_Barcode",
          "Tumor_Sample_Label",
          "Cytoband",
          "Cytoband_pos",
          "Chromosome", 
          "i.Start_Position", 
          "i.End_Position",
          "CopyNumber",
          "Type",
          "Gistic_type"
        )
    }else{
      result <- result %>% 
        dplyr::select(
          "Patient_ID",
          "Hugo_Symbol",
          "Tumor_Sample_Barcode",
          "Cytoband",
          "Cytoband_pos",
          "Chromosome", 
          "i.Start_Position", 
          "i.End_Position",
          "CopyNumber",
          "Type",
          "Gistic_type"
        )
    }
  }else{
    if("Tumor_Sample_Label" %in% colnames(result)){
      result <- result %>% 
        dplyr::select(
          "Patient_ID",
          "Hugo_Symbol",
          "Tumor_Sample_Barcode",
          "Tumor_Sample_Label",
          "Chromosome", 
          "i.Start_Position", 
          "i.End_Position",
          "CopyNumber",
          "Type",
        )
    }else{
      result <- result %>% 
        dplyr::select(
          "Patient_ID",
          "Hugo_Symbol",
          "Tumor_Sample_Barcode",
          "Chromosome", 
          "i.Start_Position", 
          "i.End_Position",
          "CopyNumber",
          "Type"
        )
    }
  }
  
  
  result <- result %>% 
    dplyr::rename(
      "End_Position" = "i.End_Position",
      "Start_Position" = "i.Start_Position"
    )
  return(result)
  # t <- anno_result[Patient_ID == "V402"&Tumor_Sample_Barcode == "T1"]
}

Try the MesKit package in your browser

Any scripts or data that you put into this service are public.

MesKit documentation built on March 28, 2021, 6 p.m.