R/readSegment.R

Defines functions readSegment

Documented in readSegment

#' @title readSegment
#'  
#' @param segFile The segment file.
#' @param gisticAmpGenesFile Amplification Genes file generated by GISTIC. Default NULL.
#' @param gisticDelGenesFile Deletion Genes file generated by GISTIC. Default NULL.
#' @param gisticAllLesionsFile Information of all lesions generated by GISTIC. Default NULL.
#' @param gistic.qval The threshold of gistic Q value. Default 0.25.
#' @param verbose Whether to display details in the console. Default TRUE.
#' @param min.seg.size The smallest size of segments. Default 500.
#' @param txdb  A TxDb object. i.e., TxDb.Hsapiens.UCSC.hg19.knownGene. Default NULL.
#' @param min.overlap.len The minimum insertion size of segment and gene. Default 50.
#' @param ... ... Other options passed to \code{\link{cna2gene}}.
#' 
#' @examples
#' segFile <- system.file("extdata", "CRC_HZ.seg.txt", package = "MesKit")
#' gisticAmpGenesFile <- system.file("extdata", "COREAD_amp_genes.conf_99.txt", package = "MesKit")
#' gisticDelGenesFile <- system.file("extdata", "COREAD_del_genes.conf_99.txt", package = "MesKit")
#' gisticAllLesionsFile <- system.file("extdata", "COREAD_all_lesions.conf_99.txt", package = "MesKit")
#' seg <- readSegment(segFile = segFile,
#'                    gisticAmpGenesFile = gisticAmpGenesFile,
#'                    gisticDelGenesFile = gisticDelGenesFile,
#'                    gisticAllLesionsFile = gisticAllLesionsFile)
#'
#' @return a list of segmentation data frame
#' @importFrom data.table foverlaps as.data.table setkey rbindlist
#' @export readSegment
#'

readSegment <- function(segFile,
                        gisticAmpGenesFile = NULL, 
                        gisticDelGenesFile = NULL,
                        gisticAllLesionsFile = NULL,
                        gistic.qval = 0.25,
                        min.seg.size = 500,
                        txdb = NULL,
                        min.overlap.len = 50,
                        verbose = TRUE,
                        ...){
  seg <- suppressWarnings(data.table::fread(segFile, header=TRUE, sep="\t", stringsAsFactors = FALSE))
  
  ## valid seg
  seg <- validSeg(seg)
  
  if(!"CopyNumber" %in% colnames(seg)){
    if("SegmentMean" %in% colnames(seg)){
        seg$SegmentMean <- as.numeric(seg$SegmentMean)
        seg$CopyNumber <- round(2^(seg$SegmentMean)*2)
        if("Tumor_Sample_Label" %in% colnames(seg)){
          seg <- dplyr::select(seg, "Patient_ID", "Tumor_Sample_Barcode", "Tumor_Sample_Label", "Chromosome", "Start_Position", "End_Position", "CopyNumber")
        }else{
          seg <- dplyr::select(seg, "Patient_ID", "Tumor_Sample_Barcode", "Chromosome", "Start_Position", "End_Position", "CopyNumber")
        }
    }
    else{
        stop("Neither copy number nor SegmentMean information was found in seg object.")  
    }
  }
  
  Minor_Major_CN <- data.frame()
  if("Minor_CN" %in% colnames(seg) & "Major_CN" %in% colnames(seg)){
    Minor_Major_CN <- seg %>%
      dplyr::select("Patient_ID",
                    "Tumor_Sample_Barcode",
                    "Chromosome",
                    "Start_Position",
                    "End_Position",
                    "Minor_CN",
                    "Major_CN")
  }
  
  seg$CopyNumber <- as.numeric(seg$CopyNumber)
  
  seg$Width <- seg$End_Position - seg$Start_Position
  seg <- dplyr::mutate(seg, Type = unlist(lapply(seg$CopyNumber, function(x){
      if(x == 0){
          return("Deletion")
      }
      else if(x == 1){
          return("Loss")
      }
      else if(x == 2){
          return("Neutral")
      }
      else if(x == 3){
          return("Gain")
      }
      else if(x >=4){
          return("Amplification")
      }
  }))) %>%
      dplyr::filter(.data$Width > min.seg.size)
  
  if("Tumor_Sample_Label" %in% colnames(seg)){
    seg <-  seg %>% 
      dplyr::select("Patient_ID", "Tumor_Sample_Barcode", "Tumor_Sample_Label", "Chromosome", "Start_Position", "End_Position", "CopyNumber", "Type") %>% 
      as.data.table()
  }else{
    seg <- seg %>% 
      dplyr::select("Patient_ID", "Tumor_Sample_Barcode", "Chromosome", "Start_Position", "End_Position", "CopyNumber", "Type") %>% 
      as.data.table()
  }
  seg$Chromosome <- as.numeric(seg$Chromosome)
  seg$Start_Position <- as.numeric(seg$Start_Position)
  seg$End_Position <- as.numeric(seg$End_Position)
  data.table::setkey(x = seg, "Chromosome", "Start_Position", "End_Position")
  
  gisticCNVgenes <- data.table::data.table()
  if(!is.null(gisticAmpGenesFile)){
      if(verbose){
          cat(paste0('--Processing ', basename(gisticAmpGenesFile), '\n'))
      }
      gisticAmpGenes <- readGisticGene(gisticAmpGenesFile, "AMP")
      gisticCNVgenes <- rbind(gisticCNVgenes, gisticAmpGenes)
  }   
  if(!is.null(gisticDelGenesFile)){
      if(verbose){
          cat(paste0('--Processing ', basename(gisticDelGenesFile), '\n'))
      }
      gisticDelGenes <- readGisticGene(gisticDelGenesFile, "Del")
      gisticCNVgenes <- rbind(gisticCNVgenes, gisticDelGenes)
  }
  gisticLesions <- data.table::data.table()
  if(!is.null(gisticAllLesionsFile)){
      gisticLesions <- as.data.table(readGisticAllLesions(gisticAllLesionsFile, verbose = verbose)) %>%
                       dplyr::rename("Wide_Peak_Boundaries" = "WPB")
      if("Amp" %in% gisticLesions$Gistic_type){
          gisticLesions[gisticLesions$Gistic_type == "Amp",]$Gistic_type <- "AMP"
      }
  }
  if(nrow(gisticCNVgenes) != 0){
      if(!is.null(gisticAllLesionsFile)){
          gisticLesions$ID <- tidyr::unite(gisticLesions,"ID","Cytoband","Wide_Peak_Boundaries","Gistic_type",sep = "_")$ID
          gisticCNVgenes$ID <- tidyr::unite(gisticCNVgenes,"ID","Cytoband","Wide_Peak_Boundaries","Gistic_type",sep = "_")$ID
          gisticCNVgenes <- merge(gisticLesions,gisticCNVgenes,by = "ID") %>% 
                         dplyr::select("Cytoband.x", "Wide_Peak_Boundaries.x", "Gene", "Gistic_type.x", "Qvalue") %>%
                         dplyr::rename("Cytoband" = "Cytoband.x", "Wide_Peak_Boundaries" = "Wide_Peak_Boundaries.x",
                                       "Gistic_type" = "Gistic_type.x") %>%
                         dplyr::filter(.data$Qvalue < gistic.qval) %>% data.table::as.data.table()
          
      }
      ## combine segment file and gistic results
      gisticCNVgenes$Chromosome <-  unlist(lapply(gisticCNVgenes$Wide_Peak_Boundaries,function(x)strsplit(x, ":")[[1]][1]))  
      gisticCNVgenes$loc <-  unlist(lapply(gisticCNVgenes$Wide_Peak_Boundaries,function(x)strsplit(x, ":")[[1]][2]))  
      gisticCNVgenes$Start_Position <- unlist(lapply(gisticCNVgenes$loc,function(x)strsplit(x, "-")[[1]][1]))
      gisticCNVgenes$Start_Position <- as.numeric(gisticCNVgenes$Start_Position)
      gisticCNVgenes$End_Position <- unlist(lapply(gisticCNVgenes$loc,function(x)strsplit(x, "-")[[1]][2]))
      gisticCNVgenes$End_Position <- as.numeric(gisticCNVgenes$End_Position)
      gisticCNVgenes$Cytoband_pos <- as.integer(gisticCNVgenes$Start_Position + (gisticCNVgenes$End_Position - gisticCNVgenes$Start_Position)/2)
      gisticCNVgenes$Chromosome <- gsub(pattern = 'chr', replacement = '', x = gisticCNVgenes$Chromosome, fixed = TRUE)
      gisticCNVgenes$Chromosome <- gsub(pattern = 'X', replacement = '23', x = gisticCNVgenes$Chromosome, fixed = TRUE)
      gisticCNVgenes$Chromosome <- gsub(pattern = 'Y', replacement = '24', x = gisticCNVgenes$Chromosome, fixed = TRUE)
      
      gisticCNVgenes <- gisticCNVgenes %>% 
        dplyr::select(.data$Cytoband,
                      .data$Cytoband_pos, 
                      .data$Gene, 
                      .data$Gistic_type, 
                      .data$Chromosome, 
                      .data$Start_Position, 
                      .data$End_Position) %>% 
        dplyr::distinct(.data$Cytoband, .keep_all = TRUE)
      
      gisticCNVgenes$Chromosome <- as.numeric(gisticCNVgenes$Chromosome)
      gisticCNVgenes$Start_Position <- as.numeric(gisticCNVgenes$Start_Position)
      gisticCNVgenes$End_Position <- as.numeric(gisticCNVgenes$End_Position)
      data.table::setkey(x = gisticCNVgenes,"Chromosome", "Start_Position", "End_Position")
      mapDat <- data.table::foverlaps(gisticCNVgenes,seg, by.x = c("Chromosome","Start_Position","End_Position")) %>% 
                na.omit()
      if("Tumor_Sample_Label" %in% colnames(mapDat)){
        mapDat <- mapDat %>% 
          dplyr::select("Patient_ID", "Tumor_Sample_Barcode", "Tumor_Sample_Label", "Cytoband", "Cytoband_pos", "Chromosome", "Start_Position", "End_Position", "CopyNumber", "Type", "Gistic_type") %>% 
          as.data.table() %>%
        dplyr::distinct()
      }else{
        mapDat <-  mapDat %>% 
          dplyr::select("Patient_ID", "Tumor_Sample_Barcode", "Cytoband", "Cytoband_pos", "Chromosome", "Start_Position", "End_Position", "CopyNumber", "Type", "Gistic_type") %>% 
          dplyr::distinct()
      }
      seg <- mapDat[(mapDat$Type %in% c("Loss", "Deletion") & mapDat$Gistic_type  == "Del")|(mapDat$Type %in% c("Gain", "Amplification") & mapDat$Gistic_type  == "AMP"),]
  }else if(nrow(gisticLesions) != 0){
    gisticLesions$Chromosome <-  unlist(lapply(gisticLesions$Wide_Peak_Boundaries,function(x)strsplit(x, ":")[[1]][1]))  
    gisticLesions$loc <-  unlist(lapply(gisticLesions$Wide_Peak_Boundaries,function(x)strsplit(x, ":")[[1]][2]))  
    gisticLesions$Start_Position <- unlist(lapply(gisticLesions$loc,function(x)strsplit(x, "-")[[1]][1]))
    gisticLesions$Start_Position <- as.numeric(gisticLesions$Start_Position)
    gisticLesions$End_Position <- unlist(lapply(gisticLesions$loc,function(x)strsplit(x, "-")[[1]][2]))
    gisticLesions$End_Position <- as.numeric(gisticLesions$End_Position)
    gisticLesions$Cytoband_pos <- as.integer(gisticLesions$Start_Position + (gisticLesions$End_Position - gisticLesions$Start_Position)/2)
      gisticLesions$Chromosome <- gsub(pattern = 'chr', replacement = '', x = gisticLesions$Chromosome, fixed = TRUE)
      gisticLesions$Chromosome <- gsub(pattern = 'X', replacement = '23', x = gisticLesions$Chromosome, fixed = TRUE)
      gisticLesions$Chromosome <- gsub(pattern = 'Y', replacement = '24', x = gisticLesions$Chromosome, fixed = TRUE)
      gisticLesions <- dplyr::select(gisticLesions,"Cytoband", "Cytoband_pos", "Gistic_type", "Chromosome", "Start_Position", "End_Position", "Qvalue")
      gisticLesions$Chromosome <- as.numeric(gisticLesions$Chromosome)
      gisticLesions$Start_Position <- as.numeric(gisticLesions$Start_Position)
      gisticLesions$End_Position <- as.numeric(gisticLesions$End_Position)
      data.table::setkey(x = gisticLesions, "Chromosome", "Start_Position", "End_Position") 
      mapDat <- data.table::foverlaps(gisticLesions,seg, by.x = c("Chromosome","Start_Position","End_Position")) %>% 
          na.omit() %>% 
          dplyr::filter(.data$Qvalue < gistic.qval)
      
      if("Tumor_Sample_Label" %in% colnames(mapDat)){
        mapDat <- mapDat %>% 
          dplyr::select("Patient_ID", "Tumor_Sample_Barcode", "Tumor_Sample_Label", "Cytoband", "Cytoband_pos", "Chromosome", "Start_Position", "End_Position", "CopyNumber", "Type", "Gistic_type") %>%
          dplyr::distinct() %>% 
          data.table::as.data.table()
      }else{
        mapDat <- mapDat %>% 
          dplyr::select("Patient_ID", "Tumor_Sample_Barcode", "Cytoband", "Cytoband_pos", "Chromosome", "Start_Position", "End_Position", "CopyNumber", "Type", "Gistic_type") %>%
          dplyr::distinct() %>% 
          data.table::as.data.table()
      }
      
      seg <- mapDat[(mapDat$Type %in% c("Loss", "Deletion") & mapDat$Gistic_type  == "Del")|(mapDat$Type %in% c("Gain", "Amplification") & mapDat$Gistic_type  == "AMP"),]
  }
  
  data.table::setkey(x = seg, "Patient_ID", "Tumor_Sample_Barcode", "Chromosome", "Start_Position", "End_Position")
  
  seg$Chromosome = gsub(pattern = '23', replacement = 'X', x = seg$Chromosome, fixed = TRUE)
  seg$Chromosome = gsub(pattern = '24', replacement = 'Y', x = seg$Chromosome, fixed = TRUE)
  
  if(!is.null(txdb)){
    seg <- cna2gene(seg, txdb = txdb, min.overlap.len = min.overlap.len, ...)
  }
  
  if(nrow(Minor_Major_CN) > 0){
    Minor_Major_CN$Chromosome <- as.character(Minor_Major_CN$Chromosome)
    seg$Chromosome <- as.character(seg$Chromosome)
    seg <- dplyr::left_join(
      seg,
      Minor_Major_CN,
      by = c("Patient_ID",
             "Tumor_Sample_Barcode",
             "Chromosome",
             "Start_Position",
             "End_Position")
    )
  }
  
  result <- split(seg, seg$Patient_ID)
  
  if(length(result) == 1){
    return(result[[1]])
  }else{
    return(result)
  }

}

Try the MesKit package in your browser

Any scripts or data that you put into this service are public.

MesKit documentation built on March 28, 2021, 6 p.m.