R/getAlterations.R

Defines functions .checkRepos

#-------------------------------
# CHECK CUSTOM DATA CONSISTENCY
#
.checkRepos <- function(repos) {
  if (!is.list(repos)) {
    stop("repos should be a list")
  }
  if (length(repos) != 4) {
    stop("repos should be a list of length 4")
  }
  if (!all(names(repos) %in% c("fusions", "mutations"
                               ,  "copynumber", "expression"))) {
    stop("repos is not correctly formatted. alteration type missing")
  }
  reposNames <- lapply(repos , function(x)
    names(x))
  if (!all(vapply(reposNames , function(k)
    identical(k , c("data" , "Samples")) , logical(1)))) {
    stop(paste("Each alteration type should contain"
               , "two elements named data and Samples"))
  }
  kColnamesIndataFull <- list(
    fusions = c(
      "tumor_type"
      ,"case_id"
      ,"Gene_A"
      ,"Gene_B"
      ,"FusionPair"
      ,"tier"
      ,"frame"
    ),mutations = c(
      "entrez_gene_id"
      ,"gene_symbol"
      ,"case_id"
      ,"mutation_type"
      ,"amino_acid_change"
      ,"genetic_profile_id"
      ,"tumor_type"
      ,"amino_position"
      ,"genomic_position")
    ,copynumber = c("gene_symbol", "CNA", "case_id", "tumor_type", "CNAvalue")
    ,expression = c(
      "gene_symbol"
      ,"expression"
      ,"case_id"
      ,"tumor_type"
      ,"expressionValue")
  )
  for (i in names(kColnamesIndataFull)) {
    # if both data and Samples are NULL, skip
    if (is.null(repos[[i]]$Samples) & is.null(repos[[i]]$data)) {
      next
      # if data exists and samples is NULL, stop
    } else if (!is.null(repos[[i]]$data) &
               is.null(repos[[i]]$Samples)) {
      stop(paste("In" , i , ", Samples are NULL"))
    } else {
      # --------------------------
      # Check over data consistency before append
      # --------------------------
      # data must be a dataframe
      # colnames must be consistent
      # Samples must be a list
      # elements of Samples must be character vectors
      # elements of Samples must be vectors of length > 1
      
      if (!is.data.frame(repos[[i]]$data)) {
        stop(paste("In" , i , "data is not a data.frame"))
      }
      # Last column in copynumber and expression is not necessary
      # In case missing, we add it manually
      if (i %in% c("copynumber" , "expression")) {
        if (ncol(repos[[i]]$data) != 5) {
          if (i == "copynumber") {
            repos[[i]]$data$CNAvalue <- rep(NA , nrow(repos[[i]]$data))
          } else {
            repos[[i]]$data$expressionValue <- rep(NA , nrow(repos[[i]]$data))
          }
        }
      }
      if (!identical(colnames(repos[[i]]$data) ,  kColnamesIndataFull[[i]])) {
        stop(paste("colnames in repos are wrong for the alteration type" , i))
      }
      
      if (!is.list(repos[[i]]$Samples)) {
        stop(paste("In data," , i , ", Samples is not a list"))
      }
      
      if (!all(vapply(repos[[i]]$Samples , class 
                      , character(1)) == "character")) {
        stop(paste("some sample names in" , i , "are not character vectors"))
      }
      
      if (!all(lengths(repos[[i]]$Samples) > 0)) {
        stop(paste(
          "elements of Samples in",
          i ,
          "must be character vectors of length > 0"
        ))
      }
      #-----------------------------------------
      # CHECK TUMOR TYPE AND CASE_ID CONSISTENCY
      #
      # The user cannot put samples or tumor type 
      # in the data but not in the Samples
      if (any(repos[[i]]$data$case_id %notin% unlist(repos[[i]]$Samples))) {
        stop(paste(
          "In" ,
          i ,
          ", some case_id are in the data but not in the samples"
        ))
      }
      if (any(repos[[i]]$data$tumor_type %notin% names(repos[[i]]$Samples))) {
        stop(paste(
          "In" ,
          i ,
          ", some tumor_type are in the data but are not listed in the samples"
        ))
      }
    }
  }
}
################################################################################
# CLASS METHODS DECLARATION
################################################################################
# Download the alterations of interest
setGeneric('getAlterations', function(object
                ,tumor_type = NULL
                ,repos = NULL
                ,mutation_type = c("all_nonsynonymous"
                    ,"all_mutations"
                    ,"missense"
                    ,"truncating")
                ,expr_z_score = 2
                ,BPPARAM = bpparam("SerialParam")
                ,gene_block = 50){
  standardGeneric('getAlterations')
})
setMethod('getAlterations', 'CancerPanel', function(object
                ,tumor_type = NULL
                ,repos = NULL
                ,mutation_type = c("all_nonsynonymous"
                    ,"all_mutations"
                    ,"missense"
                    ,"truncating")
                ,expr_z_score = 2
                ,BPPARAM = bpparam("SerialParam")
                ,gene_block = 50)
{
  if (is.null(object)) {
    stop("No CancerPanel object provided")
  }
  ##############################################################################
  # IF repos is provided, get the data from it into the class
  #-----------------------------------------------------------------------------
  # The patients sets are retrieved from the data
  # Freq is not calculated
  if (!is.null(repos)) {
    .checkRepos(repos)
    object@dataFull$fusions <- list(data = repos$fusions$data)
    object@dataFull$mutations <- list(data = repos$mutations$data)
    object@dataFull$copynumber <-
      list(data = repos$copynumber$data)
    object@dataFull$expression <-
      list(data = repos$expression$data)
    tumor_type <- c()
    for (i in c("mutations" , "copynumber" , "expression" , "fusions")) {
      # If no Samples are provided, they are calculated from the data
      if (is.null(repos[[i]]$Samples)) {
        if (!is.null(repos[[i]])) {
          pats <- lapply(unique(repos[[i]]$tumor_type) , function(x) {
            repos[[i]][repos[[i]]$tumor_type == x , "case_id"] %>%
              unique
          })
          names(pats) <- unique(repos[[i]]$tumor_type)
          tumor_type <-
            unique(c(tumor_type , unique(repos[[i]]$tumor_type)))
          object@dataFull[[i]]$Samples <- pats
        } else {
          object@dataFull[[i]]['Samples'] <- list(NULL)
        }
      } else {
        object@dataFull[[i]]['Samples'] <- list(repos[[i]]$Samples)
      }
    }
    object@arguments$tumor_type <-
      lapply(object@dataFull , function(x) {
        if (!is.null(x$Samples))
          names(x$Samples)
      }) %>% unlist %>% unique
    return(object)
  }
  
  ##############################################################################
  # If no custom data are provided, you should specify tumor types
  #-----------------------------------------------------------------------------
  #Initial checks
  
  if (is.null(tumor_type) || identical(tumor_type,"")) {
    stop("You must specify one or more tumor types or tumor studies")
  }
  # Check consistency of tumor_type parameter
  if ("all_tumors" %in% tumor_type & length(tumor_type) > 1) {
    warning("tumor_type was set to all_tumor")
    tumor_type <- "all_tumors"
  }
  # Check format of expr_z_score
  if (!is.numeric(expr_z_score)) {
    stop("Expression z score should be a numeric value (e.g. 2 or 3)")
  }
  
  if (!is.numeric(gene_block)) {
    stop("gene_block must be a number between 1 and 100")
  } else {
    if (gene_block < 1 | gene_block > 100) {
      stop("gene_block must be a number between 1 and 100")
    }
  }
  
  #-----------------------------------------------------------------------------
  # Integrity check: All tumor types and tumor studies must be in cbioportal
  # You cannot mix tumor studies and tumor types
  #-----------------------------------------------------------------------------
  if (!is.null(tumor_type) && tumor_type[1] != "all_tumors") {
    #get list of tumor type available
    showTum <- showTumorType()
    availtumtype <- showTum$Code
    names(availtumtype) <- showTum$studyId
    #get list of cancer studies available
    # availcancerstudy <- showCancerStudy()[, 1]
    availcancerstudy <- showTum$studyId
    
    #Looking for any missing match. If a mismatch is found, print an error
    if (any(tumor_type %notin% availtumtype) &
        any(tumor_type %notin% availcancerstudy)) {
      notavail <- setdiff(tumor_type , c(availtumtype , availcancerstudy))
      if (length(notavail) == 0)
        stop("You probably mixed cancer studies and tumor types")
      else
        stop(
          paste(
            "The following tumor types or cancer studies are not available:" ,
            paste(notavail , collapse = ", ") ,
            ". Check with showTumorType() or showCancerStudy()"
          )
        )
    }
  }
  
  ##############################################################################
  # DOWNLOAD DATA
  #-----------------------------------------------------------------------------
  noDataMex <- paste("\nNo mutation data available for the specified" 
      ,"tumor types or cancer studies...")
  if (tumor_type[1] == "all_tumors") {
    derived_tumor_type <- showTumorType()$tumor_type %>% unique
  } else {
    derived_tumor_type <-
      unname(unique(vapply(tumor_type, function(x)
        strsplit(x , "_")[[1]][1] , character(1))))
  }
  object@arguments$tumor_type <- derived_tumor_type
  panel <- cpArguments(object)$panel
  
  #-----------------------------------------------------------------------------
  # LOOK FOR FUSIONS
  #-----------------------------------------------------------------------------
  if (any(cpArguments(object)$panel$alteration == "fusion")) {
    message("Retrieving fusions...")
    fus <-
      readRDS(file.path(
        system.file(package = "PrecisionTrialDrawer") ,
        "extdata" ,
        "TCGAtranslocation.rds"
      ))
    fusSamples <-
      readRDS(file.path(
        system.file(package = "PrecisionTrialDrawer") ,
        "extdata" ,
        "TCGAtranslocationSamples.rds"
      ))
    if (tumor_type[1] != "all_tumors") {
      # fusion table does not include the study so we include 
      # just the tumor types of the study (e.g. brca_tcga_pub becomes brca)
      fus <- fus[fus$tumor_type %in% derived_tumor_type ,]
    }
    if (nrow(fus) == 0) {
      message("No fusion data for the specified tumor types")
      object@dataFull$fusions <- list(data = NULL
                                      , Samples = NULL)
    } else {
      samples_fus <- fusSamples[names(fusSamples) %in% derived_tumor_type]
      if (any(derived_tumor_type %notin% names(fusSamples))) {
        missingFusSamples <-
          derived_tumor_type[derived_tumor_type %notin% names(fusSamples)]
        samples_fus_miss <-
          lapply(missingFusSamples , function(x)
            NULL) %>% setNames(missingFusSamples)
        samples_fus <- c(samples_fus , samples_fus_miss)
      }
      fus <- fus[fus$tier %in% c("tier1" , "tier2") ,]
      fus <- fus[fus$frame != "Out-of-frame" ,]
      fusgenes <-
        panel[panel$alteration == "fusion" , "gene_symbol"] %>% unique
      gtOut <-
        fus[fus$Gene_A %in% fusgenes |
              fus$Gene_B %in% fusgenes | fus$FusionPair %in% fusgenes ,]
      object@dataFull$fusions <- list(data = gtOut
                                      , Samples = samples_fus)
    }
  } else {
    object@dataFull$fusions <- list(data = NULL
                                    , Samples = NULL)
  }
  
  #-----------------------------------------------------------------------------
  # LOOK FOR SNV
  #-----------------------------------------------------------------------------
  if (any(cpArguments(object)$panel$alteration == "SNV")) {
    #check if all the tumor_type selected are available
    availability <-
      .checkDataAvailability(tumor_type = tumor_type 
                             , genProfile = "MUTATION") %>% lengths
    #in case there is one missing
    if (all(availability == 0)) {
      message(paste0("\n" , noDataMex))
      object@dataFull$mutations <- list(data = NULL
                                        , Samples = NULL)
    } else {
      message("\nRetrieving mutations...")
      mutgenes <-
        panel[panel$alteration == "SNV" , "gene_symbol"] %>% unique
      mutgenes_entrez <- object@arguments$genedata$entrezgene_id[ match(mutgenes , object@arguments$genedata$gene_symbol)]
      names(mutgenes) <- mutgenes_entrez
      # when genes like 
      # C10orf12 are queried, they are put in upper case
      # The solution is to revert them to their original case
      mutgenes_translator <-
        data.frame(
          original = mutgenes,
          upper = toupper(mutgenes),
          stringsAsFactors = FALSE
        )
      # If the number of requested genes is too large 
      # we subset the SQL query in chunks
      # if (length(mutgenes) > 100) {
      #   #split gene list in blocks of gene_block elements
      #   myGenesBlocks <- .subsetter(mutgenes , gene_block)
      #   message(
      #     paste("Too many genes. We subset data retrieval in"
      #       ,length(myGenesBlocks)
      #       ,"blocks of"
      #       ,gene_block
      #       ,"genes each"))
      #   gmOut <-
      #     lapply(seq_len(length(myGenesBlocks)) , function(x) {
      #       geneblock <- myGenesBlocks[[x]]
      #       .getMutations(geneblock 
      #                     , tumor_type = tumor_type 
      #                     , block = x 
      #                     , totalBlocks = length(myGenesBlocks))
      #     })
      #   gmOut2 <- bplapply(names(gmOut[[1]]), function(x) {
      #     box <- lapply(gmOut , function(z) z[[x]]$out)
      #     as.data.frame(data.table::rbindlist(box))
      #   } , BPPARAM = BPPARAM)
      #   gmOut2 <- as.data.frame(data.table::rbindlist(gmOut2))
      #   if (!is.null(gmOut2) && nrow(gmOut2) > 0) {
      #     tumor_type_vec <- unique(gmOut2$genetic_profile_id) %>%
      #       strsplit(. , "_") %>%
      #       vapply(., '[' , character(1) , 1)
      #     names(tumor_type_vec) <-
      #       unique(gmOut2$genetic_profile_id)
      #     gmOut2$tumor_type <-
      #       tumor_type_vec[gmOut2$genetic_profile_id]
      #     tumor_type_vec2_names <- names(gmOut[[1]])
      #     tumor_type_vec2 <-
      #       strsplit(tumor_type_vec2_names , "_") %>% vapply(. , '[' 
      #                                   , character(1) , 1)
      #     samples_mut <-
      #       lapply(unique(tumor_type_vec2) , function(tum) {
      #         # browser()
      #         tum <-
      #           tumor_type_vec2_names[tumor_type_vec2 == tum]
      #         lapply(gmOut[[1]][tum] , '[[' , "patients") %>% unlist %>% unique
      #       })
      #     names(samples_mut) <- unique(tumor_type_vec2)
      #   }
      # } else {
      gmOut <-
        .getMutations(mutgenes
                      , tumor_type = tumor_type 
                      , block = NULL
                      , totalBlocks = NULL)
      gmOut2 <- as.data.frame(data.table::rbindlist(lapply(gmOut , '[[' , 1)))
      if (is.data.frame(gmOut2)) {
        if (nrow(gmOut2) == 0) {
          gmOut2 <- NULL
        }
      }
      gmOut2_tums <- unique(gmOut2[ , c("tumor_type" , "genetic_profile_id")])
      samples_mut <- lapply(sort(unique(gmOut2_tums$tumor_type)) , function(x) {
        myGenProf <- gmOut2_tums[ gmOut2_tums$tumor_type %in% x , "genetic_profile_id" , drop = TRUE]
        lapply(gmOut[myGenProf] , '[[' , 'patients') %>% unlist %>% unique
      })
      names(samples_mut) <- sort(unique(gmOut2_tums$tumor_type))
      # if (!is.null(gmOut2)) {
      #     tumor_type_vec <- unique(gmOut2$genetic_profile_id) %>%
      #       strsplit(. , "_") %>%
      #       vapply(., '[' , character(1) , 1)
      #     names(tumor_type_vec) <-
      #       unique(gmOut2$genetic_profile_id)
      #     gmOut2$tumor_type <-
      #       tumor_type_vec[gmOut2$genetic_profile_id]
      #     tumor_type_vec2_names <- names(gmOut)
      #     tumor_type_vec2 <-
      #       strsplit(tumor_type_vec2_names , "_") %>% vapply(. , '[' 
      #                                         , character(1) , 1)
      #     samples_mut <-
      #       lapply(unique(tumor_type_vec2) , function(tum) {
      #         # browser()
      #         tum <-
      #           tumor_type_vec2_names[tumor_type_vec2 == tum]
      #         lapply(gmOut[tum] , '[[' , "patients") %>% unlist %>% unique
      #       })
      #     names(samples_mut) <- unique(tumor_type_vec2)
      #   }
      # }
      if (is.null(gmOut2)) {
        message(
          noDataMex
        )
        object@dataFull$mutations <- list(data = NULL
                                          , Samples = NULL)
      } else {
        # Some clean up
        gmOut2 <- gmOut2[!is.na(gmOut2$mutationType) ,]
        # Accepted TCGA style variant classification
        # We exclude every mutation that differ from the MAF v2.4 standards
        ktcga_types <- c(
          "In_Frame_Del",
          "In_Frame_Ins",
          "Missense_Mutation",
          "Frame_Shift_Del",
          "Frame_Shift_Ins",
          "Nonsense_Mutation",
          "Splice_Site",
          "Splice_Region",
          "Translation_Start_Site",
          "Nonstop_Mutation",
          "Silent",
          "3'UTR",
          "3'Flank",
          "5'UTR",
          "5'Flank",
          "IGR",
          "Intron",
          "RNA",
          "Targeted_Region")
        knotTransc <-
          c(
            "3'UTR",
            "3'Flank",
            "5'UTR",
            "5'Flank",
            "IGR",
            "Intron",
            "RNA",
            "Targeted_Region")
        knonsynonymous <- setdiff(ktcga_types , c("Silent" , knotTransc))
        kmiss_type <- ktcga_types[c(1, 2, 3)]
        ktrunc_type <- ktcga_types[c(4, 5, 6, 7, 8, 9)]
        mutation_type <- mutation_type[1]
        if (mutation_type == "all_nonsynonymous")
          gmOut2 <-
          gmOut2[gmOut2$mutationType %in% knonsynonymous ,]
        if (mutation_type == "missense")
          gmOut2 <-
          gmOut2[gmOut2$mutationType %in% kmiss_type ,]
        if (mutation_type == "truncating")
          gmOut2 <-
          gmOut2[gmOut2$mutationType %in% ktrunc_type ,]
        if (mutation_type == "all_mutations")
          gmOut2 <-
          gmOut2[gmOut2$mutationType %in% ktcga_types ,]
        # Add new useful columns
        gmOut2$amino_position <-
          ifelse(gmOut2$mutationType %notin% knotTransc
                 , as.numeric(as.character(
                   str_extract(
                     string = gmOut2$proteinChange,
                     pattern = "\\d+"
                   )))
                 , NaN)
        #-----------------------------
        # Chr retrieval in case is NA
        #
        if (any(is.na(gmOut2$chr))) {
          nochr <- gmOut2[is.na(gmOut2$chr) , "gene_symbol"] %>% unique
          ensembl <-
            biomaRt::useMart(host = "https://grch37.ensembl.org"
                             ,
                             biomart = "ENSEMBL_MART_ENSEMBL"
                             ,
                             dataset = "hsapiens_gene_ensembl")
          Chr_df <- biomaRt::getBM(
            mart = ensembl
            , values = nochr
            , filters = "hgnc_symbol"
            , attributes = c("hgnc_symbol", "chromosome_name")
          )
          Chr_df <-
            Chr_df[Chr_df$chromosome_name %in% c(seq_len(23) 
                              , "X" , "M" , "MT" , "Y") ,] %>% unique
          if (any(nochr %notin% Chr_df$hgnc_symbol)) {
            genesout <- setdiff(nochr , Chr_df$hgnc_symbol)
            stop(paste(
              "Problem retrieving chr name from biomart in" ,
              paste(genesout , collapse = ", ")
            ))
          }
          for (j in nochr) {
            mychr <-
              Chr_df[Chr_df$hgnc_symbol == j , "chromosome_name" 
                     , drop = TRUE] %>% unique %>% unname
            if (length(mychr) != 1) {
              stop(paste(
                j ,
                "is associated with more than 1 chromosome name in biomart"
              ))
            }
            gmOut2[gmOut2$gene_symbol == j , "chr"] <-
              mychr
          }
        }
        #-------------------------------
        # substitute chrMT with chrM
        #
        if (any(gmOut2$chr == "chrMT")) {
          gmOut2[gmOut2$chr == "chrMT" , "chr"] <- "chrM"
        }
        #-----------------------------------------------------------------
        # create genomic position column and get rid of each separate col
        #
        gmOut2$genomic_position <- paste(
          gmOut2$chr
          , gmOut2$startPosition
          , paste(gmOut2$referenceAllele , gmOut2$variantAllele , sep = ",")
          , sep = ":")
        for (z in c("chr" ,
                    "startPosition" ,
                    "referenceAllele" ,
                    "variantAllele")) {
          gmOut2[, z] <- NULL
        }
        # Correct TCGA samples name (revert TCGA-11-1234-01 to TCGA-11-1234)
        # gmOut2$case_id <- as.character(gmOut2$case_id)
        # gmOut2$case_id[grep("^TCGA-" , gmOut2$case_id)] <-
        #   gmOut2$case_id[grep("^TCGA-" , gmOut2$case_id)] %>%
        #   strsplit(. , "-") %>%
        #   vapply(. , function(x)
        #     paste(x[seq_len(3)] , collapse = "-") , character(1)) %>%
        #   unlist
        # Correct gene names. If upper case, revert them to their original case
        if (any(mutgenes_translator$original != mutgenes_translator$upper)) {
          mutgenes_translator <-
            mutgenes_translator[mutgenes_translator$original != 
                                  mutgenes_translator$upper , , drop=FALSE]
          gmOut2$gene_symbol <-
            .mapvalues(
              gmOut2$gene_symbol
              , from = mutgenes_translator$upper
              , to = mutgenes_translator$original
              , warn_missing = FALSE
            )
        }
        # samples_mut <- lapply(samples_mut , function(x) {
        #   if (is.null(x[1])) {
        #     return(x)
        #   }
        #   x[grep("^TCGA-" , x)] <-
        #     x[grep("^TCGA-" , x)] %>%
        #     strsplit(. , "-") %>%
        #     vapply(. , function(x)
        #       paste(x[seq_len(3)] , collapse = "-") , character(1)) %>%
        #     unlist
        #   return(unique(x))
        # })
        # names(samples_mut) <- unique(tumor_type_vec2)
        gmOut2 <- unique(gmOut2)
        # Remove samples that are not in the set
        gmOut2 <- gmOut2[ gmOut2$case_id %in% unlist(samples_mut) , ]
        # Modify final names of dataset
        colnames(gmOut2) <- c(
          "entrez_gene_id"
          ,"case_id"
          ,"mutation_type"
          ,"amino_acid_change"
          ,"genetic_profile_id"
          ,"gene_symbol"
          ,"tumor_type"
          ,"amino_position"
          ,"genomic_position")
        finalOrder <- c(
          "entrez_gene_id"
          ,"gene_symbol"
          ,"case_id"
          ,"mutation_type"
          ,"amino_acid_change"
          ,"genetic_profile_id"
          ,"tumor_type"
          ,"amino_position"
          ,"genomic_position")
        gmOut2 <- gmOut2[ , finalOrder]
        #Calculate frequencies
        object@dataFull$mutations <- list(data = gmOut2
                                          , Samples = samples_mut)
      }
    }
  } else {
    object@dataFull$mutations <- list(data = NULL
                                      , Samples = NULL)
  }
  
  #-----------------------------------------------------------------------------
  # LOOK FOR CNA
  #-----------------------------------------------------------------------------
  # Let's deal with CNAs. CNA are in theory an easier task
  # The data come in a different form as patients on rows and genes on columns
  # To uniform the output to mutations, we have to melt this matrix
  if (any(cpArguments(object)$panel$alteration == "CNA")) {
    availability <-
      .checkDataAvailability(tumor_type = tumor_type 
                             , genProfile = "COPY_NUMBER_ALTERATION") %>% lengths
    if (all(availability == 0)) {
      message(
        paste("\nNo copynumber alteration data" 
              ,"available for the specified tumor types...")
      )
      object@dataFull$copynumber <- list(data = NULL
                                         , Samples = NULL)
    } else {
      message("\nRetrieving copy number alterations...")
      cnagenes <-
        panel[panel$alteration == "CNA" , "gene_symbol"] %>% unique
      cnagenes_entrez <- object@arguments$genedata$entrezgene_id[ match(cnagenes 
                                      , object@arguments$genedata$gene_symbol)]
      names(cnagenes) <- cnagenes_entrez
      # when genes like 
      # C10orf12 are queried, they are put in upper case
      # The solution is to revert them to their original case
      cnagenes_translator <-
        data.frame(
          original = cnagenes,
          upper = toupper(cnagenes),
          stringsAsFactors = FALSE
        )
      gcOut <- .getCNA(cnagenes , tumor_type = tumor_type , block = NULL)
        # for (x in names(gcOut)) {
        #   if (!is.null(gcOut[[x]]$out)) {
        #     gcOut[[x]]$out$case_id <-
        #       gsub("\\." , "-" , rownames(gcOut[[x]]$out))
        #     gcOut[[x]]$out$genetic_profile_id <- x
        #     tum_type <- strsplit(x , "_")[[1]][1]
        #     gcOut[[x]]$out$tumor_type <- tum_type
        #   }
        # }
      gcOut2 <- as.data.frame(rbindlist(lapply(gcOut , '[[' , 1)))
      gcOut2_tums <- unique(gcOut2[ , c("tumor_type" , "genetic_profile_id")])
      samples_cna <- lapply(sort(unique(gcOut2_tums$tumor_type)) , function(x) {
        myGenProf <- gcOut2_tums[ gcOut2_tums$tumor_type %in% x , "genetic_profile_id" , drop = TRUE]
        lapply(gcOut[myGenProf] , '[[' , 'patients') %>% unlist %>% unique
      })
      names(samples_cna) <- sort(unique(gcOut2_tums$tumor_type))
      # gcOut_melt <-
      #     reshape2::melt(
      #       gcOut2 ,
      #         id.vars = c("case_id" , "genetic_profile_id" , "tumor_type")
      #       , value.name = "value" 
      #       , variable.name = "gene_symbol"
      #       , factorsAsStrings = FALSE
      #     ) %>% .changeFactor(.)
      #   
      #   tumor_type_vec <-
      #     unique(gcOut2$genetic_profile_id) %>%
      #     strsplit(. , "_") %>%
      #     vapply(., '[' , character(1) , 1)
      #   names(tumor_type_vec) <-
      #     unique(gcOut2$genetic_profile_id)
      #   gcOut_melt$tumor_type <-
      #     tumor_type_vec[gcOut2$genetic_profile_id]
      #   tumor_type_vec2_names <- names(gcOut)
      #   tumor_type_vec2 <-
      #     strsplit(tumor_type_vec2_names , "_") %>%
      #     vapply(., '[' , character(1) , 1)
      #   samples_cna <-
      #     lapply(unique(tumor_type_vec2) , function(tum) {
      #       # browser()
      #       tum <-
      #         tumor_type_vec2_names[tumor_type_vec2 == tum]
      #       lapply(gcOut[tum] , '[[' , "patients") %>% unlist %>% unique
      #     })
      #   names(samples_cna) <- unique(tumor_type_vec2)
      # # Correct TCGA samples name (revert TCGA-11-1234-01 to TCGA-11-1234)
      # gcOut_melt$case_id <- as.character(gcOut_melt$case_id)
      # gcOut_melt$case_id[grep("^TCGA-" , gcOut_melt$case_id)] <-
      #   gcOut_melt$case_id[grep("^TCGA-" , gcOut_melt$case_id)] %>%
      #   strsplit(. , "-") %>%
      #   vapply(. , function(x)
      #     paste(x[seq_len(3)] , collapse = "-") , character(1)) %>%
      #   unlist
      # samples_cna <- lapply(samples_cna , function(x) {
      #   if (is.null(x[1]))
      #     return(x)
      #   x[grep("^TCGA-" , x)] <-
      #     x[grep("^TCGA-" , x)] %>%
      #     strsplit(. , "-") %>%
      #     vapply(. , function(x)
      #       paste(x[seq_len(3)] , collapse = "-") , character(1)) %>%
      #     unlist
      #   return(unique(x))
      # })
      # Correct gene names. If upper case,
      # revert them to their original case
      if (any(cnagenes_translator$original != cnagenes_translator$upper)) {
        cnagenes_translator <-
          cnagenes_translator[cnagenes_translator$original !=
                                cnagenes_translator$upper , , drop =FALSE]
        gcOut2$gene_symbol <-
          .mapvalues(
            gcOut2$gene_symbol
            ,
            from = cnagenes_translator$upper
            ,
            to = cnagenes_translator$original
            ,
            warn_missing = FALSE
          )
      }
      # # All this mess is because gistic applies 
      # # different scores at different datasets
      # # This create confusion, because if the 
      # # same patient belongs to different study
      # # Its gistic copynumber can be different
      # gcOut_melt2 <-
      #   data.frame(
      #     tumor_type = factor(gcOut_melt$tumor_type 
      #                         , levels = derived_tumor_type)
      #     ,
      #     gene_symbol = factor(gcOut_melt$gene_symbol , levels = cnagenes)
      #     ,
      #     CNA = ifelse(is.na(gcOut_melt$CNA) , 0 , gcOut_melt$CNA)
      #     ,
      #     case_id = gcOut_melt$case_id
      #     ,
      #     stringsAsFactors = FALSE
      #   ) %>% unique
      gcOut_melt2 <- aggregate(value ~ tumor_type + gene_symbol + case_id , gcOut2 , mean, na.rm=TRUE)
        # aggregate(CNA ~ tumor_type + gene_symbol + case_id , gcOut_melt2 , mean)
      gcOut_melt2$CNA <-
        factor(as.character(round(gcOut_melt2$value)) 
               , levels = c("2" , "1" , "0" , "-1" , "-2"))
      # NOTE: the format of Freq is different 
      # from mutations. It is a list (not a matrix)
      # It contains an element for each tumor type
      # Each tumor type is a dataframe with amplification, 
      # normal and deletion freqeuncy
      gcOut_melt3 <-
        data.frame(
          gene_symbol = as.character(gcOut_melt2$gene_symbol)
          , CNA = ifelse(
            as.character(gcOut_melt2$CNA) %in% c("1" , "0" , "-1") , "normal"
            , ifelse( as.character(gcOut_melt2$CNA) == "2" ,"amplification" , "deletion"))
          , case_id = gcOut_melt2$case_id
          , tumor_type = as.character(gcOut_melt2$tumor_type)
          , CNAvalue = as.character(gcOut_melt2$value)
          , stringsAsFactors = FALSE
        ) %>% unique
      # Remove samples that are not in the set
      gcOut_melt3 <- gcOut_melt3[ gcOut_melt3$case_id %in% unlist(samples_cna) , ]
      object@dataFull$copynumber <- list(data = gcOut_melt3
                                         , Samples = samples_cna)
    }
  } else {
    object@dataFull$copynumber <- list(data = NULL
                                       , Samples = NULL)
  }
  
  
  #-----------------------------------------------------------------------------
  # LOOK FOR EXPRESSION
  #-----------------------------------------------------------------------------
  # Let's deal with Expression. Expression comes exactly as CNA
  # To uniform the output to mutations, we have to melt this matrix
  if (any(cpArguments(object)$panel$alteration == "expression")) {
    availability <-
      .checkDataAvailability(tumor_type = tumor_type 
                , genProfile = "MRNA_EXPRESSION") %>% lengths
    if (all(availability == 0)) {
      message(paste("\nSorry, no expression data available" 
                    ,"for the specified tumor types..."))
      object@dataFull$expression <- list(data = NULL
                                         , Samples = NULL)
    } else {
      message("\nRetrieving Expression data...")
      exprgenes <-
        panel[panel$alteration == "expression" , "gene_symbol"] %>% unique
      exprgenes_entrez <- object@arguments$genedata$entrezgene_id[ match(exprgenes 
                                          , object@arguments$genedata$gene_symbol)]
      names(exprgenes) <- exprgenes_entrez
      # when genes like 
      # C10orf12 are queried, they are put in upper case
      # The solution is to revert them to their original case
      exprgenes_translator <- data.frame(
          original = exprgenes,
          upper = toupper(exprgenes),
          stringsAsFactors = FALSE
        )
      geOut <- .getExpression(exprgenes , tumor_type = tumor_type , block = NULL)
      geOut2 <- unique(as.data.frame(rbindlist(lapply(geOut , '[[' , 1))))
      geOut2_tums <- unique(geOut2[ , c("tumor_type" , "genetic_profile_id")])
      samples_expr <- lapply(sort(unique(geOut2_tums$tumor_type)) , function(x) {
        myGenProf <- geOut2_tums[ geOut2_tums$tumor_type %in% x , "genetic_profile_id" , drop = TRUE]
        lapply(geOut[myGenProf] , '[[' , 'patients') %>% unlist %>% unique
      })
      names(samples_expr) <- sort(unique(geOut2_tums$tumor_type))
      # par(mfrow=c(3,3))
      # for(i in names(geOut)){
      #    if(!is.null(geOut[[i]]$out))
      #    hist(geOut[[i]]$out$value[ geOut[[i]]$out$gene_symbol=="ERBB2" ] , main = paste(i , geOut[[i]]$type))
      #   }
        # for (x in names(geOut)) {
        #   if (!is.null(geOut[[x]]$out)) {
        #     geOut[[x]]$out$case_id <-
        #       gsub("\\." , "-" , rownames(geOut[[x]]$out))
        #     geOut[[x]]$out$genetic_profile_id <- x
        #     tum_type <- strsplit(x , "_")[[1]][1]
        #     geOut[[x]]$out$tumor_type <- tum_type
        #   }
        # }
      
      #   geOut_melt <-
      #     reshape2::melt(
      #       geOut2 ,
      #       id.vars = c("case_id" , "genetic_profile_id" , "tumor_type")
      #       ,
      #       value.name = "expression" ,
      #       variable.name = "gene_symbol"
      #       ,
      #       factorsAsStrings = FALSE
      #     ) %>% .changeFactor(.)
      #   tumor_type_vec <-
      #     unique(geOut2$genetic_profile_id) %>%
      #     strsplit(. , "_") %>%
      #     vapply(., '[' , character(1) , 1)
      #   names(tumor_type_vec) <-
      #     unique(geOut2$genetic_profile_id)
      #   geOut2$tumor_type <-
      #     tumor_type_vec[geOut2$genetic_profile_id]
      #   tumor_type_vec2_names <- names(geOut)
      #   tumor_type_vec2 <-
      #     strsplit(tumor_type_vec2_names , "_") %>% 
      #     vapply(., '[' , character(1) , 1)
      #   samples_expr <-
      #     lapply(unique(tumor_type_vec2) , function(tum) {
      #       tum <- tumor_type_vec2_names[tumor_type_vec2 == tum]
      #       lapply(geOut[tum] , '[[' , "patients") %>% unlist %>% unique
      #     })
      #   names(samples_expr) <- unique(tumor_type_vec2)
      # }
      # # Correct TCGA samples name (revert TCGA-11-1234-01 to TCGA-11-1234)
      # geOut_melt$case_id <- as.character(geOut_melt$case_id)
      # geOut_melt$case_id[grep("^TCGA-" , geOut_melt$case_id)] <-
      #   geOut_melt$case_id[grep("^TCGA-" , geOut_melt$case_id)] %>%
      #   strsplit(. , "-") %>%
      #   vapply(. , function(x)
      #     paste(x[seq_len(3)] , collapse = "-") , character(1)) %>%
      #   unlist
      # samples_expr <- lapply(samples_expr , function(x) {
      #   if (is.null(x[1]))
      #     return(x)
      #   x[grep("^TCGA-" , x)] <-
      #     x[grep("^TCGA-" , x)] %>%
      #     strsplit(. , "-") %>%
      #     vapply(. , function(x)
      #       paste(x[seq_len(3)] , collapse = "-") , character(1)) %>%
      #     unlist
      #   return(unique(x))
      # })
      # Correct gene names. If upper case, 
      # revert them to their original case
      geOut_melt <- aggregate(value ~ tumor_type + gene_symbol + case_id , geOut2 , mean , na.rm=TRUE)
      if (any(exprgenes_translator$original != exprgenes_translator$upper)) {
        exprgenes_translator <-
          exprgenes_translator[exprgenes_translator$original != 
                                 exprgenes_translator$upper , , drop =FALSE]
        geOut_melt$gene_symbol <-
          .mapvalues(
            geOut_melt$gene_symbol
            ,from = exprgenes_translator$upper
            ,to = exprgenes_translator$original
            ,warn_missing = FALSE
          )
      }
      geOut_melt2 <-
        unique(geOut_melt[, c("tumor_type" , "case_id" 
                              , "gene_symbol" , "value")])
      geOut_melt2$value <-
        .noNA(geOut_melt2$value , subs = 0)
      # geOut_melt2 <-
      #   aggregate(expression ~ tumor_type + gene_symbol + case_id ,
      #             geOut_melt2 ,
      #             mean)
      geOut_melt2$expressionValue <- geOut_melt2$value
      expression_dichotomize <-
        ifelse(
          geOut_melt2$expressionValue >= expr_z_score ,
          "up"
          ,
          ifelse(
            geOut_melt2$expressionValue <= (-expr_z_score) ,
            "down"
            ,
            "normal"
          )
        )
      geOut_melt2$expression <-
        factor(expression_dichotomize , levels = c("up" , "normal" , "down"))
      # NOTE: the format of Freq is different 
      # from mutations. It is a list (not a matrix)
      # It contains a element for each tumor type
      # Each tumor type is a dataframe with amplification, 
      # normal and deletion frequency
      geOut_melt3 <-
        data.frame(
          gene_symbol = as.character(geOut_melt2$gene_symbol)
          , expression = as.character(geOut_melt2$expression)
          , case_id = geOut_melt2$case_id
          , tumor_type = as.character(geOut_melt2$tumor_type)
          , expressionValue = geOut_melt2$expressionValue
          , stringsAsFactors = FALSE
        )
      # Remove samples that are not in the set
      geOut_melt3 <- geOut_melt3[ geOut_melt3$case_id %in% unlist(samples_expr) , ]
      object@dataFull$expression <- list(data = geOut_melt3
                                         , Samples = samples_expr)
    }
  } else {
    object@dataFull$expression <- list(data = NULL
                                       , Samples = NULL)
  }
  return(object)
})
gmelloni/PrecisionTrialDrawer documentation built on March 4, 2023, 1:48 a.m.