R/mutationHelper.R

Defines functions .getMutations

# Given a vector of genes and a vector of tumor 
# type as resulted from showTumorType() or showCancerStudy()
# It returns a nested list composed by n elements, one for each tumor type
# Each element is composed by a mutation df and a 
# vector with all patient barcodes for that tumor type
.getMutations <- function(myGenes=myGenes
                            , tumor_type="all_tumors"
                            , block=NULL
                            , totalBlocks=NULL)
{
    mycgds <- cBioPortalData::cBioPortal(
      hostname = "www.cbioportal.org",
      protocol = "https",
      api. = "/api/api-docs")
    allCanStudy <- cBioPortalData::getStudies(mycgds)[ , c("cancerTypeId" , "studyId")]
    allCanStudy <- unique(allCanStudy)
    # allCanStudy$tumor_type <- vapply(strsplit(allCanStudy$cancerTypeId
    #                                       , "_") , '[' , character(1) , 1)
    allCanStudy$tumor_type <- allCanStudy$cancerTypeId
    if(tumor_type[1]=="all_tumors") {
        chosenTumors <- allCanStudy[,1,drop=TRUE]
    } else {
        #FIND All cancer studies associated with the tumor_type specified
        chosenTumors <- allCanStudy[ allCanStudy$tumor_type %in% 
                                       tumor_type, 1,drop=TRUE]
        names(chosenTumors) <- allCanStudy[ allCanStudy$tumor_type %in% 
                                              tumor_type, 2,drop=TRUE]
        #In case we are not looking for tumor ID but cancer studies ID
        if(length(chosenTumors)==0){
            chosenTumors <- allCanStudy[ allCanStudy$studyId %in% 
                                           tumor_type, 1,drop=TRUE]
            names(chosenTumors) <- allCanStudy[ allCanStudy$studyId %in% 
                                                  tumor_type, 2,drop=TRUE]
        }
        #if still we have not found anything
        if (length(chosenTumors)==0){
          stop(paste("Could not find the tumor_type nor"
                     ,"the cancer_study_id specified"))
        }
    }
    out_double <- lapply(names(chosenTumors) , function(i)
        {
            #for each Cancer Study, fetch the type 
            # of alteration (genetic profile) 
            #to be considered
      geneticProfile <- cBioPortalData::molecularProfiles(mycgds, i)
      geneticProfile <- geneticProfile[ grep("MUTATION" , geneticProfile$molecularAlterationType) , ]
      if(nrow(geneticProfile)==0){
        message(paste("geneticProfile" , i , "has no mutation data"))
        return( list( out=NULL , patients=NULL) )
      }
      # Fetch the patient list for the specified molecular profile and data type (mutations)
      caseList <- cBioPortalData::sampleLists(mycgds, i)
      caseList <- caseList[ caseList$category == "all_cases_with_mutation_data" , ]
      if(nrow(caseList)==0){
        message(paste("ProfileId" , i , "has no samples with mutations"))
        return( list( out=NULL , patients=NULL) )
      }
      # Get the actual case names (e.g. TCGA-AR-A1AR)
      caseListId <- cBioPortalData::getSampleInfo(api = mycgds 
                                                  , studyId = geneticProfile$studyId 
                                                  , sampleListIds = caseList$sampleListId)$sampleId
      tryCatch(
        muts <- cBioPortalData::mutationData(
          api = mycgds
          , molecularProfileIds = geneticProfile$molecularProfileId
          , entrezGeneIds = as.numeric(names(myGenes))
          , sampleIds = caseListId
        )[[1]]
        , error=function(e) message(paste("Impossible to retrive mutations from" , i , "study or no mutations are present on the selected genes"))
      )
      
      if(!exists("muts")) {
        muts <- NULL
        patients <- NULL
      } else if(nrow(muts) == 0 ){
        muts <- NULL
        patients <- NULL
      } else {
        colsIlike <- c(
          "entrezGeneId"
          # ,"gene_symbol"
          # ,"patientId"
          , "sampleId"
          # ,"sequencing_center"
          # ,"mutation_status"
          ,"mutationType"
          # ,"validation_status"
          ,"proteinChange"
          # ,"functional_impact_score"
          # ,"xvar_link"
          # ,"xvar_link_pdb"
          # ,"xvar_link_msa"
          ,"chr"
          ,"startPosition"
          # ,"end_position"
          ,"referenceAllele"
          ,"variantAllele"
          # ,"reference_read_count_tumor"
          # ,"variant_read_count_tumor"
          # ,"reference_read_count_normal"
          # ,"variant_read_count_normal"
          # ,"genetic_profile_id"
          )
        muts <- muts[ , colsIlike]
        # Fix chromosome number
        muts$chr <- muts$chr %>% as.character %>%
          sub("chr" , "" , .) %>%
          sub("23" , "X" , .) %>%
          sub("24" , "Y" , .) %>%
          sub("M" , "MT" , .)
        muts <- muts[ muts$chr %in% c(seq_len(22), "X", "Y", "MT") , ]
        muts$genetic_profile_id <- i
        muts$gene_symbol <- myGenes[ as.character(muts$entrezGeneId) ]
        colnames(muts)[colnames(muts)=="sampleId"] <- "case_id"
        muts$case_id <- ifelse(grepl("^TCGA" , muts$case_id) 
                               , unlist(lapply(strsplit(muts$case_id , "-") , function(x) paste(x[seq_len(3)] , collapse="-")))
                               , muts$case_id)
        muts$tumor_type <- chosenTumors[ muts$genetic_profile_id ]
        # patients <- strsplit(caseList[sel, 'case_ids'] 
        #                      , split=" ")[[1]]
        patients <- unique(ifelse(grepl("^TCGA" , caseListId) 
                           , unlist(lapply(strsplit(caseListId , "-") , function(x) paste(x[seq_len(3)] , collapse="-")))
                           , caseListId))
        # message(paste("Mutations retrieved from" , i , "study"))
      }
      return( list( out=muts , patients=patients) )
  # } , .inform = TRUE)
    })
  names(out_double) <- names(chosenTumors)
  return(out_double)
}

.subsetMutations <- function(panel , muts_full , rs_df)
{
    emptyDF <- data.frame(drug = character(0)
                          ,group = character(0)
                          ,gene_symbol = character(0)
                          ,tumor_type = character(0)
                          ,case_id = character(0)
                          ,genomic_position=character(0)
                          , stringsAsFactors = FALSE)
    if(any(panel$exact_alteration=="amino_acid_position")){
        panel_mut_1 <- panel[ panel$exact_alteration=="amino_acid_position" 
                            , c("drug", "gene_symbol" 
                                , "mutation_specification" ,"group")]
        mut_specs <- strsplit(panel_mut_1$mutation_specification , "-")
        panel_mut_1$start <- vapply(mut_specs , function(x) {
          as.numeric(x[1])} , numeric(1))
        panel_mut_1$end <- vapply(mut_specs , function(x) {
          as.numeric(x[2])} , numeric(1))
        panel_mut_1$mutation_specification <- NULL
        mut_data1 <- merge(muts_full , panel_mut_1 
                           , all.y=TRUE , by="gene_symbol")
        if(any(!is.na(mut_data1$case_id))){
            mut_data1 <- mut_data1[ !is.na(mut_data1$case_id) , ]
            # watch out from mutations without amino acid position
            mut_data1 <- mut_data1[!is.na(mut_data1$amino_position) 
                                   , , drop=FALSE]
            mut_data1 <- mut_data1[mut_data1$amino_position>=mut_data1$start & 
                mut_data1$amino_position<=mut_data1$end , , drop=FALSE] %>% 
                unique
        } else {
            mut_data1 <- emptyDF
        }        
        mut_data1 <- mut_data1[ , colnames(emptyDF)]
    } else {
        mut_data1 <- emptyDF
    }

    if(any(panel$exact_alteration=="amino_acid_variant")){
        panel_mut_2 <- panel[ panel$exact_alteration=="amino_acid_variant" 
                            , c("drug", "gene_symbol" 
                                , "mutation_specification" ,"group")]
        mut_data2 <- merge(muts_full , panel_mut_2 
                           , all.y=TRUE , by="gene_symbol")
        if(any(!is.na(mut_data2$case_id))){
            mut_data2 <- mut_data2[ !is.na(mut_data2$case_id) , ]
            mut_data2 <- mut_data2[mut_data2$amino_acid_change==
                                     mut_data2$mutation_specification 
                                   , , drop=FALSE] %>% unique
        } else {
            mut_data2 <- emptyDF
        }
        mut_data2 <- mut_data2[ , colnames(emptyDF)]
    } else {
        mut_data2 <- emptyDF
    }

    if(any(panel$exact_alteration=="mutation_type")){
        ktcga_types <- c(
        "In_Frame_Del", "In_Frame_Ins","Missense_Mutation"
        ,"Frame_Shift_Del", "Frame_Shift_Ins"
        , "Nonsense_Mutation",  "Splice_Site"
        , "Translation_Start_Site", "Nonstop_Mutation"
        , "Silent","3'UTR", "3'Flank"
        , "5'UTR", "5'Flank", "IGR1 ", "Intron", "RNA", "Targeted_Region")
        knotTransc <- c("3'UTR", "3'Flank", "5'UTR", "5'Flank"
                ,"IGR1", "IGR", "Intron", "RNA", "Targeted_Region")
        kmiss_type <- ktcga_types[c(1,2,3)]
        ktrunc_type <- ktcga_types[c(4,5,6)]
        panel_mut_3 <- panel[ panel$exact_alteration=="mutation_type" 
                            , c("drug", "gene_symbol" 
                                , "mutation_specification" ,"group")]
        mut_data3 <- merge(muts_full , panel_mut_3 , all.y=TRUE 
                           , by="gene_symbol")
        if(any(!is.na(mut_data3$case_id))){
            mut_data3 <- mut_data3[ !is.na(mut_data3$case_id) , ]
            mut_data3 <- mut_data3[(mut_data3$mutation_type %in% 
                    kmiss_type & mut_data3$mutation_specification=="missense") |
                    (mut_data3$mutation_type %in% ktrunc_type & 
                       mut_data3$mutation_specification=="truncating")
                    , , drop=FALSE] %>% unique
        } else {
            mut_data3 <- emptyDF    
        }
        mut_data3 <- mut_data3[ , colnames(emptyDF)]
    } else {
        mut_data3 <- emptyDF
    }

    if(any(panel$exact_alteration=="genomic_position" | 
           panel$exact_alteration=="dbSNP_rs")){
        panel_mut_4 <- panel[ panel$exact_alteration %in% 
                                c("genomic_position","dbSNP_rs")
                , c("drug", "gene_symbol" , "mutation_specification" ,"group")]
        if(!is.null(rs_df)){
            panel_mut_4$mutation_specification <- 
              .mapvalues(panel_mut_4$mutation_specification 
                                                    , from=rs_df$rs
                                                    , to=rs_df$genomic_range
                                                    ,warn_missing=FALSE)
        }
        mut_specs2 <- strsplit(panel_mut_4$mutation_specification , "-|:")
        panel_mut_4$start <- vapply(mut_specs2 , function(x) {
          as.numeric(x[2])} , numeric(1))
        panel_mut_4$end <- vapply(mut_specs2 , function(x) {
          as.numeric(x[3])} , numeric(1))
        panel_mut_4$mutation_specification <- NULL
        mut_data4 <- merge(muts_full , panel_mut_4 
                           , all.y=TRUE , by="gene_symbol")
        if(any(!is.na(mut_data4$case_id))){
            mut_data4 <- mut_data4[!is.na(mut_data4$case_id) , ]
            mut_data4$genomic_position_num <- 
              strsplit(mut_data4$genomic_position , ":") %>%
                                        vapply(. , '[' , character(1) , 2) %>%
                                        as.numeric
            mut_data4 <- mut_data4[ 
              mut_data4$genomic_position_num>=mut_data4$start & 
                mut_data4$genomic_position_num<=mut_data4$end , ] %>% unique
        } else {
            mut_data4 <- emptyDF
        }
        mut_data4 <- mut_data4[ , colnames(emptyDF)]
    } else {
        mut_data4 <- emptyDF
    }

    if(any(panel$exact_alteration=="genomic_variant")){
        panel_mut_5 <- panel[ panel$exact_alteration=="genomic_variant" 
                            , c("drug", "gene_symbol" 
                                , "mutation_specification" ,"group")]
        mut_data5 <- merge(muts_full , panel_mut_5 
                           , all.y=TRUE , by="gene_symbol")
        if(any(!is.na(mut_data5$case_id))){
            mut_data5 <- mut_data5[!is.na(mut_data5$case_id) , ]
            mut_data5 <- mut_data5[ mut_data5$genomic_variant==
                                mut_data5$mutation_specification, ] %>% unique
        } else {
            mut_data5 <- emptyDF
        }
        mut_data5 <- mut_data5[ , colnames(emptyDF)]
    } else {
        mut_data5 <- emptyDF
    }

    if(any(panel$exact_alteration=="")){
        panel_mut_6 <- panel[ panel$exact_alteration=="" 
                            , c("drug", "gene_symbol" 
                            , "mutation_specification" ,"group")]
        mut_data6 <- merge(muts_full , panel_mut_6 , all.y=TRUE 
                           , by="gene_symbol")
        if(any(!is.na(mut_data6$case_id))){
            mut_data6 <- mut_data6[!is.na(mut_data6$case_id) , ]
        } else {
            mut_data6 <- emptyDF
        }
        mut_data6 <- mut_data6[ , colnames(emptyDF)]
    } else {
        mut_data6 <- emptyDF
    }
    muts_subset <- do.call("rbind" 
          , list(mut_data1,mut_data2,mut_data3,mut_data4,mut_data5,mut_data6))
    muts_subset <- muts_subset[ , c("drug", "group", "gene_symbol", "tumor_type"
                                    , "case_id","genomic_position")] %>% unique
    if(nrow(muts_subset)>0){
      muts_subset$alteration_id <- paste0("mut_" , seq_len(nrow(muts_subset)))
    } else {
      muts_subset$alteration_id <- character(0)
    }
    muts_subset$genomic_position <- NULL
    return(muts_subset)
}
gmelloni/PrecisionTrialDrawer documentation built on March 4, 2023, 1:48 a.m.