R/im_syng_tcga.R

Defines functions im_syng_tcga

Documented in im_syng_tcga

#'  Calculation and statistical assessment of synergistic associations using TCGA data.
#'
#' @param onco_gene A character vector of gene Hugo symbols.
#' @param icp_gene An optional character vector of immune checkpoint gene Hugo symbols.
#' @param cohort A list of TCGA diseases
#' @param select_iap  An optional  character vector or numeric matrix or data.frame.
#' @param method A character string indicating which synergy score to be used. one of "max" or "independence".
#' Default is "max".
#'
#' @param specificity An optional logical indicating if specificity analysis should be done. Default is FALSE.
#' @param N_iteration_specificity Number of iterations for random sampling for specificity p.value calculation.
#' Default is 1000.
#' @param sensitivity An optional logical indicating if a sensitivity analysis should be done. Default is FALSE.
#' @param N_iteration_sensitivity Number of iterations for random sampling for sensitivity analysis.
#' Default is 1000.
#' @param sample_list An optional character vector of TCGA samples barcodes indicating a subset of samples within a cohort.
#'
#' @keywords Synergy scoring, immune feature, immune checkpoint, bootstrapping, TCGA
#' @return A data.frame of synergy scores and bootstrapping p.values.
#' @description
#'
#' Takes a list of Tumor-intrinsic pathway(TIP) genes and returns their combinatorial association with immune checkpoint(ICP) genes by evaluating their synergistic impact on immune-associated phenotypes(IAP) using RNASeq2GeneNorm expressions as provided by \pkg{curatedTCGAData} in a selected disease cohort.
#'
#' @details
#'
#   Output is a dataframe, with each row containing two gene Hugo IDs, an IAP name, the inferred impact of each single gene on the IAP, the calculated synergy score and the statistical assessment parameters to measure robustness, significance, and specificity.
#
# IAP names are listed in TCGA_immune_features_list.
#'If no icp_gene is specified, the default icp_gene_list will be used.
#'If select_iap is a character vector, it must be any sub-list of IAP names as listed in TCGA_immune_features_list. If a numeric matrix or data.frame, each column represents a user-defined IAP and must have a range between [0,1]. If select_iap is missing all IAPs listed in TCGA_immune_features_list will be considered for analysis.
#'
#' For synergy score calculations all features are normalized to be on [0,1] range. For details of synergy score and significance pvalue calculations see \code{find_a_synergy} function.
#'
#' A specificity p.value is computed using random sampling with replacement from two null models, generated from one of the two genes against a set of genes randomly selected from the genome. Two P-values are calculated for the synergistic interaction of the pair against the two null models. The highest of the two P-values is used to assess the specificity of the interaction against the whole genome. The number of randomly selected genes in each null model is determined by N_iteration_specificity.
#'
#' Sensitivity (Robustness) score defined as normalized root mean square deviation of scores calculated over 70% of samples, selected via random sampling. The number of sub-sample iterations is determined by N_iteration_sensitivity.
#'
#'All barcodes in sample_list must be 15 character long and belong to the same cohort. When sample_list is provided, cohort should be the disease cohort that they belong to, otherwise only the first element of the cohort list will be used.
#'
#' @examples im_syng_tcga(onco_gene=c("TGFB1","SERPINB9"), cohort=c("ucec","skcm"))
#'
#' @importFrom dplyr bind_rows across mutate group_by everything distinct
#' @importFrom magrittr %>%
#' @importFrom data.table setkey as.data.table
#' @import curatedTCGAData
#' @importFrom data.table ":=" ".SD"
#' @importFrom utils combn setTxtProgressBar txtProgressBar
#' @importFrom stats complete.cases median
#'
#' @export

im_syng_tcga <- function(onco_gene, icp_gene, cohort, select_iap, method, specificity, N_iteration_specificity, sensitivity, N_iteration_sensitivity, sample_list){

  df_syng_all <- data.frame(Disease=character(),
                            agent1=character(),
                            agent2=character(),
                            Immune_feature=character(),
                            Synergy_score=numeric(),
                            agent1_expression=character(),
                            agent2_expression=character(),
                            wilcox_pvalue=numeric(),
                            specificity_pvalue=numeric(),
                            sensitivity_R=numeric())

  #Check input parameters------------------------


  if( missing( method ) ){
    method <- "max"
  }else{
    method <- tolower(method)
    if(!(method=="max" || method=="independence")){
      stop("ERROR: Method is not found. Please choose a method from: max or independence.")
    }
  }

  if(!missing(sample_list)){
    cohort <- cohort[1]
  }
  cohort <- tolower(cohort)


  #Loop over diseases-----------------------------


  for(cohortID in 1:length(cohort)){

    df_syng <- data.frame(Disease=character(),
                          agent1=character(),
                          agent2=character(),
                          Immune_feature=character(),
                          Synergy_score=numeric(),
                          agent1_expression=character(),
                          agent2_expression=character(),
                          wilcox_pvalue=numeric(),
                          specificity_pvalue=numeric(),
                          sensitivity_R=numeric())

    #Get expression data----------------------------

    disease <- cohort[cohortID]
    message("\nReading TCGA ",toupper(disease)," data\n")
    df <-curatedTCGAData::curatedTCGAData( diseaseCode = disease,
                                           assays = c("RNASeq2GeneNorm"), dry.run = F)@ExperimentList@listData[[1]]

    data_expression <- df@assays$data@listData[[1]]
    colnames(data_expression)<-  substr(colnames(data_expression), 1, 15)
    if(!missing(sample_list)){
      data_expression<-data_expression[,sample_list ]
      if(ncol(data_expression)==0){
        stop("ERROR: barcodes not found.")
      }
    }

    message("Quantile ranking Gene expressions...")

    #Check for co-target expressions----------------

    if(length(onco_gene)==1){
      df_selected <- as.data.frame((data_expression[rownames(data_expression ) %in% onco_gene,]))
      colnames(df_selected) <- onco_gene
    }else{
      df_selected <- t(data_expression[rownames(data_expression ) %in% onco_gene,])
    }
    if(nrow(df_selected)==0){
      stop("ERROR: No Hugo symbols found for onco-genes.")
    }
    df_selected <- as.data.frame(df_selected[, colSums(df_selected != 0) > 0,drop=FALSE])
    if(ncol(df_selected)==0){
      warning("All onco_gene's have zero expression in ", disease )
      next
    }
    onco_gene_sub <- colnames(df_selected)

    #Check for immune checkpoint expressions--------

    if(missing(icp_gene)){
      icp_gene <- icp_gene_list
      icp_gene[1] <- "C10orf54"
      icp_gene[12] <- "DKFZp686O24166"

    }
    if(length(icp_gene)==1){
      df_icp <- as.data.frame(data_expression[rownames(data_expression) %in% icp_gene,])
      colnames(df_icp) <- icp_gene
    }else{
      df_icp <- t(data_expression[rownames(data_expression) %in% icp_gene,])
    }
    if(nrow(df_icp)==0){
      stop("ERROR: No Hugo symbols found for icp_genes.")
    }
    df_icp <- as.data.frame(df_icp[, colSums(df_icp != 0) > 0,drop=FALSE])
    if(ncol(df_icp)==0){
      warning("All icp_gene's have zero expression in ", disease )
      next
    }
    icp_gene_sub <- colnames(df_icp)

    #Construct quantile ranking matrices for each sample--------

    df_selected <- scale(log2(df_selected+1),center = T,scale = T)
    df_icp <- scale(log2(df_icp+1),center = T,scale = T)
    df_select_qr <- get_quantile_rank(df_selected)
    df_icp_qr <- get_quantile_rank(df_icp)
    df_all <- merge(df_select_qr,df_icp_qr)
    rownames(df_all)<- df_all$Tumor_Sample_ID
    df_all <- as.matrix(df_all[,-1])

    #Construct quantile ranking matrices for each patient---------
    #For immune cell count features as calculated by CIBERSORT,
    #PATIENT_BARCODE is used instead of Tumor_Sample_ID

    tmp <- as.data.frame(df_selected)
    tmp$PATIENT_BARCODE <- substr(rownames(tmp), 1, 12)
    tmp <- tmp %>% group_by(PATIENT_BARCODE) %>%
      mutate(across(.cols = everything(),.fns = ~median(.x, na.rm = TRUE))) %>% distinct
    df_selected2 <- as.matrix(tmp[,-which(colnames(tmp)=="PATIENT_BARCODE")])
    rownames(df_selected2)<- tmp$PATIENT_BARCODE
    df_select_qr2 <- get_quantile_rank(df_selected2)

    tmp <- as.data.frame(df_icp)
    tmp$PATIENT_BARCODE <- substr(rownames(tmp), 1, 12)
    tmp <- tmp %>% group_by(PATIENT_BARCODE) %>%
      mutate(across(.cols = everything(),.fns = ~median(.x, na.rm = TRUE))) %>% distinct
    df_icp2 <- as.matrix(tmp[,-which(colnames(tmp)=="PATIENT_BARCODE")])
    rownames(df_icp2)<- tmp$PATIENT_BARCODE
    df_icp_qr2 <- get_quantile_rank(df_icp2)
    df_all2 <- merge(df_select_qr2,df_icp_qr2)
    rownames(df_all2)<- df_all2$Tumor_Sample_ID
    df_all2 <- as.matrix(df_all2[,-1])

    #Construct IAPs-----------------------------------

    message("Constructing IAPs...")

    if(!missing(select_iap)){
      if(is.data.frame(select_iap) || is.matrix(select_iap)){
        df_min <- min(select_iap,na.rm=TRUE)
        df_max <- max(select_iap,na.rm=TRUE)
        if(df_min < 0 | df_max > 1 ){
          stop("ERROR: feature is out of range. Normalize data_feature to [0,1].")
        }
        select_iap <- as.data.frame(select_iap)
        data_feature <- select_iap
      }else{
        data_feature <- get_features(data_expression)
        data_feature <- data_feature[,which(colnames(data_feature) %in% c("Tumor_Sample_ID",select_iap)),drop=F]
        rownames(data_feature)<- data_feature$Tumor_Sample_ID
        data_feature<- as.matrix(data_feature[,-1,drop=F])
        data_feature <- data_feature[,colSums(is.na(data_feature))<nrow(data_feature),drop=F]

        data_cell <- TCGA_IMCell_fraction
        data_cell <- data_cell[,which(colnames(data_cell) %in% c("PATIENT_BARCODE",select_iap)),drop=F]
        rownames(data_cell)<- data_cell$PATIENT_BARCODE
        data_cell <- as.matrix(data_cell[,-1])
        data_cell <- data_cell[,colSums(is.na(data_cell))<nrow(data_cell),drop=F]
      }
    }else{
      select_iap <- TCGA_immune_features_list
      data_feature <- get_features(data_expression)
      rownames(data_feature)<- data_feature$Tumor_Sample_ID
      data_feature<- as.matrix(data_feature[,-1,drop=F])
      data_feature <- data_feature[,colSums(is.na(data_feature))<nrow(data_feature),drop=F]

      data_cell <- TCGA_IMCell_fraction
      rownames(data_cell)<- data_cell$PATIENT_BARCODE
      data_cell <- as.matrix(data_cell[,-1])
      data_cell <- data_cell[,colSums(is.na(data_cell))<nrow(data_cell),drop=F]
    }

    #Build unique permutations

    all_genes <- unique(c(onco_gene_sub,icp_gene_sub))
    all_perms <- t(combn(all_genes,m = 2))
    N_perm <- as.numeric(nrow(all_perms))
    N_genes <- as.numeric(length(all_genes))
    N_feature <- as.numeric(ncol(data_feature))
    N_imcell_feature <- as.numeric(ncol(data_cell))


    #Calculate synergy scores for features ---------

    message("Calculating synerrgy scores ...\n")
    if(N_feature>0){
      pb <- txtProgressBar(min = 0, max = N_perm, char="-",style = 3)

      for(pair_ID in 1:N_perm){
        gene_ID1 <- which(colnames(df_all)==all_perms[pair_ID,1])
        gene_ID2 <- which(colnames(df_all)==all_perms[pair_ID,2])
        dft <- df_all[ , c(gene_ID1,gene_ID2)]
        dft <- dft[dft[ , 1] %in% c(1 , 4) ,,drop=F ]
        dft <- dft[dft[ , 2] %in% c(1 , 4) ,,drop=F ]
        if(nrow(dft) > 4){
          df_helper <-  data.frame(agent1=character(),
                                   agent2=character(),
                                   Immune_feature=character(),
                                   Synergy_score=numeric(),
                                   agent1_expression=character(),
                                   agent2_expression=character(),
                                   wilcox_pvalue=numeric())

          for(im_ID in 1:ncol(data_feature)){
            dft2 <- cbind(data_feature[ , im_ID][match(rownames(dft),rownames(data_feature))], dft)
            colnames(dft2)[1]<- colnames(data_feature)[im_ID]
            dft2 <- dft2[complete.cases(dft2),]

            dfts <- find_a_synergy(dft2,method = method)
            df_helper <- bind_rows(df_helper , dfts)
          }
        }else{
          df_helper <-  data.frame(agent1=colnames(dft)[1],
                                   agent2=colnames(dft)[2],
                                   Immune_feature=colnames(data_feature),
                                   Synergy_score=NA,
                                   agent1_expression=NA,
                                   agent2_expression=NA,
                                   wilcox_pvalue=NA)

        }
        if(nrow(df_helper)>0){
          df_helper$Disease <- disease
          df_helper <- df_helper[ , c("Disease" , c(setdiff(colnames(df_helper) , "Disease")))]
          df_helper$specificity_pvalue <- NA
          df_helper$sensitivity_R <- NA
          df_syng <- bind_rows(df_syng , df_helper)
        }
        setTxtProgressBar(pb, pair_ID)
      }
    }
    #Calculate synergy scores for immune cell features--------
    if(N_imcell_feature>0){
      pb <- txtProgressBar(min = 0, max = N_perm, char="-",style = 3)

      for(pair_ID in 1:N_perm){
        gene_ID1 <- which(colnames(df_all2)==all_perms[pair_ID,1])
        gene_ID2 <- which(colnames(df_all2)==all_perms[pair_ID,2])
        dft <- df_all2[ , c(gene_ID1,gene_ID2)]
        dft <- dft[dft[ , 1] %in% c(1 , 4) , ,drop=F]
        dft <- dft[dft[ , 2] %in% c(1 , 4) , ,drop=F]
        if(nrow(dft) > 4){
          df_helper <-  data.frame(agent1=character(),
                                   agent2=character(),
                                   Immune_feature=character(),
                                   Synergy_score=numeric(),
                                   agent1_expression=character(),
                                   agent2_expression=character(),
                                   wilcox_pvalue=numeric())

          for(if_ID in 1:ncol(data_cell)){
            dft2 <- cbind(data_cell[ , if_ID][match(rownames(dft),rownames(data_cell))], dft)
            colnames(dft2)[1]<- colnames(data_cell)[if_ID]
            dft2 <- dft2[complete.cases(dft2),,drop=F]
            if(nrow(dft2)>0){
              dfts <- find_a_synergy(dft2,method = method)
              df_helper <- bind_rows(df_helper , dfts)
            }
          }
        }else{
          df_helper <-  data.frame(agent1=colnames(dft)[1],
                                   agent2=colnames(dft)[2],
                                   Immune_feature=colnames(data_cell),
                                   Synergy_score=NA,
                                   agent1_expression=NA,
                                   agent2_expression=NA,
                                   wilcox_pvalue=NA)
        }
        if(nrow(df_helper)>0){
          df_helper$Disease <- disease
          df_helper <- df_helper[ , c("Disease" , c(setdiff(colnames(df_helper) , "Disease")))]
          df_helper$specificity_pvalue <- NA
          df_helper$sensitivity_R <- NA
          df_syng <- bind_rows(df_syng , df_helper)
        }
        setTxtProgressBar(pb, pair_ID)
      }
    }
    df_syng <- as.data.table(df_syng)
    setkey(df_syng, Disease, agent1,agent2,Immune_feature)

    message("\nSynergy calculation completed!\n ")


    #Check if p.value should be calculated-----------------------
    if(missing(specificity)){
      specificity <- FALSE
    }
    if(specificity){
      message("\nStarting Specificity analysis:\n ")

      #Build unique permutations----------------------
      df_syng_complete <- df_syng[ !is.na( df_syng$Synergy_score),]
      df_syng_complete <- df_syng_complete[ which(df_syng_complete$Disease==disease), ]

      if(nrow(df_syng_complete)>0){

        all_genes <- unique( c( df_syng_complete$agent1,df_syng_complete$agent2))
        N_genes <- as.numeric( length( all_genes))
        sub_feature_list <- unique(df_syng_complete[
          df_syng_complete$Immune_feature %in% colnames( data_feature),]$Immune_feature)
        N_feature <- as.numeric( length( sub_feature_list))
        sub_imcell_feature_list <- unique(df_syng_complete[
          df_syng_complete$Immune_feature %in% colnames( TCGA_IMCell_fraction ),]$Immune_feature)
        N_imcell_feature <-as.numeric( length( sub_imcell_feature_list))
        if(missing(N_iteration_specificity)){
          N_iteration_specificity <- 1000
        }

        message("  Building bootstrapping distribution from ", N_iteration_specificity ," genes")

        #Build random bank of expression values---------
        df_bank <- data.frame()
        while(ncol(df_bank)==0){
          df_bank <- as.data.frame(t(data_expression[
            c(sample(nrow(data_expression),N_iteration_specificity ,replace = T),
              which(rownames(data_expression) %in% all_genes)),]))
          df_bank <-  df_bank[, colSums(abs(df_bank)!= 0 ) > 0]
          df_bank <- df_bank[!duplicated(as.list(df_bank))]
        }
        df_bank <- df_bank[,c(all_genes, setdiff(colnames(df_bank),all_genes))]

        #Construct quantile ranking matrices for bank---

        df_bank <- scale(log2(df_bank+1),center = T,scale = T)
        df_bank_qr <- get_quantile_rank(df_bank)

        tmp <- as.data.frame(df_bank)
        tmp$PATIENT_BARCODE <- substr(rownames(tmp), 1, 12)
        tmp <- as.data.table(tmp)
        tmp <- tmp[,lapply(.SD,median),by=PATIENT_BARCODE]
        tmp<- as.data.frame(tmp)
        df_bank2 <- as.matrix(tmp[,-which(colnames(tmp)=="PATIENT_BARCODE")])
        rownames(df_bank2)<- tmp$PATIENT_BARCODE
        df_bank_qr2 <- get_quantile_rank(df_bank2)

        df_bank_qr <- as.matrix(df_bank_qr[,-1])
        df_bank_qr2 <- as.matrix(df_bank_qr2[,-1])

        #Calculate p.values for each feature----

        message("Calculating pvalues...")
        if(N_feature > 0){
          for( i in 1:N_feature){

            #Select feature
            select_feature <- sub_feature_list[i]
            mark_feature <- as.integer( which( colnames( data_feature) == select_feature))
            df_feature <- as.matrix(data_feature[ , mark_feature, drop=F ])
            message( "          ...",select_feature)

            #Make a list of all genes with non-NA synergy score value
            df_syng_t <- df_syng_complete[ df_syng_complete$Immune_feature == select_feature,]
            sub_genes1 <- unique( df_syng_t[ ,.( agent1 , agent1_expression)])
            sub_genes2 <- unique( df_syng_t[ ,.( agent2 , agent2_expression)])
            colnames(sub_genes1)<-c("gene","effect")
            colnames(sub_genes2)<-c("gene","effect")
            sub_genes <- unique( rbind(sub_genes1,sub_genes2))

            syng_dist <- vector( "list", nrow( sub_genes ))
            names(syng_dist) <- paste0(sub_genes$gene,"_",sub_genes$effect)

            df_bank_sub1 <- cbind(df_feature,
                                  df_bank_qr[match(rownames(df_feature),rownames(df_bank_qr)),])
            df_bank_sub1 <- df_bank_sub1[complete.cases(df_bank_sub1),]
            bank_size <- as.numeric(ncol(df_bank_sub1))
            bank_mark <- N_genes+2
            df_bank_sub1_cols <- colnames( df_bank_sub1)

            #Build bootstrapping distribution
            tmpn <- nrow(sub_genes)
            pb <- txtProgressBar(min = 0, max = tmpn, char="-",style = 3)

            for( j in 1 :tmpn){
              gene_mark <- as.numeric( which(  df_bank_sub1_cols  == sub_genes$gene[ j]))
              df_bank_sub2 <- df_bank_sub1[ , c(1, gene_mark, bank_mark : bank_size)]
              df_bank_sub2 <- df_bank_sub2[ df_bank_sub2[ , 2 ] %in% c( 1, 4), ]
              bank_size2 <- as.numeric( ncol( df_bank_sub2))
              gene_effect <- sub_genes$effect[j]

              syng_dist_t <- vector()
              for(k in 3:bank_size2){
                dft <- df_bank_sub2[,c(1,2,k)]
                dft <- dft[dft[,3] %in% c(1,4),,drop=F]
                if(nrow(dft)>0){
                  syng_dist_t[k-2] <- find_a_synergy(fdata = dft,
                                                     method = method,
                                                     oncogene1 = gene_effect)$Synergy_score
                }
              }
              syng_dist_t <- syng_dist_t[complete.cases(syng_dist_t)]
              syng_dist[[j]] <- syng_dist_t
              setTxtProgressBar(pb, j)
            }

            #Calculate p.values for all pairs

            for(pair_ID in 1:nrow(df_syng_t)){

              tmp_row <- df_syng_t[pair_ID,]
              gene1 <- tmp_row$agent1[1]
              gene2 <- tmp_row$agent2[1]
              effect1 <- tmp_row$agent1_expression[1]
              effect2 <- tmp_row$agent2_expression[1]
              myscore <- tmp_row$Synergy_score[1]
              mysign <- sign(myscore)
              if( mysign==0) mysign <- 1

              j1 <- which( names( syng_dist) == paste0(gene1,"_",effect1))
              tmp_dist <- syng_dist[[j1]]
              Pvalue1 <- sum(mysign*tmp_dist > mysign*myscore, tmp_dist==myscore )/length(tmp_dist)
              j2 <- which(names(syng_dist) == paste0(gene2,"_",effect2))
              tmp_dist <- syng_dist[[j2]]
              Pvalue2 <- sum(mysign*tmp_dist > mysign*myscore, tmp_dist==myscore )/length(tmp_dist)
              df_syng <- df_syng[.(disease,gene1,gene2,select_feature,effect1,effect2),
                                 specificity_pvalue :=max(Pvalue1, Pvalue2)]
            }
          }
        }
        #Calculate p.value for each immune cell count------------------------------------
        if( N_imcell_feature > 0){
          for( i in 1:N_imcell_feature){

            #Select feature
            select_feature <- sub_imcell_feature_list[i]
            mark_feature <- as.integer( which( colnames(data_cell) == select_feature))
            df_feature <- as.matrix(data_cell[ , mark_feature,drop=F])
            message( "          ...",select_feature)

            #Make a list of all genes with non-NA synergy score value
            df_syng_t <- df_syng_complete[df_syng_complete$Immune_feature==select_feature,]
            sub_genes1 <- unique( df_syng_t[ ,.( agent1 , agent1_expression)])
            sub_genes2 <- unique( df_syng_t[ ,.( agent2 , agent2_expression)])
            colnames(sub_genes1)<-c("gene","effect")
            colnames(sub_genes2)<-c("gene","effect")
            sub_genes <- unique( rbind(sub_genes1,sub_genes2))

            syng_dist <- vector( "list", nrow( sub_genes ))
            names(syng_dist) <- paste0(sub_genes$gene,"_",sub_genes$effect)

            df_bank_sub1 <- cbind(df_feature,
                                  df_bank_qr2[match(rownames(df_feature),rownames(df_bank_qr2)),])
            df_bank_sub1 <- df_bank_sub1[complete.cases(df_bank_sub1),]
            bank_size <- as.numeric(ncol(df_bank_sub1))
            bank_mark <- N_genes+2
            df_bank_sub1_cols <- colnames( df_bank_sub1)

            #Build bootstrapping distribution
            tmpn <- nrow(sub_genes)
            pb <- txtProgressBar(min = 0, max = tmpn, char="-",style = 3)

            for( j in 1 :tmpn){

              gene_mark <- as.numeric( which( df_bank_sub1_cols == sub_genes$gene[ j]))
              df_bank_sub2 <- df_bank_sub1[ , c(1, gene_mark, bank_mark : bank_size)]
              df_bank_sub2 <- df_bank_sub2[ df_bank_sub2[ , 2 ] %in% c( 1, 4),,drop=F ]
              bank_size2 <- as.numeric( ncol( df_bank_sub2 ))
              gene_effect <- sub_genes$effect[j]

              syng_dist_t <- vector()
              for(k in 3:(bank_size2)){
                dft <- df_bank_sub2[,c(1,2,k)]
                dft <- dft[dft[,3] %in% c(1,4),,drop=F]
                if(nrow(dft)>0){
                  syng_dist_t[k-2] <- find_a_synergy(fdata = dft,
                                                     method = method,
                                                     oncogene1 = gene_effect)$Synergy_score
                }else{ syng_dist_t[k-2] <- NA }
              }
              syng_dist_t <- syng_dist_t[complete.cases(syng_dist_t)]
              syng_dist[[j]] <- syng_dist_t
              setTxtProgressBar(pb, j)
            }

            #Calculate p.value for all pairs

            for(pair_ID in 1:nrow(df_syng_t)){
              tmp_row <- df_syng_t[pair_ID,]
              gene1 <- tmp_row$agent1[1]
              gene2 <- tmp_row$agent2[1]
              effect1 <- tmp_row$agent1_expression[1]
              effect2 <- tmp_row$agent2_expression[1]
              myscore <- tmp_row$Synergy_score[1]
              mysign <- sign(myscore)
              if( mysign==0) mysign <- 1

              j1 <- which( names( syng_dist) == paste0(gene1,"_",effect1))
              tmp_dist <- syng_dist[[j1]]
              Pvalue1 <- sum( mysign*tmp_dist > mysign*myscore, tmp_dist==myscore )/length(tmp_dist)
              j2 <- which(names(syng_dist)==paste0(gene2,"_",effect2))
              tmp_dist <- syng_dist[[j2]]
              Pvalue2 <- sum( mysign*tmp_dist > mysign*myscore, tmp_dist==myscore )/length(tmp_dist)
              df_syng <- df_syng[.(disease,gene1,gene2,select_feature,effect1,effect2),
                                 specificity_pvalue :=max(Pvalue1, Pvalue2)]
            }
          }
        }
      }
    }
    #Check if robustness analysis should be done---------------------------

    if(missing(sensitivity)){
      sensitivity <- FALSE
    }
    if(sensitivity){
      if(missing(N_iteration_sensitivity)){
        N_iteration_sensitivity <- 1000
      }
      message("\nCalculating robustness R. Iterating ", N_iteration_sensitivity, " times:")


      #Define parameters
      df_syng_complete <- df_syng[ !is.na( df_syng$Synergy_score),]
      if( nrow(df_syng_complete) > 0 ){
        df_syng_complete$sum <- 0
        df_syng_complete$sum2 <- 0
        df_syng_complete$N <- 0

        dft1 <- as.data.frame(df_selected)
        dft2 <- as.data.frame(df_icp)
        dft1$ID <- rownames(dft1)
        dft2$ID <- rownames(dft2)
        df_comb <- merge(dft1,dft2)
        rownames(df_comb) <- df_comb$ID
        df_comb$ID <- NULL
        df_comb <- as.matrix(df_comb)

        N_sample <- as.numeric(nrow(df_comb))
        N_sub <- floor(N_sample*0.7)


        df_syng_complete <- df_syng[ !is.na( df_syng$Synergy_score),]
        df_syng_complete <- df_syng_complete[ which(df_syng_complete$Disease==disease), ]
        df_syng_complete1 <- df_syng_complete[df_syng_complete$Immune_feature %in%
                                                colnames(data_feature),]
        df_syng_complete2 <- df_syng_complete[df_syng_complete$Immune_feature %in%
                                                colnames(data_cell),]

        N_syng_complete1 <- nrow(df_syng_complete1)
        N_syng_complete2 <- nrow(df_syng_complete2)

        pb <- txtProgressBar(min = 0, max = N_iteration_sensitivity, char="-",style = 3)

        for(n_sns in 1:N_iteration_sensitivity){

          df_sub <- df_comb[sample(N_sample,N_sub,replace = F),]
          df_sub_qr <- get_quantile_rank(df_sub)


          tmp <- as.data.frame(df_sub)
          tmp$PATIENT_BARCODE <- substr(rownames(tmp), 1, 12)
          tmp <- as.data.table(tmp)
          tmp <- tmp[,lapply(.SD,median),by=PATIENT_BARCODE]
          tmp<- as.data.frame(tmp)
          df_sub2 <- as.matrix(tmp[,-which(colnames(tmp)=="PATIENT_BARCODE")])
          rownames(df_sub2)<- tmp$PATIENT_BARCODE
          df_sub_qr2 <- get_quantile_rank(df_sub2)
          df_sub_qr <- as.matrix(df_sub_qr[,-1])
          df_sub_qr2 <- as.matrix(df_sub_qr2[,-1])

          dft <- data_expression[,colnames(data_expression) %in% rownames(df_sub)]
          data_feature <- get_features(dft)

          if(is.data.frame(select_iap)){
            data_feature <- as.data.frame(merge(data_feature , select_iap, by="Tumor_Sample_ID"))
          }
          rownames(data_feature)<- data_feature$Tumor_Sample_ID
          data_feature<- as.matrix(data_feature[,-1])

          data_cell <- TCGA_IMCell_fraction
          rownames(data_cell)<- data_cell$PATIENT_BARCODE
          data_cell <- as.matrix(data_cell[,-1])


          for(pair_ID in 1:N_syng_complete1 ){
            tmp_df <- df_syng_complete1[pair_ID,]
            gene_ID1 <- tmp_df$agent1
            gene_ID2 <- tmp_df$agent2
            base_score <- tmp_df$Synergy_score
            effect1 <- tmp_df$agent1_expression
            effect2 <- tmp_df$agent2_expression
            feature_ID <- tmp_df$Immune_feature
            mark_feature <- as.integer( which( colnames( data_feature) == feature_ID))
            df_feature <- as.matrix(data_feature[ , mark_feature,drop=F])

            dft <- df_sub_qr[ , c(which(colnames(df_sub_qr)==gene_ID1),
                                  which(colnames(df_sub_qr)==gene_ID2))]
            dft <- dft[dft[ , 1] %in% c(1 , 4) ,,drop=F ]
            dft <- dft[dft[ , 2] %in% c(1 , 4) ,,drop=F ]

            dft <- cbind(df_feature[match(rownames(dft),rownames(df_feature)),,drop=F], dft)
            dft <- dft[complete.cases(dft),,drop=F]

            if(nrow(dft)>0){
              dfts <- find_a_synergy(fdata = dft,
                                     method = method,
                                     oncogene1 = effect1,
                                     oncogene2 = effect2)$Synergy_score
            }else{
              dfts <- NA
            }

            df_syng_complete1$sum[pair_ID] <- sum(df_syng_complete1$sum[pair_ID],
                                                  dfts, na.rm = T)
            dfts <- dfts -base_score
            df_syng_complete1$sum2[pair_ID] <- sum(df_syng_complete1$sum2[pair_ID],
                                                   dfts*dfts, na.rm = T)
            df_syng_complete1$N[pair_ID] <- sum(df_syng_complete1$N[pair_ID],
                                                abs(sign(dfts)), na.rm = T)
          }
          if(N_syng_complete2>0){
            for(pair_ID in 1:N_syng_complete2 ){

              tmp_df <- df_syng_complete2[pair_ID]
              gene_ID1 <- tmp_df$agent1
              gene_ID2 <- tmp_df$agent2
              base_score <- tmp_df$Synergy_score
              effect1 <- tmp_df$agent1_expression
              effect2 <- tmp_df$agent2_expression
              feature_ID <- tmp_df$Immune_feature
              mark_feature <- as.integer( which( colnames(data_cell) == feature_ID))
              df_feature <- as.matrix(data_cell[ , mark_feature,drop=F])

              dft <- df_sub_qr2[ , c(which(colnames(df_sub_qr2)==gene_ID1),
                                     which(colnames(df_sub_qr2)==gene_ID2))]
              dft <- dft[dft[ , 1] %in% c(1 , 4) ,,drop=F ]
              dft <- dft[dft[ , 2] %in% c(1 , 4) ,,drop=F ]
              dft <- cbind(df_feature[match(rownames(dft),rownames(df_feature)),,drop=F], dft)
              dft <- dft[complete.cases(dft),,drop=F]
              if(nrow(dft)>0){
                dfts <- find_a_synergy(fdata = dft,
                                       method = method,
                                       oncogene1 = effect1,
                                       oncogene2 = effect2)$Synergy_score
              }else{
                dfts <- NA
              }

              df_syng_complete2$sum[pair_ID] <- sum(df_syng_complete2$sum[pair_ID],
                                                    dfts, na.rm = T)
              dfts <- dfts -base_score
              df_syng_complete2$sum2[pair_ID] <-sum(df_syng_complete2$sum2[pair_ID],
                                                    dfts*dfts, na.rm = T)
              df_syng_complete2$N[pair_ID] <- sum(df_syng_complete2$N[pair_ID],
                                                  abs(sign(dfts)), na.rm = T)

            }
          }
          setTxtProgressBar(pb, n_sns)
        }
        if(nrow(df_syng_complete2>0)){
          df_syng_complete <- rbind(df_syng_complete1,df_syng_complete2)
        }else{
          df_syng_complete <- df_syng_complete1
        }
        df_syng_complete$sensitivity_R <- sqrt(df_syng_complete$sum2/df_syng_complete$N) / abs(df_syng_complete$Synergy_score)
        df_syng_complete$sum <- NULL
        df_syng_complete$sum2 <- NULL
        df_syng_complete$N <- NULL
        rownames(df_syng_complete) <- NULL
        df_syng<- df_syng[df_syng_complete, sensitivity_R := i.sensitivity_R]
      }
    }
    df_syng <- as.data.frame(df_syng)
    df_syng_all <- bind_rows(df_syng_all,df_syng )
    message("\n",disease," completed\n")
  }
  df_syng_all[df_syng_all=="DKFZp686O24166"]<-"NCR3LG1"
  df_syng_all[df_syng_all=="C10orf54"]<-"VSIR"
  colnames(df_syng_all)[1:7] <- c("Disease",
                                  "Gene1","Gene2",
                                  "IAP","Synergy_score",
                                  "Gene1_expression","Gene2_expression")
  return(df_syng_all)
}
Bozorgui/imogene documentation built on Feb. 21, 2022, 11:57 a.m.