knitr::opts_chunk$set(echo = TRUE)

Creating data object

Loading all the relevant data and creating the data object with the mutation and viability scores for each type of cancer.

library(reshape2)
library(dplyr)
library(viridis)
library(ggplot2)
library(nem)
library(biclust)
library(UpSetR)

# causal packages
library(tableone)
library(ipw)
# library(sandwich) #for robust variance estimation
# library(survey)
library(Matching)
library(slidr)

# Set path to all data
base_folder = "~/GitHub/CrispR/Data/"

# Loading mutations for all cell lines
all_cancers_mut_df <- read.delim(paste0(base_folder,
                                       "MutationFiles/MutationsCCLE/CCLE_maf_allcancers.txt"),
                                stringsAsFactors = FALSE, header = TRUE)
# Filtering out silent mutations
all_cancers_mut_df <- all_cancers_mut_df %>% 
                        dplyr::filter(Variant_Classification != "Silent" )

# Load annotation file and meta data
cellline_annot <-  read.csv(paste0(base_folder, "ProjectDRIVE/TableS2.csv"),
                            stringsAsFactors = F)
meta_data <- read.csv(paste0(base_folder, "MutationFiles/File_metadata.csv"), 
                      header = TRUE, stringsAsFactors = FALSE)
Primary_sites <-   meta_data$Primary_site

# Load log2 copy number data
CN_df <- read.delim(paste0(base_folder,"/CopyNumber/CCLE_copynumber_byGene.txt"),
                     stringsAsFactors = F, header = T)

# Load Gistic copy number data
CN_df_gistic <- read.delim(paste0(base_folder,"/CopyNumber/CCLE_CNA_Integers.txt"),
                            stringsAsFactors = F, header=T)
CN_df_gistic <- CN_df_gistic[-which(duplicated(CN_df_gistic$Hugo_Symbol)),]

# Getting essential genes
data_rsa <- readRDS(paste0(base_folder, "ProjectDRIVE/DRIVE_RSA_data.RDS"))

# Remove the essential data from ATARiS normalised data
data_ataris <- readRDS(paste0(base_folder, "ProjectDRIVE/DRIVE_ATARiS_data.RDS"))

# Getting a list of cell lines for each cancer type
cellline_list <- lapply(Primary_sites, 
                        function(x){slidr::getCelllines(x, cellline_annot, meta_data)})
names(cellline_list) <- Primary_sites

# Getting a list of essential genes for each cancer type
essen_gene_list <- lapply(names(cellline_list), 
                        function(x){slidr::getEssentialGenes(x = x, 
                                                             celllines = cellline_list[[x]], 
                                                             data = data_rsa)})
names(essen_gene_list) <- Primary_sites

all_data = list()
fdr = 0.05
min_Nmut = 2

# Generate objects for each primary site
all_data <- lapply(Primary_sites, 
                   function(x){ slidr::prepareDataObjects(data = data_ataris, 
                                                      x = x, 
                                                      fdr = fdr, 
                                                      min_Nmut = min_Nmut, 
                                                      all_cancers_mut_df = all_cancers_mut_df, 
                                                      CN_df = CN_df, #CN_df_gistic 
                                                      gistic = FALSE, #TRUE 
                                                      celllines = cellline_list[[x]], 
                                                      meta_data = meta_data, 
                                                      essential_genes = essen_gene_list[[x]])})
names(all_data) <- Primary_sites

Identify SL hits

Once the data is processed in the correct format, use the identifySLHits function to get the mutation-specific SL partner for each cancer type.

path_results <- "~/Downloads/Slidr_Results_new/"

hits <- lapply(all_data, 
               function(x){slidr::identifySLHits(canc_data = x, path_results = path_results, WT_pval_thresh = 0.1)})
names(hits) <- names(all_data)

save.image(paste0(path_results, "ProcessedData.Rdata"))

Pan Cancer Analysis

Creating the data object for the pan-cancer analysis.

# Choosing cell lines with both CN and viability data and the driver genes
pc_celllines  <- intersect(colnames(data_ataris),tolower(colnames(CN_df)))

# driver_genes  <- read.delim(paste0(base_folder,
#                                        "MutationFiles/Cell2018_mutations.txt"),
#                                 stringsAsFactors = FALSE, header = FALSE)[[1]]
driver_genes  <- unique(all_cancers_mut_df$Hugo_Symbol)
# Getting the essential genes
pc_essen_gene <- slidr::getEssentialGenes(x = NULL, data = data_rsa, celllines = pc_celllines)
# Removing the essential genes
pc_data <- list(viabilities = NULL,
                mutations = NULL,
                CNalterations = NULL,
                mutation_annot = NULL,
                primary_site = "pan_cancer")

data_ataris <- as.data.frame(t(apply(data_ataris,
                                      1,
                                      function(x){
                                           x[which(is.na(x))] <- mean(x, na.rm = TRUE)
                                           x})))
pc_data$viabilities <- data_ataris[!rownames(data_ataris) %in% pc_essen_gene,pc_celllines]

# Threshold for removing drivers with fewer than 8% mutated samples
mut_pc     <- 8 #2         
mut_thresh <- floor(mut_pc * length(pc_celllines) / 100)
CN_Thr     <- 1 # Threshold to choose only deep deletions
# Binary mutation data
mut_mat    <- slidr::prepareMutMat(x = NULL, 
                                  driver_genes = driver_genes, 
                                  samples = pc_celllines,
                                  all_cancers_mut_df = all_cancers_mut_df)

pc_data$CNalterations <- slidr::prepareCNMat(CN_df = CN_df, 
                                             samples = pc_celllines,
                                             driver_genes = driver_genes, 
                                             x = NULL)

# binarized copy number data with only deep deletions 
CN_bin <- binarize(2 - pc_data$CNalterations, threshold = CN_Thr)
CN_bin[is.na(CN_bin)] <- 0

# Updating copy numbers in mutation matrix
pc_data$mutations <- binarize((CN_bin + mut_mat), threshold = 0)

# Removing genes with < 8% mutated samples
pc_data$mutations   <- pc_data$mutations[rowSums(pc_data$mutations) >= mut_thresh, ]

# Create a mutation annotation file
mut_mat_annot <- melt(pc_data$mutations)
mut_mat_annot <- mut_mat_annot %>% dplyr::filter(value != 0)
colnames(mut_mat_annot) <- c("Hugo_Symbol","Cell_Line","Mut_Status")
# Use copy number info in the mutation annotation file
all_cancers_mut_df$Tumor_Sample_Barcode <- tolower(all_cancers_mut_df$Tumor_Sample_Barcode)
CN_alterations_df <- melt(pc_data$CNalterations)
colnames(CN_alterations_df) <- c("Hugo_Symbol","Cell_Line","CN_Value")

# Annotating mutations and adding copy number information
pc_data$mutation_annot <- left_join(mut_mat_annot,all_cancers_mut_df,
                              by = c("Hugo_Symbol" = "Hugo_Symbol","Cell_Line" = "Tumor_Sample_Barcode")) %>%
                          dplyr::select(Hugo_Symbol, Cell_Line, Mut_Status, Variant_Classification) %>%
                          dplyr::group_by(Hugo_Symbol,Cell_Line,Mut_Status) %>% 
                          dplyr::summarise(Variant_Classification = paste(Variant_Classification, collapse=";")) 

pc_data$mutation_annot <- left_join(pc_data$mutation_annot, CN_alterations_df, by = c("Hugo_Symbol","Cell_Line"))

# Replacing all the na
pc_data$mutation_annot[is.na(pc_data$mutation_annot)] <- 0

save.image("~/Downloads/Slidr_Results_new/pancan.Rdata")

Identify Synthetic lethals from pan cancer data.

# Path for results
path_results <- "~/Downloads/Slidr_Results_new/"

hits_pancan <- slidr::identifySLHits(canc_data = pc_data, path_results = path_results, WT_pval_thresh = 0, fp_thresh = 1)

Plotting p-values of the SL pairs and identifying partners that are partners for many drivers.

hits_pancan <- read.delim("~/Downloads/Slidr_Results/Pan_cancer_8pc/Hit_List/SL_hits_pan_cancer.txt",stringsAsFactors = F)
hits_pancan$sl_partner_gene <- sapply(hits_pancan$sl_partner_gene, function(x){strsplit(x, ",")[[1]][1]})

p1 <- ggplot(hits_pancan, aes(x=driver_gene, y=sl_partner_gene, size = -log(mut_pvalue, base = 10))) + 
      geom_point(fill = "#055C95",color="#055C95", alpha=0.45) + 
      scale_size_continuous(range=c(2, 10)) +
      theme_bw() +
      xlab("Driver genes") +
      ylab("SL partner genes") +
      labs(size = "-log(p-value)")+
      theme(panel.grid.major = element_line(size = 0.1),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
            axis.ticks.x = element_blank(),
            axis.ticks.y = element_blank(),
            axis.text.y=element_text(angle=0, size=11,colour="#525252"),
            axis.text.x=element_text(angle=90, size=11,vjust=0.5 ,hjust=1, colour="#525252"),
            axis.title.x=element_text(angle=0, size=20,colour="#525252"),
            axis.title.y=element_text(angle=90, size=20,colour="#525252"),
            legend.position="bottom",
            legend.title = element_text(angle=0, size=13,colour="#525252"),
            legend.text = element_text(size=13,colour="#525252"),
            legend.key.size = unit(0.75, "cm")) 
ggsave(filename = "~/Downloads/Slidr_Results/Pan_cancer_8pc/SLpairs_8pc_bubblePlot.pdf", p1, width = 18, height = 17, units = "in")

Causal inference

set.seed(478)
hits_pancan <- read.delim("~/Downloads/Slidr_Results/Pan_cancer_8pc/Hit_List/SL_hits_pan_cancer.txt",stringsAsFactors = F)

load("~/Downloads/Slidr_Results/Pan_cancer_8pc/pancan.Rdata")

causal_hits <- data.frame("driver_gene" = character(), 
                          "sl_partner_gene" = character(), 
                          "pval" = numeric())

grouped_hits <- hits_pancan %>% 
                dplyr::group_by(sl_partner_gene) %>% 
                summarise(drivers = paste(driver_gene, collapse = ","))

# set the caliper for matching
set_cal = 0.1

for(i in 1:nrow(grouped_hits)){
  temp_drivers <- sapply(grouped_hits$drivers[i], function(x){strsplit(x,",")[[1]]})
  if(length(temp_drivers) >= 2){ # running for two or more genes
    # defining the causal df
    sl_gene    <- grouped_hits$sl_partner_gene[i]
    causal_df  <- t(pc_data$mutations[temp_drivers,])
    causal_df  <- cbind.data.frame(t(pc_data$viabilities[sl_gene,rownames(causal_df)]), causal_df)
    # To avoid confusion between driver and same gene as target
    colnames(causal_df)[1]  <- "Viability"
    causal_df[,"Viability"] <- 1 - binarize(causal_df[,"Viability"], threshold = median(causal_df[,"Viability"]))
    colnames(causal_df)     <- gsub('-', "_", colnames(causal_df))
    temp_drivers            <- gsub('-', "_", temp_drivers)

    for(j in 1: length(temp_drivers)){
      # t_res <- NULL  
      t_res <- data.frame("smd_sum" = numeric(),
                          "p_val" = numeric())  
      for(k in 1:50){
        # Shuffling the rows for matching
        temp_df   <- causal_df[sample(1:nrow(causal_df)),]
        # values of viabilities and treatment var
        y         <- temp_df$Viability
        tr_gene   <- temp_drivers[j]
        # covariates
        xvars     <- temp_drivers[-j]
        table1    <- CreateTableOne(vars = xvars, 
                                    strata = tr_gene, 
                                    data = temp_df, 
                                    test = FALSE)
        #print(table1,smd=TRUE)

        # Propensity matching
        psmodel <- glm(as.formula(paste0(tr_gene," ~" ,paste(xvars, collapse = " + "))),
                       family = binomial(),
                       data = temp_df)

        #create propensity score
        pscore <- psmodel$fitted.values

        #do greedy matching on logit(PS) using Match with a caliper
        logit   <- function(p) {log(p)-log(1-p)}
        err_mes <- try(Match(Tr = temp_df[,tr_gene],
                             M = 1,
                             X = logit(pscore),
                             replace = FALSE,
                             caliper = set_cal, 
                             estimand = "ATT"), silent = FALSE)
        if(is.na(err_mes[[1]])){
          # t_res <- c(t_res,NaN)
          t_res <- rbind.data.frame(t_res,
                                    cbind.data.frame(smd_sum = NaN, p_val = NaN))
        }else{
          psmatch <- Match(Tr = temp_df[,tr_gene],
                           M = 1,
                           X = logit(pscore),
                           replace = FALSE,
                           caliper = set_cal, 
                           estimand = "ATT")
          matched <- temp_df[unlist(psmatch[c("index.treated","index.control")]), ]

          #get standardized differences
          matchedtab1 <- CreateTableOne(vars=xvars, strata =tr_gene, 
                                        data=matched, test = FALSE)
          #print(matchedtab1, smd = TRUE)
          temp_smd_sum <- sum(ExtractSmd(matchedtab1))

          #outcome analysis
          y_trt <- matched$Viability[matched[,tr_gene] == 1]
          y_con <- matched$Viability[matched[,tr_gene] == 0]

          #pairwise difference
          diff_y <- y_trt - y_con

          # paired t-test
          if(class(try(t.test(diff_y), silent = TRUE)) == "try-error"){
            t_res <- rbind.data.frame(t_res,
                                      cbind.data.frame(smd_sum = temp_smd_sum, p_val = NaN))
          }else
            t_res <- rbind.data.frame(t_res,
                                      cbind.data.frame(smd_sum = temp_smd_sum, p_val = t.test(diff_y)$p.value))
        }
        # t_res <- t_res[!is.nan(t_res)]

      # print(length(t_res))
      # print(pchisq(-2 * sum(log(t_res)),
      #          df = 2 * length(t_res),
      #          lower.tail=FALSE))

      # comb_pval <- pchisq(-2 * sum(log(t_res)),
      #                     df = 2 * length(t_res),
      #                     lower.tail = FALSE)
      # 
      }
      # colnames(t_res) <- c("smd_sum", "p_val")
      t_res           <- t_res[!is.nan(t_res$p_val),]
      # causal_hits <- rbind.data.frame(causal_hits,
      #                                 cbind.data.frame(tr_gene, sl_gene, comb_pval))

      # if all viabilities are equal then p-value will be NaN and hence we set the value to 1
      if(nrow(t_res) != 0){
        # setting p-val to median p-val when you have multiple samples with same min smd
        if(sum(t_res$smd_sum == min(t_res$smd_sum)) > 1){
          fin_pval <- t_res %>% 
                      dplyr:: filter(smd_sum == min(t_res$smd_sum)) %>%
                      dplyr::summarize(x = mean(p_val))
        }else
          fin_pval <- t_res$p_val[which.min(t_res$smd_sum)]
      }else{
        fin_pval <- NaN
      }
      causal_hits <- rbind.data.frame(causal_hits,
                                      cbind.data.frame(driver_gene = tr_gene, 
                                                       sl_partner_gene = sl_gene, 
                                                       pval = as.numeric(fin_pval)))
    }
  }

}

# retaining pairs for which true causal hit exists 
best_causal_hits <- causal_hits %>% 
                      dplyr::group_by(sl_partner_gene) %>%
                      dplyr::slice(which.min(pval))

# filtering out the pairs for which true causal hit exists 
insig_causal_hits <- causal_hits %>% 
                      dplyr::group_by(sl_partner_gene) %>%
                      dplyr::slice(-which.min(pval))

# Remove the insignificant causal pairs from the total pancancer hits
filt_hits_pancan <- hits_pancan %>% 
                      dplyr::anti_join(insig_causal_hits, by = c("driver_gene", "sl_partner_gene"))

Plotting p-values for a given pair across different cancers.

# hits_pancan <- read.delim("~/Downloads/Slidr_Results/Pan_cancer_8pc/Hit_List/SL_hits_pan_cancer.txt",stringsAsFactors = F)
# 
# load("~/Downloads/Slidr_Results/Pan_cancer_8pc/pancan.Rdata")

load("~/Downloads/Slidr_Results/Pan_cancer_8pc/causal_res.RData")

original_hits_pancan <- hits_pancan
hits_pancan          <- filt_hits_pancan

# create data frame for each pair with frequency of different celllines
canc_type_df <- do.call(rbind.data.frame, lapply(1:nrow(hits_pancan), function(x){
                                                  mutations     <- pc_data$mutations
                                                  mut_celllines <- names(which(mutations[hits_pancan$driver_gene[x],] == 1))
                                                  cancer_types  <- sapply(mut_celllines, 
                                                                        function(x){paste(strsplit(x,"_")[[1]][-1], collapse = "_")})
                                                  cancer_types  <- as.data.frame(table(cancer_types))
                                                  data.frame(hits_pancan$driver_gene[x], hits_pancan$sl_partner_gene[x], cancer_types)
                                                #}

}))

colnames(canc_type_df) <- c("driver_gene", "sl_partner_gene", "canc_type", "n_samples")

min_Nmut = 2

# get the p-value for each pair and type of cancer
pval_df <- do.call(rbind.data.frame, lapply(1:nrow(canc_type_df), function(x){
                                          canc_type        <- as.character(canc_type_df$canc_type[x])
                                          driver           <- as.character(canc_type_df$driver_gene[x])
                                          sl_partner       <- as.character(canc_type_df$sl_partner_gene[x])
                                          celllines        <- grep(canc_type,colnames(pc_data$mutations))
                                          mut_samples      <- sum(pc_data$mutations[driver,celllines])
                                          if(mut_samples >= min_Nmut & (length(celllines) - mut_samples) >= min_Nmut){
                                            temp             <- pc_data
                                            temp$mutations   <- pc_data$mutations[,celllines]
                                            temp$viabilities <- pc_data$viabilities[,celllines]
                                            c(canc_type,slidr::getPval(temp,driver,sl_partner))
                                          }else
                                            c(canc_type,driver,sl_partner,1,1)

}))

colnames(pval_df)          <- c("canc_type","driver_gene", "sl_partner_gene", "WT_pvalue", "mut_pvalue")
summary_df                 <- left_join(canc_type_df, pval_df, by = c("driver_gene","sl_partner_gene","canc_type"))
summary_df$sl_partner_gene <- sapply(summary_df$sl_partner_gene, function(x){strsplit(x, ",")[[1]][1]})
summary_df                 <- cbind(paste(summary_df$driver_gene,summary_df$sl_partner_gene, sep = "_"), 
                                    summary_df,
                                    rowSums(pc_data$mutations[summary_df$driver_gene,]))
colnames(summary_df)[c(1,8)]  <- c("sl_pairs","tot_samples")
summary_df                    <- summary_df %>% dplyr::filter(mut_pvalue != 1 & WT_pvalue != 1)

# Using bubble plot to summarize the p-values in each cancer type
q1 <- ggplot(summary_df, aes(y=canc_type, x=sl_pairs, size = -log(as.numeric(as.character(mut_pvalue)), base = 10))) + 
      geom_point(fill = "#055C95",color="#055C95", alpha=0.45) + 
      scale_size_continuous(range=c(2, 8)) +
      theme_bw() +
      ylab("Primary sites") +
      xlab("Driver and SL partner genes") +
      labs(size = "-log(p-value)")+
      theme(panel.grid.major = element_line(size = 0.1),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
            axis.ticks.x = element_blank(),
            axis.ticks.y = element_blank(),
            axis.text.x = element_text(angle = 90, vjust=0.5, hjust=1, size=13,colour="#525252"),
            axis.text.y = element_text(angle = 0, size=14, vjust=0.5, hjust=1, colour="#525252"),
            axis.title.x=element_text(angle = 0, size=20,colour="#525252"),
            axis.title.y=element_text(angle=90, size=20,colour="#525252"),
            legend.position="bottom",
            legend.title = element_text(angle=0, size=15,colour="#525252"),
            legend.text = element_text(size=13,colour="#525252"),
            legend.key.size = unit(0.75, "cm")) +
    scale_y_discrete(labels = c("Autonomic ganglia", "Bone", "Breast", "CNS", "Endometrium", "Blood", "Kidney", "Large intestine", "Liver", "Lung", "Oesophagus", "Ovary", "Pancreas", "Skin", "Soft tissue", "Stomach", "UADT", "Urinary tract") )# No Pleura after causal filtering

# Using a heatmap to describe the p-values in each cancer type
q2 <- ggplot(summary_df, aes(x = sl_pairs,y = canc_type)) +
        geom_tile(aes(fill = -log(as.numeric(as.character(mut_pvalue))))) +
        scale_fill_viridis(option="D", begin = 0, end = 1, alpha = 0.9) +
        theme_bw() +
        ylab("Primary sites") +
        xlab("Driver and SL partner genes") +
        labs(fill = "-log(p-value)")+
        theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            axis.text.x = element_text(angle = 90, vjust=0.5, hjust=1, size=13,colour="#525252"),
            axis.text.y = element_text(angle = 0, size=14, vjust=0.5, hjust=1, colour="#525252"),
            axis.title.x=element_text(angle = 0, size=20,colour="#525252"),
            axis.title.y=element_text(angle=90, size=20,colour="#525252"),
            legend.position="bottom",
            legend.title = element_text(angle=0, size=15,colour="#525252"),
            legend.text = element_text(size=13,colour="#525252"),
            legend.key.size = unit(0.9, "cm"))+
            #panel.border = element_blank()) +
        scale_y_discrete(labels = c("Autonomic ganglia", "Bone", "Breast", "CNS", "Endometrium", 
                                    "Blood", "Kidney", "Large intestine", "Liver", "Lung", "Oesophagus", 
                                    "Ovary", "Pancreas", "Skin", "Soft tissue", "Stomach", "UADT", "Urinary tract") ) # No Pleura after causal filtering


ggsave(filename = "~/Downloads/Slidr_Results/Pan_cancer_8pc/SLpairs_canctype_8pc_bubblePlot.pdf", q1, width = 35, height = 9, units = "in")
ggsave(filename = "~/Downloads/Slidr_Results/Pan_cancer_8pc/SLpairs_canctype_8pc_heatmap.pdf", q2, width = 35, height = 9, units = "in")

Plotting stacked bar plot for number of samples.

barplot_df <- summary_df %>% dplyr::distinct(driver_gene, n_samples, canc_type, tot_samples)
barplot_df$canc_type <- factor(barplot_df$canc_type, 
                               labels = c("Autonomic ganglia", "Bone", "Breast", "CNS", "Endometrium", "Blood", "Kidney", "Large intestine", "Liver", "Lung", "Oesophagus",
"Ovary", "Pancreas", "Pleura", "Skin", "Soft tissue", "Stomach", "UADT", "Urinary tract"))

coloursSites <- c("#CC99BB", "#AA4488", "#771155", "#77AADD",
"#4477AA", "#114477", "#77CCCC", "#44AAAA", "#117777", "#88CCAA",
"#44AA77", "#117744", "#DDDD77", "#AAAA44", "#777711", "#DDAA77",
"#AA7744", "#774411", "#DD7788")

p2 <- ggplot(barplot_df, aes(fill = canc_type, x = reorder(driver_gene,tot_samples), y = n_samples)) +
        geom_bar(stat = "identity") +
        theme_bw() +
        ylab("Number of mutated cell lines") +
        xlab("Driver genes") +
        labs(fill = "Primary site")+
        theme(panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             panel.border = element_blank(),
             axis.line = element_line(colour = "#525252"),
             axis.text.x = element_text(angle = 90, vjust=0.4, hjust=1, size=11,colour="#525252"),
             axis.text.y = element_text(angle = 0, size=11, vjust=1, hjust=1, colour="#525252"),
             axis.title.x=element_text(angle = 0, size=16,colour="#525252"),
             axis.title.y=element_text(angle=90, size=16,colour="#525252"),
             legend.position="right",
             legend.title = element_text(angle=0, size=13,colour="#525252"),
             legend.text = element_text(size=12,colour="#525252"),
             legend.key.size = unit(0.75, "cm"))+
        scale_fill_manual(values = coloursSites)+
        scale_y_continuous(breaks = seq(0, 280, by = 40))

ggsave("~/Downloads/Slidr_Results/Pan_cancer_8pc/drivers_canctype_8pc_barplot.pdf", p2, width = 18, height = 8, units = "in")

Group the SL partners and learn the subset structures between mutated genes. We use the nem package for this.

grouped_hits <- hits_pancan %>% 
                dplyr::group_by(sl_partner_gene) %>% 
                summarise(drivers = paste(driver_gene, collapse = ","))

for(i in 1:nrow(grouped_hits)){
  temp_drivers <- sapply(grouped_hits$drivers[i], function(x){strsplit(x,",")[[1]]})
  if(length(temp_drivers) >= 2){ # running for two or more genes
    D            <- t(pc_data$mutations[temp_drivers,])
    control      <- set.default.parameters(unique(colnames(D)),type="mLL",pcombi = TRUE, trans.close=TRUE, para = c(1e-4,1e-4))
    temp_res     <- nem(D, inference = "ModuleNetwork", control=control, verbose=FALSE)
    temp_adj_mat <- transitive.reduction(as(temp_res$graph, "matrix"))

    pdf(file =  paste(path_results, "Subset_plots/", grouped_hits$sl_partner_gene[i], ".pdf", sep = ""))
    plot(as(temp_adj_mat, "graphNEL"),
       attrs=list(node=list(shape = "ellipse", 
                            color="olivedrab3", 
                            fontcolor="deepskyblue4",
                            fontsize=12),
                  edge=list(color="gray56", 
                            arrowhead = "normal")),
                  main= grouped_hits$sl_partner_gene[i], col.main = "gray56")
    dev.off()
  }

}

Trying other models like IPTW for causality.

set.seed(478)
hits_pancan <- read.delim("~/Downloads/Slidr_Results/Pan_cancer_8pc/Hit_List/SL_hits_pan_cancer.txt",stringsAsFactors = F)

load("~/Downloads/Slidr_Results/Pan_cancer_8pc/pancan.Rdata")

grouped_hits <- hits_pancan %>% 
                dplyr::group_by(sl_partner_gene) %>% 
                summarise(drivers = paste(driver_gene, collapse = ","))

for(i in 1:nrow(grouped_hits)){
  temp_drivers <- sapply(grouped_hits$drivers[i], function(x){strsplit(x,",")[[1]]})
  if(length(temp_drivers) >= 2){ # running for two or more genes
    # defining the causal df
    sl_gene    <- grouped_hits$sl_partner_gene[i]
    causal_df  <- t(pc_data$mutations[temp_drivers,])
    causal_df  <- cbind.data.frame(t(pc_data$viabilities[sl_gene,rownames(causal_df)]), causal_df)
    # To avoid confusion between driver and same gene as target
    colnames(causal_df)[1]  <- "Viability"
    causal_df[,"Viability"] <- 1 - binarize(causal_df[,"Viability"], threshold = median(causal_df[,"Viability"]))
    colnames(causal_df)     <- gsub('-', "_", colnames(causal_df))
    temp_drivers            <- gsub('-', "_", temp_drivers)

    for(j in 1: length(temp_drivers)){
        # values of viabilities and treatment var
        y         <- causal_df$Viability
        tr_gene   <- temp_drivers[j]
        treatment <- causal_df[,tr_gene]
        # covariates
        xvars     <- temp_drivers[-j]
        table1    <- CreateTableOne(vars = xvars, 
                                  strata = tr_gene, 
                                  data = causal_df, 
                                  test = FALSE)
        print(table1,smd=TRUE)

        # Propensity matching
        psmodel <- glm(as.formula(paste0(tr_gene," ~" ,paste(xvars, collapse = " + "))),
                     family = binomial(link ="logit"),
                     data = causal_df)

        ## value of propensity score for each subject
        ps <- predict(psmodel, type = "response")

        #create weights
        weight <- ifelse(treatment==1,1/(ps),1/(1-ps))

        truncweight <- replace(weight,weight>600,600)

        #apply weights to data
        weighteddata <- svydesign(ids = ~ 1, 
                                  data = causal_df, 
                                  weights = ~ truncweight)

        #weighted table 1
        weightedtable <- svyCreateTableOne(vars = xvars, 
                                           strata = tr_gene, 
                                           data = weighteddata, 
                                           test = FALSE)
        ## Show table with SMD
        print(weightedtable, smd = TRUE)

        #get causal risk difference
        glm.obj <- glm(formula(paste0("Viability ~", tr_gene)), 
                     weights = truncweight,
                     family = quasibinomial(link="identity"),
                     data = causal_df)
        #summary(glm.obj)
        betaiptw <- coef(glm.obj)
        SE <- sqrt(diag(vcovHC(glm.obj, type="HC0")))

        causalrd <- (betaiptw[2])
        lcl <- (betaiptw[2]-1.96*SE[2])
        ucl <- (betaiptw[2]+1.96*SE[2])
        c(lcl,causalrd,ucl)
        z   <- betaiptw[2]/SE[2]
        exp((-0.717 * z) - (0.416 * z^2))
    }

    # #McNemar test
    # mc_res <- mcnemar.test(as.matrix(table(y_trt,y_con)))
    # mc_res$p.value

  }}











# fit propensity score model to get weights, but truncated
weightmodel<-ipwpoint(exposure= IFNE , family = "binomial", link ="logit",
                      denominator= ~  MTAP + IFNA1 + IFNA13 + IFNA8 + IFNA17 + IFNW1 + IFNA14 + 
    IFNA21 + IFNA16 + IFNA2 + IFNA7 + PTPLAD2 + IFNA4 + IFNB1 + 
    IFNA5 + DMRTA1 + CDKN2B + IFNA6 + IFNA10 + KLHL9, 
    data = causal_df,
    trunc = 0)

weighteddata <- svydesign(ids = ~ 1, 
                           data = causal_df, 
                           weights = ~ weightmodel$weights.trun)
weightedtable <- svyCreateTableOne(vars = xvars, 
                                            strata = tr_gene, 
                                            data = weighteddata, 
                                            test = FALSE)
print(weightedtable, smd = TRUE)
#numeric summary of weights
summary(weightmodel$weights.trun)
#plot of weights
ipwplot(weights = weightmodel$weights.trun, logscale = FALSE,
        main = "weights")
causal_df$wt<-weightmodel$weights.trun
#fit a marginal structural model (risk difference)
msm <- (svyglm(Viability ~ IFNE, design = svydesign(~ 1, weights = ~wt,
                                                    data =causal_df)))
coef(msm)
confint(msm)

se <- (confint(msm)[2,2] -  confint(msm)[2,1])/(2*1.96)
z   <- coef(msm)[2]/se
exp((-0.717 * z) - (0.416 * z^2))

Comparing the tp53 pan-cancer hits after dividing samples based on hot-spots and truncating mutations

path_results <- "~/Downloads/Slidr_Results/"

set.seed(58972)
hits_pancan <- read.delim("~/Downloads/Slidr_Results/Pan_cancer_8pc/Hit_List/SL_hits_pan_cancer.txt",stringsAsFactors = F)

load("~/Downloads/Slidr_Results/Pan_cancer_8pc/pancan.Rdata")

p53_missense_cl <- pc_data$mutation_annot %>% 
                    dplyr::filter(Hugo_Symbol == "TP53" & Variant_Classification == "Missense_Mutation") %>%
                    dplyr::ungroup() %>%
                    dplyr::select(Cell_Line) %>% t()

p53_truncate_cl <- pc_data$mutation_annot %>% 
                    dplyr::filter(Hugo_Symbol == "TP53" & Variant_Classification != "Missense_Mutation") %>%
                    dplyr::ungroup() %>%
                    dplyr::select(Cell_Line) %>% t()

p53_missense_data <- list()
p53_missense_data$viabilities             <- dplyr::select(pc_data$viabilities, -p53_truncate_cl)
# p53_missense_data$mutations               <- t(pc_data$mutations["TP53",which(!colnames(pc_data$mutations) %in% p53_truncate_cl)])
# rownames(p53_missense_data$mutations)     <- "TP53"
# p53_missense_data$CNalterations           <- t(pc_data$CNalterations["TP53",which(!colnames(pc_data$CNalterations) %in% p53_truncate_cl)])
# rownames(p53_missense_data$CNalterations) <- "TP53"
p53_missense_data$mutations               <- pc_data$mutations[,which(!colnames(pc_data$mutations) %in% p53_truncate_cl)]
p53_missense_data$CNalterations           <- pc_data$CNalterations[,which(!colnames(pc_data$CNalterations) %in% p53_truncate_cl)]
# p53_missense_data$mutation_annot          <- pc_data$mutation_annot %>% 
#                                               dplyr::filter(Hugo_Symbol == "TP53" & Variant_Classification == "Missense_Mutation") 
p53_missense_data$mutation_annot          <- pc_data$mutation_annot 
p53_missense_data$primary_site            <- "tp53_missense"


p53_truncate_data <- list()
p53_truncate_data$viabilities             <- dplyr::select(pc_data$viabilities, -p53_missense_cl)
# p53_truncate_data$mutations               <- t(pc_data$mutations["TP53",which(!colnames(pc_data$mutations) %in% p53_missense_cl)])
# rownames(p53_truncate_data$mutations)     <- "TP53"
# p53_truncate_data$CNalterations           <- t(pc_data$CNalterations["TP53",which(!colnames(pc_data$CNalterations) %in% p53_missense_cl)])
# rownames(p53_truncate_data$CNalterations) <- "TP53"
p53_truncate_data$mutations               <- pc_data$mutations[,which(!colnames(pc_data$mutations) %in% p53_missense_cl)]
p53_truncate_data$CNalterations           <- pc_data$CNalterations[,which(!colnames(pc_data$CNalterations) %in% p53_missense_cl)]
# p53_truncate_data$mutation_annot          <- pc_data$mutation_annot %>% 
#                                               dplyr::filter(Hugo_Symbol == "TP53" & Variant_Classification != "Missense_Mutation") 
p53_truncate_data$mutation_annot          <- pc_data$mutation_annot
p53_truncate_data$primary_site            <- "tp53_truncate"


unlist(lapply(p53_missense_data, function(x){print(dim(x))}))

unlist(lapply(p53_truncate_data, function(x){print(dim(x))}))

hits_pancan_missense <- slidr::identifySLHits(canc_data = p53_missense_data, 
                                               path_results = path_results, 
                                               WT_pval_thresh = 0, 
                                               fp_thresh = 1)

hits_pancan_truncate <- slidr::identifySLHits(canc_data = p53_truncate_data, 
                                               path_results = path_results, 
                                               WT_pval_thresh = 0, 
                                               fp_thresh = 1)

hits_list <- list(All = hits_pancan$sl_partner_gene[hits_pancan$driver_gene == "TP53"],
                  Missense = hits_pancan_missense$sl_partner_gene[hits_pancan_missense$driver_gene == "TP53"],
                  Truncated = hits_pancan_truncate$sl_partner_gene[hits_pancan_truncate$driver_gene == "TP53"])

hits_list <- lapply(hits_list, function(x){x[x != "TP53"]})

cairo_pdf("~/Downloads/Slidr_Results/TP53/upset_hits.pdf", width = 6, height = 4)
upset(fromList(hits_list),
      nsets = 3,
      empty.intersections = "on",
      matrix.color = "#066CAE",
      matrix.dot.alpha = 0.6,
      mb.ratio = c(0.7, 0.3),
      main.bar.color = "#2D2C2C",
      # mainbar.y.label = "Intersection Size",
      order.by = "freq",
      point.size = 3,
      sets.bar.color = "#2D2C2C",
      # sets.x.label = "Number of hits", 
      text.scale = 1.3)
dev.off()


cbg-ethz/slidr documentation built on Feb. 8, 2023, 11:15 p.m.