R/calJSI.R

Defines functions calJSI

Documented in calJSI

#' compareJSI
#'
#' @description The Jaccard similarity index (JSI) is applied to distinguish monoclonal versus polyclonal seeding in metastases.
#' @references Hu, Z., Li, Z., Ma, Z. et al. Multi-cancer analysis of clonality and the timing of systemic spread in paired primary tumors and metastases. Nat Genet (2020).
#' @param maf Maf or MafList object generated by \code{\link{readMaf}} function.
#' @param patient.id Select the specific patients. Default NULL, all patients are included.
#' @param pairByTumor Compare JSI between different tumors. Default FALSE.
#' @param min.ccf The minimum value of CCF. Default 0.
#' @param plot Logical (Default: FALSE).
#' @param use.circle Logical (Default: TRUE). Whether to use "circle" as visualization method of correlation matrix.
#' @param title Title of the plot Default "Jaccard similarity".
#' @param number.cex The size of text shown in correlation plot. Default 8.
#' @param number.col The color of text shown in correlation plot. Default "#C77960".
#' @param use.tumorSampleLabel Logical (Default: FALSE). Rename the 'Tumor_Sample_Barcode' by 'Tumor_Sample_Label'.
#' @param ... Other options passed to \code{\link{subMaf}}
#'
#' @examples
#' maf.File <- system.file("extdata/", "CRC_HZ.maf", package = "MesKit")
#' clin.File <- system.file("extdata/", "CRC_HZ.clin.txt", package = "MesKit")
#' ccf.File <- system.file("extdata/", "CRC_HZ.ccf.tsv", package = "MesKit")
#' maf <- readMaf(mafFile=maf.File, clinicalFile = clin.File, ccfFile=ccf.File, refBuild="hg19")
#' calJSI(maf)
#' @return Correlation matrix and heatmap via Jaccard similarity coefficient method
#' @export calJSI

calJSI <- function(
    maf,
    patient.id = NULL,
    pairByTumor = FALSE,
    min.ccf = 0,
    plot = FALSE, 
    use.circle = TRUE,
    title = NULL,
    number.cex = 8,
    number.col = "#C77960",
    use.tumorSampleLabel = FALSE,
    ...) {
    
    processJSI <- function(m){
        maf_data <- getMafData(m) %>%
            dplyr::filter(!is.na(.data$Clonal_Status),
                          !is.na(CCF))
        if(! "CCF" %in% colnames(getMafData(m))){
            stop(paste0("Calculation of Jaccard similarity requires CCF data." ,
                        "No CCF data was found when generating Maf/MafList object."))
        }
        
        patient <- getMafPatient(m)
        if(nrow(maf_data) == 0){
            message("Warning: there was no mutation found in ", patient, " after filtering.")
            return(NA)
        }
        
        
        if(pairByTumor){
            if(length(unique(maf_data$Tumor_ID))  < 2 ){
                message(paste0("Warning: only one tumor was found in ", patient,
                               " according to 'Tumor_ID'. You may want to compare CCF between regions by setting 'pairByTumor' as 'FALSE'")
                )
                return(NA)
            }
            
            maf_data$CCF <- maf_data$Tumor_Average_CCF
            
            tumors <- as.character(unique(maf_data$Tumor_ID))
            pairs <- utils::combn(tumors, 2, simplify = FALSE)
            dist_mat <- diag(1, nrow = length(tumors), ncol = length(tumors))
            dist_mat <- cbind(dist_mat, tumors)
            rownames(dist_mat) <- tumors
            colnames(dist_mat) <- c(tumors, "name")
            
        }else{
            if(length(unique(maf_data$Tumor_Sample_Barcode))  < 2 ){
                message(paste0("Warning: only one sample was found in ", patient, "."))
                return(NA)
            } 
            
            samples <- as.character(unique(maf_data$Tumor_Sample_Barcode))
            pairs <- utils::combn(samples, 2, simplify = FALSE)
            dist_mat <- diag(1, nrow = length(samples), ncol = length(samples))
            dist_mat <- cbind(dist_mat, samples)
            rownames(dist_mat) <- samples
            colnames(dist_mat) <- c(samples, "name")
            
        }
        
        JSI_input <-  maf_data %>%
            tidyr::unite(
                "Mut_ID",
                c(
                    "Chromosome",
                    "Start_Position",
                    "Reference_Allele",
                    "Tumor_Seq_Allele2"
                ),
                sep = ":",
                remove = FALSE
            ) %>%
            dplyr::select(
                "Mut_ID",
                "Tumor_ID",
                "Tumor_Sample_Barcode",
                "Clonal_Status",
                "CCF")
        
        
        
        processJSI2 <- function(pair){
            
            if(pairByTumor){
                # name <- paste(pair[1], pair[2], sep = "_")
                ccf.pair <- subset(JSI_input, JSI_input$Tumor_ID %in% c(pair[1], pair[2])) %>%
                    tidyr::unite("Mut_ID2",
                                 c("Mut_ID",
                                   "Tumor_ID"),
                                 sep = ":",
                                 remove = FALSE
                    ) %>%
                    dplyr::distinct(.data$Mut_ID2, .keep_all = TRUE) %>%
                    dplyr::select("Mut_ID", "Tumor_ID", "Clonal_Status", "CCF") %>% 
                    tidyr::pivot_wider(
                        names_from = "Tumor_ID",       
                        values_from = c("CCF", "Clonal_Status"),
                        values_fill = list("CCF" = 0, "Clonal_Status" = "nostatus")
                    ) %>%
                    dplyr::ungroup()
                colnames(ccf.pair) <- c("Mut_ID", "ccf1", "ccf2", "status1", "status2")
            }
            else{
                # name <- paste(pair[1], pair[2], sep = "_")
                ccf.pair <- subset(JSI_input, JSI_input$Tumor_Sample_Barcode %in% c(pair[1], pair[2])) %>% 
                    dplyr::select("Mut_ID", "Tumor_Sample_Barcode", "Clonal_Status", "CCF") %>% 
                    tidyr::pivot_wider(
                        names_from = "Tumor_Sample_Barcode",       
                        values_from = c("CCF", "Clonal_Status"),
                        values_fill = list("CCF" = 0, "Clonal_Status" = "nostatus")
                    ) %>%
                    dplyr::ungroup()
                colnames(ccf.pair) <- c("Mut_ID", "ccf1", "ccf2", "status1", "status2")
            }
            
            ccf.pair <- ccf.pair %>% 
                dplyr::filter(.data$ccf1 + .data$ccf2 !=0) %>% 
                data.table::setDT()
            PC_1 <- nrow(ccf.pair[ccf.pair$status1 == "Clonal" & 
                                      ccf.pair$ccf1 > 0 & 
                                      ccf.pair$ccf2 == 0])
            PC_2 <- nrow(ccf.pair[ccf.pair$status2 == "Clonal" & 
                                      ccf.pair$ccf1 == 0 & 
                                      ccf.pair$ccf2 > 0])
            SS_12 = nrow(ccf.pair[ccf.pair$status2 == "Subclonal" &
                                      ccf.pair$status1 == "Subclonal" &
                                      ccf.pair$ccf1>0 &
                                      ccf.pair$ccf2>0 ])
            jsi <- SS_12/(PC_1+PC_2+SS_12)
            if(is.nan(jsi)){
                jsi <- 0
            }
            return(c(PC_1 = PC_1, PC_2 = PC_2, SS_12 = SS_12, jsi = jsi))
        }
        
        pair_result <- lapply(pairs, processJSI2)
        
        PC_1.list <- lapply(pair_result, function(x)x["PC_1"]) %>% unlist()
        PC_2.list <- lapply(pair_result, function(x)x["PC_2"]) %>% unlist()
        SS_12.list <- lapply(pair_result, function(x)x["SS_12"]) %>% unlist()
        multi <- mean(SS_12.list)/(mean(PC_1.list) + mean(PC_2.list) + mean(SS_12.list))
        multi <- ifelse(is.nan(multi), 0, multi)
        
        pairs_name <- lapply(pairs, function(x)paste0(x[1], "_", x[2])) %>% unlist()
        jsi.list <- lapply(pair_result, function(x)x["jsi"]) %>% unlist()
        names(jsi.list) <- pairs_name
        
        processJSI3 <- function(j){
            row_name <- j["name"]
            j1 <- j[-length(j)]
            
            idx <- which(j == "0")
            j2 <- names(j[idx])
            
            mat_row <- vapply(j2, function(g){
                name1 <- paste0(g, "_", row_name)
                name2 <- paste0(row_name, "_", g)
                
                pos <- which(grepl(name1, names(jsi.list)))
                if(length(pos) == 0){
                    pos <- which(grepl(name2, names(jsi.list)))
                }
                jsi <- jsi.list[pos]
                return(jsi)
            }, FUN.VALUE = double(1))
            
            j[idx] <- mat_row
            return(j)
        }
        
        dist_mat <- t(apply(dist_mat, 1, processJSI3))
        dist_mat <- dist_mat[,-ncol(dist_mat)] %>% apply(c(1,2), as.numeric)
        
        JSI.multi <- multi
        JSI.pair <- dist_mat
        
        if(plot){
            values <- sort(unique(as.numeric(dist_mat))) 
            if(length(values[values!=0 &values !=1]) == 0){
                message(paste0("Warning: private clonal mutations or shared subclonal mutations were not found in all tumor/sample pairs in ", patient, ".\n",
                               "Correlation plot of Jaccard similarity coefficient will not be generated for ", patient, "!"))
                return(list(JSI.multi = JSI.multi, JSI.pair = JSI.pair, JSI.plot = NA))
            }
            if(is.null(title)){
                min_value <- min(dist_mat[dist_mat!=1 & dist_mat!=0])
                significant_digit <- gsub(pattern =  "0\\.0*", "", as.character(min_value))
                digits <- nchar(as.character(min_value)) - nchar(significant_digit) 
                title_id <- paste0("JSI of patient ", patient, ": ", round(multi,digits))
            }else{
                title_id <- title
            }
            p <- plotCorr(
                dist_mat, 
                use.circle, 
                number.cex = number.cex,
                number.col = number.col,
                title = if(!is.null(title_id)) title_id else{NA} 
            )
            JSI.plot <- p
            return(list(JSI.multi = JSI.multi, JSI.pair = JSI.pair, JSI.plot = JSI.plot))
        }else{
            return(list(JSI.multi = JSI.multi, JSI.pair = JSI.pair))
        }
        
    }
    
    maf_input <- subMaf(maf,
                        min.ccf = min.ccf,
                        patient.id = patient.id, 
                        mafObj = TRUE,
                        use.tumorSampleLabel = use.tumorSampleLabel,
                        ...)
    
    
    result <- lapply(maf_input, processJSI)
    result <- result[!is.na(result)]
    
    if(length(result) > 1){
        return(result)
    }else if(length(result) == 0){
        return(NA)
    }else{
        return(result[[1]])
    }
    
    
    
}

Try the MesKit package in your browser

Any scripts or data that you put into this service are public.

MesKit documentation built on March 28, 2021, 6 p.m.