R/calFst.R

Defines functions calFst

Documented in calFst

################################################################################
#' calFst
#' @description Genetic divergence between regions of subclonal sSNVs using 
#' the Weir and Cockerham method 
#' @references Sun R, Hu Z, Sottoriva A, et al. Between-region 
#' genetic divergence reflects the mode and tempo of tumor evolution. 
#' Nat Genet. 2017;49(7):1015-1024.
#' 
#' Bhatia G, Patterson N, Sankararaman S, Price AL. 
#' Estimating and interpreting FST: the impact of rare variants. 
#' Genomic Res. 2013;23(9):1514-1521.
#' 
#' @param maf A Maf or MafList object generated by \code{\link{readMaf}} function.
#' @param patient.id Select the specific patients. 
#' Default NULL, all patients are included.
#' @param min.vaf Specify The minimum VAF to filter variants. Default 0.
#' @param min.total.depth The minimum total allele depth for filtering variants. 
#' Default 2.
#' @param use.adjVAF Use adjusted VAF in analysis when adjusted VAF or CCF is available. Default FALSE. 
#' @param plot Logical (Default: TRUE).
#' @param withinTumor Logical (Default: FALSE). Whether calculate between-region heterogeneity within tumors.
#' @param use.circle Logical (Default: TRUE). Whether use "circle" in the plot. 
#' as visualization method of correlation matrix
#' @param title The title of the plot. Default "Nei's distance"
#' @param number.cex The size of text shown in correlation plot. Default 8.
#' @param number.col The color of text shown in correlation plot. 
#' Default "#C77960".
#' @param use.tumorSampleLabel Logical (Default: FALSE). Rename the 'Tumor_Sample_Barcode' by 'Tumor_Sample_Label'.
#' @param ... Other options passed to \code{\link{subMaf}}
#'  
#' @return A list contains Fst value of MRS and Hudson estimator of each sample-pair, respectively.
#'
#' @examples
#' maf.File <- system.file("extdata/", "CRC_HZ.maf", package = "MesKit")
#' clin.File <- system.file("extdata/", "CRC_HZ.clin.txt", package = "MesKit")
#' ccf.File <- system.file("extdata/", "CRC_HZ.ccf.tsv", package = "MesKit")
#' maf <- readMaf(mafFile=maf.File, clinicalFile = clin.File, ccfFile=ccf.File, refBuild="hg19")
#' calFst(maf)
#' @import dplyr circlize tidyr
#' @importFrom utils combn
#' @export calFst

calFst <- function(
  maf, 
  patient.id = NULL, 
  min.vaf = 0,
  min.total.depth = 2,
  use.adjVAF = FALSE,
  plot = TRUE,
  withinTumor = FALSE,
  use.circle = TRUE,
  title = NULL,
  number.cex = 8, 
  number.col = "#C77960",
  use.tumorSampleLabel = FALSE,
  ...){
  
  
  processFst <- function(maf_data){
    
    patient <- unique(maf_data$Patient_ID)
    
    if(nrow(maf_data) == 0){
      message("Warning: there was no mutation in ", patient, " after filtering.")
      return(NA)
    }
    
    # if(!"VAF_adj" %in% colnames(maf_data)){
    #   stop("Adjusted VAFs were not found in the maf object.")
    # }
    
    Fst_input <- maf_data %>% 
      tidyr::unite(
        "Mut_ID",
        c(
          "Chromosome",
          "Start_Position",
          "Reference_Allele",
          "Tumor_Seq_Allele2"
        ),
        sep = ":",
        remove = FALSE
      ) %>% 
      dplyr::mutate(totalDepth = .data$Ref_allele_depth + .data$Alt_allele_depth) %>% 
      dplyr::select(        	
        "Mut_ID",
        "Tumor_ID",
        "Tumor_Sample_Barcode",
        "VAF",
        "totalDepth") %>% 
      ## remove NA(it may be caused by NA of CCF)
      dplyr::filter(!is.na(.data$VAF))
    
    ## check data
    if(withinTumor){
      tumor <- unique(Fst_input$Tumor_ID)
      if(length(unique(Fst_input$Tumor_Sample_Barcode))  < 2 ){
        message(paste0("Warning: only one sample was found of ", tumor,
                       " in ", patient, 
                       ". If you want to compare CCF between regions, withinTumor should be set as FALSE"))
        return(NA)
      }
    }else{
      if(length(unique(Fst_input$Tumor_Sample_Barcode))  < 2 ){
        message(paste0("Warning: only one sample was found in ", patient, "."))
        return(NA)
      } 
    }
    
    
    ## pairwise heterogeneity
    samples <- as.character(unique(Fst_input$Tumor_Sample_Barcode))
    pairs <- utils::combn(samples, 2, simplify = FALSE)
    
    dist_mat <- diag(1, nrow=length(samples), ncol=length(samples))
    dist_mat <- cbind(dist_mat, samples)
    rownames(dist_mat) <- samples
    colnames(dist_mat) <- c(samples, "name")
    
    processFstpair <- function(pair){
      
      pair_vaf <- subset(Fst_input, Fst_input$Tumor_Sample_Barcode %in% c(pair[1], pair[2])) %>%
        dplyr::select(-"Tumor_ID") %>% 
        tidyr::pivot_wider(
          names_from = "Tumor_Sample_Barcode",       
          values_from = c("VAF", "totalDepth"),
          values_fill = list("VAF" = 0, "totalDepth" = 0)
        )
      colnames(pair_vaf) <- c("Mut_ID", "vaf1", "vaf2", "depth1", "depth2")
      
      ## 
      pair_vaf <- pair_vaf %>% 
        dplyr::mutate(
          covariance = (.data$vaf1-.data$vaf2)^2-(.data$vaf1*(1-.data$vaf1))/(.data$depth1-1)-(.data$vaf2*(1-.data$vaf2))/(.data$depth2-1), 
          sd = .data$vaf1*(1-.data$vaf2)+.data$vaf2*(1-.data$vaf1)
        )
      fst <- mean(pair_vaf$covariance)/mean(pair_vaf$sd)
      # if(is.nan(fst)){
      #   browser()
      #   print(pair_vaf)
      # }
      return(fst)
    }
    
    fst.list <- lapply(pairs, processFstpair) %>% unlist()
    pairs_name <- lapply(pairs, function(x)paste0(x[1], "_", x[2])) %>% unlist()
    names(fst.list) <- pairs_name
    
    processFstMat <- function(j){
      row_name <- j["name"]
      j1 <- j[-length(j)]
      
      idx <- which(j == "0")
      j2 <- names(j[idx])
      
      mat_row <- vapply(j2, function(g){
        name1 <- paste0(g, "_", row_name)
        name2 <- paste0(row_name, "_", g)
        
        pos <- which(grepl(name1, names(fst.list)))
        if(length(pos) == 0){
          pos <- which(grepl(name2, names(fst.list)))
        }
        fst <- fst.list[pos]
        return(fst)
      }, FUN.VALUE = double(1))
      
      j[idx] <- mat_row
      return(j)
    }
    
    dist_mat <- t(apply(dist_mat, 1, processFstMat))
    dist_mat <- dist_mat[,-ncol(dist_mat)] %>% apply(c(1,2),as.numeric)
    
    Fst.avg <- mean(dist_mat[upper.tri(dist_mat, diag = FALSE)])
    Fst.pair <- dist_mat
    
    if(plot){
      ## get significant number
      if(is.null(title)){
        min_value <- min(dist_mat[dist_mat != 1 & dist_mat != 0])
        min_value <- format(min_value, scientific = FALSE)
        significant_digit <- gsub(pattern =  "0\\.0*", "", as.character(min_value))
        digits <- nchar(as.character(min_value)) - nchar(significant_digit) 
        if(withinTumor){
          title_id <- paste0("Fst of ", tumor, " in ", patient, ": ", 
                             round(Fst.avg, digits))
        }else{
          title_id <- paste0("Fst of patient ", patient, ": ", round(Fst.avg, digits))
        }
      }else{
        title_id <- title
      }
      Fst.plot <- plotCorr(
        dist_mat, 
        use.circle, 
        number.cex=number.cex,
        number.col=number.col,
        title = if(!is.null(title_id)) title_id else{NA} 
      )
      return(list(Fst.avg = Fst.avg, Fst.pair = Fst.pair, Fst.plot = Fst.plot))
    }else{
      return(list(Fst.avg = Fst.avg, Fst.pair = Fst.pair))
    }
    
  }
  
  
  if(withinTumor){
    clonalStatus <- "Subclonal"
  }else{
    clonalStatus <- NULL
  }
  
  maf_input <- subMaf(maf = maf,
                min.vaf = min.vaf,
                min.total.depth = min.total.depth,
                clonalStatus = clonalStatus,
                mafObj = TRUE,
                use.adjVAF = use.adjVAF,
                patient.id = patient.id,
                use.tumorSampleLabel = use.tumorSampleLabel,
                ...)

  maf_data_list <- lapply(maf_input, getMafData)
  
  if(withinTumor){
    maf_group <- dplyr::bind_rows(maf_data_list) %>%
      dplyr::group_by(.data$Patient_ID, .data$Tumor_ID)
    group_name <- paste(group_keys(maf_group)$Patient_ID, group_keys(maf_group)$Tumor_ID, sep = "_")
    
    result <- maf_group %>% 
      dplyr::group_map(~processFst(.), .keep = TRUE)
    names(result) <- group_name
  }else{
    maf_group <- dplyr::bind_rows(maf_data_list) %>%
      dplyr::group_by(.data$Patient_ID)
    group_name <- group_keys(maf_group)$Patient_ID
    
    result <- maf_group %>% 
      dplyr::group_map(~processFst(.), .keep = TRUE)
    names(result) <- group_name
  }
  
  result <- result[!is.na(result)]
  
  
  if(length(result) > 1){
    return(result)
  }else if(length(result) == 0){
    return(NA)
  }else{
    return(result[[1]])
  }
}

Try the MesKit package in your browser

Any scripts or data that you put into this service are public.

MesKit documentation built on March 28, 2021, 6 p.m.