R/calc_dcorr_GC_content_logfc.R

Defines functions calc_dcorr_GC_content_logfc

Documented in calc_dcorr_GC_content_logfc

#' calc_dcorr_GC_content_logfc
#'
#' Calculates the distance correlation between the log2 fold change and the GC content of sgRNAs in a given library.
#'
#' GC content is taken from the `GC_percent` column returned by `fgcQC::calc_GC_percent_library`, and
#' sgRNAs are paired between the two inputs via the `sgRNA` column of `logfc` and the `V2` column of the
#' processed library. Only sgRNAs present in both inputs contribute to the statistic.
#'
#' @param logfc A data frame of log2 fold change data for a sample contrast (e.g. Control vs Plasmid; gRNAs as rows) as generated by `fgcQC::calc_log2_fold_change_gRNAs`. Must contain the columns `sgRNA` and `log2FC`.
#' @param library A data frame containing the library file in which the first column gives the sgRNA sequence and the second column gives the sgRNA ID.
#' @param col_suffix A character string providing a name to go at the end of the output column name (e.g. `ctrl_plasmid`).
#'
#' @return A single-row tibble with one column named `distcorr_GC_content_logfc.<col_suffix>` holding the
#'   value returned by `energy::dcor2d(..., type = "U")` for log2 fold change vs GC percent.
#' @author Alex T. Kalinka, \email{alex.kalinka@@cancer.org.uk}
#' @importFrom dplyr filter arrange sym
#' @importFrom tibble tibble
#' @importFrom rlang :=
#' @importFrom energy dcor2d
#' @importFrom magrittr %<>%
#' @references Szekely, G.J., Rizzo, M.L., and Bakirov, N.K. (2007), Measuring and Testing Dependence by Correlation of Distances, Annals of Statistics, Vol. 35 No. 6, pp. 2769-2794.
#' @export
calc_dcorr_GC_content_logfc <- function(logfc, library, col_suffix){
  # Everything runs inside the tryCatch so that any failure (including the
  # input check) is reported with the same message prefix.
  tryCatch({
    if (!is.data.frame(logfc) ||
        !all(c("sgRNA", "log2FC") %in% names(logfc))) {
      stop("'logfc' must be a data frame with 'sgRNA' and 'log2FC' columns")
    }

    # Output column name: 'distcorr_GC_content_logfc.<col_suffix>'.
    dc_col <- dplyr::sym(
      paste("distcorr_GC_content_logfc", col_suffix, sep = ".")
    )

    gc_lib <- fgcQC::calc_GC_percent_library(library)

    # Keep only guides that have a GC value, in a deterministic order.
    logfc <- dplyr::filter(logfc, sgRNA %in% gc_lib$V2)
    logfc <- dplyr::arrange(logfc, sgRNA)

    # match() aligns GC values to the logfc rows (first matching library row),
    # so the library itself does not need to be filtered or sorted.
    gc_percent <- gc_lib$GC_percent[match(logfc$sgRNA, gc_lib$V2)]

    tibble::tibble(
      !!dc_col := energy::dcor2d(logfc$log2FC, gc_percent, type = "U")
    )
  },
  error = function(e) {
    # Use the condition's message rather than pasting the condition object,
    # which would embed "Error in <call>: ...\n" in the re-thrown message.
    stop("unable to calculate dist corr for GC vs logFC: ",
         conditionMessage(e), call. = FALSE)
  })
}
alex-kalinka-cruk/fgcQC documentation built on June 23, 2020, 9:05 p.m.