R/analysis-functions.R
In ISAnalytics: Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies

Documented in CIS_grubbs comparison_matrix compute_abundance cumulative_count_union default_stats sample_statistics separate_quant_matrices threshold_filter top_integrations

#------------------------------------------------------------------------------#
# Analysis functions
#------------------------------------------------------------------------------#
#' Computes the abundance for every integration event in the input data frame.
#'
#' \lifecycle{maturing}
#' Abundance is obtained for every integration event by calculating the ratio
#' between the single value and the total value for the given group.
#'
#' @details Abundance will be computed upon the user selected columns
#' in the `columns` parameter. For each column a corresponding
#' relative abundance column (and optionally a percentage abundance
#' column) will be produced.
#'
#' @param x An integration matrix - aka a data frame that includes
#' the `mandatory_IS_vars()` as columns. The matrix can either be aggregated
#' (via `aggregate_values_by_key()`) or not.
#' @param columns A character vector of column names to process,
#' must be numeric or integer columns
#' @param percentage Add abundance as percentage?
#' @param key The key to group by when calculating totals
#' @param keep_totals A value between `TRUE`, `FALSE` or `df`. If `TRUE`,
#' the intermediate totals for each group will be kept in the output
#' data frame as a dedicated column with a trailing "_tot". If `FALSE`,
#' totals won't be included in the output data frame. If `df`, the totals
#' are returned to the user as a separate data frame, together with the
#' abundance data frame.
#'
#' @family Analysis functions
#'
#' @importFrom magrittr `%>%`
#' @import dplyr
#' @importFrom rlang .data eval_tidy parse_expr
#' @importFrom purrr map_lgl
#' @importFrom stringr str_replace
#' @return Either a single data frame with computed abundance values or
#' a list of 2 data frames (abundance_df, quant_totals)
#' @export
#'
#' @examples
#' path <- system.file("extdata", "ex_annotated_ISMatrix.tsv.xz",
#'     package = "ISAnalytics"
#' )
#' matrix <- import_single_Vispa2Matrix(path)
#'
#' # Simple integration matrix - grouping by CompleteAmplificationID
#' abundance1 <- compute_abundance(matrix)
#' abundance1
#'
#' # Keeping totals as a separate data frame
#' abundance2 <- compute_abundance(matrix, keep_totals = "df")
#' abundance2
compute_abundance <- function(x,
    columns = "Value",
    percentage = TRUE,
    key = "CompleteAmplificationID",
    keep_totals = FALSE) {
    ## Check parameters
    stopifnot(is.data.frame(x))
    stopifnot(is.character(columns))
    stopifnot(is.character(key))
    if (.check_mandatory_vars(x) == FALSE) {
        stop(.non_ISM_error())
    }
    stopifnot(is.logical(percentage) & length(percentage) == 1)
    if (!all(columns %in% colnames(x)) | !all(key %in% colnames(x))) {
        missing_cols <- c(
            columns[!columns %in% colnames(x)],
            key[!key %in% colnames(x)]
        )
        rlang::abort(.missing_user_cols_error(missing_cols))
    }
    non_num_cols <- purrr::map_lgl(columns, function(col) {
        expr <- rlang::expr(`$`(x, !!col))
        if (is.numeric(rlang::eval_tidy(expr)) |
            is.integer(rlang::eval_tidy(expr))) {
            return(FALSE)
        } else {
            return(TRUE)
        }
    })
    if (any(non_num_cols)) {
        stop(.non_num_user_cols_error(columns[non_num_cols]))
    }
    stopifnot(is.logical(keep_totals) || keep_totals == "df")
    ## Computation
    ### Computes totals for each group defined by key
    totals <- x %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(key))) %>%
        dplyr::summarise(
            dplyr::across(dplyr::all_of(columns),
                sum,
                .names = "{.col}_tot"
            ),
            .groups = "drop"
        )
    ### Computes abundance as value (for each col) / total of the corresponding
    ### group (defined by key)
    abundance_df <- x %>%
        dplyr::left_join(totals, by = key) %>%
        dplyr::mutate(dplyr::across(dplyr::all_of(columns),
            list(ab = ~ .x / rlang::eval_tidy(
                rlang::parse_expr(
                    paste(
                        dplyr::cur_column(),
                        "tot",
                        sep = "_"
                    )
                )
            )),
            .names = "{.col}_RelAbundance"
        )) %>%
        dplyr::distinct()
    if (keep_totals == FALSE || keep_totals == "df") {
        abundance_df <- abundance_df %>%
            dplyr::select(-c(dplyr::all_of(paste(columns, "tot", sep = "_"))))
    }
    if (percentage == TRUE) {
        abundance_df <- abundance_df %>%
            dplyr::mutate(
                dplyr::across(dplyr::contains("RelAbundance"), ~ .x * 100,
                    .names = "{.col}_PercAbundance"
                )
            ) %>%
            dplyr::rename_with(
                ~ stringr::str_replace(.x, "_RelAbundance", ""),
                dplyr::contains("PercAbundance")
            )
    }
    if (keep_totals == "df") {
        return(list(abundance_df = abundance_df, quant_totals = totals))
    } else {
        return(abundance_df)
    }
}


#' obtain a single integration matrix from individual quantification
#' matrices.
#'
#' \lifecycle{maturing}
#' Takes a list of integration matrices referring to different qunatification
#' types and merges them in a single data frame that has multiple
#' value columns, each renamed according to their quantification type
#' of reference.
#'
#' @param x A named list of integration matrices, ideally obtained via
#' \link{import_parallel_Vispa2Matrices_interactive} or
#' \link{import_parallel_Vispa2Matrices_auto}. Names must be
#' quantification types.
#' @param fragmentEstimate The name of the output column for fragment
#' estimate values
#' @param seqCount The name of the output column for sequence
#' count values
#' @param barcodeCount The name of the output column for barcode count
#' values
#' @param cellCount The name of the output column for cell count values
#' @param ShsCount The name of the output column for Shs count values
#'
#' @importFrom purrr walk map2 reduce
#' @importFrom dplyr rename full_join intersect
#' @importFrom magrittr `%>%`
#' @importFrom rlang .data `:=`
#'
#' @family Analysis functions
#'
#' @seealso \link{quantification_types}
#'
#' @return A tibble
#' @export
#'
#' @examples
#' op <- options("ISAnalytics.widgets" = FALSE)
#' path <- system.file("extdata", "ex_association_file.tsv",
#'     package = "ISAnalytics"
#' )
#' root_pth <- system.file("extdata", "fs.zip", package = "ISAnalytics")
#' root <- unzip_file_system(root_pth, "fs")
#' matrices <- import_parallel_Vispa2Matrices_auto(
#'     association_file = path, root = root,
#'     quantification_type = c("fragmentEstimate", "seqCount"),
#'     matrix_type = "annotated", workers = 2, patterns = NULL,
#'     matching_opt = "ANY",
#'     dates_format = "dmy", multi_quant_matrix = FALSE
#' )
#' total_matrix <- comparison_matrix(matrices)
#' options(op)
comparison_matrix <- function(x,
    fragmentEstimate = "fragmentEstimate",
    seqCount = "seqCount",
    barcodeCount = "barcodeCount",
    cellCount = "cellCount",
    ShsCount = "ShsCount") {
    stopifnot(is.list(x) & !is.data.frame(x))
    stopifnot(all(names(x) %in% quantification_types()))
    purrr::walk(x, function(m) {
        mand <- .check_mandatory_vars(m)
        amp <- .check_complAmpID(m)
        val <- .check_value_col(m)
        if (any(c(mand, amp, val) == FALSE)) {
            stop(.non_ISM_error())
        }
    })
    stopifnot(is.character(fragmentEstimate) & length(fragmentEstimate) == 1)
    stopifnot(is.character(seqCount) & length(seqCount) == 1)
    stopifnot(is.character(barcodeCount) & length(barcodeCount) == 1)
    stopifnot(is.character(cellCount) & length(cellCount) == 1)
    stopifnot(is.character(ShsCount) & length(ShsCount) == 1)
    param_names <- c(
        fragmentEstimate = fragmentEstimate,
        seqCount = seqCount, barcodeCount = barcodeCount,
        cellCount = cellCount, ShsCount = ShsCount
    )
    x <- purrr::map2(x, names(x), function(matrix, quant_type) {
        quant_name <- param_names[names(param_names) %in% quant_type]
        matrix %>% dplyr::rename(!!quant_name := .data$Value)
    })
    result <- purrr::reduce(x, function(matrix1, matrix2) {
        commoncols <- dplyr::intersect(colnames(matrix1), colnames(matrix2))
        matrix1 %>%
            dplyr::full_join(matrix2, by = commoncols)
    })
    if (any(is.na(result)) & getOption("ISAnalytics.verbose") == TRUE) {
        message(.nas_introduced_msg())
    }
    result
}


#' Separate a multiple-quantification matrix into single quantification
#' matrices.
#'
#' \lifecycle{maturing}
#' The function separates a single multi-quantification integration
#' matrix, obtained via \link{comparison_matrix}, into single
#' quantification matrices as a named list of tibbles.
#'
#' @param x Single integration matrix with multiple quantification
#' value columns, likely obtained via \link{comparison_matrix}.
#' @param fragmentEstimate Name of the fragment estimate values column
#' in input
#' @param seqCount Name of the sequence count values column
#' in input
#' @param barcodeCount Name of the barcode count values column
#' in input
#' @param cellCount Name of the cell count values column
#' in input
#' @param ShsCount Name of the shs count values column
#' in input
#'
#' @importFrom purrr is_empty map set_names
#' @importFrom dplyr rename
#' @importFrom magrittr `%>%`
#'
#' @family Analysis functions
#'
#' @return A named list of tibbles, where names are quantification types
#' @seealso \link{quantification_types}
#' @export
#'
#' @examples
#' op <- options("ISAnalytics.widgets" = FALSE)
#' path <- system.file("extdata", "ex_association_file.tsv",
#'     package = "ISAnalytics"
#' )
#' root_pth <- system.file("extdata", "fs.zip", package = "ISAnalytics")
#' root <- unzip_file_system(root_pth, "fs")
#' association_file <- import_association_file(
#'     path = path, root = root,
#'     dates_format = "dmy"
#' )
#' matrices <- import_parallel_Vispa2Matrices_auto(
#'     association_file = association_file,
#'     quantification_type = c("seqCount", "fragmentEstimate"),
#'     matrix_type = "annotated", workers = 2, patterns = NULL,
#'     matching_opt = "ANY"
#' )
#' separated_matrix <- separate_quant_matrices(matrices)
#' options(op)
separate_quant_matrices <- function(x, fragmentEstimate = "fragmentEstimate",
    seqCount = "seqCount",
    barcodeCount = "barcodeCount",
    cellCount = "cellCount",
    ShsCount = "ShsCount") {
    stopifnot(is.data.frame(x))
    if (.check_mandatory_vars(x) == FALSE) {
        stop(.non_ISM_error())
    }
    if (.check_complAmpID(x) == FALSE) {
        stop(.missing_complAmpID_error())
    }
    num_cols <- .find_exp_cols(x)
    if (purrr::is_empty(num_cols)) {
        stop(.missing_num_cols_error())
    }
    stopifnot(is.character(fragmentEstimate) & length(fragmentEstimate) == 1)
    stopifnot(is.character(seqCount) & length(seqCount) == 1)
    stopifnot(is.character(barcodeCount) & length(barcodeCount) == 1)
    stopifnot(is.character(cellCount) & length(cellCount) == 1)
    stopifnot(is.character(ShsCount) & length(ShsCount) == 1)
    param_col <- c(
        fragmentEstimate = fragmentEstimate,
        seqCount = seqCount, barcodeCount = barcodeCount,
        cellCount = cellCount,
        ShsCount = ShsCount
    )
    to_copy <- if (any(!num_cols %in% param_col)) {
        if (all(!num_cols %in% param_col)) {
            stop(.non_quant_cols_error())
        }
        num_cols[!num_cols %in% param_col]
    }
    num_cols <- param_col[param_col %in% num_cols]
    annot <- if (.is_annotated(x)) {
        annotation_IS_vars()
    } else {
        character(0)
    }
    if (!purrr::is_empty(to_copy) & getOption("ISAnalytics.verbose") == TRUE) {
        message(.non_quant_cols_msg(to_copy))
    }
    separated <- purrr::map(num_cols, function(quant) {
        x[c(
            mandatory_IS_vars(), annot, "CompleteAmplificationID",
            to_copy, quant
        )] %>% dplyr::rename(Value = quant)
    }) %>% purrr::set_names(names(num_cols))
    separated
}


#' Filter data frames with custom predicates
#'
#' @description
#' \lifecycle{experimental}
#' Filter a single data frame or a list of data frames with custom
#' predicates assembled from the function parameters.
#'
#' @details
#' ## A single data frame as input
#'
#' If the user chooses to operate on a single data frame, the other parameters
#' should only be vectors: numeric vector for `threshold` and character
#' vectors for both `cols_to_compare` and `comparators`.
#' A filtering condition is obtained by combining element by element
#' `cols_to_compare` + `comparators` + `threshold` (similarly to the
#' `paste` function). For example:
#'
#' \verb{
#' threshold = c(20, 35, 50)
#' cols_to_compare = c("a", "b", "c")
#' comparators = "<"
#' }
#'
#' given these vectors, the input data frame
#' will be filtered by checking which values in column "a" are less
#' than 20 **AND** which values in column "b" are less than 35 **AND**
#' which values in column "c" are less than 50.
#' Things the user should keep in mind are:
#' * The vectors of length 1 are going to be recycled if one or
#' more parameters are longer (in the example, the `comparators` value)
#' * If vectors are not of length 1 they must have the same length
#' * Columns to compare, of course, need to be included in the
#' input data frame and need to be numeric/integer
#' * The filtering will perform a logical "AND" on all the conditions,
#' only rows that satisfy ALL the conditions are preserved
#'
#' ## A list of data frames as input
#'
#' The input for the function may also be a list of data frames,
#' either named or unnamed.
#'
#' ### Unnamed list
#' If the input is a simple unnamed list, the other parameters should
#' be simple vectors (as for data frames). All the predicates will
#' simply be applied to every data frame in the list: this is useful
#' if it's desirable to filter for the same conditions different data frames
#' that have the same structure but different data.
#'
#' ### Named list
#' It is also possible to filter different data frames with different
#' sets of conditions. Besides having the possibility of defining the
#' other parameters as simple vector, which has the same results as
#' operating on an unnamed list, the user can define the parameters as
#' named lists containing vectors. For example:
#'
#' ```{r}
#'
#' example_df <- tibble::tibble(a = c(20, 30, 40),
#'                              b = c(40, 50, 60),
#'                              c = c("a", "b", "c"),
#'                              d = c(3L, 4L, 5L))
#' example_list <- list(first = example_df,
#'                      second = example_df,
#'                      third = example_df)
#' print(example_list)
#'
#' filtered <- threshold_filter(example_list,
#'                              threshold = list(first = c(20, 60),
#'                                               third = c(25)),
#'                              cols_to_compare = list(first = c("a", "b"),
#'                                                     third = c("a")),
#'                              comparators = list(first = c(">", "<"),
#'                                                 third = c(">=")))
#' print(filtered)
#'
#' ```
#' The above signature will roughly be translated as:
#' * Filter the element "first" in the list by checking that values in
#' column "a" are bigger than 20 AND values in column "b" are less than
#' 60
#' * Don't apply any filter to the element "second" (returns the
#' data frame as is)
#' * Filter the element "third" by checking that values in column "a"
#' are equal or bigger than 25.
#'
#' It is also possible to use some parameters as vectors and some as
#' lists: vectors will be recycled for every element filtered.
#'
#' ```r
#' filtered <- threshold_filter(example_list,
#'                              threshold = list(first = c(20, 60),
#'                                               third = c(25, 65)),
#'                              cols_to_compare = c("a", "b"),
#'                              comparators = list(first = c(">", "<"),
#'                                                 third = c(">=", "<=")))
#' ```
#' In this example, different threshold and comparators will be applied
#' to the same columns in all data frames.
#'
#' Things the user should keep in mind are:
#' * Names for the list parameters must be the same names in the
#' input list
#' * Only elements explicited in list parameters as names will
#' be filtered
#' * Lengths of both vectors and lists must be consistent
#'
#' @param x A data frame or a list of data frames
#' @param threshold A numeric/integer vector or a named list of
#' numeric/integer vectors
#' @param cols_to_compare A character vector or a named list of
#' character vectors
#' @param comparators A character vector or a named list of
#' character vectors. Must be one of the allowed values between
#' `c("<", ">", "==", "!=", ">=", "<=")`
#'
#' @family Analysis functions
#'
#' @return A data frame or a list of data frames
#' @export
#'
#' @examples
#' example_df <- tibble::tibble(
#'     a = c(20, 30, 40),
#'     b = c(40, 50, 60),
#'     c = c("a", "b", "c"),
#'     d = c(3L, 4L, 5L)
#' )
#' example_list <- list(
#'     first = example_df,
#'     second = example_df,
#'     third = example_df
#' )
#'
#' filtered <- threshold_filter(example_list,
#'     threshold = list(
#'         first = c(20, 60),
#'         third = c(25)
#'     ),
#'     cols_to_compare = list(
#'         first = c("a", "b"),
#'         third = c("a")
#'     ),
#'     comparators = list(
#'         first = c(">", "<"),
#'         third = c(">=")
#'     )
#' )
threshold_filter <- function(x,
    threshold,
    cols_to_compare = "Value",
    comparators = ">") {
    stopifnot(is.list(x))
    ### ---- If x is a data frame ---- ###
    if (is.data.frame(x)) {
        return(.tf_data_frame(x, threshold, cols_to_compare, comparators))
    }
    ### ---- If x is a list ---- ###
    return(.tf_list(x, threshold, cols_to_compare, comparators))
}


#' Sorts and keeps the top n integration sites based on the values
#' in a given column.
#'
#' \lifecycle{experimental}
#' The input data frame will be sorted by the highest values in
#' the columns specified and the top n rows will be returned as output.
#' The user can choose to keep additional columns in the output
#' by passing a vector of column names or passing 2 "shortcuts":
#' * `keep = "everything"` keeps all columns in the original data frame
#' * `keep = "nothing"` only keeps the mandatory columns
#' (`mandatory_IS_vars()`) plus the columns in the `columns` parameter.
#'
#' @param x An integration matrix (data frame containing
#' `mandatory_IS_vars()`)
#' @param n How many integrations should be sliced (in total or
#'  for each group)? Must be numeric
#' or integer and greater than 0
#' @param columns Columns to use for the sorting. If more than a column
#' is supplied primary ordering is done on the first column,
#' secondary ordering on all other columns
#' @param keep Names of the columns to keep besides `mandatory_IS_vars()`
#' and `columns`
#' @param key Either `NULL` or a character vector of column names to group
#' by. If not `NULL` the input will be grouped and the top fraction will
#' be extracted from each group.
#'
#' @family Analysis functions
#'
#' @import dplyr
#' @importFrom magrittr `%>%`
#' @importFrom rlang abort
#'
#' @return Either a data frame with at most n rows or
#' a data frames with at most n*(number of groups) rows.
#' @export
#'
#' @examples
#' smpl <- tibble::tibble(
#'     chr = c("1", "2", "3", "4", "5", "6"),
#'     integration_locus = c(14536, 14544, 14512, 14236, 14522, 14566),
#'     strand = c("+", "+", "-", "+", "-", "+"),
#'     CompleteAmplificationID = c("ID1", "ID2", "ID1", "ID1", "ID3", "ID2"),
#'     Value = c(3, 10, 40, 2, 15, 150),
#'     Value2 = c(456, 87, 87, 9, 64, 96),
#'     Value3 = c("a", "b", "c", "d", "e", "f")
#' )
#' top <- top_integrations(smpl,
#'     n = 3,
#'     columns = c("Value", "Value2"),
#'     keep = "nothing"
#' )
#' top_key <- top_integrations(smpl,
#'     n = 3,
#'     columns = "Value",
#'     keep = "Value2",
#'     key = "CompleteAmplificationID"
#' )
top_integrations <- function(x, n = 50,
    columns = "fragmentEstimate_sum_RelAbundance",
    keep = "everything", key = NULL) {
    stopifnot(is.data.frame(x))
    stopifnot(is.numeric(n) & length(n) == 1 & n > 0)
    stopifnot(is.character(keep))
    stopifnot(is.character(columns))
    stopifnot(is.null(key) || is.character(key))
    if (!.check_mandatory_vars(x)) {
        rlang::abort(.non_ISM_error())
    }
    if (!all(columns %in% colnames(x))) {
        rlang::abort(.missing_user_cols_error(
            columns[!columns %in% colnames(x)]
        ))
    }
    if (!(all(keep == "everything") || all(keep == "nothing"))) {
        if (any(!keep %in% colnames(x))) {
            rlang::abort(.missing_user_cols_error(
                keep[!keep %in% colnames(x)]
            ))
        }
    }
    if (!is.null(key)) {
        if (!all(key %in% colnames(x))) {
            rlang::abort(.missing_user_cols_error(
                key[!key %in% colnames(x)]
            ))
        }
    }
    essential_cols <- c(mandatory_IS_vars(), columns)
    to_keep <- if (all(keep == "everything")) {
        colnames(x)[!colnames(x) %in% essential_cols]
    } else if (all(keep == "nothing")) {
        character(0)
    } else {
        keep[!keep %in% essential_cols]
    }
    if (!is.null(key)) {
        result <- x %>%
            dplyr::group_by(dplyr::across(dplyr::all_of(key))) %>%
            dplyr::arrange(dplyr::across(
                dplyr::all_of(columns),
                dplyr::desc
            ), .by_group = TRUE) %>%
            dplyr::slice_head(n = n) %>%
            dplyr::select(dplyr::all_of(c(key, essential_cols, to_keep))) %>%
            dplyr::ungroup()
        return(result)
    }
    result <- x %>%
        dplyr::arrange(dplyr::across(
            dplyr::all_of(columns),
            dplyr::desc
        )) %>%
        dplyr::slice_head(n = n) %>%
        dplyr::select(dplyr::all_of(c(essential_cols, to_keep)))
    return(result)
}


#' Computes user specified functions on numerical columns and updates
#' the metadata data frame accordingly.
#'
#' @description
#' \lifecycle{experimental}
#' The function operates on a data frame by grouping the content by
#' the sample key and computing every function specified on every
#' column in the `value_columns` parameter. After that the metadata
#' data frame is updated by including the computed results as columns
#' for the corresponding key.
#' For this reason it's required that both `x` and `metadata` have the
#' same sample key, and it's particularly important if the user is
#' working with previously aggregated data.
#' For example:
#'
#' ```r
#' ### Importing association file and matrices
#' path_AF <- system.file("extdata", "ex_association_file.tsv",
#' package = "ISAnalytics")
#' root_correct <- system.file("extdata", "fs.zip",
#' package = "ISAnalytics")
#' root_correct <- unzip_file_system(root_correct, "fs")
#'
#' association_file <- import_association_file(path_AF, root_correct)
#' matrices <- import_parallel_Vispa2Matrices_auto(
#' association_file = association_file , root = NULL,
#' quantification_type = c("seqCount","fragmentEstimate"),
#' matrix_type = "annotated", workers = 2, patterns = NULL,
#' matching_opt = "ANY", dates_format = "dmy")
#'
#' ### Aggregating data (both by same key)
#' aggreggated_x <- aggregate_values_by_key(matrices$seqCount,
#' association_file)
#' aggregated_meta <- aggregate_metadata(association_file)
#'
#' ### Sample statistics
#' sample_stats <- sample_statistics(x = aggregated_x,
#' metadata = aggregated_meta,
#' sample_key = c("SubjectID", "CellMarker","Tissue", "TimePoint"))
#'
#' ```
#' @param x A data frame
#' @param metadata The metadata data frame
#' @param sample_key Character vector representing the key for identifying
#' a sample
#' @param value_columns THe name of the columns to be computed,
#' must be numeric or integer
#' @param functions A named list of function or purrr-style lambdas
#'
#' @family Analysis functions
#' @importFrom rlang eval_tidy expr
#' @importFrom purrr is_function is_formula
#' @import dplyr
#' @importFrom magrittr `%>%`
#'
#' @return A list with modified x and metadata data frames
#' @export
#'
#' @examples
#' op <- options(ISAnalytics.widgets = FALSE)
#'
#' path_AF <- system.file("extdata", "ex_association_file.tsv",
#'     package = "ISAnalytics"
#' )
#' root_correct <- system.file("extdata", "fs.zip",
#'     package = "ISAnalytics"
#' )
#' root_correct <- unzip_file_system(root_correct, "fs")
#'
#' association_file <- import_association_file(path_AF, root_correct,
#'     dates_format = "dmy"
#' )
#' matrices <- import_parallel_Vispa2Matrices_auto(
#'     association_file = association_file, root = NULL,
#'     quantification_type = c("seqCount", "fragmentEstimate"),
#'     matrix_type = "annotated", workers = 2, patterns = NULL,
#'     matching_opt = "ANY", multi_quant_matrix = FALSE
#' )
#'
#' stats <- sample_statistics(matrices$seqCount, association_file)
#' options(op)
sample_statistics <- function(x, metadata,
    sample_key = "CompleteAmplificationID",
    value_columns = "Value",
    functions = default_stats()) {
    stopifnot(is.data.frame(x))
    stopifnot(is.data.frame(metadata))
    stopifnot(is.character(sample_key))
    stopifnot(is.character(value_columns))
    stopifnot(is.list(functions))
    if (!all(sample_key %in% colnames(x))) {
        stop(paste("Key columns not found in the data frame"))
    }
    if (!all(sample_key %in% colnames(metadata))) {
        stop(paste("Key columns not found in metadata"))
    }
    if (!all(value_columns %in% colnames(x))) {
        stop(paste("Value columns not found in the data frame"))
    }
    purrr::walk(value_columns, function(col) {
        expr <- rlang::expr(`$`(x, !!col))
        if (!is.numeric(rlang::eval_tidy(expr)) &&
            !is.integer(rlang::eval_tidy(expr))) {
            stop(paste("Some or all of value columns are not numeric"))
        }
    })
    purrr::walk(functions, function(f) {
        if (!(purrr::is_function(f) | purrr::is_formula(f))) {
            stop(paste(
                "The function parameter should contain a list",
                "of either functions or formulas.",
                "See ?sample_statistics for details"
            ))
        }
    })
    result <- x %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(sample_key))) %>%
        dplyr::summarise(dplyr::across(
            .cols = dplyr::all_of(value_columns),
            .fns = functions
        ),
        .groups = "drop"
        )

    ## Flatten nested data frames
    df_cols <- purrr::map_lgl(result, ~ is.data.frame(.x))
    if (any(df_cols == TRUE)) {
        df_cols <- df_cols[df_cols]
        df_cols <- names(df_cols)
        dfss <- purrr::map(df_cols, function(col) {
            exp <- rlang::expr(`$`(result, !!col))
            df <- rlang::eval_tidy(exp)
            df <- df %>% dplyr::rename_with(.fn = ~ paste0(col, "_", .x))
            df
        }) %>% purrr::set_names(df_cols)
        for (dfc in df_cols) {
            result <- result %>%
                dplyr::select(-dplyr::all_of(dfc)) %>%
                dplyr::bind_cols(dfss[[dfc]])
        }
    }

    updated_meta <- metadata %>% dplyr::left_join(result, by = sample_key)
    return(list(x = result, metadata = updated_meta))
}


#' Grubbs test for Common Insertion Sites (CIS).
#'
#' \lifecycle{experimental}
#' Statistical approach for the validation of common insertion sites
#' significance based on the comparison of the integration frequency
#' at the CIS gene with respect to other genes contained in the
#' surrounding genomic regions. For more details please refer to
#' this paper:
#' <`r .lentiviral_CIS_paper()`>
#'
#' @details
#' ## Genomic annotation file
#' This file is a data base, or more simply a .tsv file to import, with
#' genes annotation for the specific genome. The annotations for the
#' human genome (hg19) is already included in this package.
#' If for any reason the user is performing an analysis on another genome,
#' this file needs to be changed respecting the USCS Genome Browser
#' format, meaning the input file headers should be:
#'
#' ```{r echo=FALSE, tidy=TRUE}
#' cat(c(
#'   paste(c("name2", "chrom", "strand"), collapse = ", "), "\n",
#'   paste(c("min_txStart","max_txEnd", "minmax_TxLen"), collapse = ", "),
#'   "\n",
#'   paste(c("average_TxLen", "name", "min_cdsStart"), collapse = ", "),
#'   "\n",
#'   paste(c("max_cdsEnd","minmax_CdsLen", "average_CdsLen"), collapse = ", ")
#' ))
#'
#' ```
#'
#' @param x An integration matrix, must include the `mandatory_IS_vars()`
#' columns and the `annotation_IS_vars()` columns
#' @param genomic_annotation_file Database file for gene annotation,
#' see details
#' @param grubbs_flanking_gene_bp Number of base pairs flanking a gene
#' @param threshold_alpha Significance threshold
#' @param add_standard_padjust Compute the standard padjust?
#'
#' @family Analysis functions
#'
#' @import dplyr
#' @importFrom tibble as_tibble
#' @importFrom rlang .data
#' @importFrom magrittr `%>%`
#' @importFrom stats median pt p.adjust
#' @importFrom utils read.csv
#'
#' @return A data frame
#' @export
#'
#' @examples
#' op <- options(ISAnalytics.widgets = FALSE)
#'
#' path_AF <- system.file("extdata", "ex_association_file.tsv",
#'     package = "ISAnalytics"
#' )
#' root_correct <- system.file("extdata", "fs.zip",
#'     package = "ISAnalytics"
#' )
#' root_correct <- unzip_file_system(root_correct, "fs")
#'
#' matrices <- import_parallel_Vispa2Matrices_auto(
#'     association_file = path_AF, root = root_correct,
#'     quantification_type = c("seqCount", "fragmentEstimate"),
#'     matrix_type = "annotated", workers = 2, patterns = NULL,
#'     matching_opt = "ANY",
#'     dates_format = "dmy"
#' )
#'
#' cis <- CIS_grubbs(matrices)
#'
#' options(op)
CIS_grubbs <- function(x,
    genomic_annotation_file =
        system.file("extdata", "hg19.refGene.oracle.tsv.xz",
            package = "ISAnalytics"
        ),
    grubbs_flanking_gene_bp = 100000,
    threshold_alpha = 0.05,
    add_standard_padjust = TRUE) {
    # Check x has the correct structure
    stopifnot(is.data.frame(x))
    if (!all(mandatory_IS_vars() %in% colnames(x))) {
        stop(.non_ISM_error())
    }
    if (!.is_annotated(x)) {
        stop(.missing_annot())
    }
    # Check other parameters
    stopifnot(is.character(genomic_annotation_file) &
        length(genomic_annotation_file) == 1)
    stopifnot(is.numeric(grubbs_flanking_gene_bp) |
        is.integer(grubbs_flanking_gene_bp))
    stopifnot(length(grubbs_flanking_gene_bp) == 1)
    stopifnot(is.numeric(threshold_alpha) & length(threshold_alpha) == 1)
    stopifnot(is.logical(add_standard_padjust) &
        length(add_standard_padjust) == 1)
    stopifnot(file.exists(genomic_annotation_file))
    # Determine file extension
    ext <- .check_file_extension(genomic_annotation_file)

    # Try to import annotation file
    if (ext == "tsv") {
        refgenes <- utils::read.csv(
            file = genomic_annotation_file,
            header = TRUE, fill = TRUE, sep = "\t",
            check.names = FALSE,
            na.strings = c("NONE", "NA", "NULL", "NaN", "")
        )
        refgenes <- tibble::as_tibble(refgenes) %>%
            dplyr::mutate(chrom = stringr::str_replace_all(
                .data$chrom,
                "chr", ""
            ))
    } else if (ext == "csv") {
        refgenes <- utils::read.csv(
            file = genomic_annotation_file,
            header = TRUE, fill = TRUE,
            check.names = FALSE,
            na.strings = c("NONE", "NA", "NULL", "NaN", "")
        )
        refgenes <- tibble::as_tibble(refgenes) %>%
            dplyr::mutate(chrom = stringr::str_replace_all(
                .data$chrom,
                "chr", ""
            ))
    } else {
        stop(paste(
            "The genomic annotation file must be either in",
            ".tsv or .csv format (compressed or not)"
        ))
    }

    # Check annotation file format
    if (!all(c(
        "name2", "chrom", "strand", "min_txStart", "max_txEnd",
        "minmax_TxLen", "average_TxLen", "name", "min_cdsStart",
        "max_cdsEnd", "minmax_CdsLen", "average_CdsLen"
    ) %in%
        colnames(refgenes))) {
        stop(.non_standard_annotation_structure())
    }
    ### Begin - init phase
    df_by_gene <- x %>%
        dplyr::group_by(
            .data$GeneName,
            .data$GeneStrand,
            .data$chr
        ) %>%
        dplyr::summarise(
            n_IS_perGene = dplyr::n_distinct(
                .data$integration_locus
            ),
            min_bp_integration_locus =
                min(.data$integration_locus),
            max_bp_integration_locus =
                max(.data$integration_locus),
            IS_span_bp = (max(.data$integration_locus) -
                min(.data$integration_locus)),
            avg_bp_integration_locus =
                mean(.data$integration_locus),
            median_bp_integration_locus =
                stats::median(.data$integration_locus),
            distinct_orientations = dplyr::n_distinct(.data$strand),
            describe = list(tibble::as_tibble(
                psych::describe(.data$integration_locus)
            )),
            .groups = "drop"
        ) %>%
        tidyr::unnest(.data$describe, keep_empty = TRUE, names_sep = "_")

    df_bygene_withannotation <- df_by_gene %>%
        dplyr::inner_join(refgenes, by = c(
            "chr" = "chrom",
            "GeneStrand" = "strand",
            "GeneName" = "name2"
        )) %>%
        dplyr::select(c(
            dplyr::all_of(colnames(df_by_gene)),
            .data$average_TxLen
        ))
    n_elements <- nrow(df_bygene_withannotation)

    df_bygene_withannotation <- df_bygene_withannotation %>%
        dplyr::mutate(
            geneIS_frequency_byHitIS = .data$n_IS_perGene / n_elements
        )

    ### Grubbs test
    ### --- Gene Frequency
    df_bygene_withannotation <- df_bygene_withannotation %>%
        dplyr::mutate(
            raw_gene_integration_frequency =
                .data$n_IS_perGene / .data$average_TxLen,
            integration_frequency_withtolerance = .data$n_IS_perGene /
                (.data$average_TxLen + grubbs_flanking_gene_bp) * 1000,
            minus_log2_integration_freq_withtolerance =
                -log(x = .data$integration_frequency_withtolerance, base = 2)
        )
    ### --- z score
    z_mlif <- function(x) {
        sqrt((n_elements * (n_elements - 2) * x^2) /
            (((n_elements - 1)^2) - (n_elements * x^2)))
    }
    df_bygene_withannotation <- df_bygene_withannotation %>%
        dplyr::mutate(
            zscore_minus_log2_int_freq_tolerance =
                scale(-log(
                    x = .data$integration_frequency_withtolerance,
                    base = 2
                )),
            neg_zscore_minus_log2_int_freq_tolerance =
                -.data$zscore_minus_log2_int_freq_tolerance,
            t_z_mlif = z_mlif(
                .data$neg_zscore_minus_log2_int_freq_tolerance
            )
        )
    ### --- tdist
    t_dist_2t <- function(x, deg) {
        return((1 - stats::pt(x, deg)) * 2)
    }
    df_bygene_withannotation <- df_bygene_withannotation %>%
        dplyr::mutate(
            tdist2t = t_dist_2t(.data$t_z_mlif, n_elements - 2),
            tdist_pt = pt(
                q = .data$t_z_mlif,
                df = n_elements - 2
            ),
            tdist_bonferroni_default = ifelse(
                .data$tdist2t * n_elements > 1, 1,
                .data$tdist2t * n_elements
            )
        )
    if (add_standard_padjust == TRUE) {
        df_bygene_withannotation <- df_bygene_withannotation %>%
            dplyr::mutate(
                tdist_bonferroni = stats::p.adjust(
                    .data$tdist2t,
                    method = "bonferroni",
                    n = length(.data$tdist2t)
                ),
                tdist_fdr = stats::p.adjust(
                    .data$tdist2t,
                    method = "fdr",
                    n = length(.data$tdist2t)
                ),
                tdist_benjamini = stats::p.adjust(
                    .data$tdist2t,
                    method = "BY",
                    n = length(.data$tdist2t)
                )
            )
    }
    df_bygene_withannotation <- df_bygene_withannotation %>%
        dplyr::mutate(
            tdist_positive_and_corrected =
                ifelse(
                    (.data$tdist_bonferroni_default < threshold_alpha &
                        .data$neg_zscore_minus_log2_int_freq_tolerance > 0),
                    .data$tdist_bonferroni_default,
                    NA
                ),
            tdist_positive = ifelse(
                (.data$tdist2t < threshold_alpha &
                    .data$neg_zscore_minus_log2_int_freq_tolerance > 0),
                .data$tdist2t,
                NA
            )
        )
    EM_correction_N <- length(
        df_bygene_withannotation$tdist_positive[
            !is.na(df_bygene_withannotation$tdist_positive)
        ]
    )
    df_bygene_withannotation <- df_bygene_withannotation %>%
        dplyr::mutate(
            tdist_positive_and_correctedEM =
                ifelse(
                    (.data$tdist2t * EM_correction_N <
                        threshold_alpha &
                        .data$neg_zscore_minus_log2_int_freq_tolerance > 0),
                    .data$tdist2t * EM_correction_N,
                    NA
                )
        )
    return(df_bygene_withannotation)
}

#' Integrations cumulative count in time by sample
#'
#' \lifecycle{experimental}
#' This function computes the cumulative number of integrations
#' observed in each sample at different time points by assuming that
#' if an integration is observed at time point "t" then it is also observed in
#' time point "t+1".
#'
#' @details
#' ## Input data frame
#' The user can provide as input for the `x` parameter both a simple
#' integration matrix AND setting the `aggregate` parameter to TRUE,
#' or provide an already aggregated matrix via
#' \link{aggregate_values_by_key}.
#' If the user supplies a matrix to be aggregated the `association_file`
#' parameter must not be NULL: aggregation will be done by an internal
#' call to the aggregation function.
#' If the user supplies an already aggregated matrix, the `key` parameter
#' is the key used for aggregation -
#' **NOTE: for this operation is mandatory
#' that the time point column is included in the key.**
#' ## Assumptions on time point format
#' By using the functions provided by this package, when imported,
#' an association file will be correctly formatted for future usage.
#' In the formatting process there is also a padding operation performed on
#' time points: this means the functions expects the time point column to
#' be of type character and to be correctly padded with 0s. If the
#' chosen column for time point is detected as numeric the function will
#' attempt the conversion to character and automatic padding.
#' If you choose to import the association file not using the
#' \link{import_association_file} function, be sure to check the format of
#' the chosen column to avoid undesired results.
#'
#' @param x A simple integration matrix or an aggregated matrix (see details)
#' @param association_file NULL or the association file for x if `aggregate`
#' is set to TRUE
#' @param timepoint_column What is the name of the time point column?
#' @param key The aggregation key - must always contain the `timepoint_column`
#' @param include_tp_zero Include timepoint 0?
#' @param zero How is 0 coded in the data frame?
#' @param aggregate Should x be aggregated?
#' @param ... Additional parameters to pass to `aggregate_values_by_key`
#'
#' @family Analysis functions
#'
#' @import dplyr
#' @importFrom magrittr `%>%`
#' @importFrom rlang .data
#' @importFrom stringr str_pad
#' @importFrom purrr reduce is_empty
#' @importFrom tidyr pivot_longer
#' @importFrom stats na.omit
#'
#' @return A data frame
#' @export
#'
#' @examples
#' op <- options(ISAnalytics.widgets = FALSE)
#'
#' path_AF <- system.file("extdata", "ex_association_file.tsv",
#'     package = "ISAnalytics"
#' )
#' root_correct <- system.file("extdata", "fs.zip",
#'     package = "ISAnalytics"
#' )
#' root_correct <- unzip_file_system(root_correct, "fs")
#'
#' association_file <- import_association_file(path_AF, root_correct,
#'     dates_format = "dmy"
#' )
#' matrices <- import_parallel_Vispa2Matrices_auto(
#'     association_file = association_file, root = NULL,
#'     quantification_type = c("seqCount", "fragmentEstimate"),
#'     matrix_type = "annotated", workers = 2, patterns = NULL,
#'     matching_opt = "ANY", multi_quant_matrix = FALSE
#' )
#'
#' #### EXTERNAL AGGREGATION
#' aggregated <- aggregate_values_by_key(matrices$seqCount, association_file)
#' cumulative_count <- cumulative_count_union(aggregated)
#'
#' #### INTERNAL AGGREGATION
#' cumulative_count_2 <- cumulative_count_union(matrices$seqCount,
#'     association_file,
#'     aggregate = TRUE
#' )
#'
#' options(op)
cumulative_count_union <- function(x,
    association_file = NULL,
    timepoint_column = "TimePoint",
    key = c(
        "SubjectID",
        "CellMarker",
        "Tissue",
        "TimePoint"
    ),
    include_tp_zero = FALSE,
    zero = "0000",
    aggregate = FALSE,
    ...) {
    stopifnot(is.data.frame(x))
    stopifnot(is.data.frame(association_file) | is.null(association_file))
    stopifnot(is.character(timepoint_column) & length(timepoint_column) == 1)
    stopifnot(is.character(key))
    stopifnot(is.logical(include_tp_zero))
    stopifnot(is.character(zero) & length(zero) == 1)
    stopifnot(is.logical(aggregate))
    if (aggregate == TRUE & is.null(association_file)) {
        stop(.agg_with_null_meta_err())
    }
    if (!all(timepoint_column %in% key)) {
        stop(.key_without_tp_err())
    }
    if (aggregate == FALSE) {
        if (!all(key %in% colnames(x))) {
            stop(.key_not_found())
        }
    } else {
        x <- aggregate_values_by_key(
            x = x,
            association_file = association_file,
            key = key, ...
        )
        if (is.numeric(association_file[[timepoint_column]]) |
            is.integer(association_file[[timepoint_column]])) {
            max <- max(association_file[[timepoint_column]])
            digits <- floor(log10(x)) + 1
            association_file <- association_file %>%
                dplyr::mutate({{ timepoint_column }} := stringr::str_pad(
                    as.character(.data$TimePoint),
                    digits,
                    side = "left",
                    pad = "0"
                ))
            zero <- paste0(rep_len("0", digits), collapse = "")
        }
    }
    if (include_tp_zero == FALSE) {
        x <- x %>%
            dplyr::filter(dplyr::across(
                dplyr::all_of(timepoint_column),
                ~ .x != zero
            ))
        if (nrow(x) == 0) {
            message(paste(
                "All time points zeros were excluded, the data",
                "frame is empty."
            ))
            return(x)
        }
    }
    annot <- if (.is_annotated(x)) {
        annotation_IS_vars()
    } else {
        character(0)
    }
    x <- x %>% dplyr::select(dplyr::all_of(c(
        mandatory_IS_vars(),
        annot,
        key
    )))
    cols_for_join <- colnames(x)[!colnames(x) %in% timepoint_column]
    key_minus_tp <- key[!key %in% timepoint_column]
    distinct_tp_for_each <- x %>%
        dplyr::arrange(dplyr::across(dplyr::all_of(timepoint_column))) %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(key_minus_tp))) %>%
        dplyr::summarise(
            Distinct_tp = unique(
                `$`(.data, !!timepoint_column)
            ),
            .groups = "drop"
        ) %>%
        dplyr::rename({{ timepoint_column }} := "Distinct_tp")

    splitted <- x %>%
        dplyr::arrange(dplyr::across(dplyr::all_of(timepoint_column))) %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(timepoint_column))) %>%
        dplyr::group_split()

    mult_tp <- purrr::reduce(splitted, dplyr::full_join, by = cols_for_join)
    tp_indexes <- grep(timepoint_column, colnames(mult_tp))
    tp_indexes <- tp_indexes[-1]
    res <- if (!purrr::is_empty(tp_indexes)) {
        for (i in tp_indexes) {
            mod_col <- mult_tp[[i]]
            mod_col_prec <- mult_tp[[i - 1]]
            val <- dplyr::first(stats::na.omit(mod_col))
            mod_col[!is.na(mod_col_prec)] <- val
            mult_tp[i] <- mod_col
        }
        mult_tp %>%
            tidyr::pivot_longer(
                cols = dplyr::starts_with(timepoint_column),
                values_to = timepoint_column,
                values_drop_na = TRUE
            ) %>%
            dplyr::select(-c("name")) %>%
            dplyr::distinct()
    } else {
        mult_tp
    }
    res <- res %>% dplyr::semi_join(distinct_tp_for_each, by = key)
    res <- res %>%
        dplyr::group_by(dplyr::across(dplyr::all_of(key))) %>%
        dplyr::summarise(count = dplyr::n(), .groups = "drop")
    return(res)
}


#' A set of pre-defined functions for `sample_statistics`.
#'
#' @return A named list of functions/purrr-style lambdas
#' @export
#'
#' @family Analysis functions helpers
#'
#'
#' @examples
#' default_stats()
default_stats <- function() {
    list(
        diversity = vegan::diversity,
        sum = ~ sum(.x, na.rm = TRUE),
        count = length,
        describe = ~ tibble::as_tibble(psych::describe(.x))
    )
}
Any scripts or data that you put into this service are public.
ISAnalytics documentation built on April 9, 2021, 6:01 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ISAnalytics
Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies

R/analysis-functions.R
In ISAnalytics: Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies

Defines functions default_stats cumulative_count_union CIS_grubbs sample_statistics top_integrations threshold_filter separate_quant_matrices comparison_matrix compute_abundance

Documented in CIS_grubbs comparison_matrix compute_abundance cumulative_count_union default_stats sample_statistics separate_quant_matrices threshold_filter top_integrations

Try the ISAnalytics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ISAnalytics Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies

R/analysis-functions.R In ISAnalytics: Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies

Defines functions default_stats cumulative_count_union CIS_grubbs sample_statistics top_integrations threshold_filter separate_quant_matrices comparison_matrix compute_abundance

Documented in CIS_grubbs comparison_matrix compute_abundance cumulative_count_union default_stats sample_statistics separate_quant_matrices threshold_filter top_integrations

Try the ISAnalytics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ISAnalytics
Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies

R/analysis-functions.R
In ISAnalytics: Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies