R/scores.R

Defines functions score_gloh score_mbalt score_nlst score_estwgd score_avgcn score_td is_arm_to_ban score_loh score_lst armlevel_alt

Documented in armlevel_alt score_avgcn score_estwgd score_gloh score_loh score_lst score_mbalt score_nlst score_td

# scores.R Functions to identify the globally-altered arms and computes various
# scores.  Author: Yann Christinat Date: 19.9.2019


#' Get all globally-altered chromosome arms.
#'
#' @details By default uses the sum of all alterations and set the arm as
#' globally altered if \>80\% of the arm is altered. Does not account for
#' alteration type and copy number.
#' Will run the function \code{trim_to_coverage} on the segments.
#'
#' @param segments A \code{GRanges} object containing the segments.
#' @param kit.coverage A \code{GRanges} object containing the regions covered on
#'  each chromosome arm.
#' @param threshold The minimum percentage of the arm to be considered as
#' globally altered. Defaults to 80\%.
#'
#' @return A list of globally-altered chromosome arms with the percentage of arm
#'  altered.
#' @export
#'
#' @import GenomicRanges
#' @importFrom methods is
#'
#' @examples
#' arms <- armlevel_alt(segs.chas_example, oncoscan_na33.cov, 0.9)
armlevel_alt <- function(segments, kit.coverage, threshold = 0.9) {
    is_cn_segment(segments, raise_error = TRUE)
    stopifnot(is(kit.coverage, "GRanges"))

    vals <- vapply(levels(seqnames(kit.coverage)), function(arm) {
        if (!(arm %in% as.vector(seqnames(segments)))) {
            return(0)
        }
        # Get the number of bases covered by the kit
        iscov <- as.vector(seqnames(kit.coverage)) == arm
        armlen <- sum(width(kit.coverage[iscov]))

        # Run trim_to_coverage on arm segments
        armsegs <- segments[as.vector(seqnames(segments)) == arm]
        segs <- trim_to_coverage(armsegs, kit.coverage)
        if (length(segs) == 0) {
            return(0)
        }


        # Set the copy number to NA to make sure overlapping segments will be
        # merged, then merge
        segs$cn <- NA
        segs.flat <- merge_segments(segs, 1 / 1000)

        # Get the number of bases altered
        segslen <- sum(width(segs.flat))

        return(segslen / armlen)
    }, c(0), USE.NAMES = TRUE)

    return(vals[vals >= threshold])
}


#' Compute the number of Large-scale State Transitions (LSTs).
#'
#' @details Procedure based on the paper from Popova et al, Can. Res. 2012
#' (PMID: 22933060). First segments smaller than 3Mb are removed, then segments
#' are smoothed with respect to copy number at a distance of 3Mb.
#' The number of LSTs is the number of breakpoints (breakpoints closer than 3Mb
#' are merged) that have a segment larger or equal to 10Mb on each side. This
#' score was linked to BRCA1/2-deficient tumors.
#'
#' @param segments A \code{GRanges} object containing the segments, their copy
#' number and copy number types.
#' @param kit.coverage A \code{GRanges} object containing the regions covered on
#'  each chromosome arm.
#'
#' @return An integer representing the number of LSTs.
#' @export
#'
#' @examples
#' score_lst(segs.chas_example, oncoscan_na33.cov)
score_lst <- function(segments, kit.coverage) {
    is_cn_segment(segments, raise_error = TRUE)
    stopifnot(is(kit.coverage, "GRanges"))

    # Prune all segments < 3Mb
    segs.min3mb <- prune_by_size(segments, 3000)
    if (length(segs.min3mb) == 0) {
        return(0)
    }

    # Merge segments by CN at a distance of 3Mb
    segs.merged <- merge_segments(segs.min3mb, 3000)

    # Compute the number of Large-scale State Transitions for each arm
    vals <- lapply(unique(seqnames(segs.merged)), function(arm) {
        arm.cov <- kit.coverage[as.vector(seqnames(kit.coverage)) == arm]
        segs <- segs.merged[as.vector(seqnames(segs.merged)) == arm]

        # Get breakpoints and start/end of arm
        starts <- start(segs)
        ends <- end(segs)
        breakpoints <-
            sort(unique(c(start(arm.cov), starts, ends + 1, end(arm.cov) + 1)))

        if (length(breakpoints) < 3) {
            # Since we add the start/end of arm, there has to be at least 3
            # elements in 'breakpoints' to have LSTs
            return(0)
        }

        # Smooth breakpoints at 3Mb
        for (i in 2:length(breakpoints)) {
            if (breakpoints[i] - breakpoints[i - 1] < 3*10^6) {
                # Merge breakpoints
                midpoint <- round((breakpoints[i] + breakpoints[i - 1]) / 2)
                breakpoints[i - 1] <- NA
                breakpoints[i] <- midpoint
            }
        }
        bp.smooth <- breakpoints[!is.na(breakpoints)]

        if (length(bp.smooth) < 3) {
            # Since we add the start/end of arm, there has to be at least 3
            # elements in 'breakpoints' to have LSTs
            return(0)
        }

        # Get the number of LSTs
        n.lst <- 0
        for (i in 2:(length(bp.smooth) - 1)) {
            prev.gt10m <- bp.smooth[i] - bp.smooth[i - 1] >= 10*10^6
            next.gt10m <- bp.smooth[i + 1] - bp.smooth[i] >= 10*10^6
            n.lst <- ifelse(prev.gt10m & next.gt10m, n.lst + 1, n.lst)
        }
        return(n.lst)
    })
    return(sum(unlist(vals)))
}


#' Compute the number HR deficiency-associated LOH regions.
#'
#' @details Procedure based on the paper from Abkevich et al., Br J Cancer 2012
#' (PMID: 23047548). All LOH segments larger than 15Mb but excluding chromosome
#' with a global LOH alteration (to compute with the \code{armlevel_alt}
#' function on LOH segments only). This score was linked to BRCA1/2-deficient
#' tumors.
#' Note that the function will merge overlapping or neighbor LOH segments (at a
#' distance of 1bp).
#'
#' @param segments A \code{GRanges} object containing the segments, their copy
#' number and copy number types.
#' @param arms.loh A list of arms with global/arm-level LOH alteration.
#' @param arms.hetloss A list of arms with global/arm-level heterozygous
#' losses.
#' @param kit.coverage A \code{GRanges} object containing the regions covered on
#'  each chromosome arm.
#'
#' @return An integer representing the number of HRD-LOH regions.
#' @export
#'
#' @examples
#' armlevel.loh <- armlevel_alt(get_loh_segments(segs.chas_example),
#'                              kit.coverage = oncoscan_na33.cov)
#' armlevel.hetloss <- armlevel_alt(get_hetloss_segments(segs.chas_example),
#'                              kit.coverage = oncoscan_na33.cov)
#' score_loh(segs.chas_example, names(armlevel.loh), names(armlevel.hetloss),
#' oncoscan_na33.cov)
score_loh <- function(segments, arms.loh, arms.hetloss, kit.coverage) {
    is_cn_segment(segments, raise_error = TRUE)
    stopifnot(is(kit.coverage, "GRanges"))

    if (length(segments) == 0) { return(0) }

    # Test if any arms in arms.loh or arms.loss is not in kit.coverage
    if (length(c(arms.loh, arms.hetloss)) > 0) {
        unknown.arms <- setdiff(c(arms.loh, arms.hetloss),
                                seqnames(kit.coverage))
        if (length(unknown.arms) > 0) {
            msg <- paste('Some arms were not found in the kit:',
                         paste(as.character(unknown.arms), collapse = TRUE))
            stop(msg)
        }
    }

    arms.to_ban <- c()
    kit.chroms <- as.character(seqnames(kit.coverage))
    chromnames <- unique(unlist(strsplit(kit.chroms, "[pq]")))

    # Get chromosomes with LOH on both arms
    if (length(arms.loh) > 0) {
        # Check on each chromosome if both arms are within the parameter
        # 'arms.loh'. Also account for the fact that a chromosome may have
        # only one arm covered by the kit.
        arms_found.to_ban <- lapply(chromnames, is_arm_to_ban,
                                    arms=arms.loh, kit=kit.coverage)
        arms.to_ban <- append(arms.to_ban, do.call("c", arms_found.to_ban))
    }
    # Get chromosomes with het loss on both arms
    if (length(arms.hetloss) > 0) {
        # Check on each chromosome if both arms are within the parameter
        # 'arms.loh'. Also account for the fact that a chromosome may have
        # only one arm covered by the kit.
        arms_found.to_ban <- lapply(chromnames, is_arm_to_ban,
                                    arms=arms.hetloss, kit=kit.coverage)
        arms.to_ban <- append(arms.to_ban, do.call("c", arms_found.to_ban))
    }
    # Get LOH segments larger than 15Mb and not in 'arms.to_ban'
    segs.loh <- c(get_loh_segments(segments), get_hetloss_segments(segments))
    segs.loh <- segs.loh[!(as.vector(seqnames(segs.loh)) %in% arms.to_ban)]
    segs.merged <- merge_segments(segs.loh, 1 / 1000)
    segs.loh_15m <- segs.merged[width(segs.merged) > 15*10^6]
    return(length(segs.loh_15m))
}


#' Helper function to detect if a chromosome is within a set of chromosomal arms
#'
#' @details Used in the score_loh function
#'
#' @param chrom String of a chromosome
#' @param arms list of chromosomal arms (strings)
#' @param kit A \code{GRanges} object containing the regions covered on
#'  each chromosome arm.
#'
#' @return A list of chromosomal arms (string)
#'
#' @noRd
is_arm_to_ban <- function(chrom, arms, kit) {
    chrom.arms <- paste0(chrom, c("p", "q"))

    arms.covered <- intersect(chrom.arms, as.vector(seqnames(kit)))

    arms.found <- unique(intersect(arms.covered, arms))
    if (length(arms.found) == length(arms.covered)) {
        return(as.character(arms.covered))
    }
}


#' Compute the number of large tandem duplication (TDplus).
#'
#' @details Procedure based on the paper from Popova et al., Cancer Res 2016
#' (PMID: 26787835). The TDplus score is defined as the number of regions larger
#'  than 1Mb but smaller or equal to 10Mb with a gain of one or two copies. This
#'  score was linked
#'  to CDK12-deficient tumors. They also identified as second category of tandem
#'   duplication whose size is smaller or equal than 1Mb and around 300Kb but
#'   could not link it to a phenotype. Note that due to its resolution the
#'   Oncoscan assay will most likely miss this second category. Nonetheless it
#'   is reported by the function.
#'
#' @param segments A \code{GRanges} object containing the segments, their copy
#' number and copy number types.
#'
#' @return A list of integer containing the TDplus score (\code{'TDplus'}) and
#' the small TD score (\code{'TD'}).
#' @export
#'
#' @examples
#' score_td(segs.chas_example)
score_td <- function(segments) {
    is_cn_segment(segments, raise_error = TRUE)

    if (length(segments) == 0) {
        return(list(TDplus = 0, TD = 0))
    }


    segs.gain <- segments[segments$cn.type == cntypes$Gain & segments$cn <= 4]
    segs.width <- width(segs.gain)
    segs.tdplus <- segs.gain[segs.width > 1*10^6 & segs.width <= 10*10^6]
    segs.td <- segs.gain[segs.width <= 1*10^6]

    return(list(TDplus = length(segs.tdplus), TD = length(segs.td)))
}



#' Compute the average copy number variation across the genome.
#'
#' @details Compute the weighted average (by segment length) of the copy number
#' variation. LOH segments and sexual chromosomes are excluded. Copy number
#' variation is rounded to the next level (1.67 -> 1 but 2.33 -> 3).
#'
#' @param segments A \code{GRanges} object containing the segments, their copy
#' number and copy number types.
#' @param kit.coverage A \code{GRanges} object containing the regions covered on
#'  each chromosome arm.
#'
#' @return A decimal value
#' @export
#'
#' @examples
#' score_avgcn(segs.chas_example, oncoscan_na33.cov)
score_avgcn <- function(segments, kit.coverage) {
    is_cn_segment(segments, raise_error = TRUE)
    stopifnot(is(kit.coverage, "GRanges"))

    autosomes <-
        as.vector(seqnames(kit.coverage)[!(as.vector(seqnames(kit.coverage))
                                           %in% c("Xp", "Xq", "Yp", "Yq"))])

    # Total number of bases (in Mbp) covered by the kit
    kitwidth.noXY <-
        sum(width(kit.coverage[as.vector(seqnames(kit.coverage)) %in%
                                   autosomes]) / 10^6)

    # Select segments on autosomes and with non-copy neutral alterations
    segs <-
        segments[as.vector(seqnames(segments)) %in% autosomes &
                     segments$cn.type %in%
                     c(cntypes$Gain, cntypes$Loss)]
    if (length(segs) == 0) {
        return(2)
    }

    segs.w <- width(segs) / 10^6  # Width of each segment in Mbp

    # Round the copy number in case of subclones
    cn <- segs$cn
    cn[cn < 2] <- floor(cn[cn < 2])
    cn[cn > 2] <- ceiling(cn[cn > 2])

    avgcn <-
        (sum(segs.w * cn) + 2 * (kitwidth.noXY - sum(segs.w))) / kitwidth.noXY
    return(avgcn)
}


#' Estimates the number of whole-genome doubling events (WGD).
#'
#' @details Based on the publication from Carter et al. (Nature Biotechnology
#' 2012; PubMed ID: 22544022).
#' On a pan-cancer cohort, they observed that tumors that underwent one
#' whole-genome doubling event had a ploidy (average copy number) between 2.2
#' and 3.4. This function relies on the function \code{score_avgcn} to compute
#' the ploidy.
#'
#' @param segments A \code{GRanges} object containing the segments, their copy
#' number and copy number types.
#' @param kit.coverage A \code{GRanges} object containing the regions covered on
#'  each chromosome arm.
#'
#' @return A named list with two values: WGD (whole-genome doubling events) and
#' avgCN (the average copy number). WGD values are 0 for no WGD event, 1 for one
#'  WGD event, 2 for several WGD events.
#' @export
#'
#' @examples
#' score_estwgd(segs.chas_example, oncoscan_na33.cov)
score_estwgd <- function(segments, kit.coverage) {
    is_cn_segment(segments, raise_error = TRUE)
    stopifnot(is(kit.coverage, "GRanges"))

    # Get the average copy number
    avgcn <- score_avgcn(segments, kit.coverage)

    # Estimate the number of WGD events
    wgd.est <- ifelse(avgcn > 3.4, 2, ifelse(avgcn > 2.2, 1, 0))

    return(c(WGD = wgd.est, avgCN = avgcn))
}


#' Compute the number of LSTs, normalized by the number of WGD events.
#'
#' @details Compute the number of LSTs in non-LOH segments via the
#' \code{score_lst} function and subtract the extra noise induced by WGD events:
#'  nLST = LST - 7*W/2 where W is the number of WGD events.
#' A sample is HRD positive (deficient in HR pathway) if nLST is greater or
#' equal to the threshold (15 by default).
#' This score was linked to BRCA1/2-deficient tumors.
#'
#' @param segments A \code{GRanges} object containing the segments, their copy
#' number and copy number types.
#' @param n.wgd Number of whole-genome doubling events (0 if diploid).
#' @param kit.coverage A \code{GRanges} object containing the regions covered on
#'  each chromosome arm.
#' @param threshold A number above which the test is returned positive (>=).
#'
#' @return A named list with the number of nLSTs and the corresponding label
#' ('Positive', 'Negative').
#' @export
#'
#' @examples
#' w <- score_estwgd(segs.chas_example, oncoscan_na33.cov)
#' score_nlst(segs.chas_example, w['WGD'], oncoscan_na33.cov)
score_nlst <-
    function(segments, n.wgd, kit.coverage, threshold = 15) {
        is_cn_segment(segments, raise_error = TRUE)
        stopifnot(is(kit.coverage, "GRanges"))

        lst.noLOH <-
            score_lst(segments[segments$cn.type != cntypes$LOH], kit.coverage)

        nlst <- max(0, lst.noLOH - 3.5 * as.numeric(n.wgd))
        label <- ifelse(nlst >= threshold, "Positive", "Negative")
        return(c(nLST = nlst, HRD = label))
    }


#' Computes the total number of Mbp altered.
#'
#' @param segments A \code{GRanges} object containing the segments, their copy
#' number and copy number types.
#' @param kit.coverage A \code{GRanges} object containing the regions covered on
#'  each chromosome arm.
#' @param loh.rm A boolean (TRUE by default) to indicate whether LOH segments
#' should be excluded.
#'
#' @return A named list representing the Mbp altered in the sample and the total
#'  Mbp of the kit.
#' @export
#'
#' @examples
#' score_mbalt(segs.chas_example, oncoscan_na33.cov)
#' score_mbalt(segs.chas_example, oncoscan_na33.cov, FALSE)
score_mbalt <- function(segments, kit.coverage, loh.rm = TRUE) {
    is_cn_segment(segments, raise_error = TRUE)
    stopifnot(is(kit.coverage, "GRanges"))

    # Compute the number Mbp present in the whole kit
    mb.kit <- round(sum(width(kit.coverage) / 10^6))

    # Exclude (or not) the LOH segments
    segs <- segments
    if (loh.rm) {
        segs <- segments[segments$cn.type != cntypes$LOH]
    }

    if (length(segs) == 0) {
        return(c(sample = 0, kit = mb.kit))
    } else {
        segs.w <- width(reduce(segs)) / 10^6  # Width of each segment in Mbp
        mb.alt <- round(sum(segs.w))

        return(c(sample = mb.alt, kit = mb.kit))
    }
}


#' Compute the genomic LOH score.
#'
#' @details The percentage genomic LOH score is computed as described in the
#' FoundationFocus CDx BRCA LOH assay; i.e. the percentage of bases covered by
#' the Oncoscan that display a loss of heterozygosity independently of the
#' number of copies, excluding chromosomal arms that have a global LOH (>=90% of
#' arm length).
#' To compute with the \code{armlevel_alt} function on LOH segments only).
#' This score was linked to BRCA1/2-deficient tumors.
#'
#' @param segments A \code{GRanges} object containing the segments, their copy
#' number and copy number types.
#' @param arms.loh A list of arms with global/arm-level LOH alteration.
#' @param arms.hetloss A list of arms with global/arm-level heterozygous
#' loss.
#' @param kit.coverage A \code{GRanges} object containing the regions covered on
#'  each chromosome arm.
#'
#' @return An integer representing the percentage of LOH bases.
#' @export
#'
#' @examples
#' armlevel.loh <- armlevel_alt(get_loh_segments(segs.chas_example),
#'                              kit.coverage = oncoscan_na33.cov)
#' armlevel.hetloss <- armlevel_alt(get_hetloss_segments(segs.chas_example),
#'                              kit.coverage = oncoscan_na33.cov)
#' score_gloh(segs.chas_example, names(armlevel.loh), names(armlevel.hetloss),
#' oncoscan_na33.cov)
score_gloh <- function(segments, arms.loh, arms.hetloss, kit.coverage) {
    is_cn_segment(segments, raise_error = TRUE)
    stopifnot(is(kit.coverage, "GRanges"))

    # Test if any arms in arms.loh or arms.loss is not in kit.coverage
    if (length(c(arms.loh, arms.hetloss)) > 0) {
        unknown.arms <- setdiff(c(arms.loh, arms.hetloss),
                                seqnames(kit.coverage))
        if (length(unknown.arms) > 0) {
            msg <- paste('Some arms were not found in the kit:',
                         paste(as.character(unknown.arms), collapse = TRUE))
            stop(msg)
        }
    }

    arms.to_ban <- c(arms.loh, arms.hetloss)

    segs.loh <- c(get_loh_segments(segments), get_hetloss_segments(segments))
    segs.loh <- segs.loh[!(as.vector(seqnames(segs.loh)) %in% arms.to_ban)]

    width.loh <- width(segs.loh)
    return(sum(width.loh) / sum(width(kit.coverage)))
}
yannchristinat/oncoscanR documentation built on Oct. 27, 2023, 11:19 p.m.