R/sitednds.R

Defines functions sitednds

Documented in sitednds

#' sitednds
#'
#' Function to estimate site-wise dN/dS values and p-values against neutrality. To generate a valid input object for this function, use outmats=T when running dndscv. This function is in testing, please interpret the results with caution. Also note that recurrent artefacts or SNP contamination can violate the null model and dominate the list of sites under apparent selection. A considerable number of significant synonymous sites may reflect a problem with the data. Be very critical of the results and if suspicious sites appear recurrently mutated consider refining the variant calling (e.g. using a better unmatched normal panel). In the future, this function may be extended to perform inferences at a codon level instead of at a single-base level.
#'
#' @author Inigo Martincorena (Wellcome Sanger Institute)
#'
#' @param dndsout Output object from dndscv. To generate a valid input object for this function, use outmats=T when running dndscv.
#' @param min_recurr Minimum number of mutations per site to estimate site-wise dN/dS ratios. [default=2]
#' @param gene_list List of genes to restrict the p-value and q-value calculations (Restricted Hypothesis Testing). Note that q-values are only valid if the list of genes is decided a priori. [default=NULL, sitednds will be run on all genes in dndsout]
#' @param site_list List of hotspot sites to restrict the p-value and q-value calculations (Restricted Hypothesis Testing). Note that q-values are only valid if the list of sites is decided a priori. [default=NULL, sitednds will be run on all genes in dndsout]
#' @param trinuc_list List of trinucleotide substitution to restrict the analysis of sitednds. This is used to estimate separate overdispersion parameters for different substitution contexts [default=NULL, sitednds will be run on all substitution contexts]
#' @param theta_option 2 options: "mle" (uses the MLE of the overdispersion parameter) or "conservative" (uses the conservative bound of the CI95). Values other than "mle" will lead to the conservative option [default="conservative"]
#' @param syn_drivers Vector with a list of known synonymous driver mutations to exclude from the background model [default="TP53:T125T"]. See Martincorena et al., Cell, 2017 (PMID:29056346).
#' @param method Overdispersion model: NB = Negative Binomial (Gamma-Poisson), LNP = Poisson-Lognormal (see Hess et al., BiorXiv, 2019). [default="NB"]
#' @param numbins Number of bins to discretise the rvec vector [default=1e4]. This enables fast execution of the LNP model in datasets of arbitrarily any size.
#' @param kc List of a-priori known cancer genes (to be excluded when fitting the background model)
#'
#' @return 'sitednds' returns a table of recurrently mutated sites and the estimates of the size parameter:
#' @return - recursites: Table of recurrently mutated sites with site-wise dN/dS values and p-values
#' @return - overdisp: Maximum likelihood estimate and CI95% for the overdispersion parameter (the size parameter of the negative binomial distribution or the sigma parameter of the lognormal distribution). The lower the size value or the higher the sigma value the higher the variation of the mutation rate across sites not captured by the trinucleotide change or by variation across genes.
#' @return - fpr_nonsyn_q05: Fraction of the significant non-synonymous sites (qval<0.05) that are estimated to be false positives. This assumes that all synonymous mutations (except those in TP53 and CDKN2A) are false positives, thus offering a conservative estimate of the false positive rate.
#' @return - LL: Log-likelihood of the fit of the overdispersed model (see "method" argument) to all synonymous sites.
#'
#' @export

sitednds = function(dndsout, min_recurr = 2, gene_list = NULL, site_list = NULL, trinuc_list = NULL, theta_option = "conservative", syn_drivers = "TP53:T125T", method = "NB", numbins = 1e4, kc = "cgc81") {

    ## 1. Fitting a negative binomial distribution at the site level considering the background mutation rate of the gene and of each trinucleotide
    message("[1] Site-wise overdispersed model accounting for trinucleotides and relative gene mutability...")

    # N and L matrices for synonymous mutations
    if (length(dndsout$N)==0) { stop(sprintf("Invalid input: dndsout must be generated using outmats=T in dndscv.")) }
    if (nrow(dndsout$mle_submodel)!=195) { stop("Invalid input: dndsout must be generated using the default trinucleotide substitution model in dndscv.") }

    # Restricting the analysis to an input list of genes
    if (!is.null(gene_list)) {
        g = as.vector(dndsout$genemuts$gene_name)
        # Correcting CDKN2A if required (hg19)
        if (any(g %in% c("CDKN2A.p14arf","CDKN2A.p16INK4a")) & any(gene_list=="CDKN2A")) {
            gene_list = unique(c(setdiff(gene_list,"CDKN2A"),"CDKN2A.p14arf","CDKN2A.p16INK4a"))
        }
        nonex = gene_list[!(gene_list %in% g)]
        if (length(nonex)>0) {
            warning(sprintf("The following input gene names are not in dndsout input object and will not be analysed: %s.", paste(nonex,collapse=", ")))
        }
        numtests = sum(dndsout$L[,,which(g %in% gene_list)])
    } else {
        numtests = sum(dndsout$L)
    }

    # Input: known cancer genes to exclude from the background model fitting (the user can input a gene list as a character vector)
    if (is.null(kc)) {
        known_cancergenes = ""
    } else if (kc[1] %in% c("cgc81")) {
        data(list=sprintf("cancergenes_%s",kc), package="dndscv")
    } else {
        known_cancergenes = kc
    }

    # L matrix
    L = dndsout$L
    L[,2:4,which(as.vector(dndsout$genemuts$gene_name) %in% known_cancergenes)] = 0 # Removing non-synonymous sites in known_cancergenes
    L = apply(L, c(1,3), sum) # Total number of sites to be considered in the background model


    # Counts of observed mutations

    annotsubs = dndsout$annotmuts[which(dndsout$annotmuts$impact %in% c("Synonymous","Missense","Nonsense","Essential_Splice")),]
    num_syn_drivers_masked = sum(paste(annotsubs$gene,annotsubs$aachange,sep=":") %in% syn_drivers)

    if (!is.null(trinuc_list)) { # Restricting sitednds to certain trinucleotide changes
        annotsubs = annotsubs[which(paste(annotsubs$ref3_cod,annotsubs$mut3_cod,sep=">") %in% trinuc_list), ]
        if (nrow(annotsubs)==0) { stop("No mutations left after restricting by trinuc_list. Please review your input arguments and your mutation table (dndsout$annotmuts).") }
    }

    annotsubs$trisub = paste(annotsubs$chr,annotsubs$pos,annotsubs$ref,annotsubs$mut,annotsubs$gene,annotsubs$aachange,annotsubs$impact,annotsubs$ref3_cod,annotsubs$mut3_cod,sep=":")
    annotsubs = annotsubs[which(annotsubs$ref!=annotsubs$mut),]
    freqs = sort(table(annotsubs$trisub), decreasing=T)

    # Relative mutation rate per gene
    # Note that this assumes that the gene order in genemuts has not been altered with respect to the N and L matrices, as it is currently the case in dndscv
    relmr = dndsout$genemuts$exp_syn_cv/dndsout$genemuts$exp_syn
    names(relmr) = dndsout$genemuts$gene_name

    # Substitution rates (192 trinucleotide rates, strand-specific)
    sm = setNames(dndsout$mle_submodel$mle, dndsout$mle_submodel$name)
    sm["TTT>TGT"] = 1 # Adding the TTT>TGT rate (which is arbitrarily set to 1 relative to t)
    sm = sm*sm["t"] # Absolute rates
    sm = sm[setdiff(names(sm),c("wmis","wnon","wspl","t"))] # Removing selection parameters
    sm = sm[order(names(sm))] # Sorting

    if (!is.null(trinuc_list)) { # Restricting sitednds to certain trinucleotide changes
        sm[!(names(sm) %in% trinuc_list)] = 0 # Setting all other rates to zero
    }

    mat_trisub = array(sm, dim=c(192,nrow(dndsout$genemuts))) # Relative mutation rates by trinucleotide
    mat_relmr = t(array(relmr, dim=c(nrow(dndsout$genemuts),192))) # Relative mutation rates by gene
    R = mat_trisub * mat_relmr # Expected rate for each mutation type in each gene

    # Expanded vectors: full vectors of observed and expected mutations per site across all sites considered in dndsout
    rvec = rep(R, times=L) # Expanded vector of expected mutation counts per site
    nvec = array(0, length(rvec)) # Initialising the vector with observed mutation counts per site

    mutsites = read.table(text=names(freqs), header=0, sep=":", stringsAsFactors=F) # Frequency table of mutations
    colnames(mutsites) = c("chr","pos","ref","mut","gene","aachange","impact","ref3_cod","mut3_cod")
    mutsites$freq = freqs
    trindex = setNames(1:192, names(sm))
    geneindex = setNames(1:length(names(relmr)), names(relmr))
    mutsites$trindex = trindex[paste(mutsites$ref3_cod, mutsites$mut3_cod, sep=">")]
    mutsites$geneindex = geneindex[mutsites$gene]

    # Mutations for the background model (excluding non-synonymous mutations in known_cancergenes)
    synsites = mutsites[!(mutsites$impact!="Synonymous" & mutsites$gene %in% known_cancergenes),]
    synsites = synsites[!(paste(synsites$gene,synsites$aachange,sep=":") %in% syn_drivers),]
    Lcum = array(cumsum(L), dim=dim(L)) # Cumulative L indicating the position to place a given mutation in the nvec following rvec
    synsites$vecindex = apply(as.matrix(synsites[,c("trindex","geneindex")]), 1, function(x) Lcum[x[1], x[2]]) # Index for the mutation
    synsites = synsites[order(synsites$vecindex), ] # Sorting by index in the nvec

    # Stop execution with an error if there are no synonymous mutations
    if (nrow(synsites)<2) {
        stop("Too few synonymous mutations found in the input. sitednds cannot run without synonymous mutations.")
    }

    # Correcting the index when there are multiple synonymous mutations in the same gene and trinucleotide class
    s = snew = synsites$vecindex
    sameind = 0
    for (j in 2:nrow(synsites)) {
        if (s[j]<=s[j-1]) {
            sameind = sameind + 1 # Annotating a run of elements
            snew[j] = s[j-1] - sameind # We assign it an earlier position in the vector
        } else {
            sameind = 0
        }
    }
    synsites$vecindex2 = snew

    nvec[synsites$vecindex2] = synsites$freq # Expanded nvec for the negative binomial regression
    rvec = rvec * (sum(dndsout$genemuts$n_syn)-num_syn_drivers_masked) / sum(dndsout$genemuts$exp_syn_cv) # Minor correction ensuring that global observed and expected rates are identical (this works after subsetting substitutions)

    # Estimation of overdispersion: Using optimize appears to yield reliable results. Problems experienced with fitdistr, glm.nb and theta.ml. Consider using grid search if problems appear with optimize.
    if (method=="LNP") { # Modelling rates per site with a Poisson-Lognormal mixture

        lnp_est = fitlnpbin(nvec, rvec, theta_option = theta_option, numbins = numbins)
        theta_ml = lnp_est$ml$minimum
        theta_ci95 = lnp_est$sig_ci95
        LL = -lnp_est$ml$objective # LogLik
        thetaout = setNames(c(theta_ml, theta_ci95), c("MLE","CI95_high"))

    } else { # Modelling rates per site as negative binomially distributed (i.e. quantifying uncertainty above Poisson using a Gamma)

        nbin = function(theta, n=nvec, r=rvec) { -sum(dnbinom(x=n, mu=r, log=T, size=theta)) } # nbin loglik function for optimisation
        ml = optimize(nbin, interval=c(0,1000))
        theta_ml = ml$minimum
        LL = -ml$objective # LogLik

        # CI95% for theta using profile likelihood and iterative grid search (this yields slightly conservative CI95)
        grid_proflik = function(bins=5, iter=5) {
            for (j in 1:iter) {
                if (j==1) {
                    thetavec = sort(c(0, 10^seq(-3,3,length.out=bins), theta_ml, theta_ml*10, 1e4)) # Initial vals
                } else {
                    thetavec = sort(c(seq(thetavec[ind[1]], thetavec[ind[1]+1], length.out=bins), seq(thetavec[ind[2]-1], thetavec[ind[2]], length.out=bins))) # Refining previous iteration
                }

                proflik = sapply(thetavec, function(theta) -sum(dnbinom(x=nvec, mu=rvec, size=theta, log=T))-ml$objective) < qchisq(.95,1)/2 # Values of theta within CI95%
                ind = c(which(proflik[1:(length(proflik)-1)]==F & proflik[2:length(proflik)]==T)[1],
                        which(proflik[1:(length(proflik)-1)]==T & proflik[2:length(proflik)]==F)[1]+1)
                if (is.na(ind[1])) { ind[1] = 1 }
                if (is.na(ind[2])) { ind[2] = length(thetavec) }
            }
            return(thetavec[ind])
        }
        theta_ci95 = grid_proflik(bins=5, iter=5)
        thetaout = setNames(c(theta_ml, theta_ci95), c("MLE","CI95low","CI95_high"))
    }


    ## 2. Calculating site-wise dN/dS ratios and P-values for recurrently mutated sites
    message("[2] Calculating site-wise dN/dS ratios and p-values...")

    # Theta option
    if (theta_option=="mle" | theta_option=="MLE") {
        theta = theta_ml
    } else { # Conservative
        message("    Using the conservative bound of the confidence interval of the overdispersion parameter.")
        theta = theta_ci95[1]
    }

    # Creating the recursites object
    recursites = mutsites[, c("chr","pos","ref","mut","gene","aachange","impact","ref3_cod","mut3_cod","freq")]
    recursites$mu = relmr[recursites$gene] * sm[paste(recursites$ref3_cod,recursites$mut3_cod,sep=">")]

    # Gene RHT
    if (!is.null(gene_list)) {
        message("    Peforming Restricted Hypothesis Testing on the input list of a-priori genes")
        recursites = recursites[which(recursites$gene %in% gene_list), ] # Restricting the p-value and q-value calculations to gene_list
    }

    # Site RHT
    if (!is.null(site_list)) {
        message("    Peforming Restricted Hypothesis Testing on the input list of a-priori sites (numtests = length(site_list))")
        mutstr = paste(recursites$chr,recursites$pos,recursites$ref,recursites$mut,recursites$gene,recursites$aachange,recursites$ref3_cod,recursites$mut3_cod,sep=":")
        if (!any(mutstr %in% site_list)) {
            stop("No mutation was observed in the restricted list of known hotspots. Site-RHT cannot be run.")
        }
        recursites = recursites[which(mutstr %in% site_list), ] # Restricting the p-value and q-value calculations to site_list
        numtests = length(site_list)

        # Calculating global dN/dS ratios at known hotspots
        auxsites = as.data.frame(do.call("rbind",strsplit(site_list,split=":")), stringsAsFactors=F)
        auxsites = auxsites[auxsites$V5 %in% names(relmr), ]
        neutralexp = sum(relmr[auxsites$V5]*sm[paste(auxsites$V7,auxsites$V8,sep=">")]) # Number of mutations expected at known hotspots under neutrality
        numobs = sum(recursites$freq) # Number observed
        poistest = poisson.test(numobs, T=neutralexp)
        globaldnds_knownsites = setNames(c(numobs, neutralexp, poistest$estimate, poistest$conf.int), c("obs","exp","dnds","cilow","cihigh"))
        message(sprintf("    Mutations at known hotspots: %0.0f observed, %0.3g expected, obs/exp~%0.3g (CI95:%0.3g,%0.3g).", globaldnds_knownsites[1], globaldnds_knownsites[2], globaldnds_knownsites[3], globaldnds_knownsites[4], globaldnds_knownsites[5]))
    }

    # Restricting the recursites output by min_recurr
    recursites = recursites[recursites$freq>=min_recurr, ] # Restricting the output to sites with min_recurr

    if (nrow(recursites)>0) {

        recursites$dnds = recursites$freq / recursites$mu # Site-wise dN/dS (point estimate)

        if (method=="LNP") { # Modelling rates per site with a Poisson-Lognormal mixture

            # Cumulative Lognormal-Poisson using poilog::dpoilog
            dpoilog = poilog::dpoilog
            ppoilog = function(n, mu, sig) {
                p = sum(dpoilog(n=floor(n+1):floor(n*10+1000), mu=log(mu)-sig^2/2, sig=sig))
                return(p)
            }

            message(sprintf("    Modelling substitution rates using a Lognormal-Poisson: sig = %0.3g (upperbound = %0.3g)", theta_ml, theta_ci95))
            recursites$pval = apply(recursites, 1, function(x) ppoilog(n=as.numeric(x["freq"])-0.5, mu=as.numeric(x["mu"]), sig=theta))

        } else { # Negative binomial model

            message(sprintf("    Modelling substitution rates using a Negative Binomial: theta = %0.3g (CI95:%0.3g,%0.3g)", theta_ml, theta_ci95[1], theta_ci95[2]))
            recursites$pval = pnbinom(q=recursites$freq-0.5, mu=recursites$mu, size=theta, lower.tail=F)
        }

        recursites = recursites[order(recursites$pval, -recursites$freq), ] # Sorting by p-val and frequency
        recursites$qval = p.adjust(recursites$pval, method="BH", n=numtests) # P-value adjustment for all possible changes
        rownames(recursites) = NULL

        # Estimating False Positive Rates based on the observed number of significant synonymous hits

        qcutoff = 0.05 # q-value cutoff to estimate false positive rates
        if (any(recursites$qval<qcutoff)) {

            signifsites = recursites[recursites$qval<qcutoff, ]
            obs_hits = c(sum(signifsites$impact=="Synonymous" & !(signifsites$gene %in% c("TP53","CDKN2A","CDKN2A.p14arf"))), sum(signifsites$impact!="Synonymous"))
            exp_frac = c(sum(dndsout$genemuts$exp_syn), sum(dndsout$genemuts$exp_mis+dndsout$genemuts$exp_non+dndsout$genemuts$exp_spl))
            obsexp = (obs_hits[2]/obs_hits[1])/(exp_frac[2]/exp_frac[1])
            fpr_nonsyn = list()
            fpr_nonsyn$estimate = 1 / pmax(1,obsexp) # i.e. 1 - driver_fraction
            fpr_nonsyn$conf.int = rev(1 / pmax(1,as.vector(poisson.test(x=obs_hits[2:1], T=exp_frac[2:1])$conf.int)))

            if (fpr_nonsyn$estimate>qcutoff) {
                warning(sprintf("The estimated false positive rate for nonsynonymous hits (qval<0.05) is %0.3f (CI95:%0.3f,%0.3f). High false positive rates (>>0.05) evidence problems with the data or the model and mean that the results are not reliable.", fpr_nonsyn$estimate, fpr_nonsyn$conf.int[1], fpr_nonsyn$conf.int[2]))
            }

            if (!is.null(site_list)) { # We do not compute FPRs when restricting the analysis to a list of a priori hotspots
                fpr_nonsyn = NULL
            }

        } else {
            fpr_nonsyn = NULL
        }

    } else {
        recursites = fpr_nonsyn = lnp_est = NULL
        warning("No site was found with the minimum recurrence requested [default min_recurr=2]")
    }

    if (is.null(site_list)) {
        return(list(recursites=recursites, overdisp=thetaout, fpr_nonsyn_q05=fpr_nonsyn, LL=LL))
    } else {
        return(list(recursites=recursites, overdisp=thetaout, fpr_nonsyn_q05=fpr_nonsyn, LL=LL, globaldnds_knownsites=globaldnds_knownsites))
    }
}
im3sanger/dndscv documentation built on Oct. 1, 2023, 1:05 p.m.