R/estimate_noise.R
In BEARscc: BEARscc (Bayesian ERCC Assesstment of Robustness of Single Cell Clusters)

Documented in apply_bayes build_dropoutmodel cleanup_model_names compute_alpha compute_genewise_dropouts counts2mpc create_null_dropout_model estimate_missingdata estimate_mu2sigma estimate_noiseparameters estimate_undetected2molpercell iterate_alphas iterate_spikeins melt_spikeins plot_alpha2mu plot_mu2sigma plot_obs2actual plot_spikein_fits prepare_data sample_models subcompute_sample_models subplot_spikein_fits write_noise_model

#! R script
# David T Severson
# Scope: Function in BEARscc used to estimate parameters
    # and dropouts from ERCC spike-ins.


################### INTERNAL FUNCTIONS #######################################


#annotate spike-in counts with actual transcript counts
prepare_data<-function(spike_counts.df,spike_conc.df){
    spike_counts.df$transcripts<-spike_conc.df[rownames(spike_counts.df),1]
    spike_counts.df
}

#melt and format spike-in counts
melt_spikeins<-function(prepared_spikeins){
    value<-V1<-NULL
    melted_spikeins.dt<-melt(data.table(prepared_spikeins,
        keep.rownames = TRUE), id.vars = c("rn", "transcripts"))
    detected_spikeins.dt<-melted_spikeins.dt[,mean(value)>0, by="rn"]
    setkey(detected_spikeins.dt, "rn")
    setkey(melted_spikeins.dt,"rn")
    melted_spikeins.dt<-detected_spikeins.dt[melted_spikeins.dt,
        ][V1==TRUE,][,-2,with=FALSE]
    colnames(melted_spikeins.dt)<-c("spike-in", "transcripts" , "sample",
        "counts")
    melted_spikeins.dt
}

#plot mean to std dev correlation
plot_mu2sigma<-function(spikein_stats.dt, mu2sigma.fit, file, sd_inflate){
    V1<-NULL
    g<-ggplot(data=spikein_stats.dt,aes(x=log2(mean), y=log2(V1)))+
        geom_point()+xlab("\nMean spike-in expression / log2")+
        geom_abline(slope = coef(mu2sigma.fit)[2],
        intercept = coef(mu2sigma.fit)[1])+
        ylab("Standard deviation of spike-in expression / log2\n")+
    ggtitle("Relationship between standard deviation
        and mean of spike in measurements\n")+
    geom_abline(slope=coef(mu2sigma.fit)[2],color="red",
        intercept=coef(mu2sigma.fit)[1]+
        sd_inflate*max(mu2sigma.fit$residuals))
    ggsave(paste(file,"mu2sigma.pdf", sep="_"), plot=g, device="pdf",
        width=8, height=6, units = "in")
}

#estimate mean to std dev correlation by linear regression
estimate_mu2sigma<-function(melted_spikeins.dt, plot, sd_inflate, file=file){
    V1<-value<-counts<-NULL
    spikein_stats.dt<-melted_spikeins.dt[,sd(counts), by=c("spike-in",
        "transcripts")]
    spikein_stats.dt$mean<-melted_spikeins.dt[,mean(counts),
        by=c("spike-in", "transcripts")]$V1
    mu2sigma.fit<-lm(log2(spikein_stats.dt$V1)~log2(spikein_stats.dt$mean))
    mu2sigma<-data.frame(mu2sigma.slope=coef(mu2sigma.fit)[2],
        mu2sigma.intercept=coef(mu2sigma.fit)[1],
        max.res=max(mu2sigma.fit$residuals))
    if (plot==TRUE){
        plot_mu2sigma(spikein_stats.dt, mu2sigma.fit, file=file,
            sd_inflate=sd_inflate)
    }
    mu2sigma
}

#plot actual spike-in transcript number vs observed spike-in counts
plot_obs2actual<-function(melted_spikeins.dt, file){
    transcripts<-counts<-NULL
    g<-ggplot(data=melted_spikeins.dt[transcripts>0,],
        aes(x=factor(round(log10(transcripts+0.25), digits=3)),
        y=log2(counts+0.25)))+geom_violin(scale = "width")+
    xlab("\nSpike-in molecules per cell / log10")+
    ylab("Measured counts / log2\n")+
    ggtitle("The count measurement distribution as a
        function of actual transcript number.\n")
    ggsave(paste(file,"_countsdistribution_vs_moleculespercell.pdf", sep="_"),
        plot=g, device="pdf", width=8, height=6, units = "in")
}

#subfunction to compute alpha_i for each spike-in
iterate_spikeins<-function(spikein_name, alpha, melted_spikeins.dt,
mean_spikeins.dt, max_cumprob, bins, mu2sigma, sd_inflate){
    `spike-in`<-NULL
    melted_1spikein.dt<-melted_spikeins.dt[`spike-in`==spikein_name]
    onespikein_samples<-dim(melted_1spikein.dt)[1]
    mean_1spikein<-mean_spikeins.dt[`spike-in`==spikein_name]$mean
    top_probable_count<-max(qnbinom(max_cumprob,
        size=ceiling(mean_1spikein)^2/((2^(mu2sigma[1,1]*
        log2(ceiling(mean_1spikein))+mu2sigma[1,2]+
        sd_inflate*mu2sigma[1,3]))^2-ceiling(mean_1spikein)),
        mu=ceiling(mean_1spikein)), qpois(max_cumprob,
        ceiling(mean_1spikein)), melted_1spikein.dt$counts+1,
        na.rm = TRUE)
    event_space<-seq(from=0, to=top_probable_count, by=1)
    neg_binom.density <- dnbinom(event_space,
        size=ceiling(mean_1spikein)^2/((2^(mu2sigma[1,1]*
        log2(ceiling(mean_1spikein))+mu2sigma[1,2]+
        sd_inflate*mu2sigma[1,3]))^2-round(mean_1spikein)),
        mu=ceiling(mean_1spikein))
    poisson.density <- dpois(event_space,ceiling(mean_1spikein))
    mixed_model.density<-alpha*neg_binom.density+(1-alpha)*poisson.density
    event_space2<-seq(from=0, to=top_probable_count,
        by=ceiling(top_probable_count/bins))
    observed.hist<-hist(melted_1spikein.dt$counts, breaks=event_space,
        include.lowest = TRUE, plot=FALSE)
    observed.density<-observed.hist$density
    observed.density<-diff(c(0,cumsum(observed.density)[event_space2]))
    mixed_model.density<-diff(c(0, cumsum(mixed_model.density)[event_space2]))
    error<-sum(abs(mixed_model.density-observed.density))
    data.frame(`spike-in`=as.character(spikein_name),mean=mean_1spikein,
    alpha=alpha, alpha_inv=1-alpha, error=error)
}

#subfunction to compute alpha_i for each spike-in
iterate_alphas<-function(alpha, spikein_names, melted_spikeins.dt,
mean_spikeins.dt, max_cumprob, bins, mu2sigma, sd_inflate){
    if (alpha %in% seq(from=0, to=1, by=0.25)){
        print(paste("Estimating error for spike-ins with alpha = ", alpha,
            sep=""))
    }
    alpha_table.df<-do.call(rbind, lapply(spikein_names, `iterate_spikeins`,
        alpha, melted_spikeins.dt, mean_spikeins.dt, max_cumprob, bins,
        mu2sigma, sd_inflate))
    alpha_table.df
}

#plots the alpha paramter as function of mean spike-in expression
plot_alpha2mu<-function(final_alpha.dt, alpha2mean, file){
    g<-ggplot(data=final_alpha.dt, aes(x=log2(mean),y=alpha))+
        geom_point(fill="red",color="black", size=2.2,alpha=0.6,  pch=21)+
        xlab("\nObserved mean spike-in expression, log2(expression)")+
        ylab("Neg. binomial contribution parameter, alpha\n")+ggtitle("The Neg.
        binomial contribution is a function of gene expression.\n")+
        geom_abline(slope=coef(alpha2mean)[2], intercept = coef(alpha2mean)[1])
    ggsave(paste(file,"alpha2mu_correlation.pdf",sep="_"),
        plot=g, device="pdf", width=8, height=6, units = "in")
}

#compute alpha parameter empirically for mixed-model
compute_alpha<-function(melted_spikeins.dt,estimate_mu2sigma, bins,
tie_function, max_cumprob, sd_inflate, plot, file,
alpha_resolution=alpha_resolution){
    transcripts<-value<-rn<-Mean<-r.value<-p.nbinom<-NULL
    counts<-error<-`spike-in`<-min_error<-NULL
    mu2sigma<-estimate_mu2sigma(melted_spikeins.dt, plot, sd_inflate, file)
    if (plot==TRUE){
        plot_obs2actual(melted_spikeins.dt, file=file)
    }
    mean_spikeins.dt<-melted_spikeins.dt[,mean(counts),
        by=c("spike-in", "transcripts")]
    setkey(mean_spikeins.dt, 'spike-in')
    colnames(mean_spikeins.dt)<-c('spike-in', "transcripts", "mean")
    spikein_names<-as.character(mean_spikeins.dt$'spike-in')
    setkey(mean_spikeins.dt, 'spike-in')
    alpha_vector<-seq(from=0, to=1, by=alpha_resolution)
    alpha_table.dt<-data.table(do.call(rbind, lapply(alpha_vector,
        `iterate_alphas`, spikein_names, melted_spikeins.dt,
        mean_spikeins.dt, max_cumprob, bins, mu2sigma, sd_inflate)))
    colnames(alpha_table.dt)<-c("spike-in", "mean", "alpha", "alpha_inv",
        "error")
    min_error.dt<-alpha_table.dt[,min(error),by=`spike-in`]
    colnames(min_error.dt)<-c("spike-in", "min_error")
    setkey(min_error.dt, `spike-in`)
    setkey(alpha_table.dt, `spike-in`)
    alpha_table.dt<-alpha_table.dt[min_error.dt]
    if (tie_function=="minimum") {
        final_alpha.dt<-alpha_table.dt[error==min_error, min(alpha),
        by=c("spike-in", "mean")]
    }
    else {
        final_alpha.dt<-alpha_table.dt[error==min_error, min(alpha),
        by=c("spike-in", "mean")]
    }
    colnames(final_alpha.dt)<-c("spike-in", "mean", "alpha")
    alpha2mean<-lm(final_alpha.dt$alpha~log2(final_alpha.dt$mean))
    if (plot==TRUE){
        plot_alpha2mu(final_alpha.dt, alpha2mean, file=file)
    }
    alpha2mu<-data.frame(alpha2mu.slope=coef(alpha2mean)[2],
        alpha2mu.intercept=coef(alpha2mean)[1], sd.inflate=sd_inflate)
    model_parameters<-data.frame(cbind(mu2sigma,alpha2mu),
        row.names=c("parameter.value"))
    model_parameters
}

#spike-in wise model computation
subcompute_sample_models<-function(spikein_name, melted_spikeins.dt,
model_parameters){
    `spike-in`<-NULL
    one_spikein.dt<-melted_spikeins.dt[`spike-in`==spikein_name]
    transcript_conc<-melted_spikeins.dt[`spike-in`==spikein_name]$transcripts
    spikenamerepeats<-melted_spikeins.dt[`spike-in`==spikein_name]$`spike-in`
    sample_number<-length(one_spikein.dt$counts)
    mean_1spikein<-mean(one_spikein.dt$counts)
    alpha<-model_parameters$alpha2mu.slope*log2(mean_1spikein)+
        model_parameters$alpha2mu.intercept
    if (alpha>1) {alpha<-1}
    if (alpha<0) {alpha<-0}
    neg_binom.sample <- rnbinom(sample_number, size = ceiling(mean_1spikein
        )^2/((2^(model_parameters$mu2sigma.slope*log2(ceiling(mean_1spikein))+
        model_parameters$mu2sigma.intercept+model_parameters$max.res*
        model_parameters$sd.inflate))^2-ceiling(mean_1spikein)),
        mu = ceiling(mean_1spikein))
    poisson.sample <- rpois(sample_number, mean_1spikein)
    neg_binom_mixed.sample <- rnbinom(ceiling(sample_number*alpha),
        size = ceiling(mean_1spikein)^2/((2^(model_parameters$mu2sigma.slope*
        log2(ceiling(mean_1spikein))+model_parameters$mu2sigma.intercept+
        model_parameters$max.res*model_parameters$sd.inflate
        ))^2-ceiling(mean_1spikein)), mu = ceiling(mean_1spikein))
    poisson_mixed.sample <- rpois(ceiling(sample_number*(1-alpha)),
        mean_1spikein)
    mixed_model.sample<-sample(append(neg_binom_mixed.sample,
        poisson_mixed.sample), sample_number, replace = FALSE)
    samp.mixedmodel<-data.frame(`spike-in`=spikenamerepeats,
        transcripts=as.numeric(transcript_conc), sample=rep("Mixed",
        sample_number), counts=mixed_model.sample)
    samp.negbinomial<-data.frame(`spike-in`=spikenamerepeats,
        transcripts=transcript_conc, sample=rep("NBinom", sample_number),
        counts=neg_binom.sample)
    samp.poisson<-data.frame(`spike-in`=spikenamerepeats,
        transcripts=transcript_conc, sample=rep("Poisson", sample_number),
        counts=poisson.sample)
    model_sampling.df<-rbind(samp.mixedmodel, samp.negbinomial,
        samp.poisson, data.frame(one_spikein.dt))
    model_sampling.df
}

#computes vanilla, and mixed models for plotting
sample_models<-function(melted_spikeins.dt, model_parameters){
    rn<-NULL
    spikein_names<-unique(as.character(melted_spikeins.dt$`spike-in`))
    model_sampling<-do.call(rbind,lapply(spikein_names,
        `subcompute_sample_models`, melted_spikeins.dt, model_parameters))
    data.table(model_sampling)
}

#makes plots of models more readable
cleanup_model_names<-function(model_sampling){
    model_sampling$distribution<-gsub(TRUE,"Observed",
        model_sampling$sample!="Mixed" & model_sampling$sample!="NBinom" &
        model_sampling$sample!="Poisson")
    model_sampling[model_sampling$sample=="Mixed",
        ]$distribution<-"Optimized"
    model_sampling[model_sampling$sample=="Poisson",
        ]$distribution<-"Poisson"
    model_sampling[model_sampling$sample=="NBinom",
        ]$distribution<-"Neg. Binomial"
    data.table(model_sampling)
}

subplot_spikein_fits<-function(iteration,model_sampling.dt, model_view, file,
bins, spikein_names){
    spike.in<-distribution<-counts<-NULL
    six_spikeins<-spikein_names[seq(from=iteration, to=length(spikein_names),
        by=round(length(spikein_names)/6))]
    g<-ggplot(data=model_sampling.dt[spike.in %in% six_spikeins,
        ][distribution %in% model_view], aes(x=counts))+
        geom_histogram(aes(fill=factor(distribution, levels=c("Poisson",
        "Neg. Binomial", "Optimized", "Observed"))), alpha=0.30,
        position="identity", bins=bins)+facet_wrap(~spike.in, ncol=3,nrow=4,
        scales="free")+ylab("Observation density\n")+labs(fill="Model")+
        xlab("\nSpike-in expression / normalized counts")+
        ggtitle("Approximating technical noise using a refined
        mixed distribution.\n")+scale_color_discrete(guide=FALSE)+
        theme(axis.text.x = element_text(angle = 90, hjust = 1, size=10))
    savefile<-paste("spikein_fits/", paste(file,"spikein_fits_subset",
        iteration, "plot.pdf",sep="_"),sep="")
    ggsave(savefile, plot=g, device="pdf", width=8, height=6, units = "in")
}

#plot mixed-model fits for each spike-in molecule
plot_spikein_fits<-function(model_sampling.dt, model_view, file, bins){
    transcripts<-NULL
    dir.create("spikein_fits")
    print("Writing spike-in plots; this takes a few seconds.")
    spikein_names<-as.character(unique(model_sampling.dt[
        order(transcripts)]$spike.in))
    iteration<-seq(from=1, to=round(length(spikein_names)/6), by=1)
    lapply(iteration, `subplot_spikein_fits`,
        model_sampling.dt=model_sampling.dt, model_view=model_view,
        file=file, bins=bins, spikein_names=spikein_names)
}

#computes the linear fit parameters to model false zeroes
estimate_undetected2molpercell<-function(undetected_spikeins.dt,
plot, file){
    V1<-transcripts<-undetected<-NULL
    undetected_spikeins.dt<-undetected_spikeins.dt[
        undetected!=0][transcripts>=0.5]
    undetected2mpc.fit<-lm(undetected~log2(transcripts),
        undetected_spikeins.dt)
    ##Read correlation paramaters into data.frame
    undetected2molpercell<-data.frame(
        undetected2mpc.slope=coef(undetected2mpc.fit)[2],
        undetected2mpc.intercept=coef(undetected2mpc.fit)[1])
    if (plot==TRUE){
        g<-ggplot(data=undetected_spikeins.dt,
            aes(x=log2(transcripts), y=undetected))+geom_point()+
            xlab("\nSpike-in actual counts, log2(molecules per cell)")+
            ylab("Fraction of cells where observed spike-in count is zero")+
            ggtitle("Conditional probability of observing a zero given
            transcript concentration\n")+geom_abline(
            slope=coef(undetected2mpc.fit)[2],color="red",
            intercept=coef(undetected2mpc.fit)[1])
        ggsave(paste(file,"undetected2molpercell.pdf", sep="_"), plot=g,
            device="pdf", width=11, height=8.5, units = "in")
    }
    undetected2molpercell
}

#computes relationship between counts and molecules per cell
counts2mpc<-function(melted_spikeins.dt, plot, file){
    transcripts<-value<-V1<-counts<-NULL
    mean_spikeins.dt<-melted_spikeins.dt[,mean(counts),by=transcripts][
        transcripts>0]
    counts2mpc.fit<-lm(log2(mean_spikeins.dt$V1)~log2(
        mean_spikeins.dt$transcripts))
    if (plot==TRUE){
        g<-ggplot(data=mean_spikeins.dt, aes(x=log2(transcripts),y=log2(V1)))+
            geom_point()+xlab("\nSpike-in molecules per cell,
            log2(molecules)")+ylab("Observed counts, log2(counts) \n")+
            ggtitle("Correlation of measured counts as afunction of actual
            transcript number.\n")+geom_abline(slope=coef(counts2mpc.fit)[2],
            intercept=coef(counts2mpc.fit)[1], color="red")
        ggsave(paste(file,"cor_counts2moleculespercell.pdf", sep="_"),
            plot=g, device="pdf", width=8, height=6, units = "in")
    }
    counts2mpc.fit
}

apply_bayes<-function(dropout_model.dt, noise_models){
    dropouts_given_transcripts<-as.vector(
        dropout_model.dt$dropouts_given_transcripts[-1])
    genewise_prob_transcript_is_k<-as.vector(
        noise_models$genewise$prob_transcript_is_k)
    names(genewise_prob_transcript_is_k)<-as.character(
        rownames(noise_models$genewise_dropouts))
    bayes_numerator<-dropouts_given_transcripts%*%t(
        genewise_prob_transcript_is_k)
    noise_models$dropout_parameters<-data.table(data.frame(sweep(rbind(t(
        noise_models$genewise_dropouts$prob_truezero), bayes_numerator), 2,
        noise_models$genewise_dropouts$prob_geneobservedaszero, "/")),
        keep.rownames=TRUE)
    colnames(noise_models$dropout_parameters)<-c("transcripts",
        colnames(noise_models$dropout_parameters)[-1])
    noise_models$dropout_parameters$transcripts<-
        as.numeric(noise_models$dropout_parameters$transcripts)-1
    noise_models
}

#computes genewise probabilities that observed values are zero
compute_genewise_dropouts<-function(dropout_model.dt, endogenous_counts.df,
kmax){
    transcripts<-NULL
    noise_models<-list()
    noise_models$genewise_dropouts<-data.frame(rowSums(endogenous_counts.df==0
        )/dim(endogenous_counts.df)[2])
    colnames(noise_models$genewise_dropouts)<-"prob_geneobservedaszero"
    noise_models$genewise_dropouts$prob_truezero<-(
        noise_models$genewise_dropouts$prob_geneobservedaszero-
        sum(dropout_model.dt$dropouts_given_transcripts[-1])/kmax)/
        (1-sum(dropout_model.dt$dropouts_given_transcripts[-1])/kmax)
    noise_models$genewise_dropouts[
        noise_models$genewise_dropouts$prob_truezero<0,]$prob_truezero<-0
    noise_models$genewise_dropouts$prob_transcript_is_k<-(1-
        noise_models$genewise_dropouts$prob_truezero)*1/kmax
    noise_models<-apply_bayes(dropout_model.dt=dropout_model.dt,
        noise_models=noise_models)
    setkey(noise_models$dropout_parameters, transcripts)
    setkey(dropout_model.dt, transcripts)
    noise_models$dropout_parameters<-dropout_model.dt[
        noise_models$dropout_parameters,]
    setkey(noise_models$dropout_parameters, transcripts)
    noise_models$dropout_parameters<-data.frame(
        noise_models$dropout_parameters)
    noise_models
}

build_dropoutmodel<-function(kmax, undetected2mpc, counts2mpc.fit,
dropout_inflate){
    transcript_values<-seq(from=0, to=kmax, by=1)
    dropout_model.dt<-data.table(data.frame(
        transcripts=transcript_values))
    dropout_model.dt$dropouts_given_transcripts<-(1/as.numeric(
        dropout_inflate))*as.numeric(undetected2mpc[1])*log2(
        dropout_model.dt$transcripts)+as.numeric(undetected2mpc[2])
    dropout_model.dt[dropout_model.dt$dropouts_given_transcripts>1 |
        dropout_model.dt$dropouts_given_transcripts==Inf,
        ]$dropouts_given_transcripts<-1
    dropout_model.dt$counts<-2^(coef(counts2mpc.fit)[2]*
        log2(dropout_model.dt$transcripts)+coef(counts2mpc.fit)[1])
    dropout_model.dt
}


create_null_dropout_model<-function(counts2mpc.fit, endogenous_counts.df){
    noise_models<-list()
    transcripts<-seq(from=0, to=10, by=1)
    dropouts_given_transcripts<-c(1,rep(0,10))
    counts<-c(0, 2^(coef(counts2mpc.fit)[2]*
        log2(transcripts[-1])+coef(counts2mpc.fit)[1]))
    gene_nodropouts<-data.frame(dropouts_given_transcripts%*%t(rep(1,
        length(rownames(endogenous_counts.df)))), row.names = transcripts)
    colnames(gene_nodropouts)<-rownames(endogenous_counts.df)
    dropout_null<-data.frame(transcripts=transcripts,
        dropouts_given_transcripts=dropouts_given_transcripts,
        counts=counts)
    rownames(dropout_null)<-transcripts
    noise_models$dropout_parameters<-cbind(dropout_null, gene_nodropouts)
    noise_models
}

#Uses Bayes' theorem to build drop-out noise models
estimate_missingdata<-function(melted_spikeins.dt, endogenous_counts.df,
counts2mpc.fit, plot, file, dropout_inflate=dropout_inflate, sample_number){
    transcripts<-rn<-counts<-NULL
    force(sample_number)
    undetected_spikeins.dt<-melted_spikeins.dt[,sum(counts==0)/sample_number,
        by=c("spike-in", "transcripts")][transcripts>0,]
    colnames(undetected_spikeins.dt)<-c("spike-in", "transcripts",
        "undetected")
    undetected2mpc<-estimate_undetected2molpercell(undetected_spikeins.dt,
        plot=plot, file=file)
    kmax<-ceiling(2^(-as.numeric(undetected2mpc[2])/
        ((1/as.numeric(dropout_inflate))*as.numeric(undetected2mpc[1]))))-1
    if (sum(undetected_spikeins.dt$undetected==0)==0){
        print(paste("Warning: there are no spike-ins that were detected in",
            "every sample. As a result the actual transcript count",
            "threshold, k, at which drop-outs are not present will be",
            "extrapolated rather than interpolated. The extrapolated value ",
            "for k is, ", kmax, ".", sep=""))
    }
    if (kmax>0){
        print(paste("There are adequate spike-in drop-outs to build the",
            "drop-out model. Estimating the drop-out model now."), sep="")
        noise_models<-compute_genewise_dropouts(build_dropoutmodel(kmax=kmax,
            undetected2mpc=undetected2mpc, counts2mpc.fit=counts2mpc.fit,
            dropout_inflate=dropout_inflate), kmax=kmax,
            endogenous_counts.df=endogenous_counts.df)
    } else {
        print(paste("Warning: there are inadequate spike-in drop-outs to",
            "build the drop-out model. All observed zeros will be treated",
            "as true zeros during simulation of technical replicates."),
            sep="")
        noise_models<-create_null_dropout_model(counts2mpc.fit=counts2mpc.fit,
            endogenous_counts.df=endogenous_counts.df)
    }
    noise_models
}

write_noise_model<-function(SCEList, file){
    metadata<-assay<-NULL
    write.table(metadata(SCEList)$spikein_parameters, file=paste(file,
        "parameters4randomize.xls", sep="_"), quote=FALSE,
        sep="\t", row.names = FALSE)
    write.table(metadata(SCEList)$dropout_parameters,
        file=paste(file, "bayesianestimates.xls", sep="_"),
        quote=FALSE, sep="\t", row.names=FALSE )
    write.table(assay(SCEList, "observed_expression"), file=paste(file,
        "counts4clusterperturbation.xls", sep="_"), quote=FALSE,
        sep="\t", row.names=TRUE)
    print(paste("Parameters have been saved into the current directory",
        "in bayesianestimates.xls and parameters4randomize.xls."), sep="")
}


############################ USER COMMANDS ###################################
estimate_noiseparameters<-function(SCEList, plot=FALSE, sd_inflate=0,
max_cumprob=0.9999, bins=10, write.noise.model=TRUE, file="noise_estimation",
dropout_inflate=1, model_view=c("Observed", "Optimized"),
alpha_resolution=0.005, tie_function="maximum"){
    transcripts<-value<-V1<-rn<-distribution<-metadata<-NULL
    `metadata<-`<-assay<-NULL
    spike_counts.df<-data.frame(assay(altExp(SCEList, "ERCC_spikes")))
    spike_conc.df<-metadata(SCEList)$spikeConcentrations
    endogenous_counts.df<-data.frame(assay(SCEList, "observed_expression"))
    melted_spikeins.dt<-melt_spikeins(prepare_data(spike_counts.df,
        spike_conc.df))
    print("Fitting parameter alpha to establish spike-in derived noise model.")
    model_parameters<-compute_alpha(melted_spikeins.dt=melted_spikeins.dt,
        estimate_mu2sigma, bins=bins, max_cumprob=max_cumprob, plot=plot,
        sd_inflate=sd_inflate, file=file, tie_function = tie_function,
        alpha_resolution=alpha_resolution)
    model_sampling.dt<-cleanup_model_names(sample_models(melted_spikeins.dt,
        model_parameters))
    if (plot==TRUE){
        plot_spikein_fits(model_sampling.dt, model_view=model_view,
            file=file, bins=bins)
    }
    counts2mpc.fit<-counts2mpc(melted_spikeins.dt, plot=plot, file=file)
    sample_number<-dim(spike_counts.df)[2]
    noise_models<-estimate_missingdata(melted_spikeins.dt,
        endogenous_counts.df, counts2mpc.fit ,plot=plot,
        file=file, dropout_inflate=dropout_inflate,
        sample_number=sample_number)
    ##Prepare final output
    metadata(SCEList)<-list(dropout_parameters=noise_models$dropout_parameters,
        spikein_parameters=model_parameters, spikeConcentrations=spike_conc.df,
        genewiseDropouts=noise_models$genewise_dropouts)
    if (write.noise.model==TRUE) {
        write_noise_model(SCEList=SCEList, file=file)
    }
    SCEList
}
Any scripts or data that you put into this service are public.
BEARscc documentation built on Nov. 8, 2020, 7:56 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
BEARscc
BEARscc (Bayesian ERCC Assesstment of Robustness of Single Cell Clusters)

R/estimate_noise.R
In BEARscc: BEARscc (Bayesian ERCC Assesstment of Robustness of Single Cell Clusters)

Try the BEARscc package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BEARscc BEARscc (Bayesian ERCC Assesstment of Robustness of Single Cell Clusters)

R/estimate_noise.R In BEARscc: BEARscc (Bayesian ERCC Assesstment of Robustness of Single Cell Clusters)

Try the BEARscc package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BEARscc
BEARscc (Bayesian ERCC Assesstment of Robustness of Single Cell Clusters)

R/estimate_noise.R
In BEARscc: BEARscc (Bayesian ERCC Assesstment of Robustness of Single Cell Clusters)