#' Get random dispersion value based on the read count size (mean).
#' @param mean read count size
#' @param mean.condition read count vector of a certain sample condition
#' @param disp.condition dispersion vector of a certain sample condition
getDisp = function(mean, mean.condition, disp.condition)
{
pool = disp.condition[which(mean.condition>(mean-20) & mean.condition<(mean+20))]
if(length(pool)==0){value = disp.condition[which.min(abs(mean.condition-mean))]
}else{value = sample(pool,1)}
return(value)
}
#' Generate List containing estimated mean and dispersion parameters and filtered count from original count dataset.
#' @export
generateDatasetParameter = function(){
# Effect size 1.2/1.5~
data(kidney, package='SimSeq') # Load TCGA KIRC RNA-seq data. 144 samples (72 cancer and matched normal, respectively)
k_count = kidney$counts # RNA-seq read count data
index.cancer=(1:72)*2 # cancer sample index
index.normal=index.cancer-1 # normal sample index
k_count= k_count[,c(index.cancer, index.normal)] # Arrange samples for convinience
# Get count mean and dispersion using edgeR package
# Mean and dispersion values are obtained separately from cancer and normal samples when different dispersion is assummed between two sample types.
# Mean and dispersion values from normal samples
dge.normal=DGEList(counts=k_count[,73:144], group = factor(rep(2,72)))
dge.normal=calcNormFactors(dge.normal)
dge.normal=estimateCommonDisp(dge.normal)
dge.normal=estimateTagwiseDisp(dge.normal)
disp.normal = dge.normal$tagwise.dispersion # Dispersion
mean.normal = apply(k_count[,73:144],1,mean)
# Mean and dispersion values from cancer samples
dge.cancer=DGEList(counts=k_count[,1:72], group=factor(rep(1,72)))
dge.cancer=calcNormFactors(dge.cancer)
dge.cancer=estimateCommonDisp(dge.cancer)
dge.cancer=estimateTagwiseDisp(dge.cancer)
disp.cancer = dge.cancer$tagwise.dispersion
mean.cancer = apply(k_count[,1:72],1,mean)
# Gene filtering: Genes having small read count (<10) are filtered
k_mean.total = apply(k_count,1,mean)
k_index.filter = which(k_mean.total < 10)
k_mean.total = k_mean.total[-k_index.filter]
disp.normal = disp.normal[-k_index.filter]
disp.cancer = disp.cancer[-k_index.filter]
mean.normal = mean.normal[-k_index.filter]
mean.cancer = mean.cancer[-k_index.filter]
# Mean and dispersion values obtained using all samples when same dispersion is assummed between two sample types..
k_dge.total = DGEList(counts = k_count, group = factor(c(rep(1,72),rep(2,72))))
k_dge.total = calcNormFactors(k_dge.total)
k_dge.total = estimateCommonDisp(k_dge.total)
k_dge.total = estimateTagwiseDisp(k_dge.total)
k_disp.total = k_dge.total$tagwise.dispersion
k_disp.total = k_disp.total[-k_index.filter]
###############################################
#############bottomly###########################
################################################
load(system.file("extdata", "bottomly_eset.RData",package = "compareDEtools"))# Load Bottomly mouse RNA-seq data. 21 samples
b_count<-exprs(bottomly.eset) # RNA-seq read count data
strain<-pData(bottomly.eset)[,'strain']
index.C=which(strain=='C57BL/6J') # C57BL/6J sample index
index.D=which(strain=='DBA/2J') # DBA/2J sample index
b_count= b_count[,c(index.C, index.D)] # Arrange samples for convinience
# Get count mean and dispersion using edgeR package
# Mean and dispersion values are obtained separately from C57BL/6J and DBA/2J samples when different dispersion is assummed between two sample types.
# Mean and dispersion values from C57BL/6J samples
dge.C=DGEList(counts=b_count[,1:10], group=factor(rep(1,10)))
dge.C=calcNormFactors(dge.C)
dge.C=estimateCommonDisp(dge.C)
dge.C=estimateTagwiseDisp(dge.C)
disp.C = dge.C$tagwise.dispersion
mean.C = apply(b_count[,1:10],1,mean)
# Mean and dispersion values from DBA/2J samples
dge.D=DGEList(counts=b_count[,11:21], group = factor(rep(2,11)))
dge.D=calcNormFactors(dge.D)
dge.D=estimateCommonDisp(dge.D)
dge.D=estimateTagwiseDisp(dge.D)
disp.D = dge.D$tagwise.dispersion # Dispersion
mean.D = apply(b_count[,11:21],1,mean)
# Gene filtering: Genes having small read count (<10) are filtered
b_mean.total = apply(b_count,1,mean)
b_index.filter = which(b_mean.total < 10)
b_mean.total = b_mean.total[-b_index.filter]
disp.C = disp.C[-b_index.filter]
disp.D = disp.D[-b_index.filter]
mean.C = mean.C[-b_index.filter]
mean.D = mean.D[-b_index.filter]
# Mean and dispersion values obtained using all samples when same dispersion is assummed between two sample types..
b_dge.total = DGEList(counts = b_count, group = factor(c(rep(1,10),rep(2,11))))
b_dge.total = calcNormFactors(b_dge.total)
b_dge.total = estimateCommonDisp(b_dge.total)
b_dge.total = estimateTagwiseDisp(b_dge.total)
b_disp.total = b_dge.total$tagwise.dispersion
b_disp.total = b_disp.total[-b_index.filter]
###############################################
############# SEQC ########################
###############################################
SEQC<-system.file("extdata", "GSE49712_HTSeq.txt",package = "compareDEtools")
s_count<-read.table(SEQC, header = T)
s_count <- s_count[grep('no_feature|ambiguous|too_low_aQual|not_aligned|alignment_not_unique' ,
rownames(s_count), invert=TRUE),]
s_mean.total = apply(s_count,1,mean)
s_index.filter = which(s_mean.total < 10)
dataset.parameters=list(k_count=k_count, disp.normal=disp.normal, mean.normal=mean.normal, disp.cancer=disp.cancer,
mean.cancer=mean.cancer, k_mean.total=k_mean.total, k_index.filter=k_index.filter, k_disp.total=k_disp.total,
b_count=b_count, disp.D=disp.D, mean.D=mean.D, disp.C=disp.C, mean.C=mean.C, b_mean.total=b_mean.total,
b_index.filter=b_index.filter, b_disp.total=b_disp.total, s_count=s_count, s_mean.total=s_mean.total, s_index.filter=s_index.filter)
return(dataset.parameters)
}
#' Generate synthetic count data for analysis
#' @param simul.data Characters indicating which dataset will be used for simulation data generation
#' "KIRC" for KIRC dataset.
#' "Bottomly" for Bottomly dataset.
#' "mKdB" for hybrid dataset combining mean of KIRC and dispersion of Bottomly datatset.
#' "mBdK" for hybrid dataset combining mean of Bottomly and dispersion of KIRC datatset.
#' @param dataset Characters specifying the file name of simulation dataset.
#' @param fixedfold A logical indicating whether this dataset is generated by fold changes with fixed values or random folds following exponential distribution.
#' @param samples.per.cond An integer indicating number of samples for each sample group (e.g. 3).
#' @param n.var An integer indicating the number of total gene in the synthetic data.
#' @param n.diffexp An integer indicating number of generated DE genes in the synthetic data.
#' @param fraction.upregulated Proportion of upregulated DE genes in the synthetic data. (e.g. 0.5).
#' @param dispType A character parameter indicating how is the dispersion parameter assumed to be for each condition to make a synthetic data. Possible values are 'same' and 'different'.
#' @param mode Characters specifying test conditions used for simulation data generation.
#' "D" for basic simulation (not adding outliers).
#' "R" for adding 5% of random outlier.
#' "OS" for adding outlier sample to each sample group.
#' @param RO.prop An integer specifying random outlier proportion percentage that we generate dataset with.
#' "DL" for decreasing KIRC simulation dispersion 22.5 times (similar to SEQC data dispersion) to compare with SEQC data.
#' @param dataset.parameters A list containing estimated mean and dispersion parameters and filtered count from original count dataset.
#' @export
SyntheticDataSimulation = function(simul.data, dataset, random_sampling=FALSE, Large_sample=FALSE, fixedfold=FALSE, samples.per.cond, n.var, n.diffexp, fraction.upregulated, dispType, mode, RO.prop=5, dataset.parameters)
{
# Generate simulation data
if(mode != 'D' && mode!='S' && mode!='R' && mode!='OS' && mode!='DL'){stop('mode must be "D" (DE variation test), "S" (single outlier test) or "R" (random outlier test) or "OS" (dispersion outlier sample test) or "DL" (dispersion lowered test)')}
datasetName = dataset
s = samples.per.cond
k_count=dataset.parameters$k_count
b_count=dataset.parameters$b_count
disp.normal=dataset.parameters$disp.normal
disp.cancer=dataset.parameters$disp.cancer
disp.D=dataset.parameters$disp.D
disp.C=dataset.parameters$disp.C
mean.normal=dataset.parameters$mean.normal
mean.cancer=dataset.parameters$mean.cancer
mean.D=dataset.parameters$mean.D
mean.C=dataset.parameters$mean.C
k_mean.total=dataset.parameters$k_mean.total
k_index.filter=dataset.parameters$k_index.filter
k_disp.total=dataset.parameters$k_disp.total
b_mean.total=dataset.parameters$b_mean.total
b_index.filter=dataset.parameters$b_index.filter
b_disp.total=dataset.parameters$b_disp.total
if(simul.data=='KIRC'||simul.data=='mKdB'){
random.index = sample(1:length(k_mean.total), size=n.var)
sample.mean1 = k_mean.total[random.index]
if(simul.data=='mKdB'){
sub.random.index =sample(1:length(b_mean.total), size=n.var)
sub.sample.mean1 = b_mean.total[sub.random.index]
sub.sample.mean2 = sub.sample.mean1
}
}else if(simul.data=='Bottomly'||simul.data=='mBdK'){
random.index = sample(1:length(b_mean.total), size=n.var)
sample.mean1 = b_mean.total[random.index]
if(simul.data=='mBdK'){
sub.random.index =sample(1:length(k_mean.total), size=n.var)
sub.sample.mean1 = k_mean.total[sub.random.index]
sub.sample.mean2 = sub.sample.mean1
}
}
sample.mean2 = sample.mean1
if(n.diffexp!=0){
if(fixedfold){
if(simul.data!='KIRC'){
stop('Simulation with fixed fold must be based on KIRC dataset.')
}
factor1 = 1.15
factor2 = 1.3
factor3 = 1.6
sample.mean2[1:round(n.diffexp/3)] = sample.mean2[1:round(n.diffexp/3)]*factor1
sample.mean2[round((n.diffexp/3)+1):round(2*n.diffexp/3)] = sample.mean2[round((n.diffexp/3)+1):round(2*n.diffexp/3)]*factor2
sample.mean2[round((2*n.diffexp/3)+1):(n.diffexp)] = sample.mean2[round((2*n.diffexp/3)+1):(n.diffexp)]/factor3
}else{
upindex = 1:round(n.diffexp*fraction.upregulated)
dnindex = round(n.diffexp*fraction.upregulated+1):n.diffexp
if(s<=3){
factor1 = 1.5+rexp(n = length(upindex), rate=1) #3 sample fold change 1.5~
factor2 = 1.5+rexp(n = length(dnindex), rate=1)
}else if(s<=5){
factor1 = 1.3+rexp(n = length(upindex), rate=1) #5 sample fold change 1.3~
factor2 = 1.3+rexp(n = length(dnindex), rate=1)
}
else{
factor1 = 1.2+rexp(n = length(upindex), rate=1) #10 sample fold change 1.2~
factor2 = 1.2+rexp(n = length(dnindex), rate=1)
}
sample.mean2[upindex] = sample.mean2[upindex]*factor1
sample.mean2[dnindex] = sample.mean2[dnindex]/factor2
if(simul.data=='mBdK'||simul.data=='mKdB'){
sub.sample.mean2[upindex] = sub.sample.mean2[upindex]*factor1
sub.sample.mean2[dnindex] = sub.sample.mean2[dnindex]/factor2
}
}
}
if(simul.data=='KIRC'||simul.data=='mBdK'){
mean.condition1=mean.normal
mean.condition2=mean.cancer
disp.condition1=disp.normal
disp.condition2=disp.cancer
disp.total=k_disp.total
}else if(simul.data=='Bottomly'||simul.data=='mKdB'){
mean.condition1=mean.D
mean.condition2=mean.C
disp.condition1=disp.D
disp.condition2=disp.C
disp.total=b_disp.total
}
if(dispType == 'different')
{
if(simul.data=='KIRC'||simul.data=='Bottomly'){
sample.disp1 = sapply(sample.mean1, FUN = getDisp, mean.condition = mean.condition1, disp.condition = disp.condition1, simplify = T, USE.NAMES = F)
sample.disp2 = sapply(sample.mean2, FUN = getDisp, mean.condition = mean.condition2, disp.condition = disp.condition2, simplify = T, USE.NAMES = F)
}else if(simul.data=='mKdB'||simul.data=='mBdK'){
sample.disp1 = sapply(sub.sample.mean1[order(sub.sample.mean1)], FUN = getDisp, mean.condition = mean.condition1, disp.condition = disp.condition1, simplify = T, USE.NAMES = F)
sample.disp1 <-sample.disp1[order(order(sample.mean1))]
sample.disp2 = sapply(sub.sample.mean2[order(sub.sample.mean2)], FUN = getDisp, mean.condition = mean.condition2, disp.condition = disp.condition2, simplify = T, USE.NAMES = F)
sample.disp2 <-sample.disp2[order(order(sample.mean2))]
}
}else if(dispType == 'same')
{
if(simul.data=='KIRC'||simul.data=='Bottomly'){
sample.disp1 = disp.total[random.index]
}else if(simul.data=='mKdB'||simul.data=='mBdK'){
sample.disp1 = disp.total[sub.random.index[order(sub.sample.mean1)]][order(order(sample.mean1))]
}
sample.disp2 = sample.disp1
}
counts = matrix(nrow=n.var, ncol = 2*s)
if(mode == "OS"){
for(i in 1:n.var){
counts[i,1:round(s/3)]=rnbinom(round(s/3), 1/(5*sample.disp2[i]),mu=sample.mean2[i])
counts[i,(round(s/3)+1):s] = rnbinom((s-round(s/3)), 1/sample.disp2[i], mu = sample.mean2[i])
counts[i,(s+1):(s+round(s/3))] = rnbinom(round(s/3), 1/(5*sample.disp1[i]), mu = sample.mean1[i])
counts[i,(s+round(s/3)+1):(2*s)] = rnbinom((s-round(s/3)), 1/sample.disp1[i], mu = sample.mean1[i])
}
}else if(mode == "DL"){
for(i in 1:n.var)
{
counts[i,1:s] = rnbinom(s, 22.5/sample.disp2[i], mu = sample.mean2[i])
counts[i,(s+1):(2*s)] = rnbinom(s, 22.5/sample.disp1[i], mu = sample.mean1[i])
}
}else if(Large_sample==TRUE){
for(i in 1:n.var)
{
counts[i,1:round(s/3)]=rnbinom(round(s/3), 1/(5*sample.disp2[i]),mu=sample.mean2[i])
counts[i,(round(s/3)+1):s] = rnbinom((s-round(s/3)), 1/sample.disp2[i], mu = sample.mean2[i])
counts[i,(s+1):(s+round(s/3))] = rnbinom(round(s/3), 1/(5*sample.disp1[i]), mu = sample.mean1[i])
counts[i,(s+round(s/3)+1):(2*s)] = rnbinom((s-round(s/3)), 1/sample.disp1[i], mu = sample.mean1[i])
}
RO = matrix(runif(n.var*2*s , min = 0, max = 100), nrow = n.var, ncol = 2*s)
index.outlier = which(RO<3)
index.outlier<-index.outlier[c(which(index.outlier>n.var*(round(s/3))&index.outlier<=n.var*s),which(index.outlier>n.var*(s+round(s/3))&index.outlier<=n.var*2*s))]
counts[index.outlier] = counts[index.outlier]*runif(n = length(index.outlier), min=5, max=10)
counts = round(counts)
}else{
if(random_sampling==TRUE){
rand1=runif(s,min=0.7,max=1.3)
rand2=runif(s,min=0.7,max=1.3)
}else{
rand1=rep(1,s)
rand2=rep(1,s)
}
for(i in 1:n.var)
{
counts[i,1:s] = sapply(rand1, FUN = function(x) rnbinom(1, 1/sample.disp2[i], mu=sample.mean2[i]*x))
counts[i,(s+1):(2*s)] = sapply(rand2, FUN = function(x) rnbinom(1, 1/sample.disp1[i], mu=sample.mean1[i]*x))
}
}
### Random Outlier
if(mode == "R")
{
RO = matrix(runif(n.var*2*s , min = 0, max = 100), nrow = n.var, ncol = 2*s)
index.outlier = which(RO<RO.prop)
counts[index.outlier] = counts[index.outlier]*runif(n = length(index.outlier), min=5, max=10)
counts = round(counts)
}
rownames(counts) = paste("g",1:n.var,sep="")
sample.annot = data.frame(condition = c(rep(1,s),rep(2,s)))
colnames(counts) = rownames(sample.annot)
info.parameters = list(dataset = datasetName, uID = datasetName)
# save as compdata
cpd = compData(count.matrix = counts,
sample.annotations = sample.annot,
info.parameters = info.parameters)
saveRDS(cpd, datasetName)
}
#' Generate real data for analysis
#' @param simul.data Characters indicating which dataset will be used for simulation data generation
#' "KIRC" for KIRC dataset.
#' "Bottomly" for Bottomly dataset.
#' "SEQC" for SEQC ERCC Spike-In dataset.
#' @param dataset A character parameter specifying the file name of simulation dataset.
#' @param samples.per.cond An integer indicating number of samples for each sample group (e.g. 3).
#' @param fpc A logical indicating whether simulation data is made from single sample group(e.g. normal) to calculate False positive counts. Default is FALSE.
#' @param dataset.parameters A list containing estimated mean and dispersion parameters and filtered count from original count dataset.
#' @export
RealDataSimulation = function(simul.data ,dataset, samples.per.cond, fpc=FALSE, dataset.parameters)
{
# Generate simulation data
datasetName = dataset
s=samples.per.cond
k_count=dataset.parameters$k_count
b_count=dataset.parameters$b_count
k_index.filter=dataset.parameters$k_index.filter
b_index.filter=dataset.parameters$b_index.filter
s_count=dataset.parameters$s_count
s_index.filter=dataset.parameters$s_index.filter
if(fpc){
if(simul.data=='KIRC'){
random.index_row = sample(1:72, size=2*s)
counts<-k_count[-k_index.filter,random.index_row]
}else if(simul.data=='Bottomly'){
random.index_row = sample(1:10, size=2*s)
counts<-b_count[-b_index.filter,random.index_row]
}else if(simul.data=='SEQC'){
stop('SEQC is not used for FP count calculation')
}
}else{
if(simul.data=='KIRC'){
random.index_1 = sample(1:72, size=s)
random.index_2 = sample(73:144, size=s)
count<-k_count[-k_index.filter,]
random.index_row<-append(random.index_1,random.index_2)
counts<-count[,random.index_row]
}else if(simul.data=='Bottomly'){
random.index_1 = sample(1:10, size=s)
random.index_2 = sample(11:21, size=s)
count<-b_count[-b_index.filter,]
random.index_row<-append(random.index_1,random.index_2)
counts<-count[,random.index_row]
}else if(simul.data=='SEQC'){
count<-s_count[-s_index.filter,]
random.index_1 = sample(1:5, size=s)
random.index_2 = sample(6:10, size=s)
random.index_row<-append(random.index_1,random.index_2)
counts<-count[,random.index_row]
}
}
if(simul.data!='SEQC'){
rownames(counts) = paste("g",1:nrow(counts),sep="")
}
sample.annot = data.frame(condition = c(rep(1,s),rep(2,s)))
colnames(counts) = rownames(sample.annot)
info.parameters = list(dataset = datasetName, uID = datasetName)
# save as compdata
cpd = compData(count.matrix = counts,
sample.annotations = sample.annot,
info.parameters = info.parameters)
saveRDS(cpd, datasetName)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.