#' @importFrom data.table dcast copy melt
#' @importFrom outliers chisq.out.test
#' @importFrom MASS fitdistr
#' @export
keep_only_proteotypic <- function(input_dt) {
return(input_dt[which(grepl("^1/", input_dt$ProteinName)),])
}
#' @export
long2wide <- function(input_dt, level = "PeptideIon") {
if (!level %in% c("PeptideIon", "Protein")) {
stop("Please select a valid type. Options: \"PeptideIon(default)\", \"Protein\"")
}
if (level == "PeptideIon") {
output_dt <-
dcast(input_dt, ProteinName + PeptideIon ~ filename, value.var = "Intensity")
} else if (level == "Protein") {
output_dt <-
dcast(input_dt, ProteinName ~ run_id, value.var = "Quant_sum")
}
return(output_dt)
}
#' @export
wide2long <- function(input_dt) {
output_dt <-
melt(
input_dt,
id.vars = c("PeptideIon", "ProteinName", "FullName", "NTT", "NMC"),
measure.vars = names(input_dt)[which(grepl("mean_", names(input_dt)))],
variable.name = "run_id",
value.name = "Intensity",
na.rm = T
)
return(output_dt)
}
#' @export
keep_only_semi <- function(input_dt) {
semi_cons_peptides_long <- copy(input_dt)
semi_cons_peptides_long[which(semi_cons_peptides_long$NTT == 2),]$Intensity <- 0
return(semi_cons_peptides_long)
}
#' @export
pept2prot <- function(input_pept_long, intensity_normalization = "sqrt") {
#for CPTAC itraq breast cancer and benchmarking data, this seems that sqrt works better.
if (intensity_normalization == "sqrt") {
output_prot_long <-
input_pept_long[, .(Quant_sum = sum_na(sqrt(Intensity + 1)),
NumPeptides = length(which(Intensity > 0))), by = .(ProteinName, run_id)]
} else if (intensity_normalization == "log") {
output_prot_long <-
input_pept_long[, .(Quant_sum = sum_na(log2(Intensity + 1)),
NumPeptides = length(which(Intensity > 0))), by = .(ProteinName, run_id)]
} else if (intensity_normalization == "raw") {
output_prot_long <-
input_pept_long[, .(Quant_sum = sum_na(Intensity),
NumPeptides = length(which(Intensity > 0))), by = .(ProteinName, run_id)]
}
return(output_prot_long)
}
#' @export
normalize_data <- function(input_dt, input_index, normalization = "mediancenter") {
output_dt <- copy(input_dt)
output_dt[, (input_index) := log2(output_dt[, input_index, with = F])]
run_median <-
sapply(output_dt[, input_index, with = F], median_na)
for (i in 1:length(input_index)) {
output_dt[, names(output_dt)[input_index[i]]] <-
output_dt[, input_index[i], with = F] - run_median[i] + median(run_median)
output_dt[, names(output_dt)[input_index[i]]] <-
2 ^ output_dt[, input_index[i], with = F]
}
return(output_dt)
}
#' @export
calculate_stats <- function(input_dt, input_index) {
index_semi <- which(input_dt$NTT < 2)
index_missed <- which(input_dt$NMC > 0)
index_normal <- which(input_dt$NTT == 2 & input_dt$NMC == 0)
numPept <- apply(input_dt[, input_index, with = F], 2, function(x)
length(which(x > 0)))
numSemi <-
apply(input_dt[index_semi, input_index, with = F], 2, function(x)
length(which(x > 0)))
numMissed <-
apply(input_dt[index_missed, input_index, with = F], 2, function(x)
length(which(x > 0)))
numNormal <-
apply(input_dt[index_normal, input_index, with = F], 2, function(x)
length(which(x > 0)))
TIC <- apply(input_dt[, input_index, with = F], 2, sum_na)
proteomic_measure <- data.frame(cbind(names(input_dt)[input_index], numPept, numSemi, numMissed, numNormal, TIC))
names(proteomic_measure) <-
c("filename",
"numPept",
"numSemi",
"numMissed",
"numNormal",
"TIC")
return(proteomic_measure)
}
#' @export
merge_replicates <- function(input_dt, input_anno) {
output_dt <- copy(input_dt)
list_samples <- unique(input_anno$sample_id)
message("It starts to merge replicates...")
for (i in 1:length(list_samples)) {
cat("Processing ", i, " sample: ", list_samples[i], "\n")
output_dt[, paste0("mean_", list_samples[i]) := apply(.SD, 1, mean_na), .SDcols = input_anno[input_anno$sample_id %in% list_samples[i],]$filename]
}
message("Done with merging replicates...")
return(output_dt)
}
#' @export
generate_iPIS_matrix <- function(input_all_dt, input_semi_dt) {
index_temp <- which(grepl("^mean", names(input_all_dt)))
m_iPIS <- copy(input_all_dt)
#for(i in 1:length(index_temp)) {
#m_iPIS[, index_temp[i], with=F] <- 1 - input_semi_dt[, index_temp[i], with=F] / input_all_dt[, index_temp[i], with=F]
#}
m_iPIS[, index_temp] <- 1 - input_semi_dt[, index_temp, with = F] / input_all_dt[, index_temp, with = F]
names(m_iPIS)[index_temp] <- sapply(strsplit(names(m_iPIS)[index_temp], "mean_"), "[[", 2)
index_order <- order(apply(m_iPIS[, index_temp, with = F], 1, function(x) length(which(is.na(x)))))
m_iPIS <- m_iPIS[index_order, ]
#m_iPIS <- m_iPIS[order(apply(m_iPIS[, index_temp, with = F], 1, function(x) length(which(is.na(x))))), ]
return(m_iPIS)
}
#' @export
old_calculate_PIN <- function(input_iPIS) {
index_temp <- seq(2, dim(input_iPIS)[2], 1)
m_PIN <- data.frame(
sample_id = names(input_iPIS)[index_temp],
PIN = apply(input_iPIS[, index_temp, with =
F], 2, mean_na),
pval = 1.0
)
m_PIN <- m_PIN[order(m_PIN$PIN), ]
#wenguang: re-scale PIN... not sure whether it is a good idea...
#m_PIN$PIN <- -log2(0.98 - m_PIN$PIN)
for (i in 1:(dim(m_PIN)[1] - 1)) {
if (grepl("lowest",
chisq.out.test(m_PIN$PIN[i:dim(m_PIN)[1]], opposite = F)$alternative)) {
m_PIN[i,]$pval <-
chisq.out.test(m_PIN$PIN[i:dim(m_PIN)[1]], opposite = F)$p.value
} else {
m_PIN[i,]$pval <-
chisq.out.test(m_PIN$PIN[i:dim(m_PIN)[1]], opposite = T)$p.value
}
if (i > 1) {
if (m_PIN[i,]$pval < m_PIN[i - 1,]$pval) {
m_PIN[i,]$pval <- m_PIN[i - 1,]$pval + m_PIN[i,]$pval * 0.05
#m_PIN[i, ]$pval <- m_PIN[i, ]$pval*0.6 + m_PIN[i-1, ]$pval * 0.4
}
}
}
#wenguang: need to be tested, and will be released later...
if (1 == 1) {
#index_sig <- which(m_PIN$pval < 0.01)
#index_insig <- which(m_PIN$pval >= 0.01)
index_sig <- which(m_PIN$pval < 0.05)
index_insig <- which(m_PIN$pval >= 0.05)
for (i in 1:length(index_sig)) {
if (grepl("lowest",
chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)], opposite = F)$alternative)) {
m_PIN[index_sig[i],]$pval <-
chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)], opposite = F)$p.value
} else {
m_PIN[index_sig[i],]$pval <-
chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)], opposite = T)$p.value
}
}
#p.adjust(raw$pval, method="hommel")
}
return(m_PIN)
}
#' @export
calculate_PIN <- function(input_iPIS, input_remove_zero_rows) {
index_temp <- seq(2, dim(input_iPIS)[2], 1)
#wenguang: CAUTION!!! This will calculate PINs without proteins detected consistently with only semi-tryptic peptides across all measurements (i.e. the row is full with zero).
#wenguang: by default it is off. It was only turned on, that yansheng plasma samples were analyzed, as those rows took up a large fraction of total proteins identified.
if(input_remove_zero_rows == TRUE) {
input_iPIS <- input_iPIS[ - which( apply(input_iPIS[, index_temp, with=F], 1, function(x) length(which(x == 0.0))) == length(index_temp) ), ]
}
m_PIN <- data.frame(
sample_id = names(input_iPIS)[index_temp],
PIN = apply(input_iPIS[, index_temp, with=F], 2, mean_na),
pval = 1.0
)
m_PIN <- m_PIN[order(m_PIN$PIN), ]
fit_weibull <- fitdistr(m_PIN$PIN, densfun = "weibull")
m_PIN$pval <- pweibull(m_PIN$PIN,
shape = fit_weibull$estimate[1],
scale = fit_weibull$estimate[2])
length_insig_last <- dim(m_PIN)[1]
count <- 1
index_insig <- which(m_PIN$pval >= 0.05)
while (length(index_insig) < length_insig_last) {
cat("Iteration for fitting null distribution: ", count, "\n")
#wenguang: note that for error like "Error in fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull" optimization failed", you may want to try
# fit_weibull <- fitdistr(m_PIN[index_insig,]$PIN, densfun = "weibull", lower=c(10, 0.6))
if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull"), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig,]$PIN, densfun = "weibull")
} else if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(10, 0.6)), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(10, 0.6))
} else if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(1, 0.1)), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(1, 0.1))
}
m_PIN$pval <- pweibull(m_PIN$PIN,
shape = fit_weibull$estimate[1],
scale = fit_weibull$estimate[2])
length_insig_last <- length(index_insig)
count <- count + 1
index_insig <- which(m_PIN$pval >= 0.05)
#index_insig <- which(m_PIN$pval >= 0.02)
}
#wenguang: need to be tested, and will be released later...
if (1 == 1) {
#index_sig <- which(m_PIN$pval < 0.02)
#index_insig <- which(m_PIN$pval >= 0.02)
index_sig <- which(m_PIN$pval < 0.05)
index_insig <- which(m_PIN$pval >= 0.05)
for (i in 1:length(index_sig)) {
if (grepl("lowest",
chisq.out.test(
m_PIN$PIN[c(index_sig[i], index_insig)],
variance = var(m_PIN[index_insig,]$PIN),
opposite = F
)$alternative)) {
m_PIN[index_sig[i],]$pval <-
chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)],
variance = var(m_PIN[index_insig,]$PIN),
opposite = F)$p.value
} else {
m_PIN[index_sig[i],]$pval <-
chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)],
variance = var(m_PIN[index_insig,]$PIN),
opposite = T)$p.value
}
}
#p.adjust(raw$pval, method="hommel")
}
return(m_PIN)
}
#' @export
calculate_PIN_new <- function(input_iPIS, input_remove_zero_rows) {
insig_pval <- 0.05
index_temp <- seq(2, dim(input_iPIS)[2], 1)
#wenguang: CAUTION!!! This will calculate PINs without proteins detected consistently with only semi-tryptic peptides across all measurements (i.e. the row is full with zero).
#wenguang: by default it is off. It was only turned on, that yansheng plasma samples were analyzed, as those rows took up a large fraction of total proteins identified.
if(input_remove_zero_rows == TRUE) {
input_iPIS <- input_iPIS[ - which( apply(input_iPIS[, index_temp, with=F], 1, function(x) length(which(x == 0.0))) == length(index_temp) ), ]
}
m_PIN <- data.frame(
sample_id = names(input_iPIS)[index_temp],
PIN = apply(input_iPIS[, index_temp, with=F], 2, mean_na),
pval = 1.0
)
m_PIN <- m_PIN[order(m_PIN$PIN), ]
fit_weibull <- fitdistr(m_PIN$PIN, densfun = "weibull")
m_PIN$pval <- pweibull(m_PIN$PIN,
shape = fit_weibull$estimate[1],
scale = fit_weibull$estimate[2])
length_insig_last <- dim(m_PIN)[1]
count <- 1
index_insig <- which(m_PIN$pval >= insig_pval)
best_fit_count <- 1
best_fit_D <- 0
best_fit_bool <- FALSE
while (length(index_insig) < length_insig_last) {
cat("Iteration for fitting null distribution: ", count, "\n")
#wenguang: note that for error like "Error in fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull" optimization failed", you may want to try
# fit_weibull <- fitdistr(m_PIN[index_insig,]$PIN, densfun = "weibull", lower=c(10, 0.6))
if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull"), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig,]$PIN, densfun = "weibull")
} else if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(10, 0.6)), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(10, 0.6))
} else if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(1, 0.1)), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(1, 0.1))
}
m_PIN$pval <- pweibull(m_PIN$PIN,
shape = fit_weibull$estimate[1],
scale = fit_weibull$estimate[2])
length_insig_last <- length(index_insig)
index_insig <- which(m_PIN$pval >= insig_pval)
print(ks.test(m_PIN[index_insig, ]$PIN, rweibull(n=10000, shape=fit_weibull$estimate[1], scale=fit_weibull$estimate[2])))
fit_test <- ks.test(m_PIN[index_insig, ]$PIN, rweibull(n=10000, shape=fit_weibull$estimate[1], scale=fit_weibull$estimate[2]))
if( fit_test$statistic > best_fit_D) {
best_fit_D <- fit_test$statistic
best_fit_count <- count
if( fit_test$p.value < 0.01 ) best_fit_bool <- TRUE
}
count <- count + 1
}
if(best_fit_bool == "TRUE") {
cat("It fits best when iteration", best_fit_count, "times.\n" )
m_PIN <- data.frame(
sample_id = names(input_iPIS)[index_temp],
PIN = apply(input_iPIS[, index_temp, with=F], 2, mean_na),
pval = 1.0
)
m_PIN <- m_PIN[order(m_PIN$PIN), ]
fit_weibull <- fitdistr(m_PIN$PIN, densfun = "weibull")
m_PIN$pval <- pweibull(m_PIN$PIN,
shape = fit_weibull$estimate[1],
scale = fit_weibull$estimate[2])
length_insig_last <- dim(m_PIN)[1]
index_insig <- which(m_PIN$pval >= insig_pval)
reloop_count <- 1
while (reloop_count <= best_fit_count) {
cat("Iteration for fitting null distribution: ", reloop_count, "\n")
#wenguang: note that for error like "Error in fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull" optimization failed", you may want to try
# fit_weibull <- fitdistr(m_PIN[index_insig,]$PIN, densfun = "weibull", lower=c(10, 0.6))
if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull"), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig,]$PIN, densfun = "weibull")
} else if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(10, 0.6)), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(10, 0.6))
} else if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(1, 0.1)), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(1, 0.1))
}
m_PIN$pval <- pweibull(m_PIN$PIN,
shape = fit_weibull$estimate[1],
scale = fit_weibull$estimate[2])
length_insig_last <- length(index_insig)
index_insig <- which(m_PIN$pval >= insig_pval)
#index_insig <- which(m_PIN$pval >= 0.02)
print(ks.test(m_PIN[index_insig, ]$PIN, rweibull(n=10000, shape=fit_weibull$estimate[1], scale=fit_weibull$estimate[2])))
fit_test <- ks.test(m_PIN[index_insig, ]$PIN, rweibull(n=10000, shape=fit_weibull$estimate[1], scale=fit_weibull$estimate[2]))
reloop_count <- reloop_count + 1
}
if (1 == 0) {
#index_sig <- which(m_PIN$pval < 0.02)
#index_insig <- which(m_PIN$pval >= 0.02)
index_sig <- which(m_PIN$pval < 0.01)
index_insig <- which(m_PIN$pval >= 0.01)
for (i in 1:length(index_sig)) {
if (grepl("lowest",
chisq.out.test(
m_PIN$PIN[c(index_sig[i], index_insig)],
variance = var(m_PIN[index_insig,]$PIN),
opposite = F
)$alternative)) {
m_PIN[index_sig[i],]$pval <-
chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)],
variance = var(m_PIN[index_insig,]$PIN),
opposite = F)$p.value
} else {
m_PIN[index_sig[i],]$pval <-
chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)],
variance = var(m_PIN[index_insig,]$PIN),
opposite = T)$p.value
}
}
#p.adjust(raw$pval, method="hommel")
}
} else {
m_PIN <- data.frame(
sample_id = names(input_iPIS)[index_temp],
PIN = apply(input_iPIS[, index_temp, with=F], 2, mean_na),
pval = 1.0
)
m_PIN <- m_PIN[order(m_PIN$PIN), ]
fit_weibull <- fitdistr(m_PIN$PIN, densfun = "weibull")
m_PIN$pval <- pweibull(m_PIN$PIN,
shape = fit_weibull$estimate[1],
scale = fit_weibull$estimate[2])
length_insig_last <- dim(m_PIN)[1]
count <- 1
index_insig <- which(m_PIN$pval >= 0.05)
while (length(index_insig) < length_insig_last) {
cat("Iteration for fitting null distribution: ", count, "\n")
#wenguang: note that for error like "Error in fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull" optimization failed", you may want to try
# fit_weibull <- fitdistr(m_PIN[index_insig,]$PIN, densfun = "weibull", lower=c(10, 0.6))
if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull"), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig,]$PIN, densfun = "weibull")
} else if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(10, 0.6)), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(10, 0.6))
} else if( !is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(1, 0.1)), silent=T), "try-error") ) {
fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower=c(1, 0.1))
}
m_PIN$pval <- pweibull(m_PIN$PIN,
shape = fit_weibull$estimate[1],
scale = fit_weibull$estimate[2])
length_insig_last <- length(index_insig)
count <- count + 1
index_insig <- which(m_PIN$pval >= 0.05)
#index_insig <- which(m_PIN$pval >= 0.02)
}
#wenguang: need to be tested, and will be released later...
if (1 == 1) {
#index_sig <- which(m_PIN$pval < 0.02)
#index_insig <- which(m_PIN$pval >= 0.02)
if( dim(m_PIN)[1] < 100 ) {
index_sig <- which(m_PIN$pval < 0.04)
index_insig <- which(m_PIN$pval >= 0.04)
} else {
index_sig <- which(m_PIN$pval < 0.02)
index_insig <- which(m_PIN$pval >= 0.02)
}
if( length(index_sig) > 0 ) {
for (i in 1:length(index_sig)) {
if (grepl("lowest",
chisq.out.test(
m_PIN$PIN[c(index_sig[i], index_insig)],
variance = var(m_PIN[index_insig,]$PIN),
opposite = F
)$alternative)) {
m_PIN[index_sig[i],]$pval <-
chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)],
variance = var(m_PIN[index_insig,]$PIN),
opposite = F)$p.value
} else {
m_PIN[index_sig[i],]$pval <-
chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)],
variance = var(m_PIN[index_insig,]$PIN),
opposite = T)$p.value
}
}
}
#p.adjust(raw$pval, method="hommel")
}
}
return(m_PIN)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.