R/Processor.R

#' @importFrom data.table dcast copy melt
#' @importFrom outliers chisq.out.test
#' @importFrom MASS fitdistr


#' @export
keep_only_proteotypic <- function(input_dt) {
    
    return(input_dt[which(grepl("^1/", input_dt$ProteinName)), ])
    
}


#' @export
long2wide <- function(input_dt, level = "PeptideIon") {
    
    if (!level %in% c("PeptideIon", "Protein")) {
        stop("Please select a valid type. Options:  \"PeptideIon(default)\", \"Protein\"")
    }
    
    if (level == "PeptideIon") {
        output_dt <- dcast(input_dt, ProteinName + PeptideIon ~ filename, value.var = "Intensity")
    } else if (level == "Protein") {
        output_dt <- dcast(input_dt, ProteinName ~ run_id, value.var = "Quant_sum")
    }
    
    return(output_dt)
    
}


#' @export
wide2long <- function(input_dt) {
    
    output_dt <- melt(input_dt, id.vars = c("PeptideIon", "ProteinName", "FullName", 
        "NTT", "NMC"), measure.vars = names(input_dt)[which(grepl("mean_", names(input_dt)))], 
        variable.name = "run_id", value.name = "Intensity", na.rm = T)
    
    return(output_dt)
    
}


#' @export
keep_only_semi <- function(input_dt) {
    
    semi_cons_peptides_long <- copy(input_dt)
    semi_cons_peptides_long[which(semi_cons_peptides_long$NTT == 2), ]$Intensity <- 0
    
    return(semi_cons_peptides_long)
    
}


#' @export
pept2prot <- function(input_pept_long, intensity_normalization = "sqrt") {
    
    # for CPTAC itraq breast cancer and benchmarking data, this seems that sqrt works
    # better.
    if (intensity_normalization == "sqrt") {
        output_prot_long <- input_pept_long[, .(Quant_sum = sum_na(sqrt(Intensity + 
            1)), NumPeptides = length(which(Intensity > 0))), by = .(ProteinName, 
            run_id)]
        
    } else if (intensity_normalization == "log") {
        output_prot_long <- input_pept_long[, .(Quant_sum = sum_na(log2(Intensity + 
            1)), NumPeptides = length(which(Intensity > 0))), by = .(ProteinName, 
            run_id)]
        
    } else if (intensity_normalization == "raw") {
        output_prot_long <- input_pept_long[, .(Quant_sum = sum_na(Intensity), NumPeptides = length(which(Intensity > 
            0))), by = .(ProteinName, run_id)]
        
    }
    
    
    return(output_prot_long)
    
}


#' @export
normalize_data <- function(input_dt, input_index, normalization = "mediancenter") {
    
    output_dt <- copy(input_dt)
    
    output_dt[, `:=`((input_index), log2(output_dt[, input_index, with = F]))]
    
    run_median <- sapply(output_dt[, input_index, with = F], median_na)
    
    for (i in 1:length(input_index)) {
        output_dt[, names(output_dt)[input_index[i]]] <- output_dt[, input_index[i], 
            with = F] - run_median[i] + median(run_median)
        output_dt[, names(output_dt)[input_index[i]]] <- 2^output_dt[, input_index[i], 
            with = F]
    }
    
    return(output_dt)
    
}


#' @export
calculate_stats <- function(input_dt, input_index) {
    
    index_semi <- which(input_dt$NTT < 2)
    index_missed <- which(input_dt$NMC > 0)
    index_normal <- which(input_dt$NTT == 2 & input_dt$NMC == 0)
    
    numPept <- apply(input_dt[, input_index, with = F], 2, function(x) length(which(x > 
        0)))
    
    numSemi <- apply(input_dt[index_semi, input_index, with = F], 2, function(x) length(which(x > 
        0)))
    numMissed <- apply(input_dt[index_missed, input_index, with = F], 2, function(x) length(which(x > 
        0)))
    numNormal <- apply(input_dt[index_normal, input_index, with = F], 2, function(x) length(which(x > 
        0)))
    
    TIC <- apply(input_dt[, input_index, with = F], 2, sum_na)
    
    proteomic_measure <- data.frame(cbind(names(input_dt)[input_index], numPept, 
        numSemi, numMissed, numNormal, TIC))
    names(proteomic_measure) <- c("filename", "numPept", "numSemi", "numMissed", 
        "numNormal", "TIC")
    
    return(proteomic_measure)
    
}


#' @export
merge_replicates <- function(input_dt, input_anno) {
    
    output_dt <- copy(input_dt)
    
    list_samples <- unique(input_anno$sample_id)
    
    message("It starts to merge replicates...")
    
    for (i in 1:length(list_samples)) {
        cat("Processing ", i, " sample: ", list_samples[i], "\n")
        output_dt[, `:=`(paste0("mean_", list_samples[i]), apply(.SD, 1, mean_na)), 
            .SDcols = input_anno[input_anno$sample_id %in% list_samples[i], ]$filename]
    }
    
    message("Done with merging replicates...")
    
    return(output_dt)
    
}


#' @export
generate_iPIS_matrix <- function(input_all_dt, input_semi_dt) {
    
    index_temp <- which(grepl("^mean", names(input_all_dt)))
    
    m_iPIS <- copy(input_all_dt)
    
    # for(i in 1:length(index_temp)) { m_iPIS[, index_temp[i], with=F] <- 1 -
    # input_semi_dt[, index_temp[i], with=F] / input_all_dt[, index_temp[i], with=F]
    # }
    
    m_iPIS[, index_temp] <- 1 - input_semi_dt[, index_temp, with = F]/input_all_dt[, 
        index_temp, with = F]
    
    names(m_iPIS)[index_temp] <- sapply(strsplit(names(m_iPIS)[index_temp], "mean_"), 
        "[[", 2)
    
    index_order <- order(apply(m_iPIS[, index_temp, with = F], 1, function(x) length(which(is.na(x)))))
    
    m_iPIS <- m_iPIS[index_order, ]
    
    # m_iPIS <- m_iPIS[order(apply(m_iPIS[, index_temp, with = F], 1, function(x)
    # length(which(is.na(x))))), ]
    
    return(m_iPIS)
    
}


#' @export
old_calculate_PIN <- function(input_iPIS) {
    
    index_temp <- seq(2, dim(input_iPIS)[2], 1)
    
    m_PIN <- data.frame(sample_id = names(input_iPIS)[index_temp], PIN = apply(input_iPIS[, 
        index_temp, with = F], 2, mean_na), pval = 1)
    
    m_PIN <- m_PIN[order(m_PIN$PIN), ]
    
    # wenguang: re-scale PIN... not sure whether it is a good idea... m_PIN$PIN <-
    # -log2(0.98 - m_PIN$PIN)
    
    for (i in 1:(dim(m_PIN)[1] - 1)) {
        
        if (grepl("lowest", chisq.out.test(m_PIN$PIN[i:dim(m_PIN)[1]], opposite = F)$alternative)) {
            m_PIN[i, ]$pval <- chisq.out.test(m_PIN$PIN[i:dim(m_PIN)[1]], opposite = F)$p.value
        } else {
            m_PIN[i, ]$pval <- chisq.out.test(m_PIN$PIN[i:dim(m_PIN)[1]], opposite = T)$p.value
        }
        
        if (i > 1) {
            if (m_PIN[i, ]$pval < m_PIN[i - 1, ]$pval) {
                m_PIN[i, ]$pval <- m_PIN[i - 1, ]$pval + m_PIN[i, ]$pval * 0.05
                # m_PIN[i, ]$pval <- m_PIN[i, ]$pval*0.6 + m_PIN[i-1, ]$pval * 0.4
            }
        }
        
    }
    
    
    # wenguang: need to be tested, and will be released later...
    
    if (1 == 1) {
        # index_sig <- which(m_PIN$pval < 0.01) index_insig <- which(m_PIN$pval >= 0.01)
        
        index_sig <- which(m_PIN$pval < 0.05)
        index_insig <- which(m_PIN$pval >= 0.05)
        
        for (i in 1:length(index_sig)) {
            if (grepl("lowest", chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)], 
                opposite = F)$alternative)) {
                m_PIN[index_sig[i], ]$pval <- chisq.out.test(m_PIN$PIN[c(index_sig[i], 
                  index_insig)], opposite = F)$p.value
            } else {
                m_PIN[index_sig[i], ]$pval <- chisq.out.test(m_PIN$PIN[c(index_sig[i], 
                  index_insig)], opposite = T)$p.value
            }
            
            
        }
        
        # p.adjust(raw$pval, method='hommel')
        
    }
    
    return(m_PIN)
    
}





#' @export
calculate_PIN <- function(input_iPIS, input_remove_zero_rows) {
    
    index_temp <- seq(2, dim(input_iPIS)[2], 1)
    
    # wenguang: CAUTION!!! This will calculate PINs without proteins detected
    # consistently with only semi-tryptic peptides across all measurements (i.e. the
    # row is full with zero). wenguang: by default it is off. It was only turned on,
    # that yansheng plasma samples were analyzed, as those rows took up a large
    # fraction of total proteins identified.
    if (input_remove_zero_rows == TRUE) {
        input_iPIS <- input_iPIS[-which(apply(input_iPIS[, index_temp, with = F], 
            1, function(x) length(which(x == 0))) == length(index_temp)), ]
    }
    
    m_PIN <- data.frame(sample_id = names(input_iPIS)[index_temp], PIN = apply(input_iPIS[, 
        index_temp, with = F], 2, mean_na), pval = 1)
    
    m_PIN <- m_PIN[order(m_PIN$PIN), ]
    
    fit_weibull <- fitdistr(m_PIN$PIN, densfun = "weibull")
    m_PIN$pval <- pweibull(m_PIN$PIN, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])
    
    length_insig_last <- dim(m_PIN)[1]
    count <- 1
    index_insig <- which(m_PIN$pval >= 0.05)
    
    while (length(index_insig) < length_insig_last) {
        
        cat("Iteration for fitting null distribution: ", count, "\n")
        
        
        # wenguang: note that for error like 'Error in fitdistr(m_PIN[index_insig, ]$PIN,
        # densfun = 'weibull' optimization failed', you may want to try fit_weibull <-
        # fitdistr(m_PIN[index_insig,]$PIN, densfun = 'weibull', lower=c(10, 0.6))
        
        if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull"), silent = T), 
            "try-error")) {
            
            fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull")
            
        } else if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", 
            lower = c(10, 0.6)), silent = T), "try-error")) {
            
            fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", 
                lower = c(10, 0.6))
            
        } else if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", 
            lower = c(1, 0.1)), silent = T), "try-error")) {
            
            fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", 
                lower = c(1, 0.1))
            
        }
        
        m_PIN$pval <- pweibull(m_PIN$PIN, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])
        
        length_insig_last <- length(index_insig)
        count <- count + 1
        
        index_insig <- which(m_PIN$pval >= 0.05)
        # index_insig <- which(m_PIN$pval >= 0.02)
        
    }
    
    # wenguang: need to be tested, and will be released later...
    
    if (1 == 1) {
        # index_sig <- which(m_PIN$pval < 0.02) index_insig <- which(m_PIN$pval >= 0.02)
        
        index_sig <- which(m_PIN$pval < 0.05)
        index_insig <- which(m_PIN$pval >= 0.05)
        
        
        
        for (i in 1:length(index_sig)) {
            
            if (grepl("lowest", chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)], 
                variance = var(m_PIN[index_insig, ]$PIN), opposite = F)$alternative)) {
                m_PIN[index_sig[i], ]$pval <- chisq.out.test(m_PIN$PIN[c(index_sig[i], 
                  index_insig)], variance = var(m_PIN[index_insig, ]$PIN), opposite = F)$p.value
            } else {
                m_PIN[index_sig[i], ]$pval <- chisq.out.test(m_PIN$PIN[c(index_sig[i], 
                  index_insig)], variance = var(m_PIN[index_insig, ]$PIN), opposite = T)$p.value
            }
            
            
        }
        
        # p.adjust(raw$pval, method='hommel')
        
    }
    
    return(m_PIN)
    
}







#' @export
calculate_PIN_new <- function(input_iPIS, input_remove_zero_rows) {
    
    insig_pval <- 0.05
    
    index_temp <- seq(2, dim(input_iPIS)[2], 1)
    
    # wenguang: CAUTION!!! This will calculate PINs without proteins detected
    # consistently with only semi-tryptic peptides across all measurements (i.e. the
    # row is full with zero). wenguang: by default it is off. It was only turned on,
    # that yansheng plasma samples were analyzed, as those rows took up a large
    # fraction of total proteins identified.
    if (input_remove_zero_rows == TRUE) {
        input_iPIS <- input_iPIS[-which(apply(input_iPIS[, index_temp, with = F], 1, function(x) length(which(x == 0))) == length(index_temp)), ]
    }
    
    m_PIN <- data.frame(sample_id = names(input_iPIS)[index_temp], PIN = apply(input_iPIS[, index_temp, with = F], 2, mean_na), pval = 1)
    
    m_PIN <- m_PIN[order(m_PIN$PIN), ]
    
    fit_weibull <- fitdistr(m_PIN$PIN, densfun = "weibull")
    m_PIN$pval <- pweibull(m_PIN$PIN, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])
    
#    cat("first_fit: \n")
#    print( ks.test(m_PIN$PIN, rweibull(n = 10000, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])) )

    length_insig_last <- dim(m_PIN)[1]
    count <- 1
    index_insig <- which(m_PIN$pval >= insig_pval)

#    cat("null_dist: ", m_PIN[index_insig, ]$sample_id, "\n")
    
    best_fit_count <- 1
    best_fit_D <- 0
    best_fit_bool <- FALSE
    
    while (length(index_insig) < length_insig_last) {
        
        cat("Iteration for fitting null distribution: ", count, "\n")
        
        
        # wenguang: note that for error like 'Error in fitdistr(m_PIN[index_insig, ]$PIN,
        # densfun = 'weibull' optimization failed', you may want to try fit_weibull <-
        # fitdistr(m_PIN[index_insig,]$PIN, densfun = 'weibull', lower=c(10, 0.6))
        
        if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull"), silent = T), "try-error")) {
            
            fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull")
            
        } else if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(10, 0.6)), silent = T), "try-error")) {
            
            fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(10, 0.6))
            
        } else if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(1, 0.1)), silent = T), "try-error")) {
            
            fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(1, 0.1))
            
        }
        

        
#        print(ks.test(m_PIN[index_insig, ]$PIN, rweibull(n = 10000, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])))

#cat("null_dist: ", length(index_insig), "\n")
        
        fit_test <- ks.test(m_PIN[index_insig, ]$PIN, rweibull(n = 10000, shape = fit_weibull$estimate[1], 
            scale = fit_weibull$estimate[2]))
        
        if (fit_test$statistic > best_fit_D) {

            best_fit_D <- fit_test$statistic
            best_fit_count <- count
            
            if (fit_test$p.value < 0.01) 
                best_fit_bool <- TRUE
            
        }
        
        m_PIN$pval <- pweibull(m_PIN$PIN, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])
        
        length_insig_last <- length(index_insig)
        
        index_insig <- which(m_PIN$pval >= insig_pval)
        
        count <- count + 1
        
    }
    
    
    if (best_fit_bool == "TRUE") {
        
        cat("It fits best when n = ", best_fit_count, ".\n")
        
        m_PIN <- data.frame(sample_id = names(input_iPIS)[index_temp], PIN = apply(input_iPIS[, index_temp, with = F], 2, mean_na), pval = 1)
        
        m_PIN <- m_PIN[order(m_PIN$PIN), ]
        
        fit_weibull <- fitdistr(m_PIN$PIN, densfun = "weibull")
        m_PIN$pval <- pweibull(m_PIN$PIN, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])
        
        length_insig_last <- dim(m_PIN)[1]
        index_insig <- which(m_PIN$pval >= insig_pval)
        
        reloop_count <- 1
        
        while (reloop_count <= best_fit_count) {
            
#            cat("Iteration for fitting null distribution: ", reloop_count, "\n")
            
            
            # wenguang: note that for error like 'Error in fitdistr(m_PIN[index_insig, ]$PIN,
            # densfun = 'weibull' optimization failed', you may want to try fit_weibull <-
            # fitdistr(m_PIN[index_insig,]$PIN, densfun = 'weibull', lower=c(10, 0.6))
            
            if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull"), silent = T), "try-error")) {
                
                fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull")
                
            } else if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(10, 0.6)), silent = T), "try-error")) {
                
                fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(10, 0.6))
                
            } else if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(1, 0.1)), silent = T), "try-error")) {
                
                fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(1, 0.1))
                
            }
            
#            print(ks.test(m_PIN[index_insig, ]$PIN, rweibull(n = 10000, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])))
#cat("null_dist: ", m_PIN[index_insig, ]$sample_id, "\n")

            m_PIN$pval <- pweibull(m_PIN$PIN, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])
            
            length_insig_last <- length(index_insig)
            
            index_insig <- which(m_PIN$pval >= insig_pval)
            # index_insig <- which(m_PIN$pval >= 0.02)
                        
            fit_test <- ks.test(m_PIN[index_insig, ]$PIN, rweibull(n = 10000, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2]))
            
            reloop_count <- reloop_count + 1
            
        }
        
        
        if (1 == 0) {
            # index_sig <- which(m_PIN$pval < 0.02) index_insig <- which(m_PIN$pval >= 0.02)
            
            index_sig <- which(m_PIN$pval < 0.01)
            index_insig <- which(m_PIN$pval >= 0.01)
            
            for (i in 1:length(index_sig)) {
                
                if (grepl("lowest", chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)], 
                  variance = var(m_PIN[index_insig, ]$PIN), opposite = F)$alternative)) {
                  m_PIN[index_sig[i], ]$pval <- chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)], variance = var(m_PIN[index_insig, ]$PIN), opposite = F)$p.value
                } else {
                  m_PIN[index_sig[i], ]$pval <- chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)], variance = var(m_PIN[index_insig, ]$PIN), opposite = T)$p.value
                }
                
                
            }
            
            # p.adjust(raw$pval, method='hommel')
            
        }
        
        
    } else {
        
        
        m_PIN <- data.frame(sample_id = names(input_iPIS)[index_temp], PIN = apply(input_iPIS[, 
            index_temp, with = F], 2, mean_na), pval = 1)
        
        m_PIN <- m_PIN[order(m_PIN$PIN), ]
        
        fit_weibull <- fitdistr(m_PIN$PIN, densfun = "weibull")
        m_PIN$pval <- pweibull(m_PIN$PIN, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])
        
        length_insig_last <- dim(m_PIN)[1]
        count <- 1
        index_insig <- which(m_PIN$pval >= insig_pval)
        
        while (length(index_insig) < length_insig_last) {
            
            cat("Iteration for fitting null distribution: ", count, "\n")
            
            # wenguang: note that for error like 'Error in fitdistr(m_PIN[index_insig, ]$PIN,
            # densfun = 'weibull' optimization failed', you may want to try fit_weibull <-
            # fitdistr(m_PIN[index_insig,]$PIN, densfun = 'weibull', lower=c(10, 0.6))
            
            if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull"), silent = T), "try-error")) {
                
                fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull")
                
            } else if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(10, 0.6)), silent = T), "try-error")) {
                
                fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(10, 0.6))
                
            } else if (!is(try(fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(1, 0.1)), silent = T), "try-error")) {
                
                fit_weibull <- fitdistr(m_PIN[index_insig, ]$PIN, densfun = "weibull", lower = c(1, 0.1))
                
            }
            
            m_PIN$pval <- pweibull(m_PIN$PIN, shape = fit_weibull$estimate[1], scale = fit_weibull$estimate[2])
            
            length_insig_last <- length(index_insig)
            count <- count + 1
            
            index_insig <- which(m_PIN$pval >= insig_pval)
            # index_insig <- which(m_PIN$pval >= 0.02)
            
        }
        
        # wenguang: need to be tested, and will be released later...
        
        if (1 == 1) {
            # index_sig <- which(m_PIN$pval < 0.02) index_insig <- which(m_PIN$pval >= 0.02)
            
            if (dim(m_PIN)[1] < 80) {
                index_sig <- which(m_PIN$pval < insig_pval)
                index_insig <- which(m_PIN$pval >= insig_pval)
            } else {
                index_sig <- which(m_PIN$pval < 0.015)
                index_insig <- which(m_PIN$pval >= 0.015)
            }
            
            
            if (length(index_sig) > 0) {
                
                for (i in 1:length(index_sig)) {
                  
                  if (grepl("lowest", chisq.out.test(m_PIN$PIN[c(index_sig[i], index_insig)], 
                    variance = var(m_PIN[index_insig, ]$PIN), opposite = F)$alternative)) {
                    m_PIN[index_sig[i], ]$pval <- chisq.out.test(m_PIN$PIN[c(index_sig[i], 
                      index_insig)], variance = var(m_PIN[index_insig, ]$PIN), opposite = F)$p.value
                  } else {
                    m_PIN[index_sig[i], ]$pval <- chisq.out.test(m_PIN$PIN[c(index_sig[i], 
                      index_insig)], variance = var(m_PIN[index_insig, ]$PIN), opposite = T)$p.value
                  }
                  
                  
                }
                
            }
            
            # p.adjust(raw$pval, method='hommel')
            
        }
        
        
        
    }
    
    return(m_PIN)
    
}
ProteomicsTools/PIN documentation built on May 15, 2019, 4:29 p.m.