utils::globalVariables(
c("col_name",
"confident_score",
"confident_site",
"entrez_id",
"gene_symbol",
"id_repeats",
"protein_id",
"ptm_id",
"ptm_peptide",
"ratio_values",
"ri_intensity",
"tmt11_channel",
"vial_label",
"all_vial_labels",
"tmt_plex"))
#' @title check proteomics ratio file
#'
#' @description check whether the proteomics ratio results files is following guidelines
#' @param df_ratio (data.frame) proteomics ratio results data frame (required)
#' @param isPTM (logical) `TRUE` if a PTM results file
#' @param f_proof (logical) `TRUE` (default) to print out distribution and NA plots
#' @param output_prefix (char) if `f_proof = TRUE`, provide a prefix for the output name
#' @param out_qc_folder (char) if `f_proof is TRUE`, a folder path can be provided
#' (otherwise print in current working directory)
#' @param printPDF (logical) if `TRUE` (default print plots to pdf)
#' @param return_n_issues (logical) if `TRUE` returns the number of issues.
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return (int) number of issues identified
#' @examples {
#' test <- check_ratio_proteomics(df_ratio = metadata_metabolites_named,
#' isPTM = TRUE, return_n_issues = TRUE, verbose = FALSE)
#' # "test" should be NULL
#' }
#' @export
check_ratio_proteomics <- function(df_ratio,
isPTM,
f_proof = TRUE,
output_prefix = "ratio-file",
out_qc_folder = NULL,
printPDF = TRUE,
return_n_issues = FALSE,
verbose = TRUE){
vial_label = all_vial_labels = tmt_plex = NULL
ic_rr <- 0
required_columns <- get_required_columns(isPTM = isPTM,
prot_file = "ratio")
if(!all( required_columns %in% colnames(df_ratio) )){
if(verbose) message(" - (-) CRITICAL REQUIRED column(s) missed: ", appendLF = FALSE)
if(verbose) message(paste(required_columns[!required_columns %in% colnames(df_ratio)], collapse = ", "))
ic_rr <- 10
return(ic_rr)
}
if(isPTM){
# The ptm_id must be unique
if( any(duplicated(df_ratio$ptm_id)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) NON UNIQUE ptm_id values ")
if(f_proof){
# Print out redundancies
# red_ids <- df_ratio$ptm_id[duplicated(df_ratio$ptm_id)]
# rii_redundant_ids <- df_ratio[which(df_ratio$ptm_id %in% red_ids),]
# Plot redundancies
ya <- df_ratio %>%
group_by(ptm_id) %>%
dplyr::reframe(id_repeats = n()) %>%
group_by(id_repeats) %>%
dplyr::reframe(n = n()) %>%
ggplot2::ggplot(aes(x = as.factor(id_repeats), y = n, fill = as.factor(id_repeats))) +
geom_bar(stat = "identity") +
theme_linedraw() +
theme(legend.title = element_blank()) +
labs(title = "RATIO RESULTS: Number of redundant ptm_ids",
subtitle = paste(output_prefix))
if(is.null(out_qc_folder)){
out_plot_redundancies <- paste0(output_prefix,"-qc-ratior-redundant-ids.pdf")
}else{
out_plot_redundancies <- file.path(normalizePath(out_qc_folder), paste0(output_prefix,"-qc-ratior-redundant-ids.pdf"))
}
if(printPDF) pdf(out_plot_redundancies, width = 10, height = 8)
print(ya)
if(printPDF) garbage <- dev.off()
}
}else{
if(verbose) message(" + (+) ptm_id unique: ok")
}
if( any(is.na(df_ratio$ptm_id)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some PTM_ID are missed")
}else{
if(verbose) message(" + (+) All PTM_ID available")
}
if( any(is.na(df_ratio$confident_score)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some CONFIDENT_SCORE are missed")
}else{
if(verbose) message(" + (+) All CONFIDENT_SCORE available")
}
if( any(is.na(df_ratio$confident_site)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some CONFIDENT_SITE are missed")
}else{
if(verbose) message(" + (+) All CONFIDENT_SITE available")
}
if( any(is.na(df_ratio$flanking_sequence)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some FLANKING_SEQUENCEs are missed")
}else{
if(verbose) message(" + (+) All FLANKING_SEQUENCEs available")
}
if("ptm_score" %in% colnames(df_ratio)){
if( any(is.na(df_ratio$ptm_score)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some PTM_SCOREs are missed")
}else{
if(verbose) message(" + (+) All PTM_SCOREs available")
}
if(is.numeric(df_ratio$ptm_score)){
if(verbose) message(" + (+) PTM_SCORE is numeric")
}else{
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) PTM_SCORE is NOT NUMERIC")
}
}else{
ic_rr <- ic_rr + 2
if(verbose) message(" - (-) ptm_score column missed")
}
}else{
if( any(duplicated(df_ratio$protein_id)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Duplicated PROTEIN_ID found: FAIL")
}
if("num_peptides" %in% colnames(df_ratio)){
if( any(is.na(df_ratio$num_peptides)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some num_peptides are missed")
}else{
if(verbose) message(" + (+) All num_peptides available")
}
if(is.numeric(df_ratio$num_peptides)){
if(verbose) message(" + (+) num_peptides is numeric")
}else{
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) num_peptides is NOT NUMERIC")
}
}else{
ic_rr <- ic_rr + 2
if(verbose) message(" - (-) num_peptides column missed")
}
if("percent_coverage" %in% colnames(df_ratio)){
if( any(is.na(df_ratio$percent_coverage)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some percent_coverage are missed")
}else{
if(verbose) message(" + (+) All percent_coverage available")
}
if(is.numeric(df_ratio$percent_coverage)){
if(verbose) message(" + (+) percent_coverage is numeric")
}else{
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) percent_coverage is NOT NUMERIC")
}
}else{
ic_rr <- ic_rr + 2
if(verbose) message(" - (-) percent_coverage column missed")
}
if("protein_score" %in% colnames(df_ratio)){
if( any(is.na(df_ratio$protein_score)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some protein_score are missed")
}else{
if(verbose) message(" + (+) All protein_score available")
}
if(is.numeric(df_ratio$protein_score)){
if(verbose) message(" + (+) protein_score is numeric")
}else{
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) protein_score is NOT NUMERIC")
}
}else{
ic_rr <- ic_rr + 2
if(verbose) message(" - (-) protein_score column missed")
}
} # PROTEIN RATIO
if( any(is.na(df_ratio$protein_id)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some PROTEIN_ID are missed")
}else{
if(verbose) message(" + (+) All PROTEIN_ID available")
}
if( any( is.na( df_ratio$gene_symbol[which(df_ratio$is_contaminant == FALSE)] ) ) ){
# ic_rii <- ic_rii + 1
if(verbose) message(paste(" - (!) GENE_SYMBOL values are missed for",
length( df_ratio$protein_id[which(is.na( df_ratio$gene_symbol[which(df_ratio$is_contaminant == FALSE)] ))] ),
"protein ids"))
}else{
if(verbose) message(" + (+) All GENE_SYMBOL available")
}
if( any( is.na( df_ratio$entrez_id[which(df_ratio$is_contaminant == FALSE)] ) ) ){
# ic_rii <- ic_rii + 1
if(verbose) message(paste(" - (!) ENTREZ_ID values are missed for",
length(df_ratio$protein_id[which(is.na( df_ratio$entrez_id[which(df_ratio$is_contaminant == FALSE)] ))]),
"protein ids"))
}else{
if(verbose) message(" + (+) All ENTREZ_ID available")
}
if( any(is.na(df_ratio$redundant_ids)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some redundant_ids are missed")
}else{
if(verbose) message(" + (+) All redundant_ids available")
}
if( any(is.na(df_ratio$organism_name)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some organism_name are missed")
}else{
if(verbose) message(" + (+) All organism_name available")
}
if("is_contaminant" %in% colnames(df_ratio)){
if(is.logical(df_ratio$is_contaminant)){
if(verbose) message(" + (+) All is_contaminant available")
}else{
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) is_contaminant is not logical (TRUE/FALSE)")
}
if( any(is.na(df_ratio$is_contaminant)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some is_contaminant are missed")
}else{
if(verbose) message(" + (+) All is_contaminant available")
}
}
if( any(is.na(df_ratio$is_contaminant)) ){
ic_rr <- ic_rr + 1
if(verbose) message(" - (-) Some organism_name are missed")
}else{
if(verbose) message(" + (+) All organism_name available")
}
if(is.null(ic_rr) ){
ic_rr <- 0
}
if(return_n_issues) return(ic_rr)
}
#' @title check proteomics reported ion intensity file
#'
#' @description check whether the proteomics rri results files is following guidelines
#' @param df_rri (data.frame) proteomics rri data frame (required)
#' @param isPTM (logical) `TRUE` if a PTM results file
#' @param f_proof (logical) `TRUE` (default) to print out distribution and NA plots
#' @param output_prefix (char) if `f_proof = TRUE`, provide a prefix for the output name
#' @param out_qc_folder (char) if `f_proof is TRUE`, a folder path can be provided
#' (otherwise print in current working directory)
#' @param printPDF (logical) if `TRUE` (default print plots to pdf)
#' @param return_n_issues (logical) if `TRUE` returns the number of issues.
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return (int) number of issues identified
#' @examples {
#' test <- check_rii_proteomics(df_rri = metadata_metabolites_named,
#' isPTM = TRUE, return_n_issues = TRUE, verbose = FALSE)
#' # "test" should be NULL
#' }
#' @export
check_rii_proteomics <- function(df_rri,
isPTM,
f_proof = TRUE,
output_prefix = "rii-file",
out_qc_folder = NULL,
return_n_issues = FALSE,
printPDF = TRUE,
verbose = TRUE){
ic_rii <- 0
required_columns <- get_required_columns(isPTM = isPTM,
prot_file = "rii")
if(!all( required_columns %in% colnames(df_rri) )){
ic_rii <- 10
if(verbose) message(" - (-) Error! Critical Required column(s) not found: ", appendLF = FALSE)
if(verbose) message(paste(required_columns[!required_columns %in% colnames(df_rri)], collapse = ", "))
return(ic_rii)
}
if(dim(df_rri)[2] > 400){
f_proof <- FALSE
}
if(isPTM){
# The ptm_id must be unique
if( any(duplicated(df_rri$ptm_peptide)) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: NON UNIQUE PTM_PEPTIDE values")
if(f_proof){
# Print out redundancies
# red_ids <- df_rri$ptm_peptide[duplicated(df_rri$ptm_peptide)]
# rii_redundant_ids <- df_rri[which(df_rri$ptm_peptide %in% red_ids),]
# Plot redundancies
df <- df_rri %>%
group_by(ptm_peptide) %>%
dplyr::reframe(id_repeats = n()) %>%
group_by(id_repeats) %>%
dplyr::reframe(n = n())
xa <- ggplot2::ggplot(df, aes(x = as.factor(id_repeats), y = n, fill = as.factor(id_repeats))) +
geom_bar(stat = "identity") +
theme_linedraw() +
theme(legend.title = element_blank()) +
labs(title = "RII: Number of redundant ptm_peptides",
subtitle = paste(output_prefix))
if(verbose) message(" - (p) Plot redundant values")
if(is.null(out_qc_folder)){
out_plot_redundancies <- paste0(output_prefix,"-qc-redundant-ids.pdf")
}else{
out_plot_redundancies <- file.path(normalizePath(out_qc_folder), paste0(output_prefix,"-qc-redundant-ids.pdf"))
}
if(printPDF) pdf(out_plot_redundancies, width = 10, height = 8)
print(xa)
if(printPDF) garbage <- dev.off()
}
}
if( any(is.na(df_rri$ptm_id)) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: Some PTM_ID values are missed")
}else{
if(verbose) message(" + (+) All PTM_ID available")
}
if("confident_score" %in% colnames(df_rri)){
if(is.numeric(df_rri$confident_score)){
if(verbose) message(" + (+) All CONFIDENT_SCORE are numeric")
}else{
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: CONFIDENT_SCORE is not numeric!")
}
if( any(is.na(df_rri$confident_score)) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: Some CONFIDENT_SCORE values are missed")
}else{
if(verbose) message(" + (+) All CONFIDENT_SCORE available")
}
}
if("confident_site" %in% colnames(df_rri)){
if( any(is.na(df_rri$confident_site)) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: Some CONFIDENT_SITE are missed")
}else{
if(verbose) message(" + (+) All CONFIDENT_SITE available")
}
if(is.logical(df_rri$confident_site)){
if(verbose) message(" + (+) confident_site is a logical variable")
}else{
ic_rii <- ic_rii + 1
if(verbose) message(" + (+) confident_site is NOT a logical variable (all values should be TRUE/FALSE)")
}
}else{
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: confident_site column is not available")
}
# isPTM == TRUE
}else{
df_rri$protein_sequence <- paste0(df_rri$protein_id,"-", df_rri$sequence)
if( any( duplicated(df_rri$protein_sequence) ) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: Duplicated Protein + Sequence identified")
}else{
if(verbose) message(" + (+) All Protein + Sequence are unique")
}
df_rri$protein_sequence <- NULL
}
if( any(is.na(df_rri$protein_id)) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: (some) PROTEIN_IDs missed")
}else{
if(verbose) message(" + (+) All PROTEIN_ID values available")
}
if( any( is.na( df_rri$gene_symbol[which(df_rri$is_contaminant == FALSE)] ) ) ){
# ic_rii <- ic_rii + 1
if(verbose) message(paste(" - (!) GENE_SYMBOL values are missed for",
length(df_rri$protein_id[which(is.na( df_rri$gene_symbol[which(df_rri$is_contaminant == FALSE)] ))]),
"protein ids"))
}else{
if(verbose) message(" + (+) All GENE_SYMBOL ids available")
}
if( any( is.na( df_rri$entrez_id[which(df_rri$is_contaminant == FALSE)] ) ) ){
# ic_rii <- ic_rii + 1
if(verbose) message(paste(" - (!) ENTREZ_ID values are missed for",
length(df_rri$protein_id[which(is.na( df_rri$entrez_id[which(df_rri$is_contaminant == FALSE)] ))]),
"protein ids"))
}else{
if(verbose) message(" + (+) All ENTREZ_ID ids available")
}
if( any(is.na(df_rri$redundant_ids)) ){
# ic_rii <- ic_rii + 1
if(verbose) message(" - (!) Some REDUNDANT_IDS values are missed (it should be fine)")
}else{
if(verbose) message(" + (+) All REDUNDANT_IDS values available")
}
if( any(is.na(df_rri$sequence)) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: Some SEQUENCE values are missed")
}else{
if(verbose) message(" + (+) All SEQUENCE values available")
}
if("is_contaminant" %in% colnames(df_rri)){
if( any(is.na(df_rri$is_contaminant)) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: Some IS_CONTAMINANT values are missed")
}else{
if(verbose) message(" + (+) All is_contaminant values available")
}
if(is.logical(df_rri$is_contaminant)){
if(verbose) message(" + (+) IS_CONTAMINANT is a logical variable")
}else{
ic_rii <- ic_rii + 1
if(verbose) message(" + (+) IS_CONTAMINANT is NOT a logical variable (all values should be TRUE/FALSE)")
}
}else{
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: IS_CONTAMINANT column is not available")
}
if("peptide_score" %in% colnames(df_rri)){
if( any(is.na(df_rri$peptide_score)) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: Some PEPTIDE_SCORE values are missed")
}else{
if(verbose) message(" + (+) All PEPTIDE_SCORE values available")
}
if(is.numeric(df_rri$peptide_score)){
if(verbose) message(" + (+) All PEPTIDE_SCORE values are numeric")
}else{
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: PEPTIDE_SCORE is not numeric!")
}
}
if( any(is.na(df_rri$organism_name)) ){
ic_rii <- ic_rii + 1
if(verbose) message(" - (-) ERROR: Some ORGANISM_NAME values are missed")
}else{
if(verbose) message(" + (+) All ORGANISM_NAME values available")
}
if( is.null(ic_rii) ){
ic_rii <- 0
}
if(return_n_issues) return(ic_rii)
}
#' @title check proteomics vial metadata file
#'
#' @description check whether the proteomics rri results files is following guidelines
#' @param df_vm (data.frame) proteomics vial_label data frame (required)
#' @param return_n_issues (logical) if `TRUE` returns the number of issues.
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return (int) number of issues identified
#' @examples {
#' test <- check_vial_metadata_proteomics(df_vm = metadata_metabolites_named,
#' return_n_issues = TRUE, verbose = FALSE)
#' # "test" should be NULL
#' }
#' @export
check_vial_metadata_proteomics <- function(df_vm,
return_n_issues = FALSE,
verbose = TRUE){
ic_vm <- NULL
tmt_channel <- NULL
if("tmt11_channel" %in% colnames(df_vm)){
required_columns <- c("vial_label", "tmt_plex", "tmt11_channel")
tmt_channel <- "tmt11_channel"
}else if("tmt16_channel" %in% colnames(df_vm)){
required_columns <- c("vial_label", "tmt_plex", "tmt16_channel")
tmt_channel <- "tmt16_channel"
}else if("tmt18_channel" %in% colnames(df_vm)){
required_columns <- c("vial_label", "tmt_plex", "tmt18_channel")
tmt_channel <- "tmt18_channel"
}else{
if(verbose) message(" - (-) `tmt[11|16|18]_channel` column not found")
ic_vm <- 3
return(ic_vm)
}
if(!all( required_columns %in% colnames(df_vm))){
if(verbose) message(" - (-) CRITICAL column(s) missed: ", appendLF = FALSE)
if(verbose) message(paste(required_columns[!required_columns %in% colnames(df_vm)], collapse = ", "))
ic_vm <- 3
return(ic_vm)
}
if("tmt11_channel" %in% colnames(df_vm)){
df_vm$vial_label <- gsub(" ", "", df_vm$vial_label)
valid_channels <- c("126C", "127N", "127C", "128N", "128C", "129N", "129C", "130N", "130C", "131N", "131C")
}else if("tmt16_channel" %in% colnames(df_vm)){
df_vm$vial_label <- gsub(" ", "", df_vm$vial_label)
valid_channels <- c("126C", "127N", "127C", "128N", "128C", "129N", "129C", "130N", "130C", "131N", "131C", "132N", "132C", "133N", "133C", "134N")
}else if("tmt18_channel" %in% colnames(df_vm)){
df_vm$vial_label <- gsub(" ", "", df_vm$vial_label)
valid_channels <- c("126C", "127N", "127C", "128N", "128C", "129N", "129C", "130N", "130C", "131N", "131C", "132N", "132C", "133N", "133C", "134N")
}
ic_vm <- 0
plexes <- unique(df_vm$tmt_plex)
for(p in plexes){
temp_plex <- df_vm[which(df_vm$tmt_plex == p),]
if( all(valid_channels %in% temp_plex[[tmt_channel]]) ){
if(verbose) message(" + (+) All ", tmt_channel," channels are valid in plex ", paste(p))
}else{
if(tmt_channel == "tmt11_channel"){
# Broad exception: they removed one plexed due to some issues. This is an exception to handle it
valid_channels <- c("126C", "127N", "127C", "128N", "128C", "129N", "129C", "130N", "131N", "131C")
if(all(valid_channels %in% temp_plex$tmt11_channel)){
if(verbose) message(" + ( ) Channel 130C missed in ", paste(p))
ic_vm <- ic_vm + 1
}else{
if(verbose) message(" - (-) TMT Channels are missed in ", paste(p))
ic_vm <- ic_vm + 1
}
}
}
}
# Check that all plexes have one channel
for(p in plexes){
temp_plex <- df_vm[which(df_vm$tmt_plex == p),]
if( sum(grepl("Ref", temp_plex$vial_label)) == 1 ){
if(verbose) message(" + (+) Ref channel found in plex ", paste(p))
}else if(sum(grepl("Ref", temp_plex$vial_label)) == 0) {
if(verbose) message(" - (-) Ref channel missed in plex ", paste(p))
ic_vm <- ic_vm + 1
}else if(sum(grepl("Ref", temp_plex$vial_label)) > 1) {
if(verbose) message(" - (-) Too many reference channels in plex ", paste(p))
if(verbose){
print(temp_plex[grepl("Ref", temp_plex$vial_label), ])
}
ic_vm <- ic_vm + 1
}
}
if( any(duplicated(df_vm$vial_label)) ){
if(verbose) message(" - (-) VIAL_METADATA: duplications detected in the `vial_label` column: FAIL")
ic_vm <- ic_vm + 1
}
if(is.null(ic_vm)){
ic_vm <- 0
}
if(return_n_issues) return(ic_vm)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Load Proteomics batch
#'
#' @description Load a proteomics batch
#' @param input_results_folder (char) path to the PROCESSED folder to check
#' @param isPTM (logical) `TRUE` if it is Post-Translational-Modification proteomics assay
#' @param verbose (logical) `TRUE` (default) prints QC details.
#' @return (List) of data frames: rii,
#' @export
load_proteomics <- function(input_results_folder,
isPTM,
verbose = TRUE){
if(any(missing(input_results_folder) |
missing(isPTM)))
stop("One (or many) of the required arguments missed.
Please, check the help for this function to find out more")
# Validate folder structure-----
processfolder <- validate_processFolder(input_results_folder)
phase <- validate_phase(input_results_folder)
if(verbose) message("# LOAD PROTEOMICS BATCH\n\n")
if(verbose) message(paste(processfolder))
# COUNTS----
ic <- 0 # Critical issues
ic_rii <- NA # RII issues
ic_rr <- NA # ratio results
ic_vm <- NA # vial metadata
ic_man <- NA # namifiest
# VIAL METADATA-----
if(verbose) message("\n## VIAL METADATA\n")
lista <- open_file(input_results_folder = input_results_folder,
filepattern = "vial_metadata.txt",
verbose = verbose)
f_vm <- lista$flag
all_vial_labels <- NA
if(f_vm){
v_m <- lista$df
v_m <- filter_required_columns(df = v_m,
type = "v_m",
verbose = verbose)
# Remove space between "Ref_ A"
# v_m$vial_label <- gsub(" ", "", v_m$vial_label)
ic_vm <- check_vial_metadata_proteomics(df_vm = v_m,
return_n_issues = TRUE,
verbose = verbose)
if(is.null(ic_vm)){
f_vm <- FALSE
ic <- ic + 1
ic_vm <- NA
}else{
if( any( grepl("Ref", v_m$vial_label) ) ){
all_samples <- v_m$vial_label
all_vial_labels <- all_samples[!grepl('^Ref', all_samples)]
}else{
if(verbose) message(" - (-) Ref channels not found in vial_metadata")
ic_vm <- ic_vm + 1
}
}
}else{
if(verbose) message(" - (-) {vial_metadata} not available")
ic <- ic + 1
}
# REPORTED ION INTENSITY -----
if(verbose) message("\n## REPORTED ION INTENSITY\n")
if(verbose) message(" + Loading the file (might take some time)")
lista <- open_file(input_results_folder = input_results_folder,
filepattern = "results_RII-peptide.txt",
verbose = verbose)
f_rii <- lista$flag
if(f_rii){
peprii <- lista$df
ic_rii <- check_rii_proteomics(df_rri = peprii,
isPTM = isPTM,
verbose = verbose)
# Rename
if( !is.null(all_samples) ){
required_columns <- get_required_columns(isPTM = isPTM,
prot_file = "rii")
required_columns <- c(required_columns, sort(all_samples))
if( all(required_columns %in% colnames(peprii)) ){
peprii <- subset(peprii, select = required_columns)
}else{
what_is_missed <- required_columns[!(required_columns %in% colnames(peprii))]
stop("RII: required columns from vial_label are not available in RII file. \nMissed columns: ", paste(what_is_missed, collapse = ","))
}
}
}else{
if(verbose) message(" - (-) {results_RII-peptide} file not available")
ic <- ic + 10
}
# RATIO----
if(verbose) message("\n\n## RATIO results\n")
lista <- open_file(input_results_folder = input_results_folder,
filepattern = "results_ratio.txt",
verbose = verbose)
f_rr <- lista$flag
if(f_rr){
ratior <- lista$df
ic_rr <- check_ratio_proteomics(df_ratio = ratior,
isPTM = isPTM,
verbose = verbose)
if( !is.null(all_vial_labels) ){
required_columns <- get_required_columns(isPTM = isPTM,
prot_file = "ratio")
required_columns <- c(required_columns, all_vial_labels)
if( all(required_columns %in% colnames(ratior)) ){
ratior <- subset(ratior, select = required_columns)
}else{
stop("RATIO: required columns from vial_label are not available in RATIO file")
}
}
}else{
if(verbose) message(" - (-) {results_ratio.txt} file not available")
ic <- ic + 10
}
if(f_rr & f_rii & f_vm){
list_df <- list ("peprii" = peprii,
"v_m" = v_m,
"ratior" = ratior,
"phase" = phase)
return(list_df)
}else{
stop("PROBLEMS LOADING PROTEOMICS BATCH")
}
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title Validate a Proteomics submissions
#'
#' @description Validate a Proteomics submission
#' @param input_results_folder (char) path to the PROCESSED folder to check
#' @param isPTM (logical) `TRUE` if it is Post-Translational-Modification proteomics assay
#' @param cas (char) CAS code
#' @param dmaqc_shipping_info (char) DMAQC file with shipping information. If not provided
#' (default), then DMQAC validation is not peformed (only done at the BIC)
#' @param dmaqc_phase2validate (char) Provide phase to validate. This argument
#' is not required since it should be extracted from the input folder or from the
#' new required file `metadata_phase.txt`. However, if this argument is provided,
#' it will take priority (and the phase from the input folder and the
#' `metadata_phase.txt` will be ignored). Examples
#' - Folder with `PASS1A-06`: type either `PASS1A-06` or leave it `NULL`
#' - Both `PASS1A-06` and `PASS1C-06`: type `PASS1A-06|PASS1C-06`
#' - Only `PASS1C-06`: type `PASS1C-06`
#' @param f_proof (char) print out pdf with charts including:
#' - Reported Ion Intensity boxplot distribution and percentage of NA values per sample
#' - Ratio: ratio boxplot distribution and percentage of NA values per samples
#' @param out_qc_folder (char) if f_proof is TRUE, then a folder must be provided
#' @param return_n_issues (logical) if `TRUE` (default) returns the number of issues
#' @param full_report (logical) if `FALSE` (default) it returns only the
#' total number of issues. If `TRUE` returns the details of the number of issues (by
#' group of files, e.g., results, metadata_metabolites, etc)
#' @param printPDF (logical) if `TRUE` (default print plots to pdf)
#' @param check_only_results (logical) if `TRUE`, only validates results (default `FALSE`)
#' @param verbose (logical) `TRUE` (default) prints QC details.
#' @return (data.frame) Summary of issues
#' @export
validate_proteomics <- function(input_results_folder,
isPTM,
cas,
dmaqc_shipping_info = NULL,
dmaqc_phase2validate = FALSE,
f_proof = FALSE,
out_qc_folder = NULL,
return_n_issues = TRUE,
full_report = FALSE,
printPDF = TRUE,
verbose = TRUE,
check_only_results = FALSE){
percent_coverage <- NULL
if(any(missing(input_results_folder) |
missing(isPTM) |
missing(cas)))
stop("One (or many) of the required arguments missed.
Please, check the help for this function to find out more")
if(dir.exists(input_results_folder)){
input_results_folder <- normalizePath(input_results_folder, winslash = "/")
}
# Validate folder structure-----
validate_cas(cas = cas)
processfolder <- validate_processFolder(input_results_folder)
assay <- validate_assay(input_results_folder)
phase <- validate_phase(input_results_folder)
tissue_code <- validate_tissue(input_results_folder)
batch_folder <- validate_batch(input_results_folder)
if(verbose) message("# PROTEOMICS QC report\n\n")
# Set phase-----
dmaqc_phase2validate <- set_phase(input_results_folder = input_results_folder,
dmaqc_phase2validate = dmaqc_phase2validate)
phase2file <- gsub("\\|", "_", dmaqc_phase2validate)
# Print out proofs----
if(f_proof){
if(!is.null(out_qc_folder)){
if(!dir.exists(file.path(out_qc_folder))){
dir.create(file.path(out_qc_folder), recursive = TRUE)
}
}else{
out_qc_folder <- getwd()
}
output_prefix <- paste0(cas, ".", tolower(phase2file), ".", tissue_code, ".",tolower(assay), ".", tolower(processfolder))
}
input_folder_short <- regmatches(input_results_folder, regexpr("(PASS|HUMAN).*RESULTS_[0-9]{8}", input_results_folder))
if( purrr::is_empty(input_folder_short) ){
if(verbose) message("\nThe RESULTS_YYYYMMDD folder full path is not correct. Example:")
if(verbose) message("/full/path/to/folder/PASS1A-06/T66/RPNEG/BATCH1_20190822/RESULTS_20200308")
stop("Input folder not according to guidelines")
}
if(verbose) message("+ Site: ", toupper(cas))
if(verbose) message("+ Folder: `", paste0(input_folder_short),"`")
# COUNTS----
ic <- 0 # Critical issues
ic_rii <- NA # RII issues
ic_rr <- NA # ratio results
ic_vm <- NA # vial metadata
ic_man <- 0 # manifest
if(missing(dmaqc_shipping_info)){
dmaqc_shipping_info = NULL
}
if(is.null(dmaqc_shipping_info)){
ic_vl <- "missed"
} else{
ic_vl <- NA
}
is_mp <- check_metadata_phase_file(input_results_folder = input_results_folder,
verbose = verbose)
if(!is_mp){
ic <- ic + 1
}
# VIAL METADATA-----
if(verbose) message("\n## VIAL METADATA\n")
lista <- open_file(input_results_folder = input_results_folder,
filepattern = "vial_metadata.txt",
verbose = verbose)
f_vm <- lista$flag
all_vial_labels <- NULL
if(f_vm){
v_m <- lista$df
v_m <- filter_required_columns(df = v_m,
type = "v_m",
verbose = verbose)
ic_vm <- check_vial_metadata_proteomics(df_vm = v_m,
return_n_issues = TRUE,
verbose = verbose)
if(is.null(ic_vm)){
f_vm <- FALSE
ic <- ic + 1
ic_vm <- NA
}else{
if( any( grepl("Ref", v_m$vial_label) ) ){
all_samples <- v_m$vial_label
all_vial_labels <- all_samples[!grepl('^Ref', all_samples)]
}else{
if(verbose) message(" - (-) Ref channels not found in vial_metadata")
ic_vm <- ic_vm + 1
}
}
}else{
if(verbose) message(" - (-) {vial_metadata} not available")
ic <- ic + 1
}
# REPORTED ION INTENSITY -----
if(verbose) message("\n## REPORTED ION INTENSITY\n")
if(verbose) message(" + Loading the file (might take some time)")
lista <- open_file(input_results_folder = input_results_folder,
filepattern = "results_RII-peptide.txt",
verbose = verbose)
f_rii <- lista$flag
if(f_rii){
peprii <- lista$df
ic_rii <- check_rii_proteomics(df_rri = peprii,
isPTM = isPTM,
f_proof = f_proof,
output_prefix = output_prefix,
return_n_issues = TRUE,
out_qc_folder = out_qc_folder,
printPDF = printPDF,
verbose = verbose)
if( is.null(ic_rii) ){
f_rii <- FALSE
ic_rii <- NA
ic <- ic + 1
}else{
if(f_proof & f_vm){
# print proof function here
# Only if vial_label metadata is available
if(dim(peprii)[2] < 400){
proteomics_plots_rii(all_vial_labels = all_vial_labels,
all_samples = all_samples,
peprii = peprii,
isPTM = isPTM,
v_m = v_m,
out_qc_folder = out_qc_folder,
output_prefix = output_prefix,
printPDF = printPDF,
verbose = verbose)
}else{
if(verbose) message (" - ( ) The file is too large to generate plots")
}
} # print plots
}
}else{
if(verbose) message(" - (-) {results_RII-peptide} file not available")
ic <- ic + 10
}
# RATIO----
if(verbose) message("\n\n## RATIO results\n")
lista <- open_file(input_results_folder = input_results_folder,
filepattern = "results_ratio.txt",
verbose = verbose)
f_rr <- lista$flag
if(f_rr){
ratior <- lista$df
ic_rr <- check_ratio_proteomics(df_ratio = ratior,
isPTM = isPTM,
f_proof = f_proof,
output_prefix = output_prefix,
out_qc_folder = out_qc_folder,
printPDF = printPDF,
return_n_issues = TRUE,
verbose = verbose)
if(is.null(ic_rr)){
f_rr <- FALSE
ic <- ic + 1
ic_rr <- NA
}else{
# Plot distributions
if(f_proof){
if(verbose) message(" + (+) PLOTS RATIO------------------")
# Get the total number of samples to customize the plots. If larger than 200,
# prepare for large plots
sn <- length(unique(all_samples))
# Set a limit for which remove labels
sn_limit <- 200
if( !is.null(all_vial_labels) ){
required_columns <- get_required_columns(isPTM = isPTM,
prot_file = "ratio")
required_columns <- c(required_columns, all_vial_labels)
if( all(required_columns %in% colnames(ratior)) ){
if(isPTM){
r_c <- c("ptm_id", all_vial_labels)
fratior <- subset(ratior, select = r_c)
ratior_long <- fratior %>% tidyr::pivot_longer(cols = -c(ptm_id),
names_to = "vial_label",
values_to = "ratio_values")
}else{
r_c <- c("protein_id", all_vial_labels)
fratior <- subset(ratior, select = r_c)
ratior_long <- fratior %>% tidyr::pivot_longer(cols = -c(protein_id),
names_to = "vial_label",
values_to = "ratio_values")
}
if(f_vm){
# Plotting ratio distributions-----
if(verbose) message(" - (p) Plotting ratio distributions")
ratior_long <- merge(v_m, ratior_long, by = c("vial_label"))
pisr <- ggplot2::ggplot(ratior_long,
aes(x = reorder(vial_label, ratio_values, FUN = median, na.rm = TRUE),
y = ratio_values,
fill = tmt_plex)) +
geom_boxplot(na.rm = TRUE) +
theme_linedraw() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.5,
size = 8)) + #legend.position = "none"
labs(x = "vial_label", y = "ratio_values") +
labs(title = "Ratio",
subtitle = output_prefix)
# Plotting NA values-----
if(verbose) message(" - (p) Plotting NA percentage in ratio results")
p_na_ratior <- ratior[required_columns] %>%
inspectdf::inspect_na() %>%
dplyr::arrange(match(col_name, colnames(ratior))) %>%
inspectdf::show_plot() +
ylim(0, 100) + theme_linedraw() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.5,
size = 8))
# Plot unique IDs-----
if(verbose) message(" - (p) Plotting unique ids in ratio")
if(isPTM){
key_id <- "ptm_id"
}else{
key_id <- "protein_id"
}
uid <- ratior_long %>%
group_by(across(all_of(c(key_id, "vial_label", "tmt_plex")))) %>%
dplyr::reframe(total_rii = ratio_values)
uid2 <- uid[which(!is.na(uid$total_rii)),]
uid3 <- unique(uid2[c(key_id, "vial_label", "tmt_plex")]) %>%
count(vial_label, tmt_plex)
puid1 <- ggplot(uid3, aes(x = reorder(vial_label, n), y = n, fill = tmt_plex)) +
geom_bar(stat = "identity") +
theme_linedraw() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 0.5,
size = 8
),
legend.position = "none"
) +
geom_text(
aes(label = n),
# vjust = -0.5,
hjust = 1,
size = 2.7,
angle = 90
) +
ggtitle("Ratio: Unique IDs in samples") +
facet_wrap(~ tmt_plex, scales = "free") +
xlab("Vial Labels")
puid2 <- ggplot(uid3, aes(x = reorder(vial_label, n), y = n, fill = tmt_plex)) +
geom_bar(stat = "identity",
na.rm = TRUE) +
theme_linedraw() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 0.5,
size = 8
)
) +
geom_text(
aes(label = n),
# vjust = -0.5,
hjust = 1,
size = 2.7,
angle = 90
) +
ggtitle("Ratio: Unique IDs in samples") +
xlab("Vial Labels")
# Plotting protein coverage-----
if(!isPTM){
if(verbose) message(" - (p) Plotting protein coverage")
ppp <- ggplot(ratior, aes(x=percent_coverage)) +
geom_histogram( binwidth=2, fill="#69b3a2", color="#e9ecef", alpha=0.9) +
ggtitle("Overall Protein Identification Coverage") +
theme_light() +
theme(plot.title = element_text(size=14)) +
labs(x = "Percent Coverage", y = "Number of Proteins")
pbp <- ggplot(ratior, aes(y = percent_coverage)) +
geom_boxplot(col="#69b3a2") +
ggtitle("") +
theme_light() +
theme(plot.title = element_text(size=14)) +
theme(axis.title.x=element_blank())
}
# + scale_fill_brewer(palette="Reds")
if(is.null(out_qc_folder)){
out_plot_ratdist <- paste0(output_prefix,"-qc-ratio-distribution.pdf")
}else{
out_plot_ratdist <- file.path(normalizePath(out_qc_folder), paste0(output_prefix,"-qc-ratio-distribution.pdf"))
}
if(printPDF){
if(sn > 800){
pdf(out_plot_ratdist, width = 40, height = 12)
}else if(sn <= 800 & sn > 200){
pdf(out_plot_ratdist, width = 32, height = 10)
}else{
pdf(out_plot_ratdist, width = 14, height = 8)
}
}
print(pisr)
print(puid1)
print(puid2)
print(p_na_ratior)
if(!isPTM) grid.arrange(arrangeGrob(ppp, pbp, ncol = 2))
if(printPDF) garbage <- dev.off()
}
}else{
if(verbose) message(" - (-) Required columns are missed in ratio file (based on vial_label). Print proof is not possible")
ic_rr <- ic_rr + 1
}
}else{
if(verbose) message(" - (-) Vial labels are NULL: not possible to generate proof")
ic_rr <- ic_rr + 1
}
}# if proof
} #print plots
}else{
if(verbose) message(" - (-) {results_ratio.txt} file not available")
ic <- ic + 10
}
# MANIFEST----
if(check_only_results){
f_man <- FALSE
ic_man <- 0
}else{
if(verbose) message("\n## MANIFEST\n")
file_manifest <- list.files(normalizePath(batch_folder),
pattern="file_manifest",
ignore.case = TRUE,
full.names=TRUE,
recursive = TRUE)
if(length(file_manifest) == 0){
f_man <- FALSE
ic_man <- ic_man + 1
}else if(length(file_manifest) >= 1){
file_manifest <- file_manifest[length(file_manifest)]
f_man <- TRUE
}
if(f_man){
# load all files in manifest:
f_list = list()
for (f in file_manifest ){
f_list[[f]] = read.csv(f)
f_list[[f]]$file <- f
}
manifest <- bind_rows(f_list)
manifest$file_name <- basename(manifest$file_name)
mani_columns <- c("file_name", "md5")
if( all(mani_columns %in% colnames(manifest)) ){
if(verbose) message(" + (+) <file_name, md5> columns available in manifest file")
if(f_rr){
ratio_file <- manifest$file_name[grepl("ratio.txt", manifest$file_name)]
if( length(ratio_file) == 1 ){
if( file.exists(file.path(input_results_folder, ratio_file)) ){
if(verbose) message(" + (+) RATIO file included")
}else{
if(verbose) message(" - (-) The ratio file name provided in manifest is not found in folder")
ic_man <- ic_man + 1
}
}else if (length(ratio_file) > 1){
if(verbose) message(" - (-) More than one ratio.txt files available in the folder")
ic_man <- ic_man + 1
}else{
# This should never happen
if(verbose) message(" - (-) No ratio.txt file available in the manifest file")
ic_man <- ic_man + 1
}
}
if(f_rii){
rii_file <- manifest$file_name[grepl("RII-peptide.txt", manifest$file_name, ignore.case = TRUE)]
if(length(rii_file) == 1){
if(file.exists(file.path(input_results_folder, rii_file))){
if(verbose) message(" + (+) RII file included")
}else{
if(verbose) message(" - (-) RII file name provided in manifest is not found in folder")
ic_man <- ic_man + 1
}
}else if (length(rii_file) > 1){
if(verbose) message(" - (-) More than one RII-peptide.txt files available in the folder")
ic_man <- ic_man + 1
}else{
# This should never happen
if(verbose) message(" - (-) No RII-peptide.txt available in the manifest file")
ic_man <- ic_man + 1
}
}
if(f_vm){
vm_file <- manifest$file_name[grepl("vial_metadata", manifest$file_name, ignore.case = TRUE)]
if( length(vm_file) == 1 ){
if(file.exists(file.path(input_results_folder, vm_file))){
if(verbose) message(" + (+) VIAL_METADATA file included")
}else{
if(verbose) message(" - (-) vial_metadata name provided in manifest is not found in folder")
ic_man <- ic_man + 1
}
}else if( length(vm_file) > 1 ) {
if(verbose) message(" - (-) VIAL_METADATA file is not included in manifest file")
ic_man <- ic_man + 1
}else{
if(verbose) message(" - (-) VIAL_METADATA file is not included in manifest file")
ic_man <- ic_man + 1
}
}
if( any(is.na(manifest$md5)) ){
if(verbose) message(" - (-) MD5 column contains NA values")
ic_man <- ic_man + 1
}
}else{
if(verbose) message(" - (-) MANIFEST ERROR: Missing columns (must contain <file_name> and <md5>)")
ic_man <- ic_man + 1
ic <- ic + 1
}
}else{
message(" - (-) ERROR: manifest file not found")
ic_man = 6
}
} #check_only_results?
if(ic_man > 0){
ic <- ic + ic_man
}
# INTER-FILE VALIDATION----
if(verbose) message("\n## INTER-file validations\n")
if(f_vm & f_rii & f_rr){
if( all(all_vial_labels %in% colnames(peprii)) ){
if(verbose) message(" + (+) All vial labels samples available in RII file")
}else{
ic <- ic + 1
if(verbose) message(" - (-) CRITICAL: Some vial labels are not available in RII file: FAIL")
}
if( all(all_vial_labels %in% colnames(ratior)) ){
if(verbose) message(" + (+) All vial labels available in RATIO results file")
}else{
ic <- ic + 1
if(verbose) message(" - (-) CRITICAL: Some vial labels not available in RATIO results file: FAIL")
}
}else{
if(verbose) message(" - (-) CRITICAL: One or many of the files are not available: FAIL")
ic <- ic + 3
}
# DMAQC validation----
# Validate vial labels from DMAQC
if(verbose) message("\n\n## DMAQC validation\n")
failed_samples <- check_failedsamples(input_results_folder = input_results_folder,
verbose = verbose)
if( is.na(ic_vl) ){
if(f_vm){
outfile_missed_viallabels <- paste0(cas, ".", tolower(phase2file), ".", tissue_code, ".",tolower(assay), ".", tolower(processfolder))
ic_vl <- check_viallabel_dmaqc(vl_submitted = all_vial_labels,
dmaqc_shipping_info = dmaqc_shipping_info,
tissue_code = tissue_code,
cas = cas,
phase = dmaqc_phase2validate,
failed_samples = failed_samples,
out_qc_folder = out_qc_folder,
outfile_missed_viallabels = outfile_missed_viallabels,
return_n_issues = TRUE,
verbose = verbose)
}else{
ic <- ic + 1
if(verbose) message(" - (-) DMAQC validation cannot be performed (no vial label data available)")
}
}
# PRINT OUT RESULTS-----
batchversion <- stringr::str_extract(string = input_results_folder, pattern = "BATCH.*_([0-9]+/(RESULTS|BICRESULTS)_[0-9]+)")
qc_date <- Sys.time()
qc_date <- gsub("-", "", qc_date)
qc_date <- gsub(" ", "_", qc_date)
qc_date <- gsub(":", "", qc_date)
t_name <- bic_animal_tissue_code$bic_tissue_name[which(bic_animal_tissue_code$bic_tissue_code == tissue_code)]
if(return_n_issues){
total_issues <- sum(ic, ic_vm, ic_rr, ic_rii, ic_man, na.rm = TRUE)
if(verbose) message("\nTOTAL NUMBER OF ISSUES: ", total_issues,"\n")
if(full_report){
reports <- data.frame(cas = cas,
phase= dmaqc_phase2validate,
tissue = tissue_code,
t_name = t_name,
assay = assay,
version = batchversion,
vial_label = length(all_vial_labels),
qc_samples = NA,
dmaqc_valid = ic_vl,
critical_issues = ic,
pep_rri = ic_rii,
ratio = ic_rr,
vial_meta = ic_vm,
qc_date = qc_date)
return(reports)
}else{
return(total_issues)
}
}
}
#' @title Write proteomics data release
#'
#' @description Write out proteomics data releases. Doesn't check whether
#' data has been submited according to guidelines
#' @param input_results_folder (char) Path to the RESULTS_YYYYMMDD folder
#' @param folder_name (char) output folder name.
#' @param folder_root (char) absolute path to write the output folder.
#' Default: current directory
#' @param version_file (char) file version number (v#.#)
#' @param verbose (logical) `TRUE` (default) shows messages
#' @return bic release folder/file structure `PHASE/OMICS/TCODE_NAME/ASSAY/` and file names, including:
#' `motrpac_YYYYMMDD_phasecode_tissuecode_omics_assay_file-details.txt` where files-details can be:
#' `rii-results.txt`, `ratio-results.txt`, `vial-metadata.txt`
#' @examples
#' \dontrun{
#' write_proteomics_releases(
#' input_results_folder = "/full/path/to/RESULTS_YYYYMMDD/")
#' }
#' @export
write_proteomics_releases <- function(input_results_folder,
folder_name = "motrpac_release",
folder_root = NULL,
version_file = "v1.0",
verbose = TRUE){
# Get names from input_results_folder------
assay <- validate_assay(input_results_folder)
phase <- validate_phase(input_results_folder)
tissue_code <- validate_tissue(input_results_folder)
folder_phase <- tolower(phase)
folder_tissue <- bic_animal_tissue_code$tissue_name_release[which(bic_animal_tissue_code$bic_tissue_code == tissue_code)]
phase_metadata <- set_phase(input_results_folder = input_results_folder,
dmaqc_phase2validate = FALSE,
verbose = FALSE)
phase_details <- generate_phase_details(phase_metadata)
assay_codes$cas_code <- NULL
assay_codes <- unique(assay_codes)
if(length(assay_codes$assay_code[which(assay_codes$submission_code == assay)]) == 1){
folder_assay <- assay_codes$assay_code[which(assay_codes$submission_code == assay)]
}else{
stop("ASSAY code ", assay, " not available in < assay_codes >")
}
if(verbose) message("+ Writing out ", phase, " (phase-details: ", phase_details, ") ", tissue_code, " ", assay, " files", appendLF = FALSE)
if( grepl("PH", assay) | grepl("AC", assay) | grepl("UB", assay)){
isPTM = TRUE
}else{
isPTM = FALSE
}
# Load all datasets----
prot_dfs <- load_proteomics(input_results_folder = input_results_folder,
isPTM = isPTM,
verbose = FALSE)
# Create output folder-------
if (is.null(folder_root)){
folder_root <- getwd()
}else{
folder_root <- normalizePath(folder_root)
}
output_folder <- file.path(folder_root, folder_name, folder_phase, "proteomics-untargeted", folder_tissue, folder_assay)
if(!dir.exists(file.path(output_folder))){
dir.create(file.path(output_folder), recursive = TRUE)
}
file_name_shared <- paste0("motrpac_",
phase_details, "_",
folder_tissue, "_",
folder_assay)
# Create and write FILES-----
prot_rii <- file.path(output_folder, paste0(file_name_shared,"_rii-results_", version_file, ".txt"))
prot_ratio <- file.path(output_folder, paste0(file_name_shared,"_ratio-results_", version_file, ".txt"))
vial_metadata <- file.path(output_folder, paste0(file_name_shared,"_vial-metadata_", version_file, ".txt"))
write.table(prot_dfs$peprii, prot_rii, row.names = FALSE, sep = "\t", quote = FALSE)
write.table(prot_dfs$ratior, prot_ratio, row.names = FALSE, sep = "\t", quote = FALSE)
write.table(prot_dfs$v_m, vial_metadata, row.names = FALSE, sep = "\t", quote = FALSE)
if(verbose) message("...done!")
}
# @title Get proteomics required columns
get_required_columns <- function(isPTM, prot_file){
if(prot_file == "ratio"){
both_required <- c("protein_id",
"gene_symbol",
"entrez_id",
"redundant_ids",
"organism_name",
"is_contaminant")
if(isPTM){
required_columns <- c(both_required,
"ptm_id",
"confident_score",
"confident_site",
"flanking_sequence",
"ptm_score")
}else{
required_columns <- c(both_required,
"num_peptides",
"percent_coverage",
"protein_score")
}
}else if(prot_file == "rii"){
both_required <- c("protein_id",
"redundant_ids",
"is_contaminant",
"peptide_score",
"sequence",
"gene_symbol",
"entrez_id",
"organism_name")
if(isPTM){
required_columns <- c(both_required,
"ptm_id",
"ptm_peptide",
"confident_score",
"confident_site")
}else{
required_columns <- c(both_required)
}
}
return(required_columns)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.