################################
# Output data to use for testing
library(MassExpression)
design <- mq_lfq_data$design
intensities <- mq_lfq_data$intensities
parameters <- mq_lfq_data$parameters
normalisation_method <- parameters[parameters[,1] == "UseNormalisationMethod",2]
species <- parameters[parameters[,1] == "Species",2]
labellingMethod <- parameters[parameters[,1] == "LabellingMethod",2]
listIntensityExperiments <- runGenericDiscovery(experimentDesign = design,
proteinIntensities = intensities,
normalisationMethod = normalisation_method,
species = species,
labellingMethod = labellingMethod)
expectedCompleteIntensityExperiment <- listIntensityExperiments$CompleteIntensityExperiment
expectedIntensityExperiment <- listIntensityExperiments$IntensityExperiment
output_folder <- "tests/data"
expectedcomparisonExperiments <-
listComparisonExperiments(expectedCompleteIntensityExperiment)
save(expectedIntensityExperiment,
expectedCompleteIntensityExperiment,
expectedcomparisonExperiments,
file = file.path("tests/data/mq_lfq_output.RData"))
######################################################
# Output data to use for testing - HER2 with integer data for Condition names and SampleNames
library(MassExpression)
with_integer_condition <- TRUE
design <- mq_lfq_data$design
intensities <- mq_lfq_data$intensities
if(with_integer_condition){
design <- design %>% mutate(Condition = ifelse(Condition %in% "AZD8931_resistant_SKBR3_AZDRc",1,2),
SampleName = 1:6)
colnames(intensities) <- c(1:6, "ProteinId")
}
parameters <- mq_lfq_data$parameters
normalisation_method <- parameters[parameters[,1] == "UseNormalisationMethod",2]
species <- parameters[parameters[,1] == "Species",2]
labellingMethod <- parameters[parameters[,1] == "LabellingMethod",2]
listIntensityExperiments <- runGenericDiscovery(experimentDesign = design,
proteinIntensities = intensities,
normalisationMethod = normalisation_method,
species = species,
labellingMethod = labellingMethod)
expectedCompleteIntensityExperiment <- listIntensityExperiments$CompleteIntensityExperiment
expectedIntensityExperiment <- listIntensityExperiments$IntensityExperiment
output_folder <- "tests/data"
expectedcomparisonExperiments <-
listComparisonExperiments(expectedCompleteIntensityExperiment)
if(with_integer_condition){
save(expectedIntensityExperiment,
expectedCompleteIntensityExperiment,
expectedcomparisonExperiments,
file = file.path("tests/data/mq_lfq_output_integers.RData"))
}else{
save(expectedIntensityExperiment,
expectedCompleteIntensityExperiment,
expectedcomparisonExperiments,
file = file.path("tests/data/mq_lfq_output.RData"))
}
##########################
# HER2 on Maxquant worflow
##########################
# test data created in dev/AQ-3.0-banchmark-MassExpression
#' Transform MaxQuant proteinGroups to universal input
MaxQuantTranform <- function(proteinGroups, design, species, useNormalisationMethod, labellingMethod){
proteinGroups <- convert_protein_groups_to_universal(proteinGroups)
cols <- colnames(proteinGroups)[grepl("LFQ.intensity.", colnames(proteinGroups))]
runs <- gsub("LFQ.intensity.", "", cols)
groups <- gsub("_","",str_extract(runs,"_(.*)_"))
experiment.design <- dplyr::as_tibble(cbind(cols,runs)) %>%
dplyr::rename(mqExperiment = runs) %>%
left_join(design)
design <- experiment.design %>%
dplyr::select(cols, experiment, techRep, bioRep) %>%
dplyr::rename(SampleName = cols, Condition = experiment)
intensities <- proteinGroups[,c(cols,"ProteinId")]
parameters <- data.frame(X1 = c("Species", "UseNormalisationMethod", "LabellingMethod"),
X2 = c(species,useNormalisationMethod,labellingMethod))
list(design=design, intensities=intensities, parameters=parameters)
}
#' This function removes bad rows in a maxquant protein intensity object
filter_columns_mq <- function(proteinGroups){
cols.to.filter = c("Reverse", "Potential.contaminant", "Only.identified.by.site", "Contaminant")
for (col in intersect(colnames(proteinGroups),cols.to.filter)){
proteinGroups = proteinGroups[proteinGroups[[col]] != "+",]
proteinGroups[[col]] = NULL
}
return(proteinGroups)
}
#' This function uses regex to get the first uniprot assession and the
#' corresponding gene and description and adds them to a protein intensity table
#' coming from maxquant
#' @export assign_uniprot_ids_mq
#' @importFrom stringr str_extract_all str_extract str_c
assign_uniprot_ids_mq <- function(proteinGroups){
uniprot_pattern = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
major_ids <- str_extract_all(proteinGroups$Majority.protein.IDs, uniprot_pattern)
num_major_ids <- table(unlist(lapply(major_ids, length)))
selected_ids <- unlist(lapply(major_ids, function(x){x[1]}))
gene_patterns = str_c(selected_ids, ".*GN=(\\S*) ")
string_id_to_gene = str_extract(proteinGroups$Fasta.headers, gene_patterns) # get the string for the right assession
string_just_gene = str_extract(string_id_to_gene, "GN=(\\S*) ")
genes = gsub("[GN=| ]","", string_just_gene)
string_id_to_gene = str_extract(proteinGroups$Fasta.headers, gene_patterns) # get the string for the right assession
string_description = str_extract(string_id_to_gene, " (.*) ")
descriptions = gsub("GN.*","", string_description )
proteinGroups$ProteinId <- selected_ids
proteinGroups$GeneId <- genes
proteinGroups$Description <- descriptions
return(proteinGroups)
}
#' This is a wrapper for the mq parser functions that can be used to go
#' from maxquant data to protein intensity tables we can work with
#' @export convert_protein_groups_to_universal
convert_protein_groups_to_universal <- function(proteinGroups){
proteinGroups = filter_columns_mq(proteinGroups)
proteinGroups = assign_uniprot_ids_mq(proteinGroups)
return(proteinGroups)
}
###
library(stringr)
### Raw MaxQuant output
mq_data_input <- "../../../data-s3/data-formats/maxquant/LFQ/HER2/data/"
protein.intensities <- read.csv(file.path(mq_data_input, "proteinGroups.txt"), sep = "\t", stringsAsFactors = F)
design <- read.csv(file.path(mq_data_input, "experimentDesign_original.txt"), sep = "\t", stringsAsFactors = F)
input_gen <- MaxQuantTranform(proteinGroups=protein.intensities, design = design, species = "human", useNormalisationMethod = "None", labellingMethod = "LFQ")
data_int_me <- input_gen$intensities
colnames(data_int_me) <- gsub("LFQ.", "lfq.", colnames(data_int_me))
## Load HER2 output after running LFQprocessing
library(LFQProcessing)
# Where the output from LFQ processing is saved
output_folder <- "../../../data-s3/data-formats/maxquant/LFQ/HER2/data/output/"
# her2 <- protein_quant_runner(upload_folder, output_folder)
out_data <- read.delim(file.path(output_folder, "proteinGroups_quant.txt")) %>%
dplyr::rename(Majority.protein.IDs=majority.protein.ids,
Fasta.headers=fasta.headers)
out_data_1prot <- assign_uniprot_ids_mq(out_data)
out_data_bench <- out_data_1prot %>% dplyr::select("ProteinId",
starts_with("logFC."),
starts_with("adj.P.Val."),
starts_with("P.Value."))
colnames(out_data_bench)[2:ncol(out_data_bench)] <- paste0("Disco_", colnames(out_data_bench)[2:ncol(out_data_bench)])
# Intensities
lfq_intensities <- out_data_1prot %>% dplyr::select(ProteinId, starts_with("lfq."))
## Run MassExpression discovery
with_integer_condition <- TRUE
intensities <- input_gen$intensities
design <- input_gen$design
if(with_integer_condition){
design <- design %>% mutate(Condition = ifelse(Condition %in% "AZD8931_resistant_SKBR3_AZDRc",1,2),
SampleName = 1:6)
colnames(intensities) <- c(1:6, "ProteinId")
}
parameters <- input_gen$parameters
normalisation_method <- parameters[parameters[,1] == "UseNormalisationMethod",2]
species <- parameters[parameters[,1] == "Species",2]
labellingMethod <- parameters[parameters[,1] == "LabellingMethod",2]
results <- runGenericDiscovery(experimentDesign = design,
proteinIntensities = intensities,
normalisationMethod = normalisation_method,
species = species,
labellingMethod = labellingMethod)
completeExperiment <- results$CompleteIntensityExperiment
inten <- results$IntensityExperiment
comparisonExperiments <-
listComparisonExperiments(completeExperiment)
## Compare logFC and Pvals between MassExpression and LFQprocessing
expected_complete_new <- rowData(completeExperiment)
expected_comparison_new_run <- rowData(comparisonExperiments[[1]])
# out_data_bench : HER2 processed with LFQ processing
sum(!(out_data_bench$ProteinId %in% expected_complete_new$ProteinId))
sum(!(expected_complete_new$ProteinId %in% out_data_bench$ProteinId))
sum(!(expected_comparison_new_run$ProteinId %in% out_data_bench$ProteinId))
compare_new_run <- out_data_bench %>% left_join(as_tibble(expected_comparison_new_run))
expected_diff_fc_maxquant <- compare_new_run$FC - compare_new_run$Disco_logFC.AZD8931_resistant_SKBR3_AZDRc...Parental_SKBR3
expected_diff_pval_maxquant <- compare_new_run$P.Value - compare_new_run$Disco_P.Value.AZD8931_resistant_SKBR3_AZDRc...Parental_SKBR3
data_bench_maxquant <- out_data_bench
ggplot(compare_new_run, aes(x=FC,
y = `Disco_logFC.AZD8931_resistant_SKBR3_AZDRc...Parental_SKBR3`)) +
geom_point() +
theme_bw() +
geom_smooth()+
ggtitle("Proteins used in pairwise comparison")
if(with_integer_condition){
compare_new_run$NImpute <- compare_new_run$NImputed..2 + compare_new_run$NImputed..1
}else{
compare_new_run$NImpute <- compare_new_run$NImputed..Parental_SKBR3 + compare_new_run$NImputed..AZD8931_resistant_SKBR3_AZDRc
}
ggplot(compare_new_run, aes(x=-log10(P.Value),
y = -log10(`Disco_P.Value.AZD8931_resistant_SKBR3_AZDRc...Parental_SKBR3`), colour=NImpute)) +
geom_point() +
theme_bw() +
geom_smooth()+
scale_colour_viridis_c()+
ggtitle("using proteins used in pairwise comparison")
library(cowplot)
p1=ggplot(compare_new_run, aes(x = Disco_logFC.AZD8931_resistant_SKBR3_AZDRc...Parental_SKBR3, y = -log10(`Disco_P.Value.AZD8931_resistant_SKBR3_AZDRc...Parental_SKBR3`))) + geom_point()
p2=ggplot(compare_new_run, aes(x = FC, y = -log10(`P.Value`))) + geom_point()
plot_grid(p1,p2)
if(with_integer_condition){
save(data_bench_maxquant, expected_diff_fc_maxquant, expected_diff_pval_maxquant, file = "tests/data/HER2_maxquant_workflow.RData")
}else{
save(data_bench_maxquant, expected_diff_fc_maxquant, expected_diff_pval_maxquant, file = "tests/data/HER2_maxquant_workflow_integers.RData")
}
# Test data fro writeReplicatesData
SampleDT <- data.table(ProteinId = c(rep("Prot1", 10), rep("Prot2", 10)),
GeneName = NA, Description = NA,
log2NInt = rep(1.5, 20),
Condition = c(rep(c("A", "B"), each=5), rep(c("A", "B"), each=5)),
Replicate = c(1:5, 1:5, 1:5, 1:5),
Imputed = c(0,0,0,0,0,
0,0,0,1,1,
1,0,0,0,0,
1,1,0,0,0))
writeReplicateData(SampleDT, "tests/data")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.