utils::globalVariables(c("bait", "bckg", "time", "bio", "idx_match"))
#' Analysis of AP-MS data
#' @param df A dataframe containing protein intensities. By default, protein intensity column names start by "Intensity."
#' (use parameter \code{Column_intensity_pattern} to change)
#' @param id The id of the \code{InteRactome}. 1 by default. Useful to distinguish \code{InteRactome}s with the same bait.
#' @param updateProgress function to show progress bar in shiny app
#' @param N_rep Number of iterations for the replacement of missing values
#' @param method Method to replace missing values. Methods from the "mice" package are supported.
#' Use "none" if you do not want to replace missing values. By default, missing values are sampled from a
#' normal distribution centered on the quantile of ctrl intensities defined by parameter \code{quantile_rep}
#' with the standard deviation set to the mean SD of ctrl intensities across all proteins.
#' @param quantile_rep Numeric value between 0 and 1. Quantile of the distribution of mean intensities
#' in the control background used to replace missing values.
#' @param scale_background logical. Scale imputed values according to protein intensities in the control background?
#' @param pool_background option to use all control background conditions as one control group for all conditions
#' @param log_test logical, perform t-test on log transform intensities
#' @param log_stoichio logical, use the geometric mean instead of the arithmetic mean to compute stoichiometries
#' @param log_mean logical, use the geometric mean instead of the arithmetic mean to compute the mean \code{InteRactome}
#' @param substract_ctrl logical, substract ctrl intensities in the calculation of stoichiometries
#' @param use_mean_for_bait logical, average bait intensities across all conditions to compute interaction stoichiometries
#' @param by_conditions option to perform the comparison between bait and control group for each condition
#' @param preprocess_df list obtained by the function \code{preprocess_data()}
#' @param ... Additional parameters passed to function \code{preprocess_data()} and \code{identify_conditions}
#' @return a list containing the preprocessed data and on object of class \code{InteRactome}, i.e a list including the following elements :
#' @return \code{conditions} : a vector of experimental conditions.
#' @return \code{names} : a vector of names (by default gene names are used).
#' @return \code{p_val} : a list of vectors containing the p values associated to each experimental condition.
#' @return \code{fold_change} : a list of vectors containing the fold change associated to each experimental condition.
#' @return \code{...} : other variables.
#' @importFrom stats quantile rnorm
#' @import mice
#' @export
#' @author Guillaume Voisinne
#' @examples
#' #load data :
#' data("proteinGroups_Cbl")
#' #Run InteRact with default parameters
#' res <- InteRact(proteinGroups_Cbl, bait_gene_name = "Cbl")
#'
#' #You now have an `InteRactome`. See its elements.
#' class(res)
#' names(res)
#' #Generate volcano plots
#' plot_volcanos(res)
#'
#' #Identify specific interactors
#' res <- identify_interactors(res, p_val_thresh = 0.05, fold_change_thresh = 2)
#'
#' #Visualize interaction kinetics
#' plot_per_condition(res)
#'
#' # Append annotations
#' # res <- append_annotations(res)
#' #Create a summary data frame
#' sum_tbl <- summary_table(res)
InteRact <- function(
df,
id = NULL,
updateProgress = NULL,
N_rep=1,
method = "default",
quantile_rep = 0.05,
scale_background = TRUE,
pool_background = FALSE,
log_test = TRUE,
log_stoichio = TRUE,
log_mean = TRUE,
by_conditions = TRUE,
substract_ctrl = TRUE,
use_mean_for_bait = FALSE,
preprocess_df = NULL,
...
){
if(is.null(preprocess_df)){
avg <- preprocess_data(df=df, ...)
} else {
avg <- preprocess_df
}
# identify missing values
log10_I_norm_mean <- log10(avg$Intensity);
q <- stats::quantile(log10_I_norm_mean[ , avg$conditions$bckg == avg$bckg_ctrl], na.rm=TRUE, probs=quantile_rep);
s <- mean( row_sd(log10_I_norm_mean[ , avg$conditions$bckg == avg$bckg_ctrl]), na.rm=TRUE);
log10_I_norm_mean_rep <- log10_I_norm_mean
# replace missing values N_rep times
if(N_rep>0 & method != "none"){
res <- vector("list", N_rep)
names(res)<-paste( rep('Rep_',N_rep), 1:N_rep, sep="" )
cat(paste("Replace missing values and perform interactome analysis for",N_rep,"replicates\n",sep=" "))
n_replace <- length(which(is.na(log10_I_norm_mean)));
for(i in 1:N_rep){
if (is.function(updateProgress)) {
text <- paste0( i, "/", N_rep)
updateProgress(value = i/N_rep*100, detail = text)
}
cat(paste("Nrep=",i,"\n",sep=""));
if(method == "default"){
if(scale_background){
for(k in 1:dim(log10_I_norm_mean_rep)[1]){
median_ctrl <- stats::median(t(log10_I_norm_mean[k , avg$conditions$bckg == avg$bckg_ctrl]), na.rm=TRUE)
if(is.na(median_ctrl)){
median_ctrl <- q
}
idx_na <- which(is.na(log10_I_norm_mean[k, ]))
if(length(idx_na)>0){
log10_I_norm_mean_rep[k, idx_na] <- stats::rnorm(n=length(idx_na), mean=median_ctrl, sd=s)
}
}
}else{
log10_I_norm_mean_rep[is.na(log10_I_norm_mean)] <- stats::rnorm(n=n_replace, mean=q, sd=s)
}
}
else{
imp <- mice::mice(log10_I_norm_mean, printFlag = FALSE, method = method, m=1)
log10_I_norm_mean_rep <- mice::complete(imp)
}
res[[i]] <- analyse_interactome(Intensity_na_replaced = 10^log10_I_norm_mean_rep,
Intensity = avg$Intensity,
conditions = avg$conditions,
bait_gene_name = avg$bait_gene_name,
Npep = avg$Npep,
Protein.IDs = avg$Protein.IDs,
names = avg$names,
bckg_bait = avg$bckg_bait,
bckg_ctrl = avg$bckg_ctrl,
pool_background = pool_background,
log_test = log_test,
log_stoichio = log_stoichio,
by_conditions = by_conditions,
substract_ctrl = substract_ctrl,
use_mean_for_bait =use_mean_for_bait)
}
res_mean = mean_analysis(res, log = log_mean, na.rm = TRUE);
}else{
res_mean <- analyse_interactome(Intensity_na_replaced = 10^log10_I_norm_mean_rep,
Intensity = avg$Intensity,
conditions = avg$conditions,
bait_gene_name = avg$bait_gene_name,
Npep = avg$Npep,
Protein.IDs = avg$Protein.IDs,
names = avg$names,
bckg_bait = avg$bckg_bait,
bckg_ctrl = avg$bckg_ctrl,
pool_background = pool_background,
log_test = log_test,
log_stoichio = log_stoichio,
by_conditions = by_conditions,
substract_ctrl = substract_ctrl,
use_mean_for_bait = use_mean_for_bait)
}
res_mean <- global_analysis(res_mean)
if(is.null(id)){
res_mean$id <- res_mean$bait
}else{
res_mean$id <- id
}
res_mean$params <- c( list(N_rep=N_rep,
method = method,
quantile_rep = quantile_rep),
res_mean$params)
res_mean$data <- c(res_mean$data,
list(Intensity_filter = avg$Intensity_filter,
conditions_filter = avg$conditions_filter))
return(res_mean)
}
#' Identify indirect interactions by comparing stoichiometries between two interactomes.
#' @param resA reference interactome
#' @param resB intermediate interactome.
#' @param conditions set of conditions for which stoichiometries will be compared. If \code{NULL} then all shared conditions will be used.
#' @export
identify_indirect_interactions <- function(resA, resB, conditions = NULL){
shared_cond <- intersect(resA$conditions, resB$conditions)
common_interactors <- setdiff(intersect(resA$interactor, resB$interactor), c(resA$bait, resB$bait) )
# Verify that the analysis can be performed
if(!(resB$bait %in% resA$interactor)){
cat("B is not an interactor of bait A\n")
return(NULL)
}
if(length(common_interactors) == 0){
cat("Empty set of shared interactors\n")
return(NULL)
}
#identify conditions for which interaction stoichiometries will be compared
if(is.null(conditions)){
if(length(shared_cond)==0){
cond <- "max"
warning("No shared condition. Analysis will be performed using the maximum stoichiometry across conditions.")
}else{
cond <- shared_cond
}
}else{
if( length(intersect(conditions, shared_cond)) == 0){
cond <- "max"
warning("Specified conditions are not shared by both interactomes. Analysis will be performed using the maximum stoichiometry across conditions.")
}else{
cond <- conditions
}
}
idxA <- match(common_interactors, resA$names)
idxB <- match(common_interactors, resB$names)
idxB_in_A <- match(resB$bait, resA$names)
stA <- data.frame(do.call(cbind, resA$stoichio), row.names = resA$names)
stB <- data.frame(do.call(cbind, resB$stoichio), row.names = resB$names)
names(stA)<- resA$conditions
names(stB)<- resB$conditions
stoichio_direct <- sapply(shared_cond, function(x){stA[idxA, x]} )
if(length(common_interactors)==1){
stoichio_direct <- t(stoichio_direct)
}
rownames(stoichio_direct) <- common_interactors
stoichio_direct <- data.frame(stoichio_direct, check.names = FALSE)
stoichio_indirect <- sapply(shared_cond, function(x){stB[idxB, x]*stA[idxB_in_A, x]} )
if(length(common_interactors)==1){
stoichio_indirect <- t(stoichio_indirect)
}
rownames(stoichio_indirect) <- common_interactors
stoichio_indirect <- data.frame(stoichio_indirect, check.names = FALSE)
stoichio_C_in_B <- sapply(shared_cond, function(x){stB[idxB, x]} )
if(length(common_interactors)==1){
stoichio_C_in_B <- t(stoichio_C_in_B)
}
rownames(stoichio_C_in_B) <- common_interactors
stoichio_C_in_B <- data.frame(stoichio_C_in_B, check.names = FALSE)
perc_C_in_B <- sapply(shared_cond, function(x){stB[idxB, x]*resB$Copy_Number[resB$names == resB$bait]/resB$Copy_Number[idxB]} )
if(length(common_interactors)==1){
perc_C_in_B <- t(perc_C_in_B)
}
rownames(perc_C_in_B) <- common_interactors
perc_C_in_B <- data.frame(perc_C_in_B, check.names = FALSE)
delta_stoichio_log <- log10(stoichio_direct) - log10(stoichio_indirect)
max_delta_stoichio_log <- apply(abs(delta_stoichio_log), MARGIN = 1, max)
sum_delta_stoichio_log <- apply(abs(delta_stoichio_log), MARGIN = 1, sum)
return(list(interactors = common_interactors,
conditions = shared_cond,
stoichio_direct = stoichio_direct,
stoichio_indirect=stoichio_indirect,
stoichio_C_in_B=stoichio_C_in_B,
perc_C_in_B = perc_C_in_B,
delta_stoichio_log = delta_stoichio_log,
max_delta_stoichio_log = max_delta_stoichio_log,
sum_delta_stoichio_log = sum_delta_stoichio_log,
bait_A = resA$bait,
bait_B = resB$bait)
)
}
#' Preprocessing of raw data
#' @param df Data.frame with protein intensities
#' @param bait_gene_name The gene name of the bait
#' @param Column_score Column with protein identification score
#' @param Column_ID Column with protein IDs
#' @param Column_Npep Column with number of theoretically observable peptides per protein
#' @param Column_gene_name Column with gene names
#' @param Column_intensity_pattern Pattern (regular exrpression) used to identfy df's columns containing protein intensity values
#' @param condition data.frame with columns "column", bckg", "bio", "time" and "tech" indicating
#' for each intensity column ("sample") its corresponding background ("bckg"), biologicla replicate ("bio),
#' experimental condition ("tine) and technical replicate ("tech).
#' @param bckg_bait Name of the bait background as found in \code{condition$bckg} (see below)
#' @param bckg_ctrl Name of the control background as found in \code{condition$bckg} (see below)
#' @param log logical, use geometric mean to average technical replicates
#' @param filter_time vector of experimental conditions to exclude from analysis
#' @param filter_bio vector of biological replicates to exclude from analysis
#' @param filter_tech vector of technical replicates to exclude from analysis
#' @param min_score threshold on identification score
#' @param filter_gene_name logical, filter out proteins withy empty gene name
#' @param ... Additional parameters passed to function \code{identify_conditions}
#' @export
#' @examples
#' #load data :
#' data("proteinGroups_Cbl")
#' df <- preprocess_data(proteinGroups_Cbl, bait_gene_name = "Cbl")
preprocess_data <- function(df,
Column_gene_name = "Gene.names",
Column_score = "Score",
Column_ID = "Protein.IDs",
Column_Npep = NULL,
Column_intensity_pattern = "^Intensity.",
bait_gene_name,
condition = NULL,
bckg_bait = bait_gene_name,
bckg_ctrl = "WT",
log = TRUE,
filter_time=NULL,
filter_bio=NULL,
filter_tech=NULL,
min_score = 0,
filter_gene_name = TRUE,
...
){
idx_intensity_cols <- grep(Column_intensity_pattern, names(df))
is_col_numeric <- sapply( idx_intensity_cols, function(x) !is.numeric( df[[x]] ) )
if( sum( is_col_numeric ) >0 ){
warning(paste("Intensity columns ",
paste(names(df)[idx_intensity_cols[is_col_numeric]], collapse = ", ") ,
" are not numeric. Try changing the decimal separator (most likely '.' or ',') used for importing the data",
sep = ""))
}
if(! Column_ID %in% names(df)){
warning(paste("Column ", Column_ID, " could not be found", sep=""))
}
if(! Column_gene_name %in% names(df)){
warning(paste("Column ", Column_gene_name, " could not be found", sep=""))
}
# Identify conditions corresponding to intensity columns
if( is.null(condition) ){
cond <- identify_conditions(df,
Column_intensity_pattern = Column_intensity_pattern,
...)
} else if( length(setdiff(c("column", "bait", "bckg", "bio", "time", "tech"), names(condition))) > 0 ) {
stop("incorrect dimensions for data.frame condition")
} else {
cond <- dplyr::tibble(column = condition$column,
bait = condition$bait,
bckg = condition$bckg,
bio = condition$bio,
time = condition$time,
tech = condition$tech)
}
# filter out some experimental conditions
cond_filter <- cond[!is.na(cond$time), ]
idx_filter <- c( unlist( lapply(filter_time, function(x) l=which(cond_filter$time==x) ) ) ,
unlist( lapply(filter_bio, function(x) l=which(cond_filter$bio==x) ) ),
unlist( lapply(filter_tech, function(x) l=which(cond_filter$tech==x) ) ) )
if(!is.null(idx_filter) && length(idx_filter)>0 ){
cat("Filter following intensity columns :\n")
cat(idx_filter)
cat("\n")
cond_filter <- cond[-idx_filter,]
}
# match conditions to samples
idx_match <- match(cond_filter$column, names(df))
if( sum(is.na(idx_match)) == length(idx_match) ){
stop("Could not match all conditions to samples")
}
cond_filter <- cond_filter[!is.na(idx_match), ]
#discard columns that are factors
is_factor <- sapply(match(cond_filter$column, names(df)), function(x){is.factor(df[[x]])})
if( sum(is_factor) == length(is_factor) ){
stop("All intensity columns are factors, try changing the decimal separator (most likely '.' or ',') used for importing the data")
}else if(sum(is_factor) > 0) {
warning("Some intensity columns identified as factors were discarded. Check imported data")
df <- df[ , -match(cond_filter$column, names(df))[is_factor] ]
cond_filter <- cond_filter[!is_factor, ]
}
# Filter protein with no gene name and a low score
df <- filter_Proteins(df,
min_score = min_score,
filter_gene_name = filter_gene_name,
Column_ID = Column_ID,
Column_gene_name = Column_gene_name,
Column_score = Column_score)
# Compute number of theoretically observable peptides
df$Npep <- estimate_Npep(df, Column_Npep = Column_Npep)
#Merge protein groups with the same gene name
idx_col = match(cond_filter$column, names(df))
idx_col <- idx_col[!is.na(idx_col)]
df <- merge_duplicate_groups(df, idx_col = idx_col, merge_column = "gene_name")
#identify bait
ibait <- which(df$gene_name == bait_gene_name);
if(length(ibait)==0){
stop(paste("Could not find bait '",bait_gene_name,"' in column '",Column_gene_name,"'", sep=""))
}
#Select intensity columns corresponding to selected conditions
T_int <- df[ , idx_col];
T_int[T_int==0] <- NA;
#Discard proteins with NA values for all conditions
idx_all_na <- which( rowSums(!is.na(T_int)) == 0 )
if(length(idx_all_na)>0){
T_int <- T_int[-idx_all_na, ]
df <- df[-idx_all_na, ]
}
row.names(T_int) <- df$gene_name
#Normalize on median intensity across conditions
T_int_norm <- rescale_median(T_int);
cat("Rescale median intensity across conditions\n")
avg <- average_technical_replicates(T_int_norm, cond_filter, log = log)
avg$Intensity_filter <- T_int_norm
avg$conditions_filter <- cond
avg$Npep <- df$Npep
avg$Protein.IDs <- df[[Column_ID]]
avg$names <- df$gene_name
row.names(avg$Intensity) <- avg$names
avg$bckg_bait <- bckg_bait
avg$bckg_ctrl <- bckg_ctrl
avg$bait_gene_name <- bait_gene_name
return(avg)
}
#' Identify conditions (background, time of stimulation, biological and technical replicates)
#' from column names
#' @param df A dataframe containing protein intensities. By default, protein intensity column names start by "Intensity."
#' (use parameter \code{Column_intensity_pattern} to change)
#' @param Column_intensity_pattern Pattern (regular exrpression) used to identfy df's columns containing protein intensity values
#' @param split Character used to split column names into substrings
#' @param bait_pos Position of the bait name in splitted column names
#' @param bckg_pos Position of the sample background in splitted column names
#' @param bio_pos Position of the sample biological replicate in splitted column names
#' @param time_pos Position of the sample experimental condition in splitted column names
#' @param tech_pos Position of the sample technical replicate in splitted column names
#' @return a data frame describing experimental samples in terms of background,
#' biological and technical replicates, and experimental conditions
#' @importFrom dplyr tibble
#' @export
#' @examples
#' #load data :
#' data("proteinGroups_Cbl")
#' # You can identify columns and their description separately using `identify_conditions()`
#' cond <- identify_conditions(proteinGroups_Cbl)
#' print.data.frame(cond)
#' # and use it as parameters for function InteRact()
#' res <- InteRact(proteinGroups_Cbl, bait_gene_name = "Cbl", condition = cond)
identify_conditions <- function(df,
Column_intensity_pattern = "^Intensity.",
split = "_",
bait_pos = 0,
bckg_pos = 1,
bio_pos = 3,
time_pos = 2,
tech_pos = 4
){
idx_col<-grep(Column_intensity_pattern,colnames(df))
if(length(idx_col)==0){
stop("Couldn't find pattern in column names")
}
T_int <- df[ ,idx_col];
col_I <- colnames(T_int)
s0 <- sapply(strsplit(col_I, Column_intensity_pattern), function(x){x[2]})
s <- strsplit(s0, split=split, fixed=TRUE)
bait<- rep("", length(col_I))
bckg <- rep("", length(col_I))
bio <- rep("", length(col_I))
tech <- rep("", length(col_I))
time <- rep("", length(col_I))
for (i in 1:length(col_I)){
n <- length(s[[i]])
if(bait_pos > 0 & bait_pos <= n){ bait[i] <- s[[i]][bait_pos]}
if(bckg_pos <= n){ bckg[i] <- s[[i]][bckg_pos]}
if(bio_pos <= n){ bio[i] <-s[[i]][bio_pos] }
if(tech_pos <= n){ tech[i] <- s[[i]][tech_pos] }
if(time_pos <= n){ time[i] <- s[[i]][time_pos] }
}
bait[nchar(bait)==0] <- paste("bait", "0", sep=split)
bckg[nchar(bckg)==0] <- paste("bckg", "0", sep=split)
bio[nchar(bio)==0] <- paste("bio", "0", sep=split)
tech[nchar(tech)==0] <- paste("tech", "0", sep=split)
time[nchar(time)==0] <- paste("time", "0", sep=split)
cond <- dplyr::tibble(column=factor(col_I),
bait = factor(bait),
bckg = factor(bckg),
time = factor(time),
bio = factor(bio),
tech = factor(tech))
}
#' Average protein intensities over technical replicates
#' @param df A data frame of protein intensities
#' @param cond A data frame containing the description of df's columns (i.e "bckg", "time", "bio" and "tech")
#' as returned by function \code{identify_conditions()}
#' @param log use geometric mean
#' @return A list containing :
#' @return \code{Intensity}, a data frame of protein intensities averaged over technical replicates;
#' @return \code{conditions}, a data frame containing the description of \code{Intensity}'s columns
#' @importFrom dplyr group_by summarise
#' @export
#' @examples
#' #load data :
#' data("proteinGroups_Cbl")
#' df <- proteinGroups_Cbl
#' cond <- identify_conditions(df)
#' Column_intensity_pattern <- "^Intensity."
#' df_int <- df[ , grep(Column_intensity_pattern, colnames(df))]
#' avg <- average_technical_replicates(df_int, cond)
average_technical_replicates<-function(df, cond, log = TRUE){
cond$idx_match <- match(cond$column, names(df))
cond_group <- dplyr::group_by(cond, bait, bckg, time, bio)
idx_cond <- dplyr::summarise(cond_group, idx_tech=list(idx_match))
cond_name <- vector("character", dim(idx_cond)[1])
df_mean = data.frame( matrix( NA, nrow = dim(df)[1], ncol=dim(idx_cond)[1] ) );
for(j in 1:dim(idx_cond)[1]){
cond_name[j] = paste( idx_cond$bait[j], idx_cond$bckg[j], 't', idx_cond$time[j], 'rep', idx_cond$bio[j], sep="_");
df_mean[[j]] <- row_mean(df[ idx_cond$idx_tech[[j]] ], na.rm=TRUE, log = log);
}
colnames(df_mean)=cond_name;
idx_cond$column <- cond_name
idx_cond$tech <- rep("tech_0", dim(idx_cond)[1])
output = list(Intensity=df_mean, conditions=idx_cond)
}
#' Filtering of a data frame using a threshold on protein identification score and
#' gene names
#' @param df A data frame
#' @param min_score Threshold for protein identification score
#' @param filter_gene_name logical, filter out proteins withy empty gene name
#' @param Column_gene_name The name of df's column containing gene names
#' @param Column_ID Column with protein IDs
#' @param Column_score The name of df's column containing protein identification score
#' @param split_param Character used to split gene names into substrings.
#' @return A filtered data frame. Contains an extra column with the first substring of the column \code{Column_gene_name}
#' @export
filter_Proteins <- function( df,
min_score=0,
filter_gene_name =TRUE,
Column_ID = "Protein.IDs",
Column_gene_name = "Gene.names",
Column_score= "Score",
split_param=";"){
idx_row = 1:dim(df)[1]
if( nchar(Column_score)>0 & Column_score %in% colnames(df)){
if(inherits(df[[Column_score]], "numeric")){
idx_row = which( df[[Column_score]] > min_score )
df<-df[idx_row, ]
cat("Data Filtered based on portein identification score\n")
}else{
warning("Column 'Score' is not numeric : Data NOT Filtered based on portein identification score\n")
}
}else{
warning("Column 'Score' not available : Data NOT Filtered based on portein identification score\n")
}
#Remove contaminants from dataset
if( Column_gene_name %in% colnames(df) & Column_ID %in% colnames(df)){
idx_cont <- unique( c(grep("^KRT",toupper(df[[Column_gene_name]])),
grep("^CON_",toupper(df[[Column_ID]])),
grep("^REV_",toupper(df[[Column_ID]]))
))
if (length(idx_cont) > 0){
df<-df[ -idx_cont, ]
cat("Contaminant proteins discarded\n")
}
df$gene_name <- as.character(df[[Column_gene_name]])
length_name <- nchar(df$gene_name)
idx_no_name <- which( length_name == 0 | is.na(length_name) )
if(length(idx_no_name) > 0){
df$gene_name[idx_no_name] <- as.character(df[[Column_ID]][idx_no_name])
}
if(filter_gene_name){
if(length(idx_no_name) > 0){
df <- df[ -idx_no_name, ]
cat("Proteins with no gene name available discarded\n")
}
}
df$gene_name <- sapply(df$gene_name, function(x){ strsplit(as.character(x),split=split_param)[[1]][1]} )
}else{
warning(paste("Column_gene_name '", Column_gene_name, "' or Column_ID '", Column_ID,"' not available",sep=""))
}
return(df)
}
#' Merge protein groups with the same gene name.
#' @param df A data frame
#' @param idx_col idx of columns for which values will be merged across protein groups
#' @param merge_column column to identify rows to be be merged
#' @param sum_intensities Logical. Sum intensities across merged groups.
#' @return A merged data frame
merge_duplicate_groups <- function(df, idx_col = NULL, merge_column = "gene_name", sum_intensities = TRUE){
cat("Merge protein groups associated to the same gene name (sum of intensities) \n")
df_int <- df
ugene <- unique(df[[merge_column]]);
idx_merge = rep(1, dim(df)[1]); # rows with idx_merge = 0 will be filtered out
for(i in 1:length(ugene) ){
idx_u <- which( df[[merge_column]] == ugene[i] )
if (length(idx_u) > 1) {
max_I <- rep(0, length(idx_u));
for (j in 1:length(idx_u) ){
idx_merge[idx_u[j]] = 0;
max_I[j] = max(df[ idx_u[j], idx_col], na.rm = TRUE)
}
jmax = which(max_I == max(max_I) );
idx_merge[ idx_u[jmax[1]] ] = 1;
for (k in idx_col ){
s=0;
for(j in idx_u ){
if(!is.na(df[j, k])){
s= s + df[j, k]
}
}
if(sum_intensities){
df_int[idx_u[jmax[1]], k] = s;
}
}
}
}
return(df_int[idx_merge>0, ])
}
#' Get the number of theoretically observable peptides per protein
#' @param df A data frame
#' @param Column_Npep column containing the number of theoretically observable peptides per protein.
#' If NULL try to compute the number of theoretically observable peptides using iBAQ values,
#' or use molecular weight.
#' @return A data frame with the column 'Npep'
estimate_Npep <- function(df, Column_Npep = NULL){
if ( is.null(Column_Npep) ){
if( "Intensity" %in% colnames(df) & "iBAQ" %in% colnames(df)){
Npep <- round( as.numeric( as.character(df$Intensity))/as.numeric( as.character(df$iBAQ)) )
cat("Number of theoretically observable peptides computed using iBAQ values\n")
}else if("Mol..weight..kDa." %in% colnames(df) ){
Npep <- df$Mol..weight..kDa.
cat("Number of theoretically observable peptides unavailable : used MW instead\n")
}else{
Npep <- rep(1,dim(df)[1])
cat("Number of theoretically observable peptides unavailable : set to 1\n")
}
} else {
Npep <- df[[Column_Npep]]
}
output<-Npep
}
#' Normalize data frame by columns using the median
#' @param df A data frame
#' @importFrom stats median
#' @return A normalized data frame
rescale_median <- function(df){
df_out<-df;
for (i in seq_along(df)){
df_out[[i]] <- df[[i]]/median(df[[i]], na.rm=TRUE);
}
df_out
}
#' Perform the geometric mean of a numeric vector
#' @param x A numeric vector
#' @param na.rm remove NA values
#' @return A numeric value
geom_mean = function(x, na.rm = TRUE){
idx_na <- which(is.na(x))
x <- x[x>0]
if(sum(!is.na(x)) == 0){
return(NA)
}else if(!na.rm & sum(is.na(x)) >0){
return(NA)
}else{
return( exp(sum(log(x[!is.na(x)])) / sum(!is.na(x)) ))
}
}
#' Compute the standard deviation by row
#' @param df a data frame
#' @importFrom stats sd
#' @return A numeric vector
row_sd <- function(df){
output<-vector("double", dim(df)[1] )
for(i in 1:dim(df)[1] ){
output[i] <- sd(as.numeric(df[i,]),na.rm=TRUE);
}
output
}
#' Compute the mean by row
#' @param df a data frame
#' @param log logical, use geometric mean instead of arithmetic mean
#' @param na.rm logical, remove NA values
#' @return A numeric vector
#' @export
row_mean <- function(df, na.rm = TRUE, log = FALSE){
output <- vector("double", dim(df)[1] )
for(i in 1:dim(df)[1] ){
if(log){
output[i] <- geom_mean(as.numeric(df[i,]), na.rm = na.rm);
} else{
output[i] <- mean(as.numeric(df[i,]), na.rm = na.rm);
}
}
output
}
#' Perform a Welch two-sided t-test comparison between two groups by row
#'
#' @param df a data frame
#' @param idx_group_1 column indexes corresponding to the first group
#' @param idx_group_2 column indexes corresponding to the second group
#' @param log option to perform the t-test on log transformed data
#' @param ... parameters passed to \code{stats::t.test()}
#' @importFrom stats t.test
#' @return A data frame with columns 'p_val' and 'fold_change' (group_1 vs group_2)
row_ttest <- function(df, idx_group_1, idx_group_2, log = TRUE, ...){
p_val <- rep(NaN,dim(df)[1]);
fold_change <- rep(NaN,dim(df)[1]);
output <- data.frame(p_val=p_val, fold_change=fold_change);
if(log){
df_test<-log10(df)
}else{
df_test<-df
}
for(i in (1:dim(df)[1]) ){
x <- as.numeric(df_test[i, idx_group_1])
y <- as.numeric(df_test[i, idx_group_2])
res<-try(t.test(x = x, y = y, ...), silent=TRUE)
if(!inherits(res,"try-error")){
p_val[i] <- res$p.value;
if(log){
fold_change[i] <- 10^(mean(x, na.rm = TRUE) - mean(y, na.rm = TRUE))
#fold_change[i] <- 10^(res$estimate[1] - res$estimate[2])
} else {
fold_change[i] <- mean(x, na.rm = TRUE) / mean(y, na.rm = TRUE)
#fold_change[i] <- res$estimate[1] / res$estimate[2]
}
}
}
output$p_val <- p_val;
output$fold_change <- fold_change;
output
}
#' Compute the stoichiometry of interaction using the method described in ...
#'
#' @param df a data frame
#' @param idx_group_1 column indexes corresponding to the first group (bait background)
#' @param idx_group_2 column indexes corresponding to the second group (ctrl background)
#' @param idx_bait row index for the bait protein
#' @param Npep numeric vector containing the number of theoretically observable peptides for each protein
#' @param log logical, use the geometric mean instead of the arithmetic mean
#' @param substract_ctrl logical, substract ctrl intensities in the calculation of stoichiometries
#' @param use_mean_for_bait logical, average bait intensities across all conditions to compute interaction stoichiometries
#' @param idx_group_1_mean if \code{use_mean_for_bait} is TRUE, column indexes for the bait protein corresponding to the first group (bait background)
#' @param idx_group_2_mean if \code{use_mean_for_bait} is TRUE, column indexes for the bait protein corresponding to the second group (ctrl background)
#' @return A numeric vector of interaction stoichiometries
row_stoichio <- function(df,
idx_group_1,
idx_group_2,
idx_bait,
Npep,
log = TRUE,
substract_ctrl = TRUE,
use_mean_for_bait = TRUE,
idx_group_1_mean,
idx_group_2_mean){
stoichio <- rep(NaN,dim(df)[1])
intensity <- rep(NaN,dim(df)[1])
if(use_mean_for_bait){
xbait1<-df[idx_bait, idx_group_1_mean]
xbait2<-df[idx_bait, idx_group_2_mean]
}else{
xbait1<-df[idx_bait, idx_group_1]
xbait2<-df[idx_bait, idx_group_2]
}
for(i in (1:dim(df)[1]) ){
x1<-df[i, idx_group_1]
x2<-df[i, idx_group_2]
if (log) {
if (substract_ctrl){
stoichio[i] <- ( geom_mean(x1) - geom_mean(x2) ) / ( geom_mean(xbait1) - geom_mean(xbait2) )*Npep[idx_bait]/Npep[i]
} else {
stoichio[i] <- ( geom_mean(x1) ) / ( geom_mean(xbait1) )*Npep[idx_bait]/Npep[i]
}
intensity[i] <- geom_mean(x1)
} else {
if (substract_ctrl){
stoichio[i] <- ( mean(x1) - mean(x2) ) / ( mean(xbait1) - mean(xbait2) )*Npep[idx_bait]/Npep[i]
} else {
stoichio[i] <- ( mean(x1) ) / ( mean(xbait1) )*Npep[idx_bait]/Npep[i]
}
intensity[i] <- mean(x1)
}
}
return(list(stoichio = stoichio, intensity = intensity))
}
#' Construct an interactome by comparing bait and control background across experimental conditions
#'
#' @param Intensity a data frame of protein intensities (without replacement of missing values). columns are experimental samples and rows are proteins
#' @param Intensity_na_replaced a data frame of protein intensities (with replacement of missing values). columns are experimental samples and rows are proteins
#' @param conditions : data frame describing the conditions corresponding to the columns of the data frame \code{Intensity}
#' @param bait_gene_name : The gene name of the bait
#' @param Npep : vector containing the number of theoretically observable peptide per protein (same length as \code{dim(df)[1]})
#' @param Protein.IDs : vector containing protein IDs (same length as \code{dim(df)[1]})
#' @param names : vector containing protein names (same length as \code{dim(df)[1]})
#' @param bckg_bait : Name of the bait background as found in \code{conditions$bckg}
#' @param bckg_ctrl : Name of the control background as found in \code{conditions$bckg}
#' @param pool_background option to use all control background conditions as one control group for all conditions
#' @param log_test logical, perform t-test on log transform intensities
#' @param log_stoichio logical, use the geometric mean instead of the arithmetic mean to compute stoichiometries
#' @param substract_ctrl logical, substract ctrl intensities in the calculation of stoichiometries
#' @param use_mean_for_bait logical, average bait intensities across all conditions to compute interaction stoichiometries
#' @param by_conditions option to perform the comparison between bait and control group for each condition
#' @return an object of class \code{InteRactome}, i.e a list including the following elements :
#' @return \code{conditions} : a vector of experimental conditions.
#' @return \code{names} : a vector of names (by default gene names are used).
#' @return \code{p_val} : a list of vectors containing the p values associated to each experimental condition.
#' @return \code{fold_change} : a list of vectors containing the fold change associated to each experimental condition.
#' @return \code{...} : other variables.
analyse_interactome <- function(Intensity,
Intensity_na_replaced = Intensity,
conditions,
bait_gene_name,
Npep,
Protein.IDs,
names,
bckg_bait,
bckg_ctrl,
by_conditions = TRUE, pool_background = TRUE,
log_test = TRUE, log_stoichio = TRUE,
substract_ctrl = TRUE,
use_mean_for_bait = TRUE){
if( ! bckg_bait %in% conditions$bckg ){
stop(paste("Could not find",
bckg_bait,
"in available backgrounds.",
"Please choose parameter `bckg_bait` from",
unique(conditions$bckg)))
}
if( ! bckg_ctrl %in% conditions$bckg ){
stop(paste("Could not find",
bckg_ctrl,
"in available backgrounds.",
"Please choose parameter `bckg_ctrl` from",
unique(conditions$bckg)))
}
if(!by_conditions){
conds<-rep(-1,length(conditions$time))
}else{
conds<-conditions$time
}
cond<-unique(conds)
background <- conditions$bckg
p_val <- vector("list",length(cond))
fold_change <- vector("list",length(cond))
stoichio <- vector("list",length(cond))
intensity <- vector("list",length(cond))
names(p_val)<-as.character(cond)
names(fold_change)<-as.character(cond)
names(stoichio)<-as.character(cond)
names(intensity)<-as.character(cond)
replicates <- conditions$bio
ubio <- unique(replicates)
stoichio_bio <- vector("list",length(ubio))
intensity_bio <- vector("list",length(ubio))
names(stoichio_bio) <- as.character(ubio)
names(intensity_bio) <- as.character(ubio)
for (i_bio in 1:length(ubio)){
stoichio_bio[[i_bio]] <- vector("list",length(cond))
names(stoichio_bio[[i_bio]])<-as.character(cond)
intensity_bio[[i_bio]] <- vector("list",length(cond))
names(intensity_bio[[i_bio]])<-as.character(cond)
}
for( i in seq_along(cond) ){
idx_ctrl <- which( background == bckg_ctrl )
if(!pool_background) {
idx_ctrl <- which( background == bckg_ctrl & conds == cond[[i]] )
}
ttest <- row_ttest(Intensity_na_replaced,
idx_group_1 = which( background == bckg_bait & conds==cond[[i]]),
idx_group_2 = idx_ctrl,
log = log_test)
p_val[[i]] <- ttest$p_val;
fold_change[[i]] <- ttest$fold_change;
rst <- row_stoichio(Intensity_na_replaced,
idx_group_1 = which( background==bckg_bait & conds==cond[[i]] ),
idx_group_2 = idx_ctrl,
idx_bait = which(names == bait_gene_name),
Npep=Npep,
log = log_stoichio,
substract_ctrl = substract_ctrl,
use_mean_for_bait = use_mean_for_bait,
idx_group_1_mean = which( background == bckg_bait ),
idx_group_2_mean = which( background == bckg_ctrl )
)
stoichio[[i]] <- rst$stoichio
intensity[[i]] <- rst$intensity
for (i_bio in 1:length(ubio)){
idx_ctrl_bio <- which( background == bckg_ctrl & replicates == ubio[i_bio])
if(!pool_background) {
idx_ctrl_bio <- which( background == bckg_ctrl & conds == cond[[i]] & replicates == ubio[i_bio])
}
idx_bait_bio <- which( background==bckg_bait & conds==cond[[i]] & replicates==ubio[i_bio])
rst_bio <- row_stoichio(Intensity_na_replaced,
idx_group_1 = idx_bait_bio,
idx_group_2 = idx_ctrl_bio,
idx_bait = which(names == bait_gene_name),
Npep=Npep,
log = log_stoichio,
substract_ctrl = substract_ctrl,
use_mean_for_bait = use_mean_for_bait,
idx_group_1_mean = which( background == bckg_bait ),
idx_group_2_mean = which( background == bckg_ctrl )
)
stoichio_bio[[i_bio]][[i]] <- rst_bio$stoichio
intensity_bio[[i_bio]][[i]] <- rst_bio$intensity
}
}
res = list(bait = bait_gene_name,
bckg_bait = bckg_bait,
bckg_ctrl = bckg_ctrl,
conditions = as.character(cond),
replicates = as.character(ubio),
names = names,
Protein.IDs = Protein.IDs,
Npep = Npep,
p_val=p_val,
fold_change=fold_change,
stoichio=stoichio,
stoichio_bio = stoichio_bio,
norm_intensity=intensity,
norm_intensity_bio = intensity_bio,
data = list(Intensity_na_replaced = Intensity_na_replaced,
Intensity = Intensity,
conditions = conditions
),
params = list(by_conditions = by_conditions,
pool_background = pool_background,
log_test = log_test,
log_stoichio = log_stoichio,
substract_ctrl = substract_ctrl,
use_mean_for_bait = use_mean_for_bait
)
)
class(res) <- 'InteRactome'
return(res)
}
#' Performs a running average on a numeric vector
#' @param x a numeric vector
#' @param n integer, radius of the moving avergae (number of points extending on each side of the
#' center point on which the average is computed)
#' @return a smoothed numeric vector
moving_average <- function(x, n){
x_smooth <- x
for (i in 1:length(x)){
idx <- max(c(i-n, 1)):min(c(i+n, length(x)))
x_smooth[i] <- mean(x[idx], na.rm=TRUE)
}
return(x_smooth)
}
#' Smooth, using a moving average across conditions, selected variables of an \code{InteRactome}
#' @param res an \code{InteRactome}
#' @param n integer, radius of the moving avergae (number of points extending on each side of the
#' center point on which the average is computed)
#' @param order_conditions a numeric vector ordering conditions in \code{res$conditions}
#' @param var_smooth variables on which the moving average will be computed
#' @return an smoothed \code{InteRactome}
#' @export
smooth_interactome <- function( res, n = 1, order_conditions = NULL, var_smooth = c("fold_change","p_val") ){
res_smooth <- res
if(!is.null(order_conditions)){
idx_order <- order_conditions
} else {
idx_order <- 1:length(res$conditions)
}
for (var in var_smooth){
M <- do.call(cbind, res[[var]])
M_smooth <- M
for (i in 1:dim(M)[1] ){
M_smooth[i, idx_order] <- 10^(moving_average(log10(M[i, idx_order]), n))
}
for (i in 1:length(res$conditions)){
res_smooth[[var]][[res$conditions[i]]] <- M_smooth[ , i]
}
}
if( "norm_stoichio" %in% names(res) ){
res_smooth <- global_analysis(res_smooth)
}
return(res_smooth)
}
#' Normalize the log fold change by its standard deviation for each condition
#' @param res an \code{InteRactome}
#' @return an \code{InteRactome} with the additional variable \code{norm_log_fold_change}
#' @export
normalize_interactome <- function( res ){
res_norm <- res
res_norm[["norm_log_fold_change"]] <- vector("list", length = length(res$conditions))
for (i in 1:length(res$conditions)){
sd_norm <- sd(log(res[["fold_change"]][[res$conditions[i]]]))
res_norm[["norm_log_fold_change"]][[res$conditions[i]]] <- log(res[["fold_change"]][[res$conditions[i]]])/sd_norm
}
return(res_norm)
}
#' Merge different conditions from different interactomes into a single data.frame
#' @param res a list of \code{InteRactomes}
#' @param selected_conditions a character vector containing names of conditions to merge
#' @return a data.frame with columns bait, names, Protein.IDs, conditions, p_val,
#' @return fold_change and norm_log_fold_change
#' @export
merge_conditions <- function( res, selected_conditions = NULL){
df_merge <- NULL
if (inherits(res, "list")){
for (i in 1:length(res)){
if(inherits(res[[i]], "InteRactome")){
if(is.null(selected_conditions)){
conditions <- res[[i]]$conditions
} else {
conditions <- selected_conditions
}
res_int <- normalize_interactome(res[[i]])
if(! "id" %in% names(res_int)){
res_int$id <- res_int$bait
}
for (cond in conditions){
names <- res_int$names
df <- data.frame(
id = rep(res_int$id, length(names)),
bait = rep(res_int$bait, length(names)),
names = names,
Protein.IDs = res_int$Protein.IDs,
conditions = rep(cond, length(names)),
p_val = res_int$p_val[[cond]],
fold_change = res_int$fold_change[[cond]],
norm_log_fold_change = res_int$norm_log_fold_change[[cond]]
)
df_merge <- rbind(df_merge, df)
}
} else {
stop("Input is not of class 'InteRactome'")
}
}
} else{
if(inherits(res, "InteRactome")){
if(is.null(selected_conditions)){
conditions <- res$conditions
} else {
conditions <- selected_conditions
}
res_int <- normalize_interactome(res)
if(! "id" %in% names(res_int)){
res_int$id <- res_int$bait
}
for (cond in conditions){
names <- res_int$names
df <- data.frame(
id = rep(res_int$id, length(names)),
bait = rep(res_int$bait, length(names)),
names = names,
Protein.IDs = res_int$Protein.IDs,
conditions = rep(cond, length(names)),
p_val = res_int$p_val[[cond]],
fold_change = res_int$fold_change[[cond]],
norm_log_fold_change = res_int$norm_log_fold_change[[cond]]
)
df_merge <- rbind(df_merge, df)
}
} else {
stop("Input is not of class 'InteRactome'")
}
}
return(df_merge)
}
#' Compute the FDR from the asymmetry of the volcano plot
#' @description Compute the FDR (False Discovery Rate) using the asymmetry of the volcano plot.
#' It uses the fonction f(x) = c / (|x|-x0) with x = log10(fold_change), y=-log10(p_value) by default.
#' Otherwise, custom x and y vectors can be provided.
#' Points with x>x0 and y>f(x) are taken as true positive (TP)
#' Points with x<x0 and y>f(x) are taken as false positive (FP)
#' For a given set of parameters (c,x0), the FDR is given by TP/(TP+FP)
#' @param df : a data.frame containing columns \code{p_val} and \code{fold_change}
#' @param x : numeric vector of x values. Only used if \code{df} is NULL.
#' @param y : numeric vector of y values. Only used if \code{df} is NULL.
#' @param c : numeric vector
#' @param x0 : numeric vector
#' @return a data.frame with a extra column \code{FDR} if \code{df} is not NULL. A vector of FDR values otherwise.
#' @return If parameters \code{c} and \code{x0} are vectors, \code{FDR} is taken as the minimum FDR value across all sets of parameters
#' @import utils
#' @export
#' @examples
#' #' #load data :
#' data("proteinGroups_Cbl")
#' #Run InteRact with default parameters
#' res <- InteRact(proteinGroups_Cbl, bait_gene_name = "Cbl")
#' df_merge <- merge_conditions(res)
#' df_FDR <- compute_FDR_from_asymmetry(df_merge)
#' Interactome <- append_FDR(res, cbind(df_merge, FDR = df_FDR$FDR) )
compute_FDR_from_asymmetry <- function( df = NULL,
x = NULL,
y = NULL,
c = seq(from = 0, to = 10, by = 0.1),
x0 = seq(from = 0, to =10, by = 0.1)){
if(!is.null(df)){
df_int<-df
}else if(is.null(x) & is.null(y)){
stop("No input data provided.")
}
if(is.null(x)){
if("fold_change" %in% names(df)){
x <- log10(df$fold_change)
}else{
stop("Variable 'fold_change' not available")
}
}
if(is.null(y)){
if("p_val" %in% names(df)){
y <- -log10(df$p_val)
}else{
stop("Variable 'p_val' not available")
}
}
if(length(x) != length(y)){
stop("Lengths of x and y differ.")
}
FDR <- rep(1, length(x))
cat("Compute FDR...\n")
pb <- txtProgressBar(min = 0, max = length(c)*length(x0), style = 3)
count <- 0
c_tot <- NULL
x0_tot <- NULL
TP_tot <- NULL
FP_tot <- NULL
FDR_tot <- NULL
for (i in 1:length(c)){
for (j in 1:length(x0)){
idx_TP <- which(x>x0[j] & y>c[i]/(x-x0[j]) )
idx_FP <- which(x<(-x0[j]) & y>c[i]/(-x-x0[j]) )
TP_int <- length(idx_TP)
FP_int <- length(idx_FP)
FDR_int <- FP_int/(FP_int + TP_int)
FDR[idx_TP[ FDR[idx_TP] >= FDR_int ]] <- FDR_int
c_tot <- c(c_tot, c[i])
x0_tot <- c(x0_tot, x0[j])
TP_tot <- c(TP_tot, TP_int)
FP_tot <- c(FP_tot, FP_int)
FDR_tot <- c(FDR_tot, FDR_int)
count <- count +1
setTxtProgressBar(pb, count)
}
}
df_params = data.frame(c = c_tot,
x0 = x0_tot,
TP = TP_tot,
FP = FP_tot,
FDR = FDR_tot)
close(pb)
uFDR <- unique(df_params$FDR)
uFDR <- uFDR[order(uFDR)]
idx_max <- rep(NA, length(uFDR))
for(i in 1:length(uFDR)){
if(!is.na(uFDR[i])){
idx_FDR <- which(df_params$FDR <= uFDR[i] & !is.na(df_params$FDR) )
idx_max[i] <- idx_FDR[which.max(df_params$TP[idx_FDR])]
}
}
df_max_TP <- data.frame( FDR = uFDR,
c = df_params$c[idx_max],
x0 = df_params$x0[idx_max],
TP = df_params$TP[idx_max],
FP = df_params$FP[idx_max])
return(list(FDR = FDR,
parameters = df_params,
max_TP_parameters = df_max_TP ))
}
#' Append a FDR column to an \code{InteRactome}
#' @param res an \code{InteRactome}
#' @param df a data.frame containing (at least) columns 'id', 'bait',
#' 'names', 'FDR' and 'conditions'
#' @return an \code{InteRactome}
#' @export
#' @examples
#' #' #load data :
#' data("proteinGroups_Cbl")
#' #Run InteRact with default parameters
#' res <- InteRact(proteinGroups_Cbl, bait_gene_name = "Cbl")
#' df_merge <- merge_conditions(res)
#' df_FDR <- compute_FDR_from_asymmetry(df_merge)
#' Interactome <- append_FDR(res, cbind(df_merge, FDR = df_FDR$FDR) )
append_FDR <- function(res, df){
res_int <- res
df_int <- df
if("id" %in% names(df_int) & "id" %in% names(res)){
df_int <- df_int[which(df_int$id == res$id), ]
} else if ("bait" %in% names(df_int) & "bait" %in% names(res)){
df_int <- df_int[which(df_int$bait == res$bait), ]
} else {
stop("Could not identify InteRactome using 'id' or 'bait' in input data.frame")
}
FDR <- list()
for (cond in res$conditions){
df_cond <- df_int[df_int$conditions == cond, ]
idx_match <- match(res$names, df_cond$names)
FDR[[cond]] <- df_cond$FDR[idx_match]
}
res_int$FDR <-FDR
res_int <- global_analysis(res_int)
return(res_int)
}
#' Filter conditions from an interactome
#' @param res an \code{InteRactome}
#' @param conditions_to_filter_out character vector with names of conditions to filter out
#' @return an \code{InteRactome}
#' @export
filter_conditions <- function( res, conditions_to_filter_out ){
idx_conditions <- which(is.element(res$conditions, conditions_to_filter_out ))
res_filter<-res
res_filter$conditions <- res$conditions[-idx_conditions]
for(i in 1:length(res)){
if( setequal( names(res[[i]]), res$conditions ) ){
res_filter[[i]] <- res[[i]][-idx_conditions]
}
}
if( "norm_stoichio" %in% names(res) ){
res_filter <- global_analysis(res_filter)
}
return(res_filter)
}
#' Compute the mean \code{InteRactome} (on variables 'p_val', 'fold_cahnge', 'stoichio' and 'stoichio_bio')
#' from a list of \code{InteRactomes}
#' @param res a list of \code{InteRactome}
#' @param log logical, use the geometric mean instead of the arithmetic mean
#' @param na.rm logical, remove NA values
#' @export
mean_analysis <- function(res, log = TRUE, na.rm = TRUE ){
# Performs the average of different interactomes
cat(paste("Averaging", length(res) ,"interactomes\n",sep=" ") )
res_mean = res[[1]];
for ( cond in seq_along(res_mean$conditions) ){
p_val=vector("list",length(res));
fold_change=vector("list",length(res));
stoichio = vector("list",length(res));
for ( i in seq_along(res)){
p_val[[i]] <- res[[i]]$p_val[[cond]]
fold_change[[i]] <- res[[i]]$fold_change[[cond]]
stoichio[[i]] <- res[[i]]$stoichio[[cond]]
}
if (log){
res_mean$p_val[[cond]] <- 10^( rowMeans( log10(do.call(cbind, p_val)), na.rm =na.rm) )
res_mean$fold_change[[cond]] <- 10^( rowMeans( log10(do.call(cbind, fold_change)), na.rm =na.rm) )
res_mean$stoichio[[cond]] <- 10^ (rowMeans( log10(do.call(cbind, stoichio)), na.rm =na.rm) )
} else {
res_mean$p_val[[cond]] <- rowMeans( do.call(cbind, p_val), na.rm =na.rm)
res_mean$fold_change[[cond]] <- rowMeans( do.call(cbind, fold_change), na.rm =na.rm)
res_mean$stoichio[[cond]] <- rowMeans( do.call(cbind, stoichio), na.rm =na.rm)
}
for (bio in res_mean$replicates){
stoichio_bio <- vector("list",length(res));
for ( i in seq_along(res)){
stoichio_bio[[i]] <- res[[i]]$stoichio_bio[[bio]][[cond]]
}
if (log){
res_mean$stoichio_bio[[bio]][[cond]] <- 10^( rowMeans( log10(do.call(cbind, stoichio_bio)), na.rm =na.rm) )
} else {
res_mean$stoichio_bio[[bio]][[cond]] <- rowMeans( do.call(cbind, stoichio_bio), na.rm =na.rm)
}
}
}
res_mean
}
#' Adds global variables by analysing values
#' across all conditions of an \code{InteRactome}
#' @param res an \code{InteRactome}
#' @return an \code{InteRactome} with global varaiables
#' @export
global_analysis <- function( res ){
res_int <- res;
max_stoichio<- apply( do.call(cbind, res$stoichio), 1, function(x) ifelse( sum(!is.na(x))>0, max(x,na.rm=TRUE),NaN) )
max_stoichio[!is.finite(max_stoichio)]<-NaN
res_int$max_stoichio <- max_stoichio
matrix_fold_change<-do.call(cbind, res$fold_change)
res_int$max_fold_change <- apply(matrix_fold_change , 1, function(x) ifelse( sum(!is.na(x))>0, max(x,na.rm=TRUE),NaN) )
matrix_p_val<-do.call(cbind, res$p_val)
matrix_p_val[matrix_fold_change<1]<-NA # keep only p_values corresponding to a fold_change >1
res_int$min_p_val <- apply( matrix_p_val, 1, function(x) ifelse( sum(!is.na(x))>0, min(x,na.rm=TRUE),NaN) )
if( "FDR" %in% names(res) ){
matrix_FDR<-do.call(cbind, res$FDR)
res_int$min_FDR <- apply( matrix_FDR, 1, function(x) ifelse( sum(!is.na(x))>0, min(x,na.rm=TRUE),NaN) )
}
norm_stoichio = vector("list",length(res$conditions))
names(norm_stoichio)<-res$conditions
for ( i in seq_along(res$conditions) ){
norm_stoichio[[i]] <- res$stoichio[[i]] / max_stoichio
}
res_int$norm_stoichio <- norm_stoichio
output=res_int
}
#' Add protein abundance to an \code{InteRactome}
#' @description Add protein abundance to an \code{InteRactome}. For multiple identifiers,
#' the abundance of the first match in the proteome dataset is returned.
#' @param res an \code{InteRactome}
#' @param ... Parameters passed to \code{proteinRuler::map_proteome()}
#' @importFrom proteinRuler map_proteome
#' @export
merge_proteome <- function( res, ...){
res_int <- res
ibait = which(res$names == res$bait)
res_int <- map_proteome(df = res_int, ...)
res_int$stoch_abundance = res_int$Copy_Number / res_int$Copy_Number[ibait]
res_int
}
#' Identify specific interactors in an \code{InteRactome}
#' @param res an \code{InteRactome}
#' @param var_p_val name of the p-value variable
#' @param p_val_thresh p-value threshold
#' @param fold_change_thresh fold-change threshold
#' @param conditions Select conditions used to identify interactors
#' @param n_success_min minimal number of conditions in which the interactor
#' must pass the the p-value and the fold-change thresholds
#' @param consecutive_success logical, impose that the interactor must pass selection thresholds
#' in \code{n_success_min} consecutive conditions.
#' @param p_val_breaks numeric vector to discretize p-values.
#' Passed to function \code{order_interactome()}
#' @param var_min_p_val name of the p-value variable used to order the interactome.
#' Passed to function \code{order_interactome()}
#' @param ... additionnal paramters passed to function \code{order_interactome()}
#' @return an \code{InteRactome} with extra variables \code{is_interactor},
#' \code{n_success} and \code{interactor}
#' @export
identify_interactors <- function(res,
var_p_val = "p_val",
p_val_thresh = 0.05,
fold_change_thresh = 2,
conditions = res$conditions,
n_success_min = 1,
consecutive_success = FALSE,
p_val_breaks = c(1, min(1,4*p_val_thresh), min(1, 2*p_val_thresh), min(1, p_val_thresh)),
var_min_p_val = paste("min_",var_p_val,sep=""),
...){
res_int <- res
n_cond <- length(conditions)
is_interactor <- rep(0, length(res$names))
n_success <- rep(0, length(res$names))
M <- do.call(cbind, res[[var_p_val]][conditions]) <= p_val_thresh & do.call(cbind, res[["fold_change"]][conditions]) >= fold_change_thresh
for (i in 1:length(res$names)){
M_test <- M[i, ]
n_success[i] <- sum( M_test, na.rm = TRUE )
if (consecutive_success & n_success_min > 1){
for (k in 1:(n_success_min-1)){
idx_mod <- ((1:n_cond) + k) %% n_cond
idx_mod[ idx_mod == 0] <- n_cond
M_test <- rbind(M_test, M[i, idx_mod])
}
is_interactor[i] <- sum( colMeans(M_test, na.rm = TRUE ) == 1, na.rm = TRUE) > 0
} else {
is_interactor[i] <- (!is.na(n_success[i]) & n_success[i] >= n_success_min)
}
}
res_int$is_interactor <- is_interactor
res_int$n_success <- n_success
res_int$interactor <- res$names[is_interactor>0]
res_int <- order_interactome(res_int,
idx = NULL,
p_val_breaks = p_val_breaks,
var_min_p_val = var_min_p_val,
...)
res_int$params$var_p_val <- var_p_val
res_int$params$p_val_thresh <- p_val_thresh
res_int$params$fold_change_thresh <- fold_change_thresh
res_int$params$conditions <- conditions
res_int$params$n_success_min <- n_success_min
res_int$params$consecutive_success <- consecutive_success
return(res_int)
}
#' Discretize values in a vector based on a finite set of values
#' @param x numeric vector
#' @param breaks numeric vector. Set of discrete values on which \code{x} values will be mapped.
#' Non-mapped values will be set to NA
#' @param decreasing_order logical. Map \code{beaks} values from the greatest to the smallest
#' @return a numeric vector
#' @export
discretize_values <- function( x, breaks = c(1,0.1,0.05,0.01), decreasing_order = TRUE){
breaks_order <- breaks[order(breaks, decreasing=decreasing_order)]
x_discrete <- rep(NA, length(x));
for( i in 1:length(breaks_order) ){
x_discrete[ x <= breaks_order[i] ] <- breaks_order[i];
}
return(x_discrete)
}
#' Order proteins within an \code{InteRactome}
#' @param res an \code{InteRactome}
#' @param idx indices used to order proteins. Overrides ordering using \code{var_p_val}, \code{p_val_breaks} and \code{var_order}
#' @param var_min_p_val name of the p-value variable
#' @param p_val_breaks numeric vector to discretize p-value
#' @param var_order Variable used to order interactors for each p-value
#' @param bait_first logical, puts bait in first position
#' @return an \code{InteRactome}
#' @export
order_interactome <- function(res, idx = NULL,
var_min_p_val = "min_p_val",
p_val_breaks = c(1,0.1,0.05,0.01),
var_order = "max_stoichio",
bait_first = TRUE){
min_p_val_discrete <- discretize_values(res[[var_min_p_val]], breaks = p_val_breaks, decreasing_order = TRUE)
if(!is.null(idx)){
idx_order <- idx
if(length(idx_order)!=length(res$names)){
stop("Vector of ordering indexes does not have the proper length")
}
} else if( "interactor" %in% names(res)){
Ndetect<-length(res$interactor)
idx_order<-order(res$is_interactor, 1/min_p_val_discrete, res[[var_order]], decreasing = TRUE)
} else{
stop("idx not provided")
}
if(bait_first){
idx_bait <- which(res$names == res$bait)
idx_order_bait <- which(idx_order == idx_bait)
idx_order <- c(idx_bait, idx_order[-idx_order_bait])
}
res_order<-res;
for( var in setdiff( names(res), c("bait", "id", "bckg_bait", "bckg_ctrl","conditions",
"interactor", "replicates", "data", "params") ) ){
if(length(res[[var]]) != 0){
if(length(res[[var]]) == length(res$names)){
res_order[[var]] <- res[[var]][idx_order]
}
else{
names_var <- names(res[[var]])
if( all(names_var %in% res$conditions) ){
for(i in 1:length(names_var) ){
res_order[[var]][[i]] <- res[[var]][[i]][idx_order]
}
}
else{
for(i in 1:length(names_var) ){
names_var_2 <- names(res[[var]][[i]])
for(j in 1:length(names_var_2) ){
if(length(res[[var]][[i]][[j]]) == length(res$names)){
res_order[[var]][[i]][[j]] <- res[[var]][[i]][[j]][idx_order]
} else if (dim(res[[var]][[i]][[j]])[1] == length(res$names)){
res_order[[var]][[i]][[j]] <- res[[var]][[i]][[j]][idx_order, ]
} else {
warning(paste("Couldn't order ",var, sep=""))
}
}
}
}
}
}
}
output = res_order
}
#' Compute correlation in protein recruitment
#' @description Compute correlation in protein recruitment from interaction stoichiometries
#' computed across all conditions for each biological replicate. Pearson correlations are used.
#' @param res an \code{InteRactome} or a data.frame
#' @param idx indexes of the set of proteins for which correlations will be computed
#' @param log logical, use log-transformed stoichiometries
#' @param n_edges_max integer, limits the number of edges per node
#' @param strict logical, if TRUE, ensures a strict upper bound on the maximal degree
#' @param Rmin minimum absolute value of the Pearson R coefficient
#' @param n_min minimum number of values used to compute correlation
#' @param P_max maximum p-value associated with the correlation
#' @return a data.frame with protein correlation information
#' (correlation coefficient in column 'r_corr' and associated p-value in column 'p-corr')
#' @importFrom Hmisc rcorr
#' @importFrom stats p.adjust
#' @export
compute_correlations <- function(res, idx = NULL, log = FALSE, n_edges_max = NULL, strict = TRUE, Rmin = 0, n_min = 3, P_max = 1){
# build matrix on which correlations will be computed
if(inherits(res, "InteRactome")){
idx_selected <- 1:length(res$names)
if (!is.null(idx)) {
idx_selected <- idx
}
idx_bait <- which(res$names == res$bait)
idx_selected <- setdiff(idx_selected, idx_bait)
names <- res$names[idx_selected]
df<-NULL
names_df <- NULL
for (bio in res$replicates){
for (cond in res$conditions){
if(log){
df <- cbind(df, log10(res$stoichio_bio[[bio]][[cond]][idx_selected]))
}else{
df <- cbind(df, res$stoichio_bio[[bio]][[cond]][idx_selected])
}
}
}
}else{
df <- res
names <- rownames(df)
if(is.null(names)){
names <- as.character(1:dim(df)[1])
}
}
M <- as.matrix(df)
row.names(M) <- names
R <- Hmisc::rcorr(t(M))
n <- dim(R$r)[1]
M_transpose <- matrix(0, n, n) # used to avoid duplicated edges
r_corr <- rep(NA, n*(n-1)/2)
p_corr <- rep(NA, n*(n-1)/2)
n_corr <- rep(NA, n*(n-1)/2)
name_1 <- rep("", n*(n-1)/2)
name_2 <- rep("", n*(n-1)/2)
count<-0
for (i in 1:(n-1)){
idx <- order(R$r[i, ], decreasing = TRUE)
for (j in setdiff(1:n, i) ){
if(M_transpose[j, i] == 0){
count<-count+1
r_corr[count] <- R$r[i, j]
p_corr[count] <- R$P[i, j]
n_corr[count] <- R$n[i, j]
name_1[count] <- names[i]
name_2[count] <- names[j]
M_transpose[i, j] <- 1
}
}
}
df_corr <- data.frame(name_1 = name_1,
name_2 = name_2,
r_corr = r_corr,
p_corr = p_corr,
n_corr = n_corr)
if(inherits(res, "InteRactome")){
df_corr$bait <- res$bait
}
#Filter edges based on correlation coefficient
df_corr <- df_corr[!is.na(df_corr$r_corr) &
!is.na(df_corr$p_corr) &
!is.na(df_corr$n_corr) &
df_corr$p_corr <= P_max &
df_corr$n_corr > n_min &
abs(df_corr$r_corr) >= Rmin, ]
df_corr$p_corr_bonferroni <- stats::p.adjust(df_corr$p_corr, method="bonferroni")
df_corr$p_corr_fdr <- stats::p.adjust(df_corr$p_corr, method="fdr")
df_corr <- df_corr[order(df_corr$r_corr, decreasing = TRUE), ]
#limit the number of edges per node
df_corr <- restrict_network_degree(df_corr = df_corr, source = "name_1", target = "name_2", nmax = n_edges_max , strict = strict)
return(df_corr)
}
#' Limit the number of edges per node within a network
#' @description Limit the number of edges per node within a network
#' @param df_corr a data.frame
#' @param var_order column in \code{df_corr} used to order network edges before filtering
#' @param decreasing logical, use decreasing order to order network edges according to column \code{var_order}
#' @param source column with source nodes
#' @param target column with target nodes
#' @param nmax integer, limits the number of edges per node
#' @param strict logical, if TRUE, ensures a strict upper bound on the maximal degree
#' @return a network with a maximum degree lower than \code{nmax}
#' @export
restrict_network_degree <- function(df_corr, var_order = "r_corr", decreasing = TRUE, source = "name_1", target = "name_2", nmax = NULL, strict = TRUE){
df_corr <- df_corr[order(df_corr[[var_order]], decreasing = decreasing), ]
nmax_int <- nmax
if(!is.null(nmax)){
if(nmax <= 0){
nmax_int <- NULL
}
}
delete_edge <- rep(FALSE, dim(df_corr)[1])
df_corr[[source]] <- as.character(df_corr[[source]])
df_corr[[target]] <- as.character(df_corr[[target]])
u_nodes <- unique(c(df_corr[[source]], df_corr[[target]]))
if(!is.null(nmax_int)){
n_edges <- rep(0, length(u_nodes))
names(n_edges) <- u_nodes
for(i in 1:dim(df_corr)[1]){
n_edges[[df_corr[[source]][i]]] <- n_edges[[df_corr[[source]][i]]] + 1
n_edges[[df_corr[[target]][i]]] <- n_edges[[df_corr[[target]][i]]] + 1
if(strict){
if(n_edges[[df_corr[[source]][i]]] > nmax_int | n_edges[[df_corr[[target]][i]]] > nmax_int){
delete_edge[i] <- TRUE
}
}else{
if(n_edges[[df_corr[[source]][i]]] > nmax_int & n_edges[[df_corr[[target]][i]]] > nmax_int){
delete_edge[i] <- TRUE
}
}
}
}
return( df_corr[!delete_edge, ] )
}
#' Create a summary table for an \code{InteRactome}
#' @param res an \code{InteRactome}
#' @param add_columns names of the variables to display in the summary table
#' @return a data.frame
#' @export
summary_table <- function(res, add_columns = names(res) ){
columns <- unique( c("names", add_columns) )
columns <- setdiff(columns, c("bait",
"id",
"bckg_bait",
"bckg_ctrl",
"conditions",
"interactor",
"replicates",
"intensity_bait",
"intensity_ctrl",
"intensity_na_bait",
"intensity_na_ctrl",
"data",
"params"
))
if(! "id" %in% names(res) ){
res$id <- res$bait
}
df<-data.frame( id=rep(res$id, length(res$names)),
bait=rep(res$bait, length(res$names)) )
names_df<-c("id", "bait")
idx<-c(1,1)
for( var in columns ){
if(length(res[[var]])!=0){
if(length(res[[var]]) == length(res$names)){
idx<-c(idx,1)
names_df<-c(names_df, var)
df<-cbind(df,res[[var]])
}
else{
names_var <- names(res[[var]])
if( length(setdiff(names_var, res$conditions))==0 ){
for(i in 1:length(names_var) ){
idx<-c(idx,NaN)
names_df<-c(names_df, paste(var,"_",names_var[i],sep=""))
df<-cbind(df,res[[var]][[i]])
}
}
else{
for(i in 1:length(names_var) ){
names_var_2 <- names(res[[var]][[i]])
if( length(setdiff(names_var_2, res$conditions)) == 0 ){
for(j in 1:length(names_var_2) ){
idx<-c(idx,NaN)
names_df<-c(names_df, paste(var,"_",names(res[[var]])[i],"_",names_var_2[j],sep=""))
df<-cbind(df,res[[var]][[i]][[j]])
}
}
}
}
}
}
}
names(df)<-names_df
df<-df[,order(idx)]
return(df)
}
#' Create a summary for selected proteins in an \code{InteRactome}
#' @param res an \code{InteRactome}
#' @param name protein name
#' @param idx indexes of the proteins to display. Overrides \code{name}
#' @return a data.frame
#' @export
summary_protein <- function(res, name, idx = NULL){
sum_table <- summary_table(res)
idx <- which(res$names == name)
if(length(idx)>0){
return( t(sum_table[idx, ]) )
}else{
warning("Could not find name in InteRactome")
return(NULL)
}
}
#' Compare stoichiometries or intensities between conditions using a t-test
#' @param res an \code{Interactome}
#' @param var_comp variable used to compare values across conditions.
#' Can be either "stoichio_bio" or "norm_intensity_bio"
#' @param names names selected
#' @param replicates names of the biogical replicates used to compare stoichiometries
#' @param ref_condition reference condition
#' @param test_conditions set of conditions to be compared to \code{ref_condition}
#' @param p_val_thresh Threshold for the t-test p-value
#' @param fold_change_thresh Threshold for the t-test fold-change
#' @param ... Additionnal parameters passed to \code{row_ttest()}
#' @export
compare_stoichio <- function(res,
var_comp = "stoichio_bio",
names = res$names,
replicates = res$replicates,
ref_condition = res$conditions[1],
test_conditions = setdiff(res$conditions, ref_condition),
p_val_thresh = 0.05,
fold_change_thresh = 3,
...){
if(!is.null(names)){
idx_match <- match(names, res$names)
}
p_val <- vector("list", length(test_conditions))
fold_change <- vector("list", length(test_conditions))
names(p_val) <- test_conditions
names(fold_change) <- test_conditions
for(i in 1:length(test_conditions)){
conditions <- c(test_conditions[i], ref_condition)
cond_tot <- NULL
df_tot <- NULL
for ( bio in replicates){
stoichio <- as.data.frame(do.call(cbind, res[[var_comp]][[bio]]))[idx_match , conditions]
df <- data.frame(stoichio = stoichio)
if(!is.null(df_tot)){
df_tot <- cbind(df_tot, df)
cond_tot <- c(cond_tot, conditions)
}else{
df_tot <- df
cond_tot <- conditions
}
}
test_res <- row_ttest(df_tot,
idx_group_1 = which(cond_tot == conditions[1]),
idx_group_2 = which(cond_tot == conditions[2]),
...)
p_val[[i]] <- test_res$p_val
fold_change[[i]] <- test_res$fold_change
}
M_p_val <- do.call(cbind, p_val)
M_fold_change <- do.call(cbind, fold_change)
regulation <- rep("not_regulated", length(names) )
for(i in 1:length(names)){
idx_pval <- which( M_p_val[i, ] <= p_val_thresh )
if(length(idx_pval)>0){
idx_max <- idx_pval[ which.max( abs(log10(M_fold_change[i, idx_pval]))) ]
if(M_fold_change[i, idx_max] > fold_change_thresh){
regulation[i] <- "induced"
}
if(M_fold_change[i, idx_max] <= 1/fold_change_thresh){
regulation[i] <- "repressed"
}
}
}
return(list(p_val = p_val, fold_change = fold_change, regulation = regulation))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.