Introduction

PhosMap is a comprehensive R package for analyzing quantitative phosphoproteomics data, which provides mutltiple functions for users as follow: - (1) clustering: principal component analysis (PCA) and t-Distributed Stochastic Neighbor Embedding (t-SNE); - (2) differential expression analysis; - (3) time course analysis; - (4) kinase activity prediction to find activated/deactivated kinases from the kinase-substrate database - (5) phosphorylation motif enrichment analysis to provide clues for finding candidate kinases that are not present in the database; - (6) and data visualization.

Loading data

To test the efficacy of PhosMap and help users get started quickly, we collected a dataset including 39 phosphoproteomics and 32 proteomics raw files deposited in the ProteomeXchange Consortium (Ressa, et al., 2018).The partial key intermediate results were provided for uers to master PhosMap. We have embeded intermediate results from demo into PhosMap for help users get started.

# load PhosMap
library('PhosMap')
# load intermediate results from https://github.com/ecnuzdd/PhosMap_datasets
# manully download from https://github.com/ecnuzdd/PhosMap_datasets/blob/master/data/BRAFi.RData
load("BRAFi.RData")

# background_df A data frame for motif enrichment analysis as background
# combined_df_with_mapped_gene_symbol Get a data frame mapped GI number to Gene Symbol
# data_frame_normalization_with_control_no_pair A data frame containning phosphoproteomics data normalized by proteomics data.
# foreground_df A data frame for motif enrichment analysis as foreground.
# fuzzy_input_df A data frame for time course analysis as input.
# group A factor for experiment group information.
# merge_df_with_phospho_peptides A merged phosphoproteomics data frame based on peptides files (unique ID).
# motif_group_m_ratio_df_mat A matrix for motif profile.
# phospho_data_filtering_STY_and_normalization A phosphoproteomics data frame after normalization and filtering.
# profiling_data_normalized A proteomics data frame after normalization and filtering.
# summary_df_of_unique_proteins_with_sites A data frame that phosphorylation sites had been mapping to protein sequence and eliminated redundancy.

Data pre-processing

An intact data pre-processing procedure of phosphoproteomics data covered three main steps: merging input files after quality control, mapping phosphorylation sites (p-sites) to the corresponding protein sequence and data normalization.

Merging input files after quality control

'Phosphoproteomics data' and 'The phosphoproteomics experimental design' are required as input. For p-sites detected by Mascot, PhosMap could provide confidence probability of p-sites extracted from Mascot xml file. For p-sites detected by other software, a two column table including their corresponding sequences and confidence probability was indispensable. Then quality control at peptide and site levels for each experiment was performed.

BASE_DIR <- getwd() # working directory
BASE_DIR <- normalizePath(BASE_DIR)
phosphorylation_exp_design_info_file_path <- normalizePath(file.path(BASE_DIR, 'phosphorylation_exp_design_info.txt'))
phosphorylation_peptide_dir <- normalizePath(file.path(BASE_DIR, 'phosphorylation_peptide_txt'))

if(FALSE){
  # if you have xml files from mascot results, you can run the cmd to parser them to text files.
  mascot_xml_dir <- normalizePath(file.path(BASE_DIR, 'mascot_xml'))
  mascot_txt_dir <- normalizePath(file.path(BASE_DIR, 'mascot_txt'))
  extract_psites_score(phosphorylation_exp_design_info_file_path, mascot_xml_dir, mascot_txt_dir)

  # Based on above-mentioned text files from Mascot results, 
  # the following cmd can generate CSV files of phosphorylation sites with confidence score.
  psites_score_dir <- normalizePath(file.path(BASE_DIR, 'psites_score_txt'))
  generate_psites_score_file(mascot_txt_dir, phosphorylation_peptide_dir, psites_score_dir)
}
# Merge phosphoproteomics data based on peptides files (unique ID).
# If qc = TRUE, considering confidence score of phosphorylation sites.
# A merged phosphoproteomics data frame based on peptides files (unique ID).
merge_df_with_phospho_peptides <- pre_process_filter_psites(
  phosphorylation_peptide_dir, 
  psites_score_dir, 
  phosphorylation_exp_design_info_file_path, 
  qc = TRUE,
  min_score = 20,
  min_FDR = 0.01
)

Mapping phosphorylation sites (p-sites) to the corresponding protein sequence

# Get a data frame mapped GI number to Gene Symbol.
# system.time({
#   combinated_df_with_mapped_gene_symbol = get_combined_data_frame(
#     merge_df_with_phospho_peptides, species = 'human', id_type = 'RefSeq_Protein_GI'
#   )
# })
head(combined_df_with_mapped_gene_symbol)
# Assign psites to protein sequence.
# Unique ID: protein_gi + phosphorylation site in protein sequence.
# system.time({
#   summary_df_of_unique_proteins_with_sites = get_summary_with_unique_sites(
#     combinated_df_with_mapped_gene_symbol, species = 'human', fasta_type = 'refseq'
#   )
# })
head(summary_df_of_unique_proteins_with_sites)

Data normalization

PhosMap provides two kinds of normalizations. 1. PhosMap allowed for a total sum scaling normalization.

# Imputation with the next order of magnitude of the minimum except for zero.
# Filtering data only including phosphorylation site.
phospho_data_filtering_STY_and_normalization_list <- get_normalized_data_of_psites(
  summary_df_of_unique_proteins_with_sites, 
  phosphorylation_exp_design_info_file_path, 
  topN = NA, mod_types = c('S', 'T', 'Y')
)
phospho_data_filtering_STY <- phospho_data_filtering_STY_and_normalization_list$ptypes_area_df_with_id
phospho_data_filtering_STY_and_normalization <- phospho_data_filtering_STY_and_normalization_list$ptypes_fot5_df_with_id
head(phospho_data_filtering_STY_and_normalization)
  1. If having matched proteomics data with phosphoproteomics, PhosMap allowed for normalizing phosphoproteomics data based on proteomics data.
# Based on phospho_data_filtering_STY_and_normalization
ID <- paste(phospho_data_filtering_STY_and_normalization$GeneSymbol,
           phospho_data_filtering_STY_and_normalization$AA_in_protein,
           sep = '_')
Value <- phospho_data_filtering_STY_and_normalization[,-seq(1,6)]
phospho_data <- data.frame(ID, Value)
phospho_data_rownames <- paste(phospho_data_filtering_STY_and_normalization$GI,
                              phospho_data_filtering_STY_and_normalization$GeneSymbol,
                              phospho_data_filtering_STY_and_normalization$AA_in_protein,
                              sep = '_')
rownames(phospho_data) <- phospho_data_rownames
# Further normalize phosphoproteomics data based on proteomics data
# The configurations of function see help document.
data_frame_normalization_with_control_no_pair <- normalize_phos_data_to_profiling(
  phospho_data, profiling_data_normalized,
  phosphorylation_exp_design_info_file_path,
  profiling_exp_design_info_file_path,
  control_label = '0', 
  pair_flag = FALSE
)
head(data_frame_normalization_with_control_no_pair)

Data analysis

PhosMap incorporated four analysis modules, including clustering and differential expression analysis, time course analysis, kinase-substrate enrichment analysis to find activated/deactivated kinases and motif enrichment analysis.

Clustering and differential expression analysis

# Clustering: t-SNE or PCA
expr_data_frame <- data_frame_normalization_with_control_no_pair
# t-SNE using all experiments
visualization_with_simple_tsne(expr_data_frame, group)
- PCA
expr_ID <- as.vector(expr_data_frame[,1])
expr_Valule <- expr_data_frame[,-1]
expr_Valule_mean <- NULL
expr_Valule_row <- nrow(expr_Valule)
for(i in 1:expr_Valule_row){
  x <- as.vector(unlist(expr_Valule[i,]))
  x_m <- tapply(x, group, mean)
  expr_Valule_mean <- rbind(expr_Valule_mean, x_m)
}
group_levels = levels(group)
colnames(expr_Valule_mean) <- group_levels
expr_df <- data.frame(expr_ID, expr_Valule_mean)
# PCA using mean value in group for comparison with original literature
visualization_with_simple_pca(expr_df)
# Differently expressed Proteins/Genes analysis
# t2 vs t0
expr_data_frame <- data_frame_normalization_with_control_no_pair[,1:17] # phosphoproteomics data normalized by proteomics data
# select phosphorylation sites with greater variation
expr_data_frame_var <- apply(expr_data_frame, 1, function(x){
  var(x[-1])
})
index_of_kept <- which(expr_data_frame_var>1)
expr_data_frame <- expr_data_frame[index_of_kept,]

# group information (t0 vs t2)
deps_group_levels <- c('t0', 't2')
deps_group <- factor(as.vector(group)[1:16], levels = deps_group_levels)
- limma
# (1) limma
limma_results_df <- analysis_deps_limma(expr_data_frame, deps_group, deps_group_levels, log2_label = FALSE, adjust_method = 'none')
limma_results_df$ID <- apply(limma_results_df, 1, function(x){
  x = strsplit(x, '_')[[1]]
  paste(x[2], x[3], sep = '_')
})
visualization_deps_with_scatter(limma_results_df, minFC = 2, minPvalue = 0.05, main = 'Differentially expressed proteins  \n with limma',
                                show_text = FALSE, min_up_text = 70, min_down_text = 70)
- SAM
# (2) SAM
sam_results_list <- analysis_deps_sam(expr_data_frame, deps_group, log2_label = FALSE, nperms = 100, rand = NULL, minFDR = 0.05, samr_plot = TRUE)
sam_results <- rbind(sam_results_list$genes_up_df, sam_results_list$genes_down_df)
- ANOVA
# (3) annova
anova_result_df <- analysis_deps_anova(expr_data_frame, deps_group, log2_label = FALSE, return_padjust = TRUE, adjust_method = 'BH')
visualization_deps_with_scatter(anova_result_df, minFC = 2, minPvalue = 0.05, main = 'Differentially expressed proteins \n with anova',
                                show_text = FALSE, min_up_text = 15, min_down_text = 15)

Time course analysis

Fuzzy clustering was applied to time course analysis for discovering patterns associated with time points in PhosMap.The corresponding line chart combined with membership for each cluster was also drawn.

group_levels <- levels(group)
# fuzzy c-means clustering
set.seed(1000)
fuzzy_clustObj <- visualization_fuzzycluster(
    fuzzy_input_df, group, group_levels, 
    k_cluster=9, iteration = 100, 
    mfrow = c(3,3), min_mem = 0.1,
    plot = TRUE
)
# clusters information
clusterS_info <- fuzzy_clustObj$cluster
clusterS_names <- names(clusterS_info)
clusters_df <- data.frame(clusterS_names, clusterS_info)
# write.csv(clusters_df, 'clusters_df.csv', row.names = TRUE)

Kinase activity prediction to find activated/deactivated kinases

In PhosMap, three kinase activity prediction methods were included: KSEA, multiple linear regression (MLR) and Mean Value. - Data preparation

# For early and late response
# early -> clusterS_info==1
# late -> clusterS_info==2
cluster_flag <- 'early'
cluster_symbol <- clusterS_names[clusterS_info==1]
expr_data_frame <- data_frame_normalization_with_control_no_pair
index_of_cluster <- match(cluster_symbol, expr_data_frame$ID)
cluster_df <- expr_data_frame[index_of_cluster,]
# Perform KSEA
summary_df_list_from_ksea_cluster <- get_summary_from_ksea(cluster_df, species = 'human', log2_label = FALSE, ratio_cutoff = 3)
# Activity of regulons for regulation
ksea_regulons_activity_df_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_activity_df
ksea_id_cluster <- as.vector(ksea_regulons_activity_df_cluster[,1])
ksea_value_cluster <- ksea_regulons_activity_df_cluster[,-1]
if(FALSE){
  # Pvalue of regulons for regulation
  ksea_regulons_pvalue_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_pvalue_df
  # Activity of regulons for regulation
  ksea_regulons_activity_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_activity_df
  # Expression ratio of regulons for regulation
  ksea_kinase_site_substrate_original_ratio_cluster <- summary_df_list_from_ksea_cluster$ksea_kinase_site_substrate_original_ratio_df
}



# plot pheatmap
if(TRUE){
  # annotation setting
  annotation_col <- data.frame(
    group = group
  )
  rownames(annotation_col) <- colnames(ksea_value_cluster)

  # breaks and colors setting
  breaks_1 <- seq(-4, -2, 0.2)
  colors_1 <- colorRampPalette(c('#11264f', '#145b7d'))(length(breaks_1)-1)

  breaks_2 <- seq(-2, -1, 0.2)
  colors_2 <- colorRampPalette(c('#145b7d', '#009ad6'))(length(breaks_2))

  breaks_3 <- seq(-1, 1, 0.2)
  colors_3 <- colorRampPalette(c('#009ad6', 'white', '#FF6600'))(length(breaks_3))

  breaks_4 <- seq(1, 2, 0.2)
  colors_4 <- colorRampPalette(c('#FF6600', 'red'))(length(breaks_4))

  breaks_5 <- seq(2, 4, 0.2)
  colors_5 <- colorRampPalette(c('red', 'firebrick'))(length(breaks_5))

  breaks <- c(breaks_1, breaks_2, breaks_3, breaks_4, breaks_5)
  breaks <- breaks[which(!duplicated(breaks))]
  color <- c(colors_1, colors_2, colors_3, colors_4, colors_5)
  color <- color[which(!duplicated(color))]
  library(pheatmap)
  ph <- pheatmap(
    ksea_value_cluster, 
    scale = 'none', 
    annotation_col = annotation_col, 
    clustering_distance_rows = 'euclidean',
    fontsize_row = 5, 
    # cutree_rows = 1, 
    show_rownames = TRUE,
    fontsize_col = 5,
    # cutree_cols = 1, 
    cluster_cols = FALSE,
    border_color = 'black', 
    cellwidth = 5, cellheight = 5,
    breaks = breaks,
    color = color,
    legend_breaks = c(-4, -2, -1, 0, 1, 2, 4),
    legend_labels = c(-4, -2, -1, 0, 1, 2, 4),
    main = paste('Kinase-substrate enrichment analysis', cluster_flag, sep = ' ')
  )
}
# get kinase activity matrix with multiple linear regression (mlr) method
kinase_activity_df_mlr <- get_ka_by_mean_or_mlr(cluster_df, species = 'human', log2_label = TRUE, method = 'mlr')
# get kinase activity matrix with mean value method
kinase_activity_df_mean <- get_ka_by_mean_or_mlr(cluster_df, species = 'human', log2_label = TRUE, method = 'mean')

Motif enrichment analysis (MEA)

PhosMap allowed for performing MEA on user defined phosphopeptides lists. - Data preparation

# *** foreground ***
foreground_data <- phospho_data_filtering_STY_and_normalization # pre-processed data
foreground_sequence <- as.vector(foreground_data$Sequence)
GI <- as.vector(foreground_data$GI)
Sequence <- as.vector(foreground_data$Sequence)
AA_in_protein <- as.vector(foreground_data$AA_in_protein)

# *** required parameters ***
fixed_length <- 15
species <- 'human'
motifx_pvalue <- 0.01

# get foreground data frame
# foreground_df = get_aligned_seq_for_mea(ID, Sequence, AA_in_protein, fixed_length, species = 'human', fasta_type = 'refseq')

# get background data frame
# background_df = get_global_background_df(species = 'human', fasta_type = 'refseq')
# construct foreground and background
# To facilitate testing the module, select an appropriate number of items at random.
foreground <- as.vector(foreground_df$aligned_seq)
foreground <- foreground[sample(length(foreground), 1000)]
background <- as.vector(background_df$Aligned_Seq)
background <- background[sample(length(background), 10000)]


motifs_list <- mea_based_on_background(foreground, AA_in_protein, background, motifx_pvalue)

# Find sequences in foreground that are mapped to specific motif
foreground_sequences_mapped_to_motifs <- get_foreground_seq_to_motifs(motifs_list, foreground)


# Find data frame in foreground that are mapped to specific motif
foreground_df_mapped_to_motifs <- get_foreground_df_to_motifs(foreground_sequences_mapped_to_motifs, foreground, foreground_df)
# The data can be used for ploting logo of sepcific motif: foreground_sequences_mapped_to_motifs
# ploting logo: Q......S.......
require(ggseqlogo)
ggseqlogo(foreground_sequences_mapped_to_motifs[[1]])

if(TRUE){
  # batch plot and count peptides for each motif
  foreground_sequences_mapped_to_motifs_count <- length(foreground_sequences_mapped_to_motifs)
  motifs <- names(foreground_sequences_mapped_to_motifs)
  peptides_count <- NULL
  for(i in seq_len(foreground_sequences_mapped_to_motifs_count)){
    l_i <- foreground_sequences_mapped_to_motifs[[i]]
    peptides_count <- c(peptides_count, length(l_i))
  }
  motifs_peptides_count_df <- data.frame(motifs, peptides_count)
  # quantile(peptides_count, seq(0,1,0.05))
  if(FALSE){
    plot_seqlogo(BASE_DIR, foreground_sequences_mapped_to_motifs, plot_min_seqs = 25)
  }

}
# Select motifs at least having 50 peptides
# Assign quantitative values of peptides to their motif
foreground_value <- foreground_data[,-c(seq(1,6))]
min_seqs <- 50
index_of_motifs <- which(peptides_count>=min_seqs)
motif_group_m_ratio_df <- NULL
for(i in index_of_motifs){
  motif <- motifs[i]
  aligned_peptides <- foreground_sequences_mapped_to_motifs[[i]]
  index_of_match <- match(aligned_peptides, foreground_df$aligned_seq)
  motif_value <- foreground_value[index_of_match,]
  motif_value_colsum <- colSums(motif_value)
  motif_group_m <- tapply(motif_value_colsum, group, mean)
  motif_group_m_ratio <- motif_group_m/motif_group_m[1]
  motif_group_m_ratio_df <- rbind(motif_group_m_ratio_df, motif_group_m_ratio)
}
motifs_subset <- motifs[index_of_motifs]
peptides_count_subset <- peptides_count[index_of_motifs]
rownames(motif_group_m_ratio_df) <- paste(motifs_subset, peptides_count_subset)

# The matrix is import inot pheatmap
motif_group_m_ratio_df_mat <- as.matrix(motif_group_m_ratio_df)




# plot pheatmap
if(TRUE){
  # library(pheatmap)
  # breaks and colors setting
  breaks_1 <- seq(0, 0.5, 0.1)
  colors_1 <- colorRampPalette(c('green', 'blue'))(length(breaks_1)-1)

  breaks_3 <- seq(0.5, 1.5, 0.1)
  colors_3 <- colorRampPalette(c('blue', 'white', '#FFBFBF'))(length(breaks_3))

  breaks_4 <- seq(1.5, 2, 0.1)
  colors_4 <- colorRampPalette(c('#FFBFBF', 'red'))(length(breaks_4))

  breaks_5 <- seq(2, 4, 0.1)
  colors_5 <- colorRampPalette(c('red','firebrick'))(length(breaks_5))

  breaks <- c(breaks_1, breaks_3, breaks_4, breaks_5)
  breaks <- breaks[which(!duplicated(breaks))]
  colors <- c(colors_1, colors_3, colors_4, colors_5)
  colors <- colors[which(!duplicated(colors))]

  length(breaks)
  length(which(!duplicated(colors)))
  ph <- pheatmap(
    motif_group_m_ratio_df_mat, 
    scale = 'none', 
    # annotation_col = annotation_col, 
    clustering_distance_cols = 'euclidean',
    fontsize_row = 6, cutree_rows = 1, show_rownames = TRUE, cluster_rows = TRUE,
    fontsize_col = 6, cutree_cols = 1, show_colnames = TRUE, cluster_cols = FALSE,
    border_color = 'black', 
    # color = colors, 
    cellwidth = 12, cellheight = 12,
    breaks = breaks,
    color = colors,
    legend_breaks = c(0, 0.5, 1, 1.5, 2, 4),
    legend_labels = c(0, 0.5, 1, 1.5, 2, 4),
    main = 'Motif enrichment analysis'
  )
}
# formatting output
formatted_output_df <- formatted_output_mef_results(foreground_sequences_mapped_to_motifs)
# write file
# write.table(formatted_output_df, 'formatted_output_df.txt', row.names = FALSE, col.names = FALSE, sep = '\t')

Session Info {.unnumbered}

sessionInfo()


ecnuzdd/PhosMap documentation built on Dec. 7, 2022, 4:09 a.m.