doc/introduction_of_PhosMap.R

## ----load---------------------------------------------------------------------
# load PhosMap
library('PhosMap')
# load intermediate results from https://github.com/ecnuzdd/PhosMap_datasets
# manully download from https://github.com/ecnuzdd/PhosMap_datasets/blob/master/data/BRAFi.RData
load("BRAFi.RData")

# background_df A data frame for motif enrichment analysis as background
# combined_df_with_mapped_gene_symbol Get a data frame mapped GI number to Gene Symbol
# data_frame_normalization_with_control_no_pair A data frame containning phosphoproteomics data normalized by proteomics data.
# foreground_df A data frame for motif enrichment analysis as foreground.
# fuzzy_input_df A data frame for time course analysis as input.
# group A factor for experiment group information.
# merge_df_with_phospho_peptides A merged phosphoproteomics data frame based on peptides files (unique ID).
# motif_group_m_ratio_df_mat A matrix for motif profile.
# phospho_data_filtering_STY_and_normalization A phosphoproteomics data frame after normalization and filtering.
# profiling_data_normalized A proteomics data frame after normalization and filtering.
# summary_df_of_unique_proteins_with_sites A data frame that phosphorylation sites had been mapping to protein sequence and eliminated redundancy.


## ----extract_psites_score, eval = FALSE---------------------------------------
#  BASE_DIR <- getwd() # working directory
#  BASE_DIR <- normalizePath(BASE_DIR)
#  phosphorylation_exp_design_info_file_path <- normalizePath(file.path(BASE_DIR, 'phosphorylation_exp_design_info.txt'))
#  phosphorylation_peptide_dir <- normalizePath(file.path(BASE_DIR, 'phosphorylation_peptide_txt'))
#  
#  if(FALSE){
#    # if you have xml files from mascot results, you can run the cmd to parser them to text files.
#    mascot_xml_dir <- normalizePath(file.path(BASE_DIR, 'mascot_xml'))
#    mascot_txt_dir <- normalizePath(file.path(BASE_DIR, 'mascot_txt'))
#    extract_psites_score(phosphorylation_exp_design_info_file_path, mascot_xml_dir, mascot_txt_dir)
#  
#    # Based on above-mentioned text files from Mascot results,
#    # the following cmd can generate CSV files of phosphorylation sites with confidence score.
#    psites_score_dir <- normalizePath(file.path(BASE_DIR, 'psites_score_txt'))
#    generate_psites_score_file(mascot_txt_dir, phosphorylation_peptide_dir, psites_score_dir)
#  }
#  # Merge phosphoproteomics data based on peptides files (unique ID).
#  # If qc = TRUE, considering confidence score of phosphorylation sites.
#  # A merged phosphoproteomics data frame based on peptides files (unique ID).
#  merge_df_with_phospho_peptides <- pre_process_filter_psites(
#    phosphorylation_peptide_dir,
#    psites_score_dir,
#    phosphorylation_exp_design_info_file_path,
#    qc = TRUE,
#    min_score = 20,
#    min_FDR = 0.01
#  )

## -----------------------------------------------------------------------------
# Get a data frame mapped GI number to Gene Symbol.
# system.time({
#   combinated_df_with_mapped_gene_symbol = get_combined_data_frame(
#     merge_df_with_phospho_peptides, species = 'human', id_type = 'RefSeq_Protein_GI'
#   )
# })
head(combined_df_with_mapped_gene_symbol)

## -----------------------------------------------------------------------------
# Assign psites to protein sequence.
# Unique ID: protein_gi + phosphorylation site in protein sequence.
# system.time({
#   summary_df_of_unique_proteins_with_sites = get_summary_with_unique_sites(
#     combinated_df_with_mapped_gene_symbol, species = 'human', fasta_type = 'refseq'
#   )
# })
head(summary_df_of_unique_proteins_with_sites)

## ----eval = FALSE-------------------------------------------------------------
#  # Imputation with the next order of magnitude of the minimum except for zero.
#  # Filtering data only including phosphorylation site.
#  phospho_data_filtering_STY_and_normalization_list <- get_normalized_data_of_psites(
#    summary_df_of_unique_proteins_with_sites,
#    phosphorylation_exp_design_info_file_path,
#    topN = NA, mod_types = c('S', 'T', 'Y')
#  )
#  phospho_data_filtering_STY <- phospho_data_filtering_STY_and_normalization_list$ptypes_area_df_with_id
#  phospho_data_filtering_STY_and_normalization <- phospho_data_filtering_STY_and_normalization_list$ptypes_fot5_df_with_id
#  head(phospho_data_filtering_STY_and_normalization)

## ----eval = FALSE-------------------------------------------------------------
#  # Based on phospho_data_filtering_STY_and_normalization
#  ID <- paste(phospho_data_filtering_STY_and_normalization$GeneSymbol,
#             phospho_data_filtering_STY_and_normalization$AA_in_protein,
#             sep = '_')
#  Value <- phospho_data_filtering_STY_and_normalization[,-seq(1,6)]
#  phospho_data <- data.frame(ID, Value)
#  phospho_data_rownames <- paste(phospho_data_filtering_STY_and_normalization$GI,
#                                phospho_data_filtering_STY_and_normalization$GeneSymbol,
#                                phospho_data_filtering_STY_and_normalization$AA_in_protein,
#                                sep = '_')
#  rownames(phospho_data) <- phospho_data_rownames
#  # Further normalize phosphoproteomics data based on proteomics data
#  # The configurations of function see help document.
#  data_frame_normalization_with_control_no_pair <- normalize_phos_data_to_profiling(
#    phospho_data, profiling_data_normalized,
#    phosphorylation_exp_design_info_file_path,
#    profiling_exp_design_info_file_path,
#    control_label = '0',
#    pair_flag = FALSE
#  )
#  head(data_frame_normalization_with_control_no_pair)

## ----visualization_with_simple_tsne, fig.cap = "clustering example: t-SNE", fig.wide = TRUE----
# Clustering: t-SNE or PCA
expr_data_frame <- data_frame_normalization_with_control_no_pair
# t-SNE using all experiments
visualization_with_simple_tsne(expr_data_frame, group)

## ----visualization_with_simple_pca, fig.cap = "clustering example: PCA", fig.wide = TRUE----
expr_ID <- as.vector(expr_data_frame[,1])
expr_Valule <- expr_data_frame[,-1]
expr_Valule_mean <- NULL
expr_Valule_row <- nrow(expr_Valule)
for(i in 1:expr_Valule_row){
  x <- as.vector(unlist(expr_Valule[i,]))
  x_m <- tapply(x, group, mean)
  expr_Valule_mean <- rbind(expr_Valule_mean, x_m)
}
group_levels = levels(group)
colnames(expr_Valule_mean) <- group_levels
expr_df <- data.frame(expr_ID, expr_Valule_mean)
# PCA using mean value in group for comparison with original literature
visualization_with_simple_pca(expr_df)

## -----------------------------------------------------------------------------
# Differently expressed Proteins/Genes analysis
# t2 vs t0
expr_data_frame <- data_frame_normalization_with_control_no_pair[,1:17] # phosphoproteomics data normalized by proteomics data
# select phosphorylation sites with greater variation
expr_data_frame_var <- apply(expr_data_frame, 1, function(x){
  var(x[-1])
})
index_of_kept <- which(expr_data_frame_var>1)
expr_data_frame <- expr_data_frame[index_of_kept,]

# group information (t0 vs t2)
deps_group_levels <- c('t0', 't2')
deps_group <- factor(as.vector(group)[1:16], levels = deps_group_levels)

## ----analysis_deps_limma, fig.cap = "Differential expression analysis: limma", fig.wide = TRUE----
# (1) limma
limma_results_df <- analysis_deps_limma(expr_data_frame, deps_group, deps_group_levels, log2_label = FALSE, adjust_method = 'none')
limma_results_df$ID <- apply(limma_results_df, 1, function(x){
  x = strsplit(x, '_')[[1]]
  paste(x[2], x[3], sep = '_')
})
visualization_deps_with_scatter(limma_results_df, minFC = 2, minPvalue = 0.05, main = 'Differentially expressed proteins  \n with limma',
                                show_text = FALSE, min_up_text = 70, min_down_text = 70)

## ----eval=FALSE---------------------------------------------------------------
#  # (2) SAM
#  sam_results_list <- analysis_deps_sam(expr_data_frame, deps_group, log2_label = FALSE, nperms = 100, rand = NULL, minFDR = 0.05, samr_plot = TRUE)
#  sam_results <- rbind(sam_results_list$genes_up_df, sam_results_list$genes_down_df)

## ----eval=FALSE---------------------------------------------------------------
#  # (3) annova
#  anova_result_df <- analysis_deps_anova(expr_data_frame, deps_group, log2_label = FALSE, return_padjust = TRUE, adjust_method = 'BH')
#  visualization_deps_with_scatter(anova_result_df, minFC = 2, minPvalue = 0.05, main = 'Differentially expressed proteins \n with anova',
#                                  show_text = FALSE, min_up_text = 15, min_down_text = 15)

## ----visualization_fuzzycluster, fig.cap = "Time course analysis example", fig.width = 6, fig.height = 6, fig.align = 'center'----
group_levels <- levels(group)
# fuzzy c-means clustering
set.seed(1000)
fuzzy_clustObj <- visualization_fuzzycluster(
	fuzzy_input_df, group, group_levels, 
	k_cluster=9, iteration = 100, 
	mfrow = c(3,3), min_mem = 0.1,
	plot = TRUE
)
# clusters information
clusterS_info <- fuzzy_clustObj$cluster
clusterS_names <- names(clusterS_info)
clusters_df <- data.frame(clusterS_names, clusterS_info)
# write.csv(clusters_df, 'clusters_df.csv', row.names = TRUE)

## -----------------------------------------------------------------------------
# For early and late response
# early -> clusterS_info==1
# late -> clusterS_info==2
cluster_flag <- 'early'
cluster_symbol <- clusterS_names[clusterS_info==1]
expr_data_frame <- data_frame_normalization_with_control_no_pair
index_of_cluster <- match(cluster_symbol, expr_data_frame$ID)
cluster_df <- expr_data_frame[index_of_cluster,]

## ----fig.cap = "KSEA method", fig.width = 6, fig.height = 5, fig.align = 'center'----
# Perform KSEA
summary_df_list_from_ksea_cluster <- get_summary_from_ksea(cluster_df, species = 'human', log2_label = FALSE, ratio_cutoff = 3)
# Activity of regulons for regulation
ksea_regulons_activity_df_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_activity_df
ksea_id_cluster <- as.vector(ksea_regulons_activity_df_cluster[,1])
ksea_value_cluster <- ksea_regulons_activity_df_cluster[,-1]
if(FALSE){
  # Pvalue of regulons for regulation
  ksea_regulons_pvalue_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_pvalue_df
  # Activity of regulons for regulation
  ksea_regulons_activity_cluster <- summary_df_list_from_ksea_cluster$ksea_regulons_activity_df
  # Expression ratio of regulons for regulation
  ksea_kinase_site_substrate_original_ratio_cluster <- summary_df_list_from_ksea_cluster$ksea_kinase_site_substrate_original_ratio_df
}



# plot pheatmap
if(TRUE){
  # annotation setting
  annotation_col <- data.frame(
    group = group
  )
  rownames(annotation_col) <- colnames(ksea_value_cluster)
  
  # breaks and colors setting
  breaks_1 <- seq(-4, -2, 0.2)
  colors_1 <- colorRampPalette(c('#11264f', '#145b7d'))(length(breaks_1)-1)
  
  breaks_2 <- seq(-2, -1, 0.2)
  colors_2 <- colorRampPalette(c('#145b7d', '#009ad6'))(length(breaks_2))
  
  breaks_3 <- seq(-1, 1, 0.2)
  colors_3 <- colorRampPalette(c('#009ad6', 'white', '#FF6600'))(length(breaks_3))
  
  breaks_4 <- seq(1, 2, 0.2)
  colors_4 <- colorRampPalette(c('#FF6600', 'red'))(length(breaks_4))
  
  breaks_5 <- seq(2, 4, 0.2)
  colors_5 <- colorRampPalette(c('red', 'firebrick'))(length(breaks_5))
  
  breaks <- c(breaks_1, breaks_2, breaks_3, breaks_4, breaks_5)
  breaks <- breaks[which(!duplicated(breaks))]
  color <- c(colors_1, colors_2, colors_3, colors_4, colors_5)
  color <- color[which(!duplicated(color))]
  library(pheatmap)
  ph <- pheatmap(
    ksea_value_cluster, 
    scale = 'none', 
    annotation_col = annotation_col, 
    clustering_distance_rows = 'euclidean',
    fontsize_row = 5, 
    # cutree_rows = 1, 
    show_rownames = TRUE,
    fontsize_col = 5,
    # cutree_cols = 1, 
    cluster_cols = FALSE,
    border_color = 'black', 
    cellwidth = 5, cellheight = 5,
    breaks = breaks,
    color = color,
    legend_breaks = c(-4, -2, -1, 0, 1, 2, 4),
    legend_labels = c(-4, -2, -1, 0, 1, 2, 4),
    main = paste('Kinase-substrate enrichment analysis', cluster_flag, sep = ' ')
  )
}


## ----eval = FALSE-------------------------------------------------------------
#  # get kinase activity matrix with multiple linear regression (mlr) method
#  kinase_activity_df_mlr <- get_ka_by_mean_or_mlr(cluster_df, species = 'human', log2_label = TRUE, method = 'mlr')

## ----eval = FALSE-------------------------------------------------------------
#  # get kinase activity matrix with mean value method
#  kinase_activity_df_mean <- get_ka_by_mean_or_mlr(cluster_df, species = 'human', log2_label = TRUE, method = 'mean')

## ----get_aligned_seq_for_mea--------------------------------------------------
# *** foreground ***
foreground_data <- phospho_data_filtering_STY_and_normalization # pre-processed data
foreground_sequence <- as.vector(foreground_data$Sequence)
GI <- as.vector(foreground_data$GI)
Sequence <- as.vector(foreground_data$Sequence)
AA_in_protein <- as.vector(foreground_data$AA_in_protein)

# *** required parameters ***
fixed_length <- 15
species <- 'human'
motifx_pvalue <- 0.01

# get foreground data frame
# foreground_df = get_aligned_seq_for_mea(ID, Sequence, AA_in_protein, fixed_length, species = 'human', fasta_type = 'refseq')

# get background data frame
# background_df = get_global_background_df(species = 'human', fasta_type = 'refseq')

## ----mea_based_on_background--------------------------------------------------
# construct foreground and background
# To facilitate testing the module, select an appropriate number of items at random.
foreground <- as.vector(foreground_df$aligned_seq)
foreground <- foreground[sample(length(foreground), 1000)]
background <- as.vector(background_df$Aligned_Seq)
background <- background[sample(length(background), 10000)]


motifs_list <- mea_based_on_background(foreground, AA_in_protein, background, motifx_pvalue)

# Find sequences in foreground that are mapped to specific motif
foreground_sequences_mapped_to_motifs <- get_foreground_seq_to_motifs(motifs_list, foreground)


# Find data frame in foreground that are mapped to specific motif
foreground_df_mapped_to_motifs <- get_foreground_df_to_motifs(foreground_sequences_mapped_to_motifs, foreground, foreground_df)

## ----ggseqlogo, fig.cap = "Plot motif logo",  fig.width = 6, fig.height = 6, fig.align = 'center'----
# The data can be used for ploting logo of sepcific motif: foreground_sequences_mapped_to_motifs
# ploting logo: Q......S.......
require(ggseqlogo)
ggseqlogo(foreground_sequences_mapped_to_motifs[[1]])

if(TRUE){
  # batch plot and count peptides for each motif
  foreground_sequences_mapped_to_motifs_count <- length(foreground_sequences_mapped_to_motifs)
  motifs <- names(foreground_sequences_mapped_to_motifs)
  peptides_count <- NULL
  for(i in seq_len(foreground_sequences_mapped_to_motifs_count)){
    l_i <- foreground_sequences_mapped_to_motifs[[i]]
    peptides_count <- c(peptides_count, length(l_i))
  }
  motifs_peptides_count_df <- data.frame(motifs, peptides_count)
  # quantile(peptides_count, seq(0,1,0.05))
  if(FALSE){
    (BASE_DIR, foreground_sequences_mapped_to_motifs, plot_min_seqs = 25)
  }
  
}

## ----Assign quantitative values of peptides to their motif, fig.cap = "KSEA method", fig.width = 6, fig.height = 5, fig.align = 'center'----
# Select motifs at least having 50 peptides
# Assign quantitative values of peptides to their motif
foreground_value <- foreground_data[,-c(seq(1,6))]
min_seqs <- 50
index_of_motifs <- which(peptides_count>=min_seqs)
motif_group_m_ratio_df <- NULL
for(i in index_of_motifs){
  motif <- motifs[i]
  aligned_peptides <- foreground_sequences_mapped_to_motifs[[i]]
  index_of_match <- match(aligned_peptides, foreground_df$aligned_seq)
  motif_value <- foreground_value[index_of_match,]
  motif_value_colsum <- colSums(motif_value)
  motif_group_m <- tapply(motif_value_colsum, group, mean)
  motif_group_m_ratio <- motif_group_m/motif_group_m[1]
  motif_group_m_ratio_df <- rbind(motif_group_m_ratio_df, motif_group_m_ratio)
}
motifs_subset <- motifs[index_of_motifs]
peptides_count_subset <- peptides_count[index_of_motifs]
rownames(motif_group_m_ratio_df) <- paste(motifs_subset, peptides_count_subset)

# The matrix is import inot pheatmap
motif_group_m_ratio_df_mat <- as.matrix(motif_group_m_ratio_df)




# plot pheatmap
if(TRUE){
  # library(pheatmap)
  # breaks and colors setting
  breaks_1 <- seq(0, 0.5, 0.1)
  colors_1 <- colorRampPalette(c('green', 'blue'))(length(breaks_1)-1)
  
  breaks_3 <- seq(0.5, 1.5, 0.1)
  colors_3 <- colorRampPalette(c('blue', 'white', '#FFBFBF'))(length(breaks_3))
  
  breaks_4 <- seq(1.5, 2, 0.1)
  colors_4 <- colorRampPalette(c('#FFBFBF', 'red'))(length(breaks_4))
  
  breaks_5 <- seq(2, 4, 0.1)
  colors_5 <- colorRampPalette(c('red','firebrick'))(length(breaks_5))
  
  breaks <- c(breaks_1, breaks_3, breaks_4, breaks_5)
  breaks <- breaks[which(!duplicated(breaks))]
  colors <- c(colors_1, colors_3, colors_4, colors_5)
  colors <- colors[which(!duplicated(colors))]
  
  length(breaks)
  length(which(!duplicated(colors)))
  ph <- pheatmap(
    motif_group_m_ratio_df_mat, 
    scale = 'none', 
    # annotation_col = annotation_col, 
    clustering_distance_cols = 'euclidean',
    fontsize_row = 6, cutree_rows = 1, show_rownames = TRUE, cluster_rows = TRUE,
    fontsize_col = 6, cutree_cols = 1, show_colnames = TRUE, cluster_cols = FALSE,
    border_color = 'black', 
    # color = colors, 
    cellwidth = 12, cellheight = 12,
    breaks = breaks,
    color = colors,
    legend_breaks = c(0, 0.5, 1, 1.5, 2, 4),
    legend_labels = c(0, 0.5, 1, 1.5, 2, 4),
    main = 'Motif enrichment analysis'
  )
}

## ----formatted_output_foreground_sequences_mapped_to_motifs-------------------
# formatting output
formatted_output_df <- formatted_output_mef_results(foreground_sequences_mapped_to_motifs)
# write file
# write.table(formatted_output_df, 'formatted_output_df.txt', row.names = FALSE, col.names = FALSE, sep = '\t')

## ----session------------------------------------------------------------------
sessionInfo()
ecnuzdd/PhosMap documentation built on Dec. 7, 2022, 4:09 a.m.