# additional_file/generate_pathway.R

##############
#This file should be run fourth.
#This file uses the 'rp.molecule.all.clean', 'kn.molecule.all.clean' and 'tf.molecule.all.clean' RData files generated by the 'molecule_classification_model.R' file.
#Store the 'generate_pathway.R' file in a separate directory.
#Create a folder named 'object' in the current directory and store the RData files there.
#This R file contains the code to generate human pathway data from all the clean rp, kn and tf pathway molecules.
#Also create a folder named "stringdb_human" in the current directory for downloading the stringdb files for the human species.
#First run the "get_ppi_for_molecules_new" function and then follow the code in the main part section.
#This file generates the pathway path data by using a helper function of the SPAGI package.
#Finally, the 'pathway.path.2' RData is meant to be saved in the 'result' subdirectory of the current directory to be used by the pathway_cleaning.R file (the save() call at the end of this file is commented out; uncomment it to write the file).
##############







##################################################################################################
#############Need a folder for downloading stringdb files for the species - stringdb_human
#############It takes some time to download the data, and then can reuse the downloaded data

#' @title get_ppi_for_molecules_new
#'
#' @description
#' This function gets the PPI data from STRINGdb for the protein molecules provided.
#'
#' @rdname get_ppi_for_molecules_new
#' @name get_ppi_for_molecules_new
#'
#' @details
#' This function gets the PPI data from STRINGdb for the protein molecules provided.
#'
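#' @return This function returns a list with three elements: \code{PPI} (a data frame of the filtered PPI with the columns \code{from}, \code{to} and \code{score}), \code{RPs} (the receptor proteins that start the filtered PPI) and \code{TFs} (the transcription factors that end the filtered PPI). It returns \code{NULL} for an unsupported species.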
#'
#' @param RP.protein A vector containing the receptor (RP) proteins.
#' @param KN.protein A vector containing the kinase (KN) proteins.
#' @param TF.protein A vector containing the transcription factor (TF) proteins.
#' @param species The species name. Only "hsapiens" is supported at this moment.
#' @param score An integer value for the STRINGdb PPI score threshold cutoff. Default is 700.
#'
#' @importFrom STRINGdb STRINGdb
#'
#' @export
#'
#' @examples
#' ## Need a folder at working directory for downloading stringdb files for the species - stringdb_human.
#' ## It takes some time to download the data, and then can reuse the downloaded data.
#' ## Here we will use RP.protein, KN.protein, TF.protein protein parameters. These data are automatically loaded with the SPAGI package. You can modify these parameters.
#' ## And we will use the species as "hsapiens" by default.
#'
#' ## Now get the filtered PPI for the molecules and the RP and TF proteins of the filtered PPI
#' hs.ppi.result<-get_ppi_for_molecules_new(RP.protein, KN.protein, TF.protein)
#' head(hs.ppi.result)
#'

get_ppi_for_molecules_new <- function(RP.protein, KN.protein, TF.protein, species = "hsapiens", score = 700) {
  # Builds the filtered protein-protein interaction (PPI) network used for
  # pathway generation.
  #
  # Steps:
  #   1. Map all RP/KN/TF gene names to STRING ids and fetch their interactions.
  #   2. Keep interactions whose combined_score is >= `score`.
  #   3. Treat every interaction as undirected: add the reverse of each edge and
  #      keep, for every ordered (from, to) pair, the highest scoring copy.
  #   4. Keep only the edges RP -> RP (two RP layers at most), RP -> KN,
  #      KN -> KN and KN -> TF.
  #
  # Returns a list with the elements "PPI" (data frame: from, to, score),
  # "RPs" (source receptors) and "TFs" (target transcription factors), or
  # NULL (after printing a message) when `species` is not "hsapiens".

  ## Only human is supported; keep the historical "print and return NULL"
  ## contract so that existing callers checking for NULL keep working.
  if (species != "hsapiens") {
    print("ERROR: Do not support other species at this moment.")
    return(NULL)
  }

  ## get ppi interactions for molecules
  # initiate the connection, id 9606 for human; the files are cached in the
  # "stringdb_human" folder of the working directory
  string_db_human <- STRINGdb$new(version = "10", species = 9606, score_threshold = 0,
                                  input_directory = "stringdb_human")
  # combine all the protein molecules into a one-column data frame
  all.protein <- unique(c(RP.protein, KN.protein, TF.protein))
  all.protein.df <- data.frame("gene" = all.protein)
  # mapping gene names to string ids
  all.protein.mapped <- string_db_human$map(all.protein.df, "gene", takeFirst = TRUE,
                                            removeUnmappedRows = TRUE)
  # get interactions information
  all.protein.mapped.interactions <- string_db_human$get_interactions(all.protein.mapped$STRING_id)

  ## The columns are selected by name (not by position) because the number of
  ## evidence columns returned by STRINGdb is not fixed; only these three are
  ## used below.
  required.columns <- c("from", "to", "combined_score")
  missing.columns <- setdiff(required.columns, colnames(all.protein.mapped.interactions))
  if (length(missing.columns) > 0) {
    stop("STRINGdb interactions are missing column(s): ",
         paste(missing.columns, collapse = ", "), call. = FALSE)
  }

  ## from STRING_id to gene name conversion (the first column of the mapped
  ## data frame holds the gene names)
  gene.name <- all.protein.mapped[, 1]
  from.gene <- as.character(gene.name[match(all.protein.mapped.interactions$from,
                                            all.protein.mapped$STRING_id)])
  to.gene <- as.character(gene.name[match(all.protein.mapped.interactions$to,
                                          all.protein.mapped$STRING_id)])
  combined.score <- all.protein.mapped.interactions$combined_score

  ## get only the significant interactions, here by default combined score >= 700
  is.significant <- combined.score >= score
  sig.from <- from.gene[is.significant]
  sig.to <- to.gene[is.significant]
  sig.score <- combined.score[is.significant]

  ## To get all interactions without considering the directions:
  ## stack the original order on top of the reverse order ...
  both.from <- c(sig.from, sig.to)
  both.to <- c(sig.to, sig.from)
  both.score <- c(sig.score, sig.score)
  ## ... order from the highest to the lowest score and keep the first
  ## (i.e. highest scoring) copy of every ordered pair. All of this is
  ## vectorised, so it also works when no interaction passes the threshold.
  by.score <- order(both.score, decreasing = TRUE)
  pair.key <- paste(both.from[by.score], both.to[by.score], sep = "*")
  best <- by.score[!duplicated(pair.key)]

  all.factor.PPI.significant <- data.frame("from" = both.from[best],
                                           "to" = both.to[best],
                                           "score" = both.score[best])

  ##### To get only the significant links that exist from RP - KN - TF
  ##### For RP - RP, we have allowed a maximum of 2 layers according to our design.
  ##### If you need a different design you should change this section accordingly.
  # helper: edges that start in `sources` and end in `targets`
  links_between <- function(sources, targets) {
    all.factor.PPI.significant[(all.factor.PPI.significant$from %in% sources) &
                                 (all.factor.PPI.significant$to %in% targets), ]
  }

  ## get interactions from RP to KN
  RP.to.KN.significant.ppi <- links_between(RP.protein, KN.protein)
  ## get interactions from KN to KN - for all KNs
  KN.to.KN.significant.ppi <- links_between(KN.protein, KN.protein)
  ## get interactions from KN to TF - for all KNs
  KN.to.TF.significant.ppi <- links_between(KN.protein, TF.protein)

  ## RPs without a direct interaction with any KN may still reach a KN through
  ## one RP that is directly connected; this gives the second RP layer.
  RP.connected.with.KN <- unique(RP.to.KN.significant.ppi$from)
  RP.not.connected.with.KN <- setdiff(RP.protein, RP.connected.with.KN)
  RP.to.RP.significant.ppi <- links_between(RP.not.connected.with.KN, RP.connected.with.KN)

  ## And finally combine all the interactions from RP-RP-KN-...-KN-TF
  all.significant.filtered.ppi <- rbind(RP.to.RP.significant.ppi, RP.to.KN.significant.ppi,
                                        KN.to.KN.significant.ppi, KN.to.TF.significant.ppi)
  rownames(all.significant.filtered.ppi) <- NULL

  ## Now get the RP and TF of the interactions
  RPs <- unique(c(unique(as.vector(RP.to.RP.significant.ppi$from)),
                  unique(as.vector(RP.to.KN.significant.ppi$from))))
  TFs <- unique(as.vector(KN.to.TF.significant.ppi$to))

  ## Finally return the filtered PPI together with its RPs and TFs
  list("PPI" = all.significant.filtered.ppi, "RPs" = RPs, "TFs" = TFs)
}
###########################################################################################








###############################Main part##########################
###################Installation#######################
#### string database handling for PPI
#If 'BiocManager' is not already installed, first install 'BiocManager' to install the Bioconductor packages.
#if(!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")

#then install the STRINGdb package
#BiocManager::install("STRINGdb")
library(STRINGdb)
####


#install.packages('data.table')
#install.packages('igraph')

#install.packages('devtools')
#devtools::install_github('VCCRI/SPAGI')

library(data.table)
library(igraph)
library(spagi)
#######################################################


#####Load the pathway molecules
# Each load() reads one RData file from the 'object' folder of the working
# directory. The objects 'rp.molecule.all.clean', 'kn.molecule.all.clean' and
# 'tf.molecule.all.clean' used below are expected to come from these files.
load("object/rp.molecule.all.clean.RData")
load("object/kn.molecule.all.clean.RData")
load("object/tf.molecule.all.clean.RData")
#####


#####Now get the filtered PPI for the molecules and the RP and TF proteins of the filtered PPI
# Uses the defaults species = "hsapiens" and score = 700. The result is a list
# with the elements "PPI", "RPs" and "TFs"; summary() gives a quick overview of it.
hs.ppi.result<-get_ppi_for_molecules_new(rp.molecule.all.clean, kn.molecule.all.clean, tf.molecule.all.clean)
head(summary(hs.ppi.result))
#####


#####Generate the pathway path data using the hs.ppi.result and housekeeping.gene data sets
# NOTE(review): 'generate_pathway_path' and 'housekeeping.gene' are not defined
# in this file; presumably both are provided by the 'spagi' package loaded above - confirm.
pathway.path.2<-generate_pathway_path(ppi.result=hs.ppi.result, housekeeping.gene=housekeeping.gene)
head(summary(pathway.path.2))
# The save step is left commented out; uncomment it to write the result for the
# pathway_cleaning.R file. The 'result' folder must already exist.
#save(pathway.path.2, file = "result/pathway.path.2.RData")
#####
# humayun2017/SPAGI2 documentation built on Aug. 5, 2020, 12:06 a.m.