additional_file/pathway_cleaning.R

############################
##This file should be run at the fifth.
##This file is used to generate the generally highly expressed receptor proteins 
##then to clean the pathways by removing the pathways that have no expression information of receptors either in human or mouse encode data sets.
##This utilizes the pathway.path.2 RData file as pathway data generated by the generated_pathway.R file.
##It utilizes both the human and mouse encode data sets with their descriptors.
##Here, a receptor is treated as generally highly expressed that has median expression >=4 for the human/mouse encode data set.
##Both for human and mouse encode data sets generally highly expressed receptors are generated separately. 
##Then, an unique set of generally highly expressed receptors is generated by taking union of the two sets of housekeeping genes. 
##Next, this generates a list of clean pathway names that have expression information of their receptors either in human or mouse encode data sets.
##Finally, it cleans the pthway.path.2 data by taking only the pathways that have names in clean pathway names.
############################





###########################################
# If 'BiocManager' is not already installed, first install 'BiocManager' to insall the bioconductor packages. Type the following in an R command window:
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
#
# BiocManager::install("biomaRt")
# Finally load the library with
library(biomaRt)
#############################################
#
#
#
####################
##!!When Ensembl move servers (rarely) or it is down (less rare) these variable may need to be changed
biomart.ID <- "ENSEMBL_MART_ENSEMBL"
host.ID <-  "www.ensembl.org"
##or,
#host.ID <- "asia.ensembl.org"
##or, for example to an archived version
#host.ID <- "http://sep2019.archive.ensembl.org"
####################
#######################################################################################





####################
# Returns a data frame with two columns containing all gene symbols and their corresponding gene IDs (eg. Ensembl gene IDs) for the given species
# Input parameter is the species string like 'hsapiens'
# gene mapping to external_gene_name with other gene id
# Here, geneID.forMapping is used for the existing gene ID (that has 'external_gene_name' mapping in the biomaRt database), such as "ensembl_gene_id" for ensembl gene id.
make_ENSEMBL_symbol_map_new <- function(species, geneSymbol.toMapping, geneID.forMapping){
  dataset.name <- paste(species, "_gene_ensembl", sep="")
  ensembl <- useMart(biomart = biomart.ID, dataset = dataset.name, host = host.ID)
  ENSEMBL.symbol.map <- getBM(attributes=c(geneSymbol.toMapping, geneID.forMapping), mart = ensembl)
  return(ENSEMBL.symbol.map)
}
####################





######################################
# This function converts gene IDs to gene symbols for cells/tissues gene expression profiles
# for example, used external_gene_name for geneSymbol.toMapping and ensembl_gene_id for geneID.forMapping
# the example gene symbols should be - rownames(query.data)<-c("CRYAA", "CRYAB", "CRYBB3", "PAX6", "SOX2", "PROX1", "SIX3", "CRIM1", "CRYBB2", "BMP7")
convert_geneID_to_geneSymbol <- function(user.cell.data, species="hsapiens", geneSymbol.toMapping="external_gene_name", geneID.forMapping = "ensembl_gene_id", collapse.method = "max"){
  ensembl.symbol.map<-make_ENSEMBL_symbol_map_new(species, geneSymbol.toMapping, geneID.forMapping)
  
  #for cell/tissue gene expression data sets
  cell.tissue.names<-colnames(user.cell.data)
  list.gene.names<-rownames(user.cell.data)
  ensembl.genes.exist.in.user.cell.data <- ensembl.symbol.map[ensembl.symbol.map[[2]] %in% list.gene.names,]
  
  #duplicate probes / symbol IDS - multiple ENS genes match to the same probe or symbol - probably mostly NA or ""
  duplicate.ensembl.gene.list <- duplicated(ensembl.genes.exist.in.user.cell.data[[2]])
  ensembl.genes.exist.in.user.cell.data.without.duplication <- ensembl.genes.exist.in.user.cell.data[!duplicate.ensembl.gene.list,]
  
  idlist <- split(ensembl.genes.exist.in.user.cell.data.without.duplication[[2]], ensembl.genes.exist.in.user.cell.data.without.duplication[[1]])
  
  if(collapse.method == "mean"){
    #for mean
    collapsed <- lapply(idlist,function(X){
      return(colMeans(user.cell.data[unique(X),,drop=F]))
    })
  }
  else if(collapse.method == "max"){
    #for max
    collapsed <- lapply(idlist,function(X){
      return(user.cell.data[unique(X),,drop=F][which.max(rowMeans(user.cell.data[unique(X),,drop=F])),])
    })
  }
  else {
    print("ERROR: Other collapse methods not supported!!")
    return(NULL)
  }
  
  user.cell.data.clean <- do.call(rbind, collapsed)
  colnames(user.cell.data.clean)<-cell.tissue.names
  
  return(user.cell.data.clean)
}
#####################################







#############################
##This funtion computes mean for a data matrix.
##First checks for vector data and if it is then simply make a data matrix by copying the values.
compute_mean <- function(data.matrix){
  if((is.vector(data.matrix)==TRUE) || (ncol(data.matrix)<2))
    data.matrix<-cbind(val1=data.matrix,vall2=data.matrix)
  return(rowMeans(data.matrix))
}
#############################





#####################
##This function formats the list data to make a matrix formatted data
format_list_data<-function(list.data, experiment.descriptor){
  names(list.data)<-experiment.descriptor
  mat.data<-do.call(cbind,lapply(list.data,compute_mean))
  return(mat.data)
}
#####################






###############pre-processing the human compendium data
##load human encode data
load("objects/all_FPKMs_human_cleanID.RData")
load("objects/experiment.descriptors.humanEncode.RData") 

##Make the human compendium
human.compendium.data<-format_list_data(all_FPKMs_clean_ID, experiment.descriptors.humanEncode)
dim(human.compendium.data)
#[1] 57820   144
##

##taking log2 of the data
human.compendium.data.log<-log2(human.compendium.data+1)
################





###############pre-processing the mouse compendium data
##load mouse encode data
load("objects/all_FPKMs_Mouse_cleanID.RData")
load("objects/experiment.descriptors.mouseEncode.RData") 

##Make the mouse compendium
mouse.compendium.data<-format_list_data(all_FPKMs_clean_ID_Mouse, experiment.descriptors.mouseEncode)
dim(mouse.compendium.data)
#[1] 43346    94
##

##taking log2 of the data
mouse.compendium.data.log<-log2(mouse.compendium.data+1)
################





#############
##now, convert the ensembl ids to gene symbols
##note that convert_geneID_to_geneSymbol() function is used from the SPAGI2 package
dd.human<-convert_geneID_to_geneSymbol(human.compendium.data.log)
dd.mouse<-convert_geneID_to_geneSymbol(user.cell.data = mouse.compendium.data.log, species = "mmusculus")
##


##load the pathway.path.2 and then take the names of the pathways
load("objects/pathway.path.2.RData")
pathway.names<-names(pathway.path.2)
length(pathway.names)
#[1] 910
##
#############





##########For Human
##get pathway names expression from human data
human.pathway.name.exp<-dd.human[(rownames(dd.human) %in% pathway.names),]
pathway.name.not.in.human<-setdiff(pathway.names,rownames(human.pathway.name.exp))
pathway.name.not.in.human
#[1] "ORAI1"   "NGFRAP1" "PVRL2"   "PVRL1"   "SSTR3"   "PVRL4"   "ADRA2B"  "ELTD1"   "GPR56"   "CD97"    "KIR3DS1"
#[12] "PVRL3"   "ADORA3"  "OR2AG1" 
##


##find the median for all pathway names
hs.rp.med.exp<-apply(human.pathway.name.exp,1,median)
barplot(sort(hs.rp.med.exp, decreasing = T),las=2)
hist(hs.rp.med.exp)
#filter the RPs that have expression >=4
hs.rp.med.exp.filter4<-hs.rp.med.exp[which(hs.rp.med.exp >= 4)]
##

##get the names of the RPs that have median exp >=4
hs.rp.med.exp.4.name<-names(hs.rp.med.exp.filter4)
length(hs.rp.med.exp.4.name)
#[1] 92
##
##########





##########For Mouse
##get pathway names expression from human data - here considering 1-1 hs and mm correspondance
dd.mouse.up<-dd.mouse
rownames(dd.mouse.up)<-toupper(rownames(dd.mouse.up))
mouse.pathway.name.exp<-dd.mouse.up[(rownames(dd.mouse.up) %in% pathway.names),]
pathway.name.not.in.mouse<-setdiff(pathway.names,rownames(mouse.pathway.name.exp))
pathway.name.not.in.mouse
#[1] "CD58"      "SIRPG"     "NGFRAP1"   "PVRL2"     "ICOSLG"    "PVRL1"     "NCR2"      "SIGLEC14"  "TAS2R16"  
#[10] "NLGN4Y"    "PVRL4"     "CD1D"      "HCAR3"     "SIGLEC5"   "SIRPB1"    "NPBWR2"    "HTR1E"     "GALR3"    
#[19] "ELTD1"     "NLGN4X"    "GPR56"     "CD97"      "DSG1"      "BTN2A1"    "KIR3DS1"   "TNFRSF10D" "GPR78"    
#[28] "GPR42"     "P2RY8"     "CD1C"      "IL4R"      "AGTR1"     "IL6R"      "TNFRSF10A" "LTB4R"     "CD244"    
#[37] "PVRL3"     "FCER2"     "CD8B"      "CD209"     "MLNR"      "MCHR2"     "HLA-DRB1"  "STX4"      "SIGLEC7"  
#[46] "TNFRSF6B"  "OR1E2"     "OR5AR1"    "OR10H2"    "OR10H3"    "OR2AG1"    "OR10H5"    "OR1E1"     "CLEC4M"   
#[55] "OR10H4"    "OR11H4"    "OR56A1"    "OR6T1"     "OR1D2"     "OR56A4"    "OR1G1"     "OR52A1"    "OR10J5"   
#[64] "OR13F1"    "OR10H1"    "OR2L13"    "OR3A2"     "OR4E2"     "OR10J1"    
##


##find the median for all pathway names
mm.rp.med.exp<-apply(mouse.pathway.name.exp,1,median)
barplot(sort(mm.rp.med.exp, decreasing = T),las=2)
hist(mm.rp.med.exp)
#filter out the RPs that have expression >=4
mm.rp.med.exp.filter4<-mm.rp.med.exp[which(mm.rp.med.exp >= 4)]
##


##get the names of the RPs that have median exp >=4
mm.rp.med.exp.4.name<-names(mm.rp.med.exp.filter4)
length(mm.rp.med.exp.4.name)
#[1] 79
##############



#######Now get the RPs that have median expression >= 4 in any one species
##
rp.median.exp.4<-unique(c(hs.rp.med.exp.4.name, mm.rp.med.exp.4.name))
length(rp.median.exp.4)
#[1] 118
#save(rp.median.exp.4, file = "result/rp.median.exp.4.RData")
#this receptor proteins will be used as generally highly expressed in many cells/tissues
#this 'rp.median.exp.4' will be used as background generally highly expressed receptors data for SPAGI2
##


##Now get the RPs that have median expression < 4 in any one species
#First remove the RPs that have no expression information in any one species
pathway.names.exp.not.exist<-unique(c(pathway.name.not.in.human, pathway.name.not.in.mouse))
length(pathway.names.exp.not.exist)
#[1] 73
pathway.names.clean<-setdiff(pathway.names, pathway.names.exp.not.exist)
length(pathway.names.clean)
#[1] 837
length(setdiff(pathway.names.clean, rp.median.exp.4))
#[1] 719, (837-118=719)
##
#######





##################################################
#####now, clean the pathways (pathway.path.2) by taking only the pathways that have pathway names in the 'pathway.names.clean' list

##take only the pathways which names are exist in the "pathway.names.clean" list
pathway.path.new<-list()
for(i in 1:length(pathway.path.2)){
  if((names(pathway.path.2)[i]) %in% pathway.names.clean){
    pathway.path.new[[names(pathway.path.2)[i]]]<-pathway.path.2[[i]]
  }
}
##


##
length(pathway.path.new)
#[1] 837
##

##save pathway path new file
#save(pathway.path.new, file = "result/pathway.path.new.RData")
#this 'pathway.path.new' RData file will be used as background pathway path data for SPAGI2.
##
####################################################
humayun2017/SPAGI2 documentation built on Aug. 5, 2020, 12:06 a.m.