############################
##This file should be run at the fifth.
##This file is used to generate the generally highly expressed receptor proteins
##then to clean the pathways by removing the pathways that have no expression information of receptors either in human or mouse encode data sets.
##This utilizes the pathway.path.2 RData file as pathway data generated by the generated_pathway.R file.
##It utilizes both the human and mouse encode data sets with their descriptors.
##Here, a receptor is treated as generally highly expressed that has median expression >=4 for the human/mouse encode data set.
##Both for human and mouse encode data sets generally highly expressed receptors are generated separately.
##Then, an unique set of generally highly expressed receptors is generated by taking union of the two sets of housekeeping genes.
##Next, this generates a list of clean pathway names that have expression information of their receptors either in human or mouse encode data sets.
##Finally, it cleans the pthway.path.2 data by taking only the pathways that have names in clean pathway names.
############################
###########################################
# If 'BiocManager' is not already installed, first install 'BiocManager' to insall the bioconductor packages. Type the following in an R command window:
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("biomaRt")
# Finally load the library with
library(biomaRt)
#############################################
#
#
#
####################
##!!When Ensembl move servers (rarely) or it is down (less rare) these variable may need to be changed
biomart.ID <- "ENSEMBL_MART_ENSEMBL"
host.ID <- "www.ensembl.org"
##or,
#host.ID <- "asia.ensembl.org"
##or, for example to an archived version
#host.ID <- "http://sep2019.archive.ensembl.org"
####################
#######################################################################################
####################
# Returns a data frame with two columns containing all gene symbols and their corresponding gene IDs (eg. Ensembl gene IDs) for the given species
# Input parameter is the species string like 'hsapiens'
# gene mapping to external_gene_name with other gene id
# Here, geneID.forMapping is used for the existing gene ID (that has 'external_gene_name' mapping in the biomaRt database), such as "ensembl_gene_id" for ensembl gene id.
make_ENSEMBL_symbol_map_new <- function(species, geneSymbol.toMapping, geneID.forMapping){
dataset.name <- paste(species, "_gene_ensembl", sep="")
ensembl <- useMart(biomart = biomart.ID, dataset = dataset.name, host = host.ID)
ENSEMBL.symbol.map <- getBM(attributes=c(geneSymbol.toMapping, geneID.forMapping), mart = ensembl)
return(ENSEMBL.symbol.map)
}
####################
######################################
# This function converts gene IDs to gene symbols for cells/tissues gene expression profiles
# for example, used external_gene_name for geneSymbol.toMapping and ensembl_gene_id for geneID.forMapping
# the example gene symbols should be - rownames(query.data)<-c("CRYAA", "CRYAB", "CRYBB3", "PAX6", "SOX2", "PROX1", "SIX3", "CRIM1", "CRYBB2", "BMP7")
convert_geneID_to_geneSymbol <- function(user.cell.data, species="hsapiens", geneSymbol.toMapping="external_gene_name", geneID.forMapping = "ensembl_gene_id", collapse.method = "max"){
ensembl.symbol.map<-make_ENSEMBL_symbol_map_new(species, geneSymbol.toMapping, geneID.forMapping)
#for cell/tissue gene expression data sets
cell.tissue.names<-colnames(user.cell.data)
list.gene.names<-rownames(user.cell.data)
ensembl.genes.exist.in.user.cell.data <- ensembl.symbol.map[ensembl.symbol.map[[2]] %in% list.gene.names,]
#duplicate probes / symbol IDS - multiple ENS genes match to the same probe or symbol - probably mostly NA or ""
duplicate.ensembl.gene.list <- duplicated(ensembl.genes.exist.in.user.cell.data[[2]])
ensembl.genes.exist.in.user.cell.data.without.duplication <- ensembl.genes.exist.in.user.cell.data[!duplicate.ensembl.gene.list,]
idlist <- split(ensembl.genes.exist.in.user.cell.data.without.duplication[[2]], ensembl.genes.exist.in.user.cell.data.without.duplication[[1]])
if(collapse.method == "mean"){
#for mean
collapsed <- lapply(idlist,function(X){
return(colMeans(user.cell.data[unique(X),,drop=F]))
})
}
else if(collapse.method == "max"){
#for max
collapsed <- lapply(idlist,function(X){
return(user.cell.data[unique(X),,drop=F][which.max(rowMeans(user.cell.data[unique(X),,drop=F])),])
})
}
else {
print("ERROR: Other collapse methods not supported!!")
return(NULL)
}
user.cell.data.clean <- do.call(rbind, collapsed)
colnames(user.cell.data.clean)<-cell.tissue.names
return(user.cell.data.clean)
}
#####################################
#############################
##This funtion computes mean for a data matrix.
##First checks for vector data and if it is then simply make a data matrix by copying the values.
compute_mean <- function(data.matrix){
if((is.vector(data.matrix)==TRUE) || (ncol(data.matrix)<2))
data.matrix<-cbind(val1=data.matrix,vall2=data.matrix)
return(rowMeans(data.matrix))
}
#############################
#####################
##This function formats the list data to make a matrix formatted data
format_list_data<-function(list.data, experiment.descriptor){
names(list.data)<-experiment.descriptor
mat.data<-do.call(cbind,lapply(list.data,compute_mean))
return(mat.data)
}
#####################
###############pre-processing the human compendium data
##load human encode data
load("objects/all_FPKMs_human_cleanID.RData")
load("objects/experiment.descriptors.humanEncode.RData")
##Make the human compendium
human.compendium.data<-format_list_data(all_FPKMs_clean_ID, experiment.descriptors.humanEncode)
dim(human.compendium.data)
#[1] 57820 144
##
##taking log2 of the data
human.compendium.data.log<-log2(human.compendium.data+1)
################
###############pre-processing the mouse compendium data
##load mouse encode data
load("objects/all_FPKMs_Mouse_cleanID.RData")
load("objects/experiment.descriptors.mouseEncode.RData")
##Make the mouse compendium
mouse.compendium.data<-format_list_data(all_FPKMs_clean_ID_Mouse, experiment.descriptors.mouseEncode)
dim(mouse.compendium.data)
#[1] 43346 94
##
##taking log2 of the data
mouse.compendium.data.log<-log2(mouse.compendium.data+1)
################
#############
##now, convert the ensembl ids to gene symbols
##note that convert_geneID_to_geneSymbol() function is used from the SPAGI2 package
dd.human<-convert_geneID_to_geneSymbol(human.compendium.data.log)
dd.mouse<-convert_geneID_to_geneSymbol(user.cell.data = mouse.compendium.data.log, species = "mmusculus")
##
##load the pathway.path.2 and then take the names of the pathways
load("objects/pathway.path.2.RData")
pathway.names<-names(pathway.path.2)
length(pathway.names)
#[1] 910
##
#############
##########For Human
##get pathway names expression from human data
human.pathway.name.exp<-dd.human[(rownames(dd.human) %in% pathway.names),]
pathway.name.not.in.human<-setdiff(pathway.names,rownames(human.pathway.name.exp))
pathway.name.not.in.human
#[1] "ORAI1" "NGFRAP1" "PVRL2" "PVRL1" "SSTR3" "PVRL4" "ADRA2B" "ELTD1" "GPR56" "CD97" "KIR3DS1"
#[12] "PVRL3" "ADORA3" "OR2AG1"
##
##find the median for all pathway names
hs.rp.med.exp<-apply(human.pathway.name.exp,1,median)
barplot(sort(hs.rp.med.exp, decreasing = T),las=2)
hist(hs.rp.med.exp)
#filter the RPs that have expression >=4
hs.rp.med.exp.filter4<-hs.rp.med.exp[which(hs.rp.med.exp >= 4)]
##
##get the names of the RPs that have median exp >=4
hs.rp.med.exp.4.name<-names(hs.rp.med.exp.filter4)
length(hs.rp.med.exp.4.name)
#[1] 92
##
##########
##########For Mouse
##get pathway names expression from human data - here considering 1-1 hs and mm correspondance
dd.mouse.up<-dd.mouse
rownames(dd.mouse.up)<-toupper(rownames(dd.mouse.up))
mouse.pathway.name.exp<-dd.mouse.up[(rownames(dd.mouse.up) %in% pathway.names),]
pathway.name.not.in.mouse<-setdiff(pathway.names,rownames(mouse.pathway.name.exp))
pathway.name.not.in.mouse
#[1] "CD58" "SIRPG" "NGFRAP1" "PVRL2" "ICOSLG" "PVRL1" "NCR2" "SIGLEC14" "TAS2R16"
#[10] "NLGN4Y" "PVRL4" "CD1D" "HCAR3" "SIGLEC5" "SIRPB1" "NPBWR2" "HTR1E" "GALR3"
#[19] "ELTD1" "NLGN4X" "GPR56" "CD97" "DSG1" "BTN2A1" "KIR3DS1" "TNFRSF10D" "GPR78"
#[28] "GPR42" "P2RY8" "CD1C" "IL4R" "AGTR1" "IL6R" "TNFRSF10A" "LTB4R" "CD244"
#[37] "PVRL3" "FCER2" "CD8B" "CD209" "MLNR" "MCHR2" "HLA-DRB1" "STX4" "SIGLEC7"
#[46] "TNFRSF6B" "OR1E2" "OR5AR1" "OR10H2" "OR10H3" "OR2AG1" "OR10H5" "OR1E1" "CLEC4M"
#[55] "OR10H4" "OR11H4" "OR56A1" "OR6T1" "OR1D2" "OR56A4" "OR1G1" "OR52A1" "OR10J5"
#[64] "OR13F1" "OR10H1" "OR2L13" "OR3A2" "OR4E2" "OR10J1"
##
##find the median for all pathway names
mm.rp.med.exp<-apply(mouse.pathway.name.exp,1,median)
barplot(sort(mm.rp.med.exp, decreasing = T),las=2)
hist(mm.rp.med.exp)
#filter out the RPs that have expression >=4
mm.rp.med.exp.filter4<-mm.rp.med.exp[which(mm.rp.med.exp >= 4)]
##
##get the names of the RPs that have median exp >=4
mm.rp.med.exp.4.name<-names(mm.rp.med.exp.filter4)
length(mm.rp.med.exp.4.name)
#[1] 79
##############
#######Now get the RPs that have median expression >= 4 in any one species
##
rp.median.exp.4<-unique(c(hs.rp.med.exp.4.name, mm.rp.med.exp.4.name))
length(rp.median.exp.4)
#[1] 118
#save(rp.median.exp.4, file = "result/rp.median.exp.4.RData")
#this receptor proteins will be used as generally highly expressed in many cells/tissues
#this 'rp.median.exp.4' will be used as background generally highly expressed receptors data for SPAGI2
##
##Now get the RPs that have median expression < 4 in any one species
#First remove the RPs that have no expression information in any one species
pathway.names.exp.not.exist<-unique(c(pathway.name.not.in.human, pathway.name.not.in.mouse))
length(pathway.names.exp.not.exist)
#[1] 73
pathway.names.clean<-setdiff(pathway.names, pathway.names.exp.not.exist)
length(pathway.names.clean)
#[1] 837
length(setdiff(pathway.names.clean, rp.median.exp.4))
#[1] 719, (837-118=719)
##
#######
##################################################
#####now, clean the pathways (pathway.path.2) by taking only the pathways that have pathway names in the 'pathway.names.clean' list
##take only the pathways which names are exist in the "pathway.names.clean" list
pathway.path.new<-list()
for(i in 1:length(pathway.path.2)){
if((names(pathway.path.2)[i]) %in% pathway.names.clean){
pathway.path.new[[names(pathway.path.2)[i]]]<-pathway.path.2[[i]]
}
}
##
##
length(pathway.path.new)
#[1] 837
##
##save pathway path new file
#save(pathway.path.new, file = "result/pathway.path.new.RData")
#this 'pathway.path.new' RData file will be used as background pathway path data for SPAGI2.
##
####################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.