#gets patthways for each disease
getPathways = function(gene_population, database)
{
pathway_table <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(pathway_table) <- c("Pathway", "Pathway ID", "Gene (Gene Symbol)", "Source" )
pathway_table = pathway_table %>%
dplyr::mutate(across(everything(), as.character))
for (gene in gene_population)
{
url = paste0("http://pcoskb.bicnirrh.res.in/result1.php?opt=6&qury=", gene)
gene_pathway_table = getTableSection(url, n_col = 4, tag = "td td td td", n_page = 0)
colnames(gene_pathway_table) = gene_pathway_table[1,]
gene_pathway_table = gene_pathway_table[-1,]
if(nrow(gene_pathway_table) == 0)
{
next
}
gene_pathway_table = gene_pathway_table %>% dplyr::filter(Source == database)
#add an aditional grep check
if(nrow(gene_pathway_table) > 0 )
{
gene_pathway_table = gene_pathway_table[checkIfCorrect(gene, gene_pathway_table),]
}
else
{
next
}
pathway_table = dplyr::union(pathway_table, gene_pathway_table)
}
return(pathway_table)
}
checkIfCorrect = function(gene, gene_pathway_table)
{
#check if all the strings in the gene pathway table contain the exact given genE
boolean_vec = sapply(gene_pathway_table$`Gene (Gene Symbol)`, function(x)
{
genes = countGenes(x)
if(length(which(gene == genes)) == 0)
{
return(FALSE)
}
else
{
return(TRUE)
}
})
return(unname(boolean_vec))
}
getComb = function(pathways, gene_population)
{
path_pcos = sapply(pathways$`Gene (Gene Symbol)`, function(x){
genes = strsplit(x, split = ", ")[[1]]
genes = unname(sapply(genes, str_replace_all, pattern = "[\r\n\\s]", replacement = ""))
genes = intersect(gene_population, genes)
return(paste(genes, collapse = ", "))
})
return(unname(path_pcos))
}
countGenes = function(gene_column)
{
genes = sapply(gene_column, strsplit, split = ", | ")
genes = unname(unlist(genes))
genes = stringr::str_replace_all(genes, "[\r\n\\s]" , "")
genes = unique(genes[which(genes != "")])
return(genes)
}
#takes the output of the pathway analysis function and returns a network where the nodes are the pathways
#function works regardless of database or dataset
viewNetwork = function(pathways)
{
if(missing(pathways))
stop("Argument 'disease_list' must be specified")
#check if pathways have the correct format
#create nodes
nodes = data.frame(id = pathways$`Pathway name`, #this is an id
label = pathways$`Pathway name`, #this labels the node with the disease name
value = rep(NA, nrow(pathways))) #this gives the size to the node
#this assigns a size to each node depending on the amount of genes the disease has
for (row in 1:nrow(nodes))
{
genes = countGenes(pathways$`Pathway genes associated with PCOS`[row])
nodes[row, "value"] = length(genes)
}
#create edges
#this creates all pairs of pathways and edge dataframe
edges = t(combn(pathways$`Pathway name`, 2))
colnames(edges) = c("from", "to")
#this manipulates the width of each edge
value = rep(NA, nrow(edges))
edges = cbind(edges, value)
for (row in 1:nrow(edges))
{
pathway_A = edges[row,1]
pathway_B = edges[row,2]
genes_pathway_A = pathways[which(pathways$`Pathway name` == pathway_A),3]
genes_pathway_A = countGenes(genes_pathway_A)
genes_pathway_B = pathways[which(pathways$`Pathway name` == pathway_B),3]
genes_pathway_B = countGenes(genes_pathway_B)
shared_genes = intersect(genes_pathway_A, genes_pathway_B)
edges[row, "value"] = length(shared_genes)
}
#feed nodes and edges to the function
edges <- as.data.frame(edges)
edges$value= as.numeric(as.character(edges$value))
valid = which(edges$value == 0)
if(length(valid) > 0)
{
edges = edges[-valid,]
}
print(visNetwork::visNetwork(nodes, edges, width = "100%") %>% visNetwork::visIgraphLayout())
}
getPathway_mirna = function(gene_pathway_table, pathway_id, pcos_associated_mirna, dataset)
{
gene_associated_w_pathways = NULL
if("Reactome" == dataset)
{
hsa_ncbi2reactome = ncbi2reactome %>% dplyr::filter(grepl(pattern = "R-HSA", x = V2))
filtered_hsa_ncbi2reactome = hsa_ncbi2reactome %>% dplyr::filter(V2 == pathwayId)
gene_associated_w_pathways = filtered_hsa_ncbi2reactome$V1
}
else if("KEGG" == dataset)
{
gene_associated_w_pathways = gene_pathway_table %>% dplyr::filter(pathway_id == `associated_pathway`)
gene_associated_w_pathways = gene_associated_w_pathways$entrez_id
}
else
{
}
#search for them in mirTar
miRNA_data = human_data %>% dplyr::filter(`Target Gene (Entrez Gene ID)` %in% gene_associated_w_pathways)
#search for the miRNA inside pcosk
miRNA_names = miRNA_data$miRNA
#convert miRNA to precursrs
mature_to_precursor = miRBaseConverter::miRNA_MatureToPrecursor(miRNA_names)
miRNA_names = mature_to_precursor$Precursor
result = pcos_associated_mirna[pcos_associated_mirna$mirbase_id %in% miRNA_names,1]
return(result)
}
#gets pathway using
createMirnaPathway = function(disease_target_genes, database, disease_mirnas)
{
#convert geneSymbol to mirBase id
mart <- biomaRt::useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
pcos_mirna_table =biomaRt::getBM(attributes = c("hgnc_symbol", "mirbase_id"), filters = "hgnc_symbol", values = unique(mirna_disease_associations$`Gene Symbol`), mart = mart)
pcos_mirna_table[16,2] = "hsa-mir-320a"
pthwys = NULL
if(database == "KEGG")
{
kegg_gene_pathway_table = NULL
for (id in disease_target_genes)
{
geneInfo = tryCatch( KEGGREST::keggGet(paste0("hsa:", id)), error = function(e) NULL)
if(!is.null(geneInfo))
{
entrez_id = geneInfo[[1]][["ENTRY"]]
gene_pathways = names(geneInfo[[1]][["PATHWAY"]])
gene_pathway_table = data.frame(entrez_id=rep(entrez_id, length(gene_pathways)),
associated_pathway = gene_pathways)
kegg_gene_pathway_table = rbind(kegg_gene_pathway_table, gene_pathway_table)
}
}
pthwys = getPathwayUsingId(kegg_gene_pathway_table$associated_pathway)
pathway_table = data.frame(`Pathway` = pthwys$Pathway,
`Pathway genes associated with PCOS` = pthwys$`Pathway ID`,
`Pathway genes associated with PCOS and the selected diseases` = rep(NA, length( pthwys$Pathway)),
`Hypergeometric probability` = rep(NA, length( pthwys$Pathway)))
#gwt pathway mirna
for (row in 1:nrow(pathway_table))
{
pathway_id = pathway_table[row,2]
pathway_table[row,2] = paste0(getPathway_mirna(gene_pathway_table = kegg_gene_pathway_table, pathway_id, pcos_associated_mirna = pcos_mirna_table, dataset = "KEGG"), collapse = ", ")
pathway_table[row,3] = intersect(countGenes(pathway_table[row,2]), disease_mirnas)
}
#this is the gene population, it's N in the formula
}
else if(database == "Reactome")
{
#filter human pathways
hsa_ncbi2reactome = ncbi2reactome %>% dplyr::filter(grepl(pattern = "R-HSA", x = V2))
#filter to target genes
filtered_hsa_ncbi2reactome = hsa_ncbi2reactome %>% dplyr::filter(V1 %in% disease_target_genes)
#filter to pathways uniquely in PCOSKB
pthwys = getPathwayUsingId(filtered_hsa_ncbi2reactome$V2)
for (row in 1:nrow(pthwys)) {
#find entrez gene associated with each pathway
pathwayId = pthwys[row,2]
pathway_genes = filtered_hsa_ncbi2reactome %>% dplyr::filter(V2 == pathwayId)
pathway_gene_ids = pathway_genes$V1
#find all miRNA associated with the pathway
pathway_mirna = human_data %>% dplyr::filter(`Target Gene (Entrez Gene ID)` %in% pathway_gene_ids)
#convert it to precursor
miRNAs = unique(pathway_mirna$miRNA)
#the extra empty string is added to the vector is for when there is a single miRNA,
#to avoid error
if(length(miRNAs) == 1)
{
miRNAs= append(miRNAs, "")
}
mature_to_precursor = miRBaseConverter::miRNA_MatureToPrecursor(miRNAs)
miRNA_names = mature_to_precursor$Precursor
result = pcos_mirna_table[pcos_mirna_table$mirbase_id %in% miRNA_names,1]
pthwys[row,2] = paste0(result[!is.na(result)], collapse = ", ")
}
return(pthwys)
}
else
{
}
#filters kegg pathways that are solely in pcoskbr
out_pathways = pthwys
#get miRNA for each pathway
gene_population = disease_mirnas
pcos_associated_mirna = mirna_disease_associations %>% dplyr::filter(Disease == "PCOS")
colnames(pathway_table) = c("Pathway name", "Pathway genes associated with PCOS", "Pathway genes associated with PCOS and the selected diseases", "Hypergeometric probability")
N = union(countGenes(pathway_table$`Gene (Gene Symbol)`), gene_disease_associations$geneSymbol) #population of genes in pcos and pathway dataset
n = gene_population #sample size = number of gnes used in input
for (i in 1:nrow(pathway_table))
{
K = unique(countGenes(pathway_table$`Pathway genes associated with PCOS`[i])) #number of pcos genes in ith pathway
k = intersect(K, gene_population) #number of inpiut genes in sample overlapping iwth genes in ith pathay
pathway_table[i,"Hypergeometric probability"] =dhyper(x = length(k), m = length(K), n = length(setdiff(N, K)), k = length(n))
}
pathway_table = pathway_table[,c(1,3,2,4)]
}
getPathwayUsingId = function(kegg_pathway_ids)
{
query = ""
for (pathway_id in kegg_pathway_ids) {
query = paste0(query, " ", paste0("[ID]{", pathway_id, "}"))
}
url = "http://pcoskb.bicnirrh.res.in/advsrh.php"
advcd_search = rvest::session(url)
html_node = rvest::read_html(advcd_search)
search_form = html_node %>% html_form()
search_form = search_form[[1]]
search = search_form %>% html_form_set(qry = query, make = "Associated Pathways", type = "ID")
resp <- html_form_submit(search)
tables = read_html(resp) %>% html_nodes("p+ table td") %>% html_text()
name = tables
n_col = 4
n_row = length(name)/n_col #calculates the number of rows in the table
table_section = NULL
#loop splits row of text into a vector
for (i in 1:n_row)
{
start_index = 1 + n_col*(i - 1) # calculates start index of the ith row in the name vector
end_index = start_index + n_col - 1 #calculates the end_index of the ith row in the name vector
name = trimws(name)
row = name[start_index:end_index] #slices the elements to make the ith row
table_section = rbind(table_section, row) #binds the (i-1)th row to the ith row
}
table_section = as.data.frame(table_section)
col_names = table_section[1,]
colnames(table_section) = col_names
table_section = table_section[-1,]
}
#' Pathway enrichment Analysis
#' @description
#' For selected diseases, this tool gives a list of enriched pathways.
#' @usage
#' \code{PathwayAnalysis(disease_list, dataset = c("miRNAs", "Genes"), database = c("KEGG", "Reactome"))}
#' @param disease_list Diseases for which comorbidity with PCOS will be analyzed. A possible list of diseases can be retrieved using the \code{listDiseases} function.
#' @param dataset Dataset. It can either be "miRNAs" or "Genes".
#' @param database Database from which pathways will be extracted. It can either be "KEGG" or "Reactome".
#'
#' @author Omar Hassoun
#'
#' @return \code{data.frame} of enriched pathways.
#' @export
#'
#' @examples
#' \code{PathwayAnalysis(disease_list = c("Tangier Disease", "Hypoalphalipoproteinemia", "Neuropathy"),
#' dataset = "Genes",
#' database = "KEGG")}
PathwayAnalysis = function(disease_list, dataset, database)
{
#check if dataset argument and disease list argument is correct
if(missing(disease_list))
stop("Argument 'disease_list' must be specified")
if(missing(dataset))
stop("Argument 'dataset' must be specified")
if(database != "KEGG" & database != "Reactome")
stop("Argument database can either be \"KEGG\" or \"Reactome\"")
if(!isInDataset(dataset = dataset, disease_list))
stop("Argument 'disease_list' contains one or more diseases not found in dataset")
if(dataset != "miRNAs" & dataset != "Genes")
stop("Argument 'dataset' must be specified")
#getting gene population
disease_table = getDiseaseTable(disease_list, dataset = dataset)
gene_population = countGenes(disease_table$`Gene Symbol`) #outputs gene symbols
#map miRNAs to genes using mirTar
disease_mirnas = NULL #miRNAs to be used as input in n variable
if(dataset == "miRNAs")
{
disease_mirnas = gene_population
#convert miRNAs to mirBase ids using biomaRt
mart <- biomaRt::useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
disease_mirna_table = biomaRt::getBM(attributes = c("hgnc_symbol", "mirbase_id"), filters = "hgnc_symbol", values = unique(disease_mirnas), mart = mart)
disease_mirnas_mirTar = disease_mirna_table$mirbase_id
disease_mirnas_mirTar = paste0(disease_mirnas_mirTar, "-?")
disease_data = human_data %>% dplyr::filter(grepl(paste(disease_mirnas_mirTar, collapse = "|"), x = miRNA, ignore.case = TRUE))
#get pathways
disease_target_genes = unique(disease_data$`Target Gene (Entrez Gene ID)`)
gene_population = disease_target_genes #outputs entrez gene ids
}
#step 2: Map genes to pathways depending on database
if(dataset == "Genes")
#outputs a data.frame of mapped pathways with their KEGG id, Source and Database
#pathwa-gene association table from pcoskbR
pathway = getPathways(gene_population, database = database)
else
{
#makes the whole analysis
pathway = createMirnaPathway(disease_target_genes = gene_population, database = "KEGG", disease_mirnas)
return(pathway)
}
#assign name for each column and calculate score
colnames(pathway) = c("Pathway name", "Pathway genes associated with PCOS", "Pathway genes associated with PCOS and the selected diseases", "Hypergeometric probability")
for (row in 1:nrow(pathway)) {
pathway[row,2] = paste0(countGenes(pathway[row,3]) , collapse = ", ") #assigns genes to pathways
pathway[row,3] = paste0(intersect(countGenes(pathway[row,2]), gene_population), collapse = ", ") #assigns pcos and selected diseaes
}
pathway = pathway[,c(1,3,2,4)]
#vakues u need to calculate p value
N = unique(gene_disease_associations$geneSymbol) #population of genes
n = gene_population #sample size = number of gnes used in input
#assign proper value to each column
for (i in 1:nrow(pathway))
{
K = unique(countGenes(pathway$`Pathway genes associated with PCOS`[i])) #number of pcos genes in ith pathway
k = intersect(K, gene_population) #number of inpiut genes in sample overlapping iwth genes in ith pathay
pathway[i,"Hypergeometric probability"] =dhyper(x = length(k), m = length(K), n = length(setdiff(N, K)), k = length(n))
}
return(pathway)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.