#' Calculate iPAS pathway scores for a given input signature
#'
#' This function takes in a query signature (ex: log fold-change values of gene
#' expression for a perturbation experiment) and calculates iPAS scores for
#' each KEGG pathway. Many query signatures may be submitted at once. The input
#' signature(s) must be in the form of a matrix where each column is a signature
#' and each row is a gene. Row names must be Entrez IDs. Names for each
#' signature may be given as column names.
#'
#' @param query a matrix with the input/query signatures to perform pathway analysis on.
#' The row names should be genes (Entrez IDs) and the column names should be
#' names for the samples/signatures.
#' @param similarity the type of similarity measure to use for iPAS. Choices are
#' "Pearson" for Pearson correlation, "cosine" for cosine similarity, and
#' "dot_product" for the dot product.
#' @param gene_type the type of names used for the genes. Currently only Entrez
#' IDs are available.
#' @param category the categories of KEGG pathways for which to calculate iPAS
#' scores. We categorize pathways as "Disease", "Other", "Signaling", or
#' "Cancer". By default all categories/pathways are included.
#' @param perm the number of permutations to perform (1000 by default)
#' @param testing T/F value or integer. If TRUE, only calculate iPAS scores for
#' the first 5 pathways. If an integer n, calculate iPAS scores for the first n
#' pathways. Used in testing.
#' @param return_individual_cl T/F value, whether to return similarity scores
#' for each individual cell line
#' @param return_null_dist T/F value, whether to return the null distribution
#' of iPAS scores for each pathway (permutation scores). TRUE is required for
#' the iPAS_density and iPAS_density_facet functions, which graph the density
#' of the null distribution. FALSE by default.
#' @param overlap_min the minimum number of overlapping genes required between
#' the input signature and a pathway signature to calculate similarity (default
#' is 10).
#' @param ncores number of cores to use for calculation (via parallel package),
#' default is 1.
#' @param seed a seed (integer value) to use for random number generation,
#' passed to set.seed. This can be used to make the analysis reproducible, since
#' it involved random permutation.
#' @param print_updates T/F value, whether to print updates while calculating,
#' FALSE by default.
#'
#' @return a data frame (tibble) with correlation and p-value results for the input signature
#' compared with each PAS signature
#' @export
iPAS_enrich = function(query,
similarity = c("Pearson", "cosine", "dot_product"),
gene_type = "entrez",
category = c("Disease", "Other", "Signaling", "Cancer"),
perm = 1000,
testing = F,
return_individual_cl = T,
return_null_dist = F,
overlap_min = 10,
ncores = 1,
seed = NULL,
print_updates = F,
updateProgress = NULL
) {
call = list(query = query, similarity = similarity, gene_type = gene_type,
category = category, perm = perm, testing = testing,
return_individual_cl = return_individual_cl,
return_null_dist = return_null_dist,
ncores = ncores,
seed = seed
)
# Check inputs ------------------------------------------------------------
similarity = match.arg(similarity)
gene_type = match.arg(gene_type)
#cell_line = match.arg(cell_line, several.ok = TRUE)
category = match.arg(category, several.ok = TRUE)
all_cl = c("A375", "A549", "HA1E", "HCC515",
"HEKTE", "HEPG2", "HT29", "MCF7",
"NPC", "PC3", "SW480", "VCAP")
# Load pathway metadata and PAS -------------------------------------------
#### Load pathway metadata
path_meta = iPAS::kegg_meta %>% dplyr::rename(pathway = Pathway)
ens_weights = iPAS::ens_weights_no_intercept
ens_pathways = colnames(ens_weights)
## Filter to only the specified categories
path_meta = path_meta %>% dplyr::filter(Category_New %in% category)
all_paths = path_meta$pathway
all_paths = intersect(all_paths, ens_pathways)
if(testing) {
if(is.logical(testing)) {
all_paths = all_paths[1:5]
}
if(is.numeric(testing)) {
all_paths = all_paths[1:testing]
}
}
#### Load ensemble signatures
pas_by_path = iPAS::iPAS_by_path
l1k_genes = pas_by_path[[1]] %>% rownames()
#pas_ls = iPAS::pas_ls
#ens_weights = readRDS("/opt/raid10/genomics/nick/Projects/Git/MTL/output/ensemble_beta_w2_0.rds")
#ens_weights = ens_weights[-1,]
##check1 = all.equal(rownames(ens_weights), names(pas_ls))
# Loop over each sample of input/query ------------------------------------
res = parallel::mclapply(1:dim(query)[2], function(i) {
if(print_updates) {
print(i)
}
tmp_query = query[,i]
tmp_query = tmp_query[!is.na(tmp_query)]
tmp_query = tmp_query[!is.infinite(tmp_query)]
### Create null distribution signatures
if(!is.null(seed)) set.seed(seed)
if(length(tmp_query) >= 978) {
query_perm_full = sapply(1:perm, function(i) {
sample(tmp_query, size = 978, replace = FALSE)
})
} else {
query_perm_full = sapply(1:perm, function(i) {
sample(tmp_query, size = 978, replace = TRUE)
})
}
# Loop over each each pathway ---------------------------------------------
#tictoc::tic()
tmp_res = lapply(1:length(all_paths), function(j){
x = all_paths[j]
if(print_updates) {
print(x)
}
# If we were passed a progress update function, call it
if (is.function(updateProgress)) {
n_samp = dim(query)[2]
n_path = length(all_paths)
total = n_samp*n_path
prog = (i - 1)*n_path + j
val = prog/total
text <- paste0("Sample ", i, " of ", n_samp," - ", "Pathway ", j, " of ", n_path)
updateProgress(value = val, detail = text)
}
pas_tmp = pas_by_path[[x]]
null_check = is.null(pas_tmp)
if(null_check) {
print("NULL check")
return(NULL)
}
### set rownames of permutations to L1000 genes
rownames(query_perm_full) = rownames(pas_tmp)
check_tmp = all.equal(colnames(pas_tmp), all_cl)
if(!check_tmp) stop("columns of 'pas_tmp' matrix out of order: iPAS_enrich function")
# Loop over each cell line ------------------------------------------------
tmp_cl = lapply(all_cl, function(cl) {
if(print_updates) {
#print(cl)
}
pas_ind = pas_tmp[,cl]
### match genes from tmp_query with those from pas_tmp and remove NAs
pas_ind = pas_ind[!is.na(pas_ind)]
pas_ind = pas_ind[!is.infinite(pas_ind)]
shared_genes = intersect(names(pas_ind), names(tmp_query))
overlap = length(shared_genes)
pas_ind = pas_ind[shared_genes]
query_ind = tmp_query[shared_genes]
query_perm = query_perm_full[shared_genes, , drop = F]
### send to pathway scoring function
wt = ens_weights[cl,x]
sim_tmp = iPAS_score_perm(query = query_ind, query_perm = query_perm, pas = pas_ind, similarity = similarity, perm = perm, pathway = x, cell_line = cl, ens_weight = wt, overlap_min = overlap_min)
return(sim_tmp)
}) %>% magrittr::set_names(all_cl)
# Calculate ensemble pathway scores ---------------------------------------
score_vec = sapply(tmp_cl, function(x) x$score)
ens_tmp = ens_weights[,x]
score = sum(score_vec*ens_tmp)
gene_scores_ind = sapply(tmp_cl, function(x) {
tmp = x$gene_scores
gene_vec = rep(0, 978) %>% magrittr::set_names(l1k_genes)
gene_vec[match(names(tmp), names(gene_vec))] = tmp
return(gene_vec)
})
gene_scores_ens_mat = gene_scores_ind %*% as.matrix(ens_tmp)
gene_scores_ens = gene_scores_ens_mat[,1]
check_tmp = all.equal(names(score_vec), names(ens_tmp))
if(!check_tmp) stop("cell line names not matching")
perm_scores_cl = sapply(tmp_cl, function(x) x$null_dist)
perm_scores = perm_scores_cl %*% as.matrix(ens_tmp, ncol = 1)
ecdf_perm = ecdf(perm_scores)
mean_perm = mean(perm_scores, na.rm = TRUE)
sd_perm = sd(perm_scores, na.rm = TRUE)
z = (score - mean_perm)/sd_perm
if(sign(score) > 0) {
direction = "Positive"
p_emp = (sum(perm_scores >= score) + 1)/perm ## add 1 to prevent p = 0
} else if (sign(score) < 0) {
direction = "Negative"
p_emp = (sum(perm_scores <= score) + 1)/perm ## add 1 to prevent p = 0
} else {
direction = "Zero"
p_emp = 1
}
overlap = lapply(tmp_cl, function(x) x$overlap)
out_list = list(
ensemble = list(score = score, z_score = z, p_emp = p_emp,
overlap = overlap, direction = direction, similarity = similarity,
gene_scores_ens = gene_scores_ens,
pas_ens = list(pas_tmp),
query_l1k = list(tmp_query),
pathway = x, cell_line = NA, null_ecdf = ecdf_perm,
null_dist = perm_scores, null_mean = mean_perm, null_sd = sd_perm
)
)
ens_and_cl = c(tmp_cl, out_list)
#### format list as a tibble and return
out_tbl_ens = tibble::tibble(pathway = x, cell_line = "ensemble",
score = round(score, 3),
z_score = round(z, 3), p_emp = p_emp, direction = direction,
ens_weight = NA, gene_scores_ens = list(gene_scores_ens),
score_type = similarity, overlap = list(overlap), null_dist = list(perm_scores)
)
ind_tbls = lapply(tmp_cl, function(x) x$tbl) %>% dplyr::bind_rows()
full_tbl = ind_tbls %>% dplyr::bind_rows(out_tbl_ens)
return( list(tbl = full_tbl) )
}) %>% magrittr::set_names(all_paths)
#tictoc::toc()
tbl = lapply(tmp_res, function(x) x$tbl) %>% dplyr::bind_rows()
if(!return_null_dist) tbl$null_dist = NULL
### merge table with pathway metadata
tbl2 = tbl %>%
dplyr::filter(cell_line == "ensemble") %>%
dplyr::left_join(path_meta, by = "pathway") %>%
dplyr::arrange( p_emp, desc(abs(z_score))) %>%
dplyr::relocate(pathway, pathway_desc, score, z_score, p_emp,
direction, cell_line, Category_Lv1,
Category_Lv2, Category_New, size, path_nodes_list,
overlap
)
if(return_individual_cl) {
pathway_ind = tbl %>%
dplyr::filter(cell_line != "ensemble") %>%
split(.$pathway)
} else {
pathway_ind = tibble::tibble()
}
res_ls = list(ensemble = tbl2, cl = pathway_ind)
return(res_ls)
}, mc.cores = ncores) %>% magrittr::set_names(colnames(query))
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.