##' Benchmark linear motif instance found using QSLIMFinder (SLIMFinder)
##' @rdname benchmarkMotifs
##' @name benchmarkMotifs
##' @author Vitalii Kleshchevnikov
##' @param occurence_file a path to a tsv (txt) file containing QSLIMFinder (SLIMFinder) occurence output
##' @param main_file a path to a tsv (txt) file containing QSLIMFinder (SLIMFinder) main output
##' @param domain_res_file path to RData containing objects generated by what_we_find_VS_ELM.Rmd script (specifically \code{domain_results_obj} object)
##' @param motif_setup path to RData containing objects generated by PPInetwork2SLIMFinder pipeline (specifically \code{motif_input_obj} object)
##' @param domain_results_obj character, name of the object containing domain enrichment results (class == XYZinteration_XZEmpiricalPval)
##' @param motif_input_obj character, name of the object of class InteractionSubsetFASTA_list containing: FASTA sequences for interacting proteins, molecular interaction data they correspond to. Each element of a list contains input for individual QSLIMFinder run.
##' @param motif_setup_obj2 alternative way to provide motif_input_obj (class InteractionSubsetFASTA_list) directly. This object should not require matching domain-protein pairs. It must have been already processed by \code{\link{domainProteinPairMatch}} Can be useful for repeating benchmarking.
##' @param occurence_filt QSLIMFinder (SLIMFinder) occurence output filtered by those that we could have found from motif_input_obj.
##' @param one_from_cloud use only one top motif from motif cloud
##' @param dbfile_main a path to a gff (txt) file containing ELM database motif occurrences (proteins in the main set)
##' @param dburl_main url where to get ELM database containing motif occurrences (proteins in the main set)
##' @param dbfile_query a path to a gff (txt) file containing ELM database motif occurrences (proteins in the query set)
##' @param dburl_query url where to get ELM database containing motif occurrences (proteins in the query set)
##' @param query_res_query_only return only GRanges for query proteins, passed to "GRangesINinteractionSubsetFASTA". Do not change the default value.
##' @param motif_types character vector of motif types
##' @param all_res_excl_query all results in the output is all occurences excluding the query proteins. If FALSE, all results include occurences in all proteins. Not implemented
##' @param merge_motif_variants If FALSE (default) merge motif occurences only if motifs are variants of the same motif (such as TRG_NLS).
##' @param seed when using random negative sets (\code{neg_set = "random"}): seed for RNG for sampling
##' @param N when using random negative sets (\code{neg_set = "random"}): number of samples
##' @param replace when using random negative sets (\code{neg_set = "random"}): sample starts of GRanges with replacement \link{randomGRanges}
##' @param within1sequence when using random negative sets (\code{neg_set = "random"}): resample GRanges within one sequence or across sequences \link{randomGRanges}. If seq 1 has two motifs of length 4 and 7 and \code{within1sequence = TRUE} two motifs of the same length 4 and 7 will be sampled from the same protein. If \code{within1sequence = FALSE} two motifs of the same length 4 and 7 will be sampled from any protein in the set used for benchmarking.
##' @param all_predictor_col "Sig"
##' @param query_predictor_col "Sig" or "p.value" or "domain_motif_pval"
##' @param normalise logical, normalise predictor value, just in case predictor doesn't span the full range between 0 ... 1
##' @param minoverlap integer, passed to \code{\link[GenomicRanges]{findOverlaps}}
##' @param minoverlap_redundant for removing motif classes that match the same occurence
##' @param merge_domain_data If TRUE, merge domain enrichment results to motif occurence
##' @param merge_by_occurence_mcols columns of mcols (metadata of GRanges) that contain IDs of [1] protein with motif, [2] proteins with domain, e.g. c("query", "interacts_with"),
##' @param merge_by_domain_res_cols columns of domain enrichment results that contain IDs of [1] protein with motif, [2] proteins with domain, [3] domain, [4] and [5] Taxid for proteins with motif and domain respectively, e.g. c("IDs_interactor_viral", "IDs_interactor_human", "IDs_domain_human", "Taxid_interactor_human","Taxid_interactor_viral"). If Taxid columns are not present - omit.
##' @param merge_by_non_query_domain_res_cols columns of domain enrichment results for non-query proteins that contain IDs of [1] protein with motif, [2] proteins with domain, [3] domain, [4] and [5] Taxid for proteins with motif and domain respectively, e.g. c("IDs_interactor_human_A", "IDs_interactor_human_B", "IDs_domain_human_B", "Taxid_interactor_human_A","Taxid_interactor_human_B"). If Taxid columns are not present - omit.
##' @param filter_by_domain_data criteria to filter domain data and restrict motif search datasets (for example, "p.value < 0.05" or "fdr_pval < 0.05 & domain_count_per_IDs_interactor_viral > 1")
##' @param select_predictor_per_range function (such as min) that select predictor value if multiple values (such as returned by multiple datasets or multiple domains integrated) describe the same range
##' @param non_query_domain_res_file path to RData file containing the result of domain enrichment analysis for non-query proteins
##' @param non_query_domain_results_obj character, name of the object containing domain enrichment results for non-query proteins (class == XYZinteration_XZEmpiricalPval), when provided will be used for filtering datasets.
##' @param non_query_domains_N the number of non-query proteins with predicted domains for each dataset. Used only when non_query_domain_results_obj is not NULL
##' @param non_query_set_only If TRUE sequence sets searched for motif are filtered to contain only proteins from non_query_domain_results_obj (interacting partners of a seed), if FALSE - both from non_query_domain_results_obj and domain_res_obj. Used only when non_query_domain_results_obj is not NULL.
##' @param query_domains_only If TRUE proteins whose sequences will be used for motif search must be predicted to bind the same domains in a seed protein as domains predicted for query protein. Used only when non_query_domain_results_obj is not NULL
##' @param min_non_query_domain_support Minimal number of non-query proteins with the same motif as the query that are predicted to bind the same domain. Used to filter domains and proteins that do not predict top domains. Used only when non_query_domain_results_obj is not NULL.
##' @param min_top_domain_support4motif_nq Similar to min_non_query_domain_support. Minimal number of non-query proteins with the same motif as the query which have the same top-1 domain predicted.
##' @param select_top_domain If TRUE, top domain is selected using a product of domain p-values for all proteins with the same motif (min p-value) found using the same dataset. Used only when non_query_domain_results_obj is not NULL.
##' @param ... other arguments passed to passed to \code{\link[GenomicRanges]{findOverlaps}}
##' @return object class \code{(benchmarkMotifsResult)} containing occurence (GRanges, all, query, just after filtering by motif setup), instances_all (GRanges, known instances in all proteins or all excluding the query proteins), instances_query (GRanges, known instances in query proteins), predictions_all (for ROCR), labels_all (for ROCR), predictions_query (for ROCR), labels_query (for ROCR), overlapping_GRanges_all (GRanges, known instances that we also found), overlapping_GRanges_query(GRanges, known instances that we also found), N_query_prot_with_known_instances, N_query_known_instances, N_all_prot_with_known_instances, N_all_known_instances
##' @import GenomicRanges
##' @import GenomeInfoDb
##' @import data.table
##' @export benchmarkMotifs
##' @seealso \code{\link{ELMdb2GRanges}}, \code{\link{findOverlapsBench}}
benchmarkMotifs = function(occurence_file = "../viral_project/qslimfinder.Full_IntAct3.FALSE/result/occurence.txt",
main_file = "../viral_project/qslimfinder.Full_IntAct3.FALSE/result/main_result.txt",
domain_res_file = "../viral_project/processed_data_files/domain_res_count_20171019.RData",
motif_setup = "../viral_project/processed_data_files/QSLIMFinder_instances_h2v_qslimfinder.Full_IntAct3.FALSE_clust201802.RData",
neg_set = c("all_instances", "all_proteins", "random")[1],
domain_results_obj = "res_count",
motif_input_obj = "forSLIMFinder_Ready",
motif_setup_obj2 = NULL,
occurence_filt = NULL,
one_from_cloud = T,
dbfile_main = "../viral_project/data_files/instances_all.gff",
dburl_main = "http://elm.eu.org/instances.gff?q=None&taxon=Homo%20sapiens&instance_logic=",
dbfile_query = "../viral_project/data_files/instances_query.gff",
dburl_query = "http://elm.eu.org/instances.gff?q=all&taxon=irus&instance_logic=",
query_res_query_only = T, motif_types = c("DOC", "MOD", "LIG", "DEG", "CLV", "TRG"),
all_res_excl_query = T, merge_motif_variants = F,
seed = 21, N = 100, replace = T, within1sequence = T,
query_predictor_col = "Sig", all_predictor_col = "Sig", normalise = T,
minoverlap = 2,
minoverlap_redundant = 5,
merge_domain_data = T,
merge_by_occurence_mcols = c("query", "interacts_with"),
merge_by_domain_res_cols = c("IDs_interactor_viral", "IDs_interactor_human", "IDs_domain_human", "Taxid_interactor_human","Taxid_interactor_viral"),
merge_by_non_query_domain_res_cols = c("IDs_interactor_human_A", "IDs_interactor_human_B", "IDs_domain_human_B", "Taxid_interactor_human_A","Taxid_interactor_human_B"),
filter_by_domain_data = "p.value < 0.05", motif_pval_cutoff = 1,
select_predictor_per_range = max,
non_query_domain_res_file = "../viral_project/processed_data_files/predict_domain_human_clust20180819.RData",
non_query_domain_results_obj = NULL, # res_count_all
non_query_domains_N = 0,
non_query_set_only = F,
query_domains_only = F,
min_non_query_domain_support = 0,
min_top_domain_support4motif_nq = 0,
select_top_domain = F,
...) {
### Load domain enrichment results, PPI data, and data used for QSLIMfinder
# domain enrichment results - query proteins
domain_res_env = R.utils::env(load(domain_res_file))
domain_res = domain_res_env[[domain_results_obj]]
rm(domain_res_env)
# domain enrichment results - non-query proteins
if(!is.null(non_query_domain_results_obj)){
non_query_domain_res = R.utils::env(load(non_query_domain_res_file))[[non_query_domain_results_obj]]
} else non_query_domain_res = NULL
# data used for QSLIMfinder
if(is.null(motif_setup_obj2)) {
if(file.exists(paste0(motif_setup, ".zip"))) {
unzip(zipfile = paste0(motif_setup, ".zip"),
junkpaths = T,
exdir = dirname(paste0(motif_setup, ".zip")))
forSLIMFinder_Ready = R.utils::env(load(motif_setup))[[motif_input_obj]]
} else {
forSLIMFinder_Ready = R.utils::env(load(motif_setup))[[motif_input_obj]]
}
} else {
forSLIMFinder_Ready = copy(motif_setup_obj2)
motif_setup_obj2 = T
}
# keep only SLIMFinder datasets where seed protein - query protein pair matches filtering criteria
domain_res = XYZ.p.adjust(domain_res, adj_by = "p.value")
if(!is.null(filter_by_domain_data)){
eval(parse(text = paste0("domain_res$data_with_pval = domain_res$data_with_pval[",filter_by_domain_data,"]")))
if(!is.null(non_query_domain_res)) {
non_query_domain_res = XYZ.p.adjust(non_query_domain_res, adj_by = "p.value")
eval(parse(text = paste0("non_query_domain_res$data_with_pval = non_query_domain_res$data_with_pval[",filter_by_domain_data,"]")))
}
if(!isTRUE(motif_setup_obj2)){ # calculate only when not provided
forSLIMFinder_Ready = domainProteinPairMatch(forSLIMFinder_Ready, domain_res,
non_query_domain_res = non_query_domain_res,
non_query_domains_N = non_query_domains_N,
non_query_set_only = non_query_set_only,
query_domains_only = query_domains_only, remove = T)
}
}
#instances_all = ELMdb2GRanges(dbfile = dbfile_main,
# dburl = dburl_main,
# tsvurl = gsub("gff", "tsv", dburl_main),
# tsvfile = gsub("gff", "tsv", dbfile_main))
instances_query = ELMdb2GRanges(dbfile = dbfile_query,
dburl = dburl_query,
tsvurl = gsub("gff", "tsv", dburl_query),
tsvfile = gsub("gff", "tsv", dbfile_query))
### Filter ELM instances by instance logic == "true positive"
#instances_all = instances_all[instances_all$InstanceLogic == "true positive"]
instances_query = instances_query[instances_query$InstanceLogic == "true positive"]
### Filter ELM instances located in proteins that we use to search for motifs and select motif types of interest
#searched_all = GRangesINinteractionSubsetFASTA(grange = instances_all, interactionSubsetFASTA = forSLIMFinder_Ready)
#message(paste0(sum(searched_all$granges_in_sequences_Searched), " occurences out of ", length(searched_all$granges_in_sequences_Searched), " total in ELM could have been discovered (all proteins, all types of motifs)"))
#instances_all = instances_all[searched_all$granges_in_sequences_Searched]
#instances_all = filterBYmotifType(instances_all, motif_types = motif_types)
#seqlevels(instances_all) <- seqlevelsInUse(instances_all)
#seqlengths(instances_all) = searched_all$seqlengths[names(seqlengths(instances_all))]
searched_query = GRangesINinteractionSubsetFASTA(grange = instances_query, interactionSubsetFASTA = forSLIMFinder_Ready, query_only = query_res_query_only)
message(paste0(sum(searched_query$granges_in_sequences_Searched), " occurences out of ", length(searched_query$granges_in_sequences_Searched), " total in ELM in query proteins could have been discovered (QSLIMFinder query proteins, all types of motifs)"))
instances_query = instances_query[searched_query$granges_in_sequences_Searched]
instances_query = filterBYmotifType(instances_query, motif_types = motif_types)
seqlevels(instances_query) <- seqlevelsInUse(instances_query)
seqlengths(instances_query) = searched_query$seqlengths[names(seqlengths(instances_query))]
### Remove redundancy in known instances
#instances_all = reduceOverlappingGRanges(instances_all, minoverlap = minoverlap_redundant, merge_motif_variants = merge_motif_variants)
instances_query = reduceOverlappingGRanges(instances_query, minoverlap = minoverlap_redundant, merge_motif_variants = merge_motif_variants)
if(is.null(occurence_filt)){
### Load QSLIMFinder results and ELM data
occurence = SLIMFinderOcc2GRanges(occurence_file = occurence_file,
main_file = main_file,
one_from_cloud = one_from_cloud)
### Filter discovered occurences by those that we could have found
found = GRangesINinteractionSubsetFASTA(grange = occurence, interactionSubsetFASTA = forSLIMFinder_Ready, query_only = F)
occurence = occurence[found$granges_in_sequences_Searched]
## filter by predicted motif type
seqlevels(occurence) <- seqlevelsInUse(occurence)
seqlengths(occurence) = found$seqlengths[names(seqlengths(occurence))]
occurence_filt = copy(occurence)
} else {
occurence = copy(occurence_filt)
}
# apply motif_pval_cutoff
occurence = occurence[mcols(occurence[,query_predictor_col])[,1] < motif_pval_cutoff]
# add an identifier for motifs
mcols(occurence)$unique_position = paste0(GenomicRanges::seqnames(occurence), "_",
GenomicRanges::start(occurence), "_",
GenomicRanges::end(occurence))
#igraph::plot.igraph(overlapping_instances_graph, vertex.size = 1, arrow.mode = "-", vertex.label.cex = 0.6)
if(merge_domain_data){
# standardise column names
data_with_pval = copy(domain_res$data_with_pval)
std_cols = c("protein_with_motif", "protein_with_domain", "domain", "Taxid_protein_with_motif","Taxid_protein_with_domain")
for (col_name in merge_by_domain_res_cols) {
setnames(data_with_pval, colnames(data_with_pval),
gsub(col_name,
std_cols[which(merge_by_domain_res_cols == col_name)],
colnames(data_with_pval)))
}
data_with_pval[, domain_set := "query_set"]
if(!is.null(non_query_domain_res)) {
data_with_pval_nq = copy(non_query_domain_res$data_with_pval)
# standardise column names
for (col_name_nq in merge_by_non_query_domain_res_cols) {
setnames(data_with_pval_nq, colnames(data_with_pval_nq),
gsub(col_name_nq,
std_cols[which(merge_by_non_query_domain_res_cols == col_name_nq)],
colnames(data_with_pval_nq)))
}
data_with_pval_nq[, domain_set := "non_query_set"]
data_with_pval = rbind(data_with_pval, data_with_pval_nq)
## => use select domains that are supported by many human proteins (```[domain_set == "non_query_set" & domain_support_md_pair > 2, .(domain_id, motif_id, dataset_id)]; filter by .(domain_id, motif_id, dataset_id)```)
## => and filter datasets when there is no strong support (```[domain_set == "non_query_set" & domain_support_md_pair > 2, dataset_id]; filter by dataset_id```)
# merge
occurence = merge2GRangesmcols(occurence, data_with_pval,
by.x = c("prot_names", "interacts_with"),
by.y = c("protein_with_motif", "protein_with_domain"))
# save order
mcols(occurence)$motif_id_order = 1:length(occurence)
# extract metadata
mcol = as.data.table(mcols(occurence))
# combine p-values using geometric mean
gm_mean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
mcol[, combined_p.value := gm_mean(p.value), by = .(query, Pattern, interacts_with, domain)]
# find top domain for each motif 1-prod(1-p.value)
mcol[, top_domain := p.value == min(p.value), by = .(prot_names)]
mcol[is.na(top_domain), top_domain := FALSE]
top_temp = mcol[, .(top_domain_support4motif_nq = uniqueN(prot_names)),
by=.(query, Pattern, interacts_with, domain, top_domain, domain_set)]
top_temp = top_temp[top_domain == T & domain_set == "non_query_set"]
top_temp = top_temp[, top_domain := NULL]
top_temp = top_temp[, domain_set := NULL]
mcol = merge(mcol, top_temp, by = c("query", "Pattern", "interacts_with", "domain"), all.x = T, all.y = T)
# for each motif in each dataset (viral-human pair, protein A - protein B pair) find how many non-query proteins from the same dataset each query-binding domain is predicted to bind
q_temp = mcol[, .(domain_support4motif = uniqueN(prot_names)),
by=.(query, Pattern, interacts_with, domain, domain_set)]
# add this data to mcols in 2 separate columns
nq = q_temp[domain_set == "non_query_set", .(query, Pattern, interacts_with, domain,
domain_support4motif_nq = domain_support4motif)]
q = q_temp[domain_set == "query_set", .(query, Pattern, interacts_with, domain,
domain_support4motif_q = domain_support4motif)]
q = merge(q, nq, by = c("query", "Pattern", "interacts_with", "domain"), all.x = T, all.y = T)
mcol = merge(mcol, q, by = c("query", "Pattern", "interacts_with", "domain"), all.x = T, all.y = T)
rm(nq, q, q_temp)
# find which domains are supported by both non_query_set and query_set
mcol[, domain_in_sets := uniqueN(domain_set),
by=.(query, Pattern, interacts_with, domain)]
# add mcol back to GRanges
setorder(mcol, motif_id_order)
mcols(occurence) = mcol
# select domains that are supported by many human proteins
non_query_domain_support = as.data.table(mcols(occurence))[, !is.na(domain_support4motif_nq) &
domain_support4motif_nq >= min_non_query_domain_support]
occurence = occurence[non_query_domain_support]
# select domains that are top-1 for many human proteins
top_non_query_domain_support = as.data.table(mcols(occurence))[, !is.na(top_domain_support4motif_nq) &
top_domain_support4motif_nq >= min_top_domain_support4motif_nq]
occurence = occurence[top_non_query_domain_support]
if(select_top_domain){
top_domain = as.data.table(mcols(occurence))[, p.value == min(p.value), by = unique_position]
occurence = occurence[top_domain]
}
if(query_domains_only) occurence = occurence[occurence$domain_in_sets == 2]
#mcol[query == "P06463" & Pattern == "L.R..E" & interacts_with == "P17980" & domain == "IPR003959"]
#mcol[query == "P06429" & Pattern == "LKE.V" & interacts_with == "Q9H9D4" & domain == "IPR013083"]
} else {
# merge
occurence = merge2GRangesmcols(occurence, data_with_pval,
by.x = c("prot_names", "interacts_with"),
by.y = c("protein_with_motif", "protein_with_domain"))
}
}
### Filter discovered occurences to keep query occurences only
found_in_query = GRangesINinteractionSubsetFASTA(grange = occurence, interactionSubsetFASTA = forSLIMFinder_Ready, query_only = T)
occurence_query = occurence[found_in_query$granges_in_sequences_Searched]
# keep only cases when we found a motif in a query protein when that protein was a query
occurence_query = occurence_query[seqnames(occurence_query) == occurence_query$query]
## filter by predicted motif type
seqlevels(occurence_query) <- seqlevelsInUse(occurence_query)
seqlengths(occurence_query) = found_in_query$seqlengths[names(seqlengths(occurence_query))]
#suppressWarnings({
#if(all_res_excl_query){
# instances_all = subsetByOverlaps(instances_all, instances_query, type = "equal", invert = T)
#} else instances_all = c(instances_all, subsetByOverlaps(instances_query, instances_all, type = "equal", invert = T))
#})
# normalise, just in case predictor doesn't span the full range 0 ... 1
if(normalise) {
occurence_query$Sig_not_normalised = occurence_query$Sig
occurence_query$Sig = occurence_query$Sig/max(occurence_query$Sig)
occurence$Sig_not_normalised = occurence$Sig
occurence$Sig = occurence$Sig/max(occurence$Sig)
}
######################### Negative set: ranges in random locations ######################### START
if(neg_set == "random") {
### Generate random negative sets
set.seed(seed)
#random_instances_all = randomGRanges(instances_all, N = N, replace = replace, within1sequence = within1sequence)
random_instances_query = randomGRanges(instances_query, N = N, replace = replace, within1sequence = within1sequence)
### Combine positive and negative datasets
#combined_instances_all = lapply(random_instances_all, function(random_instance, instances_all){
# c(instances_all, random_instance)
#},instances_all)
combined_instances_query = lapply(random_instances_query, function(random_instance, instances_query){
c(instances_query, random_instance)
},instances_query)
### Find overlaps with predicted motifs
# All
#suppressWarnings({
#predictions_all = lapply(combined_instances_all, function(inst){
# findOverlapsBench(occuring = occurence, benchmarking = inst, predictor_col = all_predictor_col,
# labels_col = "for_benchmarking", normalise = normalise,
# minoverlap = minoverlap,
# select_predictor_per_range = select_predictor_per_range, ...)$for_ROC$predictions
#})
#labels_all = lapply(combined_instances_all, function(inst){
# labels = findOverlapsBench(occuring = occurence, benchmarking = inst, predictor_col = all_predictor_col,
# labels_col = "for_benchmarking", normalise = normalise,
# minoverlap = minoverlap,
# select_predictor_per_range = select_predictor_per_range, ...)$for_ROC$labels
# labels[grepl("1",labels)] = 1
# labels
#})
#})
# Query
suppressWarnings({
predictions_query = lapply(combined_instances_query, function(inst){
findOverlapsBench(occuring = occurence_query, benchmarking = inst, predictor_col = query_predictor_col,
labels_col = "for_benchmarking", normalise = normalise,
minoverlap = minoverlap,
select_predictor_per_range = select_predictor_per_range, ...)$for_ROC$predictions
})
labels_query = lapply(combined_instances_query, function(inst){
labels = findOverlapsBench(occuring = occurence_query, benchmarking = inst, predictor_col = query_predictor_col,
labels_col = "for_benchmarking", normalise = normalise,
minoverlap = minoverlap,
select_predictor_per_range = select_predictor_per_range, ...)$for_ROC$labels
labels[grepl("1",labels)] = 1
labels
})
})
}
######################### Negative set: ranges in random locations ######################### END
######################### Negative set: protein in ELM, motif not in ELM ######################### START
if(neg_set == "all_instances"){
## query proteins
combined_instances_query = instances_query
occurence_query = occurence_query[seqnames(occurence_query) %in% seqnames(instances_query)]
TP_FN = findOverlapsBench(occuring = occurence_query,
benchmarking = instances_query,
predictor_col = query_predictor_col,
labels_col = "for_benchmarking",
normalise = normalise,
minoverlap = minoverlap,
select_predictor_per_range = select_predictor_per_range)
TP_FN$for_ROC$labels = 1
FP = subsetByOverlaps(occurence_query, TP_FN$overlapping_GRanges, type = "equal", invert = T)
FP$for_benchmarking = 0
res = data.frame(predictions = 1 - mcols(FP)[,query_predictor_col],
labels = mcols(FP)[,"for_benchmarking"],
stringsAsFactors = F)
# choose only one p-value for the same range: occuring_subset
res_list = split(res, paste0(seqnames(FP), start(FP), end(FP)))
res_list = lapply(res_list, function(res_over) {
res_over$predictions = select_predictor_per_range(res_over$predictions)
unique(res_over)
})
res = Reduce(rbind, res_list)
res = rbind(res, TP_FN$for_ROC)
predictions_query = res$predictions
labels_query = res$labels
## all proteins
#combined_instances_all = instances_all
#occurence = occurence[seqnames(occurence) %in% seqnames(instances_all)]
#TP_FN_all = findOverlapsBench(occuring = occurence,
# benchmarking = instances_all,
# predictor_col = all_predictor_col,
# labels_col = "for_benchmarking",
# normalise = normalise,
#minoverlap = minoverlap,
# select_predictor_per_range = select_predictor_per_range)
#TP_FN_all$for_ROC$labels = 1
#FP_all = subsetByOverlaps(occurence, TP_FN_all$overlapping_GRanges, type = "equal", invert = T)
#FP_all$for_benchmarking = 0
#res_all = data.frame(predictions = 1 - mcols(FP_all)[,all_predictor_col], labels = mcols(FP_all)[,"for_benchmarking"], stringsAsFactors = F)
#res_all = rbind(res_all, TP_FN_all$for_ROC)
#predictions_all = res_all$predictions
#labels_all = res_all$labels
}
######################### Negative set: protein in ELM, motif not in ELM ######################### END
######################### Negative set: protein not in ELM or protein in ELM but motif not in ELM ######################### START
if(neg_set == "all_proteins"){
## query proteins
combined_instances_query = instances_query
TP_FN = findOverlapsBench(occuring = occurence_query,
benchmarking = instances_query,
predictor_col = query_predictor_col,
labels_col = "for_benchmarking",
normalise = normalise,
minoverlap = minoverlap,
select_predictor_per_range = select_predictor_per_range)
TP_FN$for_ROC$labels = 1
FP = subsetByOverlaps(occurence_query, TP_FN$overlapping_GRanges, type = "equal", invert = T)
FP$for_benchmarking = 0
res = data.frame(predictions = 1 - mcols(FP)[,query_predictor_col], labels = mcols(FP)[,"for_benchmarking"], stringsAsFactors = F)
# choose only one p-value for the same range: occuring_subset
res_list = split(res, paste0(seqnames(FP), start(FP), end(FP)))
res_list = lapply(res_list, function(res_over) {
res_over$predictions = select_predictor_per_range(res_over$predictions)
unique(res_over)
})
res = Reduce(rbind, res_list)
res = rbind(res, TP_FN$for_ROC)
predictions_query = res$predictions
labels_query = res$labels
labels_query[grepl("1",labels_query)] = 1
## all proteins
#combined_instances_all = instances_all
#TP_FN_all = findOverlapsBench(occuring = occurence,
# benchmarking = instances_all,
# predictor_col = all_predictor_col,
# labels_col = "for_benchmarking",
# normalise = normalise,
# minoverlap = minoverlap,
# select_predictor_per_range = select_predictor_per_range)
#TP_FN_all$for_ROC$labels = 1
#FP_all = subsetByOverlaps(occurence, TP_FN_all$overlapping_GRanges, type = "equal", invert = T)
#FP_all$for_benchmarking = 0
#res_all = data.frame(predictions = 1 - mcols(FP_all)[,all_predictor_col], labels = mcols(FP_all)[,"for_benchmarking"], stringsAsFactors = F)
#res_all = rbind(res_all, TP_FN_all$for_ROC)
#predictions_all = res_all$predictions
#labels_all = res_all$labels
}
######################### Negative set: protein not in ELM or protein in ELM but motif not in ELM ######################### END
### Get overlapping instances
suppressWarnings({
if(neg_set == "random") {
# combined_instances_all_temp = combined_instances_all[[1]]
combined_instances_query_temp = combined_instances_query[[1]]
} else {
# combined_instances_all_temp = combined_instances_all
combined_instances_query_temp = combined_instances_query
}
#overlaps_all = findOverlapsBench(occuring = occurence,
# benchmarking = combined_instances_all_temp,
# predictor_col = all_predictor_col,
# labels_col = "for_benchmarking",
# normalise = normalise,
# minoverlap = minoverlap)$overlapping_GRanges
#overlapping_GRanges_all = overlaps_all$overlapping_GRanges[overlaps_all$overlapping_GRanges$for_benchmarking == 1,]
overlaps_query = findOverlapsBench(occuring = occurence_query,
benchmarking = combined_instances_query_temp,
predictor_col = query_predictor_col,
labels_col = "for_benchmarking",
normalise = normalise,
minoverlap = minoverlap)
overlapping_GRanges_query = overlaps_query$overlapping_GRanges[overlaps_query$overlapping_GRanges$for_benchmarking == 1,]
})
##### KNOWN #####
N_query_prot_with_known_instances = length(unique(as.character(seqnames(instances_query)))) #motif_protein_table_queryKNOWN[, uniqueN(UNIPROT)]#, by = .(ID)]
# total number of instances
N_query_known_instances = length(unique(instances_query))
# motif_protein_table_allKNOWN = as.data.table(cbind(mcols(instances_all)[, c("ID")], UNIPROT = seqnames(instances_all)))
# number of proteins with motif
# N_all_prot_with_known_instances = length(unique(as.character(seqnames(instances_all)))) #motif_protein_table_allKNOWN[, uniqueN(UNIPROT)]#, by = .(ID)]
# total number of instances
# N_all_known_instances = length(unique(instances_all))
##### FOUND #####
# number of proteins with motif
N_query_prot_with_known_instances_found = length(unique(as.character(seqnames(overlapping_GRanges_query)))) #motif_protein_table_query[, uniqueN(UNIPROT), by = .(ID, Pattern)]
# total number of discovered instances
N_query_known_instances_found = length(unique(overlapping_GRanges_query))
# number of known instances that match discovered instances
N_query_match_known_instances_found = N_query_known_instances - length(unique(overlaps_query$not_found))
# total found
N_query_total_instances_found = length(unique(paste0(seqnames(occurence_query),
start(occurence_query), end(occurence_query))))
# number of proteins with motif
#N_all_prot_with_known_instances_found = length(unique(as.character(seqnames(overlapping_GRanges_all)))) #motif_protein_table_all[, uniqueN(UNIPROT), by = .(ID, Pattern)]
# total number of discovered instances
#N_all_known_instances_found = length(unique(overlapping_GRanges_all))
# number of known instances that match discovered instances
#N_all_match_known_instances_found = N_all_known_instances - length(unique(overlaps_all$not_found))
# total found
#N_all_total_instances_found = length(unique(paste0(seqnames(occurence_query),
# start(occurence_query), end(occurence_query))))
out = list(
occurence = occurence, occurence_query = occurence_query, occurence_filt = occurence_filt,
#instances_all = instances_all,
instances_query = instances_query,
#predictions_all = predictions_all, labels_all = labels_all,
predictions_query = predictions_query, labels_query = labels_query,
#overlapping_GRanges_all = overlapping_GRanges_all,
overlapping_GRanges_query = overlapping_GRanges_query,
N_query_prot_with_known_instances = N_query_prot_with_known_instances, N_query_known_instances = N_query_known_instances,
#N_all_prot_with_known_instances = N_all_prot_with_known_instances, N_all_known_instances = N_all_known_instances,
N_query_prot_with_known_instances_found = N_query_prot_with_known_instances_found, N_query_known_instances_found = N_query_known_instances_found,
N_query_total_instances_found = N_query_total_instances_found,
#N_all_total_instances_found = N_all_total_instances_found,
#N_all_prot_with_known_instances_found = N_all_prot_with_known_instances_found, N_all_known_instances_found = N_all_known_instances_found,
N_query_match_known_instances_found = N_query_match_known_instances_found,
#N_all_match_known_instances_found = N_all_match_known_instances_found,
domain_res = domain_res,
motif_setup_obj2 = forSLIMFinder_Ready)
class(out) = "benchmarkMotifsResult"
out
}
##' Get motifs from the output of benchmarking linear motifs by id
##' @rdname benchmarkMotifs
##' @name queryOCCByMCOL
##' @author Vitalii Kleshchevnikov
##' @param res object class \code{(benchmarkMotifsResult)}, the output of benchmarkMotifs
##' @param keytype character, name of the column that contains key identifiers
##' @param key character, identifiers for which to retrieve the result
##' @return GenomicRanges containing motifs for a given key
##' @import GenomicRanges
##' @import data.table
##' @export queryOCCByMCOL
queryOCCByMCOL = function(res, keytype = "IDs_domain_human", key = "IPR032440"){
if(class(res) != "benchmarkMotifsResult") stop("res should be of class \"benchmarkMotifsResult\"")
res$occurence_query[which(mcols(res$occurence_query)[, keytype] == key)]
}
##' Benchmark linear motif instance found using QSLIMFinder (SLIMFinder)
##' @rdname benchmarkMotifs
##' @name mBenchmarkMotifs
##' @author Vitalii Kleshchevnikov
##' @param datasets character vector, names of the datasets ("Vidal" in "./SLIMFinder_Vidal/result/occurence.txt" or "" in "./SLIMFinder/result/occurence.txt")
##' @param descriptions character vector, description of the datasets (title of venn diagram plot)
##' @param dir character, base directory. For example, "./" in "./SLIMFinder_Vidal/result/occurence.txt"
##' @return list of objects of class \code{(benchmarkMotifsResult)}
##' @import GenomicRanges
##' @import data.table
##' @export mBenchmarkMotifs
mBenchmarkMotifs = function(datasets = c("qslimfinder.Full_IntAct3.FALSE"),
descriptions = c("human network (full IntAct) searched \nfor motifs present in viral proteins"),
dir = "./",
motif_setup_months = "201802",
...){
results = lapply(datasets, function(dataset) {
description = descriptions[datasets == dataset]
if(length(motif_setup_months) > 1){
motif_setup_month = motif_setup_months[datasets == dataset]
} else motif_setup_month = motif_setup_months
occurence_file = paste0(dir, dataset, "/result/occurence.txt")
main_file = paste0(dir, dataset, "/result/main_result.txt")
motif_setup = paste0(dir,"/QSLIMFinder_instances_h2v_", dataset, "_clust", motif_setup_month,".RData")
# if only viral protein were searched - all_res_excl_query =F
#if(grepl("all_viral_interaction", dataset)) all_res_excl_query = F
result = benchmarkMotifs(occurence_file = occurence_file, # generated by mBenchmarkMotifs, all else just passed
main_file = main_file, # generated by mBenchmarkMotifs
motif_setup = motif_setup, # generated by mBenchmarkMotifs
#all_res_excl_query = all_res_excl_query,
...)
result$description = description
result
})
resnames = character(length(datasets))
resnames[datasets != ""] = datasets[datasets != ""]
resnames[datasets == ""] = "full_Intact"
names(results) = resnames
results
}
##' Reduce GRanges keeping metadata (collapsing withing each column separated by pipes)
##' @rdname reduceWithMcols
##' @name reduceWithMcols
##' @author Vitalii Kleshchevnikov
##' @param range any Genomic Ranges object with metadata (mcols())
##' @return reduced Genomic Ranges object
##' @import GenomicRanges
##' @export reduceWithMcols
reduceWithMcols = function(range) {
metadata = mcols(range)
for(i in 1:ncol(metadata)) {
metadata[,i] = paste0(as.character(metadata[,i]), collapse = "|")
}
metadata = unique(metadata)
range = reduce(range, min.gapwidth = 0)
mcols(range) = metadata
range
}
##' Generate motif variant class (the first 2 element in the motif names)
##' @rdname motifVariantClass
##' @name motifVariantClass
##' @author Vitalii Kleshchevnikov
##' @param ids character vector of ELM motif IDs
##' @return character vector of ELM motif variant class (such as TRG_NLS or LIG_SH3)
##' @export motifVariantClass
motifVariantClass = function(ids){
matches = gregexpr("^[[:alnum:]]+_[[:alnum:]]+", ids)
for (i in seq_along(matches)) {
ids[i] = substr(ids[i], matches[[i]][1], attr(matches[[i]],"match.length"))
}
ids
}
##' Reduce Overlapping GRanges keeping metadata (collapsing withing each column separated by pipes)
##' @rdname reduceOverlappingGRanges
##' @name reduceOverlappingGRanges
##' @author Vitalii Kleshchevnikov
##' @param GRanges any Genomic Ranges object with metadata (mcols())
##' @param minoverlap min overlap between Genomic Ranges to continue with reduce
##' @return reduced Genomic Ranges object
##' @import GenomicRanges
##' @import data.table
##' @export reduceOverlappingGRanges
reduceOverlappingGRanges = function(GRanges, minoverlap = 5, merge_motif_variants = F, ...){
overlapping_instances = findOverlaps(GRanges, minoverlap = minoverlap, ...)
overlapping_instances = as.data.frame(overlapping_instances[queryHits(overlapping_instances) < subjectHits(overlapping_instances)])
overlapping_instances$queryHits = as.character(overlapping_instances$queryHits)
overlapping_instances$subjectHits = as.character(overlapping_instances$subjectHits)
overlapping_instances_graph = igraph::graph.edgelist(as.matrix(overlapping_instances), directed = F)
if(length(igraph::E(overlapping_instances_graph)) > 0){
overlapping_instances = igraph::fastgreedy.community(overlapping_instances_graph)
overlapping_instances = data.table(range_indx = as.integer(igraph::V(overlapping_instances_graph)$name), cluster = igraph::membership(overlapping_instances))
overlapping_instances = overlapping_instances[order(range_indx)]
non_overlapping_instances = GRanges[-overlapping_instances$range_indx]
overlapping_instances_ranges = GRanges[overlapping_instances$range_indx]
overlapping_instances_ranges = split(overlapping_instances_ranges, overlapping_instances$cluster)
instances_ranges = sapply(overlapping_instances_ranges, function(range, merge_motif_variants){
if(merge_motif_variants){
range = reduceWithMcols(range)
} else {
motif_variants_classes = split(range, motifVariantClass(range$ID))
range = sapply(motif_variants_classes, function(motif_variants_class) reduceWithMcols(motif_variants_class))
range = Reduce(c, range)
}
range
}, merge_motif_variants)
instances_ranges = Reduce(c, instances_ranges)
instances_ranges = c(instances_ranges, non_overlapping_instances)
} else instances_ranges = GRanges
instances_ranges
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.