R/benchmarkMotifs.R

##' Benchmark linear motif instance found using QSLIMFinder (SLIMFinder)
##' @rdname benchmarkMotifs
##' @name benchmarkMotifs
##' @author Vitalii Kleshchevnikov
##' @param occurence_file a path to a tsv (txt) file containing QSLIMFinder (SLIMFinder) occurence output
##' @param main_file a path to a tsv (txt) file containing QSLIMFinder (SLIMFinder) main output
##' @param domain_res_file path to RData containing objects generated by what_we_find_VS_ELM.Rmd script (specifically \code{domain_results_obj} object)
##' @param motif_setup path to RData containing objects generated by PPInetwork2SLIMFinder pipeline (specifically \code{motif_input_obj} object)
##' @param domain_results_obj character, name of the object containing domain enrichment results (class == XYZinteration_XZEmpiricalPval)
##' @param motif_input_obj character, name of the object of class InteractionSubsetFASTA_list containing: FASTA sequences for interacting proteins, molecular interaction data they correspond to. Each element of a list contains input for individual QSLIMFinder run.
##' @param motif_setup_obj2 alternative way to provide motif_input_obj (class InteractionSubsetFASTA_list) directly. This object should not require matching domain-protein pairs. It must have been already processed by \code{\link{domainProteinPairMatch}} Can be useful for repeating benchmarking.
##' @param occurence_filt QSLIMFinder (SLIMFinder) occurence output filtered by those that we could have found from motif_input_obj.
##' @param one_from_cloud use only one top motif from motif cloud
##' @param dbfile_main a path to a gff (txt) file containing ELM database motif occurrences (proteins in the main set)
##' @param dburl_main url where to get ELM database containing motif occurrences (proteins in the main set)
##' @param dbfile_query a path to a gff (txt) file containing ELM database motif occurrences (proteins in the query set)
##' @param dburl_query url where to get ELM database containing motif occurrences (proteins in the query set)
##' @param query_res_query_only return only GRanges for query proteins, passed to "GRangesINinteractionSubsetFASTA". Do not change the default value.
##' @param motif_types character vector of motif types
##' @param all_res_excl_query all results in the output is all occurences excluding the query proteins. If FALSE, all results include occurences in all proteins. Not implemented
##' @param merge_motif_variants If FALSE (default) merge motif occurences only if motifs are variants of the same motif (such as TRG_NLS).
##' @param seed when using random negative sets (\code{neg_set = "random"}): seed for RNG for sampling
##' @param N when using random negative sets (\code{neg_set = "random"}): number of samples
##' @param replace when using random negative sets (\code{neg_set = "random"}): sample starts of GRanges with replacement \link{randomGRanges}
##' @param within1sequence when using random negative sets (\code{neg_set = "random"}): resample GRanges within one sequence or across sequences \link{randomGRanges}. If seq 1 has two motifs of length 4 and 7 and \code{within1sequence = TRUE} two motifs of the same length 4 and 7 will be sampled from the same protein. If \code{within1sequence = FALSE} two motifs of the same length 4 and 7 will be sampled from any protein in the set used for benchmarking.
##' @param all_predictor_col "Sig"
##' @param query_predictor_col "Sig" or "p.value" or "domain_motif_pval"
##' @param normalise logical, normalise predictor value, just in case predictor doesn't span the full range between 0 ... 1
##' @param minoverlap integer, passed to \code{\link[GenomicRanges]{findOverlaps}}
##' @param minoverlap_redundant for removing motif classes that match the same occurence
##' @param merge_domain_data If TRUE, merge domain enrichment results to motif occurence
##' @param merge_by_occurence_mcols columns of mcols (metadata of GRanges) that contain IDs of [1] protein with motif, [2] proteins with domain, e.g. c("query", "interacts_with"),
##' @param merge_by_domain_res_cols columns of domain enrichment results that contain IDs of [1] protein with motif, [2] proteins with domain, [3] domain, [4] and [5] Taxid for proteins with motif and domain respectively, e.g. c("IDs_interactor_viral", "IDs_interactor_human", "IDs_domain_human", "Taxid_interactor_human","Taxid_interactor_viral"). If Taxid columns are not present - omit.
##' @param merge_by_non_query_domain_res_cols columns of domain enrichment results for non-query proteins that contain IDs of [1] protein with motif, [2] proteins with domain, [3] domain, [4] and [5] Taxid for proteins with motif and domain respectively, e.g. c("IDs_interactor_human_A", "IDs_interactor_human_B", "IDs_domain_human_B", "Taxid_interactor_human_A","Taxid_interactor_human_B"). If Taxid columns are not present - omit.
##' @param filter_by_domain_data criteria to filter domain data and restrict motif search datasets (for example, "p.value < 0.05" or "fdr_pval < 0.05 & domain_count_per_IDs_interactor_viral > 1")
##' @param select_predictor_per_range function (such as min) that select predictor value if multiple values (such as returned by multiple datasets or multiple domains integrated) describe the same range
##' @param non_query_domain_res_file path to RData file containing the result of domain enrichment analysis for non-query proteins
##' @param non_query_domain_results_obj character, name of the object containing domain enrichment results for non-query proteins (class == XYZinteration_XZEmpiricalPval), when provided will be used for filtering datasets.
##' @param non_query_domains_N the number of non-query proteins with predicted domains for each dataset. Used only when non_query_domain_results_obj is not NULL
##' @param non_query_set_only If TRUE sequence sets searched for motif are filtered to contain only proteins from non_query_domain_results_obj (interacting partners of a seed), if FALSE - both from non_query_domain_results_obj and domain_res_obj. Used only when non_query_domain_results_obj is not NULL.
##' @param query_domains_only If TRUE proteins whose sequences will be used for motif search must be predicted to bind the same domains in a seed protein as domains predicted for query protein. Used only when non_query_domain_results_obj is not NULL
##' @param min_non_query_domain_support Minimal number of non-query proteins with the same motif as the query that are predicted to bind the same domain. Used to filter domains and proteins that do not predict top domains. Used only when non_query_domain_results_obj is not NULL.
##' @param min_top_domain_support4motif_nq Similar to min_non_query_domain_support. Minimal number of non-query proteins with the same motif as the query which have the same top-1 domain predicted.
##' @param select_top_domain If TRUE, top domain is selected using a product of domain p-values for all proteins with the same motif (min p-value) found using the same dataset. Used only when non_query_domain_results_obj is not NULL.
##' @param ... other arguments passed to passed to \code{\link[GenomicRanges]{findOverlaps}}
##' @return object class \code{(benchmarkMotifsResult)} containing occurence (GRanges, all, query, just after filtering by motif setup), instances_all (GRanges, known instances in all proteins or all excluding the query proteins), instances_query (GRanges, known instances in query proteins), predictions_all (for ROCR), labels_all (for ROCR), predictions_query (for ROCR), labels_query (for ROCR), overlapping_GRanges_all (GRanges, known instances that we also found), overlapping_GRanges_query(GRanges, known instances that we also found), N_query_prot_with_known_instances, N_query_known_instances, N_all_prot_with_known_instances, N_all_known_instances
##' @import GenomicRanges
##' @import GenomeInfoDb
##' @import data.table
##' @export benchmarkMotifs
##' @seealso \code{\link{ELMdb2GRanges}}, \code{\link{findOverlapsBench}}
benchmarkMotifs = function(occurence_file = "../viral_project/qslimfinder.Full_IntAct3.FALSE/result/occurence.txt",
                           main_file = "../viral_project/qslimfinder.Full_IntAct3.FALSE/result/main_result.txt",
                           domain_res_file = "../viral_project/processed_data_files/domain_res_count_20171019.RData",
                           motif_setup = "../viral_project/processed_data_files/QSLIMFinder_instances_h2v_qslimfinder.Full_IntAct3.FALSE_clust201802.RData",
                           neg_set = c("all_instances", "all_proteins", "random")[1],
                           domain_results_obj = "res_count",
                           motif_input_obj = "forSLIMFinder_Ready",
                           motif_setup_obj2 = NULL,
                           occurence_filt = NULL,
                           one_from_cloud = T,
                           dbfile_main = "../viral_project/data_files/instances_all.gff",
                           dburl_main = "http://elm.eu.org/instances.gff?q=None&taxon=Homo%20sapiens&instance_logic=",
                           dbfile_query = "../viral_project/data_files/instances_query.gff",
                           dburl_query = "http://elm.eu.org/instances.gff?q=all&taxon=irus&instance_logic=",
                           query_res_query_only = T, motif_types = c("DOC", "MOD", "LIG", "DEG", "CLV", "TRG"),
                           all_res_excl_query = T, merge_motif_variants = F,
                           seed = 21, N = 100, replace = T, within1sequence = T,
                           query_predictor_col = "Sig", all_predictor_col = "Sig", normalise = T,
                           minoverlap = 2,
                           minoverlap_redundant = 5,
                           merge_domain_data = T,
                           merge_by_occurence_mcols = c("query", "interacts_with"),
                           merge_by_domain_res_cols = c("IDs_interactor_viral", "IDs_interactor_human", "IDs_domain_human", "Taxid_interactor_human","Taxid_interactor_viral"),
                           merge_by_non_query_domain_res_cols = c("IDs_interactor_human_A", "IDs_interactor_human_B", "IDs_domain_human_B", "Taxid_interactor_human_A","Taxid_interactor_human_B"),
                           filter_by_domain_data = "p.value < 0.05", motif_pval_cutoff = 1,
                           select_predictor_per_range = max,
                           non_query_domain_res_file = "../viral_project/processed_data_files/predict_domain_human_clust20180819.RData",
                           non_query_domain_results_obj = NULL, # res_count_all
                           non_query_domains_N = 0,
                           non_query_set_only = F,
                           query_domains_only = F,
                           min_non_query_domain_support = 0,
                           min_top_domain_support4motif_nq = 0,
                           select_top_domain = F,
                           ...) {

  ### Load domain enrichment results, PPI data, and data used for QSLIMfinder
  # domain enrichment results - query proteins
  domain_res_env = R.utils::env(load(domain_res_file))
  domain_res = domain_res_env[[domain_results_obj]]
  rm(domain_res_env)
  # domain enrichment results - non-query proteins
  if(!is.null(non_query_domain_results_obj)){
    non_query_domain_res = R.utils::env(load(non_query_domain_res_file))[[non_query_domain_results_obj]]
  } else non_query_domain_res = NULL

  # data used for QSLIMfinder
  if(is.null(motif_setup_obj2)) {
    if(file.exists(paste0(motif_setup, ".zip"))) {
      unzip(zipfile = paste0(motif_setup, ".zip"),
            junkpaths = T,
            exdir = dirname(paste0(motif_setup, ".zip")))
      forSLIMFinder_Ready = R.utils::env(load(motif_setup))[[motif_input_obj]]
    } else {
      forSLIMFinder_Ready = R.utils::env(load(motif_setup))[[motif_input_obj]]
    }
  } else {
    forSLIMFinder_Ready = copy(motif_setup_obj2)
    motif_setup_obj2 = T
  }

  # keep only SLIMFinder datasets where seed protein - query protein pair matches filtering criteria
  domain_res = XYZ.p.adjust(domain_res, adj_by = "p.value")
  if(!is.null(filter_by_domain_data)){
    eval(parse(text = paste0("domain_res$data_with_pval = domain_res$data_with_pval[",filter_by_domain_data,"]")))
    if(!is.null(non_query_domain_res)) {
      non_query_domain_res = XYZ.p.adjust(non_query_domain_res, adj_by = "p.value")
      eval(parse(text = paste0("non_query_domain_res$data_with_pval = non_query_domain_res$data_with_pval[",filter_by_domain_data,"]")))
    }
    if(!isTRUE(motif_setup_obj2)){ # calculate only when not provided
      forSLIMFinder_Ready = domainProteinPairMatch(forSLIMFinder_Ready, domain_res,
                                                   non_query_domain_res = non_query_domain_res,
                                                   non_query_domains_N = non_query_domains_N,
                                                   non_query_set_only = non_query_set_only,
                                                   query_domains_only = query_domains_only, remove = T)
    }
  }

  #instances_all = ELMdb2GRanges(dbfile = dbfile_main,
  #                              dburl = dburl_main,
  #                              tsvurl = gsub("gff", "tsv", dburl_main),
  #                              tsvfile = gsub("gff", "tsv", dbfile_main))
  instances_query = ELMdb2GRanges(dbfile = dbfile_query,
                                  dburl = dburl_query,
                                  tsvurl = gsub("gff", "tsv", dburl_query),
                                  tsvfile = gsub("gff", "tsv", dbfile_query))

  ### Filter ELM instances by instance logic == "true positive"
  #instances_all = instances_all[instances_all$InstanceLogic == "true positive"]
  instances_query = instances_query[instances_query$InstanceLogic == "true positive"]

  ### Filter ELM instances located in proteins that we use to search for motifs and select motif types of interest
  #searched_all = GRangesINinteractionSubsetFASTA(grange = instances_all, interactionSubsetFASTA = forSLIMFinder_Ready)
  #message(paste0(sum(searched_all$granges_in_sequences_Searched), " occurences out of ", length(searched_all$granges_in_sequences_Searched), " total in ELM could have been discovered (all proteins, all types of motifs)"))
  #instances_all = instances_all[searched_all$granges_in_sequences_Searched]
  #instances_all = filterBYmotifType(instances_all, motif_types = motif_types)
  #seqlevels(instances_all) <- seqlevelsInUse(instances_all)
  #seqlengths(instances_all) = searched_all$seqlengths[names(seqlengths(instances_all))]


  searched_query = GRangesINinteractionSubsetFASTA(grange = instances_query, interactionSubsetFASTA = forSLIMFinder_Ready, query_only = query_res_query_only)
  message(paste0(sum(searched_query$granges_in_sequences_Searched), " occurences out of ", length(searched_query$granges_in_sequences_Searched), " total in ELM in query proteins could have been discovered (QSLIMFinder query proteins, all types of motifs)"))
  instances_query = instances_query[searched_query$granges_in_sequences_Searched]
  instances_query = filterBYmotifType(instances_query, motif_types = motif_types)
  seqlevels(instances_query) <- seqlevelsInUse(instances_query)
  seqlengths(instances_query) = searched_query$seqlengths[names(seqlengths(instances_query))]

  ### Remove redundancy in known instances
  #instances_all = reduceOverlappingGRanges(instances_all, minoverlap = minoverlap_redundant, merge_motif_variants = merge_motif_variants)
  instances_query = reduceOverlappingGRanges(instances_query, minoverlap = minoverlap_redundant, merge_motif_variants = merge_motif_variants)

  if(is.null(occurence_filt)){
    ### Load QSLIMFinder results and ELM data
    occurence = SLIMFinderOcc2GRanges(occurence_file = occurence_file,
                                      main_file = main_file,
                                      one_from_cloud = one_from_cloud)
    ### Filter discovered occurences by those that we could have found
    found = GRangesINinteractionSubsetFASTA(grange = occurence, interactionSubsetFASTA = forSLIMFinder_Ready, query_only = F)
    occurence = occurence[found$granges_in_sequences_Searched]
    ## filter by predicted motif type
    seqlevels(occurence) <- seqlevelsInUse(occurence)
    seqlengths(occurence) = found$seqlengths[names(seqlengths(occurence))]
    occurence_filt = copy(occurence)
  } else {
    occurence = copy(occurence_filt)
  }
  # apply motif_pval_cutoff
  occurence = occurence[mcols(occurence[,query_predictor_col])[,1] < motif_pval_cutoff]
  # add an identifier for motifs
  mcols(occurence)$unique_position = paste0(GenomicRanges::seqnames(occurence), "_",
                                            GenomicRanges::start(occurence), "_",
                                            GenomicRanges::end(occurence))
  #igraph::plot.igraph(overlapping_instances_graph, vertex.size = 1, arrow.mode = "-", vertex.label.cex = 0.6)

  if(merge_domain_data){
    # standardise column names
    data_with_pval = copy(domain_res$data_with_pval)
    std_cols = c("protein_with_motif", "protein_with_domain", "domain", "Taxid_protein_with_motif","Taxid_protein_with_domain")
    for (col_name in merge_by_domain_res_cols) {
      setnames(data_with_pval, colnames(data_with_pval),
               gsub(col_name,
                    std_cols[which(merge_by_domain_res_cols == col_name)],
                    colnames(data_with_pval)))
    }
    data_with_pval[, domain_set := "query_set"]
    if(!is.null(non_query_domain_res)) {
      data_with_pval_nq = copy(non_query_domain_res$data_with_pval)
      # standardise column names
      for (col_name_nq in merge_by_non_query_domain_res_cols) {
        setnames(data_with_pval_nq, colnames(data_with_pval_nq),
                 gsub(col_name_nq,
                      std_cols[which(merge_by_non_query_domain_res_cols == col_name_nq)],
                      colnames(data_with_pval_nq)))
      }
      data_with_pval_nq[, domain_set := "non_query_set"]
      data_with_pval = rbind(data_with_pval, data_with_pval_nq)
      ## => use select domains that are supported by many human proteins (```[domain_set == "non_query_set" & domain_support_md_pair > 2, .(domain_id, motif_id, dataset_id)]; filter by .(domain_id, motif_id, dataset_id)```)
      ## => and filter datasets when there is no strong support (```[domain_set == "non_query_set" & domain_support_md_pair > 2, dataset_id]; filter by dataset_id```)

      # merge
      occurence = merge2GRangesmcols(occurence, data_with_pval,
                                     by.x = c("prot_names", "interacts_with"),
                                     by.y = c("protein_with_motif", "protein_with_domain"))
      # save order
      mcols(occurence)$motif_id_order = 1:length(occurence)

      # extract metadata
      mcol = as.data.table(mcols(occurence))

      # combine p-values using geometric mean
      gm_mean = function(x, na.rm=TRUE){
        exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
      }
      mcol[, combined_p.value := gm_mean(p.value), by = .(query, Pattern, interacts_with, domain)]

      # find top domain for each motif 1-prod(1-p.value)
      mcol[, top_domain := p.value == min(p.value), by = .(prot_names)]
      mcol[is.na(top_domain), top_domain := FALSE]
      top_temp = mcol[, .(top_domain_support4motif_nq = uniqueN(prot_names)),
                    by=.(query, Pattern, interacts_with, domain, top_domain, domain_set)]
      top_temp = top_temp[top_domain == T & domain_set == "non_query_set"]
      top_temp = top_temp[, top_domain := NULL]
      top_temp = top_temp[, domain_set := NULL]
      mcol = merge(mcol, top_temp, by = c("query", "Pattern", "interacts_with", "domain"), all.x = T, all.y = T)

      # for each motif in each dataset (viral-human pair, protein A - protein B pair) find how many non-query proteins from the same dataset each query-binding domain is predicted to bind
      q_temp = mcol[, .(domain_support4motif = uniqueN(prot_names)),
                    by=.(query, Pattern, interacts_with, domain, domain_set)]
      # add this data to mcols in 2 separate columns
      nq = q_temp[domain_set == "non_query_set", .(query, Pattern, interacts_with, domain,
                                                   domain_support4motif_nq = domain_support4motif)]
      q = q_temp[domain_set == "query_set", .(query, Pattern, interacts_with, domain,
                                              domain_support4motif_q = domain_support4motif)]
      q = merge(q, nq, by = c("query", "Pattern", "interacts_with", "domain"), all.x = T, all.y = T)
      mcol = merge(mcol, q, by = c("query", "Pattern", "interacts_with", "domain"), all.x = T, all.y = T)
      rm(nq, q, q_temp)
      # find which domains are supported by both non_query_set and query_set
      mcol[, domain_in_sets := uniqueN(domain_set),
           by=.(query, Pattern, interacts_with, domain)]

      # add mcol back to GRanges
      setorder(mcol, motif_id_order)
      mcols(occurence) = mcol
      # select domains that are supported by many human proteins
      non_query_domain_support = as.data.table(mcols(occurence))[,  !is.na(domain_support4motif_nq) &
                                                                   domain_support4motif_nq >= min_non_query_domain_support]
      occurence = occurence[non_query_domain_support]
      # select domains that are top-1 for many human proteins
      top_non_query_domain_support = as.data.table(mcols(occurence))[,  !is.na(top_domain_support4motif_nq) &
                                                                   top_domain_support4motif_nq >= min_top_domain_support4motif_nq]
      occurence = occurence[top_non_query_domain_support]

      if(select_top_domain){
        top_domain = as.data.table(mcols(occurence))[, p.value == min(p.value), by = unique_position]
        occurence = occurence[top_domain]
      }
      if(query_domains_only) occurence = occurence[occurence$domain_in_sets == 2]
      #mcol[query == "P06463" & Pattern == "L.R..E" & interacts_with == "P17980" & domain == "IPR003959"]
      #mcol[query == "P06429" & Pattern == "LKE.V" & interacts_with == "Q9H9D4" & domain == "IPR013083"]

    } else {
      # merge
      occurence = merge2GRangesmcols(occurence, data_with_pval,
                                     by.x = c("prot_names", "interacts_with"),
                                     by.y = c("protein_with_motif", "protein_with_domain"))
    }
  }

  ### Filter discovered occurences to keep query occurences only
  found_in_query = GRangesINinteractionSubsetFASTA(grange = occurence, interactionSubsetFASTA = forSLIMFinder_Ready, query_only = T)
  occurence_query = occurence[found_in_query$granges_in_sequences_Searched]
  # keep only cases when we found a motif in a query protein when that protein was a query
  occurence_query = occurence_query[seqnames(occurence_query) == occurence_query$query]
  ## filter by predicted motif type
  seqlevels(occurence_query) <- seqlevelsInUse(occurence_query)
  seqlengths(occurence_query) = found_in_query$seqlengths[names(seqlengths(occurence_query))]

  #suppressWarnings({
  #if(all_res_excl_query){
  #  instances_all = subsetByOverlaps(instances_all, instances_query, type = "equal", invert = T)
  #} else instances_all = c(instances_all, subsetByOverlaps(instances_query, instances_all, type = "equal", invert = T))
  #})

  # normalise, just in case predictor doesn't span the full range 0 ... 1
  if(normalise) {
    occurence_query$Sig_not_normalised = occurence_query$Sig
    occurence_query$Sig = occurence_query$Sig/max(occurence_query$Sig)
    occurence$Sig_not_normalised = occurence$Sig
    occurence$Sig = occurence$Sig/max(occurence$Sig)
  }


  ######################### Negative set: ranges in random locations ######################### START
  if(neg_set == "random") {
    ### Generate random negative sets
    set.seed(seed)
    #random_instances_all = randomGRanges(instances_all, N = N, replace = replace, within1sequence = within1sequence)
    random_instances_query = randomGRanges(instances_query, N = N, replace = replace, within1sequence = within1sequence)

    ### Combine positive and negative datasets
    #combined_instances_all = lapply(random_instances_all, function(random_instance, instances_all){
    #  c(instances_all, random_instance)
    #},instances_all)
    combined_instances_query = lapply(random_instances_query, function(random_instance, instances_query){
      c(instances_query, random_instance)
    },instances_query)

    ### Find overlaps with predicted motifs
    # All
    #suppressWarnings({
    #predictions_all = lapply(combined_instances_all, function(inst){
    #  findOverlapsBench(occuring = occurence, benchmarking = inst, predictor_col = all_predictor_col,
    #                    labels_col = "for_benchmarking", normalise = normalise,
    #                    minoverlap = minoverlap,
    #                    select_predictor_per_range = select_predictor_per_range, ...)$for_ROC$predictions
    #})
    #labels_all = lapply(combined_instances_all, function(inst){
    #  labels = findOverlapsBench(occuring = occurence, benchmarking = inst, predictor_col = all_predictor_col,
    #                             labels_col = "for_benchmarking", normalise = normalise,
    #                             minoverlap = minoverlap,
    #                             select_predictor_per_range = select_predictor_per_range, ...)$for_ROC$labels
    #  labels[grepl("1",labels)] = 1
    #  labels
    #})
    #})
    # Query
    suppressWarnings({
      predictions_query = lapply(combined_instances_query, function(inst){
        findOverlapsBench(occuring = occurence_query, benchmarking = inst, predictor_col = query_predictor_col,
                          labels_col = "for_benchmarking", normalise = normalise,
                          minoverlap = minoverlap,
                          select_predictor_per_range = select_predictor_per_range, ...)$for_ROC$predictions
      })
      labels_query = lapply(combined_instances_query, function(inst){
        labels = findOverlapsBench(occuring = occurence_query, benchmarking = inst, predictor_col = query_predictor_col,
                                   labels_col = "for_benchmarking", normalise = normalise,
                                   minoverlap = minoverlap,
                                   select_predictor_per_range = select_predictor_per_range, ...)$for_ROC$labels
        labels[grepl("1",labels)] = 1
        labels
      })
    })
  }
  ######################### Negative set: ranges in random locations #########################  END

  ######################### Negative set: protein in ELM, motif not in ELM  ######################### START
  if(neg_set == "all_instances"){
    ## query proteins
    combined_instances_query = instances_query
    occurence_query = occurence_query[seqnames(occurence_query) %in% seqnames(instances_query)]
    TP_FN = findOverlapsBench(occuring = occurence_query,
                              benchmarking = instances_query,
                              predictor_col = query_predictor_col,
                              labels_col = "for_benchmarking",
                              normalise = normalise,

                              minoverlap = minoverlap,
                              select_predictor_per_range = select_predictor_per_range)
    TP_FN$for_ROC$labels = 1

    FP = subsetByOverlaps(occurence_query, TP_FN$overlapping_GRanges, type = "equal", invert = T)
    FP$for_benchmarking = 0
    res = data.frame(predictions = 1 - mcols(FP)[,query_predictor_col],
                     labels = mcols(FP)[,"for_benchmarking"],
                     stringsAsFactors = F)

    # choose only one p-value for the same range: occuring_subset
    res_list = split(res, paste0(seqnames(FP), start(FP), end(FP)))
    res_list = lapply(res_list, function(res_over) {
      res_over$predictions = select_predictor_per_range(res_over$predictions)
      unique(res_over)
    })
    res = Reduce(rbind, res_list)

    res = rbind(res, TP_FN$for_ROC)

    predictions_query = res$predictions
    labels_query = res$labels


    ## all proteins
    #combined_instances_all = instances_all
    #occurence = occurence[seqnames(occurence) %in% seqnames(instances_all)]
    #TP_FN_all = findOverlapsBench(occuring = occurence,
    #                              benchmarking = instances_all,
    #                              predictor_col = all_predictor_col,
    #                              labels_col = "for_benchmarking",
    #                              normalise = normalise,
    #minoverlap = minoverlap,
    #                              select_predictor_per_range = select_predictor_per_range)
    #TP_FN_all$for_ROC$labels = 1

    #FP_all = subsetByOverlaps(occurence, TP_FN_all$overlapping_GRanges, type = "equal", invert = T)
    #FP_all$for_benchmarking = 0
    #res_all = data.frame(predictions = 1 - mcols(FP_all)[,all_predictor_col], labels = mcols(FP_all)[,"for_benchmarking"], stringsAsFactors = F)
    #res_all = rbind(res_all, TP_FN_all$for_ROC)

    #predictions_all = res_all$predictions
    #labels_all = res_all$labels
  }
  ######################### Negative set: protein in ELM, motif not in ELM  ######################### END

  ######################### Negative set: protein not in ELM or protein in ELM but motif not in ELM  ######################### START
  if(neg_set == "all_proteins"){
    ## query proteins
    combined_instances_query = instances_query

    TP_FN = findOverlapsBench(occuring = occurence_query,
                              benchmarking = instances_query,
                              predictor_col = query_predictor_col,
                              labels_col = "for_benchmarking",
                              normalise = normalise,

                              minoverlap = minoverlap,
                              select_predictor_per_range = select_predictor_per_range)
    TP_FN$for_ROC$labels = 1

    FP = subsetByOverlaps(occurence_query, TP_FN$overlapping_GRanges, type = "equal", invert = T)
    FP$for_benchmarking = 0
    res = data.frame(predictions = 1 - mcols(FP)[,query_predictor_col], labels = mcols(FP)[,"for_benchmarking"], stringsAsFactors = F)

    # choose only one p-value for the same range: occuring_subset
    res_list = split(res, paste0(seqnames(FP), start(FP), end(FP)))
    res_list = lapply(res_list, function(res_over) {
      res_over$predictions = select_predictor_per_range(res_over$predictions)
      unique(res_over)
    })
    res = Reduce(rbind, res_list)

    res = rbind(res, TP_FN$for_ROC)

    predictions_query = res$predictions
    labels_query = res$labels
    labels_query[grepl("1",labels_query)] = 1

    ## all proteins
    #combined_instances_all = instances_all

    #TP_FN_all = findOverlapsBench(occuring = occurence,
    #                              benchmarking = instances_all,
    #                              predictor_col = all_predictor_col,
    #                              labels_col = "for_benchmarking",
    #                              normalise = normalise,
    #                              minoverlap = minoverlap,
    #                              select_predictor_per_range = select_predictor_per_range)
    #TP_FN_all$for_ROC$labels = 1

    #FP_all = subsetByOverlaps(occurence, TP_FN_all$overlapping_GRanges, type = "equal", invert = T)
    #FP_all$for_benchmarking = 0
    #res_all = data.frame(predictions = 1 - mcols(FP_all)[,all_predictor_col], labels = mcols(FP_all)[,"for_benchmarking"], stringsAsFactors = F)
    #res_all = rbind(res_all, TP_FN_all$for_ROC)

    #predictions_all = res_all$predictions
    #labels_all = res_all$labels
  }
  ######################### Negative set: protein not in ELM or protein in ELM but motif not in ELM  ######################### END

  ### Get  overlapping instances
  suppressWarnings({
    if(neg_set == "random") {
      #  combined_instances_all_temp = combined_instances_all[[1]]
      combined_instances_query_temp = combined_instances_query[[1]]
    } else {
      #  combined_instances_all_temp = combined_instances_all
      combined_instances_query_temp = combined_instances_query
    }

    #overlaps_all = findOverlapsBench(occuring = occurence,
    #                                            benchmarking = combined_instances_all_temp,
    #                                            predictor_col = all_predictor_col,
    #                                            labels_col = "for_benchmarking",
    #                                            normalise = normalise,
    #                                            minoverlap = minoverlap)$overlapping_GRanges
    #overlapping_GRanges_all = overlaps_all$overlapping_GRanges[overlaps_all$overlapping_GRanges$for_benchmarking == 1,]
    overlaps_query = findOverlapsBench(occuring = occurence_query,
                                       benchmarking = combined_instances_query_temp,
                                       predictor_col = query_predictor_col,
                                       labels_col = "for_benchmarking",
                                       normalise = normalise,
                                       minoverlap = minoverlap)
    overlapping_GRanges_query = overlaps_query$overlapping_GRanges[overlaps_query$overlapping_GRanges$for_benchmarking == 1,]
  })

  ##### KNOWN #####
  N_query_prot_with_known_instances = length(unique(as.character(seqnames(instances_query)))) #motif_protein_table_queryKNOWN[, uniqueN(UNIPROT)]#, by = .(ID)]
  # total number of instances
  N_query_known_instances = length(unique(instances_query))

  # motif_protein_table_allKNOWN = as.data.table(cbind(mcols(instances_all)[, c("ID")], UNIPROT = seqnames(instances_all)))
  # number of proteins with motif
  # N_all_prot_with_known_instances = length(unique(as.character(seqnames(instances_all)))) #motif_protein_table_allKNOWN[, uniqueN(UNIPROT)]#, by = .(ID)]
  # total number of instances
  # N_all_known_instances = length(unique(instances_all))

  ##### FOUND #####
  # number of proteins with motif
  N_query_prot_with_known_instances_found = length(unique(as.character(seqnames(overlapping_GRanges_query)))) #motif_protein_table_query[, uniqueN(UNIPROT), by = .(ID, Pattern)]
  # total number of discovered instances
  N_query_known_instances_found = length(unique(overlapping_GRanges_query))
  # number of known instances that match discovered instances
  N_query_match_known_instances_found = N_query_known_instances - length(unique(overlaps_query$not_found))
  # total found
  N_query_total_instances_found = length(unique(paste0(seqnames(occurence_query),
                                                       start(occurence_query), end(occurence_query))))

  # number of proteins with motif
  #N_all_prot_with_known_instances_found = length(unique(as.character(seqnames(overlapping_GRanges_all)))) #motif_protein_table_all[, uniqueN(UNIPROT), by = .(ID, Pattern)]
  # total number of discovered instances
  #N_all_known_instances_found = length(unique(overlapping_GRanges_all))
  # number of known instances that match discovered instances
  #N_all_match_known_instances_found = N_all_known_instances - length(unique(overlaps_all$not_found))
  # total found
  #N_all_total_instances_found = length(unique(paste0(seqnames(occurence_query),
  #                                                   start(occurence_query), end(occurence_query))))


  out = list(
    occurence = occurence, occurence_query = occurence_query, occurence_filt = occurence_filt,
    #instances_all = instances_all,
    instances_query = instances_query,
    #predictions_all = predictions_all, labels_all = labels_all,
    predictions_query = predictions_query, labels_query = labels_query,
    #overlapping_GRanges_all = overlapping_GRanges_all,
    overlapping_GRanges_query = overlapping_GRanges_query,
    N_query_prot_with_known_instances = N_query_prot_with_known_instances, N_query_known_instances = N_query_known_instances,
    #N_all_prot_with_known_instances = N_all_prot_with_known_instances, N_all_known_instances = N_all_known_instances,
    N_query_prot_with_known_instances_found = N_query_prot_with_known_instances_found, N_query_known_instances_found = N_query_known_instances_found,
    N_query_total_instances_found = N_query_total_instances_found,
    #N_all_total_instances_found = N_all_total_instances_found,
    #N_all_prot_with_known_instances_found = N_all_prot_with_known_instances_found, N_all_known_instances_found = N_all_known_instances_found,
    N_query_match_known_instances_found = N_query_match_known_instances_found,
    #N_all_match_known_instances_found = N_all_match_known_instances_found,
    domain_res = domain_res,
    motif_setup_obj2 = forSLIMFinder_Ready)
  class(out) = "benchmarkMotifsResult"
  out
}

##' Get motifs from the output of benchmarking linear motifs by id
##' @rdname benchmarkMotifs
##' @name queryOCCByMCOL
##' @author Vitalii Kleshchevnikov
##' @param res object class \code{(benchmarkMotifsResult)}, the output of benchmarkMotifs
##' @param keytype character, name of the column that contains key identifiers
##' @param key character, identifiers for which to retrieve the result
##' @return GenomicRanges containing motifs for a given key
##' @import GenomicRanges
##' @import data.table
##' @export queryOCCByMCOL
queryOCCByMCOL = function(res, keytype = "IDs_domain_human", key = "IPR032440"){
  if(class(res) != "benchmarkMotifsResult") stop("res should be of class \"benchmarkMotifsResult\"")
  res$occurence_query[which(mcols(res$occurence_query)[, keytype] == key)]
}

##' Benchmark linear motif instance found using QSLIMFinder (SLIMFinder)
##' @rdname benchmarkMotifs
##' @name mBenchmarkMotifs
##' @author Vitalii Kleshchevnikov
##' @param datasets character vector, names of the datasets ("Vidal" in "./SLIMFinder_Vidal/result/occurence.txt" or "" in "./SLIMFinder/result/occurence.txt")
##' @param descriptions character vector, description of the datasets (title of venn diagram plot)
##' @param dir character, base directory. For example, "./" in "./SLIMFinder_Vidal/result/occurence.txt"
##' @return list of objects of class \code{(benchmarkMotifsResult)}
##' @import GenomicRanges
##' @import data.table
##' @export mBenchmarkMotifs
mBenchmarkMotifs = function(datasets = c("qslimfinder.Full_IntAct3.FALSE"),
                            descriptions = c("human network (full IntAct) searched \nfor motifs present in viral proteins"),
                            dir = "./",
                            motif_setup_months = "201802",
                            ...){

  results = lapply(datasets, function(dataset) {

    description = descriptions[datasets == dataset]
    if(length(motif_setup_months) > 1){
      motif_setup_month = motif_setup_months[datasets == dataset]
    } else motif_setup_month = motif_setup_months


    occurence_file = paste0(dir, dataset, "/result/occurence.txt")
    main_file = paste0(dir, dataset, "/result/main_result.txt")
    motif_setup = paste0(dir,"/QSLIMFinder_instances_h2v_", dataset, "_clust", motif_setup_month,".RData")

    # if only viral protein were searched - all_res_excl_query =F
    #if(grepl("all_viral_interaction", dataset)) all_res_excl_query = F

    result = benchmarkMotifs(occurence_file = occurence_file, # generated by mBenchmarkMotifs, all else just passed
                             main_file = main_file, # generated by mBenchmarkMotifs
                             motif_setup = motif_setup, # generated by mBenchmarkMotifs
                             #all_res_excl_query = all_res_excl_query,
                             ...)

    result$description = description
    result
  })

  resnames = character(length(datasets))
  resnames[datasets != ""] = datasets[datasets != ""]
  resnames[datasets == ""] = "full_Intact"
  names(results) = resnames

  results
}

##' Reduce GRanges keeping metadata (collapsing withing each column separated by pipes)
##' @rdname reduceWithMcols
##' @name reduceWithMcols
##' @author Vitalii Kleshchevnikov
##' @param range any Genomic Ranges object with metadata (mcols())
##' @return reduced Genomic Ranges object
##' @import GenomicRanges
##' @export reduceWithMcols
reduceWithMcols = function(range) {
  metadata = mcols(range)
  for(i in 1:ncol(metadata)) {
    metadata[,i] = paste0(as.character(metadata[,i]), collapse = "|")
  }
  metadata = unique(metadata)
  range = reduce(range, min.gapwidth = 0)
  mcols(range) = metadata
  range
}

##' Generate motif variant class (the first 2 element in the motif names)
##' @rdname motifVariantClass
##' @name motifVariantClass
##' @author Vitalii Kleshchevnikov
##' @param ids character vector of ELM motif IDs
##' @return character vector of ELM motif variant class (such as TRG_NLS or LIG_SH3)
##' @export motifVariantClass
motifVariantClass = function(ids){
  matches = gregexpr("^[[:alnum:]]+_[[:alnum:]]+", ids)
  for (i in seq_along(matches)) {
    ids[i] = substr(ids[i], matches[[i]][1], attr(matches[[i]],"match.length"))
  }
  ids
}

##' Reduce Overlapping GRanges keeping metadata (collapsing withing each column separated by pipes)
##' @rdname reduceOverlappingGRanges
##' @name reduceOverlappingGRanges
##' @author Vitalii Kleshchevnikov
##' @param GRanges any Genomic Ranges object with metadata (mcols())
##' @param minoverlap min overlap between Genomic Ranges to continue with reduce
##' @return reduced Genomic Ranges object
##' @import GenomicRanges
##' @import data.table
##' @export reduceOverlappingGRanges
reduceOverlappingGRanges = function(GRanges, minoverlap = 5, merge_motif_variants = F, ...){
  overlapping_instances = findOverlaps(GRanges, minoverlap = minoverlap, ...)

  overlapping_instances = as.data.frame(overlapping_instances[queryHits(overlapping_instances) < subjectHits(overlapping_instances)])
  overlapping_instances$queryHits = as.character(overlapping_instances$queryHits)
  overlapping_instances$subjectHits = as.character(overlapping_instances$subjectHits)

  overlapping_instances_graph = igraph::graph.edgelist(as.matrix(overlapping_instances), directed = F)
  if(length(igraph::E(overlapping_instances_graph)) > 0){
    overlapping_instances = igraph::fastgreedy.community(overlapping_instances_graph)
    overlapping_instances = data.table(range_indx = as.integer(igraph::V(overlapping_instances_graph)$name), cluster = igraph::membership(overlapping_instances))
    overlapping_instances = overlapping_instances[order(range_indx)]
    non_overlapping_instances = GRanges[-overlapping_instances$range_indx]
    overlapping_instances_ranges = GRanges[overlapping_instances$range_indx]
    overlapping_instances_ranges = split(overlapping_instances_ranges, overlapping_instances$cluster)
    instances_ranges = sapply(overlapping_instances_ranges, function(range, merge_motif_variants){
      if(merge_motif_variants){
        range = reduceWithMcols(range)
      } else {
        motif_variants_classes = split(range, motifVariantClass(range$ID))
        range = sapply(motif_variants_classes, function(motif_variants_class) reduceWithMcols(motif_variants_class))
        range = Reduce(c, range)
      }
      range
    }, merge_motif_variants)

    instances_ranges = Reduce(c, instances_ranges)
    instances_ranges = c(instances_ranges, non_overlapping_instances)
  } else instances_ranges = GRanges

  instances_ranges
}
vitkl/SLIMFinderR documentation built on May 3, 2019, 8:08 p.m.