R/cleanMITAB27.R

##' \code{cleanMITAB} extracts and cleans a set of columns of MITAB2.7
##' @name cleanMITAB27
##' @description function called internally by \code{\link{cleanMITAB}} if the format is MITAB2.7
##' @param mitab data.table containing molecular interaction data in MITAB 2.7 format
##' @author Vitalii Kleshchevnikov
##' @import data.table
cleanMITAB27 = function(mitab){
  # changing column names to data.table-compatible format
  {
    colnames(mitab) = gsub(" ","_",colnames(mitab))
    colnames(mitab) = gsub("\\(|\\)","",colnames(mitab))
    colnames(mitab) = gsub("#","",colnames(mitab))
  }
  # cleaning Taxid "taxid:9606(human)|taxid:9606(Homo sapiens)" to 9606
  {
    mitab[, Taxid_interactor_A := gsub("taxid:|\\(.*$","",Taxid_interactor_A)]
    mitab[, Taxid_interactor_B := gsub("taxid:|\\(.*$","",Taxid_interactor_B)]
    mitab[, Host_organisms := gsub("taxid:|\\(.*$","",Host_organisms)]
    # saving identifier types and cleaning interactor ids
    mitab[, interactor_IDs_databases_A := gsub(":.*$","",IDs_interactor_A)]
    mitab[, interactor_IDs_databases_B := gsub(":.*$","",IDs_interactor_B)]
    mitab[, IDs_interactor_A := gsub("^.*:","",IDs_interactor_A)]
    mitab[, IDs_interactor_B := gsub("^.*:","",IDs_interactor_B)]
    # isoform "-1" is a canonical sequence, IntAct uses isoform "-1" when it's clear that the isoform is "-1" and a canonical identifier if it's not clear which isoform was used in the experiment. Removing isoform sign "-1":
    mitab[, IDs_interactor_A := gsub("-1$", "", IDs_interactor_A)]
    mitab[, IDs_interactor_B := gsub("-1$", "", IDs_interactor_B)]
    # cleaning other information
    mitab[, bait_prey_status_A := gsub("^.*\\(|\\)","",Experimental_roles_interactor_A)]
    mitab[, bait_prey_status_B := gsub("^.*\\(|\\)","",Experimental_roles_interactor_B)]
    mitab[, Publication_Identifiers := gsub("^.*pubmed:|\\|.*$","",Publication_Identifiers)]
    mitab[, Confidence_values := gsub("^intact-miscore:","",Confidence_values)]
    mitab[, Confidence_values := gsub("-","NA",Confidence_values)]
    # supress expected warning (NA introduced by coersion to numeric) to avoid confusion
    suppressWarnings({mitab[, Confidence_values := as.numeric(Confidence_values)]})
    #mitab[, Interaction_identifiers := unlist(gsubfn::strapplyc(Interaction_identifiers,"EBI-[[:digit:]]+",simplify = T)), by =Interaction_identifiers]
    # generating unique identifier for interacting pairs
    mitab[, pair_id := {
      z = sort(c(IDs_interactor_A, IDs_interactor_B))
      paste0(z[1],"|",z[2])
    }, by = .(IDs_interactor_A,IDs_interactor_B)]
  }
  {
    # extract region sufficient to interact or mutation affecting interaction information from Features_interactor_A and Features_interactor_B
    mitab = MITABregionFeature(mitab)

    # reorder by all interactor attribute columns by pair_id (alphanumeric order)
    mitab[, c("IDs_A_order", "IDs_B_order") := tstrsplit(pair_id, "\\|")]
    mitab = reorderMITAB27(mitab)
  }
}

##' \code{reorderMITAB27} reorders interacting molecules in a pair (and all the corresponding columns) according to order provided in IDs_A_order and IDs_B_order columns (latter are deleted)
##' @name reorderMITAB27
##' @description function called internally by \code{\link{cleanMITAB27}} if the format is MITAB2.7
##' @param mitab data.table containing molecular interaction data in MITAB 2.7 format
##' @author Vitalii Kleshchevnikov
##' @import data.table
reorderMITAB27 = function(mitab){
  if(mean(c("IDs_A_order", "IDs_B_order") %in% colnames(mitab)) != 1) stop("columns to order by not provided (IDs_A_order, IDs_B_order)") else {
    mitab[IDs_interactor_A == IDs_B_order & IDs_interactor_B == IDs_A_order,
          c("IDs_interactor_A", "IDs_interactor_B",
            "interactor_IDs_databases_A", "interactor_IDs_databases_B",
            "Taxid_interactor_A", "Taxid_interactor_B",
            "bait_prey_status_A", "bait_prey_status_B",
            "Features_interactor_A", "Features_interactor_B",
            "Identification_method_participant_A", "Identification_method_participant_B",
            "binding_region_A_start", "binding_region_A_end", "binding_region_B_start", "binding_region_B_end",
            "binding_region_A_type", "binding_region_B_type") :=
            .(IDs_interactor_B, IDs_interactor_A,
              interactor_IDs_databases_B, interactor_IDs_databases_A,
              Taxid_interactor_B, Taxid_interactor_A,
              bait_prey_status_B, bait_prey_status_A,
              Features_interactor_B, Features_interactor_A,
              Identification_method_participant_B, Identification_method_participant_A,
              binding_region_B_start, binding_region_B_end, binding_region_A_start, binding_region_A_end,
              binding_region_B_type, binding_region_A_type)]
  }
  # keep only relevant columns
  mitab = unique(mitab[, .(IDs_interactor_A, IDs_interactor_B,
                           interactor_IDs_databases_A, interactor_IDs_databases_B,
                           Taxid_interactor_A, Taxid_interactor_B,
                           Publication_Identifiers, Confidence_values,
                           Host_organisms,
                           bait_prey_status_A, bait_prey_status_B,
                           Interaction_detection_methods, Interaction_types, Interaction_identifiers, Expansion_methods,
                           Features_interactor_A, Features_interactor_B,
                           Identification_method_participant_A, Identification_method_participant_B,
                           binding_region_A_start, binding_region_A_end, binding_region_B_start, binding_region_B_end,
                           binding_region_A_type, binding_region_B_type,
                           pair_id)])
  return(mitab)
}
vitkl/MItools documentation built on May 29, 2019, 2:55 p.m.