R/load_contacts.R

Defines functions loadHiCpro load_contacts

Documented in load_contacts

#' Construct a contacts-object.
#'
#' Make an object which holds the most needed information of a HiC-experiment at a given resolution.
#'
#' @param signal_path Full path to an Hi-C file. This could be \*.matrix (for HiCpro), \*.cooler or \*.hic (for juicer).
#' @param indices_path Full path the HiC-pro-like index-file. Required when `signal_path` is \*.matrix.
#' @param resolution Set the desired resolution of the matrix when using juicer-data.
#' @param sample_name The name of the sample.
#' @param centromeres One of the following: \itemize{
#'  \item \code{NULL} (default) for empirically estimating centromeres by 
#'  searching for the largest stretch of empty bins.
#'  \item A \code{data.frame} with three columns per chromosome:
#'  chromosome name, start-position and end-positions of centromeric regions.
#'  \item \code{FALSE} to set no centromere information.
#' }
#' @param colour colour associated with sample.
#' @param z_norm Normalise the matrices per-chromosome with a Z-score.
#' @param balancing TRUE (default) will perform matrix balancing for .cooler and KR for.hic.
#' @param scale_bp Scale contacts to have genome-wide sum of `scale_bp` reads (default: 1e9). Set to NULL to skip this.
#' @param scale_cis Only scale with cis-contacts.
#' @param legacy Get a pre-v1 object (mimics the output of construct.experiment.)
#' @param verbose Do you want updates during the construction?
#' @note
#' Some reference genomes have very small "random" or "patch" chromosomes,
#' which can have zero contacts mapped to it (at certain resolutions).
#' `load_contacts` checks this and omits these chromosomes in the resulting
#' experiment-object. The RMCHROM-flag will also be set to TRUE: this will
#' help other GENOVA-functions to deal better with this problem. There is a
#' slight performance-cost during the construction of the object, however.
#' @examples
#' \dontrun{
#' WT_10kb_hicpro <- load_contacts(signal_path = "WT_10kb_iced.matrix", 
#'                                 indices_path = "WT_10kb_abs.bed", 
#'                                 sample_name = "WT", 
#'                                 colour = "black")
#' WT_10kb_cooler <- load_contacts("WT_10kb.cooler", 
#'                                 balancing =T, 
#'                                 sample_name = "WT", 
#'                                 colour = "black")
#' WT_10kb_juicer <- load_contacts("WT_10kb_iced.hic", 
#'                                 resolution = 10e3, 
#'                                 balancing =T, 
#'                                 sample_name = "WT", 
#'                                 colour = "black")
#' }
#' @return An contacts-object, which is a named list of contacts, indices and 
#' attributes for a Hi-C matrix of a given sample at a given resolution.
#' @export
load_contacts = function(signal_path, 
                         indices_path = NULL,
                         resolution = 10e3,
                         sample_name = NULL,
                         centromeres = NULL,
                         colour = NULL,
                         z_norm = FALSE,
                         scale_bp = 1e9,
                         scale_cis = FALSE,
                         balancing = TRUE,
                         legacy = FALSE,
                         verbose = TRUE){
  
  # Control data.table threads
  dt.cores <- data.table::getDTthreads()
  on.exit(data.table::setDTthreads(dt.cores))
  data.table::setDTthreads(1)
  
  if(is.null(sample_name)){
    
    sample_name = basename(signal_path)
    
  }
  doJuicer = F
  doCooler = F
  doHiCpro = F
  balanced = F
  juicerPath = NULL
  coolerPath = NULL
  ##############################################################################
  #################################################### what are we dealing with?
  ##############################################################################
  inputType = switch(tools::file_ext(tolower(signal_path)), 
                     'matrix' = 'hicpro',
                     'cooler' = 'cooler',
                     'cool' = 'cooler',
                     'mcool' = 'cooler',
                     'hic'    = 'juicer')
  
  if(inputType == 'juicer'){juicerPath = signal_path}
  if(inputType == 'cooler'){coolerPath = signal_path}
  
  if(!is.null(juicerPath)){
    ##################################################################### juicer
    
    
    
    if(grepl(juicerPath ,pattern = '^http')){
      stop('signal_path starts with http, but the current version does not support it.')
      # signal_path points to an URL!
      if(unname(RCurl::url.exists(signal_path, .header = T)[7] == "404")){
        stop('signal_path starts with http, but url is not found (404).')
      }
      
    } else if(!file.exists(juicerPath)){
      stop("juicerPath doesn't point to an existing .hic-file.")
    }
    
    # is strawr installed?
    try_require('strawr', "load_contacts", source = 'github')
    
    doJuicer = T
    
    
  } else if(!is.null(coolerPath)){
    ##################################################################### cooler
    
    # a coolerPath is given. Does it exist?
    if(!file.exists(coolerPath)){
      stop("coolerPath doesn't point to an existing file.")
    }
    
    try_require('rhdf5', 'loadCooler', source = 'BIOC')
    
    doCooler = T
    
  } else if(!is.null(signal_path)){
    ##################################################################### hicpro
    
    # a signal_path is given. Does it exist?
    if(!file.exists(signal_path)){
      stop("signal_path doesn't point to an existing file.")
    }
    
    if(!file.exists(indices_path)){
      stop("indices_path doesn't point to an existing file.")
    }
    
    doHiCpro = T
    
  } else {
    stop('please provide either .matrix/indices_path, a .cooler or a .hic.')
  }
  
  ##############################################################################
  #################################################################### load data
  ##############################################################################
  if(verbose){message('Reading data...')}
  if(doJuicer){
    sig_ind = loadJuicer(juicerPath, resolution, scale_bp = scale_bp, scale_cis = scale_cis, balancing = balancing)
    balanced = balancing
  }
  
  if(doHiCpro){
    sig_ind = loadHiCpro(signal_path, indices_path, scale_bp = scale_bp, scale_cis = scale_cis )
    balanced = any(data.table::fread(signal_path, nrows = 1000)[,3] %% 1 != 0)
  }
  
  if(doCooler){
    sig_ind = loadCooler(coolerPath, scale_bp = scale_bp,balancing = balancing, scale_cis = scale_cis, resolution = resolution)
    balanced = balancing
  }
  
  ##############################################################################
  ########################################################## check index and sig
  ##############################################################################
  signal = sig_ind[[1]]
  if(!data.table::is.data.table(signal)){
    stop('signal is no dt')
  }
  if(is.null(data.table::key(signal))){
    data.table::setkey(signal, "V1", "V2")
  }
  
  index = sig_ind[[2]]
  if(!data.table::is.data.table(index)){
    stop('index is no dt')
  }
  if(is.null(data.table::key(index))){
    data.table::setkey(index, "V1", "V2")
  }
  res <- as.numeric(stats::median(index$V3 - index$V2))
  ##############################################################################
  ################################################################### check bins
  ##############################################################################
  
  ICEmaxID <- max(max(signal$V1), max(signal$V2))
  ABSmaxID <- max(index[, 4])
  if (ICEmaxID > ABSmaxID) {
    stop("Undefined HiC-bins.\nWe checked if the highest Hi-C bin in the signal-file could be found in the indices-file: it couldn't.\nPlease check if indices and signal are from the same reference genome and resolution!")
  }
  if (abs(ICEmaxID - ABSmaxID) > ICEmaxID * 0.1) {
    warning("Undefined BED-bins.\nWe checked if the highest Hi-C bin in the indices-file could be found in the signal-file: it couldn't.\nPlease check if indices and signal are from the same reference genome and resolution!")
  }
  
  ##############################################################################
  ########################################################## remove empty chroms
  ##############################################################################
  chromRLE = rle(index$V1)
  CS = cumsum(c(1,chromRLE$lengths))
  
  tmpsignal = signal
  tmpsignal$C1 = chromRLE$values[findInterval(tmpsignal$V1, CS)]
  tmpsignal$C2 = chromRLE$values[findInterval(tmpsignal$V2, CS)]
  foundChroms = unique(c(unique(tmpsignal$C1), unique(tmpsignal$C2)))
  RMCHROM = length(foundChroms) != length(chromRLE$values)
  
  ##############################################################################
  ###################################################################### Z-score
  ##############################################################################
  if(z_norm){
    if(verbose){message('Running Z-score normalisation...')}
    signal = zscore_hic(signal, index)
  }
  
  ##############################################################################
  ############################################################# set to upper tri
  ##############################################################################
  if(!all(signal$V1 <= signal$V2)){
    signal[, c("V1", "V2") := list(pmin(V1, V2), pmax(V1, V2))]
    setkeyv(signal, c("V1", "V2"))
  }
  
  ##############################################################################
  ############################################################ check centromeres
  ##############################################################################
  
  if (!is.null(centromeres)) {
    if (inherits(centromeres, "data.frame")) {
      centromeres <- clean_centromeres(centromeres, index)
    } else if (is.logical(centromeres) && # Literal check for literal FALSE
               length(centromeres) == 1 && 
               !centromeres) {
      centromeres <- data.table(
        chrom = unique(index[[1]]),
        start = -1,
        end = -1
      )
    } else {
      stop(paste0("Don't know how to interpret `", typeof(centromeres), "` as",
                  "centromeres"), call. = FALSE)
    }
  } else {
    centromeres <- find_centromeres(signal, index)
  }
  
  ##############################################################################
  ###############################################################  Contruct list
  ##############################################################################
  if( is.null(colour) ){
    colour = 'black'
  }
  out = structure(list(
    # Iced HiC-matrix in three-column format (i.e. from HiC-pro)
    MAT = signal,
    
    # HiC-index in four-column format (i.e. from HiC-pro)
    IDX = index,
    
    # Available chromosomes
    CHRS = foundChroms,
    
    # DF with chr, start, stop
    CENTROMERES = centromeres
    
  ),
  # classes
  class = "contacts",
  # attrs
  znorm = z_norm, samplename = sample_name, colour = colour,
  resolution = res, rmChrom = RMCHROM, balanced = balanced, scale_cis = scale_cis,
  # packgs
  package = "GENOVA")
  
  
  if(legacy){
    warning("Legacy is a soft deprecation option, which ~someday~ will be removed.\nUsing the legacy option means you're longing for the good old days.
            Don't: the future has zoomy functions and shiny plots.")
    out = legacy_out(out)
  }
  
  return(out)
}



loadHiCpro = function(signal_path, indices_path, scale_bp, scale_cis){
  
  # init
  .          <- NULL
  V1         <- NULL
  V4         <- NULL
  
  ABS <- data.table::fread(indices_path, header = F, data.table = T)
  
  SIG = NULL
  if(scale_cis){
    SIG <- read.hicpro.matrix(signal_path, scale_bp = NULL)
    
    chromRange = ABS[ , .(first = min(V4)), by = V1]
    chromRange = chromRange[order(chromRange$first),]
    
    F1 = findInterval(SIG$V1, chromRange$first)
    F2 = findInterval(SIG$V2, chromRange$first)
    
    SIG$V3 <- scale_bp * SIG$V3 / sum(SIG[ifelse(F1 == F2, T, F), 3])
    
  } else {
    SIG <- read.hicpro.matrix(signal_path, scale_bp = scale_bp)
  }
  
  return(list(SIG, ABS))
}


zscore_hic = function(SIG, ABS){
  
  # init
  D          <- NULL
  V3         <- NULL
  C1         <- NULL
  C2         <- NULL
  
  chromRLE = rle(ABS$V1)
  
  CS = cumsum(c(1,chromRLE$lengths))
  SIG$C1 = chromRLE$values[findInterval(SIG$V1, CS)]
  SIG$C2 = chromRLE$values[findInterval(SIG$V2, CS)]
  SIG$D = abs(SIG$V1-SIG$V2)
  
  SIG[SIG$C1 != SIG$C2, 'D'] = 0
  
  Z = SIG[ , V3 := scale(V3), by = list(C1, C2, D)]
  Z = Z[,-c(4:6)]
  Z$V3[is.nan(Z$V3)] = 1
  
  data.table::setkey(Z, "V1", "V2")
  
  return(Z)
}

legacy_out = function(out){
  newOut = list(
    # Iced HiC-matrix in three-column format (i.e. from HiC-pro)
    ICE = out$MAT,
    
    # HiC-index in four-column format (i.e. from HiC-pro)
    ABS = data.table::as.data.table(out$IDX),
    
    # Name of sample
    NAME = attr(out, 'samplename'),
    
    # Resolution of sample
    RES = attr(out, 'resolution'),
    
    # Available chromosomes
    CHRS = out$CHRS,
    
    # colour of sample (optional, but recommended for running RCP)
    COL = attr(out, 'colour'),
    
    # comments
    COMM = "",
    
    # Vector of masked bins
    MASK = vector(),
    
    # DF with chr, start, stop
    CENTROMERES = out$CENTROMERES,
    
    # Are there now missing bins?
    RMCHROM =  attr(out, 'rmChrom')
    
  )
  return(newOut)
  
}

#' Read hicpro three column matrix format.
#'
#' This function loads a HiC-pro file as a matrix. It assumes a three-column
#' layout: bin1, bin2 and score. All scors are normalised to contacts per *scale_bp*
#' total contacts.
#'
#' @param file Full path to file.
#' @param scale_bp Normalising factor. Set to NULL to skip scale_bp.
#' @return A data.table with normalised counts.
read.hicpro.matrix <- function(file, scale_bp = 1e9) {
  data <- data.table::fread(file)
  data.table::setkey(data, "V1", "V2")
  if (!is.null(scale_bp)) {
    data$V3 <- scale_bp * data$V3 / sum(data$V3)
  }
  return(data)
}
robinweide/GENOVA documentation built on March 14, 2024, 11:16 p.m.