R/rGMAP_source.R

Defines functions plotdom refm_hic transf_tad_horz triangle_tad data_simu rGMAP rGMAP_singChr call_domain gmixmodel0 gmixmodel rm_fpdomain_local rm_fpdomain tune_allPara tune_T1T2 tune_insulScore cal_insul_org cal_insul full_seg def_tad_gmap localMaxima cal_stat btw_stat down_stat up_stat

Documented in data_simu plotdom rGMAP

##---------------------------------------------------------------------##
## --This program is written by Wenbao Yu for implementing GMAP -------##
## --This file contains all the source codes
## -- filter peak if it's not a local peak of insulation in orignal; update tune_score
## -- function without using prefixed domain as background
##---------------------------------------------------------------------##

# call upstream window counts of class1
up_stat <- function(x, pp, d = 50){
  pairx = (x - d + 1) : x
  if(x < d) pairx = 1:x
  if(x < 5) pairx = 1 : 5   # use 4 as a buffer
  num_class1 = sum(pp[pairx, pairx])
  tnum = (length(pairx))^2
  return(c(num_class1, tnum))
}
up_stat = cmpfun(up_stat)


# call downstream window counts of class1
down_stat <- function(x, pp, d = 50){
  pairx = x:(x + d - 1)
  if(x > nrow(pp) - d + 1) pairx = (nrow(pp) - d + 1) : nrow(pp)
  if(x > nrow(pp) - 5) pairx = (x-4) : nrow(pp)

  num_class1 = sum(pp[pairx, pairx])
  tnum = (length(pairx))^2
  return(c(num_class1,  tnum))
}
down_stat = cmpfun(down_stat)


# call counts of class1 beween the upwindow and downwindow
btw_stat <- function(x, pp, d = 50){
  pairx1 = (x - d + 1) : x
  pairx2 = x : (x + d - 1)

  if(x < d) pairx1 = 1:x
  if(x < 5) pairx1 = 1 : 5

  if(x > nrow(pp) - d + 1) pairx2 = (nrow(pp) - d + 1) : nrow(pp)
  if(x > nrow(pp) - 5) pairx2 = (x-4) : nrow(pp)

  num_class1 = sum(pp[pairx1, pairx2]) + sum(pp[pairx2, pairx1])
  tnum = 2 * length(pairx1) * length(pairx2)
  return(c(num_class1,  tnum))
}
btw_stat = cmpfun(btw_stat)


## calculate within-between test stat
cal_stat <- function(pp, d = 50){
  bpos = 1:nrow(pp)
  ustat = lapply(bpos, up_stat,  pp=pp, d = d)
  dstat = lapply(bpos, down_stat, pp=pp,  d = d)
  bstat = lapply(bpos, btw_stat, pp=pp,  d = d)

  n_up = sapply(ustat, function(x) x[2])
  n_btw = sapply(bstat, function(x) x[2])
  n_down = sapply(dstat, function(x) x[2])

  prop_up = sapply(ustat, function(x) x[1]/x[2])
  prop_btw = sapply(bstat, function(x) x[1]/x[2])
  prop_down = sapply(dstat, function(x) x[1]/x[2])

  prop_up[n_up == 0] = 0L
  prop_down[n_down == 0] = 0L
  prop_btw[n_btw == 0] = 0L

  # for testing difference between up and down
  p0 = (n_down * prop_down + n_up * prop_up)/(n_up + n_down)
  serror = sqrt(p0 * (1-p0) * (1/(n_up) + 1/n_down))
  test = (prop_up - prop_down)/serror
  test[serror == 0] = 0L

  stat_up = pmax(0, test)
  stat_down = pmax(0, -test)


  # testing up+down > between
  n = length(bpos)
  p0 = (n_down * prop_down + n_up * prop_up + n_btw * prop_btw) / (n_down + n_btw + n_up)
  serror = sqrt(p0 * (1-p0) * (1/(n_up + n_down) + 1/n_btw))
  prop_within = (n_down * prop_down + n_up * prop_up)/(n_up + n_down)
  stat_wb = (prop_within - prop_btw)/serror
  stat_wb[serror == 0] = 0L






  return(list('up' = round(stat_up, 2), 'down' = round(stat_down,2),
              'wb' = round(stat_wb, 2)))

}
cal_stat = cmpfun(cal_stat)



## find local peaks for a given vector
localMaxima <- function(x,  stats1, thr = 0.75,  dp = 10) {
  # smmothing using sliding window before search local peaks
  # filter by fc
  x = round(runmean(x, 5), 2)

  stats1 = round(runmean(stats1, 5), 2)

  y = extrema(x)$maxindex[, 1]

  y_fc = extrema(stats1)$maxindex[, 1]

  if(length(y_fc) > 0){
    y_mdist_fc = sapply(y, function(t) min(abs(t - y_fc)))
    y_stat1 = stats1[y]
    y = y[y_mdist_fc <= 3 & y_stat1 > 0]

  }else{
    return(NULL)
  }
  xy = x[y]

  # filter by t1
  local_t1_quantile <- function(ly){
    ll = max(ly - 250, 1)
    rr = min(ly + 250, length(x))
    return(quantile(x[ll : rr], thr))
  }

  local_t1thr = quantile(x, thr)

  if(length(x) >= 500) local_t1thr = sapply(y, local_t1_quantile)


  y = y[xy >= local_t1thr]  ## screening by t1


  if(length(y) == 0) return(NULL)

  # remove peaks that are very close
  if(!is.null(y)) y = combPeaks(y, x, dp)


  y = y[y >= 6]
  y = y[y <= (length(x) - 6)]

  return(y)
}
localMaxima = cmpfun(localMaxima)



## define tad by gmap given peaks and statistics
def_tad_gmap <- function(wb_peaks, stat_up, stat_down, nn, thr = 2){
  s1 = stat_up[wb_peaks]
  s2 = stat_down[wb_peaks]


  len = length(wb_peaks)
  signs = rep(0, len)

  ind1 = which(s1 > thr)
  ind2 = which(s2 > thr)
  if(length(ind1) > 0) signs[ind1] = 1
  if(length(ind2) > 0) signs[ind2] = -1

  start = end = NULL


  # start call first tad
  if(signs[1] == -1){
    start = c(start, 1)
    i = 1
  }
  if(signs[1] == 0){
    start = c(start, 0)
    end = c(end, 1)
    start = c(start, 1)
    i = 1
  }
  if(signs[1] == 1){
    start = c(start, 0)
    i = 0
  }

  repeat{
    if(i == len) break
    for(j in (i+1):len){

      if(signs[j] == -1 & j != len) next
      if(signs[j] == -1 & j == len) break

      if(signs[j] == 0){
        end = c(end, j)
        start = c(start, j)
        break
      }

      if(signs[j] == 1){
        if(j==len){
          end = c(end, j)
          break
        }

        if(j < len & signs[j+1] != 1) {
          end = c(end, j)
          start = c(start, j+1)
          break
        }

        if(j < len & signs[j+1] == 1) next

      }
    }  ## end of for loop
    i = max(start)
    if(j == len) break
  }    ## end of repeat loop

  # final ends
  if(length(start) > length(end)){
    end <- c(wb_peaks[end], nn)
  }else{
    end <- wb_peaks[end]
  }

  # final starts
  if(start[1] == 0){
    start = c(1, wb_peaks[start[-1]])
  }else{
    start = wb_peaks[start]
  }

  dd = data.frame("start" = start, "end" = end)
  dd = dd[dd$start != dd$end, ]
  return(dd)
}
def_tad_gmap = cmpfun(def_tad_gmap)



## using boundary to define segmatation
full_seg <- function(bounds){
  bounds = sort(bounds)
  len = length(bounds)
  return(data.frame('start' = bounds[-len], 'end' = bounds[-1]))
}
full_seg = cmpfun(full_seg)

## calculate insulation score
cal_insul = function(x, pp, wd = 50){
  nn = nrow(pp)
  start_bin = max(1, (x - wd))
  end_bin = min(nn, (x + wd))
  intra.score = sum(c(pp[start_bin:x, start_bin:x], pp[x:end_bin, x:end_bin]))
  inter.score = 2*sum(pp[start_bin:x, x:end_bin])
  insul.score = log2(intra.score + 1) - log2(inter.score + 1)
  return(insul.score)
}

## not used version
cal_insul_org = function(x, pp, wd = 50){
  nn = nrow(pp)
  start_bin = max(1, (x - wd))
  end_bin = min(nn, (x + wd))
  intra.score = sum(c(pp[start_bin:x, start_bin:x], pp[x:end_bin, x:end_bin]))
  inter.score = 2*sum(pp[start_bin:x, x:end_bin])
  insul.score = intra.score  - inter.score
  return(insul.score)
}


## without using predifined area: using intra-tad over inter-tad instead
tune_insulScore <- function(pp, tads, wd = 50){
  nn = nrow(pp)

  bds = sort(unique(unlist(tads)))
  bds = setdiff(bds, c(1, nn))
  insul.score = lapply(bds, cal_insul, pp, wd)
  insul.score = do.call('c', insul.score)

  if(all(is.na(insul.score))) return(0)

  score.summ = mean(insul.score) * log(length(insul.score[insul.score > 0]) + 1)

  return(score.summ)
}
tune_insulScore = cmpfun(tune_insulScore)

## including tune t1 and t2 -- given dp and d
tune_T1T2 <- function(pp, stats, stats1, d, dp, t1thr = 0.5){

  stat_up = stats$up
  stat_down = stats$down
  stat_wb = stats$wb
  pr_fc_wb = stats$pr_fc_wb
  nn = nrow(pp)

  # no local peaks
  if(all(diff(stat_wb) <=0) || all(diff(stat_wb) >= 0)) return(list('score' = 0, 'tads' = NULL))

  sthr = 0.05
  sthr0 = round(qnorm(sthr/nn, lower.tail = F), 1)

  # 75% quantile is too close to 99% quantile
  if(quantile(stat_wb, 0.9) - quantile(stat_wb, 0.75) < 1 ) return(list('score' = 0, 'tads' = NULL))


  # give candidates for t1
  stat_wb = runmean(stat_wb, 5)

  cand_t1 = seq(0.99, t1thr, length = 50)  # use quantile

  # define t2 candidates globally
  tmp = c(stat_up, stat_down)
  cand_t2 <- quantile(tmp[tmp != 0], seq(0.975, 1.0, length = 2))
  tmp = qnorm(sthr, lower.tail = F)
  cand_t2 = unique(pmax(tmp, round(cand_t2, 2)))


  len1 = length(cand_t1)
  len2 = length(cand_t2)
  score = matrix(0, len1, len2)

  for(i in 1:len1){
    wb_peaks = localMaxima(stat_wb, stats1, thr = cand_t1[i], dp = dp)

    if(is.null(wb_peaks)) {
      score[i, ] = 0
      next
    }

    if(length(wb_peaks) == 0) {
      score[i, ] = 0
      next
    }

    for(j in 1:len2){

      tads <- def_tad_gmap(wb_peaks, stat_up, stat_down, nn, thr = cand_t2[j])

      if(nrow(tads) == 0) {
        score[i, j] = 0
      }else{
        score[i, j] <- tune_insulScore(pp, tads)
      }
    }
  }

  if(max(score, na.rm = TRUE) <= 0) return(list('score' = 0, 'tads' = NULL))

  ind = which(score == max(score, na.rm = TRUE), arr.ind=TRUE)[1, ]
  wb_peaks = localMaxima(stat_wb, stats1, thr = cand_t1[ind[1]],  dp = dp)

  if(is.null(wb_peaks)) return(list('score' = 0, 'tads' = NULL))

  tads <- def_tad_gmap(wb_peaks, stat_up, stat_down, nn,
                       thr = cand_t2[ind[2]])

  return(list('tads' = tads, 'score' = max(score), 't1' = round(cand_t1[ind[1]], 3),
              't2' = round(cand_t2[ind[2]], 3) , 'stats' = stats,
              'local_peaks' = wb_peaks))
}
tune_T1T2 = cmpfun(tune_T1T2)




## tune all parameters -- give candidates of d and dp
tune_allPara <- function(pp, pp1, candD, candDp,  t1thr = 0.5){
  lend = length(candD)
  lendp = length(candDp)
  score = matrix(0, lend, lendp)
  nn = nrow(pp)
  sthr = 0.05

  for(i in 1:lend){
    di = candD[i]
    stats = cal_stat(pp, d = di)
    insul_ws = 20

    stats1 = lapply(1:nn, cal_insul, pp1, insul_ws)
    stats1 = do.call('c', stats1)

    if(all(stats$wb <= qnorm(sthr/nn, lower.tail = F))) {
      score[i, ] = 0
      next
    }
    for(j in 1:lendp){
      score[i, j] = tune_T1T2(pp, stats, stats1, d = candD[i], dp = candDp[j],
                               t1thr = t1thr)$score
    }
  }

  if(max(score, na.rm = TRUE) <= 0) return(NULL)

  ind = which(score == max(score, na.rm = TRUE), arr.ind = T)[1, ]
  d = candD[ind[1]]
  dp = candDp[ind[2]]

  stats = cal_stat(pp, d = d)
  res = tune_T1T2(pp, stats,  stats1, d = d, dp = dp, t1thr = t1thr)
  res$d = d
  res$dp = dp
  return(res)

}
tune_allPara = cmpfun(tune_allPara)



## remove redundant domains
rm_fpdomain <- function(subTads, pp){

  subTads = as.matrix(subTads)
  bds = sort(unique(as.vector(subTads)))

  ## filter domains with very small contacts
  subTads = as.matrix(full_seg(bds))

  s_density = round(apply(subTads, 1, function(x) mean(pp[x[1]:x[2], x[1]:x[2]])), 2)
  tsize = round(apply(subTads, 1, function(x) x[2] - x[1]))
  nn = nrow(pp)

  calDensity_expect <- function(tsize0){
    ids = lapply(1:nn, function(x) return(cbind(x, c(max(1, x - tsize0):min(nn, x + tsize0)))))
    ids = do.call('rbind', ids)
    return(round(mean(pp[ids]), 2))
  }


  e_density = sapply(tsize, calDensity_expect)

  ids = which(s_density <= e_density)
  if(length(ids) > 0) subTads = subTads[-ids, ]

  return(subTads)
}
rm_fpdomain = cmpfun(rm_fpdomain)


## remove redundant domains locally
## dw represents dw up- and down- range of the current domain
rm_fpdomain_local <- function(subTads, pp, dw = 10){

  subTads = as.matrix(subTads)
  bds = sort(unique(as.vector(subTads)))

  ## filter domains with very small contacts
  subTads = as.matrix(full_seg(bds))

  s_density = round(apply(subTads, 1, function(x) mean(pp[x[1]:x[2], x[1]:x[2]])), 2)
  tsize = round(apply(subTads, 1, function(x) x[2] - x[1]))
  midp = apply(subTads, 1, function(x) floor(x[2]/2 + x[1]/2))
  nn = nrow(pp)

  calDensity_expect_local <- function(j, dw0 = dw){
    tsize0 = tsize[j]
    midp0 = midp[j]
    ids = lapply(max(1, midp0 - dw0 * tsize0) : min(nn, midp0 + dw0 * tsize0), function(x) return(cbind(x, c(max(1, x - tsize0):min(nn, x + tsize0)))))
    ids = do.call('rbind', ids)
    return(round(mean(pp[ids]), 2))
  }


  e_density = sapply(1:length(tsize), calDensity_expect_local)

  ids = which(s_density <= e_density)
  if(length(ids) > 0) subTads = subTads[-ids, ]

  return(subTads)
}
rm_fpdomain_local = cmpfun(rm_fpdomain_local)


## transfrom hic normalized count to binary
gmixmodel <- function(sub_mat, hthr = 0.9, maxDistInBin = 300){
  nn = nrow(sub_mat)

  pp = matrix(0L, nn, nn)

  # if the tad is too small (less than 10 bins), do not call subtad
  if(nn <= 10) return(pp)

    ## if the hic-matrix is too large, we assume two loci with distance larger than
    ## say 1000 bins have no interactions. Data from this part will not be used for
    ## constructing the mixture models
  if(maxDistInBin > nn) maxDistInBin = nn
  partialIDs = sapply(1:(nn-1), function(x) return(cbind(x, c((x + 1):min(nn, x + maxDistInBin)))))
  partialIDs = do.call('rbind', partialIDs)
  temp = sub_mat[partialIDs]

    # deal with outliers
    class1 = rep(0, length(temp))

    # deal with outliers

    out.thr.upper = quantile(temp, 0.95)
    #out.thr.lower = max(quantile(temp, 0.025), 0)
    out.thr.lower = quantile(temp, 0.05)

    id.lower = which(temp <= out.thr.lower)
    id.upper = which(temp >= out.thr.upper)
    ids = NULL

    if(length(id.upper) > 0){
      class1[id.upper] = 1
      ids = c(id.lower, id.upper)
      temp = temp[-ids]
      class0 = class1[-ids]
    }else{
      if(length(id.lower) > 0){
        ids = id.lower
        temp = temp[-ids]
        class0 = class1[-ids]
      }else{
        class0 = class1
      }
    }


    if(length(unique(temp)) >= 2){
      model2 <- Mclust(temp, G = 2, modelNames = 'E')

      if(!is.null(model2)){
        ind = which.max(model2$parameters$mean)
        posterior = model2$z[, ind]
        class0 = ifelse(posterior > hthr, 1, 0)
        rm(posterior)
      }

      if(is.null(model2)){ ## not covergent
        message('Trying using models with different variances')
        model2 <- Mclust(temp, G = 2, modelNames = 'V')
        if(!is.null(model2)) {
          ind = which.max(model2$parameters$mean)
          posterior = model2$z[, ind]
          class0 = ifelse(posterior > hthr, 1, 0)
          rm(posterior)
        }else{
          message('No models can be fitted, using 75% quantile as cutoff')
          class0 = ifelse(temp > quantile(temp, 0.75), 1, 0)
        }
      }


      # just in case:
      if(length(unique(class0)) == 1 )  class0 = ifelse(temp > quantile(temp, 0.75), 1, 0)
      rm(temp, model2)
    }

    if(length(ids) > 0) {
      class1[-ids] = class0
      pp[partialIDs] = class1
    }else{
      pp[partialIDs] = class0
    }

    pp = pp + t(pp)
    rm(class0, partialIDs)



  diag(pp) <- 0

  return(list('binary_mat' = pp, 'mat_adjDist' = sub_mat))
}
gmixmodel = cmpfun(gmixmodel)



## not used version
gmixmodel0 <- function(sub_mat, hthr = 0.9, maxDistInBin = 300){
  nn = nrow(sub_mat)

  ## consider distance
  if(F){
    exp_mat = matrix(0, nn, nn)
    for(dist.bin in 0:maxDistInBin){
      row.ids = 1:(nn - dist.bin)
      col.ids = row.ids + dist.bin
      pids = cbind(row.ids, col.ids)
      exp.n = mean(sub_mat[pids], trim = 0.25)
      #exp.n = median(sub_mat[pids])
      exp_mat[pids] = exp.n
    }
    exp_mat = exp_mat/2 + t(exp_mat)/2
    sub_mat = sub_mat - exp_mat

  }



  pp = matrix(0L, nn, nn)

  # if the tad is too small (less than 10 bins), do not call subtad
  if(nn <= 10) return(pp)

  ## if the hic-matrix is too large, we assume two loci with distance larger than
  ## say 1000 bins have no interactions. Data from this part will not be used for
  ## constructing the mixture models
  if(maxDistInBin > nn) maxDistInBin = nn
  partialIDs = sapply(1:(nn-1), function(x) return(cbind(x, c((x + 1):min(nn, x + maxDistInBin)))))
  partialIDs = do.call('rbind', partialIDs)
  temp = sub_mat[partialIDs]

  # deal with outliers
  class1 = rep(0, length(temp))

  # deal with outliers

  out.thr.upper = quantile(temp, 0.975)
  #out.thr.lower = max(quantile(temp, 0.025), 0)
  out.thr.lower = quantile(temp, 0.025)

  id.lower = which(temp <= out.thr.lower)
  id.upper = which(temp >= out.thr.upper)
  ids = NULL

  if(length(id.upper) > 0){
    class1[id.upper] = 1
    ids = c(id.lower, id.upper)
    temp = temp[-ids]
    class0 = class1[-ids]
  }else{
    if(length(id.lower) > 0){
      ids = id.lower
      temp = temp[-ids]
      class0 = class1[-ids]
    }else{
      class0 = class1
    }
  }


  if(length(unique(temp)) >= 2){
    model2 <- Mclust(temp, G = 2, modelNames = 'E')

    if(!is.null(model2)){
      ind = which.max(model2$parameters$mean)
      posterior = model2$z[, ind]
      class0 = ifelse(posterior > hthr, 1, 0)
      rm(posterior)
    }

    if(is.null(model2)){ ## not covergent
      message('No models can be fitted, using 75% quantile as cutoff')
      class0 = ifelse(temp > quantile(temp, 0.75), 1, 0)
    }


    # just in case:
    if(length(unique(class0)) == 1 )  class0 = ifelse(temp > quantile(temp, 0.75), 1, 0)
    #rm(temp, model2)
  }

  if(length(ids) > 0) {
    class1[-ids] = class0
    pp[partialIDs] = class1
  }else{
    pp[partialIDs] = class0
  }

  pp = pp + t(pp)
  rm(class0, partialIDs)



  diag(pp) <- 0

  return(list('binary_mat' = pp, 'mat_adjDist' = sub_mat))
}
gmixmodel0 = cmpfun(gmixmodel0)


call_domain <- function(sub_mat, max_d, min_d, max_dp, min_dp,  hthr = 0.5,
                        maxDistInBin = 200, t1thr = 0.5){

  nn = nrow(sub_mat)
  ##
  gres = gmixmodel(sub_mat, hthr, maxDistInBin)
  pp = gres$binary_mat
  sub_mat = gres$mat_adjDist
  rm(gres)
  if(sum(pp) == 0) return(list('tads' = NULL))

  # define candidates for d and dp
  #candD = seq(min_d, max_d, by = min_d)
  candD = seq(min_d, max_d, by = min_d)

  candDp = seq(min_dp, max_dp, by = min_dp)

  # define the background
  if(maxDistInBin > nn){

    rnum_bk <- sum(pp)
    tnum_bk <- length(pp)

    rnum_bk0 = sum(sub_mat)

  }else{
    idBg = lapply(1:nn, function(x) return(cbind(x, c(max(1, x - maxDistInBin):min(nn, x + maxDistInBin)))))
    idBg = do.call('rbind', idBg)
    temp = pp[idBg]
    rnum_bk <- sum(temp)
    tnum_bk <- length(temp)
  }

  if(rnum_bk >= 0.95 * tnum_bk || rnum_bk < 1/sqrt(tnum_bk) * tnum_bk) return(list("tads" = NULL))

  pp1 = pp
  pp1[idBg] = sub_mat[idBg]
  diag(pp1) = 0

  #pp1 = gmixmodel(sub_mat, hthr, floor(maxDistInBin * 1.2))

  res = tune_allPara(pp, pp1, candD, candDp, t1thr)
  subTads = res$tads



  if(!is.null(subTads)) res$tads = rm_fpdomain_local(subTads, pp)

  return(res)
}
call_domain = cmpfun(call_domain)


# run rGMAP for a single chromosom
rGMAP_singChr <- function(hic_mat, resl = 10*10^3, logt = T, dom_order = 2,
                          maxDistInBin = min(200, 2*10^6/resl), min_d = 25, max_d = 100,
                          min_dp = 5, max_dp = 10, hthr = 0.95, t1thr = 0.5){
  if(ncol(hic_mat) == 3){
    names(hic_mat) = c('n1', 'n2', 'counts')
    hic_mat = data.table(hic_mat)
    hic_mat = hic_mat[abs(n1 - n2) <= maxDistInBin]  ## keep contacts within maxDistInBin distance
    
    hic_mat = sparseMatrix(i = hic_mat$n1, j = hic_mat$n2, x=hic_mat$counts,
                           symmetric = T )
  }
  
  hic_mat = as.matrix(hic_mat)
  hic_mat[is.na(hic_mat)] = 0L
  hic_mat = round(hic_mat, 1)
  options(scipen = 10)
  
  
  # remove first k rows and columns if the first k by k submatrix is zero matrix
  k = 0
  for(i in 1:nrow(hic_mat)){
    if(any(hic_mat[i, ] != 0)) {
      k = i - 1
      break
    }
  }
  if(k > 0) {
    hic_mat = hic_mat[-(1:k), -(1:k)]
    #message(paste(">>>> remove first ", k, " bins whose counts are all zero"))
  }
  nn = nrow(hic_mat)
  
  ## do logtransformation
  if(logt){
    if(any(hic_mat < 0)) stop('Cannot do log-transformation on negative counts!')
    hic_mat = log2(hic_mat + 1)
  }
  
  if(min_d >= nn){
    min_d = floor(nn/2)
    max_d = floor(nn/4*3)
  }
  if(max_d > maxDistInBin) max_d = maxDistInBin
  if(max_d < min_d) max_d = min_d
  
  message(">>>> call TADs...")
  res = call_domain(hic_mat, max_d, min_d, max_dp, min_dp, hthr, maxDistInBin, t1thr)
  tads = res$tads
  if(is.null(tads)) {
    message(">>> No tads: probably because maxDistInBin, t1thr too big!")
    
    message(">>> first try: decrease maxDistInBin")
    res = call_domain(hic_mat, max_d, min_d, max_dp, min_dp, hthr,
                      floor(maxDistInBin*0.5), t1thr)
    tads = res$tads
    
    if(is.null(tads)){
      message(">>> second try: decrease t1thr")
      res = call_domain(hic_mat, max_d, min_d, max_dp, min_dp, hthr,
                        maxDistInBin, t1thr = t1thr/2)
      tads = res$tads
    }
    
    if(is.null(tads)){
      message(">>> third try: decrease maxDistInBin and t1thr")
      res = call_domain(hic_mat, max_d, min_d, max_dp, min_dp,  hthr,
                        floor(maxDistInBin*0.5), t1thr = t1thr/2)
      tads = res$tads
    }
    
    if(is.null(tads)){
      message(">>> Give up, No tads detected!")
      return(list("tads" = NULL, 'hierTads' = NULL))
    }
    
  }
  params = data.frame('score' = round(res$score, 1), 'd' = res$d, 'dp' = res$dp, 't1' = res$t1, 't2' = res$t2)
  
  #message(paste(c('score', 'd =', 'dp =', 't1 =', 't2 ='), unlist(params),  collapse = ", "))
  
  ## call hierarchical structure
  
  hierTads = list()
  if(dom_order > 1){
    ll = 1
    parenTads = tads
    if(is.null(tads)) return(list("tads" = NULL, 'hierTads' = NULL))
    
    
    hierTads[[ll]] = data.frame(tads)
    
    while(ll < dom_order){
      tempTads = list()
      ll = ll + 1
      for(i in 1:nrow(parenTads)){
        
        Sbin = parenTads[i, 1]
        Ebin = parenTads[i, 2]
        len = Ebin - Sbin + 1
        if(len <= min(50, floor(500*1000/resl))) next
        
        message(paste(">>>> call sub-domains for ", i, "th"," domain of order ", ll-1,
                      "...", sep = ""))
        message(paste("start from bin ", parenTads[i, 1], " to bin ", parenTads[i, 2], sep = ""))
        
        
        md = min(200*1000/resl, 10)
        Md = min(max_d, floor(len/3))
        Md = max(md, Md)
        
        mdp = 5
        Mdp = 10
        
        tmp0 <-  call_domain(hic_mat[Sbin:Ebin, Sbin:Ebin], Md, md, Mdp, mdp,
                             hthr = max(0.9, hthr), floor(2*len/3), t1thr = max(t1thr, 0.9))
        
        if(is.null(tmp0$tads)){
          message(paste('>>> no sub-TADs found!'))
          next
        }
        
        tempTads[[i]] = tmp0$tads + Sbin - 1
        params0 = c(tmp0$d, tmp0$dp, tmp0$t1, tmp0$t2)
        
        message(paste(c('d =', 'dp =', 't1 =', 't2 ='), params0,  collapse = ", "))
        
      }
      
      ## do not go to next step if no subTads are called from current level
      if(length(tempTads) == 0) break
      
      parenTads = data.frame(do.call("rbind", tempTads), row.names = NULL)
      
      hierTads[[ll]] = parenTads
      
    }
    rm(hic_mat)
  }
  
  
  
  tads = data.frame((tads+k) * resl - floor(resl/2))
  
  if(length(hierTads) == 0) {
    hierTads = tads
    hierTads$dom_order = 1
  }else{
    hierTads = lapply(hierTads, function(x) (x + k) * resl - floor(resl/2))
    
    dlens = sapply(hierTads, nrow)
    
    hierTads = do.call('rbind', hierTads)
    
    hierTads$dom_order = rep(1:length(dlens), dlens)
  }
  
  
  row.names(hierTads) = NULL
  return(list('tads' = data.table(tads), 'hierTads' = data.table(hierTads),
              'params' = data.table(params)))
  
}
rGMAP_singChr = cmpfun(rGMAP_singChr)



#' Detect hierarchical choromotin domains by GMAP
#' @param  hic_obj HiC contact matrix, generally either a 3-column data frame/data table or a tab/space delimited text file,
#'  corresponding to the i_th, j_th bin of a chromosome and the contact frequency
#' * For single chromosome, could also be a n by n matrix 
#' * For multiple chromosomes, an index_obj indicates genomic coordinate for each hic bin should be provided
#' @md
#' @param  index_obj A 4-columns tab/space delimited text file (or corresponding data frame/data table) indicates the genomic coordinates for each bin (compatible with HiC-Pro); with columns 
#' *bin_chr bin_start bin_end bin_id*, default NULL; when index _obj was given, it should be consitent with hic_obj
#' @param  resl The resolution (bin size), default 10kb
#' @param logt Do log-transformation or not, default TRUE
#' @param dom_order Maximum level of hierarchical structures, default 2 (call TADs and subTADs)
#' @param maxDistInBin Only consider contact whose distance is not greater than maxDistInBIn bins,
#' default 200 bins (or 2Mb)
#' @param  min_d The minimum d (d: window size), default 25
#' @param  max_d The maximum d (d: window size), default 100
#' @param min_dp The minmum dp (dp: lower bound of tad size), defalt 5
#' @param max_dp The maximum dp (dp: lower bound of tad size), defalt 10.
#'   min_d, max_d, min_dp and max_dp should be specified in number of bins
#' @param hthr The lower bound cutoff for posterior probability, default 0.95
#' @param t1thr Lower bound for t1 for calling TAD, default 0.5 quantile of test statistics
#'        of TADs, 0.9 of subTADs
#' @return A list includes following elements:
#' \item{tads}{A data frame with columns start, end indicates the start and end coordinates of each domain, respectively}
#' \item{hierTads}{A data frame with columns start, end, dom_order, where dom_order indicates the hierarchical status of a domain, 1 indicates tads, 2 indicates subtads, and so on}
#' \item{params}{A data frame gives the final parameters for calling TADs}
#' @rdname rGMAP
#' @export
#' @examples
#' ## On simulated data:
#' library(rGMAP)
#' simu_res = data_simu('poisson-dist-hier')
#' true_domains = simu_res$hierTads
#' simu_mat = simu_res$hic_mat
#' predicted_domains = rGMAP(simu_mat, resl = 1)$hierTads
#' true_domains
#' predicted_domains
#'
#' ## On an real data example
#'hic_rao_IMR90_chr15   # normalized Hi-C data for IMR90, chr15 with resolution 10kb
#'res = rGMAP(hic_rao_IMR90_chr15, NULL, resl = 10 * 1000, dom_order = 2)
#'names(res)
#' #quickly visualize some hierarchical domains
#' pp = plotdom(hic_rao_IMR90_chr15, NULL, res$hierTads, NULL, 6000, 7000, 30, 10)
#' pp$p2
rGMAP <- function(hic_obj, index_obj = NULL, resl = 10*10^3, logt = T, dom_order = 2,
                  maxDistInBin = min(200, 2*10^6/resl), min_d = 25, max_d = 100,
                  min_dp = 5, max_dp = 10, hthr = 0.95, t1thr = 0.5){

  if(class(hic_obj)[1] == 'character') {
     message('Read hic_obj...')
     hic_file = hic_obj
     hic_obj = fread(hic_file)
     if(ncol(hic_obj) != 3 & nrow(hic_obj) != ncol(hic_obj)) hic_obj = data.table(read.table(hic_file, header = F))
     if(ncol(hic_obj) != 3 & nrow(hic_obj) != ncol(hic_obj)) stop('Wrong dimension of data!')
     
  }
  if(is.null(index_obj)){
    message('Note that the input hic map should be just for a single chromosome, since no index_obj was provided.')
    output_list = rGMAP_singChr(hic_obj, resl, logt, dom_order, maxDistInBin, min_d , max_d ,
          min_dp, max_dp, hthr, t1thr)
  }
  
  
  if(!is.null(index_obj)){
    if(ncol(hic_obj) != 3) stop('The input hic_obj should be in 3-column data frame format!')
    if(!any(class(hic_obj) == 'data.table')) hic_obj = data.table(hic_obj)
    names(hic_obj) = c('n1', 'n2', 'counts')
    
    if(class(index_obj)[1] == 'character'){
      index = fread(index_obj, select = 1:4)
    }else{
      index = data.table(index_obj)
    }
    names(index) = c('chr', 'start', 'end', 'id')
    
    
    if(any(!hic_obj$n1 %in% index$id) || any(!hic_obj$n2 %in% index$id)){
      stop('The hic_obj was not consistent with the index file: 
           all bin ids in the hic_obj should be included in the index file!')
    }
    index = index[chr != 'chrM']
    chrs = unique(index$chr)
    tads = list()
    hierTads = list()
    params = list()
    
    for(chr0 in chrs){
      # extract hic_obj for chr0
      message(paste('Working on chromosome', chr0, '...'))
      index0 = index[chr == chr0]
      hic_obj0 = hic_obj[n1 %in% index0$id & n2 %in% index0$id]
      id0 = min(index0$id)
      hic_obj0[, 'n1' := n1 - id0 + 1]
      hic_obj0[, 'n2' := n2 - id0 + 1]
      if(nrow(hic_obj0) <= 20) next
      res = rGMAP_singChr(hic_obj0, resl, logt, dom_order, maxDistInBin, min_d , max_d ,
                  min_dp, max_dp, hthr, t1thr)
      if(!is.null(res$tads)) res$tads$chr = chr0
      if(!is.null(res$hierTads)) res$hierTads$chr = chr0
      if(!is.null(res$params)) res$params$chr = chr0
      
      tads[[chr0]] = res$tads
      hierTads[[chr0]] = res$hierTads
      params[[chr0]] = res$params
      
      message(paste(chr0, 'done!'))
    }
    
    tads = do.call('rbind', tads)
    hierTads = do.call('rbind', hierTads)
    params = do.call('rbind', params)
    
    if(!is.null(tads)) setcolorder(tads, c('chr', 'start', 'end'))
    if(!is.null(hierTads)) setcolorder(hierTads, c('chr', 'start', 'end', 'dom_order'))
    if(!is.null(params)) setcolorder(params, c('chr', 'score', 'd', 'dp', 't1', 't2'))
    
    output_list = list('tads' = tads, 'hierTads' = hierTads, 'params' = params)
  }
  
  
  
  message('All done!')
  return(output_list)
}

rGMAP = cmpfun(rGMAP)





#' generate simulated hic_mat and true tads
#' @param stype One of four types of simulated data in the manuscript:
#' poission-dist, poission-dist-hier, nb-dist, nb-dist-hier;
#' poission- or nb- indicates poission distribution or negative bionomial distribution
#' -hier indicated subtads are generated nestly
#' @param nratio The effect size between intra- and inter domain, larger means higher intra-tad contacts
#' @param mu0 The mean parameter, default 200
#' @param resl Resolution, default set to 1
#' @return A list includes following elements:
#' \item{hic_mat}{n by n contact matrix}
#' \item{hierTads}{True heirarchical domains}
#' \item{tads_true}{True TADs}
#' @rdname data_simu
#' @export
data_simu <- function(stype = 'poisson-dist', nratio = 2.5, mu0 = 200, resl = 1){

  if(stype == 'poisson-dist'){
    ## TADs with gap and Hier
    tbins = 1000  ## total bins

    bounds = c(1, 120, 160, 215, 355, 440, 530, 705, 765, 850, 950, 1000)
    bounds = sort(unique(bounds))
    #mu0 = 5
    ntad = length(bounds) - 1


    n = max(bounds)
    hic_mat  = matrix(0, n, n)
    hic_mat = as.data.table(as.data.frame(hic_mat))

    generateSparse_dat <- function(n1, mu1){
      df = data.table('id1' = rep(1:(n1), (n1):1),
                      'id2' = do.call('c', sapply(1:(n1), function(x) (x):n1)))
      df[, 'dist' := (id2 - id1)]
      df[, dist := ifelse(dist == 0, 0.9, dist)]
      df[, 'N' := rpois(1, mu1  * dist^(-1)), by = list(id1, id2)]

      return(df)
    }

    df = generateSparse_dat(n, mu0)

    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id1[k]),
                             j = as.integer(df$id2[k]), value = df$N[k])
    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id2[k]),
                             j = as.integer(df$id1[k]), value = df$N[k])

    hic_mat = as.matrix(hic_mat)


    modify_dat <- function(hic_mat, bounds, skip_ind = 0, mu = 5){
      ntad = length(bounds) - 1
      sizes = diff(bounds) + 1
      for(i in 1:ntad){
        if(i %in% skip_ind) next
        tn = sizes[i]
        #t_mu = mu * (1 + min(sizes)/tn)
        t_mu = mu
        df = generateSparse_dat(tn, t_mu)
        thic_mat  = matrix(0, tn, tn)
        thic_mat = as.data.table(as.data.frame(thic_mat))
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id1[k]),
                                 j = as.integer(df$id2[k]), value = df$N[k])
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id2[k]),
                                 j = as.integer(df$id1[k]), value = df$N[k])
        thic_mat = as.matrix(thic_mat)
        start = bounds[i]
        end = bounds[i+1]
        hic_mat[start:end, start:end] = thic_mat
      }
      return(hic_mat)

    }

    ## add TADs
    hic_mat = modify_dat(hic_mat, bounds, skip_ind = c(5, 7), mu0 * nratio)


    tads_true <- data.frame('start' = bounds[1:ntad], 'end' = bounds[2:(ntad + 1)])
    tads_true <- tads_true[-c(5, 7), ] * resl ## a gap

  }

  if(stype == 'poisson-dist-hier'){
    ## TADs with gap and Hier
    tbins = 1000  ## total bins

    bounds = c(1, 120, 160, 215, 355, 440, 530, 705, 765, 850, 950, 1000)
    bounds = sort(unique(bounds))
    #mu0 = 5
    ntad = length(bounds) - 1


    n = max(bounds)
    hic_mat  = matrix(0, n, n)
    hic_mat = as.data.table(as.data.frame(hic_mat))

    generateSparse_dat <- function(n1, mu1){
      df = data.table('id1' = rep(1:(n1), (n1):1),
                      'id2' = do.call('c', sapply(1:(n1), function(x) (x):n1)))
      df[, 'dist' := (id2 - id1)]
      df[, dist := ifelse(dist == 0, 0.9, dist)]
      df[, 'N' := rpois(1, mu1  * dist^(-1)), by = list(id1, id2)]

      return(df)
    }

    df = generateSparse_dat(n, mu0)

    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id1[k]),
                             j = as.integer(df$id2[k]), value = df$N[k])
    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id2[k]),
                             j = as.integer(df$id1[k]), value = df$N[k])

    hic_mat = as.matrix(hic_mat)


    modify_dat <- function(hic_mat, bounds, skip_ind = 0, mu = 5){
      ntad = length(bounds) - 1
      sizes = diff(bounds) + 1
      for(i in 1:ntad){
        if(i == skip_ind) next
        tn = sizes[i]
        #t_mu = mu * (1 + min(sizes)/tn)
        t_mu = mu
        df = generateSparse_dat(tn, t_mu)
        thic_mat  = matrix(0, tn, tn)
        thic_mat = as.data.table(as.data.frame(thic_mat))
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id1[k]),
                                 j = as.integer(df$id2[k]), value = df$N[k])
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id2[k]),
                                 j = as.integer(df$id1[k]), value = df$N[k])
        thic_mat = as.matrix(thic_mat)
        start = bounds[i]
        end = bounds[i+1]
        hic_mat[start:end, start:end] = thic_mat
      }
      return(hic_mat)

    }

    ## add TADs
    hic_mat = modify_dat(hic_mat, bounds, skip_ind = 5, mu0 * nratio)



    # generate subTADs
    start = 530
    end = 705

    sbounds = c(start, start + 50, start + 130, end)

    hic_mat = modify_dat(hic_mat, sbounds, skip_ind = 0, mu = mu0 * nratio * nratio)


    tads_true <- data.frame('start' = bounds[1:ntad],
                            'end' = bounds[2:(ntad + 1)]) * resl
    tads_true = tads_true[-5, ]  ## a gap

    sub_tads <- data.frame('start' = sbounds[1:3],
                           'end' = sbounds[2:4]) * resl

    hierTads = rbind(tads_true, sub_tads)
    hierTads$dom_order = c(rep(1, nrow(tads_true)), rep(2, nrow(sub_tads)))

  }

  if(stype == 'poisson-dist-hier2'){
    ## TADs with gap and Hier
    tbins = 1000  ## total bins

    bounds = c(1, 120, 160, 215, 355, 440, 530, 705, 765, 850, 950, 1000)
    bounds = sort(unique(bounds))
    #mu0 = 5
    ntad = length(bounds) - 1


    n = max(bounds)
    hic_mat  = matrix(0, n, n)
    hic_mat = as.data.table(as.data.frame(hic_mat))

    generateSparse_dat <- function(n1, mu1){
      df = data.table('id1' = rep(1:(n1), (n1):1),
                      'id2' = do.call('c', sapply(1:(n1), function(x) (x):n1)))
      df[, 'dist' := (id2 - id1)]
      df[, dist := ifelse(dist == 0, 0.9, dist)]
      df[, 'N' := rpois(1, mu1  * dist^(-1)), by = list(id1, id2)]

      return(df)
    }

    df = generateSparse_dat(n, mu0)

    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id1[k]),
                             j = as.integer(df$id2[k]), value = df$N[k])
    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id2[k]),
                             j = as.integer(df$id1[k]), value = df$N[k])

    hic_mat = as.matrix(hic_mat)


    modify_dat <- function(hic_mat, bounds, skip_ind = 0, mu = 5){
      ntad = length(bounds) - 1
      sizes = diff(bounds) + 1
      for(i in 1:ntad){
        if(i == skip_ind) next
        tn = sizes[i]
        #t_mu = mu * (1 + min(sizes)/tn)
        t_mu = mu
        df = generateSparse_dat(tn, t_mu)
        thic_mat  = matrix(0, tn, tn)
        thic_mat = as.data.table(as.data.frame(thic_mat))
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id1[k]),
                                 j = as.integer(df$id2[k]), value = df$N[k])
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id2[k]),
                                 j = as.integer(df$id1[k]), value = df$N[k])
        thic_mat = as.matrix(thic_mat)
        start = bounds[i]
        end = bounds[i+1]
        hic_mat[start:end, start:end] = thic_mat
      }
      return(hic_mat)

    }

    ## add TADs
    hic_mat = modify_dat(hic_mat, bounds, skip_ind = 5, mu0 * nratio)



    # generate subTADs
    start = 530
    end = 705

    sbounds = c(start, start + 25, end)

    hic_mat = modify_dat(hic_mat, sbounds, skip_ind = 0, mu = mu0 * nratio * nratio)


    tads_true <- data.frame('start' = bounds[1:ntad],
                            'end' = bounds[2:(ntad + 1)]) * resl
    tads_true = tads_true[-5, ]  ## a gap

    sub_tads <- data.frame('start' = sbounds[1:2],
                           'end' = sbounds[2:3]) * resl

    hierTads = rbind(tads_true, sub_tads)
    hierTads$dom_order = c(rep(1, nrow(tads_true)), rep(2, nrow(sub_tads)))

  }


  if(stype == 'nb-dist-hier'){
    ## TADs with gap and Hier
    tbins = 1000  ## total bins

    bounds = c(1, 120, 160, 215, 355, 440, 530, 705, 765, 850, 950, 1000)
    bounds = sort(unique(bounds))
    #mu0 = 5
    ntad = length(bounds) - 1


    n = max(bounds)
    hic_mat  = matrix(0, n, n)
    hic_mat = as.data.table(as.data.frame(hic_mat))

    phi0 = 4
    generateSparse_dat <- function(n1, mu1){
      df = data.table('id1' = rep(1:(n1), (n1):1),
                      'id2' = do.call('c', sapply(1:(n1), function(x) (x):n1)))
      df[, 'dist' := (id2 - id1)]
      df[, dist := ifelse(dist == 0, 0.9, dist)]
      df[, 'N' := rnbinom(1, mu = mu1  * dist^(-1), size = phi0 * mu1  * dist^(-1)), by = list(id1, id2)]

      return(df)
    }

    df = generateSparse_dat(n, mu0)

    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id1[k]),
                             j = as.integer(df$id2[k]), value = df$N[k])
    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id2[k]),
                             j = as.integer(df$id1[k]), value = df$N[k])

    hic_mat = as.matrix(hic_mat)


    modify_dat <- function(hic_mat, bounds, skip_ind = 0, mu = 5){
      ntad = length(bounds) - 1
      sizes = diff(bounds) + 1
      for(i in 1:ntad){
        if(i == skip_ind) next
        tn = sizes[i]
        #t_mu = mu * (1 + min(sizes)/tn)
        t_mu = mu
        df = generateSparse_dat(tn, t_mu)
        thic_mat  = matrix(0, tn, tn)
        thic_mat = as.data.table(as.data.frame(thic_mat))
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id1[k]),
                                 j = as.integer(df$id2[k]), value = df$N[k])
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id2[k]),
                                 j = as.integer(df$id1[k]), value = df$N[k])
        thic_mat = as.matrix(thic_mat)
        start = bounds[i]
        end = bounds[i+1]
        hic_mat[start:end, start:end] = thic_mat
      }
      return(hic_mat)

    }

    ## add TADs
    hic_mat = modify_dat(hic_mat, bounds, skip_ind = 5, mu0 * nratio)



    # generate subTADs
    start = 530
    end = 705

    sbounds = c(start, start + 50, start + 130, end)

    hic_mat = modify_dat(hic_mat, sbounds, skip_ind = 0, mu = mu0 * nratio * nratio)


    tads_true <- data.frame('start' = bounds[1:ntad],
                            'end' = bounds[2:(ntad + 1)]) * resl
    tads_true <- tads_true[-5, ]  ## a gap

    sub_tads <- data.frame('start' = sbounds[1:3],
                           'end' = sbounds[2:4]) * resl

    hierTads <- rbind(tads_true, sub_tads)
    hierTads$dom_order = c(rep(1, nrow(tads_true)), rep(2, nrow(sub_tads)))

  }

  if(stype == 'nb-dist'){
    ## TADs with gap and Hier
    tbins = 1000  ## total bins

    bounds = c(1, 120, 160, 215, 355, 440, 530, 705, 765, 850, 950, 1000)
    bounds = sort(unique(bounds))
    #mu0 = 5
    ntad = length(bounds) - 1


    n = max(bounds)
    hic_mat  = matrix(0, n, n)
    hic_mat = as.data.table(as.data.frame(hic_mat))

    phi0 = 4
    generateSparse_dat <- function(n1, mu1){
      df = data.table('id1' = rep(1:(n1), (n1):1),
                      'id2' = do.call('c', sapply(1:(n1), function(x) (x):n1)))
      df[, 'dist' := (id2 - id1)]
      df[, dist := ifelse(dist == 0, 0.9, dist)]
      df[, 'N' := rnbinom(1, mu = mu1  * dist^(-1), size = phi0 * mu1  * dist^(-1)), by = list(id1, id2)]

      return(df)
    }

    df = generateSparse_dat(n, mu0)

    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id1[k]),
                             j = as.integer(df$id2[k]), value = df$N[k])
    for(k in 1:nrow(df)) set(hic_mat, i = as.integer(df$id2[k]),
                             j = as.integer(df$id1[k]), value = df$N[k])

    hic_mat = as.matrix(hic_mat)


    modify_dat <- function(hic_mat, bounds, skip_ind = 0, mu = 5){
      ntad = length(bounds) - 1
      sizes = diff(bounds) + 1
      for(i in 1:ntad){
        if(i %in% skip_ind) next
        tn = sizes[i]
        #t_mu = mu * (1 + min(sizes)/tn)
        t_mu = mu
        df = generateSparse_dat(tn, t_mu)
        thic_mat  = matrix(0, tn, tn)
        thic_mat = as.data.table(as.data.frame(thic_mat))
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id1[k]),
                                 j = as.integer(df$id2[k]), value = df$N[k])
        for(k in 1:nrow(df)) set(thic_mat, i = as.integer(df$id2[k]),
                                 j = as.integer(df$id1[k]), value = df$N[k])
        thic_mat = as.matrix(thic_mat)
        start = bounds[i]
        end = bounds[i+1]
        hic_mat[start:end, start:end] = thic_mat
      }
      return(hic_mat)

    }

    ## add TADs
    hic_mat = modify_dat(hic_mat, bounds, skip_ind = c(5, 7), mu0 * nratio)


    tads_true <- data.frame('start' = bounds[1:ntad],
                            'end' = bounds[2:(ntad + 1)]) * resl
    tads_true <- tads_true[-c(5, 7), ]  ## a gap


  }

  if(!grepl(stype, pattern = 'hier'))  {
    hierTads = tads_true
    hierTads$dom_order = 1
  }

  row.names(tads_true) = row.names(hierTads) = NULL
  return(list("hic_mat" = hic_mat, "tads_true" = tads_true, 'hierTads' = hierTads))
}
data_simu = cmpfun(data_simu)



## transfer tad to be in triangle shape
triangle_tad <- function(tads, resl = 1){
  pos <- within(tads, {
    y <- end/resl
    x <- start/resl
    rm(start, end)
  })

  # to plot a tad in a triangle shape
  triangle_tran <- function(x){
    yy = matrix(0, 3, 2)
    yy[1, ] = c(x[1], x[1])
    yy[2, ] = c(x[1], x[2])
    yy[3, ] = c(x[2], x[2])
    return(yy)
  }

  xy = NULL
  pos = as.matrix(pos)
  for(i in 1:nrow(pos)){
    xy = rbind(xy, triangle_tran(pos[i, ]))
  }
  return(xy)
}


## transferd triangle TADs for better visualization (horizontally arranged)
transf_tad_horz <- function(triangle_tad){
  xy_tad = as.matrix(triangle_tad)
  transf_mat = matrix(c(1, -1, 1, 1), 2, 2)/2
  #transf_mat = matrix(c(1, -1, 1, 1), 2, 2)/sqrt(2)
  xy_new =  transf_mat %*% t(xy_tad)
  return(t(xy_new))
}

## reformat n by n hic_mat into (x, y, count) format
refm_hic <- function(hic_mat){
  n = nrow(hic_mat)
  dat = NULL
  tfun <- function(x, mat){
    lx = n - x + 1
    xx = rep(x, lx)
    tt = cbind(xx, x:n, mat[x, x:n])
  }
  temp = lapply(1:n, tfun, hic_mat)
  refm_mat = do.call('rbind', temp)
  colnames(refm_mat) = c('x', 'y', 'count')
  return(refm_mat)
}
refm_hic = cmpfun(refm_hic)


#' visualize hierarchical domains
#' @param  hic_obj the hic_obj in the same format as used for calling TADs
#' @param index_obj the index_obj used for calling TADs
#' @param  hiertads_gmap the hierarchical domains outputted by GMAP
#' @param chr0 the selected single chromosome ('chr1' for example)
#' @param start_bin the start bin of the selected chromosome
#' @param end_bin the end bin of the selected chromosome
#' @param cthr the upper bound count threshold for color, default 20
#' @param resl reslution of Hi-C data, default 10000, if resl=1, it works on simulated data
#' @rdname plotdom
#' @export
plotdom <- function(hic_obj, index_obj = NULL, hiertads_gmap, chr0 = NULL, start_bin, end_bin, cthr = 20, resl = 10000){

  if(class(hic_obj)[1] == 'character') {
    message('Read hic_obj...')
    hic_file = hic_obj
    hic_obj = fread(hic_file)
    if(ncol(hic_obj) != 3 & nrow(hic_obj) != ncol(hic_obj)) hic_obj = data.table(read.table(hic_file, header = F))
    if(ncol(hic_obj) != 3 & nrow(hic_obj) != ncol(hic_obj)) stop('Wrong dimension of data!')
    
  }
  
  # transfer the n by n matrix to 3-column data table
  if(dim(hic_obj)[1] == dim(hic_obj)[2]) hic_obj = data.table(refm_hic(hic_obj))
  
  if(!any(class(hic_obj) == 'data.table')) hic_obj = data.table(hic_obj)
  
  if (ncol(hiertads_gmap) == 4) #if TADs called from multiple chromosomes, assuming the first col should be chr, cut it.
  {
    if(is.null(chr0)) stop('should provided a selected chromosome')
    hiertads_gmap = hiertads_gmap[chr == chr0]
    hiertads_gmap = hiertads_gmap[, -1]
  }
  
  if(!is.null(index_obj) & class(index_obj)[1] == 'character'){
    index_obj = fread(index_obj, select = 1:4)
    names(index_obj) = c('chr', 'start', 'end', 'id')
  }
  
  names(hiertads_gmap) = c('start', 'end', 'dom_order')
  names(hic_obj) = c('n1', 'n2', 'count')
  
  
  if(!is.null(index_obj)){
    index_obj = index_obj[chr == chr0]
    id0 = min(index_obj$id)
    hic_obj = hic_obj[n1 %in% index_obj$id & n2 %in% index_obj$id]
    hic_obj[, 'n1' := n1 - id0 + 1]
    hic_obj[, 'n2' := n2 - id0 + 1]
  }
  
  tads_gmap = subset(hiertads_gmap, dom_order == 1, select = -dom_order)
  tadsL2 = subset(hiertads_gmap, dom_order == 2, select = -dom_order)
  tadsL3 = subset(hiertads_gmap, dom_order == 3, select = -dom_order)

  ## plot tads_gmap (select region)
  xy_gmap = triangle_tad(tads_gmap, resl = resl)
  if(nrow(tadsL2) > 0 )xy_tadsL2 = triangle_tad(tadsL2, resl = resl)
  if(nrow(tadsL3) > 0 ) xy_tadsL3 = triangle_tad(tadsL3, resl = resl)


  ## plot it horizontally
  hxy_gmap = transf_tad_horz(xy_gmap)
  if(nrow(tadsL2) > 0 ) hxy_tadsL2 = transf_tad_horz(xy_tadsL2)
  if(nrow(tadsL3) > 0 )hxy_tadsL3 = transf_tad_horz(xy_tadsL3)


  dat = data.frame(hic_obj)
  pdat = subset(dat, n1 <= end_bin & n1 >= start_bin & n2 <= end_bin & n2 >= start_bin)
  pdat$count = ifelse(pdat$count > cthr, cthr, pdat$count)

  orgPlot <- ggplot(data = pdat, aes(n1, n2)) + geom_point(aes(colour = count)) +
    scale_colour_gradient(high='red', low = 'white') + xlim(min(pdat$n1), max(pdat$n1))

  #orgPlot


  ## plot it in horizontal triangle
  hdat = pdat[pdat$n1 <= pdat$n2, ]
  hdat[, 1:2] = transf_tad_horz(as.matrix(hdat[, 1:2]))

  hdat = hdat[hdat$n2 <= 100, ]

  cols = colorRampPalette(c("white", "red"))(2)


  ss = ifelse(resl == 1, 1, 10^6/resl)  ## plot in scale of mb
  xlab0 = ifelse(resl == 1, 'bin', 'Mb')
  #hdat$n1 = (hdat$n1 - as.numeric(as.vector(hic_obj[1, 1])) + 1)/ss
  hdat$n1 = (hdat$n1)/ss
  
  
  orgPlot <- ggplot(data = hdat, aes(n1, n2)) + geom_point(aes(colour = count)) +
    scale_colour_gradient(high = 'red', low = 'white') + xlim(min(hdat$n1), max(hdat$n1)) +
    xlab(xlab0) + ylab("") + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(),
                                   legend.position = 'none') + theme(
                                     plot.background = element_blank()
                                     ,panel.grid.major = element_blank()
                                     ,panel.grid.minor = element_blank()
                                     ,panel.border = element_blank())

  #orgPlot



  orgWTad_horz <- function(xy, m = start_bin
                           , M = end_bin ){
    ind1 = which(xy[, 1]>= m & xy[, 1]<= M)
    if(length(ind1) <= 1) return(NULL)
    xy = xy[ind1, ]
    df = data.frame('xs' = xy[1:(nrow(xy)-1), 1]/ss,
                    'ys' = xy[1:(nrow(xy)-1), 2],
                    "xe" = xy[2:nrow(xy), 1]/ss,
                    "ye" = xy[2:nrow(xy), 2])
    return(df)
  }

  p1 = p2 = p3 = orgPlot

  if(!is.null(orgWTad_horz(hxy_gmap))) {
    tdat = orgWTad_horz(hxy_gmap)
    tdat = tdat[order(tdat$xs), ]
    tdat = tdat[tdat$xs <= tdat$xe,]
    p1 <- p1 + geom_segment(aes(x = xs, y = ys, xend = xe, yend = ye), size = 0.7,
                            data = tdat, color = 'black')
    p2 = p1
  }

  if(nrow(tadsL2) > 0 ){
    if(!is.null(orgWTad_horz(hxy_tadsL2))){
      p2 = p2 + geom_segment(aes(x = xs, y = ys, xend = xe, yend = ye), size = 0.7,
                             data = orgWTad_horz(hxy_tadsL2), color = 'black')
      p3 = p2
    }
  }


  if(nrow(tadsL3) > 0 ){
    if(!is.null(orgWTad_horz(hxy_tadsL3))) {
      p3 = p2 + geom_segment(aes(x = xs, y = ys, xend = xe, yend = ye), size = 0.7,
                             data = orgWTad_horz(hxy_tadsL3), color = 'black')
    }
  }


  return(list('p1' = p1, 'p2' = p2, 'p3' = p3))
}
plotdom = cmpfun(plotdom)
wbaopaul/rGMAP documentation built on Nov. 18, 2020, 9:37 p.m.