R/scan.R

Defines functions scan

Documented in scan

#' Scan to identify structurally variable regions (SVRs)
#'
#' This function detects structurally variable regions in nucleotide resolution and
#' controls family-wise error rate.
#'
#' @param r Initialized reactivity from \code{\link{init}} or normalized reactivity
#' from \code{\link{normalize}}.
#' @param seed Random seed for reproducible results.
#' @param alpha Significance level.
#' @param N Number of replications in Monte Carlo sampling.
#' @param ncores Number of cores to use for parallel computation. Automatically determined
#' by default.
#'
#' @return A list with each element corresponding to a transcript. \code{svr} is
#' SVRs with significance score \code{P}. \code{scan} includes the start and stop positions
#' of all scanned segments.
#' @export
#'
scan <- function(r, seed, alpha = 0.05, N = 1e3, ncores = "auto") {
  dir_pkg = find.package('DiffScan')
  if(!('Qmax_null.rds' %in% list.files(paste0(dir_pkg, '/R/')))) {
    message('First time to run scan. Downloading precalculated parameters ...')
    download.file(
      url = "https://onedrive.live.com/download?resid=3D9C2C35D99F543F%2134216&authkey=!AH2XMY2d6FUe0mo&download=1",
      destfile = paste0(dir_pkg, '/R/Qmax_null.rds'),
      mode = 'wb'
    )
  }
  if(!('Qmax_null' %in% objects(envir = .GlobalEnv))) {
    Qmax_null <<- readRDS(paste0(dir_pkg, '/R/Qmax_null.rds'))
  }
  if(.Platform$OS.type == 'windows') ncores <- 1
  else {
    if (ncores == "auto") ncores <- parallel::detectCores() - 1
  }
  RNGkind("L'Ecuyer-CMRG")
  set.seed(seed)
  if(.Platform$OS.type != 'windows') parallel::mc.reset.stream()
  nseg <- 100
  radius <- 2
  gamma <- 0.5
  scan_ctrl <- list(
    radius = radius, N = N,
    Q = function(pv) sum(-2 * log(pv)) / (length(pv))^gamma,
    region_range = dplyr::tibble(length = 1:100, Lmin = 1, Lmax = pmin(length, 20))
  ) # fixed, nseg and region_range are concordant with pre-calculated Qmax_null

  message("Scan ====================")
  main <- function(r, radius, N, Q, region_range, Qmax_null, alpha, ncores) {
    message("sampling null ...")
    nseg_num <- r %>>% rlist::list.mapv(sapply(., nrow)) %>>% table() %>>% {
      dplyr::tibble(nseg = as.integer(names(.)), num = .)
    }
    QGlobalMax_null <- parallel::mcmapply(
      FUN = function(k) {
        max(apply(nseg_num, 1, function(z) {
          max(sample(x = Qmax_null[[paste0("nseg=", z[1])]], size = z[2]))
        }))
      },
      k = 1:N, mc.cores = ncores
    )
    Qthr <- ifelse(alpha >= 1, -Inf, quantile(QGlobalMax_null, 1 - alpha))

    message("calculate Q and P ...")
    scan_seg_r <- function(r) {
      scan_seg <- function(obs, Q, Lmin, Lmax) {
        n <- length(obs)
        regions <- lapply(1:(n - Lmax + 1), function(k) {
          cbind(Start = k, Stop = (k + Lmin - 1):(k + Lmax - 1))
        }) %>>% rlist::list.rbind()
        if (Lmax > Lmin) {
          regions <- rbind(
            regions,
            lapply((n - Lmax + 2):(n - Lmin + 1), function(k) {
              cbind(Start = k, Stop = (k + Lmin - 1):n)
            }) %>>% rlist::list.rbind()
          )
        }
        Q <- apply(regions, 1, function(z) Q(obs[z[1]:z[2]]))
        keep_id <- which(Q >= Qthr)
        if (length(keep_id) == 0) {
          return(NULL)
        }
        P <- sapply(Q[keep_id], function(z) mean(QGlobalMax_null >= z))
        cbind(
          if (length(keep_id) > 1) {
            regions[keep_id, ]
          } else {
            t(regions[keep_id, ])
          },
          Q = Q[keep_id],
          P = P
        )
      }

      rA <- r %>>% dplyr::select(dplyr::starts_with("A"))
      rB <- r %>>% dplyr::select(dplyr::starts_with("B"))
      if (radius == 0 & ncol(rA) == 1 & ncol(rB) == 1) {
        message("set radius > 0 when single replicate is supplied")
        stop()
      }
      len <- nrow(r)
      P <- sapply(1:len, function(j) {
        Cj <- max(1, j - radius):min(len, j + radius)
        Pj <- tryCatch(
          stats::wilcox.test(
            x = unlist(rA[Cj, ]), y = unlist(rB[Cj, ]), paired = F
          )$p.value,
          warning = function(w) {
            return("Fail")
          }
        )
        if (!identical("Fail", Pj)) {
          return(Pj)
        }
        coin_test <- tryCatch(
          coin::wilcox_test(
            formula = x ~ group,
            data = dplyr::tibble(
              x = c(unlist(rA[Cj, ]), unlist(rB[Cj, ])),
              group = factor(c(
                rep("a", length(Cj) * ncol(rA)),
                rep("b", length(Cj) * ncol(rB))
              ))
            ),
            distribution = "exact"
          ),
          warning = function(w) {
            return("Error")
          }
        )
        if (identical("Error", coin_test)) {
          return(1)
        } # coin test fails for all same values
        coin::pvalue(coin_test)
      })
      scan_seg(
        obs = P,
        Q = Q,
        Lmin = region_range$Lmin[region_range$length == len],
        Lmax = region_range$Lmax[region_range$length == len]
      )
    }
    pbmcapply::pbmcmapply(
      FUN = function(r_rna_cut) {
        r_rna_cut %>>% rlist::list.map(scan_seg_r(.))
      },
      r_rna_cut = r,
      SIMPLIFY = F,
      mc.cores = ncores
    )
  }
  res <- main(
    r = r,
    radius = scan_ctrl$radius,
    N = scan_ctrl$N,
    Q = scan_ctrl$Q,
    region_range = scan_ctrl$region_range,
    Qmax_null = Qmax_null,
    alpha = alpha,
    ncores = ncores
  )

  message('\t')
  message("Scan completed, pick regions ====================")
  pick <- function(res, ncores) {
    pick_seg <- function(cands) {
      if (identical(NULL, cands)) {
        return(
          dplyr::tibble(
            Start = integer(0), Stop = integer(0),
            Q = integer(0), P = integer(0)
          )
        )
      }
      cands <- cands %>>% dplyr::as_tibble() %>>% dplyr::arrange(dplyr::desc(Q))
      picks <- NULL
      while (nrow(cands) > 0) {
        new <- cands[1, ]
        picks <- rbind(picks, new)
        cands <- cands %>>% dplyr::filter(Stop < new$Start | Start > new$Stop)
      }
      picks
    }
    cuts_to_scanned_sites <- function(cuts) {
      raw_scanned_sites <- cuts %>>% rlist::list.map(
        as.integer(strsplit(., "_to_")[[1]])
      ) %>>% rlist::list.rbind()
      if (length(cuts) == 1) {
        return(raw_scanned_sites)
      }

      scanned_sites <- matrix(raw_scanned_sites[1, ], nrow = 1)
      for (k in 2:length(cuts)) {
        new <- raw_scanned_sites[k, ]
        if (new[1] == scanned_sites[1, 2] + 1) {
          scanned_sites[1, 2] <- new[2]
        } else {
          scanned_sites <- rbind(new, scanned_sites)
        }
      }
      rownames(scanned_sites) <- NULL
      colnames(scanned_sites) <- c("Start", "Stop")
      return(scanned_sites)
    }
    pick_rna <- function(res_rna) {
      scan <- cuts_to_scanned_sites(names(res_rna))
      svr <- res_rna %>>% rlist::list.map({
        abs_start <- strsplit(.name, "_to_")[[1]][1] %>>% as.integer()
        pick_seg(.) %>>% dplyr::mutate(
          Start = Start + abs_start - 1,
          Stop = Stop + abs_start - 1
        )
      }) %>>% rlist::list.rbind() %>>% dplyr::arrange(dplyr::desc(Q))
      list(scan = scan, SVR = svr)
    }

    parallel::mcmapply(
      FUN = function(res_rna) pick_rna(res_rna = res_rna),
      res_rna = res,
      SIMPLIFY = F,
      mc.cores = ncores
    )
  }
  pick(res = res, ncores = ncores)
}
yub18/DESSERT documentation built on Nov. 5, 2024, 2:36 a.m.