R/InternalFunc.R

Defines functions RandomName WhichCells SetIfNull PSICalculate fun1 ASSJFilter ASSJFind multiMerge ReadSTARSJ

Documented in ASSJFilter ASSJFind fun1 multiMerge PSICalculate ReadSTARSJ SetIfNull WhichCells

#' These functions are internal function and will not be exported
#'
#' @rdname InternalFunc
#' @name ReadSTARSJ
#'
#' @importFrom data.table fread
#' @importFrom data.table :=
#' @importFrom data.table setnames
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges

ReadSTARSJ <- function(file, uniqMapOnly = TRUE, minSJRead = 1, annotationOnly = TRUE, minOverhang = 3) {
  Tab <- data.table::fread(file, stringsAsFactors = FALSE)
  if(ncol(Tab) != 9) {
    stop("your file may be not STAR output SJ.out.tab file")
  }
  colnames(Tab) <- c("chr", "start", "end", "strand", "intron_motif", "annotation", "um_reads", "mm_reads", "overhang")

  if(length(minOverhang) != 1) {
    stop("minOverhang must be a positive number")
  }

  if(!is.numeric(minOverhang)) {
    stop("minOverhang must be a positive number")
  }

  if(length(minSJRead) != 1) {
    stop("minSJRead must be a positive number")
  }

  if(!is.numeric(minSJRead)) {
    stop("minSJRead must be a positive number")
  }

  if(!is.logical(uniqMapOnly)) {
    stop("uniqMapOnly must be a logical value")
  }

  if(uniqMapOnly) {
    data.table::setnames(Tab, "um_reads", "reads")
    Tab[, mm_reads := NULL]
  } else {
    Tab[, um_reads := um_reads + mm_reads]
    data.table::setnames(Tab, "um_reads", "reads")
    Tab[, mm_reads := NULL]
  }

  Tab <- subset(Tab, reads >= minSJRead)
  Tab <- subset(Tab, overhang >= minOverhang)
  Tab <- subset(Tab, strand != 0)

  if(annotationOnly) {
    Tab <- subset(Tab, annotation == 1)
  }

  Tab[, strand := ifelse(strand == 0, "*", ifelse(strand == 1, "+", "-"))]
  Tab[, intron_motif := ifelse(intron_motif == 0, "*", ifelse(intron_motif == 1 | intron_motif == 2, "GT-AG", ifelse(intron_motif == 3 | intron_motif == 4, "GC-AG", "AT-AC")))]

  gr <- GenomicRanges::GRanges(seqnames = Tab$chr,
                               ranges = IRanges::IRanges(start = Tab$start, end = Tab$end),
                               strand = Tab$strand,
                               intron_motif = Tab$intron_motif,
                               annotation = Tab$annotation,
                               reads = Tab$reads,
                               overhang = Tab$overhang)
  return(gr)
}


#' @rdname InternalFunc
#' @name multiMerge
#' @importFrom data.table fread
#' @importFrom data.table :=
#' @importFrom data.table setkey
#' @importClassesFrom GenomicRanges GRanges

multiMerge <- function(SJFiles, uniqMapOnly = TRUE, minSJRead = 1, minSJRowSum = 10, minSJSample = 2, annotationOnly = TRUE, minOverhang = 3, postfix = "SJ.out.tab") {
  stopifnot(all(file.exists(SJFiles)))

  if(length(minOverhang) != 1) {
    stop("minOverhang must be a positive number")
  }

  if(!is.numeric(minOverhang)) {
    stop("minOverhang must be a positive number")
  }

  if(minOverhang < 0) {
    stop("minOverhang must be a positive number")
  }

  if(length(minSJRead) != 1) {
    stop("minSJRead must be a positive number")
  }

  if(!is.numeric(minSJRead)) {
    stop("minSJRead must be a positive number")
  }

  if(minSJRead < 0) {
    stop("minSJRead must be a positive number")
  }

  if(length(minSJRowSum) != 1) {
    stop("minSJRowSum must be a positive number")
  }

  if(!is.numeric(minSJRowSum)) {
    stop("minSJRowSum must be a positive number")
  }

  if(minSJRowSum < 0) {
    stop("minSJRowSum must be a positive number")
  }

  if(length(minSJSample) != 1) {
    stop("minSJSample must be a positive number")
  }

  if(!is.numeric(minSJSample)) {
    stop("minSJSample must be a positive number")
  }

  if(minSJSample < 0) {
    stop("minSJSample must be a positive number")
  }

  if(minSJSample > length(SJFiles)) {
    stop("minSJSample should not greater than number of SJFiles")
  }

  if(!is.logical(uniqMapOnly)) {
    stop("uniqMapOnly must be a logical value")
  }

  SJs <- lapply(SJFiles, FUN = function(x) {
    sj <- ReadSTARSJ(file = x, uniqMapOnly = uniqMapOnly, minSJRead = minSJRead, annotationOnly = annotationOnly, minOverhang = minOverhang)
    sj <- data.table::data.table(SJ = as.character(sj), read = sj$reads)
    data.table::setkey(sj, SJ)
    return(sj)
  })

  sjs <- unique(do.call(c, lapply(SJs, FUN = function(x) x$SJ)))
  sjs <- as.character(sort(as(sjs, "GRanges")))
  Tab <- do.call(cbind, lapply(SJs, function(x) x[sjs, read]))
  row.names(Tab) <- sjs
  colnames(Tab) <- gsub(postfix, "", basename(SJFiles))

  Tab[is.na(Tab)] <- 0
  Tab <- Tab[rowSums(Tab) >= minSJRowSum & rowSums(Tab > 0) >= minSJSample, ]
  return(Tab)
}

#' @rdname InternalFunc
#' @name ASSJFind
#'
#' @importFrom GenomicRanges GRanges
#' @importClassesFrom GenomicRanges GRanges
#' @importFrom GenomeInfoDb seqnames
#' @importFrom BiocGenerics start
#' @importFrom BiocGenerics end
#' @importFrom BiocGenerics strand

ASSJFind <- function(x) {
  if(!is(x, "GRanges")) {
    stop("x must be a GRanges object.")
  }

  if(anyDuplicated(as.character(x))) {
    stop("Some splice junction are duplicated.")
  }

  DonorSites <- paste0(seqnames(x), ":", start(x), ":", strand(x))
  AcceptorSites <- paste0(seqnames(x), ":", end(x), ":", strand(x))
  whichAS <- DonorSites %in% names(which(table(DonorSites) > 1)) | AcceptorSites %in% names(which(table(AcceptorSites) > 1))
  x$ASSJ <- as.numeric(whichAS)
  return(x)
}


#' @rdname InternalFunc
#' @name ASSJFilter
#'
#' @importFrom GenomicRanges GRanges
#' @importClassesFrom GenomicRanges GRanges
#' @importFrom GenomeInfoDb seqnames
#' @importFrom BiocGenerics start
#' @importFrom BiocGenerics end
#' @importFrom BiocGenerics strand
#'
#' @param MMF minimum minor-junction fraction of multiple-alternative-splice site (same start/end junction > 2)

ASSJFilter <- function(matrix, MMJF) {
  x <- as(row.names(matrix), "GRanges")

  if(anyDuplicated(row.names(matrix))) {
    stop("Some splice junction are duplicated.")
  }

  DonorSites <- paste0(seqnames(x), ":", start(x), ":", strand(x))
  AcceptorSites <- paste0(seqnames(x), ":", end(x), ":", strand(x))

  whichAS <- DonorSites %in% names(which(table(DonorSites) > 1)) | AcceptorSites %in% names(which(table(AcceptorSites) > 1))
  assj <- as.character(x)[which(whichAS)]
  x <- x[which(whichAS)]

  y <- data.table::data.table(SJ = as.character(x),
                              start = paste0(as.character(seqnames(x)), ":", start(x), as.character(strand(x))),
                              end = paste0(as.character(seqnames(x)), ":", end(x), as.character(strand(x))),
                              reads = rowSums(matrix[as.character(x), ]))

  mass <- y[, .N, by = "start"][N > 2, start]
  mase <- y[, .N, by = "end"][N > 2, end]

  mass <- y[start %in% mass, ]
  mass$Frac <- mass[, prop.table(reads), by = start]$V1
  MultiSameStartMinorJunction <- mass[Frac < MMJF, SJ]
  mase <- y[end %in% mase, ]
  mase$Frac <- mase[, prop.table(reads), by = end]$V1
  MultiSameEndMinorJunction <- mase[Frac < MMJF, SJ]

  assj <- setdiff(assj, union(MultiSameStartMinorJunction, MultiSameEndMinorJunction))
  matrix[assj, ]
}




#' @rdname InternalFunc
#' @name PSICalculate
#'
#' @importFrom GenomicRanges GRanges
#' @importClassesFrom GenomicRanges GRanges
#' @importFrom GenomeInfoDb seqnames
#' @importFrom BiocGenerics start
#' @importFrom BiocGenerics end
#' @importFrom BiocGenerics strand
#'
#' @param MinSumSpliceSite The minimum number of reads of each SameStart or SameEnd for PSI Calculating


fun1 <- function(x, MinSum) {
  x/ifelse(sum(x, na.rm = TRUE) >= MinSum, sum(x, na.rm = TRUE), NA)
}

PSICalculate <- function(ASSJMat, MinSumSpliceSite){
  sjInfo <- data.table::as.data.table(as(row.names(ASSJMat), "GRanges"), stringsAsFactors = FALSE)[, c(-4)]
  ASSJMat <- cbind(sjInfo, ASSJMat)

  data.table::setkey(ASSJMat, seqnames, start, strand)
  start.psi <- ASSJMat[, lapply(.SD, fun1, MinSum = MinSumSpliceSite), .SDcols = -c(1:4), by = list(seqnames, start, strand)]
  start.psi$end <- ASSJMat$end

  data.table::setkey(ASSJMat, seqnames, end, strand)
  end.psi <- ASSJMat[, lapply(.SD, fun1, MinSum = MinSumSpliceSite), .SDcols = -c(1:4), by = list(seqnames, end, strand)]
  end.psi$start <- ASSJMat$start

  data.table::setcolorder(end.psi, colnames(start.psi))
  psi <- rbind(start.psi, end.psi)
  data.table::setkey(psi, seqnames, start, end, strand)
  data.table::setcolorder(psi, c(c("seqnames", "start", "end", "strand"), setdiff(colnames(psi), c("seqnames", "start", "end", "strand"))))

  psi <- psi[rowSums(psi[, -c(1:4)] != 1, na.rm = TRUE) != 0, ]
  psi <- psi[, lapply(.SD, function(x) mean(x, na.rm = TRUE)), by = list(seqnames, start, end, strand)]

  psi <- as.matrix(data.table::setDF(psi[, -c(1:4)], rownames = psi[, paste0(seqnames, ":", start, "-", end, ":", strand)]))
  return(psi)
}


#' @rdname InternalFunc
#' @name SetIfNull
#' Set a default value if an object is null
#
#' @param x An object to set if it's null
#' @param default The value to provide if x is null
#
#' @return default if x is null, else x
#
SetIfNull <- function(x, default) {
  if(is.null(x = x)){
    return(default)
  } else {
    return(x)
  }
}


#' @rdname InternalFunc
#' @name WhichCells
#
#' @param object An object to set if it's null
#' @param ident one level of design
#
WhichCells <- function(object, ident) {
  row.names(colData(object))[colData(object)[[object@design]] == ident]
}


# Generate a random name
#
# Make a name from randomly sampled lowercase letters,
# pasted together with no spaces or other characters
#
# @param length How long should the name be
# @param ... Extra parameters passed to sample
#
# @return A character with nchar == length of randomly sampled letters
#
# @seealso \code{\link{sample}}
#
RandomName <- function(length = 5L, ...) {
  return(paste(sample(x = letters, size = length, ...), collapse = ''))
}
tangchao7498/ICAS documentation built on Jan. 28, 2021, 3:56 p.m.