R/utils.R

Defines functions .compute_similarity .create_met_region .constructGRanges .parallel_find_region .predictive_infer_profile .parallel_cores .minmax_scaling .do_centre_loc design_matrix.rbf design_matrix.default design_matrix .rbf_basis

Documented in design_matrix design_matrix.default design_matrix.rbf

# (Internal) Apply the radial basis function
#
# Apply the RBF function to the input X.
#
# @param X Input data.
# @param mus Centers from where we should compute the distance of the data X.
# @param gamma Inverse width of radial basis function.
#
# @return Input X, after being transformed from the RBF.
#
# @author Hongen Kang  \email{geneprophet@163.com}
#
#
.rbf_basis <- function(X,
                       mus, 
                       gamma = 1){
  return(exp( (-1) * gamma * sum( (X - mus) ^ 2) ))
}


#' @name design_matrix
#' @rdname design_matrix
#' @aliases designmatrix des_matrix des_mat
#'
#' @title Generic function for creating design matrices
#'
#' @description These functions call the appropriate methods depending on the
#'   class of the object \code{obj} to create RBF design matrices.
#'
#' @param obj A basis function object.
#' @param obs A vector of observations.
#' @param ... Additional parameters.
#'
#' @return A design matrix object
#'
#' @author Hongen Kang  \email{geneprophet@163.com}
#'
#' @export
design_matrix <- function(obj, 
                          ...){
  UseMethod("design_matrix")
}


#' @rdname design_matrix
#'
design_matrix.default <- function(obj, 
                                  ...){
  stop(paste0("Object type '", class(obj), "' is not implemented."))
}

#' @rdname design_matrix
#'
#' @export
design_matrix.rbf <- function(obj,
                              obs,
                              ...){
  assertthat::assert_that(methods::is(obj, "rbf"))
  
  if (is.na(obs[1])) {
    return(list(H = NA, basis = obj))
  } else {
    if(methods::is(obs,"matrix")){
      obs = obs[,1]
    }
    assertthat::assert_that(is.vector(obs))
    N   <- length(obs)  # Length of the dataset
    if (obj$M == 0) { H <- matrix(1, nrow = N, ncol = 1); obj$mus <- 0;
    }else{
      if (is.null(obj$mus)) {
        if (obj$eq_spaced_mus) {
          obj$mus <- vector(mode = "numeric", obj$M)
          if (!obj$whole_region) {
            # TODO: Keep this as functionality?
            for (i in 1:obj$M) {
              obj$mus[i] <- i*(max(obs) - min(obs))/(obj$M + 1) +
                min(obs)
            }
          }
        }else {
          repeat {
            # TODO: Keep this as functionality?
            km <- stats::kmeans(obs, obj$M, iter.max = 30, nstart = 10)
            if (min(km$size) > 0) { break } # Accept non-empty clusters
          }
          obj$mus <- km$centers  # RBF centers
        }
      }
      # Convert the 'obs' vector to an N x 1 dimensional matrix
      obs <- as.matrix(obs)
      H <- matrix(1, nrow = N, ncol = obj$M + 1)
      for (j in 1:obj$M) {
        H[, j + 1] <- apply(obs,1,.rbf_basis,mus = obj$mus[j],
                            gamma = obj$gamma)
      }
    }
    return(list(H = H, basis = obj))
  }
}



# Center CpG locations relative to TSS
#
# \code{center_loc} centera CpG locations relative to TSS
#
# @param region CpG locations
# @param tss TSS location
# @param strand_direction Strand direction
#
# @return Centered location data relative to TSS
#
.do_centre_loc <- function(region,
                           centre, 
                           strand_direction){
  assertthat::assert_that(is.character(strand_direction))
  center <- region - centre
  if (identical(strand_direction, "-")){
    center  <- (-center)  # If '-' strand, swap CpG locations
  }
  return(center)
}
# scaling_location
# Compute the min-max scaling
#
# \code{.minmax_scaling} normalizes a given vector using the the min-max
# scaling method. More formally:
# \deqn{scaled = \frac{data -x_{min}}{x_{max} - x_{min}} \times (f_{max} -
#  f_{min}) + f_{min}}
#
# @param data Vector with numeric data to be scaled.
# @param xmin Optional minimum value, otherwise \code{min(data)} will be used.
# @param xmax Optional maximum value, otherwise \code{max(data)} will be used.
# @param fmin Optional minimum range value, default is -1.
# @param fmax Optional maximum range value, default is 1.
#
# @return The scaled data in the given range, default is between (-1, 1). If
#  xmin = xmax the input vector \code{data} is returned.
#
.minmax_scaling <- function(data,
                            xmin = NULL, 
                            xmax = NULL,
                            fmin = -1, 
                            fmax = 1){
  if (is.null(xmin)) { xmin <- min(data) }
  if (is.null(xmax)) { xmax <- max(data) }
  if ( (xmin - xmax) == 0) { return(data) }
  minmax <- (data - xmin) / (xmax - xmin)
  minmax_scaled <- minmax * (fmax - fmin) + fmin
  return(minmax_scaled)
}


# @title Number of parallel cores
#
# @description Function for creating the number of parallel cores that will be
#   used during EM.
# @param no_cores Number of cores given as input
# @param is_parallel Logical, did we require parallel computations
# @param M Total number of sources
#
.parallel_cores <- function(no_cores=NULL,
                            is_parallel=FALSE, 
                            max_cores = NULL){
  if (is_parallel) { # If parallel mode is ON
    # If number of cores is not given
    if (is.null(no_cores)) { no_cores <- parallel::detectCores() - 1
    } else{if (no_cores >= parallel::detectCores()) {
      no_cores <- parallel::detectCores() - 1 }
    }
    if (is.na(no_cores)) { no_cores <- 2 }
    if (!is.null(max_cores)) {
      if (no_cores > max_cores) { no_cores <- max_cores }
    }
  }
  return(no_cores)
}
# Compute predictive distribution of inferred profiles using VB
.predictive_infer_profile <- function(model,
                                      x,
                                      region = 1){
  # Predictive mean
  H <- design_matrix(model$basis, x)$H
  if(methods::is(model$W[region, ], "numeric")){
    W_pred <- c(H %*% model$W[region, ])
  }else{
    W_pred <- c(H %*% as.vector(model$W[region, ]$w1))
  }
  if (methods::is(model, "infer_profiles_vb_binomial") ||
      methods::is(model, "infer_profiles_vb_bernoulli")) {
    # Predictive variance
    W_sd_pred <- sqrt(1 + diag(H %*% model$W_Sigma[[region]] %*% t(H)))
    W_pred <- pnorm(W_pred / W_sd_pred)
  }else {stop("Not predictive distribution for this object") }
  return(list(W_pred = W_pred, W_sd_pred = W_sd_pred))
}
# @title findOverlap
# 根据hits聚点成region,间隔小于200bp,连续10个
# parallel function 
.parallel_find_region <- function(methylation,
                                  annotation,
                                  index,
                                  min_sites_number,
                                  max_distance_between_sites,
                                  ignore_strand){
  anno <- annotation[index]
  hits <- GenomicRanges::findOverlaps(methylation,anno,ignore.strand=ignore_strand)
  begin <- vector(mode = "double", length = 0)
  end <- vector(mode = "double", length = 0)
  anno_index <- vector(mode = "double", length = 0)
  if(length(hits)>=min_sites_number){
    count = 1
    b = GenomicRanges::start(methylation[S4Vectors::queryHits(hits[1])])
    e = NULL
    for (i in 2:length(hits)) {
      #current <- GenomicRanges::start(methylation[S4Vectors::queryHits(hits[i])])
      #before <- GenomicRanges::start(methylation[S4Vectors::queryHits(hits[i-1])])
      if((GenomicRanges::start(methylation[S4Vectors::queryHits(hits[i])])-GenomicRanges::start(methylation[S4Vectors::queryHits(hits[i-1])]))<max_distance_between_sites){
        count = count + 1
        print(count)
        #finsh the iteration when reach the end of CpG_gr
        if(count==length(hits)){
          e = GenomicRanges::start(methylation[S4Vectors::queryHits(hits[i])])
          begin <- c(begin,b)
          end <- c(end,e)
          #anno_index <- c(anno_index,index)
        }
      }else{
        if(count >= min_sites_number){
          e = GenomicRanges::start(methylation[S4Vectors::queryHits(hits[i-1])])
          begin <- c(begin,b)
          end <- c(end,e)
          #anno_index <- c(anno_index,index)
          b = GenomicRanges::start(methylation[S4Vectors::queryHits(hits[i])])
          count = 1
          e = NULL
        }else{
          count = 1
          b = GenomicRanges::start(methylation[S4Vectors::queryHits(hits[i])])
          e = NULL
        }
      }
    }
    
  }
  
  #if no region meet the suitation then return NULL
  if(length(end)==0){
    return(NULL)
  }else{
    anno_index <- rep(index,length(end))
    region <- data.frame(start=begin,end=end,anno_index=anno_index)
    return(region)
  }
  
}

.constructGRanges <- function(inputRegion,
                              out,
                              index){
  #判断Deleted
  if(length(out[[index]])==0){
    result <- GenomicRanges::GRanges(
      seqnames = "FAILED",
      ranges = IRanges::IRanges(start = 0,end = 0),
      strand = "-",
      id =  "liftOver FAILED",
      center = NA,
      annotation = "Deleted in new: Sequence intersects no chains"
    )
    return(result)
  }else{
    #判断split
    if(length(BiocGenerics::unique(BiocGenerics::as.vector(GenomicRanges::seqnames(out[[index]]))))>1 || length(BiocGenerics::unique(BiocGenerics::as.vector(BiocGenerics::strand(out[[index]]))))>1){
      result <- GenomicRanges::GRanges(
        seqnames = "FAILED",
        ranges = IRanges::IRanges(start = 0,end = 0),
        strand = "-",
        id =  "liftOver FAILED",
        center = NA,
        annotation = "Split in new: Sequence insufficiently intersects multiple chains"
      )
      return(result)
    }else{
      if(tail(BiocGenerics::as.vector(GenomicRanges::end(out[[index]])),1) > head(BiocGenerics::as.vector(GenomicRanges::start(out[[index]])),1)){
        result <- GenomicRanges::GRanges(
          seqnames = BiocGenerics::unique(BiocGenerics::as.vector(GenomicRanges::seqnames(out[[index]]))),
          ranges = IRanges::IRanges(start = head(BiocGenerics::as.vector(GenomicRanges::start(out[[index]])),1), end = tail(BiocGenerics::as.vector(GenomicRanges::end(out[[index]])),1)),
          strand = BiocGenerics::unique(BiocGenerics::as.vector(BiocGenerics::strand(out[[index]]))),
          id = "liftOver SUCCESS",
          center = round((head(BiocGenerics::as.vector(GenomicRanges::start(out[[index]])),1) + tail(BiocGenerics::as.vector(GenomicRanges::end(out[[index]])),1))/2),
          annotation = "Successed"
        )
        #判断Partially deleted
        if(abs(GenomicRanges::width(inputRegion[index])-GenomicRanges::width(result))>3000){
          result <- GenomicRanges::GRanges(
            seqnames = "FAILED",
            ranges = IRanges::IRanges(start = 0,end = 0),
            strand = "-",
            id = "liftOver FAILED",
            center = NA,
            annotation = "Partially deleted in new: Sequence insufficiently intersects one chain"
          )
        }
        return(result)
      }else{
        result <- GenomicRanges::GRanges(
          seqnames = BiocGenerics::unique(BiocGenerics::as.vector(GenomicRanges::seqnames(out[[index]]))),
          ranges = IRanges::IRanges(start =  tail(BiocGenerics::as.vector(GenomicRanges::end(out[[index]])),1), end = head(BiocGenerics::as.vector(GenomicRanges::start(out[[index]])),1)),
          strand = BiocGenerics::unique(BiocGenerics::as.vector(BiocGenerics::strand(out[[index]]))),
          id = "liftOver SUCCESS",
          center = round((tail(BiocGenerics::as.vector(GenomicRanges::end(out[[index]])),1) + head(BiocGenerics::as.vector(GenomicRanges::start(out[[index]])),1))/2),
          annotation = "Successed"
        )
        if(abs(GenomicRanges::width(inputRegion[index])-GenomicRanges::width(result))>3000){
          result <- GenomicRanges::GRanges(
            seqnames = "FAILED",
            ranges = IRanges::IRanges(start = 0,end = 0),
            strand = "-",
            id = "liftOver FAILED",
            center = NA,
            annotation = "Partially deleted in new: Sequence insufficiently intersects one chain"
          )
        }
        return(result)
      }
    }
  }
}


.create_met_region <- function(met_dt,
                               gen_ind,
                               query_hits,
                               subj_hits,
                               cov,
                               D,
                               met,
                               total,
                               cpg_loc,
                               centre,
                               chrom,
                               strand,
                               up_anno,
                               down_anno,
                               fmin,
                               fmax,
                               index){
  
  cpg_sites = subj_hits[which(query_hits==index)]
  
  if(length(cpg_sites) > cov){
    
    # Locations of CpGs in the genome
    region <- cpg_loc[cpg_sites]
    # Middle location for region[index]
    middle <- centre[gen_ind[which(gen_ind==index)]]
    # Extract chromosome information
    chrom_info <- chrom[gen_ind[which(gen_ind==index)]]
    # Extract strand information, i.e. direction
    strand_direction <- strand[gen_ind[which(gen_ind==index)]]
    # Extract upstream information
    upstream <- up_anno[gen_ind[which(gen_ind==index)]]
    # Extract downstream information
    downstream <- down_anno[gen_ind[which(gen_ind==index)]]
    # Shift CpG locations relative to TSS
    center_data  <- .do_centre_loc(
      region = region,
      centre = middle,
      strand_direction = strand_direction
    )
    # In "-" strand the order of the locations should change
    Order <- base::order(center_data)
    res <- matrix(data = 0, nrow = length(cpg_sites), ncol = D)
    # Store actual genomic coordinates of CpGs
    rownames(res) <- paste(chrom_info,
                           region[Order], sep = ":")
    # Store normalized locations of CpGs
    res[, 1] <- round(
      .minmax_scaling(
        data = center_data[Order],
        xmin = upstream,
        xmax = downstream,
        fmin = fmin,
        fmax = fmax
      ),
      10
    )
    # Store reads in the corresponding locations
    if (D == 2) {
      res[, 2] <- met[cpg_sites][Order]
    } else{
      # Store total reads in the corresponding locations
      res[, 2] <- total[cpg_sites][Order]
      # Store methylated reads in the corresponding locations
      res[, 3] <- met[cpg_sites][Order]
    }
    return(res)
  }else{
    res = NA
    return(res)
  }
}


.compute_similarity <- function(queryProfiles,
                                subjectProfiles,
                                index){
  if( is.na(queryProfiles$W[index,]) || is.na(subjectProfiles$W[index,])){
    return(NA)
  }else {
    a <- queryProfiles$W[index,]
    b <- subjectProfiles$W[index,]
    if(methods::is(a, "list")){
      a=as.vector(a$w1)
    }
    if(methods::is(b, "list")){
      b=as.vector(b$w1)
    }
    
    similarity <- crossprod(a-mean(a),b-mean(b))/sqrt(crossprod(a-mean(a))*crossprod(b-mean(b)))
    #sim <- crossprod(a,b)/sqrt(crossprod(a)*crossprod(b))
    #c <- cor(a,b)
    #normalize to [0-1]
    s <- 0.5 + 0.5*similarity
    #res <- c(s, index, which(subjectObj$anno$correspondingIndex == index),sim,c)
    return(s)
  }
}
geneprophet/BSDMR documentation built on March 3, 2021, 5:50 a.m.