GSClassifier: Gene Signature Classifier

####====================Basic function==================####

#' @description given a matrix and gene pairs, create binary features
#' @param X One gene expression matrix
#' @param genes a vector of gene pairs
#' @return xbin, binned expression profile
#' @details This function could be achieved via \code{makeGenePairs}. Not be used in the later GSClassifier.
#' @examples
#' Xsub <- createPairsFeatures(Xmat, genepairs)
createPairsFeatures <- function(X, genes) {
  # first convert the gene pairs to a named list
  # where each entry of the list is the genes for a given pivot-gene
  genePairs <- strsplit(genes, ':')
  pairList <- list()
  for (gi in genePairs) {
    pairList[[gi[1]]] <- c(pairList[[gi[1]]], gi[2])
  }

  resList <- list() # then for each pivot gene
  for (gi in names(pairList)) {
    # assuming it's in the data ... really should be!
    if (gi %in% rownames(X)) {
      gs <-
        pairList[[gi]]                  ## can end up with the pivot gene in the genes...
      pval <-
        as.numeric(X[gi, ])            ## pivot values across samples
      idx <-
        match(table = rownames(X), x = gs) ## get index to genes for this pivot
      Xsub <-
        X[idx, ]                       ## subset the matrix, NAs for missing genes, pivot gene on top
      is_num_vector <-
        class(Xsub)[1]  %in% c('numeric', "integer") & length(gs) == 1
      if (is_num_vector) {
        Xsub <- matrix(data = Xsub,
                       ncol = ncol(X),
                       nrow = 1)
        colnames(Xsub) <- colnames(X)
      }
      rownames(Xsub) <- gs                  ## give gene IDs
      res0 <-
        lapply(1:ncol(Xsub), function(a)
          binaryGene(pval[a], Xsub[, a]))  ## create binary values
      resList[[gi]] <- do.call('cbind', res0)
    } else {
      # else we need to include some dummy rows
      randMat <-
        matrix(data = rbinom(n = length(pairList[[gi]]) * ncol(X), prob = 0.5, 1),
               ncol = ncol(X))
      colnames(randMat) <- colnames(X)
      resList[[gi]] <- randMat
    }
  }
  newMat <- do.call('rbind', resList)
  rownames(newMat) <- genes
  return(newMat)
}


#' @description splite matrix to a list of subset
#' @inheritParams geneMatch
#' @param cutoff the number of every subset of X
spliteMatrix <- function(X, cutoff = 20) {
  # Cut_vector
  cut_vector <- function(vt, nsplit = 100) {
    if (nsplit == 1) {
      v2 <- list(vt)

      names(v2) <- paste0("1-", length(v2))
    } else {
      ##转化成位置向量
      len.vt = 1:length(vt)

      ##间隔
      len1 <- floor(length(len.vt) / nsplit)

      ## low.ci and upper.ci
      low.ci <- 1

      for (i in 1:(nsplit - 1)) {
        low.ci[i + 1] <- low.ci[i] + len1
      }
      upper.ci <- len1
      for (i in 1:(nsplit - 1)) {
        upper.ci[i + 1] <- upper.ci[i] + len1
      }

      ## upper.ci的最后一个值
      upper.ci[length(upper.ci)] <- length(vt)

      ## 形成列表
      v2 <- NULL
      for (i in 1:length(low.ci)) {
        v2.i <- low.ci[i]:upper.ci[i]
        v2.i <- vt[v2.i]
        v2 <- c(v2, list(v2.i))
        names(v2)[i] <- paste0(low.ci[i], "-", upper.ci[i])
      }
    }
    ## 输出结果
    return(v2)
  }

  # Splite matrix
  nXL <- floor(ncol(X) / cutoff)
  eXL <- cut_vector(colnames(X), nXL)
  XL <- list()
  for (i in 1:length(eXL)) {
    # i=1
    XL[[names(eXL)[i]]] <- X[, eXL[[i]]]
  }
  return(XL)
}



####==============callSubtype functions==================####

#' @description Data preprocessing
#' @param mods A model or list of models, containing breakpoints, used to bin expression data
#' @param geneSet A list of genes for classification
#' @param nClust The number of clustering
#' @inheritParams geneMatch
#' @importFrom stringr str_detect
#' @return Xbin, the binned, subset, and binarized values.
#' @examples
#' mod1 <- dataProc(X, mods)
dataProc <- function(X,
                     mods = NULL,
                     geneSet,
                     nClust = 4) {
  # working with matrices
  Xmat <- as.matrix(X)

  # NA filling with recursive partitioning and regression trees
  # Xmat <- na_fill(Xmat, method="anova", na.action = na.rpart)

  # Set features
  nGS <- length(geneSet)
  featureNames <- c()
  for (j1 in 1:(nGS - 1)) {
    for (j2 in (j1 + 1):nGS) {
      featureNames <- c(featureNames, paste0('s', j1, 's', j2))
    }
  }

  # get out the relevant items
  breakVec <- mods$breakVec
  genes <- mods$bst$feature_names
  singleGenes <- genes[!str_detect(genes, ':')]
  singleGenes <- singleGenes[!singleGenes %in% featureNames]
  pairedGenes <- genes[str_detect(genes, ':')]
  setFeatures <- genes[str_detect(genes, 's[0-9]{1,6}s[0-9]{1,6}')]

  # bin the expression data
  Xbinned <- apply(Xmat, 2, breakBin, breakVec)
  rownames(Xbinned) <- rownames(Xmat)

  # and subset the genes to those not in pairs
  if (length(singleGenes) > 1) {
    Xbinned <- Xbinned[singleGenes, ]
  }

  # here we have expression data, and we're using the pairs model
  # so we need to make pairs features.
  Xpairs <- makeGenePairs(unique(Fastextra(pairedGenes,'\\:')), Xmat)[pairedGenes, ]
  # Xpairs <- createPairsFeatures(Xmat, pairedGenes)
  # colnames(Xpairs) <- colnames(Xmat)

  # gene set features.
  Xset <- makeSetData(Xmat, geneSet)
  if (length(setFeatures) > 1) {
    Xset <- Xset[setFeatures, ]
  }

  # join the data types and transpose
  Xbin <- t(rbind(Xbinned, Xpairs, Xset))
  return(Xbin)
}


#' @description Make subtype calls for one sample
#' @param mods xgboost model list
#' @param X gene expression matrix, genes in rows, samples in columns
#' @param ci cluster label, and index into mods
#' @param verbose whether report messages
#' @inheritParams dataProc
#' @import xgboost
#' @return preds of one cluster model.
callOneSubtype <-
  function(mods, X, ci, geneSet, nClust, verbose = T) {
    # 2023-12-26：List in R is special. If the name of the list is 1,2,3,4, when you call l[[1]], it would call the name of "1" instead of the first one.
    # Xbin needs to have the same columns as the training matrix.

    # if(verbose) (paste0('calling subtype ', ci))

    if (as.character(ci) %in% names(mods)) {
      if (length(mods[[as.character(ci)]]) > 1) {
        mi <- mods[[as.character(ci)]] # or we can use: mi <- mods[[ci]]
        Xbin <- dataProc(X, mods = mi, geneSet, nClust)

        # WARNING: amalgamation/../src/learner.cc:438:
        # If you are loading a serialized model (like pickle in Python, RDS in R) generated by
        # older XGBoost, please export the model by calling `Booster.save_model` from that version
        # first, then load it back in current version. See:
        #   https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
        # for more details about differences between saving model and serializing.

        # Prediction
        pred <- predict(mi$bst, Xbin)
      } else {
        pred <- rep(0, ncol(X))
      }
    } else {
      # if(verbose) paste0('calling subtype ', ci, ': No avaliable model. Return 0!')

      # The model is unavailable. Return 0.
      pred <- rep(0, ncol(X))

    }

    return(pred)
  }


#' @description Make subtype calls for each sample
#' @inheritParams callOneSubtype
callSubtypes <- function(mods, X, geneSet, clusterName, verbose = T) {
  pList <-
    lapply(clusterName, function(mi)
      callOneSubtype(mods, X, mi, geneSet, nClust, verbose))
  pMat  <- do.call('cbind', pList)
  colnames(pMat) <- clusterName # names(mods)
  bestCall <-
    apply(pMat, 1, function(pi)
      colnames(pMat)[which(pi == max(pi))][1])
  # 2022-9-27. There's no problem because bestcall_max is a legacy category strategy and would not be recommended.
  # bestCall <- apply(pMat, 1, function(pi) colnames(pMat)[which(pi == max(pi)[1])])
  #     1           2           3           4
  # 0.077546641 0.004749997 0.832491577 0.832491577
  return(data.frame(
    SampleID = colnames(X),
    BestCall = bestCall,
    pMat,
    stringsAsFactors = F
  ))
}

huangwb8/GSClassifier documentation built on Oct. 15, 2024, 8:12 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

huangwb8/GSClassifier
Gene Signature Classifier

R/ModelSeries_callSubtypes.R
In huangwb8/GSClassifier: Gene Signature Classifier

Defines functions callSubtypes callOneSubtype dataProc spliteMatrix createPairsFeatures

R Package Documentation

Browse R Packages

We want your feedback!

huangwb8/GSClassifier Gene Signature Classifier

R/ModelSeries_callSubtypes.R In huangwb8/GSClassifier: Gene Signature Classifier

Defines functions callSubtypes callOneSubtype dataProc spliteMatrix createPairsFeatures

R Package Documentation

Browse R Packages

We want your feedback!

huangwb8/GSClassifier
Gene Signature Classifier

R/ModelSeries_callSubtypes.R
In huangwb8/GSClassifier: Gene Signature Classifier