R/sign.R

Defines functions concatenate_obj_sign auto_find_marker_gene find_marker_gene auto_find_marker_sign find_marker_sign compute_nswaps classify_samples_merlot_sign do_CalculatePseudotimes_dmap do_SignSpaceEmbedding_dmap do_CalculateElasticTree_dmap do_CalculateScaffoldTree_dmap classify_samples_seuratFindClusters_sign classify_samples_hclustCutree_sign classify_samples_pam_sign do_DiffusionMap_sign do_umap_sign do_tsne_sign do_pca_sign make_signxsample_matrix manual_selection_sign manual_curation_sign reduce_sign compute_SemSim_sign select_sign separate_variables_sign do_quickQC_sign

Documented in auto_find_marker_gene auto_find_marker_sign classify_samples_hclustCutree_sign classify_samples_merlot_sign classify_samples_pam_sign classify_samples_seuratFindClusters_sign compute_nswaps compute_SemSim_sign concatenate_obj_sign do_CalculateElasticTree_dmap do_CalculatePseudotimes_dmap do_CalculateScaffoldTree_dmap do_DiffusionMap_sign do_pca_sign do_quickQC_sign do_SignSpaceEmbedding_dmap do_tsne_sign do_umap_sign find_marker_gene find_marker_sign make_signxsample_matrix manual_curation_sign manual_selection_sign reduce_sign select_sign separate_variables_sign

#' Perform quick QC for ?
#'
#' Takes an intersection of genes between supplied data and tidy_GO, and then
#' excludes the IDs with too few or too many genes by setting thresholds.
#'
#' @param obj ASURAT object.
#' @param data_type Database to use. ***Need clarification***
#' @param min_ngenes Minimal number of genes.
#' @param max_ngenes Maximal number of genes.
#'
#' @return ASURAT object.
#' @export
#'
#' @examples do_quickQC_sign(obj = object, data_type = "DO", min_ngenes = 2, max_ngenes = 1000)
do_quickQC_sign <- function(obj, data_type, min_ngenes = 2, max_ngenes = 1000){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  category_names <- names(obj[[sign]][[data_type]])
  obj_genes <- obj[["variable"]][["symbol"]]
  obj_geneIDs <- obj[["variable"]][["entrez"]]
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  slot_name <- paste("do_quickQC_", data_type, sep = "")
  obj[["history"]][[slot_name]][["min_ngenes"]] <- min_ngenes
  obj[["history"]][[slot_name]][["max_ngenes"]] <- max_ngenes
  #--------------------------------------------------
  # Select genes existing in user's data set
  #--------------------------------------------------
  for(category in category_names){
    tmp <- obj[[sign]][[data_type]][[category]]
    if(identical(head(tmp$Gene[is.na(tmp$Gene)], n=3), c(NA, NA, NA))){
      for(i in 1:nrow(tmp)){
        #------------------------------
        # GeneID: NCBI gene (formerly Entrezgene) ID
        #------------------------------
        geneIDs <- unlist(strsplit(tmp$GeneID[i], "/"))
        geneIDs <- geneIDs[which(geneIDs %in% intersect(geneIDs, obj_geneIDs))]
        tmp$GeneID[i] <- paste(geneIDs, collapse = "/")
        tmp$Count[i] <- as.integer(length(geneIDs))
        #------------------------------
        # Gene: gene symbol
        #------------------------------
        genes <- c()
        for(g in geneIDs)
          genes <- c(genes, obj_genes[which(obj_geneIDs == g)])
        tmp$Gene[i] <- paste(genes, collapse = "/")
      }
    }else if(identical(head(tmp$GeneID[is.na(tmp$GeneID)], n=3), c(NA, NA, NA))){
      for(i in 1:nrow(tmp)){
        #------------------------------
        # Gene: gene symbol
        #------------------------------
        genes <- unlist(strsplit(tmp$Gene[i], "/"))
        genes <- genes[which(genes %in% intersect(genes, obj_genes))]
        tmp$Gene[i] <- paste(genes, collapse = "/")
        tmp$Count[i] <- as.integer(length(genes))
        #------------------------------
        # GeneID: NCBI gene (formerly Entrezgene) ID
        #------------------------------
        geneIDs <- c()
        for(g in genes){
          geneIDs <- c(geneIDs, obj_geneIDs[which(obj_genes == g)])
        }
        tmp$GeneID[i] <- paste(geneIDs, collapse = "/")
      }
    }else{
      for(i in 1:nrow(tmp)){
        #------------------------------
        # Count
        #------------------------------
        genes <- unlist(strsplit(tmp$Gene[i], "/"))
        genes <- genes[which(genes %in% intersect(genes, obj_genes))]
        tmp$Count[i] <- as.integer(length(genes))
      }
    }

    obj[[sign]][[data_type]][[category]] <- tmp
  }
  #--------------------------------------------------
  # Filter out the IDs including too few or too many genes
  #--------------------------------------------------
  for(category in category_names){
    tmp <- obj[[sign]][[data_type]][[category]]
    res <- tmp[which((tmp$Count >= min_ngenes) & (tmp$Count <= max_ngenes)),]
    rownames(res) <- 1:nrow(res)
    obj[[sign]][[data_type]][[category]] <- as.data.frame(res)
  }

  return(obj)
}

#' Classify genes annotated with each biological description into groups.
#'
#' Classify genes annotated with each biological description into three groups:
#' strongly correlated gene set (SCG), variably correlated gene set (VCG),
#' and weakly correlated gene set (WCG).
#'
#' @param obj ASURAT object.
#' @param obj_cor ***Need clarification***
#' @param data_type Database to use. ***Need clarification***
#' @param method Method for correlation calculation (e.g. "spearman")
#' @param th_posi Positive threshold to classify SCG and WCG.
#' @param th_nega Negative threshold to classify SCG and WCG.
#'
#' @return ASURAT object.
#' @export
#'
#' @examples separate_variables_sign(obj = object,
#' obj_cor = day1_norm_cor,
#' data_type = "DO",
#' method = "spearman",
#' th_posi = 0.40, th_nega = -0.20)
separate_variables_sign <- function(obj, obj_cor, data_type, method, th_posi, th_nega){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  category_names <- names(obj[[sign]][[data_type]])
  cormat <- obj_cor[[method]]
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  slot_name <- paste("separate_variables_", data_type, sep = "")
  obj[["history"]][[slot_name]][["method"]] <- method
  obj[["history"]][[slot_name]][["th_posi"]] <- th_posi
  obj[["history"]][[slot_name]][["th_nega"]] <- th_nega
  #--------------------------------------------------
  # Separate genes
  #--------------------------------------------------
  for(category in category_names){
    df <- obj[[sign]][[data_type]][[category]]
    res <- data.frame(
      matrix(ncol = 12, nrow = 0, dimnames = list(NULL, c(
        "ID",
        "Description",
        "Count_strg",
        "Count_vari",
        "Count_weak",
        "Corr_strg",
        "Corr_vari",
        "Corr_weak",
        "Gene_strg",
        "Gene_vari",
        "Gene_weak",
        "GeneID"
      ))))
    for(i in 1:nrow(df)){
      #--------------------------------------------------
      # Definitions
      #--------------------------------------------------
      genes <- unlist(strsplit(df$Gene[i], "/"))
      if(length(genes) <= 1){
        next
      }
      inds <- which(rownames(cormat) %in% genes)
      mat <- as.matrix(cormat[inds, inds])
      #--------------------------------------------------
      # Decomposition of correlation matrix
      #--------------------------------------------------
      diag(mat) <- -99
      inds_posi <- which(apply(mat, 2, function(x) sum(x >= th_posi)) > 0)
      diag(mat) <- 99
      inds_nega <- which(apply(mat, 2, function(x) sum(x <= th_nega)) > 0)
      #--------------------------------------------------
      #
      #--------------------------------------------------
      if(length(inds_posi) == 0){
        genes_strg <- NA
        genes_vari <- NA
        genes_weak <- genes
        mean_strg <- NA
        mean_vari <- NA
        inds <- which(rownames(mat) %in% genes_weak)
        mtx <- mat[inds, inds]
        mean_weak <- ifelse(length(inds) <= 1, NA, mean(mtx[mtx <= 1]))
      }else if(length(inds_nega) >= 2){
        #------------------------------
        # Definitions
        #------------------------------
        inds <- union(inds_posi, inds_nega)
        tmp <- as.matrix(mat[inds, inds])
        gset <- list()
        mean <- list()
        #------------------------------
        # Pam clustering
        #------------------------------
        diag(tmp) <- 1
        set.seed(8)
        clust <- pam(tmp, k = 2)
        diag(tmp) <- 99
        for(j in 1:2){
          gset[[j]] <- names(clust[["clustering"]])[which(clust[["clustering"]] == j)]
          inds <- which(rownames(tmp) %in% gset[[j]])
          mtx <- tmp[inds, inds]
          mean[[j]] <- ifelse(length(inds) <= 1, NA, mean(mtx[mtx <= 1]))
        }
        #------------------------------
        # Definitions
        #------------------------------
        means <- unlist(mean)
        inds <- order(means, decreasing = TRUE)
        genes_strg <- gset[[inds[1]]]
        genes_vari <- gset[[inds[2]]]
        mean_strg <- means[inds[1]]
        mean_vari <- means[inds[2]]
        genes_weak <- setdiff(genes, union(gset[[1]], gset[[2]]))
        inds <- which(rownames(mat) %in% genes_weak)
        mtx <- mat[inds, inds]
        mean_weak <- ifelse(length(inds) <= 1, NA, mean(mtx[mtx <= 1]))
      }else{
        mtx <- mat[inds_posi, inds_posi]
        #------------------------------
        # Definitions
        #------------------------------
        genes_strg <- rownames(mtx)
        genes_vari <- NA
        genes_weak <- setdiff(genes, genes_strg)
        mean_strg <- mean(mtx[mtx <= 1])
        mean_vari <- NA
        inds <- which(rownames(mat) %in% genes_weak)
        mtx <- mat[inds, inds]
        mean_weak <- ifelse(length(inds) <= 1, NA, mean(mtx[mtx <= 1]))
      }
      #--------------------------------------------------
      # Re-separation
      #--------------------------------------------------
      if((!(is.na(mean_strg))) && (mean_strg < th_posi)){
        genes_strg <- NA
        genes_vari <- NA
        genes_weak <- genes
        mean_strg <- NA
        mean_vari <- NA
        inds <- which(rownames(mat) %in% genes_weak)
        mtx <- mat[inds, inds]
        mean_weak <- ifelse(length(inds) <= 1, NA, mean(mtx[mtx <= 1]))
      }
      #--------------------------------------------------
      # res
      #--------------------------------------------------
      res <- rbind(res, data.frame(
        ID = df$ID[i],
        Description = df$Description[i],
        Count_strg = as.integer(ifelse(is.element(NA, genes_strg), 0, length(genes_strg))),
        Count_vari = as.integer(ifelse(is.element(NA, genes_vari), 0, length(genes_vari))),
        Count_weak = as.integer(ifelse(is.element(NA, genes_weak), 0, length(genes_weak))),
        Corr_strg = mean_strg,
        Corr_vari = mean_vari,
        Corr_weak = mean_weak,
        Gene_strg = ifelse(is.element(NA, genes_strg), NA, paste(genes_strg, collapse = "/")),
        Gene_vari = ifelse(is.element(NA, genes_vari), NA, paste(genes_vari, collapse = "/")),
        Gene_weak = ifelse(is.element(NA, genes_weak), NA, paste(genes_weak, collapse = "/")),
        GeneID = df$GeneID[i]
      ))
    }
    #--------------------------------------------------
    # Store the results
    #--------------------------------------------------
    obj[[sign]][[data_type]][[category]] <- res
  }

  return(obj)
}

#' Select signs by user-defined criteria
#'
#' @param obj ASURAT object.
#' @param data_type Database to use. ***Need clarification***
#' @param min_cnt ***Need clarification***
#' @param min_cnt_weak ***Need clarification***
#'
#' @return ASURAT object.
#' @export
#'
#' @examples select_sign(obj = object, data_type = "DO",
#' min_cnt = 2, min_cnt_weak = 2)
select_sign <- function(obj, data_type, min_cnt, min_cnt_weak = 2){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  category_names <- names(obj[[sign]][[data_type]])
  slot_name <- paste("select_", data_type, sep = "")
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  obj[["history"]][[slot_name]][["min_cnt"]] <- min_cnt
  obj[["history"]][[slot_name]][["min_cnt_weak"]] <- min_cnt_weak
  #--------------------------------------------------
  #
  #--------------------------------------------------
  for(category in category_names){
    df <- obj[[sign]][[data_type]][[category]]
    #============================================================
    # User-defined criteria:
    #   `|` and `&` mean "or" and "and", respectively.
    #============================================================
    # (Start) ===================================================

    inds <- which(
      (df$Count_strg + df$Count_vari >= min_cnt) &
        df$Count_weak >= min_cnt_weak
    )

    # (End) =====================================================
    #============================================================
    df <- df[inds,]
    if(length(inds) != 0)
      rownames(df) <- 1:nrow(df)
    obj[[sign]][[data_type]][[category]] <- df
  }

  return(obj)
}

#' Computes pairwise semantic similarities between the biological terms
#'
#' @param obj ASURAT object.
#' @param data_type
#' @param measure Argument of GOSemSim() function: "Resnik", "Rel", "Jiang", or "Lin".
#' @param orgdb Annotation data package (e.g. org.Hs.eg.db, org.Mm.eg.db)
#' @param treeTable ? ***Need clarification***
#' @param IC ? ***Need clarification***
#'
#' @return ASURAT object.
#' @export
#'
#' @examples compute_SemSim_sign(obj = object, data_type = "DO",
#' measure = "Jiang", orgdb = NULL,
#' treeTable = NULL, IC = IC_DO)
compute_SemSim_sign <- function(obj, data_type, measure, orgdb, treeTable = NULL, IC){
  #------------------------------
  # Definitions
  #------------------------------
  sign <- "sign"
  slot_name <- paste(data_type, "_Sim", sep = "")
  category_names <- names(obj[[sign]][[data_type]])
  #------------------------------
  # History
  #------------------------------
  text <- paste("compute_SemSim_", data_type, sep = "")
  obj[["history"]][[text]][["measure"]] <- measure
  #------------------------------
  # Compute semantic similarities
  #------------------------------
  for(category in category_names){
    #------------------------------
    # Add IC
    #------------------------------
    df <- obj[[sign]][[data_type]][[category]]
    tmp <- IC[[category]]
    if(nrow(df) == 0){
      next
    }
    for(i in 1:nrow(df)){
      ind <- which(tmp$ID == df$ID[i])
      if(length(ind) == 0){
        df$IC[i] <- Inf
      }else{
        df$IC[i] <- tmp[ind,]$IC
      }
    }
    obj[[sign]][[data_type]][[category]] <- df
    #------------------------------
    # Reorder: necessary for the next step
    #------------------------------
    df <- df[order(df$IC, decreasing = FALSE),]

    #==================================================
    # DO
    #==================================================
    if(data_type == "DO"){
      #------------------------------
      # doSim
      #------------------------------
      simmat <- doSim(df$ID, df$ID, measure = measure)
      obj[[sign]][[slot_name]][[category]][["matrix"]] <- simmat  # Note: class(simmat) is "matrix"
    }
    #==================================================
    # CO
    #==================================================
    if(data_type == "CO"){
      simmat <- matrix(0, nrow = nrow(df), ncol = nrow(df))
      tree <- treeTable[[category]]
      for(i in 1:(nrow(df)-1)){
        for(j in (i+1):nrow(df)){
          #------------------------------
          # Information content (IC)
          #------------------------------
          IC_i <- tmp[which(tmp$ID == df$ID[i]),]$IC
          IC_j <- tmp[which(tmp$ID == df$ID[j]),]$IC
          #------------------------------
          # IC of most informative common ancestor (MICA)
          #------------------------------
          ancestors_i <- tree[which(tree$child == df$ID[i]),]$parent
          ancestors_j <- tree[which(tree$child == df$ID[j]),]$parent
          common_ancestors <- intersect(ancestors_i, ancestors_j)
          common_ancestors <- data.frame(ID = common_ancestors, IC = NA)
          for(n in 1:nrow(common_ancestors))
            common_ancestors$IC[n] <- tmp[which(tmp$ID == common_ancestors$ID[n]),]$IC
          IC_MICA <- common_ancestors[order(common_ancestors$IC, decreasing = TRUE),]$IC[1]
          #------------------------------
          # Resnik, Lin
          #------------------------------
          if(measure == "Resnik")
            simmat[i,j] <- IC_MICA
          else if(measure == "Lin")
            simmat[i,j] <- 2 * IC_MICA / (IC_i + IC_j)
          else
            stop("Error: check the mesure.")
        }
      }
      simmat[lower.tri(simmat)] <- simmat[upper.tri(simmat)]
      diag(simmat) <- 1
      rownames(simmat) <- df$ID
      colnames(simmat) <- df$ID
      obj[[sign]][[slot_name]][[category]][["matrix"]] <- simmat
    }
    #==================================================
    # GO
    #==================================================
    if(data_type == "GO"){
      #------------------------------
      # mgoSim
      #------------------------------
      simdata <- godata(OrgDb = orgdb, ont = category, computeIC = TRUE)
      simmat <- mgoSim(df$ID, df$ID, semData = simdata, measure = measure, combine = NULL)
      obj[[sign]][[slot_name]][[category]][["matrix"]] <- simmat  # Note: class(simmat) is "matrix"
    }
  }

  return(obj)
}

#' Reduce biological terms
#'
#' Removes the biological terms based on the results of compute_SemSim_sign()
#'
#' @param obj ASURAT object.
#' @param data_type Database to use. ***Need clarification***
#' @param threshold Threshold for the ***Need clarification***.
#' @param keep_rareID ***Need clarification*** Bookdown instruction is unclear
#'
#' @return ASURAT object.
#' @export
#'
#' @examplesreduce_sign(obj = object, data_type = "GO",
#' threshold = 0.7, keep_rareID = TRUE)
reduce_sign <- function(obj, data_type, threshold, keep_rareID){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  slot_name <- paste(data_type, "_Sim", sep = "")
  category_names <- names(obj[[sign]][[data_type]])
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  text <- paste("reduce_", data_type, sep = "")
  obj[["history"]][[text]][["threshold"]] <- threshold
  obj[["history"]][[text]][["keep_rareID"]] <- keep_rareID
  #--------------------------------------------------
  # Remove IDs
  #--------------------------------------------------
  for(category in category_names){
    #--------------------------------------------------
    # Definition of report
    #--------------------------------------------------
    df <- obj[[sign]][[data_type]][[category]]
    if(nrow(df) == 0){
      next
    }
    simmat <- obj[[sign]][[slot_name]][[category]][["matrix"]]
    I <- dim(simmat)[1]
    if(I < 2){
      next
    }
    report <- data.frame(
      matrix(ncol = 11, nrow = 0, dimnames = list(NULL, c(
        "Similarity",
        "Kept_ID",
        "Removed_ID",
        "Kept_Description",
        "Removed_Description",
        "Kept_Count",
        "Removed_Count",
        "Kept_Gene",
        "Removed_Gene",
        "Kept_IC",
        "Removed_IC"
      ))))
    flag <- rep(0, dim(simmat)[1])
    #--------------------------------------------------
    #
    #--------------------------------------------------
    if(keep_rareID == TRUE){
      for(i in I:2){
        J <- dim(simmat)[2] - (I - i + 1)  # Must be set before the next command
        if(flag[i] == 1)                   # Must be set after the previous command
          next
        for(j in 1:J){
          if(is.na(simmat[i,j]))
            next
          if(abs(simmat[i,j]) >= threshold){
            ind_kept <- which(df$ID == rownames(simmat)[i])
            ind_removed <- which(df$ID == rownames(simmat)[j])
            report <- rbind(report, data.frame(
              Similarity = simmat[i,j],
              Kept_ID = df[ind_kept,]$ID,
              Removed_ID = df[ind_removed,]$ID,
              Kept_Description = df[ind_kept,]$Description,
              Removed_Description = df[ind_removed,]$Description,
              Kept_Count = df[ind_kept,]$Count_strg +
                df[ind_kept,]$Count_vari + df[ind_kept,]$Count_weak,
              Removed_Count = df[ind_removed,]$Count_strg +
                df[ind_removed,]$Count_vari + df[ind_removed,]$Count_weak,
              Kept_Gene = paste(df[ind_kept,]$Gene_strg, df[ind_kept,]$Gene_vari,
                                df[ind_kept,]$Gene_weak, sep = "||"),
              Removed_Gene = paste(df[ind_removed,]$Gene_strg, df[ind_removed,]$Gene_vari,
                                   df[ind_removed,]$Gene_weak, sep = "||"),
              Kept_IC = df[ind_kept,]$IC,
              Removed_IC = df[ind_removed,]$IC
            ))
            flag[j] <- 1
          }
        }
      }
    }else{
      I <- dim(simmat)[1] - 1
      J <- dim(simmat)[2]
      for(i in 1:I){
        j0 <- i + 1       # Must be set before the next command
        if(flag[i] == 1)  # Must be set after the previous command
          next
        for(j in j0:J){
          if(is.na(simmat[i,j]))
            next
          if(abs(simmat[i,j]) >= threshold){
            ind_kept <- which(df$ID == rownames(simmat)[i])
            ind_removed <- which(df$ID == rownames(simmat)[j])
            report <- rbind(report, data.frame(
              Similarity = simmat[i,j],
              Kept_ID = df[ind_kept,]$ID,
              Removed_ID = df[ind_removed,]$ID,
              Kept_Description = df[ind_kept,]$Description,
              Removed_Description = df[ind_removed,]$Description,
              Kept_Count = df[ind_kept,]$Count_strg +
                df[ind_kept,]$Count_vari + df[ind_kept,]$Count_weak,
              Removed_Count = df[ind_removed,]$Count_strg +
                df[ind_removed,]$Count_vari + df[ind_removed,]$Count_weak,
              Kept_Gene = paste(df[ind_kept,]$Gene_strg, df[ind_kept,]$Gene_vari,
                                df[ind_kept,]$Gene_weak, sep = "||"),
              Removed_Gene = paste(df[ind_removed,]$Gene_strg, df[ind_removed,]$Gene_vari,
                                   df[ind_removed,]$Gene_weak, sep = "||"),
              Kept_IC = df[ind_kept,]$IC,
              Removed_IC = df[ind_removed,]$IC
            ))
            flag[j] <- 1
          }
        }
      }
    }
    #--------------------------------------------------
    # Store the results
    #--------------------------------------------------
    obj[[sign]][[slot_name]][[category]][["report"]] <- report

    #--------------------------------------------------
    # Remove IDs
    #--------------------------------------------------
    if(length(report$Removed_ID) >= 1){
      kept_IDs <- setdiff(df$ID, report$Removed_ID)
      res <- df[which(df$ID %in% kept_IDs),]
      rownames(res) <- 1:nrow(res)
    }else{
      res <- df
    }
    obj[[sign]][[data_type]][[category]] <- res
  }

  return(obj)
}

#' Removes biological terms by specifying IDs or keywords
#'
#' @param obj ASURAT object.
#' @param data_type Database from which biological terms are to remove. ***Need clarification***
#' @param keywords Vector of strings? Example? ***Need clarification***
#'
#' @return ASURAT object.
#' @export
#'
#' @examples manual_curation_sign(obj = object, data_type = "GO", keywords = keywords)
manual_curation_sign <- function(obj, data_type, keywords){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  category_names <- names(obj[[sign]][[data_type]])
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  slot_name <- paste("manual_curation_", data_type, sep = "")
  obj[["history"]][[slot_name]][["keywords"]] <- keywords
  #--------------------------------------------------
  # manual curation
  #--------------------------------------------------
  for(category in category_names){
    df <- obj[[sign]][[data_type]][[category]]
    if(nrow(df) != 0){
      df <- df[!((grepl(keywords, df$ID)) | (grepl(keywords, df$Description))),]
      rownames(df) <- 1:nrow(df)
    }
    obj[[sign]][[data_type]][[category]] <- df
  }

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param keywords
#'
#' @return
#' @export
#'
#' @examples
manual_selection_sign <- function(obj, data_type, keywords){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  category_names <- names(obj[[sign]][[data_type]])
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  slot_name <- paste("manual_selection_", data_type, sep = "")
  obj[["history"]][[slot_name]][["keywords"]] <- keywords
  #--------------------------------------------------
  # manual curation
  #--------------------------------------------------
  for(category in category_names){
    df <- obj[[sign]][[data_type]][[category]]
    if(nrow(df) != 0){
      df <- df[(grepl(keywords, df$ID)) | (grepl(keywords, df$Description)),]
      rownames(df) <- 1:nrow(df)
    }
    obj[[sign]][[data_type]][[category]] <- df
  }

  return(obj)
}

#' Make sign by sample matrix
#'
#' @param obj ASURAT object.
#' @param data_type
#' @param weight_strg
#' @param weight_vari
#'
#' @return ASURAT object.
#' @export
#'
#' @examples make_signxsample_matrix(obj = object, data_type = "DO",
#' weight_strg = 0.5, weight_vari = 0.5)
make_signxsample_matrix <- function(
  obj, data_type, weight_strg = 0.5, weight_vari = 0.5
){
  #--------------------------------------------------
  # Error
  #--------------------------------------------------
  if((weight_strg < 0) | (weight_strg > 1) | (weight_vari < 0) | (weight_vari > 1)){
    stop("Error: weight_* must be from 0 to 1.")
  }
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  category_names <- names(obj[[sign]][[data_type]])
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  text <- paste("make_signxsample_matrix_", data_type, sep = "")
  obj[["history"]][[text]][["weight_strg"]] <- weight_strg
  obj[["history"]][[text]][["weight_vari"]] <- weight_vari
  #--------------------------------------------------
  # Sign-by-sample matrices
  #--------------------------------------------------
  mat <- obj[["data"]][["centered"]]
  for(category in category_names){
    res <- c()
    df <- obj[[sign]][[data_type]][[category]]
    if(nrow(df) != 0){
      for(i in 1:nrow(df)){
        #------------------------------
        # Strongly correlated gene set
        #------------------------------
        if(df$Count_strg[i] != 0){
          genes_strg <- unlist(strsplit(df$Gene_strg[i], "/"))
        }else{
          genes_strg <- NA
        }
        #------------------------------
        # Variably correlated gene set
        #------------------------------
        if(df$Count_vari[i] != 0){
          genes_vari <- unlist(strsplit(df$Gene_vari[i], "/"))
        }else{
          genes_vari <- NA
        }
        #------------------------------
        # Weakly correlated gene set
        #------------------------------
        if(df$Count_weak[i] != 0){
          genes_weak <- unlist(strsplit(df$Gene_weak[i], "/"))
        }else{
          genes_weak <- NA
        }
        #------------------------------
        # Sign-by-sample matrix
        #------------------------------
        if((df$Count_strg[i] >= 2) & (df$Count_vari[i] < 2) & (df$Count_weak[i] >= 2)){
          tmp_strg <- as.data.frame(mat[which(rownames(mat) %in% genes_strg),])
          tmp_weak <- as.data.frame(mat[which(rownames(mat) %in% genes_weak),])
          vec_strg <- weight_strg * apply(tmp_strg, 2, mean) +
            (1 - weight_strg) * apply(tmp_weak, 2, mean)
          tmp <- as.matrix(t(vec_strg))
          rownames(tmp) <- paste(df$ID[i], "_", "S", sep = "")
        }
        if((df$Count_strg[i] >= 2) & (df$Count_vari[i] >= 2) & (df$Count_weak[i] >= 2)){
          tmp_strg <- as.data.frame(mat[which(rownames(mat) %in% genes_strg),])
          tmp_vari <- as.data.frame(mat[which(rownames(mat) %in% genes_vari),])
          tmp_weak <- as.data.frame(mat[which(rownames(mat) %in% genes_weak),])
          vec_strg <- weight_strg * apply(tmp_strg, 2, mean) +
            (1 - weight_strg) * apply(tmp_weak, 2, mean)
          vec_vari <- weight_vari * apply(tmp_vari, 2, mean) +
            (1 - weight_vari) * apply(tmp_weak, 2, mean)
          tmp <- rbind(vec_strg, vec_vari)
          rownames(tmp) <- c(paste(df$ID[i], "_", "S", sep = ""), paste(df$ID[i], "_", "V", sep = ""))
        }
        res <- rbind(res, tmp)
      }
    }
    slot_name <- paste(data_type, "xSample", sep = "")
    obj[[sign]][[slot_name]][[category]] <- as.data.frame(res)
  }

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#'
#' @return
#' @export
#'
#' @examples
do_pca_sign <- function(obj, data_type, category){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  slot_name <- paste(data_type, "xSample", sep = "")
  mat <- as.matrix(obj[[sign]][[slot_name]][[category]])
  #--------------------------------------------------
  # PCA
  #--------------------------------------------------
  set.seed(8)
  obj[["reduction"]][["pca"]][[data_type]][[category]] <- prcomp(t(mat), scale = FALSE)

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#' @param pca_dim
#' @param tsne_dim
#'
#' @return
#' @export
#'
#' @examples
do_tsne_sign <- function(obj, data_type, category, pca_dim, tsne_dim){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  slot_name <- paste(data_type, "xSample", sep = "")
  #--------------------------------------------------
  # tSNE
  #--------------------------------------------------
  set.seed(8)
  if(is.null(pca_dim)){
    mat <- as.matrix(obj[[sign]][[slot_name]][[category]])
    obj[["reduction"]][["tsne"]][[data_type]][[category]] <-
      Rtsne(t(mat), dim = tsne_dim, pca = FALSE)
  }else{
    mat <- obj[["reduction"]][["pca"]][[data_type]][[category]][["x"]]
    obj[["reduction"]][["tsne"]][[data_type]][[category]] <-
      Rtsne(mat[,1:pca_dim], dim = tsne_dim, pca = FALSE)
  }

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#' @param pca_dim
#' @param umap_dim
#'
#' @return
#' @export
#'
#' @examples
do_umap_sign <- function(obj, data_type, category, pca_dim, umap_dim){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  slot_name <- paste(data_type, "xSample", sep = "")
  #--------------------------------------------------
  # UMAP
  #--------------------------------------------------
  set.seed(8)
  if(is.null(pca_dim)){
    mat <- as.matrix(obj[[sign]][[slot_name]][[category]])
    obj[["reduction"]][["umap"]][[data_type]][[category]] <-
      umap(t(mat), n_components = umap_dim)
  }else{
    mat <- obj[["reduction"]][["pca"]][[data_type]][[category]][["x"]]
    obj[["reduction"]][["umap"]][[data_type]][[category]] <-
      umap(mat[,1:pca_dim], n_components = umap_dim)
  }

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#' @param sigma
#' @param distance
#' @param pca_dim
#'
#' @return
#' @export
#'
#' @examples
do_DiffusionMap_sign <- function(
  obj, data_type, category, sigma = "local", distance = "euclidean", pca_dim
){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  algo_name <- "dmap"
  sign <- "sign"
  text <- paste(data_type, "xSample", sep = "")
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  obj[["history"]][["doDiffusionMap"]][[data_type]][[category]][["sigma"]] <- sigma
  obj[["history"]][["doDiffusionMap"]][[data_type]][[category]][["distance"]] <- distance
  #--------------------------------------------------
  # DiffusionMap
  #--------------------------------------------------
  set.seed(8)
  if(is.null(pca_dim)){
    mat <- as.matrix(obj[[sign]][[text]][[category]])
    res <- DiffusionMap(t(mat), sigma = sigma, distance = distance)
  }else{
    mat <- obj[["reduction"]][["pca"]][[data_type]][[category]][["x"]]
    res <- DiffusionMap(mat[,1:pca_dim], sigma = sigma, distance = distance)
  }
  #--------------------------------------------------
  # Store the results
  #--------------------------------------------------
  obj[["reduction"]][[algo_name]][[data_type]][[category]] <- res

  return(obj)
}

#' Perform sample classification using Partitioning Around Medoids (PAM)
#'
#' @param obj ASURAT object.
#' @param data_type
#' @param category
#' @param k
#'
#' @return ASURAT object.
#' @export
#'
#' @examples classify_samples_pam_sign(
#   obj = object, data_type = "DO",category = "disease", k = 4)
classify_samples_pam_sign <- function(obj, data_type, category, k){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  algo_name <- "pam"
  sign <- "sign"
  text <- paste(data_type, "xSample", sep = "")
  mat <- as.matrix(obj[[sign]][[text]][[category]])
  slot_name <- paste(algo_name, "_", data_type, "_", category, sep = "")
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  text <- paste("classify_samples_", algo_name, "_", data_type, "_", category, sep = "")
  obj[["history"]][[text]][["k"]] <- k
  #--------------------------------------------------
  # PAM
  #--------------------------------------------------
  set.seed(8)
  res <- pam(x = t(mat), k = k)
  #--------------------------------------------------
  # Store the results
  #--------------------------------------------------
  obj[["classification"]][[algo_name]][[data_type]][[category]] <- res
  obj[["sample"]][[slot_name]] <- as.integer(res$cluster)

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#' @param method
#' @param k
#'
#' @return
#' @export
#'
#' @examples
classify_samples_hclustCutree_sign <- function(obj, data_type, category, method, k){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  algo_name <- "hclustCutree"
  sign <- "sign"
  text <- paste(data_type, "xSample", sep = "")
  mat <- as.matrix(obj[[sign]][[text]][[category]])
  slot_name <- paste(algo_name, "_", data_type, "_", category, sep = "")
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  text <- paste("classify_samples_", algo_name, "_", data_type, "_", category, sep = "")
  obj[["history"]][[text]][["method"]] <- method
  obj[["history"]][[text]][["k"]] <- k
  #--------------------------------------------------
  # hclust and cutree
  #--------------------------------------------------
  set.seed(8)
  hc <- hclust(dist(t(mat)), method = method)
  res <- cutree(hc, k = k)
  #--------------------------------------------------
  # Store the results
  #--------------------------------------------------
  obj[["classification"]][[algo_name]][[data_type]][[category]][["hclust"]] <- hc
  obj[["classification"]][[algo_name]][[data_type]][[category]][["cutree"]] <- res
  obj[["sample"]][[slot_name]] <- as.integer(res)

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#' @param reduction
#' @param dims
#' @param k.param
#' @param prune.SNN
#' @param resolution
#' @param algorithm
#'
#' @return
#' @export
#'
#' @examples
classify_samples_seuratFindClusters_sign <- function(
  obj, data_type, category,
  reduction = "pca", dims, k.param, prune.SNN, resolution, algorithm
){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  algo_name <- "seuratFindClusters"
  sign <- "sign"
  text <- paste(data_type, "xSample", sep = "")
  mat <- as.matrix(obj[[sign]][[text]][[category]])
  slot_name <- paste(algo_name, "_", data_type, "_", category, sep = "")
  #--------------------------------------------------
  # Error control
  #--------------------------------------------------
  if(length(dims) > nrow(obj[[sign]][[text]][[category]])){
    stop("Error: \"dims\" must be less than the number of rows of the sign-by-sample matrix.")
  }
  #--------------------------------------------------
  # History
  #--------------------------------------------------
  text <- paste("classify_samples_", algo_name, "_", data_type, "_", category, sep = "")
  obj[["history"]][[text]][["reduction"]] <- reduction
  obj[["history"]][[text]][["dims"]] <- dims
  obj[["history"]][[text]][["k.param"]] <- k.param
  obj[["history"]][[text]][["prune.SNN"]] <- prune.SNN
  obj[["history"]][[text]][["resolution"]] <- resolution
  obj[["history"]][[text]][["algorithm"]] <- algorithm
  #--------------------------------------------------
  # Set Seurat object
  #--------------------------------------------------
  res <- CreateSeuratObject(
    counts = mat, project = obj[["history"]][["make_asurat_obj"]][["obj_name"]])
  res <- ScaleData(res, features = rownames(res))
  res <- RunPCA(res, features = rownames(res))
  #--------------------------------------------------
  # FindNeighbors
  #--------------------------------------------------
  set.seed(8)
  res <- FindNeighbors(
    res, reduction = reduction, dims = dims, k.param = k.param, prune.SNN = prune.SNN)
  #--------------------------------------------------
  # FindClusters
  #--------------------------------------------------
  res <- FindClusters(res, resolution = resolution, algorithm = algorithm)

  #--------------------------------------------------
  # Store the results
  #--------------------------------------------------
  obj[["classification"]][[algo_name]][[data_type]][[category]] <- res
  obj[["sample"]][[slot_name]] <- as.integer(as.integer(as.character(Idents(res))) + 1)

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#' @param dims
#' @param NEndpoints
#' @param random_seed
#'
#' @return
#' @export
#'
#' @examples
do_CalculateScaffoldTree_dmap <- function(
  obj, data_type, category, dims, NEndpoints, random_seed
){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  algo_name <- "merlot"
  #--------------------------------------------------
  # CalculateScaffoldTree
  #--------------------------------------------------
  tmp <- CalculateScaffoldTree(
    CellCoordinates = obj[["reduction"]][["dmap"]][[data_type]][[category]]@eigenvectors[,dims],
    NEndpoints = NEndpoints, random_seed
  )
  obj[["classification"]][[algo_name]][[data_type]][[category]][["ScaffoldTree"]] <- tmp

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#' @param N_yk
#'
#' @return
#' @export
#'
#' @examples
do_CalculateElasticTree_dmap <- function(obj, data_type, category, N_yk){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  algo_name <- "merlot"
  #--------------------------------------------------
  # CalculateElasticTree
  #--------------------------------------------------
  tmp <- CalculateElasticTree(
    ScaffoldTree = obj[["classification"]][[algo_name]][[data_type]][[category]][["ScaffoldTree"]],
    N_yk = N_yk
  )
  obj[["classification"]][[algo_name]][[data_type]][[category]][["ElasticTree"]] <- tmp

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#'
#' @return
#' @export
#'
#' @examples
do_SignSpaceEmbedding_dmap <- function(obj, data_type, category){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  algo_name <- "merlot"
  slot_name <- paste(data_type, "xSample", sep = "")
  mat <- t(obj[["sign"]][[slot_name]][[category]])
  ept <- obj[["classification"]][[algo_name]][[data_type]][[category]][["ElasticTree"]]
  #--------------------------------------------------
  # GenesSpaceEmbedding
  #--------------------------------------------------
  tmp <- GenesSpaceEmbedding(ExpressionMatrix = mat, ElasticTree = ept)
  obj[["classification"]][[algo_name]][[data_type]][[category]][["EmbeddedTree"]] <- tmp

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#' @param T0
#'
#' @return
#' @export
#'
#' @examples
do_CalculatePseudotimes_dmap <- function(obj, data_type, category, T0){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  algo_name <- "merlot"
  #--------------------------------------------------
  # CalculatePseudotimes
  #--------------------------------------------------
  tmp <- CalculatePseudotimes(
    InputTree = obj[["classification"]][[algo_name]][[data_type]][[category]][["EmbeddedTree"]],
    T0 = T0
  )
  obj[["classification"]][[algo_name]][[data_type]][[category]][["Pseudotimes"]] <- tmp

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type
#' @param category
#'
#' @return
#' @export
#'
#' @examples
classify_samples_merlot_sign <- function(obj, data_type, category){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  algo_name <- "merlot"
  sign <- "sign"
  text <- paste(data_type, "xSample", sep = "")
  mat <- as.matrix(obj[[sign]][[text]][[category]])
  slot_name <- paste(algo_name, "_", data_type, "_", category, sep = "")
  #--------------------------------------------------
  # Store the results
  #--------------------------------------------------
  res <- obj[["classification"]][[algo_name]][[data_type]][[category]][["Pseudotimes"]][["Cells2Branches"]]
  obj[["sample"]][[slot_name]] <- as.integer(res)

  return(obj)
}

compute_nswap_001 <-
  "int compute_nswap_001(NumericVector enco){
  int  swaps = 0;

  for(int i=0; i<(enco.length()-1); ++i){
    for(int j=i+1; j<enco.length(); ++j){
      if(enco[i] > enco[j])
        swaps++;
    }
  }
  return(swaps);
}"
Rcpp::cppFunction(compute_nswap_001)
#--------------------------------------------------------------------------------
# This function computes the number of swaps of adjacent elements required for
# transforming vector 1 to vector 2, which have the same length and same amount
# of given numbers (e.g., vector1 = c(0,2,1), vector2 = c(1,2,0)).
# The following R code is inspired by Kendall tau distance algorithm.
#--------------------------------------------------------------------------------

#' Title
#'
#' @param vec1
#' @param vec2
#'
#' @return
#' @export
#'
#' @examples
compute_nswaps <- function(vec1, vec2){
  I <- length(vec1)

  # (1)
  dict <- c()
  for(w in unique(vec1)){
    val <- which(vec1 == w)
    dict <- c(dict, list(val))
  }
  names(dict) <- unique(vec1)

  # (2)
  cnt <- c()
  for(c in names(dict)){
    cnt <- c(cnt, list(1))
  }
  names(cnt) <- names(dict)
  enco <- vec2
  for(i in 1:I){
    enco[i] <- dict[[as.character(vec2[i])]][cnt[[as.character(vec2[i])]]]
    cnt[[as.character(vec2[i])]] <- cnt[[as.character(vec2[i])]] + 1
  }

  # (3): since this step is time consuming, we utilize `002_codes/utilities.cpp`
  swaps <- compute_nswap_001(enco)

  return(swaps)
}

#' Title
#'
#' @param obj
#' @param data_type_for_label
#' @param category_for_label
#' @param algo_name_for_label
#' @param data_type_for_expr
#' @param category_for_expr
#' @param labels_1
#' @param labels_2
#'
#' @return
#' @export
#'
#' @examples
find_marker_sign <- function(
  obj, data_type_for_label, category_for_label, algo_name_for_label,
  data_type_for_expr, category_for_expr, labels_1, labels_2
){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  sign <- "sign"
  #--------------------------------------------------
  # Divide samples
  #--------------------------------------------------
  slot_name <- paste(algo_name_for_label, "_", data_type_for_label, "_", category_for_label, sep = "")
  tmp <- obj[["sample"]][[slot_name]]
  inds_1 <- which(tmp %in% labels_1)
  inds_2 <- which(tmp %in% labels_2)
  pop_1 <- obj[["sample"]][["barcode"]][inds_1]
  pop_2 <- obj[["sample"]][["barcode"]][inds_2]
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  slot_name <- paste(data_type_for_expr, "xSample", sep = "")
  mat <- as.matrix(obj[[sign]][[slot_name]][[category_for_expr]])
  mat <- mat[,union(inds_1, inds_2)]
  tmp <- obj[[sign]][[data_type_for_expr]][[category_for_expr]]
  #--------------------------------------------------
  # Prepare a data frame to be output
  #--------------------------------------------------
  res <- data.frame(
    Label = as.factor(paste(labels_1, collapse = "/")),
    Rank = rep(NA, dim(mat)[1]),
    Sign = NA,
    ParentDescription = NA,
    Gene = NA,
    sep_I = NA
  )
  for(i in 1:dim(mat)[1]){
    #--------------------------------------------------
    # Sign
    #--------------------------------------------------
    res$Sign[i] <- rownames(mat)[i]
    #--------------------------------------------------
    # ParentDescription
    #--------------------------------------------------
    res$ParentDescription[i] <-
      tmp[which(tmp$ID == strsplit(res$Sign[i], split='_')[[1]][1]),]$Description
    #--------------------------------------------------
    # Gene
    #--------------------------------------------------
    if(strsplit(res$Sign[i], split='_')[[1]][2] == "S"){
      res$Gene[i] <- tmp[which(tmp$ID == strsplit(res$Sign[i], split='_')[[1]][1]),]$Gene_strg
    }else{
      res$Gene[i] <- tmp[which(tmp$ID == strsplit(res$Sign[i], split='_')[[1]][1]),]$Gene_vari
    }
    #--------------------------------------------------
    # Edit distance between two (0,1)-vectors
    #--------------------------------------------------
    data <- mat[i,]
    vec_1 <- sort(data, decreasing = FALSE)
    vec_1 <- ifelse(names(vec_1) %in% pop_1, 1, 0)
    vec_2 <- sort(vec_1, decreasing = FALSE) # vec_3 = (0, 0, ..., 1, 1)
    vec_3 <- sort(vec_1, decreasing = TRUE)  # vec_2 = (1, 1, ..., 0, 0)
    dist1 <- compute_nswaps(vec_1, vec_2)
    dist2 <- compute_nswaps(vec_1, vec_3)
    res$sep_I[i] <- round(1 - 2 * dist1 / (dist1 + dist2), digits = 6)
  }
  #--------------------------------------------------
  # Arrange the data frame in order of res$sep_I
  #--------------------------------------------------
  inds <- order(res$sep_I, decreasing = TRUE)
  res <- res[inds,]
  res$Rank <- 1:length(inds)
  #--------------------------------------------------
  # Output
  #--------------------------------------------------
  slot_name <- paste(
    "Label_", data_type_for_label, "_", category_for_label, "_",
    paste(labels_1, collapse = "/"), "_vs_", paste(labels_2, collapse = "/"), sep = ""
  )
  obj[["marker"]][[data_type_for_label]][[category_for_label]][[slot_name]] <- res
  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type_for_label
#' @param category_for_label
#' @param algo_name_for_label
#' @param data_type_for_expr
#' @param category_for_expr
#'
#' @return
#' @export
#'
#' @examples
auto_find_marker_sign <- function(obj, data_type_for_label, category_for_label,
                                  algo_name_for_label, data_type_for_expr, category_for_expr
){
  #--------------------------------------------------
  # Initialize
  #--------------------------------------------------
  obj[["marker"]][[data_type_for_label]][[category_for_label]] <- NULL
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  slot_name <- paste(algo_name_for_label, "_", data_type_for_label, "_",
                     category_for_label, sep = "")
  labels <- unique(sort(obj[["sample"]][[slot_name]]))
  #--------------------------------------------------
  # Looping
  #--------------------------------------------------
  res <- c()
  for(i in labels){
    labels_1 <- i
    labels_2 <- setdiff(labels, i)
    tmp <- find_marker_sign(
      obj = obj, data_type_for_label = data_type_for_label,
      category_for_label = category_for_label,
      algo_name_for_label = algo_name_for_label,
      data_type_for_expr = data_type_for_expr,
      category_for_expr = category_for_expr,
      labels_1 = labels_1, labels_2 = labels_2)
    slot_name <- paste(
      "Label_", data_type_for_label, "_", category_for_label, "_",
      paste(labels_1, collapse = "/"), "_vs_", paste(labels_2, collapse = "/"), sep = ""
    )
    res[[slot_name]] <- tmp[["marker"]][[data_type_for_label]][[category_for_label]][[slot_name]]
  }
  res[["all"]] <- c()
  for(name in names(res)){
    res[["all"]] <- rbind(res[["all"]], res[[name]])
  }
  rownames(res[["all"]]) <- 1:nrow(res[["all"]])
  #--------------------------------------------------
  # Output
  #--------------------------------------------------
  obj[["marker"]][[data_type_for_label]][[category_for_label]] <- res

  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type_for_label
#' @param category_for_label
#' @param algo_name_for_label
#' @param labels_1
#' @param labels_2
#' @param parametric
#'
#' @return
#' @export
#'
#' @examples
find_marker_gene <- function(
  obj, data_type_for_label, category_for_label, algo_name_for_label,
  labels_1, labels_2, parametric
){
  #--------------------------------------------------
  # Divide samples
  #--------------------------------------------------
  slot_name <- paste(algo_name_for_label, "_", data_type_for_label, "_", category_for_label, sep = "")
  tmp <- obj[["sample"]][[slot_name]]
  inds_1 <- which(tmp %in% labels_1)
  inds_2 <- which(tmp %in% labels_2)
  pop_1 <- obj[["sample"]][["barcode"]][inds_1]
  pop_2 <- obj[["sample"]][["barcode"]][inds_2]
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  mat <- as.matrix(obj[["data"]][["log1p"]])
  #--------------------------------------------------
  # Error control
  #--------------------------------------------------
  if((length(inds_1) <= 1) || (length(inds_2) <= 1)){
    stop("Error: one of the subpopulations is too small.")
  }
  #--------------------------------------------------
  # Prepare a data frame to be output
  #--------------------------------------------------
  res <- data.frame(
    Label = as.factor(paste(labels_1, collapse = "/")),
    Rank = rep(NA, dim(mat)[1]),
    Gene = NA,
    p_val = NA,
    p_adj = NA
  )
  if(parametric){
    for(i in 1:dim(mat)[1]){
      #--------------------------------------------------
      # Gene
      #--------------------------------------------------
      res$Gene[i] <- rownames(mat)[i]
      #--------------------------------------------------
      # p_val
      #--------------------------------------------------
      values_1 <- mat[, inds_1][i,]
      values_2 <- mat[, inds_2][i,]
      res$p_val[i] <- t.test(x = values_1, y = values_2,
                             var.equal = FALSE, paired = FALSE)[["p.value"]]
    }
  }else{
    for(i in 1:dim(mat)[1]){
      #--------------------------------------------------
      # Gene
      #--------------------------------------------------
      res$Gene[i] <- rownames(mat)[i]
      #--------------------------------------------------
      # p_val
      #--------------------------------------------------
      values_1 <- mat[, inds_1][i,]
      values_2 <- mat[, inds_2][i,]
      res$p_val[i] <- wilcox.exact(x = values_1, y = values_2,
                                   paired = FALSE)[["p.value"]]
    }
  }
  #--------------------------------------------------
  # p_adj
  #--------------------------------------------------
  res$p_adj <- p.adjust(res$p_val, method = "BH")
  #--------------------------------------------------
  # Arrange the data frame in order of res$p_val
  #--------------------------------------------------
  inds <- order(res$p_val, decreasing = FALSE)
  res <- res[inds,]
  res$Rank <- 1:length(inds)
  #--------------------------------------------------
  # Output
  #--------------------------------------------------
  slot_name <- paste(
    "Label_", data_type_for_label, "_", category_for_label, "_",
    paste(labels_1, collapse = "/"), "_vs_", paste(labels_2, collapse = "/"), sep = ""
  )
  obj[["marker"]][["gene"]][[data_type_for_label]][[category_for_label]][[slot_name]] <- res
  return(obj)
}

#' Title
#'
#' @param obj
#' @param data_type_for_label
#' @param category_for_label
#' @param algo_name_for_label
#' @param parametric
#'
#' @return
#' @export
#'
#' @examples
auto_find_marker_gene <- function(
  obj, data_type_for_label, category_for_label, algo_name_for_label, parametric
){
  #--------------------------------------------------
  # Initialize
  #--------------------------------------------------
  obj[["marker"]][["gene"]][[data_type_for_label]][[category_for_label]] <- NULL
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  slot_name <- paste(
    algo_name_for_label, "_", data_type_for_label, "_", category_for_label,
    sep = ""
  )
  labels <- unique(sort(obj[["sample"]][[slot_name]]))
  #--------------------------------------------------
  # Looping
  #--------------------------------------------------
  res <- c()
  for(i in labels){
    labels_1 <- i
    labels_2 <- setdiff(labels, i)
    tmp <- find_marker_gene(
      obj = obj, data_type_for_label = data_type_for_label,
      category_for_label = category_for_label,
      algo_name_for_label = algo_name_for_label,
      labels_1 = labels_1, labels_2 = labels_2,
      parametric = parametric
    )
    slot_name <- paste(
      "Label_", data_type_for_label, "_", category_for_label, "_",
      paste(labels_1, collapse = "/"), "_vs_", paste(labels_2, collapse = "/"), sep = ""
    )
    res[[slot_name]] <-
      tmp[["marker"]][["gene"]][[data_type_for_label]][[category_for_label]][[slot_name]]
  }
  res[["all"]] <- c()
  for(name in names(res)){
    res[["all"]] <- rbind(res[["all"]], res[[name]])
  }
  rownames(res[["all"]]) <- 1:nrow(res[["all"]])
  #--------------------------------------------------
  # Output
  #--------------------------------------------------
  obj[["marker"]][["gene"]][[data_type_for_label]][[category_for_label]] <- res

  return(obj)
}

#' Title
#'
#' @param obj1
#' @param obj2
#'
#' @return
#' @export
#'
#' @examples
concatenate_obj_sign <- function(obj1, obj2){
  #--------------------------------------------------
  # history
  #--------------------------------------------------
  hist_names_1 <- names(obj1[["history"]])
  hist_names_2 <- names(obj2[["history"]])
  for(names in setdiff(hist_names_2, hist_names_1)){
    obj1[["history"]][[names]] <- obj2[["history"]][[names]]
  }
  #--------------------------------------------------
  # sample
  #--------------------------------------------------
  samp_names_1 <- names(obj1[["sample"]])
  samp_names_2 <- names(obj2[["sample"]])
  for(name in setdiff(samp_names_2, samp_names_1)){
    obj1[["sample"]][[name]] <- obj2[["sample"]][[name]]
  }
  #--------------------------------------------------
  # misc
  #--------------------------------------------------
  misc_names_1 <- names(obj1[["misc"]])
  misc_names_2 <- names(obj2[["misc"]])
  for(name in setdiff(misc_names_2, misc_names_1)){
    obj1[["misc"]][[name]] <- obj2[["misc"]][[name]]
  }
  #--------------------------------------------------
  # sign
  #--------------------------------------------------
  sign_names_1 <- names(obj1[["sign"]])
  sign_names_2 <- names(obj2[["sign"]])
  for(name in setdiff(sign_names_2, sign_names_1)){
    obj1[["sign"]][[name]] <- obj2[["sign"]][[name]]
  }
  #--------------------------------------------------
  # classification
  #--------------------------------------------------
  classification_names_1 <- names(obj1[["classification"]])
  classification_names_2 <- names(obj2[["classification"]])
  classification_names <- union(classification_names_1, classification_names_2)
  for(name in classification_names){
    if(length(obj2[["classification"]][[name]]) != 0){
      category_names <- names(obj2[["classification"]][[name]])
      for(cname in category_names){
        obj1[["classification"]][[name]][[cname]] <-
          obj2[["classification"]][[name]][[cname]]
      }
    }
  }
  #--------------------------------------------------
  # reduction
  #--------------------------------------------------
  reduction_names_1 <- names(obj1[["reduction"]])
  reduction_names_2 <- names(obj2[["reduction"]])
  reduction_names <- union(reduction_names_1, reduction_names_2)
  for(name in reduction_names){
    if(length(obj2[["reduction"]][[name]]) != 0){
      category_names <- names(obj2[["reduction"]][[name]])
      for(cname in category_names){
        obj1[["reduction"]][[name]][[cname]] <- obj2[["reduction"]][[name]][[cname]]
      }
    }
  }
  #--------------------------------------------------
  # marker
  #--------------------------------------------------
  marker_names_1 <- names(obj1[["marker"]])
  marker_names_2 <- names(obj2[["marker"]])
  marker_names <- union(marker_names_1, marker_names_2)
  for(name in marker_names){
    if(length(obj2[["marker"]][[name]]) != 0){
      category_names <- names(obj2[["marker"]][[name]])
      for(cname in category_names){
        obj1[["marker"]][[name]][[cname]] <- obj2[["marker"]][[name]][[cname]]
      }
    }
  }

  return(obj1)
}
johannesnicolaus/ASURAT_source documentation built on Dec. 21, 2021, 2:11 a.m.