R/import_functions.R
In AnanseSeurat: Construct ANANSE GRN-Analysis Seurat

Documented in import_seurat_maelstrom import_seurat_scANANSE per_cluster_df

#' import_seurat_scANANSE
#'
#' import the influences from a anansnake directory into a seurat object
#' @param seurat_object seurat object
#' @param cluster_id ID used for finding clusters of cells
#' @param anansnake_inf_dir influence directory generated by anansnake
#' @param return_df return both the seurat object and a dataframe with influence scores as a list
#' @return seurat object with the influence scores addes as an assay
#' @examples
#' sce_small <- readRDS(system.file("extdata","sce_small.Rds",package = 'AnanseSeurat'))
#' infdir <- system.file("extdata","influence",package = 'AnanseSeurat')
#' sce_small <- import_seurat_scANANSE(sce_small, anansnake_inf_dir = infdir)
#' @export
import_seurat_scANANSE <- function(seurat_object,
                                   cluster_id = 'seurat_clusters',
                                   anansnake_inf_dir = 'None',
                                   return_df = FALSE) {
  files <-
    list.files(
      path = anansnake_inf_dir,
      pattern = "*.tsv",
      full.names = TRUE,
      recursive = FALSE
    )
  GRN_files <-
    list.files(
      path = anansnake_inf_dir,
      pattern = "_diffnetwork.tsv",
      full.names = TRUE,
      recursive = FALSE
    )
  if (length(GRN_files) == 0) {
    warning(paste0(
      paste0(
        'Error: no _diffnetwork.tsv files found in influence dir ',
        anansnake_inf_dir
      )
    ), ' \n')
    warning('double check the location of the influence dir provided in anansnake_inf_dir \n')
    stop()
  }
  files <- files[!files %in% GRN_files]
  
  influence_scores_avg <- list()
  influence_targets <- list()
  i <- 1
  
  for (file in files) {
    cell_target <- stringr::str_split(basename(file), "_")[[1]][2]
    cell_source <- stringr::str_split(basename(file), "_")[[1]][3]
    cell_source <- stringr::str_replace(cell_source, '.tsv', '')
    in_df <- utils::read.table(file, header = TRUE)
    in_df <- in_df[c('factor', 'influence_score')]
    colnames(in_df) <- c('factor', cell_target)
    if (cell_source == 'average') {
      influence_scores_avg[[i]] <- as.data.frame(in_df)
      influence_targets[[i]] <- cell_target
      i <- i + 1
    }
  }
  avg_df <-
    influence_scores_avg %>% purrr::reduce(dplyr::full_join, by = "factor", copy = T)
  avg_df[is.na(avg_df)] <- 0
  rownames(avg_df) <- avg_df$factor
  avg_df$factor <- NULL
  
  #add the TF intensities to the seurat object
  TF_df <- t(avg_df)
  cell_signal_list <- list()
  i <- 1
  #get cell IDs from the pbmcs
  for (cell_type in rownames(TF_df)) {
    cells_in_seurat <- seurat_object[[cluster_id]] == cell_type
    if (TRUE %in% cells_in_seurat) {
      relevant_IDS <-
        seurat_object[, seurat_object[[cluster_id]] == cell_type][[cluster_id]]
      TF_signal <- TF_df[rownames(TF_df) == cell_type, ]
      TF_signal <- as.data.frame(TF_signal)
      TF_signal <- t(TF_signal)
      TF_signal <-
        TF_signal[rep(seq_len(nrow(TF_signal)), each = nrow(relevant_IDS)),]
      rownames(TF_signal) <- rownames(as.data.frame(relevant_IDS))
      cell_signal_list[[i]] <-
        TF_signal
    } else{
      warning(paste0(
        paste0('no cells of id ', cell_type),
        ' found in seurat object'
      ))
    }
    i <- i + 1
  }
  TF_array <- do.call("rbind", cell_signal_list)
  
  #add cell IDs that do not have ANANSE signal and give them a NA vallue
  cell_barcodes_all <-
    rownames(as.data.frame(Seurat::Idents(object = seurat_object)))
  cell_barcodes_missing <-
    cell_barcodes_all[!cell_barcodes_all %in% rownames(TF_array)]
  cell_barcodes_missing <- as.data.frame(cell_barcodes_missing)
  if (length(cell_barcodes_missing > 0)) {
    message('adding cells with no influence scores with NA')
    for (TF in colnames(TF_signal)) {
      cell_barcodes_missing[[TF]] <- NA
    }
    rownames(cell_barcodes_missing) <-
      cell_barcodes_missing$cell_barcodes_missing
    cell_barcodes_missing$cell_barcodes_missing <- NULL
    TF_output <- t(rbind(TF_array, cell_barcodes_missing))
  }
  else{
    TF_output <- t(TF_array)
  }
  seurat_object[['influence']] <-
    Seurat::CreateAssayObject(TF_output)
  if (return_df) {
    return(list(seurat_object, avg_df))
  }
  else{
    return(seurat_object)
  }
}

#' import_seurat_Maelstrom
#'
#' load Maelstrom enriched motifs
#' @param seurat_object object
#' @param cluster_id ID used for finding clusters of cells
#' @param maelstrom_file maelstrom final.out.txt file
#' @param return_df return both the seurat object and a dataframe with maelstrom scores as a list
#' @return seurat object with the maelstrom motif scores addes as an assay
#' @examples
#' sce_small <- readRDS(system.file("extdata","sce_small.Rds",package = 'AnanseSeurat'))
#' maelstromfile_path <- system.file("extdata","maelstrom","final.out.txt",package = 'AnanseSeurat')
#' sce_small <- import_seurat_maelstrom(sce_small, maelstrom_file = maelstromfile_path)
#' @export
import_seurat_maelstrom <- function(seurat_object,
                                    cluster_id = 'seurat_clusters',
                                    maelstrom_file = '~/final.out.txt',
                                    return_df = FALSE) {
  maelstrom_df <-
    utils::read.table(
      maelstrom_file,
      header = TRUE,
      row.names = 1,
      sep = '\t',
      check.names = FALSE
    )
  rownames(maelstrom_df) <- gsub('_', '-', rownames(maelstrom_df))
  maelstrom_Zscore <-
    maelstrom_df[grep("z-score ", colnames(maelstrom_df))]
  maelstrom_corr <-
    maelstrom_df[grep("corr ", colnames(maelstrom_df))]
  #add the motif intensities to the seurat object
  motif_df <- t(maelstrom_Zscore)
  cell_signal_list <- list()
  rownames(motif_df) <-
    mapply(
      gsub,
      pattern = "z-score ",
      x = rownames(motif_df),
      replacement = ''
    )
  i <- 1
  
  #get cell IDs from the single cell object
  for (cell_type in rownames(motif_df)) {
    #lets check if any cells with this annotation are present in the seurat object
    cells_in_seurat <- seurat_object[[cluster_id]] == cell_type
    if (TRUE %in% cells_in_seurat) {
      relevant_IDS <-
        seurat_object[, seurat_object[[cluster_id]] == cell_type][[cluster_id]]
      TF_signal <- motif_df[rownames(motif_df) == cell_type, ]
      TF_signal <- as.data.frame(TF_signal)
      TF_signal <- t(TF_signal)
      TF_signal <-
        TF_signal[rep(seq_len(nrow(TF_signal)), each = nrow(relevant_IDS)),]
      rownames(TF_signal) <- rownames(as.data.frame(relevant_IDS))
      cell_signal_list[[i]] <-
        TF_signal
    } else{
      warning(paste0(
        paste0('no cells of id ', cell_type),
        ' found in seurat object'
      ))
    }
    i <- i + 1
  }
  TF_array <- do.call("rbind", cell_signal_list)
  #add cell IDs that do not have Maelstrom signal and give them a NA vallue
  cell_barcodes_all <-
    rownames(as.data.frame(Seurat::Idents(object = seurat_object)))
  cell_barcodes_missing <-
    cell_barcodes_all[!cell_barcodes_all %in% rownames(TF_array)]
  if (length(cell_barcodes_missing) > 0) {
    cell_barcodes_missing <- as.data.frame(cell_barcodes_missing)
    for (TF in colnames(TF_signal)) {
      cell_barcodes_missing[[TF]] <- NA
    }
    rownames(cell_barcodes_missing) <-
      cell_barcodes_missing$cell_barcodes_missing
    cell_barcodes_missing$cell_barcodes_missing <- NULL
    TF_array <- rbind(TF_array, cell_barcodes_missing)
  }
  seurat_object[['maelstrom']] <-
    Seurat::CreateAssayObject(t(TF_array))
  motif_df <- t(motif_df)
  if (return_df) {
    return(list(seurat_object, motif_df))
  }
  else{
    return(seurat_object)
  }
}

#' per_cluster_df
#'
#' generate a table of the assay score averages per cluster identifier cell
#' @param seurat_object seurat object
#' @param assay assay containing influence or motif scores generated from cluster pseudobulk
#' @param cluster_id ID used for finding clusters of cells
#' @return dataframe with assay scores, concatinating cells from each per cluster
#' @examples
#' sce_small <- readRDS(system.file("extdata","sce_small.Rds",package = 'AnanseSeurat'))
#' df <- per_cluster_df(sce_small)
#' @export
per_cluster_df <- function(seurat_object,
                           assay = 'influence',
                           cluster_id = 'seurat_clusters') {
  Seurat::Idents(seurat_object) <- cluster_id
  #make a dataframe with the values per cluster:
  clusters <- unique(seurat_object[[cluster_id]])
  
  #check if assay exists
  if (is.null(seurat_object@assays[[assay]])) {
    stop(paste0('assay ', assay, ' not found in the seurat object '))
  }
  cluster_data <-
    as.data.frame(rownames(seurat_object@assays[[assay]]@data))
  rownames(cluster_data) <- cluster_data[[1]]
  
  for (cluster in unique(seurat_object[[cluster_id]])[[cluster_id]]) {
    seurat_object_subset <- subset(x = seurat_object, idents = cluster)
    #get cluster data
    cluster_matrix <-
      as.data.frame(seurat_object_subset@assays[[assay]]@data)
    if (length(unique(as.list(cluster_matrix))) != 1) {
      warning(
        paste0(
          'not all cells of the cluster ',
          cluster,
          ' have the same value in the assay ',
          assay
        )
      )
    }
    cluster_data[cluster] <- rowMeans(cluster_matrix)
    
  }
  cluster_data[[1]] <- NULL
  cluster_data <-
    cluster_data[, colSums(is.na(cluster_data)) < nrow(cluster_data)]#remove columns with NAs
  return(cluster_data)
}