R/manage_container.R

Defines functions get_donor_meta identify_sex_metadata update_params initialize_params make_new_container

Documented in get_donor_meta identify_sex_metadata initialize_params make_new_container update_params

#' Create a container to store all data and results for the project. You must
#' provide a params list as generated by initialize_params(). You also need to
#' provide either a Seurat object or both a count_data matrix and a meta_data matrix.
#'
#' @param params list A list of the experiment params to use as generated by
#' initialize_params()
#' @param count_data dgCMatrix Matrix of raw counts with genes as rows
#' and cells as columns (default=NULL)
#' @param meta_data data.frame Metadata with cells as rows and variables
#' as columns. Number of rows in metadata should equal number of columns
#' in count matrix (default=NULL)
#' @param seurat_obj Seurat object that has been cleaned and includes the normalized,
#' log-transformed counts. The meta.data should include a column with the header
#' 'sex' and values of 'M' or 'F' if available. The metadata should
#' also have a column with the header 'ctypes' with the corresponding names of
#' the cell types as well as a column with header 'donors' that contains
#' identifiers for each donor. (default=NULL)
#' @param scMinimal environment A sub-container for the project typically
#' consisting of gene expression data in its raw and processed forms as well
#' as metadata (default=NULL)
#' @param gn_convert data.frame Gene identifier -> gene name conversions table.
#' Gene identifiers used in counts matrices should appear in the first column and
#' the corresponding gene symbols should appear in the second column. Can remain
#' NULL if the identifiers are already gene symbols. (default=NULL)
#' @param metadata_cols character The names of the metadata columns to use
#' (default=NULL)
#' @param metadata_col_nm character New names for the selected metadata columns
#' if wish to change their names. If NULL, then the preexisting column names are
#' used. (default=NULL)
#' @param label_donor_sex logical Set to TRUE to label donor sex in the meta data
#' by using expressing of sex-associated genes (default=FALSE)
#'
#' @return A project container of class environment that stores sub-containers
#' for each cell type as well as results and plots from all analyses.
#' @export
make_new_container <- function(params, count_data=NULL, meta_data=NULL,
                               seurat_obj=NULL, scMinimal=NULL, gn_convert=NULL,
                               metadata_cols=NULL, metadata_col_nm=NULL,
                               label_donor_sex=FALSE) {

  if (!is.null(seurat_obj)) {
    # check cell type names don't have special characters
    all_ct_names <- levels(as.factor(seurat_obj@meta.data$ctypes))
    names(all_ct_names) <- all_ct_names
    new_ct_names <- sapply(params$ctypes_use,function(x) {
      split_names <- strsplit(x, "[\\._:]", fixed = FALSE)[[1]]
      if (length(split_names)>1) {
        return(paste(split_names, collapse = ""))
      } else {
        return(x)
      }
    })
    if (any(new_ct_names != names(new_ct_names))) {
      warning("Cell type names cannot have special characters '.', '_', or ':'. Removing these characters and concatenating name components.")
      all_ct_names[names(new_ct_names)] <- new_ct_names
      seurat_obj@meta.data$ctypes <- as.factor(seurat_obj@meta.data$ctypes)
      levels(seurat_obj@meta.data$ctypes) <- all_ct_names
      params$ctypes_use <- new_ct_names
      names(params$ctypes_use) <- NULL
    }

    scMinimal <- seurat_to_scMinimal(seurat_obj, metadata_cols=metadata_cols,
                        metadata_col_nm=metadata_col_nm)
  } else if (!is.null(count_data) & !is.null(meta_data)) {
    # throw error if metadata is not a dataframe
    if (class(meta_data)[1]!='data.frame') {
      stop('meta_data must be a data.frame')
    }
    
    # check cell type names don't have special characters
    all_ct_names <- levels(as.factor(meta_data$ctypes))
    names(all_ct_names) <- all_ct_names
    new_ct_names <- sapply(params$ctypes_use,function(x) {
      split_names <- strsplit(x, "[\\._:]", fixed = FALSE)[[1]]
      if (length(split_names)>1) {
        return(paste(split_names, collapse = ""))
      } else {
        return(x)
      }
    })
    if (any(new_ct_names != names(new_ct_names))) {
      warning("Cell type names cannot have special characters '.', '_', or ':'. Removing these characters and concatenating name components.")
      all_ct_names[names(new_ct_names)] <- new_ct_names
      meta_data$ctypes <- as.factor(meta_data$ctypes)
      levels(meta_data$ctypes) <- all_ct_names
      params$ctypes_use <- new_ct_names
      names(params$ctypes_use) <- NULL
    }
    
    # make counts to be dgCMatrix if not already
    if (class(count_data)[1]!='dgCMatrix') {
      count_data <- methods::as(as.matrix(count_data),'sparseMatrix')
    }

    # throw error if data is pre-normalized
    if (any(count_data%%1!=0)) {
      stop('It seems your count_data might already be normalized. The count_data matrix should only contain integer UMI counts.')
    }

    # throw error if dimensions or cell names don't match
    if (ncol(count_data)!=nrow(meta_data)) {
      stop('ncol(count_data) must match nrow(meta_data)')
    } else if (!identical(colnames(count_data),rownames(meta_data))) {
      stop('Cell names must be identical in both count_data and meta_data')
    }
    
    scMinimal <- instantiate_scMinimal(count_data, meta_data,
                                       metadata_cols=metadata_cols,
                                       metadata_col_nm=metadata_col_nm)
  } else if (is.null(scMinimal)) {
    stop("Need to provide either a seurat object or a scMinimal object or both a count matrix and meta data matrix")
  }
  
  # throw warning if cell-level metadata is included
  metadata_test <- scMinimal$metadata[scMinimal$metadata$ctypes==params$ctypes_use[1],]
  metadata_test <- unique(metadata_test)
  if (nrow(metadata_test) != length(unique(metadata_test$donors))) {
    warning('You may have included metadata that varies across cells within each donor/sample. 
              We recommend only including metadata that varies across donors/samples.')
  }

  # create new empty environment for all experiment data and results
  container <- new.env()
  container$scMinimal_full <- scMinimal
  container$experiment_params <- params

  if (!is.null(gn_convert)) {
    rownames(gn_convert) <- gn_convert[,1]
  }
  container$gn_convert <- gn_convert

  if (label_donor_sex) {
    container <- identify_sex_metadata(container)
  }

  return(container)
}

#' Initialize parameters to be used throughout scITD in various functions
#'
#' @param ctypes_use character Names of the cell types to use for the analysis
#' (default=NULL)
#' @param ncores numeric Number of cores to use (default=4)
#' @param rand_seed numeric Random seed to use (default=10)
#'
#' @return A list of the experiment parameters to use.
#' @export
#' 
#' @examples
#' param_list <- initialize_params(ctypes_use = c("CD4+ T", "CD8+ T"),
#' ncores = 1, rand_seed = 10)
initialize_params <- function(ctypes_use, ncores=4, rand_seed=10) {

  if (is.null(ctypes_use)) {
    stop("You must provide the ctypes_use parameter")
  }

  params_list <- list(ctypes_use = ctypes_use,
                      ncores = ncores,
                      rand_seed = rand_seed)

  return(params_list)
}

#' Update any of the experiment-wide parameters
#'
#' @param container environment Project container that stores sub-containers
#' for each cell type as well as results and plots from all analyses
#' @param ctypes_use character Names of the cell types to use for the analysis
#' (default=NULL)
#' @param ncores numeric Number of cores to use (default=NULL)
#' @param rand_seed numeric Random seed to use (default=NULL)
#'
#' @return The project container with updated experiment parameters in
#' container$experiment_params.
#' @export
#' 
#' @examples
#' test_container <- update_params(test_container, ncores=1)
update_params <- function(container, ctypes_use=NULL,
                          ncores=NULL, rand_seed=NULL) {

  # if user/code enters a value for a param then reset its value
  if (!is.null(ctypes_use)) {
    container$experiment_params$ctypes_use <- ctypes_use
  }
  if (!is.null(ncores)) {
    container$experiment_params$ncores <- ncores
  }
  if (!is.null(rand_seed)) {
    container$experiment_params$rand_seed <- rand_seed
  }
  return(container)
}


#' Extract metadata for sex information if not provided already
#' @useDynLib scITD
#' @importFrom Rcpp sourceCpp
#'
#' @param container environment Project container that stores sub-containers
#' for each cell type as well as results and plots from all analyses
#' @param y_gene character Gene name to use for identifying male donors (default='RPS4Y1')
#' @param x_gene character Gene name to use for identifying female donors (default='XIST')
#'
#' @return The project container with sex metadata added to the metadata.
#' @export
identify_sex_metadata <- function(container,y_gene='RPS4Y1',x_gene='XIST') {
  scMinimal <- container$scMinimal_full

  dge_sparse <- t(scMinimal$count_data)

  # get donor sums for each gene in the dataset
  donor_meta <- as.factor(scMinimal$metadata$donors)
  d_sums <- get_sums(dge_sparse,donor_meta)
  d_sums <- d_sums[2:nrow(d_sums),]

  # normalize counts
  d_sums <- t(normalize_counts(t(d_sums),scale_factor=10000))

  # convert rownames to gene symbols using provided mapping
  gn_names <- convert_gn(container, colnames(d_sums))

  y_ndx <- which(gn_names == y_gene)
  x_ndx <- which(gn_names == x_gene)
  y_mean <- mean(d_sums[,y_ndx])
  x_mean <- mean(d_sums[,x_ndx])

  make_note <- FALSE
  scMinimal$metadata$sex <- NA
  for (i in 1:nrow(scMinimal$metadata)) {
    d <- scMinimal$metadata$donors[i]
    if (d_sums[d,y_ndx] > y_mean && d_sums[d,x_ndx] < x_mean) {
      scMinimal$metadata$sex[i] <- 'M'
    } else if (d_sums[d,y_ndx] < y_mean && d_sums[d,x_ndx] > x_mean) {
      scMinimal$metadata$sex[i] <- 'F'
    } else {
      scMinimal$metadata$sex[i] <- 'A'
      make_note <- TRUE
    }
  }

  if (make_note) {
    message('Some assignments are ambiguous and are labeled A in the metadata.
          We recommend correcting these manually or providing the sex metadata when
          instantiating scMinimal.')
  }

  container$scMinimal_full <- scMinimal

  return(container)

}


#' Get metadata matrix of dimensions donors by variables (not per cell)
#'
#' @param container environment Project container that stores sub-containers
#' for each cell type as well as results and plots from all analyses
#' @param additional_meta character A vector of other variables to include (default=NULL)
#' @param only_analyzed logical Set to TRUE to only include donors that were included
#' in the formed tensor, otherwise set to FALSE (default=TRUE)
#'
#' @return The project container with metadata per donor (not per cell) in container$donor_metadata.
#' @export
#' 
#' @examples
#' test_container <- get_donor_meta(test_container, additional_meta='lanes')
get_donor_meta <- function(container,additional_meta=NULL,only_analyzed=TRUE) {
  if (only_analyzed) {
    if (is.null(container$tensor_data)) {
      stop('Need to run form_tensor() first or set only_analyzed to FALSE')
    }
    donors <- container$tensor_data[[1]]
  } else {
    donors <- unique(container$scMinimal_full$metadata$donors)
  }
  meta_sub <- container$scMinimal_full$metadata[container$scMinimal_full$metadata$donors %in% donors,]
  donor_metadata <- unique(meta_sub[,c('donors',additional_meta),drop=FALSE])
  rownames(donor_metadata) <- donor_metadata[,'donors']
  container$donor_metadata <- donor_metadata
  return(container)
}

Try the scITD package in your browser

Any scripts or data that you put into this service are public.

scITD documentation built on Sept. 8, 2023, 5:11 p.m.