R/calcCellContam.R

# library(matrixStats)
# calculate contamination by looking at each types markers

sumMarkerExpression = function(expr_matrix, target_markers_name, comparison_markers_name){

  # calc sum expr for target markers
  target_marker_genes = getValidGenes(mouseMarkerGenes$Cortex[[target_markers_name]], colnames(expr_matrix))
  contam_marker_genes = getValidGenes(mouseMarkerGenes$Cortex[[comparison_markers_name]], colnames(expr_matrix))

  target_expr = rowSums(expr_matrix[, target_marker_genes])
  comp_expr = rowSums(expr_matrix[, contam_marker_genes])

  return(cbind(target_expr, comp_expr))
}

sumExpression = function(expr_matrix, markers, colwise = F){
  # returns a vector of summed expression values per cell

  if (!(colwise)){
    target_marker_genes = getValidGenes(markers, colnames(expr_matrix))
    target_expr = rowSums(expr_matrix[, target_marker_genes %>% make.names()] %>% log2)
  } else{
    target_marker_genes = getValidGenes(markers, rownames(expr_matrix))
    target_expr = colSums(expr_matrix[target_marker_genes, ] %>% log2)
  }

  return(target_expr)

}

getValidGenes = function(input_genes, target_genes = aibs_gene_names){
  # returns set intersection of genes

  matching_inds = which(input_genes %in% target_genes)

  return(input_genes[matching_inds])
}

#' Title Given a single-cell expression matrix, summarize marker expression as log sum across multiple cell types
#'
#' @param expr_matrix single cell expression matrix (samples by genes, with gene names as column names)
#' @param markerList list of lists defining markers for various cell types
#' @param colwise boolean for whether matrix is stored as samples by genes (T) or genes by cells (F)
#'
#' @return Dataframe with cells by cell types, which each cell type (as a column) showing summed expression of cell type markers
#' @export
#'
#' @examples
calcContamAllTypes = function(expr_matrix, markerList, colwise = F){
  # calculate sum of expression values given a list of marker genes per cell type

  contam_vals = lapply(markerList, function(cell_type_markers){
    return(sumExpression(expr_matrix, cell_type_markers, colwise))
  }) %>% as.data.frame()
  return(contam_vals)

}


normalizeContam = function(contam_values, aibs_med_exprs, compare_cell_types){
  # normalize cell's contamination values against expected values from tasic or comparison dataset

  expected_expr = aibs_med_exprs[contam_values$contam_type[1], compare_cell_types] %>% unlist()

  compare_cell_type_exprs = aibs_med_exprs[compare_cell_types, compare_cell_types] %>% as.matrix %>% diag

  normalizedContamValues = apply(contam_values[, compare_cell_types], 1, function(x) (x - expected_expr) / (compare_cell_type_exprs- expected_expr))
  return(normalizedContamValues)

}

normalizeContamSum = function(contam_values, aibs_med_exprs, compare_cell_types){
  # calculate sum of contamination values across all cell types in compare_cell_types
  normalizedContamValues = normalizeContam(contam_values, aibs_med_exprs, compare_cell_types) %>% repWZero() %>% colSums
  return(normalizedContamValues)
}


repWZero = function(df){
  # replace negative values with 0

  ndf = df
  ndf[df < 0] = 0
  return(ndf)
}

calculateDatasetQCMeasures = function(joined_df, dataset_genes, protein_genes = protein_coding_genes, mito_genes = mt_gene_names, min_expr_val = 1){
  # given an input patch seq data frame, and set of genes, calculate quality control measures

  valid_genes = getValidGenes(protein_genes, dataset_genes)
  num_genes = rowSums(joined_df[, valid_genes ] > min_expr_val)

  mt_gene_expr = sumExpression(joined_df, mito_genes %>% make.names())

  out_df = data.frame(num_genes = num_genes, mt_gene_expr = mt_gene_expr, cell_id = joined_df$cell_id)
  return(out_df)
}
PavlidisLab/patchSeqQC documentation built on May 20, 2019, 9:13 p.m.