DoubletDeconSeurat: Seurat wrappers with DoubletDecon

Documented in DoubletDecon

#' Main DoubletDecon v1.0.1
#'
#' This is the main function. This function identifies clusters of doublets with a combination of deconvolution analysis and unique gene expression and individual doublet cells with deconvolution analysis.
#'
#' @param rawDataFile Name of file containing ICGS or Seurat expression data (gene by cell)
#' @param groupsFile Name of file containing group assignments (3 column: cell, group(numeric), group(numeric or character))
#' @param filename Unique filename to be incorporated into the names of outputs from the functions.
#' @param location Directory where output should be stored
#' @param fullDataFile Name of file containing full expression data (gene by cell). Default is NULL.
#' @param removeCC Remove cell cycle gene cluster by KEGG enrichment. Default is FALSE.
#' @param species Species as scientific species name, KEGG ID, three letter	species abbreviation, or NCBI ID. Default is "mmu".
#' @param rhop x in mean+x*SD to determine upper cutoff for correlation in the blacklist. Default is 1.
#' @param write Write output files as .txt files. Default is TRUE.
#' @param PMF Use step 2 (unique gene expression) in doublet determination criteria. Default is TRUE.
#' @param useFull Use full gene list for PMF analysis. Requires fullDataFile. Default is FALSE.
#' @param heatmap Boolean value for whether to generate heatmaps. Default is TRUE. Can be slow to datasets larger than ~3000 cells.
#' @param centroids Use centroids as references in deconvolution instead of the default medoids.
#' @param num_doubs The user defined number of doublets to make for each pair of clusters. Default is 100.
#' @param only50 use only synthetic doublets created with 50\%/50\% mix of parent cells, as opposed to the extended option of 30\%/70\% and 70\%/30\%, default is FALSE.
#' @param min_uniq minimum number of unique genes required for a cluster to be rescued
#' @param seed Set a random seed.
#'
#' @return data_processed = new expression file (cleaned).
#' @return groups_processed = new groups file (cleaned).
#' @return PMF_results = pseudo marker finder t-test results (gene by cluster).
#' @return DRS_doublet_table = each cell and whether it is called a doublet by deconvolution analysis.
#' @return DRS_results = results of deconvolution analysis (cell by cluster) in percentages.
#' @return Decon_called_freq = percentage of doublets called in each cluster by deconvolution analysis.
#' @return Final_doublets_groups = new groups file containing only doublets.
#' @return Final_nondoublets_groups = new groups file containing only non doublets.
#'
#' @keywords doublet decon main
#'
#' @export
#'
DoubletDecon <- function(
  rawDataFile,
  groupsFile,
  filename,
  location,
  fullDataFile = NULL,
  removeCC = FALSE,
  species = 'mmu',
  rhop = 1,
  write = TRUE,
  PMF = TRUE,
  useFull = FALSE,
  heatmap = TRUE,
  centroids = FALSE,
  num_doubs = 100,
  only50 = FALSE,
  min_uniq = 4,
  seed = 100
){

  #load required packages
  message('Loading packages')
  suppressMessages(require(DeconRNASeq))
  suppressMessages(require(gplots))
  suppressMessages(require(plyr))
  suppressMessages(require(MCL))
  suppressMessages(require(clusterProfiler))
  suppressMessages(require(mygene))
  suppressMessages(require(tidyr))
  suppressMessages(require(R.utils)) #for new PMF
  suppressMessages(require(dplyr)) #for new PMF
  suppressMessages(require(foreach)) #for new PMF
  suppressMessages(require(doParallel)) #for new PMF
  suppressMessages(require(stringr)) #for new PMF

  #Set up log file
  log_file_name <- file.path(location, paste0(filename, '.log'))
  log_con <- file(log_file_name)
  cat(paste0('filename: ', filename), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('location: ', location), file = log_file_name, append=TRUE, sep = '\n')
  cat(paste0('removeCC: ', removeCC), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('species: ', species), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('rhop: ', rhop), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('write: ', write), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('PMF: ', PMF), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('useFull: ', useFull), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('heatmap: ', heatmap), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('centroids: ', centroids), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('num_doubs: ', num_doubs), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('only50: ', only50), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('min_uniq: ', min_uniq), file = log_file_name, append = TRUE, sep = '\n')
  cat(paste0('seed: ', seed), file = log_file_name, append = TRUE, sep = '\n')

  #Check variables
  if(is.character(rawDataFile) != TRUE & is.data.frame(rawDataFile) != TRUE) {stop('ERROR: rawDataFile must be a character string!')}
  if(is.character(groupsFile) != TRUE & is.data.frame(groupsFile) != TRUE & is.matrix(groupsFile) != TRUE) {stop('ERROR: groupsFile must be a character string!')}
  if(is.character(filename) != TRUE) {stop('ERROR: filename must be a character string!')}
  if(is.character(location) != TRUE) {stop('ERROR: location must be a character string!')}
  if(is.character(fullDataFile) != TRUE & is.null(fullDataFile) != TRUE & is.data.frame(fullDataFile) != TRUE) {stop('ERROR: fullDataFile must be a character string or NULL!')}
  if(is.logical(removeCC) != TRUE) {stop('ERROR: removeCC must be TRUE or FALSE!')}
  if(is.character(species) != TRUE) {stop('ERROR: species must be a character string!')}
  if(is.numeric(rhop) != TRUE) {stop('ERROR: rhop must be numeric!')}
  if(is.logical(write) != TRUE) {stop('ERROR: write must be TRUE or FALSE!')}
  if(is.logical(PMF) != TRUE) {stop('ERROR: PMF must be TRUE or FALSE!')}
  if(is.logical(useFull) != TRUE) {stop('ERROR: useFull must be TRUE or FALSE!')}
  if(is.logical(heatmap) != TRUE) {stop('ERROR: heatmap must be TRUE or FALSE!')}
  if(is.logical(centroids) != TRUE) {stop('ERROR: centroids must be TRUE or FALSE!')}
  if(is.numeric(num_doubs) != TRUE) {stop('ERROR: numdoubs must be numeric!')}
  if(is.logical(only50) != TRUE) {stop('ERROR: only50 must be TRUE or FALSE!')}
  if(is.numeric(min_uniq) != TRUE) {stop('ERROR: min_uniq must be numeric!')}
  if(is.integer(seed) != TRUE) {stop('ERROR: seed must be integer!')}

  #Read in data
  message('Reading data')
  cat('Reading data', file = log_file_name, append = TRUE, sep = '\n')

  ICGS2_flag <- FALSE #set for checking if the input file is in ICGS2 format

  if (class(rawDataFile) == 'character') {
    #NEW: test for ICGS2
    rawDataHeader <- read.table(rawDataFile, sep = '\t', header = FALSE, row.names = 1, nrows = 1, stringsAsFactors = FALSE)
    if (length(x = grep(':', rawDataHeader[2])) == 1) {
      ICGS2_flag <- TRUE
      ICGS2 <- ICGS2toICGS1(rawDataFile = rawDataFile, groupsFile = groupsFile)
      rawData <- ICGS2$rawData
    } else {
      rawData <- read.table(rawDataFile, sep = '\t', header = T, row.names = 1)
    }
  } else {
    warning("if using ICGS2 file input, please import 'rawDataFile' and 'groupsFile' as path/location instead of an R object.")
    rawData <- rawDataFile
  }

  if (class(groupsFile) == 'character') {
    if (ICGS2_flag == TRUE) {
      groups <- ICGS2$groups
    } else {
      groups <- read.table(groupsFile, sep = '\t', header = FALSE, row.names = 1)
    }
  } else {
    groups <- groupsFile
  }

  #Clean up data and groups file
  message('Processing raw data')
  cat('Processing raw data', file = log_file_name, append = TRUE, sep = '\n')
  data <- CleanUpInput(rawData = rawData, groups = groups, log_file_name = log_file_name)
  og_processed_data <- data$processed
  groups <- data$groups

  #Centroids or medoids?
  if (centroids == TRUE) {
    centroid_flag <- TRUE
  } else {
    centroid_flag <- FALSE
  }

  #Original data heatmap
  if (heatmap == TRUE) {
    message('Creating original data heatmap')
    cat('Creating original data heatmap', file = log_file_name, append = TRUE, sep = '\n')
    breaks <- seq(0, #start point of color key
                  as.numeric(x = quantile(data.matrix(data$processed[2:nrow(x = data$processed), 2:ncol(x = data$processed)]), 0.99)),  #end point of color key
                  by = 0.05) #length of sub-division
    mycol <- colorpanel(n = length(x = breaks) - 1, low = 'black', high = 'yellow') #heatmap colors
    suppressWarnings(DDheatmap(data.matrix(data$processed[2:nrow(x = data$processed), 2:ncol(x = data$processed)]), #the data matrix
                               Colv = FALSE, # No clustering of columns
                               Rowv = FALSE, #no clustering of rows
                               dendrogram = 'none', #do not generate dendrogram
                               col = mycol, #colors used in heatmap
                               ColSideColors = as.color(Renumber(data$processed[1,2:ncol(x = data$processed)]), alpha = 1, seed = 4), #column color bar
                               RowSideColors = as.color(Renumber(data$processed[2:nrow(x = data$processed), 1]), alpha = 1, seed = 2), # row color bar
                               breaks = breaks, #color key details
                               trace = 'none', #no trace on map
                               na.rm = TRUE, #ignore missing values
                               margins = c(5, 5), # size and layout of heatmap window
                               labRow = NA, #turn off gene labels
                               labCol = NA, #turn off cell labels
                               xlab = 'Samples', #x axis title
                               ylab =  'Genes', # y axis title
                               main = paste0('Original data: ', filename))) #main title
  }

  #Remove cell cycle gene cluster (optional)
  if (removeCC == TRUE) {
    message('Removing cell cycle clusters')
    cat('Removing cell cycle clusters', file = log_file_name, append = TRUE, sep='\n')
    data <- RemoveCellCycle(data = data$processed, species = species, log_file_name = log_file_name)
  } else {
    data <- data$processed
  }
  if (write == TRUE) {
    write.table(data, file = file.path(location, paste0('data_processed_', filename, '.txt')), sep = '\t')
    write.table(groups, file = file.path(location, paste0('groups_processed_', filename, '.txt')), sep = '\t')
  }

  #Calculate medoids, medoid correlations, blacklist to create new combine medoids
  message('Combining similar clusters')
  BL <- BlacklistGroups(data = data, groups = groups, rhop = rhop, centroid_flag = centroid_flag, log_file_name = log_file_name)
  newMedoids <- BL$newMedoids
  groupsMedoids <- BL$newGroups

  #Create synthetic doublets to get average synthetic profiles
  message('Creating synthetic doublet profiles')
  if (.Platform$OS.type == "unix") {
    sink("/dev/null") #hides DeconRNASeq output
    synthProfilesx <- SyntheticDoublets(data = data, groups = groups, groupsMedoids = groupsMedoids, newMedoids = newMedoids, num_doubs = num_doubs, only50 = only50, location = location, seed = seed)
    sink()
  } else {
    synthProfilesx <- SyntheticDoublets(data = data, groups = groups, groupsMedoids = groupsMedoids, newMedoids = newMedoids, num_doubs = num_doubs, only50 = only50, location = location, seed = seed)
  }
  synthProfiles <- synthProfilesx$averagesAverages
  doubletCellsInput2 <- synthProfilesx$doubletCellsInput2
  if (write == TRUE) {
    write.table(doubletCellsInput2, file = file.path(location, paste0('Synth_doublet_info_', filename, '.txt')), sep = '\t')
  }

  #Calculate doublets using DeconRNASeq
  message('Step 1: Removing possible doublets')
  cat('Step 1: Removing possible doublets', file = log_file_name, append = TRUE, sep = '\n')
  if (.Platform$OS.type == 'unix') {
    sink('/dev/null') #hides DeconRNASeq output
    doubletTable <- IsDoublet(data = data, newMedoids = newMedoids, groups = groups, synthProfiles = synthProfiles, log_file_name = log_file_name)
    sink()
  } else {
    doubletTable <- IsDoublet(data = data, newMedoids = newMedoids, groups = groups, synthProfiles = synthProfiles, log_file_name = log_file_name)
  }
  if (write == TRUE) {
    write.table(doubletTable$isADoublet, file = file.path(location, paste0('DRS_doublet_table_', filename, '.txt')), sep = '\t')
    write.table(doubletTable$resultsreadable, file = file.path(location, paste0('DRS_results_', filename, '.txt')), sep = "\t")
  }

  #Recluster doublets and non-doublets
  message('Step 2: Re-clustering possible doublets')
  cat('Step 2: Re-clustering possible doublets', file = log_file_name, append = TRUE, sep = '\n')
  reclusteredData <- Recluster(isADoublet = doubletTable$isADoublet, data = data, groups = groups, log_file_name = log_file_name)
  data <- reclusteredData$newData2$processed
  groups <- reclusteredData$newData2$groups
  write.table(data, file = file.path(location, paste0('data_processed_reclust_', filename, '.txt')), sep = '\t', col.names = NA, quote = FALSE)
  write.table(groups, file = file.path(location, paste0('groups_processed_reclust_', filename, '.txt')), sep = '\t')

  #Run Pseudo Marker Finder to identify clusters with no unique gene expression
  if (PMF == FALSE) {
    message('SKIPPING Step 3: Rescuing cells with unique gene expression')
    cat('SKIPPING Step 3: Rescuing cells with unique gene expression', file = log_file_name, append = TRUE, sep = '\n')
    PMFresults <- NULL
  } else {
    message('Step 3: Rescuing cells with unique gene expression')
    cat('Step 3: Rescuing cells with unique gene expression', file = log_file_name, append = TRUE, sep = '\n')
    if (useFull == TRUE) {
      PMFresults <- PseudoMarkerFinder(groups = as.data.frame(groups), redu_data2 = file.path(location, paste0('data_processed_reclust_', filename, '.txt')), full_data2 = fullDataFile, min_uniq = min_uniq, log_file_name = log_file_name)
    } else {
      PMFresults <- PseudoMarkerFinder(groups = as.data.frame(groups), redu_data2 = file.path(location, paste0('data_processed_reclust_', filename, '.txt')), full_data2 = NULL, min_uniq = min_uniq, log_file_name = log_file_name)
    }
    if (write == TRUE) {
      write.table(PMFresults, file = file.path(location, paste0('new_PMF_results_', filename, '.txt')), sep = '\t')
    }
  }
  #Doublet Detection method 2: Pseudo_Marker_Finder
  allClusters <- unique(groups[,1])
  if (PMF == FALSE) {
    newDoubletClusters <- allClusters
  } else {
    hallmarkClusters <- as.numeric(x = unique(PMFresults[, 2]))
    newDoubletClusters <- setdiff(allClusters, hallmarkClusters)
  }
  #Doublet Detection method 1: Is_A_Doublet
  uniqueClusters <- as.character(x = unique(groups[, 2]))
  DeconCalledFreq <- as.data.frame(matrix(nrow = length(x = allClusters), ncol = 1), row.names = uniqueClusters)
  for (clus in 1:length(x = allClusters)) {
    #modified this line, was originally "clus in allClusters"
    temp1 <- subset(doubletTable$isADoublet, Group_Cluster == uniqueClusters[clus])
    if (nrow(temp1) == 0) {
      #not an original cluster, only a new doublet cluster
      DeconCalledFreq[clus, 1] <- 100
    } else {
      DeconCalledFreq[clus, 1] <- (length(x = which(temp1$isADoublet == TRUE))/nrow(x = temp1))*100
    }
  }

  #Combine to find real doublets
  if (PMF == FALSE) {
    finalDoublets <- row.names(x = doubletTable$isADoublet)[which(doubletTable$isADoublet$isADoublet == TRUE)] #this gives you the names of cells called as doublets by deconvolution
  } else {
    finalDoublets <- intersect(row.names(x = doubletTable$isADoublet)[which(doubletTable$isADoublet$isADoublet == TRUE)], row.names(x = subset(groups, groups[, 1] %in% newDoubletClusters)))
  }
  #Results
  finalDoubletCellCall <- groups[row.names(x = groups) %in% finalDoublets, ]
  finalNotDoubletCellCall <- groups[!(row.names(x = groups) %in% finalDoublets), ]
  if (write == TRUE) {
    write.table(finalDoubletCellCall, file = file.path(location, paste0('Final_doublets_groups_', filename, '.txt')), sep = '\t')
    write.table(finalNotDoubletCellCall, file = file.path(location, paste0('Final_nondoublets_groups_', filename, '.txt')), sep = '\t')
  }

  #Subset expression matrix for doublets and save
  doublets_matrix <- cbind(og_processed_data[, 1], og_processed_data[, which(colnames(x = og_processed_data) %in% row.names(x = finalDoubletCellCall))])
  if (write == TRUE) {
    write.table(doublets_matrix, file = file.path(location, paste0('Final_doublets_exp_', filename, '.txt')), sep = '\t')
  }

  #Heatmap of cells removed as doubets
  if (heatmap == TRUE) {
    message('Creating doublets heatmap')
    cat('Creating doublets heatmap', file = log_file_name, append = TRUE, sep = '\n')
    breaks <- seq(0, #start point of color key
                  as.numeric(quantile(data.matrix(doublets_matrix[2:nrow(x = doublets_matrix), 2:ncol(x = doublets_matrix)]), 0.99)),  #end point of color key
                  by = 0.05) #length of sub-division
    mycol <- colorpanel(n = length(x = breaks) - 1, low = 'black', high = 'yellow') #heatmap colors
    suppressWarnings(DDheatmap(data.matrix(doublets_matrix[2:nrow(x = doublets_matrix), 2:ncol(x = doublets_matrix)]), #the data matrix
                               Colv = FALSE, # No clustering of columns
                               Rowv = FALSE, #no clustering of rows
                               col = mycol, #colors used in heatmap
                               dendrogram = 'none', #turn of dendrogram generation
                               ColSideColors = as.color(Renumber(doublets_matrix[1, 2:ncol(x = doublets_matrix)]), alpha = 1, seed = 4), #column color bar
                               RowSideColors = as.color(Renumber(doublets_matrix[2:nrow(x = doublets_matrix),1]), alpha = 1, seed = 2), # row color bar
                               breaks = breaks, #color key details
                               trace = 'none', #no trace on map
                               na.rm = TRUE, #ignore missing values
                               margins = c(5,5), # size and layout of heatmap window
                               labRow = NA, #turn off gene labels
                               labCol = NA, #turn off cell labels
                               xlab = 'Samples', #x axis title
                               ylab =  'Genes', # y axis title
                               main = paste0('Doublets: ', filename))) #main title)
  }
  #Subset expression matrix for non-doublets and save
  nondoublets_matrix <- cbind(og_processed_data[, 1], og_processed_data[, which(colnames(x = og_processed_data) %in% row.names(x = finalNotDoubletCellCall))])
  if (write == TRUE) {
    write.table(nondoublets_matrix, file = file.path(location, paste0('Final_nondoublets_exp_', filename, '.txt')), sep = '\t')
  }
  #New heatmap of non-doublet cells
  if (heatmap == TRUE) {
    message('Creating non-doublets heatmap')
    cat('Creating non-doublets heatmap', file = log_file_name, append = TRUE, sep = '\n')
    breaks <- seq(0, #start point of color key
                  as.numeric(quantile(data.matrix(nondoublets_matrix[2:nrow(x = nondoublets_matrix), 2:ncol(x = nondoublets_matrix)]), 0.99)),  #end point of color key
                  by = 0.05) #length of sub-division
    mycol <- colorpanel(n = length(x = breaks) - 1, low = 'black', high = 'yellow') #heatmap colors
    suppressWarnings(DDheatmap(data.matrix(nondoublets_matrix[2:nrow(x = nondoublets_matrix), 2:ncol(x = nondoublets_matrix)]), #the data matrix
                               Colv = FALSE, # No clustering of columns
                               Rowv = FALSE, #no clustering of rows
                               col = mycol, #colors used in heatmap
                               dendrogram = 'none', #turn of dendrogram generation
                               ColSideColors = as.color(Renumber(x = nondoublets_matrix[1, 2:ncol(x = nondoublets_matrix)]), alpha = 1, seed = 4), #column color bar
                               RowSideColors = as.color(Renumber(x = nondoublets_matrix[2:nrow(x = nondoublets_matrix),1]), alpha = 1, seed = 2), # row color bar
                               breaks = breaks, #color key details
                               trace = 'none', #no trace on map
                               na.rm = TRUE, #ignore missing values
                               margins = c(5, 5), # size and layout of heatmap window
                               labRow = NA, #turn off gene labels
                               labCol = NA, #turn off cell labels
                               xlab = 'Samples', #x axis title
                               ylab =  'Genes', # y axis title
                               main = paste0('Non-Doublets: ', filename))) #main title
  }
  #last message
  message('Finished!', '\n')
  cat('Finished!', file = log_file_name, append = TRUE, sep = '\n')

  #close the log file connection
  close(log_con)

  return(list(data_processed = data,
              groups_processed = groups,
              DRS_doublet_table = doubletTable$isADoublet,
              DRS_results = doubletTable$resultsreadable,
              PMF_results = PMFresults,
              Final_doublets_groups = finalDoubletCellCall,
              Final_nondoublets_groups = finalNotDoubletCellCall,
              Final_doublets_exp = doublets_matrix,
              Final_nondoublets_exp = nondoublets_matrix,
              Synth_doublet_info = doubletCellsInput2))
}