R/load.R

Defines functions sorted_merge normalize_reads filter_reads normalize_screens split_guides_by_type retrieve_guides_by_label retrieve_guides_by_gene unique_gene_pairs split_guides

Documented in filter_reads normalize_reads normalize_screens retrieve_guides_by_gene retrieve_guides_by_label split_guides split_guides_by_type unique_gene_pairs

######
# PRE-PROCESSING CODE
######

#' Splits data into different types of guides
#' 
#' 
#' @param guides Dataframe returned from \code{retrieve_guides_by_gene}.
#' @param screens List of screens generated with \code{add_screens}.
#' @param id_col1 A column containing guide-specific indices, e.g. guide sequences or IDs, for
#'   guides in the gene1 column (optional, default NULL).  
#' @param id_col2 A column containing guide-specific indices, e.g. guide sequences or IDs, for
#'   guides in the gene1 column (optional, default NULL).  
#' @return A list of several separate guide lists. Combinatorial-targeting guides are stored
#'   in the key "combn", dual-targeting guides are stored in the key "dual", 
#'   single-targeting guides are stored in the key "single" and guides which target
#'   two control genes are stored in the key "control".
#' @export
split_guides <- function(guides, screens, id_col1 = NULL, id_col2 = NULL) {
  gene_pairs <- unique_gene_pairs(guides)
  guides <- retrieve_guides_by_gene(guides, gene_pairs, screens, id_col1, id_col2)
  return(split_guides_by_type(guides))
}

#' Get unique gene pairs
#' 
#' This function returns all unique gene pairs from two columns containing gene names in a dataframe.
#' This is necessary for scoring combinatorial data where each row corresponds to a single guide 
#' construct that targets two genes. NegControl regions must be listed as "NegControl" values, and guides
#' which only target one gene must have the non-targeted gene listed as "None". 
#' 
#' @param df A dataframe where each row corresponds to a single guide construct that targets 
#'   two genes, which contains the columns gene1 and gene2. 
#' @return A dataframe with two gene name columns where each row contains one unique gene pair. 
unique_gene_pairs <- function(df) {
  
  # Gets unique gene pairs
  index <- unlist(apply(df, 1, function(x) sorted_merge(x, "gene1", "gene2")))
  index <- index[!duplicated(index)]
  
  # Converts to dataframe
  pair_df <- data.frame(gene1 = rep(NA, length(index)),
                        gene2 = rep(NA, length(index)))
  for (i in 1:nrow(pair_df)) {
    temp <- strsplit(index[i], ";")[[1]]
    pair_df$gene1[i] <- temp[1]
    pair_df$gene2[i] <- temp[2]
  }
  return(pair_df)
}

#' Retrieve guide values for gene pairs
#' 
#' Gets all guides in each orientation for all gene pairs and the given columns, assuming that 
#' guide orientation is determined by the gene name columns. Must be run after \code{unique_gene_pairs}
#' to take the dataframe returned from that function as input. 
#' 
#' @param df A dataframe where each row corresponds to a single guide construct that targets 
#'   two genes, which contains the columns gene1, gene2 and optionally the columns specified by
#'   id_col1 and id_col2.
#' @param gene_pairs Dataframe returned by \code{unique_gene_pairs}.
#' @param screens List of screens generated with \code{add_screens}.
#' @param id_col1 A column containing guide-specific indices, e.g. guide sequences or IDs, for
#'   guides in the column gene1 (optional, default NULL).  
#' @param id_col2 A column containing guide-specific indices, e.g. guide sequences or IDs, for
#'   guides in the column gene2 (optional, default NULL).  
#' @return A list indexed by unique gene pairs that contains the gene names in the keys 
#'   "gene1" and "gene2". Each column specified in guide_cols has its guide values 
#'   stored in two keys, one for each orientation. For instance, the column "DMSO_T15"
#'   will have its values stored in the keys "orient1_DMSO_T15" and "orient2_DMSO_T15".
retrieve_guides_by_gene <- function(df, gene_pairs, screens,
                                    id_col1 = NULL, id_col2 = NULL) {
  
  # Gets all columns
  guide_cols <- c()
  for (screen in screens) {
    guide_cols <- c(guide_cols, screen[["replicates"]]) 
  }
  
  # Constructs output list of guide-level values for the given columns
  guides <- list()
  for (i in 1:nrow(gene_pairs)) {
    
    # Gets guide values for both orientations
    gene1 <- gene_pairs$gene1[i]
    gene2 <- gene_pairs$gene2[i]
    all_index <- df$gene1 == gene1 & df$gene2 == gene2 | 
      df$gene1 == gene2 & df$gene2 == gene1
    guide_list <- list()
    guide_list[["gene1"]] <- gene1
    guide_list[["gene2"]] <- gene2
    
    # Handles guides targeting intergenic-intergenic regions and 
    # guides that target a single gene twice
    if ((gene1 == "NegControl" & gene2 == "NegControl") | gene2 == "None") {
      if (gene1 == "NegControl" & gene2 == "NegControl") {
        guide_list[["guide_type"]] <- "control"
      } else if (gene2 == "None") {
        guide_list[["guide_type"]] <- "dual"
      }
      for (col in guide_cols) {
        guide_name <- col
        guide_list[[guide_name]] <- df[all_index, col]
      }
      if (!is.null(id_col1)) {
        guide_list[["id1"]] <- df[all_index, id_col1]
      }
      if (!is.null(id_col2)) {
        guide_list[["id2"]] <- df[all_index, id_col2]
      }
    }
    
    # Handles exonic-intergenic guides
    else if (gene1 == "NegControl" | gene2 == "NegControl") {
      guide_list[["guide_type"]] <- "single"
      orient1_index <- all_index & df$gene2 == "NegControl"
      orient2_index <- all_index & df$gene1 == "NegControl"
      for (col in guide_cols) {
        guide_name1 <- paste0("orient1_", col)
        guide_name2 <- paste0("orient2_", col)
        guide_list[[guide_name1]] <- df[orient1_index, col]
        guide_list[[guide_name2]] <- df[orient2_index, col]
      }
      if (!is.null(id_col1)) {
        guide_list[["orient1_id1"]] <- df[orient1_index, id_col1]
        guide_list[["orient1_id2"]] <- df[orient1_index, id_col2]
      }
      if (!is.null(id_col2)) {
        guide_list[["orient2_id1"]] <- df[orient2_index, id_col1]
        guide_list[["orient2_id2"]] <- df[orient2_index, id_col2]
      }
    }
    
    # Handles exonic-exonic guides targeting two genes
    else if (gene1 != gene2) {
      guide_list[["guide_type"]] <- "combn"
      index1 <- all_index & df$gene1 == gene1 & df$gene2 == gene2
      index2 <- all_index & df$gene1 == gene2 & df$gene2 == gene1
      for (col in guide_cols) {
        guide_name1 <- paste0("orient1_", col)
        guide_name2 <- paste0("orient2_", col)
        guide_list[[guide_name1]] <- df[index1, col]
        guide_list[[guide_name2]] <- df[index2, col]
      }
      if (!is.null(id_col1)) {
        guide_list[["orient1_id1"]] <- df[index1, id_col1]
        guide_list[["orient1_id2"]] <- df[index1, id_col2]
      }
      if (!is.null(id_col2)) {
        guide_list[["orient2_id1"]] <- df[index2, id_col1]
        guide_list[["orient2_id2"]] <- df[index2, id_col2]
      }
    }
    
    # Throws a warning if no guides found for the current gene pair
    else {
      warning(paste("No unique guides found for", gene1, "and", gene2))
    }
    
    # Appends guides to list and sets empty values to NA
    guide_list[which(guide_list == numeric())] <- NA
    guides[[i]] <- guide_list
  }
  return(guides)
}

#' Splits guides by type
#'
#' Gets all guides in each orientation for all unique gene pairs and the given columns.
#' Assumes that orientation is determined by label columns which specify whether each
#' guide targets an exonic region or an intergenic region. Exonic-targeting guides must
#' be labeled as "exonic" and intergenic-targeting guides must be labeled as "intergenic".
#' The column specified by label_col1 corresponds to the gene names specified by the 
#' column gene_col1, and similarly for label_col2 and gene_col2. 
#' 
#' 
#' @param df A dataframe where each row corresponds to a single guide construct that targets 
#'   two genes, which contains the columns named in the variables gene_col1, gene_col2, 
#'   label_col1, label_col2 and replicate columns in screens.
#' @param gene_pairs Dataframe returned by \code{unique_gene_pairs}.
#' @param screens List of screens generated with \code{add_screens}.
#' @param label_col1 A column specify whether the guide in gene_col1 targeted an exonic or
#'   intergenic region, which must be labeled as "exonic" or "intergenic". 
#' @param label_col2 See above, but for gene_col2. 
#' @param id_col1 A column containing guide-specific indices, e.g. guide sequences or IDs, for
#'   guides in gene_col1 (optional, default NULL).  
#' @param id_col2 A column containing guide-specific indices, e.g. guide sequences or IDs, for
#'   guides in gene_col2 (optional, default NULL). 
#' @return A list indexed by unique gene pairs that contains the gene names in the keys 
#'   "gene1" and "gene2". Each column contained in each screen's replicates has its guide values 
#'   stored in two keys, one for each orientation. For instance, the column "DMSO_T15"
#'   will have its values stored in the keys "orient1_DMSO_T15" and "orient2_DMSO_T15".
#'   The guide type is additionally contained in the key "guide_type". 
retrieve_guides_by_label <- function(df, gene_pairs, screens,
                                     label_col1, label_col2,
                                     id_col1 = NULL, id_col2 = NULL) {
  
  # Gets all columns
  guide_cols <- c()
  for (screen in screens) {
   guide_cols <- c(guide_cols, screen[["replicates"]]) 
  }
  
  # Constructs output list of guide-level values for the given columns
  guides <- list()
  for (i in 1:nrow(gene_pairs)) {
    
    # Gets guide values for both orientations and all indices
    gene1 <- gene_pairs$gene1[i]
    gene2 <- gene_pairs$gene2[i]
    all_index <- df$gene1 == gene1 & df$gene2 == gene2 |
      df$gene1 == gene2 & df$gene2 == gene1
    
    # Creates empty guide list to fill depending on guide type
    guide_list <- list()
    guide_list[["gene1"]] <- gene1
    guide_list[["gene2"]] <- gene2
    
    # Handles guides targeting intergenic-intergenic regions
    if (gene1 == "NegControl" & gene2 == "NegControl") {
      guide_list[["guide_type"]] <- "intergenic_intergenic"
      for (col in guide_cols) {
        guide_name <- col
        guide_list[[guide_name]] <- df[all_index, col]
      }
      if (!is.null(id_col1)) {
        guide_list[["id1"]] <- df[all_index, id_col1]
      }
      if (!is.null(id_col2)) {
        guide_list[["id2"]] <- df[all_index, id_col2]
      }
    }
    
    # Handles guides targeting a single gene twice
    else if (gene2 == "None") {
      guide_list[["guide_type"]] <- "single_gene_dual_targeted"
      for (col in guide_cols) {
        guide_name <- col
        guide_list[[guide_name]] <- df[all_index, col]
      }
      if (!is.null(id_col1)) {
        guide_list[["id1"]] <- df[all_index, id_col1]
      }
      if (!is.null(id_col2)) {
        guide_list[["id2"]] <- df[all_index, id_col2]
      }
    }
    
    # Handles exonic-intergenic guides
    else if (gene2 == "NegControl") {
      guide_list[["guide_type"]] <- "exonic_intergenic"
      orient1_index <- all_index & df[[label_col1]] == "exonic"
      orient2_index <- all_index & df[[label_col2]] == "exonic"
      for (col in guide_cols) {
        guide_name1 <- paste0("orient1_", col)
        guide_name2 <- paste0("orient2_", col)
        guide_list[[guide_name1]] <- df[orient1_index, col]
        guide_list[[guide_name2]] <- df[orient2_index, col]
      }
      if (!is.null(id_col1)) {
        guide_list[["orient1_id1"]] <- df[orient1_index, id_col1]
        guide_list[["orient1_id2"]] <- df[orient1_index, id_col2]
      }
      if (!is.null(id_col2)) {
        guide_list[["orient2_id1"]] <- df[orient2_index, id_col1]
        guide_list[["orient2_id2"]] <- df[orient2_index, id_col2]
      }
    }
    
    # Handles exonic-exonic guides targeting two genes
    else {
      guide_list[["guide_type"]] <- "exonic_exonic"
      index1 <- all_index & df$gene1 == gene1 & df$gene2 == gene2
      index2 <- all_index & df$gene1 == gene2 & df$gene2 == gene1
      for (col in guide_cols) {
        guide_name1 <- paste0("orient1_", col)
        guide_name2 <- paste0("orient2_", col)
        guide_list[[guide_name1]] <- df[index1, col]
        guide_list[[guide_name2]] <- df[index2, col]
      }
      if (!is.null(id_col1)) {
        guide_list[["orient1_id1"]] <- df[index1, id_col1]
        guide_list[["orient1_id2"]] <- df[index1, id_col2]
      }
      if (!is.null(id_col2)) {
        guide_list[["orient2_id1"]] <- df[index2, id_col1]
        guide_list[["orient2_id2"]] <- df[index2, id_col2]
      }
    }
    
    # Appends guides to list and sets empty values to NA
    guide_list[which(guide_list == numeric())] <- NA
    guides[[i]] <- guide_list
  }
  return(guides)
}

#' Splits guides by type.
#' 
#' Splits guide returned from \code{retrieve_guides_by_gene} by targeting type, which is 
#' one of dual-gene exonic-exonic, single-gene exonic-exonic or exonic-intergenic.
#' 
#' @param guides Dataframe returned from \code{retrieve_guides_by_gene}.
#' @return A list of three separate guide lists. Combinatorial-targeting guides are
#'   stored in the key "combn", single-targeting guides are stored in the 
#'   key "single" and dual-targeting guides are stored in the key "dual". 
split_guides_by_type <- function(guides) {
  dual <- guides[unlist(lapply(guides, function(x) x[["guide_type"]] == "dual"))]
  single <- guides[unlist(lapply(guides, function(x) x[["guide_type"]] == "single"))]
  combn <- guides[unlist(lapply(guides, function(x) x[["guide_type"]] == "combn"))]
  return(list("dual" = dual, 
              "single" = single, 
              "combn" = combn))
}

#' Normalizes reads for given screens
#' 
#' Log2 and depth-normalizes reads between a given list of columns and a given
#' column of an earlier timepoint (e.g. T0) specified in the "normalize_name"
#' entry of each screen. Screens with NULL for their "normalize_name" entry
#' are log2 and depth-normalized, but not normalized to earlier timepoints.
#' If a screen to normalize against has multiple replicates, those replicates
#' are averaged before normalization. Multiple replicates for a screen being
#' normalized, however, are normalized separately against the provided early
#' timepoint.
#' 
#' @param df Reads dataframe.
#' @param screens List of screens generated with \code{add_screens}. 
#' @param filter_names List of screen names to filter based on read counts by. 
#' @param scaling_factor Scaling factor for normalizing reads, helps make different
#'   replicates comparable but the specific choice typically has little to no 
#'   impact on results (default 1e6).
#' @param pseudocount Pseudocount for normalized reads, where lower values are
#'   such as 1-10 are recommended for most combinatorial screens with sufficient
#'   coverage, e.g. 200x and upwards (default 1).
#' @param min_reads Minimum number of reads to keep (default 30, anything
#'   below this value will be filtered out).
#' @param max_reads Maximum number of reads to keep (default 10000, anything
#'   above this value will be filtered out).
#' @param nonessential_norm Whether or not to normalize each screen against its
#'   population of core non-essential genes, as defined by Traver et al. 2015 
#'   (default FALSE).
#' @param nonessential_genes Replaces the Traver et al. 2015 nonessential gene 
#'   list with this list if specified (default NULL)
#' @return Normalized dataframe.
#' @export 
normalize_screens <- function(df, screens, filter_names = NULL, scaling_factor = 1e6, 
                              pseudocount = 1, min_reads = 30, max_reads = 10000,
                              nonessential_norm = TRUE, nonessential_genes = NULL) {
  
  # Checks for input errors
  check_screen_params(df, screens)
  
  # Sets non-essential genes 
  nonessentials <- traver_nonessentials
  if (!is.null(nonessential_genes)) {
    nonessentials <- nonessential_genes
  }
  
  # Flags guides with too few read counts
  all_names <- names(screens)
  for (name in filter_names) {
    if (!(name %in% all_names)) {
      cat(paste("WARNING: screen", name, "not found, data not filtered by this screen\n"))
    }
  }
  to_remove <- rep(FALSE, nrow(df))
  filter_cols <- sapply(screens[filter_names], "[[", "replicates")
  for (col in filter_cols) {
    to_remove[df[,col] < min_reads] <- TRUE
  }
  sum_low <- sum(to_remove)
  for (col in filter_cols) {
    to_remove[df[,col] > max_reads] <- TRUE
  }
  sum_high <- sum(to_remove) - sum_low
  removed_guides_ind <- which(to_remove)
  
  # Log2 and depth-normalizes every screen
  for (screen in screens) {
    for (col in screen[["replicates"]]) {
      df[,col] <- normalize_reads(df[,col], scaling_factor, pseudocount)
    }
  }
  
  # Normalizes specified screens to earlier timepoints
  for (screen in screens) {
    normalize_name <- screen[["normalize_name"]]
    if (!is.null(normalize_name)) {
      if (normalize_name %in% all_names) {
        for (col in screen[["replicates"]]) {
          rep_cols <- screens[[normalize_name]][["replicates"]]
          rep_norm <- df[,rep_cols]
          if (length(rep_cols) > 1) {
            rep_norm <- rowMeans(rep_norm)
          }
          df[,col] <- df[,col] - rep_norm
        }
      } else {
        cat(paste("WARNING: screen", normalize_name, "not found.\n"))
      }
    }
  }
  
  # Normalizes all LFCs against per-screen non-essential median LFCs if specified
  nonessential_ind <- (df$gene1 %in% nonessentials) & (df$gene2 %in% nonessentials)
  if (sum(nonessential_ind) > 0 & nonessential_norm) {
    for (screen in screens) {
      for (col in screen[["replicates"]]) {
        nonessential_vals <- as.numeric(unlist(df[nonessential_ind, col]))
        nonessential_median <- stats::median(nonessential_vals, na.rm = TRUE)
        df[,col] <- df[,col] - nonessential_median
      }
    }
  } else {
    cat(paste("Skipping LFC normalization to median per-screen non-essential gene LFC\n"))
  }
  
  # Removes flagged guides
  df <- df[!to_remove,]
  cat(paste("Excluded a total of", sum_low, "guides for low t0 representation\n"))
  cat(paste("Excluded a total of", sum_high, "guides for high t0 representation\n"))
  return(df)
}

#' Filters guides with too few read counts.
#' 
#' Filters guides out with too few read counts from a given reads dataframe
#' and a given set of columns (e.g. T0 columns).
#' 
#' @param df Reads dataframe.
#' @param cols Columns to filter by.
#' @param min_reads Minimum number of reads to keep (anything below
#'   this value will be filtered out).
#' @param max_reads Maximum number of reads to keep (anything above
#'   this value will be filtered out).
#' @return Filtered dataframe.
filter_reads <- function(df, cols, min_reads = 30, max_reads = 10000) {
  to_remove <- rep(FALSE, nrow(df))
  for (col in cols) {
    to_remove[df[,col] < min_reads] <- TRUE
  }
  sum_low <- sum(to_remove)
  for (col in cols) {
    to_remove[df[,col] > max_reads] <- TRUE
  }
  sum_high <- sum(to_remove) - sum_low
  removed_guides_ind <- which(to_remove)
  df <- df[!to_remove,]
  cat(paste("Excluded a total of", sum_low, "guides for low t0 representation\n"))
  cat(paste("Excluded a total of", sum_high, "guides for low t0 representation\n"))
  return(df)
}

#' Log-normalizes reads.
#' 
#' Log2-normalizes reads with a given pseudocount and scaling factor, and also
#' depth-normalizes the data. 
#' 
#' @param df List of read counts.
#' @param scaling_factor Scaling factor (default 1e6).
#' @param pseudocount Pseudocount (default 1).
#' @return Log- and depth-normalized read counts.
#' @export
normalize_reads <- function(df, scaling_factor = 1e6, pseudocount = 1) {
  log2((df / sum(df, na.rm = TRUE)) * scaling_factor + pseudocount)
}

# Inner function to get sorted merge of two columns for a single row. 
# Maintains a special order for intergenic "NegControl" values and "None"
sorted_merge <- function(row, col1, col2) {
  temp <- sort(c(row[[col1]], row[[col2]]))
  if (temp[1] == "None" | temp[1] == "NegControl") {
    second_item <- temp[2]
    temp[2] <- temp[1]
    temp[1] <- second_item
  }
  temp <- paste0(temp[1], ";", temp[2])
  return(temp)
}
HenryWard/orthrus documentation built on June 2, 2023, 10:28 p.m.