R/make_auc_matrix.R

Defines functions make_auc_matrix

Documented in make_auc_matrix

#' @title Make an n*p AUC matrix for a given sample set and mark
#' @description This function takes as input a .csv file with fields: Label, Mark, Bigwig
#' @description This is a new line ...
#' @details What's this?
#' @param input_data The metadata for each sample. A .csv/.xslx file that contains 'Label', 'Mark', and 'Bigwig' fields.
#' @param root TO ADD
#' @param roi Region of interest, a sorted Granges object that specifies the genomic regions to calculate AUC. It also need 'gene' in the metadata
#' @param mark Which mark in \code{input_data} to use
#' @export

make_auc_matrix <- function(input_data, roi, mark, tmp_dir, quantile_norm=TRUE) {
  
  require(tidyverse)
  require(stringr)
  require(readr)
  require(limma)
  require(readxl)
  
  if(grepl("\\.csv$", input_data)) {
    dat = readr::read_csv(input_data)
  }
  if(grepl("\\.xlsx$", input_data)) {
    dat = read_excel(input_data)
  }
  
  dat = dplyr::filter(dat, Mark==mark)
  
  res_matrix = matrix(NA, nrow=dim(dat)[1], ncol=length(roi))
  roi_df = as.data.frame(roi)[,1:3]
  
  colnames(res_matrix) = str_replace_all(apply(roi_df, 1, function(x) paste(as.character(x), collapse="_")), "[[:blank:]]+", "")
  rownames(res_matrix) = dat$Label
  
  if(!dim(res_matrix)[1]) { # no samples for this mark - return empty matrix
    
    return(list(res=res_matrix, annot=data.frame()))
    
  } else {
    
    tmp_bed = str_replace(tempfile(tmpdir=tmp_dir, pattern="", fileext=".bed"), "\\\\", "")
    roi_df[,2] = as.character(roi_df[,2])
    roi_df[,3] = as.character(roi_df[,3]) # make sure scinot is not used in export
    write_tsv(roi_df, tmp_bed, col_names=FALSE)
    
    for(i in 1:dim(dat)[1]) {
      
      if(is.na(dat$Bigwig[i])) {
        print(paste("Sample", i, "has no data for this mark, skipping ..."))
        next
      } else {
        print(paste("Sample", i))
      }
      
      cmd = paste0("WiggleTools/wiggletools apply AUC ", tmp_bed, " ", dat$Bigwig[i])
      res = system(cmd, intern=TRUE)
      
      if(!length(res)) {
        print(paste("Sample", i, "did not work with WiggleTools - no data returned, skipping ..."))
        next
      }
      
      res_idx = grep("fixedStep", res)
      
      # need to clean wiggletools output because of, for example:
      # 'fixedStep chrom=chr11 start=867501 step=1' at certain lines
      if(length(res_idx)) res = res[-res_idx]
      # parse res
      res_coords = str_replace(res, "^(.*)\t(.*)\t(.*)\t(.*)$", "\\1_\\2_\\3")
      res_score = as.numeric(str_replace(res, "^(.*)\t(.*)\t(.*)\t(.*)$", "\\4"))
      res_match = match(colnames(res_matrix), res_coords)
      res_score = res_score[res_match]
      res_matrix[i,] = res_score
      
    }
    
    if(quantile_norm) {
      r_ix = which(rowSums(res_matrix, na.rm=TRUE)==0)
      if(length(r_ix)) {
        res_trans = t(res_matrix[-r_ix,])
        res_trans = normalizeQuantiles(res_trans)
        res_matrix[-r_ix,] = t(res_trans)
      } else {
        res_trans = t(res_matrix)
        res_trans = normalizeQuantiles(res_trans)
        res_matrix = t(res_trans)
      }
    }
    
    return(list(res=res_matrix, annot=dat))
    
  }
}
aidanmacnamara/epiChoose documentation built on Dec. 26, 2021, 3:13 a.m.