R/archaic_pool.R

Defines functions archaic_pool

Documented in archaic_pool

#' @title Aggregating signature counts from MFF files from one or more studies
#' (folders).
#'
#' @description Aggregate signature counts data using archaic_prepare() from
#' multiple study directories into a single matrix.
#'
#' @param dat may be a list of count data matrices (or data frames) generated
#'  by archaic_prepare(), or a character vector of folder names containing MFF
#'  files. For each folder, a cached file named after the last component of the
#'  folder name with extension ".rda" is looked up by appending that file name
#'  directly to the folder name (so the folder name is expected to end in "/");
#'  when no such file exists, archaic_prepare() is run on the folder.
#'
#' @return A matrix of counts with the samples of all studies combined. The
#' number of rows is the total number of samples across all the studies
#' considered (studies are stacked in the order given, so a sample name that
#' occurs in several studies keeps one row per study). The columns are the
#' union of the mismatch signatures seen in the studies, and the matrix cells
#' record the counts of the signatures occurring in the samples; a signature
#' that is absent from a study is recorded as 0 for the samples of that study.
#'
#'
#' @keywords archaic_pool
#' @export



archaic_pool <- function(dat) {

  if (is.list(dat) && !is.data.frame(dat)) {

    # dat is a list of count matrices generated by archaic_prepare()
    cat("The data is read as a list of matrices - processed by archaic_prepare() \n")
    datalist <- dat

  } else if (is.character(dat)) {

    # dat is a vector of folder names to be read or processed
    message("The data is read as names of folders")
    message("Checking if the folders exist")
    folders <- dat
    missing_dirs <- folders[!file.exists(folders)]
    if (length(missing_dirs) > 0) {
      stop("The folder ", missing_dirs[1],
           " in the folder list does not exist:  aborting", call. = FALSE)
    }

    datalist <- lapply(folders, function(folder) {
      parts <- strsplit(folder, "/")[[1]]
      study <- parts[length(parts)]
      # The cache path is built exactly as before (plain concatenation), so
      # it only resolves inside the folder when `folder` ends with "/".
      rda_file <- paste0(folder, study, ".rda")
      if (file.exists(rda_file)) {
        # Load into a scratch environment so that objects stored in the file
        # cannot overwrite the local variables of this function.
        load_env <- new.env()
        loaded <- load(rda_file, envir = load_env)
        counts <- get(loaded[1], envir = load_env)
        cat("Successfully read .RData file from the folder, ", folder, "CHECK : \n")
      } else {
        message(".RData file not found in folder ", folder,
                " running archaic_prepare on the MFF files in the folder")
        counts <- archaic_prepare(folder)
      }
      counts
    })

  } else {
    stop("'dat' must be a list of count matrices or a character vector of ",
         "folder names", call. = FALSE)
  }

  if (length(datalist) == 0) {
    stop("'dat' is empty: there is nothing to pool", call. = FALSE)
  }

  # Signatures are aligned by column name, so every study needs dimnames.
  is_poolable <- function(x) {
    length(dim(x)) == 2L &&
      (ncol(x) == 0L || !is.null(colnames(x))) &&
      (nrow(x) == 0L || !is.null(rownames(x)))
  }
  poolable <- vapply(datalist, is_poolable, logical(1))
  if (!all(poolable)) {
    stop("every study must be a matrix or data frame with row and column ",
         "names; check element(s) ",
         paste(which(!poolable), collapse = ", "), call. = FALSE)
  }

  sig_names <- Reduce(union, lapply(datalist, colnames))
  row_names_pool <- unlist(lapply(datalist, rownames), use.names = FALSE)

  pooled_data <- matrix(0, length(row_names_pool), length(sig_names))
  rownames(pooled_data) <- row_names_pool
  colnames(pooled_data) <- sig_names

  # Rows are placed by position (running offset) rather than by name lookup:
  # matching on names would send a sample name repeated across studies to the
  # first occurrence, overwriting it and leaving an all-zero row behind.
  offset <- 0L
  for (study_counts in datalist) {
    rows <- offset + seq_len(nrow(study_counts))
    cols <- match(colnames(study_counts), sig_names)
    pooled_data[rows, cols] <- as.matrix(study_counts)
    offset <- offset + nrow(study_counts)
  }

  pooled_data
}
kkdey/aRchaic documentation built on Jan. 17, 2021, 5:33 p.m.