R/dataload_main.R

Defines functions load_data load_main tree.check

Documented in load_data load_main tree.check

# File loading functionality

#' Load data sets from pre-specified sources
#' @description This function loads all the necessary data sets (metabolomics and 16S only) from
#'    pre-specified sources. The difference between `taxa` and `timetaxa` is that `timetaxa` has specific
#'    taxa based on time while `taxa` collects data jointly analzyed between the 6W and 12M time point. All
#'    directories should be in a \code{data_directory.csv} file.
#' @param type Type of data required to load, can be of class "metabo", "taxa", "reference" or "all" or "timetaxa"
#' @param file Directory towards the \code{data_directory.csv} file.
#' @export
load_data <- function(file, type){
  # error handling
  if(dir.exists("//afs/northstar/users") == F){
    stop("Cannot load files, either AFS is not connected or the files do not exist and have changed")
  }
  dir <- utils::read.csv(file, row.names = 1) # data directory file
  dir$directory <- as.character(dir$directory)
  if (type == "reference" | type == "all"){
    message("loading reference data...")
    mblreference <- utils::read.csv(file = dir["mblreference",])
  }
  if (type == "metabo" | type == "all"){
    message("loading metabolic data...")
    metabo <- readRDS(file = dir["metabo",])
    metabo.key <- readRDS(file = dir["metabokey",])
  }
  if (type == "timetaxa" | type == "all"){
    message("Loading time point specific taxa data...")
    tax.6W <- readRDS(dir["tax6W",])
    tax.12M <- readRDS(dir["tax12M",])
    tax.6W.key <- readRDS(dir["tax6Wkey",])
    tax.12M.key <- readRDS(dir["tax12Mkey",])
  }
  if (type == "taxa" | type == "all"){
    message("loading joint processed data ...")
    tax <- readRDS(dir["taxall",])
    tax.key <- readRDS(dir["taxallkey",])
  }

  # object export
  message("Exporting to File...")
  if (type == "all"){
    result <- list(metabo, metabo.key, tax.6W, tax.12M, tax.6W.key, tax.12M.key, mblreference, tax, tax.key)
    names(result) <- c('metabo', 'metabo.key',
                       'tax.6W', 'tax.12M', 'tax.6W.key', 'tax.12M.key',
                       'mblreference',
                       "full.tax", "full.tax.key")
  } else if (type == "metabo"){
    result <- list(metabo, metabo.key)
    names(result) <- c('metabo','metabo.key')
  } else if (type == "taxa"){
    result <- list(tax.6W, tax.12M, tax.6W.key, tax.12M.key)
    names(result) <- c('tax.6W', 'tax.12M', 'tax.6W.key', 'tax.12M.key')
  } else if (type == "reference"){
    result <- list(mblreference)
    names(result) <- c('mblreference')
  }

  return(result)
}


#' @title Main function to load all data
#' @description Stringing all data loading functions together to load matching metabolomics and 16S data sets.
#'     For now, the function doesn't have matching between 6W and 12M samples.
#' @param file Directory containing the file \code{data_directory.csv}.
#' @export
load_main <- function(file){
  data <- load_data(file = file, type = "all") #load all possible data

  # separate by time
  metabo.6W <- data$metabo[data$metabo$TimePeriod == '6W',]
  metabo.12M <- data$metabo[data$metabo$TimePeriod == '12M',]

  # impute and remove missing labels
  message("For 12M samples...")
  metabo.12M <- missing.imputation(reference = data$mblreference, data = metabo.12M, timelabel = "12M")
  extraction.index <- extraction(metabo.12M, tax = data$full.tax)
  metabo.12M <- extraction.index[[1]]
  tax.12M <- extraction.index[[2]]

  message("For 6W samples...")
  metabo.6W <- missing.imputation(reference = data$mblreference, data = metabo.6W, timelabel = "6W")
  extraction.index <- extraction(metabo.6W, tax = data$full.tax)
  metabo.6W <- extraction.index[[1]]
  tax.6W <- extraction.index[[2]]

  # Post processing
  message("Post processing...")
  # assigning similar row names and removing the first 6 unecessary columns
  rownames(metabo.12M) <- metabo.12M$mblid
  metabo.12M <- metabo.12M[,-c(1:6)]
  rownames(metabo.6W) <- metabo.6W$mblid
  metabo.6W <- metabo.6W[,-c(1:6)]
  # dealing with p-histidine which is a pain
  colnames(metabo.12M)[ncol(metabo.12M)] <- "p-Methylhistidine"
  colnames(metabo.6W)[ncol(metabo.6W)] <- "p-Methylhistidine"
  # returning all data frame in proper matrix form
  names <- rownames(metabo.12M)
  metabo.12M <- apply(metabo.12M,MARGIN = 2, as.numeric)
  rownames(metabo.12M) <- names
  names <- rownames(metabo.6W)
  metabo.6W <- apply(metabo.6W, MARGIN = 2, as.numeric)
  rownames(metabo.6W) <- names
  rm(names)
  # removing the standard column
  metabo.12M <- metabo.12M[,-7]
  metabo.6W <- metabo.6W[,-7]
  #transforming the taxonomic matrix back to matrix form
  tax.12M <- as.matrix(tax.12M)
  tax.6W <- as.matrix(tax.6W)

  result <- list(metabo.6W, metabo.12M, tax.6W, tax.12M, data$metabo.key, data$full.tax.key)
  names(result) <- c("metabo.6W", "metabo.12M", "tax.6W", "tax.12M", "metabo.key", "tax.key")
  return(result)
}

#' @title Checking for phylogenetic tree and constructing tree
#' @description Check whether the tree file exists in a directory. If not, construct tree from
#'     16S sequences stored in the joint analysis file (Available for taxa-all options only). The FastTree
#'     program will be run externally to generate trees accordingly.
#' @param file Directory containing the \code{data_directory.csv} file.
#' @param progfile Directory containing the \code{FastTree} executable.
#' @examples
#'     # do not run
#'     # tree.check(file = "./data_directory.csv", progfile = "../software/FastTree")
#' @export

tree.check <- function(file, progfile){
  dir <- utils::read.csv(file, row.names = 1)
  dir$directory <- as.character(dir$directory)
  if (!file.exists("./tree.tre")){
    message("No tree exists")
    seqs <- readRDS(dir["treeall",])
    alignment <- DECIPHER::AlignSeqs(Biostrings::DNAStringSet(seqs), anchor = NA, verbose = TRUE)
    phagAlign <- phangorn::phyDat(methods::as(alignment, "matrix"), type = "DNA")
    phangorn::write.phyDat(phagAlign, file = "alignment.fasta", format = "fasta")
    system(paste(progfile,"-gtr -nt alignment.fasta > tree.tre"))
  }
  tree <- ape::read.tree(file = "tree.tre")
  return(tree)
}
quangnguyen1995/MCTools documentation built on May 23, 2019, 8:56 a.m.