R/load_datasets.R

Defines functions load_datasets

Documented in load_datasets

#
# tb.treatments = table(c(df.annotation$condition, df.annotation$condition_optional))
# tb.treatments = tb.treatments[names(tb.treatments) != ""]
#
# # tissue and treatment distributions
# tb.tissues <- table(df.annotation$tissue)
# v.conditionGroups = names(tb.treatments)
#
# # loading gene expression matrix
# m.expression <- read.table(filename.geneExpression, row.names = 1, header = TRUE, sep = "\t", quote = "", stringsAsFactors = FALSE)
# m.expression <- as.matrix(m.expression)
#
# tb.conditions = table(df.annotation$condition)
# tb.conditions[names(table(df.annotation$condition_optional))] = tb.conditions[names(table(df.annotation$condition_optional))] + table(df.annotation$condition_optional)
#

#' Load the datasets used by the analysis
#'
#' Reads the gene list, the experiment ids, the differential expression
#' matrices (fold change and p-value), the experiment condition / tissue
#' annotation, the transcription factor annotation and the gene group table
#' from tab-separated text files and returns them bundled in one list.
#'
#' @param filename.genes Path to a headerless tab-separated file; its first
#'   column is used as the gene ids (row names of both matrices).
#' @param filename.experiment_ids Path to a headerless tab-separated file; its
#'   first column is used (as character) as the column names of both matrices.
#' @param filename.foldChange_differentialExpression Path to a headerless
#'   tab-separated table of fold changes (one row per gene, one column per
#'   experiment id).
#' @param filename.pvalue_differentialExpression Path to a headerless
#'   tab-separated table of p-values with the same layout as the fold change
#'   table.
#' @param filename.experiment_condition_tissue_annotation Path to a
#'   tab-separated file with a header that must contain the columns
#'   \code{series_id}, \code{condition}, \code{condition_optional},
#'   \code{tissue} and \code{unique_ID}. Rows with a missing \code{unique_ID}
#'   are dropped.
#' @param filename.transcriptionfactor_annotation Path to a tab-separated file
#'   with a header; its \code{TF_ID} column is matched against the gene ids.
#' @param filename.geneGroups Path to a tab-separated file with a header and a
#'   \code{Gene_ID} column; every other column is treated as one gene group in
#'   which the value 1 marks membership.
#' @return A named list with the elements
#'   \code{m.foldChange_differentialExpression},
#'   \code{m.pvalue_differentialExpression},
#'   \code{df.experiment_condition_annotation},
#'   \code{tb.condition_treatments}, \code{tb.condition_tissues},
#'   \code{df.transcriptionFactorAnnotation}, \code{df.geneGroups},
#'   \code{tb.geneGroups}, \code{v.geneGroups}, \code{l.geneGroups} and
#'   \code{genes}.
#' @export
#' @examples
#' \dontrun{
#' l.data <- load_datasets()
#' }
load_datasets = function(filename.genes = "data/genes.txt",
                         filename.experiment_ids = "data/experiment_ids.txt",
                         filename.foldChange_differentialExpression = "data/m.foldChange_differentialExpression.txt",
                         filename.pvalue_differentialExpression = "data/m.pvalue_differentialExpression.txt",
                         filename.experiment_condition_tissue_annotation = "data/df.experiment_condition_annotation.txt",
                         filename.transcriptionfactor_annotation = "data/df.transcriptionFactorAnnotation.txt",
                         filename.geneGroups = "data/df.enzymes_w_metabolic_domains.txt"){

  # ---- identifiers -------------------------------------------------------
  genes <- read.table(filename.genes, header = FALSE, sep = "\t", stringsAsFactors = FALSE)[, 1]
  experiment_series_ids <- read.table(filename.experiment_ids, header = FALSE, sep = "\t", stringsAsFactors = FALSE)[, 1]
  experiment_series_ids <- as.character(experiment_series_ids)

  # ---- experiment annotation ---------------------------------------------
  df.annotation <- read.csv(filename.experiment_condition_tissue_annotation, header = TRUE, sep = "\t", fill = TRUE, stringsAsFactors = FALSE)
  v.colnames_mandatory <- c("series_id", "condition", "condition_optional", "tissue", "unique_ID")
  if (!all(v.colnames_mandatory %in% names(df.annotation))) {
    stop(paste("could not find all mandatory columns in file:", paste(v.colnames_mandatory, collapse = ", ")))
  }
  # only annotated experiments (those carrying a unique_ID) are kept
  df.annotation <- df.annotation[!is.na(df.annotation$unique_ID), , drop = FALSE]

  # ---- differential expression tables ------------------------------------
  df.foldChange_differentialExpression <- read.table(filename.foldChange_differentialExpression, header = FALSE, sep = "\t", stringsAsFactors = FALSE)
  df.pvalue_differentialExpression <- read.table(filename.pvalue_differentialExpression, header = FALSE, sep = "\t", stringsAsFactors = FALSE)

  if (length(genes) == 0) {
    stop("Error: no genes found")
  }
  if (length(experiment_series_ids) == 0) {
    stop("Error: no experiments found")
  }
  if (nrow(df.annotation) == 0) {
    stop("Error: no condition annotation found")
  }
  if (nrow(df.foldChange_differentialExpression) == 0) {
    stop("Error: no differential expression foldchange found")
  }
  if (nrow(df.pvalue_differentialExpression) == 0) {
    stop("Error: no differential expression pvalue found")
  }

  # Fail with an explicit message instead of the generic dimnames error that
  # the row / column name assignment below would otherwise raise.
  v.expected_dim <- c(length(genes), length(experiment_series_ids))
  if (!identical(dim(df.foldChange_differentialExpression), v.expected_dim)) {
    stop(paste0("Error: differential expression foldchange has dimensions ",
                paste(dim(df.foldChange_differentialExpression), collapse = " x "),
                " but ", paste(v.expected_dim, collapse = " x "),
                " (genes x experiments) were expected"))
  }
  if (!identical(dim(df.pvalue_differentialExpression), v.expected_dim)) {
    stop(paste0("Error: differential expression pvalue has dimensions ",
                paste(dim(df.pvalue_differentialExpression), collapse = " x "),
                " but ", paste(v.expected_dim, collapse = " x "),
                " (genes x experiments) were expected"))
  }

  m.foldChange_differentialExpression <- data.matrix(df.foldChange_differentialExpression, rownames.force = NA)
  m.pvalue_differentialExpression <- data.matrix(df.pvalue_differentialExpression, rownames.force = NA)
  rownames(m.foldChange_differentialExpression) <- genes
  rownames(m.pvalue_differentialExpression) <- genes
  colnames(m.foldChange_differentialExpression) <- experiment_series_ids
  colnames(m.pvalue_differentialExpression) <- experiment_series_ids

  # ---- treatment / tissue counts -----------------------------------------
  # table() already ignores NA values; the counts themselves are never NA.
  tb.treatments <- table(c(df.annotation$condition, df.annotation$condition_optional))

  tb.condition_tissues <- table(df.annotation$tissue)
  v.treatments <- unique(c(df.annotation$condition, df.annotation$condition_optional))
  # Drop empty AND missing treatment labels: `NA == ""` is NA, so a plain
  # comparison would keep NA and inject a bogus NA entry into the table.
  v.treatments <- v.treatments[!is.na(v.treatments) & v.treatments != ""]
  tb.condition_treatments <- tb.treatments[v.treatments]

  # ---- transcription factor annotation -----------------------------------
  df.transcriptionFactorAnnotation <- read.table(filename.transcriptionfactor_annotation, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  if (nrow(df.transcriptionFactorAnnotation) == 0) {
    stop("Error: no transcription factor annotation found")
  }
  # flag transcription factors that are part of the expression data
  df.transcriptionFactorAnnotation["with_geneExpression"] <- "no"
  df.transcriptionFactorAnnotation$with_geneExpression[which(df.transcriptionFactorAnnotation$TF_ID %in% genes)] <- "yes"

  # ---- gene groups -------------------------------------------------------
  df.geneGroups <- read.table(filename.geneGroups, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  if (nrow(df.geneGroups) == 0) {
    stop("Error: no gene group annotation found")
  }

  rownames(df.geneGroups) <- df.geneGroups$Gene_ID
  # drop = FALSE keeps a data.frame even when only one group column remains
  df.geneGroups <- df.geneGroups[, !names(df.geneGroups) %in% c("Gene_ID"), drop = FALSE]

  tb.geneGroups <- colSums(df.geneGroups)
  v.geneGroups <- colnames(df.geneGroups)

  # per group: member genes (value 1) that are also present in the gene list
  l.geneGroups <- lapply(v.geneGroups, function(group) {
    members <- rownames(df.geneGroups)[which(df.geneGroups[, group] == 1)]
    intersect(members, genes)
  })
  names(l.geneGroups) <- v.geneGroups

  # flag group genes that are part of the expression data
  df.geneGroups["with_geneExpression"] <- "no"
  df.geneGroups$with_geneExpression[which(rownames(df.geneGroups) %in% genes)] <- "yes"

  return(list(m.foldChange_differentialExpression = m.foldChange_differentialExpression,
              m.pvalue_differentialExpression = m.pvalue_differentialExpression,
              df.experiment_condition_annotation = df.annotation,
              tb.condition_treatments = tb.condition_treatments,
              tb.condition_tissues = tb.condition_tissues,
              df.transcriptionFactorAnnotation = df.transcriptionFactorAnnotation,
              df.geneGroups = df.geneGroups,
              tb.geneGroups = tb.geneGroups,
              v.geneGroups = v.geneGroups,
              l.geneGroups = l.geneGroups,
              genes = genes
  ))
}
mbanf/MERIT documentation built on June 16, 2021, 1:07 p.m.