R/read.R

Defines functions read_molecule_file read.sim.profiles.from.master .get.sheet.info.str .read.all.sheet is.master read.master.file read.obs.profiles is.profile read.pop.profiles is.matched.profiles read.pop.profiles.from.master groups.master .match.mols.from.string .id.col .id.molecule.cols .id.time.col

# extract time column information and unit
.id.time.col <- function(df, match = "time [") {
  col.names <- tolower(colnames(df))
  col.nr <- which(grepl(match, col.names, fixed = TRUE))

  if (length(col.nr) != 1) {
    stop("Could not identify time column")
  }

  result <- list(id = "Time", col = col.nr, unit = .extract.unit(col.names[col.nr]))

  return(result)
}


# extract molecule column information and unit
.id.molecule.cols <- function(df, match.tag, add.match.tags = c(), is.fraction = F, fixed.unit = NA,
                              silent = F) {
  col.names <- tolower(colnames(df))
  match.tag <- tolower(match.tag)
  if (length(add.match.tags) > 0) {
    add.match.tags <- tolower(add.match.tags)
  }

  grep.tag <- match.tag

  if (length(add.match.tags > 0)) {
    grep.tag <- c(grep.tag, add.match.tags)
    grep.tag <- sapply(grep.tag, function(x) paste("(?=.*", x, ")", sep = ""), simplify = T, USE.NAMES = F)
    grep.tag <- paste(grep.tag, collapse = "")
  }

  col.nrs <- c(grep(grep.tag, col.names, perl = TRUE))
  n.cols <- length(col.nrs)

  if (n.cols < 1) {
    if (silent)
      return(NULL)

    stop(paste("Could not identify value column for match tag <", match.tag, ">"))
  }

  unit <- NA
  if (!is.na(fixed.unit)) {
    unit <- fixed.unit
  } else if (is.fraction) {
    unit <- units::as_units(units::unitless)
  } else {
    unit <- .extract.unit(col.names[col.nrs[1]])
  }

  result <- list(
    id = match.tag,
    cols = col.nrs,
    unit = unit
  )

  return(result)
}


.id.col <- function(df, match, fixed = TRUE) {
  col.names <- trimws(tolower(colnames(df)))
  match <- tolower(match)

  col.nr <- which(grepl(match, col.names, fixed = fixed))

  if (identical(col.nr, integer(0))) {
    return(NA)
  }

  return(col.nr)
}


.match.mols.from.string <- function(ids, molecules) {
  mols <- list()
  ids <- trimws(tolower(ids))
  for (id in ids) {
    match <- FALSE
    for (mol in molecules) {
      if (id == tolower(mol$id)) {
        match <- TRUE
        mols <- append(mols, list(mol))
        break
      }
    }

    if (!match) {
      stop(paste("Could not find molecule with id <", id, ">"))
    }
  }

  return(mols)
}

groups.master <- function(master.data) {
  group_idx <- which(grepl("group", tolower(colnames(master.data$data))))
  has_groups <- length(group_idx) > 0
  if (has_groups) {
    return(colnames(master.data$data)[group_idx])
  }

  return(NULL)
}


read.pop.profiles.from.master <- function(master.data,
                                          molecules = list(),
                                          files.dir = ".",
                                          pop.file.ext = "",
                                          pop.file.format = c("auto", "xsl", "csv"),
                                          pop.file.csv.sep = ",",
                                          pop.file.csv.dec = ".",
                                          pop.file.xls.sheet = 1) {
  if (!is.master(master.data)) {
    stop("Input must be of class master")
  }

  data <- master.data$data
  lines <- which(!is.na(data$pop))

  group_names <- groups.master(master.data)

  results <- list()
  for (i in lines) {
    pop <- trimws(data$pop[i])
    mols <- trimws(data$pop.mol[i])
    obs.ids <- if (is.na(data$obs[i])) NA else trimws(unlist(strsplit(data$obs[i], ",")))
    mols <- .match.mols.from.string(unlist(strsplit(mols, ",", fixed = TRUE)),
      molecules = molecules
    )

    fixed.pop <- gsub("/", "_", pop)
    file <- paste0(fixed.pop, pop.file.ext)
    file <- file.path(files.dir, file)

    profiles <- read.pop.profiles(file,
      molecules = mols,
      format = pop.file.format,
      csv.sep = pop.file.csv.sep,
      csv.dec = pop.file.csv.dec,
      xls.sheet = pop.file.xls.sheet
    )

    plot.infos <- list(
      x.min = data$x.min[i],
      x.max = data$x.max[i],
      y.min = data$y.min[i],
      y.max = data$y.max[i],
      y.min = data$y.min[i],
      y.max = data$y.max[i],
      y.min.log = data$y.min.log[i],
      y.max.log = data$y.max.log[i],
      x.offset = data$x.offset[i],
      main = data$main[i]
    )

    groups <- NULL
    if (length(group_names) > 0)
      groups <- as.character(data[i, group_names])

    pop.sim <- list(
      id = pop, obs.ids = obs.ids, profiles = profiles,
      groups = groups,
      plot.infos = plot.infos
    )
    class(pop.sim) <- "MatchedProfiles"
    results <- append(results, list(pop.sim))
  }

  return(results)
}

is.matched.profiles <- function(x) inherits(x, "MatchedProfiles")


# Read study list from file
read.pop.profiles <- function(file,
                              molecules = list(),
                              format = c("auto", "xsl", "csv"),
                              add.file.ext = "",
                              csv.sep = ",",
                              csv.dec = ".",
                              xls.sheet = 1) {
  file <- paste0(file, add.file.ext)
  base.name <- basename(file)
  message(paste("Reading profile from file <", base.name, ">"))

  if (!.does.file.exist(file)) {
    stop(paste("File <", file, "> does not exist"))
  }

  if (!.is.file.readable(file)) {
    stop(paste("File <", file, "> is not readable"))
  }

  format <- match.arg(format)

  if (format == "auto") {
    format <- .identify.file.ext(file)
    if (is.na(format)) {
      stop(paste("File <", file, "> has unknown file extension. Please specify the format."))
    }
  }

  if (format == "tsv") {
    format <- "csv"
  }

  df <- NA
  if (format == "xls") {
    excel.sheet <- suppressMessages(readxl::read_excel(file,
      sheet = xls.sheet,
      col_names = TRUE
    ))
    df <- as.data.frame(excel.sheet)
  }

  if (format == "csv") {
    df <- .read.csv(
      file = file, header = TRUE, sep = csv.sep, dec = csv.dec,
      encoding = "UTF-8"
    )
  }

  # extract time, individual and molecule columns
  time.info <- .id.time.col(df)
  mol.infos <- list()
  for (mol in molecules) {
    match <- .id.molecule.cols(df, mol$file.name.match,
      mol$add.file.matcher,
      is.fraction = mol$is.fraction, fixed.unit = mol$fixed.unit
    )

    if (length(match$cols) > 1) {
      stop(paste("Ambiguous column found for tag <", mol$file.name.match, ">"))
    }

    mol.infos <- c(mol.infos, list(match))
  }

  id.col <- .id.col(df, "IndividualId")
  if (is.na(id.col) || length(id.col) > 1) {
    stop("Could not identify individual id column")
  }

  # extract time data and id data
  times <- sort(unique(df[, time.info$col]))
  # prepare results
  results <- list()
  for (i in 1:length(molecules)) {
    mol <- molecules[[i]]
    entry <- list(
      molecule = mol,
      id = "",
      reference = "",
      group = NA,
      time.unit = time.info$unit,
      value.unit = mol.infos[[i]]$unit,
      data = data.frame(Time = times),
      type = "population",
      data.type = "individual",
      origin = "sim"
    )
    class(entry) <- "profile"
    results <- c(results, list(entry))
  }

  # for every id
  ids <- sort(unique(df[, id.col]))
  for (id in ids) {
    id.df <- df[df[id.col] == id, ]
    id.df <- id.df[order(id.df[, time.info$col]), ]
    for (i in 1:length(molecules)) {
      results[[i]]$data <- cbind(results[[i]]$data, id.df[, mol.infos[[i]]$cols])
    }

    # check for duplicated times
    tmp.data <- results[[i]]$data
    dup.times <- duplicated(tmp.data$Time)
    if (any(dup.times)) {
      message(paste("Deleted duplicated time entries for file <", base.name, ">"))
      results[[i]]$data <- tmp.data[!dup.times, ]
    }
  }

  for (i in 1:length(molecules)) {
    colnames(results[[i]]$data) <- c("Time", paste("ID", ids))
  }

  return(results)
}

is.profile <- function(x) inherits(x, "profile")


read.obs.profiles <- function(file,
                              obs.sheet = 1,
                              reference.sheet = 2,
                              molecules = list(),
                              id.filter = NA) {
  if (length(id.filter) > 1 || !is.na(id.filter)) {
    id.filter <- unlist(lapply(id.filter, toString))
  }

  base.name <- basename(file)
  if (!.does.file.exist(file)) {
    stop(paste("File <", base.name, "> does not exist"))
  }

  if (!.is.file.readable(file)) {
    stop(paste("File <", base.name, "> is not readable"))
  }

  if (!.does.sheet.exist(file, obs.sheet)) {
    stop(paste("Observed data sheet in file <", base.name, "> does not exist"))
  }

  if (!.does.sheet.exist(file, reference.sheet)) {
    stop(paste("Observed data sheet in file <", base.name, "> does not exist"))
  }


  # reference sheet
  # we text parsing because auto-guess will freak out on mixed column types
  ref.sheet <- suppressMessages(readxl::read_excel(file,
    sheet = reference.sheet,
    col_names = TRUE,
    col_types = "text"
  ))

  ref.sheet <- as.data.frame(ref.sheet)
  id.col <- .id.col(ref.sheet, "ID")
  if (is.na(id.col)) {
    stop(paste("File <", base.name, ">: Could not identify ID column in reference sheet"))
  }

  mol.col <- .id.col(ref.sheet, "Compound")
  if (is.na(mol.col)) {
    stop(paste("File <", base.name, ">: Could not identify Compound column in reference sheet"))
  }

  ref.col <- .id.col(ref.sheet, "Reference")
  if (is.na(ref.col)) {
    stop(paste("File <", base.name, ">: Could not identify Reference column in reference sheet"))
  }

  group.col <- .id.col(ref.sheet, "Group")
  if (is.na(group.col[1])) {
    stop(paste("File <", base.name, ">: Could not identify Group column in reference sheet"))
  }

    cite.col <- .id.col(ref.sheet, "Citekey")
  if (is.na(cite.col)) {
    stop(paste("File <", base.name, ">: Could not identify Citekey column in reference sheet"))
  }

  dose.col <- .id.col(ref.sheet, "^Dose", fixed = FALSE)
  if (is.na(dose.col)) {
    stop(paste("File <", base.name, ">: Could not identify Dose column in reference sheet"))
  }

  dunit.col <- .id.col(ref.sheet, "Unit Dose")
  if (is.na(dunit.col)) {
    stop(paste("File <", base.name, ">: Could not identify Unit Dose column in reference sheet"))
  }

  route.col <- .id.col(ref.sheet, "Route")
  if (is.na(route.col)) {
    stop(paste("File <", base.name, ">: Could not identify Unit Route column in reference sheet"))
  }
  
  # optional since 0.5
  pop.size.col <- .id.col(ref.sheet, "^N$", fixed = FALSE)
  pop.size.col.name <- "N"
  if (is.na(pop.size.col)) {
    message(paste("File <", base.name, ">: Could not identify optional column 'N' in reference sheet."))
    pop.size.col <- NULL
    pop.size.col.name <- NULL
  }
  
  col.names <- c("ID", "MOL", "REF", "GROUP", "GROUP2", "GROUP3", "CKEY", 
                 "DOSE", "DOSEUNIT", "ROUTE", pop.size.col.name)
  
  ref.sheet <- ref.sheet[c(id.col, mol.col, ref.col, group.col, cite.col, 
                           dose.col, dunit.col, route.col, pop.size.col,
                           pop.size.col)]
  colnames(ref.sheet) <- col.names
  
  ref.sheet$ID <- trimws(ref.sheet$ID)
  ref.sheet$GROUP <- trimws(ref.sheet$GROUP)
  ref.sheet$GROUP2 <- trimws(ref.sheet$GROUP2)
  ref.sheet$GROUP3 <- trimws(ref.sheet$GROUP3)
  id.mol.list <- list()
  for (i in 1:length(ref.sheet$MOL)) {
    mol <- ref.sheet$MOL[i]
    tmp.mol <- .find.molecule.from.id(molecules, mol)
    if (length(tmp.mol) < 2 && is.na(tmp.mol)) {
      stop(paste("File <", base.name, ">: Could not find molecule with id <", mol, 
                 "> from the reference sheet"), call. = FALSE)
    }
    id.mol.list <- c(id.mol.list, list(tmp.mol))
  }

  # checks
  if (length(ref.sheet$ID) != length(unique(ref.sheet$ID))) {
    stop(paste("File <", base.name, ">: Found non-unique IDs in the reference sheet:", 
               ref.sheet$ID[which(duplicated(ref.sheet$ID))]), call. = FALSE)
  }

  # observed data sheet
  # we text parsing because auto-guess will freak out on mixed column types
  obs.sheet <- suppressMessages(readxl::read_excel(file,
    sheet = obs.sheet,
    col_names = TRUE,
    col_types = "text"
  ))
  df <- as.data.frame(obs.sheet)
  id.col <- .id.col(df, "ID")
  if (is.na(id.col)) {
    stop(paste("File <", base.name, ">: Could not identify ID column for observed sheet"), 
         call. = FALSE)
  }

  if (nrow(df) < 1) {
    message(paste("Parsed 0 oberved profiles - sheet is empty"))
    return(list())
  }

  time.col <- .id.time.col(df)
  value.col <- .id.molecule.cols(df, "Value")
  error.col <- .id.molecule.cols(df, "SD")

  df[, time.col$col] <- sapply(df[, time.col$col], as.numeric)
  df[, value.col$cols] <- sapply(df[, value.col$cols], as.numeric)
  df[, error.col$cols] <- sapply(df[, error.col$col], as.numeric)

  ids <- unique(df[, id.col])
  if (length(id.filter) > 1 || !is.na(id.filter)) {
    ids <- id.filter
  }

  results <- list()
  for (id in ids) {
    data <- df[df[, id.col] == id, ]
    data <- data[c(time.col$col, value.col$cols, error.col$cols)]
    data <- cbind(data, Max = data[, 2] + data[, 3])
    data[, 3] <- data[, 2] - data[, 3]
    colnames(data) <- c("Time", "Avg", "Min", "Max")
    i <- which(ref.sheet$ID == id)
    if (length(i) != 1) {
      warning(paste("File <", base.name, ">: Found observed ID <", id,
                    "> that has no match in the reference data sheet. Data will be skipped"))
      next
    }

    # check for duplicated times
    dup.times <- duplicated(data$Time)
    if (any(dup.times)) {
      message(paste(
        "Deleted duplicated time entries for file <",
        base.name, "> and molecule id <", id.mol.list[[i]]$id, ">"
      ))
      data <- data[!dup.times, ]
    }

    # optionals since 0.5
    N <- ref.sheet$N[i]
    if (is.null(N)) N <- NA
    
    entry <- list(
      molecule = id.mol.list[[i]],
      reference = gsub("\r\n", "\n", ref.sheet$REF[i]),
      citekey = trimws(ref.sheet$CKEY[i]),
      dose = trimws(ref.sheet$DOSE[i]),
      dose.unit = trimws(ref.sheet$DOSEUNIT[i]),
      route = trimws(ref.sheet$ROUTE[i]),
      group = ref.sheet$GROUP[i],
      group2 = ref.sheet$GROUP2[i],
      group3 = ref.sheet$GROUP3[i],
      id = id,
      N = N,
      time.unit = time.col$unit,
      value.unit = value.col$unit,
      data = data,
      type = "individual",
      data.type = "mean",
      origin = "obs"
    )

    #colnames(ref.sheet) <- col.names
    class(entry) <- "profile"
    results <- c(results, list(entry))
  }

  message(paste("Parsed <", length(ids), "> oberved profiles"))
  return(results)
}


read.master.file <- function(master.file,
                             molecules = list(),
                             format = c("auto", "xsl", "csv"),
                             csv.sep = ",",
                             csv.dec = ".",
                             xls.sheet = 1) {
  base.name <- basename(master.file)

  if (!.does.file.exist(master.file)) {
    stop(paste("File <", base.name, "> does not exist"))
  }

  if (!.is.file.readable(master.file)) {
    stop(paste("File <", base.name, "> is not readable"))
  }

  format <- match.arg(format)

  if (format == "auto") {
    format <- .identify.file.ext(master.file)
    if (is.na(format)) {
      stop(paste("File <", base.name, "> has unknown file extension. Please specify the format."))
    }
  }

  if (format == "tsv") {
    format <- "csv"
  }

  df <- NA
  if (format == "xls") {
    suppressMessages(excel.sheet <- readxl::read_excel(master.file,
      sheet = xls.sheet,
      col_names = TRUE
    ))
    df <- as.data.frame(excel.sheet)
  }

  if (format == "csv") {
    df <- .read.csv(
      file = master.file, header = TRUE, sep = csv.sep, dec = csv.dec,
      encoding = "UTF-8"
    )
  }

  c.names <- tolower(colnames(df))
  # column identifier
  pop.id <- which(grepl("pop_name", c.names))
  sim.id <- which(grepl("sim_name", c.names))
  obs.id <- which(grepl("obs", c.names))
  pop.molecules.id <- which(grepl("pop_mol", c.names))
  sim.molecules.id <- which(grepl("sim_mol", c.names))

  if (length(pop.id) == 0 || length(pop.id) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown population column."))
  }

  if (length(sim.id) == 0 || length(sim.id) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown simulation column."))
  }

  if (length(obs.id) == 0 || length(obs.id) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown observed column."))
  }

  if (length(pop.molecules.id) == 0 || length(pop.molecules.id) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown population molecule column."))
  }

  if (length(sim.molecules.id) == 0 || length(sim.molecules.id) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown simulation molecule column."))
  }

  # optionals
  xmin.id <- which(grepl("x_min", c.names))
  if (length(xmin.id) == 1) {
    units(df[, xmin.id]) <- .extract.unit(c.names[xmin.id])
    c.names[xmin.id] <- "x.min"
  }
  xmax.id <- which(grepl("x_max", c.names))
  if (length(xmax.id) == 1) {
    units(df[, xmax.id]) <- .extract.unit(c.names[xmax.id])
    c.names[xmax.id] <- "x.max"
  }

  ymin.id <- which(grepl("^y_min", c.names))
  if (length(ymin.id) == 1) {
    units(df[, ymin.id]) <- .extract.unit(c.names[ymin.id])
    c.names[ymin.id] <- "y.min"
  }
  ymax.id <- which(grepl("^y_max", c.names))
  if (length(ymax.id) == 1) {
    units(df[, ymax.id]) <- .extract.unit(c.names[ymax.id])
    c.names[ymax.id] <- "y.max"
  }

  log_y_min.id <- which(grepl("^log_y_min", c.names))
  if (length(log_y_min.id) == 1) {
    units(df[, log_y_min.id]) <- .extract.unit(c.names[log_y_min.id])
    c.names[log_y_min.id] <- "y.min.log"
  }
  log_ymax.id <- which(grepl("^log_y_max", c.names))
  if (length(log_ymax.id) == 1) {
    units(df[, log_ymax.id]) <- .extract.unit(c.names[log_ymax.id])
    c.names[log_ymax.id] <- "y.max.log"
  }

  offset.id <- which(grepl("offset", c.names))
  if (length(offset.id) == 1) {
    units(df[, offset.id]) <- .extract.unit(c.names[offset.id])
    c.names[offset.id] <- "x.offset"
  }

  headline.id <- which(grepl("headline", c.names))
  c.names[headline.id] <- "main"

  c.names[pop.id] <- "pop"
  c.names[sim.id] <- "sim"
  c.names[obs.id] <- "obs"
  c.names[pop.molecules.id] <- "pop.mol"
  c.names[sim.molecules.id] <- "sim.mol"
  colnames(df) <- c.names

  group_cols <- which(grepl("group", c.names))
  group_cols <- c.names[group_cols]

  # drop unknown colums
  known <- c(
    "pop", "sim", "obs", "pop.mol", "sim.mol",
    "x.min", "x.max",
    "y.min", "y.max",
    "y.min.log", "y.max.log",
    "x.offset", "main", group_cols)
  df <- df[, colnames(df) %in% known]

  if ("obs" %in% colnames(df))
    df$obs <- as.character(df$obs)

  # delete non-pop/sim lines
  missing.lines <- c()
  for (i in 1:nrow(df)) {

    p.name <- trimws(df$pop[i])
    s.name <- trimws(df$sim[i])
    if ((p.name == "" || is.na(p.name))
        && (s.name == "" || is.na(s.name))) {
      message(paste("Skipped line", i, ": No population or simulation entry found."))
      missing.lines <- c(missing.lines, i)

    }
  }

  # check if sheet has entries
  if (length(missing.lines > 0))
    df <- df[-missing.lines,]
  if (nrow(df) < 1) {
    stop("No enties (rows) found in the sheet", call. = FALSE)
  }

  # try to convert wrongly parsed obs ids to strings
  obs.to.str <- function(x) {
    suppressWarnings(num <- as.numeric(x))
    if (is.na(num))
      return(x)

    if (num%%1==0)
      return(x)

    return(gsub("\\.", ",", x))
  }
  df$obs <- unlist(lapply(df$obs, obs.to.str))


  # test for missing molecules
  for (i in 1:nrow(df)) {

    mols <- df$pop.mol
    mol.strs <- unlist(strsplit(mols, ",", fixed = TRUE))
    if (length(mol.strs) == 0) {
      stop(paste("File <", base.name, "> in line <", i, ">: no population molecule definition found."))
    }

    .match.mols.from.string(mol.strs, molecules)

    mols <- df$sim.mol
    mol.strs <- unlist(strsplit(mols, ",", fixed = TRUE))
    if (length(mol.strs) == 0) {
      stop(paste("File <", base.name, "> in line <", i, ">: no simulation molecule definition found."))
    }

    .match.mols.from.string(mol.strs, molecules)
  }

  result <- list(id = base.name, data = df)
  class(result) <- "master"
  return(result)
}

is.master <- function(x) inherits(x, "master")


# read all sheets or csv data into a list of dataframes
.read.all.sheet <- function(files,
                            folder = ".",
                            format = c("auto", "xsl", "csv"),
                            csv.sep = ",",
                            csv.dec = ".") {
  format <- match.arg(format)

  results <- list()
  for (file in files) {
    base.name <- basename(file)
    file <- file.path(folder, file)
    message(paste("Reading file <", base.name, ">"))

    if (!.does.file.exist(file)) {
      stop(paste("File <", base.name, "> does not exist"))
    }

    if (!.is.file.readable(file)) {
      stop(paste("File <", base.name, "> is not readable"))
    }

    f.format <- format
    if (f.format == "auto") {
      f.format <- .identify.file.ext(file)
    }

    if (is.na(f.format)) {
      stop(paste("File <", base.name, "> : Could not detect file format"))
    }

    if (f.format == "tsv") {
      f.format <- "csv"
    }

    df <- NA
    if (f.format == "csv") {
      df <- .read.csv(
        file = file, header = TRUE, sep = csv.sep, dec = csv.dec,
        encoding = "UTF-8"
      )

      attr(df, "file.name") <- base.name
      attr(df, "sheet.idx") <- 1
      attr(df, "sheet.name") <- NA
      results <- append(results, df)
    } else {
      sheet.names <- .sheets(file)
      sheets <- length(sheet.names)
      for (i in 1:sheets) {
        suppressMessages(excel.sheet <- readxl::read_excel(file,
          sheet = i,
          col_names = TRUE
        ))

        df <- as.data.frame(excel.sheet)
        attr(df, "file.name") <- base.name
        attr(df, "sheet.idx") <- i
        attr(df, "sheet.name") <- sheet.names[i]

        results <- append(results, list(df))
      }
    }
  }

  return(results)
}


.get.sheet.info.str <- function(df) {
  file.name <- attributes(df)["file.name"]
  sheet.idx <- attributes(df)["sheet.idx"]
  sheet.name <- attributes(df)["sheet.name"]
  return(paste(
    "  File: ", file.name,
    "\n  Sheet Nr:", sheet.idx,
    "\n  Sheet Name:", sheet.name
  ))
}

read.sim.profiles.from.master <- function(master.data,
                                          sim.files,
                                          molecules = list(),
                                          files.dir = ".",
                                          file.format = c("auto", "xsl", "csv"),
                                          file.csv.sep = ",",
                                          file.csv.dec = ".",
                                          action.on.multimatch = c("stop", "warning", "message", "silent"),
                                          multifile.enties = F
                                          ) {
  if (!is.master(master.data)) {
    stop("Input must be of class master")
  }


  action.on.multimatch <- match.arg(action.on.multimatch)
  multi.action <- stop
  if (action.on.multimatch == "warning") {
    multi.action <- warning
  } else if (action.on.multimatch == "message") {
    multi.action <- message
  } else if (action.on.multimatch == "silent") {
    multi.action <- function(...) invisible(NULL)
  } # noop

  group_names <- groups.master(master.data)

  results <- list()
  sim.data <- .read.all.sheet(sim.files,
    folder = files.dir, format = file.format,
    csv.sep = file.csv.sep, csv.dec = file.csv.dec
  )

  master <- master.data$data
  for (i in 1:nrow(master)) {
    id <- master$sim[i]
    if (is.na(id)) {
      next
    }

    message(paste("Lookup id <", id, ">"))

    # match molecules
    mols <- trimws(master$sim.mol[i])
    obs.ids <- if (is.na(master$obs[i])) NA else trimws(unlist(strsplit(master$obs[i], ",")))
    mols <- .match.mols.from.string(unlist(strsplit(mols, ",", fixed = TRUE)),
      molecules = molecules
    )

    # find entry in sim.data
    sheet_entry <- list()
    prev_matches <- NA
    for (sheet in sim.data) {
      matches <- .id.col(sheet, paste0(id, "|"))
      if (length(matches) > 1 || !is.na(matches)) {
        # found it and check for double entry
        if (!multifile.enties && is.data.frame(sheet_entry)) {
          multi.action(paste0(
            "Found ambiguous entry for ID < ", id, " >",
            "\n Matched      : < ", paste0(colnames(sheet)[matches], " > IN:\n", .get.sheet.info.str(sheet)),
            "\n First Match  : ", prev_matches
          ))
          multi.action("Selected the FIRST match !\n")
        } else {
          prev_matches <- paste0("< ", colnames(sheet)[matches], " > IN:\n", .get.sheet.info.str(sheet))
          if (multifile.enties)
            sheet_entry <- append(sheet_entry, list(sheet))
          else
            sheet_entry[[1]] <- sheet
        }
      }
    }

    if (length(sheet_entry) == 0) {
      stop(paste("Did not find entry <", id, ">"))
    }

    # sheet entry is now a list of data.frames (sheets) that hold information about
    # one of more molecules

    # extract time and molecule columns
    mol.infos <- list()
    for (mol in mols) {
      match <- NULL
      for (sheet in sheet_entry) {
        match_tmp <- .id.molecule.cols(sheet, mol$file.name.match,
                                       mol$add.file.matcher, is.fraction = mol$is.fraction,
                                       fixed.unit = mol$fixed.unit, silent = TRUE)
        # found the same entry in multiple sheets
        if (length(match_tmp) > 0 && length(match) > 0)
          stop(paste("Ambiguous sheets found for tag <", mol$file.name.match, ">"))

        # found the same entry in multiple columns
        if (length(match_tmp) > 0 && length(match_tmp$cols) > 1) {
          stop(paste("Ambiguous column found for tag <", mol$file.name.match, ">"))
        }

        if (length(match_tmp) > 0) {
          match <- match_tmp
          match[["sheet"]] <- sheet
        }
      }

      if (length(match) == 0)
        stop(paste("Could not find entry for tag <", mol$file.name.match, ">"))

      #
      #  result <- list(
      #  id = match.tag,
      #  cols = col.nrs,
      #  unit = unit)
      #
      mol.infos <- c(mol.infos, list(match))
    }

    # gather results for each molecule
    pro.results <- list()
    for (j in 1:length(mols)) {
      mol <- mols[[j]]
      mol.i <- mol.infos[[j]]

      time.info <- .id.time.col(mol.i$sheet)
      data <- data.frame(mol.i$sheet[, time.info$col], mol.i$sheet[, mol.i$cols], NA, NA)
      colnames(data) <- c("Time", "Avg", "Min", "Max")

      dup.times <- duplicated(data$Time)
      if (any(dup.times)) {
        message(paste(
          "Deleted duplicated time entries for ID <",
          id, "> and molecule id <", mol$id, ">"
        ))
        data <- data[!dup.times, ]
      }


      entry <- list(
        molecule = mol,
        id = "",
        reference = "",
        group = NA,
        time.unit = time.info$unit,
        value.unit = mol.infos[[j]]$unit,
        data = data,
        type = "individual",
        data.type = "mean",
        origin = "sim"
      )
      class(entry) <- "profile"
      pro.results <- c(pro.results, list(entry))
    }

    plot.infos <- list(
      x.min = master$x.min[i],
      x.max = master$x.max[i],
      y.min = master$y.min[i],
      y.max = master$y.max[i],
      y.min.log = master$y.min.log[i],
      y.max.log = master$y.max.log[i],
      x.offset = master$x.offset[i],
      main = master$main[i]
    )
    
    groups <- NULL
    if (length(group_names) > 0)
      groups <- as.character(master[i, group_names])
    
    sim <- list(
      id = id, obs.ids = obs.ids, profiles = pro.results,
      groups = groups,
      plot.infos = plot.infos
    )
    class(sim) <- "MatchedProfiles"
    results <- append(results, list(sim))
  }

  return(results)
}


read_molecule_file <- function(file,
                               format = c("auto", "xsl", "csv"),
                               csv.sep = ",",
                               csv.dec = ".",
                               xls.sheet = 1,
                               silent = FALSE) {

  base.name <- basename(file)
  
  if (!.does.file.exist(file)) {
    stop(paste("File <", base.name, "> does not exist"))
  }
  
  if (!.is.file.readable(file)) {
    stop(paste("File <", base.name, "> is not readable"))
  }
  
  format <- match.arg(format)
  
  if (format == "auto") {
    format <- .identify.file.ext(file)
    if (is.na(format)) {
      stop(paste("File <", base.name, "> has unknown file extension. Please specify the format."))
    }
  }
  
  if (format == "tsv") {
    format <- "csv"
  }
  
  df <- NA
  if (format == "xls") {
    suppressMessages(excel.sheet <- readxl::read_excel(file,
                                                       sheet = xls.sheet,
                                                       col_names = TRUE
    ))
    df <- as.data.frame(excel.sheet)
  }
  
  if (format == "csv") {
    df <- .read.csv(
      file = file, header = TRUE, sep = csv.sep, dec = csv.dec,
      encoding = "UTF-8"
    )
  }
  
  c.names <- tolower(colnames(df))    
  
  name.idx <- which(grepl("name", c.names))
  display.idx <- which(grepl("display", c.names))
  id.idx <- which(grepl("id", c.names))
  pubchem.idx <- which(grepl("pubchem", c.names))
  color.idx <- which(grepl("color", c.names))
  ylab.idx <- which(grepl("ylab", c.names))
  pch.idx <- which(grepl("pch", c.names))
  lty.idx <- which(grepl("lty", c.names))
  mw.idx <- which(grepl("mw", c.names))
  fixed.idx <- which(grepl("fixed", c.names))
  fraction.idx <- which(grepl("fraction", c.names))
  legend.idx <- which(grepl("legend", c.names))
  match.idx <- which(grepl("match", c.names))
  add.match.idx <- which(grepl("more", c.names))
  
  
  if (length(name.idx) == 0 || length(name.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown Name column."))
  }
  
  if (length(display.idx) == 0 || length(display.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown Display column."))
  }
  
  if (length(id.idx) == 0 || length(id.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown ID column."))
  }
  
  if (length(pubchem.idx) == 0 || length(pubchem.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown Pubchem column."))
  }
  
  if (length(color.idx) == 0 || length(color.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown Color column."))
  }
  
  if (length(ylab.idx) == 0 || length(ylab.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown ylab column."))
  }
  
  if (length(pch.idx) == 0 || length(pch.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown pch column."))
  }
  
  if (length(lty.idx) == 0 || length(lty.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown lty column."))
  }
  
  if (length(mw.idx) == 0 || length(mw.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown MW column."))
  }
  
  if (length(fixed.idx) == 0 || length(fixed.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown Fixed Unit column."))
  }
  
  if (length(fraction.idx) == 0 || length(fraction.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown Is Fraction column."))
  }
  
  if (length(legend.idx) == 0 || length(legend.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown Show Legend column."))
  }
  
  if (length(match.idx) == 0 || length(match.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown Match column."))
  }
  
  if (length(add.match.idx) == 0 || length(add.match.idx) > 1) {
    stop(paste("File <", base.name, "> has ambiguous or unknown More column."))
  }
  
  
  msg_fn <- if (silent) function(x, nl = TRUE) {} else function(x, nl = TRUE) {message(x, appendLF = nl)}
  
  molecules <- list()
  
  for (i in 1:nrow(df)) {
    msg_fn(paste("* Parsing row <", i , ">"))
    
    name <- df[i, name.idx]
    display.name <- df[i, display.idx]
    id <- df[i, id.idx]
    file.name.match <- df[i, match.idx]
    pubchem.id <- df[i, pubchem.idx]
    MW <- df[i, mw.idx]
    
    # pch
    pch <- df[i, pch.idx]
    if (is.na(pch))
      pch <- 19
    else
      pch <- as.numeric(pch)
    
    # color
    color <- df[i, color.idx]
    if (is.na(color)) {
      
      color <- "black"
    } else  {
      rgb <- unlist(strsplit(color, ",", fixed = T))
      if (length(rgb) > 1) {
        if (length(rgb) != 4)
          stop("Color in must be defined as string or RGBA separated by comma")
        
        rgb <- as.numeric(rgb)
        color <- do.call(grDevices::rgb, append(rgb, list(maxColorValue = 255)))
      }
    }
    
    # ylab
    ylab <- df[i, ylab.idx]
    if (is.na(ylab))
      ylab <- "Plasma Concentration"
    
    lty <-  df[i, lty.idx]
    if (is.na(lty))
      lty <- 1
    else
      lty <- as.numeric(lty)
    
      
    # in legend
    in.legend <- df[i, legend.idx]
    if (is.na(in.legend)) {
      
      in.legend <- TRUE
    } else {
      in.legend <- .parse_logical(in.legend)
      if (is.null(in.legend))
        stop("Show Legend must be convertible to logical")
    }
    
    # is fraction
    is.fraction <- df[i, fraction.idx]
    if (is.na(is.fraction)) {
      
      is.fraction <- FALSE
    } else {
      is.fraction <- .parse_logical(is.fraction)
      if (is.null(is.fraction))
        stop("Is Fraction must be convertible to logical")
    }
    
    # fixed unit
    fixed.unit <- df[i, fixed.idx]
    if (!is.na(fixed.unit))
      fixed.unit <- as_units(fixed.unit)
    
    # additional files matcher
    add.file.matcher <- df[i, add.match.idx]
    if (!is.na(add.file.matcher)) {
      add.file.matcher <- base::trimws(unlist(strsplit(add.file.matcher, ",", fixed = T)))
    }  
    
    mol <- molecule(name = name, display.name = display.name, id = id, 
                    file.name.match = file.name.match, 
                    add.file.matcher = add.file.matcher, 
                    pubchem.id = pubchem.id, MW = MW, is.fraction = is.fraction, 
                    fixed.unit = fixed.unit, 
                    ylab = ylab, color = color, 
                    pch = pch, lty = lty, in.legend = in.legend)
    
    molecules <- append(molecules, list(mol))
  }
  
  msg_fn(paste("Created <", nrow(df), "> molecules"))
  
  return(molecules)
}
onwhenrdy/pksimhelper documentation built on Aug. 19, 2022, 7:56 a.m.