R/utility.R

fixSeqWareTSV <- function(t.df, valid.dt) {
  t.df.colnames <- colnames(t.df)

  # Library column is duplicated for some reason in Run Report TSV files. This removes is.
  # Note that if duplicated column names with different data exist, the second one will be ignored.
  t.df.colnames <- unique(t.df.colnames)

  # Create a data table for each column
  dt.list <- lapply(t.df.colnames, function(x) {
    conversion <- valid.dt[file.name == x, ]
    data <- t.df[[x]]

    if (nrow(conversion) != 1) {
      stop(
        paste(
          "Field name",
          x,
          "cannot be unambiguously parsed. It is found",
          nrow(conversion),
          "times in the annotation file. Expected 1."
        )
      )
    }

    # Split Read Length from a character column into two numeric columns
    if (conversion$app.name == "#Split_Read1_Read2") {
      read.length <- sapply(data, function(x)
        strsplit(x, ",")[[1]])
      data <-
        as.data.table(list(
          `R1 Length` = as.numeric(read.length[1, ]),
          `R2 Length` = as.numeric(read.length[2, ])
        ))
      return(data)
    }

    # Split insert size and SD from a character column into two numeric columns
    if (conversion$app.name == "#Split_Mean_SD") {
      in.sd <- gsub("[\\(\\)]", "", data)
      in.sd <- sapply(in.sd, function(x)
        strsplit(x, " ")[[1]])
      data <-
        as.data.table(list(
          `Insert Mean` = as.numeric(in.sd[1, ]),
          `Insert Stddev` = as.numeric(in.sd[2, ])
        ))
      return(data)
    }

    # If it is a numeric type, only leave numbers and periods
    if (conversion$type == "Numeric") {
      data <- gsub("[^0-9\\.]", "", data)
      data <- as.numeric(data)
    }

    result <- list()

    # The column is renamed to canonical name
    result[[conversion$app.name]] <- data
    return(as.data.table(result))
  })

  result.dt <- as.data.table(dt.list)

  # Add Study name
  result.dt <- addCustomTSVMetrics(
    result.dt,
    valid.dt,
    "Study",
    sapply(strsplit(result.dt$Library, "_"), function(x) x[[1]])
  )

  # Add On Target Percentage
  result.dt <- addCustomTSVMetrics(
    result.dt,
    valid.dt,
    "On Target Percentage",
    result.dt$`Percent Mapped on Target` * result.dt$`Map Percent` / 100
  )

  # Field keeps track if a library has been selected by user
  result.dt <- addCustomTSVMetrics(
    result.dt,
    valid.dt,
    "plotly_library_selected",
    FALSE
  )

  # Field that must be unique for each library. For illumina that is: Library name + lane + barcode
  result.dt <- addCustomTSVMetrics(
    result.dt,
    valid.dt,
    "plotly_unique_key",
    paste(result.dt$Library, result.dt$Lane, result.dt$Barcode, sep = "_")
  )

  # Add any missing columns as all NA (Columns from the valid.dt starting with # are ignored, as it is a flag to split column)
  missing.columns <- setdiff(valid.dt$app.name, colnames(result.dt))
  remove.flag.columns <- grepl('^[^#]', missing.columns) # Columns that do not start with flag
  missing.columns <- missing.columns[remove.flag.columns]

  if(!identical(missing.columns, character(0))) {
    # https://stackoverflow.com/questions/11745169/dynamic-column-names-in-data-table
    result.dt[, (missing.columns) := NA]
  }

  return(result.dt)
}

# Add fields not present in the original TSV file
addCustomTSVMetrics <- function(dt, valid.dt, field_name, field_values) {
  if (field_name %in% colnames(dt)) {
    stop(
      paste(
        "Custom field name cannot be added as it already exists:",
        field_name
      )
    )
  }

  if (!(field_name %in% valid.dt$app.name)) {
    stop(
      paste(
        "Custom field not specified in annotation file:",
        field_name
      )
    )
  }

  set(dt, j = field_name, value = field_values)
}

createLong <- function(t.df, all_plots, info_columns) {
  epic.dt.long <-
    melt(
      t.df,
      id.vars = intersect(info_columns, colnames(t.df)),
      variable.name = "Type",
      value.name = "Value"
    )

  # Order the different metrics, so that they will always be displayed in the same order
  # Only present metrics can be included in factor levels, as data.table::split bugs out otherwise
  set(
    epic.dt.long,
    j = "Type",
    value = factor(
      epic.dt.long$Type,
      levels = intersect(all_plots, epic.dt.long$Type)
    )
  )
  setorder(epic.dt.long, Type)

  return(epic.dt.long)
}

# This function loads the file and adds a column with the path
# This is necessary as the TSV file itself does not tell you which run it came from
# Any warning is thrown as an error, as fread tries its best to read even nonstandard files
readSeqWareTSV <- function(path) {
  tryCatch({
    dt <-
      fread(
        path,
        sep = "\t",
        header = TRUE,
        na.strings = c("NA", "na")
      )

    if ("Run Name" %in% colnames(dt)) {
      stop("Column 'Run Name' is used internally by the App and cannot be present in Run Report")
    }
    set(dt, j = "Run Name", value = factor(rep(path, nrow(dt))))

  }, warning = function(w) {
    # This is a crutch to fix a bug in data.table. If fread causes a warning, it is caught, but the function is left in a failed state.
    # The next valid call to fread will fail again, as the function has to reset itself. The next valid call will work fine.
    # I am manually making a valid call to fread to reset it
    # Issue is at https://github.com/Rdatatable/data.table/issues/2904
    tryCatch({
      fread("echo bug, fix")
    }, warning = function(w) {
      # Squash the warning message produced by resetting fread
    })

    stop(conditionMessage(w))
  })
}

# The data table that will be used throughout the app, starting from the path to the Run Report
createAppDT <- function(path, valid.dt) {
  current.run <- readSeqWareTSV(path)
  current.run <- fixSeqWareTSV(current.run, valid.dt)
  return(current.run)
}

generateRunReportURL <- function(run_alias) {
  paste("https://www.hpc.oicr.on.ca/archive/web/runReports/", run_alias, "/", run_alias, "_report.html", sep = "")
}

generateMisoRunURL <- function(run_alias) {
  paste("https://miso.oicr.on.ca/miso/run/alias/", run_alias, sep = "")
}
oicr-gsi/acquacotta-shiny-run-report documentation built on May 30, 2019, 4:05 p.m.