sandbox/do_allele_match.R

#!/usr/bin/env Rscript

# This script will perform matching of genotypes where user can control matching parameters.
# It is also possible to run profiling to determine best parameters. Output can be an image.
# Result is controlled through parameters and can be HTML or CSV.
# Verbose logging is done to a file, controlled by a parameter (see --help for more info).
# Read arguments passed to the script

library(optparse)
option_list <- list(
  make_option(c("-i", "--input"), type = "character", default = NULL, 
              help = "A file (path) to the source file of genotypes, e.g. 'dataset.csv' or 
                     './data/dataset.csv' if nested in /data/ relative to the working di-
                     rectory (where script is ran from). File should have a comma as del-
                     imiters and tab separating columns. It should have sample name and 
                     quality index columns, along with loci names."),
  make_option(c("-a", "--alleleMismatch"), type = "integer", default = NA,
              help = "Integer. Number of mismatched alleles (m-hat). Related to matchThreshold 
                 and cutHeight."),
  make_option(c("-m", "--matchThreshold"), type = "double", default = NA, 
              help = "Double. Dissimilarity score which constitutes a match when identifying 
                individuals (s-hat)."),
  make_option(c("-c", "--cutHeight"), type = "double", default = NA,
              help = "Double. Used for dynamic cutting of the tree by amCluster."),
  make_option(c("-o", "--output"), type = "character", default = "default_output.csv",
              help = "Character. The way results will be output. Can be 'html' or 'csv'."),
  make_option(c("-p", "--profile"), type = "logical", metavar = "", default = FALSE,
              action = "store_true",
              help = "If specified, profiling will be performed (but not matching)."),
  make_option(c("-f", "--fig"), type = "character", default = "profile_result.png",
              help = "Name of the resulting figure."),
  make_option(c("-v", "--verbose"), type = "character", default = "debugging.txt",
              help = "Name of file into where verbose debugging text is parsed.")
)

opt <- parse_args(OptionParser(option_list = option_list))

# Load packages
library(allelematch)

# Process input parameters
if (is.null(opt$input)) {
  stop("No input detected. Check path or existance of file.")
} else {
  input <- opt$input
}

# Import data (can be csv, even SQL statement)
# Implementing reading from a csv with decimal delimiter and tab as separator. 
message("Importing dataset...")
cat("Importing dataset.\n", file = opt$verbose) # create a new debugging file, overwrite any previous
xy <- read.table(input, header = TRUE, sep = ";")
cat("[OK] Successfully imported dataset.\n", file = opt$verbose, append = TRUE)

cat("Reshaping data.\n", file = opt$verbose, append = TRUE)
xy <- reshape(xy, timevar = "marker", idvar = c("sample", "id"), direction = "wide")
names(xy) <- gsub("measured\\.", replacement = "", x = names(xy))
cat("[OK] Done reshaping data.\n", file = opt$verbose, append = TRUE)

# Convert the imported dataset into a proper structure for matching
message("Creating dataset for further processing...")
cat("Preparing dataset for analysis.\n", file = opt$verbose, append = TRUE)
d.xy <- amDataset(xy, missingCode = NA, indexColumn = "sample", metaDataColumn = "id")
cat("[OK] Dataset ready for analysis.\n", file = opt$verbose, append = TRUE)

# Do profiling. Figure is saved to working directory.
if (opt$profile) {
  message("Profiling... This may take a while.")
  cat("Performing profiling.\n", file = opt$verbose, append = TRUE)
  
  png(filename = opt$fig,  width = 500, height = 500) # output figure size, adapt if needed
  prof <- tryCatch(amUniqueProfile(amDatasetFocal = d.xy, verbose = TRUE),
                   error = function(e) e,
                   warning = function(w) w)
  
  prof.ok <- any(class(prof) %in% c("error", "warning"))
  
  if (prof.ok) {
    plot.new()
    plot.window(xlim = c(-10, 10), ylim = c(-5, 5))
    text(x = 0, y = 1, cex = 2, labels = "Profiling failed with error message:")
    text(x = 0, y = 0, cex = 2, labels = prof$message)
    
    cat("[STOP] Profiling took a turn for the worse.\n", file = opt$verbose, append = TRUE)
  }
  
  dev.off()
  
  if (prof.ok) {
    return(sprintf("[STOP] Profiling failed, please see error log in %s.", opt$verbose))
  }
  
  cat("[OK] Done performing profiling, figure saved.\n", file = opt$verbose, append = TRUE)
  message("Figure saved.")
} else {
  mp <- c("alleleMismatch", "matchThreshold", "cutHeight")
  opt[mp] <- lapply(opt[mp], FUN = function(x) {
    if (is.na(x)) {
      x <- NULL
    }
    x
  })
  
  if (all(is.null(opt$alleleMismatch), is.null(opt$matchThreshold), is.null(opt$cutHeight))) {
    stop("Please specify at least one parameter (alleleMismatch, matchThreshold or cutHeight), see --help.")
  }
  
  cat("Performing matching.\n", file = opt$verbose, append = TRUE)
  message("Performing matching...")
  
  result <- tryCatch(amUnique(amDatasetFocal = d.xy, 
                     alleleMismatch = opt$alleleMismatch, 
                     matchThreshold = opt$matchThreshold,
                     cutHeight = opt$cutHeight),
                     error = function(e) e,
                     warning = function(w) w)
  
  match.ok <- any(class(result) %in% c("error", "warning"))
  
  if (match.ok) {
    
  }
  
  cat("[OK] Matching done.\n", file = opt$verbose, append = TRUE)
  
  # Produce result as specified in output
  # Catch csv, if not, assume html is requested
  message("Writing output...")
  
  if (grepl("\\.csv$", opt$output)) {
    suppressMessages(require(tidyr))
    
    cat("Writing result to CSV.\n", file = opt$verbose, append = TRUE)
    
    amCSV.amUnique(x = result, csvFile = opt$output)
    
    # Reread the data and reflow it into a long format, subset only 
    # relevant columns and resave it
    rein <- read.table(opt$output, header = TRUE, sep = ",")
    rein <- gather(rein, key = markerName, value = value, -uniqueGroup, -rowType, -uniqueIndex,
                   -matchIndex, -nUniqueGroup, -alleleMismatch, -matchThreshold, -cutHeight,
                   -Psib, -score, -metaData)[, c("metaData", "uniqueGroup", "rowType", "uniqueIndex", "matchIndex", "Psib",
                                      "markerName", "value")]
    names(rein)[grepl("metaData", names(rein))] <- "id" # rename metaData to id
    
    write.table(rein, file = opt$output, quote = FALSE, sep = ",",
                col.names = TRUE, row.names = FALSE)
    
    cat("[OK] Done writing CSV file.\n", file = opt$verbose, append = TRUE)
  } else {
    cat("Writing data to HTML.\n", file = opt$verbose, append = TRUE)
    amHTML.amUnique(x = result, htmlFile = opt$output)
    cat("[OK] Done writing data to HTML.\n", file = opt$verbose, append = TRUE)
  }
}

cat("\n", file = opt$verbose, append = TRUE)
cat("Analysis performed using the following tools (https://github.com/romunov/zvau) \n\n",
    file = opt$verbose, append = TRUE)
sink(file = opt$verbose, append = TRUE, type = "output")

sessionInfo()
sink()
romunov/zvau documentation built on May 27, 2019, 1:50 p.m.