# Function : leqtar_process_files
# Input : processed arguments
# Output : Data objects
# Note to self: 1 genotype, 2 phenotype, 3 covariates, 4 output, 5 valid
# leqtar_process_files -----------------------------------------------------------
#' leqtar_process_files function
#'
#' Processes files based on their extensions.
#' Checks the order of column- and row- names and dimensions of the data sets.
#' If all checks pass, continue with genotype conversion.
#'
#' @param arguments path to file supplied by user.
#' @return content of data files.
#' @importFrom "gdata" "read.xls"
#' @importFrom "stringr" "str_split"
#' @importFrom "gtools" "mixedorder"
#' @importFrom "gtools" "mixedsort"
#' @importFrom "utils" "read.table"
#' @importFrom "utils" "read.csv"
#' @note Hard requirement, no dots should be present in the file name, except for the extension.
leqtar_process_files <- function(arguments) {
# Check for data or file path --------------
# Set flags
message("[INFO] Processing Files..")
message("[INFO] ----------#----------")
message("[INFO] Reading files..")
# Check paramters for file's or objects --------------------
phenotype_file_content <- check_object_or_file(arguments$phenotype, arguments$phenotypeData, "Phenotype")
phenotype_position_content <- check_object_or_file(arguments$phenotypePosition, arguments$phenotypePositionData, "Phenotype positions")
genotype_file_content <- check_object_or_file(arguments$genotype, arguments$genotypeData, "Genotype")
genotype_position_content <- check_object_or_file(arguments$genotypePosition, arguments$genotypePositionData, "Genotype positions")
covariates_file_content <- check_object_or_file(arguments$covariates, arguments$covariatesData, "Covariates")
message("[INFO] Reading files OK..")
# Check dimensions -------------------
message("[INFO] ----------#----------")
message("[INFO] Checking dimensions..")
dim_genotype <- dim(genotype_file_content)
dim_phenotype <- dim(phenotype_file_content)
warnings <- 0
if ( !is.null(covariates_file_content) ) {
dim_covariates <- dim(covariates_file_content)
if (dim_genotype[2] != dim_covariates[2]) {
message("[WARN] The number of samples in your genotype and covariate files do not match..",
"\n\\___ leqtar will try to correct for this sample indifference..")
warnings <- warnings + 1
} else if (dim_genotype[2] != dim_phenotype[2]) {
message("[WARN] The number of samples in your genotype and phenotype files do not match..",
"\n\\___ leqtar will try to correct for this sample indifference..")
warnings <- warnings + 1
} else if (dim_covariates[2] != dim_phenotype[2]) {
message("[WARN] The number of samples in your covariates and phenotype files do not match..",
"\n\\___ leqtar will try to correct for this sample indifference..")
warnings <- warnings + 1
}
} else {
if (dim_genotype[2] != dim_phenotype[2]) {
message("[WARN] The number of samples in your genotype and phenotype files do not match..",
"\n\\___ leqtar will try to correct for this sample indifference..")
warnings <- warnings + 1
}
}
if ( warnings > 0 ) {
message("[INFO] Checking dimensions OK with: ", as.character(warnings), " warning(s) ..")
} else {
message("[INFO] Checking dimensions OK..")
}
# Changing genotypes to frequencies -------------------------
message("[INFO] ----------#----------")
message("[INFO] Checking genotype data..")
if ( class( as.vector(genotype_file_content[1,1]) ) == "character" && arguments$genoToFreq == F ) {
stop("[STOP] Detected characters in genotype data. If you want leqtar to change them to\n \\___ frequencies, set argument 'genoToFreq=T'..")
} else if ( class( as.vector(genotype_file_content[1,1]) ) == "character" && arguments$genoToFreq == T ) {
message("[INFO] Genotype conversion: ", as.character(arguments$genoToFreq), ", conversing genotypes..")
sub_arguments <- leqtar_genotypes_to_frequencies(genotype_file_content)
genotype_file_content <- sub_arguments$genotypeConverted
genotype_file_content_unconverted <- sub_arguments$genotypeNotConverted
} else if ( class( as.vector(genotype_file_content[1,1]) ) == "integer" && arguments$genoToFreq == F ||
class( as.vector(genotype_file_content[1,1]) ) == "numeric" && arguments$genoToFreq == F ) {
message("[INFO] Genotype conversion: ", as.character(arguments$genoToFreq), ", but genotypes already conversed..")
} else if ( class( as.vector(genotype_file_content[1,1]) ) == "integer" && arguments$genoToFreq == T ||
class( as.vector(genotype_file_content[1,1]) ) == "numeric" && arguments$genoToFreq == T ) {
message("[INFO] Genotype conversion: ", as.character(arguments$genoToFreq), ", but genotypes already conversed..")
} else if ( class( as.vector(genotype_file_content[1,1]) ) == "factor" ) {
stop("[STOP] Factor variables are not yet supported..")
} else {
stop("[STOP] Unexpected error, your genotype file is probably incorrect. If this is a persistent error,
report the issue in the github issue tracker..")
}
message("[INFO] Checking genotype data OK..")
# Check column name order ---------------------------------
message("[INFO] ----------#----------")
message("[INFO] Checking sample names..")
# Bind 'covariates_samples'
covariates_samples <- NULL
genotype_samples <- colnames(genotype_file_content)
phenotype_samples <- colnames(phenotype_file_content)
if ( !is.null(covariates_file_content) ) {
covariates_samples <- colnames(covariates_file_content)
}
if ( !is.null(covariates_file_content) ) {
if ( length(genotype_samples) == length(phenotype_samples) &&
length(genotype_samples) == length(covariates_samples) ) {
if ( all(genotype_samples == phenotype_samples) && all(genotype_samples == covariates_samples) ) {
message("[INFO] Checking sample names OK..")
}
}
if ( length( intersect(genotype_samples, phenotype_samples) ) == length(genotype_samples) &&
length( intersect(genotype_samples, covariates_samples) ) == length(genotype_samples) ) {
message("[WARN] Sample names OK, but are in the wrong order..\n\\___ Re-ordering samples..")
stop("[STOP] This is not yet implemented!")
} else {
message("[WARN] Some samples do not co-exists in all files..\n\\___ Re-ordering samples and trying to run anyway..")
# Index the different samples for the covariates
covariates_in_phenotype <- covariates_samples[which(covariates_samples %in% phenotype_samples)]
covariates_in_genotype <- covariates_samples[which(covariates_samples %in% genotype_samples)]
# Index the different samples for the covariates
genotype_in_covariates <- genotype_samples[which(genotype_samples %in% covariates_samples)]
genotype_in_phenotype <- genotype_samples[which(genotype_samples %in% phenotype_samples)]
# Index the different samples for the covariates
phenotype_in_genotype <- phenotype_samples[which(phenotype_samples %in% genotype_samples)]
phenotype_in_covariates <- phenotype_samples[which(phenotype_samples %in% covariates_samples)]
# Detect differences between samples.
if ( length(covariates_samples) != length(covariates_in_phenotype) ) {
message("[WARN] Detected different amount of samples between covariates file and the phenotype file..")
} else if ( length(covariates_samples) != length(covariates_in_genotype) ) {
message("[WARN] Detected different amount of samples between covariates file and the genotype file..")
}
if ( length(genotype_samples) != length(genotype_in_covariates) ) {
message("[WARN] Detected different amount of samples between genotype file and the covariate file..")
} else if ( length(genotype_samples) != length(genotype_in_phenotype) ) {
message("[WARN] Detected different amount of samples between genotype file and the phenotype file..")
}
if ( length(phenotype_samples) != length(phenotype_in_genotype) ) {
message("[WARN] Detected different amount of samples between phenotype file and the genotype file..")
} else if ( length(phenotype_samples) != length(phenotype_in_covariates) ) {
message("[WARN] Detected different amount of samples between phenotype file and the covariate file..")
}
# Define coexisting samples.
coexistingSamples <- intersect( intersect(covariates_samples, genotype_samples), phenotype_samples)
# Create subsets.
genotype_file_content <- genotype_file_content[,coexistingSamples, drop=F]
phenotype_file_content <- phenotype_file_content[,coexistingSamples, drop=F]
covariates_file_content <- covariates_file_content[,coexistingSamples, drop=F]
# Re-order data.
genotype_file_content <- genotype_file_content[,mixedsort( colnames(genotype_file_content) ), drop=F]
phenotype_file_content <- phenotype_file_content[,mixedsort( colnames(phenotype_file_content) ), drop=F]
covariates_file_content <- covariates_file_content[,mixedsort( colnames(covariates_file_content) ), drop=F]
if (arguments$genoToFreq == T) {
genotype_file_content_unconverted <- genotype_file_content_unconverted[,coexistingSamples, drop=F]
genotype_file_content_unconverted <- genotype_file_content_unconverted[, mixedsort( colnames(genotype_file_content_unconverted) ), drop=F]
}
# Output changes.
message("[WARN] Initial number of phenotype samples: ", length(phenotype_samples),
"\n\\___ Initial number of genotype samples: ", length(genotype_samples),
"\n\\___ Initial number of covariate samples: ", length(covariates_samples),
"\n\\___ Excluded: ", as.character( length(phenotype_samples) - length( colnames(phenotype_file_content) ) ),
" samples from the phenotype data.",
"\n\\___ Exlcuded: ", as.character( length(genotype_samples) - length( colnames(genotype_file_content) ) ),
" samples from the genotype data.",
"\n\\___ Excluded: ", as.character( length(covariates_samples) - length( colnames(covariates_file_content) ) ),
" samples from the covariates data.")
}
} else {
if ( length(genotype_samples) == length(phenotype_samples) ) {
if ( all(genotype_samples == phenotype_samples) ) {
message("[INFO] Checking sample names OK..")
} else if ( length( intersect(genotype_samples, phenotype_samples) ) == length( unique(genotype_samples) ) ) {
message("[WARN] Sample names OK, but are in the wrong order..\n\\___ Re-ordering samples..")
stop("[STOP] This is not yet implemented!")
}
} else {
message("[WARN] Some samples do not co-exists in both files..\n\\___ Re-ordering samples and trying to run anyway..")
# Define coexisting samples.
coexistingSamples <- intersect( genotype_samples, phenotype_samples)
# Set subsets of data.
phenotype_file_content <- phenotype_file_content[,coexistingSamples, drop=F]
genotype_file_content <- genotype_file_content[,coexistingSamples, drop=F]
# Re-order data.
phenotype_file_content <- phenotype_file_content[,mixedsort( colnames(phenotype_file_content) ), drop=F]
genotype_file_content <- genotype_file_content[,mixedsort( colnames(genotype_file_content) ), drop=F]
if ( arguments$genoToFreq == T ) {
genotype_file_content_unconverted <- genotype_file_content_unconverted[,coexistingSamples, drop=F]
genotype_file_content_unconverted <- genotype_file_content_unconverted[, mixedsort( colnames(genotype_file_content_unconverted) ), drop=F]
}
# Output changes.
message("[WARN] Initial number of phenotype samples: ", length(phenotype_samples),
"\n\\___ Initial number of genotype samples: ", length(genotype_samples),
"\n\\___ Excluded: ", as.character( length(phenotype_samples) - length( colnames(phenotype_file_content) ) ),
" samples from the phenotype data.",
"\n\\___ Exlcuded: ", as.character( length(genotype_samples) - length( colnames(genotype_file_content) ) ),
" samples from the genotype data.",
"\n\\___ New number of phenotype samples: ", length( colnames(phenotype_file_content) ),
"\n\\___ New number of genotype samples: ", length ( colnames(genotype_file_content) ) )
}
}
message("[INFO] ----------#----------")
# Checking phenotype data -----------------
message("[INFO] Checking phenotype data..")
if ( class( as.vector(phenotype_file_content[1,1]) ) == "character" ) {
message("[WARN] Detected characters in phenotype data.\n\\___ Conversing to integers/numeric values..")
# Save the number of NA's before conversion.
numberOfNABefore <- sum( is.na(phenotype_file_content) )
# Change the type of values in the data.frame
phenotype_file_content <- as.matrix(phenotype_file_content)
suppressWarnings( class(phenotype_file_content) <- "double" )
# Check the number of Na's after conversion
numberOfNAAfter <- sum( is.na(phenotype_file_content) )
message("\\___ Number of NA's in phenotype data before conversion: ", numberOfNABefore,
"\n\\___ Number of NA's in phenotype data after conversion: ", numberOfNAAfter)
}
message("[INFO] Checking phenotype data OK..")
message("[INFO] ----------#----------")
message("[INFO] Checking column names additional files..")
# Genotype Position Data -----
if ( !is.null(genotype_position_content) ) {
expected_genotypePosDataCols <- c("snps", "chr", "pos")
genotypePosDataCols <- colnames( arguments$genotypePositionData )
if ( length( genotypePosDataCols ) == length( expected_genotypePosDataCols ) ) {
if ( all( genotypePosDataCols == expected_genotypePosDataCols ) ) {
message("[INFO] Columns genotype position: OK..")
} else {
stop("[STOP] Please check the leqtar help, the column names of the genotype position file have to match 'snps chr pos'.." )
}
} else {
stop("[STOP] Please check the leqtar help, there are either too many or too few columns in the genotype position file..",
"\n \\___ Expected number of columns in the genotype position file: ", length( expected_genotypePosDataCols ),
"\n \\___ Observed number of columns in the genotype position file: ", length( genotypePosDataCols ) )
}
}
# Phenotype Position Data -----
if ( !is.null(phenotype_position_content) ) {
expected_phenotypePosDataCols <- c("geneid", "chr", "s1", "s2")
phenotypePosDataCols <- colnames(arguments$phenotypePositionData)
if ( length( phenotypePosDataCols ) == length( expected_phenotypePosDataCols ) ) {
if ( all( phenotypePosDataCols == expected_phenotypePosDataCols ) ) {
message("[INFO] Columns phenotype position: OK..")
} else {
stop("[STOP] Please check the leqtar help, the column names of the phenotype position file have to match 'geneid chr s1 s2'.." )
}
} else {
stop("[STOP] Please check the leqtar help, there are either too many or too few columns in the phenotype position file..",
"\n \\___ Expected number of columns in the phenotype position file: ", length( expected_phenotypePosDataCols ),
"\n \\___ Observed number of columns in the phenotype position file: ", length( phenotypePosDataCols ) )
}
}
message("[INFO] Checking column names additional files OK..")
message("[INFO] ----------#----------")
message("[INFO] Processing files OK..")
message("[INFO] ----------#----------")
# Store content in global argument object.
arguments$genotypeData <- genotype_file_content
arguments$genotypePositionData <- genotype_position_content
arguments$phenotypeData <- phenotype_file_content
arguments$phenotypePositionData <- phenotype_position_content
if ( !is.null(arguments$covariates) ) {
arguments$covariatesData <- covariates_file_content
}
if ( arguments$genoToFreq == T ) {
arguments$genotypeUnconvertedData <- genotype_file_content_unconverted
}
return( arguments )
}
# check_object_or_file -------------------
#' check_object_or_file
#'
#' Checks wether the given parameter is a file path or an R object. Thus determines wether to read it or not.
#'
#' @param path_argument the argument that should contain the path.
#' @param data_argument the argument that contains the object.
#' @param name_argument variable name to specify genotype phenotype covariaties or w/e in the messages.
#' @return If the path_argument is indeed given return the file content, else return the object.
check_object_or_file <- function(path_argument, data_argument, name_argument) {
# If not required
if ( is.null(path_argument) & is.null(data_argument) ) {
message("[INFO] Not using ", name_argument, " argument..")
return( NULL )
}
# Check if Object
else if ( is.null(path_argument) ) {
if ( is.matrix(data_argument) | is.data.frame(data_argument) ) {
message("[INFO] ", name_argument, " argument: Object..")
file_content <- data_argument
return( file_content )
} else {
message("[WARN] ", name_argument, " argument is not a matrix or a data.frame.. ")
}
# Check if File
} else {
if ( is.null(data_argument) ) {
if ( is.character(path_argument) ) {
message("[INFO] ", name_argument, " argument: File path..")
extension <- unlist(str_split(path_argument, "\\."))[2]
file_content <- read_files(extension, path_argument)
return( file_content )
} else {
stop("[STOP] ", name_argument, " argument has to be a file path.")
}
} else {
stop("[STOP] Dev note: ", name_argument, " data and Genotype cannot both be assigned.")
}
}
}
# read_files function ---------------------------------------------
#' read_files function
#'
#' Tries to read in files determined by their file extensions.
#' Always returns the content of the given file.
#'
#' @param file_extension the file extension
#' @param file_path the file path
#' @return the file content
read_files <- function(file_extension, file_path) {
file_content <- tryCatch({
message("[INFO] File path: ", file_path, "\n\\___ File extension: ", file_extension)
if (file_extension == "txt") {
read.table(file_path, stringsAsFactors = F, header = T, sep="\t", row.names=NULL)
} else if (file_extension == "Rdata" | file_extension == "RData") {
tmp_env <- new.env()
load( file_path, tmp_env )
file_content = get( ls( tmp_env )[1], envir=tmp_env )
rm(tmp_env)
file_content
} else if (file_extension == "xlsx" | file_extension == "xls") {
read.xls(file_path)
} else if (file_extension == "csv") {
read.csv(file_path, header = T, sep=",")
} else if (file_extension == "tsv") {
read.table(file_path, stringsAsFactors = F, header = T, sep="\t")
} else {
stop("[STOP] File extension: ", file_extension, " not supported!\n\\___ Currently supports RData, txt, xlsx, csv and tsv files.")
}
},
error=function(condition) {
message("[ERR] Could not read in file, critical error..\n\\___ Reason:\n\\___ ", condition)
return(NULL)
},
warning=function(condition) {
message("[WARN] Could not read in file..\n\\___ Reason:\n\\___ ", condition)
return(NULL)
},
finally={
message("[INFO] File: ", file_path, " read correctly..")
})
return(file_content)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.