R/MultiGWASTools.R

################################################################
## Function 1: Take filenames and create MonetDBlite database ##
################################################################
#' makeDatabase
#' 
#' Takes flat text files (comma, tab, or other separated) and converts
#' the data to a MonetDBLite database (<https://github.com/hannesmuehleisen/MonetDBLite>).
#' Given the filepath, file type (PLINK output only, at this time),
#' file extension/suffix and separator, the function automatically pulls 
#' chromosome number and phenotype from the file name, marks the data with
#' that phenotype name, and sorts it into a queryable database.
#'
#' @param filesuffix A string indicating the file extension of the data files
#' to be sorted into the database.
#' @param srcfilepath A string indicating the location of the data to be 
#' sorted into the database, by default the working directory. File path
#' must be absolute, not relative! (eg no "./")
#' @param dbfilepath A string indicating the location of the database, by default
#' the workind directory. File path must be absolute, not relative! (eg no "./")
#' @param separator A string indicating the separator between the phenotype
#' and chromosome in the first part of the file name, by default ".".
#' @param dbname A string giving the name of the folder that will contain the database, or 
#' the name of an existing folder if the data is to be appended to the database. Default "MultiGWASdb".
#' @param ignore A vector of filenames to ignore, that will not be included in the
#' database. You only need to list files here that ALSO have the indicated file suffix-
#' files in the filepath without this suffix will already be ignored.
#' @param pattern An optional string vector to search the filenames for, to subset to only
#' a specific set of data to input (eg only certain chromosomes).
#' @param filetype A string indicating the filetype to be input. At this time only the default
#' "plink" is accepted.
#' @param delimiter A string giving the delimiter in the flat text file, by default tab
#' (\t).
#' @param overwrite Logical, indicating whether a database with the same dbname in the 
#' same filepath should be overwritten to create a new database. Default FALSE.
#' @param append Logical, indicating whether data should be appended to a database with 
#' the same dbname in the same filepath, if it exists. Default TRUE.
#' 
#'
#'
#' @import DBI
#' 
#'
#'
#'
#' @export
makeDatabase <- function(filesuffix, srcfilepath = getwd(), dbfilepath= getwd(), separator = ".", dbname = "MultiGWASdb", ignore = NULL,
						pattern = NULL, filetype = "plink", delimiter = "\t", overwrite = FALSE, append = TRUE) {
	
	# TODO add SNPTEST format input.
	# TODO add ability to input manual column names for pvalue, OR, etc.
	a <- proc.time()
	# Check that filepath inputs are character strings
	if(is.character(srcfilepath) == FALSE | is.character(filesuffix) == FALSE| is.character(separator) == FALSE | 
		is.character(dbname) == FALSE | is.character(dbfilepath) == FALSE) {
		stop("filepaths, filesuffix, separator, and dbdir parameters must be character strings")
	}
	# Check that filetype parameter is one of the options given
	if(filetype %in% c("plink", "snptest", "custom") == FALSE) {
		stop("Filetype must be 'plink' or 'snptest'- see manual for description of file inputs")
	}
	
	if(filetype %in% c("snptest", "custom") == TRUE) {
		stop("We do not yet support SNPTEST or custom header outputs. Please check lea-urpa.github.io/MultiGWASTools for updated development versions of MultiGWASTools")
	}
	# Check that list of files to ignore is a string of characters
	if(!is.null(ignore) & !is.character(ignore)) {
		stop("'ignore' must be a string of characters (filenames to ignore)")
	}
	# Check that logical append/overwrite are mutually exclusive
	if(append == TRUE & overwrite == TRUE) {
		stop("'append' and 'overwrite' cannot both be set to TRUE")
	}
	
	# Check that the given filepath actually exists
	if(dir.exists(srcfilepath) == FALSE) {
		stop("Given srcfilepath does not exist!")
	}	
	if(dir.exists(dbfilepath) == FALSE) {
		stop("Given dbfilepath does not exist!")
	}
	
	# If separator is problematic for searching with grep, fix it
	if(separator == "."){ separator <- "[.]" }
	
	# If there are files to ignore, wrap in object, otherwise give dummy string
	if(!is.null(ignore)) {
		ignorefiles <- ignore
	} else {
		ignorefiles <- "___empty___"
	}
	
	# Define database directory
	dbdir <- paste(dbfilepath, dbname, sep= "/")
	
	# Warn if append = TRUE and database doesn't exist
	if(dir.exists(dbdir) == FALSE & append == TRUE) {
		warning("Append set to TRUE, but specified directory does not exist. New database was created.")
	} 
	
	# Stop if database exists and append & overwrite set to false
	if(dir.exists(dbdir) == TRUE & append == FALSE & overwrite == FALSE) {
		stop("Database in pathway with specfied dbname already exists. Set either append or overwrite to TRUE to continue.")
	}
	
	# If overwrite = TRUE, remove existing directory
 	if(overwrite == TRUE) {
		if(dir.exists(dbdir) == TRUE) {
			warning("Deleted and replaced previous database.")
			unlink(dbdir, recursive = TRUE)
		} else {
			warning("Overwrite set to TRUE, but specified database directory does not exist. No data was overwritten.")
		}
 	}
	
	# Create folder if dbdir does not exist
	if(dir.exists(dbdir) == FALSE) {
		dir.create(dbdir)
	}
	
	# Generate list of files in specified filepath, report to user
	file_list <- list.files(path=srcfilepath, pattern = paste("*",filesuffix, "$", sep = ""))
	if(length(file_list) == 0) {
		stop("There are no files with the specified file suffix in the filepath given.")
	}
	
	# Check if subset pattern given if so subset the file list
	if(!is.null(pattern)) {
		if(grepl("[.]", pattern) == TRUE) {
			pattern <- gsub("[.]", "[.]", pattern)
		}
		file_list <- grep(pattern, file_list, value = TRUE)
	}
	print("List of data files identified in the folder:")
	print(file_list)
	
	# Define function to extract phenotype information
	extract_phenotypes <- function(file) {
		# Extract phenotype
		pheno <- unlist(strsplit(file, separator))[1]
		pheno <- paste("'",pheno,"'", sep = "")
		return(pheno)
	}
	
	# Extract phenotypes from files
	phenotypes <- sapply(file_list, extract_phenotypes, simplify = FALSE, USE.NAMES = TRUE)
	
	b <- proc.time()
	print(paste("Data check and phenotype extraction took:", (b - a)[3]))
	
	
	# Set up database
	a <- proc.time()
	con <- DBI::dbConnect(MonetDBLite::MonetDBLite(), dbdir)
	b <- proc.time()
	print(paste("Connecting to DB took ", (b-a)[3]))
	
	
	tables <- DBI::dbListTables(con)
	
	# check if any tables exist 
	if( any(c("variants", "phenos", "tests", "results") %in% tables) == TRUE) {
		
		if( any( c("variants", "phenos", "tests", "results") %in% tables) == FALSE){
			stop("Some tables (variants, phenos, tests, or results) are missing from database!")
			# Disconnect from database
			DBI::dbDisconnect(con, shutdown=TRUE)
		} 
	} else {
		# Else create the tables
		a <- proc.time()
		
		# Start an SQL transaction
		DBI::dbBegin(con)
	
		DBI::dbSendQuery(con, "CREATE TABLE variants(
									variantid INT PRIMARY KEY AUTO_INCREMENT,
									chrom INT,
									rsid TEXT,
									pos INT,
									ref TEXT,
									alt1 TEXT,
									firth TEXT,
									obsct TEXT,
									UNIQUE(variantid, chrom, rsid, pos, ref, alt1, firth, obsct)
									);" )
								
		DBI::dbSendQuery(con, "CREATE TABLE phenos(
									phenoid INT PRIMARY KEY AUTO_INCREMENT,
									phenoname TEXT,
									UNIQUE(phenoid, phenoname)
									);")
								
		DBI::dbSendQuery(con, "CREATE TABLE tests(
									testid INT PRIMARY KEY AUTO_INCREMENT,
									testname TEXT,
									UNIQUE(testid, testname)
									);")
	
		DBI::dbSendQuery(con, "CREATE TABLE results(
									phenoid INT,
									variantid INT,
									testid INT,
									oddsr FLOAT,
									se FLOAT,
									tstat FLOAT,
									p FLOAT,
									FOREIGN KEY (phenoid) REFERENCES phenos(phenoid),
									FOREIGN KEY (variantid) REFERENCES variants(variantid),
									FOREIGN KEY (testid) REFERENCES tests(testid)
									);")
								
		DBI::dbCommit(con)
		b <- proc.time()
		print(paste("Creating tables took", (b - a)[3] ))
	}
	
	# Loop to load in data
	for(i in 1:length(file_list)) {
		if(file_list[i] %in% ignorefiles == FALSE) {
			
			# Print a ticker for the user
			print(paste("Processing data into MonetDB database for file", file_list[i]))
			a <- proc.time()
			# Get phenotype
			pheno <- phenotypes[file_list[i]]
			
			# Get number of records in file
			nrec <- system(paste("cat ", srcfilepath, "/", file_list[i], "| wc -l", sep = ""), intern = TRUE)
			nrec <- as.numeric(nrec)
			b <- proc.time()
			print(paste("Getting pheno and number of records took", (b - a)[3]))
			
			# Start an SQL transaction
			DBI::dbBegin(con)
			
			a <- proc.time()
			# Manually define temptable and load in data
			if(filetype == "plink" & i == 1) {
				DBI::dbSendQuery(con, 'CREATE TABLE temptable (
														"X.CHROM" INT,
														"POS" INT,
														"ID" TEXT,
														"REF" TEXT,
														"ALT1" TEXT,
														"FIRTH." TEXT,
														"TEST" TEXT,
														"OBS_CT" INT,
														"OR" FLOAT,
														"SE" FLOAT,
														"T_STAT" FLOAT,
														"P" FLOAT
														);')
			# From what I understand float is OK, since we have less than 8 places accuracy in input data anyway 
			} else if (filetype == "snptest" | filetype == "custom") {
				stop("snptest or custom file output not supported yet. How did you even get this far anyway?")
			} 
			DBI::dbCommit(con)
			
			#Begin next transaction
			DBI::dbBegin(con)
			DBI::dbSendQuery(con, paste0("COPY ", nrec, " OFFSET 2 RECORDS INTO temptable FROM '", paste(srcfilepath, file_list[i], sep= "/"),
											"' USING DELIMITERS '", delimiter, "','\n','\"' NULL as 'NA'"))
			b <- proc.time()
			print(paste("Creating temptable and inserting data took:", (b - a)[3] ))
			
			DBI::dbCommit(con)
			
			#Begin next transaction
			DBI::dbBegin(con)	
			# Extract data repeated for each rsid to variants table
			a <- proc.time()
			DBI::dbSendQuery(con, 'INSERT INTO variants (chrom, rsid, pos, ref, alt1, firth, obsct) 
										SELECT "X.CHROM","ID","POS", "REF", "ALT1","FIRTH.", "OBS_CT" 
										FROM temptable 
										WHERE NOT EXISTS (SELECT 1 
															FROM variants
															WHERE variants.rsid = temptable."ID"
															AND variants.ref = temptable."REF"
															AND variants.alt1 = temptable."ALT1"
															AND variants.firth = temptable."FIRTH."
															AND variants.obsct = temptable."OBS_CT")
										GROUP BY "X.CHROM","ID","POS", "REF", "ALT1","FIRTH.", "OBS_CT"' )
			b <- proc.time()
			print(paste("Extracting unique variants took", (b - a)[3] ))
			DBI::dbCommit(con)
			
			#Begin next transaction
			DBI::dbBegin(con)
			
			# IF test not already in table, insert test
			a <- proc.time()
			DBI::dbSendQuery(con, 'INSERT INTO tests (testname)
										SELECT "TEST"
										FROM temptable
										WHERE NOT EXISTS (SELECT 1
															FROM tests
															WHERE tests.testname = temptable."TEST")
										GROUP BY "TEST"')
			b <- proc.time()
			print(paste("Extracting unique tests took", (b - a)[3] ))
			
			DBI::dbCommit(con)
			
			#Begin next transaction
			DBI::dbBegin(con)
			
			# IF phenotype not already in table, insert phenotype into phenos table
			a <- proc.time()
		    tablephenos <- DBI::dbGetQuery(con, 'SELECT phenoname FROM phenos') 
	    	pheno_noquote <- gsub("'", "", pheno) 
	    	if( pheno_noquote %in% tablephenos$phenoname == FALSE ) { 
	        	DBI::dbSendQuery(con, paste('INSERT INTO phenos (phenoname) VALUES (', pheno , ')')) 
	      	}  
			b <- proc.time()
			print(paste("Extracting unique phenotype took", (b - a)[3] ))
			
			DBI::dbCommit(con)
			
			#Begin next transaction
			DBI::dbBegin(con)
			
			a <- proc.time()
			# Save phenoid as R object
			phenoid <- DBI::dbGetQuery(con, paste('SELECT phenoid FROM phenos WHERE phenoname =', pheno))					
			
			# Add phenoid column to temp table, set value to your saved phenoid
			DBI::dbSendQuery(con, 'ALTER TABLE temptable ADD phenoid INT')
			DBI::dbSendQuery(con, paste('UPDATE temptable SET phenoid =', phenoid))
			b <- proc.time()
			print(paste("Identifying phenoid and inserting to temptable took", (b - a)[3] ))
			
			DBI::dbCommit(con)
			
			#Begin next transaction
			DBI::dbBegin(con)
			
			# Create temptable2, insert data by left joining temptable with variants table
			a <- proc.time()
			if( i == 1){
				DBI::dbSendQuery(con, 'CREATE TABLE temptable2 (
															test TEXT,
															oddsr FLOAT,
															se FLOAT,
															tstat FLOAT,
															p FLOAT,
															phenoid TEXT,
															variantid TEXT
															);')
			}
			b <- proc.time()
			print(paste("Creating temptable2 took", (b - a)[3]))
			
			a <- proc.time()
			DBI::dbSendQuery(con, 'INSERT INTO temptable2 (test, oddsr, se, tstat, p, phenoid, variantid) 
									SELECT "TEST", "OR", "SE", "T_STAT", "P", phenoid, variantid
									FROM temptable
									LEFT JOIN variants
										ON temptable."ID" = variants.rsid
										AND temptable."REF" = variants.ref
										AND temptable."ALT1" = variants.alt1
										AND temptable."FIRTH." = variants.firth
										AND temptable."OBS_CT" = variants.obsct
										')
			b <- proc.time()
			print(paste("Inserting joined data to temptable2 took", (b - a)[3]))
			
			DBI::dbCommit(con)
			
			#Begin next transaction
			DBI::dbBegin(con)
				
			# Create temptable3, insert data by left joining temptable2 with tests table
			a <- proc.time()
			if(i == 1){
				DBI::dbSendQuery(con, 'CREATE TABLE temptable3 (
														oddsr FLOAT,
														se FLOAT,
														tstat FLOAT,
														p FLOAT,
														phenoid TEXT,
														variantid TEXT,
														testid TEXT)')
			}
			b <- proc.time()
			print(paste("Creating temptable3 took", (b-a)[3]))
			
			a <- proc.time()
			DBI::dbSendQuery(con, 'INSERT INTO temptable3 (oddsr, se, tstat, p, phenoid, variantid, testid)
									SELECT oddsr, se, tstat, p, phenoid, variantid, testid
									FROM temptable2
									LEFT JOIN tests
										ON temptable2.test = tests.testname')
			b <- proc.time()
			print(paste("Inserting joined data to temptable3 took", (b-a)[3]))
			
			DBI::dbCommit(con)
			
			#Begin next transaction
			DBI::dbBegin(con)
			
			a <- proc.time()
			# Insert important data into results table
			DBI::dbSendQuery(con, 'INSERT INTO results (oddsr, se, tstat, p, phenoid, variantid, testid)
										SELECT oddsr, se, tstat, p, phenoid, variantid, testid
										FROM temptable3')
			b <- proc.time()
			print(paste("Inserting temptable data into results table took", (b - a)[3]))
			
			DBI::dbCommit(con)
			
			#Begin next transaction
			DBI::dbBegin(con)
			
			a <- proc.time()
			# Drop or delete from temptables (also drops associated indices)
			if(i == length(file_list)){
				DBI::dbSendQuery(con, 'DROP TABLE temptable')
				DBI::dbSendQuery(con, 'DROP TABLE temptable2')
				DBI::dbSendQuery(con, 'DROP TABLE temptable3')
			} else {
				DBI::dbSendQuery(con, 'DELETE FROM temptable')
				DBI::dbSendQuery(con, "ALTER TABLE temptable DROP COLUMN phenoid")
				DBI::dbSendQuery(con, 'DELETE FROM temptable2')
				DBI::dbSendQuery(con, 'DELETE FROM temptable3')
			}
			b <- proc.time()
			print(paste("Deleting from temptables took", (b -a)[3] ))
			
			DBI::dbCommit(con)
			
		} else {
			print(paste("Ignoring file", file_list[i]))
		}
	}
	
	# Disconnect from database
	DBI::dbDisconnect(con, shutdown=TRUE)
}
###############################################
## Function 1.5: Add indices on the database ## 
###############################################
# This is faster than dropping and adding again the indices...
#' addIndices
#'
#' Connects to previously created database (currently only a MonetDBLite database
#' created with makeDatabase)			
#' and adds indices to the database.
#'
#' @param dbdir A string giving the directory (filepath + folder name) of the database to 
#' connect to. The file path must be absolute, not relative (eg no "./").
#' @param drop Logical indicating whether the indicies should be dropped (TRUE), otherwise the
#' indices will be created (FALSE).
#' 
#' @import DBI
#'
#'
#'
#'
#' @export
addIndices <- function(dbdir, drop = FALSE){
	if(dir.exists(dbdir) == FALSE) {
		stop("Database directory specified does not exist")
	}
	# Connect to database
	print(paste("Connecting to database ", dbdir))
	a <- proc.time()
	try(con <- DBI::dbConnect(MonetDBLite::MonetDBLite(), dbdir))
	b <- proc.time()
	print(paste("Connecting to database took", (b-a)[3]))
	
	if (drop == FALSE){
		#Begin next transaction
		DBI::dbBegin(con)

		print(paste0("Creating indices for ", dbdir))
		a <- proc.time()
		try(DBI::dbSendQuery(con, "CREATE INDEX Idx1 ON variants (rsid)"))
		try(DBI::dbSendQuery(con, "CREATE INDEX Idx2 ON phenos (phenoname)"))
		try(DBI::dbSendQuery(con, "CREATE INDEX Idx3 ON tests (testname)"))
		try(DBI::dbSendQuery(con, "CREATE INDEX Idx4 ON results (testid)"))
		try(DBI::dbSendQuery(con, "CREATE INDEX Idx5 ON results (variantid)"))
		try(DBI::dbSendQuery(con, "CREATE INDEX Idx6 ON results (phenoid)"))

		try(DBI::dbSendQuery(con, "CREATE INDEX Idx7 ON variants (variantid)"))
		try(DBI::dbSendQuery(con, "CREATE INDEX Idx8 ON phenos (phenoid)"))
		try(DBI::dbSendQuery(con, "CREATE INDEX Idx9 ON tests (testid)"))
		b <- proc.time()
		print(paste("Creating indices took", (b - a)[3]))

		DBI::dbCommit(con)
		
	} else if (drop == TRUE) {
		#Begin next transaction
		DBI::dbBegin(con)

		print(paste("Dropping indices for ", dbdir))
		a <- proc.time()
		try(DBI::dbSendQuery(con, "DROP INDEX Idx1"))
		try(DBI::dbSendQuery(con, "DROP INDEX Idx2"))
		try(DBI::dbSendQuery(con, "DROP INDEX Idx3"))
		try(DBI::dbSendQuery(con, "DROP INDEX Idx4"))
		try(DBI::dbSendQuery(con, "DROP INDEX Idx5"))
		try(DBI::dbSendQuery(con, "DROP INDEX Idx6"))

		try(DBI::dbSendQuery(con, "DROP INDEX Idx7"))
		try(DBI::dbSendQuery(con, "DROP INDEX Idx8"))
		try(DBI::dbSendQuery(con, "DROP INDEX Idx9"))
		b <- proc.time()
		print(paste("Dropping indices took", (b - a)[3]))

		DBI::dbCommit(con)
	}	
	# Disconnect from database
	a <- proc.time()
	DBI::dbDisconnect(con, shutdown=TRUE)
	b <- proc.time()
	print(paste("Disconnecting from database took", (b-a)[3]))
}

#######################################################################################
## Function 2: Take list of rsids/ positions, extract data from database to R object ##
#######################################################################################
#' extractData
#'
#' Connects to previously created database (currently only a MonetDBLite database
#' created with makeDatabase)			
#' and queries specific data from the database. Essentially a wrapper for
#' MonetDBLite query statements, without having to learn SQL query syntax. Users
#' can provide either a vector of rsids to query the database with OR a vector
#' of start/stop positions along a given chromosome. One can also threshold the 
#' data with minimum p value, test (additive or other), or phenotype.
#'
#' @param dbdir A string giving the directory (filepath + folder name) of the database to 
#' connect to. The file path must be absolute, not relative (eg no "./").
#' @param rsids A vector containing the rsids to query the database. Must provide
#' either rsids or positions, not both! 
#' @param pos A vector containing three numbers, the chromosome, start and stop positions
#' along that chromosome to query the database. Must provide either rsids or
#' positions, not both!
#' @param pvalThresh A number indicating the minimum threshold of pvalues for the
#' data to return, by default all (pvalue = 1).
#' @param phenotype A string or vector of strings containing the phenotypes that will
#' be extracted from the data, by default all phenotypes.
#' @param test A string or vector of strings containing the tests that will be
#' extracted  from the data, by default all tests.
#' @param dbtype A string indicating database type. Currently only the default "MonetDB" accepted.
#' 
#' @import DBI
#' 
#'
#'
#'
#' @export
extractData <- function( dbdir, rsids = NULL, pos = NULL, pvalThresh = 1, phenotype = "all", test = "all", dbtype = "MonetDB") {
	# Check inputs
	if(dbtype != "MonetDB") {
		stop("dbtype must be a string indicating databaset type. MongoDB not yet supported.")
	}
	if(is.null(rsids) == TRUE & is.null(pos) == TRUE) {
		stop("Specify either rsids or pos to query from database")
	}
	if(is.null(rsids) == FALSE & is.null(pos) == FALSE) {
		stop("Specify only one: rsids or pos to query from database")
	}
	if(is.numeric(pvalThresh) == FALSE) {
		stop(" pvalThresh argument must be a number")
	}
	if(dir.exists(dbdir) == FALSE) {
		stop("Database directory specified does not exist")
	}
	
	if(is.null(pos) == FALSE ) {
		if(is.vector(pos) == FALSE | length(pos) != 3) { 
			stop("Positions must be a vector of length three (chrom, start position, stop position)")
		}
		if(pos[1] %in% c(1:22,"x","y","X","Y") == FALSE) {
			stop("Chromosome in pos vector must be 1-22, 'x', 'y', 'X', or 'Y'.")
		}
		if(pos[3] < pos[2]) {
			stop("End position in pos vector must be greater than start position!")
		}
	}
	# Connect to database
	con <- DBI::dbConnect(MonetDBLite::MonetDBLite(), dbdir)
	
	# If phenotype set to all in database, save that as a vector to iterate through
	if(phenotype == "all") {
		#Begin next transaction
		DBI::dbBegin(con)
		
		phenos <- DBI::dbGetQuery(con, "SELECT phenos.phenoname FROM phenos")
		phenotype <- as.vector(phenos$phenoname)
		
		DBI::dbCommit(con)
	}
	
	# If test set to all in database, save that as a vector to iterate through (limit as overshot of test)
	if(test == "all") {
		#Begin next transaction
		DBI::dbBegin(con)
		
		test <- DBI::dbGetQuery(con, "SELECT DISTINCT results.testid FROM results LIMIT 60") 
		test <- as.vector(test$test)
		
		DBI::dbCommit(con)
	}
	
	# Extract data by user specifications
	for(i in 1:length(phenotype)) {
		for(j in 1:length(test)) {
			if(is.null(rsids) == FALSE) {
				
				# If rsids given, iterate through list of rsids
				for(k in 1:length(rsids)) {
					print(paste("Extracting data for pheno:", phenotype[i], "test:", test[j], "rsid:", rsids[k], "from database, where pval <",  pvalThresh))
					
					#Begin next transaction
					DBI::dbBegin(con)
					
					tempres <- DBI::dbGetQuery(con,
							paste("SELECT rsid, chrom, pos, ref, alt1, firth, obsct, testname, phenoname, oddsr, se, tstat, p
									FROM variants
									JOIN results
									ON results.variantid = variants.variantid
									JOIN phenos
									ON results.phenoid = phenos.phenoid
									JOIN tests 
									ON results.testid = tests.testid
									WHERE  phenos.phenoname = \'", phenotype[i], "\' AND tests.testname = \'", test[j], "\' AND variants.rsid = \'", rsids[k], "\'",
									"AND p <",  pvalThresh, sep = "" 
								))
								
					DBI::dbCommit(con)
					
					if(nrow(tempres) == 0) {
						warning(paste("No data in database for rsid:", rsids[k], " phenotype:", phenotype[i], "test:", test[j], " at pvalue less than",  pvalThresh))
					}
					if(i == 1 & j== 1 & k == 1 ){
						res <- tempres
					} else {
						res <- rbind(res, tempres)
					}
				}
			} else {
				
				# If positions given, extract all with specified values
				print(paste("Extracting data for phenotype:", phenotype[i], "test:", test[j], "between pos", pos[2], "and", pos[3], "where pval <",  pvalThresh ))
				
				#Begin next transaction
				DBI::dbBegin(con)
				
				tempres <- DBI::dbGetQuery(con,
						paste(" SELECT rsid, chrom, pos, ref, alt1, firth, obsct, testname, phenoname, oddsr, se, tstat, p
								FROM variants
								JOIN results
								ON results.variantid = variants.variantid
								JOIN phenos
								ON results.phenoid = phenos.phenoid
								JOIN tests
								ON results.testid = tests.testid
								WHERE phenos.phenoname = \'", phenotype[i], "\' AND tests.testname = \'", test[j], "\' AND p <",  pvalThresh,
								"AND chrom = ", pos[1], "AND pos BETWEEN ", pos[2], " AND ", pos[3], sep = ""
						))
						
				DBI::dbCommit(con)
				
				if(nrow(tempres) == 0){
					warning(paste("No data in database for phenotype:", phenotype[i], "test:", test[j], "between", pos[2], "and", pos[3], "at pvalue less than",  pvalThresh))
				}
				if(i == 1 & j == 1 ) {
					res <- tempres
				} else {
					res <- rbind(res, tempres)
				}
						
			}
		}
	}
	
	# Disconnect from database
	DBI::dbDisconnect(con, shutdown=TRUE)
	
	return(res)
}

############################################################################################
## Small functions for users to check data: tables, fields in tables, and heads of tables ##
############################################################################################
#' listTables
#'
#' A convenience function, simply a wrapper for MonetDBLite's dbListTables
#'
#' @param dbdir A string giving the directory (filepath + folder name) of the database to 
#' connect to. The file path must be absolute, not relative (eg no "./").
#'
#' @import DBI
#' 
#'
#' @export
listTables <- function(dbdir) {
	con <- DBI::dbConnect(MonetDBLite::MonetDBLite(), dbdir)
	tables <- DBI::dbListTables(con)
	DBI::dbDisconnect(con, shutdown=TRUE)
	return(tables)
}
#' listColumnNames
#'
#' A convenience function, simply a wrapper for MonetDBLite's dbListFields
#'
#' @param dbdir A string giving the directory (filepath + folder name) of the database to 
#' connect to. The file path must be absolute, not relative (eg no "./").
#' @param table A string giving the name of the table in the database whose column names
#' should be listed.
#'
#' @import DBI
#'
#'
#' @export
listColumnNames <- function(dbdir, table) {
	if(is.character(table) == FALSE){
		stop("table must be the name of one of the tables in your database (variants, phenos, tests, or results)")
	}
	con <- DBI::dbConnect(MonetDBLite::MonetDBLite(), dbdir)
	names <- DBI::dbListFields(con, table)
	DBI::dbDisconnect(con, shutdown=TRUE)
	return(names)
}
#' headTable
#'
#' A convenience function, simply a wrapper for MonetDBLite's dbGetQuery, querying the
#' specified table for first n rows.
#'
#' @param dbdir A string giving the directory (filepath + folder name) of the database to 
#' connect to. The file path must be absolute, not relative (eg no "./").
#' @param table A string giving the name of the table in the database whose column names
#' should be listed.
#' @param n A number giving the first n rows of the table that should be displated.
#'
#' @import DBI
#' 
#'
#' @export
headTable <- function(dbdir, table, n) {
	if(is.numeric(n) == FALSE) {
		stop("n must be a number")
	}
	if(is.character(table) == FALSE) {
		stop("table must be the name of one of the tables in your database (variants, phenos, tests, or results)")
	}
	con <- DBI::dbConnect(MonetDBLite::MonetDBLite(), dbdir)
	#Begin next transaction
	DBI::dbBegin(con)
	
	head <- DBI::dbGetQuery(con, paste("SELECT * FROM", table, "LIMIT", n))
	DBI::dbCommit(con)
	
	DBI::dbDisconnect(con, shutdown=TRUE)
	return(head)
}

###########################
## html widget functions ##
###########################
#' MultiVariantsViewer
#'
#' Takes a dataset (eg the output of extractData) and plots the data interactively. Data must
#' contain headers 'chrom', 'rsid', 'pos', 'ref', 'alt1', 'firth', 'obsct', 'testname', 'oddsr',
#' 'se', 'tstat', 'p', and 'phenoname'.
#' 
#' @param dataset The dataset, the result of extractData, that will be plotted.
#' @param width Optional plot width specification.
#' @param width Optional plot height specification.
#' @param elementId Optional plot html element id.
#' 
#' @import htmlwidgets
#'
#'
#' @export 
MultiVariantViewer <- function(dataset, width = NULL, height = NULL, elementId = NULL) {
	
	if( all(colnames(dataset) %in% c('chrom', 'rsid', 'pos','ref','alt1','firth','obsct','testname','oddsr','se','tstat','p','phenoname')) == FALSE ){
		stop("Dataset must contain headers 'chrom', 'rsid', 'pos','ref','alt1','firth','obsct','testname','oddsr','se','tstat','p', and 'phenoname'")
	}
	
	# Create list of unique ids (keys) for each data point
	keys <- paste(dataset$chrom, dataset$pos, dataset$rsid, dataset$phenoname, dataset$test, sep="_")

	# Find max and min plot values
	ORrange <- list( min = min(dataset$oddsr), max = max(dataset$oddsr) )
	Prange <- list(min = min(-log10(dataset$p)), max = max(-log10(dataset$p)))

	# Find unique rsids, phenotypes, and tests
	rsids <- unique(dataset$rsid)
	phenonames <- unique(dataset$phenoname)
	tests <- unique(dataset$test)
	
	# Get vector of colors to use for later graphing
	if(length(phenonames) > length(tests)) {
		colors <- rainbow( length(phenonames), v = 0.8)
	} else {
		colors <- rainbow( length(tests), v = 0.8)
	}
	
	# Take off opacity tag for browser compatability
	colors <- substr(colors, 1,7) 

	data <- list(keys = keys, ids = dataset$rsid, chrom=dataset$chrom, pos=dataset$pos, pval= -log10(dataset$p),
		 		oddsr=dataset$oddsr, obsct=dataset$obsct,
				pheno=dataset$phenoname, test=dataset$testname, ORrange=ORrange, Prange=Prange, rsids=rsids,
				phenonames= phenonames, tests=tests, colors = colors)
		
	# create widget
	htmlwidgets::createWidget(
		name = 'MultiVariantViewer',
		data,
		width = width,
		height = height,
		package = 'MultiGWASTools',
		elementId = elementId
	)
}

#' SingleVariantViewer
#'
#' Takes the dataset for a single variant (eg the output of extractData) and plots the data interactively. 
#' Data must contain headers 'chrom', 'rsid', 'pos', 'ref', 'alt1', 'firth', 'obsct', 'testname', 'oddsr',
#' 'se', 'tstat', 'p' and 'phenoname'.
#'
#' @param dataset The dataset, the results of extractData, that will be plotted.
#' @param width Optional plot width specification.
#' @param width Optional plot height specification.
#' @param elementId Optional plot html element id.
#'
#' @import htmlwidgets
#'
#' @export 
SingleVariantViewer <- function(dataset, width = NULL, height = NULL, elementId = NULL ) {
	if( all(colnames(dataset) %in% c('chrom', 'rsid', 'pos','ref','alt1','firth','obsct','testname','oddsr','se','tstat','p','phenoname')) == FALSE ){
		stop("Dataset must contain headers 'chrom', 'rsid', 'pos','ref','alt1','firth','obsct','testname','oddsr','se','tstat','p', and 'phenoname'")
	}
	if(length(unique(dataset$rsid)) > 1) {
		stop('This dataset contains more than one RSID. SingleVariantViewer should be used with only one RSID.')
	}
	
	# Create list of unique ids (keys) for each data point
	keys <- paste(dataset$chrom, dataset$pos, dataset$rsid, dataset$phenoname, dataset$test, sep="_")

	# Find max and min plot values
	ORrange <- list( min = min(dataset$oddsr), max = max(dataset$oddsr) )
	Prange <- list(min = min(-log10(dataset$p)), max = max(-log10(dataset$p)))

	# Find unique rsids, phenotypes, and tests
	rsids <- unique(dataset$rsid)
	phenonames <- unique(dataset$phenoname)
	tests <- unique(dataset$test)
	
	# Get vector of colors to use for later graphing
	if(length(phenonames) > length(tests)) {
		colors <- rainbow( length(phenonames), v = 0.8)
	} else {
		colors <- rainbow( length(tests), v = 0.8)
	}
	
	# Take off opacity tag for browser compatability
	colors <- substr(colors, 1,7) 

	data <- list(keys = keys, ids = dataset$rsid, chrom=dataset$chrom, pos=dataset$pos, pval= -log10(dataset$p),
		 		oddsr=dataset$oddsr, obsct=dataset$obsct,
				pheno=dataset$phenoname, test=dataset$testname, ORrange=ORrange, Prange=Prange, rsids=rsids,
				phenonames= phenonames, tests=tests, colors = colors)
	
	# create widget
	htmlwidgets::createWidget(
		name = 'SingleVariantViewer',
		data,
		width = width,
		height = height,
		package = 'MultiGWASTools',
		elementId = elementId
	)
}
lea-urpa/MultiGWASTools documentation built on May 24, 2019, 5:01 a.m.