Nothing
#' @useDynLib GxEScanR, .registration = TRUE
#' @importFrom Rcpp sourceCpp
#' @importFrom prodlim row.match
#' @importFrom stats complete.cases glm var
NULL
subsetsnps <- function(snps, snplist) {
if (length(snps) == 0) {
stop("No SNPs selected")
}
if (is.character(snps) == TRUE) {
if (length(snps) == 1 & snps[1] == "all")
return(rep(TRUE, length(snplist)))
snps2 <- match(snps, snplist)
snps2 <- snps2[!is.na(snps2)]
if (length(snps2) == 0)
stop("No matching SNPs found")
if (length(snps) != length(snps2))
print(paste(length(snps) - length(snps2),
" of ",
length(snps),
" SNPs were not found"))
snps <- snps2
}
if (is.numeric(snps) == FALSE)
stop("snps must be a character or integer array")
if (is.integer(snps) == FALSE) {
if (all(floor(snps) == snps) == FALSE)
stop("snps must be a character or integer array")
}
if (min(snps) < 1)
stop("snp indices must be positive")
if (max(snps) > length(snplist))
stop("at least one snp index is greater than the number of SNPs available")
snpstouse <- rep(FALSE, length(snplist))
snpstouse[snps] <- TRUE
return (snpstouse)
}
# Subsets the covariate data to complete cases
# Also gets subject indices in the genetic data
subsetdata <- function(subdata, bdinfo, binary, mincov) {
# There must be at least two columns in the subject data
if(ncol(subdata) < 2)
stop("There must me at least two columns in the subject data")
# Check if the first column is a character value
if (is.character(subdata[,1]) == FALSE)
stop("First column of subject data must be a character value")
# Remove subjects without complete data
covdata <- subdata[complete.cases(subdata),]
if (nrow(covdata) == 0)
stop("No subjects have complete phenotype/covariate data")
colnames(covdata) <- colnames(subdata)
# Determine if family IDs are used
# and get the indices of the subjects in the genetic data that are
# in the covariate data
if (bdinfo$usesfid == FALSE) {
covdata$genindex <- match(covdata[,1], bdinfo$samples$sid)
phenocol <- 2
} else {
# When family ID is used, there must be at least 3 columns
# in the subject data
if (ncol(covdata) < 3)
stop("When using family ID, subject data must have at least 3 columns")
# When family ID is used, the second columns must be a character value
if (is.character(covdata[,2]) == FALSE)
stop("When using family ID, the first two columns must be character values")
covdata$genindex <- row.match(covdata[,1:2], bdinfo$samples)
phenocol <- 3
}
if (ncol(subdata) < phenocol + mincov)
stop("Subject data has no covariates")
for (i in phenocol:ncol(subdata)) {
if (is.numeric(covdata[,i]) == FALSE)
stop("Phenotype and covariate values must be numeric")
}
# Drop subjects that don't have genetic values
covdata <- covdata[complete.cases(covdata),]
# Check if any subjects have complete data
if (nrow(covdata) == 0)
stop("No subjects have complete data")
# If the outcome is binary sort by outcome and
if (binary == TRUE) {
covdata <- covdata[order(covdata[, phenocol]),]
# Check there are two outcome values
if (length(unique(covdata[,phenocol])) != 2)
stop("When using a binary outcome there must only be two values")
# check if outcome values are 0, 1
if (all(unique(covdata[, phenocol]) == c(0,1)) == FALSE)
stop("When using a binary outcome must be coded 0,1")
}
phenocov = as.matrix(covdata[,phenocol:(ncol(covdata)-1)])
dimnames(phenocov) <- list(covdata[,ncol(covdata)],
colnames(covdata)[phenocol:(ncol(covdata)-1)])
return (phenocov)
}
# Create blocks used for faster reading of binary dosage files
assignblocks <- function(nsub, nsnps, snploc, snpbytes, reqblksize) {
if (nsub < 10000) {
blksnps <- 5000
} else if (nsub < 25000) {
blksnps <- 2000
} else if (nsub < 50000) {
blksnps <- 1000
} else if (nsub < 100000) {
blksnps <- 500
} else if (nsub < 250000) {
blksnps <- 200
} else if (nsub < 500000) {
blksnps <- 100
} else {
blksnps <- 50
}
if (reqblksize > 2 * blksnps & nsnps > 2 * blksnps)
stop("Requested block size greater than twice the recommended block size")
if (reqblksize != 0)
blksnps <- reqblksize
nblks <- ceiling(nsnps / blksnps)
if (nblks == 1) {
blksnps <- nsnps
blkloc <- snploc[1]
blkbytes <- snploc[nsnps] - snploc[1] + snpbytes[length(snpbytes)]
} else {
fsnp <- seq(1, (nblks - 1) * blksnps + 1, blksnps)
blkloc <- snploc[fsnp]
blkbytes <- numeric(nblks)
blkbytes[1:(nblks - 1)] <- snploc[fsnp[2:nblks]] - snploc[fsnp[1:(nblks - 1)]]
blkbytes[nblks] <- snploc[nsnps] - snploc[fsnp[nblks]] + snpbytes[length(snpbytes)]
}
return(list(snpsperblk = blksnps,
blkloc = blkloc,
blkbytes = blkbytes))
}
#####################################################
### Check for valid input values
#####################################################
validateinput <- function(data, bdinfo, outfile, skipfile,
minmaf, blksize, binary) {
# Check if input values are of correct type
if (is.data.frame(data) == FALSE)
stop("data must be a data frame")
if (class(bdinfo) != "genetic-info")
stop("bdinfo not a genetic-info class")
if (class(bdinfo$additionalinfo) != "bdose-info")
stop("bdinfo does not have information about a binary dosage file")
if (is.character(outfile) == FALSE)
stop("outfile must be a character value")
if (length(outfile) != 1)
stop("outfile must be a character vector of length 1")
if (is.character(skipfile) == FALSE)
stop("skipfile must be a character value")
if (length(skipfile) != 1)
stop("skipfile must be a character vector of length 1")
if (is.numeric(minmaf) == FALSE)
stop("minmaf must be a numeric value")
if (length(minmaf) != 1)
stop("minmaf must be a numeric vector of length 1")
if (minmaf < 0 | minmaf > 0.25)
stop("minmaf must be a value from 0 to 0.25, inclusive")
if (is.numeric(blksize) == FALSE)
stop("blksize must be an integer")
if (length(blksize) != 1)
stop("blksize must be an integer vector of length 1")
if (blksize != floor(blksize))
stop("blksize must be an integer")
blksize <- as.integer(blksize)
if (blksize < 0)
stop("blksize must be greater than or equal to 0")
if (is.logical(binary) == FALSE)
stop("binary must be a logical value")
if (length(binary) != 1)
stop("binary must be a logical vector of length 1")
}
#' gwas
#'
#' Run a gwas using genetic data from a binary dosage file
#' @param data Data frame containing the subject ID, phenotype
#' and covariates
#' @param bdinfo Information about the binary dosage file returned
#' from the BinaryDosage::getbdinfo routine
#' @param snps The SNPs to be used in the scan. This may be an integer
#' vector indicate which SNPs to use in the binary dosage file or a
#' character vector of the SNP IDs to use. The value may also be "all",
#' indicating to use all SNPs. The default value is "all".
#' @param outfile The file name for the results Can be blank.
#' If the value is "", the results are returned as a data frame. Default
#' value is ""
#' @param skipfile The name of the file to write the SNPs that were not
#' used and the reason they weren't used. If the value is blank, there is
#' no output of the unused SNPs. Default value is "".
#' @param minmaf Minimum minor allele frequency of SNPs to include
#' in analysis. SNPS that have less than 20 minor alleles observed
#' will be excluded from the analysis regardless of the value of
#' minmaf. A value of 0 indicates to use all the SNPs that have 20
#' minor alleles observed. Default value is 0.
#' @param blksize Size of blocks of SNPs to read in at one time.
#' Larger blocks can improve overall speed but require larger
#' amounts of computer memory. A value of 0 indicates to use the
#' recommended block size. Default value is 0.
#' @param binary Logical value indicating if the phenotype
#' is a binary value. Default value is false.
#' @return
#' 0
#' @export
#'
#' @examples
#' bdinfo <- readRDS(system.file("extdata/pdata_4_1.bdinfo", package = "GxEScanR"))
#' covdata <- readRDS(system.file("extdata/covdata.rds", package = "GxEScanR"))
#'
#' results <- gwas(data = covdata, bdinfo = bdinfo, binary = FALSE)
gwas <- function(data, bdinfo, snps, outfile, skipfile,
minmaf, blksize, binary) {
# Set missing values to default values
if (missing(snps) == TRUE)
snps = "all"
if (missing(outfile) == TRUE)
outfile <- ""
if (missing(skipfile) == TRUE)
skipfile <- ""
if (missing(minmaf) == TRUE)
minmaf <- 0.
if (missing(blksize))
blksize <- 0L
if (missing(binary) == TRUE)
binary = TRUE
validateinput(data, bdinfo, outfile, skipfile, minmaf, blksize, binary)
snps <- subsetsnps(snps = snps,
snplist = bdinfo$snps$snpid)
#####################################################
### Subset data to subjects with complete
### data and run baseline regression
#####################################################
data <- subsetdata(subdata = data,
bdinfo = bdinfo,
binary = binary,
mincov = 0)
stddata <- matrix(data = 0,
nrow = nrow(data),
ncol = ncol(data))
means <- numeric(ncol(data))
stddevs <- numeric(ncol(data))
stdmat(data, stddata, means, stddevs)
print(paste(nrow(data), "subjects have complete data"))
if (binary == TRUE)
modfamily = "binomial"
else
modfamily = "gaussian"
modformula = paste(colnames(data)[1], '~', '.')
basemodel = glm(formula = modformula,
family = modfamily,
data = as.data.frame(data))
#####################################################
### Get information about the binary
### dosage file and create the blocks
### needed for reading the data
#####################################################
blkinfo <- assignblocks(nsub = nrow(bdinfo$samples),
nsnps = length(bdinfo$snps$snpid),
snploc = bdinfo$indices,
snpbytes = bdinfo$datasize,
reqblksize = blksize)
if (bdinfo$additionalinfo$format == 1) {
if (bdinfo$additionalinfo$subformat == 1)
base = 0L
else
base = 1L
} else {
base <- 2L
}
subindex <- as.integer(rownames(data))
if (binary == TRUE) {
ncov <- length(basemodel$coefficients)
beta0 <- numeric(ncov)
if(ncov == 1) {
beta0[1] = basemodel$coefficients[1]
} else {
beta0[2:ncov] <- basemodel$coefficients[2:ncov] * stddevs[2:ncov]
beta0[1] <- basemodel$coefficients[1] +
sum(means[2:ncov] * basemodel$coefficients[2:ncov])
}
return (logreggwas(bdinfo = bdinfo,
blkinfo = blkinfo,
snps = snps,
stddata = stddata,
subindex = subindex,
outfile = outfile,
skipfile = skipfile,
beta0 = beta0,
minmaf = minmaf,
base = base))
}
return (linreggwas(bdinfo = bdinfo,
blkinfo = blkinfo,
snps = snps,
stddata = stddata,
subindex = subindex,
outfile = outfile,
skipfile = skipfile,
minmaf = minmaf,
base = base))
}
#####################################################
### GWEIS
#####################################################
#' gweis
#'
#' Run a gweis using genetic data from a binary dosage file
#' @param data Data frame containing the subject ID, phenotype
#' and covariates
#' @param bdinfo Information about the binary dosage file returned
#' from the BinaryDosage::getbdinfo routine
#' @param snps The SNPs to be used in the scan. This may be an integer
#' vector indicate which SNPs to use in the binary dosage file or a
#' character vector of the SNP IDs to use. The value may also be "all",
#' indicating to use all SNPs. The default value is "all".
#' @param outfile The file name for the results Can be blank.
#' If the value is "", the results are returned as a data frame. Default
#' value is ""
#' @param skipfile The name of the file to write the SNPs that were not
#' used and the reason they weren't used. If the value is blank, there is
#' no output of the unused SNPs. Default value is "".
#' @param minmaf Minimum minor allele frequency of SNPs to include
#' in analysis. SNPS that have less than 20 minor alleles observed
#' will be excluded from the analysis regardless of the value of
#' minmaf. A value of 0 indicates to use all the SNPs that have 20
#' minor alleles observed. Default value is 0.
#' @param blksize Size of blocks of SNPs to read in at one time.
#' Larger blocks can improve overall speed but require larger
#' amounts of computer memory. A value of 0 indicates to use the
#' recommended block size. Default value is 0.
#' @param binary Logical value indicating if the phenotype
#' is a binary value. Default value is false.
#' @return
#' 0
#' @export
#'
#' @examples
#' bdinfo <- readRDS(system.file("extdata/pdata_4_1.bdinfo", package = "GxEScanR"))
#' covdata <- readRDS(system.file("extdata/covdata.rds", package = "GxEScanR"))
#'
#' results <- gweis(data = covdata, bdinfo = bdinfo)
gweis <- function(data, bdinfo, snps, outfile, skipfile,
minmaf, blksize, binary) {
# Set missing values to default values
if (missing(snps) == TRUE)
snps = "all"
if (missing(outfile) == TRUE)
outfile <- ""
if (missing(skipfile) == TRUE)
skipfile <- ""
if (missing(minmaf) == TRUE)
minmaf <- 0.
if (missing(blksize))
blksize <- 0L
if (missing(binary) == TRUE)
binary = TRUE
validateinput(data, bdinfo, outfile, skipfile, minmaf, blksize, binary)
snps <- subsetsnps(snps = snps,
snplist = bdinfo$snps$snpid)
#####################################################
### Subset data to subjects with complete
### data and run baseline regression
#####################################################
data <- subsetdata(subdata = data,
bdinfo = bdinfo,
binary = binary,
mincov = 1)
nsub <- nrow(data)
ncov <- ncol(data)
stddata <- matrix(data = 0,
nrow = nsub,
ncol = ncov)
means <- numeric(ncov)
stddevs <- numeric(ncov)
stdmat(data, stddata, means, stddevs)
print(paste(nsub, "subjects have complete data"))
#####################################################
### Get information about the binary
### dosage file and create the blocks
### needed for reading the data
#####################################################
blkinfo <- assignblocks(nsub = nrow(bdinfo$samples),
nsnps = length(bdinfo$snps$snpid),
snploc = bdinfo$indices,
snpbytes = bdinfo$datasize,
reqblksize = blksize)
blkbuffer <- integer((max(blkinfo$blkbytes) + 3) %/% 4)
numblks <- length(blkinfo$blkloc)
if (bdinfo$additionalinfo$format == 1) {
if (bdinfo$additionalinfo$subformat == 1)
base = 0L
else
base = 1L
} else {
base <- 2L
}
subindex <- as.integer(rownames(data))
if (binary == TRUE)
return (logreggweis(bdinfo = bdinfo,
blkinfo = blkinfo,
snps = snps,
stddata = stddata,
subindex = subindex,
outfile = outfile,
skipfile = skipfile,
minmaf = minmaf,
base = base,
e = data[,ncol(data)]))
return (linreggweis(bdinfo = bdinfo,
blkinfo = blkinfo,
snps = snps,
stddata = stddata,
subindex = subindex,
minmaf = minmaf,
outfile = outfile,
skipfile = skipfile,
base = base,
estddev = stddevs[length(stddevs)]))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.