Nothing
# Description:
# Parse input data and format it for stan and statistical learning
getStanData <- function(genotype,
phenotype,
phenotype.type) {
# Description:
# Returns TRUE if the genotype data is composed of at most two-alleles
# at each SNP.
isBiallelic <- function(genotype) {
isBi <- function(x) {
return(length(unique(x)) <= 2)
}
is.bi <- apply(X = genotype, MARGIN = 2, FUN = isBi)
return(all(is.bi == TRUE))
}
# Description:
# Get genotype-phenotype data
getFormattedGenphen <- function(genotype,
phenotype,
phenotype.type) {
# convert AAMultipleAlignment to matrix if needed
genotype <- convertMsaToGenotype(genotype = genotype)
# if vector genotype => matrix genotype
if(is.vector(genotype)) {
genotype <- matrix(data = genotype, ncol = 1)
}
# if vector genotype => matrix genotype
if(is.vector(phenotype)) {
phenotype <- matrix(data = phenotype, ncol = 1)
}
phenotype <- data.frame(phenotype)
# TODO: check
if(sum(phenotype.type == "D") != 0) {
d <- which(phenotype.type == "D")
for(i in 1:length(d)) {
if(all(phenotype[, d] %in% c(1, 0)) == FALSE) {
# mapping 1st element to 1, 2nd to 0
u <- unique(phenotype[, d])
phenotype[phenotype[, d] == u[1], d] <- "1"
phenotype[phenotype[, d] == u[2], d] <- "0"
cat("Mapping dichotomous phenotype:", i,
"(", u[1], "->1,", u[2], "->0) \n")
}
phenotype[, d] <- as.factor(as.character(phenotype[, d]))
}
}
# return
return (list(genotype = genotype,
phenotype = phenotype,
phenotype.type = phenotype.type))
}
# Description:
# Convert genphen data to stan data
getStanFormat <- function(f.data,
is.bi) {
# Split SNP into pairs of genotypes
getSplitX <- function(x) {
ux <- unique(x)
ns <- length(ux)
nc <- max(c(choose(n = ns, k = 2), 1))
x.split <- matrix(data = 0, nrow = length(x), ncol = nc)
if(ns == 1) {
x.split[, 1] <- 1
x.map <- data.frame(ref = ux[1], alt = NA,
refN = sum(x == ux[1]),
altN = 0,
stringsAsFactors = FALSE)
}
else {
x.map <- c()
counter <- 1
for(i in 1:(ns-1)) {
for(j in (i+1):ns) {
x.split[x == ux[i], counter] <- 1
x.split[x == ux[j], counter] <- -1
counter <- counter + 1
x.map <- rbind(x.map, data.frame(ref = ux[i], alt = ux[j],
refN = sum(x == ux[i]),
altN = sum(x == ux[j]),
stringsAsFactors = FALSE))
}
}
}
# return
return(list(x.split = x.split,
x.map = x.map))
}
# X.data
getXData <- function(x) {
return (x$x.split)
}
# X.map
getXMap <- function(x) {
return (x$x.map)
}
# make huge matrix with predictors
X <- f.data$genotype
colnames(X) <- 1:ncol(X)
x.data <- apply(X = X, MARGIN = 2, FUN = getSplitX)
for(i in 1:length(x.data)) {
x.data[[i]]$x.map$site <- i
}
X <- do.call(cbind, lapply(X = x.data, FUN = getXData))
x.map <- do.call(rbind, lapply(X = x.data, FUN = getXMap))
# filtering
filter.i <- which(x.map$refN > 1 & x.map$altN > 1)
X <- X[, filter.i]
if(length(X) == 0) {
stop("All SNPs have 1 element (Problem)")
}
if(is.vector(X) == T) {
X <- matrix(data = X, ncol = 1)
}
x.map <- x.map[filter.i, ]
if(sum(phenotype.type == "Q") != 0) {
Yq <- as.matrix(f.data$phenotype[, f.data$phenotype.type == "Q"])
}
else {
Yq <- matrix(data = 0, ncol = 0, nrow = nrow(X))
}
if(sum(phenotype.type == "D") != 0) {
Yd <- as.matrix(f.data$phenotype[, f.data$phenotype.type == "D"])
}
else {
Yd <- matrix(data = 0, ncol = 0, nrow = nrow(X))
}
s <- list(X = X,
Y = f.data$phenotype,
Yq = Yq,
Yd = Yd,
N = nrow(X),
Ns = ncol(f.data$genotype),
Nsk = ncol(X),
Ntq = ncol(Yq),
Ntd = ncol(Yd),
is.bi = is.bi,
xmap = x.map,
genotype = f.data$genotype,
phenotype.type = f.data$phenotype.type)
return (s)
}
# genphen data
f.data <- getFormattedGenphen(genotype = genotype,
phenotype = phenotype,
phenotype.type = phenotype.type)
# stan data
stan.data <- getStanFormat(f.data = f.data,
is.bi = isBiallelic(f.data$genotype))
# return
return (stan.data)
}
# Description:
# Parse input data and format it for stan and statistical learning
getPhyloData <- function(genotype) {
# Description:
# Convert genphen data to stan data
getPhyloFormat <- function(genotype) {
# Split SNP into pairs of genotypes
getSplitX <- function(x) {
ux <- unique(x)
ns <- length(ux)
nc <- max(c(choose(n = ns, k = 2), 1))
x.split <- matrix(data = 0, nrow = length(x), ncol = nc)
if(ns == 1) {
x.split[, 1] <- 1
x.map <- data.frame(ref = ux[1], alt = NA,
refN = sum(x == ux[1]),
altN = 0,
stringsAsFactors = FALSE)
}
else {
x.map <- c()
counter <- 1
for(i in 1:(ns-1)) {
for(j in (i+1):ns) {
x.split[x == ux[i], counter] <- 1
x.split[x == ux[j], counter] <- -1
counter <- counter + 1
x.map <- rbind(x.map, data.frame(ref = ux[i], alt = ux[j],
refN = sum(x == ux[i]),
altN = sum(x == ux[j]),
stringsAsFactors = FALSE))
}
}
}
# return
return(list(x.split = x.split,
x.map = x.map))
}
# X.data
getXData <- function(x) {
return (x$x.split)
}
# X.map
getXMap <- function(x) {
return (x$x.map)
}
# convert AAMultipleAlignment to matrix if needed
genotype <- convertMsaToGenotype(genotype = genotype)
# if vector genotype => matrix genotype
if(is.vector(genotype)) {
genotype <- matrix(data = genotype, ncol = 1)
}
# make huge matrix with predictors
X <- genotype
colnames(X) <- 1:ncol(X)
x.data <- apply(X = X, MARGIN = 2, FUN = getSplitX)
for(i in 1:length(x.data)) {
x.data[[i]]$x.map$site <- i
}
X <- do.call(cbind, lapply(X = x.data, FUN = getXData))
x.map <- do.call(rbind, lapply(X = x.data, FUN = getXMap))
return (x.map)
}
# phylo format data
phylo.data <- getPhyloFormat(genotype = genotype)
# return
return (phylo.data)
}
# Description:
# Check the optional (...) parameters, if provided.
checkDotParameters <- function(...) {
checkAdaptDelta <- function(adapt_delta) {
if(length(adapt_delta) != 1) {
stop("adapt_delta must be in range (0, 1) (default = 0.8).")
}
if(is.numeric(adapt_delta) == FALSE) {
stop("adapt_delta must be in range (0, 1)")
}
if(adapt_delta >= 1 | adapt_delta <= 0) {
stop("adapt_delta must be in range (0, 1)")
}
}
checkMaxTreedepth <- function(max_treedepth) {
if(length(max_treedepth) != 1) {
stop("max_treedepth is numeric parameter.")
}
if(is.numeric(max_treedepth) == FALSE) {
stop("max_treedepth is numeric parameter.")
}
if(max_treedepth < 5) {
stop("max_treedepth >= 5 (default = 10).")
}
}
checkCvFold <- function(cv.fold) {
if(length(cv.fold) != 1) {
stop("cv.fold must be in range (0, 1) (default = 0.66).")
}
if(is.numeric(cv.fold) == FALSE) {
stop("cv.fold must be in range (0, 1)")
}
if(cv.fold >= 1 | cv.fold <= 0) {
stop("cv.fold must be in range (0, 1)")
}
}
checkNtree <- function(ntree) {
if(length(ntree) != 1) {
stop("ntree is numeric parameter.")
}
if(is.numeric(ntree) == FALSE) {
stop("ntree is numeric parameter.")
}
if(ntree < 100) {
stop("ntree >= 100 (default = 500).")
}
}
checkVerbose <- function(verbose) {
if(length(verbose) != 1) {
stop("verbose is a logical parameter.")
}
if(is.logical(verbose) == FALSE) {
stop("verbose is a logical parameter.")
}
}
checkRefresh <- function(refresh) {
if(length(refresh) != 1) {
stop("refresh is numeric parameter.")
}
if(is.numeric(refresh) == FALSE) {
stop("refresh is a numeric parameter.")
}
return (refresh)
}
available.names <- c("adapt_delta",
"max_treedepth",
"ntree",
"cv.fold",
"refresh",
"verbose")
default.values <- list(adapt_delta = 0.8,
max_treedepth = 10,
ntree = 1000,
cv.fold = 0.66,
refresh = 250,
verbose = TRUE)
# get the optional parameters
dot.names <- names(list(...))
if(length(dot.names) > 0) {
if(any(dot.names %in% available.names) == FALSE) {
wrong.names <- dot.names[!dot.names %in% available.names]
stop(paste("Unknown optional parameter were provided! The following
optional parameters are available:", dot.names, sep = ' '))
}
}
# check each parameter
for(p in dot.names) {
if(is.null(list(...)[[p]]) || is.na(list(...)[[p]])) {
stop(paste("optional parameter ", p, " can't be NULL", sep = ''))
}
if(p == "adapt_delta") {
checkAdaptDelta(adapt_delta = list(...)[[p]])
default.values[["adapt_delta"]] <- list(...)[[p]]
}
if(p == "max_treedepth") {
checkMaxTreedepth(max_treedepth = list(...)[[p]])
default.values[["max_treedepth"]] <- list(...)[[p]]
}
if(p == "cv.fold") {
checkCvFold(cv.fold = list(...)[[p]])
default.values[["cv.fold"]] <- list(...)[[p]]
}
if(p == "ntree") {
checkNtree(ntree = list(...)[[p]])
default.values[["ntree"]] <- list(...)[[p]]
}
if(p == "refresh") {
checkRefresh(refresh = list(...)[[p]])
default.values[["refresh"]] <- list(...)[[p]]
}
if(p == "verbose") {
checkVerbose(verbose = list(...)[[p]])
default.values[["verbose"]] <- list(...)[[p]]
}
}
return (default.values)
}
# Description:
# Provided the input arguments, this function checks their validity. It
# stops the execution if a problem is encountered and prints out warnings.
checkInput <- function(genotype,
phenotype,
phenotype.type,
model.type,
mcmc.chains,
mcmc.steps,
mcmc.warmup,
cores,
hdi.level,
stat.learn.method,
cv.steps,
rpa.iterations,
with.stan.obj) {
checkGenotypePhenotype <- function(genotype, phenotype, phenotype.type) {
# CHECK: genotype
if(is.null(attr(genotype, "class")) == FALSE) {
if(!attr(genotype, "class") %in% c("AAMultipleAlignment",
"DNAMultipleAlignment")) {
stop("genotype can be one of the following structures: vector (for a
single SNP), matrix, data.frame or Biostrings structures such as
DNAMultipleAlignment or AAMultipleAlignment")
}
else {
temp <- as.matrix(genotype)
if(nrow(temp) < 2 | ncol(temp) == 0) {
stop("The genotypes cannot have less than two observations, or the
number of genotypes cannot be 0.")
}
}
}
else {
if(is.vector(genotype)) {
genotype <- matrix(data = genotype, ncol = 1)
}
if(nrow(genotype) < 2 | ncol(genotype) == 0) {
stop("The genotypes cannot have less than two observations or the
number of genotypes cannot be 0.")
}
if(!is.matrix(genotype) & !is.data.frame(genotype)) {
stop("genotype can be one of the following structures: vector (for a
single SNP), matrix, data.frame or Biostrings structures such as
DNAMultipleAlignment or AAMultipleAlignment")
}
if(typeof(genotype) != "character") {
stop("If it is structured as vector/matrix/data.frame,
the genotype have be of character type.")
}
}
# CHECK: phenotype
if(!is.vector(phenotype) & !is.matrix(phenotype)) {
stop("The phenotype must be either a vector (single phenotype) or matrix
(with multiple phenotypes = columns), where the rows match the rows
of the genotype data")
}
# convert vector -> matrix
if(is.vector(phenotype) == TRUE) {
phenotype <- matrix(data = phenotype, ncol = 1)
}
if(!is.numeric(phenotype)) {
stop("The phenotype must be of numeric type.")
}
if(length(phenotype) < 3) {
stop("The phenotype must contain at least 3 data points.")
}
if(nrow(genotype) != nrow(phenotype)) {
stop("length(genotype) != length(phenotype),
they must be equal in length.")
}
# CHECK: phenotype.type
if(is.vector(phenotype.type) == FALSE) {
stop("phenotype.type must be vector. Each element in this vector refers
to the type of each phenotype, with 'Q' (for quantitative phenotypes)
or 'D' (for dichotomous) included as each columns in the phenotype
data. If a single phenotype is provided, then the phenotype type
should be a single 'Q' or 'D'.")
}
if(length(phenotype.type) == 0) {
stop("The phenotype.type vector must contain at least 1 element.")
}
if(typeof(phenotype.type) != "character") {
stop("phenotype.type must be character vector with elements 'Q' (for
quantitative phenotypes) or 'D' (for dichotomous)")
}
if(ncol(phenotype) != length(phenotype.type)) {
stop("Number of phenotypes provided differs from phenotypes types.")
}
if(all(phenotype.type %in% c("Q", "D")) == FALSE) {
stop("phenotype.type must be character vector with elements 'Q' (for
quantitative phenotypes) or 'D' (for dichotomous)")
}
}
checkPhenotypeValidity <- function(phenotype, phenotype.type) {
# convert vector -> matrix
if(is.vector(phenotype) == TRUE) {
phenotype <- matrix(data = phenotype, ncol = 1)
}
# check phenotype types
for(i in 1:length(phenotype.type)) {
if(phenotype.type[i] == "D") {
if(length(unique(phenotype[, i])) != 2) {
stop("The dichotomous phenotypes must contains exactly two
categories (classes) \n")
}
}
if(phenotype.type[i] == "Q") {
if(length(unique(phenotype[, i])) <= 3) {
warning("The quantitative phenotype/s contains 3 or less unique
elements \n")
}
}
}
}
checkModelType <- function(model.type) {
# CHECK: model.type
if(length(model.type) != 1) {
stop("model.type must be a string (default = 'univariate')")
}
if(!is.character(model.type)) {
stop("model.type must be a string: 'univariate' or 'hierarchical'")
}
if(!model.type %in% c("univariate", "hierarchical")) {
stop("phenotype.type must be a string: 'univariate' or 'hierarchical'")
}
}
checkMcmcIterations <- function(mcmc.steps) {
# CHECK: mcmc.steps
if(length(mcmc.steps) != 1) {
stop("the mcmc.steps must be a number > 0 (default = 10000).")
}
if(!is.numeric(mcmc.steps)) {
stop("mcmc.steps must be a numeric argument (default = 10000).")
}
if(mcmc.steps <= 0) {
stop("mcmc.steps must be larger than 0 (default = 10000).")
}
}
checkMcmcWarmup <- function(mcmc.warmup) {
# CHECK: mcmc.warmup
if(length(mcmc.warmup) != 1) {
stop("the mcmc.warmup must be a number > 0 (default = 5000).")
}
if(!is.numeric(mcmc.warmup)) {
stop("mcmc.warmup must be a numeric argument (default = 5000).")
}
if(mcmc.warmup <= 0) {
stop("mcmc.warmup must be larger than 0 (default = 5000).")
}
}
checkMcmcChains <- function(mcmc.chains) {
# CHECK: mcmc.chains
if(length(mcmc.chains) != 1) {
stop("mcmc.chains must be a positive integer > 0 (default = 1).")
}
if(!is.numeric(mcmc.chains)) {
stop("mcmc.chains must be a positive integer > 0 (default = 1).")
}
if(mcmc.chains <= 0) {
stop("mcmc.chains must be a positive integer > 0 (default = 1).")
}
}
checkCores <- function(cores) {
# CHECK: cores
if(length(cores) != 1) {
stop("cores is numeric parameter.")
}
if(is.numeric(cores) == FALSE) {
stop("cores is numeric parameter.")
}
if(cores <= 0) {
stop("cores is numeric parameter >=1.")
}
}
checkHdi <- function(hdi.level) {
if(length(hdi.level) != 1) {
stop("The HDI level must be in range (0, 1).")
}
if(is.numeric(hdi.level) == FALSE) {
stop("The HDI level must be in range (0, 1).")
}
if(hdi.level >= 1 | hdi.level <= 0) {
stop("The HDI level must be in range (0, 1).")
}
}
checkMlMethod <- function(stat.learn.method) {
# CHECK: phenotype.type
if(length(stat.learn.method) != 1) {
stop("stat.learn.method must be a string: 'rf' or 'svm'")
}
if(!is.character(stat.learn.method)) {
stop("stat.learn.method must be a string: 'rf', 'svm' or 'none'")
}
if(!stat.learn.method %in% c("rf", "svm", "none")) {
stop("stat.learn.method must be a string: 'rf', 'svm' or 'none'")
}
}
checkCv <- function(stat.learn.method, cv.steps) {
if(stat.learn.method %in% c("rf", "svm")) {
if(length(cv.steps) != 1) {
stop("cv.steps must be a number (default = 1,000).")
}
if(is.numeric(cv.steps) == FALSE) {
stop("cv.steps must be a number (default = 1,000).")
}
if(cv.steps < 100) {
stop("cv.steps >= 100 recomended (default = 1,000).")
}
}
}
if(is.null(genotype) | missing(genotype) |
is.null(phenotype) | missing(phenotype) |
is.null(phenotype.type) | missing(phenotype.type) |
is.null(model.type) | missing(model.type) |
is.null(mcmc.chains) | missing(mcmc.chains) |
is.null(mcmc.steps) | missing(mcmc.steps) |
is.null(mcmc.warmup) | missing(mcmc.warmup) |
is.null(cores) | missing(cores) |
is.null(hdi.level) | missing(hdi.level) |
is.null(stat.learn.method) | missing(stat.learn.method) |
is.null(cv.steps) | missing(cv.steps)) {
stop("arguments must be non-NULL/specified")
}
checkGenotypePhenotype(genotype = genotype,
phenotype = phenotype,
phenotype.type = phenotype.type)
checkPhenotypeValidity(phenotype = phenotype,
phenotype.type = phenotype.type)
checkModelType(model.type = model.type)
checkMcmcIterations(mcmc.steps = mcmc.steps)
checkMcmcWarmup(mcmc.warmup = mcmc.warmup)
checkMcmcChains(mcmc.chains = mcmc.chains)
checkCores(cores = cores)
checkHdi(hdi.level = hdi.level)
checkMlMethod(stat.learn.method = stat.learn.method)
checkCv(stat.learn.method = stat.learn.method, cv.steps = cv.steps)
}
# Description:
# Provided the input arguments, this function checks their validity. It
# stops the execution if a problem is encountered and prints out warnings.
checkDiagnosticInput <- function(genotype,
phenotype,
phenotype.type,
rf.trees) {
checkGenotypePhenotype <- function(genotype, phenotype, phenotype.type) {
# CHECK: genotype
if(is.null(attr(genotype, "class")) == FALSE) {
if(!attr(genotype, "class") %in% c("AAMultipleAlignment",
"DNAMultipleAlignment")) {
stop("genotype can be one of the following structures: vector (for a
single SNP), matrix, data.frame or Biostrings structures such as
DNAMultipleAlignment or AAMultipleAlignment")
}
else {
temp <- as.matrix(genotype)
if(nrow(temp) < 2 | ncol(temp) == 0) {
stop("The genotypes cannot have less than two observations, or the
number of genotypes cannot be 0.")
}
}
}
else {
if(is.vector(genotype)) {
genotype <- matrix(data = genotype, ncol = 1)
}
if(nrow(genotype) < 2 | ncol(genotype) == 0) {
stop("The genotypes cannot have less than two observations or the
number of genotypes cannot be 0.")
}
if(!is.matrix(genotype) & !is.data.frame(genotype)) {
stop("genotype can be one of the following structures: vector (for a
single SNP), matrix, data.frame or Biostrings structures such as
DNAMultipleAlignment or AAMultipleAlignment")
}
if(typeof(genotype) != "character") {
stop("If it is structured as vector/matrix/data.frame,
the genotype have be of character type.")
}
}
# CHECK: phenotype
if(!is.vector(phenotype)) {
stop("The phenotype must be a vector (single phenotype),
where each element corresponds to each individual")
}
# convert vector -> matrix
if(is.vector(phenotype) == TRUE) {
phenotype <- matrix(data = phenotype, ncol = 1)
}
if(!is.numeric(phenotype)) {
stop("The phenotype must be of numeric type.")
}
if(length(phenotype) < 3) {
stop("The phenotype must contain at least 3 data points.")
}
if(nrow(genotype) != nrow(phenotype)) {
stop("length(genotype) != length(phenotype),
they must be equal in length.")
}
# CHECK: phenotype.type
if(is.vector(phenotype.type) == FALSE) {
stop("phenotype.type must be vector. Each element in this vector refers
to the type of each phenotype, with 'Q' (for quantitative phenotypes)
or 'D' (for dichotomous) included as each columns in the phenotype
data. If a single phenotype is provided, then the phenotype type
should be a single 'Q' or 'D'.")
}
if(length(phenotype.type) == 0) {
stop("The phenotype.type vector must contain at least 1 element.")
}
if(typeof(phenotype.type) != "character") {
stop("phenotype.type must be character vector with elements 'Q' (for
quantitative phenotypes) or 'D' (for dichotomous)")
}
if(ncol(phenotype) != 1) {
stop("The diagnosis can be done using single phenotypes.")
}
if(ncol(phenotype) != length(phenotype.type)) {
stop("Number of phenotypes provided differs from phenotypes types.")
}
if(all(phenotype.type %in% c("Q", "D")) == FALSE) {
stop("phenotype.type must be character vector with elements 'Q' (for
quantitative phenotypes) or 'D' (for dichotomous)")
}
}
checkPhenotypeValidity <- function(phenotype, phenotype.type) {
# convert vector -> matrix
if(is.vector(phenotype) == TRUE) {
phenotype <- matrix(data = phenotype, ncol = 1)
}
# check phenotype types
for(i in 1:length(phenotype.type)) {
if(phenotype.type[i] == "D") {
if(length(unique(phenotype[, i])) != 2) {
stop("The dichotomous phenotypes must contains exactly two
categories (classes) \n")
}
}
if(phenotype.type[i] == "Q") {
if(length(unique(phenotype[, i])) <= 3) {
warning("The quantitative phenotype/s contains 3 or less unique
elements \n")
}
}
}
}
checkRfTrees <- function(rf.trees) {
if(length(rf.trees) != 1) {
stop("rf.trees must be a number (default = 5,000).")
}
if(is.numeric(rf.trees) == FALSE) {
stop("rf.trees must be a number (default = 5,000).")
}
if(rf.trees < 5000) {
stop("rf.trees >= 1,000 accepted (default = 5,000).")
}
}
if(is.null(genotype) | missing(genotype) |
is.null(phenotype) | missing(phenotype) |
is.null(phenotype.type) | missing(phenotype.type) |
is.null(rf.trees) | missing(rf.trees)) {
stop("arguments must be non-NULL/specified")
}
checkGenotypePhenotype(genotype = genotype,
phenotype = phenotype,
phenotype.type = phenotype.type)
checkPhenotypeValidity(phenotype = phenotype,
phenotype.type = phenotype.type)
checkRfTrees(rf.trees = rf.trees)
}
# Description:
# Provided the input arguments, this function checks their validity. It
# stops the execution if a problem is encountered and prints out warnings.
checkInputPhyloBias <- function(input.kinship.matrix,
genotype) {
checkGenotype <- function(genotype) {
if(is.null(attr(genotype, "class")) == FALSE) {
if(!attr(genotype, "class") %in% c("AAMultipleAlignment",
"DNAMultipleAlignment")) {
stop("genotype can be one of the following structures: matrix or
data.frame or DNAMultipleAlignment/AAMultipleAlignment")
}
else {
temp <- as.matrix(genotype)
if(nrow(temp) < 2 | ncol(temp) == 0) {
stop("the genotypes cannot have less than two observations the or
number of genotypes cannot be 0.")
}
}
}
else {
if(is.vector(genotype)) {
genotype <- matrix(data = genotype, ncol = 1)
}
if(nrow(genotype) < 2 | ncol(genotype) == 0) {
stop("the genotypes cannot have less than two observations or the
number of genotypes cannot be 0.")
}
if(!is.matrix(genotype) & !is.data.frame(genotype)) {
stop("genotype can be one of the following structures: matrix or
data.frame or DNAMultipleAlignment/AAMultipleAlignment")
}
if(typeof(genotype) != "character") {
stop("if it is structured in matrix/data.frame the genotype must
be of character type.")
}
}
}
checkKinship <- function(input.kinship.matrix) {
if(is.matrix(input.kinship.matrix) == FALSE) {
stop("precomputed kinship matrix must be a numeric matrix.")
}
if(is.numeric(input.kinship.matrix) == FALSE) {
stop("precomputed kinship matrix must be a numeric matrix.")
}
if(nrow(input.kinship.matrix) != ncol(input.kinship.matrix)) {
stop("precomputed kinship matrix must be NxN numeric matrix.")
}
if(nrow(input.kinship.matrix) <= 0) {
stop("at least two individuals needed for the analysis.")
}
}
if((is.null(input.kinship.matrix) | missing(input.kinship.matrix))
& (is.null(genotype) | missing(genotype))) {
stop("arguments must be non-NULL/specified")
}
if(is.null(input.kinship.matrix) | missing(input.kinship.matrix)) {
if(is.null(genotype) | missing(genotype)) {
stop("arguments must be non-NULL/specified")
}
}
else {
checkKinship(input.kinship.matrix = input.kinship.matrix)
}
checkGenotype(genotype = genotype)
}
# Description:
# Given two vectors, one dependent (genotype) and one independent
# (phenotype), compute the classification accuracy of classifying
# the genotype from the phenotype alone (and corresponding HDI).
# The classification is computed using random forest.
runStatLearn <- function(genphen.data,
method,
cv.fold,
cv.steps,
ntree,
hdi.level,
cores) {
# RF analysis
runRf <- function(Xd, Y, cv.fold, cv.steps,
ntree, hdi.level, site) {
getBoot <- function(D, cv.fold, cv.steps,
ntree, hdi.level) {
# posterior output (one extra for multi-trait)
posterior <- vector(mode = "list", length = ncol(D)-1)
for(i in 1:length(posterior)) {
posterior[[i]] <- matrix(data = NA, nrow = cv.steps, ncol = 2)
}
rm(i)
D$Y <- as.character(D$Y)
for(i in 1:cv.steps) {
# sample at random
s <- sample(x = 1:nrow(D), size = ceiling(x = cv.fold*nrow(D)),
replace = FALSE)
train <- D[s, ]
test <- D[-s, ]
# TODO: dummy (check if inference needed at all e.g. 1 class only)
if(length(unique(train$Y)) == 1) {
for(j in 1:length(posterior)) {
posterior[[j]][i, ] <- c(NA, NA)
}
}
else {
for(j in 1:length(posterior)) {
# train classification model (try condition to avoid errors in
# case only one-level predictor is train data)
rf.out <- try(ranger::ranger(as.factor(Y)~.,
data = train[, c(1, j+1)],
num.trees = ntree),
silent = TRUE)
if(class(rf.out) == "try-error") {
posterior[[j]][i, 1:2] <- c(NA, NA)
}
else {
# test classification model
pr <- stats::predict(object = rf.out, data = test)
# compute classification accuracy (1 - classification error)
test$Y <- as.character(test$Y)
pr$predictions <- as.character(pr$predictions)
ca <- sum(test$Y == pr$predictions)/nrow(test)
# compute k statistics
k <- getKappa(real = test$Y,
predicted = pr$predictions,
aas = unique(D$Y))
posterior[[j]][i, 1:2] <- c(ca, k)
}
}
}
}
# compute stats
summary <- c()
for(j in 1:length(posterior)) {
ca <- posterior[[j]][, 1]
ca <- ca[is.finite(ca)]
ca.mean <- mean(x = ca)
# get HDI
ca.hdi <- getHdi(vec = posterior[[j]][,1], hdi.level = hdi.level)
ca.L <- as.numeric(ca.hdi[1])
ca.H <- as.numeric(ca.hdi[2])
k <- posterior[[j]][, 2]
k <- k[is.finite(k)]
k.mean <- mean(x = k)
# get HDI
k.hdi <- getHdi(vec = posterior[[j]][, 2], hdi.level = hdi.level)
k.L <- as.numeric(k.hdi[1])
k.H <- as.numeric(k.hdi[2])
# summary append
row <- data.frame(ca = ca.mean, ca.L = ca.L, ca.H = ca.H,
k = k.mean, k.L = k.L, k.H = k.H, p = j)
summary <- rbind(summary, row)
}
return (list(summary = summary,
posterior = posterior))
}
X <- as.matrix(Xd[, site])
i <- which(X %in% c(1, -1))
D <- data.frame(Y = as.character(X))
colnames(Y) <- paste("P", 1:ncol(Y), sep = '')
D <- cbind(D, Y)
D <- D[i, ]
# if c.v. can be done with given cv.fold and # of data points
result <- c()
if(ceiling(cv.fold*nrow(D)) == nrow(D)) {
for(i in 1:ncol(Y)) {
out <- data.frame(ca = NA, ca.L = NA, ca.H = NA,
k = NA, k.L = NA, k.H = NA,
p = i, stringsAsFactors = FALSE)
result <- rbind(result, out)
}
}
else {
# run
p <- getBoot(D = D,
cv.fold = cv.fold,
cv.steps = cv.steps,
hdi.level = hdi.level,
ntree = ntree)
result <- rbind(result, p$summary)
}
return (list(result = result,
posterior = p$posterior))
}
# SVM analysis
runSvm <- function(Xd, Y, cv.fold, cv.steps,
hdi.level, site) {
getBoot <- function(D, cv.fold, cv.steps, hdi.level) {
# posterior output (one extra for multi-trait)
posterior <- vector(mode = "list", length = ncol(D)-1)
for(i in 1:length(posterior)) {
posterior[[i]] <- matrix(data = NA, nrow = cv.steps, ncol = 2)
}
rm(i)
D$Y <- as.character(D$Y)
for(i in 1:cv.steps) {
# sample at random
s <- sample(x = 1:nrow(D), size = ceiling(x = cv.fold*nrow(D)),
replace = FALSE)
train <- D[s, ]
test <- D[-s, ]
# TODO: dummy (check if inference needed at all e.g. 1 class only)
if(length(unique(train$Y)) == 1) {
for(j in 1:length(posterior)) {
posterior[[j]][i, ] <- c(NA, NA)
}
}
else {
for(j in 1:length(posterior)) {
# train classification model (try condition to avoid errors in case
# only one-level predictor is train data)
svm.out <- try(e1071::svm(as.factor(Y)~.,data = train[, c(1, j+1)],
type = "C-classification"), silent = TRUE)
if(class(svm.out)[1] == "try-error") {
posterior[[j]][i, 1:2] <- c(NA, NA)
}
else {
# test classification model
pr <- stats::predict(object = svm.out, newdata = test)
# compute classification accuracy (1 - classification error)
test$Y <- as.character(test$Y)
pr <- as.character(pr)
ca <- sum(test$Y == pr)/nrow(test)
# compute k statistics
k <- getKappa(real = test$Y, predicted = pr, aas = unique(D$Y))
posterior[[j]][i, 1:2] <- c(ca, k)
}
}
}
}
# compute stats
summary <- c()
for(j in 1:length(posterior)) {
ca <- posterior[[j]][, 1]
ca <- ca[is.finite(ca)]
ca.mean <- mean(x = ca)
# get HDI
ca.hdi <- getHdi(vec = posterior[[j]][,1], hdi.level = hdi.level)
ca.L <- as.numeric(ca.hdi[1])
ca.H <- as.numeric(ca.hdi[2])
k <- posterior[[j]][, 2]
k <- k[is.finite(k)]
k.mean <- mean(x = k)
# get HDI
k.hdi <- getHdi(vec = posterior[[j]][, 2], hdi.level = hdi.level)
k.L <- as.numeric(k.hdi[1])
k.H <- as.numeric(k.hdi[2])
# summary append
row <- data.frame(ca = ca.mean, ca.L = ca.L, ca.H = ca.H,
k = k.mean, k.L = k.L, k.H = k.H, p = j)
summary <- rbind(summary, row)
}
return (list(summary = summary,
posterior = posterior))
}
X <- as.matrix(Xd[, site])
i <- which(X %in% c(1, -1))
D <- data.frame(Y = as.character(X))
colnames(Y) <- paste("P", 1:ncol(Y), sep = '')
D <- cbind(D, Y)
D <- D[i, ]
# if c.v. can be done with given cv.fold and # of data points
result <- c()
if(ceiling(cv.fold*nrow(D)) == nrow(D)) {
for(i in 1:ncol(Y)) {
out <- data.frame(ca = NA, ca.L = NA, ca.H = NA,
k = NA, k.L = NA, k.H = NA,
p = i, stringsAsFactors = FALSE)
result <- rbind(result, out)
}
}
else {
# run
p <- getBoot(D = D,
cv.fold = cv.fold,
cv.steps = cv.steps,
hdi.level = hdi.level)
result <- rbind(result, p$summary)
}
return (list(result = result,
posterior = p$posterior))
}
# Format posterior
getPosteriorFormat <- function(cas) {
ca.list <- vector(mode = "list", length = length(cas[[1]]$posterior))
kappa.list <- vector(mode = "list", length = length(cas[[1]]$posterior))
# empty result matrix
results <- c()
for(i in 1:length(ca.list)) {
# empty posterior matrices
ca.m <- matrix(data = 0, nrow = nrow(cas[[1]]$posterior[[1]]),
ncol = length(cas))
kappa.m <- matrix(data = 0, nrow = nrow(cas[[1]]$posterior[[1]]),
ncol = length(cas))
for(j in 1:length(cas)) {
# collect results
if(i == 1) {
row <- cas[[j]]$result
row$s <- j
results <- rbind(results, row)
}
# collect posteriors
ca.m[, j] <- cas[[j]]$posterior[[i]][, 1]
kappa.m[,j ] <- cas[[j]]$posterior[[i]][, 2]
}
ca.list[[i]] <- ca.m
kappa.list[[i]] <- kappa.m
}
return (list(results = results,
ca.list = ca.list,
kappa.list = kappa.list))
}
if(method == "rf") {
cas <- lapply(X = 1:ncol(genphen.data$X),
FUN = runRf,
Xd = genphen.data$X,
Y = genphen.data$Y,
cv.fold = cv.fold,
cv.steps = cv.steps,
hdi.level = hdi.level,
ntree = ntree)
}
else if(method == "svm") {
cas <- lapply(X = 1:ncol(genphen.data$X),
FUN = runSvm,
Xd = genphen.data$X,
Y = genphen.data$Y,
cv.fold = cv.fold,
cv.steps = cv.steps,
hdi.level = hdi.level)
}
# format posterior
cas <- getPosteriorFormat(cas = cas)
return (cas)
}
# Description:
# Bayesian inference
runBayesianInference <- function(genphen.data,
mcmc.chains,
mcmc.steps,
mcmc.warmup,
cores,
model.stan,
...) {
# extra conversion
Yd <- c()
if(genphen.data$Ntd > 0) {
for(i in 1:genphen.data$Ntd) {
Yd <- cbind(Yd, as.numeric(genphen.data$Yd[, i]))
}
genphen.data$Yd <- Yd
}
data.list <- list(N = genphen.data$N,
Ntq = genphen.data$Ntq,
Ntd = genphen.data$Ntd,
Ns = genphen.data$Ns,
Nsk = genphen.data$Nsk,
Yq = genphen.data$Yq,
Yd = genphen.data$Yd,
X = genphen.data$X,
M = genphen.data$Ms)
# get initial parameter values
control <- list(adapt_delta = list(...)[["adapt_delta"]],
max_treedepth = list(...)[["max_treedepth"]])
refresh <- list(...)[["refresh"]]
verbose <- list(...)[["verbose"]]
posterior <- rstan::sampling(object = model.stan,
data = data.list,
iter = mcmc.steps,
warmup = mcmc.warmup,
chains = mcmc.chains,
cores = cores,
control = control,
verbose = verbose,
refresh = refresh)
# return
return (list(posterior = posterior))
}
# Description:
# Combine data from Bayesian inference and statistical learning
getScores <- function(p, s,
genphen.data,
hdi.level) {
probs <- c((1-hdi.level)/2, 1-(1-hdi.level)/2)
d.full <- data.frame(summary(p$posterior, probs = probs)$summary,
stringsAsFactors = FALSE)
d.full$par <- rownames(d.full)
xmap <- genphen.data$xmap
scores <- vector(mode = "list", length = ncol(genphen.data$Y))
for(i in 1:ncol(genphen.data$Y)) {
s.p <- s[s$p == i, ]
# list(results = results,
# ca.list = ca.list,
# kappa.list = kappa.list)
key <- paste("beta\\[", i, "\\,", sep = '')
d <- d.full[which(regexpr(pattern = key, text = d.full$par) != -1
& regexpr(pattern = "alpha|sigma|nu|mu|tau|z",
text = d.full$par) == -1), ]
d$i <- i
d$i <- gsub(pattern = key, replacement = '', x = d$par)
d$i <- gsub(pattern = "\\]", replacement = '', x = d$i)
d$i <- as.numeric(d$i)
scores[[i]] <- cbind(xmap, d, s[s$p == i, ])
}
return (scores)
}
# Description:
# Combine data from Bayesian inference and statistical learning
# betas = complete posterior
# cas = list with matrix (posterior) elements for traits
# kappa = list with matrix (posterior) elements for traits
getParetoScores <- function(scores) {
phenotype.ids <- unique(scores$phenotype.id)
result <- c()
for(p in phenotype.ids) {
t <- scores[scores$phenotype.id == p, ]
p <- rPref::high(abs(t$beta.mean))*rPref::high(t$kappa.mean)
f <- rPref::psel(t, p, top = nrow(t))
f$rank <- f[, ".level"]
f[, ".level"] <- NULL
result <- rbind(result, f)
}
# p <- rPref::high(abs(scores$beta.mean))*
# rPref::high(scores$ca.mean)*
# rPref::high(scores$kappa.mean)
return (result)
}
# Description:
# Computes posterior prediction
getPpc <- function(posterior, genphen.data,
hdi.level, phenotype.type) {
ppc <- rstan::summary(object = posterior, prob = c(
(1-hdi.level)/2, 0.5, 1-(1-hdi.level)/2),
par = "Y_hat")$summary
ppc <- data.frame(ppc)
ppc$par <- rownames(ppc)
ppc$par <- gsub(pattern = "Y_hat|\\[|\\]",
replacement = '', x = ppc$par)
par.map <- do.call(rbind, strsplit(x = ppc$par, split = ','))
ppc$trait <- par.map[, 1]
ppc$X <- ifelse(test = par.map[, 2]=="1", yes = 1, no = -1)
ppc$S <- par.map[, 3]
# pos
ppc.X.p <- ppc[ppc$X==1, ]
colnames(ppc.X.p) <- c("ref.mean", "ref.se.mean", "ref.sd",
"ref.L", "ref.median", "ref.H", "ref.Neff",
"ref.Rhat", "par", "trait", "X", "S")
ppc.X.p$X <- NULL
ppc.X.p$par <- NULL
# neg
ppc.X.m <- ppc[ppc$X==-1, ]
colnames(ppc.X.m) <- c("alt.mean", "alt.se_mean", "alt.sd",
"alt.L", "alt.median", "alt.H", "alt.Neff",
"alt.Rhat", "par", "trait", "X", "S")
ppc.X.m$X <- NULL
ppc.X.m$par <- NULL
# merge ref vs. alt cols
ppc <- merge(x = ppc.X.p, y = ppc.X.m,
by = c("trait", "S"))
rm(ppc.X.m, ppc.X.p)
ppc$S <- as.numeric(ppc$S)
ppc$trait <- as.numeric(ppc$trait)
ppc <- ppc[with(ppc, order(trait, S, decreasing = F)), ]
# add info on data
ppc <- cbind(genphen.data$xmap, ppc)
# set trait types
ppc$trait.type <- rep(x = phenotype.type,
each = max(ppc$S))
return (ppc)
}
# Description:
# Given a genotype dataset containing SNPs (columns) and N individuals (rows),
# the procedure computes a NxN kinship matrix for the individuals and estimates
# the phylogenetic bias related to each SNP.
getPhyloBias <- function(genotype,
k.matrix) {
phylo.bias <- c()
# total mean phylogenetic distance
mean.d.t <- mean(k.matrix[upper.tri(x = k.matrix, diag = FALSE)])
for(i in 1:ncol(genotype)) {
gs <- unique(genotype[, i])
for(g in gs) {
# feature mean phylogenetic distance
mean.d.f <- mean(k.matrix[genotype[, i] == g, genotype[, i] == g])
row <- data.frame(site = i, genotype = g, feature.dist = mean.d.f,
total.dist = mean.d.t, stringsAsFactors = FALSE)
phylo.bias <- rbind(phylo.bias, row)
}
}
return(phylo.bias)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.