Nothing
## ----setup, include=FALSE-----------------------------------------------------
library(evola)
## -----------------------------------------------------------------------------
set.seed(1)
# Data
Gems <- data.frame(
Color = c("Red", "Blue", "Purple", "Orange",
"Green", "Pink", "White", "Black",
"Yellow"),
Weight = round(runif(9,0.5,5),2),
Value = round(abs(rnorm(9,0,5))+0.5,2)
)
head(Gems)
## -----------------------------------------------------------------------------
# Task: Gem selection.
# Aim: Get highest combined value.
# Restriction: Max weight of the gem combined = 10.
res0<-evolafit(cbind(Weight,Value)~Color, dt= Gems,
# constraints: if greater than this ignore
constraintsUB = c(10,Inf),
# constraints: if smaller than this ignore
constraintsLB= c(-Inf,-Inf),
# weight the traits for the selection
b = c(0,1),
# population parameters
nCrosses = 100, nProgeny = 20, recombGens = 1,
# coancestry parameters
D=NULL, lambda=0, nQtlStart = 1,
# selection parameters
propSelBetween = .9, propSelWithin =0.9,
nGenerations = 15, verbose = FALSE
)
pmonitor(res0)
## -----------------------------------------------------------------------------
# index for the best solution for trait Value
best=bestSol(res0$pop)[,"Value"]; best
# actual solution
Q <- pullQtlGeno(res0$pop, simParam = res0$simParam, trait=1); Q <- Q/2
Q[best,]
# value and weight for the selected solution
qa = Q[best,] %*% as.matrix(Gems[,c("Weight","Value")]); qa
## ----fig.show='hold'----------------------------------------------------------
data(DT_cpdata)
DT <- DT_cpdata
head(DT)
## ----fig.show='hold'----------------------------------------------------------
# get best 20 individuals weighting variance by 0.5
res<-evolafit(cbind(Yield, occ)~id, dt= DT,
# constraints: if sum is greater than this ignore
constraintsUB = c(Inf,20),
# constraints: if sum is smaller than this ignore
constraintsLB= c(-Inf,-Inf),
# weight the traits for the selection
b = c(1,0),
# population parameters
nCrosses = 100, nProgeny = 10,
# coancestry parameters
D=A, lambda= (30*pi)/180 , nQtlStart = 2,
# selection parameters
propSelBetween = 0.5, propSelWithin =0.5,
nGenerations = 15, verbose=FALSE)
## ----fig.show='hold'----------------------------------------------------------
Q <- pullQtlGeno(res$pop, simParam = res$simParam, trait=1); Q <- Q/2
best = bestSol(res$pop)[,"Yield"];
sum(Q[best,]) # total # of inds selected
## ----fig.show='hold'----------------------------------------------------------
pmonitor(res)
plot(DT$Yield, col=as.factor(Q[best,]),
pch=(Q[best,]*19)+1)
## -----------------------------------------------------------------------------
data(DT_technow)
DT <- DT_technow
DT$occ <- 1; DT$occ[1]=0
M <- M_technow
D <- A.mat(M)
head(DT)
## -----------------------------------------------------------------------------
# silent for CRAN checks restriction on vignettes time
# run the genetic algorithm
# res<-evolafit(formula = c(GY, occ)~hy, dt= DT,
# # constraints: if sum is greater than this ignore
# constraintsUB = c(Inf,100),
# # constraints: if sum is smaller than this ignore
# constraintsLB= c(-Inf,-Inf),
# # weight the traits for the selection
# b = c(1,0),
# # population parameters
# nCrosses = 100, nProgeny = 10,
# # coancestry parameters
# D=D, lambda= (20*pi)/180 , nQtlStart = 100,
# # selection parameters
# propSelBetween = 0.5, propSelWithin =0.5,
# nGenerations = 15, verbose=FALSE)
#
# Q <- pullQtlGeno(res$pop, simParam = res$simParam, trait=1); Q <- Q/2
# best = bestSol(res$pop)[1,1]
# sum(Q[best,]) # total # of inds selected
## -----------------------------------------------------------------------------
# pmonitor(res)
# plot(DT$GY, col=as.factor(Q[best,]),
# pch=(Q[best,]*19)+1)
## -----------------------------------------------------------------------------
data(DT_wheat)
DT <- as.data.frame(DT_wheat)
DT$id <- rownames(DT) # IDs
DT$occ <- 1; DT$occ[1]=0 # to track occurrences
DT$dummy <- 1; DT$dummy[1]=0 # dummy trait
# if genomic
# GT <- GT_wheat + 1; rownames(GT) <- rownames(DT)
# D <- GT%*%t(GT)
# D <- D/mean(diag(D))
# if pedigree
D <- A_wheat
## -----------------------------------------------------------------------------
##Perform eigenvalue decomposition for clustering
##And select cluster 5 as target set to predict
pcNum=25
svdWheat <- svd(D, nu = pcNum, nv = pcNum)
PCWheat <- D %*% svdWheat$v
rownames(PCWheat) <- rownames(D)
DistWheat <- dist(PCWheat)
TreeWheat <- cutree(hclust(DistWheat), k = 5 )
plot(PCWheat[,1], PCWheat[,2], col = TreeWheat,
pch = as.character(TreeWheat), xlab = "pc1", ylab = "pc2")
vp <- rownames(PCWheat)[TreeWheat == 3]; length(vp)
tp <- setdiff(rownames(PCWheat),vp)
## -----------------------------------------------------------------------------
As <- D[tp,tp]
DT2 <- DT[rownames(As),]
## -----------------------------------------------------------------------------
res<-evolafit(cbind(dummy, occ)~id, dt= DT2,
# constraints: if sum is greater than this ignore
constraintsUB = c(Inf, 100),
# constraints: if sum is smaller than this ignore
constraintsLB= c(-Inf, -Inf),
# weight the traits for the selection
b = c(1,0),
# population parameters
nCrosses = 100, nProgeny = 10,
# coancestry parameters
D=As,
lambda=(60*pi)/180, nQtlStart = 80,
# selection parameters
propSelBetween = 0.5, propSelWithin =0.5,
nGenerations = 15, verbose = FALSE)
Q <- pullQtlGeno(res$pop, simParam = res$simParam, trait=1); Q <- Q/2
best = bestSol(res$pop)[,1]
sum(Q[best,]) # total # of inds selected
## -----------------------------------------------------------------------------
cex <- rep(0.5,nrow(PCWheat))
names(cex) <- rownames(PCWheat)
cex[names(which(Q[best,]==1))]=2
plot(PCWheat[,1], PCWheat[,2], col = TreeWheat, cex=cex,
pch = TreeWheat, xlab = "pc1", ylab = "pc2")
## -----------------------------------------------------------------------------
DT2$cov <- apply(D[tp,vp],1,mean)
## -----------------------------------------------------------------------------
res<-evolafit(cbind(cov, occ)~id, dt= DT2,
# constraints: if sum is greater than this ignore
constraintsUB = c(Inf, 100),
# constraints: if sum is smaller than this ignore
constraintsLB= c(-Inf, -Inf),
# weight the traits for the selection
b = c(1,0),
# population parameters
nCrosses = 100, nProgeny = 10,
# coancestry parameters
D=As,
lambda=(60*pi)/180, nQtlStart = 80,
# selection parameters
propSelBetween = 0.5, propSelWithin =0.5,
nGenerations = 15, verbose = FALSE)
Q <- pullQtlGeno(res$pop, simParam = res$simParam, trait=1); Q <- Q/2
best = bestSol(res$pop)[,1]
sum(Q[best,]) # total # of inds selected
## -----------------------------------------------------------------------------
cex <- rep(0.5,nrow(PCWheat))
names(cex) <- rownames(PCWheat)
cex[names(which(Q[best,]==1))]=2
plot(PCWheat[,1], PCWheat[,2], col = TreeWheat, cex=cex,
pch = TreeWheat, xlab = "pc1", ylab = "pc2")
## -----------------------------------------------------------------------------
data(DT_technow)
DT <- DT_technow
DT$occ <- 1; DT$occ[1]=0
M <- M_technow
D <- A.mat(M)
Z=with(DT,overlay(dent,flint) )# Matrix::sparse.model.matrix(~dent-1, data=DT)
rownames(Z) <- DT$hy # needed to link to the QTL matrix
## -----------------------------------------------------------------------------
# regular fitness function
fitnessf <-function (Y, b, d, Q, Z) {
fit <- Y %*% b - d
return(fit)
}
# new fitness function with constraint
fitnessf <-function (Y, b, Q, D, a, lambda, scale=TRUE, Z) {
X=Q%*%Z[colnames(Q),]
bad <- as.vector( apply(X,1, function(x){length(which(x > 5))}) )
bad <- which(bad > 0)
# (q'a)b - l(q'Dq)
if(scale){
fit <- stan( apply(Y,2,scale) %*% b) - lambda*stan( Matrix::diag(Q %*% Matrix::tcrossprod(D, Q)) )
}else{
fit <- stan( Y %*% b) - lambda*stan( Matrix::diag(Q %*% Matrix::tcrossprod(D, Q)) )
}
if(length(bad) > 0){fit[bad,1]=min(fit[,1])}
return(fit)
}
## -----------------------------------------------------------------------------
# silent for CRAN checks restriction on time
# res<-evolafit(formula = c(GY, occ)~hy,
# dt= DT,
# # constraints: if sum is greater than this ignore
# constraintsUB = c(Inf,50),
# # constraints: if sum is smaller than this ignore
# constraintsLB= c(-Inf,-Inf),
# # weight the traits for the selection
# b = c(1,0),
# # population parameters
# nCrosses = 100, nProgeny = 10,
# # coancestry parameters
# D=D, lambda= (10*pi)/180 , nQtlStart = 40,
# # new fitness function and additional args
# fitnessf = fitnessf, Z=Z,
# # selection parameters
# propSelBetween = 0.5, propSelWithin =0.5,
# nGenerations = 15, verbose=FALSE)
#
# Q <- pullQtlGeno(res$pop, simParam = res$simParam, trait=1); Q <- Q/2
# best = bestSol(res$pop)[,1]
# qa = (Q %*% DT$GY)[best,]; qa
# qDq = Q[best,] %*% D %*% Q[best,]; qDq
# sum(Q[best,]) # total # of inds selected
## -----------------------------------------------------------------------------
# # check how many times an individual was used in the final crosses
# crosses <- data.frame(cross=names(which( Q[best,] == 1)))
# table(unlist(strsplit(crosses$cross,":")))
# # check performance of crosses selected
# plot(DT$GY, col=as.factor(Q[best,]),
# pch=(Q[best,]*19)+1)
## -----------------------------------------------------------------------------
data("mtcars")
# we scale the variables
mtcars <- as.data.frame(apply(mtcars,2,scale))
mtcars$inter <- 1 # add an intercept if desired
# define the train and validation set
train <- sample(1:nrow(mtcars) , round((nrow(mtcars)*.4)))
validate <- setdiff(1:nrow(mtcars),train)
mtcarsT <- mtcars[train,]
mtcarsV <- mtcars[validate,]
##############################
# fit the regular linear model
head(mtcarsT)
mod <- lm(mpg~cyl+disp+hp+drat, data=mtcarsT);mod
##############################
# fit the genetic algorithm
# 1) create initial QTL effects to evolve
nqtls=100
dt <- data.frame(alpha=rnorm(nqtls,0,.3),qtl=paste0("Q",1:nqtls))
head(dt); nrow(dt)
# generate n samples equivalent to the number of progeny
# you are planning to start the simulation with (e.g., 500)
# these are fixed values
sam <- sample(1:nrow(mtcarsT),500,replace = TRUE)
y <- mtcarsT$mpg[sam]
X = mtcarsT[sam,c("cyl","disp","hp","drat")]
# Task: linear regression
res0<-evolafit(alpha~qtl, dt= dt,
# constraints: if greater than this ignore
constraintsUB = c(Inf),
# constraints: if smaller than this ignore
constraintsLB= c(-Inf),
# weight the traits for the selection
b = c(1),
# population parameters
nCrosses = 50, nProgeny = 10, recombGens = 1,
# coancestry parameters
D=NULL, lambda=0, nQtlStart = 4, fixNumQtlPerInd = TRUE,
# least MSE function (y - Xb)^2; Y are betas; X*Y is X*beta;
# Y and X are fixed, we just evolve the betas
fitnessf=regFun,
# selection parameters
propSelBetween = 0.65, propSelWithin =0.65, selectTop=FALSE,
nGenerations = 10, y=y, X=X, verbose = FALSE
)
# check how the fitness function changed across generations
pmonitor(res0, kind = 1)
# this time the best solution is the one that minimizes the error
Q <- pullQtlGeno(res0$pop, simParam = res0$simParam, trait=1); Q <- Q/2
bestid <- bestSol(res0$pop, selectTop = FALSE)[,"fitness"]
bestid
betas <- res0$simParam$traits[[1]]@addEff[which(Q[bestid,] > 0)]
betas
# plot predicted versus real values
plot( as.matrix(mtcarsV[,c("cyl","disp","hp","drat")]) %*% betas , mtcarsV$mpg,
xlab="predicted mpg value by GA", ylab="mpg",
main="Correlation between GA-prediction and observed") # GA
plot( as.matrix(mtcarsV[,c("inter","cyl","disp","hp","drat")]) %*% mod$coefficients , mtcarsV$mpg,
xlab="predicted mpg value by lm", ylab="mpg",
main="Correlation between lm-prediction and observed") # LM
# Correlation between GA-prediction and observed
cor( as.matrix(mtcarsV[,c("cyl","disp","hp","drat")]) %*% betas , mtcarsV$mpg)
# Correlation between lm-prediction and observed
cor( as.matrix(mtcarsV[,c("inter","cyl","disp","hp","drat")]) %*% mod$coefficients , mtcarsV$mpg) # LM
## -----------------------------------------------------------------------------
# "not in" operator
'%!in%' <- function(x,y)!('%in%'(x,y))
# function to simulate cities
simCities <- function(n_cities = 5) {
# extend "LETTERS" function to run from "A" to "ZZ"
MORELETTERS <- c(LETTERS, sapply(LETTERS, function(x) paste0(x, LETTERS)))
cities <- matrix(runif(2*n_cities,-1,1),ncol=2)
rownames(cities) <- MORELETTERS[1:n_cities]
colnames(cities) <- c("x","y")
return(cities)
}
# function to plot cities
plotCities <- function(cities, route, dist=NULL,
main="", bg="white",
main_col = "black",
point_col = "deepskyblue",
start_col = "red",
line_col = "black") {
# plot cities
city_colors <- c(start_col, rep(point_col,nrow(cities)))
par(bg=bg)
plot(cities,
pch=16, cex=3,
col=point_col,
ylim=c(-1,1.1), xlim=c(-1,1),
# yaxt="n", xaxt="n",
# ylab="", xlab="",
bty="n",
main=main, col.main=main_col)
text(x=cities[,1], y=cities[,2], labels=rownames(cities))
# plot route
if(!missing(route)){
for(i in 1:length(route)) {
if(route[i]>0){
nodes0 <- strsplit(names(route)[i], "-")[[1]]
lines(x=cities[nodes0,"x"],
y=cities[nodes0,"y"],
col=line_col,
lwd=1.5)
}
}
}
# add distance in legend
if(!is.null(dist)) legend("topleft",
bty="n", # no box around legend
legend=round(dist,4),
bg="transparent",
text.col="black")
}
# function to compute adjacency matrix
compAdjMat <- function(cities) return(as.matrix(dist(cities)))
## -----------------------------------------------------------------------------
nCities=5
cities <- simCities(nCities)
adjmat <- compAdjMat(cities)
# make the distance matrix a data frame
df2 <- as.data.frame(as.table(adjmat))
df2 <- df2[-which(df2$Var1 == df2$Var2),]
colnames(df2)[3] <- c("distances")
df2$route <- paste(df2$Var1, df2$Var2, sep="-")
df2
## -----------------------------------------------------------------------------
H <- with(df2, evola::overlay(Var1,Var2) )
rownames(H) <- df2$route
head(H)
## -----------------------------------------------------------------------------
salesf <- function(Y,b,Q,D,a,lambda ,H, nCities){
# simple fitness function
fitnessVal <- (Y%*%b)
###############
# CONSTRAINTS
###############
# calculate how many cities the solution has travelled to
QH <- Q %*% H
# ensure all cities have at least 2 edges
edgeCheck <- apply(QH,1,function(x){if(all(x>1)){return(1)}else{return(0)} })
#apply condition on arriving and leaving the city
fitnessVal <- ifelse(edgeCheck == 0, Inf, fitnessVal) # if not touching at least 2 cities give inf distance
# number of unique cities visited
nCityCheck <- apply(QH,1,function(x){length(which(x > 0))})
# apply condition on visiting all cities
fitnessVal <- ifelse(nCityCheck < nCities, Inf, fitnessVal) # if not in all cities give an infinite distance
return(fitnessVal)
}
## -----------------------------------------------------------------------------
res<-evolafit(formula=distances~route, dt= df2,
# constraints on traits: if greater than this ignore
constraintsUB = c(Inf),
# constraints on traits: if smaller than this ignore
constraintsLB= c(-Inf),
# weight the traits for the selection (fitness function)
b = c(1),
# population parameters
nCrosses = 50, nProgeny = 10,
# genome parameters
recombGens = 1, nChr=1, mutRateAllele=0,
# start with at least n QTLs equivalent to n cities
nQtlStart = nCities*2,
# coancestry parameters
D=NULL, lambda=0,
fitnessf = salesf,
selectTop = FALSE,
# additional variables for the fitness function
H=H, nCities=nCities,
# selection parameters
# propSelBetween = .8, propSelWithin =0.8,
nGenerations = 50, verbose=FALSE
)
pmonitor(res, kind=1) # fitness should decrease
Q <- pullQtlGeno(res$pop, simParam = res$simParam, trait=1); Q <- Q/2
best <- bestSol(res$pop, selectTop = FALSE)[,"fitness"]
Q[best,] # routes taken
Q[best,] %*% H # cities visited (should have a 2 so we arrived and left once)
plotCities(cities, route=Q[best,])
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.