trainVal = function(data, colToInd, sample ){
ID = data[!duplicated(data[, colToInd]), colToInd]
ID = data.frame(ID)
idx = sample(nrow(ID), nrow(ID) * sample) #.25
Loc_Validate = data.frame(ID=ID[-idx, ])
colnames(Loc_Validate) = colToInd
Loc_Train = data.frame(ID=ID[idx, ])
colnames(Loc_Train) = colToInd
trainx2 = dplyr::inner_join(data, Loc_Train, by=colToInd) #%>% rbind(trainx1)
validatex2 = dplyr::inner_join(data, Loc_Validate, by=colToInd)
trainx2 = data.frame(trainx2) %>% dplyr::mutate_all(as.numeric) #%>% unique()
validatex2 = data.frame(validatex2) %>% dplyr::mutate_all(as.numeric)
return(list(data.frame(trainx2),data.frame(validatex2)))
}
randomEffect = function(CNN, CN,trainingx2, startVal=1){
CS = CN[1]
field.2 = (trainingx2[!duplicated(trainingx2[,CS]), CN])
field.2$num = c(startVal:(nrow(field.2)+(startVal-1)))
colnames(field.2)[-ncol(field.2)] = CN
trainingx2 = dplyr::left_join(trainingx2, field.2[,c(CS,"num")], by=CS)
colnames(trainingx2)[ncol(trainingx2)] = CNN
gc()
return(list(data.frame(field.2), data.frame(trainingx2)))
}
xgblinearBV = function( sdp,
fdp,
season,
s0,
s1,
s2,
s3,
s4 ,
s5 ,
seas0,
seas1 ,
seas2 ,
seas3 ,
seas4 ,
seas5,
inbred,
male,
genotype,
seed,
nthread
){
# #####################################################
s0=T
s1 =T
s2 =F
s3 =F
s4 =F
s5 =F
seas0 = 21
seas1 = 20
seas2 = ""
seas3 = ""
seas4 = ""
seas5 = ""
sdp = "C:/Users/jake.lamkey/Documents/"
fdp= "C:/Users/jake.lamkey/Documents/"
library(BreedStats)
library(tidyverse)
library(doParallel)
library(caretEnsemble)
library(caret)
library(data.table)
require(BGLR)
library(asreml)
season="21S"
# rounds = 30
# eta=1
# alpha = .0003
# lambda=.0003
male = data.frame(male=c('BSQ033', 'GP734GTCBLL', "BFA143", "BQS025", "BQS986",
'BRQ529', 'GP718', 'BSR095',
'BRP251', 'BUR070', 'BRS312',
'BAC020','BSU151', 'GP6823Hx1', 'I10516', 'W8039RPGJZ',
'GP717', 'BAA441', 'GP738Hx1', 'BHH069',
'TR4949', "BRS312", "BRS314", 'I12003', 'BSQ941',
"BAA419","BHB075","BHJ471","GP702",
"40QHQ-E07", "BQS941","BRS313","BSS009","GP738","BRR553"
))
genotype=F
seed = 30
nthread = 8
season0=as.numeric(seas0)
season1=as.numeric(seas1)
season2=as.numeric(seas2)
season3=as.numeric(seas3)
season4=as.numeric(seas4)
season5=as.numeric(seas5)
male.3=male
cores=parallel::detectCores()
cl <- parallel::makeCluster(cores[1]-2, outfile="")
doParallel::registerDoParallel(cl)
######################################################
trainingx2 = data.table::fread(paste0(sdp,"BV.HSIdentical.df.csv"))
#linked.peds = openxlsx::read.xlsx(paste0("R:/Breeding/MT_TP/Models/Data/Department Data/linked.peds.updated_21S_all.xlsx"),1)
linked.peds = openxlsx::read.xlsx(paste0(sdp, "linked.peds.xlsx"),1)
industryNames = InbredNameLibrary()
industryNames = industryNames[[2]]
linked.peds[,"match"] <- suppressWarnings(suppressMessages(
plyr::revalue(as.character(linked.peds[,"match"]), industryNames))) #industry name to inbred name conversion
group_and_concat <- linked.peds %>%
dplyr::select(uniqued_id, match, Gender) %>%
dplyr::group_by(match) %>%
dplyr::mutate(Prism_Mped_Fped_HId_Ped_IName_Var = paste(uniqued_id, collapse = " , "),
HG = paste(Gender, collapse = " , "))
group_and_concat$HG = gsub(group_and_concat$HG, pattern="/", replacement = " , ")
group_and_concat$HetGrp <- sapply(group_and_concat$HG, function(x) paste(unique(unlist(stringr::str_split(x," , "))),
collapse = " , "))
group_and_concat$HetGrp = gsub(group_and_concat$HetGrp, pattern="FEMALE , Male", replacement = "Female/Male")
group_and_concat$HetGrp = gsub(group_and_concat$HetGrp, pattern="Male , FEMALE", replacement = "Female/Male")
group_and_concat = group_and_concat[!duplicated(group_and_concat$match),]
trainingx2 = dplyr::left_join(trainingx2, group_and_concat[,c(2,4,6)], by=c("FEMALE"="match"))
#BV.MC.Entry.data.test = fread(paste0(hdp,"BV.HSIdentical.df.csv"))
trainingx2 = trainingx2 %>% dplyr::filter(Plot.Discarded != "Yes",
Plot.Status != "3 - Bad",
Yield < 600,
PCT.HOH < 50 ) %>%
data.frame()
RE = randomEffect(CNN= "field", CN=c("FIELD","LINE"), trainingx2)
field.2 = RE[[1]]
trainingx2 = RE[[2]]
RE = randomEffect(CNN= "male", CN=c("MALE","LINE"), trainingx2)
male.2 = RE[[1]]
trainingx2 = RE[[2]]
RE = randomEffect(CNN= "female", CN=c("FEMALE","LINE"), trainingx2)
female.2 = RE[[1]]
trainingx2 = RE[[2]]
RE = randomEffect(CNN= "ID", CN=c("LINE","MALE","FEMALE"), trainingx2)
ID.2 = RE[[1]]
trainingx2 = RE[[2]]
RE = randomEffect(CNN= "Year", CN=c("YEAR","LINE"), trainingx2)
Year.2 = RE[[1]]
trainingx2 = RE[[2]]
RE = randomEffect(CNN= "variety", CN=c("Variety","LINE"), trainingx2)
variety.2 = RE[[1]]
trainingx2 = RE[[2]]
RE = randomEffect(CNN= "hetgrp", CN=c("HetGrp","LINE"), trainingx2)
gender.2 = RE[[1]]
trainingx2 = RE[[2]]
trainingx2$popId = paste0(trainingx2$field,"00", trainingx2$male )
trainingx2$popId = as.numeric(trainingx2$popId)
trainingx2$popIdFemale = paste0(trainingx2$field,"00", trainingx2$female )
trainingx2$popIdFemale = as.numeric(trainingx2$popIdFemale)
trainingx2$BreedLoc = trainingx2$FIELD
trainingx2$BreedLoc = plyr::revalue(trainingx2$BreedLoc,
c('Choice- Bourbon'='Beck - A-BB',
'Choice- Blissfield'='Beck - A-BF',
'Choice- Bluffton'='Beck - A-BU',
'Choice- Champaign'='Beck - A-CP',
'Choice- Greensburg'='Beck - A-GB',
'Choice- Grand Rapids'='Beck - A-GRC',
'Choice- Grelton'='Beck - A-GT',
'Choice- Jenera'='Beck - A-JEC',
'Choice- Kankakee'='Beck - A-KK',
'Choice- London'='Beck - A-LD',
'Choice- Lafayette'='Beck - A-LF',
'Choice- New Haven'='Beck - A-NH',
'Choice- Niles'='Beck - A-NI',
'Choice- Paris'='Beck - A-PS',
'Choice- Remington'='Beck - A-RM',
'Choice- Sandusky'='Beck - A-SD',
'Choice- Seymour'='Beck - A-SM',
'Choice- St. Marys'='Beck - A-SY',
'Choice- Terre Haute'='Beck - A-TH',
'Choice- Washington CH'='Beck - A-WC',
'Choice- Wheatfield'='Beck - A-WH',
'Choice- Beaver Dam'='Beck - E-BD',
'Choice- Bloomington'='Beck - E-BL',
'Choice- Dorsey'='Beck - E-DRC',
'Choice- Freeport'='Beck - E-FP',
'Choice- Janesville'='Beck - E-JN',
'Choice- Jacksonville'='Beck - E-JV',
'Choice- Princeton'='Beck - E-PR',
'Choice- Princeville'='Beck - E-PV',
'Choice- Rochelle'='Beck - E-RC',
'Choice- Seneca'='Beck - E-SE',
'Choice- Waterman'='Beck - E-WM',
'Choice- Waterloo'='Beck - E-WO',
'Choice- Benton'='Beck - H-BT',
'Choice- Columbia'='Beck - H-CO',
'Choice- Danville'='Beck - H-DK',
'Choice- Dyersburg'='Beck - H-DYC',
'Choice- Fort Branch'='Beck - H-FB',
'Choice- Neoga'='Beck - H-NGC',
'Choice- Paducah'='Beck - H-PD',
'Choice- Ridgeway'='Beck - H-RI',
'Choice- Russelville'='Beck - H-RUC',
'Choice- Sikeston'='Beck - H-SK',
'Choice- Union City'='Beck - H-UC',
'Choice- Kempton'='Beck - KE',
'Choice- Knightstown'='Beck - KT',
'Choice- Bradgate'='Beck - M-BG',
'Choice- Clinton'='Beck - M-CL',
'Choice- Chariton'='Beck - M-CT',
'Choice- Colfax'='Beck - M-CX',
'Choice- Dodgeville'='Beck - M-DG',
'Choice- Dayton'='Beck - M-DT',
'Choice- Dewitt'='Beck - M-DW',
'Choice- Iowa Falls'='Beck - M-IF',
'Choice- Ida Grove'='Beck - M-IG',
'Choice- Keota'='Beck - M-KA',
'Choice- Lexington'='Beck - M-LX',
'Choice- Nashua'='Beck - M-NU',
'Choice- Red Oak'='Beck - M-RO',
'Choice- Walcott'='Beck - M-WT2 (East)',
'Choice- Storm Lake'='Beck - SL',
'Choice- Washington'='Beck - WS',
'Choice- Charlotte'='Beck- CR',
'Choice- Frankfort'='Beck- FF',
'Choice- Rockville'='Beck- RK',
'Choice- Wabash'='Beck- WA',
'Choice- Webb'='Beck- WB',
'Choice- Charleston'='Beck-CH',
'Choice- Knoxville'='Beck-KX',
'Choice- Sacramento'='Beck-SC',
'Choice-3MG-Aberdeen'='Choice-Other',
'Choice-3MG-Wessington Springs'='Choice-Other',
'Choice- Royal Center'='Choice-Other',
'Choice- Stewartville Choice'='Choice-Other',
'Choice- Pekin'='Choice-Other',
'Choice- Virginia'='Choice-Other',
'Choice- Ridgway'='Choice-Other',
'Choice-CRD-Fairfax'='Choice-Other',
'Choice- Wells MN Choice'='Choice-Other',
'Choice- Protivin'='Choice-Other',
'Choice- Cascade'='Choice-Other',
'Choice- Chilicothe, OH'='Choice-Other',
'Choice- Paris TN'='Choice-Other',
'Choice- Portageville'='Choice-Other',
'Choice- Ft. Branch'='Choice-Other',
'Choice-CRD-Blomkest'='Choice-Other',
'Choice- Platteville'='Choice-Other',
'Choice- Chillicothe'='Choice-Other',
'Choice-CRD-Stewartville'='Choice-Other',
'CRD- Stewartville Choice'="Choice-Other")
)
trainingx2$BreedLoc = ifelse(grepl(trainingx2$BreedLoc, pattern = "*- A-*"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- M-*"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- O-*"),3,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- E-*"),4,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- H-*"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- IG"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- BF"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- WC"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- RM"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- KT"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- RO"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- GB"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- LF"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- SE"),4,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- KK"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- NH"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- NI"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- DT"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- DG"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- NU"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- BG"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- KA"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- RC"),4,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- WT"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- SL"),2,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- JN"),4,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- JV"),4,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- CP"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- BL"),4,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- PV"),4,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- PS"),1,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- WM"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- RI"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- RK"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- FB"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- WA"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- KE"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- WS"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- UC"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- SK"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- TH"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- KE"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- FP"),4,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- WB"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- DW"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- KX"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- PR"),5,trainingx2$BreedLoc
)))))))))))))))))))))))))))))))))))))))))))))))
trainingx2$BreedLoc = ifelse(grepl(trainingx2$BreedLoc, pattern = "*- DN"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- CR"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- FF"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*- CH"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*-Keith"),1,
#ifelse(grepl(trainingx2$BreedLoc, pattern = "Choice-*"),6,
ifelse(grepl(trainingx2$BreedLoc, pattern = "Contract -*"),7,
ifelse(grepl(trainingx2$BreedLoc, pattern = "Trade -*"),7,
ifelse(grepl(trainingx2$BreedLoc, pattern = "CRD*"),7,
ifelse(grepl(trainingx2$BreedLoc, pattern = "SSR-*"),7,
ifelse(grepl(trainingx2$BreedLoc, pattern = "Choice-Other"),7,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*-CH"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*-SC"),5,
ifelse(grepl(trainingx2$BreedLoc, pattern = "*-KX"),5,
ifelse(is.na(trainingx2$BreedLoc),0,
ifelse((trainingx2$BreedLoc) == "",0,trainingx2$BreedLoc
)))))))))))))))
Breedl = trainingx2[!duplicated(trainingx2$BreedLoc),c("BreedLoc")]
Breedl
BreedLoca = trainingx2[!duplicated(trainingx2$field),c("BreedLoc","field")]
linked.peds.rmdups = linked.peds[!duplicated(linked.peds$match),]
trainingx2 = left_join(trainingx2, linked.peds.rmdups[,2:3], by=c("FEMALE"="match"))
nullvarnum = variety.2 %>% dplyr::filter((Variety)=="") %>% dplyr::select(num) %>% as.integer()
BV.HSIdentical.df.A = levelSelector(level="A",BV.MC.Entry.data=trainingx2,s0=s0,s1=s1,s2=s2,s3=s3,s4=s4,s5=s5,
season0=season0,season1=season1,season2=season2,
season3=season3,season4=season4,season5=season5)
BV.HSIdentical.df.Prop = pcSelector(commericalType = "Prop", altCommericalType = "PET",
BV.MC.Entry.data=trainingx2,s0=s0,s1=s1,s2=s2,s3=s3,s4=s4,s5=s5,
season0=season0,season1=season1,season2=season2,
season3=season3,season4=season4,season5=season5)
BV.HSIdentical.df = rbind(BV.HSIdentical.df.A,
BV.HSIdentical.df.Prop)
if(genotype){
#Genos = openxlsx::read.xlsx(paste0(sdp,"exportmarkers.xlsx"), 1 )#; colnames(earht.prism.norm)[1] = "Female Pedigree"
#trimpeds = read.csv(paste0(sdp,"BV.HSIdentical.df.trimpeds.csv") )#; colnames(earht.prism.norm)[1] = "Female Pedigree"
#need two files for this function preprosseed separatly
BV.HSIdentical.df=genoReady(sdp=sdp, inbreds=inbreds, linked.peds=linked.peds,
trainingx2=BV.HSIdentical.df)
}
#male = BV.HSIdentical.df[!duplicated(BV.HSIdentical.df$male),"male"]
female = BV.HSIdentical.df[!duplicated(BV.HSIdentical.df$FEMALE),c("FEMALE","female")]
female = BV.HSIdentical.df[!duplicated(BV.HSIdentical.df$FEMALE),c("FEMALE","female")]
female.index = grepl(female$FEMALE, pattern="^B|^G|^T|^S|^R")
female.grid = female[female.index==T,]
field = BV.HSIdentical.df[!duplicated(BV.HSIdentical.df$field),c("field","FIELD","YEAR")]
field = field %>% dplyr::filter(YEAR == 2021) %>% dplyr::select(field,FIELD)
variety = trainingx2[!duplicated(trainingx2$ID), c("variety","ID")]
id = trainingx2[!duplicated(trainingx2$ID), c("male","female","ID")]
gender.male = trainingx2[!duplicated(trainingx2$male), c("hetgrp","male")]
gender.female = trainingx2[!duplicated(trainingx2$female), c("hetgrp","female")]
rm(BV.HSIdentical.df.A, BV.HSIdentical.df.Prop, BV.MC.Entry.data.test, BV.MC.Entry.data,RE,group_and_concat)
gc()
#male.3 =male.3
male.3 = data.frame(male= male.3[!duplicated(male.3),])
male.3 = dplyr::left_join(male.3,male.2[,-2],by=c("male"="MALE") )
male.3 = na.omit(male.3)
male = data.frame(male.2[,1]); female = data.frame(female.2[,1])
colnames(male)="FEMALE"
colnames(female)="FEMALE"
inbreds = rbind(male,female)
inbreds = inbreds[!duplicated(inbreds$FEMALE), ]
inbreds = data.frame(inbreds)
rm(linked.peds.rmdups)
gc()
##############################################
#process genotypes
#############################################
if(genotype){
#Genos = openxlsx::read.xlsx(paste0(sdp,"exportmarkers.xlsx"), 1 )#; colnames(earht.prism.norm)[1] = "Female Pedigree"
#trimpeds = read.csv(paste0(sdp,"BV.HSIdentical.df.trimpeds.csv") )#; colnames(earht.prism.norm)[1] = "Female Pedigree"
#need two files for this function preprosseed separatly
trainingx2=genoReady(sdp=sdp, inbreds=inbreds, linked.peds=linked.peds,
trainingx2=trainingx2)
}
#########################
field = field %>% data.frame() %>%
dplyr::filter(FIELD != c("Contract - SSR-Garden City"),
FIELD != c("(HOLDING)"),
!grepl(FIELD, pattern = "Contract"),
!grepl(FIELD, pattern = "Beck - H")) %>%
dplyr::select(field)
gc()
#field =data.frame(field=c(1,2))
testx2 = expand.grid(male.3$num, female.grid$female, Year.2$num, field$field )
# testx2$ID.cat = paste0(testx2$female, " + ", testx2$male)
testx2 = dplyr::left_join(testx2, id, by=c("Var1"="male","Var2"="female"))
testx2= dplyr::left_join(testx2, variety, by=c("ID"="ID"))
colnames(testx2)=c("male","female","Year","field","ID","variety")
testx2= dplyr::left_join(testx2, gender.female, by=c("female"="female"))
#testx2= dplyr::left_join(testx2, gender.male, by=c("male"="male"))
testx2$ID.cat = paste0(testx2$female, " + ", testx2$male)
id.unk = testx2 %>% dplyr::filter(is.na(ID)) %>%
dplyr::mutate(ID.concat = paste0(female, " + ", male)) %>%
dplyr::distinct(ID.concat) %>%
dplyr::mutate(num = ((max(trainingx2$ID)+1):(length(ID.concat)+(max(trainingx2$ID))) ) )
testx2 = dplyr::left_join(testx2, id.unk, by=c("ID.cat"="ID.concat"))
testx2$ID = ifelse(is.na(testx2$ID), testx2$num,testx2$ID)
testx2$variety = ifelse(is.na(testx2$variety), nullvarnum, testx2$variety)
testx2$popId = paste0(testx2$field,"00",testx2$male)
testx2$popIdFemale = paste0(testx2$field,"00",testx2$female)
testx2 = left_join(testx2, BreedLoca, by=c("field"))
#testx2$field = 9
testx2 = testx2[ ,c(6,5,1,2,3,4,7,10,11,12)]
if(genotype){
BV.HSIdentical.df.join.female=BV.HSIdentical.df[!duplicated(BV.HSIdentical.df$female),c("female","PC1","PC2","PC3")]
BV.HSIdentical.df.join.male=BV.HSIdentical.df[!duplicated(BV.HSIdentical.df$female),c("male","PC1","PC2","PC3")]
#Genos = openxlsx::read.xlsx(paste0(sdp,"exportmarkers.xlsx"), 1 )#; colnames(earht.prism.norm)[1] = "Female Pedigree"
#trimpeds = read.csv(paste0(sdp,"BV.HSIdentical.df.trimpeds.csv") )#; colnames(earht.prism.norm)[1] = "Female Pedigree"
#need two files for this function preprosseed separatly
testx2 = testx2 %>%
left_join(BV.HSIdentical.df.join.female) %>%
left_join(BV.HSIdentical.df.join.male)
rm(BV.HSIdentical.df.join.female, BV.HSIdentical.df.join.male)
BV.HSIdentical.df = BV.HSIdentical.df %>% dplyr::select(-c("X1","pedigree"))
trainingx2 = trainingx2 %>% dplyr::select(-c("X1","pedigree"))
gc()
}else{
BV.HSIdentical.df = BV.HSIdentical.df %>% dplyr::select(-c("pedigree"))
trainingx2 = trainingx2 %>% dplyr::select(-c("pedigree"))
}
l<-length(trainingx2)#; l
names<-names(trainingx2[,c(10:13,15,17,20:22)]); names
classes<-sapply(trainingx2[c(10:22)], class); classes
#cat(paste0(fdp),"\n")
#cat("F", "\n")
if(genotype){
if(!dir.exists(paste0(fdp,season,"_genotype"))){
dir.create(paste0(fdp,season,"_genotype"))
}
sink(file=paste0(fdp,season,"_genotype","/XGBlinearBV_genotype",season,".txt"),split=TRUE)
pdf(file = paste0(fdp,season,"_genotype","/XGBlinearBV_genotype",season,".pdf"), paper="special",width = 11,
height = 8.5,family="Times", pointsize=11,bg="white",fg="black")
#name='yield'
}else{
if(!dir.exists(paste0(fdp,season))){
dir.create(paste0(fdp,season))
}
sink(file=paste0(fdp,season,"/XGBlinearBV",season,".txt"),split=TRUE)
pdf(file = paste0(fdp,season,"/XGBlinearBV",season,".pdf"), paper="special",width = 11,
height = 8.5,family="Times", pointsize=11,bg="white",fg="black")
}
for(name in names){
print(name)
}
#
# cat("G", "\n")
# cl=parallel::detectCores()
# cl <- makePSOCKcluster(cl-1)
# registerDoParallel(cl)
#
# bind.linked.male.peds=foreach(name=names,
# .packages=c("dplyr","asreml","stats","data.table"),
# .export=c("mutate","filter","setNames","group_by","summarise",
# "asreml","setDT","left_join","fitted","data.table","transform","data.table")
# ) %dopar% {
#library(asreml)
rm(linked.peds, id.unk, gender,female,variety, male,male.3, id, field, female.grid)
gc()
name="Yield"
cat("G", "\n")
for(name in names){
cat(paste0("--------------------------------------",name,"--------------------------------------"), "\n")
if( "feature" %in% colnames(trainingx2)){
trainingx2 = trainingx2 %>% dplyr::select(-feature)
}
nameCol = paste0(name)
trainingx2 = data.frame(trainingx2)
trainingx2$feature = trainingx2[,name]
#aprop= na.omit(BV.HSIdentical.df[,c(name,41,39,37,38,40,36,42)])
if( "feature" %in% colnames(BV.HSIdentical.df)){
BV.HSIdentical.df = BV.HSIdentical.df %>% dplyr::select(-feature)
}
nameCol = paste0(name)
BV.HSIdentical.df = data.frame(BV.HSIdentical.df)
BV.HSIdentical.df$feature = BV.HSIdentical.df[,name]
#trainingx2 = na.omit(trainingx2[,c(name,41,39,37,38,40,36,42)]) #yield = 22, plt.height=13, ear=10
if(genotype){
# for (i in Index.PopID) {
# print(paste0(i))
# y = subset(QTL.MC.Entry.Population, PopID == paste0(i))
#
# if (!is.na(sum(y$`Plt Height`))) {
# print("Plt Height")
# #hist(y[,"Plt Height"],main=paste0(i))
# st.plt = shapiro.test(y$`Plt Height`)
# PltHt.cv = print(st.plt$p.value)
#
# plt.name = paste0(i)
# } else{
# PltHt.cv = print(paste0(i, " Plt Height has no values!"))
# plt.name = paste0(i)
#
# }
#
# if (!is.na(sum(y$`EarHt`))) {
# print("EarHt")
# #hist(y[,"EarHt"],main=paste0(i))
# st.ear = shapiro.test(y$`EarHt`)
# EarHt.cv = print(st.ear$p.value)
#
# ear.name = paste0(i)
# } else{
# EarHt.cv = print(paste0(i, " EarHt has no values!"))
# ear.name = paste0(i)
#
# }
#
# CV.rank.plt[[length(CV.rank.plt) + 1]] = PltHt.cv
# CV.rank.plt.name[[length(CV.rank.plt.name) + 1]] = plt.name
#
# CV.rank.ear[[length(CV.rank.ear) + 1]] = EarHt.cv
# CV.rank.ear.name[[length(CV.rank.ear.name) + 1]] = ear.name
#
# rm(PltHt.cv, EarHt.cv, y, st.ear, st.plt)
#
# }
#
#
# ear.norm = ear.plt %>% filter(EarHt >= 0.05) %>% select(-`Plt.Height`, -X)
# colnames(ear.norm) = c("PopID", "Ear.norm")
# plt.norm = ear.plt %>% filter(`Plt.Height` >= 0.05) %>% select(-EarHt, -X)
# colnames(plt.norm) = c("PopID", "Plt.norm")
#
#
#
# if (!file.exists(destfile3) && !file.exists(destfile4)) {
# nrowOfPopId.plt = list()
# PopIdName.plt = list()
# nrowOfPopId.ear = list()
# PopIdName.ear = list()
# CV.rank.plt = list()
# CV.rank.ear = list()
# Index.PopID = levels(as.factor(peds.QtlPopId.filtered$`PopID`))
# length(Index.PopID)
# setwd("R:/Breeding/MT_TP/Models/QTL/2020")
# sink(file = paste0("QTL_20S_SelectedPopIds", wd, ".txt"),
# split = TRUE)
#
# for (i in Index.PopID) {
# y = subset(peds.QtlPopId.filtered, PopID == paste0(i))
# if (nrow(y) >= 3) {
# if (!is.na(sum(y$`Plt Height`))) {
# print("PltHt")
# numOfRows.plt = print(nrow(y))
# PopId.plt = paste0(i)
# st.plt = shapiro.test(y$`Plt Height`)
# PltHt.cv = print(st.plt$p.value)
#
# } else{
# numOfRows.plt = print(paste0(i, " PltHt has no values!"))
# PltHt.cv = print(paste0(i, " PltHt has no values!"))
#
# PopId.plt = paste0(i)
#
# }
#
# if (!is.na(sum(y$`EarHt`))) {
# print("EarHt")
# numOfRows.ear = print(nrow(y))
# PopId.ear = paste0(i)
# st.ear = shapiro.test(y$`EarHt`)
# EarHt.cv = print(st.ear$p.value)
#
# } else{
# numOfRows.ear = print(paste0(i, " EarHt has no values!"))
# EarHt.cv = print(paste0(i, " EarHt has no values!"))
#
# PopId.ear = paste0(i)
#
# }
# } else{
# numOfRows.ear = print(paste0(i, " EarHt has no values!"))
# EarHt.cv = print(paste0(i, " EarHt has no values!"))
#
# PopId.ear = paste0(i)
# numOfRows.plt = print(paste0(i, " PltHt has no values!"))
# PltHt.cv = print(paste0(i, " PltHt has no values!"))
#
# PopId.plt = paste0(i)
# }
# nrowOfPopId.plt[[length(nrowOfPopId.plt) + 1]] = numOfRows.plt
# PopIdName.plt[[length(PopIdName.plt) + 1]] = PopId.plt
#
# nrowOfPopId.ear[[length(nrowOfPopId.ear) + 1]] = numOfRows.ear
# PopIdName.ear[[length(PopIdName.ear) + 1]] = PopId.ear
#
# CV.rank.plt[[length(CV.rank.plt) + 1]] = PltHt.cv
# CV.rank.ear[[length(CV.rank.ear) + 1]] = EarHt.cv
#
# rm(
# numOfRows.plt,
# PopId.plt,
# st.plt,
# PltHt.cv,
# numOfRows.ear,
# PopId.ear,
# st.ear,
# EarHt.cv,
# y
# )
# }
# sink()
#markerList = list()
# markerSelect = function(trainingx2 ,j ){
# markerLm = stats::lm(feature ~ trainingx2[, j], data = trainingx2)
#
# #variety + PC1 + PC2 + PC3 +field + ID + hetgrp female + male + Year +
#
# sumMarkerLm = summary(markerLm)
# sumMarkerLmPvalue=as.numeric(sumMarkerLm$coefficients[ ,"Pr(>|t|)"]["trainingx2[, j]"])
# #
# if(sumMarkerLmPvalue <= 0.001){
# #print(j)
# #cat(j,": P-value is ",sumMarkerLmPvalue,"\n" )
# # #markerList[[length(markerList)+1]] = j
# a = data.frame(j, sumMarkerLmPvalue)
# return(a)
# }
# rm(markerLM,sumMarkerLm,sumMarkerLmPvalue,i )
#
# }
#
# markerData=foreach(j=colnames(trainingx2)[48:ncol(trainingx2)] ,.packages=c("stats"),
# .export=c("lm"),.combine=rbind,.inorder=F) %dopar% {
# #for( j in colnames(trainingx2)[46:ncol(trainingx2)] ){
#
# a = markerSelect(trainingx2 =trainingx2 ,j=j )
# a
#
#
# }
# markerData.index = order(markerData$sumMarkerLmPvalue, decreasing = F)
# markerData = markerData[markerData.index, ]
# markerData = markerData[1:300, ]
# #markerData = data.frame(markers = markerData)
#
# trainingMarkers = trainingx2[, (markerData$j)]
# cat("----------------------------Training Marker List-------------------------------","\n")
# markerData[1:300,]
#
# cat( "\n")
#
# trainingx3 = data.frame(trainingx2[ ,1:47], trainingMarkers)
#
# rm(markerData, trainingMarkers)
# gc()
datasets = trainVal(data = trainingx2, colToInd= "ID", sample = 0.95)
gc()
trainx2 = na.omit((datasets[[1]])[, -c(1:35) ])
validatex2 =na.omit( datasets[[2]][, -c(1:35) ] )
rm(datasets,trainingx3)
gc()
# # r, proportion of phenotypic variance attributed to model residuals
# R2=0.5
#
# # Prior hyperparameter values
# # sigmaE2 (residual variance)
# mode.sigE=R2*var(trainx2 %>% rbind(validatex2) %>% dplyr::select(feature) %>% as.matrix())
# dfe=5
# Se=mode.sigE*(dfe + 2)
#
# # lambda
# mode.sigL=(1-R2)*var(trainx2 %>% rbind(validatex2) %>% dplyr::select(feature) %>% as.matrix())
# lambda.hat=sqrt(2*mode.sigE/mode.sigL*sum(colMeans(trainx2[,13:(ncol(trainx2)-1)] %>%
# rbind(validatex2[,13:(ncol(validatex2)-1)])
# )^2))
# delta.lambda=0.05 # rate
# r.lambda=lambda.hat*delta.lambda # shape
#
#
# prior=list( varE=list(S=Se,df=dfe),
# lambda=list(type='random',
# value=lambda.hat,
# shape=r.lambda,
# rate=delta.lambda) )
#
# ETA = list( list(X = rbind( trainx2[,1:10], validatex2[,1:10]), model="FIXED"),
# list(K = as.matrix(rbind(trainx2[,10:(ncol(trainx2)-1)], validatex2[,10:(ncol(validatex2)-1)])) ,
# model = "RKHS",
# prior))
#
# fmR<-BGLR::BGLR(y = trainx2 %>% rbind(validatex2) %>% dplyr::select(feature) %>% as.matrix(),
# ETA=ETA,
# nIter=100,
# burnIn=50,
# thin=1,
# verbose = T
# #saveAt=paste(outpath,"B",df,'LASSO_r_',r,'_',sep=''))
# )
#
# cat("r2 for train ALL is: ",
# cor(trainx2[, "feature"],
# fmR$yHat[1:(nrow(trainx2))])^2, "\n")
#
# cat("r2 for Validate ALL is: ",
# cor(validatex2[, "feature"],
# fmR$yHat[(nrow(trainx2)+1):(nrow(validatex2)+nrow(trainx2)) ])^2, "\n")
#
}else{
set.seed(seed)
datasets = trainVal(data = trainingx2, colToInd= "ID", sample = 0.95)
gc()
trainx2 = na.omit((datasets[[1]])[, -c(1:35) ])
validatex2 =na.omit( datasets[[2]][, -c(1:35) ] )
rm(datasets)
gc()
}
##################################################################
#final_grid1=expand.grid(nrounds=550, eta=.5, max_depth=3, gamma=0,colsample_bytree=0.95,min_child_weight=1,subsample = 1)
#final_grid2=expand.grid(nrounds=100, eta=.5, max_depth=5, gamma=0,colsample_bytree=0.95,min_child_weight=1,subsample = 1)
#final_grid1 <- expand.grid(nrounds = 500, eta = .3, lambda = .5, alpha=1.5)
#final_grid2 <- expand.grid(nrounds = 2500, eta = .5, lambda = 0.0003, alpha=0.0003)
# final_grid3 <- expand.grid(nrounds = rounds, eta = eta, lambda = lambda, alpha=alpha)
#
# final_grid3 <- expand.grid(nrounds = 3000, eta = 1, lambda = 0.0003, alpha=0.0003)
#
# final_grid4 <- expand.grid(nrounds = c(2500), eta = 1, lambda = 0.0003, alpha=0.0003)
# final_grid2 <- expand.grid(nrounds = c(2000), eta = 1, lambda = 0.0003, alpha=0.0003)
# final_grid3 <- expand.grid(mstop = 500, maxdepth = 2, nu = 0.1)
# final_grid4 <- expand.grid(committees = 10, neighbors = 20)
#trainx2$norm = (trainx2$Yield - mean(trainx2$Yield))/(max(trainx2$Yield)-min(trainx2$Yield))
if(name == "Plt.Height"){
NCAA.stacked = xgboostTraitLoop(trainx2=trainx2,validatex2=validatex2,max_depth = 6,
min_child_weight = 0,refresh_leaf = 0,
grow_policy="lossguide", max_bin = 10000, max_leaves = 40,
eta = .17, nrounds = 3000, r2 = 0.8,subsample=1,nthread=nthread)
}#done
if(name == "EarHt"){
NCAA.stacked = xgboostTraitLoop(trainx2=trainx2,validatex2=validatex2,max_depth = 15,
min_child_weight = 0,refresh_leaf = 0,
grow_policy="lossguide", max_bin = 20000, max_leaves = 50,
eta = 0.15, nrounds = 3000, r2 = 0.6, subsample=1,nthread=nthread)
}#done
if(name == "GS.Late"){
NCAA.stacked = xgboostTraitLoop(trainx2=trainx2,validatex2=validatex2,max_depth = 9,
min_child_weight = 0,refresh_leaf = 0,
grow_policy="lossguide", max_bin = 2000, max_leaves = 100,
eta = 0.2, nrounds = 3000, r2 = 0.43,subsample=1,nthread=nthread)
}#done
if(name == "PCT.HOH"){
NCAA.stacked = xgboostTraitLoop(trainx2=trainx2,validatex2=validatex2,max_depth = 6,
min_child_weight = 0,refresh_leaf = 0,
grow_policy="lossguide", max_bin = 10000, max_leaves = 40,
eta = .17, nrounds = 3000, r2 = 0.8,subsample=1,nthread=nthread)
}
if(name == "RL.Count"){
NCAA.stacked = xgboostTraitLoop(trainx2=trainx2,validatex2=validatex2,max_depth = 6,
min_child_weight = 0,refresh_leaf = 0,
grow_policy="lossguide", max_bin = 10000, max_leaves = 40,
eta = .17, nrounds = 3000, r2 = 0.8,subsample=1,nthread=nthread)
}
if(name == "SL.Count"){
NCAA.stacked = xgboostTraitLoop(trainx2=trainx2,validatex2=validatex2,max_depth = 6,
min_child_weight = 0,refresh_leaf = 0,
grow_policy="lossguide", max_bin = 10000, max_leaves = 40,
eta = .17, nrounds = 3000, r2 = 0.8,subsample=1,nthread=nthread)
}
if(name == "Test.WT"){
NCAA.stacked = xgboostTraitLoop(trainx2=trainx2,validatex2=validatex2,max_depth = 14,
min_child_weight = 0,refresh_leaf = 0,
grow_policy="lossguide", max_bin = 20000, max_leaves = 500,
eta = 0.2, nrounds = 3000, r2 = 0.77,subsample=1,nthread=nthread)
} #done
if(name == "Y.M"){
NCAA.stacked = xgboostTraitLoop(trainx2=trainx2,validatex2=validatex2,max_depth = 8,
min_child_weight = 0,refresh_leaf = 0,
grow_policy="lossguide", max_bin = 10000, max_leaves = 85,
eta = .2, nrounds = 3000, r2 = 0.6,subsample=0.95,nthread=nthread)
}
if(name == "Yield"){
NCAA.stacked = xgboostTraitLoop(trainx2=trainx2,validatex2=validatex2,max_depth = 8,
min_child_weight = 0,refresh_leaf = 0,
grow_policy="lossguide", max_bin = 10000, max_leaves = 85,
eta = .2, nrounds = 3000, r2 = 0.6,subsample=0.95,nthread=nthread)
} #done
#NCAA.stacked = bstDense
#
# models.list2 <- caretEnsemble::caretList(
# x=trainx2 %>% dplyr::select(-feature) %>% as.matrix(),
# y=(trainx2[,"feature"]),
# continue_on_fail = T,
# trControl=caret::trainControl(method="cv",
# number=1, #1
# index = createFolds((trainx2[,ncol(trainx2)]),k=2), #2
# savePredictions = TRUE,
# #classProbs=T,
# allowParallel = TRUE,
# verboseIter = TRUE
# # preProcOptions =list(
# # # method = c("knnImpute"),
# # k = 7,
# # knnSummary = mean)
# #na.remove = TRUE #method = c("center", "scale"))
# # outcome = NULL,
# # fudge = 0.2,
# # numUnique = 3,
# # verbose = FALSE,
# # freqCut = 95/5,
# # uniqueCut = 10,
# #cutoff = 0.9)
# # rangeBounds = c(0, 1))
# #p=.75
# # seeds=c(1,2,3,4,5,6,7,8,9),
# # indexFinal = length(sample(nrow(trainx2), (nrow(trainx2))*.3))
# ),
# tuneList=list(
# # qrf1=caretModelSpec(method="qrf", ntree=500, tuneLength = 1), #11
# # qrf2=caretModelSpec(method="qrf", ntree=7, tuneLength = 1), #11
# # qrf3=caretModelSpec(method="qrf", ntree=10, tuneLength = 1), #9
# # # #qrf4=caretModelSpec(method="qrf", ntree = 150, tuneLength = 1), #7
# #qrf5=caretModelSpec(method="qrf", ntree=10, tuneLength = 1), #5
# #qrf6=caretModelSpec(method="xgbLinear", tuneGrid = final_grid2), #5
# #qrf5=caretModelSpec(method="qrf",ntree=10, tuneLength = 1), #5
# # qrf6=caretModelSpec(method="xgbLinear", tuneGrid = final_grid2), #5
#
# qrf7=caretEnsemble::caretModelSpec(method="xgbLinear", tuneGrid = final_grid3) #5
# #qrf8=caretEnsemble::caretModelSpec(method="xgbLinear", tuneGrid = final_grid4) #5
#
# #qrf9=caretModelSpec(method="BstLm") #5
# #qrf8=caretModelSpec(method="cubist") #5
# # qrf6=caretModelSpec(method="qrf", ntree=2, tuneLength = 1) #5
# )
# # ),
# # methodList = c(
# # "cubist",
# # "xgbLinear"
# #
# # )
# )
#
# invisible(gc())
#
# models.list2
#
# NCAA.stacked<-caretEnsemble::caretEnsemble(models.list2, # + 95
# trControl = caret::trainControl(
# number=2,
# method="boot",
# verboseIter =TRUE,
# allowParallel = T
# )
# );NCAA.stacked # + 95
#---------------------------------------------------------------------------
# invisible(gc())
# #626063 + 36001
# #----Yield Val = 59, 52
# #----Yield tra = 67, 72
# #----Yield apr = 63, 68
# #~~~~~~~~~~~~~~~~~~
# #95958 + 5087
# #----pltht Val = 79, 74
# #----pltht tra = 83, 95
# #~~~~~~~~~~~~~~~~~~
# #95958 + 5087
# #----earht Val = 60
# #----earht tra = 68
# #~~~~~~~~~~~~~~~~~~
# #95958 + 5087
# #----gs.late Val = 43
# #----gs.late tra = 53
# #~~~~~~~~~~~~~~~~~~
# #95958 + 5087
# #----pct.hoh Val = 80
# #----pct.hoh tra = 84
# #~~~~~~~~~~~~~~~~~~
# #95958 + 5087
# #----rl.count Val = 42
# #----rl.count tra = 60
# #~~~~~~~~~~~~~~~~~~
# #95958 + 5087
# #----Y.M Val = 67
# #----Y.M tra = 73
# #~~~~~~~~~~~~~~~~~~
# #95958 + 5087
# #----Test.WT Val = 75
# #----Test.WT tra = 76
# #~~~~~~~~~~~~~~~~~~
# #95958 + 5087
# #----SL.Count Val = 47
# #----SL.Count tra = 63
#---------------------------------------------------------------------------
# #######Validate Cor
# (caret::varImp(models.list2$qrf7, scale=T))
preds = predict(NCAA.stacked, validatex2 %>% dplyr::select(-feature) %>% as.matrix())
cat("r2 for Validate ALL is: ",cor(validatex2[, "feature"], preds)^2, "\n")
cat("rmse for Validate ALL is: ",sqrt(mean((validatex2[, "feature"] - preds)^2)), "\n")
hist(preds, main= paste0(name))
plot(preds, validatex2[,"feature"], col = c("red","blue"), main = paste0(name))
#######training set
preds.t = predict(NCAA.stacked, trainx2 %>% dplyr::select(-feature) %>% as.matrix())
cat("r2 for Train ALL is: ",cor(trainx2[, "feature"], preds.t)^2, "\n")
cat("rmse for Train ALL is: ",sqrt(mean((trainx2[, "feature"] - preds.t)^2)), "\n")
hist(preds.t, main= paste0(name))
plot(preds.t, trainx2[,"feature"], col = c("red","blue"), main = paste0(name))
#######AProp set
ap.prop = na.omit( BV.HSIdentical.df[, colnames(trainx2) ] ) %>% mutate_all( as.numeric )
preds.ap = predict(NCAA.stacked, ap.prop %>% dplyr::select(-feature) %>% as.matrix() )
cat("r2 for Prop and A level is: ",cor(ap.prop[,"feature"], preds.ap)^2, "\n")
cat("rmse for Prop and A level is: ", sqrt(mean((ap.prop[,"feature"] - preds.ap)^2)), "\n")
#sqrt(mean((ap.prop[,8] - preds.ap)^2))
hist(preds.ap, main= paste0(name))
plot(preds.ap, ap.prop[,"feature"], col = c("red","blue"), main = paste0(name))
preds.ap = data.table(ap.prop %>% dplyr::select(-feature), preds.ap)
preds.test.agg.FEMALE = preds.ap %>%
dplyr::group_by(female) %>%
dplyr::summarize(preds.ap = mean(preds.ap))
preds.test.agg.MALE = preds.ap %>%
dplyr::group_by(male) %>%
dplyr::summarize(preds.ap = mean(preds.ap))
preds.test.agg.FEMALE = dplyr::left_join(preds.test.agg.FEMALE, female.2[,-2],by=c("female"="num"))
preds.test.agg.MALE = dplyr::left_join(preds.test.agg.MALE, male.2[,-2],by=c("male"="num"))
colnames(preds.test.agg.MALE) = c("female", "preds.ap", "FEMALE")
preds.test.agg = rbind(preds.test.agg.FEMALE, preds.test.agg.MALE)
preds.test.agg = preds.test.agg %>%
dplyr::group_by(FEMALE) %>%
dplyr::summarize(preds.ap = mean(preds.ap))
#dplyr::mutate(BV = (preds.ap - 228)/2 )
hist(preds.test.agg$preds.ap, main= paste0(name))
#biplot(preds.test.agg$FEMALE, preds.test.agg$BV)
# preds.test.agg.index.female = order(preds.test.agg$FEMALE)
# preds.test.agg = preds.test.agg[preds.test.agg.index.female,]
preds.test.agg.index.trait = order(preds.test.agg$preds.ap, decreasing = T)
preds.test.agg = preds.test.agg[preds.test.agg.index.trait,]
preds.test.agg = data.frame(preds.test.agg)
#ggplot(data=preds.test.agg, aes(x=reorder(FEMALE, preds.ap), y=preds.ap)) + geom_line("identity")
# rm(id.unk.all,df5,Blup, datasets, aprop,id.unk, preds.ap, id, preds.t, preds, models.list2,trainingx2,variety,
# BV.HSIdentical.df.3, male.3, validatex2, trainx2, BV.HSIdentical.df)
# gc()
#######expand.grind set male.female.year
rm(validatex2, trainx2)
gc()
cat("Predicting A and Prop test level for all combinations over Years, Locations, Male, Female", "\n")
ap.prop = ap.prop[!duplicated(ap.prop$female),]
#testx2 = testx2 %>% left_join(ap.prop[, colnames(ap.prop)[-c(1,2,4,5,6,7,8,9,10,11)]], by = "female")
#testx2 = testx2[,c(6,3,4,2,5,1,7,8:ncol(testx2))]
preds.test = predict(NCAA.stacked, testx2[,colnames(ap.prop)[-ncol(ap.prop)]] %>% mutate_all(as.numeric) %>% as.matrix())
hist(preds.test, main= paste0(name))
#filter(preds.test > 250)
# preds.test.bind.2 = preds.test.bind[,c(1,3,2,4,5,6,7)]
# colnames(preds.test.bind.2)[c(2,3)] = c("MALE","FEMALE")
# #
# BV.HSIdentical.df.3 = rbind(preds.test.bind.2,preds.test.bind)
# BV.HSIdentical.df.3 = data.frame(BV.HSIdentical.df.3)
#testx2 = testx2[,-c(8:ncol(testx2))]
gc()
preds.test = data.table(testx2, preds.test)
gc()
preds.test.bind = preds.test %>%
dplyr::left_join( Year.2[,-2],by=c("Year"="num")) %>%
dplyr::group_by(field, ID) %>%
dplyr::summarize(preds.test = mean(preds.test)) %>%
dplyr::select(preds.test) %>%
data.frame()
hist(preds.test$preds.test, main= paste0(name))
BreedlocName = data.frame(num = c(1,2,3,4,5,6),
fieldName = c("Atlanta","Marshalltown","Olivia","Exp","Henderison","Choice"))
preds.test.loc = preds.test %>%
dplyr::mutate_all(as.numeric) %>%
dplyr::left_join( Year.2[,-2],by=c("Year"="num")) %>%
dplyr::left_join(BreedlocName, by=c("BreedLoc"="num")) %>%
dplyr::group_by(fieldName, ID) %>%
dplyr::summarize(preds.test = mean(preds.test)) %>%
dplyr::select(preds.test) %>%
data.frame()
#head(preds.test.loc)
# preds.test.agg.FEMALE = preds.test.bind %>%
# group_by(FEMALE) %>%
# summarize(preds.test = mean(preds.test))
#
# preds.test.agg.MALE = preds.test.bind %>%
# group_by(MALE) %>%
# summarize(preds.test = mean(preds.test))
#
# colnames(preds.test.agg.MALE) = c("FEMALE","preds.test")
# preds.test.agg = rbind(preds.test.agg.FEMALE,preds.test.agg.MALE)
#
# preds.test.agg.FEMALE = preds.test.agg %>%
# group_by(FEMALE) %>%
# summarize(preds.test = mean(preds.test))
#rm(preds.test.agg.FIELD);gc()
###############################
preds.test.agg$preds.ap = round(preds.test.agg$preds.ap, 0)
assign(paste0(name,"_XGBlinearBV_",season,"SbyFemale"), preds.test.agg)
assign(paste0(name,"_XGBlinearBV_",season,"SbyField"), data.frame(round(preds.test.bind$preds.test,0)))
assign(paste0(name,"_XGBlinearBV_",season,"SbyLoc"), data.frame(round(preds.test.loc$preds.test,0)))
rm(ap.prop,datasets, id.unk,models.list2, NCAA.stacked, preds.ap,preds.test.agg,preds.test.agg.FEMALE,preds.test.agg.MALE,
trainx2, validatex2, preds,preds.t,preds.test,preds.test.bind)
rm(gender,field,id,variety,preds.test.loc)
gc()
}
sink()
dev.off()
###############################
gc()
colnames(inbreds) = "FEMALE"
preds.testFemale = dplyr::left_join(dplyr::left_join(dplyr::left_join(dplyr::left_join(dplyr::left_join(
dplyr::left_join(dplyr::left_join(dplyr::left_join(dplyr::left_join(
inbreds,
eval(as.name(paste0("EarHt_XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
eval(as.name(paste0("GS.Late_XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
eval(as.name(paste0("PCT.HOH_XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
eval(as.name(paste0("Plt.Height_XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
#eval(as.name(paste0("RL.._XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
eval(as.name(paste0("RL.Count_XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
#eval(as.name(paste0("SL.._XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
eval(as.name(paste0("SL.Count_XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
#eval(as.name(paste0("StandCnt..Final._XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
eval(as.name(paste0("Test.WT_XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
eval(as.name(paste0("Y.M_XGBlinearBV_",season,"SbyFemale"))), by="FEMALE"),
eval(as.name(paste0("Yield_XGBlinearBV_",season,"SbyFemale"))), by="FEMALE")
colnames(preds.testFemale) = c("Female","EarHT_BV","GS.Late_BV","PCT.HOH_BV",
"Plt.Height_BV","RL.Count_BV","SL.Count_BV",
"Test.WT_BV","Y.M_BV","Yield_BV")
preds.testFemale = preds.testFemale[!is.na(preds.testFemale$Yield), ]
rm(list=grep(pattern = paste0("*_XGBlinearBV_",season,"SbyFemale"), x=ls(), value=TRUE))
preds.test = cbind(eval(as.name(paste0("EarHt_XGBlinearBV_",season,"SbyField"))),
eval(as.name(paste0("GS.Late_XGBlinearBV_",season,"SbyField"))),
eval(as.name(paste0("PCT.HOH_XGBlinearBV_",season,"SbyField"))),
eval(as.name(paste0("Plt.Height_XGBlinearBV_",season,"SbyField"))),
#eval(as.name(paste0("RL.._XGBlinearBV_",season,"SbyField"))),
eval(as.name(paste0("RL.Count_XGBlinearBV_",season,"SbyField"))),
#eval(as.name(paste0("SL.._XGBlinearBV_",season,"SbyField"))),
eval(as.name(paste0("SL.Count_XGBlinearBV_",season,"SbyField"))),
#eval(as.name(paste0("StandCnt..Final._XGBlinearBV_",season,"SbyField"))),
eval(as.name(paste0("Test.WT_XGBlinearBV_",season,"SbyField"))),
eval(as.name(paste0("Y.M_XGBlinearBV_",season,"SbyField"))),
eval(as.name(paste0("Yield_XGBlinearBV_",season,"SbyField"))))
colnames(preds.test) = names
rm(list=grep(pattern = paste0("*_XGBlinearBV_",season,"SbyField"), x=ls(), value=TRUE))
gc()
preds.test.loc = cbind(eval(as.name(paste0("EarHt_XGBlinearBV_",season,"SbyLoc"))),
eval(as.name(paste0("GS.Late_XGBlinearBV_",season,"SbyLoc"))),
eval(as.name(paste0("PCT.HOH_XGBlinearBV_",season,"SbyLoc"))),
eval(as.name(paste0("Plt.Height_XGBlinearBV_",season,"SbyLoc"))),
#eval(as.name(paste0("RL.._XGBlinearBV_",season,"SbyLoc"))),
eval(as.name(paste0("RL.Count_XGBlinearBV_",season,"SbyLoc"))),
#eval(as.name(paste0("SL.._XGBlinearBV_",season,"SbyLoc"))),
eval(as.name(paste0("SL.Count_XGBlinearBV_",season,"SbyLoc"))),
#eval(as.name(paste0("StandCnt..Final._XGBlinearBV_",season,"SbyLoc"))),
eval(as.name(paste0("Test.WT_XGBlinearBV_",season,"SbyLoc"))),
eval(as.name(paste0("Y.M_XGBlinearBV_",season,"SbyLoc"))),
eval(as.name(paste0("Yield_XGBlinearBV_",season,"SbyLoc"))))
colnames(preds.test.loc) = names
rm(list=grep(pattern = paste0("*_XGBlinearBV_",season,"SbyLoc"), x=ls(), value=TRUE))
gc()
testx2.2 = testx2 %>%
dplyr::left_join( Year.2[,-2],by=c("Year"="num")) %>%
dplyr::group_by(field, ID, male, female) %>%
dplyr::summarize(Year = mean(Year)) %>%
dplyr::select(-Year)
gc()
preds.test = data.table(testx2.2, preds.test)
preds.test = preds.test %>% filter( Yield >=180 )
preds.test = preds.test %>% filter( Test.WT >=55 )
preds.test = preds.test %>% filter( EarHt >=43 )
preds.test = preds.test %>% filter( Plt.Height >=96 )
preds.test = preds.test %>% filter( GS.Late <=4 )
gc()
preds.test.bind = preds.test %>%
dplyr::left_join( field.2[, c(-2)],by=c("field"="num")) %>%
dplyr::left_join( male.2[, c(-2)],by=c("male"="num")) %>%
dplyr::left_join( female.2[, c(-2)],by=c("female"="num")) %>%
dplyr::select(-c(1:4))
preds.test.bind$ID = paste0(preds.test.bind$FEMALE," + ", preds.test.bind$MALE)
rm(preds.test, testx2.2)
gc()
#################################
testx2.3 = testx2 %>%
dplyr::mutate_all(as.numeric) %>%
dplyr::left_join( Year.2[,-2],by=c("Year"="num")) %>%
dplyr::left_join(BreedlocName, by=c("BreedLoc"="num")) %>%
dplyr::group_by(fieldName, ID,male,female) %>%
dplyr::summarize(BreedlocName = mean(BreedlocName)) %>%
data.frame()
preds.test.loc = data.table(testx2.3, preds.test.loc)
preds.test.loc = preds.test.loc %>% filter( Yield >=180 )
preds.test.loc = preds.test.loc %>% filter( Test.WT >=55 )
preds.test.loc = preds.test.loc %>% filter( EarHT >=43 )
preds.test.loc = preds.test.loc %>% filter( Plt.Height >=96 )
preds.test.loc = preds.test.loc %>% filter( GS.Late <=4 )
gc()
preds.test.bind.loc = preds.test.loc %>%
#dplyr::left_join( field.2[, c(-2)],by=c("field"="num")) %>%
dplyr::left_join( male.2[, c(-2)],by=c("male"="num")) %>%
dplyr::left_join( female.2[, c(-2)],by=c("female"="num")) %>%
dplyr::select(-c(2:5))
preds.test.bind.loc = preds.test.bind.loc[,c(2:12,1)]
preds.test.bind.loc$ID = paste0(preds.test.bind.loc$FEMALE," + ", preds.test.bind.loc$MALE)
rm(preds.test.loc,testx2.3)
#preds.test.agg.FIELD = tidyr::separate(preds.test.agg.FIELD, sep= " \\+ " ,col = LINE, into=c("FEMALE","MALE"), remove=F)
preds.test.bind = preds.test.bind[,c(10:13,1:9)]
preds.test.bind.loc = preds.test.bind.loc[,c(10:13,1:9)]
cat("Printing Colnames ", colnames(preds.test.bind), "\n")
# preds.test.agg.FIELD = preds.test.agg.FIELD %>%
# dplyr::filter(FIELD != c("Contract - SSR-Garden City"),
# FIELD != c("(HOLDING)"),
# !grepl(FIELD, pattern = "Contract"),
# !grepl(FIELD, pattern = "Beck - H"))
preds.test.bind.inbredselect = preds.test.bind %>% dplyr::filter(MALE == inbred)
preds.test.bind.LINE = preds.test.bind %>%
dplyr::group_by(ID, MALE, FEMALE) %>%
dplyr::summarize(EarHt.BV = mean(EarHt),
GS.Late.BV = mean(GS.Late),
PCT.HOH = mean(PCT.HOH),
Plt.Height.BV = mean(Plt.Height),
RL.Count.BV = mean(RL.Count),
SL.Count.BV = mean(SL.Count),
Test.WT.BV = mean(Test.WT),
Y.M.BV = mean(Y.M),
Yield.BV = mean(Yield))
gc()
#preds.test.agg.FIELD.LINE = tidyr::separate(preds.test.agg.FIELD.LINE, sep= " \\+ " ,col = LINE, into=c("FEMALE","MALE"), remove=F)
if(genotype){
openxlsx::write.xlsx(preds.test.bind.LINE, paste0(fdp,season,"_genotype","/","A.Prop",season,"_predsByLine.xlsx"),rowNames=F,overwrite=T)
rm(preds.test.bind.LINE);gc()
cat("Finished writing by LINE", "\n")
openxlsx::write.xlsx(preds.test.bind.inbredselect, paste0(fdp,season,"_genotype","/","A.Prop",season,"_predsbyLine",inbred,".xlsx"),rowNames=F,overwrite=T)
rm(preds.test.bind.inbredselect);gc()
cat("Finished writing by LINE MALE", "\n")
openxlsx::write.xlsx(preds.testFemale, paste0(fdp,season,"_genotype","/","A.Prop",season,"_predsByFemale.xlsx"),rowNames=F,overwrite=T)
rm(preds.testFemale);gc()
cat("Finished writing by FEMALE", "\n")
Field= "Field"
if(!dir.exists(paste0(fdp,season,"_genotype",Field))){
dir.create(paste0(fdp,season,"_genotype",Field))
}
field.index = preds.test.bind[!duplicated(preds.test.bind$FIELD), "FIELD"]
field.index = as.matrix(field.index)
for(i in field.index){
field.subset = subset(x=preds.test.bind, FIELD == i )
openxlsx::write.xlsx(field.subset, paste0(fdp,season,"_genotype","/",Field,"/A.Prop",season,"_predsbyLINE",i,".xlsx"),rowNames=F,overwrite=T)
}
}else{
openxlsx::write.xlsx(preds.test.bind.LINE, paste0(fdp,season,"/","A.Prop",season,"_predsByLine.xlsx"),rowNames=F,overwrite=T)
rm(preds.test.bind.LINE);gc()
cat("Finished writing by LINE", "\n")
openxlsx::write.xlsx(preds.test.bind.inbredselect, paste0(fdp,season,"/","A.Prop",season,"_predsbyLine",inbred,".xlsx"),rowNames=F,overwrite=T)
rm(preds.test.bind.inbredselect);gc()
cat("Finished writing by LINE MALE", "\n")
openxlsx::write.xlsx(preds.testFemale, paste0(fdp,season,"/","A.Prop",season,"_predsByFemale.xlsx"),rowNames=F,overwrite=T)
rm(preds.testFemale);gc()
cat("Finished writing by FEMALE", "\n")
openxlsx::write.xlsx(preds.test.bind.loc, paste0(fdp,season,"/","A.Prop",season,"_predsByLocation.xlsx"),rowNames=F,overwrite=T)
rm(preds.test.bind.loc);gc()
cat("Finished writing by LOCATION", "\n")
Field= "Field"
if(!dir.exists(paste0(fdp,season,"/",Field))){
dir.create(paste0(fdp,season,"/",Field))
}
field.index = preds.test.bind[!duplicated(preds.test.bind$FIELD), "FIELD"]
field.index = as.matrix(field.index)
for(i in field.index){
field.subset = subset(x=preds.test.bind, FIELD == i )
openxlsx::write.xlsx(field.subset, paste0(fdp,season,"/",Field,"/A.Prop",season,"_predsbyLINE",i,".xlsx"),rowNames=F,overwrite=T)
}
}
BreedLoca = dplyr::left_join(BreedLoca, field.2, by=c("field"="num"))
BreedLoca$BreedLoc = as.numeric( BreedLoca$BreedLoc)
BreedLoca = dplyr::left_join(BreedLoca, BreedlocName, by=c("BreedLoc"="num"))
preds.test.bind = dplyr::left_join(preds.test.bind,BreedLoca, by=c("FIELD"))
preds.test.bind = preds.test.bind[,c(1,2,3,4,17,5:13)]
rm(BreedLoca, BreedlocName,BV.HSIdentical.df, field.subset,trainingx2,testx2)
gc()
parallel::stopCluster(cl)
gc()
cat("DONE", "\n")
return(data.frame(preds.test.bind))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.