##' @title Quality Control of Phenotype and Genotype
##' (STEP1 of The Add-Dom Model)
##'
##' @description Quality control of Phenotype and Genotype (Input file format: PLINK or GenABEL)
##' (1) Discard phenotypes with <200 individuals or logical variables;
##' (2) Remove extreme values over threefold sd from the mean;
##' (3) Remove genotypes with MAF<0.05 or missing rate>0.1;
##' (4) Normalized phenotypes using "quantile" or "-log2" transforming;
##' (5) Histogram Plot of the raw, clean, residual, normalized and transformed phenotypes;
##' (6) Calculate kinship matrix using GenABEL, EMMA, EMMAX, GEMMA, GCTA, HOMEBREW_AFW or HOMEBREW_AS;
##' (7) Summary the mean, sd and sum of each phenotype.
##'
##' @details NOTE1: PLINK Input Format (1) Genotype File, named "file.bed", "file.bim" & "file.fam" (2) Phenotype File, named "file.phe" (1st column name should be "id"; The covariates columns should be prior than phenotypes; The sex column should coded as female=0 and male=1) (3) Covariates File, named "file.covs" (1st column is phenotype names; 2nd column is corresponding covariates separated by ","). NOTE2: GenABEL Input Format (1) file.ABEL.dat (Just contain one GenABEL type variable named "dat") (2) file.covs (1st column is phenotype name; 2nd column is corresponding covariates and all covariates should be separated by ",") NOTE3: Required Softwares: plink (v.1.90) & gcta64 (or emma/emmax-kin/gemma, only required when specified)
##'
##' @param indir A character. The input directory where contains the input bPLINK or GenABEL data.
##' @param outdir A character. The output directory where generates the folder: "1_PheGen".
##' @param Input_name A character. The prefixes of the input files.
##' @param Input_type A character. The format of input data. Please select from "PLINK" or "GenABEL".
##' @param Kinship_type A character. The method to generate kinship matrix. Please select from "GenABEL","EMMA","EMMAX","GEMMA", "GCTA", "GCTA_ad", "HOMEBREW_AFW" or "HOMEBREW_AS".
##' @param PheList_Choose A logic variable. T: Investigate specified phenotypes; F: Investigate all phenofiles.
##' @param PheList A vector of character. Please specifie a list like c("id","cov1","cov2","phe1","phe2"), when "PheList_Choose=F".
##' @param Phe_HistogramPlot A logic variable. T: Draw the histogram plots for all phenotypes; F: Aviod the histogram plots.
##' @param Phe_ResDone A logic variable. T: The input data has already been residualize, won't correct the covariates effect; F: Correct the covariates effect.
##' @param Phe_NormDone A logic variable. T: The input data has already been normalized, won't implement Log Transforming; F: Implement Log Transforming.
##' @param Normal_method A character. When choose "Phe_NormDone = F", the specified normalized method will be needed, "LOG2" or "QUANTILE".
##' @param covariates_sum A numeric variable. The sum of all covariates.
##' @param covariates_types A vector of character. The type of all covariates. Please select from "n" and "f". "f" stands for factorization.
##' @param Phe_IndMinimum A numeric variable. Remove phenotypes without enough available individuals.
##' @param Phe_Extreme A numeric variable. Phenotype QC2: Remove extreme phenotype values over -/+ Phe_Extreme*sd from mean.
##' @param GT_maf A numeric variable. Genotype QC1: Remove genotypes with MAF<GT_maf.
##' @param GT_missing A numeric variable. Genotype QC2: Remove genotypes with rate>GT_missing.
##' @param num_nodes A numeric variable. The number of cores used parallelly.
##'
##' @return a folder named "1_PheGen" with phenotypes and genotypes after QC.
##'
##' @examples
##' covariates_types = c("n","f")
##' names(covariates_types) = c("sex","batch")
##' ADDO_AddDom1_QC(indir=indir, outdir=outdir, Input_name="TEST", Kinship_type="GCTA_ad", PheList=c("sex","batch"), covariates_sum=2)
##'
##' @author Leilei Cui and Bin Yang
ADDO_AddDom1_QC <- function(indir = indir,
outdir = outdir,
Input_name = Input_name,
Input_type = "PLINK",
Kinship_type = Kinship_type,
PheList_Choose = F,
PheList = PheList,
Phe_HistogramPlot = F,
Phe_ResDone = F,
Phe_NormDone = F,
Normal_method = "QUANTILE",
covariates_sum = covariates_sum,
covariates_types = covariates_types,
Phe_IndMinimum = 200,
Phe_Extreme = 5,
GT_maf = 0.05,
GT_missing = 0.1,
num_nodes = 10){
setwd(outdir); system("mkdir 1_PheGen 1_PheGen/0_Data 1_PheGen/1_PheRaw 1_PheGen/2_PheClean 1_PheGen/3_PheRes 1_PheGen/4_PheNor 1_PheGen/5_PheTrans")
read <- function(...) as.data.frame(fread(header=F,colClasses="double",...))
read.header <- function(...) as.data.frame(fread(header=T,colClasses="double",...))
read.ped <- function(...) as.data.frame(fread(header=F,colClasses="character",...))
Plot_Phe_Histogram <- function(x,Phe_data,Phe_names){
try({
phe = Phe_data[,x]; Phe_type = Phe_names[x]
data_histogram = as.numeric(phe)
data_histogram <- data_histogram[!is.na(data_histogram)]
png(paste("HisSca_Plot_",Phe_type,".png",sep=""),width=1500,height=1500,res=200,type="cairo")
par(mai=c(0.95,0.95,1,0.5))
h <- hist(data_histogram,main=paste("Histogram Plot : ",Phe_type,sep=""),col=c("red"),breaks=170,cex.lab=1.5,cex.axis=1.5,cex.main=1.7)
xfit = seq(min(data_histogram),max(data_histogram),length=1000)
yfit = dnorm(xfit,mean=mean(data_histogram),sd=sd(data_histogram))
yfit = yfit*diff(h$mids[1:2])*length(data_histogram)
lines(xfit,yfit,col="blue",lwd=3)
box()
dev.off()
})
}
covs_matrix <- function(data,new_covs,covariates_types){
try({
x_mat=matrix(1,nrow=dim(data)[1],ncol=1)
for (c in 1:length(new_covs)) {
cov=new_covs[c]
type_cov=covariates_types[cov]
if (type_cov=='f') {
if(length(unique(data[,cov])) == 1){
matrice = rep(1,nrow(data))
}else{
old=data[,cov]
old=as.factor(as.numeric(old))
levels=levels(old)
n=length(levels)-1
matrice=matrix(ncol=n,nrow=length(old),NA)
colnames(matrice)=levels[1:n]
for (l in levels[1:n]) {
matrice[,l]=(old==l)
}
}
x_mat=cbind(x_mat,matrice)
} else {
data[,cov]=as.numeric(data[,cov])
x_mat=cbind(x_mat,data[,cov])
}
}
return(x_mat)
})
}
#batch = model.matrix(~as.factor(data.tmp[,"batch"]))
log2_norm <- function(x){
try({
x_log2 = x
for(i in 1:length(x)){
if(!(is.na(x[i])) && x[i]>0){
x_log2[i] = log2(x[i])
}else{
x_log2[i] = NA
}
}
return(x_log2)
})
}
quantile_norm <- function(x){
try({
names(x) = paste("Ind",1:length(x),sep="")
x_clean = x[!is.na(x)]
x_clean_norm = qnorm(rank(x_clean)/(1+length(x_clean)))
x[names(x_clean)] = x_clean_norm
return(x)
})
}
if(Input_type == "PLINK"){
setwd(paste(outdir,"/1_PheGen/0_Data",sep=""))
if(!file.exists("Data_clean.bed")){system(paste("plink --noweb --silent --bfile ",indir,"/",Input_name," --maf ",GT_maf," --geno ",GT_missing," --make-bed --out Data_clean",sep=""))}
if(!file.exists("Data_clean.frq")){system("plink --noweb --silent --bfile Data_clean --freq --out Data_clean")}
Clean_map = read("Data_clean.bim")
Phe_Raw = read.header(paste(indir,"/",Input_name,".phe",sep=""))
rownames(Phe_Raw) = Phe_Raw[,1]; colnames(Phe_Raw)[1] = "id"
}
if(Input_type == "GenABEL"){
load(paste(indir,"/",Input_name,".ABEL.dat",sep="")); dat = dat # Note0: The raw variable name of GenABEL #
setwd(paste(outdir,"/1_PheGen/0_Data",sep=""))
export.plink(dat,filebasename="Data_raw",phenotypes="all")
if(!file.exists("Data_raw.tped")){system("plink --noweb --silent --file Data_raw --transpose --recode --out Data_raw")}
if(!file.exists("Data_clean.bed")){system(paste("plink --noweb --silent --tfile Data_raw --maf ",GT_maf," --geno ",GT_missing," --make-bed --out Data_clean",sep=""))}
if(!file.exists("Data_clean.frq")){system("plink --noweb --silent --bfile Data_clean --freq --out Data_clean")}
Clean_map = read("Data_clean.bim")
Phe_Raw = read.header("Data_raw.phe")[,-1] # NOTE: When the input files are in GenABEL format, the 1st col of exported phenotype file is "FID" #
rownames(Phe_Raw) = Phe_Raw[,1]; colnames(Phe_Raw)[1] = "id" # NOTE: When the input files are in GenABEL format, the colname of individual's column is "IID" #
}
if(PheList_Choose == T){ PheList = PheList } else{ PheList = colnames(Phe_Raw) }
PheRaw_judge = rep("clean_phe",ncol(Phe_Raw))
for(i in 1:ncol(Phe_Raw)){
if(length(Phe_Raw[,i][unique(Phe_Raw[,i]) %in% c("T","TRUE","F","FALSE")]) > 0 || length(unique(Phe_Raw[,i][!is.na(Phe_Raw[,i])])) < 3){
PheRaw_judge[i] = "bad_phe"
}else{
PheRaw_judge[i] = "clean_phe"
}
}
names(PheRaw_judge) = colnames(Phe_Raw)
PheRaw_judge[1:(covariates_sum+1)] = "clean_phe" # In case some covariates are logical or case_control #
PheList = PheList[!PheList %in% names(PheRaw_judge[PheRaw_judge=="bad_phe"])]
Phe_Raw = Phe_Raw[,PheList]
write.table(Phe_Raw,file="../1_PheRaw/Phe_Raw.txt",row.names=F,col.names=T,quote=F)
setwd(paste(outdir,"/1_PheGen/1_PheRaw",sep=""))
if(Phe_HistogramPlot){ Plot_Phe_raw = mclapply((covariates_sum+2):length(PheList),Plot_Phe_Histogram,Phe_data=Phe_Raw,Phe_names=PheList,mc.cores=num_nodes) }
Phe_Clean = Phe_Raw[,PheList]; rownames(Phe_Clean) = paste("Ind_",Phe_Clean[,1],sep=""); PheList_rm = c() # In case for the individual names are just number #
for(i in (covariates_sum+2):length(PheList)){ # Note1: The first 3 cols are "id","sex","batch" #
Phe_tmp = Phe_Clean[,c("id",PheList[i])]; Phe_tmp_clean = Phe_tmp[!is.na(Phe_tmp[,2]),] # Remove NA inds #
if(length(unique(Phe_tmp_clean)[,2]) != 2){ # In case for some 0/1 phenotypes #
Phe_extreme1 = rownames(Phe_tmp_clean[Phe_tmp_clean[,2]>(mean(Phe_tmp_clean[,2],na.rm=T)+sd(Phe_tmp_clean[,2],na.rm=T)*Phe_Extreme),])
Phe_extreme2 = rownames(Phe_tmp_clean[Phe_tmp_clean[,2]<(mean(Phe_tmp_clean[,2],na.rm=T)-sd(Phe_tmp_clean[,2],na.rm=T)*Phe_Extreme),])
Phe_Clean[c(Phe_extreme1,Phe_extreme2),PheList[i]] = NA # Assign individuals with extreme phenotype to be NA #
}
if(length(Phe_Clean[,PheList[i]][!is.na(Phe_Clean[,PheList[i]])]) < Phe_IndMinimum) {PheList_rm = c(PheList_rm,PheList[i])} # Get the phe name with Num_Ind < Phe_IndMinimum #
}
rownames(Phe_Clean) = Phe_Clean[,1]
Phe_Clean = Phe_Clean[,colnames(Phe_Clean)[!colnames(Phe_Clean) %in% PheList_rm]]; PheList = PheList[!PheList %in% PheList_rm]
write.table(Phe_Clean,file="../0_Data/Data_clean.phe",row.names=F,col.names=T,quote=F)
write.table(Phe_Clean,file="../2_PheClean/Phe_Clean.txt",row.names=F,col.names=T,quote=F)
setwd(paste(outdir,"/1_PheGen/2_PheClean",sep=""))
if(Phe_HistogramPlot){ Plot_Phe_clean = mclapply((covariates_sum+2):length(PheList),Plot_Phe_Histogram,Phe_data=Phe_Clean,Phe_names=PheList,mc.cores=num_nodes) }
if(Phe_ResDone == T){
Phe_Res = Phe_Clean
write.table(Phe_Res,file="../3_PheRes/Phe_Res.txt",row.names=F,col.names=T,quote=F)
setwd(paste(outdir,"/1_PheGen/3_PheRes",sep=""))
if(Phe_HistogramPlot){ Plot_Phe_res = mclapply((covariates_sum+2):length(PheList),Plot_Phe_Histogram,Phe_data=Phe_Res,Phe_names=PheList,mc.cores=num_nodes) }
}else{
Phe_Res = Phe_Clean; Phe_covs = read.header(paste(indir,"/",Input_name,".covs",sep="")); rownames(Phe_covs) = Phe_covs[,1]
for(args in (covariates_sum+2):length(PheList)){
phenotype_name = PheList[args]; Phe_covs_tmp = strsplit(Phe_covs[phenotype_name,2],",")[[1]]
keep_inds_tmp = rownames(Phe_Clean)[!is.na(Phe_Clean[,phenotype_name])] # (1) Remove individuals with NA phenotype #
x_mat.tmp = Phe_Clean[keep_inds_tmp,c("id",Phe_covs_tmp)]
keep_inds = rownames(x_mat.tmp[complete.cases(x_mat.tmp),]) # (2) Remove individuals with NA covariates #
y.used = Phe_Clean[keep_inds,phenotype_name]; names(y.used) = keep_inds
covs.tmp = as.data.frame(Phe_Clean[keep_inds,Phe_covs_tmp]); rownames(covs.tmp) = keep_inds; colnames(covs.tmp) = Phe_covs_tmp
x_mat.used = covs_matrix(data = covs.tmp, new_covs = Phe_covs_tmp, covariates_types=covariates_types)
res.used = residuals(lm(y.used ~ x_mat.used[,-1])) # Method1: Using covariates matrix #
# residual_data = cbind(phe=y.used,covs.tmp)
# for(covs_order in 1:length(Phe_covs_tmp)){ if(covariates_types[Phe_covs_tmp[covs_order]] == "f"){ residual_data[,Phe_covs_tmp[covs_order]] = as.factor(residual_data[,Phe_covs_tmp[covs_order]]) }}
# res.used = residuals(lm(paste("phe ~ ",paste(Phe_covs_tmp,collapse=" + "),sep=""),data=residual_data)) # Method2: Using factored covariates #
Phe_Res[,phenotype_name] = NA; Phe_Res[names(res.used),phenotype_name] = res.used
}
write.table(Phe_Res,file="../3_PheRes/Phe_Res.txt",row.names=F,col.names=T,quote=F)
setwd(paste(outdir,"/1_PheGen/3_PheRes",sep=""))
if(Phe_HistogramPlot){ Plot_Phe_res = mclapply((covariates_sum+2):length(PheList),Plot_Phe_Histogram,Phe_data=Phe_Res,Phe_names=PheList,mc.cores=num_nodes) }
}
if(Phe_NormDone == T){
Phe_Nor = Phe_Res
write.table(Phe_Nor,file="../4_PheNor/Phe_Nor.txt",row.names=F,col.names=T,quote=F)
setwd(paste(outdir,"/1_PheGen/4_PheNor",sep=""))
if(Phe_HistogramPlot){ Plot_Phe_nor = mclapply((covariates_sum+2):length(PheList),Plot_Phe_Histogram,Phe_data=Phe_Nor,Phe_names=PheList,mc.cores=num_nodes) }
}else{
# normPvals = sapply(PheList[-c(1:(covariates_sum+1))],function(x){
# if(length(unique(Phe_Res[,x])[!is.na(unique(Phe_Res[,x]))])>1){
# return(shapiro.test(Phe_Res[,x])$p)
# }else{
# return(NA)} # [4-1] Skip phenotypes which just contain one unique value #
# })
Phe_Nor = Phe_Res
for(i in (covariates_sum+2):length(PheList)){
# if(!is.na(normPvals[i-(covariates_sum+1)])){
# if(normPvals[i-(covariates_sum+1)] < 1e-8){ # [4-2] Skip phenotypes with shapiro.test < 1e-8 #
if(Normal_method == "LOG2"){ Phe_Nor[,PheList[i]] = log2_norm(Phe_Res[,PheList[i]]) }
if(Normal_method == "QUANTILE"){ Phe_Nor[,PheList[i]] = quantile_norm(Phe_Res[,PheList[i]]) }
# }
# }
}
write.table(Phe_Nor,file="../4_PheNor/Phe_Nor.txt",row.names=F,col.names=T,quote=F)
setwd(paste(outdir,"/1_PheGen/4_PheNor",sep=""))
if(Phe_HistogramPlot){ Plot_Phe_nor = mclapply((covariates_sum+2):length(PheList),Plot_Phe_Histogram,Phe_data=Phe_Nor,Phe_names=PheList,mc.cores=num_nodes) }
}
try({
setwd(paste(outdir,"/1_PheGen/0_Data",sep=""))
MAF_info = read.header("Data_clean.frq")
Plot_Phe_Histogram(1,as.data.frame(MAF_info[,"MAF"]),paste(Input_name,"_MAF",sep=""))
})
try({
if(dim(Clean_map)[1] <= 100000){
setwd(paste(outdir,"/1_PheGen/0_Data",sep=""))
system("awk '{if($1!=0 && $4!=0) print $2}' Data_clean.bim > LD_tmp.txt")
system("plink --noweb --silent --bfile Data_clean --extract LD_tmp.txt --recode --out LD_tmp")
system("plink --noweb --silent --bfile Data_clean --r2 --ld-window 99999 --ld-window-r2 0 --out LD_tmp")
map_tmp = read("Data_clean.bim")
dat_tmp = read.header("LD_tmp.ld"); system("rm LD_tmp*")
distance = abs(dat_tmp[,5] - dat_tmp[,2])
R2 = dat_tmp[,7]
Beta.st=c(beta1=0.00001)
Beta.nonlinear = nls(R2~1/(1+4*beta1*distance),start=Beta.st,control=nls.control(maxiter=500))
tt=summary(Beta.nonlinear)
Beta.est = tt$parameters[1]
Beta.error = tt$parameters[2]
mkrs = c(0,1e3,2e3,4e3,8e3,12e3,16e3,20e3,40e3,80e3,120e3,160e3,200e3,300e3,460e3,800e3,900e3,1000e3)
D.est = 1/(1+4*Beta.est*mkrs)
png(paste("LD_decay_",Input_name,".png",sep=""),width=10000,height=8000,res=600,type="cairo")
par(mai=c(1.4,1.4,1.6,0.3),mgp=c(4.5,2,0))
plot(log2(mkrs),D.est,col="black",type="l",ylim=c(0,1),ylab=expression(Predicted~LD~r^2),xaxt="n",xlab="Physical distance(kb)",main=paste("LD Decay: ",Input_name," (",nrow(map_tmp)," SNPs)",sep=""),lwd=3,lty=1,cex.axis=2,cex.lab=2,cex.main=3)
axis(side=1,at=log2(mkrs),labels=mkrs/1000,cex.axis=2)
abline(h=0.3,lty=3,col="black",lwd=3)
dev.off(); rm(map_tmp,dat_tmp)
}
})
try({
if(dim(Clean_map)[1] <= 100000){
setwd(paste(outdir,"/1_PheGen/0_Data",sep=""))
system("plink --noweb --silent --bfile Data_clean --cluster --matrix --out IBS_tmp")
mibs_raw = read("IBS_tmp.mibs"); system("rm IBS_tmp*")
plotdat_mibs = mibs_raw[lower.tri(mibs_raw, diag = FALSE)]
Plot_Phe_Histogram(1,as.data.frame(plotdat_mibs),paste(Input_name,"_IBS_IndPairs",sep=""))
}
})
setwd(paste(outdir,"/1_PheGen/0_Data",sep=""))
if(Kinship_type == "GenABEL"){
if(!file.exists("Data_clean.tped")){system("plink --noweb --silent --bfile Data_clean --transpose --recode --out Data_clean")}
convert.snp.tped(tpedfile = "Data_clean.tped", tfamfile = "Data_clean.fam", outfile = "Data_clean.raw")
dat_clean = load.gwaa.data(phenofile= "Data_clean.phe", genofile= "Data_clean.raw", force =TRUE, makemap = FALSE, sort = TRUE, id = "id") # Note2: The colnames of inds is "id" #
save(dat_clean,file="Data_clean.ABEL.dat")
system("plink --noweb --silent --bfile Data_clean --indep-pairwise 50 10 0.5 --out tmp_genabel")
prunedSNP = read("tmp_genabel.prune.in")[,1]
prunedSNP = as.character(prunedSNP[prunedSNP %in% Clean_map[,2]])
gkin = ibs(dat_clean[,prunedSNP],weight="freq")
write.table(gkin,file=paste(outdir,"/1_PheGen/0_Data/Data_clean_kinship_GenABEL.txt",sep=""),row.names=F,col.names=F,quote=F)
system("rm tmp_*")
}
if(Kinship_type == "EMMA"){
if(!file.exists("tmp_emma.raw")){system("plink --noweb --silent --bfile Data_clean --recodeA --out tmp_emma")}
emma_snp1 = t(read.header("tmp_emma.raw")); colnames(emma_snp1) = emma_snp1["IID",]
emma_snp2 = as.matrix(emma_snp1[-c(1:6),])
emma_snp2 = gsub("1","0.5",emma_snp2); emma_snp2 = gsub("2","1",emma_snp2)
emma_snp3 = data.matrix(as.data.frame(emma_snp2), rownames.force = NA)
emma_kinship_A = gkin = emma.kinship(emma_snp3, method="additive", use = "complete.obs") # Note3: emma_snp3 must be (1) matrix (2) numeric elements (3) AA:0;AB:0.5;BB:1 #
#emma_kinship_D = emma.kinship(emma_snp3, method="dominant", use = "complete.obs")
#emma_kinship_R = emma.kinship(emma_snp3, method="recessive", use = "complete.obs")
write.table(emma_kinship_A,file=paste(outdir,"/1_PheGen/0_Data/Data_clean_kinship_EMMA.Additive.txt",sep=""),row.names=F,col.names=F,quote=F)
system("rm tmp_*")
}
if(Kinship_type == "EMMAX"){
if(!file.exists("tmp_emmax.tped")){system("plink --noweb --silent --bfile Data_clean --recode12 --output-missing-genotype 0 --transpose --out tmp_emmax")}
system("emmax-kin -v -h -s -d 10 tmp_emmax") # *.hIBS, IBS matrix #
system("mv tmp_emmax.hIBS.kinf Data_clean_kinship_EMMAX.hIBS.txt")
#system("emmax-kin -v -h -d 10 tmp_emmax") # *.BN, BN matrix #
#system("mv tmp_emmax.hBN.kinf Data_clean_kinship_EMMAX.hBN.txt")
gkin = read("Data_clean_kinship_EMMAX.hIBS.txt")
system("rm tmp_*")
}
if(Kinship_type == "GEMMA"){
system("sed -i s/-9/1/g Data_clean.fam")
system("gemma -bfile Data_clean -gk 1 -o Data_clean_kinship_GEMMA") # *.cXX, centered matrix #
system("mv output/Data_clean_kinship_GEMMA.cXX.txt ./")
#system("gemma -bfile Data_clean -gk 2 -o Data_clean_kinship_GEMMA") # *.sXX, standardized matrix #
#system("mv output/Data_clean_kinship_GEMMA.sXX.txt ./")
gkin = read("Data_clean_kinship_GEMMA.cXX.txt")
system("rm -r output"); system("rm tmp_*")
}
if(Kinship_type == "GCTA"){
system(paste("gcta64 --bfile Data_clean --thread-num ",num_nodes," --autosome --make-grm-gz --out tmp_gcta_add",sep=""))
tmp_gcta = read.table("tmp_gcta_add.grm.gz",header=F); tmp_gcta_id = read("tmp_gcta_add.grm.id")
tmp_gcta_matrix = matrix(0,nrow(tmp_gcta_id),nrow(tmp_gcta_id))
for(gcta_id in 1:nrow(tmp_gcta_matrix)){
tmp_gcta_matrix[gcta_id,1:gcta_id] = subset(tmp_gcta,tmp_gcta[,1]==gcta_id)[,4]
}
tmp_gcta_matrix[upper.tri(tmp_gcta_matrix)] = t(tmp_gcta_matrix)[upper.tri(tmp_gcta_matrix)]
gkin = tmp_gcta_matrix
write.table(tmp_gcta_matrix,file=paste(outdir,"/1_PheGen/0_Data/Data_clean_kinship_GCTA.Additive.txt",sep=""),row.names=F,col.names=F,quote=F)
system("rm tmp_*")
}
if(Kinship_type == "GCTA_ad"){
system(paste("gcta64 --bfile Data_clean --thread-num ",num_nodes," --autosome --make-grm-gz --out tmp_gcta_add",sep=""))
tmp_gcta = read.table("tmp_gcta_add.grm.gz",header=F); tmp_gcta_id = read("tmp_gcta_add.grm.id")
tmp_gcta_matrix = matrix(0,nrow(tmp_gcta_id),nrow(tmp_gcta_id))
for(gcta_id in 1:nrow(tmp_gcta_matrix)){
tmp_gcta_matrix[gcta_id,1:gcta_id] = subset(tmp_gcta,tmp_gcta[,1]==gcta_id)[,4]
}
tmp_gcta_matrix[upper.tri(tmp_gcta_matrix)] = t(tmp_gcta_matrix)[upper.tri(tmp_gcta_matrix)]
gkin = tmp_gcta_matrix
write.table(tmp_gcta_matrix,file=paste(outdir,"/1_PheGen/0_Data/Data_clean_kinship_GCTA.Additive.txt",sep=""),row.names=F,col.names=F,quote=F)
system(paste("gcta64 --bfile Data_clean --thread-num ",num_nodes," --autosome --make-grm-d-gz --out tmp_gcta_dom",sep=""))
tmp_gcta = read.table("tmp_gcta_dom.d.grm.gz",header=F); tmp_gcta_id = read("tmp_gcta_dom.d.grm.id")
tmp_gcta_matrix = matrix(0,nrow(tmp_gcta_id),nrow(tmp_gcta_id))
for(gcta_id in 1:nrow(tmp_gcta_matrix)){
tmp_gcta_matrix[gcta_id,1:gcta_id] = subset(tmp_gcta,tmp_gcta[,1]==gcta_id)[,4]
}
tmp_gcta_matrix[upper.tri(tmp_gcta_matrix)] = t(tmp_gcta_matrix)[upper.tri(tmp_gcta_matrix)]
gkin = tmp_gcta_matrix
write.table(tmp_gcta_matrix,file=paste(outdir,"/1_PheGen/0_Data/Data_clean_kinship_GCTA.Dominant.txt",sep=""),row.names=F,col.names=F,quote=F)
system("rm tmp_*")
}
if(Kinship_type == "HOMEBREW_AFW"){
system("plink --noweb --silent --bfile Data_clean --recode --out tmp_matrix")
Chip_map <- read("Data_clean.bim"); Chip_ped <- read.ped("tmp_matrix.ped")
system("rm tmp_matrix*")
ped_allele_merge <- function(i){
x = Chip_ped[,(i*2+5):(i*2+6)]
x_merge = paste(x[,1],x[,2],sep="")
return(x_merge)
}
Chip_ped_merge = do.call('cbind',mclapply(1:((dim(Chip_ped)[2]-6)/2),ped_allele_merge,mc.cores=num_nodes))
Chip_ped_merge = cbind(Chip_ped[,1:6],Chip_ped_merge)
colnames(Chip_ped_merge) = c("FID","IID","PID","MID","sex","phe",as.character(Chip_map[,2]))
Chip_ped_merge_used = Chip_ped_merge[,7:dim(Chip_ped_merge)[2]]
Chip_freq = read.header("Data_clean.frq") # Note: Here we use the minor allele A1 as the reference allele (as well as frequency) #
rownames(Chip_freq) = Chip_freq[,2]
X_AFW_Calculate <- function(j){
if(Kinship_model == "Additive"){
g = NA; p = Chip_freq[colnames(X)[j],"MAF"]
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A1"],Chip_freq[colnames(X)[j],"A1"],sep="")){ g = 2 } # Recode A1A2 as 2 #
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A1"],Chip_freq[colnames(X)[j],"A2"],sep="")){ g = 1 } # Recode A1A2 as 1 #
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A2"],Chip_freq[colnames(X)[j],"A2"],sep="")){ g = 0 } # Recode A2A2 as 0 #
if(is.na(g)){ X_OneInd_tmp = NA } else { x = (g-2*p)/(sqrt(2*p*(1-p))); X_OneInd_tmp = x }
}
if(Kinship_model == "Dominant"){
g = NA; p = Chip_freq[colnames(X)[j],"MAF"]
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A1"],Chip_freq[colnames(X)[j],"A1"],sep="")){ g = (4*p)-2 } # Recode A1A2 as 4p-2 #
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A1"],Chip_freq[colnames(X)[j],"A2"],sep="")){ g = 2*p } # Recode A1A2 as 2p #
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A2"],Chip_freq[colnames(X)[j],"A2"],sep="")){ g = 0 } # Recode A2A2 as 0 #
if(is.na(g)){ X_OneInd_tmp = NA } else { x = (g-2*p*p)/(2*p*(1-p)); X_OneInd_tmp = x }
}
return(X_OneInd_tmp)
}
Kinship_model = "Additive"; X = matrix(0,dim(Chip_ped_merge_used)[1],dim(Chip_ped_merge_used)[2])
rownames(X) = paste(Chip_ped_merge[,1],Chip_ped_merge[,2],sep="_"); colnames(X) = colnames(Chip_ped_merge)[-c(1:6)]
for(i in 1:(dim(X)[1])){ X[i,] = do.call('cbind',mclapply(1:(dim(Chip_ped_merge_used)[2]), X_AFW_Calculate, mc.cores=num_nodes))[1,] }
SNP_num = dim(X)[2]; K = (X%*%t(X))/SNP_num
write.table(K,file=paste(outdir,"/1_PheGen/0_Data/Data_clean_kinship_AFW.Additive.txt",sep=""),row.names=F,col.names=F,quote=F)
Kinship_model = "Dominant"; X = matrix(0,dim(Chip_ped_merge_used)[1],dim(Chip_ped_merge_used)[2])
rownames(X) = paste(Chip_ped_merge[,1],Chip_ped_merge[,2],sep="_"); colnames(X) = colnames(Chip_ped_merge)[-c(1:6)]
for(i in 1:(dim(X)[1])){ X[i,] = do.call('cbind',mclapply(1:(dim(Chip_ped_merge_used)[2]), X_AFW_Calculate, mc.cores=num_nodes))[1,] }
SNP_num = dim(X)[2]; K = (X%*%t(X))/SNP_num
write.table(K,file=paste(outdir,"/1_PheGen/0_Data/Data_clean_kinship_AFW.Dominant.txt",sep=""),row.names=F,col.names=F,quote=F)
}
if(Kinship_type == "HOMEBREW_AS"){
system("plink --noweb --silent --bfile Data_clean --recode --out tmp_matrix")
Chip_map <- read("Data_clean.bim"); Chip_ped <- read.ped("tmp_matrix.ped")
system("rm tmp_matrix*")
ped_allele_merge <- function(i){
x = Chip_ped[,(i*2+5):(i*2+6)]
x_merge = paste(x[,1],x[,2],sep="")
return(x_merge)
}
Chip_ped_merge = do.call('cbind',mclapply(1:((dim(Chip_ped)[2]-6)/2),ped_allele_merge,mc.cores=num_nodes))
Chip_ped_merge = cbind(Chip_ped[,1:6],Chip_ped_merge)
colnames(Chip_ped_merge) = c("FID","IID","PID","MID","sex","phe",as.character(Chip_map[,2]))
Chip_ped_merge_used = Chip_ped_merge[,7:dim(Chip_ped_merge)[2]]
Chip_freq = read.header("Data_clean.frq") # Note: Here we use the minor allele A1 as the reference allele, as well as frequency #
rownames(Chip_freq) = Chip_freq[,2]
X_AS_Calculate <- function(j){
if(Kinship_model == "Additive"){
g = NA; p = Chip_freq[colnames(X)[j],"MAF"]
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A1"],Chip_freq[colnames(X)[j],"A1"],sep="")){ g = 2 } # Recode A1A2 as 2 #
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A1"],Chip_freq[colnames(X)[j],"A2"],sep="")){ g = 1 } # Recode A1A2 as 1 #
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A2"],Chip_freq[colnames(X)[j],"A2"],sep="")){ g = 0 } # Recode A2A2 as 0 #
X_OneInd_tmp = g
}
if(Kinship_model == "Dominant"){
g = NA; p = Chip_freq[colnames(X)[j],"MAF"]
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A1"],Chip_freq[colnames(X)[j],"A1"],sep="")){ g = (4*p)-2 } # Recode A1A2 as 4p-2 #
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A1"],Chip_freq[colnames(X)[j],"A2"],sep="")){ g = 2*p } # Recode A1A2 as 2p #
if(Chip_ped_merge_used[i,j] == paste(Chip_freq[colnames(X)[j],"A2"],Chip_freq[colnames(X)[j],"A2"],sep="")){ g = 0 } # Recode A2A2 as 0 #
X_OneInd_tmp = g
}
return(X_OneInd_tmp)
}
Kinship_model = "Additive"; X = matrix(0,dim(Chip_ped_merge_used)[1],dim(Chip_ped_merge_used)[2])
rownames(X) = paste(Chip_ped_merge[,1],Chip_ped_merge[,2],sep="_"); colnames(X) = colnames(Chip_ped_merge)[-c(1:6)]
for(i in 1:(dim(X)[1])){ X[i,] = scale( do.call('cbind',mclapply(1:(dim(Chip_ped_merge_used)[2]), X_AS_Calculate, mc.cores=num_nodes))[1,] )[,1] }
SNP_num = dim(X)[2]; K = (X%*%t(X))/SNP_num
write.table(K,file=paste(outdir,"/1_PheGen/0_Data/Data_clean_kinship_AS.Additive.txt",sep=""),row.names=F,col.names=F,quote=F)
Kinship_model = "Dominant"; X = matrix(0,dim(Chip_ped_merge_used)[1],dim(Chip_ped_merge_used)[2])
rownames(X) = paste(Chip_ped_merge[,1],Chip_ped_merge[,2],sep="_"); colnames(X) = colnames(Chip_ped_merge)[-c(1:6)]
for(i in 1:(dim(X)[1])){ X[i,] = scale( do.call('cbind',mclapply(1:(dim(Chip_ped_merge_used)[2]), X_AS_Calculate, mc.cores=num_nodes))[1,] )[,1] }
SNP_num = dim(X)[2]; K = (X%*%t(X))/SNP_num
write.table(K,file=paste(outdir,"/1_PheGen/0_Data/Data_clean_kinship_AS.Dominant.txt",sep=""),row.names=F,col.names=F,quote=F)
}
setwd(paste(outdir,"/1_PheGen/0_Data",sep=""))
trait2analy <- colnames(Phe_Clean)[-c(1:(covariates_sum+1))] # phe_name #
dat2analy <- as.matrix(Phe_Clean[,trait2analy]) # phe_data #
traitSummary <- matrix(0,length(trait2analy),4) # output_results #
colnames(traitSummary) <- c("Mean","SD","N","h2")
rownames(traitSummary) <- trait2analy
traitSummary[,"Mean"] <- apply(dat2analy,2,mean,na.rm=T)
traitSummary[,"SD"] <- apply(dat2analy,2,sd, na.rm=T)
traitSummary[,"N"] <- apply(dat2analy,2,function(x){length(which(!is.na(x)))})
traitSummary = cbind("Traits"=rownames(traitSummary), traitSummary)
setwd(paste(outdir,"/1_PheGen/0_Data",sep="")); try({system("rm *.log *.hh")})
write.table(traitSummary,file=paste(outdir,"/1_PheGen/Data_clean_TraitSum.txt",sep=""),row.names=F,col.names=T,quote=F)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.