# SNP heritability
snpH2=function(gen,K=NULL){
if(!is.numeric(gen)) stop("Object gen must be a numeric matrix")
if(!is.null(K)){ if(!is.numeric(K)) stop("Object K must be a numeric matrix") }
anyNA = function(x) any(is.na(x)) # local fallback (base::anyNA exists since R 3.1.0)
# Checking for missing and imputing
if(any(is.na(gen))) gen = IMP(gen)
# Setup
snps = ncol(gen)
obs = nrow(gen)
x = matrix(1,obs,1) # intercept-only fixed-effect design
# If pedigree is not provided
if(is.null(K)){
K = tcrossprod(gen)
K = K/mean(diag(K))}
# Defining log-REML
loglike<-function(theta){
lambda<-exp(theta)
logdt<-sum(log(lambda*delta+1))
h<-1/(lambda*delta+1)
yy<-sum(yu*h*yu)
yx<-matrix(0,q,1)
xx<-matrix(0,q,q)
for(i in 1:q){
yx[i]<-sum(yu*h*xu[,i])
for(j in 1:q){xx[i,j]=sum(xu[,i]*h*xu[,j])}}
loglike = -0.5*logdt-0.5*(n-q)*log(yy-t(yx)%*%solve(xx)%*%yx)-0.5*log(det(xx))
return(-loglike)}
fixed<-function(lambda){
h<-1/(lambda*delta+1)
yy<-sum(yu*h*yu)
yx=timesVec(yu,h,xu,q)
xx=timesMatrix(xu,h,xu,q,q)
beta<-qr.solve(xx,yx)
sigma2<-(yy-t(yx)%*%solve(xx)%*%yx)/(n-q)
sigma2 = as.numeric(sigma2)
var<-diag((solve(xx))*sigma2)
stderr<-sqrt(var)
return(c(beta,stderr,sigma2))}
# Eigendecomposition of K
qq<-eigen(as.matrix(K),symmetric=T)
delta<-qq[[1]]
uu<-qq[[2]]
q<-1
n<-ncol(K)
H2 = c()
# LOOP
pb=txtProgressBar(style=3)
for(i in 1:snps){
y = gen[,i]
yu<-t(uu)%*%y
xu<-t(uu)%*%x
vp<-var(y)
# Finding lambda through optimization
parm<-optim(par=10,fn=loglike,method="L-BFGS-B",lower=log(1e-12),upper=log(1e12)) # bounds on theta = log(lambda)
lambda<-exp(parm$par)
# Results
parmfix<-fixed(lambda)
Ve<-parmfix[2*q+1]
Vg<-lambda*Ve
h2=Vg/(Vg+Ve)
# Saving loop
H2 = c(H2,h2)
setTxtProgressBar(pb,i/snps)}
close(pb)
names(H2)=colnames(gen)
class(H2)="H2"
return(H2)
}
# Plot H2
plot.H2 = function(x,...,chr=NULL){
anyNA = function(x) any(is.na(x))
her = as.numeric(x)
plot(her,xlab="Genome",ylab="Heritability",main="Gene content")
rect(par("usr")[1],par("usr")[3],
par("usr")[2],par("usr")[4],
col=rgb(0.25,0.25,0.25,0.5))
lines(her,lwd=4,pch=20,type="p",col="blue")
lines(her,lwd=1,pch=20,type="p",col="lightblue")
if(!is.null(chr)) abline(v=cumsum(chr[-length(chr)])+0.5,lty=3)
}
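# Example usage (illustrative sketch with simulated data, not part of the original
# source; guarded by if(FALSE) so it does not run when this file is sourced).
# Assumes a 0/1/2 genotype matrix with markers in columns, and that the helpers
# called inside snpH2 but not defined in this listing (IMP, timesVec, timesMatrix)
# are available from the package.
if(FALSE){
  set.seed(1)
  geno = matrix(rbinom(200*50,2,0.3),200,50,dimnames=list(NULL,paste0("snp",1:50)))
  h2 = snpH2(geno)         # kinship built internally from tcrossprod(geno)
  plot(h2, chr=c(20,30))   # dotted lines mark the chromosome boundaries
}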
# Fixation index functions
Fst = function(gen,fam){
anyNA = function(x) any(is.na(x))
# Weir and Cockerham (1984): Unbiased Fst from weighted AOV
if(anyNA(fam)) stop("Missing values are not allowed in fam")
if(anyNA(gen)) gen[is.na(gen)]=1 # replace missing genotypes with the heterozygote
# number of markers and populations
m = ncol(gen)
r = length(unique(fam))
n = table(fam)
nbar = mean(n)
# function to calculate binomial variance
Var = function(snp){p=mean(snp,na.rm=T)/2;return(p*(1-p))}
# function to estimate p and heterozygosity by family
p_i = function(snp,fam) tapply(snp,fam,mean)/2
h_i= function(snp,fam) tapply(snp,fam,function(x)mean(x==1,na.rm=T))
# Whole-Population parameters
C2 = var(n)/(mean(n)^2) # squared CV of sample sizes
nc = nbar*(1-C2/r)
# Parameters by population
M = apply(gen,2,p_i,fam=fam) # observed P by family by allele
N = matrix(n,r,m)
Pbar = colSums(M*N)/(r*nbar) # expected P
eP = t(matrix(Pbar,m,r))
s2 = colSums(N*(M-eP)^2)/((r-1)*nbar) # var over pop
H = apply(gen,2,h_i,fam=fam) # observed heterozygosity
Hbar = colSums(H*N)/(r*nbar)
V = Pbar*(1-Pbar)
# Three variance components: A, B and C
Va = nbar/nc*(s2-(1/(nbar-1))*(V-(r-1)/r*s2-Hbar/4))
Vb = (nbar/(nbar-1))*(V-(r-1)/r*s2-(2*nbar-1)/(4*nbar)*Hbar)
Vc = Hbar/2
# F - statistics!
fst = Va/(Va+Vb+Vc); fst[is.nan(fst)|fst<=0]=0
fit = 1-Vc/(Va+Vb+Vc); fit[is.nan(fit)|fit<=0]=0
fis = 1-Vc/(Vb+Vc); fis[is.nan(fis)|fis<=0]=0
Fstat = data.frame(fst,fit,fis)
class(Fstat) <- "fst"
return(Fstat)
}
# Plot FST
plot.fst = function(x,..., p=NULL,chr=NULL){
anyNA = function(x) any(is.na(x))
class(x)=NULL; FST=x$fst
C = 1 - 0.1 * (4 + 6 * abs(FST)/max(abs(FST)))
if(!is.null(chr)){
plot(FST,...,xaxt = "n",main="Fixation Index",pch=20,xlab="Genome",col=(rgb(C,C,C)))
}else{
plot(FST,...,main="Fixation Index",pch=20,xlab="Genome",col=(rgb(C,C,C)))
}
K = smooth(smooth(FST))
lines(K,type="l",lwd=1,col=rgb(0,0,1,0.6))
if(!is.null(chr)){
# Adding Chromosome in X axis
medians=rep(NA,length(chr))
CHR=c(); for(i in 1:length(chr)) CHR=c(CHR,rep(i,chr[i]))
for(i in 1:length(chr)) medians[i] = median(which(CHR==i))
axis(1, at=round(medians), labels=1:length(medians))
abline(v=cumsum(chr[-length(chr)])+0.5,lty=3)
}
# P-value
if(!is.null(p)){
par(ask=TRUE); CDF = ecdf(FST)
Pval = -log(1-CDF(FST)); THR = -log(p/length(FST))
Pval[!is.finite(Pval)] = max(Pval[is.finite(Pval)]) # cap the top-ranked marker, where the ECDF yields -log(0)
plot(Pval,ylab="Empirical -log probability",
xlab="Genome",main="P-values",pch=20,col=1+1*(Pval>THR))
if(!is.null(chr)) abline(v=cumsum(chr[-length(chr)])+0.5,lty=3)
abline(h=THR,col=rgb(1,0,0,0.4),lty=2);par(ask=FALSE)
KK = ksmooth(1:length(FST),Pval,"normal",bandwidth = 10)
lines(KK,type="h",lwd=2,col=rgb(1,0.5,0,0.6))}
}
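# Example usage (illustrative sketch with simulated data, not part of the original
# source; guarded so it does not run on source). 'fam' is a vector of population
# labels, one per row of the genotype matrix.
if(FALSE){
  set.seed(2)
  geno = matrix(rbinom(300*80,2,0.4),300,80)
  fam  = rep(1:3, each=100)
  FS = Fst(geno, fam)             # per-marker fst, fit and fis
  plot(FS, chr=c(30,50), p=0.05)  # second panel shows empirical -log probabilities
}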
# function for marker quality control
snpQC=function(gen,psy=1,MAF=0.05,misThr=0.8,remove=TRUE,impute=FALSE){
anyNA = function(x) any(is.na(x))
# CHECKING REDUNDANT MARKERS (each column compared with the next one)
gen2=gen; redundancy=c(0); for(i in 1:(ncol(gen)-1)){
a=mean((gen[,i]==gen[,(i+1)]),na.rm=TRUE);redundancy=c(redundancy,a)}
a=which(redundancy>psy);b=length(which(redundancy>psy))
if(b>0){cat("Genotypic data contains",b,"redundant SNPs",'\n')
if(remove==TRUE){gen2=gen[,-a]}
}else{cat("No redundant SNPs found",'\n')}
# FLAG SNPs WITH TOO MUCH MISSING DATA
msnp = apply(gen2,2,function(x) mean(is.na(x)) )
noVal = which(msnp>misThr)
# CHECKING MINOR ALLELE FREQUENCY
if(MAF>0){
Count_maf = function(x){
AA=sum(x==2,na.rm=TRUE)
Aa=sum(x==1,na.rm=TRUE)
aa=sum(x==0,na.rm=TRUE)
Total=AA+Aa+aa
PA=(AA+0.5*Aa)/Total
Pa=(aa+0.5*Aa)/Total
lowerAF=min(PA,Pa)
return(lowerAF)}
LAF = apply(gen2,2,Count_maf)
maf=which(LAF<MAF)
# hist(LAF,col=3,nclass=50,main="Histogram of MAF",xlab="Minor Allele Frequency")
if(length(maf)>0){
cat("There are",length(maf),"markers with MAF below the threshold",'\n')
if(remove==TRUE){
# maf and noVal both index columns of gen2, so drop them in a single step
gen3=gen2[,-unique(c(maf,noVal))]
}else{
gen3=gen2
}
}else{cat("No marker below MAF threshold",'\n')
gen3=gen2
if(length(noVal)>0){gen3=gen3[,-noVal]}
}
}else{gen3=gen2}
# IMPUTATION
if(impute){
rf = function(X) apply(X,2,function(x){x[is.na(x)]=mean(x,na.rm=T);return(x)} )
gen=gen3;gen[gen==5]=NA # the value 5 is treated as an additional missing-data code
k=100*length(which(is.na(gen)))/length(gen)
k=round(k,2);cat(k,"% of missing data",'\n')
cat("Imputations via expectation",'\n')
if(any(is.na(gen))){gen=suppressWarnings(rf(gen));gen=round(gen)}
gen3=gen}
return(gen3)}
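# Example usage (illustrative sketch with simulated data, not part of the original
# source; guarded so it does not run on source): drop low-MAF and high-missingness
# markers, then impute the remaining missing values with the marker mean.
if(FALSE){
  set.seed(3)
  geno = matrix(rbinom(100*60,2,0.2),100,60)
  geno[sample(length(geno),200)] = NA     # introduce some missing calls
  clean = snpQC(geno, MAF=0.05, remove=TRUE, impute=TRUE)
  dim(clean)
}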
# function to remove repeated genotypes
cleanREP = function(y,gen,fam=NULL,thr=0.95){
if(is.vector(y)) y=matrix(y,ncol=1)
if(is.null(fam)) fam = rep(1,nrow(y))
ibs = function(gen){
f1 = function(x,gen) apply(gen,1,function(y,x) mean(y==x,na.rm = T),x=x)
f2 = apply(gen,1,f1,gen=gen)
return(f2)}
GG = function(gen, r = 1) {
a1 = (gen - 1)
a1[a1 == -1] = 0
A1 = (tcrossprod(a1))
a2 = -(gen - 1)
a2[a2 == -1] = 0
A2 = (tcrossprod(a2))
d = round(exp(-abs(gen - 1)))
D = tcrossprod(d)
G = A1 + A2 + D
G = (G/ncol(gen))^r
return(G)
}
cat("solving identity matrix\n")
G = GG(gen)
rownames(G) = 1:nrow(G)
lt = G*lower.tri(G) # lower triang
r = lt>thr # logical matrix flagging repeated genotypes
# starting point of new data
rownames(gen) = 1:nrow(gen)
Ny=y; Nfam=fam; Ngen=gen
# summary
cs = colSums(r) # how many times each genotype is duplicated by later rows
while(any(cs>0)){
i = which(cs>0)[1]
cat("individual",rownames(Ngen)[i],"had",cs[i],'duplicate(s)\n')
w = which(r[,i])
# average the phenotypes of the duplicates into the first copy, then drop the extras
if(ncol(y)>1){Ny[i,] = colMeans(Ny[c(i,w),],na.rm=T)
}else{Ny[i] = mean(Ny[c(i,w)],na.rm=T)}
if(ncol(y)>1){Ny=Ny[-w,]}else{Ny=Ny[-w]}
Nfam=Nfam[-w]
Ngen=Ngen[-w,]
r = r[-w,-w,drop=FALSE] # drop both rows and columns so indices stay aligned
cs = colSums(r)
}
return(list(y=Ny,gen=Ngen,fam=Nfam))
}
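# Example usage (illustrative sketch with simulated data, not part of the original
# source; guarded so it does not run on source): average the phenotypes of
# near-identical genotypes and keep a single copy of each.
if(FALSE){
  set.seed(4)
  geno = matrix(rbinom(50*40,2,0.5),50,40)
  geno = rbind(geno, geno[1:3,])     # add three duplicated genotypes
  y    = rnorm(nrow(geno))
  out  = cleanREP(y, geno, thr=0.95)
  str(out)                           # list with y, gen and fam
}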
# Markov-chain imputation: missing genotypes are filled in from the transition probabilities between neighboring markers
markov=function(gen,chr=NULL){
if(is.null(chr)) chr = ncol(gen)
# vector chr
CHR=NULL;for(i in 1:length(chr)){CHR=c(CHR,rep(i,chr[i]))}
# Expectation and Transition Probability
tr = function(v1,v2){
tp=rep(NA,9) # Transition Probability
tp[1]=mean(v1==0&v2==0,na.rm=T);tp[2]=mean(v1==0&v2==1,na.rm=T);tp[3]=mean(v1==0&v2==2,na.rm=T)
tp[4]=mean(v1==1&v2==0,na.rm=T);tp[5]=mean(v1==1&v2==1,na.rm=T);tp[6]=mean(v1==1&v2==2,na.rm=T)
tp[7]=mean(v1==2&v2==0,na.rm=T);tp[8]=mean(v1==2&v2==1,na.rm=T);tp[9]=mean(v1==2&v2==2,na.rm=T)
tp[tp==0]=1e-5;tp[1:3]=tp[1:3]/sum(tp[1:3]);tp[4:6]=tp[4:6]/sum(tp[4:6]);tp[7:9]=tp[7:9]/sum(tp[7:9])
return(tp)}
# Transition matrix
TM = function(gen){M = ncol(gen); N = nrow(gen)
step1 = rbind(gen[,-M],gen[,-1])
step2 = function(snps) tr(snps[1:N],snps[-c(1:N)])
step3 = apply(step1,2,step2); rm(step1)
rownames(step3) = paste(gl(3,3,9,0:2),0:2,sep='to')
return(step3)}
# Calculate log-prob of transitions
mis = gen; tm=log(TM(gen))
# Imputation with Expectation
IE = function(v1,v2,tp){
exp=rep(NA,3);exp[1]=which.max(tp[1:3])-1;exp[2]=which.max(tp[4:6])-1;exp[3]=which.max(tp[7:9])-1
w=which(is.na(v2));r=v1[w]+1;v=exp[r];v2[w]=v;return(v2)}
# Imputing first row (starting point)
gen[,1] = IE(IE(IE(gen[,4],gen[,3],tm[,3]),gen[,2],tm[,2]),gen[,1],tm[,1])
if(anyNA(gen[,1]))gen[,1][is.na(gen[,1])]=as.numeric(names(which.max(table(gen[,1],exclude=NA))))
# Imputing the rest
for(i in 2:ncol(gen)) gen[,i]=IE(gen[,i-1],gen[,i],tm[,i-1])
# THE END
return(gen)
}
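# Example usage (illustrative sketch with simulated data, not part of the original
# source; guarded so it does not run on source): impute missing 0/1/2 calls from
# the transition probabilities of adjacent markers.
if(FALSE){
  set.seed(5)
  geno = matrix(rbinom(100*40,2,0.4),100,40)
  geno[sample(length(geno),150)] = NA
  imp = markov(geno, chr=c(15,25))
  sum(is.na(imp))   # should be zero after imputation
}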
# LD matrix function
LD = function(gen){
# Phasing via EM
EM=function(A,B,n=10){
Za = paste(A,B,sep="")
Z = matrix(0,3,3)
colnames(Z)=c('A0','A1','A2')
rownames(Z)=c('B0','B1','B2')
# Main diagonal
Z[1,1] = sum(Za=='00')
Z[2,2] = sum(Za=='11')
Z[3,3] = sum(Za=='22')
# Upper diagonal
Z[1,2] = sum(Za=='01')
Z[1,3] = sum(Za=='02')
Z[2,3] = sum(Za=='12')
# Lower diagonal
Z[2,1] = sum(Za=='10')
Z[3,1] = sum(Za=='20')
Z[3,2] = sum(Za=='21')
# Initial guess
COUP = 0.5 ; REPU = 0.5
# Function to estimate haplotypes
HapProb=function(Z,Co,Re){
AB = 2*Z[1,1] + Z[2,1] + Z[1,2] + Co*Z[2,2]
Ab = 2*Z[3,1] + Z[2,1] + Z[3,2] + Re*Z[2,2]
aB = 2*Z[1,3] + Z[1,2] + Z[2,3] + Re*Z[2,2]
ab = 2*Z[3,3] + Z[3,2] + Z[2,3] + Co*Z[2,2]
props = data.frame(AB,Ab,aB,ab)
haps=(data.matrix(props)/2)/sum(Z)
rownames(haps)="Hap"
return(haps)}
HHH=matrix(NA,n,6)
colnames(HHH)=c('AB','Ab','aB','ab','coupling','repulsion')
# Loop
for(i in 1:n){
H=HapProb(Z,COUP,REPU)
H2=matrix(H,2,2)
dia=H2[1,1]*H2[2,2]
off=H2[2,1]*H2[1,2]
tt=dia+off
COUP = dia/tt
REPU = off/tt
oo = c(COUP,REPU)
oo[is.nan(oo)]=0
H[is.nan(H)]=0
HHH[i,]=c(H,oo)
}
rownames(HHH)=1:n
EM=round(tail(HHH,1)[1:4],4)
names(EM)=colnames(HHH)[1:4]
return(EM)
}
# LD values
EM_LD = function(A,B){
phase=EM(A,B)
if(det(matrix(phase,2,2))<0) phase=phase[c(2,1,4,3)]
X = matrix(phase,2,2)
D = X[1,1]-sum(X[,1])*sum(X[1,])
r=D/sqrt(prod(c(colSums(X),rowSums(X))))
return(r^2)
}
# LD matrix
LDmat = function(gen){
n = ncol(gen)
snps = colnames(gen)
LD1 = matrix(1,n,n)
dimnames(LD1) = list(snps,snps)
for(i in 1:n){for(j in 1:n){if(j>i){
ld = as.numeric(EM_LD(gen[,i],gen[,j]))
LD1[i,j]=LD1[j,i]=ld
}}}
return(LD1)}
return(LDmat(gen))
}
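# Example usage (illustrative sketch with simulated data, not part of the original
# source; guarded so it does not run on source): pairwise r2 among a handful of
# markers. The EM phasing makes this O(m^2), so keep the marker count small.
if(FALSE){
  set.seed(6)
  geno = matrix(rbinom(200*10,2,0.5),200,10,dimnames=list(NULL,paste0("snp",1:10)))
  r2 = LD(geno)
  round(r2[1:5,1:5],3)
}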
# Compute genetic distance
Gdist = function(gen,method=1){
# from adegenet{dist.genpop}
nloc = ncol(gen)
recode = function(gen) cbind(gen,(gen-2)*-1)
X = recode(gen)
# NEI
if (method == 1) {
cat("Nei distance\n")
d = tcrossprod(X)
vec = sqrt(diag(d))
d = d/vec[col(d)]
d = d/vec[row(d)]
d = -log(d)
d = as.dist(d)}
# Edwards
else if (method == 2) {
cat("Edwards distance\n")
X2 = sqrt(X)
d = tcrossprod(X2)
d = 1 - d/(nloc*2)
diag(d) = 0
d = sqrt(d)
d = as.dist(d)}
# Reynolds
else if (method == 3){
cat("Reynolds distance\n")
X = X/2
denomi = tcrossprod(X)
vec = rowSums((X)^2)
d = -2 * denomi + vec[col(denomi)] + vec[row(denomi)]
diag(d) = 0
denomi = 2 * (nloc - denomi)
diag(denomi) = 1
d = d/denomi
d = sqrt(d)
d = as.dist(d)}
# Rogers
else if (method == 4) {
cat("Rogers' distance\n")
p = ncol(gen)
d = dist(gen,method = 'euclidean')
d = d/p}
# Prevosti
else if (method == 5){
cat("Prevosti's distance\n")
p = ncol(gen)
d = dist(gen,method = 'manhattan')
d = d/(2*p)}
# MRD
else if (method == 6){
cat("Modified Rogers' distance\n")
p = ncol(gen)
d = dist(gen,method = 'euclidean')
d = d/sqrt(4*p)}
return(d)
}
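# Example usage (illustrative sketch with simulated data, not part of the original
# source; guarded so it does not run on source). Rows are the units being compared
# (individuals or pooled populations), markers coded 0/1/2 in columns; methods 1-6
# correspond to the distances listed above.
if(FALSE){
  set.seed(7)
  geno  = matrix(rbinom(30*100,2,0.3),30,100)
  d_nei = Gdist(geno, method=1)
  d_rog = Gdist(geno, method=4)
  plot(hclust(d_nei))
}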
# G2A Kernels
G2A_Kernels = function(gen){
# Centralizing G2A
Z = apply(gen,2,function(x) x-mean(x,na.rm = T));Z[is.na(Z)]=0
gen[gen==2]=0
W = apply(gen,2,function(x) x-mean(x,na.rm = T));W[is.na(W)]=0
# Additive
ZZ = tcrossprod(Z)
A = ZZ/mean(diag(ZZ))
# Dominant
WW = tcrossprod(W)
D = WW/mean(diag(WW))
# Epistatic additive
ZZ = tcrossprod(Z)^2 - tcrossprod(Z^2)
AA = ZZ/mean(diag(ZZ))
# Epistatic dominant
WW = tcrossprod(W)^2 - tcrossprod(W^2)
DD = WW/mean(diag(WW))
# Epistatic add x dom
AD = A*D
AD = AD/mean(diag(AD))
# List
KS = list('A'=A,'D'=D,'AA'=AA,'DD'=DD,'AD'=AD)
# Return
return(KS)
}
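# Example usage (illustrative sketch with simulated data, not part of the original
# source; guarded so it does not run on source): build additive, dominance and
# epistatic kernels from a 0/1/2 genotype matrix.
if(FALSE){
  set.seed(8)
  geno = matrix(rbinom(150*80,2,0.3),150,80)
  Ks = G2A_Kernels(geno)
  sapply(Ks, function(K) mean(diag(K)))   # each kernel is scaled to mean diagonal 1
}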
# IMPORT DATA
Import_data = function(file,type=NULL){
if(is.null(type)) stop("'type' must be specified as one of the following: 'GBS','HapMap','VCF'")
# GBS
GBS = function(path){
G = read.delim(path)
rownames(G)=G[,1];G=G[,-1]
G[G!="A"&G!="C"&G!="G"&G!="T"]=NA
Recode=function(aa){
t=table(aa)
t[t==0]=NA;A=which.max(t)
a=which.min(t)
A1=names(t)[A]
A2=names(t)[a]
W1=which(aa==A1)
W2=which(aa==A2)
aa=as.numeric(aa)
aa[W1]=0;aa[W2]=2
return(aa)
}
gen=apply(G,2,Recode)
dimnames(gen)=dimnames(G)
gen=data.matrix(gen)
# chr
chr = NULL; for(i in 1:max(G[,3])) chr=c(chr,sum(G[,3]==i))
# return
final = list('gen'=gen,'chr'=chr)
return(final)
}
if(type=='GBS'){
return(GBS(file))
}
# HAPMAP
HAP = function(path){
G = read.delim(path, header=T)
AA=as.character(G[,2])
AA=gsub('/','',AA)
Gen=t(G[,-c(1:11)])
n=nrow(Gen)
m=ncol(Gen)
gen=matrix(NA,n,m)
for(i in 1:m){
A1=strsplit(AA[i],'')[[1]][1]
A2=strsplit(AA[i],'')[[1]][2]
BB=paste(A1,A1,sep='')
Bb=paste(A1,A2,sep='')
bb=paste(A2,A2,sep='')
M=as.character(Gen[,i])
gen[M==BB,i]=2
gen[M==Bb,i]=1
gen[M==bb,i]=0
}
colnames(gen)=paste(G[,3],G[,4],G[,2],sep='.')
# chr
chr = NULL; for(i in 1:max(G[,3])) chr=c(chr,sum(G[,3]==i))
rm(AA,A1,A2,BB,Bb,bb,m,n,Gen)
# return
final = list('gen'=gen,'chr'=chr)
return(final)
}
if(type=='HapMap'){
return(HAP(file))
}
# VCF
VCF = function(path){
G = read.table(path,header = TRUE)
if(any(duplicated(G$POS))) G = G[-which(duplicated(G$POS)),]
rownames(G) = paste(G$CHROM,G$POS,sep='_')
G = G[,-c(1:9)]
tags = dimnames(G)
recode = function(g){
g = as.character(g)
g[g=='1/1']=2
g[g=='0/1']=1
g[g=='1/0']=1
g[g=='0/0']=0
g = as.numeric(g)
return(g)
}
G = apply(G,2,recode)
dimnames(G) = tags
G = t(G)
chr = as.vector(table(gsub('_.+','',tags[[1]])))
final = list('gen'=G,'chr'=chr)
return(final)
}
if(type=='VCF'){
return(VCF(file))
}
}
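# Example usage (illustrative sketch, not part of the original source; the file
# name below is hypothetical and the call is guarded so it does not run on source).
if(FALSE){
  dat = Import_data("genotypes.vcf", type='VCF')
  str(dat$gen)   # samples in rows, markers (CHROM_POS) in columns
  dat$chr        # number of markers per chromosome
}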