##read data source("./R/qcFun.R") dataList=c("./data/2019_protein_normal.txt", "./data/2019_protein_normal_impute.txt", "./data/2019_protein_normal_impute_combat.txt", "./data/2019_protein_no_normal.txt", "./data/2019_protein_no_normal_combat.txt", "./data/2019_protein_no_normal_impute.txt", "./data/2019_protein_no_normal_impute_combat.txt" ) proFile=dataList[3] proFile="2018-05-21pppb_prot.txt" prot=read.table(proFile, as.is = T, header = T) rownames(prot)=prot[,1] prot=prot[,-c(1,2)] PPP0=read.csv("PPPB1_lineup.csv", as.is = T, header = T) PPP1=PPP0 PPP1$PPPA_ID=tolower(PPP1$PPPA_ID) datmp=t(prot) ###line up protein matrix and datmp exDat=matrix(0, nrow(datmp), ncol(datmp)) rID=matrix(0, nrow(datmp), 1) for (i in 1:nrow(PPP1)) { idx=which(rownames(datmp) == PPP1$PPPA_ID[i]) rID[i,1]=idx exDat[i,] = datmp[idx,] } rownames(exDat) = PPP1$PPPA_ID[rID[,1]] colnames(exDat) = rownames(prot) #evaluate missing value missCut=0.8 misD=matrix(1, nrow(exDat), 4) for(i in 1:nrow(misD)) { naIdx=which(is.na(exDat[i,])) if (length(naIdx) < ncol(exDat) ) { misD[i, 1] = length(naIdx)/ncol(exDat) misD[i, 2] = min(datmp[i,-naIdx]) misD[i, 3] = median(exDat[i, -naIdx]) misD[i, 4] = mean(exDat[i, -naIdx]) } } blankInd = which(misD[,1]> missCut) hist(misD[,1], breaks = 50, main="Missing per sample", xlab="Missing rate") if(length(blankInd)> 0) { exDat=exDat[-blankInd,] PPP1=PPP1[-blankInd,] } ###quantile normalization ###https://en.wikipedia.org/wiki/Quantile_normalization library(preprocessCore) #exDat1=exDat #qdat=t(normalize.quantiles(t(exDat1))) qdat=exDat rownames(qdat)=rownames(exDat) colnames(qdat)=colnames(exDat) for(i in 1:ncol(datmp)) { idx=which(rownames(qdat) == rownames(datmp)[i]) if(length(idx) > 0) { datmp[i,] = qdat[idx,] } } write.table(t(datmp), "Apr_PPP1.txt", row.names = T, col.names = T, quote=F)
###proQC package qdat=t(read.table("Apr_PPP1.txt", as.is = T, header = T)) #row for individual, column for protein PPP1=read.csv("PPPB1_lineup.csv", as.is = T, header = T) indM=array(0, nrow(qdat)) for(i in 1:nrow(qdat)) { indM[i]=length(which(is.na(qdat[i,]))) } idx=which(indM/ncol(qdat) > 0.8) if(length(idx) > 0) { qdat=qdat[-idx,] PPP1=PPP1[-idx,] }
######Slide 1: density plot print("Slide 1: A-AQUA, C-CTRL, N-Normal, T-Tumor") proDensityPlot(qdat, PPP1$Tissue)
Quality score is defined as $$Q=\frac{\sum_{i=1}^\tilde{M}\tilde{s}i(k-\frac{n_o}{n})^c}{\sum{i=1}^Ms_i}$$ in which $\tilde{s}i=[y_i-(E(y_i|n{o_i})+T_{\alpha/M}\sigma_i)]^2$ if $y_i > E(y_i|n_{o_i})$ or $\tilde{s}i=[y_i-(E(y_i|n{o_i})-T_{\alpha/M}\sigma_i)]^2$ if $y_i < E(y_i|n_{o_i})$. In contrast, $s_i=[y_i-E(y_i|n_{o_i})]^2$.
$T_{\alpha/M}$ is the z-score given the p-value cutoff $\alpha/M$, and $\alpha$ can be for example 0.01 or 0.05.
$E(y_i|n_{o_i})$ can be predicted from the fitted spline.
$\alpha$ determins the confidence interval of the spline. The smaller the $\alpha$, the larger the interval and the smaller $\tilde{M}$ till zero.
$K$, between 0 and 1, indicates the direction of the penalty. When it is 1, the penalty is more weighted on the proteins with fewer observations, and nearly no penalty at all for proteins without missing values.
$C$ is the penalty for the score.
In summary, the quality score is upon $M$, $\alpha$, $K$, and $C$.
#####Slide 2: Missing plot print("Slide 2: Qscore") Qalpha=0.05 TAG=c("A", "C", "N", "T") indMPro=list(qdat[PPP1$Tissue==TAG[1],], qdat[PPP1$Tissue==TAG[2],], qdat[PPP1$Tissue==TAG[3],], qdat[PPP1$Tissue==TAG[4],]) layout(matrix(1:(length(TAG)+1), 1, (length(TAG)+1))) QStest=array(0, dim=c(6, length(TAG))) colnames(QStest)=TAG for(i in 1:length(TAG)) { refAmean=colMeans(indMPro[[i]], na.rm = T) refObsPro=array(1, dim=length(refAmean)) for(j in 1:length(refAmean)) { if(length(which(!is.na(indMPro[[i]][,j])))>0) { refObsPro[j] = length(which(!is.na(indMPro[[i]][,j]))) } } idNA=which(is.na(refAmean)) if(length(idNA)>0) { ss=smooth.spline(1-refObsPro[-idNA]/nrow(indMPro[[i]]), refAmean[-idNA], df=5) err=qnorm(1-Qalpha/length(refAmean[-idNA])/2)*sd(ss$x) } else { ss=smooth.spline(1-refObsPro/nrow(indMPro[[i]]), refAmean, df=5) err=qnorm(1-Qalpha/length(refAmean)/2)*sd(ss$x) } Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=0, C=0.5) QStest[1,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T) Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=1, C=0.5) QStest[2,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T) Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=0, C=1) QStest[3,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T) Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=1, C=1) QStest[4,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T) Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=0, C=2) QStest[5,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T) Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=1, C=2) QStest[6,i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T) } print(QStest) layout(matrix(1:2, 1, 2)) barplot(main="K=0 (remove low missing rate)", QStest[c(1,3,5),], beside = T, border = NA, ylim=c(0, max(QStest)*1.2)) barplot(main="K=1 (remove high missing rate)", QStest[c(2,4,6),], beside = T, border = NA, ylim=c(0, max(QStest)*1.2)) legend("topright", legend = paste("C=", c(0.5, 1, 2)), pch=15, col=c("grey10", "grey50", "grey80"))
##QS Qalpha=0.05 TAG=c("A", "C", "N", "T") indMPro=list(qdat[PPP1$Tissue==TAG[1],], qdat[PPP1$Tissue==TAG[2],], qdat[PPP1$Tissue==TAG[3],], qdat[PPP1$Tissue==TAG[4],]) layout(matrix(1:(length(TAG)+1), 1, (length(TAG)+1))) qdatAC=qdat[PPP1$Tissue==TAG[1] | PPP1$Tissue==TAG[2],] refAmean=colMeans(qdatAC, na.rm = T) refObsPro=array(1, dim=length(refAmean)) for(i in 1:length(refAmean)) { if(length(which(!is.na(qdatAC[,i])))>0) { refObsPro[i] = length(which(!is.na(qdatAC[,i]))) } } idNA=which(is.na(refAmean)) plot(bty='n', main="A+C", xlab="Missing rate", ylab="Expression", xlim=c(0, 1), ylim=c(0, 23), 1-refObsPro[-idNA]/nrow(qdatAC[-idNA,]), refAmean[-idNA], cex=0.5,pch=16, col="grey") ssAC=smooth.spline(1-refObsPro[-idNA]/nrow(qdatAC[-idNA,]), refAmean[-idNA], df=5) err=qnorm(1-Qalpha/length(refAmean)/2)*sd(ssAC$x) lines(ssAC, col="green", lty=1, lwd=3) lines(ssAC$x, ssAC$y+err, col="green", lty=3, lwd=2) lines(ssAC$x, ssAC$y-err, col="green", lty=3, lwd=2) Qscore=array(0, dim=length(TAG)) for(i in 1:length(TAG)) { refAmean=colMeans(indMPro[[i]], na.rm = T) refObsPro=array(1, dim=length(refAmean)) for(j in 1:length(refAmean)) { if(length(which(!is.na(indMPro[[i]][,j])))>0) { refObsPro[j] = length(which(!is.na(indMPro[[i]][,j]))) } } idNA=which(is.na(refAmean)) ss=smooth.spline(1-refObsPro[-idNA]/nrow(indMPro[[i]]), refAmean[-idNA], df=5) err=qnorm(1-Qalpha/length(refAmean)/2)*sd(ss$x) Qs=QScore(refAmean, refObsPro, nrow(indMPro[[i]]), Qpval = Qalpha, K=1, C=2) Qscore[i] = mean(abs(Qs[,6]), na.rm = T)/mean(Qs[,4],na.rm = T) plot(bty='n', main=paste(TAG[i], ": Qscore=", format(1-Qscore[i], digits = 3)), xlab="Missing rate", ylab="Expression", xlim=c(0, 1), ylim=c(0, 23), 1-refObsPro/nrow(indMPro[[i]]), refAmean,cex=0.5,pch=16, col="grey") if(i==1) { ssA=smooth.spline(1-refObsPro[-idNA]/nrow(indMPro[[i]]), refAmean[-idNA], df=5) } lines(ssA, col="blue", lty=2, lwd=3) lines(ss, col=ifelse(i!=1, "red", "blue"), lty=1, lwd=3) lines(ss$x, ss$y+err, col=ifelse(i!=1, "red", "blue"), lty=3, lwd=1) lines(ss$x, ss$y-err, col=ifelse(i!=1, "red", "blue"), lty=3, lwd=1) }
alpha=0.05 beta=0.2 qdatT=list(a=qdat[PPP1$Tissue==TAG[1],], c=qdat[PPP1$Tissue==TAG[2],], n=qdat[PPP1$Tissue==TAG[3],], t=qdat[PPP1$Tissue==TAG[4],]) pT=matrix(NA, ncol(qdat), 7) missT=matrix(NA, ncol(qdat), 7) naCnt=matrix(NA, ncol(qdat), 2) for(i in 1:nrow(pT)) { naCnt[i,1]=naN=length(which(!is.na(qdatT$n[,i]))) naCnt[i,2]=naT=length(which(!is.na(qdatT$t[,i]))) if(naN > 1 && naT > 1) { tt=t.test(qdatT$t[,i], qdatT$n[,i]) pT[i,c(1,2)] = c(tt$statistic, tt$parameter) pT[i,3]=-1*pt(abs(pT[i,1]), pT[i,2], lower.tail=FALSE, log.p=T)/log(10) - log10(2) pT[i,c(4,5)]=tt$estimate pT[i,6]=var(qdatT$n[,i], na.rm = T) pT[i,7]=var(qdatT$t[,i], na.rm = T) mTT=t.test(c(rep(1, naT), rep(0, nrow(qdatT$t))), c(rep(1, naN), rep(0, nrow(qdatT$n)))) missT[i,c(1,2)] = c(mTT$statistic, mTT$parameter) missT[i,3]=-1*pt(abs(missT[i,1]), missT[i,2], lower.tail=FALSE, log.p=T)/log(10) - log10(2) } } # mat=matrix(0, 4, 4) mat[col(mat)<row(mat)]=seq(1,11,2) mat=t(mat) mat[col(mat)<row(mat)]=seq(2,12,2) diag(mat)=13 mat=t(mat) layout(mat) par(mai=c(0.2,0.2,0.2,0.2)) for(i in 1:3) { for(j in (i+1):4) { mT=colMeans(qdatT[[i]], na.rm = T) mN=colMeans(qdatT[[j]], na.rm = T) id=which(!is.na(mT)&!is.na(mN)) mTclean=mT[id] mNclean=mN[id] rkT=order(mTclean) rkN=order(mNclean) plot(main=paste(length(mNclean), TAG[i], "vs", TAG[j]), mNclean[rkT], col=j, cex=0.5, bty='n') points(1:length(mTclean), mTclean[rkT], pch=16, cex=0.5, col=i) plot(main=paste(length(mTclean), TAG[j], "vs", TAG[i]), mTclean[rkN], col=i, cex=0.5, bty='n') points(1:length(mNclean), mNclean[rkN], pch=16, cex=0.5, col=j) } }
## slide 3: gc print("Slide 3: GC control") print("Bartlett test") BTpval=array(1, ncol(qdat)) for(i in 1:length(BTpval)) { ll=array(0, 4) for(j in 1:length(TAG)) { ll[j]=length(which(!is.na(qdat[PPP1$Tissue==TAG[j],i]))) } if(all(ll >= 2)) { BTpval[i] = bartlett.test(qdat[,i], PPP1$Tissue)$p.value } } plot(-log10(BTpval), pch=16, cex=0.5, bty='l') pmat=matrix(NA, 6, ncol(indMPro[[1]])) pIdx=1 layout(matrix(c(1,2,3,8,4,5,7,7,6), 3, 3)) gc=array(0,6) for(i in 1:3) { dn=indMPro[[i]] for(j in (i+1):4) { dm=indMPro[[j]] for(k in 1:ncol(dn)) { if( (length(which(!is.na(dn[,k]))) > 1) & (length(which(!is.na(dm[,k]))) >1)) { ft= var.test(dn[,k], dm[,k], na.rm = T) pmat[pIdx,k] = ft$p.value } } hist(main=paste0(TAG[j], "/", TAG[i]), xlab="p-value (F test)", pmat[pIdx,which(!is.na(pmat[pIdx,]))], breaks = 20, col=pIdx) abline(h=length(which(!is.na(pmat[pIdx,])))/20, lty=2, col="grey") gc[pIdx]=qchisq(median(pmat[pIdx,], na.rm = T), df=1, lower.tail = F)/0.455 legend("topright", bty = 'n', legend = paste("GC=", format(gc[pIdx], digits = 3))) pIdx=pIdx+1 } } pIdx=1 for(i in 1:3) { for(j in (i+1):4) { if(pIdx==1) { qqplot(bty='n', col=pIdx, ylim=c(0,70), xlab="Expected -log10(p)", ylab="Observed -log10(p)", -log10(runif(length(which(pmat[pIdx,]<1)))),-log10(pmat[pIdx,]), pch=16, cex=0.5) abline(a=0, b=1, col="grey", lty=2) } else { points(sort(-log10(runif(length(which(pmat[pIdx,]<1))))),sort(-log10(pmat[pIdx,])), col=pIdx, pch=16,cex=0.5) } pIdx=pIdx+1 } } legend("topleft", bty='n', legend = paste0("GC=", format(gc, digits=3)), pch=15, col=1:6)
#generate correlation matrix print("Slide 4: PCA") pCorM=proCorMatrix(qdat) ####### pEg=eigen(pCorM) #barplot(pEg$values[1:20], border = F) eVec=apply(pEg$vectors[,c(1,2)], 2, scale) eM=list("e1"=matrix(0, 4, 3), "e2"=matrix(0,4,3)) for(i in 1:2) { for(j in 1:length(TAG)) { eM[[i]][j,1]=mean(eVec[PPP1$Tissue==TAG[j], i]) eM[[i]][j,2]=var(eVec[PPP1$Tissue==TAG[j], i]) eM[[i]][j,3]=abs(eM[[i]][j,1])^2+eM[[i]][j,2] } } print(eM[[1]]) print(eM[[2]]) #pEvecPlot(pEg, c(1,2,3,4), COL=as.numeric(as.factor(PPP1$Tissue))) pEvecCat(pEg, PPP1$Tissue, c(1,2)) #tissue #pEvecCat(pEg, PPP1$MS_ID, c(1,2)) #tissue #pcaCatPlot(pEg, 5, PPP1$Tissue) #pcaCatPlot(pEg, 5, PPP1$MS_ID) #pVCA(pEg, 10, PPP1$Tissue) #pVCA(pEg, 10, PPP1$MS_ID)
##### print("Slide 5: var of each eigen vector") SD=matrix(0, 4, ncol(pEg$vectors)) layout(matrix(1:4, 2, 2)) for(i in 1:4) { dt=pEg$vectors[PPP1$Tissue==TAG[i],] SD[i,]=apply(dt, 2, var) } for(i in 1:4) { plot(main=TAG[i], xlab="Eigen index", ylab="Variance", ylim=c(0, max(SD)*1.1), SD[i,], pch=16, cex=0.5, col=i, bty='l') ssL=smooth.spline(seq(1, ncol(SD)), SD[i,], df=7) lines(ssL, lwd=2, lty=2, col="grey") }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.