#' @title Perform phenotype-enriched features analysis
#'
#' @description Perform phenotype-enriched features analysis
#' @param data
#' @param phenodata
#' @param k
#' @param SavePath
#' @param ShuffleWstat
#' @param BoundarySample
#' @param BoundaryPair
#' @param BoundaryPair
#' @param ShuffleTime
#' @param Uppercent
#' @param DownPercent
#' @return Mid.Matrix
#' @examples
#' @export
pair_find<-function(data=data,phenodata=data.frame(),k="euclidean",SavePath = NULL,ShuffleWstat = NULL,
BoundarySample = NULL,BoundaryPair=NULL,PvalueCutoff=0.05,
Cut_pair=25, method_choose=c("Wilcox","Permutation"),
ShuffleTime=10000,DownPercent = 0.2,Uppercent=0.8){ # colnames(phenodata) = c("id","grp"); grp1 = "Ctrl", grp2 = "Disease"
suppressMessages(library(tidyverse))
#suppressMessages(library(fdrtool))
#suppressMessages(library(qvalue))
phenodata <- phenodata %>% dplyr::arrange(grp) %>% data.frame()
#data must be ordered as grp order
groupnum <- table(phenodata$grp) %>% data.frame()
RAN_num = groupnum[1,2]
RAP_num = groupnum[2,2]
RAN.Samples<-phenodata %>% dplyr::filter(grp=="grp1")
RAP.Samples<-phenodata %>% dplyr::filter(grp=="grp2")
RAN <- data[rownames(data) %in% RAN.Samples$id,] %>% data.frame()
RAP <- data[rownames(data) %in% RAP.Samples$id,] %>% data.frame()
n=dim(data)[1]
num=floor(sqrt(RAN_num+RAP_num))
results_whole=matrix(nrow=0,ncol=(num+1))
for (i in 1:n) {
new<-rbind(data[i,],data)
new_dis<-dist(new,method = k)
new_dis<-as.matrix(new_dis)
wa_d<-sort(new_dis[,1])[3:(num+2)]
new_row<-c(rownames(data)[i],wa_d)
results_whole<-rbind(results_whole,new_row)
}
mean_whole=mean(as.numeric(as.vector(results_whole[,2:(num+1)])))
sd_whole=sd(as.numeric(as.vector(results_whole[,2:(num+1)])))
RAP_knn_num=floor(sqrt(RAP_num))
RAN_knn_num=floor(sqrt(RAN_num))
results_sample_pair<-matrix(nrow=0,ncol=(2*RAN_knn_num+2*RAP_knn_num+2))
for (j in 1:(RAN_num+RAP_num)){
sample_mean<-mean(as.numeric(results_whole[j,2:(num+1)]))
if (sample_mean <= (mean_whole + sd_whole)){
#print(results_whole[j,1])
if (results_whole[j,1] %in% rownames(RAN)){
new_RANN<-rbind(data[which(rownames(data) == results_whole[j,1]),],RAN)
#print(results_whole[j,1])
new_RANN_dis<-dist(new_RANN,method = k)
new_RANN_dis<-as.matrix(new_RANN_dis)
wa_RANN_dis<-sort(new_RANN_dis[,1])[3:(RAN_knn_num+2)]
knn_RANN_mean<-mean(as.numeric(wa_RANN_dis))
knn_RANN_sd <- sd(as.numeric(wa_RANN_dis))
new_RANP<-rbind(data[which(rownames(data) == results_whole[j,1]),],RAP)
new_RANP_dis<-dist(new_RANP,method = k)
new_RANP_dis<-as.matrix(new_RANP_dis)
wa_RANP_dis<-sort(new_RANP_dis[,1])[2:(RAP_knn_num+1)]
knn_RANP_mean<-mean(as.numeric(wa_RANP_dis))
knn_RANP_sd<-sd(as.numeric(wa_RANP_dis))
name_NN<-c(rep(0,RAN_knn_num))
name_NP<-c(rep(0,RAP_knn_num))
if(knn_RANP_mean <= (knn_RANN_mean+knn_RANN_sd)){
for (m in 1:RAN_knn_num){
RANN_sample<-rownames(new_RANN_dis)[which(new_RANN_dis[,1] == wa_RANN_dis[m])]
name_NN[m]<-RANN_sample
}
for (h in 1:RAP_knn_num){
RANP_sample<-rownames(new_RANP_dis)[which(new_RANP_dis[,1] == wa_RANP_dis[h])]
name_NP[h]<-RANP_sample
}
new.row<-c(as.character(results_whole[j,1]),rownames(data[which(rownames(data) == results_whole[j,1]),]),name_NN, as.vector(wa_RANN_dis), name_NP, as.vector(wa_RANP_dis))
#print(new.row)
results_sample_pair<-rbind(results_sample_pair,new.row)
}
} else {
new_RAPN<-rbind(data[which(rownames(data) == results_whole[j,1]),],RAN)
new_RAPN_dis<-dist(new_RAPN,method = k)
new_RAPN_dis<-as.matrix(new_RAPN_dis)
wa_RAPN_dis<-sort(new_RAPN_dis[,1])[2:(RAN_knn_num+1)]
knn_RAPN_mean<-mean(wa_RAPN_dis)
new_RAPP<-rbind(data[which(rownames(data) == results_whole[j,1]),],RAP)
new_RAPP_dis<-dist(new_RAPP,method = k)
new_RAPP_dis<-as.matrix(new_RAPP_dis)
wa_RAPP_dis<-sort(new_RAPP_dis[,1])[3:(RAP_knn_num+2)]
knn_RAPP_mean<-mean(wa_RAPP_dis)
knn_RAPP_sd<-sd(wa_RAPP_dis)
name_PN<-c(rep(0,RAN_knn_num))
name_PP<-c(rep(0,RAP_knn_num))
if (knn_RAPN_mean <= (knn_RAPP_mean+knn_RAPP_sd)){
for (m in 1:RAN_knn_num){
RAPN_sample<-rownames(new_RAPN_dis)[which(new_RAPN_dis[,1] == wa_RAPN_dis[m])]
name_PN[m]<-RAPN_sample
}
for (h in 1:RAP_knn_num){
RAPP_sample<-rownames(new_RAPP_dis)[which(new_RAPP_dis[,1] == wa_RAPP_dis[h])]
name_PP[h]<-RAPP_sample
}
new.row<-c(as.character(results_whole[j,1]),rownames(data[which(rownames(data) == results_whole[j,1]),]), name_PN, as.vector(wa_RAPN_dis), name_PP, as.vector(wa_RAPP_dis))
#print(new.row)
results_sample_pair<-rbind(results_sample_pair,new.row)
}
}
}
}
cat("KNN FINISHED\n")
if (dim(results_sample_pair)[1] > 1) {
res<-results_sample_pair[,1:(2*RAN_knn_num+2*RAP_knn_num+2)]
}else{
#cat("Nothing Done \n")
cat("Insufficient number of paired samples\n")
return(NULL)
}
if (is.null(SavePath)) {
SavePath="./"
}else{
if(!dir.exists(SavePath)){
dir.create(SavePath)
}
}
#res<-res[,c(2:(2+RAN_knn_num),(2+1+2*RAN_knn_num):(2+2*RAN_knn_num+RAP_knn_num))] %>% data.frame() %>% remove_rownames()
res<-res[,c(2:(2+2*RAN_knn_num+2*RAP_knn_num))] %>% data.frame() %>% remove_rownames()
if (!is.null(BoundarySample)) {
write.csv(res,paste(SavePath,"/",BoundarySample,".csv",sep = ''),row.names = F)
}
#View(res)
#return(res)
#sum(str_detect(res$X1,groupnum$Var1[1]%>% as.vector()))
#View(res)
#write.csv(res,file = filename)
pairinfor = matrix(ncol =3,nrow = 0)
for (i in 1:dim(res)[1]) {
indexpo1 = which(as.character(phenodata[,1]) == as.character(res[i,1]))
indexpo2 = which(as.character(phenodata[,1]) == as.character(res[i,2]))
if (phenodata[indexpo1,2] == phenodata[indexpo2,2]) {
for (mkl in 1:RAP_knn_num) {
pairinfor <- rbind(pairinfor,c(as.character(res[i,1]),as.character(res[i,1+2*RAN_knn_num+mkl]),as.numeric(as.character(res[i,1+2*RAN_knn_num+mkl+RAP_knn_num]))))
}
}else{
for (pkl in 1:RAN_knn_num) {
pairinfor <- rbind(pairinfor,c(as.character(res[i,pkl+1]),as.character(res[i,1]),as.numeric(as.character(res[i,pkl+1+RAN_knn_num]))))
}
}
}
pairinfor[,3] <- as.numeric(as.character(pairinfor[,3]))
Newmatrix <<- matrix(ncol = 3,nrow = 0)
Extract_Dist <- function(PairData,...){
if (dim(PairData)[1] > 0) {
Findline <- PairData[which(PairData[,3] == min(PairData[,3])),]
Newmatrix <<- rbind(Newmatrix,Findline)
#View(Newmatrix)
NewMidData <- PairData %>% filter(!(Ctl %in% Findline$Ctl) & !(Disease %in% Findline$Disease))
#cat(dim(NewMidData)[1])
#cat("\n")
return(Extract_Dist(NewMidData))
}else{
return(Newmatrix)
}
}
#View(pairinfor)
pairinfor <- pairinfor %>% data.frame() %>% dplyr::rename(Ctl=1,Disease=2,Distance=3) %>%
mutate(Distance = as.numeric(as.character(Distance))) %>% dplyr::arrange(Ctl,Disease) %>% unique()
#View(pairinfor)
pairinfor <- Extract_Dist(pairinfor)
pairinfor <- pairinfor %>% data.frame() %>% dplyr::select(-Distance)
if (!is.null(BoundaryPair)) {
write.csv(pairinfor,paste(SavePath,"/",BoundaryPair,".csv",sep = ''),row.names = F)
}
#write.csv(pairinfor,file = "test.csv",row.names = F)
cat(paste("the redundant pair number is ",dim(pairinfor)[1],"\n",sep = ''))
if (dim(pairinfor)[1] < 10){
cat("Insufficient number of paired samples\n")
return(NULL)
}
cat("PAIR FINISHED\n")
#return(pairinfor)
###### Pair shuffle or Wilcox Rank test #####
if((dim(pairinfor)[1] < 25) | (dim(pairinfor)[1] >= 25 & method_choose == "Permutation")){
ShufflePair <- lapply(1:ShuffleTime, function(Shuffle){
Random <- runif(1,DownPercent,Uppercent)
n <- round(dim(pairinfor)[1]*Random,0)
RandomIndex <- sample(1:dim(pairinfor)[1],n,replace = F)
pairinforMiddata.mid1.1 <- pairinfor[-RandomIndex,] %>% data.frame()
pairinforMiddata.mid2 <- pairinfor[RandomIndex,] %>% data.frame()
pairinforMiddata.mid2.1 <- data.frame(Ctl = pairinforMiddata.mid2$Disease, Disease = pairinforMiddata.mid2$Ctl)
pairinforMiddata.mid <- rbind(pairinforMiddata.mid1.1,pairinforMiddata.mid2.1) %>% data.frame()
})
cat("SHUFFLE PAIR FINISHED\n")
####### calculate W stat and rank
FinalMatrix <- matrix(ncol = 2+ShuffleTime,nrow = 0) #Species, Original W,
MeanData <- matrix(ncol = 3,nrow = 0) # record mean value of pair
for (i in 1:dim(data)[2]) {
### original cohort
former<-rep(NA,dim(pairinfor)[1])
latter<-rep(NA,dim(pairinfor)[1])
for (j in 1:dim(pairinfor)[1]) {
index_former<-which(rownames(data) == pairinfor$Ctl[j])
former[j]=as.numeric(as.character(data[index_former,i]))
index_latter<-which(rownames(data) == pairinfor$Disease[j])
latter[j]=as.numeric(as.character(data[index_latter,i]))
}
Middata <- data.frame(Ctrl = former, CRC = latter)
test<-wilcox.test(Middata$Ctrl,Middata$CRC,paired = TRUE)
AllNW <- (dim(pairinfor)[1]*(dim(pairinfor)[1]+1))/2
OriginalStat = AllNW - test$statistic
Ctrlmean <- mean(Middata$Ctrl)
CRCmean <- mean(Middata$CRC)
MeanData <- rbind(MeanData,c(as.character(colnames(data)[i]),Ctrlmean,CRCmean))
#cat(as.character(colnames(data)[i]))
#cat("\n")
#print(AllNW)
### calculate shuffle W stat
#ShuffleStat <- rep(NA,ShuffleTime)
ShuffleStat <- lapply(ShufflePair, function(x){
former<-rep(NA,dim(x)[1])
latter<-rep(NA,dim(x)[1])
for (j in 1:dim(x)[1]) {
index_former<-which(rownames(data) == x$Ctl[j])
former[j]=as.numeric(as.character(data[index_former,i]))
index_latter<-which(rownames(data) == x$Disease[j])
latter[j]=as.numeric(as.character(data[index_latter,i]))
}
Middata.mid <- data.frame(Ctrl = former, CRC = latter)
test1<-wilcox.test(Middata.mid$Ctrl,Middata.mid$CRC,paired = TRUE)
#ShuffleStat[Iter] = AllNW - test1$statistic
return(AllNW - test1$statistic)
})
ShuffleStat <- unlist(ShuffleStat)
FinalMatrix <- rbind(FinalMatrix,c(as.character(colnames(data)[i]),OriginalStat,ShuffleStat))
}
colnames(FinalMatrix) <- c("Feature","OriginalSata.W",paste("ShuffleStat.W.",1:ShuffleTime,sep = ""))
if (!is.null(ShuffleWstat)) {
write.csv(pairinfor,paste(SavePath,"/",ShuffleWstat,".csv",sep = ''),row.names = F)
}
Mid.Matrix <- matrix(ncol = 6,nrow = 0)
for (I.index in 1:dim(FinalMatrix)[1]) {
Increasing.Rank.Min <- rank(as.numeric(as.character(FinalMatrix[I.index,2:(ShuffleTime+2)])),ties.method= "min")[1]
Decreasing.Rank.Min <- rank(-as.numeric(as.character(FinalMatrix[I.index,2:(ShuffleTime+2)])),ties.method= "min")[1]
Increasing.Rank.Max <- rank(as.numeric(as.character(FinalMatrix[I.index,2:(ShuffleTime+2)])),ties.method= "max")[1]
Decreasing.Rank.Max <- rank(-as.numeric(as.character(FinalMatrix[I.index,2:(ShuffleTime+2)])),ties.method= "max")[1]
Increasing.Rank.Average <- rank(as.numeric(as.character(FinalMatrix[I.index,2:(ShuffleTime+2)])),ties.method= "average")[1]
Decreasing.Rank.Average <- rank(-as.numeric(as.character(FinalMatrix[I.index,2:(ShuffleTime+2)])),ties.method= "average")[1]
Mid.Matrix <- rbind(Mid.Matrix,c(Increasing.Rank.Min,Decreasing.Rank.Min,Increasing.Rank.Max,Decreasing.Rank.Max,Increasing.Rank.Average,Decreasing.Rank.Average))
}
Mid.Matrix <- Mid.Matrix %>% data.frame() %>% remove_rownames() %>%
dplyr::rename(Increasing.Rank.Min=1,Decreasing.Rank.Min=2,Increasing.Rank.Max=3,Decreasing.Rank.Max=4,Increasing.Rank.Average=5,Decreasing.Rank.Average=6)
Mid.Matrix$Decre.maxRank.P <- Mid.Matrix$Decreasing.Rank.Max/(ShuffleTime+1)
Mid.Matrix$Decre.aveRank.P <- Mid.Matrix$Decreasing.Rank.Average/(ShuffleTime+1)
Mid.Matrix$Decre.minRank.P <- Mid.Matrix$Decreasing.Rank.Min/(ShuffleTime+1)
Mid.Matrix$Incre.maxRank.P <- Mid.Matrix$Increasing.Rank.Max/(ShuffleTime+1)
Mid.Matrix$Incre.aveRank.P <- Mid.Matrix$Increasing.Rank.Average/(ShuffleTime+1)
Mid.Matrix$Incre.minRank.P <- Mid.Matrix$Increasing.Rank.Min/(ShuffleTime+1)
Mid.Matrix$Species <- FinalMatrix[,1] #Feature
Mid.Matrix <- Mid.Matrix %>% data.frame() %>%
dplyr::arrange(Decre.minRank.P) %>%
dplyr::mutate(Decre.minRank.P.FDR = p.adjust(.$Decre.minRank.P,method = "BH",n=length(.$Decre.minRank.P))) %>%
dplyr::arrange(Decre.maxRank.P) %>%
dplyr::mutate(Decre.maxRank.P.FDR = p.adjust(.$Decre.maxRank.P,method = "BH",n=length(.$Decre.maxRank.P))) %>%
dplyr::arrange(Decre.aveRank.P) %>%
dplyr::mutate(Decre.aveRank.P.FDR = p.adjust(.$Decre.aveRank.P,method = "BH",n=length(.$Decre.aveRank.P))) %>%
dplyr::arrange(Incre.aveRank.P) %>%
dplyr::mutate(Incre.aveRank.P.FDR = p.adjust(.$Incre.aveRank.P,method = "BH",n=length(.$Incre.aveRank.P)))
MeanData <- MeanData %>% data.frame() %>% dplyr::rename(Species=1,Ctlmean=2,Dismean=3)
Mid.Matrix <- merge(Mid.Matrix,MeanData,by="Species") %>% data.frame() %>% mutate(Enrieched = if_else(Decre.aveRank.P <= PvalueCutoff,"Disease",if_else(Incre.aveRank.P <= PvalueCutoff,"Ctrl","N.S.")))
} else{
Mid.Matrix <- data.frame()
for (i in 1:dim(data)[2]) {
### original cohort
former<-rep(NA,dim(pairinfor)[1])
latter<-rep(NA,dim(pairinfor)[1])
for (j in 1:dim(pairinfor)[1]) {
index_former<-which(rownames(data) == pairinfor$Ctl[j])
former[j]=as.numeric(as.character(data[index_former,i]))
index_latter<-which(rownames(data) == pairinfor$Disease[j])
latter[j]=as.numeric(as.character(data[index_latter,i]))
}
Middata <- data.frame(Ctrl = former, CRC = latter)
test<-wilcox.test(Middata$Ctrl,Middata$CRC,paired = TRUE)
Ctlmean <- mean(Middata$Ctrl)
CRCmean <- mean(Middata$CRC)
#MeanData <- rbind(MeanData,c(as.character(colnames(data)[i]),Ctlmean,CRCmean))
Mid.Matrix <- data.frame(Species=as.character(colnames(data)[i]),
Ctlmean=Ctlmean,
Dismean=CRCmean,
pvalue=test$p.value) %>% data.frame(check.names=F) %>%
mutate(Enrieched = if_else(pvalue <= PvalueCutoff,"Sig","None")) %>% rbind.data.frame(Mid.Matrix)
}
}
if (!is.null(Mid.Matrix)) {
write.csv(Mid.Matrix,paste(SavePath,"/PairFind.Output.csv",sep = ''),row.names = F)
}
cat("All done\n")
return(Mid.Matrix)
}# END - function: Pair_Find
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.