R/RankProducts_francesco_advance.R

"RP.advance" <- function(data,cl,origin, logged=TRUE,na.rm=TRUE,gene.names=NULL,
                        plot=FALSE, rand = NULL,
                        calculateProduct=TRUE, MinNumOfValidPairs=NA,
                        RandomPairs=NA, huge=FALSE, fast=TRUE, tail.time=0.05){
# the paramenter rand is used to put the random number generator in a
# reproducible state using the rand value as seed 
    if (!is.null(rand)) { set.seed(rand)} 
    if (is.vector(data)) {
        cat("Warning: There is only one gene in the data set","\n","\n")
        data=matrix(data,nrow=1)  ##force into matrix format
    }
if(is.data.frame(data)){data <- as.matrix(data)}
check.cl <- sort(unique(cl), decreasing = F)
if (all(check.cl!= c(0,1))){
    cat("\n the class label has to contain only 0 or 1")
    stop()
}
# number of pairs equal to the number of the smaples in the smallest class
    npairs <- min(length(which(cl==0)),length(which(cl==1)))
    if (npairs==0){
        nrep<-dim(data)[2]  #in the paired case, npairs=0 but I still need to
                            #know the number of replicates for the evaluation
                            #of the pvalues
    }else{
    nrep<-npairs
    }
## Number of RandomPairs to generate, if set to NA, the odd integer
##closer to n*n is used
    if (is.na(RandomPairs)) RandomPairs <- npairs^2
    if (RandomPairs==0) RandomPairs=1
    if ((RandomPairs %% 2) == 0) RandomPairs <- RandomPairs + 1
# Replace -Inf, Inf, NaN with NA.
    data <- mzmatch.R.wrongvalues(data)
## Estimate detection limit value.
detection.limit <- min(data[data!=0],na.rm=TRUE)
#if logged=FALSE zeros are substitute with the detection.limit
if (logged==FALSE) data[data==0]<-detection.limit
## check number of samples 
    total.sam=dim(data)[2]
    total.sam1=length(cl)
    total.sam2=length(origin)
    if(total.sam!=total.sam1 | total.sam!=total.sam2)
    stop("Number of classes and/or origins should match
    the columns in the data")
# I have to separate the dataset according to the different origin
    data.pre=OriginxyCall(data,cl,origin, !calculateProduct)
#data.pre$data1 is a list of matrix, each matrix has a different origin and all
#the samples in it are from class1
#data.pre$data2 is a list of matrix, each matrix has a different origin and all
#the samples in it are from class2
    num.ori=length(unique(origin)) #counting the number of different origins
    num.gene=dim(data)[1]  #defining the number of variables
##set seed for random number generator
####AveFC
    yy<-data.pre$data2
    if (!is.null(yy) ){   ##two-sample
        num.class=2
        data1.all=data.pre$data1 ##data under condition1
        data2.all=data.pre$data2  ##data under condition2 
        fold.change=NULL
        for (l in 1:num.ori){
            data1=as.matrix(data1.all[[l]])
            data2=as.matrix(data2.all[[l]])
            data1.ave=apply(data1,1,mean) 
            data2.ave=apply(data2,1,mean) 
            if (logged) {
                fold.change=cbind(fold.change,(data1.ave-data2.ave))
            }else{
                fold.change=cbind(fold.change,(data1.ave/data2.ave))}
                rm(data1,data2,data1.ave,data2.ave)
            }
        ave.fold.change=apply(fold.change,1,mean)
        }
if ( is.null(yy)){ ##one-sample
    num.class=1
    data1.all=data.pre$data1 ##data under condition1
    data2.all=data.pre$data2  ##data under condition2 
    fold.change=NULL
    for (l in 1:num.ori){
        data1=as.matrix(data1.all[[l]])
        fold.change=cbind(fold.change,apply(data1,1,mean))
        rm(data1)
    }
    ave.fold.change=apply(fold.change,1,mean)
    }
########AveFC
#This two variables are used to save the ranks of each iteration,
#if the parameter huge is set to true these variables 
#are not saved in order to save memory
ranks1<- vector("list",RandomPairs)
ranks2<- vector("list",RandomPairs)
if(!huge){
    Orirank<-vector("list",2)
    Orirank.ori<-vector("list",2)#
    } 
badrows<-NULL
prod_out <- list ()
prod_out.Class2 <- list()
output<-vector("list", RandomPairs)
output.Class2<-vector("list", RandomPairs)
MinNumOfValidPairsBUP <- MinNumOfValidPairs
for (ori in 1:num.ori){
  MinNumOfValidPairs <- MinNumOfValidPairsBUP
    data.ori <-cbind(data.pre$data1[[ori]],data.pre$data2[[ori]])
    if(!is.null(data.pre$data2)){
        cl.ori <-c(rep(0,dim(data.pre$data1[[ori]])[2]),
                    rep(1,dim(data.pre$data2[[ori]])[2]))
    }else{
        cl.ori<-c(rep(0,dim(data.pre$data1[[ori]])[2]))
    }
    npairs.ori <- min(length(which(cl.ori==0)),length(which(cl.ori==1)))
    if(npairs.ori==0) {nrep.ori <- length(cl.ori)} else {nrep.ori <- npairs.ori}
    if (is.na(MinNumOfValidPairs)) MinNumOfValidPairs <- floor(nrep.ori/2)
    xy.out.ori <- xyCall(cl.ori, RandomPairs, !calculateProduct)
    cl.ori <- xy.out.ori$cl
    x.ori <- xy.out.ori$x
    y.ori <- xy.out.ori$y
# Replace -Inf, Inf, NaN with NA.
#data.ori <- mzmatch.R.wrongvalues(data.ori)
## Estimate detection limit value.
detection.limit.ori <- min(data.ori[data.ori!=0],na.rm=TRUE)
## Prepare ratios for paired samples
    gr1.ori <- data.ori[, x.ori]
    gr2.ori <- data.ori[, y.ori]
## If in one group there was not detection in all samples,
## but there are signals detected in group B,
## values are replaced with detection.limit value (are we sure of this?)
## Also check which metabolites doesn not match MinNumOfValidPairs criteria. 
## Variables gr1NA and gr2NA will store information about which measurements
## don't contain values and those will be ignored during random pairing.
    badrows.ori <- NULL
    gr1NA.ori <- vector ("list", nrow(gr1.ori))
    gr2NA.ori <- vector ("list", nrow(gr2.ori))
    if (RandomPairs>1){
        for (coln in 1:nrow(gr1.ori)) {
            toreplacegr1.ori <- which(is.na(gr1.ori[coln,]))
            toreplacegr2.ori <- which(is.na(gr2.ori[coln,]))
            gr1NA.ori[[coln]] <- toreplacegr1.ori
            gr2NA.ori[[coln]] <- toreplacegr2.ori
            toreplacegr1.ori <- length(toreplacegr1.ori)
            toreplacegr2.ori <- length(toreplacegr2.ori)
            if (MinNumOfValidPairs > min((c(ncol(gr1.ori)-toreplacegr1.ori,
                    ncol(gr2.ori)-toreplacegr2.ori)))){
                badrows.ori <- append(badrows.ori,coln)
                gr1NA.ori[[coln]] <- NULL
                gr2NA.ori[[coln]] <- NA
            }
        if (toreplacegr1.ori==ncol(gr1.ori)){
        toreplacegr1.ori <- 1
        }else{
        toreplacegr1.ori <- 0
        }
        if (toreplacegr2.ori==ncol(gr2.ori)){
        toreplacegr2.ori <- 1
        }else{
        toreplacegr2.ori <- 0
        }
        if (toreplacegr1.ori==1 & toreplacegr2.ori==0){
        gr1.ori[coln,] <- detection.limit.ori
        }
        if (toreplacegr2.ori==1 & toreplacegr1.ori==0){
            gr2.ori[coln,] <- detection.limit.ori
        }
    }
}
    
# If option is set all NA left are replaced with the median value of the
# single feature.
    if (na.rm==TRUE){
        tmp<-NAreplace(gr1.ori,gr2.ori,y.ori)
        gr1.ori<-tmp[[1]]
        gr2.ori<-tmp[[2]]
        rm(tmp)
    }
## Create list of random sample matching, and ratios. If RandomPairs is
## set to 1, original sample order is used.
## If 'RandomPairs' is set to 1, only initial sample order is used for
## pairing (A paired test).
    ratios.list.ori <- vector("list",RandomPairs)
    ratios.list.Class2.ori <- vector("list",RandomPairs)
    for(i in 1:RandomPairs){
        ratios.list.Class2.ori[[i]] <- matrix(data = NA, ncol = npairs.ori,
                                                nrow = nrow(gr1.ori))
    }
    for(i in 1:RandomPairs){
        ratios.list.ori[[i]] <- matrix(data = NA, ncol = npairs.ori,
                                        nrow = nrow(gr1.ori))
    }
    if (is.null(y.ori)){
        ratios.list.ori[[1]] <- gr1.ori
        fold.change.ori <- apply(gr1.ori,1,mean,na.rm=TRUE)
        ratios.list.Class2.ori[[1]] <- 1/gr1.ori
        ratios.list.Class2.ori[[1]] <-
                            mzmatch.R.wrongvalues(ratios.list.Class2.ori[[1]])
    }else{
        data1.ave.ori=apply(gr1.ori, 1, mean, na.rm=TRUE)
        data2.ave.ori=apply(gr2.ori, 1, mean, na.rm=TRUE)
    if(logged==TRUE){
        ratios.list.ori[[1]] <- gr1.ori[,1:npairs.ori] - gr2.ori[,1:npairs.ori]
        ratios.list.ori[[1]] <- mzmatch.R.wrongvalues(ratios.list.ori[[1]])
        fold.change.ori=data1.ave.ori-data2.ave.ori
    }else{
        ratios.list.ori[[1]] <- gr1.ori[,1:npairs.ori]/gr2.ori[,1:npairs.ori]
        ratios.list.ori[[1]] <- mzmatch.R.wrongvalues(ratios.list.ori[[1]])
        fold.change.ori=data1.ave.ori/data2.ave.ori
    }
    if(logged==TRUE){
    ratios.list.Class2.ori[[1]] <- gr2.ori[,1:npairs.ori]-gr1.ori[,1:npairs.ori]
    ratios.list.Class2.ori[[1]] <-
                            mzmatch.R.wrongvalues(ratios.list.Class2.ori[[1]])
    }else{
    ratios.list.Class2.ori[[1]] <- gr2.ori[,1:npairs.ori]/gr1.ori[,1:npairs.ori]
    ratios.list.Class2.ori[[1]] <-
                            mzmatch.R.wrongvalues(ratios.list.Class2.ori[[1]])
        }  
    }
 
if (RandomPairs>1){
    for (coln in 1:nrow(gr1.ori)){
        val1.ori <- gr1.ori[coln,]
        val2.ori <- gr2.ori[coln,]
        ## get rid of NA's
        val1.ori <- sort(val1.ori)
        val2.ori <- sort(val2.ori)
        # calculate ratios of ranodmly paired samples
        for (rp in 2:RandomPairs) {
            val1gr.ori <- sample(val1.ori)[1:npairs.ori]
            val2gr.ori <- sample(val2.ori)[1:npairs.ori]
            if (logged==TRUE){
                ratios.list.ori[[rp]][coln,] <- val1gr.ori-val2gr.ori
                ratios.list.Class2.ori[[rp]][coln,] <- val2gr.ori-val1gr.ori
            }else{
                ratios.list.ori[[rp]][coln,] <- val1gr.ori/val2gr.ori
                ratios.list.Class2.ori[[rp]][coln,] <- val2gr.ori/val1gr.ori
            }
        }
    }
}
badrows<-unique(append(badrows,badrows.ori))
       
###########################################################################
## Ranks calculation
for (randpair in 1:RandomPairs){
    if (logged==TRUE){
        inputmatrix.ori <- ratios.list.ori[[randpair]]
    }else{
        MIN <- min(ratios.list.ori[[randpair]],na.rm=TRUE)
        if (MIN <=0) {
            cat ("Error: Input data contain negative values or values equal
                to 0.","\n")
            stop ()  
        }else{
            inputmatrix.ori <- log (ratios.list.ori[[randpair]], 10)
        }
    }
      
## replace values in badrows with NA, after "order" function rank to those rows
## will be set to NA, and ignored in sum or prod calculation
    if (length(badrows.ori)!=0){
        inputmatrix.ori[badrows.ori,] <- NA
      }
output.ori <- matrix(ncol=ncol(inputmatrix.ori),nrow=nrow(inputmatrix.ori))
inputmatrix.Class2.ori <- inputmatrix.ori*(-1)
output.Class2.ori <- matrix(ncol=ncol(inputmatrix.Class2.ori),
                            nrow=nrow(inputmatrix.Class2.ori))

for (coln in 1:ncol(inputmatrix.ori)){
# After assigning rank with "order" function, ranks for NA's are set back to NA
    NAs <- which(is.na(inputmatrix.ori[,coln]))
    if (length(NAs)>0){
        output.ori[order(inputmatrix.ori[,coln],na.last=NA),coln] <-
            c(1:(nrow(inputmatrix.ori)-length(NAs)))
        }else{
            output.ori[order(inputmatrix.ori[,coln]),coln] <-
                c(1:nrow(inputmatrix.ori))
        }
    }
for (coln in 1:ncol(inputmatrix.Class2.ori)){
    NAs <- which(is.na(inputmatrix.Class2.ori[,coln]))
    if (length(NAs)>0){
    output.Class2.ori[order(inputmatrix.Class2.ori[,coln],na.last=NA),coln]<-
    c(1:(nrow(inputmatrix.Class2.ori)-length(NAs)))
        }else{
            output.Class2.ori[order(inputmatrix.Class2.ori[,coln]),coln] <-
                                c(1:nrow(inputmatrix.Class2.ori))
        }
    }
#saving the ranks of each pairing, if the parameter huge is set to true
#these variables are not saved in order to save memory
if(ori>1){
    ranks1[[randpair]]<-cbind(output.ori,ranks1[[randpair]])
    ranks2[[randpair]]<-cbind(output.Class2.ori,ranks2[[randpair]])
}else{
    ranks1[[randpair]]<-output.ori
    ranks2[[randpair]]<-output.Class2.ori
    }
}
if (!is.null(badrows)){
    cat("Warning: There are",length(badrows),"variables with at least one
        missing value. You can consider using a 'MinNumOfValidPairs' of this
        function.","\n","\n")
    cat(" These values won't be used to compute rank product.","\n","\n")
}

########################################################################
########################################################################
#### evaluating the orirank output as it was in the old version##
if(!huge & RandomPairs>1){
    Orirank.ori<-computeOrirank(gr1.ori,gr2.ori,logged)
    Orirank[[1]]<-cbind(Orirank[[1]],Orirank.ori[[1]])
    Orirank[[2]]<-cbind(Orirank[[2]],Orirank.ori[[2]])
}

}
for( jj in 1:RandomPairs){
# performing the Rank Product or the Rank Sum according
# to the parameter calculateProduct
output<-ranks1[[jj]]
output.Class2<-ranks2[[jj]]
    if (calculateProduct==TRUE){
        ProdFunc <- function(x, DATA) exp(mean(log(DATA[x,]),na.rm=TRUE))
        products <- sapply (1:nrow(output), ProdFunc, DATA=output)
        products.Class2 <- sapply (1:nrow(output.Class2),ProdFunc,
                                   DATA=output.Class2)
    } else {
        products <- apply(output,1,mean,na.rm=TRUE)
        products.Class2 <- apply (output.Class2,1, mean,na.rm=TRUE)
    }
prod_out[[jj]] <- products
prod_out.Class2[[jj]] <- products.Class2
}
prod_out <- do.call (rbind,prod_out)
ranks_out <- apply(prod_out,2,median,na.rm=TRUE)
prod_out.Class2 <- do.call (rbind,prod_out.Class2)
ranks_out.Class2 <- apply(prod_out.Class2,2,median,na.rm=TRUE)
### evaluation of pvalues and pfps
# if we are performing the RankProduct we use the fast algorithm to evaluate
# the pfps
pvals<-matrix(NA,nrow=nrow(data), ncol=2)
pfps<-matrix(NA,nrow=nrow(data),ncol=2)
if (calculateProduct==TRUE){
    ###check max value
    MaxRP <- which(ranks_out>nrow(data))
    MaxRP2 <- which(ranks_out.Class2>nrow(data))
    ranks_out[MaxRP] <- nrow(data)
    ranks_out.Class2[MaxRP2] <- nrow(data)
    #
    ind<-which(!is.na(ranks_out))
    try(pvals[ind,1]<-rankprodbounds(ranks_out[ind]^nrep,
            (nrow(data)-length(badrows)),nrep,Delta='geometric'),silent=TRUE)
    if (is.na(pvals[ind[1],1])){
        for (i in 1:nrow(pvals)){
            if(is.na(ranks_out[i])){
                pvals[i,1]<-NA
            }else {
                pvals[i,1]<-rankprodbounds(ranks_out[i]^nrep,
                            (nrow(data)-length(badrows)),nrep,Delta='geometric')
            }
        }
    }
ind<-which(!is.na(ranks_out.Class2))
try(pvals[ind,2]<-rankprodbounds(ranks_out.Class2[ind]^nrep,
            (nrow(data)-length(badrows)),nrep,Delta='geometric'),silent=TRUE)
if (is.na(pvals[ind[1],2])){
    for (i in 1:nrow(pvals)){
        if(is.na(ranks_out.Class2[i])){
            pvals[i,2]<-NA
        }else {
            pvals[i,2]<-rankprodbounds(ranks_out.Class2[i]^nrep,
                    (nrow(data)-length(badrows)),nrep,Delta='geometric')
        }
    }
}
}else{ ### pvalues evaluation for Rank Sums
    ## using the fast function to evaluate the pvalues
    ij <- which(!is.na(ranks_out))
    ji <- which(!is.na(ranks_out.Class2))
    if(((nrep <= 30) |
        (nrep>=30 & nrep < 40 & (nrow(data)-length(badrows)<= 10^7)) |
        (nrep>=40 & nrep <= 50 & (nrow(data)-length(badrows)<= 10^6)) |
        (nrep>=50 & nrep <= 60 & (nrow(data)-length(badrows)<= 5*10^4)) |
        (nrep>=60 & nrep <= 70 & (nrow(data)-length(badrows)<= 10^4)))){
        
        pvals[,1]<- sapply(ranks_out[ij]*nrep, dice.sum.cdf, n=nrep,
                           s=nrow(data)-length(badrows))
        
        pvals[,2]<- sapply(ranks_out.Class2[ji]*nrep, dice.sum.cdf, n=nrep,
                           s=nrow(data)-length(badrows))
        
    }else if(fast == FALSE){
        
        pvals[,1]<- sapply(ranks_out[ij]*nrep, dice.sum.cdf.accurate, n=nrep,
                           s=nrow(data)-length(badrows))
        
        pvals[,2]<- sapply(ranks_out.Class2[ji]*nrep, dice.sum.cdf.accurate, n=nrep,
                           s=nrow(data)-length(badrows))
    } else{
        ord.RS1 <- order(ranks_out, na.last = TRUE)
        ord.RS2 <- order(ranks_out.Class2, na.last = TRUE)
        jhi <-1
        time <- 0
        str <- Sys.time()
        while((time < tail.time) & (jhi < (nrow(data)-length(badrows)))){
            pvals[ord.RS1[jhi],1]<-
                dice.sum.cdf.accurate(ranks_out[ord.RS1[jhi]]*nrep,
                                      nrep,(nrow(data)-length(badrows)))
            pvals[ord.RS2[jhi],2]<-
                dice.sum.cdf.accurate(ranks_out.Class2[ord.RS2[jhi]]*nrep,
                                      nrep,(nrow(data)-length(badrows)))  
            jhi<-jhi+1
            time <- as.numeric(Sys.time()-str, units="mins")
        }
        cat("\nThe highest p-values evaluated with the exact method are",
            pvals[ord.RS1[jhi-1],1],"and", pvals[ord.RS1[jhi-1],2])
        NA1 <- which(is.na(pvals[,1]))
        NA2 <- which(is.na(pvals[,2]))
        pvals[NA1,1]<-pnorm(ranks_out[NA1]*nrep,nrep*(((nrow(data)-length(badrows))+1)/2),
                               sqrt(((((nrow(data)-length(badrows))^2)-1)/12)*nrep))
        pvals[NA2,2]<-pnorm(ranks_out.Class2[NA2]*nrep,nrep*(((nrow(data)-length(badrows))+1)/2),
                               sqrt(((((nrow(data)-length(badrows))^2)-1)/12)*nrep))
    }
}
### evaluation of the pfps from the pvalues
rankorder1 <- rep(NA,nrow(pvals))
rankorder1[order(pvals[,1],na.last=NA)] <- 1:length(order(pvals[,1],na.last=NA))
rankorder2 <- rep(NA,nrow(pvals))
rankorder2[order(pvals[,2],na.last=NA)] <- 1:length(order(pvals[,2],na.last=NA))
for (i in 1:nrow(pfps)){
    pfps[i,1] <- (pvals[i,1]*(nrow(data)-length(badrows)))/rankorder1[i]
    pfps[i,2] <- (pvals[i,2]*(nrow(data)-length(badrows)))/rankorder2[i]
}  
# output RPs (RSs) and RPrank (RSrank)
out <- list()
if (calculateProduct==TRUE){
    out$RPs <- cbind(ranks_out,ranks_out.Class2)
    RPrank<-matrix(NA,nrow(data),2)
    RPrank[,1]<-rank(out$RPs[,1], na.last =TRUE)
    RPrank[badrows,1]<-NA
    RPrank[,2]<-rank(out$RPs[,2], na.last =TRUE)
    RPrank[badrows,2] <- NA
    colnames (RPrank) <- c("class1 < class2","c11lass1 > class2")
    colnames(out$RPs) <- c("class1 < class2","class1 > class2")
    out$RPrank<-RPrank
} else {
    out$RSs <- cbind(ranks_out,ranks_out.Class2)
    RSrank<-matrix(NA,nrow(data),2)
    RSrank[,1]<-rank(out$RSs[,1], na.last =TRUE)
    RSrank[badrows,1]<-NA
    RSrank[,2]<-rank(out$RSs[,2], na.last =TRUE)
    RSrank[badrows,2] <- NA
    colnames (RSrank) <- c("class1 < class2","class1 > class2")
    colnames(out$RSs) <- c("class1 < class2","class1 > class2")
    out$RSrank<-RSrank
}
# output pfps, pvals, AveFC, groups, RandomPairs, nrep
colnames (pfps) <- c("class1 < class2","class1 > class2")
colnames (pvals) <- c("class1 < class2","class1 > class2") 
out$pfp<-pfps
out$pval<-pvals
fold.change=t(t(ave.fold.change))
fold.change[badrows]<-NA
out$AveFC<-fold.change
out$groups <- cl
out$RandomPairs_ranks <- prod_out
out$nrep <- nrep
# if the parameter huge is set to TRUE the allrank1, allrank2 and
# Orirank are not saved in order to save space!
if(!huge){
    if(RandomPairs >1){
        allrank1<-ranks1
        allrank2<-ranks2
        names(allrank1)=c("class1 < class2","class1 > class 2")
        names(allrank2)=c("class1 < class2","class1 > class 2")
        out$allrank1<-allrank1
        out$allrank2<-allrank2
        out$Orirank<-Orirank
#### evaluating the orirank output as it was in the old version##
        out$Orirank<-Orirank
    }
}  
if(!is.null(gene.names)){
    out<-genenames(out,gene.names,RandomPairs)
}
if(plot){
    plotPFP(out,calculateProduct)
}
cat ("\n done  ")  
out
}

Try the RankProd package in your browser

Any scripts or data that you put into this service are public.

RankProd documentation built on Nov. 8, 2020, 8 p.m.