R/RankProducts_francesco_2.R

"RankProducts" <- function (data, cl, logged=TRUE, na.rm=TRUE, gene.names=NULL,
                            plot=FALSE, rand = NULL, calculateProduct=TRUE,
                            MinNumOfValidPairs=NA, RandomPairs=NA,
                            huge=FALSE,fast=TRUE, tail.time=0.05){
# the paramenter rand is used to put the random number generator in a
# reproducible state using the rand value as seed 
if (!is.null(rand)) set.seed(rand)
if (is.vector(data)) {
    cat("Warning: There is only one gene in the data set","\n","\n")
    data=matrix(data,nrow=1)  ##force into matrix format
    }
if(is.data.frame(data)){data <- as.matrix(data)}
# number of pairs equal to the number of the smaples in the smallest class
npairs <- min(length(which(cl==0)),length(which(cl==1)))
if (npairs==0){
    nrep<-dim(data)[2]  #in the paired case, npairs=0 but I still need to know
                        #the number of replicates for the evaluation of
                        #the pvalues
}else{
    nrep<-npairs
}
## Number of RandomPairs to generate, if set to NA, the odd integer closer
## to n*n is used
if (is.na(RandomPairs)) RandomPairs <- npairs^2
if (RandomPairs==0) RandomPairs=1
if ((RandomPairs %% 2) == 0) RandomPairs <- RandomPairs + 1
# MinNumOfValidPairs is a parameter that indicates the minimum number of NAs
#accepted per each variable/feature/gene
# If it is set to NA the half of the number of replicates is used
if (is.na(MinNumOfValidPairs)) MinNumOfValidPairs <- floor(nrep/2)
#This two variables are used to save the ranks of each iteration,
#if the parameter huge is set to true these variables 
#are not saved in order to save memory
if(!huge){
    ranks1<- vector("list",RandomPairs)
    ranks2<- vector("list",RandomPairs)
}
## checking if the cl vector and the data matrix are congruent
total.sam <- length(cl)
total.sam2 <- dim(data)[2]
if (total.sam != total.sam2){ 
    stop("Number of classes should match the columns in the data")
}  
## this fuction gives the index of the samples of the classes
## if y is null that mean that there is only one class (paired case)
xy.out <- xyCall(cl, RandomPairs, !calculateProduct)
cl <- xy.out$cl
x <- xy.out$x
y <- xy.out$y
# Replace -Inf, Inf, NaN with NA.
data <- mzmatch.R.wrongvalues(data)
## Estimate detection limit value.
detection.limit <- min(data[data!=0],na.rm=TRUE)
#if logged=FALSE zeros are substitute with the detection.limit
if (logged==FALSE) data[data==0]<-detection.limit
## Prepare ratios for paired samples
gr1 <- data[, x]
gr2 <- data[, y]
## If in one group there was not detection in all samples, but there are signals
## detected in group B,
## values are replaced with detection.limit value 
## Also check which metabolites doesn not match MinNumOfValidPairs criteria. 
## Variables gr1NA and gr2NA will store information about which measurements
## don't contain values and those will be ignored during random pairing.
badrows <- NULL
gr1NA <- vector ("list", nrow(gr1))
gr2NA <- vector ("list", nrow(gr2))
if (RandomPairs>1){
    for (coln in 1:nrow(gr1)){
        toreplacegr1 <- which(is.na(gr1[coln,]))
        toreplacegr2 <- which(is.na(gr2[coln,]))
        gr1NA[[coln]] <- toreplacegr1
        gr2NA[[coln]] <- toreplacegr2
        toreplacegr1 <- length(toreplacegr1)
        toreplacegr2 <- length(toreplacegr2)
        if (MinNumOfValidPairs >
            min((c(ncol(gr1)-toreplacegr1,ncol(gr2)-toreplacegr2)))){
            badrows <- append(badrows,coln)
            gr1NA[[coln]] <- NULL
            gr2NA[[coln]] <- NA
        }
        if (toreplacegr1==ncol(gr1)) toreplacegr1 <- 1 else toreplacegr1 <- 0
        if (toreplacegr2==ncol(gr2)) toreplacegr2 <- 1 else toreplacegr2 <- 0
        if (toreplacegr1==1 & toreplacegr2==0){
            gr1[coln,] <- detection.limit
        }
        if (toreplacegr2==1 & toreplacegr1==0){
            gr2[coln,] <- detection.limit
        }
    }
}
if (!is.null(badrows)){
    cat("Warning: There are",length(badrows),"variables with at least one
        missing value. You can consider using a 'MinNumOfValidPairs'
        of this function.","\n","\n")
    cat(" These values won't be used to compute rank product.","\n","\n")
}
  
# If option is set all NA left are replaced with the median value
# of the single feature.
if (na.rm==TRUE){
    tmp<-NAreplace(gr1,gr2,y)
    gr1<-tmp[[1]]
    gr2<-tmp[[2]]
    rm(tmp)
}
## Create list of random sample matching, and ratios. If RandomPairs is set
## to 1, original sample order is used. If 'RandomPairs' is set to 1, only
##initial sample order is used for pairing (A paired test).
ratios.list <- vector("list",RandomPairs)
ratios.list.Class2 <- vector("list",RandomPairs)
for(i in 1:RandomPairs){
    ratios.list.Class2[[i]] <-
        matrix(data = NA, ncol = npairs, nrow = nrow(gr1))
    }
for(i in 1:RandomPairs){
    ratios.list[[i]] <- matrix(data = NA, ncol = npairs, nrow = nrow(gr1))
    }
# If there is only one class label present (y is null), we assume that
# input is already ratios. Proceed with paired test.
# Otherwise calculate class1/class2
if (is.null(y)){
    ratios.list[[1]] <- gr1
    fold.change <- apply(gr1,1,mean,na.rm=TRUE)
    ratios.list.Class2[[1]] <- 1/gr1
    ratios.list.Class2[[1]] <- mzmatch.R.wrongvalues(ratios.list.Class2[[1]])
} else {
    data1.ave=apply(gr1, 1, mean, na.rm=TRUE)
    data2.ave=apply(gr2, 1, mean, na.rm=TRUE)
    if(logged==TRUE){
        ratios.list[[1]] <- gr1[,1:npairs] - gr2[,1:npairs]
        ratios.list[[1]] <- mzmatch.R.wrongvalues(ratios.list[[1]])
        fold.change=data1.ave-data2.ave
    }else{
        ratios.list[[1]] <- gr1[,1:npairs]/gr2[,1:npairs]
        ratios.list[[1]] <- mzmatch.R.wrongvalues(ratios.list[[1]])
        fold.change=data1.ave/data2.ave
    }
    if(logged==TRUE){
        ratios.list.Class2[[1]] <- gr2[,1:npairs]-gr1[,1:npairs]
      ratios.list.Class2[[1]] <- mzmatch.R.wrongvalues(ratios.list.Class2[[1]])
    }else{
        ratios.list.Class2[[1]] <- gr2[,1:npairs]/gr1[,1:npairs]
      ratios.list.Class2[[1]] <- mzmatch.R.wrongvalues(ratios.list.Class2[[1]])
    }  
}
if (RandomPairs>1){
    for (coln in 1:nrow(gr1)){
        val1 <- gr1[coln,]
        val2 <- gr2[coln,]
## get rid of NA's
        val1 <- sort(val1)
        val2 <- sort(val2)
# calculate ratios of ranodmly paired samples
        for (rp in 2:RandomPairs) {
            val1gr <- sample(val1)[1:npairs]
            val2gr <- sample(val2)[1:npairs]
            if (logged==TRUE){
                ratios.list[[rp]][coln,] <- val1gr-val2gr
            } else{
                ratios.list[[rp]][coln,] <- val1gr/val2gr
            }
            if (logged==TRUE){
                ratios.list.Class2[[rp]][coln,] <- val2gr-val1gr
          } else {
              ratios.list.Class2[[rp]][coln,] <- val2gr/val1gr
          }
        }
    }
}
## Rank products calculation
prod_out <- list ()
prod_out.Class2 <- list()
for (randpair in 1:RandomPairs){
    if (logged==TRUE){
        inputmatrix <- ratios.list[[randpair]]
    } else {
        MIN <- min(ratios.list[[randpair]],na.rm=TRUE)
        if (MIN <=0) {
            cat ("Error: Input data contain negative values or
                values equal to 0.","\n")
        stop ()	
        } else {
            inputmatrix <- log (ratios.list[[randpair]], 10)
        }
    }
    
## replace values in badrows with NA, after "order" function rank to those
## rows will be set to NA, and ignored in sum or prod calculation
if (length(badrows)!=0){
    inputmatrix[badrows,] <- NA
}
output <- matrix(ncol=ncol(inputmatrix),nrow=nrow(inputmatrix))
inputmatrix.Class2 <- inputmatrix*(-1)
output.Class2 <- matrix(ncol=ncol(inputmatrix.Class2),
                        nrow=nrow(inputmatrix.Class2))
for (coln in 1:ncol(inputmatrix)){
# After assigning rank with "order" function, ranks for NA's are set back to NA
    NAs <- which(is.na(inputmatrix[,coln]))
    if (length(NAs)>0){
        output[order(inputmatrix[,coln],na.last=NA),coln] <-
                    c(1:(nrow(inputmatrix)-length(NAs)))
    } else {
        output[order(inputmatrix[,coln]),coln] <- c(1:nrow(inputmatrix))
    }
}
for (coln in 1:ncol(inputmatrix.Class2)){
    NAs <- which(is.na(inputmatrix.Class2[,coln]))
        if (length(NAs)>0){
        output.Class2[order(inputmatrix.Class2[,coln],na.last=NA),coln] <-
                            c(1:(nrow(inputmatrix.Class2)-length(NAs)))
        }else{
            output.Class2[order(inputmatrix.Class2[,coln]),coln] <-
                                    c(1:nrow(inputmatrix.Class2))
        }
}
#saving the ranks of each pairing, if the parameter huge is set to true
#these variables are not saved in order to save memory
if (!huge){
    ranks1[[randpair]]<-output
    ranks2[[randpair]]<-output.Class2
}
#performing the Rank Product or the Rank Sum according to the
#parameter calculateProduct
if (calculateProduct==TRUE){
    ProdFunc <- function(x, DATA) exp(mean(log(DATA[x,]),na.rm=TRUE))
    products <- sapply (1:nrow(output),ProdFunc, DATA=output)
    products.Class2 <- sapply (1:nrow(output.Class2),ProdFunc,
                                DATA=output.Class2)
    } else {
        products <- apply(output,1,mean,na.rm=TRUE)
        products.Class2 <- apply (output.Class2,1, mean,na.rm=TRUE)
    }
prod_out[[randpair]] <- products
prod_out.Class2[[randpair]] <- products.Class2
}
prod_out <- do.call (rbind,prod_out)
ranks_out <- apply(prod_out,2,median,na.rm=TRUE)
prod_out.Class2 <- do.call (rbind,prod_out.Class2)
ranks_out.Class2 <- apply(prod_out.Class2,2,median,na.rm=TRUE)
### evaluation of pvalues and pfps
### if we are performing the RankProduct we use the
### fast algorithm to evaluate the pfps


pvals<-matrix(NA,nrow=nrow(data), ncol=2)
pfps<-matrix(NA,nrow=nrow(data),ncol=2)
if (calculateProduct==TRUE){
    ###check max value
    MaxRP <- which(ranks_out>nrow(data))
    MaxRP2 <- which(ranks_out.Class2>nrow(data))
    ranks_out[MaxRP] <- nrow(data)
    ranks_out.Class2[MaxRP2] <- nrow(data)
    #
    ind<-which(!is.na(ranks_out))
    try(pvals[ind,1]<-rankprodbounds(ranks_out[ind]^nrep,
            (nrow(data)-length(badrows)),nrep,Delta='geometric'),silent=TRUE)
    if (is.na(pvals[ind[1],1])){
        for (i in 1:nrow(pvals)){
            if(is.na(ranks_out[i])){
                pvals[i,1]<-NA
            }else {
                pvals[i,1]<-rankprodbounds(ranks_out[i]^nrep,
                        (nrow(data)-length(badrows)),nrep,Delta='geometric')
            }
        }
    }
    ind<-which(!is.na(ranks_out.Class2))
    try(pvals[ind,2]<-rankprodbounds(ranks_out.Class2[ind]^nrep,
            (nrow(data)-length(badrows)),nrep,Delta='geometric'),silent=TRUE)
    if (is.na(pvals[ind[1],2])){
        for (i in 1:nrow(pvals)){
            if(is.na(ranks_out.Class2[i])){
                pvals[i,2]<-NA
            }else {
                pvals[i,2]<-rankprodbounds(ranks_out.Class2[i]^nrep,
                        (nrow(data)-length(badrows)),nrep,Delta='geometric')
            }
        }
    }
    }else{ 
        ## using the fast function to evaluate the pvalues
        ij <- which(!is.na(ranks_out))
        ji <- which(!is.na(ranks_out.Class2))
        if(((nrep <= 30) |
            (nrep>=30 & nrep < 40 & (nrow(data)-length(badrows)<= 10^7)) |
            (nrep>=40 & nrep <= 50 & (nrow(data)-length(badrows)<= 10^6)) |
            (nrep>=50 & nrep <= 60 & (nrow(data)-length(badrows)<= 5*10^4)) |
            (nrep>=60 & nrep <= 70 & (nrow(data)-length(badrows)<= 10^4)))){
       
            pvals[,1]<- sapply(ranks_out[ij]*nrep, dice.sum.cdf, n=nrep,
                               s=nrow(data)-length(badrows))
                
            pvals[,2]<- sapply(ranks_out.Class2[ji]*nrep, dice.sum.cdf, n=nrep,
                               s=nrow(data)-length(badrows))
                
        }else if(fast == FALSE){
            
            pvals[,1]<- sapply(ranks_out[ij]*nrep, dice.sum.cdf.accurate, n=nrep,
                               s=nrow(data)-length(badrows))
            
            pvals[,2]<- sapply(ranks_out.Class2[ji]*nrep, dice.sum.cdf.accurate, n=nrep,
                               s=nrow(data)-length(badrows))
        } else{
            ord.RS1 <- order(ranks_out, na.last = TRUE)
            ord.RS2 <- order(ranks_out.Class2, na.last = TRUE)
            jhi <-1
            time <- 0
            str <- Sys.time()
            while((time < tail.time) & (jhi < (nrow(data)-length(badrows)))){
                pvals[ord.RS1[jhi],1]<-
                    dice.sum.cdf.accurate(ranks_out[ord.RS1[jhi]]*nrep,
                                             nrep,(nrow(data)-length(badrows)))
                pvals[ord.RS2[jhi],2]<-
                    dice.sum.cdf.accurate(ranks_out.Class2[ord.RS2[jhi]]*nrep,
                                             nrep,(nrow(data)-length(badrows)))  
                jhi<-jhi+1
                time <- as.numeric(Sys.time()-str, units="mins")
            }
            
            cat("\nThe highest p-values evaluated with the exact method are",
                pvals[ord.RS1[jhi-1],1],"and", pvals[ord.RS2[jhi-1],2]," ")
            NA1 <- which(is.na(pvals[,1]))
            NA2 <- which(is.na(pvals[,2]))
            pvals[NA1,1]<-pnorm(ranks_out[NA1]*nrep,nrep*(((nrow(data)-length(badrows))+1)/2),
                               sqrt(((((nrow(data)-length(badrows))^2)-1)/12)*nrep))
            pvals[NA2,2]<-pnorm(ranks_out.Class2[NA2]*nrep,nrep*(((nrow(data)-length(badrows))+1)/2),
                                   sqrt(((((nrow(data)-length(badrows))^2)-1)/12)*nrep))
         }
}




### evaluation of the pfps from the pvalues
rankorder1 <- rep(NA,nrow(pvals))
rankorder1[order(pvals[,1],na.last=NA)] <- 1:length(order(pvals[,1],na.last=NA))
rankorder2 <- rep(NA,nrow(pvals))
rankorder2[order(pvals[,2],na.last=NA)] <- 1:length(order(pvals[,2],na.last=NA))
for (i in 1:nrow(data)){
    pfps[i,1] <- (pvals[i,1]*(nrow(data)-length(badrows)))/rankorder1[i]
    pfps[i,2] <- (pvals[i,2]*(nrow(data)-length(badrows)))/rankorder2[i]
}  
# output RPs (RSs) and RPrank (RSrank)
out <- list()
if (calculateProduct==TRUE){
    out$RPs <- cbind(ranks_out,ranks_out.Class2)
    RPrank<-matrix(NA,nrow(data),2)
    RPrank[,1]<-rank(out$RPs[,1], na.last =TRUE)
    RPrank[badrows,1]<-NA
    RPrank[,2]<-rank(out$RPs[,2], na.last =TRUE)
    RPrank[badrows,2] <- NA
    colnames (RPrank) <- c("class1 < class2","class1 > class2")
    colnames(out$RPs) <- c("class1 < class2","class1 > class2")
    out$RPrank<-RPrank
} else {
    out$RSs <- cbind(ranks_out,ranks_out.Class2)
    RSrank<-matrix(NA,nrow(data),2)
    RSrank[,1]<-rank(out$RSs[,1], na.last =TRUE)
    RSrank[badrows,1]<-NA
    RSrank[,2]<-rank(out$RSs[,2], na.last =TRUE)
    RSrank[badrows,2] <- NA
    colnames (RSrank) <- c("class1 < class2","class1 > class2")
    colnames(out$RSs) <- c("class1 < class2","class1 > class2")
    out$RSrank<-RSrank
    }
# output pfps, pvals, AveFC, groups, RandomPairs, nrep
colnames (pfps) <- c("class1 < class2","class1 > class2")
colnames (pvals) <- c("class1 < class2","class1 > class2") 
out$pfp<-pfps
out$pval<-pvals
fold.change=t(t(fold.change))
fold.change[badrows]<-NA
out$AveFC<-fold.change
colnames(fold.change)="log/unlog(class1/class2)"
out$groups <- cl
out$RandomPairs_ranks <- prod_out
out$nrep <- nrep
# if the parameter huge is set to TRUE the allrank1, allrank2 and Orirank
# are not saved in order to save space
if(!huge){
    if(RandomPairs >1){
        allrank1<-ranks1
        allrank2<-ranks2
        names(allrank1)=c("class1 < class2","class1 > class 2")
        names(allrank2)=c("class1 < class2","class1 > class 2")
        out$allrank1<-allrank1
        out$allrank2<-allrank2
#### evaluating the orirank output as it was in the old version##
        out$Orirank<-computeOrirank(gr1,gr2,logged)
################################################
    }
}
if(!is.null(gene.names)){
    out<-genenames(out,gene.names,RandomPairs)
}
if(plot){
    plotPFP(out,calculateProduct)
}
cat ("\n done  ")  
out
}

Try the RankProd package in your browser

Any scripts or data that you put into this service are public.

RankProd documentation built on Nov. 8, 2020, 8 p.m.