R/nlhc.R
In nlnet: Nonlinear Network, Clustering, and Variable Selection Based on DCOL

Documented in nlhc

nlhc <-
function(array, hamil.method="nn", concorde.path=NA, use.normal.approx=FALSE, normalization="standardize", combine.linear=TRUE, use.traditional.hclust=FALSE, method.traditional.hclust="average")
{
    #library(TSP)
    
    if(hamil.method=="linkern")
    {
        if(is.na(concorde.path))
        {
            message("linkern is chosen but concorde path wasn't given.")
            return(-1)
        }else{
            concorde_path(concorde.path)
        }
    }
    
    gene.specific.null<-function(array, B=500)
    {
        null.mat<-matrix(0, nrow=nrow(array), ncol=B)
        l<-ncol(array)
        d.array<-array[,1:(l-1)]
        for(i in 1:B)
        {
            this.order<-sample(l, l, replace=FALSE)
            for(j in 1:(l-1)) d.array[,j]<-abs(array[,this.order[j+1]]-array[,this.order[j]])
            null.mat[,i]<-apply(d.array, 1, sum)
        }
        r<-cbind(apply(null.mat, 1, mean), apply(null.mat, 1, sd))
        return(r)
    }
    
    gene.specific.p<-function(null.distr, new.d)
    {
        for(i in 1:length(new.d))
        {
            new.d[i]<-pnorm(new.d[i], mean=null.distr[i,1], sd=null.distr[i,2], lower.tail=TRUE)
        }
        return(new.d)
    }
    
    sim.emp.distri<-function(m, normalization="standardize", n.perm=1e5)
    {
        r<-rep(0, n.perm)
        if(normalization == "normal_score")
        {
            a<-qnorm((1:m)/(1+m))
            for(i in 1:n.perm)
            {
                a2<-sample(a, m, replace=F)
                r[i]<-sum(abs(diff(a2)))
            }
        }else{
            for(i in 1:n.perm)
            {
                a2<-rnorm(m)
                r[i]<-sum(abs(diff(a2)))
            }
            
        }
        return(r)
    }
    
    scol.matrix.order<-function(a,x) # x is the vector, a is the matrix, find ordered distance of rows.of.a|x
    {
        if(is.null(nrow(a)) | nrow(a) == 1)
        {
            a<-as.vector(a)
            a<-a[order(x)]
            d<-a[2:length(a)]-a[1:(length(a)-1)]
            dd<-sum(abs(d),na.rm=T)
        }else{
            a<-a[,order(x)]
            d<-a[,2:ncol(a)]-a[,1:(ncol(a)-1)]
            dd<-apply(abs(d),1,sum,na.rm=T)
        }
        return(dd)
    }
    
    scol.matrix<-function(a, direction=2)    # when direction is 1, scol.matrix[i,j] = SCOL(a[i,], a[j,]), j|i
    {
        
        rdmat<-matrix(0, ncol=nrow(a), nrow=nrow(a))
        for(j in 1:nrow(a))
        {
            rdmat[j,]<-scol.matrix.order(a, a[j,])
        }
        
        if(direction == 2)
        {
            rdmat.diff<-rdmat-t(rdmat)
            sel<-which(rdmat.diff > 0)
            rdmat[sel]<-t(rdmat)[sel]
        }
        return(rdmat)
    }
    
    hamil<-function(z, method.1="linkern")  # z is a matrix, with column being genes
    {
        #library(TSP)
        d<-dist(z, method="man")
        tsp<-TSP(d)
        tsp<-insert_dummy(tsp, label = "cut")
        tour<-solve_TSP(tsp, method=method.1)
        path <- cut_tour(tour, "cut")
        
        return(list(path, attributes(tour)$tour_length))
    }
    
    normscore.row<-function(a)
    {
        #library(coin)
        b<-t(apply(a, 1, normal_trafo))
        return(b)
    }
    
    normrow<-function(a)
    {
        m<-apply(a,1,mean,na.rm=T)
        s<-apply(a,1,sd,na.rm=T)
        a<-(a-m)/s
        return(a)
    }
    
    find.min.pos<-function(d)
    {
        pos<-which(d==min(d))[1]
        pos.x<-pos %% nrow(d)
        if(pos.x == 0) pos.x<-nrow(d)
        pos.y<-floor((pos-1)/nrow(d)+1)
        pos<-c(pos.x, pos.y)
        return(pos)
    }
    
    reorg.tree<-function(h)
    {
        h2<-h
        used<-rep(F, nrow(h$merge))
        used[1]<-T
        corresponder<-h$height*0
        corresponder[1]<-1
        
        for(i in 2:nrow(h$merge))
        {
            next.height<-min(h$height[!used])
            sel<-which(h$height==next.height & !used)[1]
            corresponder[i]<-sel
            this<-h$merge[sel,]
            if(this[1] > 0) this[1]<-which(corresponder == this[1])
            if(this[2] > 0) this[2]<-which(corresponder == this[2])
            
            used[sel]<-T
            h2$merge[i,]<-this
            sel.2<-which(!used)
            h2$height[i]<-h$height[sel]
        }
        return(h2)
    }
    
    # initiation
    n<-nrow(array)
    m<-ncol(array)
    
    if(use.normal.approx & normalization=="none") normalization<-"standardize"
    
    if(normalization=="normal_score")
    {
        a<-normscore.row(array)
    }else{
        if(normalization=="standardize")
        {
            a<-normrow(array)
        }else{
            if(normalization=="none")
            {
                a<-array
            }else{
                return("wrong normalization method.")
            }
        }
    }
    
    if(!use.normal.approx)
    {
        null.distr<-gene.specific.null(a)
    }else{
        m.emp<-2/sqrt(pi)*(m-1)
        s.emp<-sqrt(m-1)*4*(2*pi+3*sqrt(3)-9)/3/pi
        null.distr<-cbind(rep(m.emp, nrow(a)), rep(s.emp, nrow(a)))
    }
    
    orig.a<-a        # a records cluster profiles; orig.a records gene profiles
    
    sim.mat<-scol.matrix(a,direction=1)  ## similarity matrix by SCOL, asymmetric, column given row
    gc()
    d.mat<-sim.mat
    if(!use.normal.approx)
    {
        for(i in 1:nrow(d.mat)) d.mat[i,]<-log10(gene.specific.p(null.distr, sim.mat[i,]))
    }else{
        d.mat<-log10(pnorm(d.mat, mean=null.distr[1,1], sd=null.distr[1,2], lower.tail=TRUE))
    }
    diag(d.mat)<-0        # d.mat (column given row) records log10 p-value between clusters
    
    if(combine.linear)
    {
        d.linear<-cor(t(a))
        d.linear<- -abs(d.linear)*sqrt((m-2)/(1-d.linear^2))
        d.linear<-2*pt(d.linear, df=m-2, lower.tail=T)
        d.linear<-log10(d.linear)
        diag(d.linear)<-0
        min.non.inf<-min(d.linear[d.linear != -Inf])
        d.linear[d.linear == -Inf]<-min.non.inf
        
        sel<-which(d.linear < d.mat)
        d.mat[sel]<-d.linear[sel]
        rm(d.linear)
    }
    gc()
    
    if(!use.traditional.hclust)
    {
        grps<-lab<--(1:nrow(a))
        # grps records which cluster the gene belongs to
        # lab points to which row of a has a cluster profile
        
        ptr<-1
        
        h.merge<-matrix(0,ncol=2,nrow=n-1)
        h.height<-rep(0,n-1)
        h.order<-NULL
        
        # iteration
        # every round, select the lowest distance and merge
        h<-hclust(as.dist(d.mat[1:2,1:2]))
        
        while(ptr<nrow(a))
        {
            pos<-find.min.pos(d.mat)
            h.height[ptr]<-min(d.mat)
            h.merge[ptr,]<-lab[pos]
            this.labs<-sort(lab[pos])
            
            if(sum(this.labs < 0) == 2)
            {
                h.order<-c(h.order, this.labs)
                mem<--this.labs
            }else{
                if(sum(this.labs < 0) == 1)
                {
                    this.members<--which(grps==this.labs[2])
                    mem<-c(-this.members, -this.labs[1])
                    to.insert<-max(which(h.order %in% this.members))
                    if(to.insert == length(h.order))
                    {
                        h.order<-c(h.order, this.labs[1])
                    }else{
                        h.order<-c(h.order[1:to.insert], this.labs[1], h.order[(to.insert+1):length(h.order)])
                    }
                }else{
                    this.members.1<--which(grps==this.labs[1])
                    this.members.2<--which(grps==this.labs[2])
                    mem<--c(this.members.1, this.members.2)
                    this.range.1<-range(which(h.order %in% this.members.1))
                    this.range.2<-range(which(h.order %in% this.members.2))
                    if(this.range.1[2] > this.range.2[2])
                    {
                        this.range.0<-this.range.2
                        this.range.2<-this.range.1
                        this.range.1<-this.range.0
                    }
                    if(this.range.2[1] == this.range.1[2]+1)
                    {}else{
                        range.between<-c(this.range.1[2]+1, this.range.2[1]-1)
                        if(this.range.2[2] == length(h.order))
                        {
                            h.order<-c(h.order[1:this.range.1[2]], h.order[this.range.2[1]:this.range.2[2]], h.order[range.between[1]:range.between[2]])
                        }else{
                            h.order<-c(h.order[1:this.range.1[2]], h.order[this.range.2[1]:this.range.2[2]], h.order[range.between[1]:range.between[2]], h.order[(this.range.2[2]+1):length(h.order)])
                        }
                    }
                }
            }
            
            lab[pos[1]]<-ptr
            lab[pos[2]]<-NA
            a[pos[2],]<-rep(NA, m)
            grps[mem]<-ptr
            
            trav<-hamil(t(orig.a[mem,]), method.1=hamil.method)
            prof<-qnorm((1:m)/(m+1))[order(trav[[1]])]
            a[pos[1],]<-prof
            
            prof<-matrix(prof, nrow=1)
            
            all.given.curr<-scol.matrix.order(orig.a, prof)
            all.given.curr<-gene.specific.p(null.distr,all.given.curr)
            curr.given.all<-d.mat[,mem]
            
            d.mat[pos[1],]<-log10(all.given.curr)
            d.mat[pos[2],]<-NA
            
            uniq.grps<-unique(grps)
            all.given.curr<-qnorm(all.given.curr)
            grp.given.curr<-all.given.curr
            grp.given.curr[is.na(lab)]<-NA
            
            grp.given.curr[which(grps<0)]<-log10(pnorm(grp.given.curr[which(grps<0)], lower.tail=T))
            
            for(i in uniq.grps[uniq.grps>0])
            {
                this<-mean(all.given.curr[which(grps == i)])
                grp.given.curr[which(lab == i)]<-log10(pnorm(this, lower.tail=T))
            }
            
            if(length(mem) > 1)
            {
                curr.given.all<-qnorm(10^curr.given.all)
                curr.given.all<-apply(curr.given.all,1,mean)
                curr.given.all<-log10(pnorm(curr.given.all, lower.tail=T))
            }
            
            new.sim<-apply(cbind(grp.given.curr, curr.given.all), 1, min, na.rm=F)
            new.sim[is.na(new.sim)]<-0
            d.mat[pos[1], ] <- d.mat[, pos[1]] <- new.sim
            diag(d.mat)<-0
            d.mat[pos[2],]<-d.mat[,pos[2]]<-rep(0, n)
            
            ptr<-ptr+1
        }
        
        h.height.0<-h.height
        for(i in (n-1):2)
        {
            for(k in 1:2)
            {
                if(h.merge[i,k]>0)
                {
                    if(h.height[h.merge[i,k]] >= h.height[i]) h.height[h.merge[i,k]]<-h.height[i]-0.01/nrow(array)
                }
            }
        }
        
        h$merge<-h.merge
        h$order<--h.order
        h$height<-h.height
        h$height.0<-h.height.0
        h$call[1]<-"NLHC"
        h$call[2]<-"SCOL matrix"
        h$method<-NULL
        
        return(reorg.tree(h))
    }else{
        rdmat.diff<-d.mat-t(d.mat)
        sel<-which(rdmat.diff > 0)
        d.mat[sel]<-t(d.mat)[sel]
        
        h<-hclust(as.dist(d.mat), method=method.traditional.hclust)
        return(h)
    }
}