R/obliqueRF.default.R

##--------------------------------------
## oblique random forest 
##--------------------------------------
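
## Example usage (an illustrative sketch, wrapped in if(FALSE) so it is not
## executed when this file is sourced; it assumes the obliqueRF package is
## installed and that its generic obliqueRF() dispatches to this method):
if(FALSE){
    set.seed(1)
    x <- matrix(rnorm(200*10), 200, 10)
    y <- as.integer(x[,1] + x[,2] + rnorm(200) > 0)  # binary 0/1 labels
    fit <- obliqueRF(x, y, ntree=50, training_method="ridge")
    table(predicted=fit$pred$pred.test, truth=fit$pred$y.test)
}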

obliqueRF.default<- function(
                    x,
                    y,
                    x.test=NULL,
                    y.test=NULL,
                    mtry=NULL,
                    ntree=100,
                    training_method="ridge",                    
                    bImportance=FALSE,
                    bProximity=FALSE,
                    verbose=FALSE,
                    ...
                    )
{

    num_split_vars <- mtry

    # always check out-of-bag error
    do_oob<-TRUE

    if(verbose)
      cat("obliqueRF: WARNING: status messages enabled; set \"verbose\" to false to disable\n");

	# --- importance ---

	impStruct<-list(bImportance=bImportance,importance_pvalue=0,importance_weight="");

    if(bImportance){
        impStruct$importance_pvalue <- 0.05 # p value
        impStruct$importance_weight <- "none"
        do_oob<-FALSE  
        
        if(bProximity){
            cat("obliqueRF: WARNING: importance and proximity cannot be calculated at the same time. Will set \'bProximity = FALSE\' and continue.\n")
            bProximity = FALSE
        }

        if(!(training_method=="log")){
            cat("obliqueRF: WARNING: importance requires training_method=\"log\". Adjusting training_method...\n");
            training_method="log"    
        }

    }
    
    # --- check training method selection ---

    node_train_method=NULL;    
    if(training_method=="ridge"){        
        node_train_method=ridge_svd_wrapper
        cat("obliqueRF: using fast ridge regression with SVD as node model,\n does not scale data at the node,\n tests regularization parameter \"lambda\" for 10^(c(-5:5)).\n In case of problems with the SVD variant try training_method=\"ridge_slow\" for separate ridge regressions (slower).\n")
    }
    if(training_method=="ridge_slow"){        
        node_train_method=ridge_slow_wrapper
        cat("obliqueRF: using slow ridge regression as node model,\n does not scale data at the node,\n tests regularization parameter \"lambda\" for 10^(c(-5:5)) with separate explicit ridge regressions.\n")
    }
    if(training_method=="log"){
        node_train_method=log_wrapper
        cat("obliqueRF: using logistic regression as node model.\n")        
    }
    if(training_method=="svm"){
        node_train_method=svm_wrapper        
        cat("obliqueRF: using linear SVM as node model,\n scales data at the node,\n tests regularization parameter \"cost\" for 10^(c(-3,-1,1)).\n")        
    }
    if(training_method=="pls"){
        node_train_method=pls_wrapper        
        cat("obliqueRF: using partial least squares regression as node model,\n does not scale data at the node,\n optimizes regularization parameter \"ncomp\" (number of components) for 1:min(20, mtry).\n")
    }        
    if(training_method=="rnd"){        
        node_train_method=rnd_wrapper
        cat("obliqueRF: using a random hyperplane as node model,\n scales data to unit variance at the node,\n coefficients of random subspace projection are chosen from a normal distribution with unit variance.\n")
    }
	if( is.null(node_train_method) )
	{
		stop("obliqueRF: the training method is not valid, choose one of the following:\n  ridge, ridge_slow, log, svm, pls or rnd.\n  See documentation for more details. Aborting");
	}

    node_pred_method=predict

    # generic impurity support
    node_impurity_method=gini_impurity_wrapper;


    # ------------- DATA -------------

    # make sure we have valid data
    if(
      any(is.nan(as.vector(x))) |
      any(is.infinite(as.vector(x))) |
      any(is.nan(as.vector(y))) |
      any(is.infinite(as.vector(y)))
      )
    stop("obliqueRF: Training data contains NaNs or Inf values. Please correct and try again. Aborting...");

    if(is.null(y.test)){
        cat("obliqueRF: no test set defined. Will use training data.\n")
        x.test<-x
        y.test<-y    
    }
    if(
      any(is.nan(as.vector(x.test))) |
      any(is.infinite(as.vector(x.test))) |
      any(is.nan(as.vector(y.test))) |
      any(is.infinite(as.vector(y.test)))
      )
      stop("obliqueRF: Test data contains NaNs or Inf values. Please correct and try again. Aborting...");

    # labels: transform factors etc. to numerical values in {0,1}
    clnames <- sort(unique(paste(y)))
    if(!all(y==0|y==1)){
        
         yout<-rep(0,length(y))
         ytestout<-rep(0,length(y.test))
         
         yout[y==clnames[1]]<-0
         ytestout[y.test==clnames[1]]<-0     
         yout[y==clnames[2]]<-1
         ytestout[y.test==clnames[2]]<-1     
        
         y<-yout
         y.test<-ytestout
    }    

    # get a data.frame from x, y
    num_vars=ncol(x);
    num_samples=nrow(x);    

    if(length(y)!=num_samples)
        stop("obliqueRF: number of rows in x and length of y do not match. Aborting...")

    test_set<-cbind(as.matrix(x.test),y.test)
    
    d<-cbind(as.matrix(x),y)
    cl_id=num_vars+1;
    cl<- y

    # number of variables to split on at each node
    if(is.null(num_split_vars))
        num_split_vars=max(as.integer(sqrt(num_vars)),2); # default
    if(num_split_vars > num_vars){
        stop('obliqueRF: mtry is larger than the number of variables. Aborting...')
    }

    # check number of classes: only two-class problems with labels 0/1 are supported
    num_classes<-length(unique(cl));
    if((num_classes!=2) || (min(cl)!=0))
        stop("obliqueRF: wrong number of classes; expecting two classes labelled 0 and 1. Aborting...")

    # ------------- VARIABLES -----------------

    # allocate variables
      nr_nodes=2*num_samples+1; #total number of possible nodes
      nd_min_size=1; #minimum size of a node
      oob_glob=rep(0, num_samples); #indicates how often a sample was oob

      oob_class_votes <- matrix(0, num_classes, nrow(d));
      test_set_class_votes<-matrix(0, num_classes, nrow(test_set));

      if(bImportance){
		  imp<-rep(0,ncol(x))
		  imp_class_votes<-matrix(0, num_classes, nrow(d));
      }

      oob_prox <- matrix(NA, nrow(d), ntree)

    # ------------- TRAIN -------------

     err<-c()
     err_0<-c()
     err_1<-c()    
    
    # main loop
    trees=vector("list",ntree);
	if(verbose)
        cat("obliqueRF: starting.",date(),"\n")
    
    for(curr_num_tree in 1:ntree)
        {   
			if(verbose)
			  if((curr_num_tree%%10)==1) cat("", curr_num_tree,"out of",ntree,"trees so far.",date(),"\n");

            class_n=rep(0, num_classes); #vector containing the class distribution in sample

            # sample with replacement
            k<-sample(1:num_samples, num_samples, replace=TRUE);

            # "bag" statistic, how often is a sample in the bag?
            sample_pop=as.numeric(table(factor(k,levels=1:num_samples)));
            # count the class distribution in the bootstrap sample
            # (keep both levels even if one is absent from the sample)
            class_n=as.numeric(table(factor(cl[k], levels=0:1)));

            dframe<-d
            b_sample=sample_pop; # sampling frequency and index vector for oob
            nr_diff_cases=length(sample_pop); #all rows are passed to the tree; out-of-bag rows carry zero weight via b_sample
            n_case=1:num_samples;   #translation to original case numbers (needed for class population, etc.)

            # ---------------------- BUILDTREE ------------------------
            # build the tree, i.e. nr of columns in dframe (-1 for cl)

            tree<-buildtree(
                dframe,
                b_sample,
                class_n,
                n_case,
                nr_nodes,
                nd_min_size,
                nr_diff_cases,
                num_vars,
                num_split_vars,
                num_classes,
                node_train_method,
                node_pred_method,
				node_impurity_method,
                num_samples,
                impStruct
                );
            trees[[curr_num_tree]]=tree;

            # ---------------------- PREDICT ------------------------

			# convert the node_class from 0/1 to 1/2
			node_classIdx=tree$node_class+1;

            # -----------------
            # oob error
            if(do_oob){
				#global: count the oob's
				oob_glob=oob_glob+(sample_pop==0)*1;

				for(M in which(sample_pop==0)) # only oob samples
				{
					curr_node=1;
					for(i in 1:(tree$total_nr_nodes))
					{
						if(tree$node_status[curr_node]==-1) #==terminal node
						{
							# increment class count by 1
							oob_class_votes[node_classIdx[curr_node],M]=
							  oob_class_votes[node_classIdx[curr_node],M]+1;
							  
							oob_prox[M,curr_num_tree]<-curr_node
							break;
						}
						dset<-d[M, tree$best_vars[,curr_node]]; # *NOT* test_set
						score=sum(tree$training_stats[,curr_node]*dset);
						if(score<tree$best_split_values[curr_node])
							curr_node=tree$treemap[1, curr_node]
						else
							curr_node=tree$treemap[2, curr_node];
					}
				}
            }
            # -----------------
            # importance

            if(bImportance){
                for(pi in 1:ncol(tree$significance))
                    imp[tree$best_vars[,pi]]<-imp[tree$best_vars[,pi]] + tree$significance[,pi]
            }

            # -----------------
            # test-set predictions (the test set defaults to the training data when none was supplied)
            
                for(M in 1:nrow(test_set))
                {
                    curr_node=1;
                    for(i in 1:(tree$total_nr_nodes))
                    {
                        if(tree$node_status[curr_node]==-1) #==terminal node
                        {
						    # increment class count by 1
                            test_set_class_votes[node_classIdx[curr_node],M]=
						      test_set_class_votes[node_classIdx[curr_node],M]+1;
                            break;
                        }
                        dset<-test_set[M, tree$best_vars[,curr_node]];
                        score=sum(tree$training_stats[,curr_node]*dset);
                        if(score<tree$best_split_values[curr_node])
                            curr_node=tree$treemap[1, curr_node]
                        else
                            curr_node=tree$treemap[2, curr_node];
                    }
                }

            # -------------------- EVAL --------------------

            # each tree casts one vote per sample, so the column sum equals
            # the number of trees grown so far
            v<- 1*(test_set_class_votes[2,]/sum(test_set_class_votes[,1])>.5)
            m<-table(factor(y.test,levels=c(0,1)),factor(v,levels=c(0,1)))
            err_0[curr_num_tree]<- m[1,2]/sum(m[1,])
            err_1[curr_num_tree]<- m[2,1]/sum(m[2,])
            err[curr_num_tree]<- (m[1,2]+m[2,1])/sum(m)


        
     } # loop over trees

    test.votes<-test_set_class_votes[2,]/sum(test_set_class_votes[,1]) # fraction of trees voting for class 1

    # calc proximity
    if(bProximity){
		oob_counts<-matrix(0,nrow(d),nrow(d)) # diag(oob_counts) is oob_glob
		#count how often each pair of samples was oob in the same tree
		for(op in 1:ncol(oob_prox)){                
			wh<-which(!is.na(oob_prox[,op]))
			oob_counts[wh, wh]<-oob_counts[wh, wh] +1
		}

		node_counts<-matrix(0,nrow(d),nrow(d)) #how often an oob pair ends in the same terminal node
		for(op in 1:ncol(oob_prox)){
			tabl<-table(oob_prox[,op])
			mults<-as.numeric(names(tabl)[tabl>1])
			for(mi in mults){
			wh<-which(oob_prox[,op]==mi)
			node_counts[wh, wh]<-node_counts[wh, wh] +1
			}
		} 
		dst<-1 - node_counts/oob_counts
		diag(dst)<-0
    }



    oob.votes<-NULL; thr<-NULL
    if(do_oob){
		oob.votes<-oob_class_votes[2,]/(oob_class_votes[1,]+oob_class_votes[2,])
		# accuracy-maximising vote threshold on the oob votes, via ROCR
		pred <- prediction(oob.votes, y)
		perf <- performance(pred,"acc")
		thr<-perf@x.values[[1]][which.max(perf@y.values[[1]])]
    }

    cl <- match.call()
    cl[[1]] <- as.name("obliqueRF")

    res<-list(
    	call=cl,
    	type = "classification",
        class_names = clnames,
    	errs=list(err_0 = err_0, err_1 = err_1, err = err),
    	pred=list(  pred.test = 1*(test.votes>.5),
    		    votes.test = test.votes,
    		    y.test = y.test,
    		    votes.oob = oob.votes,
    		    y = y),
    	lab=paste('oRF - node:', tree$lab),
    	ntree = ntree,
    	mtry = num_split_vars,
    	importance = if(bImportance) imp else NULL,
    	proximity = if(bProximity) dst else NULL,
    	num_classes = num_classes,
    	trees = trees
    )

    class(res) <- "obliqueRF"
    return(res);
}
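
## The per-tree prediction above walks the flattened tree arrays returned
## by buildtree(). A minimal standalone sketch of the same traversal
## (a hypothetical helper, not part of the package API, not run):
if(FALSE){
    classify_one <- function(tr, xrow){
        curr <- 1
        repeat{
            if(tr$node_status[curr] == -1)     # terminal node
                return(tr$node_class[curr])    # class label 0 or 1
            score <- sum(tr$training_stats[,curr] * xrow[tr$best_vars[,curr]])
            curr <- if(score < tr$best_split_values[curr])
                tr$treemap[1,curr] else tr$treemap[2,curr]
        }
    }
}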


##--------------------------------------------
## buildtree
##--------------------------------------------


buildtree <- function(
    dframe,
    b_sample,
    class_n,
    n_case,
    nr_nodes,
    nd_min_size,
    nr_diff_cases,
    num_vars,
    num_split_vars,
    num_classes,
    node_train_method,
    node_pred_method,
    node_impurity_method,
    num_samples,
    impStruct
    )
{
    class_distr_nodes=matrix(0, num_classes, nr_nodes); #class distribution per node
    node_status=rep(0, nr_nodes); #indicates what type a node is: 1: parent node, 2: has not been processed yet, -1: terminal node
    node_start=rep(0, nr_nodes);  #start index for a node in the data matrix
    node_length=rep(0, nr_nodes); #nr of cases belonging to that node, starting from node_start

    var_imp=rep(0, num_vars);

    best_vars=matrix(0, num_split_vars, nr_nodes); #holds the split variables
    treemap<-matrix(0,2,nr_nodes); #holds the indexes of the child nodes
    dgini=rep(0, nr_nodes);
    dgini_perc=rep(0, nr_nodes);
    significance=matrix(0, num_split_vars, nr_nodes);
    best_split_values=rep(0,nr_nodes);
    ridge_values=rep(NA,nr_nodes);
    training_stats=matrix(0, num_split_vars,nr_nodes);

    ncur=1; #index of the last allocated node; children are written to ncur+1 and ncur+2
    node_status[1]=2; #first node has not been split yet...
    node_start[1]=1;
    node_length[1]=nr_diff_cases;

    #copy the class-numbers
    class_distr_nodes[,1]=class_n;

    n_temp_case=n_case;
    #iterate over the node_status array and check if there are unprocessed nodes;
    #split if necessary until terminal nodes are reached
    nr_parent_nds=0;
    curr_node=0;
    lab<-" "
    
    for(curr_node in 1:nr_nodes){

        if(node_status[curr_node]==0) break;
        if(node_status[curr_node]!=2) next; #the current node is either a parent or a terminal node
        
        #start and end index of the cases in the data matrix
        nd_start=node_start[curr_node];
        nd_length=node_length[curr_node];
        nd_end=nd_start+nd_length-1;

        #get the class distribution for the current node
        class_n=class_distr_nodes[,curr_node];

        res<-try(findbestsplit(
           dframe=dframe[n_case[nd_start:nd_end],],
            b_sample=b_sample[n_case[nd_start:nd_end]],
            class_n,
            nd_start,
            nd_length,
            num_vars,
            num_split_vars,
            num_classes,
            node_train_method,
            node_pred_method,
		    node_impurity_method,
            impStruct
            ),TRUE);

        if((res$jstat==1)|(class(res)=="try-error"))
        {
            node_status[curr_node]=-1;
            next;
        }

        lab<-res$lab

        best_vars[,curr_node]=res$variables;
        dgini[curr_node]=res$decsplit;
        dgini_perc[curr_node]=res$decsplit_perc;
        significance[,curr_node]=res$signif;
        best_split_values[curr_node]=res$best_split_value;
        ridge_values[curr_node]=res$ridge_value;
        training_stats[,curr_node]=res$training_stats;
        var_imp[res$variables]=var_imp[res$variables]+abs(res$training_stats)*res$decsplit;

        #count the parent nodes
        nr_parent_nds=nr_parent_nds+1;

        #save size, etc. for the two child nodes
        node_length[ncur+1]=res$isplit;
        node_length[ncur+2]=node_length[curr_node]-res$isplit;
        node_start[ncur+1]=nd_start;
        node_start[ncur+2]=nd_start+res$isplit;

        #update n_case
        true_left_ind=(nd_start-1)+((1:node_length[curr_node])[res$split_vect==1]);
        true_right_ind=(nd_start-1)+((1:node_length[curr_node])[res$split_vect==0]);
        n_temp_case[nd_start:(nd_start+res$isplit-1)]=n_case[true_left_ind];
        n_temp_case[(nd_start+res$isplit):nd_end]=n_case[true_right_ind];
        n_case[nd_start:nd_end]=n_temp_case[nd_start:nd_end];

        #class population/distribution  # sum over b_sample
        for(n in nd_start:(nd_start+res$isplit-1))
        {
            nc=n_case[n];
            j=dframe[nc, num_vars+1]+1;
            class_distr_nodes[j,ncur+1]=class_distr_nodes[j,ncur+1]+b_sample[nc];
        }
        for(n in (nd_start+res$isplit):nd_end)
        {
            nc=n_case[n];
            j=dframe[nc, num_vars+1]+1;
            class_distr_nodes[j,ncur+2]=class_distr_nodes[j,ncur+2]+b_sample[nc];
        }

        #adjust the node status
        node_status[ncur+1]=2;
        node_status[ncur+2]=2;
        if (node_length[ncur+1]<=nd_min_size) node_status[ncur+1]=-1;
        if (node_length[ncur+2]<=nd_min_size) node_status[ncur+2]=-1;

        #if pure node then terminal node
        if(sum(class_distr_nodes[,ncur+1]>0)==1) node_status[ncur+1]=-1;
        if(sum(class_distr_nodes[,ncur+2]>0)==1) node_status[ncur+2]=-1;
        nr_samples=sum(class_distr_nodes[,ncur+1])+sum(class_distr_nodes[,ncur+2]);

        #remember the child nodes
        treemap[1,curr_node]=ncur+1;
        treemap[2,curr_node]=ncur+2;

        node_status[curr_node]=1; #current node has been processed
        ncur=ncur+2;
    } #for(curr_node in 1:nr_nodes)

    #normalise the variable importance by the number of split (parent) nodes
    if(nr_parent_nds>0) var_imp=var_imp/nr_parent_nds;

    total_nr_nodes=curr_node;
    node_status[node_status==2]=-1;

    #get the class for each node
    node_class=apply(class_distr_nodes[,1:total_nr_nodes],2,which.max)-1;

    res2<-list(
        best_vars=best_vars[,1:total_nr_nodes], 
        best_split_values=best_split_values[1:total_nr_nodes], 
        ridge_values=ridge_values[1:total_nr_nodes], 
        training_stats=training_stats[,1:total_nr_nodes], 
        significance=significance[,1:total_nr_nodes],
        node_class=node_class, 
        dgini=dgini[1:total_nr_nodes], 
        dgini_perc=dgini_perc[1:total_nr_nodes], 
        total_nr_nodes=total_nr_nodes, 
        treemap=treemap[,1:total_nr_nodes], 
        node_status=node_status[1:total_nr_nodes], 
        var_imp=var_imp,
        lab=lab
        );

    return(res2);
}
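
## Layout of the flattened tree returned by buildtree(): node 1 is the
## root; treemap[1,k] and treemap[2,k] hold the left/right children of
## node k; node_status[k] is 1 for a split node and -1 for a terminal
## node; a sample goes left when the projection
## sum(training_stats[,k] * x[best_vars[,k]]) falls below
## best_split_values[k].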


##--------------------------------------------
## findbestsplit
##--------------------------------------------


findbestsplit <- function(
    dframe,
    b_sample,
    class_n,
    nd_start,
    nd_length,
    num_vars,
    num_split_vars,
    num_classes,
    node_train_method,
    node_pred_method,
    node_impurity_method,
    impStruct
    )
{
    #--------

    #draw some variable indices without replacement
    vars<- sample(1:num_vars, num_split_vars);

    #construct the subset (don't forget the labels!)
    temp_matrix<-dframe[,vars];
    cls<-dframe[,num_vars+1];



    #do the training 
    #we can get warnings from the training function - ignore for now
      ow<-options("warn");
	  options(warn=-1);
      z<- try(node_train_method(x=temp_matrix, y=cls, ind = b_sample, impStruct = impStruct),TRUE);

      options(ow);

      lab<-" "
      
      if((class(z)=="try-error") || (sum(is.na(z$coef))>0) || (table(cls)==1) || is.null(z$coef) )
      {
	  jstat=1;
	  res<-list(jstat=jstat);
	  return(res);
      }

    #call some impurity function
    impurity_meas<-node_impurity_method(temp_matrix, cls, b_sample, class_n, num_classes, nd_length, z);

    #if something went wrong we have no valid impurity data;
    #jstat is checked by the calling function as well
    if( impurity_meas$jstat==1 )
        return(impurity_meas);


    #assign the results
    res<-list(
        jstat=impurity_meas$jstat,
        variables=vars,
        isplit=impurity_meas$isplit,
        split_vect=impurity_meas$split_vect,
        decsplit=impurity_meas$decsplit,
        decsplit_perc=impurity_meas$decsplit_perc,
        best_split_value=impurity_meas$best_split_value,
        ridge_value=z$ridge_value,
        training_stats=z$coef,
        signif=z$signif,
        lab=z$lab
        );
    return(res);
}
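
## findbestsplit() returns either list(jstat=1) on failure or a full
## result holding the sampled variable indices, the oblique coefficients
## (training_stats), the cut position isplit in the score ordering, the
## left/right assignment split_vect, and the impurity decrease decsplit.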



##--------------------------------------------
## Impurity wrapper
##--------------------------------------------
gini_impurity_wrapper<-function(dmatrix, cls, b_sample, class_n, num_classes, nd_length, z,...)
{
    #Gini criterion of the unsplit node: crit0 = sum(class_n^2)/sum(class_n)
    pno<-sum(class_n^2); #numerator
    pdo<-sum(class_n);   #denominator

    crit0<-pno/pdo;
    critmax<-(-1.0e20);
    jstat<-0;
    isplit<-0;


    #get the scores: project each sample onto the node's split direction
    pred<-dmatrix%*%z$coef;
    #sort them
    pred_sort_matching<-sort.list(pred);
    p<-pred[pred_sort_matching];

    uv<-b_sample[pred_sort_matching];
    clv<-cls[pred_sort_matching]+1;

    rldv<-cumsum(uv);
    rrdv<-pdo-rldv;

    cum_class<-array(0,c(num_classes,nd_length,2));
    temp_uv<-numeric(nd_length);
    for(i in 1:num_classes)
    {
        idv<-clv==i;
        temp_uv[idv]<-uv[idv];
        temp_cumsum<-cumsum(temp_uv);
        cum_class[i,,1]<-(temp_cumsum^2);
        cum_class[i,,2]<-((class_n[i]-temp_cumsum)^2);
        temp_uv<-numeric(nd_length);
    }
    rlnv<-apply(cum_class[,,1], 2, sum);
    rrnv<-apply(cum_class[,,2], 2, sum);

    crit<-(rlnv/rldv)+(rrnv/rrdv);
    isplit<-which.max(crit)
    critmax<-crit[isplit];

    if (critmax<(-1.0e10))
    {
        jstat<-1;
        res<-list(jstat=jstat);
        return(res);
    }

    split_vect<-numeric(nd_length);
    #assign 1 to the first group, the 2nd will stay 0
    split_vect[pred_sort_matching[1:isplit]]<-1;
    decsplit<-((critmax-crit0));

    #calculate the offset
    best_split_value<-(p[isplit]+p[isplit+1])/2;
    #assign the results
    res<-list(
	jstat=jstat,
        isplit=isplit,
        split_vect=split_vect,
        decsplit=decsplit,
        best_split_value=best_split_value,
        decsplit_perc=(decsplit/crit0)
        );
    return(res);
}
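
## Toy illustration of the split criterion above (a sketch, not run):
## with unit weights the maximised quantity reduces to
##   crit(i) = (n_L0^2 + n_L1^2)/n_L + (n_R0^2 + n_R1^2)/n_R
## over cut positions i in the score ordering; a perfectly separable
## node attains the maximum at the class boundary.
if(FALSE){
    cls <- c(0,0,1,1)                   # two samples per class
    z   <- list(coef=1)                 # identity projection
    gini_impurity_wrapper(matrix(c(-2,-1,1,2)), cls, b_sample=rep(1,4),
                          class_n=c(2,2), num_classes=2, nd_length=4, z)
    # expected: isplit=2, best_split_value=0, decsplit=2 (crit 4 vs crit0 2)
}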





## -------------------------------------------
## NODE MODELS
## -------------------------------------------


##--------------------------------------------
## Logistic wrapper
## if "IMPORTANCE" is chosen ANOVA is calculated at every split
##--------------------------------------------


log_wrapper<-function(x=NULL, y=NULL, ind, impStruct,...) {

   ## what samples: get bootstrapped data set 
   wh<-c();
   for(i in 1:max(ind)) wh<-c(wh,which(ind>=i))

   ## what features: remove constant variables
   wp<-which(unlist(lapply(1:ncol(x),function(A) length(unique(x[wh,A]))>1)))

    X<-scale(x[wh,wp])
    z<-try(glm(y[wh]~.,data=as.data.frame(X),family=binomial(link = "logit")),TRUE)

    if(class(z)[1]!="try-error")
    {
        COEF<-coef(z)[-1]/attr(X,"scaled:scale")  

        v_norm=sqrt(COEF%*%COEF);
        z$coef<-rep(0,ncol(x))
        z$coef[wp]<-COEF/v_norm;

	    z$signif<-rep(0,ncol(x))
		if(impStruct$bImportance)
		{
		    if(impStruct$importance_weight == "none")  
		     z$signif[wp]<- c(-1,1)[1+(anova(z, test="Chisq")[-1,5]<impStruct$importance_pvalue)*1]
		    if(impStruct$importance_weight == "length")  
		     z$signif[wp]<- c(-1,1)[1+(anova(z, test="Chisq")[-1,5]<impStruct$importance_pvalue)*1]*length(wh)
		}

        z$lab<-"logit"
        z$ridge_value = 0
    }
    return(z);
}
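
## All node models expand the per-sample bag counts in `ind` into a row
## index vector with repetitions. A quick equivalence check of that idiom
## (a sketch, not run):
if(FALSE){
    ind <- c(2,0,1,3)                          # bag counts for 4 samples
    wh <- c(); for(i in 1:max(ind)) wh <- c(wh, which(ind>=i))
    all(sort(wh) == rep(seq_along(ind), ind))  # TRUE
}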



##--------------------------------------------
## PLS  wrapper 
##--------------------------------------------



pls_wrapper<-function(x=NULL, y=NULL, ind,...) {

       ## what samples: get bootstrapped data set 
       wh<-c();
       for(i in 1:max(ind)) wh<-c(wh,which(ind>=i))
    
       ## what features: remove constant variables
       wp<-which(unlist(lapply(1:ncol(x),function(A) length(unique(x[wh,A]))>1)))

    m_steps<-min(20,length(wp));

    lbd<-c()
    lab<-c(-1,1)[y+1]
    
    X<-scale(x[wh,wp], scale=F)

    obj<-kernelpls.fit(X=as.matrix(X), Y=lab[wh], ncomp =m_steps, stripped = T)

    # find the optimal number of components on the (local) oob samples
    Xt<-scale(x[ind==0,wp], center=attr(X,"scaled:center"), scale=F)
    for(l in 1:m_steps){
      if(sum(ind==0)>1)
        lbd[l]<-sum((as.matrix(Xt)%*%coef(obj)[,,l]-as.numeric(obj$Xmeans%*%coef(obj)[,,l])>0)*1==y[ind==0])
    }

    wm<-max(which.max(as.numeric(lbd))[1],2) # use at least two components
    z<-obj

    COEF<-coef(z)[,,wm]

    v_norm=sqrt(COEF%*%COEF);
    z$coef<-rep(0,ncol(x))
    z$coef[wp]<-COEF/v_norm;

    z$signif<-rep(0,ncol(x))
    z$signif[wp]<-0;

    z$ridge_value = wm
    z$lab<-"PLS"
        
    return(z);
}


##--------------------------------------------
## ridge wrapper
##--------------------------------------------


ridge_slow_wrapper<-function(x=NULL, y=NULL, ind,...) {
   ## what samples: get bootstrapped data set 
    # ind contains the sample population;
    # build the bootstrap vector with repetitions
    wh<-c();
    for(i in 1:max(ind)) wh<-c(wh,which(ind>=i))

   ## what features: remove constant variables
    wp<-which(unlist(lapply(1:ncol(x),function(A) length(unique(x[wh,A]))>1)))

    numLambdas=11;
    ridgeValues=c(-5:5);
    lambdas=10^(ridgeValues); #vector of possible lambdas
    
    bestRidgeValue=ridgeValues[1];

    # the current sample and feature subset for the node
    X<-scale(x[wh,wp], scale=F)

    lbd<-rep(0, numLambdas); #vector for number of matches per lambda
    lab<-c(-1,1)[y+1] # convert two class case into fit to -1 and 1

   ## if there are no oobs left there is no need for testing different lambdas
    if(sum(ind==0)==0)
    {
      z<-gen.ridge(x=t(t(X)), y=lab[wh], lambda=lambdas[1]);
    }
    else #test different lambdas
    {
      #save objects
      objs<-vector("list",numLambdas);
      
      # the oobs with the current features
      Xt<-scale(x[ind==0,wp],  center=attr(X,"scaled:center"), scale=F)

      # loop over lambda
      for(l in 1:numLambdas){
	  
	  objs[[l]]<-gen.ridge(x=t(t(X)), y=lab[wh], lambda=lambdas[l])
	  p<-predict(objs[[l]], t(t(Xt)));

	  # find optimal lambda from (local) oob: check how many classes match
	  lbd[l]<-sum((p>0)*1==y[ind==0])
	  
      }
      # find (first) best lambda (with best matching)
      wm<-which.max(as.numeric(lbd))[1]
      # take the saved fit for the best lambda
      z<-objs[[wm]];
      bestRidgeValue=ridgeValues[wm];
    }

    COEF<-z$coefficients[,1]

    v_norm=sqrt(COEF%*%COEF);
    z$coef<-rep(0,ncol(x))
    z$coef[wp]<-COEF/v_norm;

    z$signif<-rep(0,ncol(x))
    z$signif[wp]<-0;

    z$ridge_value = bestRidgeValue;
    z$lab<-"ridge"

    return(z);
}


##--------------------------------------------
## ridge wrapper using SVD
##--------------------------------------------


ridge_svd_wrapper<-function(x=NULL, y=NULL, ind,...) {
   ## what samples: get bootstrapped data set 
    # ind contains the sample population;
    # build the bootstrap vector with repetitions
    wh<-c();
    for(i in 1:max(ind)) wh<-c(wh,which(ind>=i))

   ## what features: remove constant variables
    wp<-which(unlist(lapply(1:ncol(x),function(A) length(unique(x[wh,A]))>1)))

    numLambdas=11;
    ridgeValues=c(-5:5);
    lambdas=10^(ridgeValues); #vector of possible lambdas
    
    bestRidgeValue=ridgeValues[1];

    # the current sample and feature subset for the node
    X<-scale(x[wh,wp], scale=F)

    lbd<-rep(0, numLambdas); #vector for number of matches per lambda
    lab<-c(-1,1)[y+1] # convert two class case into fit to -1 and 1

   ## if there are no oobs left there is no need for testing different lambdas
    if(sum(ind==0)==0)
    {
      z<-gen.ridge(x=t(t(X)), y=lab[wh], lambda=lambdas[1]);
    }
    else #test different lambdas
    {
      #save objects
      objs<-vector("list",numLambdas);
      
      # the oobs with the current features
      Xt<-scale(x[ind==0,wp],  center=attr(X,"scaled:center"), scale=F)

      #SVD
      A=t(t(X));
      B=t(t(Xt));
      s<-svd(x=A, nu=0, nv=ncol(A));
      #precalculate everything we can: by the ridge identity
      #  beta(lambda) = V (Sigma^2 + lambda*I)^{-1} V^T A^T y
      #the factor V^T A^T y does not depend on lambda
      vH_Ab=Conj(t(s$v)) %*% Conj(t(A)) %*% lab[wh];

      #make sure we don't run into trouble because of
      #low rank
      sigmas<-rep(0,ncol(A));
      sigmas[1:length(s$d)]<-s$d;
      sigmasSquared<-sigmas^2;
      # loop over lambda
      for(l in 1:numLambdas){
	  
	  #the regularised pseudo inverse
	  svdCoeffs<-s$v%*%diag(1/((sigmasSquared+lambdas[l]) ) ) %*% vH_Ab;
	  objs[[l]]$coefficients<-svdCoeffs;
	  
	  #calc distance to hyperplane
	  p<-B %*% svdCoeffs;

	  # find optimal lambda from (local) oob: check how many classes match
	  lbd[l]<-sum((p>0)*1==y[ind==0])

      }
      # find (first) best lambda (with best matching)
      wm<-which.max(as.numeric(lbd))[1]
      # take the saved fit for the best lambda
      z<-objs[[wm]];
      bestRidgeValue=ridgeValues[wm];
    }

    COEF<-z$coefficients[,1]

    v_norm=sqrt(COEF%*%COEF);
    z$coef<-rep(0,ncol(x))
    z$coef[wp]<-COEF/v_norm;

    z$signif<-rep(0,ncol(x))
    z$signif[wp]<-0;

    z$ridge_value = bestRidgeValue;
    z$lab<-"ridge"

    return(z);
}
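
## Sanity check for the SVD ridge identity used above,
##   beta(lambda) = V (Sigma^2 + lambda I)^{-1} V^T A^T y
##                = (A^T A + lambda I)^{-1} A^T y
## (a sketch, not run):
if(FALSE){
    set.seed(2)
    A <- matrix(rnorm(30), 10, 3); yv <- rnorm(10); lambda <- 0.1
    s  <- svd(A, nu=0, nv=ncol(A))
    b1 <- s$v %*% diag(1/(s$d^2 + lambda)) %*% t(s$v) %*% t(A) %*% yv
    b2 <- solve(t(A) %*% A + lambda*diag(ncol(A)), t(A) %*% yv)
    all.equal(as.numeric(b1), as.numeric(b2))   # TRUE
}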



##--------------------------------------------
## svm wrapper
##--------------------------------------------

svm_wrapper<-function(x=NULL, y=NULL, ind,...) {

   ## what samples: get bootstrapped data set 
   wh<-c();
   for(i in 1:max(ind)) wh<-c(wh,which(ind>=i))

   ## what features: remove constant variables
   wp<-which(unlist(lapply(1:ncol(x),function(A) length(unique(x[wh,A]))>1)))


    # loop over the cost grid 10^(-3,-1,1)
    lbd<-c()
     
    X<-scale(x[wh,wp])
    Xt<-scale(x[ind==0,wp], scale=attr(X,"scaled:scale"), center=attr(X,"scaled:center"))

    for(l in 1:3){
        
        obj<-try(svm(y=as.factor(y[wh]),x=as.matrix(X),kernel='linear', scale=T, tolerance=1, cost=10^(c(-3,-1,1))[l]),T)
        
        # count correct (local) oob predictions for this cost
        if(sum(ind==0)>1)
            lbd[l]<-sum(predict(obj, newdata=Xt)==y[ind==0])

    }
    wm<-which.max(as.numeric(lbd))[1]
    z<-try(svm(y=as.factor(y[wh]),x=as.matrix(X),kernel='linear', scale=T, cost=10^(c(-3,-1,1))[wm],tolerance=1),TRUE)

    if(class(z)!="try-error")
    {
        COEF<-as.numeric(t(z$coefs)%*%as.matrix((X)[z$index,]))/attr(X,"scaled:scale")  

        v_norm=sqrt(COEF%*%COEF);
        z$coef<-rep(0,ncol(x))
        z$coef[wp]<-COEF/v_norm;

        z$signif<-rep(0,ncol(x))
        z$signif[wp]<-0;

        z$lab<-"svm"
        z$ridge_value = wm # index of the selected cost (a scalar, matching the other wrappers)
    }
    return(z);
}


##--------------------------------------------
## random coefficients
##--------------------------------------------

rnd_wrapper<-function(x=NULL, y=NULL, ind,...) {

   ## what samples: get bootstrapped data set 
   wh<-c();
   for(i in 1:max(ind)) wh<-c(wh,which(ind>=i))

   ## what features: remove constant variables
   wp<-which(unlist(lapply(1:ncol(x),function(A) length(unique(x[wh,A]))>1)))

    z<-list()

    X<-scale(x[wh,wp])

    # draw one random coefficient per selected (non-constant) variable and
    # undo the unit-variance scaling so the hyperplane applies to raw data
    COEF<-rnorm(length(wp))/attr(X,"scaled:scale")

    v_norm=sqrt(COEF%*%COEF);
    z$coef<-rep(0,ncol(x))
    z$coef[wp]<-COEF/v_norm;

    z$signif<-rep(0,ncol(x))
    z$ridge_value = 0    
    z$lab<-"rnorm"
    
    return(z);
}
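
## All node-model wrappers share the same contract: given (x, y, ind) they
## return a list with $coef (length ncol(x), unit norm), $signif,
## $ridge_value and $lab. A minimal custom wrapper might look like this
## (a hypothetical sketch, not part of the package, not run):
if(FALSE){
    mean_diff_wrapper <- function(x=NULL, y=NULL, ind, ...){
        wh <- c(); for(i in 1:max(ind)) wh <- c(wh, which(ind>=i))
        xb <- x[wh,,drop=FALSE]; yb <- y[wh]
        # class-mean difference as the split direction
        COEF <- colMeans(xb[yb==1,,drop=FALSE]) - colMeans(xb[yb==0,,drop=FALSE])
        list(coef=COEF/sqrt(sum(COEF^2)), signif=rep(0,ncol(x)),
             ridge_value=0, lab="mean difference")
    }
}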
