R/pls.analysis.R

pls.analysis <-
structure(function # PLS-DA analysis
### The function performs PLS-DA analysis and returns the detailed result about the created model. 
### There are two possible input formats: set1 and set2 (and the function would prepare and form
### all nessacery sets by itself), and train and test (user defines these sets in the proper format). 
(set1=NULL, ##<< a matrix or a data.frame for set1 with objects in rows and descriptors in columns
set2=NULL,  ##<< a matrix or a data.frame for set2 with objects in rows and descriptors in columns
train=NULL, ##<< a data.frame with two subobjects of class AsIs - descriptors and binary classes
test=NULL, ##<< a data.frame with two subobjects of class AsIs - descriptors and binary classes
shuffle=TRUE, ##<< logical: do you want to shuffle objects order in each set?
train.part=50 ##<< a proportion of objects to be put in the training process; only if you specify set1 and set2
){

  if(length(train)+length(test)+length(set1)+length(set2)==0){
      stop('Wrong argument: no data specified.\n')
  }
  if(length(train)+length(test)>0 & length(set1)+length(set2)>0){
      stop('Wrong argument: specify set1 and set2 or train and test! It is imposible to use both at one time.\n')
  }
  if(!require(pls)){
      stop("Method requires library 'pls'. Please install and relaunch.\n")
  }  
  if(train.part<0 | train.part>100){
      stop('Wrong argument: the value train.part should be from 0 to 100%.\n')
  }

  result<-NA 
  if (length(train)+length(test)>0 & length(set1)+length(set2)==0){
    if(dim(train)[2]!=2 | dim(test)[2]!=2){
	stop('Wrong argument: train and test sets do not contain two subobjects - descriptors and classes.\n')
    }
    if(class(train[, 1])[1]!='AsIs' | class(test[, 1])[1]!='AsIs'){
	stop('Wrong argument: train and test arguments should be of class "AsIs".\n')
    }
    if(dim(train)[2]!=dim(test)[2]){
	stop('Wrong argument: different number of columns in train and test sets!\n')
    }
    if(dim(train[,1])[2]<5){
	stop('Wrong argument: number of descriptors is too small!\n')
    }
    if(dim(train[,1])[1]<10){
	stop('Wrong argument: number of objects is too small!\n')
    }

    tf.train<-train
    tf.test<-test
    colnames(tf.train)<-c('CH', 'TY')
    colnames(tf.test)<-c('CH', 'TY')
    tp.i<-which(tf.test$TY==1)
    
    tf.pls <-plsr(TY ~ CH, ncomp=10, data=tf.train, scale=TRUE, centered=TRUE, validation='CV')
    tf.predict<-predict(tf.pls, tf.test, ncomp=1:10, type='response')
    
    r2<-100*drop(R2(tf.pls, estimate='all', intercept=FALSE)$val)
    bc<-which.max(r2[2,])
    
    res<-c()
    for (threshold in seq(0, 1, 0.05)){
	tp<-length(which(tf.predict[tp.i, , bc]>=threshold))
	fn<-length(which(tf.predict[tp.i, , bc]<threshold))
	tn<-length(which(tf.predict[-tp.i, , bc]<threshold))
	fp<-length(which(tf.predict[-tp.i, , bc]>threshold))
	r<-tp/(tp+fn) 
	p<-tp/(tp+fp)
	a<-(tp+tn)/(tp+tn+fp+fn)
	s<-tn/(fp+tn)  
	res<-rbind(res, c(tp, fn, tn, fp, r, s, p, a))
    }
    colnames(res)<-c('tp', 'fn', 'tn', 'fp', 'r', 's', 'p', 'a')
    rownames(res)<-seq(0, 1, 0.05)


  
    ##<<source pls package: http://cran.r-project.org/web/packages/pls/index.html
    result<-list(pls=tf.pls, train=tf.train, test=tf.test, predict=tf.predict, perform=res, bc=bc)
  }
  
  if (is.null(train) & is.null(test)==0 & length(set1)[1]+length(set2)[1]>0){ 
    if(!class(set1)%in%c('data.frame', 'matrix')){
      stop('Wrong argument: check set1 class! It should be a matrix or a data.frame.\n')
    }
    if(!class(set2)%in%c('data.frame', 'matrix')){
	stop('Wrong argument: check set2 class! It should be a matrix or a data.frame.\n')
    } 
    if(dim(set1)[2]!=dim(set2)[2]){
	stop('Wrong argument: different number of columns in set1 and set2 sets!\n')
    }
    if(dim(set1)[2]<5){
	stop('Wrong argument: number of descriptors is too small!\n')
    }
    if(dim(set1)[1]<10 | dim(set2)[1]<10){
	stop('Wrong argument: number of objects is too small!\n')
    }
    l1<-dim(set1)[1]
    l2<-dim(set2)[1]
    
    if (is.null(rownames(set1))){ rownames(set1)<-paste('set1', 1:l1, sep='_') }
    if (is.null(rownames(set2))){ rownames(set2)<-paste('set2', 1:l2, sep='_') }

    if (shuffle==TRUE){ 
      set1<-set1[sample(1:l1, l1),]
      set2<-set2[sample(1:l2, l2),]
    }
    l<-round(min(l1, l2)/(100/train.part))

    tr<-rbind(set1[1:l, ], set2[1:l,])
    te<-rbind(set1[1:(l1-l), ], set2[1:(l2-l),])

    Y.train<-c(rep(1, l), rep(0, l))
    Y.test<-c(rep(1,(l1-l)), rep(0, (l2-l)))

    tf.train<-data.frame(CH=I(as.matrix(tr)), TY=I(Y.train))
    tf.test<-data.frame(CH=I(as.matrix(te)), TY=I(Y.test))   
    
    tf.pls <-plsr(TY ~ CH, ncomp=10, data=tf.train, scale=TRUE, centered=TRUE, validation='CV')
    tf.predict<-predict(tf.pls, tf.test, ncomp=1:10, type='response')
    ll<-dim(tf.predict)[1]
    
    r2<-100*drop(R2(tf.pls, estimate='all', intercept=FALSE)$val)
    bc<-which.max(r2[2,])
    
    res<-c()
    for (threshold in seq(0, 1, 0.05)){
	tp<-length(which(tf.predict[1:l, , bc]>=threshold))
	fn<-length(which(tf.predict[1:l, , bc]<threshold))
	tn<-length(which(tf.predict[(l+1):ll, , bc]<threshold))
	fp<-length(which(tf.predict[(l+1):ll, , bc]>threshold))
	r<-tp/(tp+fn) 
	p<-tp/(tp+fp)
	a<-(tp+tn)/(tp+tn+fp+fn)
	s<-tn/(fp+tn)  
	res<-rbind(res, c(tp, fn, tn, fp, r, s, p, a))
    }
    colnames(res)<-c('tp', 'fn', 'tn', 'fp', 'r', 's', 'p', 'a')
    rownames(res)<-seq(0, 1, 0.05)


    
    ##<<source pls package: http://cran.r-project.org/web/packages/pls/index.html
    result<-list(pls=tf.pls, train=tf.train, test=tf.test, predict=tf.predict, perform=res, bc=bc)
  }
  ##<<value list with results from PLS-DA analysis
  return(result)
}, ex = function(){
  set1<-matrix(rnorm(500, 1), nrow=50,  byrow=TRUE)
  set2<-matrix(rnorm(500, 2), nrow=50,  byrow=TRUE)
  res<-pls.analysis(set1=set1, set2=set2)

  train.set<-rbind(set1[1:25,], set2[1:25,])
  test.set<-rbind(set1[26:50,], set2[26:50,])
  train<-data.frame(CH=I(train.set), TY=I(c(rep(1, 25), rep(0, 25))))
  test<-data.frame(CH=I(test.set), TY=I(c(rep(1, 25), rep(0, 25))))
  res<-pls.analysis(train=train, test=test)
})
RedJane/tfbs.qsam documentation built on May 9, 2019, 9:37 a.m.