testcode/demo-haxby-caret.R

library(mvpa)
#install.packages("oro.nifti")
#install.packages("FactoMineR")
#install.packages("fmri")
#install.packages("R.matlab")
library(R.matlab)
library(FactoMineR)
library(MASS)
library(class)
#install.packages("cvTools")
library(cvTools)
#install.packages("caret")
library(caret)
#install.packages("mlbench")
library(mlbench)
#when using caret, we should set up multicore processing.
#http://topepo.github.io/caret/parallel.html
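#register.machine.cores() below is a helper from this repo; a minimal sketch of
#the standard doParallel setup it presumably wraps (an assumption on my part,
#left commented so the helper below remains the one actually used):
#library(doParallel)
#cl <- parallel::makeCluster(parallel::detectCores() - 1)
#registerDoParallel(cl)
#...and parallel::stopCluster(cl) when done.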

register.machine.cores()
#load mask
mask.img <- read.image(file = "/../mnt/data/haxby-2001-set/mask_cat_select_vt.nii") 

summary(mask.img)

#go through the candidate run files and collect the ones that exist into a list of paths
files = list()
for (n in 1:10){  
  test.path <- paste0("/../mnt/data/haxby-2001-set/haxby8_r",as.character(n),".nii")
  if(file.exists(test.path)){
    files <- c(files, test.path)
  }
}
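#an equivalent using list.files (a sketch; note that list.files sorts
#lexicographically, so haxby8_r10 would sort before haxby8_r2, whereas the
#loop above preserves run order):
#files <- as.list(list.files("/../mnt/data/haxby-2001-set",
#                            pattern="^haxby8_r[0-9]+\\.nii$", full.names=TRUE))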
getwd()
#now we know which files we want! Let's see if they can be loaded
images <- as.list(rep(NA,length(files)))
for (i in 1:length(files)){
  print(paste0("Loading ", i, " into memory"))
  images[[i]] <- read.image(file = files[[i]]) 
}

#take each image series, mask it, and then put it into a matrix where rows represent the masked voxels and columns represent time.
#to put these back into 4D, we'd need to use the mask image to select the sum(mask.img>0) values to put back in.
#cool!
#images.masked <- lapply(images,function(img){
#  masked.image <- matrix(img[mask.img>0], nrow=sum(mask.img>0), ncol=dim(img)[4])
#  return(masked.image)
#})

#take each image series, mask it, and then put it into a matrix where rows represent time and the columns represent the masked voxels.
#this will require some transposition
#to put these back into 4D, we'd need to use the mask image to select the sum(mask.img>0) values to put back in.
#and will need to transpose again
#cool!
images.masked <- lapply(images,function(img){
  #note matrix() takes nrow and ncol as separate arguments, and we use each
  #run's own fourth dimension rather than assuming it matches images[[1]]
  masked.image <- t(matrix(img[mask.img>0], nrow=sum(mask.img>0), ncol=dim(img)[4]))
  #we transpose because the general R convention, set e.g. by data frames,
  #is that each row is an observation and each column is a variable
  return(masked.image)
})
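#sketch of the reverse operation described above: rebuild a 4D array from a
#timepoints-x-voxels matrix using the mask. unmask.series is a hypothetical
#helper written here for illustration (it assumes mask.img indexes like a
#plain 3D array); it is not called below.
unmask.series <- function(masked.matrix, mask.img){
  n.timepoints <- nrow(masked.matrix)
  vol <- array(NA, dim=c(dim(mask.img), n.timepoints))
  for (tp in 1:n.timepoints){
    frame <- array(NA, dim=dim(mask.img))
    frame[mask.img>0] <- masked.matrix[tp, ] #place voxel values back into 3D space
    vol[, , , tp] <- frame
  }
  return(vol)
}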

#OK, now for each of the 10 runs we need to label each of these images as:
#train or test, and
#one of 8 categories or rest data.

#how do we know which images in the series are of each class, so that we can train?
#let's read the .mat files in the dir with the haxby data and see what's in there.

#this is from a slightly different version of the file.
#http://www.pymvpa.org/datadb/haxby2001.html
#Haxby had 12 runs...why are there only 10 here??

#This is where the haxby-2001-set came from:
#http://code.google.com/p/princeton-mvpa-toolbox/wiki/Downloads?tm=2
#see: http://code.google.com/p/princeton-mvpa-toolbox/wiki/TutorialIntro
#there are 10 runs of a single subject here.
#each block contains 9 TRs of a single category, with (1?) rest TRs in between and at each end.

#a tutorial, including info on tutorial_runs, can be found here:
#http://code.google.com/p/princeton-mvpa-toolbox/wiki/TutorialIntro

#get the run codes and categories of each item
tutorial.runs <- readMat("/../mnt/data/haxby-2001-set/tutorial_runs.mat")
tutorial.regs <- readMat("/../mnt/data/haxby-2001-set/tutorial_regs.mat")
#OK now we have the codes for each!
dim(tutorial.regs$regs)
#tutorial.regs has the codes
plot(tutorial.regs$regs[1, ])
plot(tutorial.regs$regs[2, ])
plot(tutorial.regs$regs[3, ])
length(which(tutorial.regs$regs[3, ]==1))#OK Great this is what we want!

#we can create a vector that stores which category each image belongs to.
category.ts <- apply(tutorial.regs$regs,2, function(x){
  x.cat <- which(x==1)
  if(length(x.cat)==0){
    return(NA)
  }else{
    return(x.cat)
  }
})

run.ts <- array(unlist(tutorial.runs$runs))
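#quick sanity check (this assumes the labels line up with the concatenated
#runs): how many TRs of each category appear in each run?
table(run.ts, category.ts, useNA="ifany")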

#so to recap, did we apply the mask before?
dim(images.masked[[1]])
#since our category vector is 1210 items long, a concatenation of (we assume consecutive) runs,
#we should try to get it into that format :-D
length(category.ts)
class(images.masked[[1]])

#check that all matrices have the same number of cols (i.e., variables, voxels)
img.series.cols <- sapply(images.masked, ncol)
#and count the total number of observations (timepoints) across all runs
img.series.row.sum <- sum(sapply(images.masked, nrow))

if (length(unique(img.series.cols))>1) stop("SOME RUNS APPEAR TO HAVE A DIFFERING NUMBER OF VOXELS MASKED.")

#create a matrix to populate with a concatenated timeseries of masked images 
images.masked.concat = matrix(nrow=img.series.row.sum,ncol=img.series.cols[1])
#now populate images.masked.concat
#we won't assume that every image has the same number of timepoints.
row.index <-1
for (i in 1:length(images.masked)){
  image.i.endpos <- row.index + dim(images.masked[[i]])[1]-1
  images.masked.concat[row.index:image.i.endpos,] <- images.masked[[i]]
  row.index <- image.i.endpos + 1  
}
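#the loop above is equivalent to a one-liner, assuming every element is a
#matrix with the same column count (checked above):
#images.masked.concat.alt <- do.call(rbind, images.masked)
#identical(images.masked.concat.alt, images.masked.concat) #should be TRUE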

#display the dataset as a greyscale raster (rows = timepoints, cols = voxels)
display.ds <- function(ds){
  minval <- min(ds, na.rm=TRUE)
  val.range <- max(ds, na.rm=TRUE) - minval
  ds.scaled <- (ds - minval) / val.range
  max.img.side <- max(dim(ds))
  plot(c(1, max.img.side+1), c(1, max.img.side+1), type = "n", xlab = "col (voxel)", ylab = "row (timepoint)")
  rasterImage(ds.scaled, 1, 1, dim(ds.scaled)[2]+1, dim(ds.scaled)[1]+1, interpolate = FALSE)
}

#PREPROCESSING
#now for this dataset, we should:
#-subtract the mean *image* of each run from all of the images in that run,
# because we assume that runs are counterbalanced
#image.timeseries<-images.masked.concat
#run.timeseries <- run.ts
subtract.mean.image.by.run <- function(image.timeseries,run.timeseries){
  image.timeseries.mc <- matrix(nrow=dim(image.timeseries)[1],ncol=dim(image.timeseries)[2])
  for (r in unique(run.timeseries)){
    r.timepoints <- which(run.timeseries==r)
    run.mean.image <- colMeans(image.timeseries[r.timepoints,])
    #print(length(run.mean.image))
    image.timeseries.mc[r.timepoints,] <- t(t(image.timeseries[r.timepoints,]) - run.mean.image)
    #display.ds(image.timeseries.mc)
    #sweep() could do this without the double transpose; see the sketch below
  }
  warning("run.mean.image is a double matrix. For a slight loss in precision but less memory usage, it could be converted into an integer matrix.")
  return(image.timeseries.mc)
}
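#the sweep() route mentioned inside the function would look roughly like this
#(a sketch; same result as the double-transpose above):
#image.timeseries.mc[r.timepoints,] <- sweep(image.timeseries[r.timepoints,],
#                                            2, run.mean.image, "-")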
images.masked.concat.mc <- subtract.mean.image.by.run(images.masked.concat,run.ts)
display.ds(images.masked.concat.mc)

#first - what are the distributional properties of the values in each category?
#to classify we should probably know how much the variances vary by each variable.
hist(apply(images.masked.concat.mc,2,sd),breaks=40)
length(apply(images.masked.concat.mc,2,sd))
#this is a fairly wide range of variances.
#they should probably be normalized.


# (same number of samples in each run, so that's reasonable)
#-then subtract the mean voxel from all voxels in each image, 
# because we assume that what's most important is the *relative* activity of each voxel.
#though I don't know if this is all that necessary now; looking at the dataset there appears to be as much variability across voxels as within them.
#image.timeseries<-images.masked.concat.mc
#http://stackoverflow.com/questions/20596433/how-to-divide-each-row-of-a-matrix-by-elements-of-a-vector-in-r
mean.center.each.image <- function(image.timeseries){
  #get the mean voxel value within each image (i.e., each row/timepoint)
  mean.voxel.by.image <- rowMeans(image.timeseries)
  #and subtract; R recycles the length-nrow vector down the columns,
  #so each row has its own mean removed
  image.timeseries.mc <- image.timeseries - mean.voxel.by.image
  warning("you may want to normalize each voxel, but here, we're only demeaning them.")
  #spot check (run interactively): these two should differ by mean.voxel.by.image[tp]
  #vox <- 500:510; tp <- 54
  #image.timeseries.mc[tp,vox] + mean.voxel.by.image[tp]
  #image.timeseries[tp,vox]
  return(image.timeseries.mc)
}
#this might be superfluous - do we need to mean center each image?

images.masked.concat.mc2 <- mean.center.each.image(images.masked.concat.mc)

#what we WILL do is now normalize each voxel
normalize.each.voxel <- function(image.timeseries){
  image.timeseries.normed <- t((t(image.timeseries) - colMeans(image.timeseries)) / apply(image.timeseries,2,sd))
  return(image.timeseries.normed)
}
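#sanity check: base R's scale() z-scores each column the same way (both use
#the n-1 denominator for sd), so this difference should be ~0:
#max(abs(scale(images.masked.concat.mc2) - normalize.each.voxel(images.masked.concat.mc2)))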
images.masked.concat.normed <- normalize.each.voxel(images.masked.concat.mc2)

display.ds(images.masked.concat.normed)
images.preprocessed <- images.masked.concat.normed

#NOW...finally...we can start some training!!!!
#let's just grab the stuff that's in a category.
#set up the data in a format we can use to train. We might have to put it into a single data frame but let's try to avoid a data frame for now.
image.rc <- images.preprocessed[!is.na(category.ts),]
category.rc <- category.ts[!is.na(category.ts)]
runs.rc <- run.ts[!is.na(category.ts)]
set.seed(42) #select a randomly determined 50% of the 10 runs to train on
train.set.indices <- sample(1:10, 5)
test.set.indices <- which(!(1:10 %in% train.set.indices))

training.obs <- runs.rc %in% train.set.indices
testing.obs <- runs.rc %in% test.set.indices
modelLookup("nnet") #caret's modelLookup takes the model name as a string
#wraps all the testing in a function that is called from the cross validator to get overall results.
evaluate.model <- function(training.obs, testing.obs){
  res <-list()
  train.vars<-image.rc[training.obs,]
  test.vars<-image.rc[testing.obs,]
  train.targets <- category.rc[training.obs]
  test.targets <- category.rc[testing.obs]
  print("LDA Result:")
  test.res <- lapply(
    c('moment','mle'),function(da.method){
    print(paste0("Trying LDA with",da.method))
    lda.fit <- lda(train.vars, train.targets, method = da.method)
    lda.predict <- predict(lda.fit, train.vars)
    train.train.success <- data.frame("predictions" = lda.predict$class, "ground.truth" = train.targets, "success" = lda.predict$class==train.targets)
    lda.test.predict <- predict(lda.fit, test.vars)
    train.test.success <- data.frame("predictions" = lda.test.predict$class, "ground.truth" = test.targets, "success" = lda.test.predict$class==test.targets)
    lda.success = mean(train.test.success$success)
    return(lda.success)
  })
  res$LDA.moment <- test.res[[1]]
  res$LDA.mle <- test.res[[2]]
  
  #OK so not resounding success with LDA.
  
  #We could try a few different methods.
  
  #QDA
  tryCatch({
    qda.fit <- qda(train.vars, train.targets, method = 'mle')
    qda.predict <- predict(qda.fit, train.vars)
    train.train.success <- data.frame("predictions" = qda.predict$class, "ground.truth" = train.targets, "success" = qda.predict$class==train.targets)
    qda.test.predict <- predict(qda.fit, test.vars)
    train.test.success <- data.frame("predictions" = qda.test.predict$class, "ground.truth" = test.targets, "success" = qda.test.predict$class==test.targets)
    qda.success <- mean(train.test.success$success)
    res$QDA <- qda.success
  }, warning = function(war){
    print(war)
  }, error = function(err){
    print(err)
  })
  #could also use...what's the package for trying out different values?
  #can use:
  #apply
  #tune (e1071 package)
  #caret http://machinelearningmastery.com/tuning-machine-learning-models-using-the-caret-r-package/
  #a useful repository: http://topepo.github.io/caret/training.html
  #List of all the models available by default in caret:  http://topepo.github.io/caret/modelList.html
  
  #for now, apply will do :-)
  
  #KNN
  
  knn.success <- lapply(1:8, function(k.val){
    set.seed(1)
    knn.pred <- knn(train.vars, test.vars, train.targets, k=k.val)
    confusion <- table(knn.pred, test.targets)
    return(sum(diag(confusion))/sum(confusion))
  })
  res$KNN <- knn.success
  
  return(res)
}
evaluate.model(training.obs,testing.obs)

#and...what data manipulations could we try?
#could probably legitimately try taking the average from each category/block.
#and could also try putting them together into consecutive training bits.

#OK, we should be doing k-fold cross validation here if we're comparing models...
#vals <- cvFolds(dim(image.rc)[1],K=8,type="consecutive") #ONLY works because the data is stored consecutively.

#let's try this.
#method is a function which we call to do the kfold cross validation.
#folds is a vector that describes which fold each data point belongs to.
do.kfold <- function(method,folds){
  #method=evaluate.model,folds=runs.rc
  res.list <- list()
  res.values <- list()
  res.byfold=list()
  #f=1
  for (f in unique(folds)){
    print(paste0("KFold Excluding fold ",f))
    #for each fold, exclude it and use all the others as the training data, and then use f as the test data.
    training.obs <- folds!=f
    testing.obs <-folds==f
    res<-method(training.obs,testing.obs)
    res.byfold[[f]]=res
    res.values=unique(c(res.values,names(res)))
  }
  #take the average of each value in res
  #cycle through all the values found (left commented; note it should iterate
  #over res.byfold, not res):
  #for (res.value in res.values){
  #  res.list[[res.value]] <- mean(
  #    unlist(lapply(res.byfold,function(x){
  #      return(x[[res.value]])
  #    })))
  #}
  #return(list("res.list" = res.list,"res.byfold" = res.byfold))
  return(res.byfold)
}

res.kfold  = do.kfold(evaluate.model,runs.rc)


#So none of these methods performs marvelously, and to add suspicion, why did we go from 66% to 55% accuracy?


#caret createDataPartition
table(category.rc)
flds <- createFolds(category.rc, k = 10, list = TRUE, returnTrain = FALSE)

flds.custom <- createFoldsByGroup(runs.rc)
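#createFoldsByGroup is a helper from this repo; a sketch of the shape it
#presumably returns (an assumption): one held-in training index set per run,
#as trainControl's index argument expects.
#flds.custom.sketch <- lapply(sort(unique(runs.rc)), function(r) which(runs.rc != r))
#names(flds.custom.sketch) <- paste0("Run", sort(unique(runs.rc)))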

#putting createFolds format into trControl
#https://stat.ethz.ch/pipermail/r-help/2011-May/277722.html


caret.lda.foldByRun <- train(image.rc,
                             factor(category.rc),
                             method = "lda",
                             trControl = trainControl(
                               method = "LGOCV",
                               number = 10,
                               repeats = 10,
                               index = flds.custom
                             ))

caret.knn.foldByRun <- train(image.rc,
                             factor(category.rc),
                             method = "knn",
                             trControl = trainControl(
                               method = "LGOCV",
                               index = flds.custom
                             ),
                             tuneLength=5)
#OK we're using caret nicely, although for a reason I can't explain, caret isn't performing 
#well here when I use the leave-one-run-out cross validation.
#I don't think there is much to tune so I don't know why this is not working!
#I suppose there are theoretical reasons why generalizing from one run to the next might not perform so well
#I don't think we need this to slow us down too much; once caret has given us some nice results with LOOCV, we can always go back 
#to other forms of cross-validation.

tcLGOCV <- trainControl(
  method = "LGOCV",
  number = 10,
  repeats =10,
  index = flds
)

control <- trainControl(method="repeatedcv", number=10, repeats=3)

register.machine.cores()

caret.model.list <- list() #initialize the list of fitted caret models
caret.model.list <- c(caret.model.list, knn = list(
                      train(image.rc,
                            factor(category.rc),
                            method = "knn",
                            trControl = control,
                            tuneLength=5)))

caret.model.list <- c(caret.model.list, list(svmLinear =
                      train(image.rc,
                            factor(category.rc),
                            method = "svmLinear",
                            trControl = control,
                            tuneLength=5)))

svmLinearTuned <-
  train(image.rc,
        factor(category.rc),
        method = "svmLinear",
        trControl = control,
        tuneGrid = expand.grid(C=c(1:5)))
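#inspect the chosen cost parameter and the resampled accuracy at each C
#(standard fields on a caret train object):
svmLinearTuned$bestTune
svmLinearTuned$results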
#so let's see if we can try out multiple methods with caret.
#first, let's do another kind of repetition, since my custom trainControl doesn't seem to work.
#http://machinelearningmastery.com/compare-models-and-select-the-best-using-the-caret-r-package/
                      
results <- resamples(caret.model.list)
summary(results)

# boxplots of results
bwplot(results)
# dot plots of results
dotplot(results)

#try again with some new functions I wrote.

train.list = list(
  get.caret.model.spec("lda"),
  get.caret.model.spec("knn", 10),
  get.caret.model.spec("svmLinear", expand.grid(C=c(1:5)))
)
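#get.caret.model.spec and caret.train.model.list are helpers from this repo; a
#hypothetical sketch of the former, written here for illustration only (the
#repo's actual implementation may differ): a data-frame second argument would
#supply a tuneGrid, a number a tuneLength.
#get.caret.model.spec <- function(method, tune=NULL){
#  if (is.data.frame(tune)) list(method=method, tuneGrid=tune)
#  else list(method=method, tuneLength=tune)
#}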


new.cml <- caret.train.model.list(
  image.rc,
  factor(category.rc),
  control,
  train.list)

results <- resamples(new.cml)
summary(results)

# boxplots of results
bwplot(results)
# dot plots of results
dotplot(results)
 
#the next thing I'd like to do is download the rest of the Haxby data and apply this to those subjects too, but don't spend more than 2 hours on this.
#we need to move on to doing RBMs, and those could require a custom caret method to be trained.