# tests/tst_form_ext_dset_model.R

install.packages('dplyr')
install.packages('readr')
install.packages('ranger')
install.packages('foreach')

#### Library functions ====
library(dplyr)
library(readr)
library(ranger)
library(foreach)

#### Functions ====
# Infix "not in": for each element of the left operand, TRUE when it has no
# match in the right operand (the element-wise complement of %in%).
`%ni%` <- function(x, table) {
  !(x %in% table)
}

#### Load in the data ====
# Split files of the primary task, one per dataset, listed with full paths.
# Their ID column shows the row number in a given dataset.
data_splits <- list.files(
  "/shared/mtl-qsar/data_splits/",
  full.names = TRUE
)

# Assistant-task files with the fold information (row numbers for the
# 10-fold CV), listed with full paths.
assist_data <- list.files(
  "/shared/mtl-qsar/datasets/assist_data_v190709",
  full.names = TRUE
)

# The original datasets, listed with full paths.
original <- list.files(
  "/shared/mtl-qsar/datasets/originals",
  full.names = TRUE
)

## Keep only the first ten entries: the run is limited to the 10 completed
## assistant task sets.
data_splits <- data_splits[seq_len(10)]
original <- original[seq_len(10)]

#### Prep the analysis ====
## Number of datasets in use
len <- length(data_splits)

#### Sorting the data ====
# Per-dataset work lists, filled fold by fold:
#   tst.list - test folds, trn.list - training folds,
#   a.trn    - assistant-task rows, ex.data - extended training folds.
tst.list <- trn.list <- a.trn <- ex.data <- list()

# Result lists across datasets; each element receives one of the per-dataset
# lists above:
#   tst.out - test sets, stl.out - single-task training sets,
#   mtl.out - assistant-task sets, ext.data - extended training sets.
tst.out <- stl.out <- mtl.out <- ext.data <- list()

# Number of cross-validation folds. This used to be taken from `len` (the
# number of datasets), which only worked because both happen to be 10.
n_folds <- 10L

# For every dataset: read its split, assistant and original files, then build
# the test set, the single-task training set and the assistant-task set of
# each CV fold. The per-fold lists are stored under the dataset's index in
# tst.out, stl.out and mtl.out.
for (dnum in seq_len(len)) {
  # Index everything with the loop variable `dnum`; the previous code used
  # `trains_iter`, which is not defined at this point of the script.
  # NOTE(review): assumes data_splits, assist_data and original list the
  # datasets in the same order - confirm.
  splits <- read_csv(data_splits[dnum]) # fold assignment of the primary task
  assist <- read_csv(assist_data[dnum]) # assistant-task rows with fold labels
  orig <- read_csv(original[dnum])      # the original dataset

  # The first column of the split file becomes the merge key.
  names(splits)[1] <- "molecule_id"

  # Fresh per-dataset lists so no fold carries over from the previous dataset.
  tst.list <- vector("list", n_folds)
  trn.list <- vector("list", n_folds)
  a.trn <- vector("list", n_folds)

  for (iter in seq_len(n_folds)) {
    in_fold <- splits$fold == iter

    # Test set: rows of the original data assigned to the current fold.
    tst <- merge(orig, splits[in_fold, ], by = "molecule_id")
    tst.list[[iter]] <- select(tst, -fold)

    # Training set: rows of the original data assigned to any other fold.
    trn <- merge(orig, splits[!in_fold, ], by = "molecule_id")
    trn.list[[iter]] <- select(trn, -fold)

    # Assistant-task rows labelled with the current fold, without the
    # bookkeeping columns.
    # NOTE(review): this keeps fold == iter, whereas the primary training set
    # keeps fold != iter - confirm that is how the assistant files are laid
    # out.
    a_trn <- assist[assist$fold == iter, ]
    a.trn[[iter]] <- select(a_trn, -c(fold, dataset_id))
  }

  stl.out[[dnum]] <- trn.list # single-task training folds of this dataset
  mtl.out[[dnum]] <- a.trn    # assistant-task folds of this dataset
  tst.out[[dnum]] <- tst.list # test folds of this dataset
}

##### Create the extended dataset ====
# Build the extended training data: for every dataset (outer) and CV fold
# (inner), stack the single-task training fold on top of the matching
# assistant-task fold. ext.data[[outer]][[inner]] holds the result.
# The loop bounds follow the lists themselves instead of a hard-coded 10, so
# the block keeps working when the number of datasets or folds changes.
for (outer in seq_along(stl.out)) {
  n_inner <- length(stl.out[[outer]])

  # Start from an empty list per dataset so no fold of a previous dataset can
  # leak into this one.
  ex.data <- vector("list", n_inner)

  for (inner in seq_len(n_inner)) {
    # `[[` extracts the stored data frame directly. rbind() needs both pieces
    # to have the same columns and fails loudly otherwise.
    ex.data[[inner]] <- rbind(
      stl.out[[outer]][[inner]],
      mtl.out[[outer]][[inner]]
    )
  }

  ext.data[[outer]] <- ex.data
}
  
#### Run the MTL model ====

# Directory for the MTL predictions (not referenced again in the code below).
a <- "/shared/mtl-qsar/predictions/MTL/"

###### RANDOM FOREST ######
# For every dataset and CV fold: fit a ranger random forest on the extended
# training data, predict the held-out fold and collect the predictions.
# RF.mult.1 ends up with one row per predicted test row and the columns
#   iter (dataset index), true (observed pXC50), predicted, dataset_id.
# Previously the fold loop overwrote its result on every pass, so only the
# last fold of each dataset was returned; now all folds are kept.
# NOTE(review): no seed is set before ranger(), so repeated runs presumably
# differ slightly - confirm whether reproducibility is required.
RF.mult.1 <- foreach(
  trains_iter = seq_along(tst.out),
  .combine = "rbind"
) %do% {
  # Dataset label: file name of the original dataset without directory and
  # extension (replaces the fixed character offsets used before).
  dataset_name <- tools::file_path_sans_ext(basename(original[trains_iter]))

  fold_results <- lapply(
    seq_along(tst.out[[trains_iter]]),
    function(inloop) {
      # Training and test data of this fold, without the molecule identifier
      # so it is not used as a predictor.
      trn.data <- ext.data[[trains_iter]][[inloop]] %>%
        select(-molecule_id)
      tst.data <- tst.out[[trains_iter]][[inloop]] %>%
        select(-molecule_id)

      # Random forest with pXC50 as the response and every remaining column
      # as a predictor.
      mdl <- ranger(pXC50 ~ ., trn.data)
      preds <- predict(mdl, data = tst.data)

      print(inloop) # progress indicator

      # Observed and predicted values side by side.
      tibble(
        iter = trains_iter,
        true = tst.data$pXC50,
        predicted = preds$predictions,
        dataset_id = dataset_name
      )
    }
  )

  # All folds of this dataset, stacked.
  bind_rows(fold_results)
}

# Save the random forest MTL predictions. The results are held in RF.mult.1;
# an object called RF.mult is never created in this script.
write_csv(RF.mult.1, "/shared/mtl-qsar/predictions/MTL/RF_MTL_perf.csv")
# iaolier/mtl-qsar documentation built on Aug. 8, 2019, 5:55 p.m.