tests/full code 100519_IO.R

############################################################################################################
############################################################################################################
######################################### Functions and Libraries #########################################
############################################################################################################
############################################################################################################

## Functions ====
# Tanimoto score
tani.dist = function(trn_set, tst_mol){
  fp.sim.matrix(trn_set, tst_mol) # pairwise Tanimoto similarities between the two fingerprint lists (despite the name, these are similarities, not distances)
}
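# A hedged usage sketch (not part of the pipeline; `fps_a`/`fps_b` are toy
# fingerprints built with create_fplist(), defined below):
# fps_a = create_fplist(matrix(c(1, 0, 1, 0, 1, 1), nrow = 2, byrow = TRUE))
# fps_b = create_fplist(matrix(c(1, 1, 1), nrow = 1))
# tani.dist(fps_a, fps_b) # a 2 x 1 matrix of Tanimoto similarities in [0, 1]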

split_dset = function(iter, .dset){
  list(tst_set = .dset %>% filter(cvf$which == iter), # rows assigned to fold `iter` form the test set
       trn_set = .dset %>% filter(cvf$which != iter)) # the remaining rows form the training set
}
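# NB: split_dset() relies on a cvFolds object named `cvf` in the calling
# environment. A hedged sketch on toy data (not part of the pipeline):
# cvf = cvFolds(n = 150, K = 10)
# fold1 = split_dset(1, iris) # ~15 test rows in tst_set, ~135 in trn_set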

df2matfp = function(df, .fp_prefix = fp_prefix){
  as.matrix(df %>% select(starts_with(.fp_prefix))) # keeps only the columns whose names start with .fp_prefix and returns them as a matrix
}

create_fplist = function(matr_fp){
  map(1:nrow(matr_fp), ~ new("fingerprint", nbit = ncol(matr_fp), bits = which(as.logical(matr_fp[.x,]))))
  # converts every row of the binary matrix into a `fingerprint` object by mapping over the rows
}
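# Hedged example (toy matrix, not part of the pipeline): a 2 x 4 binary matrix
# becomes a list of two fingerprint objects whose `bits` slot stores the
# positions of the 1s.
# m = matrix(c(1, 0, 1, 0,
#              0, 1, 1, 1), nrow = 2, byrow = TRUE)
# fps = create_fplist(m)
# fps[[1]]@bits # -> 1 3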

calc_thresh = function(samp_it) {
  fp_sim_mat = fp.sim.matrix(multipl_fp[samp_it$trn_set$molecule_id],
                             multipl_fp[samp_it$tst_set$molecule_id]) # training-vs-test similarity matrix
  map_dbl(as_data_frame(fp_sim_mat), max) # per test molecule, its highest similarity to any training molecule becomes the threshold
}
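# Rationale: each test molecule's threshold is its similarity to the nearest
# training molecule, so an assistant molecule is only recruited later (see
# sub_data) if it sits closer to that test molecule than anything already in
# the training set.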

# Subsetted Data
sub_data = function(threshs, assi_molid){
  thre_molid = names(threshs) # molecule ids the thresholds belong to
  fp_sim_mat = fp.sim.matrix(multipl_fp[assi_molid],
                             multipl_fp[thre_molid]) # assistant-vs-held-out similarity matrix
  simplify(map2(as_data_frame(fp_sim_mat), threshs, ~ which(.x > .y))) # indices of assistant molecules whose similarity beats the corresponding threshold
}
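# Hedged illustration of the selection rule on one column: with similarities
# s = c(0.2, 0.8, 0.5) and threshold t = 0.4, which(s > t) returns 2 3, i.e.
# the second and third assistant molecules qualify.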

create_extdset = function(dseti){
  prim_dset = multipl %>% filter(File == dseti) # primary task: the rows that came from file `dseti`
  assi_dset = multipl %>% filter(File != dseti) # candidate assistants: the rows from every other file
  
  cvf = cvFolds(nrow(prim_dset), CV.split) # builds the CV.split-fold cross-validation split
  samp_dsets = map(1:CV.split, 
                   ~ list(tst_set = prim_dset %>% filter(cvf$which == .x),
                          trn_set = prim_dset %>% filter(cvf$which != .x)) )
  
 
  threshs = samp_dsets %>% map(calc_thresh) %>% simplify # one similarity threshold per held-out molecule, flattened into a single vector (named by molecule id, which sub_data relies on)
  
  sub_ind = sub_data(threshs, assi_dset$molecule_id) # indices of assistant molecules that beat at least one threshold (duplicates are removed later by distinct())
  
  # Add assistant dataset 
  if(length(sub_ind) > 0) {
    assi_dset = assi_dset %>% slice(sub_ind)
    prim_dset = prim_dset %>% bind_rows(assi_dset)
  }
  
  # Add fingerprints
  fp_dset = as_data_frame(fp.to.matrix(multipl_fp[prim_dset$molecule_id]))
  names(fp_dset) = fpcolnams # restores the original fingerprint column names
  prim_dset = prim_dset %>% bind_cols(fp_dset) %>% distinct(molecule_id, .keep_all = T) %>% select(-File)
  write_csv(prim_dset, paste0(path_dset_outp, "ext_", dseti)) # stores as csv file and adds string to name
}
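# In short: create_extdset() recruits, for each task, molecules from the other
# tasks that are more similar to a held-out molecule than any training molecule
# already is, re-attaches the fingerprint columns, and writes the extended CSV.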


## Packages ====
library(tidyverse)
library(ggplot2)
library(foreach)
library(cvTools)
library(readbulk)
library(fingerprint)
library(ranger)
library(randomForest)
library(infotheo)


############################################################################################################
############################################################################################################
########################################## Create the 90-10 Splits #########################################
############################################################################################################
############################################################################################################

## Set Working Directory ====
#setwd("C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Work/MTL Paper")
#a = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Work/MTL Paper"

#dats = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Work/MTL Paper/100_Dataset_Sample" # 180327_data_sample_qsars

dats <- "/shared/mtl-qsar/datasets/originals/"
dat.sz = list.files(dats, full.names = F)

## Paths ====
#path_dset_outp = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Work/MTL Paper/100 Datasets/Splits Train/"
path_dset_outp <- "/shared/mtl-qsar/mtl_assisst/data_splits/train/"
dir.create(path_dset_outp, recursive = T)

#path_dset_outp_test = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Work/MTL Paper/100 Datasets/Splits Test/"
path_dset_outp_test <- "/shared/mtl-qsar/mtl_assisst/data_splits/test/"
dir.create(path_dset_outp_test, recursive = T)

## Seed for reproducibility ====
set.seed(123)

## Number of Splits ====
CV.split = 10
n_files = length(dat.sz) # number of dataset files (named to avoid shadowing base::nrow)

#### This part should eventually be replaced so the splits already available can be reused
## Split 90% - 10% ====
for (i in 1:n_files) {
  data_fname = dat.sz[i] 
  data = read_csv(paste0(dats, data_fname)) # reads in each file of data
  
  a = nchar(data_fname) # number of characters in the file name
  abrev_name = substr(data_fname, (a - 13), a) # shortened name ## FIXME: extract this info using str_remove_all
  
  # assigns the rows to CV.split groups of (near-)equal size
  cv = cvFolds(n = nrow(data), CV.split) ## FIXME: replace cvFolds with my own version
  
  index = foreach(iter = 1:CV.split, .combine = "rbind") %do% {
    trn.id = cv$subsets[cv$which != iter] # Takes out row ids that are not equal to the iteration number
    tst.id = cv$subsets[cv$which == iter] # test set are the row ids that are equal to the iteration number
    
    index = list(train = trn.id, test = tst.id) # stores test and train id in list
  }
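  # NB: rbind-ing lists gives a CV.split x 2 list-matrix that is indexed
  # column-major, so index[[j]] holds the training ids of fold j and
  # index[[j + CV.split]] the matching test ids.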
  
  for (j in 1:CV.split){
    
    a_name = substr(abrev_name, 1, 
                    nchar(abrev_name)-4) # removes the .csv part
    
    a_name = paste(a_name, j-1, sep = '_') # adds iter value to name
    abrevname = paste(a_name, '.csv', sep = '') # adds '.csv' to name
    
    trn.data = slice(data, index[[j]]) # Takes out the training sample based on the row ids
    write_csv(trn.data, paste0(path_dset_outp, "CV_train_", abrevname)) # stores as csv file and adds string to name
    
    tst.data = slice(data, index[[j + CV.split]]) # takes out the test sample based on the row ids (second column of the list-matrix)
    write_csv(tst.data, paste0(path_dset_outp_test, "CV_test_", abrevname))
  }
  
}

############################################################################################################
############################################################################################################
######################################## Create an extended dataset ########################################
############################################################################################################
############################################################################################################

## Fingerprints
fp_nbits = 1024
fp_prefix = "FCFP4_1024b"

## Seed for reproducibility ====
set.seed(123)

## Set working directory ====
setwd("C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Work/MTL Paper/100 Datasets")

## Paths ====
path_dsets = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Work/MTL Paper/100 Datasets/Splits Train/"
path_dset_outp = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Work/MTL Paper/100 Datasets/extended data/"
dir.create(path_dset_outp, recursive = T)

## Read in all datasets ====
multipl = read_bulk(directory = path_dsets, # row-binds every CSV in the folder
                    fun = readr::read_csv,
                    col_types = cols(
                      .default = col_integer(),
                      molecule_id = col_character(),
                      pXC50 = col_double()
                    ))
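# read_bulk() also adds a `File` column recording each row's source file,
# which create_extdset() uses to separate the primary task from its assistants.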

## Number of CV folds ====
CV.split = 10

## Fingerprint representation ====
multipl_fp = multipl %>% distinct(molecule_id, .keep_all = T) 
molid_list = multipl_fp$molecule_id
multipl_fp = multipl_fp %>% df2matfp %>% create_fplist
names(multipl_fp) = molid_list # names each fingerprint by its molecule id
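# multipl_fp is now a named list of `fingerprint` objects, one per unique
# molecule, so e.g. multipl_fp["CHEMBL12345"] (hypothetical id) retrieves that
# molecule's 1024-bit FCFP4 fingerprint.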

# Drop fingerprints from datasets to save memory
# ... but first store the fingerprint column names
fpcolnams = str_subset(names(multipl), fp_prefix) 
multipl = multipl %>% select(-starts_with(fp_prefix))

train.split = list.files(path_dsets)

walk(train.split, create_extdset)
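# purrr::walk() applies create_extdset() to each split file purely for its
# side effect (the written "ext_*.csv") and returns its input invisibly.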

############################################################################################################
############################################################################################################
########################################## Create the data splits ##########################################
############################################################################################################
############################################################################################################

## Set Working Directory ====
setwd("G:/MTL Paper/datasets_dhfr/Piled")

## Paths ====
path_dset_outp = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Paper/100 Datasets/CV Single splits Train/"
dir.create(path_dset_outp, recursive = T)

path_dset_outp_ext = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Paper/100 Datasets/CV Assistant Splits Train/"
dir.create(path_dset_outp_ext, recursive = T)

## List the files in the folder needed for the analysis  ====
file = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Paper/100 Datasets/Splits Train"
list_mydata.t = list.files(file, 
                           full.names = T)

file.ext = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/PhD/MTL Paper/100 Datasets/extended"
list_mydata.ext = list.files(file.ext, 
                             full.names = T)

sz = length(list_mydata.t)

## Seed for reproducibility ====
set.seed(123)

## Arrange the splits in the data ====
for (i in 1:sz) {
  data_fname = list_mydata.t[i] 
  data = read_csv(data_fname) # reads in each file of data
  
  a = nchar(data_fname) # Number of characters
  abrev_name = substr(data_fname, (a - 13), a) # shortened name
  
  # assigns the rows to CV.split groups of (near-)equal size
  cv = cvFolds(n = nrow(data), CV.split)
  
  index = foreach(iter = 1:CV.split, .combine = "rbind") %do% {
    trn.id = cv$subsets[cv$which != iter] # Takes out row ids that are not equal to the iteration number
    tst.id = cv$subsets[cv$which == iter] # test set are the row ids that are equal to the iteration number
    
    index = list(train = trn.id, test = tst.id) # stores test and train id in list
  }
  
  for (j in 1:CV.split){
    
    a_name = substr(abrev_name, 1, 
                    nchar(abrev_name)-4) # removes the .csv part
    
    a_name = paste(a_name, j-1, sep = '_') # adds iter value to name
    abrevname = paste(a_name, '.csv', sep = '') # adds '.csv' to name
    
    trn.data = slice(data, index[[j]]) # Takes out the training sample based on the row ids
    write_csv(trn.data, paste0(path_dset_outp, "CV_train_", abrevname)) # stores as csv file and adds string to name
    
  }
  
  data_fname.ext = list_mydata.ext[i]
  data.ext = read_csv(data_fname.ext)# reads in each file of data
  
  prim = nrow(data) # number of rows in the primary data
  end = nrow(data.ext) # number of rows in the extended dataset
  
  extd = (prim + 1):end # row indices of the appended assistant molecules
  ext.dset = list(extended = extd)
  
  assist.data = slice(data.ext, ext.dset[[1]])
  
  size = nrow(assist.data)
  if (size < CV.split){ # cvFolds requires n >= K, so small assistant sets are padded
    blanks = matrix(NA, nrow = (CV.split - size), ncol = ncol(assist.data)) # all-NA filler rows (stripped again later by na.omit)
    colnames(blanks) = colnames(assist.data) # matching column names so rbind works
    assist.data = rbind(assist.data, blanks) # appends the NA rows to the assistant data
  }
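  # Worked example of the padding rule: with CV.split = 10 and 7 assistant
  # rows, 3 all-NA rows are appended so that cvFolds(n = 10, K = 10) is valid;
  # na.omit() removes them again before model fitting.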
  
  cv.assist = cvFolds(n = nrow(assist.data), CV.split) # builds the cross-validation folds for the assistant data
  
  index.assist = foreach(iter = 1:CV.split, .combine = "rbind") %do% {
    trn.id = cv.assist$subsets[cv.assist$which != iter] # row ids not in fold `iter` form the training set
    tst.id = cv.assist$subsets[cv.assist$which == iter] # row ids in fold `iter` form the test set
    
    index = list(train = trn.id, test = tst.id) # stores train and test ids in a list
  }
  
  for (j in 1:CV.split){
    
    a_name = substr(abrev_name, 1, 
                    nchar(abrev_name)-4) # removes the .csv part
    
    a_name = paste(a_name, j-1, sep = '_') # adds iter value to name
    abrevname = paste(a_name, '.csv', sep = '') # adds '.csv' to name
    
    trn.data = slice(assist.data, index.assist[[j]]) # takes out the assistant training sample based on the row ids
    
    write_csv(trn.data, paste0(path_dset_outp_ext, "assist_", abrevname)) # stores as csv file and adds string to name
  }
}

############################################################################################################
############################################################################################################
############################################ Reformat the data #############################################
############################################################################################################
############################################################################################################

## Set Working Directory ====
setwd("C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/MTL Paper/100 Datasets/Alternative Approach")

## Seed for reproducibility ====
set.seed(123)

## List the files in the folder needed for the analysis ====
file_locations = list.files(path = getwd(), 
                            full.names = F)

file_locations

list_assist <- list.files(file_locations[1], 
                          full.names = T)

list_CVtest <- list.files(file_locations[2], 
                          full.names = T)

list_CVtrain <- list.files(file_locations[3], 
                           full.names = T)

## Generate storage files ====
path_dset_trn = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/MTL Paper/100 Datasets/Alternative Approach/Train data/"
dir.create(path_dset_trn, recursive = T)

path_dset_tst = "C:/Users/cmppmcc1/OneDrive - Liverpool John Moores University/MTL Paper/100 Datasets/Alternative Approach/Test data/"
dir.create(path_dset_tst, recursive = T)

file.copy(list_CVtest, path_dset_tst) # copy test files to new folder

## Format training data ====
train = foreach(tr.file = list_CVtrain) %do% {
  read_csv(tr.file)
}

added = foreach(assist = list_assist) %do% {
  read_csv(assist)
}

## Run extender ====
for (i in 1:length(list_CVtrain)) {
  data_fname = list_CVtrain[i]
  a = nchar(data_fname)
  abrev_name = substr(data_fname, (a - 15), a)
  
  dat = rbind(train[[i]], added[[i]])
  
  write_csv(dat, paste0(path_dset_trn, abrev_name))
}

############################################################################################################
############################################################################################################
##################################### Run random forest - STL and MTL ######################################
############################################################################################################
############################################################################################################

## Set Working Directory ====
setwd("F:/MTL Paper/10 Datasets/Standard Assistant/")
a = "F:/MTL Paper/10 Datasets/Standard Assistant"

## Seed for reproducibility ====
set.seed(123)

## List the files in the folder needed for the analysis ====
file_locations = list.files(path = getwd(), 
                            full.names = F)

file_locations # check positions

list_added.tr <- list.files(file_locations[6], 
                            full.names = T) # change position number if needed

list_CVtest <- list.files(file_locations[5], 
                          full.names = T)

list_CVtrain <- list.files(file_locations[3], 
                           full.names = T)

# Iterations for 10-fold cross validation
n_iters = 10

## Format the data ====
trains = foreach(data_tr.name = list_CVtrain) %do% {
  read_csv(data_tr.name) # reads in each file of data
}

tests = foreach(data_tst.name = list_CVtest) %do% {
  read_csv(data_tst.name)
}

trains.added = foreach(data_tr.name = list_added.tr) %do% {
  read_csv(data_tr.name) 
}


## Run the Random Forest ====
############################################################
#### SINGLE TASK ====
RF.sep = foreach(trains_iter = 1:length(trains), .combine = "rbind") %do% {
  trn.data = trains[[trains_iter]] # Select given datafold 
  trn.dset = trn.data %>% select(-molecule_id) # removes the molecule column
  
  tst.data = tests[[trains_iter]] # Select corresponding datafold
  tst.dset = tst.data %>% select(-molecule_id)
  
  mdl = ranger(pXC50 ~ ., data = trn.dset) # fits the random forest model with pXC50 as the response and all remaining columns as predictors
  
  preds = predict(mdl, data = tst.dset)
  
  names = list_CVtrain[trains_iter]
  names = substr(names, 26, nchar(names))
  names = substr(names, 1, nchar(names)-6)
  
  outp = data_frame(iter = trains_iter, true = tst.dset$pXC50, predicted = preds$predictions, dataset_id = names) # stores the actual and predicted values in separate columns
}
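# Note: ranger() grows 500 trees by default; predict() on a ranger model
# returns an object whose $predictions slot holds the numeric predictions
# that foreach rbinds into RF.sep above.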

write_csv(RF.sep, paste0(a, '/RF single perf.csv')) # note the '/' so the file lands inside the folder
############################################################

#### MTL ====
RF.mult = foreach(trains_iter = 1:length(tests), .combine = "rbind") %do% {
  trn.data = trains.added[[trains_iter]] # Select given datafold 
  trn.data = na.omit(trn.data)
  trn.dset = trn.data %>% select(-molecule_id) # removes the molecule column
  
  tst.data = tests[[trains_iter]] # Select corresponding datafold
  tst.dset = tst.data %>% select(-molecule_id)
  
  mdl = ranger(pXC50 ~ ., data = trn.dset) # fits the random forest model with pXC50 as the response and all remaining columns as predictors
  
  preds = predict(mdl, data = tst.dset)
  
  names = list_CVtrain[trains_iter]
  names = substr(names, 26, nchar(names))
  names = substr(names, 1, nchar(names)-6)
  
  print(trains_iter) # to check progress
  
  outp = data_frame(iter = trains_iter, true = tst.dset$pXC50, predicted = preds$predictions, dataset_id = names) # stores the actual and predicted values in separate columns
}

write_csv(RF.mult, paste0(a, '/RF MTL perf.csv'))