# inst/scripts/ANN_optimization_mixedData.R

library(E2Predictor)
library(dplyr)    # %>% and the data-manipulation verbs (mutate, filter, rename, select) used below
library(gtools)   # combinations()
library(ggplot2)

### Configuration #########################

E2Predictor::initConfiguration("", "/kitty/projects/E2P")

selected.cell.line = c("JY", "JY", "JY")
is_perturbation = c(F,F,T)
condition = c("", "", "Control")
additionalGrepCondition = c("", "", " I ")
class_category = "has_ligands"
analysis_name = "mixed.Prot_JY_ligs_JYPerturbControlI"

allele.predictions = readRDS(file.path(E2Predictor.Config$working.path, "allele.predictions", "perturbations", "JY", "Control_I","allele.predictions.Rds"))
minReplicates = 2
allele.predictor = "ic50"
nM.threshold = 1000
saveAllResults = T

GOenrichment.pos <- readRDS(file.path(E2Predictor.Config$working.path, "GO.enrichment", "go.enrichment.positive.Rds"))
GOenrichment.neg <- readRDS(file.path(E2Predictor.Config$working.path, "GO.enrichment", "go.enrichment.negative.Rds"))

GOcategories <- c(GOenrichment.pos$GOid, GOenrichment.neg$GOid)

dbfile <- file.path(E2Predictor.Config$working.path, "databases", "canonical", "uniprot-human.tab.gz")

omics <- E2Predictor::merge_omics(selected.cell.line = selected.cell.line,
                                     dbfile = dbfile,
                                     is_perturbation = is_perturbation,
                                     condition = condition,
                                     additionalGrepCondition = additionalGrepCondition,
                                     minReplicates = minReplicates,
                                     allele.predictions = allele.predictions,
                                     nM.threshold = nM.threshold,
                                     saveAllResults = saveAllResults,
                                     GOcategories = GOcategories)

analysis_folder <- file.path(E2Predictor.Config$working.path, "data_analyses", "machine_learning")

requireAllele = TRUE
factor.training <- 0.6
hidden.layers <- c(6, 2)  # note: not used below; nnet supports a single hidden layer (its size is set in run.ANN.nnet)

set.seed(1234567890)
###########################################


# load data for modeling
#omics <- readRDS(modeling.file)

omics <- omics %>% mutate(has_ligands = ifelse(is.na(ligands), FALSE, TRUE))  # variable to be predicted

if(requireAllele) omics$has_ligands <- sapply(omics$predicted.allele, function(x) !all(is.na(x)) )

#omics$key <- paste(cell.line, "perturbation", is_perturbation, condition, addgrepcond, "requireAllele", requireAllele, sep = "_" )
omics$key <- "Prot_JY_ligs_from_perturbationControlI"

# Just for the moment, so that the files are homogeneous.
if("FILENAME" %in% names(omics)) omics$FILENAME <- NULL

## additional knowledge databases

# crawler includes several predictions for cell localization and presence of helices
crawler <- read_file_crawler( file.path(E2Predictor.Config$working.path, "predictions.crowler/Uniprot_Predictions_Localization_Crawler.xlsx") )
crawler.preML <- prelim_ML(df = crawler, dim.reduction.factor = 0.6, save.to = file.path(analysis_folder, "crawler"))
crawler.red <- crawler[, !names(crawler) %in% crawler.preML$highly.correlated.vars]

# SwissProt database, with extensive information about each protein. Here it is filtered down to only
# five features (Length, Mass, Transmembrane, Glycosylation and Lipidation), but you may include more.
swissprot <- readr::read_tsv(file = dbfile)
swissprot <- swissprot %>%
    filter(Status == "reviewed") %>%
    rename(protein_entry = Entry) %>%
    select(protein_entry, Length, Mass, Transmembrane, Glycosylation, Lipidation) %>%
    mutate( Transmembrane = ifelse(is.na(Transmembrane), FALSE, TRUE)
            ,Glycosylation = ifelse(is.na(Glycosylation), FALSE, TRUE)
            ,Lipidation = ifelse(is.na(Lipidation), FALSE, TRUE))


# Table generated by Sebastian with NetCTL predictions
netctl <- read.table(file.path(E2Predictor.Config$working.path, "predictions", "netctl.tsv"), sep = "\t", header = T)
# netctl.preML <- prelim_ML(netctl, dim.reduction.factor = 0.6, save.to = file.path(analysis_folder, "netctl_table"))

# Merge several tables. Remember that here you can also add more cell lines, etc...
omics.combined <- merge(omics, crawler.red, by="protein_entry")
omics.combined <- merge(omics.combined, swissprot, by="protein_entry")
omics.combined <- merge(omics.combined, netctl, by.x="Entry name", by.y = "Entry.name")
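
## Sanity check (not part of the original workflow): merge() performs an inner join by default, so
## proteins missing from crawler.red, swissprot or netctl are silently dropped here.
message("omics rows: ", nrow(omics), " -> after merging: ", nrow(omics.combined))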

# prepare data modeling

## Variables used to prepare the data modeling. A single definition of these variables would be
## enough, so that all numeric (and boolean) variables are normalized; afterwards you can train
## on a subset of the variables simply by using a different formula.
df1.vars <- c("has_ligands", "PROTEIN_QUANTITY", "RPKM.avg",  grep("\\[GO.*", names(omics.combined), value = T)) #
df2.vars <- c(df1.vars, "Length", "Mass", "Transmembrane", "Glycosylation", "Lipidation", names(crawler.red)[2:length(crawler.red)])
df.vars <- c(df2.vars, names(netctl)[2:length(names(netctl))])
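
## Optional sanity check (assumption, not in the original script): verify that every variable in
## df.vars is present in the merged table before building the data model.
missing.vars <- setdiff(df.vars, names(omics.combined))
if (length(missing.vars) > 0) warning("Variables missing from omics.combined: ", paste(missing.vars, collapse = ", "))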

## Prepare the data for modeling, using only the variables defined above (the three variable
## subsets are kept for reference, but only df.vars is actually used here).
omics.combined.datamodel <- prepare_data_modeling(omics.combined, useVariables = df.vars)

## test / training datasets. This returns an object containing two data frames: trainset and testset
omics.combined_tr <- get_training_sample(omics.combined.datamodel, class_category = class_category, factor.training = factor.training)
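
## Quick look at the class balance in both splits (plain base R; assumes trainset and testset are
## data frames containing the class_category column, as described above).
print(table(train = omics.combined_tr$trainset[[class_category]]))
print(table(test  = omics.combined_tr$testset[[class_category]]))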

## Preliminary Machine Learning (var. correlation & PCA)
omics.combined.preML <- prelim_ML(df= omics.combined.datamodel, dim.reduction.factor = 0.7, save.to = file.path(analysis_folder, analysis_name))

## Machine Learning

f.simple <- as.formula("has_ligands ~ PROTEIN_QUANTITY + RPKM.avg")  # simple baseline formula (not fitted below)
f.all <- as.formula(paste(class_category, "~", paste(names(omics.combined_tr$trainset)[!names(omics.combined_tr$trainset) %in% class_category], collapse = " + ")))

# Fit a regression tree (method = "anova") on the logical target; used here mainly to rank variable importance.
decTreeFit.all <- rpart::rpart(f.all, data = omics.combined_tr$trainset, method = "anova")

rpart::printcp(decTreeFit.all)
rpart::plotcp(decTreeFit.all)
plot(decTreeFit.all, uniform=TRUE)
summary(decTreeFit.all)

decTreeFit.all$variable.importance
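
## Visualize the rpart variable importance (base graphics); the top-ranked variables feed the
## feature combinations explored by the ANN optimization below.
imp <- sort(decTreeFit.all$variable.importance, decreasing = TRUE)
barplot(head(imp, 20), las = 2, cex.names = 0.6, main = "rpart variable importance")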

## Evaluate the machine-learning models. Results (ROC curves, etc.) are saved in a user-defined folder.
path.ANN.Optimization.base = file.path(analysis_folder, analysis_name, "optimization")

features <- names(sort(decTreeFit.all$variable.importance, decreasing = T))[1:10]

feat.combinations <- combinations(n = length(features), r=length(features) -1 , v=features, set=TRUE, repeats.allowed = FALSE)
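
## Each row of feat.combinations is a leave-one-out subset of the top 10 features
## (choose(10, 9) = 10 rows), so every ANN trained below omits exactly one feature.
print(dim(feat.combinations))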

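## Train one single-hidden-layer ANN (nnet) for a given feature combination: write the formula to
## features.txt, fit the network on the training set, evaluate it on the test set with evaluate_ML
## (ROC curves etc. saved under save.to), and return the fitted model together with its ROC object.
## Uses the global counter n.comb to number the output folders.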
run.ANN.nnet <- function(feats, traintest, folder.name){

    f <- as.formula( paste(class_category, "~" ,  paste(feats, collapse = "+")) )
    print(paste0("Running ANN: ", n.comb))
    print(f)
    save.to <- file.path(path.ANN.Optimization.base, folder.name, paste("comb", n.comb, sep = "_"))
    dir.create(save.to, recursive = TRUE, showWarnings = FALSE)
    sink(file = file.path(save.to, "features.txt"))
    print(f)
    sink()

    annFit <- nnet::nnet(f, data=traintest$trainset, size = 10, linout=F, skip=T, maxit=10000)
    if(is.null(annFit)) return(NA)
    annFit.ROC <- evaluate_ML(predictor=annFit, testset=traintest$testset, class_category = class_category, evaluation.features = feats, save.to = save.to)


    n.comb <<- n.comb + 1

    return(list( predictor = annFit, roc = annFit.ROC) )
}

n.comb <- 1
permutations.training.nnet.JY <- apply(feat.combinations, 1, run.ANN.nnet, omics.combined_tr, "permutations.training.nnet.skipTrue.JY")
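
## Optionally label each result by the feature that was left out of its combination, to make the
## list easier to inspect afterwards (apply() keeps the row order of feat.combinations).
names(permutations.training.nnet.JY) <- apply(feat.combinations, 1, function(fts)
    paste0("without_", setdiff(features, fts)))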