# inst/scripts/MLpipeline.R

library(E2Predictor)
library(readr)
library(stringr)
library(dplyr)    # filter(), rename(), select() and mutate() are used below
library(tidyr)
library(rpart)
library(randomForest)
library(pROC)
#library(neuralnet)   # called below via neuralnet::, so attaching it is optional

### Configuration #########################

# You always need to initialise the E2Predictor configuration. Change the paths to match your setup.
initConfiguration(netMHCpath = "/Users/napedro/external_sources/netMHC-4.0/netMHC",
                  working.path = "/Users/napedro/CloudStation")

# UniProt database
dbfile <- file.path(E2Predictor.Config$working.path, "databases", "canonical", "uniprot-human.tab.gz")


# Classification target variable (class label).
class_category = "has_ligands"


# Define which experiment (cell line, perturbation, ...) you want to access
cell.lines <- names(E2Predictor.Config$hla_alleles)
conditions <- c("Control", "Bortezomib", "DMOG", "IFNg", "Rapamycin")
cell.line <- cell.lines[1]
is_perturbation <- F
condition <- conditions[5]
addgrepcond <- ""  # " II "

analysis_folder <- file.path(E2Predictor.Config$working.path, "data_analyses", "machine_learning")

# requireAllele: if FALSE, all ligands are considered;
# if TRUE, only proteins related to at least one ligand associated with a reliably predicted allele are considered
requireAllele = TRUE

# factor.training is applied to the smallest class (in this case, has_ligands == TRUE) when building the training sample
factor.training <- 0.6

# Neural network configuration only (sizes of the hidden layers)
hidden.layers <- c(7, 2)

set.seed(1234567890)
###########################################


experim.folder <- "cell.lines"
if(is_perturbation) experim.folder <- "perturbations"

# Folder for saving neural network results (not essential to the pipeline)
ANN.folder <- file.path(E2Predictor.Config$working.path, "data_analyses", "machine_learning", "ANN", experim.folder, cell.line)
if(is_perturbation) ANN.folder <- file.path(ANN.folder, paste(condition, addgrepcond, sep = "_"))
mkdir(ANN.folder)

## Load data: this reads tables from the folder "tables.for.modeling". They include protein values, RPKM, alleles, and GO-enriched terms.
## Note: this example call uses its own hard-coded parameters rather than the cell.line / is_perturbation / condition variables configured above.
omics <- load_data_modeling(cell.line = cell.lines[1], is_perturbation = T, condition = conditions[1], addgrepcond = " I ", requireAllele = requireAllele)
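
# Quick inspection sketch of the loaded table (assumes load_data_modeling() returns a data
# frame containing a has_ligands column and "[GO ...]" columns, matching their use further below):
dim(omics)
table(omics$has_ligands)
length(grep("\\[GO.*", names(omics), value = TRUE))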

## Example combining two cell lines, training on only one of them and testing on the other
omics.cellline1 <- load_data_modeling(cell.line = cell.lines[1], is_perturbation = F, requireAllele = requireAllele)    # This is JY
omics.cellline2 <- load_data_modeling(cell.line = cell.lines[2], is_perturbation = F, requireAllele = requireAllele)    # This is LCLC-103H
omics.combined <- rbind(omics.cellline1, omics.cellline2)


## additional knowledge databases

# The crawler table includes several predictions of cellular localization and presence of helices
crawler <- read_file_crawler( file.path(E2Predictor.Config$working.path, "predictions.crowler/Uniprot_Predictions_Localization_Crawler.xlsx") )
crawler.preML <- prelim_ML(df = crawler, dim.reduction.factor = 0.6, save.to = file.path(analysis_folder, "crawler"))
crawler.red <- crawler[, !names(crawler) %in% crawler.preML$highly.correlated.vars]

# SwissProt database, containing a wealth of information about each protein. Here it is filtered down to five
# features (Length, Mass, Transmembrane, Glycosylation and Lipidation), but you may include more.
swissprot <- readr::read_tsv(file = dbfile)
swissprot <- swissprot %>%
    filter(Status == "reviewed") %>%
    rename(protein_entry = Entry) %>%
    select(protein_entry, Length, Mass, Transmembrane, Glycosylation, Lipidation) %>%
    mutate( Transmembrane = ifelse(is.na(Transmembrane), FALSE, TRUE)
                                    ,Glycosylation = ifelse(is.na(Glycosylation), FALSE, TRUE)
                                    ,Lipidation = ifelse(is.na(Lipidation), FALSE, TRUE))


# Table generated by Sebastian with netctl predictions
netctl <- read.table(file.path(E2Predictor.Config$working.path, "predictions", "netctl.tsv"), sep = "\t", header = T)

netctl.preML <- prelim_ML(netctl, dim.reduction.factor = 0.6, save.to = file.path(analysis_folder, "netctl_table"))

# Merge the tables. Remember that here you can also add more cell lines, etc.; the best moment
# to do so is before merging with the additional knowledge tables.
## something like:
## omics.2 <- load_data_modeling(cell.line = cell.lines[2], is_perturbation = F, requireAllele = requireAllele)
## omics <- rbind(omics, omics.2)
omics <- merge(omics, crawler.red, by="protein_entry" )
omics <- merge(omics, swissprot, by="protein_entry")
omics <- merge(omics, netctl, by.x="Entry name", by.y = "Entry.name")

omics.combined <- merge(omics.combined, crawler.red, by="protein_entry")
omics.combined <- merge(omics.combined, swissprot, by="protein_entry")
omics.combined <- merge(omics.combined, netctl, by.x="Entry name", by.y = "Entry.name")
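
# Note: merge() performs an inner join by default, so proteins missing from crawler.red,
# swissprot or netctl are dropped from omics and omics.combined. To keep unmatched proteins
# instead, pass all.x = TRUE, e.g.:
# omics <- merge(omics, crawler.red, by = "protein_entry", all.x = TRUE)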

# prepare data modeling

## Variables used for preparing the data modeling. A single definition of these variables would probably be enough,
## so that all numeric (and boolean) variables are normalized. Afterwards you may train on a subset of variables
## by using a different formula.
df1.vars <- c("has_ligands", "PROTEIN_QUANTITY", "RPKM.avg",  grep("\\[GO.*", names(omics), value = T)) #
df2.vars <- c(df1.vars, "Length", "Mass", "Transmembrane", "Glycosylation", "Lipidation", names(crawler.red)[2:length(crawler.red)])
df3.vars <- c(df1.vars, names(netctl)[2:length(names(netctl))])

## Prepare the data for modeling, using only the variables defined above (three separate variable subsets are kept here, although a single one would suffice)
df1 <- E2Predictor::prepare_data_modeling(omics, useVariables = df1.vars)
df2 <- E2Predictor::prepare_data_modeling(omics, useVariables = df2.vars)
df3 <- E2Predictor::prepare_data_modeling(omics, useVariables = df3.vars)
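
# Sanity-check sketch (assumes prepare_data_modeling() simply subsets to the requested
# variables; if it also renames or rescales columns, the check below is only indicative):
# setdiff(df1.vars, names(df1))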

omics.combined.datamodel <- prepare_data_modeling(omics.combined, useVariables = df3.vars)

## Training / test datasets. get_training_sample() returns an object containing two data frames: trainset and testset
df1_tr <- get_training_sample(df1, class_category = class_category, factor.training = factor.training)
df2_tr <- get_training_sample(df2, class_category = class_category, factor.training = factor.training)
df3_tr <- get_training_sample(df3, class_category = class_category, factor.training = factor.training)
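
# Class-balance sketch for the train/test split (assumes trainset and testset are data frames
# that still contain the class_category column, as used in the random forest fits below):
table(df1_tr$trainset[[class_category]])
table(df1_tr$testset[[class_category]])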

# For the training sample of the combined JY and LCLC data, 100% of JY is used for training. If it works, all proteins
# in trainset should come from JY, and all proteins in testset should come from LCLC-103H.
omics.combined_tr <- get_training_sample(omics.combined.datamodel, class_category = class_category, factor.training = 1, grep_rowname = "^JY")
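
# Sketch to verify the split behaves as described above (assumes row names carry the cell line
# prefix, as implied by grep_rowname = "^JY"):
all(grepl("^JY", rownames(omics.combined_tr$trainset)))   # expected TRUE
any(grepl("^JY", rownames(omics.combined_tr$testset)))    # expected FALSE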

## Preliminary Machine Learning (var. correlation & PCA)
df1.preML <- prelim_ML(df = df1, dim.reduction.factor = 0.6, save.to = file.path(analysis_folder, "df1") )
df2.preML <- prelim_ML(df = df2, dim.reduction.factor = 0.6, save.to = file.path(analysis_folder, "df2") )
df3.preML <- prelim_ML(df = df3, dim.reduction.factor = 0.6, save.to = file.path(analysis_folder, "df3") )
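
# prelim_ML() reports highly correlated variables (see crawler.preML above); a sketch of
# dropping them before training, analogous to crawler.red above and df2.vars.red below:
# df1.red <- df1[, !names(df1) %in% df1.preML$highly.correlated.vars]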

omics.combined.preML <- prelim_ML(df= omics.combined.datamodel, dim.reduction.factor = 0.7, save.to = file.path(analysis_folder, "combined.JY.LCLC"))

## Machine Learning

f.simple <- as.formula("has_ligands ~ PROTEIN_QUANTITY + RPKM.avg")
f.all <- as.formula(paste(class_category, "~", paste(names(omics.combined_tr$trainset)[!names(omics.combined_tr$trainset) %in% class_category], collapse = " + ")))

forestFit <- randomForest(x=df1_tr$trainset[, !names(df1_tr$trainset) %in% class_category], y=as.factor(df1_tr$trainset[,class_category]),
                          importance=TRUE, do.trace=100, ntree = 500)

forestFit.2 <- randomForest(x=df2_tr$trainset[, !names(df2_tr$trainset) %in% class_category], y=as.factor(df2_tr$trainset[,class_category]),
                          importance=TRUE, do.trace=100, ntree = 500)


forestFit.3 <- randomForest(x=df3_tr$trainset[, !names(df3_tr$trainset) %in% class_category], y=as.factor(df3_tr$trainset[,class_category]),
                            importance=TRUE, do.trace=100, ntree = 500)
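
# Inspect which features drive the random forests, using randomForest's importance() and
# varImpPlot(); shown here for the df3 model, but the same applies to the other fits:
importance(forestFit.3)
varImpPlot(forestFit.3, main = "Random forest variable importance (df3)")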


decTreeFit <- rpart(f.simple, data = omics.combined_tr$trainset, method = "anova")
decTreeFit.all <- rpart(f.all, data = omics.combined_tr$trainset, method = "anova")
decTreeFit.all.poisson <- rpart(f.all, data = omics.combined_tr$trainset, method = "poisson")

printcp(decTreeFit.all)
plotcp(decTreeFit.all)
plot(decTreeFit.all, uniform=TRUE)
summary(decTreeFit.all)


forestFit.Combined <- randomForest(x=omics.combined_tr$trainset[, !names(omics.combined_tr$trainset) %in% class_category], y=as.factor(omics.combined_tr$trainset[,class_category]),
                            importance=TRUE, do.trace=100, ntree = 500)


df2.vars.red <- names(df2_tr$trainset)[ !(names(df2_tr$trainset) %in% df2.preML$highly.correlated.vars) ]

##f2.red <- as.formula(paste("has_ligands~", paste( df2.vars.red , collapse = "+")))
##annFit.2.red <-  neuralnet(f.simple, data=df2_tr$trainset, hidden=hidden.layers, linear.output=F, stepmax = 1e+06)

annFit.Combined <- neuralnet::neuralnet(formula = f.simple, data = omics.combined_tr$trainset, linear.output = F, stepmax = 1e+06)
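
# Prediction sketch for the fitted network via neuralnet::compute(); the column selection
# assumes PROTEIN_QUANTITY and RPKM.avg are present in the testset, matching f.simple above:
# ann.pred <- neuralnet::compute(annFit.Combined,
#                                omics.combined_tr$testset[, c("PROTEIN_QUANTITY", "RPKM.avg")])
# head(ann.pred$net.result)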


## Evaluate the machine learning models. evaluate_ML() saves some results (ROC curves, etc.) to a user-defined folder
path.forestFit1 = file.path(analysis_folder, "df1")
path.forestFit2 = file.path(analysis_folder, "df2")
path.forestFit3 = file.path(analysis_folder, "df3")
path.decTreeFit = file.path(analysis_folder, "decTree")
path.decTreeFit.all = file.path(analysis_folder, "decTree.all")
path.decTreeFit.all.poisson = file.path(analysis_folder, "decTree.all.poisson")

path.forestFit.Comb = file.path(analysis_folder, "combined.JY.LCLC")
path.ANN.Comb = file.path(analysis_folder, "combined.JY.LCLC_ANN")


evaluate_ML(forestFit, df1_tr$testset, class_category = class_category, save.to = path.forestFit1)
evaluate_ML(forestFit.2, df2_tr$testset, class_category = class_category, save.to = path.forestFit2)
evaluate_ML(forestFit.3, df3_tr$testset, class_category = class_category, save.to = path.forestFit3)
evaluate_ML(forestFit.Combined, omics.combined_tr$testset, class_category = class_category, save.to = path.forestFit.Comb)
evaluate_ML(decTreeFit, df3_tr$testset, class_category = class_category, save.to = path.decTreeFit)
evaluate_ML(decTreeFit.all, df3_tr$testset, class_category = class_category, save.to = path.decTreeFit.all)
x <- evaluate_ML(decTreeFit.all.poisson, df3_tr$testset, class_category = class_category, save.to = path.decTreeFit.all.poisson)
#E2Predictor::evaluate_ML(annFit.Combined, omics.combined_tr$testset, class_category = class_category, save.to = path.ANN.Comb)
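
# The object returned by evaluate_ML() can be inspected directly; judging from the scratch code
# at the end of this script, it exposes pROC-style fields such as $auc and $sensitivities:
# x$auc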


f.red <- as.formula( paste(class_category, "~" ,  paste(names(decTreeFit.all$variable.importance), collapse = "+")) )
path.ANN.Comb.red <- file.path(analysis_folder, "combined.ANN.varReducedWithDecTree")
annFit.2.red <- neuralnet::neuralnet(f.red, data=omics.combined_tr$trainset, hidden=hidden.layers, linear.output=F, stepmax = 1e+06)
annFit.2.red.eval <- evaluate_ML(predictor=annFit.2.red, testset=omics.combined_tr$testset, class_category = class_category, save.to = path.ANN.Comb.red)

##### FROM HERE: CHAOS, FIRE, DEATH, DESTRUCTION, ... ##################################################


# Scratch / exploratory code, kept inside a function that is never called so it does not run
# when the script is sourced. It references objects that are not defined in this script
# (forestPredict, trainset, testset, omics.model, omics.model.sc, omics.model.variables,
# num.vars, gar.fun, plot.nnet), so it will not run as-is.
do.not.look.at.me <- function(){

    #forestFit$importance

    df1_tr$testset$predicted.rf <- as.numeric(forestPredict)
    plot.new()
    r3 <- roc(has_ligands ~ predicted.rf, df1_tr$testset, plot=TRUE, smooth=F, add=TRUE, percent=T)
    print(r3)

    df2_tr$testset$predicted.rf <- as.numeric(forestPredict)
    plot.new()
    r4 <- roc(has_ligands ~ predicted.rf, df2_tr$testset, plot=TRUE, smooth=F, add=TRUE, percent=T)
    print(r4)


    f <- as.formula(paste("has_ligands ~", paste(names(df2_tr$trainset)[!names(df2_tr$trainset) %in% "has_ligands"], collapse = " + ")))
    f.simple <- as.formula(paste("has_ligands ~ PROTEIN_QUANTITY + RPKM.avg"))

    lm.fit <- glm(f, data = df2_tr$trainset)
    pr.lm <- predict(lm.fit, df2_tr$testset)
    MSE.lm <- sum((pr.lm - df2_tr$testset$has_ligands)^2)/nrow(df2_tr$testset)

    lm.simple.fit <- glm(f.simple, data = df2_tr$trainset)
    pr.simple.lm <- predict(lm.simple.fit, df2_tr$testset)
    MSE.simple.lm <- sum((pr.simple.lm - df2_tr$testset$has_ligands)^2)/nrow(df2_tr$testset)

       # Train the neural network
    n <- names(trainset)
    f <- as.formula(paste("has_ligands ~", paste(n[!n %in% "has_ligands"], collapse = " + ")))
    #nn <- neuralnet(f,data=trainset,hidden=hidden.layers,linear.output=F)
    nn <- neuralnet(f,data=trainset,hidden=hidden.layers,linear.output=F, stepmax = 1e+06)

    #save the neural network
    nn_name <- paste(omics.model.variables, sep="_", collapse = "_")
    nn_name <- paste(nn_name, "HL", hidden.layers, sep="_", collapse ="_")

    saveRDS(nn, file=file.path(ANN.folder, paste("ANN.trained.GOandprotein", nn_name, "Rds", sep = ".") ))



    #Predicting values with the neural network
    pr.nn <- compute(nn, testset[, 2:ncol(testset)])

    pr.nn_ <- pr.nn$net.result*(max(omics.model$has_ligands)-min(omics.model$has_ligands))+min(omics.model$has_ligands)
    test.r <- (testset$has_ligands)*(max(omics.model$has_ligands)-min(omics.model$has_ligands))+min(omics.model$has_ligands)

    MSE.nn <- sum((test.r - pr.nn_)^2)/nrow(testset)




    decTreeFit <- rpart(f, data = df2_tr$trainset, method = "anova")
    printcp(decTreeFit)
    plotcp(decTreeFit)
    plot(decTreeFit, uniform=TRUE)
    summary(decTreeFit)

    decTreePredict <- predict(decTreeFit, df2_tr$testset)
    colors <- ifelse(df2_tr$testset$has_ligands == 0, "black", "red")
    plot(decTreePredict, pch=19, col=colors)
    plot(density(decTreePredict[df2_tr$testset$has_ligands==F]))
    lines(density(decTreePredict[df2_tr$testset$has_ligands==T]), col="red")
    df2_tr$testset$predicted.dt <- as.numeric(decTreePredict)
    r4 <- roc(has_ligands ~ predicted.dt, df2_tr$testset, plot=TRUE, smooth=F)
    print(r4)

    l <- evaluate_ML(decTreeFit, df2_tr$testset, "has_ligands")

    l$auc
    l$sensitivities
    str(l)


    # Plot the neural network
    pdf(file.path(ANN.folder,  paste("NeuralNetwork_map_GOandprotein", nn_name, "pdf", sep = ".") ), width = 10, height = 10)

    #relative importance of input variables for Y1
    rel.imp<-gar.fun('Y1',nn,bar.plot=F)$rel.imp

    #color vector based on relative importance of input values
    cols<-colorRampPalette(c('green','red'))(num.vars)[rank(rel.imp)]

    #plot model with new color vector
    #separate colors for input vectors using a list for 'circle.col'
    plot.nnet(nn, circle.col = list(cols, 'lightblue') )
    dev.off()


    ## Compare with a linear model fit
    lm.fit <- glm(f, data = trainset)
    pr.lm <- predict(lm.fit, testset)
    MSE.lm <- sum((pr.lm - testset$has_ligands)^2)/nrow(testset)


    neuralnet::print.nn(nn)

    nn <- readRDS("/Users/napedro/CloudStation/data_analyses/machine_learning/ANN/cell.lines/JY/ANN.trained.GO.HL_5_HL_2.Rds")
    pr.alldataset.nn <- compute(nn, omics.model.sc[, 2:ncol(omics.model.sc)])
    pr.alldataset.nn_ <- pr.alldataset.nn$net.result*(max(omics.model$has_ligands)-min(omics.model$has_ligands))+min(omics.model$has_ligands)

    colors <- ifelse(omics.model.sc$has_ligands == 0, "black", "red")
    plot(pr.alldataset.nn_,pch=19, col=colors)
    plot(density(pr.alldataset.nn_[omics.model$has_ligands==F]))
    lines(density(pr.alldataset.nn_[omics.model$has_ligands==T]), col="red")

}