library(E2Predictor)
library(gtools)
library(ggplot2)
### Configuration #########################

# Initialize the E2Predictor configuration (empty config name, project root).
E2Predictor::initConfiguration("", "/kitty/projects/E2P")

# Three JY datasets are merged: two baseline runs plus one perturbation run
# ("Control", whose files are additionally matched by the pattern " I ").
selected.cell.line = c("JY", "JY", "JY")
is_perturbation = c(FALSE, FALSE, TRUE)   # TRUE/FALSE instead of T/F (T/F are reassignable)
condition = c("", "", "Control")
additionalGrepCondition = c("", "", " I ")

# Response variable to be predicted, and a label used for output folders.
class_category = "has_ligands"
analysis_name = "mixed.Prot_JY_ligs_JYPerturbControlI"

# Precomputed MHC allele binding predictions for the JY "Control_I" perturbation.
allele.predictions = readRDS(file.path(E2Predictor.Config$working.path, "allele.predictions", "perturbations", "JY", "Control_I","allele.predictions.Rds"))

minReplicates = 2
allele.predictor = "ic50"   # which prediction score to use
nM.threshold = 1000         # IC50 cutoff (nM) for calling a binder
saveAllResults = TRUE

# GO categories enriched among ligand-positive / ligand-negative proteins.
GOenrichment.pos <- readRDS(file.path(E2Predictor.Config$working.path, "GO.enrichment", "go.enrichment.positive.Rds"))
GOenrichment.neg <- readRDS(file.path(E2Predictor.Config$working.path, "GO.enrichment", "go.enrichment.negative.Rds"))
GOcategories <- c(GOenrichment.pos$GOid, GOenrichment.neg$GOid)

# Canonical human UniProt table (reused below to build SwissProt features).
dbfile <- file.path(E2Predictor.Config$working.path, "databases", "canonical", "uniprot-human.tab.gz")

# Merge proteomics / transcriptomics / ligandome data into a single table.
omics <- E2Predictor::merge_omics(selected.cell.line = selected.cell.line,
                                  dbfile = dbfile,
                                  is_perturbation = is_perturbation,
                                  condition = condition,
                                  additionalGrepCondition = additionalGrepCondition,
                                  minReplicates = minReplicates,
                                  allele.predictions = allele.predictions,
                                  nM.threshold = nM.threshold,
                                  saveAllResults = saveAllResults,
                                  GOcategories = GOcategories)

# Output folder and machine-learning hyper-parameters.
analysis_folder <- file.path(E2Predictor.Config$working.path, "data_analyses", "machine_learning")
requireAllele = TRUE
factor.training <- 0.6     # fraction of the data used for training
hidden.layers <- c(6, 2)   # NOTE(review): defined but never used below — confirm whether still needed
set.seed(1234567890)       # reproducible train/test split and ANN initialization
###########################################
###########################################
# load data for modeling
#omics <- readRDS(modeling.file)

# Response variable: a protein "has ligands" when its ligand column is not NA.
# !is.na(x) replaces the equivalent ifelse(is.na(x), FALSE, TRUE).
omics <- omics %>% mutate(has_ligands = !is.na(ligands)) # variable to be predicted

# Optionally require at least one non-NA predicted allele as well.
# vapply (not sapply) guarantees a logical vector regardless of input length.
if(requireAllele) omics$has_ligands <- vapply(omics$predicted.allele, function(x) !all(is.na(x)), logical(1))

#omics$key <- paste(cell.line, "perturbation", is_perturbation, condition, addgrepcond, "requireAllele", requireAllele, sep = "_" )
omics$key <- "Prot_JY_ligs_from_perturbationControlI"

# Just for the moment, so that files are homogene.
if("FILENAME" %in% names(omics)) omics$FILENAME <- NULL
## additional knowledge databases
# crawler includes several predictions for cell localization and presence of helices
crawler <- read_file_crawler( file.path(E2Predictor.Config$working.path, "predictions.crowler/Uniprot_Predictions_Localization_Crawler.xlsx") )
# Preliminary ML pass (variable correlation / PCA); results are written to disk.
crawler.preML <- prelim_ML(df = crawler, dim.reduction.factor = 0.6, save.to = file.path(analysis_folder, "crawler"))
# Keep only crawler columns that were NOT flagged as highly correlated above.
keep.crawler.cols <- !(names(crawler) %in% crawler.preML$highly.correlated.vars)
crawler.red <- crawler[, keep.crawler.cols]
# SwissProt database, including many info about each protein. Here I filtered it to only 5
# features (Length, Mass, Transmembrane, Glycosylation and Lipidation), but you may include more
swissprot <- readr::read_tsv(file = dbfile)
swissprot <- swissprot %>%
  filter(Status == "reviewed") %>%     # keep curated (SwissProt) entries only
  rename(protein_entry = Entry) %>%
  select(protein_entry, Length, Mass, Transmembrane, Glycosylation, Lipidation) %>%
  # Annotation columns are NA when the feature is absent; convert them to
  # logical presence flags. !is.na(x) replaces ifelse(is.na(x), FALSE, TRUE).
  mutate( Transmembrane = !is.na(Transmembrane)
         ,Glycosylation = !is.na(Glycosylation)
         ,Lipidation = !is.na(Lipidation))
# Table generated by Sebastian with netctl predictions
netctl <- read.table(file.path(E2Predictor.Config$working.path, "predictions", "netctl.tsv"), sep = "\t", header = TRUE)
# netctl.preML <- prelim_ML(netctl, dim.reduction.factor = 0.6, save.to = file.path(analysis_folder, "netctl_table"))

# Merge several tables. Remember that here you can also add more cell lines, etc...
# NOTE(review): merge() defaults to an inner join — proteins missing from
# crawler/swissprot/netctl are silently dropped from omics.combined.
omics.combined <- merge(omics, crawler.red, by="protein_entry")
omics.combined <- merge(omics.combined, swissprot, by="protein_entry")
omics.combined <- merge(omics.combined, netctl, by.x="Entry name", by.y = "Entry.name")
# prepare data modeling
## Defined variables for preparing data modeling. I guess it would be enough to have just one definition for these variables,
## so that all numeric (and boolean) variables are normalized. After wards you may train over a subset of variables by using
## a different formula

# Core variables: response, protein quantity, expression, and all GO columns
# (whose names contain the literal substring "[GO").
df1.vars <- c("has_ligands", "PROTEIN_QUANTITY", "RPKM.avg", grep("\\[GO.*", names(omics.combined), value = TRUE)) #
# names(x)[-1] drops the key column; unlike 2:length(x) it is also safe for a
# one-column table (2:1 would index c(2, 1) and yield an NA name).
df2.vars <- c(df1.vars, "Length", "Mass", "Transmembrane", "Glycosylation", "Lipidation", names(crawler.red)[-1])
df.vars <- c(df2.vars, names(netctl)[-1])

## Prepare the data for modeling, by using only the variables defined above (I still keep these three variable subsets, which is useless...)
omics.combined.datamodel <- prepare_data_modeling(omics.combined, useVariables = df.vars)

## test / training datasets. This returns an object containing two data frames: trainset and testset
omics.combined_tr <- get_training_sample(omics.combined.datamodel, class_category = class_category, factor.training = factor.training)

## Preliminary Machine Learning (var. correlation & PCA)
omics.combined.preML <- prelim_ML(df= omics.combined.datamodel, dim.reduction.factor = 0.7, save.to = file.path(analysis_folder, analysis_name))
## Machine Learning
# Two candidate formulas: expression-only, and all prepared features.
f.simple <- as.formula("has_ligands ~ PROTEIN_QUANTITY + RPKM.avg")
f.all <- as.formula(paste(class_category, "~", paste(names(omics.combined_tr$trainset)[!names(omics.combined_tr$trainset) %in% class_category], collapse = " + ")))

# Decision tree over all features; its variable importance ranks the inputs.
# NOTE(review): method = "anova" fits a regression tree to a logical target;
# method = "class" is the usual choice for classification — confirm intent.
decTreeFit.all <- rpart::rpart(f.all, data = omics.combined_tr$trainset, method = "anova")
rpart::printcp(decTreeFit.all)
rpart::plotcp(decTreeFit.all)
plot(decTreeFit.all, uniform=TRUE)
summary(decTreeFit.all)
decTreeFit.all$variable.importance

## Evaluate Machine Learning. It saves some results (ROC curves, etc) at a user defined folder
path.ANN.Optimization.base = file.path(analysis_folder, analysis_name, "optimization")

# Top-10 most important features, and every leave-one-out subset of them
# (each combination drops exactly one feature).
features <- names(sort(decTreeFit.all$variable.importance, decreasing = TRUE))[1:10]
feat.combinations <- combinations(n = length(features), r=length(features) -1 , v=features, set=TRUE, repeats.allowed = FALSE)
# Train one ANN on a given feature combination and evaluate it on the test set.
#
# feats:       character vector of predictor names.
# traintest:   list with $trainset and $testset data frames.
# folder.name: subfolder (under path.ANN.Optimization.base) for this run's output.
#
# Reads the globals class_category, path.ANN.Optimization.base and n.comb, and
# advances n.comb via <<- only after a successful fit — so the counter also
# numbers the per-combination output folders.
# Returns list(predictor, roc), or NA when nnet() returns NULL.
run.ANN.nnet <- function(feats, traintest, folder.name){
  f <- as.formula( paste(class_category, "~" , paste(feats, collapse = "+")) )
  print(paste0("Running ANN: ", n.comb))
  print(f)
  # Persist the formula used for this combination next to its results.
  save.to <- file.path(path.ANN.Optimization.base, folder.name, paste("comb", n.comb, sep = "_"))
  mkdir(save.to)
  sink(file = file.path(save.to, "features.txt"))
  print(f)
  sink()
  # One hidden layer of 10 units, logistic output (linout = FALSE), with
  # skip-layer connections. TRUE/FALSE spelled out instead of T/F.
  annFit <- nnet::nnet(f, data=traintest$trainset, size = 10, linout=FALSE, skip=TRUE, maxit=10000)
  if(is.null(annFit)) return(NA)
  annFit.ROC <- evaluate_ML(predictor=annFit, testset=traintest$testset, class_category = class_category, evaluation.features = feats, save.to = save.to)
  n.comb <<- n.comb + 1   # advance the global combination counter
  return(list( predictor = annFit, roc = annFit.ROC) )
}
# Global counter used by run.ANN.nnet to number its output folders; the
# function advances it with <<- after each successful fit.
n.comb <- 1
# Train and evaluate one ANN per row of feat.combinations (each row is one
# leave-one-out feature subset); results are collected into a list.
permutations.training.nnet.JY <- apply(feat.combinations, 1, run.ANN.nnet, omics.combined_tr, "permutations.training.nnet.skipTrue.JY")
# NOTE(review): the two lines below are website-scraper residue (boilerplate
# from the page this script was copied from), not R code; commented out so the
# script parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.