DIANE: DIANE

library(DIANE)

data("abiotic_stresses")
data("gene_annotations")
data("regulators_per_organism")

####### go analysis

genes <- convert_from_agi(get_locus(abiotic_stresses$heat_DEGs))
background <- convert_from_agi(get_locus(rownames(abiotic_stresses$normalized_counts)))



go <- enrich_go(genes, background)
#DIANE::draw_enrich_go(go, max_go = 30)


library(plotly)
draw_heatmap(data = abiotic_stresses$normalized_counts)


heatmap(abiotic_stresses$heat_DEGs_regulatory_links, )


library(ggraph)

########### tests ACP

draw_PCA(abiotic_stresses$normalized_counts)


## demo 

raw_counts <- read.csv("D:/These/DIANE_inputs/demo_counts.csv", sep = ',', h = T, row.names = "Gene")

design <- read.csv("D:/These/DIANE_inputs/abiotic_stresses_design.csv", sep = ',', h = T, row.names = "Condition")

library(DIANE)
data("abiotic_stresses")
data("regulators_per_organism")
data("gene_annotations")


genes <- sample(abiotic_stresses$heat_DEGs, 4)
ggplotly(draw_expression_levels(abiotic_stresses$normalized_counts, genes = genes))%>%
  layout(legend = list(font = list(size = 15)))

conditions <- stringr::str_split_fixed(colnames(raw_counts), '_', 2)[,1]

gene_annotations[["Arabidopsis thaliana"]] <- gene_annotations[["Arabidopsis thaliana"]][1:dim(gene_annotations[["Arabidopsis thaliana"]])[1]-1,]

tail(gene_annotations[["Arabidopsis thaliana"]])

abiotic_stresses <- list(raw_counts = raw_counts, design = design, conditions = conditions)


############ tests group tfs


genes <- get_locus(abiotic_stresses$heat_DEGs)
regressors <- intersect(genes, regulators_per_organism$`Arabidopsis thaliana`)


data <- aggregate_splice_variants(abiotic_stresses$normalized_counts)
head(data)



r <- DIANE::group_regressors(data, genes, regressors)
tail(r$counts)


r$correlated_regressors_graph


mat <- DIANE::network_inference(r$counts, conds = abiotic_stresses$conditions, targets = r$grouped_genes,
                                 regressors = r$grouped_regressors)

network <- DIANE::network_thresholding(mat, n_edges = 150)

d_GENIE3 <- network_data(network, regulators_per_organism[["Arabidopsis thaliana"]])

library(visNetwork)
DIANE::draw_network(d_GENIE3$nodes, d_GENIE3$edges) %>% visNodes(font = list("size" = 0) )
DIANE::draw_network_degrees(d_GENIE3$nodes, network)


################# custom GO

library(clusterProfiler)

data <- read.csv("D:/These/DIANE_inputs/lupin_gene_count_matrix.csv", header = TRUE, row.names = "Gene")

tcc <- DIANE::normalize(data, conditions = str_split_fixed(colnames(data), '_', 2)[,1])
tcc <- DIANE::filter_low_counts(tcc, 10*ncol(data))

normalized_counts <- TCC::getNormalizedData(tcc)
DIANE::draw_distributions(normalized_counts)


fit <- DIANE::estimateDispersion(tcc )

degs <- DIANE::estimateDEGs(fit, reference = "S0", perturbation = "S6", p.value = 0.01, lfc = 3)
genes <- degs$table$genes

GOs <- read.table("D:/These/DIANE_inputs/lupin_golist.txt", header = TRUE, sep = ',')

universe <- intersect(rownames(normalized_counts), GOs[,1])

go <- enrich_go_custom(genes, universe, GOs)


DIANE::draw_enrich_go_map(go)


########## R to cytoscape

load("~/Documents/network_CO2genes12Conds.RData")
ig <- igraph::graph_from_data_frame(net_data$edges, directed=TRUE, vertices = net_data$nodes)
igraph::plot.igraph(ig, vertex.size = 2, vertex.cex = 2)
RCy3::createNetworkFromIgraph(ig,"myIgraph")

########## sig genie3 tests and abiotic stresses values for mat and tests and grouped
library(DIANE)

data("abiotic_stresses")
data("gene_annotations")
data("regulators_per_organism")


genes <- get_locus(abiotic_stresses$heat_DEGs)
regressors <- intersect(genes, 
                        regulators_per_organism$`Arabidopsis thaliana`)

data <- aggregate_splice_variants(abiotic_stresses$normalized_counts)

r <- DIANE::group_regressors(data, genes, regressors)



mat <- DIANE::network_inference(r$counts, 
                                conds = abiotic_stresses$conditions, 
                                targets = r$grouped_genes,
                                regressors = r$grouped_regressors, 
                                importance_metric = "MSEincrease_oob", 
                                verbose = TRUE)

library(tictoc)

tic("test edges")
res <- DIANE::test_edges(mat, normalized_counts = r$counts, density = 0.03,
                         nGenes = length(r$grouped_genes), 
                         nRegulators = length(r$grouped_regressors), 
                         nTrees = 1000, verbose = TRUE)
toc()

# abiotic_stresses$heat_edge_tests = res
# abiotic_stresses$heat_DEGs_regulatory_links <- mat

########## verif de la correlation des fdrs d'un run à l'autre des tests

links0 <- abiotic_stresses$heat_edge_tests$links
links <- res$links

links$pair <- paste(links$regulatoryGene, links$targetGene)
links0$pair <- paste(links0$regulatoryGene, links0$targetGene)

common <- intersect(links$pair, links0$pair)
commonlinks <- links[links$pair %in% common,]

plot(commonlinks$fdr, links0[match(commonlinks$pair, links0$pair),]$fdr)
cor(commonlinks$fdr, links0[match(commonlinks$pair, links0$pair),]$fdr)


res$fdr_nEdges_curve

res$pvalues_distributions + xlim(0,0.1)

net <- DIANE::network_from_tests(res$links, fdr = 0.01)


net_data <- network_data(net, regulators_per_organism[["Arabidopsis thaliana"]], gene_annotations$`Arabidopsis thaliana`)

DIANE::draw_network(net_data$nodes, net_data$edges)

abiotic_stresses$heat_grouped_data <- r



# mat2 <- DIANE::network_inference(r$counts, conds = abiotic_stresses$conditions, targets = r$grouped_genes,
#                                 regressors = r$grouped_regressors, 
#                                 verbose = TRUE)

# res <- data.frame(density = seq(0.001, 0.1, length.out = 20),
#                   nEdges = sapply(seq(0.001, 0.1, length.out = 20), 
#                                   get_nEdges, nGenes, nRegulators))
# ggplot(res, aes(x = density, y = nEdges)) + geom_line(size = 1) + ggtitle()


######## Mm TFs 

tfs <- read.csv('D:/These/DIANE_inputs/Mus_musculus_TF.txt', sep = '\t')
tfs <- tfs$Ensembl
write.table(tfs, row.names = F, quote = F, file = "D:/These/DIANE_inputs/Mus_musculus_TF.txt")
regulators_per_organism[["Mus musculus"]] <- tfs





################ lupine data



annot <- read.csv("D:/These/Thesis/DIANE_inputs/genesLupinAnnot.csv", sep = '\t', row.names = 1, header = FALSE)
colnames(annot) <- c("description")
annot$label <- rownames(annot)

library(DIANE)
data("regulators_per_organism")

regulators_per_organism[["Lupinus albus"]] <- annot[stringr::str_detect(annot$description, "transcription"), 'label']
gos <- read.csv("D:/These/Thesis/DIANE_inputs/lupin_golist.txt", sep = '\t')
lupine <- list(annotation = annot, go_list = gos)



######### labels TAIT/ensembl


tair <- read.csv("~/Documents/Seafile/Thesis/DIANE_inputs/col_brm_lbd_David_TAIR10.csv", sep = '\t', row.names = "Gene", h = TRUE)
ens <- read.csv("~/Documents/Seafile/Thesis/DIANE_inputs/col_brm_lbd_David_Ensembl.csv", sep = '\t', row.names = "Gene", h = TRUE)


DIANE::check_IDs(rownames(tair), organism = "Arabidopsis thaliana")
DIANE::check_IDs(rownames(ens), organism = "Arabidopsis thaliana")
length(intersect(rownames(tair), rownames(ens)))

library(DIANE)
data("abiotic_stresses")
data("gene_annotations")
data("regulators_per_organism")


conditions <- stringr::str_split_fixed(colnames(ens), '_', 2)[,1]

annot <- gene_annotations$`Arabidopsis thaliana`


which(!stringr::str_detect( rownames(ens), pattern = "\\.[[:digit:]]+$"))

loci.ens <- get_locus(rownames(ens))

DIANE::

gene_annotations$`Arabidopsis thaliana`[sample(rownames(ens), 20),]
gene_annotations$`Arabidopsis thaliana`[sample(rownames(tair), 20),]


tcc_object <- DIANE::normalize(abiotic_stresses$raw_counts, abiotic_stresses$conditions, iteration = FALSE)
threshold = 10*length(conditions)
tcc_object <- DIANE::filter_low_counts(tcc_object, threshold)
normalized_counts <- TCC::getNormalizedData(tcc_object)

############### new coseq tests

library(DIANE)
data("abiotic_stresses")
data("gene_annotations")
data("regulators_per_organism")


genes <- abiotic_stresses$heat_DEGs


clustering <- DIANE::run_coseq(
  conds = unique(abiotic_stresses$conditions), 
  data = abiotic_stresses$normalized_counts, genes = genes, K = 6:9)

normSin <- run_coseq(
  conds = unique(abiotic_stresses$conditions), transfo = "arcsin", model = "Normal",
  data = abiotic_stresses$normalized_counts, genes = genes, K = 2:9)

normLogit <- run_coseq(
  conds = unique(abiotic_stresses$conditions), transfo = "logit", model = "Poisson",
  data = abiotic_stresses$normalized_counts, genes = genes, K = 2:9)


DIANE::draw_coseq_run(normSin$model, plot = "barplots")
DIANE::draw_coseq_run(normSin$model, plot = "ICL")
DIANE::draw_coseq_run(clustering$model, plot = "ICL")


draw_profiles(abiotic_stresses$normalized_counts, normLogit$membership, abiotic_stresses$conditions)
coseq::compareICL(list(normLogit$model, normSin$model))

summary(normLogit)




########## sig genie time estimation
library(DIANE)

data("abiotic_stresses")
data("gene_annotations")
data("regulators_per_organism")


genes <- get_locus(abiotic_stresses$heat_DEGs)
regressors <- intersect(genes, 
                        regulators_per_organism$`Arabidopsis thaliana`)

data <- aggregate_splice_variants(abiotic_stresses$normalized_counts)

r <- DIANE::group_regressors(data, genes, regressors)



mat <- DIANE::network_inference(r$counts, 
                                conds = abiotic_stresses$conditions, 
                                targets = r$grouped_genes,
                                regressors = r$grouped_regressors, 
                                importance_metric = "MSEincrease_oob", 
                                verbose = TRUE)

library(tictoc)

tic("test edges")
res <- DIANE::test_edges(mat, normalized_counts = r$counts, density = 0.02,
                         nGenes = length(r$grouped_genes), 
                         nRegulators = length(r$grouped_regressors), 
                         nTrees = 1000, verbose = TRUE)

toc()

tic("test")
a <- toc()
elapsed <- a$toc - a$tic




t <- estimate_test_edges_time(mat, normalized_counts = r$counts, density = 0.02,
                         nGenes = length(r$grouped_genes), 
                         nRegulators = length(r$grouped_regressors), 
                         nTrees = 1000, verbose = TRUE)
t/60


g2 <- sample(rownames(abiotic_stresses$normalized_counts), size = 4000)
g1 <- sample(rownames(abiotic_stresses$normalized_counts), size = 4040)

venn <- ggVennDiagram::ggVennDiagram(list(g1 = g1, g2 = g2), color = "#888888")
gene_lists <- list(g1 = g1, g2 = g2)

library(rlist)

length(setdiff(g1, g2))


list.count(gene_list, )


get_specific <- function(gene_lists, l ){
  common <- Reduce(intersect, gene_lists)
  return(setdiff(gene_lists[[l]], common))
}



length(get_specific(gene_lists, "g1"))



### tfs droso

tfs <- read.table("D:/These/DIANE_inputs/TFs_droso.txt", h = F)
regulators_per_organism[["Drosophilia melanogaster"]] <- tfs$V1

## tfs c elegans 


tfs <- read.table("D:/These/DIANE_inputs/Ce_TF_list.csv", h = T, sep = ',')
regulators_per_organism[["Caenorhabditis elegans"]] <- tfs$Gene.WB.ID

## tfs e coli


tfs <- read.table("D:/These/DIANE_inputs/TFSet.txt", h = F, sep = '\t')
tfs <- unique(tfs$V3)
tfs <- c(stringr::str_split_fixed(tfs, ', ', 2)[,1], stringr::str_split_fixed(tfs, ', ', 2)[,2][stringr::str_split_fixed(tfs, ', ', 2)[,2] != ""])
regulators_per_organism[["Escherichia coli"]] <- tfs

data("abiotic_stresses")
DIANE::draw_profiles(data = abiotic_stresses$normalized_counts, 
membership = abiotic_stresses$heat_DEGs_coseq_membership,
conds = unique(abiotic_stresses$conditions)) 
DIANE::draw_profiles(data = abiotic_stresses$normalized_counts, 
                      membership = abiotic_stresses$heat_DEGs_coseq_membership, 
                      conds = unique(abiotic_stresses$conditions), k = 3) + 
  gghighlight::gghighlight() 


gene <- sample(get_genes_in_cluster(abiotic_stresses$heat_DEGs_coseq_membership,  3),1)



####################### rice

## annotations


data("gene_annotations")

d <- read.csv("D:/These/DIANE_inputs/OSativa_rapdb/OSativa_annotation_rapdb.tsv", sep = '\t', h = T, row.names = "Gene")
colnames(d) <- c("description", "label")


gene_annotations[["Oryza sativa (rapdb)"]] <- d


d <- read.csv("D:/These/DIANE_inputs/rice_annotation_gene_level_oceane/OSativa_msu_gene_level/OSativa_msu_annotation_gene_level.tsv", sep = '\t', h = T, row.names = "Gene")
colnames(d) <- c("description")
gene_annotations[["Oryza sativa (msu)"]] <- d

gene_annotations$`Oryza sativa rapdb` <- NULL
gene_annotations$`Oryza sativa msu` <- NULL

#d <- read.csv("D:/These/DIANE_inputs/rice_annotation_gene_level_oceane/OGlaberrima_gene_level/OGlaberrima_name_product_gene_level.tsv", sep = '\t', h = T, row.names = "Gene")
#colnames(d) <- c("description")
#gene_annotations[["Oryza sativa msu"]] <- d


d2 <- gene_annotations$`Oryza sativa msu`


splitted <- stringr::str_split_fixed(rownames(d2), '\\.', 2)[,1]
length(unique(splitted))


length(intersect(splitted, rownames(d)))
rownames(d)[!rownames(d) %in% splitted]

d <- read.csv("D:/These/DIANE_inputs/OGlaberrima/OGlaberrima_name_product.tsv", sep = '\t', h = T, row.names = "Gene")
colnames(d) <- c("description")
rownames(d) <- stringr::str_split_fixed(rownames(d), '\\.1', 2)[,1]
gene_annotations[["Oryza glaberrima"]] <- d


## go terms

DIANE::lupine$go_list

go_matchings <- list()

data("go_matchings")

d <- read.csv("D:/These/DIANE_inputs/OSativa_rapdb/OSativa_go.tsv", sep = '\t', h = T)
go_matchings[["Oryza sativa (rapdb)"]] <- d


d <- read.csv("D:/These/DIANE_inputs/rice_annotation_gene_level_oceane/OSativa_msu_gene_level/OSativa_msu_go_gene_level.tsv", sep = '\t', h = T)
go_matchings[["Oryza sativa (msu)"]] <- d


d <- read.csv("D:/These/DIANE_inputs/rice_annotation_gene_level_oceane/OGlaberrima_gene_level/OGlaberrima_go_gene_level.tsv", sep = '\t', h = T)
go_matchings[["Oryza glaberrima"]] <- d


DIANE::check_IDs(go_matchings$`Oryza sativa (msu)`$geneid, organism = "Oryza sativa (msu)")