ECCO: Analysis for the ECCO project

library("edgeR")
library("tidyr")
library("ggplot2")
library("ggrepel")
library("dplyr")
library("sva")
library("here")
library("matrixStats")
library("hgu219.db") # Newest
library("hgu219hsentrezg.db") # From local storage...
library("sva")

## Metadata ####
rename_cols <- c(colname = "AGILENT/ARRAY CODE",
                 cell_type = "STEM/DIFF",
                 status = "SAMPLE STATUS",
                 AaD = "Age at diagnosis",
                 GROUP = "GROUP",
                 age = "age",
                 TYPE = "TYPE",
                 LOCATION = "LOCATION",
                 SAMPLE = "SAMPLE"
)

meta <- readxl::read_excel(here::here("data", "variables CD cohort.xlsx")) %>%
    dplyr::select(rename_cols) %>%
    filter(GROUP != "UC") %>%
    mutate(GROUP = toupper(GROUP),
           reanalyzed = endsWith(colname, "V"),
           # Age at diagnosis correted as per Isa instructions
           age = ifelse(colname == "34V", 52, age),
           # Isa: consider as adult above the 17 years
           TYPE = case_when(age <= 17 & !is.na(age) ~ "pediatric",
                            age > 17 & !is.na(age) ~ "adult",
                            TRUE ~ TYPE),
           SAMPLE2 = gsub("/.*", "", gsub(" .*", "", trimws(gsub("STEM|DIFF.?", "", gsub("RNA ", " ", SAMPLE)))))
    )

# RNAseq ####
counts <- read.table(here::here("data", "ORGANOIDS_STAR_RSEM_GENES.txt"), check.names = FALSE)

samples <- gsub("_.*", "", colnames(counts))
uniq_samples <- unique(samples)
genes <- gsub("\\..*", "", rownames(counts))

# To summarize the samples together
group_samples <- function(expr, names) {
    umat <- matrix(nrow = nrow(expr), ncol = length(names))
    for (i in 1:length(names)) {
        umat[, i] <- rowSums(expr[, which(samples == names[i])])
    }
    colnames(umat) <- names
    rownames(umat) <- rownames(expr)
    umat
}
umat <- group_samples(counts, uniq_samples)
umat <- umat[rowSums(umat) != 0, ]

# Subset to just old samples
shared_samples <- intersect(colnames(umat), meta$colname[meta$reanalyzed])
expr <- umat[, shared_samples]
meta <- meta[match(shared_samples, meta$colname), ]

stopifnot(meta$colname == colnames(expr)) # Same order


# Design ####
grouping <- apply(meta[, c("cell_type", "GROUP", "TYPE", "LOCATION"), drop = FALSE], 1,
                  paste0, collapse = "_")
grouping <- as.factor(grouping)
design <- model.matrix(~0 + grouping, grouping)
colnames(design) <- levels(grouping)
design <- cbind(Intercept = 1, design) # Truco 1 afegir intercept
rownames(design) <- meta$colname

design <- design[ , colSums(design) != 1] # Truco 2 treure aquelles amb 1 sola mostra


# Add id of the sample
# samples_d <- model.matrix(~0 + SAMPLE2, data = meta[, "SAMPLE2", drop = FALSE])
# design <- cbind(design, samples_d)
# design <- design[ , colSums(design) != 1]

# Contrasts ####
# Helper functions for contrasts
paste_plus <- function(...){paste(..., collapse = " + ")}
paste_ab <- function(a, b){paste(a, "- (", b, ")")}
grep_level <- function(x, a){
    # Force to actually look up on the design being used!
    # This makes it dependent on the order it is run!!
    grep(x, a, value = TRUE)
}
contrasting <- function(x, y, design_m) {
    d <- colnames(design_m)
    paste_ab(paste_plus(grep_level(x, d)),
             paste_plus(grep_level(y, d)))
}


# DIFF eq 1, STEM to -1
# On 25/11/2019 Isa asked me to add all the comparisons instead of just the few
# She also identified some duplicated comparisons (my fault)
# On 28/11/2019 morning Isa asked me to make it as in the previous analysis of microarrays
# On 28/112019 afternon Isa asked me to make it the full 36 contrasts from pegs_organoids_2017 project
# and remove those that don't make sense
# On 29/11/2019 afternon I asked Isa  about what to do about contrast14_stem_colon_CD_A_P and contrast29_CD_A_P
# The difference is between having healthy and not having healthy samples on each group
contrasts <- c(
    c1_stem_diff_colon_A = contrasting("STEM.*adult_sigma", "DIFF.*adult_sigma", design), # 1
    c2_stem_diff_colon_P = contrasting("STEM.*pediatric_sigma", "DIFF.*pediatric_sigma", design), #  2
    c3_stem_diff_ileum_A = contrasting("STEM.*adult_ileum", "DIFF.*adult_ileum", design), #  3
    # c4_stem_diff_ileum_P = contrasting("STEM.*pediatric_ileum", "DIFF.*pediatric_ileum", design),  #  4
    c5_stem_ileum_colon_A = contrasting("STEM.*adult_ileum", "STEM.*adult_sigma", design), #  5
    c6_stem_ileum_colon_P = contrasting("STEM.*pediatric_ileum", "STEM.*pediatric_sigma", design), #  6
    c7_diff_ileum_colon_A = contrasting("DIFF.*adult_ileum", "DIFF.*adult_sigma", design), #  7
    # c8_diff_ileum_colon_P = contrasting("DIFF.*pediatric_ileum", "DIFF.*pediatric_sigma", design),  #  8
    # c9_diff_H_CD_colon_A in reality has not inflamed CD only!! vs healthy ctrl
    # c10_stem_H_CD_colon_A in reality has non inflamed CD only!! vs healthy ctrl
    c11_stem_H_CD_colon_P = contrasting("STEM_CTRL_pediatric_sigma", "STEM_CD_pediatric_sigma", design), #  11
    # c12_diff_H_CD_colon_P = contrasting("DIFF_CTRL_pediatric_sigma", "DIFF_CD_pediatric_sigma", design), #  12
    # c13_stem_ileum_CD_A_P in reality stem_ileum_CD_A_P has not inflamed only!!
    # c14_stem_colon_CD_A_P has not inflamed only on adults!!
    # c15_diff_ileum_CD_A_P only not inflamed!!
    # c16_diff_colon_CD_A_P adult diff not inflamed!!
    # 17
    # 18
    # 19
    # 20
    # 21
    # 22
    # 23
    # 24
    # 25
    # 26
    # 27
    # 28
    c29_stem_colon_CD_A_P = contrasting("STEM_CD_adult_sigma", "STEM_CD_pediatric_sigma", design), #  29
    c30_diff_colon_CD_A_P = contrasting("DIFF_CD_adult_sigma", "DIFF_CD_pediatric_sigma", design), #  30
    c31_stem_ileum_CD_A_P = contrasting("STEM_CD_adult_ileum", "STEM_CD_pediatric_ileum", design), #  31
    # c32_diff_ileum_CD_A_P = contrasting("DIFF_CD_adult_ileum", "DIFF_CD_pediatric_ileum", design), # 32 No n
    # Just in case Siempre por separado
    c33_diff_vs_stem = contrasting("^DIFF", "^STEM", design),
    c34_CD_vs_CTRL = contrasting("_CD_", "_CTRL_", design),
    c35_pediatric_vs_adult = contrasting("pediatric", "adult", design),
    c36_ileum_vs_sigma = contrasting("ileum", "sigma", design)
    )

contr.matrix <- makeContrasts(contrasts = contrasts, levels = colnames(design))
colnames(contr.matrix) <- names(contrasts)

design2 <- meta %>%
    mutate(
        c9_diff_H_CD_colon_A = case_when( # Number 9
            cell_type == "DIFF" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "sigma" ~ -1,
            cell_type == "DIFF" & status == "healthy" & GROUP == "CTRL" & TYPE == "adult" & LOCATION == "sigma"~ 1,
            TRUE ~ 0),
        c10_stem_H_CD_colon_A = case_when( # Number 10
            cell_type == "STEM" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "sigma" ~ -1,
            cell_type == "STEM" & status == "healthy" & GROUP == "CTRL" & TYPE == "adult" & LOCATION == "sigma"~ 1,
            TRUE ~ 0),
        c13_stem_ileum_CD_A_P = case_when( # Number 13
            cell_type == "STEM" & status == "not inflamed" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "ileum" ~ -1,
            cell_type == "STEM" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "ileum"~ 1,
            TRUE ~ 0),
        c14_stem_colon_CD_A_P = case_when( # Number 14
            cell_type == "STEM" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "sigma" ~ -1,
            cell_type == "STEM" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "sigma"~ 1,
            TRUE ~ 0),
        c15_diff_ileum_CD_A_P = case_when( # Number 15
            cell_type == "DIFF" & status == "not inflamed" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "ileum" ~ -1,
            cell_type == "DIFF" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "ileum"~ 1,
            TRUE ~ 0),
        c16_diff_colon_CD_A_P = case_when( # Number 16
            cell_type == "DIFF" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "sigma" ~ -1,
            cell_type == "DIFF" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "sigma"~ 1,
            TRUE ~ 0),
        c17_colon_CD_healthy_involved_A = case_when( # Number 17
            cell_type == "DIFF" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "sigma" ~ -1,
            cell_type == "DIFF" & status == "healthy" & GROUP == "CD" & TYPE == "adult" & LOCATION == "sigma"~ 1,
            TRUE ~ 0),
        c18_stem_colon_CD_Healthy_involved_A = case_when( # Number 18
            cell_type == "STEM" & status == "healthy" & GROUP == "CTRL" & TYPE == "adult" & LOCATION == "sigma"~ 1,
            cell_type == "STEM" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "sigma" ~ -1,
            TRUE ~ 0),
        c19_diff_ileum_CD_healthy_involved_A = case_when( # Number 19
            cell_type == "DIFF" & status == "healthy" & GROUP == "CTRL" & TYPE == "adult" & LOCATION == "ileum"~ 1,
            cell_type == "DIFF" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "ileum" ~ -1,
            TRUE ~ 0),
        c20_stem_ileum_CD_healthy_involved_A = case_when( # Number 20
            cell_type == "STEM" & status == "healthy" & GROUP == "CTRL" & TYPE == "adult" & LOCATION == "ileum"~ 1,
            cell_type == "STEM" & status == "not inflamed" & GROUP == "CD" & TYPE == "adult" & LOCATION == "ileum" ~ -1,
            TRUE ~ 0),
        # c21_diff_colon_CD_healthy_involved_P = case_when( # Number 21
        #     cell_type == "DIFF" & status == "not involved" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "sigma"~ 1,
        #     cell_type == "DIFF" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "sigma" ~ -1,
        #     TRUE ~ 0),
        c22_stem_colon_CD_Healthy_involved_P = case_when( # Number 22
            cell_type == "STEM" & status == "healthy" & GROUP == "CTRL" & TYPE == "pediatric" & LOCATION == "sigma"~ 1,
            cell_type == "STEM" & status == "not inflamed" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "sigma" ~ -1,
            TRUE ~ 0),
        c23_diff_ileum_CD_healthy_involved_P = case_when( # Number 23
            cell_type == "DIFF" & status == "not involved" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "ileum"~ 1,
            cell_type == "DIFF" & status == "not inflamed" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "ileum" ~ -1,
            TRUE ~ 0),
        c24_stem_ileum_CD_healthy_involved_P = case_when( # Number 24
            cell_type == "STEM" & status == "healthy" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "ileum"~ 1,
            cell_type == "STEM" & status == "involved" & GROUP == "CD" & TYPE == "pediatric" & LOCATION == "ileum" ~ -1,
            TRUE ~ 0),
        # c25_stem_ileum_H_healthy_CD = case_when( # Number 25 just 1 sample
        #     cell_type == "STEM" & GROUP == "CTRL" & LOCATION == "ileum"~ 1,
        #     cell_type == "STEM" & status == "not involved" & GROUP == "CD" & LOCATION == "ileum" ~ -1,
        #     TRUE ~ 0),
        c26_diff_ileum_H_healthy_CD = case_when( # Number 26
            cell_type == "DIFF" & GROUP == "CTRL" & LOCATION == "ileum"~ 1,
            cell_type == "DIFF" & status == "not involved" & GROUP == "CD" & LOCATION == "ileum" ~ -1,
            TRUE ~ 0),
        c27_stem_colon_H_healthy_CD = case_when( # Number 27
            cell_type == "STEM" & GROUP == "CTRL" & LOCATION == "sigma"~ 1,
            cell_type == "STEM" & status == "not involved" & GROUP == "CD" & LOCATION == "sigma" ~ -1,
            TRUE ~ 0),
        c28_diff_colon_H_CD = case_when( # Number 28
            cell_type == "DIFF" & GROUP == "CTRL" & LOCATION == "sigma"~ 1,
            cell_type == "DIFF" & status == "not involved" & GROUP == "CD" & LOCATION == "sigma" ~ -1,
            TRUE ~ 0)
        ) %>%
    dplyr::select(-colname, -cell_type, -status, -AaD, -age, -GROUP, -LOCATION,
                  -SAMPLE, -reanalyzed, -SAMPLE2, -TYPE) %>%
    as.matrix()

# If there are less than 3 samples we can't have meaningful comparisons
design2 <- design2[, colSums(design2 != 0) > 2]
keep <- apply(design2, 2, function(x){length(unique(x)) != 2 }) # Check that there aren't empty groups
design2 <- design2[, keep]

pick <- design %*% contr.matrix # To know which samples are taken
pick <- cbind(pick, design2)

# Three samples that are only on the intercept group
w <- which(rowSums(design) == 1)
stopifnot(w == c(16, 18, 40))

pick[w[1], c("stem_ileum_colon_P", "pediatric_vs_adult", "ileum_vs_sigma")] <- 1
pick[w[1], c("diff_vs_stem", "CD_vs_CTRL")] <- -1

pick[w[2], c("stem_diff_colon_P", "CD_vs_CTRL", "ileum_vs_sigma")] <- -1
pick[w[2], c("diff_vs_stem", "pediatric_vs_adult")] <- 1

pick[w[3], c("diff_vs_stem", "CD_vs_CTRL", "pediatric_vs_adult", "ileum_vs_sigma")] <- 1

p <- apply(pick, 2, table)
if (any(lengths(p) == 2)) {
    p[lengths(p) == 2] <- lapply(p[lengths(p) == 2], function(x){
        y <- c(x, "0" = 0)
        y[c("-1", "0", "1")]})
    samples_used <- t(simplify2array(p))
} else {
    samples_used <- t(p)
}
stopifnot(ncol(samples_used) == 3)
write.csv(samples_used[, c(3, 1)], "processed/samples_used.csv")


pu <- unique(t(pick))
# If there are duplicate contrast check the output and remove them
if ((nrow(t(pick)) - nrow(pu)) != 0){

    duplicated_contr <- base::setdiff(rownames(t(pick)), rownames(pu))

    dups <- sapply(duplicated_contr, function(y) {
        apply(pick, 2, function(x) {all(x ==pick[, y])})
    })
    dups
}

# Look if there is any contrast with just one sample on one side
dge_rna <- DGEList(expr)
keep <- filterByExpr(dge_rna)
dge_rna <- dge_rna[keep, ]
CPM <- cpm(dge_rna$counts, log = TRUE)
keep <- rowSums(CPM > 0.5) >= 23 # 24 samples with more expression than 1 CPM
keep2 <- rowSums(CPM)
# Prepare files for Juanjo
write.csv(dge_rna$counts, file = "processed/old_counts.csv",
          row.names = TRUE)

d2 <- pick
d2[d2 == 1] <- 2
d2[d2 == -1] <- 1
d2[d2 == 0] <- NA
write.csv(d2, file = "processed/comparisons.csv", row.names = TRUE)

dge_rna <- calcNormFactors(dge_rna, method = "TMM")
voom_rna <- voom(dge_rna, plot = TRUE, normalize.method = "cyclicloess")

vfit <- lmFit(voom_rna, design = design)

# Continue evaluation whole ####
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")
ttc <- vector("list", 21)
pdf("processed/MA_complex_old_samples2.pdf")
for (i in seq_len(ncol(contr.matrix))) {
    ttc[[i]] <- topTable(efit, coef = names(contrasts)[i],
                         number = Inf)
    plot(ttc[[i]]$AveExpr, ttc[[i]]$logFC,
         main = names(contrasts)[i], pch = ".")
}
dev.off()

saveRDS(ttc, "processed/tt_complex.RDS")

dt <- decideTests(efit, lfc = log2(1.5), adjust.method = "fdr")
dtt <- summary(dt)
dtt

# Evaluation ####
tt <- vector("list", length(contrasts))

# Prepare for GET ####
# perl ~/Documents/projects/GETS/gets.pl --matrix=./processed/STEM_old.tsv --geneinfo=./processed/gene_info_old.tsv --sampleinfo=./processed/STEM_samples_info_old.tsv --colors=./processed/colors_info_old.tsv --output=./processed/STEM_GETS_old --center=TRUE --overwrite=TRUE
genes <- gsub("\\..*", "", rownames(dt))
genes_n <- gsub(".*\\.[0-9]*_", "", rownames(dt))
gene_names <- mapIds(org.Hs.eg.db, keys = genes, keytype = "ENSEMBL", "SYMBOL")
# Possible step to filter the genes to reduce the number of rows
# summary(is.na(gene_info$gene_names))
# kruskal.test(list(table(is_na), table(is_pr))) Is not conclusive
gene_info <- data.frame(rownames = rownames(dt), genes, gene_names, genes_n)
gene_info$Description <- mapIds(org.Hs.eg.db,
                                keys = as.character(gene_info$genes),
                                keytype = "ENSEMBL", column = "GENENAME")

# Adding data from the comparisons

tt_all <- lapply(seq_along(contrasts), function(x) {
    tt <- topTable(efit, coef = x, number = Inf, adjust.method = "BH")
    sign <- sign(tt$logFC)
    tt$FC <- sign*2^(abs(tt$logFC))
    above <- abs(tt$FC) > 1.5
    signif <- tt$P.Value < 0.05
    signiff <- tt$adj.P.Val < 0.05
    tt$sign <- ""

    tt$sign[signif & sign < 0 & above] <- "DW"
    tt$sign[signiff & sign < 0 & above] <- "DDW"
    tt$sign[signif & sign > 0 & above] <- "UP"
    tt$sign[signiff & sign > 0 & above] <- "UUP"
    tt <- tt[, !grepl("B$|AveExpr$|t$", colnames(tt))]
    colnames(tt) <- paste0("contrast", x, "_", names(contrasts)[x], "_", colnames(tt))
    tt
})

tt_all_m <- do.call(cbind, tt_all)
sign_c <- grep("_sign$", colnames(tt_all_m))
FC_c <- grep("_FC$", colnames(tt_all_m))
adj_c <- grep("adj.P.Value$", colnames(tt_all_m))
raw_c <- grep("P.Value$", colnames(tt_all_m))

tt_all_m <- tt_all_m[, c(sign_c, FC_c, adj_c, raw_c)]
gene_info2 <- cbind(gene_info, tt_all_m)


write.table(gene_info2[, -1], "processed/gene_info_old.tsv", sep = "\t", row.names = FALSE,
            col.names = TRUE, quote = FALSE, na = "")

stopifnot(ncol(voom_rna$E) == nrow(meta))

r <- order(meta$cell_type, meta$LOCATION, meta$TYPE, meta$GROUP)
meta_o <- meta[r, -10]
meta_o <- meta_o[, c("colname", "cell_type", "LOCATION", "TYPE", "GROUP", "status",
           "age", "AaD", "SAMPLE", "SAMPLE2")]
expr_csv <- voom_rna$E[, r]
stopifnot(ncol(expr_csv) == nrow(meta))
stopifnot(nrow(gene_info2) == nrow(expr_csv))
expr2_csv <- cbind.data.frame(genes = rownames(expr_csv), expr_csv)
write.table(expr2_csv, file = "processed/old.tsv", sep = "\t",
            row.names = FALSE, quote = FALSE)
stem <- cbind.data.frame(genes = rownames(expr_csv), expr_csv[, meta_o$cell_type == "STEM"])
write.table(stem,
            file = "processed/STEM_old.tsv", sep = "\t",
            row.names = FALSE, quote = FALSE, na = "")
diff <- cbind.data.frame(genes = rownames(expr_csv), expr_csv[, meta_o$cell_type == "DIFF"])
write.table(diff,
            file = "processed/DIFF_old.tsv", sep = "\t",
            row.names = FALSE, quote = FALSE, na = "")

meta_o %>%
    dplyr::filter(colname %in% colnames(expr2_csv)) %>%
    dplyr::select(-colname) %>%
    t() %>%
    write.table(file = "processed/samples_info_old.tsv", sep = "\t",
                row.names = TRUE, quote = FALSE, col.names = FALSE, na = "")
meta_o %>%
    dplyr::filter(colname %in% colnames(stem)[-1]) %>%
    dplyr::select(-colname) %>%
    t() %>%
    write.table(file = "processed/STEM_samples_info_old.tsv", sep = "\t",
                row.names = TRUE, quote = FALSE, col.names = FALSE, na = "")
meta_o %>%
    dplyr::filter(colname %in% colnames(diff)[-1]) %>%
    dplyr::select(-colname) %>%
    t() %>%
    write.table(file = "processed/DIFF_samples_info_old.tsv", sep = "\t",
                row.names = TRUE, quote = FALSE, col.names = FALSE, na = "")

val <- sapply(meta_o[2:ncol(meta_o)], unique)

data.frame(pos = "SAMPLEINFO",
           value = rep(names(val), lengths(val)),
           "s" = unlist(val, use.names = FALSE),
           color = "blue") %>%
    mutate(color = is.character(color)) %>%
    mutate(color = case_when(s == "inflamed" ~ "red",
                             s == "healthy" ~ "green",
                             s == "not involved" ~ "LIGHT_YELLOW",
                             s == "not inflamed" ~ "orange",
                             s == "CTRL" ~ "yellow",
                             s == "CD" ~ "blue",
                             s == "pediatric" ~ "CYAN",
                             s == "adult" ~ "light_orange",
                             s == "sigma" ~ "white",
                             s == "ileum" ~ "light_blue"
    )) %>%
    filter(!is.na(color)) %>%
    mutate(color = toupper(color)) %>%
    write.table(file = "processed/colors_info_old.tsv", quote = FALSE, sep = "\t",
                col.names = FALSE, row.names = FALSE)