ECCO: Analysis for the ECCO project

library(tximport)
library(gdata)
library(ggbiplot)
library(limma)
library(edgeR)
library(WriteXLS)

assigncol <- function(pheno) {
  colx <- c("blue", "red", "green", "orange", "yellow", "gray", "pink", "white", "brown")
  colxm <- matrix("gray", nrow = length(pheno))
  g <- unique(pheno)

  for (i in 1:length(g)) {
    ss <- which(pheno == g[i])
    colxm[ss] <- colx[i]
  }
  if (length(colx) > 9) {
    cat("N>9:D'ont Run")
  }

  # print(data.frame(group = 1:9, color = colx))

  return(colxm)
}





plotPCA <- function(xmat, gg) {
  data.class <- gg
  data.pca <- prcomp(xmat, scale. = TRUE)
  colx <- assigncol(data.class)
  g <- ggbiplot(data.pca,
    obs.scale = 0.25, var.scale = 0.5, labels.size = 3,
    groups = data.class, var.axes = FALSE, ellipse = TRUE, circle = FALSE
  ) + theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13)
  ) + theme_bw()


  print(g)

  g <- ggbiplot(data.pca,
    obs.scale = 0.25, var.scale = 0.5, labels.size = 3,
    groups = data.class, var.axes = FALSE, labels = rownames(xmat), ellipse = TRUE, circle = FALSE
  ) + theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13)
  ) +
    theme_bw()


  print(g)
}

extractPC123 <- function(mydata) {
  # extract from ggbiplot
  pcobj <- prcomp(mydata, scale. = TRUE)
  choices <- 1:3
  obs.scale <- 1
  var.scale <- 1
  nobs.factor <- sqrt(nrow(pcobj$x) - 1)
  d <- pcobj$sdev
  u <- sweep(pcobj$x, 2, 1 / (d * nobs.factor), FUN = "*")
  v <- pcobj$rotation
  choices <- pmin(choices, ncol(u))
  df.u <- as.data.frame(sweep(u[, choices], 2, d[choices]^obs.scale, FUN = "*"))
  df.u <- df.u * nobs.factor
  colnames(df.u) <- c("PC1", "PC2", "PC3")
  rownames(df.u) <- rownames(mydata)
  return(df.u)
}


tx2gene <- read.table(here::here("data-raw", "annot_gencodev29.txt"), sep = "\t", header = F)
genedb <- read.table(here::here("data-raw","geneannot.txt"), sep = "\t", header = F)

equiv <- tx2gene[, c(1, 3)]
colnames(equiv) <- c("TXNAME", "GENEID")

xls <- read.xls(here::here("data-raw","lista.xls"))
files <- paste(as.character(xls[, 1]), "/quant.sf", sep = "")


txi <- tximport(files, type = "salmon", tx2gene = NULL, txOut = TRUE)

transcripts <- txi$counts # for transcripts


rownames(txi$counts) <- rownames(txi$abundance) <- rownames(txi$length) <- rownames(txi$counts) <- equiv[, 1]


txi.sum <- summarizeToGene(txi, tx2gene)$counts

usample <- as.character(unique(xls[, 2]))

umat <- matrix(nrow = dim(txi.sum)[1], ncol = length(usample))
for (i in 1:length(usample)) {
  umat[, i] <- apply(txi.sum[, which(xls[, 2] == usample[i])], 1, sum)
}
colnames(umat) <- usample


cnt <- umat
rownames(umat) <- rownames(txi.sum)

y <- DGEList(cnt)
# filtering
keep <- filterByExpr(y)
y <- y[keep, ]
y <- calcNormFactors(y)

v <- voom(y) # v$E <-normalised matrix
dbclin <- read.xls(here::here("data-raw", "CLIN.xls")) # database metadata

# check merging in two first columns

fulldb <- data.frame(usample, dbclin[match(colnames(v$E), dbclin[, 1]), ])
########################

loc <- fulldb$LOCATION
type <- fulldb$TYPE
STEM <- fulldb[, 4]
STATUS <- fulldb$SAMPLE.STATUS

pdf(here::here("plots","PCA_plots.pdf"))
plotPCA(t(v$E), STEM)
plotPCA(t(v$E), paste(STEM, "_", loc, sep = ""))
dev.off()