scripts/build_matrix_original_new.R

#!/usr/bin/env Rscript

#######################################
## scfindME build a splicing index
## ysong 07 Mar 2022
#######################################

suppressMessages(library(optparse))
suppressMessages(library(tidyverse))
suppressMessages(library(Matrix))

option_list <- list(
  make_option(c("-n", "--data_name"), type = "character", default = NULL, help = "Name of dataset"),
  make_option(c("-p", "--pseudobulk_psi"),
    type = "character", default = NULL, help = "Combined pseudobulk psi matrix as input, tab-deliminated"
  ),
  make_option(c("-i", "--node_info"),
    type = "character", default = NULL, help = "Node information input from Whippet, tab-deliminated"
  ),
  make_option(c("-o", "--output"),
    type = "character",
    default = NULL, help = "Directory where multiple output RDS file will be written"
  ),
  make_option(c("-r", "--num_reads_min"),
    type = "numeric", default = 10,
    help = "Minimum number of total reads covering node, which will be included in the output. This is for ensure meaningful psi quantification, default = 10"
  ),
  make_option(c("-d", "--psi_diff_cutoff"), type = "numeric", default = 0.2, help = "Minimum PSI difference from dataset average which will lead to the node being kept in the index, default = 0.2"),
  make_option(c("-s", "--species"), 
              type = 'character', default = NULL, help = 'ENSEMBL name of species to get gene annotations') 
)

# parse input
opt <- parse_args(OptionParser(option_list = option_list))

NAME <- opt$data_name
INPUT <- opt$pseudobulk_psi
NODE <- opt$node_info
OUTPUT <- opt$output
NUM_READS_MIN <- opt$num_reads_min
PSI_DIFF_CUTOFF <- opt$psi_diff_cutoff
species = opt$species

######################## Functions build various inputs for scfindME

# build original matrix function

message("Start building inputs for scASfind index")

data <- readr::read_tsv(INPUT, col_names = TRUE, progress = show_progress())

message("Read PSI input, generating matrces...")

matrix.original <- data %>%
  tidyr::unite("Gene_node", Gene, Node, sep = "_") %>%
  dplyr::group_by(Sample) %>%
  dplyr::distinct(Gene_node, .keep_all = TRUE) %>%
  dplyr::select(Sample, Gene_node, Total_Reads, Psi) %>%
  dplyr::mutate(Filter = case_when(Total_Reads < NUM_READS_MIN ~ "DROP", Total_Reads >= NUM_READS_MIN ~ "KEEP", TRUE ~ NA_character_)) %>%
  dplyr::filter(Filter == "KEEP") %>%
  dplyr::select(Sample, Gene_node, Psi) %>%
  dplyr::ungroup(Sample) %>%
  dplyr::group_by(Gene_node) %>%
  tidyr::pivot_wider(names_from = Sample, values_from = Psi)

message("Matrices constructed, scailing")

df <- data.frame(matrix.original, row.names = matrix.original$Gene_node)
dm <- as.matrix(df[, -1])
mean <- rowMeans(dm, na.rm = TRUE)
matrix.scaled_diff <- dm - mean
matrix.scaled_diff <- matrix.scaled_diff * 100
# drop all-na rows
matrix.scaled_diff <- matrix.scaled_diff[which(rowSums(is.na(matrix.scaled_diff)) < ncol(matrix.scaled_diff)), ]
matrix.scaled_diff_selected <- data.frame(row.names = rownames(matrix.scaled_diff))

diff_cut <- matrix(0, nrow = nrow(matrix.scaled_diff), ncol = ncol(matrix.scaled_diff))

for (cell in seq(1, ncol(matrix.scaled_diff))) {
  tv <- matrix.scaled_diff[, cell]
  tvd <- diff_cut[, cell]
  temp <- which(abs(tv) < 0.2 * 100)
  tv[temp] <- NA
  tvd[temp] <- 1
  matrix.scaled_diff_selected[[colnames(matrix.scaled_diff)[cell]]] <- tv
  diff_cut[, cell] <- tvd
}

# 1 in diff_cut means dropped calls

rownames(diff_cut) <- rownames(matrix.scaled_diff_selected)
colnames(diff_cut) <- colnames(matrix.scaled_diff_selected)

matrix.scaled_diff_selected <- matrix.scaled_diff_selected %>% filter(if_any(everything(), ~ !is.na(.)))

diff_cut <- diff_cut[rownames(matrix.scaled_diff_selected), ]
diff_cut <- Matrix(diff_cut, sparse = TRUE)

matrix.above <- data.frame(row.names = rownames(matrix.scaled_diff_selected))
# set na and below ones to zero
for (cell in seq(1, ncol(matrix.scaled_diff_selected))) {
  tv <- matrix.scaled_diff_selected[, cell]
  temp <- which(is.na(tv) | tv < 0)
  tv[temp] <- 0
  matrix.above[[colnames(matrix.scaled_diff_selected)[cell]]] <- tv
}

matrix.below <- data.frame(row.names = rownames(matrix.scaled_diff_selected))
# set na and above ones to zero
for (cell in seq(1, ncol(matrix.scaled_diff_selected))) {
  tv <- matrix.scaled_diff_selected[, cell]
  temp <- which(is.na(tv) | tv > 0)
  tv[temp] <- 0
  matrix.below[[colnames(matrix.scaled_diff_selected)[cell]]] <- tv
}
matrix.below <- matrix.below * (-1)



df <- data.frame(matrix.original, row.names = matrix.original$Gene_node)
dm <- as.matrix(df[, -1])


mean <- transform(dm, mean = apply(dm, 1, mean, na.rm = TRUE))
sd <- transform(dm, SD = apply(dm, 1, sd, na.rm = TRUE))
mean$SD <- sd$SD
mean <- mean[order(mean$SD), ]
stats <- mean[, c("mean", "SD")]

stats <- stats[which(rownames(stats) %in% rownames(matrix.scaled_diff_selected)), ]

node_list <- rownames(matrix.scaled_diff_selected)

message("Read node information...")

ni <- readr::read_tsv(NODE)
ni$Node_id <- paste(ni$Gene, ni$Node, sep = "_")

node_list_all <- ni[which(ni$Node_id %in% node_list), ]
node_list_all$Gene_num <- gsub("\\..*$", "", node_list_all$Gene)

# install.packages('XML', repos = 'http://www.omegahat.net/R') BiocManager::install('biomaRt')
library("biomaRt")
# listMarts()
ensembl <- useMart("ensembl")
ensembl <- useDataset(paste0(species, "_gene_ensembl"), mart = ensembl)

# takes a few minuites to match gene to name
gene_name <- getBM(attributes = c("ensembl_gene_id", "external_gene_name"), filters = "ensembl_gene_id", values = node_list_all$Gene_num, mart = ensembl)

# de-duplicate and match gene name using gene id
gene_name <- gene_name[!duplicated(gene_name$ensembl_gene_id), ]

gene_node_all <- merge(node_list_all, gene_name, by.x = "Gene_num", by.y = "ensembl_gene_id", all.x = TRUE)

names(gene_node_all)[names(gene_node_all) == "Gene"] <- "Gene_id"
names(gene_node_all)[names(gene_node_all) == "external_gene_name"] <- "Gene_name"
names(gene_node_all)[names(gene_node_all) == "Gene_node"] <- "Node_id"
gene_node_all$Node_name <- paste(gene_node_all$Gene_name, gene_node_all$Node, sep = "_")

message("Save results...")

saveRDS(matrix.scaled_diff_selected, paste(OUTPUT, "_matrix_scaled_diff_selected.rds", sep = ""))
saveRDS(matrix.above, paste(OUTPUT, "_matrix_above.rds", sep = ""))
saveRDS(matrix.below, paste(OUTPUT, "_matrix_below.rds", sep = ""))
saveRDS(diff_cut, paste(OUTPUT, "_diff_cut.rds", sep = ""))
saveRDS(stats, paste(OUTPUT, "_stats.rds", sep = ""))
saveRDS(gene_node_all, paste(OUTPUT, "_gene_node_all.rds", sep = ""))
YY-SONG0718/scfindME documentation built on April 17, 2023, 4:27 a.m.