#!/usr/bin/env Rscript
#author: "Theron Palmer"
#date: "08/06/2021"
library(stringr)
library(dplyr)
library(optparse)
library(TCGAutils)
#------------------------------------------------------------------------------#
# handling command line input
arguments <- parse_args(OptionParser(usage = "",
description="",
option_list=list(
make_option(c("-d","--dat_file"),
default = sprintf("%s",getwd()),
help="dat_file"),
make_option(c("-s","--splice_dat"),
default = sprintf("%s",getwd()),
help="splicemutr data"),
make_option(c("-m","--mut_count"),
default = sprintf("%s",getwd()),
help="mutation counts"),
make_option(c("-g","--genotypes_files"),
default = sprintf("%s",getwd()),
help="genotypes files"),
make_option(c("-a","--hla_files"),
default = sprintf("%s",getwd()),
help="hla files"),
make_option(c("-r","--groups_file"),
default = sprintf("%s",getwd()),
help="groups file"),
make_option(c("-j","--junc_rse"),
default = sprintf("%s",getwd()),
help="junc_rse"))))
opt=arguments
dat_file <- opt$dat_file
if (!file.exists(dat_file)){
print(dat_file)
quit(save="no")
}
load(dat_file)
splicemutr_file <- opt$splice_dat
mutation_count_file <- opt$mut_count
genotypes_files <- opt$genotypes_files
HLA_files <- opt$hla_files
groups_file <- opt$groups_file
junc_rse_file <- opt$junc_rse
#------------------------------------------------------------------------------#
# Internal functions
create_tcga_splicemutr <- function(introns,splicemutr_dat){
juncs_to_find <- sprintf("%s:%s:%s",introns$chr,introns$start,introns$end)
rownames(introns) <- juncs_to_find
splicemutr_dat$juncs <- sprintf("%s:%s:%s",splicemutr_dat$chr,splicemutr_dat$start,splicemutr_dat$end)
specific_splicemutr_dat <- subset(splicemutr_dat,juncs %in% juncs_to_find)
specific_splicemutr_dat$cluster <- introns[specific_splicemutr_dat$juncs,"clusterID"]
specific_splicemutr_dat$verdict <- introns[specific_splicemutr_dat$juncs,"verdict"]
specific_splicemutr_dat$deltapsi <- introns[specific_splicemutr_dat$juncs,"deltapsi"]
return(specific_splicemutr_dat)
}
#------------------------------------------------------------------------------#
# reading in the data
# genotypes_files <- "/media/theron/My_Passport/TCGA_junctions/ext_dat/OptiTypeCallsHLA_20171207.tsv"
# HLA_files <- "/media/theron/My_Passport/TCGA_junctions/summary_files/%s_tx_dict_summary_perc.txt"
# splicemutr_file <- "/media/theron/My_Passport/TCGA_junctions/formed_transcripts/data_splicemutr.txt"
# mutation_count_file <-"/media/theron/My_Passport/TCGA_junctions/cbioportal_data/Mutation_Count.txt"
# dat_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/data.Rdata"
# load(dat_file)
# groups_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/groups_file.txt"
# junc_rse_file <- "/media/theron/My_Passport/TCGA_junctions/TCGA_cancers/CHOL/juncrse.rds"
leafcutter_groups <- read.table(groups_file)
colnames(leafcutter_groups) <- c("external_id","type")
splicemutr_dat <- read.table(splicemutr_file, sep = " ",header=T)
mutation_counts <- read.table(mutation_count_file, sep="\t",header=T)
genotypes <- read.table(genotypes_files,sep=",",header=T)
junc_rse <- readRDS(junc_rse_file)
junc_metadata <- as.data.frame(junc_rse@colData@listData)
#------------------------------------------------------------------------------#
# assigning filenames to genotypes
genotypes$A1 <- unname(vapply(genotypes$A1,function(HLA){
HLA_split<-str_split(HLA,"[*:]")[[1]]
sprintf("HLA-%s%s-%s",HLA_split[1],HLA_split[2],HLA_split[3])
},character(1)))
genotypes$A2 <- unname(vapply(genotypes$A2,function(HLA){
HLA_split<-str_split(HLA,"[*:]")[[1]]
sprintf("HLA-%s%s-%s",HLA_split[1],HLA_split[2],HLA_split[3])
},character(1)))
genotypes$B1 <- unname(vapply(genotypes$B1,function(HLA){
HLA_split<-str_split(HLA,"[*:]")[[1]]
sprintf("HLA-%s%s-%s",HLA_split[1],HLA_split[2],HLA_split[3])
},character(1)))
genotypes$B2 <- unname(vapply(genotypes$B2,function(HLA){
HLA_split<-str_split(HLA,"[*:]")[[1]]
sprintf("HLA-%s%s-%s",HLA_split[1],HLA_split[2],HLA_split[3])
},character(1)))
genotypes$C1 <- unname(vapply(genotypes$C1,function(HLA){
HLA_split<-str_split(HLA,"[*:]")[[1]]
sprintf("HLA-%s%s-%s",HLA_split[1],HLA_split[2],HLA_split[3])
},character(1)))
genotypes$C2 <- unname(vapply(genotypes$C2,function(HLA){
HLA_split<-str_split(HLA,"[*:]")[[1]]
sprintf("HLA-%s%s-%s",HLA_split[1],HLA_split[2],HLA_split[3])
},character(1)))
#------------------------------------------------------------------------------#
# matching TCGA samples between mutation counts and genotypes
mutation_IDs <- vapply(genotypes$aliquot_id,function(ID){
id_parts <- str_split(ID,"[-]")[[1]]
sprintf("%s-%s-%s",id_parts[1],id_parts[2],id_parts[3])
},character(1))
genotypes$mutation_IDs <- mutation_IDs
cancer_type <- unname(vapply(genotypes$mutation_IDs,function(ID){
cancer <- str_split(mutation_counts[mutation_counts$Patient.ID==ID,"Study.ID"],"[_]")
if (length(cancer)==0){
cancer <- "NONE"
} else {
cancer <- cancer[[1]][1]
}
},character(1)))
tum_or_norm <- unname(vapply(genotypes$aliquot_id,function(ID){
type <- str_split(ID,"[-]")[[1]][4]
type<-as.numeric(substr(type,1,2))
if (type >= 1 & type <= 9){
return("T")
} else if (type >= 10 & type <= 19) {
return("N")
}
return("C")
},character(1)))
genotypes$cancer_type <- cancer_type
genotypes$tum_or_norm <- tum_or_norm
genotypes <- genotypes %>% dplyr::filter(cancer_type != "NONE")
cancer_types <- unique(genotypes$cancer_type)
# genotypes <- genotypes %>% dplyr::filter(cancer_type == tolower(basename(dirname(dat_file))))
#------------------------------------------------------------------------------#
# determining the cancer type per mutation count sample
mutation_counts$cancer <- vapply(mutation_counts$Study.ID,
function(ID){str_split(ID,"[_]")[[1]][1]},
character(1))
# mutation_counts <- mutation_counts %>% dplyr::filter(cancer == tolower(basename(dirname(dat_file))))
#------------------------------------------------------------------------------#
# creating uniform names
metadata_rows <- unlist(lapply(leafcutter_groups$external_id,
function(external_id){
which(junc_metadata$external_id == external_id)
}))
leafcutter_groups$sample_id <- vapply(leafcutter_groups$external_id,
function(ID){
barcode <- TCGAbarcode(junc_metadata$tcga.tcga_barcode[which(junc_metadata$external_id == ID)],
sample=T)
substr(barcode,1,nchar(barcode)-1)
},character(1))
genotypes$sample_id <- TCGAbarcode(genotypes$aliquot_id, sample=T)
genotypes$sample_id <- vapply(genotypes$sample_id,function(ID){substr(ID,1,nchar(ID)-1)},character(1))
#------------------------------------------------------------------------------#
# joining leafcutter, genotypes, and mutation_count data to the leafcutter groups information
cols <- c("A1","A2","B1","B2","C1","C2")
HLA_alleles <- lapply(seq(length(leafcutter_groups$sample_id)),function(num){
ID <- leafcutter_groups$sample_id[num]
type <- leafcutter_groups$type[num]
external_id <- leafcutter_groups$external_id[num]
a<-genotypes[genotypes$sample_id == ID,cols]
if (nrow(a) == 0){
a <- data.frame(t(rep(NA,6)))
colnames(a) <-colnames(genotypes[,seq(6)])
}
a$sample_id <- ID
a$type <- type
a$external_id <- external_id
return(a)
})
genotypes_leafcutter <- data.frame(HLA_alleles[[1]])
for (i in seq(2,length(HLA_alleles))){
genotypes_leafcutter <- rbind(genotypes_leafcutter,
HLA_alleles[[i]])
}
genotypes_leafcutter <- unique(genotypes_leafcutter)
genotypes_leafcutter$mut_counts<- vapply(genotypes_leafcutter$sample_id,function(ID){
count <- mutation_counts[mutation_counts$Sample.ID == ID,"Mutation.Count"]
if (length(count) == 0){
return(0)
} else {
return(count)
}
},numeric(1))
#------------------------------------------------------------------------------#
# calculating average tumor and average normal scores per sample
# dat <- data.frame(matrix(0,nrow=nrow(splicemutr_dat),ncol=nrow(genotypes_leafcutter)))
# vec <- vector("list",nrow(splicemutr_dat))
# gene_row <- nrow(genotypes_leafcutter)
# for (i in seq(1,gene_row)){
# row_vec <- rep(F,gene_row)
# print(sprintf("%s:%d:%d",basename(dirname(dat_file)),i,gene_row))
# class1_alleles <- as.character(unname(genotypes_leafcutter[i,seq(6)]))
# iter <- 0
# kmers <- vector(mode = "list", length = nrow(splicemutr_dat))
# for (allele in class1_alleles){
# file_HLA <- sprintf(HLA_files,str_replace(allele,":","-"))
# if (!file.exists(file_HLA)){next}
# info = file.info(file_HLA)
# if (info$size == 0){next}
# iter <- iter + 1
# HLA_dat <- read.table(file_HLA)
# HLA_dat[,1]<-as.numeric(HLA_dat[,1])+1
# rows <- HLA_dat[,1]
# kmers_split<-str_split(HLA_dat[,2],":")
# fill <- vapply(seq(length(rows)),function(row){
# kmers[[rows[row]]]<<-unique(c(kmers[[rows[row]]],kmers_split[[row]]))
# return(T)
# },logical(1))
# }
# kmers <- unlist(lapply(seq(length(kmers)),function(val){
# paste(kmers[[val]],collapse=":")
# }))
# dat[,i] <- kmers
# }
# cols_to_remove <- which(is.na(genotypes_leafcutter$A1))
# tumor_cols <- genotypes_leafcutter$type == "T"
# tumor_cols[cols_to_remove] <- F
# normal_cols <- genotypes_leafcutter$type == "N"
# normal_cols[cols_to_remove] <- F
#
#
# splicemutr_dat <- cbind(splicemutr_dat, dat)
#------------------------------------------------------------------------------#
# creating the specific splicemutr data
# specific_splicemutr_dat <- create_tcga_splicemutr(introns,splicemutr_dat)
#
# write.table(specific_splicemutr_dat,
# file=sprintf("%s/%s_splicemutr.txt",dirname(dat_file),basename(dirname(dat_file))),
# sep="\t",
# quote=F,
# col.names=T,
# row.names=F)
write.table(genotypes_leafcutter,
file=sprintf("%s/%s_genotypes.txt",dirname(dat_file),basename(dirname(dat_file))),
sep="\t",
quote=F,
col.names=T,
row.names=F)
write.table(junc_metadata,
file=sprintf("%s/%s_metadata.txt",dirname(dat_file),basename(dirname(dat_file))),
sep="\t",
quote=F,
col.names=T,
row.names=F)
write.table(introns,
file=sprintf("%s/introns.txt",dirname(dat_file)),
sep="\t",
quote=F,
col.names=T,
row.names=F)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.