#Clear workspace
rm(list=ls())
#Import needed packages
library(MutationalPatterns)
library(BSgenome)
library("BSgenome.Hsapiens.UCSC.hg19", character.only = TRUE)
library(plyr)
#Set reference genome
ref_genome = "BSgenome.Hsapiens.UCSC.hg19"
#Load cosmic_signatures from file in folder
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3")
cosmic_signatures <- as.matrix(read.table("cosmic_signatures.txt",header=TRUE))
#Set wd
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files")
# Cohort section ----------------------------------------------------------
#Read all vcf_cohort files
vcf_cohort_list <- list.files(path="C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files",recursive=TRUE,pattern="cohort.vcf")
#Optional: Select only certain cancertypes
vcf_cohort_list <- vcf_cohort_list[gsub("/.*","",vcf_cohort_list) %in% c('CHOL','ACC','USC','UVM','DLBC','MESO')]
name_cohort <- gsub("/.*","",vcf_cohort_list)
#Read each cohort.vcf file as Grange oject
cohort_vcfs <- read_vcfs_as_granges(vcf_cohort_list,name_cohort,ref_genome)
#Create mutational matrix for each cancer type
cohort_mut_matrix = mut_matrix(cohort_vcfs,ref_genome)
#optional, extract optimal rank for NMF
if (FALSE){
library(NMF)
#Add pseudocount to mut matrix ?
#mut_mat <- mut_mat + 0.0001
estim.r <- nmf(cohort_mut_matrix, rank=2:8,method="brunet", nrun=10, seed=123456)
plot(estim.r)
}
#Extract mutational signatures (Cant extract more signatures than given samples)
number_of_signatures <- 5
cohort_extracted_signatures = extract_signatures(cohort_mut_matrix, number_of_signatures)
colnames(cohort_extracted_signatures$signatures) <- c(1:number_of_signatures)
#Plot extracted signatures
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/Pictures/Cohort")
pdf("Extracted_signatures.pdf")
plot_96_profile(cohort_extracted_signatures$signatures, condensed = TRUE)
dev.off()
#Compare extracted signatures with cosmic signatures
cosine_similarity = cos_sim_matrix(cohort_extracted_signatures$signatures, cosmic_signatures)
# Plot heatmap with specified signature order
hclust_cosmic = cluster_signatures(cosmic_signatures, method = "average")
# store signatures in new order
cosmic_order = colnames(cosmic_signatures)[hclust_cosmic$order]
pdf("Extracted_vs_Cosmic_cosine.pdf")
plot_cosine_heatmap(cosine_similarity, col_order = cosmic_order, cluster_rows = TRUE)
dev.off()
#Plot contribution of extracted signature vs cancer
pdf("Cancer_vs_Contribution_extracted_signatures.pdf")
plot_contribution(cohort_extracted_signatures$contribution, cohort_extracted_signatures$signature, mode = "absolute", coord_flip = TRUE)
dev.off()
#Plot contribution of cosmic vs cancer
fit_res <- fit_to_signatures(cohort_mut_matrix, cosmic_signatures)
# Select signatures with some contribution
select <- which(rowSums(fit_res$contribution) > 10)
# Plot contribution barplot
pdf("Cancer_vs_contribution_Cosmic_signatures.pdf")
plot_contribution(fit_res$contribution[select,],
cosmic_signatures[,select],
coord_flip = TRUE,
mode = "absolute")
dev.off()
# Section One cancer type at a time ---------------------------------------
#load cohort_table.txt
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3")
Cohort <- read.table("cohort.txt",header = TRUE)#maybe unness could just aswell list all files in folder (except cohort.vcf files)
#list.dirs(path = "C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files", full.names = TRUE, recursive = TRUE)
#loop over each cancer type
cancer_list <- as.vector(unique(Cohort$subtype))
cancer_list <- cancer_list[cancer_list %in% c("UCS","UVM") ]
#Alter what cancer types you want to loop over
#not_in = head(cancer_list,29)
#cancer_list <- cancer_list[!(cancer_list %in% not_in)]
for (cancer_type in cancer_list){
print(paste("Starting on ",cancer_type,sep=""))
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files")
VCF_files_to_read <- as.vector(Cohort$vcf[Cohort$subtype %in% cancer_type])
VCF_files_ID <- as.vector(Cohort$sample[Cohort$subtype %in% cancer_type])
#Read VCF as Grange object
print("reading file")
cancer_vcfs <- read_vcfs_as_granges(VCF_files_to_read,VCF_files_ID,ref_genome) #Gets warning messages about alternative alleles
#Create mutational matrix
print("creating mutational matrix")
mutational_matrix = mut_matrix(cancer_vcfs,ref_genome)
#Write only TCGA Patient ID
colnames(mutational_matrix) <- gsub("-[A-Z0-9]*-[A-Z0-9]*-[A-Z0-9]*$","",(gsub("TCGA-[A-Z0-9]*-","",colnames(mutational_matrix))))
colnames(mutational_matrix)
#Optinal find best rank
if (FALSE){
library(NMF)
#Add pseudocount to mut matrix ?
#mut_mat <- mut_mat + 0.0001
estim.r <- nmf(mutational_matrix, rank=1:8,method="brunet", nrun=20, seed=123456)
plot(estim.r)
}
#Extract mutational signatures (Cant extract more signatures than given samples)
#Rank set to 6 , see rankoptimisation ACC
number_of_signatures <- 6
print("Extracting signatures")
extracted_sign = extract_signatures(mutational_matrix, number_of_signatures)
print("Made it past extracting")
colnames(extracted_sign$signatures) <- c(1:number_of_signatures)
rownames(extracted_sign$contribution) <- c(1:number_of_signatures)
#Create folder for saving plots and set wd
directory_name <- paste("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/Pictures","/",cancer_type,sep="")
#crate folder if it doesnt exist
ifelse(!dir.exists(file.path(directory_name)), dir.create(file.path(directory_name)), FALSE)
setwd(directory_name)
pdf_name <- paste(cancer_type,"_all_plots.pdf",sep = "")
pdf(pdf_name)
#Plot extracted signatures
print(plot_96_profile(extracted_sign$signatures, condensed = TRUE))
#Compare extracted signatures with cosmic signatures
cosine_similarity = cos_sim_matrix(extracted_sign$signatures, cosmic_signatures)
# Plot heatmap with specified signature order
hclust_cosmic = cluster_signatures(cosmic_signatures, method = "average")
# store signatures in new order
cosmic_order = colnames(cosmic_signatures)[hclust_cosmic$order]
print(plot_cosine_heatmap(cosine_similarity, col_order = cosmic_order, cluster_rows = TRUE))
#Ugly plot for studies with many patients
k_vector <- c(1:33)
k_vector <- c(1:length(VCF_files_to_read))
while (length(k_vector) > 0) {
interval_to_plot <- head(k_vector,100)
print(plot_contribution(extracted_sign$contribution, extracted_sign$signature,
mode = "absolute", coord_flip = TRUE,index = interval_to_plot))
k_vector <- k_vector[! (k_vector %in% interval_to_plot) ]
}
#Plot contribution of extracted signature vs cancer
#print(plot_contribution(extracted_sign$contribution, extracted_sign$signature, mode = "absolute", coord_flip = TRUE))
#Plot contribution of cosmic vs cancer
fit_res <- fit_to_signatures(mutational_matrix, cosmic_signatures)
# Select signatures with some contribution
select <- which(rowSums(fit_res$contribution) > 10)
# Plot contribution barplot
#Ugly plot for studies with many patients
k_vector <- c(1:length(VCF_files_to_read))
while (length(k_vector) > 0) {
interval_to_plot <- head(k_vector,100)
print(plot_contribution(fit_res$contribution[select,],
cosmic_signatures[,select],
coord_flip = TRUE,
mode = "absolute",
index = interval_to_plot))
k_vector <- k_vector[! (k_vector %in% interval_to_plot) ]
}
#Plot cosine mutational_matrix vs cosmic_signature
#Similarity between cosmic_signatures and mut_matrix
cos_sim_samples_cosmic <- cos_sim_matrix(mutational_matrix, cosmic_signatures)
#error NA/NaN/Inf in foreign function call #2835-03B
#cos_sim_s amples_cosmic[which(cos_sim_samples_cosmic == 0.0000000000)] <- 0.0000000001
#######CRAAAAAAASSSSSSSSSSSH here
print(plot_cosine_heatmap(cos_sim_samples_cosmic,
col_order = cosmic_order,
cluster_rows = TRUE))
#Plot cosine mutational_matrix vs extracted_signatures
print(plot_contribution_heatmap(extracted_sign$contribution, cluster_samples=TRUE))
#Save-Data as R object
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/generated_data")
#Create folder for saving data and set wd
directory_name <- paste("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/generated_data","/",cancer_type,sep="")
#crate folder if it doesnt exist
ifelse(!dir.exists(file.path(directory_name)), dir.create(file.path(directory_name)), FALSE)
setwd(directory_name)
save(mutational_matrix,file="mut_mat.rda")
save(extracted_sign, file=paste("extracted_sign_",number_of_signatures,".rda",sep = ""))
#Calculate number of mutations for each sample.
#Does not get the same as the cohort$mutations -> why?
Number_of_mutation <- cancer_vcfs@unlistData@ranges@NAMES
Number_of_mutation <- count(Number_of_mutation)
colnames(Number_of_mutation) <- c("Sample_id","#mutations")
table_name <- paste(cancer_type,"_number_of_mutations.txt")
write.table(Number_of_mutation,table_name,row.names = FALSE)
#Plot mutation distribution
d <- density(Number_of_mutation$`#mutations`)
print(plot(d, type="n", main="Distribution of mutations")
,polygon(d, col="lightgray", border="gray")
,rug(Number_of_mutation$`#mutations`, col="red"))
dev.off()
#break() #turn on if you want to try only one sample
}
# Total/Extract signatures from highly mutated samples at the same time --------------
rm(list=ls())
#Import needed packages
library(MutationalPatterns)
library(BSgenome)
library("BSgenome.Hsapiens.UCSC.hg19", character.only = TRUE)
library(plyr)
library(dplyr)
#Set reference genome
ref_genome = "BSgenome.Hsapiens.UCSC.hg19"
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files")
vcf_list <- list.files(path="C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files",recursive=TRUE,pattern=".vcf")
#Remove cohort.vcf files
vcf_list <- vcf_list[grep("cohort",vcf_list,invert = TRUE)]
vcf_list_names <- gsub(".vcf","",gsub("[A-Z0-9]*/","",vcf_list))
#Get list with highly mutated samples
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3")
cohort <- read.table("cohort.txt",stringsAsFactors = FALSE)
treshold_nmut <- function(cancer_name){
samples <- cohort %>% filter(subtype == cancer_name)
median <- median(samples$nmut)
S <- sd(samples$nmut)
treshold <- median + 3*S
high_mut_sample <- samples %>% filter(nmut > treshold) %>% select(sample)
return(high_mut_sample)
}
#Extract vector of names containing highly mutated samples
High_mut_samples <- sapply(unique(cohort$subtype), treshold_nmut)
High_mut_samples <- unlist(High_mut_samples)
High_mut_samples <- unname(High_mut_samples)
#Trim vcf_list for only high mutation samples
vcf_list <- vcf_list[vcf_list_names %in% High_mut_samples]
vcf_list_names <- gsub(".vcf","",gsub("[A-Z0-9]*/","",vcf_list))
#--------------------------------------------------------------------
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files")
#Read vcf-files and save as R object.
start_time <- Sys.time()
print("Reading all vcfs")
all_vcfs <- read_vcfs_as_granges(vcf_list,vcf_list_names,ref_genome)
print("Done reading all vcfs!!!!!!!")
end_time <- Sys.time()
print("Read time")
end_time - start_time # Took about 10 min for 65 samples
#Save the all_vcfs object
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/Global_mutsign/High_mutations(3s)")
save(all_vcfs,file="high_mut_vcfs.rda")
#Extract mutational matrix
print("creating mutational matrix")
mutational_matrix = mut_matrix(all_vcfs,ref_genome)
save(mutational_matrix,file="high_mut_matrix.rda")
#Add cancer abb as name
colnames(mutational_matrix) <- gsub("/.*$","",vcf_list) ####################For having cancer abb as names
#[cohort$sample %in% vcf_list_names]
#just to give unique name to each (so one can plot)
for (i in 1:length(colnames(mutational_matrix))) {
colnames(mutational_matrix)[i]<- paste(colnames(mutational_matrix)[i],i,sep="-")
}
#Find optimal rank
library(NMF)
#Add pseudocount to mut matrix ?
#mut_mat <- mut_mat + 0.0001
estim.r <- nmf(mutational_matrix, rank=1:8,method="brunet", nrun=12, seed=123456)
plot(estim.r)
#Clear workspace
#rm(all_vcfs,vcf_list,vcf_list_names)
#Extract signatures
number_of_signatures <- 2
print("Extracting signatures")
extracted_sign = extract_signatures(mutational_matrix, number_of_signatures)
colnames(extracted_sign$signatures) <- c(1:number_of_signatures)
rownames(extracted_sign$contribution) <- c(1:number_of_signatures)
save(extracted_sign, file="extracted_sign_high_mut.rda")
print("Made it past extracting")
#colnames(extracted_sign$contribution) <- colnames(mutational_matrix) ####################For having cancer abb as name
#Needed for comparing to cosmic
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3")
#cosmic_signatures <- as.matrix(read.table("cosmic_signatures.txt",header=TRUE))
cosmic_signatures <- as.matrix(read.table("cosmic_signatures_extended.txt",header=TRUE))
#Print results
#Set options here
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/Global_mutsign/High_mutations")
pdf_name <- paste("HighMut_new_cosmic.pdf")
print_mutsign <- function() {
pdf(pdf_name)
#Plot extracted signatures
print(plot_96_profile(extracted_sign$signatures, condensed = TRUE))
#Compare extracted signatures with cosmic signatures
cosine_similarity = cos_sim_matrix(extracted_sign$signatures, cosmic_signatures)
# Plot heatmap with specified signature order
hclust_cosmic = cluster_signatures(cosmic_signatures, method = "average")
# store signatures in new order
cosmic_order = colnames(cosmic_signatures)[hclust_cosmic$order]
print(plot_cosine_heatmap(cosine_similarity, col_order = cosmic_order, cluster_rows = TRUE))
#Ugly plot for studies with many patients
k_vector <- c(1:length(vcf_list))
#k_vector <- c(1:length(VCF_files_to_read))
while (length(k_vector) > 0) {
interval_to_plot <- head(k_vector,100)
print(plot_contribution(extracted_sign$contribution, extracted_sign$signature,
mode = "absolute", coord_flip = TRUE,index = interval_to_plot))
k_vector <- k_vector[! (k_vector %in% interval_to_plot) ]
}
#Plot contribution of extracted signature vs cancer
#print(plot_contribution(extracted_sign$contribution, extracted_sign$signature, mode = "absolute", coord_flip = TRUE))
#Plot contribution of cosmic vs cancer
fit_res <- fit_to_signatures(mutational_matrix, cosmic_signatures)
# Select signatures with some contribution
select <- which(rowSums(fit_res$contribution) > 10)
# Plot contribution barplot
#Ugly plot for studies with many patients
k_vector <- c(1:length(vcf_list))
while (length(k_vector) > 0) {
interval_to_plot <- head(k_vector,100)
print(plot_contribution(fit_res$contribution[select,],
cosmic_signatures[,select],
coord_flip = TRUE,
mode = "absolute",
index = interval_to_plot))
k_vector <- k_vector[! (k_vector %in% interval_to_plot) ]
}
#Plot cosine mutational_matrix vs cosmic_signature
#Similarity between cosmic_signatures and mut_matrix
cos_sim_samples_cosmic <- cos_sim_matrix(mutational_matrix, cosmic_signatures)
#error NA/NaN/Inf in foreign function call #2835-03B
#cos_sim_s amples_cosmic[which(cos_sim_samples_cosmic == 0.0000000000)] <- 0.0000000001
#######CRAAAAAAASSSSSSSSSSSH here
print(plot_cosine_heatmap(cos_sim_samples_cosmic,
col_order = cosmic_order,
cluster_rows = TRUE))
#Plot cosine mutational_matrix vs extracted_signatures
print(plot_contribution_heatmap(extracted_sign$contribution, cluster_samples=TRUE))
#plot reconstructed mutational profiles from extracted
print(plot_compare_profiles(mutational_matrix[,1],
extracted_sign$reconstructed[,1],profile_names = c("Original", "Reconstructed"),condensed = TRUE))
#plot reconstructed mutational profiles from cosmic
print(plot_compare_profiles(mutational_matrix[,1], fit_res$reconstructed[,1],
profile_names = c("Original", "Reconstructed(cosmic)"),condensed = TRUE))
dev.off()
}
print_mutsign()
# Cluster cosine-sim matrix
##################################################
rm(list=ls())
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/Global_mutsign/High_mutations(3s)")
#scource("C:/Users/Nils_/MutationalPatterns/R")
#source("C:/Users/Nils_/MutationalPatterns/R")
load("high_mut_matrix.rda")
colnames(mutational_matrix) <- gsub("-[A-Z0-9]*-[A-Z0-9]*-[A-Z0-9]*$","",colnames(mutational_matrix))
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3")
cosmic_signatures <- as.matrix(read.table("cosmic_signatures_extended.txt",header=TRUE))
#Modify mutational patterns package (Open file nils)
#"C:/Users/Nils_/MutationalPatterns/R" modify code
#Only wokrs if mutational patterns already installed?
#install.packages("C:/Users/Nils_/MutationalPatterns", repos = NULL, type = "source")
#library(MutationalPatterns)
library(MutationalPatterns)
library(dplyr)
library(robustbase)
###############################################
cos_sim_samples_cosmic <- cos_sim_matrix(mutational_matrix, cosmic_signatures)
sample_cluster <- hclust(dist(cos_sim_samples_cosmic,method="euclidean"),method="complete")
N<- 10
method <- "complete"
cluster_groups <- cutree(sample_cluster,k=N)
#Plot with found clusters
rect.hclust.labels <- function(cluster,k){
cluster_groups <- cutree(cluster,k=N)
X <- table(cluster_groups)[unique(cluster_groups[cluster$order])]
m <- c(0, cumsum(X))
library(zoo)
xpos <- rollmean(m,k=2)
#ypos <- rep(par("usr")[3L],length(X))
ypos <- rep(2.2,length(X))
text(xpos, ypos, names(X), adj=c(0,0), pos=3, col="black", cex=0.75)
}
#plot(sample_cluster, cex = 0.5,labels=FALSE,xlab="",sub="",ylab="",axes=F,main="") # for naked/simple
plot(sample_cluster, cex = 0.5,labels=FALSE,xlab="",sub="",ylab="",axes=F,main="")
rect.hclust(sample_cluster, k = N, border = 2:5)
rect.hclust.labels(sample_cluster,k=N)
#Test
#Extract what signatures are over certain treshold in each cluster
sampleDF <- data.frame("cluster"=cluster_groups,"sample"=names(cluster_groups))
treshold <- 0.6
signatures_in_cluster <- function(cluster_id){
samples <- sampleDF %>% filter(cluster == cluster_id) %>% select(sample)
samples_cosine <- cos_sim_samples_cosmic[rownames(cos_sim_samples_cosmic) %in% samples$sample ,]
samples_cosine_mean <- colMeans(samples_cosine)
samples_cosine_median <- colMedians(as.matrix(samples_cosine))
sig_signatures <- samples_cosine_mean[samples_cosine_mean > treshold]
sig_signatures <- sort(sig_signatures,decreasing = TRUE)
return(sig_signatures)
}
sampleDF <- data.frame("cluster"=cluster_groups,"sample"=names(cluster_groups))
signatures <- lapply(c(1:7),signatures_in_cluster)
signatures
#plot cosine-heatmap
hclust_cosmic = cluster_signatures(cosmic_signatures, method = "average")
# store signatures in new order
cosmic_order = colnames(cosmic_signatures)[hclust_cosmic$order]
print(plot_cosine_heatmap(cos_sim_samples_cosmic, col_order = cosmic_order, cluster_rows = TRUE))
#Determine optimal number of clusters (3 different methods)
library(factoextra)
#elbow method
fviz_nbclust(cos_sim_samples_cosmic, FUN = hcut, method = "wss")
#silhouette method
fviz_nbclust(cos_sim_samples_cosmic, FUN = hcut, method = "silhouette")
#Gap statistic method
set.seed(123)
gap_stat <- clusGap(cos_sim_samples_cosmic, FUN = kmeans, nstart = 25,K.max = 15, B = 50)
print(gap_stat, method = "firstmax")
fviz_gap_stat(gap_stat)
#tidyverse seems like a nice package !!!!
#Extract what samples are in each cluster
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/Global_mutsign/High_mutations(3s)")
cluster_groups <- data.frame("Sample_id"=vcf_list_names,"Cluster"=as.vector(cluster_groups))
filename <- paste("Sample_In_Cluster",method,N,sep="_")
write.table(cluster_groups,paste(filename,".txt",sep=""),row.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.