Examples/mutational_pattern_test.R

rm(list=ls())
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3")

TSS2Study_DF <- read.table("TSS2Studyabb.txt",header=TRUE)




#library(BiocParallel)
library(BSgenome)
library(GenomicRanges)
library("BSgenome.Hsapiens.UCSC.hg19", character.only = TRUE)
library(MutationalPatterns)
library(plyr)
library(data.table)
#Set reference genome
ref_genome = "BSgenome.Hsapiens.UCSC.hg19"


# Read mc3-file (From PanCan)

#MC3_DF <- read.table('mc3.v0.2.8.PUBLIC.maf.gz',header=TRUE,sep='\t',nrow=4000); #Has a total of 3600964 rows Around 24 min to read whole file




start_time <- Sys.time()


MC3_DF <- fread('mc3.v0.2.8.PUBLIC.maf.gz') #<- run for entire file

end_time <- Sys.time()
print("Read time")
end_time - start_time


#Select only necessary columns to save memory
library(dplyr)
MC3_DF <- select(MC3_DF, Chromosome,Tumor_Seq_Allele2,Tumor_Sample_Barcode,Start_Position,Reference_Allele)




#Add Study.abbrevation to the MC3_DF matrix.
MC3_DF$TSS.Code <-gsub("-[A-Z0-9]*-[A-Z0-9]*-[A-Z0-9]*-[A-Z0-9]*-[A-Z0-9]*$","",gsub("TCGA-","",MC3_DF$Tumor_Sample_Barcode))
MC3_DF <- join(MC3_DF,TSS2Study_DF, by="TSS.Code")



#MC3_DF <- read.table('mc3.v0.2.8.PUBLIC.code.maf',header=TRUE,sep='\t',nrow=200);


#Save mc3-file in separate folders as VCF-files

MC3_DF_CancerType <- split (MC3_DF, f=MC3_DF$Study.Abbreviation,drop=TRUE) # mayby i need drop = TRUE ?




vcffile=vector()
subtype=vector()
sample=vector()



#Start of loops --------------------------------------------

for (cancerType in MC3_DF_CancerType){ # Loop over all cancer types
  
  #Add a catch that prevents closing loop
  if (dim(cancerType)[1]  == 0) {
    
    next()
  
  }
    

  
  directory_name <- paste("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files","/",unique(cancerType$Study.Abbreviation),sep="") 
  
  #crate folder if it doesnt exist
  ifelse(!dir.exists(file.path(directory_name)), dir.create(file.path(directory_name)), FALSE)
  print (directory_name)
  setwd(directory_name)
  
  
  #Split cancer type into individual samples
  MC3_DF_Sample <- split (cancerType, f= cancerType$Tumor_Sample_Barcode,drop = TRUE) # not sure if this works
  
  
 
  

  

    for (Sample in MC3_DF_Sample){ #Loop over each sample in respective cancer type
      
      
      if (dim(Sample)[1]  == 0) { ########### Why does CancerType get split into empty samples??????????!!!!!!!!
                                  #Seem to split MC3_DF_Sample but still save old levels  # try drop = TRUE in split command
        
        next()
        
      } 
      
      
    
      sample_id <- unique(Sample$Tumor_Sample_Barcode);
      vcfdata <- matrix(".", nrow=nrow(Sample), ncol = 10);
      columns=c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT",sample_id )
      colnames(vcfdata) <- columns
      
      vcfdata[,1]=as.character(Sample$Chromosome)
      vcfdata[,2]=as.character(Sample$Start_Position)
      vcfdata[,4]=as.character(Sample$Reference_Allele)
      vcfdata[,5]=as.character(Sample$Tumor_Seq_Allele2)
      vcfdata[,9]="GT"
      vcfdata[,10]="1/0"
      #vcfdata[,3]=as.character(sample_id)
      outfile <- paste(sample_id,".vcf", sep="")
      #outfile = paste(directory_name,outfile, sep="/")
      #write.table(vcfdata, file=outfile, row.names=FALSE, sep="\t", quote=FALSE)
      
      subtype <- c(subtype,as.character(unique(cancerType$Study.Abbreviation)))
      sample <- c(sample,as.character(sample_id))
      vcffile <- c(vcffile, paste (unique(cancerType$Study.Abbreviation),outfile,sep = "/") )
    
  }
  
  
}



cohort <- data.frame("vcf"=vcffile, "subtype"=subtype, "sample"=sample)

#Count number of mutations and add to cohort table
setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files")
library(R.utils)
nmut_total <- sapply(cohort$vcf,countLines)-1
cohort$nmut <- nmut_total
#write.table(cohort,"cohort.txt") # new code


#END OF LOOPS----------------------------------------------






#Save cohort_level vcf files.
for (cancerType in MC3_DF_CancerType) {
  
  #Add a catch that prevents closing loop
  if (dim(cancerType)[1]  == 0) {
    
    next()
    
  }
  
  directory_name <- paste("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files","/",unique(cancerType$Study.Abbreviation),sep="") 
  
  
  #crate folder if it doesnt exist
  ifelse(!dir.exists(file.path(directory_name)), dir.create(file.path(directory_name)), FALSE)
  print (directory_name)
  setwd(directory_name)
  
  #Save data to vcf file
  
  
  cancer_abb <- unique(cancerType$Study.Abbreviation);
  vcfdata <- matrix(".", nrow=nrow(cancerType), ncol = 10);
  columns=c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", as.character(cancerType$Tumor_Sample_Barcode) )
  colnames(vcfdata) <- columns
  
  vcfdata[,1]=as.character(cancerType$Chromosome)
  vcfdata[,2]=as.character(cancerType$Start_Position)
  vcfdata[,4]=as.character(cancerType$Reference_Allele)
  vcfdata[,5]=as.character(cancerType$Tumor_Seq_Allele2)
  vcfdata[,9]="GT"
  vcfdata[,10]="1/0"
  #vcfdata[,3]=as.character(cancerType$Tumor_Sample_Barcode)
  
  outfile <- paste(cancer_abb,".cohort.vcf", sep="")
  #outfile = paste(directory_name,outfile, sep="/")
 # write.table(vcfdata, file=outfile, row.names=FALSE, sep="\t", quote=FALSE)
  

  
}









##############   Start  Cohort section-------------------------------------------------------------------------------
##############-------------------------------------------------------------------------------------------------------

#Read all cohort-level files and transform to G-range objects

setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files")

vcf_cohort_list <- list.files(path="C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files",recursive=TRUE,pattern="cohort.vcf")

#Optional: Select only certain cancertypes
vcf_cohort_list <- vcf_cohort_list[gsub("/.*","",vcf_cohort_list) %in% c('CHOL')] 

name_cohort <- gsub("/.*","",vcf_cohort_list)




#Read each cohort.vcf file as Grange oject
cohort_vcfs <- read_vcfs_as_granges(vcf_cohort_list,name_cohort,ref_genome)

# Create mutational matrix for each cancer type
cohort_mut_matrix = mut_matrix(cohort_vcfs,ref_genome)

#Extract mutational signatures
extract_3 = extract_signatures(cohort_mut_matrix, rank = 4) # Cant extract more signatures than number of cancer types



# Plot extracted signatures and their contribution

setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/Pictures/Cohort")

pdf("extracted_3.pdf")
plot_96_profile(extract_3$signatures)
dev.off()
pdf("extracted_contribution.pdf")
plot_contribution(extract_3$contribution, extract_3$signature, mode = "absolute", coord_flip = T)
dev.off()

####### End cohort section-------------------------------------------------------------
#######--------------------------------------------------------------------------------




#Open VCF-files and transform into G-ranges object-----------------------------------------------

#library(MutationalPatterns)

#setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files")


auto = extractSeqlevelsByGroup(species="Homo_sapiens", 
                               style="UCSC",
                               group="auto")






########## ---- Mutational signatures for each sample for one cancer type  #######

setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3")
#Load cosmic signature from file.
cosmic_signatures <- as.matrix(read.table("cosmic_signatures.txt",header=TRUE))

setwd("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/VCF_files")


#list_of_cancers <- as.vector(unique(cohort$subtype))
#for (cancer_type in list_of_cancers){}



#Single run!

file_list <- as.character(cohort$vcf[which(cohort$subtype %in% "OV")])
name_list <- as.character(cohort$sample[which(cohort$subtype %in% "OV")])

GRange_VCF <- read_vcfs_as_granges(file_list,name_list,ref_genome)
GRange_VCF_auto = lapply(GRange_VCF, function(x) keepSeqlevels(x, auto, pruning.mode='coarse'))


mut_matrix_OV <- mut_matrix(GRange_VCF_auto,ref_genome)
mut_signatures_OV <- extract_signatures(mut_matrix_OV,rank=20)


#Rank optimisation
if (FALSE){
  
  library(NMF)
  #Add pseudocount to mut matrix ?
  #mut_mat <- mut_mat + 0.0001
  estim.r <- nmf(mut_matrix_OV, rank=4:20,method="brunet", nrun=10, seed=123456)
  plot(estim.r)

}





#Set wd for saving pictures
directory_name <- paste("C:/Users/Nils_/OneDrive/Skrivbord/Data/MC3/Pictures","/","OV",sep="") 
#crate folder if it doesnt exist
ifelse(!dir.exists(file.path(directory_name)), dir.create(file.path(directory_name)), FALSE)
print (directory_name)
setwd(directory_name)
#Plot results of signature extraction
pdf("mut_signatures_OV.pdf")
plot_96_profile(mut_signatures_OV$signatures,condensed = TRUE)
dev.off()
pdf("extracted_contribution.pdf") # Mayby plot in chunks of 100? Hard to see anything
plot_contribution(mut_signatures_OV$contribution, mut_signatures_OV$signature, mode = "absolute", coord_flip = T)
dev.off()





################Download Cosmic-signatures or just use the cosmic_signature.txt file

#sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/","signatures_probabilities.txt", sep = "")
#cosmic_signatures = read.table(sp_url, sep = "\t", header = TRUE)
# Match the order of the mutation types to MutationalPatterns standard
#new_order = match(row.names(mut_matrix_OV), cosmic_signatures$Somatic.Mutation.Type)
# Reorder cosmic signatures dataframe
#cosmic_signatures = cosmic_signatures[as.vector(new_order),]
# Add trinucletiode changes names as row.names
#row.names(cosmic_signatures) = cosmic_signatures$Somatic.Mutation.Type
# Keep only 96 contributions of the signatures in matrix
#cosmic_signatures = as.matrix(cosmic_signatures[,4:33])
##save as cosmic_signatures.txt
#write.table(cosmic_signatures, file="cosmic_signatures.txt", row.names=TRUE, col.names=TRUE)
#cosmic_signatures <- read.table()


#Similarity between cosmic_signatures and mut_matrix_OV
cos_sim_samples_signatures <- cos_sim_matrix(mut_matrix_OV, cosmic_signatures)

#Test to remove sample-barcode for cleaner plot
rownames(cos_sim_samples_signatures) <- c(1:nrow(cos_sim_samples_signatures))


hclust_cosmic = cluster_signatures(cosmic_signatures, method = "average")
cosmic_order = colnames(cosmic_signatures)[hclust_cosmic$order]


pdf("Cosine_heatmap.pdf")

plot_cosine_heatmap(cos_sim_samples_signatures,
                     col_order = cosmic_order,
                     cluster_rows = TRUE)
dev.off()


#Reconstruct mut_matrix_OV by linear combination of cosmic_signatures

fit_res <- fit_to_signatures(mut_matrix_OV, cosmic_signatures)
select <- which(rowSums(fit_res$contribution) > 10)
#Plot contribution barplot

pdf("Reconstructed_mut_matrix_from_cosmic_signatures.pdf")
plot_contribution(fit_res$contribution[select,],
                       cosmic_signatures[,select],
                       coord_flip = FALSE,
                       mode = "absolute")

dev.off()

##########---------------------------------------------------------------------


  









  
NilsWeng/TCGAproject_repo documentation built on May 14, 2019, 4:07 a.m.