# The location of each data file downloaded from the PCAWG project.
# File that contains sample identifiers and the file names for variant calls:
# http://dcc.icgc.org/releases/PCAWG/donors_and_biospecimens/pcawg_sample_sheet.tsv
#
# File that links sample identifiers with cancer type
# https://dcc.icgc.org/releases/PCAWG/clinical_and_histology/pcawg_specimen_histology_August2016_v9.xlsx
#
# Files that contain stuctural variant calls on a per sample basis, separated by ICGC and TCGA
# https://dcc.icgc.org/releases/PCAWG/consensus_sv/final_consensus_sv_bedpe_passonly.icgc.public.tgz
# https://dcc.icgc.org/releases/PCAWG/consensus_sv/final_consensus_sv_bedpe_passonly.tcga.public.tgz
# (After downloading these two files I unzipped them)
# Merging SV callsets across TCGA and ICGC and annotating them
# with a sample ID and a cancer type
# Getting the files paths to the SV calls
projects <- c("tcga", "icgc")
sv_files <- list.files(file.path("/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/raw_data", projects, "open"),
pattern = ".gz$", full.names = TRUE)
# Adding the SV calls from the 2748 samples into a single table
rm("svs"); rm("ids")
for (i in 1:length(sv_files)) {
print(i)
sv <- read.table(sv_files[i], header = TRUE, stringsAsFactors = FALSE)
id <- rep(gsub("\\..+", "", basename(sv_files[i])), nrow(sv))
if (!exists("svs") & !exists("ids")) {
svs <<- sv
ids <<- id
} else {
svs <- rbind(svs, sv)
ids <- c(ids, id)
}
}
# Linking file name to sample ID
sample_sheet <- read.table("/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/raw_data/pcawg_sample_sheet.tsv",
header = TRUE, stringsAsFactors = FALSE, sep = "\t")
fname_hits <- match(ids, sample_sheet$aliquot_id)
donor_id <- sample_sheet$donor_unique_id[fname_hits]
# Linking sample ID to tumor type
histology_sheet <- readxl::read_excel("/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/raw_data/pcawg_specimen_histology_August2016_v9.xlsx")
hist_hits <- match(donor_id, histology_sheet$donor_unique_id)
hist_type <- histology_sheet$histology_tier2[hist_hits]
# Combining donor id, histology type, and SV calls into a single table
out.df <- cbind(data.frame(donor_id = donor_id,
donor_tissue = hist_type),
svs)
write.table(out.df, file = "/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/processed_data/pcawg_sv.txt",
row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)
library(GenomicRanges)
bp1.gr <- GRanges(seqnames = paste0("chr", out.df$chrom1), ranges = as.character(out.df$end1), strand = out.df$strand1)
bp2.gr <- GRanges(seqnames = paste0("chr", out.df$chrom2), ranges = as.character(out.df$end2), strand = out.df$strand2)
pcawg_sv <- bp1.gr
pcawg_sv$linked.to <- bp2.gr
save(pcawg_sv, file = "/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/processed_data/pcawg_sv.rda")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.