data-raw/pcawg_sv.R

# The location of each data file downloaded from the PCAWG project.

# File that contains sample identifiers and the file names for variant calls:
# http://dcc.icgc.org/releases/PCAWG/donors_and_biospecimens/pcawg_sample_sheet.tsv
#
# File that links sample identifiers with cancer type
# https://dcc.icgc.org/releases/PCAWG/clinical_and_histology/pcawg_specimen_histology_August2016_v9.xlsx
#
# Files that contain stuctural variant calls on a per sample basis, separated by ICGC and TCGA
# https://dcc.icgc.org/releases/PCAWG/consensus_sv/final_consensus_sv_bedpe_passonly.icgc.public.tgz
# https://dcc.icgc.org/releases/PCAWG/consensus_sv/final_consensus_sv_bedpe_passonly.tcga.public.tgz
# (After downloading these two files I unzipped them)

# Merging SV callsets across TCGA and ICGC and annotating them
# with a sample ID and a cancer type

# Getting the files paths to the SV calls
projects <- c("tcga", "icgc")
sv_files <- list.files(file.path("/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/raw_data", projects, "open"),
                       pattern = ".gz$", full.names = TRUE)

# Adding the SV calls from the 2748 samples into a single table
rm("svs"); rm("ids")
for (i in 1:length(sv_files)) {
  print(i)
  sv <- read.table(sv_files[i], header = TRUE, stringsAsFactors = FALSE)
  id <- rep(gsub("\\..+", "", basename(sv_files[i])), nrow(sv))

  if (!exists("svs") & !exists("ids")) {
    svs <<- sv
    ids <<- id
  } else {
    svs <- rbind(svs, sv)
    ids <- c(ids, id)
  }
}

# Linking file name to sample ID
sample_sheet <- read.table("/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/raw_data/pcawg_sample_sheet.tsv",
                           header = TRUE, stringsAsFactors = FALSE, sep = "\t")
fname_hits <- match(ids, sample_sheet$aliquot_id)
donor_id <- sample_sheet$donor_unique_id[fname_hits]

# Linking sample ID to tumor type
histology_sheet <- readxl::read_excel("/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/raw_data/pcawg_specimen_histology_August2016_v9.xlsx")
hist_hits <- match(donor_id, histology_sheet$donor_unique_id)
hist_type <- histology_sheet$histology_tier2[hist_hits]


# Combining donor id, histology type, and SV calls into a single table
out.df <- cbind(data.frame(donor_id = donor_id,
                           donor_tissue = hist_type),
                svs)
write.table(out.df, file = "/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/processed_data/pcawg_sv.txt",
            row.names = FALSE, col.names = TRUE, sep = "\t", quote = FALSE)


library(GenomicRanges)
bp1.gr <- GRanges(seqnames = paste0("chr", out.df$chrom1), ranges = as.character(out.df$end1), strand = out.df$strand1)
bp2.gr <- GRanges(seqnames = paste0("chr", out.df$chrom2), ranges = as.character(out.df$end2), strand = out.df$strand2)
pcawg_sv <- bp1.gr
pcawg_sv$linked.to <- bp2.gr
save(pcawg_sv, file = "/dcl01/scharpf1/data/dbruhm/delfi_followup/split-reads/pcawg_analysis/data/processed_data/pcawg_sv.rda")
cancer-genomics/plasmasv documentation built on May 15, 2020, 11:35 a.m.