R/prepare_enriched_regions_for_ordering.R

Defines functions multipcf_new_breakpoints merge_enriched_regions remove_artefacts

#' Remove artefacts from the enriched regions, eg. segments near telomere, centromere, HLA region, in a few samples
#'
#' @param annotated_segments_file Full path to the file with the annotated CNA
#' @param tumour_type String that represents the type of tumour we work with
#' @param output_dir Full path to the directory where the files with the enriched regions (and the p-values) will be output
#' @param pvalues_dir Full path to the directory with the FDR-corrected p-values and the enriched regions
#' @param genome_chr_coordinates Full path to the file with the chromosome coordinates of the hg19 or hg38 build genome
#' @param min_region_choice A numeric representing the minimum length of enriched region; default = 10000
#' @param minN_choice A numeric representing the minimum number of events per segment to be used; default = 3
#' @return files with enriched regions (separate files for LOH, HD and gain), with any artefacts removed


remove_artefacts <- function(annotated_segments_file, tumour_type, output_dir, pvalues_dir, genome_chr_coordinates, min_region_choice, minN_choice){

  # LOOP TO GENERATE OUTPUT FOR ALL THREE CNA TYPES
  # load the all segments data
  allsegments <- read.table(annotated_segments_file, sep = "\t", stringsAsFactors = F, header = T)
  Ntumour = length(unique(allsegments$Tumour_Name))   # how many samples in the cohort
  cna_type  = c("Gain","LOH","HD")

  for (cna in 1:length(cna_type)){
   print(paste0(cna_type[cna], " - Removing arefacts"))	  

    # load the pvalues data for this type of CNA
    pvalues_data = read.table(paste0(pvalues_dir, tumour_type,"_samples_significant_pvals_FDR_",cna_type[cna],".txt"), header = T, stringsAsFactors = F)
    names(pvalues_data) = c("chr","sp","ep","val","no.perms.better","p.val","bonf","fdr")

    # pick only the important columns (chr, start and end position, number of segments with the region being enriched and the FDR-corrected p-value)
    pvalues_data <- data.frame("chr" = pvalues_data$chr, "sp" = pvalues_data$sp, "ep" = pvalues_data$ep, "val" = pvalues_data$val, "fdr" = pvalues_data$fdr)
    pvalues_data = pvalues_data[which(pvalues_data$fdr<0.05),]  # pick only the ones with p-value < 0.05 (they should all be < 0.05)


    if (nrow(pvalues_data)>0){
      #cumulative chromosome coordinate generation
      v = NULL
      for (i in c(1:22,"X")){
        data.chr <- pvalues_data[pvalues_data$chr==i,]
        norow <- nrow(data.chr)
        if (i=="X"){
          j=23
        } else{
          j=i
        }
        v[j] = norow    #number of segments for each chromomsome
      }
      v2 = cumsum(v)    # cumulative sum of number of segments/rows


      all.sig.data = data.frame()    # a data frame with all the ordered enriched regions from the chromosomes (chr startpos endpos) and eventindex (which is the number/index of the enriched region in this data frame)
      for (i in c(1:22,"X")){
        data.chr <- pvalues_data[pvalues_data$chr==i,]

        if (nrow(data.chr)>0){
          segnumber <- nrow(data.chr)    # number of segments for this chromosome
          tempvector = NULL
          sig.data = NULL
          no.chr=ifelse(i=="X",22,as.numeric(i)-1)
          for (j in 1:segnumber){
            k = ifelse(no.chr==0,j,v2[no.chr]+j)
            sig.data.segment <- data.frame("chr"=i, "startpos"= data.chr$sp[j], "endpos" = data.chr$ep[j], "eventindices" = k)    # put together all the enriched segments
            all.sig.data     <- rbind(all.sig.data,sig.data.segment)
            sig.data.segment = NULL
          }
        } else {print(paste0("chr",i," has no enriched segments"))}
      }
      all.sig.data$chr = as.character(all.sig.data$chr)


      enriched_data = data.frame()      # create enriched_data data frame where we are going to merge the FDR-enriched regions in each chr
      chrs = unique(all.sig.data$chr)   # all the chromosomes having enriched regions

      # load the chromosome coordinates for hg19 or hg38
      chr_loc = read.table(genome_chr_coordinates, header = T, stringsAsFactors = F)


      for (i in 1:length(chrs)){
        seg_count = nrow(enriched_data)                                 # number of included segments so far
        CHR = all.sig.data[which(all.sig.data$chr==chrs[i]),]           # pick all the data for that chr
        if (nrow(CHR)==1){
          enriched_data = rbind(enriched_data, data.frame(chr = chrs[i], startpos = CHR$startpos, endpos = CHR$endpos, length = CHR$endpos-CHR$startpos))
        } else{
          # check how close the next enriched region is: if it is withing 1e4 of the previous, merge it to the previous segment; if not, don't merge
          start = CHR$startpos[1]
          for (j in 2:nrow(CHR)){
            if (CHR$startpos[j] <= CHR$endpos[j-1]+1e4){                # change interval to appropriate level (e.g. 1e4 or 1e5) ?
              end = CHR$endpos[j]                                       # include the new row (j) in the merge
            }
            else {
              end = CHR$endpos[j-1]                                      # stop merge at the previous row (j-1)
              enriched_data = rbind(enriched_data, data.frame(chr=chrs[i], startpos = start, endpos = end, length = end-start))
              start = CHR$startpos[j]
            }
          }

          # check if the merging of the smaller enriched regions into larger has resulted in the creation of one large segment
          if (start==CHR$startpos[1] & end==CHR$endpos[nrow(CHR)]){
            enriched_data = rbind(enriched_data,data.frame(chr = chrs[i], startpos = start, endpos = end, length = end-start))                 # if all segments were adjacent to each other
          } else {
            enriched_data = rbind(enriched_data,data.frame(chr = chrs[i], startpos = start, endpos = CHR$endpos[j], length = CHR$endpos[j]-start)) # if final segment is distant from previous
          }
        }
        segs_per_chr = nrow(enriched_data) - seg_count
        print(paste0(chrs[i]," has ",segs_per_chr," enriched region(s)"))
      }
      enriched_data$chr = as.character(enriched_data$chr)

      #identify enriched regions which are at telomeres and REMOVE #
      chrom = as.vector(unique(enriched_data$chr))
      ENRICHED = data.frame()             # this data frame will store all the enriched regions which are not near centromere/telomere or in the HLA region

      for (i in 1:length(chrom)){
        indices_remove = NULL                          # this will store the indices of the segments to be removed from the enriched_data
        CHROM = enriched_data[which(enriched_data$chr==chrom[i]),]
        # not sure why we have a limit on how big the size of segment - don't we want to remove any segment that is near the telomeres??

        for (j in 1:nrow(CHROM)){
          # if end of the enriched segment is within 1Mb of the end of the chromosome and the length of the segment is less than 5Mb; remove this segment
          if (CHROM$endpos[j]>=chr_loc[ifelse(chrom[i]!="X",chrom[i],23),]$end-1e6 & CHROM$length[j]<=5e6){ # length increased to 5Mb from 2Mb
            indices_remove = append(indices_remove,j)
          }
          # if the start of the enriched region is near the beginning of the chromosome and the length is less than 2Mb; remove this segment
          if (CHROM$startpos[j]<1e6 & CHROM$length[j]<=2e6){
            indices_remove = append(indices_remove,j)
          }
        }
        if (!is.null(indices_remove)){
          print(paste("enriched segment", indices_remove, " removed from chromosome ", chrom[i]))
          CHROM = CHROM[-indices_remove,]
        }
        if (nrow(CHROM)>0){
          ENRICHED = rbind(ENRICHED,CHROM)
        }
        else {print(paste("Chromosome",chrom[i],": only telomeric end removed"))}
      }

      #identify enriched regions which are at the HLA region and remove #
      #https://www.ncbi.nlm.nih.gov/grc/human/regions/MHC?asm=GRCh37 MHC region based on hg37
      if (nrow(ENRICHED)>0){
        hla = c(28477797,33448354)        # coordinates of the hla region
        # added the last condition because the statement was missing the case where the enriched segment was covering the whole HLA region
        HLA = ENRICHED[which(ENRICHED$chr==6 & ((ENRICHED$startpos>hla[1] & ENRICHED$endpos<hla[2])|(ENRICHED$endpos>hla[1] & ENRICHED$startpos<hla[1])|(ENRICHED$startpos<hla[2] & ENRICHED$endpos>hla[2]|ENRICHED$startpos<hla[1] & ENRICHED$endpos>hla[2]))),]

        ENRICHED=setdiff(ENRICHED,HLA)    # remove any segments that are found in HLA region
      }

      # REMOVE SINGLETONS/RARE DOUBLETS when N is high:
      rare = NULL
      if (nrow(ENRICHED)>0){
        for (i in 1:nrow(ENRICHED)){
          ENRICHEDi = ENRICHED[i,]

          # pick up from the pvalues_data how many samples have each of the enriched events and remove the event if insufficient number of samples (max(3,Ntumour/100)) have it
          idata = pvalues_data[which(pvalues_data$chr==ENRICHEDi$chr & pvalues_data$sp<=ENRICHEDi$endpos & pvalues_data$ep>=ENRICHEDi$startpos),]
          if (max(idata$val) < max(minN,Ntumour/100)){
            rare = append(rare,i)
          }
        }
      }

      if (!is.null(rare)){
        ENRICHEDclean = ENRICHED[-rare,]
      } else {ENRICHEDclean = ENRICHED}

      # for noevent:
      if (nrow(ENRICHEDclean)>0){
        ENRICHEDclean = ENRICHEDclean[which(ENRICHEDclean$length>min_region),]
        ENRICHEDclean$length = NULL
        if (nrow(ENRICHEDclean)>0) {
          ENRICHEDclean$noevent = 1:nrow(ENRICHEDclean)
          write.table(ENRICHEDclean, paste0(output_dir,tumour_type,"_enriched_regions_",cna_type[cna],"_noevent.txt"), sep="\t",quote=F,row.names = F)
	}
      } else {print(paste(output_dir,tumour_type,": no file written out - no enriched region remained after correction"))}
    } else {print(paste(output_dir ,tumour_type,"_samples_significant_pvals_FDR_",cna_type[cna],".txt"," is empty",sep=""))}
  }
}


#' Merge segments overlapping the enriched regions
#'
#' @param annotated_segments_file Full path to the file with the annotated segments file
#' @param tumour_type String that represents the type of tumour we work with
#' @param enriched_dir Full path to the directory with the files with the enriched regions
#' @param output_dir Full path to the directory where the merged enriched segments and the plots of the enriched regions are output
#' @param genome_chr_coordinates Full path to the file with the chromosome coordinates of the hg19 or hg38 build genome
#' @return files with merged enriched regions (separate files for LOH, HD and gain) and plots of the segments overlapping the enriched regions


merge_enriched_regions <- function(annotated_segments_file, tumour_type, enriched_dir, genome_chr_coordinates, output_dir){

  cna_type = c("Gain","LOH","HD")
  allsegments <- read.table(annotated_segments_file, sep = "\t", stringsAsFactors = F, header = T) # load the allsegments file

  for (cna in 1:length(cna_type)){
    print(paste0(cna_type[cna], " - Merging enriched regions"))	  
	  
    CNAtype <- paste0(c("s","c"), cna_type[cna])
    cna.data <- allsegments[allsegments$CNA %in% CNAtype, ]    # pick up the subclonal and clonal events for this CNA

    #remove subclonal after clonal AND order by sample, chr and startpos
    cna.data$chr[cna.data$chr=='X'] <- 23                   # set chr X as 23
    cna.data$chr = as.numeric(cna.data$chr)
    cna.data = cna.data[!duplicated(cna.data[,1:4]),]       # remove any duplicated events

    if (file.exists(paste0(enriched_dir, tumour_type,"_enriched_regions_",cna_type[cna],"_noevent.txt"))){
      enriched = read.table(paste0(enriched_dir, tumour_type,"_enriched_regions_",cna_type[cna],"_noevent.txt"),header=T,stringsAsFactors = F,sep="\t")
      enriched$chr[enriched$chr=="X"] = 23
      enriched$chr <- as.numeric(enriched$chr)

      print("files read")


      data.table::setDT(cna.data)                                              # convert cna.data into data.table
      data.table::setDT(enriched)                                              # all the enriched segments
      data.table::setkey(cna.data,"chr","startpos","endpos")                   # sort the cna.data by chr, startpos,endpos
      data.table::setkey(enriched,"chr","startpos","endpos")                   # sort the enriched by chr, startpos, endpos

      merged = data.table::foverlaps(cna.data, enriched, type = "any", nomatch = 0)     # find the overlaps between the cna.data and the enriched regions (will return only the matches)
      # merged is in the format first four columns corresponding to/being the same as the enriched regions intervals and the number of the event; then a column with the tumour names and columns with interval for
      # that tumour name overlapping this enriched region (with i.startpos and i.endpos being the start/end of the segment) and then the copy number information for this segment

      # remove regions outside of ENRICHED REGIONS - this is removing any parts of the segment overlapping the enriched region that are outside the enriched region
      merged$i.startpos = ifelse(merged$i.startpos<merged$startpos, merged$startpos, merged$i.startpos)        # i.startpos and i.endpos - the start and end of the segment overlapping with the enriched region
      merged$i.endpos = ifelse(merged$i.endpos>merged$endpos, merged$endpos, merged$i.endpos)
      print("files merged")

      # add 'y' to plot the subclones segments in the enriched regions
      tumour_names = unique(merged$Tumour_Name)
      for (i in 1:length(tumour_names)){
        for (j in 1:nrow(merged)){
          if (merged$Tumour_Name[j]==tumour_names[i]){
            merged$y[j] = i
          }
        }
      }

      # all chromosomes that have enriched regions
      chrs = unique(merged$chr)

      # load the chromosome coordinates for hg19 or hg38
      chr_loc = read.table(genome_chr_coordinates, header = T, stringsAsFactors = F)

      for (k in 1:length(chrs)){
        chr_enriched_regions = merged[which(merged$chr==chrs[k]),]   # plot the segments in the enriched regions
        enriched_regions_plot = ggplot2::ggplot() +
          ggplot2::geom_segment(data = chr_enriched_regions, aes(x = i.startpos, y = y, xend = i.endpos, yend = y, col = Tumour_Name))+
          ggplot2::theme(legend.position = "NONE") +
          ggplot2::ggtitle(paste0(tumour_type, cna_type[cna]," - Enriched regions chr",chrs[k]))+
          ggplot2::expand_limits(y = 0, x = 0)+
          ggplot2::geom_vline(xintercept = chr_loc[chr_loc$chr==chrs[k],]$end, col = "black", linetype = "longdash")  # plot centromere/telomere

        # save the plot for this enriched region
        cowplot::save_plot(paste0(output_dir, tumour_type,"_enriched_",cna_type[cna],"_chr",chrs[k],"_cowplot.pdf"), plot = enriched_regions_plot)

      }
        write.table(merged, paste0(output_dir, tumour_type,"_segments_in_enriched_regions_",cna_type[cna],".txt"), sep = "\t", quote = F,row.names = F)

      } else {print(paste0(paste0(output_dir,tumour_type,"_enriched_regions_",cna_type[cna],"_noevent.txt",sep="")," does not exist",sep=""))}
    }
  }

#' Check if any new breakpoints should be added to the enriched regions
#'
#' @param enriched_dir Full path to the directory with the files with the enriched regions
#' @param tumour_type String that represents the type of tumour we work with
#' @param output_dir Full path to the directory where the enriched segments with the new breakpoints and the plots of start and end positions of the enriched regions are output
#' @return files with merged enriched regions (separate files for LOH, HD and gain) and plots of the segments overlapping the enriched regions

multipcf_new_breakpoints <- function(enriched_dir, tumour_type, output_dir){

  cna_type=c("Gain","LOH","HD")

  for (cna in 1:length(cna_type)){
    print(paste0(cna_type[cna], " - Checking for new breakpoints"))	  

    if (file.exists(paste0(enriched_dir, tumour_type,"_segments_in_enriched_regions_",cna_type[cna],".txt"))){
      # load the enriched data for this CNA type
      enriched_regions = read.table(paste0(enriched_dir, tumour_type,"_segments_in_enriched_regions_",cna_type[cna],".txt"), header = T, stringsAsFactors = F)
      enriched_regions = enriched_regions[!duplicated(enriched_regions[,c("Tumour_Name","i.startpos","i.endpos")]),] # remove any subclonal after clonal e.g. 20% 2:1, 80% 2:2 -> 100% 2:1 gain


      # adding weighted CCF to all segments in same individual
      enriched_regions$diff = enriched_regions$i.endpos - enriched_regions$i.startpos
      enriched_regions = enriched_regions[which(enriched_regions$diff>1),]               # remove single bp segments
      chrs = unique(enriched_regions$chr)


      # VISUALISE SEGMENT START POINTS#
      title = paste(tumour_type,"- CNA",cna_type[cna],"patterns")
      all_enriched_segments = data.frame()
      for (i in 1:length(chrs)){
        CHR = enriched_regions[which(enriched_regions$chr==chrs[i]),]
        sample_names = unique(CHR$Tumour_Name)   # all the samples that have data for this chromosome
        for (j in 1:length(sample_names)){
          sample_data = CHR[which(CHR$Tumour_Name==sample_names[j]),]
          sample_data$w.mean = weighted.mean(sample_data$frac1_A,sample_data$diff)   # ccf for the event; diff is the length of the segment
          all_enriched_segments = rbind(all_enriched_segments, sample_data)
        }
      }

      enriched_segments_plot = ggplot2::ggplot() + ggplot2::geom_point(data = all_enriched_segments, aes(x = i.startpos, y = w.mean, col = Tumour_Name)) + ggplot2::theme(legend.position = "NONE") +
        ggplot2::facet_grid(chr~.) +
        ggplot2::labs(x = "Chromosome Position")
      cowplot::save_plot(paste0(output_dir, tumour_type,"_CNA_",cna_type[cna],"_patterns.pdf"), plot = enriched_segments_plot)


      # pivot table to prepare for multiPCF  #
      #  the data frame input for multiPCF should be in the following format:
      # the rows of the data frame should be the probes
      # the columns: column 1 = numeric/character chromosome numbers
      #              column 2 = the numeric local probe coordinates
      #              subsequent columns = the copy number measurements for 2 or more samples
      # the header should contain the sample IDs

      print("multiPCF input generation")
      all_enriched_segments$ID = paste(all_enriched_segments$chr, all_enriched_segments$i.startpos,sep = "_")        # add ID to the segments overlapping the enriched regions which are the chromosome and the starting position of this segment

      dc = dcast(all_enriched_segments, ID~Tumour_Name, value.var="w.mean")   # restructure the data for multipcf; the IDs of the events are in the first column, and the patient platekeys are in the subsequent columns; the matrix is filled with the CCF of the event for the corresponding sample
      dc[is.na(dc)] = 0

      out = dc %>% separate(ID, into = c("chr","startpos"),sep = "\\_")                    # split the ID column into 2 - chromosome and starting position
      out = out %>% mutate_if(is.character,as.numeric) %>% arrange(chr,startpos)           # sort by chromosome and position

      # run multiPCF
      mp = multipcf(out, gamma = 0.1) # gamma 10 or 40 = same breakpoints
      mp = mp[order(mp$chrom,mp$arm,mp$start.pos),]

      # save the multipcf output as the new enriched regions output
      ENRICHED = data.frame(chr = mp$chrom, startpos = mp$start.pos, endpos = mp$end.pos, noevent = 1:nrow(mp))
      write.table(ENRICHED, paste(output_dir, tumour_type,"_enriched_regions_",cna_type[cna],"_noevent.txt"), sep = "\t", quote = F, row.names = F)
    } else {print(paste(output_dir, tumour_type,"_segments_in_enriched_regions_",cna_type[cna],".txt","does not exist",sep=""))}
  }
}



#' Process the enriched data into the format required for the ordering step with the Plackett-Luce model
#'
#' @param annotated_segments_file Full path to the file with the annotated CNA
#' @param tumour_type String that represents the type of tumour we work with
#' @param enriched_dir Full path the directory with the enriched regions
#' @param genome_chr_coordinates Full path to the file with the chromosome coordinates of the hg19 or hg38 build genome
#' @param output_dir Full path to the directory where the files with the enriched regions will be output
#' @return files with enriched regions (separate files for LOH, HD and gain) in the format for the ordering script
prepare_ordering_data <- function(annotated_segments_file, tumour_type, enriched_dir, genome_chr_coordinates, output_dir){
  # load the allsegments data
  allsegments <- read.table(annotated_segments_file, sep = "\t", stringsAsFactors = F, header = T)
  cna_type    <- c("LOH", "Gain", "HD")

  for (cna in 1:length(cna_type)){
    print(paste0(cna_type[cna], " - Preparing ordering data"))	  
    CNAtype  <- paste0(c("s","c"), cna_type[cna])
    cna.data <- allsegments[allsegments$CNA %in% CNAtype, ]   # get all the clonal and subclonal events for this type of CNA

    # remove subclonal after clonal AND order by sample, chr and startpos cna.data$chr[is.na(cna.data$chr)]=23
    cna.data$chr = as.numeric(cna.data$chr)
    cna.data$chr[is.na(cna.data$chr)] = 23
    cna.data = cna.data[!duplicated(cna.data[,1:4]), ]

    if (file.exists(paste0(enriched_dir, tumour_type,"_enriched_regions_",cna_type[cna],"_noevent.txt"))){
      # load the enriched regions for this type of CNA
      enriched = read.table(paste0(enriched_dir, tumour_type,"_enriched_regions_",cna_type[cna],"_noevent.txt"), header = T, stringsAsFactors = F, sep = "\t")
      enriched$chr[enriched$chr=="X"] = 23
      enriched$chr = as.numeric(enriched$chr)
      print("files read")

      # convert cna.data and enriched to data.tables and order by chromosome, startpos and endpos
      data.table::setDT(cna.data)
      data.table::setDT(enriched)
      data.table::setkey(cna.data,"chr","startpos","endpos")
      data.table::setkey(enriched,"chr","startpos","endpos")

      # find the overlap between the cna.data and the enriched
      merged = data.table::foverlaps(cna.data, enriched, type = "any", nomatch = 0)

      # remove regions outside of ENRICHED REGIONS
      merged$i.startpos = ifelse(merged$i.startpos < merged$startpos, merged$startpos, merged$i.startpos)
      merged$i.endpos = ifelse(merged$i.endpos > merged$endpos, merged$endpos, merged$i.endpos)
      print("files merged")

      # add 'y' for geom_segment() to plot the segments covering each of the enriched regions following the multipcf step
      tumour.names = unique(merged$Tumour_Name)
      for (i in 1:length(tumour.names)){
        for (j in 1:nrow(merged)){
          if (merged$Tumour_Name[j]==tumour.names[i]){
            merged$y[j]=i
          }
        }
      }

      # plot the segments in each enriched region for all chromosomes
      chrs = unique(merged$chr)
      # load the chromosome coordinates for hg19 or hg38
      chr_loc = read.table(genome_chr_coordinates, header = T,stringsAsFactors = F)

      for (i in 1:length(chrs)){
        chr_enriched_regions=merged[which(merged$chr==chrs[i]),]
        enriched_regions_plot = ggplot2::ggplot() + ggplot2::geom_segment(data=chr_enriched_regions, aes(x = i.startpos, y = y, xend = i.endpos, yend=y, col = Tumour_Name)) +
          ggplot2::theme(legend.position = "NONE") +
          ggplot2::ggtitle(paste(tumour_type, cna_type[cna]," - Enriched regions chr",chrs[i])) +
          ggplot2::expand_limits(y=0,x=0) +
          ggplot2::geom_vline(xintercept = chr_loc[chr_loc$chr==chrs[i],]$end, col = "black", linetype = "longdash")

        # save the plot
        cowplot::save_plot(paste0(output_dir, tumour_type,"_enriched_", cna_type[cna],"_chr",chrs[i],"cowplot_postFDR4.pdf"), plot = enriched_regions_plot)
      }

      # Prepare the input for the ordering script
      merged_enriched_regions = merged
      merged_enriched_regions = merged_enriched_regions[!duplicated(merged_enriched_regions[,c("Tumour_Name","i.startpos","i.endpos")]), ] # remove any subclonal after clonal e.g. 20% 2:1, 80% 2:2 -> 100% 2:1 gain

      # adding weighted CCF to all segments in same individual
      merged_enriched_regions$diff = merged_enriched_regions$i.endpos - merged_enriched_regions$i.startpos
      merged_enriched_regions = merged_enriched_regions[which(merged_enriched_regions$diff>1),]      # remove single bp segments

      chrs = unique(merged_enriched_regions$chr)

      data_for_ordering = data.frame()
      for (i in 1:length(chrs)){
        CHR = merged_enriched_regions[which(merged_enriched_regions$chr==chrs[i]),]    # pick the data for this chromosome
        sample_names = unique(CHR$Tumour_Name)                                         # all the sample names with data for this chromosome
        for (j in 1:length(sample_names)){
          sample_data = CHR[which(CHR$Tumour_Name==sample_names[j]),]                               # pick the info for this sample
          sample_data_weighted = data.frame(sample_data[1,c(5,1:3,8:9,12:13,4)], w.mean = weighted.mean(sample_data$frac1_A,sample_data$diff)) # weigh the ccf
          data_for_ordering = rbind(data_for_ordering, sample_data_weighted)
        }
      }

      # save the table
      write.table(data_for_ordering, paste0(output_dir, tumour_type,"_",cna_type[cna],"_mergedsegs.txt"), sep= "\t", row.names = F, quote = F)
    } else {print(paste(output_dir, tumour_type,"_enriched_regions_",cna_type[cna],"_noevent.txt"," does not exist",sep=""))}
  }

}
mairenileath/PLTimeR documentation built on Dec. 21, 2021, 1:44 p.m.