R/FrankellA.functions.R

Defines functions extract_ccf_table correct.specific.ps cal.Subject.accross.regions rename.TxIDs.in.mutid trx_rename.fn integrate.mutTables extract.tree.mutTable Overlay.tree.parents mceiling mfloor mround add.cummulative.start.end make.axis.adjust.cummulative

Documented in add.cummulative.start.end cal.Subject.accross.regions correct.specific.ps extract_ccf_table extract.tree.mutTable integrate.mutTables make.axis.adjust.cummulative mceiling mfloor mround Overlay.tree.parents rename.TxIDs.in.mutid trx_rename.fn

### Fucntions which I find useful in mulitple different projects/analyses ###


#' Function to make a cummulative x axis for chromosomes across the genome
#'
#'
#'
#'
#' @export
make.axis.adjust.cummulative <- function(input.with.cummulative=NA, chrlength=NA){
  
  
  if(!all(is.na(input.with.cummulative))){
  chrname <- names(input.with.cummulative)[names(input.with.cummulative) %in% c("Chromosome","chromosome","Chr","chr","chromosome_name","seqnames")][1]

  #if using X chromosome relapse with orderable numberic
  chromosomes <- as.character(input.with.cummulative[, chrname])
  chromosomes[chromosomes == "X"] <- 23
  chromosomes <- as.numeric(chromosomes)
  
  means <-sapply(unique(chromosomes), function(chr) ((max(input.with.cummulative[chromosomes==chr,"End.cummulitive"]) - min(input.with.cummulative[chromosomes==chr,"Start.cummulitive"])) / 2) + min(input.with.cummulative[chromosomes==chr,"Start.cummulitive"]) )
  out <- data.frame(chr = unique(input.with.cummulative[, chrname]),
                    center = means)
}
  if(!all(is.na(chrlength))){
    chrlengths.cum <- sapply(1:length(chrlength), function(i) sum(chrlength[1:i]))
    means <- sapply(1:length(chrlength), function(i){
      if(i==1) lastchr <- 1 else lastchr <- chrlengths.cum[i-1]
      return(((chrlengths.cum[i] - lastchr)/2) + lastchr)
    })
    out <- data.frame(chr = names(chrlength),
                      center = means)
  }
  return(out)
}

#' add cummulaative start / end columns too order the genome chr1 -> chrX
#'
#'
#'
#'
#' @export
add.cummulative.start.end <- function(input, add.GrCh37.scale=TRUE){
  
  chrname <- names(input)[names(input) %in% c("Chromosome","chromosome","Chr","chr","chromosome_name","seqnames")][1]
  startname <- names(input)[names(input) %in% c("start","Start","startpos","Startpos","start_position")][1]
  endname <- names(input)[names(input) %in% c("end","End","endpos","Endpos","end_position")][1]
  
  #if using X chromosome relapse with orderable numberic
  chromosomes <- as.character(input[, chrname])
  chromosomes[chromosomes == "X"] <- 23
  chromosomes <- as.numeric(chromosomes)
  
  input <- input[order(as.numeric(chromosomes)),]
  
  #add bin.index
  input$bin.index <- 1:nrow(input)
  

  #get rid of factors
  input <- as.data.frame(apply(input,2,as.character),stringsAsFactors=F)
  input[,2:3] <- as.data.frame(apply(input[,2:3],2,as.numeric),stringsAsFactors=F)
  
  if(add.GrCh37.scale){
    load(file=paste0(home,"/proj-tracerx-lung/tctProjects/frankella/Tx_exome/Inputs/GrCh37.scale.rda"))
  } 
  if(!add.GrCh37.scale){
    scale <- lapply(unique(chromosomes), function(chr) max(input[chromosomes==chr,endname]))
    names(scale) <- unique(chromosomes)
  }
  
  nCHR <- length(scale)
  input$Start.cummulitive <- NA
  s <- 0
  nbp <- c()
  for (chr in as.numeric(names(scale))){
    nbp[chr] <- scale[[which(names(scale)==chr)]]
    input[chromosomes == chr,"Start.cummulitive"] <- input[chromosomes == chr,startname] + s
    s <- s + nbp[chr]
  }
  input$End.cummulitive <- NA
  s <- 0
  nbp <- c()
  for (chr in as.numeric(names(scale))){
    nbp[chr] <- scale[[which(names(scale)==chr)]]
    input[chromosomes == chr,"End.cummulitive"] <- input[chromosomes == chr,endname] + s
    s <- s + nbp[chr]
  }
  
  return(input)
  
}


#' function to round to a specific base 
#'
#'
#'
#'
#' @export
mround <- function( x, base ) base * round( x / base )

#' function to roun down to a specific base 
#'
#'
#'
#'
#' @export
mfloor <- function( x, base ) base * floor( x / base )

#' function to round up to a specific base 
#'
#'
#'
#'
#' @export
mceiling <- function( x, base ) base * ceiling( x / base )


#' function to overlay tree information onto a TRACERx mutTable
#'
#'
#'
#'
#' @export
Overlay.tree.parents <- function(mutTable, trees, clone.field, patient.field){
  mutTable$Tree.parent <- NA
  pats <- unique(mutTable[,patient.field])
  pat.clone.table <- do.call(rbind,lapply(pats, function(pat){
    print(pat)
    if( !any(names(trees) == pat)) return(NULL)
    pat.tree <- trees[[which(names(trees)==pat)]]
    pat.clones <- unique(mutTable[mutTable[,patient.field]==pat,clone.field])
    pat.tree <- remove.clones.on.tree(pat.tree,clones.to.keep = pat.clones)
    pat.tree <- logically.order.tree(pat.tree)
    
    if(class(pat.tree)=="character" | class(pat.tree)=="numeric") pat.tree <- matrix(pat.tree,ncol = 2,byrow = TRUE)
    root <-pat.tree[1,1]
    out <- data.frame(SampleID = pat, parent = c("root",pat.tree[,1]), daughter = c(root,pat.tree[,2]),stringsAsFactors = F)
    
    if(all(as.character(pat.tree)==root)){
      out <- data.frame(SampleID = pat, parent = "root", daughter = root,stringsAsFactors = F)
    }
    return(out)
    
  }))
  
  order.change <- order(paste(mutTable[,patient.field],mutTable[,clone.field]))
  mutTable <- mutTable[order.change,]
  
  #overlay
  mutTable$Tree.parent <- unlist(lapply(unique(paste(mutTable[,patient.field],mutTable[,clone.field])), function(pat.clone){
    mut.no <- sum( paste(mutTable[,patient.field],mutTable[,clone.field]) ==  pat.clone)
    parent <- pat.clone.table[paste(pat.clone.table$SampleID,pat.clone.table$daughter) == pat.clone,"parent"]
    if(length(parent)==0) parent <- NA
    out <- rep(parent,mut.no)
    return(out)
  }))
  
  #reverse order change
  mutTable <- mutTable[match(1:nrow(mutTable),order.change),]
  
  return(mutTable)
  
}

#' function to exact tree that hase been overlaid onto a TRACERx mutTable
#'
#'
#'
#'
#' @export
extract.tree.mutTable <- function(mutTable, clone.field, patient.field=NA, parent.field){
  if(!is.na(patient.field)){
    pats <- unique(mutTable[,patient.field])
    
    out <- lapply(pats, function(pat){
      tree <- as.matrix(mutTable[mutTable[,patient.field]==pat ,c(parent.field,clone.field)])
      
      tree <- tree[!duplicated(tree[,2]),]
      if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
      
      tree <- tree[!(is.na(tree[,2])| is.na(tree[,1])),]
      if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
      
      if(any(!tree[,1]=="root")){
        tree <- tree[!tree[,1]=="root",]
      } else {
        tree[tree[,1]=="root",1] <- tree[tree[,1]=="root",2]
      }
      if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
      
      tree <- apply(tree,2,as.numeric)
      if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
      
      tree <- apply(tree,2,as.character)
      if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
      
      colnames(tree) <- NULL
      return(tree)
    })
    names(out) <- pats
    
  } else {
    
    tree <- as.matrix(mutTable[ ,c(parent.field,clone.field)])
    
    tree <- tree[!duplicated(tree[,2]),]
    if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
    
    tree <- tree[!(is.na(tree[,2])| is.na(tree[,1])),]
    if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
    
    if(any(!tree[,1]=="root")){
      tree <- tree[!tree[,1]=="root",]
    } else {
      tree[tree[,1]=="root",1] <- tree[tree[,1]=="root",2]
    }
    if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
    
    tree <- apply(tree,2,as.numeric)
    if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
    
    tree <- apply(tree,2,as.character)
    if(class(tree)=="character" | class(tree)=="numeric") tree <- matrix(tree,ncol = 2,byrow = TRUE)
    
    colnames(tree) <- NULL
    out <- tree
  }
  
  
  return(out)
}


#mutTableAllpriority will keep all its mutations and those mutations in mutTableAllextra that are duplicated in mutTableAllpriority will be removed

#' function to merge MutTables ensuring the sampleID formats etc are consistent 
#'
#' 
#'
#'
#' @export
integrate.mutTables <- function(mutTableAllpriority,mutTableAllextra){
  
  #match IDs
  mutTableAllpriority$SampleID <- trx_rename.fn(mutTableAllpriority$SampleID)
  mutTableAllextra$SampleID <- trx_rename.fn(mutTableAllextra$SampleID)
  
  mutTableAllpriority$mutation_id <- rename.TxIDs.in.mutid(mutTableAllpriority$mutation_id)
  mutTableAllextra$mutation_id <- rename.TxIDs.in.mutid(mutTableAllextra$mutation_id)
  
  #remove dups in extra
  mutTableAllextra <- mutTableAllextra[!mutTableAllextra$mutation_id %in% unique(mutTableAllpriority$mutation_id),]
  
  #full join (missing cols in present in the other will be NA)
  suppressPackageStartupMessages(library(dplyr))
  
  mutTableAllout <- suppressMessages(full_join(mutTableAllpriority,mutTableAllextra))

  return(mutTableAllout)  
  
}

#' Function to rename any TRACERx patient ids to the latest version
#'
#'
#'
#'
#' @export
trx_rename.fn <- function(trxid, trialID='LTX'){
  new_trxid <- as.character(trxid)
  #Includes hospital id? 
  hospital<-grepl('\\w_', substr(trxid[1],1,2))
  if(hospital){
    new_trxid <- substring(new_trxid,3)
  }
  new_trxid <- as.numeric(sub(trialID, '', new_trxid))
  new_trxid <- sprintf("%04d", new_trxid)
  new_trxid <- paste(trialID, new_trxid,sep='')
  return(new_trxid)
}


#'  function to rename TRACERx mutation ids with correct most up to datte patient ids
#'
#'
#'
#'
#' @export
rename.TxIDs.in.mutid <- function(mutids) {
  seperatedrownames <- strsplit(mutids,split=":")
  return(
    paste( trx_rename.fn( sapply( seperatedrownames,function(x) x[1])),
           sapply( seperatedrownames,function(x){ paste(x[2:4], collapse  = ":") }),
           sep = ":")
  )
}


#' Function to adjust any quantification (subject) for cellularity (cell) and then average accross regions to get 1 quantification per tumour
#'
#'
#'
#'
#' @export
cal.Subject.accross.regions <- function(SubjectbyRegion,CellbyRegion){
  if(is.na(SubjectbyRegion)){
    
    return(NA)
    
  }
  
  #Extract subject in each region
  Subject <- data.frame(Subject = sapply(strsplit(strsplit(SubjectbyRegion,split=";")[[1]],split = ":"), function (x) x[2]),
                        Region = as.character(sapply(strsplit(strsplit(SubjectbyRegion,split=";")[[1]],split = ":"), function (x) x[1])),stringsAsFactors = F)
  #Extract cellularity in each region
  Cell <- data.frame(Cellularity = sapply(strsplit(strsplit(CellbyRegion,split=";")[[1]],split = ":"), function (x) x[2]),
                     Region = as.character(sapply(strsplit(strsplit(CellbyRegion,split=";")[[1]],split = ":"), function (x) x[1])),stringsAsFactors = F)
  
  #someimtes a "-" is used instrad of a "." seperater in region names - make consistant either way
  Subject$Region <- gsub("\\-","\\.",Subject$Region)
  
  #remove NAs & where no cellularity is availible
  nodata.region <- Cell[Cell$Cellularity == "Region.Absent.Cellularity.Data", "Region"]
  if(length(nodata.region)>0){
    Cell <- Cell[ !Cell$Region == nodata.region, ]
    Subject <- Subject[ !Subject$Region == nodata.region, ]
  }
  
  #if no data left after removal of NAs simpy return NA
  if(nrow(Cell) == 0 | nrow(Subject) == 0){
    
    return(NA)
    
  } 
  
  #make remaining data numeric
  Cell$Cellularity <- as.numeric(Cell$Cellularity)
  Subject$Subject <- as.numeric(Subject$Subject) 
  
  #if we have extra regions in ASCAT output but not in Muttable (failed mutect eg)
  Cell <- Cell[Cell$Region %in% Subject$Region,]
  #also remove for now if not in ASCAT (perhasp failed for now) but in muttable
  Subject <-Subject[Subject$Region %in% Cell$Region,]
  
  #reorder 
  Cell <- Cell[match( Cell$Region, Subject$Region ),]
  
  #Calcuate the fraction of total tumour cells in each sample, adjusted by if tumour cells spread evenly accross all regions (ie 1.1 = if 10% more tumour cells here than would be expected in even spread)
  adjCell <- Cell$Cellularity / mean(Cell$Cellularity)
  
  #estimate meanSubject accross whole tumour
  return ( mean( Subject$Subject * adjCell ) ) 
  
}

#' Function to FDR correct a set of p-values even when duplicates exist and only a subset is being considered for correction 
#'
#'
#'
#'
#' @export
correct.specific.ps <- function(table,field.to.deduplicate=NA,field.to.correct, indices.to.correct = "all",method ="BH"){
  
  if(all(indices.to.correct =="all")){
    indices.to.correct <- rep(TRUE,nrow(table))
  }
  
  #check classes are correct
  table[,field.to.deduplicate] <- as.character(table[,field.to.deduplicate])
  table[,field.to.correct] <- as.numeric(table[,field.to.correct])
  
  #need to ensure all dedup fields ordered together (makes code a lot faster) then can restore order at the end
  order.change <- order(paste(table[,field.to.deduplicate]))
  table <- table[order.change,]
  indices.to.correct <- indices.to.correct[order.change]
  
  if(!is.na(field.to.deduplicate)){
    table.filtered <- table[indices.to.correct,]
    ps.to.correct <- table.filtered[!duplicated(table.filtered[,field.to.deduplicate]),field.to.correct]
    unique.dedups <- table.filtered[!duplicated(table.filtered[,field.to.deduplicate]),field.to.deduplicate]
  } else {
    ps.to.correct <- table[indices.to.correct,field.to.correct]
  }
  
  if(grepl("\\.p$|_p$",field.to.correct)) new.field.name <- gsub("\\.p$|_p$",".q",field.to.correct) else new.field.name <- paste0(field.to.correct,".q")
  
  table[,new.field.name] <- NA
  ps.corrected <- p.adjust(ps.to.correct, n = length(ps.to.correct), method = method)
  
  if(!is.na(field.to.deduplicate)){
    table[,new.field.name][indices.to.correct] <- round(unlist(lapply(1:length(ps.corrected), function(i) rep(ps.corrected[i],sum(table[,field.to.deduplicate][indices.to.correct] == unique.dedups[i])))),digits=4)
  } else {
    table[,new.field.name][indices.to.correct] <- ps.corrected
  }
  
  #restore order
  table <- table[match(1:nrow(table),order.change),]
  
  #make the q column next to the p column
  field.to.correct.col <- which(names(table)==field.to.correct)
  table <- table[,c(1:field.to.correct.col,ncol(table),(field.to.correct.col+1):(ncol(table)-1))]
  
  return(table)
  
}

#' Function to extract CCF tables from a TRACERx mutTable containing many samples
#'
#'
#'
#'
#' @export
extract_ccf_table <- function( muttable_df, ccf_col = "PhyloCCF_MPhase", sampleid_col = "tumour_id", cluster_col = "PyCloneCluster_MPhase", driver_col = "DriverMut" ){
  
  ## malnipulate as data/table bt return to original class (tibble, data.frame) at the end ##
  
  orig_class <- class(muttable_df)
  muttable_df <- data.table::as.data.table(muttable_df)
  
  ## which samples in the table have data in CCf column ##
  
  samples <- muttable_df[ !is.na( get(ccf_col) ), unique( get(sampleid_col) ) ] 
  
  ## output list of CCF tables for ech sample ##
  
  out <- lapply(samples, function(sample){
    
    muttable_dfsamp <- muttable_df[ get(sampleid_col) == sample ]
    
    CCFs <- muttable_dfsamp[ !is.na( get(cluster_col) ), .(get(cluster_col), get(ccf_col)) ]
    setnames(CCFs, c("cluster", "CCFs")) 
    
    ccf_by_region <- data.table::as.data.table( do.call(rbind, lapply( strsplit(CCFs[, CCFs],split=";"), function(x) sapply(strsplit(x,split = ":"), function (x) x[2]))) )
    data.table::setnames(ccf_by_region, sapply(strsplit( strsplit(CCFs[, CCFs],split=";")[[1]],split = ":"), function (x) x[1]) )
    
    ccf_by_region <- apply( ccf_by_region, 2, as.numeric)
    
    CCFs <- cbind( CCFs, ccf_by_region)
    
    region_cols <- names(CCFs)[3:ncol(CCFs)]
    
    meanCCF <- rowMeans( CCFs[, ..region_cols ] )
    
    CCFs[, meanCCF := meanCCF ]
    
    mean_region_cols <- paste0("mean", region_cols)
    
    ## couldn't find a good data table wy of doing all regiono cols at once
    for( i in 1:length(region_cols) )  CCFs[, (mean_region_cols[i]) := mean( get(region_cols[i])), by = cluster, ]
    
    ### now summarise by cluster ###
    
    CCF.table <- cbind( CCFs[, .(meanCCF = mean(meanCCF)), by = cluster, ], CCFs[ !duplicated(cluster), ..mean_region_cols] )
    
    ## add on other info (no of muts and whther there are drivers) ##
    
    CCF.table[, driver := sapply(cluster, function(cluster_id) any( muttable_dfsamp[ get(cluster_col) == cluster_id, get(driver_col) ] == 'TRUE' ))]
    CCF.table[, no_muts := sapply(cluster, function(cluster_id) muttable_dfsamp[ get(cluster_col) == cluster_id, .N ])  ]
    
    setcolorder(CCF.table, c("cluster", "no_muts", "driver", "meanCCF"))
    
    
    class(CCF.table) <- orig_class
    
    return(CCF.table)
    
  })
  
  names(out) <- samples
  
  return(out)
  
}


###################################################
### Function to caulcated wgII from seg CN data ###
###################################################

## code written by Nicolai ##
## Input is an ASCAT output matrix, with nA in column 7, nB in column 8 #

#' Function to caulcated wgII from seg CN data
#'
#'
#'
#'
#' @export
calc.wgii <- function(seg, threshold = 0.6, check.names=FALSE, include.sexchrom=FALSE){
  # Edit 20140826: gii & wgii: exclude sex chromosomes
  # Edit 20150316: Use wMajor as ploidy
  seg[,1] <- as.character(seg[,1])
  if(check.names){
    seg <- check.names.fn(seg)
  }
  output.mat <- matrix(NA, nrow=length(unique(seg[,1])), ncol=4)
  colnames(output.mat) <- c('GII', 'wGII', 'FLOH', 'wFLOH')
  rownames(output.mat) <- unique(seg[,1])
  chrom.length <- setNames(rep(NA, length(unique(seg[,2]))), unique(seg[,2]))
  if(! all(seg[,8] <= seg[,7]) ){
    cat("Warning!! nB  not always <= nA!!  -- Correcting for internal use (only!)\n") # In case ASCAT people change the algorithm
    tmp <- seg
    seg[tmp[,8] > tmp[,7],7]  <- tmp[tmp[,8] > tmp[,7],8]
    seg[tmp[,8] > tmp[,7],8]  <- tmp[tmp[,8] > tmp[,7],7]
  }
  cat("Setting sample ploidy according to major proportion copy number. Ploidy < 2 re-set at 2\n")
  
  message( "Calculating pliody for each sample..\n" )
  
  ploidy <- calc.ploidy(seg, check.names=check.names)
  ploidy <- setNames(ploidy[,2], rownames(ploidy))
  ploidy[ploidy < 2] <- 2
  for(i in names(chrom.length)){chrom.length[i] <- max(seg[seg[,2] %in% i,4]) - min(seg[seg[,2] %in% i,3])} # restrict to the part that can be measured
  chrom.length <- setNames(as.numeric(chrom.length), names(chrom.length))
  
  message( "Calculating metrics for each sample..\n" )
  
  pb <- txtProgressBar(1, length(unique(seg[,1])), style=3, width = 30)
  
  for(i in unique(seg[,1])){
    sample.seg <- seg[seg[,1] %in% i,] # Restrict to sample
    sample.seg.cn <- sample.seg[sample.seg[,6] < (ploidy[i] - threshold) | sample.seg[,6] > (ploidy[i] + threshold),] # Restrict to aberrant segments
    sample.seg.loh <- sample.seg[sample.seg[,8] == 0 & sample.seg[,7] != 0,] # Restrict to LOH segments
    output.mat[i,1] <- sum(as.numeric(sample.seg.cn[,4]) - as.numeric(sample.seg.cn[,3]))/sum(chrom.length) # GII
    output.mat[i,3] <- sum(as.numeric(sample.seg.loh[,4] - sample.seg.loh[,3]))/sum(chrom.length) # FLOH
    wgii <- vector()
    wfloh <- vector()
    if(!include.sexchrom){
      chrom.length <- chrom.length[!names(chrom.length) %in% c('X','Y','x','y',23,24)]
    }
    for(j in names(chrom.length)){
      sample.seg.chr <- sample.seg.cn[sample.seg.cn[,2] %in% j,]
      sample.seg.chr.loh <- sample.seg.loh[sample.seg.loh[,2] %in% j,]
      wgii <- c(wgii, sum(sample.seg.chr[,4] - sample.seg.chr[,3])/chrom.length[j])
      wfloh <- c(wfloh, sum(sample.seg.chr.loh[,4] - sample.seg.chr.loh[,3])/chrom.length[j])
    }
    output.mat[i,2] <- sum(wgii)/length(chrom.length)
    output.mat[i,4] <- sum(wfloh)/length(chrom.length)
    
    setTxtProgressBar(pb, which( unique(seg[,1]) == i ))
    
  }
  return(output.mat)
}


#' Function to calculate ploidy distribution and main ploidy per sample from seg input (Nicolai)
#'
#'
#'
#'
#' @export
calc.ploidy <- function(seg, cnCol=6,check.names=FALSE){
  #20140311 : added wMajor, weighted ploidy by chromosome
  #20151209 : Added the option to call mean raw copy if it exists
  #seg = ASCAT segmented output (or any segmented output), with total CN in column 6
  seg[,1] <- as.character(seg[,1])
  if(check.names){
    seg <- check.names.fn(seg)
  }
  samples <- unique(seg[,1])
  out.ploidy <- matrix(nrow=length(samples), ncol=14)
  rownames(out.ploidy) <- samples
  colnames(out.ploidy) <- c('Major','wMajor','MeanRaw',0:10) #wMajor is weighted by chromosome, MeanRaw is mean of the raw CN values from ASCAT 2.3
  
  pb <- txtProgressBar(1, length(samples), style=3, width = 30)
  
  for(j in samples){
    sample.seg <- seg[seg[,1] %in% j,]
    ploidy <- vector()
    for(k in unique(sample.seg[,cnCol])){
      tmp <- sample.seg[sample.seg[,cnCol] %in% k,]
      ploidy <- c(ploidy, setNames(sum(tmp[,4]-tmp[,3]), k))
    }
    ploidy <- (ploidy/sum(ploidy))[order(ploidy/sum(ploidy),decreasing=T)]
    out.ploidy[j,1] <- as.numeric(names(ploidy)[1])
    out.ploidy[j,3:13] <- ploidy[colnames(out.ploidy)[3:13]]
    wMajor <- c()
    for(k in unique(sample.seg[,2])){
      tmp <- sample.seg[sample.seg[,2] %in% k,]
      tmp <- setNames(tmp[,4]-tmp[,3], tmp[,cnCol])
      wMajor <- c(wMajor,setNames(as.numeric(names(sort(tapply(tmp, names(tmp),sum), decreasing=T)[1])), k))
    }
    out.ploidy[j,2] <- as.numeric(names(sort(table(wMajor), decreasing=T)[1]))
    if(any(grepl('nAraw', colnames(seg)))){
      a <- rowSums(sample.seg[,c('nAraw','nBraw')])
      b <- sample.seg[,4]-sample.seg[,3]
      out.ploidy[j,'MeanRaw'] <- sum(a * b/sum(b))
    }
    setTxtProgressBar(pb, which( samples == j ))
  }
  if(!any(grepl('nAraw', colnames(seg)))){
    out.ploidy <- out.ploidy[,-c(3)]
  }
  out.ploidy
}

calc.perc.subclonal <- function(seg, threshold = 0.6, check.names=FALSE, include.sexchrom=FALSE, bootstrap = FALSE, tumour_id_col = 164){
  seg[,1] <- as.character(seg[,1])
  if(check.names){
    seg <- check.names.fn(seg)
  }
  output.mat <- matrix(NA, nrow=length(unique(seg[,1])), ncol=4)
  colnames(output.mat) <- c('GII', 'wGII', 'FLOH', 'wFLOH')
  rownames(output.mat) <- unique(seg[,1])
  chrom.length <- setNames(rep(NA, length(unique(seg[,2]))), unique(seg[,2]))
  if(! all(seg[,8] <= seg[,7]) ){
    cat("Warning!! nB  not always <= nA!!  -- Correcting for internal use (only!)\n") # In case ASCAT people change the algorithm
    tmp <- seg
    seg[tmp[,8] > tmp[,7],7]  <- tmp[tmp[,8] > tmp[,7],8]
    seg[tmp[,8] > tmp[,7],8]  <- tmp[tmp[,8] > tmp[,7],7]
  }
  cat("Setting sample ploidy according to major proportion copy number. Ploidy < 2 re-set at 2\n")
  
  message( "Calculating pliody for each sample..\n" )
  
  ploidy <- calc.ploidy(seg, check.names=check.names)
  ploidy <- setNames(ploidy[,2], rownames(ploidy))
  ploidy[ploidy < 2] <- 2
  for(i in names(chrom.length)){chrom.length[i] <- max(seg[seg[,2] %in% i,4]) - min(seg[seg[,2] %in% i,3])} # restrict to the part that can be measured
  chrom.length <- setNames(as.numeric(chrom.length), names(chrom.length))
  
  message( "Calculating metrics for each sample..\n" )
  
  pb <- txtProgressBar(1, length(unique(seg[,1])), style=3, width = 30)
  
  # Define gains / losses in each sample
  seg$gain <- FALSE
  seg$loss <- FALSE
  seg$loh  <- FALSE
  seg$width <- seg[,3] - seg[,4]
  for(i in unique(seg[,1])){
    seg[ seg[,1] == i, 'gain'] <- seg[seg[,1] == i,6] > (ploidy[i] + threshold)
    seg[ seg[,1] == i, 'loss'] <- seg[seg[,1] == i,6] < (ploidy[i] - threshold)
    seg[ seg[,1] == i, 'loh']  <- seg[seg[,1] == i,8] == 0 & seg[seg[,1] == i,7] != 0
    seg$abberant <- seg$gain | seg$loss | seg$loh
    setTxtProgressBar(pb, which( unique(seg[,1]) == i ))
  }
  
  calc_cin_ith <- function(input, bootstrap = FALSE){
    
    seg_tumour <- copy(input)
    samples <- unique(seg_tumour$sample)
    
    calc_cin_ith_sub <- function( seg, samples = NA ){
      if( !all(is.na(samples) ) ) seg <- seg[ sample %in% samples ]
      seg[, is_hetergenous_gain := length(unique(gain)) > 1, by = .(startpos, endpos)]
      seg[, is_hetergenous_loss := length(unique(loss)) > 1, by = .(startpos, endpos)]
      seg[, is_hetergenous_loh  := length(unique(loss)) > 1, by = .(startpos, endpos)]
      seg[, is_hetergenous  := any(is_hetergenous_gain, is_hetergenous_loss ), by = .(startpos, endpos)]  # don't use loh for now
      seg[, sum(is_hetergenous) / .N ]
    }
    
    if( !bootstrap ){
      return( calc_cin_ith_sub( seg_tumour ) )
    } else {
      pairs <- combn(samples, 2)
      pairs <- lapply(1:ncol(pairs), function(col) pairs[, col] )
      return(  mean( sapply(1:length(pairs), function(pairi) calc_cin_ith_sub( seg_tumour, samples = pairs[[pairi]] )  ) ) )
    }
  }
  
  seg <- as.data.table(seg)
  
  out <- seg[, .(CIN_frac_het = calc_cin_ith( .SD ),
                 CIN_frac_het_bootstrapped = calc_cin_ith( .SD, bootstrap = TRUE ) ), by = tumour_id]
  
  return(out)
}


#===================================#
# interpret_following_Args function #
#===================================#

###  function designed to assign objects to following arguemnts extracted by commandArgs(). ###
###  takes commardArgs output (args) and a table (arg.table) specifying a automatic order   ###
###  of object names to assign to each argument and a list of flags and matching object     ###
###  names so that flags can be used in any order to specify what objects following         ###
###  arguments should be assigned                                                           ###

# # example data:
# 
# arg.table <- data.table( arguments  = c("mut_table_path", "scna_table_path", "trees_path", "clin_path", "outputs.folder"),
#                          flags = c("--muts", "--scnas", "--trees", "--clin", "--out") )
# 
# args <- c("--scnas", "path/to/scna.table.txt", "--muts", "path/to/mut.table.txt")
# # ( in terminal = 'Rscript script.path --scnas path/to/scna.table.txt --muts path/to/mut.table.txt'
# #  with in script args <- commandArgs() )
# 
# # or
# 
# args <- c("path/to/mut.table.txt", "path/to/scna.table.txt")
# # ( in terminal = 'Rscript script.path path/to/mut.table.txt path/to/scna.table.txt'
# #   with in script args <- commandArgs() )
# 
# # both egs above of args will be intrepretted the same. The first uses flags to assign arguments and the second uses the order
# # of the arguuemnts field in the arg.table
# 
# # in these cases the function will assign "../../scna.table.txt" to an object called scna_table_path
# # and "../../mut.table.txt" to object called mut_table_path

#' Function to allow R script to interpret following arguments specified in terminal with flags
#'
#'
#'
#'
#' @export
interpret_following_Args <- function(args, arg.table){
  
  if(length(args) > 0){
    
    using_flags <- any( grepl("--", args) )  ## have flags been used in the following args - if not just use the default order 
    ## default order specified by order of args field in args.table 
    ## this means input arguments cannot contain "--" symbols which are reserved for flags
    
    ## if no flags just use the default order to specify ##
    
    if( using_flags == FALSE ){
      
      for( i in 1:length(args) ) assign( arg.table[ i, arguments ], args[i], envir = parent.frame() ) 
      
    }
    
    if( using_flags == TRUE ){
      
      args.input.table <- data.table( input.arguments = args[ seq(2, length(args), by = 2 ) ],    ## odd args will be input arguments
                                      input.flags = args[ seq(1, (length(args) - 1), by = 2 ) ] ) ## even args will be flags
      
      # limit arg table to those present in flags #
      arg.table <- arg.table[ flags %in% sapply( strsplit( args, split = " " ), "[[", 1 ) ]
      
      arg.table[, arg.input := args.input.table[ match( flags, input.flags ), input.arguments] ]
      
      for( i in 1:nrow(arg.table) ) assign( arg.table[ i, arguments ], arg.table[ i, arg.input ], envir = parent.frame() ) 
      
    }
    
  }
  
}

#' Function to extract the tree information from all patients in a tree output directory
#'
#'
#'
#'
#' @export
read_trees <- function( tree_path, return_all = FALSE, remove_missing = TRUE ){
  
  tumours <- system( paste('ls', tree_path ), intern = TRUE )
  tumours <- tumours[ grepl('LTX', tumours) ]
  
  tree_data <- lapply(tumours, function(tumour){
     
    tumour_no_cluster <- gsub( '_Cluster.{1}', '', tumour)
    tree_path <- paste0(tree_path, '/', tumour, '/', tumour_no_cluster, '.tree.RDS' )
    
    if( file.exists(tree_path) ){
    tree <- readRDS( tree_path ) 
    } else return( NULL )
    
  })
  names(tree_data) <- tumours
 
  if( remove_missing ) tree_data <- tree_data[ which(!sapply( tree_data, function(tree) all(is_null( tree )) )) ]
  
  if( return_all ) return( tree_data )
   
  tree_data <- lapply(tree_data, function(tree) tree$graph_pyclone$Corrected_tree )
  names(tree_data) <- tumours
  
  return(tree_data)
  
}


#=====#
# END #
#=====#



# suppressPackageStartupMessages(library(sf))
# suppressPackageStartupMessages(library(units))
# suppressPackageStartupMessages(library(rgeos))
# suppressPackageStartupMessages(library(qlcMatrix))
# suppressPackageStartupMessages(library(RColorBrewer))
# suppressPackageStartupMessages(library(smoothr))
amf71/amfFunctions documentation built on Sept. 12, 2021, 10:14 p.m.