SaaRclust: R Package to cluster long sequencing reads into chromosomes to facilitate de novo genome assembly.

Documented in exportClusteredReads exportGenomicLocations getClusterAcc getClusterIdentityPerChr getClusterIdentityPerChrPerDir getQualMeasure hardClustAccuracy kahansum randomTheta thetaRescale

#' Obtain summary measures of minimap output.
#'
#' Takes imported data table and export relevant data quality measures.
#'
#' @param inputData A \code{data.frame} loaded by \code{\link[SaaRclust]{importTestData}}
#' @importFrom dplyr group_by summarise
#' @author David Porubsky
#' @export

getQualMeasure <- function(inputData) {
  ptm <- startTimedMessage("Getting data quality measures")
  
  #get number of SS reads per PB read
  SSreads.perPB <- sort(table(inputData$PBreadNames), decreasing = T) #this won't be needed when output will be already sorted by PBreads
  
  #get number of SS libs per PB read
  inputData %>% dplyr::group_by(PBreadNames) %>% dplyr::summarise(counts = length(unique(SSlibNames))) -> SSlib.perPB
  
  #get number of SS reads per lib per PB read
  SSreads.perPB.l <- split(inputData$SSlibNames, inputData$PBreadNames)
  SSreads.perlib.perPB <- sapply(SSreads.perPB.l, function(x) rle(sort(x))$lengths)
  SSreads.perlib.perPB <- do.call(c, SSreads.perlib.perPB)
  
  #get PB read distribution hist
  #hist.data <- hist(inputData$PBreadLen, breaks = 100)
  #hist.df <- data.frame(midpoints= hist.data$mids, freq= hist.data$counts)
  
  stopTimedMessage(ptm)
  return(list(SSreads.perPB=SSreads.perPB, SSlib.perPB=SSlib.perPB, SSreads.perlib.perPB=SSreads.perlib.perPB))
}

#' Check clustering accuracy
#'
#' Takes assigned identity of each PB read and compares it to known location.
#'
#' @param clusters List of clusters with predicted chromosome identity for each PB read
#' @author David Porubsky

getClusterAcc <- function(clusters) {
  stat <- list()
  miss.reads <- list()
  for (i in 1:length(clusters)) { 
    cluster <- clusters[[i]]
    max.chr <- names(which.max(table(cluster)))
    trues <- length(cluster[cluster == max.chr])
    miss <- length(cluster[cluster != max.chr])
    stat.df <- data.frame(trues=trues, miss=miss, id=i)
    
    miss.PBreads <- names(cluster[cluster != max.chr])
    miss.chr <- unname(cluster[cluster != max.chr])
    if (length(miss.PBreads)>0) {
      tab <- data.frame(miss.PBreads=miss.PBreads, miss.chr=miss.chr, id=i)
      miss.reads[[length(miss.reads)+1]] <- tab
    }
    stat[[length(stat)+1]] <- stat.df  
  }
  stat <- do.call(rbind, stat)
  miss.reads <- do.call(rbind, miss.reads)
  return(list(stat=stat, miss.reads=miss.reads))
}

                               
#' Computes the performance of the hard clustering algorithm
#' 
#' This function takes hard clustering results and check the accuracy against know genomic locations of PacBio reads
#' 
#' @param hard.clust An \code{integer} vecotr of cluster assignements of each PacBio read.
#' @param pb.chr The true chromosomes of PacBio reads.
#' @param pb.flag The true directionality of PacBio reads.
#' @param tab.filt A \code{data.frame} obejct containing selected the best PacBio alignments.
#' @param female Set to \code{TRUE} if analyzed data are coming from female individual.
#' @author Maryam Ghareghani, David Porubsky
#' @export

hardClustAccuracy <- function(hard.clust, pb.chr, pb.flag, tab.filt, female=TRUE)
{
  # filter and keep PB reads that have only defined (true) chromosome names with flag 0 or 16
  if(female)
  {
    filt.chrom <- which(grepl('^chr[0-9X][0-9]?$',tab.filt$PBchrom))
  } else {
    filt.chrom <- which(grepl('^chr[0-9XY][0-9]?$',tab.filt$PBchrom))
  }
  filt.flag <- union(which(tab.filt$PBflag == 0), which(tab.filt$PBflag == 16))
  filt <- intersect(filt.chrom, filt.flag)
  tab <- tab.filt[filt,]
  
  # define the classe (true clusters)
  classes <- paste0(pb.chr, "_", pb.flag)
  # some extra filtering ---> take only those PB reads that have flag 0 or 16
  names(classes) <- names(pb.chr)
  # filter out the PB reads that have more than one chr or flag
  oneChrPBreads <- which(sapply(pb.chr, length)==1)
  oneFlagPBreads <- which(sapply(pb.flag, length)==1)
  classes <- classes[intersect(oneChrPBreads, oneFlagPBreads)]
  
  classes <- classes[which(names(classes) %in% tab$PBreadNames)]
  clusters <- hard.clust[which(names(hard.clust) %in% names(classes))]
  ord <- order(names(classes))
  classes <- classes[ord]
  clusters <- clusters[ord]
  
  t <- table(classes, clusters)
  clust.assigned <- apply(t, 2, function(x) which(x == max(x))[1])
  clust.purity <- sum(sapply(1:length(clust.assigned), function(i) t[clust.assigned[i],i]))/length(clusters)
  
  missed <- setdiff(1:nrow(t), clust.assigned)
  names(missed) <- rownames(t)[missed]
  clust.purity.perClust <- sapply(1:length(clust.assigned), function(i) t[clust.assigned[i],i]/sum(t[,i]))
  
  df <- data.frame(clustIDs=clust.assigned, clustNames=rownames(t)[clust.assigned], accuracy=clust.purity.perClust)
  df <- df[order(clust.purity.perClust),]
  
  
  return( list(acc=clust.purity, missed.clusters = missed) )
}


#' Simulate random theta estimates for cell type
#'
#' Function to simulate random theta values in order to initialize EM algorithm.
#'
#' @param num.cells Number of cells used in the analysis.
#' @param num.clusters Expected number of clusters. (for 22 autosomes == 44 clusters)
#' @author David Porubsky
#' @export

#TODO: set parameter alpha to be optional
randomTheta <- function(num.cells=100, num.clusters=44) {
  theta.l <- list()
  for (j in 1:num.cells) {
    cell.theta <- list()
    for (k in 1:num.clusters) {
      random.type <- runif(3)
      max.type <- which.max(random.type)
      if (max.type == 1) {
        theta <- c(0.9, 0.05, 0.05)
      } else if (max.type == 2) {
        theta <- c(0.05, 0.9, 0.05)
      } else {
        theta <- c(0.05, 0.05, 0.9)
      }
      cell.theta[[k]] <- theta
    }
    theta.l[[j]] <- do.call(rbind, cell.theta)
  }
  return(theta.l)
}


#' Rescale theta values for WC cell type.
#'
#' Adjust estimated theta values based on expected number of WC chromosomes given random segregation of parental homologues.
#'
#' @param theta.param A \code{list} of estimated cell types for each cluster and each cell.
#' @param theta.expected A \code{vector} of expected sums of probabilities to observe WW,CC or WC chromosomes across all cells.
#' @author David Porubsky
#' @export

thetaRescale <- function(theta.param, theta.expected) {
  theta.rescaled <- list()
  for (i in 1:length(theta.param)) {
    theta.cell <- theta.param[[i]]
    theta.ObservedSums <- colSums(theta.cell)
    corr.factor <- theta.expected / theta.ObservedSums
    theta.param.rescaled <- t(t(theta.cell) * corr.factor)
    theta.param.rescaled.norm <- theta.param.rescaled / rowSums(theta.param.rescaled)
    theta.rescaled[[i]] <- theta.param.rescaled.norm
  }
  return(theta.rescaled)
}

#' Alternative sum function to reduce numerical error caused by finite precision of floating point numbers.
#'
#' @param x A \code{vector} of values tp sum up.
#' @export

kahansum <- function(x) {
  ks <- 0
  c <- 0
  for(i in 1:length(x)) {
    y <- x[i] - c
    kt <- ks + y
    c = (kt - ks) - y
    ks = kt
  }
  ks
}


#' Export best cluster IDs for each PB read
#' 
#' Function to get best clust ID for each PB read or eventually multiple cluster IDs if max pVal is lower than threshold
#'
#' @param soft.clust ...
#' @param prob.th ...
#' @author David Porubsky
#' @export

exportGenomicLocations <- function(soft.clust, prob.th=0.6) {
  
  getMaxPvals <- function(pval.vector) {
    srt.pval <- sort(pval.vector, decreasing = T)
    breakpoint <- which.min(diff(srt.pval))
    max.pvals <- srt.pval[1:breakpoint]
    max.pvals.clust <- which(pval.vector %in% max.pvals)
    return(max.pvals.clust)
  }  
  
  max.pVal <- apply(soft.clust, 1, max)
  mask <- max.pVal >= prob.th
  ord <- apply(soft.clust[mask,], 1, which.max)
  
  read.IDs <- rep(list(0), nrow(soft.clust)) 
  read.IDs[mask] <- ord 
  
  best.idx <- apply(soft.clust[!mask,], 1, getMaxPvals)
  read.IDs[!mask] <- best.idx 
  
  names(read.IDs) <- rownames(soft.clust)

  return(list(clust.IDs=read.IDs, th.boolean=mask))
  #best.idx <- t( apply(soft.clust[!mask,], 1, function(x) which(x %in% sort(x, decreasing = T)[1:5])) )
  #best.pval <- t( apply(soft.clust[!mask,], 1, function(x) x[which(x %in% sort(x, decreasing = T)[1:5])]) )
}

exportGenomicLocationsAllBest <- function(soft.clust, prob.th=0.6) {
  
  getMaxPvals <- function(pval.vector) {
    srt.pval <- sort(pval.vector, decreasing = T)
    breakpoint <- which.min(diff(srt.pval))
    max.pvals <- srt.pval[1:breakpoint]
    max.pvals.clust <- which(pval.vector %in% max.pvals)
    return(max.pvals.clust)
  }  
  
  best.idx <- apply(soft.clust, 1, getMaxPvals)
  read.IDs <- best.idx 
  names(read.IDs) <- rownames(soft.clust)
  
  return(read.IDs)
}


#' Export corresponding cluster for true chromosome and directionality
#'
#' @param soft.clust Soft clustering probabilities for each long read and each cluster
#' @param chr.rows Expected location of long reads based on their mapping to a respective chromosome
#' @param chr.flag Expected directionality of long reads based on their mapping to a respective chromosome
#' @author David Porubsky
#' @export

getClusterIdentityPerChrPerDir <- function(soft.clust, chr.rows, chr.flag) {
  max.Clust <- apply(soft.clust, 1, which.max)
  unique.clust.ID <- paste0(chr.rows,"_",chr.flag)
  unique.clust.ID <- factor(unique.clust.ID, unique(unique.clust.ID))
  clustByChromByflag <- split(max.Clust, unique.clust.ID)
  clustIdPerChrom <- lapply(clustByChromByflag, function(x) names(which.max(table(x))))
  clustIDperPB <- rep(as.numeric(unlist(clustIdPerChrom)), table(unique.clust.ID))
  return(clustIDperPB)
}

#' Export corresponding clusters for true chromosome
#'
#' @param soft.clust Soft clustering probabilities for each long read and each cluster
#' @param chr.rows Expected location of long reads based on their mapping to a respective chromosome
#' @author David Porubsky
#' @export

getClusterIdentityPerChr <- function(soft.clust, chr.rows) {
  max.Clust <- apply(soft.clust, 1, which.max)
  clustByChrom <- split(max.Clust, chr.rows)
  clustIdPerChrom <- lapply(clustByChrom, function(x) as.numeric(names(sort(table(x), decreasing=T)[1:2])))
  return(clustIdPerChrom)
}


#' Print names of long reads into separate files based on soft clustering assignment.
#'
#' @param inputfolder Path to the data analysis folder
#' @param prob.th Filter out long reads with max probability below this threshold
#' @param minLib Filter out long reads with number of StrandS libraries being represented below this threshold
#' @author David Porubsky
#' @export

exportClusteredReads <- function(inputfolder=NULL, prob.th=NULL, minLib=NULL) {
  
  #This function exports group of largest probabilites per PacBio read.
  getMaxPvals <- function(pval.vector) {
    srt.pval <- sort(pval.vector, decreasing = T)
    breakpoint <- which.min(diff(srt.pval))
    max.pvals <- srt.pval[1:breakpoint]
    max.pvals.clust <- which(pval.vector %in% max.pvals)
    return(max.pvals.clust)
  }  
  
  folder.name <- paste0("ReadPerCluster_minLib", minLib, "_probTh", prob.th)
  destination <- file.path(inputfolder, folder.name)
  if (!file.exists(destination)) {
    dir.create(destination)
  } else {
    beQuiet <- do.call(file.remove, list(list.files(destination, full.names = TRUE)))
  }
  
  Clusters2process <- list.files(file.path(inputfolder, 'Clusters'), pattern = "clusters.RData", full.names = TRUE)
  Quals2process <- list.files(file.path(inputfolder, 'RawData'), pattern = "dataQuals.RData", full.names = TRUE)
  
  for (i in 1:length(Clusters2process)) {  
    data.file <- get(load(Clusters2process[i]))
    data.qual <- get(load(Quals2process[i]))
    fileID <- basename(Clusters2process[i])
    message("Processing file: ",fileID)
    
    #Sort data quals according to PB order in clusters
    SSlib.perPB <- data.qual$SSlib.perPB
    SSlib.perPB <- SSlib.perPB[match(rownames(data.file$soft.pVal), SSlib.perPB$PBreadNames),]
    pb.minLib <- SSlib.perPB$counts
    prob.tab <- data.file$soft.pVal
    
    #Find WC cluster in all cells
    theta.sums <- Reduce("+", data.file$theta.param)
    remove.clust <- which.max(theta.sums[,3])
    #Remove probabilities for always WC cluster
    prob.tab <- prob.tab[,-remove.clust]
    
    #Remove PB reads represneted by SSlib less than minLib
    filt <- pb.minLib >= minLib
    prob.tab <- prob.tab[filt,]
    
    #filter reads based on set probability treshold
    if (prob.th > 0 & !is.null(prob.th)) {
      max.prob <- apply(prob.tab, 1, max)
      mask <- max.prob >= prob.th
      Clust.locations <- apply(prob.tab[mask,], 1, which.max)
      readNames.perCluster <- split(names(Clust.locations), Clust.locations)
    } else {
      Clust.locations <- apply(prob.tab, 1, getMaxPvals)
      Clust.locations.expanded <- unlist(Clust.locations) #expand list of vectors
      names(Clust.locations.expanded) <- rep(names(Clust.locations), lengths(Clust.locations)) #assigne proper PB read names to expanded vector
      readNames.perCluster <- split(names(Clust.locations.expanded), Clust.locations.expanded)
    }
    
    for (k in 1:length(readNames.perCluster)) {
      filename <- paste0("reads_cluster", k, "_minLib", minLib, "_probTh", prob.th, ".txt")
      path2file <- file.path(destination, filename)
      write.table(readNames.perCluster[[k]], file = path2file, append = TRUE, quote = FALSE, row.names = FALSE, col.names = FALSE)
    }
  } 
}

daewoooo/SaaRclust documentation built on May 28, 2019, 7:50 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

daewoooo/SaaRclust
R Package to cluster long sequencing reads into chromosomes to facilitate de novo genome assembly.

R/helperFuctions.R
In daewoooo/SaaRclust: R Package to cluster long sequencing reads into chromosomes to facilitate de novo genome assembly.

Defines functions getQualMeasure getClusterAcc hardClustAccuracy randomTheta thetaRescale kahansum exportGenomicLocations exportGenomicLocationsAllBest getClusterIdentityPerChrPerDir getClusterIdentityPerChr exportClusteredReads

Documented in exportClusteredReads exportGenomicLocations getClusterAcc getClusterIdentityPerChr getClusterIdentityPerChrPerDir getQualMeasure hardClustAccuracy kahansum randomTheta thetaRescale

R Package Documentation

Browse R Packages

We want your feedback!

daewoooo/SaaRclust R Package to cluster long sequencing reads into chromosomes to facilitate de novo genome assembly.

R/helperFuctions.R In daewoooo/SaaRclust: R Package to cluster long sequencing reads into chromosomes to facilitate de novo genome assembly.

Defines functions getQualMeasure getClusterAcc hardClustAccuracy randomTheta thetaRescale kahansum exportGenomicLocations exportGenomicLocationsAllBest getClusterIdentityPerChrPerDir getClusterIdentityPerChr exportClusteredReads

Documented in exportClusteredReads exportGenomicLocations getClusterAcc getClusterIdentityPerChr getClusterIdentityPerChrPerDir getQualMeasure hardClustAccuracy kahansum randomTheta thetaRescale

R Package Documentation

Browse R Packages

We want your feedback!

daewoooo/SaaRclust
R Package to cluster long sequencing reads into chromosomes to facilitate de novo genome assembly.

R/helperFuctions.R
In daewoooo/SaaRclust: R Package to cluster long sequencing reads into chromosomes to facilitate de novo genome assembly.