R/utils.R

Defines functions get_het_pos filter_peaks findGSE_sp kmer_count_modify initial_count_recover error_minimize3 error_minimize2 error_minimize initialize_start_time

Documented in error_minimize error_minimize2 error_minimize3 filter_peaks findGSE_sp get_het_pos initial_count_recover initialize_start_time kmer_count_modify

#' Initialize Start Time
#'
#' This function initializes the start time for a process.
#'
#' @return The start time of the process.
#' @export
initialize_start_time <- function() {
  start_time <- proc.time()
  return(start_time)
}
#' Minimize the Error for K-mer Frequency Fitting
#'
#' This function minimizes the error for k-mer frequency fitting by adjusting the mean, standard deviation, and scaling factors.
#'
#' @param tooptimize A numeric vector containing the scale factors to optimize.
#' @param x A numeric vector representing the histogram replicated from \code{d}.
#' @param xfit x-values to get the skew normal distribution
#' @param end An integer indicating the right-side position for fitting.
#' @param xfit_left A numeric value for the left-side position to calculate initial mean and standard deviation.
#' @param xfit_right A numeric value for the right-side position to calculate initial mean and standard deviation.
#' @param d A data frame representing the observed k-mer frequencies that will be fitted.
#' @param min_valid_pos An integer indicating the left-side position from which the observed k-mer frequencies will be fitted.
#' @param itr An integer representing the iteration count.
#' @return A numeric value representing the minimized error.
#'
#' @examples
#'
#' tooptimize <- c(1, 1, 1, 1)
#' x <- rnorm(100)
#' end <- 100
#' xfit <- seq(min(x), max(x), length=end)
#' xfit_left <- min(x)
#' xfit_right <- max(x)
#' d <- data.frame(V1=1:100, V2=rnorm(100))
#' min_valid_pos <- 10
#' itr <- 100
#' error <- error_minimize(tooptimize, x, end, xfit, xfit_left, xfit_right, d, min_valid_pos, itr)
#' print(error)
#'
#' @export
error_minimize<-function(tooptimize, x, end, xfit, xfit_left, xfit_right, d, min_valid_pos, itr)
{
  # x            : histogram replicated from d, which is formed from dr
  # xfit         : x-values to get the skew normal distribution
  # xfit-left    : a left-side  position to calculate initial mean and sd
  # xfit-right   : a right-side position to calculate initial mean and sd
  # d            : the observed kmer-freq that will be fitted
  # min_valid_pos: a left-side  position, from which the observed kmer-freq will be fitted
  # end          : a rigth-side position, till which the observed kmer-freq will be fitted
  sd_scale_factor    <- tooptimize[1]
  yfit_scale_factor  <- tooptimize[2]*100000/itr
  mean_scale_factor  <- tooptimize[3]
  xi_scale_factor    <- tooptimize[4]
  xfit               <- seq(min(x),max(x),length=end)
  meanfit            <- mean(x[x>=xfit_left & x<=xfit_right])*mean_scale_factor
  sdfit              <- sd(x[  x>=xfit_left & x<=xfit_right])*sd_scale_factor
  xifit              <- 1*xi_scale_factor
  # message(paste("   Info: initialized mean as ", meanfit, "\n", sep=""))
  # message(paste("   Info: initialized sd   as ", sdfit, "\n", sep=""))
  #message(paste("   Info: initialized skew as ", xifit, "\n", sep=""))
  #yfit               <- dsnorm(xfit,mean=meanfit,sd=sdfit, xi=xifit)*yfit_scale_factor*length(x)
  yfit               <- dnorm(xfit,mean=meanfit,sd=sdfit)*yfit_scale_factor*length(x)
  diff0              <- sqrt(sum((d[min_valid_pos:end, 2] - yfit[min_valid_pos:end])^2))
  #diff0              <- sqrt(sum((d[xfit_left:end, 2] - yfit[xfit_left:end])^2))
  # message(paste("   111 Info: initialized min_valid_pos   as ", min_valid_pos, "\n", sep=""))
  # message(paste("   111 Info: initialized end as ", end, "\n", sep=""))
  # message(paste("   111 Info: initialized x_fit_left   as ", xfit_left, "\n", sep=""))
  # message(paste("   111 Info: initialized x_fit_right as ", xfit_right, "\n", sep=""))
  # diff0              <- sqrt(sum((d[xfit_left:xfit_right, 2] - yfit[xfit_left:xfit_right])^2))
  #
  return(diff0)
}
# function for tunning final fitting if heterozgyous genomes
#' Tuning Final Fitting for Heterozygous Genomes
#'
#' This function tunes the final fitting for heterozygous genomes by adjusting the delta values for heterozygous and homozygous regions.
#'
#' @param tooptimize A numeric vector containing the scale factors to optimize.
#' @param h_het A numeric vector representing the raw fitting for the heterozygous region.
#' @param h_hom A numeric vector representing the raw fitting for the homozygous region.
#' @param h_target A numeric vector representing the target k-mer frequency.
#' @return A numeric value representing the minimized difference.
#' @examples
#'
#' tooptimize <- c(0.5)
#' h_het <- rnorm(100)
#' h_hom <- rnorm(100)
#' h_target <- rnorm(100)
#' diff <- error_minimize2(tooptimize, h_het, h_hom, h_target)
#' print(diff)
#'
#' @export
error_minimize2<-function(tooptimize, h_het, h_hom, h_target)
{
  # h_het   : raw fitting for the heterozygous region
  # h_hom   : raw fitting for the homozygous   region
  # h_target: before valley: fitted cnt, after valley: raw cnt
  het_delta <- 1-tooptimize[1]
  hom_delta <- 1 # reserved parameter: might be applied in future
  h_merge   <- het_delta*h_het + hom_delta*h_hom
  diff2     <- sum(abs(h_merge - h_target))
  #
  return(diff2)
}
## function for minimizing difference from remainder kmer-freq dr: mean, sd, and scaling factor ##
#' Minimize the Error for Remaining K-mer Frequency
#'
#' This function minimizes the error for the remaining k-mer frequency by adjusting the scaling factor.
#'
#' @param tooptimize A numeric vector containing the scale factors to optimize.
#' @param x A numeric vector representing the histogram.
#' @param end An integer indicating the right-side position for fitting.
#' @param xfit_left A numeric value for the left-side position to calculate initial mean and standard deviation.
#' @param xfit_right A numeric value for the right-side position to calculate initial mean and standard deviation.
#' @param d A data frame representing the observed k-mer frequencies that will be fitted.
#' @param min_valid_pos An integer indicating the left-side position from which the observed k-mer frequencies will be fitted.
#' @param itr An integer representing the iteration count.
#' @param meanfit A numeric value representing the initial mean.
#' @param sdfit A numeric value representing the initial standard deviation.
#' @return A numeric value representing the minimized error.
#' @examples
#'
#' tooptimize <- c(1, 1, 1, 1)
#' x <- rnorm(100)
#' end <- 100
#' xfit <- seq(min(x), max(x), length=end)
#' xfit_left <- 20
#' xfit_right <- 80
#' d <- data.frame(V1=1:100, V2=rnorm(100))
#' min_valid_pos <- 10
#' itr <- 100
#' meanfit <- 18
#' sdfit <- 4.21
#' error <- error_minimize3(tooptimize, x, end, xfit_left, xfit_right, d,
#' min_valid_pos, itr, meanfit, sdfit)
#' print(error)
#'
#' @export
error_minimize3<-function(tooptimize, x, end, xfit_left, xfit_right, d, min_valid_pos, itr, meanfit, sdfit)
{
  # x            : histogram
  # xfit         : x-values to get the normal distribution
  # xfit-left    : a left-side  position to calculate initial mean and sd
  # xfit-right   : a right-side position to calculate initial mean and sd
  # d            : the observed kmer-freq that will be fitted
  # min_valid_pos: a left-side  position, from which the observed kmer-freq will be fitted
  # end          : a rigth-side position, till which the observed kmer-freq will be fitted
  #sd_scale_factor    <- tooptimize[1]
  yfit_scale_factor  <- tooptimize[2]*100000/itr
  # mean_scale_factor  <- tooptimize[3]
  xfit               <- seq(min(x),max(x),length=end)
  # meanfit            <- mean(x[x>=xfit_left & x<=xfit_right])*mean_scale_factor
  #sdfit              <- sd(x[  x>=xfit_left & x<=xfit_right])*sd_scale_factor
  yfit               <- dnorm(xfit,mean=meanfit,sd=sdfit)*yfit_scale_factor*length(x)
  diff0              <- sqrt(sum((d[min_valid_pos:end, 2] - yfit[min_valid_pos:end])^2))
  # message(paste("   111 Info: initialized min_valid_pos   as ", min_valid_pos, "\n", sep=""))
  # message(paste("   111 Info: initialized end as ", end, "\n", sep=""))
  # message(paste("   111 Info: initialized x_fit_left   as ", xfit_left, "\n", sep=""))
  # message(paste("   111 Info: initialized x_fit_right as ", xfit_right, "\n", sep=""))
  # diff0              <- sqrt(sum((d[xfit_left:xfit_right, 2] - yfit[xfit_left:xfit_right])^2))
  #diff0              <- sqrt(sum((d[xfit_left:end, 2] - yfit[xfit_left:end])^2))
  #
  return(diff0)
}
# recover count 0: in initial count, making kmer freq consecutive.
#' Recover Initial K-mer Count
#'
#' This function recovers the initial k-mer count, making the k-mer frequency consecutive.
#'
#' @param d0 A data frame representing the initial k-mer count from software like Jellyfish.
#' @return A data frame with recovered k-mer counts.
#' @examples
#' d0 <- data.frame(V1 = c(1, 2, 4), V2 = c(100, 200, 300))
#' dr <- initial_count_recover(d0)
#' @export
initial_count_recover <- function(d0)
{
  # dr: the initial kmer count from softwares like jellyfish
  dr <- cbind(1:d0[length(d0[,1]), 1], rep(0, d0[length(d0[,1]), 1]))
  i  <- 0
  while( i < length(d0[,1]))
  {
    i               <- i + 1
    dr[d0[i, 1], 2] <- d0[i, 2]
  }
  return (dr)
}
# modify kmer freq before fitting
#' Modify K-mer Frequency Before Fitting
#'
#' This function modifies the k-mer frequency before fitting, adjusting either the left or right part based on the peak position.
#'
#' @param start An integer indicating the starting position (does not include the peak position).
#' @param end An integer indicating the ending position (does not include the peak position).
#' @param left_right An integer indicating whether to modify the left part (0) or the right part (1).
#' @param histx A data frame representing the k-mer histogram.
#' @return A modified data frame with adjusted k-mer frequencies.
#' @examples
#' histx <- data.frame(V1 = 1:10, V2 = c(100, 200, 300, 400, 500, 400, 300, 200, 100, 50))
#' histx_new <- kmer_count_modify(3, 7, 0, histx)
#' @export
kmer_count_modify <- function(start, end, left_right, histx)
{
  # start or end does not include the peak position
  # modify rigth part according to left
  if(left_right == 0)
  {
    for (i in start:end)
    {
      histx[2*end+2-i, 2]   <- histx[i, 2]
    }
  }else
  {
    for (i in start:end)
    {
      histx[2*start-2-i, 2] <- histx[i, 2]
    }
  }
  return (histx)
}
#' Estimate Genome Size Using K-mer Frequencies
#'
#' This function estimates the genome size of a species using k-mer frequencies.
#'
#' @param histo A character string specifying the path to the histogram file.
#' @param sizek An integer indicating the size of k used to generate the histogram.
#' @param outdir A character string specifying the output directory. If not specify, will use tempdir() as output directory.
#' @param exp_hom A numeric value representing the expected average k-mer coverage for the homozygous regions.
#' @param species A character string specifying the species name.
#' @param ploidy_ind An integer indicating the ploidy index (default is 2).
#' @param avg_cov A numeric value representing the average coverage.
#' @param left_fit_ratio A numeric value for the left fit ratio (default is 0.835).
#' @param meanfit_old A numeric value representing the previous mean fit.
#' @param sdfit_old A numeric value representing the previous standard deviation fit.
#' @param scale_flag A logical value indicating whether to apply scaling (default is FALSE).
#' @return A list containing the estimated genome size and other fitting parameters.
#' @examples
#' \donttest{
#'
#' histo <- "sample1.histo"
#' sizek <- 21
#' outdir <- tempdir()
#' exp_hom <- 200
#' species <- ""
#' ploidy_ind <- 2
#' avg_cov <- 0
#' left_fit_ratio <- 0.835
#' meanfit_old <- 0
#' sdfit_old <- 0
#' scale_flag <- FALSE
#' fit_lists <- findGSE_sp(path, samples, sizek, exp_hom, ploidy, range_left,
#'  range_right, xlimit, ylimit, output_dir)
#' }
#' @export
findGSE_sp <- function(histo="", sizek=0, outdir="", exp_hom=0, species="",ploidy_ind=2,avg_cov = 0,left_fit_ratio = 0.835, meanfit_old = 0, sdfit_old = 0, scale_flag = FALSE)
{
  old_options <- options() #
  on.exit(options(old_options)) #
  # initial values
  if(missing(histo))
  {
    stop(paste("Cannot find histo file: ", histo, ". Program exited.\n",sep=""))
  }
  if(missing(sizek))
  {
    stop(paste("Cannot find sizek info: ", sizek, ". Program exited.\n",sep=""))
  }
  # defaults
  message('ploidy index start: ',ploidy_ind,'\n')
  if(missing(exp_hom))  exp_hom <- 0
  #if(missing(outdir))   outdir  <- getwd()
  if(missing(outdir))   outdir  <- tempdir()
  ######################################## libararies required ###############################################
  #message("\n special version: \n")
  # find all peaks in a time series
  # suppressWarnings(suppressMessages(library("pracma")))
  # provide skew normal distrbution fitting (dsnorm(...))
  # suppressWarnings(suppressMessages(library("fGarch")))
  ## version id
  vers <- 'v1.94.'
  ##
  message("\nfindGSE initialized...\n")
  ## running time
  start_t <- Sys.time()
  ## check args
  # expert-tunning het fitting (later it would be automatically adjusted)
  het_fitting_delta <- 0.11
  histof  <- FALSE
  kpro    <- FALSE
  if(histo != "") histof <- TRUE
  if(sizek >  0)  kpro   <- TRUE
  if(histof == FALSE || kpro == FALSE)
  {
    stop("Option -histo or -sizek was provided incorrectly. Program exited.\n")
  }
  ## default args
  countingfile    <- histo
  if(file.exists(countingfile)==FALSE)
  {
    stop(paste("Cannot find histo file: ", countingfile, ". Program exited.\n",sep=""))
  }
  message(paste("   Info: histo file provided as ", countingfile, "\n", sep=""))
  #
  targetsizek     <- sizek
  if(targetsizek  < 15)
  {
    stop(paste("Unexpected size of k: ", targetsizek, ". Size k>=15 required. Program exited.\n",sep=""))
  }
  message(paste("   Info: size k set as ", targetsizek, "\n", sep=""))
  #
  path_set        <- FALSE
  path            <- outdir
  if(path==".")
  {
    #path <- getwd()
    path <- tempdir()
  }else
    if(path == "")
    {
      path         <- dirname(normalizePath(countingfile))
    }
  path_set       <- TRUE
  message(paste("   Info: output folder set as ", path, "\n", sep=""))
  #
  het_observed    <- FALSE
  # expected maximum value for the hom k-mer peak! set as fpeak~2*fpeak
  exp_hom_cov_max <- exp_hom
  message(paste("   Info: expected coverage of homozygous k-mers set as ", exp_hom_cov_max, "\n", sep=""))
  exp_hom_cov_max_save <- exp_hom_cov_max
  # derived  maximum value for the het k-mer peak!
  der_het_cov_max <- 0
  ## if output folder does not exist, create it; if existing, give a warning.
  dir.create(file.path(path), showWarnings = FALSE)
  if(exp_hom_cov_max > 0)
  {
    het_observed      <- TRUE
    # derived  maximum value for the het k-mer peak.
    der_het_cov_max   <- round(ploidy_ind/(ploidy_ind+1)*exp_hom_cov_max);
    message(paste("   Info: het observed set as true ==> heterozygous fitting asked. \n", sep=""))
    if(het_fitting_delta != 0.11)
    {
      message(paste("   Info: value for tuning het fitting set as ", het_fitting_delta, "\n", sep=""))
    }
  }else
  {
    het_observed      <- FALSE
    only_one_het_peak <- FALSE
    only_one_hom_peak <- TRUE
    message(paste("   Info: het observed set as false ==> heterozygous fitting not asked. \n", sep=""))
  }
  ##
  prefix <- ''
  samples          <- basename(countingfile)
  for (sample in samples)
  {
    message(paste("\nIterative fitting process for sample ", sample, " started... \n", sep=""))
    # all accessions -- where i can change and modify
    # initialize variables where outputs are recorded
    size_all       <-  0   # Gsize with               error-k-mer-freq
    size_exl       <-  0   # Gsize without            error-k-mer-freq
    size_fit       <-  0   # Gsize with              fitted-k-mer-freq
    size_cat       <-  0   # Gsize with fitted-cat-observed-k-mer-freq
    size_cor       <-  0   # Gsize corrected with size_cat and skewness; size_cor=size_cat/skewness (no)
    size_cor2      <-  0   # Gsize corrected with fitted-cat-observed-freq and upperbound freq 'e' (yes)
    i              <-  0   # the ith sample/accession
    kmer_peak_cnt  <-  0   # k-mer count at the peak position of k-mer counting
    kmer_peak_pos  <-  0   # genome-wide k-mer coverage
    kmer_peak_pos2 <-  0   # genome-wide k-mer coverage according to fitted curve
    labels         <- NA   # labels according input below
    namebank       <- NA
    border_pos     <-  1
    min_valid_pos  <-  1
    first_peak_pos <-  1
    first_mean     <-  0
    first_sd       <-  1
    first_mean_raw <-  0
    # initialize parameters for several iterations of fitting
    mymeanfit      <-  0
    mysdfit        <-  1
    myxifit        <-  1
    myscalefit     <-  1
    # special fit at itr 1
    norm_fit_itr1  <-  FALSE
    # open pdf to write visualization of fitting
    pdf(paste(path, '/',prefix, vers, 'est.', sample, '.sizek', targetsizek, '.curvefitted.pdf',
              sep=""))
    first_fit  <- TRUE
    for (sizek in targetsizek[1]) {
      normal   <- 0
      if(file.exists(countingfile)==TRUE)
      {
        dhet   <- read.table(countingfile)
        ## recover positions in dr whose initial counts are 0
        dr     <- initial_count_recover(dhet[1:length(dhet[,1]),])
        rm("dhet")
        dhet   <- dr
        # if heterozygosity oberved in k-mer freq,
        #    fit het and remove fitted values from subsequent hom-fitting
        selected    <- min(2000, length(dhet[,1]))
        if(selected < exp_hom_cov_max)
        {
          selected  <- min(round(1.5*exp_hom_cov_max), length(dhet[,1]))
        }
        main_peak_is_hom <- TRUE
        if(het_observed)
        {
          # initialize count as error for fitting
          error    <- dr
          message(paste("    Size ", sizek , " fitting for het k-mers \n", sep=""));
          # find hom and het peaks (if any).
          peaks    <- pracma::findpeaks(error[2:exp_hom_cov_max, 2])
          #peaks    <- filter_peaks(tmp_peaks, dhet)
          if(length(peaks) == 0)
          {
            plot(dhet[, 1], dhet[, 2],
                 col="gray",
                 xlab="K-mer freq", ylab="Number of k-mers",
                 xlim=c(1, 4*(which.max(dhet[5:selected, 2])+4)), ylim=c(1, 1.5*max(dhet[5:selected, 2])))
            text(1.5*(which.max(dhet[5:selected, 2])+4), 1.1*max(dhet[5:selected, 2]),
                 paste("Main peak (after freq=5) at k-mer freq: ",
                       which.max(dhet[5:selected, 2])+4,
                       sep=""))
            dev.off();
            stop("No k-mer freq peak was found with the cutoff given by -exp_hom.
                  You may need to increase the cutoff!")
            quit("no")
          }
          hom_peak_pos <- 0
          het_peak_pos <- 0
          message('enter ploidy equal to ',ploidy_ind)
          if(ploidy_ind == 1){
            hom_peak_pos <- peaks[which.max(peaks[,1]), 2]+1   # hom peak
            peaks[which.max(peaks[,1]), 1] <- 0
            het_peak_pos <- peaks[which.max(peaks[,1]), 2]+1   # het peak
            peaks[which.max(peaks[,1]), 1] <- 0
          }else if(ploidy_ind == 10){

            het_peak_pos <- 462 #ploidy_ind * avg_cov
            hom_peak_pos <- 615 #(ploidy_ind + 1) * avg_cov
          }
          else{
            het_peak_pos <- ploidy_ind * avg_cov
            hom_peak_pos <- (ploidy_ind + 1) * avg_cov
          }
          message('hom_peak_pos is: ',hom_peak_pos,'\n')
          message('het_peak_pos is: ',het_peak_pos,'\n')
          # if two peaks found, the larger one is hom; the other is het
          if(het_peak_pos > hom_peak_pos)
          {
            tmp               <- het_peak_pos
            het_peak_pos      <- hom_peak_pos
            hom_peak_pos      <- tmp
            main_peak_is_hom  <- FALSE
            message("    Warning: two peaks found in k-mer freq with given -exp_hom,
                  peak with lower height as hom-peak!\n")
          }
          if(het_peak_pos>0 & hom_peak_pos>0)
          {
            # in case two peask are near each other (e.g. it may happen with k-mer cov >100x)
            ratio        <- hom_peak_pos/het_peak_pos;
            if (round(ratio)<1.5) # nearly equal
            {
              hom_peak_pos <- max(hom_peak_pos, het_peak_pos)
              het_peak_pos <- min(hom_peak_pos, het_peak_pos)
            }
          }
          # if one peak  found, need to check if it is hom or het accordingly
          only_one_hom_peak <- FALSE
          only_one_het_peak <- FALSE
          if(het_peak_pos == hom_peak_pos)
          {
            unkownpeak    <- het_peak_pos
            if(unkownpeak >  der_het_cov_max)
            {
              het_peak_pos      <- round(ploidy_ind/(ploidy_ind+1)*hom_peak_pos)
              message("    Warning: only one peak in k-mer freq with given -exp_hom,
                  determined as hom-peak!\n")
              message("            -exp_hom needs to be increased
                  if you think it is a het peak.\n")
              only_one_hom_peak <- TRUE
            }else
              if(unkownpeak  < der_het_cov_max)
              {
                hom_peak_pos      <- (ploidy_ind+1)/ploidy_ind*het_peak_pos
                message("    Warning: only one peak in k-mer freq with given -exp_hom,
                    determined as het-peak!\n")
                message("          -exp_hom needs to be decreased
                    if you think it is a hom peak.\n")
                main_peak_is_hom  <- FALSE
                only_one_het_peak <- TRUE
              }else
              {
                plot(dhet[, 1], dhet[, 2],
                     col="gray",
                     xlab="K-mer freq", ylab="Number of k-mers",
                     xlim=c(1, selected), ylim=c(1, 1.5*max(dhet[3:selected, 2])))
                text(100, max(dhet[3:selected, 2]),
                     paste("Main peak at k-mer freq: ",
                           which.max(dhet[3:selected, 2])+2,
                           sep=""))
                dev.off()
                stop("Unexpected peaks found -
                     please check the raw k-mer curve provided in pdf,
                     and reset the option -exp_hom with a proper value x,
                     which must satisfy 2*hom_peak>x>hom_peak !\n\n");
              }
          }
          message('    Info: het_peak_pos for het fitting: ', het_peak_pos, '\n')
          message('    Info: hom_peak_pos for hom fitting: ', hom_peak_pos, '\n')
          het_peak_pos_save <- het_peak_pos
          hom_peak_pos_save <- hom_peak_pos
          het_min_valid_pos <- which.min(error[1:het_peak_pos, 2])
          if(ploidy_ind == 1){
            # fit het distribution
            het_xfit_left     <- max(min(het_min_valid_pos, het_peak_pos-10), 3)        # update on 20200129
            if(het_peak_pos   > 80)
            {
              het_xfit_left  <- floor(het_xfit_left+het_peak_pos*0.1473684)
            }
            het_xfit_right   <- het_peak_pos + round(0.5*(hom_peak_pos-het_peak_pos))   # update on 20200129
            if(het_peak_pos   > 80)
            {
              het_xfit_right <- floor(het_xfit_right-het_peak_pos*0.1473684)
            }
          }else if(ploidy_ind == 10){

            het_xfit_left <- 350 #ploidy_ind * avg_cov
            het_xfit_right <- 650 #(ploidy_ind + 1) * avg_cov
          }
          else{
            het_xfit_left <- het_peak_pos * left_fit_ratio  # 0.85  0.83
            het_xfit_right <- het_peak_pos * 1.15  # 1.15  1.13
          }
          if(het_xfit_right > het_peak_pos-1 - het_xfit_left + het_peak_pos-1)
          {
            het_xfit_right  <- het_peak_pos-1 - het_xfit_left + het_peak_pos-1
          }
          histx2    <- kmer_count_modify(het_xfit_left, het_peak_pos-1, 0, error)
          x         <- rep(1:het_xfit_right,ceiling(abs(histx2[1:het_xfit_right,2])/(100000/1)))
          xfit      <- seq(min(x),max(x),length=het_xfit_right)
          # optimizing parameters
          message('    Info: het_xfit_left  for het fitting: ', het_xfit_left, '\n')
          message('    Info: het_xfit_right for het fitting: ', het_xfit_right, '\n')
          het_xfit_right_save <- het_xfit_right
          if(ploidy_ind == 1 || scale_flag == TRUE){
            v<-optim(par=c(1, 1, 1, 0.8),
                     fn=error_minimize, x=x, end=het_xfit_right, xfit=xfit,
                     xfit_left=het_xfit_left, xfit_right=het_xfit_right,
                     d=histx2, min_valid_pos=het_min_valid_pos, itr=1)
            #
            # scale and correct yfit with optimized values in v$par
            xfit2     <- seq(0.01,selected,0.01)

            meanfit   <- mean(x[x>=het_xfit_left & x<=het_xfit_right])*v$par[3]
            meanfit_old <- meanfit
            message('first meanfit_old: ',meanfit_old,' \n')
            sdfit     <- sd(x[  x>=het_xfit_left & x<=het_xfit_right])*v$par[1] # raw sd
            sdfit_old <- sdfit

          }
          else if (scale_flag == FALSE){
            meanfit <- meanfit_old * ploidy_ind
            sdfit <- sdfit_old + sdfit_old*max(ploidy_ind - 1, 0)*0.1
            #sdfit <- max(sdfit, 7.6)
            message('meanfit for ploidy: ',ploidy_ind,' is: ',meanfit,'\n')
            v<-optim(par=c(1, 1, 1, 0.8),
                     fn=error_minimize3, x=x, end=het_xfit_right,
                     xfit_left=het_xfit_left, xfit_right=het_xfit_right,
                     d=histx2, min_valid_pos=het_min_valid_pos, itr=1, meanfit = meanfit, sdfit = sdfit_old)
            #
            message('optimize done...\n')
            # scale and correct yfit with optimized values in v$par
            xfit2     <- seq(0.01,selected,0.01)
            #meanfit <- meanfit_old * ploidy_ind
          }

          if(meanfit < 0)
          {
            message(paste("    Warning: data does not follow
                      assumed distribution anymore; fitting stopped.\n", sep=""))
          }
          # sdfit     <- sd(x[  x>=het_xfit_left & x<=het_xfit_right])*v$par[1] # raw sd
          #
          xifit     <- 1*v$par[4];
          #yfit0     <- dsnorm(xfit2,mean=meanfit,sd=sdfit, xi=xifit)*(100000/1)*v$par[2]*length(x)
          message('sdfit is: ',sdfit,'\n')
          yfit0     <- dnorm(xfit2,mean=meanfit,sd=sdfit)*(100000/1)*v$par[2]*length(x)
          #
          ## scale fitting according to observation
          hetfit    <- yfit0[seq(100, length(yfit0), 100)]
          hetfit    <- hetfit*dhet[het_peak_pos,2]/max(hetfit[1:selected])
          ## end scaling ##
          #
          # set d0 for further hom-fitting
          d0        <- dr
          if(only_one_hom_peak)
          {
            d0[1:selected , 2] <- d0[1:selected , 2] - (1-het_fitting_delta)*hetfit[1:selected]
          }else
          {
            d0[1:selected , 2] <- d0[1:selected , 2] - hetfit[1:selected]
          }
        }else
        {
          d0                 <- dr
          hetfit             <- 0
          hetfit[1:selected] <- 0
          het_min_valid_pos  <- 100000
          only_one_hom_peak  <- TRUE
        }
        i    <- i + 1
        if(i > length(targetsizek)) { break }
        rm("dr")
        # dr can be different from raw k-mer counting due to het fit
        dr   <- d0
        #
        # initialize count as error for fitting: initial error vector
        error        <- dr
        # set the right-bound for fitting
        maxright4fit <- dr[length(dr[,1])-1, 1]
        maxright4fit <- min(10000,length(dr[,1]))
        # set the number of total iterations corresponding
        #     to raw count given by softwares such as jellyfish
        totalitr     <- 2
        myitr        <-  0
        myyfit       <-  0
        homfit       <-  0
        #
        first_fit    <- TRUE
        itr = 1
        while(itr <= totalitr)
        {
          if(first_fit)
          {
            message(paste("    Size ", sizek , " at itr ", itr, "\n", sep=""))
            if(itr>=2 && 2*mymeanfit[itr-1] > maxright4fit) { myitr <- itr - 1; break;}
            if(itr == 1) # original data
            {
              # 1st-col    : height,
              # 2nd-col    : index of maximum,
              # 3rd/4th-col: indices that peak begins/ends
              if(het_observed)
              {
                # for  border_pos
                peaks           <- pracma::findpeaks(error[3:selected, 2])
                peaks[,2:4]     <- peaks[,2:4]+2
                border_pos      <- peaks[1, 3]          # caution: peaks[peakindex, 3]
                # for others: caution!
                if(main_peak_is_hom==F)
                {
                  peaks           <- pracma::findpeaks(error[round(0.5*(het_peak_pos+hom_peak_pos)):selected, 2])
                  peaks[,1:4]     <- peaks[,1:4]+round(0.5*(het_peak_pos+hom_peak_pos))-1
                }else
                {
                  peaks           <- pracma::findpeaks(error[het_peak_pos:selected, 2])
                  peaks[,1:4]     <- peaks[,1:4]+het_peak_pos-1
                }
              }
              else
              {
                peaks           <- pracma::findpeaks(error[3:selected, 2])
                peaks[,2:4]     <- peaks[,2:4]+2
                ## new on 2017-09-16
                peaks           <- filter_peaks(peaks, error)
                ## end 2017-09-16
                border_pos      <- peaks[1, 3]          # caution: peaks[peakindex, 3]
              }
              peakindex         <- which.max(peaks[,1]) # caution: if double peaks
              min_valid_pos     <- peaks[1, 3]          # caution: peaks[peakindex, 3]
              original_peak_pos <- peaks[peakindex, 2]
              original_peal_val <- peaks[peakindex, 1]
              if(het_observed && only_one_het_peak)
              {
                original_peak_pos <- (ploidy_ind+1)/ploidy_ind*het_peak_pos
                original_peal_val <- peaks[original_peak_pos, 1]
                min_valid_pos     <- het_peak_pos
              }
              datacheck         <- min(300, maxright4fit)
              dci               <- 0
              message('    Info: min_valid_pos: ', min_valid_pos,'\n')
              min_valid_pos_save = min_valid_pos
              while(min_valid_pos == original_peak_pos)
              {
                min_valid_pos     <- min_valid_pos + 1
                original_peak_pos <- which.max(error[min_valid_pos:maxright4fit,2])+(min_valid_pos-1)
                original_peal_val <- max(error[min_valid_pos:maxright4fit,2])
                dci               <- dci + 1
                if(dci > datacheck) { normal=1;break; }
              }
              if(dci   > datacheck)
              {
                message(paste("Error: weird initial count-data of sample ",
                          sample, ". Please check!!!\n",
                          sep=""))
                normal <- 1
                break
              }
              message('    Info: signal error border: ', border_pos,'\n')
              first_peak_pos  <- original_peak_pos
              # copy
              d               <- error
              # complement the left side of the peak position
              #    according to observations on the right side of the peak
              for (li in 1:(original_peak_pos-1))
              {
                d[li, 2] <- d[min(2*original_peak_pos-li, length(dr[,1])),2]
              }
              end <- min(original_peak_pos+round((original_peak_pos-min_valid_pos)*5/12),length(dr[,1]))
              if(end - original_peak_pos <= 3 && original_peak_pos>=10)
              {
                end      <- end + 5
              }
            }
            else # error reminder
            {
              # caution: valid kmer count is at least 3
              min_valid_pos     <- round(mymeanfit[itr-1])+3
              original_peak_pos <- which.max(error[min_valid_pos:maxright4fit, 2])+(min_valid_pos-1)
              original_peal_val <- max(error[min_valid_pos:maxright4fit, 2])
              datacheck         <- min(300, maxright4fit)
              dci               <- 0
              while(min_valid_pos == original_peak_pos)
              {
                min_valid_pos     <- min_valid_pos + 1
                original_peak_pos <- which.max(error[min_valid_pos:maxright4fit,2])+(min_valid_pos-1)
                original_peal_val <- max(error[min_valid_pos:maxright4fit,2])
                dci               <- dci + 1
                if(dci > datacheck) { break; }
              }
              if(dci   > datacheck)
              {
                message(paste("    Warning: weired initial count-data of sample ",
                          sample, ". Please check!!!\n", sep=""))
                normal   <- 0
                xlimmax  <- 100
                break
              }
              # copy
              d          <- abs(error)
              # complement the left side of the peak position according to observations on the right side of the peak
              thisnormal <- 0
              for (li in (original_peak_pos-round(mymeanfit[1])):(original_peak_pos-1)) {
                if(2*original_peak_pos-li > maxright4fit){ thisnormal<-1; break}
                d[li, 2] <- d[2*original_peak_pos-li,2]
              }
              if(thisnormal==1){normal=0; break;}
              end        <- min(original_peak_pos+round(mymeanfit[itr-1]), maxright4fit)
              if(end - original_peak_pos <= 3 && original_peak_pos>=10)
              {
                end      <- end + 5
              }
            }
          }
          else
          {
            # due to the large diff in the first fitting and original curve, increase end for fitting.
            end           <- min(2*original_peak_pos-1, maxright4fit)
            min_valid_pos <- min_valid_pos + 1 ##### 2016-09-16 new
            d             <- error
            first_fit     <- TRUE
          }
          # prepare the histogram data with replication
          forrep     <- ceiling(abs(d[1:end,2])/(100000/itr))
          anyNA      <- which(is.na(forrep))
          if (length(anyNA) >= 1)
          {
            d[anyNA, 2] <- dr[anyNA, 2]; # caution: missing values could happen here!!!
            message(paste("    Warning: ", length(anyNA),
                      " data-points of d has been replaced as 1 to properly prepare hisgram data x
                      for function optim at itr ", itr, "\n", sep=""))
          }
          x          <- rep(1:end,ceiling(abs(d[1:end,2])/(100000/itr)))
          xfit       <- seq(min(x),max(x),length=end)
          # normal fit
          matchset   <- '0'
          if(!is.na(match(sizek,matchset)))
          {
            xfit_left  <- min_valid_pos
            xfit_right <- 2*original_peak_pos-xfit_left
            if(first_peak_pos>=200)
            {
              #xfit_left  <- original_peak_pos - 50
              #xfit_right <- original_peak_pos + 50
              xfit_left  <- original_peak_pos * left_fit_ratio #- 50 0.85
              xfit_right <- original_peak_pos * 1.13 #+ 50 1.15
            }
            if (ploidy_ind == 1 || scale_flag == TRUE){
              v<-optim(par=c(1, 1, 1, 1),
                       fn=error_minimize, x=x, end=end, xfit=xfit,
                       xfit_left=xfit_left, xfit_right=xfit_right,
                       d=error,
                       min_valid_pos=min_valid_pos, itr=itr)
            }
            else if (scale_flag == FALSE){
              v<-optim(par=c(1, 1, 1, 1),
                       fn=error_minimize3, x=x, end=end,
                       xfit_left=xfit_left, xfit_right=xfit_right,
                       d=error,
                       min_valid_pos=min_valid_pos, itr=itr, meanfit = meanfit, sdfit = sdfit_old)
            }

          }
          else
          {
            xfit_left  <- min_valid_pos
            xfit_right <- 2*original_peak_pos-1
            if(first_peak_pos>=100 && itr<2)
            {
              xfit_left  <- original_peak_pos - 20
              xfit_right <- original_peak_pos + 20
            }else
              if(first_peak_pos>=30  && itr<2)
              {
                xfit_left  <- original_peak_pos - 10
                xfit_right <- original_peak_pos + 10
              }
            if(ploidy_ind == 1 || scale_flag == TRUE){
              v<-optim(par=c(1, 1, 1, 0.8),
                       fn=error_minimize, x=x, end=end, xfit=xfit,
                       xfit_left=xfit_left, xfit_right=xfit_right,
                       d=error,
                       min_valid_pos=min_valid_pos, itr=itr)
            }
            else if (scale_flag == FALSE){
              v<-optim(par=c(1, 1, 1, 0.8),
                       fn=error_minimize3, x=x, end=end,
                       xfit_left=xfit_left, xfit_right=xfit_right,
                       d=error,
                       min_valid_pos=min_valid_pos, itr=itr, meanfit = meanfit, sdfit = sdfit_old)
            }

          }
          message('    Info: hom_xfit_left  for hom fitting at itr ', itr,  ': ', xfit_left, '\n')
          message('    Info: hom_xfit_right for hom fitting at itr ', itr,  ': ', xfit_right, '\n')
          # if(itr == 1){
          #   xfit_left
          # }
          #
          # scale and correct yfit with optimized values in v$par
          xfit2     <- seq(0.01,maxright4fit,0.01)

          # if(ploidy_ind == 1){
          #   # meanfit   <- mean(x[x>=het_xfit_left & x<=het_xfit_right])*v$par[3]
          #   meanfit   <- mean(x[x>=xfit_left & x<=xfit_right])*v$par[3]
          #   meanfit_old <- meanfit
          #   message('last meanfit_old: ',meanfit_old,' \n')
          # }
          # else{
          #   meanfit <- meanfit_old * ploidy_ind
          # }

          meanfit   <- mean(x[x>=xfit_left & x<=xfit_right])*v$par[3]
          if(meanfit < 0)
          {
            message(paste("    Warning: data does not follow assumed distribution anymore at itr ",
                      itr, ", fitting stopped.\n", sep=""))
            normal  <- 0
            if(myyfit==0 && itr==1)
            {
              v<-optim(par=c(1, 1, 1),
                       fn=error_minimize3, x=x, end=end,
                       xfit_left=xfit_left, xfit_right=xfit_right,
                       d=error,
                       min_valid_pos=min_valid_pos, itr=itr, meanfit = meanfit, sdfit = sdfit_old )
              xfit2    <- seq(0.01,maxright4fit,0.01)
              yfit0    <- dnorm(xfit2,
                                mean=meanfit,
                                sd=sdfit)*(100000/itr)*v$par[2]*length(x)
              myyfit   <- myyfit + yfit0
              norm_fit_itr1 <- TRUE
              if(het_observed && itr == 1)
              {
                homfit  <- yfit0
              }
            }
            break
          } # stop if the data does not follow normal distribution anymore
          sdfit     <- sd(x[  x>=xfit_left & x<=xfit_right])*v$par[1]
          xifit     <- 1*v$par[4]
          #yfit0     <- dsnorm(xfit2,mean=meanfit,sd=sdfit, xi=xifit)*(100000/itr)*v$par[2]*length(x)
          yfit0     <- dnorm(xfit2,mean=meanfit,sd=sdfit)*(100000/itr)*v$par[2]*length(x)
          yfit      <- yfit0[seq(100, length(yfit0), 100)]
          ## repeat first fitting if fitting is not satisfactory
          if(itr==1 && end<min(2*original_peak_pos-1, maxright4fit))
          {
            fitting_check <- error[1:maxright4fit, 2] - yfit
            n_less_than_0 <- 0
            for(fci in c((original_peak_pos+3):min(round(2*original_peak_pos), length(dr[,1]))))
            {
              if(fitting_check[fci] < 0)
              {
                n_less_than_0 <- n_less_than_0 + 1
              }
            }
            if(n_less_than_0 >= 3)
            {
              first_fit <- FALSE
              message(paste("       Fitting has to be repeated: sizek ", sizek,
                        " at itr ", itr, "\n",
                        sep=""))
              i
              next
            }
          }
          # plot
          if(i == 1)
          {
            #plot(error[1:maxright4fit, 1], abs(error[1:maxright4fit, 2]),
            #      xlim=c(0, min(maxright4fit, 4*meanfit)), ylim=c(0, 1.5*max(error[xfit_left:maxright4fit, 2])),
            #      type='b', col="gray",
            #      xlab="K-mer Coverage", ylab="Genomewide k-mer Frequency")
            # title(main=paste('Example: fitting at itr ', itr, 'of sample ', sample, 'k=', sizek, ')'),
            #     cex.main=0.8)
            # lines(xfit2[100:length(xfit2)], yfit0[100:length(xfit2)], col="blue", lwd=2)
          }
          #### begin
          if(max(yfit) > 1.2*max(error[min_valid_pos:maxright4fit, 2]))
          {
            message(paste("    Note on hom fitting: fitting stopped at iter ",
                      itr, ", expected: ", totalitr, "\n",sep=""))
            if(length(myyfit) == 1){


            if(myyfit==0 && itr==1)
            {
              v<-optim(par=c(1, 1, 1),
                       fn=error_minimize3, x=x, end=end,
                       xfit_left=xfit_left, xfit_right=xfit_right,
                       d=error,
                       min_valid_pos=min_valid_pos, itr=itr, meanfit = meanfit,sdfit = sdfit_old )
              xfit2    <- seq(0.01,maxright4fit,0.01)
              yfit0    <- dnorm(xfit2,
                                mean=meanfit,
                                sd=sdfit)*(100000/itr)*v$par[2]*length(x)
              if(sum(yfit0) > 0)
              {
                myyfit   <- myyfit + yfit0
                norm_fit_itr1 <- TRUE
                if(het_observed && itr == 1)
                {
                  homfit  <- yfit0
                }
              }else # abnormal case: caution!: no fitting performed on hom fitting
              {
                yfit0    <- rep(0, length(xfit2))
                myyfit   <- myyfit + yfit0
                norm_fit_itr1 <- TRUE
                if(het_observed && itr == 1)
                {
                  homfit  <- yfit0
                }
              }
            }
            }
            break;
          }
          #### end
          # estimate error
          error[1:maxright4fit, 2] <- abs(error[1:maxright4fit, 2] - yfit)
          # get more accurate fitted values
          xfit2    <- seq(0.01,maxright4fit,0.01);
          #yfit0    <- dsnorm(xfit2,mean=meanfit,sd=sdfit, xi=xifit)*(100000/itr)*v$par[2]*length(x)


          yfit0    <- dnorm(xfit2,mean=meanfit,sd=sdfit)*(100000/itr)*v$par[2]*length(x)
          myyfit   <- myyfit + yfit0
          # record fitted parameters
          mymeanfit[itr]  <- meanfit
          mysdfit[itr]    <- sdfit
          myxifit[itr]    <- xifit
          myscalefit[itr] <- (100000/itr)*v$par[2]*length(x)
          if(itr == 1)
          {
            first_mean     <- meanfit
            first_sd       <- sdfit
          }
          #
          ## het est: new feature
          if(het_observed && itr == 1)
          {
            homfit  <- yfit0
          }
          ## end of het rate est: new feature
          #
          itr <- itr + 1
        }
        message("Iterative fitting done.\n\n")
        if(normal == 0)
        {
          # original count
          xlimmax <- min(3*first_peak_pos, selected)
          plot(dhet[, 1], abs(dhet[, 2]),
               xlim=c(0,xlimmax), ylim=c(0, 1.5*max(dhet[border_pos:xlimmax, 2])),
               type='b', col="gray",
               xlab="K-mer freq", ylab="Number of k-mers")
          title(main=paste('Sample ', sample, ' k=', sizek), cex.main=0.8) #
          # fitted count
          if(sum(hetfit[1:selected] > 0)) # plus het
          {
            # fitted peak
            fitted_peak_value <- which.max(myyfit)/100
            if(norm_fit_itr1 == TRUE)
            {
              fitted_peak_value <- hom_peak_pos
            }
            fittedyvalues <- myyfit[seq(100, length(myyfit), 100)]

            if(only_one_hom_peak)
            {
              # begin find optinum het_fitting_delta 2016-12-20
              h_target <- c(hetfit[1:min(border_pos, het_min_valid_pos)],
                            as.vector(dhet[min(border_pos, het_min_valid_pos)+1:(length(dr[,2])-min(border_pos, het_min_valid_pos)),2]))
              v2 <- optim(par=c(0.11, 1),
                          fn=error_minimize2,
                          h_het=hetfit[1:first_peak_pos],
                          h_hom=fittedyvalues[1:first_peak_pos],
                          h_target=h_target[1:first_peak_pos])
              ## update delta
              het_fitting_delta <- v2$par[1]
              message(paste("    Info: het_fitting_delta optimized as ", het_fitting_delta, "\n\n", sep=""))
              #message(paste("    Info: hom_fitting_delta optimized as ", v2$par[2],         "\n\n", sep=""))
              # end   find optinum het_fitting_delta 2016-12-20
            }

            if(only_one_hom_peak)
            {
              fittedyvalues[1:selected] <- fittedyvalues[1:selected] + (1-het_fitting_delta)*hetfit[1:selected]
            }else
            {
              fittedyvalues[1:selected] <- fittedyvalues[1:selected] + hetfit[1:selected]
            }
            lines(1:length(fittedyvalues), fittedyvalues, col="blue", lwd=5)
            # het
            if(only_one_hom_peak)
            {
              lines(1:selected, (1-het_fitting_delta)*hetfit[1:selected], col="cyan", lwd=2, lty="dashed")
            }
            else
            {
              lines(1:selected,  hetfit[1:selected], col="cyan", lwd=2, lty="dashed")
              options(scipen=999)
              het_fit <- cbind(1:selected,  round(hetfit[1:selected]))
              write.table(het_fit, file = paste(path, '/',prefix, vers, 'est.', sample, ".genome.size.estimated.k", min(targetsizek), 'to', max(targetsizek),".fitted_hetfit_count.txt", sep=""), sep = " ", quote=F, row.names = F, col.names = F)
              message(paste0('writing hetfit_count file done!!!\n het_fit length: ',dim(het_fit)[1]," lines.\n"))
              options(scipen=0)
            }
          }
          else
          {
            lines(xfit2[100:length(xfit2)], myyfit[100:length(xfit2)], col="blue", lwd=5)
            # fitted peak
            fitted_peak_value <- which.max(myyfit)/100
            fittedyvalues     <- myyfit[seq(100, length(myyfit), 100)]
          }
          # error rate of missing fitting: from raw k-mer counting dhet which includes het k-mers
          sum(fittedyvalues[border_pos:maxright4fit]-dhet[border_pos:maxright4fit,2])/sum(dhet[border_pos:maxright4fit,2])
          # fitted catting original with border_pos <- min_valid_pos; from raw k-mer counting dhet which includes het k-mers
          yfit2 <- c(fittedyvalues[1:min(border_pos, het_min_valid_pos)],
                     as.vector(dhet[min(border_pos, het_min_valid_pos)+1:(length(dr[,2])-min(border_pos, het_min_valid_pos)),2]))
          #
          lines(dhet[, 1], yfit2, col="red", lwd=1.5)
          #title(main=paste('Sample ', sample, ' k=', sizek, ')'), cex.main=0.8)
          #
          # calculate genome size (Mb) according to original data
          # with all kmers
          genome_size                 <- round(sum(dr[,1]*dhet[,2]/first_peak_pos))
          # with valid kmers: caution: 1.peak-value 2. valid kmer with cov >= min_valid_pos (or border_pos)
          genome_size_filtering_error <- round(sum(dr[border_pos:length(dhet[,1]),1]*dhet[border_pos:length(dr[,1]),2]/first_peak_pos))
          ##
          if(het_observed)
          {
            genome_size                 <- round(sum(dr[,1]*dhet[,2]/hom_peak_pos))
            genome_size_filtering_error <- round(sum(dr[border_pos:length(dhet[,1]),1]*dhet[border_pos:length(dr[,1]),2]/hom_peak_pos))
          }
          # genomic kmers
          # message(paste("        err-exl genomic ", sizek, "mers: ", sum(dhet[border_pos:length(dhet[,2]),2]), " Mio\n", sep=""));
          # fitted data with fitted peak
          genome_size_fitted <- round(sum(c(1:length(fittedyvalues))*fittedyvalues[1:length(fittedyvalues)]/fitted_peak_value))
          # genomic kmers
          # message(paste("        fit     genomic ", sizek, "mers: ", round(sum(fittedyvalues)), " Mio\n", sep=""));
          # catted data with fitted peak
          genome_size_catted <- round(sum(c(1:length(yfit2))*yfit2[1:length(yfit2)]/fitted_peak_value))
          # genomic kmers
          # message(paste("        catted  genomic ", sizek, "mers: ", round(sum(yfit2)), " Mio\n", sep=""));
          # corrected genome size (not meaningful)
          genome_size_corrected  <- round(genome_size_catted/myxifit[1])
          # genome size: catted data + raw mean
          if(sum(hetfit[1:selected] > 0))
          {
            end_for_mean <- round(fitted_peak_value*2 - fitted_peak_value*2/4) # >7% error; k should be small
          }else
            if((sizek=="15" || sizek=="17") && myxifit[1]>=0.93 && myxifit[1]<=1.07) # normal case: reduce effect of copy-2 k-mers
            {
              end_for_mean <- round(fitted_peak_value*2 - fitted_peak_value*1/4)
            }else
              if(myxifit[1]  > 1.07)
              {
                end_for_mean <- round(fitted_peak_value*2 + fitted_peak_value*2/4)
              }else
                if(myxifit[1]  < 0.93)
                {
                  end_for_mean <- round(fitted_peak_value*2 - fitted_peak_value*4/8)
                }else
                {
                  end_for_mean <- round(fitted_peak_value*2)
                }
          #### caution: end_for_mean should be not larger than vector size
          end_for_mean <- min(end_for_mean, length(dr[,1]))       ####
          ####
          if(het_observed && only_one_het_peak) ## new ##
          {
            end_for_mean     <- round(het_peak_pos*4 - het_peak_pos*4/4)
            end_for_mean     <- min(end_for_mean, length(dr[,1])) ####
            #message("end_for_mean (only one het peak): ", end_for_mean, '\n')
            message("hetfit: ceiling(abs(round(hetfit[1:end_for_mean]))/1000): ",ceiling(abs(round(hetfit[1:end_for_mean]))/1000),"\n")
            xtmp             <- rep(1:end_for_mean,ceiling(abs(round(hetfit[1:end_for_mean]))/1000))
            first_mean_raw   <- 2*mean(xtmp[xtmp>=1 & xtmp<=end_for_mean])
          }else
            if(het_observed & main_peak_is_hom==F)
            {
              het_end_for_mean <- first_peak_pos
              het_end_for_mean <- min(het_end_for_mean, length(dr[,1])) ####
              xtmp             <- rep(1:het_end_for_mean,ceiling(abs(round(hetfit[1:het_end_for_mean]))/1000))
              first_mean_raw   <- 2*mean(xtmp[xtmp>=1 && xtmp<=het_end_for_mean])
            }else
              if(het_observed && only_one_hom_peak) # caution: expert suppl.
              {
                dtmp           <- ceiling(abs(round(yfit2[1:end_for_mean]-(1-het_fitting_delta)*hetfit[1:end_for_mean]))/1000) # 2016-08-25
                ## start of specific in v1.94: from [0.5*het_peak, 1.5*het_peak]
                offset_count   <- round((fittedyvalues[1:end_for_mean]-yfit2[1:end_for_mean])/1000)
                offl <- round(0.5*which.max(hetfit))
                offr <- round(1.5*which.max(hetfit))
                for (off_i in c(offl:offr))
                {
                  if(offset_count[off_i] > 0)
                  {
                    dtmp[off_i] <- dtmp[off_i] + offset_count[off_i]
                  }
                  else
                  {
                    dtmp[off_i] <- dtmp[off_i] - offset_count[off_i]
                  }
                }
                ## end   of specific in v1.94
                xtmp           <- rep(1:end_for_mean,dtmp)
                first_mean_raw <- mean(xtmp[xtmp>=1 & xtmp<=end_for_mean])
              }else
              {
                xtmp           <- rep(1:end_for_mean,ceiling(abs(round(yfit2[1:end_for_mean]-hetfit[1:end_for_mean]))/1000))
                first_mean_raw <- mean(xtmp[xtmp>=1 & xtmp<=end_for_mean]);
              }
          #
          genome_size_corrected2 <- round(sum(c(1:length(yfit2))*yfit2[1:length(yfit2)]/first_mean_raw))
          #
          #
          ## heterozygosity est: new feature - 2016-11-18
          if(het_observed)
          {
            # alpha = 1 / [ (O/E)*K + K ]
            # het
            het_end <- round(first_mean_raw)
            hom_end <- min(round(2*first_mean_raw), selected)
            if(only_one_hom_peak)
            {
              # only for humans: do the correction for those k-mers from X-Y
              het_correction <- 0
              if(species=="human")
              {
                het_correction <- (155270560 - 59373566)*first_mean_raw/2;
              }
              #sum(c(1:het_end)*(1-het_fitting_delta)*hetfit[1:het_end])
              alpha     <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/  sum(seq(1:het_end)*hetfit[1:het_end])/(1-het_fitting_delta)*as.numeric(sizek) + as.numeric(sizek) )
              alpha_cor <- 0
              alpha_cor <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/ (sum(seq(1:het_end)*hetfit[1:het_end])-het_correction)/(1-het_fitting_delta)*as.numeric(sizek) + as.numeric(sizek) )
            }
            else
            {
              # only for humans: do the correction for those k-mers from X-Y
              het_correction <- 0
              if(species=="human")
              {
                het_correction <- (155270560 - 59373566)*first_mean_raw/2;
              }
              #sum(c(1:het_end)*hetfit[1:het_end])
              alpha     <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/  sum(seq(1:het_end)*hetfit[1:het_end])*as.numeric(sizek) + as.numeric(sizek) )
              alpha_cor <- 0
              alpha_cor <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/ (sum(seq(1:het_end)*hetfit[1:het_end])-het_correction )*as.numeric(sizek) + as.numeric(sizek) )
            }
          }
          ## end of heterozygosity rate est: new feature - 2016-11-18
          #
          #
          # single copy region of genome
          singlestart = 1
          singleend   = min(round(2*first_mean_raw),length(dr[,1])-1) #### caution: out of bounds
          singlesize  = sum(dhet[singlestart:singleend, 1]*yfit2[singlestart:singleend]/first_mean_raw)
          # begin: new on 2016-08-08
          repsize  = round(sum(dhet[(singleend+1):length(yfit2), 1]*yfit2[(singleend+1):length(yfit2)]/first_mean_raw))
          #legend(45, 1.36*max(dr[border_pos:60, 2]),
          if(het_observed)
          {
            legend("topright",
                   fill   = c('gray','white', 'white', 'cyan', 'white', 'blue',
                              'white', 'white', 'white', 'red', 'white', 'white'),
                   legend = c(paste('Observed (obs.): ',
                                    round(genome_size/10^6, digits = 3), ' Mb (error-excluded: ',
                                    round(genome_size_filtering_error/10^6, digits = 3), ' Mb)', sep=""),
                              paste('* k-mer_cov obs.: ',     first_peak_pos, sep=""),
                              paste('* signal_error_border ', het_min_valid_pos,                 sep=""),
                              paste('Heterozygous fitting (intermediate info)'),
                              paste('* het rate: ',
                                    round(alpha, digits=8),  " (ori) : (cor) ",
                                    round(alpha_cor, digits = 8),                                sep=""),
                              paste('Fitted count with fitted k-mer_cov: ',
                                    round(genome_size_fitted/10^6, digits = 3), ' Mb',           sep=""),
                              paste('* k-mer_cov fit.: ',
                                    round(fitted_peak_value, digits=2),                          sep=""),
                              paste('* 1st-sd:',
                                    round(first_sd, digits=2),                                   sep=""),
                              paste('* 1st-skewness: ',
                                    round(myxifit[1], digits=2),                                 sep=""),
                              paste('Fitted+obs. with corrected k-mer_cov: ',
                                    round(genome_size_corrected2/10^6, digits = 3), ' Mb',       sep=""),
                              paste('* k-mer_cov cor.: ',
                                    round(first_mean_raw, digits=2),                             sep=""),
                              paste('* repetitive_ratio ',
                                    round(repsize/genome_size_corrected2, digits=2), ': ',
                                    round(repsize/10^6, digits = 3), ' Mb',                      sep="")),
                   border="NA",
                   cex=0.7)
          }
          else
          {
            legend("topright",
                   fill   = c('gray','white', 'white', 'blue', 'white',
                              'white', 'white', 'red', 'white', 'white'),
                   legend = c(paste('Observed (obs.): ',
                                    round(genome_size/10^6, digits = 3), ' Mb (error-excluded: ',
                                    round(genome_size_filtering_error/10^6, digits = 3), ' Mb)',               sep=""),
                              paste('* k-mer_cov obs.: ',     first_peak_pos, sep=""),
                              paste('* signal_error_border ', border_pos,                     sep=""),
                              paste('Fitted count with fitted k-mer_cov: ',
                                    round(genome_size_fitted/10^6, digits = 3), ' Mb',                         sep=""),
                              paste('* k-mer_cov fit.: ',     round(fitted_peak_value, digits=2),              sep=""),
                              paste('* 1st-sd:',              round(first_sd, digits=2),                       sep=""),
                              paste('* 1st-skewness: ',       round(myxifit[1], digits=2),                     sep=""),
                              paste('Fitted+obs. with corrected k-mer_cov: ',
                                    round(genome_size_corrected2/10^6, digits = 3), ' Mb',                     sep=""),
                              paste('* k-mer_cov cor.: ',     round(first_mean_raw, digits=2),                 sep=""),
                              paste('* repetitive_ratio ',
                                    round(repsize/genome_size_corrected2, digits=2), ': ',
                                    round(repsize/10^6, digits = 3), ' Mb',                                    sep="")),
                   border="NA",
                   cex=0.7)
          }
          # end: new on 2016-08-08
          # genome sizes
          size_all[i]  <- genome_size
          size_exl[i]  <- genome_size_filtering_error
          size_fit[i]  <- genome_size_fitted
          size_cat[i]  <- genome_size_catted
          size_cor[i]  <- genome_size_corrected
          size_cor2[i] <- genome_size_corrected2
          # rbind(size_all, size_exl, size_fit, size_cat)
          # genome-wide average kmer coverage related info
          kmer_peak_cnt[i]  <- dr[first_peak_pos, 2]
          kmer_peak_pos[i]  <- first_peak_pos
          kmer_peak_pos2[i] <- fitted_peak_value
        }
        # labels for later plotting purpose
        labels[3*i-1]  <- paste("", sizek, sep="")
        namebank[i]    <- sizek
      }
      else
      {
        dev.off()
        stop(paste("file ", countingfile, " not found!",sep=""))
        quit("no")
      }
    }
    # summarize all kmer counting plots
    ylimmax <- 0
    for (sizek in targetsizek[1]) {
      histx<-read.table(countingfile)
      peaks2             <- pracma::findpeaks(histx$V2[3:selected])
      peaks2[,2:4]       <- peaks2[,2:4]+2
      if(1.5*max(peaks2[,1]) > ylimmax)
      {
        ylimmax <- 1.5*max(peaks2[,1])
      }
    }
    # colors
    colors <- rainbow(length(size_all))
    #
    ## plot comparision among gsize-estimations using different info
    genome_size_summary  <-rbind(size_all, size_exl, size_cat, size_fit, size_cor2)
    genome_size_summary2 <-rbind(size_exl, size_fit, size_cor2)
    message(paste("Genome size estimate for ", sample, ": ", size_cor2, " bp.\n\n", sep=""))
    # caution
    if(length(which(is.na(genome_size_summary))) > 0)
    {
      message("\n caution: some samples have NA predicted for observed genome size!\n")
      genome_size_summary[is.na(genome_size_summary)]   <- 0
    }
    if(length(which(is.na(genome_size_summary2))) > 0)
    {
      message("\n caution: some samples have NA predicted for fitted   genome size!\n")
      genome_size_summary2[is.na(genome_size_summary2)] <- 0
    }
    if(length(genome_size_summary[1,])!=0 & length(genome_size_summary2[1,])!=0)
    {
      # record in file
      write.table(genome_size_summary,
                  paste(path, '/',prefix, vers, 'est.', sample, ".genome.size.estimated.k",
                        min(targetsizek), 'to', max(targetsizek), ".fitted.txt", sep=""),
                  quote = FALSE, row.names = TRUE, col.names=FALSE)
      # bar plot
      mingsize <- min(genome_size_summary2[!is.na(genome_size_summary2)])
      maxgsize <- max(genome_size_summary2[!is.na(genome_size_summary2)])
      colors <- grey((2:0)/3)
      mp <- barplot(genome_size_summary2, beside=TRUE, col=colors,
                    xlab=paste("Size k", sep=""), ylab="Genome size (bp)",
                    axes = FALSE, axisnames  = FALSE, border=NA,
                    ylim=c(0.5*max(genome_size_summary2),1.2*max(genome_size_summary2)),
                    xpd=FALSE)
      legend("topright",
             legend = c(paste('EXL: ', round(mean(genome_size_summary2[1,]/10^6), digits=3), 'Mb'),
                        paste('FIT: ', round(mean(genome_size_summary2[2,]/10^6), digits=3), 'Mb'),
                        paste('COR: ', round(mean(genome_size_summary2[3,]/10^6), digits=3), 'Mb')),
             fill = colors, cex=.8, border=NA)
      text(mp, par("usr")[3], labels = labels, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=.8)
      axis(2, cex.axis=.8)
      title(main=paste("Genome size estimation by error-excluding, fitting, and correcting\n", sep=""))
      dev.off()
    }
  } # end of sample
  # output est. on het if asked.
  if(het_observed)
  {
    write(paste("Het_rate ", round(alpha, digits = 8), " ", round(alpha_cor, digits = 8), sep=""),
          file = paste(path, '/',prefix, vers, 'est.',
                       sample, ".genome.size.estimated.k",
                       min(targetsizek), 'to', max(targetsizek),
                       ".fitted.txt", sep=""),
          append = TRUE,
          sep = " ")
  }
  # output est. on ratio of repeats
  write(paste("Est. ratio of repeats ", round(repsize/genome_size_corrected2, digits=8), sep=""),
        file = paste(path, '/',prefix, vers, 'est.',
                     sample, ".genome.size.estimated.k",
                     min(targetsizek), 'to', max(targetsizek),
                     ".fitted.txt", sep=""),
        append = TRUE,
        sep = " ")
  # output est. on avg k-mer cov
  write(paste("Final k-mer cov ", round(first_mean_raw, digits=8), sep=""),
        file = paste(path, '/',prefix, vers, 'est.',
                     sample, ".genome.size.estimated.k",
                     min(targetsizek), 'to', max(targetsizek),
                     ".fitted.txt", sep=""),
        append = TRUE,
        sep = " ")
  if(het_observed && only_one_hom_peak)
  {
    write(paste("het_fitting_delta ", round(het_fitting_delta, digits=8), sep=""),
          file = paste(path, '/',prefix, vers, 'est.',
                       sample, ".genome.size.estimated.k",
                       min(targetsizek), 'to', max(targetsizek),
                       ".fitted.txt", sep=""),
          append = TRUE,
          sep = " ")
  }
  #
  end_t <- Sys.time()
  message('Time consumed: ', format(end_t-start_t), '\n')
  return(list(het_xfit_right_save,hom_peak_pos_save,het_peak_pos_save,meanfit_old, sdfit_old))
}


## additional
#' Filter Peaks from K-mer Histogram
#'
#' This function filters peaks from a k-mer histogram to find a major peak with enough support information on both sides.
#'
#' @param peaks A data frame containing the peaks from the histogram.
#' @param histo A data frame representing the k-mer histogram.
#' @return A data frame with filtered peaks.
#' @examples
#' peaks <- data.frame(V1=1:20, V2=sample(1:10, 20, replace=TRUE))
#' histo <- data.frame(V1=1:100, V2=sample(1:10, 100, replace=TRUE))
#' filtered_peaks <- filter_peaks(peaks, histo)
#' print(filtered_peaks)
#' @export
filter_peaks <- function(peaks, histo)
{
  # find out a major peak with enough support information on both sides (to exlcude potential local maxima)
  pos <- 1 # peak position (namely, the k-mer freq at the peak)
  i   <- 0 # traverse candidate peaks
  # a peak freq=pos should start at least from 5, or else too small to perform curve fitting
  while(pos < 5 & i < length(peaks[,1]))
  {
    i <- i+1
    pos <- peaks[i, 2] # x-axis peak
  }
  if(pos < 5)
  {
    stop(paste("   Error: k-mer coverage seemed too low to perform fitting. Minimum average k-mer coverage 5 required. Program exited.\n",sep=""))
  }
  # check if there are enough support on both sides of pos
  while(i < length(peaks[,1]))
  {
    # check left: how many on left are smaller than count at current peak
    posleft <- pos - 1
    j <- 0 # counter of smaller cases on left
    while(posleft>=1)
    {
      if(histo[posleft, 2] < histo[pos, 2])
      {
        j <- j + 1;
      }
      posleft <- posleft - 1
    }
    # check right: if there are right-side counts larger than count at the current peak
    posright <- pos + 1
    k <- 0 # counter of larger cases on right
    maxpos <- peaks[length(peaks[,1]), 2]
    while(posright < maxpos)
    {
      if(histo[posright, 2] > histo[pos, 2])
      {
        k <- k + 1
        if(k>4)
        {
          break
        }
      }
      posright <- posright + 1
    }
    # condition to continue or break
    if(j <= 4 || k>4)
    {
      i <- i + 1
      pos <- peaks[i, 2] # x-axis peak
    }
    else
    {
      break
    }
  }
  # filter out peaks near erroneous peak
  rpeaks <- peaks
  fpi <- 1
  while(rpeaks[fpi, 2] < pos/4)
  {
    rpeaks <- rpeaks[-fpi,]
  }
  return(rpeaks)
}

#' Filter Peaks from K-mer Histogram
#'
#' This function filters peaks from a k-mer histogram to find a major peak with enough support information on both sides.
#'
#' @param histo_data A data frame representing the k-mer histogram.
#' @return A data frame with filtered peaks.
#' @examples
#'
#' x <- seq(-10, 10, length.out = 100)
#' y <- dnorm(x)
#' histo_data <- data.frame(V1 = 1:100, V2 = y * 10)
#' get_het_pos(histo_data)
#'
#' @export
get_het_pos <- function(histo_data){
  data <- histo_data
  y_data <- data$V2

  # Calculate slope (second derivative)
  slope <- diff(y_data)

  # Find the index where slope becomes positive and its following 5 slopes remain positive or zero
  start_index <- NULL
  for (i in 2:(length(slope) - 4)) {
    if (slope[i] > 0 && all(slope[(i+1):(i+4)] >= 0)) {
      start_index <- i
      break
    }
  }
  message("start index: ",start_index,"\n")
  end_index <- NULL
  # Find the first interval where slope starts decreasing after positive slope
  for (i in (start_index + 1):(length(slope) - 1)) {
    if (slope[i] < 0 && slope[i+1] < 0) {
      end_index <- i
      break
    }
    else if (slope[i] >= 0 && slope[i] < slope[i-1] && slope[i] < slope[i+1] &&
             slope[i-1] < slope[i-2] && slope[i-2] < slope[i-3] &&
             slope[i+1] < slope[i+2] && slope[i+2] < slope[i+3]){
      if (is.null(end_index)) {
        end_index <- i
        break
      }
    }
  }

  #print(end_index)
  return(list(start_index ,end_index))
}

Try the findGSEP package in your browser

Any scripts or data that you put into this service are public.

findGSEP documentation built on May 29, 2024, 3:35 a.m.