findGSE: Estimating Genome Size by Fitting k-mer Frequencies in Reads with Skew Normal Distribution Model

Documented in findGSE

# findGSE: estimating genome size by fitting k-mer frequencies in short reads
#          with a skew normal distribution model.
#
# Copyright (C) 2016-2024 Hequan Sun (MPIPZ)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# For any questions on usage or potential bugs, please contact via schneeberger@mpipz.mpg.de or sunhequan@gmail.com .
#
##  Assumption and process
###  assume a skew normal distribution of kmer-freq in
###    the unique regions and subsequent repetitive regions
###  copy the observed values on the right side of the peak to form
###    a sequence (to initialize mean and variance of a skew-normal-distribution)
###  optimize the skew-normal dsitribution curve to fit the observed
###    values from proper 'xleft' to 'xright'
## Result: the estimate is corrected to be larger or smaller (due to
##   the left/right-shift of the fitted peak; estimated counts from 1, 2,...
##   can result in the increased estimates).
## with first fitting if 'heterozygous' known! -- to use fitting for heterozygous,
##   expected Kmer coverage for homozygous region must be provided.
## het_fitting_delta.
##
## output:
##  curve-fit .pdf: *curvefitted.pdf, and a corresponding
##            .txt: listing gs estimates (naive, fitted and final-corrected)
##                  and some other info like repetitive ration (and hetero-rate).

## function for minimizing difference from original observed kmer-freq dr: mean, sd, xi, and scaling factor ##
error_minimize<-function(tooptimize, x, end, xfit, xfit_left, xfit_right, d, min_valid_pos, itr)
{
  # x            : histogram replicated from d, which is formed from dr
  # xfit         : x-values to get the skew normal distribution
  # xfit-left    : a left-side  position to calculate initial mean and sd
  # xfit-right   : a right-side position to calculate initial mean and sd
  # d            : the observed kmer-freq that will be fitted
  # min_valid_pos: a left-side  position, from which the observed kmer-freq will be fitted
  # end          : a rigth-side position, till which the observed kmer-freq will be fitted
  sd_scale_factor    <- tooptimize[1]
  yfit_scale_factor  <- tooptimize[2]*100000/itr
  mean_scale_factor  <- tooptimize[3]
  xi_scale_factor    <- tooptimize[4]
  xfit               <- seq(min(x),max(x),length=end)
  meanfit            <- mean(x[x>=xfit_left & x<=xfit_right])*mean_scale_factor
  sdfit              <- sd(x[  x>=xfit_left & x<=xfit_right])*sd_scale_factor
  xifit              <- 1*xi_scale_factor
  #cat(paste("   Info: initialized mean as ", meanfit, "\n", sep=""))
  #cat(paste("   Info: initialized sd   as ", sdfit, "\n", sep=""))
  #cat(paste("   Info: initialized skew as ", xifit, "\n", sep=""))
  yfit               <- dsnorm(xfit,mean=meanfit,sd=sdfit, xi=xifit)*yfit_scale_factor*length(x)
  diff0              <- sqrt(sum((d[min_valid_pos:end, 2] - yfit[min_valid_pos:end])^2))
  #
  return(diff0)
}
# function for tunning final fitting if heterozgyous genomes
error_minimize2<-function(tooptimize, h_het, h_hom, h_target)
{
  # h_het   : raw fitting for the heterozygous region
  # h_hom   : raw fitting for the homozygous   region
  # h_target: before valley: fitted cnt, after valley: raw cnt
  het_delta <- 1-tooptimize[1]
  hom_delta <- 1 # reserved parameter: might be applied in future
  h_merge   <- het_delta*h_het + hom_delta*h_hom
  diff2     <- sum(abs(h_merge - h_target))
  #
  return(diff2)
}
## function for minimizing difference from remainder kmer-freq dr: mean, sd, and scaling factor ##
error_minimize3<-function(tooptimize, x, end, xfit_left, xfit_right, d, min_valid_pos, itr)
{
  # x            : histogram
  # xfit         : x-values to get the normal distribution
  # xfit-left    : a left-side  position to calculate initial mean and sd
  # xfit-right   : a right-side position to calculate initial mean and sd
  # d            : the observed kmer-freq that will be fitted
  # min_valid_pos: a left-side  position, from which the observed kmer-freq will be fitted
  # end          : a rigth-side position, till which the observed kmer-freq will be fitted
  sd_scale_factor    <- tooptimize[1]
  yfit_scale_factor  <- tooptimize[2]*100000/itr
  mean_scale_factor  <- tooptimize[3]
  xfit               <- seq(min(x),max(x),length=end)
  meanfit            <- mean(x[x>=xfit_left & x<=xfit_right])*mean_scale_factor
  sdfit              <- sd(x[  x>=xfit_left & x<=xfit_right])*sd_scale_factor
  yfit               <- dnorm(xfit,mean=meanfit,sd=sdfit)*yfit_scale_factor*length(x)
  diff0              <- sqrt(sum((d[min_valid_pos:end, 2] - yfit[min_valid_pos:end])^2))
  #
  return(diff0)
}
# recover count 0: in initial count, making kmer freq consecutive.
initial_count_recover <- function(d0)
{
  # dr: the initial kmer count from softwares like jellyfish
  dr <- cbind(1:d0[length(d0[,1]), 1], rep(0, d0[length(d0[,1]), 1]))
  i  <- 0
  while( i < length(d0[,1]))
  {
    i               <- i + 1
    dr[d0[i, 1], 2] <- d0[i, 2]
  }
  return (dr)
}
# modify kmer freq before fitting
kmer_count_modify <- function(start, end, left_right, histx)
{
  # start or end does not include the peak position
  # modify rigth part according to left
  if(left_right == 0)
  {
    for (i in start:end)
    {
      histx[2*end+2-i, 2]   <- histx[i, 2]
    }
  }else
  {
    for (i in start:end)
    {
      histx[2*start-2-i, 2] <- histx[i, 2]
    }
  }
  return (histx)
}
#
################################################## main #############################################################
#' @title Estimating genome size by fitting k-mer frequencies in short reads
#' with a skew normal distribution model.
#'
#' @description findGSE is a function for (heterozygous diploid or homozygous)
#' genome size estimation by fitting k-mer frequencies iteratively
#' with a skew normal distribution model. (version still under testing)
#'
#' @description To use findGSE, one needs to prepare a histo file,
#' which contains two tab-separated columns.
#' The first column gives frequencies at which k-mers occur in reads,
#' while the second column gives counts of such distinct k-mers.
#' Parameters k and related histo file are required for any estimation.
#'
#' @description Dependencies (R library) required: pracma, fGarch - see INSTALL.
#'
#' @description For heterozygous genomes, another parameter about
#' the average k-mer coverage for the homozygous regions must be provided.
#'
#' @param histo is the histo file (mandatory).
#' @param sizek is the size of k used to generate the histo file (mandatory).
#' K is involved in calculating heterzygosity if the genome is heterozygous.
#' @param outdir is the path to write output files (optional).
#' If not provided, by default results will be written in the folder
#' where the histo file is.
#' @param exp_hom a rough average k-mer coverage for finding the homozygous regions.
#' In general, one can get peaks in the k-mer frequencies file, but has to
#' determine which one is for the homozygous regions, and which one is for the
#' heterozygous regions. It is optional, however, it must be provided
#' if one wants to estimate size for a heterozygous genome.
#' VALUE for exp_hom must satisfy fp < VALUE < 2*fp, where fp is the freq for homozygous peak.
#' If not provided, 0 by default assumes the genome is homozygous.
#' @param species an optional parameter only applied in calculating heterozygosity for human.
#' This is used to indicate that (lx-ly)*hom_c/2 k-mers should be removed from het-kmers,
#' where lx is length of chromosome X, ly is length of chromosome Y, and hom_c is the
#' average k-mer coverage for the homozygous k-mers.
#' Two estimates will be provied as ORIGINAL_EST CORRECTED_EST:
#' for males,   select the second (CORRECTED_EST);
#' for females, select the first  (ORIGINAL_EST)..
#' @export
findGSE <- function(histo="", sizek=0, outdir="", exp_hom=0, species="")
{
  # initial values
  if(missing(histo))
  {
    stop(paste("Cannot find histo file: ", histo, ". Program exited.\n",sep=""))
  }
  if(missing(sizek))
  {
    stop(paste("Cannot find sizek info: ", sizek, ". Program exited.\n",sep=""))
  }
  # defaults
  if(missing(exp_hom))  exp_hom <- 0
  if(missing(outdir))   outdir  <- getwd()
  ######################################## libararies required ###############################################
  # find all peaks in a time series
  suppressWarnings(suppressMessages(library("pracma")))
  # provide skew normal distrbution fitting (dsnorm(...))
  suppressWarnings(suppressMessages(library("fGarch")))
  ## version id
  vers <- 'v1.94.'
  ##
  cat("\nfindGSE initialized...\n")
  ## running time
  start_t <- Sys.time()
  ## check args
  # expert-tunning het fitting (later it would be automatically adjusted)
  het_fitting_delta <- 0.11
  histof  <- F
  kpro    <- F
  if(histo != "") histof <- T
  if(sizek >  0)  kpro   <- T
  if(histof == F || kpro == F)
  {
    stop("Option -histo or -sizek was provided incorrectly. Program exited.\n")
  }
  ## default args
  countingfile    <- histo
  if(file.exists(countingfile)==F)
  {
    stop(paste("Cannot find histo file: ", countingfile, ". Program exited.\n",sep=""))
  }
  cat(paste("   Info: histo file provided as ", countingfile, "\n", sep=""))
  #
  targetsizek     <- sizek
  if(targetsizek  < 15)
  {
    stop(paste("Unexpected size of k: ", targetsizek, ". Size k>=15 required. Program exited.\n",sep=""))
  }
  cat(paste("   Info: size k set as ", targetsizek, "\n", sep=""))
  #
  path_set        <- F
  path            <- outdir
  if(path==".")
  {
    path <- getwd()
  }else
  if(path == "")
  {
     path         <- dirname(normalizePath(countingfile))
  }
  path_set       <- T
  cat(paste("   Info: output folder set as ", path, "\n", sep=""))
  #
  het_observed    <- F
  # expected maximum value for the hom k-mer peak! set as fpeak~2*fpeak
  exp_hom_cov_max <- exp_hom
  cat(paste("   Info: expected coverage of homozygous k-mers set as ", exp_hom_cov_max, "\n", sep=""))
  # derived  maximum value for the het k-mer peak!
  der_het_cov_max <- 0
  ## if output folder does not exist, create it; if existing, give a warning.
  dir.create(file.path(path), showWarnings = T)
  if(exp_hom_cov_max > 0)
  {
    het_observed      <- T
    # derived  maximum value for the het k-mer peak.
    der_het_cov_max   <- round(0.5*exp_hom_cov_max);
    cat(paste("   Info: het observed set as true ==> heterozygous fitting asked. \n", sep=""))
    if(het_fitting_delta != 0.11)
    {
      cat(paste("   Info: value for tuning het fitting set as ", het_fitting_delta, "\n", sep=""))
    }
  }else
  {
    het_observed      <- F
    only_one_het_peak <- F
    only_one_hom_peak <- T
    cat(paste("   Info: het observed set as false ==> heterozygous fitting not asked. \n", sep=""))
  }
  ##
  prefix <- ''
  samples          <- basename(countingfile)
  for (sample in samples)
  {
    cat(paste("\nIterative fitting process for sample ", sample, " started... \n", sep=""))
    # all accessions -- where i can change and modify
    # initialize variables where outputs are recorded
    size_all       <-  0   # Gsize with               error-k-mer-freq
    size_exl       <-  0   # Gsize without            error-k-mer-freq
    size_fit       <-  0   # Gsize with              fitted-k-mer-freq
    size_cat       <-  0   # Gsize with fitted-cat-observed-k-mer-freq
    size_cor       <-  0   # Gsize corrected with size_cat and skewness; size_cor=size_cat/skewness (no)
    size_cor2      <-  0   # Gsize corrected with fitted-cat-observed-freq and upperbound freq 'e' (yes)
    i              <-  0   # the ith sample/accession
    kmer_peak_cnt  <-  0   # k-mer count at the peak position of k-mer counting
    kmer_peak_pos  <-  0   # genome-wide k-mer coverage
    kmer_peak_pos2 <-  0   # genome-wide k-mer coverage according to fitted curve
    labels         <- NA   # labels according input below
    namebank       <- NA
    border_pos     <-  1
    min_valid_pos  <-  1
    first_peak_pos <-  1
    first_mean     <-  0
    first_sd       <-  1
    first_mean_raw <-  0
    # initialize parameters for several iterations of fitting
    mymeanfit      <-  0
    mysdfit        <-  1
    myxifit        <-  1
    myscalefit     <-  1
    # special fit at itr 1
    norm_fit_itr1  <-  F
    # open pdf to write visualization of fitting
    pdf(paste(path, '/',prefix, vers, 'est.', sample, '.sizek', targetsizek, '.curvefitted.pdf',
              sep=""))
    first_fit  <- TRUE
    for (sizek in targetsizek[1]) {
      normal   <- 0
      if(file.exists(countingfile)==TRUE)
      {
        dhet   <- read.table(countingfile)
        ## recover positions in dr whose initial counts are 0
        dr     <- initial_count_recover(dhet[1:length(dhet[,1]),])
        rm("dhet")
        dhet   <- dr
        # if heterozygosity oberved in k-mer freq,
        #    fit het and remove fitted values from subsequent hom-fitting
        selected    <- min(2000, length(dhet[,1]))
        if(selected < exp_hom_cov_max)
        {
          selected  <- min(round(1.5*exp_hom_cov_max), length(dhet[,1]))
        }
        main_peak_is_hom <- T
        if(het_observed)
        {
          # initialize count as error for fitting
          error    <- dr
          cat(paste("    Size ", sizek , " fitting for het k-mers \n", sep=""));
          # find hom and het peaks (if any).
          peaks    <- findpeaks(error[2:exp_hom_cov_max, 2])
          if(length(peaks) == 0)
          {
            plot(dhet[, 1], dhet[, 2],
                 col="gray",
                 xlab="K-mer freq", ylab="Number of k-mers",
                 xlim=c(1, 4*(which.max(dhet[5:selected, 2])+4)), ylim=c(1, 1.5*max(dhet[5:selected, 2])))
            text(1.5*(which.max(dhet[5:selected, 2])+4), 1.1*max(dhet[5:selected, 2]),
                 paste("Main peak (after freq=5) at k-mer freq: ",
                       which.max(dhet[5:selected, 2])+4,
                       sep=""))
            dev.off();
            stop("No k-mer freq peak was found with the cutoff given by -exp_hom.
                  You may need to increase the cutoff!")
            quit("no")
          }
          hom_peak_pos <- 0
          het_peak_pos <- 0
          hom_peak_pos <- peaks[which.max(peaks[,1]), 2]+1   # hom peak
          peaks[which.max(peaks[,1]), 1] <- 0
          het_peak_pos <- peaks[which.max(peaks[,1]), 2]+1   # het peak
          peaks[which.max(peaks[,1]), 1] <- 0
          # if two peaks found, the larger one is hom; the other is het
          if(het_peak_pos > hom_peak_pos)
          {
            tmp               <- het_peak_pos
            het_peak_pos      <- hom_peak_pos
            hom_peak_pos      <- tmp
            main_peak_is_hom  <- F
            cat("    Warning: two peaks found in k-mer freq with given -exp_hom,
                  peak with lower height as hom-peak!\n")
          }
          if(het_peak_pos>0 & hom_peak_pos>0)
          {
            # in case two peask are near each other (e.g. it may happen with k-mer cov >100x)
            ratio        <- hom_peak_pos/het_peak_pos;
            if (round(ratio)<1.5) # nearly equal
            {
              hom_peak_pos <- max(hom_peak_pos, het_peak_pos)
              het_peak_pos <- max(hom_peak_pos, het_peak_pos)
            }
          }
          # if one peak  found, need to check if it is hom or het accordingly
          only_one_hom_peak <- F
          only_one_het_peak <- F
          if(het_peak_pos == hom_peak_pos)
          {
            unkownpeak    <- het_peak_pos
            if(unkownpeak >  der_het_cov_max)
            {
              het_peak_pos      <- round(0.5*hom_peak_pos)
              cat("    Warning: only one peak in k-mer freq with given -exp_hom,
                  determined as hom-peak!\n")
              cat("            -exp_hom needs to be increased
                  if you think it is a het peak.\n")
              only_one_hom_peak <- T
            }else
              if(unkownpeak  < der_het_cov_max)
              {
                hom_peak_pos      <- 2*het_peak_pos
                cat("    Warning: only one peak in k-mer freq with given -exp_hom,
                    determined as het-peak!\n")
                cat("          -exp_hom needs to be decreased
                    if you think it is a hom peak.\n")
                main_peak_is_hom  <- F
                only_one_het_peak <- T
              }else
              {
                plot(dhet[, 1], dhet[, 2],
                     col="gray",
                     xlab="K-mer freq", ylab="Number of k-mers",
                     xlim=c(1, selected), ylim=c(1, 1.5*max(dhet[3:selected, 2])))
                text(100, max(dhet[3:selected, 2]),
                     paste("Main peak at k-mer freq: ",
                           which.max(dhet[3:selected, 2])+2,
                           sep=""))
                dev.off()
                stop("Unexpected peaks found -
                     please check the raw k-mer curve provided in pdf,
                     and reset the option -exp_hom with a proper value x,
                     which must satisfy 2*hom_peak>x>hom_peak !\n\n");
              }
          }
          cat('    Info: het_peak_pos for het fitting: ', het_peak_pos, '\n')
          cat('    Info: hom_peak_pos for hom fitting: ', hom_peak_pos, '\n')
          het_min_valid_pos <- which.min(error[1:het_peak_pos, 2])
          # fit het distribution
          het_xfit_left     <- max(min(het_min_valid_pos, het_peak_pos-10), 3)        # update on 20200129
          if(het_peak_pos   > 80)
          {
            het_xfit_left  <- floor(het_xfit_left+het_peak_pos*0.1473684)
          }
          het_xfit_right   <- het_peak_pos + round(0.5*(hom_peak_pos-het_peak_pos))   # update on 20200129
          if(het_peak_pos   > 80)
          {
            het_xfit_right <- floor(het_xfit_right-het_peak_pos*0.1473684)
          }
          cat('    Info: het_xfit_left  for het fitting: ', het_xfit_left, '\n')
          cat('    Info: het_xfit_right for het fitting: ', het_xfit_right, '\n')
          if(het_xfit_right > het_peak_pos-1 - het_xfit_left + het_peak_pos-1)
          {
            het_xfit_right  <- het_peak_pos-1 - het_xfit_left + het_peak_pos-1
          }
          histx2    <- kmer_count_modify(het_xfit_left, het_peak_pos-1, 0, error)
          x         <- rep(1:het_xfit_right,ceiling(abs(histx2[1:het_xfit_right,2])/(100000/1)))
          xfit      <- seq(min(x),max(x),length=het_xfit_right)
          # optimizing parameters
          v<-optim(par=c(1, 1, 1, 0.8),
                   fn=error_minimize, x=x, end=het_xfit_right, xfit=xfit,
                   xfit_left=het_xfit_left, xfit_right=het_xfit_right,
                   d=histx2, min_valid_pos=het_min_valid_pos, itr=1)
          #
          # scale and correct yfit with optimized values in v$par
          xfit2     <- seq(0.01,selected,0.01)
          meanfit   <- mean(x[x>=het_xfit_left & x<=het_xfit_right])*v$par[3]
          if(meanfit < 0)
          {
            cat(paste("    Warning: data does not follow
                      assumed distribution anymore; fitting stopped.\n", sep=""))
          }
          sdfit     <- sd(x[  x>=het_xfit_left & x<=het_xfit_right])*v$par[1] # raw sd
          #
          xifit     <- 1*v$par[4];
          yfit0     <- dsnorm(xfit2,mean=meanfit,sd=sdfit, xi=xifit)*(100000/1)*v$par[2]*length(x)
          #
          ## scale fitting according to observation
          hetfit    <- yfit0[seq(100, length(yfit0), 100)]
          hetfit    <- hetfit*dhet[het_peak_pos,2]/max(hetfit[1:selected])
          ## end scaling ##
          #
          # set d0 for further hom-fitting
          d0        <- dr
          if(only_one_hom_peak)
          {
            d0[1:selected , 2] <- d0[1:selected , 2] - (1-het_fitting_delta)*hetfit[1:selected]
          }else
          {
            d0[1:selected , 2] <- d0[1:selected , 2] - hetfit[1:selected]
          }
        }else
        {
          d0                 <- dr
          hetfit             <- 0
          hetfit[1:selected] <- 0
          het_min_valid_pos  <- 100000
          only_one_hom_peak  <- T
        }
        i    <- i + 1
        if(i > length(targetsizek)) { break }
        rm("dr")
        # dr can be different from raw k-mer counting due to het fit
        dr   <- d0
        #
        # initialize count as error for fitting: initial error vector
        error        <- dr
        # set the right-bound for fitting
        maxright4fit <- dr[length(dr[,1])-1, 1]
        maxright4fit <- min(10000,length(dr[,1]))
        # set the number of total iterations corresponding
        #     to raw count given by softwares such as jellyfish
        totalitr     <- 10
        myitr        <-  0
        myyfit       <-  0
        homfit       <-  0
        #
        first_fit    <- TRUE
        itr = 1
        while(itr <= totalitr)
        {
          if(first_fit)
          {
            cat(paste("    Size ", sizek , " at itr ", itr, "\n", sep=""))
            if(itr>=2 && 2*mymeanfit[itr-1] > maxright4fit) { myitr <- itr - 1; break;}
            if(itr == 1) # original data
            {
              # 1st-col    : height,
              # 2nd-col    : index of maximum,
              # 3rd/4th-col: indices that peak begins/ends
              if(het_observed)
              {
                # for  border_pos
                peaks           <- findpeaks(error[3:selected, 2])
                peaks[,2:4]     <- peaks[,2:4]+2
                border_pos      <- peaks[1, 3]          # caution: peaks[peakindex, 3]
                # for others: caution!
                if(main_peak_is_hom==F)
                {
                  peaks           <- findpeaks(error[round(0.5*(het_peak_pos+hom_peak_pos)):selected, 2])
                  peaks[,1:4]     <- peaks[,1:4]+round(0.5*(het_peak_pos+hom_peak_pos))-1
                }else
                {
                  peaks           <- findpeaks(error[het_peak_pos:selected, 2])
                  peaks[,1:4]     <- peaks[,1:4]+het_peak_pos-1
                }
              }
              else
              {
                peaks           <- findpeaks(error[3:selected, 2])
                peaks[,2:4]     <- peaks[,2:4]+2
                ## new on 2017-09-16
                peaks           <- filter_peaks(peaks, error)
                ## end 2017-09-16
                border_pos      <- peaks[1, 3]          # caution: peaks[peakindex, 3]
              }
              peakindex         <- which.max(peaks[,1]) # caution: if double peaks
              min_valid_pos     <- peaks[1, 3]          # caution: peaks[peakindex, 3]
              original_peak_pos <- peaks[peakindex, 2]
              original_peal_val <- peaks[peakindex, 1]
              if(het_observed && only_one_het_peak)
              {
                original_peak_pos <- 2*het_peak_pos
                original_peal_val <- peaks[original_peak_pos, 1]
                min_valid_pos     <- het_peak_pos
              }
              datacheck         <- min(300, maxright4fit)
              dci               <- 0
              cat('    Info: min_valid_pos: ', min_valid_pos,'\n')
              while(min_valid_pos == original_peak_pos)
              {
                min_valid_pos     <- min_valid_pos + 1
                original_peak_pos <- which.max(error[min_valid_pos:maxright4fit,2])+(min_valid_pos-1)
                original_peal_val <- max(error[min_valid_pos:maxright4fit,2])
                dci               <- dci + 1
                if(dci > datacheck) { normal=1;break; }
              }
              if(dci   > datacheck)
              {
                cat(paste("Error: weird initial count-data of sample ",
                          sample, ". Please check!!!\n",
                          sep=""))
                normal <- 1
                break
              }
              cat('    Info: signal error border: ', border_pos,'\n')
              first_peak_pos  <- original_peak_pos
              # copy
              d               <- error
              # complement the left side of the peak position
              #    according to observations on the right side of the peak
              for (li in 1:(original_peak_pos-1))
              {
                d[li, 2] <- d[min(2*original_peak_pos-li, length(dr[,1])),2]
              }
              end <- min(original_peak_pos+round((original_peak_pos-min_valid_pos)*5/12),length(dr[,1]))
              if(end - original_peak_pos <= 3 && original_peak_pos>=10)
              {
                end      <- end + 5
              }
            }
            else # error reminder
            {
              # caution: valid kmer count is at least 3
              min_valid_pos     <- round(mymeanfit[itr-1])+3
              original_peak_pos <- which.max(error[min_valid_pos:maxright4fit, 2])+(min_valid_pos-1)
              original_peal_val <- max(error[min_valid_pos:maxright4fit, 2])
              datacheck         <- min(300, maxright4fit)
              dci               <- 0
              while(min_valid_pos == original_peak_pos)
              {
                min_valid_pos     <- min_valid_pos + 1
                original_peak_pos <- which.max(error[min_valid_pos:maxright4fit,2])+(min_valid_pos-1)
                original_peal_val <- max(error[min_valid_pos:maxright4fit,2])
                dci               <- dci + 1
                if(dci > datacheck) { break; }
              }
              if(dci   > datacheck)
              {
                cat(paste("    Warning: weired initial count-data of sample ",
                          sample, ". Please check!!!\n", sep=""))
                normal   <- 0
                xlimmax  <- 100
                break
              }
              # copy
              d          <- abs(error)
              # complement the left side of the peak position according to observations on the right side of the peak
              thisnormal <- 0
              for (li in (original_peak_pos-round(mymeanfit[1])):(original_peak_pos-1)) {
                if(2*original_peak_pos-li > maxright4fit){ thisnormal<-1; break}
                d[li, 2] <- d[2*original_peak_pos-li,2]
              }
              if(thisnormal==1){normal=0; break;}
              end        <- min(original_peak_pos+round(mymeanfit[itr-1]), maxright4fit)
              if(end - original_peak_pos <= 3 && original_peak_pos>=10)
              {
                end      <- end + 5
              }
            }
          }
          else
          {
            # due to the large diff in the first fitting and original curve, increase end for fitting.
            end           <- min(2*original_peak_pos-1, maxright4fit)
            min_valid_pos <- min_valid_pos + 1 ##### 2016-09-16 new
            d             <- error
            first_fit     <- TRUE
          }
          # prepare the histogram data with replication
          forrep     <- ceiling(abs(d[1:end,2])/(100000/itr))
          anyNA      <- which(is.na(forrep))
          if (length(anyNA) >= 1)
          {
            d[anyNA, 2] <- dr[anyNA, 2]; # caution: missing values could happen here!!!
            cat(paste("    Warning: ", length(anyNA),
                      " data-points of d has been replaced as 1 to properly prepare hisgram data x
                      for function optim at itr ", itr, "\n", sep=""))
          }
          x          <- rep(1:end,ceiling(abs(d[1:end,2])/(100000/itr)))
          xfit       <- seq(min(x),max(x),length=end)
          # normal fit
          matchset   <- '0'
          if(!is.na(match(sizek,matchset)))
          {
            xfit_left  <- min_valid_pos
            xfit_right <- 2*original_peak_pos-xfit_left
            if(first_peak_pos>=200)
            {
              xfit_left  <- original_peak_pos - 50
              xfit_right <- original_peak_pos + 50
            }
            v<-optim(par=c(1, 1, 1, 1),
                     fn=error_minimize, x=x, end=end, xfit=xfit,
                     xfit_left=xfit_left, xfit_right=xfit_right,
                     d=error,
                     min_valid_pos=min_valid_pos, itr=itr)
          }
          else
          {
            xfit_left  <- min_valid_pos
            xfit_right <- 2*original_peak_pos-1
            if(first_peak_pos>=100 && itr<2)
            {
              xfit_left  <- original_peak_pos - 20
              xfit_right <- original_peak_pos + 20
            }else
            if(first_peak_pos>=30  && itr<2)
            {
              xfit_left  <- original_peak_pos - 10
              xfit_right <- original_peak_pos + 10
            }
            v<-optim(par=c(1, 1, 1, 0.8),
                     fn=error_minimize, x=x, end=end, xfit=xfit,
                     xfit_left=xfit_left, xfit_right=xfit_right,
                     d=error,
                     min_valid_pos=min_valid_pos, itr=itr)
          }
          cat('    Info: hom_xfit_left  for hom fitting at itr ', itr,  ': ', xfit_left, '\n')
          cat('    Info: hom_xfit_right for hom fitting at itr ', itr,  ': ', xfit_right, '\n')
          #
          # scale and correct yfit with optimized values in v$par
          xfit2     <- seq(0.01,maxright4fit,0.01)
          meanfit   <- mean(x[x>=xfit_left & x<=xfit_right])*v$par[3]
          if(meanfit < 0)
          {
            cat(paste("    Warning: data does not follow assumed distribution anymore at itr ",
                      itr, ", fitting stopped.\n", sep=""))
            normal  <- 0
            if(myyfit==0 && itr==1)
            {
              v<-optim(par=c(1, 1, 1),
                       fn=error_minimize3, x=x, end=end,
                       xfit_left=xfit_left, xfit_right=xfit_right,
                       d=error,
                       min_valid_pos=min_valid_pos, itr=itr)
              xfit2    <- seq(0.01,maxright4fit,0.01)
              yfit0    <- dnorm(xfit2,
                                mean=meanfit,
                                sd=sdfit)*(100000/itr)*v$par[2]*length(x)
              myyfit   <- myyfit + yfit0
              norm_fit_itr1 <- T
              if(het_observed && itr == 1)
              {
                homfit  <- yfit0
              }
            }
            break
          } # stop if the data does not follow normal distribution anymore
          sdfit     <- sd(x[  x>=xfit_left & x<=xfit_right])*v$par[1]
          xifit     <- 1*v$par[4]
          yfit0     <- dsnorm(xfit2,mean=meanfit,sd=sdfit, xi=xifit)*(100000/itr)*v$par[2]*length(x)
          yfit      <- yfit0[seq(100, length(yfit0), 100)]
          ## repeat first fitting if fitting is not satisfactory
          if(itr==1 && end<min(2*original_peak_pos-1, maxright4fit))
          {
            fitting_check <- error[1:maxright4fit, 2] - yfit
            n_less_than_0 <- 0
            for(fci in c((original_peak_pos+3):min(round(2*original_peak_pos), length(dr[,1]))))
            {
              if(fitting_check[fci] < 0)
              {
                n_less_than_0 <- n_less_than_0 + 1
              }
            }
            if(n_less_than_0 >= 3)
            {
              first_fit <- FALSE
              cat(paste("       Fitting has to be repeated: sizek ", sizek,
                        " at itr ", itr, "\n",
                        sep=""))
              i
              next
            }
          }
          # plot
          if(i == 1)
          {
            #plot(error[1:maxright4fit, 1], abs(error[1:maxright4fit, 2]),
            #      xlim=c(0, min(maxright4fit, 4*meanfit)), ylim=c(0, 1.5*max(error[xfit_left:maxright4fit, 2])),
            #      type='b', col="gray",
            #      xlab="K-mer Coverage", ylab="Genomewide k-mer Frequency")
            # title(main=paste('Example: fitting at itr ', itr, 'of sample ', sample, 'k=', sizek, ')'),
            #     cex.main=0.8)
            # lines(xfit2[100:length(xfit2)], yfit0[100:length(xfit2)], col="blue", lwd=2)
          }
          #### begin
          if(max(yfit) > 1.2*max(error[min_valid_pos:maxright4fit, 2]))
          {
            cat(paste("    Note on hom fitting: fitting stopped at iter ",
                      itr, ", expected: ", totalitr, "\n",sep=""))
            if(myyfit==0 && itr==1)
            {
              v<-optim(par=c(1, 1, 1),
                       fn=error_minimize3, x=x, end=end,
                       xfit_left=xfit_left, xfit_right=xfit_right,
                       d=error,
                       min_valid_pos=min_valid_pos, itr=itr)
              xfit2    <- seq(0.01,maxright4fit,0.01)
              yfit0    <- dnorm(xfit2,
                                mean=meanfit,
                                sd=sdfit)*(100000/itr)*v$par[2]*length(x)
              if(sum(yfit0) > 0)
              {
                myyfit   <- myyfit + yfit0
                norm_fit_itr1 <- T
                if(het_observed && itr == 1)
                {
                  homfit  <- yfit0
                }
              }else # abnormal case: caution!: no fitting performed on hom fitting
              {
                yfit0    <- rep(0, length(xfit2))
                myyfit   <- myyfit + yfit0
                norm_fit_itr1 <- T
                if(het_observed && itr == 1)
                {
                  homfit  <- yfit0
                }
              }
            }
            break;
          }
          #### end
          # estimate error
          error[1:maxright4fit, 2] <- abs(error[1:maxright4fit, 2] - yfit)
          # get more accurate fitted values
          xfit2    <- seq(0.01,maxright4fit,0.01);
          yfit0    <- dsnorm(xfit2,mean=meanfit,sd=sdfit, xi=xifit)*(100000/itr)*v$par[2]*length(x)
          myyfit   <- myyfit + yfit0
          # record fitted parameters
          mymeanfit[itr]  <- meanfit
          mysdfit[itr]    <- sdfit
          myxifit[itr]    <- xifit
          myscalefit[itr] <- (100000/itr)*v$par[2]*length(x)
          if(itr == 1)
          {
            first_mean     <- meanfit
            first_sd       <- sdfit
          }
          #
          ## het est: new feature
          if(het_observed && itr == 1)
          {
            homfit  <- yfit0
          }
          ## end of het rate est: new feature
          #
          itr <- itr + 1
        }
        cat("Iterative fitting done.\n\n")
        if(normal == 0)
        {
          # original count
          xlimmax <- min(3*first_peak_pos, selected)
          plot(dhet[, 1], abs(dhet[, 2]),
               xlim=c(0,xlimmax), ylim=c(0, 1.5*max(dhet[border_pos:xlimmax, 2])),
               type='b', col="gray",
               xlab="K-mer freq", ylab="Number of k-mers")
          title(main=paste('Sample ', sample, ' k=', sizek), cex.main=0.8) #
          # fitted count
          if(sum(hetfit[1:selected] > 0)) # plus het
          {
            # fitted peak
            fitted_peak_value <- which.max(myyfit)/100
            if(norm_fit_itr1 == T)
            {
              fitted_peak_value <- hom_peak_pos
            }
            fittedyvalues <- myyfit[seq(100, length(myyfit), 100)]

            if(only_one_hom_peak)
            {
              # begin find optinum het_fitting_delta 2016-12-20
              h_target <- c(hetfit[1:min(border_pos, het_min_valid_pos)],
                            as.vector(dhet[min(border_pos, het_min_valid_pos)+1:(length(dr[,2])-min(border_pos, het_min_valid_pos)),2]))
              v2 <- optim(par=c(0.11, 1),
                          fn=error_minimize2,
                          h_het=hetfit[1:first_peak_pos],
                          h_hom=fittedyvalues[1:first_peak_pos],
                          h_target=h_target[1:first_peak_pos])
              ## update delta
              het_fitting_delta <- v2$par[1]
              cat(paste("    Info: het_fitting_delta optimized as ", het_fitting_delta, "\n\n", sep=""))
              #cat(paste("    Info: hom_fitting_delta optimized as ", v2$par[2],         "\n\n", sep=""))
              # end   find optinum het_fitting_delta 2016-12-20
            }

            if(only_one_hom_peak)
            {
              fittedyvalues[1:selected] <- fittedyvalues[1:selected] + (1-het_fitting_delta)*hetfit[1:selected]
            }else
            {
              fittedyvalues[1:selected] <- fittedyvalues[1:selected] + hetfit[1:selected]
            }
            lines(1:length(fittedyvalues), fittedyvalues, col="blue", lwd=5)
            # het
            if(only_one_hom_peak)
            {
              lines(1:selected, (1-het_fitting_delta)*hetfit[1:selected], col="cyan", lwd=2, lty="dashed")
            }
            else
            {
              lines(1:selected,  hetfit[1:selected], col="cyan", lwd=2, lty="dashed")
            }
          }
          else
          {
            lines(xfit2[100:length(xfit2)], myyfit[100:length(xfit2)], col="blue", lwd=5)
            # fitted peak
            fitted_peak_value <- which.max(myyfit)/100
            fittedyvalues     <- myyfit[seq(100, length(myyfit), 100)]
          }
          # error rate of missing fitting: from raw k-mer counting dhet which includes het k-mers
          sum(fittedyvalues[border_pos:maxright4fit]-dhet[border_pos:maxright4fit,2])/sum(dhet[border_pos:maxright4fit,2])
          # fitted catting original with border_pos <- min_valid_pos; from raw k-mer counting dhet which includes het k-mers
          yfit2 <- c(fittedyvalues[1:min(border_pos, het_min_valid_pos)],
                     as.vector(dhet[min(border_pos, het_min_valid_pos)+1:(length(dr[,2])-min(border_pos, het_min_valid_pos)),2]))
          #
          lines(dhet[, 1], yfit2, col="red", lwd=1.5)
          #title(main=paste('Sample ', sample, ' k=', sizek, ')'), cex.main=0.8)
          #
          # calculate genome size (Mb) according to original data
          # with all kmers
          genome_size                 <- round(sum(dr[,1]*dhet[,2]/first_peak_pos))
          # with valid kmers: caution: 1.peak-value 2. valid kmer with cov >= min_valid_pos (or border_pos)
          genome_size_filtering_error <- round(sum(dr[border_pos:length(dhet[,1]),1]*dhet[border_pos:length(dr[,1]),2]/first_peak_pos))
          ##
          if(het_observed)
          {
            genome_size                 <- round(sum(dr[,1]*dhet[,2]/hom_peak_pos))
            genome_size_filtering_error <- round(sum(dr[border_pos:length(dhet[,1]),1]*dhet[border_pos:length(dr[,1]),2]/hom_peak_pos))
          }
          # genomic kmers
          # cat(paste("        err-exl genomic ", sizek, "mers: ", sum(dhet[border_pos:length(dhet[,2]),2]), " Mio\n", sep=""));
          # fitted data with fitted peak
          genome_size_fitted <- round(sum(c(1:length(fittedyvalues))*fittedyvalues[1:length(fittedyvalues)]/fitted_peak_value))
          # genomic kmers
          # cat(paste("        fit     genomic ", sizek, "mers: ", round(sum(fittedyvalues)), " Mio\n", sep=""));
          # catted data with fitted peak
          genome_size_catted <- round(sum(c(1:length(yfit2))*yfit2[1:length(yfit2)]/fitted_peak_value))
          # genomic kmers
          # cat(paste("        catted  genomic ", sizek, "mers: ", round(sum(yfit2)), " Mio\n", sep=""));
          # corrected genome size (not meaningful)
          genome_size_corrected  <- round(genome_size_catted/myxifit[1])
          # genome size: catted data + raw mean
          if(sum(hetfit[1:selected] > 0))
          {
            end_for_mean <- round(fitted_peak_value*2 - fitted_peak_value*2/4) # >7% error; k should be small
          }else
          if((sizek=="15" || sizek=="17") && myxifit[1]>=0.93 && myxifit[1]<=1.07) # normal case: reduce effect of copy-2 k-mers
          {
            end_for_mean <- round(fitted_peak_value*2 - fitted_peak_value*1/4)
          }else
            if(myxifit[1]  > 1.07)
            {
              end_for_mean <- round(fitted_peak_value*2 + fitted_peak_value*2/4)
            }else
              if(myxifit[1]  < 0.93)
              {
                end_for_mean <- round(fitted_peak_value*2 - fitted_peak_value*4/8)
              }else
              {
                end_for_mean <- round(fitted_peak_value*2)
              }
          #### caution: end_for_mean should be not larger than vector size
          end_for_mean <- min(end_for_mean, length(dr[,1]))       ####
          ####
          if(het_observed && only_one_het_peak) ## new ##
          {
            end_for_mean     <- round(het_peak_pos*4 - het_peak_pos*4/4)
            end_for_mean     <- min(end_for_mean, length(dr[,1])) ####
            #cat("end_for_mean (only one het peak): ", end_for_mean, '\n')
            xtmp             <- rep(1:end_for_mean,ceiling(abs(round(hetfit[1:end_for_mean]))/1000))
            first_mean_raw   <- 2*mean(xtmp[xtmp>=1 & xtmp<=end_for_mean])
          }else
            if(het_observed & main_peak_is_hom==F)
            {
              het_end_for_mean <- first_peak_pos
              het_end_for_mean <- min(het_end_for_mean, length(dr[,1])) ####
              xtmp             <- rep(1:het_end_for_mean,ceiling(abs(round(hetfit[1:het_end_for_mean]))/1000))
              first_mean_raw   <- 2*mean(xtmp[xtmp>=1 & xtmp<=het_end_for_mean])
            }else
              if(het_observed && only_one_hom_peak) # caution: expert suppl.
              {
                dtmp           <- ceiling(abs(round(yfit2[1:end_for_mean]-(1-het_fitting_delta)*hetfit[1:end_for_mean]))/1000) # 2016-08-25
                ## start of specific in v1.94: from [0.5*het_peak, 1.5*het_peak]
                offset_count   <- round((fittedyvalues[1:end_for_mean]-yfit2[1:end_for_mean])/1000)
                offl <- round(0.5*which.max(hetfit))
                offr <- round(1.5*which.max(hetfit))
                for (off_i in c(offl:offr))
                {
                  if(offset_count[off_i] > 0)
                  {
                    dtmp[off_i] <- dtmp[off_i] + offset_count[off_i]
                  }
                  else
                  {
                    dtmp[off_i] <- dtmp[off_i] - offset_count[off_i]
                  }
                }
                ## end   of specific in v1.94
                xtmp           <- rep(1:end_for_mean,dtmp)
                first_mean_raw <- mean(xtmp[xtmp>=1 & xtmp<=end_for_mean])
              }else
              {
                xtmp           <- rep(1:end_for_mean,ceiling(abs(round(yfit2[1:end_for_mean]-hetfit[1:end_for_mean]))/1000))
                first_mean_raw <- mean(xtmp[xtmp>=1 & xtmp<=end_for_mean]);
              }
          #
          genome_size_corrected2 <- round(sum(c(1:length(yfit2))*yfit2[1:length(yfit2)]/first_mean_raw))
          #
          #
          ## heterozygosity est: new feature - 2016-11-18
          if(het_observed)
          {
            # alpha = 1 / [ (O/E)*K + K ]
            # het
            het_end <- round(first_mean_raw)
            hom_end <- min(round(2*first_mean_raw), selected)
            if(only_one_hom_peak)
            {
              # only for humans: do the correction for those k-mers from X-Y
              het_correction <- 0
              if(species=="human")
              {
                het_correction <- (155270560 - 59373566)*first_mean_raw/2;
              }
              #sum(c(1:het_end)*(1-het_fitting_delta)*hetfit[1:het_end])
              alpha     <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/  sum(seq(1:het_end)*hetfit[1:het_end])/(1-het_fitting_delta)*as.numeric(sizek) + as.numeric(sizek) )
              alpha_cor <- 0
              alpha_cor <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/ (sum(seq(1:het_end)*hetfit[1:het_end])-het_correction)/(1-het_fitting_delta)*as.numeric(sizek) + as.numeric(sizek) )
            }
            else
            {
              # only for humans: do the correction for those k-mers from X-Y
              het_correction <- 0
              if(species=="human")
              {
                het_correction <- (155270560 - 59373566)*first_mean_raw/2;
              }
              #sum(c(1:het_end)*hetfit[1:het_end])
              alpha     <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/  sum(seq(1:het_end)*hetfit[1:het_end])*as.numeric(sizek) + as.numeric(sizek) )
              alpha_cor <- 0
              alpha_cor <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/ (sum(seq(1:het_end)*hetfit[1:het_end])-het_correction )*as.numeric(sizek) + as.numeric(sizek) )
            }
          }
          ## end of heterozygosity rate est: new feature - 2016-11-18
          #
          #
          # single copy region of genome
          singlestart = 1
          singleend   = min(round(2*first_mean_raw),length(dr[,1])-1) #### caution: out of bounds
          singlesize  = sum(dhet[singlestart:singleend, 1]*yfit2[singlestart:singleend]/first_mean_raw)
          # begin: new on 2016-08-08
          repsize  = round(sum(dhet[(singleend+1):length(yfit2), 1]*yfit2[(singleend+1):length(yfit2)]/first_mean_raw))
          #legend(45, 1.36*max(dr[border_pos:60, 2]),
          if(het_observed)
          {
            legend("topright",
                   fill   = c('gray','white', 'white', 'cyan', 'white', 'blue',
                              'white', 'white', 'white', 'red', 'white', 'white'),
                   legend = c(paste('Observed (obs.): ',
                                    round(genome_size/10^6, digits = 3), ' Mb (error-excluded: ',
                                    round(genome_size_filtering_error/10^6, digits = 3), ' Mb)', sep=""),
                              paste('* k-mer_cov obs.: ',     first_peak_pos, sep=""),
                              paste('* signal_error_border ', het_min_valid_pos,                 sep=""),
                              paste('Heterozygous fitting (intermediate info)'),
                              paste('* het rate: ',
                                    round(alpha, digits=8),  " (ori) : (cor) ",
                                    round(alpha_cor, digits = 8),                                sep=""),
                              paste('Fitted count with fitted k-mer_cov: ',
                                    round(genome_size_fitted/10^6, digits = 3), ' Mb',           sep=""),
                              paste('* k-mer_cov fit.: ',
                                    round(fitted_peak_value, digits=2),                          sep=""),
                              paste('* 1st-sd:',
                                    round(first_sd, digits=2),                                   sep=""),
                              paste('* 1st-skewness: ',
                                    round(myxifit[1], digits=2),                                 sep=""),
                              paste('Fitted+obs. with corrected k-mer_cov: ',
                                    round(genome_size_corrected2/10^6, digits = 3), ' Mb',       sep=""),
                              paste('* k-mer_cov cor.: ',
                                    round(first_mean_raw, digits=2),                             sep=""),
                              paste('* repetitive_ratio ',
                                    round(repsize/genome_size_corrected2, digits=2), ': ',
                                    round(repsize/10^6, digits = 3), ' Mb',                      sep="")),
                   border="NA",
                   cex=0.7)
          }
          else
          {
            legend("topright",
                   fill   = c('gray','white', 'white', 'blue', 'white',
                              'white', 'white', 'red', 'white', 'white'),
                   legend = c(paste('Observed (obs.): ',
                                    round(genome_size/10^6, digits = 3), ' Mb (error-excluded: ',
                                    round(genome_size_filtering_error/10^6, digits = 3), ' Mb)',               sep=""),
                              paste('* k-mer_cov obs.: ',     first_peak_pos, sep=""),
                              paste('* signal_error_border ', border_pos,                     sep=""),
                              paste('Fitted count with fitted k-mer_cov: ',
                                    round(genome_size_fitted/10^6, digits = 3), ' Mb',                         sep=""),
                              paste('* k-mer_cov fit.: ',     round(fitted_peak_value, digits=2),              sep=""),
                              paste('* 1st-sd:',              round(first_sd, digits=2),                       sep=""),
                              paste('* 1st-skewness: ',       round(myxifit[1], digits=2),                     sep=""),
                              paste('Fitted+obs. with corrected k-mer_cov: ',
                                    round(genome_size_corrected2/10^6, digits = 3), ' Mb',                     sep=""),
                              paste('* k-mer_cov cor.: ',     round(first_mean_raw, digits=2),                 sep=""),
                              paste('* repetitive_ratio ',
                                    round(repsize/genome_size_corrected2, digits=2), ': ',
                                    round(repsize/10^6, digits = 3), ' Mb',                                    sep="")),
                   border="NA",
                   cex=0.7)
          }
          # end: new on 2016-08-08
          # genome sizes
          size_all[i]  <- genome_size
          size_exl[i]  <- genome_size_filtering_error
          size_fit[i]  <- genome_size_fitted
          size_cat[i]  <- genome_size_catted
          size_cor[i]  <- genome_size_corrected
          size_cor2[i] <- genome_size_corrected2
          # rbind(size_all, size_exl, size_fit, size_cat)
          # genome-wide average kmer coverage related info
          kmer_peak_cnt[i]  <- dr[first_peak_pos, 2]
          kmer_peak_pos[i]  <- first_peak_pos
          kmer_peak_pos2[i] <- fitted_peak_value
        }
        # labels for later plotting purpose
        labels[3*i-1]  <- paste("", sizek, sep="")
        namebank[i]    <- sizek
      }
      else
      {
        dev.off()
        stop(paste("file ", countingfile, " not found!",sep=""))
        quit("no")
      }
    }
    # summarize all kmer counting plots
    ylimmax <- 0
    for (sizek in targetsizek[1]) {
      histx<-read.table(countingfile)
      peaks2             <- findpeaks(histx$V2[3:selected])
      peaks2[,2:4]       <- peaks2[,2:4]+2
      if(1.5*max(peaks2[,1]) > ylimmax)
      {
        ylimmax <- 1.5*max(peaks2[,1])
      }
    }
    # colors
    colors <- rainbow(length(size_all))
    #
    ## plot comparision among gsize-estimations using different info
    genome_size_summary  <-rbind(size_all, size_exl, size_cat, size_fit, size_cor2)
    genome_size_summary2 <-rbind(size_exl, size_fit, size_cor2)
    cat(paste("Genome size estimate for ", sample, ": ", size_cor2, " bp.\n\n", sep=""))
    # caution
    if(length(which(is.na(genome_size_summary))) > 0)
    {
      cat("\n caution: some samples have NA predicted for observed genome size!\n")
      genome_size_summary[is.na(genome_size_summary)]   <- 0
    }
    if(length(which(is.na(genome_size_summary2))) > 0)
    {
      cat("\n caution: some samples have NA predicted for fitted   genome size!\n")
      genome_size_summary2[is.na(genome_size_summary2)] <- 0
    }
    if(length(genome_size_summary[1,])!=0 & length(genome_size_summary2[1,])!=0)
    {
      # record in file
      write.table(genome_size_summary,
                  paste(path, '/',prefix, vers, 'est.', sample, ".genome.size.estimated.k",
                        min(targetsizek), 'to', max(targetsizek), ".fitted.txt", sep=""),
                  quote = FALSE, row.names = TRUE, col.names=FALSE)
      # bar plot
      mingsize <- min(genome_size_summary2[!is.na(genome_size_summary2)])
      maxgsize <- max(genome_size_summary2[!is.na(genome_size_summary2)])
      colors <- grey((2:0)/3)
      mp <- barplot(genome_size_summary2, beside=TRUE, col=colors,
                    xlab=paste("Size k", sep=""), ylab="Genome size (bp)",
                    axes = FALSE, axisnames  = FALSE, border=NA,
                    ylim=c(0.5*max(genome_size_summary2),1.2*max(genome_size_summary2)),
                    xpd=FALSE)
      legend("topright",
             legend = c(paste('EXL: ', round(mean(genome_size_summary2[1,]/10^6), digits=3), 'Mb'),
                        paste('FIT: ', round(mean(genome_size_summary2[2,]/10^6), digits=3), 'Mb'),
                        paste('COR: ', round(mean(genome_size_summary2[3,]/10^6), digits=3), 'Mb')),
             fill = colors, cex=.8, border=NA)
      text(mp, par("usr")[3], labels = labels, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=.8)
      axis(2, cex.axis=.8)
      title(main=paste("Genome size estimation by error-excluding, fitting, and correcting\n", sep=""))
      dev.off()
    }
  } # end of sample
  # output est. on het if asked.
  if(het_observed)
  {
    write(paste("Het_rate ", round(alpha, digits = 8), " ", round(alpha_cor, digits = 8), sep=""),
          file = paste(path, '/',prefix, vers, 'est.',
                       sample, ".genome.size.estimated.k",
                       min(targetsizek), 'to', max(targetsizek),
                       ".fitted.txt", sep=""),
          append = T,
          sep = " ")
  }
  # output est. on ratio of repeats
  write(paste("Est. ratio of repeats ", round(repsize/genome_size_corrected2, digits=8), sep=""),
        file = paste(path, '/',prefix, vers, 'est.',
                     sample, ".genome.size.estimated.k",
                     min(targetsizek), 'to', max(targetsizek),
                     ".fitted.txt", sep=""),
        append = T,
        sep = " ")
  # output est. on avg k-mer cov
  write(paste("Final k-mer cov ", round(first_mean_raw, digits=8), sep=""),
        file = paste(path, '/',prefix, vers, 'est.',
                     sample, ".genome.size.estimated.k",
                     min(targetsizek), 'to', max(targetsizek),
                     ".fitted.txt", sep=""),
        append = T,
        sep = " ")
  if(het_observed && only_one_hom_peak)
  {
    write(paste("het_fitting_delta ", round(het_fitting_delta, digits=8), sep=""),
          file = paste(path, '/',prefix, vers, 'est.',
                       sample, ".genome.size.estimated.k",
                       min(targetsizek), 'to', max(targetsizek),
                       ".fitted.txt", sep=""),
          append = T,
          sep = " ")
  }
  #
  end_t <- Sys.time()
  cat('Time consumed: ', format(end_t-start_t), '\n')
}
##
## additional
filter_peaks <- function(peaks, histo)
{
  # find out a major peak with enough support information on both sides (to exlcude potential local maxima)
  pos <- 1 # peak position (namely, the k-mer freq at the peak)
  i   <- 0 # traverse candidate peaks
  # a peak freq=pos should start at least from 5, or else too small to perform curve fitting
  while(pos < 5 & i < length(peaks[,1]))
  {
    i <- i+1
    pos <- peaks[i, 2] # x-axis peak
  }
  if(pos < 5)
  {
    stop(paste("   Error: k-mer coverage seemed too low to perform fitting. Minimum average k-mer coverage 5 required. Program exited.\n",sep=""))
  }
  # check if there are enough support on both sides of pos
  while(i < length(peaks[,1]))
  {
    # check left: how many on left are smaller than count at current peak
    posleft <- pos - 1
    j <- 0 # counter of smaller cases on left
    while(posleft>=1)
    {
      if(histo[posleft, 2] < histo[pos, 2])
      {
        j <- j + 1;
      }
      posleft <- posleft - 1
    }
    # check right: if there are right-side counts larger than count at the current peak
    posright <- pos + 1
    k <- 0 # counter of larger cases on right
    maxpos <- peaks[length(peaks[,1]), 2]
    while(posright < maxpos)
    {
      if(histo[posright, 2] > histo[pos, 2])
      {
        k <- k + 1
        if(k>4)
        {
          break
        }
      }
      posright <- posright + 1
    }
    # condition to continue or break
    if(j <= 4 || k>4)
    {
      i <- i + 1
      pos <- peaks[i, 2] # x-axis peak
    }
    else
    {
      break
    }
  }
  # filter out peaks near erroneous peak
  rpeaks <- peaks
  fpi <- 1
  while(rpeaks[fpi, 2] < pos/4)
  {
    rpeaks <- rpeaks[-fpi,]
  }
  return(rpeaks)
}
schneebergerlab/findGSE documentation built on Jan. 26, 2024, 8:10 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
schneebergerlab/findGSE
Estimating Genome Size by Fitting k-mer Frequencies in Reads with Skew Normal Distribution Model

R/findGSE_v1.94.R
In schneebergerlab/findGSE: Estimating Genome Size by Fitting k-mer Frequencies in Reads with Skew Normal Distribution Model

Defines functions filter_peaks findGSE kmer_count_modify initial_count_recover error_minimize3 error_minimize2 error_minimize

Documented in findGSE

R Package Documentation

Browse R Packages

We want your feedback!

schneebergerlab/findGSE Estimating Genome Size by Fitting k-mer Frequencies in Reads with Skew Normal Distribution Model

R/findGSE_v1.94.R In schneebergerlab/findGSE: Estimating Genome Size by Fitting k-mer Frequencies in Reads with Skew Normal Distribution Model

Defines functions filter_peaks findGSE kmer_count_modify initial_count_recover error_minimize3 error_minimize2 error_minimize

Documented in findGSE

R Package Documentation

Browse R Packages

We want your feedback!

schneebergerlab/findGSE
Estimating Genome Size by Fitting k-mer Frequencies in Reads with Skew Normal Distribution Model

R/findGSE_v1.94.R
In schneebergerlab/findGSE: Estimating Genome Size by Fitting k-mer Frequencies in Reads with Skew Normal Distribution Model