Nothing
#' Initialize Start Time
#'
#' This function initializes the start time for a process.
#'
#' @return The start time of the process.
#' @export
initialize_start_time <- function() {
start_time <- proc.time()
return(start_time)
}
#' Minimize the Error for K-mer Frequency Fitting
#'
#' This function minimizes the error for k-mer frequency fitting by adjusting the mean, standard deviation, and scaling factors.
#'
#' @param tooptimize A numeric vector containing the scale factors to optimize.
#' @param x A numeric vector representing the histogram replicated from \code{d}.
#' @param xfit x-values to get the skew normal distribution
#' @param end An integer indicating the right-side position for fitting.
#' @param xfit_left A numeric value for the left-side position to calculate initial mean and standard deviation.
#' @param xfit_right A numeric value for the right-side position to calculate initial mean and standard deviation.
#' @param d A data frame representing the observed k-mer frequencies that will be fitted.
#' @param min_valid_pos An integer indicating the left-side position from which the observed k-mer frequencies will be fitted.
#' @param itr An integer representing the iteration count.
#' @return A numeric value representing the minimized error.
#'
#' @examples
#'
#' tooptimize <- c(1, 1, 1, 1)
#' x <- rnorm(100)
#' end <- 100
#' xfit <- seq(min(x), max(x), length=end)
#' xfit_left <- min(x)
#' xfit_right <- max(x)
#' d <- data.frame(V1=1:100, V2=rnorm(100))
#' min_valid_pos <- 10
#' itr <- 100
#' error <- error_minimize(tooptimize, x, end, xfit, xfit_left, xfit_right, d, min_valid_pos, itr)
#' print(error)
#'
#' @export
error_minimize<-function(tooptimize, x, end, xfit, xfit_left, xfit_right, d, min_valid_pos, itr)
{
# x : histogram replicated from d, which is formed from dr
# xfit : x-values to get the skew normal distribution
# xfit-left : a left-side position to calculate initial mean and sd
# xfit-right : a right-side position to calculate initial mean and sd
# d : the observed kmer-freq that will be fitted
# min_valid_pos: a left-side position, from which the observed kmer-freq will be fitted
# end : a rigth-side position, till which the observed kmer-freq will be fitted
sd_scale_factor <- tooptimize[1]
yfit_scale_factor <- tooptimize[2]*100000/itr
mean_scale_factor <- tooptimize[3]
xi_scale_factor <- tooptimize[4]
xfit <- seq(min(x),max(x),length=end)
meanfit <- mean(x[x>=xfit_left & x<=xfit_right])*mean_scale_factor
sdfit <- sd(x[ x>=xfit_left & x<=xfit_right])*sd_scale_factor
xifit <- 1*xi_scale_factor
# message(paste(" Info: initialized mean as ", meanfit, "\n", sep=""))
# message(paste(" Info: initialized sd as ", sdfit, "\n", sep=""))
#message(paste(" Info: initialized skew as ", xifit, "\n", sep=""))
#yfit <- dsnorm(xfit,mean=meanfit,sd=sdfit, xi=xifit)*yfit_scale_factor*length(x)
yfit <- dnorm(xfit,mean=meanfit,sd=sdfit)*yfit_scale_factor*length(x)
diff0 <- sqrt(sum((d[min_valid_pos:end, 2] - yfit[min_valid_pos:end])^2))
#diff0 <- sqrt(sum((d[xfit_left:end, 2] - yfit[xfit_left:end])^2))
# message(paste(" 111 Info: initialized min_valid_pos as ", min_valid_pos, "\n", sep=""))
# message(paste(" 111 Info: initialized end as ", end, "\n", sep=""))
# message(paste(" 111 Info: initialized x_fit_left as ", xfit_left, "\n", sep=""))
# message(paste(" 111 Info: initialized x_fit_right as ", xfit_right, "\n", sep=""))
# diff0 <- sqrt(sum((d[xfit_left:xfit_right, 2] - yfit[xfit_left:xfit_right])^2))
#
return(diff0)
}
# function for tunning final fitting if heterozgyous genomes
#' Tuning Final Fitting for Heterozygous Genomes
#'
#' This function tunes the final fitting for heterozygous genomes by adjusting the delta values for heterozygous and homozygous regions.
#'
#' @param tooptimize A numeric vector containing the scale factors to optimize.
#' @param h_het A numeric vector representing the raw fitting for the heterozygous region.
#' @param h_hom A numeric vector representing the raw fitting for the homozygous region.
#' @param h_target A numeric vector representing the target k-mer frequency.
#' @return A numeric value representing the minimized difference.
#' @examples
#'
#' tooptimize <- c(0.5)
#' h_het <- rnorm(100)
#' h_hom <- rnorm(100)
#' h_target <- rnorm(100)
#' diff <- error_minimize2(tooptimize, h_het, h_hom, h_target)
#' print(diff)
#'
#' @export
error_minimize2<-function(tooptimize, h_het, h_hom, h_target)
{
# h_het : raw fitting for the heterozygous region
# h_hom : raw fitting for the homozygous region
# h_target: before valley: fitted cnt, after valley: raw cnt
het_delta <- 1-tooptimize[1]
hom_delta <- 1 # reserved parameter: might be applied in future
h_merge <- het_delta*h_het + hom_delta*h_hom
diff2 <- sum(abs(h_merge - h_target))
#
return(diff2)
}
## function for minimizing difference from remainder kmer-freq dr: mean, sd, and scaling factor ##
#' Minimize the Error for Remaining K-mer Frequency
#'
#' This function minimizes the error for the remaining k-mer frequency by adjusting the scaling factor.
#'
#' @param tooptimize A numeric vector containing the scale factors to optimize.
#' @param x A numeric vector representing the histogram.
#' @param end An integer indicating the right-side position for fitting.
#' @param xfit_left A numeric value for the left-side position to calculate initial mean and standard deviation.
#' @param xfit_right A numeric value for the right-side position to calculate initial mean and standard deviation.
#' @param d A data frame representing the observed k-mer frequencies that will be fitted.
#' @param min_valid_pos An integer indicating the left-side position from which the observed k-mer frequencies will be fitted.
#' @param itr An integer representing the iteration count.
#' @param meanfit A numeric value representing the initial mean.
#' @param sdfit A numeric value representing the initial standard deviation.
#' @return A numeric value representing the minimized error.
#' @examples
#'
#' tooptimize <- c(1, 1, 1, 1)
#' x <- rnorm(100)
#' end <- 100
#' xfit <- seq(min(x), max(x), length=end)
#' xfit_left <- 20
#' xfit_right <- 80
#' d <- data.frame(V1=1:100, V2=rnorm(100))
#' min_valid_pos <- 10
#' itr <- 100
#' meanfit <- 18
#' sdfit <- 4.21
#' error <- error_minimize3(tooptimize, x, end, xfit_left, xfit_right, d,
#' min_valid_pos, itr, meanfit, sdfit)
#' print(error)
#'
#' @export
error_minimize3<-function(tooptimize, x, end, xfit_left, xfit_right, d, min_valid_pos, itr, meanfit, sdfit)
{
# x : histogram
# xfit : x-values to get the normal distribution
# xfit-left : a left-side position to calculate initial mean and sd
# xfit-right : a right-side position to calculate initial mean and sd
# d : the observed kmer-freq that will be fitted
# min_valid_pos: a left-side position, from which the observed kmer-freq will be fitted
# end : a rigth-side position, till which the observed kmer-freq will be fitted
#sd_scale_factor <- tooptimize[1]
yfit_scale_factor <- tooptimize[2]*100000/itr
# mean_scale_factor <- tooptimize[3]
xfit <- seq(min(x),max(x),length=end)
# meanfit <- mean(x[x>=xfit_left & x<=xfit_right])*mean_scale_factor
#sdfit <- sd(x[ x>=xfit_left & x<=xfit_right])*sd_scale_factor
yfit <- dnorm(xfit,mean=meanfit,sd=sdfit)*yfit_scale_factor*length(x)
diff0 <- sqrt(sum((d[min_valid_pos:end, 2] - yfit[min_valid_pos:end])^2))
# message(paste(" 111 Info: initialized min_valid_pos as ", min_valid_pos, "\n", sep=""))
# message(paste(" 111 Info: initialized end as ", end, "\n", sep=""))
# message(paste(" 111 Info: initialized x_fit_left as ", xfit_left, "\n", sep=""))
# message(paste(" 111 Info: initialized x_fit_right as ", xfit_right, "\n", sep=""))
# diff0 <- sqrt(sum((d[xfit_left:xfit_right, 2] - yfit[xfit_left:xfit_right])^2))
#diff0 <- sqrt(sum((d[xfit_left:end, 2] - yfit[xfit_left:end])^2))
#
return(diff0)
}
# recover count 0: in initial count, making kmer freq consecutive.
#' Recover Initial K-mer Count
#'
#' This function recovers the initial k-mer count, making the k-mer frequency consecutive.
#'
#' @param d0 A data frame representing the initial k-mer count from software like Jellyfish.
#' @return A data frame with recovered k-mer counts.
#' @examples
#' d0 <- data.frame(V1 = c(1, 2, 4), V2 = c(100, 200, 300))
#' dr <- initial_count_recover(d0)
#' @export
initial_count_recover <- function(d0)
{
# dr: the initial kmer count from softwares like jellyfish
dr <- cbind(1:d0[length(d0[,1]), 1], rep(0, d0[length(d0[,1]), 1]))
i <- 0
while( i < length(d0[,1]))
{
i <- i + 1
dr[d0[i, 1], 2] <- d0[i, 2]
}
return (dr)
}
# modify kmer freq before fitting
#' Modify K-mer Frequency Before Fitting
#'
#' This function modifies the k-mer frequency before fitting, adjusting either the left or right part based on the peak position.
#'
#' @param start An integer indicating the starting position (does not include the peak position).
#' @param end An integer indicating the ending position (does not include the peak position).
#' @param left_right An integer indicating whether to modify the left part (0) or the right part (1).
#' @param histx A data frame representing the k-mer histogram.
#' @return A modified data frame with adjusted k-mer frequencies.
#' @examples
#' histx <- data.frame(V1 = 1:10, V2 = c(100, 200, 300, 400, 500, 400, 300, 200, 100, 50))
#' histx_new <- kmer_count_modify(3, 7, 0, histx)
#' @export
kmer_count_modify <- function(start, end, left_right, histx)
{
# start or end does not include the peak position
# modify rigth part according to left
if(left_right == 0)
{
for (i in start:end)
{
histx[2*end+2-i, 2] <- histx[i, 2]
}
}else
{
for (i in start:end)
{
histx[2*start-2-i, 2] <- histx[i, 2]
}
}
return (histx)
}
#' Estimate Genome Size Using K-mer Frequencies
#'
#' This function estimates the genome size of a species using k-mer frequencies.
#'
#' @param histo A character string specifying the path to the histogram file.
#' @param sizek An integer indicating the size of k used to generate the histogram.
#' @param outdir A character string specifying the output directory. If not specify, will use tempdir() as output directory.
#' @param exp_hom A numeric value representing the expected average k-mer coverage for the homozygous regions.
#' @param species A character string specifying the species name.
#' @param ploidy_ind An integer indicating the ploidy index (default is 2).
#' @param avg_cov A numeric value representing the average coverage.
#' @param left_fit_ratio A numeric value for the left fit ratio (default is 0.835).
#' @param meanfit_old A numeric value representing the previous mean fit.
#' @param sdfit_old A numeric value representing the previous standard deviation fit.
#' @param scale_flag A logical value indicating whether to apply scaling (default is FALSE).
#' @return A list containing the estimated genome size and other fitting parameters.
#' @examples
#' \donttest{
#'
#' histo <- "sample1.histo"
#' sizek <- 21
#' outdir <- tempdir()
#' exp_hom <- 200
#' species <- ""
#' ploidy_ind <- 2
#' avg_cov <- 0
#' left_fit_ratio <- 0.835
#' meanfit_old <- 0
#' sdfit_old <- 0
#' scale_flag <- FALSE
#' fit_lists <- findGSE_sp(path, samples, sizek, exp_hom, ploidy, range_left,
#' range_right, xlimit, ylimit, output_dir)
#' }
#' @export
findGSE_sp <- function(histo="", sizek=0, outdir="", exp_hom=0, species="",ploidy_ind=2,avg_cov = 0,left_fit_ratio = 0.835, meanfit_old = 0, sdfit_old = 0, scale_flag = FALSE)
{
old_options <- options() #
on.exit(options(old_options)) #
# initial values
if(missing(histo))
{
stop(paste("Cannot find histo file: ", histo, ". Program exited.\n",sep=""))
}
if(missing(sizek))
{
stop(paste("Cannot find sizek info: ", sizek, ". Program exited.\n",sep=""))
}
# defaults
message('ploidy index start: ',ploidy_ind,'\n')
if(missing(exp_hom)) exp_hom <- 0
#if(missing(outdir)) outdir <- getwd()
if(missing(outdir)) outdir <- tempdir()
######################################## libararies required ###############################################
#message("\n special version: \n")
# find all peaks in a time series
# suppressWarnings(suppressMessages(library("pracma")))
# provide skew normal distrbution fitting (dsnorm(...))
# suppressWarnings(suppressMessages(library("fGarch")))
## version id
vers <- 'v1.94.'
##
message("\nfindGSE initialized...\n")
## running time
start_t <- Sys.time()
## check args
# expert-tunning het fitting (later it would be automatically adjusted)
het_fitting_delta <- 0.11
histof <- FALSE
kpro <- FALSE
if(histo != "") histof <- TRUE
if(sizek > 0) kpro <- TRUE
if(histof == FALSE || kpro == FALSE)
{
stop("Option -histo or -sizek was provided incorrectly. Program exited.\n")
}
## default args
countingfile <- histo
if(file.exists(countingfile)==FALSE)
{
stop(paste("Cannot find histo file: ", countingfile, ". Program exited.\n",sep=""))
}
message(paste(" Info: histo file provided as ", countingfile, "\n", sep=""))
#
targetsizek <- sizek
if(targetsizek < 15)
{
stop(paste("Unexpected size of k: ", targetsizek, ". Size k>=15 required. Program exited.\n",sep=""))
}
message(paste(" Info: size k set as ", targetsizek, "\n", sep=""))
#
path_set <- FALSE
path <- outdir
if(path==".")
{
#path <- getwd()
path <- tempdir()
}else
if(path == "")
{
path <- dirname(normalizePath(countingfile))
}
path_set <- TRUE
message(paste(" Info: output folder set as ", path, "\n", sep=""))
#
het_observed <- FALSE
# expected maximum value for the hom k-mer peak! set as fpeak~2*fpeak
exp_hom_cov_max <- exp_hom
message(paste(" Info: expected coverage of homozygous k-mers set as ", exp_hom_cov_max, "\n", sep=""))
exp_hom_cov_max_save <- exp_hom_cov_max
# derived maximum value for the het k-mer peak!
der_het_cov_max <- 0
## if output folder does not exist, create it; if existing, give a warning.
dir.create(file.path(path), showWarnings = FALSE)
if(exp_hom_cov_max > 0)
{
het_observed <- TRUE
# derived maximum value for the het k-mer peak.
der_het_cov_max <- round(ploidy_ind/(ploidy_ind+1)*exp_hom_cov_max);
message(paste(" Info: het observed set as true ==> heterozygous fitting asked. \n", sep=""))
if(het_fitting_delta != 0.11)
{
message(paste(" Info: value for tuning het fitting set as ", het_fitting_delta, "\n", sep=""))
}
}else
{
het_observed <- FALSE
only_one_het_peak <- FALSE
only_one_hom_peak <- TRUE
message(paste(" Info: het observed set as false ==> heterozygous fitting not asked. \n", sep=""))
}
##
prefix <- ''
samples <- basename(countingfile)
for (sample in samples)
{
message(paste("\nIterative fitting process for sample ", sample, " started... \n", sep=""))
# all accessions -- where i can change and modify
# initialize variables where outputs are recorded
size_all <- 0 # Gsize with error-k-mer-freq
size_exl <- 0 # Gsize without error-k-mer-freq
size_fit <- 0 # Gsize with fitted-k-mer-freq
size_cat <- 0 # Gsize with fitted-cat-observed-k-mer-freq
size_cor <- 0 # Gsize corrected with size_cat and skewness; size_cor=size_cat/skewness (no)
size_cor2 <- 0 # Gsize corrected with fitted-cat-observed-freq and upperbound freq 'e' (yes)
i <- 0 # the ith sample/accession
kmer_peak_cnt <- 0 # k-mer count at the peak position of k-mer counting
kmer_peak_pos <- 0 # genome-wide k-mer coverage
kmer_peak_pos2 <- 0 # genome-wide k-mer coverage according to fitted curve
labels <- NA # labels according input below
namebank <- NA
border_pos <- 1
min_valid_pos <- 1
first_peak_pos <- 1
first_mean <- 0
first_sd <- 1
first_mean_raw <- 0
# initialize parameters for several iterations of fitting
mymeanfit <- 0
mysdfit <- 1
myxifit <- 1
myscalefit <- 1
# special fit at itr 1
norm_fit_itr1 <- FALSE
# open pdf to write visualization of fitting
pdf(paste(path, '/',prefix, vers, 'est.', sample, '.sizek', targetsizek, '.curvefitted.pdf',
sep=""))
first_fit <- TRUE
for (sizek in targetsizek[1]) {
normal <- 0
if(file.exists(countingfile)==TRUE)
{
dhet <- read.table(countingfile)
## recover positions in dr whose initial counts are 0
dr <- initial_count_recover(dhet[1:length(dhet[,1]),])
rm("dhet")
dhet <- dr
# if heterozygosity oberved in k-mer freq,
# fit het and remove fitted values from subsequent hom-fitting
selected <- min(2000, length(dhet[,1]))
if(selected < exp_hom_cov_max)
{
selected <- min(round(1.5*exp_hom_cov_max), length(dhet[,1]))
}
main_peak_is_hom <- TRUE
if(het_observed)
{
# initialize count as error for fitting
error <- dr
message(paste(" Size ", sizek , " fitting for het k-mers \n", sep=""));
# find hom and het peaks (if any).
peaks <- pracma::findpeaks(error[2:exp_hom_cov_max, 2])
#peaks <- filter_peaks(tmp_peaks, dhet)
if(length(peaks) == 0)
{
plot(dhet[, 1], dhet[, 2],
col="gray",
xlab="K-mer freq", ylab="Number of k-mers",
xlim=c(1, 4*(which.max(dhet[5:selected, 2])+4)), ylim=c(1, 1.5*max(dhet[5:selected, 2])))
text(1.5*(which.max(dhet[5:selected, 2])+4), 1.1*max(dhet[5:selected, 2]),
paste("Main peak (after freq=5) at k-mer freq: ",
which.max(dhet[5:selected, 2])+4,
sep=""))
dev.off();
stop("No k-mer freq peak was found with the cutoff given by -exp_hom.
You may need to increase the cutoff!")
quit("no")
}
hom_peak_pos <- 0
het_peak_pos <- 0
message('enter ploidy equal to ',ploidy_ind)
if(ploidy_ind == 1){
hom_peak_pos <- peaks[which.max(peaks[,1]), 2]+1 # hom peak
peaks[which.max(peaks[,1]), 1] <- 0
het_peak_pos <- peaks[which.max(peaks[,1]), 2]+1 # het peak
peaks[which.max(peaks[,1]), 1] <- 0
}else if(ploidy_ind == 10){
het_peak_pos <- 462 #ploidy_ind * avg_cov
hom_peak_pos <- 615 #(ploidy_ind + 1) * avg_cov
}
else{
het_peak_pos <- ploidy_ind * avg_cov
hom_peak_pos <- (ploidy_ind + 1) * avg_cov
}
message('hom_peak_pos is: ',hom_peak_pos,'\n')
message('het_peak_pos is: ',het_peak_pos,'\n')
# if two peaks found, the larger one is hom; the other is het
if(het_peak_pos > hom_peak_pos)
{
tmp <- het_peak_pos
het_peak_pos <- hom_peak_pos
hom_peak_pos <- tmp
main_peak_is_hom <- FALSE
message(" Warning: two peaks found in k-mer freq with given -exp_hom,
peak with lower height as hom-peak!\n")
}
if(het_peak_pos>0 & hom_peak_pos>0)
{
# in case two peask are near each other (e.g. it may happen with k-mer cov >100x)
ratio <- hom_peak_pos/het_peak_pos;
if (round(ratio)<1.5) # nearly equal
{
hom_peak_pos <- max(hom_peak_pos, het_peak_pos)
het_peak_pos <- min(hom_peak_pos, het_peak_pos)
}
}
# if one peak found, need to check if it is hom or het accordingly
only_one_hom_peak <- FALSE
only_one_het_peak <- FALSE
if(het_peak_pos == hom_peak_pos)
{
unkownpeak <- het_peak_pos
if(unkownpeak > der_het_cov_max)
{
het_peak_pos <- round(ploidy_ind/(ploidy_ind+1)*hom_peak_pos)
message(" Warning: only one peak in k-mer freq with given -exp_hom,
determined as hom-peak!\n")
message(" -exp_hom needs to be increased
if you think it is a het peak.\n")
only_one_hom_peak <- TRUE
}else
if(unkownpeak < der_het_cov_max)
{
hom_peak_pos <- (ploidy_ind+1)/ploidy_ind*het_peak_pos
message(" Warning: only one peak in k-mer freq with given -exp_hom,
determined as het-peak!\n")
message(" -exp_hom needs to be decreased
if you think it is a hom peak.\n")
main_peak_is_hom <- FALSE
only_one_het_peak <- TRUE
}else
{
plot(dhet[, 1], dhet[, 2],
col="gray",
xlab="K-mer freq", ylab="Number of k-mers",
xlim=c(1, selected), ylim=c(1, 1.5*max(dhet[3:selected, 2])))
text(100, max(dhet[3:selected, 2]),
paste("Main peak at k-mer freq: ",
which.max(dhet[3:selected, 2])+2,
sep=""))
dev.off()
stop("Unexpected peaks found -
please check the raw k-mer curve provided in pdf,
and reset the option -exp_hom with a proper value x,
which must satisfy 2*hom_peak>x>hom_peak !\n\n");
}
}
message(' Info: het_peak_pos for het fitting: ', het_peak_pos, '\n')
message(' Info: hom_peak_pos for hom fitting: ', hom_peak_pos, '\n')
het_peak_pos_save <- het_peak_pos
hom_peak_pos_save <- hom_peak_pos
het_min_valid_pos <- which.min(error[1:het_peak_pos, 2])
if(ploidy_ind == 1){
# fit het distribution
het_xfit_left <- max(min(het_min_valid_pos, het_peak_pos-10), 3) # update on 20200129
if(het_peak_pos > 80)
{
het_xfit_left <- floor(het_xfit_left+het_peak_pos*0.1473684)
}
het_xfit_right <- het_peak_pos + round(0.5*(hom_peak_pos-het_peak_pos)) # update on 20200129
if(het_peak_pos > 80)
{
het_xfit_right <- floor(het_xfit_right-het_peak_pos*0.1473684)
}
}else if(ploidy_ind == 10){
het_xfit_left <- 350 #ploidy_ind * avg_cov
het_xfit_right <- 650 #(ploidy_ind + 1) * avg_cov
}
else{
het_xfit_left <- het_peak_pos * left_fit_ratio # 0.85 0.83
het_xfit_right <- het_peak_pos * 1.15 # 1.15 1.13
}
if(het_xfit_right > het_peak_pos-1 - het_xfit_left + het_peak_pos-1)
{
het_xfit_right <- het_peak_pos-1 - het_xfit_left + het_peak_pos-1
}
histx2 <- kmer_count_modify(het_xfit_left, het_peak_pos-1, 0, error)
x <- rep(1:het_xfit_right,ceiling(abs(histx2[1:het_xfit_right,2])/(100000/1)))
xfit <- seq(min(x),max(x),length=het_xfit_right)
# optimizing parameters
message(' Info: het_xfit_left for het fitting: ', het_xfit_left, '\n')
message(' Info: het_xfit_right for het fitting: ', het_xfit_right, '\n')
het_xfit_right_save <- het_xfit_right
if(ploidy_ind == 1 || scale_flag == TRUE){
v<-optim(par=c(1, 1, 1, 0.8),
fn=error_minimize, x=x, end=het_xfit_right, xfit=xfit,
xfit_left=het_xfit_left, xfit_right=het_xfit_right,
d=histx2, min_valid_pos=het_min_valid_pos, itr=1)
#
# scale and correct yfit with optimized values in v$par
xfit2 <- seq(0.01,selected,0.01)
meanfit <- mean(x[x>=het_xfit_left & x<=het_xfit_right])*v$par[3]
meanfit_old <- meanfit
message('first meanfit_old: ',meanfit_old,' \n')
sdfit <- sd(x[ x>=het_xfit_left & x<=het_xfit_right])*v$par[1] # raw sd
sdfit_old <- sdfit
}
else if (scale_flag == FALSE){
meanfit <- meanfit_old * ploidy_ind
sdfit <- sdfit_old + sdfit_old*max(ploidy_ind - 1, 0)*0.1
#sdfit <- max(sdfit, 7.6)
message('meanfit for ploidy: ',ploidy_ind,' is: ',meanfit,'\n')
v<-optim(par=c(1, 1, 1, 0.8),
fn=error_minimize3, x=x, end=het_xfit_right,
xfit_left=het_xfit_left, xfit_right=het_xfit_right,
d=histx2, min_valid_pos=het_min_valid_pos, itr=1, meanfit = meanfit, sdfit = sdfit_old)
#
message('optimize done...\n')
# scale and correct yfit with optimized values in v$par
xfit2 <- seq(0.01,selected,0.01)
#meanfit <- meanfit_old * ploidy_ind
}
if(meanfit < 0)
{
message(paste(" Warning: data does not follow
assumed distribution anymore; fitting stopped.\n", sep=""))
}
# sdfit <- sd(x[ x>=het_xfit_left & x<=het_xfit_right])*v$par[1] # raw sd
#
xifit <- 1*v$par[4];
#yfit0 <- dsnorm(xfit2,mean=meanfit,sd=sdfit, xi=xifit)*(100000/1)*v$par[2]*length(x)
message('sdfit is: ',sdfit,'\n')
yfit0 <- dnorm(xfit2,mean=meanfit,sd=sdfit)*(100000/1)*v$par[2]*length(x)
#
## scale fitting according to observation
hetfit <- yfit0[seq(100, length(yfit0), 100)]
hetfit <- hetfit*dhet[het_peak_pos,2]/max(hetfit[1:selected])
## end scaling ##
#
# set d0 for further hom-fitting
d0 <- dr
if(only_one_hom_peak)
{
d0[1:selected , 2] <- d0[1:selected , 2] - (1-het_fitting_delta)*hetfit[1:selected]
}else
{
d0[1:selected , 2] <- d0[1:selected , 2] - hetfit[1:selected]
}
}else
{
d0 <- dr
hetfit <- 0
hetfit[1:selected] <- 0
het_min_valid_pos <- 100000
only_one_hom_peak <- TRUE
}
i <- i + 1
if(i > length(targetsizek)) { break }
rm("dr")
# dr can be different from raw k-mer counting due to het fit
dr <- d0
#
# initialize count as error for fitting: initial error vector
error <- dr
# set the right-bound for fitting
maxright4fit <- dr[length(dr[,1])-1, 1]
maxright4fit <- min(10000,length(dr[,1]))
# set the number of total iterations corresponding
# to raw count given by softwares such as jellyfish
totalitr <- 2
myitr <- 0
myyfit <- 0
homfit <- 0
#
first_fit <- TRUE
itr = 1
while(itr <= totalitr)
{
if(first_fit)
{
message(paste(" Size ", sizek , " at itr ", itr, "\n", sep=""))
if(itr>=2 && 2*mymeanfit[itr-1] > maxright4fit) { myitr <- itr - 1; break;}
if(itr == 1) # original data
{
# 1st-col : height,
# 2nd-col : index of maximum,
# 3rd/4th-col: indices that peak begins/ends
if(het_observed)
{
# for border_pos
peaks <- pracma::findpeaks(error[3:selected, 2])
peaks[,2:4] <- peaks[,2:4]+2
border_pos <- peaks[1, 3] # caution: peaks[peakindex, 3]
# for others: caution!
if(main_peak_is_hom==F)
{
peaks <- pracma::findpeaks(error[round(0.5*(het_peak_pos+hom_peak_pos)):selected, 2])
peaks[,1:4] <- peaks[,1:4]+round(0.5*(het_peak_pos+hom_peak_pos))-1
}else
{
peaks <- pracma::findpeaks(error[het_peak_pos:selected, 2])
peaks[,1:4] <- peaks[,1:4]+het_peak_pos-1
}
}
else
{
peaks <- pracma::findpeaks(error[3:selected, 2])
peaks[,2:4] <- peaks[,2:4]+2
## new on 2017-09-16
peaks <- filter_peaks(peaks, error)
## end 2017-09-16
border_pos <- peaks[1, 3] # caution: peaks[peakindex, 3]
}
peakindex <- which.max(peaks[,1]) # caution: if double peaks
min_valid_pos <- peaks[1, 3] # caution: peaks[peakindex, 3]
original_peak_pos <- peaks[peakindex, 2]
original_peal_val <- peaks[peakindex, 1]
if(het_observed && only_one_het_peak)
{
original_peak_pos <- (ploidy_ind+1)/ploidy_ind*het_peak_pos
original_peal_val <- peaks[original_peak_pos, 1]
min_valid_pos <- het_peak_pos
}
datacheck <- min(300, maxright4fit)
dci <- 0
message(' Info: min_valid_pos: ', min_valid_pos,'\n')
min_valid_pos_save = min_valid_pos
while(min_valid_pos == original_peak_pos)
{
min_valid_pos <- min_valid_pos + 1
original_peak_pos <- which.max(error[min_valid_pos:maxright4fit,2])+(min_valid_pos-1)
original_peal_val <- max(error[min_valid_pos:maxright4fit,2])
dci <- dci + 1
if(dci > datacheck) { normal=1;break; }
}
if(dci > datacheck)
{
message(paste("Error: weird initial count-data of sample ",
sample, ". Please check!!!\n",
sep=""))
normal <- 1
break
}
message(' Info: signal error border: ', border_pos,'\n')
first_peak_pos <- original_peak_pos
# copy
d <- error
# complement the left side of the peak position
# according to observations on the right side of the peak
for (li in 1:(original_peak_pos-1))
{
d[li, 2] <- d[min(2*original_peak_pos-li, length(dr[,1])),2]
}
end <- min(original_peak_pos+round((original_peak_pos-min_valid_pos)*5/12),length(dr[,1]))
if(end - original_peak_pos <= 3 && original_peak_pos>=10)
{
end <- end + 5
}
}
else # error reminder
{
# caution: valid kmer count is at least 3
min_valid_pos <- round(mymeanfit[itr-1])+3
original_peak_pos <- which.max(error[min_valid_pos:maxright4fit, 2])+(min_valid_pos-1)
original_peal_val <- max(error[min_valid_pos:maxright4fit, 2])
datacheck <- min(300, maxright4fit)
dci <- 0
while(min_valid_pos == original_peak_pos)
{
min_valid_pos <- min_valid_pos + 1
original_peak_pos <- which.max(error[min_valid_pos:maxright4fit,2])+(min_valid_pos-1)
original_peal_val <- max(error[min_valid_pos:maxright4fit,2])
dci <- dci + 1
if(dci > datacheck) { break; }
}
if(dci > datacheck)
{
message(paste(" Warning: weired initial count-data of sample ",
sample, ". Please check!!!\n", sep=""))
normal <- 0
xlimmax <- 100
break
}
# copy
d <- abs(error)
# complement the left side of the peak position according to observations on the right side of the peak
thisnormal <- 0
for (li in (original_peak_pos-round(mymeanfit[1])):(original_peak_pos-1)) {
if(2*original_peak_pos-li > maxright4fit){ thisnormal<-1; break}
d[li, 2] <- d[2*original_peak_pos-li,2]
}
if(thisnormal==1){normal=0; break;}
end <- min(original_peak_pos+round(mymeanfit[itr-1]), maxright4fit)
if(end - original_peak_pos <= 3 && original_peak_pos>=10)
{
end <- end + 5
}
}
}
else
{
# due to the large diff in the first fitting and original curve, increase end for fitting.
end <- min(2*original_peak_pos-1, maxright4fit)
min_valid_pos <- min_valid_pos + 1 ##### 2016-09-16 new
d <- error
first_fit <- TRUE
}
# prepare the histogram data with replication
forrep <- ceiling(abs(d[1:end,2])/(100000/itr))
anyNA <- which(is.na(forrep))
if (length(anyNA) >= 1)
{
d[anyNA, 2] <- dr[anyNA, 2]; # caution: missing values could happen here!!!
message(paste(" Warning: ", length(anyNA),
" data-points of d has been replaced as 1 to properly prepare hisgram data x
for function optim at itr ", itr, "\n", sep=""))
}
x <- rep(1:end,ceiling(abs(d[1:end,2])/(100000/itr)))
xfit <- seq(min(x),max(x),length=end)
# normal fit
matchset <- '0'
if(!is.na(match(sizek,matchset)))
{
xfit_left <- min_valid_pos
xfit_right <- 2*original_peak_pos-xfit_left
if(first_peak_pos>=200)
{
#xfit_left <- original_peak_pos - 50
#xfit_right <- original_peak_pos + 50
xfit_left <- original_peak_pos * left_fit_ratio #- 50 0.85
xfit_right <- original_peak_pos * 1.13 #+ 50 1.15
}
if (ploidy_ind == 1 || scale_flag == TRUE){
v<-optim(par=c(1, 1, 1, 1),
fn=error_minimize, x=x, end=end, xfit=xfit,
xfit_left=xfit_left, xfit_right=xfit_right,
d=error,
min_valid_pos=min_valid_pos, itr=itr)
}
else if (scale_flag == FALSE){
v<-optim(par=c(1, 1, 1, 1),
fn=error_minimize3, x=x, end=end,
xfit_left=xfit_left, xfit_right=xfit_right,
d=error,
min_valid_pos=min_valid_pos, itr=itr, meanfit = meanfit, sdfit = sdfit_old)
}
}
else
{
xfit_left <- min_valid_pos
xfit_right <- 2*original_peak_pos-1
if(first_peak_pos>=100 && itr<2)
{
xfit_left <- original_peak_pos - 20
xfit_right <- original_peak_pos + 20
}else
if(first_peak_pos>=30 && itr<2)
{
xfit_left <- original_peak_pos - 10
xfit_right <- original_peak_pos + 10
}
if(ploidy_ind == 1 || scale_flag == TRUE){
v<-optim(par=c(1, 1, 1, 0.8),
fn=error_minimize, x=x, end=end, xfit=xfit,
xfit_left=xfit_left, xfit_right=xfit_right,
d=error,
min_valid_pos=min_valid_pos, itr=itr)
}
else if (scale_flag == FALSE){
v<-optim(par=c(1, 1, 1, 0.8),
fn=error_minimize3, x=x, end=end,
xfit_left=xfit_left, xfit_right=xfit_right,
d=error,
min_valid_pos=min_valid_pos, itr=itr, meanfit = meanfit, sdfit = sdfit_old)
}
}
message(' Info: hom_xfit_left for hom fitting at itr ', itr, ': ', xfit_left, '\n')
message(' Info: hom_xfit_right for hom fitting at itr ', itr, ': ', xfit_right, '\n')
# if(itr == 1){
# xfit_left
# }
#
# scale and correct yfit with optimized values in v$par
xfit2 <- seq(0.01,maxright4fit,0.01)
# if(ploidy_ind == 1){
# # meanfit <- mean(x[x>=het_xfit_left & x<=het_xfit_right])*v$par[3]
# meanfit <- mean(x[x>=xfit_left & x<=xfit_right])*v$par[3]
# meanfit_old <- meanfit
# message('last meanfit_old: ',meanfit_old,' \n')
# }
# else{
# meanfit <- meanfit_old * ploidy_ind
# }
meanfit <- mean(x[x>=xfit_left & x<=xfit_right])*v$par[3]
if(meanfit < 0)
{
message(paste(" Warning: data does not follow assumed distribution anymore at itr ",
itr, ", fitting stopped.\n", sep=""))
normal <- 0
if(myyfit==0 && itr==1)
{
v<-optim(par=c(1, 1, 1),
fn=error_minimize3, x=x, end=end,
xfit_left=xfit_left, xfit_right=xfit_right,
d=error,
min_valid_pos=min_valid_pos, itr=itr, meanfit = meanfit, sdfit = sdfit_old )
xfit2 <- seq(0.01,maxright4fit,0.01)
yfit0 <- dnorm(xfit2,
mean=meanfit,
sd=sdfit)*(100000/itr)*v$par[2]*length(x)
myyfit <- myyfit + yfit0
norm_fit_itr1 <- TRUE
if(het_observed && itr == 1)
{
homfit <- yfit0
}
}
break
} # stop if the data does not follow normal distribution anymore
sdfit <- sd(x[ x>=xfit_left & x<=xfit_right])*v$par[1]
xifit <- 1*v$par[4]
#yfit0 <- dsnorm(xfit2,mean=meanfit,sd=sdfit, xi=xifit)*(100000/itr)*v$par[2]*length(x)
yfit0 <- dnorm(xfit2,mean=meanfit,sd=sdfit)*(100000/itr)*v$par[2]*length(x)
yfit <- yfit0[seq(100, length(yfit0), 100)]
## repeat first fitting if fitting is not satisfactory
if(itr==1 && end<min(2*original_peak_pos-1, maxright4fit))
{
fitting_check <- error[1:maxright4fit, 2] - yfit
n_less_than_0 <- 0
for(fci in c((original_peak_pos+3):min(round(2*original_peak_pos), length(dr[,1]))))
{
if(fitting_check[fci] < 0)
{
n_less_than_0 <- n_less_than_0 + 1
}
}
if(n_less_than_0 >= 3)
{
first_fit <- FALSE
message(paste(" Fitting has to be repeated: sizek ", sizek,
" at itr ", itr, "\n",
sep=""))
i
next
}
}
# plot
if(i == 1)
{
#plot(error[1:maxright4fit, 1], abs(error[1:maxright4fit, 2]),
# xlim=c(0, min(maxright4fit, 4*meanfit)), ylim=c(0, 1.5*max(error[xfit_left:maxright4fit, 2])),
# type='b', col="gray",
# xlab="K-mer Coverage", ylab="Genomewide k-mer Frequency")
# title(main=paste('Example: fitting at itr ', itr, 'of sample ', sample, 'k=', sizek, ')'),
# cex.main=0.8)
# lines(xfit2[100:length(xfit2)], yfit0[100:length(xfit2)], col="blue", lwd=2)
}
#### begin
if(max(yfit) > 1.2*max(error[min_valid_pos:maxright4fit, 2]))
{
message(paste(" Note on hom fitting: fitting stopped at iter ",
itr, ", expected: ", totalitr, "\n",sep=""))
if(length(myyfit) == 1){
if(myyfit==0 && itr==1)
{
v<-optim(par=c(1, 1, 1),
fn=error_minimize3, x=x, end=end,
xfit_left=xfit_left, xfit_right=xfit_right,
d=error,
min_valid_pos=min_valid_pos, itr=itr, meanfit = meanfit,sdfit = sdfit_old )
xfit2 <- seq(0.01,maxright4fit,0.01)
yfit0 <- dnorm(xfit2,
mean=meanfit,
sd=sdfit)*(100000/itr)*v$par[2]*length(x)
if(sum(yfit0) > 0)
{
myyfit <- myyfit + yfit0
norm_fit_itr1 <- TRUE
if(het_observed && itr == 1)
{
homfit <- yfit0
}
}else # abnormal case: caution!: no fitting performed on hom fitting
{
yfit0 <- rep(0, length(xfit2))
myyfit <- myyfit + yfit0
norm_fit_itr1 <- TRUE
if(het_observed && itr == 1)
{
homfit <- yfit0
}
}
}
}
break;
}
#### end
# estimate error
error[1:maxright4fit, 2] <- abs(error[1:maxright4fit, 2] - yfit)
# get more accurate fitted values
xfit2 <- seq(0.01,maxright4fit,0.01);
#yfit0 <- dsnorm(xfit2,mean=meanfit,sd=sdfit, xi=xifit)*(100000/itr)*v$par[2]*length(x)
yfit0 <- dnorm(xfit2,mean=meanfit,sd=sdfit)*(100000/itr)*v$par[2]*length(x)
myyfit <- myyfit + yfit0
# record fitted parameters
mymeanfit[itr] <- meanfit
mysdfit[itr] <- sdfit
myxifit[itr] <- xifit
myscalefit[itr] <- (100000/itr)*v$par[2]*length(x)
if(itr == 1)
{
first_mean <- meanfit
first_sd <- sdfit
}
#
## het est: new feature
if(het_observed && itr == 1)
{
homfit <- yfit0
}
## end of het rate est: new feature
#
itr <- itr + 1
}
message("Iterative fitting done.\n\n")
if(normal == 0)
{
# original count
xlimmax <- min(3*first_peak_pos, selected)
plot(dhet[, 1], abs(dhet[, 2]),
xlim=c(0,xlimmax), ylim=c(0, 1.5*max(dhet[border_pos:xlimmax, 2])),
type='b', col="gray",
xlab="K-mer freq", ylab="Number of k-mers")
title(main=paste('Sample ', sample, ' k=', sizek), cex.main=0.8) #
# fitted count
if(sum(hetfit[1:selected] > 0)) # plus het
{
# fitted peak
fitted_peak_value <- which.max(myyfit)/100
if(norm_fit_itr1 == TRUE)
{
fitted_peak_value <- hom_peak_pos
}
fittedyvalues <- myyfit[seq(100, length(myyfit), 100)]
if(only_one_hom_peak)
{
# begin find optinum het_fitting_delta 2016-12-20
h_target <- c(hetfit[1:min(border_pos, het_min_valid_pos)],
as.vector(dhet[min(border_pos, het_min_valid_pos)+1:(length(dr[,2])-min(border_pos, het_min_valid_pos)),2]))
v2 <- optim(par=c(0.11, 1),
fn=error_minimize2,
h_het=hetfit[1:first_peak_pos],
h_hom=fittedyvalues[1:first_peak_pos],
h_target=h_target[1:first_peak_pos])
## update delta
het_fitting_delta <- v2$par[1]
message(paste(" Info: het_fitting_delta optimized as ", het_fitting_delta, "\n\n", sep=""))
#message(paste(" Info: hom_fitting_delta optimized as ", v2$par[2], "\n\n", sep=""))
# end find optinum het_fitting_delta 2016-12-20
}
if(only_one_hom_peak)
{
fittedyvalues[1:selected] <- fittedyvalues[1:selected] + (1-het_fitting_delta)*hetfit[1:selected]
}else
{
fittedyvalues[1:selected] <- fittedyvalues[1:selected] + hetfit[1:selected]
}
lines(1:length(fittedyvalues), fittedyvalues, col="blue", lwd=5)
# het
if(only_one_hom_peak)
{
lines(1:selected, (1-het_fitting_delta)*hetfit[1:selected], col="cyan", lwd=2, lty="dashed")
}
else
{
lines(1:selected, hetfit[1:selected], col="cyan", lwd=2, lty="dashed")
options(scipen=999)
het_fit <- cbind(1:selected, round(hetfit[1:selected]))
write.table(het_fit, file = paste(path, '/',prefix, vers, 'est.', sample, ".genome.size.estimated.k", min(targetsizek), 'to', max(targetsizek),".fitted_hetfit_count.txt", sep=""), sep = " ", quote=F, row.names = F, col.names = F)
message(paste0('writing hetfit_count file done!!!\n het_fit length: ',dim(het_fit)[1]," lines.\n"))
options(scipen=0)
}
}
else
{
lines(xfit2[100:length(xfit2)], myyfit[100:length(xfit2)], col="blue", lwd=5)
# fitted peak
fitted_peak_value <- which.max(myyfit)/100
fittedyvalues <- myyfit[seq(100, length(myyfit), 100)]
}
# error rate of missing fitting: from raw k-mer counting dhet which includes het k-mers
sum(fittedyvalues[border_pos:maxright4fit]-dhet[border_pos:maxright4fit,2])/sum(dhet[border_pos:maxright4fit,2])
# fitted catting original with border_pos <- min_valid_pos; from raw k-mer counting dhet which includes het k-mers
yfit2 <- c(fittedyvalues[1:min(border_pos, het_min_valid_pos)],
as.vector(dhet[min(border_pos, het_min_valid_pos)+1:(length(dr[,2])-min(border_pos, het_min_valid_pos)),2]))
#
lines(dhet[, 1], yfit2, col="red", lwd=1.5)
#title(main=paste('Sample ', sample, ' k=', sizek, ')'), cex.main=0.8)
#
# calculate genome size (Mb) according to original data
# with all kmers
genome_size <- round(sum(dr[,1]*dhet[,2]/first_peak_pos))
# with valid kmers: caution: 1.peak-value 2. valid kmer with cov >= min_valid_pos (or border_pos)
genome_size_filtering_error <- round(sum(dr[border_pos:length(dhet[,1]),1]*dhet[border_pos:length(dr[,1]),2]/first_peak_pos))
##
if(het_observed)
{
genome_size <- round(sum(dr[,1]*dhet[,2]/hom_peak_pos))
genome_size_filtering_error <- round(sum(dr[border_pos:length(dhet[,1]),1]*dhet[border_pos:length(dr[,1]),2]/hom_peak_pos))
}
# genomic kmers
# message(paste(" err-exl genomic ", sizek, "mers: ", sum(dhet[border_pos:length(dhet[,2]),2]), " Mio\n", sep=""));
# fitted data with fitted peak
genome_size_fitted <- round(sum(c(1:length(fittedyvalues))*fittedyvalues[1:length(fittedyvalues)]/fitted_peak_value))
# genomic kmers
# message(paste(" fit genomic ", sizek, "mers: ", round(sum(fittedyvalues)), " Mio\n", sep=""));
# catted data with fitted peak
genome_size_catted <- round(sum(c(1:length(yfit2))*yfit2[1:length(yfit2)]/fitted_peak_value))
# genomic kmers
# message(paste(" catted genomic ", sizek, "mers: ", round(sum(yfit2)), " Mio\n", sep=""));
# corrected genome size (not meaningful)
genome_size_corrected <- round(genome_size_catted/myxifit[1])
# genome size: catted data + raw mean
if(sum(hetfit[1:selected] > 0))
{
end_for_mean <- round(fitted_peak_value*2 - fitted_peak_value*2/4) # >7% error; k should be small
}else
if((sizek=="15" || sizek=="17") && myxifit[1]>=0.93 && myxifit[1]<=1.07) # normal case: reduce effect of copy-2 k-mers
{
end_for_mean <- round(fitted_peak_value*2 - fitted_peak_value*1/4)
}else
if(myxifit[1] > 1.07)
{
end_for_mean <- round(fitted_peak_value*2 + fitted_peak_value*2/4)
}else
if(myxifit[1] < 0.93)
{
end_for_mean <- round(fitted_peak_value*2 - fitted_peak_value*4/8)
}else
{
end_for_mean <- round(fitted_peak_value*2)
}
#### caution: end_for_mean should be not larger than vector size
end_for_mean <- min(end_for_mean, length(dr[,1])) ####
####
if(het_observed && only_one_het_peak) ## new ##
{
end_for_mean <- round(het_peak_pos*4 - het_peak_pos*4/4)
end_for_mean <- min(end_for_mean, length(dr[,1])) ####
#message("end_for_mean (only one het peak): ", end_for_mean, '\n')
message("hetfit: ceiling(abs(round(hetfit[1:end_for_mean]))/1000): ",ceiling(abs(round(hetfit[1:end_for_mean]))/1000),"\n")
xtmp <- rep(1:end_for_mean,ceiling(abs(round(hetfit[1:end_for_mean]))/1000))
first_mean_raw <- 2*mean(xtmp[xtmp>=1 & xtmp<=end_for_mean])
}else
if(het_observed & main_peak_is_hom==F)
{
het_end_for_mean <- first_peak_pos
het_end_for_mean <- min(het_end_for_mean, length(dr[,1])) ####
xtmp <- rep(1:het_end_for_mean,ceiling(abs(round(hetfit[1:het_end_for_mean]))/1000))
first_mean_raw <- 2*mean(xtmp[xtmp>=1 && xtmp<=het_end_for_mean])
}else
if(het_observed && only_one_hom_peak) # caution: expert suppl.
{
dtmp <- ceiling(abs(round(yfit2[1:end_for_mean]-(1-het_fitting_delta)*hetfit[1:end_for_mean]))/1000) # 2016-08-25
## start of specific in v1.94: from [0.5*het_peak, 1.5*het_peak]
offset_count <- round((fittedyvalues[1:end_for_mean]-yfit2[1:end_for_mean])/1000)
offl <- round(0.5*which.max(hetfit))
offr <- round(1.5*which.max(hetfit))
for (off_i in c(offl:offr))
{
if(offset_count[off_i] > 0)
{
dtmp[off_i] <- dtmp[off_i] + offset_count[off_i]
}
else
{
dtmp[off_i] <- dtmp[off_i] - offset_count[off_i]
}
}
## end of specific in v1.94
xtmp <- rep(1:end_for_mean,dtmp)
first_mean_raw <- mean(xtmp[xtmp>=1 & xtmp<=end_for_mean])
}else
{
xtmp <- rep(1:end_for_mean,ceiling(abs(round(yfit2[1:end_for_mean]-hetfit[1:end_for_mean]))/1000))
first_mean_raw <- mean(xtmp[xtmp>=1 & xtmp<=end_for_mean]);
}
#
genome_size_corrected2 <- round(sum(c(1:length(yfit2))*yfit2[1:length(yfit2)]/first_mean_raw))
#
#
## heterozygosity est: new feature - 2016-11-18
if(het_observed)
{
# alpha = 1 / [ (O/E)*K + K ]
# het
het_end <- round(first_mean_raw)
hom_end <- min(round(2*first_mean_raw), selected)
if(only_one_hom_peak)
{
# only for humans: do the correction for those k-mers from X-Y
het_correction <- 0
if(species=="human")
{
het_correction <- (155270560 - 59373566)*first_mean_raw/2;
}
#sum(c(1:het_end)*(1-het_fitting_delta)*hetfit[1:het_end])
alpha <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/ sum(seq(1:het_end)*hetfit[1:het_end])/(1-het_fitting_delta)*as.numeric(sizek) + as.numeric(sizek) )
alpha_cor <- 0
alpha_cor <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/ (sum(seq(1:het_end)*hetfit[1:het_end])-het_correction)/(1-het_fitting_delta)*as.numeric(sizek) + as.numeric(sizek) )
}
else
{
# only for humans: do the correction for those k-mers from X-Y
het_correction <- 0
if(species=="human")
{
het_correction <- (155270560 - 59373566)*first_mean_raw/2;
}
#sum(c(1:het_end)*hetfit[1:het_end])
alpha <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/ sum(seq(1:het_end)*hetfit[1:het_end])*as.numeric(sizek) + as.numeric(sizek) )
alpha_cor <- 0
alpha_cor <- 1 / ( sum(seq(1:hom_end)*homfit[seq(100, length(homfit), 100)][1:hom_end])/ (sum(seq(1:het_end)*hetfit[1:het_end])-het_correction )*as.numeric(sizek) + as.numeric(sizek) )
}
}
## end of heterozygosity rate est: new feature - 2016-11-18
#
#
# single copy region of genome
singlestart = 1
singleend = min(round(2*first_mean_raw),length(dr[,1])-1) #### caution: out of bounds
singlesize = sum(dhet[singlestart:singleend, 1]*yfit2[singlestart:singleend]/first_mean_raw)
# begin: new on 2016-08-08
repsize = round(sum(dhet[(singleend+1):length(yfit2), 1]*yfit2[(singleend+1):length(yfit2)]/first_mean_raw))
#legend(45, 1.36*max(dr[border_pos:60, 2]),
if(het_observed)
{
legend("topright",
fill = c('gray','white', 'white', 'cyan', 'white', 'blue',
'white', 'white', 'white', 'red', 'white', 'white'),
legend = c(paste('Observed (obs.): ',
round(genome_size/10^6, digits = 3), ' Mb (error-excluded: ',
round(genome_size_filtering_error/10^6, digits = 3), ' Mb)', sep=""),
paste('* k-mer_cov obs.: ', first_peak_pos, sep=""),
paste('* signal_error_border ', het_min_valid_pos, sep=""),
paste('Heterozygous fitting (intermediate info)'),
paste('* het rate: ',
round(alpha, digits=8), " (ori) : (cor) ",
round(alpha_cor, digits = 8), sep=""),
paste('Fitted count with fitted k-mer_cov: ',
round(genome_size_fitted/10^6, digits = 3), ' Mb', sep=""),
paste('* k-mer_cov fit.: ',
round(fitted_peak_value, digits=2), sep=""),
paste('* 1st-sd:',
round(first_sd, digits=2), sep=""),
paste('* 1st-skewness: ',
round(myxifit[1], digits=2), sep=""),
paste('Fitted+obs. with corrected k-mer_cov: ',
round(genome_size_corrected2/10^6, digits = 3), ' Mb', sep=""),
paste('* k-mer_cov cor.: ',
round(first_mean_raw, digits=2), sep=""),
paste('* repetitive_ratio ',
round(repsize/genome_size_corrected2, digits=2), ': ',
round(repsize/10^6, digits = 3), ' Mb', sep="")),
border="NA",
cex=0.7)
}
else
{
legend("topright",
fill = c('gray','white', 'white', 'blue', 'white',
'white', 'white', 'red', 'white', 'white'),
legend = c(paste('Observed (obs.): ',
round(genome_size/10^6, digits = 3), ' Mb (error-excluded: ',
round(genome_size_filtering_error/10^6, digits = 3), ' Mb)', sep=""),
paste('* k-mer_cov obs.: ', first_peak_pos, sep=""),
paste('* signal_error_border ', border_pos, sep=""),
paste('Fitted count with fitted k-mer_cov: ',
round(genome_size_fitted/10^6, digits = 3), ' Mb', sep=""),
paste('* k-mer_cov fit.: ', round(fitted_peak_value, digits=2), sep=""),
paste('* 1st-sd:', round(first_sd, digits=2), sep=""),
paste('* 1st-skewness: ', round(myxifit[1], digits=2), sep=""),
paste('Fitted+obs. with corrected k-mer_cov: ',
round(genome_size_corrected2/10^6, digits = 3), ' Mb', sep=""),
paste('* k-mer_cov cor.: ', round(first_mean_raw, digits=2), sep=""),
paste('* repetitive_ratio ',
round(repsize/genome_size_corrected2, digits=2), ': ',
round(repsize/10^6, digits = 3), ' Mb', sep="")),
border="NA",
cex=0.7)
}
# end: new on 2016-08-08
# genome sizes
size_all[i] <- genome_size
size_exl[i] <- genome_size_filtering_error
size_fit[i] <- genome_size_fitted
size_cat[i] <- genome_size_catted
size_cor[i] <- genome_size_corrected
size_cor2[i] <- genome_size_corrected2
# rbind(size_all, size_exl, size_fit, size_cat)
# genome-wide average kmer coverage related info
kmer_peak_cnt[i] <- dr[first_peak_pos, 2]
kmer_peak_pos[i] <- first_peak_pos
kmer_peak_pos2[i] <- fitted_peak_value
}
# labels for later plotting purpose
labels[3*i-1] <- paste("", sizek, sep="")
namebank[i] <- sizek
}
else
{
dev.off()
stop(paste("file ", countingfile, " not found!",sep=""))
quit("no")
}
}
# summarize all kmer counting plots
ylimmax <- 0
for (sizek in targetsizek[1]) {
histx<-read.table(countingfile)
peaks2 <- pracma::findpeaks(histx$V2[3:selected])
peaks2[,2:4] <- peaks2[,2:4]+2
if(1.5*max(peaks2[,1]) > ylimmax)
{
ylimmax <- 1.5*max(peaks2[,1])
}
}
# colors
colors <- rainbow(length(size_all))
#
## plot comparision among gsize-estimations using different info
genome_size_summary <-rbind(size_all, size_exl, size_cat, size_fit, size_cor2)
genome_size_summary2 <-rbind(size_exl, size_fit, size_cor2)
message(paste("Genome size estimate for ", sample, ": ", size_cor2, " bp.\n\n", sep=""))
# caution
if(length(which(is.na(genome_size_summary))) > 0)
{
message("\n caution: some samples have NA predicted for observed genome size!\n")
genome_size_summary[is.na(genome_size_summary)] <- 0
}
if(length(which(is.na(genome_size_summary2))) > 0)
{
message("\n caution: some samples have NA predicted for fitted genome size!\n")
genome_size_summary2[is.na(genome_size_summary2)] <- 0
}
if(length(genome_size_summary[1,])!=0 & length(genome_size_summary2[1,])!=0)
{
# record in file
write.table(genome_size_summary,
paste(path, '/',prefix, vers, 'est.', sample, ".genome.size.estimated.k",
min(targetsizek), 'to', max(targetsizek), ".fitted.txt", sep=""),
quote = FALSE, row.names = TRUE, col.names=FALSE)
# bar plot
mingsize <- min(genome_size_summary2[!is.na(genome_size_summary2)])
maxgsize <- max(genome_size_summary2[!is.na(genome_size_summary2)])
colors <- grey((2:0)/3)
mp <- barplot(genome_size_summary2, beside=TRUE, col=colors,
xlab=paste("Size k", sep=""), ylab="Genome size (bp)",
axes = FALSE, axisnames = FALSE, border=NA,
ylim=c(0.5*max(genome_size_summary2),1.2*max(genome_size_summary2)),
xpd=FALSE)
legend("topright",
legend = c(paste('EXL: ', round(mean(genome_size_summary2[1,]/10^6), digits=3), 'Mb'),
paste('FIT: ', round(mean(genome_size_summary2[2,]/10^6), digits=3), 'Mb'),
paste('COR: ', round(mean(genome_size_summary2[3,]/10^6), digits=3), 'Mb')),
fill = colors, cex=.8, border=NA)
text(mp, par("usr")[3], labels = labels, srt = 45, adj = c(1.1,1.1), xpd = TRUE, cex=.8)
axis(2, cex.axis=.8)
title(main=paste("Genome size estimation by error-excluding, fitting, and correcting\n", sep=""))
dev.off()
}
} # end of sample
# output est. on het if asked.
if(het_observed)
{
write(paste("Het_rate ", round(alpha, digits = 8), " ", round(alpha_cor, digits = 8), sep=""),
file = paste(path, '/',prefix, vers, 'est.',
sample, ".genome.size.estimated.k",
min(targetsizek), 'to', max(targetsizek),
".fitted.txt", sep=""),
append = TRUE,
sep = " ")
}
# output est. on ratio of repeats
write(paste("Est. ratio of repeats ", round(repsize/genome_size_corrected2, digits=8), sep=""),
file = paste(path, '/',prefix, vers, 'est.',
sample, ".genome.size.estimated.k",
min(targetsizek), 'to', max(targetsizek),
".fitted.txt", sep=""),
append = TRUE,
sep = " ")
# output est. on avg k-mer cov
write(paste("Final k-mer cov ", round(first_mean_raw, digits=8), sep=""),
file = paste(path, '/',prefix, vers, 'est.',
sample, ".genome.size.estimated.k",
min(targetsizek), 'to', max(targetsizek),
".fitted.txt", sep=""),
append = TRUE,
sep = " ")
if(het_observed && only_one_hom_peak)
{
write(paste("het_fitting_delta ", round(het_fitting_delta, digits=8), sep=""),
file = paste(path, '/',prefix, vers, 'est.',
sample, ".genome.size.estimated.k",
min(targetsizek), 'to', max(targetsizek),
".fitted.txt", sep=""),
append = TRUE,
sep = " ")
}
#
end_t <- Sys.time()
message('Time consumed: ', format(end_t-start_t), '\n')
return(list(het_xfit_right_save,hom_peak_pos_save,het_peak_pos_save,meanfit_old, sdfit_old))
}
## additional
#' Filter Peaks from K-mer Histogram
#'
#' This function filters peaks from a k-mer histogram to find a major peak with enough support information on both sides.
#'
#' @param peaks A data frame containing the peaks from the histogram.
#' @param histo A data frame representing the k-mer histogram.
#' @return A data frame with filtered peaks.
#' @examples
#' peaks <- data.frame(V1=1:20, V2=sample(1:10, 20, replace=TRUE))
#' histo <- data.frame(V1=1:100, V2=sample(1:10, 100, replace=TRUE))
#' filtered_peaks <- filter_peaks(peaks, histo)
#' print(filtered_peaks)
#' @export
filter_peaks <- function(peaks, histo)
{
# find out a major peak with enough support information on both sides (to exlcude potential local maxima)
pos <- 1 # peak position (namely, the k-mer freq at the peak)
i <- 0 # traverse candidate peaks
# a peak freq=pos should start at least from 5, or else too small to perform curve fitting
while(pos < 5 & i < length(peaks[,1]))
{
i <- i+1
pos <- peaks[i, 2] # x-axis peak
}
if(pos < 5)
{
stop(paste(" Error: k-mer coverage seemed too low to perform fitting. Minimum average k-mer coverage 5 required. Program exited.\n",sep=""))
}
# check if there are enough support on both sides of pos
while(i < length(peaks[,1]))
{
# check left: how many on left are smaller than count at current peak
posleft <- pos - 1
j <- 0 # counter of smaller cases on left
while(posleft>=1)
{
if(histo[posleft, 2] < histo[pos, 2])
{
j <- j + 1;
}
posleft <- posleft - 1
}
# check right: if there are right-side counts larger than count at the current peak
posright <- pos + 1
k <- 0 # counter of larger cases on right
maxpos <- peaks[length(peaks[,1]), 2]
while(posright < maxpos)
{
if(histo[posright, 2] > histo[pos, 2])
{
k <- k + 1
if(k>4)
{
break
}
}
posright <- posright + 1
}
# condition to continue or break
if(j <= 4 || k>4)
{
i <- i + 1
pos <- peaks[i, 2] # x-axis peak
}
else
{
break
}
}
# filter out peaks near erroneous peak
rpeaks <- peaks
fpi <- 1
while(rpeaks[fpi, 2] < pos/4)
{
rpeaks <- rpeaks[-fpi,]
}
return(rpeaks)
}
#' Filter Peaks from K-mer Histogram
#'
#' This function filters peaks from a k-mer histogram to find a major peak with enough support information on both sides.
#'
#' @param histo_data A data frame representing the k-mer histogram.
#' @return A data frame with filtered peaks.
#' @examples
#'
#' x <- seq(-10, 10, length.out = 100)
#' y <- dnorm(x)
#' histo_data <- data.frame(V1 = 1:100, V2 = y * 10)
#' get_het_pos(histo_data)
#'
#' @export
get_het_pos <- function(histo_data){
data <- histo_data
y_data <- data$V2
# Calculate slope (second derivative)
slope <- diff(y_data)
# Find the index where slope becomes positive and its following 5 slopes remain positive or zero
start_index <- NULL
for (i in 2:(length(slope) - 4)) {
if (slope[i] > 0 && all(slope[(i+1):(i+4)] >= 0)) {
start_index <- i
break
}
}
message("start index: ",start_index,"\n")
end_index <- NULL
# Find the first interval where slope starts decreasing after positive slope
for (i in (start_index + 1):(length(slope) - 1)) {
if (slope[i] < 0 && slope[i+1] < 0) {
end_index <- i
break
}
else if (slope[i] >= 0 && slope[i] < slope[i-1] && slope[i] < slope[i+1] &&
slope[i-1] < slope[i-2] && slope[i-2] < slope[i-3] &&
slope[i+1] < slope[i+2] && slope[i+2] < slope[i+3]){
if (is.null(end_index)) {
end_index <- i
break
}
}
}
#print(end_index)
return(list(start_index ,end_index))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.