#' @title Perform Smoothing and / or m-th Derivative
#' @description Performs a smoothing and / or the calculation of the m-th
#' derivative on the given dataset using the Savitzky-Golay filter.
#' @details The underlying function is \code{\link[signal]{sgolayfilt}}.
#' @param dataset An object of class 'aquap_data" as produced by
#' \code{\link{gfd}} or as can be extracted from a 'cube' via
#' \code{\link{getcd}}.
#' @param p Numeric length one, the order of the filter. Default = 2.
#' @param n Numeric length one, the filter length, must be odd. Default = 21.
#' @param m Numeric length one, Return the m-th derivative of the filter
#' coefficients. Default = 0.
#' @param exportModel Logical. If a possible model should be stored in the set,
#' (leave at \code{FALSE}).
#' @return Returns the dataset with the NIR-data smoothed or transformed.
#' @examples
#' \dontrun{
#' fd <- gfd() # get the full dataset
#' fd <- selectWls(fd, 1100, 1800)
#' fd_avg <- do_avg(fd, n=71) # apply strong smoothing
#' plot(fd - fd_avg, pg.where="", pg.main="| smoothed subtracted")
#' }
#' @family Data pre-treatment functions
#' @family dpt modules documentation
#' @export
do_sgolay <- function(dataset, p=2, n=21, m=0, exportModel=FALSE) {
autoUpS()
cns <- colnames(dataset$NIR)
rns <- rownames(dataset$NIR)
NIR <- t(apply(dataset$NIR, 1, signal::sgolayfilt, p=p, n=n, m=m))
exportAdditionalModelToAp2GD(doExport=exportModel, thisMod=NULL, thisType=pv_dptModules[1]) # sgol
colnames(NIR) <- cns
rownames(NIR) <- rns
dataset$NIR <- I(NIR)
return(dataset)
} # EOF
# is used when we have no access to gl_ap2GD like in parallel operations
do_sgolay_sys <- function(dataset, p=2, n=21, m=0) {
cns <- colnames(dataset$NIR)
rns <- rownames(dataset$NIR)
NIR <- t(apply(dataset$NIR, 1, signal::sgolayfilt, p=p, n=n, m=m))
colnames(NIR) <- cns
rownames(NIR) <- rns
dataset$NIR <- I(NIR)
return(dataset)
} # EOF
### for baseline removal
#' @title Calculate Standard Normal Variation SNV
#' @description Calculate the standard normal variation (SNV) by autoscaling the
#' transformed NIR-data
#' @inheritParams do_sgolay
#' @return Returns the dataset with the transformed NIR data.
#' @examples
#' \dontrun{
#' fd <- gfd() # get the full dataset
#' fd <- selectWls(fd, 1100, 1800)
#' fd_snv <- do_snv(fd)
#' plot(fd - fd_snv, pg.where="", pg.main="| snv subtracted")
#' }
#' @family Data pre-treatment function
#' @family dpt modules documentation
#' @export
do_snv <- function(dataset, exportModel=FALSE) {
autoUpS()
NIR <- t(scale(t(dataset$NIR),center=TRUE,scale=TRUE))
exportAdditionalModelToAp2GD(doExport=exportModel, thisMod=NULL, thisType=pv_dptModules[2]) # snv
colnames(NIR) <- colnames(dataset$NIR)
rownames(NIR) <- rownames(dataset$NIR)
dataset$NIR <- I(NIR)
return(dataset)
} # EOF
#' @title Perform MSC
#' @description Performs MSC (multiplicative scatter correction) on the
#' provided dataset.
#' @details If no reference is provided, the average of all spectra of the provided
#' dataset is used as a reference for the baseline correction. Provide a dataset
#' with a single spectrum (as e.g. produced by \code{\link{do_avg}}) to use this
#' as a reference for baseline correction. Internally, the function
#' \code{\link[pls]{msc}} is used.
#' @inheritParams do_sgolay
#' @param ref An object of class 'aquap_data' containing a single spectrum,
#' i.e. a single row (as e.g. produced by \code{\link{do_avg}}, or by
#' subscripting via '[]'). If no reference is provided, the average of all spectra
#' of the provided dataset is used as a reference for the baseline correction.
#' @param extMscModel Provide an external msc model to use this for predicting
#' the data instead of running msc on the data. Experimental feature.
#' @return Returns the dataset with the transformed NIR data.
#' @seealso \code{\link{ssc}} for sub-selecting in a datset using a specific
#' criterion.
#' @examples
#' \dontrun{
#' fd <- gfd() # get the full dataset
#' fd <- selectWls(fd, 1100, 1800)
#' fd_msc <- do_msc(fd)
#' plot(fd - fd_msc, pg.where="", pg.main="| msc subtracted")
#' fd_msc_ref <- do_msc(fd, ref=fd[4]) # use the 4th row of fd as a reference
#' plot(fd - fd_msc_ref, pg.where="", pg.main="| 4th row as reference")
#' fd_msc_mq <- do_msc(fd, ref=do_avg(ssc(fd, C_Group=="MQ"))) # is assuming the
#' # existence of a column name "C_Group" and one or more of its values being "MQ"
#' plot(fd - fd_msc_mq, pg.where="", pg.main="| average of MQ as reference")
#' }
#' @seealso \code{\link{getcd}}
#' @family Data pre-treatment functions
#' @family dpt modules documentation
#' @export
do_msc <- function(dataset, ref=NULL, extMscModel=NULL, exportModel=FALSE) {
autoUpS()
if (!is.null(ref)) {
if (!is(ref, "aquap_data")) {
stop("Please provide an object of class 'aquap_data' to the argument 'ref'.", call.=FALSE)
}
if (nrow(ref) != 1) {
stop("Please provide a dataset with only one single row, i.e. only one single spectrum, to the argument 'ref'.", call.=FALSE)
}
if (ncol(ref$NIR) != ncol(dataset$NIR)) {
stop("Please provide a dataset containing the same number of wavelengths to the argument 'ref'", call.=FALSE)
}
refInput <- as.numeric(ref$NIR)
} else {
refInput <- NULL
}
if (is.null(extMscModel)) {
NIR <- pls::msc(dataset$NIR, refInput)
} else {
NIR <- predict(extMscModel, as.matrix(dataset$NIR))
}
exportAdditionalModelToAp2GD(doExport=exportModel, thisMod=NIR, thisType=pv_dptModules[3]) # msc
colnames(NIR) <- colnames(dataset$NIR)
rownames(NIR) <- rownames(dataset$NIR)
dataset$NIR <- I(NIR)
return(dataset)
} # EOF
#' @title Average spectra in a dataset
#' @description Calculate the average spectrum of all the given spectra in a
#' dataset, or define groups to be averaged by providing one or more
#' class-variable.
#' @details The total and group-wise average spectrum is returned each in a
#' single row with the header-data taken from the first row of the subset of
#' the original dataset as defined by the grouping. If parameter \code{clv} is
#' left at the default NULL, all the spectra in the dataset will be averaged
#' together into a single row.
#' @inheritParams do_sgolay
#' @param clv Character vector, one or more valid class-variables defining the
#' subsets of the dataset to be averaged.
#' @param inParallel Logical, if the averaging of spectra should be done in
#' parallel. Defaults to TRUE.
#' @return The transformed dataset.
#' @examples
#' \dontrun{
#' fd <- gfd() # get the full dataset
#' fd <- selectWls(fd, 1100, 1800)
#' fd_avg <- do_avg(fd)
#' fd_avg2 <- do_avg(ssc(fd, C_water=="MQ")) # assumes the existence of column "C_water"
#' plot(fd_avg, pg.where="", pg.main="| averaged")
#' plot(fd - fd_avg, pg.where="", pg.main="| avg subtracted")
#' #######
#' fd_avg_group <- do_avg(fd, "C_Group") # averaging within groups
#' fd_avg_time_group <- do_avg(fd, c("C_Time", "C_Group")) # averaging within a
#' # subset defined by "C_Time" and "C_Group"
#' }
#' @family Data pre-treatment functions
#' @seealso \code{\link{getcd}}
#' @export
do_avg <- function(dataset, clv=NULL, inParallel=TRUE) {
autoUpS()
if (is.null(clv)) {
NIR <- matrix(apply(dataset$NIR, 2, mean), nrow=1)
colnames(NIR) <- colnames(dataset$NIR)
dsNew <- dataset[1,]
dsNew$NIR <- I(NIR)
return(dsNew)
} # end is.null(clv)
#
cns <- colnames(dataset$header)
if (!all(clv %in% cns)) {
ind <- which(!clv %in% cns)
stop(paste0("Sorry, the class-variables '", paste(clv[ind], collapse="', '"), "' seem not to be present in the provided dataset."), call.=FALSE)
}
fdf <- makeFlatDataFrameMulti(dataset, clv)
if (inParallel) {
registerParallelBackend()
} else {
registerDoSEQ()
}
##
fdfMean <- plyr::ddply(fdf, .variables=clv, .fun=plyr::colwise(mean), .parallel=inParallel) ##### CORE #####
NIR <- as.matrix(fdfMean[,-(1:length(clv))])
smh <- fdfMean[, 1:length(clv), drop=FALSE] # XXX problem with collapsing !! XXX
##
header <- as.data.frame(matrix(rep(NA, ncol(dataset$header)), nrow=1))
colnames(header) <- colnames(dataset$header)
colRep <- as.data.frame(matrix(rep(NA, ncol(dataset$colRep)), nrow=1))
colnames(colRep) <- colnames(dataset$colRep)
ds <- dataset
for (i in 1: nrow(smh)) {
for (k in 1: ncol(smh)) {
ds <- ssc_s(ds, clv[k], smh[i,k]) # search the matching data to construct the header
} # end for k
ds <- ds[1,] # only take the first row
siH <- ds$header
class(siH) <- "data.frame"
header <- rbind(header, siH)
siCol <- ds$colRep
class(siCol) <- "data.frame"
colRep <- rbind(colRep, siCol)
ds <- dataset
} # end for i
header <- header[-1,] # leave out all the NAs
colRep <- colRep[-1,]
rownames(NIR) <- rownames(header)
newData <- data.frame(I(header), I(colRep), I(NIR))
dataset@.Data <- newData
dataset@row.names <- rownames(header)
for (i in 1: ncol(dataset$header)) {
if (all(is.character(dataset$header[,i]))) {
dataset$header[i] <- factor(dataset$header[,i])
}
}
return(dataset)
} # EOF
### input is a data frame with one or 2 loading vectors or one regression vector
calc_emsc <- function(dataset, input) { ## this one possibly used "external"
autoUpS()
cnsWls <- colnames(dataset$NIR)
rns <- rownames(dataset$NIR)
NIRdata <- dataset$NIR
class(NIRdata) <- "matrix"
wls <- getWavelengths(dataset)
ColMeansNIR <- colMeans(NIRdata)
if (ncol(input) == 1) {
Xcal1 <- cbind(rep(1, ncol(NIRdata)), ColMeansNIR, input[1])
Ycor2_tmp <- t(rep(NA, ncol(NIRdata)))
for (i in 1: nrow(NIRdata)) {
Ycal <- as.data.frame(matrix(NIRdata[i,], ncol=1))
tmp <- lm(Ycal$V1 ~ Xcal1[,1] + Xcal1[,2] + Xcal1[,3])
b <- tmp$coefficients[-2]
Ycor2_tmp <- rbind(Ycor2_tmp, t(((Ycal-b[1])-b[3]*Xcal1[,3])/b[2])) ## here things are happening !!
} # end for i
Ycor2 <- out <- Ycor2_tmp[-1,]
} else {
Xcal1 <- cbind(rep(1, ncol(NIRdata)), ColMeansNIR, input[1], input[2])
Ycal <- NA
Ycor2_tmp <- t(rep(NA, ncol(NIRdata)))
for ( i in 1: nrow(NIRdata)) {
Ycal <- as.data.frame(matrix(NIRdata[i,], ncol=1))
tmp <- lm(Ycal$V1 ~ Xcal1[,1] + Xcal1[,2] + Xcal1[,3] + Xcal1[,4] )
b <- tmp$coefficients[-2]
Ycor2_tmp <- rbind(Ycor2_tmp, t(((Ycal-b[1]) - b[3]*Xcal1[,3]-b[4]*Xcal1[,4]) / b[2]) )
} # end for i
Ycor2 <- out <- Ycor2_tmp[-1,]
} # all is done
colnames(Ycor2) <- cnsWls
rownames(Ycor2) <- rns
out <- as.data.frame(Ycor2)
return(out)
} # EOF
### input the vecLoad directly from the model via: pcaModel$loadings[, c(x,y)]
#' @title Perform EMSC
#' @description Performs EMSC with up to two signals on the provided dataset
#' @details A data frame with one or two loadings or with one regression vector
#' can be used as input to (try to) remove the effect on the data described by
#' said loadings / regr. vector. For example, from a pca-model the first and
#' second loading vector can be extracted with \code{pcaModel$loadings[, c(1,2)]}.
#' @inheritParams do_sgolay
#' @param vecLoad A data frame x (\code{ncol(x) <= 2}) with one or two loading
#' vectors or one regression vector.
#' @return Returns the dataset with the transformed NIR data.
#' @seealso \code{\link{getcm}} for easy extraction of single models where
#' loading vectors or a regression vector can be obtained.
#' @examples
#' \dontrun{
#' fd <- gfd()
#' cu <- gdmm(fd, getap(do.pca=TRUE)) # assumes no split
#' loadings <- getcm(cu, 1, what="pca")$loadings[, c(1,2)]
#' fd_emsc <- do_emsc(fd, loadings)
#' }
#' @family Data pre-treatment functions
#' @family dpt modules documentation
#' @export
do_emsc <- function(dataset, vecLoad=NULL, exportModel=FALSE) {
autoUpS()
input <- as.data.frame(vecLoad)
if (ncol(input) > 2) {
stop("At the moment, not more than 2 effects can be removed from the data. Please be a bit more content.", call.=FALSE)
}
if (is.null(vecLoad)) {
stop("Please provide a data frame with one or two loading vectors or one regression vector to the argument 'vecLoad' (do_emsc).", call.=FALSE)
}
NIR <- as.matrix(calc_emsc(dataset, input))
exportAdditionalModelToAp2GD(doExport=exportModel, thisMod=NULL, thisType=pv_dptModules[4]) # emsc
rownames(NIR) <- rownames(dataset)
colnames(NIR) <- colnames(dataset$NIR)
dataset$NIR <- I(NIR)
return(dataset)
} #EOF
do_scale <- function(dataset) {
NIR <- som::normalize(dataset$NIR, byrow=FALSE)
colnames(NIR) <- colnames(dataset$NIR)
rownames(NIR) <- rownames(dataset$NIR)
dataset$NIR <- NIR
return(dataset)
} # EOF
do_scale_fc <- function(dataset, calibAvgTable) { # used in aquagram norm foreign center
avg <- apply(calibAvgTable, 2, mean)
NIR <- scale(as.matrix(dataset$NIR), center=avg, scale=TRUE)
colnames(NIR) <- colnames(dataset$NIR)
rownames(NIR) <- rownames(dataset$NIR)
dataset$NIR <- I(NIR)
return(dataset)
} # EOF
#' @title Perform gap-segment derivatives
#' @description Performs gap-segment derivatives on the provided dataset. The
#' behaviour of the filter can be modified via its arguments, please see also
#' the documentation for \code{\link[prospectr]{gapDer}}.
#' @details The first column of the wavelengths and the last one get doubled in
#' order to have the same number of wavelengths in the resulting dataset as in
#' the original dataset. The underlying function is \code{\link[prospectr]{gapDer}}.
#' @inheritParams do_sgolay
#' @param m Numeric length one. The order of the derivative, can be between 1 and
#' 4. Default is 1.
#' @param w Numeric length one. The filter length (should be odd), ie. the spacing
#' between points
#' over which the derivative is computed. Default is 1.
#' @param s Numeric length one. The segment size, i.e. the range over which the
#' points are averaged.Default is 1.
#' @param deltaW Numeric length one. The sampling interval / band spacing. Default
#' is 1.
#' @section Note:
#' The documentation for the parameters was mostly taken from
#' \code{\link[prospectr]{gapDer}} by Antoine Stevens.
#' @seealso \code{\link[prospectr]{gapDer}}
#' @examples
#' \dontrun{
#' fd <- gfd()
#' fd_gsd <- do_gapDer(fd)
#' fd_gsd2 <- do_gapDer(fd, 1,11,13,1)
#' plot(fd_gsd - fd_gsd2)
#' }
#' @family Data pre-treatment functions
#' @family dpt modules documentation
#' @export
do_gapDer <- function(dataset, m=1, w=1, s=1, deltaW, exportModel=FALSE) {
autoUpS()
nir <- getNIR(dataset)
NIR <- prospectr::gapDer(nir, m=m, w=w, s=s, delta.wav=deltaW) # gives back a matrix with one columne less at the beginning and end, so in total two wavelengths missing !!
# NIR <- as.matrix(NIR)
# wlsGap <- as.numeric(gsub("w", "", colnames(NIR)))
# wlsD <- getWavelengths(dataset)
# indLow <- which(wlsD == min(wlsGap))
# indHigh <- which(wlsD == max(wlsGap))
# nrLostLow <- length(1:indLow)
# nrLostHigh <- length(indHigh: length(wlsD))
# cat(paste("Low wls lost:", nrLostLow, "\n"))
# cat(paste("High wls lost:", nrLostHigh, "\n"))
# return(NIR)
# first <- as.data.frame(NIR[,1])
# last <- as.data.frame(NIR[, ncol(NIR)])
# NIR <- cbind(first, NIR, last)
# colnames(NIR) <- colnames(dataset$NIR)
# rownames(NIR) <- rownames(dataset$NIR)
# NIR <- as.matrix(NIR) # because otherwise the "AsIs" is behaving strange
exportAdditionalModelToAp2GD(doExport=exportModel, thisMod=NULL, thisType=pv_dptModules[7]) # gapDer
dataset$NIR <- I(NIR)
return(dataset)
} # EOF
checkDeTrendSrcTrgInput <- function(dataset, src=NULL, trg="src") {
addInfo <- "\n(For the latter, please see ?dpt_modules for further information.)"
checkNumLengthTwo <- function(arg) {
argName <- deparse(substitute(arg))
if (!all(is.numeric(arg)) | length(arg) != 2) {
stop(paste0("Please provide a numeric length two to the argument '", argName, "' resp. in the de-trend argument in the analysis procedure / your input.", addInfo), call.=FALSE)
}
} # EOIF
##
checkWls <- function(arg, allWls= getWavelengths(dataset)) {
argName <- deparse(substitute(arg))
if (min(arg) < min(allWls) | max(arg) > max(allWls)) {
stop(paste0("Sorry, the range in the argument '", argName, "' is not within the wavelength-range of the provided dataset [", min(allWls) , " to ", max(allWls), "]. Please check your input at the argument '", argName, "' resp. in the de-trend argument in the analysis procedure / your input.", addInfo), call.=FALSE)
}
} # EOIF
checkRange <- function(arg) {
argName <- deparse(substitute(arg))
if (diff(arg) <= 0) {
stop(paste0("Please provide a wavelength range greater than zero for the argument '", argName, "' resp. in the de-trend argument in the analysis procedure / your input.", addInfo), call.=FALSE)
}
}
if (!is.null(src)) {
checkNumLengthTwo(src)
checkWls(src)
checkRange(src)
}
if (any(is.character(trg))) {
if (!trg %in% c("src", "all")) {
stop(paste0("Please provide one of 'src', 'all' or a numeric length two to the argument 'trg' resp. in the de-trend argument in the analysis procedure / your input.", addInfo), call.=FALSE)
}
}
if (all(trg == "src")) {
trg <- src
}
if (all(trg == "all")) {
trg <- range(getWavelengths(dataset))
}
if (!is.null(trg)) {
checkNumLengthTwo(trg)
checkWls(trg)
checkRange(trg)
}
assign("trg", trg, pos=parent.frame(n=1))
##
} # EOF
#' @title Perform De-Trend
#' @description Perform de-trending of the given dataset. For each observation,
#' the linear model \code{lm(absorbance ~ wavelength)} is calculated. The
#' coefficients of the resulting model are then used to modify the absorbance
#' values after the formula
#' \code{newAbsorbance = absorbance - intercept - k*wavelength}. It is possible
#' to separately specify the source and the target of the de-trend operation.
#' @details Via the arguments \code{src} ('source') and \code{trg} ('target')
#' it is possible to specify separately from which wavelength-range the values
#' for the de-trend should be taken resp. to which wavelength-range the
#' resulting de-trend should be applied. Please see documentation for the
#' arguments \code{src} and \code{trg} for further details. If the target
#' wavelengths are only a part of the whole wavelength-range in the dataset,
#' the de-trend will be applied only to this wavelengths, and the rest of the
#' wavelengths will remain unchanged. Abrupt steps in the resulting spectra can
#' be the result of this. If both arguments \code{src} and \code{trg} are left
#' at their defaults, the full range of wavelengths in the dataset is used for
#' calculating the de-trend values, i.e. the linear models, and the resulting
#' de-trend is also applied to the full range of wavelengths present in the
#' dataset.
#' @inheritParams do_sgolay
#' @param dataset An object of class 'aquap_data' as produced e.g. by
#' \code{\link{gfd}}.
#' @param src 'source'; the wavelength-range from where the values for de-trend
#' should be calculated (i.e. the linear models). Leave at the default NULL
#' to use the full range of wavelengths in the provided dataset, or provide
#' a numeric length two to use this wavelength range for calculating the values
#' for de-trend.
#' @param trg 'target'; character length one or numeric length two. The wavelengths
#' where the de-trend should be applied. If left at the default 'src' the same
#' wavelength as specified in \code{src} is used. Possible values are:
#' \describe{
#' \item{src}{Apply the de-trend to the same wavelength-range as specified in
#' argument \code{src} (the default).}
#' \item{all}{Apply the de-trend to all the wavelengths in the provided dataset.}
#' \item{Numeric length two}{Provide a numeric length two to the argument
#' \code{trg} to apply the de-trend only to this wavelength range.}
#' }
#' @return The transformed dataset
#' @examples
#' \dontrun{
#' fd <- gfd()
#' fdDT <- do_detrend(fd) # use the whole wavelength range of 'fd' as source and
#' # target for the de-trend
#' plot(fd)
#' plot(fdDT)
#' ###
#' fdc <- selectWls(fd, 1300, 1600)
#' plot(fdc)
#' plot(do_detrend(fdc)) # whole range as source and target
#' plot(do_detrend(fdc, src=c(1400, 1500))) # same target as soruce
#' plot(do_detrend(fdc, src=c(1400, 1500), trg="all")) # apply to full range
#' plot(do_detrend(fdc, src=c(1400, 1500), trg=c(1300, 1600))) # same as above
#' plot(do_detrend(fdc, src=c(1300, 1400), trg=c(1380, 1580)))
#' }
#' @family Data pre-treatment functions
#' @family dpt modules documentation
#' @export
do_detrend<- function(dataset, src=NULL, trg="src", exportModel=FALSE) {
autoUpS()
checkDeTrendSrcTrgInput(dataset, src, trg) # is assigning trg
# source
if (is.null(src)) {
absSrc <- getNIR(dataset)
wlsSrc <- getWavelengths(dataset)
} else {
ds <- selectWls(dataset, src[1], src[2])
absSrc <- getNIR(ds)
wlsSrc <- getWavelengths(ds)
}
mods <- apply(absSrc, 1, function(x) lm(x ~ wlsSrc)$coefficients) ### calculate the models !! one for every observation ### gives back a matrix with two rows and one column for every observation
# target
if (is.null(trg)) {
nirTrg <- getNIR(dataset)
wlsTrg <- getWavelengths(dataset)
} else {
if (is.numeric(trg)) {
ds <- selectWls(dataset, trg[1], trg[2])
nirTrg <- getNIR(ds)
wlsTrg <- getWavelengths(ds)
} else {
}
}
# NIR <- matrix(NA, nrow(nirTrg), ncol(nirTrg))
# for (i in 1: nrow(nirTrg)) {
# NIR[i,] <- as.numeric(nirTrg[i,]) - mods[1,i] - mods[2,i]*wlsTrg
# }
NIRnew <- t(sapply(1:nrow(nirTrg), function(i) as.numeric(nirTrg[i,]) - mods[1,i] - mods[2,i]*wlsTrg))
exportAdditionalModelToAp2GD(doExport=exportModel, thisMod=NULL, thisType=pv_dptModules[6]) # deTrend
colnames(NIRnew) <- cnsNew <- colnames(nirTrg)
cnsOld <- colnames(dataset$NIR)
indHere <- which(cnsOld %in% cnsNew)
dataset$NIR[, indHere] <- NIRnew
return(dataset)
} # EOF
#' @title Perform Data-Pretreatment Sequence
#' @description Manually perform a sequence of data pre-treatments defined in a
#' string.
#' @details For internal use.
#' @param dataset An object of class 'aquap_data'.
#' @param dptSeq A character holding at least one valid string for data pre-treatment.
#' @param extraModelList A list of possible external models to hand over to the
#' data pre-treatment process.
#' @param silent Logical. If status info should be printed or not.
#' @return The transformed dataset.
#' @export
do_dptSeq <- function(dataset, dptSeq, extraModelList=NULL, silent=TRUE) {
stn <- autoUpS(cfs=FALSE)
stn$allSilent <- silent
return(performDPT_Core(dataset, dptSeq, extraModelList, allExportModels=FALSE))
} # EOF
#' @title Resample data to new Wavelengths
#' @description Resample the data in the provided dataset to new, possibly
#' evenly spaced, wavelengths.
#' @details If nothing is provided for the argument \code{targetWls}, an evenly
#' spaced vector from the lowest to the highest available wavelength
#' is automatically generated as the target wavelengths to be resampled to.
#' @param dataset An object of class 'aquap_data' as produced e.g. by
#' \code{\link{gfd}}.
#' @param targetWls Numeric vector, the target wavelengths. If left at the
#' default \code{NULL}, an evenly spaced vector from the lowest to the
#' highest available wavelength is automatically generated as the target
#' wavelengths to be resampled to.
#' @param tby Numeric length one, the spacing in nm of the automatically
#' generated target wavelength. Only applies if \code{targetWls} is left at
#' \code{NULL}.
#' @param method The resampling method. For details see
#' \code{\link[pracma]{interp1}}. Defaults to 'cubic'; the default can be
#' changed in the settings.r file (parameter \code{gen_resample_method}).
#' @return The resampled dataset.
#' @examples
#' \dontrun{
#' fd <- gfd()
#' fdR <- do_resampleNIR(fd)
#' }
#' @family Data pre-treatment functions
#' @export
do_resampleNIR <- function(dataset, targetWls=NULL, tby=0.5, method=getstn()$gen_resample_method) {
ncpwl <- dataset@ncpwl
charPrevWls <- substr(colnames(dataset$NIR)[1], 1, (ncpwl))
#
x <- getWavelengths(dataset)
if (is.null(targetWls)) {
xNew <- seq(ceiling(x[1]/2) * 2, floor(x[length(x)]/2) * 2, tby)
} else {
xNew <- targetWls
# if (identical(x, xNew)) {
# return(dataset) # To save possibly time. In the merging, it could be that we provide the same target than the present wls.
# } # end if
} # end else
NIR <- t(apply(dataset$NIR, 1, pracma::interp1, x = x, xi = xNew, method = method))
colnames(NIR) <- paste0(charPrevWls, xNew)
dataset$NIR <- NIR
return(dataset)
} # EOF
checkBlowUpInput_grouping <- function(header, grp) {
stn <- getstn()
cns <- colnames(header)
cPref <- stn$p_ClassVarPref
cns <- cns[grep(cPref, cns)]
###
checkClassExistence <- function(X, char, ppv=TRUE) {
if (!all(is.character(X))) {
stop(paste0("Please only provide characters as input for the argument '", char, "'. Please check your input."), call.=FALSE)
}
for (i in 1: length(X)) {
if (!X[i] %in% cns) {
msg1 <- paste0("Sorry, it seems that the class-variable `", X[i], "` as grouping variable (", char, ") does not exist in the provided dataset.")
msg2 <- ""; if (ppv) { msg2 <- paste0("\nPossible values are: '", paste(cns, collapse="', '"), ".") }
stop(paste0(msg1, msg2), call.=FALSE)
}
} # end for i
} # EOIF
checkClassExistence(grp, "grp")
} # EOF
checkTransformBlowUpInput_TnAn <- function(minPart, tn, an) {
if (is.character(tn)) {
if (substr(tn,1,1) != "x") {
stop(paste0("Please provide `tn` in the format `xN`, with N being a positive integer"), call.=FALSE)
}
options(warn=-1)
tn <- round(as.numeric(substr(tn, 2, nchar(tn)))) * minPart # multiply with minPart, as we want to have it xFold
options(warn=0)
if (!is.numeric(tn)) {
stop(paste0("Please provide an integer after the character `x`in the argument `tn`."), call.=FALSE)
}
} else {
if (!is.numeric(tn)) {
stop(paste0("Please provide either an integer or a character in the format `xN`, with N being a positive integer, to the argument `tn`."), call.=FALSE)
}
rn <- round(tn)
} # end else
if (tn < minPart) {tn <- minPart}
#
if (is.character(an)) {
if (substr(an, nchar(an), nchar(an)) != "%") {
stop(paste0("Please provide `an` in the format `N%`, with N being a positive integer"), call.=FALSE)
}
options(warn=-1)
an <- round(as.numeric(substr(an, 1, nchar(an)-1)))
options(warn=0)
if (an > 100) { an <- 100}
an <- round((an * minPart) / 100) # as we want to have a percentage
if (!is.numeric(an)) {
stop(paste0("Please provide an integer before the character `%`in the argument `an`."), call.=FALSE)
}
} else {
if (!is.numeric(an)) {
stop(paste0("Please provide either an integer or a character in the format `N%`, with N being a positive integer, to the argument `an`."), call.=FALSE)
}
an <- round(an)
if (an < 1) {an <- 1}
} # end else
###
return(list(tn=tn, an=an))
} # EOF
lotto_loop <- function(tn, an, size, n=2000) {
lotto <- function(tn, an=3, size=8) {
nums <- lapply(1:tn, function(x) sample(1:size, an, replace = TRUE))
nums <- lapply(nums, sort)
numsChar <- unlist(lapply(nums, function(x) paste(x, collapse="-")))
out <- vector("logical", length(numsChar))
for (i in 1: length(numsChar)){
out[i] <- numsChar[i] %in% numsChar[-i]
}
le <- length(which(out))
perc <- round((le / tn)*100,0)
return(invisible(perc))
} # EOIF
percOut <- vector("numeric", n)
for (i in 1:n) {
percOut[i] <- lotto(tn, an, size)
}
percOut <- round(mean(percOut),1)
# cat(paste0("total average: ", out))
return(invisible(percOut))
}
#' @title Increase the Numbers of Observation in Dataset
#' @description Use random observations of the provided dataset (within a possible
#' grouping), calculate their average and add the resulting spectrum as new
#' observation to the dataset.
#' @details The random observations are sampled with replacement. The provenience
#' of each observation is marked in an additional column in the header named
#' `blowup`, where the character `orig` denotes an original observation, while the
#' character `artif` is marking an artificially generated observation.
#' @param dataset An object of class 'aquap_data' as produced e.g. by
#' \code{\link{gfd}}.
#' @param tn Numeric or character length one. If numeric, \code{tn} is denoting
#' the target-number, i.e. the desired number of observations in the expanded
#' dataset. If a grouping is provided via \code{grp}, the target number is
#' defining the desired number of observations within each subgroup obtained
#' by the grouping. If \code{tn} is a character it hast to be in the format
#' \code{xN}, with \code{N} being a positive integer. In this case it means
#' an N-fold increase of the dataset resp. of each subgroup as defined by the
#' grouping in \code{grp}. Defaults to \code{x10}.
#' @param an Numeric or character length one. If numeric, \code{an} is denoting
#' the "average-number", i.e. the number of observations resp. their spectra that
#' should be drawn (with replacement) from the dataset resp. from the respective
#' subgroup. These drawn samples then are averaged together into a new spectrum.
#' If \code{an} is a character it has to be in the format \code{N\%}, with
#' \code{N} being a positive integer. In this case it denotes the percentage of
#' observations in the dataset resp. in each of the subgroups possibly obtained
#' via \code{grp} that should be drawn (with replacement) and averaged together
#' into a new observation. Defaults to \code{100\%}, what is the recommended value.
#' @param grp Character. One ore more valid class-variable names that should be
#' used to from subgroups within the dataset. Similar to the \code{spl.var}
#' argument in the analysis procedure.
#' @param cst Logical. If consecutive scans should always be kept together.
#' NOT YET IMPLEMENTED - value is ignored, and consec. scans are NOT kept together.
#' @param conf Logical. If numbers should be presented and confirmation requested
#' before commencing the actual calculations.
#' @section Warning: Do take care of a correct grouping, otherwise the inclusion
#' of observations into the same group that do not belong together will destroy
#' any information.
#' @param replace Logical. If the sample drawing should be done with replacement.
#' Recommended value is TRUE.
#' @param pred Logical. If an estimation of the number of identical spectra should
#' be made. Only presented if \code{conf} is TRUE.
#' @return The dataset with increased numbers of observations.
#' @examples
#' \dontrun{
#' fd <- gfd()
#' fdPlus <- do_blowup(fd, tn="x4", an=4)
#' fdPlus <- do_blowup(fd, tn="x8", an="10%", grp=c("C_Foo", "C_Bar"))
#' fdPlus <- do_blowup(fd, tn=1000, an=5)
#' }
#' @family Data pre-treatment functions
#' @export
do_blowup <- function(dataset, grp=NULL, tn="x10", an="100%", cst=TRUE, conf=TRUE, pred=TRUE, replace=TRUE) {
stn <- autoUpS()
cPref <- stn$p_ClassVarPref
yPref <- stn$p_yVarPref
snColSet <- stn$p_sampleNrCol
snCol <- paste0(yPref, snColSet)
txtOrig <- "orig"
txtBlow <- "artif"
colNameBlow <- "blowup"
colOrig <- 1
colBlow <- 2
lottoLoopN <- stn$cl_extDatPred_N_lottoLoop
#
header <- headerFac <- getHeader(dataset) # headerFac only used for grouping
colRep <- getColRep(dataset)
blowupDf <- data.frame(X=rep(txtOrig, nrow(header))) ## add a new column to the header telling us what samples are new and what are old
colnames(blowupDf) <- paste0(cPref, colNameBlow)
header <- cbind(header, blowupDf)
blowCol <- data.frame(X=rep(colOrig, nrow(header)))
colnames(blowCol) <- paste0(cPref, colNameBlow)
colRep <- cbind(colRep, blowCol)
#
# first get out the factors
facInd <- vector("logical", length=ncol(header))
for (i in 1: ncol(header)) {
facInd[i] <- isFac <- is.factor(header[,i])
if (isFac) {
header[,i] <- as.character(header[,i])
}
} # end for i
#
minPart <- nrow(header)
splitList <- list(header)
NIR <- getNIR(dataset)
NIRsplitList <- list(NIR)
colRepSplitList <- list(colRep)
if (!is.null(grp)) {
checkBlowUpInput_grouping(header, grp) # stops if wrong or so
splitList <- split(header, headerFac[,grp]) # returns a list with a dataframe in each element
NIRsplitList <- split(NIR, headerFac[,grp])
colRepSplitList <- split(colRep, headerFac[,grp])
minPart <- min(unlist(lapply(splitList, nrow))) # get the minium number of participants from all the splitelements
}
aa <- checkTransformBlowUpInput_TnAn(minPart, tn, an)
tn <- aa$tn
an <- aa$an
# print(tn); print(an); print(minPart); print("-----")
# now we have the correct target and average number
requiredN <- tn - minPart
if (requiredN == 0) {
message("No dataset expansion has been performed.")
return(dataset)
}
eachNPart <- lapply(splitList, function(x) 1: nrow(x)) # gives a list with a vector from 1:N, with N being the number of participants from each element of the splitList
indList <- newNirList <- newHeaderList <- newColRepList <- vector("list", length=length(splitList))
for (i in 1: length(splitList)) { # now for every element within the splitList (could be only 1) we perform tn iterations of the resampling
###### CORE ######
indList[[i]] <- lapply(1:requiredN, function(x, npa, ana) sample(npa[[i]], ana, replace=replace), npa=eachNPart, ana=an) # gives a list for each element of the splitList
###### CORE ######
newNirList[[i]] <- matrix(NA, nrow=requiredN , ncol=ncol(NIR))
newHeaderList[[i]] <- data.frame(matrix(NA, nrow=requiredN, ncol=ncol(header)))
newColRepList[[i]] <- data.frame(matrix(NA, nrow=requiredN, ncol=ncol(colRep)))
} # end i
if (conf) {
avgDoubleMsg <- ""
if (pred) {
avgDoubles <- lotto_loop(tn=requiredN, an=an, size=minPart, n=lottoLoopN)
avgDoubleMsg <- paste0("The expected percentage of identical spectra is ~", avgDoubles, "%.")
}
cat(paste0(requiredN, " observations will be added to each of ", length(indList), " subgroups (the smallest containing ", minPart, " observations) by drawing from ", an, " observations.\n", avgDoubleMsg, "\n\nPress enter to continue or escape to abort:"))
scan(file = "", n = 1, quiet = TRUE)
}
# future: check here the IndList for doubles !!
# now we have indList containing the header segments, NIRsplitList containing the NIR segments, and the indList that contains the indices to be averaged for each segment
if (!stn$allSilent) {cat(paste0("Calculating (draw from ", an, ") and adding ", requiredN, " new observations to each of ", length(indList), " subgroups:\n"))}
for (i in 1: length(indList)) { # i is the number of the splitSegment as defined by the grouping
for (k in 1: length(indList[[i]])) { # k is the number of spectra we will add to each subgroup (k can be very large) ######(parallelize this, NOT above !!)
inds <- indList[[i]][[k]]
newNirList[[i]][k,] <- apply(NIRsplitList[[i]][inds,], 2, mean) # subselect from the NIR segment and calculate colwise average
newHeaderList[[i]][k,] <- splitList[[i]][inds[1],] # (splitList contains the header segments) just get the first of the obervations to be averaged
newColRepList[[i]][k,] <- colRepSplitList[[i]][inds[1],] # same
} # end for k
if (!stn$allSilent) {cat(".")}
} # end for i
# now we have to get the new elements out of the list into each one object
allNewHeader <- do.call("rbind", newHeaderList)
allNewHeader[,ncol(allNewHeader)] <- rep(txtBlow, nrow(allNewHeader))
colnames(allNewHeader) <- colnames(header)
allNewColRep <- do.call("rbind", newColRepList)
allNewColRep[,ncol(allNewColRep)] <- rep(colBlow, nrow(allNewColRep))
colnames(allNewColRep) <- colnames(colRep)
allNewNir <- do.call("rbind", newNirList)
colnames(allNewNir) <- colnames(NIR)
# (? maybe a problem with rownames?)
# now fuse together with the original data
tsi <- which(colnames(header) == "Timestamp")
if (length(tsi) != 0) {
allNewHeader[,tsi] <- as.POSIXct(allNewHeader[,tsi], origin="1970-01-01")
}
header <- rbind(header, allNewHeader)
snrs <- range(header[,snCol])
header[,snCol] <- seq(min(snrs), max(snrs), length.out=nrow(header))
colRep <- rbind(colRep, allNewColRep)
NIR <- as.matrix(rbind(NIR, allNewNir))
for (i in 1: ncol(header)) {
if (facInd[i]) {
header[,i] <- as.factor(header[,i]) # re-factorize if necessary
}
} # end for i
out <- new("aquap_data", data.frame(I(header), I(colRep), I(NIR)), metadata=dataset@metadata, anproc=dataset@anproc, ncpwl=dataset@ncpwl, version=dataset@version)
if (!stn$allSilent) {cat(" ok.\n")}
return(out)
## XXX still required:
# include consec. scans together
} # EOF
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.