R/filter4MinBatchsize.R

Defines functions filter4MinBatchsize

Documented in filter4MinBatchsize

#' @title Filter samples of a HT12prepro object for min batch size n>1
#'
#' @description This functions filters for min Batch size n>1  as this is a limitation for following removal of batch effects in the following step applying function removeBatchEffects()
#'
#'

#' @param ht12object A list object of class HT12prepro created with function filterTechnicallyFailed()
#' @param paramfile Path to the file specifying parameters
#' @param second_combat_withvar A second round of removeBatchEffects() can be intended. In this case, batches from the first round checking Sentrix.Barcode batches as well as batches from this second round specified as a column in ht12object$chipsamples named according to this variable must be checked for minimum batch size n>1.If "", no 2nd combat will be done. If "from_paramfile", than the parameter will be read from the paramfile with the location of this file given in parameter paramfile.
#'
#' @return A list object of class HT12prepro where the  slot with  sample-related attributes of the current processing-stage named `$chipsamples` is update. Excluded individual are characterized by column in_study ==F and reason4exclusion
#' @import data.table
#' @export

## debug
# paramfile = "/mnt/ifs1_projekte/genstat/02_projekte/1704_boettcher_ge_ht12/01_prepro/input_parameter_007.txt"
# ht12object =  prepro_ht12
# second_combat_withvar= "from_paramfile"

filter4MinBatchsize = function(ht12object,paramfile = NULL, second_combat_withvar= "from_paramfile") {

### strings are imported as strings and not as factors
  options(stringsAsFactors=FALSE)
myparameters = match.call()
showVennplots = F

# status checken
historie =  ht12object$history$calls
# if(any(grepl("filterTechnicallyFailed", historie))==F) stop("Function 'filterTechnicallyFailed()' has to be run before!")
if(any(grepl("filterLowExpressed", historie))==F) stop("Function 'filterLowExpressed()' has to be run before!")



#laden parameter
if(is.null(paramfile)==F) param <- data.frame(data.table::fread(paramfile))

ht12object$chipsamples$Sentrix.Barcode = as.character(ht12object$chipsamples$Sentrix.Barcode) ## added 22.8.18 , da int64 nicht  wie eine numerische Variable mit einem character verglichen werden kann, sollte aber bereits character sein, sicherheitsmassnahme
sample_overview_l7pre <-ht12object$chipsamples
mytable(sample_overview_l7pre$in_study)
sample_overview_l7preinstudy <- sample_overview_l7pre[ sample_overview_l7pre$in_study, ]
dim(sample_overview_l7pre)
dim(sample_overview_l7preinstudy)
table(table(sample_overview_l7preinstudy$new_ID))
if(length(table(table(sample_overview_l7preinstudy$new_ID))) != 1)
  stop("IDs (column new_ID) must be unique....stopping...")


#laden expressionsets
total_nobkgd_eset_ql = ht12object$total_nobkgd_eset_ql

## ----goodind-------------------------------------------------------------
# auszahlen aktualisiere barcode
table(table(sample_overview_l7pre$Sentrix.Barcode))
# table(sample_overview_l7pre[grep("mis", sample_overview_l7pre$Sentrix.Barcode), 5])
Biobase::pData(total_nobkgd_eset_ql)$hybridisierungchipserialnumber <- sample_overview_l7pre[match_hk(Biobase::pData(total_nobkgd_eset_ql)$sampleID,
                                                                                                      sample_overview_l7pre$new_ID), "Sentrix.Barcode" ]
check <- showNA(Biobase::pData(total_nobkgd_eset_ql))$NAs
if(sum(check) != 0)
  stop("not all smples appear to ahave a Sentrix ID!")
ht(Biobase::pData(total_nobkgd_eset_ql), 1)

# gute individuen
goodind  <- sample_overview_l7preinstudy$new_ID
# str(goodind)
total_nobkgd_eset_ql
total_nobkgd_eset_ql <- total_nobkgd_eset_ql[,goodind]
total_nobkgd_eset_ql

## ----criteria2-----------------------------------------------------------
message("Filtering for batch size > 1 for Sentrix IDs...")



tabled <- table(Biobase::pData(total_nobkgd_eset_ql)$hybridisierungchipserialnumber)
table(tabled)
singlbarcoders_chips <- names(tabled[tabled == 1])
singlbarcoders_chips
singlbarcoders_ind <- sample_overview_l7preinstudy[sample_overview_l7preinstudy$Sentrix.Barcode %in% singlbarcoders_chips, "new_ID"]
singlbarcoders_ind
# str(goodind)
goodind <- setdiff(goodind, singlbarcoders_ind)
# str(goodind)
total_nobkgd_eset_ql <- total_nobkgd_eset_ql[,goodind]
total_nobkgd_eset_ql




## ----combat8b------------------------------------------------------------
head(Biobase::pData(total_nobkgd_eset_ql))
Biobase::pData(total_nobkgd_eset_ql)$subgroup <- sample_overview_l7pre[ match_hk(Biobase::pData(total_nobkgd_eset_ql)$sampleID,
                                                                                 sample_overview_l7pre$new_ID), "subgroup"]
showNA(Biobase::pData(total_nobkgd_eset_ql))
singularcheck <- data.table(Biobase::pData(total_nobkgd_eset_ql))
singularcheck2 <- singularcheck[,.N, by = list(hybridisierungchipserialnumber, subgroup) ]
singularcheck3 <- singularcheck2[allDuplicatedEntries(hybridisierungchipserialnumber)]
singularcheck3



if(second_combat_withvar== "from_paramfile") second_combat_withvar_used <- getParam2("second_combat_withvar", myparam = param) else second_combat_withvar_used = second_combat_withvar
second_combat_withvar_used

if(second_combat_withvar_used != "") {
  message("Filtering for batch size > 1 as adjusting genexpression data in a second combat round for `",
          second_combat_withvar_used, "` is planned...")

  # Add data
  Biobase::pData(total_nobkgd_eset_ql_combat)[,second_combat_withvar_used] <-  sample_overview_l7pre[match_hk(Biobase::pData(total_nobkgd_eset_ql_combat)$sampleID,
                                                                                                              sample_overview_l7pre$new_ID),
                                                                                                     second_combat_withvar_used]
  check <- showNA(Biobase::pData(total_nobkgd_eset_ql_combat))
  if(sum(check) != 0)
    stop("Not all individuals have a Sentrix ID!")
  ht(Biobase::pData(total_nobkgd_eset_ql_combat), 1)

  # Exclude single barcoders
  tabled <- table(Biobase::pData(total_nobkgd_eset_ql_combat)[, second_combat_withvar_used])
  table(tabled)
  newsinglbarcoders <- names(tabled[tabled == 1])
  newsinglbarcoders
  newsinglbarcoders_ind <- sample_overview_l7preinstudy[sample_overview_l7preinstudy$Sentrix.Barcode %in% newsinglbarcoders, "new_ID"]
  newsinglbarcoders_ind
  # str(goodind)
  goodind <- setdiff(goodind, newsinglbarcoders_ind)
  # str(goodind)
}

# individuenattrib
sample_overview_l7pre[sample_overview_l7pre$new_ID %in% singlbarcoders_ind,"in_study"] = F
sample_overview_l7pre[sample_overview_l7pre$new_ID %in% singlbarcoders_ind,"reason4exclusion"] <- "Filtering as Batch/effect correction via ComBat not possible - only 1 Ind. within a batch  found"
mytable(sample_overview_l7pre$reason4exclusion)
mytable(sample_overview_l7pre$in_study)
ht12object$chipsamples =  sample_overview_l7pre

all_removed_singlbarcoders = unique(na.omit(singlbarcoders_ind)) # fix 22.8.18
message("Removed ", length(all_removed_singlbarcoders), " samples where only a single sample was found in at least one batch. (Removed IDs: ", paste(all_removed_singlbarcoders, collapse = ", "), ")")

fordoku =c(
  "singlbarcoders_ind",
  "singlbarcoders_chips",
  "sample_overview_l7pre"
)




stopifnot(sum(duplicated(fordoku))==0)


ht12object$dokuobjects_filter4MinBatchsize = lapply(fordoku, function(x) get(x))


names(ht12object$dokuobjects_filter4MinBatchsize) = fordoku


ht12object$history = rbind(ht12object$history, data.frame(calls = paste(Sys.time(), deparse(myparameters))))
ht12object$history
ht12object


}
holgerman/HT12ProcessoR documentation built on June 5, 2021, 9:18 a.m.