#' @title Check Illumina HT12v4 expression data
#' @description This function checks the data and extracts the sample-IDs including information from Illuminas sample-files. See vignette for an example.
#' @param paramfile Path to the file specifying preprocessing parameters, can be NULL
#' @param file_names_of_files_and_folders Path to a tab-delimeted file with information of files created in Illuminas Genome Studio. Following columns are required: `fileset_id`: a short ID for a fileset, `con_f`: full path and filename of the control probe data file, `nobkgd_f`: full path and filename of the gene probe data file, `sample_f`: full path and filename of the control probe data file. If NULL, information is used from parameterfile specified in 'paramfile', Default: 'from_paramfile'
#' @param colseparator Separator splitting numbers into units of 1000. Typically "." or ",". Default: 'from_paramfile'
#' @param renameDublettes If any sample Ids is found more than once in the sample file provided by Illumina, suffixes .1 .2 etc are added to the newly created new_ID used to designate samples within the preprocessing. If FALSE, the function will stop when duplicated Sample Ids are found. Default: 'from_paramfile'
#' @param prefix_new_samplename Prefix of new_ID, the new R-compatibel sample names used to designate samples within the preprocessing. Default: 'from_paramfile'
#' @return a list of class `HT12prepro` including a slot with sample-related attributes of the current processing-stage named `$chipsamples`
#' @importFrom plyr rename
#' @importFrom stringr str_locate str_length str_sub str_split str_trim str_replace_all str_replace
#' @import data.table
#' @export
# debug
# paramfile = myparamfile
# file_names_of_files_and_folders = "from_paramfile"; colseparator= "from_paramfile"; renameDublettes= "from_paramfile"; prefix_new_samplename= "from_paramfile"
checkExtractChipsamples = function(paramfile = NULL, file_names_of_files_and_folders = "from_paramfile", colseparator= "from_paramfile", renameDublettes= "from_paramfile",prefix_new_samplename= "from_paramfile") {
### strings are imported as strings and not as factors
options(stringsAsFactors=FALSE)
myparameters = match.call()
## ----addfunct------------------------------------------------------------
showVennplots = F
getIDTag <- function (file_con, endstr, id_tag_at_theend = T, general_prefix) {
#3/8/2012
# to get ID tag from Knut filenames
# endstr = string_controlfiles
nummeri = ifelse(id_tag_at_theend, 2, 1)
ende = stringr::str_locate(string=file_con, pattern=endstr)[,nummeri]
laenge = stringr::str_length(file_con)
if(id_tag_at_theend) file_con <- stringr::str_sub(string=file_con, ende+2, laenge)
if(id_tag_at_theend ==F) file_con <- stringr::str_sub(string=file_con, stringr::str_locate(string=file_con, pattern=general_prefix)[,1], ende-2)
file_con
}
getIDsFromKnut <- function(x) {
#20/7/2012
#funktion, die die IDs der PRoben aus den Dateien HT12v4 sausliest
#Diese funktioniert nur f?r nobkgd und bkgd und control files
#
# 24.6.14 umgestellt, das suffix statt punkt der ID erkenner ist, damit punkt in der ID auch erfasst werden kann
# 7.10.14 angepasst
# x = "20140626-PROGRESS I Wdhl 115_Sample Probe Profile_FinalReport.txt"
# x = j
test <- readLines(x, n=1)
test = ifelse(test == "[Header]", readLines(x, n=9)[9], test)
test <- unlist(stringr::str_split(test, "\t"))
idstring = "\\.AVG|\\.Detection|\\.BEAD|\\.Avg|\\.NARRAYS|\\.ARRAY_STDEV"
test <- (grep(idstring, test, value=T))
test <- sapply(stringr::str_split(test, idstring), "[",1)
table(test)
#checken, dass alle eintr?ge f?r proben gleich h?ufig da sind, e.g. immer nbead, standardabweichung etc
if(length(table(table(grep("[0-9]", test, value=T))))!=1) stop("Fuer manche als Zahlen kodierte Eintraege nicht alle Spaltenwerte gegeben!")
if(length(table(table(test)))!=1) stop("Fuer manche als automatisch mit punkt kodierte Eintraege nicht alle Spaltenwerte gegeben!")
samples_incl <- unique(test)
as.character(samples_incl)
}
getIDsFromKnutSamplefiles <- function(x,file_overview=file_overview) {
#20/7/2012
#funktion, die die IDs der PRoben aus den Dateien HT12v4 ausliest
#Diese funktioniert nur fuer files derklasse sampleinfo
# x = na.exclude(unlist(file_overview[,c("sample_f")]))[1]
# x = i
fn = grep(x, file_overview$sample_f, value = T, fixed = T)
vortest <- readLines( fn, n=1)
lines2skip = ifelse(vortest == "[Header]", 8, 0)
test <- read.delim( fn, skip = lines2skip, as.is = T)
samples_incl <- test$Sample.ID
as.character(samples_incl)
}
getAllInfosFromKnutSamplefiles3 <- function(x, docheck=T, mydec = colseparator_used) { # x="r:/genstat/01_daten/1603_eqtl_fettgewebe_kinder/ge_daten/Illumina_HT12_FG_Kinder_Gesamtgewebe/all_in_1folder/SamplesTable_premiRNA.txt"
# x = i
#20/7/2012 28.5.13 ergaenyt, dass nichtvorhandene spalten aus my_columns ergaenzt und aufgefuellt mit NA werden
#funktion, die die PRobenattribute aus den Dateien HT12v4 sausliest
my_columns <- c(c("Index", "Sample.ID", "Sample_ID", "Pool_ID",
"Sample_Well", "Sample_Plate", "Sample.Group",
"Sentrix.Barcode" , "Sample.Section","Detected.Genes..0.01.",
"Detected.Genes..0.05.", "Signal.Average", "Signal.P05",
"Signal.P25", "Signal.P50", "Signal.P75", "Signal.P95",
"BIOTIN", "CY3_HYB", "HOUSEKEEPING", "LABELING",
"LOW_STRINGENCY_HYB", "NEGATIVE..background.", "Noise"))
#testen, bei erstellen der formel
# names(test)[(!names(test) %in% my_columns )]
# my_columns[!my_columns %in% names(test)]
filename <- x
vortest <- readLines(filename, n=1)
lines2skip = ifelse(vortest == "[Header]", 8, 0)
all_info <- read.delim(filename, skip = lines2skip, dec = mydec, as.is = T
)
# ergaenzen
missingcol = setdiff(my_columns, names(all_info))
if(length(missingcol) >0) message("\nin file", filename, " missing cols are: ", missingcol )
for(mi in missingcol){
all_info[mi] = NA
}
all_info = all_info[,my_columns]
if(docheck==T & identical(all_info$Sample_ID, all_info$Sample.ID)==F) {
if(all(is.na(all_info$Sample_ID)==T)) message (paste("keine eintraege in spalte Sample_ID zum vergleichen mit Spalte Sample ID im info file von file ", x))
if(all(identical(all_info$Sample.ID, stringr::str_trim(all_info$Sample.ID) )==F)) warning("Leading/tailing spaces in 'Sample.ID' field found, ignoring them in consistency check")
# if(all(is.na(all_info$Sample_ID)!=T)) stop (paste("spalte Sample_ID und
#Sample ID matcht nicht im Sample-info file im file ", x))
}
if(docheck==F & identical(all_info$Sample_ID, stringr::str_trim(all_info$Sample.ID)==F)) {
if(unique(all_info$Sample_ID) =="") message (paste("keine eintraege in spalte Sample_ID zum vergleichen mit Spalte Sample ID im info file von file ", x))
if(unique(all_info$Sample_ID) !="") message (paste("spalte Sample_ID und Sample ID matcht nicht im Sample-info file im file ", x))
}
# venndiagram(x=all_info$Sample_ID, y=all_info$Sample.ID, unique=T, type="2")
all_info$sample_f <- x
all_info
}
checkKnutsBkgd <- function(filename_1,filename_2){
#20/7/2012
##Anschauen, dass sich die Mittelwerte der Backgroundkorrigierten Proben auch unterscheiden von den nicht-backgroundkorrigierten Files
##dabei einlesen und Auswerten der ersten 100 Zeilen
x=filename_1
file1 <- readLines(x, n=101)
file1 <- (stringr::str_split(file1, "\t"))
file1 <- data.frame(file1,stringsAsFactors=F)
file1 <- data.frame(t(file1),stringsAsFactors=F)
names(file1) <- as.character(file1[1,])
file1 <- file1[-1,]
y=filename_2
file2 <- readLines(y, n=101)
file2 <- (stringr::str_split(file2, "\t"))
file2 <- data.frame(file2,stringsAsFactors=F)
file2 <- data.frame(t(file2),stringsAsFactors=F)
names(file2) <- as.character(file2[1,])
file2 <- file2[-1,]
file1 <- file1[file1$PROBE_ID %in% intersect(file1$PROBE_ID,file2$PROBE_ID), ]
file2 <- file2[file2$PROBE_ID %in% intersect(file1$PROBE_ID,file2$PROBE_ID), ]
if(dim(file1)[1]<50) stop(paste(dim(file1)[1],"Vergleiche sind zu wenig vergleiche (Bedingung: mind 50 g?ltige von 100 potentiellen Vergleichen"))
# print(paste(x,y,dim(file1)[1],identical(file1,file2)))
return(identical(file1,file2))
}
## ----load----------------------------------------------------------------
#laden parameter
if(is.null(paramfile)==F) param <- data.frame(data.table::fread(paramfile))
# str(param)
#beispiel aller ilmn Genexpresssionsids eines vollst. datensatzes aus "r:/genstat/01_daten/1111_ht12v4_lifeb3_hss/Einzelanalyse_nonorm_nobgkd_HSS38-AMI457.txt"
laengeilmn = dim(allilmn)[1]
if(laengeilmn != 47323) stop("47323 ilmn-IDs not correctly loaded")
# ilmn controllannotation
## ----read----------------------------------------------------------------
if(file_names_of_files_and_folders== "from_paramfile") file_names_of_files_and_folders_used = getParam2("file_names_of_files_and_folders", myparam = param) else file_names_of_files_and_folders_used = file_names_of_files_and_folders
message("Using files as input found in ", file_names_of_files_and_folders_used)
file_overview = read.delim(file_names_of_files_and_folders_used, as.is = T)
showClassDF(file_overview)
## ----checkvol------------------------------------------------------------
checkvollstaendig <- file_overview[is.na(file_overview$sample)|is.na(file_overview$nobkgd)|is.na(file_overview$con),]
checkvollstaendig #keine NA filenames
if(dim(checkvollstaendig)[1] != 0) stop("Problem found: Incomplete filesets - three files of type 'nobkgd', 'con' and 'sample' NOT allways found).\n")
status01_allelfilesda = "Allways complete filesets identified, i.e. three files of type 'nobkgd', 'con' and 'sample', no problem identified.\n";
message(status01_allelfilesda)
## ----FiletattributeBaue--------------------------------------------------
# Informationen aus bkgd, nobkgd, control auslesen
sample_overview <- c()
file_types <- c("nobkgd_f", "con_f")
for (i in 1:length(file_types)){
my_filetype <- file_types[i]
for (j in na.omit(unlist(file_overview[,my_filetype]))) {
# j = na.omit(unlist(file_overview[,my_filetype]))[1]
j = stringr::str_replace_all(string=j, pattern='\\\\ ', replacement=" ") # workaround fuer utf-8 kodierung falls leerzeichen auftauchen
message('reading ', j)
# j = "SamplesTable_premiRNA.txt"
samples <- (getIDsFromKnut(j))
if(class(samples)=="try-error") samples = NA
filenames <- j
file_class <- i
sample_overview_neu <- data.frame(samples,filenames, file_class, stringsAsFactors = F)
sample_overview <- rbind(sample_overview, sample_overview_neu)
}
}
sample_overview
showClassDF(sample_overview)
## ----addsample-----------------------------------------------------------
next_fileclass <- max(sample_overview$file_class)+1
for (i in na.exclude(unlist(file_overview[,c("sample_f")]))){
# i = na.exclude(unlist(file_overview[,c("sample_f")]))[1]
# i = "SamplesTable_premiRNA.txt"
message("reading ", i)
samples <- getIDsFromKnutSamplefiles(i, file_overview = file_overview)
filenames <- i
file_class <- next_fileclass
# print(paste(samples,filenames, file_class))
sample_overview_neu <- data.frame(samples,filenames, file_class)
sample_overview <- rbind(sample_overview, sample_overview_neu)
}
# cleanup
rm(sample_overview_neu)
## ----ergi----------------------------------------------------------------
dim(sample_overview)
## ----checi11-------------------------------------------------------------
checkalle = showNA(sample_overview)
checkalle
if(sum(checkalle$NAs) != 0) {
status01b_allelPersoneninfosda = "\nProblem found: Some sample IDs not present in all three files of type 'nobkgd', 'con' and 'sample'\n"
stop(status01b_allelPersoneninfosda)
} else {status01b_allelPersoneninfosda = "\nAll sample IDs present in all three files of type 'nobkgd', 'con' and 'sample', no problem found.\n"; message(status01b_allelPersoneninfosda)}
## ----iterncode-----------------------------------------------------------
newprefix <- param[param$variable =="prefix_new_samplename", "value"]
if(stringr::str_length(newprefix) < 1) stop("please provide `prefix_new_samplename` with a string length of at least 1")
newprefix
id_table <- data.frame(oriname=unique(sample_overview$samples))
id_table$genstatid <- paste0(newprefix, seq(along=id_table$oriname))
sample_overview$newID <- id_table[match(sample_overview$samples, id_table$oriname), "genstatid"]
ht(sample_overview)
## ----unique--------------------------------------------------------------
ht(sample_overview)
# Check ob proben doppelt vorkommen--> die wuerden bei reshape unter den Tisch fallen
# Hier muss man noch einen rename-wrapper bauen
check <-table(sample_overview$samples) #[sample_overview$file_classsamples)
table(check)
#welche betrifft das
mehrfach_probes <- table(sample_overview$samples)[as.numeric(check) > 3]
mehrfach_probes
status02_mehrfachIDs = ifelse(length(mehrfach_probes) ==0, "No duplicated IDs across files detected, no problem found.\n", "Problem found and resolved: Duplicated IDs across files detected, treated according parameter renameDublettes.\n")
status02_mehrfachIDs
ht(sample_overview[ sample_overview$samples %in% names(mehrfach_probes),])
sample_overview[ sample_overview$samples %in% names(mehrfach_probes)[1],]
if(any(as.numeric(names(table(check))) < 3)) stop("Probleme mit ID import - checke Skript code sdlfjaklsdff")
if(any(as.numeric(names(table(check)))/3 != round(as.numeric(names(table(check)))/3,0))) stop("Probleme mit ID import - checke Skript code sdlfjaklsdff2")
## ----doublette-----------------------------------------------------------
sample_overview$samplename_count <- check[match(sample_overview$samples, names(check))]/3
if(max(as.numeric(names(table(check))) > 3)) {
message(paste0("CAVE - Proben mit Nummer(n): \n", names(mehrfach_probes),
"\n kommt/kommen doppelt vor, modifiziere code fuerr diesen Fall"))
message("\n\n")
message(paste0("betrifft \n", length(names(mehrfach_probes)), " von ", length(unique(sample_overview$samples)), " Proben"))
message("\n\n")
# print(sample_overview[ sample_overview$samples %in% names(mehrfach_probes),])
if(renameDublettes== "from_paramfile") param2 = getParam2("renameDublettes", myparam = param) else param2 = renameDublettes
if(param2 == "F" | param2 == "FALSE") stop("Found samples with duplicated name. As parameter 'renameDublettes'==F , stopping here. Consider setting parameter renameDublettes to TRUE)")
if(param2 == "T" | param2 == "TRUE") {
if(any(ceiling(sample_overview$samplename_count) != sample_overview$samplename_count)) stop("manche proben kommen nicht in allen files nobkgd, con und sample vor")
sample_overview <- sample_overview[order(sample_overview$samples, sample_overview$file_class),]
to_rename <- unique(sample_overview[ sample_overview$samplename_count>1 , "samples"])
for(i in to_rename ){ # i="4556239"
message("Working on ", i)
max_i <- unique(sample_overview[ sample_overview$samplename_count>1 , "samplename_count"])
index <- 1
for(j in 1:max_i){ #j=1
sample_overview[sample_overview$samples == i , "samplename_count"][c(j, j+max_i, j+max_i*2)] <- index
index <- index+1
}
}
stopifnot(all(is.na(sample_overview$samplename_count))==F)
check <- table(table(paste(sample_overview$samples, sample_overview$samplename_count)))
stopifnot(identical(names(check), "3"))
#suffix .1 .2 anfuegen
sample_overview[sample_overview$samples %in% to_rename, "newID"] <- paste0(
sample_overview[sample_overview$samples %in% to_rename, "newID"], ".", sample_overview[sample_overview$samples %in% to_rename, "samplename_count"])
message("\n Renamed samples: \n")
message(sample_overview[ sample_overview$samples %in% to_rename,])
}
else stop("Only TRUE or FALSE is a valid entry in parameter renameDublettes")
}
ht(id_table)
showNA(id_table)
## ----wide----------------------------------------------------------------
showNA(sample_overview)
sample_overview_l <- reshape(sample_overview, idvar= c("samples", "newID", "samplename_count"), timevar="file_class",direction="wide")
sample_overview_l$samplename_count <- NULL
head(sample_overview_l)
showNA(sample_overview_l)
sample_overview_l = plyr::rename(sample_overview_l, c(samples = "old_ID", newID = "samples", filenames.1 ="nobkgd_f",filenames.2 ="con_f", filenames.3="sample_f"))
ht(sample_overview_l,2)
check <- table(table(sample_overview_l$samples))
check
if(identical(names(check), "1")==F) stop("code 12341.2 proben kommen immer noch doppelt vor, kann nicht sein")
showNA(sample_overview_l)
##Liste mit weiteren Probenannotationen hinzuf?gen
#unique identifyer bauen
sample_overview_l$fileset_id <- file_overview[ match_hk(sample_overview_l$sample_f, file_overview$sample_f),"fileset_id"]
## ----umbennennen---------------------------------------------------------
ht(sample_overview_l[ grep("_neu", sample_overview_l$fileset_id),])
sample_overview_l$fileset_id = stringr::str_replace( sample_overview_l$fileset_id, "_neu","")
ht(sample_overview_l,3)
## ----weiter4-------------------------------------------------------------
sample_overview_l$paste_id <- with(sample_overview_l, paste(old_ID, fileset_id, sep="."))
#weitere annotationen hinzuf?gen aus sample - files
#Da manche Dateien nicht alle Attribute enthalten,werden diese getrennt ausgelesen
#proben mit allen Attributen
#proben mit allen Spalten
head(sample_overview_l)
showNA(sample_overview_l)
all_infofiles <- unique(sample_overview_l$sample_f)
all_infofiles <- all_infofiles[!all_infofiles %in% grep("/NA$",all_infofiles, value=T )] #hier wird de facto nichts gefiltert
getwd()
if(colseparator== "from_paramfile") colseparator_used = getParam2("colseparator", myparam = param) else colseparator_used = colseparator
all_sample_infos <- c()
for (i in setdiff(all_infofiles,c(""))){
# message(paste("processing...",i))
try(sample_infos <- getAllInfosFromKnutSamplefiles3(i))
try(all_sample_infos <- rbind(all_sample_infos, sample_infos))
}
showNA(all_sample_infos)
unique(all_sample_infos$sample_f)
dim(all_sample_infos)
ht(all_sample_infos)
rm(sample_infos)
## ------------------------------------------------------------------------
showNA(all_sample_infos)
na_sentrix_filter = is.na(all_sample_infos$Sentrix.Barcode)
if(any(na_sentrix_filter)) {
all_sample_infos$sentr_help = sapply(stringr::str_split(all_sample_infos$Sample.ID, "_"), "[", 1)
all_sample_infos$section_help = sapply(stringr::str_split(all_sample_infos$Sample.ID, "_"), "[", 2)
all_sample_infos[ na_sentrix_filter, 'Sentrix.Barcode'] = all_sample_infos[ na_sentrix_filter, 'sentr_help']
all_sample_infos[ na_sentrix_filter, 'Sample.Section'] = all_sample_infos[ na_sentrix_filter, 'section_help']
showNA(all_sample_infos)
}
## ----proceeding----------------------------------------------------------
#Was steht in den manchmal fehlenden Attributen drinne, wenn die da sind?
sapply(all_sample_infos[,c("Pool_ID", "Sample_Well", "Sample_Plate")], function (x) table(x, useNA = "always"))
if(identical(unique(unlist(all_sample_infos[,c("Pool_ID", "Sample_Well", "Sample_Plate")])), NA) == F) stop("code 39867632 unerwartete eintr?ge,wo NA erwartet")
all_sample_infos = all_sample_infos[ ,names(all_sample_infos) %nin% c("Pool_ID", "Sample_Well", "Sample_Plate")]
###Filter: ist der SEntrix ID richtig angegeben
bad_sentrix <- unique(all_sample_infos$Sentrix.Barcode)
bad_sentrix
# str(bad_sentrix)
check <- table(stringr::str_length(bad_sentrix))
# noch NA checken, wurde aber ausch schon in ExtraktionsFunktion gemacht
sm3 = all_sample_infos[ is.na(all_sample_infos$Sample_ID),]
ht(sm3,2)
qlist33 = venn2(all_sample_infos$Sample.ID, all_sample_infos$Sample_ID, plotte = showVennplots)
#identifizierung, wo der chip-identifyer falsch ist
bad_sentrix_file <- unique(all_sample_infos[stringr::str_length(all_sample_infos$Sentrix.Barcode) %nin% c(10, 11),"sample_f"])
bad_sentrix_file
sm_badsentrix = all_sample_infos[stringr::str_length(all_sample_infos$Sentrix.Barcode) %nin% c(10, 11),]
sm_badsentrix
# ## ersetze haendisch sentrix id im glauben dass reihenfolge passt
# all_sample_infos[ all_sample_infos$sample_f == bad_sentrix_file,"Sentrix.Barcode"] = c(rep("misSentri1", 12), rep("misSentri2", 12), rep("misSentri3", 12), rep("misSentri3", 12))
# bad_sentrix_file <- unique(all_sample_infos[stringr::str_length(all_sample_infos$Sentrix.Barcode) != 10,"sample_f"])
# bad_sentrix_file
if(length(bad_sentrix_file) != 0) status03_badSentrix = paste0("Problem found: no 10 or 11-digit Sentrix ID found in file(s): \n", bad_sentrix_file) else status03_badSentrix ="Sentrix ID allways found with 10 or 11 digits, no problem identified."
if(length(bad_sentrix_file) != 0) warning(status03_badSentrix) else message(status03_badSentrix)
## ----ProcessInfosEtBAckroundcheck----------------------------------------
#zusammenmergen der infos zu den Dateien und zu den probenattributen
dim(sample_overview_l)
dim(all_sample_infos)
head(sample_overview_l)
head(all_sample_infos)
showNA(sample_overview_l)
#bauen eines identifyers aus filename und evtl. doppelten
matcher = unique(sample_overview_l[,c('sample_f',"fileset_id")])
all_sample_infos$fileset_id <- matcher[match_hk(all_sample_infos$sample_f,
matcher$sample_f),"fileset_id"]
all_sample_infos$paste_id <- paste(all_sample_infos$Sample.ID, all_sample_infos$fileset_id, sep=".")
sample_overview_l <- merge(sample_overview_l, all_sample_infos[,!names(all_sample_infos) %in% c("sample_f","fileset_id")], by="paste_id", sort=F, all.x=T, incomparables=c(NA, NaN))
ht(sample_overview_l)
rm(all_sample_infos)
showNA(sample_overview)
#auf inkorrekte nomenklatur pr?fen
allnames <- names(sample_overview_l)
check <- allnames[grep("bgkd", allnames)]
if(length(check) != 0) stop("darf nicht sein code laklafklafljfalj")
## ----ShowExpressedGenes--------------------------------------------------
# exprimierte Gene visualisieren
par(mfrow=c(1,1))
titele = "Detected HT12-Probes pval > 0.05"
# hist(x=sample_overview_l$Detected.Genes..0.05., breaks=200, col="red", main = titele)
# boxplot(sample_overview_l$Detected.Genes..0.05., col="red", main=titele)
## ----allelIlmnDA---------------------------------------------------------
# pruefen, ob alle ilmn imputiert worden
# str(allilmn)
for(i in na.omit(unique(sample_overview_l$nobkgd_f))){
# i = na.omit(unique(sample_overview_l$nobkgd_f))[1]
message("\ncheck following file for presence of 47,323 ilmn-probes: ", i, "\n")
pfad <- i
fileset_id <- unique(na.omit(sample_overview_l[ sample_overview_l$nobkgd_f ==i, "fileset_id"]))
ilmn_read <- fread(pfad, select = "PROBE_ID")
ilmn_read = ilmn_read$PROBE_ID
# print(sum(ilmn_read %in% allilmn$ilmn))
allilmn$neu <- allilmn$ilmn %in% ilmn_read
names(allilmn)[names(allilmn)=="neu"] <- fileset_id
}
# str(allilmn)
hh(allilmn,2)
uebersicht_ilmns <- sapply(allilmn[,2:ncol(allilmn), drop = F], function(x) sum(x))
checkalleilmndrinn <- table(uebersicht_ilmns == laengeilmn, useNA="ifany")
checkalleilmndrinn
uebersicht_ilmns[ uebersicht_ilmns != laengeilmn]
missing_ilmns <- sapply(allilmn[,2:ncol(allilmn), drop = F], function(x) paste(allilmn$ilmn[x ==F], collapse = ", "))
missing_ilmns = missing_ilmns[ stringr::str_length(missing_ilmns) !=0]
missing_ilmns = data.frame(missing_ilmns=missing_ilmns, fileset_id = names(missing_ilmns))
all(missing_ilmns$fileset_id %in% sample_overview_l$fileset_id)
missing_ilmns$nobkgd_f = file_overview[ match_hk(missing_ilmns$fileset_id, file_overview$fileset_id), "nobkgd_f"]
missing_ilmns$n_ilmn = uebersicht_ilmns[ match_hk(missing_ilmns$fileset_id, names(uebersicht_ilmns))]
missing_ilmns
# nachschauen, wass das ist
missing_ilmns_IDs = stringr::str_trim(unlist(stringr::str_split(missing_ilmns$missing_ilmns, ", ")))
relcols = c("ilmn", "nuid", "Source", "ILMN_Gene",
"Entrez_Gene_ID", "Accession", "Symbol",
"Probe_Type", "Cytoband", "Definition",
"Synonyms", "is_control", "EntrezReannotated",
"ProbeQuality", "ProbeSequence", "SymbolReannotated",
"SymbolReannotated_orgHsEg", "GenomicLocation_chr",
"GenomicLocation_start", "GenomicLocation_ende",
"badprobe_dunning")
if(length(missing_ilmns_IDs)>0) missingilmns_det = ilmnAnnot014allgInfos[ paste0("ILMN_", ilmnAnnot014allgInfos$ilmn) %in% missing_ilmns_IDs,relcols] else missingilmns_det = structure(list(ilmn = integer(0), nuid = character(0), Source = character(0), ILMN_Gene = character(0), Entrez_Gene_ID = integer(0), Accession = character(0), Symbol = character(0), Probe_Type = character(0), Cytoband = character(0), Definition = character(0), Synonyms = character(0), is_control = logical(0), EntrezReannotated = integer(0), ProbeQuality = character(0), ProbeSequence = character(0), SymbolReannotated = character(0), SymbolReannotated_orgHsEg = character(0), GenomicLocation_chr = character(0), GenomicLocation_start = integer(0), GenomicLocation_ende = integer(0), badprobe_dunning = logical(0)), .Names = c("ilmn", "nuid", "Source", "ILMN_Gene", "Entrez_Gene_ID", "Accession", "Symbol", "Probe_Type", "Cytoband", "Definition", "Synonyms", "is_control", "EntrezReannotated", "ProbeQuality", "ProbeSequence", "SymbolReannotated", "SymbolReannotated_orgHsEg", "GenomicLocation_chr", "GenomicLocation_start", "GenomicLocation_ende", "badprobe_dunning"), row.names = integer(0), class = "data.frame")
status04_missingilmns = ifelse(length(checkalleilmndrinn)!= 1, paste("Problem found: Following expression files did not include all",
laengeilmn, "Illumina probes:\n",
paste0(missing_ilmns$nobkgd_f, "\n")),
paste("All expression files included ", laengeilmn,
"Illumina probes, no problem identified.\n"))
message(status04_missingilmns)
## ----allelIlmnDA2--------------------------------------------------------
head(all_con)
# all_con = read.delim("_archive//illumina_annot_control_beads_110407hk.txt")
for(i in na.omit(unique(sample_overview_l$con_f))){
# i = "ControlProbeProfile_nonorm_nobkgd_A1-Kohorte.txt"
message("Check following file for presence of 883 Illumina-control probes: ", i, "\n")
pfad <-i
fileset_id <- unique(na.omit(sample_overview_l[ sample_overview_l$con_f ==i, "fileset_id"]))
ilmn_read <- fread(pfad, select = 1:2)
# str(ilmn_read)
ilmn_read = ilmn_read$ProbeID
# print(sum(ilmn_read %in% all_con$Array_Address_Id))
all_con$neu <- all_con$Array_Address_Id %in% ilmn_read
names(all_con)[names(all_con)=="neu"] <- fileset_id
}
ht(all_con,1)
uebersicht_ilmnCons <- sapply(all_con[ duplicated(all_con$Probe_Id)==F,7:ncol(all_con), drop = F], function(x) sum(x))
checkall_condrinn <- table(uebersicht_ilmnCons == length(all_con$Array_Address_Id), useNA="ifany")
checkall_condrinn
uebersicht_ilmnCons[ uebersicht_ilmnCons != length(all_con$Array_Address_Id)]
missing_ilmnCons <- sapply(all_con[,7:ncol(all_con), drop = F], function(x) paste(all_con$Probe_Id[x ==F], collapse = ", "))
missing_ilmnCons = missing_ilmnCons[ stringr::str_length(missing_ilmnCons) !=0]
missing_ilmnCons = data.frame(missing_ilmnCons=missing_ilmnCons, fileset_id = names(missing_ilmnCons))
all(missing_ilmnCons$fileset_id %in% sample_overview_l$fileset_id)
missing_ilmnCons$nobkgd_f = file_overview[ match_hk(missing_ilmnCons$fileset_id, file_overview$nobkgd), "nobkgd_f"]
missing_ilmnCons$n_ilmn = uebersicht_ilmnCons[ match_hk(missing_ilmnCons$fileset_id, names(uebersicht_ilmnCons))]
laengeilmncon = unique(uebersicht_ilmnCons)
status04_missingilmnCons = ifelse(length(checkall_condrinn)!= 1,
paste("Problem found: Following control files did not include all",
length(all_con$Array_Address_Id), "Illumina control probes:\n",
paste0(missing_ilmnCons$nobkgd_f, "\n")),
paste("All control files included all",
laengeilmncon, "Illumina control probes, no problem identified.\n"))
## ----anhuebschen---------------------------------------------------------
# gleiche Ids
sample_overview_l$Sample.IDclean = stringr::str_trim(sample_overview_l$Sample.ID) # oben, in funktion getIDsFromKnutSamplefiles wird ein warning generiert fuer diesen fall
sample_overview_l$old_ID.IDclean = stringr::str_trim(sample_overview_l$old_ID) # oben, in funktion getIDsFromKnutSamplefiles wird ein warning generiert fuer diesen fall
checkedIDs = apply(sample_overview_l[, c('old_ID.IDclean', "Sample.IDclean", "Sample_ID")], 1,function (x) length(na.omit(unique(stringr::str_trim(x)))) >1)
check = sample_overview_l[checkedIDs,c( 'old_ID.IDclean', "Sample.IDclean", "Sample_ID")]
check
checkdetail = sample_overview_l[checkedIDs,]
status05checkedIDsConsistency = ifelse(nrow(check) != 0,('Problem found: Different IDs for identical samples used within columns "Sample.ID", "Sample_ID", and rownames of Expressionfiles. Please clarify reasons!'),('No different IDs for identical samples used within columns "Sample.ID", "Sample_ID", and rownames of Expressionfiles, no problem identified.\n'))
status05checkedIDsConsistency_data = check
if(nrow(check) != 0) warning(status05checkedIDsConsistency) else message(status05checkedIDsConsistency)
sample_overview_l$Sample.IDclean = NULL
sample_overview_l$old_ID.IDclean = NULL
## add on 25.6.18 old ID darf trailing leerzeichen haben, wird implizit ignoriert durch str_trim. Die trailing spacesim expression file scheinen ohnehin ignoriert zu werden,also sogar notwendig, wenn einheitlich trailing spaces vergeben wurden
sample_overview_l$old_ID = stringr::str_trim(sample_overview_l$old_ID)
## ----releva--------------------------------------------------------------
names(sample_overview_l)[names(sample_overview_l) == "samples"] = "new_ID"
good_columns = c("paste_id", "old_ID", "new_ID", "nobkgd_f", "con_f", "sample_f", "fileset_id", "Index", "Sample.Group", "Sentrix.Barcode", "Sample.Section", "Detected.Genes..0.01.", "Detected.Genes..0.05.", "Signal.Average", "Signal.P05", "Signal.P25", "Signal.P50", "Signal.P75", "Signal.P95", "BIOTIN", "CY3_HYB", "HOUSEKEEPING", "LABELING", "LOW_STRINGENCY_HYB", "NEGATIVE..background.", "Noise") # , "bkdg_prblm" entfernt, weil nicht mehr daten dafuer da
qlist44 = venn2(good_columns, names(sample_overview_l), plotte = showVennplots)
# str(qlist44)
sample_overview_l2 = sample_overview_l[,good_columns]
## ----sentrihaeurf und umwandeln--------------------------------------------------------
sample_overview_l2$Sentrix.Barcode = as.character(sample_overview_l2$Sentrix.Barcode) # neu 21.8.18 to avoid int64 class
check_haeufigkeitSentrixIDs = table(table(sample_overview_l2$Sentrix.Barcode))
check_haeufigkeitSentrixIDs
if(as.numeric(max(names(check_haeufigkeitSentrixIDs), na.rm = T)) >12) warning("sentrix IDs should only have 12 indis")
## ----speichern-----------------------------------------------------------
# for doku
alle_statusse = ls()[grep("^status", ls())]
alle_statusse
status01_allelfilesda
status02_mehrfachIDs
status03_badSentrix
status05checkedIDsConsistency
qlist332 = venn2(sample_overview$samples, sample_overview_l2$old_ID, plotte = showVennplots)
sample_overview_l2$in_study = TRUE
sample_overview_l2$reason4exclusion = NA_character_
sample_overview_l2$subgroup = 'all'
sample_overview_l2$processingbatch = sample_overview_l2$fileset_id
sample_overview_l2$strangebatch = 0
res = vector(mode = "list", length = 1)
class(res) = unique(c(class(res), "HT12prepro"))
res$chipsamples = sample_overview_l2
res$dokuobjects_checkExtractChipsamples = list(missing_ilmns, missing_ilmnCons,missingilmns_det,sample_overview_l2, check_haeufigkeitSentrixIDs, mget(alle_statusse))
names(res$dokuobjects_checkExtractChipsamples) = c('missing_ilmns', 'missing_ilmnCons','missingilmns_det','sample_overview_l2', 'check_haeufigkeitSentrixIDs', 'alle_statusse')
res$history = data.frame(calls = paste(Sys.time(), deparse(myparameters)))
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.