#' Main deviation.event processing fuction
#'
#' @param x row containing the deviation event
#' @param snpm.data dataframe containing the BAF's
#' @param correctionfactor number used in processing the event
#' @param gender string denoting gender
#' ( "Male", "Female" or "Unknown")
#' @import e1071
#' @importFrom graphics hist
#'
process.deviation.event <- function(x, snpm.data, correctionfactor, gender) {
#Saving parameters for homozygous BAF
minbaf <- c(0,0.1)
maxbaf <- c(1,0.9)
#Making of used variables
chr <- x[1]
start <- as.integer(x[2])
stop <- as.integer(x[3])
CNVType <- x[4]
#Manipulating Allelic Imbalance to be Loss Of Heterozygosity
##Only called when no matching copy number variation was called
if(CNVType == "Allelic Imbalance"){ ##COMMENT: PUT TERM IN CONSTANTS.R
CNVType <- "LOH" ##COMMENT: PUT TERM IN CONSTANTS.R
}
#If event matches XY (male), reapply the start and stop to fully cover the XY area
#(Ref start/stop: HumanCytoSNP 850K; Illumina) ###COMMENT: MOVE COMMENT ABOUT HUMANCYTOSNP 850K TO CONSTANTS.R FILE. KEEP GENERIC HERE.
if(all(chr == "XY", gender == "Male")){
start <- 0
stop <- as.integer(155270561) ####COMMENT: CAN THIS NUMBER BE INCLUDED IN A CONSTANTS.R FILE? AS FULL_X_HUMANCYTOSNP850K.
if(CNVType == "Homozygous Copy Loss"){ ##COMMENT: PUT TERM IN CONSTANTS.R
CNVType <- "CN Loss" ##COMMENT: PUT TERM IN CONSTANTS.R
}
}
#Making of Output Chromosome Region (OCR)
OCR <- paste("chr",chr,":",formatC(start, big.mark = ","),"-",formatC(stop, big.mark = ","), sep = "")
#Homozygous copy losses disturb the BAF signal
if(CNVType == "Homozygous Copy Loss"){
return(c(OCR,"Homozygous deletion","NA","NA","NA","NA","NA","NA","NA","NA"))
}
else{
#Manipulating the SNP-array file to only contain the matching chromosome.
df1 <- snpm.data[snpm.data$Chr == chr,]
df1 <- df1[complete.cases(df1),]
#Filtering the SNP-array file to contain the SNPs that are within range of start and stop
region_of_deviation <- c(as.integer(start), as.integer(stop))
df1 <- df1[with(df1, Position <= max(region_of_deviation) & Position >= min(region_of_deviation)),]
df1 <- df1[complete.cases(df1),]
df1 <- df1[with(df1, B.Allele.Freq >= max(minbaf) & B.Allele.Freq <= min(maxbaf)),]
df1 <- df1[complete.cases(df1),]
#Check to control the number of rows that remain
if(any(nrow(df1) < 1,nrow(df1[df1$B.Allele.Freq > 0.5,]) <= 3, nrow(df1[df1$B.Allele.Freq < 0.5,]) <= 3)){
#With deletions, its highly likely that all SNPs are filtered out because of a high percentage mosaicism
if(CNVType %in% c("CN Loss", "LOH")){ ##COMMENT: PUT TERM IN CONSTANTS.R
#Building of a percentage output: Number of Rows Top (NRT) and Number of Rows Bottom (NRB)
NRT <- paste("[>90/Too few usable SNPs][",nrow(df1[df1$B.Allele.Freq > 0.5,]),"]", sep = "")
NRB <- paste("[>90/Too few usable SNPs][",nrow(df1[df1$B.Allele.Freq < 0.5,]),"]", sep = "")
#Make CNVType more clear
if(CNVType == "CN Loss"){ ##COMMENT: PUT TERM IN CONSTANTS.R
CNVType <- "Deletion"
}
return(c(OCR,CNVType,"NA","NA",NRT,NRB,"NA","NA","NA","NA"))
}
else{
#Else: there are not enough SNPs to accuratly calculate a percentage range
NRT <- paste("[Too few usable SNPs][",nrow(df1[df1$B.Allele.Freq > 0.5,]),"]", sep = "")
NRB <- paste("[Too few usable SNPs][",nrow(df1[df1$B.Allele.Freq < 0.5,]),"]", sep = "")
#Make CNVType more clear
if(CNVType == "CN Gain"){ ##COMMENT: PUT TERM IN CONSTANTS.R
CNVType <- "Duplication"
}
if(CNVType == "High Copy Gain"){ ##COMMENT: PUT TERM IN CONSTANTS.R
CNVType <- "Tetrasomy"
}
return(c(OCR,CNVType,"NA","NA",NRT,NRB,"NA","NA","NA","NA"))
}
}
else {
#Splitting the dataframe in 2 dataframes, one containing SNP with a BAF >0.5 and one <0.5 for outlier removing
df1 <- df1[complete.cases(df1),]
df2 <- df1[df1$B.Allele.Freq > 0.5,]
df1 <- df1[df1$B.Allele.Freq < 0.5,]
#Apply the quantile outlier remove filter
df1$B.Allele.Freq <- remove_outliers(df1$B.Allele.Freq)
df1 <- df1[complete.cases(df1),]
df2$B.Allele.Freq <- remove_outliers(df2$B.Allele.Freq)
df2 <- df2[complete.cases(df2),]
#Combine the 2 dataframes to correct the BAF
equaldf <- rbind(df1, df2)
#Applying the correction factor to always correct towards 0.5
MeanOfMutation <- c(mean(df2$B.Allele.Freq),mean(df1$B.Allele.Freq))
MeanOfMutation <- mean(MeanOfMutation)
#Remove df1 and df2 to make sure no numbers get mixed up
remove(df1,df2)
if(all(MeanOfMutation > 0.5, correctionfactor > 1)){
equaldf$B.Allele.Freq <- equaldf$B.Allele.Freq / correctionfactor
}
if(all(MeanOfMutation < 0.5, correctionfactor < 1)){
equaldf$B.Allele.Freq <- equaldf$B.Allele.Freq / correctionfactor
}
if(all(MeanOfMutation > 0.5, correctionfactor < 1)){
equaldf$B.Allele.Freq <- equaldf$B.Allele.Freq * correctionfactor
}
if(all(MeanOfMutation < 0.5, correctionfactor > 1)){
equaldf$B.Allele.Freq <- equaldf$B.Allele.Freq * correctionfactor
}
#Reapply the above and below 0.5 BAF dataframes
df2 <- equaldf[equaldf$B.Allele.Freq > 0.5,]
df1 <- equaldf[equaldf$B.Allele.Freq < 0.5,]
#Create histograms of the deviation
#Create histogram CNVtype
if(CNVType == "CN Gain"){ ##COMMENT: PUT TERM IN CONSTANTS.R
hCNVType <- "Duplication"
}
if(CNVType == "CN Loss"){ ##COMMENT: PUT TERM IN CONSTANTS.R
hCNVType <- "Deletion"
}
if(CNVType == "High Copy Gain"){ ##COMMENT: PUT TERM IN CONSTANTS.R
hCNVType <- "Tetrasomy"
}
hist(equaldf$B.Allele.Freq, main = paste("[",chr,":",formatC(start, big.mark = ","),"-",formatC(stop, big.mark = ",")," ", hCNVType,"] [Combined BAF]", sep = ""), xlab = "BAF")
hist(df2$B.Allele.Freq, main = paste("[",chr,":",formatC(start, big.mark = ","),"-",formatC(stop, big.mark = ",")," ", hCNVType,"] [BAF > 0.5]", sep = ""), xlab = "BAF")
hist(df1$B.Allele.Freq, main = paste("[",chr,":",formatC(start, big.mark = ","),"-",formatC(stop, big.mark = ",")," ", hCNVType,"] [BAF < 0.5]", sep = ""), xlab = "BAF")
#A high chanse occurs for memory leaks, apply gc()
suppressMessages(gc())
#Building of an equaltest for low percentage mosaicisms
#Note: The statistical P values will always be rounded down!
#More than 2000 rows: Kolmogorov-Smirnov
if(nrow(equaldf) > 2000){
equaltest <- suppressWarnings(ks.test(x = equaldf$B.Allele.Freq, "pnorm", 0.5, sd(equaldf$B.Allele.Freq))[1:2][2][[1]])
equaltest <- floor(equaltest * 1000) / 1000
equaltest <- c(equaltest, "[K]")
}
#Between 30 and 2000 rows: Shapiro-Wilk
if(all(nrow(equaldf) <= 2000, nrow(equaldf[equaldf$B.Allele.Freq > 0.5,]) > 30, nrow(equaldf[equaldf$B.Allele.Freq < 0.5,]) > 30)){
equaltest <- shapiro.test(equaldf$B.Allele.Freq)[1:2][2][[1]]
equaltest <- floor(equaltest * 1000) / 1000
equaltest <- c(equaltest, "[S]")
}
#Lower than 30: No accurate test possible
if(all(nrow(equaldf) <= 2000, any(nrow(equaldf[equaldf$B.Allele.Freq > 0.5,]) <= 30, nrow(equaldf[equaldf$B.Allele.Freq <0.5,]) <= 30))){
equaltest <- "NA"
}
#Apply tag "Good" (N.V. < 0.05) or "M.G." (N.V. >= 0.05) for the combined BAF of mutation ###COMMENT: ASK MARLOES WHAT M.G. MEANS####
if(equaltest[1] >= 0.05){
outputequaltest <- paste("[M.G]",equaltest[2],"[",equaltest[1],"]", sep = "")
}
if(equaltest[1] < 0.05){
outputequaltest <- paste("[Good]",equaltest[2],"[",equaltest[1],"]", sep = "")
}
if(equaltest[1] == "NA"){
outputequaltest <- "NA"
}
#Remove MeanOfMutation so numbers don't get mixed up
remove(MeanOfMutation)
#Saving the mean of the mean, with applied correction factor
##Mean of mean, because now the mean of 2 points (both means) are taken
MeanOfMutation <- c(mean(df1$B.Allele.Freq),mean(df2$B.Allele.Freq))
MeanOfMutation <- round(mean(MeanOfMutation),digits = 4)
#Applying the Skewness, Shapiro-Wilk/Kolmogorov-Smirnov over the split dataframes
#Same parameters apply here as in equaltest
#Note: The statistical P values will always be rounded down!
SkewnessPValueT <- round(skewness(df2$B.Allele.Freq), digits = 4)
SkewnessPValueB <- round(skewness(df1$B.Allele.Freq), digits = 4)
if(nrow(equaldf) <= 2000){
if(any(nrow(df1) <= 30, nrow(df2) <= 30)){
shapiro1 <- "NA"
shapiro2 <- "NA"
}
else{
shapiro1 <- shapiro.test(df2$B.Allele.Freq)[1:2][2][[1]]
shapiro1 <- floor(shapiro1 * 1000) / 1000
shapiro1 <- c(shapiro1, "[S]")
shapiro2 <- shapiro.test(df1$B.Allele.Freq)[1:2][2][[1]]
shapiro2 <- floor(shapiro2 * 1000) / 1000
shapiro2 <- c(shapiro2, "[S]")
}
}
if(nrow(equaldf) > 2000){
shapiro1 <- suppressWarnings(ks.test(x = df2$B.Allele.Freq, "pnorm",mean(df2$B.Allele.Freq),sd(df2$B.Allele.Freq))[1:2][2][[1]])
shapiro1 <- floor(shapiro1 * 1000) / 1000
shapiro1 <- c(shapiro1, "[K]")
shapiro2 <- suppressWarnings(ks.test(x = df1$B.Allele.Freq, "pnorm",mean(df1$B.Allele.Freq),sd(df1$B.Allele.Freq))[1:2][2][[1]])
shapiro2 <- floor(shapiro2 * 1000) / 1000
shapiro2 <- c(shapiro2, "[K]")
}
#Calculating the standard deviation over both dataframes
stdevH <- sd(df2$B.Allele.Freq)
stdevL <- sd(df1$B.Allele.Freq)
#Calculating a 95% confidence interval over both dataframes
#BI95N: confidence interval, percentage, H for High (>0.5) and L for Low (<0.5)
BI95H <- qnorm(0.975)*stdevH/sqrt(nrow(df2))
BI95L <- qnorm(0.975)*stdevL/sqrt(nrow(df1))
StatsHigh95 <- c(mean(df2$B.Allele.Freq)-BI95H, mean(df2$B.Allele.Freq)+BI95H)
StatsLow95 <- c(mean(df1$B.Allele.Freq)-BI95L, mean(df1$B.Allele.Freq)+BI95L)
#Applying the correct formula for the current event
if(CNVType == "CN Loss"){ ##COMMENT: PUT TERM IN CONSTANTS.R
#Loading calculation for the BAF >0.5 for a deletion
calcdel2 <- function(x){-279.71*x^2+609.65*x-232.73}
#Loading calculation for the BAF <0.5 for a deletion
calcdel <- function(x){-279.71*x^2-50.244*x+97.219}
#Calculating percentage range
P95H <- c(round(calcdel2(min(StatsHigh95))), round(calcdel2(max(StatsHigh95))))
P95L <- c(round(calcdel(min(StatsLow95))), round(calcdel(max(StatsLow95))))
}
if(CNVType == "CN Gain"){ ##COMMENT: PUT TERM IN CONSTANTS.R
#Loading calculation for the BAF >0.5 for a duplication
calcdup2 <- function(x){1464*x^2-1118.2*x+193.99}
#Loading calculation for the BAF <0.5 for a duplication
calcdup <- function(x){1464*x^2-1809.8*x+539.83}
#Calculating percentage range
P95H <- c(round(calcdup2(min(StatsHigh95))), round(calcdup2(max(StatsHigh95))))
P95L <- c(round(calcdup(min(StatsLow95))), round(calcdup(max(StatsLow95))))
}
if(CNVType == "High Copy Gain"){ ##COMMENT: PUT TERM IN CONSTANTS.R
#Loading calculation for the BAF >0.5 for a tetrasomy
calctetra2 <- function(x){1118.8*x^2-1018.3*x+232.24}
#Loading calculation for the BAF <0.5 for a tetrasomy
calctetra <- function(x){1118.8*x^2-1219.3*x+332.73}
#Calculating percentage range
P95H <- c(round(calctetra2(min(StatsHigh95))), round(calctetra2(max(StatsHigh95))))
P95L <- c(round(calctetra(min(StatsLow95))), round(calctetra(max(StatsLow95))))
}
if(CNVType == "LOH"){ ##COMMENT: PUT TERM IN CONSTANTS.R
#Loading calculation for the BAF >0.5 for a uniparental disomy event
calchomo2 <- function(x){200*x-100}
#Loading calculation for the BAF <0.5 for a uniparental disomy event
calchomo <- function(x){-200*x+100}
#Calculating percentage range
P95H <- c(round(calchomo2(min(StatsHigh95))), round(calchomo2(max(StatsHigh95))))
P95L <- c(round(calchomo(min(StatsLow95))), round(calchomo(max(StatsLow95))))
}
#If the minimum percentage is higher than 100 (which is only possible with a duplication or tetrasomy):
#Calculate for tetrasomy
if(min(c(P95H,P95L)) > 100){
#Loading of calculations for a tetrasomy
calctetra2 <- function(x){1118.8*x^2-1018.3*x+232.24}
calctetra <- function(x){1118.8*x^2-1219.3*x+332.73}
#Calculating percentage range
P95H1 <- c(round(calctetra2(min(StatsHigh95))), round(calctetra2(max(StatsHigh95))))
P95L1 <- c(round(calctetra(min(StatsLow95))), round(calctetra(max(StatsLow95))))
#Reset percentages higher than 100 to 100
P95H1[P95H1 > 100] <- 100
P95L1[P95L1 > 100] <- 100
#Check whenever the N.V. > 0.05 has a P-value lower than 0.05
#If so, return with Note tag
if(shapiro1[1] < 0.05){
#Check whenever percentages are the same
if(min(P95H1) == max(P95H1)){
#If they are, return only 1
OPT <- paste("[Note][",min(P95H1),"][",nrow(df2),"]", sep = "")
}
else{
OPT <- paste("[Note][",min(P95H1),"-",max(P95H1),"][",nrow(df2),"]",sep = "")
}
}
#If not, return normally
else{
if(min(P95H1) == max(P95H1)){
OPT <- paste("[",min(P95H1),"][",nrow(df2),"]", sep = "")
}
else{
OPT <- paste("[",min(P95H1),"-",max(P95H1),"][",nrow(df2),"]",sep = "")
}
}
#Check whenever the N.V. < 0.05 has a P-value lower than 0.05
#If so, return with Note tag
if(shapiro2[1] < 0.05){
#Check whenever percentages are the same
if(min(P95L1) == max(P95L1)){
#If they are, return only 1
OPB <- paste("[Note][",min(P95L1),"][",nrow(df1),"]", sep = "")
}
else{
OPB <- paste("[Note][",min(P95L1),"-",max(P95L1),"][",nrow(df1),"]",sep = "")
}
}
#If not, return normally
else{
if(min(P95L1) == max(P95L1)){
OPB <- paste("[",min(P95L1),"][",nrow(df1),"]", sep = "")
}
else{
OPB <- paste("[",min(P95L1),"-",max(P95L1),"][",nrow(df1),"]",sep = "")
}
}
#Combining the 2 shapiro variables to 1 separately
if(length(shapiro1) > 1){
#Combining the 2 shapiro variables to 1 separately
shapiro1 <- paste(shapiro1[1],shapiro1[2], sep = "")
shapiro2 <- paste(shapiro2[1],shapiro2[2], sep = "")
}
#Return output, with event altered to !High Copy Gain!
return(c(OCR,"!Tetrasomy!",MeanOfMutation,outputequaltest,OPT,OPB,SkewnessPValueT,SkewnessPValueB,shapiro1,shapiro2))
}
else{
#Reset percentages higher than 100 to 100
P95H[P95H > 100] <- 100
P95L[P95L > 100] <- 100
#Check whenever P > 0.5 has a normal distribution
if(shapiro1[1] < 0.05){
#Check if it's a deletion with a minimum percentage of 80
if(all(CNVType %in% c("CN Loss","LOH"),min(P95H) > 80)){ ##COMMENT: PUT TERM IN CONSTANTS.R
OPT <- paste("[>",min(P95H),"][",nrow(df2),"]", sep = "")
}
else{
#Check whenever the percentages are the same
if(min(P95H) == max(P95H)){
#If they are, return only 1
P95H <- P95H[1]
OPT <- paste("[Note][",P95H,"][",nrow(df2),"]",sep = "")
}
else{
#If they aren't, return minimum to maximum
OPT <- paste("[Note][",min(P95H),"-",max(P95H),"][",nrow(df2),"]",sep = "")
}
}
}
#Else it is normally distributed
else{
#Check whenever the percentages are the same
if(min(P95H) == max(P95H)){
#If they are, return only 1
P95H <- P95H[1]
OPT <- paste("[",P95H,"][",nrow(df2),"]",sep = "")
}
else{
#If they aren't, return minimum to maximum
OPT <- paste("[",min(P95H),"-",max(P95H),"][",nrow(df2),"]",sep = "")
}
}
#Apply the same check for P < 0.5
if(shapiro2[1] < 0.05){
#Check whenever it's a deletion with a minimum percentage of 80
if(all(min(P95L) > 80, CNVType %in% c("CN Loss","LOH"))){ ##COMMENT: PUT TERM IN CONSTANTS.R
OPB <- paste("[>",min(P95L),"][",nrow(df1),"]", sep = "")
}
else{
#Check if minimum and maximum percentage are the same
if(min(P95L) == max(P95L)){
P95L <- P95L[1]
OPB <- paste("[Note][",P95L,"][",nrow(df1),"]", sep = "")
}
else{
OPB <- paste("[Note][",min(P95L),"-",max(P95L),"][", nrow(df1),"]", sep = "")
}
}
}
#Else it is normally distributed
else{
if(min(P95L) == max(P95L)){
P95L <- P95L[1]
OPB <- paste("[",P95L,"][",nrow(df1),"]",sep = "")
}
else{
OPB <- paste("[",min(P95L),"-",max(P95L),"][",nrow(df1),"]",sep = "")
}
}
#Combining the 2 shapiro variables to 1 separately
if(length(shapiro1) > 1){
#Combining the 2 shapiro variables to 1 separately
shapiro1 <- paste(shapiro1[1],shapiro1[2], sep = "")
shapiro2 <- paste(shapiro2[1],shapiro2[2], sep = "")
}
#Reapply chromosome Y or X with analysis of Y or X with male patients
if(all(gender == "Male", chr == "XY")){
OCR <- paste("PAR(X/Y)", sep = "")
}
#Make CNV Type more clear
if (CNVType == "CN Gain") { ##COMMENT: PUT TERM IN CONSTANTS.R
CNVType <- "Duplication"
}
if (CNVType == "CN Loss") { ##COMMENT: PUT TERM IN CONSTANTS.R
CNVType <- "Deletion"
}
if (CNVType == "High Copy Gain") { ##COMMENT: PUT TERM IN CONSTANTS.R
CNVType <- "Tetrasomy"
}
#Return output
return(c(OCR,CNVType,MeanOfMutation,outputequaltest,OPT,OPB,SkewnessPValueT,SkewnessPValueB,shapiro1,shapiro2))
}
}
}
}
#' Function to rebuild deviations to usefull information
#'
#' @param x line containing raw the deviation event
#' @param gender string denoting gender
#' ( "Male", "Female" or "Unknown")
process.deviation.line <- function(x, gender) {
#Selection of usefull information
events <- cbind(x[,1:3],x$Probes)
colnames(events) <- c("Chromosome.Region","Event","Length","Probes")
#Removing "chr" in front of the chromosome and the comma in the start and stop
events$Chromosome.Region <- gsub("chr","",events$Chromosome.Region)
events$Chromosome.Region <- gsub(",","",events$Chromosome.Region)
#Splitsing of the chromosome and start-stop
temp <- apply(events, 1, function(x){strsplit(x[1], ":")[[1]]})
temp <- data.frame(matrix(unlist(temp), nrow = length(temp) / 2, byrow = T), stringsAsFactors = F)
#Splitting of the start and stop
temp2 <- apply(temp,1,function(x){strsplit(x[2], "-")[[1]]})
temp2 <- data.frame(matrix(unlist(temp2), nrow = length(temp2) / 2, byrow = T), stringsAsFactors = F)
#Combining the chromosome, start, stop and event
events <- cbind(temp[,1],temp2,events[,2:4])
colnames(events) <- c("Chr","Start","Stop","Event","Length","Probes")
remove(temp,temp2)
#Alterating events on sex chromosomes in Male patients
if(gender == "Male"){
#Only the PAR areas are heterozygous, which are marked XY
events$Chr <- gsub("Y|X","XY",events$Chr)
#Removing allelic imbalance calls
events[with(events, Chr == "XY" & Event %in% c("LOH","Allelic Imbalance")),] <- NA ##COMMENT: PUT TERM IN CONSTANTS.R
events <- events[complete.cases(events),]
#If more than 1 event remains, only the first one will be used specifically
if(nrow(events[events$Chr == "XY",]) > 1){
events <- rbind(events[events$Chr != "XY",],events[events$Chr == "XY",][1,])
}
if(nrow(events[events$Chr == "XY",]) == 1){
events[events$Chr == "XY",][,5] <- 100001 #ASK MARLOES WHERE THIS NUMBER COMES FROM. START PAR1?
events[events$Chr == "XY",][,6] <- 11 #ASK MARLOES WHERE THIS NUMBER COMBES FROM.
}
}
if(gender == "Female"){
gsub("XY","X",events$Chr)
}
#Setting Start and Stop as integer
events$Start <- as.integer(events$Start)
events$Stop <- as.integer(events$Stop)
return(events)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.