## Source: R/findingFeatures/percentNormality.R

#' Fraction of measurements whose Barcode-level distribution passes a KS test
#'
#' Melts the measurement columns of `DT` (every column to the right of the
#' first column whose name matches "ExactMass") into long format, drops rows
#' with missing values, and runs a one-sample Kolmogorov-Smirnov test against
#' a normal distribution for each `Barcode` group, using that group's own mean
#' and standard deviation. Every melted row inherits the p-value of its
#' Barcode group; the result is the fraction of rows with p-value > `p_KS`.
#'
#' The test is repeated for four transformations of the measured value
#' (matrix rows, in order):
#'   1. raw value
#'   2. log10(abs(value))
#'   3. z-score of the raw value, scaled within each melted `variable`
#'   4. z-score of log10(abs(value)), scaled within each melted `variable`
#'
#' and tallied over six row subsets (matrix columns, in order):
#'   1. all rows
#'   2. rows whose validation column is 3 or 4
#'   3. rows with GeneralID "Standard" ("STANDARD" is normalised to this)
#'   4. rows with GeneralID "Unidentified" or "Artifact"
#'   5. rows whose (variable, library.Q1, library.Q3) combination is unique
#'   6. rows whose (variable, library.Q1, library.Q3) combination is repeated
#'
#' @param DT A data.table with columns `GeneralID`, `Barcode`, `library.Q1`,
#'   `library.Q3`, one column matching "ExactMass", one column matching
#'   "Structural.ID.by." and at least one measurement column after
#'   "ExactMass".
#' @param p_KS Numeric threshold; a row counts as normal when its KS p-value
#'   is strictly greater than this.
#' @return An unnamed 4 x 6 numeric matrix of fractions (transformations in
#'   rows, subsets in columns). A subset with no rows yields NaN.
percentNormality <- function(DT, p_KS){

    ## One-sample KS p-value of x against a normal distribution parameterised
    ## by the mean and standard deviation of x itself.
    ksPValue <- function(x) {
        ks.test(x, "pnorm", mean(x), sd(x))$p.value
    }

    ## Measurement columns are everything to the right of "ExactMass".
    massCol <- grep("ExactMass", colnames(DT))
    if (length(massCol) == 0L) {
        stop("percentNormality: no column matching 'ExactMass' in DT",
             call. = FALSE)
    }
    minimumCol <- massCol[1L] + 1
    if (minimumCol > length(DT)) {
        stop("percentNormality: no measurement columns after 'ExactMass'",
             call. = FALSE)
    }
    measureCols <- colnames(DT)[minimumCol:length(DT)]

    ## melt() and the NA filter both return new tables, so the by-reference
    ## updates below never touch the caller's DT.
    meltDT <- melt(DT, measure.vars = measureCols)
    meltDT <- meltDT[!is.na(value)]
    meltDT[GeneralID == "STANDARD", GeneralID := "Standard"]
    num <- grep("Structural.ID.by.", colnames(meltDT))
    setnames(meltDT, num, "Validation")

    ## Flag every member of a repeated Q1/Q3 transition within a variable.
    ## Scanning from both ends marks the first occurrence as well as the
    ## later ones, regardless of row order.
    meltDT[, temp := paste(library.Q1, library.Q3, sep = "_")]
    transitionKey <- c("variable", "temp")
    isMultiple <- duplicated(meltDT, by = transitionKey) |
        duplicated(meltDT, by = transitionKey, fromLast = TRUE)

    ## The four value transformations that are tested for normality.
    meltDT[, rawValue := as.numeric(value)]
    meltDT[, logValue := log10(abs(rawValue))]
    meltDT[, Z.score := as.vector(scale(rawValue)), by = "variable"]
    meltDT[, Z.score.log := as.vector(scale(logValue)), by = "variable"]
    transformCols <- c("rawValue", "logValue", "Z.score", "Z.score.log")

    ## Row subsets, in output-column order. %in% never yields NA, which
    ## matches how data.table drops NA rows in a logical `i` filter.
    subsetMasks <- list(
        rep(TRUE, nrow(meltDT)),
        meltDT[["Validation"]] %in% c(3, 4),
        meltDT[["GeneralID"]] %in% "Standard",
        meltDT[["GeneralID"]] %in% c("Unidentified", "Artifact"),
        !isMultiple,
        isMultiple
    )
    denominators <- vapply(subsetMasks, sum, integer(1L))

    percentNorm <- matrix(NA_real_,
                          nrow = length(transformCols),
                          ncol = length(subsetMasks))
    for (i in seq_along(transformCols)) {
        meltDT[, normality := ksPValue(.SD[[1L]]),
               by = "Barcode", .SDcols = transformCols[i]]
        isNormal <- meltDT[["normality"]] > p_KS
        ## Rows with an undefined p-value are not counted as normal.
        isNormal[is.na(isNormal)] <- FALSE
        numerators <- vapply(subsetMasks,
                             function(mask) sum(mask & isNormal),
                             integer(1L))
        percentNorm[i, ] <- numerators / denominators
    }

    percentNorm
}
## jchitpin/lipidMS documentation built on June 3, 2019, 7:58 p.m.