#' kmFit data cleaning
#'
#' Data
#' @param dat EList object output by voom( ). Contains counts (dat$E), meta (dat$targets), and genes (dat$genes).
#' @param kin Matrix with pairwise kinship values between individuals. Must be numeric with rownames.
#' @param patientID Character of variable name to match dat$targets to kinship row and column names.
#' @param libraryID Character of variable name to match dat$targets to dat$E colnames
#'
#' Alternate data if not using EList object
#' @param counts Matrix of normalized expression. Rows are genes, columns are libraries.
#' @param meta Matrix or data frame of sample and individual metadata.
#' @param genes Matrix or data frame of gene metadata.
#' @param weights Matrix or data frame of gene specific weights
#'
#' Subset data (optional)
#' @param subset_var Character list of variable name(s) to filter data by.
#' @param subset_lvl Character list of variable value(s) or level(s) to filter data to. Must match order of subset_var
#' @param subset_genes Character vector of genes to include in models.
#' @param model_lm Character vector of simple linear model version of model provided
#' @param genotype_name Character string. Used internally for kmFit_eQTL
#'
#' @return Data frame formatted for use in kmFit
#' @keywords internal
#' @importFrom tidyselect vars_select_helpers
kimma_cleaning <- function(dat=NULL, kin=NULL, patientID="ptID", libraryID="libID",
counts=NULL, meta=NULL, genes=NULL, weights=NULL,
subset_var = NULL, subset_lvl = NULL, subset_genes = NULL,
model_lm = NULL, genotype_name = NULL,
run_lmerel = FALSE){
i <- rowname <- NULL
#If data are NOT a voom EList, create a mock version
if(is.null(dat)) {
dat.format <- list()
#Expression data
##Move rownames to column if exist
##Order columns as in metadata and genes as in genes
if(rownames(counts)[1]!=1){
counts.format <- as.data.frame(counts) %>%
tibble::rownames_to_column() %>%
dplyr::select(rowname, tidyselect::all_of(unlist(meta[,libraryID],
use.names=FALSE)))
} else {
counts.format <- as.data.frame(counts) %>%
dplyr::rename_if(is.character, ~"rowname")%>%
dplyr::select(rowname, tidyselect::all_of(unlist(meta[,libraryID],
use.names=FALSE)))
}
if(!is.null(genes)){
g.match <- c()
#match expression data to column in genes
for(g in colnames(genes)){
if(any(counts.format$rowname %in% genes[,g])){ g.match <- c(g.match, g) }
}
if(length(g.match) >1){ stop("Cannot combine gene and expression data. More than 1 column in dat$genes or genes contains values that match gene names in dat$E or counts. Please remove duplicate gene name column.") }
counts.format <- counts.format %>%
dplyr::arrange(match(rowname, genes[,g.match])) %>%
tibble::column_to_rownames()
} else {
counts.format <- counts.format %>%
tibble::column_to_rownames()
}
#Metadata
##Remove samples not in expression data
meta.format <- as.data.frame(meta) %>%
dplyr::filter(get(libraryID) %in% colnames(counts.format))
#Weights if provided
if(!is.null(weights)){
weights.format <- as.data.frame(weights)
if(rownames(counts)[1]!=1){
rownames(weights.format) <- rownames(counts)
colnames(weights.format) <- colnames(counts)
} else {
rownames(weights.format) <- as.data.frame(counts) %>%
dplyr::select_if(is.character) %>% unlist(use.names = FALSE)
colnames(weights.format) <- colnames(counts)[-1]
}
weights.format <- weights.format[order(rownames(weights.format)),
colnames(counts.format)]
} else{
weights.format <- NULL
}
#Put in list
dat.format$E <- counts.format
dat.format$targets <- meta.format
dat.format$genes <- genes
dat.format$weights <- weights.format
} else {
dat.format <- dat
#Format weights from voom object
if(!is.null(dat$weights)){
weights.format <- as.data.frame(dat$weights)
if(rownames(dat$E)[1]!=1){
rownames(weights.format) <- rownames(dat$E)
colnames(weights.format) <- colnames(dat$E)
} else {
rownames(weights.format) <- as.data.frame(dat$E) %>%
dplyr::select_if(is.character) %>% unlist(use.names = FALSE)
colnames(weights.format) <- colnames(dat$E)[-1]
}
dat.format$weights <- weights.format[order(rownames(weights.format)),
colnames(dat$E)]
} else {
dat.format$weights <- NULL
}
}
#Format data
#If has rownames, move into df
if(is.numeric(dat.format$E[,1])){
dat.format$E <- tibble::rownames_to_column(as.data.frame(dat.format$E))
} else {
#Rename 1st column
colnames(dat.format$E)[1] <- "rowname"
}
if(!is.null(dat.format$weights) & is.numeric(dat.format$weights[,1])){
dat.format$weights <- tibble::rownames_to_column(dat.format$weights)
} else if(!is.null(dat.format$weights)){
#Rename 1st column
colnames(dat.format$weights)[1] <- "rowname"
}
###### Subset to variable of interest if selected ######
dat.subset <- dat.format
#Subset samples
if(!is.null(subset_var)){
for(i in 1:length(subset_var)) {
dat.subset$targets <- dplyr::filter(dat.subset$targets,
get(subset_var[[i]]) %in% subset_lvl[[i]])
dat.subset$E <- dplyr::select(as.data.frame(dat.subset$E),
rowname,
tidyselect::all_of(dat.subset$targets[,libraryID]))
if(!is.null(dat.subset$weights)){
dat.subset$weights <- dplyr::select(as.data.frame(dat.subset$weights),
rowname,
tidyselect::all_of(dat.subset$targets[,libraryID]))
}
}
}
#Subset genes
if(!is.null(subset_genes)){
dat.subset$E <- dplyr::filter(as.data.frame(dat.subset$E),
rowname %in% subset_genes)
if(!is.null(dat.subset$weights)){
dat.subset$weights <- dplyr::filter(as.data.frame(dat.subset$weights),
rowname %in% subset_genes)
}
}
###### Format data for modeling ####
# Convert IDs if they are the same variable
if(patientID == libraryID){
to.modelID <- "libID"
dat.subset$targets[,to.modelID] <- dat.subset$targets[,patientID]
} else{ to.modelID <- patientID }
if(!is.null(kin) & run_lmerel){
#Format kinship matrix if rownames inside matrix
if(!is.numeric(as.matrix(kin))){
name.col <- as.data.frame(kin) %>%
dplyr::select(!tidyselect::vars_select_helpers$where(is.numeric))
name.col <- colnames(name.col)
if(length(name.col) > 1){
stop("More than one non-numeric column in kin data. Please correct.")
}
kin.format <- as.data.frame(kin) %>%
tibble::column_to_rownames(name.col) %>%
as.matrix()
} else{
kin.format <- as.matrix(kin)
}
#Combine expression data (E) and sample metadata (targets)
to.model <- dat.subset$E %>%
tidyr::pivot_longer(-rowname, names_to = libraryID, values_to = "expression") %>%
dplyr::inner_join(dat.subset$targets, by=libraryID)
if(run_lmerel){
to.model <- to.model %>%
#Remove samples missing kinship
dplyr::filter(get(to.modelID) %in% colnames(kin.format)) %>%
dplyr::arrange(get(to.modelID))
} else{
to.model <- to.model %>%
dplyr::arrange(get(to.modelID))
}
#Add weights if available
if(!is.null(dat.subset$weights)){
to.model <- to.model %>%
dplyr::left_join(tidyr::pivot_longer(dat.subset$weights, -rowname,
names_to = libraryID, values_to = "gene_weight"),
by=c("rowname", libraryID))
} else{
to.model <- to.model %>%
dplyr::mutate(gene_weight = NA)
}
#Remove samples from kinship missing expression data
#Order kinship as in to.model
if(run_lmerel){
to.keep <- unique(unlist(to.model[,to.modelID]))
kin.subset <- as.data.frame(kin.format) %>%
tibble::rownames_to_column() %>%
dplyr::filter(rowname %in% to.keep) %>%
dplyr::select(rowname, tidyselect::all_of(to.keep)) %>%
dplyr::arrange(rowname) %>%
tibble::column_to_rownames()
} else{
kin.subset <- NULL
}
#Total samples messages
##total libraries
lib.no <- nrow(dat.subset$targets)
##total donors
if(patientID %in% colnames(dat.subset$targets)){
pt.no <- dat.subset$targets %>%
dplyr::distinct(get(patientID)) %>% nrow()
message("Input: ", lib.no, " libraries from ", pt.no, " unique individuals")
} else {
message("Input: ",lib.no, " libraries")
}
## Missing data
## Missing kinship
if(run_lmerel){
kin.no <- to.model %>%
dplyr::distinct(get(libraryID)) %>% nrow()
kin.miss <- lib.no-kin.no
if(kin.miss > 0){
message("- ", kin.miss, " libraries missing kinship")
}
} else{
kin.miss <- 0
}
##Missing other variables
all_vars <- gsub("expression~", "", model_lm)
all_vars <- unique(strsplit(all_vars, "\\+|\\*|:")[[1]])
complete <- to.model %>%
dplyr::filter(get(patientID) %in% colnames(kin)) %>%
dplyr::select(tidyselect::all_of(c(libraryID, all_vars))) %>%
dplyr::distinct() %>%
tidyr::drop_na() %>% nrow()
if(run_lmerel){
miss.no <- lib.no-kin.miss-complete
if(miss.no>0){
message("- ", miss.no, " additional libraries missing fixed effect variable(s)") }
} else{
miss.no <- lib.no-complete
if(miss.no>0){
message("- ", miss.no, " libraries missing fixed effect variable(s)") }
}
message("Model: ",lib.no-kin.miss-miss.no, " libraries")
} else{
#Combine expression data (E) and sample metadata (targets)
to.model <- dat.subset$E %>%
tidyr::pivot_longer(-rowname, names_to = libraryID, values_to = "expression") %>%
dplyr::inner_join(dat.subset$targets, by=libraryID)
#Add weights if available
if(!is.null(dat.subset$weights)){
to.model <- to.model %>%
dplyr::left_join(tidyr::pivot_longer(dat.subset$weights, -rowname,
names_to = libraryID, values_to = "gene_weight"),
by=c("rowname", libraryID))
} else{
to.model <- to.model %>%
dplyr::mutate(gene_weight = NA)
}
kin.subset <- NULL
#Total samples messages
##total libraries
lib.no <- nrow(dat.subset$targets)
##total donors
if(patientID %in% colnames(dat.subset$targets)){
pt.no <- dat.subset$targets %>%
dplyr::distinct(get(patientID)) %>% nrow()
message("Input: ", lib.no, " libraries from ", pt.no, " unique patients")
} else {
message("Input: ", lib.no, " libraries")
}
## Missing data
all_vars <- gsub("expression~", "", model_lm)
all_vars <- unique(strsplit(all_vars, "\\+|\\*|:")[[1]])
if(!is.null(genotype_name)){ all_vars <- c(all_vars, genotype_name)}
complete <- to.model %>%
dplyr::select(tidyselect::all_of(c(libraryID, all_vars))) %>%
dplyr::distinct() %>%
tidyr::drop_na() %>% nrow()
miss.no <- lib.no-complete
if(miss.no>0){message("- ", miss.no, " libraries missing fixed effect variable(s)") }
message("Model: ", lib.no-miss.no, " libraries")
}
#Put back ptID if was renamed as libID
# if(!(patientID %in% colnames(to.model))){ to.model[[patientID]] <- to.model$libID }
#Combine for saving
to.model.ls <- list()
to.model.ls[["to.model"]] <- to.model
to.model.ls[["kin.subset"]] <- kin.subset
to.model.ls[["dat.subset"]] <- dat.subset
return(to.model.ls)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.