rmDupGenes <- function(data_dt, idCol_v = "ID", symbolCol_v = "Symbol", method_v = "max") {
  #' Remove Duplicate Genes
  #' @description Can't make data.frames with duplicate rownames, so rows that share a gene
  #' symbol are collapsed into a single row.
  #' @param data_dt data.table of gene expression. Every column other than idCol_v and
  #' symbolCol_v is treated as a numeric count column.
  #' @param idCol_v Name of an identifier column that is excluded from the count columns.
  #' With method_v = "mean" the value from the first duplicate row is kept.
  #' @param symbolCol_v Name of column that contains gene identifiers
  #' @param method_v One of "max" (default) or "mean". "max" keeps, for each duplicated gene,
  #' the row with the highest mean count; "mean" averages the count columns over the
  #' duplicate rows (ignoring NA).
  #' @return data.table holding the non-duplicated rows in their original order, followed by
  #' one row per previously duplicated gene. Rows with NA in symbolCol_v are left untouched.
  #' @export

  ## Check arguments before doing any work, so that a bad method is reported even when
  ## there are no duplicates to process
  if (!isTRUE(method_v %in% c("max", "mean"))) {
    stop(sprintf("Please proved either 'max' or 'mean' as argument to method_v. You have provided: %s.",
                 paste(method_v, collapse = ", ")))
  }
  if (!isTRUE(symbolCol_v %in% colnames(data_dt))) {
    stop(sprintf("Column '%s' not found in data_dt.", paste(symbolCol_v, collapse = ", ")))
  }

  ## Get genes
  genes_v <- data_dt[[symbolCol_v]]
  ## Get which have > 1 observation. sort() also drops NA, so missing symbols are never collapsed.
  dupGenes_v <- as.character(sort(unique(genes_v[duplicated(genes_v)])))
  ## Get count columns
  countCol_v <- setdiff(colnames(data_dt), c(idCol_v, symbolCol_v))
  otherCol_v <- setdiff(colnames(data_dt), countCol_v)

  ## Build one output row per duplicated gene. Iterating over the vector itself means
  ## nothing happens when there are no duplicates (1:0 would have run twice).
  keepRows_lsdt <- lapply(dupGenes_v, function(currGene_v) {
    ## Get gene and subset
    currIdx_v <- which(genes_v == currGene_v)
    currData_dt <- data_dt[currIdx_v, ]
    ## Make output row
    if (method_v == "max") {
      currMean_v <- rowMeans(currData_dt[, countCol_v, with = FALSE])
      currMax_v <- which.max(currMean_v)
      ## which.max() returns nothing when every row mean is NA; keep the first row
      ## rather than silently dropping the gene
      if (length(currMax_v) == 0L) currMax_v <- 1L
      currData_dt[currMax_v, ]
    } else {
      currMean_dt <- currData_dt[, lapply(.SD, mean, na.rm = TRUE), .SDcols = countCol_v]
      cbind(currData_dt[1, otherCol_v, with = FALSE], currMean_dt)
    }
  })

  ## Remove duplicate rows
  keepIdx_v <- which(!(genes_v %in% dupGenes_v))
  data_dt <- data_dt[keepIdx_v, ]

  ## Add back chosen rows. Bind by name: the "mean" rows are ordered otherCol_v then
  ## countCol_v, which need not match the column order of data_dt.
  if (length(keepRows_lsdt) > 0L) {
    keepRows_dt <- rbindlist(keepRows_lsdt, use.names = TRUE)
    data_dt <- rbind(data_dt, keepRows_dt, use.names = TRUE)
  }

  ## Return
  return(data_dt)
}
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.