R/fix_bad_mgi_symbols.r

Defines functions fix_bad_mgi_symbols

Documented in fix_bad_mgi_symbols

#' fix_bad_mgi_symbols
#' - Given an expression matrix, wherein the rows are supposed to be MGI
#' symbols, find those symbols which are not official MGI symbols, then
#' check in the MGI synonm database for whether they match to a proper MGI
#' symbol. Where a symbol is found to be an aliases for a gene that is already
#' in the dataset, the combined reads are summed together.
#'
#' Also checks whether any gene names contain "Sep", "Mar" or "Feb".
#' These should be checked for any suggestion that excel has corrupted the
#' gene names.
#' @param exp An expression matrix where the rows are MGI symbols, or a
#'  SingleCellExperiment (SCE) or
#'  other Ranged Summarized Experiment (SE) type object.
#' @param mrk_file_path Path to the MRK_List2 file which can be downloaded
#' from www.informatics.jax.org/downloads/reports/index.html
#' @param printAllBadSymbols Output to console all the bad gene symbols
#' @param as_sparse Convert \code{exp} to sparse matrix.
#' @param verbose Print messages.
#' @param localHub If working offline, add argument localHub=TRUE to work 
#' with a local, non-updated hub; It will only have resources available that
#' have previously been downloaded. If offline, Please also see BiocManager
#' vignette section on offline use to ensure proper functionality. 
#'
#' @returns Returns the expression matrix with the rownames corrected and rows
#' representing the same gene merged. If no corrections are necessary, input
#' expression matrix is returned. If a SingleCellExperiment (SCE) or other
#' Ranged Summarized Experiment (SE) type object was inputted this will be
#' returned with the corrected expression matrix under counts.
#'
#' @examples
#' # Load the single cell data
#' cortex_mrna <- ewceData::cortex_mrna()
#' # take a subset for speed
#' cortex_mrna$exp <- cortex_mrna$exp[1:50, 1:5]
#' cortex_mrna$exp <- fix_bad_mgi_symbols(cortex_mrna$exp)
#' @export
#' @importFrom SummarizedExperiment rowRanges assays
#' @importFrom methods as is
#' @importFrom utils read.csv
fix_bad_mgi_symbols <- function(exp,
                                mrk_file_path = NULL,
                                printAllBadSymbols = FALSE,
                                as_sparse = TRUE,
                                verbose = TRUE,
                                localHub = FALSE) {
    # Check arguments
    err_msg <- paste0(
        "'exp' is null. It should be a numerical",
        " matrix with the rownames being MGI symbols."
    )
    if (is.null(exp)) {
        stop(err_msg)
    }
    # Check if input is an SCE or SE object
    res_sce <- check_sce(exp)
    exp <- res_sce$exp
    SE_obj <- res_sce$SE_obj
    SE_exp <- res_sce$SE_exp

    err_msg2 <- paste0(
        "Input 'exp' should not contain factors.",
        " Perhaps stringsAsFactors was not set while loading."
    )
    if (!is.null(levels(exp[1, 3]))) {
        stop(err_msg2)
    }
    warn_msg <- paste0(
        "Warning: Input 'exp' stored as characters. Converting",
        " to numeric. Check that it looks correct."
    )
    #### Convert to data.table --> data.frame ####
    exp <- dt_to_df(exp = exp)
    #### Convert characters to numbers ####
    exp <- check_numeric(exp = exp)
    # Check that exp is not some weird input format like
    # those generated by readr functions
    if (!any(
        is_sparse_matrix(exp),
        is_matrix(exp),
        is_delayed_array(exp),
        is.data.frame(exp)
    )) {
        stop(
            "exp must be either a data.frame",
            " matrix (sparse or dense), DelayedArray."
        )
    }
    # Check for symbols which are not real MGI symbols
    all_mgi <- ewceData::all_mgi(localHub = localHub)
    not_MGI <- rownames(exp)[!rownames(exp) %in% all_mgi]
    messager(sprintf("%s rows do not have proper MGI symbols", length(not_MGI)),
        v = verbose
    )
    if (length(not_MGI) > 20) {
        messager(paste(not_MGI[seq_len(20)], collapse = ", "), v = verbose)
    } else {
        messager(paste(not_MGI, collapse = ", "), v = verbose)
    }
    # if no improper MGI symbols return input
    if (length(not_MGI) == 0) {
        if (SE_obj) { # if SCE/SE obj inputted return that
            return(SE_exp)
        } else {
            return(exp)
        }
    }
    # Checking for presence of bad date genes, i.e. Sept2 --> 02.Sep
    date_like <- not_MGI[grep("Sep|Mar|Feb", not_MGI)]
    if (length(date_like) > 0) {
        warning(
            sprintf(
                "Possible presence of excel corrupted date-like genes: %s",
                paste(date_like, collapse = ", ")
            )
        )
    }

    err_msg3 <- paste0(
        "The file path used in mrk_file_path does not",
        " direct to a real file. Either leave this argument",
        " blank or direct to an MRK_List2.rpt file downloaded",
        " from the MGI website. It should be possible to ",
        "obtain from: http://www.informatics.jax.org/",
        "downloads/reports/MRK_List2.rpt"
    )
    err_msg4 <- paste0(
        "The MRK_List2.rpt file does not seem to have",
        " a column named 'Marker.Synonyms..pipe.separated.'"
    )
    # Load data from MGI to check for synonyms
    if (is.null(mrk_file_path)) {
        mgi_data <- ewceData::mgi_synonym_data(localHub = localHub)
    } else {
        if (!file.exists(mrk_file_path)) {
            stop(err_msg3)
        }
        mgi_data <- utils::read.csv(mrk_file_path,
            sep = "\t",
            stringsAsFactors = FALSE
        )
        mgi_synonym_data <- ewceData::mgi_synonym_data(localHub = localHub)
        if (!"Marker.Synonyms..pipe.separated." %in%
            colnames(mgi_synonym_data)) {
            stop(err_msg4)
        }
        # file is downloaded from:
        # http://www.informatics.jax.org/downloads/reports/index.html
        mgi_data <- mgi_data[!mgi_data$Marker.Synonyms..pipe.separated. == "", ]
    }
    # If there are too many genes in not_MGI then the grep crashes...
    # so try separately
    stepSize <- 500
    if (length(not_MGI) > stepSize) {
        lower <- 1
        upper <- stepSize
        for (i in seq_len(ceiling(length(not_MGI) / stepSize))) {
            if (upper > length(not_MGI)) {
                upper <- length(not_MGI)
            }
            use_MGI <- not_MGI[lower:upper]
            tmp <- grep(
                paste(use_MGI, collapse = ("|")),
                mgi_data$Marker.Synonyms..pipe.separated.
            )
            if (i == 1) {
                keep_rows <- tmp
            } else {
                keep_rows <- c(keep_rows, tmp)
            }
            lower <- lower + stepSize
            upper <- upper + stepSize
        }
    } else {
        keep_rows <- grep(
            paste(not_MGI, collapse = ("|")),
            mgi_data$Marker.Synonyms..pipe.separated.
        )
    }
    countBottom <- countTop <- 0
    # Count how many "|" symbols are in
    # "mgi_data$Marker.Synonyms..pipe.separated" to determine how many rows
    # the dataframe needs
    allSYN <- matrix("", nrow = length(keep_rows) +
        sum(stringr::str_count(
            mgi_data$Marker.Synonyms..pipe.separated.,
            "\\|"
        )), ncol = 2)
    colnames(allSYN) <- c("mgi_symbol", "syn")
    for (i in keep_rows) {
        # if(i %% 100 == 0){print(i)}
        syn_data <- unlist(strsplit(
            mgi_data$Marker.Synonyms..pipe.separated.[i],
            "\\|"
        ))
        tmp <- data.frame(
            mgi_symbol = mgi_data[i, ]$Marker.Symbol,
            syn = syn_data,
            stringsAsFactors = FALSE
        )
        countBottom <- countTop + 1
        countTop <- countBottom + dim(tmp)[1] - 1
        allSYN[countBottom:countTop, 1] <- tmp[, 1]
        allSYN[countBottom:countTop, 2] <- tmp[, 2]
    }
    allSYN <- data.frame(allSYN)
    matchingSYN <- allSYN[allSYN$syn %in% not_MGI, ]
    matchingSYN <- matchingSYN[!duplicated(matchingSYN$syn), ]
    matchingSYN <- matchingSYN[as.character(matchingSYN$mgi_symbol) !=
        as.character(matchingSYN$syn), ]
    rownames(matchingSYN) <- matchingSYN$syn

    # Check for duplicates of existing genes
    dupGENES <- as.character(matchingSYN$mgi_symbol[matchingSYN$mgi_symbol %in%
        rownames(exp)])
    msg <- paste0(
        "%s poorly annotated genes are replicates of existing genes.",
        " These are: "
    )
    messager(sprintf(msg, length(unique(dupGENES))), v = verbose)
    messager(paste(unique(dupGENES), collapse = ", "), v = verbose)
    #### Replace mis-used synonyms from the expression data ####
    exp <- as.matrix(exp) # IMPORTANT! Allows apply() to work
    exp_Good <- exp[!rownames(exp) %in% as.character(matchingSYN$syn), ]
    exp_Bad <- exp[as.character(matchingSYN$syn), ]
    prelim_exp_Bad <- exp_Bad
    # exp_Bad is a vector if only one bad value so need to convert to a
    # one line matrix
    if (methods::is(exp_Bad, "numeric")) {
        exp_Bad <- Matrix::t(as.matrix(exp_Bad))
        rownames(exp_Bad) <- as.character(matchingSYN$syn)
    }
    # Where duplicates exist, sum them together
    for (dG in unique(dupGENES)) {
        old_good_dG <- exp_Good[rownames(exp_Good) == dG, ]
        exp_Good[rownames(exp_Good) == dG, ] <-
            apply(rbind(
                exp_Good[rownames(exp_Good) == dG, ],
                exp_Bad[as.character(matchingSYN$mgi_symbol) ==
                    dG, ]
            ), 2, sum)
    }
    exp_Bad <- exp_Bad[!as.character(matchingSYN$mgi_symbol) %in% dupGENES, ]
    matchingSYN_deDup <-
        matchingSYN[!as.character(matchingSYN$mgi_symbol) %in% dupGENES, ]
    dropDuplicatedMislabelled <-
        as.character(matchingSYN_deDup$syn[
            duplicated(matchingSYN_deDup$mgi_symbol)
        ])
    matchingSYN_deDup <-
        matchingSYN_deDup[
            !matchingSYN_deDup$syn %in% dropDuplicatedMislabelled,
        ]
    exp_Bad <- exp_Bad[!rownames(exp_Bad) %in% dropDuplicatedMislabelled, ]
    # Replace bad names with mgi symbols
    syn_exp_Bad <- rownames(exp_Bad)
    exp_Bad_old <- exp_Bad
    rownames(exp_Bad) <- as.character(matchingSYN_deDup$mgi_symbol)
    # combine results and return
    new_exp <- rbind(exp_Good, exp_Bad)
    new_exp <- to_sparse_matrix(
        exp = new_exp,
        as_sparse = as_sparse,
        verbose = verbose
    )
    messager(sprintf(
        "%s rows should have been corrected by checking synonyms.",
        dim(matchingSYN)[1]
    ), v = verbose)
    still_not_MGI <- sort(rownames(new_exp)[!rownames(new_exp) %in% all_mgi])
    messager(sprintf(
        "%s rows STILL do not have proper MGI symbols.",
        length(still_not_MGI)
    ), v = verbose)
    if (printAllBadSymbols == TRUE) {
        messager(paste(still_not_MGI, collapse = ", "), v = verbose)
    }
    # Now filter results in SE/SCE obj if inputted and return it
    if (SE_obj) {
        # Update all annotation datasets by replacing by corrected counts, add
        # in annotation and meta data if available
        SE_exp <- SE_exp[seq_len(nrow(new_exp)), ] # match the number of rows
        names(SummarizedExperiment::rowRanges(SE_exp)) <-
            rownames(new_exp) # Update gene names
        SummarizedExperiment::assays(SE_exp) <- list(counts = new_exp)
        return(SE_exp)
    }
    return(new_exp)
}
NathanSkene/EWCE documentation built on April 10, 2024, 1:02 a.m.