R/parseKaryotype.R

Defines functions parseKaryotype

Documented in parseKaryotype

#' Parse FISH karyotype strings
#'
#' Parses FISH karyotype strings into individual clones.
#'
#' @param data A data frame with columns named "PatientMRN", "PathNum",
#' "PathInstitution", "PathDate", and "PathKaryotype".
#' @param probes A data frame with columns named "probe" and "chr" that defines
#' the gene name and chromosomal location of the gene (e.g. 17p)
#' @param unmatched A Boolean indicating if unmatched reports should be included in the output.
#' If TRUE, then reports for which no FISH strings are detected or pattern matching cannot be performed are included in the output.
#' @param karyotype A Boolean indicating if the original karyotype string should be included in the output.
#' If TRUE, then the karyotype string is included.
#' @param deduplicate A Boolean indicating if duplicate clones with the same abnormality should be removed.  If TRUE, then duplicate
#' clones are removed and the columns for Gene and Copy number of omited from the output.  Be aware that when this option is set to TRUE, it will increase the computation time.
#' @return Returns a data frame where each row represents a clone from the
#' FISH karyotype string.
#' @details The five patterns below are recognized.  Pattern matching is case insensative.  Matched patterns are exluded if they do not contain a known probe.
#' \describe{
#' \item{pattern1}{(...)x#[...]}
#' \item{pattern2}{(...)[...]}
#' \item{pattern3}{(...)x#(...)[...]}
#' \item{pattern4}{(...)(...)x#[...]}
#' \item{pattern5}{(...)(...)[...]}
#' }
#' @export
#' @importFrom plyr ldply
#' @importFrom stringr str_extract_all str_extract
#' @importFrom stats na.omit
parseKaryotype <- function(data, probes, unmatched = TRUE, karyotype = TRUE, deduplicate = TRUE) {
    data <- data[!(is.na(data$PathKaryotype)),]
    if(nrow(data) == 0) {
        stop("No FISH strings found.", call. = FALSE)
    }
    # Create regular expression pattern to match strings between brackets and parentheses
    pattern.clone <- "\\([^()]+\\)x[[:digit:]]\\[[^()]{1,9}\\]|\\([^()]+\\)\\[[^()]+]|\\([^()]+\\)x[[:digit:]]\\([^()]+\\)\\[[^()]{1,9}]|\\([^()]+\\)\\([^()]+\\)x[[:digit:]]\\[[^()]{1,9}]|\\([^()]+\\)\\([^()]+\\)\\[[^()]{1,9}]"

    # Extract clones using pattern and convert to a list
    clones.list <- stringr::str_extract_all(string = data$PathKaryotype, pattern = regex(pattern.clone, ignore_case = TRUE))
    names(clones.list) <- data$PathNum

    # Extract probe specific clones
    clones <- data.frame()
    i <- 1
    for(i in 1:nrow(probes)){
        # Seperate clones into rows
        clone <- plyr::ldply(lapply(clones.list, function(x) grep(x, pattern = probes$probe[i], value = TRUE)), cbind)
        names(clone) <- c("PathNum", "Clone")
        clone$Gene <- rep(probes$probe[i], nrow(clone))

        # Parse copy number and clonal frequency
        pattern.copy.number.one.digit <- paste("\\)x[[:digit:]]", paste("IGH ?con ?", probes$probe[i], " ?x[[:digit:]]", sep = ""), paste0(probes$probe[i],c("\\)x[[:digit:]]", "x[[:digit:]]", " ?con ?IGH ?x[[:digit:]]"), collapse = "|"), sep = "|")
        pattern.copy.number.ranges1 <- paste("\\)x[[:digit:]]~[[:digit:]]", paste("IGH ?con ?", probes$probe[i], " ?x[[:digit:]]~[[:digit:]]", sep = ""), paste0(probes$probe[i],c("\\)x[[:digit:]]~[[:digit:]]", "x[[:digit:]]~[[:digit:]]", " ?con ?IGH ?x[[:digit:]]~[[:digit:]]"), collapse = "|"), sep = "|")
        pattern.copy.number.ranges2 <- paste("\\)x[[:digit:]]-[[:digit:]]", paste("IGH ?con ?", probes$probe[i], " ?x[[:digit:]]-[[:digit:]]", sep = ""), paste0(probes$probe[i],c("\\)x[[:digit:]]-[[:digit:]]", "x[[:digit:]]-[[:digit:]]", " ?con ?IGH ?x[[:digit:]]-[[:digit:]]"), collapse = "|"), sep = "|")
        pattern.copy.number = paste(pattern.copy.number.ranges1, pattern.copy.number.ranges2, pattern.copy.number.one.digit, sep = "|")
        clone$Copy <- unlist(stringr::str_extract(clone$Clone, pattern = regex(pattern.copy.number, ignore_case = TRUE)))
        clone$Copy <- stringr::str_extract(clone$Copy, pattern = regex(pattern = "x[[:digit:]]~[[:digit:]]|x[[:digit:]]-[[:digit:]]|x[[:digit:]]", ignore_case = TRUE))
        clone$Count <- stringr::str_extract(clone$Clone, pattern = "[0-9]{1,4}/[0-9]{1,4}|\\[[0-9]{1,4}\\]")
        clone$Count <- gsub(clone$Count, pattern = "\\[|\\]", replacement = "")
        clone$Frequency <- ifelse(grepl(clone$Count, pattern = "\\/"), sapply(clone$Count, function(x) eval(parse(text=x))), 1)

        # Define copy number abnormalities
        clone$Abnormality <- ifelse(grepl(clone$Copy, pattern = "x0|x1|x1~2|x1-2"), paste(probes$chr[i], "-", sep = ""),
                                    ifelse(clone$Copy == "x2", "None", paste(probes$chr[i], "+", sep = "")))

        # Define translocations
        clone$Abnormality <- ifelse(grepl(clone$Clone, pattern = "con", ignore.case = TRUE) & grepl(clone$Clone, pattern = "CCND1"), "t(11;14)",
                                    ifelse(grepl(clone$Clone, pattern = "con", ignore.case = TRUE) & grepl(clone$Clone, pattern = "FGFR3"), "t(4;14)",
                                           ifelse(grepl(clone$Clone, pattern = "con", ignore.case = TRUE) & grepl(clone$Clone, pattern = "MYC"), "t(8;14)",
                                                  ifelse(grepl(clone$Clone, pattern = "con", ignore.case = TRUE) & grepl(clone$Clone, pattern = "MAF"), "t(14;16)",
                                                         ifelse(grepl(clone$Clone, pattern = "con|sep", ignore.case = TRUE) & grepl(clone$Clone, pattern = "5'IGH|5' IGH"), "IGH rearrangement",
                                                                ifelse(grepl(clone$Clone, pattern = "sep", ignore.case = TRUE) & grepl(clone$Clone, pattern = "MYC"), "MYC rearrangement", clone$Abnormality))))))

        # Merge all clones with metadata
        metadata <- unique(data[,c("PatientMRN", "PathNum", "PathInstitution", "PathDate")])
        clone <- merge(metadata, clone, all = FALSE)
        clones <- rbind(clones, clone)
    }
    clones <- na.omit(clones)

    # Omit +14q and IGH rearrangements if there is a chr 14 translocation
    reportID <- levels(as.factor(clones$PathNum))
    reports <- data.frame()
    i <- 1
    for(i in 1:length(reportID)){
        report <- clones[clones$PathNum == reportID[i], ]
        report$Abnormality <- ifelse(grepl(report$Abnormality, pattern = "14q32.3\\+|IGH rearrangement") & any(grepl(report$Abnormality, pattern = "t\\(")), "None", report$Abnormality)

        # Merge all reports
        reports <- rbind(report, reports)
    }

    # Collapse duplicates
    if(deduplicate == TRUE) {
        dedup.reports <- data.frame()
        i <- 1
        for(i in 1:length(unique(reports$PathNum))){
            report <- reports[reports$PathNum == unique(reports$PathNum)[i],]
            unique.clones <- data.frame()
            j <- 1
            for(j in 1:length(unique(report$Clone))){
                unique.clone <- report[report$Clone == unique(report$Clone)[j], c("PathNum", "PatientMRN", "PathInstitution", "PathDate", "Clone", "Count", "Frequency", "Abnormality" )]
                if(nrow(unique.clone) > 1 & all(unique.clone$Abnormality == "None")){
                    unique.clone <- unique(unique.clone)
                }
                if(nrow(unique.clone) > 1 & any(unique.clone$Abnormality == "None")){
                    unique.clone <- unique.clone[unique.clone$Abnormality != "None", ]
                }
                unique.clones <- rbind(unique.clone, unique.clones)
            }
            dedup.reports <- rbind(unique.clones, dedup.reports)
        }
        reports <- dedup.reports
    }

    # Add unmatched reports
    if(unmatched == TRUE){
        unmatched.reports <- data[data$PathNum %in% setdiff(unique(data$PathNum), unique(reports$PathNum)), ]
        unmatched.reports$Clone <- ifelse(grepl(unmatched.reports$PathKaryotype, pattern = "nuc ish"), "Possible FISH detected, but pattern matching not possible", "FISH string not detected")
        reports <- merge(reports, unmatched.reports[,c("PathNum", "PatientMRN", "PathInstitution", "PathDate", "Clone")], all = TRUE)
    }

    # Include original karyotype string
    if(karyotype == TRUE){
        data.aggregate <- aggregate(data = data, PathKaryotype~PathNum, FUN = function(x) paste(unique(x), collapse="; "))
        reports <- merge(reports, data.aggregate[,c("PathNum", "PathKaryotype")])
    }

    # Sort by MRN, date, and clone
    reports = reports[order(reports$PatientMRN, reports$PathDate, reports$Clone), ]
    rownames(reports) = NULL
    return(reports)
}
davidcoffey/Diamonds documentation built on March 8, 2020, 12:34 a.m.