#' Apply NLP to genomic reports to extract copy number variants (CNVs) from oncopanel reports
#' @description
#' `cnv()` provides natural language processing tools to abstract copy number variants data from oncopanel genomic reports
#' @param data The data frame of the genomic report of interest. This can be copied to the Clipboard from the EHR report by selecting the relevant text and then "copy". That is the default. If you don't use the clipboard function, you can use a data frame of the text file. The single column data frame should be labeled "Results". Required.
#' @return a data frame with two columns: variables (the redcap variable names) and results (the data to be imported in to redcap)
#' @export
#'
cnv.onco <- function(data = dplyr::tibble(Results = readr::clipboard())){
##########################################################################################################################
# load data
##########################################################################################################################
dt <- data
##########################################################################################################################
# Create regexs that we will use to select our data
##########################################################################################################################
# Keep only those rows with the following
## call the genes_regex function to create a regex of all the gene names
genes_boundary_df <- genetex::genes_boundary_regex() # this is a df of unique genes names concatenated with "|"
genes_boundary_regex <- genes_boundary_df$genes # this is a character string of the genes for our regex
genes_df <- genetex::genes_regex() # this is a df of genes names concatenated with "|" (no word boundaries; "greedy")
genes_regex <- genes_df$genes # this is a character string of the genes for our regex
nuc_regex <- "[ACTG]>[ACTG]|del[ACTG]" # This regex should identify those rows that have nucleotide changes
aa_regex <- "(\\b([A-Z][0-9]{1,}(([A-Z])|(_[A-Z][1-9]{1,}del)|(fs\\*[1-9]{1,})|(\\*)|(fs)|(del)))|(p\\.[A-Z]))|([0-9]ins[A-Z])"
# This regex will detect all the possible AA alterations with single initials
genes_boundary_nuc_aa_regex <- paste(genes_boundary_regex, nuc_regex, aa_regex, sep = "|") # concatenate several regexs
# Assemble a regex_discard to perform an initial screen
regex_discard <- "H3K27|Sf3b1|H3B-6545|\\bfusion\\b|\\bthe\\b|\\bincluding\\b|\\bcells\\b|\\bcancer\\b"
gain_loss_amplification_regex <- "gain|loss|amplification"
genes_gain_loss_amplification_regex <- paste(genes_boundary_regex, gain_loss_amplification_regex, sep = "|")
##########################################################################################################################
# Initial cleaning and tokenization
##########################################################################################################################
dt.1a <- dt
dt.1a$Results <- stringr::str_trim(string = dt.1a$Results) # Trim white space
# We need to eliminate much of this report in order to mine only those gene alterations relevant to each subject
## So we will use the word "references" as a divider and take only those rows that are above it
references_interpretation_definitions <- "references|interpretation|definitions"
dt.1b <- dt.1a %>%
mutate(divider = stringr::str_detect(string = dt.1a$Results,
pattern = stringr::regex(references_interpretation_definitions, ignore_case = TRUE)),
group = base::cumsum(divider)) %>%
dplyr::filter(group == 0)
# Perform a clean to remove rows that have problematic strings
dt.1c <- dt.1b %>% dplyr::filter(!stringr::str_detect(string = dt.1b$Results,
pattern = stringr::regex(regex_discard, ignore_case = FALSE)))
# remove ?
dt.1d <- dt.1c
dt.1d$Results <- stringr::str_replace_all(string = dt.1c$Results, pattern = "\\?", replacement = "")
# tokenize the vector with space
dt.1 <- splitstackshape::cSplit(
indt = dt.1d,
splitCols = "Results",
sep = " ",
direction = "long"
)
# now let's create an object with a divider and a group based on that divider
## To do that, we will create two additional vectors, first is a logical vector "divider" that will result TRUE if the
### token in the "word" vector is contained in the "keyword" vector. The "group" vector, then will perform a cumulative
#### sum of the "divider" vector, such that all those tokens that follow a specific keyword will have the same "group",
##### we can then use this group to tag those gene names with the specific type of alteration they had
dt.2 <- dt.1 %>% dplyr::mutate(divider = stringr::str_detect(string = dt.1$Results,
pattern = stringr::regex(gain_loss_amplification_regex, ignore_case = TRUE)),
group = base::cumsum(divider))
# Now let's create an important column that will serve as a look up table of sorts for the specific group that is
## associated with the gene alteration (e.g. "Gain" or "Loss")
dt.3 <- dt.2 %>% dplyr::mutate(results_group = paste(Results,group, sep = "_"))
# To refine our search, we will narrow our range. The oncopanel form has "cytoband" before our relevant section and
## "interpretation" after, so we can define our range that way as well
cytoband_interpretation_regex <- "Cytoband|INTERPRETATION"
### We will use the same strategy as above with the keywords look up table and the cumulative sum grouping
dt.4a <- dt.3 %>% dplyr::mutate(divider_1 = stringr::str_detect(string = dt.3$Results, pattern = stringr::regex(cytoband_interpretation_regex)),
group_1 = cumsum(divider_1)) %>%
dplyr::filter(group_1 == 1)
## drop the "," at the end of the gene names
dt.4 <- dt.4a
dt.4$Results <- stringr::str_replace(string = dt.4$Results, pattern = ",$", replacement = "")
# Now we need a way to identify each of these groups (i.e. "group_1") with the appropriate gene modification
## e.g. "Gain" or "Loss"
### To do this, we will create an object that is a vector of only those "results_group"s that contain "Gain"; thus
#### all the numbers in this group that follow the "_" will be the "group_1" numbers associated with "Gain
gain_idx <- dt.4 %>% dplyr::select(results_group) %>% dplyr::filter(stringr::str_detect(string = dt.4$results_group, pattern = "Gain"))
# Let's remove the "Gain_" part so we can identify the numbers that will be the look up table numbers for "Gain"
gain_idx$results_group <- str_remove_all(string = gain_idx$results_group, pattern = "Gain_") %>% as.numeric() # and let's turn them into numerics
# rename "results_group" "group" so that it can link up with the appropriate column in "divided.1"
gain_idx <- gain_idx %>% rename(group = results_group)
# Add a "cnv" column and populate the cells with "gain"
## "cnv" will be used for all three types of cnvs
gain_idx <- gain_idx %>% dplyr::mutate(cnv = "gain")
# We are now going to execute an analogous set of steps with "Amplification"
amplification_idx <- dt.4 %>% dplyr::select(results_group)
amplification_idx <- amplification_idx %>% dplyr::filter(str_detect(string = results_group, pattern = "Amplification"))
amplification_idx$results_group <- stringr::str_remove_all(string = amplification_idx$results_group, pattern = "Amplification_") %>% as.numeric()
amplification_idx <- amplification_idx %>% dplyr::rename(group = results_group)
amplification_idx <- amplification_idx %>% dplyr::mutate(cnv = "amplification")
# We are now going to execute an analogous set of steps with "Loss"
loss_idx <- dt.4 %>% dplyr::select(results_group) %>% dplyr::filter(str_detect(string = results_group, pattern = "Loss"))
loss_idx$results_group <- stringr::str_remove_all(string = loss_idx$results_group, pattern = "Loss_") %>% as.numeric()
loss_idx <- loss_idx %>% dplyr::rename(group = results_group)
loss_idx <- loss_idx %>% dplyr::mutate(cnv = "loss")
# combine these three tables with "dt.4" to create a "cnv" table that we can use to ID the genes associated
## with "Gain" "Loss" "Amplification"
cnv <- left_join(dt.4, gain_idx)
cnv <- left_join(cnv,amplification_idx, by = c("group"))
cnv <- left_join(cnv, loss_idx, by = "group")
cnv <- cnv %>% unite("cnv_label",
cnv,
cnv.x,
cnv.y,
na.rm = TRUE,
remove = TRUE
)
##########################################################################################################################
# Let's create the keyword list of genes that we are going to use to pull out only those rows in "cnv" that have Genes
## as tokens
# Now we will pull out the rows that have gene names in the "Results" column
cnv.1 <- cnv %>% dplyr::filter(stringr::str_detect(string = cnv$Results, pattern = stringr::regex(genes_regex)))
# Let's select only "Results" and "cnv_label" and rename "Results" into "gene"
cnv.2 <- cnv.1 %>% dplyr::select(Results, cnv_label) %>% dplyr::rename(gene = Results)
## Thus, cnv.2 is the dataframe that contains those genes with gains, losses or amplifications
### But because in our REDCap tool, amplifications are separate from gains or losses, we will remove them from our data
####frame
##### Let's create a separate table for amplifications since these have their own variables
amplifications <- cnv.2 %>% dplyr::filter(cnv_label == "amplification")
###### Use anti-join to remove "amplifications" from cnv.2
cnv.3a <- dplyr::anti_join(cnv.2, amplifications)
cnv.3 <- if(is.na(cnv.3a[1,1])) base::data.frame(gene = NA, cnv_label = NA) else cnv.3a
###### Now let's modify "cnv.3" to set it up with the appropriate variable names for the import_tool
cnv.3 <- cnv.3 %>% dplyr::mutate(cnv_gene = "cnv_gene")
cnv.3 <- cnv.3 %>% dplyr::mutate(cnv_gain_or_loss = "cnv_gain_or_loss")
cnv.3 <- cnv.3 %>% dplyr::mutate(number = 1:nrow(cnv.3))
cnv.4a <- cnv.3 %>% dplyr::mutate(cnv_gene = paste(cnv_gene, number, sep = "_"))
cnv.4 <- cnv.4a %>% dplyr::mutate(cnv_gain_or_loss = paste(cnv_gain_or_loss, number, sep = "_"))
####### rename variables to set up for combining with import_tool
######## cnv.gene is a df that contains the name of the gene side-by-side with the variable name
cnv.gene <- cnv.4 %>% dplyr::select(gene, cnv_gene) %>% dplyr::rename(variables = cnv_gene) %>% dplyr::rename(results = gene) %>% dplyr::select(variables, results)
######## cnv.gene.gain.or.loss is a df that contains the type of cnv side-by-side with the variable name
cnv.gene.gain.or.loss <- cnv.4 %>% dplyr::select(cnv_label, cnv_gain_or_loss) %>% dplyr::rename(variables = cnv_gain_or_loss) %>% dplyr::rename(results = cnv_label) %>% dplyr::select(variables, results)
##########################################################################################################################
# Let's continue with our processing of amplifications
## This is a bit more complicated because for some patient there will be no amplifications, so we need to come up with a
### way to prevent the code from producing an error if the amplifications are absent from the sample
#### we will solve this by adding a NA row as a default and then removing that row later
amplifications.1 <- amplifications %>% dplyr::select(gene)
##*******************************************************KEY STEP*******************************************************##
#### we will solve this problem of a potential empty df by adding a NA row as a default, adding a random column here
##### "blank" to then add an actual value so we can run the nrow(). If we don't have a value in at least one of the rows
###### then the code will produce an error. We will remove this later, so it doesn't affect the final product
amplifications.2 <- if(is.na(amplifications.1[1,1])) base::data.frame(gene = NA, amplification_gene = "amplifications_gene") else amplifications.1
amplifications.3 <- amplifications.2 %>% dplyr::mutate(number = 1:nrow(amplifications.2))
##### Similar to what we did above, we will build a variables column to associate these values with the appropriate
##### variable that we can import later
amplifications.3 <- amplifications.3 %>% dplyr::mutate(amplification_gene = "amplifications_gene")
amplifications.4 <- amplifications.3 %>% dplyr::mutate(variables = paste(amplification_gene, number, sep = "_")) %>%
dplyr::rename(results = gene) %>% dplyr::select(variables, results)
cnv.final <- base::rbind(cnv.gene, cnv.gene.gain.or.loss, amplifications.4)
# Add cnv_number to data frame
cnv.number.1 <- stringr::str_detect(string = cnv.final$variables, pattern = "cnv_gene")
cnv.number.2 <- sum(cnv.number.1)
cnv.number <- if(is.na(cnv.final[1,2])) 0 else cnv.number.2
cnv.number.df <- data.frame(variables = "cnv_number",
results = cnv.number)
# Add amplifications_number to data frame
amp.number.1 <- stringr::str_detect(string = cnv.final$variables, pattern = "amplifications_gene")
amp.number.2 <- sum(amp.number.1)
amp.number <- if(is.na(cnv.final[1,2])) 0 else amp.number.2
amp.number.df <- data.frame(variables = "amplifications_number",
results = amp.number)
# add cnv.number.df and amp.number.df to cnv.final
cnv.final <- rbind(cnv.final,
cnv.number.df,
amp.number.df)
# change to character vector
cnv.final$results <- base::as.character(cnv.final$results)
return(cnv.final)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.