data-raw/drug_annot/setup.R

library(data.table)
library(dplyr)
library(tidyr)
library(dseqr)

# function definitions

# non UTF-8 encoded character cause DT alerts on filtering
remove_non_utf8 <- function(df) {
    df[] <- lapply(df, function(x) {
        Encoding(x) <- "UTF-8"
        iconv(x, "UTF-8", "UTF-8",sub='')
    })

    return(df)
}

summarise_func <- function(x) {

    unqx <- stats::na.omit(x)
    unqx <- unlist(strsplit(unqx, '|', fixed = TRUE))
    unqx <- unique(unqx)

    # keep as NA if they all are
    if (!length(unqx)) return(NA_character_)

    # collapse distinct non-NA entries
    return(paste(unqx, collapse = ' | '))
}

rename_cols <- function(annot) {
    annot <- rename(annot,
                    `Clinical Phase` = clinical_phase,
                    MOA = moa,
                    Target = target,
                    `Disease Area` = disease_area,
                    Indication = indication,
                    Vendor = vendor,
                    `Catalog #` = catalog_no,
                    `Vendor Name` = vendor_name)

    return(annot)
}

destructure_title <- function(pdata, drop = TRUE, ...) {
    title_split <- strsplit(pdata$title, '_')

    pdata <- tibble::add_column(pdata,
                                'Compound'  = sapply(title_split, `[`, 1),
                                'Cell Line' = sapply(title_split, `[`, 2),
                                'Dose'      = sapply(title_split, `[`, 3),
                                'Duration'  = sapply(title_split, `[`, 4), ...)

    if (drop) pdata$title <- NULL
    return(pdata)
}

setup_genes_annot <- function(study) {

    pdata <- readRDS(paste0('inst/extdata/', study, '_genes_pdata.rds'))

    # add genecards link
    study_annot <- pdata %>%
        mutate(Genecards = gsub('^([^_]+)_.+?$', '\\1', title)) %>%
        mutate(Genecards = gsub('-oe|-lig|-sh', '', Genecards))

    # fix any discovered bad links
    study_annot[study_annot$Genecards == '61E3.4', "Genecards"] <- 'SMG1'

    # get gene description
    tx2gene <- qs::qread('inst/extdata/tx2gene.qs') %>%
        dplyr::filter(gene_name %in% study_annot$Genecards) %>%
        dplyr::select(description, gene_name)  %>%
        distinct()

    # remove duplicates
    tx2gene <- tx2gene |>
        group_by(gene_name) |>
        slice(1) |>
        ungroup()

    stopifnot(length(unique(tx2gene$gene_name)) == nrow(tx2gene))

    # add gene descriptions
    study_annot <- study_annot %>%
        dplyr::mutate(gene_name = toupper(Genecards)) %>%
        dplyr::left_join(tx2gene, na_matches = 'never') %>%
        dplyr::select(-gene_name)

    stopifnot(all.equal(study_annot$title, pdata$title))

    # keep what not already in pdata
    study_annot <- study_annot %>%
        dplyr::rename(Description = description) %>%
        dplyr::select(Description, Genecards)

    saveRDS(study_annot, paste0('inst/extdata/', study, '_genes_annot.rds'))

}

setup_annot <- function(annot, study) {

    increasing_phases <- c('Withdrawn', 'Preclinical', 'Phase 1', 'Phase 2', 'Phase 2 | GRAS', 'Phase 3', 'Launched', 'Launched | GRAS')

    pdata <- readRDS(paste0('inst/extdata/', study, '_pdata.rds'))
    pdata <- destructure_title(pdata, drop=FALSE, .after=1)

    # ensure pdata has unique titles
    pdata$title <- make.unique(pdata$title, sep='___')

    # pubchem_cid and pert_iname together have the most matches
    sum(pdata$Compound %in% annot$pert_iname)
    sum(pdata$`Pubchem CID` %in% annot$pubchem_cid)
    sum(pdata$`Pubchem CID` %in% annot$pubchem_cid | pdata$Compound %in% annot$pert_iname)

    # first get matches on cid
    cat('Matching on cids ... \n')
    annot_cids <-  pdata %>%
        select(title, Compound, 'Pubchem CID') %>%
        inner_join(annot, by = c('Pubchem CID' = 'pubchem_cid'), na_matches = 'never')

    # where there wasn't a cid match try on pert_iname
    cat('Matching on Broad internal names ... \n')
    study_annot <- pdata %>%
        select(title, 'Pubchem CID', 'Compound') %>%
        anti_join(annot, by = c('Pubchem CID' = 'pubchem_cid'), na_matches = 'never') %>%
        left_join(annot, by = c('Compound' = 'pert_iname'), na_matches = 'never') %>%
        bind_rows(annot_cids)


    # fill in na values with non-na value by cid and name
    cat('Filling in NA by CIDs and compound names ... \n')
    study_annot <- study_annot %>%
        group_by(`Pubchem CID`) %>%
        fill(everything()) %>%
        fill(everything(), .direction = 'up') %>%
        ungroup() %>%
        group_by(Compound) %>%
        fill(everything()) %>%
        fill(everything(), .direction = 'up') %>%
        ungroup()

    # use most advanced phase if same title has multiple phases
    cat('Selecting most advanced clinical phase for each title ... \n')
    phase <- study_annot %>%
        select(title, clinical_phase) %>%
        mutate(clinical_phase = factor(clinical_phase, increasing_phases, ordered = TRUE)) %>%
        group_by(title) %>%
        summarise(clinical_phase = max(clinical_phase))

    # merge rows that have the same title
    cat('Merging rows with the same title ... \n')
    study_annot <- study_annot %>%
        select(-clinical_phase) %>%
        group_by(title) %>%
        summarise_all(summarise_func) %>%
        left_join(phase, by = 'title', na_matches = 'never') %>%
        select(title, `Pubchem CID`, clinical_phase, everything()) %>%
        arrange(match(title, pdata$title))

    # check that study_annot and pdata are in same order
    stopifnot(all.equal(study_annot$title, pdata$title))

    # save as seperate annotation, removing what pdata already has
    study_annot <- study_annot %>%
        select(-c(title, `Pubchem CID`, pert_iname, pubchem_cid, Compound)) %>%
        rename_cols()

    saveRDS(study_annot, paste0('inst/extdata/', study, '_annot.rds'))

    return(study_annot)
}

# setup BRH data ----

# downloaded from https://repo-hub.broadinstitute.org/repurposing#download-data
drugs <- fread('data-raw/drug_annot/repurposing_drugs_20180907.txt', skip = 'pert_iname')
samples <- fread('data-raw/drug_annot/repurposing_samples_20180907.txt', skip = 'pert_iname')

# fix up some imperfections
table(samples$pert_iname %in% drugs$pert_iname)
# FALSE  TRUE
#     2  10145

samples$pert_iname[!samples$pert_iname %in% drugs$pert_iname]
# "golgicide-A" "YM-298198-desmethyl"

grep('golgicide', drugs$pert_iname, value = TRUE)
grep('golgicide', samples$pert_iname, value = TRUE)
samples[samples$pert_iname == 'golgicide-A', 'pert_iname'] <- 'golgicide-a'

# merge based on pert_iname
brh_annot <- left_join(drugs, samples, by = 'pert_iname', na_matches = 'never')
brh_annot[brh_annot == ''] <- NA

# keep what is relevant for now
brh_annot <- select(brh_annot, -c(broad_id, qc_incompatible, purity, expected_mass, smiles, InChIKey, deprecated_broad_id))
brh_annot <- unique(brh_annot)
brh_annot$pubchem_cid <- as.character(brh_annot$pubchem_cid)

# use most advanced phase
brh_annot$clinical_phase <- gsub('^Phase \\d/', '', brh_annot$clinical_phase)
unique(brh_annot$clinical_phase)

brh_annot <- remove_non_utf8(brh_annot)


# add SIDER, DrugBank, Wikipedia, and GRAS ----

sider <- readRDS('data-raw/drug_annot/pug_view/sider.rds')
sider <- tibble(pubchem_cid = names(sider), SIDER = sider) %>%
  mutate(SIDER = ifelse(sider, pubchem_cid, NA))

pug_annot <- readRDS('data-raw/drug_annot/pug_view/pug_annot.rds')
pug_annot <- rename(pug_annot, DrugBank = drugbank, Wikipedia = wikipedia)

# get matches on pubchem cid
annot <- brh_annot %>%
  as_tibble() %>%
  select(-pert_iname) %>%
  left_join(sider, by = 'pubchem_cid', na_matches = 'never') %>%
  inner_join(pug_annot, by = 'pubchem_cid', na_matches = 'never') %>%
  mutate(clinical_phase = ifelse(gras, paste0(clinical_phase, ' | GRAS'), clinical_phase)) %>%
  select(-gras) %>%
  select(clinical_phase, DrugBank, SIDER, Wikipedia, everything())

# get those that didn't match on pubchem cid and match by pert_iname
annot <- brh_annot %>%
  as_tibble() %>%
  anti_join(pug_annot, by = 'pubchem_cid', na_matches = 'never') %>%
  select(-pubchem_cid) %>%
  full_join(pug_annot, by = c('pert_iname'), na_matches = 'never') %>%  # need full_join for subsequent fill
  select(-gras) %>%
  bind_rows(annot)

# fill in non-NA by cid then by pert_iname
annot <- annot %>%
  group_by(pubchem_cid) %>%
  fill(everything()) %>%
  fill(everything(), .direction = 'up') %>%
  ungroup() %>%
  group_by(pert_iname) %>%
  fill(everything()) %>%
  fill(everything(), .direction = 'up') %>%
  ungroup() %>%
  distinct()

# CMAP02/L1000 setup ----

cmap_annot <- setup_annot(annot, 'CMAP02')
l1000_drugs_annot <- setup_annot(annot, 'L1000_drugs')
l1000_genes_annot <- setup_genes_annot('L1000')
hms-dbmi/drugseqr.data documentation built on Oct. 23, 2024, 10:28 p.m.