data-raw/drug_annot/setup.R

library(data.table)
library(dplyr)
library(tidyr)
library(dseqr)

pkgconfig::set_config("dplyr::na_matches" = "never")

# function definitions

# non UTF-8 encoded character cause DT alerts on filtering
remove_non_utf8 <- function(df) {
  df[] <- lapply(df, function(x) {
    Encoding(x) <- "UTF-8"
    iconv(x, "UTF-8", "UTF-8",sub='')
  })

  return(df)
}

summarise_func <- function(x) {

  unqx <- stats::na.omit(x)
  unqx <- unlist(strsplit(unqx, '|', fixed = TRUE))
  unqx <- unique(unqx)

  # keep as NA if they all are
  if (!length(unqx)) return(NA_character_)

  # collapse distinct non-NA entries
  return(paste(unqx, collapse = ' | '))
}

rename_cols <- function(annot) {
  annot <- rename(annot,
                  `Clinical Phase` = clinical_phase,
                  MOA = moa,
                  Target = target,
                  `Disease Area` = disease_area,
                  Indication = indication,
                  Vendor = vendor,
                  `Catalog #` = catalog_no,
                  `Vendor Name` = vendor_name)

  return(annot)
}

destructure_title <- function(pdata, drop = TRUE, ...) {
  title_split <- strsplit(pdata$title, '_')

  pdata <- tibble::add_column(pdata,
                              'Compound'  = sapply(title_split, `[`, 1),
                              'Cell Line' = sapply(title_split, `[`, 2),
                              'Dose'      = sapply(title_split, `[`, 3),
                              'Duration'  = sapply(title_split, `[`, 4), ...)

  if (drop) pdata$title <- NULL
  return(pdata)
}

setup_genes_annot <- function(study) {

  pdata <- readRDS(paste0('inst/extdata/', study, '_genes_pdata.rds'))

  # add genecards link
  study_annot <- pdata %>%
    mutate(Genecards = gsub('^([^_]+)_.+?$', '\\1', title)) %>%
    mutate(Genecards = gsub('-oe|-lig|-sh', '', Genecards))

  # fix any discovered bad links
  study_annot[study_annot$Genecards == '61E3.4', "Genecards"] <- 'SMG1'

  # get gene description
  tx2gene <- readRDS('data-raw/tx2gene/tx2gene.rds') %>%
    dplyr::filter(gene_name %in% study_annot$Genecards) %>%
    dplyr::select(description, gene_name)  %>%
    distinct()

  # remove single duplicate
  tx2gene <- tx2gene[-which(tx2gene$gene_name == 'ALDOA')[1], ]

  stopifnot(length(unique(tx2gene$gene_name)) == nrow(tx2gene))

  # add gene descriptions
  study_annot <- study_annot %>%
    dplyr::mutate(gene_name = toupper(Genecards)) %>%
    dplyr::left_join(tx2gene) %>%
    dplyr::select(-gene_name)

  stopifnot(all.equal(study_annot$title, pdata$title))

  # keep what not already in pdata
  study_annot <- study_annot %>%
    dplyr::rename(Description = description) %>%
    dplyr::select(Description, Genecards)

  saveRDS(study_annot, paste0('inst/extdata/', study, '_genes_annot.rds'))

}

setup_annot <- function(annot, study) {

  increasing_phases <- c('Withdrawn', 'Preclinical', 'Phase 1', 'Phase 2', 'Phase 2 | GRAS', 'Phase 3', 'Launched', 'Launched | GRAS')

  pdata <- readRDS(paste0('inst/extdata/', study, '_pdata.rds'))
  pdata <- destructure_title(pdata, drop=FALSE, .after=1)

  # pubchem_cid and pert_iname together have the most matches
  sum(pdata$Compound %in% tolower(annot$pert_iname))
  sum(pdata$`Pubchem CID` %in% annot$pubchem_cid)
  sum(pdata$`Pubchem CID` %in% annot$pubchem_cid | pdata$Compound %in% tolower(annot$pert_iname))

  # first get matches on cid
  cat('Matching on cids ... \n')
  annot_cids <-  pdata %>%
    select(title, Compound, 'Pubchem CID') %>%
    inner_join(annot, by = c('Pubchem CID' = 'pubchem_cid'))

  # where there wasn't a cid match try on pert_iname
  cat('Matching on Broad internal names ... \n')
  study_annot <- pdata %>%
    select(title, 'Pubchem CID', 'Compound') %>%
    anti_join(annot, by = c('Pubchem CID' = 'pubchem_cid')) %>%
    left_join(annot, by = c('Compound' = 'pert_iname')) %>%
    bind_rows(annot_cids)


  # fill in na values with non-na value by cid and name
  cat('Filling in NA by CIDs and compound names ... \n')
  study_annot <- study_annot %>%
    group_by(`Pubchem CID`) %>%
    fill(everything()) %>%
    fill(everything(), .direction = 'up') %>%
    ungroup() %>%
    group_by(Compound) %>%
    fill(everything()) %>%
    fill(everything(), .direction = 'up') %>%
    ungroup()

  # use most advanced phase if same title has multiple phases
  cat('Selecting most advanced clinical phase for each title ... \n')
  phase <- study_annot %>%
    select(title, clinical_phase) %>%
    mutate(clinical_phase = factor(clinical_phase, increasing_phases, ordered = TRUE)) %>%
    group_by(title) %>%
    summarise(clinical_phase = max(clinical_phase))

  # merge rows that have the same title
  cat('Merging rows with the same title ... \n')
  study_annot <- study_annot %>%
    select(-clinical_phase) %>%
    group_by(title) %>%
    summarise_all(summarise_func) %>%
    left_join(phase, by = 'title') %>%
    select(title, `Pubchem CID`, clinical_phase, everything()) %>%
    arrange(match(title, pdata$title))

  # check that study_annot and pdata are in same order
  stopifnot(all.equal(study_annot$title, pdata$title))

  # save as seperate annotation, removing what pdata already has
  study_annot <- study_annot %>% select(-c(title, `Pubchem CID`, pert_iname, pubchem_cid, Compound)) %>% rename_cols()
  saveRDS(study_annot, paste0('inst/extdata/', study, '_annot.rds'))

  return(study_annot)
}

# setup BRH data ----

drugs <- fread('data-raw/drug_annot/repurposing_drugs_20180907.txt', skip = 'pert_iname')
samples <- fread('data-raw/drug_annot/repurposing_samples_20180907.txt', skip = 'pert_iname')

# fix up some imperfections
table(samples$pert_iname %in% drugs$pert_iname)
# FALSE  TRUE
#     2  10145

samples$pert_iname[!samples$pert_iname %in% drugs$pert_iname]
# "golgicide-A" "YM-298198-desmethyl"

grep('golgicide', drugs$pert_iname, value = TRUE)
grep('golgicide', samples$pert_iname, value = TRUE)
samples[samples$pert_iname == 'golgicide-A', 'pert_iname'] <- 'golgicide-a'

# merge based on pert_iname
brh_annot <- left_join(drugs, samples, by = 'pert_iname')
brh_annot[brh_annot == ''] <- NA

# keep what is relevant for now
brh_annot <- select(brh_annot, -c(broad_id, qc_incompatible, purity, expected_mass, smiles, InChIKey, deprecated_broad_id))
brh_annot <- unique(brh_annot)
brh_annot$pubchem_cid <- as.character(brh_annot$pubchem_cid)

# use most advanced phase
brh_annot$clinical_phase <- gsub('^Phase \\d/', '', brh_annot$clinical_phase)
unique(brh_annot$clinical_phase)

brh_annot <- remove_non_utf8(brh_annot)


# add SIDER, DrugBank, Wikipedia, and GRAS ----

sider <- readRDS('data-raw/drug_annot/pug_view/sider.rds')
sider <- tibble(pubchem_cid = names(sider), SIDER = sider) %>%
  mutate(SIDER = ifelse(sider, pubchem_cid, NA))

pug_annot <- readRDS('data-raw/drug_annot/pug_view/pug_annot.rds')
pug_annot <- rename(pug_annot, DrugBank = drugbank, Wikipedia = wikipedia)

# get matches on pubchem cid
annot <- brh_annot %>%
  as_tibble() %>%
  select(-pert_iname) %>%
  left_join(sider, by = 'pubchem_cid') %>%
  inner_join(pug_annot, by = 'pubchem_cid') %>%
  mutate(clinical_phase = ifelse(gras, paste0(clinical_phase, ' | GRAS'), clinical_phase)) %>%
  select(-gras) %>%
  select(clinical_phase, DrugBank, SIDER, Wikipedia, everything())

# get those that didn't match on pubchem cid and match by pert_iname
annot <- brh_annot %>%
  as_tibble() %>%
  anti_join(pug_annot, by = 'pubchem_cid') %>%
  select(-pubchem_cid) %>%
  full_join(pug_annot, by = c('pert_iname')) %>%  # need full_join for subsequent fill
  select(-gras) %>%
  bind_rows(annot)

# fill in non-NA by cid then by pert_iname
annot <- annot %>%
  group_by(pubchem_cid) %>%
  fill(everything()) %>%
  fill(everything(), .direction = 'up') %>%
  ungroup() %>%
  group_by(pert_iname) %>%
  fill(everything()) %>%
  fill(everything(), .direction = 'up') %>%
  ungroup() %>%
  distinct()

# CMAP02/L1000 setup ----

cmap_annot <- setup_annot(annot, 'CMAP02')
l1000_drugs_annot <- setup_annot(annot, 'L1000_drugs')
l1000_genes_annot <- setup_genes_annot('L1000')
hms-dbmi/drugseqr documentation built on Feb. 15, 2024, 10:38 p.m.