# facets-suite: Functions to run and parse output from FACETS

suppressPackageStartupMessages({
    library(dplyr)
    library(data.table)
    library(stringr)
    library(readr)
    library(purrr)
    library(usethis)
    library(jsonlite)
})

# Genome builds ---------------------------------------------------------------------------------------------------
# The code used to download the genome builds is commented out; the resulting data are hard-coded as tribbles below.

# extract_genome_info = function(url) {
#     
#     # url can be a path to a single file or a list of urls if split by chromosome
#     build = map_dfr(url, function(x) {
#         read_tsv(x, col_names = c('bin', 'chrom', 'start', 'end', 'ix', 'n', 'size', 'type', 'bridge'))
#         }) %>% 
#         group_by(chrom) %>% 
#         summarize(
#             size = max(end),
#             centstart = min(start[type == 'centromere']),
#             centend = max(end[type == 'centromere']),
#             centromere = centstart + ((centend-centstart)/2)) %>% 
#         mutate(chrom = str_replace(chrom, 'chr', ''),
#                chrom = str_replace_all(chrom, c('X' = '23', 'Y' = '24')),
#                chrom = as.numeric(chrom)) %>% 
#         filter(chrom %in% seq(1, 24)) %>% 
#         arrange(chrom)
# }

# hg18 // pulled from UCSC
# hg18 = map(c(seq(1:22), 'X', 'Y'), ~paste0('http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/chr', ., '_gap.txt.gz')) %>%
#     extract_genome_info()

# hg18 chromosome sizes and centromere positions, one row per chromosome.
#   chrom      chromosome as a number; the commented-out extract_genome_info() above maps X to 23 and Y to 24
#   size       largest end coordinate found in the UCSC gap table for that chromosome
#   centstart  smallest start coordinate among the gap rows of type 'centromere'
#   centend    largest end coordinate among the gap rows of type 'centromere'
#   centromere midpoint of the two, i.e. centstart + (centend - centstart) / 2
# NOTE(review): size is derived from gap-table coordinates, so it may fall short of the full chromosome
# length where no gap reaches the chromosome end (chrom 11 looks small next to hg19/hg38) -- confirm.
hg18 = tibble::tribble(
    ~chrom,     ~size, ~centstart,  ~centend, ~centromere,
         1, 247249719,  121236957, 123476957,   122356957,
         2, 242951149,   91689898,  94689898,    93189898,
         3, 199501827,   90587544,  93487544,    92037544,
         4, 191273063,   49354874,  52354874,    50854874,
         5, 180857866,   46441398,  49441398,    47941398,
         6, 170899992,   58938125,  61938125,    60438125,
         7, 154817899,   58058273,  61058273,    59558273,
         8, 145403396,   43958052,  46958052,    45458052,
         9, 138336818,   47107499,  50107499,    48607499,
        10, 133577517,   39244941,  41624941,    40434941,
        11,  95942794,   51450781,  54450781,    52950781,
        12, 132349534,   34747961,  36142961,    35445461,
        13, 114142980,   16000000,  17868000,    16934000,
        14, 106368585,   15070000,  18070000,    16570000,
        15,  96367672,   15260000,  18260000,    16760000,
        16,  88827254,   35143302,  36943302,    36043302,
        17,  78774742,   22187133,  22287133,    22237133,
        18,  73872808,   15400898,  16764896,    16082897,
        19,  63811651,   26923622,  29923622,    28423622,
        20,  60733814,   26267569,  28033230,  27150399.5,
        21,  43507092,   10260000,  13260000,    11760000,
        22,  49691432,   11330000,  14330000,    12830000,
        23, 148832720,   58598737,  61598737,    60098737,
        24,  57377044,   11253954,  12308578,    11781266
)

# hg19 // pulled from UCSC
# hg19 = extract_genome_info('http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gap.txt.gz')
# hg19 chromosome sizes and centromere positions, one row per chromosome.
#   chrom      chromosome as a number; the commented-out extract_genome_info() above maps X to 23 and Y to 24
#   size       largest end coordinate found in the UCSC gap table for that chromosome
#   centstart  smallest start coordinate among the gap rows of type 'centromere'
#   centend    largest end coordinate among the gap rows of type 'centromere'
#   centromere midpoint of the two, i.e. centstart + (centend - centstart) / 2
# The centromere column previously repeated centstart on every row; it now holds the midpoint, as
# defined by extract_genome_info() and as already the case in the hg18 and hg38 tables.
hg19 = tibble::tribble(
    ~chrom,     ~size, ~centstart,  ~centend, ~centromere,
         1, 249250621,  121535434, 124535434,   123035434,
         2, 243199373,   92326171,  95326171,    93826171,
         3, 198022430,   90504854,  93504854,    92004854,
         4, 191154276,   49660117,  52660117,    51160117,
         5, 180915260,   46405641,  49405641,    47905641,
         6, 171115067,   58830166,  61830166,    60330166,
         7, 159138663,   58054331,  61054331,    59554331,
         8, 146364022,   43838887,  46838887,    45338887,
         9, 141213431,   47367679,  50367679,    48867679,
        10, 135534747,   39254935,  42254935,    40754935,
        11, 135006516,   51644205,  54644205,    53144205,
        12, 133851895,   34856694,  37856694,    36356694,
        13, 115169878,   16000000,  19000000,    17500000,
        14, 107349540,   16000000,  19000000,    17500000,
        15, 102531392,   17000000,  20000000,    18500000,
        16,  90354753,   35335801,  38335801,    36835801,
        17,  79759049,   22263006,  25263006,    23763006,
        18,  78077248,   15460898,  18460898,    16960898,
        19,  59128983,   24681782,  27681782,    26181782,
        20,  63025520,   26369569,  29369569,    27869569,
        21,  48129895,   11288129,  14288129,    12788129,
        22,  51304566,   13000000,  16000000,    14500000,
        23, 155270560,   58632012,  61632012,    60132012,
        24,  59373566,   10104553,  13104553,    11604553
)

# hg38 // pulled from UCSC
# hg38 = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/gap.txt.gz' # note: centromere info is no longer in this type of file, but in centromeres.txt.gz
# hg38_centromeres = 'http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/centromeres.txt.gz' 
# hg38_centromeres = read_tsv(hg38_centromeres, col_names = c('bin', 'chrom', 'start', 'end', 'name')) %>% 
#     group_by(chrom) %>% 
#     summarize(centstart = min(start),
#               centend = max(end),
#               centromere = centstart + ((centend-centstart)/2)) %>% 
#     mutate(chrom = str_replace(chrom, 'chr', ''),
#            chrom = str_replace_all(chrom, c('X' = '23', 'Y' = '24')),
#            chrom = as.numeric(chrom))
# 
# hg38 = extract_genome_info(hg38) %>% 
#     select(-matches('cent')) %>% 
#     left_join(., hg38_centromeres, by = 'chrom')

# hg38 chromosome sizes and centromere positions, one row per chromosome.
#   chrom      chromosome as a number; the commented-out code above maps X to 23 and Y to 24
#   size       largest end coordinate found in the UCSC gap table for that chromosome
#   centstart  smallest start coordinate listed for the chromosome in UCSC centromeres.txt.gz
#   centend    largest end coordinate listed for the chromosome in UCSC centromeres.txt.gz
#   centromere midpoint of the two, i.e. centstart + (centend - centstart) / 2
hg38 = tibble::tribble(
    ~chrom,     ~size, ~centstart,  ~centend, ~centromere,
         1, 248956422,  122026459, 124932724, 123479591.5,
         2, 242193529,   92188145,  94090557,    93139351,
         3, 198295559,   90772458,  93655574,    92214016,
         4, 190214555,   49712061,  51743951,    50728006,
         5, 181538259,   46485900,  50059807,  48272853.5,
         6, 170805979,   58553888,  59829934,    59191911,
         7, 159345973,   58169653,  61528020,  59848836.5,
         8, 145138636,   44033744,  45877265,  44955504.5,
         9, 138394717,   43389635,  45518558,  44454096.5,
        10, 133797422,   39686682,  41593521,  40640101.5,
        11, 135086622,   51078348,  54425074,    52751711,
        12, 133275309,   34769407,  37185252,  35977329.5,
        13, 114364328,   16000000,  18051248,    17025624,
        14, 107043718,   16000000,  18173523,  17086761.5,
        15, 101991189,   17083673,  19725254,  18404463.5,
        16,  90338345,   36311158,  38265669,  37288413.5,
        17,  83257441,   22813679,  26616164,  24714921.5,
        18,  80373285,   15460899,  20861206,  18161052.5,
        19,  58617616,   24498980,  27190874,    25844927,
        20,  64444167,   26436232,  30038348,    28237290,
        21,  46709983,   10864560,  12915808,    11890184,
        22,  50818468,   12954788,  15054318,    14004553,
        23, 156040895,   58605579,  62412542,  60509060.5,
        24,  57227415,   10316944,  10544039,  10430491.5
)

# Gene positions --------------------------------------------------------------------------------------------------
# Tumor suppressor genes from OncoKB: download the gene list, keep the entries whose `tsg` flag is
# TRUE and retain only the `hugoSymbol` column.
# Five of the returned symbols are not the official ones and are replaced by their official symbol;
# the result is matched against GENCODE gene names further down in this script.
# `recode()` comes from dplyr, which is attached at the top of this script. It replaces `mapvalues()`,
# a plyr function that this script never loads, so the original call stopped with
# "could not find function". Symbols that are not listed are returned unchanged.
tumor_suppressors = fromJSON(readLines('https://legacy.oncokb.org/api/v1/genes', warn = FALSE)) %>%
    filter(tsg == TRUE) %>%
    select(hugoSymbol) %>%
    mutate(hugoSymbol = recode(hugoSymbol,
                               # not the official symbol = official symbol
                               'FAM175A' = 'ABRAXAS1',
                               'FAM58A' = 'CCNQ',
                               'MRE11A' = 'MRE11',
                               'PARK2' = 'PRKN',
                               'FAM46C' = 'TENT5C'))

# Protein-coding gene coordinates for hg19, read from the GENCODE release 29 basic annotation
# lifted to GRCh37. The result has one row per gene/chromosome pair with the smallest start and
# largest end seen for it, plus a `tsg` flag that is TRUE when the gene is in `tumor_suppressors`.
genes_hg19 = local({
    gtf_columns = c('chrom', 'source', 'type', 'start', 'end', 'na_1', 'strand', 'na_2', 'info')

    # A character `skip` makes fread start at the first line containing that string
    annotation = fread(
        'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/GRCh37_mapping/gencode.v29lift37.basic.annotation.gtf.gz',
        header = FALSE,
        skip = 'chr1',
        col.names = gtf_columns
    )

    # Gene-level records whose attribute string mentions protein_coding
    coding_genes = filter(annotation, type == 'gene', info %like% 'protein_coding')

    # Strip the 'chr' prefix and pull the gene symbol out of the attribute string
    named_genes = mutate(
        coding_genes,
        chrom = str_replace(chrom, 'chr', ''),
        gene = str_extract(info, '(?<=gene_name ")[A-Za-z0-9\\.\\-]+(?=";)')
    )

    # Collapse to a single span per gene and chromosome
    gene_spans = summarize(
        group_by(named_genes, gene, chrom),
        start = min(start),
        end = max(end)
    )

    mutate(ungroup(gene_spans), tsg = gene %in% tumor_suppressors$hugoSymbol)
})

# Protein-coding gene coordinates for hg38, read from the GENCODE release 29 basic annotation.
# The result has one row per gene/chromosome pair with the smallest start and largest end seen
# for it, plus a `tsg` flag that is TRUE when the gene is in `tumor_suppressors`.
genes_hg38 = local({
    gtf_columns = c('chrom', 'source', 'type', 'start', 'end', 'na_1', 'strand', 'na_2', 'info')

    # A character `skip` makes fread start at the first line containing that string
    annotation = fread(
        'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.basic.annotation.gtf.gz',
        header = FALSE,
        skip = 'chr1',
        col.names = gtf_columns
    )

    # Gene-level records whose attribute string mentions protein_coding
    coding_genes = filter(annotation, type == 'gene', info %like% 'protein_coding')

    # Strip the 'chr' prefix and pull the gene symbol out of the attribute string
    named_genes = mutate(
        coding_genes,
        chrom = str_replace(chrom, 'chr', ''),
        gene = str_extract(info, '(?<=gene_name ")[A-Za-z0-9\\.\\-]+(?=";)')
    )

    # Collapse to a single span per gene and chromosome
    gene_spans = summarize(
        group_by(named_genes, gene, chrom),
        start = min(start),
        end = max(end)
    )

    mutate(ungroup(gene_spans), tsg = gene %in% tumor_suppressors$hugoSymbol)
})

# use_data(hg18, hg19, hg38, genes_hg19, genes_hg38, internal = T, overwrite = T)

# Copy-number states ----------------------------------------------------------------------------------------------
# Lookup table translating an allele-specific copy-number state into a call.
#   wgd           whether the sample has undergone genome doubling (see the two sections below)
#   tcn, mcn, lcn copy-number values; presumably total, major and lesser copy number -- in every fully
#                 specified row tcn equals mcn + lcn. Rows with mcn/lcn = NA cover states where only
#                 tcn is available.
#   numeric_call  call on a -2..2 scale (-2 HOMDEL ... 2 AMP); NA when it cannot be unambiguously ascertained
#   call          descriptive label for the state
#   map_string    lookup key built as 'wgd:tcn:mcn:lcn', e.g. 'FALSE:2:1:1'
copy_number_states = tibble::tribble(
    ~wgd, ~tcn, ~mcn, ~lcn, ~numeric_call,          ~call,
    # No genome doubling
    FALSE,    0,    0,    0,          -2,                           'HOMDEL',
    FALSE,    0,   NA,   NA,          -2,                           'HOMDEL',
    FALSE,    1,    1,    0,          -1,                          'HETLOSS',
    FALSE,    1,   NA,   NA,          -1,                          'HETLOSS',
    FALSE,    2,    2,    0,          -1,                            'CNLOH',
    FALSE,    2,   NA,   NA,          NA,                 'DIPLOID or CNLOH',    # numeric_call is NA if it cannot be unambiguously ascertained; here it could be either 0 or -1
    FALSE,    3,    3,    0,           1,                     'CNLOH & GAIN',
    FALSE,    3,   NA,   NA,           1,               'GAIN (many states)',
    FALSE,    4,    4,    0,           1,                     'CNLOH & GAIN',
    FALSE,    4,   NA,   NA,           1,               'GAIN (many states)',
    FALSE,    5,    5,    0,           2,                        'AMP (LOH)',
    FALSE,    5,   NA,   NA,           2,                'AMP (many states)',
    FALSE,    6,    6,    0,           2,                        'AMP (LOH)',
    FALSE,    6,   NA,   NA,           2,                'AMP (many states)',
    FALSE,    2,    1,    1,           0,                          'DIPLOID',
    FALSE,    3,    2,    1,           1,                             'GAIN',
    FALSE,    4,    3,    1,           1,                             'GAIN',
    FALSE,    5,    4,    1,           2,                              'AMP',
    FALSE,    6,    5,    1,           2,                              'AMP',
    FALSE,    7,    6,    1,           2,                              'AMP',
    FALSE,    4,    2,    2,           1,                       'TETRAPLOID',
    FALSE,    5,    3,    2,           2,                              'AMP',
    FALSE,    6,    4,    2,           2,                              'AMP',
    FALSE,    7,    5,    2,           2,                              'AMP',
    FALSE,    8,    6,    2,           2,                              'AMP',
    FALSE,    6,    3,    3,           2,                   'AMP (BALANCED)',
    FALSE,    7,    4,    3,           2,                              'AMP',
    FALSE,    8,    5,    3,           2,                              'AMP',
    FALSE,    9,    6,    3,           2,                              'AMP',
    # With genome doubling
    TRUE,    0,    0,    0,          -2,                           'HOMDEL',
    TRUE,    0,   NA,   NA,          -2,                           'HOMDEL',
    TRUE,    1,    1,    0,          -1,              'LOSS BEFORE & AFTER',
    TRUE,    1,   NA,   NA,          -1,              'LOSS BEFORE & AFTER',
    TRUE,    2,    2,    0,          -1,                      'LOSS BEFORE',
    TRUE,    2,   NA,   NA,          -1, 'LOSS BEFORE or DOUBLE LOSS AFTER',
    TRUE,    3,    3,    0,          -1,              'CNLOH BEFORE & LOSS',
    TRUE,    3,   NA,   NA,          -1,               'LOSS (many states)',
    TRUE,    4,    4,    0,          -1,                     'CNLOH BEFORE',
    # NOTE(review): the two candidate states below map to 0 (TETRAPLOID) and -1 (CNLOH BEFORE), so by
    # the rule stated for 'DIPLOID or CNLOH' this call looks ambiguous -- confirm whether it should be NA
    TRUE,    4,   NA,   NA,          -1,       'TETRAPLOID or CNLOH BEFORE',
    TRUE,    5,    5,    0,           1,              'CNLOH BEFORE & GAIN',
    # A gain is coded 1, matching every other tcn = 5 state with genome doubling (was -1)
    TRUE,    5,   NA,   NA,           1,               'GAIN (many states)',
    TRUE,    6,    6,    0,           2,                        'AMP (LOH)',
    TRUE,    6,   NA,   NA,           2,                'AMP (many states)',
    TRUE,    2,    1,    1,          -1,                'DOUBLE LOSS AFTER',
    TRUE,    3,    2,    1,          -1,                       'LOSS AFTER',
    TRUE,    4,    3,    1,          -1,                      'CNLOH AFTER',
    TRUE,    5,    4,    1,           1,                      'LOSS & GAIN',
    TRUE,    6,    5,    1,           2,                              'AMP',
    TRUE,    7,    6,    1,           2,                              'AMP',
    TRUE,    4,    2,    2,           0,                       'TETRAPLOID',
    TRUE,    5,    3,    2,           1,                             'GAIN',
    TRUE,    6,    4,    2,           2,                              'AMP',
    TRUE,    7,    5,    2,           2,                              'AMP',
    TRUE,    8,    6,    2,           2,                              'AMP',
    TRUE,    6,    3,    3,           2,                   'AMP (BALANCED)',
    TRUE,    7,    4,    3,           2,                              'AMP',
    TRUE,    8,    5,    3,           2,                              'AMP',
    TRUE,    9,    6,    3,           2,                              'AMP'
) %>% mutate(map_string = paste(wgd, tcn, mcn, lcn, sep = ':'))

# Save every lookup table built above as internal package data, overwriting any existing copy.
use_data(
    hg18, hg19, hg38,
    genes_hg19, genes_hg38,
    copy_number_states,
    internal = TRUE,
    overwrite = TRUE
)
mskcc/facets-suite documentation built on Sept. 13, 2022, 4:14 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
mskcc/facets-suite
Functions to run and parse output from FACETS

data-raw/sysdata.R
In mskcc/facets-suite: Functions to run and parse output from FACETS

R Package Documentation

Browse R Packages

We want your feedback!

mskcc/facets-suite Functions to run and parse output from FACETS

data-raw/sysdata.R In mskcc/facets-suite: Functions to run and parse output from FACETS

R Package Documentation

Browse R Packages

We want your feedback!

mskcc/facets-suite
Functions to run and parse output from FACETS

data-raw/sysdata.R
In mskcc/facets-suite: Functions to run and parse output from FACETS