In dongzhuoer/rGEO: read GEO (Gene Expression Omnibus) raw data

knitr::opts_knit$set(root.dir = here::here(''))

clean

dir(here::here('data'), full.names = T) %>% file.remove()
rm(list = ls(envir = globalenv(), all = T))

Process raw data

If you find find something puzzling, refer to R/platform.R.

knitr::opts_chunk$set(error = T, collapse = TRUE)

options(tibble.print_max = 40, tibble.width = Inf)

library(tidyverse)
library(parallel)

pkgload::load_all()

prepare GPL metadata

gpl_metas <- rGEO.data::gpl_metas
n_all <- length(gpl_metas)

sum_length <- . %>% sapply(length) %>% sum
# print selected gpl_metas
detail <- function(x, info_only = F) {
    if (info_only)
        print(gpl_metas[unique(x$accession)] %>% lapply(. %>% {.$info}))
    else
        print(gpl_metas[unique(x$accession)])
}

remove null

gpl_null <- gpl_metas %>% {.[sapply(., . %>% {.$info} %>% is.null)]} %>% names %T>% {print(head(.))}

gpl_metas %<>% {.[setdiff(names(.), gpl_null)]}

deal with duplicated columnname

# for all cases in which there is duplicated name, remove those name and description are both same, then manually check left case
gpl_metas %>% lapply(. %>% {.$info}) %>% 
    lapply(. %>% {
        if (.$name %>% duplicated %>% any) {
            . <- .[paste(.$name, .$description) %>% {!duplicated(.)}, ];
        }
        .
    }) %>% {.[sapply(., . %>% {duplicated(.$name)} %>% any)]} %>% 
    lapply(. %>% filter(name %in% name[duplicated(name)])) %>%
    yaml::write_yaml('tests/testthat/output/gpl-dup-col-name.yaml', column.major = F)

message('search "AlleleA_ProbeSeq in description" or "Illumina Strand 2" in description if you want to use sequence information')

#" for these special accession, the second one is better (refseq or entrez)
gpl_metas[dup_use_latter] %<>% 
    {print(lapply(., . %>% {.$info} %>% filter(name %in% name[duplicated(name)]))); .} %>%
    lapply(function(gpl) {
        duplicated_name <- gpl$info$name %>% {.[duplicated(.)]};
        col_drop <- sapply(duplicated_name, . %>% {which(gpl$info$name == .)[1]})
        gpl$info   = gpl$info[-col_drop, ];
        gpl$sample = gpl$sample[-col_drop];

        gpl
    })

#" for others, simply pick first is enough
gpl_metas %<>% lapply(function(gpl) {
    dup <- duplicated(gpl[[1]]$name)
    if (dup %>% any) {
        gpl$info   = gpl$info[!dup, ];
        gpl$sample = gpl$sample[!dup]
    }

    gpl
}) 

assertthat::assert_that(
    gpl_metas %>% lapply(. %>% {colnames(.$sample) %>% {.[duplicated(.)]}}) %>%
        unlist %>% unique %>% libzhuoer::print_or_T(),
    msg = 'duplicated column names not dealt'
)

make info df to be compatible with dplyr API

info_all <- setdiff(names(gpl_metas), special$accession) %>%
    mclapply(. %>% {add_column(gpl_metas[[.]]$info, accession = ., .before = 1)}) %>%
    bind_rows() %>% mutate(full = paste0(name, ': ', description)) %>%
    add_column(type = 'unknown') %T>% print

different types of GPL

non human

gpl_non_human <- info_all %>% filter(type == 'unknown') %>% filter_non_human() %>% 
    filter(!duplicated(accession)) %>% {.$accession} %T>% {print(head(.))}

info_all %<>% mutate(type = ifelse(accession %in% gpl_non_human, 'non_human', type))

ncRNA

gpl_miRNA <- info_all %>% filter(type == 'unknown') %>% filter_miRNA() %>% 
    filter(!duplicated(accession)) %>% {.$accession} %T>% {print(head(.))} %T>% {print(head(.))}
info_all %<>% mutate(type = ifelse(accession %in% gpl_miRNA, 'miRNA', type))

gpl_circRNA <- info_all %>% filter(type == 'unknown') %>% filter_circRNA() %>% 
    filter(!duplicated(accession)) %>% {.$accession} %T>% {print(head(.))} %T>% {print(head(.))}
info_all %<>% mutate(type = ifelse(accession %in% gpl_circRNA, 'circRNA', type))

entrez id

gpl_entrez_id <- info_all %>% filter(type == 'unknown') %>% filter_entrez_id() %>%
    {.$full %>% unique %>% cat(sep = '\n'); .} %>%
    {.$accession} %T>% {print(head(.))}

info_all %<>% mutate(type = ifelse(accession %in% gpl_entrez_id, 'entrez_id', type))

entrez

gpl_entrez <- info_all %>% filter(type == 'unknown') %>% filter_entrez %>% 
    {.$full %>% unique %>% cat(sep = '\n'); .} %>%
    {.$accession} %T>% {print(head(.))}

message('we don\'t distinguish between id and symbol here')

info_all %<>% mutate(type = ifelse(accession %in% gpl_entrez, 'entrez', type))

ORF

gpl_ORF <- info_all %>% filter(type == 'unknown') %>% filter_ORF() %>%  
    {.$full %>% unique %>% cat(sep = '\n'); .} %>%
    {.$accession} %T>% {print(head(.))}

info_all %<>% mutate(type = ifelse(accession %in% gpl_ORF, 'ORF', type))

symbol

gpl_symbol <- info_all %>% filter(type == 'unknown') %>% filter_symbol() %>%
    {.$full %>% unique %>% cat(sep = '\n'); .} %>%
    {.$accession} %T>% {print(head(.))}

message('For those whose sample is all empty, we still can\'t exclude the possibility that the field is name or description rather than symbol')

info_all %<>% mutate(type = ifelse(accession %in% gpl_symbol, 'symbol', type))

ensembl

gpl_ensembl <- info_all %>% filter(type == 'unknown') %>% filter_ensembl() %>%
    {.$full %>% unique %>% cat(sep = '\n'); .} %>% 
    {.$accession} %T>% {print(head(.))} 

info_all %<>% mutate(type = ifelse(accession %in% gpl_ensembl, 'ensembl', type))

refseq

gpl_refseq <- info_all %>% filter(type == 'unknown') %>% filter_refseq() %>% 
    {.$full %>% unique %>% cat(sep = '\n'); .} %>%
    {.$accession} %T>% {print(head(.))}

info_all %<>% mutate(type = ifelse(accession %in% gpl_refseq, 'refseq', type))

genbank

gpl_genbank <- info_all %>% filter(type == 'unknown') %>% filter_genbank() %>%  
    {.$full %>% unique %>% cat(sep = '\n'); .} %>% 
    {.$accession} %T>% {print(head(.))}
#" at least for GPL10348, name is better

info_all %<>% mutate(type = ifelse(accession %in% gpl_genbank, 'genbank', type))

unigene

gpl_unigene <- info_all %>% filter(type == 'unknown') %>% filter_unigene() %>% 
    {.$full %>% unique %>% cat(sep = '\n'); .} %>% 
    {.$accession} %T>% {print(head(.))}

info_all %<>% mutate(type = ifelse(accession %in% gpl_unigene, 'unigene', type))

sequence

gpl_sequence <- info_all %>% filter(type == 'unknown') %>% filter(type == 'unknown') %>% filter_sequence() %>% 
    {.$full %>% unique %>% cat(sep = '\n'); .} %>% 
    {.$accession} %T>% {print(head(.))}

info_all %<>% mutate(type = ifelse(accession %in% gpl_sequence, 'sequence', type))

code stored for future ID type

    # {.$full %>% unique %>% print; .} %>%
    # filter(!str_detect(full, fixed('description', T))) %>%  
    # {filter(., str_detect(description, fixed('alias', T))) %>% detail; .} 

    # lapply(. %>% {tibble(accession = ., symbol = select(gpl_metas[[.]]$sample, contains('symbol'))[[1]])}) %>% bind_rows() %>% filter(symbol != '') %>% filter(!(symbol %in% hgnc::hugo_symbol_all)) %>% group_by(accession) %>% summarise(n = n()) %>% arrange(desc(n))

unknown

gpl_unknown <- info_all %>% filter(type == 'unknown') %>% {unique(.$accession)} %T>% {print(head(.))}

check for possible omission

assertthat::assert_that(
    n_all == list(gpl_null, special, unique(info_all$accession)) %>% sum_length,
    msg = 'Not all platforms analysed'
)

assertthat::assert_that(
    full_join(
        ls(pattern = '^gpl_') %>% {.[. != 'gpl_null']} %>% 
            {tibble(type = ., n = sapply(., . %>% get %>% length))} %>% 
            mutate(type = str_remove(type, '^gpl_')), 
        info_all %>% filter(!duplicated(accession)) %>% count(type) %>% ungroup(),
        by = 'type'
    ) %>% filter(n.x != n.y) %>% libzhuoer::print_or_T(),
    msg = 'some IDs are parsed incorrectly'
)

Sum up & use data

gpl <- bind_rows(
    info_all %>% filter(!duplicated(accession)) %>% select(accession, type), 
    select(special, accession, type = database), 
    tibble(accession = gpl_null, type = 'unknown')
)

assertthat::assert_that(
    n_all == nrow(gpl),
    msg = 'not all platforms included in `gpl` data'
)

usethis::use_data(gpl, overwrite = T)

Afterward

devtools::test()     # test the new data
roxygen2::roxygenize() # you may also have edited data documentation

system('R CMD INSTALL --no-multiarch --with-keep.source .')
devtools::reload()   # now you can use the new data in current R session

dongzhuoer/rGEO documentation built on July 28, 2020, 7:23 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dongzhuoer/rGEO
read GEO (Gene Expression Omnibus) raw data

In dongzhuoer/rGEO: read GEO (Gene Expression Omnibus) raw data

clean

Process raw data

prepare GPL metadata

remove null

deal with duplicated columnname

make info df to be compatible with dplyr API

different types of GPL

non human

ncRNA

entrez id

entrez

ORF

symbol

ensembl

refseq

genbank

unigene

sequence

code stored for future ID type

unknown

check for possible omission

Sum up & use data

Afterward

R Package Documentation

Browse R Packages

We want your feedback!

dongzhuoer/rGEO read GEO (Gene Expression Omnibus) raw data

In dongzhuoer/rGEO: read GEO (Gene Expression Omnibus) raw data

clean

Process raw data

prepare GPL metadata

remove null

deal with duplicated columnname

make info df to be compatible with dplyr API

different types of GPL

non human

ncRNA

entrez id

entrez

ORF

symbol

ensembl

refseq

genbank

unigene

sequence

code stored for future ID type

unknown

check for possible omission

Sum up & use data

Afterward

R Package Documentation

Browse R Packages

We want your feedback!

dongzhuoer/rGEO
read GEO (Gene Expression Omnibus) raw data