R/msbb.R

Defines functions wrangleMSBB

Documented in wrangleMSBB

#' Wrangling MSBB data
#'
#' Wrangles MSBB molecular and clinical variables into a common data frame.
#' Biotype annotations, the barcode-to-individual ID map, clinical covariates
#' and one expression matrix per brain region are downloaded from Synapse.
#' Each region is then reshaped into a samples-by-genes table (restricted to
#' protein-coding genes) and written to its own compressed TSV file.
#'
#' @param destDir Destination directory where the wrangled files are to be
#'   written. Raw Synapse downloads are placed in its "raw" subdirectory.
#' @return Character vector of filenames of the wrangled datasets (one per brain region)
#' @importFrom magrittr %>%
#' @export
wrangleMSBB <- function( destDir = "~/data/amp-ad/msbb" )
{
    ## Create the download directory if it doesn't exist.
    ## Recursive creation of destDir/raw also creates destDir itself.
    synDir <- file.path(destDir, "raw")
    dir.create( synDir, recursive=TRUE, showWarnings=FALSE )
    cat( "Wrangling MSBB dataset to", destDir, "\n" )

    ## Downloads a Synapse entity into synDir and returns its local path
    syn <- function(s) {synapser::synGet(s, downloadLocation=synDir)$path}

    ## Login to Synapse and download/wrangle data
    cat( "Logging in to Synapse...\n" )
    synapser::synLogin()

    ## Load biotype annotations and retrieve names of protein-coding genes
    cat( "Downloading biotype annotations...\n" )
    BT <- readr::read_csv( syn("syn14236139"), col_types=readr::cols() ) %>%
        dplyr::filter( gene_biotype=="protein_coding" ) %>%
        dplyr::select( ENSEMBL = gene_id, HUGO = gene_name )

    ## Match sample barcodes against individual IDs
    ## Barcodes are coerced to character so that they can be joined against
    ##   the barcodes parsed out of the expression column names below
    cat( "Downloading ID map...\n" )
    Z <- syn("syn6100548") %>% readr::read_csv(col_types=readr::cols()) %>%
        dplyr::select( Barcode=barcode, individualIdentifier ) %>%
        dplyr::distinct() %>%
        dplyr::mutate( Barcode = as.character(Barcode) )

    ## Load clinical covariates and attach the sample barcodes to them
    cat( "Downloading clinical covariates...\n" )
    Y <- syn("syn6101474") %>% readr::read_csv(col_types=readr::cols()) %>%
        dplyr::select( individualIdentifier=individualID, PMI=pmi, AOD=ageDeath, CDR, Braak ) %>%
        dplyr::inner_join( Z, by = "individualIdentifier" ) %>%
        dplyr::rename( ID=individualIdentifier )

    ## Read raw expression matrix
    ## PINX1 is the only gene with duplicate entries, but one of the entries has
    ##   a higher total count, so we keep it and discard the other entry.
    cat( "Downloading and wrangling expression data...\n" )
    synX <- c( BM10="syn16795931", BM22="syn16795934",
              BM36="syn16795937", BM44="syn16795940" )
    X <- purrr::map( synX, function(s) {
        read.delim( syn(s), check.names=FALSE ) %>%
            ## Keep protein-coding genes only and switch to HUGO names
            dplyr::inner_join( BT, by= c( "Ensembl ID"="ENSEMBL" ) ) %>%
            dplyr::filter( `Ensembl ID` != "ENSG00000258724" ) %>%
            dplyr::select( -`Ensembl ID` ) %>%
            ## Long format: one row per (gene, sample column) pair
            tidyr::gather( RNA_ID, Value, -HUGO ) %>%
            ## The barcode is the third "_"-delimited field of the column name
            dplyr::mutate( Barcode = stringr::str_split(RNA_ID, "_", simplify=TRUE)[,3],
                          RNA_ID = NULL ) %>%
            dplyr::inner_join( Y, by="Barcode" ) %>%
            ## Back to wide format: one row per sample, one column per gene
            tidyr::spread( HUGO, Value ) })

    ## Compose filenames: region "BM10" maps to "<destDir>/msbb10.tsv.gz"
    cat( "Writing output to", destDir, "\n" )
    fnOut <- file.path( destDir,
                       stringr::str_c( "msbb", stringr::str_sub(names(X), 3), ".tsv.gz" ) )

    ## Save each region into a separate file (called for its side effect only)
    purrr::walk2( X, fnOut, readr::write_tsv )
    fnOut
}
labsyspharm/DRIAD documentation built on Jan. 6, 2022, 3:04 p.m.