R/helper_pgxLoader.R

Defines functions pgxCount pgxcallsetLoader pgxFracLoader read_cnvstats_json pgxFreqLoader pgxVariantLoader read_variant_pgxseg read_variant_beacon pgxmetaLoader extract_beacon_query extract_all_results extract_special_results extract_general_results check_pgx_domain add_parameter transform_id check_missing_parameters check_unused_parameters

# general utility function ------------------------------------------------

check_unused_parameters <- function(param, param_name, rightparam_name) {
  if (!is.null(param)) {
    rightparam <- paste(rightparam_name,collapse = " and ")
    warning("\n The parameter ", param_name, " is not used in this query. Only ",rightparam," are accepted. \n")
  }
}

check_missing_parameters <- function(param, param_name){
  if (length(param) < 1) {
    param_name <- paste(param_name,collapse = " or ")
    stop("\n The parameter ",param_name," is missing. \n")
  }
}

transform_id <- function(id){
    filter <- paste(id,collapse = ",")
    return(filter)
}

add_parameter <- function(url, param_name, param_value){
    url  <- ifelse(is.null(param_value), url, paste0(url,"&",param_name,"=",param_value))
    return(url)
}

check_pgx_domain <- function(domain, data_type){
    if (!domain %in% c("http://progenetix.org","progenetix.org","https://cancercelllines.org","cancercelllines.org")){
      stop(data_type," can only be accessed from progenetix.org or cancercelllines.org")
    }
}

# utility function for transforming beacon response -----------------------

extract_general_results <- function(data, mapping){
    keys <- unlist(strsplit(mapping, "\\."))
    result <- data
    for (key in keys){
        # if key doesn't exist
        if (!key %in% names(result)){
          # in case a list
            if (length(result) == 1){
                result <- result[[1]]
            } else{
          # in case an array
                result <- unlist(result)
            }
        if (!key %in% names(result)) return(NA)
        result <- result[which(names(result) == key)]
        } else{
            result <- result[[key]]
        }
    }
    if (length(result) == 0) return(NA)
    result <- paste0(result,collapse = ",")
    return(result)
}

extract_special_results <- function(data, mapping, prefix){
    keys <- unlist(strsplit(mapping, "\\."))
    result <- data[[keys[1]]]
    result <- sapply(result,function(x){x[[keys[2]]]})
    if (prefix == "pubmed") prefix <- "PMID"
    if (keys[1] == "externalReferences") result <- result[grepl(prefix,result,ignore.case = TRUE)]
    if (length(result) == 0) return(NA)
    result <- paste0(result,collapse = ",")
    return(result)
}

extract_all_results <- function(data, mapping, type){
    result <- list()
    for (column in names(mapping)){
        column_prefix <- unlist(strsplit(column, "_"))[1]
        # special cohort information for biosamples
        if (column_prefix %in% c("pubmed","cellosaurus","cbioportal","tcga","cohort")){
            result[[column]] <- extract_special_results(data,mapping[[column]][[1]],column_prefix)
        # general information for all entity types
        } else{
            result[[column]] <- extract_general_results(data, mapping[[column]][[1]])
        }
    }  
  return(as.data.frame(result))
}

extract_beacon_query <- function(url, type, dataset){
    mapping_rules <- yaml::read_yaml(system.file("config", "datatable_mappings.yaml", package = "pgxRpi"))
    mapping_rules <- mapping_rules[[type]]
    query <- GET(url)
    data <- content(query)
    if (!data$responseSummary$exists) return(NULL)
    if (!is.null(dataset)){
        datasetids <- sapply(data$response$resultSets,function(x){x$id})
        id_idx <- which(datasetids %in% dataset)
        if (length(id_idx) == 0) stop("Data not found for the specified dataset ", dataset)
    } else{
        id_idx <- seq(length(data$response$resultSets))
    }
    
    data_df <- c()
    for (i in id_idx){
        data_list <- data$response$resultSets[[i]]$results
        data_info <- lapply(data_list, function(x){extract_all_results(x, mapping_rules, type)})
        data_df <- rbind(data_df,do.call(rbind, data_info))
    }
    return(data_df)
}

# function to query metadata for biosamples, individuals, analyses --------

pgxmetaLoader <- function(type, biosample_id, individual_id, filters, codematches, skip, limit, domain, entry_point, dataset){
    res <- c()
    # query by filters
    if (!(is.null(filters))){               
        url <- paste0(domain,"/",entry_point, "/", type, "?filters=",transform_id(filters))
        url <- add_parameter(url,"limit",limit)
        url <- add_parameter(url,"skip",skip)
        encoded_url <- URLencode(url)
        attempt::try_catch(
            res_1 <- extract_beacon_query(encoded_url, type, dataset),.e= function(e){
                warning("\n Query fails for the filters ", paste(filters,collapse=",") ,"\n")
            }
        )
        if (exists('res_1')){res <- rbind(res,res_1)}               
    }

     # query by biosample_id
    if (!(is.null(biosample_id))){
        biosample_ids <- transform_id(biosample_id)
        url <- paste0(domain,"/",entry_point, "/", type, "?biosampleIds=",biosample_ids)
        encoded_url <- URLencode(url)
        attempt::try_catch(
            res_2 <- extract_beacon_query(encoded_url, type, dataset),.e= function(e){
                warning("\n Query fails for biosample_id ", paste(biosample_ids,collapse = ","), "\n")
            }
        )
        if (exists('res_2')){res <- rbind(res,res_2)} 
    }

    # query by individual_id
    if (!(is.null(individual_id))){
        individual_ids <- transform_id(individual_id)
        url <- paste0(domain,"/",entry_point, "/", type,"?individualIds=",individual_ids)
        encoded_url <- URLencode(url)
        attempt::try_catch(
            res_3 <- extract_beacon_query(encoded_url, type, dataset),.e= function(e){
                warning("\n Query fails for individual_id ", paste(individual_ids,collapse = ","), "\n")
            }
        )
        if (exists('res_3')){res <- rbind(res,res_3)} 
    }
    
    if (length(res) == 0) stop("No data retrieved")
    
    rownames(res) <- seq(dim(res)[1])
    if (codematches){
        idx <- rownames(res)
        if (type == "biosamples"){
            idx <- res$biosample_id %in% biosample_id | res$individual_id %in% individual_id | 
            res$histological_diagnosis_id %in% filters | res$sampled_tissue_id %in% filters | 
            res$icdo_morphology_id %in% filters | res$icdo_topography_id %in% filters 
        } else if (type == "individuals"){
            idx <- res$histological_diagnosis_id %in% filters | res$individual_id %in% individual_id
            if (!is.null(biosample_id)){
              warning("\n The option `codematches=TRUE` filters out samples accessed by biosample_id \n")
            }
        }       
        res <- res[idx,]
        if (dim(res)[1] == 0){
            warning("\n The option `codematches=TRUE` filters out all samples \n")
        }
    }
    
    res <- res[!duplicated(res),]
    return(res)
}

# utility function for transforming variants data -------------------------

## beacon response

read_variant_beacon <- function(biosample_id, domain, entry_point, dataset){
    url <- paste0(domain,"/",entry_point, "/g_variants","?biosampleIds=",biosample_id)
    encoded_url <- URLencode(url)

    # make query not broken
    result <- NA
    attempt::try_catch(
    result <- extract_beacon_query(encoded_url, type="g_variants", dataset),.e= function(e){NULL}
    )        
    if (is.null(result)) return(result)
    if (all(is.na(result))) return(result)

    # variantInternalId in progenetix is interval-based and used for sorting
    if (domain %in% c("http://progenetix.org","progenetix.org","https://cancercelllines.org","cancercelllines.org")){
        # change interval order 
        location <- strsplit(result$variant_internal_id,split = ':')
        start <- sapply(location, function(x){x[2]})
        start <- as.numeric(gsub('-.*','',start))
        result <- result[order(start),]
        location <- strsplit(result$variant_internal_id,split = ':')
        chr <-  sapply(location, function(x){x[1]})
        chr[chr == 'X'] <- 23
        chr[chr == 'Y'] <- 24
        chr <- as.numeric(chr)
        result <- result[order(chr),]
    }

    # in case one sample corresponds to multiple analysis
    result <- result[order(result$analysis_id),]
    return(result)
}

## exported pgxseg data by bycon service 

read_variant_pgxseg <- function(biosample_id, domain){
    url <- paste0(domain,"/services/pgxsegvariants/?biosampleIds=",biosample_id)
    encoded_url <- URLencode(url)
    
    seg <- read.table(encoded_url, header = TRUE, sep="\t",quote="")
    if (dim(seg)[1] == 0) return(NA)
    col <- c("start","end","log2")
    suppressWarnings(seg[,col] <- sapply(seg[,col], as.numeric))
    seg <- seg[order(seg$start),]
    chr <- seg[,2]
    chr[which(chr == 'X')] <- 23
    chr[which(chr == 'Y')] <- 24
    chr <- as.integer(chr)
    seg <- seg[order(chr),]
    seg <- seg[order(seg$biosample_id),]

    meta <- readLines(encoded_url)
    idx <- length(grep("#",meta))
    meta <- meta[seq_len(idx)]

    return(list(seg=seg, meta=meta))
}

# function to query variants ----------------------------------------------

pgxVariantLoader <- function(biosample_id, output, save_file, filename, domain, entry_point, dataset, num_cores){
    num_cores <- min(num_cores, parallel::detectCores() - 1)
    future::plan(future::multisession,workers = num_cores)
    
    if (!(is.null(output))){
        check_pgx_domain(domain, "Variant data in non-beacon format")
        results <- future.apply::future_lapply(biosample_id,FUN = function(i){read_variant_pgxseg(i, domain)})
    }else{
        results <- future.apply::future_lapply(biosample_id,FUN = function(i){read_variant_beacon(i, domain, entry_point, dataset)})
    }
 
    fail_idx <- which(is.na(results))
    if (length(fail_idx) == length(results)) stop("Query fails for all samples")
    if (length(fail_idx > 0)) warning("\n Query fails for biosample_id ", paste(biosample_id[fail_idx],collapse = ','), "\n")
    
    results[is.na(results)] <- NULL
    
    if (!(is.null(output))){
      meta <- lapply(results,FUN= function(x){x[["meta"]]})
      head <- meta[[1]][1:2]
      meta <- lapply(meta, FUN = function(x){return(x[-c(1,2,3)])})
      meta <- do.call(c,meta)
      meta <- c(head,meta)
      results <- lapply(results,FUN= function(x){x[["seg"]]})
    }

    results <- do.call(rbind, results)
    # if the query succeed but no data in database
    if (is.null(results)) stop("No data retrieved")
    # quality check
    results <- results[results$biosample_id %in% biosample_id,]
    rownames(results) <- seq(nrow(results))
    
    # format conversion
    if (!(is.null(output))){
      if (output == 'seg'){
        results <- results[,c(1,2,3,4,6,5)]
      }
    }
    
    if (save_file){
      # pgxseg format
        if (!is.null(output)){
          if (output=='pgxseg'){
            # write result
            write.table(meta, file=filename,row.names = FALSE,col.names = FALSE, quote = FALSE)
            suppressWarnings(write.table(results, append=TRUE, sep='\t',file=filename,row.names = FALSE,col.names = TRUE, quote = FALSE))
          } 
      # tsv format
        } else {
            write.table(results, file=filename, sep='\t',row.names = FALSE,col.names = TRUE, quote = FALSE)
        } 
        message("\n The file is saved \n")
        return()
    }

    return(results)
}

# function to query cnv frequency -----------------------------------------

pgxFreqLoader <- function(output, filters, domain) {
    check_pgx_domain(domain, "CNV frequency data")

    # start query
    url <- paste0(domain,"/services/intervalFrequencies?output=",output)
    url <- add_parameter(url,"filters",transform_id(filters))
    encoded_url <- URLencode(url)

    # access data 
    pg.data  <- read.table(encoded_url, header=TRUE, sep="\t",,quote="")
    colnames(pg.data)[1] <- 'filters'

    filter_exist <- filters %in% unique(pg.data[,1])
    if (all(!filter_exist)) stop("No data retrieved")
    if (any(!filter_exist)) warning("\n Query fails for the filters ", paste(filters[!filter_exist],collapse = ','),"\n")
    filters <- filters[filter_exist]

    # access metadata from JSON output
    meta <- readLines(encoded_url)[seq_len(length(filters))+1]
    meta_lst <- unlist(strsplit(meta,split = ';'))
    label <- meta_lst[grep("label",meta_lst)]
    label<- gsub('.*=','',label)
    count <- meta_lst[grep("sample_count",meta_lst)]
    count <- as.numeric(gsub('.*=','',count))
    meta <- data.frame(filter = filters, label, sample_count=count)
    
    # convert to GRangesList
    if (output == 'pgxfreq'){
      colnames(pg.data)[2] <- 'chr'
      data_lst <- GenomicRanges::makeGRangesListFromDataFrame(pg.data,split.field = 'filters',keep.extra.columns=TRUE)
      S4Vectors::mcols(data_lst) <- meta
    
    # convert to RangedSummarizedExperiment
    } else if (output == "pgxmatrix"){
        frequency <- as.data.frame(t(pg.data[,-1]))
        rowRanges <- rownames(frequency)
        rowRanges <- lapply(rowRanges,function(x){
        range_str <- unlist(strsplit(x,split = '\\.'))
        chr <- range_str[1]
        chr <- gsub("X","",chr)
        if (chr == "") chr <- 'X'
        return(data.frame(chr=chr,start=range_str[2],end = range_str[3],type=range_str[4]))
      })
      rowRanges <- do.call(rbind, rowRanges)
      rowRanges <- GenomicRanges::GRanges(rowRanges)
      names(rowRanges) <- seq_len(dim(frequency)[1])
      
      colnames(frequency) <- pg.data[,1]
      rownames(frequency) <- seq_len(dim(frequency)[1])
      
      data_lst <- SummarizedExperiment::SummarizedExperiment(assays=list(cnv_frequency=frequency),
                           rowRanges=rowRanges, colData=meta)
    }
    
    return(data_lst)
}

# utility function for transforming cnv fraction data ---------------------

read_cnvstats_json <- function(url,codematches=FALSE,all_biosample_id=NULL){
    encoded_url <- URLencode(url)
    data <- content(GET(encoded_url))
    # this is progenetix/cellz specific data format, so the length of dataset is 1 and this function doesn't require dataset parameter  
    sample <- unlist(lapply(data$response$resultSets[[1]]$results, function(x){
    return(x$biosampleId)
    }))

    # filter with exact code match
    if (codematches){
    sel_sample_idx <- sample %in% all_biosample_id 
    if (sum(sel_sample_idx) == 0 & length(sample) > 0){
      warning("\n The option `codematches=TRUE` filters out all samples accessed by filters \n")
      return()
    }
    } else{
    sel_sample_idx <- seq(length(sample))
    }

    # extract fraction
    data_2 <- lapply(data$response$resultSets[[1]]$results[sel_sample_idx], function(x){
    stats <- lapply(x$cnvChroStats, function(y){
      as.data.frame(y)
    })
    cnvstats <- Reduce(rbind,stats)
    rownames(cnvstats) <- names(stats)
    res <- list()
    res[[1]] <- cnvstats
    res[[2]] <- as.data.frame(x$cnvStats)
    res[[3]] <- x$id
    return(res)
    })

    # some results are SNV instead of CNV
    total_frac <- lapply(data_2, function(x){x[[2]]})
    rm_idx <- which(sapply(total_frac,length) == 0)
    if (length(rm_idx) > 0) data_2 <- data_2[-rm_idx]
   
    analyses_ids <- sapply(data_2, function(x){x[[3]]})

    # whole genome CNV fraction

    total_frac <- Reduce(rbind, total_frac)
    rownames(total_frac) <- analyses_ids
    total_frac <- total_frac[,c(2,6,4)]

    # chromosome & chromosome arm CNV fraction
    data_2 <- lapply(data_2,function(x){x[[1]]})

    chrom_arm_list <- c()
    for ( i in c(seq(22),'x','y')){
    chrom_arm_list <- c(chrom_arm_list, paste0(i,'p'), paste0(i,'q'))
    }
    chrom_arm_idx <- match(chrom_arm_list, rownames(data_2[[1]]))

    chrom_list <- c(seq(22),'X','Y')
    chrom_idx <- match(chrom_list, rownames(data_2[[1]]))

    data_3 <- lapply(data_2, function(x){
    val <- c(x$dupfraction[chrom_arm_idx],
             x$delfraction[chrom_arm_idx],
             x$dupfraction[chrom_idx],
             x$delfraction[chrom_idx])
    names <- c(paste0('chr',chrom_arm_list,'.dup'),
               paste0('chr',chrom_arm_list,'.del'),
               paste0('chr',chrom_list,'.dup'),
               paste0('chr',chrom_list,'.del'))
    return( t(data.frame(val,row.names=names)))
    }) 

    data_3 <- Reduce(rbind,data_3)
    rownames(data_3) <- analyses_ids

    arm_frac <- data_3[,seq_len(96)]
    chrom_frac <- data_3[,c(97:144)]

    result <- list()
    result$arm_cnv_frac <- arm_frac
    result$chr_cnv_frac <- chrom_frac
    result$genome_cnv_frac <- total_frac
    return(result)
}

# function to query cnv fraction ------------------------------------------

pgxFracLoader <- function(biosample_id, individual_id, filters, codematches, skip, limit, domain){
    check_pgx_domain(domain, "CNV fraction data")
  
    pg.data <- list()
    if (!is.null(filters)){
        url <- paste0(domain,"/services/cnvstats/?filters=",transform_id(filters))
        url <- add_parameter(url,"limit",limit)
        url <- add_parameter(url,"skip",skip)

        if (codematches){
          suppressWarnings(all_biosample_id <- pgxmetaLoader(type = 'biosamples', 
                                                             biosample_id = NULL,individual_id = NULL,
                                                             filters = filters,codematches = TRUE,
                                                             skip = NULL,limit = 0,domain = domain,entry_point = "beacon",dataset = NULL))
          all_biosample_id <- all_biosample_id$biosample_id    
        }
        pg.data[[1]] <- read_cnvstats_json(url,codematches,all_biosample_id)
    } 
  
    if (!is.null(biosample_id)){
        url  <- paste0(domain,"/services/cnvstats/?biosampleIds=",transform_id(biosample_id))
        pg.data[[2]] <- read_cnvstats_json(url)
    }
    
    if (!is.null(individual_id)){
        url <- paste0(domain,"/services/cnvstats/?individualIds=",transform_id(individual_id))
        pg.data[[3]] <- read_cnvstats_json(url)
    }
    
    result <- list()
    result$arm_cnv_frac <- do.call(rbind,lapply(pg.data,function(x){return(x$arm_cnv_frac)}))
    result$chr_cnv_frac <- do.call(rbind,lapply(pg.data,function(x){return(x$chr_cnv_frac)}))
    result$genome_cnv_frac <- do.call(rbind,lapply(pg.data,function(x){return(x$genome_cnv_frac)}))
    
    return(result)
}

# function to query sample callset -----------------------------

pgxcallsetLoader <- function(biosample_id, individual_id, filters, limit, skip, codematches, domain){
    check_pgx_domain(domain, "pgxmatrix data")
  
    pg.data <- list()

    if (!is.null(filters)){
        url  <- paste0(domain,"/services/samplematrix/?filters=",transform_id(filters))
        url <- add_parameter(url,"limit",limit)
        url <- add_parameter(url,"skip",skip)        
        encoded_url <- URLencode(url)

        pg.data[[1]] <- read.table(encoded_url, header=TRUE, sep="\t",quote="")
        ori.dim <- dim(pg.data[[1]])[1]
        if (codematches){
            pg.data[[1]] <- pg.data[[1]][pg.data[[1]]$group_id %in% filters,]
        if (ori.dim > 0 & dim(pg.data[[1]])[1] == 0){
            warning("\n The option `codematches=TRUE` filters out all samples accessed by filters \n")
        }}
    } 
    
    if (!is.null(biosample_id)){
        url  <- paste0(domain,"/services/samplematrix/?biosampleIds=",transform_id(biosample_id))
        encoded_url <- URLencode(url)
        pg.data[[2]] <- read.table(encoded_url, header=TRUE, sep="\t",quote="")
    }  
    
    if (!is.null(individual_id)){
      url  <- paste0(domain,"/services/samplematrix/?individualIds=",transform_id(individual_id))
      encoded_url <- URLencode(url)
      pg.data[[3]] <- read.table(encoded_url, header=TRUE, sep="\t",quote="")
    }

    pg.data <- do.call(rbind,pg.data)
    # remove automatic prefix X 
    colnames(pg.data) <- gsub("X","",colnames(pg.data))
    # add chr prefix to avoid colnames with numeric start
    colnames(pg.data)[4:ncol(pg.data)] <- paste0("chr",colnames(pg.data)[4:ncol(pg.data)])
    # recover X chr    
    colnames(pg.data) <- gsub("chr\\.","chrX\\.",colnames(pg.data))

     # convert to RangedSummarizedExperiment
   
    callset <- as.data.frame(t(pg.data[,-c(1,2,3)]))
    rowRanges <- colnames(pg.data)[4:ncol(pg.data)]
    rowRanges <- lapply(rowRanges,function(x){
        range_str <- unlist(strsplit(x,split = '\\.'))
        chr <- range_str[1]
        chr <- gsub("X","",chr)
        if (chr == "") chr <- 'X'
        return(data.frame(chr=chr,start=range_str[2],end = range_str[3],type=range_str[4]))
        })
    rowRanges <- do.call(rbind, rowRanges)
    rowRanges <- GenomicRanges::GRanges(rowRanges)
    names(rowRanges) <- seq_len(dim(callset)[1])
      
    colnames(callset) <- pg.data$analysis_id
    rownames(callset) <- seq_len(dim(callset)[1])
      
    result <- SummarizedExperiment::SummarizedExperiment(assays=list(cnv_matrix=callset),
                           rowRanges=rowRanges, colData=pg.data[,c(1,2,3)])
    
    return(result)
}

# function to query sample count -----------------------------

pgxCount <- function(filters=NULL,domain="http://progenetix.org"){
    check_pgx_domain(domain, "sample count information")
    
    filter <- transform_id(filters)
    url <- paste0(domain,"/services/collations?filters=",filter)
    encoded_url <- URLencode(url)
    info <-  content(GET(url))
    res <- lapply(info$response$results, function(x){
        if (is.null(x$label)) x$label <- NA
        df <- data.frame(filters=x$id,label=x$label,total_count=x$count,exact_match_count=x$codeMatches)
    })
    
    res <- Reduce(rbind,res)
    if (length(res) == 0) stop("No data retrieved")
  
    return (res)
}
progenetix/pgxRpi documentation built on Sept. 14, 2024, 2:21 p.m.