SecretSanta: flexible pipelines for secretome prediction

Documented in topcons

# helper function to extract number of TM domains or check for presence of SP 
# based on stand-alone TOPCONS predictions
# sample input string: ('iiioooMMMMMMMMMMMMMMMMooooMMMMMMMiiioooo')

stretch_parse <- function(str, core_pattern){
    runs <- paste(rle(strsplit(str, "")[[1]])$values, collapse="")
    stringr::str_count(runs, core_pattern)
}

#' parse TOPCONS output 
#' 
#' This function parses results of TOPCONS method, used to predict transmembrane domains. The parser is restricted to 'piper'-like behaviour.\cr
#' @param input_obj an instance of CBSResult class, produced by previous step;
#' @param TM  allowed number of TM domains in mature peptides,
#' recommended value <= 1; use TM = 0 for strict filtering
#' @param SP filter according to TOPCONS prediction of signal peptides. TRUE - keep only proteins containg signal peptides according to TOPCONS; FALSE - disable filtering for TOPCONS-based predictions of signal peptides. Deafault = FALSE.
#' @param parse_dir dir with archived topcons output (for WSDL-API or WEB outputs)
#' or dir with output produced by stand-alone version.
#' @param topcons_mode
#' \itemize{
#' \item    WEB - output from TOPCOS2 web-server \url{http://topcons.net/}
#' \item    WSDL-API - output returned by WSDL API script \url{http://topcons.net/pred/help-wsdl-api/}
#' \item    stand-alone - results of TOPCONS stand-alone version \url{https://github.com/ElofssonLab/TOPCONS2}
#' }
#' @export
#' @return TopconsResult object
#' @examples 
#' aa <- readAAStringSet(system.file("extdata", "sample_prot_100.fasta",
#'            package = "SecretSanta"), use.names = TRUE)

#' inp <- CBSResult(in_fasta = aa)
#' # predict signal peptides:
#' sp <- signalp(inp, version = 2, organism = 'euk', run_mode = "starter",
#' legacy_method = 'hmm')
#' # integrate topcons predictions
#' p_dir <- system.file("extdata", "rst_SVw4hG.zip", package = "SecretSanta")
#' tpc <- topcons(input_obj = sp, parse_dir = p_dir, topcons_mode = "WEB-server",
#'               TM = 0, SP = TRUE)
               
               
topcons <- function(input_obj,
                    parse_dir,
                    topcons_mode = c('API', 'WEB-server', 'stand-alone'),
                    TM,
                    SP = FALSE){
    
    # ----- Check the input parameters:
    
    # check that path to zipped output is provided:
    
    if (missing(parse_dir)) {stop('Missing argument: parse_dir.')}
    
    
    # check topcons mode
    if (missing(topcons_mode)) {stop('Missing argument: topcons_mode')}
    topcons_mode <- match.arg(topcons_mode)
    
    # check that topcons output file exists:
    
    if ((topcons_mode %in% c('API', 'WEB-server')) & (!file.exists(parse_dir))) {
        stop('Please provide valid path to the zipped TOPCONS output.')
    }

    
    if (topcons_mode == 'stand-alone') {
        if (!(file.exists(paste(parse_dir, 'time.txt', sep = '/')))) {
            stop('Could not find "time.txt" file, please check the path.')
        }
        if (!(any(grepl('seq_', list.files(parse_dir))))) {
            stop('Could not find "seq_*" subdirectories, please check the path.')
        }
    }
    
    # check that input object belongs to a valid class,
    if (!(is(input_obj, "CBSResult"))){
        stop('Input object does not belong to CBSResult class.')
    } 


    # check TM threshold value
    if (!(is.numeric(TM))) stop('TM argument must be numeric.')
    if (TM >= 2) warning(
        'Recommended TM threshold value for secreted peptides is 0.')
    
    # check that input_obj contains non-empty out_fasta slot
    if (length(getOutfasta(input_obj)) != 0) {
            fasta <- getOutfasta(input_obj)
        } else {
            stop('out_fasta slot is empty.')
        }
    
    if (!(is.logical(SP))) {stop('SP argument must be a logical.')}

    # All checked, produce an encouragig/status messages
    
    message(paste("Running topcons parser for", topcons_mode, "format ..."))
    
    if (SP == FALSE) {
        message('Verify SP predictions ... YES')
    } else {
        message("Verify SP predictions ... NO")
    }
       
    message(paste('TM domains allowed ... '), TM)
    
    #essential fucntion to run simple topcons parser:
    
    parse_topcons <- function(dir_to_parse) {
        
        # parser bifurcation based on input format:
        
        if (topcons_mode == "API" | topcons_mode == "WEB-server") {
        
        # first, unzip the archive
        rst_id <- strsplit(basename(dir_to_parse), split = '.zip')[[1]]
        unzp <- unzip(zipfile = dir_to_parse, 
                      exdir = paste(dirname(dir_to_parse),
                                    '/',
                                    rst_id,
                                    sep = ''))
        # filter for file name with summary stats:
        unzp <- unzp[grepl('finished_seqs.txt', unzp)]
        topcons_tibble <- tibble::as.tibble(read.table(unzp,
                                                       sep = '\t',
                                                       header = FALSE,
                                                       quote = ""))
        
        # remove unpacked dir
        unlink(paste(dirname(dir_to_parse), '/', rst_id, sep = ''),
               recursive = TRUE)
        
        names(topcons_tibble) <- c('seq', 'length', 'TM',
                                   'SP', 'source', 'run_time',
                                   'gene_id')

        # crop names, to remove additional annotations:
        topcons_tibble$gene_id <- sapply(topcons_tibble$gene_id, crop_names,
                                         USE.NAMES = FALSE)
        }
        
        # stand-alone mode does not produce summary table
        # we need to collate/extract relevant fields from isolated files and 
        # dirs to match WEB/API format
        
        if (topcons_mode == 'stand-alone') {

            # extract seq_ids and run times from time.txt
            t_file <- read.table(paste(dir_to_parse, '/time.txt', sep = ''),
                                 sep = ';')
            # crop names, just in case
            gene_id <- sapply(t_file$V1, crop_names)
            run_times <- t_file$V2
            
            # extract lengths from seq_*/dg.txt files -> combine
            
            dirs <- list.files(path = dir_to_parse, pattern = "seq_")
            
            # make sure that ordering of seq_folders is correct, we don't need the
            # default ordering
            vals <- as.numeric(gsub("seq_", "", dirs))
            dirs <- dirs[order(vals)]
            
            # exact paths to dg.txt files
            dgs  <- sapply(dirs, function(x) paste(dir_to_parse,
                                                   x,
                                                   'dg.txt',
                                                   sep = '/'))
            
            # helper function to extract length field w/o reading the whole file
            get_length <- function(dg_path) {
                system(paste('cat', dg_path, "| grep SeqLength: | awk '{print $2}'"),
                       intern = TRUE) %>% as.numeric()
            }
            
            lengths <- sapply(dgs, get_length)
                    

            # extract number of TM and SP domains
            
            tops <- sapply(dirs, function(x) paste(dir_to_parse,
                                                        x,
                                                        'Topcons/topcons.top',
                                                        sep = '/'))
                
            # helper function to extract raw preds in correct order:
            get_preds <- function(top_path) {
                system(paste("cat", top_path, sep = ' '), intern = TRUE)
            }
            
            raw_preds <- sapply(tops, get_preds)
            
            # extract number of TM domains:
            TM_num <- sapply(raw_preds, function(x) stretch_parse(x, 'M'),
                         USE.NAMES = FALSE)
            
            SP_if <- sapply(raw_preds, function(x) stretch_parse(x, 'S'),
                         USE.NAMES = FALSE) %>% as.logical()
            
            # assemble the tibble now
            topcons_tibble <- tibble::tibble('seq' = dirs,
                                             'length' = lengths,
                                             'TM' = TM_num,
                                             'SP' = SP_if,
                                             'source' = 'stand-alone',
                                             'run_time' = run_times,
                                             'gene_id' = gene_id
                                             )
            
        }
    
        # ----filter the tibble and assemble result object
    
        # keep gene_ids only present in the input object
        topcons_tibble <- topcons_tibble[topcons_tibble$gene_id %in% names(fasta),]
    
        # filter based on TM threshold
        topcons_tibble <- (topcons_tibble %>% dplyr::filter_( ~ TM <= TM))
    
        # filter based on SP threshold
        if (SP == TRUE) {
            topcons_tibble <- topcons_tibble %>% dplyr::filter_( ~ SP == 'True')
        }
    
        message(paste('Number of result proteins ...', nrow(topcons_tibble)))
        # assemble TOPCONS object
    
        out_obj <- TopconsResult(top_tibble = topcons_tibble)
        out_obj <- setInfasta(out_obj, in_fasta = fasta)
        out_obj <- setOutfasta(out_obj, out_fasta = fasta[topcons_tibble$gene_id])
    
        if (validObject(out_obj)) { return(out_obj)}

    }
    
    parse_topcons(parse_dir)

}