R/DataPreparation_NetMHC.R

Defines functions NetMHC_Import NetMHC_Script NetMHC_PeptideExport

Documented in NetMHC_Import NetMHC_PeptideExport NetMHC_Script

#' Utility functions for NetMHC.
#'
#' To use the scripts generated by \code{NetMHC_Script}, NetMHC4.0 and NetMHCIIpan3.2 should be installed locally.
#'
#' @param peptideSet A set of peptide sequences.
#' @param outDir A path to the output directory in which peptides will be exported in FASTA format.
#' @param maxPeptides An integer indicating the maximum number of peptides in each FASTA files. Set \code{NULL} to disable chunking.
#' @param netMHCOutputFileNames A set of tab-delimited NetMHC result files.
#' @param MHCType A character string. "MHCI" or "MHCII".
#' @export
#' @rdname DataPreparation_NetMHC
#' @name DataPreparation_NetMHC
NetMHC_PeptideExport <- function(peptideSet, outDir="./NetMHC/", maxPeptides=NULL){
  dir.create(outDir, showWarnings=F, recursive=T)
  peptList <- split(peptideSet, nchar(peptideSet))
  lengList <- names(peptList)
  if(is.null(maxPeptides)){
    peptList <- lapply(peptList, list)
  }else{
    peptList <- lapply(peptList, BBmisc::chunk, chunk.size=maxPeptides)
  }
  nchunkList <- sapply(peptList, length)
  peptList <- purrr::flatten(peptList)
  names(peptList) <- unlist(mapply(function(x, y){paste0("Peptides_", x, "mer_", 1:y, ".fasta")}, lengList, nchunkList, SIMPLIFY=F))
  mapply(
    function(peptSeq, name){seqinr::write.fasta(as.list(peptSeq), peptSeq, file.path(outDir, name))},
    peptList, names(peptList), SIMPLIFY=F
  )
  return(TRUE)
}

#' @export
#' @rdname DataPreparation_NetMHC
#' @name DataPreparation_NetMHC
NetMHC_Script <- function(outDir="./NetMHC/"){
  # MHCI
  hla_string <- "HLA-A0101,HLA-A0201,HLA-A0301,HLA-A2402,HLA-A2601,HLA-B0702,HLA-B0801,HLA-B2705,HLA-B3901,HLA-B4001,HLA-B5801,HLA-B1501"
  script_single <- function(FASTAFileName){
    paste0(
      "./netMHC -a ", hla_string,
      " -l ", gsub("mer", "", unlist(strsplit(FASTAFileName, "_"))[2]),
      " -xls -xlsfile ./Peptides/", gsub("fasta", "xls", FASTAFileName),
      " -f ./Peptides/", FASTAFileName
    )
  }
  FASTAFileNames <- list.files(path=outDir, pattern="^Peptides_.+fasta$", full.names=F)
  FASTAFileNames <- FASTAFileNames[stringr::str_detect(FASTAFileNames, paste0(paste0("_", 8:11, "mer"), collapse="|"))]
  scripts <- unlist(lapply(FASTAFileNames, script_single))
  writeLines(scripts, file.path(outDir, "NetMHC_Script_MHCI.txt"), sep="\n")

  # MHCII
  hla_string <- "DRB1_0101,DRB3_0101,DRB4_0101,DRB5_0101,HLA-DPA10103-DPB10101,HLA-DQA10101-DQB10201"
  script_single <- function(FASTAFileName){
    paste0(
      "./netMHCIIpan -a ", hla_string,
      " -length ", gsub("mer", "", unlist(strsplit(FASTAFileName, "_"))[2]),
      " -xls -xlsfile ./Peptides/", gsub("fasta", "xls", FASTAFileName),
      " -f ./Peptides/", FASTAFileName
    )
  }
  FASTAFileNames <- list.files(path=outDir, pattern="^Peptides_.+fasta$", full.names=F)
  FASTAFileNames <- FASTAFileNames[stringr::str_detect(FASTAFileNames, paste0(paste0("_", 11:30, "mer"), collapse="|"))]
  scripts <- unlist(lapply(FASTAFileNames, script_single))
  writeLines(scripts, file.path(outDir, "NetMHC_Script_MHCII.txt"), sep="\n")

  return(TRUE)
}

#' @export
#' @rdname DataPreparation_NetMHC
#' @name DataPreparation_NetMHC
NetMHC_Import <- function(netMHCOutputFileNames, MHCType=c("MHCI", "MHCII")){
  if(MHCType=="MHCI"){
    hlaSuperTypeSet <- c("A01","A02","A03","A24","A26","B07","B08","B27","B39","B44","B58","B62")
    thr_w <- 2
    thr_s <- 0.5
  }
  if(MHCType=="MHCII"){
    hlaSuperTypeSet <- c("DRB1","DRB3","DRB4","DRB5","DP","DQ")
    thr_w <- 10
    thr_s <- 2
  }
  dt_netmhc <- data.table::rbindlist(pbapply::pblapply(netMHCOutputFileNames, data.table::fread))
  dt_netmhc <- unique(dt_netmhc, by="ID", fromLast=F)
  dt_netmhc <- dt_netmhc[!stringr::str_detect(Peptide, "X"), ]
  dt_netmhc <- dt_netmhc[,which(colnames(dt_netmhc) %in% c("Peptide","Rank")), with=F]
  colnames(dt_netmhc) <- c("Peptide", hlaSuperTypeSet)
  df.netmhc <- dt_netmhc %>%
    tidyr::gather(HLARestriction, Rank, -Peptide) %>%
    dplyr::mutate(BinderCategory=factor(dplyr::if_else(Rank<=thr_w, 1, 0) + dplyr::if_else(Rank<=thr_s, 1, 0), 
                                        levels=c(0, 1, 2),
                                        labels=c("NonBinder", "WeakBinder", "StrongBinder")))
  df.netmhc.rank <- df.netmhc %>%
    dplyr::select(-BinderCategory) %>%
    tidyr::spread(HLARestriction, Rank)
  df.netmhc.bind <- df.netmhc %>%
    dplyr::mutate(BinderCategory=as.numeric(BinderCategory)) %>%
    dplyr::select(-Rank) %>%
    tidyr::spread(HLARestriction, BinderCategory)
  list("DF"=df.netmhc, "DF_Rank"=df.netmhc.rank, "DF_Bind"=df.netmhc.bind)
}
masato-ogishi/Repitope documentation built on Feb. 14, 2023, 5:47 a.m.