R/Features.R

Defines functions Features Features_CPP Features_PeptDesc

Documented in Features Features_CPP Features_PeptDesc

#' Feature computation.
#'
#' \code{Features_PeptDesc} computes peptide descriptors.\cr
#' \code{Features_CPP} computes descriptive statistics of TCR repertoire-wide contact potential distributions.\cr
#' \code{Features} is a wrapper function.
#'
#' @param peptideSet A set of peptide sequences.
#' @param fragLib Either the dataframe of fragment libraries generated by \code{CPP_FragmentLibrary} or the path to the fst file.
#' @param aaIndexIDSet A set of AAIndex IDs indicating the AACP scales to be used. Set "all" to shortcut the selection of all available AACP scales.
#' @param fragLenSet A set of sliding window sizes.
#' @param fragDepth A fragment library size. Must be an integer vector of length one.
#' @param fragLibType A string indicating the types of fragment libraries to be used. Must be a character vector of length one.
#' @param featureSet A minimum set of features. Combinations of fragment lengths and AAIndex IDs are internally extracted for calculating CPPs.
#' @param seedSet A set of random seeds.
#' @param coreN The number of cores to be used for parallelization.
#' @param tmpDir Destination directory to save intermediate files.
#' @export
#' @rdname Features
#' @name Feature_Computation
Features_PeptDesc <- function(
  peptideSet,
  fragLenSet=3:8,
  featureSet=NULL,
  coreN=parallel::detectCores(logical=F)
){
  # Start calculation
  set.seed(12345)
  time.start <- proc.time()

  # Working functions
  peptideDescriptor.Batch <- function(peptide){
    unlist(list(
      Peptides::aIndex(peptide),
      Peptides::blosumIndices(peptide),
      Peptides::boman(peptide),
      Peptides::charge(peptide),
      Peptides::crucianiProperties(peptide),
      Peptides::fasgaiVectors(peptide),
      Peptides::hmoment(peptide),
      Peptides::hydrophobicity(peptide),
      Peptides::instaIndex(peptide),
      Peptides::kideraFactors(peptide),
      Peptides::mswhimScores(peptide),
      Peptides::pI(peptide),
      Peptides::protFP(peptide),
      Peptides::vhseScales(peptide),
      Peptides::zScales(peptide)
    ))
  }
  peptideDescriptor.NameSet <- c("AliphaticIndex","BLOSUM1","BLOSUM2","BLOSUM3","BLOSUM4","BLOSUM5","BLOSUM6","BLOSUM7","BLOSUM8","BLOSUM9","BLOSUM10","Boman","Charge","PP1","PP2","PP3","F1","F2","F3","F4","F5","F6","HydrophobicMoment","Hydrophobicity","Instability","KF1","KF2","KF3","KF4","KF5","KF6","KF7","KF8","KF9","KF10","MSWHIM1","MSWHIM2","MSWHIM3","pI","ProtFP1","ProtFP2","ProtFP3","ProtFP4","ProtFP5","ProtFP6","ProtFP7","ProtFP8","VHSE1","VHSE2","VHSE3","VHSE4","VHSE5","VHSE6","VHSE7","VHSE8","Z1","Z2","Z3","Z4","Z5")
  peptideDescriptor.FragStat.Single <- function(peptide, fragLen){
    f <- sapply(1:(max(nchar(peptide))-fragLen+1), function(i){stringr::str_sub(peptide, i, i+fragLen-1)})
    d <- sapply(f[nchar(f)==fragLen], function(s){peptideDescriptor.Batch(s)})
    data.table::data.table(
      "Peptide"=peptide,
      "FragLen"=fragLen,
      "AADescriptor"=peptideDescriptor.NameSet,
      "Min"=matrixStats::rowMins(d),
      "Max"=matrixStats::rowMaxs(d),
      "Mean"=matrixStats::rowMeans2(d),
      "Median"=matrixStats::rowMedians(d)
    )
  }

  # Parallelized calculation of descriptive statistics
  flag1 <- is.null(featureSet)
  flag2 <- length(grep("PeptDesc_", featureSet))>=1
  flag_comp <- flag1==T | (flag1==F & flag2==T)
  if(flag_comp==T){
    parameterDT <- data.table::CJ(peptideSet, fragLenSet) %>%
      magrittr::set_colnames(c("Peptide", "FragLen"))
    cl <- parallel::makeCluster(coreN, type="PSOCK")
    doSNOW::registerDoSNOW(cl)
    sink(tempfile())
    pb <- pbapply::timerProgressBar(max=nrow(parameterDT), style=1)
    sink()
    opts <- list(progress=function(n){pbapply::setTimerProgressBar(pb, n)})
    dt_peptdesc <- foreach::foreach(i=1:nrow(parameterDT), .inorder=F, .options.snow=opts)%dopar%{
      peptideDescriptor.FragStat.Single(parameterDT$"Peptide"[[i]], parameterDT$"FragLen"[[i]])
    } %>% data.table::rbindlist()
    close(pb)
    parallel::stopCluster(cl)
    gc();gc()
    col_id <- c("Peptide", "AADescriptor", "FragLen")
    col_val <- setdiff(colnames(dt_peptdesc), col_id)
    dt_peptdesc <- data.table::melt.data.table(dt_peptdesc, id=col_id, measure=col_val, variable.name="Stat", value.name="Value")
    dt_peptdesc[,"Feature":=paste0("PeptDesc_", AADescriptor, "_", Stat, "_", FragLen)][,"AADescriptor":=NULL][,"FragLen":=NULL][,"Stat":=NULL]
    dt_peptdesc <- data.table::dcast.data.table(dt_peptdesc, Peptide~Feature, value.var="Value", fun=mean)
  }else{
    dt_peptdesc <- data.table::data.table("Peptide"=peptideSet)
  }

  # Basic peptide characteristics
  dt_basic <- data.table::data.table("Peptide"=peptideSet, "Peptide_Length"=nchar(peptideSet))
  for(aa in Biostrings::AA_STANDARD){
    dt_basic[,paste0("Peptide_Contain", aa):=as.numeric(stringr::str_detect(peptideSet, aa))]
  }
  dt_peptdesc <- merge(dt_basic, dt_peptdesc, by="Peptide")
  data.table::setorder(dt_peptdesc, Peptide)

  # Minimum set of features (optional)
  if(!is.null(featureSet)){
    featureSet <- intersect(colnames(dt_peptdesc), featureSet)
    dt_peptdesc <- dt_peptdesc[, c("Peptide", featureSet), with=F]
  }

  # Finish the timer
  time.end <- proc.time()
  message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")

  # Clear logs
  rm(list=setdiff(ls(), c("dt_peptdesc")))
  gc();gc()

  # Output
  return(dt_peptdesc)
}

#' @export
#' @rdname Features
#' @name Feature_Computation
Features_CPP <- function(
  peptideSet,
  fragLib,
  aaIndexIDSet="all",
  fragLenSet=3:8,
  fragDepth=10000,
  fragLibType=c("Deduplicated", "Weighted", "Mock"),
  featureSet=NULL,
  seedSet=1:5,
  coreN=parallel::detectCores(logical=F),
  tmpDir=file.path(tempdir(), "FeatureDF", format(Sys.time(), "%Y.%m.%d.%H.%M.%S"))
){
  # Temporary directory
  dir.create(tmpDir, showWarnings=F, recursive=T)

  # Start calculation
  set.seed(12345)
  time.start <- proc.time()

  # Pairwise contact potential matrix
  AACPMatrixList <- CPP_AACPMatrix()
  if(identical(aaIndexIDSet, "all")){
    aaIndexIDSet <- names(AACPMatrixList)
  }else{
    aaIndexIDSet <- sort(unique(c(aaIndexIDSet, stringr::str_replace(paste0(aaIndexIDSet, "inv"), "invinv", "inv"))))
    AACPMatrixList <- AACPMatrixList[aaIndexIDSet]
  }

  # Fragment library for pairwise alignment
  if(is.character(fragLib)) fragLib <- fst::read_fst(fragLib, as.data.table=T)
  condSet <- apply(data.table::CJ("Library"=fragLibType, "FragLen"=as.character(fragLenSet), "Seed"=as.character(seedSet)), 1, paste0, collapse="_")
  fragLib_temp <- fragLib[,condSet,with=F]
  fragLib_temp <- fragLib_temp[1:fragDepth,]
  for(s in seedSet){
    set.seed(s)
    condSet <- apply(data.table::CJ("Library"=fragLibType, "FragLen"=as.character(fragLenSet), "Seed"=as.character(s)), 1, paste0, collapse="_")
    for(col in condSet){
      fragLib_temp[[col]] <- sample(fragLib[[col]], size=fragDepth)
    }
  }

  # Parameter sets for contact potential profiling
  if(is.null(featureSet)){
    parameterDT <- data.table::CJ(peptideSet, aaIndexIDSet, fragLenSet, fragDepth, fragLibType) %>%
      magrittr::set_colnames(c("Peptide","AAIndexID","FragLen","FragDepth","Library"))
    data.table::setorder(parameterDT)
  }else{
    cjdt <- function(a,b){
      cj = data.table::CJ(1:nrow(a), 1:nrow(b))
      cbind(a[cj[[1]],], b[cj[[2]],])
    }
    featureDT <- strsplit(grep("^CPP_", featureSet, value=T), "_") %>%
      data.table::as.data.table() %>%
      data.table::transpose() %>%
      dplyr::select(V2, V4) %>%
      magrittr::set_colnames(c("AAIndexID", "FragLen")) %>%
      dplyr::distinct(.keep_all=T)
    featureDT[,FragLen:=as.integer(FragLen)]
    parameterDT <- data.table::CJ(peptideSet, fragDepth, fragLibType) %>%
      magrittr::set_colnames(c("Peptide","FragDepth","Library"))
    parameterDT <- cjdt(parameterDT, featureDT)
    data.table::setcolorder(parameterDT, c("Peptide","AAIndexID","FragLen","FragDepth","Library"))
    data.table::setorder(parameterDT)
  }
  message("Number of parameter combinations = ", nrow(parameterDT))

  # Skip random seed numbers for which feature dataframes were already created
  dt_cpp_filenames <- list.files(pattern="dt_feature_cpp_seed.+fst", path=tmpDir, full.names=F)
  if(length(dt_cpp_filenames)>=1){
    seedSet_done <- as.numeric(gsub("seed", "", gsub(".fst", "", t(as.data.frame(strsplit(dt_cpp_filenames, "_"), fix.empty.names=F))[,4], fixed=T)))
    message(" - Skipping random seeds ", paste0(seedSet_done, collapse=", "))
    seedSet <- setdiff(seedSet, seedSet_done)
  }

  # Conduct contact potential profiling
  statSet <- c("mean","sd","median","trimmed","mad","skew","kurtosis","se","IQR","Q0.1","Q0.9")
  statNameSet <- c("Mean","SD","Med","TrM","MAD","Skew","Kurt","SE","IQR","Q10","Q90")
  for(s in seedSet){
    cat("Random seed = ", s, "\n", sep="")
    cl <- parallel::makeCluster(coreN, type="PSOCK")
    snow::clusterSetupRNGstream(cl, seed=rep(s, 6))
    doSNOW::registerDoSNOW(cl)
    sink(tempfile())
    pb <- pbapply::timerProgressBar(max=nrow(parameterDT), style=1)
    sink()
    opts <- list(progress=function(n){pbapply::setTimerProgressBar(pb, n)})
    dt_cpp <- foreach::foreach(i=1:nrow(parameterDT), .inorder=F, .options.snow=opts)%dopar%{
      pept <- parameterDT[[i,1]]
      aaIndexID <- parameterDT[[i,2]]
      aacpMat <- AACPMatrixList[[aaIndexID]]
      fragLen <- parameterDT[[i,3]]
      libraryID <- paste0(c(fragLibType, fragLen, s), collapse="_")
      fragSet <- fragLib_temp[[libraryID]]
      al <- Biostrings::pairwiseAlignment(
        subject=pept, pattern=fragSet,
        substitutionMatrix=aacpMat,
        type="global-local", gapOpening=100, gapExtension=100,
        scoreOnly=T
      )
      stat <- psych::describe(al, trim=.1, interp=F, skew=T, type=3, ranges=T, IQR=T, quant=c(.10, .90))[statSet]
      stat <- data.table::as.data.table(stat)
      stat[,"Peptide":=pept][,"AAIndexID":=aaIndexID][,"FragLen":=fragLen]
      stat
    } %>%
      data.table::rbindlist()
    colnames(dt_cpp) <- c(statNameSet,"Peptide","AAIndexID","FragLen")
    dt_cpp[,"FragDepth":=fragDepth][,"Library":=fragLibType][,"Seed":=s]
    data.table::setcolorder(dt_cpp, c("Peptide","AAIndexID","FragLen","FragDepth","Library","Seed",statNameSet))
    data.table::setorder(dt_cpp, Peptide)
    fst::write_fst(dt_cpp, file.path(tmpDir, paste0("dt_feature_cpp_seed", s, ".fst")))
    close(pb)
    parallel::stopCluster(cl)
    gc();gc()
  }

  # Final formatting
  message("Data formatting...")
  dt_cpp_filenames <- list.files(pattern="dt_feature_cpp_seed.+fst", path=tmpDir, full.names=T)
  dt_cpp <- data.table::rbindlist(lapply(dt_cpp_filenames, fst::read_fst, as.data.table=T))
  col_id <- c("Peptide", "AAIndexID", "FragLen", "FragDepth", "Library", "Seed")
  col_val <- setdiff(colnames(dt_cpp), col_id)
  peptideSetList <- BBmisc::chunk(peptideSet, chunk.size=5000)
  aggregateMean <- function(dt){
    dt <- data.table::melt.data.table(dt, id=col_id, measure=col_val, variable.name="Stat", value.name="Value")
    dt[,"Feature":=paste0("CPP_", AAIndexID, "_", Stat, "_", FragLen)][,"AAIndexID":=NULL][,"FragLen":=NULL][,"Stat":=NULL]
    dt <- data.table::dcast.data.table(dt, Peptide+FragDepth+Library~Feature, value.var="Value", fun=mean)
    gc();gc()
    return(dt)
  }
  dt_cpp <- data.table::rbindlist(
    pbapply::pblapply(1:length(peptideSetList), function(i){aggregateMean(dt_cpp[Peptide %in% peptideSetList[[i]]])})
  )
  if(!is.null(featureSet)){
    featureSet <- intersect(colnames(dt_cpp), featureSet)
    dt_cpp <- dt_cpp[, c("Peptide", "FragDepth", "Library", featureSet), with=F]
  }

  # Finish the timer
  time.end <- proc.time()
  message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")

  # Clear logs
  rm(list=setdiff(ls(), c("dt_cpp")))
  gc();gc()

  # Output
  return(dt_cpp)
}

#' @export
#' @rdname Features
#' @name Feature_Computation
Features <- function(
  peptideSet,
  fragLib,
  aaIndexIDSet="all",
  fragLenSet=3:8,
  fragDepth=10000,
  fragLibType=c("Deduplicated", "Weighted", "Mock"),
  featureSet=NULL,
  seedSet=1:5,
  coreN=parallel::detectCores(logical=F),
  tmpDir=file.path(tempdir(), "FeatureDF", format(Sys.time(), "%Y.%m.%d.%H.%M.%S"))
){
  message("Peptide contact potential profiling analysis...")
  dt_cpp <- Features_CPP(
    peptideSet=peptideSet,
    fragLib=fragLib,
    aaIndexIDSet=aaIndexIDSet,
    fragLenSet=fragLenSet,
    fragDepth=fragDepth,
    fragLibType=fragLibType,
    featureSet=featureSet,
    seedSet=seedSet,
    coreN=coreN,
    tmpDir=tmpDir
  )
  message("Peptide descriptor analysis...")
  dt_peptdesc <- Features_PeptDesc(
    peptideSet=peptideSet,
    fragLenSet=fragLenSet,
    featureSet=featureSet,
    coreN=coreN
  )
  message("Merging...")
  dt <- merge(dt_peptdesc, dt_cpp, by="Peptide")
  data.table::setorder(dt, Peptide, FragDepth, Library)
  dt[,"FragDepth":=format(FragDepth, trim=T, scientific=F)]
  return(split(dt, by=c("Library", "FragDepth"), keep.by=F))
}
masato-ogishi/Repitope documentation built on Feb. 14, 2023, 5:47 a.m.