#' Feature computation.
#'
#' \code{Features_PeptDesc} computes peptide descriptors.\cr
#' \code{Features_CPP} computes descriptive statistics of TCR repertoire-wide contact potential distributions.\cr
#' \code{Features} is a wrapper function.
#'
#' @param peptideSet A set of peptide sequences.
#' @param fragLib Either the dataframe of fragment libraries generated by \code{CPP_FragmentLibrary} or the path to the fst file.
#' @param aaIndexIDSet A set of AAIndex IDs indicating the AACP scales to be used. Set "all" to shortcut the selection of all available AACP scales.
#' @param fragLenSet A set of sliding window sizes.
#' @param fragDepth A fragment library size. Must be an integer vector of length one.
#' @param fragLibType A string indicating the types of fragment libraries to be used. Must be a character vector of length one.
#' @param featureSet A minimum set of features. Combinations of fragment lengths and AAIndex IDs are internally extracted for calculating CPPs.
#' @param seedSet A set of random seeds.
#' @param coreN The number of cores to be used for parallelization.
#' @param tmpDir Destination directory to save intermediate files.
#' @export
#' @rdname Features
#' @name Feature_Computation
Features_PeptDesc <- function(
peptideSet,
fragLenSet=3:8,
featureSet=NULL,
coreN=parallel::detectCores(logical=F)
){
# Start calculation
set.seed(12345)
time.start <- proc.time()
# Working functions
peptideDescriptor.Batch <- function(peptide){
unlist(list(
Peptides::aIndex(peptide),
Peptides::blosumIndices(peptide),
Peptides::boman(peptide),
Peptides::charge(peptide),
Peptides::crucianiProperties(peptide),
Peptides::fasgaiVectors(peptide),
Peptides::hmoment(peptide),
Peptides::hydrophobicity(peptide),
Peptides::instaIndex(peptide),
Peptides::kideraFactors(peptide),
Peptides::mswhimScores(peptide),
Peptides::pI(peptide),
Peptides::protFP(peptide),
Peptides::vhseScales(peptide),
Peptides::zScales(peptide)
))
}
peptideDescriptor.NameSet <- c("AliphaticIndex","BLOSUM1","BLOSUM2","BLOSUM3","BLOSUM4","BLOSUM5","BLOSUM6","BLOSUM7","BLOSUM8","BLOSUM9","BLOSUM10","Boman","Charge","PP1","PP2","PP3","F1","F2","F3","F4","F5","F6","HydrophobicMoment","Hydrophobicity","Instability","KF1","KF2","KF3","KF4","KF5","KF6","KF7","KF8","KF9","KF10","MSWHIM1","MSWHIM2","MSWHIM3","pI","ProtFP1","ProtFP2","ProtFP3","ProtFP4","ProtFP5","ProtFP6","ProtFP7","ProtFP8","VHSE1","VHSE2","VHSE3","VHSE4","VHSE5","VHSE6","VHSE7","VHSE8","Z1","Z2","Z3","Z4","Z5")
peptideDescriptor.FragStat.Single <- function(peptide, fragLen){
f <- sapply(1:(max(nchar(peptide))-fragLen+1), function(i){stringr::str_sub(peptide, i, i+fragLen-1)})
d <- sapply(f[nchar(f)==fragLen], function(s){peptideDescriptor.Batch(s)})
data.table::data.table(
"Peptide"=peptide,
"FragLen"=fragLen,
"AADescriptor"=peptideDescriptor.NameSet,
"Min"=matrixStats::rowMins(d),
"Max"=matrixStats::rowMaxs(d),
"Mean"=matrixStats::rowMeans2(d),
"Median"=matrixStats::rowMedians(d)
)
}
# Parallelized calculation of descriptive statistics
flag1 <- is.null(featureSet)
flag2 <- length(grep("PeptDesc_", featureSet))>=1
flag_comp <- flag1==T | (flag1==F & flag2==T)
if(flag_comp==T){
parameterDT <- data.table::CJ(peptideSet, fragLenSet) %>%
magrittr::set_colnames(c("Peptide", "FragLen"))
cl <- parallel::makeCluster(coreN, type="PSOCK")
doSNOW::registerDoSNOW(cl)
sink(tempfile())
pb <- pbapply::timerProgressBar(max=nrow(parameterDT), style=1)
sink()
opts <- list(progress=function(n){pbapply::setTimerProgressBar(pb, n)})
dt_peptdesc <- foreach::foreach(i=1:nrow(parameterDT), .inorder=F, .options.snow=opts)%dopar%{
peptideDescriptor.FragStat.Single(parameterDT$"Peptide"[[i]], parameterDT$"FragLen"[[i]])
} %>% data.table::rbindlist()
close(pb)
parallel::stopCluster(cl)
gc();gc()
col_id <- c("Peptide", "AADescriptor", "FragLen")
col_val <- setdiff(colnames(dt_peptdesc), col_id)
dt_peptdesc <- data.table::melt.data.table(dt_peptdesc, id=col_id, measure=col_val, variable.name="Stat", value.name="Value")
dt_peptdesc[,"Feature":=paste0("PeptDesc_", AADescriptor, "_", Stat, "_", FragLen)][,"AADescriptor":=NULL][,"FragLen":=NULL][,"Stat":=NULL]
dt_peptdesc <- data.table::dcast.data.table(dt_peptdesc, Peptide~Feature, value.var="Value", fun=mean)
}else{
dt_peptdesc <- data.table::data.table("Peptide"=peptideSet)
}
# Basic peptide characteristics
dt_basic <- data.table::data.table("Peptide"=peptideSet, "Peptide_Length"=nchar(peptideSet))
for(aa in Biostrings::AA_STANDARD){
dt_basic[,paste0("Peptide_Contain", aa):=as.numeric(stringr::str_detect(peptideSet, aa))]
}
dt_peptdesc <- merge(dt_basic, dt_peptdesc, by="Peptide")
data.table::setorder(dt_peptdesc, Peptide)
# Minimum set of features (optional)
if(!is.null(featureSet)){
featureSet <- intersect(colnames(dt_peptdesc), featureSet)
dt_peptdesc <- dt_peptdesc[, c("Peptide", featureSet), with=F]
}
# Finish the timer
time.end <- proc.time()
message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")
# Clear logs
rm(list=setdiff(ls(), c("dt_peptdesc")))
gc();gc()
# Output
return(dt_peptdesc)
}
#' @export
#' @rdname Features
#' @name Feature_Computation
Features_CPP <- function(
peptideSet,
fragLib,
aaIndexIDSet="all",
fragLenSet=3:8,
fragDepth=10000,
fragLibType=c("Deduplicated", "Weighted", "Mock"),
featureSet=NULL,
seedSet=1:5,
coreN=parallel::detectCores(logical=F),
tmpDir=file.path(tempdir(), "FeatureDF", format(Sys.time(), "%Y.%m.%d.%H.%M.%S"))
){
# Temporary directory
dir.create(tmpDir, showWarnings=F, recursive=T)
# Start calculation
set.seed(12345)
time.start <- proc.time()
# Pairwise contact potential matrix
AACPMatrixList <- CPP_AACPMatrix()
if(identical(aaIndexIDSet, "all")){
aaIndexIDSet <- names(AACPMatrixList)
}else{
aaIndexIDSet <- sort(unique(c(aaIndexIDSet, stringr::str_replace(paste0(aaIndexIDSet, "inv"), "invinv", "inv"))))
AACPMatrixList <- AACPMatrixList[aaIndexIDSet]
}
# Fragment library for pairwise alignment
if(is.character(fragLib)) fragLib <- fst::read_fst(fragLib, as.data.table=T)
condSet <- apply(data.table::CJ("Library"=fragLibType, "FragLen"=as.character(fragLenSet), "Seed"=as.character(seedSet)), 1, paste0, collapse="_")
fragLib_temp <- fragLib[,condSet,with=F]
fragLib_temp <- fragLib_temp[1:fragDepth,]
for(s in seedSet){
set.seed(s)
condSet <- apply(data.table::CJ("Library"=fragLibType, "FragLen"=as.character(fragLenSet), "Seed"=as.character(s)), 1, paste0, collapse="_")
for(col in condSet){
fragLib_temp[[col]] <- sample(fragLib[[col]], size=fragDepth)
}
}
# Parameter sets for contact potential profiling
if(is.null(featureSet)){
parameterDT <- data.table::CJ(peptideSet, aaIndexIDSet, fragLenSet, fragDepth, fragLibType) %>%
magrittr::set_colnames(c("Peptide","AAIndexID","FragLen","FragDepth","Library"))
data.table::setorder(parameterDT)
}else{
cjdt <- function(a,b){
cj = data.table::CJ(1:nrow(a), 1:nrow(b))
cbind(a[cj[[1]],], b[cj[[2]],])
}
featureDT <- strsplit(grep("^CPP_", featureSet, value=T), "_") %>%
data.table::as.data.table() %>%
data.table::transpose() %>%
dplyr::select(V2, V4) %>%
magrittr::set_colnames(c("AAIndexID", "FragLen")) %>%
dplyr::distinct(.keep_all=T)
featureDT[,FragLen:=as.integer(FragLen)]
parameterDT <- data.table::CJ(peptideSet, fragDepth, fragLibType) %>%
magrittr::set_colnames(c("Peptide","FragDepth","Library"))
parameterDT <- cjdt(parameterDT, featureDT)
data.table::setcolorder(parameterDT, c("Peptide","AAIndexID","FragLen","FragDepth","Library"))
data.table::setorder(parameterDT)
}
message("Number of parameter combinations = ", nrow(parameterDT))
# Skip random seed numbers for which feature dataframes were already created
dt_cpp_filenames <- list.files(pattern="dt_feature_cpp_seed.+fst", path=tmpDir, full.names=F)
if(length(dt_cpp_filenames)>=1){
seedSet_done <- as.numeric(gsub("seed", "", gsub(".fst", "", t(as.data.frame(strsplit(dt_cpp_filenames, "_"), fix.empty.names=F))[,4], fixed=T)))
message(" - Skipping random seeds ", paste0(seedSet_done, collapse=", "))
seedSet <- setdiff(seedSet, seedSet_done)
}
# Conduct contact potential profiling
statSet <- c("mean","sd","median","trimmed","mad","skew","kurtosis","se","IQR","Q0.1","Q0.9")
statNameSet <- c("Mean","SD","Med","TrM","MAD","Skew","Kurt","SE","IQR","Q10","Q90")
for(s in seedSet){
cat("Random seed = ", s, "\n", sep="")
cl <- parallel::makeCluster(coreN, type="PSOCK")
snow::clusterSetupRNGstream(cl, seed=rep(s, 6))
doSNOW::registerDoSNOW(cl)
sink(tempfile())
pb <- pbapply::timerProgressBar(max=nrow(parameterDT), style=1)
sink()
opts <- list(progress=function(n){pbapply::setTimerProgressBar(pb, n)})
dt_cpp <- foreach::foreach(i=1:nrow(parameterDT), .inorder=F, .options.snow=opts)%dopar%{
pept <- parameterDT[[i,1]]
aaIndexID <- parameterDT[[i,2]]
aacpMat <- AACPMatrixList[[aaIndexID]]
fragLen <- parameterDT[[i,3]]
libraryID <- paste0(c(fragLibType, fragLen, s), collapse="_")
fragSet <- fragLib_temp[[libraryID]]
al <- Biostrings::pairwiseAlignment(
subject=pept, pattern=fragSet,
substitutionMatrix=aacpMat,
type="global-local", gapOpening=100, gapExtension=100,
scoreOnly=T
)
stat <- psych::describe(al, trim=.1, interp=F, skew=T, type=3, ranges=T, IQR=T, quant=c(.10, .90))[statSet]
stat <- data.table::as.data.table(stat)
stat[,"Peptide":=pept][,"AAIndexID":=aaIndexID][,"FragLen":=fragLen]
stat
} %>%
data.table::rbindlist()
colnames(dt_cpp) <- c(statNameSet,"Peptide","AAIndexID","FragLen")
dt_cpp[,"FragDepth":=fragDepth][,"Library":=fragLibType][,"Seed":=s]
data.table::setcolorder(dt_cpp, c("Peptide","AAIndexID","FragLen","FragDepth","Library","Seed",statNameSet))
data.table::setorder(dt_cpp, Peptide)
fst::write_fst(dt_cpp, file.path(tmpDir, paste0("dt_feature_cpp_seed", s, ".fst")))
close(pb)
parallel::stopCluster(cl)
gc();gc()
}
# Final formatting
message("Data formatting...")
dt_cpp_filenames <- list.files(pattern="dt_feature_cpp_seed.+fst", path=tmpDir, full.names=T)
dt_cpp <- data.table::rbindlist(lapply(dt_cpp_filenames, fst::read_fst, as.data.table=T))
col_id <- c("Peptide", "AAIndexID", "FragLen", "FragDepth", "Library", "Seed")
col_val <- setdiff(colnames(dt_cpp), col_id)
peptideSetList <- BBmisc::chunk(peptideSet, chunk.size=5000)
aggregateMean <- function(dt){
dt <- data.table::melt.data.table(dt, id=col_id, measure=col_val, variable.name="Stat", value.name="Value")
dt[,"Feature":=paste0("CPP_", AAIndexID, "_", Stat, "_", FragLen)][,"AAIndexID":=NULL][,"FragLen":=NULL][,"Stat":=NULL]
dt <- data.table::dcast.data.table(dt, Peptide+FragDepth+Library~Feature, value.var="Value", fun=mean)
gc();gc()
return(dt)
}
dt_cpp <- data.table::rbindlist(
pbapply::pblapply(1:length(peptideSetList), function(i){aggregateMean(dt_cpp[Peptide %in% peptideSetList[[i]]])})
)
if(!is.null(featureSet)){
featureSet <- intersect(colnames(dt_cpp), featureSet)
dt_cpp <- dt_cpp[, c("Peptide", "FragDepth", "Library", featureSet), with=F]
}
# Finish the timer
time.end <- proc.time()
message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")
# Clear logs
rm(list=setdiff(ls(), c("dt_cpp")))
gc();gc()
# Output
return(dt_cpp)
}
#' @export
#' @rdname Features
#' @name Feature_Computation
Features <- function(
peptideSet,
fragLib,
aaIndexIDSet="all",
fragLenSet=3:8,
fragDepth=10000,
fragLibType=c("Deduplicated", "Weighted", "Mock"),
featureSet=NULL,
seedSet=1:5,
coreN=parallel::detectCores(logical=F),
tmpDir=file.path(tempdir(), "FeatureDF", format(Sys.time(), "%Y.%m.%d.%H.%M.%S"))
){
message("Peptide contact potential profiling analysis...")
dt_cpp <- Features_CPP(
peptideSet=peptideSet,
fragLib=fragLib,
aaIndexIDSet=aaIndexIDSet,
fragLenSet=fragLenSet,
fragDepth=fragDepth,
fragLibType=fragLibType,
featureSet=featureSet,
seedSet=seedSet,
coreN=coreN,
tmpDir=tmpDir
)
message("Peptide descriptor analysis...")
dt_peptdesc <- Features_PeptDesc(
peptideSet=peptideSet,
fragLenSet=fragLenSet,
featureSet=featureSet,
coreN=coreN
)
message("Merging...")
dt <- merge(dt_peptdesc, dt_cpp, by="Peptide")
data.table::setorder(dt, Peptide, FragDepth, Library)
dt[,"FragDepth":=format(FragDepth, trim=T, scientific=F)]
return(split(dt, by=c("Library", "FragDepth"), keep.by=F))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.