inst/doc/Rcpi.R

## ---- eval = FALSE-------------------------------------------------------
#  install.packages("BiocManager")
#  BiocManager::install("Rcpi")

## ---- eval = FALSE-------------------------------------------------------
#  BiocManager::install("Rcpi", dependencies = c("Imports", "Enhances"))

## ---- eval = FALSE-------------------------------------------------------
#  library("Rcpi")
#  
#  # load FASTA files
#  extracell = readFASTA(system.file(
#    "vignettedata/extracell.fasta", package = "Rcpi"))
#  mitonchon = readFASTA(system.file(
#    "vignettedata/mitochondrion.fasta", package = "Rcpi"))

## ---- eval = FALSE-------------------------------------------------------
#  length(extracell)

## ---- eval = FALSE-------------------------------------------------------
#  ## [1] 325

## ---- eval = FALSE-------------------------------------------------------
#  length(mitonchon)

## ---- eval = FALSE-------------------------------------------------------
#  ## [1] 306

## ---- eval = FALSE-------------------------------------------------------
#  extracell = extracell[(sapply(extracell, checkProt))]
#  mitonchon = mitonchon[(sapply(mitonchon, checkProt))]

## ---- eval = FALSE-------------------------------------------------------
#  length(extracell)

## ---- eval = FALSE-------------------------------------------------------
#  ## [1] 323

## ---- eval = FALSE-------------------------------------------------------
#  length(mitonchon)

## ---- eval = FALSE-------------------------------------------------------
#  ## [1] 304

## ---- eval = FALSE-------------------------------------------------------
#  # calculate APAAC descriptors
#  x1 = t(sapply(extracell, extractProtAPAAC))
#  x2 = t(sapply(mitonchon, extractProtAPAAC))
#  x  = rbind(x1, x2)
#  
#  # make class labels
#  labels = as.factor(c(rep(0, length(extracell)), rep(1, length(mitonchon))))

## ---- eval = FALSE-------------------------------------------------------
#  set.seed(1001)
#  
#  # split training and test set
#  tr.idx = c(
#    sample(1:nrow(x1), round(nrow(x1) * 0.75)),
#    sample(nrow(x1) + 1:nrow(x2), round(nrow(x2) * 0.75)))
#  te.idx = setdiff(1:nrow(x), tr.idx)
#  
#  x.tr   = x[tr.idx, ]
#  x.te   = x[te.idx, ]
#  y.tr   = labels[tr.idx]
#  y.te   = labels[te.idx]

## ---- eval = FALSE-------------------------------------------------------
#  library("randomForest")
#  rf.fit = randomForest(x.tr, y.tr, cv.fold = 5)
#  print(rf.fit)

## ---- eval = FALSE-------------------------------------------------------
#  ## Call:
#  ##  randomForest(x = x.tr, y = y.tr, cv.fold = 5)
#  ##                Type of random forest: classification
#  ##                      Number of trees: 500
#  ## No. of variables tried at each split: 8
#  ##
#  ##         OOB estimate of  error rate: 25.11%
#  ## Confusion matrix:
#  ##     0   1 class.error
#  ## 0 196  46   0.1900826
#  ## 1  72 156   0.3157895

## ---- eval = FALSE-------------------------------------------------------
#  # predict on test set
#  rf.pred = predict(rf.fit, newdata = x.te, type = "prob")[, 1]
#  
#  # plot ROC curve
#  library("pROC")
#  plot.roc(y.te, rf.pred, grid = TRUE, print.auc = TRUE)

## ---- eval = FALSE-------------------------------------------------------
#  ## Call:
#  ## plot.roc.default(x = y.te, predictor = rf.pred, col = "#0080ff",
#  ##                  grid = TRUE, print.auc = TRUE)
#  ##
#  ## Data: rf.pred in 81 controls (y.te 0) > 76 cases (y.te 1).
#  ## Area under the curve: 0.8697

## ---- eval = FALSE-------------------------------------------------------
#  library("Rcpi")
#  
#  RI.smi = system.file(
#    "vignettedata/FDAMDD.smi", package = "Rcpi")
#  RI.csv = system.file(
#    "vignettedata/RI.csv", package = "Rcpi")
#  
#  x.mol = readMolFromSmi(RI.smi, type = "mol")
#  x.tab = read.table(RI.csv, sep = "\t", header = TRUE)
#  y = x.tab$RI

## ---- eval = FALSE-------------------------------------------------------
#  # calculate selected molecular descriptors
#  x = suppressWarnings(cbind(
#      extractDrugALOGP(x.mol),
#      extractDrugApol(x.mol),
#      extractDrugECI(x.mol),
#      extractDrugTPSA(x.mol),
#      extractDrugWeight(x.mol),
#      extractDrugWienerNumbers(x.mol),
#      extractDrugZagrebIndex(x.mol)))

## ---- eval = FALSE-------------------------------------------------------
#  # regression on training set
#  library("caret")
#  library("pls")
#  
#  # cross-validation settings
#  ctrl = trainControl(
#    method = "repeatedcv", number = 5, repeats = 10,
#    summaryFunction = defaultSummary)
#  
#  # train a pls model
#  set.seed(1002)
#  pls.fit = train(
#    x, y, method = "pls", tuneLength = 10, trControl = ctrl,
#    metric = "RMSE", preProc = c("center", "scale"))
#  
#  # print cross-validation result
#  print(pls.fit)

## ---- eval = FALSE-------------------------------------------------------
#  ## Partial Least Squares
#  ##
#  ## 297 samples
#  ##  10 predictors
#  ##
#  ## Pre-processing: centered, scaled
#  ## Resampling: Cross-Validated (5 fold, repeated 10 times)
#  ##
#  ## Summary of sample sizes: 237, 237, 237, 238, 239, 238, ...
#  ##
#  ## Resampling results across tuning parameters:
#  ##
#  ##   ncomp  RMSE  Rsquared  RMSE SD  Rsquared SD
#  ##   1      104   0.884     9.44     0.0285
#  ##   2      86.4  0.92      6.99     0.0194
#  ##   3      83.8  0.924     6.56     0.0185
#  ##   4      79.6  0.931     6.98     0.0194
#  ##   5      76.3  0.937     7.45     0.0187
#  ##   6      74.7  0.94      6.85     0.0162
#  ##   7      73.7  0.941     6.75     0.0159
#  ##   8      73.5  0.942     6.5      0.0142
#  ##   9      72.5  0.944     6.18     0.0137
#  ##
#  ## RMSE was used to select the optimal model using  the smallest value.
#  ## The final value used for the model was ncomp = 9.

## ---- eval = FALSE-------------------------------------------------------
#  # # of components vs RMSE
#  print(plot(pls.fit, asp = 0.5))

## ---- eval = FALSE-------------------------------------------------------
#  # plot experimental RIs vs predicted RIs
#  plot(y, predict(pls.fit, x), xlim = range(y), ylim = range(y),
#       xlab = "Experimental RIs", ylab = "Predicted RIs")
#  abline(a = 0, b = 1)

## ---- eval = FALSE-------------------------------------------------------
#  library("Rcpi")
#  
#  fdamdd.smi = system.file("vignettedata/FDAMDD.smi", package = "Rcpi")
#  fdamdd.csv = system.file("vignettedata/FDAMDD.csv", package = "Rcpi")
#  
#  x.mol = readMolFromSmi(fdamdd.smi, type = "mol")
#  x.smi = readMolFromSmi(fdamdd.smi, type = "text")
#  y = as.factor(paste0("class", scan(fdamdd.csv)))

## ---- eval = FALSE-------------------------------------------------------
#  # calculate molecular fingerprints
#  x1 = extractDrugEstateComplete(x.mol)
#  x2 = extractDrugMACCSComplete(x.mol)
#  x3 = extractDrugOBFP4(x.smi, type = "smile")

## ---- eval = FALSE-------------------------------------------------------
#  library("caret")
#  
#  # remove near-zero variance variables
#  x1 = x1[, -nearZeroVar(x1)]
#  x2 = x2[, -nearZeroVar(x2)]
#  x3 = x3[, -nearZeroVar(x3)]
#  
#  # split training and test set
#  set.seed(1003)
#  tr.idx = sample(1:nrow(x1), round(nrow(x1) * 0.75))
#  te.idx = setdiff(1:nrow(x1), tr.idx)
#  x1.tr  = x1[tr.idx, ]
#  x1.te  = x1[te.idx, ]
#  x2.tr  = x2[tr.idx, ]
#  x2.te  = x2[te.idx, ]
#  x3.tr  = x3[tr.idx, ]
#  x3.te  = x3[te.idx, ]
#  y.tr   = y[tr.idx]
#  y.te   = y[te.idx]

## ---- eval = FALSE-------------------------------------------------------
#  # svm classification on training sets
#  library("kernlab")
#  
#  # cross-validation settings
#  ctrl = trainControl(method = "cv", number = 5, repeats = 10,
#                      classProbs = TRUE,
#                      summaryFunction = twoClassSummary)
#  
#  # SVM with RBF kernel
#  svm.fit1 = train(
#    x1.tr, y.tr, method = "svmRadial", trControl = ctrl,
#    metric = "ROC", preProc = c("center", "scale"))
#  svm.fit2 = train(
#    x2.tr, y.tr, method = "svmRadial", trControl = ctrl,
#    metric = "ROC", preProc = c("center", "scale"))
#  svm.fit3 = train(
#    x3.tr, y.tr, method = "svmRadial", trControl = ctrl,
#    metric = "ROC", preProc = c("center", "scale"))
#  
#  # print cross-validation result
#  print(svm.fit1)
#  print(svm.fit2)
#  print(svm.fit3)

## ---- eval = FALSE-------------------------------------------------------
#  ## Support Vector Machines with Radial Basis Function Kernel
#  ##
#  ## 597 samples
#  ##  23 predictors
#  ##   2 classes: "class0", "class1"
#  ##
#  ## Pre-processing: centered, scaled
#  ## Resampling: Cross-Validated (5 fold)
#  ##
#  ## Summary of sample sizes: 478, 479, 477, 477, 477
#  ##
#  ## Resampling results across tuning parameters:
#  ##
#  ##   C     ROC    Sens   Spec   ROC SD  Sens SD  Spec SD
#  ##   0.25  0.797  0.7    0.765  0.0211  0.0442   0.00666
#  ##   0.5   0.808  0.696  0.79   0.0173  0.059    0.0236
#  ##   1     0.812  0.703  0.781  0.0191  0.0664   0.0228
#  ##
#  ## Tuning parameter "sigma" was held constant at a value of 0.02921559
#  ## ROC was used to select the optimal model using  the largest value.
#  ## The final values used for the model were sigma = 0.0292 and C = 1.

## ---- eval = FALSE-------------------------------------------------------
#  ## Support Vector Machines with Radial Basis Function Kernel
#  ##
#  ## 597 samples
#  ## 126 predictors
#  ##   2 classes: "class0", "class1"
#  ##
#  ## Pre-processing: centered, scaled
#  ## Resampling: Cross-Validated (5 fold)
#  ##
#  ## Summary of sample sizes: 477, 477, 477, 478, 479
#  ##
#  ## Resampling results across tuning parameters:
#  ##
#  ##   C     ROC    Sens   Spec   ROC SD  Sens SD  Spec SD
#  ##   0.25  0.834  0.715  0.775  0.0284  0.0994   0.0589
#  ##   0.5   0.848  0.726  0.79   0.0299  0.065    0.0493
#  ##   1     0.863  0.769  0.793  0.0307  0.0229   0.0561
#  ##
#  ## Tuning parameter "sigma" was held constant at a value of 0.004404305
#  ## ROC was used to select the optimal model using  the largest value.
#  ## The final values used for the model were sigma = 0.0044 and C = 1.

## ---- eval = FALSE-------------------------------------------------------
#  ## Support Vector Machines with Radial Basis Function Kernel
#  ##
#  ## 597 samples
#  ##  58 predictors
#  ##   2 classes: "class0", "class1"
#  ##
#  ## Pre-processing: centered, scaled
#  ## Resampling: Cross-Validated (5 fold)
#  ##
#  ## Summary of sample sizes: 478, 478, 477, 478, 477
#  ##
#  ## Resampling results across tuning parameters:
#  ##
#  ##   C     ROC    Sens   Spec   ROC SD  Sens SD  Spec SD
#  ##   0.25  0.845  0.769  0.746  0.0498  0.0458   0.0877
#  ##   0.5   0.856  0.744  0.777  0.0449  0.0148   0.0728
#  ##   1     0.863  0.751  0.777  0.0428  0.036    0.0651
#  ##
#  ## Tuning parameter "sigma" was held constant at a value of 0.01077024
#  ## ROC was used to select the optimal model using  the largest value.
#  ## The final values used for the model were sigma = 0.0108 and C = 1.

## ---- eval = FALSE-------------------------------------------------------
#  # predict on test set
#  svm.pred1 = predict(svm.fit1, newdata = x1.te, type = "prob")[, 1]
#  svm.pred2 = predict(svm.fit2, newdata = x2.te, type = "prob")[, 1]
#  svm.pred3 = predict(svm.fit3, newdata = x3.te, type = "prob")[, 1]
#  
#  # generate colors
#  library("RColorBrewer")
#  pal = brewer.pal(3, "Set1")
#  
#  # ROC curves of different fingerprints
#  library("pROC")
#  plot(smooth(roc(y.te, svm.pred1)), col = pal[1], grid = TRUE)
#  plot(smooth(roc(y.te, svm.pred2)), col = pal[2], grid = TRUE, add = TRUE)
#  plot(smooth(roc(y.te, svm.pred3)), col = pal[3], grid = TRUE, add = TRUE)

## ---- eval = FALSE-------------------------------------------------------
#  library("Rcpi")
#  mols = readMolFromSDF(system.file(
#      "compseq/tyrphostin.sdf", package = "Rcpi"))

## ---- eval = FALSE-------------------------------------------------------
#  simmat = diag(length(mols))
#  
#  for (i in 1:length(mols)) {
#   for (j in i:length(mols)) {
#     fp1 = extractDrugEstate(mols[[i]])
#     fp2 = extractDrugEstate(mols[[j]])
#     tmp = calcDrugFPSim(fp1, fp2, fptype = "compact", metric = "tanimoto")
#     simmat[i, j] = tmp
#     simmat[j, i] = tmp
#   }
#  }

## ---- eval = FALSE-------------------------------------------------------
#  mol.hc = hclust(as.dist(1 - simmat), method = "ward.D")
#  
#  library("ape")  # tree visualization of clusters
#  clus5 = cutree(mol.hc, 5)  # cut dendrogram into 5 clusters
#  
#  # generate colors
#  library("RColorBrewer")
#  pal5 = brewer.pal(5, "Set1")
#  plot(as.phylo(mol.hc), type = "fan",
#       tip.color = pal5[clus5],
#       label.offset = 0.1, cex = 0.7)

## ---- eval = FALSE-------------------------------------------------------
#  library("Rcpi")
#  
#  mol   = system.file("compseq/DB00530.sdf", package = "Rcpi")
#  moldb = system.file("compseq/tyrphostin.sdf", package = "Rcpi")

## ---- eval = FALSE-------------------------------------------------------
#  rank1 = searchDrug(
#    mol, moldb, cores = 4, method = "fp",
#    fptype = "maccs", fpsim = "tanimoto")
#  rank2 = searchDrug(
#    mol, moldb, cores = 4, method = "fp",
#    fptype = "fp2", fpsim = "cosine")
#  rank3 = searchDrug(
#    mol, moldb, cores = 4, method = "mcs",
#    mcssim = "tanimoto")

## ---- eval = FALSE-------------------------------------------------------
#  head(rank1)
#  ##        92       100        83       101         1        36
#  ## 0.6491228 0.6491228 0.5882353 0.5660377 0.5000000 0.4861111
#  
#  head(rank2)
#  ##       100        92        83       101        94        16
#  ## 0.8310005 0.8208663 0.5405856 0.5033150 0.4390790 0.4274081
#  
#  head(rank3)
#  ##        92       100        23        39        94        64
#  ## 0.7000000 0.7000000 0.4000000 0.4000000 0.4000000 0.3783784

## ---- eval = FALSE-------------------------------------------------------
#  # convert SDF format to SMILES format
#  convMolFormat(
#      infile = mol, outfile = "DB00530.smi", from = "sdf", to = "smiles")
#  convMolFormat(
#      infile = moldb, outfile = "tyrphostin.smi", from = "sdf", to = "smiles")
#  
#  smi1 = readLines("DB00530.smi")
#  smi2 = readLines("tyrphostin.smi")[92]  # select the #92 molecule
#  calcDrugMCSSim(smi1, smi2, type = "smile", plot = TRUE)

## ---- eval = FALSE-------------------------------------------------------
#  ## [[1]]
#  ## An instance of "MCS"
#  ##  Number of MCSs: 1
#  ##  530: 29 atoms
#  ##  4705: 22 atoms
#  ##  MCS: 18 atoms
#  ##  Tanimoto Coefficient: 0.54545
#  ##  Overlap Coefficient: 0.81818
#  ##
#  ## [[2]]
#  ## Tanimoto_Coefficient
#  ##            0.5454545
#  ##
#  ## [[3]]
#  ## Overlap_Coefficient
#  ##           0.8181818

## ---- eval = FALSE-------------------------------------------------------
#  library("Rcpi")
#  
#  gpcr = read.table(system.file(
#      "vignettedata/GPCR.csv", package = "Rcpi"),
#      header = FALSE, as.is = TRUE)

## ---- eval = FALSE-------------------------------------------------------
#  head(gpcr)

## ---- eval = FALSE-------------------------------------------------------
#  ##          V1     V2
#  ## 1 hsa:10161 D00528
#  ## 2 hsa:10800 D00411
#  ## 3 hsa:10800 D01828
#  ## 4 hsa:10800 D05129
#  ## 5 hsa:11255 D00234
#  ## 6 hsa:11255 D00300

## ---- eval = FALSE-------------------------------------------------------
#  library("igraph")
#  library("arcdiagram")
#  library("reshape")
#  
#  g = graph.data.frame(gpcr[1:(nrow(gpcr)/2), ], directed = FALSE)
#  edgelist = get.edgelist(g)
#  vlabels  = V(g)$name
#  vgroups  = c(rep(0, 95), rep(1, 223))
#  vfill    = c(rep("#8B91D4", 95), rep("#B2C771", 223))
#  vborders = c(rep("#6F74A9", 95), rep("#8E9F5A", 223))
#  degrees  = degree(g)
#  
#  xx = data.frame(vgroups, degrees, vlabels, ind = 1:vcount(g))
#  yy = arrange(xx, desc(vgroups), desc(degrees))
#  new.ord = yy$ind
#  
#  arcplot(
#    edgelist, ordering = new.ord, labels = vlabels,
#    cex.labels = 0.1, show.nodes = TRUE,
#    col.nodes = vborders, bg.nodes = vfill,
#    cex.nodes = log10(degrees) + 0.1,
#    pch.nodes = 21, line = -0.5, col.arcs = hsv(0, 0, 0.2, 0.25))

## ---- eval = FALSE-------------------------------------------------------
#  library("Rcpi")
#  
#  gpcr = read.table(system.file(
#    "vignettedata/GPCR.csv", package = "Rcpi"),
#    header = FALSE, as.is = TRUE)
#  
#  protid = unique(gpcr[, 1])
#  drugid = unique(gpcr[, 2])
#  
#  protseq = getSeqFromKEGG(protid, parallel = 5)
#  drugseq = getSmiFromKEGG(drugid, parallel = 50)

## ---- eval = FALSE-------------------------------------------------------
#  x0.prot = cbind(
#    t(sapply(unlist(protseq), extractProtAPAAC)),
#    t(sapply(unlist(protseq), extractProtCTriad)))
#  
#  x0.drug = cbind(
#    extractDrugEstateComplete(readMolFromSmi(textConnection(drugseq))),
#    extractDrugMACCSComplete(readMolFromSmi(textConnection(drugseq))),
#    extractDrugOBFP4(drugseq, type = "smile"))

## ---- eval = FALSE-------------------------------------------------------
#  # generate drug x / protein x / y
#  x.prot = matrix(NA, nrow = nrow(gpcr), ncol = ncol(x0.prot))
#  x.drug = matrix(NA, nrow = nrow(gpcr), ncol = ncol(x0.drug))
#  for (i in 1:nrow(gpcr)) x.prot[i, ] = x0.prot[which(gpcr[, 1][i] == protid), ]
#  for (i in 1:nrow(gpcr)) x.drug[i, ] = x0.drug[which(gpcr[, 2][i] == drugid), ]
#  
#  y = as.factor(c(rep("pos", nrow(gpcr)/2), rep("neg", nrow(gpcr)/2)))

## ---- eval = FALSE-------------------------------------------------------
#  x = getCPI(x.prot, x.drug, type = "combine")

## ---- eval = FALSE-------------------------------------------------------
#  library("caret")
#  x = x[, -nearZeroVar(x)]
#  
#  # cross-validation settings
#  ctrl = trainControl(
#      method = "cv", number = 5, repeats = 10,
#      classProbs = TRUE,
#      summaryFunction = twoClassSummary)
#  
#  # train a random forest classifier
#  library("randomForest")
#  
#  set.seed(1006)
#  rf.fit = train(
#    x, y, method = "rf", trControl = ctrl,
#    metric = "ROC", preProc = c("center", "scale"))
#  \end{CodeInput}
#  
#  Print the cross-validation result:
#  
#  \begin{CodeInput}
#  print(rf.fit)

## ---- eval = FALSE-------------------------------------------------------
#  ## Random Forest
#  ##
#  ## 1270 samples
#  ##  562 predictors
#  ##    2 classes: "neg", "pos"
#  ##
#  ## Pre-processing: centered, scaled
#  ## Resampling: Cross-Validated (5 fold)
#  ##
#  ## Summary of sample sizes: 1016, 1016, 1016, 1016, 1016
#  ##
#  ## Resampling results across tuning parameters:
#  ##
#  ##   mtry  ROC    Sens   Spec   ROC SD  Sens SD  Spec SD
#  ##   2     0.83   0.726  0.778  0.0221  0.044    0.0395
#  ##   33    0.882  0.795  0.82   0.018   0.0522   0.0443
#  ##   562   0.893  0.822  0.844  0.0161  0.0437   0.0286
#  ##
#  ## ROC was used to select the optimal model using  the largest value.
#  ## The final value used for the model was mtry = 562.

## ---- eval = FALSE-------------------------------------------------------
#  rf.pred = predict(rf.fit$finalModel, x, type = "prob")[, 1]
#  
#  library("pROC")
#  plot(smooth(roc(y, rf.pred)), grid = TRUE, print.auc = TRUE)

Try the Rcpi package in your browser

Any scripts or data that you put into this service are public.

Rcpi documentation built on July 18, 2018, 10 a.m.