## Nothing
## ----echo=FALSE,results='hide'----------------------------
## Vignette setup: attach knitr, set R session display options,
## and configure the chunk defaults (plain output, PDF/PS figures
## written under cancerfigure/).
library("knitr")
options(
  prompt = "R> ",
  continue = " ",
  width = 60,
  useFancyQuotes = FALSE
)
opts_chunk$set(
  tidy = TRUE,
  highlight = FALSE,
  background = "white",
  dev = c("pdf", "postscript"),
  fig.path = "cancerfigure/"
)
## ----message=FALSE, eval=FALSE----------------------------
# #The data files below were downloaded on June 1, 2016
# require("openxlsx")
# library("mpath")
# bc <- t(read.delim("GSE20194_series_matrix.txt.gz",sep="",header=FALSE,skip=80))
# colnames(bc) <- bc[1,]
# bc <- bc[-1, -c(1, 2)]
# ###The last column is empty with variable name !series_matrix_table_end, thus omitted
# bc <- bc[, -22284]
# mode(bc) <- "numeric" ### convert character to numeric
# dat1 <- openxlsx::read.xlsx("GSE20194_MDACC_Sample_Info.xls", sheet=1, colNames=TRUE)
# y <- dat1$characteristics..ER_status
# y <- ifelse(y=="P", 1, -1)
# table(y)
# res <- rep(NA, dim(bc)[2])
# for(i in 1:dim(bc)[2])
# res[i] <- abs(t.test(bc[,i]~y)$statistic)
# ### find 3000 largest absolute value of t-statistic
# tmp <- order(res, decreasing=TRUE)[1:3000]
# dat <- bc[, tmp]
# ### standardize variables
# dat <- scale(dat)
## ----message=FALSE----------------------------------------
### Simulation parameters shared by the model-fitting chunk below.
### number of replicates
nrun <- 100
### penalty type
penalty <- c("enet", "snet", "mnet")
### Smallest value for lambda, as a fraction of lambda.max, the smallest value for which all coefficients are zero except the intercept
ratio <- 0.25
### solution-path algorithm passed to nclreg()
type.path <- "nonactive"
### number of lambda values on the path: 100 for the "onestep" path, 30 otherwise
### (plain if/else, not ifelse(), since the condition is a scalar)
nlam <- if (type.path != "onestep") 30 else 100
### The training data is contaminated by randomly switching response variable labels at varying pre-specified proportions
per <- c(0, 0.05, 0.10, 0.15)
### what quantity is minimized for tuning parameter selection
tuning <- "error"
### number of CPU cores used by the cross-validation functions
n.cores <- 5
### robust nonconvex loss function, rfamily type and logistic
type <- c("closs", "gloss", "qloss", "binomial")
### and corresponding labels
type1 <- c("Closs", "Gloss", "Qloss", "Logistic")
### and corresponding tuning parameter
### (only 3 values: the "binomial" family has no s parameter)
s <- c(0.9, 1.01, 0.5)
### number of boosting iterations for the initial fit
mstop <- 50
### whether to draw the boxplots at the end of each run
plot.it <- TRUE
## ----brcancer, fig.show='hold', eval=FALSE----------------
# summary7 <- function(x) c(summary(x), sd=sd(x))
# ptm <- proc.time()
# for(k in (1:4)){ ### k controls family argument rfamily type (see above)
# if(type[k]=="gloss")
# nu <- 0.1
# else
# nu <- 0.01
# for(j in (1:3)){ ### j controls argument penalty type (see above)
# gam <- ifelse(penalty[j]=="snet", 3.7, 12)
# err.m1 <- nvar.m1 <- errbest.m1 <- lambest.m1 <- matrix(NA, ncol=4, nrow=nrun)
# nvarbest.m1 <- mstopcv.m1 <- matrix(NA, ncol=4, nrow=nrun)
# colnames(err.m1) <- c("cont-0%", "cont-5%", "cont-10%", "cont-15%")
# colnames(mstopcv.m1) <- colnames(nvarbest.m1) <- colnames(err.m1)
# colnames(nvar.m1) <- colnames(err.m1)
# colnames(errbest.m1) <- colnames(err.m1)
# colnames(lambest.m1) <- colnames(err.m1)
# for (ii in 1:nrun){
# set.seed(1000+ii)
# trid <- c(sample(which(y==1))[1:50], sample(which(y==-1))[1:50])
# dtr <- dat[trid, ]
# dte <- dat[-trid, ]
# ytrold <- y[trid]
# yte <- y[-trid]
# ### number of patients/no. variables in training and test data
# dim(dtr)
# dim(dte)
# ### randomly contaminate data
# ntr <- length(trid)
# set.seed(1000+ii)
# con <- sample(ntr)
# for(i in (1:4)){ ### i controls how many percentage of data contaminated, see argument per above
# ytr <- ytrold
# percon <- per[i]
# ### randomly flip labels of the samples in training set according to pre-defined contamination level
# if(percon > 0){
# ji <- con[1:(percon*ntr)]
# ytr[ji] <- -ytrold[ji]
# }
# ### fit a model with nclreg for nonconvex loss or glmreg for logistic loss, and use cross-validation to select best penalization parameter
# if(type[k] %in% c("closs", "gloss", "qloss")){
# dat.m1 <- nclreg(x=dtr, y=ytr, s=s[k], iter=100, rfamily = type[k], penalty=penalty[j], lambda.min.ratio=ratio, gamma=gam, mstop.init=mstop, nu.init=nu, type.path=type.path, decreasing=FALSE, type.init="bst")
# lambda <- dat.m1$lambda[1:nlam]
# set.seed(1000+ii)
# cvm1 <- cv.nclreg(x=dtr, y=ytr, nfolds=5, n.cores=n.cores, parallel=TRUE, s=s[k], lambda=lambda, rfamily = type[k], penalty=penalty[j], gamma=gam, type=tuning, plot.it=FALSE, type.init=dat.m1$type.init, mstop.init=dat.m1$mstop.init, nu.init=dat.m1$nu.init, type.path=type.path, decreasing=dat.m1$decreasing)
# err1 <- predict(dat.m1, newdata=dte, newy=yte, type="error")
# }
# else{
# dat.m1 <- glmreg(x=dtr, y=(ytr+1)/2, family = type[k], penalty=penalty[j], lambda.min.ratio=ratio, gamma=gam)
# set.seed(1000+ii)
# cvm1 <- cv.glmreg(x=dtr, y=(ytr+1)/2, nfolds=5, n.cores=n.cores, parallel=TRUE, lambda=dat.m1$lambda,
# family = type[k], penalty=penalty[j], gamma=gam, plot.it=FALSE)
# err1 <- apply((yte > -1)!=predict(dat.m1, newx=dte, type="class"), 2, mean)
# }
# optmstop <- cvm1$lambda.which
# err.m1[ii, i] <- err1[optmstop]
# if(ii==1) varid <- names(predict(dat.m1, which=optmstop, type="nonzero")) else
# varid <- intersect(varid, names(predict(dat.m1, which=optmstop, type="nonzero")))
# nvar.m1[ii, i] <- length(predict(dat.m1, which=optmstop, type="nonzero"))
# errbest.m1[ii, i] <- min(err1, na.rm=TRUE)
# lambest.m1[ii, i] <- which.min(err1)
# nvarbest.m1[ii, i] <- length(predict(dat.m1, which=which.min(err1), type="nonzero"))
# }
# if(ii %% nrun ==0){
# if(type[k]%in% c("closs", "gloss", "qloss"))
# cat(paste("\nfamily ", type1[k], ", s=", s[k], sep=""), "\n")
# else cat(paste("\nfamily ", type1[k], sep=""), "\n")
# pentype <- switch(penalty[j],
# "enet"="LASSO",
# "mnet"="MCP",
# "snet"="SCAD")
# cat("penalty=", pentype, "\n")
# if(penalty[j] %in% c("snet", "mnet")) cat("gamma=", gam, "\n")
# cat("common variables selected:", varid, "\n")
# cat("best misclassification error\n")
# print(round(apply(errbest.m1, 2, summary7),4))
# cat("which lambda has best error\n")
# print(round(apply(lambest.m1, 2, summary7),1))
# cat("number of variables selected with best error\n")
# print(round(apply(nvarbest.m1, 2, summary7),1))
# cat("CV based misclassification error\n")
# print(round(apply(err.m1, 2, summary7),4))
# cat("number of variables selected by CV\n")
# print(round(apply(nvar.m1, 2, summary7),1))
# if(plot.it){
# par(mfrow=c(2, 1))
# boxplot(err.m1, main="Misclassification error", subset="", sub=paste(type1[k], "-", pentype, sep=""))
# boxplot(nvar.m1, main="No. variables", subset="", sub=paste(type1[k], "-", pentype, sep=""))
# }
# }
# }
# }
# }
# print(proc.time()-ptm)
## ----sessionInfo------------------------------------------
## Record R version and loaded-package versions for reproducibility.
## (Trailing semicolon removed: statement separators are unidiomatic in R.)
sessionInfo()
## Any scripts or data that you put into this service are public.
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.