Sparse regression with paired covariates


title: Sparse regression with paired covariates header-includes: \usepackage{amsmath} \usepackage{xcolor} \usepackage{caption}\captionsetup[figure]{labelformat=empty} \usepackage[section]{placeins} \definecolor{blue}{HTML}{0000CD} \definecolor{red}{HTML}{CD0000} output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{script} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: chunk_output_type: console


Code for reproducing the results shown in the manuscript.

# Set eval to TRUE to create Figures.
# Set echo to TRUE for html, but to FALSE for pdf.
knitr::opts_chunk$set(eval=FALSE,echo=TRUE,fig.path="images/")
eval <- FALSE # TRUE or FALSE

Initialisation

### Loading functions. ###

inst <- rownames(utils::installed.packages())

cran <- c("devtools","R.utils","Matrix","glmnet","pROC","BiocManager","ashr")
# "googledrive", "httpuv"
if(!all(cran %in% inst)){
    for(i in seq_along(cran)){
        if(!cran[i] %in% inst){
            install.packages(cran[i])
        }
    }
}

bioc <- c("edgeR","TCGAbiolinks")
if(!all(bioc %in% inst)){
    #source("http://bioconductor.org/biocLite.R")
    for(i in seq_along(bioc)){
        if(!bioc[i] %in% inst){
            #biocLite(bioc[i])
            BiocManager::install(bioc[i])
        }
    }
}

#if(!"ashr" %in% inst){
#  devtools::install_github("stephens999/ashr")
#}

user <- Sys.getenv("USERNAME")
path <- file.path("C:","Users",user,"Desktop","palasso")
if(user=="arra"){path <- "C:/Users/arra/Desktop/MATHS/palasso_desktop"}
if(user==""){path <- "/virdir/Scratch/arauschenberger/palasso"}
setwd(path)
folders <- c("data","results")
invisible(sapply(folders,function(x) if(!dir.exists(x)){dir.create(x)}))

if(user!="arra"){
    devtools::install_github("kkdey/CorShrink") # ref="a9f6ba0"
    devtools::install_github("rauschenberger/palasso") # ref="4a995a2"
}

if(FALSE){

    # The functions <<save>>, <<file.exists>> and <<file.remove>> access the hard disk, but also try to access googledrive.

    save <- function(object,file){
        base::save(object,file=file)
        tryCatch(expr=googledrive::drive_upload(media=file,path=file),
                 error=function(x) NULL)
        #Sys.sleep(0.5)
    }

    file.exists <- function(file){
        offline <- base::file.exists(file)
        online <- FALSE
        if(!offline){
            d <- googledrive::as_dribble(x=file)
            online <- tryCatch(expr=googledrive::some_files(d),
                               error=function(x) FALSE)
            #Sys.sleep(0.5)
        }
        return(offline|online)
    }

    file.remove <- function(file){
        base::file.remove(file)
        tryCatch(expr=googledrive::drive_trash(file=file),
             error=function(x) NULL)
        #Sys.sleep(0.5)
    }


# The function <<update>> moves files from the research cloud to the remote drive. Given both paths, it first verifies whether the folders SIM, GDC and CCA are available, and then copies all missing files from the research cloud to the remote drive.

# from: path to the origin
# to: path to the destination

update <- function(from,to){
    dir <- c("SIM","GDC","CCA")
    if(any(!dir.exists(file.path(from,dir)))){stop("Invalid.")}
    if(any(!dir.exists(file.path(to,dir)))){stop("Invalid.")}
    pb <- utils::txtProgressBar(min=0,max=1,style=3)
    for(i in seq_along(dir)){
        files0 <- dir(file.path(from,dir[i]))
        files1 <- dir(file.path(to,dir[i]))
        names <- files0[!files0 %in% files1]
        for(j in seq_along(names)){
            utils::setTxtProgressBar(pb=pb,value=(i-1)/3+(j*i)/(3*length(names)))
            file.copy(from=file.path(from,dir[i],names[j]),
                      to=file.path(to,dir[i],names[j]),
                      copy.date=TRUE)
        }
        utils::setTxtProgressBar(pb=pb,value=i/3)
    }
}

# update(from="results",to="//tsclient/N/palasso/results")

}

Preparation

Last data download started on 2019-03-26 (R version 3.5.3).

### Downloading "Isoform Expression Quantification". ###

#rm(list=ls())
#<<functions>>

directory <- file.path(path,"data")
setwd(directory)

# Retrieving cancer types:
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]

# Downloading isoform expression data:
y <- X <- list()
for(i in seq_along(project)){
    query <- TCGAbiolinks::GDCquery(project=project[i],
                data.category="Transcriptome Profiling",
                data.type="Isoform Expression Quantification")
    TCGAbiolinks::GDCdownload(query,method="api",directory=directory)
    trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE))
    X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory)
    X[[i]][,c("miRNA_ID","reads_per_million_miRNA_mapped",
              "cross-mapped","miRNA_region")] <- NULL
    y[[i]] <- rep(project[i],times=length(unique(X[[i]]$barcode)))
}

save(list=c("y","X"),file=file.path(path,"data","isoform_raw.RData"))
load(file.path(path,"data","isoform_raw.RData"),verbose=TRUE)

# Merging isoform expression data:
Xs <- do.call(what=rbind,args=X) # sparse matrix
y <- do.call(what="c",args=y)

# Transform to matrix
Xs$isoform_coords <- gsub(pattern="hg38:chr",replacement="",x=Xs$isoform_coords)
samples <- unique(Xs$barcode)
covariates <- unique(Xs$isoform_coords)
row <- match(Xs$barcode,samples)
col <- match(Xs$isoform_coords,covariates)
X <- Matrix::sparseMatrix(i=row,j=col,x=Xs$read_count,dimnames=list(samples,covariates))

# Order by molecular location
split <- strsplit(x=colnames(X),split=":|-")
chr <- sapply(split,function(x) x[[1]])
pos <- sapply(split,function(x) x[[2]])
order <- order(chr,pos)
X <- X[,order]

if(FALSE){ # testing
    i <- sample(seq_len(nrow(Xs)),size=1)
    Xs$read_count[i]
    X[Xs$barcode[i],Xs$isoform_coords[i]]
}

save(list=c("y","X"),file=file.path(path,"data","isoform_all.RData"))
### Downloading "miRNA Expression Quantification". ###

#rm(list=ls())
#<<functions>>

directory <- file.path(path,"data")
setwd(directory)

# Downloading data.
project <- TCGAbiolinks::getGDCprojects()$id 
project <- project[grepl(x=project,pattern="TCGA")]
y <- X <- list()
for(i in seq_along(project)){
    query <- TCGAbiolinks::GDCquery(project=project[i],
                           data.category="Transcriptome Profiling",
                           data.type="miRNA Expression Quantification")
    TCGAbiolinks::GDCdownload(query,method="api",directory=directory)
    trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE))
    data <- TCGAbiolinks::GDCprepare(query,directory=directory)
    X[[i]] <- t(data[,c(seq(from=2,to=ncol(data),by=3))])
    y[[i]] <- rep(project[i],times=nrow(X[[i]]))
}

save(list=c("y","X"),file=file.path(path,"data","miRNA_raw.RData"))
load(file.path(path,"data","miRNA_raw.RData"))

X <- do.call(what=rbind,args=X)
y <- do.call(what="c",args=y)
rownames(X) <- gsub(pattern="read_count_",replacement="",x=rownames(X))

save(list=c("y","X"),file=file.path(path,"data","miRNA_all.RData"))
### Downloading "Gene Expression Quantification". ###

#rm(list=ls())
#<<functions>>

directory <- file.path(path,"data")
setwd(directory)

# Retrieving cancer types:
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]

# Downloading data:
memory.limit(size=16000) # Activate virtual memory in system control!
y <- X <- list()
for(i in seq_along(project)){
    query <- TCGAbiolinks::GDCquery(project=project[i],
                data.category="Transcriptome Profiling",
                data.type="Gene Expression Quantification",
                workflow.type="HTSeq - Counts"); gc()
    TCGAbiolinks::GDCdownload(query=query,method="api",directory=directory); gc()
    trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE)); gc()
    X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory); gc()
    y[[i]] <- rep(project[i],times=ncol(X[[i]])); gc()
}

save(list=c("y","X"),file=file.path(path,"data","gene_raw.RData"))
load(file.path(path,"data","gene_raw.RData"))

genes <- SummarizedExperiment::rowData(X[[1]])
mart <- biomaRt::useMart("ensembl",dataset="hsapiens_gene_ensembl") # 
char <- biomaRt::getBM(attributes=c("ensembl_gene_id","chromosome_name","transcript_start","gene_biotype"),filters=c("biotype","chromosome_name"),values=list("protein_coding",c(1:22,"X")),mart=mart)
select <- genes$ensembl_gene_id[genes$ensembl_gene_id %in% char$ensembl_gene_id]

X <- lapply(X,function(x) t(SummarizedExperiment::assays(x)$"HTSeq - Counts"[select,]))
X <- do.call(what=rbind,args=X)
y <- do.call(what="c",args=y)

save(list=c("y","X"),file=file.path(path,"data","gene_all.RData"))
### Downloading "Copy Number Variation". ###

#rm(list=ls())
#<<functions>>

directory <- file.path(path,"data")
setwd(directory)

project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]

y <- X <- list()
for(i in seq_along(project)){
    query <- TCGAbiolinks::GDCquery(project=project[i],
                data.category="Copy Number Variation",
                data.type="Masked Copy Number Segment")
    TCGAbiolinks::GDCdownload(query=query,method="api",directory=directory)
    trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE))
    X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory)
    y[[i]] <- rep(project[i],times=length(unique(X[[i]]$Sample))) # correct?
}

save(list=c("y","X"),file=file.path(path,"data","CNV_raw.RData"))
load(file.path(path,"data","CNV_raw.RData"),verbose=TRUE)

# Merging CNV data:
Xs <- do.call(what=rbind,args=X) # sparse matrix
y <- do.call(what="c",args=y)
#table(Xs$Sample)

# Prepare cutting.
cut <- list()
cut$chr <- c(1:22,"X")
cut$start <- sapply(cut$chr,function(x) min(Xs$Start[Xs$Chromosome==x]))
cut$end <- sapply(cut$chr,function(x) max(Xs$End[Xs$Chromosome==x]))
cut$length <- cut$end-cut$start
cut$dist <- sum(cut$length)/10000
cut$num <- round(cut$length/cut$dist)

# Create covariates.
cov <- list()
cov$p <- sum(cut$num)
cov$chromosome <- unlist(sapply(cut$chr,function(i) rep(i,times=cut$num[i])))
cov$location <- unlist(sapply(cut$chr,function(i) round(seq(from=cut$start[i],to=cut$end[i],length.out=cut$num[i]))))
cov$name <- paste0(cov$chromosome,":",cov$location)

# Create indices for each covariate.
index <- rep(list(integer()),times=cov$p)
pb <- utils::txtProgressBar(min=0,max=cov$p,style=3)
for(j in seq_len(cov$p)){
    utils::setTxtProgressBar(pb=pb,value=j)
    index[[j]] <- which((Xs$Chromosome==cov$chromosome[j]) & 
        (Xs$Start<=cov$location[j]) & (cov$location[j]<=Xs$End)) # consider <
}

# Expand indices to matrix.
X <- matrix(0,nrow=length(unique(Xs$Sample)),ncol=cov$p,
             dimnames=list(unique(Xs$Sample),cov$name))
for(j in seq_along(index)){
    mean <- Xs$Segment_Mean[index[[j]]]
    i <- Xs$Sample[index[[j]]]
    X[i,j] <- mean
}

if(FALSE){ # test
    sample <- sample(rownames(X),size=1)
    covariate <- sample(colnames(X),size=1)
    split <- strsplit(covariate,split=":")[[1]]
    a <- X[sample,covariate]
    b <- Xs$Segment_Mean[(Xs$Sample==sample) & (Xs$Chromosome==split[1]) & (Xs$Start<=as.numeric(split[2])) & (as.numeric(split[2]) < Xs$End)]
    all(a==b)
}

save(list=c("y","X","index"),file=file.path(path,"data","CNV_all.RData"))
### Extracting samples of interest. ###

#rm(list=ls())
#<<functions>>

type <- c("isoform","miRNA","CNV","gene")

for(i in seq_along(type)){
    cat(type[i],"\n")
    load(file.path(path,"data",paste0(type[i],"_all.RData")),verbose=TRUE)

    # TCGA barcode
    barcode <- rownames(X)
    code <- sapply(barcode,function(x) strsplit(x,split="-"))
    code <- as.data.frame(do.call(what=rbind,args=code))
    colnames(code) <- c("project","TSS","participant","sample_vial",
                        "portion_analyte","plate","center")
    code$sample <- substr(code$sample_vial,start=1,stop=2)
    code$vial <- substr(code$sample_vial,start=3,stop=3)
    code$portion <- substr(code$portion_analyte,start=1,stop=2)
    code$analyte <- substr(code$portion_analyte,start=3,stop=3)
    code$sample_vial <- code$portion_analyte <- NULL

    # solid tumour (except blood for LAML)
    solid <- (code$sample=="01" | (y=="TCGA-LAML" & code$sample=="03"))
    X <- X[solid,]
    y <- y[solid]

    # unique samples
    unique <- !duplicated(substr(rownames(X),start=1,stop=12))
    X <- X[unique,]
    y <- y[unique]

    save(list=c("y","X"),file=file.path(path,"data",paste0(type[i],"_sub.RData")))
}

# isoform: n=9'794, p=194'595, k=32
# miRNA: n=9'794, p=1'881, k=32
# gene: n=9'830, p=19'602, k=33
# CNV: n=10'578, p=10'000, k=33

## Understanding barcodes:
# overview: https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
# details: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables

## Understanding replicate samples:
# http://gdac.broadinstitute.org/runs/sampleReports/latest/READ_Replicate_Samples.html

Analysis

Last data analysis started on 2019-04-12 (R version 3.5.3).

### Analysing the TCGA data. ###

#rm(list=ls())
#<<functions>>

for(type in c("miRNA","isoform","CNV","gene")){ 
  library(Matrix)
  for(k in c(207,sample(528))){
    set.seed(k); cat(k," ")
    if(type %in% c("isoform","miRNA") & k > 496){next}
    if(type %in% c("CNV","gene") & k > 528){next}

    # searching for missing cancer-cancer combinations
    rm(list=setdiff(ls(),c("k","type","path","save","file.exists","file.remove"))); gc()
    file0 <- paste0("results/",type,"_start_",k,".RData")
    file1 <- paste0("results/",type,"_loss_",k,".RData")
    if(file.exists(file0)||file.exists(file1)){next}
    save(object=k,file=file0)
    load(paste0("data/",type,"_sub.RData"))

    # indicating the cancer-cancer combination
    cancer <- substring(text=unique(y),first=6)
    comb <- utils::combn(x=cancer,m=2)
    select <- paste0("TCGA-",comb[,k])
    y <- ifelse(y==select[1],1,ifelse(y==select[2],0,NA))
    rm(cancer,select)

    # removing other cancer types
    cond <- !is.na(y)
    y <- y[cond]
    X <- X[cond,]
    rm(cond)

    # pre-processing
    if(type %in% c("isoform","miRNA")){
      x <- palasso:::.prepare(X,cutoff="zero")
    } else if(type=="gene"){
      x <- palasso:::.prepare(X,cutoff="knee")
    } else if(type=="CNV"){
      x <- list(X=X,Z=sign(X))
      x <- lapply(x,function(x) scale(x))
      attributes(x)$info <- data.frame(n=nrow(X),p=ncol(X),prop=mean(x$Z==0))
    }
    rm(X)

    # cross-validation
    loss <- tryCatch(expr=palasso:::.predict(y=y,X=x,nfolds.int=10),error=function(e) palasso:::.predict(y=y,X=x,nfolds.int=10))

    # information
    loss$info <- cbind(k=k,
                       y0=comb[2,k],
                       y1=comb[1,k],
                       n0=sum(y==0),
                       n1=sum(y==1),
                       attributes(x)$info,
                       loss$info)

    # refit
    object <- palasso::palasso(y=y,X=x,nfolds=10,family="binomial",standard=TRUE,elastic=TRUE,shrink=TRUE)

    model <- c(names(object),"elastic",
               paste0("paired.",c("adaptive","standard","combined")))

    for(max in c(10,50,Inf)){
      temp <- list()
      temp$nzero <- data.frame(model=model,x=NA,z=NA)
      for(i in seq_along(model)){
        coef <- palasso:::coef.palasso(object=object,model=model[i],max=max)
        temp$nzero$x[i] <- sum(coef$x!=0)
        temp$nzero$z[i] <- sum(coef$z!=0)
      }
      temp$select <- palasso:::subset.palasso(x=object,max=max,
                                              model="paired.adaptive")$palasso$select
      temp$weights <- palasso:::weights.palasso(object=object,max=max,
                                                model="paired.adaptive")
      temp$coef <- palasso:::coef.palasso(object=object,max=max,
                                          model="paired.adaptive")
      loss[[paste0("fit",max)]] <- temp
    }

    save(object=loss,file=file1)
    file.remove(file0)
  }

  index <- sum(grepl(dir(),pattern="sessionInfo"))
  sink(paste0("sessionInfo",index+1,".txt"))
  date()
  utils::sessionInfo()
  devtools::session_info()
  sink()

}
# The function <<collect>> loads all files from PATH including PATTERN in the file name, loads OBJECT into a list, and executes a function call.
#<<functions>>

# path: folder
# pattern: character, or NULL (all files)
# object: character vector, or NULL (all objects)
# what: function call

collect <- function(path=getwd(),pattern="",object=NULL,what="rbind"){
    OBJECT = object
    # identify files
    files <- dir(path)
    files <- files[grepl(x=files,pattern=pattern)]
    files <- files[grepl(x=files,pattern=".RData")]
    number <- gsub(pattern=paste0(pattern,"|.RData"),replacement="",x=files)
    files <- files[order(as.numeric(number))] # trial1
    names <- gsub(pattern=".RData",replacement="",x=files) # trial1
    if(length(files)==0){stop("Invalid datasets.")}
    # load data
    all <- list()
    for(i in seq_along(files)){
        x <- load(file.path(path,files[i]))
        x <- eval(parse(text=x))
        if(is.null(OBJECT)){
            all[[i]] <- x
            names(all)[i] <- names[i] 
        } else {
            for(j in seq_along(OBJECT)){
                all[[OBJECT[j]]][[i]] <- x[[OBJECT[j]]]
                names(all[[OBJECT[j]]])[i] <- names[i]
            }
        }
    }
    # fuse data
    if(is.null(OBJECT)){
        all <- do.call(what=what,args=all)
    } else {
        all <- lapply(all,function(x) do.call(what=what,args=x))
    }
    return(all)
}

LOSS <- list()
type <- c("gene","isoform","miRNA","CNV")
#type <- "miRNA"
for(i in seq_along(type)){
  LOSS[[type[i]]] <- collect(path="results",
                             pattern=paste0(type[i],"_loss_"),
                             object=c("deviance","auc","class","info",
                                      paste0("fit",c(10,50,Inf))))
}
for(i in seq_along(LOSS)){
  for(j in 1:3){
    colnames(LOSS[[i]][[j]])[colnames(LOSS[[i]][[j]])=="paired.adaptive"] <- "paired"
  }
}

#type <- "gene"
#a <- LOSS[[type]]$deviance[rownames(LOSS[[type]]$deviance)=="10","paired"]
#b <- LOSS[[type]]$deviance[rownames(LOSS[[type]]$deviance)=="10","elastic"]
#mean(a<b)
### Testing for significant differences. ###

#rm(list=ls())
#<<functions>>
#<<collect>>

row <- c("gene","isoform","miRNA","CNV")
col <- c("10","Inf")
lay <- c("standard_x","standard_z","standard_xz",
         "adaptive_x","adaptive_z","adaptive_xz",
         "elastic") # added "elastic"

M <- array(NA,dim=c(length(row),length(col),length(lay)),dimnames=list(row,col,lay))

for(i in seq_along(row)){

  loss <- LOSS[[row[i]]][c("info","deviance")]

  y0 <- as.character(loss$info$y0)
  y1 <- as.character(loss$info$y1)
  cancer <- sort(unique(c(y0,y1)))
  Z <- palasso:::.design(x=cancer)

  for(j in seq_along(col)){

    # differences
    cond <- rownames(loss$deviance)==col[j]

    for(k in seq_along(lay)){

      fill <- loss$deviance[cond,lay[k]] - loss$deviance[cond,"paired"]
      X <- matrix(NA,nrow=length(cancer),ncol=length(cancer),
                  dimnames=list(cancer,cancer))
      X[cbind(y0,y1)] <- X[cbind(y1,y0)] <- fill
      X[lower.tri(X)] <- NA

      # p-values
      pvalue <- rep(NA,times=max(Z))
      for(l in seq_len(max(Z))){
        x <- as.numeric(X[Z==l])
        if(col[j]=="10"){
          alternative <- "greater" # Never use "two.sided"!
        }
        if(col[j]=="Inf"){
          alternative <- "less" # Never use "two.sided"!
        }
        pvalue[l] <- stats::wilcox.test(x=x,alternative=alternative,
                                 exact=FALSE)$p.value
      }

      # Simes
      M[i,j,k] <- palasso:::.combine(pvalue,method="simes")
    }
  }
}

# Table SIG: significance
constraint <- "10"
table <- format(M[,constraint,1:6],digits=1,scientific=FALSE)
for(i in seq_len(nrow(table))){
    for(j in seq_len(ncol(table))){
        if(M[i,constraint,j]>=0.05){
            table[i,j] <- paste0("{\\color{gray}{",table[i,j],"}}")
        }
    }
}
one <- c("","\\text{standard}","","","\\text{adaptive}","")
two <- paste0("\\text{",c("x","z","xz","x","z","xz"),"}")
rownames(table) <- paste0("\\text{",rownames(table),"}")
table <- rbind(one,two,table,deparse.level=0)
rownames(table)[1] <- "~"
xtable <- xtable::xtable(table,align=c("r","|","c","c","c","|","c","c","c"))
xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)
### Comparison with elastic net. ###
#rm(list=ls())
#<<functions>>
#<<collect>>

row <- c("gene","isoform","miRNA","CNV")
col <- c("10","50","Inf")
better <- worse <- less <- matrix(NA,nrow=length(row),ncol=length(col),
                 dimnames=list(row,col))
for(i in seq_along(row)){
  for(j in seq_along(col)){
    # proportion of improvements (cross-validation)
    cond <- rownames(LOSS[[row[i]]]$deviance)==col[j]
    loss <- LOSS[[row[i]]]$deviance[cond,c("paired","elastic")]
    better[i,j] <- round(mean(loss[,"paired"]<loss[,"elastic"]),digits=2)
    worse[i,j] <- round(mean(loss[,"paired"]>loss[,"elastic"]),digits=2)
    # average difference in nzero (refitted models)
    df_paired <- apply(LOSS[[row[i]]][[paste0("fit",col[j])]],1,function(x)   sum(x$nzero[x$nzero[,"model"]=="paired.adaptive",c("x","z")]))
    df_elastic <- apply(LOSS[[row[i]]][[paste0("fit",col[j])]],1,function(x)   sum(x$nzero[x$nzero[,"model"]=="elastic95",c("x","z")]))
    df_diff <- df_elastic-df_paired
    less[i,j] <- round(mean(df_diff),digits=2)
    #graphics::hist(df_diff,main=paste(row[i],col[j]),xlim=c(-1,1)*max(abs(df_diff)))
  }
}
better
worse
less
### Analysing the refitted models. ###

#rm(list=ls())
#<<functions>>
#<<collect>>

# Table SEL: selected model
nzero <- paste0("fit",c(5,10,Inf))
model <- c(paste0("standard_",c("x","z","xz")),
           paste0("adaptive_",c("x","z","xz")),
           "between_xz","within_xz")
type <- c("gene","isoform","miRNA","CNV")
table <- array(NA,dim=c(length(nzero),length(model),length(type)),
               dimnames=list(nzero,model,type))
for(i in seq_along(nzero)){
    for(j in seq_along(model)){
        for(k in seq_along(type)){
            sub <- LOSS[[type[k]]][[nzero[i]]]
            table[i,j,k] <- sum(sub[,"select"]==model[j])
        }
    }
}
colSums(table["fit10",,]) # CHECK WHETHER COMPLETE!
table <- round(prop.table(table["fit10",,],margin=2),digits=2)
table <- t(table)
table <- table[,apply(table,2,function(x) any(x!=0))]
rownames(table) <- paste0("\\text{",rownames(table),"}")
xtable <- xtable::xtable(table,align=c("r","|","c","c","c","c"))
xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)

# selected weights and covariates
type <- c("gene","isoform","miRNA","CNV")
group <- c("x","z")
model <- c(paste0("standard_",c("x","z","xz")),
           paste0("adaptive_",c("x","z","xz")),
           "paired.adaptive","elastic") # added "elastic" , paste0("elastic",c(100,75,50,25))
weights10 <- weightsInf <- matrix(NA,nrow=length(group),ncol=length(type),
                dimnames=list(group,type))
coef10 <- coefInf <- array(NA,dim=c(length(group),length(type),length(model)),
               dimnames=list(group,type,model))
for(i in seq_along(group)){
    for(j in seq_along(type)){
        weights10[,j] <- rowMeans(sapply(LOSS[[type[j]]]$fit10[,"weights"],colMeans))
        weightsInf[,j] <- rowMeans(sapply(LOSS[[type[j]]]$fitInf[,"weights"],colMeans))
        for(k in seq_along(model)){
            coef10[i,j,k] <- mean(sapply(LOSS[[type[j]]]$fit10[,"nzero"], function(x) sum(x[x$model==model[k],group[i]])))
            coefInf[i,j,k] <- mean(sapply(LOSS[[type[j]]]$fitInf[,"nzero"], function(x) sum(x[x$model==model[k],group[i]])))
        }
    }
}
# coef10["x",,]+coef10["z",,]


# with sparsity constraint
round(prop.table(weights10,margin=2),2)
round(prop.table(coef10[,,"paired.adaptive"],margin=2),2)
round(colSums(coef10[,,"paired.adaptive"]),2)

# natural sparsity
round(prop.table(weightsInf,margin=2),2)
round(prop.table(coefInf[,,"paired.adaptive"],margin=2),2)
round(colSums(coefInf[,,"paired.adaptive"]),2)
round(colSums(coefInf[,,"elastic"])/colSums(coefInf[,,"paired.adaptive"]),1) # multiple nzero of elastic net and paired lasso

# Table NSC: number of non-zero coefficients
table <- round(coefInf["x",,]+coefInf["z",,])
colnames(table)[7] <- "paired"
one <- c("","\\text{standard}","","","\\text{adaptive}","","\\text{paired}","\\text{elastic}")
two <- paste0("\\text{",c("x","z","xz","x","z","xz","xz","xz"),"}")
rownames(table) <- paste0("\\text{",rownames(table),"}")
table <- rbind(one,two,table,deparse.level=0)
rownames(table)[1] <- "~"
xtable <- xtable::xtable(table,align=c("r","|","c","c","c","|","c","c","c","|","c","|","c"))
xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)

Figures

### FIGURE CSW ###

#rm(list=ls())
#<<functions>>

set.seed(1)
overfit <- TRUE

# simulate
n <- 10
cx <- stats::rbeta(n=n,shape1=0.9,shape2=1)
cz <- stats::rbeta(n=n,shape1=0.4,shape2=0.9)

# collection
x <- list()
y <- list()

# adaptive weights (X only)
x[[1]] <- rep(1,times=n)
if(overfit){x[[1]] <- cx}
y[[1]] <- rep(0,times=n)

# adaptive weights (Z only)
x[[2]] <- rep(0,times=n)
y[[2]] <- rep(1,times=n)
if(overfit){y[[2]] <- cz}

# adaptive weights (X and Z)
x[[3]] <- y[[3]] <- rep(0.5,times=n)
if(overfit){x[[3]] <- cx}
if(overfit){y[[3]] <- cz}

# within-pair weights
x[[4]] <- cx^2/(cx+cz)
y[[4]] <- cz^2/(cx+cz)

# visualisation
graphics::par(mfrow=c(1,4),mar=c(4.5,0.5,0.5,0.5),oma=c(0,2,0,0))
for(i in seq_len(4)){
    palasso:::plot_pairs(x=x[[i]],y=y[[i]],lwd=4)
    if(i==1){
        graphics::mtext(text="covariate pair",side=2,line=1)
    }
}

```r provides suitable isomi\textsc{r} data for $9\,794$ samples (left), from $32$ cancer types (centre), forming $496$ cancer-cancer combinations (right). Each sample appears in $31$ combinations."}

FIGURE DIA

rm(list=ls())

<>

ellipse <- function(x,y,a=0.2,b=0.25,border=NA){ n <- max(c(length(x),length(y))) if(length(x)==1){x <- rep(x,times=n)} if(length(y)==1){y <- rep(y,times=n)} if(length(border)==1){border <- rep(border,times=n)} for(i in seq_len(n)){ angle <- seq(from=0,to=2*pi,length=100) xs <- x[i] + a * cos(angle) ys <- y[i] + b * sin(angle) graphics::polygon(x=xs,y=ys,col=grey(0.9),border=border[i]) } }

cancer <- c("ACC","BLCA","BRCA","UVM") number <- c(80,409,1078,80) col <- grDevices::rgb(red=0,green=0,blue=sample(seq(from=75,to=255,length.out=length(number))),maxColorValue=255)

lwd <- log(2number)-2 lwd = pmax(5number/max(number),1) x1 <- 1.3; x2 <- 2; x3 <- 3

set.seed(1)

first layer (bubble)

graphics::plot.new() graphics::par(mar=c(0,0,0,0)) graphics::plot.window(xlim=c(1.1,3.3),ylim=c(-1.6,1.6)) ellipse(x=x1,y=0) graphics::text(x=x1,y=0,labels="TCGA",font=2,adj=c(0.5,0)) graphics::text(x=x1,y=0,labels="n=9794",adj=c(0.5,1.2),cex=0.9)

second layer (colon)

y1 <- seq(from=1,to=-1,length.out=length(cancer)+1) graphics::text(x=x2,y=y1[length(cancer)],labels="...",font=2,srt=90) y1 <- y1[-length(cancer)]

first-second layer (connect)

graphics::segments(x0=x1+0.2,y0=0,x1=x2-0.2,y1=y1,lwd=lwd,col=col)

second layer (bubble)

ellipse(x=x2,y=y1,a=0.2,b=0.2) graphics::text(x=x2,y=y1,labels=cancer,font=2,col=col,adj=c(0.5,0)) graphics::text(x=x2,y=y1,labels=paste0("n=",number,""),adj=c(0.5,1.2),cex=0.9)

comb <- utils::combn(x=seq_along(y1),m=2)

third layer (colon)

y2 <- seq(from=1.5,to=-1.5,length.out=ncol(comb)+1) graphics::text(x=x3,y=y2[ncol(comb)],labels="...",font=2,srt=90) y2 <- y2[-ncol(comb)]

second-third layer (connect)

graphics::segments(x0=2.2,y0=y1[comb[1,]],x1=2.7,y1=y2,lwd=lwd[comb[1,]],col=col[comb[1,]]) graphics::segments(x0=2.2,y0=y1[comb[2,]],x1=2.7,y1=y2,lwd=lwd[comb[2,]],col=col[comb[2,]])

third layer (bubble)

ellipse(x=x3,y=y2,a=0.3,b=0.22) graphics::text(x=x3,y=y2,labels=paste0(cancer[comb[1,]]," "), font=2,col=col[comb[1,]],adj=c(1,0)) graphics::text(x=x3,y=y2,labels=paste0(" ",cancer[comb[2,]]), font=2,col=col[comb[2,]],adj=c(0,0)) graphics::text(x=x3,y=y2,labels=":",font=2,adj=c(0.5,0)) labels <- apply(comb,2,function(x) sum(number[x])) labels <- paste0("n=",labels,"") graphics::text(x=x3,y=y2,labels=labels,adj=c(0.5,1.2),cex=0.9)

```rs, mi\\textsc{rna}s and \\textsc{cnv}s (from top to bottom). The bar charts (left) count how often the paired lasso leads to a lower (dark) or higher (bright) deviance than the competing model. The box plots (right) show how much lower (dark) or higher (bright) the deviance is."}
### FIGURE CLA ###

#rm(list=ls())
#<<functions>>

graphics::par(mfrow=c(4,2),oma=c(0,0,0,0),mar=c(2.1,3.5,0.5,0.5))

for(type in c("gene","isoform","miRNA","CNV")){

    loss <- LOSS[[type]][c("deviance","auc","class")]
    choice <- "paired"
    loss <- lapply(loss,function(x) x[,c(paste0("standard_",c("x","z","xz")),paste0("adaptive_",c("x","z","xz")),choice)])

    for(constraint in c("10")){ # c("5","10","Inf")
        # change
        sub <- lapply(loss,function(x) x[rownames(x)==constraint,])
        palasso:::plot_score(sub$deviance,choice=choice)
        change <- sub$deviance[,7]-sub$deviance[,-7]
        palasso:::plot_box(change,ylab="change",zero=TRUE,choice=NA)
        # info
        info <- list()
        info$select <- names(which.min(apply(sub$deviance,2,median)[-7]))
        info$DEV_paired <- median(sub$deviance[,choice])
        info$DEV_select <- median(sub$deviance[,info$select])
        info$improve <- mean(sub$deviance[,info$select]>sub$deviance[,choice])
        info$AUC_paired <- median(sub$auc[,choice])
        info$CLASS_paired <- median(sub$class[,choice])
        print(as.data.frame(info)) # important
    }
}

```rs (top right), mi\textsc{rna}s (bottom left) and \textsc{cnv}s (bottom right). The median deviances ($y$-axis) of the standard (dotted), adaptive (dashed) and paired (solid) lasso converge as the sparsity constraint ($x$-axis) increases."}

FIGURE DEC

rm(list=ls())

<>

graphics::par(oma=c(1.0,1.0,0,0),mar=c(1.5,3.0,0.5,0.5),mfrow=c(1,1))

graphics::par(oma=c(1.0,1.0,0,0),mar=c(1.5,3.0,0.5,0.5),mfrow=c(2,2))

for(type in c("gene","isoform","miRNA","CNV")){ models <- c(paste0("standard_",c("x","z","xz")), paste0("adaptive_",c("x","z","xz")),"paired") constraint <- c("3","4","5","10","15","20","25","50","Inf")

loss <- LOSS[[type]]["deviance"]

loss <- lapply(loss,function(x) x[,models])

table <- matrix(NA,nrow=length(constraint),ncol=length(models),
                dimnames=list(constraint,models))
for(i in seq_along(constraint)){
    sub <- lapply(loss,function(x) x[rownames(x)==constraint[i],])
    table[i,] <- apply(sub$deviance,2,median)
}

# table <- log(table)
graphics::plot.new()
graphics::plot.window(xlim=c(1,length(constraint)),ylim=range(table))
graphics::box()
constraint[constraint=="Inf"] <- "n"
graphics::axis(side=2)
graphics::axis(side=1,at=seq_along(constraint),labels=constraint,tick=FALSE,line=-1)

for(k in c(1,2)){
    for(i in seq_along(models)){
        lty <- ifelse(i%in%c(1,2,3),3,ifelse(i%in%c(4,5,6),2,1))
        col <- ifelse(i==7,"#00007F","#FF3535") 
        pch <- ifelse(i%in%c(1,4),"x",ifelse(i%in%c(2,5),"z",1))
        if(k==1){
            graphics::lines(table[,i],col=col,lty=lty,lwd=2)
            graphics::points(table[,i],col="white",pch=16,cex=1.2)
        } else {
            graphics::points(table[,i],col=col,pch=1,font=2)  
        }
    }
}

} graphics::title(ylab="deviance",line=0.0,outer=TRUE) graphics::title(xlab="sparsity constraint",ylab="deviance",line=0.0,outer=TRUE)

```rs. The box plots show how much the paired lasso improves (dark) or deteriorates (bright) the \\textsc{auc} (left) and misclassification rate (right) of the competing models."}
### FIGURE CNV ###

#rm(list=ls())
#<<functions>>

graphics::par(oma=c(0,0,0,0),mar=c(2.1,3.5,0.5,0.5))
graphics::layout(matrix(c(1,1,2,2),nrow=1))

loss <- LOSS[[type]][c("deviance","auc","class")]
loss <- lapply(loss,function(x) x[rownames(x)=="10",])
model <- c(paste0("standard_",c("x","z","xz")),
           paste0("adaptive_",c("x","z","xz")))

diff <- loss$auc[,"paired"]-loss$auc[,model]
palasso:::plot_box(diff,zero=TRUE,invert=TRUE,ylab="change")
diff <- loss$class[,"paired"]-loss$class[,model]
palasso:::plot_box(diff,zero=TRUE,ylab="change")

```r for \textsc{cnv}s. Each cell represents one cancer-cancer combination (row, column). The colour indicates whether the paired lasso leads to a low (dark) or high (bright) \textsc{auc}."}

FIGURE MAP

rm(list=ls())

<>

loss <- LOSS[["CNV"]][c("info","auc")]

cancer <- sort(unique(c(levels(loss$info$y0),levels(loss$info$y1)))) X <- matrix(NA,nrow=length(cancer),ncol=length(cancer),dimnames=list(cancer,cancer))

Z <- palasso:::.design(x=cancer)

y0 <- as.character(loss$info$y0) y1 <- as.character(loss$info$y1) X[cbind(y0,y1)] <- X[cbind(y1,y0)] <- loss$auc[rownames(loss$auc)=="10","paired"]

graphics::par(mar=c(0.5,3.0,3.0,0.5)) dimnames(X) <- lapply(dimnames(X),function(x) paste0(" ",x," ")) palasso:::plot_table(X=X,margin=-1,labels=FALSE,las=2,cex=0.7)

sort(rowMeans(X,na.rm=TRUE),decreasing=TRUE)[1:2] # keep!

```rs. Given $32$ cancer types, this matrix shows the assignment of $496$ dependent pairs to $31$ groups of $16$ independent pairs, with each symbol representing one group."}
### FIGURE COM ###

# 32 cancer types for isoform and miRNA
# 33 cancer types for gene and CNV

#rm(list=ls())
#<<functions>>

for(type in c("miRNA")){

cancer <- sort(unique(as.character(unlist(LOSS[[type]]$info[,c("y0","y1")]))))

n <- length(cancer)
z <- as.numeric(palasso:::.design(x=n))
x <- rep(seq_len(n),each=n)
y <- rep(seq(from=n,to=1,by=-1),times=n)

pch <- z
pch[pch==0] <- NA
pex <- c(".","O","*","+","o","-","'","x")

# colour
base <- grDevices::colorRampPalette(colors=c('darkblue','blue','red','darkred'))(n)

col <- rep(NA,times=length(z))
col[z==0] <- "white"
for(i in seq_len(n)){
    col[z==i] <- base[i]
}

graphics::par(mfrow=c(1,1),mar=c(0,0,2,2))
graphics::plot.new()
graphics::plot.window(xlim=c(1,n),ylim=c(1,n))
graphics::points(x=x[pch<=25],y=y[pch<=25],
                 pch=pch[pch<=25],col=col[pch<=25],cex=0.9)
graphics::points(x=x[pch>25],y=y[pch>25],
                 pch=pex[(pch-25)[pch>25]],col=col[pch>25],cex=0.9)
graphics::segments(x0=0,x1=n+1,y0=n+1)
graphics::segments(x0=n+1,y0=n+1,y1=0)
graphics::segments(x0=0,x1=n+1,y0=n+1,y1=0,lty=2)

graphics::mtext(text=cancer,side=3,at=1:n,las=2,cex=0.7)
graphics::mtext(text=cancer,side=4,at=n:1,las=2,cex=0.7)

}





Try the palasso package in your browser

Any scripts or data that you put into this service are public.

palasso documentation built on April 19, 2021, 5:07 p.m.