R/cgdsr.R

library(R.oo)

.onAttach <- function(libname, pkgname){
    packageStartupMessage('Please send questions to cbioportal@googlegroups.com')
}

setConstructorS3("CGDS", function(url='',verbose=FALSE,ploterrormsg='',token=NULL) {
  extend(Object(), "CGDS",
         .url=url,
         .token=token,
         .verbose=verbose,
         .ploterrormsg='')
})

setMethodS3("processURL","CGDS", private=TRUE, function(x, url, force.comment.char.blank=FALSE, ...) {
  if (x$.verbose) cat(url,"\n")

  headers <- httr::add_headers()

  if(!is.null(x$.token)) {
    headers <- httr::add_headers(Authorization = paste("Bearer ", x$.token, sep = ""))
  }

  r <- httr::GET(url, headers)

  httr::stop_for_status(r)

  if(force.comment.char.blank) {
    df = read.table(skip=0, header=TRUE, as.is=TRUE, sep="\t", quote='', comment.char='', fill=TRUE, text=httr::content(r, "text"))
  } else {
    df = read.table(skip=0, header=TRUE, as.is=TRUE, sep="\t", quote='', fill=TRUE, text=httr::content(r, "text"))
  }
})

setMethodS3("setPlotErrorMsg","CGDS", function(x, msg, ...) {
  x$.ploterrormsg = msg
  return(msg)
})

setMethodS3("setVerbose","CGDS", function(x, verbose, ...) {
  x$.verbose = verbose
  return(verbose)
})

setMethodS3("getCancerStudies","CGDS", function(x, ...) {
  url = paste(x$.url, "webservice.do?cmd=getCancerStudies&",sep="")
  df = processURL(x,url)
  return(df)
})

setMethodS3("getCaseLists","CGDS", function(x, cancerStudy, ...) {
  url = paste(x$.url, "webservice.do?cmd=getCaseLists&cancer_study_id=", cancerStudy, sep="")
  df = processURL(x,url)
  return(df)
})

setMethodS3("getGeneticProfiles","CGDS", function(x, cancerStudy, ...) {
  url = paste(x$.url, "webservice.do?cmd=getGeneticProfiles&cancer_study_id=", cancerStudy, sep="")
  df = processURL(x,url)
  return(df)
})

setMethodS3("getMutationData","CGDS", function(x, caseList, geneticProfile, genes, ...) {
  url = paste(x$.url, "webservice.do?cmd=getMutationData",
    "&case_set_id=", caseList,
    "&genetic_profile_id=", geneticProfile,
    "&gene_list=", paste(genes,collapse=","), sep="")
  df = processURL(x,url)
  return(df)
})

setMethodS3("getProfileData","CGDS", function(x, genes, geneticProfiles, caseList='', cases=c(), caseIdsKey = '', ...) {
  url = paste(x$.url, "webservice.do?cmd=getProfileData",
    "&gene_list=", paste(genes,collapse=","),
    "&genetic_profile_id=", paste(geneticProfiles,collapse=","),
    "&id_type=", 'gene_symbol',
    sep="")

  if (length(cases)>0) { url = paste(url,"&case_list=", paste(cases,collapse=","),sep='')
  } else if (caseIdsKey != '') { url = paste(url,"&case_ids_key=", caseIdsKey,sep='')
  } else { url = paste(url,"&case_set_id=", caseList,sep='') }

  df = processURL(x,url)

  if (nrow(df) == 0) { return(df) }

  m = matrix()
  # process data before returning
  if (length(geneticProfiles) > 1) {
    cnames = df[,1]
    m = t(df[,-c(1:4)])
    colnames(m) = cnames
  } else {
    cnames = df[,2]
    m = t(df[,-c(1:2)])
    colnames(m) = cnames
  }

  return(data.frame(m))
})

setMethodS3("getClinicalData","CGDS", function(x, caseList='', cases=c(), caseIdsKey = '', ...) {
  url = paste(x$.url, "webservice.do?cmd=getClinicalData",sep="")

  if (length(cases)>0) { url = paste(url,"&case_list=", paste(cases,collapse=","),sep='')
  } else if (caseIdsKey != '') { url = paste(url,"&case_ids_key=", caseIdsKey,sep='')
  } else { url = paste(url,"&case_set_id=", caseList,sep='') }

  df = processURL(x,url)
  rownames(df) = make.names(df[,1])
  return(df[,-1])
})

setMethodS3("plot","CGDS", function(x, cancerStudy, genes, geneticProfiles, caseList='', cases=c(),  caseIdsKey = '', skin='cont', skin.normals='', skin.col.gp = c(), add.corr = '', legend.pos = 'topright', ...) {

  errormsg <- function(msg,error=TRUE) {
    # return empty plot with text
    if (error) {msg = paste('Error:',msg)}
    # override msg if global message provided in object
    if (x$.ploterrormsg != '') {msg = x$.ploterrormsg}
    plot.new()
    # set message text here ...
    #mtext(msg,cex=1.0,col='darkred')
    text(0.5,0.5,msg,cex=1.0,col='darkred')
    box()
    return(msg)
  }

  # we only allow the following combinations
  # a) gene1 (1 profile)  # b) gene1 vs gene2 (1 profile)
  # c) profile 1 vs profile 2 (1 gene)
  if((length(genes) > 1 & length(geneticProfiles) > 1) | (length(genes) > 2 | length(geneticProfiles) > 2)) {
    return(errormsg("use only 2 genetic profiles OR 2 genes"))
  }

  # make genenames conform to R variable names
  genesR = make.names(genes)

  # get data, check more than zero rows returned, otherwise return
  df = getProfileData(x, genes, geneticProfiles, caseList, cases, caseIdsKey)
  if (nrow(df) == 0) { return(errormsg(paste('empty data frame returned :\n',colnames(df)[1]))) }

  # check data returned with more than two genes or genetic profiles
  if (length(genes) == 2 & ncol(df) != 2) { return(errormsg(paste("gene not found:", setdiff(genesR,colnames(df))))) }
  if (length(geneticProfiles) == 2 & ncol(df) != 2) { return(errormsg("geneticProfile ID not found:", setdiff(geneticProfiles,colnames(df)))) }

  # get geneticProfiles annotation for axis labels
  gps = getGeneticProfiles(x, cancerStudy)
  if (nrow(gps) == 0) { errormsg(colnames(gps[1])) }
  rownames(gps) = gps[,1]
  gpaxisnames = gps[geneticProfiles,'genetic_profile_name']
  names(gpaxisnames) = geneticProfiles

  # we can have a situation where there is no data for a given combination of gene and genetic profile
  # in this case, we have a column of NaN, and we generate an error
  nacols = sapply(df, function(x) all(is.nan(x)))
  if (any(nacols)) {
    if (length(geneticProfiles) > 1) {
      # two genetic profiles in columns, one gene
      return(errormsg(paste(genes, "has no data\n for genetic profile(s):", paste(gpaxisnames[nacols],collapse=", ")),FALSE))
    } else {
      # one genetic profile, one or two genes in columns
      return(errormsg(paste(paste(genes[nacols],collapse=' and '), "has no data\n for genetic profile:", gpaxisnames),FALSE))
    }
  }

  # set sub title with correlation if specified
  plot.subtitle = ''
  if ((add.corr == 'pearson'  | add.corr == 'spearman') & ncol(df) == 2) {
    ct = cor.test(df[,1],df[,2],method=add.corr)
    plot.subtitle = paste(add.corr, ' r = ', sprintf("%.2f",ct$estimate), ', p = ',sprintf("%.1e",ct$p.value))
  }

  ###
  ### Skins
  ###

  if (skin == 'cont') {

    if(length(genes) == 1 & length(geneticProfiles) == 1) {
      hist(df[,1],xlab=paste(genes," , ",gpaxisnames,sep=""),main='')
    } else if (length(genes) == 2) {
      # two genes
      plot(df[,genesR[1]],df[,genesR[2]] , main = '', xlab = paste(genes[1],", ",gpaxisnames,sep=""), ylab = paste(genes[2],", ",gpaxisnames,sep=""), pch = 1, col = 'black', sub = plot.subtitle)
    } else {
      # two genetic profiles
      gpa = geneticProfiles[1]
      gpb = geneticProfiles[2]
      plot(df[,gpa],df[,gpb] , main = '', xlab = paste(genes[1],", ",gpaxisnames[gpa],sep=""), ylab = paste(genes[1],", ",gpaxisnames[gpb],sep=""), pch = 1, col = 'black', sub = plot.subtitle)
    }

  } else if (skin == 'disc') {

    if(length(genes) == 1 & length(geneticProfiles) == 1) {
      barplot(table(df[,1]),xlab=paste(genes,", ",gpaxisnames,sep=""),main='',ylab='frequency')
    } else {
      #discrete vs discrete
      return(errormsg('discrete vs. discrete data not implemented'))
    }

  } else if (skin =='disc_cont') {

    # skin only valid for two genetic profiles
    if(length(geneticProfiles) != 2) {
      return(errormsg("two genetic profiles required for skin 'disc_cont'"))
    } else {
      # skin assumes that first genetic profile is discret
      gp.disc = geneticProfiles[1] # b
      gp.cont = geneticProfiles[2] # a

      boxplot(df[,gp.cont] ~ df[,gp.disc], outpch = NA, main = '', xlab = paste(genes[1],", ",gpaxisnames[gp.disc],sep=""), ylab = paste(genes[1],", ",gpaxisnames[gp.cont],sep=""), border = 'gray', sub = plot.subtitle)
      stripchart(df[,gp.cont] ~ df[,gp.disc],vertical = TRUE, add = TRUE, method = 'jitter', pch = 1, col = 'black')
    }

  } else if (skin == 'cna_mrna_mut') {

    # skin uses parameters
    # * skin.col.gp = mut
    # * skin.normals = normal_case_set [optional]

    # fetch optional normal mRNA data

    if (skin.normals != '') {
      df.norm = getProfileData(x, genes, geneticProfiles[2], skin.normals)
      if (nrow(df.norm) == 0) { return(errormsg(paste('empty data frame returned :\n',colnames(df.norm)[1]))) }
      # check if data is missing (NaN)
      if ( !all(is.nan(df.norm[,1])) ) {
        # add normal data to dataframe
        df.norm2 = cbind(rep(-3,nrow(df.norm)),df.norm[,1])
        colnames(df.norm2) = geneticProfiles
        df = rbind(df,df.norm2)
      }
    }

    # create boxplot
    df.nona = df[apply(df, 1, function(x) {!any(is.na(x))}),]
    ylim=range(df.nona[,geneticProfiles[2]],na.rm=TRUE)
    labels=seq(-3,2)
    names(labels)=c("Normal","Homdel","Hetloss","Diploid","Gain","Amp")
    labels.inuse = sort(unique(df.nona[,geneticProfiles[1]])) # sort removes any NA

    boxplot(df.nona[,geneticProfiles[2]] ~ df.nona[,geneticProfiles[1]], main='', outline=FALSE,
            xlab = paste(genes,", ",gpaxisnames[geneticProfiles[1]],sep=""),
            ylab = paste(genes,", ",gpaxisnames[geneticProfiles[2]],sep=""),
            border="gray",ylim=ylim, axes=FALSE, outpch = NA, sub = plot.subtitle)

    axis(1,at=seq(1,length(labels.inuse)),labels=names(labels)[match(labels.inuse,labels)],cex.axis=0.8)
    axis(2,cex.axis=0.8,las=2)
    # box()

    # manually jitter data

    # order data by CNA status
    df.nona=df.nona[order(df.nona[,geneticProfiles[1]]),]
    xy=list()
    xy$MRNA=df.nona[,geneticProfiles[2]]

    cats=cbind(1:length(labels.inuse),as.data.frame(table(df.nona[,geneticProfiles[1]])))
    colnames(cats)=c("X","Class","Count")
    rownames(cats)=names(labels)[match(cats[,2],labels)]

    xy$JITTER = unlist( apply(cats,1, function(cc) {
      y=rep.int(as.numeric(cc[1]),as.numeric(cc[3]))
      y=y+stats::runif(length(y),-0.1,0.1)
    }))
    xy=as.data.frame(xy)
    colnames(xy) = c('MRNA','JITTER')

    # Initialize plotting features
    nonmut.pch = 4
    N=nrow(xy)
    cex=rep(0.9,N)
    pch=rep(nonmut.pch,N)
    col=rep("royalblue",N)
    bg=rep(NA,N)

    # fetch mutation data for color coding
    if (length(skin.col.gp == 1)) {
      df.mut = getProfileData(x, genes, skin.col.gp, caseList, cases, caseIdsKey)
      if (nrow(df.mut) == 0) { return(errormsg(paste('empty data frame returned :\n',colnames(df.mut)[1]))) }
      # get matrix corresponding to df.nona
      df.mut = df.mut[rownames(df.nona),1]
      # check if data is missing (NaN)
      mut=which(!is.na(df.mut))
      if(length(mut)>0) {
        # default mutation
        col[mut]="red3"
        bg[mut]="goldenrod"
        pch[mut]=21
        mt=list()
        mt$pch=c(21,23,24,25,22)
        mt$type=c("missense","nonsense","splice","shift","in_frame")
        mt$pattern=c("^[a-z][0-9]+[a-z]$","[*x]$","^(e.+[0-9]|.+_splice)$","fs$","del$")
        mt$bg=c("goldenrod","darkblue","darkgray","black","goldenrod")
        mt=as.data.frame(mt)
        for(i in 1:nrow(mt)) {
          idx=grep(mt$pattern[i],tolower(df.mut))
          pch[idx]=mt$pch[i]
          bg[idx]=as.character(mt$bg[i])
        }
        #col[mut]="red3" # ??????
        legend("topleft",bty="n",
               as.character(as.vector(mt[["type"]])),col="red3",
               pt.bg=as.character(as.vector(mt[["bg"]])),
               pch=mt[["pch"]],cex=0.85,pt.cex=1.0
               )
      }
    }

    ## # Plot the jittered data, add mutated points last
    xy.mut = (pch != nonmut.pch)
    points(xy$JITTER[!xy.mut],xy$MRNA[!xy.mut],pch=pch[!xy.mut],cex=cex[!xy.mut],col=col[!xy.mut],bg=bg[!xy.mut])
    points(xy$JITTER[xy.mut],xy$MRNA[xy.mut],pch=pch[xy.mut],cex=cex[xy.mut],col=col[xy.mut],bg=bg[xy.mut])
    box()

  } else if (skin == 'meth_mrna_cna_mut' | skin == 'cna_mut') {

    # these skins use parameters
    # * skin.col.gp = (cna,mut) [optional]
    # * skin.normals = normal_case_set [optional]
    # [meth_mrna_cna_mut] forces x axis range to [0,1.05]

    # fetch cna and mut data for color coding
    pch=rep(1,nrow(df))
    col=rep("black",nrow(df))

    if (length(skin.col.gp) == 2) { # color by both CNA and mutation
        df.col = getProfileData(x, genes, skin.col.gp, caseList, cases, caseIdsKey)
      if (nrow(df.col) == 0) { return(errormsg(paste('empty data frame returned :\n',colnames(df.col)[1]))) }
      # because mut is text vector, we need to transform cna to integer vector instead of factor
      cna = as.integer(as.vector(df.col[,skin.col.gp[1]]))
      mut = as.vector(df.col[,skin.col.gp[2]])
      col[cna==-2]="darkblue"
      col[cna==-1]="deepskyblue"
      col[cna==1]="hotpink"
      col[cna==2]="red3"
      col[mut!="NaN"]="orange"
      pch[mut!="NaN"]=20
    }
    else if (length(skin.col.gp) == 1) { # color only by CNA
      df.col = getProfileData(x, genes, skin.col.gp, caseList, cases, caseIdsKey)
      if (nrow(df.col) == 0) { return(errormsg(paste('empty data frame returned :\n',colnames(df.col)[1]))) }
      # because mut is text vector, we need to transform cna to integer vector instead of factor
      cna = as.integer(as.vector(df.col[,1]))
      col[cna==-2]="darkblue"
      col[cna==-1]="deepskyblue"
      col[cna==1]="hotpink"
      col[cna==2]="red3"
    }

    # fetch optional normal methylation and mRNA data
    if (skin.normals != '') {
      df.norm = getProfileData(x, genes, geneticProfiles, skin.normals)
      if (nrow(df.norm) == 0) { return(errormsg(paste('empty data frame returned :\n',colnames(df.norm)[1]))) }
      # remove missing data
      df.norm = df.norm[apply(df.norm, 1, function(x) {!any(is.na(x))}),]
      if ( length(df.norm) > 0) {
        df = rbind(df,df.norm)
        col = append(col, rep("black",nrow(df.norm)))
        pch = append(pch, rep(20,nrow(df.norm)))
      }
    }

    xlim = range(df[,geneticProfiles[1]],na.rm = TRUE)
    # add 15% to range, to make room for legend
    if (legend.pos == 'topright') {
      xlim = c(min(xlim),max(xlim) + (max(xlim)-min(xlim))*0.15)
    }
    else if (legend.pos == 'topleft') {
      xlim = c(min(xlim) - (max(xlim)-min(xlim))*0.15,max(xlim))
    }

    if (skin == 'meth_mrna_cna_mut') {
      # force x axis range to [0,1.05]
      xlim = c(0,1.05)
    }

    # now plot
    plot( df[,geneticProfiles[1]],df[,geneticProfiles[2]],main="",
         xlab=paste(genes,", ",gpaxisnames[geneticProfiles[1]],sep=""),
         ylab=paste(genes,", ",gpaxisnames[geneticProfiles[2]],sep=""),
         xlim=xlim,pch=pch,col=col,cex=1.2, sub = plot.subtitle
         )

    #abline(lm(d$rna~d$methylation),col="red3",lty=2,lwd=1.5)
    #lines(loess.smooth(d$methylation,d$rna),col="darkgray",lwd=2)

    # Replace with dynamically created legend when time permits
    legend(legend.pos,bty="n",
           c("Homdel","Hetloss","Diploid","Gain","Amp","Mutated","Normal"),
           col=c('darkblue','deepskyblue','black','hotpink','red','orange','black'),
           pch=c(1,1,1,1,1,20,20),cex=0.85,pt.cex=1.0
           )

  } else {
    return(errormsg(paste("unkown skin:",skin)))
  }

  return(TRUE)

})

setMethodS3("test","CGDS", function(x, ...) {
  checkEq = function(a,b) { if (identical(a,b)) "OK\n" else "FAILED!\n" }
  checkGrt = function(a,b) { if (a > b) "OK\n" else "FAILED!\n" }
  checkTrue = function(a) { if (a) "OK\n" else "FAILED!\n" }

  cancerstudies = getCancerStudies(x)
  cat('getCancerStudies... ',
      checkEq(colnames(cancerstudies),c("cancer_study_id","name","description")))
  ct = cancerstudies[2,1] # should be row 1 instead ...

  cat('getCaseLists (1/2) ... ',
      checkEq(colnames(getCaseLists(x,ct)),
              c("case_list_id","case_list_name",
                "case_list_description","cancer_study_id","case_ids")))
  cat('getCaseLists (2/2) ... ',
      checkEq(colnames(getCaseLists(x,'xxx')),
              'Error..Problem.when.identifying.a.cancer.study.for.the.request.'))

  cat('getGeneticProfiles (1/2) ... ',
      checkEq(colnames(getGeneticProfiles(x,ct)),
              c("genetic_profile_id","genetic_profile_name","genetic_profile_description",
                "cancer_study_id","genetic_alteration_type","show_profile_in_analysis_tab")))
  cat('getGeneticProfiles (2/2) ... ',
      checkEq(colnames(getGeneticProfiles(x,'xxx')),
              'Error..Problem.when.identifying.a.cancer.study.for.the.request.'))

  # clinical data
  # check colnames
  cat('getClinicalData (1/1) ... ',
      checkTrue("DFS_MONTHS" %in% colnames(getClinicalData(x,'gbm_tcga_all'))))

  # check one gene, one profile
  cat('getProfileData (1/6) ... ',
      checkEq(colnames(getProfileData(x,'NF1','gbm_tcga_mrna','gbm_tcga_all')),
              "NF1"))
  # check many genes, one profile
  cat('getProfileData (2/6) ... ',
      checkEq(colnames(getProfileData(x,c('MDM2','MDM4'),'gbm_tcga_mrna','gbm_tcga_all')),
              c("MDM2","MDM4")))
  # check one gene, many profile
  cat('getProfileData (3/6) ... ',
      checkEq(colnames(getProfileData(x,'NF1',c('gbm_tcga_mrna','gbm_tcga_mutations'),'gbm_tcga_all')),
              c('gbm_tcga_mrna','gbm_tcga_mutations')))
  # check 3 cases returns matrix with 3 columns
  cat('getProfileData (4/6) ... ',
      checkEq(rownames(getProfileData(x,'BRCA1','gbm_tcga_mrna',cases=c('TCGA-02-0001-01','TCGA-02-0003-01'))),
              make.names(c('TCGA-02-0001-01','TCGA-02-0003-01'))))
  # invalid gene names return empty data.frame
  cat('getProfileData (5/6) ... ',
      checkEq(nrow(getProfileData(x,c('NF10','NF11'),'gbm_tcga_mrna','gbm_tcga_all')),as.integer(0)))
  # invalid case_list_id returns error
  cat('getProfileData (6/6) ... ',
      checkEq(colnames(getProfileData(x,'NF1','gbm_tcga_mrna','xxx')),
              'Error..Invalid.case_set_id...xxx.'))
})

Try the cgdsr package in your browser

Any scripts or data that you put into this service are public.

cgdsr documentation built on June 27, 2019, 5:07 p.m.