R/CGDS_Sets.R

Defines functions setCGsurvival setCGclinic setCGstudy setCGbase

Documented in setCGbase setCGclinic setCGstudy setCGsurvival

# Install Package: Ctrl + Shift + B
# devtools::document()

#' create CGDS object
#'
#' Imports:
#' cgdsr
#'
#' @param address string, web address
#' @return cdgs object, and prints a test and the first 2 columns of the cdgs object
#' @details uses cgdsr::CGDS(address)
#' @examples
#' cgds <- setCGbase()
#' @export
setCGbase <- function(address="https://www.cbioportal.org/"){
  #==cdgs object==#
  cgds <- cgdsr::CGDS("https://www.cbioportal.org/")
  #==user info print==#
  print( cgdsr::test(cgds) )
  print( cgdsr::getCancerStudies(cgds)[,1:2] )
  #==output==#
  cgds
}

#' get name of a study
#'
#' Imports:
#' cgdsr
#'
#' @param cgds output from the setCGbase() function
#' @param studyNumber integer, index number of the study
#' @return study name, and prints caselist and genetic profiles
#' @details uses cgdsr::getCancerStudies(cgds)
#' @examples
#' cgds <- setCGbase()
#' study <- setCGstudy(cgds, 5)
#' @export
setCGstudy <- function(cgds, studyNumber){
  #==get specific study from the object==#
  study <- cgdsr::getCancerStudies(cgds)[studyNumber,1]
  #==user info print==#
  print( cgdsr::getCaseLists(cgds, study)[,2:3] )
  print( cgdsr::getGeneticProfiles(cgds, study)[,1:2] )
  #==output==#
  study
}

#' get clinical data from patients with specific data about genes
#'
#' Imports:
#' cgdsr
#'
#' @param cgds output from the setCGbase() function
#' @param study ouput from the setCGstudy() function
#' @param profile integer, profile name (what kind of data, RNA-Seq, Mutations, ...) obtained from the printout of the setCGstudy() function
#' @param caselist integer, caselist name (which samples) obtained from the printout of the setCGstudy() function
#' @param genes character vector, genes of interest, Symbols, EntrezIds and EnsemblIds work, maybe others too.
#' @return data.frame, clinical data of the patients (depending on what caselist was chosen) with data about the genes of interest (RNA-Seq counts, mutations occurences) depending on what profile was chosen
#' @details profile and caselist should be logically conntected, e.g. the caselist "samples with mutation data" would go along well with the profile "Mutations"
#' @examples
#' cgds <- setCGbase()
#' study <- setCGstudy(cgds, 5)
#' clinic <- setCGclinic(cgds, study, 2, 1, c("ACTA1","HOXA9"))
#' @export
setCGclinic <- function(cgds, study, profile, caselist, genes){
  #==get caselist (samples), profile (type of experiment) and the correpsonding data for the genes of interest==#
  caselist <- cgdsr::getCaseLists(cgds, study)[caselist,1]
  profile <- cgdsr::getGeneticProfiles(cgds, study)[profile,1]
  data1 <- cgdsr::getProfileData(cgds, genes, profile, caselist)
  #==get clinical data (patient info) for the data==#
  clinic <- cgdsr::getClinicalData(cgds,caselist)
  clinic <- transform(merge(clinic, data1, by=0, sort=F), row.names=Row.names, Row.names=NULL)
  #==output==#
  clinic
}

#' get clinical data from patients with specific data about genes
#'
#' Imports:
#' cgdsr,
#' survival,
#' survminer
#'
#' @param clinic output from the setCGclinic() function
#' @param gene character, gene of interest which had to be one of the genes specified in the setCGclinic() function. If you want to split only by genotype, use the splitByCol.
#' @param percsplit integer, how many percentiles are calculated? automactically controls what percentiles are calculated (2: 0th,50th; 3: 0th,33th,66th; 4: 0th,25th,50th,75th)
#' @param percs numbers, which percentiles should be displayed in the survival plot (e.g. 0, 50, 33, 90, etc.)
#' @param splitByCol character, a column name by which the groups should be split (in addition to the gene), e.g. whether or not another gene is mutated
#' @return list with 3 items: [[1]] a survival ggplot, [[2]] the extended clinic data.frame, [[3]] a summary, [[4]] the p value
#' @details uses the OS_MONTHS column of the clinic-data.frame to calculate survival
#' @examples
#' cgds <- setCGbase()
#' study <- setCGstudy(cgds, 5)
#' clinic <- setCGclinic(cgds, study, 2, 1, c("ACTA1","HOXA9"))
#' survival <- setCGsurvival(clinic, "HOXA9")
setCGsurvival <- function(clinic0, gene=NA, percsplit=2, percs=NA, splitByCol=NA){
  #==add new columns for survival==#
  clinic <- cbind(clinic0, deceased=grepl("DECEASED", clinic0[,"OS_STATUS"]))
  #==filter for rows with information about the gene of interest==#
  out <- NA
  if(!is.na(gene)){
    clinic <- clinic[!is.na(clinic[,gene]),]
  }
  if(nrow(clinic)==0){

    list(kaplanMeier=NA, table=NA, summary=NA, pval=1)

  }else{

    #==determine quantiles and split the table depending on the splitByCol parameter==#
    quantiles <- seq(0, 1-1/percsplit, 1/percsplit)
    if(!is.na(splitByCol)){
      separated <- split.data.frame(clinic, clinic[,splitByCol])
    }else{
      separated <- list(clinic)
    }
    #==if a gene of interest if specified, classify patients into quantiles based on their gene expression value==#
    if(!is.na(gene)){
      df1 <- lapply(separated, function(x) x[,gene,drop=F]) #get a list, each item is a single column of the gene's values
      df2 <- lapply(df1, function(x) quantile(x[,1], quantiles, na.rm=T)) #get a list of quantile vectors. Each vector has z items (z=2 for 0th&50th quantile, z=3 for 0th&33th&66th quantile)
      df3 <- mapply(function(x,y) lapply(x, function(z) rownames(y)[y>=z]), x=df2, y=df1, SIMPLIFY=F) #get a list where each patient ID is stored under a given quantile
      for(i in seq(df3)){
        a <- df3[[i]]
        for(j in 1:(length(a)-1)){ a[[j]] <- a[[j]][ !a[[j]] %in% a[[j+1]] ] }
        df3[[i]] <- a
      }
      df4 <- lapply(df3, function(z) mapply(function(x,y) data.frame(percentile=y, patient=x), x=z, y=paste0("perc", quantiles*100), SIMPLIFY=F))
      df4 <- lapply(df4, function(x) do.call(rbind, x))
      df4 <- do.call(rbind, df4)
    }else{
      df4 <- data.frame(percentile=rep(1,nrow(clinic)), patient=rownames(clinic))
    }
    #==merge original table with the quantile classification for each patient==#
    df5 <- transform(merge(clinic, df4, by.x=0, by.y="patient"), row.names=Row.names, Row.names=NULL)
    #==add a column that will be used during plotting==#
    if(is.na(splitByCol)){
      df5 <- cbind(df5, percentileSplitBy=df5$percentile)
    }else{
      df5 <- cbind(df5, percentileSplitBy=paste0(df5[,splitByCol], "_", df5$percentile))
    }
    #==if certain percentiles were defined, keep only those==#
    if(!is.na(percs[1])){
      df5 <- subset(df5, percentile %in% paste0("perc",percs))
    }
    #==calculate survival plot==#
    surv1 <- survival::survfit(survival::Surv(OS_MONTHS, deceased) ~percentileSplitBy, data=df5)
    plot1 <- survminer::ggsurvplot(surv1, data = df5, conf.int = F, pval = T)
    if(!is.na(gene) | !is.na(splitByCol)){pval <- survminer::surv_pvalue(surv1, data=df5)$pval}else{pval <- NA}
    #==output==#
    list(kaplanMeier=plot1, table=df5, summary=surv1, pval=pval)
  }

}
Solatar/setR documentation built on Dec. 5, 2020, 10:50 p.m.