# R/CancerCanBeDownFor_Date.R

#' Downloadable data for one cancer, by run date
#'
#' Reads the run index at \url{http://gdac.broadinstitute.org/runs/}, visits
#' every \code{stddata__<date>} run page linked from it, and collects the rows
#' of the first table on each page whose cancer-name column equals
#' \code{cancer}. Progress is printed to the console with \code{cat()}.
#'
#' @param cancer A single cancer abbreviation, e.g. \code{"OV"}.
#' @import rvest
#' @return A data frame with one row per matching table row. The first column
#'   holds the run date; the remaining columns are those of the run tables
#'   (cells missing for a given run are filled with \code{""}). Every column is
#'   passed through \code{gsub()}, so all values are text, with non-breaking
#'   spaces replaced by regular spaces. An empty data frame is returned when
#'   no run lists \code{cancer}.
#' @export
#'
#' @examples
#' \dontrun{
#' CancerCanBeDownFor_Date("OV")
#' }
CancerCanBeDownFor_Date <- function(cancer) {
  # Fail fast, before issuing any HTTP request.
  cancer <- as.character(cancer)
  if (length(cancer) != 1L || is.na(cancer)) {
    stop("`cancer` must be a single, non-missing cancer abbreviation.",
         call. = FALSE)
  }
  # Check for the dependency instead of installing/attaching it from inside
  # the function (both are global side effects).
  if (!requireNamespace("rvest", quietly = TRUE)) {
    stop("Package 'rvest' is required but is not installed.", call. = FALSE)
  }

  run_index_url <- "http://gdac.broadinstitute.org/runs/"

  # Link texts of the index page; keep only the "stddata__<digit...>" entries.
  link_text <- rvest::html_text(
    rvest::html_nodes(rvest::read_html(run_index_url), "td a")
  )
  run_dirs <- link_text[grepl(pattern = "stddata__[0-9]", x = link_text)]
  run_urls <- paste0(run_index_url, run_dirs)

  # Run label = directory name without the "stddata__" prefix and any "/".
  run_dates <- gsub(
    pattern = "/", replacement = "",
    x = gsub(pattern = "stddata__", replacement = "", x = run_dirs)
  )

  cat("total data is", length(run_urls), "\n")

  # Name of the run-date column. Earlier versions created this column with
  # `cbind(cancertime[i], <table>)`, which auto-names it "cancertime[i]";
  # the name is kept so the returned columns stay the same for callers.
  date_col <- "cancertime[i]"

  # One slot per run; slots stay NULL for runs that contribute no rows.
  pieces <- vector("list", length(run_urls))

  for (i in seq_along(run_urls)) {
    tables <- rvest::html_table(
      rvest::html_nodes(rvest::read_html(run_urls[i]), "table"),
      fill = TRUE, header = TRUE
    )

    # A page without any table is treated like a page with an empty table.
    if (length(tables) == 0L || nrow(tables[[1]]) == 0L) {
      cat("data for", run_dates[i], "can not be done", "\n")
      next
    }
    cat(i, run_dates[i], "\n")

    run_table <- cbind(run_dates[i], tables[[1]])
    names(run_table)[1] <- date_col

    # Column 2 is the cancer name: keep the rows for the requested cancer and
    # drop that column. `%in%` (unlike `==`) never yields NA row selectors,
    # and `drop = FALSE` keeps a data frame even if one column remains.
    rows <- run_table[run_table[, 2] %in% cancer, -2, drop = FALSE]
    if (nrow(rows) == 0L) {
      next
    }

    # Remove the columns whose value in the first matching row contains a
    # space.
    # NOTE(review): the pattern is a plain space, as in the original code;
    # confirm it was not meant to be a non-breaking space.
    spaced_cols <- grep(pattern = " ", x = rows[1, ])
    if (length(spaced_cols) > 0L) {
      rows <- rows[, -spaced_cols, drop = FALSE]
    }

    # Give the symbol-prefixed headers plain-text names.
    names(rows)[names(rows) == "# Datasets"] <- "Number Of Datasets"
    names(rows)[names(rows) == "% Processed"] <- "Percent Processed"

    pieces[[i]] <- rows
  }

  cat("\n")

  pieces <- Filter(Negate(is.null), pieces)
  if (length(pieces) == 0L) {
    return(data.frame())
  }

  # Runs do not all expose the same columns: pad every piece with "" for the
  # columns it lacks (in order of first appearance), then bind once;
  # rbind() matches the columns of data frames by name.
  all_cols <- unique(unlist(lapply(pieces, names)))
  pieces <- lapply(pieces, function(piece) {
    absent <- setdiff(all_cols, names(piece))
    if (length(absent) > 0L) {
      piece[absent] <- ""
    }
    piece
  })
  combined <- do.call(rbind, pieces)

  # Replace non-breaking spaces by regular spaces in every column.
  application <- as.data.frame(lapply(combined, function(x) {
    gsub("\u00A0", " ", x)
  }))

  application
}
# yikeshu0611/ConvTCGA documentation built on May 17, 2019, 7:58 a.m.