R/Tcga_DateCanBeDownFor_Cancer.R

Defines functions Tcga_DateCanBeDownFor_Cancer

Documented in Tcga_DateCanBeDownFor_Cancer

#' Tcga Detail information for a special Date
#' 
#' @param tcgadate date form should be like "2013_09_23" for day, or "2013_09" for month, or "2013" for year
#'
#' @description Detail information for a special Date which can be dowanload,
#' this function could be used with function TcgaCancerDate. Into increasing the speed, we use
#' concurrency spider.
#' 
#' @return A big data frame
#' @export
#'
#' @examples Tcga_DateCanBeDownFor_Cancer(date="2013")
#' @examples Tcga_DateCanBeDownFor_Cancer(date="2013_09")
#' @examples Tcga_DateCanBeDownFor_Cancer(date="2013_09_23")
Tcga_DateCanBeDownFor_Cancer<-function(tcgadate){
  library(rvest)
  urlTime="http://gdac.broadinstitute.org/runs/"
  #get whole url
  arrayTime=read_html(urlTime) %>% html_nodes("td a") %>% html_text()#long time
  Judge_stddata=grepl(pattern = "stddata__[0-9]",x = arrayTime )
  stddata=arrayTime[Judge_stddata==TRUE]
  stddataT=gsub(pattern = "stddata__",replacement = "",x = stddata)
  cancertime=gsub(pattern = "/",replacement = "",x = stddataT)
  system.time({
    loopcraw<-function(i){
      library(rvest)
      urlTime="http://gdac.broadinstitute.org/runs/"
      #get whole url
      arrayTime=read_html(urlTime) %>% html_nodes("td a") %>% html_text()
      Judge_stddata=grepl(pattern = "stddata__[0-9]",x = arrayTime )
      stddata=arrayTime[Judge_stddata==TRUE]
      stddataT=gsub(pattern = "stddata__",replacement = "",x = stddata)
      cancertime=gsub(pattern = "/",replacement = "",x = stddataT)
      B=paste0(urlTime,stddata,"ingested_data.html")
      if (grepl(x=cancertime[i],pattern = "2012_11_02")==TRUE){
        "http://gdac.broadinstitute.org/runs/stddata__2012_11_02/gdac_counts.html" %>% 
          read_html() %>% html_nodes("table") %>% html_table(fill=TRUE,header=TRUE)
      }else{
        B[i] %>% read_html() %>% html_nodes("table") %>% html_table(fill=TRUE,header=TRUE)
      }
    }
    library(parallel)
    x <- 1:length(stddata)
    core=detectCores()
    cl <- makeCluster(core) # make cluster core
    results <- parLapply(cl,x,loopcraw) # par cluster
    stopCluster(cl) # close cluster
    takement=data.frame()
    for (i in 1:length(stddata)){
      A=results[[i]]
      if (length(A)==0){
        cat("")
      }else{ 
        if (A[[1]][1,1]==""){
          A_1=A[[1]][-1,]
        }else{
          A_1=A[[1]]
        }
        if ("PANCANCER" %in% A_1[,1]){
          norow=grep(x = A_1[,1],pattern = "PANCANCER")
          takement1=cbind(cancertime[i],A_1[-norow,])
          takement=c(takement,list(takement1))
        }else if ("Totals" %in% A_1[,1]){
          norow=grep(x = A_1[,1],pattern = "Totals")
          takement1=cbind(cancertime[i],A_1[-norow,])
          takement=c(takement,list(takement1))
        }else{
          takement1=cbind(cancertime[i],A_1)
          takement=c(takement,list(takement1))
        }
      }
    }
    #set a check to be change
    checktcgadate=0
    if (nchar(tcgadate)==10){
      for (i in 1:length(takement)){
        if (grepl(tcgadate,takement[[i]][,1][1])){ 
          checktcgadate=i
          CancerCanBeSortName=takement[[i]]
          return(CancerCanBeSortName)
        }
      }
    }
    if (nchar(tcgadate)==7){
      tcagdate7=data.frame()
      for (i in 1:length(takement)){
        if (grepl(tcgadate,takement[[i]][,1][1])){ 
          checktcgadate=i
          #complite dif
          if (nrow(tcagdate7) !=0){
            tcagdate7[setdiff(names(takement[[i]]), names(tcagdate7))] <- 0
          }
          takement[[i]][setdiff(names(tcagdate7), names(takement[[i]]))] <- 0
          #combine
          tcagdate7=rbind(tcagdate7,takement[[i]])
        }
      }
      return(tcagdate7)
    }
    if (nchar(tcgadate)==4){
      tcagdate4=data.frame()
      for (i in 1:length(takement)){
        if (grepl(tcgadate,takement[[i]][,1][1])){ 
          checktcgadate=i
          #complite dif
          if (nrow(tcagdate4) !=0){
            tcagdate4[setdiff(names(takement[[i]]), names(tcagdate4))] <- 0
          }
          takement[[i]][setdiff(names(tcagdate4), names(takement[[i]]))] <- 0
          #combine
          tcagdate4=rbind(tcagdate4,takement[[i]])
        }
      }
      return(tcagdate4)
    }
    if (nchar(tcgadate)==0){
      tcagdate0=data.frame()
      for (i in 1:length(takement)){ 
          checktcgadate=i
          #complite dif
          if (nrow(tcagdate0) !=0){
            tcagdate0[setdiff(names(takement[[i]]), names(tcagdate0))] <- 0
          }
          takement[[i]][setdiff(names(tcagdate0), names(takement[[i]]))] <- 0
          #combine
          tcagdate0=rbind(tcagdate0,takement[[i]])
      }
      rownames(tcagdate0)=1:nrow(tcagdate0)
      TumorColn=grep(pattern = "Tumor",x = colnames(tcagdate0))
      tcagdateall=cbind(tcagdate0[,c(1,TumorColn)],tcagdate0[,c(-1,-TumorColn)])
      return(tcagdateall)
    }
    #if checktcgadate stll equal 0, that indicates no date can be found
    if (checktcgadate==0){
      cat("There is no data for your date. Please check your date fromat using function TcgaCancerDate()","\n")
    }
  })#detect time
}
yikeshu0611/TCGAFamiliar documentation built on May 21, 2019, 1:45 a.m.