R/TcgaCancerCanBeDownFor_Detail.R

Defines functions TcgaCancerCanBeDownFor_Detail

Documented in TcgaCancerCanBeDownFor_Detail

#' Detailed TCGA data availability for one cancer type
#'
#' @description Scrapes the Broad GDAC site
#' (\url{http://gdac.broadinstitute.org/}) and collects, for every
#' \code{stddata} run listed under \code{/runs/}, the rows of that run's first
#' HTML table whose first column equals \code{cancer}. Use it together with
#' \code{TcgaCancerName()} to look up valid abbreviations. To increase speed,
#' the per-run pages are downloaded concurrently on a local cluster.
#'
#' @param cancer A single cancer abbreviation as it appears in the second
#'   column of the counts table on the GDAC home page, e.g. \code{"OV"}.
#' @return A data frame with the matching rows of every run. The first column
#'   is named \code{"cancertime[i]"} (historical name, kept so existing callers
#'   keep working) and holds the run date; the remaining columns are sorted by
#'   name. Columns that a run does not report are filled with \code{0}. If no
#'   run lists \code{cancer}, a zero-row data frame is returned with a warning.
#' @export
#'
#' @examples
#' \dontrun{
#' TcgaCancerCanBeDownFor_Detail("OV")
#' }
TcgaCancerCanBeDownFor_Detail <- function(cancer) {
  if (length(cancer) != 1L || is.na(cancer)) {
    stop("`cancer` must be a single cancer abbreviation.", call. = FALSE)
  }

  home_url <- "http://gdac.broadinstitute.org/"
  runs_url <- "http://gdac.broadinstitute.org/runs/"

  # Validate the abbreviation against column 2 of the home page counts table.
  # rvest is called through `::` so nothing has to be attached, neither here
  # nor on the cluster workers.
  home_tables <- rvest::html_table(
    rvest::html_nodes(rvest::read_html(home_url), "table#counts_table")
  )
  if (length(home_tables) == 0L) {
    stop("could not find the counts table on ", home_url, call. = FALSE)
  }
  known_cancers <- gsub(" ", "", as.character(home_tables[[1]][[2]]))
  if (!cancer %in% known_cancers) {
    stop("your cancer name is WRONG. Please use function TcgaCancerName() to verify your input.")
  }

  # Read the run index ONCE; the workers only receive ready-made page URLs.
  link_text <- rvest::html_text(
    rvest::html_nodes(rvest::read_html(runs_url), "td a")
  )
  run_dirs <- link_text[grepl(pattern = "stddata__[0-9]", x = link_text)]
  run_dates <- gsub(
    pattern = "/", replacement = "",
    x = gsub(pattern = "stddata__", replacement = "", x = run_dirs)
  )
  page_urls <- paste0(runs_url, run_dirs, "ingested_data.html")
  # The 2012_11_02 run is read from a differently named page.
  page_urls[grepl(pattern = "2012_11_02", x = run_dates)] <-
    "http://gdac.broadinstitute.org/runs/stddata__2012_11_02/gdac_counts.html"

  # Worker: download one run page and parse every <table> on it.
  fetch_tables <- function(page_url) {
    rvest::html_table(
      rvest::html_nodes(rvest::read_html(page_url), "table"),
      fill = TRUE, header = TRUE
    )
  }

  run_tables <- list()
  if (length(page_urls) > 0L) {
    n_cores <- parallel::detectCores()
    if (is.na(n_cores)) {
      n_cores <- 1L
    }
    cl <- parallel::makeCluster(min(n_cores, length(page_urls)))
    # Guarantee the worker processes are shut down even if a download fails.
    on.exit(parallel::stopCluster(cl), add = TRUE)
    run_tables <- parallel::parLapply(cl, page_urls, fetch_tables)
  }

  pieces <- vector("list", length(run_tables))
  for (i in seq_along(run_tables)) {
    piece <- .tcga_rows_for_cancer(run_tables[[i]], run_dates[i], cancer)
    if (!is.null(piece)) {
      pieces[[i]] <- piece
    }
  }
  pieces <- Filter(Negate(is.null), pieces)
  cat("\n")

  .tcga_combine_runs(pieces)
}

# Extract the rows for `cancer` from the tables parsed from one run page.
# Returns a data frame whose first column ("cancertime[i]") is the run date,
# or NULL when the page had no table or does not list `cancer`.
.tcga_rows_for_cancer <- function(tables, run_date, cancer) {
  if (length(tables) == 0L) {
    cat("\n")
    cat("data for", run_date, "can not be done", "\n")
    return(NULL)
  }

  # Newer rvest releases return tibbles from html_table(); the base-style
  # indexing below expects a plain data.frame.
  counts <- as.data.frame(tables[[1]], stringsAsFactors = FALSE)
  if (ncol(counts) == 0L || nrow(counts) == 0L) {
    return(NULL)
  }

  # Drop the leading row when its first cell is empty; isTRUE() keeps an NA
  # cell from aborting the whole run.
  if (isTRUE(counts[1, 1] == "")) {
    counts <- counts[-1, , drop = FALSE]
  }

  labels <- counts[[1]]
  keep <- rep(TRUE, length(labels))
  # Summary rows are removed exactly as before: only one of the two kinds is
  # stripped, PANCANCER taking precedence over Totals.
  if ("PANCANCER" %in% labels) {
    keep <- !grepl(pattern = "PANCANCER", x = labels)
  } else if ("Totals" %in% labels) {
    keep <- !grepl(pattern = "Totals", x = labels)
  }

  # which() discards NA comparisons instead of producing all-NA rows.
  hit <- which(keep & labels == cancer)
  if (length(hit) == 0L) {
    return(NULL)
  }

  # The first column is the cancer name itself, so it is dropped.
  out <- cbind(run_date, counts[hit, -1, drop = FALSE])
  names(out)[1] <- "cancertime[i]"
  out
}

# Stack the per-run pieces into one data frame: columns missing from a run are
# filled with 0, the date column comes first and the rest are sorted by name.
.tcga_combine_runs <- function(pieces) {
  date_col <- "cancertime[i]"

  if (length(pieces) == 0L) {
    warning("no run lists data for the requested cancer.", call. = FALSE)
    empty <- data.frame(character(0), stringsAsFactors = FALSE)
    names(empty) <- date_col
    return(empty)
  }

  all_cols <- unique(unlist(lapply(pieces, names), use.names = FALSE))

  # A throw-away first row of numeric zeros fixes the column types the same
  # way the original seed row did (counts end up double, text stays text).
  seed <- data.frame(matrix(0, nrow = 1L, ncol = length(all_cols)))
  names(seed) <- all_cols

  aligned <- lapply(pieces, function(piece) {
    missing_cols <- setdiff(all_cols, names(piece))
    if (length(missing_cols) > 0L) {
      piece[missing_cols] <- 0
    }
    piece
  })

  combined <- do.call(rbind, c(list(seed), aligned))
  combined <- combined[-1L, , drop = FALSE]
  rownames(combined) <- seq_len(nrow(combined))

  other_cols <- sort(setdiff(names(combined), date_col))
  combined[, c(date_col, other_cols), drop = FALSE]
}
yikeshu0611/TCGAFamiliar documentation built on May 21, 2019, 1:45 a.m.