# sandbox/GRAN.R

# Project: repotools
# 
# Author: Renaud Gaujoux
# Created: May 30, 2014
###############################################################################


# Retrieve the GRAN package index as a matrix (as returned by available.packages),
# augmented with repository-dependency (XDepends), lookup-key (XPath) and
# UniqueID fields.
#
# url: contrib URL(s) of the GRAN repository (defaults to GRAN.repos() for the given type/release)
# type: package type (e.g. 'source', 'win.binary')
# release: R release version used to build the contrib path (NA = current)
# fields: extra DESCRIPTION fields to load into the index
# filters: available.packages filters (default keeps all package versions)
# drat.first: when TRUE, 'drat' entries are listed before other entries of the same package
available.GRAN <- function(url = contrib.path(GRAN.repos(), type = type, version = release), type = getOption('pkgType'), release = NA
    , fields = GRAN.fields(all = TRUE) 
    , filters = .PACKAGES_filters_keep_all_versions, ..., drat.first = TRUE){
  
  # ensure the index is refreshed
  contrib_cache_clear(url)
  
  gran <- available.packages(url, fields = fields, filters = c(repotools_gh_rewrite, filters), ...)
  
  # add internal repos depends
  gran <- add_dcf_field(gran, 'XDepends')
  # build the per-package list of repositories to look up dependencies in:
  # the package's own declared XDepends, prepended with the GitHub user's drat
  # repo ('user/~') and plain user repo when a GithubUsername is available
  depends_repos <- apply(gran, 1L, function(p){
        res <- p[['XDepends']]
        res <- if( !is.na(res) ) res
        u <- p[['GithubUsername']]
        if( !is.na(u) ) res <- c(file.path(u, '~'), u, res)
        paste0(unique(res, fromLast = TRUE), collapse = " ")
      })
  # an empty string means no dependency repos
  depends_repos[!nzchar(depends_repos)] <- NA
  gran <- add_dcf_field(gran, 'XDepends', depends_repos, force = TRUE)
  
  # add field XPath
  gran <- add_dcf_field(gran, 'XPath')
  # drat entries are keyed as 'user/package/~repo'; other entries use their GRANPath
  gran <- add_dcf_field(gran, 'XPath'
      , ifelse(gran[, 'GRANType'] %in% 'drat'
          , sprintf("%s/%s/~%s", gran[, 'GithubUsername'], gran[, 'Package'], gran[, 'GithubRepo'])
          , gran[, 'GRANPath'])
      , force = TRUE)
  
  # re-order putting drat repos first if requested
  if( drat.first ){
    # original index
    i0 <- seq(nrow(gran))
    # stable ordering: by package name, drat entries first, original order as tie-breaker
    gran <- gran[order(gran[, 'Package'], -(gran[, 'GRANType'] %in% 'drat'), i0), ]
  }
  
  # add ID fields
  gran <- cbind(gran, UniqueID = apply(gran, 1L, digest))
  
  gran
}

# available.packages() filter that injects GRAN target packages (and their
# GRAN dependencies) into the package database `db`.
#
# The targets are taken from the configuration set via gran_target_config()
# (see with_gran). Returns the merged database, or only the GRAN targets when
# the configuration requests gran.only.
repotools_gran <- function(db){
  
  config <- gran_target_config()
  
  gran.only <- config$gran.only %||% FALSE
  # toggle log messages
  verbose <- config$verbose %||% TRUE
  # locally mask message() to silence all logging below
  if( !verbose ) message <- function(...) NULL
  
  # common return value: either the full db or only the GRAN targets
  result <- function(){
    if( gran.only ) targets
    else db
  }
  
  targets <- config$gran
  message("* Checking GRAN packages ... ", appendLF = FALSE)
  # do nothing if no GRAN target
  if( is.null(config$gran) ){
    message("none")
    return( result() )
  }
  
  # remove GRAN targets from db
  db <- db[!db[, 'Package'] %in% targets[, 'name'], , drop = FALSE]
  
  message(sprintf("yes [%s]", str_out(rownames(targets), total = TRUE)))
  
  # infer index type from first db entry
  m <- str_match(db[1L, 'Repository'], "((src/contrib)|(bin/windows)|(bin/macosx))(/contrib/([0-9.]+))?")
  stopifnot( !is.na(m[, 2L]) )
  type <- .contrib_path2type[m[, 2L]]
  r_release <- m[, 7L]
  
  # retrieve GRAN index: result is ordered by Package, Drat version first, then original order
  message(sprintf("* Fetching GRAN index [%s] ... ", type), appendLF = FALSE)
  gran0 <- gran <- available.GRAN(type = type, release = r_release, drat.first = TRUE)
  # per-type package counts (github/drat/...) for the log message
  breakdown <- summary(factor(na.omit(gran[, 'GRANType']))) + 0
  message(sprintf("OK [%i packages - %s]", nrow(gran), paste0(names(breakdown), ': ', breakdown, collapse = " | ")))
  
  # lookup for target GRAN packages
  # (first match of each target pattern against the XPath keys)
  hit <- sapply(gran_pattern(rownames(targets)), function(x) grep(x, gran[, 'XPath'])[1L])
  targets <- cbind(targets, gran[hit, , drop = FALSE])
  
  if( anyNA(hit) ){
    miss <- which(is.na(hit))
    warning(sprintf("Could not find package%s in GRAN: %s", ifelse(length(miss) > 1, 's', '')
            , str_out(rownames(targets)[miss], Inf))
        , call. = FALSE)
    # remove missing packages from targets
    targets <- targets[-miss, , drop = FALSE]
    hit <- hit[-miss]
  }
  
  # return main db if no GRAN hit
  if( !nrow(targets) ) return( result() )
  
  message("* Found GRAN packages: ", str_out(targets[, 'XPath'], Inf, total = TRUE))
  # remove alternative package versions from GRAN for matched full targets
  hit_full_packages <- targets[which(rownames(targets) == targets[, 'XPath']), 'Package']
  if( length(hit_full_packages) && length(alt_gran <- setdiff(which(gran[, 'Package'] %in% hit_full_packages), hit)) ){
    gran <- gran[-alt_gran, , drop = FALSE]
  }
  
  ## merge with db
  library(plyr)
  available <- rbind.fill.matrix(db, gran)
  # add ID fields
  available <- add_dcf_field(available, 'UniqueID', apply(available, 1L, digest))
  
  # compute GRAN dependencies
  .f <- c('Package', 'Version', 'XPath', 'XDepends', 'GRANType')
  # iteratively expand `pkgs` with its GRAN dependencies until a fixed point is
  # reached, keeping only the latest version of each package
  GRAN_dep <- function(pkgs, available){
    
    gdb <- pkgs
    gdeps <- pkgs
    while(TRUE){
      # list dependencies
      deps <- list.package.dependencies(gdb, all = TRUE, reduce = TRUE)
      m <- match.dependencies(deps, available)
      # look for further GRAN dependencies
      # (matched packages with XDepends that have not been processed yet)
      gdb <- m[!is.na(m[, 'Package']) 
              & !is.na(m[, 'XDepends']) 
              & !m[, 'UniqueID'] %in% gdeps[, 'UniqueID'], , drop = FALSE]
      
      # add up but only keep latest version
      gdeps <- rbind.fill.matrix(gdeps, gdb)
      gdeps <- reduce.dependencies(gdeps)
      
      if( !nrow(gdb) ) break
    }
    
    rownames(gdeps) <- unname(gdeps[, 'Package'])
    gdeps
  }
  
  targets <- GRAN_dep(targets, available)
#    print(gran_pkgs[, .f])
  # merge/force GRAN targets and dependencies into main db
  if( !gran.only && nrow(targets) ){
    
    # remove target GRAN packages from main db
    db <- db[!db[, 'Package'] %in% targets[, 'Package'], , drop = FALSE]
    # merge
    db <- rbind(db, targets[, colnames(db), drop = FALSE])
  }
  
  result()
}

# Parse GRAN path from package specification
# Parse GRAN path from package specification
#
# Turns a character vector of package specifications into a 'gran_spec'
# object. Specifications of the form 'user/repo' or 'user/repo/ref' are
# detected as GRAN (GitHub) targets; plain package names are left untouched.
#
# Returns `pkgs` unchanged when empty or already a 'gran_spec'; otherwise a
# list with class 'gran_spec':
#   - pkgs: package names (GRAN paths replaced by their repo/package name)
#   - gran: matrix of GRAN targets (query/user/repo/ref/name), or NULL when none
as_gran_spec <- function(pkgs){
  
  # pass-through: empty input or already parsed
  if( !length(pkgs) || is(pkgs, 'gran_spec') ) return(pkgs)
  
  stopifnot( is.character(pkgs) &&  is.null(ncol(pkgs)) )
  
  # init result
  pkgs <- unique(pkgs)
  res <- structure(list(pkgs = pkgs, gran = NULL), class = 'gran_spec')
  
  # split GRAN path into user/repo/ref (missing components are NA)
  PKGS <- t(sapply(strsplit(pkgs, "/", fixed = TRUE), function(x) x[1:3] ))
  colnames(PKGS) <- c('user', 'repo', 'ref')
  # BUG FIX: use any() -- the previous length() test was TRUE for any
  # non-empty input, even when no specification contained a GRAN path,
  # making res$gran a misleading 0-row matrix instead of NULL
  if( any(i_gran <- !is.na(PKGS[, 2L])) ){
    
    gran <- PKGS[i_gran, , drop = FALSE]
    # build GRAN path from specs
    rownames(gran) <- GRAN_key(gran)
    gran <- cbind(query = rownames(gran), gran, name = gran[, 'repo'])
    res$gran <- gran
    
    # replace GRAN path with package name
    pkgs[i_gran] <- gran[, 'name']
    res$pkgs <- pkgs
  }
  
  # return
  res
}

# Evaluate an expression with the GRAN machinery enabled.
#
# Sets up the GRAN target configuration (when `pkgs` contains GRAN specs),
# overrides the available.packages filters so that GRAN packages are injected
# by repotools_gran, and enables the RCurl download method -- everything is
# restored on exit.
#
# expr: expression, evaluated in the caller's frame
# pkgs: optional package specifications (see as_gran_spec)
with_gran <- function(expr, pkgs = NULL){
  
  # setup installation targets and package filters if necessary
  pkgs <- as_gran_spec(pkgs)
  if( length(pkgs$gran) ){
    gran_target_config(pkgs)
    on.exit( gran_target_config(NULL) )
  }
  
  # override default package filters
  filters <- list("R_version", "OS_type", "subarch", repotools_gh_rewrite, repotools_gran, "duplicates")
  oo <- options(available_packages_filters = filters)
  on.exit( options(oo), add = TRUE)
  
  # setup RCurl custom download.file method
  if( .setup_rcurl() ) on.exit( .setup_rcurl(TRUE), add = TRUE)
  
  # evaluate the expression in the caller's frame
  # FIX: spell out the full argument name 'envir' -- 'env' relied on
  # partial argument matching
  e <- parent.frame()
  eval(expr, envir = e)
}


gran_target_config <- sVariable()

# Install packages, resolving GRAN specifications (user/repo[/ref]) via GRAN.
#
# Falls back to a plain install.packages() call when no GRAN specification is
# present. Otherwise the installation is run within with_gran(), which injects
# the GRAN index into the available packages database through a filter.
# Returns the result of install.packages(), invisibly.
install_gran <- function(pkgs, lib, repos = getOption('repos'), ...){
  
  # detect gran packages: user/repo/ref
  specs <- as_gran_spec(pkgs)
  if( !length(specs$gran) ) 
    return(install.packages(pkgs, lib = lib, repos = repos, ...))
  
  # replace gran key with package name
  pkgs <- specs$pkgs
  
  # NOTE: the GRAN repo is deliberately NOT appended to `repos` here:
  # the GRAN index is fetched by the repotools_gran filter that
  # with_gran() installs
  
  invisible(with_gran({
        install.packages(pkgs, lib = lib, repos = repos, ...)
      }, pkgs = specs))
}

#' Computes GRAN Package Dependencies
#' 
#' @param pkgs A character vector of package keys, e.g., \code{'renozao/repotools'} or 
#' \code{'renozao/NMF/devel'}.
#' @param repos CRAN-like repositories urls
#' @param ... other parameters passed to \code{\link[utils]{available.packages}}.
#' @param verbose logical that toggles log messages.
#' 
#' @export
#' @examples 
#' 
#' gran_dependencies(c('renozao/NMF/devel', 'renozao/repotools'))
#' 
gran_dependencies <- function(pkgs, repos = getOption('repos'), ..., verbose = FALSE){
  
  # parse specifications and bail out early when none is a GRAN key
  specs <- as_gran_spec(pkgs)
  if( !length(specs$gran) ) return()
  
  # temporarily register the GRAN repository
  old_opts <- options(repos = c(repos, GRAN.repos()))
  on.exit( options(old_opts) )
  
  # restrict the filter to GRAN targets only and propagate verbosity
  specs$gran.only <- TRUE
  specs$verbose <- verbose
  
  # the repotools_gran filter computes and returns the dependency set
  with_gran(available.packages(...), pkgs = specs)
}

# Build a GitHub URL under either the API host or the raw-content host.
#
# type: 'api' -> https://api.github.com, 'raw' -> https://raw.github.com
# ...: path components appended via file.path
gh_api.path <- function(..., type = c('api', 'raw')){
    type <- match.arg(type)
    base <- sprintf('https://%s.github.com', type)
    file.path(base, ...)
}

# Build a GitHub Pages (user.github.io) URL for a repository, with optional
# extra path components.
gh_io.path <- function(user, repo, ...){
    root <- sprintf('http://%s.github.io/%s', user, repo)
    file.path(root, ...)
}

# Shorthand for gh_api.path() targeting the raw-content host.
gh_rawapi.path <- function(...) gh_api.path(..., type = 'raw')

# Build the browse URL of a GitHub repository branch
# (e.g. https://github.com/user/repo/master).
gh_repo_path <- function(user, repo, branch = 'master'){
    paste0("https://github.com/", file.path(user, repo, branch))
}

# Return the URL of the GRAN repository, with optional sub-path components.
#
# BUG FIX: the previous version computed the public URL and then discarded it,
# unconditionally returning a hard-coded localhost development URL instead.
GRAN.repos <- function(...){
    # development override (disabled):
    # file.path('http://localhost/~renaud/projects/repotools/testRepo', ...)
    file.path('http://renozao.github.io/GRAN', ...)
}
# DESCRIPTION/index fields used by GRAN.
#
# named: keep the short aliases as element names
# all: also include the extended index fields (XDepends, XPath, GRANType, GRANPath)
GRAN.fields <- function(named = FALSE, all = FALSE){
    fields <- c(Repo = 'GithubRepo', User = "GithubUsername"
            , Branch = "GithubRef", Forked = 'GithubFork'
            , SHA1 = 'GithubSHA1', RepoType = 'GRANType')
    if( all ){
        fields <- c(fields, XDepends = 'XDepends', Key = 'XPath', Type = 'GRANType', Path = 'GRANPath')
    }
    if( named ) fields else unname(fields)
}

# Retrieve the available GRAN packages.
#
# type: package type (e.g. 'source', 'win.binary')
# fields: extra DESCRIPTION fields to load
# version: optional GithubRef filter; prefix with '!' to invert the selection
GRAN.available <- function(type = getOption('pkgType'), fields = GRAN.fields(all = TRUE), ..., version = NULL){
    if( is.null(version) ){
        p <- available.pkgs(contrib.url2(GRAN.repos(), type = type), fields = fields, ...)
    }else{
        # keep all versions so that the requested ref can be selected
        p <- available.pkgs(contrib.url(GRAN.repos(), type = type), fields = fields, ..., filters = .PACKAGES_filters_keep_all_versions)
        
        # a leading '!' inverts the selection
        invert <- match.fun(ifelse(grepl("^!", version), "!", 'identity'))
        version <- gsub("^!", "", version)   
        sel <- invert(p[, 'GithubRef'] %in% version)
        p <- p[sel, , drop = FALSE]
    }
    # fix for migration of field names
    # BUG FIX: use any() -- length() of the logical vector is TRUE whenever the
    # index is non-empty, even when no GithubRepo entry is actually missing
    if( any(no_repo <- is.na(p[, 'GithubRepo'])) ){
        p[no_repo, 'GithubRepo'] <- p[no_repo, 'Package']  
    }
    
    p
}

# deprecated setup (prior-webhook)
#' @importFrom RCurl getURL
# Deprecated (pre-webhook) GRAN repository setup.
#
# Creates the src/contrib structure under `path`, optionally fetches the
# DESCRIPTION of a hard-coded set of GitHub packages, and (re)generates the
# PACKAGES index files. Returns the path to the PACKAGES file, invisibly.
.GRAN.setup <- function(path = 'GRAN', repos = NULL, cleanup = FALSE, fields = GRAN.fields()){
    
    # create repo src/contrib directory
    repo_dir <- contrib.url(path, type = 'source')
    # cleanup on package directories exit
    # (on.exit() evaluates lazily, so .cleanup may be defined after registration)
    if( cleanup ){
        on.exit( .cleanup(repo_dir) )
        .cleanup <- function(dir){
            unlink(list.dirs(dir, full.names = TRUE, recursive = FALSE), recursive = TRUE)
        }
    }
    if( !is.dir(repo_dir) ) dir.create(repo_dir, recursive = TRUE)
    
    
    
    if( !is.null(repos) ){
        # NOTE(review): the `repos` argument is immediately overwritten by this
        # hard-coded list -- looks like leftover development code; confirm intent
        repos <- cbind('renozao', c('RcppOctave', 'dbutils', 'repotools', 'CLIR', 'pkgmaker'))
        colnames(repos) <- c('user', 'project')
        
        .fields <- .repo_fields(fields)
        # fetch DESCRIPTION for each package
        raw_desc <- gh_read.dcf(repos, fields = .fields, raw = TRUE)
        
        
        sapply(raw_desc, function(desc){
                    # skip missing packages
                    if( is.na(desc) ) return()
                    
                    # read DESCRIPTION
                    con <- textConnection(desc)
                    on.exit( close(con) )
                    res <- setNames(read.dcf(con)[1L, ][.fields], .fields)
                    
                    # write DESCRIPTION
                    if( !is.dir(pdir <- file.path(repo_dir, res['Package'])) ) dir.create(pdir)
                    cat(desc, file = file.path(pdir, 'DESCRIPTION'))
                })
    }
    
    # write PACKAGES files
    create_repo(path, verbose = TRUE)
    write_PACKAGES(repo_dir, unpacked = TRUE, fields = fields, latestOnly = FALSE)
    # return path to file 
    invisible(file.path(repo_dir, 'PACKAGES'))
}

# Fetch the raw content of a file from a GitHub repository.
#
# user/repo: repository coordinates; ref: branch, tag or commit
# ...: path components of the target file
# Returns the file content as a character vector of lines.
gh_getRaw <- function(user, repo, ..., ref = 'master'){
    con <- curl(gh_rawapi.path(user, repo, ref, ...))
    on.exit( close(con) )
    readLines(con)
}

#gh_getLastCommit <- function(){
#    
#}

# Standard repository DESCRIPTION fields, augmented with the given extra fields.
.repo_fields <- function(fields){
    std <- ns_get('.get_standard_repository_db_fields', 'tools')('source')
    unique(c(std, fields))
}

#' @import stringr
# Fetch and parse the DESCRIPTION file of one or more GitHub repositories.
#
# repos: two-column matrix (user, repo) or character vector of 'user/repo' paths
# user: optional user name overriding the first column of `repos`
# fields: extra DESCRIPTION fields to extract (added to the standard ones)
# raw: return the raw DESCRIPTION text instead of the parsed fields
# Returns a list of raw texts (raw = TRUE) or a matrix of parsed fields,
# indexed by repository name; missing packages yield NA entries.
gh_read.dcf <- function(repos, user = NULL, fields = NULL, raw = FALSE){
    
    # accept a vector of 'user/repo' paths
    if( is.vector(repos) ){
        repos <- cbind(dirname(repos), basename(repos))
    }
    if( !is.null(user) ) repos[, 1L] <- user
    
    fields <- .repo_fields(fields)
    .fetch_desc <- function(x){
        
        message("* Fetching package ", x[1], ":", x[2], " ... ", appendLF = FALSE)
        desc <- gh_getRaw(x[1], x[2], 'DESCRIPTION')
        # not found? => return dummy vector
        # BUG FIX: desc is a character vector of lines -- wrap grepl() in any()
        # so the condition stays scalar (if() on a length > 1 condition is an
        # error in recent R versions)
        if( any(grepl("^not found", desc, ignore.case = TRUE)) ){
            message("[ERROR: not found]")
            if( raw ) return(as.character(NA))
            return( setNames(rep(as.character(NA), length(fields)), fields) )
        }
        
        # read DESCRIPTION
        con <- textConnection(desc)
        on.exit( close(con) )
        res <- setNames(read.dcf(con)[1L, ][fields], fields)
        
        message("[OK - ", res['Version'], "]")
        if( raw ) desc else res
    }
    
    res <- apply(repos, 1L, .fetch_desc)
    
    # name/row-label the result by repository name
    if( raw ) res <- setNames(res, repos[, 2L])
    else{
        res <- t(res)
        rownames(res) <- repos[, 2L]
    }
    res
}

# Compute the MD5 hash of every file under the directory(ies) `x`.
#
# x: path(s) to existing directories (non-existing entries are dropped)
# strip: prefix stripped from the hash names; 1L means the parent directory
#   of `x`; NA disables stripping; defaults to `x` itself
# skip: optional regular expressions; matching files are removed from the result
# Returns a named character vector of MD5 hashes, excluding any bundled 'MD5' file.
md5hash <- function(x, strip = x, skip = NULL){
    x <- x[file.exists(x)]
    
    if( !length(x) ) return( character() )
    
    x <- normalizePath(x)
    # compute hash list of source directory
    hash <- md5sum(dir(x, recursive = TRUE, full.names = TRUE))
    
    # process
    # drop the MD5 checksum file itself
    hash <- hash[basename(names(hash)) != 'MD5']
    if( !is_NA(strip) ){
        if( identical(strip, 1L) ) strip <- dirname(x)
        else strip <- normalizePath(strip) 
        # NOTE(review): gsub() only uses the first element when strip is a
        # vector (i.e. when x has length > 1) -- confirm this is intended
        names(hash) <- gsub(strip, '', names(hash), fixed = TRUE)
    }
    # drop any leading slash left over from stripping
    names(hash) <- gsub('^/', '', names(hash))
    
    # skip requested patterns
    if( !is.null(skip) ){
        for(s in skip){
            hash <- hash[grep(s, names(hash), invert = TRUE)]
        }
    }
    # return
    hash
}

# Build GRAN key strings ('user/repo[/ref[/path]]').
#
# Accepts either a single matrix/data.frame (one key per row), a single
# character vector (components joined with '/'), or several arguments passed
# on to paste0(). Trailing '/NA' components are stripped from each key.
GRAN_key <- function(...){
    
    make_key <- function(...){
        key <- paste0(..., collapse = "/")
        # drop trailing missing components
        gsub("(/NA)*$", "", key)
    }
    
    args <- list(...)
    if( length(args) == 1L ){
        x <- args[[1L]]
        if( !is.null(nrow(x)) ){
            # matrix/data.frame input: one key per row
            if( length(grep("^Github", colnames(x))) ){
                # normalise to the canonical Github columns (missing ones become NA)
                .f <- c('GithubUsername', 'GithubRepo', 'GithubRef', 'GithubPath')
                x <- rbind.fill.matrix(x, matrix(NA, 0, length(.f), dimnames = list(NULL, .f)))
                x <- x[, .f, drop = FALSE]
            }
            return( apply(x, 1L, make_key) )
        }
        stopifnot( is.character(x) )
    }
    do.call(make_key, args)
}

# Update the GRAN repository under `outdir` from the source tree `src`.
#
# src: source directory holding the GitHub/drat package data
# outdir: output repository directory (initialised/cleaned when necessary)
# clean: re-initialise the repository structure from scratch
# fields: DESCRIPTION fields written to the PACKAGES index
# update: controls which package sources are refreshed (NA/TRUE/FALSE)
# actions: subset of c('changes', 'PACKAGES', 'index') to perform
# verbose: toggles log messages
GRAN.update <- function(src, outdir = dirname(normalizePath(src)), clean = FALSE, force = FALSE, fields = GRAN.fields(), update = NA, actions = c('changes', 'PACKAGES', 'index'), verbose = TRUE){
     
    # dump messages if non-verbose
    if( !verbose ) message <- function(...) NULL
    
    message(sprintf("* Updating GRAN packages in %s [source: %s]", outdir, src))
    
    # initialise complete repository structure if necessary
    if( !is.dir(outdir) || clean ) create_repo(outdir, pkgs = NA)
    
    # update GitHub source package
    update_gh <- update
    if( isTRUE(update_gh) ) update_gh <- c('Github', 'drat')
    contrib_github <- GRAN.update_github(src, update = update_gh)
     
    # update Drat packages
    DATA <- GRAN.update_drat(file.path(src, 'drat'), update = update)
    
    # match type of action to perform
    actions <- match.arg(actions, several.ok = TRUE)
    
    # update PACKAGES files
    if( 'PACKAGES' %in% actions ){
        
        # make PACKAGES in output repos for built packages
        create_repo(outdir, verbose = TRUE)
        
        ## add Github packages to those in src/contrib
        gh_P <- read.dcf(file.path(contrib_github, 'PACKAGES'))
        # fix for migration of field names
        gh_P <- add_dcf_field(gh_P, 'GithubRepo', gh_P[, 'Package'])
        gh_P <- add_dcf_field(gh_P, 'GRANType', 'github')
        gh_P <- cbind(gh_P, pkgType = 'source', R_release = NA)
        
        # prepend to drat packages
        library(plyr)
        PACKAGES <- rbind.fill(data.frame(gh_P, stringsAsFactors = FALSE), DATA$PACKAGES)
        PACKAGES <- add_dcf_field(PACKAGES, 'GRANPath', GRAN_key(PACKAGES))
        
        # Filter out bad packages
        # remove bad packages
        bad <- bad_version(PACKAGES[, 'Version'])
        if( any(bad) ){
            message(sprintf("* Removing packages with invalid versions [%s])"
                            , str_out(setNames(PACKAGES[bad, 'Version'], PACKAGES[bad, 'Package']), use.names = TRUE, total = TRUE)
                            )
            )
        }
        PACKAGES <- PACKAGES[!bad, , drop = FALSE]
        
        # combined GitHub + Drat repos
        write_GRAN_repo('pkgType', function(P){
            msg <- sprintf('  * Repo all :%i ', nrow(P))
            list(PACKAGES = P, path = outdir, msg = msg)
        }, PACKAGES = PACKAGES, append = TRUE, no.dups = FALSE)
                
    }
    
    # generate index file
    if( 'index' %in% actions ){
        message("* Generating index file:")
        write_PACKAGES_index(outdir, title = 'GRAN: Github R Archive Network')
    }
    
}

# Fetch package data from GitHub and update associated unpacked sub-directory
# Fetch the repositories of a GitHub user and refresh the corresponding
# unpacked package sub-directories under `dir`.
#
# user: GitHub user name
# dir: base directory of the unpacked package tree
# repos: optional priority list of repository names to process
# repos_data: previously indexed repository data (enables incremental update)
# packages: existing package db, used as a DESCRIPTION cache keyed by GRAN key
# all.R: also process all the user's R-language repositories
# Returns the updated user index (.USER_INDEX), which is also written to the
# USER file on exit.
update_github_user <- function(user, dir, repos = NULL, repos_data = NULL, packages = NULL, all.R = FALSE){
    
    
    if( !is.null(repos) ) messagef(": %s", str_out(repos, 4, total = length(repos) > 4), appendLF = FALSE)
    # clean up priority repos
    repos <- repos[!is.na(repos)]
    
    # fetch all the users repositories
    user_repos <- gh_user_repo(user)
    
    if( !length(user_repos) ){
        message(" ... SKIP [user has no repo]")
        return()
    }
    
    user_repos0 <- user_repos
    
    # build user repos index
    user_url <- dirname(gsub("^https?://", '', user_repos[[1L]]$html_url))
    user <- user_repos[[1L]]$owner$login # to ensure the case is correct
    udir <- file.path(dir, user_url)
    loc_user_file <- file.path(udir, 'USER')
    # NOTE(review): this file.exists() result is immediately overwritten by the
    # next assignment -- confirm which definition of 'is_updating' is intended
    is_updating <- file.exists(loc_user_file)
    is_updating <- nrow(repos_data) %||% 0
    fields <- c('name', 'full_name', 'html_url', 'fork', grep('_at$', names(user_repos0[[1L]]), value = TRUE), 'language')
    .USER_INDEX <- cbind(user = user
            , ldply(user_repos0, .id = NULL, function(x) as.data.frame(x[fields], stringsAsFactors = FALSE))
                , indexed_at = NA
                , refs_sha = NA
                , desc_sha = NA
            , stringsAsFactors = FALSE)
    if( is_updating ){
#        luser <- read.repos(file = loc_user_file)
        luser <- as.matrix(repos_data)
        # fix for backward compatiblility
        colnames(luser)[colnames(luser) == 'repo'] <- 'name'
        luser <- set_column(luser, 'indexed_at', luser[, 'pushed_at'])
        luser <- rbind.fill.matrix(luser, .USER_INDEX[0, , drop = FALSE])
        luser <- luser[!is.na(luser[ , 'name']), , drop = FALSE]
        # force in previous indexing data
        inm <- match(luser[, 'name'], .USER_INDEX$name, nomatch = 0L)
        .f <- c('indexed_at', 'refs_sha', 'desc_sha')
        .USER_INDEX[inm, .f] <- luser[inm > 0, .f]
    }
    # set rownames
    rownames(.USER_INDEX) <- .USER_INDEX[['name']]
    
    # save repos data on.exit
    # updates the enclosing .USER_INDEX (via <<-) with the indexing
    # timestamp and the ref/DESCRIPTION SHA summaries of a processed repo
    .update_repo_index <- function(data, content){
        .USER_INDEX[data$name, 'indexed_at'] <<- data[['pushed_at']]
        if( is.matrix(content) ){
            SHA <- paste0(na.omit(content[2, ]), collapse = ' ')
            if( !nzchar(SHA) ) SHA <- NA
            .USER_INDEX[data$name, 'refs_sha'] <<- unname(SHA)
            desc <- paste0(na.omit(content[1, ]), collapse = ' ')
            if( !nzchar(desc) ) desc <- NA
            .USER_INDEX[data$name, 'desc_sha'] <<- unname(desc)
        }
        
    }

    # persist the user index; registered on.exit so it runs even on error
    .save_user_data <- function(file = loc_user_file){
        dir.create(dirname(file), recursive = TRUE, showWarnings = FALSE)
        write.repos(.USER_INDEX, file = file)
    }
    on.exit( .save_user_data(), add = TRUE)
    
    force <- FALSE
#    if( user != 'renozao' ) force <- TRUE
    
    # only check repos that were pushed after last indexing
    diffs <- .USER_INDEX$pushed_at !=  .USER_INDEX$indexed_at
    changed <- .USER_INDEX$name[which( is.na(diffs) | diffs )]
    if( !force ) user_repos <- user_repos[names(user_repos) %in% changed]
    
    if( !length(user_repos) ){
        message(sprintf(" ... SKIP [not changed - %s repos]", length(user_repos0)))
        return()
    }
    
    # only process given repos vector and all the user's R repos (if requested or first time indexing)
    user_R_repos <- which(sapply(user_repos, function(x) identical(x$language, 'R')))
    if( force || all.R || !is_updating ){
        repos <- c(repos, names(user_R_repos))
    }
    m <- match(names(user_repos), unique(repos))
    user_repos <- user_repos[!is.na(m)]
#    user_repos <- user_repos[order(m)]

    if( !length(user_repos) ){
        message(sprintf(" ... SKIP [not changed - %s R / %s changed / %s repos]", length(user_R_repos), length(user_repos), length(user_repos0)))
        return()
    }
    
    if( is_updating ) message(sprintf(" (updating %s/%s):", length(user_repos), length(user_repos0)))
    else message(sprintf(" (indexing %s repos):", length(user_repos)))
    
    # process each repository: refresh the DESCRIPTION (and src/ flag) of its
    # default branch and of any 'devel*' branches
    res <- sapply(user_repos, function(rdata){
        
        # create base directory
        path <- gsub("^https?://", '', rdata$html_url)
        pdir <- file.path(dir, path)
        repo <- rdata$name
        RDATA <- .USER_INDEX[repo, , drop = TRUE]
        
        # last commit
        # parse 'ref@sha' pairs into a named vector (name = ref, value = sha)
        if( !is_NA(COMMIT_SHA0 <- RDATA[['refs_sha']]) ){
            m <- str_match(COMMIT_SHA0, "([^@]+)@([^ ]+)")
            COMMIT_SHA0 <- setNames(m[, 3], m[, 2])
        }
        # DESCRIPITON file SHA
        if( !is_NA(DESC_SHA0 <- RDATA[['desc_sha']]) ){
            m <- str_match(DESC_SHA0, "([^@]+)@([^ ]+)")
            DESC_SHA0 <- setNames(m[, 3], m[, 2])
        }
        
        message("  ** ", repo, appendLF = FALSE)
                
        # process all suitable branches: default branch, devel*
        refs <- gh_repo_head(user, rdata$name)
        res <- NULL
        if( !length(refs) ){
            message(" ... OK [empty]")
            
        }else{
            refs_todo <- unique(c(rdata$default_branch, grep("^devel[^/]*$", names(refs), value = TRUE)))
            if( length(refs_todo) > 1L ) message(":")
            
            # each branch returns c(DESC_SHA, COMMIT_SHA) -- sapply builds a
            # 2-row matrix consumed by .update_repo_index()
            res <- sapply(refs_todo, function(ref){
                
                # log branch name
                if( length(refs_todo) > 1L ){
                    message("     - ", ref, "/ ... ", appendLF = FALSE)
                }else message("/", ref, "/ ... ", appendLF = FALSE)
                
                refobject <- refs[[ref]] 
                SHA <- refobject$commit$sha
                
                # not sure how this can happen, but it does happens (e.g., wch/r-source/trunk)
                if( is.null(SHA) ){
                    message('SKIP [invalid ref]')
                    return(c(NA, NA))   
                }
                
                # ref sub-directory
                refdir <- file.path(pdir, 'tarball', ref, repo)
                added_flag <- if( is.na(RDATA[['indexed_at']]) ) '+' else ''
                # skip packages processed by the webhook
#                flag_file <- file.path(refdir, 'SHA1')
#                if( dir.exists(refdir) && !file.exists(flag_file) ){
#                    message('SKIP [in webhook]')
#                    return(0L)
#                }
                
                # load/check last local commit SHA
                desc_file <- list.files(refdir, recursive = TRUE, full.names = TRUE, pattern = '^DESCRIPTION$')
                stopifnot( length(desc_file) %in% c(0, 1) )
                # SHAs
                SHA0 <- COMMIT_SHA0[ref]
                SHA0_D <- DESC_SHA0[ref]
                DESC_SHA <- COMMIT_SHA <- sprintf("%s@%s", ref, SHA)
                if( !is.na(SHA0_D) ) DESC_SHA <- sprintf("%s@%s", ref, SHA0_D)
                # fall back on the SHA recorded in the local DESCRIPTION file
                if( is_NA(SHA0) && length(desc_file) ){
                    dcf <- read.dcf(desc_file)
                    dcf <- add_dcf_field(dcf, 'GithubSHA1')
                    SHA0 <- dcf[, 'GithubSHA1']
                }
                
                # skip whole repo if no changes
                if( SHA %in% SHA0 && !force ){
                    message('OK [no changes]')
                    return(c(DESC_SHA, COMMIT_SHA))
                }
                
                # fetch content of master branch
                cnt <- gh_get_content(user, repo, ref = ref)
                if( is.null(d <- cnt$DESCRIPTION) ){
                    
                    # test pkg/ sub-directory (e.g., rforge-like tree structure)
                    if( !is.null(cnt$pkg) && cnt$pkg$type == 'dir' ){
                        cnt <- gh_get_content(user, repo, ref = ref, path = 'pkg')
                    }
                    
                    if( is.null(d <- cnt$DESCRIPTION) ){
                        message(added_flag, 'SKIP [no package]')
                        return(c(NA, COMMIT_SHA))
                    }
                }
                                
                # append package subdirectory
                pkg_subdir <- dirname(cnt$DESCRIPTION$path)
                if( pkg_subdir != '.' ){
                    refdir <- file.path(refdir, paste0('!', pkg_subdir))
                    message(added_flag, file.path(pkg_subdir, ''), appendLF = FALSE)
                    added_flag <- ''
                }
                
                
                # Build Github data part
                GH_DATA <- cbind(GithubRepo = repo
                        , GithubUsername = user
                        , GithubRef = ref
                        , GithubSHA1 = SHA
                        , GithubFork = ifelse(rdata$fork, 'yes', 'no')
                        , GithubPath = if( pkg_subdir != '.' ) pkg_subdir else NA)
                
                # download DESCRIPTION file only if necessary
                SHA_D <- cnt$DESCRIPTION$sha
                # load data from packages db if present
                dcf <- if( i_pkg <- match(GRAN_key(GH_DATA), rownames(packages), nomatch = 0L) ){
                    packages[i_pkg, , drop = FALSE] 
                }
                # fetch update if necessary
                if( !SHA_D %in% SHA0_D || is.null(dcf) ){
                    
                    message(added_flag, "DESCRIPTION", appendLF = FALSE)
                    tmp_desc <- file.path(tempdir(), paste0('DESCRIPTION_', DESC_SHA))
                    on.exit( unlink(tmp_desc), add = TRUE)
                    download.file(d$download_url, dest = tmp_desc, quiet = TRUE)
                    # add fields
                    dcf <- try(read.dcf(tmp_desc), silent = TRUE)
                    if( is(dcf, 'try-error') ){
                        message(sprintf('[error: %s]', dcf))
                        return(c(DESC_SHA, COMMIT_SHA))
                    }
                }else{
                    message("CACHE", appendLF = FALSE)
                }
                message(sprintf("[%s] ", substr(SHA_D, 1, 7)), appendLF = FALSE)
                DESC_SHA <- sprintf("%s@%s", ref, SHA_D)
                
                # add/replace up-to-date Github data
                dcf <- cbind(dcf[, setdiff(colnames(dcf), colnames(GH_DATA)), drop = FALSE], GH_DATA)
                #
                
                # create ref sub-directory
                dir.create(refdir, recursive = TRUE, showWarnings = FALSE)
                
                # process src/ sub-directory
                has_src <- !is.null(cnt$src) && cnt$src$type == 'dir'
                refdir_src <- file.path(refdir, 'src')
                if( has_src ){
                    message('src ', appendLF = FALSE)
                    dir.create(refdir_src, recursive = TRUE, showWarnings = FALSE)
                }else unlink(refdir_src)
                dcf <- add_dcf_field(dcf, 'NeedsCompilation', .yesno(has_src), force = TRUE)
                #
                
                # update DESCRIPTION file
                write.dcf(dcf, file.path(refdir, 'DESCRIPTION'))
                
                message()
                return(c(DESC_SHA, COMMIT_SHA))
            })
        }
        
        # update indexed data
        .update_repo_index(rdata, res)
        
        # NOTE(review): res holds 'ref@sha' strings here, so '> 0L' is a
        # lexicographic character comparison against "0" -- confirm intent
        res[res > 0L]
    }, simplify = FALSE)
    
#    res <- res[lengths(res) > 0L]
#    res
    .USER_INDEX
}

# Convert a list of GitHub repository records into a repository matrix.
#
# source: label stored in the 'source' column (e.g. 'Github', 'drat')
# obj: list of repository records as returned by the GitHub API
# With no argument, returns an empty template matrix (used to initialise
# results); with an empty `obj`, returns NULL. Duplicated repositories
# (same full_name) are dropped.
repo_matrix <- function(source, obj){
    
    fields <- c('source', 'user', 'name', 'full_name', 'html_url', 'fork', 'created_at', 'updated_at', 'pushed_at', 'language')
    template <- matrix(NA, 0, length(fields), dimnames = list(NULL, fields))
    
    # no argument: empty template
    if( !nargs() ) return(template)
    if( !length(obj) ) return()
    
    # one row per repository: owner login + the remaining fields
    rows <- sapply(obj, function(x){
        base <- c(user = x$owner$login)
        c(base, unlist(x[setdiff(fields, c('source', names(base)))]))
    })
    res <- cbind(source = source, t(rows))
    rownames(res) <- res[, 'full_name']
    # drop duplicated repositories
    res[!duplicated(rownames(res)), , drop = FALSE]
}

# Read a tab-separated repository table.
#
# When `file` is not given, it is derived from `dir`: the global REPOS index,
# or a per-user USER file under src/contrib/github.com when `repos` is given.
read.repos <- function(dir, repos = NULL, file = NULL){
    
    if( is.null(file) ){
        base <- file.path(dir, 'repos/github')
        if( is.null(repos) ) file <- file.path(base, 'REPOS')
        else file <- file.path(base, 'src/contrib/github.com', repos, 'USER')
    }
    read.table(file, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
}

# Write a repository table as tab-separated values, without row names.
# Extra arguments are accepted for interface symmetry but currently unused.
write.repos <- function(x, file, ...){
    utils::write.table(x, file = file, sep = "\t", row.names = FALSE)
}

# Build an index of GitHub-hosted R packages from three sources:
# recently-pushed R repositories (GitHub search), CRAN packages with a GitHub
# BugReports URL, and forks of the 'drat' repository.
#
# Rsince: only consider GitHub R repositories pushed on/after this date
#   (NULL disables the GitHub search source)
# skip: regular expressions of 'user/repo' entries to exclude
# Returns a repository matrix (see repo_matrix), one row per unique repo.
list.github.packages <- function(Rsince = Sys.Date(), skip = c('cran/.*', 'Bioconductor-mirror/.*', 'rforge/.*')){
    
    # init result
    res <- repo_matrix()
    
    # From R repositories on Github if a since date is passed
    if( !is.null(Rsince) ){
        message("* Checking Github R repositories pushed since ", Rsince, " ... ", appendLF = FALSE)
        r_repos <- gh_search_Rrepos(paste0("pushed:>=", Rsince))
        r_lang <- repo_matrix('Github', r_repos$content$items)
        message(sprintf("OK [%s repos]", nrow(r_lang)))
        res <- rbind(res, r_lang)
    }
    
    # From CRAN
    message("* Checking Github package on CRAN ... ", appendLF = FALSE)
    # cache the CRAN package db in the session temp directory
    tmp <- file.path(tempdir(), 'packages.rds')
    if( !file.exists(tmp) ) download.file("http://cran.r-project.org/web/packages/packages.rds", dest = tmp, quiet = TRUE)
    CRAN <- readRDS(tmp)
    # select packages with GitHub BugReports
    m <- str_match(CRAN[, 'BugReports'], "://(github\\.com/([^/]+)/([^/]+))/issues")
    i_gh <- which(!is.na(m[, 1L]))
    m <- cbind(source = 'CRAN', m[i_gh, 3:4, drop = FALSE])
    colnames(m) <- c('source', 'user', 'name')
    message(sprintf("OK [%i/%i packages | %s users]", nrow(m), nrow(CRAN), length(unique(m[, 'user']))))
    res <- plyr::rbind.fill.matrix(res, m)
    
    # From drat forks
    message("* Checking drat forks on Github ... ", appendLF = FALSE)
    drat <- fetch_drat_forks()
    drat <- repo_matrix('drat', drat)
    # force language R for drat repos
    drat[, 'language'] <- 'R'
    drat_users <- unique(drat[, 'user'])
    message(sprintf("OK [%s users]", length(drat_users)))
    res <- rbind(res, drat)
    
    # remove duplicated entries
    # (CRAN-sourced rows have no full_name: rebuild it as user/name)
    rn <- res[, 'full_name']
    i <- is.na(rn) 
    res[i, 'full_name'] <- rn[i] <- file.path(res[i, 'user'], res[i, 'name'])
    rownames(res) <- rn
    res <- res[!duplicated(rownames(res)), , drop = FALSE]
    
    # skip some repos if requested
    if( !is.null(skip) ){
        i_skip <- unlist(lapply(skip, grep, rownames(res)))
        if( length(i_skip) ) res <- res[-i_skip, ]
        message(sprintf("* Skipping %i repos: %s", length(i_skip), str_out(skip, Inf)))
    }
    
    # total
    message(sprintf("* Number of unique Github users: %s", length(unique(res[, 'user']))))
    res
}

list.changed.packages <- function(dir){
    # Detect package-level changes in the git working tree containing `dir`:
    # a new package is flagged by an untracked DESCRIPTION file, while a
    # changed one is flagged by edits under DESCRIPTION or src/.
    list.changed.files(dir
        , pattern.new = '^DESCRIPTION$'
        , pattern.changed = "/((DESCRIPTION)|(src))$")
}

list.changed.user <- function(dir){
    # Detect new/changed per-user USER index files in the git working tree
    # containing `dir`; basename = TRUE keeps file paths rather than their
    # parent directories.
    list.changed.files(dir
        , pattern.new = '/USER$'
        , pattern.changed = "/USER$"
        , basename = TRUE)
}

# List new, modified and deleted files in the git working tree that
# contains `dir`, filtered by filename patterns.
#
# pattern.new: regexp for files that flag a new entry; matched against the
#   files found under each untracked path (or directly against the
#   untracked entries when basename = TRUE).
# pattern.changed: regexp applied to the paths of modified/deleted files.
# basename: if TRUE, untracked entries are matched as-is instead of being
#   expanded with list.files().
#
# Returns list(Changed, New, Deleted) of unique paths relative to the
# repository root (parent directories when basename = FALSE), with the
# root directory attached as attribute 'root'.
list.changed.files <- function(dir, pattern.new, pattern.changed, basename = FALSE){
    library(git2r)
    .git <- discover_repository(dir)
    basedir <- dirname(.git)
    repo <- repository(.git)
    st <- status(repo)
    
    # expand an untracked path into the files below it matching pattern.new
    .list <- function(obj){
        lapply(obj, function(x){
                p <- file.path(basedir, x)
                list.files(p, recursive = TRUE, full.names = TRUE, pattern = pattern.new)
            })
    }
    # NOTE(review): st$unstaged/st$untracked come from git2r::status(); grep()
    # coerces them to character -- confirm against the installed git2r version's
    # return structure.
    res <- list(
        Changed = grep(pattern.changed, st$unstaged[names(st$unstaged) == 'modified'], value = TRUE)
        , New = if( !basename ).list(st$untracked) else grep(pattern.new, st$untracked, value = TRUE)
        , Deleted =  grep(pattern.changed, st$unstaged[names(st$unstaged) == 'deleted'], value = TRUE)
    )
    
    # clean up
    # flatten, map files to their parent directory (unless basename = TRUE),
    # strip the repository root prefix and collapse doubled slashes
    .clean <- function(x){
        x <- unlist(x, use.names = FALSE)
        if( !basename ) x <- dirname(x)
        x <- sub(sprintf("^%s/", basedir), '', x)
        unique(gsub("//", "/", x, fixed = TRUE))
    }
    res <- sapply(res, .clean, simplify = FALSE)
    
    attr(res, 'root') <- basedir
    res
}


# Pull updated Github package sources into `dir`.
#
# Workflow:
#   * reads the optional YAML file `dir/config` (tracks `pulled_at`),
#   * lists Github repos changed since the last pull (list.github.packages),
#   * optionally restricts to the given `sources` values,
#   * skips R repos whose indexed_at already equals pushed_at,
#   * per user, calls update_github_user() with the repos to refresh.
#
# Returns the number of rows of the rebuilt per-user index (0L on early
# exit). Side effects: rewrites `dir/config`, saves the gh context, and
# rewrites the `dir/REPOS` index when its content hash changed. All three
# run via on.exit(add = TRUE) handlers, i.e. in registration order.
pull_github_packages <- function(dir, sources = NULL){
    
    
    message("* Pullling Github package into ", dir)
    
    # look for a config file
    CONFIG <- list()
    if( file.exists(config_file <- file.path(dir, 'config')) ){
        CONFIG <- yaml::yaml.load_file(config_file)
    }
    # PULL_TIME is set (via <<- in result.ok) only on successful completion,
    # so `pulled_at` is only advanced when the pull actually went through
    PULL_TIME <- NULL
    on.exit({
        if( !is.null(PULL_TIME) ) CONFIG$pulled_at <- as.character(PULL_TIME) 
        cat(yaml::as.yaml(CONFIG), file = config_file)
    }, add = TRUE)
    ##
    
    result.ok <- function(res){
        PULL_TIME <<- Sys.time()
        res
    } 
    
    # fetch R repos pushed since last pulled date
    # first-ever run defaults to today's date
    since <- CONFIG$pulled_at %||% Sys.Date()
    gh_packages <- list.github.packages(as.character(as.Date(since)))
    
    if( length(sources) )
        gh_packages <- gh_packages[gh_packages[, 'source'] %in% sources, , drop = FALSE]
    
    if( !nrow(gh_packages) ) return(result.ok(0L))
    
    # update each unpacked sub-directory
    on.exit(gh_context_save(), add = TRUE)
    library(plyr)
    
    # load previous index and save new on exit
    # (note: index_file is assigned here even when the file does not exist)
    USER <- NULL
    if( file.exists(index_file <- file.path(dir, "REPOS")) ){
        message("* Loading previous repo index ... ", appendLF = FALSE)
        USER <- read.repos(file = index_file)                
        index <- as.matrix(USER[!duplicated(USER[, 'full_name']), , drop = FALSE])
        message(sprintf(" OK [%i refs | %i repos]", nrow(USER), nrow(index)))
        
        # add data from previous runs 
        i <- match(gh_packages[, 'full_name'], index[, 'full_name'])
        # pick Github data from index file for CRAN packages
        # (CRAN-sourced rows carry no Github metadata of their own)
        i_CRAN <- gh_packages[, 'source'] == 'CRAN'
        .f <- intersect(colnames(gh_packages), colnames(index))
        gh_packages[i_CRAN, .f] <- index[i[i_CRAN], .f]
        # add indexed date
        gh_packages <- cbind(gh_packages, indexed_at = index[i, 'indexed_at'])
        
        # limit check to repos that are either not R or with index date not up to date
        message("* Filtering up-to-date repos ... ", appendLF = FALSE)
        g <- as.data.frame(gh_packages, stringsAsFactors = FALSE)
        # blank out NAs so the subset() comparisons below never yield NA
        g[is.na(g)] <- ''
        to_update <- subset(g, (language == 'R' & pushed_at != indexed_at) | language != 'R' )
        message(sprintf(" OK [%i/%i changed]", nrow(to_update), nrow(gh_packages)))
        gh_packages <- as.matrix(to_update)
    }
    
    # load previous PACKAGES data, keyed for per-user lookup below
    PACKS <- NULL
    if( file.exists(packages_file <- file.path(dir, 'packages.rds')) ){
        PACKS <- readRDS(packages_file)
        rownames(PACKS) <- GRAN_key(PACKS)
    }
    
    if( !nrow(gh_packages) ) return(result.ok(0L))
    
    # hash the loaded index so the exit handler can detect actual changes
    USER_hash <- digest(USER)
    on.exit({
        message("* Checking changes in repo indexes ")
        user_path <- list.files(dir, recursive = TRUE, full.names = TRUE, pattern = "^USER$")
        uname <- basename(dirname(user_path))
        # remove users for which there are updated data 
        USER <- USER[!USER[, 'user'] %in% uname, , drop = FALSE]
        USER_NEW <- ldply(user_path, function(x) read.repos(file = x))
        USER <- rbind.fill(USER, USER_NEW)
        # drop old columns
        USER$refs <- NULL
        USER$commit_sha <- NULL
        # order by full_name
        USER <- USER[order(USER[, 'full_name']), , drop = FALSE]
        
        if( digest(USER) != USER_hash ){
            message("* Updating repo index ")
            # write file
            write.repos(USER, index_file)
            
        }
            
    }, add = TRUE)
    
    # update Github unpacked packages for each user
    # count unique (source, user) pairs per source, for the log line only
    sum <- summary(factor(gh_packages[!duplicated(paste(gh_packages[, 'source'], gh_packages[, 'user'])), 'source']))
    message("* Processing repos: ", str_out(sum, Inf, use.names = TRUE))
    with_rcurl({
        iuser <- split(seq(nrow(gh_packages)), gh_packages[, 'user'])
#        iuser <- head(iuser, 3L)
        USER_NEW <- ldply(iuser, function(i){
            data <- gh_packages[i, , drop = FALSE]
            user <- data[1L, 'user']
            message("* Checking ", user, appendLF = FALSE)
            update_github_user(user, dir = dir, repos = data[, 'name']
                                , repos_data = USER[USER[, 'user'] == user, , drop = FALSE]
                                , packages = PACKS[PACKS[, 'GithubUsername'] == user, , drop = FALSE])
        }, .id = NULL)
        
        # Update USER table if necessary
#        if( nrow(USER_NEW) ){
#            # remove data from updated repos
#            if( !is.null(USER) ){
#                USER <- USER[!USER[, 'full_name'] %in% unique(USER_NEW[, 'full_name']), , drop = FALSE]
#            }
#            # append new data
#            USER <- rbind.fill(USER, USER_NEW)
#        }
    })

    # NOTE(review): USER_NEW is assigned inside the with_rcurl({...}) block;
    # this assumes with_rcurl() evaluates its expression in this frame --
    # confirm against its definition
    result.ok(nrow(USER_NEW))
}


write_GH_PACKAGES <- function(dir, fields = c('Date', GRAN.fields())){
    # Generate PACKAGES index files over unpacked source package
    # directories under `dir`, keeping every available version and
    # recursing into sub-directories.
    write_PACKAGES(dir
        , type = 'source'
        , unpacked = TRUE
        , fields = fields
        , latestOnly = FALSE
        , subdirs = TRUE)
}

repopath2xpath <- function(x){
    # Convert GRAN repository tarball paths of the form
    # .../github.com/<user>/<repo>/tarball/<ref>/<file> into the compact
    # xpath form "<user>/<repo>/<ref>".
    #
    # Elements that do not match are returned unchanged (gsub semantics);
    # the function is vectorized over `x`.
    #
    # FIX: the dot in 'github.com' is now escaped so only a literal
    # 'github.com' path component matches (previously '.' matched any
    # character).
    gsub(".*/github\\.com/([^/]+)/([^/]+)/tarball/([^/]+)/.*", "\\1/\\2/\\3", x)
}

#' Updates GitHub Source Package Repository
#' 
#' Rebuilds the GRAN source package index from unpacked Github package
#' directories, optionally pulling fresh sources first.
#' 
#' @param src path to the directory that holds the package directory tree 
#' @param force currently unused in the function body (kept for signature
#'   compatibility)
#' @param fields extra DESCRIPTION fields to carry in the PACKAGES data
#'   ('Date' is always prepended)
#' @param update logical or character: \code{TRUE} pulls updates for all
#'   Github repos, a character vector restricts the pull to these sources,
#'   and \code{NA} short-circuits the call, returning the normalized
#'   source path only
#' @param actions steps to perform: any of 'changes', 'PACKAGES', 'index'
#' @param clean logical that indicates if the pulled 'github.com' data
#'   directory should be deleted after indexing
#' @param test logical: if \code{TRUE}, runs in dry-run mode (verbose,
#'   no commit, no cleanup, packages.rds not saved)
#' @param verbose logical that toggles log messages
#' @importFrom tools md5sum
GRAN.update_github <- function(src, force = FALSE, fields = GRAN.fields(), update = FALSE, actions = c('changes', 'PACKAGES', 'index')
                                , clean = TRUE, test = FALSE, verbose = TRUE){
    
    library(plyr)
    # test mode forces a verbose dry run
    # NOTE(review): do_commit is set here but never read in the visible body
    do_commit <- TRUE
    if( test ){
        do_commit <- FALSE
        verbose <- TRUE
        clean <- FALSE
    }
    
    # dump messages if non-verbose
    # (local shadowing of message() only affects this function's frame)
    if( !verbose ) message <- function(...) NULL
    
    # resolve the source directory: <src>/repos/github/src/contrib
    src0 <- src
    base_github <- 'repos/github'
    outdir <- file.path(src0, base_github)
    src_github <- contrib.url(base_github, type = 'source')
    src <- file.path(src0, src_github)
    src_fullpath <- normalizePath(src)
    # update = NA: only return the resolved source path, do nothing
    if( is_NA(update) ){
        return(src_fullpath)
    }
    message("* Updating GitHub source packages in ", src_fullpath)
    
    # match type of action to perform
    actions <- match.arg(actions, several.ok = TRUE)
    
    # pull updates for "all" GitHub R repos
    # a character `update` selects sources and implies update = TRUE
    update.src <- NULL
    if( is.character(update) ){
        update.src <- update
        update <- TRUE
    }
    
    # NOTE(review): nUpdated is computed but never used afterwards
    nUpdated <- 0
    if( update ){
        nUpdated <- pull_github_packages(src_fullpath, update.src)
    }
        
    ## CHECK CHANGES
    library(git2r)
    # initialise repository if not already in a git repo
    if( !in_repository(src) ){
        init(src0)
    }
    GH_repo <- repository(src0)
    #        status <- status(GH_repo) 
#    message("* Checking changes ", appendLF = FALSE)
#    CHANGES <- list.changed.packages(src)
#    message(sprintf("[%s]", paste(lengths(CHANGES), names(CHANGES), collapse = " | ")))
    #
    
    # update PACKAGES files
    if( 'PACKAGES' %in% actions ){
                
        ## add Github packages to source PACKAGES
        # write PACKAGES file from Github source directories 
        fields <- c('Date', fields)
        message("* Generating PACKAGES files in '", src, "'")
        
        # gather all DESCRIPTION files under the source tree into one matrix
        message("* Loading updated PACKAGES ... ", appendLF = FALSE)
        d <- list.files(src_fullpath, recursive = TRUE, pattern = '^DESCRIPTION$', full.names = TRUE)
        PACKS_NEW <- do.call(rbind.fill.matrix, lapply(d, read.dcf))
        # nrow(NULL) is NULL, hence the %||% 0L fallback
        n_up <- nrow(PACKS_NEW) %||% 0L 
        messagef("OK [%i packages]", n_up)
        if( !n_up ) return()
        
        # enforce format
        PACKS_NEW <- add_dcf_field(PACKS_NEW, 'Package')
        PACKS_NEW <- add_dcf_field(PACKS_NEW, 'GithubRepo', PACKS_NEW[, 'Package'])
        
        # set rownames and check/remove  for duplicates
        # (all copies of a duplicated key are dropped, not just the extras)
        rownames(PACKS_NEW) <- GRAN_key(PACKS_NEW)
        if( anyDuplicated(rownames(PACKS_NEW)) ){
            dups <- unique(rownames(PACKS_NEW)[duplicated(rownames(PACKS_NEW))])
            id <- which(rownames(PACKS_NEW) %in% dups)
            messagef("* Removing %i duplicated packages: %s", length(id), str_out(dups, 5, total = TRUE))
            PACKS_NEW <- PACKS_NEW[-id, , drop = FALSE] 
        } 
        
        # clean up
        bad <- which( is.na(PACKS_NEW[, 'Package']) | !nzchar(PACKS_NEW[, 'Package']) ) 
        if( length(bad) ){
            bad_nm <- unique(PACKS_NEW[bad, 'Package'])
            messagef("* Removing %i packages with invalid names: %s", length(bad), str_out(bad_nm, 5, total = TRUE))
            PACKS_NEW <- PACKS_NEW[-bad, , drop = FALSE]
        }
        
        # load previous PACKAGES
        message("* Loading old PACKAGES ... ", appendLF = FALSE)
        if( file.exists(packages_file <- file.path(src, 'packages.rds')) ){
            PACKS <- readRDS(packages_file)
            rownames(PACKS) <- GRAN_key(PACKS)
            # drop every row whose key is duplicated (all copies removed)
            PACKS <- PACKS[!rownames(PACKS) %in% rownames(PACKS)[duplicated(rownames(PACKS))], , drop = FALSE]
            messagef("OK [%i packages]", nrow(PACKS))
            
        }else{
            # no previous index: start from an empty matrix with the new columns
            PACKS <- rbind.fill.matrix(matrix(NA, 0, 0), PACKS_NEW[0, , drop = FALSE])
            message("none")
        }
        
        # align package matrices
        # i maps each new row to its position in PACKS (0 = genuinely new);
        # binding 0-row slices of each matrix onto the other pads both to
        # the union of columns
        i <- match(rownames(PACKS_NEW), rownames(PACKS), nomatch = 0L)
        PACKS <- rbind.fill.matrix(PACKS, PACKS_NEW[0L, , drop = FALSE])
        rownames(PACKS) <- GRAN_key(PACKS)
        PACKS_NEW <- rbind.fill.matrix(PACKS_NEW, PACKS[0L, , drop = FALSE])
        PACKS_NEW <- PACKS_NEW[, colnames(PACKS), drop = FALSE]
        rownames(PACKS_NEW) <- GRAN_key(PACKS_NEW)
        
        # add new packages
        new <- which(!i)
        if( length(new) ){
            messagef("  ** Adding %i packages to index", length(new))
            PACKS <- rbind(PACKS, PACKS_NEW[new, colnames(PACKS), drop = FALSE])
        }
        rownames(PACKS) <- GRAN_key(PACKS)
        
        # update other packages
        # (zero indices in i are dropped by subscripting, matching i > 0 on the RHS)
        messagef("  ** Updating %s packages ", sum(i>0))
        PACKS[i, colnames(PACKS)] <- PACKS_NEW[i>0, colnames(PACKS), drop = FALSE]
        
        # clean up
        PACKS <- PACKS[!is.na(PACKS[, 'Package']), , drop = FALSE]
        PACKS <- add_dcf_field(PACKS, 'Path', file.path('github.com', rownames(PACKS)), force = TRUE)
        
        # TODO: remove deleted packages
        to_delete <- NULL
#        if( length(del <- which(rownames(PACKS) %in% to_delete)) ){
#            message(sprintf("  ** Removing %i packages from index", length(del)))
#            PACKS <- PACKS[-del, , drop = FALSE]
#        }
        
        # write files
        message(sprintf("  ** Updating file PACKAGES files [%i]", nrow(PACKS)))
#        write_PACKAGES_files(PACKS, src)
        if( !test ) saveRDS(PACKS, file.path(src, 'packages.rds'))
            
        # clean pulled data
        if( clean ){
            unlink( file.path(src_fullpath, 'github.com'), recursive = TRUE)
        }
        
#        }else if( force ){
#            message("  ** Writing PACKAGES from unpacked directories")
##            write_GH_PACKAGES(src, fields = fields)
#        }
        
    }
    
    # generate index file
    if( 'index' %in% actions ){
        message("* Generating index file:")
        write_PACKAGES_index(outdir, title = 'GRAN: Github R Archive Network')
    }
    
    # return output directory to calling script in non-interactive mode
    # NOTE(review): PACKS is only assigned inside the 'PACKAGES' branch --
    # this line errors when actions excludes 'PACKAGES'; confirm intended
    invisible(PACKS)
}
# renozao/repotools documentation built on May 27, 2019, 5:53 a.m.