R/cransync.R

Defines functions getmeta process_cran parse_cran_file tsort cranstatus as_sd meansize meandeps

## working from the cran sync log

getmeta <- function(cransync = "./Data/cran.csv"){
    ## Read an rsync listing of CRAN's src/contrib and split it into tables.
    ##
    ## The listing is produced with:
    ## rsync -azn --out-format="%f,%l,%M"  cran.r-project.org::CRAN/src/contrib . > Data/cran.csv
    ##
    ## Arguments:
    ##   cransync  path to the CSV listing; its three columns are taken to be
    ##             path, size and modification time, in that order
    ##
    ## Value: a named list of three tables, each built by process_cran():
    ##   live     tarballs directly under contrib/
    ##   archive  tarballs under contrib/Archive/<dir>/
    ##   last     the contrib/Archive/<dir> entries themselves
    cran = data.table::fread(cransync, stringsAsFactors=FALSE)
    names(cran)=c("path","size","time")

    ## Dots are escaped and the tarball patterns are anchored at the end of
    ## the path so that only names really ending in ".tar.gz" are selected.
    regexps = list(
        live = "contrib/[A-Za-z][^/]*\\.tar\\.gz$",
        archive = "contrib/Archive/[^/]*/[A-Za-z][^/]*\\.tar\\.gz$",
        last = "contrib/Archive/[^/]*$"
    )

    ## lapply() carries the names of `regexps` over to the result
    parts = lapply(regexps, function(regexp){
        process_cran(cran[grep(regexp, cran$path),])
    })

    ## Some "live" packages are symlinks. No real package should be smaller than this, and
    ## all symlinks should be smaller.
    parts$live = parts$live[parts$live$size > 120,]

    return(parts)

}

process_cran <- function(cran){
    ## Turn raw listing rows into a table of package, version, size and time.
    ##
    ## Arguments:
    ##   cran  table with `path` and `time` columns (plus any others, which
    ##         are passed through unchanged)
    ##
    ## Value: the package/version columns from parse_cran_file() bound in
    ## front of `cran`, with `path` removed and `time` converted to POSIXct.
    name_parts <- parse_cran_file(cran$path)
    ## the listing separates date and clock time with "-"; as.POSIXct wants a space
    stamp <- gsub("-"," ",cran$time)
    cran$time <- as.POSIXct(stamp)
    cran$path <- NULL
    cbind(name_parts, cran)
}
    
parse_cran_file <- function(path){
    ## Split tarball paths into package name and version.
    ##
    ## Arguments:
    ##   path  character vector of paths such as "src/contrib/foo_1.2-3.tar.gz"
    ##
    ## Value: a data.table with columns `package` and `version`, one row per
    ## element of `path`. A file name without "_" (e.g. a directory entry)
    ## ends up unchanged in both columns.
    filename = sub(".*/","",path)
    package = sub("_.*","",filename)
    version = sub(".*_","",filename)
    ## escape the dots and anchor at the end: only a literal ".tar.gz" suffix
    ## is stripped, never a look-alike such as "xtar.gz" inside the version
    version = sub("\\.tar\\.gz$","",version)
    data.table::data.table(package=package, version=version, stringsAsFactors=FALSE)
}

tsort <- function(d){
    ## Return the rows of `d` in increasing order of its `time` column.
    row_order <- order(d$time)
    d[row_order,]
}

cranstatus <- function(name, meta, asof=Sys.time()){
    ## Return the full release history of package `name` up to time `asof`.
    ##
    ## Arguments:
    ##   name  package name, compared with the `package` column of each table
    ##   meta  list with `live`, `archive` and `last` tables (see getmeta())
    ##   asof  timestamp given to the closing row of a live package's history
    ##
    ## Value: list(history, birth, death) where `birth` is the earliest time
    ## in `history`.
    ##
    ## Dead packages return a non-NA $death which is also the last timepoint
    ## in $history, on a row that repeats the last released version.
    ## Live packages return NA $death and $history ends with a repeat of the
    ## current version stamped with the `asof` date.
    ##
    ## The `asof` date should be later than any version dates; typically it
    ## would be "now", as in "what's the state of packages as of now". Hence
    ## the Sys.time() default...
    ##
    ## This could be modified to truncate $history at `asof` to answer
    ## questions like "what was the state of "splancs" as of 2010-01-01".
    ##
    ## Stops with an error when the package has no releases at all, or when
    ## it is dead but has no `last` entry to date its removal; either case
    ## would otherwise yield -Inf/Inf timestamps from max()/min() of nothing.

    live = tsort(meta$live[meta$live$package==name,])
    archive = tsort(meta$archive[meta$archive$package==name,])
    last = tsort(meta$last[meta$last$package==name,])

    if(nrow(live)==0 && nrow(archive)==0){
        stop("no live or archived releases found for package '", name, "'", call.=FALSE)
    }

    if(nrow(live)==0){
        if(nrow(last)==0){
            stop("no Archive directory entry for package '", name,
                 "', cannot date its removal", call.=FALSE)
        }
        death = max(last$time)
        ## the appended row is a copy of the final archived release, so its
        ## version is already right; only the timestamp needs replacing
        history = rbind(archive, archive[nrow(archive),])
        history$time[nrow(history)] = death
    }else{
        history = rbind(archive, live)
        history = rbind(history, history[nrow(history),])
        history$time[nrow(history)] = asof
        death = NA
    }

    list(history=history, birth=min(history$time), death=death)
}

as_sd <- function(package, meta, now){
    ## Build a one-row data frame of lifetime data for `package`.
    ##
    ## Arguments:
    ##   package  package name, passed to cranstatus()
    ##   meta     metadata list, passed to cranstatus()
    ##   now      end time used for packages with no death date; only
    ##            evaluated in that case
    ##
    ## Value: data.frame with columns package, birth, death, life, event.
    ## `life` is the difference of the as.numeric() end and birth times;
    ## `event` is 1 when a death time is known and 0 otherwise.
    status <- cranstatus(package, meta)
    out <- data.frame(package=package, birth=status$birth, death=status$death)
    still_alive <- is.na(status$death)
    end_time <- if(still_alive) now else out$death
    out$life <- as.numeric(end_time) - as.numeric(out$birth)
    out$event <- if(still_alive) 0 else 1
    out
}

meansize <- function(package, meta){
    ## Summarise the tarball sizes over the history of `package`.
    ##
    ## Arguments:
    ##   package  package name, passed to cranstatus()
    ##   meta     metadata list, passed to cranstatus()
    ##
    ## Value: whatever meanZ() returns for the history's times and sizes
    ## (presumably a time-weighted mean -- confirm against meanZ).
    status = cranstatus(package, meta)
    releases = status$history
    ## name the column in full rather than relying on `$` partial matching
    ## of "t" to "time"
    meanZ(releases$time, releases$size)
}

meandeps <- function(package, meta, now){
    ## Summarise the Depends, Suggests and Imports counts over the history
    ## of `package`.
    ##
    ## Arguments:
    ##   package  package name
    ##   meta     metadata list, passed to cranstatus()
    ##   now      closing timestamp used when the package has no death date
    ##
    ## Value: one-row data.table with columns Depends, Suggests and Imports,
    ## each the result of meanZ() on the dependency table.
    ##
    ## Warns when the rsync listing and the pkgsearch history disagree on
    ## the number of releases.
    status <- cranstatus(package, meta)

    ## repeat the final release so there is a closing row to re-stamp below
    releases <- cran_package_history(package)
    releases <- rbind(releases, releases[nrow(releases),])

    n_listing <- nrow(status$history)
    n_search <- nrow(releases)
    if(n_search != n_listing){
        warning("Row mismatch in CRAN listing (",n_listing-1,") and pkgsearch history (",n_search-1,") for ",package)
    }

    deps <- deptab(releases)
    deps$time <- as.POSIXct(releases$crandb_file_date)

    ## the closing row ends at `now` for live packages, else at the last
    ## timepoint of the listing history
    closing <- if(is.na(status$death)) now else max(status$history$time)
    deps$time[length(deps$time)] <- closing

    data.table::data.table(
        Depends = meanZ(deps$time, deps$Depends),
        Suggests = meanZ(deps$time, deps$Suggests),
        Imports = meanZ(deps$time, deps$Imports)
        )
}

    
barryrowlingson/cransurv documentation built on Feb. 6, 2020, 4:41 a.m.