Run RCMIP5 loads/calcs 1) current package 2) current git commit number

Run 'pure' hard coded loads/calcs w/ 1) data.frame 2) array 3) raster.

Table by model: size of org file, time to load, size of loaded variable, time to merge 2 ensembles, time to calculate annual mean, time to calc global mean, time to calc annual -> global mean

Big model: CCSM4 (404417K) gpp - historical Small model: HadGEM2-ES (32649K) nbp - historical

Report machine: # core, CPU speed, memory

#library(RCMIP5)
RDirPath <- '../R'
source("../sourceall.R")
library(ncdf4)
library(assertthat)
library(reshape2)
library(plyr)
library(raster)
source("profiler.R")

small.file <- list(model='HadGEM2-ES', experiment='historical', variable='nbp', ensemble=list('r3i1p1', 'r4i1p1'), domain=list('Lmon', 'Lmon'), path='../sampledata/monthly', 
                   areafile='../sampledata/fx/areacella_fx_HadGEM2-ES_historical_r0i0p0.nc',
                   filename=list('../sampledata/monthly/nbp_Lmon_HadGEM2-ES_historical_r3i1p1_195912-198411.nc','../sampledata/monthly/nbp_Lmon_HadGEM2-ES_historical_r4i1p1_195912-198411.nc'))
profileRCMIP5 <- function(file.info) {
    variableTime <- system.time(firstVar <- loadCMIP5(model=file.info$model, variable=file.info$variable, experiment=file.info$experiment, ensemble=file.info$ensemble[[1]], domain=file.info$domain[[1]], path=file.info$path))[3]

    mergeTime <- system.time(mergeVar <- loadCMIP5(model=file.info$model, variable=file.info$variable, experiment=file.info$experiment, domain=file.info$domain[[1]], path=file.info$path))[3]

    annualTime <- system.time(annualVar <- makeAnnualStat(firstVar))[3]

    #globalVar <- NA
    #globalTime <- NA
    globalTime <- system.time(globalVar <- makeGlobalStat(annualVar))[3]
    print(globalVar$val$value)
    temp <- data.frame(method=paste('RCMIP5', packageVersion('RCMIP5'), sep='v'), 
                       loadTime=variableTime, mergeTime=mergeTime, annualTime=annualTime, globalTime=NA, 
                       fileSize=file.info(file.info$filename[[1]])$size, 
                       loadSize=object.size(firstVar)[[1]], annualSize = object.size(annualVar)[[1]], globalSize = NA)
    row.names(temp) <- NULL

    return(temp)
    }

report.df <- profileRCMIP5(small.file)
profileRCMIP5Array <- function(file.info) {
    variableTime <- system.time(firstVar <- loadCMIP5(model=file.info$model, variable=file.info$variable, experiment=file.info$experiment, ensemble=file.info$ensemble[[1]], domain=file.info$domain[[1]], path=file.info$path, loadAs='array'))[3]

    mergeTime <- system.time(mergeVar <- loadCMIP5(model=file.info$model, variable=file.info$variable, experiment=file.info$experiment, domain=file.info$domain[[1]], path=file.info$path, loadAs='array'))[3]

    annualTime <- system.time(annualVar <- makeAnnualStat(firstVar))[3]

    globalTime <- system.time(globalVar <- makeGlobalStat(annualVar))[3]
    print(globalVar$val[TRUE])
    temp <- data.frame(method=paste('RCMIP5.array', packageVersion('RCMIP5'), sep='v'), 
                       loadTime=variableTime, mergeTime=mergeTime, annualTime=annualTime, globalTime=NA, 
                       fileSize=file.info(file.info$filename[[1]])$size, 
                       loadSize=object.size(firstVar)[[1]], annualSize = object.size(annualVar)[[1]], globalSize = NA)
    row.names(temp) <- NULL

    return(temp)
    }


report.df <- rbind(report.df, profileRCMIP5Array(small.file))
loadNCDF <- function(filename, varStr) {
    nc <- nc_open(filename, write=FALSE)
    ans <- ncvar_get(nc, varid=varStr)

    dimNames <- unlist(lapply(nc$var[[varStr]]$dim, FUN=function(x) { x$name }))
    if('time' %in% dimNames) {
        # Get the time unit (e.g. 'days since 1860')
        timeName <- dimNames[length(dimNames)]
        timeUnit <- ncatt_get(nc, timeName, 'units')$value
        # Get the type of calendar used (e.g. 'noleap')
        calendarStr <- ncatt_get(nc, timeName, 'calendar')$value
        calendarUnitsStr <- ncatt_get(nc, timeName, 'units')$value

        # Extract the number of days in a year
        if(grepl('^[^\\d]*\\d{3}[^\\d]day', calendarStr)) {
            calendarDayLength <- as.numeric(regmatches(calendarStr,
                                                       regexpr('\\d{3}', calendarStr)))
            } else {
                calendarDayLength <- 365
                }

        # Extract the year we are references in calendar
        # Set the default to year 1, month 1, day 1, hour 0, min 0, sec 0
        defaultCalendarArr <- c(1, 1, 1, 0, 0, 0)

        # Split the calandar unit string based on a '-', space, or ':'
        # ...this allows us to deal with YYYY-MM-DD hh:mm:ss, YYYY-MM-DD, or
        # ...YYYY-M-D
        calendarArr <- unlist(strsplit(calendarUnitsStr, split='[- :]'))

        # Check that the time is going to be in days otherwise latter
        # ...calculations for time array break
        assert_that(any(grepl('day', calendarArr)))

        # extract just the digits
        calendarArr <- as.numeric(calendarArr[grepl('^\\d+$', calendarArr)])

        # swap the default values with the extracted values, we assume
        # ... that years are listed before months, before days, and so on
        temp <- defaultCalendarArr
        temp[1:length(calendarArr)] <- calendarArr
        calendarArr <- temp

        # calculate the decimal starting year
        startYr <- sum((calendarArr - c(0, 1, 1, 0, 0, 0))
                       / c(1, 12, calendarDayLength, calendarDayLength*24,
                           calendarDayLength*24*60, calendarDayLength*24*60*60))

        # Load the actual time
        thisTimeRaw <- ncvar_get(nc, varid=timeName)
        attributes(thisTimeRaw) <- NULL

        # convert from days (we assume the units are days) to years
        thisTimeArr <- thisTimeRaw / calendarDayLength + startYr
        } else {
            thisTimeArr <- NULL
            }
    nc_close(nc)
    return(list(val=ans, time=thisTimeArr))
    }

profileArray <- function(file.info) {
    variableTime <- system.time(firstVar <- loadNCDF(file.info$filename[[1]], varStr=file.info$variable))[3]

    mergeTime <- system.time(mergeVar <- (loadNCDF(file.info$filename[[1]], varStr=file.info$variable)$val + loadNCDF(file.info$filename[[2]], varStr=file.info$variable)$val)/2)[3]

    annualTime <- system.time(
        {
            rawYrs <- table(floor(firstVar$time))
            yrs <- as.numeric(names(rawYrs)[rawYrs == 12])
            annualVar <- vapply(unique(yrs), FUN=function(yrNum) {apply(firstVar$val[,,yrNum == yrs], c(1,2), mean)}, FUN.VALUE=firstVar$val[,,1])
            })[3]

    globalTime <- system.time({
        area.ls <- loadNCDF(file.info$areafile, 'areacella')
        globalVar <- apply(annualVar, c(3), function(x) {sum(x*area.ls$val, na.rm=TRUE)})
        })[3]

    print(sprintf('annual global summary for %s-%s-%s', file.info$model, file.info$experiment, file.info$variable))
    print(globalVar)

    temp <- data.frame(method='ncdf-array', 
                       loadTime=variableTime, mergeTime=mergeTime, annualTime=annualTime, globalTime=globalTime, 
                       fileSize=file.info(file.info$filename[[1]])$size, 
                       loadSize=object.size(firstVar)[[1]], annualSize = object.size(annualVar)[[1]], globalSize = object.size(globalVar)[[1]])
    row.names(temp) <- NULL
    return(temp)
    }

report.df <- rbind(report.df, profileArray(small.file))
toDF <- function(ncdfload) {
    ans <- melt(ncdfload$val)
    if(is.null(ncdfload$time)) {
        names(ans) <- c('lonIndex', 'latIndex', 'val')
        } else { 
            ans$Var3 <- ncdfload$time[ans$Var3]
            names(ans) <- c('lonIndex', 'latIndex', 'time', 'val')
            }
    return(ans)
    }

profileDataframe <- function(file.info) {
    variableTime <- system.time(firstVar <- toDF(loadNCDF(file.info$filename[[1]], varStr=file.info$variable)))[3]

    mergeTime <- system.time({
        firstVar <- toDF(loadNCDF(file.info$filename[[1]], varStr=file.info$variable))
        secondVar <- toDF(loadNCDF(file.info$filename[[1]], varStr=file.info$variable))                     
        mergeVar <- merge(firstVar, secondVar, by=c('lonIndex', 'latIndex', 'time'))
        mergeVar$val <- (mergeVar$val.x + mergeVar$val.y)/2
        mergeVar$val.x <- NULL
        mergeVar$val.y <- NULL
        })[3]

    annualTime <- system.time(
        {firstVar$yrs <- floor(firstVar$time)
         annualVar <- ddply(firstVar, c('lonIndex', 'latIndex', 'yrs'), function(x) {
             if(length(x$val) != 12) {
                 return(data.frame(val=NA))
                 } else {
                     return(data.frame(val=mean(x$val)))
                     }
             firstVar$yrs <- NULL
             return(ans)})
         })[3]

    globalTime <- system.time({
        area.ls <- toDF(loadNCDF(file.info$areafile, 'areacella'))
        globalVar <- merge(annualVar, area.ls, by=c('lonIndex', 'latIndex'))
        globalVar <- ddply(globalVar, c('yrs'), function(x) {
            data.frame(val=sum(x$val.x*x$val.y, na.rm=TRUE))
            })
        })[3]

    print(sprintf('annual global summary for %s-%s-%s', file.info$model, file.info$experiment, file.info$variable))
    print(globalVar$yrs)
    print(globalVar$val)

    temp <- data.frame(method='ncdf-dataframe', 
                       loadTime=variableTime, mergeTime=mergeTime, annualTime=annualTime, globalTime=globalTime, 
                       fileSize=file.info(file.info$filename[[1]])$size, 
                       loadSize=object.size(firstVar)[[1]], annualSize = object.size(annualVar)[[1]], globalSize = object.size(globalVar)[[1]])
    row.names(temp) <- NULL
    return(temp)
    }

report.df <- rbind(report.df, profileDataframe(small.file))
#file.info <- small.file

profileRasterInMemory <- function(file.info) {
    variableTime <- system.time(firstVar <- stack(file.info$filename[[1]]))[3]

    mergeTime <- system.time({
        firstVar <- stack(file.info$filename[[1]])
        secondVar <- stack(file.info$filename[[1]])                     
        mergeVar <- (firstVar + secondVar)/2
        })[3]

    annualTime <- system.time({
        yrs <- as.numeric(substr(names(firstVar), 2, 5))
        annualVar <- stackApply(firstVar, yrs - yrs[1] + 1, mean)
        names(annualVar) <- unique(yrs)
        })[3]

    globalTime <- system.time({
        areaVar <- raster(file.info$areafile, varname='areacella')
        globalVar <- cellStats(annualVar*areaVar, stat='sum')
        names(globalVar) <- names(annualVar)
        })[3]

    print(sprintf('annual global summary for %s-%s-%s', file.info$model, file.info$experiment, file.info$variable))
    print(globalVar)

    temp <- data.frame(method='raster', 
                       loadTime=variableTime, mergeTime=mergeTime, annualTime=annualTime, globalTime=globalTime, 
                       fileSize=file.info(file.info$filename[[1]])$size, 
                       fileReadSize=object.size(firstVar)[[1]], inMemSize= object.size(mergeVar)[[1]], 
                       annualSize = object.size(annualVar)[[1]], globalSize = object.size(globalVar)[[1]])
    return(temp)
    }
#report.df <- profileRasterInMemory(small.file)
report.df <- merge(report.df, profileRasterInMemory(small.file), all=TRUE)
save(report.df, file='profiler.RData')
report.df[,grepl('Size$', names(report.df))] <- report.df[,grepl('Size$', names(report.df))]/1e6
names(report.df)[grepl('Size$', names(report.df))] <- paste(names(report.df)[grepl('Size$', names(report.df))], '[Mb]')
print(knitr::kable(report.df))


JGCRI/RCMIP5 documentation built on May 7, 2019, 7:43 a.m.