R/rfsi.R

rfsi <- function (formula, # without nearest obs
                  data, # sf | sftime | SpatVector | data.frame(x,y,obs,time,ec1,ec2,...)
                  data.staid.x.y.z = NULL, # = c(1,2,3,4), # if data.frame # time = mid.depth
                  n.obs = 5, # nearest obs
                  # time.nmax, # use all if not specified
                  avg = FALSE,
                  increment = 10000, # avg(nearest point dist)
                  range = 50000, # bbox smaller(a, b) / 2
                  quadrant = FALSE,
                  use.idw = FALSE,
                  idw.p = 2,
                  s.crs = NA, # to ce da leti - ...
                  p.crs = NA, # to ce da leti - ...
                  cpus = detectCores()-1, # for near.obs
                  progress = TRUE,
                  soil3d = FALSE, # soil RFSI
                  depth.range = 0.1, # in units of depth
                  no.obs = 'increase', # exactly
                  ...){ # ranger parameters
                        # quantreg,
                        # num.trees,
                        # mtry,
                        # min.node.size,
                        # sample.fraction,
  
  # check the input
  if (progress) print('Preparing data ...')
  if ((missing(data)) | missing(formula)) {
    stop('The arguments data (or obs and stations) and formula must not be empty!')
  }
  formula <- as.formula(formula)
  all_vars <- all.vars(formula)
  obs.col.name <- all_vars[1]
  names_covar <- all_vars[-1]
  
  # prepare data
  data.prep <- data.prepare(data=data, data.staid.x.y.z=data.staid.x.y.z, s.crs=s.crs)
  data.df <- data.prep[["data.df"]]
  data.staid.x.y.z <- data.prep[["data.staid.x.y.z"]]
  s.crs <- data.prep[["s.crs"]]
  
  # if vars exists
  c.dif <- setdiff(all_vars, names(data.df))
  if (!identical(c.dif, character(0))) {
    stop(paste('The variable(s) ', paste(c.dif, collapse = ", "), ' - missing from data!', sep = ""))
  }
  
  if (is.na(s.crs)) {
    warning('Source CRS is NULL! Using given coordinates for Euclidean distances calculation.')
  } else if (is.na(p.crs)) {
    if (progress) print('Projection CRS is NULL. Using source CRS for Euclidean distances calculation:')
    if (progress) print(s.crs$input)
  } else if (identical(s.crs, p.crs)) {
    if (progress) print('s.crs==p.crs')
  } else {
    if (progress) print('Using projection CRS for Euclidean distances calculation.')
    prj_print <- try(paste('Do reprojection from: ', s.crs$input, ' to ', p.crs$input, sep=""), silent = TRUE)
    if(inherits(prj_print, "try-error")) {
      prj_print <- paste('Do reprojection from: ', s.crs@projargs, ' to ', p.crs@projargs, sep="")
    }
    if (progress) print(prj_print)
    # if (progress) print(paste('Do reprojection from: ', s.crs$input, ' to ', p.crs$input, sep=""))
    # reproject data coordinates
    data.coord <- data.df[, data.staid.x.y.z[2:3]]
    data.coord <- st_as_sf(data.coord, coords = names(data.coord), crs = s.crs, agr = "constant")
    data.coord <- st_transform(data.coord, p.crs)
    data.coord <- as.data.frame(st_coordinates(data.coord$geometry))
    names(data.coord) <- c('x.proj', 'y.proj')
    data.df <- cbind(data.df, data.coord)
    data.staid.x.y.z[2:3] <- c(length(data.df)-1, length(data.df))
  }
  
  x.y <- names(data.df)[data.staid.x.y.z[2:3]]
  data.df = data.df[complete.cases(data.df[, names_covar]), ]
  if (soil3d) {
    depth.name <- names(data.df)[data.staid.x.y.z[4]]
  }
  
  # if space-time
  if (!is.na(data.staid.x.y.z[4]) & !soil3d) {
    # sort data.df
    data.df <- data.df[order(data.df[, data.staid.x.y.z[4]],
                             data.df[, data.staid.x.y.z[1]]), ]
    time=sort(unique(data.df[, data.staid.x.y.z[4]]))
    daysNum = length(time)
    
    # calculate obs and dist
    if (progress) print('Space-time process ...')
    if (progress) print('Calculating distances to the nearest observations ...')
    registerDoParallel(cores=cpus)
    nearest_obs <- foreach (t = time, .export = c("near.obs")) %dopar% {
      
      day_df <- data.df[data.df[, data.staid.x.y.z[4]]==t, c(x.y, obs.col.name)]
      if (nrow(day_df)==0) {
        return(NULL)
      }
      return(near.obs(
        locations = day_df,
        # locations.x.y = c(1,2),
        observations = day_df,
        # observations.x.y = c(1,2),
        obs.col = obs.col.name,
        n.obs = n.obs,
        avg = avg,
        range = range,
        increment = increment,
        quadrant = quadrant,
        idw=use.idw,
        idw.p=idw.p
      ))
      
    }
    stopImplicitCluster()
    nearest_obs <- do.call("rbind", nearest_obs)
    
    # calculate TPS
    # if (use.tps) {
    #   if (progress) print('Calculating TPS ...')
    #   registerDoParallel(cores=cpus)
    #   tps_fit <- foreach (t = time) %dopar% {
    #     day_df <- data.df[data.df[, data.staid.x.y.z[4]]==t, c(x.y, obs.col.name)]
    #     if (nrow(day_df)==0) {
    #       return(NULL)
    #     }
    #     if ((nrow(day_df)-3) <= tps.df) {
    #       tps.df.day <- nrow(day_df)-3
    #     } else {
    #       tps.df.day <- tps.df
    #     }
    #     m <- Tps(day_df[, x.y], day_df[, obs.col.name],# lon.lat = T,
    #              #lambda = tps.lambda) #, GCV=F)
    #              df=tps.df.day)
    #     return(as.vector(m$fitted.values))
    #   }
    #   stopImplicitCluster()
    #   tps_fit <- unlist(tps_fit) #as.vector(do.call("rbind", tps_fit))
    # }
    
  } else if (!is.na(data.staid.x.y.z[4]) & soil3d) { # soil 3D
    # calculate obs and dist
    day_df <- data.df[, c(x.y, depth.name, obs.col.name)]
    if (progress) print('Soil 3D process ...')
    if (progress) print('Calculating distances to the nearest observations (profiles) ...')
    nearest_obs <- near.obs.soil(
      locations = day_df,
      # locations.x.y.md = c(1,2,3),
      observations = day_df,
      # observations.x.y.md = c(1,2,3),
      obs.col = 4,
      n.obs = n.obs,
      depth.range = depth.range,
      no.obs = no.obs,
      parallel.processing = TRUE,
      pp.type = "doParallel", # "snowfall"
      cpus = cpus
    )
    
  } else { # if spatial
    # calculate obs and dist
    day_df <- data.df[, c(x.y, obs.col.name)]
    if (progress) print('Spatial process ...')
    if (progress) print('Calculating distances to the nearest observations ...')
    nearest_obs <- near.obs(
      locations = day_df,
      # locations.x.y = c(1,2),
      observations = day_df,
      # observations.x.y = c(1,2),
      obs.col = obs.col.name,
      n.obs = n.obs,
      avg = avg,
      range = range,
      increment = increment,
      quadrant = quadrant,
      idw=use.idw,
      idw.p=idw.p
    )
    
    # # calculate TPS
    # if (use.tps) {
    #   if (progress) print('Calculating TPS ...')
    #   m <- Tps(day_df[, x.y], day_df[, obs.col.name],# lon.lat = T,
    #            # lambda = tps.lambda) #, GCV=F)
    #            df=tps.df)
    #   tps_fit <- as.vector(m$fitted.values)
    # }
  }
  
  data.df <- cbind(data.df, nearest_obs)
  # if (use.tps) {
  #   tps.var <- paste("tps_", tps.df, sep="")
  #   data.df[, tps.var] <- tps_fit
  # }

  data.df = data.df[complete.cases(data.df[, c(names_covar, names(nearest_obs))]), ]
  if (nrow(data.df) == 0) {
    stop("There is no complete cases in data.")
  }
  
  # if (use.tps) {
  #   formula = as.formula(paste(deparse(formula), paste(names(nearest_obs), collapse = " + "), tps.var, sep = " + "))
  # } else {
  formula = as.formula(paste(paste((deparse(formula)), collapse=""), paste(names(nearest_obs), collapse = " + "), sep = " + "))
  # }
  rm(nearest_obs)
  
  # fit RF model
  if (progress) print('Fitting RFSI model ...')
  rfsi_model <- ranger(formula, data = data.df, ...)
  # rfsi_model <- ranger(formula, data = data.df, importance = importance, seed = seed,
  # num.trees = num.trees, mtry = mtry, splitrule = "variance",
  # min.node.size = min.node.size,  sample.fraction = sample.fraction,
  # quantreg = quantreg)
  if (progress) print('Done!')
  
  return(rfsi_model)
  # dodaj formulu
  # dodaj n.obs
  # avg ...
  
}

Try the meteo package in your browser

Any scripts or data that you put into this service are public.

meteo documentation built on Nov. 23, 2023, 3:01 p.m.