readhyd: Importing files for Hydrologial Databases

# always open the connection with the correct encoding

read.hzb <- function(file, parse.header = TRUE, fileEncoding = "ISO8859-1",
                     nlines = -1) {
  # separate the header, open the connection with correct enconding
  # todo: speedup using connections like in read.lfu()
  fh <- file(file, open = "rt", encoding = fileEncoding)
  header <- readLines(fh, n = 50)
  close(fh)

  type <- if (grepl(";", header[1])) "csv2" else "fwf"

  lines.header <- grep("Werte:", header, fixed = T)
  header <- head(header, lines.header - 1)

  na.strings <- "L\u00fccke"

  args <- list(file = file, header = F, skip = lines.header,
               na.strings = na.strings, fileEncoding = fileEncoding,
               strip.white = TRUE, as.is = TRUE)

  infile <- if(type == "fwf") {
    do.call(read.fwf, c(args, list(col.names = c("time", "value"),
                                   colClasses = c("character", "numeric"),
                                   widths = c(20, 20), n = nlines)))
  } else if (type == "csv2"){
    ncol <- length(strsplit(header[1], ";")[[1]]) + 1

    # also remove trailing separators in header
    header <- sub(";+$", "", header, fixed = FALSE)
    do.call(read.csv2, c(args,
                         list(col.names = c("time", "value", rep(NA, ncol - 2)),
                              colClasses = c("character", "numeric",
                                             rep("NULL", ncol -2)),
                              nrows = nlines)))
  }

  # letzter Wert ist immer NA, irgendwie unnötig
  # nicht entfernen, wenn nur ein Teil der Datei eingelesen wird
  if(nlines < 0) infile <- head(infile, -1)

  # sehr ungeschicktes Verhalten: keine Umstellung von Winterzeit auf Sommerzeit
  # aber auch im Sommer werden die Tagesmittelwerte mit "%d.%m.%Y 00:00:00"
  # bezeichnet
  # resultierende NA Werte im Zeitstempel entfernen
  # gibt vermutlich Probleme wenn eine Zeitreihe von Stundendaten erwartet wird
  infile$time <- try_timeformat(infile$time, format = "%d.%m.%Y %H:%M:%S")
  nas <- which(is.na(infile$time))
  if(length(nas)) {
    infile <- infile[!is.na(infile$time), ]
    warning(basename(file), ": removing NA values in index")
  }


  if (parse.header) {
    meta <- .parse_header_hzb(header, type = type)
    attr(x = infile, which = "list") <- meta[["list"]]
    attr(x = infile, which = "keyval") <- meta[["keyval"]]
  }

  return(infile)
}
.parse_header_hzb <- function(x, type = c("csv2", "fwf")) {
  type <- match.arg(type)

  # some lines are in key: value format,
  # there are also nested lists idented with spaces, treat them differently
  ident <- nchar(sub("^([ ]*).*$", "\\1", x))

  # lines with the same identation compose a list
  idx <- numeric(length(ident))
  j <- 0
  for (i in seq_along(ident)){
    # only increment counter if there is no ident
    if(i == length(ident) || ident[i] == 0) j <- j + 1
    idx[i] <- j
  }

  # unique indices reflect simple key-value pairs
  is.keyval <- idx %in% names(which(table(idx) == 1))
  keyval <- x[which(is.keyval & grepl(":", x, fixed = T))]
  keyval <- .split_keyval(keyval)

  header.list <-  split(x[which(!is.keyval)], idx[!is.keyval])

  # Abschnitt "Exportzeitreihe" is komisch formatiert...
  header.list <- header.list[!grepl("Exportzeitreihe", header.list)]
  header.list <- do.call(c, lapply(unname(header.list), .split_list_whitespace))

  return(list(keyval = keyval, list = header.list))
}


# .split_keyval_csv <- function(x, only.first = T) {
#   # sometimes, only the first ";" is a field separator, replace it with "\t"
#   x <- if(only.first) sub(";", "\t", x) else gsub(";", "\t", x)
#   keyval <- do.call(rbind, strsplit(x, split = "\t"))
#   keyval <- apply(keyval, 2, trimws)
#   keyval <- apply(keyval, 2, sub, pattern = ":$", replacement = "")
#
#   keyval[1, ] <- .format_title(keyval[1, ])
#
#   return(keyval)
# }

.split_keyval <- function(x, only.first = T) {
  y <- gsub(":[[:blank:]]+;?", "\t ", x)
  keyval <- do.call(rbind, strsplit(y, split = "\t"))
  keyval <- apply(keyval, 2, trimws)

  keyval[, 1] <- .format_title(keyval[, 1])

  return(keyval)
}

.split_list <- function(x) {
  # split list after colons
  txt <- strsplit(x[2], "")[[1]]
  colon <- head(which(txt == ":"), -1)
  offset <- sapply(colon, function(x) grep("[^ ;]", tail(txt, -x))[1])
  pos <- colon + offset
  y <- do.call(rbind, lapply(x[-1], substring,
                             first = c(1, pos),
                             last = c(pos - 1, 10000L)))

  tbl <- trimws(sub(";$", "", y[-1, , drop = F]))
  colnames(tbl) <- .format_title(y[1, ])
  tbl <- list(tbl)
  names(tbl) <- .format_title(x[1])

  return(tbl)
}


.split_list_whitespace <- function(x) {
  x <- trimws(x)
  y <- gsub(":?[[:blank:]]{3,};?", "\t", x)
  y <- do.call(rbind, strsplit(tail(y, -1), split = "\t", fixed = TRUE))

  tbl <- y[-1, , drop = F]
  colnames(tbl) <- .format_title(y[1, ])
  tbl <- list(tbl)
  names(tbl) <- .format_title(x[1])

  return(tbl)
}


.format_list_csv2 <- function(x) {
  title <- .format_title(x[1])

  tbl <- t(.split_keyval(x[-1], only.first = FALSE))

  # table can be empty
  if (is.null(nrow(tbl))) {
    tbl <- NULL
  } else {
    header <- .format_title(tbl[1, ])
    tbl <- data.frame(tbl[-1, , drop = F], stringsAsFactors = FALSE)

    colnames(tbl) <- header
  }


  tbl <- list(tbl)
  names(tbl) <- title

  return(tbl)
}



.format_list_fwf <- function(x) {
  title <- .format_title(x[1])
  widths <- c(27, 20, 25)

  tbl <- lapply(x[-1], substring,
                first = cumsum(c(1, head(widths, -1))),
                last = cumsum(widths))

  tbl <- apply(do.call(rbind, tbl), 2, trimws)
  tbl <- tbl[, apply(tbl != "", 2, any)]

  # table can be empty
  if (is.null(nrow(tbl))) {
    tbl <- NULL
  } else {
    header <- .format_title(tbl[1, ])
    tbl <- data.frame(tbl[-1, , drop = F])

    colnames(tbl) <- header
  }


  tbl <- list(tbl)
  names(tbl) <- title

  return(tbl)
}

.format_title <- function(x) {
  # remove trailing ":"
  y <- sub(":[[:blank:]]*;?$", "", trimws(x))

  # remove everything inside round brackets
  y <- gsub("\\(.*?\\)", "", y)

  # remove everything inside square brackets
  y <- gsub("\\[.*?\\]", "", y)
  return(trimws(y))
}

.value <- function(x, name) x[x[, 1] == name, 2]
.listvalue <- function(x, lname, ename) {
  y <- x[[lname]][, ename, drop = F]
  y <- if(nrow(y) < 1) NA else tail(y, 1)

  return(y)
}


extract_meta <- function(x) {

  att1 <- attr(x, "keyval")
  att2 <- attr(x, "list")

  y <- data.frame(name = .value(att1, "Messstelle"),
                  id.hzb = as.numeric(.value(att1, "HZB-Nummer")),
                  departement = .value(att1, "Dienststelle"),
                  lon = dms2dec(.listvalue(att2, "Geographische Koordinaten", "L\u00e4nge")),
                  lat = dms2dec(.listvalue(att2, "Geographische Koordinaten", "Breite")),
                  z = .toNum(.listvalue(att2, "Pegelnullpunkt", "H\u00f6he")))

  rownames(y) <- NULL
  return(y)
}

extract_meta_xts <- function(x) {
  att <- xtsAttributes(x)
  meta <- att[setdiff(names(att), "coordinates")]
  coord <- att[["coordinates"]]
  meta[names(coord)] <- coord

  names(meta)[names(meta) == "station"] <- "name"

  return(as.data.frame(meta))
}




# append list info
nlast <- function(x, n = 1, col) {
  y <- if (!length(nrow(x))) rep(NA, length(col)) else tail(x[, col], n)
  if(length(dim(y))) apply(y, 2, as.character) else as.character(y)
}




hzb2xts <- function(x){
  y <- xts(x = data.frame(discharge = x$value), order.by = x$time)
  y <- regularize(y)

  att <- attr(x, "keyval")
  coord <- numeric()
  coord[c("lon", "lat")] <- dms2dec(
    .listvalue(attr(x, "list"), "Geographische Koordinaten",
               c("L\u00e4nge", "Breite")))

  z <- .toNum(.listvalue(attr(x, "list"), "Pegelnullpunkt", "H\u00f6he"))

  dict <- c("Messstelle"="station", "HZB-Nummer"="id.hzb", "HD-Nummer"="id.hd",
            "DBMS-Nummer"="id.dbms", "Gew\u00e4sser"="river",
            "Messstellenbetreiber" = "operator", "Dienststelle" = "departement",
            "orogr.Einzugsgebiet"="catchment", "Einheit"="unit")

  keyval <- list()
  for(i in seq_along(dict)) keyval[[i]] <- .value(att, names(dict)[i])
  names(keyval) <- dict

  keyval[["id.hzb"]] <- as.numeric(keyval[["id.hzb"]])
  keyval[["catchment"]] <- .toNum(keyval[["catchment"]])
  keyval[["unit"]] <- sub("\u00B3", "^3",keyval[["unit"]])

  xtsAttributes(y) <- c(keyval, list(coordinates = coord, z = z))

  return(y)
}