#clihomog.R.- Homogenization functions for the Climatol package.
#Author: Jose A. Guijarro. Licence: GPL >= 3.0
climatol.version <- '4.2.0'
#- cerrar.- Close output files.
cerrar <- function(graphics=TRUE) {
if(graphics) graphics.off()
palette('default') #reset palette to user's default
while (sink.number()>0) sink() #close logfile(s)
closeAllConnections()
}
#- xls2csv.- Dump data from all *.xls* files into a csv file.
xls2csv <- function(tmpdir, archdir, var, datcols=1:4, codesep='-', dec='.',
sep=',') {
#tmpdir: temporary directory containing the files to read
#archdir: directory where to archive files after processing
#var: destination name of the variable
#datcols: data columns to be written to the output file
#codesep: character string separating the code from the rest of the file name ('-' by default)
# dec='.': character to use as decimal point in the output file
# sep=',': character separating data in the output file
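#Example (hypothetical directories and variable name; adjust to your case):
# dir.create('arch')
# xls2csv('tmp','arch','Temp')
#This would create xls_Temp_data.csv and xls_Temp_stations.csv from the
#Excel files found in tmp/, moving them to arch/ afterwards.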
if(tmpdir==archdir) stop("Please set archdir to a directory different from tmpdir")
if(!requireNamespace('readxl', quietly=TRUE))
stop("Please, install the package 'readxl' and run this function again")
fdat <- sprintf('xls_%s_data.csv',var) #name of the data output file
fsta <- sprintf('xls_%s_stations.csv',var) #name of the stations output file
Fs <- file(fdat,'a') #open data output file
Ft <- file(fsta,'a') #open stations output file
fich <- dir(tmpdir,'\\.xls') #files in tmpdir (*.xls and *.xlsx)
nf <- 0 #counter of processed files
for(f in fich) { #for every file
cat(f,'\n')
if(grepl(codesep,f)) {
z <- strsplit(f,codesep)
code <- z[[1]][1] #station code
name <- z[[1]][2] #(likely) station name
} else code <- name <- strsplit(f,'\\.')[[1]][1]
z <- try(d <- as.data.frame(readxl::read_excel(sprintf('%s/%s',tmpdir,f)))[,datcols])
if(inherits(z,'try-error')) next
nalines <- apply(is.na(d),1,sum)>0 #lines with one or more missing data
d <- d[!nalines,]
d <- sapply (d,as.numeric)
d <- data.frame(code,d)
write.table(d,Fs,dec=dec,sep=sep,row.names=FALSE,col.names=FALSE)
write(sprintf('%s%s%s',code,sep,name),Ft)
file.rename(sprintf('%s/%s',tmpdir,f),sprintf('%s/%s',archdir,f))
nf <- nf+1 #update no. of processed files
}
close(Fs); close(Ft)
if(nf>0) {
cat(sprintf("Data from %d %s/*.xls* files have been saved into %s",nf,tmpdir,fdat),'\n')
cat(sprintf("Station codes and names have been written to %s",fsta),'\n')
cat(" Coordinates and elevations should be added to this file",'\n')
cat(" before running csv2climatol().",'\n')
cat(sprintf("(Original files have been moved to the %s directory.)",archdir),'\n\n')
} else cat(sprintf("No %s/*.xls* files found!",tmpdir),'\n\n')
}
#- csv2climatol.- Convert data in a single CSV file to Climatol input format.
#Station codes, names and coordinates can go in a separate CSV file.
csv2climatol <- function(csvfile, datacol=6:8, stnfile=csvfile, stncol=1:5,
varcli, anyi=NA, anyf=NA, mindat=NA, sep=',', dec='.', na.strings='NA',
dateformat='%Y-%m-%d', cf=1, ndec=1, header=TRUE) {
#csvfile: name of the CSV (text) file containing the data
#datacol: column(s) holding station codes, dates and data. If 4 (5) values
# are provided, dates are expected to appear as year, month (and days) in
# separate columns. Otherwise, dates must be provided as character strings
# (see parameter dateformat below)
#stnfile: name of the CSV file containing station codes, names and
# coordinates (if these data are not in the csvfile)
#stncol: columns holding longitudes, latitudes, elevations and station
# codes and names. At least coordinates and station codes must be present
# in either csvfile or stnfile. Put a zero for any nonexistent columns.
# Example when stnfile contains only, in this order, latitudes, longitudes
# and station names: stncol=c(2,1,0,3,0)
#varcli: (short) name of the climatic variable under study
#anyi: first year to study
#anyf: last year to study
#mindat: minimum required number of data per station (by default, 60 monthly
# values or 365 daily values)
#sep: data separator (',' by default: Comma Separated Values)
#dec: decimal point ('.' by default)
#na.strings: strings coding missing data ('NA' by default)
#dateformat: format of dates (if not in separate columns. Default '%Y-%m-%d')
#cf: conversion factor to apply if data units need to be changed
#ndec: no. of decimals to round to
#header: TRUE by default, set to FALSE if csvfile has no header
#NOTE that if a stnfile is provided, then sep, dec, na.strings and header
# defined for csvfile will also be applied to stnfile.
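#Example (hypothetical file mydata.csv with columns Code, Name, Longitude,
# Latitude, Elevation, Date, Value, in this order):
# csv2climatol('mydata.csv', datacol=c(1,6,7), stncol=c(3,4,5,1,2),
#   varcli='Temp')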
#----------------- Operation: -----------------------------------
#read input table:
d <- read.csv(csvfile,sep=sep,dec=dec,header=header,na.strings=na.strings,as.is=TRUE)
ldc <- length(datacol); jd <- datacol[ldc] #data column
#disregard lines without data:
d <- d[!is.na(d[,jd]),]
#find out no. of stations and dates range:
if(stnfile!=csvfile) stn <- read.csv(stnfile,sep=sep,dec=dec,header=header,
na.strings=na.strings,as.is=TRUE)[,stncol]
else stn <- unique(d[,stncol])
ne <- nrow(stn) #no. of stations
#if elevations are missing, set them to 99:
if(stncol[3]==0) stn <- cbind(stn[,1:2],99,stn[,3])
#if station names are missing, duplicate station codes:
if(ncol(stn)==4) stn <- cbind(stn,stn[,4])
stid <- as.character(stn[,4]) #station codes
dupl <- duplicated(stid)
if(sum(dupl)>0) { #remove duplicated codes:
zz <- unique(stn[dupl,4])
cat("Codes with different names or coordinates",':\n')
print(stn[stn[,4]%in%zz,])
cat("Only one version of names and coordinates will be kept!",'\n')
moda <- function(x) names(which.max(table(x))) #mode function
for(qz in zz) {
kz <- which(stn[,4]==qz)
for(j in 1:5) stn[kz,j] <- moda(stn[kz,j])
}
stn <- unique(stn); stid <- stn[,4] #updated station list and codes
}
ne <- length(stid) #no. of stations
if(ldc==3) {
if(!inherits(d[,datacol[2]],'character')) d[,datacol[2]] <-
as.character(d[,datacol[2]])
if(grepl('M',dateformat)) fech <- as.POSIXct(d[,datacol[2]],'utc',
dateformat)
else if(grepl('H',dateformat)) fech <- as.POSIXct(d[,datacol[2]],
'utc','%Y%m%d%H')
else fech <- as.Date(d[,datacol[2]],dateformat)
} else if(length(datacol)==4) fech <- as.Date(sprintf('%d-%02d-01',
d[,datacol[2]],d[,datacol[3]]))
else fech <- as.Date(sprintf('%d-%02d-%02d',d[,datacol[2]],d[,datacol[3]],
d[,datacol[4]]))
xstep <- min(diff(sort(unique(fech)))) #minimum time interval
tu <- units(xstep) #units of the minimum time interval
if((tu=='days' & xstep==1) | tu=='hours'| tu=='minutes' | tu=='seconds') {
nm <- 0; if(is.na(mindat)) mindat <- 365 } #daily or subdaily values
else { nm <- 12; if(is.na(mindat)) mindat <- 60 } #monthly values
nas <- which(is.na(fech))
z <- as.integer(strftime(range(fech,na.rm=TRUE),'%Y'))
if(is.na(anyi)) anyi <- z[1] #initial year of data
if(is.na(anyf)) anyf <- z[2] #final year of data
#target dates vector:
if(nm==0) {
if(tu=='days') x <- seq(as.Date(sprintf('%d-01-01',anyi)),
as.Date(sprintf('%d-12-31',anyf)),1)
else x <- seq(as.POSIXct(sprintf('%d-01-01 00:00:00',anyi),'utc'),
as.POSIXct(sprintf('%d-12-31 23:59:59',anyf),'utc'),
by=sprintf('%d %s',xstep,units(xstep)))
} else x <- seq(as.Date(sprintf('%d-01-01',anyi)),
as.Date(sprintf('%d-12-01',anyf)),'1 month')
nd <- length(x) #number of dates (=data per station)
#initialize data matrix:
dat <- matrix(NA,nd,ne)
#populate data matrix:
cat(sprintf("Creating %s input files for Climatol from %s",
varcli,csvfile),'...\n')
for(i in 1:ne) { #for every station
cat(sprintf(' %s',stid[i]))
sel <- d[,datacol[1]]==stid[i] #select lines of current station
ds <- d[sel,jd] #data
fe <- fech[sel] #dates
kd <- match(fe,x) #match data dates with the dates vector
#avoid "NAs are not allowed in subscripted assignments" error:
z <- is.na(kd); if(sum(z)>0) { ds <- ds[!z]; kd <- kd[!z] }
dat[kd,i] <- round(ds*cf,ndec)
}
cat('\n')
#remove stations without mindat data:
ndat <- apply(!is.na(dat),2,sum)
sel <- ndat < mindat
if(sum(sel)==ne) stop("Not enough data in any station. No files created!")
if(sum(sel)>0) { dat <- dat[,!sel]; stn <- stn[!sel,] }
#write data file:
fich <- sprintf('%s_%s-%s.dat',varcli,anyi,anyf)
write(dat,fich,ncolumns=ifelse(nm==0,10,12))
cat('\n',"Data saved to file",fich,':\n')
print(summary(as.vector(dat)))
#write stations file:
fich <- sprintf('%s_%s-%s.est',varcli,anyi,anyf)
stn[,1:3] <- sapply(stn[,1:3],as.numeric) #avoid coordinates as characters
write.table(stn,fich,row.names=FALSE,col.names=FALSE)
if(length(stncol)==5) {
cat('\n',"Station coordinates and names saved to file",fich,':\n')
names(stn) <- c("X (lon)","Y (lat)","Z (elev)","Code","Name")
print(summary(stn))
} else {
cat('\n',"Station data saved to file",fich,'\n')
cat("It should have columns: X (lon), Y (lat), Z (elev), Code, Name",'\n')
cat("Please, edit the file to add the missing items in that order.",'\n\n')
}
if(length(nas)>20) {
write.table(d[nas,],sprintf('%s-wrong_dates.txt',csvfile))
cat("Skipped data lines because of wrong dates are listed in file",
sprintf('%s-wrong_dates.txt',csvfile),'\n')
} else if(length(nas)>0) {
cat("Skipped data lines because of wrong dates:",'\n')
print(d[nas,]); cat('\n')
}
}
#- daily2climatol.- Convert daily data files to Climatol input format.
daily2climatol <- function(stfile, stcol=1:6, datcol=1:4, varcli='VRB',
anyi=NA, anyf=NA, mindat=365, sep=',', dec='.', na.strings='NA',
dateformat='%Y-%m-%d', header=TRUE) {
#stfile: file with file names and station coordinates, codes and names
#stcol: columns in stfile holding file names, longitudes, latitudes,
# elevations and station codes and names. (Defaults to 1:6. Use 0 for codes
# and/or names columns if they are missing, and numeric values will be
# assigned.)
#datcol: columns in data files holding year,month,day,value or date,value
# (defaults to 1:4).
#varcli: short name of the climatic variable under study
#anyi: first year to study (defaults to the first year available in data)
#anyf: last year to study (defaults to the last year available in data)
#mindat: minimum required number of data per station
#sep: Field separator in all files, whether data or stations. (',' by default.)
#dec: decimal point ('.' by default)
#na.strings: strings coding missing data ('NA' by default)
#dateformat: format of dates if not in separate columns. ('%Y-%m-%d' by default.)
#header: TRUE by default, set to FALSE if files do not have a header
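#Example (hypothetical stations.csv listing, for every station, its data file
# name, longitude, latitude, elevation, code and name):
# daily2climatol('stations.csv', varcli='Tmax')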
ldc <- length(datcol)
if(ldc!=2 & ldc!=4) stop(paste('datcol',"must be a vector of 2 or 4 elements."))
if(ldc==2) dc <- datcol[2] else dc <- datcol[4] #data column
#read stations file:
st <- read.table(stfile,as.is=TRUE,sep=sep,dec=dec,header=header)
ne <- nrow(st) #no. of stations
if(is.na(anyi) | is.na(anyf)) { #check the time period of the data:
cat('\n',"Checking the period covered by the data",'...\n')
inidate <- as.Date('3000-12-31'); enddate <- as.Date('0001-01-01')
for(i in 1:ne) { #for every station
cat('',i)
d <- read.table(st[i,stcol[1]],sep=sep,dec=dec,header=header,na.strings=na.strings)
d <- d[!is.na(d[,dc]),] #remove lines without data
if(ldc==2) dates <- as.Date(d[,datcol[1]],dateformat)
else dates <- as.Date(sprintf('%d-%02d-%02d',d[,datcol[1]],d[,datcol[2]],d[,datcol[3]]))
nadates <- is.na(dates)
if(sum(nadates)>0) {
cat("Abnormal dates found in file",st[i,stcol[1]],':\n')
print(d[nadates,])
}
rdates <- range(dates,na.rm=TRUE) #range of dates with data
if(rdates[1]<inidate) inidate <- rdates[1]
if(rdates[2]>enddate) enddate <- rdates[2]
}
cat('\n')
} else {
if(anyf<anyi) stop("Set initial year (anyi) lower or equal than final year (anyf)")
inidate <- as.Date(sprintf('%d-01-01',anyi))
enddate <- as.Date(sprintf('%d-12-31',anyf))
}
dates <- seq(inidate,enddate,by='1 day') #vector of dates
nd <- length(dates) #number of dates (=data per station)
cat(sprintf("%d days between %s and %s",nd,inidate,enddate),'\n')
dat <- matrix(NA,nd,ne)
#populate data matrix:
cat('\n',"Creating",varcli,"input files for Climatol from daily files",'...:\n')
for(i in 1:ne) { #for every station
cat(sprintf('%3d %s\n',i,st[i,stcol[1]]))
d <- read.table(st[i,stcol[1]],sep=sep,dec=dec,header=header,na.strings=na.strings)
d <- d[!is.na(d[,dc]),] #remove lines without data
if(ldc==2) ddates <- as.Date(d[,datcol[1]],dateformat)
else ddates <- as.Date(sprintf('%d-%02d-%02d',d[,datcol[1]],d[,datcol[2]],d[,datcol[3]]))
kd <- match(ddates,dates) #match data dates with the dates vector
#avoid 'NAs are not allowed in subscripted assignments' error:
if(sum(is.na(kd))>0) { d <- d[!is.na(kd),]; kd <- kd[!is.na(kd)] }
ddat <- d[,dc]
dat[kd,i] <- ddat
}
# dat[dat==mis] <- NA #use R missing data code
#remove stations without mindat data:
ndat <- apply(!is.na(dat),2,sum)
sel <- ndat < mindat
if(sum(sel)>0) {
cat('\n',"Stations with less than",mindat,"data: ",which(sel),'\n')
if(sum(sel)==ne) stop("No station has enough data!")
dat <- dat[,!sel]; st <- st[!sel,]; ne <- nrow(st) }
#write data file:
anyi <- format(inidate,'%Y'); anyf <- format(enddate,'%Y')
fich <- sprintf('%s_%s-%s.dat',varcli,anyi,anyf)
write(dat,fich,ncolumns=10)
cat('\n',"Data saved to file",fich,':\n')
print(summary(as.vector(dat)))
#write stations file:
nc <- ncol(st)
#assign numeric codes and names if not provided:
if(stcol[5]==0) cod <- as.character(1:ne) else cod <- st[,stcol[5]]
if(stcol[6]==0) nam <- as.character(1:ne) else nam <- st[,stcol[6]]
st <- cbind(st[,stcol[2:4]],cod,nam)
fich <- sprintf('%s_%s-%s.est',varcli,anyi,anyf)
write.table(st,fich,row.names=FALSE,col.names=FALSE)
cat('\n',"Station coordinates and names saved to file",fich,':\n')
names(st) <- c("X (lon)","Y (lat)","Z (elev)","Code","Name")
print(summary(st))
}
#- rclimdex2climatol.- Convert RClimDEX daily data files to Climatol format.
rclimdex2climatol <- function(stfile, stcol=1:5, kvar, chrcod=c(6,10),
sep='', anyi=NA, anyf=NA, mis=-99.9, mindat=365, header=TRUE) {
#stfile: file with the data file names and station coordinates (in degrees):
# dataFile latitude longitude elevation stationName
#stcol: columns in stfile holding file names, longitudes, latitudes,
# elevations and station names. (Defaults to 1:5)
#kvar: RClimDEX variable to extract: 1(RR), 2(TX), 3(TN)
#chrcod: initial and final characters of data file names to use as codes
#sep: column separator (space or tab by default)
#anyi: initial year to study (defaults to the first year available in data)
#anyf: final year to study (defaults to the last year available in data)
#mis: missing data code [-99.9]
#mindat: minimum number of data per station
#header: do files have a header line? [TRUE]
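#Example (hypothetical stations.txt; with data file names such as
# 'data_12345.txt', characters 6 to 10 yield the station code '12345'):
# rclimdex2climatol('stations.txt', kvar=2) #extract maximum temperatures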
varcli <- c('RR','TX','TN')[kvar] #(short) name of the variable
cat('\n',"Creating",varcli,"Climatol input files from RClimDEX files",'...:\n\n')
st <- read.table(stfile,sep=sep,header=header,as.is=TRUE) #stations
ne <- nrow(st)
if(is.na(anyi) | is.na(anyf)) { #check the time period of the data:
inidate <- as.Date('3000-12-31'); enddate <- as.Date('0001-01-01')
for(i in 1:ne) { #for every station
d <- read.table(st[i,stcol[1]],header=header,sep=sep)
dates <- as.Date(sprintf('%d-%02d-%02d',d[,1],d[,2],d[,3]))
rdates <- range(dates,na.rm=TRUE) #range of dates with data
nadates <- is.na(dates)
if(sum(nadates)>0) {
cat("Abnormal dates found in file",st[i,stcol[1]],':\n')
print(d[nadates,])
}
if(rdates[1]<inidate) inidate <- rdates[1]
if(rdates[2]>enddate) enddate <- rdates[2]
}
}else {
if(anyf<anyi) stop("Set initial year (anyi) lower or equal than final year (anyf)")
inidate <- as.Date(sprintf('%d-01-01',anyi))
enddate <- as.Date(sprintf('%d-12-31',anyf))
}
dates <- seq(inidate,enddate,by='1 day') #vector of dates
nd <- length(dates) #number of dates (=data per station)
dat <- matrix(NA,nd,ne)
#populate data matrix:
for(i in 1:ne) { #for every station
cat(st[i,stcol[1]],'\n')
d <- read.table(st[i,stcol[1]],header=header,sep=sep) #data
ddates <- as.Date(sprintf('%d-%02d-%02d',d[,1],d[,2],d[,3]))
kd <- match(ddates,dates) #match data dates with the dates vector
#avoid "NAs are not allowed in subscripted assignments" error:
if(sum(is.na(kd))>0) { d <- d[!is.na(kd),]; kd <- kd[!is.na(kd)] }
ddat <- d[,kvar+3]
dat[kd,i] <- ddat
}
dat[dat==mis] <- NA #use R missing data code
#remove stations without mindat data:
ndat <- apply(!is.na(dat),2,sum)
sel <- ndat < mindat
if(sum(sel)>0) { dat <- dat[,!sel]; st <- st[!sel,] }
#write data file:
anyi <- format(inidate,'%Y'); anyf <- format(enddate,'%Y')
fich <- sprintf('%s_%s-%s.dat',varcli,anyi,anyf)
write(dat,fich,ncolumns=10)
cat('\n',"Data from",format(inidate),"to",format(enddate),"saved to file",fich,'\n')
#find longest period without concurrent missing data in all stations:
avd <- apply(!is.na(dat),1,sum)>0
if(sum(!avd)>0) {
zrle <- rle(avd) #run lengths of data availability
maxrle <- which.max(zrle$lengths)
ki <- diffinv(zrle$lengths)[maxrle]+1
kf <- diffinv(zrle$lengths)[maxrle+1]
cat("The longest period without concurrent missing data in all stations",'\n')
cat(" goes from",format(dates[ki]),"to",format(dates[kf]),'\n')
}
#write stations file:
cod <- substr(st[,stcol[1]],chrcod[1],chrcod[2])
df <- data.frame(st[,stcol[c(3,2,4)]],cod,st[,stcol[5]])
fich <- sprintf('%s_%s-%s.est',varcli,anyi,anyf)
write.table(df,fich,row.names=FALSE,col.names=FALSE)
cat("Station coordinates and names saved to file",fich,'\n\n')
}
#- climatol2rclimdex.- Convert DAILY data from Climatol to RClimDEX.
climatol2rclimdex <- function(varRR, varTX, varTN, yiRR, yfRR, yiTX=yiRR,
yfTX=yfRR, yiTN=yiRR, yfTN=yfRR, header=TRUE, prefix='hoclm', dir=NA,
na='-99.9') {
#varRR, varTX, varTN: Name of the variables in the climatol files. If some
# variable is not available, name it as ''.
#yiRR, yfRR: Initial and final years of the homogenized RR series.
#yiTX, yfTX, yiTN, yfTN: Initial and final years of the TX and TN series.
# The same as yiRR and yfRR by default.
#header: include a header in the files? (TRUE by default)
#prefix: prefix to prepend to station codes to name the output RClimDEX files.
#dir: Destination directory of the output RClimDEX files.
#na: Missing data code to use in the output RClimDEX files.
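#Example (assuming daily homogenizations RR_1981-2020.rda, TX_1981-2020.rda
# and TN_1981-2020.rda exist in the working directory):
# climatol2rclimdex('RR','TX','TN',1981,2020)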
nm <- x <- dah <- nei <- est.c <- NULL #(avoid invisible bindings)
anyi <- max(c(yiRR,yiTX,yiTN)) #initial year of the output
anyf <- min(c(yfRR,yfTX,yfTN)) #final year of the output
if(!is.na(dir)) if(!dir.exists(dir)) dir.create(dir) #output directory
fech <- seq(as.Date(sprintf('%s-01-01',anyi)),as.Date(sprintf('%s-12-31',anyf)),by='1 day')
ndd <- length(fech) #no. of daily data per station
avl <- rep(FALSE,3) #availability flags
cod <- NULL
#-------- read results for the three daily variables (if available):
#precipitation:
if(varRR != '') {
load(sprintf('%s_%d-%d.rda',varRR,yiRR,yfRR))
if(nm>0) stop(sprintf("Data in %s_%d-%d.rda does not seem to be DAILY!",
varRR,yiRR,yfRR))
self <- match(fech,x) #selected days
dRR <- dah[self,1:nei] #selected data (from last homogeneous fragments)
sRR <- est.c[1:nei,4] #selected stations
if(length(sRR)>0) { avl[1] <- TRUE; cod <- sRR }
}
#maximum temperatures:
if(varTX != '') {
load(sprintf('%s_%d-%d.rda',varTX,yiTX,yfTX))
if(nm>0) stop(sprintf("Data in %s_%d-%d.rda does not seem to be DAILY!",
varTX,yiTX,yfTX))
self <- match(fech,x) #selected days
dTX <- dah[self,1:nei] #selected data (from last homogeneous fragments)
sTX <- est.c[1:nei,4] #selected stations
if(length(sTX)>0) {
avl[2] <- TRUE
if(is.null(cod)) cod <- sTX else cod <- intersect(cod,sTX)
if(length(cod)==0) stop(sprintf("No common station codes found in %s and %s",varRR,varTX))
}
}
#minimum temperatures:
if(varTN != '') {
load(sprintf('%s_%d-%d.rda',varTN,yiTN,yfTN))
if(nm>0) stop(sprintf("Data in %s_%d-%d.rda does not seem to be DAILY!",
varTN,yiTN,yfTN))
self <- match(fech,x) #selected days
dTN <- dah[self,1:nei] #selected data (from last homogeneous fragments)
sTN <- unsufix(est.c[1:nei,4]) #selected stations
if(length(sTN)>0) {
avl[3] <- TRUE
if(is.null(cod)) cod <- sTN else cod <- intersect(cod,sTN)
if(length(cod)==0) stop(sprintf("No common station codes found in %s and %s",varRR,varTN))
}
}
#-------- sort common station codes for the available variables:
cod <- sort(cod)
#-------- write RClimDEX files (one per station):
ne <- length(cod) #no. of stations
cat('\n',sprintf("Creating %d RClimDEX files from climatol homogenizations for %d-%d",ne,anyi,anyf),':\n\n')
for(i in 1:ne) { #for every station
if(is.na(dir)) stfile <- sprintf('%s%s.txt',prefix,cod[i])
else stfile <- sprintf('%s/%s%s.txt',dir,prefix,cod[i])
cat(' ',stfile)
dat <- matrix(NA,ndd,3)
if(avl[1]) dat[,1] <- dRR[,which(sRR==cod[i])]
if(avl[2]) dat[,2] <- dTX[,which(sTX==cod[i])]
if(avl[3]) dat[,3] <- dTN[,which(sTN==cod[i])]
#exchange TX with TN when TX<TN:
k <- which(dat[,2]<dat[,3])
if(length(k)>0) {
z <- dat[k,2]; dat[k,2] <- dat[k,3]; dat[k,3] <- z
cat(sprintf(" %d days with TX < TN fixed\n",length(k)))
} else cat('\n')
#write the RClimDEX file:
df <- data.frame(format(fech,'%Y'),format(fech,'%m'),format(fech,'%d'),dat)
names(df) <- c("Year","Month","Day",'RR','TX','TN')
write.table(df,stfile,sep='\t',quote=FALSE,row.names=FALSE,
col.names=header,na=na)
}
cat('\n')
kest <- match(cod,est.c[,4])
df <- data.frame(est.c[kest,c(4,5,2,1,3)],'XX')
names(df) <- c("ID","STAT_NAME","LATITUDE","LONGITUDE","ELEVATION","COUNTRY")
write.table(df,'hoclm_stations.txt',sep='\t',row.names=FALSE)
cat("Stations file",'hoclm_stations.txt',"has been saved.",'\n')
}
#- sef2climatol.- Convert SEF data files to CLIMATOL input files.
#SEF stands for Station Exchange Format. Visit:
# https://datarescue.climate.copernicus.eu/node/80
#Missing elevations will be assigned the value 99
sef2climatol <- function(dr, Vbl, varcli=Vbl, ndec=1, na.strings='NA',
mindat=NA) {
#dr: directory containing the SEF files
#Vbl: name of the variable in the SEF files
#varcli: name of the variable in the Climatol destination files
#ndec: number of decimals to save
#na.strings: missing data codes (specified as quoted strings)
#mindat: minimum required number of data per station
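#Example (hypothetical directory SEFdir holding SEF files of variable 'ta'):
# sef2climatol('SEFdir','ta',varcli='Temp')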
Fs <- file('SEFdata.csv','w') #open auxiliary file
for(fich in dir(dr)) { #for every file in directory dr
cat(fich,'\n')
Fe <- file(sprintf('%s/%s',dr,fich),'r') #open for reading
li <- readLines(Fe,1)
if(substr(li,1,3)!='SEF') { cat(": Not a SEF file"); close(Fe); next }
li <- readLines(Fe,1)
cod <- unlist(strsplit(li,'\t'))[2] #station code
li <- readLines(Fe,1)
nom <- unlist(strsplit(li,'\t'))[2] #station name
li <- readLines(Fe,1)
Y <- as.numeric(unlist(strsplit(li,'\t'))[2]) #Y (longitude)
li <- readLines(Fe,1)
X <- as.numeric(unlist(strsplit(li,'\t'))[2]) #X (latitude)
li <- readLines(Fe,1)
Z <- unlist(strsplit(li,'\t'))[2]
if(is.na(Z)|Z==na.strings) Z <- 99 else Z <- as.numeric(Z) #Z (elevation)
li <- readLines(Fe,3)
vrb <- unlist(strsplit(li[3],'\t'))[2] #Vbl
if(vrb!=Vbl) { cat(": Not variable",Vbl); close(Fe); next }
li <- readLines(Fe,3)
#read data table. (Some files contain metadata such as |DSFLAG="|, whose
#unpaired quote would make read.table consume everything up to the next
#quote in a following line, skipping half of the data. Parameter
#quote='\\' is set as a workaround.)
d <- read.table(Fe,sep='\t',na.strings=na.strings,quote='\\',header=TRUE)
nas <- is.na(d[,3])
if(sum(nas)>0) d[nas,3] <- '01' #monthly values
write(sprintf('%f,%f,%f,\'%s\',\'%s\',\'%s\',%s,%f',X,Y,Z,cod,nom,cod,
sprintf('%s-%s-%s',d[,1],d[,2],d[,3]),d[,7]),Fs)
close(Fe)
}
close(Fs)
cat('\n')
csv2climatol('SEFdata.csv',varcli=varcli,ndec=ndec,mindat=mindat,header=FALSE)
file.remove('SEFdata.csv')
}
#- dahgrid.- Obtain grids of homogenized data.
dahgrid <- function(varcli, anyi, anyf, anyip=anyi, anyfp=anyf, grid, idp=2.0,
obsonly=TRUE, nmax=Inf) {
#anyip: first reference year for anomalies calculation.
#anyfp: final reference year for anomalies calculation.
#grid: base grid for interpolation, of class SpatialPoints.
#idp: Power of the inverse distance weights (2 by default).
#obsonly: Do not interpolate missing data estimated by homogen().
#nmax: Maximum number of nearest stations to use (all by default).
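#Example (assuming a Temp_1981-2020.rda file produced by homogen() exists;
# grid limits and resolution are arbitrary and should be adapted to the
# studied station network):
# grd <- sp::SpatialPoints(expand.grid(x=seq(-2,1,.1), y=seq(39,41,.1)))
# sp::gridded(grd) <- TRUE
# dahgrid('Temp',1981,2020,grid=grd)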
nei <- nd <- nm <- std <- est.c <- dat <- NULL #(avoid invisible bindings)
if(!requireNamespace('sp', quietly=TRUE)
| !requireNamespace('gstat', quietly=TRUE)
| !requireNamespace('raster', quietly=TRUE)
| !requireNamespace('ncdf4', quietly=TRUE)
) stop("This function requires packages sp, gstat, raster and ncdf4.\nPlease, install the lacking packages and re-run the function")
if(anyip<anyi) stop("Asked initial reference year before first year of data!")
if(anyfp>anyf) stop("Asked final reference year beyond last year of data!")
#- read original and homogenized data
fbas <- sprintf('%s_%d-%d',varcli,anyi,anyf) #base file name
load(sprintf('%s.rda',fbas))
#- select series from the last fragments
dah <- dah[,1:nei]
#- calculate their means and standard deviations in the chosen period
if(anyip==anyi & anyfp==anyf) { ki <- 1; kf <- nd } #initial and final pos.
else if(nm==0) { #daily data
ki <- which(x==as.Date(sprintf('%d-01-01',anyip)))
kf <- which(x==as.Date(sprintf('%d-12-31',anyfp)))
} else { ki <- (anyip-anyi)*nm+1; kf <- ki+(anyfp-anyip+1)*nm-1 }
m <- apply(dah[ki:kf,],2,mean)
if(std>2) s <- apply(dah[ki:kf,],2,sd)
#- save the statistics with coordinates to allow their use with a GIS
if(std<3) {
df <- data.frame(est.c[1:nei,1:4],m)
names(df) <- c('X','Y','Z','Code','Mean')
} else {
df <- data.frame(est.c[1:nei,1:4],m,s)
names(df) <- c('X','Y','Z','Code','Mean','Std.Dev.')
}
fmeans <- sprintf('%s_msd.csv',fbas)
write.csv(df,fmeans,row.names=FALSE)
#- normalize the series
switch(std,
daz <- scale(dah,center=m,scale=FALSE), #std=1
daz <- scale(dah,center=FALSE,scale=m),
daz <- scale(dah,center=m,scale=s) #std=3 (default)
)
#- if(obsonly), blank data missing in the original series
if(obsonly) daz[is.na(dat)] <- NA
rg <- range(daz,na.rm=TRUE)
#- interpolate means (and std. dev.), and save them in NetCDF
df <- data.frame(est.c[1:nei,1:2],m)
names(df) <- c('x','y','z')
sp::coordinates(df) <- ~ x+y
m <- gstat::idw(z~1,df,grid,nmax=nmax,idp=idp,debug.level=0) #means
dimLon <- ncdf4::ncdim_def(name='lon', units='degrees_east', vals=unique(grid@coords[,1]))
dimLat <- ncdf4::ncdim_def(name='lat', units='degrees_north', vals=rev(unique(grid@coords[,2])))
varCli.m <- ncdf4::ncvar_def(name=sprintf('%s.m',varcli), units='', dim=list(dimLon,
dimLat), missval=NA, longname=sprintf('%s %d-%d means',varcli,anyip,anyfp))
listvar <- list(varCli.m)
nc <- ncdf4::nc_create(sprintf('%s_m.nc',fbas), listvar) #open netCDF file
zz <- raster::rasterFromXYZ(m)
ncdf4::ncvar_put(nc,varCli.m,zz@data@values)
ncdf4::nc_close(nc)
if(std>2) {
df <- data.frame(est.c[1:nei,1:2],s)
names(df) <- c('x','y','z')
sp::coordinates(df) <- ~x+y
s <- gstat::idw(z~1,df,grid,nmax=nmax,idp=idp,debug.level=0) #std interp.
varCli.s <- ncdf4::ncvar_def(name=sprintf('%s.s',varcli), units='',
dim=list(dimLon, dimLat), missval=NA,
longname=sprintf('%s %d-%d std. deviations',varcli,anyip,anyfp))
listvar <- list(varCli.s)
nc <- ncdf4::nc_create(sprintf('%s_s.nc',fbas), listvar) #open netCDF file
zz <- raster::rasterFromXYZ(s)
ncdf4::ncvar_put(nc,varCli.s,zz@data@values)
ncdf4::nc_close(nc)
}
#- === create a netCDF with grids interpolated at every time step
if(is.na(ini)) ini <- sprintf('%d-01-01',anyi) #default initial date
if(nm>0) x <- seq(as.Date(ini),length.out=nd,by=sprintf('%d months',12/nm))
else x <- seq(as.Date(ini),length.out=nd,by='1 day')
dimTime <- ncdf4::ncdim_def(name='Date', units='days since 1970-01-01',
vals=as.numeric(x), calendar='standard')
varCli <- ncdf4::ncvar_def(name=varcli, units='', dim=list(dimLon, dimLat,
dimTime), missval=NA)
listvar <- list(varCli)
nc <- ncdf4::nc_create(sprintf('%s.nc',fbas), listvar) #open netCDF file
#- for every time step:
cat(sprintf("Interpolating %d grids...: ",nd))
kz <- max(10,round(nd/100))
for(k in 1:nd) {
if(!k%%kz) cat('\b\b\b\b\b',sprintf('%2s %%',round(k*100/nd)))
#- interpolate (IDW) the normalized variable at grid points
df <- data.frame(est.c[1:nei,1:2],daz[k,])
if(obsonly) df <- df[!is.na(df[,3]),]
names(df) <- c('x','y','z')
sp::coordinates(df) <- ~x+y
z <- gstat::idw(z~1,df,grid,nmax=nmax,idp=idp,debug.level=0) #std interp.
#from SpatialPointsDataFrame to RasterLayer:
zz <- raster::rasterFromXYZ(z)
#- save values in the netCDF
ncdf4::ncvar_put(nc,varCli,zz@data@values,start=c(1,1,k),count=c(-1,-1,1))
}
cat(" (done)",'\n\n')
#- close the netCDF file and finish
ncdf4::nc_close(nc)
cat(sprintf("Normalized grids (%f to %f) saved to file %s.nc",rg[1],rg[2],fbas),'\n')
cat("Means")
if(std>2) cat(" and standard deviations")
cat(" (of the whole series) saved to files",'\n')
cat(sprintf('%s_m.nc',fbas))
if(std>2) cat(',',sprintf('%s_s.nc',fbas))
cat(" and",fmeans,'\n\n')
}
#- dahstat.- Extract series or statistics of the homogenized data.
dahstat <- function(varcli, anyi, anyf, anyip=anyi, anyfp=anyf, stat='me',
ndc=NA, vala=2, valm=vala, cod=NULL, prob=.5, all=FALSE, long=FALSE,
relref=FALSE, pernyr=10, estcol=c(1,2,4), sep=',', dec='.') {
# varcli: (Short) name of the homogenized climatic variable
# anyi: First year of the homogenized series
# anyf: Last year of the homogenized series
# anyip: First year for the statistical calculation
# anyfp: Last year for the statistical calculation
# stat: Statistic to calculate (one of "me"(means), "mdn"(medians), "max"(maxima), "min"(minima), "std"(standard deviations), "q"(quantiles), "tnd"(OLS trends and their p-values), "series"(none, just save homogenized series into a CSV file, plus another file with flags)
# ndc: No. of decimals (defaults to that used in the homogenization)
# vala: Annual value: 0(none), 1(sum), 2(mean), 3(maximum), 4(minimum)
# valm: Monthly value: 1(sum), 2(mean), 3(maximum), 4(minimum)
# cod: vector of requested station codes (all by default)
# prob: Probability to calculate quantiles (0.5 by default)
# all: If TRUE, all reconstructed series will be used. The default is FALSE, hence using only the series reconstructed from the last homogeneous subperiod
# long: If TRUE (the default is FALSE), only series reconstructed from the longest homogeneous subperiod will be used
# relref: Set to TRUE to use also any added reliable reference series
# pernyr: No. of years on which to express trend units (10 by default)
# estcol: Columns of est.c to include in the output tables (defaults to c(1,2,4): coordinates and station codes)
# sep: Field separator (',' by default)
# dec: Decimal point ('.' by default)
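#Examples (assuming a Temp_1981-2020.rda file produced by homogen() exists):
# dahstat('Temp',1981,2020) #means of the homogenized series
# dahstat('Temp',1981,2020,stat='tnd') #OLS trends and their p-values
# dahstat('Temp',1981,2020,stat='series') #dump series and flags to CSV files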
#- initializations
if(valm==0) valm <- 2 #valm is needed for monthly aggregates
na <- anyf-anyi+1 #no. of years
if(anyi>anyf) stop ("First year of data greater than the last year!")
if(anyip<anyi) stop("Asked initial year before first year of data!")
if(anyfp>anyf) stop("Asked final year beyond last year of data!")
#chosen function to calculate monthly aggregates:
fun <- c('mean','median','max','min','sd','quantile')[which(c('me','mdn',
'max','min','std','q','tnd')==stat)]
lmcoef <- function(y,x) coef(summary(lm(y~x)))[2,c(1,4)] #regression coef.
#- unrecognized stat option? finish
if(!length(fun) & stat!='series' & stat!='mseries') {
cat('stat',"should be one of",'\'mean\',\'median\',\'max\',\'min\',\'sd\',\'quantile\'\n')
stop(sprintf("Option stat='%s' not recognized!",stat))
}
if(stat=='q') {
if(length(prob)>1) stop("Please provide a single probability for the quantile calculation")
if(prob<0 | prob>1) stop("prob must be a value between 0 and 1")
}
mes3 <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct",
"Nov","Dec")
if(all) long <- FALSE #long not needed if all=TRUE
#- read input data
load(sprintf('%s_%d-%d.rda',varcli,anyi,anyf))
#disregard trusted reference series?:
if(!relref) {
kr <- substr(est.c$Code,1,1)=='*'
if(sum(kr)>0) { #if there are trusted reference series:
nei <- nei-sum(kr[1:nei])
est.c <- est.c[!kr,]; ne <- nrow(est.c)
dat <- dat[,!kr[1:nei]]; dah <- dah[,!kr]
}
}
if(!is.na(ndc)) ndec <- ndc
codo <- unsufix(est.c$Code) #original station codes of all series
estvar <- names(est.c)
if(nm==1 | stat=='series') vala <- 0 #annual value not needed
else {
if(nm==0) {
z <- seq(as.Date(sprintf('%s-01-01',anyi)),as.Date(sprintf('%s-12-31',anyf)),1)
if(length(z)!=nd) stop("Statistics need complete years to be calculated")
}
if(vala<1 | vala>4) vala <- 2 #if vala out of range, set it to mean
funa <- c('sum','mean','max','min')[vala] #function for the annual value
}
#- locate data of the requested period:
if(anyip!=anyi | anyfp!=anyf) {
yy <- as.integer(strftime(x,'%Y'))
xk <- min(which(yy==anyip)):max(which(yy==anyfp))
} else xk <- 1:nd
#- select requested stations:
esel <- rep(TRUE,ne)
if(!is.null(cod)) {
ksel <- which(codo %in% cod) #requested stations
esel[-ksel] <- FALSE
} else cod <- est.c[1:nei,4]
if(!all & ne>nei) esel[(nei+1):ne] <- FALSE #last fragments only
else if(long) {
lsel <- rep(TRUE,length(esel)) #initialize vector
for(ko in 1:nei) { #for every original station
kest <- which(codo==est.c$Code[ko]) #series of the same station ko
if(length(kest)>1) { #if more than one fragment...
ksel <- which.max(est.c$pod[kest]) #highest % of data
lsel[kest[-ksel]] <- FALSE #chosen selection
}
}
esel <- esel & lsel
}
if(sum(esel)==0) stop("No station selected! (No output)")
codo <- codo[esel] #original station codes
est.c <- est.c[esel,] #selected stations
#- select data from chosen period and stations
dah <- dah[xk,esel]; dat <- dat[xk,esel[1:nei]] #selected data
#update parameters:
x <- x[xk]; ne <- sum(esel); nei <- length(cod); nd <- length(x)
na <- anyfp-anyip+1 #no. of years
iest <- match(codo,cod) #index of original stations
#- if(stat=="series"), list series and flags in CSV format
if(stat=='series') { #series, in two files (data and flags):
#compare homogenized and original data (avoid '=='!):
if(length(iest)==1) df <- abs(dah-dat) < 1e-9
else df <- abs(dah-dat[,iest]) < 1e-9
df <- as.numeric(df) #TRUE=1, FALSE=0
df[df==0] <- 2 #data different to the originals
df[df==1] <- 0 #data equal to the originals
df[is.na(df)] <- 1 #filled data (originally missing)
dim(df) <- dim(dah)
#name of output files:
ard <- sprintf('%s_%d-%d_series.csv',varcli,anyi,anyf)
arf <- sprintf('%s_%d-%d_flags.csv',varcli,anyi,anyf)
if(length(iest)==1) {
dah <- data.frame(cbind(format(x),dah))
df <- data.frame(cbind(format(x),df))
colnames(dah) <- colnames(df) <- c('Date',est.c[,4])
} else {
dah <- data.frame(cbind(format(x),dah[,order(iest)]))
df <- data.frame(cbind(format(x),df[,order(iest)]))
colnames(dah) <- colnames(df) <- c('Date',est.c[order(iest),4])
}
write.table(dah,ard,row.names=FALSE,quote=FALSE,sep=',',dec='.')
write.table(df,arf,row.names=FALSE,quote=FALSE,sep=',',dec='.')
cat(sprintf("Homogenized values written to %s,\nwith flags in %s:\n",ard,arf))
cat(' 0:',"Observed data",'\n')
cat(' 1:',"Missing data (filled in)",'\n')
cat(' 2:',"Corrected data",'\n')
return(invisible())
}
#- if(nm==0) calculate monthly aggregates:
if(nm==0) {
cat("Computing monthly aggregates... ")
me <- strftime(x,'%m'); anyo <- strftime(x,'%Y')
dm <- matrix(NA,na*12,ne) #monthly data
funm <- c('sum','mean','max','min')[valm] #function for the monthly values
if(ne==1) {
z <- aggregate(dah,list(me,anyo),funm)
dm <- round(z[,3],ndec)
} else {
for(ie in 1:ne) {
z <- aggregate(dah[,ie],list(me,anyo),funm)
dm[,ie] <- round(z[,3],ndec)
}
}
cat("Done.",'\n')
nm <- 12; dah <- dm
if(stat=='mseries') { #output monthly series and finish:
ard <- sprintf('%s_%d-%d_mseries.csv',varcli,anyi,anyf)
xm <- unique(format(x,'%Y-%m'))
if(length(iest)==1) {
dah <- data.frame(xm,dah)
colnames(dah) <- c('Date',est.c[,4])
} else {
dah <- data.frame(cbind(xm,dah[,order(iest)]))
colnames(dah) <- c('Date',est.c[order(iest),4])
}
write.table(dah,ard,row.names=FALSE,quote=FALSE,sep=',',dec='.')
cat(sprintf("\nHomogenized MONTHLY values written to %s\n",ard))
return(invisible())
}
}
dim(dah) <- c(nm,na,ne)
#- if(vala), calculate annual values
if(nm>1) { #calculate annual values
aval <- as.vector(apply(dah,2:3,funa))
dim(dah) <- c(nm,na*ne)
dah <- rbind(dah,aval)
nc <- nm+1 #no. of columns in the table
dim(dah) <- c(nc,na,ne)
} else nc <- 1
#initialize matrix:
val <- matrix(NA,ne,nc)
#- if(stat=="tnd"), calculate trends
if(stat=="tnd") {
ndec <- ndec+1 #add a decimal place for trends
pval <- val #matrix to allocate p-values
if(ne==1) {
z <- apply(dah,1,lmcoef,x=anyip:anyfp)
val <- t(as.data.frame(round(z[1,]*pernyr,ndec)))
pval <- t(as.data.frame(round(z[2,],3)))
} else {
z <- apply(dah,c(3,1),lmcoef,x=anyip:anyfp)
val <- round(z[1,,]*pernyr,ndec)
pval <- round(z[2,,],3)
}
}
#- else, apply the requested function
else {
for(i in 1:ne) {
if(nc==1) {
if(stat=='q') val[i,] <- round(eval(call(fun,dah[,,i],prob)),ndec)
else val[i,] <- round(eval(call(fun,dah[,,i])),ndec)
}
else { #monthly, bimonthly, quarterly or semester data:
if(stat=='q') val[i,] <- round(apply(dah[,,i],1,fun,prob),ndec)
else val[i,] <- round(apply(dah[,,i],1,fun),ndec)
}
}
}
#- issue message on the output files
if(stat=='me') cat("Mean")
else if(stat=='mdn') cat("Median")
else if(stat=='max') cat("Maximum")
else if(stat=='min') cat("Minimum")
else if(stat=='std') cat("Standard deviation")
else if(stat=='q') cat(prob,"prob. quantile")
else if(stat=='tnd') cat("Trend")
cat(" values of ",varcli,' (',anyip,'-',anyfp,')',sep='')
if(stat=='tnd') cat(", expressed in units per ",pernyr," years,",sep='')
dahs <- data.frame(cbind(est.c[,estcol],val))
if(nm==12) ndf <- c(estvar[estcol],mes3,"Annual")
else if(nm==6) ndf <- c(estvar[estcol],sprintf('Bim%d',1:6),"Annual")
else if(nm==4) ndf <- c(estvar[estcol],sprintf('Qrt%d',1:4),"Annual")
else if(nm==3) ndf <- c(estvar[estcol],sprintf('4mn%d',1:3),"Annual")
else if(nm==2) ndf <- c(estvar[estcol],sprintf('Sem%d',1:2),"Annual")
else if(nm==1) ndf <- c(estvar[estcol],"Annual")
else stop("Number of data per year is none of 1,2,3,4,6,12")
names(dahs) <- ndf
#- save values in output files
#output file:
if(stat=='q') ars <- sprintf('%s_%d-%d_%s%d.csv',varcli,anyip,anyfp,stat,round(100*prob))
else ars <- sprintf('%s_%d-%d_%s.csv',varcli,anyip,anyfp,stat)
write.table(dahs[order(est.c[,4]),],ars,row.names=FALSE,sep=',',dec='.')
cat('\n'," written to",ars,'\n')
if(stat=='tnd') { #save p-values
dahs2 <- data.frame(cbind(est.c[estcol],pval))
names(dahs2) <- ndf
ars <- sprintf('%s_%d-%d_pval.csv',varcli,anyip,anyfp)
write.table(dahs2[order(est.c[,4]),],ars,row.names=FALSE,sep=',',dec='.')
cat("P-values written to",ars,'\n')
}
cat('\n')
}
#- datsubset.- Subset data by subperiod, code list or no. of years with data.
datsubset <- function(varcli, anyi, anyf, anyis=anyi, anyfs=anyf, minny=NA,
codes=NULL, na.strings=NA, ini=NA) {
#varcli: (short) name of the climatic variable
#anyi, anyf: first and last year of the input data file
#anyis, anyfs: first and last year for data subsetting
#minny: minimum number of years with data to subset
#codes: vector of chosen station codes. (Defaults to NULL, meaning all)
#na.strings: strings marking missing data (NA by default)
#ini: initial date (if not January 1st)
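#Examples (assuming Temp_1961-2020.dat and Temp_1961-2020.est exist):
# datsubset('Temp',1961,2020,anyis=1991) #keep the 1991-2020 subperiod only
# datsubset('Temp',1961,2020,minny=40) #keep series with 40 or more years of data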
if(anyis==anyi & anyfs==anyf & is.na(minny) & is.null(codes))
stop("No subsetting required!\n")
if(anyis<anyi) stop("Asked initial selected year before first year of data!")
if(anyfs>anyf) stop("Asked final selected year beyond last year of data!")
#- read input data
z <- read.dat(varcli,anyi,anyf,ini=ini,na.strings=na.strings)
est.c <- z$est.c; dat <- z$dat; na <- z$na; nd <- z$nd; ne <- z$ne; x <- z$x
nm <- z$nm; rm(z) #free memory
nas <- anyfs-anyis+1 #no. of years in selected subperiod
fbas <- sprintf('%s_%d-%d',varcli,anyi,anyf) #base file name
fbas2 <- sprintf('%s_%d-%d',varcli,anyis,anyfs) #base output file name
if(fbas==fbas2) { #rename input data to avoid overwriting:
file.rename(sprintf('%s.dat',fbas),sprintf('%s-ori.dat',fbas))
file.rename(sprintf('%s.est',fbas),sprintf('%s-ori.est',fbas))
cat(sprintf("Original files renamed to %s-ori.dat and %s-ori.est\n",fbas,fbas))
}
#subset a subperiod of data? :
if(nas < na) {
xa <- strftime(x,'%Y') #years of every data
sel <- xa>=anyis & xa<=anyfs
dat <- dat[sel,]
}
#subset series with at least minny years with data? :
if(!is.na(minny)) {
#minny must be an integer equal or greater than 1:
if(minny!=round(minny)) minny <- round(minny); if(minny<1) minny <- 1
nad <- apply(!is.na(dat),2,sum) #no. of available data per station
if(nm>0) nyd <- nad/nm else nyd <- floor(nad/365.25) #no. of years with data
sel <- nyd >= minny; nes <- sum(sel) #no. of selected stations
if(nes==0) stop(sprintf("No series has >=%d years of data",minny))
if(nes < ne) { dat <- dat[,sel]; est.c <- est.c[sel,] }
}
#subset series of selected stations? :
if(!is.null(codes)) {
ke <- match(codes,est.c[,4]); ke <- ke[!is.na(ke)]
if(length(ke)==0) stop("Selected codes does not meet the other requirements")
dat <- dat[,ke]; est.c <- est.c[ke,]
}
#write output files:
if(nm>0) ncl <- nm else ncl <- 10
write(dat,sprintf('%s.dat',fbas2),ncolumns=ncl)
write.table(est.c,sprintf('%s.est',fbas2),col.names=FALSE,row.names=FALSE)
cat(sprintf('Subset data written to files %s.dat and %s.est\n',fbas2,fbas2))
}
#- db2dat.- Get data from a database and build input files *.dat and *.est for
# the homogen() function. (ODBC must be installed and properly configured.)
# ----------------------------------------------------------------------
#Example for a database called "climate", with user "USER" and password "PASS":
# R #start R (version 3 or higher)
# library(RODBC)
# ch <- odbcConnect("climate",uid="USER",pwd="PASS") #connect to database
# db2dat('HRel',1961,2015,10,FALSE,ch,'%Y-%m-%d','monthly_relhum','Station',
# 'Date','Value','stations','Station','Name','Longitude','Latitude','Elevation')
# odbcClose(ch) #close connection to the database
# ----------------------------------------------------------------------
# This example will compile monthly average relative humidity for the period
# 1961-2015 excluding series with less than 10 years of data (120 monthly data)
# in files HRel_1961-2015.dat and HRel_1961-2015.est, which you can
# homogenize later with the Climatol R package with, e.g.:
# library(climatol)
# homogen('HRel',1961,2015,vmin=0,vmax=100)
# -------------------------------------------------------------------
db2dat <- function(varcli, anyi, anyf, minny=5, daily=TRUE,ch,
dformat='%Y-%m-%d', vtable, vcode, vdate, vval, stable, scode, sname, sx, sy,
sz) {
#varcli: Short name of the climatic variable under study
#anyi: First year of the study period
#anyf: Last year of the study period
#minny: Minimum number of years with data in the series to study
#ch: Name of the ODBC connection to the database
#dformat:Format of dates in the database
#vtable: Name of the table containing our climatic variable
#vcode: Name of the variable containing station codes in the database
#vdate: Name of the variable containing dates in the database
#vval: Name of the climatic variable in the database
#stable: Name of the table containing station information (metadata)
#scode: Name of the variable containing station codes
#sname: Name of the variable containing station names
#sx: Name of the variable containing longitudes (degrees with decimals!)
#sy: Name of the variable containing latitudes (degrees with decimals!)
#sz: Name of the variable containing elevations (meters)
#- initializations
na <- anyf-anyi+1 #no. of years
if(na<=0) stop("Last year must be greater than the first year")
fini <- sprintf('%d-01-01',anyi)
if(daily) {
x <- seq(as.Date(sprintf('%d-01-01',anyi)),as.Date(sprintf('%d-12-31',anyf)),by='1 day')
ndmin <- round(minny*365.25) #min. no. of daily data
ffin <- sprintf('%d-12-31',anyf)
} else {
x <- seq(as.Date(sprintf('%d-01-01',anyi)),as.Date(sprintf('%d-12-01',anyf)),by='1 month')
ndmin <- minny*12 #min. no. of monthly data
ffin <- sprintf('%d-12-01',anyf)
}
nd <- length(x) #no. of data per station
#- read station names and coordinates
cat("Getting station names and coordinates...",'\n')
ds <- RODBC::sqlQuery(ch,sprintf('SELECT %s, %s, %s, %s, %s FROM %s', sx,sy,sz,scode,sname,stable))
ds[,1:2] <- round(ds[,1:2],5) #round coordinates to 5 decimals
ds[,3] <- round(ds[,3],1) #round elevations to 1 decimal
ds[,4] <- as.character(ds[,4]) #force codes as character strings
ds[,5] <- as.character(ds[,5]) #force names as character strings
ds <- ds[order(ds[,4]),] #order stations by code
ns <- nrow(ds); ndat <- rep(0,nd)
#- open data and stations files
dfile <- sprintf('%s_%d-%d.dat',varcli,anyi,anyf)
efile <- sprintf('%s_%d-%d.est',varcli,anyi,anyf)
Fd <- file(dfile,'w')
Fe <- file(efile,'w')
#- get data from the ODBC connection, station by station
cat("Getting data for every station...",'\n')
ne <- 0
for(i in 1:ns) { #for every station
cat(unlist(ds[i,]),'\n')
if(sum(is.na(ds[i,]))>0) {
cat("Warning: Incomplete metadata (station skipped)",'\n')
next
}
dd <- RODBC::sqlQuery(ch,sprintf('SELECT %s,%s FROM %s WHERE %s >= "%s" AND %s <= "%s" AND %s = "%s"',vdate,vval,vtable,vdate,fini,vdate,ffin,vcode,ds[i,4]))
if(is.null(dim(dd))) next #no data for the variable at this station
if(sum(!is.na(dd[,2])) < ndmin) next #not enough data
dd[,1] <- as.Date(dd[,1],format=dformat,tz='') #force vdate to class Date
k <- match(dd[,1],x) #match data time steps
if(sum(is.na(k))>0) {
cat("Warning: Station skipped because some or all of its dates do not match the expected values",'\n')
next
}
dat <- rep(NA,nd) #initialize data vector
dat[k] <- dd[,2] #assign data
write(dat,Fd,ncolumns=ifelse(daily,10,12)) #write into data file
write.table(ds[i,],Fe,row.names=FALSE,col.names=FALSE) #write metadata
ne <- ne + 1 #count no. of saved series
ndat <- ndat + !is.na(dat) #count no. of data at every time step
}
#close files:
close(Fe); close(Fd)
cat('\n',sprintf("Files %s and %s successfully generated.",dfile,efile))
#check data availability along time:
if(min(ndat)==0) {
ks <- which(ndat==0)
cat(sprintf(" BUT:\nNo data available in any station for %s",ifelse(daily,'day','month')))
if(length(ks)>1) cat('s:\n') else cat(' ')
print(x[ks])
cat(sprintf("Add stations or shorten the study period to avoid %s without data\n",ifelse(daily,'days','months')))
} else cat('\n')
}
#- dd2m.- Calculate monthly values from daily or subdaily data.
dd2m <- function(varcli, anyi, anyf, ndec=1, valm=2, namax=30, x=NULL,
na.strings=NA, tz='utc') {
#varcli: Short name of the climatic variable
#anyi: Initial year
#anyf: Final year
#ndec: No. of decimals requested in the results (1 by default)
#valm: Monthly value (1=sum, 2=mean, 3=maximum, 4=minimum, 5=standard deviation)
#namax: Maximum allowed percentage of missing data in a month
#x: Time vector. Automatically set by default, but needed if data are taken
# at irregular intervals.
#na.strings: Strings marking missing data (NA by default).
#tz: Time zone (if data are subdaily). ('utc' by default.)
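#Example (assuming daily files Temp_1981-2020.dat and Temp_1981-2020.est):
# dd2m('Temp',1981,2020) #monthly means, allowing up to 30% missing daily data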
#- read input data
z <- read.dat(varcli,anyi,anyf,x,na.strings=na.strings,tz=tz)
est.c <- z$est.c; dat <- z$dat; na <- z$na; ne <- z$ne; x <- z$x
me <- strftime(x,'%m')
anyo <- strftime(x,'%Y')
fun <- c('sum','mean','max','min','sd')[valm] #function for monthly values
ndm <- na*12 #no. of monthly values
dm <- matrix(NA,ndm,ne) #monthly values
zp <- c(table(me,anyo)) #no. of possible data in every month
for(ie in 1:ne) { #for every station
cat(' ',ie)
z <- aggregate(dat[,ie],list(me,anyo),fun,na.rm=TRUE)[,3] #monthly data
z2 <- aggregate(is.na(dat[,ie]),list(me,anyo),sum)[,3] #no. of missings
#with subdaily data some can be in year anyf+1:
if(length(z)>ndm) { z <- z[1:ndm]; z2 <- z2[1:ndm]; zp <- zp[1:ndm] }
nas <- z2/zp > namax/100. #months with not enough data
z[nas] <- NA #set their values to missing
dm[,ie] <- z #assign monthly data to the main matrix
}
dm[is.nan(dm)] <- NA #assign NA for missing data
#remove series with less than 12 data:
z <- apply(!is.na(dm),2,sum) #no. of data items per series
zk <- which(z < 12)
if(length(zk)>0) { #report the removed series before dropping them:
cat('\n\n',"Series removed because they have less than 12 monthly data",':\n')
print(est.c[zk,4])
df <- data.frame(z[zk],est.c[zk,4:5])
names(df) <- c('No. of data','Code','Name')
write.csv(df,'dd2m-few_data.csv',row.names=FALSE)
cat('\n',"See them in file",'dd2m-few_data.csv')
dm <- dm[,-zk]
est.c <- est.c[-zk,]
}
#save monthly data:
fichsal <- sprintf('%s-m_%d-%d.dat',varcli,anyi,anyf)
fichest <- sprintf('%s-m_%d-%d.est',varcli,anyi,anyf)
write(round(dm,ndec),fichsal,ncolumns=12)
write.table(est.c,fichest,row.names=FALSE,col.names=FALSE)
cat('\n\n',"Monthly",fun,"values saved to file",fichsal,'\n')
if(namax>0) cat("(Months with more than",namax,
'%',"missing data have also been set to missing)",'\n\n')
}
#- exampleFiles.- Get the path to some example files.
#(Adapted from readxl::readxl_example)
exampleFiles <- function(file=NULL) {
#file: Name of the needed file. If NULL, all example files will be listed.
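#Examples (the file name is illustrative; run exampleFiles() to list the real ones):
# exampleFiles() #list all available example files
# exampleFiles('Temp_1991-2000.dat') #full path to the named file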
if(is.null(file)) dir(system.file('files',package ='climatol'))
else system.file('files', file, package = 'climatol', mustWork=TRUE)
}
#- fix.sunshine.- Check homogenized daily sunshine hours and prune any excess or negative values.
fix.sunshine <- function(varcli, anyi, anyf) {
#varcli: Short name of the homogenized climatic variable
#anyi: First year of the homogenized series
#anyf: Last year of the homogenized series
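#Example (assuming a daily sunshine homogenization SS_1981-2020.rda exists):
# fix.sunshine('SS',1981,2020)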
nm <- ndec <- x <- ne <- est.c <- nd <- NULL #(avoid invisible bindings)
#- read input data
frda <- sprintf('%s_%d-%d.rda',varcli,anyi,anyf) #file to load/save
obj2save <- load(frda)
if(nm!=0) stop("This function only applies to DAILY sunshine series")
#- auxiliary functions to calculate maximum theoretical sunshine hours
insolteor <- function(lat,fech) { #maximum theoretical sunshine hours
nf <- length(fech) #no. of requested dates
it <- rep(NA,nf) #maximum theoretical sunshine hours vector
latr <- lat * 0.01745329 #latitude in radians (0.01745329 = 2 * pi / 360)
for(k in 1:nf) {
dj <- as.numeric(strftime(fech[k],'%j'))
dec <- declin(dj)
c <- -tan(latr) * tan(dec)
if(c <= -1.) it[k] <- 24.
else if(c >= 1.) it[k] <- 0.
else {
b <- 1.570796 - atan(c / sqrt(-c*c+1.))
r <- b * 24 / pi
if(r > 12) d <- r else d <- 24-r
it[k] <- r + 2 * d * 0.004627778 + .05 # 0.004627778 = .833 / 180
}
}
return(it)
}
declin <- function(djul) { #sun declination
#Approximation from http://solardat.uoregon.edu/SolarRadiationBasics.html:
# declin = 23.45 * pi / 180 * sin(2 * pi * (284 + n) / 365) =
return(0.4092797 * sin(4.888834 + 0.01721421 * djul))
}
#- prune exceeding values
sink('fix.sunshine.txt',split=TRUE)
cat("Checking sunshine durations of",frda,'\n',"and prunning any excess or negative values...",'\n')
fixed <- FALSE; rmargin <- 1/10^ndec/2 #flag and rounding margin
dec <- declin(as.numeric(strftime(x,'%j')))
for(j in 1:ne) {
cat('------',est.c[j,4],est.c[j,5],'\n')
c <- -tan(est.c[j,2]*0.01745329) * tan(dec)
r <- (1.570796 - atan(c / sqrt(-c*c+1.)))*24/pi
d <- r; z <- r>12; d[z] <- 24-d[z]
it <- round(r + 2 * d * 0.004627778 + .05, ndec) #maximum possible value
for(i in 1:nd) {
if(dah[i,j]<0) {
cat(est.c[j,4],format(x[i]),dah[i,j],'-> 0\n')
dah[i,j] <- 0; fixed <- TRUE
} else if(dah[i,j]>it[i]) {
cat(est.c[j,4],format(x[i]),dah[i,j],'->',it[i],'\n')
dah[i,j] <- it[i]; fixed <- TRUE
}
}
}
if(fixed) {
frda0 <- sprintf('%s.bak',frda)
file.rename(frda,frda0)
cat('\n',"Original file",frda,"renamed to",frda0,'\n')
cat("Writing the new",frda,'file...\n')
save(list=obj2save, file=frda)
cat("List of fixed values saved to fix.sunshine.txt",'\n')
} else cat("(No value has been modified)",'\n')
sink()
}
#- homogen.- automatic homogenization of climate series.
homogen <- function(varcli, anyi, anyf, test='snht', nref=NULL, std=NA,
swa=NA, ndec=1, niqd=c(4,6,1), dz.max=.01, dz.min=-dz.max, cumc=NA, wd=NULL,
inht=25, sts=5, maxdif=NA, maxite=999, force=FALSE, wz=.001, mindat=NA,
onlyQC=FALSE, annual=c('mean','sum','total'), x=NULL, ini=NA, na.strings='NA',
vmin=NA, vmax=NA, hc.method='ward.D2', nclust=300, cutlev=NA, grdcol=grey(.4),
mapcol=grey(.4), expl=FALSE, metad=FALSE, sufbrk='m', tinc=NA, tz='utc',
rlemin=NA, rlemax=NA, cex=1.1, uni=NA, raway=2, graphics=TRUE, verb=TRUE,
logf=TRUE, snht1=NA, snht2=NA, gp=NA) {
#varcli: Short name of the studied climatic variable
#anyi: Initial year
#anyf: Final year
#test: Break detection test to apply. One of 'snht' (the default) or 'cuct'
#nref: Maximum no. of reference stations at each stage
#std: Type of normalization. 1 (remove the mean), 2 (divide by the mean) or
# 3 (remove the mean and divide by the standard deviation).
#swa: Semi-Window Amplitude (no. of data; defaults to 60 months or 365 days).
#ndec: No. of required decimal places in the results (1 by default)
#niqd: No. of interquartilic distances to delete big outliers (defaults to
# c(4,6,1) for data, second differences and running lengths of identical data).
#dz.max: If >1, upper tolerance limit for anomalies (if two values are given,
# only those higher than the upper one will be rejected);
# If <=1, percentage of anomalous data to reject (in each side of the
# distribution (0.01 by default).
#dz.min: lower tolerance limit for anomalies (-dz.max by default).
#cumc: code of accumulated missing data.
#wd: Weight distance, in km. Distance at which the weight of a reference data
# is halved. (If wd=0, all reference stations will have equal weight.)
#inht: Inhomogeneity threshold(s). (0 to skip the stage.)
#sts: Series tail size (defaults to 5).
#maxdif: maximum data difference from previous iteration (ndec/2 by default).
#maxite: maximum number of iterations to compute means (999 by default).
#force: force direct homogenization of (sub)daily series [FALSE].
#wz: Scale factor for elevation Z. The default value (0.001) equals vertical
# differences in m to the horizontal differences in km. Can be used to
# give more weight to Z, or to calculate horizontal distances only (wz=0).
#mindat: Minimum no. of data for a split fragment to become a new series
# [swa/2 for daily series or 12 terms otherwise].
#onlyQC: Set to TRUE if only initial Quality Controls are requested [FALSE]
#annual: Running annual value to graph in the PDF output. One of 'mean' (the
# default), 'sum' or 'total' (equivalent to 'sum').
#x: Time vector. Only needed if data are taken at irregular intervals.
#ini: Initial date, with format 'YYYY-MM-DD' (for daily data, if series does not begin on January 1st as recommended).
#na.strings: Strings marking missing data (NA by default).
#vmin, vmax: Range of allowed values for the climatic variable.
#hc.method: hierarchical clustering method ('ward.D2' by default).
#nclust: Maximum number of series for the cluster analysis [300].
#cutlev: Level to cut dendrogram to define clusters (automatic by default).
#grdcol: Color of the graphic background grids [grey(0.4)].
#mapcol: Color of coastlines and borders in the stations map [grey(0.4)].
#expl: Perform an exploratory analysis? [FALSE].
#metad: Use the breakpoints file as metadata? [FALSE].
#sufbrk: Suffix to add to varcli to form the name of the provided metadata file ['m'; set to '' if original data were monthly].
#tinc: Time increment between data. Not set by default. If defined
# for subdaily data, should be in units 'hours', 'mins' or 'secs'.
# E.g.: tinc='1 hours'. (Do not forget the last 's' in the units).
#tz: Time zone. Only relevant for subdaily data. ('utc' by default.)
#rlemin: Data run lengths will exclude values <= rlemin in quality control.
#rlemax: Data run lengths will exclude values >= rlemax in quality control.
#cex: Character expansion factor for graphic labels and titles [1.1].
#uni: Units to use in some axis labels. (None by default.)
#raway: Factor to increase internal distance to reanalysis series to make them weight less than the observed series (2 by default).
#graphics: Output graphics in a PDF file [TRUE].
#verb: Verbosity [TRUE].
#logf: Save console messages to a log file? [TRUE].
#snht1, snht2: Obsolete but kept for backwards compatibility.
#gp: Obsolete but kept for backwards compatibility.
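#Example (assuming input files Temp_1981-2020.dat and Temp_1981-2020.est of
# monthly values exist in the working directory):
# homogen('Temp',1981,2020) #default settings
# homogen('Temp',1981,2020,std=2) #normalize by ratios to the mean instead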
#------------------------------------------------------------------
palette <- NULL #(avoid invisible bindings)
#backwards compatibility:
if(!is.na(snht1)) inht <- snht1
if(!is.na(snht2)) inht <- c(inht,snht2)
if(!is.na(snht1)|!is.na(snht2)) cat("Please, note that parameters snht1 and snht2 are deprecated.\nUse inht in future applications of Climatol.",'\n')
if(!is.na(gp)) {
graphics <- TRUE
switch(gp+1,
graphics <- FALSE, #gp=0
onlyQC <- TRUE, #gp=1
, #gp=2
, #gp=3
annual <- 'sum', #gp=4
)
cat("Please, note that parameter gp is deprecated.\nUse graphics=FALSE for gp=0, onlyQC=TRUE for gp=1 or\nannual='total' for gp=4 in future applications of Climatol.",'\n')
}
if(onlyQC & !graphics) graphics <- TRUE #fix possible inconsistency
#- initializations
annual <- match.arg(annual); if(annual=='total') annual <- 'sum'
verde <- hsv(.33,1,.6) #dark green
palette('R3') #use old default palette
#auxiliary functions:
datmed.mean <- function(x) mean(datmed[x])
datmed.sd <- function(x) sd(datmed[x])
r3 <- function(x) sign(x)*abs(x)^(1/3) #cubic root
#chosen inhomogeneity test:
if(test=='cuct') { inhtest <- 'CucT'; test <- 'cuct' } #apply Cucconi
else { inhtest <- 'SNHT'; test <- 'snht' } #apply SNHT
#in case of error, close all output files:
options(error=cerrar)
if(!is.na(std)) {
std <- as.integer(std) #std must be an integer between 1 and 3:
if(std<1|std>3) cat("Warning:")
if(std<1) { std <- 1; cat("std lower than 1 has been forced to 1",'\n') }
if(std>3) { std <- 3; cat("std greater than 3 has been forced to 3",'\n') }
}
if(is.null(nref)) {
nref <- c(10,10,4); if(!is.na(cumc)) nref[3] <- 2
}
if(is.null(wd)) {
wd <- c(0,0,100); if(!is.na(cumc)) wd[3] <- 10
}
#if unset, set maxdif depending on ndec:
if(is.na(maxdif)) maxdif=10^(-ndec)/2 #0.05 for one decimal
#skip detection stages if metad==TRUE or in exploratory mode:
if(expl | metad) inht <- c(0,0)
#dz.min must be negative!:
z=dz.min>0; if(sum(z)>0) dz.min[z] <- -dz.min[z]
#- open log file and write header
archlog <- paste(varcli,'_',anyi,'-',anyf,'.txt',sep='')
if(logf) sink(archlog,split=verb)
cat('\n',"HOMOGEN() APPLICATION OUTPUT (From R's contributed package 'climatol' ",climatol.version,')\n',sep='')
cat('\n',"=========== Homogenization of ",varcli,', ',anyi,'-',anyf,'. (',
date(),')\n',sep='')
time1 <- Sys.time() #time at the beginning of the process
cat('\n',"Parameters:")
arg <- names(formals()) #list the function arguments
nargs <- length(arg) #no. of arguments
for(i in 1:nargs) {
if(arg[i]=='x') next #do not print the time vector!
cat(' ',arg[i],'=',sep='')
cat(eval(as.symbol(arg[i])))
if(i<nargs) cat(',')
}
cat('\n\n')
#complete multivalue parameters:
k <- length(inht); if(k<2) inht <- c(inht,inht) #same threshold in all stages
k <- length(wd); if(k<3) wd <- c(rep(0,3-k),wd)
k <- length(nref); if(k<3) nref <- c(rep(10,3-k),nref)
if(expl) nref[3] <- nref[1] #keep no. of references in exploratory mode
k <- length(niqd); if(k<3) niqd <- c(rep(10,3-k),niqd)
#data anomaly tolerance (warning and delete):
if(sum(dz.max) > 1) {
dz.maxw <- min(dz.max); dz.maxd <- max(dz.max)
dz.minw <- max(dz.min); dz.mind <- min(dz.min)
}
if(length(dz.max)==1) dz.maxw <- NA
if(length(dz.min)==1) dz.minw <- NA
fbas <- sprintf('%s_%d-%d',varcli,anyi,anyf) #file basename
#- read input data
z <- read.dat(varcli,anyi,anyf,x,ini,tinc,na.strings,tz=tz)
est.c <- z$est.c; dat <- z$dat; na <- z$na; nd <- z$nd; ne <- z$ne; x <- z$x
nm <- z$nm; ini <- z$ini; tinc <- z$tinc; acomp <- z$acomp
if(nm<1) {
if(!onlyQC & !metad & !expl & !force & is.na(cumc)) stop("These series seem to be daily or sub-daily. Their direct homogenization\n is not recommended. Please, use dd2m() and homogenize the monthly\n series first or set force=TRUE to avoid this message.")
} else if(!nm%in%c(1,2,3,4,6,12)) {
cat(sprintf("Calculated no. of data per year and station: nm=%f\n",nm))
stop("Complete years of monthly or seasonal data are required to avoid\n missing data in the results. Please complete your series to have\n a rounded number of data. (nm can be one of 1,2,3,4,6,12.)")
}
#set mindat if unset by the user:
if(is.na(mindat)) {
if(nm<=0) mindat <- nd/na/4 #three months
else mindat <- max(5,nm)
}
#check if series contain at least mindat data:
z <- apply(!is.na(dat),2,sum) #no. of data items per series
zk <- which(z < mindat)
if(length(zk)>0) {
df <- data.frame('No. of data'=z[zk],est.c[zk,4:5])
print(df)
cat("The above series have less than",mindat,"data.",'\n')
stop("Please, remove these series and run homogen() again")
}
#- disaggregate data if cumc code is set
if(!is.na(cumc)) { #manage data coded as accumulated to the next:
graphics <- FALSE #do not create graphics if using cumc
cuml <- apply(dat==cumc,2,which) #list of accumulated terms
if(length(cuml)==0) {
cat('\n',"There are no data coded as cumc =",cumc,". Nothing done.",'\n')
cerrar(graphics); return(invisible())
}
cumt <- function(z) { #first (cumA) and last (cumB) accumulated terms
zdif <- diff(z)>1
cumA <- z[c(TRUE,zdif)]
cumB <- z[c(zdif,TRUE)]
return(list(cumA,cumB))
}
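#(e.g. cumt(c(3,4,5,9)) returns cumA=c(3,9) and cumB=c(5,9): a run of
# three accumulated terms followed by a single accumulated term)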
z <- lapply(cuml,cumt)
cumA <- sapply(z, function(x) x[1]) #first terms of accumulated runs
cumB <- sapply(z, function(x) x[2]) #last terms of accumulated runs
dat[dat==cumc] <- NA #delete accumulated data
cumv <- vector('list',ne) #list of accumulation values:
for(j in 1:ne) {
cumv[[j]] <- dat[cumB[[j]]+1,j] #save accumulation values
dat[cumB[[j]]+1,j] <- NA #delete them
cumB[[j]] <- cumB[[j]]+1 #include them in the accumulated runs
}
}
nsy <- rep(0,na) #no. of shifts per year
if(is.na(swa)) { #swa default values:
if(nm>0) swa <- 5*nm #5 years for monthly or lower frequency
else { #1 year for daily or higher frequency
z=which.max(table(unlist(rle(strftime(x,'%Y'))[1])))
swa <- as.integer(names(z))
}
}
if(swa>nd/4) swa <- ceiling(nd/4) #avoid too big a window
if(graphics) { #activate graphic output to a pdf file:
pdfname <- sprintf('%s_%d-%d.pdf',varcli,anyi,anyf)
pdf(pdfname,title="homogen() graphics output",bg='white')
old.par <- par(no.readonly=TRUE)
plot(-1:1,-1:1,type='n',xaxt='n',yaxt='n',bty='n',xlab='',ylab='')
text(0,0.4,sprintf('CLIMATOL %s',climatol.version),cex=4)
text(0,-0.45,paste("Homogenization\ngraphic output of",'\n',varcli,'\n',anyi,'-',anyf,sep=''),cex=3)
par(las=1,cex=cex,cex.axis=cex,cex.lab=cex)
my.par <- par(no.readonly=TRUE)
}
#----------- Quality control of the series -----------------------
Fout <- file(sprintf('%s_out.csv',fbas),'w') #open outliers file
write("Code,Date,Observed,Suggested,Anomaly (std.devs.),Deleted",Fout)
#set vmin and std if unset by the user:
mn1 <- apply(dat,2,quantile,prob=.1,na.rm=TRUE) #first deciles
mn0 <- apply(dat,2,quantile,prob=.01,na.rm=TRUE) #first percentiles
#are the series skewed and limited by zero?:
skewed <- median(mn1)==0 & median(mn0)==0
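#(true when at least half of the series have their first decile and first
# percentile equal to zero, as typically happens with daily precipitation)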
if(skewed) { #series skewed and limited by zero:
if(is.na(vmin)) vmin <- 0 #minimum value
if(nm<1 & is.na(rlemin)) rlemin <- 0.1 #minimum value for run lengths
if(is.na(std) & vmin==0) std <- 2 #normal ratio normalization
} else if(is.na(std)) std <- 3 #default normalization: standardization
deld <- 0 #initialize deleted data count
#check if there are values out of allowed range
if(!is.na(vmin)) { #values lower than the minimum?
zout <- dat<vmin; nout <- sum(zout,na.rm=TRUE)
if(nout>0) {
sout <- apply(zout,2,sum,na.rm=TRUE)
for(k in 1:length(sout)) {
if(sout[k]==0) next
kz <- which(zout[,k])
for(j in kz) write(c(est.c[k,4],format(x[j]),dat[j,k],NA,NA,1),
Fout,ncolumns=6,sep=',')
}
dat[zout] <- NA #delete erroneous values
cat('\n',"Warning: deleted",nout,"data lower than",vmin,'\n')
deld <- deld + nout
}
}
if(!is.na(vmax)) { #values higher than the maximum?
zout <- dat>vmax; nout <- sum(zout,na.rm=TRUE)
if(nout>0) {
sout <- apply(zout,2,sum,na.rm=TRUE)
for(k in 1:length(sout)) {
if(sout[k]==0) next
kz <- which(zout[,k])
for(j in kz) write(c(est.c[k,4],format(x[j]),dat[j,k],NA,NA,1),
Fout,ncolumns=6,sep=',')
}
dat[zout] <- NA #delete erroneous values
cat('\n',"Warning: deleted",nout,"data greater than",vmax,'\n')
deld <- deld + nout
}
}
#check if there are series without any data:
ksd <- which(apply(!is.na(dat),2,sum) == 0)
if(length(ksd)>0) {
cat("There are series with no data!!!:",'\n')
print(est.c[ksd,])
stop("Please, remove these series from the input files and run homogen() again")
}
#check if there are series with too few data:
ksd <- which(apply(!is.na(dat),2,sum) < mindat)
if(length(ksd)>0) {
cat("There are series with too few data (less than",mindat,'):\n')
print(est.c[ksd,])
cat("Warning: Break-points cannot be corrected in these series",'\n')
}
#- if there are time steps without any data, issue a warning and finish
numdat <- apply(!is.na(dat),1,sum) #no. of data in each time step
if(!min(numdat)) {
plot(x,numdat,type='l',xlab="Time",ylab="Number of data",
main=sprintf("Number of %s data along time",varcli))
z <- class(x)
if(z[1]=='Date' | z[1]=='POSIXct') {
grid(NA,NULL,col=grey(.4))
if(z[1]=='Date') abline(v=axis.Date(1,x),lty=3,col=grey(.4))
else if(z[1]=='POSIXct') abline(v=axis.POSIXct(1,x),lty=3,col=grey(.4))
} else grid(col=grey(.4))
zz <- which(numdat==0); z <- range(zz)
cat('\n',sum(numdat==0)," time steps between terms ",z[1],' (',format(x[z[1]]),") and ",z[2],' (',format(x[z[2]]),")\n have missing data in all stations!",'\n',sep='')
if(length(zz)<=100) print(format(x[which(numdat==0)]))
cat(sprintf("(See the graphics in %s)",pdfname),'\n')
cerrar(graphics)
stop("Cannot continue.\n(Shorten the study period or add series with data in the void terms.)\n\n")
}
if(graphics) {
#partition stations in groups for the boxplots:
gs <- 40 #group size
ge <- 1+((0:(ne-1))%/%gs); nge <- max(ge) #station groups
#if the last group is very small, append it to the previous group:
if(nge>1 & sum(ge==nge)<=10) {
ge[ge==nge] <- nge-1
nge <- nge-1
}
}
#detect too anomalous data (to be deleted to avoid their use as references):
if(skewed) {
da3 <- r3(dat); da3[dat==0] <- NA #cube root of data without zeros
bp <- boxplot(da3,range=niqd[1],plot=FALSE) #big outliers of the skewed series
} else {
bp <- boxplot(dat,range=niqd[1],plot=FALSE) #big outliers of the series
da3 <- dat
}
nout <- length(bp$out)
if(skewed & nout>0) { #set outliers as original values (not as cubic root):
for(k in 1:nout) {
kj <- which(da3[,bp$group[k]]==bp$out[k])
bp$out[k] <- dat[kj[1],bp$group[k]] #(kj may have multiple values)
}
}
if(graphics) { #boxplots of the series, marking those too anomalous:
for(ig in 1:nge) { #for every station group:
ke <- which(ge==ig)
boxplot(dat[,ke],ylab=ifelse(is.na(uni),"Values",uni),xaxt='n',col=5,
xlab="Stations",main=paste(varcli,"data"),pch='.',cex=6); grid(col=grdcol)
keg <- bp$group %in% ke; kex <- bp$group[keg]-(ig-1)*gs
if(metad & nm<1) points(kex,bp$out[keg],pch=19,col='cyan',cex=1.1)
else points(kex,bp$out[keg],pch=19,col=2,cex=1.1)
gsx <- length(ke)
if(max(ke)>100) axis(1,1:gsx,ke,las=2) else axis(1,1:gsx,ke)
}
}
#delete too anomalous data (only if nm>0 or metad=FALSE; otherwise the user
# is expected to apply a previous run with onlyQC=TRUE):
if(nout>0 & (nm>0 | metad==FALSE)) {
cat('\n',"Warning:",nout,"big outliers deleted")
if(!onlyQC) cat(" before homogenization:",'\n\n') else cat(':\n\n')
grp <- unique(bp$group); ngrp <- length(grp)
for(i in 1:ngrp) {
j <- grp[i]; k <- which(bp$group==j); zout <- bp$out[k]
cat(j,est.c[j,4],': ')
if(length(zout)<=10) cat(zout,'\n') else cat(zout[1:10],'... etc\n')
kk <- which(dat[,j]%in%zout); nkk <- length(kk)
for(k in 1:nkk) {
write(c(est.c[j,4],format(x[kk[k]]),dat[kk[k],j],NA,NA,1),
Fout,ncolumns=6,sep=',')
dat[kk[k],j] <- NA
}
}
deld <- deld + nout
}
cat('\n')
#detect spikes in the series by analyzing second differences (SDifs):
daf <- diff(diff(dat)) #SDifs
bp <- boxplot(daf,range=niqd[2],plot=FALSE) #big SDifs outliers
nout <- length(bp$out) #number of SDifs outliers
if(graphics) { #boxplots of SDifs:
ylab <- "Increments"; if(!is.na(uni)) ylab <- sprintf('%s (%s)',ylab,uni)
for(ig in 1:nge) { #for every station group:
ke <- which(ge==ig)
boxplot(daf[,ke],xlab="Stations",ylab=ylab,xaxt='n',col=hsv(.2,.5,1),
main=paste(varcli,"second differences"),pch='.',cex=6)
grid(col=grdcol)
keg <- bp$group %in% ke; kex <- bp$group[keg]-(ig-1)*gs
if(!skewed) {
if(metad & nm<1) points(kex,bp$out[keg],pch=19,col='cyan',cex=1.1)
else points(kex,bp$out[keg],pch=19,col=2,cex=1.1)
}
gsx <- length(ke)
if(max(ke)>100) axis(1,1:gsx,ke,las=2) else axis(1,1:gsx,ke)
}
}
if(!skewed) { #(only if data are not very skewed)
#delete too anomalous SDifs outliers (only if nm>0 or metad=FALSE):
nout <- length(bp$out)
if(nout>0 & !(metad & nm<1)) {
cat('\n',"Warning:",nout,"big spikes deleted")
if(!onlyQC) cat(" before homogenization:",'\n\n') else cat(':\n\n')
grp <- unique(bp$group); ngrp <- length(grp)
for(i in 1:ngrp) {
j <- grp[i]; k <- which(bp$group==j); zout <- bp$out[k]
cat(j,est.c[j,4],': ')
if(length(zout)<=10) cat(zout,'\n') else cat(zout[1:10],'... etc\n')
kk <- which(daf[,j]%in%zout); nkk <- length(kk)
for(k in 1:nkk) {
write(c(est.c[j,4],format(x[kk[k]]),dat[kk[k]+1,j],NA,NA,1),
Fout,ncolumns=6,sep=',')
dat[kk[k]+1,j] <- NA
}
}
deld <- deld + nout
}
cat('\n')
}
#lengths of sequences of constant values:
dat.0 <- dat; if(!is.na(rlemin)) dat.0[dat<rlemin] <- NA
if(!is.na(rlemax)) dat.0[dat>rlemax] <- NA
rlen <- sapply(apply(dat.0,2,rle), function(x) x[1]) #run lengths
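#(e.g. rle applied to c(0,0,0,2.5,2.5,7) yields run lengths 3,2,1)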
z <- sapply(rlen,unique); zx <- max(unlist(z)) #absolute maximum run length
zxs <- unname(unlist(lapply(z,max))) #maximum run lengths
zu <- unname(unlist(z)) #vector of unique run lengths
# bp <- boxplot(zxs,range=niqd[3],plot=FALSE)
bp <- boxplot(zu,range=niqd[3],plot=FALSE)
#outliers of maximum run lengths:
nout <- length(bp$out) #number of run outliers
if(zx<3) nout <- 0 #runs of less than 3 identical values are always accepted
if(nout>0) { #list excessive run lengths in Fout:
zxn <- bp$out; zxn[zxn<median(zxs)] <- max(zxn) #avoid left tail
zxn <- min(zxn) #minimum run outlier (in the right tail of zxs)
srun <- which(zxs >= zxn) #series with excessive run lengths
sdel <- integer(); rdel <- list() #initialize list of excessive runs
for(ks in srun) { #for every series with excessive run lengths:
kzx <- which(rlen[[ks]]>=zxn) #location of excessive run lengths
kx <- diffinv(rlen[[ks]])
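#(diffinv prepends a zero to the cumulative sums: diffinv(c(3,2,1))
# gives c(0,3,5,6), hence kx[kk]+1 is the first term of run kk)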
for(kk in kzx) { #for every excessive run length
nrun <- rlen[[ks]][kk] #length of the run
krun <- kx[kk]+1:nrun #terms of the run
kr1 <- krun[1] #first term of the run
write(c(est.c[ks,4],format(x[kr1]),dat[kr1,ks],NA,NA,nrun),Fout,
ncolumns=6,sep=',')
#keep series and excessive run terms to be deleted:
sdel <- c(sdel, ks); rdel <- c(rdel, list(krun))
}
}
cat('\n',"Warning: excessive long runs of identical values deleted in series:",'\n')
print(est.c[srun,4])
}
if(graphics) {
ylab="No. of identical consecutive values"
for(ig in 1:nge) { #for every station group:
ke <- which(ge==ig)
ylim=c(1,max(10,zxs[ke]))
if(max(ke)>100) las=2 else las=1
plot(0,0,xlim=range(ke),ylim=ylim,xlab="Stations",ylab=ylab,type='n',
las=las,main=paste(varcli,"run lengths")); grid(col=grdcol)
for(k in ke) {
if(inherits(z,'matrix')) uz <- z[,k] else uz <- z[[k]];
s <- rep(k,length(uz)); col <- rep(4,length(uz))
if(nout>0) { #color for excessive run lengths:
if(metad & nm<1) col[uz >= zxn] <- 'cyan' #preserved runs in cyan
else col[uz >= zxn] <- 2 #runs to be deleted in red
}
points(s,uz,pch=19,col=col)
}
}
}
#delete excessive runs of identical values (only if nm>0 and metad=FALSE):
if(nout>0 & !(metad & nm<1)) {
nsdel <- length(sdel) #no. of excessive runs of identical values
for(k in 1:nsdel) dat[rdel[[k]],sdel[k]] <- NA
deld <- deld + nout
}
#------------- End of initial quality control -----------------------
dat.o <- dat #original data to be stored in the final *.rda file (QC fixed)
if(nd<100) lw=3 #width of the bars of anomalies
else if(nd<300) lw=2
else lw=1
nei <- ne #initial no. of stations
est.i <- est.c #initial stations metadata
nsp <- rep(0,nei) #no. of cuts in every original series
iest <- 1:ne #index of original series of every sub-series
outan <- matrix(NA,nd,ne) #outlier anomalies
#- if(graphics), go on with the initial graphics
if(graphics) {
#data availability in every series:
z <- class(x)
if(nm<0) cat("Per station data availability graphic skipped for subdaily data\n (might be too heavy).",'\n')
else { #per station data availability:
if(sum(is.na(dat))==0) col=4 else col=c('white',4)
image(!is.na(dat),col=col,xaxt='n',yaxt='n',useRaster=TRUE,
xlab="Time",ylab="Series",main=paste(varcli,"data availability"))
yy=as.integer(strftime(x,'%Y')); if(length(unique(yy))<3) yy <- 1:nd
lb=pretty(yy); nt=length(lb)
if(lb[nt] > max(yy)) { nt=nt-1; lb=lb[1:nt] }
if(lb[1] < min(yy)) { lb=lb[2:nt]; nt=nt-1 }
kp=integer(); for(k in 1:nt) kp=c(kp,min(which(yy==lb[k])))
at=((1:nd-1)/(nd-1))[kp]
axis(1,at,labels=lb,xlab='Time')
abline(v=at,lty=3,col=grdcol)
lb=pretty(1:ne); nt=length(lb)
if(lb[nt] > ne) { nt=nt-1; lb=lb[1:nt] }
if(lb[1] < 1) { lb=lb[2:nt]; nt=nt-1 }
at=((1:ne-1)/(ne-1))[lb]
axis(2,at,labels=lb)
abline(h=at,lty=3,col=grdcol)
}
#no. of data in every time step:
plot(x,numdat,type='l',col=4,ylab="Number of data",xlab="Time",
ylim=c(0,ne),main=paste("Number of",varcli,"data in all stations"))
if(z[1]=='Date' | z[1]=='POSIXct') {
grid(NA,NULL,col=grdcol)
if(z[1]=='Date') abline(v=axis.Date(1,x),lty=3,col=grdcol)
else if(z[1]=='POSIXct') abline(v=axis.POSIXct(1,x),lty=3,col=grdcol)
} else grid(col=grdcol)
abline(h=5,lty=2,col=verde)
abline(h=3,lty=2,col='red')
#histogram of all data (near-normal distribution?)
main="Histogram of all data"
zh <- hist(dat,plot=FALSE)
zx <- zh$breaks
zy <- zh$counts; zy[zy==0] <- NA
barplot(zy,log='y',space=0,ylim=c(.9,max(zy,na.rm=TRUE)*2),xlab=varcli,
ylab="Frequency",main=main,col=hsv(.4,1,.8),names.arg=zh$mids)
#correlogram of first differences of the series (r <-> distance)
#(if more than nclust series, use only a random sample of nclust series)
if(ne>nclust) { splc <- sample(1:ne,nclust); nec <- nclust }
else { splc <- 1:ne; nec <- ne }
est.d <- matrix(NA,nec,nec) #distance matrix
for(i in 1:(nec-1)) {
for(j in (i+1):nec) {
dx <- est.c[splc[i],1]-est.c[splc[j],1]
dy <- est.c[splc[i],2]-est.c[splc[j],2]
#distances in km (coarse method assuming flat geometry):
dx <- dx*111*cos((est.c[splc[i],2]+est.c[splc[j],2])*pi/360)
dy <- dy*111
dz <- (est.c[splc[i],3]-est.c[splc[j],3])*wz
d2 <- dx*dx+dy*dy+dz*dz #quadratic distance
est.d[i,j] <- sqrt(d2) #distance
est.d[j,i] <- est.d[i,j] #symmetric matrix
}
}
if(ne>2) { #correlogram of the series:
data <- dat[,splc] #copy of the data
difd <- diff(data) #first differences of the series
corm <- cor(difd,use='p') #correlation matrix of the differences
nc <- ncol(difd) #no. of columns of matrix difd
mc <- matrix(0,nc,nc) #matrix of common terms
for(i in 1:(nc-1)) for(j in (i+1):nc)
mc[i,j] <- mc[j,i] <- sum(!is.na(difd[,i]+difd[,j]))
#delete correlations computed with less than 10 pairs of data:
corm[mc<10] <- NA
if(ne>nclust) main <- sprintf("Correlogram of %d sampled %s series\n(First differences)",nclust,varcli)
else main <- sprintf("Correlogram of first difference %s series",varcli)
xd <- as.vector(est.d); y <- as.vector(corm)
xmin <- floor(min(c(0,xd),na.rm=TRUE)); xmax <- ceiling(max(xd,na.rm=TRUE))
ymin <- floor(min(c(0,y),na.rm=TRUE)); ymax <- ceiling(max(y,na.rm=TRUE))
xbin <- seq(xmin,xmax,length=100)
ybin <- seq(ymin,ymax,length=100)
freq <- as.data.frame(table(findInterval(xd,xbin),findInterval(y,ybin)))
freq[,1] <- as.integer(as.character(freq[,1]))
freq[,2] <- as.integer(as.character(freq[,2]))
freq2D <- matrix(0,100,100)
freq2D[cbind(freq[,1], freq[,2])] <- freq[,3]
freq2D[freq2D==0] <- NA
col=rev(rainbow(16,start=0,end=2/3))
nz=max(freq2D,na.rm=TRUE); if(nz<16) col=col[1:nz]
image(xbin,ybin,freq2D,main=main,useRaster=TRUE,xlab="Distance (km)",
ylab="Correlation coefficient",col=col)
grid(col=gray(.3)); abline(h=0,col=2)
#dendrogram of the stations:
#avoid NAs in the correlation matrix by imputing the mean correlation:
corm[is.na(corm)] <- mean(corm,na.rm=TRUE)
dism <- dist(corm) #dissimilarity matrix
#if there are NAs in the dissimilarity matrix, set them to 1:
kna=which(is.na(dism)); if(length(kna)>0) dism[kna] <- 1
hc <- hclust(dism,hc.method)
if(ne>nclust) main <- paste("Dendrogram of",nclust,"sampled stations")
else main <- "Dendrogram of station clusters"
plot(hc,xlab="Stations",sub='',ylab="Dissimilarity",las=0,main=main)
#station clusters up to a maximum of 9 groups:
if(is.na(cutlev)) cutlev <- mean(hc$height)+sd(hc$height)
repeat {
ct <- cutree(hc,h=cutlev)
nc <- length(levels(factor(ct)))
if(nc<10) break
cutlev <- cutlev + .1
}
if(nc>1) {
abline(h=cutlev,col="red",lty=2)
if(ne<=nclust) { #list station clusters
cat('\n-------------------------------------------\n')
cat(sprintf("Stations in the %d clusters",nc),':\n\n')
print(split(splc,ct))
cat('---------------------------------------------\n')
}
}
#stations map:
if(nc==1) { col='blue'; main=paste(varcli,"station locations") }
else {
col=rainbow(nc,1,.55)[ct]
main=sprintf("%s station locations (%d clusters)",varcli,nc)
}
#map limits with a 10% margin around station coordinates:
zxr <- range(est.c[,1]); zyr <- range(est.c[,2])
zxm <- diff(zxr)*.1; zym <- diff(zyr)*.1
if(zxm==0 | zym==0) stop("X and/or Y coordinates are all the same!\nAllow some variation in the *.est input file")
xlim <- zxr+c(-zxm,zxm); ylim <- zyr+c(-zym,zym)
#now draw the map:
par(mar=c(4,4,4,4))
nomap <- FALSE
if(diff(zyr)<5. & requireNamespace('mapdata',quietly=TRUE))
z <- try(maps::map('mapdata::worldHires',col=mapcol,xlim=xlim,
ylim=ylim,mar=par('mar')),silent=TRUE)
else z <- try(maps::map('world',col=mapcol,xlim=xlim,ylim=ylim,
mar=par('mar')),silent=TRUE)
if(inherits(z,'try-error')|nomap) plot(0,0,xlim=xlim,ylim=ylim,xlab='',
ylab='',asp=1/(cos(mean(zyr)*pi/180)),main=main,type='n')
else { maps::map.axes(); title(main) }
if(ne>99) { #if more than 99 stations, plot symbols
#stations not in the sample are plotted first, as red crosses:
points(est.c[-splc,1:2],pch='+',col=2,cex=.5)
#stations in the sample are plotted in color:
points(est.c[splc,1:2],col=col,pch=ct)
} else text(est.c[splc,1:2],labels=splc,col=col) #numbers
grid(col=gray(.4))
}
par(my.par) #restore graphic parameters
par(las=1,cex=cex,cex.axis=cex,cex.lab=cex)
}
#- if(onlyQC), finish (saving files if data were deleted)
if(onlyQC) {
cat('\n')
if(deld>0) { #save clean *.dat and *.est to apply dd2m() without outliers
fori <- sprintf('%s-ori_%d-%d',varcli,anyi,anyf) #original file basename
file.rename(sprintf('%s.dat',fbas),sprintf('%s.dat',fori))
write(dat,sprintf('%s.dat',fbas))
file.rename(sprintf('%s.est',fbas),sprintf('%s.est',fori))
write.table(est.c,sprintf('%s.est',fbas),row.names=FALSE,col.names=FALSE)
outrename(varcli,anyi,anyf,'QC') #rename QC diagnostics to keep them
#rename original input files
cat("New input data files free from the detected big outliers",'\n')
if(IQR(zxs)>0) cat("and/or excessive runs of identical values",'\n')
cat("have been rewritten. QC diagnostics have been renamed to",'\n')
cat(sprintf('%s-QC_%d-%d.*',varcli,anyi,anyf),'\n\n')
cat("Original files have been renamed to",'\n')
cat(sprintf('%s-ori_%d-%d.*',varcli,anyi,anyf),'\n\n')
cat("If you want to repeat this quality control with other parameters",'\n')
cat("or to apply homogen to your original files, you must rename them with",':\n',sep='')
z <- sprintf('\'%s-ori_%d-%d.est\',\'%s_%d-%d.est\'',varcli,anyi,anyf,varcli,anyi,anyf)
cat(' file.rename(',z,')\n',sep='')
z <- sprintf('\'%s-ori_%d-%d.dat\',\'%s_%d-%d.dat\'',varcli,anyi,anyf,varcli,anyi,anyf)
cat(' file.rename(',z,')\n\n',sep='')
} else {
cat("Quality control finished. No big quality problems detected.",'\n\n')
}
cerrar(graphics)
if(exists('ct')) return(invisible(list(corm=corm,ct=ct)))
else return()
}
# Homogenization process in three stages:
# 1) splits in stepping windows
# 2) splits in the whole series
# 3) missing data filling
#- open the breaks file (the outliers file is already open)
if(!metad) {
Fbrk <- file(sprintf('%s_brk.csv',fbas),'w')
write(sprintf('Code,Date,%s',inhtest),Fbrk)
}
#- compute distance and proximity rank matrices
cat("Computing inter-station distances ...")
refhom <- substr(est.c[,4],1,1)=='*' #trusted homogeneous references
est.d <- matrix(0,ne,ne) #distance matrix
zr <- est.d > 0 #flags for reanalysis series
for(i in 1:(ne-1)) {
cat(' ',i)
for(j in (i+1):ne) {
dx <- est.c[i,1]-est.c[j,1]
dy <- est.c[i,2]-est.c[j,2]
#distances in km (coarse method, assuming flat geometry):
dx <- dx*111*cos((est.c[i,2]+est.c[j,2])*pi/360)
dy <- dy*111
dz <- (est.c[i,3]-est.c[j,3])*wz #vertical distance
d2 <- dx*dx+dy*dy+dz*dz #3D quadratic distance
est.d[i,j] <- est.d[j,i] <- sqrt(d2) #3D distance
#flag distances involving reanalysis series:
if(raway>1 & xor(refhom[i],refhom[j])) zr[i,j] <- zr[j,i] <- TRUE
}
}
if(sum(zr)>0) { #increase distances involving reanalysis series:
mdist <- median(est.d[!zr]) #median of distances without reanalysis
if(mdist < 10) mdist <- 10 #set a minimum value for the median
est.d[zr] <- (est.d[zr]+mdist)*raway #increase distances of reanalysis
}
cat('\n')
est.p <- t(apply(est.d,1,order)) #proximity ranks matrix
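#(row i of est.p lists all station indexes sorted by increasing distance
# to station i; the first one is normally i itself, at distance 0)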
#- First estimation of means and standard deviations
datmed <- apply(dat,1,mean,na.rm=TRUE) #global mean series
refmed <- mean(datmed) #reference global mean
dat.m <- apply(dat,2,mean,na.rm=TRUE) #initial (raw) means of the series
if(std==2 & min(dat.m)==0) { #avoid divisions by zero:
j0 <- which(dat.m==0) #series with zero mean (should not happen...)
minN0 <- min(dat.m[dat.m>0]) #minimum non zero mean
dat.m[j0] <- minN0 #assign minimum non zero mean to previous zeros
}
if(std==3) {
refstd <- sd(datmed) #reference global standard deviation
dat.s <- apply(dat,2,sd,na.rm=TRUE) #initial (raw) standard deviations
}
#coarse adjustment of means and standard deviations to the whole period:
switch(std,
dat.m <- dat.m + refmed - apply(!is.na(dat),2,datmed.mean),
dat.m <- dat.m * refmed / apply(!is.na(dat),2,datmed.mean),
{dat.m <- dat.m + refmed - apply(!is.na(dat),2,datmed.mean)
dat.s <- dat.s + refstd - apply(!is.na(dat),2,datmed.sd)},
dat.m <- dat.m + refmed - apply(!is.na(dat),2,datmed.mean)
)
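#(std=1 adjusts the means by additive differences, std=2 by ratios and
# std=3 also adjusts the standard deviations; e.g. with refmed=15.0 and a
# series whose concurrent global-mean average is 14.0, std=1 adds 1.0 to
# its raw mean)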
#- metad==TRUE? Read *_brk.csv and split the series by the break-points
if(metad) {
cat('\n',"Splitting the series following the metadata file...:",'\n')
if(sufbrk=='') fichbrk <- sprintf('%s_%d-%d_brk.csv',varcli,anyi,anyf)
else fichbrk <- sprintf('%s-%s_%d-%d_brk.csv',varcli,sufbrk,anyi,anyf)
brk <- read.csv(fichbrk,colClasses=c('character','character','character'))
if(!is.na(tinc)) brk[,2] <- as.POSIXct(brk[,2],tz=tz)
else brk[,2] <- as.Date(brk[,2])
nbrk <- nrow(brk); nn <- 0
if(nbrk<1) cat("No break-points in the metadata file.",'\n')
else {
for(kb in 1:nbrk) { #for every break:
i <- match(brk[kb,1],est.c[,4]) #series to split
if(is.na(i)) {
cat(sprintf("\nCode %s not found in station list; break skipped\n",brk[kb,1]))
next
}
kp <- match(brk[kb,2],x) #break-point location
#check dates concordancies:
if(is.na(kp)) kp <- which.max(x>brk[kb,2])
if(is.na(tinc)) cat('\n',sprintf("%s(%d) breaks at %s",est.c[i,4],i,format(x[kp])))
else cat('\n',sprintf("%s(%d) breaks at %s",est.c[i,4],i,format(x[kp],tz=tz,usetz=TRUE)))
if(sum(!is.na(dat[1:(kp-1),i])) < mindat) {
dat[1:(kp-1),i] <- NA
cat(" Fragment with less than",mindat,"data DELETED",'\n')
} else if(sum(!is.na(dat[kp:nd,i])) < mindat) {
dat[kp:nd,i] <- NA
cat(" Fragment with less than",mindat,"data DELETED",'\n')
} else {
nn <- nn+1 #increment no. of new series
iest <- c(iest,iest[i]) #add original series index
nsp[iest[i]] <- nsp[iest[i]]+1 #and its no. of breaks
if(nm>0) { #count no. of breaks per year
z <- 1 + floor((kp-1)/nm) #annual break location
nsy[z] <- nsy[z] + 1 #no. of breaks per year
}
dat <- cbind(dat,rep(NA,nd)) #new data column
#move pre-cut data to the new series:
dat[1:(kp-1),ne+nn] <- dat[1:(kp-1),i]
dat[1:(kp-1),i] <- NA #delete pre-cut data
#copy coordinates and add a suffix to station code and name:
z <- data.frame(est.i[iest[i],1:3],paste(est.i[iest[i],4],'-',1+nsp[iest[i]],sep=''),paste(est.i[iest[i],5],'-',1+nsp[iest[i]],sep=''))
names(z) <- names(est.i)
est.c <- rbind(est.c,z)
#adjust means (and std. devs.?) to the fragments:
switch(std,
{ dat.m[i] <- mean(dat[,i],na.rm=TRUE) + refmed - mean(datmed[!is.na(dat[,i])])
dat.m <- c(dat.m, mean(dat[,ne+nn],na.rm=TRUE)+refmed-mean(datmed[!is.na(dat[,ne+nn])])) },
{ dat.m[i] <- mean(dat[,i],na.rm=TRUE) * refmed / mean(datmed[!is.na(dat[,i])])
dat.m <- c(dat.m, mean(dat[,ne+nn],na.rm=TRUE)*refmed/mean(datmed[!is.na(dat[,ne+nn])])) },
{ dat.m[i] <- mean(dat[,i],na.rm=TRUE) + refmed - mean(datmed[!is.na(dat[,i])])
dat.m <- c(dat.m, mean(dat[,ne+nn],na.rm=TRUE)+refmed-mean(datmed[!is.na(dat[,ne+nn])]))
dat.s[i] <- sd(dat[,i],na.rm=TRUE) + refstd - sd(datmed[!is.na(dat[,i])])
dat.s <- c(dat.s, sd(dat[,ne+nn],na.rm=TRUE)+refstd-sd(datmed[!is.na(dat[,ne+nn])])) },
{ dat.m[i] <- mean(dat[,i],na.rm=TRUE) + refmed - mean(datmed[!is.na(dat[,i])])
dat.m <- c(dat.m, mean(dat[,ne+nn],na.rm=TRUE)+refmed-mean(datmed[!is.na(dat[,ne+nn])])) }
)
}
}
cat('\n\n',"Update number of series:",ne,'+',nn,'= ')
ne <- ne + nn #update no. of stations
cat(ne,'\n\n')
refhom <- substr(est.c[,4],1,1)=='*' #update homogeneous references
}
inht <- c(0,0) #go to missing data filling
}
delout <- numeric() #initialize list of deleted standardized anomalies
#- for (ks in 1:3) #(tests on windows, on whole series, and missing data filling)
for (ks in 1:3) { #for every stage:
if(ks<3) {
inh <- inht[ks] #inht threshold for stage ks
if(inh==0) next #skip stage if inh==0
}
if(is.na(cumc)) { cat('\n\n==========',"STAGE",ks)
switch(ks,
cat(sprintf(" (%s on overlapping temporal windows)",inhtest),'===========\n\n'),
cat(sprintf(" (%s on the whole series)",inhtest),'=======================\n\n'),
cat(" (Final calculation of all missing data)",'==========\n\n')
)
} else cat('\n======',sprintf("Disaggregating daily precipitation accumulations (coded as %d)",cumc),'\n')
#- compute weight matrix? (depends on the stage)
if(ks==1) zz <- TRUE else if(wd[ks]!=wd[ks-1]) zz <- TRUE else zz <- FALSE
if(zz) {
est.w <- matrix(1,nei,nei) #weight matrix
if(wd[ks]>0) { #weights different from 1. Denoting wd as h and the
# distance between stations as d, weights are computed as 1/(1+d2/h2),
# reformulated as h2/(h2+d2)
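#(e.g., with wd=100 km a reference 100 km away gets weight
# 100^2/(100^2+100^2)=0.5 and one 300 km away gets 100^2/(100^2+300^2)=0.1)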
cat("Computing inter-station weights...")
wd2 <- wd[ks]*wd[ks]
for(i in 1:(nei-1)) {
for(j in (i+1):nei) {
est.w[i,j] <- wd2/(wd2+est.d[i,j]*est.d[i,j])
est.w[j,i] <- est.w[i,j] #symmetric matrix
}
}
cat(" (done)",'\n\n')
}
}
#- if(graphics), issue page indicating the stage
if(graphics) {
plot(-1:1,-1:1,type='n',xaxt='n',yaxt='n',bty='n',xlab='',ylab='')
text(0,0.4,paste("Stage",ks),cex=3)
if(ks==1) text(0,-0.3,sprintf("Binary splits on %d terms\nstepped windows with\nstd=%d, %s>%d\nand wd=%d km",round(swa),std,inhtest,round(inh),round(wd[ks])),cex=2.2)
else if(ks==2) text(0,-0.3,sprintf("Binary splits on\nwhole series with\nstd=%d, %s>%d\nand wd=%d km",std,inhtest,round(inh),round(wd[ks])),cex=2.5)
else text(0,-0.3,sprintf("Final anomalies of the\nhomogenized series with\nwd = %d km and nref = %d",round(wd[ks]),nref[ks]),cex=2)
}
#stage dependent values:
if(ks==3) aref <- TRUE else aref <- FALSE
nrefk <- nref[ks]
#- repeat until no series is split
repeat {
#---------- Compute anomalies and delete anomalous data:
#- initialize matrices dat.z|e|c oneref anom sanom mindist nrefs used
dat.z <- matrix(NA,nd,ne) #observed data (normalized)
dat.e <- matrix(NA,nd,ne) #estimated data (normalized)
dat.c <- matrix(NA,nd,ne) #calculated data (estimated)
oneref <- matrix(FALSE,nd,ne) # 1 reference only?
anom <- matrix(NA,nd,ne) #anomalies
sanom <- matrix(NA,nd,ne) #standardized anomalies
mindist <- matrix(NA,nd,ne) #minimum distances
nrefs <- matrix(NA,nd,ne) #no. of references
used <- matrix(FALSE,ne,ne) #flags of used stations
dat.d <- dat #working copy of the data
dat.na <- is.na(dat.d) #missing data index
#- if there are time steps without any data, issue a warning and finish
numdat <- apply(!dat.na,1,sum)
nmin=min(numdat)
if(nmin==0) {
cat('\n',"There are terms with NO DATA!:",'\n')
for(j in which(numdat==0)) cat(format(x[j]),'\n')
stop("Cannot continue! Shorten the study period, add series with data in the empty terms, or be more tolerant to outliers.")
}
#use any existing former means and standard deviations:
if(exists('dat.m0')) {
dat.m <- dat.m0
if(std==3) dat.s <- dat.s0
}
#- normalize dat.d (obtain dat.z)
switch(std,
for(ke in 1:ne) dat.z[,ke] <- dat.d[,ke]-dat.m[ke], #std=1
for(ke in 1:ne) dat.z[,ke] <- dat.d[,ke]/dat.m[ke], #std=2
for(ke in 1:ne) dat.z[,ke]<-(dat.d[,ke]-dat.m[ke])/dat.s[ke],#std=3
dat.z <- dat.d
)
#- ite=0 and loop until estimated data converge
#iterative process for estimating the means of the series:
ite <- 0
if(expl) cat('\n',"Calculation of missing data",'\n')
else if(is.na(cumc)) {
cat('\n',"Calculation of missing data with outlier removal",'\n')
cat("(Suggested data replacements are provisional)",'\n')
}
if(length(dat)>10000000) cat("This process may take a very long time (many days)",'\n')
else if(length(dat)>1000000) cat("This process may take a long time (many hours)",'\n')
if(is.na(cumc)) {
if(ks==3 & !expl) cat('\n',"The following lines will have one of these formats:",'\n')
cat(" Station(rank) Date: Observed -> Suggested (Anomaly, in std. devs.)",'\n')
if(ks==3) cat(" Iteration Max_data_difference (Station_code)",'\n')
}
maxddif0 <- 99999. #initialize the maximum data difference
repeat {
ite <- ite+1
#- ite+=1 and estimate series (dat.e|c) from their neighbors
# update used, nrefs and mindist:
for(i in 1:ne) { #for every station
if(refhom[i]) next #skip trusted series
ik <- iest[i] #original station index
for(j in 1:nd) { #for every data
se <- 0
sw <- 0
nr <- 0
trusted <- FALSE
for(ir in 1:nei) { #for every station (possible reference)
kr <- est.p[ik,ir]
krf <- which(iest==kr) #fragments of the reference
k <- which(!dat.na[j,krf]) #which one has observation?
if(length(k)!=1) next #no fragment with observation
k <- krf[k] #index of the fragment with observation
if(i==k) next #it is the same station
nr <- nr+1 #no. of references
used[i,k] <- TRUE #flag used station
#minimum distance to the nearest data:
if(nr==1) mindist[j,i] <- max(est.d[ik,kr],1)
w <- est.w[ik,kr]
se <- se + w * dat.z[j,k]
sw <- sw + w
if(refhom[ir]) trusted <- TRUE
if(nr>=nrefk) break #if maximum no. of references, finish loop
}
if(!nr) { #no reference!
dat.e[j,i] <- dat.z[j,i] #keep the original data
nrefs[j,i] <- NA
} else {
nrefs[j,i] <- nr
#with only one untrusted reference, flag to avoid modifications:
if(nr==1 & !is.na(oneref[j,i]) & !trusted) oneref[j,i] <- TRUE
#avoid negative estimations if std=2 (precipitation, etc):
if(std==2 & se<0) se <- 0
dat.e[j,i] <- se / sw #estimated data (normalized)
}
}
}
#change any NaN to NA. (It may happen with std=2):
n <- sum(is.nan(dat.e))
if(n>0) {
cat(n,"NaN's in dat.e ! (changing them to NA's...)",'\n')
dat.e[is.nan(dat.e)] <- NA
}
#calculate values unnormalizing dat.e:
switch(std,
for(ke in 1:ne) dat.c[,ke] <- dat.e[,ke]+dat.m[ke], #std=1
for(ke in 1:ne) dat.c[,ke] <- dat.e[,ke]*dat.m[ke], #std=2
for(ke in 1:ne) dat.c[,ke]<-dat.e[,ke]*dat.s[ke]+dat.m[ke], #std=3
dat.c <- dat.e
)
#if cumc was set, disaggregate data, save them in *.dat and finish:
if(!is.na(cumc)) {
for(ke in 1:ne) { #for every station
cumvl <- length(cumv[[ke]]) #no. of accumulations
if(cumvl==0) next #series without accumulations
for(j in 1:cumvl) {
#an embedded missing datum? join the accumulations:
if(is.na(cumv[[ke]][j])) {
if(j<cumvl & cumA[[ke]][j+1]==cumB[[ke]][j]+1)
cumA[[ke]][j+1] <- cumA[[ke]][j]
next
}
sumc <- sum(dat.c[cumA[[ke]][j]:cumB[[ke]][j],ke])
#if estimated data sum is 0, assign zeros and keep the
#accumulated value in the reported day:
if(sumc==0.0) {
dat[cumA[[ke]][j]:cumB[[ke]][j],ke] <- 0
dat[cumB[[ke]][j],ke] <- cumv[[ke]][j]
next #go to next accumulation
}
prop <- cumv[[ke]][j]/sumc #ratio accumulation/estimation_sum
zc <- dat.c[cumA[[ke]][j]:cumB[[ke]][j],ke] * prop
zk <- which.max(zc) #location of the maximum estimation
zc <- round(zc,ndec)
zd <- cumv[[ke]][j] - sum(zc) #rounding difference
#add difference to the highest value:
if(zd!=0.0) zc[zk] <- zc[zk]+zd
dat[cumA[[ke]][j]:cumB[[ke]][j],ke] <- zc
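#(e.g. an accumulation of 12.0 reported after three days with estimates
# 2, 3 and 5 gives prop=1.2 and disaggregated values 2.4, 3.6 and 6.0)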
}
}
#save new input data and finish:
fcum <- sprintf('%s-cum_%d-%d',varcli,anyi,anyf) #accum.file basename
file.rename(sprintf('%s.dat',fbas),sprintf('%s.dat',fcum))
write(dat,sprintf('%s.dat',fbas))
file.rename(sprintf('%s.est',fbas),sprintf('%s.est',fcum))
write.table(est.c,sprintf('%s.est',fbas),row.names=FALSE,col.names=FALSE)
cat('\n',"Accumulated values have been distributed among the previous days and rewritten",'\n')
cat("as new input files. Original input files have been renamed to",'\n')
cat(fcum,'.dat'," and ",fcum,'.est\n\n',sep='')
cerrar(graphics); return(invisible()) #finish
}
#- anomalies calculation (anom, sanom) and outlier deletion
anom <- dat.z-dat.e #anomalies
anom[dat.na] <- NA #forget anomalies of estimated data
#normalize anomalies:
anomm <- apply(anom,2,mean,na.rm=TRUE) #mean anomalies
anoms <- apply(anom,2,sd,na.rm=TRUE) #std. dev. of anomalies
for(i in 1:ne) sanom[,i] <- (anom[,i]-anomm[i])/anoms[i]
if(!expl) { #delete outliers:
if(ite==1 & sum(dz.max)<=1) { #set dz.* limits
z <- c(sanom) #normalized anomalies vector
dz.maxd <- quantile(z,probs=1-dz.max/100,na.rm=TRUE) #max.delete
dz.maxw <- quantile(z,probs=1-dz.max/10,na.rm=TRUE) #max.warning
dz.mind <- quantile(z,probs=dz.max/100,na.rm=TRUE) #min.delete
dz.minw <- quantile(z,probs=dz.max/10,na.rm=TRUE) #min.warning
dz.max <- 99 #do not repeat this in every stage
}
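#(e.g. dz.max=0.01 deletes data with anomalies outside the 0.0001 and
# 0.9999 quantiles and flags as suspect those outside 0.001 and 0.999)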
elim <- sanom < dz.mind | sanom > dz.maxd #anomalies to delete
elim[is.na(elim)] <- FALSE #remove any NA
elim[,refhom] <- FALSE #keep trusted series
nelim <- sum(elim) #no. of data to delete
if(nelim>0) { #delete the anomalous original data
#list anomalous data to be deleted:
for(i in 1:ne) {
for(j in 1:nd) if(elim[j,i] & !is.na(oneref[j,i])) {
outan[j,iest[i]] <- sanom[j,i] #save the outlier anomaly
do <- dat.d[j,i] #original data
dc <- dat.c[j,i] #calculated data
cat(sprintf('%s(%d) %s',est.c[i,4],i,format(x[j])))
cat(': ',do,' -> ',round(dc,ndec),' (',round(sanom[j,i],2),
')',sep='')
#do not delete with only one untrusted reference!:
if(oneref[j,i] & nrefk>1) {
cat(" Only 1 reference! (Unchanged)")
elim[j,i] <- FALSE
}
else { #write in Fout with flag 1
write(c(est.c[iest[i],4],format(x[j]),round(do,ndec),
round(dc,ndec),round(sanom[j,i],2),1),Fout,ncolumns=6,sep=',')
delout <- c(delout,sanom[j,i])
}
cat('\n')
}
}
dat[elim] <- NA #delete anomalous data
dat.na[elim] <- TRUE #update missing data flags
maxddif0 <- 99999. #reset to avoid fake convergence break
}
else if(!aref) cat("(No detected outliers)",'\n')
}
#list suspect values? :
if(ks==3 & !expl & (!is.na(dz.maxw) | !is.na(dz.minw))) {
#suspect values but not big outliers:
susp <- (sanom < dz.minw & sanom >= dz.mind) |
(sanom > dz.maxw & sanom <= dz.maxd)
susp[is.na(susp)] <- FALSE #remove any NA
susp[,refhom] <- FALSE #unflag trusted series
nsusp <- sum(susp) #no. of suspect data
if(nsusp>0) { #list suspect observations:
for(i in 1:ne) {
for(j in 1:nd) if(susp[j,i] & !is.na(oneref[j,i])) {
do <- dat.d[j,i] #original data
dc <- dat.c[j,i] #calculated data
#unflag suspect data with only one untrusted reference!:
if(oneref[j,i] & nrefk>1) susp[j,i] <- FALSE
else { #write in Fout with flag 0
write(c(est.c[iest[i],4],format(x[j]),round(do,ndec),
round(dc,ndec),round(sanom[j,i],2),0),Fout,ncolumns=6,sep=',')
}
}
}
}
}
#- missing data filling
dat.d[dat.na] <- dat.c[dat.na]
if(ite>1) {
ddif <- dat.d-dat.d0 #data differences from previous iteration
maxddif <- max(abs(ddif),na.rm=TRUE) #max. data difference
kmaxdif <- which.max(abs(ddif)) #max. dat. dif. location
kmaxest <- ceiling(kmaxdif/nd) #max. dat. dif. station
}
dat.d0 <- dat.d #data copy
#- update dat.m|s|z
dat.m <- apply(dat.d,2,mean,na.rm=TRUE)
if(std==3) dat.s <- apply(dat.d,2,sd,na.rm=TRUE)
switch(std,
for(ke in 1:ne) dat.z[,ke] <- dat.d[,ke]-dat.m[ke], #std=1
for(ke in 1:ne) dat.z[,ke] <- dat.d[,ke]/dat.m[ke], #std=2
for(ke in 1:ne) dat.z[,ke]<-(dat.d[,ke]-dat.m[ke])/dat.s[ke], #std=3
dat.z <- dat.d
)
#- if(!aref) break (no need to refine missing data until the end)
if(!aref) break
#- if(ite>1) and convergence reached or broken, break loop
if(ite>1) {
cat(ite,' ',round(maxddif,ndec+2),' (',est.c[kmaxest,4],')','\n',sep='')
if(maxddif>maxddif0) {
cat("Data convergence broken",'\n\n')
break
}
if(maxddif<=maxdif) {
cat("Prescribed convergence reached",'\n\n')
break
}
if(ite==maxite) {
cat('\n',"Average calculation skipped after",ite,"iterations",'\n')
break
}
maxddif0 <- maxddif #save max. dat. dif. for next iteration
}
}
#- save dat.m|s in dat.m0|s0
dat.m0 <- dat.m #copy of the means
if(std==3) dat.s0 <- dat.s #copy of std. deviations
oneref[is.na(oneref)] <- TRUE #reset oneref
#- if(aref==TRUE) repeat missing data filling with self-correction
if(aref==TRUE) {
cat("Last series readjustment (please, be patient...)",'\n')
#- obtain estimated series (dat.e, dat.c) from their neighbors
#- and update used[ne,ne], nrefs[nd,ne] and mindist[ne,ne]:
for(i in 1:ne) { #for every station
ik <- iest[i] #index of original station
for(j in 1:nd) { #for every data
se <- 0
sw <- 0
nr <- 0
trusted <- FALSE
for(ir in 1:nei) { #for every station (possible reference)
kr <- est.p[ik,ir]
krf <- which(iest==kr) #fragments of the reference
k <- which(!dat.na[j,krf]) #which one has observation?
if(length(k)!=1) next #no fragment with observation
k <- krf[k] #index of the fragment with the observation
if(i==k) next #same station
nr <- nr+1 #no. of references
used[i,k] <- TRUE #flag used station
#minimum distance to the nearest data:
if(nr==1) mindist[j,i] <- max(est.d[ik,kr],1)
w <- est.w[ik,kr]
se <- se + w * dat.z[j,k]
sw <- sw + w
if(refhom[ir]) trusted <- TRUE
#if maximum no. of references or self-reference, finish:
if(nr>=nrefk | (aref & ir==1)) break
}
if(!nr) { #no reference!
dat.e[j,i] <- dat.z[j,i] #keep observation
nrefs[j,i] <- NA
} else {
nrefs[j,i] <- nr
#with only one untrusted reference, flag to avoid modifications:
if(nr==1 & !is.na(oneref[j,i]) & !trusted) oneref[j,i] <- TRUE
#avoid negative estimations if std=2 (precipitation, etc):
if(std==2 & se<0) se <- 0
dat.e[j,i] <- se / sw #estimated data (normalized)
}
}
}
#change any NaN to NA. (It may happen with std=2):
n <- sum(is.nan(dat.e))
if(n>0) {
cat(n,"NaN's in dat.e ! (changing them to NA's...)",'\n')
dat.e[is.nan(dat.e)] <- NA
}
#calculate values unnormalizing dat.e:
switch(std,
for(ke in 1:ne) dat.c[,ke] <- dat.e[,ke]+dat.m[ke], #std=1
for(ke in 1:ne) dat.c[,ke] <- dat.e[,ke]*dat.m[ke], #std=2
for(ke in 1:ne) dat.c[,ke]<-dat.e[,ke]*dat.s[ke]+dat.m[ke],#std=3
dat.c <- dat.e
)
#- missing data filling
dat.d[dat.na] <- dat.c[dat.na]
if(!is.na(vmax)) dat.d[dat.d > vmax] <- vmax
if(!is.na(vmin)) dat.d[dat.d < vmin] <- vmin
#- update dat.m|s|z
dat.m <- apply(dat.d,2,mean,na.rm=TRUE)
if(std==3) dat.s <- apply(dat.d,2,sd,na.rm=TRUE)
switch(std,
for(ke in 1:ne) dat.z[,ke] <- dat.d[,ke]-dat.m[ke], #std=1
for(ke in 1:ne) dat.z[,ke] <- dat.d[,ke]/dat.m[ke], #std=2
for(ke in 1:ne) dat.z[,ke]<-(dat.d[,ke]-dat.m[ke])/dat.s[ke], #std=3
dat.z <- dat.d
)
}
#- calculate final values of the anomalies (anom, sanom)
anom <- dat.z-dat.e #anomalies
anom[dat.na] <- NA #forget anomalies of estimated data
anomm <- apply(anom,2,mean,na.rm=TRUE) #mean anomalies
anoms <- apply(anom,2,sd,na.rm=TRUE) #std. dev. of anomalies
for(i in 1:ne) sanom[,i] <- (anom[,i]-anomm[i])/anoms[i]
#- ----------- Shift detection/correction (binary split):
#- if(ks>2) break (only missing data filling in the last stage)
if(ks>2) break
#split series when maximum test is higher than inht:
nn <- 0 #initialize no. of new series
splt <- rep(0,ne) #initialize tV of split series
modif <- FALSE #initialize flag of series modification
cat('\n',"Performing shift analysis on the",ne,"series...",'\n')
y <- sanom #normalized anomalies
y[is.na(nrefs)] <- NA #remove anomalies without reference
y[,refhom] <- NA #remove anomalies of trusted series
if(ks==1) tkx <- apply(y,2,wtest,swa,sts,test)
else tkx <- apply(y,2,test,sts)
tVx <- tkx[1,]; kpx<- as.integer(tkx[2,])
#- split series in decreasing order of tVx, but not if their references
# were recently split with a tVx close to the overall maximum:
if(sum(!is.na(tVx))==0) tVxx <- 0 else tVxx <- max(tVx,na.rm=TRUE)
while(tVxx > inh) {
i <- which.max(tVx) #series with maximum tVx
#if i used references recently split with a too high tVx, break to recompute:
if(max(splt[used[i,]])>tVxx*(1+.05*min(nr,sum(used[i,])))) break
kp <- kpx[i] #location of tVx in series i
if(oneref[kp,i] & nrefk>1) { #avoid split with only one reference
tVx[i] <- -1 #set this series tVx to -1
tVxx <- max(tVx,na.rm=TRUE) #maximum tVx of remaining series
next
}
cat('\n',sprintf("%s(%d) breaks at %s (%.1f)",est.c[i,4],i,
format(x[kp]),tVx[i]))
write(sprintf('%s,%s,%.1f',est.c[iest[i],4],format(x[kp]),tVx[i]),
Fbrk,ncolumns=3)
#graphic of anomalies with split location:
if(graphics) {
y <- sanom[,i] #vector of anomalies of this series
ylab="Standardized anomalies (observed - computed)"
tit <- sprintf('%s %d (%s)\n%s',varcli,i,est.c[i,4],est.c[i,5])
plot(x,y,type='h',lwd=lw,ylim=c(-5,5),main=tit,xlab="Time",
ylab=ylab,col=hsv(.7,1,.9))
z <- class(x)
if(z[1]=='Date' | z[1]=='POSIXct') {
grid(NA,NULL,col=grdcol)
if(z[1]=='Date') abline(v=axis.Date(1,x),lty=3,col=grdcol)
else if(z[1]=='POSIXct') abline(v=axis.POSIXct(1,x),lty=3,
col=grdcol)
} else grid(col=grdcol)
abline(-3,0,lty=3,col=grdcol); abline(-5,0,lty=3,col=grdcol)
lines(x,log10(nrefs[,i])-5,col='orange2')
lines(x,log10(mindist[,i])-5,col=verde)
mtext(' 1',4,las=1,adj=0,at=-5,col=verde)
mtext(' 10',4,las=1,adj=0,at=-4,col=verde)
mtext(' 100',4,las=1,adj=0,at=-3,col=verde)
mtext("min.d.",4,las=1,adj=0,at=-5.4,col=verde)
mtext(' (km)',4,las=1,adj=0,at=-2,col=verde)
mtext("n.ref.",4,las=1,adj=0,at=-5.8,col='orange2')
lines(rep(x[kp],2),c(-5,4.8),col='red',lty=2) #show split location
text(x[kp],5,floor(tVxx))
}
#count no. of splits per year
z <- as.integer(strftime(x[kp],'%Y'))-anyi+1 #year of the split
nsy[z] <- nsy[z] + 1 #no. of splits per year
#split series at the break point:
nd1 <- sum(!is.na(dat[1:(kp-1),i])) #no. of data of fragment 1
nd2 <- sum(!is.na(dat[kp:nd,i])) #no. of data of fragment 2
if(nd1 < mindat & nd2 < mindat) stop('\n',sprintf("Both fragments have less than %d data. Please, remove this series\nfrom the input files or decrease the mindat parameter.",mindat))
del <- 0 #flag of deleted fragments
if(std==2) {
md1 <- mean(dat[1:(kp-1),i],na.rm=TRUE)
md2 <- mean(dat[kp:nd,i],na.rm=TRUE)
} else md1 <- md2 <- 1
if(nd1 < mindat) {
dat[1:(kp-1),i] <- NA; del <- del + 1
cat(" Fragment with less than",mindat,"data DELETED")
}
if(nd2 < mindat) {
dat[kp:nd,i] <- NA; del <- del + 1
cat(" Fragment with less than",mindat,"data DELETED")
}
if(md1==0) {
dat[1:(kp-1),i] <- NA; del <- del + 1
cat(" Fragment with zero mean DELETED")
} else if(md2==0) {
dat[kp:nd,i] <- NA; del <- del + 1
cat(" Fragment with zero mean DELETED")
}
if(del==0) { #if both fragments are kept:
nn <- nn+1 #update no. of new series
iest <- c(iest,iest[i]) #add index of original series
nsp[iest[i]] <- nsp[iest[i]]+1 #update no. of shifts
dat <- cbind(dat,rep(NA,nd)) #new data column
#move pre-cut data to the new series:
dat[1:(kp-1),ne+nn] <- dat[1:(kp-1),i]
dat[1:(kp-1),i] <- NA #delete pre-cut data
#copy coordinates and add a suffix to station code and name:
z <- data.frame(est.i[iest[i],1:3],paste(est.i[iest[i],4],'-',1+nsp[iest[i]],sep=''),paste(est.i[iest[i],5],'-',1+nsp[iest[i]],sep=''))
names(z) <- names(est.i)
est.c <- rbind(est.c,z)
#adjust means (and std. devs.?) to the fragments:
switch(std,
{ dat.m0[i] <- mean(dat[,i],na.rm=TRUE) + refmed - mean(datmed[!is.na(dat[,i])])
dat.m0 <- c(dat.m0, mean(dat[,ne+nn],na.rm=TRUE)+refmed-mean(datmed[!is.na(dat[,ne+nn])])) },
{ dat.m0[i] <- mean(dat[,i],na.rm=TRUE) * refmed / mean(datmed[!is.na(dat[,i])])
dat.m0 <- c(dat.m0, mean(dat[,ne+nn],na.rm=TRUE)*refmed/mean(datmed[!is.na(dat[,ne+nn])])) },
{ dat.m0[i] <- mean(dat[,i],na.rm=TRUE) + refmed - mean(datmed[!is.na(dat[,i])])
dat.m0 <- c(dat.m0, mean(dat[,ne+nn],na.rm=TRUE)+refmed-mean(datmed[!is.na(dat[,ne+nn])]))
dat.s0[i] <- sd(dat[,i],na.rm=TRUE) + refstd - sd(datmed[!is.na(dat[,i])])
dat.s0 <- c(dat.s0, sd(dat[,ne+nn],na.rm=TRUE)+refstd-sd(datmed[!is.na(dat[,ne+nn])])) },
{ dat.m0[i] <- mean(dat[,i],na.rm=TRUE) + refmed - mean(datmed[!is.na(dat[,i])])
dat.m0 <- c(dat.m0, mean(dat[,ne+nn],na.rm=TRUE)+refmed-mean(datmed[!is.na(dat[,ne+nn])])) }
)
}
#update tVx and flags for next loop:
modif <- TRUE #flag of modified series
splt[i] <- tVx[i] #split tV of series i
tVx[i] <- 0 #set its tVx to 0
tVxx <- max(tVx,na.rm=TRUE) #maximum tVx of remaining series
}
if(nn) {
cat('\n\n',"Update number of series:",ne,'+',nn,'= ')
ne <- ne+nn #update no. of series
cat(ne,'\n')
refhom <- c(refhom,rep(FALSE,nn)) #update refhom vector
}
#- No new splits? histograms of tVx and breaks
if(!nn & !modif) {
if(graphics) {
#histogram of global tVx (without unrealistic zeros):
z <- tVx[!is.na(tVx) & tVx>0]
main <- sprintf("Histogram of maximum %s (Stage %d)",inhtest,ks)
if(sum(!is.na(z))) hist(z,breaks=20,xlab=inhtest,col='purple',main=main)
if(ks==2 | inh<1) {
#histogram of no. of splits per station
#(too small fragments are not counted, although they will be
# reported in the final break list):
hist(nsp,breaks=0:max(9,max(nsp)+1)-.5,col='orange2',xlab="Number of splits",ylab="Number of stations",main="Number of splits per station")
#no. of splits per year:
w <- min(5,ceiling(400/na)) #adapt bar width
plot(anyi:anyf,nsy,type='h',lwd=w,col=2,ylim=c(0,max(10,max(nsy))),xlab="Years",ylab="Number of splits",main="Number of splits per year")
grid(col=grdcol)
}
}
#list of possible splits with only one reference:
z <- which(tVx<0)
if(length(z)>0) {
cat("Series that could break but had only one reference:",'\n')
print(est.c[z,4])
}
break #break loop and go to next stage
}
}
}
#------------ End of the three homogenization stages --------------
#RMSE of estimated data:
z <- dat.c; zo <- dat.o[,iest]; zo[dat.na] <- NA
rmse <- apply((z-zo)^2,2,function(x) sqrt(mean(x,na.rm=TRUE)))
#- graphics of anomalies of the homogenized series
# (with maximum tVx, grouped by original series):
tVx <- rep(NA,ne) #(save final tVx in overlapping windows)
inhx <- rep(NA,ne) #(save final tVx in complete series)
for(io in 1:nei) { #for every original series
wi <- which(iest==io) #series derived from station io
lwi <- length(wi)
for(i in wi) { #for every derived series
y <- sanom[,i] #standardized anomalies of the series
if(graphics) {
ylab="Standardized anomalies (observed - computed)"
tit <- sprintf('%s %d (%s)\n%s',varcli,i,est.c[i,4],est.c[i,5])
plot(x,y,type='h',lwd=lw,ylim=c(-5,5),main=tit,xlab="Time",ylab=ylab,col=hsv(.7,1,.9))
z <- class(x)
if(z[1]=='Date' | z[1]=='POSIXct') {
grid(NA,NULL,col=grdcol)
if(z[1]=='Date') abline(v=axis.Date(1,x),lty=3,col=grdcol)
else if(z[1]=='POSIXct') abline(v=axis.POSIXct(1,x),lty=3,col=grdcol)
} else grid(col=grdcol)
abline(-3,0,lty=3,col=grdcol); abline(-5,0,lty=3,col=grdcol)
lines(x,log10(nrefs[,i])-5,col='orange2')
lines(x,log10(mindist[,i])-5,col=verde)
mtext(' 1',4,las=1,adj=0,at=-5,col=verde)
mtext(' 10',4,las=1,adj=0,at=-4,col=verde)
mtext(' 100',4,las=1,adj=0,at=-3,col=verde)
mtext("min.d.",4,las=1,adj=0,at=-5.4,col=verde)
mtext(' (km)',4,las=1,adj=0,at=-2,col=verde)
mtext("n.ref.",4,las=1,adj=0,at=-5.8,col='orange2')
}
#apply wtest and flag its maximum tV (if >=1):
st <- wtest(y,swa,sts,test); tVx[i] <- st[1]; zz <- floor(st[1])
if(zz) {
kp <- as.integer(st[2])
if(graphics) {
lines(rep(x[kp],2),c(-5,4.8),col=verde,lty=2) #flag maximum wtest
text(x[kp],5,zz,col=verde) #value
}
}
#apply test and flag its maximum:
st <- eval(call(test,y,sts))
inhx[i] <- round(st[1],1); zz <- floor(st[1])
if(!is.na(zz) & zz & graphics) {
kp <- round(st[2])
lines(rep(x[kp],2),c(-5,4.8),lty=4) #flag tVx (maximum test)
text(x[kp],-5.2,zz) #value
}
}
}
#homogenized series:
dah <- round(dat.d,ndec) #round to the requested no. of decimals
#- graphics of homogenized series and their corrections
if(graphics) {
plot(-1:1,-1:1,type='n',xaxt='n',yaxt='n',bty='n',xlab='',ylab='')
text(0,0.4,"Final graphics",cex=3)
text(0,-0.3,"Adjusted series and\napplied corrections",cex=2.2)
if(nm>0) xlab <- "Years" else xlab <- "Dates"
layout(matrix(1:2,2,1,byrow=TRUE))
#filters to get annual values:
ndpy <- round(nd/na) #no. of data per year
if(nd<=120 | nd<ndpy*3) { fltr=1; ylabd <- "Data" }#few data? avoid filter
else {
fltr <- rep(1,ndpy)
if(annual=='sum') ylabd <- "Running annual totals"
else {
ylabd <- "Running annual means"
fltr <- fltr/ndpy
}
}
if(!is.na(uni)) ylabd <- sprintf('%s (%s)',ylabd,uni)
for(i in 1:nei) { #for every original station
wi <- which(iest==i) #derived series of station i
lwi <- length(wi)
if(lwi>1) vi <- TRUE else vi <- FALSE
#filters for annual values
tit <- sprintf('%s %d (%s)\n%s',varcli,i,est.c[i,4],est.c[i,5])
yo <- as.vector(dat.o[,i]) #original observations
y <- dah[,wi] #homogenized data
par(mar=c(0,4,4,2),xaxt='n',cex=cex,cex.axis=cex,cex.lab=cex)
yf <- stats::filter(y,fltr)
ylim <- c(floor(min(yf,na.rm=TRUE)),ceiling(max(yf,na.rm=TRUE)))
#(avoid matplot because of its poor handling of dates on the X axis)
plot(x,stats::filter(yo,fltr),type='n',ylim=ylim,ylab=ylabd,main=tit)
matlines(x,yf,lty=1,col=2:20)
lines(x,stats::filter(yo,fltr)) #redraw observations line
par(xaxt='s')
z <- class(x)
if(z[1]=='Date' | z[1]=='POSIXct') {
grid(NA,NULL,col=grdcol)
if(z[1]=='Date') abline(v=axis.Date(1,x,tick=FALSE,labels=FALSE),
lty=3,col=grdcol)
else if(z[1]=='POSIXct') abline(v=axis.POSIXct(1,x,tick=FALSE,
labels=FALSE),lty=3,col=grdcol)
} else grid(col=grdcol)
par(mar=c(5,4,0,2))
#corrections:
if(std==2) {
yo[yo==0] <- NA; y[y==0] <- NA
yd <- y/yo; ylab <- "Correction factors"
} else {
yd <- y-yo; ylab <- "Correction terms"
if(!is.na(uni)) ylab <- sprintf('%s (%s)',ylab,uni)
}
ylim <- c(floor(min(yd,na.rm=TRUE)),ceiling(max(yd,na.rm=TRUE)))
if(std==2 & ylim[2]<=2) ylim <- c(0,2)
if(std!=2 & ylim[1]>=-1 & ylim[2]<=1) ylim <- c(-1,1)
if(vi) plot(x,yd[,1],type='n',ylim=ylim,ylab=ylab,xlab='Time')
else plot(x,yd,type='n',ylim=ylim,ylab=ylab,xlab='Time')
matlines(x,yd,type='l',lty=1,col=2:20,xaxt='n')
z <- class(x)
if(z[1]=='Date' | z[1]=='POSIXct') {
grid(NA,NULL,col=grdcol)
if(z[1]=='Date') abline(v=axis.Date(1,x),lty=3,col=grdcol)
else abline(v=axis.POSIXct(1,x),lty=3,col=grdcol)
} else grid(col=grdcol)
}
par(my.par)
par(las=1,cex=cex,cex.axis=cex,cex.lab=cex)
}
close(Fout)
if(sum(inht)>0) cat('\n',"======== End of the homogenization process, after ",sep='')
else cat('\n',"======== End of the missing data infilling process, after ",sep='')
cat(format(round(Sys.time()-time1,2)),'\n')
cat('\n-----------',"Final calculations",':\n')
#inhtest in each series:
if(inhtest=='SNHT') cat('\n',"SNHT: Standard normal homogeneity test (on anomaly series)",'\n')
else cat('\n',"CucT: Cucconi test (on anomaly series)",'\n')
print(summary(round(inhx,1)))
#RMSE of estimated data (unnormalized):
cat('\n',"RMSE: Root mean squared error of the estimated data",'\n')
zz <- summary(rmse)
print(zz)
sedec <- max(1,2-ceiling(log10(zz[4]))) #no. of decimals of RMSE
rmse <- round(rmse,sedec) #round RMSE
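#(e.g. a mean RMSE of 0.8 gives sedec=2 decimals, while 25.0 gives 1)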
pod <- floor(100*(nd-apply(dat.na,2,sum))/nd) #percentage of original data
cat('\n',"POD: Percentage of original data",'\n')
print(summary(pod))
#- print summary of results
cat('\n')
df <- data.frame(SNHT=inhx,RMSE=rmse,POD=pod,Code=est.c[,4],Name=est.c[,5])
names(df)[1] <- inhtest
print(df,right=FALSE); cat('\n')
cat("Frequency distribution tails of residual anomalies and",inhtest,'\n\n')
cat("Left tail of standardized anomalies:",'\n')
print(round(quantile(sanom,probs=c(.001,.002,.005,.01,.02,.05,.1),na.rm=TRUE),1))
cat("Right tail of standardized anomalies:",'\n')
print(round(quantile(sanom,probs=c(.9,.95,.98,.99,.995,.998,.999),na.rm=TRUE),1))
cat(sprintf("Right tail of %s on windows of %d terms with up to %d references:\n",inhtest,2*swa,nref[3]))
print(round(quantile(tVx,probs=c(.9,.95,.98,.99,.995,.998,.999),na.rm=TRUE),1))
cat(sprintf("Right tail of %s with up to %d references:\n",inhtest,nref[3]))
print(round(quantile(inhx,probs=c(.9,.95,.98,.99,.995,.998,.999),na.rm=TRUE),1))
#add new columns to the stations table (percentage of original data,
# inhx and RMSE):
est.c <- cbind(est.c,pod,inhx,rmse)
#round input data for graphics and final results
dat <- round(dat,ndec)
#- if(graphics), plot last graphics
if(graphics) {
#histogram of the anomalies (deleted outliers in red):
main <- "Histogram of standardized anomalies"
z <- hist(c(sanom,outan),plot=FALSE)
zx <- z$breaks
zy <- z$counts; zy[zy==0] <- NA
barplot(zy,log='y',space=0,ylim=c(.9,max(zy,na.rm=TRUE)*2),
ylab="Frequency",col=3,main=main,xlab="Anomalies (standard deviations)")
axis(1,1:length(zx)-1,labels=as.character(zx))
if(sum(delout)) { #redraw outliers in red
zy <- hist(delout,breaks=zx,plot=FALSE)$counts; zy[zy==0] <- NA
barplot(zy,log='y',space=0,ylim=c(.9,max(zy,na.rm=TRUE)*2),
col=hsv(0,.75),add=TRUE)
}
#histogram of tVx by overlapping windows (without unrealistic zeros):
z <- tVx[!is.na(tVx) & tVx>0]
main <- sprintf("Histogram of maximum windowed %s",inhtest)
if(sum(!is.na(z))) hist(z,breaks=20,xlab=inhtest,col=verde,main=main)
#histogram of tVx of the complete series:
z <- inhx; main <- sprintf("Histogram of maximum global %s",inhtest)
if(sum(!is.na(z))) hist(z,breaks=20,xlab=inhtest,col='purple',main=main)
#quality/singularity graphic:
if(is.na(uni)) xlab <- 'RMSE' else xlab <- sprintf('RMSE (%s)',uni)
plot(rmse,inhx,type='n',xlim=c(0,max(1,max(rmse,na.rm=TRUE))),ylim=c(0,max(50,max(inhx,na.rm=TRUE))),xlab=xlab,ylab=inhtest,main="Station's quality/singularity")
grid(col=grdcol)
text(rmse,inhx,col=hsv(.7,1,.9))
}
if(graphics) { par(old.par); graphics.off() } #close graphic output
#- save results in a rda file
dat <- dat.o
names(est.c) <- c('X','Y','Z',"Code","Name",'pod',test,'rmse')
rownames(est.c) <- 1:ne
if(graphics & exists('ct')) save(dat,dah,nd,ndec,uni,est.c,corm,ct,nei,ne,nm,std,x,ini, file=sprintf('%s.rda',fbas))
else save(dat,dah,nd,ndec,uni,est.c,nei,ne,nm,std,x,ini, file=sprintf('%s.rda',fbas))
#sort files of outliers and breaks:
if(!metad) {
close(Fbrk)
brk <- read.csv(sprintf('%s_brk.csv',fbas),colClasses=c('character','character','numeric'))
brk <- brk[order(brk[,1],brk[,2]),]
write.csv(brk,sprintf('%s_brk.csv',fbas),row.names=FALSE)
}
out <- read.csv(sprintf('%s_out.csv',fbas),colClasses=c('character','character','numeric','numeric','numeric','numeric'),check.names=FALSE)
if(nrow(out) > 1) { #remove duplicates keeping the last rows:
out <- out[as.integer(row.names(unique(out[,1:2],fromLast=TRUE))),]
out <- out[order(out[,1],out[,2]),] #sort by stations and dates
#substitute 'Suggested' values by the final estimations:
#(this may introduce inconsistencies with the anomalies reported at detection time)
for(k in 1:nrow(out)) {
if(out[k,6]==0) next #skip suspect outlier (not deleted)
ke <- which(est.c[1:nei,4]==out[k,1]) #station rank
kx <- which(x==as.Date(out[k,2])) #date rank
out[k,4] <- dah[kx,ke] #estimated value
}
#remove any line in which Observed = Estimated:
k <- out[,3]==out[,4]
if(sum(k)>0) out <- out[!k,]
#set Deleted=-1 in outliers with the same date and opposite sign:
restore <- FALSE
dou <- out[out$Deleted==1 & !is.na(out[,5]), ]
dup <- duplicated(dou$Date)
if(sum(dup)>0) {
kw <- which(dup)
ud <- unique(dou$Date[kw]) #unique duplicated dates
for(dt in ud) { #for every duplicated date
k <- which(dou$Date==dt); nk <- length(k)
if(sum(dou[k,5]>0) %in% c(0,nk)) next #skip outliers with the same sign
for(ks in 1:nk) { #for every station with duplicated outliers
ko <- which(out$Code==dou$Code[k[ks]] & out$Date==dt)
out$Deleted[ko] <- -1
}
}
names(out)[5] <- "Anomaly (std.devs.)"
restore <- TRUE
}
write.csv(out,sprintf('%s_out.csv',fbas),row.names=FALSE)
if(restore) datrestore(varcli,anyi,anyf) #restore reciprocal outliers
}
if(metad & nm<1) {
cat('\n-----------',"Warning:",'\n\n')
cat("Any detected anomalous data in the initial quality control",'\n')
cat("has NOT been deleted: the user is expected to use onlyQC=TRUE",'\n')
cat("in a first run when dealing with daily or subdaily data.",'\n')
}
cat('\n-----------',"Generated output files:",'-------------------------\n\n')
if(logf) cat(fbas,'.txt : ',"Text output of the whole process",'\n',sep='')
cat(fbas,'_out.csv : ',"List of corrected outliers",'\n',sep='')
cat(fbas,'_brk.csv : ',"List of corrected breaks",'\n',sep='')
if(graphics) cat(fbas,'.pdf : ',"Diagnostic graphics",'\n',sep='')
cat(fbas,'.rda : ',"Homogenization results. Postprocess with (examples)",':\n',sep='')
cat(sprintf(' dahstat(\'%s\',%d,%d) #',varcli,anyi,anyf),"averages",'\n')
cat(sprintf(' dahstat(\'%s\',%d,%d,',varcli,anyi,anyf),'stat=','\'','tnd','\'',') #',"OLS trends and p-values",'\n',sep='')
cat(sprintf(' dahstat(\'%s\',%d,%d,',varcli,anyi,anyf),'stat=','\'','series','\'',') #',"homogenized series",'\n',sep='')
cat(sprintf(' dahgrid(\'%s\',%d,%d,',varcli,anyi,anyf),'grid=',"YOURGRID",') #',"homogenized grids",'\n',sep='')
cat(' ... (',"See other options in the package documentation",')\n\n',sep='')
while (sink.number()>0) sink() #close log file(s)
}
#- outrename.- Append a suffix to the output files, to avoid overwrites.
outrename <- function(varcli, anyi, anyf, suffix, restore=FALSE) {
#varcli: Short name of the studied climatic variable.
#anyi: Initial year of the study period.
#anyf: Final year of the study period.
#suffix: Suffix to be inserted (or removed) in the output file names.
#restore: If TRUE, the suffix will be removed.
fbn <- sprintf('%s_%d-%d',varcli,anyi,anyf) #original file base name
#destination file base name:
fbn2 <- sprintf('%s-%s_%d-%d',varcli,suffix,anyi,anyf)
for(ext in c('.txt','.pdf')) {
if(restore) file.rename(paste(fbn2,ext,sep=''),paste(fbn,ext,sep=''))
else file.rename(paste(fbn,ext,sep=''),paste(fbn2,ext,sep=''))
}
if(restore) {
name <- sprintf('%s.rda',fbn2)
if(file.exists(name)) file.rename(name,sprintf('%s.rda',fbn))
name <- sprintf('%s_out.csv',fbn2)
if(file.exists(name)) file.rename(name,sprintf('%s_out.csv',fbn))
name <- sprintf('%s_brk.csv',fbn2)
if(file.exists(name)) file.rename(name,sprintf('%s_brk.csv',fbn))
} else {
name <- sprintf('%s.rda',fbn)
if(file.exists(name)) file.rename(name,sprintf('%s.rda',fbn2))
name <- sprintf('%s_out.csv',fbn)
if(file.exists(name)) file.rename(name,sprintf('%s_out.csv',fbn2))
name <- sprintf('%s_brk.csv',fbn)
if(file.exists(name)) file.rename(name,sprintf('%s_brk.csv',fbn2))
}
return(invisible())
}
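#Example usage (a minimal sketch with hypothetical file names; assumes the
# homogenization output files Tmin_1981-2020.* exist in the working directory):
# outrename('Tmin',1981,2020,'v1') #archive results as Tmin-v1_1981-2020.*
# outrename('Tmin',1981,2020,'v1',restore=TRUE) #undo the renaming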
#- datrestore.- Restore some deleted outliers into the dah matrix of the *.rda output file.
datrestore <- function(varcli, anyi, anyf, QCout=FALSE) {
#varcli: Short name of the studied climatic variable.
#anyi: Initial year of the study period.
#anyf: Final year of the study period.
#QCout: if TRUE, restore selected outliers from the *-QC*_out.csv file.
x <- est.c <- NULL #(avoid invisible bindings)
fbn <- sprintf('%s_%d-%d',varcli,anyi,anyf) #file base name
frda <- sprintf('%s.rda',fbn) #file of homogenization results (*.rda)
obj <- load(frda) #load homogenization results
#read list of outliers to restore:
if(!QCout) dout <- read.csv(sprintf('%s_out.csv',fbn),as.is=TRUE)
else dout <- read.csv(sprintf('%s-QC_%d-%d_out.csv',varcli,anyi,anyf),as.is=TRUE)
dout <- dout[dout[,6]<0,] #keep outliers to be restored only
cat('\n')
if(nrow(dout)==0) { cat("No deleted data to restore!"); return() }
if(inherits(x,'Date')) posix <- FALSE else posix <- TRUE
cat("Restoring reciprocal outliers (Deleted<0) into the homogenized series:",'\n')
for(k in 1:nrow(dout)) {
if(posix) i <- which(x==as.POSIXct(dout[k,2])) #time index
else i <- which(x==as.Date(dout[k,2])) #time index
j <- which(est.c[,4]==dout[k,1]) #station index
i2 <- i-dout[k,6]-1 #final time index for identical data runs
cat(dout[k,1],dout[k,2],':',dah[i:i2,j],' -> ',dout[k,3],'\n')
dah[i:i2,j] <- dout[k,3]
}
save(list=obj, file=frda) #save updated *.rda file
cat('\n',"Homogenized series updated into the file",frda,'\n')
cat("(Note that selected outliers have been restored only in",'\n')
cat(" the series homogenized from the last homogeneous subperiod)",'\n')
}
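#Example usage (hypothetical file names; requires Tmin_1981-2020.rda and a
# Tmin_1981-2020_out.csv where reciprocal outliers are flagged Deleted<0):
# datrestore('Tmin',1981,2020) #restored values are saved back into the *.rda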
#- QCthresholds.- Obtain monthly thresholds for Quality Control alerts.
QCthresholds <- function(dat, ndec=1, probs=c(0.,.001,.01,.99,.999,1.),
minval=NA, maxval=NA, homog=TRUE, verb=TRUE) {
#dat: either the name of a *.rda file of Climatol homogenization results or a
# data.frame of daily (or subdaily) data in columns, dates or date/times (of
# class Date or POSIXct) in the first column and station codes in the header
#ndec: number of decimals of output values [1] (defaults shown between brackets)
#probs: probabilities of the quantiles to be computed [0.,.001,.01,.99,.999,1.]
#minval: minimum value to compute runs of constant values (e.g., set to .1 with
# daily precipitation to avoid the calculation of long runs of zeros)
#maxval: maximum value to compute runs of constant values (e.g., set to 97
# with relative humidity to avoid reporting long runs of values greater than
# 97% in episodes of persistent fog)
#homog: use homogenized data if a *.rda file is used as input [TRUE]
#verb: list all calculated values? [TRUE]
nei <- est.c <- dah <- NULL #(avoid invisible bindings)
if(is.data.frame(dat)) { #check the data frame:
nd <- nrow(dat) #no. of data rows
x <- dat[,1] #time vector
if(!inherits(x,'Date') & !inherits(x,'POSIXct')) stop("The first column should be of class Date (or POSIXct in the case of subdaily data)")
ne <- ncol(dat) #no. of data columns
codes <- names(dat[,2:ne]) #station codes
dat <- as.matrix(dat[,2:ne]); ne <- ne-1 #no. of stations
} else if(is.character(dat)) { #check the input file:
n <- nchar(dat)
if(substr(dat,n-3,n)!='.rda') stop("Files of Climatol homogenization results have extension .rda")
if(!file.exists(dat)) stop(sprintf("File %s not found",dat))
load(dat)
ne <- nei; codes <- est.c[1:nei,4]
if(homog) dat <- dah[,1:nei]
}
np <- length(probs) #no. of probability levels
probs2 <- probs[probs>0.5]; np2 <- length(probs2) #right tail probabilities
st <- as.factor(t(matrix(rep(codes,nd),ne,nd))) #factor of station codes
mm <- as.factor(matrix(rep(strftime(x,'%m'),ne),nd,ne)) #factor of months
#quantiles of the data:
thr1 <- round(unlist(tapply(dat,list(st,mm),quantile,probs=probs,
na.rm=TRUE)),ndec)
dim(thr1) <- c(np,ne,12); dimnames(thr1) <- list(probs,codes,1:12)
#quantiles of first differences:
st <- as.factor(t(matrix(rep(codes,nd-1),ne,nd-1))) #station codes
thr2 <- round(unlist(tapply(abs(diff(dat)),list(st),quantile,probs=probs2,
na.rm=TRUE)),ndec)
dim(thr2) <- c(np2,ne); thr2 <- t(thr2); dimnames(thr2) <- list(codes,probs2)
#quantiles of longest runs of constant values
if(!is.na(minval)) dat[dat<minval] <- NA #delete too low values
if(!is.na(maxval)) dat[dat>maxval] <- NA #delete too high values
z <- sapply(apply(dat,2,rle), function(x) x[1]) #run lengths
thr3 <- t(sapply(z,quantile,probs=probs2,na.rm=TRUE))
dimnames(thr3) <- list(codes,probs2)
if(verb) { #list all calculated thresholds:
cat('\n=========== thr1:',"Monthly quantiles of the data",'\n\n')
for(i in 1:ne) {
cat('---------',"Station",codes[i],'\n')
print(thr1[,i,])
}
cat('\n=========== thr2:',"Quantiles of the first differences",'\n\n')
print(thr2)
cat('\n=========== thr3:',"Quantiles of run lengths of constant values")
if(!is.na(minval)) cat(' >=',minval)
cat('\n\n')
print(thr3)
}
#save the results in R binary format:
save(thr1,thr2,thr3, file='QCthresholds.Rdat')
cat('\n',"Thresholds thr1,thr2,thr3 saved into QCthresholds.Rdat",'\n')
cat('(',"Rename this file to avoid overwriting it in the next run.",')\n')
}
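#Example usage (a runnable sketch on synthetic daily data; station codes S1
# and S2 and the simulated values are made up for illustration only):
# x <- seq(as.Date('2001-01-01'),as.Date('2010-12-31'),1)
# dd <- data.frame(Date=x, S1=round(rnorm(length(x),15,6),1),
# S2=round(rnorm(length(x),12,5),1))
# QCthresholds(dd,verb=FALSE) #thresholds saved into QCthresholds.Rdat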
#- read.dat.- Read input data (for several climatol functions).
read.dat <- function(varcli, anyi, anyf, x=NULL, ini=NA, tinc=NA, tz='utc',
na.strings=NA) {
#varcli: Short name of the studied climatic variable.
#anyi: Initial year of the study period.
#anyf: Final year of the study period.
#x: Time vector. Automatically set by default.
#ini: Initial date, with format 'YYYY-MM-DD' (for daily data).
#tinc: Time increment between data. Not set by default, but must be defined
# for subdaily data, with units 'hours', 'mins' or 'secs'.
#tz: Time zone (if data are subdaily). ('utc' by default)
#na.strings: strings marking missing data (NA by default).
fbas <- sprintf('%s_%d-%d',varcli,anyi,anyf) #base file name
fiche <- sprintf('%s.est',fbas) #stations file name
#read coordinates, codes and names of the stations:
est.c <- read.table(fiche,colClasses=c('numeric','numeric','numeric','character','character'))
names(est.c) <- c('X','Y','Z',"Code","Name")
ne <- nrow(est.c) #no. of stations
#check for any missing X, Y, Code:
xnas <- sum(is.na(est.c[,1])); ynas <- sum(is.na(est.c[,2]))
cnas <- sum(is.na(est.c[,4]))
if(xnas>0 | ynas>0 | cnas>0) stop("There are missing coordinates and/or station codes!\nPlease, complete the *.est file and run this function again.")
#set any missing elevation to 99 m:
knas <- is.na(est.c[,3])
if(sum(knas)>0) {
cat("Warning: Missing elevations are replaced by",'99\n')
est.c[knas,3] <- 99
}
#change any hyphen in codes to underscores:
z <- gsub('-','_',est.c[,4])
zn <- sum(z != est.c[,4])
if(zn>0) {
cat("Characters '-' in codes have been changed to '_'",'\n\n')
est.c[,4] <- z
}
#check for duplicate codes:
z <- duplicated(est.c[,4])
if(sum(z)>0) {
zz <- unique(est.c[z,4])
cat("Duplicated codes detected",':\n',sep='')
print(est.c[est.c[,4]%in%zz,4:5])
stop("The station file *.est must contain unique codes.")
}
#use station codes for any missing station names:
knas <- is.na(est.c[,5])
if(sum(knas)>0) {
cat("Warning: Missing station names are replaced by their codes",'\n')
est.c[knas,5] <- est.c[knas,4]
}
#check if coordinates are in degrees:
if(max(abs(est.c[,1]))>180 | max(abs(est.c[,2]))>90)
stop("Station coordinates must be in geographic degrees with decimals:\n-180 to 180 for X (longitude) and -90 to 90 for Y (latitude)")
fichd <- sprintf('%s.dat',fbas) #data file name
dat <- scan(fichd,na.strings=na.strings) #read data
numdat <- length(dat) #no. of read data
nd <- numdat/ne #no. of data per station
if(nd-floor(nd)>1e-16) {
cat(ne,"stations read from",fiche,'\n')
cat(numdat,"data read from",fichd,'\n')
stop("The length of data is not a multiple of the number of stations!")
} else cat("Data matrix:",nd,"data",'x',ne,"stations",'\n')
dim(dat) <- c(nd,ne) #convert vector to matrix
na <- anyf-anyi+1 #no. of years
if(!is.null(x)) nm <- 0
else { #calculate nm (no. of data per year and station):
z <- nd/na
if(z>=1) nm <- floor(z)
if(nm > 366) nm <- -1 #flag of subdaily data
else if(nm > 12) nm <- 0 #flag of daily data
}
#check if years are complete:
ndp <- length(seq(as.Date(sprintf('%s-01-01',anyi)),
as.Date(sprintf('%s-12-31',anyf)),1)) #no. of days in the period
if(nm>0 & nd%%nm==0) acomp <- TRUE
else if(nd%%ndp==0) acomp <- TRUE
else acomp <- FALSE
#set tinc?
if(is.na(tinc) & nd>ndp) {
if(nd/ndp < 2) {
cat("There are",ndp,"days in the studied period, but there are",'\n')
cat(nd,"values per serie.",'\n')
stop("Inconsistent number of data for the period of study.")
}
nh <- 24*ndp/nd #time step in hours
if(nh>=1 & 24%%nh==0) tinc <- sprintf('%d hours',nh)
else if((nh*60)==floor(nh*60)) tinc <- sprintf('%d mins',nh*60)
else stop("Your data seem to be sub-daily, but the number of values per day is not a round number of hours or minutes. Please check.")
}
#- build time vector x if not supplied by the user:
if(is.null(x)) {
if(is.na(ini)) ini <- sprintf('%d-01-01',anyi) #default initial date
if(nm>0) x <- seq(as.Date(ini),length.out=nd,by=sprintf('%d months',round(12/nm)))
else if(nm==0) x <- seq(as.Date(ini),length.out=nd,by='1 day')
else x <- seq(as.POSIXct(ini,tz=tz),length.out=nd,by=tinc)
} else ini <- x[1]
if(length(x) != nd) stop(sprintf("The provided vector x has %d dates, but there appear to be %d data per station. Please, fix this inconsistency.",length(x),nd))
return(list(est.c=est.c,dat=dat,na=na,nd=nd,ne=ne,nm=nm,x=x,ini=ini,
tinc=tinc,acomp=acomp))
}
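#Example usage (hypothetical base name; expects Tmin_1981-2020.dat and
# Tmin_1981-2020.est in the working directory):
# z <- read.dat('Tmin',1981,2020)
# str(z$est.c); dim(z$dat) #stations data frame and data matrix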
#- unsufix.- Remove numeric suffixes from station codes.
unsufix <- function(str) sapply(strsplit(str,'-'), function(x) x[1])
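#Example (codes of series split at break points carry numeric suffixes):
# unsufix(c('S012-1','S012-2','S044')) #-> "S012" "S012" "S044"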
#- weekendaccum.- Check for possible undeclared daily precipitation weekend accumulations and assign an accumulation code to the false zeroes if necessary.
weekendaccum <- function(varcli,anyi,anyf,na.strings='NA',cumc=-1,wdsl=1) {
#varcli: Short name of the studied climatic variable
#anyi: Initial year
#anyf: Final year
#na.strings: strings marking missing data ('NA' by default)
#cumc: code of accumulated missing data (-1 by default).
#wdsl: weekday significance level (in %) to detect 1 to 3 false consecutive
# zeroes followed by an accumulation of precipitation. Only relevant for daily
# precipitation. Set a value between 0.1 and 10 to enable this test,
# equivalent to significance levels between 0.001 and 0.1.
#------------------------------------------------------------------
if(wdsl<.1 | wdsl>10) stop("Parameter wdsl needs a value between 0.1 and 10")
fbas <- sprintf('%s_%d-%d',varcli,anyi,anyf) #file basename
flog <- sprintf('%s-wkn.txt',fbas) #log file
#- read input data
z <- read.dat(varcli,anyi,anyf,x=NULL,ini=NA,tinc=NA,na.strings,tz='utc')
est.c <- z$est.c; dat <- z$dat; na <- z$na; nd <- z$nd; ne <- z$ne; x <- z$x
nm <- z$nm; ini <- z$ini; tinc <- z$tinc; acomp <- z$acomp
if(unique(diff(x)) != 1)
stop("This funtion is intended to be applied to DAILY PRECIPITATION only!")
#- check for even distribution in the week days
#quantiles of differences with 1 to 3 previous terms depending on sig. level:
qd <- rep(0,3)
qd[1] <- -0.718275+0.011355*wdsl+0.091394*log(wdsl)
qd[2] <- -0.620828+0.009160*wdsl+0.081216*log(wdsl)
qd[3] <- -0.583525+0.008634*wdsl+0.075586*log(wdsl)
qdr <- round(qd,3)
wkd <- as.integer(strftime(x,'%w')) #weekday: 0(Sunday) to 6
year <- as.integer(strftime(x,'%Y'))
#function to get short names of the weekdays 0(Sunday) to 6
wdf <- function(x) strftime(as.Date(x,origin='2000-01-02'),'%a')
dat0 <- dat==0; dat0[is.na(dat0)] <- FALSE #zeroes in data matrix
zm <- matrix(nrow=na,ncol=10) #initialize matrix of 0 precipitation counts
no <- aggregate(!is.na(dat),list(year),sum) #no. of observations per year,stn
z2 <- aggregate(dat0,list(year,wkd),sum,na.rm=TRUE)
z2[,1] <- z2[,1]-anyi+1; z2[,2] <- z2[,2]+4 #z2[,2] ranges 4(Sunday) to 10
nstac <- 0 #no. of stations with false weekend zeroes
#open log file and write header:
sink(flog,split=TRUE)
cat("========= Output of the weekendaccum function ================",'\n\n')
cat("Checking for weekend effect in",fbas,"data",':\n\n')
for(j in 1:ne) { #for each station:
cat('---------',"Station",j,paste0('(',est.c[j,4],'): '))
zm[as.matrix(z2[,1:2])] <- z2[,j+2] #no. of zeroes in na x 10wdays matrix
zm[no[,j+1]<100,] <- NA #delete counts with less than 100 observations/year
zs <- apply(zm[,4:10],1,sum) #total 0 counts per year
zm[,4:10] <- zm[,4:10]*7/zs # rate of 0 counts per day each year
zm[,1:3] <- zm[,8:10] #repeat last 3 weekdays at the beginning
ncum <- 0 #no. of zeroes suspect of accumulation
for(p in 1:na) { #for each year
if(is.na(sum(zm[p,]))) next #not enough data
zn <- rep(9,3) #initialize minimum values
kw <- rep(NA,3) #initialize days of maximum difference
for(q in 4:10) { #for weekdays Sunday to Saturday:
for(k in 1:3) { #maximum difference with 1, 2 or 3 previous days:
zd <- zm[p,q]-mean(zm[p,q-1:k]) #difference with k previous days
if(zd < zn[k]) { zn[k] <- zd; kw[k] <- q-3 } #kw ranges 1(Sunday):7
}
}
kd <- which.min(zn-qd) #no. of days with most significant difference
if(zn[kd] < qd[kd]) { #false zeroes detected in kd days before day kw:
if(kd==1) cat(p+anyi-1,':',1,"day difference ",round(zn[kd],3),'<',
qdr[kd],' (',wdf(kw[kd]-kd-1),'<-',cumc,')\n')
else cat(p+anyi-1,':',kd,"days difference",round(zn[kd],3),'<',
qdr[kd],paste0(' (',wdf(kw[kd]-kd-1),'-',wdf(kw[kd]-2),' <- ',cumc,')\n'))
k <- which(year==p+anyi-1 & kw[kd]==wkd+1) #terms of year p and day kw
if(p==1) k <- k[k>kd] #avoid the use of terms out of range
for(kk in k) {
kn <- kk-1:kd
if(sum(dat0[kn,j])==kd) { #all kd previous days must have zero precip.
ncum <- ncum+kd #no. of zeroes suspect of accumulation
dat[kn,j] <- cumc #assign to them the accumulation code
}
}
}
}
if(ncum>0) {
nstac <- nstac+1
cat(ncum,"dates with zero precipitation have been assigned code",cumc,'\n')
} else cat("No weekend effect detected in this station",'\n')
}
if(nstac>0) {
#keep original data as *-wkn_* and rewrite data with assigned cumc:
file.rename(sprintf('%s.dat',fbas),sprintf('%s-wkn_%d-%d.dat',varcli,anyi,anyf))
file.copy(sprintf('%s.est',fbas),sprintf('%s-wkn_%d-%d.est',varcli,anyi,anyf))
cat('\n',"Original files have been renamed to",sprintf('%s-wkn_%d-%d.dat',varcli,anyi,anyf),'\n')
if(length(dat)>99999) cat("Rewriting the data file. This may take some time",'...')
write(dat,sprintf('%s.dat',fbas))
cat(" Data file with assigned cumc=",cumc,"has been rewritten",'\n\n')
} else cat('\n',"No weekend false zeroes detected in file",sprintf('%s.dat',fbas),'\n\n')
while (sink.number()>0) sink() #close log file(s)
cat("The output of this process has been saved to file",flog,'\n\n')
}
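#Example usage (hypothetical file names; expects daily precipitation files
# RR_1981-2020.dat and RR_1981-2020.est in the working directory):
# weekendaccum('RR',1981,2020) #flag suspect weekend zeroes with code -1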
#- snht.- Maximum Standard Normal Homogeneity Test (allowing missing data).
snht <- function(y, mints=3, allT=FALSE) {
#mints: minimum tail size (minimum number of terms in the tails of the series)
#allT: return all T values? (By default, only the maximum and its position)
yav <- which(!is.na(y)) #available data
x <- y[yav] #series without missing data
n <- length(x)
if(n<mints*2) return(c(0,0)) #not enough data
if(sd(x)==0) return(c(0,0)) #constant series
T <- rep(NA,n)
# z <- scale(x) #a bit slower than:
z <- (x-mean(x,na.rm=TRUE))/sd(x,na.rm=TRUE)
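#SNHT statistic at each split point i: T = n1*zm1^2 + n2*zm2^2, where zm1
# and zm2 are the means of the standardized series before and after i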
for(i in mints:(n-mints)+1) { #(skip tails with less than mints terms)
if(is.na(x[i])) next
n1 <- sum(!is.na(x[1:(i-1)])) #no. of terms of sample 1
n2 <- sum(!is.na(x[i:n])) #no. of terms of sample 2
if(n1<mints | n2<mints) next #at least one sample is too small
z1 <- mean(z[1:(i-1)],na.rm=TRUE)
z2 <- mean(z[i:n],na.rm=TRUE)
T[i] <- n1*z1*z1 + n2*z2*z2
}
if(allT) return(T) else return(c(max(T,na.rm=TRUE),yav[which.max(T)]))
}
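#Example (a runnable sketch on synthetic data: a mean shift of 2 standard
# deviations is introduced halfway through a white-noise series):
# set.seed(1); y <- c(rnorm(60),rnorm(60)+2)
# snht(y) #maximum T and its position (expected near term 61)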
#- wtest.- Inhomogeneity test on overlapping 2*nt term windows.
wtest <- function(x, nt=48, sts=3, test) {
#x: series to be tested (missing data allowed)
#nt: no. of valid terms in each half window (the test is applied to
# overlapping windows of 2*nt valid terms)
#sts: minimum tail size, passed to the test function
#test: name of the test function to apply ('snht' or 'cuct')
ntt <- length(x) #no. of total terms of the series
ntv <- sum(!is.na(x)) #no. of valid (non-missing) data of the series
if(2*nt>ntv) return(c(0,0)) #not enough valid data for the test
tV <- 0 #initialization of maximum tV (test value) to return
pk <- 0 #initialization of the tV location to return
#initialization of sample limits (a1-b1, a2-b2):
k <- 1; while(k<ntt & is.na(x[k])) k <- k+1; a1 <- k
n<-1; while(n<nt & k<ntt) { k <- k+1; if(!is.na(x[k])) n <- n+1; }
b1 <- k
k <- k+1; while(k<ntt & is.na(x[k])) k <- k+1; a2 <- k
n<-1; while(n<nt & k<ntt) { k <- k+1; if(!is.na(x[k])) n <- n+1; }
b2 <- k
#apply the test to overlapping windows:
repeat {
st <- eval(call(test,x[a1:b2],sts))
stx <- st[1]
if(!is.na(stx) & stx>tV) { tV <- stx; pk <- round(st[2])+a1-1 }
if(b2==ntt) return(c(tV,pk))
#shift windows forwards:
a1 <- a2; b1 <- b2
k <- b2+1; while(k<ntt & is.na(x[k])) k <- k+1
if(is.na(x[k])) return(c(tV,pk)) else a2 <- k
n<-1; while(n<nt & k<ntt) { k <- k+1; if(!is.na(x[k])) n <- n+1; }
b2 <- k
}
}
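#Example (synthetic series with a shift at term 301, scanned with the SNHT
# on overlapping windows of 2*48 terms):
# set.seed(2); y <- c(rnorm(300),rnorm(300)+1.5)
# wtest(y,nt=48,sts=3,test='snht') #maximum windowed test value and location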
#- cuct.- Maximum Cucconi test in a series (allowing missing data and zeros).
cuct <- function(y, mints=3) {
#mints: minimum tail size (minimum number of terms in the tails of the series)
yav <- which(!is.na(y)) #available data
x <- y[yav] #series without missing data
n <- length(x) #series length
if(n<mints*2) return(c(NA,NA)) #not enough data
Tx <- 0; kx <- 0 #initialize maximum test value and location
rkx <- rank(x,ties.method="first") #ranks of the series
rkz <- n+1-rkx #reverse ranks of the series
#(use as.numeric() to avoid integer overflow in long series:)
rkx <- as.numeric(rkx)*rkx; rkz <- as.numeric(rkz)*rkz #squared ranks
nz1 <- n+1; nz2 <- 2*n+1; nz8 <- 8*n+11 #auxiliary variables
r <- 2*(as.numeric(n)*n-4) / (nz2*nz8) - 1
#calculate test along the series:
for(i in mints:(n-mints)+1) { #(skip too short tails)
n1 <- i-1; n2 <- n-n1 #no. of terms of the samples
S1 <- sum(rkx[i:n]); S2 <- sum(rkz[i:n]) #summations
den <- sqrt(n1*n2*nz1*nz2*nz8/5) #denominator
U <- (6*S1-n2*nz1*nz2)/den
V <- (6*S2-n2*nz1*nz2)/den
T <- (U*U+V*V-2*r*U*V) / (2*(1-r*r)) #value of the test
if(T>Tx) { Tx <- T; kx <- i }
}
return(c(Tx,yav[kx]))
}
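#Example (synthetic series changing in both mean and variance, the kind of
# inhomogeneity that the Cucconi test addresses jointly):
# set.seed(3); y <- c(rnorm(80),rnorm(80,1,2))
# cuct(y) #maximum test value and its position (expected near term 81)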