knitr::opts_chunk$set(echo = TRUE, message = F, warning = F) library(microbenchmark) library(profvis) library(ggplot2) library(gridExtra) library(tidyr) library(reshape2) library(plotly) library(parallel) library(pander) setwd("H:/LinuxMint/PissoortRepo/ValUSunSSN/ValUSunSSN") #source('smooth.gaussR_LOAD.R') # Re-created Gaussian smoother library(ValUSunSSN) #load('/home/piss/PissoortRepo/ValUSunSSN/Data/DataSSN.RData') # Take full dataset #load('DataSSN81.RData') # Take data since 1981 # data.chris <- read.csv("Alldata.csv") # # data.chris$Date <- as.Date(paste(data.chris$year, # data.chris$month, data.chris$day,sep = "-")) # data.chris[data.chris == -1] <- NA # # data.chris <- data.chris[, colnames(data.chris) %in% c("Ng", "Ns") == F] # # We do it only for SSN here # # data.mat <- spread(data.chris, key="station", value="SSN") # data.mat <- data.mat[order(data.mat$decdate),] # # data.mat2 <- data.mat[,-(1:5)] # Get matrix of stations in columns only. runExample()
All the functions used are available in the $\texttt{interpol2_em_pissoort.R}$. We will focus here on the main functions, in order to not being "overflowed" of code ...
To do this, we will We ran the algorithm three times :
Once for the full dataset (since 1924) because we thought should not waste the opportunity to use data.
Since 1960 because it was the time period use by the $\texttt{FillInChris.R}$ from C.Ritter
Since 1981 as it seemed to be the most reliable time period regarding L.Lefèvre for example, and this also the time span used by "Véronique".
Moreover, comparisons between the results provided by the different time spans could be interesting. We could for example train a model using data from 1981 and then validate it using previous period, in order to have an independent measure of accuracy (...)
and once in for the unmodified SSN datase We take
## First, we discard stations that have coverage <5% for reliability issues # If keep these values, the results will have incorrect values... notNa.sum <- apply(data.mat2, 2, function(x) sum(!is.na(x))) is_less5_cov <- unname(notNa.sum) < .05 * nrow(data.mat2) data.mat2.fin <- data.mat2[, !is_less5_cov] # 7 stations are discarded !! #data.mat2.fin <- data.mat2.fin[data.mat$decdate >= 1981, ] # take only from 1981 ? # Take this for input, as advised in the test.m file y <- sqrt(data.mat2.fin+1) # Select randomly here, for testing options(mc.cores=parallel::detectCores()) # all available cores z <- interpolsvd_em(y, nembed = 2, nsmo = 81, ncomp = 4, niter = 30, displ = F) # 193 sec for the whole dataset (with some stations discarded) z <- z$y.filled zssn <- interpolsvd_em(data.mat2.fin, nembed = 2, nsmo = 81, ncomp = 4, niter = 30, displ = F) zssn <- zssn$y.filled
rownames(y) <- rownames(zssn) <- 1:nrow(zssn) y1 <- cbind(as.data.frame(data.mat2.fin), Date = data.mat$Date, x = "Raw") z1 <- cbind(as.data.frame(zssn), Date = data.mat$Date[data.mat$decdate>1981], x = "Filled") colnames(y1) <- colnames(z1) <- c(colnames(data.mat2.fin), "Date", "x") zz <- rbind(y1,z1)
First of all, we decide to show only the station of Uccle which is considered to some extent, as the "main station". In this dynamic plot provided by $\texttt{plotly}$, you can hide the class you would like to see, e.g. to see raw data you can click on both red and blue button. This plot can conveniently show us, among other things, how the filled alues "fits" the shape of the raw data.
zz_neg_ucc <- data.frame( wnUC2 = zz$wnUC2, Date = zz$Date, x = zz$x, negative = zz$wnUC2 < 0 ) #z1$x <- ifelse(z1$wnUC2 < 0, "Filled <0", "Filled" ) #zz_neg_ucc[!is.na(zz_neg_ucc$x == "Filled"), ][,"x"] <- NA ## Visualize "incorrect" filled values, ie those below 0 # zz_neg_ucc[!is.na(zz_neg_ucc$x == "Filled"), ][ ,"x"] <- ifelse(zz_neg_ucc[!is.na(zz_neg_ucc$x != "Raw"), ][ ,"wnUC2"] < 0, "Filled <0", "Filled" ) zz_neg_ucc[(nrow(zz_neg_ucc)/2+1):nrow(zz_neg_ucc), ][ ,"x"] <- ifelse(zz_neg_ucc[(nrow(zz_neg_ucc)/2+1):nrow(zz_neg_ucc), ][ ,"wnUC2"] < 0, as.factor("Filled <0"), as.factor("Filled") ) zz_neg_ucc[is.na(zz_neg_ucc$x),][,'x'] <- "Filled <0" # zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$x <- ifelse( zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$wnUC2 < 0, "Filled < 0", "Filled" ) # for (i in 1:(nrow(zz)/2) ){ # if ( zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$wnUC2[i] < 0 ) # zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$x[i] <- "Filled <0" # # # else zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$x[i] <- "Filled" # } f <- list( family = "Calibri", size = 16, color = "#33666C" ) f2 <- list( family = "Old Standard TT, serif", size = 14, color = "black" ) xl <- list( title = " Date", titlefont = f, tickfont = f2, showticklabels = TRUE ) yl <- list( title = "SSN for wnUC2", titlefont = f, tickfont = f2, showticklabels = TRUE ) tit <- list( title = "Dynamic plot of the SSN for the station of Uccle", titlefont = f ) l <- list( font = list( family = "sans-serif", size = 13, color = "#000"), bgcolor = "#E2E2E2", bordercolor = "#FFFFFF", borderwidth = 2) plot_ly(data = zz_neg_ucc, x = ~Date, y = ~`wnUC2`, type = "scattergl", color = ~x, colors = c("green", "blue", "red")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)# %>% add_data(plot_ly(data = zz_neg_ucc[zz_neg_ucc$negative.wnUC2,], x = ~Date, y = ~`wnUC2`, type = "scattergl", colors = c("green"))) #%>% add_trace()%>% add_markers(color = ~as.factor(negative.wnUC2))
$\underline{\text{Comments}}$ :
We see that there seem to be some problems concerning the positivity of the values.
For convenience, we decide to show now the two retained stations that has the greatest number of available values, i.e. 21465 (%) and 20855 (%) for wnKS2 and wnKZ2 respectively, and also the two stations with the lowest number of avialable values
notNa.sum <- apply(data.mat2.fin, 2, function(x) sum(!is.na(x))) #sort(notNa.sum)/nrow(data.mat2.fin) * 100 df <- data.frame(wnKS2 = "21465 (64.4%)", wnKZ2 = "18957 (56.9%) ", `wnFR-S` = "2119 (6.4%)", `wnGU-S` = "2185 (6.6%)" ) rownames(df) <- c("# available values ") pander(df)
yl$title <- "SSN for wnFR-S" p1 <- plot_ly(data = zz, x = ~Date, y = ~`wnFR-S`, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) yl$title <- "SSN for wnGU-S" p2 <- plot_ly(data = zz, x = ~Date, y = ~`wnGU-S` , type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) yl$title <- "SSN for wnKS2" p3 <- plot_ly(data = zz, x = ~Date, y = ~wnKS2, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) yl$title <- "SSN for wnKZ2" p4 <- plot_ly(data = zz, x = ~Date, y = ~wnKZ2, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) subplot(p1, p2, nrows = 1,shareY = T, titleY = T )
For these two stations, we remark that the small number of available values indeed impact the uncertainty and the way these missing values are filled by the algorithm.
In contrast, when the number of available values is high $\Rightarrow$
subplot(p3, p4, nrows = 1,shareY = T, titleY = T)
The filling seem quite more homogenous and thus robust.
# z1 <- cbind(as.data.frame(zz[nrow(data.mat),]), Date = data.mat$Date ) # colnames(z1) <- c(colnames(data.mat2.fin), "Date") # zz.full <- melt(z1, id.vars = "Date" ) # dimnames(zz.full)[[2]][2] <- "Station" # dimnames(zz.full)[[2]][3] <- "SSN" g <- ggplot(zz.full) + geom_line(aes( x = Date, y = SSN, col = Station), size = .4) + scale_color_hue(l = 40, c = 200) + theme_piss() # scale_colour_brewer(name=expression(underline(bold("data: "))), # palette = "Set1") + #guides(colour = guide_legend(override.aes = list(size= 2.5))) # ggplotly(g) plot_ly(data = zz.full, x = ~Date, y = ~SSN, type = "scattergl", color = ~Station )
We can first notice a "problem", that is the values for SSN which are below 0
Could'nt we
1. Are the non-missing values still the same ?
We hope so, but we will check it.
filledt.vero <- read.csv("/home/piss/PissoortRepo/ValUSunSSN/data/time_SSNf_1981.csv") # By looking at the Julian dates at the top and the bottom of the series, we retrieved the dates. filledt.vero$Date <- seq(as.Date("1981-02-19"), as.Date("2015-03-30"), by = 1) #load('/home/piss/PissoortRepo/ValUSunSSN/Data/DataSSN81.RData') colnames(zssn) <- colnames(z1[, -((ncol(z1)-1):(ncol(z1)))]) zssn <- cbind.data.frame(zssn, times = z1$Date) zssn <- zssn[as.Date(zssn$times) %in% filledt.vero$Date,] ## add indicators for missingness missings <- is.na() colnames(missings) <- paste0("miss_", colnames(missings)) zssn_complete <- cbind.data.frame(zssn, missings) sum(zssn_complete$miss_wnUC2) head(dif <- zssn_complete$wnUC2[!zssn_complete$miss_wnUC2] - data.mat2.fin$wnUC2[!zssn_complete$miss_wnUC2] ) sum (dif) / 0.0024342
The difference actually the same for all the observations. this not really an intrinsic difference but rather a variable type difference; the first being integer and the latter being real (numeric).
2. Comparisons with all data (missing & non-missing)
# df_uccle_vero <- data.frame(time = zssn$times[1:nrow(filledt.vero)], wnUC2 = filledt.vero$UC2, from = "Vero") # # df_uccle_v <- rbind.data.frame(df_uccle_chris, data.frame(time = zssn$times[1:nrow(filledt.vero)], wnUC2 = zssn$wnUC2[1:nrow(filledt.vero)], from = "`interpolEM`")) # # plot_ly(data = df_uccle_v, x = ~time, y = ~`wnUC2`, type = "scattergl", color = ~from, colors = c("green", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) df_uccle_vero <- data.frame(time = zssn$times, SSN_wnUC2 = filledt.vero$UC2, from = "Vero") df_uccle_c <- rbind.data.frame(df_uccle_vero, data.frame(time = zssn$times, SSN_wnUC2 = zssn[,"wnUC2"], from = "`interpolEM`")) ggplot(df_uccle_c, aes(col = from)) + geom_point(aes(x = time, y = SSN_wnUC2), size = 0.15) + solar.cycle() + theme_piss() + guides(colour = guide_legend(override.aes = list(size= 3))) + ggtitle("Filled missing values")
Analyze residuals
resdf_vero <- data.frame(time = zssn$times, res = (filledt.vero$UC2) - (zssn[,"wnUC2"]) ) max(resdf_vero$res) ; min(resdf_vero$res) library(RColorBrewer) myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral"))) ggplot(resdf_vero, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))
Or we could also visualize the discrepancies between the filled values
Only for the filled data
filled_ssn <- zssn[,"wnUC2"][zssn_complete$miss_wnUC2] resdf_vero <- data.frame(time = zssn$times[zssn_complete$miss_wnUC2], res = (filledt.vero$UC2[zssn_complete$miss_wnUC2]) - filled_ssn ) max(resdf_vero$res) ; min(resdf_vero$res) myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral"))) # ggplot(resdf_vero, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250)) ggplot(resdf_vero, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))
Filling method of C.Ritter, time series starting from 1960
Again, we will take th station of Uccle for reference.
# filledt.chris <- read.csv("SSNfilledChris.csv") # Load data # # load("DataSSN60.RData") # Load data we have build # # filled.chris2 <- filledt.chris[,-2] # # df_uccle_chris <- data.frame(time = filled.chris2$times, wnUC2 = filled.chris2$wnUC2_d.txt, from = "Chris") # # df_uccle_c <- rbind.data.frame(df_uccle_chris, data.frame(time = filled.chris2$times, wnUC2 = zssn$wnUC2[1:nrow(df_uccle_chris)], from = "`interpolEM`")) # # plot_ly(data = df_uccle_c, x = ~time, y = ~`wnUC2`, type = "scattergl", color = ~from, colors = c("green", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) filledt.chris <- read.csv("SSNfilledChris.csv") # Load data load("DataSSN60.RData") # Load data we have build filled.chris2 <- filledt.chris[,-2] ## Are the non-missing values still the same ?? ## add indicators for missingness missings <- is.na(y) colnames(missings) <- paste0("miss_", colnames(missings)) zssn_complete <- cbind.data.frame(zssn, missings) sum(zssn_complete$miss_wnUC2) head(dif <- zssn_complete$wnUC2[!zssn_complete$miss_wnUC2] - data.mat2.fin$wnUC2[!zssn_complete$miss_wnUC2] ) sum (dif) / 0.0024342
Same idea. The very slight discrepancies are du to (type) approximations
df_uccle_chris <- data.frame(time = filled.chris2$times, wnUC2 = filled.chris2$wnUC2_d.txt, from = "Chris") df_uccle_c <- rbind.data.frame(df_uccle_chris, data.frame(time = filled.chris2$times, wnUC2 = zssn$wnUC2[1:nrow(df_uccle_chris)], from = "`interpolEM`")) # Comparisons for first 5 years (1980 : 1985) plot_ly(data = df_uccle_c[1980 < df_uccle_c$time & df_uccle_c$time < 1985 ,], x = ~time, y = ~`wnUC2`, color = ~from, type = "scattergl", colors = c("green", "blue"), mode = "markers+text") %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) ## Residuals resdf_chris <- data.frame(time = z1$Date[1:nrow(filled.chris2)], res = (filled.chris2$wnUC2_d.txt) - (zssn[,"wnUC2"][1:nrow(filled.chris2)]) ) max(resdf_chris$res) ; min(resdf_chris$res) # ggplot(resdf_chris, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-95, 88)) ggplot(resdf_chris, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10),limits=c(-95, 88))
Only for filled values :
filled_ssn <- zssn[,"wnUC2"][1:nrow(df_uccle_chris)][zssn_complete$miss_wnUC2] resdf_chris <- data.frame(time = z1$Date[1:nrow(filled.chris2)][zssn_complete$miss_wnUC2], res = (filled.chris2$wnUC2_d.txt[zssn_complete$miss_wnUC2]) - filled_ssn ) max(resdf_chris$res) ; min(resdf_chris$res) myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral"))) # ggplot(resdf_chris, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250)) ggplot(resdf_chris, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))
Discrepancies are less than with Vero’s method…
We could want want to pick all the time or station that have big discrepancies. This could be informative
library(softImpute) vignette(package = "softImpute") data.mat2 <- apply(data.mat2, 2, as.numeric) fit <- softImpute(data.mat2, trace = T, rank.max = 20, lambda = 2, type = "svd") fit <- softImpute(data.mat2, trace = T, rank.max = 20, lambda = 2, type = "als") data.mat2.fil1 <- softImpute::complete(data.mat2, fit) xs=as(data.mat2,"Incomplete") xsc <- biScale(data.mat2, col.scale=FALSE,row.scale=FALSE)
# rownames(zssn) <- data.mat$decdate # zssn <- as.data.frame(zssn) # colnames(zssn) <- colnames(data.mat2.fin) # write.csv(zssn, "SSNfillFULL_R.csv")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.