In proto4426/ValUSunSSN: Val-U-Sun project functions

knitr::opts_chunk$set(echo = TRUE, message = F, warning = F)
library(microbenchmark)
library(profvis)
library(ggplot2)
library(gridExtra)
library(tidyr)
library(reshape2)
library(plotly)
library(parallel)
library(pander)
setwd("H:/LinuxMint/PissoortRepo/ValUSunSSN/ValUSunSSN")
#source('smooth.gaussR_LOAD.R')  # Re-created Gaussian smoother
library(ValUSunSSN)
#load('/home/piss/PissoortRepo/ValUSunSSN/Data/DataSSN.RData')   # Take full dataset
#load('DataSSN81.RData')  # Take data since 1981
# data.chris <- read.csv("Alldata.csv")
# 
# data.chris$Date <- as.Date(paste(data.chris$year,
#                                  data.chris$month, data.chris$day,sep = "-"))
# data.chris[data.chris == -1] <- NA
# 
# data.chris <- data.chris[, colnames(data.chris) %in% c("Ng", "Ns") == F]
# # We do it only for SSN here
# 
# data.mat <- spread(data.chris, key="station", value="SSN")
# data.mat <- data.mat[order(data.mat$decdate),]
# 
# data.mat2 <- data.mat[,-(1:5)] # Get matrix of stations in columns only.
runExample()

Explanations of the main Function

All the functions used are available in the $\texttt{interpol2_em_pissoort.R}$. We will focus here on the main functions, in order to not being "overflowed" of code ...

Run the algorithms : Tests

To do this, we will We ran the algorithm three times :

Once for the full dataset (since 1924) because we thought should not waste the opportunity to use data.
Since 1960 because it was the time period use by the $\texttt{FillInChris.R}$ from C.Ritter
Since 1981 as it seemed to be the most reliable time period regarding L.Lefèvre for example, and this also the time span used by "Véronique".

Moreover, comparisons between the results provided by the different time spans could be interesting. We could for example train a model using data from 1981 and then validate it using previous period, in order to have an independent measure of accuracy (...)

and once in for the unmodified SSN datase We take

## First, we discard stations that have coverage <5% for reliability issues

# If keep these values, the results will have incorrect values...
notNa.sum <- apply(data.mat2, 2, function(x) sum(!is.na(x)))
is_less5_cov <- unname(notNa.sum) < .05 * nrow(data.mat2)
data.mat2.fin <- data.mat2[, !is_less5_cov] # 7 stations are discarded !! 

#data.mat2.fin <- data.mat2.fin[data.mat$decdate >= 1981, ] # take only from 1981 ?


# Take this for input, as advised in the test.m file 
y <- sqrt(data.mat2.fin+1) # Select randomly here, for testing

options(mc.cores=parallel::detectCores()) # all available cores

z <- interpolsvd_em(y, nembed = 2, nsmo = 81, ncomp = 4,
                      niter = 30, displ = F)
# 193 sec for the whole dataset (with some stations discarded)
z <- z$y.filled


zssn <- interpolsvd_em(data.mat2.fin, nembed = 2, nsmo = 81, ncomp = 4,
                        niter = 30, displ = F)
zssn <- zssn$y.filled

rownames(y) <- rownames(zssn) <- 1:nrow(zssn)
y1 <- cbind(as.data.frame(data.mat2.fin), Date = data.mat$Date, x = "Raw")

z1 <- cbind(as.data.frame(zssn), Date = data.mat$Date[data.mat$decdate>1981], x = "Filled")
colnames(y1) <- colnames(z1) <- c(colnames(data.mat2.fin), "Date", "x")

zz <- rbind(y1,z1)

Comparisons with raw data

First of all, we decide to show only the station of Uccle which is considered to some extent, as the "main station". In this dynamic plot provided by $\texttt{plotly}$, you can hide the class you would like to see, e.g. to see raw data you can click on both red and blue button. This plot can conveniently show us, among other things, how the filled alues "fits" the shape of the raw data.

zz_neg_ucc <- data.frame( wnUC2 = zz$wnUC2, Date = zz$Date, x = zz$x, negative = zz$wnUC2 < 0 )
#z1$x <- ifelse(z1$wnUC2 < 0, "Filled <0", "Filled" ) 

#zz_neg_ucc[!is.na(zz_neg_ucc$x == "Filled"), ][,"x"] <- NA
## Visualize "incorrect" filled values, ie those below 0 
# zz_neg_ucc[!is.na(zz_neg_ucc$x == "Filled"), ][ ,"x"] <- ifelse(zz_neg_ucc[!is.na(zz_neg_ucc$x != "Raw"), ][ ,"wnUC2"] < 0, "Filled <0", "Filled" ) 
zz_neg_ucc[(nrow(zz_neg_ucc)/2+1):nrow(zz_neg_ucc), ][ ,"x"] <- ifelse(zz_neg_ucc[(nrow(zz_neg_ucc)/2+1):nrow(zz_neg_ucc), ][ ,"wnUC2"] < 0, as.factor("Filled <0"), as.factor("Filled") ) 
zz_neg_ucc[is.na(zz_neg_ucc$x),][,'x'] <- "Filled <0"
# zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$x  <- ifelse( zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$wnUC2 < 0, "Filled < 0", "Filled" )


# for (i in 1:(nrow(zz)/2) ){ 
#   if ( zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$wnUC2[i] < 0 )
#     zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$x[i] <- "Filled <0"
#   
#  # else  zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$x[i] <- "Filled" 
# }


f <- list(
  family = "Calibri",
  size = 16,
  color = "#33666C"
)
f2 <- list(
  family = "Old Standard TT, serif",
  size = 14,
  color = "black"
)
xl <- list(
  title = " Date",
  titlefont = f,
  tickfont = f2,
   showticklabels = TRUE
)
yl <- list(
  title = "SSN for wnUC2",
  titlefont = f,
  tickfont = f2,
  showticklabels = TRUE
)
tit <- list(
  title = "Dynamic plot of the SSN for  the station of Uccle",
  titlefont = f
)
l <- list(
  font = list(
    family = "sans-serif",
    size = 13,
    color = "#000"),
  bgcolor = "#E2E2E2",
  bordercolor = "#FFFFFF",
  borderwidth = 2)

plot_ly(data = zz_neg_ucc, x = ~Date, y = ~`wnUC2`, type = "scattergl", color = ~x, colors = c("green", "blue", "red")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)# %>% add_data(plot_ly(data = zz_neg_ucc[zz_neg_ucc$negative.wnUC2,], x = ~Date, y = ~`wnUC2`, type = "scattergl", colors = c("green"))) #%>% add_trace()%>% add_markers(color = ~as.factor(negative.wnUC2))

$\underline{\text{Comments}}$ :

We see that there seem to be some problems concerning the positivity of the values.

For convenience, we decide to show now the two retained stations that has the greatest number of available values, i.e. 21465 (%) and 20855 (%) for wnKS2 and wnKZ2 respectively, and also the two stations with the lowest number of avialable values

notNa.sum <- apply(data.mat2.fin, 2, function(x) sum(!is.na(x)))
#sort(notNa.sum)/nrow(data.mat2.fin) * 100

df <- data.frame(wnKS2 = "21465 (64.4%)", wnKZ2 = "18957 (56.9%) ", `wnFR-S` = "2119 (6.4%)", `wnGU-S` = "2185 (6.6%)" )
rownames(df) <- c("# available values ")
pander(df)

yl$title <- "SSN for wnFR-S"
p1 <- plot_ly(data = zz, x = ~Date, y = ~`wnFR-S`, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)

yl$title <- "SSN for wnGU-S"
p2 <- plot_ly(data = zz, x = ~Date, y = ~`wnGU-S` , type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) 

yl$title <- "SSN for wnKS2"
p3 <- plot_ly(data = zz, x = ~Date, y = ~wnKS2, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) 

yl$title <- "SSN for wnKZ2"
p4 <- plot_ly(data = zz, x = ~Date, y = ~wnKZ2, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) 

subplot(p1, p2, nrows = 1,shareY = T, titleY = T )

For these two stations, we remark that the small number of available values indeed impact the uncertainty and the way these missing values are filled by the algorithm.

In contrast, when the number of available values is high $\Rightarrow$

subplot(p3, p4, nrows = 1,shareY = T, titleY = T)

The filling seem quite more homogenous and thus robust.

# z1 <- cbind(as.data.frame(zz[nrow(data.mat),]), Date = data.mat$Date )
# colnames(z1) <- c(colnames(data.mat2.fin), "Date")
# zz.full <- melt(z1, id.vars = "Date" )
# dimnames(zz.full)[[2]][2] <- "Station"
# dimnames(zz.full)[[2]][3] <- "SSN"

g <- ggplot(zz.full) + geom_line(aes( x = Date, y = SSN, col = Station), size = .4) +
 scale_color_hue(l = 40, c = 200) + theme_piss()
  # scale_colour_brewer(name=expression(underline(bold("data: "))),
  #                     palette = "Set1") + 
  #guides(colour = guide_legend(override.aes = list(size= 2.5))) 

# ggplotly(g)

plot_ly(data = zz.full, x = ~Date, y = ~SSN, type = "scattergl", color = ~Station )

We can first notice a "problem", that is the values for SSN which are below 0

Could'nt we

Comparisons with other filling methods

Filling method of "Vero", time series starting from 1981

1. Are the non-missing values still the same ?

We hope so, but we will check it.

filledt.vero <- read.csv("/home/piss/PissoortRepo/ValUSunSSN/data/time_SSNf_1981.csv")

# By looking at the Julian dates at the top and the bottom of the series, we retrieved the dates.
filledt.vero$Date <- seq(as.Date("1981-02-19"), as.Date("2015-03-30"), by = 1)

#load('/home/piss/PissoortRepo/ValUSunSSN/Data/DataSSN81.RData')

colnames(zssn) <- colnames(z1[, -((ncol(z1)-1):(ncol(z1)))])
zssn <- cbind.data.frame(zssn, times = z1$Date)

zssn <- zssn[as.Date(zssn$times) %in% filledt.vero$Date,]

## add indicators for missingness
missings <- is.na()
colnames(missings) <- paste0("miss_", colnames(missings))
zssn_complete <- cbind.data.frame(zssn, missings)
sum(zssn_complete$miss_wnUC2)

head(dif <- zssn_complete$wnUC2[!zssn_complete$miss_wnUC2] -
       data.mat2.fin$wnUC2[!zssn_complete$miss_wnUC2] )
sum (dif) / 0.0024342

The difference actually the same for all the observations. this not really an intrinsic difference but rather a variable type difference; the first being integer and the latter being real (numeric).

2. Comparisons with all data (missing & non-missing)

# df_uccle_vero <- data.frame(time = zssn$times[1:nrow(filledt.vero)], wnUC2 = filledt.vero$UC2, from = "Vero")
# 
# df_uccle_v <- rbind.data.frame(df_uccle_chris, data.frame(time = zssn$times[1:nrow(filledt.vero)], wnUC2 = zssn$wnUC2[1:nrow(filledt.vero)], from = "`interpolEM`"))
# 
# plot_ly(data = df_uccle_v, x = ~time, y = ~`wnUC2`, type = "scattergl", color = ~from, colors = c("green", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)


df_uccle_vero <- data.frame(time = zssn$times, SSN_wnUC2 = filledt.vero$UC2, from = "Vero")

df_uccle_c <- rbind.data.frame(df_uccle_vero, data.frame(time = zssn$times, SSN_wnUC2 = zssn[,"wnUC2"], from = "`interpolEM`"))

ggplot(df_uccle_c, aes(col = from)) + geom_point(aes(x = time, y = SSN_wnUC2), size = 0.15) + 
  solar.cycle() + theme_piss() +  guides(colour = guide_legend(override.aes = list(size= 3))) + ggtitle("Filled missing values")

Analyze residuals

resdf_vero <- data.frame(time = zssn$times,
                         res = (filledt.vero$UC2) - (zssn[,"wnUC2"]) ) 
max(resdf_vero$res)  ; min(resdf_vero$res)

library(RColorBrewer)
myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral")))

ggplot(resdf_vero, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))

Or we could also visualize the discrepancies between the filled values

Only for the filled data

filled_ssn <- zssn[,"wnUC2"][zssn_complete$miss_wnUC2]

resdf_vero <- data.frame(time = zssn$times[zssn_complete$miss_wnUC2],
                         res = (filledt.vero$UC2[zssn_complete$miss_wnUC2]) - filled_ssn ) 
max(resdf_vero$res)  ; min(resdf_vero$res)

myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral")))

# ggplot(resdf_vero, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))
ggplot(resdf_vero, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35)  + theme_piss() + solar.cycle() +
  scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))

Filling method of C.Ritter, time series starting from 1960

Again, we will take th station of Uccle for reference.

# filledt.chris <- read.csv("SSNfilledChris.csv") # Load data
# 
# load("DataSSN60.RData") # Load data we have build
# 
# filled.chris2 <- filledt.chris[,-2]
# 
# df_uccle_chris <- data.frame(time = filled.chris2$times, wnUC2 = filled.chris2$wnUC2_d.txt, from = "Chris")
# 
# df_uccle_c <- rbind.data.frame(df_uccle_chris, data.frame(time = filled.chris2$times, wnUC2 = zssn$wnUC2[1:nrow(df_uccle_chris)], from = "`interpolEM`"))
# 
# plot_ly(data = df_uccle_c, x = ~time, y = ~`wnUC2`, type = "scattergl", color = ~from, colors = c("green", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)

filledt.chris <- read.csv("SSNfilledChris.csv") # Load data

load("DataSSN60.RData") # Load data we have build

filled.chris2 <- filledt.chris[,-2]

## Are the non-missing values still the same ?? 
## add indicators for missingness
missings <- is.na(y)
colnames(missings) <- paste0("miss_", colnames(missings))
zssn_complete <- cbind.data.frame(zssn, missings)
sum(zssn_complete$miss_wnUC2)


head(dif <- zssn_complete$wnUC2[!zssn_complete$miss_wnUC2] - data.mat2.fin$wnUC2[!zssn_complete$miss_wnUC2] )
sum (dif) / 0.0024342

Same idea. The very slight discrepancies are du to (type) approximations

df_uccle_chris <- data.frame(time = filled.chris2$times, wnUC2 = filled.chris2$wnUC2_d.txt, from = "Chris")

df_uccle_c <- rbind.data.frame(df_uccle_chris, data.frame(time = filled.chris2$times, wnUC2 = zssn$wnUC2[1:nrow(df_uccle_chris)], from = "`interpolEM`"))


# Comparisons for first 5 years (1980 : 1985)
plot_ly(data = df_uccle_c[1980 < df_uccle_c$time & df_uccle_c$time < 1985 ,], x = ~time, y = ~`wnUC2`, color = ~from, type = "scattergl", colors = c("green", "blue"), mode =  "markers+text") %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)


## Residuals

resdf_chris <- data.frame(time = z1$Date[1:nrow(filled.chris2)],
                         res = (filled.chris2$wnUC2_d.txt) - (zssn[,"wnUC2"][1:nrow(filled.chris2)]) ) 
max(resdf_chris$res)  ; min(resdf_chris$res)

# ggplot(resdf_chris, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-95, 88))
ggplot(resdf_chris, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35)  + theme_piss() + solar.cycle() +
  scale_colour_gradientn(colours = myPalette(10),limits=c(-95, 88))

Only for filled values :

filled_ssn <- zssn[,"wnUC2"][1:nrow(df_uccle_chris)][zssn_complete$miss_wnUC2]

resdf_chris <- data.frame(time = z1$Date[1:nrow(filled.chris2)][zssn_complete$miss_wnUC2],
                         res = (filled.chris2$wnUC2_d.txt[zssn_complete$miss_wnUC2]) - filled_ssn ) 
max(resdf_chris$res)  ; min(resdf_chris$res)

myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral")))

# ggplot(resdf_chris, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))
ggplot(resdf_chris, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35)  + theme_piss() + solar.cycle() +
  scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))

Discrepancies are less than with Vero’s method…

We could want want to pick all the time or station that have big discrepancies. This could be informative

library(softImpute)
vignette(package = "softImpute")

data.mat2 <- apply(data.mat2, 2, as.numeric)

fit <- softImpute(data.mat2, trace = T, rank.max = 20, lambda = 2, type = "svd")
fit <- softImpute(data.mat2, trace = T, rank.max = 20, lambda = 2, type = "als")

data.mat2.fil1 <- softImpute::complete(data.mat2, fit)


xs=as(data.mat2,"Incomplete")
xsc <- biScale(data.mat2, col.scale=FALSE,row.scale=FALSE)

# rownames(zssn) <- data.mat$decdate
# zssn <- as.data.frame(zssn) 
# colnames(zssn) <- colnames(data.mat2.fin) 
# write.csv(zssn, "SSNfillFULL_R.csv")

proto4426/ValUSunSSN documentation built on May 26, 2019, 10:31 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com