# inst/doc/synthetic_news_data.R

## ----setup, include = FALSE---------------------------------------------------
# Chunk defaults applied to every later chunk: `collapse = TRUE` and "#>" as
# the prefix on printed output lines.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----observed_data_generation-------------------------------------------------
library(readr)
library(dplyr)

# Load the observed NEWS extract that the synthetic data set is modelled on.
# The URL is assembled with paste0() purely to respect the line-length limit;
# the resulting string is unchanged.
#
# suppressWarnings() is scoped to the import alone so that warnings raised by
# any later step are not hidden along with readr's parsing warnings.
df <- suppressWarnings(
  read_csv(
    paste0(
      "https://raw.githubusercontent.com/StatsGary/SyntheticNEWSData/",
      "main/observed_news_data.csv"
    )
  )
) %>%
  # Drop the auto-named leading column (presumably an unnamed row index in the
  # CSV -- confirm against the file). readr < 2.0 calls it "X1", readr >= 2.0
  # calls it "...1"; any_of() removes whichever exists and does not error when
  # a name is absent, unlike the bare `-X1` it replaces.
  dplyr::select(-any_of(c("X1", "...1")))

glimpse(df)


## ----synth--------------------------------------------------------------------
library(synthpop)

# Run synthpop's syn() over the observed frame with a fixed seed.
syn_df <- syn(data = df, seed = 4321)

# Pull the synthetic data out of the `syn` element of the returned object.
synthetic_news_data <- syn_df[["syn"]]
glimpse(synthetic_news_data)


## ----visuals------------------------------------------------------------------
library(ggplot2)

# One tibble per source holding the `temp` column, tagged with a `label` so
# the observed and synthetic values can be told apart once stacked.
obs <- tibble(label = "observed_data", value = df$temp)
synth <- tibble(label = "synthetic_data", value = synthetic_news_data$temp)

# Stack both sources into a single long frame for plotting.
merged <- bind_rows(obs, synth)

# Overlaid histograms of the two sources, filled by `label`.
plot <- ggplot(merged, aes(x = value, fill = label)) +
  # bins = 30 is what geom_histogram() uses when nothing is given; stating it
  # keeps the same binning and avoids the "Pick better value" message.
  geom_histogram(bins = 30, alpha = 0.9, position = "identity") +
  theme_minimal() +
  # Named palette: each colour is tied to its label explicitly rather than to
  # the position of the label in the level ordering.
  scale_fill_manual(
    values = c(observed_data = "#BCBDC1", synthetic_data = "#2061AC")
  ) +
  labs(
    title = "Observed vs Synthetic NEWS values",
    subtitle = "Based on NEWS Temperature score",
    x = "NEWS Temperature Score",
    y = "Score frequency"
  ) +
  # Must follow theme_minimal(), which would otherwise reset this setting.
  # NOTE(review): with the legend hidden the figure itself does not say which
  # colour is observed and which is synthetic -- confirm this is intended.
  theme(legend.position = "none")

print(plot)

# Try the NHSRdatasets package in your browser

# Any scripts or data that you put into this service are public.

# NHSRdatasets documentation built on March 14, 2021, 1:06 a.m.