# Install devtools from CRAN only when it is missing, then attach it so that
# install_github() is available below.
# requireNamespace() is used for the availability check rather than scanning
# the full installed.packages() matrix, which is slow on large libraries.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools", repos = "https://cloud.r-project.org")
}
library(devtools)

# Install DescriptiveStats from GitHub only when it is missing.
if (!requireNamespace("DescriptiveStats", quietly = TRUE)) {
  install_github("N1h1l1sT/DescriptiveStatsR", upgrade = FALSE)
}
library(DescriptiveStats)
# Given any Dataset, for instance 'ExampleDS' which is based on real data from the industry:
#We can get Descriptive Statistics for it by calling DescriptiveStats()
# Compute descriptive statistics for ExampleDS and keep them in ExampleStats.
# CalculateGraphs is the only argument that has to be set; the rest may be
# skipped.
ExampleStats <-
  ExampleDS %>%
  DescriptiveStats(
    # FALSE calculates only the matrices; TRUE also produces the plots.
    CalculateGraphs = TRUE,

    # Each row is measured 10 minutes after the previous one, so the data is
    # a time series.
    IsTimeSeries = TRUE,
    # If IsTimeSeries = TRUE and this is left blank, a new TimeFlowVar is
    # introduced as a number from 1 to NROW(Dataset).
    TimeFlowVar = "Date",

    # The dependent variable is how much power was produced, i.e. "KWh".
    DependentVar = "KWh",
    # Group per windmill to see how the windmills compare with each other.
    GroupBy = "Windmill",

    # Juxtapose a second/third variable in the boxplots, e.g. KWh per month
    # coloured by WindSpeed: does KWh go higher as WindSpeed goes higher?
    BoxplotPointsColourVar = "WindSpeed"
  )
#The above is the Console Output of the function, but the real wealth of results is yet to be seen.
#They are inside the variable 'ExampleStats' and can be saved to a folder of your choice:
# Save everything held in ExampleStats to disk.
# path_stats is the destination folder: a new folder named "ExampleDS" under
# the current working directory.
# No trailing comma after the last argument: a trailing comma would pass an
# extra, empty argument to SaveDescrStats().
ExampleStats %>%
  SaveDescrStats(
    path_stats = file.path(getwd(), "ExampleDS")
  )
#So a folder is created, containing the following:
We can see that KWh, ReactivePower and RotorSpeed are statistically significantly strongly correlated with each other, forming one cluster, whilst Orientation is not correlated with any other variable, and with ReactivePower in particular, the -0.02 correlation is not statistically significant, as portrayed by the "X" symbol.
Here, we can see, for instance, that we have less data for February, which might be partially explained by the fact that February has fewer days than other months. However, we see that not all 30-day months, nor all 31-day months, contain exactly the same amount of data (rows) either. There is also a disparity between windmills.
So, we can see that the best months for Power Production on the windmills were January, November, May and February, whilst the worst were July and October. We also see that Windmill number 4 outperformed all other windmills by a significant margin.
Notice how the dots at low Power Production are mainly red, meaning WindSpeed of 10 or less, but for high values of KWh, the dots are mainly purple, so WindSpeed values of about 20, which is what we expected, but it's nice to visually confirm it.
Looking at the 1st boxplot, we see that 25% of the time the windmill basically produces zero or near-zero power, and 50% of the production is between 80 and 280 KWh. We can also see that the orientation, while it seemingly can take any value, mostly falls within 3 zones.
Notice, for example, the long right tail on WindSpeed's distribution. Windspeed is mainly less than 20, but it seems to be taking every value between 20 and 32, albeit very rarely.
Also notice how much easier it is to see that much of the time the Windmill produces near-zero values or very high values. Combining this with what we saw on boxplots, pretty much 25% of the time it produces nothing, 25% it produces lots of power and 50% of the time it produces normal amounts of power.
This KWh-Windspeed plot reveals something unexpected; whilst in general, the stronger the wind, the higher the energy production, there is a zero KWh production for nearly any value of wind, however high it is. Another interesting thing unearthed is that even though the trend holds true for windspeeds up to about 25, we see that for really high WindSpeed values the production is always less than that of just high WindSpeed.
One thing that immediately becomes clear viewing this plot is that Reactive Power's behaviour changed after February.
Notice how the transparent grey lines make it easy to see when there are great differences between each point in time. If the rise or fall of values is smooth, no grey line appears, like in: 1, 5, 10, 20, 15, 7, 4. If there is missing time as well, we'll see coloured points before, coloured points after, and in the empty middle there'll be a grey line highlighting the missing pieces.
In all the examples below, all statistics will be saved in a folder under your Working Directory, which is usually your project's folder.
#Load the Dataset
# "storms" is part of dplyr, so it is available even before data() is called.
data("storms")

# Build the analysis table: status becomes a factor, the four date-part
# columns are merged into a single Date column, and Date is moved to the front.
StormsDS <-
  storms %>%
  as_tibble() %>%
  mutate(
    # Parse "year-month-day hour" into one date-time value.
    Date = ymd_h(paste0(year, "-", month, "-", day, " ", hour)),
    # Only a few distinct statuses across the rows, so treat it as a factor
    # rather than free text.
    status = as.factor(status)
  ) %>%
  # year, month, day and hour are now contained in Date, so drop them.
  select(-c(year, month, day, hour)) %>%
  # Purely cosmetic: put "Date" first and keep the remaining columns in their
  # existing order.
  {
    .[, union("Date", names(.))]
  }

print(StormsDS)
#Run the Descriptive Statistics
# Descriptive statistics for the storms data, treated as a time series
# ordered by Date, with "wind" as the dependent variable and results grouped
# by "category".
DescrStats <-
  StormsDS %>%
  DescriptiveStats(
    CalculateGraphs = TRUE,

    # Time-series settings.
    IsTimeSeries = TRUE,
    TimeFlowVar = "Date",

    # Variables of interest.
    DependentVar = "wind",
    GroupBy = "category",
    # NOTE(review): presumably controls the variable ordering of the
    # correlation output; confirm against the package documentation.
    CorrVarOrder = "PCA",

    # Boxplot points: coloured by "pressure", slightly transparent and small.
    BoxplotPointsColourVar = "pressure",
    BoxPlotPointAlpha = 0.6,
    BoxPlotPointSize = 0.9
  )
#Save the Descriptive Statistics
# Save the storms statistics into "StormsDS/" under the working directory.
# NOTE(review): the Width/Height pairs presumably set the output size of the
# numeric, categorical and time-series plots respectively; confirm units
# against the package documentation.
DescrStats %>%
  SaveDescrStats(
    file.path(getwd(), "StormsDS/"),
    NumWidth = 1024, NumHeight = 768,
    CatWidth = 800, CatHeight = 600,
    TimeSeriesWidth = 1280, TimeSeriesHeight = 800
  )
#Load the Dataset
# "mpg" is part of ggplot2; it can also be referenced as ggplot2::mpg.
data("mpg")

# Build the analysis table: derive a Date from the model year, turn the
# categorical columns into factors with readable labels, give the terse
# columns descriptive names, and move Date to the front.
MpgDS <-
  mpg %>%
  as_tibble() %>%
  mutate(
    Date = ymd(paste0(year, "-01-01")), # Only 2 distinct years in the data
    manufacturer = as.factor(manufacturer),
    trans = as.factor(trans),
    # "r" is rear-wheel drive; the label is spelled "Rear wheel" so that plots
    # and tables show the correct name.
    drv = factor(
      drv,
      levels = c("f", "r", "4"),
      labels = c("Front wheel", "Rear wheel", "Four wheel")
    ),
    fl = factor(
      fl,
      levels = c("e", "d", "r", "p", "c"),
      labels = c("Ethanol", "Diesel", "Regular", "Premium", "Natural gas")
    ),
    class = as.factor(class)
  ) %>%
  # year is now represented by Date, so drop it.
  select(-c(year)) %>%
  rename(
    TransmissionType = trans,
    DriveType = drv,
    CityMilesPerGallon = cty,
    HighwayMilesPerGallon = hwy,
    FuelType = fl,
    CylindersNum = cyl,
    EngineDisplacement = displ
  ) %>%
  # Purely cosmetic: make "Date" the first column and keep the others in order.
  {.[, c("Date", setdiff(names(.), "Date"))]}

print(MpgDS)
#Run the Descriptive Statistics
# Descriptive statistics for the mpg data with "class" as the dependent
# variable and results grouped by "DriveType". No time-series arguments are
# passed here.
DescrStats <-
  MpgDS %>%
  DescriptiveStats(
    CalculateGraphs = TRUE,

    # Variables of interest.
    DependentVar = "class",
    GroupBy = "DriveType",
    # NOTE(review): presumably controls the variable ordering of the
    # correlation output; confirm against the package documentation.
    CorrVarOrder = "PCA",

    # Boxplot points: coloured by city mileage.
    BoxplotPointsColourVar = "CityMilesPerGallon",
    BoxPlotPointAlpha = 0.7,
    BoxPlotPointSize = 1
  )
#Save the Descriptive Statistics
# Save the mpg statistics into "MpgDS/" under the working directory.
# NOTE(review): Num*/Cat* presumably size the numeric and categorical plots;
# confirm against the package documentation.
DescrStats %>%
  SaveDescrStats(
    file.path(getwd(), "MpgDS/"),
    NumWidth = 900, NumHeight = 600,
    CatWidth = 1680, CatHeight = 850
  )
#Load the Dataset
# "diamonds" is part of ggplot2; it can also be referenced as
# ggplot2::diamonds.
data("diamonds")

# No reshaping is needed; just make sure we are working with a tibble.
DiamondsDS <- as_tibble(diamonds)

print(DiamondsDS)
#Run the Descriptive Statistics
# Descriptive statistics for the diamonds data with "price" as the dependent
# variable and results grouped by "cut".
DescrStats <-
  DiamondsDS %>%
  DescriptiveStats(
    CalculateGraphs = TRUE,

    # Variables of interest.
    DependentVar = "price",
    GroupBy = "cut",
    # NOTE(review): presumably controls the variable ordering of the
    # correlation output; confirm against the package documentation.
    CorrVarOrder = "PCA",

    # Boxplot points: coloured by "carat", small and fairly transparent.
    BoxplotPointsColourVar = "carat",
    BoxPlotPointAlpha = 0.3,
    BoxPlotPointSize = 0.5
  )
#Save the Descriptive Statistics
# Save the diamonds statistics into "DiamondsDS/" under the working directory.
# NOTE(review): Num*/Cat* presumably size the numeric and categorical plots;
# confirm against the package documentation.
DescrStats %>%
  SaveDescrStats(
    file.path(getwd(), "DiamondsDS/"),
    NumWidth = 1000, NumHeight = 700,
    CatWidth = 850, CatHeight = 550
  )
#Load the Dataset
# "Titanic" ships with the datasets package; it can also be referenced as
# datasets::Titanic.
data("Titanic")

# Convert to a tibble, make the four descriptive columns factors, then use
# the count column `n` to repeat each row n times via uncount().
TitanicDS <-
  Titanic %>%
  as_tibble() %>%
  mutate(
    Survived = as.factor(Survived),
    Class = as.factor(Class),
    Sex = as.factor(Sex),
    Age = as.factor(Age)
  ) %>%
  uncount(n)

print(TitanicDS)
#Run the Descriptive Statistics
# Descriptive statistics for the Titanic data with "Survived" as the
# dependent variable and results grouped by "Sex". No boxplot point options
# are passed here.
DescrStats <-
  TitanicDS %>%
  DescriptiveStats(
    CalculateGraphs = TRUE,
    DependentVar = "Survived",
    GroupBy = "Sex",
    # NOTE(review): presumably controls the variable ordering of the
    # correlation output; confirm against the package documentation.
    CorrVarOrder = "PCA"
  )
#Save the Descriptive Statistics
# Save the Titanic statistics into "TitanicDS/" under the working directory.
# NOTE(review): Num*/Cat* presumably size the numeric and categorical plots;
# confirm against the package documentation.
DescrStats %>%
  SaveDescrStats(
    file.path(getwd(), "TitanicDS/"),
    NumWidth = 1000, NumHeight = 700,
    CatWidth = 850, CatHeight = 550
  )
#Load the Dataset
# "CO2" ships with the datasets package; it can also be referenced as
# datasets::CO2.
data("CO2")

# No reshaping is needed; just make sure we are working with a tibble.
Co2DS <- as_tibble(CO2)

print(Co2DS)
#Run the Descriptive Statistics
# Descriptive statistics for the CO2 data with "uptake" as the dependent
# variable and results grouped by "Type". CorrVarOrder is not set here.
DescrStats <-
  Co2DS %>%
  DescriptiveStats(
    CalculateGraphs = TRUE,

    # Variables of interest.
    DependentVar = "uptake",
    GroupBy = "Type",

    # Boxplot points: coloured by "conc", fully opaque.
    BoxplotPointsColourVar = "conc",
    BoxPlotPointAlpha = 1,
    BoxPlotPointSize = 1
  )
#Save the Descriptive Statistics
# Save the CO2 statistics into "CO2DS/" under the working directory.
# NOTE(review): Num*/Cat* presumably size the numeric and categorical plots;
# confirm against the package documentation.
DescrStats %>%
  SaveDescrStats(
    file.path(getwd(), "CO2DS/"),
    NumWidth = 1000, NumHeight = 700,
    CatWidth = 850, CatHeight = 550
  )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.