# R/documentation.R

# Package-level help page: roxygen2 attaches the block below to the NULL placeholder.
#' Data for Data Analysis Course
#'
#' This package contains datasets for use with the Data Analysis course at CodeClan.
#' @docType package
#' @name CodeClanData
NULL

#########
### A ###
#########

#' Game of Thrones Deaths
#'
#' Character deaths in Game of Thrones.
#'
#' @source \url{https://www.kaggle.com/mylesoneill/game-of-thrones#character-deaths.csv}
#' @format A data frame
"all_deaths"

#########
### B ###
#########

#' Backpack Weight and Back Problems Data
#'
#' A dataset looking at the weight of the backpacks carried by higher education
#' students and their back problems.
#'
#' @format A data frame
"backpack"

#' Bank Expense
#'
#' Selected items from a large bank's profit/loss statement.
#'
#' @format A data frame
"bank_expense"


# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' bayestown_survey
#'
#' bayestown_survey
#'
#' @format A data frame
"bayestown_survey"

#' Beer
#'
#' A dataset with the brand, brewer, alcohol percentage,
#' calories and grams of carbohydrates for 172 beers.
#'
#' Some data cleaning is required for this data.
#'
#' @format A data frame
"beer"


# NOTE(review): the description says 1972 while yrdispl is coded 1982-1991 —
# confirm the survey year against the original source.
#' Unemployment of Blue Collar Workers
#'
#' @description
#' a cross-section from 1972
#'
#' number of observations : 4877
#'
#' observation : individuals
#'
#' country : United States
#'
#' @format
#' A dataset containing :
#'
#' stateur
#' state unemployment rate (in %)
#'
#' statemb
#' state maximum benefit level
#'
#' state
#' state of residence code
#'
#' age
#' age in years
#'
#' tenure
#' years of tenure in job lost
#'
#' joblost
#' a factor with levels (slack_work,position_abolished,seasonal_job_ended,other)
#'
#' nwhite
#' non-white ?
#'
#' school12
#' more than 12 years of school ?
#'
#' sex
#' a factor with levels (male,female)
#'
#' bluecol
#' blue collar worker ?
#'
#' smsa
#' lives in smsa ?
#'
#' married
#' married ?
#'
#' dkids
#' has kids ?
#'
#' dykids
#' has young kids (0-5 yrs) ?
#'
#' yrdispl
#' year of job displacement (1982=1,..., 1991=10)
#'
#' rr
#' replacement rate
#'
#' head
#' is head of household ?
#'
#' ui
#' applied for (and received) UI benefits ?
#'
#' @source
#'
#' McCall, B.P. (1995) “The impact of unemployment insurance benefit levels on recipiency”, Journal of Business and Economic Statistics, 13, 189–198.
#'
#' @references
#'
#' Verbeek, Marno (2004) A guide to modern econometrics, John Wiley and Sons, \url{http://www.econ.kuleuven.ac.be/GME}, chapter 7.
#'
#' Journal of Business Economics and Statistics web site : \url{http://www.amstat.org/publications/jbes/}.
"benefits"

#' Bimodal Data
#'
#' A dataset with simulated bimodal data for teaching.
#'
#' @format A data frame
"bimodal"

#' Blood Pressure
#'
#' Systolic blood pressure (mmHg) versus daily saturated fat intake (mg) for 25 subjects.
#'
#' @format A data frame
"blood_pressure"


#########
### C ###
#########

#' Car Use
#'
#' Car use, population and SO2 concentration in the air of unidentified towns.
#'
#' @format A data frame
"car_use"

# TODO(docs): say what is recorded about each meal and where the data comes from.
#' Chinese Meals
#'
#' Data on Chinese meals.
#'
#' @format A data frame
"chinesemeal"

#' Colour List
#'
#' Invented colour information, stored as a nested list.
#'
#' @format A nested list with 4 elements
"colour_list"

#' Core Competencies
#'
#' Assessment of 4 widget companies' core competencies.
#'
#' @format A data frame
"competencies"

# TODO(docs): describe the variables and the source of this telecom data.
#' Telecom Data
#'
#' Telecom data.
#'
#' @format A data frame
"comms_data"

# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' cuckoo
#'
#' cuckoo
#'
#' @format A data frame
"cuckoo"

#' Cycle Routes
#'
#' Data on some Edinburgh cycle routes.
#'
#' @format A list of data frames
"cycle_routes"

#########
### D ###
#########

#' d20 Outcomes
#'
#' The outcomes from rolling a 20-sided die 201 times.
#'
#' @format A data frame
"d20_outcomes"

#' 5d20 Outcomes
#'
#' The outcomes from rolling five 20-sided dice and adding up the scores, 1001 times.
#'
#' @format A data frame
"d20x5_outcomes"

#' Synthetic Dataset 1
#'
#' \code{data1}: synthetic dataset 1.
#'
#' @format A data frame
"data1"

#' Synthetic Dataset 2
#'
#' \code{data2}: synthetic dataset 2.
#'
#' @format A data frame
"data2"

# TODO(docs): title and description only repeat the object name; presumably a
# third synthetic dataset like data1/data2 — confirm and describe it.
#' data3
#'
#' data3
#'
#' @format A data frame
"data3"

#' Causes of Death in UK Males
#'
#' Main causes of death in UK males in 2016.
#'
#' @format A data frame
"death_males"

#' Drinks Content
#'
#' Starbucks calorie dataset.
#'
#' @source \url{https://www.kaggle.com/starbucks/starbucks-menu}
#' @format A data frame
"drinks_content"

#########
### E ###
#########

# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' energy_scotland
#'
#' energy_scotland
#'
#' @format A data frame
"energy_scotland"

#' European Banking Crises
#'
#' Banking crises in European countries 1800-2010.
#'
#' @format A data frame
"EUbank"

#' European GDP Volatility and Inequality
#'
#' Volatility of GDP, Gini coefficient (measure of inequality) and percentage
#' of GDP attributable to services for 17 European countries.
#'
#' @format A data frame
"euro_ineq"

# TODO(docs): expand the acronym PSI and describe the variables.
#' Example PSI
#'
#' Example of PSI data.
#'
#' @format A data frame
"example_psi"


#########
### F ###
#########

#' Fitness Levels
#'
#' Aerobic fitness levels of school children in different activity groups, at different ages.
#'
#' @format A data frame
"fitness_levels"

#' Edinburgh Flat Prices
#'
#' Estimated average flat prices and monthly rents in Edinburgh 2011-2019.
#'
#' @format A data frame
"flatPrices"

#########
### G ###
#########

#' Game of Thrones
#'
#' Data about the book 'A Game of Thrones', including names
#' and genders for all the characters.
#'
#' @format A matrix
"game_of_thrones"

#' Video Game Sales Data
#'
#' Global game sales data.
#'
#' @format A data frame with the following variables:
#' \describe{
#'   \item{name}{Title of the game}
#'   \item{genre}{Type of game}
#'   \item{year_of_release}{Year the game was released}
#'   \item{publisher}{Publisher of game}
#'   \item{sales}{Total global sales, in millions}
#'   \item{critic_score}{Critics review score (from 0 to 100) from Metacritic}
#'   \item{user_score}{Users review score (from 0 to 10) from Metacritic}
#'   \item{developer}{Studio that created the game}
#'   \item{rating}{American ESRB rating for the game}
#'   \item{platform}{Console the game runs on}
#' }
"game_sales"

#' Game of Thrones Ratings
#'
#' The IMDB ratings for Game of Thrones, seasons 1 to 8, by episode.
#'
#' @format A data frame
"got_ratings"

# NOTE(review): the format text announces 23 variables but only 19 are
# described below — confirm against the data and document the missing ones.
#' Data from A.-M. Guerry, "Essay on the Moral Statistics of France"
#'
#' @description
#' Andre-Michel Guerry (1833) was the first to systematically collect and analyze social data on such things as crime, literacy and suicide with the view to determining social laws and the relations among these variables.
#'
#' The Guerry data frame comprises a collection of 'moral variables' on the 86 departments of France around 1830. A few additional variables have been added from other sources.
#'
#' @format
#' A data frame with 86 observations (the departments of France) on the following 23 variables.
#'
#' dept
#' Department ID: Standard numbers for the departments, except for Corsica (200)
#'
#' Region
#' Region of France ('N'='North', 'S'='South', 'E'='East', 'W'='West', 'C'='Central'). Corsica is coded as NA
#'
#' Department
#' Department name: Departments are named according to usage in 1830, but without accents. A factor with levels Ain Aisne Allier ... Vosges Yonne
#'
#' Crime_pers
#' Population per Crime against persons. Source: A2 (Compte general, 1825-1830)
#'
#' Crime_prop
#' Population per Crime against property. Source: A2 (Compte general, 1825-1830)
#'
#' Literacy
#' Percent Read & Write: Percent of military conscripts who can read and write. Source: A2
#'
#' Donations
#' Donations to the poor. Source: A2 (Bulletin des lois)
#'
#' Infants
#' Population per illegitimate birth. Source: A2 (Bureau des Longitudes, 1817-1821)
#'
#' Suicides
#' Population per suicide. Source: A2 (Compte general, 1827-1830)
#'
#' MainCity
#' Size of principal city ('1:Sm', '2:Med', '3:Lg'), used as a surrogate for population density. Large refers to the top 10, small to the bottom 10; all the rest are classed Medium. Source: A1. An ordered factor with levels 1:Sm < 2:Med < 3:Lg
#'
#' Wealth
#' Per capita tax on personal property. A ranked index based on taxes on personal and movable property per inhabitant. Source: A1
#'
#' Commerce
#' Commerce and Industry, measured by the rank of the number of patents / population. Source: A1
#'
#' Clergy
#' Distribution of clergy, measured by the rank of the number of Catholic priests in active service / population. Source: A1 (Almanach officiel du clergy, 1829)
#'
#' Crime_parents
#' Crimes against parents, measured by the rank of the ratio of crimes against parents to all crimes– Average for the years 1825-1830. Source: A1 (Compte general)
#'
#' Infanticide
#' Infanticides per capita. A ranked ratio of number of infanticides to population– Average for the years 1825-1830. Source: A1 (Compte general)
#'
#' Donation_clergy
#' Donations to the clergy. A ranked ratio of the number of bequests and donations inter vivos to population– Average for the years 1815-1824. Source: A1 (Bull. des lois, ordunn. d'autorisation)
#'
#' Lottery
#' Per capita wager on Royal Lottery. Ranked ratio of the proceeds bet on the royal lottery to population— Average for the years 1822-1826. Source: A1 (Compte rendus par le ministre des finances)
#'
#' Desertion
#' Military desertion, ratio of the number of young soldiers accused of desertion to the force of the military contingent, minus the deficit produced by the insufficiency of available billets– Average of the years 1825-1827. Source: A1 (Compte du ministere du guerre, 1829 etat V)
#'
#' Instruction
#' Instruction. Ranks recorded from Guerry's map of Instruction. Note: this is inversely related to Literacy (as defined here)
#'
#' @source
#' Angeville, A. (1836). Essai sur la Statistique de la Population française Paris: F. Doufour.
#'
#' Guerry, A.-M. (1833). Essai sur la statistique morale de la France Paris: Crochard. English translation: Hugh P. Whitt and Victor W. Reinking, Lewiston, N.Y. : Edwin Mellen Press, 2002.
#'
#' Parent-Duchatelet, A. (1836). De la prostitution dans la ville de Paris, 3rd ed, 1857, p. 32, 36
#'
#' @references
#' Dray, S. and Jombart, T. (2011). A Revisit Of Guerry's Data: Introducing Spatial Constraints In Multivariate Analysis. The Annals of Applied Statistics, Vol. 5, No. 4, 2278-2299. \url{http://arxiv.org/pdf/1202.6485.pdf}, DOI: 10.1214/10-AOAS356.
#'
#' Brunsdon, C. and Dykes, J. (2007). Geographically weighted visualization: interactive graphics for scale-varying exploratory analysis. Geographical Information Science Research Conference (GISRUK 07), NUI Maynooth, Ireland, April, 2007.
#'
#' Friendly, M. (2007). A.-M. Guerry's Moral Statistics of France: Challenges for Multivariable Spatial Analysis. Statistical Science, 22, 368-399
#'
#' Friendly, M. (2007). Data from A.-M. Guerry, Essay on the Moral Statistics of France (1833), \url{http://datavis.ca/gallery/guerry/guerrydat.html}.
"guerry"

#########
### H ###
#########

#' Heavily Right-Skewed Data
#'
#' A dataset with simulated heavily right-skewed data for teaching.
#'
#' @format A data frame
"heavily_right_skewed"

# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' hills2000
#'
#' hills2000
#'
#' @format A data frame
"hills2000"

#' Hospital Visits
#'
#' Reasons people were admitted to hospital between 1993 and 1998.
#' Purposely messy dataset.
#'
#' @format A data frame
"hospital_visits"

#########
### I ###
#########

#' IBM Stock Price
#'
#' IBM stock prices.
#'
#' @format A data frame
"IBM_stock_price"

#' Income
#'
#' Income info for different religious groups.
#' Invented data.
#'
#' @format A data frame
"income"

#' Inflation in Four European Countries
#'
#' Inflation (CPI) for UK, France, Germany and Spain 1960-2010.
#'
#' @format A data frame
"inflation4"

#' Pension Investment Allocations (alluvial format)
#'
#' Lifestyled pension investment allocations at ages 40, 50 and 60, in alluvial format.
#'
#' @format A data frame
"invest_alluvial"

#' Pension Investment Allocations (lodes format)
#'
#' Lifestyled pension investment allocations at ages 40, 50 and 60, in lodes format.
#'
#' @format A data frame
"invest_lodes"

#' Pension Investment Allocations (lodes format with outgoings)
#'
#' Lifestyled pension investment allocations at ages 40, 50 and 60, in
#' lodes format with outgoings.
#'
#' @format A data frame
"invest_lodes2"

#' IQ Scores
#'
#' The IQ scores for 5 people, measured across 3 different tests.
#' Invented data.
#'
#' @format A data frame
"iq_scores"

#########
### J ###
#########

#' JNJ Stock Price
#'
#' JNJ stock prices.
#'
#' @format A data frame
"JNJ_stock_price"

#########
### K ###
#########

#' Kickstarter Projects
#'
#' Data for the crowd funding site Kickstarter.
#'
#' @source \url{https://www.kaggle.com/kemical/kickstarter-projects}
#' @format A data frame
"kickstarter"


#########
### L ###
#########

#' Late Deliveries
#'
#' Number of late deliveries over 12 calendar months for a fictional company.
#'
#' @format A data frame
"late_deliveries"

#' Left-Skewed Data
#'
#' A dataset with simulated left-skewed data for teaching.
#'
#' @format A data frame
"left_skewed"

# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' lotka_volterra
#'
#' lotka_volterra
#'
#' @format A data frame
"lotka_volterra"


#########
### M ###
#########

#' Messy
#'
#' A small example messy dataset.
#' Invented data.
#'
#' @format A data frame
"messy"

#' Customer Orders
#'
#' Customer orders for three different people, with info and amount.
#' Invented data.
#'
#' @format A data frame
"messy_orders"

#' Milk
#'
#' Composition of mammals' milk.
#'
#' @format A data frame
"milk"

#' Monthly Sales
#'
#' An invented dataset with monthly sales across 7 branches.
#'
#' A summary of this dataset can be found in \code{\link{total_sales}}.
#'
#' @format A data frame
"monthly_sales"


#########
### N ###
#########

#' New Coders
#'
#' A sample of data from freeCodeCamp's 2018 survey, which asks new developers
#' about their background and aspirations.
#'
#' @source \url{https://github.com/freeCodeCamp/2018-new-coder-survey}
#' @format A data frame
"new_coders"

#' NYC Dogs
#'
#' A dataset with all the dogs registered in New York City.
#'
#' @source Adapted from
#'   \url{https://fusiontables.google.com/data?docid=1pKcxc8kzJbBVzLu_kgzoAMzqYhZyUhtScXjB0BQ#rows:id=1}
#' @format A data frame
"nyc_dogs"

#########
### O ###
#########

#' Overall Olympics Medals
#'
#' The overall count of Olympic medals in both the summer and winter games,
#' from 1896 to 2016.
#'
#' @source \url{https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results/downloads/120-years-of-olympic-history-athletes-and-results.zip/2}
#' @format A data frame
"olympics_overall_medals"

#########
### P ###
#########

# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' pets
#'
#' pets
#'
#' @format A data frame
"pets"

#' Pension Liabilities
#'
#' Composition of liabilities (£m) of the ABC pension scheme.
#'
#' @format A data frame
"pension_liabilities"

#' Pension Surplus
#'
#' Surplus in the ABC pension scheme.
#'
#' @format A data frame
"pension_surplus"

# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' physical_activity
#'
#' physical_activity
#'
#' @format A data frame
"physical_activity"

# TODO(docs): only a title is given; add a description of the variables and
# the source of the data.
#' Playfair's Denmark Data
#'
#' @format A data frame
"playfair_denmark"

# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' polydata
#'
#' polydata
#'
#' @format A data frame
"polydata"

#' Population
#'
#' A dataset of locations and genders.
#'
#' @format A data frame
"population"


#########
### Q ###
#########

#' QikBit Competitors
#'
#' 2015-2019 revenue for QikBit's competitors.
#'
#' @format A data frame
"qb_competitors"

#' QikBit Accelerometer Data
#'
#' Comparative accelerometer data for 4 devices and 5 individuals over a single day.
#'
#' @format A data frame
"qb_device_data"

#' QikBit Monthly Figures
#'
#' Aug 2018 - Jul 2019 revenue and costs for QikBit, plus sales for a single product line.
#'
#' @format A data frame
"qb_monthly_sales"

#' QikBit Revenue Breakdown
#'
#' 2019 sales by product group for the fictional company QikBit.
#'
#' @format A data frame
"qb_revenue_breakdown"


#########
### R ###
#########

#' Recovery Times
#'
#' Average recovery times under Treatments A and B, and Control.
#'
#' @format A data frame
"recovery_times"

#' Refunds
#'
#' Refunds for Brazil's House of Deputies.
#'
#' @source \url{https://www.kaggle.com/epattaro/brazils-house-of-deputies-reimbursements}
#' @format A data frame
"refunds"

#' Right-Skewed Data
#'
#' A dataset with simulated right-skewed data for teaching.
#'
#' @format A data frame
"right_skewed"


#########
### S ###
#########

#' Salaries
#'
#' Invented data about job location, job type and salary.
#'
#' @format A data frame
"salary"

#' Savings
#'
#' Invented data about how much money people have saved.
#'
#' @format A data frame
"savings"

#' School Census
#'
#' A New Zealand school census.
#'
#' @source \url{https://new.censusatschool.org.nz/about/}
#' @format A data frame
"school_census"

# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' scottish_exports
#'
#' scottish_exports
#'
#' @format A data frame
"scottish_exports"

#' Star Wars
#'
#' An extract of the data available from the Star Wars API:
#' information on 10 characters from Star Wars.
#'
#' @source \url{https://swapi.co/}
#' @format A list with 10 elements
"starwars"

# TODO(docs): say which states and which period the incomes refer to.
#' State Income Data
#'
#' A dataset with information about incomes in states.
#'
#' @format A data frame
"state_income_data"

#' Stonybridge
#'
#' Words from Stonybridge promotion.
#'
#' @format A data frame
"stonybridge"

# NOTE(review): the previous @source pointed at the freeCodeCamp survey (same
# URL as new_coders), contradicting the description; it now points at the
# census-at-school random sampler — confirm this is the sampler actually used.
#' Students
#'
#' A small dataset with some survey information from randomly chosen students.
#' Comes from the international census at school random sampler.
#'
#' @source \url{https://new.censusatschool.org.nz/tools/random-sampler/}
#' @format A data frame with 14 observations of 10 variables
"students"

# NOTE(review): the description says UK students but the source URL is the
# New Zealand census-at-school site — confirm which is right.
#' Students Big
#'
#' A larger dataset with information from the student census, this time only for UK students.
#'
#' @source \url{https://new.censusatschool.org.nz/tools/random-sampler/}
#' @format A data frame
"students_big"


#########
### T ###
#########

#' Table of Numbers
#'
#' A randomly generated table of 100 numbers, drawn from a Poisson distribution
#' with mean 3.
#'
#' @format A data frame
"table_of_numbers"

#' Temperature
#'
#' A matrix which contains the maximum temperature for each month in Scotland
#' over 106 years (1910 to 2015). Each row corresponds to a year and each
#' column to a month (January to December).
#'
#' @format A matrix
"temp"

#' Temperature Data Frame
#'
#' The same data available in \code{\link{temp}}, but in a long data frame.
#'
#' @format A data frame
"temp_df"

#' Total Sales
#'
#' An invented dataset with total sales across 7 branches.
#'
#' A monthly breakdown of this dataset can be found in \code{\link{monthly_sales}}.
#'
#' @format A data frame
"total_sales"

# TODO(docs): only a title is given; add a description of the variables and
# the source of the data.
#' Tyrell Corp Jobs
#'
#' @format A data frame
"tyrell_corp_jobs"

#########
### U ###
#########

# TODO(docs): title and description only repeat the object name; describe the
# data, its variables and its source.
#' UK_poly
#'
#' UK_poly
#'
#' @format A data frame
"UK_poly"

#' Unimodal Data
#'
#' A dataset with simulated unimodal data for teaching.
#'
#' @format A data frame
"unimodal"

#########
### V ###
#########

#' EU Referendum Vote and 2019 Voting Intention
#'
#' Two-way table of vote in the 2016 EU referendum and 2019 voting intention,
#' converted to ggplot2 layout.
#'
#' @format A data frame
"vote19_eu"

#########
### W ###
#########

#' Whisky
#'
#' Scottish whisky distilleries.
#'
#' @format A data frame
"whisky"

#' Women in Parliament
#'
#' Women in Parliament.
#'
#' @source \url{https://data.worldbank.org/indicator/SG.GEN.PARL.ZS}
#' @format A data frame
"women_in_gov"

#' World
#'
#' World map combined with data from Global Attitudes Survey 2017.
#'
#' @format A data frame
"world"


#########
### X ###
#########


#########
### Y ###
#########


#########
### Z ###
#########
# codeclan/CCData documentation built on March 13, 2023, 8:48 p.m.