Configuration file for input data for the simulate LA populations as weighted random samples for use in NATSAL classifier sim_config.R

library(plyr, quietly = TRUE)
library(stringr)
library(reshape2)
setwd(".\\data")

ethnicgroup_lookup <- read.csv("./ONSNatsal_ethgrp_mapping.csv", check.names=FALSE)

## ethnic group mapping to fewer, amalgamated groups
Natsalethgrp.mapping <- read.csv("C:/Users/nathan.green/Documents/chlamydia/classifier/data/Natsal_fewerethgrps_mapping.csv")

## different area groupings look-up table
area.lookup <- read.csv(".\\lookup_data_area.csv")

## LA population, by age & sex
## ---------------------------
## 2001-2012
popLAagesex.dat <- read.csv(".\\ONS_popn_age&sex\\pop_age_sex_LA.csv", check.names=FALSE)

## smoking
## -------
### LA only prevalence
###
smokingLA.dat <- read.csv(".\\risk_factors\\smoking\\smoking_LA.csv")
### age-group only prevalence
### 2010
smokingage.dat <- read.csv(".\\risk_factors\\smoking\\smoking_agegrp.csv", check.names=FALSE)
smokingLA.dat$Name <- LAnameClean(smokingLA.dat$Name)

## drinking
## --------
### LA only prevalence
### 2008-2009, >16 yr olds
drinkingLA.dat <- read.csv(".\\risk_factors\\drinking\\increasingandhigherriskdrinking_LA.csv")
drinkingLA.dat$Name <- LAnameClean(drinkingLA.dat$Name)
### age-group only prevalence
### 2011
# drinkingage.dat <- read.csv(".\\risk_factors\\drinking\\drinking_agegrp_freq.csv", check.names=FALSE)    #deprecated in favour of below
drinkingage.dat <- read.csv(".\\risk_factors\\drinking\\drinking_agegrp_units.csv", check.names=FALSE)

## income
## ------
## 2011-2012
#   incomeregion.dat <- read.csv(".\\risk_factors\\income\\income_by_regions.csv")  #deprecated
#   read.csv("./risk_factors/income/income_countymedian_2011.csv")
incomeage.dat <- read.csv(".\\risk_factors\\income\\income_by_age_sex.csv")
# ASHE (ONS) 2011
incomeLA.dat <- list()
incomeLA.dat[["Men"]]   <- read.csv("./risk_factors/income/incomeLA_male_2011.csv", colClasses=c("Median"="integer", "Mean"="integer"))
incomeLA.dat[["Women"]] <- read.csv("./risk_factors/income/incomeLA_female_2011.csv", colClasses=c("Median"="numeric", "Mean"="numeric"))

incomeLA.dat[["Men"]]$LA_Name   <- LAnameClean(str_trim(incomeLA.dat[["Men"]]$LA_Name))
incomeLA.dat[["Women"]]$LA_Name <- LAnameClean(str_trim(incomeLA.dat[["Women"]]$LA_Name))

## ethnicity
## ---------
## Census 2011
# ethnicityLA.dat <- read.csv("./risk_factors/ethnicity/la_ethgrp_pop.csv", check.names=FALSE)   #deprecated
# ethnicityLA.dat$Area <- LAnameClean(ethnicityLA.dat$Area)
ethnicityLA.dat <- list()

ethnicityLA.dat[["Men"]] <- read.csv("raw-data/census2011_LA_ethgrp_agegrp_male.csv", check.names=FALSE)
# ethnicityLA.dat[["Men"]]   <- read.csv("./risk_factors/ethnicity/census2011_LA_ethgrp_agegrp_male.csv", check.names=FALSE)

ethnicityLA.dat[["Women"]] <- read.csv("raw-data/census2011_LA_ethgrp_agegrp_female.csv", check.names=FALSE)
# ethnicityLA.dat[["Women"]] <- read.csv("./risk_factors/ethnicity/census2011_LA_ethgrp_agegrp_female.csv", check.names=FALSE)
##TODO##
## tidy this up
##lapply? function?

ethnicityLA.melt <- list()
ethnicityLA.melt[["Men"]]   <- melt(ethnicityLA.dat[["Men"]])
ethnicityLA.melt[["Women"]] <- melt(ethnicityLA.dat[["Women"]])

ethnicityLA.melt[["Men"]]  <- data.frame(ethnicityLA.melt[["Men"]][,names(ethnicityLA.melt[["Men"]] )!="variable"],
                                         colsplit(as.character(ethnicityLA.melt[["Men"]]$variable), ";", c("age","ethngrp")))
ethnicityLA.melt[["Men"]]$age <- gsub("Age: Age ", "", ethnicityLA.melt[["Men"]]$age)
ethnicityLA.melt[["Men"]]$ethngrp <- gsub("Ethnic Group: ", "", ethnicityLA.melt[["Men"]]$ethngrp)
ethnicityLA.melt[["Men"]]$ethngrp <- gsub("; measures: Value", "", ethnicityLA.melt[["Men"]]$ethngrp)
ethnicityLA.melt[["Men"]]$ethngrp <- str_trim(ethnicityLA.melt[["Men"]]$ethngrp)
ethnicityLA.melt[["Men"]]  <- merge(ethnicityLA.melt[["Men"]] , ethnicgroup_lookup, by.x="ethngrp", by.y="ONS", all=F)
ethnicityLA.melt[["Men"]]  <- merge(ethnicityLA.melt[["Men"]] , Natsalethgrp.mapping, by.x="NATSAL", by.y="ethnic", all.x=T, all.y=F)

ethnicityLA.melt[["Women"]]  <- data.frame(ethnicityLA.melt[["Women"]][,names(ethnicityLA.melt[["Women"]] )!="variable"],
                                           colsplit(as.character(ethnicityLA.melt[["Women"]]$variable), ";", c("age","ethngrp")))
ethnicityLA.melt[["Women"]]$age <- gsub("Age: Age ", "", ethnicityLA.melt[["Women"]]$age)
ethnicityLA.melt[["Women"]]$ethngrp <- gsub("Ethnic Group: ", "", ethnicityLA.melt[["Women"]]$ethngrp)
ethnicityLA.melt[["Women"]]$ethngrp <- gsub("; measures: Value", "", ethnicityLA.melt[["Women"]]$ethngrp)
ethnicityLA.melt[["Women"]]$ethngrp <- str_trim(ethnicityLA.melt[["Women"]]$ethngrp)
ethnicityLA.melt[["Women"]]  <- merge(ethnicityLA.melt[["Women"]] , ethnicgroup_lookup, by.x="ethngrp", by.y="ONS", all=F)
ethnicityLA.melt[["Women"]]  <- merge(ethnicityLA.melt[["Women"]] , Natsalethgrp.mapping, by.x="NATSAL", by.y="ethnic", all.x=T, all.y=F)

names(ethnicityLA.melt[["Men"]])[names(ethnicityLA.melt[["Men"]])=="age"] <- "agegrp"
names(ethnicityLA.melt[["Women"]])[names(ethnicityLA.melt[["Women"]])=="age"] <- "agegrp"

ethnicityLA.melt[["Men"]]$agegrp <- ordered(ethnicityLA.melt[["Men"]]$agegrp)
ethnicityLA.melt[["Men"]]$agegrp <- ordered(ethnicityLA.melt[["Men"]]$agegrp,
                                            levels=c("15", "16 to 17", "18 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "Age: All categories: Age"))
ethnicityLA.melt[["Women"]]$agegrp <- ordered(ethnicityLA.melt[["Women"]]$agegrp)
ethnicityLA.melt[["Women"]]$agegrp <- ordered(ethnicityLA.melt[["Women"]]$agegrp,
                                              levels=c("15", "16 to 17", "18 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "Age: All categories: Age")) 

# save(ethnicityLA.melt, file="./risk_factors/ethnicity/ethnicityLA_melt.RData")
# load(file="./data/risk_factors/ethnicity/ethnicityLA_melt.RData")
x <- census2011_LA.sex.age.studentnums[,c(2,3,grep("Student",names(census2011_LA.sex.age.studentnums)),
                                          grep("Economic Activity: All categories: Economic activity; measures: Value",names(census2011_LA.sex.age.studentnums)))]

names(x) <- gsub("; Economic Activity: Economically inactive: Student \\(including full-time students); measures: Value",replacement = " student",x = names(x))
names(x) <- gsub("persons; Age: Age ",replacement = "",x = names(x))
names(x) <- gsub("; Economic Activity: All categories: Economic activity; measures: Value",replacement = " pop",x = names(x))
names(x) <- gsub("Sex: ",replacement = "",x = names(x))
names(x) <- gsub("; Age: Age",replacement = "",x = names(x))

x$geography <- toupper(x$geography)
write.csv(x, file="temp.csv")
## do some Excel rearranging...

popCensus$'16 to 17' <- popCensus$'16 to 19'/2
popCensus$'18 to 19' <- popCensus$'16 to 19'/2
studentCensus$'16 to 17' <- studentCensus$'16 to 19'/2
studentCensus$'18 to 19' <- studentCensus$'16 to 19'/2

popCensus$'20 to 24' <- popCensus$'20 to 21' + popCensus$'22 to 24'
studentCensus$'20 to 24' <- studentCensus$'20 to 21' + studentCensus$'22 to 24'

library(reshape2)
studentCensus <- melt(studentCensus, variable.name = "agegrp")
popCensus <- melt(popCensus, variable.name = "agegrp")

studentCensus$LAname <- sub("`", "'", studentCensus$LAname)
popCensus$LAname <- sub("`", "'", popCensus$LAname)
setwd("..\\")

rm(list=lsf.str())    #remove functions only
save.image("locallevel_pop_input")
# save.image("./R_analyses/Chlamydia_classifier/sim_logistic_regn/workspaces/pop_sim_input.RData")


n8thangreen/STIecoPredict documentation built on June 7, 2020, 12:50 p.m.