configuration for

STI risk factor analysis

of NATSAL data

data_config.R

N Green

April 2014

library(plyr, quietly = TRUE)
library(scales, quietly = TRUE)
library(gdata, quietly = TRUE)
library(foreign, quietly = TRUE)
library(MASS, quietly = TRUE)
library(stargazer, quietly = TRUE)
library(utils, quietly = TRUE)

## print set-up info
system("hostname")
date()
sessionInfo()
## version 3
### this extract includes risk factors smoking, drinking, etc
# NATSAL.dat <- read.dta("C:\\Users\\nathan.green\\Documents\\Chlamydia\\data\\NATSAL\\Natsal-3_extract_2April2014_v12.dta")
NATSAL.dat <- read.csv("C:/Users/nathan.green/Documents/Chlamydia/data/NATSAL/Natsal-3_extract_2April2014.csv")

## England only id codes
sin2_England <- read.dta("C:/Users/nathan.green/Documents/chlamydia/data/NATSAL/SIN2_subsets/SIN2 of participants resident in England_stataV12.dta")

## London id codes
sin2_London <- read.dta("C:/Users/nathan.green/Documents/chlamydia/data/NATSAL/SIN2_subsets/SIN2 & london 14July2014_stataV12.dta")

## household sizes
sin2_household <- readstata13::read.dta13("inst/extdata/sin2-householdsize_22June2016.dta")

## ethnic group mapping to fewer, amalgamated groups
Natsalethgrp.mapping <- read.csv("C:/Users/nathan.green/Documents/chlamydia/classifier/data/Natsal_fewerethgrps_mapping.csv")
devtools::load_all(".")
setwd(system.file(package = "STIecoPredict"))
load(file = "../data/eul_natsal_2010_for_archive.RData")

STI risk factors

ONS "increasing drinker" is "regularly" >=4 units per day

(http://www.nhs.uk/Livewell/alcohol/Pages/Effectsofalcohol.aspx)

either use

drinknum - how often have >6/8 units on one occasion

drinknum: (5) daily or almost daily; weekly

decided that drinknum is too conservative

we need a lower threshold on the number of units

so create new variable

combine- manyalc2 {3 or 4, 5 or 6, >6}

drinkoft: Once/twice weekly,..., 5+ days a week

## iterate for different bootstrap samples
# out <- c(tot.pos.pred=NULL, true.pos=NULL)
# for(i in 1:2){


# select variables --------------------------------------------------------


## (1) LA-level estimates to simulate with ('sub' model)
# riskfac <- c("drinknum", "smokenow",
riskfac <- c("manyalc2", "drinkoft", "smokenow",
             "income", "ethnic",
             "dage", "rsex")

# riskfac <- c("london",
#              "manyalc2", "drinkoft", "smokenow",
#              "income", "ethnic",
#              "dage", "rsex")

## (2) without drinking because
## we don't have good reference data for this
# riskfac <- c("smokenow",
#              "income", "ethnic",
#              "dage", "rsex")

## (3) 'full' model
# riskfac <- c("drinknum", "smokenow",
# riskfac <- c("manyalc2", "drinkoft", "smokenow",
#              "nonocon",
#              "income", "ethnic",
#              "sex4wks", "het1yr",
#              "dage", "rsex")

## (4) risk factors that we _know_ at the LA-level
# riskfac <- c("dage", "rsex")

## (5) extended full model
# riskfac <- c("drinknum", "manyalc2", "smokenow", "exsmoke",
#              "nonocon", "conpill", "condom", "uscondom", "usnotused", "cond4wk", "yrcond",
#              "income", "ethnic",
#              "sex4wks", "het1yr", "afsex",
#              "dage", "rsex")

KEY

---

output variables:

"chlmtest" - tested for chlamydia in the last year (if not diagnosed with chlamydia in last year, age 16-44)

"chlofref" - offered and refused Chlamydia test (if not diagnosed with chlamydia in last year, age 16-44)

"chltstwh" - where last tested for chlamydia (if not diagnosed with chlamydia in last year, age 16-44)

"whnchlam"- time since last diagnosed with chlamydia (1:less than 1 year)

"chtstwh1" - where last tested for Chlamydia (if last diagnosis in last year)

"chtstwy1" - why last tested for Chlamydia

"cttestly" - tested for chlamydia in last year

other variables:

drinking & smoking variables

"drink", "drinkoft", "manyalc2", "exsmoke"

condom use

"condom", "nocondom", "yrcond", "cond4wk"

frequency of sex

"sex4wks1", "sex4wks2"

preprocess data

## NB use the wider range for both analyses
## since otherwise knn can't fit
ageRange <- 16:34  #years
# ageRange <- 16:24

data <- NATSAL.dat


## add London id
data <- cbind(london=sin2_London$london, data)

## lookup using stata2
gors <- c("North East","North West","Yorkshire & Humberside","East Midlands","West Midlands",
          "South West","Eastern","Inner London","Outer London","South East","Wales","Scotland")
gors <- data.frame(strata2=1:72, gors)


# remove records ----------------------------------------------------------

## use Natsal-2 location codes
### England data only
data <- data[data$sin2%in%sin2_England$sin2,]
### inside/outside London
# data <- data[data$sin2%in%sin2_London$sin2[sin2_London$london=="Elsewhere"],]
# data <- data[data$sin2%in%sin2_London$sin2[sin2_London$london=="Greater London"],]


## restricted age range only in Natsal
## (or could fit to whole data set instead and subset later)
data <- subset(data, dage>=min(ageRange) & dage<=max(ageRange))

## not had sex yet
# data.train$sex4wks[data.train$sex4wks==997] <- 0

## remove unwanted non-numeric entries
# data <- subset(data, nonocon<100 & sex4wks<100 & het1yr<100)# & afsex<90)



## reverse order of ages, from largest to smallest
## this is to match shape of logistic curve
## REMEMBER: do this for simulated data too!
data$dage <- max(ageRange)-data$dage

bootstrap ---------------------------------------------------------------

by Natsal sampling weights to adjust for selection bias

set.seed(1968)

## we could incorporate the weights directly into the classifier instead by adjusting \beta_1 or weighting the likelihood
## should we use a fixed resampled data set/random seed?
sampleRows <- sample(1:nrow(data), prob=data$total_wt, replace=TRUE , size=(m=1000000))
data <- data[sampleRows,]
rownames(data) <- NULL


depvars <- c("cttestly","whnchlam","chlmtest")

## extract columns of interest
data <- data[,c(depvars,"total_wt","strata2",riskfac)]

## class of each variable
(data.class <- sapply(data, class))

## convert all factors to upper case
data <- toupper.fac(data, names(data)[data.class=="factor"])




## reassign "Not Applicable' cttestly individuals
##
## can't do following because there are 'not happened yet' (had sex) who have been tested
# set 'cttestly: Not Applicable' to 'No' for '96: not happened yet'
# data$cttestly[NATSAL.dat$afsexall==96] <- "No"

data$cttestly[data$cttestly=="NOT APPLICABLE"] <- "NO"
# data$drinknum[data$drinknum=="NOT APPLICABLE"] <- "NEVER"

## ever drink alcohol nowaday=no, never
data$drinkoft[data$drinkoft=="NOT APPLICABLE"] <- "NOT AT ALL IN THE LAST 12 MONTHS"





## drinking new variable
## ---------------------
# data <- transform(data, increasingdrinker=data$drinknum%in%c("WEEKLY", "DAILY OR ALMOST DAILY"))
# riskfac <- gsub("drinknum", "increasingdrinker", riskfac)

## issue with whether over 3 units for women should be approximated with 3-4 units level or without?

data <- transform(data, increasingdrinker=
                    (manyalc2%in%c("3 OR 4","5 OR 6", ">6") &
                       rsex=="FEMALE" &
                       drinkoft%in%c("ONCE/TWICE A WEEK", "3/4 DAYS A WEEK","5+ DAYS A WEEK")) |
                    (manyalc2%in%c("3 OR 4","5 or 6", ">6") &
                       rsex=="MALE" &
                       drinkoft%in%c("ONCE/TWICE A WEEK","3/4 DAYS A WEEK","5+ DAYS A WEEK"))
)


## consistent labels with simulated data
data$rsex <- factor(data$rsex, levels=c("FEMALE","MALE"), labels=c("Women","Men"))

reorder (reference) levels ----------------------------------------------

# data$drinknum <- relevel(data$drinknum, ref="NEVER")
# data$cond4wk <- relevel(data$cond4wk, ref="YES, USED ON EVERY OCCASION")
# data$smokenow <- relevel(data$smokenow, ref="NO")
# data$manyalc2 <- reorder(data$manyalc2, new.order=c("DRINK LESS FREQUENTLY THAN ONCE/TWICE A MONTH", "I ONLY DRINK ON SPECIAL OCCASIONS", "1 OR 2", "3 OR 4", "5 OR 6", ">6" , "NOT ANSWERED", "OTHER"))
# data$drinknum <- reorder(data$drinknum, new.order=c("NEVER", "LESS THAN MONTHLY", "MONTHLY", "WEEKLY", "DAILY OR ALMOST DAILY", "NOT ANSWERED", "NOT APPLICABLE"))

data$whnchlam <- reorder(data$whnchlam,
                         new.order=c("LESS THAN 1 YEAR AGO", "BETWEEN 1 AND 5 YEARS AGO", "BETWEEN 5 AND 10 YEARS AGO", "MORE THAN 10 YEARS AGO", "NOT APPLICABLE"))
data$chlmtest <- reorder(data$chlmtest, new.order=c("NO", "YES", "NOT APPLICABLE"))
data$income <- reorder(data$income,
                       new.order=c("<2,500", "NOT ANSWERED", "2,500-4,999", "5,000-9,999", "10,000-19,999", "20,000-29,999", "30,000-39,999", "40,000-49,999", "50,000+"))
#                        new.order=c("NOT ANSWERED", "<2,500", "2,500-4,999", "5,000-9,999", "10,000-19,999", "20,000-29,999", "30,000-39,999", "40,000-49,999", "50,000+"))
data$ethnic <- relevel(data$ethnic, ref="WHITE BRITISH")
data$rsex <- relevel(data$rsex, ref="Men")

data <- transform(data, whnchlamYr = (whnchlam=="LESS THAN 1 YEAR AGO"))

data <- merge(data, Natsalethgrp.mapping, by="ethnic", all.x = TRUE, all.y=FALSE)

data <- merge(data, gors, by="strata2", all.x = TRUE, all.y=FALSE)


# summary(data)


## split data in to training and test data sets
data.test <- data[(m/2+1):m,]
data.train <- data[1:(m/2),]

# riskfac <- c("increasingdrinker", "smokenow", "income", "ethnic", "dage", "rsex")   #submodel
riskfac <- c("increasingdrinker", "smokenow", "ethnic", "dage", "rsex")
# riskfac <- c("london", "increasingdrinker", "smokenow", "nonocon", "income", "ethnic", "sex4wks", "het1yr", "dage", "rsex")   #full model

save

##TODO##
# where is Natsal.riskfac.table from ??

# save(data, data.test, data.train, riskfac, Natsal.riskfac.table,
#      file="./R_analyses/Chlamydia_classifier/sim_logistic_regn/workspaces/config/riskfactor_config_output_1634_fullmodel_elsewhere.RData")
# save(data, data.test, data.train, riskfac, Natsal.riskfac.table,
#      file="./R_analyses/Chlamydia_classifier/sim_logistic_regn/workspaces/config/riskfactor_config_output_TEMP.RData")

risk factor distns ------------------------------------------------------

## if _don't_do_ the bootstrap sampling above then
## get BOOTSTRAP DISTRIBUTIONS for risk factors

# smoke <- drink <- whitebrit <- male <- NULL
# income5000 <- income10000 <- income20000 <- income30000 <- income40000 <- income50000 <- NULL
# for (i in 1:2000){
#   sampleRows <- sample(1:nrow(data), prob=data$total_wt, replace=TRUE)
#   data.montecarlo <- data[sampleRows,]
#   rownames(data.montecarlo) <- NULL
#   smoke[i] <- sum(data.montecarlo$smokenow=="YES")/nrow(data.montecarlo)
#   drink[i] <- sum(data.montecarlo$increasingdrinker)/nrow(data.montecarlo)
#   male[i] <- sum(data.montecarlo$rsex=="Men")/nrow(data.montecarlo)
#   whitebrit[i] <- sum(data.montecarlo$ethnic=="WHITE BRITISH")/nrow(data.montecarlo)
#   income5000[i] <- sum(data.montecarlo$income=="5,000-9,999")/nrow(data.montecarlo)
#   income10000[i] <- sum(data.montecarlo$income=="10,000-19,999")/nrow(data.montecarlo)
#   income20000[i] <- sum(data.montecarlo$income=="20,000-29,999")/nrow(data.montecarlo)
#   income30000[i] <- sum(data.montecarlo$income=="30,000-39,999")/nrow(data.montecarlo)
#   income40000[i] <- sum(data.montecarlo$income=="40,000-49,999")/nrow(data.montecarlo)
#   income50000[i] <- sum(data.montecarlo$income=="50,000+")/nrow(data.montecarlo)
# }
# smoke_bar <- data$total_wt%*%as.numeric(data$smokenow=="YES")/sum(data$total_wt)
# drink_bar <- data$total_wt%*%as.numeric(data$increasingdrinker)/sum(data$total_wt)
# whitebrit_bar <- data$total_wt%*%as.numeric(data$ethnic=="WHITE BRITISH")/sum(data$total_wt)
# male_bar <- data$total_wt%*%as.numeric(data$rsex=="Men")/sum(data$total_wt)
# income5000_bar <- data$total_wt%*%as.numeric(data$income=="5,000-9,999")/sum(data$total_wt)
# income10000_bar <- data$total_wt%*%as.numeric(data$income=="10,000-19,999")/sum(data$total_wt)
# income20000_bar <- data$total_wt%*%as.numeric(data$income=="20,000-29,999")/sum(data$total_wt)
# income30000_bar <- data$total_wt%*%as.numeric(data$income=="30,000-39,999")/sum(data$total_wt)
# income40000_bar <- data$total_wt%*%as.numeric(data$income=="40,000-49,999")/sum(data$total_wt)
# income50000_bar <- data$total_wt%*%as.numeric(data$income=="50,000+")/sum(data$total_wt)
#
#
# save(smoke, drink, whitebrit, male, income5000, income10000, income20000, income30000, income40000, income50000,
#      smoke_bar, drink_bar, whitebrit_bar, male_bar, income5000_bar, income10000_bar, income20000_bar, income30000_bar, income40000_bar, income50000_bar,
#      file="./data/NATSAL/bstrap_distn_rf_1634.RData")
#
#
# hist(smoke, breaks=20)
# abline(v=smoke_bar, col="blue")
# hist(drink, breaks=20)
# abline(v=drink_bar, col="blue")
# hist(whitebrit, breaks=20)
# abline(v=whitebrit_bar, col="blue")
#
# hist(income5000, breaks=20)
# abline(v=income5000_bar, col="blue")
# hist(income10000, breaks=20)
# abline(v=income10000_bar, col="blue")
# hist(income20000, breaks=20)
# abline(v=income20000_bar, col="blue")
# hist(income30000, breaks=20)
# abline(v=income30000_bar, col="blue")
# hist(income40000, breaks=20)
# abline(v=income40000_bar, col="blue")
# hist(income50000, breaks=20)
# abline(v=income50000_bar, col="blue")

statistics:

why last tested for chlamydia?

# (tab <- table(NATSAL.dat$chtstwy1)+table(NATSAL.dat$chltstwy))
## percent (excluding not tested individuals)
# tab/sum(tab[!names(tab)%in%c("Not applicable", "Not answered")])*100


n8thangreen/STIecoPredict documentation built on June 7, 2020, 12:50 p.m.