library(plyr, quietly = TRUE) library(scales, quietly = TRUE) library(gdata, quietly = TRUE) library(foreign, quietly = TRUE) library(MASS, quietly = TRUE) library(stargazer, quietly = TRUE) library(utils, quietly = TRUE) ## print set-up info system("hostname") date() sessionInfo()
## version 3 ### this extract includes risk factors smoking, drinking, etc # NATSAL.dat <- read.dta("C:\\Users\\nathan.green\\Documents\\Chlamydia\\data\\NATSAL\\Natsal-3_extract_2April2014_v12.dta") NATSAL.dat <- read.csv("C:/Users/nathan.green/Documents/Chlamydia/data/NATSAL/Natsal-3_extract_2April2014.csv") ## England only id codes sin2_England <- read.dta("C:/Users/nathan.green/Documents/chlamydia/data/NATSAL/SIN2_subsets/SIN2 of participants resident in England_stataV12.dta") ## London id codes sin2_London <- read.dta("C:/Users/nathan.green/Documents/chlamydia/data/NATSAL/SIN2_subsets/SIN2 & london 14July2014_stataV12.dta") ## household sizes sin2_household <- readstata13::read.dta13("inst/extdata/sin2-householdsize_22June2016.dta") ## ethnic group mapping to fewer, amalgamated groups Natsalethgrp.mapping <- read.csv("C:/Users/nathan.green/Documents/chlamydia/classifier/data/Natsal_fewerethgrps_mapping.csv")
devtools::load_all(".") setwd(system.file(package = "STIecoPredict")) load(file = "../data/eul_natsal_2010_for_archive.RData")
## iterate for different bootstrap samples # out <- c(tot.pos.pred=NULL, true.pos=NULL) # for(i in 1:2){ # select variables -------------------------------------------------------- ## (1) LA-level estimates to simulate with ('sub' model) # riskfac <- c("drinknum", "smokenow", riskfac <- c("manyalc2", "drinkoft", "smokenow", "income", "ethnic", "dage", "rsex") # riskfac <- c("london", # "manyalc2", "drinkoft", "smokenow", # "income", "ethnic", # "dage", "rsex") ## (2) without drinking because ## we don't have good reference data for this # riskfac <- c("smokenow", # "income", "ethnic", # "dage", "rsex") ## (3) 'full' model # riskfac <- c("drinknum", "smokenow", # riskfac <- c("manyalc2", "drinkoft", "smokenow", # "nonocon", # "income", "ethnic", # "sex4wks", "het1yr", # "dage", "rsex") ## (4) risk factors that we _know_ at the LA-level # riskfac <- c("dage", "rsex") ## (5) extended full model # riskfac <- c("drinknum", "manyalc2", "smokenow", "exsmoke", # "nonocon", "conpill", "condom", "uscondom", "usnotused", "cond4wk", "yrcond", # "income", "ethnic", # "sex4wks", "het1yr", "afsex", # "dage", "rsex")
## NB use the wider range for both analyses ## since otherwise knn can't fit ageRange <- 16:34 #years # ageRange <- 16:24 data <- NATSAL.dat ## add London id data <- cbind(london=sin2_London$london, data) ## lookup using stata2 gors <- c("North East","North West","Yorkshire & Humberside","East Midlands","West Midlands", "South West","Eastern","Inner London","Outer London","South East","Wales","Scotland") gors <- data.frame(strata2=1:72, gors) # remove records ---------------------------------------------------------- ## use Natsal-2 location codes ### England data only data <- data[data$sin2%in%sin2_England$sin2,] ### inside/outside London # data <- data[data$sin2%in%sin2_London$sin2[sin2_London$london=="Elsewhere"],] # data <- data[data$sin2%in%sin2_London$sin2[sin2_London$london=="Greater London"],] ## restricted age range only in Natsal ## (or could fit to whole data set instead and subset later) data <- subset(data, dage>=min(ageRange) & dage<=max(ageRange)) ## not had sex yet # data.train$sex4wks[data.train$sex4wks==997] <- 0 ## remove unwanted non-numeric entries # data <- subset(data, nonocon<100 & sex4wks<100 & het1yr<100)# & afsex<90) ## reverse order of ages, from largest to smallest ## this is to match shape of logistic curve ## REMEMBER: do this for simulated data too! data$dage <- max(ageRange)-data$dage
set.seed(1968) ## we could incorporate the weights directly into the classifier instead by adjusting \beta_1 or weighting the likelihood ## should we use a fixed resampled data set/random seed? sampleRows <- sample(1:nrow(data), prob=data$total_wt, replace=TRUE , size=(m=1000000)) data <- data[sampleRows,] rownames(data) <- NULL depvars <- c("cttestly","whnchlam","chlmtest") ## extract columns of interest data <- data[,c(depvars,"total_wt","strata2",riskfac)] ## class of each variable (data.class <- sapply(data, class)) ## convert all factors to upper case data <- toupper.fac(data, names(data)[data.class=="factor"]) ## reassign "Not Applicable' cttestly individuals ## ## can't do following because there are 'not happened yet' (had sex) who have been tested # set 'cttestly: Not Applicable' to 'No' for '96: not happened yet' # data$cttestly[NATSAL.dat$afsexall==96] <- "No" data$cttestly[data$cttestly=="NOT APPLICABLE"] <- "NO" # data$drinknum[data$drinknum=="NOT APPLICABLE"] <- "NEVER" ## ever drink alcohol nowaday=no, never data$drinkoft[data$drinkoft=="NOT APPLICABLE"] <- "NOT AT ALL IN THE LAST 12 MONTHS" ## drinking new variable ## --------------------- # data <- transform(data, increasingdrinker=data$drinknum%in%c("WEEKLY", "DAILY OR ALMOST DAILY")) # riskfac <- gsub("drinknum", "increasingdrinker", riskfac) ## issue with whether over 3 units for women should be approximated with 3-4 units level or without? data <- transform(data, increasingdrinker= (manyalc2%in%c("3 OR 4","5 OR 6", ">6") & rsex=="FEMALE" & drinkoft%in%c("ONCE/TWICE A WEEK", "3/4 DAYS A WEEK","5+ DAYS A WEEK")) | (manyalc2%in%c("3 OR 4","5 or 6", ">6") & rsex=="MALE" & drinkoft%in%c("ONCE/TWICE A WEEK","3/4 DAYS A WEEK","5+ DAYS A WEEK")) ) ## consistent labels with simulated data data$rsex <- factor(data$rsex, levels=c("FEMALE","MALE"), labels=c("Women","Men"))
# data$drinknum <- relevel(data$drinknum, ref="NEVER") # data$cond4wk <- relevel(data$cond4wk, ref="YES, USED ON EVERY OCCASION") # data$smokenow <- relevel(data$smokenow, ref="NO") # data$manyalc2 <- reorder(data$manyalc2, new.order=c("DRINK LESS FREQUENTLY THAN ONCE/TWICE A MONTH", "I ONLY DRINK ON SPECIAL OCCASIONS", "1 OR 2", "3 OR 4", "5 OR 6", ">6" , "NOT ANSWERED", "OTHER")) # data$drinknum <- reorder(data$drinknum, new.order=c("NEVER", "LESS THAN MONTHLY", "MONTHLY", "WEEKLY", "DAILY OR ALMOST DAILY", "NOT ANSWERED", "NOT APPLICABLE")) data$whnchlam <- reorder(data$whnchlam, new.order=c("LESS THAN 1 YEAR AGO", "BETWEEN 1 AND 5 YEARS AGO", "BETWEEN 5 AND 10 YEARS AGO", "MORE THAN 10 YEARS AGO", "NOT APPLICABLE")) data$chlmtest <- reorder(data$chlmtest, new.order=c("NO", "YES", "NOT APPLICABLE")) data$income <- reorder(data$income, new.order=c("<2,500", "NOT ANSWERED", "2,500-4,999", "5,000-9,999", "10,000-19,999", "20,000-29,999", "30,000-39,999", "40,000-49,999", "50,000+")) # new.order=c("NOT ANSWERED", "<2,500", "2,500-4,999", "5,000-9,999", "10,000-19,999", "20,000-29,999", "30,000-39,999", "40,000-49,999", "50,000+")) data$ethnic <- relevel(data$ethnic, ref="WHITE BRITISH") data$rsex <- relevel(data$rsex, ref="Men") data <- transform(data, whnchlamYr = (whnchlam=="LESS THAN 1 YEAR AGO")) data <- merge(data, Natsalethgrp.mapping, by="ethnic", all.x = TRUE, all.y=FALSE) data <- merge(data, gors, by="strata2", all.x = TRUE, all.y=FALSE) # summary(data) ## split data in to training and test data sets data.test <- data[(m/2+1):m,] data.train <- data[1:(m/2),] # riskfac <- c("increasingdrinker", "smokenow", "income", "ethnic", "dage", "rsex") #submodel riskfac <- c("increasingdrinker", "smokenow", "ethnic", "dage", "rsex") # riskfac <- c("london", "increasingdrinker", "smokenow", "nonocon", "income", "ethnic", "sex4wks", "het1yr", "dage", "rsex") #full model
##TODO## # where is Natsal.riskfac.table from ?? # save(data, data.test, data.train, riskfac, Natsal.riskfac.table, # file="./R_analyses/Chlamydia_classifier/sim_logistic_regn/workspaces/config/riskfactor_config_output_1634_fullmodel_elsewhere.RData") # save(data, data.test, data.train, riskfac, Natsal.riskfac.table, # file="./R_analyses/Chlamydia_classifier/sim_logistic_regn/workspaces/config/riskfactor_config_output_TEMP.RData")
## if _don't_do_ the bootstrap sampling above then ## get BOOTSTRAP DISTRIBUTIONS for risk factors # smoke <- drink <- whitebrit <- male <- NULL # income5000 <- income10000 <- income20000 <- income30000 <- income40000 <- income50000 <- NULL # for (i in 1:2000){ # sampleRows <- sample(1:nrow(data), prob=data$total_wt, replace=TRUE) # data.montecarlo <- data[sampleRows,] # rownames(data.montecarlo) <- NULL # smoke[i] <- sum(data.montecarlo$smokenow=="YES")/nrow(data.montecarlo) # drink[i] <- sum(data.montecarlo$increasingdrinker)/nrow(data.montecarlo) # male[i] <- sum(data.montecarlo$rsex=="Men")/nrow(data.montecarlo) # whitebrit[i] <- sum(data.montecarlo$ethnic=="WHITE BRITISH")/nrow(data.montecarlo) # income5000[i] <- sum(data.montecarlo$income=="5,000-9,999")/nrow(data.montecarlo) # income10000[i] <- sum(data.montecarlo$income=="10,000-19,999")/nrow(data.montecarlo) # income20000[i] <- sum(data.montecarlo$income=="20,000-29,999")/nrow(data.montecarlo) # income30000[i] <- sum(data.montecarlo$income=="30,000-39,999")/nrow(data.montecarlo) # income40000[i] <- sum(data.montecarlo$income=="40,000-49,999")/nrow(data.montecarlo) # income50000[i] <- sum(data.montecarlo$income=="50,000+")/nrow(data.montecarlo) # } # smoke_bar <- data$total_wt%*%as.numeric(data$smokenow=="YES")/sum(data$total_wt) # drink_bar <- data$total_wt%*%as.numeric(data$increasingdrinker)/sum(data$total_wt) # whitebrit_bar <- data$total_wt%*%as.numeric(data$ethnic=="WHITE BRITISH")/sum(data$total_wt) # male_bar <- data$total_wt%*%as.numeric(data$rsex=="Men")/sum(data$total_wt) # income5000_bar <- data$total_wt%*%as.numeric(data$income=="5,000-9,999")/sum(data$total_wt) # income10000_bar <- data$total_wt%*%as.numeric(data$income=="10,000-19,999")/sum(data$total_wt) # income20000_bar <- data$total_wt%*%as.numeric(data$income=="20,000-29,999")/sum(data$total_wt) # income30000_bar <- data$total_wt%*%as.numeric(data$income=="30,000-39,999")/sum(data$total_wt) # income40000_bar <- data$total_wt%*%as.numeric(data$income=="40,000-49,999")/sum(data$total_wt) # income50000_bar <- data$total_wt%*%as.numeric(data$income=="50,000+")/sum(data$total_wt) # # # save(smoke, drink, whitebrit, male, income5000, income10000, income20000, income30000, income40000, income50000, # smoke_bar, drink_bar, whitebrit_bar, male_bar, income5000_bar, income10000_bar, income20000_bar, income30000_bar, income40000_bar, income50000_bar, # file="./data/NATSAL/bstrap_distn_rf_1634.RData") # # # hist(smoke, breaks=20) # abline(v=smoke_bar, col="blue") # hist(drink, breaks=20) # abline(v=drink_bar, col="blue") # hist(whitebrit, breaks=20) # abline(v=whitebrit_bar, col="blue") # # hist(income5000, breaks=20) # abline(v=income5000_bar, col="blue") # hist(income10000, breaks=20) # abline(v=income10000_bar, col="blue") # hist(income20000, breaks=20) # abline(v=income20000_bar, col="blue") # hist(income30000, breaks=20) # abline(v=income30000_bar, col="blue") # hist(income40000, breaks=20) # abline(v=income40000_bar, col="blue") # hist(income50000, breaks=20) # abline(v=income50000_bar, col="blue")
# (tab <- table(NATSAL.dat$chtstwy1)+table(NATSAL.dat$chltstwy)) ## percent (excluding not tested individuals) # tab/sum(tab[!names(tab)%in%c("Not applicable", "Not answered")])*100
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.