In n8thangreen/STIecoPredict: Multilevel Regression With Post-Stratification Using Natsal Data

library(lme4)
library(car)
library(arm)
library(pander)
library(knitr)
library(stargazer)
library(xtable)
library(lattice)
library(plyr)
library(blme)
# library(reshape)
library(reshape2)
library(optimx)

library(STIecoPredict)

# load("C:/Users/ngreen1/Dropbox/small-area & chlamydia/R_code/scripts/mrp/data/cleaned-regn-input-mrpNatsal.RData")
load("./data/cleaned-regn-input-mrpNatsal.RData")

Natsal0 <- Natsal

savedata <- FALSE
showplot <- TRUE

## older ages sparsely sampled
Natsal <- subset(Natsal, age>15 & age<25)   #NCSP range

## same sex partner in last year
Natsal <- subset(Natsal, sam1yr==0)

Natsal$sex1yr <- (Natsal$het1yr!=0 | Natsal$sam1yr!=0)
Natsal$GUMvenue <- Natsal$chtstwh1=="Sexual health clinic (GUM clinic)" | Natsal$chltstwh=="Sexual health clinic (GUM clinic)"

# names.atleast1test <- rownames(table(Natsal$laname, Natsal$cttestly))[table(Natsal$laname, Natsal$cttestly)[,"1"]>0]
# names.LAbigenough <- names(table(Natsal$laname)[(table(Natsal$laname))>10])
# 
# subnames <- names.LAbigenough[names.LAbigenough%in%names.atleast1test]
# Natsal2  <- subset(Natsal, laname%in%subnames)

Natsal2 <- Natsal

form.cttestly <- as.formula(cttestly ~ 1 + sex + (1|sex.age) + (1|ethnic2) + I(hhsize==1) + student +
                                (1|Conception.decile) +  (1|`Numerical classification`) + I(`Average Score`>28) +
                                (1|laname) + (1|gor) + (1|metcounty_UA) + FreqGUM) # + I(gor==9)
form.GUMvenue <- update(form.cttestly, GUMvenue ~ .)

fit.cttestly <- glmer(form.cttestly,
                      family = binomial(link="logit"),
                      data = Natsal2)

fit.GUMvenue <- glmer(form.GUMvenue,
                      family = binomial(link="logit"),
                      data = Natsal2[as.logical(Natsal2$cttestly), ])

MAXIT <- 10000L

fit.cttestly <- bglmer(form.cttestly,
                       family = binomial(link="logit"),
                       maxit = MAXIT,
                       control = glmerControl(optimizer="Nelder_Mead"),
                       data = Natsal2)

fit.GUMvenue <- bglmer(form.GUMvenue,
                       family = binomial(link="logit"),
                       maxit = MAXIT,
                       control = glmerControl(optimizer="Nelder_Mead"),
                       data = Natsal2[as.logical(Natsal2$cttestly), ])

lattice::dotplot(ranef(fit.GUMvenue, condVar=TRUE))
lattice::dotplot(ranef(fit.cttestly, condVar=TRUE))

Missing LA Random Effects and Neighbour Smoothing

Because some of the LAs don't have any sample point we create a vector of LA random effects that includes these. We set the RE to 0. Could alternatively take e.g. average of neighbours.

## add some missing random effects
missingLANatsal.names <- unique(Natsal$laname)[!unique(Natsal$laname)%in%rownames(ranef(fit.GUMvenue)$laname)]
missingLAsim_prop.names <- unique(sim_prop_la$LAname)[!unique(sim_prop_la$LAname)%in%rownames(ranef(fit.GUMvenue)$laname)]
missingLA.names <- sort(union(missingLAsim_prop.names, missingLANatsal.names))

la.ranefs.GUMvenue <- array(0, dim=c(length(missingLA.names),1))
dimnames(la.ranefs.GUMvenue) <- list(missingLA.names, "(Intercept)")
la.ranefs.GUMvenue <- rbind(ranef(fit.GUMvenue)$laname, la.ranefs.GUMvenue)

## add some missing random effects
missingmetUANatsal.names <- unique(Natsal$metcounty_UA)[!unique(Natsal$metcounty_UA)%in%rownames(ranef(fit.GUMvenue)$metcounty_UA)]
missingmetUAsim_prop.names <- unique(sim_prop_la$metcounty_UA)[!unique(sim_prop_la$metcounty_UA)%in%rownames(ranef(fit.GUMvenue)$metcounty_UA)]
missingmetUA.names <- na.omit(union(missingmetUAsim_prop.names, missingmetUANatsal.names))

metUA.ranefs.GUMvenue <- array(0, dim=c(length(missingmetUA.names),1))
dimnames(metUA.ranefs.GUMvenue) <- list(missingmetUA.names, "(Intercept)")
metUA.ranefs.GUMvenue <- rbind(ranef(fit.GUMvenue)$metcounty_UA, metUA.ranefs.GUMvenue)

##TODO##
## add some missing random effects
missinggorNatsal.names <- unique(Natsal$gor)[!unique(Natsal$gor)%in%rownames(ranef(fit.GUMvenue)$gor)]
missinggorsim_prop.names <- unique(sim_prop_la$gor)[!unique(sim_prop_la$gor)%in%rownames(ranef(fit.GUMvenue)$gor)]
missinggor.names <- union(missinggorsim_prop.names, missinggorNatsal.names)

gor.ranefs.GUMvenue <- array(0, dim=c(length(missinggor.names),1))
dimnames(gor.ranefs.GUMvenue) <- list(missinggor.names, "(Intercept)")
gor.ranefs.GUMvenue <- rbind(ranef(fit.GUMvenue)$gor, gor.ranefs.GUMvenue)

## add some missing random effects
missingLANatsal.names <- unique(Natsal$laname)[!unique(Natsal$laname)%in%rownames(ranef(fit.cttestly)$laname)]
missingLAsim_prop.names <- unique(sim_prop_la$LAname)[!unique(sim_prop_la$LAname)%in%rownames(ranef(fit.cttestly)$laname)]
missingLA.names <- union(missingLAsim_prop.names, missingLANatsal.names)

la.ranefs.cttestly <- array(0, dim=c(length(missingLA.names),1))
dimnames(la.ranefs.cttestly) <- list(missingLA.names, "(Intercept)")
la.ranefs.cttestly <- rbind(ranef(fit.cttestly)$laname, la.ranefs.cttestly)

## add some missing random effects
missingmetUANatsal.names <- unique(Natsal$metcounty_UA)[!unique(Natsal$metcounty_UA)%in%rownames(ranef(fit.cttestly)$metcounty_UA)]
missingmetUAsim_prop.names <- unique(sim_prop_la$metcounty_UA)[!unique(sim_prop_la$metcounty_UA)%in%rownames(ranef(fit.cttestly)$metcounty_UA)]
missingmetUA.names <- na.omit(union(missingmetUAsim_prop.names, missingmetUANatsal.names))

metUA.ranefs.cttestly <- array(0, dim=c(length(missingmetUA.names),1))
dimnames(metUA.ranefs.cttestly) <- list(missingmetUA.names, "(Intercept)")
metUA.ranefs.cttestly <- rbind(ranef(fit.cttestly)$metcounty_UA, metUA.ranefs.cttestly)

##TODO##
## add some missing random effects
missinggorNatsal.names <- unique(Natsal$gor)[!unique(Natsal$gor)%in%rownames(ranef(fit.cttestly)$gor)]
missinggorsim_prop.names <- unique(sim_prop_la$gor)[!unique(sim_prop_la$gor)%in%rownames(ranef(fit.cttestly)$gor)]
missinggor.names <- union(missinggorsim_prop.names, missinggorNatsal.names)

gor.ranefs.cttestly <- array(0, dim=c(length(missinggor.names),1))
dimnames(gor.ranefs.cttestly) <- list(missinggor.names, "(Intercept)")
gor.ranefs.cttestly <- rbind(ranef(fit.cttestly)$gor, gor.ranefs.cttestly)

## takes mean average of each LA and its neighbours

names(madjacency) <- LAnameClean(names(madjacency))
madjacency[,1] <- LAnameClean(madjacency[, 1])

## fitted LAs
LAnames <- rownames(ranef(fit.cttestly)$laname)

madjacencyRE <- madjacency

# madjacencyRE <- madjacencyRE[, c(TRUE,(names(madjacency)%in%LAnames)[-1])]  #not missing LAs
# madjacencyRE <- madjacencyRE[(names(madjacency)%in%LAnames)[-1],]

for (i in 1:nrow(madjacencyRE)) madjacencyRE[i, i+1] <- 1   #diagonal (not including first column)

# numNeighbours <- rowSums(madjacencyRE[, -1])  #true number of neigbours

## fillin fitted random effects
for (i in LAnames){
    madjacencyRE[, i] <- madjacencyRE[, i] * ranef(fit.cttestly)$laname[i, ]
}

madjacencyRE[madjacencyRE == 1] <- 0    #if don't have fitted RE then remove from average

## number of neighbours with values
numNeighbours <- apply(madjacencyRE[,-1], 1, function(x) sum(x!=0)) #numNeighbours <- rowSums(madjacencyRE[, -1])

rowAverages <- array(rowSums(madjacencyRE[,-1])/numNeighbours, dim = c(length(numNeighbours),1))
rowAverages[is.nan(rowAverages)] <- 0
dimnames(rowAverages) <- list(madjacencyRE[,1], "(Intercept)")

## who is still without an estimate?
missingLANatsal.names <- unique(Natsal0$laname)[!unique(Natsal0$laname)%in%rownames(rowAverages)]
missingLAsim_prop.names <- unique(sim_prop_la$LAname)[!unique(sim_prop_la$LAname)%in%rownames(rowAverages)]
missingLA.names <- union(missingLAsim_prop.names, missingLANatsal.names)

la.ranefs.cttestly <- array(0, dim=c(length(missingLA.names),1))
dimnames(la.ranefs.cttestly) <- list(missingLA.names, "(Intercept)")
la.ranefs.cttestly <- rbind(rowAverages, la.ranefs.cttestly)

## takes mean average of each LA and its neighbours

names(madjacency) <- LAnameClean(names(madjacency))
madjacency[,1] <- LAnameClean(madjacency[, 1])

## fitted LAs
LAnames <- rownames(ranef(fit.GUMvenue)$laname)

madjacencyRE <- madjacency

# madjacencyRE <- madjacencyRE[, c(TRUE,(names(madjacency)%in%LAnames)[-1])]  #not missing LAs
# madjacencyRE <- madjacencyRE[(names(madjacency)%in%LAnames)[-1],]

for (i in 1:nrow(madjacencyRE)) madjacencyRE[i, i+1] <- 1   #diagonal (not including first column)

# numNeighbours <- rowSums(madjacencyRE[, -1])  #true number of neigbours

## fillin fitted random effects
for (i in LAnames){
    madjacencyRE[, i] <- madjacencyRE[, i] * ranef(fit.GUMvenue)$laname[i, ]
}

madjacencyRE[madjacencyRE == 1] <- 0    #if don't have fitted RE then remove from average

## number of neighbours with values
numNeighbours <- apply(madjacencyRE[,-1], 1, function(x) sum(x!=0)) #numNeighbours <- rowSums(madjacencyRE[, -1])

rowAverages <- array(rowSums(madjacencyRE[,-1])/numNeighbours, dim = c(length(numNeighbours),1))
rowAverages[is.nan(rowAverages)] <- 0
dimnames(rowAverages) <- list(madjacencyRE[,1], "(Intercept)")

## who is still without an estimate?
missingLANatsal.names <- unique(Natsal0$laname)[!unique(Natsal0$laname)%in%rownames(rowAverages)]
missingLAsim_prop.names <- unique(sim_prop_la$LAname)[!unique(sim_prop_la$LAname)%in%rownames(rowAverages)]
missingLA.names <- union(missingLAsim_prop.names, missingLANatsal.names)

la.ranefs.GUMvenue <- array(0, dim=c(length(missingLA.names),1))
dimnames(la.ranefs.GUMvenue) <- list(missingLA.names, "(Intercept)")
la.ranefs.GUMvenue <- rbind(rowAverages, la.ranefs.GUMvenue)

Append predictions to joint distribution dataset

fixeff.formula <- as.formula("cttestly ~ student + sex + age + ethnic2 + livealone")

## census data
# test.calcTotalProbs(sim_prop_la)
sim_prop_la.fit <- calcTotalProbs(formula=fixeff.formula, data=sim_prop_la,
                                  extracols = c("LAname","gor","Average Score","metcounty_UA","Numerical classification","Conception.deciles","FreqGUM"))

pred.GUMvenue <- arm::invlogit(
                    fixef(fit.GUMvenue)["(Intercept)"] +

                    fixef(fit.GUMvenue)["sexWomen"]*(sim_prop_la.fit$sex=="Women") +

                    fixef(fit.GUMvenue)["studentTRUE"]*(sim_prop_la.fit$student) +

                    fixef(fit.GUMvenue)["I(hhsize == 1)TRUE"]*(sim_prop_la.fit$livealone) +

                    fixef(fit.GUMvenue)["FreqGUM"]*(sim_prop_la.fit$FreqGUM) +

                    ranef(fit.GUMvenue)$ethnic2[as.character(sim_prop_la.fit$ethnic2),1] +

                    # ranef(fit.GUMvenue)$age[as.character(sim_prop_la.fit$age),1] +

                    ranef(fit.GUMvenue)$'sex.age'[paste(sim_prop_la.fit$sex, sim_prop_la.fit$age, sep="."), 1] +

                    # ranef(fit.GUMvenue)$laname[as.character(sim_prop_la.fit$LAname),1] +
                    la.ranefs.GUMvenue[as.character(sim_prop_la.fit$LAname),1] +

                    # ranef(fit.GUMvenue)$metcounty_UA[as.character(sim_prop_la.fit$metcounty_UA),1] #+
                    metUA.ranefs.GUMvenue[as.character(sim_prop_la.fit$metcounty_UA),1] +

                    ranef(fit.GUMvenue)$gor[as.character(sim_prop_la.fit$gor),1] +
                    # fixef(fit.GUMvenue)["I(gor == 9)TRUE"]*(sim_prop_la.fit$gor==9)    #london

                    fixef(fit.GUMvenue)["I(`Average Score` > 28)TRUE"]*(sim_prop_la.fit$`Average Score`>28) +

                    ranef(fit.GUMvenue)$`Numerical classification`[as.character(sim_prop_la.fit$`Numerical classification`),1] +

                    ranef(fit.GUMvenue)$Conception.decile[as.character(sim_prop_la.fit$Conception.deciles),1])

pred.cttestly <- arm::invlogit(
                    fixef(fit.cttestly)["(Intercept)"] +

                    fixef(fit.cttestly)["sexWomen"]*(sim_prop_la.fit$sex=="Women") +

                    fixef(fit.cttestly)["studentTRUE"]*(sim_prop_la.fit$student) +

                    fixef(fit.cttestly)["I(hhsize == 1)TRUE"]*(sim_prop_la.fit$livealone) +

                    fixef(fit.cttestly)["FreqGUM"]*(sim_prop_la.fit$FreqGUM) +

                    ranef(fit.cttestly)$ethnic2[as.character(sim_prop_la.fit$ethnic2),1] +

                    #ranef(fit.cttestly)$age[as.character(sim_prop_la.fit$age),1] +

                    ranef(fit.cttestly)$'sex.age'[paste(sim_prop_la.fit$sex, sim_prop_la.fit$age, sep="."), 1] +

                    # ranef(fit.cttestly)$laname[as.character(sim_prop_la.fit$LAname),1] +
                    la.ranefs.cttestly[as.character(sim_prop_la.fit$LAname),1] +

                    # ranef(fit.cttestly)$metcounty_UA[as.character(sim_prop_la.fit$metcounty_UA),1] #+
                    metUA.ranefs.cttestly[as.character(sim_prop_la.fit$metcounty_UA),1] +

                    ranef(fit.cttestly)$gor[as.character(sim_prop_la.fit$gor),1] +
                    # fixef(fit.cttestly)["I(gor == 9)TRUE"]*(sim_prop_la.fit$gor==9)    #london

                    fixef(fit.cttestly)["I(`Average Score` > 28)TRUE"]*(sim_prop_la.fit$`Average Score`>28) +

                    ranef(fit.cttestly)$`Numerical classification`[as.character(sim_prop_la.fit$`Numerical classification`),1] +

                    ranef(fit.cttestly)$Conception.decile[as.character(sim_prop_la.fit$Conception.deciles),1])

## separate GUMvenue and test for chlamydia separate-out the two groups

## p(test AND GUMvenue AND category)
combined_prob <- pred.cttestly * pred.GUMvenue * sim_prop_la.fit$totalprob
## weighted average by LA
LApred <- numerator <- tapply(combined_prob, sim_prop_la.fit$LAname, sum)

## given tested in last year, what are the proportions?
## cf Gelman, Parks

## p(GUMvenue AND category)
denominator <- pred.GUMvenue * sim_prop_la.fit$totalprob
## weighted average by LA
denominator <- tapply(denominator, sim_prop_la.fit$LAname, sum)

LApred <- numerator/denominator

The LA specific post-stratified estimates are then

(LApred <- data.frame(LApred=LApred[order(LApred)]))

# popCensus <- read.csv("..\\..\\packages\\STIecoPredict\\raw-data\\popCensus.csv")
LApred$LAname <- rownames(LApred)
popCensus$LAname <- STIecoPredict:::LAnameClean(popCensus$LAname)
rownames(LApred) <- NULL
LApred <- merge(LApred, popCensus[popCensus$Sex=="All", c("LAname","prob.25.and.over")], all.x=TRUE)
LApred$LApred.adj <- LApred$LApred*(1-LApred$prob.25.and.over)

LApred[order(LApred$LApred.adj),]

save(pred, LApred, file="data/predictions.RData")

## raw
NatsalLA <- melt(tapply(Natsal2$cttestly, Natsal2$laname, mean), varnames = c("LA.Name")) #direct estimate
NatsalLAsize <- melt(tapply(Natsal2$cttestly, Natsal2$laname, length), varnames = c("LA.Name"))
#as.NatsalLAsize <- data.frame(table(Natsal2$laname))

## with sample weights
NatsalLA.weights <- ddply(.data=Natsal2, .variables=.(laname), function(y) data.frame(wtmean=weighted.mean(x=y$cttestly, w=y$total_wt)))

## use prediction instead of data
# Natsal2$cttestly.predict <- predict(fit, type="response")
# NatsalLA.predict <- melt(tapply(Natsal2$cttestly.predict, Natsal2$laname, mean), varnames = c("LA.Name")) #direct estimate
# NatsalLA.weights.predict <- ddply(.data=Natsal2, .variables=.(laname), function(y) data.frame(wtmean=weighted.mean(x=y$cttestly.predict, w=y$total_wt)))

NatsalLA.GUM.weights <- ddply(.data=Natsal2, .variables=.(laname), function(y) data.frame(GUMwtmean=weighted.mean(x=(y$cttestly & y$GUMvenue), w=y$total_wt)))

Surveillance data

Combine predictions with the surveillance data and produce summary statistics for each area.

CTADGUM_pred <- STIecoPredict:::joinAllOutcomeData(LApred)
CTADGUM_pred <- STIecoPredict:::calcStats.CTADGUM_pred(CTADGUM_pred)

CTADGUM_pred <- merge(CTADGUM_pred, NatsalLA[,c("LA.Name", "value")], by.x = "LA Name", by.y="LA.Name", all.x=TRUE)
CTADGUM_pred <- merge(CTADGUM_pred, NatsalLA.weights[,c("laname", "wtmean")], by.x = "LA Name", by.y="laname", all.x=TRUE)
CTADGUM_pred <- merge(CTADGUM_pred, NatsalLA.GUM.weights[,c("laname", "GUMwtmean")], by.x = "LA Name", by.y="laname", all.x=TRUE)
CTADGUM_pred <- merge(CTADGUM_pred, NatsalLAsize[,c("LA.Name", "value")], by.x="LA Name", by.y="LA.Name", all.x=TRUE)

names(CTADGUM_pred)[names(CTADGUM_pred)%in%c("value.x", "value.y")] <- c("NatsalLA", "NatsalLAsize")
CTADGUM_pred$NatsalLAsize[is.na(CTADGUM_pred$NatsalLAsize)] <- 1.5  #missing LAs

CTADGUM_pred$`LA Name` <- as.factor(CTADGUM_pred$`LA Name`)
CTADGUM_pred$laname.num <- as.numeric(CTADGUM_pred$`LA Name`)

CTADGUM_pred$Region <- droplevels(CTADGUM_pred$Region)
CTADGUM_pred <- CTADGUM_pred[order(CTADGUM_pred$Region), ]
x <- by(CTADGUM_pred, CTADGUM_pred$Region, nrow)
CTADGUM_pred$LAnum_inregion <- unlist(sapply(x, function(z) z:1))

save(CTADGUM_pred, file="data/CTADGUM_pred.RData")

n8thangreen/STIecoPredict documentation built on June 7, 2020, 12:50 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

n8thangreen/STIecoPredict
Multilevel Regression With Post-Stratification Using Natsal Data

In n8thangreen/STIecoPredict: Multilevel Regression With Post-Stratification Using Natsal Data

Missing LA Random Effects and Neighbour Smoothing

Append predictions to joint distribution dataset

Surveillance data

R Package Documentation

Browse R Packages

We want your feedback!

n8thangreen/STIecoPredict Multilevel Regression With Post-Stratification Using Natsal Data

In n8thangreen/STIecoPredict: Multilevel Regression With Post-Stratification Using Natsal Data

Missing LA Random Effects and Neighbour Smoothing

Append predictions to joint distribution dataset

Surveillance data

R Package Documentation

Browse R Packages

We want your feedback!

n8thangreen/STIecoPredict
Multilevel Regression With Post-Stratification Using Natsal Data