library(lme4)
library(car)
library(arm)
library(pander)
library(knitr)
library(stargazer)
library(xtable)
library(lattice)
library(plyr)
library(stringr)
library(reshape2)

# library(STIecoPredict)
library("STIecoPredict", lib.loc="~/R/win-library/3.2")
savedata <- FALSE
load("./data/cleaned-regn-input-mrpNatsal.RData")
Natsal0 <- Natsal
Natsal <- subset(Natsal, age>15 & age<25)   #NCSP range

## fit to London LAs only
subnames <- unique(Natsal$laname[Natsal$gor==9])
Natsal2  <- subset(Natsal, laname%in%subnames)

Natsal2 <- Natsal

## fit to non-small sampled LAs
# subnames2 <- names(table(Natsal2$laname)[(table(Natsal2$laname))>20])
# Natsal2  <- subset(Natsal2, laname%in%subnames2)
## >28 is 75% quantile
fit <- glmer(formula = cttestly ~ 1 + student + sex + (1|age) + (1|ethnic2) + I(`Average Score`>28) + (1|metcounty_UA) + (1|gor),# + (1|laname),#  , # + I(gor==9),
             family = binomial(link="logit"),
             weights = total_wt.int,
             # data = Natsal)
             data = Natsal2)

lattice::dotplot(ranef(fit, condVar=TRUE))
## add some missing random effects
missingLA.names <- unique(Natsal$laname[Natsal$gor==9])[!unique(Natsal$laname[Natsal$gor==9])%in%rownames(ranef(fit)$laname)]

la.ranefs <- array(0, dim=c(length(missingLA.names),1))
dimnames(la.ranefs) <- list(missingLA.names, "(Intercept)")
la.ranefs <- rbind(ranef(fit)$laname, la.ranefs)
## fitted LAs
LAnames <- rownames(ranef(fit)$laname)
numNeighbours <- rowSums(madjacency[,-1])

madjacencyRE <- madjacency
for (i in LAnames){
    madjacencyRE[, i] <- madjacencyRE[, i]*ranef(fit)$laname[i,]
}
madjacencyRE[madjacencyRE==1] <- 0

rowAverages <- array(rowSums(madjacencyRE[,-1])/numNeighbours, dim=c(length(numNeighbours),1))
rowAverages[is.nan(rowAverages)] <- 0
dimnames(rowAverages) <- list(madjacencyRE[,1], "(Intercept)")

missingLANatsal.names <- unique(Natsal$laname)[!unique(Natsal$laname)%in%rownames(ranef(fit)$laname)]
missingLAsim_prop.names <- unique(sim_prop_la$LAname)[!unique(sim_prop_la$LAname)%in%rownames(ranef(fit)$laname)]
missingLA.names <- union(missingLAsim_prop.names, missingLANatsal.names)

la.ranefs <- subset(rowAverages, rownames(rowAverages) %in% missingLA.names)
la.ranefs <- rbind(ranef(fit)$laname, la.ranefs)
names(madjacency) <- LAnameClean(names(madjacency))
madjacency[,1] <- LAnameClean(madjacency[, 1])

## fitted LAs
LAnames <- rownames(ranef(fit)$laname)

madjacencyRE <- madjacency

madjacencyRE <- madjacencyRE[, c(TRUE,(names(madjacency)%in%LAnames)[-1])]
madjacencyRE <- madjacencyRE[(names(madjacency)%in%LAnames)[-1],]

for (i in 1:nrow(madjacencyRE)) madjacencyRE[i, i+1] <- 1
numNeighbours <- rowSums(madjacencyRE[, -1])

for (i in LAnames){
    madjacencyRE[, i] <- madjacencyRE[, i] * ranef(fit)$laname[i, ]
}
madjacencyRE[madjacencyRE == 1] <- 0

rowAverages <- array(rowSums(madjacencyRE[,-1])/numNeighbours, dim=c(length(numNeighbours),1))
rowAverages[is.nan(rowAverages)] <- 0
dimnames(rowAverages) <- list(madjacencyRE[,1], "(Intercept)")

missingLANatsal.names <- unique(Natsal$laname)[!unique(Natsal$laname)%in%rownames(rowAverages)]
missingLAsim_prop.names <- unique(sim_prop_la$LAname)[!unique(sim_prop_la$LAname)%in%rownames(rowAverages)]
missingLA.names <- union(missingLAsim_prop.names, missingLANatsal.names)

la.ranefs <- array(0, dim=c(length(missingLA.names),1))
dimnames(la.ranefs) <- list(missingLA.names, "(Intercept)")
la.ranefs <- rbind(rowAverages, la.ranefs)

lattice::dotplot(sort(la.ranefs[la.ranefs!=0,]))

try different ethnic group probabilities

# load("../../packages/STIecoPredict/data/LA_ethnic2_london.RData")
sim_prop_la <- sim_prop_la[,names(sim_prop_la)!="p.ethnic2"]
sim_prop_la <- merge(sim_prop_la, LA_ethnic2_london, by=c("LAname", "ethnic2", "age", "sex"), all.x=FALSE, all.y=FALSE)

Append predictions to joint distribution dataset

fixeff.formula <- as.formula("cttestly ~ student + sex + age + ethnic2")

# test.calcTotalProbs(sim_prop_la)
sim_prop_la.fit <- STIecoPredict:::calcTotalProbs(formula=fixeff.formula, data=sim_prop_la, extracols = c("LAname", "Average Score", "metcounty_UA"))

pred <- arm::invlogit(
                    fixef(fit)["(Intercept)"] +

                    fixef(fit)["sexWomen"]*(as.character(sim_prop_la.fit$sex)=="Women") +

                    fixef(fit)["studentTRUE"]*(sim_prop_la.fit$student) +

                    ranef(fit)$ethnic2[as.character(sim_prop_la.fit$ethnic2),1] +

                    ranef(fit)$age[as.character(sim_prop_la.fit$age),1] +

                    # ranef(fit)$laname[as.character(sim_prop_la.fit$LAname),1] +
                    la.ranefs[as.character(sim_prop_la.fit$LAname),1] +

                    ranef(fit)$metcounty_UA[as.character(sim_prop_la.fit$metcounty_UA),1] +

                    fixef(fit)["I(`Average Score` > 28)TRUE"]*(sim_prop_la.fit$`Average Score`>28)
)
## weight the probabilities by the subpopulation sizes and then sum by LA
# tapply(sim_prop_la.fit$totalprob, sim_prop_la.fit$LAname, sum)

predweighted <- pred * sim_prop_la.fit$totalprob
LApred <- tapply(predweighted, sim_prop_la.fit$LAname, sum)

The LA specific post-stratified estimates are then

LApred <- data.frame(LApred=LApred[order(LApred)])

Re-adjust for conditioning of ages between 16-24 only

# popCensus <- read.csv("..\\..\\packages\\STIecoPredict\\raw-data\\popCensus.csv")
LApred$LAname <- rownames(LApred)
popCensus$LAname <- STIecoPredict:::LAnameClean(popCensus$LAname)
rownames(LApred) <- NULL
LApred <- merge(LApred, popCensus[popCensus$Sex=="All", c("LAname","prob.25.and.over")], all.x=TRUE)
LApred$LApred.adj <- LApred$LApred*(1-LApred$prob.25.and.over)

# LApred[order(LApred$LApred.adj),]
save(pred, LApred, file="data/predictions.RData")
agerange <- 15:24
NatsalLA <- melt(tapply(Natsal$cttestly[Natsal$age%in%agerange], Natsal$laname[Natsal$age%in%agerange], mean), varnames = c("LA.Name"))
NatsalLAsize <- melt(tapply(Natsal$cttestly[Natsal$age%in%agerange], Natsal$laname[Natsal$age%in%agerange], length), varnames = c("LA.Name"))

Surveillance data

Combine predictions with the surveillance data and produce summary statistics for each area.

CTADGUM_pred <- STIecoPredict:::joinAllOutcomeData(LApred)
CTADGUM_pred <- STIecoPredict:::calcStats.CTADGUM_pred(CTADGUM_pred)
CTADGUM_pred <- merge(CTADGUM_pred, NatsalLA[,c("LA.Name", "value")], by.x = "LA Name", by.y="LA.Name", all.x=TRUE)
CTADGUM_pred <- merge(CTADGUM_pred, NatsalLAsize[,c("LA.Name", "value")], by.x="LA Name", by.y="LA.Name", all.x=TRUE)
names(CTADGUM_pred)[names(CTADGUM_pred)%in%c("value.x", "value.y")] <- c("NatsalLA", "NatsalLAsize")
save(CTADGUM_pred, file="data/CTADGUM_pred.RData")

Include uncertainty on ethnicity probabilities

TODO

library(gtools)
temp <- dcast(sim_prop_la, LAname+age+sex+smokenow+increasingdrinker+student~ethnic2, value.var = "p.ethnic2")
temp2 <- t(round(apply(temp[,7:12]*10, 1, function(x) rdirichlet(n=1, alpha=x)),5))
apply(temp2, 2, median)
colnames(temp2) <- names(temp[,7:12])
mtemp <- data.frame(temp[,-(7:12)], temp2)
sample.ethnic2 <- melt(mtemp, measure.vars = make.names(names(temp[,7:12])), variable.name = "ethnic2")


n8thangreen/STIecoPredict documentation built on June 7, 2020, 12:50 p.m.