This example demonstrates multiple imputation with standard errors from balanced repeated replication (BRR).
Workflow:
For each replicate: 4. MCMC Model 5. Draw Parameter Sets
For each parameter set:
6. Simulate Usual Intakes
7. Regression Model
8. Average Model Coefficients
The dataset used for this analysis is derived from the 2005-2010
NHANES data. A subset of six strata (SDMVSTRA) will be used
to reduce computation time and allow this example to run in real
time.
The nutrient being analyzed is potassium (TPOTA).
The covariates being examined are smoking status
(SMK_REC), age (RIDAGEYR), and sex
(RIAGENDR). The two nuisance covariates are whether the
recall was on a weekend (Weekend) and and whether the
recall is on day 2 (Day2).
The outcome variable being measured will be systolic blood pressure
(BPSY_AVG).
Subjects with missing values are removed, and variables are defined for the analysis.
#subset data
input.dataset <- nhcvd[nhcvd$SDMVSTRA %in% c(48, 54, 60, 66, 72, 78),]
#Define indicator for Day 2
input.dataset$Day2 <- (input.dataset$DAY == 2)
#remove subjects that are missing any covariates, variables, or outcomes
missing.covariates <- is.na(input.dataset$SMK_REC) | is.na(input.dataset$RIDAGEYR) | is.na(input.dataset$RIAGENDR) |
is.na(input.dataset$Weekend) | is.na(input.dataset$Day2)
missing.variables <- is.na(input.dataset$TPOTA)
missing.outcomes <- is.na(input.dataset$BPSY_AVG)
input.dataset <- input.dataset[!missing.covariates & !missing.variables & !missing.outcomes,]
#rename potassium variable for readability
input.dataset$Potassium <- input.dataset$TPOTABRR replicate weights are created for the dataset.
#generate BRR weights
fay.factor <- 0.7
input.dataset <- brr_weights(input.data=input.dataset,
id="SEQN",
strata="SDMVSTRA",
psu="SDMVPSU",
cell="PSCELL",
weight="WTDRD1",
fay.factor=fay.factor)The potassium variable can now be transformed and standardized for use in the MCMC algorithm.
#Winsorize extreme values of potassium intake
wins.potassium <- boxcox_survey(input.data=input.dataset,
row.subset=(input.dataset$Day2 == 0),
variable="Potassium",
weight="RepWt_0",
do.winsorization=TRUE,
id="SEQN",
repeat.obs="DAY")
#> Outliers and Suggested Winsorized values for Potassium
#> SEQN DAY Potassium Potassium.winsorized
#> 57963 2 0 42.45263
input.dataset$Potassium <- ifelse(input.dataset$Potassium < 42.45263, 42.45263, input.dataset$Potassium)
#find best Box-Cox lambda parameter
boxcox.lambda.data <- boxcox_survey(input.data=input.dataset,
row.subset=(input.dataset$Day2 == 0),
variable="Potassium",
weight="RepWt_0",
covariates=c("SMK_REC", "RIDAGEYR", "RIAGENDR", "Weekend"),
cat.covariates="SMK_REC",
cat.reference=3)
#Calculate minimum consumption amount
minimum.amount.data <- calculate_minimum_amount(input.data=input.dataset,
row.subset=(input.dataset$Day2 == 0),
daily.variables="Potassium")
#transform and standardize
pre.mcmc.data <- nci_multivar_preprocessor(input.data=input.dataset,
daily.variables="Potassium",
continuous.covariates="RIDAGEYR",
boxcox.lambda.data=boxcox.lambda.data,
minimum.amount.data=minimum.amount.data)MCMC models are fit for the base weight and each set of replicate weights. Unlike in regression calibration, the outcome variable is included as a covariate in the measurement error model.
Instead of conditional U matrices drawn after the MCMC chain,
multiple imputation draws U matrices from the chain itself. The
save.mcmc.u parameter is set to TRUE so that
the U matrices created in the main MCMC chain are saved.
num.brr <- 8
mcmc.brr <- vector(mode="list", length=num.brr + 1)
for(brr.rep in 0:num.brr) {
print(paste0("Starting Iteration ", brr.rep))
mcmc.brr[[brr.rep + 1]] <- nci_multivar_mcmc(pre.mcmc.data=pre.mcmc.data,
id="SEQN",
repeat.obs="DAY",
weight=paste0("RepWt_", brr.rep),
daily.variables="Potassium",
default.covariates=c("SMK_REC", "std.RIDAGEYR", "RIAGENDR", "Day2", "Weekend", "BPSY_AVG"),
cat.covariates="SMK_REC",
cat.reference=3,
num.mcmc.iterations=3000,
num.burn=1000,
num.thin=2,
save.mcmc.u=TRUE,
mcmc.seed=(9999 + brr.rep))
}
#> [1] "Starting Iteration 0"
#> [1] "Starting Iteration 1"
#> [1] "Starting Iteration 2"
#> [1] "Starting Iteration 3"
#> [1] "Starting Iteration 4"
#> [1] "Starting Iteration 5"
#> [1] "Starting Iteration 6"
#> [1] "Starting Iteration 7"
#> [1] "Starting Iteration 8"The population base for simulating usual intakes is generated from the input data.
#get first instance of each subject
mcmc.input.data <- pre.mcmc.data$mcmc.input
distrib.pop <- mcmc.input.data[!duplicated(mcmc.input.data$SEQN),]
num.subjects <- nrow(distrib.pop)
#Set Day 2 to zero to factor out the effect of Day 2 recalls
distrib.pop$Day2 <- 0
#create repeats of each subject for weekday and weekend consumption
distrib.pop <- distrib.pop[rep(seq_len(num.subjects), each=2),]
distrib.pop$Weekend <- rep(c(0,1), num.subjects)
distrib.pop$Weekend.Weight <- rep(c(4,3), num.subjects)To perform multiple imputation, parameter sets will be drawn from the
distribution of the MCMC parameters. This can be using the
draw_parameters() function which will select iterations in
the MCMC chain that are spaced far enough apart to ensure that they are
independent. This example will use 10 parameter sets, specified using
num.draws in draw_parameters().
A single set of usual intakes are then simulated with
nci_multivar_distrib() for each parameter set. Since U
matrices in the main MCMC chain were saved from
nci_multivar_mcmc(), a U matrix conditional on the
parameters will be drawn with each parameter set.
A regression model is fit on the simulated usual intakes for each parameter set. The model coefficients are averaged across the parameter sets and saved for each BRR replicate.
summary.brr <- vector(mode="list", length=num.brr + 1)
for(brr.rep in 0:num.brr) {
print(paste0("Starting Iteration ", brr.rep))
#draw 10 sets of parameters from MCMC parameter distribution
num.sets <- 10
mcmc.parameter.sets <- draw_parameters(mcmc.brr[[brr.rep + 1]], num.draws=num.sets)
model.coefficients <- vector(mode="list", length=num.sets)
for(set.num in 1:num.sets) {
#simulate 1 usual intake for each subject using the U matrix from the model parameter set
regression.data <- nci_multivar_distrib(multivar.mcmc.model=mcmc.parameter.sets[[set.num]],
distrib.population=distrib.pop,
id="SEQN",
weight=paste0("RepWt_", brr.rep),
nuisance.weight="Weekend.Weight",
additional.output=c("SMK_REC", "RIDAGEYR", "RIAGENDR", "BPSY_AVG"),
use.mcmc.u.matrices=TRUE,
distrib.seed=(99999 + num.sets*brr.rep + set.num))
#scale down simulated potassium intake by 1000 to show the effect per 1,000 mg of potassium
regression.data$usual.intake.Potassium <- regression.data$usual.intake.Potassium/1000
#define smoking status as factor for regression
regression.data$SMK_REC <- factor(regression.data$SMK_REC, levels=c(3, 1, 2), labels=c("Never", "Current", "Former"))
#fit linear model of systolic blood pressure against the average simulated potassium intakes and save coefficients
bp.model <- lm(BPSY_AVG ~ usual.intake.Potassium + SMK_REC + RIDAGEYR + RIAGENDR,
data=regression.data,
weights=regression.data[,paste0("RepWt_", brr.rep)])
model.coefficients[[set.num]] <- summary_coefficients(model=bp.model)
}
#save average of predicted values across parameter sets
coefficient.values <- vapply(model.coefficients, function(set) set$value, model.coefficients[[1]]$value)
mean.coefficients <- rowMeans(coefficient.values)
summary.brr[[brr.rep + 1]] <- data.frame(model.coefficients[[1]][,c("population", "variable", "statistic")],
value=mean.coefficients)
}
#> [1] "Starting Iteration 0"
#> [1] "Starting Iteration 1"
#> [1] "Starting Iteration 2"
#> [1] "Starting Iteration 3"
#> [1] "Starting Iteration 4"
#> [1] "Starting Iteration 5"
#> [1] "Starting Iteration 6"
#> [1] "Starting Iteration 7"
#> [1] "Starting Iteration 8"Variances and confidence intervals are calculated using the BRR replicates.
#calculate degrees of freedom
df <- length(unique(input.dataset$SDMVSTRA))
#extract point estimate and BRR replicates
point <- summary.brr[[1]]$value
reps <- vapply(summary.brr[-1], function(brr.i) brr.i$value, point)
#calculate BRR standard error
std.error <- sqrt(rowSums((reps - point)^2)/(num.brr*fay.factor^2))
#95% confidence intervals
confidence.lower <- point + qt(0.025, df)*std.error
confidence.upper <- point + qt(0.975, df)*std.error
#create summary report
summary.report <- data.frame(summary.brr[[1]],
std.error=std.error,
confidence.lower=confidence.lower,
confidence.upper=confidence.upper)
summary.report
#> population variable statistic value
#> 1 All BPSY_AVG Coefficient for (Intercept) 112.7443138
#> 2 All BPSY_AVG Coefficient for usual.intake.Potassium -2.2471936
#> 3 All BPSY_AVG Coefficient for SMK_RECCurrent -0.5918735
#> 4 All BPSY_AVG Coefficient for SMK_RECFormer 0.2466777
#> 5 All BPSY_AVG Coefficient for RIDAGEYR 0.3535025
#> 6 All BPSY_AVG Coefficient for RIAGENDR -6.4673766
#> std.error confidence.lower confidence.upper
#> 1 5.19803152 100.0251889 125.4634387
#> 2 1.40398453 -5.6826200 1.1882328
#> 3 0.99822300 -3.0344372 1.8506902
#> 4 0.51721883 -1.0189112 1.5122666
#> 5 0.01762723 0.3103703 0.3966348
#> 6 1.12918523 -9.2303933 -3.7043598