Multiple Imputation with BRR Standard Errors

Amer Moosa

7/23/2024

This example demonstrates multiple imputation with standard errors from balanced repeated replication (BRR).

Workflow:

  1. Data Cleaning
  2. Replicate Weights
  3. Pre-Processing

For each replicate: 4. MCMC Model 5. Draw Parameter Sets

For each parameter set:
  6. Simulate Usual Intakes
  7. Regression Model
  
8. Average Model Coefficients
  1. Variances and Confidence Intervals
library(ncimultivar)

The dataset used for this analysis is derived from the 2005-2010 NHANES data. A subset of six strata (SDMVSTRA) will be used to reduce computation time and allow this example to run in real time.

The nutrient being analyzed is potassium (TPOTA).

The covariates being examined are smoking status (SMK_REC), age (RIDAGEYR), and sex (RIAGENDR). The two nuisance covariates are whether the recall was on a weekend (Weekend) and and whether the recall is on day 2 (Day2).

The outcome variable being measured will be systolic blood pressure (BPSY_AVG).

Subjects with missing values are removed, and variables are defined for the analysis.

#subset data
input.dataset <- nhcvd[nhcvd$SDMVSTRA %in% c(48, 54, 60, 66, 72, 78),]

#Define indicator for Day 2
input.dataset$Day2 <- (input.dataset$DAY == 2)

#remove subjects that are missing any covariates, variables, or outcomes
missing.covariates <- is.na(input.dataset$SMK_REC) | is.na(input.dataset$RIDAGEYR) | is.na(input.dataset$RIAGENDR) | 
                      is.na(input.dataset$Weekend) | is.na(input.dataset$Day2)
missing.variables <- is.na(input.dataset$TPOTA)
missing.outcomes <- is.na(input.dataset$BPSY_AVG)

input.dataset <- input.dataset[!missing.covariates & !missing.variables & !missing.outcomes,]

#rename potassium variable for readability
input.dataset$Potassium <- input.dataset$TPOTA

BRR replicate weights are created for the dataset.

#generate BRR weights
fay.factor <- 0.7

input.dataset <- brr_weights(input.data=input.dataset,
                             id="SEQN",
                             strata="SDMVSTRA",
                             psu="SDMVPSU",
                             cell="PSCELL",
                             weight="WTDRD1",
                             fay.factor=fay.factor)

The potassium variable can now be transformed and standardized for use in the MCMC algorithm.

#Winsorize extreme values of potassium intake
wins.potassium <- boxcox_survey(input.data=input.dataset,
                                row.subset=(input.dataset$Day2 == 0),
                                variable="Potassium",
                                weight="RepWt_0",
                                do.winsorization=TRUE,
                                id="SEQN",
                                repeat.obs="DAY")
#> Outliers and Suggested Winsorized values for Potassium
#>   SEQN DAY Potassium Potassium.winsorized
#>  57963   2         0             42.45263

input.dataset$Potassium <- ifelse(input.dataset$Potassium < 42.45263, 42.45263, input.dataset$Potassium)

#find best Box-Cox lambda parameter
boxcox.lambda.data <- boxcox_survey(input.data=input.dataset,
                                    row.subset=(input.dataset$Day2 == 0),
                                    variable="Potassium",
                                    weight="RepWt_0",
                                    covariates=c("SMK_REC", "RIDAGEYR", "RIAGENDR", "Weekend"),
                                    cat.covariates="SMK_REC",
                                    cat.reference=3)

#Calculate minimum consumption amount
minimum.amount.data <- calculate_minimum_amount(input.data=input.dataset,
                                                row.subset=(input.dataset$Day2 == 0),
                                                daily.variables="Potassium")

#transform and standardize
pre.mcmc.data <- nci_multivar_preprocessor(input.data=input.dataset,
                                           daily.variables="Potassium",
                                           continuous.covariates="RIDAGEYR",
                                           boxcox.lambda.data=boxcox.lambda.data,
                                           minimum.amount.data=minimum.amount.data)

MCMC models are fit for the base weight and each set of replicate weights. Unlike in regression calibration, the outcome variable is included as a covariate in the measurement error model.

Instead of conditional U matrices drawn after the MCMC chain, multiple imputation draws U matrices from the chain itself. The save.mcmc.u parameter is set to TRUE so that the U matrices created in the main MCMC chain are saved.

num.brr <- 8

mcmc.brr <- vector(mode="list", length=num.brr + 1)

for(brr.rep in 0:num.brr) {
  
  print(paste0("Starting Iteration ", brr.rep))
  
  mcmc.brr[[brr.rep + 1]] <- nci_multivar_mcmc(pre.mcmc.data=pre.mcmc.data,
                                               id="SEQN",
                                               repeat.obs="DAY",
                                               weight=paste0("RepWt_", brr.rep),
                                               daily.variables="Potassium",
                                               default.covariates=c("SMK_REC", "std.RIDAGEYR", "RIAGENDR", "Day2", "Weekend", "BPSY_AVG"),
                                               cat.covariates="SMK_REC",
                                               cat.reference=3,
                                               num.mcmc.iterations=3000,
                                               num.burn=1000,
                                               num.thin=2,
                                               save.mcmc.u=TRUE,
                                               mcmc.seed=(9999 + brr.rep))
}
#> [1] "Starting Iteration 0"
#> [1] "Starting Iteration 1"
#> [1] "Starting Iteration 2"
#> [1] "Starting Iteration 3"
#> [1] "Starting Iteration 4"
#> [1] "Starting Iteration 5"
#> [1] "Starting Iteration 6"
#> [1] "Starting Iteration 7"
#> [1] "Starting Iteration 8"

The population base for simulating usual intakes is generated from the input data.

#get first instance of each subject
mcmc.input.data <- pre.mcmc.data$mcmc.input
distrib.pop <- mcmc.input.data[!duplicated(mcmc.input.data$SEQN),]
num.subjects <- nrow(distrib.pop)

#Set Day 2 to zero to factor out the effect of Day 2 recalls
distrib.pop$Day2 <- 0

#create repeats of each subject for weekday and weekend consumption
distrib.pop <- distrib.pop[rep(seq_len(num.subjects), each=2),]
distrib.pop$Weekend <- rep(c(0,1), num.subjects)
distrib.pop$Weekend.Weight <- rep(c(4,3), num.subjects)

To perform multiple imputation, parameter sets will be drawn from the distribution of the MCMC parameters. This can be using the draw_parameters() function which will select iterations in the MCMC chain that are spaced far enough apart to ensure that they are independent. This example will use 10 parameter sets, specified using num.draws in draw_parameters().

A single set of usual intakes are then simulated with nci_multivar_distrib() for each parameter set. Since U matrices in the main MCMC chain were saved from nci_multivar_mcmc(), a U matrix conditional on the parameters will be drawn with each parameter set.

A regression model is fit on the simulated usual intakes for each parameter set. The model coefficients are averaged across the parameter sets and saved for each BRR replicate.

summary.brr <- vector(mode="list", length=num.brr + 1)

for(brr.rep in 0:num.brr) {
  
  print(paste0("Starting Iteration ", brr.rep))
  
  #draw 10 sets of parameters from MCMC parameter distribution
  num.sets <- 10
  mcmc.parameter.sets <- draw_parameters(mcmc.brr[[brr.rep + 1]], num.draws=num.sets)
  
  model.coefficients <- vector(mode="list", length=num.sets)
  
  for(set.num in 1:num.sets) {
    
    #simulate 1 usual intake for each subject using the U matrix from the model parameter set
    regression.data <- nci_multivar_distrib(multivar.mcmc.model=mcmc.parameter.sets[[set.num]],
                                            distrib.population=distrib.pop,
                                            id="SEQN",
                                            weight=paste0("RepWt_", brr.rep),
                                            nuisance.weight="Weekend.Weight",
                                            additional.output=c("SMK_REC", "RIDAGEYR", "RIAGENDR", "BPSY_AVG"),
                                            use.mcmc.u.matrices=TRUE,
                                            distrib.seed=(99999 + num.sets*brr.rep + set.num))
    
    #scale down simulated potassium intake by 1000 to show the effect per 1,000 mg of potassium
    regression.data$usual.intake.Potassium <- regression.data$usual.intake.Potassium/1000
    
    #define smoking status as factor for regression
    regression.data$SMK_REC <- factor(regression.data$SMK_REC, levels=c(3, 1, 2), labels=c("Never", "Current", "Former"))
    
    #fit linear model of systolic blood pressure against the average simulated potassium intakes and save coefficients
    bp.model <- lm(BPSY_AVG ~ usual.intake.Potassium + SMK_REC + RIDAGEYR + RIAGENDR, 
                   data=regression.data, 
                   weights=regression.data[,paste0("RepWt_", brr.rep)])
    
    model.coefficients[[set.num]] <- summary_coefficients(model=bp.model)
  }
  
  #save average of predicted values across parameter sets
  coefficient.values <- vapply(model.coefficients, function(set) set$value, model.coefficients[[1]]$value)
  mean.coefficients <- rowMeans(coefficient.values)
  
  summary.brr[[brr.rep + 1]] <- data.frame(model.coefficients[[1]][,c("population", "variable", "statistic")],
                                           value=mean.coefficients)
}
#> [1] "Starting Iteration 0"
#> [1] "Starting Iteration 1"
#> [1] "Starting Iteration 2"
#> [1] "Starting Iteration 3"
#> [1] "Starting Iteration 4"
#> [1] "Starting Iteration 5"
#> [1] "Starting Iteration 6"
#> [1] "Starting Iteration 7"
#> [1] "Starting Iteration 8"

Variances and confidence intervals are calculated using the BRR replicates.

#calculate degrees of freedom
df <- length(unique(input.dataset$SDMVSTRA))

#extract point estimate and BRR replicates
point <- summary.brr[[1]]$value
reps <- vapply(summary.brr[-1], function(brr.i) brr.i$value, point)

#calculate BRR standard error
std.error <- sqrt(rowSums((reps - point)^2)/(num.brr*fay.factor^2))

#95% confidence intervals
confidence.lower <- point + qt(0.025, df)*std.error
confidence.upper <- point + qt(0.975, df)*std.error

#create summary report
summary.report <- data.frame(summary.brr[[1]], 
                             std.error=std.error,
                             confidence.lower=confidence.lower,
                             confidence.upper=confidence.upper)

summary.report
#>   population variable                              statistic       value
#> 1        All BPSY_AVG            Coefficient for (Intercept) 112.7443138
#> 2        All BPSY_AVG Coefficient for usual.intake.Potassium  -2.2471936
#> 3        All BPSY_AVG         Coefficient for SMK_RECCurrent  -0.5918735
#> 4        All BPSY_AVG          Coefficient for SMK_RECFormer   0.2466777
#> 5        All BPSY_AVG               Coefficient for RIDAGEYR   0.3535025
#> 6        All BPSY_AVG               Coefficient for RIAGENDR  -6.4673766
#>    std.error confidence.lower confidence.upper
#> 1 5.19803152      100.0251889      125.4634387
#> 2 1.40398453       -5.6826200        1.1882328
#> 3 0.99822300       -3.0344372        1.8506902
#> 4 0.51721883       -1.0189112        1.5122666
#> 5 0.01762723        0.3103703        0.3966348
#> 6 1.12918523       -9.2303933       -3.7043598