Regression Calibration with the NCI Method

Amer Moosa

1/17/2024

This example demonstrates regression calibration with standard errors from balanced repeated replication (BRR).

Workflow:

  1. Data Cleaning
  2. Replicate Weights
  3. Pre-Processing

For each replicate: 4. MCMC Model 5. Simulate Usual Intakes 6. Regression Calibration

  1. Variances and Confidence Intervals
library(ncimultivar)

The dataset used for this analysis is derived from the 2005-2010 NHANES data. A subset of six strata (SDMVSTRA) will be used to reduce computation time and allow this example to run in real time.

The nutrient being analyzed is potassium (TPOTA).

The covariates being examined are smoking status (SMK_REC), age (RIDAGEYR), and sex (RIAGENDR). The two nuisance covariates are whether the recall was on a weekend (Weekend) and and whether the recall is on day 2 (Day2).

The WTDRD1 variable is the base weighting for each observation.

The outcome variables are systolic blood pressure (BPSY_AVG) and hypertension (HTN_BIN). A linear model will be fit for systolic blood pressure and a logistic model will be fit for hypertension.

Subjects with missing values are removed, and variables are defined for the analysis.

#subset data
input.dataset <- nhcvd[nhcvd$SDMVSTRA %in% c(48, 54, 60, 66, 72, 78),]

#Define indicator for Day 2
input.dataset$Day2 <- (input.dataset$DAY == 2)

#remove subjects that are missing any covariates, variables, or outcomes
missing.covariates <- is.na(input.dataset$SMK_REC) | is.na(input.dataset$RIDAGEYR) | is.na(input.dataset$RIAGENDR) |
                      is.na(input.dataset$Weekend) | is.na(input.dataset$Day2)
missing.variables <- is.na(input.dataset$TPOTA)
missing.outcomes <- is.na(input.dataset$BPSY_AVG) | is.na(input.dataset$HTN_BIN)

input.dataset <- input.dataset[!missing.covariates & !missing.variables & !missing.outcomes,]

#rename potassium variable for readability
input.dataset$Potassium <- input.dataset$TPOTA

BRR replicate weights are created for the dataset.

#generate BRR weights
fay.factor <- 0.7

input.dataset <- brr_weights(input.data=input.dataset,
                             id="SEQN",
                             strata="SDMVSTRA",
                             psu="SDMVPSU",
                             cell="PSCELL",
                             weight="WTDRD1",
                             fay.factor=fay.factor)

The potassium variable can now be transformed and standardized for use in the MCMC algorithm.

#Winsorize extreme values of potassium intake
wins.potassium <- boxcox_survey(input.data=input.dataset,
                                row.subset=(input.dataset$Day2 == 0),
                                variable="Potassium",
                                weight="RepWt_0",
                                do.winsorization=TRUE,
                                id="SEQN",
                                repeat.obs="DAY")
#> Outliers and Suggested Winsorized values for Potassium
#>   SEQN DAY Potassium Potassium.winsorized
#>  57963   2         0             42.45263

input.dataset$Potassium <- ifelse(input.dataset$Potassium < 42.45263, 42.45263, input.dataset$Potassium)

#find best Box-Cox lambda parameter
boxcox.lambda.data <- boxcox_survey(input.data=input.dataset,
                                    row.subset=(input.dataset$Day2 == 0),
                                    variable="Potassium",
                                    weight="RepWt_0",
                                    covariates=c("SMK_REC", "RIDAGEYR", "RIAGENDR", "Weekend"),
                                    cat.covariates="SMK_REC",
                                    cat.reference=3)

#Calculate minimum consumption amount
minimum.amount.data <- calculate_minimum_amount(input.data=input.dataset,
                                                row.subset=(input.dataset$Day2 == 0),
                                                daily.variables="Potassium")

#Run MCMC pre-preprocessor
pre.mcmc.data <- nci_multivar_preprocessor(input.data=input.dataset,
                                           daily.variables="Potassium",
                                           continuous.covariates="RIDAGEYR",
                                           boxcox.lambda.data=boxcox.lambda.data,
                                           minimum.amount.data=minimum.amount.data)

The MCMC measurement error model can now be fit for all BRR replicates. The outcome variable is not involved in the measurement error model.

When performing regression calibration using the same population used in the MCMC (as in this example), conditional U matrix draws must be used for generating usual intakes.

num.brr <- 8

mcmc.brr <- vector(mode="list", length=num.brr + 1)

for(brr.rep in 0:num.brr) {
  
  print(paste0("Starting Iteration ", brr.rep))
  
  mcmc.brr[[brr.rep + 1]] <- nci_multivar_mcmc(pre.mcmc.data=pre.mcmc.data,
                                               id="SEQN",
                                               repeat.obs="DAY",
                                               weight=paste0("RepWt_", brr.rep),
                                               daily.variables="Potassium",
                                               default.covariates=c("SMK_REC", "std.RIDAGEYR", "RIAGENDR", "Day2", "Weekend"),
                                               cat.covariates="SMK_REC",
                                               cat.reference=3,
                                               num.mcmc.iterations=3000,
                                               num.burn=1000,
                                               num.thin=2,
                                               num.post=500,
                                               mcmc.seed=(9999 + brr.rep))
}
#> [1] "Starting Iteration 0"
#> [1] "Starting Iteration 1"
#> [1] "Starting Iteration 2"
#> [1] "Starting Iteration 3"
#> [1] "Starting Iteration 4"
#> [1] "Starting Iteration 5"
#> [1] "Starting Iteration 6"
#> [1] "Starting Iteration 7"
#> [1] "Starting Iteration 8"

The population base for simulating usual intakes is generated from the input data.

#get first instance of each subject
mcmc.input.data <- pre.mcmc.data$mcmc.input
distrib.pop <- mcmc.input.data[!duplicated(mcmc.input.data$SEQN),]
num.subjects <- nrow(distrib.pop)

#Set Day 2 to zero to factor out the effect of Day 2 recalls
distrib.pop$Day2 <- 0

#create repeats of each subject for weekday and weekend consumption
distrib.pop <- distrib.pop[rep(seq_len(num.subjects), each=2),]
distrib.pop$Weekend <- rep(c(0,1), num.subjects)
distrib.pop$Weekend.Weight <- rep(c(4,3), num.subjects)

The nci_multivar_distrib() function is used to generate 500 usual intakes for each subject in the population using the conditional U matrices from the MCMC. This dataset represents the conditional expectation of usual intake given the observed data for each subject to be used in the regression calibration procedure. It is not a prediction or calculation of the true usual intake for individuals.

The outcome variable and covariates for the regression are included in the output dataset using additional.output.

To perform regression calibration, the 500 usual intakes for each subject are averaged to calculate proxy usual intakes to use in the regression model. The regressions are then fit using the proxy usual intakes as a predictor. While the proxy intakes are used in place of the true usual intakes in the regression, they are poor predictions of true usual intake for individuals.

The model coefficients are saved using summary_coefficients().

This process is repeated for the base weight and each set of replicate weights.

summary.brr <- vector(mode="list", length=num.brr + 1)

for(brr.rep in 0:num.brr) {
  
  print(paste0("Starting Iteration ", brr.rep))

  #simulate usual intakes for each subject using MCMC U matrices
  distrib.data <- nci_multivar_distrib(multivar.mcmc.model=mcmc.brr[[brr.rep + 1]],
                                       distrib.population=distrib.pop,
                                       id="SEQN",
                                       weight=paste0("RepWt_", brr.rep),
                                       nuisance.weight="Weekend.Weight",
                                       additional.output=c("SMK_REC", "RIDAGEYR", "RIAGENDR", "BPSY_AVG", "HTN_BIN"),
                                       use.mcmc.u.matrices=TRUE,
                                       distrib.seed=99999+brr.rep)

  #take average usual intake per subject
  regression.data <- aggregate(distrib.data[,"usual.intake.Potassium",drop=FALSE], 
                               by=distrib.data[,c("SEQN", paste0("RepWt_", brr.rep), "SMK_REC", "RIDAGEYR", "RIAGENDR", 
                                                  "BPSY_AVG", "HTN_BIN")], 
                               mean)
  
  #scale down simulated potassium intake by 1000 to show the effect per 1,000 mg of potassium
  regression.data$usual.intake.Potassium <- regression.data$usual.intake.Potassium/1000
  
  #define smoking status as factor for regression
  regression.data$SMK_REC <- factor(regression.data$SMK_REC, levels=c(3, 1, 2), labels=c("Never", "Current", "Former"))
  
  #rescale weights so that models can converge
  regression.data[,paste0("RepWt_", brr.rep)] <- regression.data[,paste0("RepWt_", brr.rep)]/mean(regression.data[,paste0("RepWt_", brr.rep)])
  
  #fit linear model of systolic blood pressure against the average simulated potassium intakes
  bp.model <- lm(BPSY_AVG ~ usual.intake.Potassium + SMK_REC + RIDAGEYR + RIAGENDR, 
                 data=regression.data, 
                 weights=regression.data[,paste0("RepWt_", brr.rep)])
  
  bp.parameters <- summary_coefficients(model=bp.model)
  
  #fit logistic model of diagnosed hypertension against the average simulated potassium intakes
  #family is specified as quasibinomial to handle non-integer weights
  htn.model <- glm(HTN_BIN ~ usual.intake.Potassium + SMK_REC + RIDAGEYR + RIAGENDR,
                   data=regression.data, 
                   weights=regression.data[,paste0("RepWt_", brr.rep)],
                   family=quasibinomial(link="logit"))
  
  htn.parameters <- summary_coefficients(model=htn.model)
  
  summary.brr[[brr.rep + 1]] <- rbind(bp.parameters, 
                                      htn.parameters)
}
#> [1] "Starting Iteration 0"
#> [1] "Starting Iteration 1"
#> [1] "Starting Iteration 2"
#> [1] "Starting Iteration 3"
#> [1] "Starting Iteration 4"
#> [1] "Starting Iteration 5"
#> [1] "Starting Iteration 6"
#> [1] "Starting Iteration 7"
#> [1] "Starting Iteration 8"

Variances and confidence intervals are calculated using the BRR replicates.

#calculate degrees of freedom
df <- length(unique(input.dataset$SDMVSTRA))

#extract point estimate and BRR replicates
point <- summary.brr[[1]]$value
reps <- vapply(summary.brr[-1], function(brr.i) brr.i$value, point)

#calculate BRR standard error
std.error <- sqrt(rowSums((reps - point)^2)/(num.brr*fay.factor^2))

#95% confidence intervals
confidence.lower <- point + qt(0.025, df)*std.error
confidence.upper <- point + qt(0.975, df)*std.error

#create summary report
summary.report <- data.frame(summary.brr[[1]], 
                             std.error=std.error,
                             confidence.lower=confidence.lower,
                             confidence.upper=confidence.upper)

summary.report
#>    population variable                              statistic        value
#> 1         All BPSY_AVG            Coefficient for (Intercept) 113.05108719
#> 2         All BPSY_AVG Coefficient for usual.intake.Potassium  -2.35735666
#> 3         All BPSY_AVG         Coefficient for SMK_RECCurrent  -0.59002768
#> 4         All BPSY_AVG          Coefficient for SMK_RECFormer   0.28047403
#> 5         All BPSY_AVG               Coefficient for RIDAGEYR   0.35397544
#> 6         All BPSY_AVG               Coefficient for RIAGENDR  -6.51452720
#> 7         All  HTN_BIN            Coefficient for (Intercept)  -2.88269021
#> 8         All  HTN_BIN Coefficient for usual.intake.Potassium  -0.22628765
#> 9         All  HTN_BIN         Coefficient for SMK_RECCurrent   0.60912325
#> 10        All  HTN_BIN          Coefficient for SMK_RECFormer   0.24351942
#> 11        All  HTN_BIN               Coefficient for RIDAGEYR   0.04354076
#> 12        All  HTN_BIN               Coefficient for RIAGENDR  -0.36279833
#>      std.error confidence.lower confidence.upper
#> 1  4.732255653     101.47167475     124.63049963
#> 2  1.221397873      -5.34600959       0.63129627
#> 3  0.990866015      -3.01458947       1.83453412
#> 4  0.539864757      -1.04052744       1.60147550
#> 5  0.016996676       0.31238607       0.39556481
#> 6  1.117872687      -9.24986313      -3.77919127
#> 7  0.465241367      -4.02109483      -1.74428560
#> 8  0.131607955      -0.54832072       0.09574542
#> 9  0.231068624       0.04371870       1.17452781
#> 10 0.230022847      -0.31932621       0.80636505
#> 11 0.004818188       0.03175108       0.05533045
#> 12 0.218058347      -0.89636788       0.17077122