/*
Title: Regression Calibration with the NCI Method
Author: NCI/Information Management Services
Date: 6/9/2025
*/

/*
The output of the NCI method can be used to perform statistical procedures for measurement error correction such as regression calibration.

This vignette assumes some familiarity with the NCI method and the ncimultivar package. 
A more in-depth overview of the basic NCI method procedures can be found in the daily nutrient vignette (daily_nutrient_analysis.sas).

This example will demonstrate the workflow of a basic regression calibration analysis:

1. Fit a measurement error model and simulate usual intakes using the NCI method
2. Fit models of outcome variables using the simulated usual intakes, averaged per subject, as the predictor
3. Calculate confidence intervals for the model parameters

The standard errors for the model parameters will be calculated using balanced repeated replication (BRR).
*/

libname indata "../data";

%include "../macros/ncimultivar.sas";

/*
The dataset used for this analysis is derived from the 2005-2010 NHANES data.
A subset of six strata (SDMVSTRA) will be used to reduce computation time and allow this example to run in real time.

The nutrient being analyzed is potassium (TPOTA).

The covariates being examined are smoking status (SMK_REC), age (RIDAGEYR), and sex (RIAGENDR). 
The two nuisance covariates are whether the recall was on a weekend (Weekend) and whether the recall was on day 2 (Day2).

The WTDRD1 variable is the base weighting for each observation.

The outcome variables are systolic blood pressure (BPSY_AVG) and hypertension (HTN_BIN).
A linear model will be fit for systolic blood pressure and a logistic model will be fit for hypertension.

Subjects with missing values are removed, and categorical variables are transformed into binary indicators. 
*/

**keep only the six example strata and flag second-day recalls;
data input_dataset;
	set indata.nhcvd;
	
	**discard every record that falls outside the selected strata;
	if SDMVSTRA not in (48, 54, 60, 66, 72, 78) then delete;
	
	**Day2 is 1 when DAY equals 2 and 0 for every other recall;
	if DAY = 2 then Day2 = 1;
	else Day2 = 0;
run;

data input_dataset;
	set input_dataset;
	
	**drop any record with a missing covariate, nuisance variable, nutrient, or outcome;
	if missing(SMK_REC) or
		 missing(RIDAGEYR) or
		 missing(RIAGENDR) or
		 missing(Weekend) or
		 missing(Day2) or
		 missing(TPOTA) or
		 missing(BPSY_AVG) or
		 missing(HTN_BIN) then delete;
		 
	**expand the smoking status codes 1, 2, and 3 into separate 0/1 indicators;
	if SMK_REC = 1 then Current_Smoker = 1;
	else Current_Smoker = 0;
	
	if SMK_REC = 2 then Former_Smoker = 1;
	else Former_Smoker = 0;
	
	if SMK_REC = 3 then Never_Smoker = 1;
	else Never_Smoker = 0;
	
	**copy the nutrient into a descriptively named variable used by all later steps;
	Potassium = TPOTA;
run;

/*
BRR weights will be added to the dataset.
*/

/*
Fay factor passed to the replicate weight generator below.
The same macro variable is reused at the end of the script when the BRR standard error is computed, so the two must stay in sync.
*/
%let fay_factor = 0.7;

/*
Add the BRR replicate weights to the analysis dataset.
The result is written back over input_dataset (outname matches input_data).
Later steps reference the weights as RepWt_0 and RepWt_1 through RepWt_&num_brr.
NOTE(review): RepWt_0 is presumably the unperturbed base weight derived from WTDRD1 - confirm against the macro documentation.
NOTE(review): PSCELL is not created anywhere in this script, so it is assumed to already exist in indata.nhcvd - confirm.
*/
%brr_weights(input_data=input_dataset,
						 id=SEQN,
						 strata=SDMVSTRA,
						 psu=SDMVPSU,
						 cell=PSCELL,
						 weight=WTDRD1,
						 fay_factor=&fay_factor.,
						 outname=input_dataset);

/*
The potassium variable can now be transformed and standardized for use in the MCMC algorithm.
*/

/*
Winsorization pass of the Box-Cox survey macro.
It is restricted to first-day recalls (Day2 = 0), uses weight RepWt_0, and identifies repeated observations per subject through SEQN and DAY.
NOTE(review): the macro is defined outside this file. Presumably this call only reports suggested Winsorization cutoffs, because the cutoff is applied by hand in the data step that follows - confirm against the macro documentation.
*/
**Winsorize extreme values of potassium intake;
%boxcox_survey(input_data=input_dataset,
							 row_subset=%quote(Day2 = 0),
							 variable=Potassium,
							 weight=RepWt_0,
							 do_winsorization=Y,
							 id=SEQN,
							 repeat_obs=DAY);
							 
data input_dataset;
	set input_dataset;
	
	**raise any intake below the lower bound up to that bound;
	**NOTE(review) 42.45263 is presumably the cutoff reported by the preceding Winsorization pass and must be updated if the input data change;
	if Potassium < 42.45263 then Potassium = 42.45263;
run;

/*
Preparation of the inputs for the MCMC measurement error model.
All three macros are defined outside this file, so the notes below describe only what this script passes in and what it reads back later.
*/

**run Box-Cox survey and create Box-Cox lambda data using the first recall;
**NOTE(review) the dataset bc_Potassium consumed by the pre-processor below is presumably written by this call;
%boxcox_survey(input_data=input_dataset,
							 row_subset=%quote(Day2 = 0),
							 variable=Potassium,
							 covariates=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR Weekend,
							 weight=RepWt_0);
							 
**Calculate minimum amount of potassium intake in the first recall;
**NOTE(review) the dataset minimum_amount_data consumed by the pre-processor below is presumably written by this call;
%calculate_minimum_amount(input_data=input_dataset,
													row_subset=%quote(Day2 = 0),
													daily_variables=Potassium);
													
**Run MCMC pre-processor;
**Later steps pass pre_mcmc_data=model to the MCMC macro and read the dataset model_mcmc_in;
**The continuous covariate RIDAGEYR is referenced as std_RIDAGEYR in the MCMC call that follows;
%nci_multivar_preprocessor(input_data=input_dataset,
													 daily_variables=Potassium,
													 continuous_covariates=RIDAGEYR,
													 boxcox_lambda_data=bc_Potassium,
													 minimum_amount_data=minimum_amount_data,
													 outname=model);

/*
The MCMC measurement error model can now be fit for all BRR replicates. 
The outcome variable is not involved in the measurement error model.

To perform regression calibration, draws of the U matrix conditional on the mean MCMC parameters must be taken. 
This is accomplished using the num_post parameter in the %nci_multivar_mcmc() macro. 
Each conditional U matrix draw will be used in place of a simulated U matrix in %nci_multivar_distrib() to simulate a usual intake for each subject. 
To ensure that enough data is generated to produce good estimates of the model parameters, 500 conditional U matrices will be drawn in order to simulate 500 usual intakes for each subject.
*/

/* number of BRR replicate weight sets; replicate 0 is used for the point estimate */
%let num_brr = 8;

/*
mcmc_brr: fit the MCMC measurement error model once for every weight set.

Parameters:
	num_brr - index of the last replicate; the loop runs from 0 through num_brr inclusive.

For each replicate r the model is fit with weight RepWt_r and stored under the name model_brr<r>.
The seed is offset by the replicate number, so every fit uses its own reproducible random stream.
num_post=500 requests the 500 conditional U matrix draws that the regression calibration step relies on.
*/
%macro mcmc_brr(num_brr=);

	/* declare the loop counter local so it cannot overwrite a same-named variable in an enclosing or global scope */
	%local brr_rep;

	%do brr_rep = 0 %to &num_brr.;
	
		%put Starting Iteration &brr_rep.;
		
		%nci_multivar_mcmc(pre_mcmc_data=model,
											 id=SEQN,
											 repeat_obs=DAY,
											 weight=RepWt_&brr_rep.,
											 daily_variables=Potassium,
											 default_covariates=Current_Smoker Former_Smoker std_RIDAGEYR RIAGENDR Day2 Weekend,
											 num_mcmc_iterations=3000,
											 num_burn=1000,
											 num_post=500,
											 num_thin=2,
											 mcmc_seed=%eval(9999 + &brr_rep.),
											 outname=model_brr&brr_rep.);
	%end;
%mend mcmc_brr;

%mcmc_brr(num_brr=&num_brr.);

/*
A dataset with simulated usual intakes for each subject can now be created using %nci_multivar_distrib(). 
This dataset represents the conditional expectation of usual intake given the observed data for each subject to be used in the regression calibration procedure. 
It is not a prediction or calculation of the true usual intake for individuals.

The population-based dataset is created the same way as for the daily nutrient vignette (daily_nutrient_analysis.sas).
*/

proc sort data=model_mcmc_in;
	by SEQN;
run;

data distrib_pop;
	set model_mcmc_in;
	by SEQN;
	
	**only the first record of each subject contributes to the population base;
	if not first.SEQN then delete;
	
	**neutralize the Day 2 recall effect;
	Day2 = 0;
	
	**write one weekday row (weight 4) and one weekend row (weight 3) per subject;
	do Weekend = 0 to 1;
		if Weekend = 0 then Weekend_Weight = 4;
		else Weekend_Weight = 3;
		output;
	end;
run;

/*
Each subject will have 500 simulated usual intakes from the 500 conditional U matrix draws taken from the output of %nci_multivar_mcmc().
In order to tell %nci_multivar_distrib() to use the conditional U matrix draws instead of simulating U matrices, the use_mcmc_u_matrices parameter must be set to Y.

The additional_output parameter is used to include the outcome variable and covariates in the distribution dataset to use in the regression.
The variables supplied in additional_output will be passed through from the population base dataset.

To perform regression calibration, the 500 simulated intakes for each subject need to be averaged. 
The mean simulated intakes are then used as a covariate in the regression models.
The effect per milligram of potassium is very small, so the potassium usual intake will be converted to grams so that the coefficient is more readable.
 
The coefficients of the models are saved for each BRR replicate.
*/

/*
summary_brr: regression calibration for the point estimate and every BRR replicate.

Parameters:
	num_brr - index of the last replicate; the loop runs from 0 through num_brr inclusive.

For each replicate r:
	1. simulate usual intakes from model_brr<r> using the conditional U matrix draws,
	2. average the simulated potassium intakes per subject and rescale them by 1/1000,
	3. fit a weighted linear model for BPSY_AVG and a weighted logistic model for HTN_BIN,
	4. stack the summarized coefficients of both models into summary_brr<r>.

After the loop, summary_brr_data holds the replicate 0 rows plus one column brr<r> per replicate (r = 1..num_brr).
The columns are combined side by side by row position, so every summary_brr<r> must list its parameters in the same order.

The intermediate datasets model_distrib_out, regression_data, bp_model, bp_parameters, htn_model and htn_parameters are overwritten on every iteration.
*/
%macro summary_brr(num_brr=);

	/* declare the loop counter local so it cannot overwrite a same-named variable in an enclosing or global scope */
	%local brr_rep;

	%do brr_rep = 0 %to &num_brr.;
	
		%put Starting Iteration &brr_rep.;
		
		/* simulate usual intakes for each subject using MCMC U matrices */
		%nci_multivar_distrib(multivar_mcmc_model=model_brr&brr_rep.,
													distrib_population=distrib_pop,
													id=SEQN,
													weight=RepWt_&brr_rep.,
													nuisance_weight=Weekend_Weight,
													additional_output=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR BPSY_AVG HTN_BIN,
													use_mcmc_u_matrices=Y,
													distrib_seed=%eval(99999 + &brr_rep.),
													outname=model_distrib_out);
													
		/* take average usual intake per subject; the weight, covariates and outcomes are listed as BY variables so they are carried into the output dataset */
		proc sort data=model_distrib_out; by SEQN RepWt_&brr_rep. Current_Smoker Former_Smoker RIDAGEYR RIAGENDR BPSY_AVG HTN_BIN; run;
		
		proc univariate data=model_distrib_out noprint;
			by SEQN RepWt_&brr_rep. Current_Smoker Former_Smoker RIDAGEYR RIAGENDR BPSY_AVG HTN_BIN;
			
			var usl_Potassium;
			
			output out=regression_data mean=usl_Potassium;
		run;
		
		/* scale down simulated potassium intake by 1000 to show the effect per 1,000 mg of potassium */
		data regression_data;
			set regression_data;
	
			usl_Potassium = usl_Potassium/1000;
		run;
		
		/* suppress printed procedure output; the ODS OUTPUT datasets are still created */
		ods select none;
		
		/* fit linear model of systolic blood pressure against the average simulated potassium intakes */
		proc surveyreg data=regression_data;
		
			model BPSY_AVG = usl_Potassium Current_Smoker Former_Smoker RIDAGEYR RIAGENDR;
			
			weight RepWt_&brr_rep.;
			
			ods output ParameterEstimates=bp_model;
		run;
		
		%summary_coef_surveyreg(parameter_estimates=bp_model,
														response=BPSY_AVG,
														outname=bp_parameters);
		
		/* fit logistic model of diagnosed hypertension against the average simulated potassium intakes */
		proc surveylogistic data=regression_data;
		
			model HTN_BIN(event='1') = usl_Potassium Current_Smoker Former_Smoker RIDAGEYR RIAGENDR;
			
			weight RepWt_&brr_rep.;
			
			ods output ParameterEstimates=htn_model;
		run;
		
		%summary_coef_surveylogistic(parameter_estimates=htn_model,
																 response=HTN_BIN,
																 outname=htn_parameters);
		
		/* restore printed output for the rest of the session */
		ods select all;
																 
		/* stack the coefficients of both models for this replicate */
		data summary_brr&brr_rep.;
			set bp_parameters
					htn_parameters;
		run;
	%end;
	
	/* extract point estimate and BRR replicates; each extra SET statement appends the value column of one replicate as brr<r> */
	data summary_brr_data;
		set summary_brr0;
		%do brr_rep = 1 %to &num_brr.;
			set summary_brr&brr_rep. (keep = value rename=(value = brr&brr_rep.));
		%end;
	run;
%mend summary_brr;

%summary_brr(num_brr=&num_brr.);

/*
The BRR standard error and confidence intervals can be calculated as usual. 
The BRR replicate weights in this dataset used a Fay factor of 0.7, so this must be accounted for in calculating the variance. 
Other datasets may use different Fay factors when generating BRR replicate weights, so it is important to verify it before calculating variances.
*/

**degrees of freedom for the confidence intervals are taken as the number of distinct strata;
proc sort data=input_dataset;
	by SDMVSTRA;
run;

data _NULL_;
	set input_dataset end=_at_end;
	by SDMVSTRA;
	
	**the counter is retained across rows and grows by one at the first row of each stratum;
	_strata_seen + first.SDMVSTRA;
	
	**publish the final count as the macro variable df;
	if _at_end then call symputx("df", _strata_seen);
run;

**build the final report with the point estimate, BRR standard error, and 95% confidence limits;
data summary_report (keep = population variable statistic value std_error confidence_lower confidence_upper);
	set summary_brr_data;
	
	array _brr_estimates{*} brr1-brr&num_brr.;
	
	**accumulate the squared deviations of the replicate estimates from the point estimate;
	_sq_dev_total = 0;
	do _rep_idx = 1 to dim(_brr_estimates);
		_sq_dev_total = _sq_dev_total + (_brr_estimates{_rep_idx} - value)**2;
	end;
	
	**the divisor carries the squared Fay factor used when the replicate weights were generated;
	std_error = sqrt(_sq_dev_total/(&num_brr.*&fay_factor.**2));
	
	**t-based limits using the degrees of freedom stored in the macro variable df;
	confidence_lower = value + tinv(0.025, &df.)*std_error;
	confidence_upper = value + tinv(0.975, &df.)*std_error;
run;

proc print data=summary_report;
run;