/*
Title: Multivariate Regression Calibration
Author: NCI/Information Management Services
Date: 3/24/2025
*/

/*
This example demonstrates multivariate regression calibration using the NCI method.
*/

libname indata "./ncimultivar/data";

%include "./ncimultivar/macros/ncimultivar.sas";

/*
The effect of the ratio of sodium (TSODI) to potassium (TPOTA) on systolic blood pressure (BPSY_AVG) will be measured from 2005-2010 NHANES data.
A subset of six strata (SDMVSTRA) will be used to reduce computation time and allow this example to run in real time.

The covariates being examined are smoking status (SMK_REC), age (RIDAGEYR), and sex (RIAGENDR). 
The two nuisance covariates are whether the recall was on a weekend (Weekend) and whether the recall is on day 2 (Day2).

Subjects with missing values are removed, and categorical variables are transformed into binary indicators.
*/

/* Assemble the analysis dataset in a single pass over the NHANES extract. */
data input_dataset;
	set indata.nhcvd;

	/* restrict to the six strata used by this example */
	if SDMVSTRA in (48, 54, 60, 66, 72, 78);

	/* 1 when the record is a Day 2 recall, 0 otherwise */
	Day2 = (DAY = 2);

	/* keep a record only when every covariate, intake variable, and outcome is present */
	if cmiss(SMK_REC, RIDAGEYR, RIAGENDR, Weekend, Day2, TSODI, TPOTA, BPSY_AVG) = 0;

	/* smoking status as 0/1 indicators, one per SMK_REC value 1, 2, 3 */
	Current_Smoker = (SMK_REC = 1);
	Former_Smoker  = (SMK_REC = 2);
	Never_Smoker   = (SMK_REC = 3);

	/* readable aliases for the sodium and potassium intake variables */
	Sodium    = TSODI;
	Potassium = TPOTA;
run;

/*
When calculating ratios, the NCI method should be applied to the individual components to generate distributions of usual intakes.
Then, the ratio of the simulated usual intakes should be taken.
It is not recommended to take the ratio of raw intakes and apply the NCI method to the ratio.

The sodium and potassium variables can now be transformed and standardized for use in the MCMC algorithm.
*/

/* Run the Box-Cox survey with Winsorization enabled on the first-recall
   (Day2 = 0) sodium values, identifying subjects by SEQN and recalls by DAY. */
%boxcox_survey(input_data=input_dataset,
	variable=Sodium,
	row_subset=%quote(Day2 = 0),
	id=SEQN,
	repeat_obs=DAY,
	weight=WTDRD1,
	do_winsorization=Y);

/* Same screening for potassium. */
%boxcox_survey(input_data=input_dataset,
	variable=Potassium,
	row_subset=%quote(Day2 = 0),
	id=SEQN,
	repeat_obs=DAY,
	weight=WTDRD1,
	do_winsorization=Y);

/* Raise any intake below the hard-coded floor up to that floor.
   NOTE(review): the floor values are presumably taken from the Winsorization
   output of the two calls above - confirm if the input data changes. */
data input_dataset;
	set input_dataset;

	if Sodium < 113.1129 then Sodium = 113.1129;

	if Potassium < 42.45263 then Potassium = 42.45263;
run;

/* Box-Cox survey on the first recall (Day2 = 0) for each intake variable,
   adjusting for the smoking, age, sex, and weekend covariates. */
%boxcox_survey(input_data=input_dataset,
	variable=Sodium,
	row_subset=%quote(Day2 = 0),
	weight=WTDRD1,
	covariates=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR Weekend);

%boxcox_survey(input_data=input_dataset,
	variable=Potassium,
	row_subset=%quote(Day2 = 0),
	weight=WTDRD1,
	covariates=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR Weekend);

/* Stack the sodium and potassium results into one lambda dataset.
   NOTE(review): bc_Sodium and bc_Potassium are presumably the datasets written
   by the two macro calls above - confirm against the macro documentation. */
data boxcox_lambda_data;
	set bc_Sodium bc_Potassium;
run;

/* Minimum amounts of sodium and potassium intake, taken from the first recall (Day2 = 0) */
%calculate_minimum_amount(input_data=input_dataset,
	daily_variables=Sodium Potassium,
	row_subset=%quote(Day2 = 0));

/* MCMC preprocessor for the two intake variables, with age as the continuous covariate.
   NOTE(review): minimum_amount_data is presumably the dataset written by the
   call above - confirm against the macro documentation. */
%nci_multivar_preprocessor(input_data=input_dataset,
	daily_variables=Sodium Potassium,
	continuous_covariates=RIDAGEYR,
	minimum_amount_data=minimum_amount_data,
	boxcox_lambda_data=boxcox_lambda_data,
	outname=model);
													 
/*
The MCMC measurement error model can now be fit.

As with univariate regression calibration, 500 conditional post-MCMC U matrix draws will be made.
The number of MCMC and burn-in iterations is increased because this is a bivariate model.
*/

/* Fit the bivariate MCMC measurement error model for sodium and potassium:
   4000 iterations, 2000 burn-in, thinning of 2, 500 post-MCMC draws, fixed seed.
   NOTE(review): std_RIDAGEYR is presumably the standardized age variable
   created by the preprocessor step - confirm. */
%nci_multivar_mcmc(pre_mcmc_data=model,
	outname=model,
	id=SEQN,
	repeat_obs=DAY,
	weight=WTDRD1,
	daily_variables=Sodium Potassium,
	default_covariates=Current_Smoker Former_Smoker std_RIDAGEYR RIAGENDR Day2 Weekend,
	num_mcmc_iterations=4000,
	num_burn=2000,
	num_thin=2,
	num_post=500,
	mcmc_seed=9999);
									 
/*
A dataset with simulated usual intakes of sodium and potassium for each subject can now be created. 
This dataset represents the conditional expectation of usual intake given the observed data for each subject to be used in the regression calibration procedure. 
It is not a prediction or calculation of the true usual intake for individuals.
*/

/* Order the MCMC input data by subject so that first.SEQN can be used below. */
proc sort data=model_mcmc_in;
	by SEQN;
run;

/* Build the population for the distribution step: two rows per subject,
   one for a weekday (weight 4) and one for a weekend day (weight 3). */
data distrib_pop;
	set model_mcmc_in;
	by SEQN;

	/* only the first record of each subject contributes rows */
	if first.SEQN;

	/* zero out Day2 to factor out the effect of Day 2 recalls */
	Day2 = 0;

	do Weekend = 0 to 1;
		if Weekend = 0 then Weekend_Weight = 4;
		else Weekend_Weight = 3;
		output;
	end;
run;

/* Simulate usual intakes for the weekday/weekend population using the MCMC
   U matrices, weighting the nuisance rows by Weekend_Weight and carrying the
   regression covariates and the blood pressure outcome into the output. */
%nci_multivar_distrib(multivar_mcmc_model=model,
	distrib_population=distrib_pop,
	outname=model_distrib_out,
	id=SEQN,
	weight=WTDRD1,
	nuisance_weight=Weekend_Weight,
	additional_output=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR BPSY_AVG,
	use_mcmc_u_matrices=Y,
	distrib_seed=99999);
											
/*
The sodium/potassium ratio can now be calculated from the simulated usual intakes of sodium and potassium.

Then the sodium/potassium ratio is averaged for each subject as with a univariate distribution.
The averaged sodium/potassium ratio is included as a covariate in the model for systolic blood pressure.
*/

/* Sodium/potassium ratio for every simulated usual intake row */
data model_distrib_out;
	set model_distrib_out;
	Sodium_Potassium_Ratio = usl_Sodium/usl_Potassium;
run;

/* Sort by subject plus the variables that must be carried into the
   regression dataset, so they can serve as BY variables below. */
proc sort data=model_distrib_out;
	by SEQN WTDRD1 Current_Smoker Former_Smoker RIDAGEYR RIAGENDR BPSY_AVG;
run;

/* One row per BY group holding the mean ratio, stored under the same variable name */
proc univariate data=model_distrib_out noprint;
	var Sodium_Potassium_Ratio;
	by SEQN WTDRD1 Current_Smoker Former_Smoker RIDAGEYR RIAGENDR BPSY_AVG;
	output out=regression_data
	       mean=Sodium_Potassium_Ratio;
run;

/* Suppress displayed output while the model is fit; the parameter estimates
   are still captured through ODS OUTPUT. */
ods select none;

/* Weighted linear model of systolic blood pressure on the averaged
   sodium/potassium ratio and the covariates */
proc surveyreg data=regression_data;
	weight WTDRD1;
	model BPSY_AVG = Sodium_Potassium_Ratio
	                 Current_Smoker Former_Smoker
	                 RIDAGEYR RIAGENDR;
	ods output ParameterEstimates=bp_model;
run;

/* Turn displayed output back on */
ods select all;

/* Summarize the fitted model parameters for the BPSY_AVG response into bp_parameters */
%summary_coef_surveyreg(parameter_estimates=bp_model,
	response=BPSY_AVG,
	outname=bp_parameters);

/* Print the parameter summary under a descriptive title */
proc print data=bp_parameters;
	title "Systolic Blood Pressure vs. Sodium/Potassium Ratio";
run;

/* TITLE is a global statement that stays in effect until cancelled, so clear it
   here to keep it from appearing on output produced later in the same session. */
title;

