/*
Title: Multivariate Distribution
Author: NCI/Information Management Services
Date: 3/24/2025
*/

/*
This example demonstrates creating a multivariate distribution using the NCI method.
*/

libname indata "./ncimultivar/data";

%include "./ncimultivar/macros/ncimultivar.sas";

/*
The NCI method can be used to model multiple variables at once, including both episodically consumed foods (referred to as foods) and daily consumed nutrients (referred to as nutrients). 
Some steps in the workflow must take into account non-consumption days for episodically consumed foods.

For this example, a bivariate distribution will be generated for whole grains (G_WHOLE) and energy (TKCAL) from 2005-2010 NHANES data.
Whole grains will be modeled as an episodically consumed food and energy will be modeled as a daily consumed nutrient.
A subset of six strata (SDMVSTRA) will be used to reduce computation time and allow this example to run in real time.

The covariates being examined are smoking status (SMK_REC), age (RIDAGEYR), and sex (RIAGENDR). 
Two nuisance covariates will be factored in as well: whether the recall was on a weekend (Weekend) and whether the recall was on day 2 (Day2).

The WTDRD1 variable is the base weighting for each observation.

Subjects with missing values are removed, and categorical variables are transformed into binary indicators.
*/

/*
Build the working dataset from indata.nhcvd:
  - keep only rows whose SDMVSTRA is one of the six listed strata
  - add Day2, a 0/1 flag that is 1 when DAY equals 2
*/
data input_dataset;
	set indata.nhcvd;
	where SDMVSTRA in (48, 54, 60, 66, 72, 78);

	/* the comparison evaluates to 1 (true) or 0 (false) */
	Day2 = (DAY eq 2);
run;

/*
Clean input_dataset in place:
  - drop every row that has a missing value in any covariate or intake variable
  - expand SMK_REC into three 0/1 indicator variables
  - copy G_WHOLE and TKCAL into readable names (the original columns are kept)
*/
data input_dataset;
	set input_dataset;

	/* discard the row as soon as any required field is missing */
	if missing(SMK_REC)
		or missing(RIDAGEYR)
		or missing(RIAGENDR)
		or missing(Weekend)
		or missing(Day2)
		or missing(G_WHOLE)
		or missing(TKCAL)
	then delete;

	/* smoking status indicators: 1 when SMK_REC has the given code, else 0 */
	Current_Smoker = (SMK_REC eq 1);
	Former_Smoker  = (SMK_REC eq 2);
	Never_Smoker   = (SMK_REC eq 3);

	/* readable aliases for the whole grain and energy variables */
	Whole_Grain = G_WHOLE;
	Energy      = TKCAL;
run;

/*
Next, extreme observations will be identified for Winsorization. 
The %boxcox_survey() macro must be run for both variables. 
The non-zero values of the first recall will be used. 

Since whole grain is an episodic variable, Winsorization for low outliers is done slightly differently. 
Instead of changing values that are too small to a minimum threshold, they are set to zero and treated as a non-consumption observation. 
This behavior is toggled by setting the is_episodic parameter to Y.
*/

/*
Winsorization run of %boxcox_survey for Whole_Grain.
Only rows with Day2 = 0 are passed in via row_subset.
is_episodic=Y marks Whole_Grain as episodically consumed.
Keyword arguments are order-independent; they are grouped here as
data selection, variable, Winsorization settings, then identifiers.
*/
%boxcox_survey(
	input_data=input_dataset,
	row_subset=%quote(Day2 = 0),
	weight=WTDRD1,
	variable=Whole_Grain,
	is_episodic=Y,
	do_winsorization=Y,
	iqr_multiple=2,
	id=SEQN,
	repeat_obs=DAY
);
							 
/*
Winsorization run of %boxcox_survey for Energy.
Only rows with Day2 = 0 are passed in via row_subset.
is_episodic is not supplied, so the macro default applies.
*/
%boxcox_survey(
	input_data=input_dataset,
	row_subset=%quote(Day2 = 0),
	weight=WTDRD1,
	variable=Energy,
	do_winsorization=Y,
	iqr_multiple=2,
	id=SEQN,
	repeat_obs=DAY
);
							 
/*
Apply the Winsorization cut-offs to input_dataset in place.
NOTE(review): the numeric thresholds are hard-coded; presumably they were
copied from the output of the Winsorization runs of %boxcox_survey - confirm
and update them if the input data or iqr_multiple changes.
*/
data input_dataset;
	set input_dataset;

	/* whole grain: cap at the upper threshold only */
	Whole_Grain = min(Whole_Grain, 10.71163);

	/* energy: clamp into the range [269.0701, 8026.0436] */
	Energy = min(max(Energy, 269.0701), 8026.0436);
run;

/*
Next, the best Box-Cox lambda parameter for each variable can be found using the Winsorized data in the presence of covariates. 
A Box-Cox survey must be run for each variable used in the model.
*/

/*
Box-Cox survey for Whole_Grain on the Winsorized data, with covariates.
Only rows with Day2 = 0 are passed in via row_subset.
NOTE(review): the bc_Whole_Grain dataset read later in this script is
presumably written by this call - confirm against the macro.
*/
%boxcox_survey(
	input_data=input_dataset,
	row_subset=%quote(Day2 = 0),
	weight=WTDRD1,
	variable=Whole_Grain,
	covariates=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR Weekend
);
							 
/*
Box-Cox survey for Energy on the Winsorized data, with covariates.
Only rows with Day2 = 0 are passed in via row_subset.
NOTE(review): the bc_Energy dataset read later in this script is
presumably written by this call - confirm against the macro.
*/
%boxcox_survey(
	input_data=input_dataset,
	row_subset=%quote(Day2 = 0),
	weight=WTDRD1,
	variable=Energy,
	covariates=Current_Smoker Former_Smoker RIDAGEYR RIAGENDR Weekend
);
							 
/*
Stack the per-variable Box-Cox results into one dataset:
all rows of bc_Whole_Grain followed by all rows of bc_Energy.
*/
data boxcox_lambda_data;
	set bc_Whole_Grain bc_Energy;
run;

/*
Next, the minimum amounts for whole grain and energy can be calculated using the first recall. 

The %calculate_minimum_amount() macro can handle multiple variables at the same time.
Episodically consumed foods are specified with episodic_variables while daily consumed nutrients are specified with daily_variables.
If there are multiple daily and/or episodic variables, they should be specified as space-delimited lists in their respective parameters.
*/

/*
Minimum amount calculation for both variables in one call.
Only rows with Day2 = 0 are passed in via row_subset.
Whole_Grain is listed as episodic and Energy as daily.
NOTE(review): the minimum_amount_data dataset used later in this script is
presumably the output of this call - confirm against the macro.
*/
%calculate_minimum_amount(
	input_data=input_dataset,
	row_subset=%quote(Day2 = 0),
	daily_variables=Energy,
	episodic_variables=Whole_Grain
);
													
/*
The %nci_multivar_preprocessor() macro can now generate the MCMC input data. 
Whole grains should be input in the episodic_variables parameter to tell the macro to generate a consumption indicator variable in addition to an amount variable.

The episodic_variables and daily_variables parameters are space-delimited lists of episodically consumed foods and daily consumed nutrients, respectively.
*/

/*
Build the MCMC input data from input_dataset.
Inputs: the stacked Box-Cox results (boxcox_lambda_data) and minimum_amount_data.
RIDAGEYR is passed as a continuous covariate.
NOTE(review): later steps refer to std_RIDAGEYR and model_mcmc_in; these are
presumably produced by this call from continuous_covariates and
outname=model - confirm against the macro.
*/
%nci_multivar_preprocessor(
	input_data=input_dataset,
	daily_variables=Energy,
	episodic_variables=Whole_Grain,
	continuous_covariates=RIDAGEYR,
	minimum_amount_data=minimum_amount_data,
	boxcox_lambda_data=boxcox_lambda_data,
	outname=model
);
													 
/*
The MCMC model can now be fit using both whole grains and energy.

Episodic and daily variables are specified in episodic_variables and daily_variables as in the previous steps.

Note that the number of iterations and the burn-in are both higher than for a single daily consumed nutrient.
This is because models with episodic variables and multivariate models take more iterations to converge.
*/

/*
Fit the MCMC model for Whole_Grain (episodic) and Energy (daily).
Settings: 4000 iterations, 2000 burn-in, thinning of 2, seed 9999.
The covariate list includes std_RIDAGEYR rather than RIDAGEYR, plus the
Day2 and Weekend nuisance covariates.
*/
%nci_multivar_mcmc(
	pre_mcmc_data=model,
	outname=model,
	id=SEQN,
	repeat_obs=DAY,
	weight=WTDRD1,
	daily_variables=Energy,
	episodic_variables=Whole_Grain,
	default_covariates=Current_Smoker Former_Smoker std_RIDAGEYR RIAGENDR Day2 Weekend,
	num_mcmc_iterations=4000,
	num_burn=2000,
	num_thin=2,
	mcmc_seed=9999
);
									 
/*
The %nci_multivar_distrib() macro will be used to simulate a dataset of usual intakes to be used to represent the distribution of true usual intakes. 
This procedure does not calculate or predict true usual intakes for each subject.

A population-based dataset must be constructed as with the univariate distribution.
*/

/*
Build the population dataset for the distribution step.
model_mcmc_in is sorted in place by SEQN so that BY-group processing works.
For each SEQN only the first row is used, and it is written out twice:
  - Weekend = 0 with Weekend_Weight = 4
  - Weekend = 1 with Weekend_Weight = 3
Day2 is forced to 0 on both copies to factor out the Day 2 recall effect.
*/
proc sort data=model_mcmc_in;
	by SEQN;
run;

data distrib_pop;
	set model_mcmc_in;
	by SEQN;

	/* only the first row of each subject contributes */
	if not first.SEQN then delete;

	Day2 = 0;

	/* one output row per Weekend value, weekday copy first */
	do Weekend = 0, 1;
		if Weekend = 0 then Weekend_Weight = 4;
		else Weekend_Weight = 3;
		output;
	end;
run;

/*
The %nci_multivar_distrib() macro can now be used to simulate 200 usual intakes for each subject.
*/

/*
Simulate usual intakes from the fitted model for the rows of distrib_pop.
Settings: 200 simulated values per subject, seed 99999.
Weekend_Weight is supplied as the nuisance weight.
Results go to model_distrib_out.
*/
%nci_multivar_distrib(
	multivar_mcmc_model=model,
	distrib_population=distrib_pop,
	outname=model_distrib_out,
	id=SEQN,
	weight=WTDRD1,
	nuisance_weight=Weekend_Weight,
	num_simulated_u=200,
	distrib_seed=99999
);
											
/*
The distribution dataset contains simulated usual intakes for both whole grains and energy.

Summary statistics can now be calculated for both variables.
Multiple variables can be summarized at the same time by specifying them as a space-delimited list in the variables parameter.
*/

/*
Summarize usl_Whole_Grain and usl_Energy from model_distrib_out:
means plus the 5th, 25th, 50th, 75th and 95th percentiles, weighted by WTDRD1.
Results go to summary_stats.
*/
%nci_multivar_summary(
	input_data=model_distrib_out,
	outname=summary_stats,
	variables=usl_Whole_Grain usl_Energy,
	weight=WTDRD1,
	do_means=Y,
	do_quantiles=Y,
	quantiles=5 25 50 75 95
);
											
/*
Print the summary statistics table.
TITLE is a global statement: once set it stays in effect for every later
step in the session until it is replaced or cancelled. The title is
therefore cleared right after the report so it does not leak into any
output produced afterwards.
*/
proc print data=summary_stats;

	title "Usual Intake Distributions of Whole Grain and Energy";
run;

/* cancel the title set for the report above */
title;

