/*
Significance Testing for MCMC Covariates

Description:

	Tests the significance of covariates in an MCMC model using replication techniques.
	
Parameters:

	- pre_mcmc_lib: The SAS library that contains the pre-MCMC datasets. (default = WORK)
	- pre_mcmc_data: Prefix for the pre-MCMC datasets (max 23 characters). The following datasets with the prefix must be present:
									 	 - _mcmc_in: A SAS data set with indicators, standardized amounts, and covariates for the MCMC model.
									 	 - _backtran: A SAS data set with the following columns:
									 	 							  - variable: The name of the variable.
									 	 							  - tran_lambda: The value of the lambda used to transform the variable.
									 	 							  - minamount: Half of the minimum non-zero value of the variable.
									 	 							  - tran_center: The mean of the Box-Cox transformed variable before standardization.
									 	 							  - tran_scale: The standard deviation of the Box-Cox transformed variable before standardization divided by sqrt(2).
									 	 							  - biomarker: Binary flag indicating whether the variable is a biomarker assumed to be unbiased on the transformed scale.
	- id: Variable that identifies each subject.
	- repeat_obs: Variable that distinguishes repeat observations for each subject.
	- base_weight: Variable with weighting for each subject.
	- episodic_variables: Space-delimited list of episodic variables.
	- episodic_indicators: Space-delimited list of consumption indicator variables for episodic variables. 
												 Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- episodic_amounts: Space-delimited list of consumption amount variables for episodic variables.
											Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- daily_variables: Space-delimited list of daily variables.
	- daily_amounts: Space-delimited list of consumption amount variables for daily variables.
									 Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- default_covariates: Space-delimited list of covariates to be used for episodic indicators, episodic amounts, and daily amounts. 
												Does not affect the never-consumer model if present.
	- episodic_indicator_covariates: Space-delimited list of covariates to be used for episodic indicators. 
																	 If specified, overwrites default_covariates for episodic indicators.
	- episodic_amount_covariates: Space-delimited list of covariates to be used for episodic amounts. 
																If specified, overwrites default_covariates for episodic amounts.
	- daily_amount_covariates: Space-delimited list of covariates to be used for daily amounts. 
														 If specified, overwrites default_covariates for daily amounts.
	- individual_covariates: A SAS data set of covariate lists for individual indicators and amounts.
													 It must contain the following columns:
													 	 - variable: Name of the indicator or amount.
													 	 - covariates: A space-delimited string of covariates for the indicator or amount.
													 Overrides default_covariates, episodic_indicator_covariates, episodic_amount_covariates, and daily_amount_covariates for each named variable.
	- default_intercept: Flag to include an intercept in the models for episodic indicators, episodic amounts, and daily amounts. (Y/N, default = Y)
											 Does not affect the never-consumer model if present.
	- episodic_indicator_intercept: Flag to include an intercept in the model for episodic indicators. (Y/N)
																	If specified, overwrites default_intercept for episodic indicators.
	- episodic_amount_intercept: Flag to include an intercept in the model for episodic amounts. (Y/N)
															 If specified, overwrites default_intercept for episodic amounts.
	- daily_amount_intercept: Flag to include an intercept in the model for daily amounts. (Y/N)
														If specified, overwrites default_intercept for daily amounts.
	- individual_intercept: A SAS data set of intercepts for individual indicators and amounts.
													It must contain the following columns:
														- variable: Name of the indicator or amount.
														- intercept: Flag to include an intercept in the model for the indicator or amount (Y/N).
	- never_consumer_variable: One episodic variable to allow never-consumers. 
														 Can be a variable already listed as episodic or a new variable.
	- never_consumer_indicator: Consumption indicator variable for the never-consumer variable.
															Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- never_consumer_amount: Consumption amount variable for the never-consumer variable.
													 Not needed if pre_mcmc_data was created by nci_multivar_preprocessor.
	- never_consumer_covariates: Space-delimited list of covariates to be used for the never-consumer model. 
															 Not affected by default_covariates and must be specified separately.
	- never_consumer_intercept: Flag to include intercept in the never-consumer model.
															Not affected by default_intercept and must be specified separately. (Y/N, default = Y)
	- initial_mcmc_seed: Numeric starting seed for the random number generator. 
											 If specified, the seed will be incremented for each MCMC replicate. 
											 If blank, uses a randomly generated integer from -10^7 to 10^7 (exclusive) for each replicate. (default = blank)
	- num_mcmc_iterations: Integer specifying the total number (including burn-in) of iterations in the MCMC chain. (default = 12000)
	- num_burn: Integer specifying the number of burn-in iterations in the MCMC chain. Must be smaller than num_mcmc_iterations. (default = 2000)
	- num_thin: Integer specifying the number of iterations between MCMC samples used for calculating posterior means. (default = 25)
	- sigma_u_prior: A SAS data set specifying a prior covariance matrix for the random effects (u). 
									 Must be a valid covariance matrix with dimensions equal to the total number of episodic indicators, episodic amounts, and daily amounts.
	- sigma_u_constant: Flag specifying if the covariance matrix of the random effects (u) should remain the same for each iteration. (Y/N, default = N)
	- rep_weights: Space-delimited list of variables with the replicate weights for each subject.
								 Can be specified using DATA step variable list syntax.
	- brr: Flag specifying whether replicate weights are for balanced repeated replication (BRR). (Y/N, default = N)
	- fay_factor: Numeric between 0 and 1 specifying the Fay factor used to generate the BRR weights used in the function.
								Applies only when brr is Y. (default = 1)
	- df: Integer specifying the number of degrees of freedom for Hotelling's T^2 test.
	- test_covariates: Space-delimited list of covariates to be tested.
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	The following SAS data set is created in outlib:
	
		outname: A SAS data set that contains the joint p-value for all of the covariates in test_covariates.
	
Details:

	The variance/covariance matrix of the beta parameters in the MCMC
	model is found by fitting MCMC model replicates with bootstrap or balanced
	repeated replication (BRR). The original weighting variable for the dataset
	is given in base_weight and the replicate weighting variables are given
	in rep_weights. For survey datasets that BRR is applicable to, BRR
	weights can be generated by brr_weights. When using BRR, the value of
	fay_factor must match the Fay factor used to generate the weights.

	The covariates to test are specified in test_covariates. Elements of
	test_covariates that are not present in the model are ignored.
	The test_covariates can include covariates from the episodic indicator,
	episodic amount, and daily amount parts of the model. This function does
	not test never-consumer covariates.

	The df parameter is the number of degrees of freedom to use for the
	Hotelling's T-squared test. For survey datasets, this is the total number
	of PSUs across all strata minus the number of strata.
*/
%macro covariate_test(pre_mcmc_lib=WORK,
											pre_mcmc_data=,
											id=,
											repeat_obs=,
											base_weight=,
											episodic_variables=,
											episodic_indicators=,
											episodic_amounts=,
											daily_variables=,
											daily_amounts=,
											default_covariates=,
											episodic_indicator_covariates=,
											episodic_amount_covariates=,
											daily_amount_covariates=,
											individual_covariates=,
											default_intercept=Y,
											episodic_indicator_intercept=,
											episodic_amount_intercept=,
											daily_amount_intercept=,
											individual_intercept=,
											never_consumer_variable=,
											never_consumer_indicator=,
											never_consumer_amount=,
											never_consumer_covariates=,
											never_consumer_intercept=Y,
											initial_mcmc_seed=,
											num_mcmc_iterations=12000,
											num_burn=2000,
											num_thin=25,
											sigma_u_prior=,
											sigma_u_constant=N,
											rep_weights=,
											brr=N,
											fay_factor=1,
											df=,
											test_covariates=,
											outlib=WORK,
											outname=);

	/*
	Workflow:
		1. Expand rep_weights into an explicit list of variable names.
		2. Fit the MCMC model once with base_weight (rep = 0) and once per replicate weight.
		3. On the base fit, pick the beta parameters whose covariate is named in test_covariates.
		4. Collect the selected beta means from every fit into _beta_reps
			 (one row per selected beta; columns point_estimate, rep1, rep2, ...).
		5. Estimate the covariance of the selected betas from the replicates and run a
			 Hotelling T^2 test in PROC IML, writing the p-value to outlib.outname.

	Work data sets written by this macro: _repweights, _repweight_vars, _selected, _beta_reps
	(plus whatever nci_multivar_mcmc and extract_parameters write under the _rep prefix).
	*/

	/* Every macro variable assigned below is declared local so that nothing leaks into,
		 or overwrites, the symbol table of the caller. */
	%local num_replicates rep weight
				 max_num_covariates num_test test test_covariate
				 beta_selected num_selected mcmc_seed;

	/* Fail early with a readable message instead of a syntax error from generated code */
	%if %length(&rep_weights.) = 0 %then %do;
		%put ERROR: [covariate_test] rep_weights must name at least one replicate weight variable.;
		%return;
	%end;

	%if %length(&test_covariates.) = 0 %then %do;
		%put ERROR: [covariate_test] test_covariates must name at least one covariate to test.;
		%return;
	%end;

	%if %length(&df.) = 0 %then %do;
		%put ERROR: [covariate_test] df (degrees of freedom for the Hotelling T-squared test) must be specified.;
		%return;
	%end;

	%let mcmc_seed = &initial_mcmc_seed.;
	%let max_num_covariates = 0;
	%let beta_selected = ;

	/* Expand list of replicate weights. rep_weights may use DATA step variable list syntax,
		 so it is resolved by letting the DATA step apply it as a KEEP= list.
		 Only the variable names are needed, hence obs = 0. */
	data _repweights;
		set &pre_mcmc_lib..&pre_mcmc_data._mcmc_in (keep = &rep_weights. obs = 0);
	run;

	/* NOTE(review): the OUT= data set of PROC CONTENTS is presumably ordered by NAME, so
		 replicates are processed in that order rather than in the order given in rep_weights.
		 The covariance estimate sums over replicates, so the order only affects which seed
		 is paired with which replicate. */
	proc contents data=_repweights out=_repweight_vars noprint; run;

	data _NULL_;
		set _repweight_vars end=_last_obs;

		length rep_weights $32767;

		retain rep_weights;

		if _N_ = 1 then rep_weights = "";

		rep_weights = catx(" ", rep_weights, NAME);

		if _last_obs = 1 then call symputx("rep_weights", rep_weights);
	run;

	/* Run MCMC with base weights and all replicate weights and extract chosen beta parameters.
		 rep = 0 is the base-weight (point estimate) fit; rep = 1..num_replicates are the replicates. */
	%let num_replicates = %sysfunc(countw(&rep_weights., %str( )));

	%do rep = 0 %to &num_replicates.;

		%if &rep. = 0 %then %do;

			%let weight = &base_weight.;
		%end;
		%else %do;

			%let weight = %scan(&rep_weights., &rep., %str( ));
		%end;

		/* Run MCMC */
		%nci_multivar_mcmc(pre_mcmc_lib=&pre_mcmc_lib.,
											 pre_mcmc_data=&pre_mcmc_data.,
											 id=&id.,
											 repeat_obs=&repeat_obs.,
											 weight=&weight.,
											 episodic_variables=&episodic_variables.,
											 episodic_indicators=&episodic_indicators.,
											 episodic_amounts=&episodic_amounts.,
											 daily_variables=&daily_variables.,
											 daily_amounts=&daily_amounts.,
											 default_covariates=&default_covariates.,
											 episodic_indicator_covariates=&episodic_indicator_covariates.,
											 episodic_amount_covariates=&episodic_amount_covariates.,
											 daily_amount_covariates=&daily_amount_covariates.,
											 individual_covariates=&individual_covariates.,
											 default_intercept=&default_intercept.,
											 episodic_indicator_intercept=&episodic_indicator_intercept.,
											 episodic_amount_intercept=&episodic_amount_intercept.,
											 daily_amount_intercept=&daily_amount_intercept.,
											 individual_intercept=&individual_intercept.,
											 never_consumer_variable=&never_consumer_variable.,
											 never_consumer_indicator=&never_consumer_indicator.,
											 never_consumer_amount=&never_consumer_amount.,
											 never_consumer_covariates=&never_consumer_covariates.,
											 never_consumer_intercept=&never_consumer_intercept.,
											 mcmc_seed=&mcmc_seed.,
											 num_mcmc_iterations=&num_mcmc_iterations.,
											 num_burn=&num_burn.,
											 num_thin=&num_thin.,
											 sigma_u_prior=&sigma_u_prior.,
											 sigma_u_constant=&sigma_u_constant.,
											 outname=_rep);

		/* The set of betas to test is determined once, from the base-weight fit, and then
			 reused unchanged for every replicate so that all columns of _beta_reps line up. */
		%if &rep. = 0 %then %do;

			/* Widest covariate list across the rows of _rep_covars; sizes the array below */
			data _NULL_;
				set _rep_covars end=_last_obs;

				retain max_num_covariates;

				if _N_ = 1 then max_num_covariates = 0;

				if num_covariates > max_num_covariates then max_num_covariates = num_covariates;

				if _last_obs = 1 then call symputx("max_num_covariates", max_num_covariates);
			run;

			%if &max_num_covariates. < 1 %then %do;
				%put ERROR: [covariate_test] The fitted model contains no covariates, so there is nothing to test.;
				%return;
			%end;

			%let num_test = %sysfunc(countw(&test_covariates., %str( )));

			/* Select beta parameters for covariates to test.
				 The beta for covariate i on row _N_ of _rep_covars is named beta<_N_>_covariate<i>. */
			data _NULL_;
				set _rep_covars end=_last_obs;

				length beta_name $32;
				length beta_selected $32767;

				retain beta_selected;

				array covariates{&max_num_covariates.} covariate1-covariate&max_num_covariates.;

				if _N_ = 1 then beta_selected = "";

				do i = 1 to num_covariates;

					beta_name = cat("beta", _N_, "_covariate", i);

					/* Flag the match instead of appending inside the generated comparisons, so a
						 covariate listed more than once in test_covariates selects its beta only once.
						 A duplicated beta would make the covariance matrix singular. */
					_matched = 0;

					%do test = 1 %to &num_test.;

						%let test_covariate = %scan(&test_covariates., &test., %str( ));
						if compare(covariates{i}, "&test_covariate.", "i") = 0 then _matched = 1;
					%end;

					if _matched = 1 then beta_selected = catx(" ", beta_selected, beta_name);
				end;

				if _last_obs = 1 then call symputx("beta_selected", beta_selected);
			run;

			%if %length(&beta_selected.) = 0 %then %do;
				%put ERROR: [covariate_test] None of the test_covariates (&test_covariates.) are present in the model.;
				%return;
			%end;

			%let num_selected = %sysfunc(countw(&beta_selected., %str( )));

			/* The F reference distribution below uses df - num_selected denominator degrees of freedom */
			%if %sysevalf(&df. <= &num_selected.) %then %do;
				%put ERROR: [covariate_test] df (&df.) must be greater than the number of tested beta parameters (&num_selected.).;
				%return;
			%end;

			%if &num_replicates. < &num_selected. %then %do;
				%put WARNING: [covariate_test] Fewer replicates (&num_replicates.) than tested beta parameters (&num_selected.) - the covariance matrix will be singular.;
			%end;
		%end;

		/* Extract selected beta parameter means */
		%extract_parameters(multivar_mcmc_model=_rep,
												outname=_rep);

		/* Reshape the selected betas from one wide row into one row per beta */
		data _selected (keep = value);
			set _rep_means (keep = &beta_selected.);

			array beta{&num_selected.} &beta_selected.;

			do i = 1 to &num_selected.;

				value = beta{i};
				output;
			end;
		run;

		%if &rep. = 0 %then %do;

			data _beta_reps;
				set _selected (rename=(value=point_estimate));
			run;
		%end;
		%else %do;

			/* Two SET statements read the data sets in parallel (row i with row i),
				 which appends this replicate as a new column */
			data _beta_reps;
				set _beta_reps;
				set _selected (rename=(value=rep&rep.));
			run;
		%end;

		/* Increment seed if it is specified; a blank seed is passed through unchanged */
		%if %length(&mcmc_seed.) > 0 %then %do;

			%let mcmc_seed = %eval(&mcmc_seed. + 1);
		%end;
	%end;

	proc iml;

		/* point_estimate: num_selected x 1; replicates: num_selected x num_replicates */
		use _beta_reps;

			read all var "point_estimate" into point_estimate;
			read all var (cat("rep", 1:&num_replicates.)) into replicates;
		close _beta_reps;

		/* Calculate variance-covariance matrix of beta estimates.
			 The Fay factor only applies to BRR weights; otherwise it is fixed at 1. */
		if upcase("&brr.") = "N" then fay_factor = 1;
		else fay_factor = &fay_factor.;

		num_replicates = &num_replicates.;

		beta_covariance = ((replicates - point_estimate) * (replicates - point_estimate)`)/(num_replicates * fay_factor**2);

		/* Hotelling T^2 test of the joint hypothesis that all selected betas are zero */
		t2 = point_estimate` * inv(beta_covariance) * point_estimate;

		N = &df.;
		p = &num_selected.;

		F_statistic = t2*(N-p)/(p*(N-1));
		p_value = 1 - probf(F_statistic, p, N-p);

		create &outlib..&outname. from p_value[colname={"p"}];
			append from p_value;
		close &outlib..&outname.;
	quit;
%mend covariate_test;