%include "&sysincludefiledir./internal/boxcox_functions.sas";

/*
Select the best Box-Cox lambda parameter for a variable

Description:

	Searches a specified grid of lambda values for a Box-Cox
	transformation that is most similar to a normal distribution. This function
	can also produce supplemental reports:

		- Suggested Winsorization: a report of outlier observations and suggested Winsorized values
		- Influential subjects: a report of subjects that have extreme variance among repeated observations

Parameters:

	- input_data: A SAS data set.
	- row_subset: A conditional expression wrapped in %quote() indicating which rows of input_data to use for selecting the best lambda.
	- variable: Variable to transform (max 28 characters).
	- id: Variable that identifies each subject.
	- repeat_obs: Variable that distinguishes repeat observations for each subject.
	- lambda_start: Minimum lambda value in the search grid. (default = 0)
	- lambda_increment: Spacing between lambda values in the search grid. (default = 0.01)
	- num_lambdas: Number of lambda values in the search grid. (default = 101)
	- covariates: Space-delimited list of names of covariates used to select the best lambda.
	- weight: Variable with weighting for each subject.
	- do_winsorization: Generate suggested Winsorization report? (Y/N, default = N)
	- print_winsorization: Print suggested Winsorization report? (Y/N, default = Y)
	- is_episodic: Is the variable episodic? Episodic variables have a substantial number of zero observations due to not being continuously observed. 
								 Required only for suggested Winsorization report. (Y/N, default = N)
	- iqr_multiple: Multiple of the interquartile range of the Box-Cox transformed variable. 
									This sets the distance away that an observation must be from the 25th or 75th percentiles to be considered an outlier. 
									Has no effect if the suggested Winsorization report is not generated. (default = 3)
	- do_influential: Generate influential subject report? (Y/N, default = N)
	- print_influential: Print influential subject report? (Y/N, default = Y)
	- influential_alpha: The F-test p-value threshold that a subject must be under to be considered influential. 
											 Has no effect if the influential subject report is not generated. 
											 See the Influential Subjects section for details. (default = 0.000002342729)
	- multiple_test: The type of multiple testing correction to use to adjust influential_alpha. 
									 The options are blank (no correction), BONFERRONI, and BENJAMINI-HOCHBERG. (default = no correction)
									 Has no effect if the influential subject report is not generated.
	- outlib: The SAS library to store output datasets. (default = WORK).
	
Output:

	The following SAS data set is created in outlib:
	
		- bc_&variable.: A SAS data set with the following columns:
										 	 - variable: Name of the variable that was transformed.
											 - tran_lambda: The value of lambda for the Box-Cox transformation most resembling a normal distribution.
											 
	The following SAS data set is created in outlib if do_winsorization = Y
				
		- win_&variable.: A SAS data set with the following columns:
												- &id.: The unique identifier for each subject.
												- &repeat_obs.: Distinguishes repeated observations for the same subject.
												- &variable.: The value of the outlier on the original scale.
												- &variable._win: The suggested value to Winsorize the outlier value to.
												
	The following SAS data set is created in outlib if do_influential = Y
				
		- inf_&variable.: A SAS data set with the following columns:
												- &id.: The unique identifier for each subject.
												- p: The p-value of the F-test that identified the subject's variance as influential.
												- &variable.1-&variable.k: One column for each of the k unique values of &repeat_obs. containing values of &variable. for each observation
		
Lambda search:

	The best lambda value is defined as the lambda value that produces a
	transformation that minimizes the sum of squared errors (SSE) between the
	actual 1st to 99th percentiles of the transformed variable and the
	corresponding expected percentiles of a normal distribution. Using the 1st
	to 99th percentiles excludes extreme values and makes the selection of the
	lambda less susceptible to outliers.

Suggested Winsorization:

	Outlier detection is done on the Box-Cox transformed scale using the
	selected lambda value to ensure that the data is as close to normal as
	possible. Outliers are defined as being a specified multiple (default: 3)
	of the interquartile range below the 25th percentile or above the 75th
	percentile.

Influential subjects:

	Detection of influential subjects is done on the Box-Cox transformed scale
	using the selected lambda value to ensure that the data is as close to
	normal as possible. Influential subjects are found by performing an F-test
	on the variance of each subject's observations against the mean of the
	variances of the other subjects' observations. When each subject has 2
	observations, the default alpha for the F-test corresponds to identifying
	subjects as influential that are 3 times the interquartile range of the
	differences between observations below the 25th percentile of differences
	or above the 75th percentile of differences. Multiple testing correction
	(Bonferroni or Benjamini-Hochberg) is also available.
*/
%macro boxcox_survey(input_data=,
										 row_subset=,
										 variable=,
										 id=,
										 repeat_obs=,
										 lambda_start=0,
										 lambda_increment=0.01,
										 num_lambdas=101,
										 covariates=,
										 weight=,
										 do_winsorization=N,
										 print_winsorization=Y,
										 is_episodic=N,
										 iqr_multiple=3,
										 do_influential=N,
										 print_influential=Y,
										 influential_alpha=0.000002342729,
										 multiple_test=,
										 outlib=WORK);

	/*
	Driver macro: runs the lambda grid search, reads the selected lambda back
	into a macro variable, then optionally runs the suggested Winsorization
	and influential subject reports with that lambda.

	selected_lambda: lambda value read from the bc_ output data set.
	bc_dataset:      name of the data set that the lambda is read from.
	*/
	%local selected_lambda bc_dataset;

	/* Find best Box-Cox lambda parameter */
	%find_best_lambda(input_data=&input_data.,
										row_subset=&row_subset.,
										variable=&variable.,
										covariates=&covariates.,
										weight=&weight.,
										lambda_start=&lambda_start.,
										lambda_increment=&lambda_increment.,
										num_lambdas=&num_lambdas.,
										outlib=&outlib.);

	/*
	The lambda search is asked to store bc_<variable> in outlib, so the result
	must be read back from that same library. A one-level name would always
	resolve to WORK and miss the data set whenever outlib is another library.
	The one-level name is kept only when outlib is blank.
	*/
	%if %length(&outlib.) > 0 %then %let bc_dataset = &outlib..bc_&variable.;
	%else %let bc_dataset = bc_&variable.;

	/*
	selected_lambda is declared local above, so CALL SYMPUTX (no scope
	argument) assigns to that local variable rather than creating a global.
	*/
	data _NULL_;
		set &bc_dataset.;

		call symputx("selected_lambda", tran_lambda);
	run;

	/*
	Both reports need the selected lambda. If it could not be read (for
	example the search failed and the data set is missing or empty), stop
	here with a clear message instead of passing a blank lambda downstream.
	*/
	%if %length(&selected_lambda.) = 0
			and (%upcase("&do_winsorization.") = "Y" or %upcase("&do_influential.") = "Y") %then %do;
		%put ERROR: boxcox_survey could not read tran_lambda from &bc_dataset.. No reports were generated.;
		%return;
	%end;

	%if %upcase("&do_winsorization.") = "Y" %then %do;

		/* Create report of suggested Winsorized values for outliers */
		%find_suggested_winsorization(input_data=&input_data.,
																	row_subset=&row_subset.,
																	lambda=&selected_lambda.,
																	variable=&variable.,
																	is_episodic=&is_episodic.,
																	covariates=&covariates.,
																	weight=&weight.,
																	id=&id.,
																	repeat_obs=&repeat_obs.,
																	iqr_multiple=&iqr_multiple.,
																	print_report=&print_winsorization.,
																	outlib=&outlib.);
	%end;


	%if %upcase("&do_influential.") = "Y" %then %do;

		/* Create report of subjects with influential within-subject variances */
		%find_influential_subjects(input_data=&input_data.,
															 row_subset=&row_subset.,
															 lambda=&selected_lambda.,
															 variable=&variable.,
															 weight=&weight.,
															 id=&id.,
															 repeat_obs=&repeat_obs.,
															 alpha=&influential_alpha.,
															 multiple_test=&multiple_test.,
															 print_report=&print_influential.,
															 outlib=&outlib.);
	%end;
%mend boxcox_survey;