/*
 * find_best_lambda
 *
 * Grid search for the Box-Cox lambda whose transformed values look most normal.
 *
 * For each lambda on the grid (lambda_start + k*lambda_increment, k = 0 to
 * num_lambdas - 1) the positive values of the variable are Box-Cox transformed
 * and the 1st-99th percentiles of the transformed values (or of the residuals
 * from a PROC SURVEYREG model on the covariates, when covariates are given) are
 * computed. SSE is the residual sum of squares from regressing the standard
 * normal quantiles probit(0.01) ... probit(0.99) on the centered percentiles.
 * The lambda with the smallest non-missing SSE is written out.
 *
 * Parameters:
 *   input_data       Input data set.
 *   row_subset       Optional DATA step condition. Rows failing it are excluded.
 *   variable         Variable to transform. Only values greater than zero are used.
 *   covariates       Optional right-hand side of the SURVEYREG MODEL statement.
 *   weight           Optional weight variable.
 *   lambda_start     First lambda of the grid.
 *   lambda_increment Grid spacing.
 *   num_lambdas      Number of grid points.
 *   outlib           Library that receives the result.
 *
 * Output: OUTLIB.bc_VARIABLE with one row holding variable (name, $32) and
 * tran_lambda. The data set is empty when no lambda yields a usable SSE.
 * Scratch data sets _subset, _percentiles, _reg_out and _lambda_percentiles
 * are left in WORK.
 */
%macro find_best_lambda(input_data=,
												row_subset=,
												variable=,
												covariates=,
												weight=,
												lambda_start=,
												lambda_increment=,
												num_lambdas=,
												outlib=);
												
	%local lambda i;
												
	**Subset to non-zero values and apply other user-specified subsetting;
	data _subset;
		set &input_data.;
		
		/* The blank test uses the length of the unresolved text so that subset
		   expressions containing operators or function calls are never passed
		   through the implicit macro expression evaluator. */
		%if %length(%superq(row_subset)) > 0 %then %do;
		
			if (&row_subset.) and &variable. > 0 then _var_value = &variable.;
			else _var_value = .;
		%end;
		%else %do;
		
			if &variable. > 0 then _var_value = &variable.;
			else _var_value = .;
		%end;
	run;
	
	**Calculate 1st-99th percentiles of Box-Cox transformations for the lambda grid;
	%do i = 0 %to %eval(&num_lambdas. - 1);
	
		data _NULL_;
		
			/* Rounding removes floating-point residue such as 5.5E-17 so that a
			   grid point meant to be zero really selects the log transformation. */
			lambda = round(&lambda_start. + &i.*&lambda_increment., 1e-10);
			call symputx("lambda", lambda);
		run;
	
		data _subset;
			set _subset;
			
			_lambda = &lambda.;
			if _lambda = 0 then _boxcox_value = log(_var_value);
			else _boxcox_value = (_var_value**_lambda - 1)/_lambda;
		run;
		
		%if %length(%superq(covariates)) = 0 %then %do;
		
			**No covariates, so use percentiles of the transformed values directly;
			proc univariate data=_subset noprint;
			
				var _boxcox_value;
				%if %length(%superq(weight)) > 0 %then %do;
					weight &weight.;
				%end;
				
				output out=_percentiles pctlpts=1 to 99 by 1 pctlpre=P;
			run;
		%end;
		%else %do;
		
			**With covariates, use percentiles of the regression residuals;
			ods select none;
			
			proc surveyreg data=_subset;
			
				model _boxcox_value = &covariates.;
				%if %length(%superq(weight)) > 0 %then %do;
					weight &weight.;
				%end;
				
				output out=_reg_out r=residual;
			run;
			
			ods select all;
			
			proc univariate data=_reg_out noprint;
			
				var residual;
				%if %length(%superq(weight)) > 0 %then %do;
					weight &weight.;
				%end;
				
				output out=_percentiles pctlpts=1 to 99 by 1 pctlpre=P;
			run;
		%end;
		
		data _percentiles;
			set _percentiles;
			
			lambda = &lambda.;
		run;
		
		**Stack the percentile row for this lambda under the rows already collected;
		data _lambda_percentiles;
			set %if &i. ^= 0 %then _lambda_percentiles;
					_percentiles;
		run;
	%end;
	
	**Calculate SSE for each lambda;
	data _lambda_percentiles;
		set _lambda_percentiles;
		
		retain N1-N99;
		
		array N{99} N1-N99; **Normal percentiles;
		array P{99} P1-P99; **Box-Cox percentiles;
		
		if _N_ = 1 then do;
			do i = 1 to 99;
			
				N{i} = probit(0.01*i);
			end;
		end;
		
		P_mean = mean(of P1-P99);
		
		norm_dev_sq = 0;
		bc_dev_sq = 0;
		norm_bc_cross = 0;
		do i = 1 to 99;
		
			norm_dev_sq = norm_dev_sq + N{i}**2;
			bc_dev_sq = bc_dev_sq + (P{i} - P_mean)**2;
			norm_bc_cross = norm_bc_cross + N{i}*(P{i} - P_mean);
		end;
		
		**SSE stays missing when the percentiles are missing or all identical;
		if bc_dev_sq > 0 then SSE = norm_dev_sq - (norm_bc_cross**2)/bc_dev_sq;
	run;
	
	**Select lambda with lowest SSE;
	proc sort data=_lambda_percentiles; by SSE; run;
	data &outlib..bc_&variable. (keep = variable tran_lambda);
		set _lambda_percentiles;
		
		/* Missing values sort before every number, so rows without a usable SSE
		   must be skipped or a degenerate lambda would be picked as the best one. */
		where not missing(SSE);
		
		length variable $32.;
		
		if _N_ = 1 then do;
		
			variable = "&variable.";
			tran_lambda = lambda;
			output;
			stop;
		end;
	run;
%mend find_best_lambda;

/*
 * find_suggested_winsorization
 *
 * Suggested Winsorization report. Outlier thresholds are located on the
 * Box-Cox scale as P25 - iqr_multiple*IQR and P75 + iqr_multiple*IQR, then
 * back-transformed to the original scale and compared with every record of
 * the input data.
 *
 * Without covariates the percentiles are taken from the transformed values and
 * one pair of thresholds applies to all records. With covariates the
 * percentiles are taken from PROC SURVEYREG residuals and each record gets its
 * own thresholds (residual threshold plus that record's predicted value).
 *
 * Parameters:
 *   input_data    Input data set.
 *   row_subset    Optional DATA step condition selecting the rows used to find
 *                 the thresholds. All input rows are still checked against them.
 *   lambda        Box-Cox lambda (0 means log transformation).
 *   variable      Variable to check. Only values greater than zero are used to
 *                 find the thresholds.
 *   is_episodic   Y (any case) Winsorizes low outliers to zero and drops
 *                 non-positive values from the report.
 *   covariates    Optional right-hand side of the SURVEYREG MODEL statement.
 *   weight        Optional weight variable.
 *   id            Identifier variable(s) kept in the report.
 *   repeat_obs    Repeat-observation variable(s) kept in the report.
 *   iqr_multiple  Multiple of the interquartile range used for the thresholds.
 *   print_report  Y (any case) prints the report.
 *   outlib        Library that receives the report.
 *
 * Output: OUTLIB.win_VARIABLE with id, repeat_obs, the original value and the
 * suggested Winsorized value VARIABLE_win, one row per outlying record.
 * Records with a missing value, or whose threshold could not be computed, are
 * never reported. Scratch data sets _subset, _percentiles, _reg_out,
 * _thresholds and _input are left in WORK.
 */
%macro find_suggested_winsorization(input_data=,
																		row_subset=,
																		lambda=,
																		variable=,
																		is_episodic=,
																		covariates=,
																		weight=,
																		id=,
																		repeat_obs=,
																		iqr_multiple=,
																		print_report=,
																		outlib=);
	
	**Subset of data used to find thresholds;
	data _subset;
		set &input_data.;
		
		/* The blank test uses the length of the unresolved text so that subset
		   expressions containing operators or function calls are never passed
		   through the implicit macro expression evaluator. */
		%if %length(%superq(row_subset)) > 0 %then %do;
		
			if (&row_subset.) and &variable. > 0 then _var_value = &variable.;
			else _var_value = .;
		%end;
		%else %do;
		
			if &variable. > 0 then _var_value = &variable.;
			else _var_value = .;
		%end;
	run;
	
	**Box-Cox transform variable;
	data _subset;
		set _subset;
		
		_lambda = &lambda.;
		if _lambda = 0 then _boxcox_value = log(_var_value);
		else _boxcox_value = (_var_value**_lambda - 1)/_lambda;
	run;
	
	**Find outlier thresholds on the Box-Cox transformed scale;
	%if %length(%superq(covariates)) = 0 %then %do;
	
		proc univariate data=_subset noprint;
		
			var _boxcox_value;
			%if %length(%superq(weight)) > 0 %then %do;
				weight &weight.;
			%end;
			
			output out=_percentiles pctlpts=25 75 pctlpre=P;
		run;
		
		**Single row of thresholds shared by every record;
		data _thresholds;
			set _percentiles;
			
			iqr = P75 - P25;
			
			bc_low = P25 - &iqr_multiple.*iqr;
			bc_high = P75 + &iqr_multiple.*iqr;
		run;
	%end;
	%else %do;
	
		ods select none;
	
		proc surveyreg data=_subset;
		
			model _boxcox_value = &covariates.;
			%if %length(%superq(weight)) > 0 %then %do;
				weight &weight.;
			%end;
			
			output out=_reg_out r=residual p=prediction;
		run;
		
		ods select all;
		
		proc univariate data=_reg_out noprint;
			
			var residual;
			%if %length(%superq(weight)) > 0 %then %do;
				weight &weight.;
			%end;
			
			output out=_percentiles pctlpts=25 75 pctlpre=P;
		run;
		
		**One row of thresholds per record of the regression output;
		data _thresholds;
			set _reg_out;
			
			/* The one-row percentile data set must be read only once. An
			   unconditional SET would hit end of file on the second iteration
			   and stop the step, leaving thresholds for the first record only.
			   Variables read by SET are retained, so P25 and P75 stay available. */
			if _N_ = 1 then set _percentiles;
			
			retain resid_low resid_high;
			
			if _N_ = 1 then do;
			
				iqr = P75 - P25;
			
				resid_low = P25 - &iqr_multiple.*iqr;
				resid_high = P75 + &iqr_multiple.*iqr;
			end;
			
			bc_low = resid_low + prediction;
			bc_high = resid_high + prediction;
		run;
	%end;
	
	**Backtransform thresholds to original scale;
	data _thresholds (drop = _base_low _base_high);
		set _thresholds;
		
		lambda = &lambda.;
		
		/* The inverse transformation is (lambda*bc + 1)**(1/lambda) and needs a
		   positive base. For positive lambda a non-positive base lies below the
		   attainable range, so the threshold is 0. For negative lambda it lies
		   above the attainable range, so the threshold is effectively unbounded.
		   A missing threshold on the Box-Cox scale stays missing. */
		if missing(bc_low) then _low = .;
		else if lambda = 0 then _low = exp(bc_low);
		else do;
		
			_base_low = lambda*bc_low + 1;
			if _base_low > 0 then _low = _base_low**(1/lambda);
			else if lambda > 0 then _low = 0;
			else _low = constant('BIG');
		end;
		
		if missing(bc_high) then _high = .;
		else if lambda = 0 then _high = exp(bc_high);
		else do;
		
			_base_high = lambda*bc_high + 1;
			if _base_high > 0 then _high = _base_high**(1/lambda);
			else if lambda > 0 then _high = 0;
			else _high = constant('BIG');
		end;
	run;
	
	**Winsorize values and generate report of outlier values based on thresholds;
	data _input;
		set &input_data.;
		
		_mergevar = 1;
	run;
	
	data _thresholds;
		set _thresholds;
		
		_mergevar = 1;
	run;
	
	/* All rows share one BY value. A single threshold row is therefore carried
	   to every input row, while per-record thresholds are paired with the input
	   rows in sequence. NOTE(review): the sequential pairing assumes the
	   SURVEYREG output has the same rows in the same order as the input data. */
	data &outlib..win_&variable. (keep = &id. &repeat_obs. &variable. &variable._win);
		merge _input
					_thresholds (keep = _mergevar _low _high);
		by _mergevar;
		
		/* Missing compares lower than any number, so missing values and missing
		   thresholds are excluded explicitly instead of being reported as outliers. */
		_is_low = (not missing(&variable.) and not missing(_low) and &variable. < _low);
		_is_high = (not missing(&variable.) and not missing(_high) and &variable. > _high);
		if _is_low or _is_high;
		
		%if %upcase("&is_episodic.") = "Y" %then %do;
		
			if _is_low then &variable._win = 0; **for episodic variables, Winsorize low outliers to zero;
			else if _is_high then &variable._win = _high;
		%end;
		%else %do;
		
			if _is_low then &variable._win = _low;
			else if _is_high then &variable._win = _high;
		%end;
	run;
	
	%if %upcase("&is_episodic.") = "Y" %then %do;
	
		**Non-consumption days for episodic variables are not Winsorized;
		data &outlib..win_&variable.;
			set &outlib..win_&variable.;
			if &variable. > 0;
		run;
	%end;
	
	%if %upcase("&print_report.") = "Y" %then %do;
	
		proc print data=&outlib..win_&variable.; 
		
			title "Suggested Winsorization for &variable.";
		run;
		title;
	%end;
%mend find_suggested_winsorization;

/*
 * find_influential_subjects
 *
 * Influential subject report. Positive values are Box-Cox transformed, the
 * within-subject variance of the transformed values is computed for every
 * subject with at least two positive observations, and each subject's variance
 * is compared by an F-test with the weighted mean variance of all other
 * subjects. Subjects whose p-value is significant are written to the report.
 *
 * Parameters:
 *   input_data     Input data set with one row per subject and repeat observation.
 *   row_subset     Accepted but not used by this macro.
 *                  NOTE(review): confirm that ignoring it is intended.
 *   lambda         Box-Cox lambda (0 means log transformation).
 *   variable       Variable to analyze. Only values greater than zero are used.
 *   weight         Optional weight variable. A weight of 1 is used when blank.
 *   id             Subject identifier.
 *   repeat_obs     Variable identifying the repeat observation (recall).
 *   alpha          Significance level.
 *   multiple_test  Blank (no correction), Bonferroni or Benjamini-Hochberg
 *                  (any case).
 *   print_report   Y (any case) prints the report.
 *   outlib         Library that receives the report.
 *
 * Output: OUTLIB.inf_VARIABLE with id, _p_value and the per-recall values
 * VARIABLE1-VARIABLEn of the influential subjects. Scratch data sets _subset,
 * _subject_data and _sums are left in WORK.
 */
%macro find_influential_subjects(input_data=,
																 row_subset=,
																 lambda=,
																 variable=,
																 weight=,
																 id=,
																 repeat_obs=,
																 alpha=,
																 multiple_test=,
																 print_report=,
																 outlib=);
																 
	%local num_recalls weight_expr last_significant;
	
	/* The weight is optional in the companion macros, so a blank weight is
	   treated as equal weighting instead of generating invalid code. */
	%if %length(%superq(weight)) > 0 %then %let weight_expr = &weight.;
	%else %let weight_expr = 1;
																 
	**Remove non-consumption days;
	data _subset;
		set &input_data.;
		if &variable. > 0;
	run;
	
	**Assign numbers to recalls and calculate total number of recalls;
	proc sort data=_subset; by &repeat_obs.; run;
	data _subset;
		set _subset end=last_obs;
		by &repeat_obs.;
		
		retain _recallnum;
		
		if _N_ = 1 then _recallnum = 0;
		
		if first.&repeat_obs then _recallnum = _recallnum + 1;
		
		if last_obs = 1 then call symputx("num_recalls", _recallnum);
	run;
	
	/* With no positive values the step above never reaches its last record, so
	   the recall count stays blank and the array definitions below would fail. */
	%if %length(&num_recalls.) = 0 %then %do;
	
		%put WARNING: No positive values of &variable. were found. No influential subject report was created.;
		%return;
	%end;
	
	**Transpose to one record per subject;
	proc sort data=_subset; by &id.; run;
	data _subject_data (keep = &id. &weight. &variable.1-&variable.&num_recalls.);
		set _subset;
		by &id.;
		
		retain &variable.1-&variable.&num_recalls.;
		
		array obs{&num_recalls.} &variable.1-&variable.&num_recalls.;
		
		if first.&id. then do;
		
			do _i = 1 to &num_recalls.;
			
				obs{_i} = .;
			end;
		end;
		
		obs{_recallnum} = &variable.;
		
		/* Write the subject only once all of its recalls have been collected.
		   Without this, partially filled rows are also written and subjects
		   with three or more recalls are tested several times. */
		if last.&id. then output;
	run;
	
	**Box-Cox transform the variable values;
	data _subject_data;
		set _subject_data;
		
		array obs{&num_recalls.} &variable.1-&variable.&num_recalls.;
		array bc{&num_recalls.} bc1-bc&num_recalls.;
		
		lambda = &lambda.;
		
		do i = 1 to &num_recalls.;
		
			if lambda = 0 then bc{i} = log(obs{i});
			else bc{i} = (obs{i}**lambda - 1)/lambda;
		end;
	run;
	
	**Calculate the variance of observations on the Box-Cox transformed scale for each subject;
	data _subject_data;
		set _subject_data;
		
		array bc{&num_recalls.} bc1-bc&num_recalls.;
		
		_subject_variance = var(of bc{*});
		_num_obs = n(of bc{*});
		_subject_df = _num_obs - 1;
	run;
	
	**Subset to subjects with at least 2 observations;
	data _subject_data;
		set _subject_data;
		if _num_obs >= 2;
	run;
	
	**Calculate weighted sum of variances, total degrees of freedom and number of subjects;
	data _sums (keep = _sum_variance _sum_weights _sum_df _total_subjects);
		set _subject_data end=last_obs;
		
		/* The subject counter has its own name and is kept in the output. It
		   used to share a name with the per-subject observation count, which
		   overwrote it on every iteration, and it was not kept. */
		retain _sum_variance _sum_weights _sum_df _total_subjects;
		
		if _N_ = 1 then do;
		
			_sum_variance = 0;
			_sum_weights = 0;
			_sum_df = 0;
			_total_subjects = 0;
		end;
		
		_sum_variance = _sum_variance + _subject_variance*_subject_df*&weight_expr.;
		_sum_weights = _sum_weights + _subject_df*&weight_expr.;
		_sum_df = _sum_df + _subject_df;
		_total_subjects = _total_subjects + 1;
		
		if last_obs = 1 then output;
	run;
	
	**Merge sums into dataset;
	data _subject_data;
		set _subject_data;
		
		_merge = 1;
	run;
	
	data _sums;
		set _sums;
		
		_merge = 1;
	run;
	
	data _subject_data;
		merge _subject_data
					_sums;
		by _merge;
	run;
	
	**F-test on variance for each subject against mean variance of other subjects;
	data _subject_data;
		set _subject_data;
		
		**Sum of variance and degrees of freedom for all subjects EXCEPT the current one;
		_sum_variance = _sum_variance - _subject_variance*_subject_df*&weight_expr.;
		_sum_weights = _sum_weights - _subject_df*&weight_expr.;
		_sum_df = _sum_df - _subject_df;
		
		**F-test. The p-value stays missing when there is no other subject to compare with;
		if _sum_df > 0 and _sum_weights > 0 then do;
		
			_mean_variance = _sum_variance/_sum_weights;
			
			if _mean_variance > 0 then do;
			
				_f_statistic = _subject_variance/_mean_variance;
				_p_value = 1 - probf(_f_statistic, _subject_df, _sum_df);
			end;
			**All other subjects have zero variance, so any positive variance is extreme;
			else if _subject_variance > 0 then _p_value = 0;
			else _p_value = 1;
		end;
	run;
	
	proc sort data=_subject_data; by _p_value; run;
	
	/* Missing p-values compare lower than any number, so every significance
	   test below excludes them explicitly. */
	
	**Determine if the F-test p-value is significant based on the provided alpha;
	%if %length(%superq(multiple_test)) = 0 %then %do;
		
		data _subject_data;
			set _subject_data;
			
			if not missing(_p_value) and _p_value < &alpha. then _influential = 1;
			else _influential = 0;
		run;
	%end;
	%else %if %upcase("&multiple_test.") = "BONFERRONI" %then %do;
	
		**Alpha is divided by the number of subjects tested;
		data _subject_data;
			set _subject_data;
				
			if not missing(_p_value) and _p_value < &alpha./_total_subjects then _influential = 1;
			else _influential = 0;
		run;
	%end;
	%else %if %upcase("&multiple_test.") = "BENJAMINI-HOCHBERG" %then %do;
	
		**Default in case the data set is empty and the step below never runs;
		%let last_significant = 0;
	
		/* Rows are sorted by p-value with missing values first. The rank counts
		   non-missing p-values only, while last_significant stores the row
		   number of the last p-value below its rank-specific threshold. */
		data _NULL_;
			set _subject_data end=last_obs;
			
			retain _last_significant _rank;
			
			if _N_ = 1 then do;
			
				_last_significant = 0;
				_rank = 0;
			end;
			
			if not missing(_p_value) then do;
			
				_rank = _rank + 1;
				if _p_value < (&alpha./_total_subjects)*_rank then _last_significant = _N_;
			end;
			
			if last_obs = 1 then call symputx("last_significant", _last_significant);
		run;
		
		data _subject_data;
			set _subject_data;
			
			if not missing(_p_value) and _N_ <= &last_significant. then _influential = 1;
			else _influential = 0;
		run;
	%end;
	%else %do;
	
		%put Specified type of multiple testing correction is not available.;
	%end;
	
	**Create report of subjects found to be influential;
	data &outlib..inf_&variable. (keep = &id. _p_value &variable.1-&variable.&num_recalls.);
		set _subject_data;
		if _influential = 1;
	run;
	
	%if %upcase("&print_report.") = "Y" %then %do;
	
		proc print data=&outlib..inf_&variable.;
		
			title "Subjects with Influential Variance of &variable.";
		run;
		title;
	%end;
%mend find_influential_subjects;