%include "&sysincludefiledir./internal/summary_functions.sas";
%include "&sysincludefiledir./internal/summary_plots.sas";

/*
Summarize simulated usual intakes

Description:

	A utility that calculates means, quantiles, and proportions
	above/below a threshold for data frame columns. The output data frame is
	structured so that the summary statistic values are in a single column to
	make variance calculation from replicate methods more efficient.
	
Parameters:

	- input_data: A SAS data set.
	- variables: Space-delimited list of variables to use for all enabled summary functions.
	- mean_variables: Space-delimited list of variables to calculate means.
										Overrides variables for calculating means.
	- quantile_variables: Space-delimited list of variables to calculate quantiles. 
												Overrides variables for calculating quantiles.
	- density_plot_variables: Space-delimited list of variables to generate density plots. 
														Overrides variables for generating density plots.
	- ecdf_plot_variables: Space-delimited list of variables to generate ECDF plots. 
												 Overrides variables for generating ECDF plots.
	- row_subset: A conditional expression wrapped in %quote() indicating which rows of input_data to use when calculating summary statistics or plots.
	- population_name: An identifier for the population being summarized.
										 Included as a column in the output so that populations can be distinguished when output datasets are combined. (default = All)
	- weight: Variable with weighting for each subject.
	- do_means: Flag specifying whether means should be calculated. (Y/N, default = Y)
	- do_quantiles: Flag specifying whether quantiles should be calculated. (Y/N, default = Y)
	- quantiles: Space-delimited list of quantiles (between 0 and 100) to pass into PROC UNIVARIATE. 
							 Default calculates 5th, 25th, 50th, 75th, and 95th percentiles.
							 No effect when do_quantiles is N. (default = 5 25 50 75 95)
	- do_proportions: Flag specifying whether proportions of observations above/below thresholds should be calculated. (Y/N, default = Y) 
										At least one of lower_thresholds and upper_thresholds must be specified to calculate proportions.
	- lower_thresholds: A SAS data set specifying thresholds to calculate the proportion of subjects above those thresholds.
											Must contain the following columns:
												- variable: The name of the variable to calculate a proportion for.
												- threshold: The lower threshold to calculate the proportion of observations that the variable is above.
	- upper_thresholds: A SAS data set specifying thresholds to calculate the proportion of subjects below those thresholds.
											Must contain the following columns:
												- variable: The name of the variable to calculate a proportion for.
												- threshold: The upper threshold to calculate the proportion of observations that the variable is below.
	- do_density_plots: Flag specifying whether density plots should be generated. (Y/N, default = N)
	- do_ecdf_plots: Flag specifying whether ECDF plots should be generated. (Y/N, default = N)
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	A SAS data set outname is created in outlib with the following columns:
		- population: The name of the population given by population_name.
		- variable: The name of the variable being summarized.
		- statistic: The name of the summary statistic.
		- value: The value of the summary statistic.
		
	If plots are requested, they are output using ODS.
	
Details:
	
	This function is intended to summarize simulated usual intakes from
	nci_multivar_distrib and variables derived from them. By default, the
	5th, 25th, 50th, 75th, and 95th percentiles and the mean are calculated for
	the input variables. In addition, the proportion of subjects above lower
	threshold(s) and/or below upper threshold(s) for variables can be
	calculated.

	Supplemental plots can also be generated to visualize the distributions of
	the variables. Density plots use kernel density estimation
	to visualize a variable's probability density function. 
	Empirical cumulative distribution function (ECDF) plots visualize a 
	variable's cumulative distribution function.
*/
%macro nci_multivar_summary(input_data=,
														variables=,
														mean_variables=,
														quantile_variables=,
														density_plot_variables=,
														ecdf_plot_variables=,
														row_subset=,
														population_name=All,
														weight=,
														do_means=Y,
														do_quantiles=Y,
														quantiles=5 25 50 75 95,
														do_proportions=Y,
														lower_thresholds=,
														upper_thresholds=,
														do_density_plots=N,
														do_ecdf_plots=N,
														outlib=WORK,
														outname=);

	/*
	Dispatches to the helper macros for each enabled summary (means, quantiles,
	proportions) and each enabled plot type, then stacks the summary data sets
	that were actually produced into &outlib..&outname.

	Blank-parameter checks use the length of the %superq-masked value rather than
	a comparison against an empty string so that values which look like macro
	operators (for example a variable named LT or OR) cannot break the evaluation.
	*/

	/* Names of the intermediate data sets created during this call, in output order */
	%local _nms_out_datasets;
	%let _nms_out_datasets = ;

	/* Means */
	%if %upcase(&do_means.) = Y %then %do;

		/* Fall back to the shared variable list when no mean-specific list is given */
		%if %length(%superq(mean_variables)) = 0 %then %do;

			%let mean_variables = &variables.;
		%end;

		%summary_means(input_data=&input_data.,
									 variables=&mean_variables.,
									 row_subset=&row_subset.,
									 weight=&weight.,
									 population_name=&population_name.,
									 outname=_summary_means);

		%let _nms_out_datasets = &_nms_out_datasets. _summary_means;
	%end;

	/* Quantiles (skipped when the quantile list is blank) */
	%if %upcase(&do_quantiles.) = Y and %length(%superq(quantiles)) > 0 %then %do;

		%if %length(%superq(quantile_variables)) = 0 %then %do;

			%let quantile_variables = &variables.;
		%end;

		%summary_quantiles(input_data=&input_data.,
											 variables=&quantile_variables.,
											 row_subset=&row_subset.,
											 weight=&weight.,
											 quantiles=&quantiles.,
											 population_name=&population_name.,
											 outname=_summary_quantiles);

		%let _nms_out_datasets = &_nms_out_datasets. _summary_quantiles;
	%end;

	/* Proportions (requires at least one of the two threshold data sets) */
	%if %upcase(&do_proportions.) = Y and (%length(%superq(lower_thresholds)) > 0 or %length(%superq(upper_thresholds)) > 0) %then %do;

		%summary_proportions(input_data=&input_data.,
												 row_subset=&row_subset.,
												 weight=&weight.,
												 population_name=&population_name.,
												 lower_thresholds=&lower_thresholds.,
												 upper_thresholds=&upper_thresholds.,
												 outname=_summary_proportions);

		%let _nms_out_datasets = &_nms_out_datasets. _summary_proportions;
	%end;

	/* Density plots */
	%if %upcase(&do_density_plots.) = Y %then %do;

		%if %length(%superq(density_plot_variables)) = 0 %then %do;

			%let density_plot_variables = &variables.;
		%end;

		%density_plots(input_data=&input_data.,
									 variables=&density_plot_variables.,
									 row_subset=&row_subset.,
									 weight=&weight.,
									 population_name=&population_name.);
	%end;

	/* ECDF plots */
	%if %upcase(&do_ecdf_plots.) = Y %then %do;

		%if %length(%superq(ecdf_plot_variables)) = 0 %then %do;

			%let ecdf_plot_variables = &variables.;
		%end;

		/* Previously this branch called density_plots again, so no ECDF plot was ever drawn. */
		/* NOTE(review): assumes ecdf_plots is defined in internal/summary_plots.sas with the same parameters as density_plots - confirm. */
		%ecdf_plots(input_data=&input_data.,
								variables=&ecdf_plot_variables.,
								row_subset=&row_subset.,
								weight=&weight.,
								population_name=&population_name.);
	%end;

	/* Create summary dataset */
	/* Only stack when something was produced: a SET statement with no data set name would read the most recently created data set instead. */
	%if %length(&_nms_out_datasets.) > 0 %then %do;

		data &outlib..&outname.;
			set &_nms_out_datasets.;
		run;
	%end;
	%else %do;

		%put NOTE: nci_multivar_summary: no summary statistics were requested, so no output data set was created.;
	%end;
%mend nci_multivar_summary;

/*
Summarize coefficients of a PROC SURVEYREG model

Description:

	A utility that extracts the coefficients of a PROC SURVEYREG model and
	puts them into a data frame. The output data frame is structured so that
	the summary statistic values are in a single column to make variance
	calculation from replicate methods more efficient.
	
Parameters:

	- parameter_estimates: The ParameterEstimates ODS output from PROC SURVEYREG.
	- response: The model response variable.
	- population_name: An identifier for the population being summarized.
										 Included as a column in the output so that populations can be distinguished when output datasets are combined. (default = All)
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	A SAS data set outname is created in outlib with the following columns:
		- population: The name of the population given by population_name.
		- variable: The name of the response variable.
		- statistic: A string with the name of the covariate.
		- value: The value of the model coefficient.
*/
%macro summary_coef_surveyreg(parameter_estimates=,
															response=,
															population_name=All,
															outlib=WORK,
															outname=);

	/* Reshape the parameter estimates table into the long summary layout: one output row per input row */
	data &outlib..&outname.;
		set &parameter_estimates.;

		/* Declared in output column order: population, variable, statistic */
		length population $200 variable $32 statistic $200;

		/* Identifier columns hold the same value on every row */
		population = "&population_name.";
		variable = "&response.";

		/* Label each row with its Parameter value and carry the estimate over as the value */
		statistic = cat("Coefficient for ", Parameter);
		value = Estimate;

		/* Everything read from the input table is dropped */
		keep population variable statistic value;
	run;
%mend summary_coef_surveyreg;

/*
Summarize coefficients of a PROC SURVEYLOGISTIC model

Description:

	A utility that extracts the coefficients of a PROC SURVEYLOGISTIC model and
	puts them into a data frame. The output data frame is structured so that
	the summary statistic values are in a single column to make variance
	calculation from replicate methods more efficient.
	
Parameters:

	- parameter_estimates: The ParameterEstimates ODS output from PROC SURVEYLOGISTIC.
	- response: The model response variable.
	- population_name: An identifier for the population being summarized.
										 Included as a column in the output so that populations can be distinguished when output datasets are combined. (default = All)
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	A SAS data set outname is created in outlib with the following columns:
		- population: The name of the population given by population_name.
		- variable: The name of the response variable.
		- statistic: A string with the name of the covariate.
		- value: The value of the model coefficient.
*/
%macro summary_coef_surveylogistic(parameter_estimates=,
																	 response=,
																	 population_name=All,
																	 outlib=WORK,
																	 outname=);

	/* Reshape the parameter estimates table into the long summary layout: one output row per input row */
	data &outlib..&outname.;
		/* The input column Variable is renamed to Parameter on read, which leaves the name variable free for the response column below */
		set &parameter_estimates. (rename=(Variable=Parameter));

		/* Declared in output column order: population, variable, statistic */
		length population $200 variable $32 statistic $200;

		/* Identifier columns hold the same value on every row */
		population = "&population_name.";
		variable = "&response.";

		/* Label each row with its Parameter value and carry the estimate over as the value */
		statistic = cat("Coefficient for ", Parameter);
		value = Estimate;

		/* Everything read from the input table is dropped */
		keep population variable statistic value;
	run;
%mend summary_coef_surveylogistic;

/*
Summarize predictions from a PROC SURVEYREG model

Description:

	A utility that extracts linear predictors from a PROC SURVEYREG model
	and puts them into a data frame. The output data frame is structured so
	that the summary statistic values are in a single column to make variance
	calculation from replicate methods more efficient.
	
Parameters:

	- model_output: A SAS data set created by the OUTPUT statement of PROC SURVEYREG with the PREDICTED option.
	- predicted: Prediction variable set by the PREDICTED option in the OUTPUT statement of PROC SURVEYREG. (default = _PREDICTED_)
	- response: The model response variable.
	- id: Variable that uniquely identifies each subject. (default = _N_)
	- population_name: An identifier for the population being summarized.
										 Included as a column in the output so that populations can be distinguished when output datasets are combined. (default = All)
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	A SAS data set outname is created in outlib with the following columns:
		- population: The name of the population given by population_name.
		- variable: The name of the response variable.
		- statistic: A string with the name of the subject being predicted.
		- value: The predicted value.
*/
%macro summary_pred_surveyreg(model_output=,
															predicted=_PREDICTED_,
															response=,
															id=_N_,
															population_name=All,
															outlib=WORK,
															outname=);

	/* Reshape the model output into the long summary layout: one output row per input row */
	data &outlib..&outname.;
		set &model_output.;

		/* Declared in output column order: population, variable, statistic */
		length population $200 variable $32 statistic $200;

		/* Identifier columns hold the same value on every row */
		population = "&population_name.";
		variable = "&response.";

		/* The label contains both the name of the id variable and its value on this row */
		statistic = cat("Prediction for &id. = ", &id.);
		value = &predicted.;

		/* Everything read from the input table is dropped */
		keep population variable statistic value;
	run;
%mend summary_pred_surveyreg;

/*
Summarize predictions from a PROC SURVEYLOGISTIC model

Description:

	A utility that extracts linear predictors from a PROC SURVEYLOGISTIC model
	and puts them into a data frame. The output data frame is structured so
	that the summary statistic values are in a single column to make variance
	calculation from replicate methods more efficient.
	
Parameters:

	- model_output: A SAS data set created by the OUTPUT statement of PROC SURVEYLOGISTIC with the XBETA option.
	- xbeta: Linear predictor variable set by the XBETA option in the OUTPUT statement of PROC SURVEYLOGISTIC.
	- response: The model response variable.
	- id: Variable that uniquely identifies each subject. (default = _N_)
	- population_name: An identifier for the population being summarized.
										 Included as a column in the output so that populations can be distinguished when output datasets are combined. (default = All)
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	A SAS data set outname is created in outlib with the following columns:
		- population: The name of the population given by population_name.
		- variable: The name of the response variable.
		- statistic: A string with the name of the subject being predicted.
		- value: The predicted value.
*/
%macro summary_pred_surveylogistic(model_output=,
																	 xbeta=,
																	 response=,
																	 id=_N_,
																	 population_name=All,
																	 outlib=WORK,
																	 outname=);

	/* Reshape the model output into the long summary layout: one output row per input row */
	data &outlib..&outname.;
		set &model_output.;

		/* Declared in output column order: population, variable, statistic */
		length population $200 variable $32 statistic $200;

		/* Identifier columns hold the same value on every row */
		population = "&population_name.";
		variable = "&response.";

		/* The label contains both the name of the id variable and its value on this row */
		statistic = cat("Prediction for &id. = ", &id.);
		value = &xbeta.;

		/* Everything read from the input table is dropped */
		keep population variable statistic value;
	run;
%mend summary_pred_surveylogistic;

/*
Differences Between Two Populations

Description:

	Calculates the differences in summary statistics between two
	populations.
	
Parameters:

	- population1, population2: SAS data sets of populations to compare.
															Must have the following columns:
																- population: Name of the population.
																- variable: Name of the variable being summarized.
																- statistic: Name of the summary statistic.
																- value: Value of the summary statistic.
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	A SAS data set outname is created in outlib with the following columns:
		- population: Name of the population difference, composed from population columns of population1 and population2.
		- variable: Name of the variable being summarized.
		- statistic: Name of the summary statistic.
		- value: Difference between the value columns of population1 and population2.
	
Details:

	This utility macro is designed to work with the output of the
	other ncimultivar summary macros. Differences will be
	calculated for rows in the two input populations that have matching
	variable and statistic columns. Differences will not be calculated for
	rows in either population with no match.
*/
%macro summary_difference(population1=,
													population2=,
													outlib=WORK,
													outname=);

	/* Working copy of population1, tagged with each row's original position so that order can be restored after the merge */
	data _pop1;
		set &population1.;

		_order = _N_;
	run;

	proc sort data=_pop1;
		by variable statistic;
	run;

	/* Sorted working copy of population2 */
	proc sort data=&population2. out=_pop2;
		by variable statistic;
	run;

	/* Pair up rows on variable and statistic; the population2 columns are renamed so both sides are available at once */
	data _pop_diff;
		merge _pop1 (in=_in_pop1)
					_pop2 (in=_in_pop2 rename=(population=population2 value=value2));
		by variable statistic;

		/* Discard rows that are missing from either population */
		if not (_in_pop1 and _in_pop2) then delete;

		value = value - value2;
		population = cat(trim(population), " - ", trim(population2));
	run;

	/* Restore the row order of population1 */
	proc sort data=_pop_diff;
		by _order;
	run;

	/* Only the four summary columns are written to the output */
	data &outlib..&outname.;
		set _pop_diff (keep = population variable statistic value);
	run;
%mend summary_difference;