%include "&sysincludefiledir./internal/replicate_weight_functions.sas";

/*
Generate BRR weights for a survey dataset

Description:

	This macro creates balanced repeated replication (BRR)
	weights for variance estimation in survey data.
	
Parameters:

	- input_data: A SAS data set.
	- id: Variable that identifies each subject.
	- strata: Variable that identifies the stratum that an observation is in.
	- psu: Variable that identifies which PSU within a stratum an observation is in. 
				 There must be exactly two unique values of this variable for each value of strata.
	- cell: Variable that identifies which post-stratification cell that an observation is in.
					Used to perform post-stratification adjustment of replicate weights.
					If blank, no post-stratification adjustment is performed. (default = blank)
	- weight: Base weighting variable for the dataset.
						If blank, all observations are treated as having a weight of 1. (default = blank)
	- fay_factor: A number greater than 0 and at most 1 that adjusts the weight of PSUs for the Fay method. 
								The macro passes (1 - fay_factor) to the FAY= option of PROC SURVEYMEANS.
								A value of 1 performs standard BRR. (default = 1)
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	The following SAS data set is created in outlib.
	
		outname: A SAS data set that has all of the columns of input_data plus the following columns:
						 - RepWt_0: The base weight.
						 - RepWt_1-RepWt_N: BRR replicate weights for replicates 1 to N. 
						 
						Weights are integerized for compatibility with modeling functions.
						 
About BRR:

	Balanced repeated replication (BRR) is a variance
	estimation technique used for stratified survey data. Like ordinary
	bootstrap, BRR uses a set of replicates resampled from the original data.
	BRR differs from bootstrap in that BRR replicates are structured rather
	than randomly sampled. In order for BRR to work properly, the survey data
	must be structured so that each stratum has exactly two primary sampling
	units (PSUs). A Hadamard matrix is used to structure each replicate so that
	one PSU from each stratum is used per replicate. The size of the Hadamard
	matrix determines the number of BRR replicate weights, and it is usually
	close to the number of strata. This means that in datasets where it can be 
	used, BRR is often more efficient than bootstrap.
						
Fay Method:

	Standard BRR uses only half of the PSUs in the dataset
	for each replicate. The Fay method uses the full dataset for each replicate
	but adjusts the weighting of each PSU based on a value between 0 and 1
	called the Fay factor f. If a PSU would be taken out of the sample in
	standard BRR, it is instead given a weight of 1 - f when using the Fay
	method. Likewise, PSUs that would be kept in the sample are instead given a
	weight of 1 + f. A Fay factor of 1 is equivalent to using standard BRR.
	
Post-Stratification:

	The design of a survey study usually will have
	expected proportions that different groups should have in the full
	population. Post-stratification adjustment is used to adjust observation
	weights so that the influence of each group (called a post-stratification
	cell) in an analysis is proportional to the group's actual proportion in
	the population.
*/

%macro brr_weights(input_data=,
									 id=,
									 strata=,
									 psu=,
									 cell=,
									 weight=,
									 fay_factor=1,
									 outlib=WORK,
									 outname=);

	/*
	Builds BRR (optionally Fay-adjusted) replicate weights with PROC SURVEYMEANS
	and writes input_data plus RepWt_0-RepWt_N to &outlib..&outname.

	Parameters:
		input_data  Input data set. It is sorted IN PLACE by id.
		id          Subject identifier. Only the first row per id is used to
		            build the weights, which are then merged back onto every row.
		strata      Stratum variable passed to the STRATA statement.
		psu         PSU variable passed to the CLUSTER statement.
		cell        Optional post-stratification cell variable. When given,
		            the control totals are the sums of the base weight per cell.
		weight      Optional base weight variable. Blank means a weight of 1.
		fay_factor  Fay factor f. The value 1 - f is passed to FAY=.
		outlib      Output library.
		outname     Output data set name.

	Side effects: creates the single-level data sets _brr_data and _allvars,
	and _control when cell is given. They are not deleted afterwards.
	*/
	%local num_reps sas_fay;

	**Subset to one subject per row;
	proc sort data=&input_data.; by &id.; run;
	data _brr_data;
		set &input_data.;
		by &id.;

		**placeholder analysis variable for PROC SURVEYMEANS;
		_placeholder = 1;

		if first.&id. then output;
	run;

	**Set base weights;
	data _brr_data;
		set _brr_data;

		%if &weight. ^= %str() %then %do;

			RepWt_0 = &weight.;
		%end;
		%else %do;

			RepWt_0 = 1;
		%end;
	run;

	%if &cell. ^= %str() %then %do;

		**Totals in each post-stratification cell;
		proc sort data=_brr_data; by &cell.; run;
		data _control (keep = &cell. _PSTOTAL_);
			set _brr_data;
			by &cell.;

			retain _PSTOTAL_;

			if first.&cell. then _PSTOTAL_ = 0;

			/* Plain addition (not a sum statement) so that a missing base
			   weight propagates to the cell total instead of being skipped. */
			_PSTOTAL_ = _PSTOTAL_ + RepWt_0;

			if last.&cell. then output;
		run;
	%end;

	/* Convert the Fay factor to the (1 - Fay factor) value handed to the
	   FAY= option. It is stored in its own local macro variable so that
	   fay_factor keeps the value the caller supplied. */
	data _NULL_;

		call symputx("sas_fay", 1 - &fay_factor.);
	run;

	**Generate BRR weights;
	proc sort data=_brr_data; by &strata. &psu.; run;

	proc surveymeans data=_brr_data varmethod=brr(outweights=_brr_data fay=&sas_fay.) noprint;
		var _placeholder;
		strata &strata.;
		cluster &psu.;
		weight RepWt_0;
		%if &cell. ^= %str() %then %do;
			poststrata &cell. / PSTOTAL=_control;
		%end;
	run;

	/* Count the replicates: every variable named RepWt_* is counted and one
	   is subtracted for the base weight RepWt_0. */
	proc contents data=_brr_data out=_allvars noprint; run;
	data _NULL_;
		set _allvars end=last_obs;

		retain num_reps;

		if _N_ = 1 then num_reps = 0;

		if substr(NAME, 1, 6) = "RepWt_" then num_reps = num_reps + 1;

		if last_obs = 1 then call symputx("num_reps", num_reps - 1);
	run;

	/* Reset the base weight before integerizing. The blank-weight branch is
	   required: an unconditional assignment from the weight parameter expands
	   to an empty right-hand side and fails to compile when weight is blank. */
	data _brr_data;
		set _brr_data;

		%if &weight. ^= %str() %then %do;

			RepWt_0 = &weight.;
		%end;
		%else %do;

			RepWt_0 = 1;
		%end;
	run;

	/* integerize_weights is not defined in this file; presumably it comes
	   from the include at the top of the file (TODO confirm). */
	%integerize_weights(weight_data=_brr_data,
											weights=RepWt_0-RepWt_&num_reps.,
											cell=&cell.);

	**Merge BRR weights back into full data;
	proc sort data=_brr_data; by &id.; run;

	data &outlib..&outname.;
		merge &input_data.
					_brr_data (keep = &id. RepWt_0-RepWt_&num_reps.);
		by &id.;
	run;
%mend brr_weights;


/*
Generate bootstrap weights for a survey dataset

Description:

	This macro creates bootstrap weights for variance estimation in survey data.
	
Parameters:

	- input_data: A SAS data set.
	- id: Variable that identifies each subject.
	- strata: Variable that identifies the stratum that an observation is in.
	- psu: Variable that identifies which PSU within a stratum an observation is in. 
	- cell: Variable that identifies which post-stratification cell that an observation is in.
					Used to perform post-stratification adjustment of replicate weights.
					If blank, no post-stratification adjustment is performed. (default = blank)
	- weight: Base weighting variable for the dataset.
						If blank, all observations are treated as having an equal weight of 1. (default = blank)
	- num_reps: Number of bootstrap replicate weights. (default = 200)
	- boot_seed: Positive integer starting seed for the random number generator.
							 If blank, uses a randomly generated integer from 1 to 2*10^7, inclusive. (default = blank)
	- outlib: The SAS library to store output datasets. (default = WORK)
	- outname: The name of the output dataset.
	
Output:

	The following SAS data set is created in outlib.
	
		outname: A SAS data set that has all of the columns of input_data plus the following columns:
						 - RepWt_0: The base weight.
						 - RepWt_1-RepWt_N: Bootstrap replicate weights for replicates 1 to N. 
						 - seed: The random number generator seed used to generate the weights.
						 
						 Weights are integerized for compatibility with modeling functions.
						 										
About Bootstrap:

	Bootstrapping is a resampling technique that can be
	used for variance estimation in survey data. Each bootstrap replicate
	weight set is equivalent to sampling the primary sampling units (PSUs) in
	the original dataset with replacement. The strata can have an arbitrary
	number of PSUs within them. Often, each individual subject is treated as
	its own PSU. Many replicates need to be performed to provide accurate
	estimates of the variance. In practice, this is generally around 200-500
	replicates. The ideal number can vary depending on the dataset, and survey
	datasets will sometimes specify how many bootstrap replicates are
	recommended.
	
Post-Stratification:

	The design of a survey study usually will have
	expected proportions that different groups should have in the full
	population. Post-stratification adjustment is used to adjust observation
	weights so that the influence of each group (called a post-stratification
	cell) in an analysis is proportional to the group's actual proportion in
	the population.
*/
%macro boot_weights(input_data=,
										id=,
										strata=,
										psu=,
										cell=,
										weight=,
										num_reps=200,
										boot_seed=,
										outlib=WORK,
										outname=);

	/*
	Builds bootstrap replicate weights with PROC SURVEYMEANS and writes
	input_data plus RepWt_0-RepWt_&num_reps. and the column seed to
	&outlib..&outname.

	Side effects: input_data is sorted IN PLACE by id. The single-level data
	set _boot_data is created, and _control when cell is given. They are not
	deleted afterwards.
	*/

	**Draw a seed from 1 to 2*10^7 when the caller left boot_seed blank;
	%if &boot_seed. = %str() %then %do;

		data _NULL_;

			call symputx("boot_seed", int(rand("uniform", 0, 1)*20000000 + 1));
		run;
	%end;

	**Keep the first row of each subject and attach the base weight;
	proc sort data=&input_data.;
		by &id.;
	run;

	data _boot_data;
		set &input_data.;
		by &id.;

		if first.&id.;

		/* PROC SURVEYMEANS needs an analysis variable; its value is unused. */
		_placeholder = 1;

		%if &weight. = %str() %then %do;
			RepWt_0 = 1;
		%end;
		%else %do;
			RepWt_0 = &weight.;
		%end;
	run;

	**Control totals: sum of the base weight within each cell;
	%if &cell. ^= %str() %then %do;

		proc sort data=_boot_data;
			by &cell.;
		run;

		data _control (keep = &cell. _PSTOTAL_);
			set _boot_data;
			by &cell.;

			retain _PSTOTAL_;

			if first.&cell. then _PSTOTAL_ = 0;

			/* Plain addition so a missing base weight propagates to the total. */
			_PSTOTAL_ = _PSTOTAL_ + RepWt_0;

			if last.&cell.;
		run;
	%end;

	**Generate the replicate weights;
	proc sort data=_boot_data;
		by &strata. &psu.;
	run;

	proc surveymeans data=_boot_data
									 varmethod=bootstrap(outweights=_boot_data reps=&num_reps. seed=&boot_seed.)
									 noprint;
		var _placeholder;
		strata &strata.;
		cluster &psu.;
		weight RepWt_0;
		%if &cell. ^= %str() %then %do;
			poststrata &cell. / PSTOTAL=_control;
		%end;
	run;

	/* integerize_weights is not defined in this file; presumably it comes
	   from the include at the top of the file (TODO confirm). */
	%integerize_weights(weight_data=_boot_data,
											weights=RepWt_0-RepWt_&num_reps.,
											cell=&cell.);

	**Attach the weights to every row of the input and record the seed;
	proc sort data=_boot_data;
		by &id.;
	run;

	data &outlib..&outname.;
		merge &input_data.
					_boot_data (keep = &id. RepWt_0-RepWt_&num_reps.);
		by &id.;

		seed = &boot_seed.;
	run;
%mend boot_weights;

