#' Clean and prepare data for the MCMC model
#'
#' @description Removes missing and negative values from variables, makes
#'   indicators for episodic variables, and standardizes variables and
#'   covariates.
#'
#' @details For episodic variables, the MCMC requires a separate indicator and
#'   amount to model the probability of consumption and the amount consumed. The
#'   variable values for both indicators and amounts are Box-Cox transformed,
#'   then standardized to a mean of 0 and variance of 2. Continuous covariates
#'   are standardized to a mean of 0 and a variance of 1. This standardization
#'   is done to meet the assumptions of the MCMC laid out in Zhang, et al.
#'   (2011). Indicator variables have the prefix `ind.`, standardized amounts
#'   have the prefix `amt.`, and standardized covariates have the prefix `std.`.
#'
#'
#' @param input.data A data frame.
#' @param episodic.variables Vector of episodic variables.
#' @param episodic.biomarkers Vector of episodic biomarkers.
#' @param daily.variables Vector of daily variables.
#' @param daily.biomarkers Vector of daily biomarkers.
#' @param continuous.covariates Vector of continuous covariates.
#' @param boxcox.lambda.data A data frame with Box-Cox lambda parameters for
#'   each variable. Must contain only the following columns:
#' * variable: Name of the variable.
#' * tran_lambda: Box-Cox transformation parameter to use for the variable.
#' @param minimum.amount.data A data frame with minimum consumption amounts for
#'   each variable. Must contain only the following columns:
#' * variable: Name of the variable.
#' * minamount: Minimum amount consumed for the variable.
#'
#' @returns An `nci.multivar.preprocessor` object containing the following
#'   elements:
#' * mcmc.input: A data frame with all of the columns of `input.data` plus any created indicator variables and standardized variables and covariates.
#' * backtransformation: A data frame with the following columns:
#'   * variable: The name of the variable.
#'   * tran_lambda: The Box-Cox lambda used to transform the variable.
#'   * minamount: The minimum allowed usual intake, defined as half of the smallest non-zero intake in the observed data.
#'   * tran_center: The mean of the Box-Cox transformed variable before standardization.
#'   * tran_scale: The standard deviation of the Box-Cox transformed variable before standardization divided by `sqrt(2)`.
#'   * biomarker: Logical flag of whether the variable is a biomarker assumed to be unbiased on the transformed scale.
#'                If `FALSE`, a bias correction factor will be added and a 9-point approximation will be used for backtransformation.
#'                If `TRUE`, an exact backtransformation will be used with no correction.
#'
#' @export
#'
#' @examples
#' #subset NHANES data
#' nhanes.subset <- nhcvd[nhcvd$SDMVSTRA %in% c(48, 60, 72),]
#'
#' boxcox.sodium <- boxcox_survey(input.data=nhanes.subset,
#'                                row.subset=(nhanes.subset$DAY == 1),
#'                                variable="TSODI",
#'                                id="SEQN",
#'                                repeat.obs="DAY",
#'                                weight="WTDRD1",
#'                                covariates="RIDAGEYR")
#'
#' boxcox.g.whole <- boxcox_survey(input.data=nhanes.subset,
#'                                 row.subset=(nhanes.subset$DAY == 1),
#'                                 variable="G_WHOLE",
#'                                 is.episodic=TRUE,
#'                                 id="SEQN",
#'                                 repeat.obs="DAY",
#'                                 weight="WTDRD1",
#'                                 covariates="RIDAGEYR")
#'
#' boxcox.lambda.data <- rbind(boxcox.sodium, boxcox.g.whole)
#'
#' minimum.amount.data <- calculate_minimum_amount(input.data=nhanes.subset,
#'                                                 row.subset=(nhanes.subset$DAY == 1),
#'                                                 episodic.variables="G_WHOLE",
#'                                                 daily.variables="TSODI")
#'
#' pre.mcmc.data <- nci_multivar_preprocessor(input.data=nhanes.subset,
#'                                            episodic.variables="G_WHOLE",
#'                                            daily.variables="TSODI",
#'                                            continuous.covariates="RIDAGEYR",
#'                                            boxcox.lambda.data=boxcox.lambda.data,
#'                                            minimum.amount.data=minimum.amount.data)
#'
#' #indicator and amount names (columns prefixed with `ind.` or `amt.`)
#' grep("^(ind|amt)[.]", names(pre.mcmc.data$mcmc.input), value=TRUE)
#'
#' #indicator for whole grains
#' head(pre.mcmc.data$mcmc.input$ind.G_WHOLE)
#'
#' #amounts are standardized to mean 0 and variance 2
#' mean(pre.mcmc.data$mcmc.input$amt.TSODI)
#' var(pre.mcmc.data$mcmc.input$amt.TSODI)
#'
#' mean(pre.mcmc.data$mcmc.input$amt.G_WHOLE, na.rm=TRUE)
#' var(pre.mcmc.data$mcmc.input$amt.G_WHOLE, na.rm=TRUE)
#'
#' #backtransformation data
#' pre.mcmc.data$backtransformation
nci_multivar_preprocessor <- function(input.data,
                                      episodic.variables=NULL,
                                      episodic.biomarkers=NULL,
                                      daily.variables=NULL,
                                      daily.biomarkers=NULL,
                                      continuous.covariates=NULL,
                                      boxcox.lambda.data,
                                      minimum.amount.data) {

  # Stage 1: filter the raw input through remove_missing_and_negative() so
  # that only valid records are carried into the later stages.
  valid.records <- remove_missing_and_negative(
    input.data = input.data,
    episodic.variables = episodic.variables,
    episodic.biomarkers = episodic.biomarkers,
    daily.variables = daily.variables,
    daily.biomarkers = daily.biomarkers,
    continuous.covariates = continuous.covariates
  )

  # Stage 2: build the episodic indicators, standardized amounts, and
  # standardized covariates from the filtered records.
  mcmc.ready <- create_standardized_variables(
    input.data = valid.records,
    episodic.variables = episodic.variables,
    episodic.biomarkers = episodic.biomarkers,
    daily.variables = daily.variables,
    daily.biomarkers = daily.biomarkers,
    continuous.covariates = continuous.covariates,
    boxcox.lambda.data = boxcox.lambda.data,
    minimum.amount.data = minimum.amount.data
  )

  # The standardization parameters are read back from the "center.scale"
  # attribute of the stage 2 result.
  standardization.params <- attr(mcmc.ready, "center.scale")

  # Stage 3: combine the Box-Cox lambdas, minimum amounts, and standardization
  # parameters into the back-transformation dataset.
  backtransform <- create_backtransformation_data(
    boxcox.lambda.data = boxcox.lambda.data,
    minimum.amount.data = minimum.amount.data,
    center.scale.data = standardization.params,
    episodic.biomarkers = episodic.biomarkers,
    daily.biomarkers = daily.biomarkers
  )

  # Stage 4: bundle the MCMC input and the back-transformation data into a
  # single S3 object and hand it back to the caller.
  preprocessed <- list(mcmc.input = mcmc.ready,
                       backtransformation = backtransform)
  class(preprocessed) <- "nci.multivar.preprocessor"

  preprocessed
}
