#Summarizes weighted means of variables in dataset.
summary_means <- function(input.data,
                          variables=NULL,
                          row.subset=NULL,
                          population.name="All",
                          weight=NULL) {

  input.data <- as.data.frame(input.data)

  if(is.null(weight)) {

    subject.weighting <- rep(1, nrow(input.data))
  } else {

    subject.weighting <- input.data[,weight,drop=TRUE]
  }

  if(is.null(variables)) {

    variables <- names(input.data)
  }
  input.data <- input.data[,variables,drop=FALSE]

  if(!is.null(row.subset)) {

    input.data <- input.data[row.subset,,drop=FALSE]
    subject.weighting <- subject.weighting[row.subset]
  }

  summary.means <- apply(input.data, 2, function(variable) weighted.mean(variable, subject.weighting))

  summary.mean.data <- data.frame(population=population.name,
                                  variable=variables,
                                  statistic="Mean",
                                  value=summary.means,
                                  row.names=NULL)

  return(summary.mean.data)
}

#Summarizes weighted quantiles of variables in dataset.
summary_quantiles <- function(input.data,
                              variables=NULL,
                              row.subset=NULL,
                              population.name="All",
                              weight=NULL,
                              quantiles=c(0.05,0.25,0.5,0.75,0.95)) {

  input.data <- as.data.frame(input.data)

  if(is.null(weight)) {

    subject.weighting <- rep(1, nrow(input.data))
  } else {

    subject.weighting <- input.data[,weight,drop=TRUE]
  }

  if(is.null(variables)) {

    variables <- names(input.data)
  }
  input.data <- input.data[,variables,drop=FALSE]

  if(!is.null(row.subset)) {

    input.data <- input.data[row.subset,,drop=FALSE]
    subject.weighting <- subject.weighting[row.subset]
  }

  summary.quantiles <- apply(input.data, 2, function(variable) weighted_quantiles(variable, subject.weighting, quantiles))

  summary.quantile.data <- data.frame(population=population.name,
                                      variable=rep(variables, each=length(quantiles)),
                                      statistic=rownames(summary.quantiles),
                                      value=as.vector(summary.quantiles),
                                      row.names=NULL)

  return(summary.quantile.data)
}

#Summarizes proportions of observations in dataset above/below thresholds.
summary_proportions <- function(input.data,
                                row.subset=NULL,
                                population.name="All",
                                weight=NULL,
                                lower.thresholds=NULL,
                                upper.thresholds=NULL) {

  input.data <- as.data.frame(input.data)

  if(is.null(weight)) {

    subject.weighting <- rep(1, nrow(input.data))
  } else {

    subject.weighting <- input.data[,weight,drop=TRUE]
  }

  if(!is.null(row.subset)) {

    input.data <- input.data[row.subset,,drop=FALSE]
    subject.weighting <- subject.weighting[row.subset]
  }

  if(!is.null(lower.thresholds)) {

    lower.threshold.variables <- names(lower.thresholds)
    lower.threshold.values <- unlist(lower.thresholds, use.names=FALSE)
    num.lower <- length(lower.thresholds)

    lower.proportions <- numeric(num.lower)
    for(i in 1:num.lower) {

      lower.proportions[i] <- weighted.mean(input.data[,lower.threshold.variables[i]] > lower.threshold.values[i], subject.weighting)
    }

    summary.proportions.lower <- data.frame(population=population.name,
                                            variable=paste0(lower.threshold.variables, " > ", lower.threshold.values),
                                            statistic="Proportion",
                                            value=lower.proportions)
  }

  if(!is.null(upper.thresholds)) {

    upper.threshold.variables <- names(upper.thresholds)
    upper.threshold.values <- unlist(upper.thresholds, use.names=FALSE)
    num.upper <- length(upper.thresholds)

    upper.proportions <- numeric(num.upper)
    for(i in 1:num.upper) {

      upper.proportions[i] <- weighted.mean(input.data[,upper.threshold.variables[i]] < upper.threshold.values[i], subject.weighting)
    }

    summary.proportions.upper <- data.frame(population=population.name,
                                            variable=paste0(upper.threshold.variables, " < ", upper.threshold.values),
                                            statistic="Proportion",
                                            value=upper.proportions)
  }

  if(!is.null(lower.thresholds) && !is.null(upper.thresholds)) {

    summary.proportions <- rbind(summary.proportions.lower, summary.proportions.upper)
  } else if(!is.null(lower.thresholds)) {

    summary.proportions <- summary.proportions.lower
  } else if(!is.null(upper.thresholds)) {

    summary.proportions <- summary.proportions.upper
  } else {

    summary.proportions <- NULL
  }

  return(summary.proportions)
}



#' Summarize coefficients of a regression model
#'
#' @description A utility that extracts the coefficients of a model object and
#'   puts them into a data frame. The output data frame is structured so that
#'   the summary statistic values are in a single column to make variance
#'   calculation from replicate methods more efficient (for an example, see the
#'   regression calibration vignette).
#'
#' @details This function is compatible with models with class `lm`, `glm`, or
#'   `mlm`. This includes models output by [stats::lm()] and [stats::glm()] as
#'   well as extensions of those models such as from the 'survey' and 'mfp'
#'   packages.
#'
#'
#' @param model A model object. See Details for compatible object classes.
#' @param population.name A character string to identify a population. Included
#'   as a column in the output so that populations can be distinguished when
#'   output datasets are combined. (default = `"All"`)
#'
#' @returns A data frame with the following columns:
#' * population: The name of the population given by `population.name`.
#' * variable: The name of the response variable.
#' * statistic: A character string with the name of the covariate.
#' * value: The value of the model coefficient.
#'
#' @export
#'
#' @examples
#' #subset NHANES data
#' nhanes.subset <- nhcvd[nhcvd$SDMVSTRA %in% c(48, 60, 72),]
#'
#' #Use NCI method to simulate usual intakes of potassium
#' boxcox.lambda.data <- boxcox_survey(input.data=nhanes.subset,
#'                                     row.subset=(nhanes.subset$DAY == 1),
#'                                     variable="TPOTA",
#'                                     id="SEQN",
#'                                     repeat.obs="DAY",
#'                                     weight="WTDRD1")
#'
#' minimum.amount.data <- calculate_minimum_amount(input.data=nhanes.subset,
#'                                                 row.subset=(nhanes.subset$DAY == 1),
#'                                                 daily.variables="TPOTA")
#'
#' pre.mcmc.data <- nci_multivar_preprocessor(input.data=nhanes.subset,
#'                                            daily.variables="TPOTA",
#'                                            boxcox.lambda.data=boxcox.lambda.data,
#'                                            minimum.amount.data=minimum.amount.data)
#'
#' #for regression calibration, post-MCMC random effect ('u') draws are saved
#' mcmc.output <- nci_multivar_mcmc(pre.mcmc.data=pre.mcmc.data,
#'                                  id="SEQN",
#'                                  weight="WTDRD1",
#'                                  repeat.obs="DAY",
#'                                  daily.variables="TPOTA",
#'                                  num.mcmc.iterations=1000,
#'                                  num.burn=500,
#'                                  num.thin=1,
#'                                  num.post=250)
#'
#' mcmc.input.data <- pre.mcmc.data$mcmc.input
#' population.base <- mcmc.input.data[!duplicated(mcmc.input.data$SEQN),]
#'
#' #use post-MCMC random effect ('u') draws instead of simulating new ones
#' distrib.output <- nci_multivar_distrib(multivar.mcmc.model=mcmc.output,
#'                                        distrib.population=population.base,
#'                                        id="SEQN",
#'                                        weight="WTDRD1",
#'                                        use.mcmc.u.matrices=TRUE,
#'                                        additional.output="BPSY_AVG")
#'
#' #calculate average usual intake per subject
#' regression.data <- aggregate(distrib.output[,"usual.intake.TPOTA",drop=FALSE],
#'                              by=distrib.output[,c("SEQN", "BPSY_AVG"),drop=FALSE],
#'                              mean)
#'
#'
#' #convert usual intake of potassium from milligrams to grams
#' regression.data$usual.intake.TPOTA <- regression.data$usual.intake.TPOTA/1000
#'
#' #fit model of systolic blood pressure as a function of usual potassium intake in grams
#' blood.pressure <- lm(BPSY_AVG ~ usual.intake.TPOTA, data=regression.data)
#'
#' #summary of blood pressure model coefficients
#' blood.pressure.summary <- summary_coefficients(model=blood.pressure)
#' blood.pressure.summary
summary_coefficients <- function(model,
                                 population.name="All") {

  UseMethod("summary_coefficients")
}

#' @rdname summary_coefficients
#' @export
summary_coefficients.mlm <- function(model,
                                     population.name="All") {

  model.coefficients <- coefficients(model)
  covariates <- rownames(model.coefficients)
  responses <- colnames(model.coefficients)

  summary.coefficient.data <- data.frame(population=population.name,
                                         variable=rep(responses, each=length(covariates)),
                                         statistic=paste0("Coefficient for ", rep(covariates, length(responses))),
                                         value=as.vector(model.coefficients),
                                         row.names=NULL)
  return(summary.coefficient.data)
}

#' @rdname summary_coefficients
#' @export
summary_coefficients.glm <- function(model,
                                     population.name="All") {

  summary.coefficient.data <- summary_coefficients.lm(model=model,
                                                      population.name=population.name)
  return(summary.coefficient.data)
}

#' @rdname summary_coefficients
#' @export
summary_coefficients.lm <- function(model,
                                    population.name="All") {

  model.coefficients <- coefficients(model)
  covariates <- names(model.coefficients)
  response <- format(formula(model)[[2]])

  summary.coefficient.data <- data.frame(population=population.name,
                                         variable=response,
                                         statistic=paste0("Coefficient for ", covariates),
                                         value=model.coefficients,
                                         row.names=NULL)
  return(summary.coefficient.data)
}


#' Summarize predictions from a regression model
#'
#' @description A utility that extracts linear predictors from a model object
#'   and puts them into a data frame. The output data frame is structured so
#'   that the summary statistic values are in a single column to make variance
#'   calculation from replicate methods more efficient.
#'
#' @details This function is compatible with models with class `lm`, `glm`, or
#'   `mlm`. This includes models output by [stats::lm()] and [stats::glm()] as
#'   well as extensions of those models such as from the 'survey' and 'mfp'
#'   packages.
#'
#' @param model A model object. See Details for compatible object classes.
#' @param newdata A data frame to use for prediction. Must contain all predictor
#'   variables in `model`.
#' @param id A variable that identifies unique subjects in `newdata`. If `NULL`,
#'   then integers from `1` to `nrow(newdata)` are used. (default = `NULL`)
#' @param population.name A character string to identify a population. Included
#'   as a column in the output so that populations can be distinguished when
#'   output datasets are combined. (default = `"All"`)
#'
#' @returns A data frame with the following columns:
#' * population: The name of the population given by `population.name`.
#' * variable: The name of the response variable.
#' * statistic: A character string with the value of `id` at the observation being predicted.
#' * value: The predicted value.
#'
#' @export
#'
#' @examples
#' #subset NHANES data
#' nhanes.subset <- nhcvd[nhcvd$SDMVSTRA %in% c(48, 60, 72),]
#'
#' #Use NCI method to simulate usual intakes of potassium
#' boxcox.lambda.data <- boxcox_survey(input.data=nhanes.subset,
#'                                     row.subset=(nhanes.subset$DAY == 1),
#'                                     variable="TPOTA",
#'                                     id="SEQN",
#'                                     repeat.obs="DAY",
#'                                     weight="WTDRD1")
#'
#' minimum.amount.data <- calculate_minimum_amount(input.data=nhanes.subset,
#'                                                 row.subset=(nhanes.subset$DAY == 1),
#'                                                 daily.variables="TPOTA")
#'
#' pre.mcmc.data <- nci_multivar_preprocessor(input.data=nhanes.subset,
#'                                            daily.variables="TPOTA",
#'                                            boxcox.lambda.data=boxcox.lambda.data,
#'                                            minimum.amount.data=minimum.amount.data)
#'
#' #for regression calibration, post-MCMC random effect ('u') draws are saved
#' mcmc.output <- nci_multivar_mcmc(pre.mcmc.data=pre.mcmc.data,
#'                                  id="SEQN",
#'                                  weight="WTDRD1",
#'                                  repeat.obs="DAY",
#'                                  daily.variables="TPOTA",
#'                                  num.mcmc.iterations=1000,
#'                                  num.burn=500,
#'                                  num.thin=1,
#'                                  num.post=250)
#'
#' mcmc.input.data <- pre.mcmc.data$mcmc.input
#' population.base <- mcmc.input.data[!duplicated(mcmc.input.data$SEQN),]
#'
#' #use post-MCMC random effect ('u') draws instead of simulating new ones
#' distrib.output <- nci_multivar_distrib(multivar.mcmc.model=mcmc.output,
#'                                        distrib.population=population.base,
#'                                        id="SEQN",
#'                                        weight="WTDRD1",
#'                                        use.mcmc.u.matrices=TRUE,
#'                                        additional.output="BPSY_AVG")
#'
#' #calculate average usual intake per subject
#' regression.data <- aggregate(distrib.output[,"usual.intake.TPOTA",drop=FALSE],
#'                              by=distrib.output[,c("SEQN", "BPSY_AVG"),drop=FALSE],
#'                              mean)
#'
#' #fit model of systolic blood pressure as a function of usual potassium intake in grams
#' blood.pressure <- lm(BPSY_AVG ~ usual.intake.TPOTA, data=regression.data)
#'
#' #summary of blood pressure predictions for the first 10 subjects
#' blood.pressure.predictions <- summary_predictions(model=blood.pressure,
#'                                                   newdata=regression.data[1:10,],
#'                                                   id="SEQN")
#' blood.pressure.predictions
summary_predictions <- function(model,
                                newdata,
                                id=NULL,
                                population.name="All") {

  UseMethod("summary_predictions")
}

#' @rdname summary_predictions
#' @export
summary_predictions.mlm <- function(model,
                                    newdata,
                                    id=NULL,
                                    population.name="All") {

  model.predictions <- predict(model, newdata=newdata)
  responses <- colnames(model.predictions)

  if(!is.null(id)) {

    subjects <- as.character(newdata[,id,drop=TRUE])
  } else {

    id <- "Observation"
    subjects <- as.character(seq_len(nrow(newdata)))
  }

  summary.prediction.data <- data.frame(population=population.name,
                                        variable=rep(responses, each=length(subjects)),
                                        statistic=paste0("Prediction for ", id, " = ", rep(subjects, length(responses))),
                                        value=as.vector(model.predictions),
                                        row.names=NULL)
  return(summary.prediction.data)
}

#' @rdname summary_predictions
#' @export
summary_predictions.glm <- function(model,
                                    newdata,
                                    id=NULL,
                                    population.name="All") {

  summary.prediction.data <- summary_predictions.lm(model=model,
                                                    newdata=newdata,
                                                    id=id,
                                                    population.name=population.name)
  return(summary.prediction.data)
}

#' @rdname summary_predictions
#' @export
summary_predictions.lm <- function(model,
                                   newdata,
                                   id=NULL,
                                   population.name="All") {

  model.predictions <- predict(model, newdata=newdata)
  response <- format(formula(model)[[2]])

  if(!is.null(id)) {

    subjects <- as.character(newdata[,id,drop=TRUE])
  } else {

    id <- "Observation"
    subjects <- as.character(seq_len(nrow(newdata)))
  }

  summary.prediction.data <- data.frame(population=population.name,
                                        variable=response,
                                        statistic=paste0("Prediction for ", id, " = ", subjects),
                                        value=model.predictions,
                                        row.names=NULL)
  return(summary.prediction.data)
}


#' Differences Between Two Populations
#'
#' @description Calculates the differences in summary statistics between two
#'   populations.
#'
#' @details This utility function is designed to work with the output of the
#'   other summary functions in the `ncimultivar` package. Differences will be
#'   calculated for rows in the two input populations that have matching
#'   `variable` and `statistic` columns. Differences will not be calculated for
#'   rows in either population with no match.
#'
#'
#' @param population1,population2 Data frames of populations to compare. Must
#'   have the following columns:
#' * population: Name of the population.
#' * variable: Name of the variable being summarized.
#' * statistic: Name of the summary statistic.
#' * value: Value of the summary statistic.
#'
#' @returns A data frame with the following columns:
#' * population: Name of the population difference, composed from `population` columns of `population1` and `population2`.
#' * variable: Name of the variable being summarized.
#' * statistic: Name of the summary statistic.
#' * value: Difference between the `value` columns of `population1` and `population2`.
#'
#' @export
#'
#' @examples
#' #Subset NHANES data
#' nhanes.subset <- nhcvd[nhcvd$SDMVSTRA %in% c(48, 60, 72),]
#'
#' #Use NCI method to simulate usual intakes of sodium using sex as a covariate
#' boxcox.lambda.data <- boxcox_survey(input.data=nhanes.subset,
#'                                     row.subset=(nhanes.subset$DAY == 1),
#'                                     variable="TSODI",
#'                                     id="SEQN",
#'                                     repeat.obs="DAY",
#'                                     weight="WTDRD1",
#'                                     covariates="RIAGENDR")
#'
#' minimum.amount.data <- calculate_minimum_amount(input.data=nhanes.subset,
#'                                                 row.subset=(nhanes.subset$DAY == 1),
#'                                                 daily.variables="TSODI")
#'
#' pre.mcmc.data <- nci_multivar_preprocessor(input.data=nhanes.subset,
#'                                            daily.variables="TSODI",
#'                                            boxcox.lambda.data=boxcox.lambda.data,
#'                                            minimum.amount.data=minimum.amount.data)
#'
#' mcmc.output <- nci_multivar_mcmc(pre.mcmc.data=pre.mcmc.data,
#'                                  id="SEQN",
#'                                  weight="WTDRD1",
#'                                  repeat.obs="DAY",
#'                                  daily.variables="TSODI",
#'                                  default.covariates="RIAGENDR",
#'                                  num.mcmc.iterations=1000,
#'                                  num.burn=500,
#'                                  num.thin=1)
#'
#' mcmc.input.data <- pre.mcmc.data$mcmc.input
#' population.base <- mcmc.input.data[!duplicated(mcmc.input.data$SEQN),]
#'
#' distrib.output <- nci_multivar_distrib(multivar.mcmc.model=mcmc.output,
#'                                        distrib.population=population.base,
#'                                        id="SEQN",
#'                                        weight="WTDRD1",
#'                                        num.simulated.u=100,
#'                                        additional.output="RIAGENDR")
#'
#' #summary statistics for male and female subjects
#' summary.male <- nci_multivar_summary(input.data=distrib.output,
#'                                      row.subset=(distrib.output$RIAGENDR == 0),
#'                                      weight="WTDRD1",
#'                                      population.name="Male",
#'                                      variables="usual.intake.TSODI")
#' summary.male
#'
#' summary.female <- nci_multivar_summary(input.data=distrib.output,
#'                                        row.subset=(distrib.output$RIAGENDR == 1),
#'                                        weight="WTDRD1",
#'                                        population.name="Female",
#'                                        variables="usual.intake.TSODI")
#' summary.female
#'
#' #calculate differences between statistics in male and female subsets
#' summary.differences <- summary_difference(population1=summary.male,
#'                                           population2=summary.female)
#' summary.differences
summary_difference <- function(population1,
                               population2) {

  names(population1)[names(population1) %in% c("population", "variable", "statistic", "value")] <- c("population1", "variable", "statistic", "value1")
  names(population2)[names(population2) %in% c("population", "variable", "statistic", "value")] <- c("population2", "variable", "statistic", "value2")

  merged.populations <- merge(population1, population2, by=c("variable", "statistic"), sort=FALSE)
  difference.population.name <- paste0(merged.populations$population1, " - ", merged.populations$population2)
  difference.value <- merged.populations$value1 - merged.populations$value2

  summary.difference <- data.frame(population=difference.population.name,
                                   variable=merged.populations$variable,
                                   statistic=merged.populations$statistic,
                                   value=difference.value)

  return(summary.difference)
}
