#Finds the lambda value that corresponds to the Box-Cox transformation that most resembles a normal distribution.
find_best_lambda <- function(input.data,
                             row.subset,
                             variable,
                             covariates,
                             weight,
                             lambda.start,
                             lambda.increment,
                             num.lambdas) {

  if(!is.data.frame(input.data)) {

    input.data <- as.data.frame(input.data)
  }

  #Subset to non-zero values and apply user-specified subset
  if(!is.null(row.subset)) {

    row.subset <- row.subset & (input.data[,variable,drop=TRUE] > 0)
  } else {

    row.subset <- (input.data[,variable,drop=TRUE] > 0)
  }

  #extract variable, covariates, and subject weighting for subset data
  values.subset <- input.data[row.subset,variable,drop=TRUE]

  if(!is.null(covariates)) {

    covariate.matrix.subset <- cbind(1, as.matrix(input.data[row.subset,covariates,drop=FALSE]))
  }

  if(!is.null(weight)) {

    subject.weighting.subset <- input.data[row.subset,weight,drop=TRUE]
  } else {

    subject.weighting.subset <- rep(1, sum(row.subset))
  }

  #Calculate SSE of each lambda's percentiles against the standard normal distribution
  normal.percentiles <- qnorm(seq(0.01,0.99,0.01))
  sum.square.normal <- sum(normal.percentiles^2)

  lambda.grid <- seq(from=lambda.start, by=lambda.increment, length=num.lambdas)
  lambda.sse <- numeric(num.lambdas)
  for(i in seq_len(num.lambdas)) {

    lambda <- lambda.grid[i]

    #Box-Cox transformation
    if(lambda == 0) {

      boxcox.subset <- log(values.subset)
    } else {

      boxcox.subset <- (values.subset^lambda - 1)/lambda
    }

    #Percentiles of Box-Cox transformed variable
    if(is.null(covariates)) {

      #if no covariates are given, calculate percentiles directly
      lambda.percentiles <- weighted_quantiles(boxcox.subset, subject.weighting.subset, seq(0.01,0.99,0.01))
    } else {

      #fit model using Box-Cox transformed variable vs covariates and calculate residuals
      model.fit <- lm.wfit(x=covariate.matrix.subset, y=boxcox.subset, w=subject.weighting.subset)
      boxcox.residuals <- residuals(model.fit)

      #calculate percentiles from residuals
      lambda.percentiles <- weighted_quantiles(boxcox.residuals, subject.weighting.subset, seq(0.01,0.99,0.01))
    }

    #Calculate SSE
    sum.square.lambda <- sum((lambda.percentiles - mean(lambda.percentiles))^2)
    sum.cross.lambda.normal <- sum((lambda.percentiles - mean(lambda.percentiles))*normal.percentiles)
    lambda.sse[i] <- sum.square.normal - (sum.cross.lambda.normal^2)/sum.square.lambda
  }


  #Return the lambda that corresponds to the minimum SSE
  selected.lambda <- lambda.grid[which.min(lambda.sse)]
  return(selected.lambda)
}


#Finds suggested Winsorized values for a Box-Cox transformed variable.
find_suggested_winsorization <- function(input.data,
                                         row.subset,
                                         lambda,
                                         variable,
                                         is.episodic,
                                         id,
                                         repeat.obs,
                                         covariates,
                                         weight,
                                         iqr.multiple,
                                         print.report) {

  if(!is.data.frame(input.data)) {

    input.data <- as.data.frame(input.data)
  }

  #Subset to non-zero values and apply user-specified subset
  if(!is.null(row.subset)) {

    row.subset <- row.subset & (input.data[,variable,drop=TRUE] > 0)
  } else {

    row.subset <- (input.data[,variable,drop=TRUE] > 0)
  }

  #extract variable and covariates for full and subset data
  values <- input.data[,variable,drop=TRUE]
  values.subset <- values[row.subset]

  if(!is.null(covariates)) {

    covariate.matrix <- cbind(1, as.matrix(input.data[,covariates,drop=FALSE]))
    covariate.matrix.subset <- covariate.matrix[row.subset,,drop=FALSE]
  }

  #extract weighting for subset observations
  if(!is.null(weight)) {

    subject.weighting.subset <- input.data[row.subset,weight,drop=TRUE]
  } else {

    subject.weighting.subset <- rep(1, sum(row.subset))
  }

  #Perform Box-Cox transformation on subset variable values
  if(lambda == 0) {

    boxcox.subset <- log(values.subset)
  } else {

    boxcox.subset <- (values.subset^lambda - 1)/lambda
  }

  #Find outlier thresholds for Box-Cox transformed variable
  if(is.null(covariates)) {

    #calculate low and high thresholds for Winsorization
    percentiles <- weighted_quantiles(boxcox.subset, subject.weighting.subset, c(0.25, 0.75))
    iqr <- percentiles["75%"] - percentiles["25%"]

    boxcox.low <- percentiles["25%"] - iqr.multiple*iqr
    boxcox.high <- percentiles["75%"] + iqr.multiple*iqr

  } else {

    #Fit linear model with specified covariates and take residuals
    model.fit <- lm.wfit(x=covariate.matrix.subset, y=boxcox.subset, w=subject.weighting.subset)
    boxcox.residuals <- residuals(model.fit)

    #calculate low and high residual thresholds
    percentiles <- weighted_quantiles(boxcox.residuals, subject.weighting.subset, c(0.25, 0.75))
    iqr <- percentiles["75%"] - percentiles["25%"]

    residual.low <- percentiles["25%"] - iqr.multiple*iqr
    residual.high <- percentiles["75%"] + iqr.multiple*iqr

    #predict Winsorization thresholds for full data
    beta <- coefficients(model.fit)
    estimated <- model.fit$qr$pivot[seq_len(model.fit$rank)]
    pred <- as.vector(covariate.matrix[,estimated] %*% beta[estimated])

    boxcox.low <- pred + residual.low
    boxcox.high <- pred + residual.high
  }

  #transform Box-Cox outlier thresholds back to original scale
  if(lambda == 0) {

    value.low <- exp(boxcox.low)
    value.high <- exp(boxcox.high)
  } else {

    value.low <- ifelse(boxcox.low >= -1/lambda,
                        (lambda*boxcox.low + 1)^(1/lambda),
                        0)
    value.high <- ifelse(boxcox.high >= -1/lambda,
                         (lambda*boxcox.high + 1)^(1/lambda))
  }


  #Find Winsorized values for all observations based on thresholds
  outlier.low <- (values < value.low)
  outlier.high <- (values > value.high)

  if(is.episodic) {

    #For episodic variables, Winsorize values below the lower threshold to zero (non-consumption day)
    winsorized.value <- outlier.low*0 + outlier.high*value.high
  } else {

    winsorized.value <- outlier.low*value.low + outlier.high*value.high
  }

  #Create report of Winsorized outlier values
  winsorization.report <- data.frame(input.data[,id],
                                     input.data[,repeat.obs],
                                     input.data[,variable],
                                     winsorized.value)
  winsorization.report <- winsorization.report[(outlier.low | outlier.high),]
  names(winsorization.report) <- c(id, repeat.obs, variable, paste0(variable, ".winsorized"))

  #Non-consumption days for episodic variables are not Winsorized
  if(is.episodic) {

    winsorization.report <- winsorization.report[winsorization.report[,variable] > 0,]
  }

  if(print.report) {

    cat(paste0("Outliers and Suggested Winsorized values for ", variable, "\n"))
    print.data.frame(winsorization.report, row.names=FALSE)
    cat("\n")
  }

  return(winsorization.report)
}


#Finds subjects with significantly high variance among repeated observations using an F-test.
find_influential_subjects <- function(input.data,
                                      row.subset,
                                      lambda,
                                      variable,
                                      weight,
                                      id,
                                      repeat.obs,
                                      alpha,
                                      multiple.test,
                                      print.report) {

  if(!is.data.frame(input.data)) {

    input.data <- as.data.frame(input.data)
  }

  #Remove non-consumption days
  #This is needed because the Box-Cox transformation is only valid for positive values
  input.data <- input.data[input.data[,variable,drop=TRUE] > 0, ,drop=FALSE]

  #transpose data to wide format (one record per subject)
  subject.data <- reshape(data=input.data[,c(id, repeat.obs, variable, weight)],
                          direction="wide",
                          idvar=id,
                          timevar=repeat.obs,
                          v.names=variable)

  observation.variables <- attr(subject.data, "reshapeWide")$varying[1,]

  #subset to subjects with 2 or more observations
  num.observations <- apply(subject.data[,observation.variables,drop=FALSE], 1, function(x) length(x[!is.na(x)]))

  subject.data <- subject.data[num.observations > 1,]
  num.observations <- num.observations[num.observations > 1]

  #Perform Box-Cox transformation
  values <- subject.data[,observation.variables,drop=FALSE]
  if(lambda == 0) {

    boxcox <- log(values)
  } else {

    boxcox <- (values^lambda - 1)/lambda
  }

  #extract subject weighting
  if(!is.null(weight)) {

    subject.weighting <- subject.data[,weight,drop=TRUE]
  } else {

    subject.weighting <- rep(1, nrow(subject.data))
  }

  #F-test of each subject's variance against the mean of the other subjects
  variance.subject <- apply(boxcox, 1, function(x) var(x, na.rm=TRUE))
  df.subject <- num.observations - 1

  variance.weighting <- subject.weighting*df.subject
  variance.mean <- (sum(variance.subject*variance.weighting) - variance.subject*variance.weighting)/(sum(variance.weighting) - variance.weighting)
  df.mean <- sum(df.subject) - df.subject

  f.statistic <- variance.subject/variance.mean
  p.value <- 1 - pf(f.statistic, df.subject, df.mean)

  #Create report of p-values
  influential.subject.report <- data.frame(subject.data[,id],
                                           p.value,
                                           subject.data[,observation.variables])
  names(influential.subject.report) <- c(id, "p", observation.variables)

  #apply multiple test correction (if any)
  corrected.p.value <- p.adjust(p.value, method=multiple.test)

  #subset report to subjects with a corrected p-value less than the threshold alpha
  is.influential <- (corrected.p.value < alpha)
  influential.subject.report <- influential.subject.report[is.influential,]

  #Sort report by p-value
  influential.subject.report <- influential.subject.report[order(influential.subject.report[,"p"]),]

  if(print.report) {

    cat(paste0("Subjects with Influential Variances for ", variable, "\n"))
    print.data.frame(influential.subject.report, row.names=FALSE)
    cat("\n")
  }

  return(influential.subject.report)
}
