A Primer on Bayesian Inference for Accounting Research: 10-fold cross-validation predictions of ERCs

library(tidyverse)
library(cmdstanr)
library(posterior)
library(patchwork)
library(rsample)
source("00-utils.R")
kable <- knitr::kable
theme_set(theme_prodgray())

1. Introduction and preliminaries

This markdown file contains all the code necessary to replicate the cross-validation prediction test from Section 3 (Heterogeneity in earnings response coefficients) of the paper "What Can Bayesian Inference Do for Accounting Research?". All the code can also be found in the repo, which includes 00-utils.R, a file with a few helper functions for graphs and tables.

Note: I used the newer cmdstanr package instead of the older rstan package because it is likely the future of the R-based Stan ecosystem. I also really like its API, which is very close to the API of the pystan package. An additional advantage (I hope) is thus that most model-fitting code should be more or less directly transferable to pystan for those who want to work in Python. Installing cmdstanr used to be tricky at times because one needs a working C++ toolchain, but it is much smoother now. Please see the cmdstanr documentation for installation instructions.

2. Loading the data

The data used here is generated by the 02-create-ERC-sample.R script found in the repo. Here, we just load it and apply some last-minute transformations, namely de-meaning the abnormal event returns and the earnings surprises.

# Read the announcement-return sample and, in the same pipeline, add the
# de-meaned return (`ret_dm`) and de-meaned earnings surprise (`earn_surp_dm`).
# Both means are taken over all rows of the loaded sample.
ea_data <-
  arrow::read_parquet("../data/ea-event-returns.pqt") |>
  mutate(
    ret_dm = AbEvRet - mean(AbEvRet),
    earn_surp_dm = earn_surp - mean(earn_surp)
  )
# Show the first rows as a quick sanity check.
head(ea_data)

# A tibble: 6 x 15
  ticker permno fpend_date ea_date    actual_eps median_fcast_eps
  <chr>   <dbl> <date>     <date>          <dbl>            <dbl>
1 001N    14504 2014-12-31 2015-02-09       0.04            0.03 
2 001N    14504 2015-03-31 2015-05-05       0.06           -0.045
3 001N    14504 2015-06-30 2015-08-05       0.02           -0.015
4 001N    14504 2015-09-30 2015-11-04      -0.02            0.04 
5 002T    14503 2014-06-30 2014-08-07       0.19            0.235
6 002T    14503 2014-09-30 2014-11-04       0.18            0.34 
# ... with 9 more variables: num_forecasts <int>,
#   two_days_bef_ea <date>, Price <dbl>, earn_surp <dbl>,
#   ea_match_date <date>, AbEvRet <dbl>, firm_id <int>, ret_dm <dbl>,
#   earn_surp_dm <dbl>

3. Creating 10-fold CV datasets

This creates the 10 random splits of the sample

# Fix the RNG state so the fold assignment is reproducible, then split the
# sample into 10 cross-validation folds and inspect the result.
set.seed(3597)
folds <- ea_data |> vfold_cv(v = 10)
folds |> glimpse()

Rows: 10
Columns: 2
$ splits <list> [<vfold_split[60624 x 6736 x 67360 x 15]>], [<vfold_~
$ id     <chr> "Fold01", "Fold02", "Fold03", "Fold04", "Fold05", "Fo~

4. OLS OOS predictions

Next, loop through the 10 splits, train the OLS model on each training split, and compute predictions for the holdout part.

# Fit the OLS regression of `ret_dm` on `earn_surp_dm`.
#
# .d: data frame that contains the columns `ret_dm` and `earn_surp_dm`.
#
# Returns the fitted `lm` object. The model is returned directly rather than
# as the value of an assignment, so the result is visible and no unused local
# variable is created.
ols_fit <- function(.d) {
  lm(ret_dm ~ earn_surp_dm, data = .d)
}

# Compute predictions from a fitted model for new observations.
#
# .fit: fitted model object, as returned by `ols_fit()`.
# .nd:  data frame of new observations, passed to `broom::augment()` as
#       `newdata`.
#
# Returns the result of `broom::augment()` (the caller reads its `.fitted`
# column).
#
# Fits on very few observations are rank deficient and make the prediction
# step warn about it. Only that expected warning is muffled; any other
# warning is still reported instead of being silently dropped.
ols_pred <- function(.fit, .nd) {
  withCallingHandlers(
    broom::augment(.fit, newdata = .nd),
    warning = function(w) {
      if (grepl("rank-deficient", conditionMessage(w), fixed = TRUE)) {
        invokeRestart("muffleWarning")
      }
    }
  )
}

# Out-of-sample OLS predictions: for every fold, fit one regression per firm
# on the training split and predict that firm's held-out announcements.
# The number of iterations is taken from `folds` instead of being hard-coded,
# so the loop stays correct if `v` in `vfold_cv()` is changed.
ols_predictions <- vector("list", length = nrow(folds))
for (i in seq_along(ols_predictions)) {
  train_set <- analysis(folds$splits[[i]])
  holdout_set <- assessment(folds$splits[[i]])

  # One row per firm, with that firm's training observations in `data`.
  ea_data_train <-
    train_set |>
    nest(data = -c(ticker, firm_id))

  # One row per firm, with that firm's holdout observations in `test_data`.
  ea_data_test <-
    holdout_set |>
    nest(data = -c(ticker, firm_id)) |>
    rename(test_data = data)

  # The inner join keeps only firms that appear in both the training and the
  # holdout split; holdout rows of firms without training data are dropped.
  data_slice <-
    ea_data_train |>
    inner_join(ea_data_test, by = c("ticker", "firm_id"))

  # The list-columns are called `fit`/`pred` so they do not shadow the
  # `ols_fit()`/`ols_pred()` helper functions inside `mutate()`.
  ols_predictions[[i]] <-
    data_slice |>
    mutate(
      fit = map(data, ols_fit),
      pred = map2(fit, test_data, ols_pred)
    ) |>
    select(ticker, firm_id, pred) |>
    unnest(cols = c(pred)) |>
    select(ticker, firm_id, ea_date, ret_dm, ols_pred = .fitted)
}
# Stack the per-fold predictions and store them on disk.
ols_predictions <- bind_rows(ols_predictions)
write_csv(ols_predictions, file = "../out/results/ols-dump.csv")

5. Bayesian OOS prediction

Do the same for the Bayes model: loop through the 10 splits, train the Bayes model on each training split, and compute predictions for the holdout part.

# Print the Stan program used for the out-of-sample test, one source line per
# output line.
cat(read_lines("../Stan/erc-wkinfo-priors-oos.stan"), sep = "\n")

// Multilevel ERC model with group-specific intercept and slope, weakly
// informative priors, and predictions of the conditional mean for a test sample.
// Integer arrays use the `array[N] int` declaration syntax; the older
// `int x[N]` form is no longer accepted by current Stan releases.
data{
  int<lower=1> N;                          // num obs
  int<lower=1> J;                          // num groups
  int<lower=1> K;                          // num coefficients
  array[N] int<lower=1, upper=J> GroupID;  // GroupID for obs, e.g. FirmID or Industry-YearID
  vector[N] y;                             // Response
  matrix[N, K] x;                          // Predictors (incl. Intercept)

  // data for the oos test
  int<lower=1> N_test;                               // num obs in test sample
  matrix[N_test, K] x_test;                          // Predictors (incl. Intercept)
  array[N_test] int<lower=1, upper=J> GroupID_test;  // GroupID for test sample obs
}
parameters{
  matrix[K, J] z;                  // standard normal sampler
  cholesky_factor_corr[K] L_Omega; // hypprior coefficient correlation
  vector<lower=0>[K] tau;          // hypprior coefficient scales
  vector[K] mu_b;                  // hypprior mean coefficients
  real<lower=0> sigma;             // error-term scale
}
transformed parameters{
  matrix[J, K] b;                  // coefficient vector (one row per group)
  // The multivariate non-centered version:
  b = (rep_matrix(mu_b, J) + diag_pre_multiply(tau,L_Omega) * z)';
}
model{
  to_vector(z) ~ normal(0, 1);
  L_Omega ~ lkj_corr_cholesky(2);
  mu_b[1]  ~ normal(0, 0.1);
  mu_b[2]  ~ normal(0, 40);
  sigma ~ exponential(1.0 / 0.08);   // exp: 0.08 (std (abnormal returns))
  tau[1] ~ exponential(1.0 / 0.1);   // exp: 0.1
  tau[2] ~ exponential(1.0 / 40);    // exp: 40
  y ~ normal(rows_dot_product(b[GroupID] , x), sigma);
}
generated quantities {
  // Conditional mean for each test observation, using its group's coefficients
  vector[N_test] y_pred = rows_dot_product(b[GroupID_test] , x_test);
}

# Build the cmdstanr model object for the Stan program; it is sampled from
# once per fold in the loop below.
model_wkinfo_priors <- cmdstan_model("../Stan/erc-wkinfo-priors-oos.stan")

Beware: the following code chunk can take a long time to run (each of the 10 fits in the log below took roughly 30 minutes).

# Out-of-sample Bayesian predictions: for every fold, fit the multilevel model
# on the training split and summarise the posterior of `y_pred` for each
# holdout observation.
# The number of iterations is taken from `folds` instead of being hard-coded.
bay_predictions <- vector("list", length = nrow(folds))
for (i in seq_along(bay_predictions)) {
  train_set <- analysis(folds$splits[[i]])
  holdout_set <- assessment(folds$splits[[i]])

  # The model is trained on the de-meaned return and surprise, i.e. the same
  # variables the OLS benchmark uses (`ret_dm ~ earn_surp_dm`). The predictions
  # are scored against `ret_dm` later on, so training on the raw `AbEvRet`
  # would make the model predict a different target than the one it is
  # evaluated on.
  input_data <- list(
    N = nrow(train_set),
    # important! Needs to refer to full sample: holdout rows can carry firm
    # ids that are absent from the training split, and every group id must
    # be <= J in the Stan program.
    J = max(ea_data$firm_id),
    K = 2,
    GroupID = train_set$firm_id,
    y = train_set$ret_dm,
    x = as.matrix(data.frame(int = 1, esurp = train_set$earn_surp_dm)),

    N_test = nrow(holdout_set),
    x_test = as.matrix(data.frame(int = 1, esurp = holdout_set$earn_surp_dm)),
    GroupID_test = holdout_set$firm_id
  )

  fit_wkinfo_priors <- model_wkinfo_priors$sample(
    data = input_data,
    iter_sampling = 1000,
    iter_warmup = 1000,
    chains = 4,
    parallel_chains = 4,
    seed = 1234,
    refresh = 1000
  )

  # One summary row per element of y_pred: posterior mean, median, sd and the
  # 5/25/75/95% quantiles.
  posterior_ypred <- summarise_draws(
    fit_wkinfo_priors$draws(c("y_pred")),
    posterior_mean = mean,
    posterior_median = median,
    posterior_sd = sd,
    ~quantile2(., probs = c(0.05, 0.25, 0.75, 0.95))
  )

  # Column-bind the identifiers of the holdout rows with their summaries.
  # This relies on the summary rows being in the order y_pred[1..N_test],
  # which is the row order of `holdout_set` passed in as `x_test`.
  bay_predictions[[i]] <-
    cbind(
      select(holdout_set, ticker, firm_id, ea_date, ret_dm),
      posterior_ypred
    )

  # Free the per-fold objects before the next fit.
  rm(posterior_ypred, train_set, holdout_set, input_data)
}

Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1646.5 seconds.
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1835.0 seconds.
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1850.0 seconds.
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1888.3 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1805.0 seconds.
Total execution time: 1889.0 seconds.
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1857.0 seconds.
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1862.4 seconds.
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1877.6 seconds.
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1884.0 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1870.3 seconds.
Total execution time: 1884.5 seconds.
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1752.9 seconds.
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1844.6 seconds.
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1852.3 seconds.
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1897.3 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1836.8 seconds.
Total execution time: 1897.8 seconds.
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1811.5 seconds.
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1833.4 seconds.
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1858.0 seconds.
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1869.6 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1843.1 seconds.
Total execution time: 1870.0 seconds.
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1656.0 seconds.
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1711.2 seconds.
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1817.6 seconds.
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1855.2 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1760.0 seconds.
Total execution time: 1855.8 seconds.
Running MCMC with 4 parallel chains...

Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1647.9 seconds.
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1805.2 seconds.
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1872.3 seconds.
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1895.3 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1805.2 seconds.
Total execution time: 1895.9 seconds.
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1599.3 seconds.
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1619.7 seconds.
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1662.0 seconds.
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1752.3 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1658.3 seconds.
Total execution time: 1752.7 seconds.
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1818.5 seconds.
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1832.0 seconds.
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1898.6 seconds.
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1959.4 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1877.1 seconds.
Total execution time: 1959.9 seconds.
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1738.1 seconds.
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1795.9 seconds.
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1858.2 seconds.
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1860.0 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1813.1 seconds.
Total execution time: 1860.5 seconds.
Running MCMC with 4 parallel chains...

Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 4 Iteration:    1 / 2000 [  0%]  (Warmup) 
Chain 3 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 3 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 4 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 2 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 2 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 1 Iteration: 1000 / 2000 [ 50%]  (Warmup) 
Chain 1 Iteration: 1001 / 2000 [ 50%]  (Sampling) 
Chain 4 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 4 finished in 1570.6 seconds.
Chain 3 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 3 finished in 1578.9 seconds.
Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 2 finished in 1814.2 seconds.
Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
Chain 1 finished in 1829.1 seconds.

All 4 chains finished successfully.
Mean chain execution time: 1698.2 seconds.
Total execution time: 1829.5 seconds.

# Stack the per-fold Bayesian predictions into one table and store it on disk.
bay_predictions <- bay_predictions |> bind_rows()
bay_predictions |> write_csv(file = "../out/results/bay-dump.csv")

# Reload both prediction dumps from disk, so the tables below are built from
# the stored results.
ols_predictions <- "../out/results/ols-dump.csv" |> read_csv()
bay_predictions <- "../out/results/bay-dump.csv" |> read_csv()

6. Table 2

# Table 2, panel A: MAE, RMSE and R-squared of the Bayesian posterior-mean
# predictions against `ret_dm`, over all holdout observations.
# The three metric functions are applied in turn and their one-row results
# are stacked in the order mae, rmse, rsq.
tab2.A <-
  list(yardstick::mae, yardstick::rmse, yardstick::rsq) |>
  map(\(metric) {
    metric(bay_predictions, truth = ret_dm, estimate = posterior_mean)
  }) |>
  bind_rows() |>
  mutate(
    case = "Bayes (Full sample)",
    .estimate = round(.estimate, 4)
  ) |>
  select(-.estimator) |>
  pivot_wider(names_from = .metric, values_from = .estimate)
# Number of predictions the metrics are based on.
tab2.A$N <- nrow(bay_predictions)
kable(tab2.A)

case	mae	rmse	rsq	N
Bayes (Full sample)	0.0575	0.0768	0.0339	67360

# Attach the OLS prediction to each Bayesian prediction. The inner join on
# ticker and announcement date keeps only observations that have both.
both_predictions <- inner_join(
  x = ungroup(bay_predictions),
  y = select(ols_predictions, ticker, ea_date, ols_pred),
  by = c("ticker", "ea_date")
)

# Table 2, panel B: the same three metrics for the Bayesian and the OLS
# predictions, computed on the common sample in `both_predictions`.
# For each metric the Bayes row comes first and the OLS row second, which is
# the order the `case` labels below rely on.
tab2.B <-
  list(yardstick::mae, yardstick::rmse, yardstick::rsq) |>
  map(\(metric) {
    bind_rows(
      metric(both_predictions, truth = ret_dm, estimate = posterior_mean),
      metric(both_predictions, truth = ret_dm, estimate = ols_pred)
    )
  }) |>
  bind_rows() |>
  mutate(
    case = rep(c("Bayes (OLS sample)", "OLS"), times = 3),
    .estimate = round(.estimate, 4)
  ) |>
  select(-.estimator) |>
  pivot_wider(names_from = .metric, values_from = .estimate)
# Number of predictions the metrics are based on.
tab2.B$N <- nrow(both_predictions)
kable(tab2.B)

case	mae	rmse	rsq	N
Bayes (OLS sample)	0.0575	0.0767	0.0339	67031
OLS	0.0669	0.4048	0.0000	67031

# Stack both panels of Table 2 and store them as one CSV file.
write_csv(bind_rows(tab2.A, tab2.B), "../out/results/tab2.csv")

10-fold cross-validation predictions of ERCs