## Indirect inference through prediction - a tutorial

13 July 2018 Indirect inference through prediction - a tutorial

You built a simulation model and now you want to tune its parameters to data. Here, and in this arXiv preprint, I recycle Blum et al. (2013) methods to do just that. It’s a simple strategy that requires no new fancy technique.
The paper comes with code for replication but there is a lot of fluff there which you don’t really need. Here I try to produce a minimum example with no wasted effort.

### The model and its summary statistics

First of all, let’s load some basic packages

library(tidyverse)
library(glmnetUtils)
library(zoo)
library(knitr)

tidyverse and zoo are useful for data manipulation; knitR for printing tables.
The library glmnetUtils is useful for running elastic net regressions. These are fast and linear regularized regressions. They automatically drop uninformative or quasi-collinear coefficients.

Imagine that somebody hands us a complicated black-box agent-based model depending on two parameters $\alpha,\beta$. In reality the model is just an ARIMA(1,1,1), $\alpha$ the AR coefficient and $\beta$ the MA coefficient, but let’s pretend not to know.

complicated_model<-function(n,ar,ma){
#n is number of observations, #ar,ma are the parameters
return(arima.sim(n=n,model=list("ar"=ar,"ma"=ma,order=c(1,1,1))))
}

Run this model once and it’s apparent we aren’t dealing with a stationary model (let alone ergodic)

trial_run<-complicated_model(100,.3,-.2)
autoplot(as.zoo(trial_run))  +
xlab("Steps") + ylab("Simulation Output")

If we observe one run (100 steps) out of this model, how can we obtain its parameters?
First, let’s come up with a bunch of indicators (summary statistics) that describe the output of the model:

1. The trend
2. The average
3. The last value
4. The standard deviation
5. Number of peaks and valleys (see here)
6. The coefficients of an AR(5) fit
7. The first 10 ACF and PACF values
8. Same summary statistics (1-7) for the speed (the $\Delta$ of the simulation output)

Here’s the code to extract them:

# an easy function to find peaks in the data
# grabbed from here: https://github.com/stas-g/findPeaks
find_peaks <- function (x, m = 3){
shape <- diff(sign(diff(x, na.pad = FALSE)))
pks <- sapply(which(shape < 0), FUN = function(i){
z <- i - m + 1
z <- ifelse(z > 0, z, 1)
w <- i + m + 1
w <- ifelse(w < length(x), w, length(x))
if(all(x[c(z : i, (i + 2) : w)] <= x[i + 1])) return(i + 1) else return(numeric(0))
})
pks <- unlist(pks)
pks
}

# this function will take a simulation output and returns all the
# summary statistics of it
extract_summary_statistics<-function(model_run)
{
time<-1:length(model_run) # time steps, for trend
speed<-diff(model_run)
timed<-1:length(speed) # time steps, for trend
return(
unlist(list(
"trend" = coef(lm(model_run~time))[2],
"average" = mean(model_run),
"last_value" = last(model_run),
"stdev" = sd(model_run),
"number_of_peaks" = length(find_peaks(model_run,m=3)),
"number_of_valleys" = length(find_peaks(-model_run,m=3)),
"ar_coefficients" = (ar(model_run,order.max=5,aic=FALSE))$ar, "acf" = (acf(model_run,plot=FALSE))$acf[2:11],
"pacf" = (pacf(model_run,plot=FALSE))$acf[2:11], #basic info about speed (ugly code duplication!) "speed_trend" = coef(lm(speed~timed))[2], "speed_average" = mean(speed), "speed_last_value" = last(speed), "speed_stdev" = sd(speed), "speed_number_of_peaks" = length(find_peaks(speed,m=3)), "speed_number_of_valleys" = length(find_peaks(-speed,m=3)), "speed_ar_coefficients" = (ar(speed,order.max=5,aic=FALSE))$ar,
"speed_acf" = (acf(speed,plot=FALSE))$acf[2:11], "speed_pacf" = (pacf(speed,plot=FALSE))$acf[2:11]
))

)
}

summary_names<-names(extract_summary_statistics(trial_run))

This sums up to 62 base summary statistics. We know immediately most of them are useless, but let’s see if we can automatically figure that out using the technique from the paper.

### Build Training Data-Set

The first step is quite obvious, run the model “many” times (let’s do it 5000 times) changing the parameters and collect all the summary statistics. Let’s assume $\alpha$ (the AR coefficient) and $\beta$ (the MA coefficient) $\in [-.9,.9]$, let’s build a table where each row is a separate, independent simulation and each column is either $\alpha$,$\beta$ or a summary statistic.

SAMPLE_SIZE<-5000 # how many simulations we run
SIMULATION_LENGTH<-100 # how many time steps we let each simulation run

training<-list()
for( i in 1:SAMPLE_SIZE)
{
alpha<-runif(min = -.9,max=.9,n=1)
beta<-runif(min = -.9, max=.9,n=1)
#ugly way to build data-base
training[[i]]<-
c(alpha,beta,
#extract its summary statistics
extract_summary_statistics(
#run random model
complicated_model(
n=SIMULATION_LENGTH,
ar=alpha,
ma=beta
)
)
)

}
#turn list into database and name it properly
training <- do.call("rbind",training)
colnames(training)<- c("alpha","beta",summary_names)
#show results
training<-tbl_df(training)
head(training)
## # A tibble: 6 x 64
##    alpha     beta trend.time  average last_value stdev number_of_peaks
##    <dbl>    <dbl>      <dbl>    <dbl>      <dbl> <dbl>           <dbl>
## 1  0.806  0.574      0.467   -19.1         15.5  19.9               3.
## 2 -0.468  0.655      0.0469    4.57         7.61  2.10             12.
## 3 -0.598  0.847     -0.0182   -3.48        -9.79  3.77             10.
## 4 -0.380  0.00351   -0.0181   -0.0671      -2.06  2.04              9.
## 5 -0.405  0.543     -0.00551  -3.68        -6.77  2.35              8.
## 6 -0.262 -0.642      0.0128   -0.254        2.82  1.24             15.
## # ... with 57 more variables: number_of_valleys <dbl>,
## #   ar_coefficients1 <dbl>, ar_coefficients2 <dbl>,
## #   ar_coefficients3 <dbl>, ar_coefficients4 <dbl>,
## #   ar_coefficients5 <dbl>, acf1 <dbl>, acf2 <dbl>, acf3 <dbl>,
## #   acf4 <dbl>, acf5 <dbl>, acf6 <dbl>, acf7 <dbl>, acf8 <dbl>,
## #   acf9 <dbl>, acf10 <dbl>, pacf1 <dbl>, pacf2 <dbl>, pacf3 <dbl>,
## #   pacf4 <dbl>, pacf5 <dbl>, pacf6 <dbl>, pacf7 <dbl>, pacf8 <dbl>,
## #   pacf9 <dbl>, pacf10 <dbl>, speed_trend.timed <dbl>,
## #   speed_average <dbl>, speed_last_value <dbl>, speed_stdev <dbl>,
## #   speed_number_of_peaks <dbl>, speed_number_of_valleys <dbl>,
## #   speed_ar_coefficients1 <dbl>, speed_ar_coefficients2 <dbl>,
## #   speed_ar_coefficients3 <dbl>, speed_ar_coefficients4 <dbl>,
## #   speed_ar_coefficients5 <dbl>, speed_acf1 <dbl>, speed_acf2 <dbl>,
## #   speed_acf3 <dbl>, speed_acf4 <dbl>, speed_acf5 <dbl>,
## #   speed_acf6 <dbl>, speed_acf7 <dbl>, speed_acf8 <dbl>,
## #   speed_acf9 <dbl>, speed_acf10 <dbl>, speed_pacf1 <dbl>,
## #   speed_pacf2 <dbl>, speed_pacf3 <dbl>, speed_pacf4 <dbl>,
## #   speed_pacf5 <dbl>, speed_pacf6 <dbl>, speed_pacf7 <dbl>,
## #   speed_pacf8 <dbl>, speed_pacf9 <dbl>, speed_pacf10 <dbl>

### Fitting the regressions

Looking only at the observed summary statistics, let’s try to back out the parameters that generated them. We need two regressions:

$\left\{\begin{matrix} \alpha &= \sum a_{\beta,i} S_i + \sum b_{\beta,i} S_{i}^2 + \sum c_{\beta,i,j} S_i S_j \\ \beta &= \sum a_{\gamma,i} S_i + \sum b_{\gamma,i} S_{i}^2 + \sum c_{\gamma,i,j} S_i S_j \end{matrix}\right. \tag{1}$ Where $S$ is the vector of summary statistics (of size 62). We basically are using not just the base summary statistics but also all their squares and cross-products

alpha_formula<-
paste("alpha~(",paste(summary_names,collapse="+"),")^2 + I(",paste(summary_names,collapse="^2)+I("),"^2)")
fit<-
cv.glmnet(formula(alpha_formula),data=training)

beta_formula<-
paste("beta~(",paste(summary_names,collapse="+"),")^2 + I(",paste(summary_names,collapse="^2)+I("),"^2)")
fit_b<-
cv.glmnet(formula(beta_formula),data=training)

We can tell already the parametrization is only okay: the $R^2$ of the regression is 0.7153599 but we can make sure of its quality by predicting out of sample.

### Estimating the parameters

Now, let’s build a testing data-set by running the model 500 times more. Let’s then use the regressions we just built to try and predict the $\alpha$ and $\beta$ from the new model runs without retraining them. Let’s compare the estimates from our regressions against those we obtain by running the arima command in R.

#builds testing data-set
testing<-list()
for( i in 1:500)
{
tryCatch({
alpha<-runif(min = -.9,max=.9,n=1)
beta<-runif(min = -.9, max=.9,n=1)
simulation<- complicated_model(
n=SIMULATION_LENGTH,
ar=alpha,
ma=beta
)
#arima is the command to fit ARIMA models in R
fitt<-arima(simulation,order=c(1,1,1))
testing[[length(testing)+1]]<-
c(alpha,beta,
#extract its summary statistics
extract_summary_statistics(
#run random model
simulation
),
#the testing data frame will also have two more
#columns with the arima fit from R
fitt$coef[1], fitt$coef[2]
)
},
error = function(e) {})
}
#turn list into database and name it properly
testing <- tbl_df(do.call("rbind",testing))
colnames(testing)<- c("alpha","beta",summary_names,"arima_alpha","arima_beta")
#now let's use the regressions we built before
# (without retraining them!)
alpha_predictions<-
data.frame(
#this is the prediction by the GLM fit
prediction=(predict(fit,newdata=testing,s = "lambda.1se"))[,1],
#this is the real alpha
correct=testing$alpha, #this is the alpha from the arima fit arima=testing$arima_alpha
) %>%
gather(method,value,-correct) %>%
mutate(variable="alpha")
#do the same for beta
beta_predictions<-
data.frame(
prediction=(predict(fit_b,newdata=testing,s = "lambda.1se"))[,1],
correct=testing$beta, arima=testing$arima_beta
) %>%
gather(method,value,-correct)%>%
mutate(variable="beta")
#put them in a table
predict_table<-
bind_rows(alpha_predictions,
beta_predictions)

#show prediction quality graphically
ggplot(predict_table,
aes(y=value, x=correct)) +
geom_point(aes(col=method)) +
geom_abline(slope=1,lwd=2,col="red") +
scale_color_discrete(guide=FALSE) +
facet_grid(variable~method,
labeller = as_labeller(
c("arima" = "R arima function",
"prediction" = "Regression",
"alpha" = expression(alpha),
"beta" = expression(beta)
)

)) +
xlab("Real Value") +
ylab("Prediction") +
ggtitle("Out of Sample Estimation")