-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadvi.hpp
639 lines (568 loc) · 24.6 KB
/
advi.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
// Copyright (c) 2014, Stan
// All rights reserved.
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice, this
// list of conditions and the following disclaimer in the documentation and/or
// other materials provided with the distribution.
// * Neither the name of the {organization} nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef STAN_VARIATIONAL_ADVI_HPP
#define STAN_VARIATIONAL_ADVI_HPP
#include <stan/math.hpp>
#include <stan/callbacks/logger.hpp>
#include <stan/callbacks/writer.hpp>
#include <stan/callbacks/stream_writer.hpp>
#include <stan/io/dump.hpp>
#include <stan/services/error_codes.hpp>
#include <stan/variational/print_progress.hpp>
#include <stan/variational/families/normal_fullrank.hpp>
#include <stan/variational/families/normal_meanfield.hpp>
#include <boost/circular_buffer.hpp>
#include <boost/lexical_cast.hpp>
#include <algorithm>
#include <limits>
#include <numeric>
#include <ostream>
#include <vector>
#include <queue>
#include <string>
namespace stan {
namespace variational {
/**
* Automatic Differentiation Variational Inference
*
* Implements "black box" variational inference using stochastic gradient
* ascent to maximize the Evidence Lower Bound for a given model
* and variational family.
*
* @tparam Model class of model
* @tparam Q class of variational distribution
* @tparam BaseRNG class of random number generator
*/
template <class Model, class Q, class BaseRNG>
class advi {
public:
/**
 * Builds an ADVI driver around a model, an initial parameter vector and a
 * random number generator.
 *
 * The model, the parameter vector and the RNG are held by reference, so
 * all three must outlive this object.
 *
 * @param[in] m stan model
 * @param[in,out] cont_params initialization of continuous parameters
 * @param[in,out] rng random number generator
 * @param[in] n_monte_carlo_grad number of samples for gradient computation
 * @param[in] n_monte_carlo_elbo number of samples for ELBO computation
 * @param[in] eval_elbo evaluate ELBO at every "eval_elbo" iters
 * @param[in] n_posterior_samples number of samples to draw from posterior
 * @throw the exception raised by math::check_positive if any of
 *   n_monte_carlo_grad, n_monte_carlo_elbo, eval_elbo or
 *   n_posterior_samples is not positive (earlier documentation named
 *   std::runtime_error -- TODO confirm against stan::math)
 */
advi(Model& m,
     Eigen::VectorXd& cont_params,
     BaseRNG& rng,
     int n_monte_carlo_grad,
     int n_monte_carlo_elbo,
     int eval_elbo,
     int n_posterior_samples)
    : model_(m),
      cont_params_(cont_params),
      rng_(rng),
      n_monte_carlo_grad_(n_monte_carlo_grad),
      n_monte_carlo_elbo_(n_monte_carlo_elbo),
      eval_elbo_(eval_elbo),
      n_posterior_samples_(n_posterior_samples) {
  static const char* fn_name = "stan::variational::advi";

  // Validate every count, in the same order as the parameter list.
  math::check_positive(fn_name,
                       "Number of Monte Carlo samples for gradients",
                       n_monte_carlo_grad);
  math::check_positive(fn_name,
                       "Number of Monte Carlo samples for ELBO",
                       n_monte_carlo_elbo);
  math::check_positive(fn_name,
                       "Evaluate ELBO at every eval_elbo iteration",
                       eval_elbo);
  math::check_positive(fn_name,
                       "Number of posterior samples for output",
                       n_posterior_samples);
}
/**
 * Monte Carlo estimate of the Evidence Lower BOund (ELBO).
 *
 * Draws from the variational distribution, evaluates the model's log
 * density at each draw, averages n_monte_carlo_elbo_ finite evaluations
 * and adds the entropy of the variational distribution.
 *
 * A draw whose evaluation raises std::domain_error (including a
 * non-finite log density, rejected by check_finite) is dropped and
 * re-drawn; it does not count towards the n_monte_carlo_elbo_ total.
 *
 * @param[in] variational variational approximation at which to evaluate
 *   the ELBO.
 * @param logger logger for messages; receives any text the model writes
 *   while evaluating its log density
 * @return the evidence lower bound.
 * @throw std::domain_error once the number of dropped draws reaches
 *   n_monte_carlo_elbo_. This means that the model is severely ill
 *   conditioned or that the variational distribution has somehow
 *   collapsed.
 */
double calc_ELBO(const Q& variational,
                 callbacks::logger& logger) const {
  static const char* function = "stan::variational::advi::calc_ELBO";

  const int dim = variational.dimension();
  Eigen::VectorXd zeta(dim);

  double log_prob_sum = 0.0;
  int n_accepted = 0;
  int n_dropped = 0;

  // Only successful evaluations advance n_accepted.
  while (n_accepted < n_monte_carlo_elbo_) {
    variational.sample(rng_, zeta);
    try {
      std::stringstream model_output;
      const double lp
          = model_.template log_prob<false, true>(zeta, &model_output);
      if (!model_output.str().empty())
        logger.info(model_output);
      stan::math::check_finite(function, "log_prob", lp);
      log_prob_sum += lp;
      ++n_accepted;
    } catch (const std::domain_error&) {
      ++n_dropped;
      if (n_dropped >= n_monte_carlo_elbo_) {
        // Too many failures: give up instead of looping forever.
        const char* name = "The number of dropped evaluations";
        const char* msg1 = "has reached its maximum amount (";
        const char* msg2 = "). Your model may be either severely "
                           "ill-conditioned or misspecified.";
        stan::math::domain_error(function, name, n_monte_carlo_elbo_,
                                 msg1, msg2);
      }
    }
  }

  double elbo = log_prob_sum / n_monte_carlo_elbo_;
  elbo += variational.entropy();
  return elbo;
}
/**
 * Computes the "black box" gradient of the ELBO.
 *
 * Checks that elbo_grad, the variational approximation and the stored
 * parameter vector all have the same dimension, then delegates the
 * gradient computation to the variational family's calc_grad.
 *
 * @param[in] variational variational approximation at which to evaluate
 *   the ELBO.
 * @param[out] elbo_grad gradient of ELBO with respect to variational
 *   approximation.
 * @param logger logger for messages
 * @throw the exception raised by stan::math::check_size_match if the
 *   dimensions disagree
 */
void calc_ELBO_grad(const Q& variational, Q& elbo_grad,
                    callbacks::logger& logger) const {
  static const char* fn_name = "stan::variational::advi::calc_ELBO_grad";

  // Output container must match the approximation ...
  stan::math::check_size_match(
      fn_name,
      "Dimension of elbo_grad", elbo_grad.dimension(),
      "Dimension of variational q", variational.dimension());

  // ... and the approximation must match the model's parameter vector.
  stan::math::check_size_match(
      fn_name,
      "Dimension of variational q", variational.dimension(),
      "Dimension of variables in model", cont_params_.size());

  variational.calc_grad(elbo_grad, model_, cont_params_,
                        n_monte_carlo_grad_, rng_, logger);
}
/**
 * Heuristic grid search to adapt eta to the scale of the problem.
 *
 * Tries the step-size candidates {100, 10, 1, 0.1, 0.01} in that order.
 * For each candidate it runs adapt_iterations steps of stochastic
 * gradient ascent, evaluates the ELBO, and then resets the variational
 * approximation to Q(cont_params_). The search stops early as soon as a
 * candidate does worse than the previous one while the previous one
 * improved on the initial ELBO.
 *
 * @param[in,out] variational initial variational distribution; reset to
 *   Q(cont_params_) after every candidate, including the last one.
 * @param[in] adapt_iterations number of iterations to spend doing stochastic
 *   gradient ascent at each proposed eta value.
 * @param[in,out] logger logger for messages
 * @return adapted (tuned) value of eta via heuristic grid search
 * @throw std::domain_error If either (a) the initial ELBO cannot be
 *   computed at the initial variational distribution, (b) all step-size
 *   proposals in eta_sequence fail.
 */
double adapt_eta(Q& variational,
                 int adapt_iterations,
                 callbacks::logger& logger) const {
  static const char* function = "stan::variational::advi::adapt_eta";

  stan::math::check_positive(function,
                             "Number of adaptation iterations",
                             adapt_iterations);

  logger.info("Begin eta adaptation.");

  // Sequence of eta values to try during adaptation
  const int eta_sequence_size = 5;
  double eta_sequence[eta_sequence_size] = {100, 10, 1, 0.1, 0.01};

  // Initialize ELBO tracking variables
  double elbo = -std::numeric_limits<double>::max();
  double elbo_best = -std::numeric_limits<double>::max();
  double elbo_init;
  try {
    elbo_init = calc_ELBO(variational, logger);
  } catch (const std::domain_error&) {
    const char* name = "Cannot compute ELBO using the initial "
                       "variational distribution.";
    const char* msg1 = "Your model may be either "
                       "severely ill-conditioned or misspecified.";
    stan::math::domain_error(function, name, "", msg1);
  }

  // Variational family to store gradients
  Q elbo_grad = Q(model_.num_params_r());

  // Adaptive step-size sequence
  Q history_grad_squared = Q(model_.num_params_r());
  double tau = 1.0;
  double pre_factor = 0.9;
  double post_factor = 0.1;

  double eta_best = 0.0;
  double eta;
  double eta_scaled;

  bool do_more_tuning = true;
  int eta_sequence_index = 0;
  while (do_more_tuning) {
    // Try next eta
    eta = eta_sequence[eta_sequence_index];

    int print_progress_m;
    for (int iter_tune = 1; iter_tune <= adapt_iterations; ++iter_tune) {
      print_progress_m = eta_sequence_index
                         * adapt_iterations + iter_tune;
      variational
        ::print_progress(print_progress_m, 0,
                         adapt_iterations * eta_sequence_size,
                         adapt_iterations, true, "", "", logger);

      // (ROBUST) Compute gradient of ELBO. It's OK if it diverges.
      // We'll try a smaller eta.
      try {
        calc_ELBO_grad(variational, elbo_grad, logger);
      } catch (const std::domain_error&) {
        elbo_grad.set_to_zero();
      }

      // Update step-size: the first iteration seeds the running average
      // of squared gradients, later iterations blend into it.
      if (iter_tune == 1) {
        history_grad_squared += elbo_grad.square();
      } else {
        history_grad_squared = pre_factor * history_grad_squared
                               + post_factor * elbo_grad.square();
      }
      eta_scaled = eta / sqrt(static_cast<double>(iter_tune));

      // Stochastic gradient update
      variational += eta_scaled * elbo_grad
                     / (tau + history_grad_squared.sqrt());
    }

    // (ROBUST) Compute ELBO. It's OK if it has diverged.
    try {
      elbo = calc_ELBO(variational, logger);
    } catch (const std::domain_error&) {
      elbo = -std::numeric_limits<double>::max();
    }

    // Check if:
    // (1) ELBO at current eta is worse than the best ELBO
    // (2) the best ELBO hasn't gotten worse than the initial ELBO
    if (elbo < elbo_best && elbo_best > elbo_init) {
      // The previous candidate (eta_best) wins; stop searching.
      std::stringstream ss;
      ss << "Success!"
         << " Found best value [eta = " << eta_best
         << "]";
      if (eta_sequence_index < eta_sequence_size - 1)
        ss << (" earlier than expected.");
      else
        ss << ".";
      logger.info(ss);
      logger.info("");
      do_more_tuning = false;
    } else {
      if (eta_sequence_index < eta_sequence_size - 1) {
        // Reset
        elbo_best = elbo;
        eta_best = eta;
      } else {
        // No more eta values to try, so use current eta if it
        // didn't diverge or fail if it did diverge
        if (elbo > elbo_init) {
          // Record the winner BEFORE logging so the message reports the
          // same eta that is returned to the caller.
          eta_best = eta;
          std::stringstream ss;
          ss << "Success!"
             << " Found best value [eta = " << eta_best
             << "].";
          logger.info(ss);
          logger.info("");
          do_more_tuning = false;
        } else {
          const char* name = "All proposed step-sizes";
          const char* msg1 = "failed. Your model may be either "
                             "severely ill-conditioned or misspecified.";
          stan::math::domain_error(function, name, "", msg1);
        }
      }
      // Reset
      history_grad_squared.set_to_zero();
    }
    ++eta_sequence_index;
    // Every candidate starts from the same initial approximation.
    variational = Q(cont_params_);
  }
  return eta_best;
}
/**
* Runs stochastic gradient ascent on the ELBO and monitors convergence.
*
* The active update rule is a hand-written momentum scheme driven by the
* local constants HDM_eps and HDM_delta. The original adaptive step-size
* sequence is kept below as commented-out code.
*
* NOTE(review): with the adaptive sequence commented out, eta is only
* validated (must be positive) and does not influence the update.
*
* Every eval_elbo_ iterations the ELBO is re-estimated and its relative
* change is pushed into a rolling window. The loop stops when both the
* mean and the median of that window fall below tol_rel_obj, or when
* max_iterations is reached.
*
* @param[in,out] variational initial variational distribution; updated
* in place
* @param[in] eta stepsize scaling parameter (currently validated only)
* @param[in] tol_rel_obj relative tolerance parameter for convergence
* @param[in] max_iterations max number of iterations to run algorithm
* @param[in,out] logger logger for messages
* @param[in,out] diagnostic_writer writer for diagnostic information;
* receives one {iteration, elapsed seconds, ELBO} row per ELBO evaluation
* @throw std::domain_error If the ELBO or its gradient is ever
* non-finite, at any iteration (gradient failures come from
* Q::calc_grad, which is not visible here -- TODO confirm)
*/
void stochastic_gradient_ascent(Q& variational,
double eta,
double tol_rel_obj,
int max_iterations,
callbacks::logger& logger,
callbacks::writer& diagnostic_writer)
const {
static const char* function =
"stan::variational::advi::stochastic_gradient_ascent";
// Argument validation.
stan::math::check_positive(function, "Eta stepsize", eta);
stan::math::check_positive(function,
"Relative objective function tolerance",
tol_rel_obj);
stan::math::check_positive(function,
"Maximum iterations",
max_iterations);
// Gradient parameters
Q elbo_grad = Q(model_.num_params_r());
// Momentum accumulator, same family and dimension as the gradient.
// presumably zero-initialised by Q's size constructor -- TODO confirm
Q momentum = Q(model_.num_params_r());
// Step scale used by the momentum update below.
double HDM_eps = 0.2 ;
// Decay applied to the momentum at every iteration.
double HDM_delta = 0.75 ;
// Stepsize sequence parameters
// NOTE(review): history_grad_squared, tau, pre_factor, post_factor and
// eta_scaled are only referenced by the commented-out update below.
Q history_grad_squared = Q(model_.num_params_r());
double tau = 1.0;
double pre_factor = 0.9;
double post_factor = 0.1;
double eta_scaled;
// Initialize ELBO and convergence tracking variables
double elbo(0.0);
double elbo_best = -std::numeric_limits<double>::max();
double elbo_prev = -std::numeric_limits<double>::max();
double delta_elbo = std::numeric_limits<double>::max();
double delta_elbo_ave = std::numeric_limits<double>::max();
double delta_elbo_med = std::numeric_limits<double>::max();
// Heuristic to estimate how far to look back in rolling window:
// 10% of the expected number of ELBO evaluations, but at least 2.
int cb_size
= static_cast<int>(std::max(0.1 * max_iterations / eval_elbo_,
2.0));
boost::circular_buffer<double> elbo_diff(cb_size);
logger.info("Begin stochastic gradient ascent.");
logger.info(" iter"
" ELBO"
" delta_ELBO_mean"
" delta_ELBO_med"
" notes ");
// Timing variables (clock() reports processor time, not wall time)
clock_t start = clock();
clock_t end;
double delta_t;
// Main loop
bool do_more_iterations = true;
for (int iter_counter = 1; do_more_iterations; ++iter_counter) {
// Compute gradient using Monte Carlo integration
calc_ELBO_grad(variational, elbo_grad, logger);
// Disabled: original adaptive step-size sequence and update.
// Update step-size
// if (iter_counter == 1) {
// history_grad_squared += elbo_grad.square();
//} else {
// history_grad_squared = pre_factor * history_grad_squared
// + post_factor * elbo_grad.square();
//}
//eta_scaled = eta / sqrt(static_cast<double>(iter_counter));
// Stochastic gradient update
//
// variational += eta_scaled * elbo_grad
// / (tau + history_grad_squared.sqrt());
// Active update: decay the momentum, subtract the scaled gradient, then
// step against the momentum. The two negations cancel, so the step
// follows the gradient direction (ascent). The step is divided by
// (1 + momentum.square()); presumably elementwise -- TODO confirm.
momentum *= HDM_delta;
momentum += - HDM_eps * HDM_delta * elbo_grad;
variational += - HDM_eps * momentum / (1.0 + momentum.square() ) ;
// Check for convergence every "eval_elbo_"th iteration
if (iter_counter % eval_elbo_ == 0) {
elbo_prev = elbo;
elbo = calc_ELBO(variational, logger);
if (elbo > elbo_best)
elbo_best = elbo;
// Relative change since the previous evaluation, then the mean and
// median over the rolling window.
delta_elbo = rel_difference(elbo, elbo_prev);
elbo_diff.push_back(delta_elbo);
delta_elbo_ave = std::accumulate(elbo_diff.begin(),
elbo_diff.end(), 0.0)
/ static_cast<double>(elbo_diff.size());
delta_elbo_med = circ_buff_median(elbo_diff);
// One progress line, matching the column header logged above.
std::stringstream ss;
ss << " "
<< std::setw(4) << iter_counter
<< " "
<< std::setw(15) << std::fixed << std::setprecision(3)
<< elbo
<< " "
<< std::setw(15) << std::fixed << std::setprecision(5)
<< delta_elbo_ave
<< " "
<< std::setw(15) << std::fixed << std::setprecision(5)
<< delta_elbo_med;
end = clock();
delta_t = static_cast<double>(end - start) / CLOCKS_PER_SEC;
// Diagnostic row: iteration, elapsed seconds, ELBO.
std::vector<double> print_vector;
print_vector.clear();
print_vector.push_back(iter_counter);
print_vector.push_back(delta_t);
print_vector.push_back(elbo);
diagnostic_writer(print_vector);
// Converged only when BOTH the mean and the median are below tolerance.
if (delta_elbo_ave < tol_rel_obj) {
if (delta_elbo_med < tol_rel_obj) {
ss << " ELBO CONVERGED";
do_more_iterations = false;
}
}
// After more than 10 evaluation periods, flag large relative changes.
if (iter_counter > 10 * eval_elbo_) {
if (delta_elbo_med > 0.5 || delta_elbo_ave > 0.5) {
ss << " MAY BE DIVERGING... INSPECT ELBO";
}
}
logger.info(ss);
// On convergence, warn if the final ELBO differs from the best ELBO
// seen by more than 5% (as measured by rel_difference).
if (do_more_iterations == false &&
rel_difference(elbo, elbo_best) > 0.05) {
logger.info("Informational Message: The ELBO at a previous "
"iteration is larger than the ELBO upon "
"convergence!");
logger.info("This variational approximation may not "
"have converged to a good optimum.");
}
}
// Hard stop once the iteration budget is used up.
if (iter_counter == max_iterations) {
logger.info("Informational Message: The maximum number of "
"iterations is reached! The algorithm may not have "
"converged.");
logger.info("This variational approximation is not "
"guaranteed to be meaningful.");
do_more_iterations = false;
}
}
}
/**
 * Runs ADVI and writes to output.
 *
 * Steps: write the diagnostic header, log up to the first 10 initial
 * parameter values, run stochastic gradient ascent from Q(cont_params_),
 * write the mean of the fitted approximation as the first output row,
 * then write n_posterior_samples_ draws from the approximation.
 *
 * The stored parameter vector is overwritten: first with the mean of the
 * approximation and then with each draw.
 *
 * NOTE(review): eta adaptation is currently disabled (see the
 * commented-out block below), so adapt_engaged and adapt_iterations are
 * accepted but ignored.
 *
 * @param[in] eta eta parameter of stepsize sequence
 * @param[in] adapt_engaged boolean flag for eta adaptation (ignored)
 * @param[in] adapt_iterations number of iterations for eta adaptation
 *   (ignored)
 * @param[in] tol_rel_obj relative tolerance parameter for convergence
 * @param[in] max_iterations max number of iterations to run algorithm
 * @param[in,out] logger logger for messages
 * @param[in,out] parameter_writer writer for parameters
 *   (typically to file)
 * @param[in,out] diagnostic_writer writer for diagnostic information
 * @return stan::services::error_codes::OK
 */
int run(double eta, bool adapt_engaged, int adapt_iterations,
        double tol_rel_obj, int max_iterations,
        callbacks::logger& logger,
        callbacks::writer& parameter_writer,
        callbacks::writer& diagnostic_writer) const {
  diagnostic_writer("iter,time_in_seconds,ELBO");

  // Log the first few initial parameter values, one per line. The loop is
  // bounded by the vector's size so models with fewer than 10 parameters
  // do not read past the end of cont_params_.
  std::stringstream init_msg;
  for (int v = 0; v < 10 && v < cont_params_.size(); ++v) {
    init_msg << cont_params_[v] << '\n';
  }
  logger.info(init_msg);

  // Initialize variational approximation
  Q variational = Q(cont_params_);

  // Disabled: eta adaptation.
  // if (adapt_engaged) {
  //   eta = adapt_eta(variational, adapt_iterations, logger);
  //   parameter_writer("Stepsize adaptation complete.");
  //   std::stringstream ss;
  //   ss << "eta = " << eta;
  //   parameter_writer(ss.str());
  // }

  stochastic_gradient_ascent(variational, eta,
                             tol_rel_obj, max_iterations,
                             logger, diagnostic_writer);

  // Write mean of posterior approximation on first output line
  cont_params_ = variational.mean();
  std::vector<double> cont_vector(cont_params_.size());
  for (int i = 0; i < cont_params_.size(); ++i)
    cont_vector.at(i) = cont_params_(i);
  std::vector<int> disc_vector;
  std::vector<double> values;

  std::stringstream msg;
  model_.write_array(rng_, cont_vector, disc_vector, values,
                     true, true, &msg);
  if (msg.str().length() > 0)
    logger.info(msg);

  // Every output row starts with a leading 0 column.
  values.insert(values.begin(), 0);
  parameter_writer(values);

  // Draw more samples from posterior and write on subsequent lines
  logger.info("");
  std::stringstream ss;
  ss << "Drawing a sample of size "
     << n_posterior_samples_
     << " from the approximate posterior... ";
  logger.info(ss);

  for (int n = 0; n < n_posterior_samples_; ++n) {
    variational.sample(rng_, cont_params_);
    for (int i = 0; i < cont_params_.size(); ++i) {
      cont_vector.at(i) = cont_params_(i);
    }
    std::stringstream msg2;
    model_.write_array(rng_, cont_vector, disc_vector, values,
                       true, true, &msg2);
    if (msg2.str().length() > 0)
      logger.info(msg2);
    values.insert(values.begin(), 0);
    parameter_writer(values);
  }
  logger.info("COMPLETED.");
  return stan::services::error_codes::OK;
}
// TODO(akucukelbir): move these things to stan math and test there
/**
 * Compute the median of a circular buffer.
 *
 * The buffer contents are copied into a temporary vector, which is
 * partially ordered with std::nth_element; the buffer is not modified.
 * For an even number of values the upper of the two middle values is
 * returned (element at index size / 2), not their average.
 *
 * @param[in] cb circular buffer with some number of values in it;
 *   must not be empty.
 * @return median of values in circular buffer.
 */
double circ_buff_median(const boost::circular_buffer<double>& cb) const {
  // FIXME: naive implementation; creates a copy as a vector
  std::vector<double> values(cb.begin(), cb.end());
  const size_t mid = values.size() / 2;
  std::nth_element(values.begin(), values.begin() + mid, values.end());
  return values[mid];
}
/**
 * Compute the absolute relative difference between two double values.
 *
 * Returns |(curr - prev) / prev|, except that 1.0 is returned when
 * |curr| < 1e-6.
 *
 * The result is non-negative. The convergence and divergence checks in
 * stochastic_gradient_ascent compare it against positive thresholds, so
 * a signed result would let any negative change count as converged.
 *
 * Note on argument order: the callers in this class pass
 * (elbo, elbo_prev) and (elbo, elbo_best), i.e. the newest value arrives
 * as `prev` and is the denominator, while the older value arrives as
 * `curr`.
 *
 * @param[in] prev reference value (denominator)
 * @param[in] curr value compared against prev
 * @return absolute value of relative difference, or 1.0 if |curr| < 1e-6
 */
double rel_difference(double prev, double curr) const {
  if (std::fabs(curr) < 1e-6)
    return 1.0;
  return std::fabs((curr - prev) / prev);
}
protected:
// Model under inference; bound by reference in the constructor.
Model& model_;
// Caller-owned continuous parameter vector. Used as the initial point and
// overwritten by run() with the approximation's mean and then each draw.
Eigen::VectorXd& cont_params_;
// Caller-owned random number generator used for all sampling.
BaseRNG& rng_;
// Number of Monte Carlo draws passed to Q::calc_grad per gradient.
int n_monte_carlo_grad_;
// Number of finite log-density evaluations averaged per ELBO estimate.
int n_monte_carlo_elbo_;
// The ELBO is evaluated every eval_elbo_ iterations of gradient ascent.
int eval_elbo_;
// Number of draws from the fitted approximation written by run().
int n_posterior_samples_;
};
} // variational
} // stan
#endif