36EXP3::EXP3(
size_t _K,
double _gamma)
42, one_minus_gamma(1 - _gamma)
43, gamma_over_K(_gamma / static_cast<double>(_K))
49 one_minus_gamma = 1 - _gamma;
50 gamma_over_K = _gamma /
static_cast<double>(K);
55 cerr <<
"STOP IT!! EXP3 should be receiving reward..." << endl;
58 double omega_sum = 0.0;
59 for (
size_t i = 0; i < K; i++) {
60 omega_sum += omega[i];
61 p_values[i] = gamma_over_K;
63 for (
size_t i = 0; i < K; i++) {
64 p_values[i] += (one_minus_gamma * (omega[i]/omega_sum));
66 boost::random::discrete_distribution<> p(p_values.begin(), p_values.end());
73 cerr <<
"STOP IT!! EXP3 should be choosing..." << endl;
76 if (r < 0.0 || r > 1.0) {
77 cerr <<
"STOP IT!! The reward needs to be in [0,1]." << endl;
79 double x = r / p_values[
choice];
83ostream& operator<<(ostream& out,
const EXP3& exp3) {
84 out <<
"Omegas:" << endl;
85 for (
size_t i = 0; i < exp3.K; i++)
86 out << exp3.omega[i] <<
" ";
87 out << endl <<
"p values:" << endl;
88 for (
size_t i = 0; i < exp3.K; i++)
89 out << exp3.p_values[i] <<
" ";
Implementation of the EXP3 algorithm for multiarmed bandits.
size_t choice
Store the last choice made.
bool choose_next
Belt-and-braces: warn if choose/reward happen in the wrong order.
void set_gamma(double)
Reset gamma and associated members to something different.
void reward(double)
Provide reward for the most recent choice.
size_t choose()
Choose using the current state.
static boost::random::mt19937 random_generator
Random source.