Practical session 2: Bandits¶

Preliminaries¶

Let's start by loading all the required libraries for this notebook.

In [28]:
import time
import numpy as np
from random import choice
from typing import Sequence, Tuple
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(64) # Fixing the seed for reproducibility
import warnings
warnings.filterwarnings('ignore')

Setup¶

In the context of clinical trials, Phase I trials are the first stage of testing in human subjects. Their goal is to evaluate the safety (and feasibility) of the treatment and identify its side effects. The aim of a phase I dose-finding study is to determine the most appropriate dose level that should be used in further phases of the clinical trials. Traditionally, the focus is on determining the highest dose with acceptable toxicity called the Maximum Tolerated Dose (MTD).

A dose-finding study involves a number $K$ of dose levels that have been chosen by physicians based on preliminary experiments ($K$ is usually between 3 and 10). Denoting by $p_k$ the (unknown) toxicity probability of dose $k$, the Maximum Tolerated Dose (MTD) is defined as the dose with a toxicity probability closest to a target:

\begin{align} k^* \in \underset{k\in\{1,\dots,K\}}{\mathrm{argmin}}|\theta - p_k| \end{align}

where $\theta$ is the pre-specified targeted toxicity probability (typically between 0.2 and 0.35). An MTD identification algorithm proceeds sequentially: at round $t$, a dose $D_t \in \{1, \dots , K\}$ is selected and administered to a patient, for whom a toxicity response is observed. A binary outcome $X_t$ is revealed, where $X_t = 1$ indicates that a harmful side-effect occurred and $X_t = 0$ indicates that none occurred. We assume that $X_t$ is drawn from a Bernoulli distribution with mean $p_{D_t}$, independently of previous observations.

Hint: In this example, the reward definition differs slightly from the usual bandit setting: we would like to select the arm with minimum $|\theta - \hat{p}_k|$, where $\hat{p}_k$ is the estimated toxicity probability of arm $k$.
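For instance (an illustrative computation, not part of the provided code): if the counts so far give estimates $\hat{p} = (0.10, 0.40, 0.75)$ with $\theta = 0.3$, the corresponding rewards are $-0.20$, $-0.10$ and $-0.45$, so the second arm currently looks closest to the MTD.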

The MultiArmedBandit environment¶

The Bandit object defined below takes as parameters:

  1. the number of arms (dose levels)
  2. the number of pulls (patients treated)
  3. the actual toxicity probability of each arm
  4. the target toxicity probability $\theta$

Task 1: Define your Bandit class:¶

Most of the class has been written. Complete the pull method so that it:

1. updates both the num_dose_selected and num_toxic arrays,

2. computes and returns the reward $-|\theta - \hat{p}_k|$, where $\hat{p}_k$ is the estimated toxicity probability of arm $k$.

In [29]:
class Bandit(object):

  def __init__(self,
               n_arm: int = 2,
               n_pulls: int = 2000,
               actual_toxicity_prob: Sequence[float] = (0.4, 0.6),
               theta: float = 0.3,
               ):
    self.n_arm = n_arm
    self.n_pulls = n_pulls
    self.actual_toxicity_prob = actual_toxicity_prob
    self.theta = theta
    self.init_bandit()

  def init_bandit(self):
    """
      Pessimistic initializaiton
    """
    self.num_dose_selected = np.array([0]*self.n_arm) # number of times a dose is selected
    self.num_toxic = np.array([0]*self.n_arm) # number of times a does found to be toxic

  def pull(self, a_idx: int):
    """
    .inputs:
      a_idx: Index of action.
    .outputs:
      rew: reward value.
    """
    assert a_idx < self.n_arm, "invalid action index"
    # ----------------------------------------------
    ...
    # ----------------------------------------------
    return rew
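For reference, here is a minimal sketch of one way the pull method could be completed (an illustration only, not necessarily the intended solution; the helper name pull_sketch is hypothetical, and the estimate is recomputed after updating the counts):

In [ ]:
def pull_sketch(bandit: Bandit, a_idx: int) -> float:
  """
    Illustrative stand-in for Bandit.pull (a sketch, not the official solution).
  """
  assert a_idx < bandit.n_arm, "invalid action index"
  # Draw the binary toxicity outcome X_t ~ Bernoulli(p_k) for the selected dose
  toxic = np.random.binomial(1, bandit.actual_toxicity_prob[a_idx])
  # Update the selection and toxicity counters
  bandit.num_dose_selected[a_idx] += 1
  bandit.num_toxic[a_idx] += toxic
  # Empirical toxicity estimate of the selected arm (computed after the update)
  p_hat = bandit.num_toxic[a_idx] / bandit.num_dose_selected[a_idx]
  # Reward: the closer the estimate is to the target theta, the larger the reward
  rew = -abs(bandit.theta - p_hat)
  return rew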

Fixing the setting for the rest of the practical session¶

Let's define a dose-finding study with three doses ($K = 3$) to choose from, with actual_toxicity_prob=[0.1, 0.35, 0.8] and targeted toxicity probability $\theta = 0.3$.

In [32]:
# Problem definition
bandit = Bandit(n_arm=3, n_pulls=2000, actual_toxicity_prob=[0.1, 0.35, 0.8], theta=0.3)
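As a quick sanity check (illustrative, not one of the required tasks), the MTD in this setting is the second dose (index 1), since its toxicity probability 0.35 is closest to the target $\theta = 0.3$:

In [ ]:
# Gap between each dose's actual toxicity and the target theta
gaps = np.abs(np.array(bandit.actual_toxicity_prob) - bandit.theta)
print(gaps)            # approximately [0.2  0.05 0.5 ]
print(np.argmin(gaps)) # 1 -> the second dose is the MTD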

Q1.b: $\epsilon$-greedy for the k-armed bandit and optimistic initial values¶

Q1.b1: $\epsilon$-greedy algorithm implementation¶

Implement the $\epsilon$-greedy method.

In [ ]:
def eps_greedy(
    bandit: Bandit,
    eps: float,
    init_q: float = 0.0
) -> Tuple[list, list, list, list]:
    """
    .inputs:
      bandit: A bandit problem, instantiated from the above class.
      eps: The epsilon value for exploration.
      init_q: Initial estimation of each arm's value.
    .outputs:
      rew_record: The record of rewards at each timestep.
      avg_ret_record: The average of rewards up to step t, where t goes from 0 to n_pulls.
      tot_reg_record: The regret up to step t, where t goes from 0 to n_pulls.
      opt_action_perc_record: Percentage of optimal arm selected.
    """
    # Initialize q values and counts
    q = np.array([init_q] * bandit.n_arm, dtype=float)  # Action value estimates
    n_a = np.zeros(bandit.n_arm)  # Number of times each arm is selected

    cumulative_reward = 0.0
    cumulative_regret = 0.0
    rew_record = []
    avg_ret_record = []
    tot_reg_record = []
    opt_action_perc_record = []

    for t in range(bandit.n_pulls):
        # ----------------------------------------------
        ...
        # ----------------------------------------------

    return rew_record, avg_ret_record, tot_reg_record, opt_action_perc_record
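If you get stuck, the sketch below shows one possible way to fill in the loop. It is only an illustration: it assumes Task 1 has been completed so that bandit.pull works, it uses the standard incremental sample-average update, and it measures regret as the gap $|\theta - p_{D_t}| - |\theta - p_{k^*}|$ based on the actual toxicity probabilities (one common choice; your solution may define regret differently).

In [ ]:
def eps_greedy_sketch(bandit: Bandit, eps: float, init_q: float = 0.0) -> Tuple[list, list, list, list]:
    """
      Illustrative sketch of eps_greedy; same inputs and outputs as the skeleton above.
    """
    q = np.array([init_q] * bandit.n_arm, dtype=float)  # Action value estimates
    n_a = np.zeros(bandit.n_arm)                        # Number of times each arm is selected

    # Optimal arm and per-arm gaps, used to measure regret and % optimal action
    gaps = np.abs(np.array(bandit.actual_toxicity_prob) - bandit.theta)
    opt_arm = int(np.argmin(gaps))

    cumulative_reward, cumulative_regret, opt_count = 0.0, 0.0, 0
    rew_record, avg_ret_record, tot_reg_record, opt_action_perc_record = [], [], [], []

    for t in range(bandit.n_pulls):
        # Explore with probability eps, otherwise act greedily w.r.t. the current estimates
        if np.random.rand() < eps:
            a = np.random.randint(bandit.n_arm)
        else:
            a = int(np.argmax(q))

        rew = bandit.pull(a)

        # Incremental sample-average update of the action-value estimate
        n_a[a] += 1
        q[a] += (rew - q[a]) / n_a[a]

        # Bookkeeping of the four requested records
        cumulative_reward += rew
        cumulative_regret += gaps[a] - gaps[opt_arm]
        opt_count += int(a == opt_arm)
        rew_record.append(rew)
        avg_ret_record.append(cumulative_reward / (t + 1))
        tot_reg_record.append(cumulative_regret)
        opt_action_perc_record.append(100.0 * opt_count / (t + 1))

    return rew_record, avg_ret_record, tot_reg_record, opt_action_perc_record

Here the optimal-action record is stored as a percentage (the 100.0 factor) to match the "% optimal action" axis label used in the driver code below.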

Q1.b2: Plotting the results¶

Use the driver code provided to plot:

(1) the average return,

(2) the reward,

(3) the total regret,

(4) the percentage of optimal actions,

each averaged across the $N$=20 runs, as a function of the number of pulls (2000 pulls per run), for the three $\epsilon$ values 0.5, 0.1, and 0.

In [ ]:
plt.figure(0)
plt.xlabel("n pulls")
plt.ylabel("avg return")
plt.figure(1)
plt.xlabel("n pulls")
plt.ylabel("reward")
plt.figure(2)
plt.xlabel("n pulls")
plt.ylabel("total regret")
plt.figure(3)
plt.xlabel("n pulls")
plt.ylabel("% optimal action")

N = 20
tot_reg_rec_best = 1e8

for eps in [0.5, 0.1, .0]:
  rew_rec = np.zeros(bandit.n_pulls)
  avg_ret_rec = np.zeros(bandit.n_pulls)
  tot_reg_rec = np.zeros(bandit.n_pulls)
  opt_act_rec = np.zeros(bandit.n_pulls)
  start_time = time.time()
  for n in range(N):
    bandit.init_bandit()
    rew_rec_n, avg_ret_rec_n, tot_reg_rec_n, opt_act_rec_n = eps_greedy(bandit, eps)
    rew_rec += np.array(rew_rec_n)
    avg_ret_rec += np.array(avg_ret_rec_n)
    tot_reg_rec += np.array(tot_reg_rec_n)
    opt_act_rec += np.array(opt_act_rec_n)

  end_time = time.time()
  # print(f"time per run: {end_time - start_time}/N")
  # take the mean
  rew_rec /= N
  avg_ret_rec /= N
  tot_reg_rec /= N
  opt_act_rec /= N

  plt.figure(0)
  plt.plot(avg_ret_rec, label="eps={}".format(eps))
  plt.legend(loc="lower right")

  plt.figure(1)
  plt.plot(rew_rec[1:], label="eps={}".format(eps))
  plt.legend(loc="lower right")

  plt.figure(2)
  plt.plot(tot_reg_rec, label="eps={}".format(eps))
  plt.legend(loc="lower right")

  plt.figure(3)
  plt.plot(opt_act_rec, label="eps={}".format(eps))
  plt.legend(loc="lower right")


  if tot_reg_rec[-1] < tot_reg_rec_best:
    ep_greedy_dict = {
        'opt_act': opt_act_rec,
        'regret_list': tot_reg_rec,
    }
    tot_reg_rec_best = tot_reg_rec[-1]

Analysis:¶

Explain the results from the perspective of exploration and how different $\epsilon$ values affect the results.

Q1.b4: Optimistic Initial Values¶

We want to run the optimistic initial values method on the same problem described above, with initial q values of -1 and +1 for all arms. Compare its performance, measured by the average reward across $N$=20 runs as a function of the number of pulls, with the non-optimistic setting that uses initial q values of 0 for all arms. For both settings, use $\epsilon$=0. Note that, because every reward here is non-positive, an initial value of +1 is optimistic (it forces each arm to be tried before its estimate can drop below the others), whereas -1 is pessimistic.

In [ ]:
plt.figure(4)
plt.xlabel("n pulls")
plt.ylabel("avg return")

plt.figure(5)
plt.xlabel("n pulls")
plt.ylabel("reward")

plt.figure(6)
plt.xlabel("n pulls")
plt.ylabel("total regret")

N = 20
for init_q in [-1, 1]:
  rew_rec = np.zeros(bandit.n_pulls)
  avg_ret_rec = np.zeros(bandit.n_pulls)
  tot_reg_rec = np.zeros(bandit.n_pulls)  # must also be reset here, since it is accumulated below
  for n in range(N):
    bandit.init_bandit()
    rew_rec_n, avg_ret_rec_n, tot_reg_rec_n, opt_act_rec_n = eps_greedy(bandit, eps=0.0, init_q=init_q)

    rew_rec += np.array(rew_rec_n)
    avg_ret_rec += np.array(avg_ret_rec_n)
    tot_reg_rec += np.array(tot_reg_rec_n)

  avg_ret_rec /= N
  rew_rec /= N
  tot_reg_rec /= N
  plt.figure(4)
  plt.plot(avg_ret_rec[1:], label="q_init_val={}".format(init_q))
  plt.legend(loc="lower right")

  plt.figure(5)
  plt.plot(rew_rec[1:], label="q_init_val={}".format(init_q))
  plt.legend(loc="lower right")

  plt.figure(6)
  plt.plot(tot_reg_rec[1:], label="q_init_val={}".format(init_q))
  plt.legend(loc="lower right")

Analysis¶

Explain how initial q values affect the exploration and the performance.

Upper confidence bound arm selection¶

Q1.c: Upper-Confidence-Bound action selection¶

Q1.c1: UCB algorithm implementation¶

Implement the UCB algorithm on the same MAB problem as above. Recall that at step $t$ UCB selects $A_t \in \underset{a}{\mathrm{argmax}} \left[ Q_t(a) + c\sqrt{\frac{\ln t}{N_t(a)}} \right]$, where $Q_t(a)$ is the current value estimate of arm $a$ and $N_t(a)$ is the number of times it has been pulled so far.

In [ ]:
def ucb(
    bandit: Bandit,
    c: float,
    init_q: float = 0.0
) -> Tuple[list, list, list, list]:
    """
    .inputs:
      bandit: A bandit problem, instantiated from the above class.
      c: The exploration coefficient.
      init_q: Initial estimation of each arm's value.
    .outputs:
      rew_record: The record of rewards at each timestep.
      avg_ret_record: The average of rewards up to step t, where t goes from 0 to n_pulls.
      tot_reg_record: The regret up to step t, where t goes from 0 to n_pulls.
      opt_action_perc_record: Percentage of optimal arm selected.
    """
    # Initialize action values, counts, and metrics
    q = np.array([init_q] * bandit.n_arm, dtype=float)  # Estimated values of each arm
    n_a = np.zeros(bandit.n_arm)  # Number of times each arm has been pulled
    ret = 0.0  # Cumulative reward
    rew_record = []  # Reward at each time step
    avg_ret_record = []  # Average return at each time step
    tot_reg_record = []  # Cumulative regret at each time step
    opt_action_perc_record = []  # Percentage of optimal actions

    # Determine the optimal arm
    actual_toxicity_prob = np.array(bandit.actual_toxicity_prob)
    opt_arm = np.argmin(np.abs(actual_toxicity_prob - bandit.theta))

    optimal_action_count = 0  # Counter for the number of times the optimal arm is selected
    cumulative_regret = 0.0  # Cumulative regret

    for t in range(bandit.n_pulls):
        # Convention: with no exploration bonus (e.g. c = 0), ties are broken toward the first arm
        # ----------------------------------------------
        ...
        # ----------------------------------------------

    return rew_record, avg_ret_record, tot_reg_record, opt_action_perc_record
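As a reference point, the action-selection step could be implemented as in the sketch below (an illustration, not the official solution; the helper name ucb_select is hypothetical). It pulls each untried arm first, which is one common way to avoid dividing by zero in the bonus term; the rest of the loop (pulling the arm, the incremental update of q, and the bookkeeping) mirrors the $\epsilon$-greedy case.

In [ ]:
def ucb_select(q: np.ndarray, n_a: np.ndarray, t: int, c: float) -> int:
    """
      Illustrative UCB action selection; ties are broken toward the first maximizer.
    """
    # Pull any arm that has never been tried yet
    untried = np.where(n_a == 0)[0]
    if len(untried) > 0:
        return int(untried[0])
    # Otherwise maximize the upper confidence bound Q(a) + c * sqrt(ln t / N(a))
    ucb_scores = q + c * np.sqrt(np.log(t + 1) / n_a)
    return int(np.argmax(ucb_scores))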

Q1.c2: Plotting the results¶

Use the driver code provided to plot: (1) the average return, (2) the reward, (3) the total regret, and (4) the percentage of optimal actions across the $N$=20 runs, as a function of the number of pulls (2000 pulls per run), for the three values $c$ = 0, 0.5, and 2.0.

In [ ]:
plt.figure(7)
plt.xlabel("n pulls")
plt.ylabel("avg return")
plt.figure(8)
plt.xlabel("n pulls")
plt.ylabel("reward")
plt.figure(9)
plt.xlabel("n pulls")
plt.ylabel("total regret")
plt.figure(10)
plt.xlabel("n pulls")
plt.ylabel("% optimal action")

N = 20
tot_reg_rec_best = 1e8
for c in [.0, 0.5, 2]:
  rew_rec = np.zeros(bandit.n_pulls)
  avg_ret_rec = np.zeros(bandit.n_pulls)
  tot_reg_rec = np.zeros(bandit.n_pulls)
  opt_act_rec = np.zeros(bandit.n_pulls)

  for n in range(N):
    bandit.init_bandit()
    rew_rec_n, avg_ret_rec_n, tot_reg_rec_n, opt_act_rec_n = ucb(bandit, c)
    rew_rec += np.array(rew_rec_n)
    avg_ret_rec += np.array(avg_ret_rec_n)
    tot_reg_rec += np.array(tot_reg_rec_n)
    opt_act_rec += np.array(opt_act_rec_n)

  # take the mean
  rew_rec /= N
  avg_ret_rec /= N
  tot_reg_rec /= N
  opt_act_rec /= N

  plt.figure(7)
  plt.plot(avg_ret_rec, label="c={}".format(c))
  plt.legend(loc="lower right")

  plt.figure(8)
  plt.plot(rew_rec, label="c={}".format(c))
  plt.legend(loc="lower right")

  plt.figure(9)
  plt.plot(tot_reg_rec, label="c={}".format(c))
  plt.legend(loc="lower right")

  plt.figure(10)
  plt.plot(opt_act_rec, label="c={}".format(c))
  plt.legend(loc="lower right")

  if tot_reg_rec[-1] < tot_reg_rec_best:
    ucb_dict = {
        'opt_act': opt_act_rec,
        'regret_list': tot_reg_rec,
    }
    tot_reg_rec_best = tot_reg_rec[-1]

Q1.c3: Analysis¶

Explain the results from the perspective of exploration and how different $c$ values affect the results.