Update epsilon-greedy method: rename locals (A -> actions, best_action -> best_current_action)

ShreeshaN · ShreeshaN · commit e573ea8da696 · 2019-09-26T23:06:08.000-04:00
diff --git a/BlackJackMonteCarlo/bjack_src/mc.py b/BlackJackMonteCarlo/bjack_src/mc.py
@@ -134,10 +134,18 @@ def epsilon_greedy(Q, state, nA, epsilon=0.1):
     With probability (1 − epsilon) choose the greedy action.
     With probability epsilon choose an action at random.
     """
-    A = np.ones(nA) * epsilon / float(nA)
-    best_action = np.argmax(Q[state])
-    A[best_action] += (1.0 - epsilon)
-    return np.random.choice(np.arange(len(A)), p=A)
+    # A = np.ones(nA) * epsilon / float(nA)
+    # best_action, prob_for_best_action = np.argmax(Q[state]), max(Q[state])
+    # if prob_for_best_action > epsilon:
+    #     A[best_action] += (1.0 - epsilon)
+    #     return np.random.choice(np.arange(len(A)), p=A)
+    # else:
+    #     return np.random.choice(np.arange(len(A)))
+
+    actions = np.ones(nA) * epsilon / float(nA)
+    best_current_action = np.argmax(Q[state])
+    actions[best_current_action] += (1.0 - epsilon)
+    return np.random.choice(np.arange(len(actions)), p=actions)
 
 
 def generate_random_episode_greedy(Q, nA, epsilon, env):
diff --git a/CliffWalkingTemporalDifference/td_src/td.py b/CliffWalkingTemporalDifference/td_src/td.py
@@ -45,10 +45,10 @@ def epsilon_greedy(Q, state, nA, epsilon=0.1):
     With probability (1 − epsilon) choose the greedy action.
     With probability epsilon choose an action at random.
     """
-    A = np.ones(nA) * epsilon / float(nA)
-    best_action = np.argmax(Q[state])
-    A[best_action] += (1.0 - epsilon)
-    return np.random.choice(np.arange(len(A)), p=A)
+    actions = np.ones(nA) * epsilon / float(nA)
+    best_current_action = np.argmax(Q[state])
+    actions[best_current_action] += (1.0 - epsilon)
+    return np.random.choice(np.arange(len(actions)), p=actions)
 
 
 def sarsa(env, n_episodes, gamma=1.0, alpha=0.5, epsilon=0.1):