# -*- coding: utf-8 -*-
"""
@created on: 9/21/19,
@author: Shreesha N,
@version: v0.0.1
@system name: badgod
Description:

.. todo::

"""
import numpy as np
import random
from collections import defaultdict

"""
    Monte-Carlo
    In this problem, we will implement an AI player for Blackjack.
    The main goal of this problem is to get familiar with the Monte-Carlo algorithm.
    We can test the correctness of our code
    by typing 'nosetests -v mc_test.py' in the terminal.
"""
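
# The functions below assume the classic OpenAI Gym Blackjack environment.
# A minimal setup sketch (the environment id 'Blackjack-v0' is an assumption
# based on older gym releases, not something this file pins down):
#
#   import gym
#   env = gym.make('Blackjack-v0')
#   observation = env.reset()  # (player's sum, dealer's showing card, usable ace?)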


def initial_policy(observation):
    """A policy that sticks if the player score is >= 20 and hits otherwise

    Parameters:
    -----------
    observation:
        3-tuple of (player's sum, dealer's showing card, usable ace?)
    Returns:
    --------
    action: 0 or 1
        0: STICK
        1: HIT
    """
    return 0 if observation[0] >= 20 else 1

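# For example, the observation (21, 10, True) maps to action 0 (STICK) because
# the player's sum is >= 20, while (13, 7, False) maps to action 1 (HIT):
#
#   assert initial_policy((21, 10, True)) == 0
#   assert initial_policy((13, 7, False)) == 1
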

def play_step(env, action_to_take):
    """
    Given the action to be taken, plays one step in the environment and returns the resulting values.

    :param env: function
        OpenAI gym environment.
    :param action_to_take: int
        Action index to be taken for the current step.

    :return: next_state: 3-tuple
        (Player's sum, Dealer's showing card, Boolean indicating if the player has a usable ACE).
    :return: reward: int
        Reward received for choosing the given action.
    :return: done: boolean
        Boolean indicating whether a terminal state has been reached.
    """
    next_state, reward, done, info = env.step(action_to_take)
    return next_state, reward, done


def get_random_episode(env, policy):
    """
    Generates a single episode by following the given policy until a terminal state is reached.

    :param env: function
        OpenAI gym environment.
    :param policy: function
        The policy to be followed while choosing an action.
    :return: list
        List of (state, action, reward) tuples for the generated episode.
    """
    new_set_of_episodes = []
    current_state = env.reset()
    while True:
        action_to_take = policy(current_state)
        next_state, reward, done = play_step(env, action_to_take)
        new_set_of_episodes.append((current_state, action_to_take, reward))
        if done:
            break
        current_state = next_state
    return new_set_of_episodes
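
# A generated episode is a list of (state, action, reward) triples, e.g.
# (values are illustrative):
#
#   episode = get_random_episode(env, initial_policy)
#   # [((13, 10, False), 1, 0.0), ((19, 10, False), 0, 1.0)]
#
# In the Blackjack environment the final reward is typically +1 for a win,
# -1 for a loss and 0 for a draw.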


def mc_prediction(policy, env, n_episodes, gamma=1.0):
    """
    Estimates the value function of the given policy by sampling episodes
    and applying the first-visit Monte Carlo algorithm.

    :param policy: function
        A function that maps an observation to an action.
    :param env: function
        OpenAI gym environment.
    :param n_episodes: int
        Number of episodes to sample.
    :param gamma: float
        Gamma discount factor.
    :return V: defaultdict(float)
        A dictionary that maps from state to value.
    """

    # initialize empty dictionaries
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    # a dictionary that maps state -> value
    V = defaultdict(float)
    for _ in range(n_episodes):
        new_set_of_episodes = get_random_episode(env, policy)
        states_set = set([episode[0] for episode in new_set_of_episodes])
        for state in states_set:
            first_occurrence = next(i for i, x in enumerate(new_set_of_episodes) if x[0] == state)
            total_reward = sum([(gamma ** power) * episode[2] for power, episode in
                                enumerate(new_set_of_episodes[first_occurrence:])])
            returns_sum[state] += total_reward
            returns_count[state] += 1
            V[state] = returns_sum[state] / returns_count[state]
    return V
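
# First-visit MC averages, per state s, the returns
#   G_t = r_{t+1} + gamma * r_{t+2} + gamma^2 * r_{t+3} + ...
# computed from the first time s occurs in each sampled episode. A usage
# sketch (the episode count and queried state are illustrative):
#
#   V = mc_prediction(initial_policy, env, n_episodes=10000, gamma=1.0)
#   print(V[(21, 3, True)])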


def epsilon_greedy(Q, state, nA, epsilon=0.1):
    """
    An epsilon-greedy method to select an action based on the Q values of a state.

    :param Q: dict()
        A dictionary that maps from state -> action-values,
        where Q[s][a] is the estimated action value corresponding to state s and action a.
    :param state:
        Current state.
    :param nA: int
        Number of actions in the environment.
    :param epsilon: float
        The probability to select a random action, range between 0 and 1.
    :return: int
        Action chosen for the current state.

    Hints:
    ------
    With probability (1 - epsilon) choose the greedy action.
    With probability epsilon choose an action at random.
    """
    # Start with probability epsilon / nA for every action, then put the
    # remaining (1 - epsilon) mass on the current greedy action.
    A = np.ones(nA) * epsilon / float(nA)
    best_action = np.argmax(Q[state])
    A[best_action] += (1.0 - epsilon)
    return np.random.choice(np.arange(len(A)), p=A)
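
# With nA = 2 and epsilon = 0.1 the sampling distribution starts as
# [0.05, 0.05]; the remaining 0.9 goes to argmax(Q[state]), giving e.g.
# [0.95, 0.05] when action 0 currently looks best. Sketch (the state shown
# is illustrative):
#
#   action = epsilon_greedy(Q, (13, 10, False), nA=2, epsilon=0.1)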


def generate_random_episode_greedy(Q, nA, epsilon, env):
    """
    Generates a single episode, choosing each action with the epsilon-greedy method.

    :param Q: dict()
        A dictionary that maps from state -> action-values,
        where Q[s][a] is the estimated action value corresponding to state s and action a.
    :param nA: int
        Number of actions in the environment.
    :param epsilon: float
        The probability to select a random action, range between 0 and 1.
    :param env: function
        OpenAI gym environment.
    :return: list
        List of (state, action, reward) tuples for the generated episode.
    """
    new_set_of_episodes = []
    current_state = env.reset()
    while True:
        action_to_take = epsilon_greedy(Q, current_state, nA, epsilon)
        next_state, reward, done = play_step(env, action_to_take)
        new_set_of_episodes.append((current_state, action_to_take, reward))
        if done:
            break
        current_state = next_state
    return new_set_of_episodes


def mc_control_epsilon_greedy(env, n_episodes, gamma=1.0, epsilon=0.1):
    """
    Monte Carlo control with an epsilon-greedy behaviour policy.
    Finds an optimal epsilon-greedy policy.

    :param env: function
        OpenAI gym environment.
    :param n_episodes: int
        Number of episodes to sample.
    :param gamma: float
        Gamma discount factor.
    :param epsilon: float
        The probability to select a random action, range between 0 and 1.
    :return: Q: dict()
        A dictionary that maps from state -> action-values,
        where Q[s][a] is the estimated action value corresponding to state s and action a.

    Hint:
    -----
    You could consider decaying epsilon, i.e. epsilon = epsilon - (0.1 / n_episodes)
    after each episode, keeping epsilon > 0.

    """
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    # a nested dictionary that maps state -> (action -> action-value)
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    def decay(epsilon):
        return epsilon - (0.1 / n_episodes)

    for _ in range(n_episodes):
        new_set_of_episodes = generate_random_episode_greedy(Q, env.action_space.n, epsilon, env)
        epsilon = decay(epsilon)
        for step in range(len(new_set_of_episodes)):
            state_action_pair = (new_set_of_episodes[step][0], new_set_of_episodes[step][1])
            first_occurrence = next(
                i for i, episode in enumerate(new_set_of_episodes) if (episode[0], episode[1]) == state_action_pair)
            g = sum([(gamma ** power) * episode[2] for power, episode in
                     enumerate(new_set_of_episodes[first_occurrence:])])
            returns_sum[state_action_pair] += g
            returns_count[state_action_pair] += 1
            Q[state_action_pair[0]][state_action_pair[1]] = returns_sum[state_action_pair] / returns_count[
                state_action_pair]
    return Q
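

# A minimal end-to-end sketch. The environment id 'Blackjack-v0' and the
# episode counts below are illustrative assumptions (this module does not
# pin a specific gym version or environment id):
if __name__ == '__main__':
    import gym

    env = gym.make('Blackjack-v0')
    # Evaluate the fixed stick-on-20 policy with first-visit MC prediction.
    V = mc_prediction(initial_policy, env, n_episodes=10000, gamma=1.0)
    # Learn action values with epsilon-greedy MC control and read off the
    # greedy action for each visited state.
    Q = mc_control_epsilon_greedy(env, n_episodes=50000, gamma=1.0, epsilon=0.1)
    greedy_policy = {state: int(np.argmax(action_values)) for state, action_values in Q.items()}
    print('estimated V for %d states, greedy policy for %d states' % (len(V), len(greedy_policy)))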