From 2b09382a614573970bb8c28cedf59ba11c111810 Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 09:01:54 +0530 Subject: [PATCH 1/7] Add Q-Learning algorithm implementation with epsilon-greedy policy and grid world demo --- machine_learning/q_learning.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 machine_learning/q_learning.py diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py new file mode 100644 index 000000000000..e69de29bb2d1 From f0e21513e3eb8e6446524fed9165469682d07d18 Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 10:00:16 +0530 Subject: [PATCH 2/7] Add Q-Learning algorithm implementation with epsilon-greedy policy and grid world demo code --- machine_learning/q_learning.py | 177 +++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index e69de29bb2d1..5d1d6e607258 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -0,0 +1,177 @@ +""" +Q-Learning is a widely-used model-free algorithm in reinforcement learning that +learns the optimal action-value function Q(s, a), which tells an agent the expected +utility of taking action a in state s and then following the optimal policy after. +It is able to find the best policy for any given finite Markov decision process (MDP) +without requiring a model of the environment. + +See: [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning) +""" + +from collections import defaultdict +import random + +# Hyperparameters for Q-Learning +LEARNING_RATE = 0.1 +DISCOUNT_FACTOR = 0.97 +EPSILON = 0.2 +EPSILON_DECAY = 0.995 +EPSILON_MIN = 0.01 + +# Global Q-table to store state-action values +q_table = defaultdict(lambda: defaultdict(float)) + +# Environment variables for simple grid world +SIZE = 4 +GOAL = (SIZE - 1, SIZE - 1) +current_state = (0, 0) + + +def get_q_value(state, action): + """ + Get Q-value for a given state-action pair. + + >>> get_q_value((0, 0), 2) + 0.0 + """ + return q_table[state][action] + + +def get_best_action(state, available_actions): + """ + Get the action with maximum Q-value in the given state. + + >>> q_table[(0, 0)][1] = 0.7 + >>> q_table[(0, 0)][2] = 0.7 + >>> q_table[(0, 0)][3] = 0.5 + >>> get_best_action((0, 0), [1, 2, 3]) in [1, 2] + True + """ + if not available_actions: + raise ValueError("No available actions provided") + max_q = max(q_table[state][a] for a in available_actions) + best = [a for a in available_actions if q_table[state][a] == max_q] + return random.choice(best) + + +def choose_action(state, available_actions): + """ + Choose action using epsilon-greedy policy. + + >>> EPSILON = 0.0 + >>> q_table[(0, 0)][1] = 1.0 + >>> q_table[(0, 0)][2] = 0.5 + >>> choose_action((0, 0), [1, 2]) + 1 + """ + global EPSILON + if not available_actions: + raise ValueError("No available actions provided") + if random.random() < EPSILON: + return random.choice(available_actions) + return get_best_action(state, available_actions) + + +def update(state, action, reward, next_state, next_available_actions, done=False): + """ + Perform Q-value update for a transition using the Q-learning rule. 
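+    When done is True (or no next actions are available) the bootstrap term,
+    i.e. the maximum Q-value over the next state's actions, is taken to be zero.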
+ + Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)) + + >>> LEARNING_RATE = 0.5 + >>> DISCOUNT_FACTOR = 0.9 + >>> update((0,0), 1, 1.0, (0,1), [1,2], done=True) + >>> get_q_value((0,0), 1) + 0.5 + """ + global LEARNING_RATE, DISCOUNT_FACTOR + max_q_next = 0.0 if done or not next_available_actions else max( + get_q_value(next_state, a) for a in next_available_actions + ) + old_q = get_q_value(state, action) + new_q = (1 - LEARNING_RATE) * old_q + LEARNING_RATE * ( + reward + DISCOUNT_FACTOR * max_q_next + ) + q_table[state][action] = new_q + + +def get_policy(): + """ + Extract a deterministic policy from the Q-table. + + >>> q_table[(1,2)][1] = 2.0 + >>> q_table[(1,2)][2] = 1.0 + >>> get_policy()[(1,2)] + 1 + """ + policy = {} + for s, a_dict in q_table.items(): + if a_dict: + policy[s] = max(a_dict, key=a_dict.get) + return policy + + +def reset_env(): + """ + Reset the environment to initial state. + """ + global current_state + current_state = (0, 0) + return current_state + + +def get_available_actions_env(): + """ + Get available actions in the current environment state. + """ + return [0, 1, 2, 3] + + +def step_env(action): + """ + Take a step in the environment with the given action. + """ + global current_state + x, y = current_state + if action == 0: # up + x = max(0, x - 1) + elif action == 1: # right + y = min(SIZE - 1, y + 1) + elif action == 2: # down + x = min(SIZE - 1, x + 1) + elif action == 3: # left + y = max(0, y - 1) + next_state = (x, y) + reward = 10.0 if next_state == GOAL else -1.0 + done = next_state == GOAL + current_state = next_state + return next_state, reward, done + + +def run_q_learning(): + """ + Run Q-Learning on the simple grid world environment. + """ + global EPSILON + episodes = 200 + for episode in range(episodes): + state = reset_env() + done = False + while not done: + actions = get_available_actions_env() + action = choose_action(state, actions) + next_state, reward, done = step_env(action) + next_actions = get_available_actions_env() + update(state, action, reward, next_state, next_actions, done) + state = next_state + EPSILON = max(EPSILON * EPSILON_DECAY, EPSILON_MIN) + policy = get_policy() + print("Learned Policy (state: action):") + for s, a in sorted(policy.items()): + print(f"{s}: {a}") + + +if __name__ == "__main__": + import doctest + doctest.testmod() + run_q_learning() \ No newline at end of file From 3f0ec83c49c07031dfd361433ec876b43759f8ff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Oct 2025 04:31:05 +0000 Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/q_learning.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index 5d1d6e607258..754d087cb885 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -1,8 +1,8 @@ """ -Q-Learning is a widely-used model-free algorithm in reinforcement learning that -learns the optimal action-value function Q(s, a), which tells an agent the expected +Q-Learning is a widely-used model-free algorithm in reinforcement learning that +learns the optimal action-value function Q(s, a), which tells an agent the expected utility of taking action a in state s and then following the optimal policy after. 
-It is able to find the best policy for any given finite Markov decision process (MDP) +It is able to find the best policy for any given finite Markov decision process (MDP) without requiring a model of the environment. See: [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning) @@ -85,8 +85,10 @@ def update(state, action, reward, next_state, next_available_actions, done=False 0.5 """ global LEARNING_RATE, DISCOUNT_FACTOR - max_q_next = 0.0 if done or not next_available_actions else max( - get_q_value(next_state, a) for a in next_available_actions + max_q_next = ( + 0.0 + if done or not next_available_actions + else max(get_q_value(next_state, a) for a in next_available_actions) ) old_q = get_q_value(state, action) new_q = (1 - LEARNING_RATE) * old_q + LEARNING_RATE * ( @@ -173,5 +175,6 @@ def run_q_learning(): if __name__ == "__main__": import doctest + doctest.testmod() - run_q_learning() \ No newline at end of file + run_q_learning() From 072831208527e9209e987b8294278267f9821e09 Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 10:33:14 +0530 Subject: [PATCH 4/7] bug fixes and linting --- machine_learning/q_learning.py | 78 +++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index 754d087cb885..ce4b56df5c97 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -8,8 +8,11 @@ See: [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning) """ -from collections import defaultdict import random +from collections import defaultdict + +# Type alias for state +type State = tuple[int, int] # Hyperparameters for Q-Learning LEARNING_RATE = 0.1 @@ -19,7 +22,7 @@ EPSILON_MIN = 0.01 # Global Q-table to store state-action values -q_table = defaultdict(lambda: defaultdict(float)) +q_table: dict[State, dict[int, float]] = defaultdict(lambda: defaultdict(float)) # Environment variables for simple grid world SIZE = 4 @@ -27,20 +30,22 @@ current_state = (0, 0) -def get_q_value(state, action): +def get_q_value(state: State, action: int) -> float: """ Get Q-value for a given state-action pair. + >>> q_table.clear() >>> get_q_value((0, 0), 2) 0.0 """ return q_table[state][action] -def get_best_action(state, available_actions): +def get_best_action(state: State, available_actions: list[int]) -> int: """ Get the action with maximum Q-value in the given state. + >>> q_table.clear() >>> q_table[(0, 0)][1] = 0.7 >>> q_table[(0, 0)][2] = 0.7 >>> q_table[(0, 0)][3] = 0.5 @@ -54,14 +59,18 @@ def get_best_action(state, available_actions): return random.choice(best) -def choose_action(state, available_actions): +def choose_action(state: State, available_actions: list[int]) -> int: """ Choose action using epsilon-greedy policy. 
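+    With probability EPSILON a random action is returned (exploration); otherwise
+    the highest-valued action from get_best_action is returned (exploitation).
+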
+ >>> q_table.clear() + >>> old_epsilon = EPSILON >>> EPSILON = 0.0 >>> q_table[(0, 0)][1] = 1.0 >>> q_table[(0, 0)][2] = 0.5 - >>> choose_action((0, 0), [1, 2]) + >>> result = choose_action((0, 0), [1, 2]) + >>> EPSILON = old_epsilon # Restore + >>> result 1 """ global EPSILON @@ -72,64 +81,84 @@ def choose_action(state, available_actions): return get_best_action(state, available_actions) -def update(state, action, reward, next_state, next_available_actions, done=False): +def update( + state: State, + action: int, + reward: float, + next_state: State, + next_available_actions: list[int], + done: bool = False, + alpha: float | None = None, + gamma: float | None = None, +) -> None: """ Perform Q-value update for a transition using the Q-learning rule. Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)) - >>> LEARNING_RATE = 0.5 - >>> DISCOUNT_FACTOR = 0.9 - >>> update((0,0), 1, 1.0, (0,1), [1,2], done=True) - >>> get_q_value((0,0), 1) + >>> q_table.clear() + >>> update((0, 0), 1, 1.0, (0, 1), [1, 2], done=True, alpha=0.5, gamma=0.9) + >>> get_q_value((0, 0), 1) 0.5 """ global LEARNING_RATE, DISCOUNT_FACTOR + alpha = alpha if alpha is not None else LEARNING_RATE + gamma = gamma if gamma is not None else DISCOUNT_FACTOR + max_q_next = 0.0 if done or not next_available_actions else max( + get_q_value(next_state, a) for a in next_available_actions max_q_next = ( 0.0 if done or not next_available_actions else max(get_q_value(next_state, a) for a in next_available_actions) ) old_q = get_q_value(state, action) - new_q = (1 - LEARNING_RATE) * old_q + LEARNING_RATE * ( - reward + DISCOUNT_FACTOR * max_q_next + new_q = (1 - alpha) * old_q + alpha * ( + reward + gamma * max_q_next ) q_table[state][action] = new_q -def get_policy(): +def get_policy() -> dict[State, int]: """ Extract a deterministic policy from the Q-table. - >>> q_table[(1,2)][1] = 2.0 - >>> q_table[(1,2)][2] = 1.0 - >>> get_policy()[(1,2)] + >>> q_table.clear() + >>> q_table[(1, 2)][1] = 2.0 + >>> q_table[(1, 2)][2] = 1.0 + >>> get_policy()[(1, 2)] 1 """ - policy = {} + policy: dict[State, int] = {} for s, a_dict in q_table.items(): if a_dict: policy[s] = max(a_dict, key=a_dict.get) return policy -def reset_env(): +def reset_env() -> State: """ Reset the environment to initial state. + + >>> old_state = current_state + >>> current_state = (1, 1) # Simulate non-initial state + >>> result = reset_env() + >>> current_state = old_state # Restore for other tests + >>> result + (0, 0) """ global current_state current_state = (0, 0) return current_state -def get_available_actions_env(): +def get_available_actions_env() -> list[int]: """ Get available actions in the current environment state. """ - return [0, 1, 2, 3] + return [0, 1, 2, 3] # 0: up, 1: right, 2: down, 3: left -def step_env(action): +def step_env(action: int) -> tuple[State, float, bool]: """ Take a step in the environment with the given action. """ @@ -150,13 +179,13 @@ def step_env(action): return next_state, reward, done -def run_q_learning(): +def run_q_learning() -> None: """ Run Q-Learning on the simple grid world environment. 
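+    The agent starts each episode at (0, 0), receives -1 per step and +10 on
+    reaching GOAL, and EPSILON is decayed after every episode.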
""" global EPSILON episodes = 200 - for episode in range(episodes): + for _ in range(episodes): state = reset_env() done = False while not done: @@ -178,3 +207,4 @@ def run_q_learning(): doctest.testmod() run_q_learning() + From a7b5349685241897a7339330a062bc14df4a9111 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Oct 2025 05:16:46 +0000 Subject: [PATCH 5/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/q_learning.py | 1 - 1 file changed, 1 deletion(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index ce4b56df5c97..7242d7078889 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -207,4 +207,3 @@ def run_q_learning() -> None: doctest.testmod() run_q_learning() - From 39d121a4902dc0c336a8164b603be32270fb1398 Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 10:54:35 +0530 Subject: [PATCH 6/7] cls --- machine_learning/q_learning.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index 7242d7078889..0fba67d27573 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -104,17 +104,13 @@ def update( global LEARNING_RATE, DISCOUNT_FACTOR alpha = alpha if alpha is not None else LEARNING_RATE gamma = gamma if gamma is not None else DISCOUNT_FACTOR - max_q_next = 0.0 if done or not next_available_actions else max( - get_q_value(next_state, a) for a in next_available_actions max_q_next = ( 0.0 if done or not next_available_actions else max(get_q_value(next_state, a) for a in next_available_actions) ) old_q = get_q_value(state, action) - new_q = (1 - alpha) * old_q + alpha * ( - reward + gamma * max_q_next - ) + new_q = (1 - alpha) * old_q + alpha * (reward + gamma * max_q_next) q_table[state][action] = new_q From f3594e6ae581f57e2b367df5dca243822cc2e04c Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 11:03:44 +0530 Subject: [PATCH 7/7] bug fix and hints --- machine_learning/q_learning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index 0fba67d27573..4b10737b945f 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -118,6 +118,7 @@ def get_policy() -> dict[State, int]: """ Extract a deterministic policy from the Q-table. + >>> q_table.clear() >>> q_table[(1, 2)][1] = 2.0 >>> q_table[(1, 2)][2] = 1.0 @@ -127,7 +128,7 @@ def get_policy() -> dict[State, int]: policy: dict[State, int] = {} for s, a_dict in q_table.items(): if a_dict: - policy[s] = max(a_dict, key=a_dict.get) + policy[s] = max(a_dict, key=lambda a: a_dict[a]) return policy