From 2b09382a614573970bb8c28cedf59ba11c111810 Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 09:01:54 +0530 Subject: [PATCH 1/7] Add Q-Learning algorithm implementation with epsilon-greedy policy and grid world demo --- machine_learning/q_learning.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 machine_learning/q_learning.py diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py new file mode 100644 index 000000000000..e69de29bb2d1 From f0e21513e3eb8e6446524fed9165469682d07d18 Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 10:00:16 +0530 Subject: [PATCH 2/7] Add Q-Learning algorithm implementation with epsilon-greedy policy and grid world demo code --- machine_learning/q_learning.py | 177 +++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index e69de29bb2d1..5d1d6e607258 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -0,0 +1,177 @@ +""" +Q-Learning is a widely-used model-free algorithm in reinforcement learning that +learns the optimal action-value function Q(s, a), which tells an agent the expected +utility of taking action a in state s and then following the optimal policy after. +It is able to find the best policy for any given finite Markov decision process (MDP) +without requiring a model of the environment. + +See: [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning) +""" + +from collections import defaultdict +import random + +# Hyperparameters for Q-Learning +LEARNING_RATE = 0.1 +DISCOUNT_FACTOR = 0.97 +EPSILON = 0.2 +EPSILON_DECAY = 0.995 +EPSILON_MIN = 0.01 + +# Global Q-table to store state-action values +q_table = defaultdict(lambda: defaultdict(float)) + +# Environment variables for simple grid world +SIZE = 4 +GOAL = (SIZE - 1, SIZE - 1) +current_state = (0, 0) + + +def get_q_value(state, action): + """ + Get Q-value for a given state-action pair. + + >>> get_q_value((0, 0), 2) + 0.0 + """ + return q_table[state][action] + + +def get_best_action(state, available_actions): + """ + Get the action with maximum Q-value in the given state. + + >>> q_table[(0, 0)][1] = 0.7 + >>> q_table[(0, 0)][2] = 0.7 + >>> q_table[(0, 0)][3] = 0.5 + >>> get_best_action((0, 0), [1, 2, 3]) in [1, 2] + True + """ + if not available_actions: + raise ValueError("No available actions provided") + max_q = max(q_table[state][a] for a in available_actions) + best = [a for a in available_actions if q_table[state][a] == max_q] + return random.choice(best) + + +def choose_action(state, available_actions): + """ + Choose action using epsilon-greedy policy. + + >>> EPSILON = 0.0 + >>> q_table[(0, 0)][1] = 1.0 + >>> q_table[(0, 0)][2] = 0.5 + >>> choose_action((0, 0), [1, 2]) + 1 + """ + global EPSILON + if not available_actions: + raise ValueError("No available actions provided") + if random.random() < EPSILON: + return random.choice(available_actions) + return get_best_action(state, available_actions) + + +def update(state, action, reward, next_state, next_available_actions, done=False): + """ + Perform Q-value update for a transition using the Q-learning rule. 
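+    When done is True (or no next actions are available) the bootstrap term,
+    i.e. the maximum Q-value over the next state's actions, is taken to be zero.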
+ + Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)) + + >>> LEARNING_RATE = 0.5 + >>> DISCOUNT_FACTOR = 0.9 + >>> update((0,0), 1, 1.0, (0,1), [1,2], done=True) + >>> get_q_value((0,0), 1) + 0.5 + """ + global LEARNING_RATE, DISCOUNT_FACTOR + max_q_next = 0.0 if done or not next_available_actions else max( + get_q_value(next_state, a) for a in next_available_actions + ) + old_q = get_q_value(state, action) + new_q = (1 - LEARNING_RATE) * old_q + LEARNING_RATE * ( + reward + DISCOUNT_FACTOR * max_q_next + ) + q_table[state][action] = new_q + + +def get_policy(): + """ + Extract a deterministic policy from the Q-table. + + >>> q_table[(1,2)][1] = 2.0 + >>> q_table[(1,2)][2] = 1.0 + >>> get_policy()[(1,2)] + 1 + """ + policy = {} + for s, a_dict in q_table.items(): + if a_dict: + policy[s] = max(a_dict, key=a_dict.get) + return policy + + +def reset_env(): + """ + Reset the environment to initial state. + """ + global current_state + current_state = (0, 0) + return current_state + + +def get_available_actions_env(): + """ + Get available actions in the current environment state. + """ + return [0, 1, 2, 3] + + +def step_env(action): + """ + Take a step in the environment with the given action. + """ + global current_state + x, y = current_state + if action == 0: # up + x = max(0, x - 1) + elif action == 1: # right + y = min(SIZE - 1, y + 1) + elif action == 2: # down + x = min(SIZE - 1, x + 1) + elif action == 3: # left + y = max(0, y - 1) + next_state = (x, y) + reward = 10.0 if next_state == GOAL else -1.0 + done = next_state == GOAL + current_state = next_state + return next_state, reward, done + + +def run_q_learning(): + """ + Run Q-Learning on the simple grid world environment. + """ + global EPSILON + episodes = 200 + for episode in range(episodes): + state = reset_env() + done = False + while not done: + actions = get_available_actions_env() + action = choose_action(state, actions) + next_state, reward, done = step_env(action) + next_actions = get_available_actions_env() + update(state, action, reward, next_state, next_actions, done) + state = next_state + EPSILON = max(EPSILON * EPSILON_DECAY, EPSILON_MIN) + policy = get_policy() + print("Learned Policy (state: action):") + for s, a in sorted(policy.items()): + print(f"{s}: {a}") + + +if __name__ == "__main__": + import doctest + doctest.testmod() + run_q_learning() \ No newline at end of file From 3f0ec83c49c07031dfd361433ec876b43759f8ff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Oct 2025 04:31:05 +0000 Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/q_learning.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index 5d1d6e607258..754d087cb885 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -1,8 +1,8 @@ """ -Q-Learning is a widely-used model-free algorithm in reinforcement learning that -learns the optimal action-value function Q(s, a), which tells an agent the expected +Q-Learning is a widely-used model-free algorithm in reinforcement learning that +learns the optimal action-value function Q(s, a), which tells an agent the expected utility of taking action a in state s and then following the optimal policy after. 
-It is able to find the best policy for any given finite Markov decision process (MDP) +It is able to find the best policy for any given finite Markov decision process (MDP) without requiring a model of the environment. See: [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning) @@ -85,8 +85,10 @@ def update(state, action, reward, next_state, next_available_actions, done=False 0.5 """ global LEARNING_RATE, DISCOUNT_FACTOR - max_q_next = 0.0 if done or not next_available_actions else max( - get_q_value(next_state, a) for a in next_available_actions + max_q_next = ( + 0.0 + if done or not next_available_actions + else max(get_q_value(next_state, a) for a in next_available_actions) ) old_q = get_q_value(state, action) new_q = (1 - LEARNING_RATE) * old_q + LEARNING_RATE * ( @@ -173,5 +175,6 @@ def run_q_learning(): if __name__ == "__main__": import doctest + doctest.testmod() - run_q_learning() \ No newline at end of file + run_q_learning() From 072831208527e9209e987b8294278267f9821e09 Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 10:33:14 +0530 Subject: [PATCH 4/7] bug fixes and linting --- machine_learning/q_learning.py | 78 +++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index 754d087cb885..ce4b56df5c97 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -8,8 +8,11 @@ See: [https://en.wikipedia.org/wiki/Q-learning](https://en.wikipedia.org/wiki/Q-learning) """ -from collections import defaultdict import random +from collections import defaultdict + +# Type alias for state +type State = tuple[int, int] # Hyperparameters for Q-Learning LEARNING_RATE = 0.1 @@ -19,7 +22,7 @@ EPSILON_MIN = 0.01 # Global Q-table to store state-action values -q_table = defaultdict(lambda: defaultdict(float)) +q_table: dict[State, dict[int, float]] = defaultdict(lambda: defaultdict(float)) # Environment variables for simple grid world SIZE = 4 @@ -27,20 +30,22 @@ current_state = (0, 0) -def get_q_value(state, action): +def get_q_value(state: State, action: int) -> float: """ Get Q-value for a given state-action pair. + >>> q_table.clear() >>> get_q_value((0, 0), 2) 0.0 """ return q_table[state][action] -def get_best_action(state, available_actions): +def get_best_action(state: State, available_actions: list[int]) -> int: """ Get the action with maximum Q-value in the given state. + >>> q_table.clear() >>> q_table[(0, 0)][1] = 0.7 >>> q_table[(0, 0)][2] = 0.7 >>> q_table[(0, 0)][3] = 0.5 @@ -54,14 +59,18 @@ def get_best_action(state, available_actions): return random.choice(best) -def choose_action(state, available_actions): +def choose_action(state: State, available_actions: list[int]) -> int: """ Choose action using epsilon-greedy policy. 
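+    With probability EPSILON a random action is returned (exploration); otherwise
+    the highest-valued action from get_best_action is returned (exploitation).
+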
+ >>> q_table.clear() + >>> old_epsilon = EPSILON >>> EPSILON = 0.0 >>> q_table[(0, 0)][1] = 1.0 >>> q_table[(0, 0)][2] = 0.5 - >>> choose_action((0, 0), [1, 2]) + >>> result = choose_action((0, 0), [1, 2]) + >>> EPSILON = old_epsilon # Restore + >>> result 1 """ global EPSILON @@ -72,64 +81,84 @@ def choose_action(state, available_actions): return get_best_action(state, available_actions) -def update(state, action, reward, next_state, next_available_actions, done=False): +def update( + state: State, + action: int, + reward: float, + next_state: State, + next_available_actions: list[int], + done: bool = False, + alpha: float | None = None, + gamma: float | None = None, +) -> None: """ Perform Q-value update for a transition using the Q-learning rule. Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)) - >>> LEARNING_RATE = 0.5 - >>> DISCOUNT_FACTOR = 0.9 - >>> update((0,0), 1, 1.0, (0,1), [1,2], done=True) - >>> get_q_value((0,0), 1) + >>> q_table.clear() + >>> update((0, 0), 1, 1.0, (0, 1), [1, 2], done=True, alpha=0.5, gamma=0.9) + >>> get_q_value((0, 0), 1) 0.5 """ global LEARNING_RATE, DISCOUNT_FACTOR + alpha = alpha if alpha is not None else LEARNING_RATE + gamma = gamma if gamma is not None else DISCOUNT_FACTOR + max_q_next = 0.0 if done or not next_available_actions else max( + get_q_value(next_state, a) for a in next_available_actions max_q_next = ( 0.0 if done or not next_available_actions else max(get_q_value(next_state, a) for a in next_available_actions) ) old_q = get_q_value(state, action) - new_q = (1 - LEARNING_RATE) * old_q + LEARNING_RATE * ( - reward + DISCOUNT_FACTOR * max_q_next + new_q = (1 - alpha) * old_q + alpha * ( + reward + gamma * max_q_next ) q_table[state][action] = new_q -def get_policy(): +def get_policy() -> dict[State, int]: """ Extract a deterministic policy from the Q-table. - >>> q_table[(1,2)][1] = 2.0 - >>> q_table[(1,2)][2] = 1.0 - >>> get_policy()[(1,2)] + >>> q_table.clear() + >>> q_table[(1, 2)][1] = 2.0 + >>> q_table[(1, 2)][2] = 1.0 + >>> get_policy()[(1, 2)] 1 """ - policy = {} + policy: dict[State, int] = {} for s, a_dict in q_table.items(): if a_dict: policy[s] = max(a_dict, key=a_dict.get) return policy -def reset_env(): +def reset_env() -> State: """ Reset the environment to initial state. + + >>> old_state = current_state + >>> current_state = (1, 1) # Simulate non-initial state + >>> result = reset_env() + >>> current_state = old_state # Restore for other tests + >>> result + (0, 0) """ global current_state current_state = (0, 0) return current_state -def get_available_actions_env(): +def get_available_actions_env() -> list[int]: """ Get available actions in the current environment state. """ - return [0, 1, 2, 3] + return [0, 1, 2, 3] # 0: up, 1: right, 2: down, 3: left -def step_env(action): +def step_env(action: int) -> tuple[State, float, bool]: """ Take a step in the environment with the given action. """ @@ -150,13 +179,13 @@ def step_env(action): return next_state, reward, done -def run_q_learning(): +def run_q_learning() -> None: """ Run Q-Learning on the simple grid world environment. 
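+    The agent starts each episode at (0, 0), receives -1 per step and +10 on
+    reaching GOAL, and EPSILON is decayed after every episode.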
""" global EPSILON episodes = 200 - for episode in range(episodes): + for _ in range(episodes): state = reset_env() done = False while not done: @@ -178,3 +207,4 @@ def run_q_learning(): doctest.testmod() run_q_learning() + From a7b5349685241897a7339330a062bc14df4a9111 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Oct 2025 05:16:46 +0000 Subject: [PATCH 5/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/q_learning.py | 1 - 1 file changed, 1 deletion(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index ce4b56df5c97..7242d7078889 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -207,4 +207,3 @@ def run_q_learning() -> None: doctest.testmod() run_q_learning() - From 39d121a4902dc0c336a8164b603be32270fb1398 Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 10:54:35 +0530 Subject: [PATCH 6/7] cls --- machine_learning/q_learning.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index 7242d7078889..0fba67d27573 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -104,17 +104,13 @@ def update( global LEARNING_RATE, DISCOUNT_FACTOR alpha = alpha if alpha is not None else LEARNING_RATE gamma = gamma if gamma is not None else DISCOUNT_FACTOR - max_q_next = 0.0 if done or not next_available_actions else max( - get_q_value(next_state, a) for a in next_available_actions max_q_next = ( 0.0 if done or not next_available_actions else max(get_q_value(next_state, a) for a in next_available_actions) ) old_q = get_q_value(state, action) - new_q = (1 - alpha) * old_q + alpha * ( - reward + gamma * max_q_next - ) + new_q = (1 - alpha) * old_q + alpha * (reward + gamma * max_q_next) q_table[state][action] = new_q From f3594e6ae581f57e2b367df5dca243822cc2e04c Mon Sep 17 00:00:00 2001 From: SANJI Date: Fri, 10 Oct 2025 11:03:44 +0530 Subject: [PATCH 7/7] bug fix and hints --- machine_learning/q_learning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine_learning/q_learning.py b/machine_learning/q_learning.py index 0fba67d27573..4b10737b945f 100644 --- a/machine_learning/q_learning.py +++ b/machine_learning/q_learning.py @@ -118,6 +118,7 @@ def get_policy() -> dict[State, int]: """ Extract a deterministic policy from the Q-table. + >>> q_table.clear() >>> q_table[(1, 2)][1] = 2.0 >>> q_table[(1, 2)][2] = 1.0 @@ -127,7 +128,7 @@ def get_policy() -> dict[State, int]: policy: dict[State, int] = {} for s, a_dict in q_table.items(): if a_dict: - policy[s] = max(a_dict, key=a_dict.get) + policy[s] = max(a_dict, key=lambda a: a_dict[a]) return policy