From e4306824e1263ce66f1a7b6dff2b963947f58912 Mon Sep 17 00:00:00 2001
From: Eric Seuret <eric@ericst.ch>
Date: Tue, 4 Nov 2025 06:41:01 +0100
Subject: [PATCH] Ex1

---
 demo_frozenlake.py |  26 ++++
 ex1.py             | 298 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 324 insertions(+)
 create mode 100755 demo_frozenlake.py
 create mode 100755 ex1.py

diff --git a/demo_frozenlake.py b/demo_frozenlake.py
new file mode 100755
index 0000000..c3a58b6
--- /dev/null
+++ b/demo_frozenlake.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+"""
+FrozenLake Gymnasium demo.
+Shows basic environment interaction driven by a purely random agent.
+"""
+import gymnasium as gym
+
+# Demo: random agent
+print("\n=== Demo: Zufälliger Agent ===")
+env = gym.make('FrozenLake-v1', is_slippery=False, render_mode='human')
+
+for episode in range(20):
+    observation, _ = env.reset()
+    done = False
+    total_reward = 0.0
+    
+    while not done:
+        action = env.action_space.sample()  # random action, no policy involved
+        obs, reward, terminated, truncated, _ = env.step(action)
+        total_reward += float(reward)
+        done = terminated or truncated  # the episode ends on either signal
+    
+    print(f"Episode {episode}: Reward = {total_reward}")
+
+env.close()
+print("\nBeobachtung: Agent scheitert meistens! Keine intelligente Strategie.")
diff --git a/ex1.py b/ex1.py
new file mode 100755
index 0000000..9aff8f0
--- /dev/null
+++ b/ex1.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python
+import gymnasium as gym
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+def initialize_environment():
+    """Create the FrozenLake-v1 environment with slipping disabled,
+    i.e. the deterministic variant used throughout this exercise."""
+    return gym.make('FrozenLake-v1', is_slippery=False)
+
+
+def initialize_q_table(n_states, n_actions):
+    """Return an all-zero Q-table of shape (n_states, n_actions)."""
+    return np.zeros(shape=(n_states, n_actions), dtype=float)
+
+
+def select_action(state, Q, epsilon, n_actions):
+    """
+    TODO: Implement ε-greedy action selection (stub; currently returns None).
+
+    Args:
+        state: Current state
+        Q: Q-table
+        epsilon: Exploration probability
+        n_actions: Number of possible actions
+
+    Returns:
+        The selected action
+
+    Hint:
+    - With probability ε: pick a random action (exploration)
+    - With probability 1-ε: pick the best action according to the Q-table (exploitation)
+    - Use np.random.random() for the random number and np.random.randint() for a random action
+    - Use np.argmax() to find the best action
+    """
+    ...
+
+
+def q_learning_update(Q, state, action, reward, next_state, alpha, gamma, done):
+    """
+    TODO: Implement the Q-learning update rule (stub; currently does nothing).
+
+    Q(s,a) ← Q(s,a) + α · [r + γ · max Q(s',a') - Q(s,a)]
+
+    Args:
+        Q: Q-table
+        state: Current state
+        action: Action that was executed
+        reward: Reward received
+        next_state: Next state
+        alpha: Learning rate
+        gamma: Discount factor
+        done: Whether the episode has ended
+
+    Hint:
+    - Compute the TD target: r + γ · max Q(s',a')
+    - If done=True: TD target = r (no future reward remains)
+    - Compute the TD error: TD target - Q(s,a)
+    - Update: Q(s,a) += α · TD error
+    - Use np.argmax() to find the best next action
+    """
+    ...
+
+
+def train_q_learning(env, n_episodes=2000, alpha=0.1, gamma=0.99, 
+                     epsilon_start=1.0, epsilon_min=0.01, epsilon_decay=0.999):
+    """
+    TODO: Implement the Q-learning training loop.
+
+    Args:
+        env: Gymnasium environment
+        n_episodes: Number of training episodes
+        alpha: Learning rate
+        gamma: Discount factor
+        epsilon_start: Initial exploration rate
+        epsilon_min: Minimum exploration rate
+        epsilon_decay: Exploration decay rate
+
+    Returns:
+        Q: Trained Q-table (all zeros until the TODO below is implemented)
+        rewards: List of per-episode rewards (empty until then)
+
+    Hint:
+    1. Initialise the Q-table and epsilon
+    2. For every episode:
+       a. Reset the environment (env.reset())
+       b. While the episode is not finished:
+          - Choose an action with select_action()
+          - Execute the action in the environment (env.step())
+          - Update the Q-table with q_learning_update()
+          - Update the state
+       c. Epsilon decay: epsilon = max(epsilon_min, epsilon * epsilon_decay)
+       d. Store the episode reward
+    3. Print progress every 100 episodes
+    """
+    # Initialisation
+    n_states = env.observation_space.n
+    n_actions = env.action_space.n
+    Q = initialize_q_table(n_states, n_actions)
+    
+    epsilon = epsilon_start
+    rewards = []
+    
+    # TODO: implement the training loop here
+    ...
+    
+    return Q, rewards
+
+
+def evaluate_policy(env, Q, n_episodes=100):
+    """
+    Run the greedy policy derived from Q, without any exploration.
+
+    Args:
+        env: Gymnasium environment
+        Q: Trained Q-table
+        n_episodes: Number of evaluation episodes
+
+    Returns:
+        success_rate: Percentage of episodes whose total reward was positive
+        avg_reward: Mean total reward per episode
+
+    Raises:
+        ZeroDivisionError: If n_episodes is 0
+    """
+    wins = 0
+    reward_sum = 0
+
+    for _ in range(n_episodes):
+        obs, _info = env.reset()
+        ep_return = 0
+
+        while True:
+            # Always exploit: take the highest-valued action for this state.
+            greedy_action = np.argmax(Q[obs, :])
+            obs, step_reward, terminated, truncated, _info = env.step(greedy_action)
+            ep_return += step_reward
+            if terminated or truncated:
+                break
+
+        reward_sum += ep_return
+        wins += 1 if ep_return > 0 else 0
+
+    success_rate = (wins / n_episodes) * 100
+    avg_reward = reward_sum / n_episodes
+    return success_rate, avg_reward
+
+
+def plot_learning_curve(rewards, window=100):
+    """
+    Plot per-episode rewards together with their moving average.
+
+    Args:
+        rewards: List of per-episode rewards
+        window: Window size of the moving average
+    """
+    plt.figure(figsize=(10, 6))
+    plt.plot(rewards, alpha=0.3, label='Episoden-Belohnung')
+    n_points = len(rewards)
+
+    # The smoothed curve is only drawn once a full window of data exists.
+    if n_points >= window:
+        kernel = np.ones(window) / window
+        smoothed = np.convolve(rewards, kernel, mode='valid')
+        x_positions = range(window - 1, n_points)
+        smoothed_label = f'Gleitender Durchschnitt ({window} Episoden)'
+        plt.plot(x_positions, smoothed, label=smoothed_label, linewidth=2)
+
+    plt.xlabel('Episode')
+    plt.ylabel('Belohnung')
+    plt.title('Lernkurve')
+    plt.legend(loc='lower right')
+    plt.grid(True, alpha=0.3)
+    plt.tight_layout()
+    plt.show()
+
+
+def visualize_q_table(Q):
+    """
+    Draw the Q-table as an annotated heatmap (one row per state).
+
+    Args:
+        Q: Q-table
+    """
+    plt.figure(figsize=(10, 8))
+    action_labels = ['←', '↓', '→', '↑']
+    state_labels = list(map(str, range(Q.shape[0])))
+    sns.heatmap(Q, annot=True, fmt='.2f', cmap='YlOrRd',
+                xticklabels=action_labels, yticklabels=state_labels)
+    plt.title('Q-Tabelle nach Training')
+    plt.xlabel('Aktion')
+    plt.ylabel('Zustand')
+    plt.tight_layout()
+    plt.show()
+
+
+def visualize_policy(Q):
+    """
+    Print the greedy policy derived from Q on the 4x4 FrozenLake grid.
+
+    The map layout and the 4x4 reshape are hard-coded, so Q must contain
+    exactly 16 rows and its greedy actions must lie in 0..3.
+
+    Args:
+        Q: Q-table
+    """
+    greedy = np.argmax(Q, axis=1)
+    arrow_for = {0: '←', 1: '↓', 2: '→', 3: '↑'}
+    banner = "=" * 40
+
+    print("\n" + banner)
+    print("GELERNTE POLICY")
+    print(banner)
+    print("\nFrozenLake Grid:")
+    for map_row in ("SFFF", "FHFH", "FFFH", "HFFG"):
+        print(map_row)
+    print("\nOptimale Aktionen:")
+
+    # One line of arrows per grid row.
+    for r, actions in enumerate(greedy.reshape(4, 4)):
+        print(f"Reihe {r}: {' '.join(arrow_for[a] for a in actions)}")
+
+    print("\nZustand-für-Zustand Policy:")
+    for s in range(16):
+        r, c = divmod(s, 4)
+        print(f"Zustand {s:2d} (Reihe {r}, Spalte {c}): {arrow_for[greedy[s]]}")
+    print(banner + "\n")
+
+
+def main():
+    """Run all tasks of exercise 1; the environment is always closed at the end."""
+    print("="*60)
+    print("Q-LEARNING FÜR FROZENLAKE (DETERMINISTISCH)")
+    print("Praktische Übung 1")
+    print("="*60)
+
+    # ========================================
+    # HYPERPARAMETERS
+    # ========================================
+    n_episodes = 2000
+    alpha = 0.4          # learning rate - try: 0.01, 0.5
+    gamma = 0.99         # discount factor - try: 0.5, 0.99
+    epsilon_start = 1.0
+    epsilon_min = 0.01
+    epsilon_decay = 0.999  # try: 1.0 (no decay)
+
+    env = initialize_environment()
+    try:
+        n_states = env.observation_space.n
+        n_actions = env.action_space.n
+        print("\nEnvironment: FrozenLake-v1 (is_slippery=False)")
+        print(f"Zustände: {n_states}")
+        print(f"Aktionen: {n_actions}")
+
+        # Train with Q-learning
+        print("\n--- Q-Learning Training ---")
+        print(f"Hyperparameter: α={alpha}, γ={gamma}, ε_decay={epsilon_decay}")
+        Q, rewards = train_q_learning(
+            env,
+            n_episodes=n_episodes,
+            alpha=alpha,
+            gamma=gamma,
+            epsilon_start=epsilon_start,
+            epsilon_min=epsilon_min,
+            epsilon_decay=epsilon_decay,
+        )
+
+        # Plot the learning curve
+        print("\n--- Lernkurve ---")
+        plot_learning_curve(rewards)
+
+        # Evaluate the greedy policy
+        print("\n--- Policy Evaluation ---")
+        success_rate, avg_reward = evaluate_policy(env, Q, n_episodes=100)
+        print(f"Erfolgsrate (100 Episoden): {success_rate:.1f}%")
+        print(f"Durchschnittliche Belohnung: {avg_reward:.3f}")
+
+        # Visualisations
+        print("\n--- Visualisierungen ---")
+        visualize_q_table(Q)
+        visualize_policy(Q)
+
+        print("\n" + "="*60)
+        print("TRAINING ABGESCHLOSSEN!")
+        print("="*60)
+        print("\nÄndere die Hyperparameter oben und führe erneut aus!")
+        print("  - Versuche α = 0.01 oder α = 0.5")
+        print("  - Versuche γ = 0.5 oder γ = 0.99")
+        print("  - Versuche epsilon_decay = 1.0 (kein Decay)")
+    finally:
+        # Release the environment even when one of the steps above raised.
+        env.close()
+
+
+if __name__ == "__main__":  # run the exercise only when executed as a script
+    main()