LLM tests

2026-06-30 21:44:57 +02:00 · 2026-06-30 21:44:57 +02:00 · 6b277d725d
commit 6b277d725d
parent 6229e2e8c4
5 changed files with 412 additions and 12 deletions
--- a/tools/engine.py
+++ b/tools/engine.py
@ -52,10 +52,12 @@ class GameEngine:

        if player_action:
            valid, reason = validate_action(player_action, on_debug=on_debug)
-            if not valid:
+            if valid:
+                state.append_llm_log(f"\n[VALIDATION PASSED] {reason}")
+            else:
                state.append_llm_log(f"\n[VALIDATION REJECTED] {reason}")
                return TurnResult(
-                    book_log=f"You can't do that — {reason}.",
+                    book_log=f"",
                    log_entry=f"You can't do that — {reason}.",
                    user_prompt=auto_prompt(""),
                )
--- a/tools/engine_lib/validation.py
+++ b/tools/engine_lib/validation.py
@ -1,6 +1,7 @@
 from __future__ import annotations

 import json
+import re

 from .llm import call_llm
 from .paths import CHAR_PATH, WORLD_PATH
@ -9,11 +10,6 @@ from . import state

 VALIDATION_PROMPT = """You are a strict RPG game master validating whether a player's action is possible given the game state. Be thorough — check inventory, stats, location, NPCs, and story logic.

-Respond with JSON only:
-{{"valid": true, "reason": "ok"}}
-or
-{{"valid": false, "reason": "brief explanation of why the action is impossible"}}
-
 ## Character
 {character}

@ -30,7 +26,15 @@ or
 - Does the action make sense given the character's abilities and resources? -> valid
 - If valid, also check: if they're using a consumable item, note that it must be removed from inventory.

-Reply with ONLY the JSON object."""
+Reply with ONLY the JSON object. Examples:
+```
+{{"valid": true, "reason": "ok"}}
+```
+or
+```
+{{"valid": false, "reason": "brief explanation of why the action is impossible"}}
+```
+"""


 def validate_action(
@ -48,17 +52,21 @@ def validate_action(

    text = call_llm(
        [{"role": "user", "content": prompt}],
-        max_tokens=256,
+        max_tokens=512,
        temperature=0.2,
        label="Action validation",
        on_debug=on_debug,
    )

    if not text:
-        return True, ""
+        return False, "Not sure"

+    cleaned = text.strip()
+    m = re.search(r"```(?:json)?\s*\n?(.*?)```", cleaned, re.DOTALL)
+    if m:
+        cleaned = m.group(1).strip()
    try:
-        data = json.loads(text.strip())
+        data = json.loads(cleaned)
        valid = data.get("valid", True)
        reason = data.get("reason", "")
        if on_debug:
@ -67,7 +75,7 @@ def validate_action(
    except (json.JSONDecodeError, ValueError):
        if on_debug:
            on_debug("action_validation", {"valid": True, "reason": "parse_failed", "raw": text[:200]})
-        return True, ""
+        return False, "Unrecognized"


 def auto_prompt(book_log: str = "") -> str:
--- a/tools/test_llm_turn.py
+++ b/tools/test_llm_turn.py
@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+"""End-to-end turn generation tests using the real configured LLM.
+
+Tests that generate_turn handles real LLM responses correctly with
+the actual character sheet and world state. Requires a running LLM.
+
+Usage:
+    python3 tools/test_llm_turn.py
+"""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from engine import GameEngine
+
+PASS = 0
+FAIL = 0
+engine = GameEngine()
+
+
+def check(label: str, result, *, expect_error=False, expect_book=True, expect_prompt=True, expect_log=None):
+    """Check one turn result against expectations and print a short report.
+
+    Args:
+        label: Short name printed next to the pass/fail mark.
+        result: Object returned by ``engine.generate_turn``; its ``error``,
+            ``book_log``, ``user_prompt`` and ``log_entry`` attributes are read.
+        expect_error: Whether ``result.error`` must be truthy.
+        expect_book: ``True`` requires a non-empty ``book_log``; ``False``
+            requires an empty one.
+        expect_prompt: When true, ``user_prompt`` must be non-empty.
+        expect_log: When not ``None``, ``log_entry`` must equal it exactly.
+
+    Increments the module-level ``PASS`` or ``FAIL`` counter as a side effect.
+    """
+    global PASS, FAIL
+    details = []
+
+    if expect_error and not result.error:
+        details.append("expected error but got none")
+    elif not expect_error and result.error:
+        details.append(f"unexpected error: {result.error}")
+
+    if expect_book and not result.book_log:
+        details.append("expected non-empty book_log")
+    elif not expect_book and result.book_log:
+        # Previously expect_book=False asserted nothing at all, so a scenario
+        # that expected no narration passed even when narration was produced.
+        details.append("expected empty book_log")
+
+    if expect_prompt and not result.user_prompt:
+        details.append("expected non-empty user_prompt")
+
+    if expect_log is not None and result.log_entry != expect_log:
+        details.append(f"expected log_entry={expect_log!r}, got {result.log_entry!r}")
+
+    # The check passes exactly when no expectation was violated.
+    ok = not details
+    status = "✓" if ok else "✗"
+    if ok:
+        PASS += 1
+    else:
+        FAIL += 1
+
+    bl = result.book_log[:80].replace("\n", " ") if result.book_log else "(none)"
+    print(f"  {status} {label}")
+    for d in details:
+        print(f"       {d}")
+    print(f"       book_log: {bl}...")
+    if result.user_prompt:
+        print(f"       prompt: {result.user_prompt[:60]}...")
+
+
+def section(name: str):
+    """Print a banner: a blank line, a 60-character rule, the name, a rule."""
+    rule = "=" * 60
+    print(f"\n{rule}")
+    print(f"  {name}")
+    print(rule)
+
+
+def main():
+    """Run the end-to-end turn scenarios in sequence and print the totals.
+
+    Every scenario calls ``generate_turn`` on the shared module-level
+    ``engine`` and hands the result to ``check``, which prints a report and
+    updates the ``PASS``/``FAIL`` counters.
+
+    Returns:
+        0 when no check failed, otherwise 1; the script passes this value to
+        ``sys.exit``.
+    """
+    section("First turn — no player action (story opening)")
+    r = engine.generate_turn()
+    check("Story opening", r, expect_error=False, expect_book=True, expect_prompt=True)
+
+    section("Valid action — buy a drink")
+    r = engine.generate_turn(
+        player_action="I buy a mug of ale at the Splintered Tankard",
+        last_prompt="What do you do?",
+    )
+    check("Buy ale", r, expect_error=False, expect_book=True, expect_prompt=True)
+
+    section("Valid action — talk to an NPC")
+    r = engine.generate_turn(
+        player_action="I ask Mistress Otta about recent rumours",
+        last_prompt="What do you do?",
+    )
+    check("Ask Otta", r, expect_error=False, expect_book=True, expect_prompt=True)
+
+    section("Valid action — use inventory item")
+    r = engine.generate_turn(
+        player_action="I apply my healing salve to restore HP",
+        last_prompt="What do you do?",
+    )
+    check("Use healing salve", r, expect_error=False, expect_book=True, expect_prompt=True)
+
+    section("Valid action — explore")
+    r = engine.generate_turn(
+        player_action="I head to the Market Square to look around",
+        last_prompt="What do you do?",
+    )
+    check("Visit market", r, expect_error=False, expect_book=True, expect_prompt=True)
+
+    # For the invalid-action scenarios the log_entry, when present, is echoed
+    # so the recorded reason can be inspected by eye.
+    section("Invalid action — use non-existent item")
+    r = engine.generate_turn(
+        player_action="I drink a potion of invisibility",
+        last_prompt="What do you do?",
+    )
+    check("Potion of invisibility", r, expect_error=False, expect_book=False)
+    if r.log_entry:
+        print(f"       log: {r.log_entry}")
+
+    section("Invalid action — cast spell (not a weaver)")
+    r = engine.generate_turn(
+        player_action="I cast a fireball at the tavern ceiling",
+        last_prompt="What do you do?",
+    )
+    check("Fireball spell", r, expect_error=False, expect_book=False)
+    if r.log_entry:
+        print(f"       log: {r.log_entry}")
+
+    section("Invalid action — nonsensical")
+    r = engine.generate_turn(
+        player_action="I fly to the moon",
+        last_prompt="What do you do?",
+    )
+    check("Fly to moon", r, expect_error=False, expect_book=False)
+    if r.log_entry:
+        print(f"       log: {r.log_entry}")
+
+    # Only last_prompt is supplied here; player_action is left at its default.
+    section("Resume from last_prompt (no player action)")
+    r = engine.generate_turn(
+        last_prompt="You stand in the market square, surrounded by stalls and bustle. What do you do?",
+    )
+    check("Resume scene", r, expect_error=False, expect_book=True, expect_prompt=True)
+
+    print(f"\n{'=' * 60}")
+    print(f"  Results: {PASS} passed, {FAIL} failed")
+    print(f"{'=' * 60}")
+    # Exit status: zero only when every check passed.
+    return 0 if FAIL == 0 else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tools/test_llm_validation.py
+++ b/tools/test_llm_validation.py
@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+"""End-to-end validation tests using the real configured LLM.
+
+Tests that validate_action handles real LLM responses correctly with
+the actual character sheet and world state. Requires a running LLM.
+
+Usage:
+    python3 tools/test_llm_validation.py
+"""
+
+import sys
+import os
+import json
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from engine_lib.validation import validate_action
+
+PASS = 0
+FAIL = 0
+
+
+def check(label: str, valid: bool, reason: str, expected_valid: bool):
+    """Compare a validation verdict with the expected one and report it.
+
+    Bumps the module-level ``PASS`` counter when ``valid`` equals
+    ``expected_valid`` and ``FAIL`` otherwise, then prints one result line
+    containing the label, the verdict and the reason.
+    """
+    global PASS, FAIL
+    if valid == expected_valid:
+        status = "✓"
+        PASS += 1
+    else:
+        status = "✗"
+        FAIL += 1
+    print(f"  {status} {label}: valid={valid}, reason=\"{reason}\"")
+
+
+def section(name: str):
+    """Print a banner: a blank line, a 60-character rule, the name, a rule."""
+    rule = "=" * 60
+    print(f"\n{rule}")
+    print(f"  {name}")
+    print(rule)
+
+
+def main():
+    """Run ``validate_action`` on each sample action and print the totals.
+
+    Each call's returned pair is unpacked into ``check`` as its ``valid`` and
+    ``reason`` arguments and compared with ``expected_valid``.
+
+    Returns:
+        0 when no check failed, otherwise 1; the script passes this value to
+        ``sys.exit``.
+    """
+    section("Valid actions — should pass")
+
+    check("Buy a drink",
+          *validate_action("I buy a mug of weak ale at the Splintered Tankard"),
+          expected_valid=True)
+
+    check("Use healing salve",
+          *validate_action("I use my healing salve to restore 1 HP"),
+          expected_valid=True)
+
+    check("Talk to Otta",
+          *validate_action("I ask Mistress Otta about recent news in the Keep"),
+          expected_valid=True)
+
+    check("Visit the market",
+          *validate_action("I head to the Market Square to browse stalls"),
+          expected_valid=True)
+
+    section("Invalid actions — should fail")
+
+    check("Use non-existent item",
+          *validate_action("I drink a potion of invisibility"),
+          expected_valid=False)
+
+    check("Cast a spell (not a weaver)",
+          *validate_action("I cast a fireball spell at the tavern"),
+          expected_valid=False)
+
+    check("Buy impossible item",
+          *validate_action("I buy a horse for a broken copper coin"),
+          expected_valid=False)
+
+    check("Assert false state",
+          *validate_action("I fly to the moon"),
+          expected_valid=False)
+
+    section("Edge cases")
+
+    # The empty string is expected to be accepted, the gibberish rejected.
+    check("Empty action",
+          *validate_action(""),
+          expected_valid=True)
+
+    check("Garbled nonsense",
+          *validate_action("qwxz jabberwocky flargle bargle"),
+          expected_valid=False)
+
+    print(f"\n{'=' * 60}")
+    print(f"  Results: {PASS} passed, {FAIL} failed")
+    print(f"{'=' * 60}")
+    # Exit status: zero only when every check matched its expectation.
+    return 0 if FAIL == 0 else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tools/test_validation.py
+++ b/tools/test_validation.py
@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""Tests for engine_lib/validation.py."""
+
+import sys
+import os
+import json
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from unittest.mock import patch, MagicMock
+
+
+def test_empty_action():
+    """An empty action string must yield the pair (True, '')."""
+    from engine_lib.validation import validate_action
+    is_valid, why = validate_action("")
+    assert is_valid is True
+    assert why == ""
+    print("✓ empty action returns (True, '')")
+
+
+@patch("engine_lib.validation.state.read_file")
+@patch("engine_lib.validation.state.truncate_world")
+@patch("engine_lib.validation.call_llm")
+def test_valid_action(mock_call_llm, mock_truncate_world, mock_read_file):
+    """An approving JSON verdict from the LLM comes back as (True, reason)."""
+    from engine_lib.validation import validate_action
+
+    def fake_read(path):
+        # Character-sheet paths get stats; anything else gets world text.
+        if "character" in str(path).lower():
+            return "HP: 10\nGold: 5"
+        return "## Location\nTavern"
+
+    mock_read_file.side_effect = fake_read
+    mock_truncate_world.return_value = "## Location\nTavern"
+    mock_call_llm.return_value = json.dumps({"valid": True, "reason": "ok"})
+
+    is_valid, why = validate_action("I buy a drink")
+
+    assert is_valid is True
+    assert why == "ok"
+    mock_call_llm.assert_called_once()
+    print("✓ valid action returns (True, reason)")
+
+
+@patch("engine_lib.validation.state.read_file")
+@patch("engine_lib.validation.state.truncate_world")
+@patch("engine_lib.validation.call_llm")
+def test_invalid_action(mock_call_llm, mock_truncate_world, mock_read_file):
+    """A rejecting JSON verdict from the LLM comes back as (False, reason)."""
+    from engine_lib.validation import validate_action
+
+    def fake_read(path):
+        # Character-sheet paths get stats; anything else gets world text.
+        if "character" in str(path).lower():
+            return "HP: 10\nGold: 0"
+        return "## Location\nTavern"
+
+    mock_read_file.side_effect = fake_read
+    mock_truncate_world.return_value = "## Location\nTavern"
+    mock_call_llm.return_value = json.dumps({"valid": False, "reason": "Not enough gold"})
+
+    is_valid, why = validate_action("I buy a drink")
+
+    assert is_valid is False
+    assert why == "Not enough gold"
+    print("✓ invalid action returns (False, reason)")
+
+
+@patch("engine_lib.validation.state.read_file")
+@patch("engine_lib.validation.state.truncate_world")
+@patch("engine_lib.validation.call_llm")
+def test_llm_returns_none(mock_call_llm, mock_truncate_world, mock_read_file):
+    """No LLM reply at all is treated as a rejection with reason 'Not sure'."""
+    from engine_lib.validation import validate_action
+
+    def fake_read(path):
+        # Character-sheet paths get stats; anything else gets world text.
+        if "character" in str(path).lower():
+            return "HP: 10"
+        return "## Location\nTavern"
+
+    mock_read_file.side_effect = fake_read
+    mock_truncate_world.return_value = "## Location\nTavern"
+    mock_call_llm.return_value = None
+
+    is_valid, why = validate_action("I attack the dragon")
+
+    assert is_valid is False
+    assert why == "Not sure"
+    print("✓ LLM returning None gives (False, 'Not sure')")
+
+
+@patch("engine_lib.validation.state.read_file")
+@patch("engine_lib.validation.state.truncate_world")
+@patch("engine_lib.validation.call_llm")
+def test_llm_returns_bad_json(mock_call_llm, mock_truncate_world, mock_read_file):
+    """An unparseable LLM reply is rejected with reason 'Unrecognized'."""
+    from engine_lib.validation import validate_action
+
+    def fake_read(path):
+        # Character-sheet paths get stats; anything else gets world text.
+        if "character" in str(path).lower():
+            return "HP: 10"
+        return "## Location\nTavern"
+
+    mock_read_file.side_effect = fake_read
+    mock_truncate_world.return_value = "## Location\nTavern"
+    mock_call_llm.return_value = "not valid json at all"
+
+    is_valid, why = validate_action("I cast a spell")
+
+    assert is_valid is False
+    assert why == "Unrecognized"
+    print("✓ bad JSON from LLM gives (False, 'Unrecognized')")
+
+
+@patch("engine_lib.validation.state.read_file")
+@patch("engine_lib.validation.state.truncate_world")
+def test_missing_character_sheet(mock_truncate_world, mock_read_file):
+    """An empty character sheet still lets an approved action through."""
+    from engine_lib.validation import validate_action
+
+    mock_read_file.return_value = ""
+    mock_truncate_world.return_value = "*No world state.*"
+
+    approved = json.dumps({"valid": True, "reason": "ok"})
+    # return_value is forwarded by patch() to the replacement mock.
+    with patch("engine_lib.validation.call_llm", return_value=approved):
+        is_valid, _why = validate_action("I look around")
+
+    assert is_valid is True
+    print("✓ handles missing character sheet gracefully")
+
+
+@patch("engine_lib.validation.state.read_file")
+@patch("engine_lib.validation.state.truncate_world")
+@patch("engine_lib.validation.call_llm")
+def test_on_debug_called(mock_call_llm, mock_truncate_world, mock_read_file):
+    """The on_debug callback receives exactly one 'action_validation' event."""
+    from engine_lib.validation import validate_action
+
+    def fake_read(path):
+        # Character-sheet paths get stats; anything else gets world text.
+        if "character" in str(path).lower():
+            return "HP: 10"
+        return "## Location\nTavern"
+
+    mock_read_file.side_effect = fake_read
+    mock_truncate_world.return_value = "## Location\nTavern"
+    mock_call_llm.return_value = json.dumps({"valid": True, "reason": "ok"})
+
+    # Record every (key, data) pair the validator reports.
+    events = []
+
+    is_valid, _why = validate_action(
+        "I open the door",
+        on_debug=lambda key, data: events.append((key, data)),
+    )
+
+    assert is_valid is True
+    assert len(events) == 1
+    event_key, payload = events[0]
+    assert event_key == "action_validation"
+    assert payload["valid"] is True
+    print("✓ on_debug callback receives action_validation event")
+
+
+def test_auto_prompt_default():
+    """auto_prompt() with no arguments returns the stock question."""
+    from engine_lib.validation import auto_prompt
+    prompt = auto_prompt()
+    assert prompt == "**What do you do?**"
+    print("✓ auto_prompt() returns default prompt")
+
+
+def test_auto_prompt_with_log():
+    """Passing book_log does not change the prompt auto_prompt returns."""
+    from engine_lib.validation import auto_prompt
+    prompt = auto_prompt(book_log="Some story text")
+    assert prompt == "**What do you do?**"
+    print("✓ auto_prompt() ignores book_log argument")
+
+
+if __name__ == "__main__":
+    # Plain-script runner: run every test in file order; the first failing
+    # assertion aborts the run before the summary line is printed.
+    for test_case in (
+        test_empty_action,
+        test_valid_action,
+        test_invalid_action,
+        test_llm_returns_none,
+        test_llm_returns_bad_json,
+        test_missing_character_sheet,
+        test_on_debug_called,
+        test_auto_prompt_default,
+        test_auto_prompt_with_log,
+    ):
+        test_case()
+    print("\n✓ All validation tests passed")