LLM tests

This commit is contained in:
Dejvino 2026-06-30 21:44:57 +02:00
parent 6229e2e8c4
commit 6b277d725d
5 changed files with 412 additions and 12 deletions

View File

@ -52,10 +52,12 @@ class GameEngine:
if player_action:
valid, reason = validate_action(player_action, on_debug=on_debug)
if not valid:
if valid:
state.append_llm_log(f"\n[VALIDATION PASSED] {reason}")
else:
state.append_llm_log(f"\n[VALIDATION REJECTED] {reason}")
return TurnResult(
book_log=f"You can't do that — {reason}.",
book_log=f"",
log_entry=f"You can't do that — {reason}.",
user_prompt=auto_prompt(""),
)

View File

@ -1,6 +1,7 @@
from __future__ import annotations
import json
import re
from .llm import call_llm
from .paths import CHAR_PATH, WORLD_PATH
@ -9,11 +10,6 @@ from . import state
VALIDATION_PROMPT = """You are a strict RPG game master validating whether a player's action is possible given the game state. Be thorough — check inventory, stats, location, NPCs, and story logic.
Respond with JSON only:
{{"valid": true, "reason": "ok"}}
or
{{"valid": false, "reason": "brief explanation of why the action is impossible"}}
## Character
{character}
@ -30,7 +26,15 @@ or
- Does the action make sense given the character's abilities and resources? -> valid
- If valid, also check: if they're using a consumable item, note that it must be removed from inventory.
Reply with ONLY the JSON object."""
Reply with ONLY the JSON object. Examples:
```
{{"valid": true, "reason": "ok"}}
```
or
```
{{"valid": false, "reason": "brief explanation of why the action is impossible"}}
```
"""
def validate_action(
@ -48,17 +52,21 @@ def validate_action(
text = call_llm(
[{"role": "user", "content": prompt}],
max_tokens=256,
max_tokens=512,
temperature=0.2,
label="Action validation",
on_debug=on_debug,
)
if not text:
return True, ""
return False, "Not sure"
cleaned = text.strip()
m = re.search(r"```(?:json)?\s*\n?(.*?)```", cleaned, re.DOTALL)
if m:
cleaned = m.group(1).strip()
try:
data = json.loads(text.strip())
data = json.loads(cleaned)
valid = data.get("valid", True)
reason = data.get("reason", "")
if on_debug:
@ -67,7 +75,7 @@ def validate_action(
except (json.JSONDecodeError, ValueError):
if on_debug:
on_debug("action_validation", {"valid": True, "reason": "parse_failed", "raw": text[:200]})
return True, ""
return False, "Unrecognized"
def auto_prompt(book_log: str = "") -> str:

143
tools/test_llm_turn.py Normal file
View File

@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""End-to-end turn generation tests using the real configured LLM.
Tests that generate_turn handles real LLM responses correctly with
the actual character sheet and world state. Requires a running LLM.
Usage:
python3 tools/test_llm_turn.py
"""
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from engine import GameEngine
# Running tallies of checks that passed / failed; check() increments them
# and main() reports them and derives the exit code from FAIL.
PASS = 0
FAIL = 0
# Single engine instance shared by every scenario executed in main().
engine = GameEngine()
def check(label: str, result, *, expect_error=False, expect_book=True, expect_prompt=True, expect_log=None):
    """Compare one turn result with expectations and print a report line.

    Increments the module-level ``PASS`` or ``FAIL`` counter as a side effect.

    Args:
        label: Short name printed next to the pass/fail marker.
        result: Object exposing ``error``, ``book_log``, ``user_prompt`` and
            ``log_entry`` attributes.
        expect_error: When true, a falsy ``result.error`` is a failure; when
            false, a truthy ``result.error`` is a failure.
        expect_book: When true, ``result.book_log`` must be non-empty. When
            false the book log is simply not checked (it is NOT required to
            be empty).
        expect_prompt: When true, ``result.user_prompt`` must be non-empty.
        expect_log: When not ``None``, ``result.log_entry`` must equal it.
    """
    global PASS, FAIL

    # Every unmet expectation adds one human-readable line; the check passes
    # exactly when nothing was collected.
    details = []
    if expect_error and not result.error:
        details.append("expected error but got none")
    elif not expect_error and result.error:
        details.append(f"unexpected error: {result.error}")
    if expect_book and not result.book_log:
        details.append("expected non-empty book_log")
    if expect_prompt and not result.user_prompt:
        details.append("expected non-empty user_prompt")
    if expect_log is not None and result.log_entry != expect_log:
        details.append(f"expected log_entry={expect_log!r}, got {result.log_entry!r}")
    ok = not details

    # Distinct markers: previously both outcomes printed an empty string, so
    # passing and failing lines looked the same in the report.
    status = "✓" if ok else "✗"
    if ok:
        PASS += 1
    else:
        FAIL += 1

    # One-line preview of the story text (newlines flattened, 80 chars max).
    bl = result.book_log[:80].replace("\n", " ") if result.book_log else "(none)"
    print(f" {status} {label}")
    for d in details:
        print(f" {d}")
    print(f" book_log: {bl}...")
    if result.user_prompt:
        print(f" prompt: {result.user_prompt[:60]}...")
def section(name: str):
    """Print a banner: blank line, 60-char rule, the title, another rule."""
    rule = "=" * 60
    print(f"\n{rule}", f" {name}", rule, sep="\n")
def main():
    """Run each scripted turn scenario through the shared engine.

    Every scenario prints a section banner, calls ``engine.generate_turn``
    and passes the result to ``check``. Scenarios flagged as rejections skip
    the book-log requirement and echo the returned ``log_entry`` if present.

    Returns:
        0 when no check failed, otherwise 1 (used as the process exit code).
    """
    prompt = "What do you do?"
    # (section title, check label, generate_turn kwargs, expect rejection?)
    scenarios = [
        (
            "First turn — no player action (story opening)",
            "Story opening",
            {},
            False,
        ),
        (
            "Valid action — buy a drink",
            "Buy ale",
            {
                "player_action": "I buy a mug of ale at the Splintered Tankard",
                "last_prompt": prompt,
            },
            False,
        ),
        (
            "Valid action — talk to an NPC",
            "Ask Otta",
            {
                "player_action": "I ask Mistress Otta about recent rumours",
                "last_prompt": prompt,
            },
            False,
        ),
        (
            "Valid action — use inventory item",
            "Use healing salve",
            {
                "player_action": "I apply my healing salve to restore HP",
                "last_prompt": prompt,
            },
            False,
        ),
        (
            "Valid action — explore",
            "Visit market",
            {
                "player_action": "I head to the Market Square to look around",
                "last_prompt": prompt,
            },
            False,
        ),
        (
            "Invalid action — use non-existent item",
            "Potion of invisibility",
            {
                "player_action": "I drink a potion of invisibility",
                "last_prompt": prompt,
            },
            True,
        ),
        (
            "Invalid action — cast spell (not a weaver)",
            "Fireball spell",
            {
                "player_action": "I cast a fireball at the tavern ceiling",
                "last_prompt": prompt,
            },
            True,
        ),
        (
            "Invalid action — nonsensical",
            "Fly to moon",
            {
                "player_action": "I fly to the moon",
                "last_prompt": prompt,
            },
            True,
        ),
        (
            "Resume from last_prompt (no player action)",
            "Resume scene",
            {
                "last_prompt": "You stand in the market square, surrounded by stalls and bustle. What do you do?",
            },
            False,
        ),
    ]

    for title, label, turn_kwargs, rejected in scenarios:
        section(title)
        outcome = engine.generate_turn(**turn_kwargs)
        if rejected:
            # A rejected action is not required to produce story text.
            check(label, outcome, expect_error=False, expect_book=False)
            if outcome.log_entry:
                print(f" log: {outcome.log_entry}")
        else:
            check(label, outcome, expect_error=False, expect_book=True, expect_prompt=True)

    rule = "=" * 60
    print(f"\n{rule}")
    print(f" Results: {PASS} passed, {FAIL} failed")
    print(rule)
    return 1 if FAIL else 0


if __name__ == "__main__":
    sys.exit(main())

View File

@ -0,0 +1,93 @@
#!/usr/bin/env python3
"""End-to-end validation tests using the real configured LLM.
Tests that validate_action handles real LLM responses correctly with
the actual character sheet and world state. Requires a running LLM.
Usage:
python3 tools/test_llm_validation.py
"""
import sys
import os
import json
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from engine_lib.validation import validate_action
# Running tallies of checks that passed / failed; check() increments them
# and main() reports them and derives the exit code from FAIL.
PASS = 0
FAIL = 0
def check(label: str, valid: bool, reason: str, expected_valid: bool):
    """Record and print whether a validation verdict matched the expectation.

    Increments the module-level ``PASS`` counter when ``valid`` equals
    ``expected_valid`` and ``FAIL`` otherwise, then prints one report line.

    Args:
        label: Short name of the case being reported.
        valid: Verdict obtained for the action.
        reason: Explanation that accompanied the verdict (printed verbatim).
        expected_valid: Verdict the case is supposed to produce.
    """
    global PASS, FAIL
    matched = valid == expected_valid
    # Distinct markers: previously both outcomes printed an empty string, so
    # passing and failing lines looked the same in the report.
    status = "✓" if matched else "✗"
    if matched:
        PASS += 1
    else:
        FAIL += 1
    print(f' {status} {label}: valid={valid}, reason="{reason}"')
def section(name: str):
    """Print a banner: blank line, 60-char rule, the title, another rule."""
    rule = "=" * 60
    print(f"\n{rule}", f" {name}", rule, sep="\n")
def main():
    """Feed each scripted action to ``validate_action`` and tally the verdicts.

    Cases are grouped under section banners; each case's verdict and reason
    are handed to ``check`` together with the expected verdict.

    Returns:
        0 when no check failed, otherwise 1 (used as the process exit code).
    """
    # (section title, [(check label, player action, expected verdict), ...])
    groups = [
        (
            "Valid actions — should pass",
            [
                ("Buy a drink", "I buy a mug of weak ale at the Splintered Tankard", True),
                ("Use healing salve", "I use my healing salve to restore 1 HP", True),
                ("Talk to Otta", "I ask Mistress Otta about recent news in the Keep", True),
                ("Visit the market", "I head to the Market Square to browse stalls", True),
            ],
        ),
        (
            "Invalid actions — should fail",
            [
                ("Use non-existent item", "I drink a potion of invisibility", False),
                ("Cast a spell (not a weaver)", "I cast a fireball spell at the tavern", False),
                ("Buy impossible item", "I buy a horse for a broken copper coin", False),
                ("Assert false state", "I fly to the moon", False),
            ],
        ),
        (
            "Edge cases",
            [
                ("Empty action", "", True),
                ("Garbled nonsense", "qwxz jabberwocky flargle bargle", False),
            ],
        ),
    ]

    for title, cases in groups:
        section(title)
        for label, action, expected in cases:
            # validate_action's result is unpacked into check's
            # (valid, reason) positions.
            check(label, *validate_action(action), expected_valid=expected)

    rule = "=" * 60
    print(f"\n{rule}")
    print(f" Results: {PASS} passed, {FAIL} failed")
    print(rule)
    return 1 if FAIL else 0


if __name__ == "__main__":
    sys.exit(main())

154
tools/test_validation.py Normal file
View File

@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""Tests for engine_lib/validation.py."""
import sys
import os
import json
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from unittest.mock import patch, MagicMock
def test_empty_action():
    """An empty action string yields a passing verdict with an empty reason."""
    from engine_lib.validation import validate_action

    verdict, why = validate_action("")
    assert verdict is True
    assert why == ""
    print("✓ empty action returns (True, '')")
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_valid_action(mock_call_llm, mock_truncate_world, mock_read_file):
    """A ``valid: true`` JSON reply from the LLM comes back as (True, reason)."""
    from engine_lib.validation import validate_action

    def fake_read(p):
        # Character-sheet paths get stats; anything else gets world text.
        if "character" in str(p).lower():
            return "HP: 10\nGold: 5"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    llm_reply = {"valid": True, "reason": "ok"}
    mock_call_llm.return_value = json.dumps(llm_reply)

    verdict, why = validate_action("I buy a drink")

    assert verdict is True
    assert why == "ok"
    mock_call_llm.assert_called_once()
    print("✓ valid action returns (True, reason)")
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_invalid_action(mock_call_llm, mock_truncate_world, mock_read_file):
    """A ``valid: false`` JSON reply comes back as (False, reason)."""
    from engine_lib.validation import validate_action

    def fake_read(p):
        # Character-sheet paths get stats; anything else gets world text.
        if "character" in str(p).lower():
            return "HP: 10\nGold: 0"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    llm_reply = {"valid": False, "reason": "Not enough gold"}
    mock_call_llm.return_value = json.dumps(llm_reply)

    verdict, why = validate_action("I buy a drink")

    assert verdict is False
    assert why == "Not enough gold"
    print("✓ invalid action returns (False, reason)")
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_llm_returns_none(mock_call_llm, mock_truncate_world, mock_read_file):
    """When the LLM call returns None the action is rejected as 'Not sure'."""
    from engine_lib.validation import validate_action

    def fake_read(p):
        # Character-sheet paths get stats; anything else gets world text.
        if "character" in str(p).lower():
            return "HP: 10"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    mock_call_llm.return_value = None

    verdict, why = validate_action("I attack the dragon")

    assert verdict is False
    assert why == "Not sure"
    print("✓ LLM returning None gives (False, 'Not sure')")
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_llm_returns_bad_json(mock_call_llm, mock_truncate_world, mock_read_file):
    """A non-JSON LLM reply is rejected with the reason 'Unrecognized'."""
    from engine_lib.validation import validate_action

    def fake_read(p):
        # Character-sheet paths get stats; anything else gets world text.
        if "character" in str(p).lower():
            return "HP: 10"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    mock_call_llm.return_value = "not valid json at all"

    verdict, why = validate_action("I cast a spell")

    assert verdict is False
    assert why == "Unrecognized"
    print("✓ bad JSON from LLM gives (False, 'Unrecognized')")
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
def test_missing_character_sheet(mock_truncate_world, mock_read_file):
    """Validation still returns a verdict when the sheet reads back empty."""
    from engine_lib.validation import validate_action

    mock_read_file.return_value = ""
    mock_truncate_world.return_value = "*No world state.*"
    llm_reply = json.dumps({"valid": True, "reason": "ok"})

    # The LLM call is stubbed only for the duration of the validation call.
    with patch("engine_lib.validation.call_llm", return_value=llm_reply):
        verdict, _why = validate_action("I look around")

    assert verdict is True
    print("✓ handles missing character sheet gracefully")
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_on_debug_called(mock_call_llm, mock_truncate_world, mock_read_file):
    """The ``on_debug`` callback receives exactly one action_validation event."""
    from engine_lib.validation import validate_action

    def fake_read(p):
        # Character-sheet paths get stats; anything else gets world text.
        if "character" in str(p).lower():
            return "HP: 10"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    mock_call_llm.return_value = json.dumps({"valid": True, "reason": "ok"})

    recorded = []

    def debug_cb(key, data):
        recorded.append((key, data))

    verdict, _why = validate_action("I open the door", on_debug=debug_cb)

    assert verdict is True
    assert len(recorded) == 1
    event_key, event_data = recorded[0]
    assert event_key == "action_validation"
    assert event_data["valid"] is True
    print("✓ on_debug callback receives action_validation event")
def test_auto_prompt_default():
    """Calling ``auto_prompt`` with no arguments gives the stock prompt."""
    from engine_lib.validation import auto_prompt

    prompt_text = auto_prompt()
    assert prompt_text == "**What do you do?**"
    print("✓ auto_prompt() returns default prompt")
def test_auto_prompt_with_log():
    """Supplying ``book_log`` does not change the prompt that is returned."""
    from engine_lib.validation import auto_prompt

    prompt_text = auto_prompt(book_log="Some story text")
    assert prompt_text == "**What do you do?**"
    print("✓ auto_prompt() ignores book_log argument")
if __name__ == "__main__":
    # Run the tests in definition order; the first failing assertion aborts
    # the run before the final summary line is printed.
    for test_fn in (
        test_empty_action,
        test_valid_action,
        test_invalid_action,
        test_llm_returns_none,
        test_llm_returns_bad_json,
        test_missing_character_sheet,
        test_on_debug_called,
        test_auto_prompt_default,
        test_auto_prompt_with_log,
    ):
        test_fn()
    print("\n✓ All validation tests passed")