LLM tests
This commit is contained in:
parent
6229e2e8c4
commit
6b277d725d
@ -52,10 +52,12 @@ class GameEngine:
|
||||
|
||||
if player_action:
|
||||
valid, reason = validate_action(player_action, on_debug=on_debug)
|
||||
if not valid:
|
||||
if valid:
|
||||
state.append_llm_log(f"\n[VALIDATION PASSED] {reason}")
|
||||
else:
|
||||
state.append_llm_log(f"\n[VALIDATION REJECTED] {reason}")
|
||||
return TurnResult(
|
||||
book_log=f"You can't do that — {reason}.",
|
||||
book_log=f"",
|
||||
log_entry=f"You can't do that — {reason}.",
|
||||
user_prompt=auto_prompt(""),
|
||||
)
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
from .llm import call_llm
|
||||
from .paths import CHAR_PATH, WORLD_PATH
|
||||
@ -9,11 +10,6 @@ from . import state
|
||||
|
||||
VALIDATION_PROMPT = """You are a strict RPG game master validating whether a player's action is possible given the game state. Be thorough — check inventory, stats, location, NPCs, and story logic.
|
||||
|
||||
Respond with JSON only:
|
||||
{{"valid": true, "reason": "ok"}}
|
||||
or
|
||||
{{"valid": false, "reason": "brief explanation of why the action is impossible"}}
|
||||
|
||||
## Character
|
||||
{character}
|
||||
|
||||
@ -30,7 +26,15 @@ or
|
||||
- Does the action make sense given the character's abilities and resources? -> valid
|
||||
- If valid, also check: if they're using a consumable item, note that it must be removed from inventory.
|
||||
|
||||
Reply with ONLY the JSON object."""
|
||||
Reply with ONLY the JSON object. Examples:
|
||||
```
|
||||
{{"valid": true, "reason": "ok"}}
|
||||
```
|
||||
or
|
||||
```
|
||||
{{"valid": false, "reason": "brief explanation of why the action is impossible"}}
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
def validate_action(
|
||||
@ -48,17 +52,21 @@ def validate_action(
|
||||
|
||||
text = call_llm(
|
||||
[{"role": "user", "content": prompt}],
|
||||
max_tokens=256,
|
||||
max_tokens=512,
|
||||
temperature=0.2,
|
||||
label="Action validation",
|
||||
on_debug=on_debug,
|
||||
)
|
||||
|
||||
if not text:
|
||||
return True, ""
|
||||
return False, "Not sure"
|
||||
|
||||
cleaned = text.strip()
|
||||
m = re.search(r"```(?:json)?\s*\n?(.*?)```", cleaned, re.DOTALL)
|
||||
if m:
|
||||
cleaned = m.group(1).strip()
|
||||
try:
|
||||
data = json.loads(text.strip())
|
||||
data = json.loads(cleaned)
|
||||
valid = data.get("valid", True)
|
||||
reason = data.get("reason", "")
|
||||
if on_debug:
|
||||
@ -67,7 +75,7 @@ def validate_action(
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
if on_debug:
|
||||
on_debug("action_validation", {"valid": True, "reason": "parse_failed", "raw": text[:200]})
|
||||
return True, ""
|
||||
return False, "Unrecognized"
|
||||
|
||||
|
||||
def auto_prompt(book_log: str = "") -> str:
|
||||
|
||||
143
tools/test_llm_turn.py
Normal file
143
tools/test_llm_turn.py
Normal file
@ -0,0 +1,143 @@
|
||||
#!/usr/bin/env python3
|
||||
"""End-to-end turn generation tests using the real configured LLM.
|
||||
|
||||
Tests that generate_turn handles real LLM responses correctly with
|
||||
the actual character sheet and world state. Requires a running LLM.
|
||||
|
||||
Usage:
|
||||
python3 tools/test_llm_turn.py
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from engine import GameEngine
|
||||
|
||||
PASS = 0
|
||||
FAIL = 0
|
||||
engine = GameEngine()
|
||||
|
||||
|
||||
def check(label: str, result, *, expect_error=False, expect_book=True, expect_prompt=True, expect_log=None):
    """Compare one turn result against expectations and print a report.

    Increments the module-level ``PASS`` or ``FAIL`` counter.

    Args:
        label: Short name of the scenario, printed next to the status mark.
        result: Object exposing ``error``, ``book_log``, ``user_prompt`` and
            ``log_entry`` attributes (the value returned by
            ``engine.generate_turn``).
        expect_error: ``True`` requires ``result.error`` to be truthy,
            ``False`` requires it to be falsy.
        expect_book: ``True`` requires a non-empty ``book_log``; ``False``
            requires an empty one.
        expect_prompt: ``True`` requires a non-empty ``user_prompt``;
            ``False`` requires an empty one.
        expect_log: When not ``None``, ``result.log_entry`` must equal it.
    """
    global PASS, FAIL
    details = []

    if expect_error and not result.error:
        details.append("expected error but got none")
    elif not expect_error and result.error:
        details.append(f"unexpected error: {result.error}")

    # A False expectation used to skip the check altogether, which let
    # rejected-action scenarios pass even when the engine narrated the action.
    if expect_book and not result.book_log:
        details.append("expected non-empty book_log")
    elif not expect_book and result.book_log:
        details.append("expected empty book_log")

    if expect_prompt and not result.user_prompt:
        details.append("expected non-empty user_prompt")
    elif not expect_prompt and result.user_prompt:
        details.append("expected empty user_prompt")

    if expect_log is not None and result.log_entry != expect_log:
        details.append(f"expected log_entry={expect_log!r}, got {result.log_entry!r}")

    ok = not details
    status = "✓" if ok else "✗"
    if ok:
        PASS += 1
    else:
        FAIL += 1

    # Single-line preview of the narrative so the report stays compact.
    bl = result.book_log[:80].replace("\n", " ") if result.book_log else "(none)"
    print(f" {status} {label}")
    for d in details:
        print(f" {d}")
    print(f" book_log: {bl}...")
    if result.user_prompt:
        print(f" prompt: {result.user_prompt[:60]}...")
|
||||
|
||||
|
||||
def section(name: str):
    """Print a banner: a blank line, a 60-character rule, the name, another rule."""
    print(f"\n{'=' * 60}", f" {name}", f"{'=' * 60}", sep="\n")
|
||||
|
||||
|
||||
def main():
    """Run every turn scenario in order and return a process exit code.

    Each scenario prints a section banner, calls ``engine.generate_turn`` with
    the listed keyword arguments and hands the result to ``check``. Scenarios
    flagged to echo the log also print ``log_entry`` when it is non-empty.

    Returns:
        ``0`` when the ``FAIL`` counter is zero, otherwise ``1``.
    """
    ask = "What do you do?"
    narrated = {"expect_error": False, "expect_book": True, "expect_prompt": True}
    rejected = {"expect_error": False, "expect_book": False}

    # (section title, check label, generate_turn kwargs, check kwargs, echo log_entry)
    scenarios = [
        ("First turn — no player action (story opening)", "Story opening",
         {}, narrated, False),
        ("Valid action — buy a drink", "Buy ale",
         {"player_action": "I buy a mug of ale at the Splintered Tankard", "last_prompt": ask},
         narrated, False),
        ("Valid action — talk to an NPC", "Ask Otta",
         {"player_action": "I ask Mistress Otta about recent rumours", "last_prompt": ask},
         narrated, False),
        ("Valid action — use inventory item", "Use healing salve",
         {"player_action": "I apply my healing salve to restore HP", "last_prompt": ask},
         narrated, False),
        ("Valid action — explore", "Visit market",
         {"player_action": "I head to the Market Square to look around", "last_prompt": ask},
         narrated, False),
        ("Invalid action — use non-existent item", "Potion of invisibility",
         {"player_action": "I drink a potion of invisibility", "last_prompt": ask},
         rejected, True),
        ("Invalid action — cast spell (not a weaver)", "Fireball spell",
         {"player_action": "I cast a fireball at the tavern ceiling", "last_prompt": ask},
         rejected, True),
        ("Invalid action — nonsensical", "Fly to moon",
         {"player_action": "I fly to the moon", "last_prompt": ask},
         rejected, True),
        ("Resume from last_prompt (no player action)", "Resume scene",
         {"last_prompt": "You stand in the market square, surrounded by stalls and bustle. What do you do?"},
         narrated, False),
    ]

    for title, label, turn_kwargs, expectations, echo_log in scenarios:
        section(title)
        outcome = engine.generate_turn(**turn_kwargs)
        check(label, outcome, **expectations)
        if echo_log and outcome.log_entry:
            print(f" log: {outcome.log_entry}")

    print(f"\n{'=' * 60}")
    print(f" Results: {PASS} passed, {FAIL} failed")
    print(f"{'=' * 60}")
    return int(FAIL != 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
93
tools/test_llm_validation.py
Normal file
93
tools/test_llm_validation.py
Normal file
@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env python3
|
||||
"""End-to-end validation tests using the real configured LLM.
|
||||
|
||||
Tests that validate_action handles real LLM responses correctly with
|
||||
the actual character sheet and world state. Requires a running LLM.
|
||||
|
||||
Usage:
|
||||
python3 tools/test_llm_validation.py
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from engine_lib.validation import validate_action
|
||||
|
||||
PASS = 0
|
||||
FAIL = 0
|
||||
|
||||
|
||||
def check(label: str, valid: bool, reason: str, expected_valid: bool):
    """Record and print whether a validation verdict matched the expectation.

    Increments the module-level ``PASS`` counter when ``valid`` equals
    ``expected_valid`` and ``FAIL`` otherwise, then prints one report line
    with the verdict and the reason.
    """
    global PASS, FAIL
    if valid == expected_valid:
        PASS += 1
        status = "✓"
    else:
        FAIL += 1
        status = "✗"
    print(f" {status} {label}: valid={valid}, reason=\"{reason}\"")
|
||||
|
||||
|
||||
def section(name: str):
    """Print a banner: a blank line, a 60-character rule, the name, another rule."""
    print(f"\n{'=' * 60}", f" {name}", f"{'=' * 60}", sep="\n")
|
||||
|
||||
|
||||
def main():
    """Run every validation case in order and return a process exit code.

    Cases are grouped under section banners. Each case calls
    ``validate_action`` with the action text and passes the returned values
    to ``check`` together with the expected verdict.

    Returns:
        ``0`` when the ``FAIL`` counter is zero, otherwise ``1``.
    """
    # (section title, [(check label, player action, expected verdict), ...])
    suites = [
        ("Valid actions — should pass", [
            ("Buy a drink", "I buy a mug of weak ale at the Splintered Tankard", True),
            ("Use healing salve", "I use my healing salve to restore 1 HP", True),
            ("Talk to Otta", "I ask Mistress Otta about recent news in the Keep", True),
            ("Visit the market", "I head to the Market Square to browse stalls", True),
        ]),
        ("Invalid actions — should fail", [
            ("Use non-existent item", "I drink a potion of invisibility", False),
            ("Cast a spell (not a weaver)", "I cast a fireball spell at the tavern", False),
            ("Buy impossible item", "I buy a horse for a broken copper coin", False),
            ("Assert false state", "I fly to the moon", False),
        ]),
        ("Edge cases", [
            ("Empty action", "", True),
            ("Garbled nonsense", "qwxz jabberwocky flargle bargle", False),
        ]),
    ]

    for title, cases in suites:
        section(title)
        for label, action, expected in cases:
            check(label, *validate_action(action), expected_valid=expected)

    print(f"\n{'=' * 60}")
    print(f" Results: {PASS} passed, {FAIL} failed")
    print(f"{'=' * 60}")
    return int(FAIL != 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
154
tools/test_validation.py
Normal file
154
tools/test_validation.py
Normal file
@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for engine_lib/validation.py."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
|
||||
def test_empty_action():
    """An empty action string must validate as ``(True, "")``."""
    from engine_lib.validation import validate_action

    verdict, why = validate_action("")

    assert verdict is True
    assert why == ""
    print("✓ empty action returns (True, '')")
|
||||
|
||||
|
||||
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_valid_action(mock_call_llm, mock_truncate_world, mock_read_file):
    """A ``valid: true`` LLM reply is returned as ``(True, reason)``."""
    from engine_lib.validation import validate_action

    def fake_read(path):
        # Character-sheet paths get stats; any other path gets world text.
        if "character" in str(path).lower():
            return "HP: 10\nGold: 5"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    mock_call_llm.return_value = json.dumps({"valid": True, "reason": "ok"})

    verdict, why = validate_action("I buy a drink")

    assert verdict is True
    assert why == "ok"
    mock_call_llm.assert_called_once()
    print("✓ valid action returns (True, reason)")
|
||||
|
||||
|
||||
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_invalid_action(mock_call_llm, mock_truncate_world, mock_read_file):
    """A ``valid: false`` LLM reply is returned as ``(False, reason)``."""
    from engine_lib.validation import validate_action

    def fake_read(path):
        # Character-sheet paths get stats; any other path gets world text.
        if "character" in str(path).lower():
            return "HP: 10\nGold: 0"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    mock_call_llm.return_value = json.dumps({"valid": False, "reason": "Not enough gold"})

    verdict, why = validate_action("I buy a drink")

    assert verdict is False
    assert why == "Not enough gold"
    print("✓ invalid action returns (False, reason)")
|
||||
|
||||
|
||||
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_llm_returns_none(mock_call_llm, mock_truncate_world, mock_read_file):
    """No LLM output (``None``) must reject the action with reason "Not sure"."""
    from engine_lib.validation import validate_action

    def fake_read(path):
        # Character-sheet paths get stats; any other path gets world text.
        if "character" in str(path).lower():
            return "HP: 10"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    mock_call_llm.return_value = None

    verdict, why = validate_action("I attack the dragon")

    assert verdict is False
    assert why == "Not sure"
    print("✓ LLM returning None gives (False, 'Not sure')")
|
||||
|
||||
|
||||
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_llm_returns_bad_json(mock_call_llm, mock_truncate_world, mock_read_file):
    """An unparseable LLM reply must reject the action with reason "Unrecognized"."""
    from engine_lib.validation import validate_action

    def fake_read(path):
        # Character-sheet paths get stats; any other path gets world text.
        if "character" in str(path).lower():
            return "HP: 10"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    mock_call_llm.return_value = "not valid json at all"

    verdict, why = validate_action("I cast a spell")

    assert verdict is False
    assert why == "Unrecognized"
    print("✓ bad JSON from LLM gives (False, 'Unrecognized')")
|
||||
|
||||
|
||||
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
def test_missing_character_sheet(mock_truncate_world, mock_read_file):
    """Validation still returns a verdict when the character sheet reads as empty."""
    from engine_lib.validation import validate_action

    mock_read_file.return_value = ""
    mock_truncate_world.return_value = "*No world state.*"
    canned_reply = json.dumps({"valid": True, "reason": "ok"})

    # The LLM is stubbed only for the duration of the call under test.
    with patch("engine_lib.validation.call_llm", return_value=canned_reply):
        verdict, _why = validate_action("I look around")

    assert verdict is True
    print("✓ handles missing character sheet gracefully")
|
||||
|
||||
|
||||
@patch("engine_lib.validation.state.read_file")
@patch("engine_lib.validation.state.truncate_world")
@patch("engine_lib.validation.call_llm")
def test_on_debug_called(mock_call_llm, mock_truncate_world, mock_read_file):
    """The ``on_debug`` callback receives exactly one ``action_validation`` event."""
    from engine_lib.validation import validate_action

    def fake_read(path):
        # Character-sheet paths get stats; any other path gets world text.
        if "character" in str(path).lower():
            return "HP: 10"
        return "## Location\nTavern"

    mock_read_file.side_effect = fake_read
    mock_truncate_world.return_value = "## Location\nTavern"
    mock_call_llm.return_value = json.dumps({"valid": True, "reason": "ok"})

    recorded = []

    def record(key, data):
        recorded.append((key, data))

    verdict, _why = validate_action("I open the door", on_debug=record)

    assert verdict is True
    assert len(recorded) == 1
    event_key, payload = recorded[0]
    assert event_key == "action_validation"
    assert payload["valid"] is True
    print("✓ on_debug callback receives action_validation event")
|
||||
|
||||
|
||||
def test_auto_prompt_default():
    """``auto_prompt()`` with no arguments yields the default prompt text."""
    from engine_lib.validation import auto_prompt

    prompt = auto_prompt()

    assert prompt == "**What do you do?**"
    print("✓ auto_prompt() returns default prompt")
|
||||
|
||||
|
||||
def test_auto_prompt_with_log():
    """Passing ``book_log`` to ``auto_prompt`` still yields the default prompt text."""
    from engine_lib.validation import auto_prompt

    prompt = auto_prompt(book_log="Some story text")

    assert prompt == "**What do you do?**"
    print("✓ auto_prompt() ignores book_log argument")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script mode: run each test in declaration order; the first failing
    # assertion aborts the run before the summary line is printed.
    for run_test in (
        test_empty_action,
        test_valid_action,
        test_invalid_action,
        test_llm_returns_none,
        test_llm_returns_bad_json,
        test_missing_character_sheet,
        test_on_debug_called,
        test_auto_prompt_default,
        test_auto_prompt_with_log,
    ):
        run_test()
    print("\n✓ All validation tests passed")
|
||||
Loading…
Reference in New Issue
Block a user