LLM retries and validations

2026-06-28 18:08:58 +02:00 · 2026-06-28 18:08:58 +02:00 · 5b93040f73
commit 5b93040f73
parent a7e6d5540f
2 changed files with 242 additions and 135 deletions
--- a/tools/engine.py
+++ b/tools/engine.py
@@ -804,19 +804,25 @@ class GameEngine:
        elif last_prompt:
            self._append_llm_log(f"Resume from: {last_prompt[:120]}")

-        # ── Phase 1: Prose ────────────────────────────────────────────────
+        # ── Outer loop: Phase 1 (prose) → Phase 2 (summarize) → Phase 3 (extract) ──
        import random
        die_roll = random.randint(1, 6)
        self._append_llm_log(f"Dice: {die_roll} (1d6)")

+        book_log = None
+        changes_block = ""
+        log_entry = None
+        user_prompt = self._auto_prompt("")
+        ambience = None
+        debug_info = ""
+
+        for outer_attempt in range(3):
+            # ── Phase 1: Prose ────────────────────────────────────────────
            if on_action:
                on_action(f"Phase 1/3: writing story (dice={die_roll})")
            if on_debug:
-            on_debug("phase", {"phase": 1, "name": "prose", "status": "start", "dice": die_roll})
+                on_debug("phase", {"phase": 1, "name": "prose", "status": "start", "dice": die_roll, "outer_attempt": outer_attempt + 1})

-        book_log = None
-        changes_block = ""
-        for attempt in range(3):
            system = PROSE_PROMPT.substitute(
                character=self._read_file(CHAR_PATH) or "*No character sheet.*",
                world=self._truncate_world(self._read_file(WORLD_PATH) or "") or "*No world state.*",
@@ -832,14 +838,14 @@ class GameEngine:
            text = self._call_llm([
                {"role": "system", "content": system},
                {"role": "user", "content": user},
-            ], label=f"Prose attempt {attempt + 1}", max_tokens=1024, on_debug=on_debug)
+            ], label=f"Prose attempt {outer_attempt + 1}", max_tokens=1024, on_debug=on_debug)

            if not text or not text.strip():
                if on_debug:
-                    on_debug("phase", {"phase": 1, "status": "empty", "attempt": attempt + 1})
+                    on_debug("phase", {"phase": 1, "status": "empty", "attempt": outer_attempt + 1})
                continue
+
            raw = text.strip()
-            # Split narrative from ### Changes block
            changes_block = ""
            if "### Changes" in raw:
                parts = raw.split("### Changes", 1)
@@ -850,12 +856,18 @@ class GameEngine:
            if on_debug:
                preview = book_log[:150].replace("\n", "\\n")
                on_debug("phase", {"phase": 1, "status": "done", "chars": len(book_log), "changes": bool(changes_block), "preview": preview})
-            break

-        if not book_log:
-            return TurnResult(error="Prose generation failed after 3 attempts")
+            # ── Validation ────────────────────────────────────────────────
+            if on_debug:
+                on_debug("phase", {"phase": 1, "name": "validation", "status": "start"})
+            valid, reason = self._validate_narrative(book_log, on_debug=on_debug)
+            if not valid:
+                if on_debug:
+                    on_debug("phase", {"phase": 1, "status": "validation_failed", "reason": reason, "outer_attempt": outer_attempt + 1})
+                book_log = None
+                continue

-        # ── Phase 2: Summarize ────────────────────────────────────────────
+            # ── Phase 2: Summarize ────────────────────────────────────────
            if on_action:
                on_action("Phase 2/3: summarizing story")
            if on_debug:
@@ -863,7 +875,7 @@ class GameEngine:

            log_context = self._read_recent_log()
            log_entry = None
-        for attempt in range(2):
+            for p2_attempt in range(2):
                context = book_log
                if changes_block:
                    context += f"\n\n{changes_block}"
@@ -873,9 +885,9 @@ class GameEngine:
                        f"Focus on who was involved (character and NPC names):\n\n"
                        f"## Session Log\n{log_context}\n\n"
                        f"## New Story\n{context}"}
-            ], label=f"Summarize attempt {attempt + 1}", on_debug=on_debug)
+                ], label=f"Summarize attempt {p2_attempt + 1}", on_debug=on_debug)
                if text and text.strip():
-                log_entry = text.strip().split("\n")[0][:120]
+                    log_entry = text.strip().split("\n")[0][:300]
                    if on_debug:
                        on_debug("phase", {"phase": 2, "status": "done", "summary": log_entry})
                    break
@@ -885,7 +897,7 @@ class GameEngine:
                if on_debug:
                    on_debug("phase", {"phase": 2, "status": "fallback", "summary": log_entry})

-        # ── Phase 3: Extract state changes ────────────────────────────────
+            # ── Phase 3: Extract state changes ────────────────────────────
            if on_action:
                on_action("Phase 3/3: extracting state changes")
            if on_debug:
@@ -893,11 +905,14 @@ class GameEngine:

            user_prompt = self._auto_prompt(book_log)
            ambience = None
-        debug_info = ""
+            phase3_errors = []
+
+            previous_attempt = None  # {output, feedback}
+            phase3_ok = False
+            for p3_attempt in range(5):
                current_char = self._read_file(CHAR_PATH) or "*No character.*"
                current_world = self._truncate_world(self._read_file(WORLD_PATH) or "") or "*No world.*"

-        for attempt in range(3):
                phase3_prompt = (
                    f"## Current Character\n{current_char}\n\n"
                    f"## Current World\n{current_world}\n\n"
@@ -915,6 +930,16 @@ class GameEngine:
                phase3_prompt += (
                    f"Output ```tool blocks for changes only. Examples:\n\n"
                )
+
+                if previous_attempt:
+                    phase3_prompt += (
+                        f"--- PREVIOUS ATTEMPT (had errors) ---\n"
+                        f"{previous_attempt['output']}\n\n"
+                        f"--- FEEDBACK ---\n"
+                        f"{previous_attempt['feedback']}\n\n"
+                        f"Fix the issues above. Output corrected tool calls only.\n\n"
+                    )
+
                text = self._call_llm([
                    {"role": "user", "content": phase3_prompt +
                        f"```tool\n{{\"tool\": \"modify_vitals\", \"args\": {{\"current_hp\": 5, \"cash\": 45}}}}\n```\n"
@@ -928,15 +953,15 @@ class GameEngine:
                        f"```tool\n{{\"tool\": \"journal_update\", \"args\": {{\"add\": [\"Investigate the mine\"], \"done\": [\"Defeat the demon\"]}}}}\n```\n"
                        f"```tool\n{{\"tool\": \"finalize_turn\", \"args\": {{\"user_prompt\": \"What do you do?\", \"ambience\": \"dungeon\"}}}}\n```\n\n"
                        f"Only output tools for things that actually changed. Omit unchanged fields."}
-            ], label=f"Extract attempt {attempt + 1}", on_debug=on_debug)
+                ], label=f"Extract attempt {p3_attempt + 1}", on_debug=on_debug)

                if not text or not text.strip():
                    if on_debug:
-                    on_debug("phase", {"phase": 3, "status": "empty", "attempt": attempt + 1})
+                        on_debug("phase", {"phase": 3, "status": "empty", "attempt": p3_attempt + 1})
                    continue

                tool_calls = self._extract_tool_calls(
-                text, round_num=attempt + 1, on_debug=on_debug
+                    text, round_num=p3_attempt + 1, on_debug=on_debug
                )
                if on_debug and tool_calls:
                    names = [tc.get("tool", "?") for tc in tool_calls if tc.get("tool") != "finalize_turn"]
@@ -956,7 +981,7 @@ class GameEngine:
                    if on_action:
                        on_action(f"State: {self._describe_tool_action(name, args)}")
                    if on_debug:
-                    on_debug("tool_call", {"round": attempt + 1, "tool": name, "args": args})
+                        on_debug("tool_call", {"round": p3_attempt + 1, "tool": name, "args": args})

                    if name == "player_roll" and on_player_roll:
                        dice = args.get("dice", "1d6")
@@ -969,18 +994,41 @@ class GameEngine:
                    if result.startswith("**Error:") or result.startswith("Tool error") or result.startswith("Unknown"):
                        errors.append(f"{name}: {result}")
                    if on_debug:
-                    on_debug("tool_result", {"round": attempt + 1, "tool": name, "result": result})
+                        on_debug("tool_result", {"round": p3_attempt + 1, "tool": name, "result": result})

                if not errors:
+                    phase3_ok = True
+                    debug_info = ""
                    if on_debug:
                        on_debug("phase", {"phase": 3, "status": "done", "applied": len([tc for tc in tool_calls if tc.get("tool") != "finalize_turn"])})
                    break
+
+                phase3_errors = errors
                debug_info = "; ".join(errors)
                if on_debug:
-                on_debug("phase", {"phase": 3, "status": "errors", "errors": errors, "attempt": attempt + 1})
+                    on_debug("phase", {"phase": 3, "status": "errors", "errors": errors, "attempt": p3_attempt + 1})

-        if errors and on_debug:
-            on_debug("phase", {"phase": 3, "status": "exhausted", "errors": errors})
+                # Build feedback for the LLM to fix on next attempt
+                feedback_lines = ["The previous tool calls had errors:"]
+                for e in errors:
+                    feedback_lines.append(f"- {e}")
+                feedback_lines.append("")
+                feedback_lines.append("Fix ALL issues above. Use correct tool names, valid JSON, and reasonable values.")
+                previous_attempt = {"output": text, "feedback": "\n".join(feedback_lines)}
+
+            if phase3_ok:
+                break  # All phases succeeded on this outer attempt
+
+            # Phase 3 failed after 5 attempts — retry from Phase 1
+            if on_debug:
+                on_debug("phase", {"phase": 3, "status": "exhausted", "errors": phase3_errors})
+                on_debug("phase", {"phase": 1, "status": "retry_after_phase3_failure", "outer_attempt": outer_attempt + 1})
+            book_log = None  # Reset so Phase 1 runs again on next outer iteration
+
+        if not book_log:
+            return TurnResult(error="Generation failed after exhausting all retries")
+
+        # ── Finalize ──────────────────────────────────────────────────────
        if on_action:
            on_action("Turn complete")
        if on_debug:
@@ -1022,6 +1070,59 @@ class GameEngine:
        """Fallback player prompt."""
        return "**What do you do?**"

+    def _validate_narrative(self, book_log: str, *, on_debug: callable = None) -> tuple[bool, str]:
+        """Check if book_log is acceptable narrative. Returns (ok, reason)."""
+        lines = book_log.strip().split("\n")
+        if not lines:
+            return False, "Empty narrative"
+
+        # 1) Heuristic: high repetition count
+        from collections import Counter
+        common = Counter(lines).most_common(1)
+        if common and common[0][1] >= 5:
+            return False, f"Repetition: '{common[0][0][:60]}' ×{common[0][1]}"
+
+        # 2) Heuristic: game mechanics bleedthrough
+        mech_lines = [l for l in lines if re.match(
+            r'^\*\*(?:Roll|Damage|Success|Failure|Check|Save|Hit|Miss|'
+            r'Strenght|Dexterity|Willpower|STR|DEX|WIL|'
+            r'(?:[A-Z][a-z]+(?: \(\w+\))?:))',
+            l
+        )]
+        if mech_lines:
+            ratio = len(mech_lines) / len(lines)
+            if ratio > 0.3:
+                return False, f"Game mechanics dominate ({len(mech_lines)}/{len(lines)} lines)"
+
+        # 3) Heuristic: tool / json blocks leaked into narrative
+        if re.search(r'```(?:tool|json)', book_log):
+            return False, "Contains unprocessed tool blocks"
+
+        # 4) Heuristic: under 50 characters of real prose
+        prose = re.sub(r'[*_#>`~\-\d]', '', book_log).strip()
+        if len(prose) < 50:
+            return False, "Too short to be meaningful"
+
+        # 5) LLM quality rating (only if heuristics pass)
+        text = self._call_llm([
+            {"role": "user", "content":
+                f"Rate this RPG narrative quality 1-5.\n"
+                f"1 = unreadable (spam, repetition, pure mechanics, garbled)\n"
+                f"2 = poor (mostly mechanics, little story)\n"
+                f"3 = acceptable (some narrative but rough)\n"
+                f"4 = good (solid prose, minor issues)\n"
+                f"5 = excellent (vivid, engaging)\n"
+                f"Reply with ONLY a single digit 1-5.\n\n"
+                f"{book_log[:600]}"}
+        ], label="Narrative validation", max_tokens=2, on_debug=on_debug)
+
+        if text and text.strip().isdigit():
+            score = int(text.strip())
+            if score < 3:
+                return False, f"Quality score: {score}/5"
+
+        return True, ""
+
    # ── Response Parsing ────────────────────────────────────────────────

    @staticmethod
--- a/tools/run.py
+++ b/tools/run.py
@@ -841,8 +841,10 @@ class ChaosTUI(App):
            if status == "start":
                name = data.get("name", "")
                dice = data.get("dice")
+                outer = data.get("outer_attempt")
                d = f"  dice={dice}" if dice else ""
-                self._append_debug(f"▸ Phase {p}: {name} {d}")
+                o = f" [attempt {outer}/3]" if outer else ""
+                self._append_debug(f"▸ Phase {p}: {name}{o} {d}")
            elif status == "done":
                if p == 1:
                    self._append_debug(f"  ✔ prose: {data.get('chars', 0)} chars")
@@ -870,6 +872,10 @@ class ChaosTUI(App):
                self._append_debug(f"  ✖ Phase 3 exhausted all retries — state changes may be missing!")
                for e in errs:
                    self._append_debug(f"    {e}")
+            elif status == "retry_after_phase3_failure":
+                self._append_debug(f"  ⟳ Phase 3 failed — retrying from Phase 1 (attempt {data.get('outer_attempt', '?')}/3)")
+            elif status == "validation_failed":
+                self._append_debug(f"  ✖ narrative rejected: {data.get('reason', '?')} (attempt {data.get('outer_attempt', '?')}/3)")
        elif event_type == "phase_done":
            self._append_debug(f"  ✔ turn complete — book_log: {data.get('book_log_chars', 0)} chars")
            if data.get("log_entry"):