From 978ccbbdfb7ba1a1ec9723736b9f6fd3a21e0db6 Mon Sep 17 00:00:00 2001
From: Rodribm10 <rodrigobm10@me.com>
Date: Sun, 19 Apr 2026 11:40:59 -0300
Subject: [PATCH] fix(captain): wrap runner.run in Timeout to guard HTTP hangs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Incident observed on 2026-04-19 at 14:34: ResponseBuilderJob sat in
Sidekiq's 'Performing' state for 156s without ever emitting
"[Captain V2] Agent result", while the client waited on WhatsApp. The
runner.run() call never returned — presumably an HTTP hang on the LLM
side (a slow OpenAI response, a network flake, or a retry storm inside
ruby-llm).

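The existing protections can't fire during such a hang:
tool_loop_detected? only inspects the result after run() returns, and
max_turns caps the number of turns, not the duration of a single one.
Wrapping the run() call in a 45s hard timeout ensures we bail out,
trigger bot_handoff, and respond to the client instead of hanging
forever.

Note: Timeout.timeout interrupts the block by raising an exception in
the running thread, so this is a last-resort guard; open/read timeouts
on the LLM HTTP client would be the cleaner long-term fix.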

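Timeout::Error is rescued separately, ahead of the generic
StandardError handler (it is a StandardError subclass, so the order
matters), so that the log message is specific and the user-facing
message says "demorou mais do que o esperado" ("took longer than
expected").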

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../captain/assistant/agent_runner_service.rb        | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/enterprise/app/services/captain/assistant/agent_runner_service.rb b/enterprise/app/services/captain/assistant/agent_runner_service.rb
index 9928f4a05..f20efa69c 100644
--- a/enterprise/app/services/captain/assistant/agent_runner_service.rb
+++ b/enterprise/app/services/captain/assistant/agent_runner_service.rb
@@ -44,6 +44,7 @@ class Captain::Assistant::AgentRunnerService
   MAX_TURNS_PER_MESSAGE = 15           # Cap inside a single run() call
   MAX_TURNS_PER_CONVERSATION = 30      # Cap across the whole conversation lifetime
   TOOL_LOOP_THRESHOLD = 3              # Same (tool_name, args) invoked N+ times = loop
+  RUNNER_TIMEOUT_SECS = 45             # Kill runner.run if LLM/HTTP hangs past this
 
   # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
   def generate_response(message_history: [])
@@ -59,9 +60,12 @@ class Captain::Assistant::AgentRunnerService
     runner = add_callbacks_to_runner(runner) if @callbacks.any?
     install_instrumentation(runner)
     # max_turns is the hard safety cap: each "turn" = one LLM call + optional tool calls.
-    # 100 allowed runaway loops (LLM calling faq_lookup indefinitely when confused).
     # MAX_TURNS_PER_MESSAGE is plenty for normal flows while keeping a burn-budget ceiling.
-    result = runner.run(message_to_process, context: context, max_turns: MAX_TURNS_PER_MESSAGE)
+    # Timeout guards against HTTP hangs on the LLM side (OpenAI slow / network flake):
+    # without it, the job hangs indefinitely and no post-hoc loop detection ever fires.
+    result = Timeout.timeout(RUNNER_TIMEOUT_SECS) do
+      runner.run(message_to_process, context: context, max_turns: MAX_TURNS_PER_MESSAGE)
+    end
 
     if tool_loop_detected?(result)
       Rails.logger.error("[Captain V2] Tool loop detected on conv #{@conversation&.id}. Triggering bot_handoff.")
@@ -71,6 +75,10 @@ class Captain::Assistant::AgentRunnerService
 
     increment_conversation_turn_count!
     process_agent_result(result, original_query: message_to_process)
+  rescue Timeout::Error
+    Rails.logger.error("[Captain V2] runner.run timed out after #{RUNNER_TIMEOUT_SECS}s on conv #{@conversation&.id}. Triggering bot_handoff.")
+    trigger_bot_handoff!
+    bot_handoff_response('A IA demorou mais do que o esperado. Transferindo para atendimento humano.')
   rescue StandardError => e
     # when running the agent runner service in a rake task, the conversation might not have an account associated
     # for regular production usage, it will run just fine