From ed99f67525122a46385809e49eef801ced473396 Mon Sep 17 00:00:00 2001
From: Rodribm10 <rodrigobm10@me.com>
Date: Sat, 2 May 2026 17:58:17 -0300
Subject: [PATCH] =?UTF-8?q?feat(captain/hermes):=20camada=202=20=E2=80=94?=
 =?UTF-8?q?=20gating=20de=20sa=C3=ADda=20factual=20sem=20tool=20call?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Detecta alucinação de memória: se resposta do Hermes contém info
factual (preço/senha/horário/regra/política) E o LLM NÃO chamou
nenhuma tool MCP entre dispatch e callback, bloqueia entrega + dispara
system_message forçando consulta a tool. 1 retry; persistindo, escala.

Implementação:
- McpController: incrementa Rails.cache hermes_tool_calls:<conv_id>
  em cada tools/call.
- OutgoingJob: snapshot do contador como hermes_tool_calls_baseline
  ANTES de despachar pro Hermes.
- HermesCallbackController.gate_factual_no_tool!: compara baseline vs
  current; se igual + FACTUAL_PATTERNS bate, intercepta. Patterns
  cobrem R$, %, "senha", check-in/out + horário, política de
  cancelamento, "permitido", "pode levar pet/animal".

Caso real: cliente pede senha do Wi-Fi → Hermes responde de cabeça
"é passada presencialmente" sem chamar faq_lookup → callback intercepta,
não entrega pro cliente, manda [SISTEMA: force_factual_tool] pro Hermes
com instrução de chamar faq_lookup. Se faq_lookup vier vazio → frase-
âncora handoff.

Auto-react ambient: removido filtro de "?" que barrava em prod.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../captain/hermes_callback_controller.rb     | 82 ++++++++++++++++++-
 .../webhooks/captain/mcp_controller.rb        | 15 ++++
 .../app/jobs/captain/hermes/outgoing_job.rb   | 10 +++
 .../captain/hermes/auto_react_service.rb      |  9 +-
 4 files changed, 111 insertions(+), 5 deletions(-)

diff --git a/app/controllers/webhooks/captain/hermes_callback_controller.rb b/app/controllers/webhooks/captain/hermes_callback_controller.rb
index 80785a243..770b6dc5a 100644
--- a/app/controllers/webhooks/captain/hermes_callback_controller.rb
+++ b/app/controllers/webhooks/captain/hermes_callback_controller.rb
@@ -13,7 +13,7 @@
 # de forma confiável, identificamos pela ÚLTIMA conversation pending da inbox
 # que recebeu mensagem nos últimos 5 minutos. Aceitável pra PoC com 1 conversa
 # de teste por vez. Pra produção, melhorar com Redis: delivery_id → conversation_id.
-class Webhooks::Captain::HermesCallbackController < ApplicationController
+class Webhooks::Captain::HermesCallbackController < ApplicationController # rubocop:disable Metrics/ClassLength
   RECENT_WINDOW = 5.minutes
 
   # "Um momento — vou verificar" é a frase-âncora de handoff intencional
@@ -26,6 +26,24 @@ class Webhooks::Captain::HermesCallbackController < ApplicationController
     /\A\s*aguarde\s+um\s+instante/i
   ].freeze
 
+  # Layer 2 — factual output gating:
+  # a Hermes reply matching any pattern below (price, password, rule,
+  # schedule) while NO MCP tool call was counted between dispatch and
+  # callback is treated as hallucinated from memory. The reply is blocked and
+  # a tool lookup is forced via notify_event; one retry, then human escalation.
+  FACTUAL_PATTERNS = [
+    /R\$\s*\d/i,                                                # R$ 50, R$ 125,00
+    /\b\d+\s*(reais|reai|real)\b/i,                             # 50 reais
+    /\b\d+\s*%/,                                                # 50%
+    /\bsenha\b/i,                                               # any mention of "senha" (password)
+    /\b(c[óo]digo)\s+(de|do)\s+(porta|portao|portão|garagem)\b/i, # código do portão
+    /\b(check[-]?in|check[-]?out|hor[áa]rio)\s+(é|eh|de)\s+\d/i, # check-in é 14h
+    /\b(política|politica|regra)\s+de\s+(cancelamento|no[\s-]?show|reembolso)/i, # política de cancelamento
+    /(n[ãa]o\s+(é\s+)?permitido|é\s+permitido)\s+\w{3,}/i, # não é permitido fumar
+    /(pode|podem)\s+(levar|trazer)\s+(animal|pet|cachorro|gato|crian[çc]a)/i # pode levar pet
+  ].freeze
+  MAX_FACTUAL_GATE_RETRIES = 1 # forced re-dispatches allowed before escalating to human triage
+
   # Loop detection: 2 sinais combinados.
   # 1. Jaccard de tokens >= 0.50 → resposta praticamente igual.
   # 2. >= 3 palavras-chave em comum (sem stopwords) E ambas terminam com
@@ -56,6 +74,12 @@ class Webhooks::Captain::HermesCallbackController < ApplicationController
     return log_no_conversation_and_ack if conversation.blank?
 
     log_reply(conversation, content)
+
+    # Camada 2: resposta factual sem chamada de tool durante a turn é
+    # alucinação de memória. Bloqueia entrega + re-dispara forçando tool
+    # call. NÃO entrega a msg pro cliente até o LLM consultar a fonte.
+    return head :ok if gate_factual_no_tool!(conversation, content)
+
     detect_handoff_or_loop(conversation, content)
     deliver_outgoing(conversation, content)
     head :ok
@@ -93,6 +117,62 @@ class Webhooks::Captain::HermesCallbackController < ApplicationController
     HANDOFF_PATTERNS.any? { |re| content.match?(re) }
   end
 
+  # Retorna true se bloqueou a resposta (callback deve dar 200 + sair sem
+  # entregar). Retorna false pra fluxo normal continuar.
+  def gate_factual_no_tool!(conversation, content)
+    return false unless looks_factual?(content)
+    return false if tool_called_in_this_turn?(conversation)
+
+    retry_key = "hermes_factual_gate_retry:#{conversation.id}"
+    retries = Rails.cache.read(retry_key, raw: true).to_i
+    if retries >= MAX_FACTUAL_GATE_RETRIES
+      Rails.logger.error("[Hermes::Callback] factual sem tool persistente em conv #{conversation.display_id} — escalando")
+      mark_for_human_triage(conversation, reason: 'factual_no_tool_persistente')
+      Rails.cache.delete(retry_key)
+      # Entrega o conteúdo nesse caso (melhor algo do que silêncio); humano vê pela label.
+      deliver_outgoing(conversation, content)
+      return true
+    end
+
+    Rails.cache.write(retry_key, retries + 1, expires_in: 5.minutes, raw: true)
+    Rails.logger.warn("[Hermes::Callback] factual sem tool em conv #{conversation.display_id} — re-dispatch (retry #{retries + 1})")
+    trigger_force_tool_call!(conversation, content)
+    true
+  end
+
+  def looks_factual?(content)
+    return false if content.blank?
+
+    FACTUAL_PATTERNS.any? { |re| content.match?(re) }
+  end
+
+  # Compara contador de tool calls atual com baseline gravado pelo
+  # OutgoingJob no momento do dispatch. Se subiu, o LLM chamou tool —
+  # resposta é fundamentada. Se igual, é puro palpite.
+  def tool_called_in_this_turn?(conversation)
+    baseline = Rails.cache.read("hermes_tool_calls_baseline:#{conversation.id}", raw: true).to_i
+    current = Rails.cache.read("hermes_tool_calls:#{conversation.id}", raw: true).to_i
+    current > baseline
+  end
+
+  # Tells Hermes, via a force_factual_tool system event, to redo the answer using
+  # a tool. If that dispatch fails: flags human triage and delivers the original.
+  def trigger_force_tool_call!(conversation, original_content)
+    hermes = Captain::Hermes::Client.new(@inbox)
+    instruction = '[SISTEMA: force_factual_tool] Você emitiu uma resposta com info ' \
+                  "factual ('#{original_content.truncate(120)}') SEM consultar tool. " \
+                  'Cliente NÃO recebeu essa mensagem. Releia a última pergunta do ' \
+                  'cliente e CHAME a tool relevante AGORA: faq_lookup pra regra/' \
+                  'política/Wi-Fi/horário, tabela de preços da skill pra valores. ' \
+                  'Se a tool retornar vazio, NÃO INVENTE: responda exatamente ' \
+                  "'⏳ Um momento — vou verificar.' e pare."
+    hermes.notify_event(conversation: conversation, event_type: 'force_factual_tool', system_message: instruction)
+  rescue Captain::Hermes::Client::DispatchError => e
+    Rails.logger.error("[Hermes::Callback] force_factual_tool dispatch falhou: #{e.message}")
+    mark_for_human_triage(conversation, reason: 'force_factual_dispatch_falhou')
+    deliver_outgoing(conversation, original_content)
+  end
+
   # Detecta loop: a resposta atual do Hermes é muito parecida com a anterior
   # outgoing dele na mesma conv (Jaccard de tokens >= 0.70). Sinaliza que o
   # agente está repetindo pergunta/resposta sem progredir — geralmente
diff --git a/app/controllers/webhooks/captain/mcp_controller.rb b/app/controllers/webhooks/captain/mcp_controller.rb
index a996e782f..1cec4cda9 100644
--- a/app/controllers/webhooks/captain/mcp_controller.rb
+++ b/app/controllers/webhooks/captain/mcp_controller.rb
@@ -33,6 +33,7 @@ class Webhooks::Captain::McpController < ApplicationController
       context: extract_context(request_body)
     )
 
+    track_tool_call!(request_body)
     return head :ok if response.nil? # MCP notifications
 
     render json: response
@@ -107,4 +108,18 @@ class Webhooks::Captain::McpController < ApplicationController
 
     value.to_i
   end
+
+  # Incrementa contador de tool calls por conversation. HermesCallbackController
+  # usa o snapshot pré-dispatch (gravado pelo OutgoingJob) vs valor atual pra
+  # detectar respostas factuais SEM chamada de tool (alucinação de memória).
+  def track_tool_call!(request_body)
+    return unless request_body['method'] == 'tools/call'
+
+    args = request_body.dig('params', 'arguments') || {}
+    conv_id = args['conversation_id'] || args[:conversation_id]
+    return if conv_id.blank?
+
+    Rails.cache.increment("hermes_tool_calls:#{conv_id}", 1, expires_in: 5.minutes, raw: true) ||
+      Rails.cache.write("hermes_tool_calls:#{conv_id}", 1, expires_in: 5.minutes, raw: true)
+  end
 end
diff --git a/enterprise/app/jobs/captain/hermes/outgoing_job.rb b/enterprise/app/jobs/captain/hermes/outgoing_job.rb
index adbe190a6..aadb97604 100644
--- a/enterprise/app/jobs/captain/hermes/outgoing_job.rb
+++ b/enterprise/app/jobs/captain/hermes/outgoing_job.rb
@@ -35,6 +35,8 @@ class Captain::Hermes::OutgoingJob < ApplicationJob
     # texto agrupado pra Hermes ver o pensamento completo do cliente.
     combined = combined_incoming_content(conversation, message)
 
+    snapshot_tool_call_baseline(conversation)
+
     Captain::Hermes::Client.new(conversation.inbox).dispatch(
       message: message, conversation: conversation, content_override: combined
     )
@@ -42,6 +44,14 @@ class Captain::Hermes::OutgoingJob < ApplicationJob
 
   private
 
+  # Salva o contador atual de tool calls da conv ANTES do dispatch.
+  # HermesCallbackController compara contra valor pós-callback pra detectar
+  # respostas factuais sem chamada de tool (alucinação de memória).
+  def snapshot_tool_call_baseline(conversation)
+    current = Rails.cache.read("hermes_tool_calls:#{conversation.id}", raw: true).to_i
+    Rails.cache.write("hermes_tool_calls_baseline:#{conversation.id}", current, expires_in: 5.minutes, raw: true)
+  end
+
   # Concatena texto de todas as msgs incoming entre a última resposta real
   # (não-reaction) do agente e a msg âncora. Retorna nil se só tem 1 msg
   # (pra dispatch usar message.content normal).
diff --git a/enterprise/app/services/captain/hermes/auto_react_service.rb b/enterprise/app/services/captain/hermes/auto_react_service.rb
index 9fb44b58e..3231a2677 100644
--- a/enterprise/app/services/captain/hermes/auto_react_service.rb
+++ b/enterprise/app/services/captain/hermes/auto_react_service.rb
@@ -88,12 +88,13 @@ class Captain::Hermes::AutoReactService
   end
 
   # Mensagens "neutras" elegíveis pra reação ambiente: nem curtas demais
-  # (provavelmente saudação que já pega regex), nem longas (geralmente
-  # narrativa que pede atenção), sem ?, sem termos de fluxo de reserva
-  # (preço/cpf/data — cliente está esperando ação, não emoji).
+  # (provavelmente saudação que já pega regex), nem longas (narrativa
+  # pede atenção), sem termos de fluxo de reserva crítico (preço/cpf/data
+  # — cliente está esperando ação, não emoji). Perguntas comuns
+  # (com "?") TAMBÉM são elegíveis: WhatsApp de motel é majoritariamente
+  # interrogativo; se filtrar "?" o ambient nunca dispara em prod.
   def ambient_eligible?(text)
     return false if text.length < 6 || text.length > 180
-    return false if text.include?('?')
     return false if text.match?(AMBIENT_RESERVATION_KEYWORDS)
     return false if text.match?(/\A\d/)