fix(captain): use metadata for openai_file_id and add pdf-reader gem

This commit is contained in:
Rodrigo Borba 2026-01-05 08:07:48 -03:00
parent a229b0a0f1
commit 4b4feb915d
2 changed files with 18 additions and 4 deletions

View File

@@ -25,6 +25,7 @@ GIT
GEM
remote: https://rubygems.org/
specs:
Ascii85 (2.0.1)
actioncable (7.1.5.2)
actionpack (= 7.1.5.2)
activesupport (= 7.1.5.2)
@@ -126,6 +127,7 @@ GEM
jbuilder (~> 2)
rails (>= 4.2, < 7.2)
selectize-rails (~> 0.6)
afm (1.0.0)
ai-agents (0.7.0)
ruby_llm (~> 1.8.2)
annotaterb (4.20.0)
@@ -429,6 +431,7 @@ GEM
hana (1.3.7)
hash_diff (1.1.1)
hashdiff (1.1.0)
hashery (2.1.2)
hashie (5.0.0)
html2text (0.4.0)
nokogiri (>= 1.0, < 2.0)
@@ -654,6 +657,12 @@ GEM
parser (3.3.8.0)
ast (~> 2.4.1)
racc
pdf-reader (2.15.1)
Ascii85 (>= 1.0, < 3.0, != 2.0.0)
afm (>= 0.2.1, < 2)
hashery (~> 2.0)
ruby-rc4
ttfunk
pg (1.5.3)
pg_search (2.3.6)
activerecord (>= 5.2)
@@ -813,6 +822,7 @@ GEM
faraday (>= 1)
faraday-multipart (>= 1)
ruby-progressbar (1.13.0)
ruby-rc4 (0.1.5)
ruby-saml (1.18.1)
nokogiri (>= 1.13.10)
rexml
@@ -945,6 +955,8 @@ GEM
i18n
timeout (0.4.3)
trailblazer-option (0.1.2)
ttfunk (1.8.0)
bigdecimal (~> 3.1)
twilio-ruby (7.6.0)
faraday (>= 0.9, < 3.0)
jwt (>= 1.5, < 3.0)
@@ -1097,6 +1109,7 @@ DEPENDENCIES
opensearch-ruby
opentelemetry-exporter-otlp
opentelemetry-sdk
pdf-reader
pg
pg_search
pgvector
@@ -1164,4 +1177,4 @@ RUBY VERSION
ruby 3.4.4p34
BUNDLED WITH
4.0.3
2.5.11

View File

@@ -29,8 +29,9 @@ class Captain::Llm::PdfProcessingService < Llm::LegacyBaseOpenAiService
if content.present?
Rails.logger.info "PDF extracted content for document #{document.id} (chars=#{content.length})"
# Update content and ensure openai_file_id is nil to force standard FAQ generation
document.update!(content: content, openai_file_id: nil)
# Update content and clear openai_file_id in metadata to force standard FAQ generation.
metadata = (document.metadata || {}).merge('openai_file_id' => nil)
document.update!(content: content, metadata: metadata)
else
Rails.logger.warn "PDF extracted content is empty for document #{document.id}"
end