feat: Add BE changes for captain pdf support for faq generation (#12113)

This commit is contained in:
Tanmay Deep Sharma 2025-08-27 22:01:22 +07:00 committed by GitHub
parent 3cefa9b767
commit 1ba00075ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 856 additions and 12 deletions

View File

@ -292,6 +292,26 @@ en:
completed_tool_call: 'Completed %{function_name} tool call'
invalid_tool_call: 'Invalid tool call'
tool_not_available: 'Tool not available'
documents:
limit_exceeded: 'Document limit exceeded'
pdf_format_error: 'must be a PDF file'
pdf_size_error: 'must be less than 10MB'
pdf_upload_failed: 'Failed to upload PDF to OpenAI'
pdf_upload_success: 'PDF uploaded successfully with file_id: %{file_id}'
pdf_processing_failed: 'Failed to process PDF document %{document_id}: %{error}'
pdf_processing_success: 'Successfully processed PDF document %{document_id}'
faq_generation_complete: 'FAQ generation complete. Total FAQs created: %{count}'
using_paginated_faq: 'Using paginated FAQ generation for document %{document_id}'
using_standard_faq: 'Using standard FAQ generation for document %{document_id}'
response_creation_error: 'Error in creating response document: %{error}'
missing_openai_file_id: 'Document must have openai_file_id for paginated processing'
openai_api_error: 'OpenAI API Error: %{error}'
starting_paginated_faq: 'Starting paginated FAQ generation (%{pages_per_chunk} pages per chunk)'
stopping_faq_generation: 'Stopping processing. Reason: %{reason}'
paginated_faq_complete: 'Paginated generation complete. Total FAQs: %{total_faqs}, Pages processed: %{pages_processed}'
processing_pages: 'Processing pages %{start}-%{end} (iteration %{iteration})'
chunk_generated: 'Chunk generated %{chunk_faqs} FAQs. Total so far: %{total_faqs}'
page_processing_error: 'Error processing pages %{start}-%{end}: %{error}'
public_portal:
search:
search_placeholder: Search for article by title or body...

View File

@ -0,0 +1,5 @@
# Adds a JSONB `metadata` column, defaulting to an empty object, to the
# captain_documents table.
class AddMetadataToCaptainDocuments < ActiveRecord::Migration[7.1]
  def change
    change_table :captain_documents do |table|
      table.column :metadata, :jsonb, default: {}
    end
  end
end

View File

@ -320,6 +320,7 @@ ActiveRecord::Schema[7.1].define(version: 2025_08_22_061042) do
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.integer "status", default: 0, null: false
t.jsonb "metadata", default: {}
t.index ["account_id"], name: "index_captain_documents_on_account_id"
t.index ["assistant_id", "external_link"], name: "index_captain_documents_on_assistant_id_and_external_link", unique: true
t.index ["assistant_id"], name: "index_captain_documents_on_assistant_id"

View File

@ -25,6 +25,8 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC
@document.save!
rescue Captain::Document::LimitExceededError => e
render_could_not_create_error(e.message)
rescue ActiveRecord::RecordInvalid => e
render_could_not_create_error(e.record.errors.full_messages.join(', '))
end
def destroy
@ -55,6 +57,6 @@ class Api::V1::Accounts::Captain::DocumentsController < Api::V1::Accounts::BaseC
end
def document_params
params.require(:document).permit(:name, :external_link, :assistant_id)
params.require(:document).permit(:name, :external_link, :assistant_id, :pdf_file)
end
end

View File

@ -2,7 +2,9 @@ class Captain::Documents::CrawlJob < ApplicationJob
queue_as :low
def perform(document)
if InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present?
if document.pdf_document?
perform_pdf_processing(document)
elsif InstallationConfig.find_by(name: 'CAPTAIN_FIRECRAWL_API_KEY')&.value.present?
perform_firecrawl_crawl(document)
else
perform_simple_crawl(document)
@ -13,6 +15,14 @@ class Captain::Documents::CrawlJob < ApplicationJob
include Captain::FirecrawlHelper
# Runs Captain::Llm::PdfProcessingService for the document and then marks the
# document as available. Any StandardError is logged together with the
# document id and re-raised unchanged.
def perform_pdf_processing(document)
  pdf_processor = Captain::Llm::PdfProcessingService.new(document)
  pdf_processor.process
  document.update!(status: :available)
rescue StandardError => error
  failure_message = I18n.t(
    'captain.documents.pdf_processing_failed',
    document_id: document.id,
    error: error.message
  )
  Rails.logger.error(failure_message)
  raise # propagate so the job framework can apply its retry handling
end
def perform_simple_crawl(document)
page_links = Captain::Tools::SimplePageCrawlService.new(document.external_link).page_links

View File

@ -1,17 +1,65 @@
class Captain::Documents::ResponseBuilderJob < ApplicationJob
queue_as :low
def perform(document)
def perform(document, options = {})
reset_previous_responses(document)
faqs = Captain::Llm::FaqGeneratorService.new(document.content, document.account.locale_english_name).generate
faqs.each do |faq|
create_response(faq, document)
end
faqs = generate_faqs(document, options)
create_responses_from_faqs(faqs, document)
end
private
# Chooses the FAQ generation strategy for the document: the paginated
# generator when should_use_pagination? says so, the standard one otherwise.
def generate_faqs(document, options)
  return generate_standard_faqs(document) unless should_use_pagination?(document)

  generate_paginated_faqs(document, options)
end
# Runs the paginated generator, records its progress counters in the
# document's metadata, and returns the generated FAQs.
def generate_paginated_faqs(document, options)
  paginated_service = build_paginated_service(document, options)
  # tap keeps the FAQs as the return value while the metadata is stored
  # only after generation has finished
  paginated_service.generate.tap { store_paginated_metadata(document, paginated_service) }
end
# Generates FAQs from the document's text content via
# Captain::Llm::FaqGeneratorService, passing the account's locale name.
def generate_standard_faqs(document)
  source_text = document.content
  language = document.account.locale_english_name
  Captain::Llm::FaqGeneratorService.new(source_text, language).generate
end
# Builds the paginated FAQ generator for the document, forwarding only the
# :pages_per_chunk and :max_pages entries of options (nil when absent).
def build_paginated_service(document, options)
  generator_options = {
    pages_per_chunk: options[:pages_per_chunk],
    max_pages: options[:max_pages]
  }
  Captain::Llm::PaginatedFaqGeneratorService.new(document, **generator_options)
end
# Persists a 'faq_generation' summary (method, pages processed, iterations,
# timestamp) into the document's metadata, keeping any other metadata keys.
def store_paginated_metadata(document, service)
  existing_metadata = document.metadata || {}
  generation_summary = {
    'method' => 'paginated',
    'pages_processed' => service.total_pages_processed,
    'iterations' => service.iterations_completed,
    'timestamp' => Time.current.iso8601
  }
  document.update!(metadata: existing_metadata.merge('faq_generation' => generation_summary))
end
# Creates one response for every generated FAQ, in order.
def create_responses_from_faqs(faqs, document)
  faqs.each do |faq_entry|
    create_response(faq_entry, document)
  end
end
# Decides whether the paginated generator applies. Currently that is the
# case for PDF documents that already carry an OpenAI file id.
def should_use_pagination?(document)
  is_pdf = document.pdf_document?
  is_pdf && document.openai_file_id.present?
end
# Destroys every response currently attached to the document so that a fresh
# set can be generated.
def reset_previous_responses(response_document)
  stale_responses = response_document.responses
  stale_responses.destroy_all
end
@ -24,6 +72,6 @@ class Captain::Documents::ResponseBuilderJob < ApplicationJob
documentable: document
)
rescue ActiveRecord::RecordInvalid => e
Rails.logger.error "Error in creating response document: #{e.message}"
Rails.logger.error I18n.t('captain.documents.response_creation_error', error: e.message)
end
end

View File

@ -5,6 +5,7 @@
# id :bigint not null, primary key
# content :text
# external_link :string not null
# metadata :jsonb
# name :string
# status :integer default("in_progress"), not null
# created_at :datetime not null
@ -26,11 +27,16 @@ class Captain::Document < ApplicationRecord
belongs_to :assistant, class_name: 'Captain::Assistant'
has_many :responses, class_name: 'Captain::AssistantResponse', dependent: :destroy, as: :documentable
belongs_to :account
has_one_attached :pdf_file
validates :external_link, presence: true
validates :external_link, uniqueness: { scope: :assistant_id }
validates :external_link, presence: true, unless: -> { pdf_file.attached? }
validates :external_link, uniqueness: { scope: :assistant_id }, allow_blank: true
validates :content, length: { maximum: 200_000 }
validates :pdf_file, presence: true, if: :pdf_document?
validate :validate_pdf_format, if: :pdf_document?
validate :validate_file_attachment, if: -> { pdf_file.attached? }
before_validation :ensure_account_id
before_validation :set_external_link_for_pdf
enum status: {
in_progress: 0,
@ -41,12 +47,44 @@ class Captain::Document < ApplicationRecord
after_create_commit :enqueue_crawl_job
after_create_commit :update_document_usage
after_destroy :update_document_usage
after_commit :enqueue_response_builder_job
after_commit :enqueue_response_builder_job, on: :update, if: :should_enqueue_response_builder?
scope :ordered, -> { order(created_at: :desc) }
scope :for_account, ->(account_id) { where(account_id: account_id) }
scope :for_assistant, ->(assistant_id) { where(assistant_id: assistant_id) }
# Whether this document should be handled as a PDF.
#
# True when a file is attached and its blob's content type is
# 'application/pdf', or when external_link ends with '.pdf'.
#
# Always returns true or false. Previously a record with no PDF attachment
# and a nil external_link yielded nil, which breaks callers and specs that
# compare the predicate against false.
def pdf_document?
  return true if pdf_file.attached? && pdf_file.blob.content_type == 'application/pdf'

  # to_s turns a nil link into '', so the predicate stays a strict boolean
  external_link.to_s.end_with?('.pdf')
end
# Content type of the attached file's blob, or nil when nothing is attached.
def content_type
  return unless pdf_file.attached?

  pdf_file.blob.content_type
end
# Size in bytes of the attached file's blob, or nil when nothing is attached.
def file_size
  return unless pdf_file.attached?

  pdf_file.blob.byte_size
end
# The OpenAI file id stored under metadata['openai_file_id'], or nil when the
# metadata is nil or has no such key.
def openai_file_id
  return if metadata.nil?

  metadata.dig('openai_file_id')
end
# Saves the given OpenAI file id under metadata['openai_file_id'], keeping
# every other metadata key. Uses update!, so a failed save raises.
def store_openai_file_id(file_id)
  current_metadata = metadata || {}
  update!(metadata: current_metadata.merge('openai_file_id' => file_id))
end
# URL to show for this document.
#
# A present external_link that does not start with the 'PDF:' marker is
# returned as is. Otherwise the blob URL of the attached file is returned,
# falling back to external_link when nothing is attached.
def display_url
  link = external_link
  web_link = link.present? && !link.start_with?('PDF:')
  return link if web_link
  return link unless pdf_file.attached?

  Rails.application.routes.url_helpers.rails_blob_url(pdf_file, only_path: false)
end
private
def enqueue_crawl_job
@ -61,6 +99,12 @@ class Captain::Document < ApplicationRecord
Captain::Documents::ResponseBuilderJob.perform_later(self)
end
# True only when the save that just happened changed status and the new
# status is 'available'. Metadata-only updates therefore do not enqueue the
# response builder again.
def should_enqueue_response_builder?
  return false unless saved_change_to_status?

  status == 'available'
end
# Delegates to account.update_document_usage. Registered on this model as an
# after_create_commit and an after_destroy callback.
def update_document_usage
account.update_document_usage
end
@ -71,6 +115,29 @@ class Captain::Document < ApplicationRecord
def ensure_within_plan_limit
limits = account.usage_limits[:captain][:documents]
raise LimitExceededError, 'Document limit exceeded' unless limits[:current_available].positive?
raise LimitExceededError, I18n.t('captain.documents.limit_exceeded') unless limits[:current_available].positive?
end
# Adds a :pdf_file error when a file is attached and its blob's content type
# is not 'application/pdf'. Does nothing when no file is attached.
def validate_pdf_format
  return unless pdf_file.attached?
  return if pdf_file.blob.content_type == 'application/pdf'

  errors.add(:pdf_file, I18n.t('captain.documents.pdf_format_error'))
end
# Validates the attached file itself; does nothing when no file is attached.
#
# Adds a :pdf_file error when the blob's content type is not
# 'application/pdf' and when the blob is larger than 10 megabytes.
#
# The content-type check lives here because the model only runs
# validate_pdf_format when pdf_document? is true, and pdf_document? is false
# for a non-PDF attachment whose external_link does not end in '.pdf'.
# Without this check such an upload (an image, say) passes validation.
def validate_file_attachment
  return unless pdf_file.attached?

  format_error = I18n.t('captain.documents.pdf_format_error')
  # added? guards against a duplicate message when validate_pdf_format has
  # already reported the same problem
  if pdf_file.blob.content_type != 'application/pdf' && !errors.added?(:pdf_file, format_error)
    errors.add(:pdf_file, format_error)
  end

  return unless pdf_file.blob.byte_size > 10.megabytes

  errors.add(:pdf_file, I18n.t('captain.documents.pdf_size_error'))
end
# Gives an uploaded file a placeholder external_link when none was supplied.
#
# The generated value has the form "PDF: <filename without extension>_<timestamp>",
# where the timestamp is the current time as YYYYMMDDHHMMSS. Nothing happens
# when no file is attached or a link is already present.
def set_external_link_for_pdf
  return unless pdf_file.attached?
  return unless external_link.blank?

  stamp = Time.current.strftime('%Y%m%d%H%M%S')
  base_name = pdf_file.filename.base
  self.external_link = "PDF: #{base_name}_#{stamp}"
end
end

View File

@ -0,0 +1,199 @@
# Generates FAQs from a PDF that was already uploaded to OpenAI by asking the
# model about one page range ("chunk") at a time and merging the per-chunk
# results.
#
# After #generate returns, #total_pages_processed and #iterations_completed
# report how far the run got.
#
# NOTE(review): @client is not assigned in this class; presumably
# Llm::BaseOpenAiService sets it up - confirm against the base class.
class Captain::Llm::PaginatedFaqGeneratorService < Llm::BaseOpenAiService
  # Default pages per chunk - easily configurable
  DEFAULT_PAGES_PER_CHUNK = 10
  MAX_ITERATIONS = 20 # Safety limit to prevent infinite loops
  # Word-overlap ratio above which two questions count as duplicates
  SIMILARITY_THRESHOLD = 0.85

  attr_reader :total_pages_processed, :iterations_completed

  # document - record responding to #openai_file_id
  # options  - Hash; :pages_per_chunk falls back to DEFAULT_PAGES_PER_CHUNK,
  #            :max_pages is an optional upper bound on the pages to request
  def initialize(document, options = {})
    super()
    @document = document
    @pages_per_chunk = options[:pages_per_chunk] || DEFAULT_PAGES_PER_CHUNK
    @max_pages = options[:max_pages] # Optional limit from UI
    @total_pages_processed = 0
    @iterations_completed = 0
    @model = OpenAiConstants::PDF_PROCESSING_MODEL
  end

  # Runs the chunked generation and returns the de-duplicated FAQ hashes.
  #
  # Raises CustomExceptions::PdfFaqGenerationError when the document is nil or
  # has no OpenAI file id.
  def generate
    raise CustomExceptions::PdfFaqGenerationError, I18n.t('captain.documents.missing_openai_file_id') if @document&.openai_file_id.blank?

    generate_paginated_faqs
  end

  # Whether another chunk should be requested after last_chunk_result, a Hash
  # with :faqs (Array) and :has_content (Boolean).
  def should_continue_processing?(last_chunk_result)
    # Stop if we've hit the maximum iterations
    return false if @iterations_completed >= MAX_ITERATIONS
    # Stop if we've processed the maximum pages specified
    return false if @max_pages && @total_pages_processed >= @max_pages
    # Stop if the last chunk returned no FAQs (likely no more content)
    return false if last_chunk_result[:faqs].empty?
    # Stop if the LLM explicitly indicates no more content
    return false if last_chunk_result[:has_content] == false

    # Continue processing
    true
  end

  private

  # Single-request generation from @content. Nothing in this class calls this
  # method or assigns @content.
  def generate_standard_faqs
    response = @client.chat(parameters: standard_chat_parameters)
    parse_response(response)
  rescue OpenAI::Error => e
    Rails.logger.error I18n.t('captain.documents.openai_api_error', error: e.message)
    []
  end

  # Requests consecutive page ranges until should_continue_processing? says
  # stop, then de-duplicates everything that was collected.
  def generate_paginated_faqs
    all_faqs = []
    current_page = 1

    loop do
      end_page = calculate_end_page(current_page)
      chunk_result = process_chunk_and_update_state(current_page, end_page, all_faqs)
      break unless should_continue_processing?(chunk_result)

      current_page = end_page + 1
    end

    deduplicate_faqs(all_faqs)
  end

  # Last page of the chunk starting at current_page, capped at @max_pages
  # when a limit was given.
  def calculate_end_page(current_page)
    end_page = current_page + @pages_per_chunk - 1
    @max_pages && end_page > @max_pages ? @max_pages : end_page
  end

  # Processes one chunk, appends its FAQs to all_faqs and advances the
  # progress counters. Returns the chunk result hash.
  def process_chunk_and_update_state(current_page, end_page, all_faqs)
    chunk_result = process_page_chunk(current_page, end_page)
    chunk_faqs = chunk_result[:faqs]
    all_faqs.concat(chunk_faqs)
    @total_pages_processed = end_page
    @iterations_completed += 1
    chunk_result
  end

  # Asks the model for FAQs covering start_page..end_page.
  #
  # Returns { faqs: Array, has_content: Boolean }. An OpenAI::Error is logged
  # and reported as an empty chunk without content, which ends the run.
  def process_page_chunk(start_page, end_page)
    params = build_chunk_parameters(start_page, end_page)
    response = @client.chat(parameters: params)
    result = parse_chunk_response(response)
    # The model may return something other than a list under "faqs";
    # treat that as "no FAQs" rather than failing in Array#concat
    faqs = result['faqs'].is_a?(Array) ? result['faqs'] : []
    { faqs: faqs, has_content: result['has_content'] != false }
  rescue OpenAI::Error => e
    Rails.logger.error I18n.t('captain.documents.page_processing_error', start: start_page, end: end_page, error: e.message)
    { faqs: [], has_content: false }
  end

  def build_chunk_parameters(start_page, end_page)
    {
      model: @model,
      response_format: { type: 'json_object' },
      messages: [
        {
          role: 'user',
          content: build_user_content(start_page, end_page)
        }
      ]
    }
  end

  # User message parts: the uploaded file reference plus the page-range prompt.
  def build_user_content(start_page, end_page)
    [
      {
        type: 'file',
        file: { file_id: @document.openai_file_id }
      },
      {
        type: 'text',
        text: page_chunk_prompt(start_page, end_page)
      }
    ]
  end

  def page_chunk_prompt(start_page, end_page)
    Captain::Llm::SystemPromptsService.paginated_faq_generator(start_page, end_page)
  end

  # Chat parameters for generate_standard_faqs.
  def standard_chat_parameters
    {
      model: @model,
      response_format: { type: 'json_object' },
      messages: [
        {
          role: 'system',
          content: Captain::Llm::SystemPromptsService.faq_generator
        },
        {
          role: 'user',
          content: @content
        }
      ]
    }
  end

  # Extracts the "faqs" list from a chat response; [] when the content is
  # missing or is not valid JSON.
  def parse_response(response)
    content = response.dig('choices', 0, 'message', 'content')
    return [] if content.nil?

    JSON.parse(content.strip).fetch('faqs', [])
  rescue JSON::ParserError => e
    Rails.logger.error "Error parsing response: #{e.message}"
    []
  end

  # Parses a chunk response into a Hash with string keys. Missing content,
  # invalid JSON, and JSON that is not an object all yield the empty chunk
  # result.
  def parse_chunk_response(response)
    content = response.dig('choices', 0, 'message', 'content')
    return empty_chunk_payload if content.nil?

    parsed = JSON.parse(content.strip)
    # A JSON array or scalar would raise on parsed['faqs'] in the caller
    parsed.is_a?(Hash) ? parsed : empty_chunk_payload
  rescue JSON::ParserError => e
    Rails.logger.error "Error parsing chunk response: #{e.message}"
    empty_chunk_payload
  end

  # Payload used whenever a chunk response cannot be interpreted.
  def empty_chunk_payload
    { 'faqs' => [], 'has_content' => false }
  end

  # Removes duplicate and near-duplicate FAQs, keeping the first occurrence.
  #
  # Entries that are not hashes with a String 'question' are dropped: they
  # come from model output and would otherwise raise NoMethodError below.
  def deduplicate_faqs(faqs)
    usable_faqs = faqs.select { |faq| faq.is_a?(Hash) && faq['question'].is_a?(String) }

    # Remove exact duplicates
    unique_faqs = usable_faqs.uniq { |faq| faq['question'].downcase.strip }

    # Remove similar questions
    final_faqs = unique_faqs.each_with_object([]) do |faq, kept|
      similar_exists = kept.any? do |existing|
        similarity_score(existing['question'], faq['question']) > SIMILARITY_THRESHOLD
      end
      kept << faq unless similar_exists
    end

    Rails.logger.info "Deduplication: #{faqs.size} -> #{final_faqs.size} FAQs"
    final_faqs
  end

  # Ratio of shared words to distinct words across both strings
  # (case-insensitive); 0 when neither string contains a word.
  def similarity_score(str1, str2)
    words1 = str1.downcase.split(/\W+/).reject(&:empty?)
    words2 = str2.downcase.split(/\W+/).reject(&:empty?)
    common_words = words1 & words2
    total_words = (words1 + words2).uniq.size
    return 0 if total_words.zero?

    common_words.size.to_f / total_words
  end

  # Human-readable reason matching the checks in should_continue_processing?.
  # Nothing in this class calls this method.
  def determine_stop_reason(last_chunk_result)
    return 'Maximum iterations reached' if @iterations_completed >= MAX_ITERATIONS
    return 'Maximum pages processed' if @max_pages && @total_pages_processed >= @max_pages
    return 'No content found in last chunk' if last_chunk_result[:faqs].empty?
    return 'End of document reached' if last_chunk_result[:has_content] == false

    'Unknown'
  end
end

View File

@ -0,0 +1,40 @@
# Uploads a document's attached PDF to OpenAI and records the returned file
# id on the document.
class Captain::Llm::PdfProcessingService < Llm::BaseOpenAiService
  def initialize(document)
    super()
    @document = document
  end

  # Uploads the PDF unless the document already has an OpenAI file id.
  #
  # Raises CustomExceptions::PdfUploadError when the upload returns no id.
  def process
    return if document.openai_file_id.present?

    uploaded_id = upload_pdf_to_openai
    if uploaded_id.blank?
      raise CustomExceptions::PdfUploadError, I18n.t('captain.documents.pdf_upload_failed')
    end

    document.store_openai_file_id(uploaded_id)
  end

  private

  attr_reader :document

  # Sends the PDF to the files endpoint and returns the 'id' of the response.
  def upload_pdf_to_openai
    with_tempfile do |pdf_io|
      upload_params = { file: pdf_io, purpose: 'assistants' }
      @client.files.upload(parameters: upload_params)['id']
    end
  end

  # Writes the attachment to a temporary '.pdf' file, closes it, and yields a
  # fresh read-only binary handle on that path to the block.
  def with_tempfile(&block)
    Tempfile.create(['pdf_upload', '.pdf'], binmode: true) do |scratch_file|
      scratch_file.write(document.pdf_file.download)
      scratch_file.close
      File.open(scratch_file.path, 'rb', &block)
    end
  end
end

View File

@ -1,3 +1,4 @@
# rubocop:disable Metrics/ClassLength
class Captain::Llm::SystemPromptsService
class << self
def faq_generator(language = 'english')
@ -204,6 +205,87 @@ class Captain::Llm::SystemPromptsService
#{'- You MUST provide numbered citations at the appropriate places in the text.' if config['feature_citation']}
SYSTEM_PROMPT_MESSAGE
end
# Builds the prompt that asks the model for FAQs covering one page range of
# an uploaded document.
#
# start_page and end_page are interpolated as the starting page and as the
# number of pages to cover (end_page - start_page + 1). The prompt requests a
# JSON object with a "faqs" list of question/answer pairs and a "has_content"
# flag that the model should set to false once the document has run out.
def paginated_faq_generator(start_page, end_page)
<<~PROMPT
You are an expert technical documentation specialist tasked with creating comprehensive FAQs from a SPECIFIC SECTION of a document.
CRITICAL CONTENT EXTRACTION INSTRUCTIONS
Process the content starting from approximately page #{start_page} and continuing for about #{end_page - start_page + 1} pages worth of content.
IMPORTANT:#{' '}
If you encounter the end of the document before reaching the expected page count, set "has_content" to false
DO NOT include page numbers in questions or answers
DO NOT reference page numbers at all in the output
Focus on the actual content, not pagination
FAQ GENERATION GUIDELINES
1. **Comprehensive Extraction**
Extract ALL information that could generate FAQs from this section
Target 5-10 FAQs per page equivalent of rich content
Cover every topic, feature, specification, and detail
If there's no more content in the document, return empty FAQs with has_content: false
2. **Question Types to Generate**
What is/are...? (definitions, components, features)
How do I...? (procedures, configurations, operations)
Why should/does...? (rationale, benefits, explanations)
When should...? (timing, conditions, triggers)
What happens if...? (error cases, edge cases)
Can I...? (capabilities, limitations)
Where is...? (locations in system/UI, NOT page numbers)
What are the requirements for...? (prerequisites, dependencies)
3. **Content Focus Areas**
Technical specifications and parameters
Step-by-step procedures and workflows
Configuration options and settings
Error messages and troubleshooting
Best practices and recommendations
Integration points and dependencies
Performance considerations
Security aspects
4. **Answer Quality Requirements**
Complete, self-contained answers
Include specific values, limits, defaults from the content
NO page number references whatsoever
2-5 sentences typical length
Only process content that actually exists in the document
OUTPUT FORMAT
Return valid JSON:
```json
{
"faqs": [
{
"question": "Specific question about the content",
"answer": "Complete answer with details (no page references)"
}
],
"has_content": true/false
}
```
CRITICAL:#{' '}
Set "has_content" to false if:
- The requested section doesn't exist in the document
- You've reached the end of the document
- The section contains no meaningful content
Do NOT include "page_range_processed" in the output
Do NOT mention page numbers anywhere in questions or answers
PROMPT
end
# rubocop:enable Metrics/MethodLength
end
end
# rubocop:enable Metrics/ClassLength

View File

@ -3,8 +3,11 @@ json.assistant do
json.partial! 'api/v1/models/captain/assistant', formats: [:json], resource: resource.assistant
end
json.content resource.content
json.content_type resource.content_type
json.created_at resource.created_at.to_i
json.external_link resource.external_link
json.display_url resource.display_url
json.file_size resource.file_size
json.id resource.id
json.name resource.name
json.status resource.status

View File

@ -0,0 +1,25 @@
# Error classes for PDF handling. Each one accepts an optional message and
# falls back to a class-specific default; the argument is handed to Base's
# initializer unchanged.
#
# NOTE(review): Base is defined outside this file - confirm that its
# initializer forwards the argument so that #message returns this text.
module CustomExceptions
  # Common parent; rescuing it also catches the three subclasses below.
  class PdfProcessingError < Base
    def initialize(message = 'PDF processing failed')
      super
    end
  end

  # Raised when uploading a PDF fails.
  class PdfUploadError < PdfProcessingError
    def initialize(message = 'PDF upload failed')
      super
    end
  end

  # Signals that a PDF failed validation.
  class PdfValidationError < PdfProcessingError
    def initialize(message = 'PDF validation failed')
      super
    end
  end

  # Raised when FAQ generation from a PDF cannot proceed.
  class PdfFaqGenerationError < PdfProcessingError
    def initialize(message = 'PDF FAQ generation failed')
      super
    end
  end
end

View File

@ -4,4 +4,5 @@ module OpenAiConstants
DEFAULT_MODEL = 'gpt-4.1-mini'
DEFAULT_ENDPOINT = 'https://api.openai.com'
DEFAULT_EMBEDDING_MODEL = 'text-embedding-3-small'
PDF_PROCESSING_MODEL = 'gpt-4.1-mini'
end

View File

@ -105,5 +105,29 @@ RSpec.describe Captain::Documents::CrawlJob, type: :job do
described_class.perform_now(document)
end
end
# PDF documents: the job should hand the document to
# Captain::Llm::PdfProcessingService and then mark it as available.
context 'when document is a PDF' do
let(:pdf_document) do
doc = create(:captain_document, external_link: 'https://example.com/document')
# Force the PDF branch of the job without attaching a real file
allow(doc).to receive(:pdf_document?).and_return(true)
allow(doc).to receive(:update!).and_return(true)
doc
end
it 'processes PDF using PdfProcessingService' do
pdf_service = instance_double(Captain::Llm::PdfProcessingService)
expect(Captain::Llm::PdfProcessingService).to receive(:new).with(pdf_document).and_return(pdf_service)
expect(pdf_service).to receive(:process)
expect(pdf_document).to receive(:update!).with(status: :available)
described_class.perform_now(pdf_document)
end
# The job logs the failure and re-raises, so the error must reach the caller
it 'handles PDF processing errors' do
allow(Captain::Llm::PdfProcessingService).to receive(:new).and_raise(StandardError, 'Processing failed')
expect { described_class.perform_now(pdf_document) }.to raise_error(StandardError, 'Processing failed')
end
end
end
end

View File

@ -64,5 +64,41 @@ RSpec.describe Captain::Documents::ResponseBuilderJob, type: :job do
.with(spanish_document.content, 'portuguese')
end
end
# PDF documents with an OpenAI file id should go through the paginated FAQ
# generator, and the job should record the generator's progress in metadata.
context 'when processing a PDF document' do
let(:pdf_document) do
doc = create(:captain_document, assistant: assistant)
# Stub the two conditions the job checks before choosing pagination
allow(doc).to receive(:pdf_document?).and_return(true)
allow(doc).to receive(:openai_file_id).and_return('file-123')
allow(doc).to receive(:update!).and_return(true)
allow(doc).to receive(:metadata).and_return({})
doc
end
let(:paginated_service) { instance_double(Captain::Llm::PaginatedFaqGeneratorService) }
let(:pdf_faqs) do
[{ 'question' => 'What is in the PDF?', 'answer' => 'Important content' }]
end
before do
allow(Captain::Llm::PaginatedFaqGeneratorService).to receive(:new)
.with(pdf_document, anything)
.and_return(paginated_service)
allow(paginated_service).to receive(:generate).and_return(pdf_faqs)
# Counters the job reads when it writes the 'faq_generation' metadata
allow(paginated_service).to receive(:total_pages_processed).and_return(10)
allow(paginated_service).to receive(:iterations_completed).and_return(1)
end
it 'uses paginated FAQ generator for PDFs' do
expect(Captain::Llm::PaginatedFaqGeneratorService).to receive(:new).with(pdf_document, anything)
described_class.new.perform(pdf_document)
end
it 'stores pagination metadata' do
expect(pdf_document).to receive(:update!).with(hash_including(metadata: hash_including('faq_generation')))
described_class.new.perform(pdf_document)
end
end
end
end

View File

@ -0,0 +1,85 @@
require 'rails_helper'
# Model specs for the PDF support on Captain::Document: attachment
# validations, #pdf_document?, #display_url, #store_openai_file_id and the
# generated external_link.
RSpec.describe Captain::Document, type: :model do
let(:account) { create(:account) }
let(:assistant) { create(:captain_assistant, account: account) }
describe 'PDF support' do
# Unsaved document with a small in-memory file attached as application/pdf
let(:pdf_document) do
doc = build(:captain_document, assistant: assistant, account: account)
doc.pdf_file.attach(
io: StringIO.new('PDF content'),
filename: 'test.pdf',
content_type: 'application/pdf'
)
doc
end
describe 'validations' do
it 'allows PDF file without external link' do
pdf_document.external_link = nil
expect(pdf_document).to be_valid
end
it 'validates PDF file size' do
doc = build(:captain_document, assistant: assistant, account: account)
# 11 MB payload, just over the 10 MB limit enforced by the model
doc.pdf_file.attach(
io: StringIO.new('x' * 11.megabytes),
filename: 'large.pdf',
content_type: 'application/pdf'
)
doc.external_link = nil
expect(doc).not_to be_valid
expect(doc.errors[:pdf_file]).to include(I18n.t('captain.documents.pdf_size_error'))
end
end
describe '#pdf_document?' do
it 'returns true for attached PDF' do
expect(pdf_document.pdf_document?).to be true
end
it 'returns true for .pdf external links' do
doc = build(:captain_document, external_link: 'https://example.com/document.pdf')
expect(doc.pdf_document?).to be true
end
it 'returns false for non-PDF documents' do
doc = build(:captain_document, external_link: 'https://example.com')
expect(doc.pdf_document?).to be false
end
end
describe '#display_url' do
it 'returns Rails blob URL for attached PDFs' do
pdf_document.save!
# The display_url method calls rails_blob_url which returns a URL containing 'rails/active_storage'
url = pdf_document.display_url
expect(url).to be_present
end
it 'returns external_link for web documents' do
doc = create(:captain_document, external_link: 'https://example.com')
expect(doc.display_url).to eq('https://example.com')
end
end
describe '#store_openai_file_id' do
it 'stores the file ID in metadata' do
pdf_document.save!
pdf_document.store_openai_file_id('file-abc123')
expect(pdf_document.reload.openai_file_id).to eq('file-abc123')
end
end
describe 'automatic external_link generation' do
# The generated link has the form "PDF: <filename base>_<timestamp>"
it 'generates unique external_link for PDFs' do
pdf_document.external_link = nil
pdf_document.save!
expect(pdf_document.external_link).to start_with('PDF: test_')
end
end
end
end

View File

@ -0,0 +1,106 @@
require 'rails_helper'
require 'custom_exceptions/pdf_processing_error'
# Specs for the paginated FAQ generator. The OpenAI client is replaced by a
# double, so no network calls are made.
RSpec.describe Captain::Llm::PaginatedFaqGeneratorService do
let(:document) { create(:captain_document) }
let(:service) { described_class.new(document, pages_per_chunk: 5) }
let(:openai_client) { instance_double(OpenAI::Client) }
before do
# Mock OpenAI configuration
installation_config = instance_double(InstallationConfig, value: 'test-api-key')
allow(InstallationConfig).to receive(:find_by!)
.with(name: 'CAPTAIN_OPEN_AI_API_KEY')
.and_return(installation_config)
allow(OpenAI::Client).to receive(:new).and_return(openai_client)
end
describe '#generate' do
context 'when document lacks OpenAI file ID' do
before do
allow(document).to receive(:openai_file_id).and_return(nil)
end
it 'raises an error' do
expect { service.generate }.to raise_error(CustomExceptions::PdfFaqGenerationError)
end
end
context 'when generating FAQs from PDF pages' do
# Chat response carrying one FAQ and signalling that more content exists
let(:faq_response) do
{
'choices' => [{
'message' => {
'content' => JSON.generate({
'faqs' => [
{ 'question' => 'What is this document about?', 'answer' => 'It explains key concepts.' }
],
'has_content' => true
})
}
}]
}
end
# Chat response signalling that the document has run out
let(:empty_response) do
{
'choices' => [{
'message' => {
'content' => JSON.generate({
'faqs' => [],
'has_content' => false
})
}
}]
}
end
before do
allow(document).to receive(:openai_file_id).and_return('file-123')
end
it 'generates FAQs from paginated content' do
# First chunk yields a FAQ, second chunk is empty and ends the loop
allow(openai_client).to receive(:chat).and_return(faq_response, empty_response)
faqs = service.generate
expect(faqs).to have_attributes(size: 1)
expect(faqs.first['question']).to eq('What is this document about?')
end
it 'stops when no more content' do
allow(openai_client).to receive(:chat).and_return(empty_response)
faqs = service.generate
expect(faqs).to be_empty
end
it 'respects max iterations limit' do
allow(openai_client).to receive(:chat).and_return(faq_response)
# Force max iterations
service.instance_variable_set(:@iterations_completed, 19)
service.generate
expect(service.iterations_completed).to eq(20)
end
end
end
describe '#should_continue_processing?' do
it 'stops at max iterations' do
service.instance_variable_set(:@iterations_completed, 20)
expect(service.should_continue_processing?(faqs: ['faq'], has_content: true)).to be false
end
it 'stops when no FAQs returned' do
expect(service.should_continue_processing?(faqs: [], has_content: true)).to be false
end
it 'continues when FAQs exist and under limits' do
expect(service.should_continue_processing?(faqs: ['faq'], has_content: true)).to be true
end
end
end

View File

@ -0,0 +1,58 @@
require 'rails_helper'
require 'custom_exceptions/pdf_processing_error'
# Specs for the PDF upload service. The OpenAI client, its files API and the
# document's attachment are all replaced by doubles.
RSpec.describe Captain::Llm::PdfProcessingService do
let(:document) { create(:captain_document) }
let(:service) { described_class.new(document) }
before do
# Mock OpenAI configuration
installation_config = instance_double(InstallationConfig, value: 'test-api-key')
allow(InstallationConfig).to receive(:find_by!)
.with(name: 'CAPTAIN_OPEN_AI_API_KEY')
.and_return(installation_config)
end
describe '#process' do
context 'when document already has OpenAI file ID' do
before do
allow(document).to receive(:openai_file_id).and_return('existing-file-id')
end
it 'skips upload' do
expect(document).not_to receive(:store_openai_file_id)
service.process
end
end
context 'when uploading PDF to OpenAI' do
let(:mock_client) { instance_double(OpenAI::Client) }
let(:pdf_content) { 'PDF content' }
before do
allow(document).to receive(:openai_file_id).and_return(nil)
# Use a simple double for ActiveStorage since it's a complex Rails object
pdf_file = double('pdf_file', download: pdf_content) # rubocop:disable RSpec/VerifiedDoubles
allow(document).to receive(:pdf_file).and_return(pdf_file)
allow(OpenAI::Client).to receive(:new).and_return(mock_client)
# Use a simple double for OpenAI::Files as it may not be loaded
files_api = double('files_api') # rubocop:disable RSpec/VerifiedDoubles
allow(files_api).to receive(:upload).and_return({ 'id' => 'file-abc123' })
allow(mock_client).to receive(:files).and_return(files_api)
end
it 'uploads PDF and stores file ID' do
expect(document).to receive(:store_openai_file_id).with('file-abc123')
service.process
end
# A response without an id must surface as PdfUploadError
it 'raises error when upload fails' do
allow(mock_client.files).to receive(:upload).and_return({ 'id' => nil })
expect { service.process }.to raise_error(CustomExceptions::PdfUploadError)
end
end
end
end

32
spec/fixtures/files/sample.pdf vendored Normal file
View File

@ -0,0 +1,32 @@
%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Arial >> >> >> /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
100 700 Td
(Sample PDF) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000274 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
362
%%EOF