chatwoot-develop/app/services/jasmine/semantic_search_service.rb
2026-01-20 13:16:32 -03:00

179 lines
6.1 KiB
Ruby

module Jasmine
  # Priority "waterfall" semantic search over the document chunks of the
  # collections enabled for an inbox.
  #
  # Enabled collections are grouped by their exact priority. Groups are visited
  # from the highest priority to the lowest; each group contributes its best
  # chunks until the requested number of results has been collected.
  class SemanticSearchService
    # Maximum number of nearest chunks fetched from the database per priority group.
    CANDIDATES_PER_PRIORITY = 50
    # Maximum number of chunks one priority group may contribute.
    TOP_K_PER_PRIORITY = 10
    # Maximum number of chunks of the same document kept within one priority group.
    MAX_CHUNKS_PER_DOC = 2

    # @param inbox [Object] must respond to +account_id+ and +inbox_collections+
    def initialize(inbox)
      @inbox = inbox
      @account_id = inbox.account_id
      # Chunks whose distance to the query exceeds this value are discarded.
      @threshold = ENV.fetch('JASMINE_DISTANCE_THRESHOLD', '0.35').to_f
    end

    # Runs the waterfall search.
    #
    # @param query [String] free-text query; a blank query yields no results
    # @param limit [Integer] maximum number of chunks to return
    # @return [Array] matching chunks, highest-priority group first and, within
    #   a group, in the order returned by the distance-sorted query
    def search(query, limit: 10)
      # Avoid a pointless embedding request, and never return more than `limit`
      # rows (a non-positive limit used to leak one result).
      return [] if query.to_s.strip.empty? || !limit.positive?

      priority_groups = enabled_collection_ids_by_priority
      return [] if priority_groups.empty?

      query_embedding = generate_embedding(query)
      results = []
      seen_chunk_ids = Set.new

      priority_groups.each do |collection_ids|
        candidates = retrieve_candidates(query_embedding, collection_ids)

        process_candidates(candidates).each do |chunk|
          # Set#add? returns nil when the id was already present.
          next unless seen_chunk_ids.add?(chunk.id)

          results << chunk
          return results if results.size >= limit
        end
      end

      results
    end

    private

    # Collection ids of the enabled inbox links, one array per distinct
    # priority, ordered from the highest priority to the lowest.
    #
    # The links are loaded once and sorted in Ruby; only +priority+ and
    # +collection_id+ are read, so the collections are not eager-loaded.
    def enabled_collection_ids_by_priority
      @inbox.inbox_collections
            .where(is_enabled: true)
            .to_a
            .group_by(&:priority)
            .sort_by { |priority, _links| priority }
            .reverse
            .map { |_priority, links| links.map(&:collection_id) }
    end

    # Fetches the nearest chunks of the given collections, exposing the
    # distance to the query as the +distance+ column so it can be compared
    # against the threshold afterwards.
    #
    # @param query_embedding [Array<Numeric>] embedding of the query
    # @param collection_ids [Array] collections that belong to one priority group
    # @return relation of at most CANDIDATES_PER_PRIORITY chunks, nearest first
    def retrieve_candidates(query_embedding, collection_ids)
      # Safe to interpolate: to_pg_vector only ever emits numeric components.
      vector_literal = to_pg_vector(query_embedding)

      Jasmine::DocumentChunk
        .where(collection_id: collection_ids)
        .select("jasmine_document_chunks.*, (embedding <=> '#{vector_literal}') as distance")
        .order('distance ASC')
        .limit(CANDIDATES_PER_PRIORITY)
    end

    # Applies the distance threshold, the per-document cap and the per-group
    # top-K to candidates that are already sorted by distance.
    #
    # @param candidates [Enumerable] chunks responding to +[:distance]+ and +document_id+
    # @return [Array] at most TOP_K_PER_PRIORITY chunks
    def process_candidates(candidates)
      kept = []
      chunks_per_document = Hash.new(0)

      candidates.each do |chunk|
        distance = chunk[:distance]
        # A NULL distance (chunk without an embedding) must not be read as 0.0,
        # which would make it look like a perfect match.
        next if distance.nil? || distance.to_f > @threshold
        next if chunks_per_document[chunk.document_id] >= MAX_CHUNKS_PER_DOC

        chunks_per_document[chunk.document_id] += 1
        kept << chunk
        break if kept.size >= TOP_K_PER_PRIORITY
      end

      kept
    end

    # Embeds the text with the model named by JASMINE_EMBEDDING_MODEL.
    #
    # @param text [String]
    # @return [Array<Numeric>] a single embedding vector
    def generate_embedding(text)
      model = ENV.fetch('JASMINE_EMBEDDING_MODEL', 'text-embedding-3-small')
      vectors = RubyLLM.embed(text, model: model).vectors
      # Accept both shapes: a flat vector for a single input, or a list of
      # vectors of which the first belongs to our text.
      vectors.first.is_a?(Array) ? vectors.first : vectors
    end

    # Serialises a vector as a bracketed, comma-separated literal, e.g. "[1.0,2.0]".
    #
    # Every component is coerced with Float(), so anything non-numeric raises
    # instead of reaching the SQL string.
    #
    # @param vector [Array<Numeric>]
    # @return [String]
    # @raise [ArgumentError, TypeError] when a component is not numeric
    def to_pg_vector(vector)
      "[#{Array(vector).map { |component| Float(component) }.join(',')}]"
    end
  end
end