chatwoot-develop/app/services/jasmine/semantic_search_service.rb

module Jasmine
  # Finds the document chunks most similar to a query for a single inbox.
  #
  # Collections linked to the inbox are grouped by their exact priority and
  # searched from the highest priority downwards ("waterfall"): results of a
  # higher-priority group are always placed before those of a lower one, and
  # the search stops as soon as the requested number of results is reached.
  class SemanticSearchService
    # Maximum number of rows fetched from the database per priority group.
    CANDIDATES_PER_PRIORITY = 50
    # Maximum number of chunks a single priority group may contribute.
    TOP_K_PER_PRIORITY = 10
    # Maximum number of chunks sharing a document_id within one priority group.
    MAX_CHUNKS_PER_DOC = 2

    # @param inbox [Object] record responding to `account_id` and `inbox_collections`
    def initialize(inbox)
      @inbox = inbox
      @account_id = inbox.account_id
      # Chunks whose distance to the query exceeds this value are discarded.
      @threshold = ENV.fetch('JASMINE_DISTANCE_THRESHOLD', '0.35').to_f
    end

    # Runs the waterfall search.
    #
    # @param query [String] free-text query to embed and search with
    # @param limit [Integer] maximum number of chunks to return
    # @return [Array] matching chunks, highest-priority group first and, inside
    #   a group, in the order returned by the distance-sorted query
    def search(query, limit: 10)
      # Nothing can be returned for a non-positive limit, and embedding a blank
      # query would only spend an embedding API call.
      return [] if limit <= 0 || query.to_s.strip.empty?

      enabled_links = @inbox.inbox_collections
                            .where(is_enabled: true)
                            .order(priority: :desc)
                            .includes(:collection)

      # Grouping loads the relation once; checking the resulting hash avoids
      # the separate existence query that `relation.empty?` would issue.
      priority_groups = enabled_links.group_by(&:priority)
      return [] if priority_groups.empty?

      query_embedding = generate_embedding(query)

      final_results = []
      seen_chunk_ids = Set.new

      priority_groups.keys.sort.reverse_each do |priority|
        collection_ids = priority_groups[priority].map(&:collection_id)
        candidates = retrieve_candidates(query_embedding, collection_ids)

        process_candidates(candidates).each do |chunk|
          # Set#add? returns nil when the id was already present.
          next unless seen_chunk_ids.add?(chunk.id)

          final_results << chunk
          break if final_results.size >= limit
        end

        break if final_results.size >= limit
      end

      final_results
    end

    private

    # Fetches the nearest chunks of the given collections, ordered by cosine
    # distance (pgvector `<=>`), exposing that distance as a `distance` column.
    #
    # @param query_embedding [Array<Numeric>] embedding of the query
    # @param collection_ids [Array] ids of the collections to search in
    # @return [ActiveRecord::Relation] at most CANDIDATES_PER_PRIORITY chunks
    def retrieve_candidates(query_embedding, collection_ids)
      # to_pg_vector emits only Float-formatted components, so the literal is
      # safe to interpolate into the SQL fragment.
      vector_literal = to_pg_vector(query_embedding)

      Jasmine::DocumentChunk
        .where(collection_id: collection_ids)
        .select("jasmine_document_chunks.*, (embedding <=> '#{vector_literal}') as distance")
        .order('distance ASC')
        .limit(CANDIDATES_PER_PRIORITY)
    end

    # Applies the distance threshold and the per-document cap to one priority
    # group's candidates.
    #
    # @param candidates [Enumerable] chunks exposing `[:distance]` and
    #   `document_id`, expected in ascending distance order
    # @return [Array] at most TOP_K_PER_PRIORITY chunks, input order preserved
    def process_candidates(candidates)
      doc_counts = Hash.new(0)
      kept = []

      candidates.each do |chunk|
        distance = chunk[:distance]
        # A NULL distance means the comparison could not be computed (e.g. the
        # chunk has no embedding); `nil.to_f` would be 0.0 and wrongly pass.
        next if distance.nil? || distance.to_f > @threshold
        next if doc_counts[chunk.document_id] >= MAX_CHUNKS_PER_DOC

        doc_counts[chunk.document_id] += 1
        kept << chunk
        break if kept.size >= TOP_K_PER_PRIORITY
      end

      kept
    end

    # Embeds the text with the model named by JASMINE_EMBEDDING_MODEL.
    #
    # @param text [String]
    # @return [Array<Float>] the embedding vector
    def generate_embedding(text)
      model = ENV.fetch('JASMINE_EMBEDDING_MODEL', 'text-embedding-3-small')
      vectors = RubyLLM.embed(text, model: model).vectors

      # Accept both result shapes: a flat vector, or a list holding one vector.
      vectors.first.is_a?(Array) ? vectors.first : vectors
    end

    # Serialises a vector into the bracketed literal pgvector parses, e.g.
    # "[1.0,2.5]".
    #
    # @param vector [Array<Numeric>]
    # @return [String]
    # @raise [ArgumentError, TypeError] when a component is not numeric
    def to_pg_vector(vector)
      "[#{Array(vector).map { |component| Float(component) }.join(',')}]"
    end
  end
end