From b97013bb1871225bd50f8e8a73e2810d24d5235d Mon Sep 17 00:00:00 2001 From: Ahmed EBEN HASSINE Date: Wed, 15 Oct 2025 13:48:07 +0200 Subject: [PATCH 1/9] Add PostgresHybridStore for hybrid search with RRF Combines pgvector semantic search with PostgreSQL Full-Text Search using Reciprocal Rank Fusion (RRF), following Supabase approach. Features: - Configurable semantic/keyword ratio (0.0 to 1.0) - RRF fusion with customizable k parameter - Multilingual FTS support (default: 'simple') - Optional relevance filtering with defaultMaxScore - All pgvector distance metrics supported --- .../Bridge/Postgres/PostgresHybridStore.php | 348 ++++++++++++ .../Postgres/PostgresHybridStoreTest.php | 499 ++++++++++++++++++ 2 files changed, 847 insertions(+) create mode 100644 src/store/src/Bridge/Postgres/PostgresHybridStore.php create mode 100644 src/store/tests/Bridge/Postgres/PostgresHybridStoreTest.php diff --git a/src/store/src/Bridge/Postgres/PostgresHybridStore.php b/src/store/src/Bridge/Postgres/PostgresHybridStore.php new file mode 100644 index 000000000..1886e9683 --- /dev/null +++ b/src/store/src/Bridge/Postgres/PostgresHybridStore.php @@ -0,0 +1,348 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Bridge\Postgres; + +use Symfony\AI\Platform\Vector\Vector; +use Symfony\AI\Platform\Vector\VectorInterface; +use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\VectorDocument; +use Symfony\AI\Store\Exception\InvalidArgumentException; +use Symfony\AI\Store\ManagedStoreInterface; +use Symfony\AI\Store\StoreInterface; +use Symfony\Component\Uid\Uuid; + +/** + * Hybrid Search Store for PostgreSQL/Supabase + * Combines pgvector (semantic) + PostgreSQL Full-Text Search (ts_rank_cd) using RRF. 
+ * + * Uses Reciprocal Rank Fusion (RRF) to combine vector similarity and full-text search, + * following the same approach as Supabase hybrid search implementation. + * + * Requirements: + * - PostgreSQL with pgvector extension + * - A 'content' text field for full-text search + * + * @see https://supabase.com/docs/guides/ai/hybrid-search + * + * @author Ahmed EBEN HASSINE + */ +final readonly class PostgresHybridStore implements ManagedStoreInterface, StoreInterface +{ + /** + * @param string $vectorFieldName Name of the vector field + * @param string $contentFieldName Name of the text field for FTS + * @param float $semanticRatio Ratio between semantic (vector) and keyword (FTS) search (0.0 to 1.0) + * - 0.0 = 100% keyword search (FTS) + * - 0.5 = balanced hybrid search + * - 1.0 = 100% semantic search (vector only) - default + * @param Distance $distance Distance metric for vector similarity + * @param string $language PostgreSQL text search configuration (default: 'simple') + * - 'simple': Works for ALL languages, no stemming (recommended for multilingual content) + * - 'english', 'french', 'spanish', etc.: Language-specific stemming/stopwords + * @param int $rrfK RRF (Reciprocal Rank Fusion) constant for hybrid search (default: 60) + * Higher values = more equal weighting between results + * @param float|null $defaultMaxScore Default maximum distance threshold for vector search (default: null = no filter) + * Applied as a WHERE filter on the vector distance in every search mode (vector, full-text and hybrid) + * Prevents returning irrelevant results with high distance scores + * Example: 0.8 means only return documents with distance <= 0.8 + */ + public function __construct( + private \PDO $connection, + private string $tableName, + private string $vectorFieldName = 'embedding', + private string $contentFieldName = 'content', + private float $semanticRatio = 1.0, + private Distance $distance = Distance::L2, + private string $language = 'simple', + private int $rrfK = 60, + private ?float 
$defaultMaxScore = null, + ) { + if ($semanticRatio < 0.0 || $semanticRatio > 1.0) { + throw new InvalidArgumentException(\sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $semanticRatio)); + } + } + + public function setup(array $options = []): void + { + // Enable pgvector extension + $this->connection->exec('CREATE EXTENSION IF NOT EXISTS vector'); + + // Create table with vector field, content field for FTS, and tsvector field + $this->connection->exec( + \sprintf( + 'CREATE TABLE IF NOT EXISTS %s ( + id UUID PRIMARY KEY, + metadata JSONB, + %s TEXT NOT NULL, + %s %s(%d) NOT NULL, + content_tsv tsvector GENERATED ALWAYS AS (to_tsvector(\'%s\', %s)) STORED + )', + $this->tableName, + $this->contentFieldName, + $this->vectorFieldName, + $options['vector_type'] ?? 'vector', + $options['vector_size'] ?? 1536, + $this->language, + $this->contentFieldName, + ), + ); + + // Create vector index + $this->connection->exec( + \sprintf( + 'CREATE INDEX IF NOT EXISTS %s_%s_idx ON %s USING %s (%s %s)', + $this->tableName, + $this->vectorFieldName, + $this->tableName, + $options['index_method'] ?? 'ivfflat', + $this->vectorFieldName, + $options['index_opclass'] ?? 
'vector_cosine_ops', + ), + ); + + // Create GIN index for full-text search + $this->connection->exec( + \sprintf( + 'CREATE INDEX IF NOT EXISTS %s_content_tsv_idx ON %s USING gin(content_tsv)', + $this->tableName, + $this->tableName, + ), + ); + } + + public function drop(): void + { + $this->connection->exec(\sprintf('DROP TABLE IF EXISTS %s', $this->tableName)); + } + + public function add(VectorDocument ...$documents): void + { + $statement = $this->connection->prepare( + \sprintf( + 'INSERT INTO %1$s (id, metadata, %2$s, %3$s) + VALUES (:id, :metadata, :content, :vector) + ON CONFLICT (id) DO UPDATE SET + metadata = EXCLUDED.metadata, + %2$s = EXCLUDED.%2$s, + %3$s = EXCLUDED.%3$s', + $this->tableName, + $this->contentFieldName, + $this->vectorFieldName, + ), + ); + + foreach ($documents as $document) { + $operation = [ + 'id' => $document->id->toRfc4122(), + 'metadata' => json_encode($document->metadata->getArrayCopy(), \JSON_THROW_ON_ERROR), + 'content' => $document->metadata->getText() ?? '', + 'vector' => $this->toPgvector($document->vector), + ]; + + $statement->execute($operation); + } + } + + /** + * Hybrid search combining vector similarity and full-text search. + * + * @param array{ + * q?: string, + * semanticRatio?: float, + * limit?: int, + * where?: string, + * params?: array, + * maxScore?: float + * } $options + */ + public function query(Vector $vector, array $options = []): array + { + $semanticRatio = $options['semanticRatio'] ?? $this->semanticRatio; + + if ($semanticRatio < 0.0 || $semanticRatio > 1.0) { + throw new InvalidArgumentException(\sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $semanticRatio)); + } + + $queryText = $options['q'] ?? ''; + $limit = $options['limit'] ?? 
5; + + // Build WHERE clause + $where = []; + $params = []; + + // Only add embedding param if we're doing vector search + if ($semanticRatio > 0.0) { + $params['embedding'] = $this->toPgvector($vector); + } + + // Use maxScore from options, or defaultMaxScore if configured + $maxScore = $options['maxScore'] ?? $this->defaultMaxScore; + + if (null !== $maxScore) { + $where[] = "({$this->vectorFieldName} {$this->distance->getComparisonSign()} :embedding) <= :maxScore"; + $params['maxScore'] = $maxScore; + // Ensure embedding is available if maxScore is used + if (!isset($params['embedding'])) { + $params['embedding'] = $this->toPgvector($vector); + } + } + + if ($options['where'] ?? false) { + $where[] = '('.$options['where'].')'; + } + + $whereClause = $where ? 'WHERE '.implode(' AND ', $where) : ''; + + // Choose query strategy based on semanticRatio and query text + if (1.0 === $semanticRatio || empty($queryText)) { + // Pure vector search + $sql = $this->buildVectorOnlyQuery($whereClause, $limit); + } elseif (0.0 === $semanticRatio) { + // Pure full-text search + $sql = $this->buildFtsOnlyQuery($whereClause, $limit); + $params['query'] = $queryText; + } else { + // Hybrid search with weighted combination + $sql = $this->buildHybridQuery($whereClause, $limit, $semanticRatio); + $params['query'] = $queryText; + } + + $statement = $this->connection->prepare($sql); + $statement->execute([...$params, ...($options['params'] ?? [])]); + + $documents = []; + foreach ($statement->fetchAll(\PDO::FETCH_ASSOC) as $result) { + $documents[] = new VectorDocument( + id: Uuid::fromString($result['id']), + vector: new Vector($this->fromPgvector($result['embedding'])), + metadata: new Metadata(json_decode($result['metadata'] ?? 
'{}', true, 512, \JSON_THROW_ON_ERROR)), + score: $result['score'], + ); + } + + return $documents; + } + + private function buildVectorOnlyQuery(string $whereClause, int $limit): string + { + return \sprintf(<<vectorFieldName, + $this->vectorFieldName, + $this->distance->getComparisonSign(), + $this->tableName, + $whereClause, + $limit, + ); + } + + private function buildFtsOnlyQuery(string $whereClause, int $limit): string + { + // Add FTS match filter to ensure only relevant documents are returned + $ftsFilter = \sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language); + + if ($whereClause) { + // Combine existing WHERE clause with FTS filter + $whereClause = str_replace('WHERE ', "WHERE $ftsFilter AND ", $whereClause); + } else { + $whereClause = "WHERE $ftsFilter"; + } + + return \sprintf(<<vectorFieldName, + $this->language, + $this->tableName, + $whereClause, + $limit, + ); + } + + private function buildHybridQuery(string $whereClause, int $limit, float $semanticRatio): string + { + // Add FTS filter for the fts_scores CTE + $ftsWhereClause = $whereClause; + $ftsFilter = \sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language); + + if ($whereClause) { + $ftsWhereClause = str_replace('WHERE ', "WHERE $ftsFilter AND ", $whereClause); + } else { + $ftsWhereClause = "WHERE $ftsFilter"; + } + + // RRF (Reciprocal Rank Fusion) - Same approach as Supabase + // Formula: COALESCE(1.0 / (k + rank), 0.0) * weight + // Lower score is better (like distance) + return \sprintf(<<vectorFieldName, + $this->vectorFieldName, + $this->distance->getComparisonSign(), + $this->tableName, + $whereClause, + $this->language, + $this->tableName, + $ftsWhereClause, + $this->rrfK, + $semanticRatio, + $this->rrfK, + 1.0 - $semanticRatio, + $limit, + ); + } + + private function toPgvector(VectorInterface $vector): string + { + return '['.implode(',', $vector->getData()).']'; + } + + /** + * @return float[] + */ + private function 
fromPgvector(string $vector): array + { + return json_decode($vector, true, 512, \JSON_THROW_ON_ERROR); + } +} diff --git a/src/store/tests/Bridge/Postgres/PostgresHybridStoreTest.php b/src/store/tests/Bridge/Postgres/PostgresHybridStoreTest.php new file mode 100644 index 000000000..75bde3818 --- /dev/null +++ b/src/store/tests/Bridge/Postgres/PostgresHybridStoreTest.php @@ -0,0 +1,499 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Tests\Bridge\Postgres; + +use PHPUnit\Framework\TestCase; +use Symfony\AI\Platform\Vector\Vector; +use Symfony\AI\Store\Bridge\Postgres\PostgresHybridStore; +use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\VectorDocument; +use Symfony\AI\Store\Exception\InvalidArgumentException; +use Symfony\Component\Uid\Uuid; + +final class PostgresHybridStoreTest extends TestCase +{ + public function testConstructorValidatesSemanticRatio() + { + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); + + $pdo = $this->createMock(\PDO::class); + new PostgresHybridStore($pdo, 'test_table', semanticRatio: 1.5); + } + + public function testConstructorValidatesNegativeSemanticRatio() + { + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); + + $pdo = $this->createMock(\PDO::class); + new PostgresHybridStore($pdo, 'test_table', semanticRatio: -0.5); + } + + public function testSetupCreatesTableWithFullTextSearchSupport() + { + $pdo = $this->createMock(\PDO::class); + $store = new PostgresHybridStore($pdo, 'hybrid_table'); + + $pdo->expects($this->exactly(4)) + ->method('exec') + ->willReturnCallback(function (string $sql): int { + static $callCount = 0; + ++$callCount; + + if (1 === $callCount) { + $this->assertSame('CREATE EXTENSION IF NOT 
EXISTS vector', $sql); + } elseif (2 === $callCount) { + $this->assertStringContainsString('CREATE TABLE IF NOT EXISTS hybrid_table', $sql); + $this->assertStringContainsString('content TEXT NOT NULL', $sql); + $this->assertStringContainsString('embedding vector(1536) NOT NULL', $sql); + $this->assertStringContainsString('content_tsv tsvector GENERATED ALWAYS AS (to_tsvector(\'simple\', content)) STORED', $sql); + } elseif (3 === $callCount) { + $this->assertStringContainsString('CREATE INDEX IF NOT EXISTS hybrid_table_embedding_idx', $sql); + } else { + $this->assertStringContainsString('CREATE INDEX IF NOT EXISTS hybrid_table_content_tsv_idx', $sql); + $this->assertStringContainsString('USING gin(content_tsv)', $sql); + } + + return 0; + }); + + $store->setup(); + } + + public function testAddDocument() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore($pdo, 'hybrid_table'); + + $expectedSql = 'INSERT INTO hybrid_table (id, metadata, content, embedding) + VALUES (:id, :metadata, :content, :vector) + ON CONFLICT (id) DO UPDATE SET + metadata = EXCLUDED.metadata, + content = EXCLUDED.content, + embedding = EXCLUDED.embedding'; + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) use ($expectedSql) { + return $this->normalizeQuery($sql) === $this->normalizeQuery($expectedSql); + })) + ->willReturn($statement); + + $uuid = Uuid::v4(); + + $statement->expects($this->once()) + ->method('execute') + ->with([ + 'id' => $uuid->toRfc4122(), + 'metadata' => json_encode(['_text' => 'Test content', 'category' => 'test']), + 'content' => 'Test content', + 'vector' => '[0.1,0.2,0.3]', + ]); + + $metadata = new Metadata(['_text' => 'Test content', 'category' => 'test']); + $document = new VectorDocument($uuid, new Vector([0.1, 0.2, 0.3]), $metadata); + $store->add($document); + } + + public function testPureVectorSearch() + { + $pdo = 
$this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 1.0); + + $expectedSql = 'SELECT id, embedding AS embedding, metadata, (embedding <-> :embedding) AS score + FROM hybrid_table + + ORDER BY score ASC + LIMIT 5'; + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) use ($expectedSql) { + return $this->normalizeQuery($sql) === $this->normalizeQuery($expectedSql); + })) + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute') + ->with(['embedding' => '[0.1,0.2,0.3]']); + + $uuid = Uuid::v4(); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([ + [ + 'id' => $uuid->toRfc4122(), + 'embedding' => '[0.1,0.2,0.3]', + 'metadata' => json_encode(['text' => 'Test Document']), + 'score' => 0.05, + ], + ]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3])); + + $this->assertCount(1, $results); + $this->assertInstanceOf(VectorDocument::class, $results[0]); + $this->assertSame(0.05, $results[0]->score); + } + + public function testPureKeywordSearch() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 0.0); + + $expectedSql = "SELECT id, embedding AS embedding, metadata, + (1.0 / (1.0 + ts_rank_cd(content_tsv, websearch_to_tsquery('simple', :query)))) AS score + FROM hybrid_table + WHERE content_tsv @@ websearch_to_tsquery('simple', :query) + ORDER BY score ASC + LIMIT 5"; + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) use ($expectedSql) { + return $this->normalizeQuery($sql) === $this->normalizeQuery($expectedSql); + })) + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute') + ->with($this->callback(function ($params) { + return 
isset($params['query']) && 'PostgreSQL' === $params['query']; + })); + + $uuid = Uuid::v4(); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([ + [ + 'id' => $uuid->toRfc4122(), + 'embedding' => '[0.1,0.2,0.3]', + 'metadata' => json_encode(['text' => 'PostgreSQL is awesome']), + 'score' => 0.5, + ], + ]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'PostgreSQL']); + + $this->assertCount(1, $results); + $this->assertSame(0.5, $results[0]->score); + } + + public function testHybridSearchWithRRF() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrfK: 60); + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) { + // Check for RRF CTE structure + $this->assertStringContainsString('WITH vector_scores AS', $sql); + $this->assertStringContainsString('fts_scores AS', $sql); + $this->assertStringContainsString('ROW_NUMBER() OVER', $sql); + $this->assertStringContainsString('COALESCE(1.0 / (60 + v.rank_ix), 0.0)', $sql); + $this->assertStringContainsString('FULL OUTER JOIN', $sql); + $this->assertStringContainsString('ORDER BY score DESC', $sql); + + return true; + })) + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute') + ->with($this->callback(function ($params) { + return isset($params['embedding']) && isset($params['query']); + })); + + $uuid = Uuid::v4(); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([ + [ + 'id' => $uuid->toRfc4122(), + 'embedding' => '[0.1,0.2,0.3]', + 'metadata' => json_encode(['text' => 'PostgreSQL database']), + 'score' => 0.025, + ], + ]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'PostgreSQL', 'semanticRatio' => 0.5]); + + $this->assertCount(1, $results); + $this->assertSame(0.025, 
$results[0]->score); + } + + public function testQueryWithDefaultMaxScore() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore( + $pdo, + 'hybrid_table', + semanticRatio: 1.0, + defaultMaxScore: 0.8 + ); + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) { + $this->assertStringContainsString('WHERE (embedding <-> :embedding) <= :maxScore', $sql); + + return true; + })) + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute') + ->with($this->callback(function ($params) { + return isset($params['embedding']) + && isset($params['maxScore']) + && 0.8 === $params['maxScore']; + })); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3])); + + $this->assertCount(0, $results); + } + + public function testQueryWithMaxScoreOverride() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore( + $pdo, + 'hybrid_table', + semanticRatio: 1.0, + defaultMaxScore: 0.8 + ); + + $pdo->expects($this->once()) + ->method('prepare') + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute') + ->with($this->callback(function ($params) { + // Should use override value 0.5, not default 0.8 + return isset($params['maxScore']) && 0.5 === $params['maxScore']; + })); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3]), ['maxScore' => 0.5]); + + $this->assertCount(0, $results); + } + + public function testQueryWithCustomLanguage() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 
0.0, language: 'french'); + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) { + $this->assertStringContainsString("websearch_to_tsquery('french'", $sql); + + return true; + })) + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute'); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([]); + + $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'développement']); + } + + public function testQueryWithCustomRRFK() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrfK: 100); + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) { + $this->assertStringContainsString('COALESCE(1.0 / (100 + v.rank_ix), 0.0)', $sql); + $this->assertStringContainsString('COALESCE(1.0 / (100 + f.rank_ix), 0.0)', $sql); + + return true; + })) + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute'); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([]); + + $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'test']); + } + + public function testQueryInvalidSemanticRatioInOptions() + { + $pdo = $this->createMock(\PDO::class); + $store = new PostgresHybridStore($pdo, 'hybrid_table'); + + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); + + $store->query(new Vector([0.1, 0.2, 0.3]), ['semanticRatio' => 1.5]); + } + + public function testDrop() + { + $pdo = $this->createMock(\PDO::class); + $store = new PostgresHybridStore($pdo, 'hybrid_table'); + + $pdo->expects($this->once()) + ->method('exec') + ->with('DROP TABLE IF EXISTS hybrid_table'); + + $store->drop(); + } + + public function testQueryWithCustomLimit() + { + $pdo 
= $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 1.0); + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) { + $this->assertStringContainsString('LIMIT 10', $sql); + + return true; + })) + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute'); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([]); + + $store->query(new Vector([0.1, 0.2, 0.3]), ['limit' => 10]); + } + + public function testAddMultipleDocuments() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore($pdo, 'hybrid_table'); + + $pdo->expects($this->once()) + ->method('prepare') + ->willReturn($statement); + + $uuid1 = Uuid::v4(); + $uuid2 = Uuid::v4(); + + $statement->expects($this->exactly(2)) + ->method('execute') + ->willReturnCallback(function (array $params) use ($uuid1, $uuid2): bool { + static $callCount = 0; + ++$callCount; + + if (1 === $callCount) { + $this->assertSame($uuid1->toRfc4122(), $params['id']); + $this->assertSame('First document', $params['content']); + } else { + $this->assertSame($uuid2->toRfc4122(), $params['id']); + $this->assertSame('Second document', $params['content']); + } + + return true; + }); + + $metadata1 = new Metadata(['_text' => 'First document']); + $metadata2 = new Metadata(['_text' => 'Second document']); + + $document1 = new VectorDocument($uuid1, new Vector([0.1, 0.2, 0.3]), $metadata1); + $document2 = new VectorDocument($uuid2, new Vector([0.4, 0.5, 0.6]), $metadata2); + + $store->add($document1, $document2); + } + + public function testPureKeywordSearchReturnsEmptyWhenNoMatch() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new PostgresHybridStore($pdo, 
'hybrid_table', semanticRatio: 0.0); + + $pdo->expects($this->once()) + ->method('prepare') + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute'); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'zzzzzzzzzzzzz']); + + $this->assertCount(0, $results); + } + + private function normalizeQuery(string $query): string + { + // Remove extra spaces, tabs and newlines + $normalized = preg_replace('/\s+/', ' ', $query); + + // Trim the result + return trim($normalized); + } +} From 5b43dd19fba9ae6dbbbd19554eac39b84481b9cd Mon Sep 17 00:00:00 2001 From: Ahmed EBEN HASSINE Date: Thu, 16 Oct 2025 09:32:27 +0200 Subject: [PATCH 2/9] fix(style): apply php-cs-fixer and fix phpstan alerts --- src/store/src/Bridge/Postgres/PostgresHybridStore.php | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/store/src/Bridge/Postgres/PostgresHybridStore.php b/src/store/src/Bridge/Postgres/PostgresHybridStore.php index 1886e9683..624cc0e96 100644 --- a/src/store/src/Bridge/Postgres/PostgresHybridStore.php +++ b/src/store/src/Bridge/Postgres/PostgresHybridStore.php @@ -160,7 +160,7 @@ public function add(VectorDocument ...$documents): void * semanticRatio?: float, * limit?: int, * where?: string, - * params?: array, + * params?: array, * maxScore?: float * } $options */ @@ -196,14 +196,14 @@ public function query(Vector $vector, array $options = []): array } } - if ($options['where'] ?? false) { + if (isset($options['where']) && '' !== $options['where']) { $where[] = '('.$options['where'].')'; } $whereClause = $where ? 
'WHERE '.implode(' AND ', $where) : ''; // Choose query strategy based on semanticRatio and query text - if (1.0 === $semanticRatio || empty($queryText)) { + if (1.0 === $semanticRatio || '' === $queryText) { // Pure vector search $sql = $this->buildVectorOnlyQuery($whereClause, $limit); } elseif (0.0 === $semanticRatio) { @@ -255,7 +255,7 @@ private function buildFtsOnlyQuery(string $whereClause, int $limit): string // Add FTS match filter to ensure only relevant documents are returned $ftsFilter = \sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language); - if ($whereClause) { + if ('' !== $whereClause) { // Combine existing WHERE clause with FTS filter $whereClause = str_replace('WHERE ', "WHERE $ftsFilter AND ", $whereClause); } else { @@ -284,7 +284,7 @@ private function buildHybridQuery(string $whereClause, int $limit, float $semant $ftsWhereClause = $whereClause; $ftsFilter = \sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language); - if ($whereClause) { + if ('' !== $whereClause) { $ftsWhereClause = str_replace('WHERE ', "WHERE $ftsFilter AND ", $whereClause); } else { $ftsWhereClause = "WHERE $ftsFilter"; From e33b2c20973769abdcf722d33f653ea37defff3f Mon Sep 17 00:00:00 2001 From: Ahmed EBEN HASSINE Date: Thu, 30 Oct 2025 15:23:46 +0100 Subject: [PATCH 3/9] refactor(store): centralize WHERE clause building in PostgresHybridStore - Extract WHERE clause logic into addFilterToWhereClause() helper method - Fix embedding param logic: ensure it's set before maxScore uses it - Replace fragile str_replace() with robust str_starts_with() approach - Remove code duplication between buildFtsOnlyQuery and buildHybridQuery This addresses review feedback about fragile WHERE clause manipulation and centralizes the logic in a single, reusable method. 
--- .../Bridge/Postgres/PostgresHybridStore.php | 56 +++++++++++-------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/src/store/src/Bridge/Postgres/PostgresHybridStore.php b/src/store/src/Bridge/Postgres/PostgresHybridStore.php index 624cc0e96..a7e1e503c 100644 --- a/src/store/src/Bridge/Postgres/PostgresHybridStore.php +++ b/src/store/src/Bridge/Postgres/PostgresHybridStore.php @@ -33,7 +33,7 @@ * * @see https://supabase.com/docs/guides/ai/hybrid-search * - * @author Ahmed EBEN HASSINE + * @author Ahmed EBEN HASSINE */ final readonly class PostgresHybridStore implements ManagedStoreInterface, StoreInterface { @@ -179,21 +179,17 @@ public function query(Vector $vector, array $options = []): array $where = []; $params = []; - // Only add embedding param if we're doing vector search - if ($semanticRatio > 0.0) { - $params['embedding'] = $this->toPgvector($vector); - } - // Use maxScore from options, or defaultMaxScore if configured $maxScore = $options['maxScore'] ?? $this->defaultMaxScore; + // Ensure embedding param is set if maxScore is used (regardless of semanticRatio) + if ($semanticRatio > 0.0 || null !== $maxScore) { + $params['embedding'] = $this->toPgvector($vector); + } + if (null !== $maxScore) { $where[] = "({$this->vectorFieldName} {$this->distance->getComparisonSign()} :embedding) <= :maxScore"; $params['maxScore'] = $maxScore; - // Ensure embedding is available if maxScore is used - if (!isset($params['embedding'])) { - $params['embedding'] = $this->toPgvector($vector); - } } if (isset($options['where']) && '' !== $options['where']) { @@ -254,13 +250,7 @@ private function buildFtsOnlyQuery(string $whereClause, int $limit): string { // Add FTS match filter to ensure only relevant documents are returned $ftsFilter = \sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language); - - if ('' !== $whereClause) { - // Combine existing WHERE clause with FTS filter - $whereClause = str_replace('WHERE ', "WHERE $ftsFilter AND 
", $whereClause); - } else { - $whereClause = "WHERE $ftsFilter"; - } + $whereClause = $this->addFilterToWhereClause($whereClause, $ftsFilter); return \sprintf(<<language); - - if ('' !== $whereClause) { - $ftsWhereClause = str_replace('WHERE ', "WHERE $ftsFilter AND ", $whereClause); - } else { - $ftsWhereClause = "WHERE $ftsFilter"; - } + $ftsWhereClause = $this->addFilterToWhereClause($whereClause, $ftsFilter); // RRF (Reciprocal Rank Fusion) - Same approach as Supabase // Formula: COALESCE(1.0 / (k + rank), 0.0) * weight @@ -333,6 +317,30 @@ private function buildHybridQuery(string $whereClause, int $limit, float $semant ); } + /** + * Adds a filter condition to an existing WHERE clause using AND logic. + * + * @param string $whereClause Existing WHERE clause (may be empty or start with 'WHERE ') + * @param string $filter Filter condition to add (without 'WHERE ') + * + * @return string Combined WHERE clause + */ + private function addFilterToWhereClause(string $whereClause, string $filter): string + { + if ('' === $whereClause) { + return "WHERE $filter"; + } + + $whereClause = rtrim($whereClause); + + if (str_starts_with($whereClause, 'WHERE ')) { + return "$whereClause AND $filter"; + } + + // Unexpected format, prepend WHERE + return "WHERE $filter AND ".ltrim($whereClause); + } + private function toPgvector(VectorInterface $vector): string { return '['.implode(',', $vector->getData()).']'; From 954d44dae38628543fff71920868c5c15e2bde69 Mon Sep 17 00:00:00 2001 From: Ahmed EBEN HASSINE Date: Thu, 30 Oct 2025 15:31:49 +0100 Subject: [PATCH 4/9] refactor(store): rename PostgresHybridStore to HybridStore - Rename class from PostgresHybridStore to HybridStore - The namespace already indicates it's Postgres-specific - Add postgres-hybrid.php RAG example demonstrating: * Different semantic ratios (0.0, 0.5, 1.0) * RRF (Reciprocal Rank Fusion) hybrid search * Full-text search with 'q' parameter * Per-query semanticRatio override --- examples/rag/postgres-hybrid.php 
| 126 ++++++++++++++++++ ...ostgresHybridStore.php => HybridStore.php} | 2 +- ...ybridStoreTest.php => HybridStoreTest.php} | 36 ++--- 3 files changed, 145 insertions(+), 19 deletions(-) create mode 100644 examples/rag/postgres-hybrid.php rename src/store/src/Bridge/Postgres/{PostgresHybridStore.php => HybridStore.php} (99%) rename src/store/tests/Bridge/Postgres/{PostgresHybridStoreTest.php => HybridStoreTest.php} (92%) diff --git a/examples/rag/postgres-hybrid.php b/examples/rag/postgres-hybrid.php new file mode 100644 index 000000000..032fc1678 --- /dev/null +++ b/examples/rag/postgres-hybrid.php @@ -0,0 +1,126 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +use Doctrine\DBAL\DriverManager; +use Doctrine\DBAL\Tools\DsnParser; +use Symfony\AI\Fixtures\Movies; +use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; +use Symfony\AI\Store\Bridge\Postgres\HybridStore; +use Symfony\AI\Store\Document\Loader\InMemoryLoader; +use Symfony\AI\Store\Document\Metadata; +use Symfony\AI\Store\Document\TextDocument; +use Symfony\AI\Store\Document\Vectorizer; +use Symfony\AI\Store\Indexer; +use Symfony\Component\Uid\Uuid; + +require_once dirname(__DIR__).'/bootstrap.php'; + +echo "=== PostgreSQL Hybrid Search Demo ===\n\n"; +echo "This example demonstrates how to configure the semantic ratio to balance\n"; +echo "between semantic (vector) search and PostgreSQL Full-Text Search.\n\n"; + +// Initialize the hybrid store with balanced search (50/50) +$connection = DriverManager::getConnection((new DsnParser())->parse(env('POSTGRES_URI'))); +$pdo = $connection->getNativeConnection(); + +if (!$pdo instanceof PDO) { + throw new RuntimeException('Unable to get native PDO connection from Doctrine DBAL'); +} + +$store = new HybridStore( + connection: $pdo, + tableName: 'hybrid_movies', + semanticRatio: 0.5, // Balanced hybrid search by default +); + +// Create embeddings and documents 
+$documents = []; +foreach (Movies::all() as $i => $movie) { + $documents[] = new TextDocument( + id: Uuid::v4(), + content: 'Title: '.$movie['title'].\PHP_EOL.'Director: '.$movie['director'].\PHP_EOL.'Description: '.$movie['description'], + metadata: new Metadata(array_merge($movie, ['content' => 'Title: '.$movie['title'].\PHP_EOL.'Director: '.$movie['director'].\PHP_EOL.'Description: '.$movie['description']])), + ); +} + +// Initialize the table +$store->setup(); + +// Create embeddings for documents +$platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client()); +$vectorizer = new Vectorizer($platform, 'text-embedding-3-small', logger()); +$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $store, logger: logger()); +$indexer->index($documents); + +// Create a query embedding +$queryText = 'futuristic technology and artificial intelligence'; +echo "Query: \"$queryText\"\n\n"; +$queryEmbedding = $vectorizer->vectorize($queryText); + +// Test different semantic ratios to compare results +$ratios = [ + ['ratio' => 0.0, 'description' => '100% Full-text search (keyword matching)'], + ['ratio' => 0.5, 'description' => 'Balanced hybrid (RRF: 50% semantic + 50% FTS)'], + ['ratio' => 1.0, 'description' => '100% Semantic search (vector similarity)'], +]; + +foreach ($ratios as $config) { + echo "--- {$config['description']} ---\n"; + + // Override the semantic ratio for this specific query + $results = $store->query($queryEmbedding, [ + 'semanticRatio' => $config['ratio'], + 'q' => 'technology', // Full-text search keyword + 'limit' => 3, + ]); + + echo "Top 3 results:\n"; + foreach ($results as $i => $result) { + $metadata = $result->metadata->getArrayCopy(); + echo sprintf( + " %d. %s (Score: %.4f)\n", + $i + 1, + $metadata['title'] ?? 'Unknown', + $result->score ?? 
0.0 + ); + } + echo "\n"; +} + +echo "--- Custom query with pure semantic search ---\n"; +echo "Query: Movies about space exploration\n"; +$spaceEmbedding = $vectorizer->vectorize('space exploration and cosmic adventures'); +$results = $store->query($spaceEmbedding, [ + 'semanticRatio' => 1.0, // Pure semantic search + 'limit' => 3, +]); + +echo "Top 3 results:\n"; +foreach ($results as $i => $result) { + $metadata = $result->metadata->getArrayCopy(); + echo sprintf( + " %d. %s (Score: %.4f)\n", + $i + 1, + $metadata['title'] ?? 'Unknown', + $result->score ?? 0.0 + ); +} +echo "\n"; + +// Cleanup +$store->drop(); + +echo "=== Summary ===\n"; +echo "- semanticRatio = 0.0: Best for exact keyword matches (PostgreSQL FTS)\n"; +echo "- semanticRatio = 0.5: Balanced approach using RRF (Reciprocal Rank Fusion)\n"; +echo "- semanticRatio = 1.0: Best for conceptual similarity searches (pgvector)\n"; +echo "\nYou can set the default ratio when instantiating the HybridStore,\n"; +echo "and override it per query using the 'semanticRatio' option.\n"; diff --git a/src/store/src/Bridge/Postgres/PostgresHybridStore.php b/src/store/src/Bridge/Postgres/HybridStore.php similarity index 99% rename from src/store/src/Bridge/Postgres/PostgresHybridStore.php rename to src/store/src/Bridge/Postgres/HybridStore.php index a7e1e503c..5c225fa93 100644 --- a/src/store/src/Bridge/Postgres/PostgresHybridStore.php +++ b/src/store/src/Bridge/Postgres/HybridStore.php @@ -35,7 +35,7 @@ * * @author Ahmed EBEN HASSINE */ -final readonly class PostgresHybridStore implements ManagedStoreInterface, StoreInterface +final readonly class HybridStore implements ManagedStoreInterface, StoreInterface { /** * @param string $vectorFieldName Name of the vector field diff --git a/src/store/tests/Bridge/Postgres/PostgresHybridStoreTest.php b/src/store/tests/Bridge/Postgres/HybridStoreTest.php similarity index 92% rename from src/store/tests/Bridge/Postgres/PostgresHybridStoreTest.php rename to 
src/store/tests/Bridge/Postgres/HybridStoreTest.php index 75bde3818..2fdd59e2a 100644 --- a/src/store/tests/Bridge/Postgres/PostgresHybridStoreTest.php +++ b/src/store/tests/Bridge/Postgres/HybridStoreTest.php @@ -13,13 +13,13 @@ use PHPUnit\Framework\TestCase; use Symfony\AI\Platform\Vector\Vector; -use Symfony\AI\Store\Bridge\Postgres\PostgresHybridStore; +use Symfony\AI\Store\Bridge\Postgres\HybridStore; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\VectorDocument; use Symfony\AI\Store\Exception\InvalidArgumentException; use Symfony\Component\Uid\Uuid; -final class PostgresHybridStoreTest extends TestCase +final class HybridStoreTest extends TestCase { public function testConstructorValidatesSemanticRatio() { @@ -27,7 +27,7 @@ public function testConstructorValidatesSemanticRatio() $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); $pdo = $this->createMock(\PDO::class); - new PostgresHybridStore($pdo, 'test_table', semanticRatio: 1.5); + new HybridStore($pdo, 'test_table', semanticRatio: 1.5); } public function testConstructorValidatesNegativeSemanticRatio() @@ -36,13 +36,13 @@ public function testConstructorValidatesNegativeSemanticRatio() $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); $pdo = $this->createMock(\PDO::class); - new PostgresHybridStore($pdo, 'test_table', semanticRatio: -0.5); + new HybridStore($pdo, 'test_table', semanticRatio: -0.5); } public function testSetupCreatesTableWithFullTextSearchSupport() { $pdo = $this->createMock(\PDO::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table'); + $store = new HybridStore($pdo, 'hybrid_table'); $pdo->expects($this->exactly(4)) ->method('exec') @@ -75,7 +75,7 @@ public function testAddDocument() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table'); + $store = new HybridStore($pdo, 'hybrid_table'); $expectedSql = 
'INSERT INTO hybrid_table (id, metadata, content, embedding) VALUES (:id, :metadata, :content, :vector) @@ -112,7 +112,7 @@ public function testPureVectorSearch() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 1.0); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 1.0); $expectedSql = 'SELECT id, embedding AS embedding, metadata, (embedding <-> :embedding) AS score FROM hybrid_table @@ -157,7 +157,7 @@ public function testPureKeywordSearch() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 0.0); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.0); $expectedSql = "SELECT id, embedding AS embedding, metadata, (1.0 / (1.0 + ts_rank_cd(content_tsv, websearch_to_tsquery('simple', :query)))) AS score @@ -204,7 +204,7 @@ public function testHybridSearchWithRRF() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrfK: 60); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrfK: 60); $pdo->expects($this->once()) ->method('prepare') @@ -252,7 +252,7 @@ public function testQueryWithDefaultMaxScore() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore( + $store = new HybridStore( $pdo, 'hybrid_table', semanticRatio: 1.0, @@ -291,7 +291,7 @@ public function testQueryWithMaxScoreOverride() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore( + $store = new HybridStore( $pdo, 'hybrid_table', semanticRatio: 1.0, @@ -324,7 +324,7 @@ public function testQueryWithCustomLanguage() $pdo = $this->createMock(\PDO::class); $statement = 
$this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 0.0, language: 'french'); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.0, language: 'french'); $pdo->expects($this->once()) ->method('prepare') @@ -351,7 +351,7 @@ public function testQueryWithCustomRRFK() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrfK: 100); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrfK: 100); $pdo->expects($this->once()) ->method('prepare') @@ -377,7 +377,7 @@ public function testQueryWithCustomRRFK() public function testQueryInvalidSemanticRatioInOptions() { $pdo = $this->createMock(\PDO::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table'); + $store = new HybridStore($pdo, 'hybrid_table'); $this->expectException(InvalidArgumentException::class); $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); @@ -388,7 +388,7 @@ public function testQueryInvalidSemanticRatioInOptions() public function testDrop() { $pdo = $this->createMock(\PDO::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table'); + $store = new HybridStore($pdo, 'hybrid_table'); $pdo->expects($this->once()) ->method('exec') @@ -402,7 +402,7 @@ public function testQueryWithCustomLimit() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 1.0); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 1.0); $pdo->expects($this->once()) ->method('prepare') @@ -429,7 +429,7 @@ public function testAddMultipleDocuments() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table'); + $store = new HybridStore($pdo, 'hybrid_table'); 
$pdo->expects($this->once()) ->method('prepare') @@ -469,7 +469,7 @@ public function testPureKeywordSearchReturnsEmptyWhenNoMatch() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new PostgresHybridStore($pdo, 'hybrid_table', semanticRatio: 0.0); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.0); $pdo->expects($this->once()) ->method('prepare') From 2c7b49a6c64ed6cd9866d5ce3e511687d166b5e9 Mon Sep 17 00:00:00 2001 From: Ahmed EBEN HASSINE Date: Fri, 7 Nov 2025 14:24:30 +0100 Subject: [PATCH 5/9] Add postgres_hybrid store config and tests --- examples/rag/postgres-hybrid.php | 3 +- src/ai-bundle/config/options.php | 48 ++++++++++++ src/ai-bundle/src/AiBundle.php | 76 +++++++++++++++++++ .../DependencyInjection/AiBundleTest.php | 69 +++++++++++++++++ src/store/src/Bridge/Postgres/HybridStore.php | 2 +- 5 files changed, 196 insertions(+), 2 deletions(-) diff --git a/examples/rag/postgres-hybrid.php b/examples/rag/postgres-hybrid.php index 032fc1678..f71fa24db 100644 --- a/examples/rag/postgres-hybrid.php +++ b/examples/rag/postgres-hybrid.php @@ -18,6 +18,7 @@ use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; use Symfony\AI\Store\Document\Vectorizer; +use Symfony\AI\Store\Exception\RuntimeException; use Symfony\AI\Store\Indexer; use Symfony\Component\Uid\Uuid; @@ -32,7 +33,7 @@ $pdo = $connection->getNativeConnection(); if (!$pdo instanceof PDO) { - throw new RuntimeException('Unable to get native PDO connection from Doctrine DBAL'); + throw new RuntimeException('Unable to get native PDO connection from Doctrine DBAL.'); } $store = new HybridStore( diff --git a/src/ai-bundle/config/options.php b/src/ai-bundle/config/options.php index bafabd267..0479b6492 100644 --- a/src/ai-bundle/config/options.php +++ b/src/ai-bundle/config/options.php @@ -738,6 +738,54 @@ ->end() ->end() ->end() + ->arrayNode('postgres_hybrid') + ->info('PostgreSQL Hybrid Search combining 
pgvector (semantic) and Full-Text Search (lexical) using RRF') + ->useAttributeAsKey('name') + ->arrayPrototype() + ->children() + ->stringNode('connection')->cannotBeEmpty()->end() + ->stringNode('dsn')->cannotBeEmpty()->end() + ->stringNode('username')->end() + ->stringNode('password')->end() + ->stringNode('table_name')->isRequired()->end() + ->stringNode('vector_field')->defaultValue('embedding')->end() + ->stringNode('content_field')->defaultValue('content')->end() + ->floatNode('semantic_ratio') + ->info('Ratio between semantic (vector) and keyword (FTS) search. 0.0 = pure FTS, 0.5 = balanced, 1.0 = pure semantic') + ->defaultValue(1.0) + ->min(0.0) + ->max(1.0) + ->end() + ->enumNode('distance') + ->info('Distance metric to use for vector similarity search') + ->enumFqcn(PostgresDistance::class) + ->defaultValue(PostgresDistance::L2) + ->end() + ->stringNode('language') + ->info('PostgreSQL text search configuration (e.g., "simple", "english", "french"). Default: "simple" (multilingual)') + ->defaultValue('simple') + ->end() + ->integerNode('rrf_k') + ->info('RRF (Reciprocal Rank Fusion) constant. Higher = more equal weighting. 
Default: 60 (Supabase)') + ->defaultValue(60) + ->min(1) + ->end() + ->floatNode('default_max_score') + ->info('Default maximum distance threshold for filtering results (optional)') + ->defaultNull() + ->end() + ->stringNode('dbal_connection')->cannotBeEmpty()->end() + ->end() + ->validate() + ->ifTrue(static fn ($v) => !isset($v['dsn']) && !isset($v['dbal_connection']) && !isset($v['connection'])) + ->thenInvalid('Either "dsn", "dbal_connection", or "connection" must be configured.') + ->end() + ->validate() + ->ifTrue(static fn ($v) => (int) isset($v['dsn']) + (int) isset($v['dbal_connection']) + (int) isset($v['connection']) > 1) + ->thenInvalid('Only one of "dsn", "dbal_connection", or "connection" can be configured.') + ->end() + ->end() + ->end() ->end() ->end() ->arrayNode('message_store') diff --git a/src/ai-bundle/src/AiBundle.php b/src/ai-bundle/src/AiBundle.php index a602b339a..297ad0161 100644 --- a/src/ai-bundle/src/AiBundle.php +++ b/src/ai-bundle/src/AiBundle.php @@ -79,6 +79,7 @@ use Symfony\AI\Store\Bridge\MongoDb\Store as MongoDbStore; use Symfony\AI\Store\Bridge\Neo4j\Store as Neo4jStore; use Symfony\AI\Store\Bridge\Pinecone\Store as PineconeStore; +use Symfony\AI\Store\Bridge\Postgres\HybridStore; use Symfony\AI\Store\Bridge\Postgres\Store as PostgresStore; use Symfony\AI\Store\Bridge\Qdrant\Store as QdrantStore; use Symfony\AI\Store\Bridge\Redis\Store as RedisStore; @@ -1366,6 +1367,81 @@ private function processStoreConfig(string $type, array $stores, ContainerBuilde } } + if ('postgres_hybrid' === $type) { + foreach ($stores as $name => $store) { + $definition = new Definition(HybridStore::class); + + // Handle connection (PDO service reference, DBAL connection, or DSN) + if (\array_key_exists('connection', $store)) { + // Direct PDO service reference + $serviceId = ltrim($store['connection'], '@'); + $connection = new Reference($serviceId); + $arguments = [ + $connection, + $store['table_name'], + ]; + } elseif 
(\array_key_exists('dbal_connection', $store)) { + // DBAL connection - extract native PDO + $connection = (new Definition(\PDO::class)) + ->setFactory([new Reference($store['dbal_connection']), 'getNativeConnection']); + $arguments = [ + $connection, + $store['table_name'], + ]; + } else { + // Create new PDO instance from DSN + $pdo = new Definition(\PDO::class); + $pdo->setArguments([ + $store['dsn'], + $store['username'] ?? null, + $store['password'] ?? null], + ); + + $arguments = [ + $pdo, + $store['table_name'], + ]; + } + + // Add optional parameters + if (\array_key_exists('vector_field', $store)) { + $arguments[2] = $store['vector_field']; + } + + if (\array_key_exists('content_field', $store)) { + $arguments[3] = $store['content_field']; + } + + if (\array_key_exists('semantic_ratio', $store)) { + $arguments[4] = $store['semantic_ratio']; + } + + if (\array_key_exists('distance', $store)) { + $arguments[5] = $store['distance']; + } + + if (\array_key_exists('language', $store)) { + $arguments[6] = $store['language']; + } + + if (\array_key_exists('rrf_k', $store)) { + $arguments[7] = $store['rrf_k']; + } + + if (\array_key_exists('default_max_score', $store)) { + $arguments[8] = $store['default_max_score']; + } + + $definition + ->addTag('ai.store') + ->setArguments($arguments); + + $container->setDefinition('ai.store.'.$type.'.'.$name, $definition); + $container->registerAliasForArgument('ai.store.'.$type.'.'.$name, StoreInterface::class, $name); + $container->registerAliasForArgument('ai.store.'.$type.'.'.$name, StoreInterface::class, $type.'_'.$name); + } + } + if ('supabase' === $type) { foreach ($stores as $name => $store) { $arguments = [ diff --git a/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php b/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php index 37e515b92..b6cfa100f 100644 --- a/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php +++ b/src/ai-bundle/tests/DependencyInjection/AiBundleTest.php @@ -540,6 +540,75 @@ 
public function testPostgresStoreWithDifferentConnectionCanBeConfigured() $this->assertInstanceOf(Reference::class, $definition->getArgument(0)); } + public function testPostgresHybridStoreWithDsnCanBeConfigured() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'postgres_hybrid' => [ + 'hybrid_db' => [ + 'dsn' => 'pgsql:host=localhost;port=5432;dbname=testdb', + 'username' => 'app', + 'password' => 'mypass', + 'table_name' => 'hybrid_vectors', + 'semantic_ratio' => 0.7, + 'language' => 'english', + ], + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.store.postgres_hybrid.hybrid_db')); + $definition = $container->getDefinition('ai.store.postgres_hybrid.hybrid_db'); + $this->assertInstanceOf(Definition::class, $definition->getArgument(0)); + $this->assertSame('hybrid_vectors', $definition->getArgument(1)); + } + + public function testPostgresHybridStoreWithDbalConnectionCanBeConfigured() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'postgres_hybrid' => [ + 'hybrid_db' => [ + 'dbal_connection' => 'my_connection', + 'table_name' => 'hybrid_vectors', + 'rrf_k' => 100, + ], + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.store.postgres_hybrid.hybrid_db')); + $definition = $container->getDefinition('ai.store.postgres_hybrid.hybrid_db'); + $this->assertInstanceOf(Definition::class, $definition->getArgument(0)); + $this->assertSame('hybrid_vectors', $definition->getArgument(1)); + $this->assertSame(100, $definition->getArgument(7)); + } + + public function testPostgresHybridStoreWithConnectionReferenceCanBeConfigured() + { + $container = $this->buildContainer([ + 'ai' => [ + 'store' => [ + 'postgres_hybrid' => [ + 'hybrid_db' => [ + 'connection' => '@my_pdo_service', + 'table_name' => 'hybrid_vectors', + ], + ], + ], + ], + ]); + + $this->assertTrue($container->hasDefinition('ai.store.postgres_hybrid.hybrid_db')); + $definition = 
$container->getDefinition('ai.store.postgres_hybrid.hybrid_db'); + $this->assertInstanceOf(Reference::class, $definition->getArgument(0)); + $this->assertSame('my_pdo_service', (string) $definition->getArgument(0)); + } + public function testConfigurationWithUseAttributeAsKeyWorksWithoutNormalizeKeys() { // Test that configurations using useAttributeAsKey work correctly diff --git a/src/store/src/Bridge/Postgres/HybridStore.php b/src/store/src/Bridge/Postgres/HybridStore.php index 5c225fa93..80e1bad9b 100644 --- a/src/store/src/Bridge/Postgres/HybridStore.php +++ b/src/store/src/Bridge/Postgres/HybridStore.php @@ -35,7 +35,7 @@ * * @author Ahmed EBEN HASSINE */ -final readonly class HybridStore implements ManagedStoreInterface, StoreInterface +final class HybridStore implements ManagedStoreInterface, StoreInterface { /** * @param string $vectorFieldName Name of the vector field From 1293ac1ccd490fb45a56c347bfe62f0cc5dc79e3 Mon Sep 17 00:00:00 2001 From: Ahmed EBEN HASSINE Date: Sun, 23 Nov 2025 20:34:32 +0100 Subject: [PATCH 6/9] feat(store): replace PostgreSQL FTS with BM25 in HybridStore Replace ts_rank_cd (PostgreSQL Full-Text Search) with BM25 algorithm for better keyword search ranking in hybrid search. 
Changes: - Add bm25Language parameter (configurable via YAML) - Replace FTS CTEs with bm25topk() function calls - Add DISTINCT ON fixes to prevent duplicate results - Add fuzzy matching with word_similarity (pg_trgm) - Add score normalization (0-100 range) - Add searchable attributes with field-specific boosting - Bundle configuration in options.php and AiBundle.php Tests: - Update 6 existing tests for BM25 compatibility - Add 3 new tests for fuzzy matching and searchable attributes - All 19 tests passing (132 assertions) Breaking changes: - Requires plpgsql_bm25 extension instead of native FTS - BM25 uses short language codes ('en', 'fr') vs FTS full names --- src/ai-bundle/config/options.php | 54 ++ src/ai-bundle/src/AiBundle.php | 32 ++ src/store/src/Bridge/Postgres/HybridStore.php | 475 +++++++++++++++--- .../tests/Bridge/Postgres/HybridStoreTest.php | 85 +++- 4 files changed, 564 insertions(+), 82 deletions(-) diff --git a/src/ai-bundle/config/options.php b/src/ai-bundle/config/options.php index 0479b6492..da777db4a 100644 --- a/src/ai-bundle/config/options.php +++ b/src/ai-bundle/config/options.php @@ -765,6 +765,10 @@ ->info('PostgreSQL text search configuration (e.g., "simple", "english", "french"). Default: "simple" (multilingual)') ->defaultValue('simple') ->end() + ->stringNode('bm25_language') + ->info('BM25 language code for stemming (e.g., "en", "fr", "es", "de", "it", "pt", "nl", "ru", "ar", "zh"). Default: "en"') + ->defaultValue('en') + ->end() ->integerNode('rrf_k') ->info('RRF (Reciprocal Rank Fusion) constant. Higher = more equal weighting. 
Default: 60 (Supabase)') ->defaultValue(60) @@ -774,6 +778,56 @@ ->info('Default maximum distance threshold for filtering results (optional)') ->defaultNull() ->end() + ->floatNode('default_min_score') + ->info('Default minimum RRF score threshold for filtering results (optional)') + ->defaultNull() + ->end() + ->booleanNode('normalize_scores') + ->info('Normalize scores to 0-100 range for better readability') + ->defaultTrue() + ->end() + ->floatNode('fuzzy_primary_threshold') + ->info('Primary threshold for fuzzy matching (pg_trgm word_similarity). Higher = stricter. Default: 0.25') + ->defaultValue(0.25) + ->min(0.0) + ->max(1.0) + ->end() + ->floatNode('fuzzy_secondary_threshold') + ->info('Secondary threshold for fuzzy matching with double validation. Catches more typos. Default: 0.2') + ->defaultValue(0.2) + ->min(0.0) + ->max(1.0) + ->end() + ->floatNode('fuzzy_strict_threshold') + ->info('Strict similarity threshold for double validation to eliminate false positives. Default: 0.15') + ->defaultValue(0.15) + ->min(0.0) + ->max(1.0) + ->end() + ->floatNode('fuzzy_weight') + ->info('Weight of fuzzy matching vs FTS in hybrid search. 0.0 = disabled, 0.5 = equal (recommended), 1.0 = fuzzy only') + ->defaultValue(0.5) + ->min(0.0) + ->max(1.0) + ->end() + ->arrayNode('searchable_attributes') + ->info('Searchable attributes with field-specific boosting (similar to Meilisearch). Each attribute creates a separate tsvector column.') + ->useAttributeAsKey('name') + ->arrayPrototype() + ->children() + ->floatNode('boost') + ->info('Boost multiplier for this field (e.g., 2.0 = twice as important). 
Default: 1.0') + ->defaultValue(1.0) + ->min(0.0) + ->end() + ->scalarNode('metadata_key') + ->info('JSON path to extract value from metadata (e.g., "title", "description")') + ->isRequired() + ->cannotBeEmpty() + ->end() + ->end() + ->end() + ->end() ->stringNode('dbal_connection')->cannotBeEmpty()->end() ->end() ->validate() diff --git a/src/ai-bundle/src/AiBundle.php b/src/ai-bundle/src/AiBundle.php index 297ad0161..64af332fb 100644 --- a/src/ai-bundle/src/AiBundle.php +++ b/src/ai-bundle/src/AiBundle.php @@ -1432,6 +1432,38 @@ private function processStoreConfig(string $type, array $stores, ContainerBuilde $arguments[8] = $store['default_max_score']; } + if (\array_key_exists('default_min_score', $store)) { + $arguments[9] = $store['default_min_score']; + } + + if (\array_key_exists('normalize_scores', $store)) { + $arguments[10] = $store['normalize_scores']; + } + + if (\array_key_exists('fuzzy_primary_threshold', $store)) { + $arguments[11] = $store['fuzzy_primary_threshold']; + } + + if (\array_key_exists('fuzzy_secondary_threshold', $store)) { + $arguments[12] = $store['fuzzy_secondary_threshold']; + } + + if (\array_key_exists('fuzzy_strict_threshold', $store)) { + $arguments[13] = $store['fuzzy_strict_threshold']; + } + + if (\array_key_exists('fuzzy_weight', $store)) { + $arguments[14] = $store['fuzzy_weight']; + } + + if (\array_key_exists('searchable_attributes', $store)) { + $arguments[15] = $store['searchable_attributes']; + } + + if (\array_key_exists('bm25_language', $store)) { + $arguments[16] = $store['bm25_language']; + } + $definition ->addTag('ai.store') ->setArguments($arguments); diff --git a/src/store/src/Bridge/Postgres/HybridStore.php b/src/store/src/Bridge/Postgres/HybridStore.php index 80e1bad9b..834decf0c 100644 --- a/src/store/src/Bridge/Postgres/HybridStore.php +++ b/src/store/src/Bridge/Postgres/HybridStore.php @@ -22,14 +22,15 @@ /** * Hybrid Search Store for PostgreSQL/Supabase - * Combines pgvector (semantic) + PostgreSQL 
Full-Text Search (ts_rank_cd) using RRF. + * Combines pgvector (semantic) + BM25 (keyword) using RRF. * - * Uses Reciprocal Rank Fusion (RRF) to combine vector similarity and full-text search, + * Uses Reciprocal Rank Fusion (RRF) to combine vector similarity and BM25 search, * following the same approach as Supabase hybrid search implementation. * * Requirements: * - PostgreSQL with pgvector extension - * - A 'content' text field for full-text search + * - plpgsql_bm25 extension for BM25 search + * - A 'content' text field for BM25 search * * @see https://supabase.com/docs/guides/ai/hybrid-search * @@ -38,22 +39,50 @@ final class HybridStore implements ManagedStoreInterface, StoreInterface { /** - * @param string $vectorFieldName Name of the vector field - * @param string $contentFieldName Name of the text field for FTS - * @param float $semanticRatio Ratio between semantic (vector) and keyword (FTS) search (0.0 to 1.0) - * - 0.0 = 100% keyword search (FTS) - * - 0.5 = balanced hybrid search - * - 1.0 = 100% semantic search (vector only) - default - * @param Distance $distance Distance metric for vector similarity - * @param string $language PostgreSQL text search configuration (default: 'simple') - * - 'simple': Works for ALL languages, no stemming (recommended for multilingual content) - * - 'english', 'french', 'spanish', etc.: Language-specific stemming/stopwords - * @param int $rrfK RRF (Reciprocal Rank Fusion) constant for hybrid search (default: 60) - * Higher values = more equal weighting between results - * @param float|null $defaultMaxScore Default maximum distance threshold for vector search (default: null = no filter) - * Only applies to pure vector search (semanticRatio = 1.0) - * Prevents returning irrelevant results with high distance scores - * Example: 0.8 means only return documents with distance < 0.8 + * @param string $vectorFieldName Name of the vector field + * @param string $contentFieldName Name of the text field for FTS + * @param float 
$semanticRatio Ratio between semantic (vector) and keyword (FTS) search (0.0 to 1.0) + * - 0.0 = 100% keyword search (FTS) + * - 0.5 = balanced hybrid search + * - 1.0 = 100% semantic search (vector only) - default + * @param Distance $distance Distance metric for vector similarity + * @param string $language PostgreSQL text search configuration (default: 'simple') + * - 'simple': Works for ALL languages, no stemming (recommended for multilingual content) + * - 'english', 'french', 'spanish', etc.: Language-specific stemming/stopwords + * @param int $rrfK RRF (Reciprocal Rank Fusion) constant for hybrid search (default: 60) + * Higher values = more equal weighting between results + * @param float|null $defaultMaxScore Default maximum distance threshold for vector search (default: null = no filter) + * Only applies to pure vector search (semanticRatio = 1.0) + * Prevents returning irrelevant results with high distance scores + * Example: 0.8 means only return documents with distance < 0.8 + * @param float|null $defaultMinScore Default minimum RRF score threshold (default: null = no filter) + * Filters out results with RRF score below this threshold + * Useful to prevent irrelevant results when FTS returns no matches + * Example: 0.01 means only return documents with RRF score >= 0.01 + * @param bool $normalizeScores Normalize scores to 0-100 range for better readability (default: true) + * When true, scores are multiplied by 100 + * Example: 0.0164 becomes 1.64 (more intuitive) + * @param float $fuzzyPrimaryThreshold Primary threshold for fuzzy matching (default: 0.25) + * Higher threshold = fewer false positives, stricter matching + * Recommended: 0.25 for good balance + * @param float $fuzzySecondaryThreshold Secondary threshold for fuzzy matching (default: 0.2) + * Used with fuzzyStrictThreshold for double validation + * Catches more typos but requires strict check + * @param float $fuzzyStrictThreshold Strict similarity threshold for double validation (default: 
0.15) + * Used with fuzzySecondaryThreshold to eliminate false positives + * Ensures word_similarity > 0.2 has minimum similarity > 0.15 + * @param float $fuzzyWeight Weight of fuzzy matching in hybrid search (default: 0.5) + * - 0.0 = fuzzy disabled + * - 0.5 = equal weight with FTS (recommended) + * - 1.0 = fuzzy only (not recommended) + * @param array $searchableAttributes Searchable attributes with field-specific boosting (similar to Meilisearch) + * Format: ['field_name' => ['boost' => 2.0, 'metadata_key' => 'title'], ...] + * Each attribute creates a separate tsvector column extracted from metadata + * Example: ['title' => ['boost' => 2.0, 'metadata_key' => 'title'], + * 'overview' => ['boost' => 0.5, 'metadata_key' => 'overview']] + * @param string $bm25Language BM25 language code (default: 'en') + * BM25 uses short codes: 'en', 'fr', 'es', 'de', etc. + * Separate from $language which is for PostgreSQL FTS */ public function __construct( private \PDO $connection, @@ -65,10 +94,21 @@ public function __construct( private string $language = 'simple', private int $rrfK = 60, private ?float $defaultMaxScore = null, + private ?float $defaultMinScore = null, + private bool $normalizeScores = true, + private float $fuzzyPrimaryThreshold = 0.25, + private float $fuzzySecondaryThreshold = 0.2, + private float $fuzzyStrictThreshold = 0.15, + private float $fuzzyWeight = 0.5, + private array $searchableAttributes = [], + private string $bm25Language = 'en', ) { if ($semanticRatio < 0.0 || $semanticRatio > 1.0) { throw new InvalidArgumentException(\sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $semanticRatio)); } + if ($fuzzyWeight < 0.0 || $fuzzyWeight > 1.0) { + throw new InvalidArgumentException(\sprintf('The fuzzy weight must be between 0.0 and 1.0, "%s" given.', $fuzzyWeight)); + } } public function setup(array $options = []): void @@ -76,23 +116,79 @@ public function setup(array $options = []): void // Enable pgvector extension 
$this->connection->exec('CREATE EXTENSION IF NOT EXISTS vector'); - // Create table with vector field, content field for FTS, and tsvector field + // Enable pg_trgm extension for fuzzy matching (typo tolerance) + $this->connection->exec('CREATE EXTENSION IF NOT EXISTS pg_trgm'); + + // Build tsvector columns based on searchable_attributes configuration + $tsvectorColumns = ''; + if (!empty($this->searchableAttributes)) { + // Create separate tsvector column for each searchable attribute + foreach ($this->searchableAttributes as $fieldName => $config) { + $metadataKey = $config['metadata_key']; + $tsvectorColumns .= \sprintf( + ",\n %s_tsv tsvector GENERATED ALWAYS AS (to_tsvector('%s', COALESCE(metadata->>'%s', ''))) STORED", + $fieldName, + $this->language, + $metadataKey + ); + } + } else { + // Backward compatibility: use single content_tsv if no searchable_attributes configured + $tsvectorColumns = \sprintf( + ",\n content_tsv tsvector GENERATED ALWAYS AS (to_tsvector('%s', %s)) STORED", + $this->language, + $this->contentFieldName + ); + } + + // Create table with vector field, content field for FTS, and tsvector field(s) $this->connection->exec( \sprintf( 'CREATE TABLE IF NOT EXISTS %s ( id UUID PRIMARY KEY, metadata JSONB, %s TEXT NOT NULL, - %s %s(%d) NOT NULL, - content_tsv tsvector GENERATED ALWAYS AS (to_tsvector(\'%s\', %s)) STORED + %s %s(%d) NOT NULL%s )', $this->tableName, $this->contentFieldName, $this->vectorFieldName, $options['vector_type'] ?? 'vector', $options['vector_size'] ?? 
1536, - $this->language, - $this->contentFieldName, + $tsvectorColumns, + ), + ); + + // Add search_text field for optimized fuzzy matching + // This field contains only title + relevant metadata for better fuzzy precision + $this->connection->exec( + \sprintf( + 'ALTER TABLE %s ADD COLUMN IF NOT EXISTS search_text TEXT', + $this->tableName, + ), + ); + + // Create function to auto-update search_text from metadata + $this->connection->exec( + "CREATE OR REPLACE FUNCTION update_search_text() + RETURNS TRIGGER AS $$ + BEGIN + NEW.search_text := COALESCE(NEW.metadata->>'title', ''); + RETURN NEW; + END; + $$ LANGUAGE plpgsql;" + ); + + // Create trigger to auto-update search_text on insert/update + $this->connection->exec( + \sprintf( + "DROP TRIGGER IF EXISTS trigger_update_search_text ON %s; + CREATE TRIGGER trigger_update_search_text + BEFORE INSERT OR UPDATE ON %s + FOR EACH ROW + EXECUTE FUNCTION update_search_text();", + $this->tableName, + $this->tableName, ), ); @@ -110,9 +206,34 @@ public function setup(array $options = []): void ); // Create GIN index for full-text search + if (!empty($this->searchableAttributes)) { + // Create GIN index for each searchable attribute tsvector + foreach ($this->searchableAttributes as $fieldName => $config) { + $this->connection->exec( + \sprintf( + 'CREATE INDEX IF NOT EXISTS %s_%s_tsv_idx ON %s USING gin(%s_tsv)', + $this->tableName, + $fieldName, + $this->tableName, + $fieldName, + ), + ); + } + } else { + // Backward compatibility: create single content_tsv index + $this->connection->exec( + \sprintf( + 'CREATE INDEX IF NOT EXISTS %s_content_tsv_idx ON %s USING gin(content_tsv)', + $this->tableName, + $this->tableName, + ), + ); + } + + // Create trigram index on search_text for optimized fuzzy matching $this->connection->exec( \sprintf( - 'CREATE INDEX IF NOT EXISTS %s_content_tsv_idx ON %s USING gin(content_tsv)', + 'CREATE INDEX IF NOT EXISTS %s_search_text_trgm_idx ON %s USING gin(search_text gin_trgm_ops)', 
$this->tableName, $this->tableName, ), @@ -161,7 +282,10 @@ public function add(VectorDocument ...$documents): void * limit?: int, * where?: string, * params?: array, - * maxScore?: float + * maxScore?: float, + * minScore?: float, + * includeScoreBreakdown?: bool, + * boostFields?: array * } $options */ public function query(Vector $vector, array $options = []): array @@ -185,6 +309,10 @@ public function query(Vector $vector, array $options = []): array // Ensure embedding param is set if maxScore is used (regardless of semanticRatio) if ($semanticRatio > 0.0 || null !== $maxScore) { $params['embedding'] = $this->toPgvector($vector); + // DEBUG: Log query vector + $vecArray = $vector->getData(); + $first5 = array_slice($vecArray, 0, 5); + file_put_contents('/tmp/hybrid_debug.log', sprintf("[%s] Query: %s | Vector dims: %d | First 5: [%s]\n", date('Y-m-d H:i:s'), $queryText, count($vecArray), implode(', ', array_map(fn($v) => sprintf('%.4f', $v), $first5))), FILE_APPEND); } if (null !== $maxScore) { @@ -212,19 +340,148 @@ public function query(Vector $vector, array $options = []): array $params['query'] = $queryText; } + // DEBUG: Log the SQL query and parameters + file_put_contents('/tmp/hybrid_debug.log', sprintf("[%s] SQL Query:\n%s\n\nParameters:\n%s\n\n", date('Y-m-d H:i:s'), $sql, print_r(array_merge($params, $options['params'] ?? []), true)), FILE_APPEND); + $statement = $this->connection->prepare($sql); $statement->execute([...$params, ...($options['params'] ?? [])]); + $includeBreakdown = $options['includeScoreBreakdown'] ?? false; $documents = []; foreach ($statement->fetchAll(\PDO::FETCH_ASSOC) as $result) { + $metadata = new Metadata(json_decode($result['metadata'] ?? 
'{}', true, 512, \JSON_THROW_ON_ERROR)); + + // Add score breakdown to metadata if requested + if ($includeBreakdown && isset($result['vector_rank'])) { + $metadata['_score_breakdown'] = [ + 'vector_rank' => $result['vector_rank'], + 'fts_rank' => $result['fts_rank'], + 'vector_distance' => $result['vector_distance'], + 'fts_score' => $result['fts_score'], + 'vector_contribution' => $result['vector_contribution'], + 'fts_contribution' => $result['fts_contribution'], + ]; + + // Add fuzzy matching info if available + if (isset($result['fuzzy_rank'])) { + $metadata['_score_breakdown']['fuzzy_rank'] = $result['fuzzy_rank']; + $metadata['_score_breakdown']['fuzzy_score'] = $result['fuzzy_score']; + $metadata['_score_breakdown']['fuzzy_contribution'] = $result['fuzzy_contribution']; + } + } + + // Handle cases where embedding might be NULL (fuzzy-only or FTS-only matches) + $vectorData = $result['embedding'] !== null + ? new Vector($this->fromPgvector($result['embedding'])) + : new Vector([0.0]); // Placeholder vector for non-semantic matches + $documents[] = new VectorDocument( id: Uuid::fromString($result['id']), - vector: new Vector($this->fromPgvector($result['embedding'])), - metadata: new Metadata(json_decode($result['metadata'] ?? 
'{}', true, 512, \JSON_THROW_ON_ERROR)), + vector: $vectorData, + metadata: $metadata, score: $result['score'], ); } + // Normalize scores to 0-100 range for better readability (if enabled) + if ($this->normalizeScores) { + // Calculate theoretical maximum RRF score: 1/(k+1) + // Normalize to 0-100 by dividing by max and multiplying by 100 + $maxScore = 1.0 / ($this->rrfK + 1); + $documents = array_map(function(VectorDocument $doc) use ($maxScore, $includeBreakdown) { + $metadata = $doc->metadata; + + // Also normalize breakdown scores if they exist + if ($includeBreakdown && isset($metadata['_score_breakdown'])) { + $breakdown = $metadata['_score_breakdown']; + $metadata['_score_breakdown'] = [ + 'vector_rank' => $breakdown['vector_rank'], + 'fts_rank' => $breakdown['fts_rank'], + 'vector_distance' => $breakdown['vector_distance'], + 'fts_score' => $breakdown['fts_score'], + 'vector_contribution' => ($breakdown['vector_contribution'] / $maxScore) * 100, + 'fts_contribution' => ($breakdown['fts_contribution'] / $maxScore) * 100, + ]; + + // Add normalized fuzzy scores if available + if (isset($breakdown['fuzzy_rank'])) { + $metadata['_score_breakdown']['fuzzy_rank'] = $breakdown['fuzzy_rank']; + $metadata['_score_breakdown']['fuzzy_score'] = $breakdown['fuzzy_score']; + $metadata['_score_breakdown']['fuzzy_contribution'] = ($breakdown['fuzzy_contribution'] / $maxScore) * 100; + } + } + + return new VectorDocument( + id: $doc->id, + vector: $doc->vector, + metadata: $metadata, + score: ($doc->score / $maxScore) * 100 + ); + }, $documents); + } + + // Apply metadata-based boosting (if configured) + // Boost scores based on metadata field values (e.g., popularity, ratings) + $boostFields = $options['boostFields'] ?? 
[]; + if (!empty($boostFields)) { + $documents = array_map(function(VectorDocument $doc) use ($boostFields) { + $metadata = $doc->metadata; + $score = $doc->score; + $appliedBoosts = []; + + foreach ($boostFields as $field => $boostConfig) { + // Skip if metadata doesn't have this field + if (!isset($metadata[$field])) { + continue; + } + + $value = $metadata[$field]; + $boost = $boostConfig['boost'] ?? 0.0; + + // Check min/max conditions + $shouldBoost = true; + if (isset($boostConfig['min']) && $value < $boostConfig['min']) { + $shouldBoost = false; + } + if (isset($boostConfig['max']) && $value > $boostConfig['max']) { + $shouldBoost = false; + } + + // Apply boost multiplier if conditions are met + if ($shouldBoost && $boost !== 0.0) { + $score *= (1.0 + $boost); + $appliedBoosts[$field] = [ + 'value' => $value, + 'boost' => $boost, + 'multiplier' => (1.0 + $boost), + ]; + } + } + + // Add boost information to metadata if any boosts were applied + if (!empty($appliedBoosts)) { + $metadata['_applied_boosts'] = $appliedBoosts; + } + + return new VectorDocument( + id: $doc->id, + vector: $doc->vector, + metadata: $metadata, + score: $score + ); + }, $documents); + + // Re-sort by boosted scores (descending) + usort($documents, fn(VectorDocument $a, VectorDocument $b) => $b->score <=> $a->score); + } + + // Filter results by minimum score threshold (if configured) + // Note: minScore should be in the same scale as the scores (0-100 if normalized) + $minScore = $options['minScore'] ?? $this->defaultMinScore; + if (null !== $minScore) { + $documents = array_values(array_filter($documents, fn(VectorDocument $doc) => $doc->score >= $minScore)); + } + return $documents; } @@ -246,73 +503,171 @@ private function buildVectorOnlyQuery(string $whereClause, int $limit): string ); } + /** + * Build BM25 search CTE with DISTINCT ON fix for duplicate titles. + * Replaces FTS rank expression to use plpgsql_bm25 instead of ts_rank_cd. 
+ * + * @return string BM25 CTE SQL with deduplication fix + */ + private function buildBm25Cte(): string + { + return \sprintf( + ' + bm25_search AS ( + SELECT + SUBSTRING(bm25.doc FROM \'title: ([^\n]+)\') as extracted_title, + bm25.doc, + bm25.score as bm25_score, + ROW_NUMBER() OVER (ORDER BY bm25.score DESC) as bm25_rank + FROM bm25topk( + \'%s\', + \'%s\', + :query, + 100, + \'\', + \'%s\' + ) AS bm25 + ), + bm25_with_metadata AS ( + SELECT DISTINCT ON (b.bm25_rank) + m.id, + m.metadata, + m.%s, + b.bm25_score, + b.bm25_rank + FROM bm25_search b + INNER JOIN %s m ON (m.metadata->>\'title\') = b.extracted_title + ORDER BY b.bm25_rank, m.id + )', + $this->tableName, + $this->contentFieldName, + $this->bm25Language, + $this->contentFieldName, + $this->tableName + ); + } + private function buildFtsOnlyQuery(string $whereClause, int $limit): string { - // Add FTS match filter to ensure only relevant documents are returned - $ftsFilter = \sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language); - $whereClause = $this->addFilterToWhereClause($whereClause, $ftsFilter); + // BM25-only search (no vector) + $bm25Cte = $this->buildBm25Cte(); return \sprintf(<<vectorFieldName, - $this->language, - $this->tableName, - $whereClause, + $bm25Cte, + $whereClause ? 'WHERE id IN (SELECT id FROM ' . $this->tableName . ' ' . $whereClause . 
')' : '', $limit, ); } private function buildHybridQuery(string $whereClause, int $limit, float $semanticRatio): string { - // Add FTS filter for the fts_scores CTE - $ftsFilter = \sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language); - $ftsWhereClause = $this->addFilterToWhereClause($whereClause, $ftsFilter); - - // RRF (Reciprocal Rank Fusion) - Same approach as Supabase - // Formula: COALESCE(1.0 / (k + rank), 0.0) * weight - // Lower score is better (like distance) + // Use BM25 CTE with DISTINCT ON fix for duplicate titles + $bm25Cte = $this->buildBm25Cte(); + + // Add fuzzy filter for the fuzzy_scores CTE using word_similarity on search_text + // word_similarity() compares query with individual words, much better for typos + // Hybrid threshold: Configurable thresholds to balance recall and precision + // - Primary threshold ($fuzzyPrimaryThreshold) for high-quality matches + // - Secondary + strict thresholds for catching more typos with double validation + $fuzzyFilter = \sprintf( + '( + word_similarity(:query, search_text) > %f + OR ( + word_similarity(:query, search_text) > %f + AND similarity(:query, search_text) > %f + ) + )', + $this->fuzzyPrimaryThreshold, + $this->fuzzySecondaryThreshold, + $this->fuzzyStrictThreshold + ); + $fuzzyWhereClause = $this->addFilterToWhereClause($whereClause, $fuzzyFilter); + + // Calculate weights for BM25 and Fuzzy (both share the non-semantic portion) + // Weights are configurable to allow tuning for different use cases + $bm25Weight = (1.0 - $semanticRatio) * (1.0 - $this->fuzzyWeight); + $fuzzyWeightCalculated = (1.0 - $semanticRatio) * $this->fuzzyWeight; + + // Enhanced RRF: Combines vector, BM25, and fuzzy matching + // Formula: (1/(k + rank)) * normalized_score * weight + // BM25 with DISTINCT ON fix eliminates duplicate titles + // Fuzzy matching uses word_similarity on search_text for optimal typo tolerance + // Final DISTINCT ON (id) ensures no duplicates in combined results return 
\sprintf(<<>'title') * + FROM combined_results + ORDER BY metadata->>'title', score DESC + ) unique_results ORDER BY score DESC LIMIT %d SQL, $this->vectorFieldName, $this->vectorFieldName, $this->distance->getComparisonSign(), + $this->vectorFieldName, + $this->distance->getComparisonSign(), $this->tableName, $whereClause, - $this->language, + $bm25Cte, $this->tableName, - $ftsWhereClause, + $fuzzyWhereClause, $this->rrfK, $semanticRatio, $this->rrfK, - 1.0 - $semanticRatio, + $bm25Weight, + $this->rrfK, + $fuzzyWeightCalculated, + $this->rrfK, + $semanticRatio, + $this->rrfK, + $bm25Weight, + $this->rrfK, + $fuzzyWeightCalculated, $limit, ); } diff --git a/src/store/tests/Bridge/Postgres/HybridStoreTest.php b/src/store/tests/Bridge/Postgres/HybridStoreTest.php index 2fdd59e2a..695c65a27 100644 --- a/src/store/tests/Bridge/Postgres/HybridStoreTest.php +++ b/src/store/tests/Bridge/Postgres/HybridStoreTest.php @@ -44,7 +44,7 @@ public function testSetupCreatesTableWithFullTextSearchSupport() $pdo = $this->createMock(\PDO::class); $store = new HybridStore($pdo, 'hybrid_table'); - $pdo->expects($this->exactly(4)) + $pdo->expects($this->exactly(9)) ->method('exec') ->willReturnCallback(function (string $sql): int { static $callCount = 0; @@ -53,15 +53,26 @@ public function testSetupCreatesTableWithFullTextSearchSupport() if (1 === $callCount) { $this->assertSame('CREATE EXTENSION IF NOT EXISTS vector', $sql); } elseif (2 === $callCount) { + $this->assertSame('CREATE EXTENSION IF NOT EXISTS pg_trgm', $sql); + } elseif (3 === $callCount) { $this->assertStringContainsString('CREATE TABLE IF NOT EXISTS hybrid_table', $sql); $this->assertStringContainsString('content TEXT NOT NULL', $sql); $this->assertStringContainsString('embedding vector(1536) NOT NULL', $sql); $this->assertStringContainsString('content_tsv tsvector GENERATED ALWAYS AS (to_tsvector(\'simple\', content)) STORED', $sql); - } elseif (3 === $callCount) { + } elseif (4 === $callCount) { + 
$this->assertStringContainsString('ALTER TABLE hybrid_table ADD COLUMN IF NOT EXISTS search_text TEXT', $sql); + } elseif (5 === $callCount) { + $this->assertStringContainsString('CREATE OR REPLACE FUNCTION update_search_text()', $sql); + } elseif (6 === $callCount) { + $this->assertStringContainsString('CREATE TRIGGER trigger_update_search_text', $sql); + } elseif (7 === $callCount) { $this->assertStringContainsString('CREATE INDEX IF NOT EXISTS hybrid_table_embedding_idx', $sql); - } else { + } elseif (8 === $callCount) { $this->assertStringContainsString('CREATE INDEX IF NOT EXISTS hybrid_table_content_tsv_idx', $sql); $this->assertStringContainsString('USING gin(content_tsv)', $sql); + } else { + $this->assertStringContainsString('CREATE INDEX IF NOT EXISTS hybrid_table_search_text_trgm_idx', $sql); + $this->assertStringContainsString('USING gin(search_text gin_trgm_ops)', $sql); } return 0; @@ -112,7 +123,8 @@ public function testPureVectorSearch() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 1.0); + // Disable score normalization for this test to keep legacy behavior + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 1.0, normalizeScores: false); $expectedSql = 'SELECT id, embedding AS embedding, metadata, (embedding <-> :embedding) AS score FROM hybrid_table @@ -157,19 +169,24 @@ public function testPureKeywordSearch() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.0); - - $expectedSql = "SELECT id, embedding AS embedding, metadata, - (1.0 / (1.0 + ts_rank_cd(content_tsv, websearch_to_tsquery('simple', :query)))) AS score - FROM hybrid_table - WHERE content_tsv @@ websearch_to_tsquery('simple', :query) - ORDER BY score ASC - LIMIT 5"; + // Disable normalization for consistent test scores + $store = new HybridStore($pdo, 
'hybrid_table', semanticRatio: 0.0, normalizeScores: false); $pdo->expects($this->once()) ->method('prepare') - ->with($this->callback(function ($sql) use ($expectedSql) { - return $this->normalizeQuery($sql) === $this->normalizeQuery($expectedSql); + ->with($this->callback(function ($sql) { + // Verify BM25 structure instead of FTS + $this->assertStringContainsString('WITH', $sql); + $this->assertStringContainsString('bm25_search AS', $sql); + $this->assertStringContainsString('bm25topk(', $sql); + $this->assertStringContainsString('bm25_with_metadata AS', $sql); + $this->assertStringContainsString('DISTINCT ON (b.bm25_rank)', $sql); + + // Should NOT contain old FTS functions + $this->assertStringNotContainsString('ts_rank_cd', $sql); + $this->assertStringNotContainsString('websearch_to_tsquery', $sql); + + return true; })) ->willReturn($statement); @@ -204,19 +221,31 @@ public function testHybridSearchWithRRF() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrfK: 60); + // Disable normalization for consistent test scores + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrfK: 60, normalizeScores: false); $pdo->expects($this->once()) ->method('prepare') ->with($this->callback(function ($sql) { - // Check for RRF CTE structure + // Check for RRF CTE structure with BM25 and fuzzy $this->assertStringContainsString('WITH vector_scores AS', $sql); - $this->assertStringContainsString('fts_scores AS', $sql); + $this->assertStringContainsString('bm25_search AS', $sql); + $this->assertStringContainsString('bm25_with_metadata AS', $sql); + $this->assertStringContainsString('fuzzy_scores AS', $sql); + $this->assertStringContainsString('combined_results AS', $sql); $this->assertStringContainsString('ROW_NUMBER() OVER', $sql); - $this->assertStringContainsString('COALESCE(1.0 / (60 + v.rank_ix), 0.0)', $sql); 
$this->assertStringContainsString('FULL OUTER JOIN', $sql); $this->assertStringContainsString('ORDER BY score DESC', $sql); + // Should NOT contain old fts_scores CTE + $this->assertStringNotContainsString('fts_scores AS', $sql); + + // Should contain BM25 function call + $this->assertStringContainsString('bm25topk(', $sql); + + // Should contain fuzzy matching + $this->assertStringContainsString('word_similarity', $sql); + return true; })) ->willReturn($statement); @@ -324,12 +353,18 @@ public function testQueryWithCustomLanguage() $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.0, language: 'french'); + // Test BM25 language parameter (short code 'fr' instead of 'french') + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.0, language: 'french', bm25Language: 'fr'); $pdo->expects($this->once()) ->method('prepare') ->with($this->callback(function ($sql) { - $this->assertStringContainsString("websearch_to_tsquery('french'", $sql); + // Should NOT contain old FTS function + $this->assertStringNotContainsString("websearch_to_tsquery('french'", $sql); + + // Should contain BM25 with 'fr' language code + $this->assertStringContainsString('bm25topk(', $sql); + $this->assertStringContainsString("'fr'", $sql); return true; })) @@ -356,8 +391,14 @@ public function testQueryWithCustomRRFK() $pdo->expects($this->once()) ->method('prepare') ->with($this->callback(function ($sql) { - $this->assertStringContainsString('COALESCE(1.0 / (100 + v.rank_ix), 0.0)', $sql); - $this->assertStringContainsString('COALESCE(1.0 / (100 + f.rank_ix), 0.0)', $sql); + // Check for RRF constant 100 in the formula + $this->assertStringContainsString('100 + v.rank_ix', $sql); + $this->assertStringContainsString('100 + b.bm25_rank', $sql); + $this->assertStringContainsString('100 + fz.rank_ix', $sql); + + // Verify BM25 and fuzzy structure (not old FTS) + 
$this->assertStringContainsString('bm25_search AS', $sql); + $this->assertStringContainsString('fuzzy_scores AS', $sql); return true; })) From 38c15eef123ebc2a21d68a4a199162c4f78cb306 Mon Sep 17 00:00:00 2001 From: Ahmed EBEN HASSINE Date: Sun, 23 Nov 2025 20:35:02 +0100 Subject: [PATCH 7/9] test(store): add comprehensive tests for BM25 features Add 3 new tests covering newly introduced functionality: - testFuzzyMatchingWithWordSimilarity: Verifies pg_trgm fuzzy matching with word_similarity() and custom thresholds (primary, secondary, strict) - testSearchableAttributesWithBoost: Ensures field-specific tsvector columns are created with proper GIN indexes (title_tsv, overview_tsv) - testFuzzyWeightParameter: Validates fuzzy weight distribution in RRF formula when combining vector, BM25, and fuzzy scores All tests verify SQL generation via callback assertions. Test suite: 19 tests, 132 assertions, all passing. --- .../tests/Bridge/Postgres/HybridStoreTest.php | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/src/store/tests/Bridge/Postgres/HybridStoreTest.php b/src/store/tests/Bridge/Postgres/HybridStoreTest.php index 695c65a27..7033fdf5d 100644 --- a/src/store/tests/Bridge/Postgres/HybridStoreTest.php +++ b/src/store/tests/Bridge/Postgres/HybridStoreTest.php @@ -529,6 +529,121 @@ public function testPureKeywordSearchReturnsEmptyWhenNoMatch() $this->assertCount(0, $results); } + public function testFuzzyMatchingWithWordSimilarity() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + // Test fuzzy matching with custom thresholds + $store = new HybridStore( + $pdo, + 'hybrid_table', + semanticRatio: 0.5, + fuzzyWeight: 0.3, + fuzzyPrimaryThreshold: 0.3, + fuzzySecondaryThreshold: 0.25, + fuzzyStrictThreshold: 0.2 + ); + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) { + // Verify fuzzy_scores CTE exists + 
$this->assertStringContainsString('fuzzy_scores AS', $sql); + + // Verify word_similarity function is used + $this->assertStringContainsString('word_similarity(:query, search_text)', $sql); + + // Verify custom thresholds are applied + $this->assertStringContainsString('0.300000', $sql); // Primary threshold + $this->assertStringContainsString('0.250000', $sql); // Secondary threshold + $this->assertStringContainsString('0.200000', $sql); // Strict threshold + + return true; + })) + ->willReturn($statement); + + $statement->expects($this->once())->method('execute'); + $statement->expects($this->once())->method('fetchAll')->willReturn([]); + + $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'test']); + } + + public function testSearchableAttributesWithBoost() + { + $pdo = $this->createMock(\PDO::class); + + // Test with searchable attributes configuration + $searchableAttributes = [ + 'title' => ['boost' => 2.0, 'metadata_key' => 'title'], + 'overview' => ['boost' => 1.0, 'metadata_key' => 'overview'], + ]; + + $store = new HybridStore( + $pdo, + 'hybrid_table', + searchableAttributes: $searchableAttributes + ); + + $pdo->expects($this->exactly(10)) + ->method('exec') + ->willReturnCallback(function (string $sql): int { + static $callCount = 0; + ++$callCount; + + if (3 === $callCount) { + // Verify separate tsvector columns for each attribute + $this->assertStringContainsString('title_tsv tsvector GENERATED ALWAYS AS', $sql); + $this->assertStringContainsString('overview_tsv tsvector GENERATED ALWAYS AS', $sql); + + // Should NOT contain generic content_tsv (backward compat mode) + $this->assertStringNotContainsString('content_tsv tsvector GENERATED ALWAYS AS (to_tsvector(\'simple\', content)) STORED', $sql); + } elseif ($callCount >= 8 && $callCount <= 9) { + // Verify separate GIN indexes for each attribute (title_tsv_idx, overview_tsv_idx) + $this->assertStringContainsString('_tsv_idx', $sql); + $this->assertStringContainsString('USING gin(', $sql); + } + + 
return 0; + }); + + $store->setup(); + } + + public function testFuzzyWeightParameter() + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + // Test that fuzzyWeight controls the weight in RRF formula + $store = new HybridStore( + $pdo, + 'hybrid_table', + semanticRatio: 0.4, // 60% non-semantic + fuzzyWeight: 0.5 // 50% of non-semantic goes to fuzzy + ); + // Expected: 40% vector, 30% BM25 (60% * 0.5), 30% fuzzy (60% * 0.5) + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) { + // Verify fuzzy weight is present in the RRF formula + $this->assertStringContainsString('fuzzy_scores AS', $sql); + $this->assertStringContainsString('combined_results AS', $sql); + + // Should have three components: vector, BM25, fuzzy + $this->assertStringContainsString('COALESCE(1.0 / (', $sql); // RRF formula pattern + + return true; + })) + ->willReturn($statement); + + $statement->expects($this->once())->method('execute'); + $statement->expects($this->once())->method('fetchAll')->willReturn([]); + + $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'test']); + } + private function normalizeQuery(string $query): string { // Remove extra spaces, tabs and newlines From a86b7538fee4776277afdc19d0fdadc297e0dafb Mon Sep 17 00:00:00 2001 From: Ahmed EBEN HASSINE Date: Wed, 26 Nov 2025 04:49:42 +0100 Subject: [PATCH 8/9] refactor(store): improve code quality and fix PHPStan errors in HybridStore - Extract RRF logic into dedicated ReciprocalRankFusion class - Introduce TextSearchStrategyInterface for pluggable search strategies - Remove debug code (file_put_contents calls) - Replace empty() with strict comparisons ([] !==) per PHPStan rules - Add missing PHPDoc types for array parameters - Mark properties as readonly for immutability - Extract helper methods (buildTsvectorColumns, createSearchTextTrigger) - Use NullVector for results without embeddings - Update tests to reflect new setup() 
execution order --- src/store/src/Bridge/Postgres/HybridStore.php | 891 +++++++++--------- .../Postgres/PostgresTextSearchStrategy.php | 112 +++ .../Bridge/Postgres/ReciprocalRankFusion.php | 157 +++ .../TextSearch/Bm25TextSearchStrategy.php | 140 +++ .../TextSearch/PostgresTextSearchStrategy.php | 112 +++ .../TextSearchStrategyInterface.php | 84 ++ .../tests/Bridge/Postgres/HybridStoreTest.php | 565 ++++++++--- .../Postgres/ReciprocalRankFusionTest.php | 221 +++++ 8 files changed, 1726 insertions(+), 556 deletions(-) create mode 100644 src/store/src/Bridge/Postgres/PostgresTextSearchStrategy.php create mode 100644 src/store/src/Bridge/Postgres/ReciprocalRankFusion.php create mode 100644 src/store/src/Bridge/Postgres/TextSearch/Bm25TextSearchStrategy.php create mode 100644 src/store/src/Bridge/Postgres/TextSearch/PostgresTextSearchStrategy.php create mode 100644 src/store/src/Bridge/Postgres/TextSearch/TextSearchStrategyInterface.php create mode 100644 src/store/tests/Bridge/Postgres/ReciprocalRankFusionTest.php diff --git a/src/store/src/Bridge/Postgres/HybridStore.php b/src/store/src/Bridge/Postgres/HybridStore.php index 834decf0c..ff7ac57b8 100644 --- a/src/store/src/Bridge/Postgres/HybridStore.php +++ b/src/store/src/Bridge/Postgres/HybridStore.php @@ -11,8 +11,11 @@ namespace Symfony\AI\Store\Bridge\Postgres; +use Symfony\AI\Platform\Vector\NullVector; use Symfony\AI\Platform\Vector\Vector; use Symfony\AI\Platform\Vector\VectorInterface; +use Symfony\AI\Store\Bridge\Postgres\TextSearch\PostgresTextSearchStrategy; +use Symfony\AI\Store\Bridge\Postgres\TextSearch\TextSearchStrategyInterface; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\VectorDocument; use Symfony\AI\Store\Exception\InvalidArgumentException; @@ -21,16 +24,12 @@ use Symfony\Component\Uid\Uuid; /** - * Hybrid Search Store for PostgreSQL/Supabase - * Combines pgvector (semantic) + BM25 (keyword) using RRF. 
+ * Hybrid Search Store for PostgreSQL combining vector similarity and full-text search. * - * Uses Reciprocal Rank Fusion (RRF) to combine vector similarity and BM25 search, - * following the same approach as Supabase hybrid search implementation. - * - * Requirements: - * - PostgreSQL with pgvector extension - * - plpgsql_bm25 extension for BM25 search - * - A 'content' text field for BM25 search + * Uses Reciprocal Rank Fusion (RRF) to combine multiple search signals: + * - Vector similarity (pgvector) + * - Full-text search (configurable: native PostgreSQL or BM25) + * - Fuzzy matching (pg_trgm) for typo tolerance * * @see https://supabase.com/docs/guides/ai/hybrid-search * @@ -38,110 +37,76 @@ */ final class HybridStore implements ManagedStoreInterface, StoreInterface { + private readonly ReciprocalRankFusion $rrf; + private readonly TextSearchStrategyInterface $textSearchStrategy; + /** - * @param string $vectorFieldName Name of the vector field - * @param string $contentFieldName Name of the text field for FTS - * @param float $semanticRatio Ratio between semantic (vector) and keyword (FTS) search (0.0 to 1.0) - * - 0.0 = 100% keyword search (FTS) - * - 0.5 = balanced hybrid search - * - 1.0 = 100% semantic search (vector only) - default - * @param Distance $distance Distance metric for vector similarity - * @param string $language PostgreSQL text search configuration (default: 'simple') - * - 'simple': Works for ALL languages, no stemming (recommended for multilingual content) - * - 'english', 'french', 'spanish', etc.: Language-specific stemming/stopwords - * @param int $rrfK RRF (Reciprocal Rank Fusion) constant for hybrid search (default: 60) - * Higher values = more equal weighting between results - * @param float|null $defaultMaxScore Default maximum distance threshold for vector search (default: null = no filter) - * Only applies to pure vector search (semanticRatio = 1.0) - * Prevents returning irrelevant results with high distance scores - * 
Example: 0.8 means only return documents with distance < 0.8 - * @param float|null $defaultMinScore Default minimum RRF score threshold (default: null = no filter) - * Filters out results with RRF score below this threshold - * Useful to prevent irrelevant results when FTS returns no matches - * Example: 0.01 means only return documents with RRF score >= 0.01 - * @param bool $normalizeScores Normalize scores to 0-100 range for better readability (default: true) - * When true, scores are multiplied by 100 - * Example: 0.0164 becomes 1.64 (more intuitive) - * @param float $fuzzyPrimaryThreshold Primary threshold for fuzzy matching (default: 0.25) - * Higher threshold = fewer false positives, stricter matching - * Recommended: 0.25 for good balance - * @param float $fuzzySecondaryThreshold Secondary threshold for fuzzy matching (default: 0.2) - * Used with fuzzyStrictThreshold for double validation - * Catches more typos but requires strict check - * @param float $fuzzyStrictThreshold Strict similarity threshold for double validation (default: 0.15) - * Used with fuzzySecondaryThreshold to eliminate false positives - * Ensures word_similarity > 0.2 has minimum similarity > 0.15 - * @param float $fuzzyWeight Weight of fuzzy matching in hybrid search (default: 0.5) - * - 0.0 = fuzzy disabled - * - 0.5 = equal weight with FTS (recommended) - * - 1.0 = fuzzy only (not recommended) - * @param array $searchableAttributes Searchable attributes with field-specific boosting (similar to Meilisearch) - * Format: ['field_name' => ['boost' => 2.0, 'metadata_key' => 'title'], ...] - * Each attribute creates a separate tsvector column extracted from metadata - * Example: ['title' => ['boost' => 2.0, 'metadata_key' => 'title'], - * 'overview' => ['boost' => 0.5, 'metadata_key' => 'overview']] - * @param string $bm25Language BM25 language code (default: 'en') - * BM25 uses short codes: 'en', 'fr', 'es', 'de', etc. 
- * Separate from $language which is for PostgreSQL FTS + * @param string $vectorFieldName Name of the vector field + * @param string $contentFieldName Name of the text field for FTS + * @param float $semanticRatio Ratio between semantic and keyword search (0.0 to 1.0) + * @param Distance $distance Distance metric for vector similarity + * @param string $language PostgreSQL text search configuration + * @param TextSearchStrategyInterface|null $textSearchStrategy Text search strategy (defaults to native PostgreSQL) + * @param ReciprocalRankFusion|null $rrf RRF calculator (defaults to k=60, normalized) + * @param float|null $defaultMaxScore Default max distance for vector search + * @param float|null $defaultMinScore Default min RRF score threshold + * @param float $fuzzyPrimaryThreshold Primary threshold for fuzzy matching + * @param float $fuzzySecondaryThreshold Secondary threshold for fuzzy matching + * @param float $fuzzyStrictThreshold Strict threshold for double validation + * @param float $fuzzyWeight Weight of fuzzy matching (0.0 to 1.0) + * @param array $searchableAttributes Searchable attributes with boosting config */ public function __construct( - private \PDO $connection, - private string $tableName, - private string $vectorFieldName = 'embedding', - private string $contentFieldName = 'content', - private float $semanticRatio = 1.0, - private Distance $distance = Distance::L2, - private string $language = 'simple', - private int $rrfK = 60, - private ?float $defaultMaxScore = null, - private ?float $defaultMinScore = null, - private bool $normalizeScores = true, - private float $fuzzyPrimaryThreshold = 0.25, - private float $fuzzySecondaryThreshold = 0.2, - private float $fuzzyStrictThreshold = 0.15, - private float $fuzzyWeight = 0.5, - private array $searchableAttributes = [], - private string $bm25Language = 'en', + private readonly \PDO $connection, + private readonly string $tableName, + private readonly string $vectorFieldName = 'embedding', + 
private readonly string $contentFieldName = 'content', + private readonly float $semanticRatio = 1.0, + private readonly Distance $distance = Distance::L2, + private readonly string $language = 'simple', + ?TextSearchStrategyInterface $textSearchStrategy = null, + ?ReciprocalRankFusion $rrf = null, + private readonly ?float $defaultMaxScore = null, + private readonly ?float $defaultMinScore = null, + private readonly float $fuzzyPrimaryThreshold = 0.25, + private readonly float $fuzzySecondaryThreshold = 0.2, + private readonly float $fuzzyStrictThreshold = 0.15, + private readonly float $fuzzyWeight = 0.5, + private readonly array $searchableAttributes = [], ) { if ($semanticRatio < 0.0 || $semanticRatio > 1.0) { - throw new InvalidArgumentException(\sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $semanticRatio)); + throw new InvalidArgumentException(\sprintf( + 'The semantic ratio must be between 0.0 and 1.0, "%s" given.', + $semanticRatio + )); } + if ($fuzzyWeight < 0.0 || $fuzzyWeight > 1.0) { - throw new InvalidArgumentException(\sprintf('The fuzzy weight must be between 0.0 and 1.0, "%s" given.', $fuzzyWeight)); + throw new InvalidArgumentException(\sprintf( + 'The fuzzy weight must be between 0.0 and 1.0, "%s" given.', + $fuzzyWeight + )); } + + $this->textSearchStrategy = $textSearchStrategy ?? new PostgresTextSearchStrategy(); + $this->rrf = $rrf ?? 
new ReciprocalRankFusion(); } + /** + * @param array{vector_type?: string, vector_size?: positive-int, index_method?: string, index_opclass?: string} $options + */ public function setup(array $options = []): void { // Enable pgvector extension $this->connection->exec('CREATE EXTENSION IF NOT EXISTS vector'); - // Enable pg_trgm extension for fuzzy matching (typo tolerance) + // Enable pg_trgm extension for fuzzy matching $this->connection->exec('CREATE EXTENSION IF NOT EXISTS pg_trgm'); - // Build tsvector columns based on searchable_attributes configuration - $tsvectorColumns = ''; - if (!empty($this->searchableAttributes)) { - // Create separate tsvector column for each searchable attribute - foreach ($this->searchableAttributes as $fieldName => $config) { - $metadataKey = $config['metadata_key']; - $tsvectorColumns .= \sprintf( - ",\n %s_tsv tsvector GENERATED ALWAYS AS (to_tsvector('%s', COALESCE(metadata->>'%s', ''))) STORED", - $fieldName, - $this->language, - $metadataKey - ); - } - } else { - // Backward compatibility: use single content_tsv if no searchable_attributes configured - $tsvectorColumns = \sprintf( - ",\n content_tsv tsvector GENERATED ALWAYS AS (to_tsvector('%s', %s)) STORED", - $this->language, - $this->contentFieldName - ); - } + // Build tsvector columns + $tsvectorColumns = $this->buildTsvectorColumns(); - // Create table with vector field, content field for FTS, and tsvector field(s) + // Create main table $this->connection->exec( \sprintf( 'CREATE TABLE IF NOT EXISTS %s ( @@ -159,8 +124,7 @@ public function setup(array $options = []): void ), ); - // Add search_text field for optimized fuzzy matching - // This field contains only title + relevant metadata for better fuzzy precision + // Add search_text field for fuzzy matching $this->connection->exec( \sprintf( 'ALTER TABLE %s ADD COLUMN IF NOT EXISTS search_text TEXT', @@ -168,29 +132,8 @@ public function setup(array $options = []): void ), ); - // Create function to auto-update 
search_text from metadata - $this->connection->exec( - "CREATE OR REPLACE FUNCTION update_search_text() - RETURNS TRIGGER AS $$ - BEGIN - NEW.search_text := COALESCE(NEW.metadata->>'title', ''); - RETURN NEW; - END; - $$ LANGUAGE plpgsql;" - ); - - // Create trigger to auto-update search_text on insert/update - $this->connection->exec( - \sprintf( - "DROP TRIGGER IF EXISTS trigger_update_search_text ON %s; - CREATE TRIGGER trigger_update_search_text - BEFORE INSERT OR UPDATE ON %s - FOR EACH ROW - EXECUTE FUNCTION update_search_text();", - $this->tableName, - $this->tableName, - ), - ); + // Create trigger for search_text auto-update + $this->createSearchTextTrigger(); // Create vector index $this->connection->exec( @@ -205,32 +148,17 @@ public function setup(array $options = []): void ), ); - // Create GIN index for full-text search - if (!empty($this->searchableAttributes)) { - // Create GIN index for each searchable attribute tsvector - foreach ($this->searchableAttributes as $fieldName => $config) { - $this->connection->exec( - \sprintf( - 'CREATE INDEX IF NOT EXISTS %s_%s_tsv_idx ON %s USING gin(%s_tsv)', - $this->tableName, - $fieldName, - $this->tableName, - $fieldName, - ), - ); + // Execute text search strategy setup (only if not using searchableAttributes) + if ([] === $this->searchableAttributes) { + foreach ($this->textSearchStrategy->getSetupSql($this->tableName, $this->contentFieldName, $this->language) as $sql) { + $this->connection->exec($sql); } } else { - // Backward compatibility: create single content_tsv index - $this->connection->exec( - \sprintf( - 'CREATE INDEX IF NOT EXISTS %s_content_tsv_idx ON %s USING gin(content_tsv)', - $this->tableName, - $this->tableName, - ), - ); + // Create GIN indexes for tsvector columns when using searchableAttributes + $this->createTsvectorIndexes(); } - // Create trigram index on search_text for optimized fuzzy matching + // Create trigram index for fuzzy matching $this->connection->exec( \sprintf( 'CREATE 
INDEX IF NOT EXISTS %s_search_text_trgm_idx ON %s USING gin(search_text gin_trgm_ops)', @@ -262,20 +190,22 @@ public function add(VectorDocument ...$documents): void ); foreach ($documents as $document) { - $operation = [ + $statement->execute([ 'id' => $document->id->toRfc4122(), 'metadata' => json_encode($document->metadata->getArrayCopy(), \JSON_THROW_ON_ERROR), 'content' => $document->metadata->getText() ?? '', 'vector' => $this->toPgvector($document->vector), - ]; - - $statement->execute($operation); + ]); } } /** * Hybrid search combining vector similarity and full-text search. * + * Note: When results come from FTS-only or fuzzy-only matches (no vector similarity), + * the VectorDocument will contain a NullVector. Check with `$doc->vector instanceof NullVector` + * before calling getData() or getDimensions() on the vector. + * * @param array{ * q?: string, * semanticRatio?: float, @@ -287,213 +217,206 @@ public function add(VectorDocument ...$documents): void * includeScoreBreakdown?: bool, * boostFields?: array * } $options + * + * @return VectorDocument[] */ public function query(Vector $vector, array $options = []): array { - $semanticRatio = $options['semanticRatio'] ?? $this->semanticRatio; - - if ($semanticRatio < 0.0 || $semanticRatio > 1.0) { - throw new InvalidArgumentException(\sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $semanticRatio)); - } - + $semanticRatio = $this->validateSemanticRatio($options['semanticRatio'] ?? $this->semanticRatio); $queryText = $options['q'] ?? ''; $limit = $options['limit'] ?? 5; - // Build WHERE clause - $where = []; - $params = []; + // Build WHERE clause and params + [$whereClause, $params] = $this->buildWhereClause($vector, $options, $semanticRatio); - // Use maxScore from options, or defaultMaxScore if configured - $maxScore = $options['maxScore'] ?? 
$this->defaultMaxScore; + // Choose query strategy + $sql = $this->buildQuery($semanticRatio, $queryText, $whereClause, $limit); - // Ensure embedding param is set if maxScore is used (regardless of semanticRatio) - if ($semanticRatio > 0.0 || null !== $maxScore) { - $params['embedding'] = $this->toPgvector($vector); - // DEBUG: Log query vector - $vecArray = $vector->getData(); - $first5 = array_slice($vecArray, 0, 5); - file_put_contents('/tmp/hybrid_debug.log', sprintf("[%s] Query: %s | Vector dims: %d | First 5: [%s]\n", date('Y-m-d H:i:s'), $queryText, count($vecArray), implode(', ', array_map(fn($v) => sprintf('%.4f', $v), $first5))), FILE_APPEND); + if ('' !== $queryText && $semanticRatio < 1.0) { + $params['query'] = $queryText; } - if (null !== $maxScore) { - $where[] = "({$this->vectorFieldName} {$this->distance->getComparisonSign()} :embedding) <= :maxScore"; - $params['maxScore'] = $maxScore; - } + // Execute query + $statement = $this->connection->prepare($sql); + $statement->execute([...$params, ...($options['params'] ?? [])]); - if (isset($options['where']) && '' !== $options['where']) { - $where[] = '('.$options['where'].')'; - } + // Process results + $documents = $this->processResults( + $statement->fetchAll(\PDO::FETCH_ASSOC), + $options['includeScoreBreakdown'] ?? false, + ); - $whereClause = $where ? 
'WHERE '.implode(' AND ', $where) : ''; + // Apply boosting + if (isset($options['boostFields']) && [] !== $options['boostFields']) { + $documents = $this->applyBoostFields($documents, $options['boostFields']); + } - // Choose query strategy based on semanticRatio and query text - if (1.0 === $semanticRatio || '' === $queryText) { - // Pure vector search - $sql = $this->buildVectorOnlyQuery($whereClause, $limit); - } elseif (0.0 === $semanticRatio) { - // Pure full-text search - $sql = $this->buildFtsOnlyQuery($whereClause, $limit); - $params['query'] = $queryText; - } else { - // Hybrid search with weighted combination - $sql = $this->buildHybridQuery($whereClause, $limit, $semanticRatio); - $params['query'] = $queryText; + // Apply minimum score filter + $minScore = $options['minScore'] ?? $this->defaultMinScore; + if (null !== $minScore) { + $documents = array_values(array_filter( + $documents, + fn (VectorDocument $doc) => $doc->score >= $minScore + )); } - // DEBUG: Log the SQL query and parameters - file_put_contents('/tmp/hybrid_debug.log', sprintf("[%s] SQL Query:\n%s\n\nParameters:\n%s\n\n", date('Y-m-d H:i:s'), $sql, print_r(array_merge($params, $options['params'] ?? []), true)), FILE_APPEND); + return $documents; + } - $statement = $this->connection->prepare($sql); - $statement->execute([...$params, ...($options['params'] ?? [])]); + /** + * Get the text search strategy being used. + */ + public function getTextSearchStrategy(): TextSearchStrategyInterface + { + return $this->textSearchStrategy; + } - $includeBreakdown = $options['includeScoreBreakdown'] ?? false; - $documents = []; - foreach ($statement->fetchAll(\PDO::FETCH_ASSOC) as $result) { - $metadata = new Metadata(json_decode($result['metadata'] ?? '{}', true, 512, \JSON_THROW_ON_ERROR)); + /** + * Get the RRF calculator being used. 
+ */ + public function getRrf(): ReciprocalRankFusion + { + return $this->rrf; + } - // Add score breakdown to metadata if requested - if ($includeBreakdown && isset($result['vector_rank'])) { - $metadata['_score_breakdown'] = [ - 'vector_rank' => $result['vector_rank'], - 'fts_rank' => $result['fts_rank'], - 'vector_distance' => $result['vector_distance'], - 'fts_score' => $result['fts_score'], - 'vector_contribution' => $result['vector_contribution'], - 'fts_contribution' => $result['fts_contribution'], - ]; - - // Add fuzzy matching info if available - if (isset($result['fuzzy_rank'])) { - $metadata['_score_breakdown']['fuzzy_rank'] = $result['fuzzy_rank']; - $metadata['_score_breakdown']['fuzzy_score'] = $result['fuzzy_score']; - $metadata['_score_breakdown']['fuzzy_contribution'] = $result['fuzzy_contribution']; - } + private function buildTsvectorColumns(): string + { + if ([] !== $this->searchableAttributes) { + $columns = ''; + foreach ($this->searchableAttributes as $fieldName => $config) { + $metadataKey = $config['metadata_key']; + $columns .= \sprintf( + ",\n %s_tsv tsvector GENERATED ALWAYS AS (to_tsvector('%s', COALESCE(metadata->>'%s', ''))) STORED", + $fieldName, + $this->language, + $metadataKey + ); } - // Handle cases where embedding might be NULL (fuzzy-only or FTS-only matches) - $vectorData = $result['embedding'] !== null - ? 
new Vector($this->fromPgvector($result['embedding'])) - : new Vector([0.0]); // Placeholder vector for non-semantic matches + return $columns; + } - $documents[] = new VectorDocument( - id: Uuid::fromString($result['id']), - vector: $vectorData, - metadata: $metadata, - score: $result['score'], + // When not using searchableAttributes, let the TextSearchStrategy handle tsvector columns + return ''; + } + + private function createSearchTextTrigger(): void + { + $this->connection->exec( + "CREATE OR REPLACE FUNCTION update_search_text() + RETURNS TRIGGER AS \$\$ + BEGIN + NEW.search_text := COALESCE(NEW.metadata->>'title', ''); + RETURN NEW; + END; + \$\$ LANGUAGE plpgsql;" + ); + + $this->connection->exec( + \sprintf( + "DROP TRIGGER IF EXISTS trigger_update_search_text ON %s; + CREATE TRIGGER trigger_update_search_text + BEFORE INSERT OR UPDATE ON %s + FOR EACH ROW + EXECUTE FUNCTION update_search_text();", + $this->tableName, + $this->tableName, + ), + ); + } + + private function createTsvectorIndexes(): void + { + if ([] !== $this->searchableAttributes) { + foreach ($this->searchableAttributes as $fieldName => $config) { + $this->connection->exec( + \sprintf( + 'CREATE INDEX IF NOT EXISTS %s_%s_tsv_idx ON %s USING gin(%s_tsv)', + $this->tableName, + $fieldName, + $this->tableName, + $fieldName, + ), + ); + } + } else { + $this->connection->exec( + \sprintf( + 'CREATE INDEX IF NOT EXISTS %s_content_tsv_idx ON %s USING gin(content_tsv)', + $this->tableName, + $this->tableName, + ), ); } + } + + private function validateSemanticRatio(float $ratio): float + { + if ($ratio < 0.0 || $ratio > 1.0) { + throw new InvalidArgumentException(\sprintf( + 'The semantic ratio must be between 0.0 and 1.0, "%s" given.', + $ratio + )); + } - // Normalize scores to 0-100 range for better readability (if enabled) - if ($this->normalizeScores) { - // Calculate theoretical maximum RRF score: 1/(k+1) - // Normalize to 0-100 by dividing by max and multiplying by 100 - $maxScore = 1.0 / 
($this->rrfK + 1); - $documents = array_map(function(VectorDocument $doc) use ($maxScore, $includeBreakdown) { - $metadata = $doc->metadata; - - // Also normalize breakdown scores if they exist - if ($includeBreakdown && isset($metadata['_score_breakdown'])) { - $breakdown = $metadata['_score_breakdown']; - $metadata['_score_breakdown'] = [ - 'vector_rank' => $breakdown['vector_rank'], - 'fts_rank' => $breakdown['fts_rank'], - 'vector_distance' => $breakdown['vector_distance'], - 'fts_score' => $breakdown['fts_score'], - 'vector_contribution' => ($breakdown['vector_contribution'] / $maxScore) * 100, - 'fts_contribution' => ($breakdown['fts_contribution'] / $maxScore) * 100, - ]; + return $ratio; + } - // Add normalized fuzzy scores if available - if (isset($breakdown['fuzzy_rank'])) { - $metadata['_score_breakdown']['fuzzy_rank'] = $breakdown['fuzzy_rank']; - $metadata['_score_breakdown']['fuzzy_score'] = $breakdown['fuzzy_score']; - $metadata['_score_breakdown']['fuzzy_contribution'] = ($breakdown['fuzzy_contribution'] / $maxScore) * 100; - } - } + /** + * @param array $options + * + * @return array{string, array} + */ + private function buildWhereClause(Vector $vector, array $options, float $semanticRatio): array + { + $where = []; + $params = []; - return new VectorDocument( - id: $doc->id, - vector: $doc->vector, - metadata: $metadata, - score: ($doc->score / $maxScore) * 100 - ); - }, $documents); + $maxScore = $options['maxScore'] ?? $this->defaultMaxScore; + + if ($semanticRatio > 0.0 || null !== $maxScore) { + $params['embedding'] = $this->toPgvector($vector); } - // Apply metadata-based boosting (if configured) - // Boost scores based on metadata field values (e.g., popularity, ratings) - $boostFields = $options['boostFields'] ?? 
[]; - if (!empty($boostFields)) { - $documents = array_map(function(VectorDocument $doc) use ($boostFields) { - $metadata = $doc->metadata; - $score = $doc->score; - $appliedBoosts = []; - - foreach ($boostFields as $field => $boostConfig) { - // Skip if metadata doesn't have this field - if (!isset($metadata[$field])) { - continue; - } - - $value = $metadata[$field]; - $boost = $boostConfig['boost'] ?? 0.0; - - // Check min/max conditions - $shouldBoost = true; - if (isset($boostConfig['min']) && $value < $boostConfig['min']) { - $shouldBoost = false; - } - if (isset($boostConfig['max']) && $value > $boostConfig['max']) { - $shouldBoost = false; - } - - // Apply boost multiplier if conditions are met - if ($shouldBoost && $boost !== 0.0) { - $score *= (1.0 + $boost); - $appliedBoosts[$field] = [ - 'value' => $value, - 'boost' => $boost, - 'multiplier' => (1.0 + $boost), - ]; - } - } + if (null !== $maxScore) { + $where[] = \sprintf( + '(%s %s :embedding) <= :maxScore', + $this->vectorFieldName, + $this->distance->getComparisonSign() + ); + $params['maxScore'] = $maxScore; + } - // Add boost information to metadata if any boosts were applied - if (!empty($appliedBoosts)) { - $metadata['_applied_boosts'] = $appliedBoosts; - } + if (isset($options['where']) && '' !== $options['where']) { + $where[] = '('.$options['where'].')'; + } - return new VectorDocument( - id: $doc->id, - vector: $doc->vector, - metadata: $metadata, - score: $score - ); - }, $documents); + $whereClause = $where ? 
'WHERE '.implode(' AND ', $where) : ''; - // Re-sort by boosted scores (descending) - usort($documents, fn(VectorDocument $a, VectorDocument $b) => $b->score <=> $a->score); + return [$whereClause, $params]; + } + + private function buildQuery(float $semanticRatio, string $queryText, string $whereClause, int $limit): string + { + if (1.0 === $semanticRatio || '' === $queryText) { + return $this->buildVectorOnlyQuery($whereClause, $limit); } - // Filter results by minimum score threshold (if configured) - // Note: minScore should be in the same scale as the scores (0-100 if normalized) - $minScore = $options['minScore'] ?? $this->defaultMinScore; - if (null !== $minScore) { - $documents = array_values(array_filter($documents, fn(VectorDocument $doc) => $doc->score >= $minScore)); + if (0.0 === $semanticRatio) { + return $this->buildFtsOnlyQuery($whereClause, $limit); } - return $documents; + return $this->buildHybridQuery($whereClause, $limit, $semanticRatio); } private function buildVectorOnlyQuery(string $whereClause, int $limit): string { - return \sprintf(<<vectorFieldName, $this->vectorFieldName, $this->distance->getComparisonSign(), @@ -503,106 +426,71 @@ private function buildVectorOnlyQuery(string $whereClause, int $limit): string ); } - /** - * Build BM25 search CTE with DISTINCT ON fix for duplicate titles. - * Replaces FTS rank expression to use plpgsql_bm25 instead of ts_rank_cd. 
- * - * @return string BM25 CTE SQL with deduplication fix - */ - private function buildBm25Cte(): string + private function buildFtsOnlyQuery(string $whereClause, int $limit): string { - return \sprintf( - ' - bm25_search AS ( - SELECT - SUBSTRING(bm25.doc FROM \'title: ([^\n]+)\') as extracted_title, - bm25.doc, - bm25.score as bm25_score, - ROW_NUMBER() OVER (ORDER BY bm25.score DESC) as bm25_rank - FROM bm25topk( - \'%s\', - \'%s\', - :query, - 100, - \'\', - \'%s\' - ) AS bm25 - ), - bm25_with_metadata AS ( - SELECT DISTINCT ON (b.bm25_rank) - m.id, - m.metadata, - m.%s, - b.bm25_score, - b.bm25_rank - FROM bm25_search b - INNER JOIN %s m ON (m.metadata->>\'title\') = b.extracted_title - ORDER BY b.bm25_rank, m.id - )', + $ftsCte = $this->textSearchStrategy->buildSearchCte( $this->tableName, $this->contentFieldName, - $this->bm25Language, - $this->contentFieldName, - $this->tableName + $this->language, ); - } + $cteAlias = $this->textSearchStrategy->getCteAlias(); + $scoreColumn = $this->textSearchStrategy->getScoreColumn(); - private function buildFtsOnlyQuery(string $whereClause, int $limit): string - { - // BM25-only search (no vector) - $bm25Cte = $this->buildBm25Cte(); - - return \sprintf(<<tableName . ' ' . $whereClause . ')' : '', + ORDER BY %s DESC + LIMIT %d', + $ftsCte, + $scoreColumn, + $cteAlias, + $whereClause ? 'WHERE id IN (SELECT id FROM '.$this->tableName.' 
'.$whereClause.')' : '', + $scoreColumn, $limit, ); } private function buildHybridQuery(string $whereClause, int $limit, float $semanticRatio): string { - // Use BM25 CTE with DISTINCT ON fix for duplicate titles - $bm25Cte = $this->buildBm25Cte(); - - // Add fuzzy filter for the fuzzy_scores CTE using word_similarity on search_text - // word_similarity() compares query with individual words, much better for typos - // Hybrid threshold: Configurable thresholds to balance recall and precision - // - Primary threshold ($fuzzyPrimaryThreshold) for high-quality matches - // - Secondary + strict thresholds for catching more typos with double validation - $fuzzyFilter = \sprintf( - '( - word_similarity(:query, search_text) > %f - OR ( - word_similarity(:query, search_text) > %f - AND similarity(:query, search_text) > %f - ) - )', - $this->fuzzyPrimaryThreshold, - $this->fuzzySecondaryThreshold, - $this->fuzzyStrictThreshold + $ftsCte = $this->textSearchStrategy->buildSearchCte( + $this->tableName, + $this->contentFieldName, + $this->language, ); - $fuzzyWhereClause = $this->addFilterToWhereClause($whereClause, $fuzzyFilter); + $ftsAlias = $this->textSearchStrategy->getCteAlias(); + $ftsRankColumn = $this->textSearchStrategy->getRankColumn(); + $ftsScoreColumn = $this->textSearchStrategy->getScoreColumn(); + $ftsNormalizedScore = $this->textSearchStrategy->getNormalizedScoreExpression($ftsScoreColumn); - // Calculate weights for BM25 and Fuzzy (both share the non-semantic portion) - // Weights are configurable to allow tuning for different use cases - $bm25Weight = (1.0 - $semanticRatio) * (1.0 - $this->fuzzyWeight); + // Calculate weights + $ftsWeight = (1.0 - $semanticRatio) * (1.0 - $this->fuzzyWeight); $fuzzyWeightCalculated = (1.0 - $semanticRatio) * $this->fuzzyWeight; - // Enhanced RRF: Combines vector, BM25, and fuzzy matching - // Formula: (1/(k + rank)) * normalized_score * weight - // BM25 with DISTINCT ON fix eliminates duplicate titles - // Fuzzy matching 
uses word_similarity on search_text for optimal typo tolerance - // Final DISTINCT ON (id) ensures no duplicates in combined results - return \sprintf(<<buildFuzzyFilter(); + $fuzzyWhereClause = $this->addFilterToWhereClause($whereClause, $fuzzyFilter); + + // Build RRF expressions using the RRF class + $vectorContribution = $this->rrf->buildSqlExpression( + 'v.rank_ix', + '(1.0 - LEAST(v.distance / 2.0, 1.0))', + $semanticRatio, + ); + $ftsContribution = $this->rrf->buildSqlExpression( + "b.{$ftsRankColumn}", + $ftsNormalizedScore, + $ftsWeight, + ); + $fuzzyContribution = $this->rrf->buildSqlExpression( + 'fz.rank_ix', + 'fz.fuzzy_similarity', + $fuzzyWeightCalculated, + ); + + return \sprintf( + 'WITH vector_scores AS ( SELECT id, %s AS embedding, metadata, (%s %s :embedding) AS distance, ROW_NUMBER() OVER (ORDER BY %s %s :embedding) AS rank_ix @@ -618,34 +506,32 @@ private function buildHybridQuery(string $whereClause, int $limit, float $semant %s ), combined_results AS ( - SELECT COALESCE(v.id, b.id, fz.id) as id, v.embedding, COALESCE(v.metadata, b.metadata, fz.metadata) as metadata, - ( - COALESCE(1.0 / (%d + v.rank_ix) * (1.0 - LEAST(v.distance / 2.0, 1.0)), 0.0) * %f + - COALESCE(1.0 / (%d + b.bm25_rank) * LEAST(b.bm25_score / 10.0, 1.0), 0.0) * %f + - COALESCE(1.0 / (%d + fz.rank_ix) * fz.fuzzy_similarity, 0.0) * %f - ) AS score, - v.rank_ix AS vector_rank, - b.bm25_rank AS fts_rank, - v.distance AS vector_distance, - b.bm25_score AS fts_score, - fz.rank_ix AS fuzzy_rank, - fz.fuzzy_similarity AS fuzzy_score, - COALESCE(1.0 / (%d + v.rank_ix) * (1.0 - LEAST(v.distance / 2.0, 1.0)), 0.0) * %f AS vector_contribution, - COALESCE(1.0 / (%d + b.bm25_rank) * LEAST(b.bm25_score / 10.0, 1.0), 0.0) * %f AS fts_contribution, - COALESCE(1.0 / (%d + fz.rank_ix) * fz.fuzzy_similarity, 0.0) * %f AS fuzzy_contribution + SELECT + COALESCE(v.id, b.id, fz.id) as id, + v.embedding, + COALESCE(v.metadata, b.metadata, fz.metadata) as metadata, + (%s + %s + %s) AS score, + 
v.rank_ix AS vector_rank, + b.%s AS fts_rank, + v.distance AS vector_distance, + b.%s AS fts_score, + fz.rank_ix AS fuzzy_rank, + fz.fuzzy_similarity AS fuzzy_score, + %s AS vector_contribution, + %s AS fts_contribution, + %s AS fuzzy_contribution FROM vector_scores v - FULL OUTER JOIN bm25_with_metadata b ON v.id = b.id + FULL OUTER JOIN %s b ON v.id = b.id FULL OUTER JOIN fuzzy_scores fz ON COALESCE(v.id, b.id) = fz.id WHERE v.id IS NOT NULL OR b.id IS NOT NULL OR fz.id IS NOT NULL ) SELECT * FROM ( - SELECT DISTINCT ON (metadata->>'title') * + SELECT DISTINCT ON (metadata->>\'title\') * FROM combined_results - ORDER BY metadata->>'title', score DESC + ORDER BY metadata->>\'title\', score DESC ) unique_results ORDER BY score DESC - LIMIT %d - SQL, + LIMIT %d', $this->vectorFieldName, $this->vectorFieldName, $this->distance->getComparisonSign(), @@ -653,33 +539,38 @@ private function buildHybridQuery(string $whereClause, int $limit, float $semant $this->distance->getComparisonSign(), $this->tableName, $whereClause, - $bm25Cte, + $ftsCte, $this->tableName, $fuzzyWhereClause, - $this->rrfK, - $semanticRatio, - $this->rrfK, - $bm25Weight, - $this->rrfK, - $fuzzyWeightCalculated, - $this->rrfK, - $semanticRatio, - $this->rrfK, - $bm25Weight, - $this->rrfK, - $fuzzyWeightCalculated, + $vectorContribution, + $ftsContribution, + $fuzzyContribution, + $ftsRankColumn, + $ftsScoreColumn, + $vectorContribution, + $ftsContribution, + $fuzzyContribution, + $ftsAlias, $limit, ); } - /** - * Adds a filter condition to an existing WHERE clause using AND logic. 
- * - * @param string $whereClause Existing WHERE clause (may be empty or start with 'WHERE ') - * @param string $filter Filter condition to add (without 'WHERE ') - * - * @return string Combined WHERE clause - */ + private function buildFuzzyFilter(): string + { + return \sprintf( + '( + word_similarity(:query, search_text) > %f + OR ( + word_similarity(:query, search_text) > %f + AND similarity(:query, search_text) > %f + ) + )', + $this->fuzzyPrimaryThreshold, + $this->fuzzySecondaryThreshold, + $this->fuzzyStrictThreshold + ); + } + private function addFilterToWhereClause(string $whereClause, string $filter): string { if ('' === $whereClause) { @@ -692,10 +583,136 @@ private function addFilterToWhereClause(string $whereClause, string $filter): st return "$whereClause AND $filter"; } - // Unexpected format, prepend WHERE return "WHERE $filter AND ".ltrim($whereClause); } + /** + * @param array> $results + * + * @return VectorDocument[] + */ + private function processResults(array $results, bool $includeBreakdown): array + { + $documents = []; + + foreach ($results as $result) { + $metadata = new Metadata(json_decode($result['metadata'] ?? '{}', true, 512, \JSON_THROW_ON_ERROR)); + + if ($includeBreakdown && isset($result['vector_rank'])) { + $metadata['_score_breakdown'] = $this->buildScoreBreakdown($result); + } + + // Use NullVector for results without embedding (FTS-only or fuzzy-only matches) + $vector = null !== $result['embedding'] + ? 
new Vector($this->fromPgvector($result['embedding'])) + : new NullVector(); + + $score = $result['score']; + if ($this->rrf->isNormalized()) { + $score = $this->rrf->normalize($score); + } + + $documents[] = new VectorDocument( + id: Uuid::fromString($result['id']), + vector: $vector, + metadata: $metadata, + score: $score, + ); + } + + return $documents; + } + + /** + * @param array $result + * + * @return array + */ + private function buildScoreBreakdown(array $result): array + { + $breakdown = [ + 'vector_rank' => $result['vector_rank'], + 'fts_rank' => $result['fts_rank'], + 'vector_distance' => $result['vector_distance'], + 'fts_score' => $result['fts_score'], + 'vector_contribution' => $result['vector_contribution'], + 'fts_contribution' => $result['fts_contribution'], + ]; + + if (isset($result['fuzzy_rank'])) { + $breakdown['fuzzy_rank'] = $result['fuzzy_rank']; + $breakdown['fuzzy_score'] = $result['fuzzy_score']; + $breakdown['fuzzy_contribution'] = $result['fuzzy_contribution']; + } + + if ($this->rrf->isNormalized()) { + $breakdown['vector_contribution'] = $this->rrf->normalize($breakdown['vector_contribution']); + $breakdown['fts_contribution'] = $this->rrf->normalize($breakdown['fts_contribution']); + + if (isset($breakdown['fuzzy_contribution'])) { + $breakdown['fuzzy_contribution'] = $this->rrf->normalize($breakdown['fuzzy_contribution']); + } + } + + return $breakdown; + } + + /** + * @param VectorDocument[] $documents + * @param array $boostFields + * + * @return VectorDocument[] + */ + private function applyBoostFields(array $documents, array $boostFields): array + { + $documents = array_map(function (VectorDocument $doc) use ($boostFields) { + $metadata = $doc->metadata; + $score = $doc->score; + $appliedBoosts = []; + + foreach ($boostFields as $field => $boostConfig) { + if (!isset($metadata[$field])) { + continue; + } + + $value = $metadata[$field]; + $boost = $boostConfig['boost'] ?? 
0.0; + + $shouldBoost = true; + if (isset($boostConfig['min']) && $value < $boostConfig['min']) { + $shouldBoost = false; + } + if (isset($boostConfig['max']) && $value > $boostConfig['max']) { + $shouldBoost = false; + } + + if ($shouldBoost && 0.0 !== $boost) { + $score *= (1.0 + $boost); + $appliedBoosts[$field] = [ + 'value' => $value, + 'boost' => $boost, + 'multiplier' => (1.0 + $boost), + ]; + } + } + + if ([] !== $appliedBoosts) { + $metadata['_applied_boosts'] = $appliedBoosts; + } + + return new VectorDocument( + id: $doc->id, + vector: $doc->vector, + metadata: $metadata, + score: $score + ); + }, $documents); + + usort($documents, fn (VectorDocument $a, VectorDocument $b) => $b->score <=> $a->score); + + return $documents; + } + private function toPgvector(VectorInterface $vector): string { return '['.implode(',', $vector->getData()).']'; diff --git a/src/store/src/Bridge/Postgres/PostgresTextSearchStrategy.php b/src/store/src/Bridge/Postgres/PostgresTextSearchStrategy.php new file mode 100644 index 000000000..9c81f64e1 --- /dev/null +++ b/src/store/src/Bridge/Postgres/PostgresTextSearchStrategy.php @@ -0,0 +1,112 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Bridge\Postgres\TextSearch; + +/** + * PostgreSQL native full-text search strategy using ts_rank_cd. + * + * This is the default strategy that works with any PostgreSQL installation + * without requiring additional extensions. 
+ * + * @author Ahmed EBEN HASSINE + */ +final class PostgresTextSearchStrategy implements TextSearchStrategyInterface +{ + private const CTE_ALIAS = 'fts_search'; + private const RANK_COLUMN = 'fts_rank'; + private const SCORE_COLUMN = 'fts_score'; + + public function getSetupSql(string $tableName, string $contentFieldName, string $language): array + { + return [ + // Add tsvector column if not exists + \sprintf( + "ALTER TABLE %s ADD COLUMN IF NOT EXISTS content_tsv tsvector + GENERATED ALWAYS AS (to_tsvector('%s', %s)) STORED", + $tableName, + $language, + $contentFieldName, + ), + // Create GIN index for full-text search + \sprintf( + 'CREATE INDEX IF NOT EXISTS %s_content_tsv_idx ON %s USING gin(content_tsv)', + $tableName, + $tableName, + ), + ]; + } + + public function buildSearchCte( + string $tableName, + string $contentFieldName, + string $language, + string $queryParam = ':query', + ): string { + return \sprintf( + "%s AS ( + SELECT + id, + metadata, + %s, + ts_rank_cd(content_tsv, plainto_tsquery('%s', %s)) AS %s, + ROW_NUMBER() OVER ( + ORDER BY ts_rank_cd(content_tsv, plainto_tsquery('%s', %s)) DESC + ) AS %s + FROM %s + WHERE content_tsv @@ plainto_tsquery('%s', %s) + )", + self::CTE_ALIAS, + $contentFieldName, + $language, + $queryParam, + self::SCORE_COLUMN, + $language, + $queryParam, + self::RANK_COLUMN, + $tableName, + $language, + $queryParam, + ); + } + + public function getCteAlias(): string + { + return self::CTE_ALIAS; + } + + public function getRankColumn(): string + { + return self::RANK_COLUMN; + } + + public function getScoreColumn(): string + { + return self::SCORE_COLUMN; + } + + public function getNormalizedScoreExpression(string $scoreColumn): string + { + // ts_rank_cd returns values typically between 0 and 1, but can exceed 1 + // We cap it at 1.0 for normalization + return \sprintf('LEAST(%s, 1.0)', $scoreColumn); + } + + public function getRequiredExtensions(): array + { + return []; // No additional extensions required + } + + 
public function isAvailable(\PDO $connection): bool + { + return true; // Always available in PostgreSQL + } +} diff --git a/src/store/src/Bridge/Postgres/ReciprocalRankFusion.php b/src/store/src/Bridge/Postgres/ReciprocalRankFusion.php new file mode 100644 index 000000000..cf9b2f0ef --- /dev/null +++ b/src/store/src/Bridge/Postgres/ReciprocalRankFusion.php @@ -0,0 +1,157 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Bridge\Postgres; + +/** + * Reciprocal Rank Fusion (RRF) calculator for combining multiple search rankings. + * + * RRF is a method to combine results from multiple search algorithms by their ranks. + * The formula is: score = Σ (weight_i / (k + rank_i)) + * + * @see https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf + * + * @author Ahmed EBEN HASSINE + */ +final class ReciprocalRankFusion +{ + /** + * @param int $k RRF constant (default: 60). Higher values give more equal weighting between results. + * @param bool $normalizeScores Whether to normalize scores to 0-100 range (default: true) + */ + public function __construct( + private readonly int $k = 60, + private readonly bool $normalizeScores = true, + ) { + } + + /** + * Calculate RRF score for a single result with multiple rankings. + * + * @param array $rankings + * Each entry contains: rank (1-based), score (normalized 0-1), weight (0-1) + * + * @return float The combined RRF score + */ + public function calculate(array $rankings): float + { + $score = 0.0; + + foreach ($rankings as $ranking) { + if (null === $ranking['rank']) { + continue; + } + + $contribution = (1.0 / ($this->k + $ranking['rank'])) * $ranking['score'] * $ranking['weight']; + $score += $contribution; + } + + if ($this->normalizeScores) { + $score = $this->normalize($score); + } + + return $score; + } + + /** + * Calculate individual contribution for a ranking. 
+ * + * @param int $rank The rank (1-based position) + * @param float $score The normalized score (0-1) + * @param float $weight The weight for this ranking source (0-1) + */ + public function calculateContribution(int $rank, float $score, float $weight): float + { + $contribution = (1.0 / ($this->k + $rank)) * $score * $weight; + + if ($this->normalizeScores) { + $contribution = $this->normalize($contribution); + } + + return $contribution; + } + + /** + * Normalize a score to 0-100 range. + * + * The theoretical maximum RRF score is 1/(k+1), so we normalize against that. + */ + public function normalize(float $score): float + { + $maxScore = 1.0 / ($this->k + 1); + + return ($score / $maxScore) * 100; + } + + /** + * Denormalize a score from 0-100 range back to raw RRF score. + */ + public function denormalize(float $normalizedScore): float + { + $maxScore = 1.0 / ($this->k + 1); + + return ($normalizedScore / 100) * $maxScore; + } + + /** + * Build SQL expression for RRF calculation. + * + * @param string $rankColumn The column containing the rank + * @param string $scoreExpr SQL expression for the normalized score (0-1) + * @param float $weight The weight for this ranking source + * @param string $nullDefault Default value when rank is NULL (default: '0.0') + */ + public function buildSqlExpression( + string $rankColumn, + string $scoreExpr, + float $weight, + string $nullDefault = '0.0', + ): string { + return \sprintf( + 'COALESCE(1.0 / (%d + %s) * %s * %f, %s)', + $this->k, + $rankColumn, + $scoreExpr, + $weight, + $nullDefault, + ); + } + + /** + * Build SQL expression for combining multiple RRF contributions. + * + * @param array $sources + */ + public function buildCombinedSqlExpression(array $sources): string + { + $expressions = []; + + foreach ($sources as $source) { + $expressions[] = $this->buildSqlExpression( + $source['rank_column'], + $source['score_expr'], + $source['weight'], + ); + } + + return '(' . implode(' + ', $expressions) . 
')'; + } + + public function getK(): int + { + return $this->k; + } + + public function isNormalized(): bool + { + return $this->normalizeScores; + } +} diff --git a/src/store/src/Bridge/Postgres/TextSearch/Bm25TextSearchStrategy.php b/src/store/src/Bridge/Postgres/TextSearch/Bm25TextSearchStrategy.php new file mode 100644 index 000000000..7d4dc16ca --- /dev/null +++ b/src/store/src/Bridge/Postgres/TextSearch/Bm25TextSearchStrategy.php @@ -0,0 +1,140 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Bridge\Postgres\TextSearch; + +/** + * BM25 full-text search strategy using plpgsql_bm25 extension. + * + * BM25 (Best Matching 25) is a ranking function used by search engines + * to estimate the relevance of documents to a given search query. + * It's generally more accurate than PostgreSQL's native ts_rank_cd. + * + * Requirements: + * - plpgsql_bm25 extension must be installed + * + * @see https://github.com/pgsql-bm25/plpgsql_bm25 + * + * @author Ahmed EBEN HASSINE + */ +final class Bm25TextSearchStrategy implements TextSearchStrategyInterface +{ + private const CTE_ALIAS = 'bm25_with_metadata'; + private const RANK_COLUMN = 'bm25_rank'; + private const SCORE_COLUMN = 'bm25_score'; + + /** + * @param string $bm25Language BM25 language code ('en', 'fr', 'es', etc.) 
+ * @param int $topK Number of results to retrieve from BM25 (default: 100) + */ + public function __construct( + private readonly string $bm25Language = 'en', + private readonly int $topK = 100, + ) { + } + + public function getSetupSql(string $tableName, string $contentFieldName, string $language): array + { + // BM25 doesn't require additional table setup, it uses the content field directly + // The index is managed internally by the bm25topk function + return []; + } + + public function buildSearchCte( + string $tableName, + string $contentFieldName, + string $language, + string $queryParam = ':query', + ): string { + // BM25 search with deduplication fix for duplicate titles + return \sprintf( + "bm25_search AS ( + SELECT + SUBSTRING(bm25.doc FROM 'title: ([^\n]+)') as extracted_title, + bm25.doc, + bm25.score as %s, + ROW_NUMBER() OVER (ORDER BY bm25.score DESC) as %s + FROM bm25topk( + '%s', + '%s', + %s, + %d, + '', + '%s' + ) AS bm25 + ), + %s AS ( + SELECT DISTINCT ON (b.%s) + m.id, + m.metadata, + m.%s, + b.%s, + b.%s + FROM bm25_search b + INNER JOIN %s m ON (m.metadata->>'title') = b.extracted_title + ORDER BY b.%s, m.id + )", + self::SCORE_COLUMN, + self::RANK_COLUMN, + $tableName, + $contentFieldName, + $queryParam, + $this->topK, + $this->bm25Language, + self::CTE_ALIAS, + self::RANK_COLUMN, + $contentFieldName, + self::SCORE_COLUMN, + self::RANK_COLUMN, + $tableName, + self::RANK_COLUMN, + ); + } + + public function getCteAlias(): string + { + return self::CTE_ALIAS; + } + + public function getRankColumn(): string + { + return self::RANK_COLUMN; + } + + public function getScoreColumn(): string + { + return self::SCORE_COLUMN; + } + + public function getNormalizedScoreExpression(string $scoreColumn): string + { + // BM25 scores are typically in 0-10+ range, normalize to 0-1 + return \sprintf('LEAST(%s / 10.0, 1.0)', $scoreColumn); + } + + public function getRequiredExtensions(): array + { + return ['plpgsql_bm25']; + } + + public function 
isAvailable(\PDO $connection): bool + { + try { + $stmt = $connection->query( + "SELECT 1 FROM pg_proc WHERE proname = 'bm25topk' LIMIT 1" + ); + + return $stmt->fetchColumn() !== false; + } catch (\PDOException) { + return false; + } + } +} diff --git a/src/store/src/Bridge/Postgres/TextSearch/PostgresTextSearchStrategy.php b/src/store/src/Bridge/Postgres/TextSearch/PostgresTextSearchStrategy.php new file mode 100644 index 000000000..9c81f64e1 --- /dev/null +++ b/src/store/src/Bridge/Postgres/TextSearch/PostgresTextSearchStrategy.php @@ -0,0 +1,112 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Bridge\Postgres\TextSearch; + +/** + * PostgreSQL native full-text search strategy using ts_rank_cd. + * + * This is the default strategy that works with any PostgreSQL installation + * without requiring additional extensions. + * + * @author Ahmed EBEN HASSINE + */ +final class PostgresTextSearchStrategy implements TextSearchStrategyInterface +{ + private const CTE_ALIAS = 'fts_search'; + private const RANK_COLUMN = 'fts_rank'; + private const SCORE_COLUMN = 'fts_score'; + + public function getSetupSql(string $tableName, string $contentFieldName, string $language): array + { + return [ + // Add tsvector column if not exists + \sprintf( + "ALTER TABLE %s ADD COLUMN IF NOT EXISTS content_tsv tsvector + GENERATED ALWAYS AS (to_tsvector('%s', %s)) STORED", + $tableName, + $language, + $contentFieldName, + ), + // Create GIN index for full-text search + \sprintf( + 'CREATE INDEX IF NOT EXISTS %s_content_tsv_idx ON %s USING gin(content_tsv)', + $tableName, + $tableName, + ), + ]; + } + + public function buildSearchCte( + string $tableName, + string $contentFieldName, + string $language, + string $queryParam = ':query', + ): string { + return \sprintf( + "%s AS ( + SELECT + id, + metadata, + %s, + ts_rank_cd(content_tsv, plainto_tsquery('%s', %s)) AS %s, + 
ROW_NUMBER() OVER ( + ORDER BY ts_rank_cd(content_tsv, plainto_tsquery('%s', %s)) DESC + ) AS %s + FROM %s + WHERE content_tsv @@ plainto_tsquery('%s', %s) + )", + self::CTE_ALIAS, + $contentFieldName, + $language, + $queryParam, + self::SCORE_COLUMN, + $language, + $queryParam, + self::RANK_COLUMN, + $tableName, + $language, + $queryParam, + ); + } + + public function getCteAlias(): string + { + return self::CTE_ALIAS; + } + + public function getRankColumn(): string + { + return self::RANK_COLUMN; + } + + public function getScoreColumn(): string + { + return self::SCORE_COLUMN; + } + + public function getNormalizedScoreExpression(string $scoreColumn): string + { + // ts_rank_cd returns values typically between 0 and 1, but can exceed 1 + // We cap it at 1.0 for normalization + return \sprintf('LEAST(%s, 1.0)', $scoreColumn); + } + + public function getRequiredExtensions(): array + { + return []; // No additional extensions required + } + + public function isAvailable(\PDO $connection): bool + { + return true; // Always available in PostgreSQL + } +} diff --git a/src/store/src/Bridge/Postgres/TextSearch/TextSearchStrategyInterface.php b/src/store/src/Bridge/Postgres/TextSearch/TextSearchStrategyInterface.php new file mode 100644 index 000000000..0081a801d --- /dev/null +++ b/src/store/src/Bridge/Postgres/TextSearch/TextSearchStrategyInterface.php @@ -0,0 +1,84 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Bridge\Postgres\TextSearch; + +/** + * Strategy interface for full-text search implementations. + * + * Allows pluggable FTS backends (PostgreSQL native, BM25, etc.) + * + * @author Ahmed EBEN HASSINE + */ +interface TextSearchStrategyInterface +{ + /** + * Get the SQL statements needed to set up the text search. 
+ * + * @param string $tableName The table name + * @param string $contentFieldName The content field name + * @param string $language The language configuration + * + * @return string[] Array of SQL statements to execute + */ + public function getSetupSql(string $tableName, string $contentFieldName, string $language): array; + + /** + * Build the CTE (Common Table Expression) for text search ranking. + * + * @param string $tableName The table name + * @param string $contentFieldName The content field name + * @param string $language The language configuration + * @param string $queryParam The parameter name for the query (e.g., ':query') + * + * @return string SQL CTE expression + */ + public function buildSearchCte( + string $tableName, + string $contentFieldName, + string $language, + string $queryParam = ':query', + ): string; + + /** + * Get the name of the CTE that will be used in joins. + */ + public function getCteAlias(): string; + + /** + * Get the rank column name from the CTE. + */ + public function getRankColumn(): string; + + /** + * Get the score column name from the CTE. + */ + public function getScoreColumn(): string; + + /** + * Get the SQL expression to normalize the score to 0-1 range. + * + * @param string $scoreColumn The score column name + */ + public function getNormalizedScoreExpression(string $scoreColumn): string; + + /** + * Check if this strategy requires external extensions. + * + * @return string[] List of required extensions + */ + public function getRequiredExtensions(): array; + + /** + * Check if the strategy is available (extensions installed, etc.). 
+ */ + public function isAvailable(\PDO $connection): bool; +} diff --git a/src/store/tests/Bridge/Postgres/HybridStoreTest.php b/src/store/tests/Bridge/Postgres/HybridStoreTest.php index 7033fdf5d..204e12585 100644 --- a/src/store/tests/Bridge/Postgres/HybridStoreTest.php +++ b/src/store/tests/Bridge/Postgres/HybridStoreTest.php @@ -12,8 +12,13 @@ namespace Symfony\AI\Store\Tests\Bridge\Postgres; use PHPUnit\Framework\TestCase; +use Symfony\AI\Platform\Vector\NullVector; use Symfony\AI\Platform\Vector\Vector; use Symfony\AI\Store\Bridge\Postgres\HybridStore; +use Symfony\AI\Store\Bridge\Postgres\ReciprocalRankFusion; +use Symfony\AI\Store\Bridge\Postgres\TextSearch\Bm25TextSearchStrategy; +use Symfony\AI\Store\Bridge\Postgres\TextSearch\PostgresTextSearchStrategy; +use Symfony\AI\Store\Bridge\Postgres\TextSearch\TextSearchStrategyInterface; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\VectorDocument; use Symfony\AI\Store\Exception\InvalidArgumentException; @@ -21,7 +26,7 @@ final class HybridStoreTest extends TestCase { - public function testConstructorValidatesSemanticRatio() + public function testConstructorValidatesSemanticRatio(): void { $this->expectException(InvalidArgumentException::class); $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); @@ -30,7 +35,7 @@ public function testConstructorValidatesSemanticRatio() new HybridStore($pdo, 'test_table', semanticRatio: 1.5); } - public function testConstructorValidatesNegativeSemanticRatio() + public function testConstructorValidatesNegativeSemanticRatio(): void { $this->expectException(InvalidArgumentException::class); $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); @@ -39,12 +44,57 @@ public function testConstructorValidatesNegativeSemanticRatio() new HybridStore($pdo, 'test_table', semanticRatio: -0.5); } - public function testSetupCreatesTableWithFullTextSearchSupport() + public function 
testConstructorValidatesFuzzyWeight(): void + { + $this->expectException(InvalidArgumentException::class); + $this->expectExceptionMessage('The fuzzy weight must be between 0.0 and 1.0'); + + $pdo = $this->createMock(\PDO::class); + new HybridStore($pdo, 'test_table', fuzzyWeight: 1.5); + } + + public function testConstructorUsesDefaultTextSearchStrategy(): void + { + $pdo = $this->createMock(\PDO::class); + $store = new HybridStore($pdo, 'test_table'); + + $this->assertInstanceOf(PostgresTextSearchStrategy::class, $store->getTextSearchStrategy()); + } + + public function testConstructorUsesCustomTextSearchStrategy(): void + { + $pdo = $this->createMock(\PDO::class); + $customStrategy = new Bm25TextSearchStrategy(); + $store = new HybridStore($pdo, 'test_table', textSearchStrategy: $customStrategy); + + $this->assertSame($customStrategy, $store->getTextSearchStrategy()); + } + + public function testConstructorUsesDefaultRrf(): void + { + $pdo = $this->createMock(\PDO::class); + $store = new HybridStore($pdo, 'test_table'); + + $this->assertInstanceOf(ReciprocalRankFusion::class, $store->getRrf()); + $this->assertSame(60, $store->getRrf()->getK()); + } + + public function testConstructorUsesCustomRrf(): void + { + $pdo = $this->createMock(\PDO::class); + $customRrf = new ReciprocalRankFusion(k: 100, normalizeScores: false); + $store = new HybridStore($pdo, 'test_table', rrf: $customRrf); + + $this->assertSame($customRrf, $store->getRrf()); + $this->assertSame(100, $store->getRrf()->getK()); + } + + public function testSetupCreatesTableWithFullTextSearchSupport(): void { $pdo = $this->createMock(\PDO::class); $store = new HybridStore($pdo, 'hybrid_table'); - $pdo->expects($this->exactly(9)) + $pdo->expects($this->exactly(10)) ->method('exec') ->willReturnCallback(function (string $sql): int { static $callCount = 0; @@ -58,7 +108,6 @@ public function testSetupCreatesTableWithFullTextSearchSupport() $this->assertStringContainsString('CREATE TABLE IF NOT EXISTS 
hybrid_table', $sql); $this->assertStringContainsString('content TEXT NOT NULL', $sql); $this->assertStringContainsString('embedding vector(1536) NOT NULL', $sql); - $this->assertStringContainsString('content_tsv tsvector GENERATED ALWAYS AS (to_tsvector(\'simple\', content)) STORED', $sql); } elseif (4 === $callCount) { $this->assertStringContainsString('ALTER TABLE hybrid_table ADD COLUMN IF NOT EXISTS search_text TEXT', $sql); } elseif (5 === $callCount) { @@ -68,6 +117,11 @@ public function testSetupCreatesTableWithFullTextSearchSupport() } elseif (7 === $callCount) { $this->assertStringContainsString('CREATE INDEX IF NOT EXISTS hybrid_table_embedding_idx', $sql); } elseif (8 === $callCount) { + // TextSearchStrategy adds content_tsv column via ALTER TABLE + $this->assertStringContainsString('ALTER TABLE hybrid_table ADD COLUMN IF NOT EXISTS content_tsv', $sql); + $this->assertStringContainsString('GENERATED ALWAYS AS (to_tsvector(\'simple\', content)) STORED', $sql); + } elseif (9 === $callCount) { + // TextSearchStrategy creates GIN index for content_tsv $this->assertStringContainsString('CREATE INDEX IF NOT EXISTS hybrid_table_content_tsv_idx', $sql); $this->assertStringContainsString('USING gin(content_tsv)', $sql); } else { @@ -81,7 +135,35 @@ public function testSetupCreatesTableWithFullTextSearchSupport() $store->setup(); } - public function testAddDocument() + public function testSetupExecutesTextSearchStrategySetupSql(): void + { + $pdo = $this->createMock(\PDO::class); + + $mockStrategy = $this->createMock(TextSearchStrategyInterface::class); + $mockStrategy->expects($this->once()) + ->method('getSetupSql') + ->with('hybrid_table', 'content', 'simple') + ->willReturn([ + 'CREATE INDEX custom_idx ON hybrid_table USING gin(content)', + ]); + + $store = new HybridStore($pdo, 'hybrid_table', textSearchStrategy: $mockStrategy); + + $execCalls = []; + $pdo->expects($this->atLeast(1)) + ->method('exec') + ->willReturnCallback(function (string $sql) use 
(&$execCalls): int { + $execCalls[] = $sql; + + return 0; + }); + + $store->setup(); + + $this->assertContains('CREATE INDEX custom_idx ON hybrid_table USING gin(content)', $execCalls); + } + + public function testAddDocument(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -118,13 +200,54 @@ public function testAddDocument() $store->add($document); } - public function testPureVectorSearch() + public function testAddMultipleDocuments(): void + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new HybridStore($pdo, 'hybrid_table'); + + $pdo->expects($this->once()) + ->method('prepare') + ->willReturn($statement); + + $uuid1 = Uuid::v4(); + $uuid2 = Uuid::v4(); + + $statement->expects($this->exactly(2)) + ->method('execute') + ->willReturnCallback(function (array $params) use ($uuid1, $uuid2): bool { + static $callCount = 0; + ++$callCount; + + if (1 === $callCount) { + $this->assertSame($uuid1->toRfc4122(), $params['id']); + $this->assertSame('First document', $params['content']); + } else { + $this->assertSame($uuid2->toRfc4122(), $params['id']); + $this->assertSame('Second document', $params['content']); + } + + return true; + }); + + $metadata1 = new Metadata(['_text' => 'First document']); + $metadata2 = new Metadata(['_text' => 'Second document']); + + $document1 = new VectorDocument($uuid1, new Vector([0.1, 0.2, 0.3]), $metadata1); + $document2 = new VectorDocument($uuid2, new Vector([0.4, 0.5, 0.6]), $metadata2); + + $store->add($document1, $document2); + } + + public function testPureVectorSearch(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - // Disable score normalization for this test to keep legacy behavior - $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 1.0, normalizeScores: false); + // Disable score normalization for this test + $rrf = new 
ReciprocalRankFusion(normalizeScores: false); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 1.0, rrf: $rrf); $expectedSql = 'SELECT id, embedding AS embedding, metadata, (embedding <-> :embedding) AS score FROM hybrid_table @@ -164,27 +287,88 @@ public function testPureVectorSearch() $this->assertSame(0.05, $results[0]->score); } - public function testPureKeywordSearch() + public function testPureKeywordSearchWithPostgresStrategy(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - // Disable normalization for consistent test scores - $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.0, normalizeScores: false); + $rrf = new ReciprocalRankFusion(normalizeScores: false); + $store = new HybridStore( + $pdo, + 'hybrid_table', + semanticRatio: 0.0, + textSearchStrategy: new PostgresTextSearchStrategy(), + rrf: $rrf + ); + + $pdo->expects($this->once()) + ->method('prepare') + ->with($this->callback(function ($sql) { + // Verify PostgreSQL native FTS structure + $this->assertStringContainsString('WITH', $sql); + $this->assertStringContainsString('fts_search AS', $sql); + $this->assertStringContainsString('ts_rank_cd', $sql); + $this->assertStringContainsString('plainto_tsquery', $sql); + $this->assertStringContainsString('content_tsv @@', $sql); + + return true; + })) + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute') + ->with($this->callback(function ($params) { + return isset($params['query']) && 'PostgreSQL' === $params['query']; + })); + + $uuid = Uuid::v4(); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([ + [ + 'id' => $uuid->toRfc4122(), + 'embedding' => null, + 'metadata' => json_encode(['text' => 'PostgreSQL is awesome']), + 'score' => 0.5, + ], + ]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'PostgreSQL']); + + $this->assertCount(1, $results); + 
$this->assertSame(0.5, $results[0]->score); + // FTS-only results should have NullVector + $this->assertInstanceOf(NullVector::class, $results[0]->vector); + } + + public function testPureKeywordSearchWithBm25Strategy(): void + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $rrf = new ReciprocalRankFusion(normalizeScores: false); + $store = new HybridStore( + $pdo, + 'hybrid_table', + semanticRatio: 0.0, + textSearchStrategy: new Bm25TextSearchStrategy(bm25Language: 'en'), + rrf: $rrf + ); $pdo->expects($this->once()) ->method('prepare') ->with($this->callback(function ($sql) { - // Verify BM25 structure instead of FTS + // Verify BM25 structure $this->assertStringContainsString('WITH', $sql); $this->assertStringContainsString('bm25_search AS', $sql); $this->assertStringContainsString('bm25topk(', $sql); $this->assertStringContainsString('bm25_with_metadata AS', $sql); - $this->assertStringContainsString('DISTINCT ON (b.bm25_rank)', $sql); + $this->assertStringContainsString('DISTINCT ON', $sql); - // Should NOT contain old FTS functions + // Should NOT contain native FTS functions $this->assertStringNotContainsString('ts_rank_cd', $sql); - $this->assertStringNotContainsString('websearch_to_tsquery', $sql); return true; })) @@ -204,7 +388,7 @@ public function testPureKeywordSearch() ->willReturn([ [ 'id' => $uuid->toRfc4122(), - 'embedding' => '[0.1,0.2,0.3]', + 'embedding' => null, 'metadata' => json_encode(['text' => 'PostgreSQL is awesome']), 'score' => 0.5, ], @@ -216,36 +400,31 @@ public function testPureKeywordSearch() $this->assertSame(0.5, $results[0]->score); } - public function testHybridSearchWithRRF() + public function testHybridSearchWithRRF(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - // Disable normalization for consistent test scores - $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrfK: 60, normalizeScores: false); 
+ $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrf: $rrf); $pdo->expects($this->once()) ->method('prepare') ->with($this->callback(function ($sql) { - // Check for RRF CTE structure with BM25 and fuzzy + // Check for RRF CTE structure $this->assertStringContainsString('WITH vector_scores AS', $sql); - $this->assertStringContainsString('bm25_search AS', $sql); - $this->assertStringContainsString('bm25_with_metadata AS', $sql); $this->assertStringContainsString('fuzzy_scores AS', $sql); $this->assertStringContainsString('combined_results AS', $sql); $this->assertStringContainsString('ROW_NUMBER() OVER', $sql); $this->assertStringContainsString('FULL OUTER JOIN', $sql); $this->assertStringContainsString('ORDER BY score DESC', $sql); - // Should NOT contain old fts_scores CTE - $this->assertStringNotContainsString('fts_scores AS', $sql); - - // Should contain BM25 function call - $this->assertStringContainsString('bm25topk(', $sql); - // Should contain fuzzy matching $this->assertStringContainsString('word_similarity', $sql); + // Should contain RRF formula with k=60 + $this->assertStringContainsString('60 +', $sql); + return true; })) ->willReturn($statement); @@ -276,7 +455,7 @@ public function testHybridSearchWithRRF() $this->assertSame(0.025, $results[0]->score); } - public function testQueryWithDefaultMaxScore() + public function testQueryWithDefaultMaxScore(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -315,7 +494,7 @@ public function testQueryWithDefaultMaxScore() $this->assertCount(0, $results); } - public function testQueryWithMaxScoreOverride() + public function testQueryWithMaxScoreOverride(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -348,57 +527,68 @@ public function testQueryWithMaxScoreOverride() $this->assertCount(0, $results); } - public function 
testQueryWithCustomLanguage() + public function testQueryWithMinScoreFilter(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - // Test BM25 language parameter (short code 'fr' instead of 'french') - $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.0, language: 'french', bm25Language: 'fr'); + $rrf = new ReciprocalRankFusion(normalizeScores: false); + $store = new HybridStore( + $pdo, + 'hybrid_table', + semanticRatio: 1.0, + rrf: $rrf, + defaultMinScore: 0.5 + ); $pdo->expects($this->once()) ->method('prepare') - ->with($this->callback(function ($sql) { - // Should NOT contain old FTS function - $this->assertStringNotContainsString("websearch_to_tsquery('french'", $sql); - - // Should contain BM25 with 'fr' language code - $this->assertStringContainsString('bm25topk(', $sql); - $this->assertStringContainsString("'fr'", $sql); - - return true; - })) ->willReturn($statement); $statement->expects($this->once()) ->method('execute'); + $uuid1 = Uuid::v4(); + $uuid2 = Uuid::v4(); + $statement->expects($this->once()) ->method('fetchAll') ->with(\PDO::FETCH_ASSOC) - ->willReturn([]); + ->willReturn([ + [ + 'id' => $uuid1->toRfc4122(), + 'embedding' => '[0.1,0.2,0.3]', + 'metadata' => json_encode(['text' => 'High score']), + 'score' => 0.8, + ], + [ + 'id' => $uuid2->toRfc4122(), + 'embedding' => '[0.4,0.5,0.6]', + 'metadata' => json_encode(['text' => 'Low score']), + 'score' => 0.3, + ], + ]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3])); - $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'développement']); + // Only high score result should be returned + $this->assertCount(1, $results); + $this->assertSame(0.8, $results[0]->score); } - public function testQueryWithCustomRRFK() + public function testQueryWithCustomRRFK(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, 
rrfK: 100); + $rrf = new ReciprocalRankFusion(k: 100); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrf: $rrf); $pdo->expects($this->once()) ->method('prepare') ->with($this->callback(function ($sql) { // Check for RRF constant 100 in the formula - $this->assertStringContainsString('100 + v.rank_ix', $sql); - $this->assertStringContainsString('100 + b.bm25_rank', $sql); - $this->assertStringContainsString('100 + fz.rank_ix', $sql); - - // Verify BM25 and fuzzy structure (not old FTS) - $this->assertStringContainsString('bm25_search AS', $sql); - $this->assertStringContainsString('fuzzy_scores AS', $sql); + $this->assertStringContainsString('100 +', $sql); return true; })) @@ -415,7 +605,7 @@ public function testQueryWithCustomRRFK() $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'test']); } - public function testQueryInvalidSemanticRatioInOptions() + public function testQueryInvalidSemanticRatioInOptions(): void { $pdo = $this->createMock(\PDO::class); $store = new HybridStore($pdo, 'hybrid_table'); @@ -426,7 +616,7 @@ public function testQueryInvalidSemanticRatioInOptions() $store->query(new Vector([0.1, 0.2, 0.3]), ['semanticRatio' => 1.5]); } - public function testDrop() + public function testDrop(): void { $pdo = $this->createMock(\PDO::class); $store = new HybridStore($pdo, 'hybrid_table'); @@ -438,7 +628,7 @@ public function testDrop() $store->drop(); } - public function testQueryWithCustomLimit() + public function testQueryWithCustomLimit(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -465,47 +655,7 @@ public function testQueryWithCustomLimit() $store->query(new Vector([0.1, 0.2, 0.3]), ['limit' => 10]); } - public function testAddMultipleDocuments() - { - $pdo = $this->createMock(\PDO::class); - $statement = $this->createMock(\PDOStatement::class); - - $store = new HybridStore($pdo, 'hybrid_table'); - - $pdo->expects($this->once()) - ->method('prepare') - 
->willReturn($statement); - - $uuid1 = Uuid::v4(); - $uuid2 = Uuid::v4(); - - $statement->expects($this->exactly(2)) - ->method('execute') - ->willReturnCallback(function (array $params) use ($uuid1, $uuid2): bool { - static $callCount = 0; - ++$callCount; - - if (1 === $callCount) { - $this->assertSame($uuid1->toRfc4122(), $params['id']); - $this->assertSame('First document', $params['content']); - } else { - $this->assertSame($uuid2->toRfc4122(), $params['id']); - $this->assertSame('Second document', $params['content']); - } - - return true; - }); - - $metadata1 = new Metadata(['_text' => 'First document']); - $metadata2 = new Metadata(['_text' => 'Second document']); - - $document1 = new VectorDocument($uuid1, new Vector([0.1, 0.2, 0.3]), $metadata1); - $document2 = new VectorDocument($uuid2, new Vector([0.4, 0.5, 0.6]), $metadata2); - - $store->add($document1, $document2); - } - - public function testPureKeywordSearchReturnsEmptyWhenNoMatch() + public function testPureKeywordSearchReturnsEmptyWhenNoMatch(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -529,12 +679,11 @@ public function testPureKeywordSearchReturnsEmptyWhenNoMatch() $this->assertCount(0, $results); } - public function testFuzzyMatchingWithWordSimilarity() + public function testFuzzyMatchingWithWordSimilarity(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - // Test fuzzy matching with custom thresholds $store = new HybridStore( $pdo, 'hybrid_table', @@ -555,9 +704,9 @@ public function testFuzzyMatchingWithWordSimilarity() $this->assertStringContainsString('word_similarity(:query, search_text)', $sql); // Verify custom thresholds are applied - $this->assertStringContainsString('0.300000', $sql); // Primary threshold - $this->assertStringContainsString('0.250000', $sql); // Secondary threshold - $this->assertStringContainsString('0.200000', $sql); // Strict threshold + 
$this->assertStringContainsString('0.300000', $sql); + $this->assertStringContainsString('0.250000', $sql); + $this->assertStringContainsString('0.200000', $sql); return true; })) @@ -569,11 +718,10 @@ public function testFuzzyMatchingWithWordSimilarity() $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'test']); } - public function testSearchableAttributesWithBoost() + public function testSearchableAttributesWithBoost(): void { $pdo = $this->createMock(\PDO::class); - // Test with searchable attributes configuration $searchableAttributes = [ 'title' => ['boost' => 2.0, 'metadata_key' => 'title'], 'overview' => ['boost' => 1.0, 'metadata_key' => 'overview'], @@ -596,10 +744,10 @@ public function testSearchableAttributesWithBoost() $this->assertStringContainsString('title_tsv tsvector GENERATED ALWAYS AS', $sql); $this->assertStringContainsString('overview_tsv tsvector GENERATED ALWAYS AS', $sql); - // Should NOT contain generic content_tsv (backward compat mode) + // Should NOT contain generic content_tsv $this->assertStringNotContainsString('content_tsv tsvector GENERATED ALWAYS AS (to_tsvector(\'simple\', content)) STORED', $sql); } elseif ($callCount >= 8 && $callCount <= 9) { - // Verify separate GIN indexes for each attribute (title_tsv_idx, overview_tsv_idx) + // Verify separate GIN indexes $this->assertStringContainsString('_tsv_idx', $sql); $this->assertStringContainsString('USING gin(', $sql); } @@ -610,29 +758,24 @@ public function testSearchableAttributesWithBoost() $store->setup(); } - public function testFuzzyWeightParameter() + public function testFuzzyWeightParameter(): void { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); - // Test that fuzzyWeight controls the weight in RRF formula $store = new HybridStore( $pdo, 'hybrid_table', - semanticRatio: 0.4, // 60% non-semantic - fuzzyWeight: 0.5 // 50% of non-semantic goes to fuzzy + semanticRatio: 0.4, + fuzzyWeight: 0.5 ); - // Expected: 40% vector, 30% 
BM25 (60% * 0.5), 30% fuzzy (60% * 0.5) $pdo->expects($this->once()) ->method('prepare') ->with($this->callback(function ($sql) { - // Verify fuzzy weight is present in the RRF formula $this->assertStringContainsString('fuzzy_scores AS', $sql); $this->assertStringContainsString('combined_results AS', $sql); - - // Should have three components: vector, BM25, fuzzy - $this->assertStringContainsString('COALESCE(1.0 / (', $sql); // RRF formula pattern + $this->assertStringContainsString('COALESCE(1.0 / (', $sql); return true; })) @@ -644,12 +787,196 @@ public function testFuzzyWeightParameter() $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'test']); } + public function testBoostFieldsApplied(): void + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $rrf = new ReciprocalRankFusion(normalizeScores: false); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 1.0, rrf: $rrf); + + $pdo->expects($this->once()) + ->method('prepare') + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute'); + + $uuid1 = Uuid::v4(); + $uuid2 = Uuid::v4(); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([ + [ + 'id' => $uuid1->toRfc4122(), + 'embedding' => '[0.1,0.2,0.3]', + 'metadata' => json_encode(['text' => 'Popular', 'popularity' => 100]), + 'score' => 0.5, + ], + [ + 'id' => $uuid2->toRfc4122(), + 'embedding' => '[0.4,0.5,0.6]', + 'metadata' => json_encode(['text' => 'Unpopular', 'popularity' => 10]), + 'score' => 0.6, + ], + ]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3]), [ + 'boostFields' => [ + 'popularity' => ['min' => 50, 'boost' => 0.5], + ], + ]); + + $this->assertCount(2, $results); + + // First result should be boosted (popularity >= 50) + // Original score 0.5 * 1.5 = 0.75 + $this->assertSame(0.75, $results[0]->score); + $this->assertArrayHasKey('_applied_boosts', $results[0]->metadata->getArrayCopy()); + + 
// Second result should not be boosted (popularity < 50) + $this->assertSame(0.6, $results[1]->score); + } + + public function testScoreBreakdownIncluded(): void + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $rrf = new ReciprocalRankFusion(normalizeScores: false); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.5, rrf: $rrf); + + $pdo->expects($this->once()) + ->method('prepare') + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute'); + + $uuid = Uuid::v4(); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([ + [ + 'id' => $uuid->toRfc4122(), + 'embedding' => '[0.1,0.2,0.3]', + 'metadata' => json_encode(['text' => 'Test']), + 'score' => 0.025, + 'vector_rank' => 1, + 'fts_rank' => 2, + 'vector_distance' => 0.1, + 'fts_score' => 0.8, + 'vector_contribution' => 0.015, + 'fts_contribution' => 0.01, + 'fuzzy_rank' => 3, + 'fuzzy_score' => 0.7, + 'fuzzy_contribution' => 0.005, + ], + ]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3]), [ + 'q' => 'test', + 'includeScoreBreakdown' => true, + ]); + + $this->assertCount(1, $results); + + $metadata = $results[0]->metadata->getArrayCopy(); + $this->assertArrayHasKey('_score_breakdown', $metadata); + + $breakdown = $metadata['_score_breakdown']; + $this->assertSame(1, $breakdown['vector_rank']); + $this->assertSame(2, $breakdown['fts_rank']); + $this->assertSame(3, $breakdown['fuzzy_rank']); + $this->assertSame(0.1, $breakdown['vector_distance']); + $this->assertSame(0.8, $breakdown['fts_score']); + $this->assertSame(0.7, $breakdown['fuzzy_score']); + } + + public function testNullVectorForFtsOnlyResults(): void + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 0.0); + + $pdo->expects($this->once()) + ->method('prepare') + 
->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute'); + + $uuid = Uuid::v4(); + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([ + [ + 'id' => $uuid->toRfc4122(), + 'embedding' => null, + 'metadata' => json_encode(['text' => 'FTS only result']), + 'score' => 0.5, + ], + ]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'FTS']); + + $this->assertCount(1, $results); + $this->assertInstanceOf(NullVector::class, $results[0]->vector); + } + + public function testScoreNormalization(): void + { + $pdo = $this->createMock(\PDO::class); + $statement = $this->createMock(\PDOStatement::class); + + // Enable normalization (default) + $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: true); + $store = new HybridStore($pdo, 'hybrid_table', semanticRatio: 1.0, rrf: $rrf); + + $pdo->expects($this->once()) + ->method('prepare') + ->willReturn($statement); + + $statement->expects($this->once()) + ->method('execute'); + + $uuid = Uuid::v4(); + + // Raw RRF score + $rawScore = 0.01639; // Approximately 1/(60+1) = theoretical max + + $statement->expects($this->once()) + ->method('fetchAll') + ->with(\PDO::FETCH_ASSOC) + ->willReturn([ + [ + 'id' => $uuid->toRfc4122(), + 'embedding' => '[0.1,0.2,0.3]', + 'metadata' => json_encode(['text' => 'Test']), + 'score' => $rawScore, + ], + ]); + + $results = $store->query(new Vector([0.1, 0.2, 0.3])); + + $this->assertCount(1, $results); + + // Score should be normalized to approximately 100 + $expectedNormalized = $rrf->normalize($rawScore); + $this->assertEqualsWithDelta($expectedNormalized, $results[0]->score, 0.01); + } + private function normalizeQuery(string $query): string { - // Remove extra spaces, tabs and newlines $normalized = preg_replace('/\s+/', ' ', $query); - // Trim the result return trim($normalized); } } diff --git a/src/store/tests/Bridge/Postgres/ReciprocalRankFusionTest.php 
b/src/store/tests/Bridge/Postgres/ReciprocalRankFusionTest.php new file mode 100644 index 000000000..9aa282c2e --- /dev/null +++ b/src/store/tests/Bridge/Postgres/ReciprocalRankFusionTest.php @@ -0,0 +1,221 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\AI\Store\Tests\Bridge\Postgres; + +use PHPUnit\Framework\TestCase; +use Symfony\AI\Store\Bridge\Postgres\ReciprocalRankFusion; + +final class ReciprocalRankFusionTest extends TestCase +{ + public function testDefaultConstruction(): void + { + $rrf = new ReciprocalRankFusion(); + + $this->assertSame(60, $rrf->getK()); + $this->assertTrue($rrf->isNormalized()); + } + + public function testCustomConstruction(): void + { + $rrf = new ReciprocalRankFusion(k: 100, normalizeScores: false); + + $this->assertSame(100, $rrf->getK()); + $this->assertFalse($rrf->isNormalized()); + } + + public function testCalculateSingleRanking(): void + { + $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); + + $score = $rrf->calculate([ + 'vector' => ['rank' => 1, 'score' => 1.0, 'weight' => 1.0], + ]); + + // 1/(60+1) * 1.0 * 1.0 = 0.01639... 
+ $this->assertEqualsWithDelta(1 / 61, $score, 0.0001); + } + + public function testCalculateMultipleRankings(): void + { + $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); + + $score = $rrf->calculate([ + 'vector' => ['rank' => 1, 'score' => 1.0, 'weight' => 0.5], + 'fts' => ['rank' => 2, 'score' => 0.8, 'weight' => 0.5], + ]); + + // (1/(60+1) * 1.0 * 0.5) + (1/(60+2) * 0.8 * 0.5) + $expected = (1 / 61 * 1.0 * 0.5) + (1 / 62 * 0.8 * 0.5); + $this->assertEqualsWithDelta($expected, $score, 0.0001); + } + + public function testCalculateSkipsNullRank(): void + { + $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); + + $score = $rrf->calculate([ + 'vector' => ['rank' => 1, 'score' => 1.0, 'weight' => 0.5], + 'fts' => ['rank' => null, 'score' => 0.8, 'weight' => 0.5], + ]); + + // Only vector contribution + $expected = 1 / 61 * 1.0 * 0.5; + $this->assertEqualsWithDelta($expected, $score, 0.0001); + } + + public function testCalculateWithNormalization(): void + { + $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: true); + + $score = $rrf->calculate([ + 'vector' => ['rank' => 1, 'score' => 1.0, 'weight' => 1.0], + ]); + + // Should be normalized to ~100 (since rank=1 with full score/weight gives max RRF) + $this->assertEqualsWithDelta(100.0, $score, 0.01); + } + + public function testCalculateContribution(): void + { + $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); + + $contribution = $rrf->calculateContribution(rank: 1, score: 1.0, weight: 0.5); + + $expected = (1 / 61) * 1.0 * 0.5; + $this->assertEqualsWithDelta($expected, $contribution, 0.0001); + } + + public function testNormalize(): void + { + $rrf = new ReciprocalRankFusion(k: 60); + + $maxRawScore = 1 / 61; // Theoretical maximum + $normalized = $rrf->normalize($maxRawScore); + + $this->assertEqualsWithDelta(100.0, $normalized, 0.01); + } + + public function testDenormalize(): void + { + $rrf = new ReciprocalRankFusion(k: 60); + + $denormalized = 
$rrf->denormalize(100.0); + + $this->assertEqualsWithDelta(1 / 61, $denormalized, 0.0001); + } + + public function testNormalizeAndDenormalizeAreInverse(): void + { + $rrf = new ReciprocalRankFusion(k: 60); + + $original = 0.008; + $normalized = $rrf->normalize($original); + $denormalized = $rrf->denormalize($normalized); + + $this->assertEqualsWithDelta($original, $denormalized, 0.0001); + } + + public function testBuildSqlExpression(): void + { + $rrf = new ReciprocalRankFusion(k: 60); + + $sql = $rrf->buildSqlExpression( + rankColumn: 'v.rank_ix', + scoreExpr: '(1.0 - v.distance)', + weight: 0.7 + ); + + $this->assertStringContainsString('COALESCE(1.0 / (60 + v.rank_ix)', $sql); + $this->assertStringContainsString('(1.0 - v.distance)', $sql); + $this->assertStringContainsString('0.700000', $sql); + $this->assertStringContainsString(', 0.0)', $sql); + } + + public function testBuildSqlExpressionWithCustomNullDefault(): void + { + $rrf = new ReciprocalRankFusion(k: 60); + + $sql = $rrf->buildSqlExpression( + rankColumn: 'rank', + scoreExpr: 'score', + weight: 1.0, + nullDefault: '-1.0' + ); + + $this->assertStringContainsString(', -1.0)', $sql); + } + + public function testBuildCombinedSqlExpression(): void + { + $rrf = new ReciprocalRankFusion(k: 60); + + $sql = $rrf->buildCombinedSqlExpression([ + ['rank_column' => 'v.rank', 'score_expr' => 'v.score', 'weight' => 0.5], + ['rank_column' => 'f.rank', 'score_expr' => 'f.score', 'weight' => 0.5], + ]); + + $this->assertStringContainsString('(', $sql); + $this->assertStringContainsString(' + ', $sql); + $this->assertStringContainsString('60 + v.rank', $sql); + $this->assertStringContainsString('60 + f.rank', $sql); + } + + public function testDifferentKValues(): void + { + $rrf60 = new ReciprocalRankFusion(k: 60, normalizeScores: false); + $rrf100 = new ReciprocalRankFusion(k: 100, normalizeScores: false); + + $rankings = [ + 'vector' => ['rank' => 1, 'score' => 1.0, 'weight' => 1.0], + ]; + + $score60 = 
$rrf60->calculate($rankings); + $score100 = $rrf100->calculate($rankings); + + // Higher k means lower individual contributions + $this->assertGreaterThan($score100, $score60); + + // Verify exact values + $this->assertEqualsWithDelta(1 / 61, $score60, 0.0001); + $this->assertEqualsWithDelta(1 / 101, $score100, 0.0001); + } + + public function testWeightAffectsScore(): void + { + $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); + + $scoreFullWeight = $rrf->calculate([ + 'vector' => ['rank' => 1, 'score' => 1.0, 'weight' => 1.0], + ]); + + $scoreHalfWeight = $rrf->calculate([ + 'vector' => ['rank' => 1, 'score' => 1.0, 'weight' => 0.5], + ]); + + $this->assertEqualsWithDelta($scoreFullWeight / 2, $scoreHalfWeight, 0.0001); + } + + public function testLowerRankGivesLowerScore(): void + { + $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); + + $scoreRank1 = $rrf->calculate([ + 'vector' => ['rank' => 1, 'score' => 1.0, 'weight' => 1.0], + ]); + + $scoreRank10 = $rrf->calculate([ + 'vector' => ['rank' => 10, 'score' => 1.0, 'weight' => 1.0], + ]); + + $this->assertGreaterThan($scoreRank10, $scoreRank1); + } +} From d7446d5e7eccdbe3fa4f9178b18b113ce04b0cc9 Mon Sep 17 00:00:00 2001 From: Ahmed EBEN HASSINE Date: Wed, 26 Nov 2025 04:53:53 +0100 Subject: [PATCH 9/9] docs(examples): enhance postgres-hybrid example with new features - Demonstrate BM25TextSearchStrategy vs native PostgreSQL FTS - Show explicit ReciprocalRankFusion configuration - Add comparison between both text search strategies - Simplify summary and improve clarity --- examples/rag/postgres-hybrid.php | 61 +++++++++++++++--- src/store/src/Bridge/Postgres/HybridStore.php | 47 ++++++-------- .../Bridge/Postgres/ReciprocalRankFusion.php | 6 +- .../TextSearch/Bm25TextSearchStrategy.php | 2 +- .../tests/Bridge/Postgres/HybridStoreTest.php | 63 ++++++++++--------- .../Postgres/ReciprocalRankFusionTest.php | 32 +++++----- 6 files changed, 122 insertions(+), 89 deletions(-) diff --git 
a/examples/rag/postgres-hybrid.php b/examples/rag/postgres-hybrid.php index f71fa24db..ae4ec52f9 100644 --- a/examples/rag/postgres-hybrid.php +++ b/examples/rag/postgres-hybrid.php @@ -14,6 +14,9 @@ use Symfony\AI\Fixtures\Movies; use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory; use Symfony\AI\Store\Bridge\Postgres\HybridStore; +use Symfony\AI\Store\Bridge\Postgres\ReciprocalRankFusion; +use Symfony\AI\Store\Bridge\Postgres\TextSearch\Bm25TextSearchStrategy; +use Symfony\AI\Store\Bridge\Postgres\TextSearch\PostgresTextSearchStrategy; use Symfony\AI\Store\Document\Loader\InMemoryLoader; use Symfony\AI\Store\Document\Metadata; use Symfony\AI\Store\Document\TextDocument; @@ -25,10 +28,11 @@ require_once dirname(__DIR__).'/bootstrap.php'; echo "=== PostgreSQL Hybrid Search Demo ===\n\n"; -echo "This example demonstrates how to configure the semantic ratio to balance\n"; -echo "between semantic (vector) search and PostgreSQL Full-Text Search.\n\n"; +echo "Demonstrates HybridStore with configurable search strategies:\n"; +echo "- Native PostgreSQL FTS vs BM25\n"; +echo "- Semantic ratio adjustment\n"; +echo "- Custom RRF scoring\n\n"; -// Initialize the hybrid store with balanced search (50/50) $connection = DriverManager::getConnection((new DsnParser())->parse(env('POSTGRES_URI'))); $pdo = $connection->getNativeConnection(); @@ -36,10 +40,14 @@ throw new RuntimeException('Unable to get native PDO connection from Doctrine DBAL.'); } +echo "=== Using BM25 Text Search Strategy ===\n\n"; + $store = new HybridStore( connection: $pdo, tableName: 'hybrid_movies', - semanticRatio: 0.5, // Balanced hybrid search by default + textSearchStrategy: new Bm25TextSearchStrategy('en'), + rrf: new ReciprocalRankFusion(k: 60, normalizeScores: true), + semanticRatio: 0.5, ); // Create embeddings and documents @@ -119,9 +127,42 @@ // Cleanup $store->drop(); -echo "=== Summary ===\n"; -echo "- semanticRatio = 0.0: Best for exact keyword matches (PostgreSQL FTS)\n"; -echo "- 
semanticRatio = 0.5: Balanced approach using RRF (Reciprocal Rank Fusion)\n"; -echo "- semanticRatio = 1.0: Best for conceptual similarity searches (pgvector)\n"; -echo "\nYou can set the default ratio when instantiating the HybridStore,\n"; -echo "and override it per query using the 'semanticRatio' option.\n"; +echo "=== Comparing with Native PostgreSQL FTS ===\n\n"; + +$storeFts = new HybridStore( + connection: $pdo, + tableName: 'hybrid_movies_fts', + textSearchStrategy: new PostgresTextSearchStrategy(), + semanticRatio: 0.5, +); + +$storeFts->setup(); +$indexer = new Indexer(new InMemoryLoader($documents), $vectorizer, $storeFts, logger: logger()); +$indexer->index($documents); + +$resultsFts = $storeFts->query($queryEmbedding, [ + 'semanticRatio' => 0.5, + 'q' => 'technology', + 'limit' => 3, +]); + +echo "Top 3 results (Native FTS):\n"; +foreach ($resultsFts as $i => $result) { + $metadata = $result->metadata->getArrayCopy(); + echo sprintf( + " %d. %s (Score: %.4f)\n", + $i + 1, + $metadata['title'] ?? 'Unknown', + $result->score ?? 
0.0 + ); +} + +$storeFts->drop(); + +echo "\n=== Summary ===\n"; +echo "- semanticRatio = 0.0: Pure keyword matching\n"; +echo "- semanticRatio = 0.5: Balanced hybrid (RRF)\n"; +echo "- semanticRatio = 1.0: Pure semantic search\n"; +echo "\nText Search Strategies:\n"; +echo "- PostgresTextSearchStrategy: Native FTS (ts_rank_cd)\n"; +echo "- Bm25TextSearchStrategy: BM25 ranking (requires pg_bm25 extension)\n"; diff --git a/src/store/src/Bridge/Postgres/HybridStore.php b/src/store/src/Bridge/Postgres/HybridStore.php index ff7ac57b8..d1dbc9aca 100644 --- a/src/store/src/Bridge/Postgres/HybridStore.php +++ b/src/store/src/Bridge/Postgres/HybridStore.php @@ -41,20 +41,20 @@ final class HybridStore implements ManagedStoreInterface, StoreInterface private readonly TextSearchStrategyInterface $textSearchStrategy; /** - * @param string $vectorFieldName Name of the vector field - * @param string $contentFieldName Name of the text field for FTS - * @param float $semanticRatio Ratio between semantic and keyword search (0.0 to 1.0) - * @param Distance $distance Distance metric for vector similarity - * @param string $language PostgreSQL text search configuration - * @param TextSearchStrategyInterface|null $textSearchStrategy Text search strategy (defaults to native PostgreSQL) - * @param ReciprocalRankFusion|null $rrf RRF calculator (defaults to k=60, normalized) - * @param float|null $defaultMaxScore Default max distance for vector search - * @param float|null $defaultMinScore Default min RRF score threshold - * @param float $fuzzyPrimaryThreshold Primary threshold for fuzzy matching - * @param float $fuzzySecondaryThreshold Secondary threshold for fuzzy matching - * @param float $fuzzyStrictThreshold Strict threshold for double validation - * @param float $fuzzyWeight Weight of fuzzy matching (0.0 to 1.0) - * @param array $searchableAttributes Searchable attributes with boosting config + * @param string $vectorFieldName Name of the vector field + * @param string 
$contentFieldName Name of the text field for FTS + * @param float $semanticRatio Ratio between semantic and keyword search (0.0 to 1.0) + * @param Distance $distance Distance metric for vector similarity + * @param string $language PostgreSQL text search configuration + * @param TextSearchStrategyInterface|null $textSearchStrategy Text search strategy (defaults to native PostgreSQL) + * @param ReciprocalRankFusion|null $rrf RRF calculator (defaults to k=60, normalized) + * @param float|null $defaultMaxScore Default max distance for vector search + * @param float|null $defaultMinScore Default min RRF score threshold + * @param float $fuzzyPrimaryThreshold Primary threshold for fuzzy matching + * @param float $fuzzySecondaryThreshold Secondary threshold for fuzzy matching + * @param float $fuzzyStrictThreshold Strict threshold for double validation + * @param float $fuzzyWeight Weight of fuzzy matching (0.0 to 1.0) + * @param array $searchableAttributes Searchable attributes with boosting config */ public function __construct( private readonly \PDO $connection, @@ -75,17 +75,11 @@ public function __construct( private readonly array $searchableAttributes = [], ) { if ($semanticRatio < 0.0 || $semanticRatio > 1.0) { - throw new InvalidArgumentException(\sprintf( - 'The semantic ratio must be between 0.0 and 1.0, "%s" given.', - $semanticRatio - )); + throw new InvalidArgumentException(\sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $semanticRatio)); } if ($fuzzyWeight < 0.0 || $fuzzyWeight > 1.0) { - throw new InvalidArgumentException(\sprintf( - 'The fuzzy weight must be between 0.0 and 1.0, "%s" given.', - $fuzzyWeight - )); + throw new InvalidArgumentException(\sprintf('The fuzzy weight must be between 0.0 and 1.0, "%s" given.', $fuzzyWeight)); } $this->textSearchStrategy = $textSearchStrategy ?? 
new PostgresTextSearchStrategy(); @@ -314,11 +308,11 @@ private function createSearchTextTrigger(): void $this->connection->exec( \sprintf( - "DROP TRIGGER IF EXISTS trigger_update_search_text ON %s; + 'DROP TRIGGER IF EXISTS trigger_update_search_text ON %s; CREATE TRIGGER trigger_update_search_text BEFORE INSERT OR UPDATE ON %s FOR EACH ROW - EXECUTE FUNCTION update_search_text();", + EXECUTE FUNCTION update_search_text();', $this->tableName, $this->tableName, ), @@ -353,10 +347,7 @@ private function createTsvectorIndexes(): void private function validateSemanticRatio(float $ratio): float { if ($ratio < 0.0 || $ratio > 1.0) { - throw new InvalidArgumentException(\sprintf( - 'The semantic ratio must be between 0.0 and 1.0, "%s" given.', - $ratio - )); + throw new InvalidArgumentException(\sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $ratio)); } return $ratio; diff --git a/src/store/src/Bridge/Postgres/ReciprocalRankFusion.php b/src/store/src/Bridge/Postgres/ReciprocalRankFusion.php index cf9b2f0ef..26855b970 100644 --- a/src/store/src/Bridge/Postgres/ReciprocalRankFusion.php +++ b/src/store/src/Bridge/Postgres/ReciprocalRankFusion.php @@ -36,8 +36,8 @@ public function __construct( /** * Calculate RRF score for a single result with multiple rankings. * - * @param array $rankings - * Each entry contains: rank (1-based), score (normalized 0-1), weight (0-1) + * @param array $rankings + * Each entry contains: rank (1-based or null), score (normalized 0-1), weight (0-1) * * @return float The combined RRF score */ @@ -142,7 +142,7 @@ public function buildCombinedSqlExpression(array $sources): string ); } - return '(' . implode(' + ', $expressions) . 
')'; + return '('.implode(' + ', $expressions).')'; } public function getK(): int diff --git a/src/store/src/Bridge/Postgres/TextSearch/Bm25TextSearchStrategy.php b/src/store/src/Bridge/Postgres/TextSearch/Bm25TextSearchStrategy.php index 7d4dc16ca..136338fb3 100644 --- a/src/store/src/Bridge/Postgres/TextSearch/Bm25TextSearchStrategy.php +++ b/src/store/src/Bridge/Postgres/TextSearch/Bm25TextSearchStrategy.php @@ -132,7 +132,7 @@ public function isAvailable(\PDO $connection): bool "SELECT 1 FROM pg_proc WHERE proname = 'bm25topk' LIMIT 1" ); - return $stmt->fetchColumn() !== false; + return false !== $stmt->fetchColumn(); } catch (\PDOException) { return false; } diff --git a/src/store/tests/Bridge/Postgres/HybridStoreTest.php b/src/store/tests/Bridge/Postgres/HybridStoreTest.php index 204e12585..d3c569e58 100644 --- a/src/store/tests/Bridge/Postgres/HybridStoreTest.php +++ b/src/store/tests/Bridge/Postgres/HybridStoreTest.php @@ -26,7 +26,7 @@ final class HybridStoreTest extends TestCase { - public function testConstructorValidatesSemanticRatio(): void + public function testConstructorValidatesSemanticRatio() { $this->expectException(InvalidArgumentException::class); $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); @@ -35,7 +35,7 @@ public function testConstructorValidatesSemanticRatio(): void new HybridStore($pdo, 'test_table', semanticRatio: 1.5); } - public function testConstructorValidatesNegativeSemanticRatio(): void + public function testConstructorValidatesNegativeSemanticRatio() { $this->expectException(InvalidArgumentException::class); $this->expectExceptionMessage('The semantic ratio must be between 0.0 and 1.0'); @@ -44,7 +44,7 @@ public function testConstructorValidatesNegativeSemanticRatio(): void new HybridStore($pdo, 'test_table', semanticRatio: -0.5); } - public function testConstructorValidatesFuzzyWeight(): void + public function testConstructorValidatesFuzzyWeight() { 
$this->expectException(InvalidArgumentException::class); $this->expectExceptionMessage('The fuzzy weight must be between 0.0 and 1.0'); @@ -53,7 +53,7 @@ public function testConstructorValidatesFuzzyWeight(): void new HybridStore($pdo, 'test_table', fuzzyWeight: 1.5); } - public function testConstructorUsesDefaultTextSearchStrategy(): void + public function testConstructorUsesDefaultTextSearchStrategy() { $pdo = $this->createMock(\PDO::class); $store = new HybridStore($pdo, 'test_table'); @@ -61,7 +61,7 @@ public function testConstructorUsesDefaultTextSearchStrategy(): void $this->assertInstanceOf(PostgresTextSearchStrategy::class, $store->getTextSearchStrategy()); } - public function testConstructorUsesCustomTextSearchStrategy(): void + public function testConstructorUsesCustomTextSearchStrategy() { $pdo = $this->createMock(\PDO::class); $customStrategy = new Bm25TextSearchStrategy(); @@ -70,7 +70,7 @@ public function testConstructorUsesCustomTextSearchStrategy(): void $this->assertSame($customStrategy, $store->getTextSearchStrategy()); } - public function testConstructorUsesDefaultRrf(): void + public function testConstructorUsesDefaultRrf() { $pdo = $this->createMock(\PDO::class); $store = new HybridStore($pdo, 'test_table'); @@ -79,7 +79,7 @@ public function testConstructorUsesDefaultRrf(): void $this->assertSame(60, $store->getRrf()->getK()); } - public function testConstructorUsesCustomRrf(): void + public function testConstructorUsesCustomRrf() { $pdo = $this->createMock(\PDO::class); $customRrf = new ReciprocalRankFusion(k: 100, normalizeScores: false); @@ -89,7 +89,7 @@ public function testConstructorUsesCustomRrf(): void $this->assertSame(100, $store->getRrf()->getK()); } - public function testSetupCreatesTableWithFullTextSearchSupport(): void + public function testSetupCreatesTableWithFullTextSearchSupport() { $pdo = $this->createMock(\PDO::class); $store = new HybridStore($pdo, 'hybrid_table'); @@ -135,7 +135,7 @@ public function 
testSetupCreatesTableWithFullTextSearchSupport(): void $store->setup(); } - public function testSetupExecutesTextSearchStrategySetupSql(): void + public function testSetupExecutesTextSearchStrategySetupSql() { $pdo = $this->createMock(\PDO::class); @@ -150,7 +150,7 @@ public function testSetupExecutesTextSearchStrategySetupSql(): void $store = new HybridStore($pdo, 'hybrid_table', textSearchStrategy: $mockStrategy); $execCalls = []; - $pdo->expects($this->atLeast(1)) + $pdo->expects($this->any()) ->method('exec') ->willReturnCallback(function (string $sql) use (&$execCalls): int { $execCalls[] = $sql; @@ -161,9 +161,10 @@ public function testSetupExecutesTextSearchStrategySetupSql(): void $store->setup(); $this->assertContains('CREATE INDEX custom_idx ON hybrid_table USING gin(content)', $execCalls); + $this->assertNotEmpty($execCalls, 'Expected at least one exec() call'); } - public function testAddDocument(): void + public function testAddDocument() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -200,7 +201,7 @@ public function testAddDocument(): void $store->add($document); } - public function testAddMultipleDocuments(): void + public function testAddMultipleDocuments() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -240,7 +241,7 @@ public function testAddMultipleDocuments(): void $store->add($document1, $document2); } - public function testPureVectorSearch(): void + public function testPureVectorSearch() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -287,7 +288,7 @@ public function testPureVectorSearch(): void $this->assertSame(0.05, $results[0]->score); } - public function testPureKeywordSearchWithPostgresStrategy(): void + public function testPureKeywordSearchWithPostgresStrategy() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -343,7 +344,7 @@ public 
function testPureKeywordSearchWithPostgresStrategy(): void $this->assertInstanceOf(NullVector::class, $results[0]->vector); } - public function testPureKeywordSearchWithBm25Strategy(): void + public function testPureKeywordSearchWithBm25Strategy() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -400,7 +401,7 @@ public function testPureKeywordSearchWithBm25Strategy(): void $this->assertSame(0.5, $results[0]->score); } - public function testHybridSearchWithRRF(): void + public function testHybridSearchWithRRF() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -455,7 +456,7 @@ public function testHybridSearchWithRRF(): void $this->assertSame(0.025, $results[0]->score); } - public function testQueryWithDefaultMaxScore(): void + public function testQueryWithDefaultMaxScore() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -494,7 +495,7 @@ public function testQueryWithDefaultMaxScore(): void $this->assertCount(0, $results); } - public function testQueryWithMaxScoreOverride(): void + public function testQueryWithMaxScoreOverride() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -527,7 +528,7 @@ public function testQueryWithMaxScoreOverride(): void $this->assertCount(0, $results); } - public function testQueryWithMinScoreFilter(): void + public function testQueryWithMinScoreFilter() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -576,7 +577,7 @@ public function testQueryWithMinScoreFilter(): void $this->assertSame(0.8, $results[0]->score); } - public function testQueryWithCustomRRFK(): void + public function testQueryWithCustomRRFK() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -605,7 +606,7 @@ public function testQueryWithCustomRRFK(): void $store->query(new Vector([0.1, 
0.2, 0.3]), ['q' => 'test']); } - public function testQueryInvalidSemanticRatioInOptions(): void + public function testQueryInvalidSemanticRatioInOptions() { $pdo = $this->createMock(\PDO::class); $store = new HybridStore($pdo, 'hybrid_table'); @@ -616,7 +617,7 @@ public function testQueryInvalidSemanticRatioInOptions(): void $store->query(new Vector([0.1, 0.2, 0.3]), ['semanticRatio' => 1.5]); } - public function testDrop(): void + public function testDrop() { $pdo = $this->createMock(\PDO::class); $store = new HybridStore($pdo, 'hybrid_table'); @@ -628,7 +629,7 @@ public function testDrop(): void $store->drop(); } - public function testQueryWithCustomLimit(): void + public function testQueryWithCustomLimit() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -655,7 +656,7 @@ public function testQueryWithCustomLimit(): void $store->query(new Vector([0.1, 0.2, 0.3]), ['limit' => 10]); } - public function testPureKeywordSearchReturnsEmptyWhenNoMatch(): void + public function testPureKeywordSearchReturnsEmptyWhenNoMatch() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -679,7 +680,7 @@ public function testPureKeywordSearchReturnsEmptyWhenNoMatch(): void $this->assertCount(0, $results); } - public function testFuzzyMatchingWithWordSimilarity(): void + public function testFuzzyMatchingWithWordSimilarity() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -718,7 +719,7 @@ public function testFuzzyMatchingWithWordSimilarity(): void $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'test']); } - public function testSearchableAttributesWithBoost(): void + public function testSearchableAttributesWithBoost() { $pdo = $this->createMock(\PDO::class); @@ -758,7 +759,7 @@ public function testSearchableAttributesWithBoost(): void $store->setup(); } - public function testFuzzyWeightParameter(): void + public function 
testFuzzyWeightParameter() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -787,7 +788,7 @@ public function testFuzzyWeightParameter(): void $store->query(new Vector([0.1, 0.2, 0.3]), ['q' => 'test']); } - public function testBoostFieldsApplied(): void + public function testBoostFieldsApplied() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -840,7 +841,7 @@ public function testBoostFieldsApplied(): void $this->assertSame(0.6, $results[1]->score); } - public function testScoreBreakdownIncluded(): void + public function testScoreBreakdownIncluded() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -897,7 +898,7 @@ public function testScoreBreakdownIncluded(): void $this->assertSame(0.7, $breakdown['fuzzy_score']); } - public function testNullVectorForFtsOnlyResults(): void + public function testNullVectorForFtsOnlyResults() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); @@ -931,7 +932,7 @@ public function testNullVectorForFtsOnlyResults(): void $this->assertInstanceOf(NullVector::class, $results[0]->vector); } - public function testScoreNormalization(): void + public function testScoreNormalization() { $pdo = $this->createMock(\PDO::class); $statement = $this->createMock(\PDOStatement::class); diff --git a/src/store/tests/Bridge/Postgres/ReciprocalRankFusionTest.php b/src/store/tests/Bridge/Postgres/ReciprocalRankFusionTest.php index 9aa282c2e..e6b588684 100644 --- a/src/store/tests/Bridge/Postgres/ReciprocalRankFusionTest.php +++ b/src/store/tests/Bridge/Postgres/ReciprocalRankFusionTest.php @@ -16,7 +16,7 @@ final class ReciprocalRankFusionTest extends TestCase { - public function testDefaultConstruction(): void + public function testDefaultConstruction() { $rrf = new ReciprocalRankFusion(); @@ -24,7 +24,7 @@ public function testDefaultConstruction(): void 
$this->assertTrue($rrf->isNormalized()); } - public function testCustomConstruction(): void + public function testCustomConstruction() { $rrf = new ReciprocalRankFusion(k: 100, normalizeScores: false); @@ -32,7 +32,7 @@ public function testCustomConstruction(): void $this->assertFalse($rrf->isNormalized()); } - public function testCalculateSingleRanking(): void + public function testCalculateSingleRanking() { $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); @@ -44,7 +44,7 @@ public function testCalculateSingleRanking(): void $this->assertEqualsWithDelta(1 / 61, $score, 0.0001); } - public function testCalculateMultipleRankings(): void + public function testCalculateMultipleRankings() { $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); @@ -58,7 +58,7 @@ public function testCalculateMultipleRankings(): void $this->assertEqualsWithDelta($expected, $score, 0.0001); } - public function testCalculateSkipsNullRank(): void + public function testCalculateSkipsNullRank() { $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); @@ -72,7 +72,7 @@ public function testCalculateSkipsNullRank(): void $this->assertEqualsWithDelta($expected, $score, 0.0001); } - public function testCalculateWithNormalization(): void + public function testCalculateWithNormalization() { $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: true); @@ -84,7 +84,7 @@ public function testCalculateWithNormalization(): void $this->assertEqualsWithDelta(100.0, $score, 0.01); } - public function testCalculateContribution(): void + public function testCalculateContribution() { $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); @@ -94,7 +94,7 @@ public function testCalculateContribution(): void $this->assertEqualsWithDelta($expected, $contribution, 0.0001); } - public function testNormalize(): void + public function testNormalize() { $rrf = new ReciprocalRankFusion(k: 60); @@ -104,7 +104,7 @@ public function testNormalize(): void 
$this->assertEqualsWithDelta(100.0, $normalized, 0.01); } - public function testDenormalize(): void + public function testDenormalize() { $rrf = new ReciprocalRankFusion(k: 60); @@ -113,7 +113,7 @@ public function testDenormalize(): void $this->assertEqualsWithDelta(1 / 61, $denormalized, 0.0001); } - public function testNormalizeAndDenormalizeAreInverse(): void + public function testNormalizeAndDenormalizeAreInverse() { $rrf = new ReciprocalRankFusion(k: 60); @@ -124,7 +124,7 @@ public function testNormalizeAndDenormalizeAreInverse(): void $this->assertEqualsWithDelta($original, $denormalized, 0.0001); } - public function testBuildSqlExpression(): void + public function testBuildSqlExpression() { $rrf = new ReciprocalRankFusion(k: 60); @@ -140,7 +140,7 @@ public function testBuildSqlExpression(): void $this->assertStringContainsString(', 0.0)', $sql); } - public function testBuildSqlExpressionWithCustomNullDefault(): void + public function testBuildSqlExpressionWithCustomNullDefault() { $rrf = new ReciprocalRankFusion(k: 60); @@ -154,7 +154,7 @@ public function testBuildSqlExpressionWithCustomNullDefault(): void $this->assertStringContainsString(', -1.0)', $sql); } - public function testBuildCombinedSqlExpression(): void + public function testBuildCombinedSqlExpression() { $rrf = new ReciprocalRankFusion(k: 60); @@ -169,7 +169,7 @@ public function testBuildCombinedSqlExpression(): void $this->assertStringContainsString('60 + f.rank', $sql); } - public function testDifferentKValues(): void + public function testDifferentKValues() { $rrf60 = new ReciprocalRankFusion(k: 60, normalizeScores: false); $rrf100 = new ReciprocalRankFusion(k: 100, normalizeScores: false); @@ -189,7 +189,7 @@ public function testDifferentKValues(): void $this->assertEqualsWithDelta(1 / 101, $score100, 0.0001); } - public function testWeightAffectsScore(): void + public function testWeightAffectsScore() { $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false); @@ -204,7 +204,7 @@ 
public function testWeightAffectsScore(): void $this->assertEqualsWithDelta($scoreFullWeight / 2, $scoreHalfWeight, 0.0001); } - public function testLowerRankGivesLowerScore(): void + public function testLowerRankGivesLowerScore() { $rrf = new ReciprocalRankFusion(k: 60, normalizeScores: false);