diff --git a/app/services/vector_stores.py b/app/services/vector_stores.py
index 903085b..d07183b 100644
--- a/app/services/vector_stores.py
+++ b/app/services/vector_stores.py
@@ -30,6 +30,10 @@ class PGVectorStore(VectorDB):
             dbname=settings.DB_NAME,
         )
 
+    def to_pgvector_str(self, vec: np.ndarray):
+        arr = np.array(vec, dtype=float).flatten()
+        return f"[{','.join(str(float(x)) for x in arr)}]"
+
     def upsert_documents(self, documents: list[dict]) -> None:
         """
         Upsert documents into the vector store.
@@ -53,9 +57,22 @@ class PGVectorStore(VectorDB):
                 logger.error(f"Invalid document structure: {doc}")
                 raise ValueError(err)
 
+        seen = set()
+        unique_docs = []
+        for doc in documents:
+            key = (doc["content"], doc["source"])
+            if key not in seen:
+                seen.add(key)
+                unique_docs.append(doc)
+
+        if len(unique_docs) < len(documents):
+            logger.warning(
+                "Duplicate (content, source) pairs found and removed before upsert."
+            )
+
         data_to_insert = [
-            (doc["content"], np.array(doc["embedding"]), doc["source"])
-            for doc in documents
+            (doc["content"], self.to_pgvector_str(doc["embedding"]), doc["source"])
+            for doc in unique_docs
         ]
 
         query = """
@@ -84,7 +101,7 @@ class PGVectorStore(VectorDB):
             logger.exception(f"Unexpected error during upsert: {e}")
             raise
 
-    def search(self, vector: list, top_k: int = 5) -> list[SearchResult]:
+    def search(self, vector: np.ndarray, top_k: int = 5) -> list[SearchResult]:
         """
         Search for similar documents using vector similarity.
 
diff --git a/scripts/create_tables.py b/scripts/create_tables.py
index 39a9d4a..29bbff2 100644
--- a/scripts/create_tables.py
+++ b/scripts/create_tables.py
@@ -95,9 +95,11 @@ def create_documents_table(conn: pg_connection, cursor: pg_cursor) -> None:
         CREATE TABLE IF NOT EXISTS documents (
             id SERIAL PRIMARY KEY,
             content TEXT NOT NULL,
-            embedding VECTOR(384), -- Match the dimension of your embedding model
+            embedding VECTOR(384),
             source VARCHAR(255),
-            created_at TIMESTAMPTZ DEFAULT NOW()
+            created_at TIMESTAMPTZ DEFAULT NOW(),
+            updated_at TIMESTAMPTZ DEFAULT NOW(),
+            CONSTRAINT unique_content_source UNIQUE (content, source)
         );
     """)
     conn.commit()
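
For reviewers, a quick sketch of the new behavior. The standalone function below mirrors the to_pgvector_str helper added in this diff, and the sample documents are made up for illustration; it shows the pgvector text literal the helper produces and how the (content, source) dedup pass collapses duplicates before the upsert.

import numpy as np

def to_pgvector_str(vec: np.ndarray) -> str:
    # Same logic as the new PGVectorStore helper: flatten the embedding and
    # render it as a pgvector text literal such as "[1.0,2.0,3.0]".
    arr = np.array(vec, dtype=float).flatten()
    return f"[{','.join(str(float(x)) for x in arr)}]"

# Hypothetical documents; the second entry duplicates the first (content, source) pair.
documents = [
    {"content": "hello world", "source": "a.txt", "embedding": np.ones(3)},
    {"content": "hello world", "source": "a.txt", "embedding": np.ones(3)},
]

# Same dedup pass as upsert_documents: keep the first occurrence of each pair.
seen, unique_docs = set(), []
for doc in documents:
    key = (doc["content"], doc["source"])
    if key not in seen:
        seen.add(key)
        unique_docs.append(doc)

print(to_pgvector_str(unique_docs[0]["embedding"]))  # [1.0,1.0,1.0]
print(len(unique_docs))                              # 1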