fix: ensure no duplicate (content, source) pairs in upsert

This commit is contained in:
Sosokker 2025-06-28 00:05:48 +07:00
parent aec6e10f90
commit fbc95ce40e
2 changed files with 24 additions and 5 deletions

View File

@ -30,6 +30,10 @@ class PGVectorStore(VectorDB):
dbname=settings.DB_NAME, dbname=settings.DB_NAME,
) )
def to_pgvector_str(self, vec: np.ndarray) -> str:
    """Format a vector as a bracketed, comma-separated string.

    The input is coerced to a float array and flattened, so nested or
    multi-dimensional input is emitted as one flat list of components,
    e.g. ``[[1, 2], [3, 4]]`` becomes ``"[1.0,2.0,3.0,4.0]"``.

    Args:
        vec: Array-like of numeric values (NumPy array or plain sequence).

    Returns:
        A string of the form ``"[x1,x2,...]"`` with no spaces; an empty
        input yields ``"[]"``.

    Raises:
        ValueError: If ``vec`` cannot be converted to a float array.
    """
    # asarray/ravel avoid copying when ``vec`` is already a contiguous
    # float array; the values produced are the same as array/flatten.
    flat = np.asarray(vec, dtype=float).ravel()
    # tolist() yields built-in Python floats, so str() gives the plain
    # float repr rather than a NumPy scalar representation.
    return "[" + ",".join(str(component) for component in flat.tolist()) + "]"
def upsert_documents(self, documents: list[dict]) -> None: def upsert_documents(self, documents: list[dict]) -> None:
""" """
Upsert documents into the vector store. Upsert documents into the vector store.
@ -53,9 +57,22 @@ class PGVectorStore(VectorDB):
logger.error(f"Invalid document structure: {doc}") logger.error(f"Invalid document structure: {doc}")
raise ValueError(err) raise ValueError(err)
seen = set()
unique_docs = []
for doc in documents:
key = (doc["content"], doc["source"])
if key not in seen:
seen.add(key)
unique_docs.append(doc)
if len(unique_docs) < len(documents):
logger.warning(
"Duplicate (content, source) pairs found and removed before upsert."
)
data_to_insert = [ data_to_insert = [
(doc["content"], np.array(doc["embedding"]), doc["source"]) (doc["content"], self.to_pgvector_str(doc["embedding"]), doc["source"])
for doc in documents for doc in unique_docs
] ]
query = """ query = """
@ -84,7 +101,7 @@ class PGVectorStore(VectorDB):
logger.exception(f"Unexpected error during upsert: {e}") logger.exception(f"Unexpected error during upsert: {e}")
raise raise
def search(self, vector: list, top_k: int = 5) -> list[SearchResult]: def search(self, vector: np.ndarray, top_k: int = 5) -> list[SearchResult]:
""" """
Search for similar documents using vector similarity. Search for similar documents using vector similarity.

View File

@ -95,9 +95,11 @@ def create_documents_table(conn: pg_connection, cursor: pg_cursor) -> None:
CREATE TABLE IF NOT EXISTS documents ( CREATE TABLE IF NOT EXISTS documents (
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
content TEXT NOT NULL, content TEXT NOT NULL,
embedding VECTOR(384), -- Match the dimension of your embedding model embedding VECTOR(384),
source VARCHAR(255), source VARCHAR(255),
created_at TIMESTAMPTZ DEFAULT NOW() created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
CONSTRAINT unique_content_source UNIQUE (content, source)
); );
""") """)
conn.commit() conn.commit()