mirror of
https://github.com/Sosokker/plain-rag.git
synced 2025-12-18 06:34:03 +01:00
fix: ensure no duplication of (content, source) in upsert
This commit is contained in:
parent
aec6e10f90
commit
fbc95ce40e
@ -30,6 +30,10 @@ class PGVectorStore(VectorDB):
|
||||
dbname=settings.DB_NAME,
|
||||
)
|
||||
|
||||
def to_pgvector_str(self, vec: np.ndarray):
|
||||
arr = np.array(vec, dtype=float).flatten()
|
||||
return f"[{','.join(str(float(x)) for x in arr)}]"
|
||||
|
||||
def upsert_documents(self, documents: list[dict]) -> None:
|
||||
"""
|
||||
Upsert documents into the vector store.
|
||||
@ -53,9 +57,22 @@ class PGVectorStore(VectorDB):
|
||||
logger.error(f"Invalid document structure: {doc}")
|
||||
raise ValueError(err)
|
||||
|
||||
seen = set()
|
||||
unique_docs = []
|
||||
for doc in documents:
|
||||
key = (doc["content"], doc["source"])
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique_docs.append(doc)
|
||||
|
||||
if len(unique_docs) < len(documents):
|
||||
logger.warning(
|
||||
"Duplicate (content, source) pairs found and removed before upsert."
|
||||
)
|
||||
|
||||
data_to_insert = [
|
||||
(doc["content"], np.array(doc["embedding"]), doc["source"])
|
||||
for doc in documents
|
||||
(doc["content"], self.to_pgvector_str(doc["embedding"]), doc["source"])
|
||||
for doc in unique_docs
|
||||
]
|
||||
|
||||
query = """
|
||||
@ -84,7 +101,7 @@ class PGVectorStore(VectorDB):
|
||||
logger.exception(f"Unexpected error during upsert: {e}")
|
||||
raise
|
||||
|
||||
def search(self, vector: list, top_k: int = 5) -> list[SearchResult]:
|
||||
def search(self, vector: np.ndarray, top_k: int = 5) -> list[SearchResult]:
|
||||
"""
|
||||
Search for similar documents using vector similarity.
|
||||
|
||||
|
||||
@ -95,9 +95,11 @@ def create_documents_table(conn: pg_connection, cursor: pg_cursor) -> None:
|
||||
CREATE TABLE IF NOT EXISTS documents (
|
||||
id SERIAL PRIMARY KEY,
|
||||
content TEXT NOT NULL,
|
||||
embedding VECTOR(384), -- Match the dimension of your embedding model
|
||||
embedding VECTOR(384),
|
||||
source VARCHAR(255),
|
||||
created_at TIMESTAMPTZ DEFAULT NOW()
|
||||
created_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
CONSTRAINT unique_content_source UNIQUE (content, source)
|
||||
);
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user