mirror of
https://github.com/Sosokker/plain-rag.git
synced 2025-12-18 14:34:05 +01:00
fix: ensure no duplicate (content, source) pairs in upsert
This commit is contained in:
parent
aec6e10f90
commit
fbc95ce40e
@ -30,6 +30,10 @@ class PGVectorStore(VectorDB):
|
|||||||
dbname=settings.DB_NAME,
|
dbname=settings.DB_NAME,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def to_pgvector_str(self, vec: np.ndarray):
|
||||||
|
arr = np.array(vec, dtype=float).flatten()
|
||||||
|
return f"[{','.join(str(float(x)) for x in arr)}]"
|
||||||
|
|
||||||
def upsert_documents(self, documents: list[dict]) -> None:
|
def upsert_documents(self, documents: list[dict]) -> None:
|
||||||
"""
|
"""
|
||||||
Upsert documents into the vector store.
|
Upsert documents into the vector store.
|
||||||
@ -53,9 +57,22 @@ class PGVectorStore(VectorDB):
|
|||||||
logger.error(f"Invalid document structure: {doc}")
|
logger.error(f"Invalid document structure: {doc}")
|
||||||
raise ValueError(err)
|
raise ValueError(err)
|
||||||
|
|
||||||
|
seen = set()
|
||||||
|
unique_docs = []
|
||||||
|
for doc in documents:
|
||||||
|
key = (doc["content"], doc["source"])
|
||||||
|
if key not in seen:
|
||||||
|
seen.add(key)
|
||||||
|
unique_docs.append(doc)
|
||||||
|
|
||||||
|
if len(unique_docs) < len(documents):
|
||||||
|
logger.warning(
|
||||||
|
"Duplicate (content, source) pairs found and removed before upsert."
|
||||||
|
)
|
||||||
|
|
||||||
data_to_insert = [
|
data_to_insert = [
|
||||||
(doc["content"], np.array(doc["embedding"]), doc["source"])
|
(doc["content"], self.to_pgvector_str(doc["embedding"]), doc["source"])
|
||||||
for doc in documents
|
for doc in unique_docs
|
||||||
]
|
]
|
||||||
|
|
||||||
query = """
|
query = """
|
||||||
@ -84,7 +101,7 @@ class PGVectorStore(VectorDB):
|
|||||||
logger.exception(f"Unexpected error during upsert: {e}")
|
logger.exception(f"Unexpected error during upsert: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def search(self, vector: list, top_k: int = 5) -> list[SearchResult]:
|
def search(self, vector: np.ndarray, top_k: int = 5) -> list[SearchResult]:
|
||||||
"""
|
"""
|
||||||
Search for similar documents using vector similarity.
|
Search for similar documents using vector similarity.
|
||||||
|
|
||||||
|
|||||||
@ -95,9 +95,11 @@ def create_documents_table(conn: pg_connection, cursor: pg_cursor) -> None:
|
|||||||
CREATE TABLE IF NOT EXISTS documents (
|
CREATE TABLE IF NOT EXISTS documents (
|
||||||
id SERIAL PRIMARY KEY,
|
id SERIAL PRIMARY KEY,
|
||||||
content TEXT NOT NULL,
|
content TEXT NOT NULL,
|
||||||
embedding VECTOR(384), -- Match the dimension of your embedding model
|
embedding VECTOR(384),
|
||||||
source VARCHAR(255),
|
source VARCHAR(255),
|
||||||
created_at TIMESTAMPTZ DEFAULT NOW()
|
created_at TIMESTAMPTZ DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ DEFAULT NOW(),
|
||||||
|
CONSTRAINT unique_content_source UNIQUE (content, source)
|
||||||
);
|
);
|
||||||
""")
|
""")
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user