mirror of
https://github.com/Sosokker/plain-rag.git
synced 2025-12-18 14:34:05 +01:00
feat: add FileTypeIngestionError exception and enforce file type validation in RAG service
This commit is contained in:
parent
aec7ca824c
commit
43f37f7ad2
@ -8,3 +8,7 @@ class DocumentExtractionError(Exception):
|
||||
|
||||
class ModelNotFoundError(Exception):
|
||||
"""Exception raised when the model is not found."""
|
||||
|
||||
|
||||
class FileTypeIngestionError(Exception):
|
||||
"""Exception raised when a user uploads an unsupported file type."""
|
||||
|
||||
@ -5,7 +5,7 @@ from app.core.interfaces import EmbeddingModel
|
||||
|
||||
|
||||
class MiniLMEmbeddingModel(EmbeddingModel):
|
||||
def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
|
||||
def __init__(self, model_name: str):
|
||||
self.model = SentenceTransformer(model_name)
|
||||
|
||||
def embed_documents(self, texts: list[str]) -> list[np.ndarray]:
|
||||
|
||||
@ -9,6 +9,7 @@ from PyPDF2.errors import PyPdfError
|
||||
from structlog import get_logger
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.exception import FileTypeIngestionError
|
||||
from app.core.interfaces import EmbeddingModel, Reranker, VectorDB
|
||||
from app.core.utils import RecursiveCharacterTextSplitter
|
||||
from app.schemas.enums import LLMModelName
|
||||
@ -74,6 +75,8 @@ Answer:"""
|
||||
path = Path(file_path)
|
||||
ext = path.suffix
|
||||
text = ""
|
||||
if ext[1:] not in settings.ALLOWED_DOCUMENT_TYPES:
|
||||
raise FileTypeIngestionError("Only support PDF, MD and TXT files")
|
||||
if ext == ".pdf":
|
||||
try:
|
||||
reader = PdfReader(str(file_path))
|
||||
|
||||
@ -11,7 +11,7 @@ logger = get_logger()
|
||||
|
||||
|
||||
class MiniLMReranker(Reranker):
|
||||
def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
|
||||
def __init__(self, model_name: str):
|
||||
try:
|
||||
self.model = CrossEncoder(model_name)
|
||||
except Exception as er:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user