diff --git a/src/application/api/mcp_bricks_tools.py b/src/application/api/mcp_bricks_tools.py index 34adf6e..a5d8803 100644 --- a/src/application/api/mcp_bricks_tools.py +++ b/src/application/api/mcp_bricks_tools.py @@ -2,6 +2,7 @@ from fastmcp import FastMCP from fastmcp.exceptions import ToolError +from pydantic import BaseModel, Field from application.responses.file_response import FileContentResponse from dependencies import ( @@ -15,8 +16,16 @@ mcp_bricks = FastMCP("RAGAnythingBricks") +class DocumentSummary(BaseModel): + id: str = Field(description="Document ID — pass this to read_bricks_document") + file_name: str = Field(description="Nom du fichier") + mime_type: str = Field(default="", description="Type MIME du fichier") + size: int = Field(default=0, description="Taille en octets") + status: str = Field(default="", description="Statut du document") + + @mcp_bricks.tool() -async def list_bricks_documents(project_unique_id: str) -> list: +async def list_bricks_documents(project_unique_id: str) -> list[DocumentSummary]: """Liste les documents d'un projet Bricks. Args: @@ -27,12 +36,22 @@ async def list_bricks_documents(project_unique_id: str) -> list: """ use_case = get_list_bricks_documents_use_case() try: - return await use_case.execute(project_id=project_unique_id) + documents = await use_case.execute(project_id=project_unique_id) except Exception: logger.exception( "Failed to list bricks documents for project %s", project_unique_id ) raise ToolError("Failed to list bricks documents") from None + return [ + DocumentSummary( + id=doc.id, + file_name=doc.file_name, + mime_type=doc.mime_type, + size=doc.size, + status=doc.status, + ) + for doc in documents + ] @mcp_bricks.tool() @@ -100,4 +119,4 @@ async def publish_section_version( logger.exception( "Failed to publish section version for project %s", project_unique_id ) - raise ToolError("Failed to publish section version") from None + raise ToolError("Failed to publish section version") from None \ No newline at end of file diff --git a/src/application/use_cases/read_bricks_document_use_case.py b/src/application/use_cases/read_bricks_document_use_case.py index 7b46be5..aedbaa3 100644 --- a/src/application/use_cases/read_bricks_document_use_case.py +++ b/src/application/use_cases/read_bricks_document_use_case.py @@ -24,7 +24,7 @@ async def execute( document_id: str, project_id: str, ) -> DocumentContent: - data, filename = await self.bricks_api.download_document( + data, filename, mime_type = await self.bricks_api.download_document( document_id=document_id, project_id=project_id ) @@ -36,7 +36,7 @@ async def execute( async with aiofiles.open(tmp_path, "wb") as f: await f.write(data) - return await self.document_reader.extract_content(tmp_path) + return await self.document_reader.extract_content(tmp_path, mime_type=mime_type) finally: with contextlib.suppress(OSError): os.unlink(tmp_path) diff --git a/src/domain/ports/bricks_api_port.py b/src/domain/ports/bricks_api_port.py index 3a363d0..ee572d9 100644 --- a/src/domain/ports/bricks_api_port.py +++ b/src/domain/ports/bricks_api_port.py @@ -49,7 +49,7 @@ async def download_document( self, document_id: str, project_id: str, - ) -> tuple[bytes, str]: + ) -> tuple[bytes, str, str]: pass @abstractmethod diff --git a/src/domain/ports/document_reader_port.py b/src/domain/ports/document_reader_port.py index a10a8ea..6ed508d 100644 --- a/src/domain/ports/document_reader_port.py +++ b/src/domain/ports/document_reader_port.py @@ -20,5 +20,5 @@ class DocumentContent(BaseModel): class DocumentReaderPort(ABC): @abstractmethod - async def extract_content(self, file_path: str) -> DocumentContent: + async def extract_content(self, file_path: str, mime_type: str = "") -> DocumentContent: pass diff --git a/src/infrastructure/bricks/bricks_api_adapter.py b/src/infrastructure/bricks/bricks_api_adapter.py index 7adc413..7d18f60 100644 --- a/src/infrastructure/bricks/bricks_api_adapter.py +++ b/src/infrastructure/bricks/bricks_api_adapter.py @@ -1,6 +1,7 @@ import asyncio import json import logging +import os import re import urllib.error import urllib.parse @@ -65,12 +66,14 @@ async def download_document( self, document_id: str, project_id: str, - ) -> tuple[bytes, str]: + ) -> tuple[bytes, str, str]: documents = await self.list_project_documents(project_id) url = None + mime_type = "" for doc in documents: if doc.id == document_id and doc.url: url = doc.url + mime_type = doc.mime_type break if not url: raise FileNotFoundError( @@ -96,7 +99,7 @@ async def download_document( raise TimeoutError(f"Document download timed out: {e}") from e filename = _extract_filename(resp_headers.get("Content-Disposition", ""), url) - return body, filename + return body, filename, mime_type async def publish_section_version(self, payload: dict) -> SectionVersionResult: url = f"{self._base_url}/api/section-versions" @@ -123,13 +126,30 @@ async def publish_section_version(self, payload: dict) -> SectionVersionResult: def _extract_filename(content_disposition: str, url: str = "") -> str: match = re.search(r'filename="([^"]+)"', content_disposition) if match: - return match.group(1) + filename = match.group(1) + logger.debug("Filename from Content-Disposition (quoted): %s", filename) + return _normalize_extension(filename) match = re.search(r"filename=([^\s;]+)", content_disposition) if match: - return match.group(1) + filename = match.group(1) + logger.debug("Filename from Content-Disposition (unquoted): %s", filename) + return _normalize_extension(filename) if url: decoded_path = urllib.parse.unquote(urllib.parse.urlparse(url).path) path_filename = decoded_path.rsplit("/", 1)[-1] if path_filename and "." in path_filename: - return path_filename - return "document.bin" \ No newline at end of file + logger.debug("Filename from URL path: %s (url=%s)", path_filename, url[:200]) + return _normalize_extension(path_filename) + logger.warning("Could not extract filename, falling back to document.bin (url=%s)", url[:200] if url else "") + return "document.bin" + + +def _normalize_extension(filename: str) -> str: + name, ext = os.path.splitext(filename) + if not ext or ext == ".": + return filename + cleaned = re.sub(r"^[^a-zA-Z]+", ".", ext.lower()) + if cleaned == ext.lower(): + return filename + logger.warning("Normalized file extension: %s -> %s (filename=%s)", ext, cleaned, filename) + return name + cleaned \ No newline at end of file diff --git a/src/infrastructure/document_reader/kreuzberg_adapter.py b/src/infrastructure/document_reader/kreuzberg_adapter.py index 0264003..d7770ca 100644 --- a/src/infrastructure/document_reader/kreuzberg_adapter.py +++ b/src/infrastructure/document_reader/kreuzberg_adapter.py @@ -62,9 +62,12 @@ class KreuzbergAdapter(DocumentReaderPort): def __init__(self, ocr_mode: str | None = None) -> None: self._config = make_extraction_config(ocr_mode=ocr_mode) - async def extract_content(self, file_path: str) -> DocumentContent: + async def extract_content(self, file_path: str, mime_type: str = "") -> DocumentContent: try: - result = await extract_file(file_path, config=self._config) + kwargs = {"config": self._config} + if mime_type: + kwargs["mime_type"] = mime_type + result = await extract_file(file_path, **kwargs) logger.debug("Full extraction result for %s: %s", file_path, result) except ParsingError as e: raise ValueError(f"Unsupported file format: {e}") from e diff --git a/tests/unit/infrastructure/test_bricks_api_adapter.py b/tests/unit/infrastructure/test_bricks_api_adapter.py index 539c1df..120aeeb 100644 --- a/tests/unit/infrastructure/test_bricks_api_adapter.py +++ b/tests/unit/infrastructure/test_bricks_api_adapter.py @@ -16,6 +16,7 @@ from infrastructure.bricks.bricks_api_adapter import ( BricksApiAdapter, _extract_filename, + _normalize_extension, ) @@ -236,13 +237,14 @@ async def test_resolves_document_id_and_downloads_presigned_url(self, adapter: B file_resp = _mock_urlopen_response(file_body, headers={"Content-Disposition": 'attachment; filename="doc.pdf"'}) with patch("infrastructure.bricks.bricks_api_adapter.urllib.request.urlopen", side_effect=[list_resp, file_resp]) as mock_urlopen: - content, filename = await adapter.download_document(document_id="doc-1", project_id="proj-1") + content, filename, mime_type = await adapter.download_document(document_id="doc-1", project_id="proj-1") assert mock_urlopen.call_count == 2 list_req = mock_urlopen.call_args_list[0][0][0] assert "api/projects/proj-1/documents/ai" in list_req.full_url assert content == b"PDF content" assert filename == "doc.pdf" + assert mime_type == "application/pdf" async def test_extracts_filename_from_url_when_no_content_disposition(self, adapter: BricksApiAdapter) -> None: """Should extract filename from URL path when no Content-Disposition header.""" @@ -261,7 +263,7 @@ async def test_extracts_filename_from_url_when_no_content_disposition(self, adap file_resp = _mock_urlopen_response(b"data", headers={}) with patch("infrastructure.bricks.bricks_api_adapter.urllib.request.urlopen", side_effect=[list_resp, file_resp]): - _, filename = await adapter.download_document(document_id="doc-2", project_id="proj-1") + _, filename, _ = await adapter.download_document(document_id="doc-2", project_id="proj-1") assert filename == "Dossier_Financement.pdf" @@ -281,7 +283,7 @@ async def test_defaults_to_document_bin_when_no_filename(self, adapter: BricksAp file_resp = _mock_urlopen_response(b"data", headers={}) with patch("infrastructure.bricks.bricks_api_adapter.urllib.request.urlopen", side_effect=[list_resp, file_resp]): - _, filename = await adapter.download_document(document_id="doc-3", project_id="proj-1") + _, filename, _ = await adapter.download_document(document_id="doc-3", project_id="proj-1") assert filename == "document.bin" @@ -359,6 +361,28 @@ def test_url_without_extension_falls_back(self) -> None: assert _extract_filename("", "https://s3.example.com/path/noext") == "document.bin" +class TestNormalizeExtension: + """Tests for _normalize_extension helper.""" + + def test_normal_dot_pdf(self) -> None: + assert _normalize_extension("report.pdf") == "report.pdf" + + def test_dot_underscore_pdf(self) -> None: + assert _normalize_extension("report._pdf") == "report.pdf" + + def test_double_dot_extension(self) -> None: + assert _normalize_extension("report..pdf") == "report..pdf" + + def test_no_extension(self) -> None: + assert _normalize_extension("noext") == "noext" + + def test_empty_extension(self) -> None: + assert _normalize_extension("report.") == "report." + + def test_normal_dot_docx(self) -> None: + assert _normalize_extension("file.docx") == "file.docx" + + class TestPublishSectionVersion: """Tests for BricksApiAdapter.publish_section_version.""" diff --git a/tests/unit/test_bricks_api_port.py b/tests/unit/test_bricks_api_port.py index 44ba0c3..ca9de7b 100644 --- a/tests/unit/test_bricks_api_port.py +++ b/tests/unit/test_bricks_api_port.py @@ -203,8 +203,8 @@ async def list_project_documents( async def download_document( self, document_id: str, project_id: str - ) -> tuple[bytes, str]: - return (b"", "file.pdf") + ) -> tuple[bytes, str, str]: + return (b"", "file.pdf", "application/pdf") async def publish_section_version( self, payload: dict diff --git a/tests/unit/test_kreuzberg_adapter.py b/tests/unit/test_kreuzberg_adapter.py index 63a6268..ae15704 100644 --- a/tests/unit/test_kreuzberg_adapter.py +++ b/tests/unit/test_kreuzberg_adapter.py @@ -93,3 +93,34 @@ async def test_extract_content_raises_on_other_kreuzberg_error( with pytest.raises(KreuzbergError): await adapter.extract_content("/tmp/test.pdf") + + @patch("infrastructure.document_reader.kreuzberg_adapter.extract_file") + async def test_extract_content_passes_mime_type(self, mock_extract) -> None: + mock_result = AsyncMock() + mock_result.content = "text" + mock_result.mime_type = "application/pdf" + mock_result.metadata = {} + mock_result.tables = [] + mock_extract.return_value = mock_result + + adapter = KreuzbergAdapter(ocr_mode="tesseract") + result = await adapter.extract_content("/tmp/doc.bin", mime_type="application/pdf") + + mock_extract.assert_awaited_once() + call_kwargs = mock_extract.call_args.kwargs + assert call_kwargs.get("mime_type") == "application/pdf" + + @patch("infrastructure.document_reader.kreuzberg_adapter.extract_file") + async def test_extract_content_without_mime_type(self, mock_extract) -> None: + mock_result = AsyncMock() + mock_result.content = "text" + mock_result.mime_type = "application/pdf" + mock_result.metadata = {} + mock_result.tables = [] + mock_extract.return_value = mock_result + + adapter = KreuzbergAdapter(ocr_mode="tesseract") + await adapter.extract_content("/tmp/test.pdf") + + call_kwargs = mock_extract.call_args.kwargs + assert "mime_type" not in call_kwargs diff --git a/tests/unit/test_read_bricks_document_use_case.py b/tests/unit/test_read_bricks_document_use_case.py index 50f91cd..ce5b413 100644 --- a/tests/unit/test_read_bricks_document_use_case.py +++ b/tests/unit/test_read_bricks_document_use_case.py @@ -25,7 +25,7 @@ async def test_execute_downloads_from_bricks_api( tmp_path, ) -> None: """Should call bricks_api.download_document with document_id and project_id.""" - mock_bricks_api.download_document.return_value = (b"file content", "report.pdf") + mock_bricks_api.download_document.return_value = (b"file content", "report.pdf", "application/pdf") mock_document_reader.extract_content.return_value = DocumentContent( content="extracted text", metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"), @@ -50,7 +50,7 @@ async def test_execute_returns_document_content( tmp_path, ) -> None: """Should return DocumentContent from Kreuzberg extraction.""" - mock_bricks_api.download_document.return_value = (b"pdf data", "doc.pdf") + mock_bricks_api.download_document.return_value = (b"pdf data", "doc.pdf", "application/pdf") expected = DocumentContent( content="extracted text from bricks document", metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"), @@ -78,7 +78,7 @@ async def test_execute_calls_document_reader_with_temp_file( tmp_path, ) -> None: """Should save temp file and pass its path to document_reader.extract_content.""" - mock_bricks_api.download_document.return_value = (b"pdf binary", "report.pdf") + mock_bricks_api.download_document.return_value = (b"pdf binary", "report.pdf", "application/pdf") mock_document_reader.extract_content.return_value = DocumentContent( content="text", metadata=DocumentMetadata(format_type="pdf"), @@ -104,7 +104,7 @@ async def test_execute_temp_file_suffix_matches_filename_extension( tmp_path, ) -> None: """Should use the filename extension from the download as temp file suffix.""" - mock_bricks_api.download_document.return_value = (b"data", "spreadsheet.xlsx") + mock_bricks_api.download_document.return_value = (b"data", "spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") mock_document_reader.extract_content.return_value = DocumentContent( content="spreadsheet data", metadata=DocumentMetadata(format_type="xlsx"), @@ -148,7 +148,7 @@ async def test_execute_propagates_kreuzberg_failure( tmp_path, ) -> None: """Should propagate errors when Kreuzberg extraction fails.""" - mock_bricks_api.download_document.return_value = (b"corrupt data", "broken.pdf") + mock_bricks_api.download_document.return_value = (b"corrupt data", "broken.pdf", "application/pdf") mock_document_reader.extract_content.side_effect = ValueError( "Unsupported format" ) @@ -168,7 +168,7 @@ async def test_execute_cleans_up_temp_file( tmp_path, ) -> None: """Should delete the temp file after successful extraction.""" - mock_bricks_api.download_document.return_value = (b"data", "report.pdf") + mock_bricks_api.download_document.return_value = (b"data", "report.pdf", "application/pdf") mock_document_reader.extract_content.return_value = DocumentContent( content="text", metadata=DocumentMetadata(format_type="txt"), @@ -193,7 +193,7 @@ async def test_execute_cleans_up_temp_file_on_error( tmp_path, ) -> None: """Should delete the temp file even when Kreuzberg extraction fails.""" - mock_bricks_api.download_document.return_value = (b"data", "report.pdf") + mock_bricks_api.download_document.return_value = (b"data", "report.pdf", "application/pdf") mock_document_reader.extract_content.side_effect = ValueError("bad format") use_case = ReadBricksDocumentUseCase( bricks_api=mock_bricks_api, @@ -217,7 +217,7 @@ async def test_execute_with_tables( """Should include tables in the returned DocumentContent.""" from domain.ports.document_reader_port import TableData - mock_bricks_api.download_document.return_value = (b"data", "financials.pdf") + mock_bricks_api.download_document.return_value = (b"data", "financials.pdf", "application/pdf") mock_document_reader.extract_content.return_value = DocumentContent( content="text with table", metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"), @@ -234,4 +234,28 @@ async def test_execute_with_tables( ) assert len(result.tables) == 1 - assert result.tables[0].markdown == "| A | B |\n|---|---|\n| 1 | 2 |" \ No newline at end of file + assert result.tables[0].markdown == "| A | B |\n|---|---|\n| 1 | 2 |" + + async def test_execute_passes_mime_type_to_document_reader( + self, + mock_bricks_api: AsyncMock, + mock_document_reader: AsyncMock, + tmp_path, + ) -> None: + """Should pass mime_type from Bricks API to document_reader.extract_content.""" + mock_bricks_api.download_document.return_value = (b"data", "report.pdf", "application/pdf") + mock_document_reader.extract_content.return_value = DocumentContent( + content="text", + metadata=DocumentMetadata(format_type="pdf"), + tables=[], + ) + use_case = ReadBricksDocumentUseCase( + bricks_api=mock_bricks_api, + document_reader=mock_document_reader, + output_dir=str(tmp_path), + ) + + await use_case.execute(document_id="doc-1", project_id="proj-1") + + call_kwargs = mock_document_reader.extract_content.call_args + assert call_kwargs.kwargs.get("mime_type") == "application/pdf" or call_kwargs[1].get("mime_type") == "application/pdf" \ No newline at end of file