SoluDevTech · Kaiohz · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/src/application/api/mcp_bricks_tools.py b/src/application/api/mcp_bricks_tools.py
@@ -2,6 +2,7 @@
 
 from fastmcp import FastMCP
 from fastmcp.exceptions import ToolError
+from pydantic import BaseModel, Field
 
 from application.responses.file_response import FileContentResponse
 from dependencies import (
@@ -15,8 +16,16 @@
 mcp_bricks = FastMCP("RAGAnythingBricks")
 
 
+class DocumentSummary(BaseModel):
+    id: str = Field(description="Document ID — pass this to read_bricks_document")
+    file_name: str = Field(description="Nom du fichier")
+    mime_type: str = Field(default="", description="Type MIME du fichier")
+    size: int = Field(default=0, description="Taille en octets")
+    status: str = Field(default="", description="Statut du document")
+
+
 @mcp_bricks.tool()
-async def list_bricks_documents(project_unique_id: str) -> list:
+async def list_bricks_documents(project_unique_id: str) -> list[DocumentSummary]:
     """Liste les documents d'un projet Bricks.
 
     Args:
@@ -27,12 +36,22 @@ async def list_bricks_documents(project_unique_id: str) -> list:
     """
     use_case = get_list_bricks_documents_use_case()
     try:
-        return await use_case.execute(project_id=project_unique_id)
+        documents = await use_case.execute(project_id=project_unique_id)
     except Exception:
         logger.exception(
             "Failed to list bricks documents for project %s", project_unique_id
         )
         raise ToolError("Failed to list bricks documents") from None
+    return [
+        DocumentSummary(
+            id=doc.id,
+            file_name=doc.file_name,
+            mime_type=doc.mime_type,
+            size=doc.size,
+            status=doc.status,
+        )
+        for doc in documents
+    ]
 
 
 @mcp_bricks.tool()
@@ -100,4 +119,4 @@ async def publish_section_version(
         logger.exception(
             "Failed to publish section version for project %s", project_unique_id
         )
-        raise ToolError("Failed to publish section version") from None
+        raise ToolError("Failed to publish section version") from None
diff --git a/src/application/use_cases/read_bricks_document_use_case.py b/src/application/use_cases/read_bricks_document_use_case.py
@@ -24,7 +24,7 @@ async def execute(
         document_id: str,
         project_id: str,
     ) -> DocumentContent:
-        data, filename = await self.bricks_api.download_document(
+        data, filename, mime_type = await self.bricks_api.download_document(
             document_id=document_id, project_id=project_id
         )
 
@@ -36,7 +36,7 @@ async def execute(
             async with aiofiles.open(tmp_path, "wb") as f:
                 await f.write(data)
 
-            return await self.document_reader.extract_content(tmp_path)
+            return await self.document_reader.extract_content(tmp_path, mime_type=mime_type)
         finally:
             with contextlib.suppress(OSError):
                 os.unlink(tmp_path)
diff --git a/src/domain/ports/bricks_api_port.py b/src/domain/ports/bricks_api_port.py
@@ -49,7 +49,7 @@ async def download_document(
         self,
         document_id: str,
         project_id: str,
-    ) -> tuple[bytes, str]:
+    ) -> tuple[bytes, str, str]:
         pass
 
     @abstractmethod

diff --git a/src/domain/ports/document_reader_port.py b/src/domain/ports/document_reader_port.py
@@ -20,5 +20,5 @@ class DocumentContent(BaseModel):
 
 class DocumentReaderPort(ABC):
     @abstractmethod
-    async def extract_content(self, file_path: str) -> DocumentContent:
+    async def extract_content(self, file_path: str, mime_type: str = "") -> DocumentContent:
         pass
diff --git a/src/infrastructure/bricks/bricks_api_adapter.py b/src/infrastructure/bricks/bricks_api_adapter.py
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import logging
+import os
 import re
 import urllib.error
 import urllib.parse
@@ -65,12 +66,14 @@ async def download_document(
         self,
         document_id: str,
         project_id: str,
-    ) -> tuple[bytes, str]:
+    ) -> tuple[bytes, str, str]:
         documents = await self.list_project_documents(project_id)
         url = None
+        mime_type = ""
         for doc in documents:
             if doc.id == document_id and doc.url:
                 url = doc.url
+                mime_type = doc.mime_type
                 break
         if not url:
             raise FileNotFoundError(
@@ -96,7 +99,7 @@ async def download_document(
             raise TimeoutError(f"Document download timed out: {e}") from e
 
         filename = _extract_filename(resp_headers.get("Content-Disposition", ""), url)
-        return body, filename
+        return body, filename, mime_type
 
     async def publish_section_version(self, payload: dict) -> SectionVersionResult:
         url = f"{self._base_url}/api/section-versions"
@@ -123,13 +126,30 @@ async def publish_section_version(self, payload: dict) -> SectionVersionResult:
 def _extract_filename(content_disposition: str, url: str = "") -> str:
     match = re.search(r'filename="([^"]+)"', content_disposition)
     if match:
-        return match.group(1)
+        filename = match.group(1)
+        logger.debug("Filename from Content-Disposition (quoted): %s", filename)
+        return _normalize_extension(filename)
     match = re.search(r"filename=([^\s;]+)", content_disposition)
     if match:
-        return match.group(1)
+        filename = match.group(1)
+        logger.debug("Filename from Content-Disposition (unquoted): %s", filename)
+        return _normalize_extension(filename)
     if url:
         decoded_path = urllib.parse.unquote(urllib.parse.urlparse(url).path)
         path_filename = decoded_path.rsplit("/", 1)[-1]
         if path_filename and "." in path_filename:
-            return path_filename
-    return "document.bin"
+            logger.debug("Filename from URL path: %s (url=%s)", path_filename, url[:200])
+            return _normalize_extension(path_filename)
+    logger.warning("Could not extract filename, falling back to document.bin (url=%s)", url[:200] if url else "")
+    return "document.bin"
+
+
+def _normalize_extension(filename: str) -> str:
+    name, ext = os.path.splitext(filename)
+    if not ext or ext == ".":
+        return filename
+    cleaned = re.sub(r"^[^a-zA-Z]+", ".", ext.lower())
+    if cleaned == ext.lower():
+        return filename
+    logger.warning("Normalized file extension: %s -> %s (filename=%s)", ext, cleaned, filename)
+    return name + cleaned
diff --git a/src/infrastructure/document_reader/kreuzberg_adapter.py b/src/infrastructure/document_reader/kreuzberg_adapter.py
@@ -62,9 +62,12 @@ class KreuzbergAdapter(DocumentReaderPort):
     def __init__(self, ocr_mode: str | None = None) -> None:
         self._config = make_extraction_config(ocr_mode=ocr_mode)
 
-    async def extract_content(self, file_path: str) -> DocumentContent:
+    async def extract_content(self, file_path: str, mime_type: str = "") -> DocumentContent:
         try:
-            result = await extract_file(file_path, config=self._config)
+            kwargs = {"config": self._config}
+            if mime_type:
+                kwargs["mime_type"] = mime_type
+            result = await extract_file(file_path, **kwargs)
             logger.debug("Full extraction result for %s: %s", file_path, result)
         except ParsingError as e:
             raise ValueError(f"Unsupported file format: {e}") from e

diff --git a/tests/unit/infrastructure/test_bricks_api_adapter.py b/tests/unit/infrastructure/test_bricks_api_adapter.py
@@ -16,6 +16,7 @@
 from infrastructure.bricks.bricks_api_adapter import (
     BricksApiAdapter,
     _extract_filename,
+    _normalize_extension,
 )
 
 
@@ -236,13 +237,14 @@ async def test_resolves_document_id_and_downloads_presigned_url(self, adapter: B
         file_resp = _mock_urlopen_response(file_body, headers={"Content-Disposition": 'attachment; filename="doc.pdf"'})
 
         with patch("infrastructure.bricks.bricks_api_adapter.urllib.request.urlopen", side_effect=[list_resp, file_resp]) as mock_urlopen:
-            content, filename = await adapter.download_document(document_id="doc-1", project_id="proj-1")
+            content, filename, mime_type = await adapter.download_document(document_id="doc-1", project_id="proj-1")
 
             assert mock_urlopen.call_count == 2
             list_req = mock_urlopen.call_args_list[0][0][0]
             assert "api/projects/proj-1/documents/ai" in list_req.full_url
         assert content == b"PDF content"
         assert filename == "doc.pdf"
+        assert mime_type == "application/pdf"
 
     async def test_extracts_filename_from_url_when_no_content_disposition(self, adapter: BricksApiAdapter) -> None:
         """Should extract filename from URL path when no Content-Disposition header."""
@@ -261,7 +263,7 @@ async def test_extracts_filename_from_url_when_no_content_disposition(self, adap
         file_resp = _mock_urlopen_response(b"data", headers={})
 
         with patch("infrastructure.bricks.bricks_api_adapter.urllib.request.urlopen", side_effect=[list_resp, file_resp]):
-            _, filename = await adapter.download_document(document_id="doc-2", project_id="proj-1")
+            _, filename, _ = await adapter.download_document(document_id="doc-2", project_id="proj-1")
 
         assert filename == "Dossier_Financement.pdf"
 
@@ -281,7 +283,7 @@ async def test_defaults_to_document_bin_when_no_filename(self, adapter: BricksAp
         file_resp = _mock_urlopen_response(b"data", headers={})
 
         with patch("infrastructure.bricks.bricks_api_adapter.urllib.request.urlopen", side_effect=[list_resp, file_resp]):
-            _, filename = await adapter.download_document(document_id="doc-3", project_id="proj-1")
+            _, filename, _ = await adapter.download_document(document_id="doc-3", project_id="proj-1")
 
         assert filename == "document.bin"
 
@@ -359,6 +361,28 @@ def test_url_without_extension_falls_back(self) -> None:
         assert _extract_filename("", "https://s3.example.com/path/noext") == "document.bin"
 
 
+class TestNormalizeExtension:
+    """Tests for _normalize_extension helper."""
+
+    def test_normal_dot_pdf(self) -> None:
+        assert _normalize_extension("report.pdf") == "report.pdf"
+
+    def test_dot_underscore_pdf(self) -> None:
+        assert _normalize_extension("report._pdf") == "report.pdf"
+
+    def test_double_dot_extension(self) -> None:
+        assert _normalize_extension("report..pdf") == "report..pdf"
+
+    def test_no_extension(self) -> None:
+        assert _normalize_extension("noext") == "noext"
+
+    def test_empty_extension(self) -> None:
+        assert _normalize_extension("report.") == "report."
+
+    def test_normal_dot_docx(self) -> None:
+        assert _normalize_extension("file.docx") == "file.docx"
+
+
 class TestPublishSectionVersion:
     """Tests for BricksApiAdapter.publish_section_version."""
 

diff --git a/tests/unit/test_bricks_api_port.py b/tests/unit/test_bricks_api_port.py
@@ -203,8 +203,8 @@ async def list_project_documents(
 
             async def download_document(
                 self, document_id: str, project_id: str
-            ) -> tuple[bytes, str]:
-                return (b"", "file.pdf")
+            ) -> tuple[bytes, str, str]:
+                return (b"", "file.pdf", "application/pdf")
 
             async def publish_section_version(
                 self, payload: dict

diff --git a/tests/unit/test_kreuzberg_adapter.py b/tests/unit/test_kreuzberg_adapter.py
@@ -93,3 +93,34 @@ async def test_extract_content_raises_on_other_kreuzberg_error(
 
         with pytest.raises(KreuzbergError):
             await adapter.extract_content("/tmp/test.pdf")
+
+    @patch("infrastructure.document_reader.kreuzberg_adapter.extract_file")
+    async def test_extract_content_passes_mime_type(self, mock_extract) -> None:
+        mock_result = AsyncMock()
+        mock_result.content = "text"
+        mock_result.mime_type = "application/pdf"
+        mock_result.metadata = {}
+        mock_result.tables = []
+        mock_extract.return_value = mock_result
+
+        adapter = KreuzbergAdapter(ocr_mode="tesseract")
+        result = await adapter.extract_content("/tmp/doc.bin", mime_type="application/pdf")
+
+        mock_extract.assert_awaited_once()
+        call_kwargs = mock_extract.call_args.kwargs
+        assert call_kwargs.get("mime_type") == "application/pdf"
+
+    @patch("infrastructure.document_reader.kreuzberg_adapter.extract_file")
+    async def test_extract_content_without_mime_type(self, mock_extract) -> None:
+        mock_result = AsyncMock()
+        mock_result.content = "text"
+        mock_result.mime_type = "application/pdf"
+        mock_result.metadata = {}
+        mock_result.tables = []
+        mock_extract.return_value = mock_result
+
+        adapter = KreuzbergAdapter(ocr_mode="tesseract")
+        await adapter.extract_content("/tmp/test.pdf")
+
+        call_kwargs = mock_extract.call_args.kwargs
+        assert "mime_type" not in call_kwargs
diff --git a/tests/unit/test_read_bricks_document_use_case.py b/tests/unit/test_read_bricks_document_use_case.py
@@ -25,7 +25,7 @@ async def test_execute_downloads_from_bricks_api(
         tmp_path,
     ) -> None:
         """Should call bricks_api.download_document with document_id and project_id."""
-        mock_bricks_api.download_document.return_value = (b"file content", "report.pdf")
+        mock_bricks_api.download_document.return_value = (b"file content", "report.pdf", "application/pdf")
         mock_document_reader.extract_content.return_value = DocumentContent(
             content="extracted text",
             metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"),
@@ -50,7 +50,7 @@ async def test_execute_returns_document_content(
         tmp_path,
     ) -> None:
         """Should return DocumentContent from Kreuzberg extraction."""
-        mock_bricks_api.download_document.return_value = (b"pdf data", "doc.pdf")
+        mock_bricks_api.download_document.return_value = (b"pdf data", "doc.pdf", "application/pdf")
         expected = DocumentContent(
             content="extracted text from bricks document",
             metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"),
@@ -78,7 +78,7 @@ async def test_execute_calls_document_reader_with_temp_file(
         tmp_path,
     ) -> None:
         """Should save temp file and pass its path to document_reader.extract_content."""
-        mock_bricks_api.download_document.return_value = (b"pdf binary", "report.pdf")
+        mock_bricks_api.download_document.return_value = (b"pdf binary", "report.pdf", "application/pdf")
         mock_document_reader.extract_content.return_value = DocumentContent(
             content="text",
             metadata=DocumentMetadata(format_type="pdf"),
@@ -104,7 +104,7 @@ async def test_execute_temp_file_suffix_matches_filename_extension(
         tmp_path,
     ) -> None:
         """Should use the filename extension from the download as temp file suffix."""
-        mock_bricks_api.download_document.return_value = (b"data", "spreadsheet.xlsx")
+        mock_bricks_api.download_document.return_value = (b"data", "spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
         mock_document_reader.extract_content.return_value = DocumentContent(
             content="spreadsheet data",
             metadata=DocumentMetadata(format_type="xlsx"),
@@ -148,7 +148,7 @@ async def test_execute_propagates_kreuzberg_failure(
         tmp_path,
     ) -> None:
         """Should propagate errors when Kreuzberg extraction fails."""
-        mock_bricks_api.download_document.return_value = (b"corrupt data", "broken.pdf")
+        mock_bricks_api.download_document.return_value = (b"corrupt data", "broken.pdf", "application/pdf")
         mock_document_reader.extract_content.side_effect = ValueError(
             "Unsupported format"
         )
@@ -168,7 +168,7 @@ async def test_execute_cleans_up_temp_file(
         tmp_path,
     ) -> None:
         """Should delete the temp file after successful extraction."""
-        mock_bricks_api.download_document.return_value = (b"data", "report.pdf")
+        mock_bricks_api.download_document.return_value = (b"data", "report.pdf", "application/pdf")
         mock_document_reader.extract_content.return_value = DocumentContent(
             content="text",
             metadata=DocumentMetadata(format_type="txt"),
@@ -193,7 +193,7 @@ async def test_execute_cleans_up_temp_file_on_error(
         tmp_path,
     ) -> None:
         """Should delete the temp file even when Kreuzberg extraction fails."""
-        mock_bricks_api.download_document.return_value = (b"data", "report.pdf")
+        mock_bricks_api.download_document.return_value = (b"data", "report.pdf", "application/pdf")
         mock_document_reader.extract_content.side_effect = ValueError("bad format")
         use_case = ReadBricksDocumentUseCase(
             bricks_api=mock_bricks_api,
@@ -217,7 +217,7 @@ async def test_execute_with_tables(
         """Should include tables in the returned DocumentContent."""
         from domain.ports.document_reader_port import TableData
 
-        mock_bricks_api.download_document.return_value = (b"data", "financials.pdf")
+        mock_bricks_api.download_document.return_value = (b"data", "financials.pdf", "application/pdf")
         mock_document_reader.extract_content.return_value = DocumentContent(
             content="text with table",
             metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"),
@@ -234,4 +234,28 @@ async def test_execute_with_tables(
         )
 
         assert len(result.tables) == 1
-        assert result.tables[0].markdown == "| A | B |\n|---|---|\n| 1 | 2 |"
+        assert result.tables[0].markdown == "| A | B |\n|---|---|\n| 1 | 2 |"
+
+    async def test_execute_passes_mime_type_to_document_reader(
+        self,
+        mock_bricks_api: AsyncMock,
+        mock_document_reader: AsyncMock,
+        tmp_path,
+    ) -> None:
+        """Should pass mime_type from Bricks API to document_reader.extract_content."""
+        mock_bricks_api.download_document.return_value = (b"data", "report.pdf", "application/pdf")
+        mock_document_reader.extract_content.return_value = DocumentContent(
+            content="text",
+            metadata=DocumentMetadata(format_type="pdf"),
+            tables=[],
+        )
+        use_case = ReadBricksDocumentUseCase(
+            bricks_api=mock_bricks_api,
+            document_reader=mock_document_reader,
+            output_dir=str(tmp_path),
+        )
+
+        await use_case.execute(document_id="doc-1", project_id="proj-1")
+
+        call_kwargs = mock_document_reader.extract_content.call_args
+        assert call_kwargs.kwargs.get("mime_type") == "application/pdf" or call_kwargs[1].get("mime_type") == "application/pdf"