Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions src/application/api/mcp_bricks_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from fastmcp import FastMCP
from fastmcp.exceptions import ToolError
from pydantic import BaseModel, Field

from application.responses.file_response import FileContentResponse
from dependencies import (
Expand All @@ -15,8 +16,16 @@
mcp_bricks = FastMCP("RAGAnythingBricks")


class DocumentSummary(BaseModel):
id: str = Field(description="Document ID — pass this to read_bricks_document")
file_name: str = Field(description="Nom du fichier")
mime_type: str = Field(default="", description="Type MIME du fichier")
size: int = Field(default=0, description="Taille en octets")
status: str = Field(default="", description="Statut du document")


@mcp_bricks.tool()
async def list_bricks_documents(project_unique_id: str) -> list:
async def list_bricks_documents(project_unique_id: str) -> list[DocumentSummary]:
"""Liste les documents d'un projet Bricks.

Args:
Expand All @@ -27,12 +36,22 @@ async def list_bricks_documents(project_unique_id: str) -> list:
"""
use_case = get_list_bricks_documents_use_case()
try:
return await use_case.execute(project_id=project_unique_id)
documents = await use_case.execute(project_id=project_unique_id)
except Exception:
logger.exception(
"Failed to list bricks documents for project %s", project_unique_id
)
raise ToolError("Failed to list bricks documents") from None
return [
DocumentSummary(
id=doc.id,
file_name=doc.file_name,
mime_type=doc.mime_type,
size=doc.size,
status=doc.status,
)
for doc in documents
]


@mcp_bricks.tool()
Expand Down Expand Up @@ -100,4 +119,4 @@ async def publish_section_version(
logger.exception(
"Failed to publish section version for project %s", project_unique_id
)
raise ToolError("Failed to publish section version") from None
raise ToolError("Failed to publish section version") from None
4 changes: 2 additions & 2 deletions src/application/use_cases/read_bricks_document_use_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ async def execute(
document_id: str,
project_id: str,
) -> DocumentContent:
data, filename = await self.bricks_api.download_document(
data, filename, mime_type = await self.bricks_api.download_document(
document_id=document_id, project_id=project_id
)

Expand All @@ -36,7 +36,7 @@ async def execute(
async with aiofiles.open(tmp_path, "wb") as f:
await f.write(data)

return await self.document_reader.extract_content(tmp_path)
return await self.document_reader.extract_content(tmp_path, mime_type=mime_type)
finally:
with contextlib.suppress(OSError):
os.unlink(tmp_path)
2 changes: 1 addition & 1 deletion src/domain/ports/bricks_api_port.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ async def download_document(
self,
document_id: str,
project_id: str,
) -> tuple[bytes, str]:
) -> tuple[bytes, str, str]:
pass

@abstractmethod
Expand Down
2 changes: 1 addition & 1 deletion src/domain/ports/document_reader_port.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,5 @@ class DocumentContent(BaseModel):

class DocumentReaderPort(ABC):
@abstractmethod
async def extract_content(self, file_path: str) -> DocumentContent:
async def extract_content(self, file_path: str, mime_type: str = "") -> DocumentContent:
pass
32 changes: 26 additions & 6 deletions src/infrastructure/bricks/bricks_api_adapter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import asyncio
import json
import logging
import os
import re
import urllib.error
import urllib.parse
Expand Down Expand Up @@ -65,12 +66,14 @@ async def download_document(
self,
document_id: str,
project_id: str,
) -> tuple[bytes, str]:
) -> tuple[bytes, str, str]:
documents = await self.list_project_documents(project_id)
url = None
mime_type = ""
for doc in documents:
if doc.id == document_id and doc.url:
url = doc.url
mime_type = doc.mime_type
break
if not url:
raise FileNotFoundError(
Expand All @@ -96,7 +99,7 @@ async def download_document(
raise TimeoutError(f"Document download timed out: {e}") from e

filename = _extract_filename(resp_headers.get("Content-Disposition", ""), url)
return body, filename
return body, filename, mime_type

async def publish_section_version(self, payload: dict) -> SectionVersionResult:
url = f"{self._base_url}/api/section-versions"
Expand All @@ -123,13 +126,30 @@ async def publish_section_version(self, payload: dict) -> SectionVersionResult:
def _extract_filename(content_disposition: str, url: str = "") -> str:
match = re.search(r'filename="([^"]+)"', content_disposition)
if match:
return match.group(1)
filename = match.group(1)
logger.debug("Filename from Content-Disposition (quoted): %s", filename)
return _normalize_extension(filename)
match = re.search(r"filename=([^\s;]+)", content_disposition)
if match:
return match.group(1)
filename = match.group(1)
logger.debug("Filename from Content-Disposition (unquoted): %s", filename)
return _normalize_extension(filename)
if url:
decoded_path = urllib.parse.unquote(urllib.parse.urlparse(url).path)
path_filename = decoded_path.rsplit("/", 1)[-1]
if path_filename and "." in path_filename:
return path_filename
return "document.bin"
logger.debug("Filename from URL path: %s (url=%s)", path_filename, url[:200])
return _normalize_extension(path_filename)
logger.warning("Could not extract filename, falling back to document.bin (url=%s)", url[:200] if url else "")
return "document.bin"


def _normalize_extension(filename: str) -> str:
name, ext = os.path.splitext(filename)
if not ext or ext == ".":
return filename
cleaned = re.sub(r"^[^a-zA-Z]+", ".", ext.lower())
if cleaned == ext.lower():
return filename
logger.warning("Normalized file extension: %s -> %s (filename=%s)", ext, cleaned, filename)
return name + cleaned
7 changes: 5 additions & 2 deletions src/infrastructure/document_reader/kreuzberg_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,12 @@ class KreuzbergAdapter(DocumentReaderPort):
def __init__(self, ocr_mode: str | None = None) -> None:
self._config = make_extraction_config(ocr_mode=ocr_mode)

async def extract_content(self, file_path: str) -> DocumentContent:
async def extract_content(self, file_path: str, mime_type: str = "") -> DocumentContent:
try:
result = await extract_file(file_path, config=self._config)
kwargs = {"config": self._config}
if mime_type:
kwargs["mime_type"] = mime_type
result = await extract_file(file_path, **kwargs)
logger.debug("Full extraction result for %s: %s", file_path, result)
except ParsingError as e:
raise ValueError(f"Unsupported file format: {e}") from e
Expand Down
30 changes: 27 additions & 3 deletions tests/unit/infrastructure/test_bricks_api_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from infrastructure.bricks.bricks_api_adapter import (
BricksApiAdapter,
_extract_filename,
_normalize_extension,
)


Expand Down Expand Up @@ -236,13 +237,14 @@ async def test_resolves_document_id_and_downloads_presigned_url(self, adapter: B
file_resp = _mock_urlopen_response(file_body, headers={"Content-Disposition": 'attachment; filename="doc.pdf"'})

with patch("infrastructure.bricks.bricks_api_adapter.urllib.request.urlopen", side_effect=[list_resp, file_resp]) as mock_urlopen:
content, filename = await adapter.download_document(document_id="doc-1", project_id="proj-1")
content, filename, mime_type = await adapter.download_document(document_id="doc-1", project_id="proj-1")

assert mock_urlopen.call_count == 2
list_req = mock_urlopen.call_args_list[0][0][0]
assert "api/projects/proj-1/documents/ai" in list_req.full_url
assert content == b"PDF content"
assert filename == "doc.pdf"
assert mime_type == "application/pdf"

async def test_extracts_filename_from_url_when_no_content_disposition(self, adapter: BricksApiAdapter) -> None:
"""Should extract filename from URL path when no Content-Disposition header."""
Expand All @@ -261,7 +263,7 @@ async def test_extracts_filename_from_url_when_no_content_disposition(self, adap
file_resp = _mock_urlopen_response(b"data", headers={})

with patch("infrastructure.bricks.bricks_api_adapter.urllib.request.urlopen", side_effect=[list_resp, file_resp]):
_, filename = await adapter.download_document(document_id="doc-2", project_id="proj-1")
_, filename, _ = await adapter.download_document(document_id="doc-2", project_id="proj-1")

assert filename == "Dossier_Financement.pdf"

Expand All @@ -281,7 +283,7 @@ async def test_defaults_to_document_bin_when_no_filename(self, adapter: BricksAp
file_resp = _mock_urlopen_response(b"data", headers={})

with patch("infrastructure.bricks.bricks_api_adapter.urllib.request.urlopen", side_effect=[list_resp, file_resp]):
_, filename = await adapter.download_document(document_id="doc-3", project_id="proj-1")
_, filename, _ = await adapter.download_document(document_id="doc-3", project_id="proj-1")

assert filename == "document.bin"

Expand Down Expand Up @@ -359,6 +361,28 @@ def test_url_without_extension_falls_back(self) -> None:
assert _extract_filename("", "https://s3.example.com/path/noext") == "document.bin"


class TestNormalizeExtension:
"""Tests for _normalize_extension helper."""

def test_normal_dot_pdf(self) -> None:
assert _normalize_extension("report.pdf") == "report.pdf"

def test_dot_underscore_pdf(self) -> None:
assert _normalize_extension("report._pdf") == "report.pdf"

def test_double_dot_extension(self) -> None:
assert _normalize_extension("report..pdf") == "report..pdf"

def test_no_extension(self) -> None:
assert _normalize_extension("noext") == "noext"

def test_empty_extension(self) -> None:
assert _normalize_extension("report.") == "report."

def test_normal_dot_docx(self) -> None:
assert _normalize_extension("file.docx") == "file.docx"


class TestPublishSectionVersion:
"""Tests for BricksApiAdapter.publish_section_version."""

Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_bricks_api_port.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ async def list_project_documents(

async def download_document(
self, document_id: str, project_id: str
) -> tuple[bytes, str]:
return (b"", "file.pdf")
) -> tuple[bytes, str, str]:
return (b"", "file.pdf", "application/pdf")

async def publish_section_version(
self, payload: dict
Expand Down
31 changes: 31 additions & 0 deletions tests/unit/test_kreuzberg_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,34 @@ async def test_extract_content_raises_on_other_kreuzberg_error(

with pytest.raises(KreuzbergError):
await adapter.extract_content("/tmp/test.pdf")

@patch("infrastructure.document_reader.kreuzberg_adapter.extract_file")
async def test_extract_content_passes_mime_type(self, mock_extract) -> None:
mock_result = AsyncMock()
mock_result.content = "text"
mock_result.mime_type = "application/pdf"
mock_result.metadata = {}
mock_result.tables = []
mock_extract.return_value = mock_result

adapter = KreuzbergAdapter(ocr_mode="tesseract")
result = await adapter.extract_content("/tmp/doc.bin", mime_type="application/pdf")

mock_extract.assert_awaited_once()
call_kwargs = mock_extract.call_args.kwargs
assert call_kwargs.get("mime_type") == "application/pdf"

@patch("infrastructure.document_reader.kreuzberg_adapter.extract_file")
async def test_extract_content_without_mime_type(self, mock_extract) -> None:
mock_result = AsyncMock()
mock_result.content = "text"
mock_result.mime_type = "application/pdf"
mock_result.metadata = {}
mock_result.tables = []
mock_extract.return_value = mock_result

adapter = KreuzbergAdapter(ocr_mode="tesseract")
await adapter.extract_content("/tmp/test.pdf")

call_kwargs = mock_extract.call_args.kwargs
assert "mime_type" not in call_kwargs
42 changes: 33 additions & 9 deletions tests/unit/test_read_bricks_document_use_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ async def test_execute_downloads_from_bricks_api(
tmp_path,
) -> None:
"""Should call bricks_api.download_document with document_id and project_id."""
mock_bricks_api.download_document.return_value = (b"file content", "report.pdf")
mock_bricks_api.download_document.return_value = (b"file content", "report.pdf", "application/pdf")
mock_document_reader.extract_content.return_value = DocumentContent(
content="extracted text",
metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"),
Expand All @@ -50,7 +50,7 @@ async def test_execute_returns_document_content(
tmp_path,
) -> None:
"""Should return DocumentContent from Kreuzberg extraction."""
mock_bricks_api.download_document.return_value = (b"pdf data", "doc.pdf")
mock_bricks_api.download_document.return_value = (b"pdf data", "doc.pdf", "application/pdf")
expected = DocumentContent(
content="extracted text from bricks document",
metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"),
Expand Down Expand Up @@ -78,7 +78,7 @@ async def test_execute_calls_document_reader_with_temp_file(
tmp_path,
) -> None:
"""Should save temp file and pass its path to document_reader.extract_content."""
mock_bricks_api.download_document.return_value = (b"pdf binary", "report.pdf")
mock_bricks_api.download_document.return_value = (b"pdf binary", "report.pdf", "application/pdf")
mock_document_reader.extract_content.return_value = DocumentContent(
content="text",
metadata=DocumentMetadata(format_type="pdf"),
Expand All @@ -104,7 +104,7 @@ async def test_execute_temp_file_suffix_matches_filename_extension(
tmp_path,
) -> None:
"""Should use the filename extension from the download as temp file suffix."""
mock_bricks_api.download_document.return_value = (b"data", "spreadsheet.xlsx")
mock_bricks_api.download_document.return_value = (b"data", "spreadsheet.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
mock_document_reader.extract_content.return_value = DocumentContent(
content="spreadsheet data",
metadata=DocumentMetadata(format_type="xlsx"),
Expand Down Expand Up @@ -148,7 +148,7 @@ async def test_execute_propagates_kreuzberg_failure(
tmp_path,
) -> None:
"""Should propagate errors when Kreuzberg extraction fails."""
mock_bricks_api.download_document.return_value = (b"corrupt data", "broken.pdf")
mock_bricks_api.download_document.return_value = (b"corrupt data", "broken.pdf", "application/pdf")
mock_document_reader.extract_content.side_effect = ValueError(
"Unsupported format"
)
Expand All @@ -168,7 +168,7 @@ async def test_execute_cleans_up_temp_file(
tmp_path,
) -> None:
"""Should delete the temp file after successful extraction."""
mock_bricks_api.download_document.return_value = (b"data", "report.pdf")
mock_bricks_api.download_document.return_value = (b"data", "report.pdf", "application/pdf")
mock_document_reader.extract_content.return_value = DocumentContent(
content="text",
metadata=DocumentMetadata(format_type="txt"),
Expand All @@ -193,7 +193,7 @@ async def test_execute_cleans_up_temp_file_on_error(
tmp_path,
) -> None:
"""Should delete the temp file even when Kreuzberg extraction fails."""
mock_bricks_api.download_document.return_value = (b"data", "report.pdf")
mock_bricks_api.download_document.return_value = (b"data", "report.pdf", "application/pdf")
mock_document_reader.extract_content.side_effect = ValueError("bad format")
use_case = ReadBricksDocumentUseCase(
bricks_api=mock_bricks_api,
Expand All @@ -217,7 +217,7 @@ async def test_execute_with_tables(
"""Should include tables in the returned DocumentContent."""
from domain.ports.document_reader_port import TableData

mock_bricks_api.download_document.return_value = (b"data", "financials.pdf")
mock_bricks_api.download_document.return_value = (b"data", "financials.pdf", "application/pdf")
mock_document_reader.extract_content.return_value = DocumentContent(
content="text with table",
metadata=DocumentMetadata(format_type="pdf", mime_type="application/pdf"),
Expand All @@ -234,4 +234,28 @@ async def test_execute_with_tables(
)

assert len(result.tables) == 1
assert result.tables[0].markdown == "| A | B |\n|---|---|\n| 1 | 2 |"
assert result.tables[0].markdown == "| A | B |\n|---|---|\n| 1 | 2 |"

async def test_execute_passes_mime_type_to_document_reader(
self,
mock_bricks_api: AsyncMock,
mock_document_reader: AsyncMock,
tmp_path,
) -> None:
"""Should pass mime_type from Bricks API to document_reader.extract_content."""
mock_bricks_api.download_document.return_value = (b"data", "report.pdf", "application/pdf")
mock_document_reader.extract_content.return_value = DocumentContent(
content="text",
metadata=DocumentMetadata(format_type="pdf"),
tables=[],
)
use_case = ReadBricksDocumentUseCase(
bricks_api=mock_bricks_api,
document_reader=mock_document_reader,
output_dir=str(tmp_path),
)

await use_case.execute(document_id="doc-1", project_id="proj-1")

call_kwargs = mock_document_reader.extract_content.call_args
assert call_kwargs.kwargs.get("mime_type") == "application/pdf" or call_kwargs[1].get("mime_type") == "application/pdf"
Loading