diff --git a/CHANGELOG.md b/CHANGELOG.md index 62354b3e7..b75a37cef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.1.8 + +### Features + +- **Add `include_orig_elements` parameter for chunking**: When `True` (the default), the elements used to form each chunk are attached to that chunk's `.metadata.orig_elements` as a gzipped+base64 blob. Set to `False` to omit them and produce a much smaller response payload — useful for large documents with tables, where this blob is duplicated into every chunk. + ## 0.1.7 ### Security diff --git a/prepline_general/api/__version__.py b/prepline_general/api/__version__.py index b71c670ea..93b52aecc 100644 --- a/prepline_general/api/__version__.py +++ b/prepline_general/api/__version__.py @@ -1 +1 @@ -__version__ = "0.1.7" # pragma: no cover +__version__ = "0.1.8" # pragma: no cover diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 3139b252e..8a7b3cb28 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -223,6 +223,7 @@ def pipeline_api( new_after_n_chars: Optional[int], overlap: int, overlap_all: bool, + include_orig_elements: bool = True, # ---------------------- filename: str = "", file_content_type: Optional[str] = None, @@ -283,6 +284,7 @@ def pipeline_api( "new_after_n_chars": new_after_n_chars, "overlap": overlap, "overlap_all": overlap_all, + "include_orig_elements": include_orig_elements, "starting_page_number": starting_page_number, "include_slide_notes": include_slide_notes, }, @@ -338,6 +340,7 @@ def pipeline_api( "max_characters": max_characters, "overlap": overlap, "overlap_all": overlap_all, + "include_orig_elements": include_orig_elements, "extract_image_block_types": extract_image_block_types, "extract_image_block_to_payload": extract_image_block_to_payload, "unique_element_ids": unique_element_ids, @@ -368,6 +371,7 @@ def pipeline_api( "max_characters": max_characters, "overlap": overlap, "overlap_all": overlap_all, + "include_orig_elements": include_orig_elements, "extract_image_block_types": extract_image_block_types, "extract_image_block_to_payload": extract_image_block_to_payload, "unique_element_ids": unique_element_ids, @@ -716,6 +720,7 @@ def response_generator(is_multipart: bool): new_after_n_chars=form_params.new_after_n_chars, overlap=form_params.overlap, overlap_all=form_params.overlap_all, + include_orig_elements=form_params.include_orig_elements, starting_page_number=form_params.starting_page_number, include_slide_notes=form_params.include_slide_notes, ) diff --git a/prepline_general/api/models/form_params.py b/prepline_general/api/models/form_params.py index 8b96bfda2..081175dde 100644 --- a/prepline_general/api/models/form_params.py +++ b/prepline_general/api/models/form_params.py @@ -35,6 +35,7 @@ class GeneralFormParams(BaseModel): new_after_n_chars: Optional[int] overlap: int overlap_all: bool + include_orig_elements: bool starting_page_number: Optional[int] = None include_slide_notes: bool @@ -236,6 +237,18 @@ def as_form( examples=[True], ), ] = False, + include_orig_elements: Annotated[ + bool, + Form( + title="Include Orig Elements", + description="""When `True` (the default), the elements used to form each chunk are +added to that chunk's `.metadata.orig_elements` as a gzipped+base64 blob. Set to `False` to omit +them and produce a much smaller payload — useful for large tables, where this blob is duplicated +into every chunk and can balloon the response size dramatically.""", + examples=[False], + ), + BeforeValidator(SmartValueParser[bool]().value_or_first_element), + ] = True, starting_page_number: Annotated[ Optional[int], Form( @@ -283,6 +296,7 @@ def as_form( new_after_n_chars=new_after_n_chars, overlap=overlap, overlap_all=overlap_all, + include_orig_elements=include_orig_elements, unique_element_ids=unique_element_ids, starting_page_number=starting_page_number, include_slide_notes=include_slide_notes,