Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ dev = [
"sphinx-rtd-theme",
"sphinx-click",
]
# Backwards-compatible alias; pyarrow is a core dependency (Parquet flavor).
parquet = [
"pyarrow>=14.0.0",
]
transforms = [
"torchvision", # Needed for megatron.energon.transforms
]
Expand Down
4 changes: 4 additions & 0 deletions src/megatron/energon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@
from megatron.energon.flavors import (
BaseCoreDatasetFactory,
BaseWebdatasetFactory,
BinIdxDatasetFactory,
CaptioningSample,
CaptioningWebdataset,
CrudeJsonlDatasetFactory,
CrudeSample,
CrudeWebdataset,
DefaultBinIdxDatasetFactory,
DefaultCrudeJsonlDatasetFactory,
DefaultDecoderWebdatasetFactory,
DefaultGenericWebdatasetFactory,
Expand Down Expand Up @@ -118,6 +120,7 @@
"BaseCoreDatasetFactory",
"BaseWebdatasetFactory",
"basic_sample_keys",
"BinIdxDatasetFactory",
"batch_list",
"batch_pad_stack",
"batch_stack",
Expand All @@ -137,6 +140,7 @@
"DatasetLoader",
"DatasetLoaderInterface",
"DecodeFileStore",
"DefaultBinIdxDatasetFactory",
"DefaultCrudeJsonlDatasetFactory",
"DefaultDecoderWebdatasetFactory",
"DefaultGenericWebdatasetFactory",
Expand Down
53 changes: 52 additions & 1 deletion src/megatron/energon/cache/file_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
# SPDX-License-Identifier: BSD-3-Clause

import sqlite3
from typing import Any, Optional, Union
from typing import Any, Callable, Optional, Union

from megatron.energon.cache.base import FileStore, FileStoreDecoder, FileStoreWrapper
from megatron.energon.epathlib import EPath
from megatron.energon.flavors.binidx.binidx_reader import BinIdxReader
from megatron.energon.flavors.jsonl.ijsonl_reader import IJsonlReader
from megatron.energon.flavors.parquet.reader import IParquetReader
from megatron.energon.flavors.webdataset.config import INDEX_SQLITE_FILENAME, MAIN_FOLDER_NAME
from megatron.energon.flavors.webdataset.itar_reader import SqliteITarEntryReader
from megatron.energon.flavors.webdataset.thread_local_sqlite import ThreadLocalSqlite
Expand Down Expand Up @@ -194,3 +196,52 @@ class JsonlFileStore(IJsonlReader, FileStore[bytes]):

def get_path(self) -> str:
return str(self.jsonl_path)


class BinIdxFileStore(BinIdxReader, FileStore[bytes]):
    """File store that reads its entries directly from a bin-idx file pair."""

    def get_path(self) -> str:
        """Return ``self.bin_path`` converted to ``str``."""
        bin_location = self.bin_path
        return str(bin_location)


class ParquetFileStore(IParquetReader, FileStore[Any]):
    """File store with random access to the rows of a Parquet dataset directory.

    The dataset layout is taken from the Parquet footers. Values looked up through the
    per-column keys ``{row_index}.{column}`` are native pyarrow ``as_py()`` values, not bytes.
    """

    def __init__(
        self,
        dataset_root: EPath | str,
        *,
        part_filter: Callable[[str], bool] | None = None,
        parquet_file_cache_size: int = 5,
    ):
        """Scan the dataset directory and initialize the reader for the selected columns.

        Args:
            dataset_root: Root of the Parquet dataset; wrapped in ``EPath`` before use.
            part_filter: Optional predicate called with each layout column name. Only
                columns for which it returns a truthy value are read. ``None`` keeps
                every layout column.
            parquet_file_cache_size: Passed through unchanged to the reader base class
                (presumably the number of Parquet files kept cached -- confirm in
                ``IParquetReader``).

        Raises:
            ValueError: If ``part_filter`` rejects every column of the layout.
        """
        # Imported here rather than at module level, as in the original code.
        from megatron.energon.flavors.parquet.prepare import (
            assert_layout_columns_subset,
            scan_parquet_dataset,
        )

        dataset_dir = EPath(dataset_root)
        layout = scan_parquet_dataset(dataset_dir)
        layout_cols = list(layout.columns)

        # Without a filter, every column of the layout is read.
        selected_cols = layout_cols
        if part_filter is not None:
            selected_cols = [name for name in layout_cols if part_filter(name)]
            if not selected_cols:
                raise ValueError(
                    "part_filter excluded all Parquet columns; nothing to load. "
                    f"Layout columns: {layout_cols}"
                )

        assert_layout_columns_subset(layout_cols, selected_cols)
        super().__init__(
            dataset_dir,
            layout,
            selected_cols,
            parquet_file_cache_size=parquet_file_cache_size,
        )

    def get_path(self) -> str:
        """Return ``self.dataset_root`` converted to ``str``."""
        return str(self.dataset_root)
42 changes: 41 additions & 1 deletion src/megatron/energon/dataset_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from megatron.energon.flavors import (
BaseCoreDatasetFactory,
CrudeSample,
DefaultBinIdxDatasetFactory,
DefaultCrudeJsonlDatasetFactory,
DefaultParquetDatasetFactory,
StandardWebdatasetFactory,
)
from megatron.energon.flavors.webdataset.config import MAIN_FOLDER_NAME
Expand Down Expand Up @@ -88,6 +90,7 @@ def get_dataset_from_config(
path = EPath(path)
dataset: BaseCoreDatasetFactory[T_sample]
ds_type = get_dataset_type(path)

if ds_type == EnergonDatasetType.JSONL:
assert sample_type is CrudeSample or sample_type is None, (
f"Sample type must be CrudeSample for jsonl datasets, but got {sample_type}"
Expand All @@ -107,6 +110,42 @@ def get_dataset_from_config(
worker_config=worker_config,
**kwargs,
)
elif ds_type == EnergonDatasetType.BINIDX:
assert sample_type is CrudeSample or sample_type is None, (
f"Sample type must be CrudeSample for bin-idx datasets, but got {sample_type}"
)
assert dataset_config is None, (
f"Dataset config must be None for bin-idx datasets, but got {dataset_config}"
)
assert split_config is None, (
f"Split config must be None for bin-idx datasets, but got {split_config}"
)

dataset = DefaultBinIdxDatasetFactory(
path,
training=training,
subflavors=subflavors,
worker_config=worker_config,
**kwargs,
)
elif ds_type == EnergonDatasetType.PARQUET:
assert sample_type is CrudeSample or sample_type is None, (
f"Sample type must be CrudeSample for Parquet datasets, but got {sample_type}"
)
assert dataset_config is None, (
f"Dataset config must be None for Parquet datasets, but got {dataset_config}"
)
assert split_config is None, (
f"Split config must be None for Parquet datasets, but got {split_config}"
)

dataset = DefaultParquetDatasetFactory(
path,
training=training,
subflavors=subflavors,
worker_config=worker_config,
**kwargs,
)
elif ds_type == EnergonDatasetType.WEBDATASET:
if dataset_config is None:
dataset_config = "dataset.yaml"
Expand All @@ -131,7 +170,8 @@ def get_dataset_from_config(
raise ValueError("Filesystem datasets are only supported as auxiliary datasets. ")
else:
raise ValueError(
f"Path {path} does not contain a {MAIN_FOLDER_NAME}/.info.yaml or .info.json file nor is it a jsonl file. "
f"Path {path} does not contain a {MAIN_FOLDER_NAME}/.info.yaml or .info.json file, "
f"nor is it a jsonl file, a .bin (bin-idx) file, or a directory of .parquet files. "
f"Did you forget to prepare the dataset? Please check the documentation for an introduction to dataset "
f"preparation."
)
Expand Down
8 changes: 8 additions & 0 deletions src/megatron/energon/flavors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
Sample,
SavableDataset,
)
from megatron.energon.flavors.binidx import (
BinIdxDatasetFactory,
DefaultBinIdxDatasetFactory,
)
from megatron.energon.flavors.captioning import CaptioningSample, CaptioningWebdataset
from megatron.energon.flavors.crude import CrudeSample, CrudeWebdataset
from megatron.energon.flavors.image import ImageSample, ImageWebdataset
Expand All @@ -21,6 +25,7 @@
)
from megatron.energon.flavors.multichoice_vqa import MultiChoiceVQASample, MultiChoiceVQAWebdataset
from megatron.energon.flavors.ocr import OCRSample, OCRWebdataset
from megatron.energon.flavors.parquet.dataset import DefaultParquetDatasetFactory
from megatron.energon.flavors.similarity_interleaved import (
SimilarityInterleavedSample,
SimilarityInterleavedWebdataset,
Expand Down Expand Up @@ -50,13 +55,16 @@
"AVDecoderType",
"BaseCoreDatasetFactory",
"BaseWebdatasetFactory",
"BinIdxDatasetFactory",
"CaptioningSample",
"CaptioningWebdataset",
"CrudeJsonlDatasetFactory",
"CrudeSample",
"CrudeWebdataset",
"DefaultBinIdxDatasetFactory",
"DefaultCrudeJsonlDatasetFactory",
"DefaultDecoderWebdatasetFactory",
"DefaultParquetDatasetFactory",
"DefaultGenericWebdatasetFactory",
"EmptyDatasetError",
"ImageClassificationSample",
Expand Down
11 changes: 9 additions & 2 deletions src/megatron/energon/flavors/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,8 +405,15 @@ class BaseCoreDatasetFactory(Generic[T_sample], ABC):
subflavors: Dict[str, Any]

@abstractmethod
def build(self, worker_rotation_offset: int = 0) -> SavableDataset[T_sample]:
"""Builds the dataset."""
def build(
    self, worker_rotation_offset: int = 0, part_filter: Callable[[str], bool] | None = None
) -> SavableDataset[T_sample]:
    """Builds the dataset.

    This method is abstract; concrete dataset factories provide the implementation.

    Args:
        worker_rotation_offset: The offset of the worker rotation. Defaults to 0.
        part_filter: A function to filter the parts of the dataset. It takes a ``str``
            and returns a ``bool``; defaults to ``None``. Presumably the argument is a
            part name, parts mapped to ``False`` are left out, and ``None`` disables
            filtering -- TODO confirm against the concrete implementations.

    Returns:
        The built dataset as a ``SavableDataset`` over ``T_sample``.
    """
    ...

@abstractmethod
Expand Down
11 changes: 11 additions & 0 deletions src/megatron/energon/flavors/binidx/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
# SPDX-License-Identifier: BSD-3-Clause
from megatron.energon.flavors.binidx.binidx_dataset import (
BinIdxDatasetFactory,
DefaultBinIdxDatasetFactory,
)

__all__ = [
"BinIdxDatasetFactory",
"DefaultBinIdxDatasetFactory",
]
Loading
Loading