This package has been superseded by fsspeckit.
Action Required: Please migrate to fsspeckit for:
- Continued support and bug fixes
- New features and improvements
- Latest dependency updates
Replace your imports:
# OLD - fsspec-utils (deprecated)
from fsspec_utils import filesystem
from fsspec_utils.storage_options import AwsStorageOptions
# NEW - fsspeckit (recommended)
from fsspeckit import filesystem
from fsspeckit.storage_options import AwsStorageOptions

Update installation:
pip uninstall fsspec-utils
pip install fsspeckit

All functionality from fsspec-utils is now available in fsspeckit with the same API for easy migration.
Enhanced utilities and extensions for fsspec filesystems with multi-format I/O support.
fsspec-utils is a comprehensive toolkit that extends fsspec with:
- Multi-cloud storage configuration - Easy setup for AWS S3, Google Cloud Storage, Azure Storage, GitHub, and GitLab
- Enhanced caching - Improved caching filesystem with monitoring and path preservation
- Extended I/O operations - Read/write operations for JSON, CSV, Parquet with Polars/PyArrow integration
- Utility functions - Type conversion, parallel processing, and data transformation helpers
⚠️ DEPRECATED: This package is deprecated. Use fsspeckit instead.
# Install fsspeckit instead (recommended)
pip install fsspeckit
# With specific cloud providers
pip install fsspeckit[aws] # AWS S3 support
pip install fsspeckit[gcp] # Google Cloud Storage
pip install fsspeckit[azure] # Azure Storage

# Basic installation
pip install fsspec-utils
# With all optional dependencies
pip install fsspec-utils[full]
# Specific cloud providers
pip install fsspec-utils[aws] # AWS S3 support
pip install fsspec-utils[gcp] # Google Cloud Storage
pip install fsspec-utils[azure] # Azure Storage

See the Migration Guide above for upgrading to fsspeckit.
⚠️ DEPRECATED: Code examples below use fsspec-utils. Please use fsspeckit instead. Replace fsspec_utils → fsspeckit in imports.
# NEW - Use fsspeckit (recommended)
from fsspeckit import filesystem
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils import filesystem
# Local filesystem
fs = filesystem("file")
files = fs.ls("/path/to/data")
# S3 with caching
fs = filesystem("s3://my-bucket/", cached=True)
data = fs.cat("data/file.txt")

# NEW - Use fsspeckit (recommended)
from fsspeckit.storage_options import AwsStorageOptions
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.storage_options import AwsStorageOptions
# Configure S3 access
options = AwsStorageOptions(
region="us-west-2",
access_key_id="YOUR_KEY",
secret_access_key="YOUR_SECRET"
)
fs = filesystem("s3", storage_options=options, cached=True)

# NEW - Use fsspeckit (recommended)
from fsspeckit.storage_options import AwsStorageOptions
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.storage_options import AwsStorageOptions
# Load from environment variables
options = AwsStorageOptions.from_env()
fs = filesystem("s3", storage_options=options)

# NEW - Use fsspeckit (recommended)
from fsspeckit.storage_options import (
AwsStorageOptions,
GcsStorageOptions,
GitHubStorageOptions
)
from fsspeckit import filesystem
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.storage_options import (
# AwsStorageOptions,
# GcsStorageOptions,
# GitHubStorageOptions
# )
# from fsspec_utils import filesystem
# AWS S3
s3_fs = filesystem("s3", storage_options=AwsStorageOptions.from_env())
# Google Cloud Storage
gcs_fs = filesystem("gs", storage_options=GcsStorageOptions.from_env())
# GitHub repository
github_fs = filesystem("github", storage_options=GitHubStorageOptions(
org="microsoft",
repo="vscode",
token="ghp_xxxx"
))

# NEW - Use fsspeckit (recommended)
from fsspeckit.storage_options import AwsStorageOptions
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.storage_options import AwsStorageOptions
# Basic credentials
options = AwsStorageOptions(
access_key_id="AKIAXXXXXXXX",
secret_access_key="SECRET",
region="us-east-1"
)
# From AWS profile
options = AwsStorageOptions.create(profile="dev")
# S3-compatible service (MinIO)
options = AwsStorageOptions(
endpoint_url="http://localhost:9000",
access_key_id="minioadmin",
secret_access_key="minioadmin",
allow_http=True
)

# NEW - Use fsspeckit (recommended)
from fsspeckit.storage_options import GcsStorageOptions
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.storage_options import GcsStorageOptions
# Service account
options = GcsStorageOptions(
token="path/to/service-account.json",
project="my-project-123"
)
# From environment
options = GcsStorageOptions.from_env()

# NEW - Use fsspeckit (recommended)
from fsspeckit.storage_options import AzureStorageOptions
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.storage_options import AzureStorageOptions
# Account key
options = AzureStorageOptions(
protocol="az",
account_name="mystorageacct",
account_key="key123..."
)
# Connection string
options = AzureStorageOptions(
protocol="az",
connection_string="DefaultEndpoints..."
)

# NEW - Use fsspeckit (recommended)
from fsspeckit.storage_options import GitHubStorageOptions
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.storage_options import GitHubStorageOptions
# Public repository
options = GitHubStorageOptions(
org="microsoft",
repo="vscode",
ref="main"
)
# Private repository
options = GitHubStorageOptions(
org="myorg",
repo="private-repo",
token="ghp_xxxx",
ref="develop"
)

# NEW - Use fsspeckit (recommended)
from fsspeckit.storage_options import GitLabStorageOptions
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.storage_options import GitLabStorageOptions
# Public project
options = GitLabStorageOptions(
project_name="group/project",
ref="main"
)
# Private project with token
options = GitLabStorageOptions(
project_id=12345,
token="glpat_xxxx",
ref="develop"
)

# NEW - Use fsspeckit (recommended)
from fsspeckit import filesystem
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils import filesystem
# Enable caching with monitoring
fs = filesystem(
"s3://my-bucket/",
cached=True,
cache_storage="/tmp/my_cache",
verbose=True
)
# Cache preserves directory structure
data = fs.cat("deep/nested/path/file.txt")
# Cached at: /tmp/my_cache/deep/nested/path/file.txt

# NEW - Use fsspeckit (recommended)
from fsspeckit.utils import run_parallel
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.utils import run_parallel
# Run function in parallel
def process_file(path, multiplier=1):
return len(path) * multiplier
results = run_parallel(
process_file,
["/path1", "/path2", "/path3"],
multiplier=2,
n_jobs=4,
verbose=True
)

# NEW - Use fsspeckit (recommended)
from fsspeckit.utils import dict_to_dataframe, to_pyarrow_table
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.utils import dict_to_dataframe, to_pyarrow_table
# Convert dict to DataFrame
data = {"col1": [1, 2, 3], "col2": [4, 5, 6]}
df = dict_to_dataframe(data)
# Convert to PyArrow table
table = to_pyarrow_table(df)

# NEW - Use fsspeckit (recommended)
from fsspeckit.utils import setup_logging
# DEPRECATED - fsspec-utils (for reference only)
# from fsspec_utils.utils import setup_logging
# Configure logging
setup_logging(level="DEBUG", format_string="{time} | {level} | {message}")

Core dependencies:
- fsspec>=2023.1.0 - Filesystem interface
- msgspec>=0.18.0 - Serialization
- pyyaml>=6.0 - YAML support
- requests>=2.25.0 - HTTP requests
- loguru>=0.7.0 - Logging
- orjson>=3.8.0 - Fast JSON processing

Optional dependencies:
- polars>=0.19.0 - Fast DataFrames
- pyarrow>=10.0.0 - Columnar data
- pandas>=1.5.0 - Data analysis
- joblib>=1.3.0 - Parallel processing
- rich>=13.0.0 - Progress bars

Cloud provider extras:
- boto3>=1.26.0, s3fs>=2023.1.0 - AWS S3
- gcsfs>=2023.1.0 - Google Cloud Storage
- adlfs>=2023.1.0 - Azure Storage
Contributions are welcome! Please feel free to submit a Pull Request.
This project is licensed under the MIT License - see the LICENSE file for details.
This package was extracted from the FlowerPower workflow framework to provide standalone filesystem utilities that can be used independently or as a dependency in other projects.