From 7d54df71de1c20780c0edc41db4657230acf461f Mon Sep 17 00:00:00 2001 From: matthewpeterkort Date: Tue, 5 May 2026 13:57:03 -0700 Subject: [PATCH 1/3] fix add error passing --- cmd/remote/add/gen3.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/remote/add/gen3.go b/cmd/remote/add/gen3.go index 9f9ceed7..33151ebc 100644 --- a/cmd/remote/add/gen3.go +++ b/cmd/remote/add/gen3.go @@ -99,13 +99,13 @@ func gen3Init(remoteName, credFile, fenceToken, project, organization, bucket st default: existing, err := configure.Load(remoteName) - if err == nil { + if err != nil { + return fmt.Errorf("failed to load %s config: %w", remoteName, err) + } else { accessToken = existing.AccessToken apiKey = existing.APIKey keyID = existing.KeyID apiEndpoint = existing.APIEndpoint - } else { - return fmt.Errorf("must provide either --cred or --token (or have existing profile %s)", remoteName) } } From 254a25f0b04c0b31074a1b0315b77e5490fc1426 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 6 May 2026 15:28:50 -0700 Subject: [PATCH 2/3] dont delete repo;force push instead (#227) Co-authored-by: Matthew Peterkort <33436238+matthewpeterkort@users.noreply.github.com> --- tests/monorepos/e2e-monorepo-remote.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/monorepos/e2e-monorepo-remote.sh b/tests/monorepos/e2e-monorepo-remote.sh index 7c40376b..a53dc5a2 100755 --- a/tests/monorepos/e2e-monorepo-remote.sh +++ b/tests/monorepos/e2e-monorepo-remote.sh @@ -504,7 +504,7 @@ validate_config() { configure_remote_auth() { MONO_REMOTE_URL_AUTH="$MONO_REMOTE_URL" if [[ -n "$TEST_GITHUB_TOKEN" && "$MONO_REMOTE_URL" =~ ^https://github.com/ ]]; then - MONO_REMOTE_URL_AUTH="${MONO_REMOTE_URL/https:\/\/github.com\//https:\/\/x-access-token:${TEST_GITHUB_TOKEN}@github.com/}" + MONO_REMOTE_URL_AUTH="${MONO_REMOTE_URL/https:\/\/github.com\//https://x-access-token:${TEST_GITHUB_TOKEN}@github.com/}" fi } @@ -571,11 +571,12 @@ 
delete_github_repo_if_requested() { require_cmd gh if GH_TOKEN="$TEST_GITHUB_TOKEN" gh api "/repos/${GITHUB_OWNER_REPO}" >/dev/null 2>&1; then - log "Deleting existing GitHub repo ${GITHUB_OWNER_REPO} for clean test run" - GH_TOKEN="$TEST_GITHUB_TOKEN" gh api -X DELETE "/repos/${GITHUB_OWNER_REPO}" >/dev/null - DELETED_REMOTE_REPO_AT_START=true - # Small wait to avoid eventual-consistency race with immediate recreation. - sleep 2 + # log "Deleting existing GitHub repo ${GITHUB_OWNER_REPO} for clean test run" + # GH_TOKEN="$TEST_GITHUB_TOKEN" gh api -X DELETE "/repos/${GITHUB_OWNER_REPO}" >/dev/null + # DELETED_REMOTE_REPO_AT_START=true + # # Small wait to avoid eventual-consistency race with immediate recreation. + # sleep 2 + log "Skipping deletion of existing GitHub repo ${GITHUB_OWNER_REPO}; using push -f instead" fi } @@ -683,7 +684,7 @@ push_dataset() { git add .gitattributes git commit -m "Initialize LFS tracking" || true # Ensure origin/main is established as upstream for subsequent git-drs pushes. 
- git push --set-upstream "$MONO_REMOTE_NAME" "$MONO_GIT_BRANCH" + git push -f --set-upstream "$MONO_REMOTE_NAME" "$MONO_GIT_BRANCH" if [[ "$MONO_RUN_MULTIPART_SMOKE" == "true" ]]; then mkdir -p fixtures/multipart-smoke From 1fda5621ef9cbda55a353deb50f785729ec11dc2 Mon Sep 17 00:00:00 2001 From: Brian Date: Thu, 7 May 2026 09:14:39 -0700 Subject: [PATCH 3/3] update docs for latest (#226) * update docs for latest * document new,non-lfs paths * extract issues * refactor lfs out of docs * update docs --------- Co-authored-by: matthewpeterkort Co-authored-by: Matthew Peterkort <33436238+matthewpeterkort@users.noreply.github.com> --- README.md | 174 ++--- ...sue-add-include-pattern-to-git-drs-pull.md | 51 ++ docs/adding-s3-files.md | 26 +- ...-drs-endpoints-and-transfer-concurrency.md | 216 ++++++ docs/commands.md | 639 +++++------------- docs/developer-guide.md | 21 +- docs/drs-registerfile-upsert.md | 4 +- docs/e2e-modes-and-local-setup.md | 6 +- docs/getting-started.md | 421 ++++++------ docs/installation.md | 48 +- docs/precommit-cache-addurl-prepush.md | 10 +- docs/precommit.md | 41 +- docs/troubleshooting.md | 581 ++++++---------- 13 files changed, 971 insertions(+), 1267 deletions(-) create mode 100644 attic/issue-add-include-pattern-to-git-drs-pull.md create mode 100644 docs/architecture-drs-endpoints-and-transfer-concurrency.md diff --git a/README.md b/README.md index 9fa44367..81ea2f6c 100644 --- a/README.md +++ b/README.md @@ -3,143 +3,113 @@ --- # NOTICE -git-drs is not yet fully compliant with DRS. It currently works against Gen3 DRS server. Full GA4GH DRS support is expected once v1.6 of the specification has been published. +`git-drs` is not a pure GA4GH DRS client. It targets Syfon/Gen3-style DRS workflows and uses extensions where repo-scale behavior requires them. 
--- [![Tests](https://github.com/calypr/git-drs/actions/workflows/test.yaml/badge.svg)](https://github.com/calypr/git-drs/actions/workflows/test.yaml) -**Git/DRS orchestration with optional Git LFS compatibility** +**Git/DRS orchestration with Git-compatible pointer workflows** -Git DRS manages Git-facing DRS workflows: local metadata, Git hooks, filter behavior, lookup/register/push/pull orchestration, and optional Git LFS compatibility. Provider-specific transfer, signed URL behavior, and direct cloud inspection live in client code outside this repo. +`git-drs` manages: + +- remote Gen3/Syfon configuration +- local DRS metadata +- pointer-aware push/pull orchestration +- bucket-scoped object reference workflows ## Key Features -- **Unified Workflow**: Manage both code and large data files using standard Git commands -- **DRS Integration**: Built-in support for Gen3 DRS servers -- **Multi-Remote Support**: Work with development, staging, and production servers in one repository -- **Automatic Processing**: Files are processed automatically during commits and pushes -- **Flexible Tracking**: Track individual files, patterns, or entire directories +- unified Git/data workflow around DRS-backed pointers +- Gen3/Syfon integration +- multiple remotes in one repository +- explicit file tracking and hydration +- metadata-only reference support for existing bucket objects ## How It Works -Git DRS works alongside Git LFS when you want LFS-compatible pointers and storage, while still supporting DRS-centric workflows: +At a high level: -1. **Initialization**: Set up repository and DRS server configuration -2. **Automatic Commits**: Create DRS objects during pre-commit hooks -3. **Automatic Pushes**: Register files with DRS servers and upload to configured storage -4. **On-Demand Downloads**: Pull specific files or patterns as needed +1. initialize the repository with `git drs init` +2. configure a remote for one `organization/project` +3. 
track file patterns with `git drs track` +4. add/commit/push normally +5. hydrate pointer files later with `git drs pull` ## Quick Start -### Installation - ```bash -# Install Git LFS first -brew install git-lfs # macOS -git lfs install --skip-smudge - -# Install Git DRS -/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/main/install.sh)" -- $GIT_DRS_VERSION - -# Install global Git filter configuration for git-drs git drs install -``` - -### Basic Usage - -```bash -# Initialize repository (one-time Git repo setup) git drs init - -# Add DRS remote -git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --organization my-program \ - --project my-project \ - --bucket my-bucket - -# Required prerequisite (usually steward/admin setup): -# create bucket credentials, then map org/project to full storage roots before users run push/pull -git drs bucket add production \ - --bucket my-bucket \ - --region us-east-1 \ - --access-key "$AWS_ACCESS_KEY_ID" \ - --secret-key "$AWS_SECRET_ACCESS_KEY" \ - --s3-endpoint https://s3.amazonaws.com -git drs bucket add-organization production \ - --organization my-program \ - --path s3://my-bucket/my-program -git drs bucket add-project production \ - --organization my-program \ - --project my-project \ - --path s3://my-bucket/my-program/my-project - -# Track files -git lfs track "*.bam" +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +git drs track "*.bam" git add .gitattributes - -# Add and commit files -git add my-file.bam -git commit -m "Add data file" +git add sample.bam +git commit -m "Add sample" git push - -# Download files -git lfs pull -I "*.bam" +git drs ls-files +git drs pull -I "*.bam" ``` -## Documentation +## Current CLI Shape -For detailed setup and usage information: +The cleaned CLI intentionally removed legacy commands: -- **[Getting Started](docs/getting-started.md)** - Repository setup 
and basic workflows -- **[Commands Reference](docs/commands.md)** - Complete command documentation -- **[Installation Guide](docs/installation.md)** - Platform-specific installation -- **[Troubleshooting](docs/troubleshooting.md)** - Common issues and solutions -- **[E2E Modes + Local Setup](docs/e2e-modes-and-local-setup.md)** - Local vs remote mode, server config, and end-to-end runbooks -- **[Cloud/Object Integration](docs/adding-s3-files.md)** - Adding files from provider URLs or configured bucket object keys -- **[Developer Guide](docs/developer-guide.md)** - Internals and development +- removed: + - `git drs fetch` + - `git drs list` + - `git drs upload` + - `git drs download` +- `git drs pull` is hydration-only +- `git drs ls-files` is the local file inventory command +- `git drs remote add gen3` takes scope as `organization/project` -## Supported Servers +Example: -- **Gen3 Data Commons** (e.g., CALYPR) +```bash +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +``` -## Supported Environments +## Bucket Mapping Model -- **Local Development** environments -- **HPC Systems** (e.g., ARC) +End users should not need to know the bucket name. -## Commands Overview +Push and pull depend on server-side bucket mapping for the requested scope. That mapping is normally provisioned once by a steward/admin using the bucket commands. 
-| Command | Description | -| ---------------------- | ------------------------------------- | -| `git drs install` | Install global git-drs filter config | -| `git drs init` | Initialize repository | -| `git drs remote add` | Add a DRS remote server | -| `git drs remote list` | List configured remotes | -| `git drs remote set` | Set default remote | -| `git drs add-url` | Add files via provider URLs or configured bucket object keys | -| `git lfs track` | Track file patterns with LFS | -| `git lfs ls-files` | List tracked files | -| `git lfs pull` | Download tracked files | -| `git drs fetch` | Fetch metadata from DRS server | -| `git drs push` | Push objects to DRS server | +## Common Commands -Use `--help` with any command for details. See [Commands Reference](docs/commands.md) for complete documentation. +| Command | Description | +| --- | --- | +| `git drs install` | Install global `git-drs` filter config | +| `git drs init` | Initialize repository-local `git-drs` state | +| `git drs remote add gen3 [remote] <organization/project>` | Add or refresh a Gen3/Syfon remote | +| `git drs remote list` | List configured remotes | +| `git drs remote set <remote>` | Set the default remote | +| `git drs track <pattern>` | Track files or globs | +| `git drs untrack <pattern>` | Stop tracking files or globs | +| `git drs ls-files` | List tracked files and localization state | +| `git drs pull` | Hydrate pointer files in the current checkout | +| `git drs push` | Register/upload objects and push metadata workflow | +| `git drs add-url` | Add an existing provider object by URL or scoped key | +| `git drs add-ref` | Add a local reference to an existing DRS object | +| `git drs query` | Query a DRS object by ID | +| `git drs copy-records` | Copy Syfon records between remotes for one scope | -## Requirements +## Documentation -- Git LFS installed and configured -- Access credentials for your DRS server -- Go 1.24+ (for building from source) +- [Getting Started](docs/getting-started.md) +- [Commands 
Reference](docs/commands.md) +- [Troubleshooting](docs/troubleshooting.md) +- [Developer Guide](docs/developer-guide.md) +- [GA4GH DRS Scalability Gaps](docs/ga4gh-drs-scalability-gaps.md) -## Support +## Requirements -- **Issues**: [GitHub Issues](https://github.com/calypr/git-drs/issues) -- **Releases**: [GitHub Releases](https://github.com/calypr/git-drs/releases) -- **Documentation**: See `docs/` folder for detailed guides +- Git +- access credentials for the target Gen3/Syfon deployment +- Go 1.26.2+ for local builds -## License +## Support -This project is part of the CALYPR data commons ecosystem. +- [GitHub Issues](https://github.com/calypr/git-drs/issues) +- [GitHub Releases](https://github.com/calypr/git-drs/releases) diff --git a/attic/issue-add-include-pattern-to-git-drs-pull.md b/attic/issue-add-include-pattern-to-git-drs-pull.md new file mode 100644 index 00000000..4217ab3b --- /dev/null +++ b/attic/issue-add-include-pattern-to-git-drs-pull.md @@ -0,0 +1,51 @@ +# Add `-I "pattern"` include filter support to `git drs pull` + +## Summary +Add include-pattern filtering to `git drs pull`, similar to legacy `git lfs pull -I "pattern"` workflows. + +## Motivation +Current `git drs pull` behavior pulls based on repository resolution without a user-facing path pattern filter. Users migrating from `git lfs pull -I` expect selective hydration of files by glob/path. + +## Proposed UX +Support: + +```bash +git drs pull -I "results/*.txt" +git drs pull -I "*.bam" -I "data/**" +git drs pull --include "path/to/file" +``` + +Optional: +- `--exclude` parity (if desired in same change or follow-up) + +## Proposed behavior +1. Parse one or more include patterns (`-I`, `--include`). +2. Resolve candidate pointers as usual. +3. Filter by repo-relative path match before download. +4. Download only matched objects; skip others with clear logging. +5. If no pattern supplied, preserve current default behavior. 
+ +## Scope +- `cmd/pull/main.go` CLI flags and pull selection pipeline +- pointer/path inventory layer (where path<->OID candidates are produced) +- docs: `docs/commands.md`, `docs/getting-started.md`, `docs/troubleshooting.md` +- tests for include filtering semantics + +## Acceptance criteria +- [ ] `git drs pull -I "<pattern>"` works for a single pattern. +- [ ] Repeated `-I` flags are supported. +- [ ] Include matching is against repo-relative paths. +- [ ] Default `git drs pull` behavior unchanged when no `-I` is passed. +- [ ] Help text documents pattern syntax and examples. +- [ ] Unit/integration tests cover positive and negative matches. + +## Testing matrix +- Single file exact path include. +- Wildcard include (`*.bam`, `data/**`). +- Multiple `-I` values. +- No matches (should no-op cleanly and return success unless policy says otherwise). +- Mixed matched/unmatched objects in same pull run. + +## Notes +This closes a usability gap for users transitioning from `git lfs` CLI habits to `git drs` commands while keeping pull behavior explicit and predictable. + diff --git a/docs/adding-s3-files.md b/docs/adding-s3-files.md index cb233826..f8a83a6b 100644 --- a/docs/adding-s3-files.md +++ b/docs/adding-s3-files.md @@ -1,6 +1,6 @@ # Adding Provider Objects with `git drs add-url` -`git drs add-url` prepares a Git LFS pointer plus local DRS metadata for an object that already exists in provider storage. +`git drs add-url` prepares a Git pointer plus local DRS metadata for an object that already exists in provider storage. Important behavior: @@ -26,7 +26,7 @@ The inspector also accepts other go-cloud styles (`gs://`, `azblob://`, `file:// If your remote org/project already has a bucket mapping, pass an object key relative to that configured bucket scope and set `--scheme`. 
```bash -git lfs track "data/*.bin" +git drs track "data/*.bin" git add .gitattributes git drs add-url path/to/object.bin data/from-bucket.bin \ @@ -54,7 +54,7 @@ git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin \ If you know the authoritative SHA256, pass `--sha256`. ```bash -git lfs track "data/*.bin" +git drs track "data/*.bin" git add .gitattributes git drs add-url path/to/object.bin data/from-bucket.bin \ @@ -66,25 +66,25 @@ git commit -m "add known-sha object" git drs push ``` -## Unknown SHA256 (experimental sentinel mode) +## Unknown SHA256 If SHA256 is unknown, omit `--sha256`. Behavior: 1. `add-url` performs object metadata lookup (HEAD/attributes). -2. Synthetic OID is derived from ETag (`sha256(etag)`). -3. A local sentinel object is written into `.git/lfs/objects/...`. -4. `git drs push` performs metadata-only registration. +2. A deterministic placeholder OID is derived from remote object metadata. +3. A pointer file and local DRS metadata are written. +4. `git drs push` performs metadata registration. ```bash -git lfs track "data/*.bin" +git drs track "data/*.bin" git add .gitattributes git drs add-url path/to/object.bin data/from-bucket.bin --scheme s3 git add data/from-bucket.bin -git commit -m "add unknown-sha object (sentinel mode)" +git commit -m "add unknown-sha object" git drs push ``` @@ -103,7 +103,7 @@ For e2e/dev harnesses, `TEST_BUCKET_*` variables are also supported by command-l ## Prerequisites -- File path must be LFS-tracked (via `.gitattributes`). +- File path must be tracked (via `.gitattributes`). - Remote configuration must point to the intended org/project scope. - The bucket credential and org/project storage scope must exist on drs-server, for example via `git drs bucket add`, then `git drs bucket add-organization` or `git drs bucket add-project --path s3://bucket/prefix`. @@ -118,13 +118,13 @@ Usually region/endpoint mismatch for S3-compatible storage. 
### `no local payload available; skipping upload and keeping metadata-only registration` -Expected for add-url pointer/sentinel flows where local payload bytes are intentionally absent. +Expected for add-url pointer/metadata-only flows where local payload bytes are intentionally absent. -### `file is not tracked by LFS` +### `file is not tracked` Track the path pattern and re-add: ```bash -git lfs track "data/*.bin" +git drs track "data/*.bin" git add .gitattributes ``` diff --git a/docs/architecture-drs-endpoints-and-transfer-concurrency.md b/docs/architecture-drs-endpoints-and-transfer-concurrency.md new file mode 100644 index 00000000..7f5d258a --- /dev/null +++ b/docs/architecture-drs-endpoints-and-transfer-concurrency.md @@ -0,0 +1,216 @@ +# Architecture: DRS Endpoint Flows, Transfer Concurrency, and `add-url`/`add-ref` + +This document explains three implementation areas in `git-drs`: + +1. How user-issued Git/Git-DRS commands map to GA4GH DRS endpoint calls. +2. How transfer concurrency works for upload and download. +3. How `add-url` and `add-ref` work, including when and where SHA existence is checked on the DRS server. + +--- + +## 1) Command to Endpoint Trace (User command -> Code path -> DRS API) + +## 1.1 High-level command routing + +- User-facing commands are registered in `cmd/root.go`. +- Relevant command entrypoints: + - `git drs push` -> `cmd/push/main.go` + - `git drs pull` -> `cmd/pull/main.go` + - `git drs ls-files` -> `cmd/lsfiles/main.go` + - `git drs query` -> `cmd/query/main.go` + - `git drs add-ref` -> `cmd/addref/add-ref.go` + - `git drs add-url` -> `cmd/addurl/service.go` + +`git-drs` obtains a remote-specific API client via `config.LoadConfig()` + `cfg.GetRemoteClient(...)` (see `internal/config/remote.go`). + +## 1.2 Endpoint mapping matrix + +The table below maps command behavior to DRS client calls and the corresponding DRS API intent. 
+ +| User command | Main call path | DRS client method(s) | DRS endpoint intent | +| --- | --- | --- | --- | +| `git drs query ` | `cmd/query/main.go` | `DRS().GetObject(drs_id)` | Get object by DRS ID (`/ga4gh/drs/v1/objects/{id}` style) | +| `git drs query --checksum ` | `cmd/query/main.go` -> `drsremote.ObjectsByHashForScope` | `DRS().BatchGetObjectsByHash([]checksum)` | Lookup objects by checksum (`/ga4gh/drs/v1/objects/checksum/{checksum}` style; asserted in tests) | +| `git drs ls-files --drs` | `cmd/lsfiles/main.go` | `DRS().BatchGetObjectsByHash([]checksum)` | Check DRS registration status for local tracked files | +| `git drs pull` | `cmd/pull/main.go` -> `drsremote.DownloadToCachePath` | `DRS().BatchGetObjectsByHash`, `DRS().GetAccessURL`; optional bulk via `DRSAPI().GetBulkAccessURLWithResponse` | Resolve missing OIDs to DRS records and access URLs, then hydrate content into the current checkout | +| `git drs push [remote]` | `cmd/push/main.go` -> `pushsync.BatchSyncForPush` | `DRS().BatchGetObjectsByHash`, `DRS().RegisterObjects`, `DRS().GetAccessURL` | Check checksum presence, register missing records, probe/downloadability before upload | +| `git drs add-ref ` | `cmd/addref/add-ref.go` | `DRS().GetObject(drs_uri)` | Resolve existing DRS object and write pointer | + +Notes: + +- `internal/drsremote/remote_test.go` explicitly verifies some concrete paths: + - checksum lookup path `/ga4gh/drs/v1/objects/checksum/{sha}` + - bulk access path `/ga4gh/drs/v1/objects/access` + - access URL path `/ga4gh/drs/v1/objects/{id}/access/{type}` +- `git drs pre-push-prepare` also calls a non-GA4GH metadata staging endpoint: + - `POST {remote}/info/drs/objects/metadata` (`cmd/prepush/main.go`) + - This is optional capability and not part of GA4GH DRS. 
+ +## 1.3 Trace from standard Git commands + +`git-drs` participates in both explicit `git drs ...` commands and standard Git workflows after `git drs init`: + +- `git drs init` installs hooks (`cmd/initialize/main.go`): + - pre-commit: `git drs precommit` +- During a normal `git push`, pre-push metadata can be staged via `/info/drs/objects/metadata` before transfer. +- The explicit `git drs push` command runs the register/upload workflow, then runs `git push --no-verify` by default (`cmd/push/main.go`). + +--- + +## 2) Transfer Concurrency Model (Upload and Download) + +### Concurrency mechanism: in-process goroutines only + +All transfer concurrency in `git-drs` is **in-process**, implemented with **Go goroutines and channels**. There is no use of OS-level multi-processing (no `fork`/`exec` of worker processes) for data movement. + +- Upload object fan-out uses `golang.org/x/sync/errgroup` — goroutines with a shared context and bounded by `errgroup.SetLimit(n)`. +- Download chunk parallelism uses the `sydownload` library, which internally uses goroutines to issue concurrent HTTP range requests. +- Sub-process calls (`exec.Command("git", ...)`) appear only for Git metadata operations (for example `git checkout`, `git ls-files`, `git check-attr`), never for data-transfer concurrency. + +## 2.1 Upload concurrency (`git drs push`) + +Upload tuning originates from Git config and is carried in `config.GitContext`: + +- `lfs.concurrenttransfers` -> `UploadConcurrency` (Git config key) +- `drs.multipart-threshold` (MB) -> `MultiPartThreshold` + +See `internal/config/remote.go` (`newGitContext`) and `cmd/initialize/main.go` (`initGitConfig`). + +### Upload execution strategy + +In `internal/pushsync/batch_sync.go`: + +1. Build upload candidates. +2. Split candidates into: + - small files: `size < MultiPartThreshold` + - large files: `size >= MultiPartThreshold` +3. 
Small files upload in parallel using `errgroup.WithContext` + `eg.SetLimit(UploadConcurrency)` + `eg.Go(goroutine)` — **in-process goroutine fan-out**. +4. Large files upload sequentially (single goroutine, no additional concurrency). + +Key implementation points: + +- `executeUploadPlan(...)` controls fan-out and limits. +- Actual upload call is `syupload.UploadObjectFile(...)` in `internal/pushsync/register.go`. +- `forceMultipart` is computed per file (`fileSize >= threshold`) and passed to upload. + +Operationally, this gives bounded goroutine parallelism for many small objects while reducing resource contention for very large uploads. + +## 2.2 Download concurrency (`git drs pull`) + +Download concurrency is set via `sydownload.DownloadOptions`: + +- `MultipartThreshold: 5 MiB` +- `Concurrency: 2` +- `ChunkSize: 64 MiB` + +These values are currently hardcoded in `internal/drsremote/remote.go` (`downloadResolved`) and apply to the pull/hydration workflow. + +### Intra-object chunk concurrency + +The `sydownload` library implements **goroutine-based HTTP range-request concurrency** within a single object download: + +- `resolvedSource.GetRangeReader(ctx, guid, offset, length)` issues an HTTP range (`Range: bytes=offset-end`) request. +- `sydownload.DownloadToPathWithOptions` coordinates up to `Concurrency` (2) goroutines issuing simultaneous range requests per object. +- This is purely in-process; no subprocess is spawned. + +### Object-level iteration in pull + +- In `cmd/pull/main.go`, missing OIDs are processed in a **sequential** `for` loop — one object at a time. +- Each object download can still be internally chunk-concurrent (up to `Concurrency=2` goroutines) via `sydownload`. +- So pull concurrency is **intra-object** (goroutine-based chunk/range concurrency), not broad object fan-out. +- Bulk metadata prefetch (DRS objects + bulk access URLs) is performed **before** the sequential download loop to amortize API round-trips. 
+ +## 2.3 Git metadata subprocesses + +Some flows still call Git commands directly for repository state inspection. + +- These are **subprocess** calls (`exec.Command("git", ...)`), not goroutine fan-out. +- Examples include tracked-file discovery and attribute inspection used by `ls-files` and `pull`. +- This is distinct from the goroutine-based `git drs push` upload fan-out and `sydownload` chunk concurrency. + +--- + +## 3) `add-url` and `add-ref`: Implementation and SHA existence checks + +## 3.1 `add-url` implementation + +Main logic lives in `cmd/addurl/service.go`. + +Workflow: + +1. Parse CLI input (`cmd/addurl/params.go`). +2. Resolve remote scope (org/project/bucket/prefix) (`cmd/addurl/scope.go`). +3. Resolve source object URL (full URL mode or key+`--scheme` mode). +4. Inspect object using cloud client (`sycloud.InspectObject`). +5. Ensure object identity: + - If `--sha256` provided: trust it as OID. + - Otherwise: derive a deterministic placeholder OID from remote object metadata. +6. Write pointer file to worktree. +7. Best-effort update of pre-commit cache (`updatePrecommitCache`). +8. Ensure file is tracked if needed. +9. Write/update local DRS metadata object under `.git/drs/lfs/objects` (`writeAddURLDrsObject`). + +### Does `add-url` query DRS server for SHA existence? + +Not immediately. `add-url` is local-preparation oriented: + +- It inspects provider object metadata. +- It writes local pointer + local DRS metadata. +- Server checksum existence is checked later during push (see section 3.3). + +## 3.2 `add-ref` implementation + +Main logic is in `cmd/addref/add-ref.go`. + +Workflow: + +1. Resolve remote client. +2. Call `DRS().GetObject(drs_uri)`. +3. Create parent directory if needed. +4. Write pointer from returned DRS object checksums (`lfs.CreateLfsPointer`). + +### Does `add-ref` query DRS server for SHA existence? + +It does not perform a checksum lookup endpoint call. 
It verifies existence by object ID (`GetObject`) and consumes checksum from that object payload. + +## 3.3 Where SHA existence check against DRS actually happens + +Checksum existence checks are performed during `git drs push` in `internal/pushsync/batch_sync.go`: + +1. `lookupMetadata()` iterates OIDs and calls: + - `drsremote.ObjectsByHash(...)` -> `DRS().BatchGetObjectsByHash(...)` +2. If no records exist for an OID, object candidate is included for bulk registration: + - `DRS().RegisterObjects(...)` +3. Upload decision is then based on registration status + downloadability probe. + +So for both `add-url` and `add-ref`, the checksum-existence gate is primarily deferred to push-time synchronization logic. + +--- + +## 4) End-to-end sequence summaries + +## 4.1 `git drs add-url ...` then `git drs push` + +1. `add-url`: local pointer + local DRS object prepared. +2. `push`: checksum lookup (`BatchGetObjectsByHash`). +3. Missing checksum -> `RegisterObjects`. +4. If payload required and available -> upload via syfon transfer. +5. Git refs pushed. + +## 4.2 `git drs add-ref ` then `git drs pull` + +1. `add-ref`: `GetObject(drs_id)` and write pointer. +2. `pull`: detect unresolved pointers. +3. For each OID, resolve scoped object by checksum and access URL. +4. Download to local object cache and hydrate the tracked file in the worktree. + +--- + +## 5) Practical implications for operators and developers + +- If you need immediate server-side checksum validation during `add-url`, that behavior does not exist today; validation happens at push time. +- All transfer concurrency is in-process (goroutines); no subprocess workers are used for data movement. +- Upload concurrency is configurable through Git config (`lfs.concurrenttransfers` key) and is implemented as a goroutine pool bounded by `errgroup.SetLimit`. 
+- Download concurrency is fixed (not configurable at runtime): `Concurrency=2` goroutines per object for HTTP range requests, currently hardcoded in `internal/drsremote/remote.go`. +- Object-level download iteration in `git drs pull` is sequential; only intra-object chunk downloads are concurrent. +- Git metadata discovery still uses subprocess calls, but those are repository inspection details, not data-transfer concurrency. + +--- diff --git a/docs/commands.md b/docs/commands.md index e1a7329f..43ac6d5a 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -1,617 +1,288 @@ # Commands Reference -Complete reference for Git DRS and related Git LFS commands. +Complete reference for the `git-drs` CLI as used on the `fix/cli` line. -Git DRS owns Git/DRS orchestration and local metadata. Direct provider access, signed URL behavior, and cloud inspection are client-side responsibilities reached through `syfon/client`. +Git DRS owns Git/DRS orchestration and local metadata. Provider access, signed URL behavior, and cloud inspection are handled through Syfon and client code behind these commands. -> **Navigation:** [Getting Started](getting-started.md) → **Commands Reference** → [Troubleshooting](troubleshooting.md) +> **Navigation:** [Getting Started](getting-started.md) -> **Commands Reference** -> [Troubleshooting](troubleshooting.md) -## Git DRS Commands +## Command Model -### `git drs install` - -Install global Git filter configuration for git-drs. This is equivalent in purpose to running `git-lfs install` for the git-drs filter. - -**Usage:** +`git-drs` is intentionally smaller now. 
-```bash -git drs install -``` +- Removed legacy commands: + - `git drs fetch` + - `git drs list` + - `git drs upload` + - `git drs download` +- `git drs pull` now mirrors `git lfs pull` semantics: + - it hydrates tracked pointer files in the current checkout + - it does not run `git pull` +- `git drs ls-files` is the `git lfs ls-files` analog: + - local-first inventory + - optional DRS registration checks +- `git drs remote add gen3` now takes scope as a positional `organization/project` -**What it does:** +## Core Setup -- Sets global Git config for `filter.drs.clean` -- Sets global Git config for `filter.drs.smudge` -- Sets global Git config for `filter.drs.process` -- Sets global Git config for `filter.drs.required` +### `git drs install` -**Resulting `~/.gitconfig` entries:** +Install global Git filter configuration for `git-drs`. -```ini -[filter "drs"] - clean = git-drs clean -- %f - smudge = git-drs smudge -- %f - process = git-drs filter - required = true +```bash +git drs install ``` -**When to run:** - -- **Once per machine/user** after installing `git-drs` -- Re-run any time you want to reset these global filter values +This sets the global `filter.drs.*` entries used by Git clean/smudge/filter operations. ### `git drs init` -Initialize Git DRS in a repository. Sets up Git DRS hooks and creates a `.git/drs/` directory that Git ignores automatically. - -**Usage:** +Initialize `git-drs` in the current repository. 
```bash git drs init [flags] ``` -**Options:** - -- `--transfers `: Number of concurrent transfers (default: 4) - -**Example:** - -```bash -git drs init -``` - -**What it does:** - -- Creates `.git/drs/` directory structure -- Configures Git/LFS settings for git-drs managed push/pull -- Installs Git hooks for DRS workflows - -**When to run:** - -- **Once** after cloning a Git repository -- **Once** after creating a new Git repository -- **Never** needed for subsequent work sessions - -**You do NOT need to run `git drs init` again:** - -- When starting a new work session -- After refreshing credentials -- After pulling new changes +Common flags: -**Note:** Run this before adding remotes. +- `--transfers `: concurrent transfers +- `--upsert`: enable upsert behavior for push/register flows +- `--multipart-threshold `: multipart threshold in MB +- `--enable-data-client-logs`: enable lower-level client logging -### `git drs remote` +Run this once per repository. -Manage DRS remote server configurations. Git DRS supports multiple remotes for working with development, staging, and production servers. +## Remote Configuration -#### `git drs remote add gen3 ` +### `git drs remote add gen3 [remote-name] ` -Add a Gen3 DRS server configuration. - -**Usage:** +Add or refresh a Gen3-backed Syfon remote. 
```bash -git drs remote add gen3 \ - --url \ - --cred \ - --organization \ - --project \ - [--bucket ] +git drs remote add gen3 [remote-name] <organization/project> [--cred <credentials-file> | --token <token>] ``` -**Options:** - -- `--url `: Gen3 server endpoint (required) -- `--cred `: Path to credentials JSON file (required) -- `--token `: Token for temporary access (alternative to --cred) -- `--organization `: Program/organization scope used for bucket mapping -- `--project `: Project ID (required) -- `--bucket `: Bucket name fallback when no org/project mapping is configured - -**Examples:** +Examples: ```bash -# Add production remote -git drs remote add gen3 production \ - --url https://calypr-public.ohsu.edu \ - --cred /path/to/credentials.json \ - --organization my-program \ - --project my-project - -# Add staging remote -git drs remote add gen3 staging \ - --url https://staging.calypr.ohsu.edu \ - --cred /path/to/staging-credentials.json \ - --organization staging-program \ - --project staging-project +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +git drs remote add gen3 staging HTAN_INT/BForePC --token "$GEN3_TOKEN" ``` -**Note:** The first remote you add automatically becomes the default remote. -**Important:** A bucket mapping for the target `organization/project` must already exist, typically created once by a steward/admin with `git drs bucket add`, then `git drs bucket add-organization` or `git drs bucket add-project --path :///`. Without that mapping, push/pull operations will fail. +Notes: -#### `git drs remote list` +- `remote-name` is optional; if omitted, the default remote name is used. +- scope is always one positional argument: `organization/project` +- `--cred` imports a Gen3 credential file +- `--token` uses a temporary bearer token +- bucket resolution is scope-driven; users do not need to provide `--bucket` +- endpoint resolution comes from the credential/token path; users do not need to provide `--url` -List all configured DRS remotes. 
+Prerequisite: -**Usage:** - -```bash -git drs remote list -``` - -**Example Output:** - -``` -* production gen3 https://calypr-public.ohsu.edu - staging gen3 https://staging.calypr.ohsu.edu - development gen3 https://dev.calypr.ohsu.edu -``` +- the target `organization/project` must already be mapped to a bucket on the server +- if no local repo mapping exists, `git-drs` can resolve the visible bucket from the server -The `*` indicates the default remote used by all commands unless specified otherwise. +### `git drs remote list` -#### `git drs remote set ` - -Set the default DRS remote for all operations. - -**Usage:** - -```bash -git drs remote set -``` - -**Examples:** +List configured DRS remotes. ```bash -# Switch to staging for testing -git drs remote set staging - -# Switch back to production -git drs remote set production - -# Verify change git drs remote list ``` -### `git drs fetch [remote-name]` - -Fetch DRS object metadata from remote server. Downloads metadata only, not actual files. +### `git drs remote set ` -**Usage:** +Set the default DRS remote. ```bash -# Fetch from default remote -git drs fetch - -# Fetch from specific remote -git drs fetch staging -git drs fetch production -``` - -**Note:** `fetch` and `push` are commonly used together for cross-remote workflows. See `git drs push` below. - -**What it does:** - -- Identifies remote and project from configuration -- Transfers all DRS records for a given project from the server to the local `.git/drs/lfs/objects/` directory - -### `git drs add-url [path]` - -Prepare a pointer plus local DRS metadata for an object that already exists in provider storage. 
- -**Usage:** - -```bash -# Preferred: object key resolved against configured bucket scope -git drs add-url path/to/object.bin data/from-bucket.bin --scheme s3 - -# Compatibility: explicit provider URL -git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin -``` - -**Options:** - -- `--scheme `: Required for object-key mode because local bucket mappings persist bucket/prefix, not provider scheme -- `--sha256 `: Expected SHA256 checksum when known - -**What it does:** - -- Resolves the effective org/project bucket scope for the current remote -- Inspects the provider object through client-owned cloud code -- Writes a Git LFS pointer into the worktree -- Stores local DRS metadata for later registration during `git drs push` - -### `git drs push [remote-name]` - -Push local DRS objects to server. Uploads new files and registers metadata. - -**Usage:** - -```bash -# Push to default remote -git drs push - -# Push to specific remote -git drs push staging -git drs push production -``` - -**What it does:** - -- Checks local `.git/drs/lfs/objects/` for DRS metadata -- For each object, uploads file to bucket if file exists locally -- If file doesn't exist locally (metadata only), registers metadata without upload -- This enables cross-remote promotion workflows - -**Cross-Remote Promotion:** - -Transfer DRS records from one remote to another (eg staging to production) without re-uploading files: - -```bash -# Fetch metadata from staging -git drs fetch staging - -# Push metadata to production (no file upload since files don't exist locally) -git drs push production -``` - -This is useful when files are already in the production bucket with matching SHA256 hashes. It can also be used to reupload files given that the files are pulled to the repo first. - -**Note:** `fetch` and `push` are commonly used together. `fetch` pulls metadata from one remote, `push` registers it to another. - -### `git drs query` - -Query a DRS object by its DRS ID or SHA256 checksum. 
- -**Usage:** - -```bash -# Query by DRS ID (default behavior) -git drs query - -# Query by SHA256 checksum -git drs query --checksum +git drs remote set production ``` -**Options:** +## Bucket Mapping -- `--checksum`, `-c`: Treat the argument as a SHA256 checksum instead of a DRS ID. -- `--pretty`, `-p`: Output indented JSON for easier reading. -- `--remote`, `-r`: Target a specific remote (default: default_remote). +These commands are typically steward/admin setup, not day-to-day end-user commands. -**Examples:** +### `git drs bucket add` -```bash -# Query by checksum and pretty-print the result -git drs query --checksum 9f2c2db77f0a3e2b47e4b44b8ce8d4c8c3c4c0b5f4c5a2d2f9b1d0bfb0a1c2d3 --pretty +Declare bucket credentials for a remote. -# Query by DRS ID against a specific remote -git drs query did:example:12345 --remote staging -``` - -### `git drs add-url` - -Prepare a file reference via cloud object URL for DRS registration. +### `git drs bucket add-organization` -**Usage:** +Map an organization to a bucket path. ```bash -# Stage local pointer + DRS metadata -git drs add-url [path] [--sha256 ] -# Register/push prepared records -git drs push +git drs bucket add-organization production \ + --organization HTAN_INT \ + --path s3://cbds/htan-int ``` -**Examples:** +### `git drs bucket add-project` -```bash -# Known SHA path -git drs add-url s3://bucket/path/file.bin data/file.bin --sha256 - -# Unknown SHA path (experimental sentinel mode) -git drs add-url s3://bucket/path/file.bin data/file.bin -``` - -**Options:** - -- `--sha256 `: Optional SHA256 hash of the source object. - If omitted, add-url uses experimental ETag-derived sentinel mode and registers a synthetic OID. - -**Notes:** - -- `add-url` no longer accepts per-command AWS credential flags. -- S3 connection hints are resolved from environment/runtime config when needed (for example `AWS_REGION`, `AWS_ENDPOINT_URL`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`). 
-- Registration happens on `git drs push`, not at `add-url` time. - -### `git drs version` - -Display Git DRS version information. +Map an organization/project to a bucket path. ```bash -git drs version +git drs bucket add-project production \ + --organization HTAN_INT \ + --project BForePC \ + --path s3://cbds/htan-int/bforepc ``` -### `git drs track [pattern ...]` - -Manage Git LFS tracking patterns from Git DRS. - -**View tracked patterns:** +## File Tracking and Hydration -```bash -git drs track -``` +### `git drs track` -**Track one or more patterns:** +Track files or patterns with Git-compatible pointer behavior. ```bash git drs track "*.bam" -git drs track "*.bam" "data/**" +git drs track "data/**" ``` -**Options:** - -- `--verbose`: Show detailed Git LFS output -- `--dry-run`: Show what would change without writing `.gitattributes` +Stage `.gitattributes` after changing tracked patterns. -### `git drs untrack [pattern ...]` +### `git drs untrack` -Remove one or more Git LFS tracking patterns. +Stop tracking patterns. ```bash git drs untrack "*.bam" -git drs untrack "*.bam" "data/**" -``` - -**Options:** - -- `--verbose`: Show detailed Git LFS output -- `--dry-run`: Show what would change without writing `.gitattributes` - -### Internal Commands - -These commands are called automatically by Git hooks: - -- `git drs precommit`: Process staged files during commit -- `git drs pre-push-prepare`: Stage DRS metadata before push -- `git lfs pre-push`: Optional Git LFS compatibility push flow (invoked by the pre-push hook when enabled) - -## Git LFS Commands - -### `git lfs track` - -Manage file tracking patterns. - -**View Tracked Patterns:** - -```bash -git lfs track -``` - -**Track New Pattern:** - -```bash -git lfs track "*.bam" -git lfs track "data/**" -git lfs track "specific-file.txt" -``` - -**Untrack Pattern:** - -```bash -git lfs untrack "*.bam" ``` -### `git lfs ls-files` - -List LFS-tracked files in the repository. 
+### `git drs ls-files [pathspec...]` -**All Files:** +List tracked LFS-style files in the current checkout. ```bash -git lfs ls-files +git drs ls-files +git drs ls-files data/** +git drs ls-files -I "*.bam" +git drs ls-files --drs +git drs ls-files -l --drs +git drs ls-files -n results/** ``` -**Specific Pattern:** - -```bash -git lfs ls-files -I "*.bam" -git lfs ls-files -I "data/**" -``` +Important behavior: -**Output Format:** +- default mode is local-first and cheap +- `*` means localized/hydrated in the worktree +- `-` means the worktree still contains a pointer +- `--drs` adds DRS registration checks -- `*` prefix: File is localized (downloaded) -- `-` prefix: File is not localized -- No prefix: File status unknown +Common flags: -### `git lfs pull` +- `-I, --include `: include filter; may be repeated +- `-l, --long`: long output +- `-n, --name-only`: path-only output +- `--json`: structured output +- `--drs`: check DRS registration status -Download LFS-tracked files. +### `git drs pull` -**All Files:** +Hydrate tracked pointer files in the current checkout. ```bash -git lfs pull +git drs pull +git drs pull -I "*.bam" +git drs pull -I "data/**" -I "results/*.txt" +git drs pull --dry-run -I "results/**" ``` -**Specific Files:** +Important behavior: -```bash -git lfs pull -I "*.bam" -git lfs pull -I "data/important.txt" -git lfs pull -I "results/**" -``` - -**Multiple Patterns:** +- `git drs pull` does not run `git pull` +- it only hydrates tracked pointer files already present in the current checkout +- include matching is against repo-relative paths -```bash -git lfs pull -I "*.bam" -I "*.vcf" -``` +Common flags: -### `git lfs install` +- `-I, --include `: include filter; may be repeated +- `--dry-run`: show what would be hydrated without downloading -Configure Git LFS for the system or repository. 
+## Object Registration and Push -**System-wide:** - -```bash -git lfs install --skip-smudge -``` +### `git drs push [remote-name]` -**Repository-only:** +Register and upload tracked objects, then rely on normal Git push for refs. ```bash -git lfs install --local --skip-smudge +git drs push +git drs push production ``` -The `--skip-smudge` option prevents automatic downloading of all LFS files during clone/checkout. - -## Standard Git Commands - -Git DRS integrates with standard Git commands: - -### `git add` - -Stage files for commit. LFS-tracked files are automatically processed. +What it does: -```bash -git add myfile.bam -git add data/ -git add . -``` +- resolves local pointer/object metadata +- uploads local bytes when needed +- registers object metadata with the target Syfon instance -### `git commit` +### `git drs add-url [path]` -Commit changes. Git DRS pre-commit hook runs automatically. +Create a pointer and local metadata for an object that already exists in provider storage. ```bash -git commit -m "Add new data files" +git drs add-url path/to/object.bin data/from-bucket.bin --scheme s3 +git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin +git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin --sha256 ``` -### `git push` +Notes: -Push commits to remote. Git DRS automatically uploads new files to DRS server. +- object-key mode resolves against the configured bucket scope +- explicit provider URL mode remains supported +- `--scheme` is required for object-key mode -```bash -git push -git push origin main -``` - -### `git clone` +### `git drs add-ref ` -Clone repository. Use with Git DRS initialization: +Add a local pointer file for an existing DRS object. ```bash -git clone -cd -git drs init -git drs remote add gen3 production --cred /path/to/credentials.json --url ... --organization ... --project ... 
+git drs add-ref drs://example/object-id data/object.bin ``` -## Workflow Examples +### `git drs query ` -### Complete File Addition Workflow +Query a DRS object by ID. ```bash -# 1. Ensure file type is tracked -git lfs track "*.bam" -git add .gitattributes - -# 2. Add your file -git add mydata.bam - -# 3. Verify tracking -git lfs ls-files -I "mydata.bam" - -# 4. Commit (creates DRS record) -git commit -m "Add analysis results" - -# 5. Push (uploads to default DRS server) -git push +git drs query drs://example/object-id ``` -### Selective File Download - -```bash -# Check what's available -git lfs ls-files +## Metadata Copy -# Download specific files -git lfs pull -I "results/*.txt" -git lfs pull -I "important-dataset.bam" - -# Verify download -git lfs ls-files -I "results/*.txt" -``` +### `git drs copy-records [source-remote] ` -### Repository Setup from Scratch +Copy Syfon metadata records from one remote to another for a single project scope. ```bash -# 1. Create and clone repo -git clone -cd - -# 2. Initialize Git DRS -git drs init - -# 3. Add DRS remote -git drs remote add gen3 production \ - --url https://calypr-public.ohsu.edu \ - --cred /path/to/credentials.json \ - --organization my-program \ - --project my-project - -# 4. Set up file tracking -git lfs track "*.bam" -git lfs track "*.vcf.gz" -git lfs track "data/**" -git add .gitattributes -git commit -m "Configure LFS tracking" -git push - -# 5. Add data files -git add data/sample1.bam -git commit -m "Add sample data" -git push +git drs copy-records prod HTAN_INT/BForePC +git drs copy-records dev prod HTAN_INT/BForePC ``` -### Cross-Remote Promotion Workflow +Behavior: -```bash -# 1. 
Add multiple remotes -git drs remote add gen3 staging \ - --url https://staging.calypr.ohsu.edu \ - --cred /path/to/staging-credentials.json \ - --organization staging-program \ - --project staging-project - -git drs remote add gen3 production \ - --url https://calypr-public.ohsu.edu \ - --cred /path/to/prod-credentials.json \ - --organization prod-program \ - --project prod-project - -# 2. Fetch metadata from staging -git drs fetch staging - -# 3. Push metadata to production (no re-upload) -git drs push production -``` +- with one remote arg: + - source defaults to the configured default remote + - arg is treated as the target remote +- with two remote args: + - first is source + - second is target +- copies metadata only, not object bytes -## Environment Variables +Merge behavior for existing target records: -Git DRS respects these environment variables: +- match by DID +- union `controlled_access` +- union `access_methods` +- preserve existing target metadata otherwise -- `AWS_ACCESS_KEY_ID`: AWS access key (for S3 operations) -- `AWS_SECRET_ACCESS_KEY`: AWS secret key (for S3 operations) +## Removed Legacy Commands -## Help and Documentation +These commands are gone from the cleaned CLI: -Use `--help` with any command for detailed usage: +- `git drs fetch` +- `git drs list` +- `git drs upload` +- `git drs download` -```bash -git-drs --help -git-drs init --help -git-drs add-url --help -git lfs --help -git lfs track --help -``` +If older docs or notes mention them, treat those references as stale. 
diff --git a/docs/developer-guide.md b/docs/developer-guide.md index e9751130..df82388c 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -10,7 +10,7 @@ Git DRS integrates with Git through several mechanisms: **Pre-commit Hook**: `git drs precommit` - Triggered automatically before each commit -- Processes all staged LFS files +- Processes all staged files - Creates DRS records for new files - Only processes files that don't already exist on the DRS server - Prepares metadata for later upload during push @@ -34,7 +34,7 @@ Git DRS integrates with Git through several mechanisms: - Stores in .git/drs/ directory 4. Developer: git push 5. Git Hook: git drs pre-push-prepare - - Stages pending metadata for LFS verify + - Stages pending metadata for DRS verify 6. Git DRS: - `git drs push` runs register/upload directly - `git drs pull` runs download directly @@ -44,8 +44,8 @@ Git DRS integrates with Git through several mechanisms: Git DRS no longer uses a custom transfer agent. -- Upload path (primary): `git drs push` discovers local LFS pointers, bulk-registers missing objects, checks validity, and uploads missing bits. -- Download path (primary): `git drs pull` resolves object records and downloads into local LFS object storage. +- Upload path (primary): `git drs push` discovers local pointers, bulk-registers missing objects, checks validity, and uploads missing bits. +- Download path (primary): `git drs pull` resolves object records and downloads into local object storage. 
## Repository Structure @@ -73,12 +73,12 @@ drs/ # DRS object utilities ├── object.go # DRS object structures └── util.go # Utility functions -lfs/ # Git LFS integration -└── lfs.go # LFS pointer/discovery helpers +lfs/ # Pointer utilities +└── lfs.go # Pointer/discovery helpers utils/ # Shared utilities ├── common.go # Common functions -├── lfs-track.go # LFS tracking utilities +├── lfs-track.go # Tracking utilities └── util.go # General utilities ``` @@ -97,14 +97,13 @@ servers: ### DRS Object Management -Objects are stored in `.git/drs/lfs/objects/` during pre-commit and referenced during push/pull workflows. +Objects are stored in `.git/drs/objects/` during pre-commit and referenced during push/pull workflows. ## Development Setup ### Prerequisites -- Go 1.24+ -- Git LFS installed +- Go 1.26.2+ - Access to a DRS server for testing ### Building from Source @@ -152,7 +151,7 @@ export PATH=$PATH:$(pwd) ```bash # Test specific functionality -go test ./utils -run TestLFSTrack +go test ./utils -run TestTrack ``` ### Integration Tests diff --git a/docs/drs-registerfile-upsert.md b/docs/drs-registerfile-upsert.md index 72f162b5..03d1cff4 100644 --- a/docs/drs-registerfile-upsert.md +++ b/docs/drs-registerfile-upsert.md @@ -1,4 +1,4 @@ -# ADR 0001: Configure RegisterFile upsert/bucket checks via git LFS config +# ADR 0001: Configure RegisterFile upsert/bucket checks via git config ## Status Accepted @@ -8,7 +8,7 @@ The DRS `RegisterFile` flow needs toggles for: - whether to upsert DRS records (create when no matching project record exists, or replace by deleting and re-registering when a ma - whether to check bucket existence before uploading (Unimplemented, currently always checks and skips upload if already present) -These toggles must be controlled per-repository using git LFS configuration (`git config` entries under `drs.*`). This keeps behavior in repo-local configuration and avoids coupling to remote YAML configuration. 
+These toggles must be controlled per-repository using git config (`git config` entries under `drs.*`). This keeps behavior in repo-local configuration and avoids coupling to remote YAML configuration. ## Decision Read `drs.upsert` from git config during DRS client initialization. Missing values default to `false`. Invalid values fail initialization with a clear error. diff --git a/docs/e2e-modes-and-local-setup.md b/docs/e2e-modes-and-local-setup.md index 20820cf2..eb0de218 100644 --- a/docs/e2e-modes-and-local-setup.md +++ b/docs/e2e-modes-and-local-setup.md @@ -81,7 +81,7 @@ TEST_STRICT_CLEANUP=true - HTTP basic auth via: - `TEST_LOCAL_USERNAME` + `TEST_LOCAL_PASSWORD`, or - `TEST_ADMIN_AUTH_HEADER="Authorization: Basic "` -- `git drs remote add local ... --username ... --password ...` stores local basic auth in repo config for helper/LFS flows. +- `git drs remote add local ... --username ... --password ...` stores local basic auth in repo config for credential-helper flows. ## How wrapper scripts map to the main suites @@ -138,7 +138,7 @@ What it covers: - `git drs push` metadata register + upload - multipart/resume behavior -- `git drs pull` and `git lfs pull` compatibility checks +- `git drs pull` download and compatibility checks - cleanup by DID resolution ## Local add-url E2E: runbook @@ -152,7 +152,7 @@ bash tests/e2e-local-addurl.sh What it covers: - known-sha add-url path (`--sha256 `) -- unknown-sha add-url path (sentinel pointer OID) +- unknown-sha add-url path (placeholder pointer OID) - push/register + pull hydration checks ## Monorepo E2E (remote and local) diff --git a/docs/getting-started.md b/docs/getting-started.md index 92a2b636..cfef54f3 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -1,25 +1,32 @@ # Getting Started -This guide walks you through setting up Git DRS and performing common workflows. +This guide walks through the current `git-drs` workflow on the cleaned CLI path. 
-> **Navigation:** [Installation](installation.md) → **Getting Started** → [Commands Reference](commands.md) → [Troubleshooting](troubleshooting.md) +> **Navigation:** [Installation](installation.md) -> **Getting Started** -> [Commands Reference](commands.md) -> [Troubleshooting](troubleshooting.md) -## Repository Initialization +## What `git-drs` Does -Every Git repository using Git DRS requires configuration, whether you're creating a new repo or cloning an existing one. +`git-drs` manages: -### Cloning Existing Repository (Gen3) +- Git-compatible pointer files +- local DRS metadata +- remote Syfon/Gen3 configuration +- pointer hydration and object registration workflows -1. **Clone the Repository** +It no longer tries to be a mixed bag of Git, Git LFS, and DRS transport wrappers. + +## Cloning an Existing Repository + +1. Clone the repository: ```bash git clone .git cd ``` -2. **Configure SSH** (if using SSH URLs) +2. If you use SSH remotes, make sure your SSH setup is already working for that host. - If using SSH URLs like `git@github.com:user/repo.git`, add to `~/.ssh/config`: + A typical keepalive configuration looks like: ``` Host github.com @@ -27,347 +34,361 @@ Every Git repository using Git DRS requires configuration, whether you're creati ServerAliveInterval 30 ``` -3. **Get Credentials** - - - Log in to your data commons (e.g., https://calypr-public.ohsu.edu/) - - Profile → Create API Key → Download JSON - - **Note**: Credentials expire after 30 days - -4. **Initialize Repository** +3. Initialize `git-drs` in the repo: ```bash git drs init ``` -5. **Verify Configuration** +4. Hydrate tracked files if needed: ```bash - git drs remote list + git drs pull ``` - Output: - ``` - * production gen3 https://calypr-public.ohsu.edu/ - ``` +This is the normal onboarding flow for an existing repo. `git drs pull` hydrates pointer files already present in the checkout. It does not replace `git pull`. - The `*` indicates this is the default remote. 
+## One-Time Machine Setup -### New Repository Setup (Gen3) +Install `git-drs` and the global Git filter configuration: -1. **Create and Clone Repository** +```bash +git drs install +``` - ```bash - git clone .git - cd - ``` +## One-Time Repository Setup + +After cloning or creating a repository: + +```bash +git drs init +``` + +That sets up repository-local `git-drs` state and hooks. + +## Add a Gen3 Remote + +The current shape is: + +```bash +git drs remote add gen3 [remote-name] <organization/project> [--cred <credentials-file> | --token <token>] +``` + +Example: + +```bash +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +``` -2. **Configure SSH** (if needed - same as above) +Notes: -3. **Get Credentials** (same as above) +- scope is one positional argument: `organization/project` +- users do not provide `--bucket` +- users do not provide `--url` +- bucket resolution is scope-based and server-backed -4. **Get Project Details** +Verify: - Contact your data coordinator for: - - DRS server URL - - Organization name - - Project ID - - Bucket name - - Confirmation that bucket mapping exists for your organization/project +```bash +git drs remote list +``` -5. **Initialize Git DRS** +## New Repository Setup + +For a new repository or a repository that has not yet been configured with `git-drs`: + +1. Initialize the repository: ```bash git drs init ``` -6. **Add Remote Configuration** +2. Add the target remote: ```bash - git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket + git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json ``` - **Note:** Since this is your first remote, it automatically becomes the default. No need to run `git drs remote set`. - -7. **Verify Configuration** +3. 
Verify the configuration: ```bash git drs remote list ``` - Output: - ``` - * production gen3 https://calypr-public.ohsu.edu - ``` +## Steward/Admin Prerequisite - **Important:** `git drs remote add` alone is not enough. Push/pull requires an existing bucket mapping for your `organization/project` (usually provisioned once by a steward/admin). +Push and pull depend on server-side bucket mapping for the target scope. -**Managing Additional Remotes** - -You can add more remotes later for multi-environment workflows (development, staging, production): +That usually means a steward/admin has already done something like: ```bash -# Add staging remote -git drs remote add gen3 staging \ - --cred /path/to/staging-credentials.json \ - --url https://staging.calypr.ohsu.edu \ - --project staging-project \ - --bucket staging-bucket - -# View all remotes -git drs remote list +git drs bucket add production \ + --bucket cbds \ + --region us-east-1 \ + --access-key "$AWS_ACCESS_KEY_ID" \ + --secret-key "$AWS_SECRET_ACCESS_KEY" + +git drs bucket add-organization production \ + --organization HTAN_INT \ + --path s3://cbds/htan-int + +git drs bucket add-project production \ + --organization HTAN_INT \ + --project BForePC \ + --path s3://cbds/htan-int/bforepc +``` -# Switch default remote -git drs remote set staging +End users generally should not need to know the bucket name. -# Or use specific remote for one command -git drs push production -git drs fetch staging -``` +## Credentials -## File Tracking +For Gen3-backed deployments: -Git DRS can use Git LFS-compatible pointers and local object storage. You must explicitly track file patterns before adding LFS-managed files. +- obtain a credential JSON or token from the target data commons +- the common path is: log in -> profile -> create API key -> download JSON +- refresh it when it expires +- re-run `git drs remote add gen3 ... 
--cred ...` when you need to refresh the stored profile -### View Current Tracking +Example: ```bash -git lfs track +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json ``` -### Track Files +## Managing Additional Remotes -**Single File** +You can add multiple remotes for multi-environment workflows. ```bash -git lfs track path/to/specific-file.txt -git add .gitattributes +git drs remote add gen3 staging HTAN_INT/BForePC --cred /path/to/staging-credentials.json +git drs remote list +git drs remote set staging ``` -**File Pattern** +Or target a non-default remote for a single command: ```bash -git lfs track "*.bam" -git add .gitattributes +git drs push production +git drs copy-records staging production HTAN_INT/BForePC ``` -**Directory** +## Track Files + +Track file types or paths you want managed by `git-drs`: ```bash -git lfs track "data/**" +git drs track "*.bam" git add .gitattributes ``` -### Untrack Files +You can also track explicit paths or path globs: ```bash -# View tracked patterns -git lfs track - -# Remove pattern -git lfs untrack "*.bam" - -# Stage changes +git drs track "data/**" git add .gitattributes ``` -## Basic Workflows +View current tracking: + +```bash +git drs track +``` -### Adding and Pushing Files +Stop tracking patterns: ```bash -# Track file type (if not already tracked) -git lfs track "*.bam" +git drs untrack "*.bam" git add .gitattributes +``` -# Add your file -git add myfile.bam - -# Verify LFS is tracking it -git lfs ls-files +## Add, Commit, and Push -# Commit and push -git commit -m "Add new data file" +```bash +git add sample.bam +git commit -m "Add sample" git push ``` -> **Note**: Git DRS automatically creates DRS records during commit and uploads files to the default remote during push. +`git-drs` handles pointer/object registration behavior around the Git workflow. 
-### Downloading Files +## Inspect Tracked Files -**Single File** +Use `ls-files` as the local inventory command: ```bash -git lfs pull -I path/to/file.bam +git drs ls-files +git drs ls-files -l +git drs ls-files --drs +git drs ls-files -I "*.bam" ``` -**Pattern** +Interpretation: -```bash -git lfs pull -I "*.bam" -``` - -**All Files** +- `*` means localized/hydrated in the worktree +- `-` means the worktree still contains a pointer -```bash -git lfs pull -``` +## Hydrate Files -**Directory** +Use `git drs pull` only for hydration. ```bash -git lfs pull -I "data/**" +git drs pull +git drs pull -I "*.bam" +git drs pull -I "results/**" -I "*.txt" ``` -### Checking File Status - -```bash -# List all LFS-tracked files -git lfs ls-files +Important: -# Check specific pattern -git lfs ls-files -I "*.bam" +- `git drs pull` does not run `git pull` +- run plain `git pull` yourself when you want new commits/trees +- then run `git drs pull` if you need to hydrate pointer files in the checkout -# View localization status -# (-) = not localized, (*) = localized -git lfs ls-files -``` +## Add Existing Bucket Objects -## Working with Cloud Object URLs - -You can add references to existing bucket objects without copying them: +If the object already exists in provider storage, use `add-url`: ```bash # Track the file pattern first -git lfs track "myfile.txt" +git drs track "myfile.txt" git add .gitattributes # Add object reference (known sha256 path) -git drs add-url s3://bucket/path/to/file \ +git drs add-url s3://bucket/path/to/file myfile.txt \ --sha256 -# Or use unknown-sha (experimental sentinel mode) -git drs add-url s3://bucket/path/to/file +# Or use unknown-sha +git drs add-url s3://bucket/path/to/file myfile.txt # Commit and push +git add myfile.txt git commit -m "Add S3 file reference" git push ``` -See [Cloud URL Integration Guide](adding-s3-files.md) for detailed examples. 
- -## Configuration Management - -### View Configuration +Scoped bucket-key mode also works: ```bash -git drs remote list +git drs add-url path/to/object.bin data/from-bucket.bin --scheme s3 +git commit -m "Add bucket-backed object reference" +git push ``` -### Update Configuration +Explicit provider URL mode also works: ```bash -# Refresh credentials - re-add remote with new credentials -git drs remote add gen3 production \ - --cred /path/to/new-credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket - -# Switch default remote -git drs remote set staging +git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin ``` -### View Logs +## Session Workflow -- Logs location: `.git/drs/` directory +> **Note:** You do not need to run `git drs init` again. Initialization is a one-time setup per local repository clone. -## Command Summary +For a normal work session: -| Action | Commands | -| ------------------ | ------------------------------------------- | -| **Initialize** | `git drs init` | -| **Add remote** | `git drs remote add gen3 --cred...` | -| **View remotes** | `git drs remote list` | -| **Set default** | `git drs remote set ` | -| **Track files** | `git lfs track "pattern"` | -| **Check tracked** | `git lfs ls-files` | -| **Add files** | `git add file.ext` | -| **Commit** | `git commit -m "message"` | -| **Push** | `git push` | -| **Download** | `git lfs pull -I "pattern"` | +1. Refresh credentials if needed -## Session Workflow + ```bash + git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json + ``` + +2. Update Git history if needed + + ```bash + git pull + ``` -> **Note**: You do NOT need to run `git drs init` again. Initialization is a one-time setup per Git repository clone. +3. Hydrate tracked files if needed -For each work session: + ```bash + git drs pull + ``` -1. **Refresh credentials** (if expired - credentials expire after 30 days) +4. 
Work with files normally ```bash - git drs remote add gen3 production \ - --cred /path/to/new-credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket + git add ... + git commit -m "..." + git push ``` -2. **Work with files** (track, add, commit, push) +## Configuration Management + +View current remote configuration: + +```bash +git drs remote list +``` + +Refresh or update credentials by re-adding the remote: + +```bash +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json +``` ## Local DRS Server Setup -Use this flow when developing against a local `drs-server` instead of hosted Gen3. +Use this flow when developing against a local Syfon/DRS server instead of a hosted Gen3 deployment. -1. **Initialize repo** +1. Initialize the repo: ```bash git drs init ``` -2. **Add local remote** +2. Add the local remote: ```bash - git drs remote add local origin http://localhost:8080 \ - --organization calypr \ - --project end_to_end_test \ - --bucket cbds \ - --username drs-user \ - --password drs-pass + git drs remote add local origin http://localhost:8080 ``` - If your local server has no basic auth, omit `--username/--password`. + If your local server requires basic auth, include the local auth flags supported by that command. -3. **Track and push** +3. Track and push: ```bash - git lfs track "*.bin" + git drs track "*.bin" git add .gitattributes data/example.bin git commit -m "Add local DRS test file" git drs push ``` -4. **Verify pull** +4. Verify hydration: ```bash git drs pull - # or the Git LFS compatibility path - git lfs pull ``` -For complete local/remote mode behavior and e2e runbooks, see [E2E Modes + Local Setup](e2e-modes-and-local-setup.md). +For full local/remote runbooks, see [E2E Modes + Local Setup](e2e-modes-and-local-setup.md). -3. 
**Download files as needed** +## Copy Metadata Between Remotes - ```bash - git lfs pull -I "required-files*" - ``` +Use `copy-records` to copy Syfon metadata records between remotes for a single scope: -## Next Steps +```bash +git drs copy-records dev prod HTAN_INT/BForePC +``` + +Or let the default remote be the source: + +```bash +git drs copy-records prod HTAN_INT/BForePC +``` + +This copies metadata only. It does not copy object bytes between buckets. + +## Common Flow Summary + +```bash +git drs install +git drs init +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +git drs track "*.bam" +git add .gitattributes +git add sample.bam +git commit -m "Add sample" +git push +git drs ls-files +git drs pull -I "*.bam" +``` -- [Commands Reference](commands.md) - Complete command documentation -- [Troubleshooting](troubleshooting.md) - Common issues and solutions -- [Developer Guide](developer-guide.md) - Advanced usage and internals +For command details, see [commands.md](commands.md). diff --git a/docs/installation.md b/docs/installation.md index 723303d7..0f111459 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -4,21 +4,7 @@ This guide covers installation of Git DRS across different environments and targ ## Prerequisites -All installations require [Git LFS](https://git-lfs.com/) to be installed first: - -```bash -# macOS -brew install git-lfs - -# Linux (download binary) -wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.0/git-lfs-linux-amd64-v3.7.0.tar.gz -tar -xvf git-lfs-linux-amd64-v3.7.0.tar.gz -export PREFIX=$HOME -./git-lfs-v3.7.0/install.sh - -# Configure LFS -git lfs install --skip-smudge -``` +Git DRS requires Git to be installed. Install Git DRS using the steps below, then run `git drs install` to configure Git filters. ## Local Installation (Gen3 Server) @@ -33,9 +19,9 @@ git lfs install --skip-smudge 2. 
**Update PATH** ```bash - # Add to ~/.bash_profile or ~/.zshrc + # Add to your shell startup file (for example ~/.zshrc, ~/.bashrc, or ~/.profile) export PATH="$PATH:$HOME/.local/bin" - source ~/.bash_profile # or source ~/.zshrc + source ~/.zshrc # or source your shell startup file ``` 3. **Verify Installation** @@ -61,27 +47,7 @@ git lfs install --skip-smudge ### Steps -1. **Install Git LFS on HPC** - ```bash - # Download and install Git LFS - wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.1/git-lfs-linux-amd64-v3.7.1.tar.gz - tar -xvf git-lfs-linux-amd64-v3.7.1.tar.gz - export PREFIX=$HOME - ./git-lfs-3.7.1/install.sh - - # Make permanent - echo 'export PATH="$HOME/bin:$PATH"' >> ~/.bash_profile - source ~/.bash_profile - - # Configure - git lfs install --skip-smudge - - # Cleanup - rm git-lfs-linux-amd64-v3.7.0.tar.gz - rm -r git-lfs-3.7.0/ - ``` - -2. **Configure Git/SSH (if needed)** +1. **Configure Git/SSH (if needed)** ```bash # Generate SSH key ssh-keygen -t ed25519 -C "your_email@example.com" @@ -94,7 +60,7 @@ git lfs install --skip-smudge cat ~/.ssh/id_ed25519.pub ``` -3. **Install Git DRS** +2. **Install Git DRS** ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/main/install.sh)" @@ -103,7 +69,7 @@ git lfs install --skip-smudge source ~/.bash_profile ``` -4. **Verify Installation** +3. 
**Verify Installation** ```bash git-drs version git drs install @@ -133,8 +99,6 @@ After installation, verify your setup: # Check Git DRS version git-drs version -# Check Git LFS -git lfs version # View configured remotes (after setup) git drs remote list diff --git a/docs/precommit-cache-addurl-prepush.md b/docs/precommit-cache-addurl-prepush.md index fe77ec08..924050c3 100644 --- a/docs/precommit-cache-addurl-prepush.md +++ b/docs/precommit-cache-addurl-prepush.md @@ -5,10 +5,10 @@ Proposed ## Context `cmd/precommit` now maintains a local cache under `.git/drs/pre-commit/v1` that records: -- path → LFS OID in `paths/.json` +- path → OID in `paths/.json` - OID → paths + URL hint in `oids/.json` -`precommit_cache` provides read helpers for this cache and is intended to let the pre-push hook validate against authoritative sources while using cached hints to avoid re-scanning worktrees. `cmd/addurl` currently writes the LFS pointer and DRS files but does not update the pre-commit cache. `cmd/prepush` currently computes updates without consulting the cache. This means: +`precommit_cache` provides read helpers for this cache and is intended to let the pre-push hook validate against authoritative sources while using cached hints to avoid re-scanning worktrees. `cmd/addurl` currently writes the pointer and DRS files but does not update the pre-commit cache. `cmd/prepush` currently computes updates without consulting the cache. This means: - `add-url`-created objects are invisible to cache-aware workflows unless a pre-commit hook runs later. - `pre-push` cannot leverage cached OID/path/url hints or detect mismatches early. @@ -16,7 +16,7 @@ Proposed Update `cmd/addurl` and `cmd/prepush` to integrate with the pre-commit cache, while preserving the current fallback behavior when the cache is missing or stale. ### Changes required in `cmd/addurl` -1. **Write cache entries after LFS pointer creation** +1. 
**Write cache entries after pointer creation** - Create/update the path entry (`paths/.json`) using the same encoding as `cmd/precommit` (`base64.RawURLEncoding` of the repo-relative path). - Create/update the OID entry (`oids/.json`) using the same OID hashing (`sha256(oid string)`), ensuring the `paths` list includes the new path. 2. **Persist the external URL hint** @@ -30,13 +30,13 @@ Update `cmd/addurl` and `cmd/prepush` to integrate with the pre-commit cache, wh ### Changes required in `cmd/prepush` 1. **Use `precommit_cache` to seed work** - - Open the cache early and, when available, use it to map pushed paths/branches to their LFS OIDs and cached URL hints. + - Open the cache early and, when available, use it to map pushed paths/branches to their OIDs and cached URL hints. - If the cache is missing or entries are stale, fall back to current discovery/update logic. 2. **Validate cached URL hints** - When `updateDrsObjects` resolves authoritative URLs, compare them to cached hints via `precommit_cache.CheckExternalURLMismatch`. - Warn (or fail, depending on policy) on mismatches to surface potentially stale or incorrect metadata before pushing. 3. **Prefer cache data for DRS updates** - - Use cached OIDs/paths to reduce redundant file scans for LFS pointers. + - Use cached OIDs/paths to reduce redundant file scans for pointers. - Carry cached `external_url` into DRS metadata when authoritative sources are unavailable, while still treating it as non-authoritative. ## Consequences diff --git a/docs/precommit.md b/docs/precommit.md index 91682b6c..89bb318d 100644 --- a/docs/precommit.md +++ b/docs/precommit.md @@ -16,11 +16,11 @@ This repository uses a **local, non-versioned cache** under: .git/drs/pre-commit/ ``` -to support fast, offline-friendly workflows for **Git LFS–tracked files**. +to support fast, offline-friendly workflows for **Git DRS–tracked files**. 
The cache is: -* **LFS-only** +* **pointer-only** * **non-authoritative** * **local to a working copy** * **never committed to Git** @@ -44,7 +44,7 @@ Its sole purpose is to bridge the gap between: * Updates `.git/drs/pre-commit` cache * Never performs network I/O * Never queries DRS or DRS -* Ignores all non-LFS files +* Ignores all non-tracked files ### `precommit_cache` (helper library) @@ -61,7 +61,7 @@ Its sole purpose is to bridge the gap between: ## Cache Scope (Important) -Only files whose **staged content** is a valid Git LFS pointer are in scope: +Only files whose **staged content** is a valid Git DRS pointer are in scope: ``` version https://git-lfs.github.com/spec/v1 @@ -112,7 +112,7 @@ The cache models **three non-authoritative relationships**: 3. **OID → External URL (hint)** All are **hints only**. -The authoritative source of truth lives on the server (DRS / DRS). +The authoritative source of truth lives on the server (DRS). --- @@ -122,7 +122,7 @@ The authoritative source of truth lives on the server (DRS / DRS). `v1/paths/.json` -Represents the **currently staged** LFS object at a given working-tree path. +Represents the **currently staged** DRS object at a given working-tree path. ```json { @@ -135,7 +135,7 @@ Represents the **currently staged** LFS object at a given working-tree path. Notes: * `path` is repo-relative -* `lfs_oid` comes from the staged LFS pointer +* `lfs_oid` comes from the staged DRS pointer * Updated on: * add @@ -149,7 +149,7 @@ Notes: `v1/oids/.json` -Represents **advisory information** about an LFS object. +Represents **advisory information** about a DRS object. ```json { @@ -187,16 +187,16 @@ Used to record deleted paths for potential GC or debugging. 
## Pre-Commit Behavior (What Happens Automatically) -### Add / Modify LFS File +### Add / Modify Tracked File -* Extracts LFS OID from staged pointer +* Extracts OID from staged pointer * Updates: * `paths/.json` * `oids/.json` * Preserves any existing `external_url` hint -### Rename / Move LFS File +### Rename / Move Tracked File * Moves `paths/.json` → `paths/.json` * Updates OID entry paths list @@ -291,7 +291,7 @@ url, ok, err := cache.LookupExternalURLByOID(oid) * Hint only * May be stale or missing -* Must be validated against DRS / DRS +* Must be validated against DRS --- @@ -322,7 +322,7 @@ Used by pre-push to compare local hints with server truth. ## Intended Pre-Push Usage Pattern 1. Determine commit range from pre-push stdin -2. Enumerate **LFS OIDs** referenced by pushed commits +2. Enumerate **OIDs** referenced by pushed commits 3. For each OID: * Optionally read local hints from `precommit_cache` @@ -377,9 +377,7 @@ sequenceDiagram participant PC as pre-commit hook (cmd/precommit) participant Cache as .git/drs/pre-commit (local cache) participant PP as pre-push hook - participant LFS as git-lfs participant IDX as DRS (authoritative) - participant DRS as DRS (authoritative) Dev->>Git: git add Dev->>Git: git commit @@ -387,10 +385,10 @@ sequenceDiagram Git->>PC: invoke pre-commit (no stdin) PC->>Git: git diff --cached --name-status -M PC->>Git: git show : (staged pointer) - alt staged file is LFS pointer + alt staged file is DRS pointer PC->>Cache: write paths/.json (path -> oid) PC->>Cache: upsert oids/.json (oid -> paths[] + external_url hint) - else non-LFS file + else non-tracked file PC-->>Git: ignore (out of scope) end PC-->>Git: exit 0 (commit proceeds) @@ -398,13 +396,10 @@ sequenceDiagram Dev->>Git: git push Git->>PP: invoke pre-push (stdin: ref updates) PP->>PP: compute commit ranges from stdin - PP->>LFS: enumerate LFS OIDs referenced by pushed commits + PP->>IDX: enumerate OIDs referenced by pushed commits loop for each required OID 
PP->>Cache: lookup external_url hint (optional) PP->>IDX: resolve by sha256 (OID) -> object_id + urls[] - opt DRS resolution - PP->>DRS: resolve by object_id -> access_methods[] - end alt OID not resolvable PP-->>Git: fail push (exit non-zero) else resolvable @@ -418,7 +413,7 @@ sequenceDiagram ## Summary -> `.git/drs/pre-commit` is a **local, LFS-only, non-authoritative cache** that tracks +> `.git/drs/pre-commit` is a **local, pointer-only, non-authoritative cache** that tracks > **path ↔ OID ↔ external URL hints** to support rename, undo, and offline workflows. > > `precommit_cache` provides safe, read-only access to this cache for enforcement at pre-push. @@ -426,5 +421,5 @@ sequenceDiagram If you want, I can also: * add **inline Go doc comments** suitable for `pkg.go.dev` -* generate a **sequence diagram** (commit → cache → push → DRS/DRS) +* generate a **sequence diagram** (commit → cache → push → DRS) * or write a **pre-push reference implementation** that uses these helpers end-to-end diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 01ff0f3b..c0b77c7b 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -1,247 +1,166 @@ # Troubleshooting -Common issues and solutions when working with Git DRS. +Common issues and solutions for the current `git-drs` CLI. -> **Navigation:** [Getting Started](getting-started.md) → [Commands Reference](commands.md) → **Troubleshooting** +> **Navigation:** [Getting Started](getting-started.md) -> [Commands Reference](commands.md) -> **Troubleshooting** ## Frequently Asked Questions ### Do I need to run `git drs init` each time? -**No.** `git drs init` is set up once per Git repo. +No. -**Run it once when:** +`git drs init` is repository setup. Run it once per local clone unless you are deliberately reinitializing the repo. 
-- You first clone a repository -- You create a new repository +Run it when: -**Don't run it again:** +- you first clone a repository and need local `git-drs` setup +- you create a new repository and want to enable `git-drs` -- At the start of each work session -- After refreshing credentials -- After pulling updates +Do not run it every session: -**What it does:** +- not at the start of normal daily work +- not after refreshing credentials +- not after `git pull` -- Sets up `.drs/` directory structure -- Configures Git LFS hooks -- Updates `.gitignore` +What it changes: -These changes persist in your local repository. For subsequent sessions, you only need to refresh credentials if they've expired (every 30 days). +- creates `.git/drs/` repository-local state +- sets up `git-drs` repository configuration and hooks +- prepares the repo for managed pointer/register/hydration behavior -### What to do if you run `git drs init` again +### What if I run `git drs init` again? -Running `git drs init` a second time is usually harmless but unnecessary. It may re-create the `\`.git/drs/\`` directory, re-install hooks, or modify `\`.gitattributes\`` and `\`.gitignore\``. If you ran it accidentally, follow these steps: +Usually nothing catastrophic, but it is unnecessary. -1. Inspect what changed - - `git status` - - `git diff` (or `git diff -- ` for a specific file, e.g. `\`.gitignore\``) +If you did it accidentally: -2. If changes are fine - - No action required; commit the intended changes or leave them uncommitted. +1. inspect what changed -3. If you want to discard uncommitted changes - - Restore specific files: `git restore --staged \`.gitignore\`` && `git restore \`.gitignore\`` - - Restore all working-tree changes: `git restore .` - - Or (destructive) reset everything: `git reset --hard` \- use with caution. - -4. 
If you already committed the unintended changes - - Undo the last commit but keep changes staged: `git reset --soft HEAD~1` - - Or remove the commit and working changes: `git reset --hard HEAD~1` \- use with caution. - - See the "Undo Last Commit" section above for alternatives. - -5. Hooks or credentials issues - - If hooks were replaced or credentials need refresh, run `git drs init` with the correct `--cred`/`--profile` options, or re-add the remote with `git drs remote add`. - -Summary: inspect with `git status`/`git diff`, then either accept, manually edit, or revert the changes using standard `git restore` / `git reset` commands. - - -## When to Use Which Tool - -Understanding when to use Git, Git LFS, or Git DRS commands: - -### Git DRS Commands - -**Use for**: Repository and remote configuration - -- `git drs init` - Initialize Git LFS hooks -- `git drs remote add` - Configure DRS server connections -- `git drs remote list` - View configured remotes -- `git drs add-url` - Add cloud object references - -**When**: - -- Setting up a new repository -- Adding/managing DRS remotes -- Refreshing expired credentials -- Adding external file references - -### Git LFS Commands - -**Use for**: File tracking and management - -- `git lfs track` - Define which files to track -- `git lfs ls-files` - See tracked files and status -- `git lfs pull` - Download specific files -- `git lfs untrack` - Stop tracking file patterns - -**When**: - -- Managing which files are stored externally -- Downloading specific files -- Checking file localization status + ```bash + git status + git diff + ``` -### Standard Git Commands +2. if the changes are harmless, leave them alone or commit what you intended -**Use for**: Version control operations +3. 
if you want to discard the uncommitted changes, use normal Git restore/reset flow carefully -- `git add` - Stage files for commit -- `git commit` - Create commits -- `git push` - Upload commits and trigger file uploads -- `git pull` - Get latest commits +4. if hooks or repo-local state were repaired intentionally, keep the changes -**When**: +The right default is: inspect first, then decide whether anything actually needs to be reverted. -- Normal development workflow -- Git DRS runs automatically in the background +### What does `git drs init` actually change? -## Common Error Messages +It prepares repository-local `git-drs` state: -## Git LFS-Oriented Troubleshooting Guide (Commit/Push/Clone/Pull) +- `.git/drs/` metadata/state +- hook/config wiring for `git-drs` workflows +- the repo-local setup needed for pointer/register/hydration behavior -The checks below prioritize Git LFS guidance and documentation because Git DRS relies on Git LFS for large-file handling. If you run into issues, start with the Git LFS troubleshooting docs and logs, then move to Git DRS-specific configuration checks. Primary references: the Git LFS troubleshooting guide and the Git LFS documentation for installation, tracking, and environment variables: +Those changes persist in the clone. They are not something you redo per session. -- Git LFS troubleshooting: https://github.com/git-lfs/git-lfs/wiki/Troubleshooting -- Git LFS docs: https://github.com/git-lfs/git-lfs/tree/main/docs +## When to Use Which Tool -### Failed Commit (Git LFS hooks or pointer issues) +### Use `git-drs` for -1. **Confirm Git LFS is installed and hooks are active** - - Run: `git lfs version` and `git lfs env` - - If `git lfs env` reports `git lfs install` is needed, run `git lfs install` to re-install hooks. - - This is the most common cause of commits failing to convert large files into LFS pointers. 
+- repository-local `git-drs` setup +- remote configuration +- tracking rules +- object hydration +- DRS/Syfon metadata-oriented workflows -2. **Check whether the file was tracked before the commit** - - Run: `git lfs track` and confirm the file pattern is listed. - - If not tracked, add it (`git lfs track "*.bam"`) and stage `.gitattributes`. +Examples: -3. **Verify the file is staged as an LFS pointer** - - Run: `git lfs ls-files` to confirm the file is listed. - - If a large file was added to Git history directly, remove it from the index and re-add it after tracking. +- `git drs init` +- `git drs remote add gen3 ...` +- `git drs track` +- `git drs ls-files` +- `git drs pull` +- `git drs add-url` +- `git drs copy-records` -4. **Review Git LFS logs for hook errors** - - Run: `git lfs logs last` to inspect hook failures. - - Common errors include missing filters or file locking issues. +### Use normal Git for -### Failed Push (LFS uploads, auth, or bandwidth issues) +- branch and commit movement +- staging and committing +- ordinary ref push/pull operations -1. **Check Git LFS authentication and endpoint configuration** - - Run: `git lfs env` and confirm `Endpoint` values are correct. - - If tokens are expired, refresh credentials and re-run the push. +Examples: -2. **Retry with LFS verbose logging** - - Run: `GIT_TRACE=1 GIT_CURL_VERBOSE=1 git lfs push --all` - - Use this output to identify `403/401` auth issues or proxy errors. +- `git add` +- `git commit` +- `git push` +- `git pull` -3. **Confirm the LFS objects exist locally** - - Run: `git lfs ls-files` and ensure your large files are listed. - - Missing objects indicate a tracking or filter issue before the push. +## First Principles -4. **Validate the remote supports Git LFS** - - Run: `git lfs env` to confirm the remote endpoint. - - Some Git servers require explicit LFS enablement or URL configuration. 
+Before debugging behavior, keep the command split straight: -### Failed Clone (LFS objects missing or blocked) +- `git pull` + - updates commits, branches, and checkout state +- `git drs pull` + - hydrates tracked pointer files already present in the current checkout +- `git drs ls-files` + - shows tracked files and localization state -1. **Confirm LFS objects were fetched** - - After clone, run: `git lfs pull` to fetch large files. - - If the repo only has LFS pointers, you will see pointer files until you pull. +If you blur those together, the failure modes get confusing. -2. **Check LFS smudge/clean filters** - - Run: `git lfs env` and verify `git-lfs` filters are enabled. - - If not, run `git lfs install` and re-run `git lfs pull`. +## Common Error Patterns -3. **Validate access and authentication** - - `git lfs env` will show which endpoint is used; 401/403 errors point to invalid credentials. +### Failed commit or pointer conversion issues -4. **Inspect LFS logs for download errors** - - Run: `git lfs logs last` for the most recent transfer errors. +Check these in order: -### Failed Pull (LFS fetch/checkout issues) +1. confirm the file pattern was tracked before the add/commit flow -1. **Run `git lfs pull` separately** - - This isolates LFS download errors from Git merge errors. + ```bash + git drs track + ``` -2. **Check LFS file locking or concurrent transfers** - - If your Git host uses LFS file locking, verify the file is not locked by another user. +2. confirm `.gitattributes` was staged after changing tracking rules -3. **Review filters and tracking** - - Run: `git lfs track` to ensure required patterns are present. - - If a file type is newly tracked, re-run `git add .gitattributes` and commit. + ```bash + git status + ``` -4. **Check for storage or bandwidth limits** - - Some Git LFS hosts enforce quotas; errors will show in `git lfs logs last`. +3. 
confirm the file shows up in the tracked inventory -### Authentication Errors + ```bash + git drs ls-files + ``` -**Error**: `Upload error: 403 Forbidden` or `401 Unauthorized` +4. inspect `.git/drs/` logs if the hook path failed -**Cause**: Expired or invalid credentials +### Failed push: upload, register, or auth -**Solution**: +Check: ```bash -# Download new credentials from your data commons -# Then refresh them by re-adding the remote -git drs remote add gen3 production \ - --cred /path/to/new-credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket +git drs remote list +git drs ls-files --drs ``` -**Prevention**: +Then retry with higher Git/HTTP verbosity if needed: -- Credentials expire after 30 days -- Set a reminder to refresh them regularly - ---- - -**Error**: `Upload error: 503 Service Unavailable` - -**Cause**: DRS server is temporarily unavailable or credentials expired - -**Solutions**: - -1. Wait and retry the operation -2. Refresh credentials: - ```bash - git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket - ``` -3. If persistent, download new credentials from the data commons - -### Network Errors - -**Error**: `net/http: TLS handshake timeout` - -**Cause**: Network connectivity issues +```bash +GIT_TRACE=1 GIT_CURL_VERBOSE=1 git push +``` -**Solution**: +### Failed clone or fresh checkout still has pointer files -- Simply retry the command -- These are usually temporary network issues +That usually just means hydration has not happened yet. 
---- +Run: -**Error**: Git push timeout during large file uploads +```bash +git drs init +git drs pull +``` -**Cause**: Long-running operations timing out +### Network timeout during push or download -**Solution**: Add to `~/.ssh/config`: +If you use SSH remotes, keepalives help: ``` Host github.com @@ -249,297 +168,195 @@ Host github.com ServerAliveInterval 30 ``` -### File Tracking Issues +## Common Problems -**Error**: Files not being tracked by LFS +### `git drs pull` did not update my branch -**Symptoms**: +That is expected. -- Large files committed directly to Git -- `git lfs ls-files` doesn't show your files +`git drs pull` no longer runs `git pull`. -**Solution**: +Use: ```bash -# Check what's currently tracked -git lfs track - -# Track your file type -git lfs track "*.bam" -git add .gitattributes - -# Remove from Git and re-add -git rm --cached large-file.bam -git add large-file.bam -git commit -m "Track large file with LFS" +git pull +git drs pull ``` ---- +### `git drs ls-files` does not show my file -**Error**: `[404] Object does not exist on the server` +Check these in order: -**Symptoms**: - -- After clone, git pull fails - -**Solution**: +1. is the path actually tracked? ```bash -# confirm repo has complete configuration -git drs list-config - -# init your git drs project -git drs init --cred /path/to/cred/file --profile - -# attempt git pull again -git lfs pull -I path/to/file +git drs track ``` ---- - -**Error**: `git lfs ls-files` shows files but they won't download - -**Cause**: Files may not have been properly uploaded or DRS records missing - -**Solution**: +2. did you stage `.gitattributes` after adding the pattern? 
```bash -# Check repository status -git drs list-config - -# Try pulling with verbose output -git lfs pull -I "problematic-file*" --verbose - -# Check logs -cat .git/drs/*.log +git add .gitattributes ``` -### Configuration Issues - -**Error**: `git drs remote list` shows empty or incomplete configuration - -**Cause**: Repository not properly initialized or no remotes configured - -**Solution**: +3. is the file part of the current checkout? ```bash -# Initialize repository if needed -git drs init +git ls-files -- path/to/file +``` -# Add Gen3 remote -git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket +4. inspect the local view: -# Verify configuration -git drs remote list +```bash +git drs ls-files -l ``` ---- +### `git drs pull` does nothing -**Error**: Configuration exists but commands fail +That usually means one of these: -**Cause**: Mismatched configuration between global and local settings, or expired credentials +- the current checkout already has localized bytes +- there are no tracked pointer files matching your include filters +- the file is not tracked by `git-drs` -**Solution**: +Check: ```bash -# Check configuration -git drs remote list - -# Refresh credentials by re-adding the remote -git drs remote add gen3 production \ - --cred /path/to/new-credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket +git drs ls-files +git drs ls-files -I "*.bam" +git drs pull --dry-run -I "*.bam" ``` -### Remote Configuration Issues - -**Error**: `no default remote configured` +### `git drs pull` still leaves pointer files -**Cause**: Repository initialized but no remotes added yet - -**Solution**: +Check DRS registration status: ```bash -# Add your first remote (automatically becomes default) -git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - 
--project my-project \ - --bucket my-bucket +git drs ls-files --drs ``` ---- - -**Error**: `default remote 'X' not found` +If the object is not registered or not resolvable from the configured remote, hydration cannot succeed. -**Cause**: Default remote was deleted or configuration is corrupted - -**Solution**: +Also confirm the remote configuration: ```bash -# List available remotes git drs remote list - -# Set a different remote as default -git drs remote set staging - -# Or add a new remote -git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket ``` ---- +If needed, inspect the detailed logs: -**Error**: Commands using wrong remote +```bash +ls -la .git/drs/ +``` -**Cause**: Default remote is not the one you want to use +### `git drs remote add gen3` fails on bucket mapping -**Solution**: +Current shape: ```bash -# Check current default -git drs remote list - -# Option 1: Change default remote -git drs remote set production - -# Option 2: Specify remote for single command -git drs push staging -git drs fetch production +git drs remote add gen3 [remote-name] <organization/project> [--cred <file> | --token <token>] ``` -## Undoing Changes - -### Untrack LFS Files +If this fails, the likely cause is missing bucket mapping for that scope. -If you accidentally tracked the wrong files: +That mapping is usually steward/admin setup, not something the end user invents ad hoc. 
-```bash -# See current tracking -git lfs track +### My credentials expired -# Remove incorrect pattern -git lfs untrack "wrong-dir/**" +Refresh by re-adding the remote with a new credential file or token: -# Add correct pattern -git lfs track "correct-dir/**" - -# Stage the changes -git add .gitattributes -git commit -m "Fix LFS tracking patterns" +```bash +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json ``` -### Undo Git Add +### `git push` fails with upload or register errors -Remove files from staging area: +Check: ```bash -# Check what's staged -git status - -# Unstage specific files -git restore --staged file1.bam file2.bam - -# Unstage all files -git restore --staged . +git drs remote list +git drs ls-files --drs ``` -### Undo Last Commit +Typical root causes: -To retry a commit with different files: +- expired credentials +- wrong remote selected +- missing server-side bucket mapping +- object registration or upload permissions missing for the target scope -```bash -# Undo last commit, keep files in working directory -git reset --soft HEAD~1 +### Files are not being tracked -# Or undo and unstage files -git reset HEAD~1 +Symptoms: -# Or completely undo commit and changes (BE CAREFUL!) -git reset --hard HEAD~1 -``` - -### Remove Files from LFS History +- large files were committed directly to Git +- `git drs ls-files` does not show the file -If you committed large files directly to Git by mistake: +Recovery: ```bash -# Remove from Git history (use carefully!) 
-git filter-branch --tree-filter 'rm -f large-file.dat' HEAD - -# Then track properly with LFS -git lfs track "*.dat" +git drs track "*.bam" git add .gitattributes -git add large-file.dat -git commit -m "Track large file with LFS" +git rm --cached large-file.bam +git add large-file.bam +git commit -m "Track large file with git-drs" ``` -## Diagnostic Commands - -### Check System Status +### Cloned repo only has pointer files -```bash -# Git DRS version and help -git-drs version -git-drs --help +That is normal. -# Configuration -git drs remote list +After cloning: -# Repository status -git status -git lfs ls-files +```bash +git drs init +git drs pull ``` -### View Logs +Or hydrate only what you need: ```bash -# Git DRS logs (in repository) -ls -la .git/drs/ -cat .git/drs/*.log +git drs pull -I "*.bam" ``` -### Test Connectivity +## Debugging Workflow -```bash -# Test basic Git operations -git lfs pull --dry-run +When behavior is unclear, use this sequence: -# Test DRS configuration +```bash git drs remote list +git drs track +git drs ls-files -l +git drs ls-files --drs +git drs pull --dry-run ``` -## Getting Help +That usually tells you whether the problem is: -### Log Analysis +- tracking +- hydration state +- DRS registration +- remote configuration -When reporting issues, include: +## Log and State Inspection -```bash -# System information -git-drs version -git lfs version -git --version +Useful checks: -# Configuration +```bash git drs remote list - -# Recent logs -tail -50 .git/drs/*.log +git drs track +git drs ls-files -l +git drs ls-files --drs +ls -la .git/drs/ ``` -## Prevention Best Practices +## Removed Commands + +If you see old notes mentioning these, ignore them: + +- `git drs fetch` +- `git drs list` +- `git drs upload` +- `git drs download` -1. **Test in small batches** - Don't commit hundreds of files at once -2. **Verify tracking** - Always check `git lfs ls-files` after adding files -3. 
**Use .gitignore** - Prevent accidental commits of temporary files -4. **Monitor repository size** - Keep an eye on `.git` directory size +Those were removed from the cleaned CLI surface.