diff --git a/README.md b/README.md index 59e323a..6b82da0 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ public class MyTranslator implements BlobTranslator { - `-d`, `--duplicate`: Duplicate the source repository and overwrites it. **Requires `-o`**. - `--clean`: Delete the target repository before applying the transformation if it exists. **Requires `-o`**. - `--bare`: Treat that the specified repositories are bare. -- `-j`, `--jobs=<nthreads>`: Rewrites trees in parallel using `<nthreads>` threads. If the number of threads is omitted (just `-j` is given), _total number of processors - 1_ is used. +- `-j`, `--jobs=<nthreads>`: Rewrites trees in parallel using `<nthreads>` threads (see [Parallel Rewriting](#parallel-rewriting)). If the number of threads is omitted (just `-j` is given), the number of available processors is used. - `-n`, `--dry-run`: Do not actually modify the target repository. - `--stream-size-limit=<num>{,K,M,G}`: increase the stream size limit. - `--no-notes`: Stop noting the source commit ID to the commits in the target repository (see [Notes](#notes)). @@ -261,6 +261,21 @@ A no-op rewriter that copies all objects without transformation. Useful for verifying that the rewriting pipeline preserves repository content. +## Parallel Rewriting + +With `-j`, git-stein rewrites trees in parallel using multiple threads. +The rewriting is done in two passes over the commit history: + +1. **Tree rewriting pass** (parallel): all root trees are rewritten in parallel using a `ForkJoinPool`. +The commit list is split into contiguous chunks, and each chunk is processed by a worker thread. +Consecutive commits within a chunk share many tree entries, so the entry mapping cache is effective within each chunk. +2. **Commit writing pass** (sequential): commits are written in topological order. +Since each commit depends on its parent's ID, this pass must be sequential. +The tree rewriting results are looked up from the first pass. 
+ +The number of threads can be specified explicitly (e.g., `-j4`) or left to default (`-j` alone uses all available processors). + + ## Chaining Commands Multiple commands can be listed on a single command line. diff --git a/src/main/java/jp/ac/titech/c/se/stein/Application.java b/src/main/java/jp/ac/titech/c/se/stein/Application.java index 7ff8e1e..75fc33b 100644 --- a/src/main/java/jp/ac/titech/c/se/stein/Application.java +++ b/src/main/java/jp/ac/titech/c/se/stein/Application.java @@ -77,8 +77,7 @@ public static class OutputOptions { void setNumberOfThreads(final int nthreads) { this.nthreads = nthreads; if (nthreads == 0) { - final int nprocs = Runtime.getRuntime().availableProcessors(); - this.nthreads = nprocs > 1 ? nprocs - 1 : 1; + this.nthreads = Runtime.getRuntime().availableProcessors(); } } public int nthreads = 1; diff --git a/src/main/java/jp/ac/titech/c/se/stein/rewriter/RepositoryRewriter.java b/src/main/java/jp/ac/titech/c/se/stein/rewriter/RepositoryRewriter.java index 1bd6e72..fc58eb0 100644 --- a/src/main/java/jp/ac/titech/c/se/stein/rewriter/RepositoryRewriter.java +++ b/src/main/java/jp/ac/titech/c/se/stein/rewriter/RepositoryRewriter.java @@ -68,7 +68,7 @@ private static Map createEntryMapping(long memoryBudget) { /** * Root tree-to-tree mapping. */ - protected Map rootTreeMapping = new HashMap<>(); + protected Map rootTreeMapping = new ConcurrentHashMap<>(); /** * Commit-to-commit mapping. @@ -156,10 +156,8 @@ public void initialize(final Repository sourceRepo, final Repository targetRepo) public void rewrite(final Context c) { setUp(c); - try { - final RevWalk walk = prepareRevisionWalk(c); + try (final RevWalk walk = prepareRevisionWalk(c)) { if (config.nthreads >= 2) { - log.debug("Parallel rewriting"); rewriteRootTrees(walk, c); Try.io(walk::memoReset); } @@ -188,6 +186,7 @@ public void rewrite(final Context c) { blobHit, blobTotal, String.format("%.1f", blobTotal > 0 ? 
blobHit * 100.0 / blobTotal : 0), treeHit, treeTotal, String.format("%.1f", treeTotal > 0 ? treeHit * 100.0 / treeTotal : 0), hits, total, String.format("%.1f", hits * 100.0 / total)); + log.info("Entry mapping size: {}, root tree mapping size: {}", entryMapping.size(), rootTreeMapping.size()); } if (entryCache != null) { entryCache.close(); @@ -219,18 +218,19 @@ protected void rewriteCommits(final RevWalk walk, final Context c) { protected void rewriteRootTrees(final RevWalk walk, final Context c) { final Map cxts = new ConcurrentHashMap<>(); + // Count commits without closing the walk long count = 0; - try (walk) { - for (final RevCommit commit : walk) { - count++; - } + for (final RevCommit ignored : walk) { + count++; } Try.io(walk::memoReset); + log.info("Parallel rewriting: {} commits with {} threads", count, config.nthreads); - try (walk) { + final ForkJoinPool pool = new ForkJoinPool(config.nthreads); + try { final int characteristics = Spliterator.DISTINCT | Spliterator.IMMUTABLE | Spliterator.NONNULL | Spliterator.SIZED; final Spliterator split = Spliterators.spliterator(walk.iterator(), count, characteristics); - new ForkJoinPool(config.nthreads).submit(() -> { + pool.submit(() -> { final Stream stream = StreamSupport.stream(split, true); stream.forEach(commit -> { final long id = Thread.currentThread().getId(); @@ -239,6 +239,9 @@ protected void rewriteRootTrees(final RevWalk walk, final Context c) { rewriteRootTree(commit.getTree().getId(), uuc); }); }).join(); + } finally { + log.debug("Pool stats: steal={}, threads={}", pool.getStealCount(), cxts.size()); + pool.shutdown(); } // finalize