diff --git a/.gitignore b/.gitignore
index ed65bc2..3768c93 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
/gradle.properties
/.settings/
/.idea/
+__*
*~
.DS_Store
diff --git a/README.md b/README.md
index 96a2869..59e323a 100644
--- a/README.md
+++ b/README.md
@@ -34,11 +34,7 @@ $ git stein [options...] # When subcommand available
## Recipes
-### Chaining commands
-
-Multiple commands can be listed on the command line.
-They are applied sequentially; intermediate repositories are created under `.git/.git-stein.N` in the target directory and cleaned up automatically.
-As an optimization, consecutive blob translators are composed into a single pass.
+### Splitting and converting to cregit
Split Java files into method-level modules, then convert each to cregit format:
```
@@ -72,18 +68,6 @@ $ git stein path/to/repo -o path/to/out \
@convert --endpoint=http://localhost:8080/convert --pattern='*.java'
```
-### Tracking original commit IDs
-
-When git-stein rewrites a repository, it records the original commit ID in Git notes (enabled by default).
-`@note-commit` reads these notes and prepends the original commit ID to each commit message.
-
-A typical workflow is to first transform, then apply `@note-commit`:
-```
-$ git stein path/to/repo -o path/to/out @historage-jdt @note-commit
-```
-After this, each commit message in `step2` starts with the original commit ID from `repo`.
-This works even after multiple transformations — the notes trace back to the original.
-
### Writing a custom blob translator
Implement the `BlobTranslator` interface to define your own transformation.
@@ -119,12 +103,13 @@ public class MyTranslator implements BlobTranslator {
- `-j`, `--jobs=`: Rewrites trees in parallel using `` threads. If the number of threads is omitted (just `-j` is given), _total number of processors - 1_ is used.
- `-n`, `--dry-run`: Do not actually modify the target repository.
- `--stream-size-limit={,K,M,G}`: increase the stream size limit.
-- `--no-notes`: Stop noting the source commit ID to the commits in the target repository.
+- `--no-notes`: Stop noting the source commit ID to the commits in the target repository (see [Notes](#notes)).
- `--no-pack`: Stop packing objects after transformation finished.
- `--alternates`: Share source objects via Git alternates to skip writing unchanged objects, which speeds up transformations where many objects are unchanged. The target repository will depend on the source's object store until repacked.
-- `--no-composite`: Stop composing multiple blob translators.
+- `--no-composite`: Stop composing multiple blob translators (see [Chaining Commands](#chaining-commands)).
- `--extra-attributes`: Allow opportunity to rewrite the encoding and the signature fields in commits.
-- `--cache=,...`: Specify the object types for caching (`commit`, `blob`, `tree`. See [Incremental transformation](#incremental-transformation) for the details). Default: none. `commit` is recommended.
+- `--cache`: Enable persistent entry caching (see [Caching](#caching)).
+- `--mapping-mem=<num>{,K,M,G}`: Max memory for entry mapping cache. Default: 25% of max heap (see [Caching](#caching)).
- `--cmdpath=:...`: Add packages for search for commands.
- `--log=`: Specify log level (default: `INFO`).
- `-q`, `--quiet`: Quiet mode (same as `--log=ERROR`).
@@ -143,19 +128,10 @@ The git-stein supports three rewriting modes.
- _duplicate_ mode (` -o -d`): given a source repository and a path for the target repository, copying the source repository into the given path and applying overwrite mode to the target repository.
-## Incremental Transformation
-In case the source repository to be transformed has been evolving, git-stein can transform only newly added objects.
-With the option `--cache=`, an SQLite3 cache file "cache.db" will be stored in the `.git` directory of the destination repository.
-This file records the correspondence between objects before and after transformation, according to the specified option.
-Correspondences between commits (`--cache=commit`), between trees (`--cache=tree`), and between files (`--cache=blob`) are stored.
-This cache can save the re-transformation of remaining objects during the second and subsequent transformation trials.
-
-
## Bundle Apps
### Blob Translators
_Blob translators_ provide a blob-to-blob(s) translations.
-Multiple blob translators can be composed and applied in a single pass.
#### @historage
Generates a [Historage](https://github.com/hideakihata/git2historage)-like repository using [Universal Ctags](https://ctags.io/).
@@ -285,6 +261,85 @@ A no-op rewriter that copies all objects without transformation.
Useful for verifying that the rewriting pipeline preserves repository content.
+## Chaining Commands
+
+Multiple commands can be listed on a single command line.
+They are applied sequentially as separate transformation steps.
+For example, with three commands `@A @B @C`:
+```
+source → target/.git/.git-stein.1 → target/.git/.git-stein.2 → target
+ (@A) (@B) (@C)
+```
+Intermediate repositories (`.git-stein.N`) are bare repositories created under the target's `.git` directory.
+
+As an optimization, consecutive blob translators are composed into a single pass rather than creating intermediate repositories for each one.
+This behavior can be disabled with `--no-composite`.
+For example, the following runs `@historage-jdt` and `@cregit` as a single composed blob translator, then `@note-commit` as a separate commit translator step:
+```
+$ git stein path/to/repo -o path/to/out \
+ @historage-jdt --no-original --no-classes \
+ @cregit --pattern='*.cjava' --ignore-case \
+ @note-commit
+```
+
+
+## Notes
+
+git-stein records the original commit ID as a git note on each target commit (enabled by default).
+Each note stores the source commit ID as a 40-character hex string.
+This provides the standard way to trace a target commit back to its source, and is visible in `git log` without any extra options (via `refs/notes/commits`).
+Notes are also used for [Incremental Transformation](#incremental-transformation) to skip already-processed commits on subsequent runs.
+
+`@note-commit` reads the note on each commit and embeds the original commit ID into the commit message.
+Place it at the end of the command list:
+```
+$ git stein path/to/repo -o path/to/out @historage-jdt @note-commit
+```
+
+git-stein uses three notes refs:
+`refs/notes/git-stein-prev` stores the immediate source commit ID (i.e., the commit in the input repository of this transformation step),
+`refs/notes/git-stein-orig` stores the original source commit ID (traces back through chained transformations to the very first source),
+and `refs/notes/commits` points to the same object as `git-stein-orig` (visible in `git log` by default).
+For a single transformation, all three refs point to the same notes object.
+In a chained transformation (see [Chaining Commands](#chaining-commands)), `git-stein-prev` and `git-stein-orig` may differ.
+For example, in `.git-stein.2`, `git-stein-prev` points to the commit in `.git-stein.1`, while `git-stein-orig` points to the commit in the original source.
+
+If `--no-notes` is used, no notes are written, and incremental transformation will not be available on subsequent runs.
+The target will be fully rewritten each time.
+
+
+## Incremental Transformation
+
+git-stein supports incremental transformation:
+when the target repository already contains results from a previous run, only new commits are processed.
+
+On subsequent runs, git-stein reads the notes from the target repository to reconstruct the commit mapping and skips already-processed commits.
+
+New commits still need to be transformed.
+To speed up the transformation of these new commits by reusing previously computed entry mappings, use `--cache` (see [Persistent cache](#persistent-cache-cache)).
+
+
+## Caching
+
+git-stein uses two levels of caching to avoid redundant work:
+an in-memory cache for the current run and an optional persistent cache for repeated runs.
+
+### In-memory cache
+
+During a single run, git-stein keeps an in-memory entry mapping (source entry → transformed entry) backed by a Guava Cache with LRU eviction.
+This avoids re-transforming identical entries within the same execution.
+The memory budget is controlled by `--mapping-mem` (default: 25% of max heap).
+
+### Persistent cache (`--cache`)
+
+When `--cache` is enabled, the entry mapping is stored in an MVStore (H2) file (`cache.mv.db`) in the target repository's `.git` directory.
+This persists entry mappings across runs, so entries that were already transformed in a previous run can be reused without re-computation.
+The `--mapping-mem` option also controls the MVStore page cache and write buffer sizes.
+
+`--cache` and the in-memory cache are mutually exclusive:
+when `--cache` is enabled, MVStore replaces the in-memory Guava Cache entirely.
+
+
## Publications
The following article includes the details of the incremental transformation (and a brief introduction to git-stein).
Those who have used git-stein in their academic work may be encouraged to cite the following in their work:
diff --git a/build.gradle b/build.gradle
index 49be582..2dcae7a 100644
--- a/build.gradle
+++ b/build.gradle
@@ -4,6 +4,12 @@ plugins {
id 'maven-publish'
id 'com.gradleup.shadow' version '9.4.0'
id 'com.github.ben-manes.versions' version '0.53.0'
+ id 'checkstyle'
+}
+
+checkstyle {
+ toolVersion = '10.21.4'
+ configFile = file("${rootDir}/config/checkstyle/checkstyle.xml")
}
repositories {
@@ -37,8 +43,7 @@ dependencies {
implementation 'org.jgrapht:jgrapht-core:1.5.2'
implementation 'org.jgrapht:jgrapht-io:1.5.2'
- implementation 'org.xerial:sqlite-jdbc:3.51.3.0'
- implementation 'com.j256.ormlite:ormlite-jdbc:5.7'
+ implementation 'com.h2database:h2-mvstore:2.3.232'
testImplementation 'org.junit.jupiter:junit-jupiter:5.14.3'
testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
@@ -75,7 +80,6 @@ publishing {
shadowJar {
minimize {
- exclude(dependency('org.xerial:sqlite-jdbc:.*'))
exclude(dependency('ch.qos.logback:logback-classic:.*'))
}
}
@@ -87,10 +91,23 @@ tasks.register('benchmark', JavaExec) {
def benchArgs = project.hasProperty('benchRepo') ? [project.property('benchRepo')] : ['.']
if (project.hasProperty('alternates')) benchArgs.add('--alternates')
+ if (project.hasProperty('cache')) benchArgs.add('--cache')
args = benchArgs
jvmArgs = ['-Xmx1g']
}
+tasks.register('memoryProfile', JavaExec) {
+ dependsOn 'testClasses'
+ classpath = sourceSets.test.runtimeClasspath
+ mainClass = 'jp.ac.titech.c.se.stein.testing.MemoryProfile'
+
+ def profArgs = project.hasProperty('benchRepo') ? [project.property('benchRepo')] : ['.']
+ if (project.hasProperty('command')) profArgs.add(project.property('command'))
+ args = profArgs
+ def heap = project.hasProperty('heap') ? project.property('heap') : '4g'
+ jvmArgs = ["-Xmx${heap}", '-XX:+UseSerialGC', '-XX:+CrashOnOutOfMemoryError']
+}
+
tasks.register('executableJar') {
dependsOn 'shadowJar'
// cf. https://ujun.hatenablog.com/entry/2017/09/22/010209
diff --git a/config/checkstyle/checkstyle.xml b/config/checkstyle/checkstyle.xml
new file mode 100644
index 0000000..b996137
--- /dev/null
+++ b/config/checkstyle/checkstyle.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/scripts/bench-incremental.sh b/scripts/bench-incremental.sh
new file mode 100755
index 0000000..035830b
--- /dev/null
+++ b/scripts/bench-incremental.sh
@@ -0,0 +1,107 @@
+#!/bin/sh
+# Run incremental transformation benchmarks.
+# Usage: ./bench-incremental.sh <jar> <work-dir> <command> [cache-opts...]
+#
+# Example:
+# ./bench-incremental.sh ./build/libs/git-stein-all.jar ./work @historage-jdt
+# ./bench-incremental.sh ./build/libs/git-stein-all.jar ./work @historage-jdt --cache commit,blob
+#
+# Runs two experiments:
+# A) Incremental over splits (1 -> 2 -> ... -> N)
+# B) Independent deltas from base (base+10, base+20, ...)
+set -eu
+
+JAR="${1:?Usage: bench-incremental.sh <jar> <work-dir> <command> [cache-opts...]}"
+WORK_DIR="${2:?}"
+COMMAND="${3:?}"
+shift 3
+CACHE_OPTS="$*"
+
+RESULTS_DIR="$WORK_DIR/results"
+mkdir -p "$RESULTS_DIR"
+TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+LABEL=$(echo "$CACHE_OPTS" | tr ' ' '_')
+[ -z "$LABEL" ] && LABEL="none"
+
+TIME=/usr/bin/time
+
+run_stein() {
+ java -Xmx1g -jar "$JAR" --bare --log=WARN $CACHE_OPTS -o "$2" "$1" "$COMMAND"
+}
+
+# Capture wall-clock seconds from "time -p"
+time_run_stein() {
+ $TIME -p sh -c "run_stein='java -Xmx1g -jar $JAR --bare --log=WARN $CACHE_OPTS -o $2 $1 $COMMAND'; eval \"\$run_stein\"" 2>&1 | grep '^real ' | awk '{print $2}'
+}
+
+# ============================================================
+# Experiment A: incremental over splits
+# ============================================================
+echo "=== Experiment A: Incremental splits (cache: ${CACHE_OPTS:-none}) ==="
+RESULT_A="$RESULTS_DIR/${TIMESTAMP}_splits_${LABEL}.csv"
+echo "step,commits,time_seconds" > "$RESULT_A"
+
+SPLITS_DIR="$WORK_DIR/splits"
+DEST_A="$WORK_DIR/dest_splits_${LABEL}"
+rm -rf "$DEST_A"
+
+SPLITS=$(ls -1d "$SPLITS_DIR"/[0-9]* 2>/dev/null | wc -l | tr -d ' ')
+
+for i in $(seq 1 "$SPLITS"); do
+ SOURCE="$SPLITS_DIR/$i"
+ [ -d "$SOURCE" ] || continue
+ NCOMMITS=$(git -C "$SOURCE" rev-list --all 2>/dev/null | wc -l | tr -d ' ')
+ printf " Split %d/%d (%d commits) ... " "$i" "$SPLITS" "$NCOMMITS"
+
+ ELAPSED=$(time_run_stein "$SOURCE" "$DEST_A")
+
+ echo "${ELAPSED}s"
+ echo "$i,$NCOMMITS,$ELAPSED" >> "$RESULT_A"
+done
+echo "Results: $RESULT_A"
+rm -rf "$DEST_A"
+
+# ============================================================
+# Experiment B: independent deltas from base
+# ============================================================
+echo ""
+echo "=== Experiment B: Deltas from base (cache: ${CACHE_OPTS:-none}) ==="
+RESULT_B="$RESULTS_DIR/${TIMESTAMP}_deltas_${LABEL}.csv"
+echo "delta,commits,time_seconds" > "$RESULT_B"
+
+DELTAS_DIR="$WORK_DIR/deltas"
+BASE_SOURCE="$DELTAS_DIR/base"
+
+# First, create the base destination
+DEST_BASE="$WORK_DIR/dest_deltas_base_${LABEL}"
+rm -rf "$DEST_BASE"
+printf " Building base ... "
+BASE_TIME=$(time_run_stein "$BASE_SOURCE" "$DEST_BASE")
+BASE_COMMITS=$(git -C "$BASE_SOURCE" rev-list --all 2>/dev/null | wc -l | tr -d ' ')
+echo "$BASE_COMMITS commits, ${BASE_TIME}s"
+echo "0,$BASE_COMMITS,$BASE_TIME" >> "$RESULT_B"
+
+# Run deltas independently (cp base, then incremental transform)
+DELTAS=$(ls -1d "$DELTAS_DIR"/[0-9]* 2>/dev/null | sort -n | while read d; do basename "$d"; done)
+
+for i in $DELTAS; do
+ DELTA_SOURCE="$DELTAS_DIR/$i"
+ [ -d "$DELTA_SOURCE" ] || continue
+ NCOMMITS=$(git -C "$DELTA_SOURCE" rev-list --all 2>/dev/null | wc -l | tr -d ' ')
+ DIFF=$(( NCOMMITS - BASE_COMMITS ))
+ printf " Delta %s (+%d commits, total %d) ... " "$i" "$DIFF" "$NCOMMITS"
+
+ DEST_DELTA="$WORK_DIR/dest_deltas_${LABEL}_${i}"
+ cp -r "$DEST_BASE" "$DEST_DELTA"
+
+ ELAPSED=$(time_run_stein "$DELTA_SOURCE" "$DEST_DELTA")
+
+ echo "${ELAPSED}s"
+ echo "$i,$NCOMMITS,$ELAPSED" >> "$RESULT_B"
+ rm -rf "$DEST_DELTA"
+done
+echo "Results: $RESULT_B"
+rm -rf "$DEST_BASE"
+
+echo ""
+echo "Done."
diff --git a/scripts/setup-incremental.sh b/scripts/setup-incremental.sh
new file mode 100755
index 0000000..cad608f
--- /dev/null
+++ b/scripts/setup-incremental.sh
@@ -0,0 +1,91 @@
+#!/bin/sh
+# Setup: clone target repo and generate sub-repositories by truncating at commit boundaries.
+# Usage: ./setup-incremental.sh <repo-url> <work-dir> [splits] [delta-base-frac] [delta-step] [delta-count]
+#
+# Example:
+# ./setup-incremental.sh https://github.com/google/gson.git ./work 10 0.5 10 10
+#
+# This creates:
+# work/source.git -- bare clone of the repo
+# work/splits/1 .. N -- repos truncated at 1/N, 2/N, ..., (N-1)/N of commits
+# work/deltas/base -- repo at delta-base-frac of total commits
+# work/deltas/1 .. M -- repos at base + delta-step*1, base + delta-step*2, ...
+set -eu
+
+REPO_URL="${1:?Usage: setup-incremental.sh <repo-url> <work-dir> [splits] [delta-base-frac] [delta-step] [delta-count]}"
+WORK_DIR="${2:?}"
+SPLITS="${3:-10}"
+DELTA_BASE_FRAC="${4:-0.5}"
+DELTA_STEP="${5:-10}"
+DELTA_COUNT="${6:-10}"
+
+mkdir -p "$WORK_DIR"
+
+# Clone source
+SOURCE="$WORK_DIR/source.git"
+if [ ! -d "$SOURCE" ]; then
+ echo "Cloning $REPO_URL ..."
+ git clone --bare "$REPO_URL" "$SOURCE"
+fi
+
+# Get first-parent commit list (oldest first)
+COMMITS_FILE="$WORK_DIR/commits.txt"
+git -C "$SOURCE" rev-list --first-parent HEAD | sed '1!G;h;$!d' > "$COMMITS_FILE"
+TOTAL=$(wc -l < "$COMMITS_FILE" | tr -d ' ')
+echo "Total first-parent commits: $TOTAL"
+
+# Helper: create a repo truncated at commit N
+create_truncated() {
+ n="$1"
+ dest="$2"
+ sha=$(sed -n "${n}p" "$COMMITS_FILE")
+
+ if [ -d "$dest" ]; then
+ echo " $dest already exists, skipping"
+ return
+ fi
+
+ git clone --bare --no-tags "$SOURCE" "$dest" 2>/dev/null
+ git -C "$dest" update-ref refs/heads/main "$sha"
+ # Remove all other refs
+ git -C "$dest" for-each-ref --format='%(refname)' | grep -v '^refs/heads/main$' | while read ref; do
+ git -C "$dest" update-ref -d "$ref" 2>/dev/null || true
+ done
+ git -C "$dest" gc --prune=now --quiet 2>/dev/null || true
+}
+
+# Experiment A: splits
+echo ""
+echo "=== Creating $SPLITS splits ==="
+mkdir -p "$WORK_DIR/splits"
+STEP=$(( TOTAL / SPLITS ))
+for i in $(seq 1 $(( SPLITS - 1 ))); do
+ N=$(( STEP * i ))
+ echo "Split $i/$SPLITS: $N commits"
+ create_truncated "$N" "$WORK_DIR/splits/$i"
+done
+# Last split = full repo
+if [ ! -d "$WORK_DIR/splits/$SPLITS" ]; then
+ cp -r "$SOURCE" "$WORK_DIR/splits/$SPLITS"
+fi
+
+# Experiment B: deltas
+echo ""
+echo "=== Creating delta repos (base + step*N) ==="
+mkdir -p "$WORK_DIR/deltas"
+BASE_N=$(python3 -c "print(int($TOTAL * $DELTA_BASE_FRAC))")
+echo "Base: $BASE_N commits"
+create_truncated "$BASE_N" "$WORK_DIR/deltas/base"
+
+for i in $(seq 1 "$DELTA_COUNT"); do
+ N=$(( BASE_N + DELTA_STEP * i ))
+ if [ "$N" -gt "$TOTAL" ]; then
+ echo "Delta $i: $N exceeds total ($TOTAL), stopping"
+ break
+ fi
+ echo "Delta $i: $N commits (+$(( DELTA_STEP * i )))"
+ create_truncated "$N" "$WORK_DIR/deltas/$i"
+done
+
+echo ""
+echo "Setup complete: $WORK_DIR"
diff --git a/src/main/java/jp/ac/titech/c/se/stein/Application.java b/src/main/java/jp/ac/titech/c/se/stein/Application.java
index bf956b2..7ff8e1e 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/Application.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/Application.java
@@ -5,17 +5,16 @@
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
-import java.util.EnumSet;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.stream.Collectors;
-import jp.ac.titech.c.se.stein.app.blob.FilterBlob;
import jp.ac.titech.c.se.stein.rewriter.BlobTranslator;
import jp.ac.titech.c.se.stein.app.Identity;
import jp.ac.titech.c.se.stein.rewriter.RewriterCommand;
import jp.ac.titech.c.se.stein.util.SettableHelpCommand;
import jp.ac.titech.c.se.stein.util.Loader;
+import jp.ac.titech.c.se.stein.util.SizeConverter;
import org.apache.commons.io.FileUtils;
import org.eclipse.jgit.internal.storage.file.FileRepository;
import org.eclipse.jgit.lib.Constants;
@@ -102,15 +101,19 @@ public enum AlternatesMode { relative, absolute }
fallbackValue = "relative", order = MIDDLE, arity = "0..1")
public AlternatesMode alternatesMode;
- @Option(names = "--cache", split = ",", paramLabel = "", description = "cache level (${COMPLETION-CANDIDATES}. default: none)", order = MIDDLE)
- public EnumSet cacheLevel = EnumSet.noneOf(RepositoryRewriter.CacheLevel.class);
+ @Option(names = "--cache", description = "enable persistent entry caching", order = MIDDLE)
+ public boolean isCachingEnabled = false;
+
+ @Option(names = "--mapping-mem", paramLabel = "<num>{,K,M,G}", description = "max memory for entry mapping (default: 25%% of max heap)", order = MIDDLE,
+ converter = SizeConverter.class)
+ public long entryMappingMemory = -1;
@Option(names = "--extra-attributes", description = "rewrite encoding and signature in commits", order = MIDDLE)
public boolean isRewritingExtraAttributes = false;
@SuppressWarnings("unused")
@Option(names = "--stream-size-limit", paramLabel = "{,K,M,G}", description = "increase stream size limit", order = MIDDLE,
- converter = FilterBlob.SizeConverter.class)
+ converter = SizeConverter.class)
void setSizeLimit(final long limit) {
// default: 50MB is too small
final int intLimit = (int) Math.min(limit, Integer.MAX_VALUE);
diff --git a/src/main/java/jp/ac/titech/c/se/stein/PorcelainAPI.java b/src/main/java/jp/ac/titech/c/se/stein/PorcelainAPI.java
index 959b4b6..f9d3863 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/PorcelainAPI.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/PorcelainAPI.java
@@ -5,7 +5,6 @@
import org.eclipse.jgit.api.ResetCommand.ResetType;
import org.eclipse.jgit.api.errors.*;
import org.eclipse.jgit.internal.storage.file.FileRepository;
-import org.eclipse.jgit.internal.storage.file.GC;
import jp.ac.titech.c.se.stein.core.Try;
diff --git a/src/main/java/jp/ac/titech/c/se/stein/app/blob/FilterBlob.java b/src/main/java/jp/ac/titech/c/se/stein/app/blob/FilterBlob.java
index 17cad4b..04c0d07 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/app/blob/FilterBlob.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/app/blob/FilterBlob.java
@@ -4,6 +4,7 @@
import jp.ac.titech.c.se.stein.entry.BlobEntry;
import jp.ac.titech.c.se.stein.rewriter.BlobTranslator;
import jp.ac.titech.c.se.stein.rewriter.NameFilter;
+import jp.ac.titech.c.se.stein.util.SizeConverter;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
@@ -11,7 +12,6 @@
import org.apache.commons.io.FileUtils;
import picocli.CommandLine.Command;
import picocli.CommandLine.Mixin;
-import picocli.CommandLine.ITypeConverter;
import picocli.CommandLine.Option;
@@ -51,31 +51,4 @@ public AnyHotEntry rewriteBlobEntry(final BlobEntry entry, final Context c) {
return entry;
}
-
- public static class SizeConverter implements ITypeConverter {
- @Override
- public Long convert(final String value) {
- if (value.isEmpty()) {
- throw new IllegalArgumentException("Empty value is given");
- }
- final int len = value.length();
- final char unit = Character.toUpperCase(value.charAt(len - 1));
- final String num = value.substring(0, len - 1);
- return switch (unit) {
- case 'B' -> convert(num);
- case 'K' -> displaySizeToByteCount(num, 1024);
- case 'M' -> displaySizeToByteCount(num, 1024 * 1024);
- case 'G' -> displaySizeToByteCount(num, 1024 * 1024 * 1024);
- default -> displaySizeToByteCount(value, 1);
- };
- }
-
- protected long displaySizeToByteCount(final String value, final long base) {
- if (value.contains(".")) {
- return (long) (Double.parseDouble(value) * base);
- } else {
- return Long.parseLong(value) * base;
- }
- }
- }
}
diff --git a/src/main/java/jp/ac/titech/c/se/stein/app/blob/Tokenize.java b/src/main/java/jp/ac/titech/c/se/stein/app/blob/Tokenize.java
index 1de8689..007ff0a 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/app/blob/Tokenize.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/app/blob/Tokenize.java
@@ -8,7 +8,6 @@
import lombok.ToString;
import picocli.CommandLine.Command;
-import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
diff --git a/src/main/java/jp/ac/titech/c/se/stein/app/blob/TokenizeViaJDT.java b/src/main/java/jp/ac/titech/c/se/stein/app/blob/TokenizeViaJDT.java
index 7672b36..2f68715 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/app/blob/TokenizeViaJDT.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/app/blob/TokenizeViaJDT.java
@@ -1,7 +1,5 @@
package jp.ac.titech.c.se.stein.app.blob;
-import java.nio.charset.StandardCharsets;
-
import jp.ac.titech.c.se.stein.entry.AnyHotEntry;
import jp.ac.titech.c.se.stein.core.SourceText;
import jp.ac.titech.c.se.stein.entry.BlobEntry;
diff --git a/src/main/java/jp/ac/titech/c/se/stein/app/blob/Untokenize.java b/src/main/java/jp/ac/titech/c/se/stein/app/blob/Untokenize.java
index 4d087b3..fb8bb05 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/app/blob/Untokenize.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/app/blob/Untokenize.java
@@ -10,8 +10,6 @@
import picocli.CommandLine.Command;
import picocli.CommandLine.Mixin;
-import java.nio.charset.StandardCharsets;
-
/**
* Restores linetoken-encoded source files back to their original form.
* The inverse of {@link Tokenize}: removes line breaks between tokens and
diff --git a/src/main/java/jp/ac/titech/c/se/stein/core/Cache.java b/src/main/java/jp/ac/titech/c/se/stein/core/Cache.java
deleted file mode 100644
index de42ac0..0000000
--- a/src/main/java/jp/ac/titech/c/se/stein/core/Cache.java
+++ /dev/null
@@ -1,102 +0,0 @@
-package jp.ac.titech.c.se.stein.core;
-
-import lombok.AllArgsConstructor;
-
-import java.util.AbstractMap;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.function.Predicate;
-
-@AllArgsConstructor
-public class Cache extends AbstractMap {
- private final Map frontend, readingBackend, writingBackend;
-
- public Cache(final Map frontend, final Map backend) {
- this(frontend, backend, backend);
- }
-
- public Cache(final Map frontend, final Map backend, final boolean readFrom, final boolean writeTo) {
- this(frontend, readFrom ? backend : new NullObjectMap<>(),
- writeTo ? backend : new NullObjectMap<>());
- }
-
- @Override
- public V get(final Object key) {
- @SuppressWarnings("unchecked")
- final K k = (K) key;
- return frontend.computeIfAbsent(k, readingBackend::get);
- }
-
- @Override
- public V put(final K key, final V value) {
- writingBackend.put(key, value);
- return frontend.put(key, value);
- }
-
- @Override
-
- public Set> entrySet() {
- final Set> result = new HashSet<>();
- result.addAll(frontend.entrySet());
- result.addAll(readingBackend.entrySet());
- return result;
- }
-
- @Override
- public void clear() {
- frontend.clear();
- writingBackend.clear();
- }
-
- public static class Filter extends AbstractMap {
- private final Predicate condition;
- private final Map delegatee;
-
- public Filter(final Predicate condition, final Map delegatee) {
- this.condition = condition;
- this.delegatee = delegatee;
- }
-
- public static Map apply(final Predicate condition, final Map delegatee) {
- return new Filter<>(condition, delegatee);
- }
-
- @Override
- public V get(final Object key) {
- @SuppressWarnings("unchecked")
- final K k = (K) key;
- return condition.test(k) ? delegatee.get(key) : null;
- }
-
- @Override
- public V put(final K key, final V value) {
- return condition.test(key) ? delegatee.put(key, value) : value;
- }
-
- @Override
-
- public Set> entrySet() {
- return delegatee.entrySet();
- }
- }
-
- public static class NullObjectMap extends AbstractMap {
- @Override
- public V get(final Object key) {
- return null;
- }
-
- @Override
- public V put(final K key, final V value) {
- return value;
- }
-
- @Override
-
- public Set> entrySet() {
- return Collections.emptySet();
- }
- }
-}
diff --git a/src/main/java/jp/ac/titech/c/se/stein/core/Marshaler.java b/src/main/java/jp/ac/titech/c/se/stein/core/Marshaler.java
deleted file mode 100644
index 7a78e05..0000000
--- a/src/main/java/jp/ac/titech/c/se/stein/core/Marshaler.java
+++ /dev/null
@@ -1,91 +0,0 @@
-package jp.ac.titech.c.se.stein.core;
-
-import org.eclipse.jgit.lib.ObjectId;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.OutputStream;
-
-/**
- * Converts an object to a byte array and vice versa.
- */
-public interface Marshaler {
- Logger log = LoggerFactory.getLogger(Marshaler.class);
-
- /**
- * Marshals an object and write it to the given stream.
- */
- void writeObject(final T object, final OutputStream stream);
-
- /**
- * Reads from the given stream and unmarshals it to an object.
- */
- T readObject(final InputStream stream);
-
- /**
- * Marshals an object.
- */
- default byte[] marshal(final T object) {
- final ByteArrayOutputStream stream = new ByteArrayOutputStream();
- writeObject(object, stream);
- return stream.toByteArray();
- }
-
- /**
- * Unmarshals an object.
- */
- default T unmarshal(final byte[] binary) {
- return readObject(new ByteArrayInputStream(binary));
- }
-
- class JavaSerializerMarshaler implements Marshaler {
- @Override
- public void writeObject(final T object, final OutputStream stream) {
- try (final ObjectOutputStream output = new ObjectOutputStream(stream)) {
- output.writeObject(object);
- } catch (final IOException e) {
- log.error(e.getMessage(), e);
- }
- }
-
- @Override
- public T readObject(final InputStream stream) {
- try (final ObjectInputStream input = new ObjectInputStream(stream)) {
- @SuppressWarnings("unchecked")
- final T result = (T) input.readObject();
- return result;
- } catch (final IOException | ClassNotFoundException e) {
- log.error(e.getMessage(), e);
- return null;
- }
- }
- }
-
- class ObjectIdMarshaler implements Marshaler {
- @Override
- public void writeObject(final ObjectId object, final OutputStream stream) {
- try {
- object.copyRawTo(stream);
- } catch (final IOException e) {
- log.error(e.getMessage(), e);
- }
- }
-
- @Override
- public ObjectId readObject(final InputStream stream) {
- try {
- final byte[] bytes = stream.readAllBytes();
- return ObjectId.fromRaw(bytes);
- } catch (final IOException e) {
- log.error(e.getMessage(), e);
- return null;
- }
- }
- }
-}
diff --git a/src/main/java/jp/ac/titech/c/se/stein/core/RefEntry.java b/src/main/java/jp/ac/titech/c/se/stein/core/RefEntry.java
index 6e4a3c1..097f8cd 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/core/RefEntry.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/core/RefEntry.java
@@ -1,6 +1,7 @@
package jp.ac.titech.c.se.stein.core;
import java.io.Serializable;
+import java.util.Comparator;
import lombok.EqualsAndHashCode;
import org.eclipse.jgit.lib.ObjectId;
@@ -14,7 +15,7 @@
* A symbolic ref has a non-null {@link #target} and a null {@link #id}.
*/
@EqualsAndHashCode
-public class RefEntry implements Serializable {
+public class RefEntry implements Serializable, Comparable<RefEntry> {
/**
* The ref name (e.g., {@code "refs/heads/main"} or {@code "HEAD"}).
*/
@@ -76,4 +77,14 @@ public boolean isSymbolic() {
public String toString() {
return String.format("", name, target != null ? target : id.name());
}
+
+ private static final Comparator<RefEntry> COMPARATOR = Comparator
+ .comparing((RefEntry r) -> r.name, Comparator.nullsFirst(Comparator.naturalOrder()))
+ .thenComparing(r -> r.id, Comparator.nullsFirst(Comparator.naturalOrder()))
+ .thenComparing(r -> r.target, Comparator.nullsFirst(Comparator.naturalOrder()));
+
+ @Override
+ public int compareTo(final RefEntry other) {
+ return COMPARATOR.compare(this, other);
+ }
}
diff --git a/src/main/java/jp/ac/titech/c/se/stein/core/SQLiteCacheProvider.java b/src/main/java/jp/ac/titech/c/se/stein/core/SQLiteCacheProvider.java
deleted file mode 100644
index 12d8690..0000000
--- a/src/main/java/jp/ac/titech/c/se/stein/core/SQLiteCacheProvider.java
+++ /dev/null
@@ -1,187 +0,0 @@
-package jp.ac.titech.c.se.stein.core;
-
-import com.j256.ormlite.dao.Dao;
-import com.j256.ormlite.dao.DaoManager;
-import com.j256.ormlite.field.DataType;
-import com.j256.ormlite.field.DatabaseField;
-import com.j256.ormlite.jdbc.JdbcConnectionSource;
-import com.j256.ormlite.logger.Slf4jLoggingLogBackend;
-import com.j256.ormlite.misc.TransactionManager;
-import com.j256.ormlite.table.DatabaseTable;
-import com.j256.ormlite.table.TableUtils;
-import jp.ac.titech.c.se.stein.entry.AnyColdEntry;
-import jp.ac.titech.c.se.stein.entry.Entry;
-import org.eclipse.jgit.lib.ObjectId;
-import org.eclipse.jgit.lib.Repository;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.AbstractMap;
-import java.util.Collections;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.Callable;
-import java.util.function.Supplier;
-import java.util.stream.Collectors;
-
-public class SQLiteCacheProvider {
- private final static Logger log = LoggerFactory.getLogger(SQLiteCacheProvider.class);
-
- static class KeyValue {
- @DatabaseField(id = true, dataType = DataType.BYTE_ARRAY)
- byte[] source;
- @DatabaseField(dataType = DataType.BYTE_ARRAY)
- byte[] target;
- }
-
- @DatabaseTable(tableName = "commits")
- static class CommitRow extends KeyValue {}
-
- @DatabaseTable(tableName = "entries")
- static class EntryRow extends KeyValue {}
-
- @DatabaseTable(tableName = "refs")
- static class RefRow extends KeyValue {}
-
- JdbcConnectionSource connectionSource = null;
-
- Dao commitDao;
-
- Dao entryDao;
-
- Dao refDao;
-
- final boolean initial;
-
- public SQLiteCacheProvider(final Repository target) {
- com.j256.ormlite.logger.LoggerFactory.setLogBackendFactory(new Slf4jLoggingLogBackend.Slf4jLoggingLogBackendFactory());
- com.j256.ormlite.logger.Logger.setGlobalLogLevel(com.j256.ormlite.logger.Level.FATAL);
-
- final File dotGitDir = target.getDirectory().getAbsoluteFile();
- final Path dbFile = dotGitDir.toPath().resolve("cache.db");
- initial = !Files.exists(dbFile);
- try {
- connectionSource = new JdbcConnectionSource("jdbc:sqlite:" + dbFile);
- commitDao = DaoManager.createDao(connectionSource, CommitRow.class);
- TableUtils.createTableIfNotExists(connectionSource, CommitRow.class);
- entryDao = DaoManager.createDao(connectionSource, EntryRow.class);
- TableUtils.createTableIfNotExists(connectionSource, EntryRow.class);
- refDao = DaoManager.createDao(connectionSource, RefRow.class);
- TableUtils.createTableIfNotExists(connectionSource, RefRow.class);
- } catch (final SQLException e) {
- log.error(e.getMessage(), e);
- } finally {
- try {
- if (connectionSource != null) {
- connectionSource.close();
- }
- } catch (final IOException e) {
- log.error("Failed to close connection to Database", e);
- }
- }
- }
-
- public void inTransaction(final Callable fn) {
- try {
- TransactionManager.callInTransaction(connectionSource, fn);
- } catch (final SQLException e) {
- log.error(e.getMessage(), e);
- }
- }
-
- public boolean isInitial() {
- return initial;
- }
-
- public Map getCommitMapping() {
- final Marshaler m = new Marshaler.ObjectIdMarshaler();
- return new MapAdapter<>(commitDao, CommitRow::new, m, m);
- }
-
- public Map getEntryMapping() {
- final Marshaler km = new Marshaler.JavaSerializerMarshaler<>();
- final Marshaler vm = new Marshaler.JavaSerializerMarshaler<>();
- return new MapAdapter<>(entryDao, EntryRow::new, km, vm);
- }
-
- public Map getRefEntryMapping() {
- final Marshaler m = new Marshaler.JavaSerializerMarshaler<>();
- return new MapAdapter<>(refDao, RefRow::new, m, m);
- }
-
- /**
- * Map interface using the SQLite cache.
- */
- static class MapAdapter extends AbstractMap {
-
- final Dao dao;
-
- final Supplier constructor;
-
- final Marshaler keyMarshaler;
-
- final Marshaler valueMarshaler;
-
- public MapAdapter(final Dao dao, final Supplier constructor, final Marshaler keyMarshaler, Marshaler valueMarshaler) {
- this.dao = dao;
- this.constructor = constructor;
- this.keyMarshaler = keyMarshaler;
- this.valueMarshaler = valueMarshaler;
- }
-
- @Override
- public V get(final Object key) {
- @SuppressWarnings("unchecked")
- final K k = (K) key;
- try {
- final byte[] source = keyMarshaler.marshal(k);
- final Row row = dao.queryForId(source);
- return row != null ? valueMarshaler.unmarshal(row.target) : null;
- } catch (final SQLException e) {
- log.warn(e.getMessage(), e);
- return null;
- }
- }
-
- @Override
- public V put(final K key, final V value) {
- try {
- final Row row = constructor.get();
- row.source = keyMarshaler.marshal(key);
- row.target = valueMarshaler.marshal(value);
- dao.createIfNotExists(row);
- } catch (final SQLException e) {
- log.error(e.getMessage(), e);
- }
- return value;
- }
-
- @Override
- public Set> entrySet() {
- try {
- return dao
- .queryForAll()
- .stream()
- .map(r -> new AbstractMap.SimpleEntry<>(keyMarshaler.unmarshal(r.source), valueMarshaler.unmarshal(r.target)))
- .collect(Collectors.toSet());
- } catch (final SQLException e) {
- log.error(e.getMessage(), e);
- return Collections.emptySet();
- }
- }
-
- @Override
- public void clear() {
- try {
- dao.deleteBuilder().delete();
- } catch (final SQLException e) {
- log.error(e.getMessage(), e);
- }
- }
- }
-}
diff --git a/src/main/java/jp/ac/titech/c/se/stein/core/cache/CommitMapping.java b/src/main/java/jp/ac/titech/c/se/stein/core/cache/CommitMapping.java
new file mode 100644
index 0000000..221964f
--- /dev/null
+++ b/src/main/java/jp/ac/titech/c/se/stein/core/cache/CommitMapping.java
@@ -0,0 +1,126 @@
+package jp.ac.titech.c.se.stein.core.cache;
+
+import jp.ac.titech.c.se.stein.core.RepositoryAccess;
+import lombok.Getter;
+import org.eclipse.jgit.lib.Constants;
+import org.eclipse.jgit.lib.ObjectId;
+import org.eclipse.jgit.lib.Ref;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Manages source-to-target commit ID mapping with support for notes-based restoration.
+ *
+ * <p>
+ * When an incremental transformation is performed, commits that were already
+ * transformed in a previous run should not be re-processed. git-stein records the
+ * source commit ID as a git note on each target commit, so the target repository
+ * itself serves as persistent storage for commit mappings.
+ *
+ * <p>
+ * Loading is two-phase. On initialization, only the ref tips of the target are
+ * examined: for each target ref, the note on the tip commit is read to recover the
+ * corresponding source commit ID. These are registered in the mapping and also
+ * collected as "uninteresting" points so that the source RevWalk stops at
+ * already-processed commits. This covers the common case (linear history, no merges
+ * from old branches). If a merge commit references an old source commit not reachable
+ * from any current ref tip, the mapping will miss, and a full scan of all target notes
+ * is triggered lazily (at most once) to load the remaining entries.
+ */
+public class CommitMapping extends AbstractMap<ObjectId, ObjectId> {
+ private static final Logger log = LoggerFactory.getLogger(CommitMapping.class);
+
+    private final Map<ObjectId, ObjectId> map = new HashMap<>();
+
+ /**
+ * Source commit IDs of previously processed ref tips.
+ * These should be marked as uninteresting in the source RevWalk.
+ */
+ @Getter
+    private final List<ObjectId> previousSourceTips = new ArrayList<>();
+
+ private NoteObjectIdMap notesMap;
+ private volatile boolean notesFullyLoaded = false;
+
+ /**
+ * Restores commit mapping from the target repository's notes.
+ * Only ref tips are read eagerly.
+ *
+ * @param notesRef the notes ref to read from (e.g., {@code refs/notes/git-stein-prev})
+ */
+ public void restoreFromTarget(RepositoryAccess target, String notesRef) {
+        final List<Ref> targetRefs = target.getRefs();
+ if (targetRefs.isEmpty()) {
+ return;
+ }
+
+ notesMap = new NoteObjectIdMap(target.readNotes(notesRef), target);
+
+ for (final Ref ref : targetRefs) {
+ final ObjectId targetTipId = target.getRefTarget(ref);
+ if (targetTipId == null || target.getObjectType(targetTipId) != Constants.OBJ_COMMIT) {
+ continue;
+ }
+ final ObjectId sourceTipId = notesMap.get(targetTipId);
+ if (sourceTipId == null) {
+ continue;
+ }
+ map.put(sourceTipId, targetTipId);
+ previousSourceTips.add(sourceTipId);
+ log.debug("Restored commit mapping from note: {} -> {} (ref: {})",
+ sourceTipId.name(), targetTipId.name(), ref.getName());
+ }
+
+ if (!previousSourceTips.isEmpty()) {
+ log.info("Restored {} commit mappings from target notes", previousSourceTips.size());
+ }
+ }
+
+
+ @Override
+ public ObjectId get(Object key) {
+ final ObjectId v = map.get(key);
+ if (v != null) {
+ return v;
+ }
+ if (!notesFullyLoaded && notesMap != null) {
+ loadAllNotes();
+ return map.get(key);
+ }
+ return null;
+ }
+
+ @Override
+ public ObjectId put(ObjectId key, ObjectId value) {
+ return map.put(key, value);
+ }
+
+ @Override
+ public int size() {
+ return map.size();
+ }
+
+ @Override
+    public Set<Map.Entry<ObjectId, ObjectId>> entrySet() {
+ return map.entrySet();
+ }
+
+ /**
+ * Loads all notes into the mapping. Called at most once, when a lookup
+ * misses on a commit not reachable from any ref tip (e.g., old merge parent).
+ */
+ private synchronized void loadAllNotes() {
+ if (notesFullyLoaded) {
+ return;
+ }
+ log.info("Loading full notes for commit mapping fallback");
+ notesMap.forEach((targetId, sourceId) -> map.put(sourceId, targetId));
+ log.info("Loaded commit mappings, total {} entries", map.size());
+ notesFullyLoaded = true;
+ }
+}
diff --git a/src/main/java/jp/ac/titech/c/se/stein/core/cache/NoteObjectIdMap.java b/src/main/java/jp/ac/titech/c/se/stein/core/cache/NoteObjectIdMap.java
new file mode 100644
index 0000000..ffb0f74
--- /dev/null
+++ b/src/main/java/jp/ac/titech/c/se/stein/core/cache/NoteObjectIdMap.java
@@ -0,0 +1,83 @@
+package jp.ac.titech.c.se.stein.core.cache;
+
+import jp.ac.titech.c.se.stein.core.Context;
+import jp.ac.titech.c.se.stein.core.RepositoryAccess;
+import org.eclipse.jgit.lib.Constants;
+import org.eclipse.jgit.lib.ObjectId;
+import org.eclipse.jgit.notes.NoteMap;
+
+import java.util.function.BiConsumer;
+
+/**
+ * A view over a JGit NoteMap that interprets note bodies as ObjectIds (hex-encoded).
+ * Supports both reading (get, forEach) and writing (add).
+ */
+public class NoteObjectIdMap {
+ private final NoteMap notes;
+ private final RepositoryAccess ra;
+
+ public NoteObjectIdMap(NoteMap notes, RepositoryAccess ra) {
+ this.notes = notes;
+ this.ra = ra;
+ }
+
+ /**
+ * Returns the NoteMap backing this view.
+ */
+ public NoteMap getNoteMap() {
+ return notes;
+ }
+
+ /**
+ * Reads the note on the given commit as an ObjectId.
+ */
+ public ObjectId get(ObjectId commitId) {
+ return parseObjectId(ra.readNote(notes, commitId));
+ }
+
+ /**
+ * Adds a note recording the given value as the note body on the given commit.
+ */
+ public void add(ObjectId commitId, ObjectId value, Context c) {
+ final byte[] content = new byte[Constants.OBJECT_ID_STRING_LENGTH];
+ value.copyTo(content, 0);
+ ra.addNote(notes, commitId, content, c);
+ }
+
+ /**
+ * Adds a note by forwarding raw note bytes (for chain forwarding).
+ */
+ public void addRaw(ObjectId commitId, byte[] rawNote, Context c) {
+ ra.addNote(notes, commitId, rawNote, c);
+ }
+
+ /**
+ * Iterates all notes, passing (annotatedId, bodyAsObjectId) pairs.
+ */
+    public void forEach(BiConsumer<ObjectId, ObjectId> consumer) {
+ ra.forEachNote(notes, (annotatedId, body) -> {
+ final ObjectId bodyId = parseObjectId(body);
+ if (bodyId != null) {
+ consumer.accept(annotatedId, bodyId);
+ }
+ });
+ }
+
+ /**
+ * Writes the notes to the repository under the given ref.
+ */
+ public void write(String ref, Context c) {
+ ra.writeNotes(notes, ref, c);
+ }
+
+ private static ObjectId parseObjectId(byte[] body) {
+ if (body == null) {
+ return null;
+ }
+ try {
+ return ObjectId.fromString(new String(body));
+ } catch (Exception e) {
+ return null;
+ }
+ }
+}
diff --git a/src/main/java/jp/ac/titech/c/se/stein/core/cache/PersistentEntryCache.java b/src/main/java/jp/ac/titech/c/se/stein/core/cache/PersistentEntryCache.java
new file mode 100644
index 0000000..879a99a
--- /dev/null
+++ b/src/main/java/jp/ac/titech/c/se/stein/core/cache/PersistentEntryCache.java
@@ -0,0 +1,58 @@
+package jp.ac.titech.c.se.stein.core.cache;
+
+import jp.ac.titech.c.se.stein.entry.AnyColdEntry;
+import jp.ac.titech.c.se.stein.entry.Entry;
+import lombok.Getter;
+import org.eclipse.jgit.lib.Repository;
+import org.h2.mvstore.MVStore;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Map;
+
+/**
+ * Persistent entry cache backed by H2 MVStore.
+ * Data is stored in a single file ({@code cache.mv.db}) in the target repository's .git directory.
+ */
+public class PersistentEntryCache implements AutoCloseable {
+ /**
+ * Fraction of memoryBudget allocated to the read page cache.
+ */
+ private static final double READ_CACHE_RATIO = 1.0;
+
+ /**
+ * Fraction of memoryBudget allocated to the write buffer (auto-commit threshold).
+ * Worst-case total memory usage is (READ_CACHE_RATIO + WRITE_BUFFER_RATIO) times the budget.
+ */
+ private static final double WRITE_BUFFER_RATIO = 0.5;
+
+ private final MVStore store;
+
+ @Getter
+ private final boolean initial;
+
+ public PersistentEntryCache(final Repository target, final long memoryBudget) {
+ final Path dbFile = target.getDirectory().toPath().resolve("cache.mv.db");
+ initial = !Files.exists(dbFile);
+ final int cacheSizeMB = (int) Math.max(1, (long) (memoryBudget * READ_CACHE_RATIO) / (1024 * 1024));
+ final int autoCommitBufferSizeKB = (int) Math.max(1, (long) (memoryBudget * WRITE_BUFFER_RATIO) / 1024);
+ store = new MVStore.Builder()
+ .fileName(dbFile.toString())
+ .cacheSize(cacheSizeMB)
+ .autoCommitBufferSize(autoCommitBufferSizeKB)
+ .open();
+ }
+
+ @SuppressWarnings("unchecked")
+    public Map<Entry, AnyColdEntry> getEntryMapping() {
+ return store.openMap("entries");
+ }
+
+ @Override
+ public void close() {
+ if (store != null && !store.isClosed()) {
+ store.commit();
+ store.close();
+ }
+ }
+}
diff --git a/src/main/java/jp/ac/titech/c/se/stein/entry/SingleEntry.java b/src/main/java/jp/ac/titech/c/se/stein/entry/SingleEntry.java
index 9c5f377..fa08694 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/entry/SingleEntry.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/entry/SingleEntry.java
@@ -3,6 +3,8 @@
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.ObjectId;
+import java.util.Comparator;
+
/**
* Common interface for a single tree entry.
*
@@ -93,11 +95,18 @@ default String sortKey() {
return isTree() ? getName() + "/" : getName();
}
+    Comparator<SingleEntry> COMPARATOR = Comparator
+ .comparing(SingleEntry::sortKey)
+ .thenComparing(SingleEntry::getId)
+ .thenComparingInt(SingleEntry::getMode)
+ .thenComparing(SingleEntry::getDirectory, Comparator.nullsFirst(Comparator.naturalOrder()));
+
/**
- * Compares entries by their {@link #sortKey()}.
+ * Compares entries by their {@link #sortKey()}, then by mode, object ID, and directory
+ * to ensure consistency with {@code equals}.
*/
@Override
default int compareTo(final SingleEntry other) {
- return sortKey().compareTo(other.sortKey());
+ return COMPARATOR.compare(this, other);
}
}
diff --git a/src/main/java/jp/ac/titech/c/se/stein/entry/TreeEntry.java b/src/main/java/jp/ac/titech/c/se/stein/entry/TreeEntry.java
index 7a74cfe..4bdb6f5 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/entry/TreeEntry.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/entry/TreeEntry.java
@@ -11,7 +11,6 @@
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
-import java.util.stream.Stream;
/**
* A Hot entry representing a tree (directory).
diff --git a/src/main/java/jp/ac/titech/c/se/stein/jgit/TreeFormatter.java b/src/main/java/jp/ac/titech/c/se/stein/jgit/TreeFormatter.java
index 5d7fa94..3351472 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/jgit/TreeFormatter.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/jgit/TreeFormatter.java
@@ -56,8 +56,9 @@ private void append(byte[] nameBuf, int nameLen, byte[] mode, AnyObjectId id) {
}
private boolean fmtBuf(byte[] nameBuf, int nameLen, byte[] mode) {
- if (buf == null || buf.length < ptr + entrySize(mode, nameLen))
+ if (buf == null || buf.length < ptr + entrySize(mode, nameLen)) {
return false;
+ }
//mode.copyTo(buf, ptr);
//ptr += mode.copyToLength();
System.arraycopy(mode, 0, buf, ptr, mode.length);
@@ -83,15 +84,17 @@ private void fmtOverflowBuffer(byte[] nameBuf, int nameLen, byte[] mode) throws
}
public ObjectId insertTo(ObjectInserter ins) throws IOException {
- if (buf != null)
+ if (buf != null) {
return ins.insert(OBJ_TREE, buf, 0, ptr);
+ }
final long len = overflowBuffer.length();
return ins.insert(OBJ_TREE, len, overflowBuffer.openInputStream());
}
public ObjectId computeId(ObjectInserter ins) {
- if (buf != null)
+ if (buf != null) {
return ins.idFor(OBJ_TREE, buf, 0, ptr);
+ }
final long len = overflowBuffer.length();
try {
return ins.idFor(OBJ_TREE, len, overflowBuffer.openInputStream());
diff --git a/src/main/java/jp/ac/titech/c/se/stein/rewriter/BlobTranslator.java b/src/main/java/jp/ac/titech/c/se/stein/rewriter/BlobTranslator.java
index 9eceeef..bea0e9c 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/rewriter/BlobTranslator.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/rewriter/BlobTranslator.java
@@ -8,7 +8,6 @@
import lombok.Getter;
import lombok.ToString;
-import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.stream.Collectors;
import java.util.function.Function;
diff --git a/src/main/java/jp/ac/titech/c/se/stein/rewriter/RepositoryRewriter.java b/src/main/java/jp/ac/titech/c/se/stein/rewriter/RepositoryRewriter.java
index 02c5b5c..1bd6e72 100644
--- a/src/main/java/jp/ac/titech/c/se/stein/rewriter/RepositoryRewriter.java
+++ b/src/main/java/jp/ac/titech/c/se/stein/rewriter/RepositoryRewriter.java
@@ -5,11 +5,14 @@
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
+import com.google.common.cache.CacheBuilder;
import jp.ac.titech.c.se.stein.core.*;
+import jp.ac.titech.c.se.stein.core.cache.*;
import jp.ac.titech.c.se.stein.entry.*;
import jp.ac.titech.c.se.stein.jgit.RevWalk;
import lombok.Setter;
@@ -41,9 +44,26 @@ public class RepositoryRewriter implements RewriterCommand {
protected static final ObjectId ZERO = ObjectId.zeroId();
/**
- * Entry-to-entries mapping.
+ * Entry-to-entries mapping. When {@code --cache} is disabled, uses an in-memory
+ * Guava Cache with LRU eviction. When enabled, uses a persistent MVStore map.
*/
- protected Map entryMapping = new HashMap<>();
+    protected Map<Entry, AnyColdEntry> entryMapping;
+
+ private final AtomicLong blobCacheHits = new AtomicLong();
+ private final AtomicLong blobCacheMisses = new AtomicLong();
+ private final AtomicLong treeCacheHits = new AtomicLong();
+ private final AtomicLong treeCacheMisses = new AtomicLong();
+
+ private static final int BYTES_PER_ENTRY = 300;
+
+    private static Map<Entry, AnyColdEntry> createEntryMapping(long memoryBudget) {
+ final long maxWeight = Math.max(1000, memoryBudget / BYTES_PER_ENTRY);
+ return CacheBuilder.newBuilder()
+ .maximumWeight(maxWeight)
+ .weigher((Entry k, AnyColdEntry v) -> v.size())
+ .build()
+ .asMap();
+ }
/**
* Root tree-to-tree mapping.
@@ -53,7 +73,7 @@ public class RepositoryRewriter implements RewriterCommand {
/**
* Commit-to-commit mapping.
*/
- protected Map commitMapping = new HashMap<>();
+ protected final CommitMapping commitMapping = new CommitMapping();
/**
* Tag-to-tag mapping.
@@ -65,8 +85,38 @@ public class RepositoryRewriter implements RewriterCommand {
*/
    protected Map<RefEntry, RefEntry> refEntryMapping = new HashMap<>();
+ /**
+ * Notes ref for the immediate source commit ID (for incremental transformation).
+ */
+ public static final String R_NOTES_PREV = "refs/notes/git-stein-prev";
+
+ /**
+ * Notes ref for the original source commit ID (through the chain).
+ */
+ public static final String R_NOTES_ORIG = "refs/notes/git-stein-orig";
+
protected RepositoryAccess source, target;
+ /**
+ * Whether source is a chained transformation (has git-stein-orig notes).
+ */
+ private boolean isChained = false;
+
+ /**
+ * Notes for prev (always the immediate source commit ID).
+ */
+ private NoteObjectIdMap prevNotes;
+
+ /**
+ * Notes for orig (forwarded from source, or same as prev for single).
+ */
+ private NoteObjectIdMap origNotes;
+
+ /**
+ * Source's orig notes (for chain forwarding). Cached at initialization.
+ */
+ private NoteObjectIdMap sourceOrigNotes;
+
protected boolean isOverwriting = false;
protected boolean isPathSensitive = false;
@@ -74,55 +124,40 @@ public class RepositoryRewriter implements RewriterCommand {
@Setter
protected Config config;
- public enum CacheLevel {
- blob, tree, commit
- }
-
- protected SQLiteCacheProvider cacheProvider;
+ protected PersistentEntryCache entryCache;
public void initialize(final Repository sourceRepo, final Repository targetRepo) {
source = new RepositoryAccess(sourceRepo);
target = new RepositoryAccess(targetRepo);
isOverwriting = sourceRepo == targetRepo;
- if (config.nthreads > 1) {
- this.entryMapping = new ConcurrentHashMap<>();
- }
if (config.isDryRunning) {
source.setDryRunning(true);
target.setDryRunning(true);
}
- if (!config.cacheLevel.isEmpty()) {
- cacheProvider = new SQLiteCacheProvider(targetRepo);
- if (config.cacheLevel.contains(CacheLevel.commit)) {
- log.info("Stored mapping (commit-mapping) is available");
- commitMapping = new Cache<>(commitMapping, cacheProvider.getCommitMapping(), !cacheProvider.isInitial(), true);
- refEntryMapping = new Cache<>(refEntryMapping, cacheProvider.getRefEntryMapping(), !cacheProvider.isInitial(), true);
- }
- if (config.cacheLevel.contains(CacheLevel.blob) || config.cacheLevel.contains(CacheLevel.tree)) {
- log.info("Stored mapping (entry-mapping) is available");
- Map storedEntryMapping = cacheProvider.getEntryMapping();
- if (!config.cacheLevel.contains(CacheLevel.tree)) {
- log.info("Stored mapping (entry-mapping): blob-only filtering");
- storedEntryMapping = Cache.Filter.apply(e -> !e.isTree(), storedEntryMapping);
- } else if (!config.cacheLevel.contains(CacheLevel.blob)) {
- log.info("Stored mapping (entry-mapping): tree-only filtering");
- storedEntryMapping = Cache.Filter.apply(Entry::isTree, storedEntryMapping);
- }
- entryMapping = new Cache<>(entryMapping, storedEntryMapping, !cacheProvider.isInitial(), true);
+ if (config.isAddingNotes && !isOverwriting) {
+ isChained = source.getRef(R_NOTES_ORIG) != null;
+ prevNotes = new NoteObjectIdMap(target.readNotes(R_NOTES_PREV), target);
+ if (isChained) {
+ origNotes = new NoteObjectIdMap(target.readNotes(R_NOTES_ORIG), target);
+ sourceOrigNotes = new NoteObjectIdMap(source.readNotes(R_NOTES_ORIG), source);
+ } else {
+ origNotes = prevNotes;
}
+ commitMapping.restoreFromTarget(target, R_NOTES_PREV);
+ }
+ final long budget = config.entryMappingMemory >= 0 ? config.entryMappingMemory : Runtime.getRuntime().maxMemory() / 4;
+ if (config.isCachingEnabled) {
+ entryCache = new PersistentEntryCache(targetRepo, budget);
+ entryMapping = entryCache.getEntryMapping();
+ } else {
+ entryMapping = createEntryMapping(budget);
}
}
public void rewrite(final Context c) {
setUp(c);
- final RevWalk walk = prepareRevisionWalk(c);
- if (cacheProvider != null) {
- cacheProvider.inTransaction(() -> {
- rewriteCommits(walk, c);
- updateRefs(c);
- return null;
- });
- } else {
+ try {
+ final RevWalk walk = prepareRevisionWalk(c);
if (config.nthreads >= 2) {
log.debug("Parallel rewriting");
rewriteRootTrees(walk, c);
@@ -130,9 +165,35 @@ public void rewrite(final Context c) {
}
rewriteCommits(walk, c);
updateRefs(c);
+ if (config.isAddingNotes) {
+ prevNotes.write(R_NOTES_PREV, c);
+ if (isChained) {
+ origNotes.write(R_NOTES_ORIG, c);
+ } else {
+ // Single transformation: orig = prev, share the same ref
+ target.applyRefUpdate(new RefEntry(R_NOTES_ORIG, target.getRef(R_NOTES_PREV).getObjectId()));
+ }
+ // Default notes = orig (for git log display)
+ target.applyRefUpdate(new RefEntry(Constants.R_NOTES_COMMITS, target.getRef(R_NOTES_ORIG).getObjectId()));
+ } else {
+ target.writeNotes(target.getDefaultNotes(), c);
+ }
+ } finally {
+ final long blobHit = blobCacheHits.get(), blobMiss = blobCacheMisses.get();
+ final long treeHit = treeCacheHits.get(), treeMiss = treeCacheMisses.get();
+ final long blobTotal = blobHit + blobMiss, treeTotal = treeHit + treeMiss, total = blobTotal + treeTotal;
+ if (total > 0) {
+ final long hits = blobHit + treeHit;
+ log.info("Entry mapping cache hit: blob {}/{} ({}%), tree {}/{} ({}%), total {}/{} ({}%)",
+ blobHit, blobTotal, String.format("%.1f", blobTotal > 0 ? blobHit * 100.0 / blobTotal : 0),
+ treeHit, treeTotal, String.format("%.1f", treeTotal > 0 ? treeHit * 100.0 / treeTotal : 0),
+ hits, total, String.format("%.1f", hits * 100.0 / total));
+ }
+ if (entryCache != null) {
+ entryCache.close();
+ }
+ cleanUp(c);
}
- target.writeNotes(target.getDefaultNotes(), c);
- cleanUp(c);
}
protected void setUp(final Context c) {}
@@ -232,16 +293,11 @@ protected List filterRefs(final List refs, @SuppressWarnings("unused")
* Collects the set of commit Ids used as uninteresting points.
*/
    protected Collection<ObjectId> collectUninterestings(@SuppressWarnings("unused") final Context c) {
- final List result = new ArrayList<>();
- for (final Map.Entry e : refEntryMapping.entrySet()) {
- final RefEntry ref = e.getKey();
- if (ref.id != null) {
- log.debug("Previous Ref {}: added as an uninteresting point (commit: {})", ref.name, ref.id.name());
- result.add(ref.id);
- }
+        final List<ObjectId> tips = commitMapping.getPreviousSourceTips();
+ if (!tips.isEmpty()) {
+ log.info("Using {} previous source tips as uninteresting points", tips.size());
}
- refEntryMapping.clear(); // ref entries might be removed when updated.
- return result;
+ return tips;
}
/**
@@ -272,23 +328,15 @@ protected ObjectId rewriteCommit(final RevCommit commit, final Context c) {
log.debug("Rewrite commit: {} -> {} {}", oldId.name(), newId.name(), c);
if (config.isAddingNotes) {
- target.addNote(target.getDefaultNotes(), newId, getNote(oldId, c), uc);
+ prevNotes.add(newId, oldId, uc);
+ if (isChained) {
+ final ObjectId origId = sourceOrigNotes.get(oldId);
+ origNotes.add(newId, origId != null ? origId : oldId, uc);
+ }
}
return newId;
}
- /**
- * Returns a note for a commit.
- */
- protected byte[] getNote(final ObjectId oldCommitId, @SuppressWarnings("unused") final Context c) {
- final byte[] note = source.readNote(source.getDefaultNotes(), oldCommitId);
- if (note != null) {
- return note;
- }
- final byte[] blob = new byte[Constants.OBJECT_ID_STRING_LENGTH];
- oldCommitId.copyTo(blob, 0);
- return blob;
- }
/**
* Rewrites the parents of a commit.
@@ -332,10 +380,12 @@ protected ObjectId rewriteRootTree(final ObjectId treeId, final Context c) {
*/
protected AnyColdEntry getEntry(final Entry entry, final Context c) {
// computeIfAbsent is unsuitable because this may be invoked recursively
- final AnyColdEntry cache = entryMapping.get(entry);
- if (cache != null) {
- return cache;
+ final AnyColdEntry cached = entryMapping.get(entry);
+ if (cached != null) {
+ (entry.isTree() ? treeCacheHits : blobCacheHits).incrementAndGet();
+ return cached;
}
+ (entry.isTree() ? treeCacheMisses : blobCacheMisses).incrementAndGet();
final AnyColdEntry result = rewriteEntry(entry, c);
entryMapping.put(entry, result);
return result;
diff --git a/src/main/java/jp/ac/titech/c/se/stein/util/SizeConverter.java b/src/main/java/jp/ac/titech/c/se/stein/util/SizeConverter.java
new file mode 100644
index 0000000..35f5963
--- /dev/null
+++ b/src/main/java/jp/ac/titech/c/se/stein/util/SizeConverter.java
@@ -0,0 +1,33 @@
+package jp.ac.titech.c.se.stein.util;
+
+import picocli.CommandLine.ITypeConverter;
+
+/**
+ * Converts a human-readable size string (e.g., "10", "1K", "256M", "1.5G") to bytes.
+ */
+public class SizeConverter implements ITypeConverter<Long> {
+ @Override
+ public Long convert(final String value) {
+ if (value.isEmpty()) {
+ throw new IllegalArgumentException("Empty value is given");
+ }
+ final int len = value.length();
+ final char unit = Character.toUpperCase(value.charAt(len - 1));
+ final String num = value.substring(0, len - 1);
+ return switch (unit) {
+ case 'B' -> convert(num);
+ case 'K' -> displaySizeToByteCount(num, 1024);
+ case 'M' -> displaySizeToByteCount(num, 1024 * 1024);
+ case 'G' -> displaySizeToByteCount(num, 1024 * 1024 * 1024);
+ default -> displaySizeToByteCount(value, 1);
+ };
+ }
+
+ protected long displaySizeToByteCount(final String value, final long base) {
+ if (value.contains(".")) {
+ return (long) (Double.parseDouble(value) * base);
+ } else {
+ return Long.parseLong(value) * base;
+ }
+ }
+}
diff --git a/src/test/java/jp/ac/titech/c/se/stein/app/blob/ConvertBlobTest.java b/src/test/java/jp/ac/titech/c/se/stein/app/blob/ConvertBlobTest.java
index f481438..0bf3695 100644
--- a/src/test/java/jp/ac/titech/c/se/stein/app/blob/ConvertBlobTest.java
+++ b/src/test/java/jp/ac/titech/c/se/stein/app/blob/ConvertBlobTest.java
@@ -6,7 +6,6 @@
import jp.ac.titech.c.se.stein.entry.BlobEntry;
import jp.ac.titech.c.se.stein.entry.HotEntry;
import jp.ac.titech.c.se.stein.util.ProcessRunner;
-import org.eclipse.jgit.lib.FileMode;
import org.junit.jupiter.api.Test;
import java.net.InetSocketAddress;
diff --git a/src/test/java/jp/ac/titech/c/se/stein/app/blob/CregitTest.java b/src/test/java/jp/ac/titech/c/se/stein/app/blob/CregitTest.java
index bb9115a..dd5fe0a 100644
--- a/src/test/java/jp/ac/titech/c/se/stein/app/blob/CregitTest.java
+++ b/src/test/java/jp/ac/titech/c/se/stein/app/blob/CregitTest.java
@@ -27,8 +27,12 @@ static void setUp() throws IOException {
@AfterAll
static void tearDown() {
- if (result != null) result.close();
- if (source != null) source.close();
+ if (result != null) {
+ result.close();
+ }
+ if (source != null) {
+ source.close();
+ }
}
static RepositoryAccess getResult() {
diff --git a/src/test/java/jp/ac/titech/c/se/stein/app/blob/FilterBlobTest.java b/src/test/java/jp/ac/titech/c/se/stein/app/blob/FilterBlobTest.java
index 89b9106..5fdc696 100644
--- a/src/test/java/jp/ac/titech/c/se/stein/app/blob/FilterBlobTest.java
+++ b/src/test/java/jp/ac/titech/c/se/stein/app/blob/FilterBlobTest.java
@@ -1,6 +1,7 @@
package jp.ac.titech.c.se.stein.app.blob;
import jp.ac.titech.c.se.stein.entry.Entry;
+import jp.ac.titech.c.se.stein.util.SizeConverter;
import jp.ac.titech.c.se.stein.core.RepositoryAccess;
import jp.ac.titech.c.se.stein.testing.TestRepo;
import org.eclipse.jgit.revwalk.RevCommit;
@@ -28,7 +29,7 @@ static void tearDown() {
@Test
public void testSizeConverter() {
- final FilterBlob.SizeConverter converter = new FilterBlob.SizeConverter();
+ final SizeConverter converter = new SizeConverter();
assertEquals(Long.valueOf(10), converter.convert("10"));
assertEquals(Long.valueOf(10), converter.convert("10B"));
assertEquals(Long.valueOf(1024), converter.convert("1K"));
diff --git a/src/test/java/jp/ac/titech/c/se/stein/app/blob/HistorageTest.java b/src/test/java/jp/ac/titech/c/se/stein/app/blob/HistorageTest.java
index ae27d3a..690b5f2 100644
--- a/src/test/java/jp/ac/titech/c/se/stein/app/blob/HistorageTest.java
+++ b/src/test/java/jp/ac/titech/c/se/stein/app/blob/HistorageTest.java
@@ -27,8 +27,12 @@ static void setUp() throws IOException {
@AfterAll
static void tearDown() {
- if (result != null) result.close();
- if (source != null) source.close();
+ if (result != null) {
+ result.close();
+ }
+ if (source != null) {
+ source.close();
+ }
}
static RepositoryAccess getResult() {
diff --git a/src/test/java/jp/ac/titech/c/se/stein/app/blob/HistorageViaJDTTest.java b/src/test/java/jp/ac/titech/c/se/stein/app/blob/HistorageViaJDTTest.java
index 63e72ac..b54eb68 100644
--- a/src/test/java/jp/ac/titech/c/se/stein/app/blob/HistorageViaJDTTest.java
+++ b/src/test/java/jp/ac/titech/c/se/stein/app/blob/HistorageViaJDTTest.java
@@ -9,7 +9,6 @@
import jp.ac.titech.c.se.stein.entry.HotEntry;
import jp.ac.titech.c.se.stein.core.RepositoryAccess;
import jp.ac.titech.c.se.stein.testing.TestRepo;
-import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.revwalk.RevCommit;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
diff --git a/src/test/java/jp/ac/titech/c/se/stein/core/cache/PersistentEntryCacheTest.java b/src/test/java/jp/ac/titech/c/se/stein/core/cache/PersistentEntryCacheTest.java
new file mode 100644
index 0000000..7d3ecf7
--- /dev/null
+++ b/src/test/java/jp/ac/titech/c/se/stein/core/cache/PersistentEntryCacheTest.java
@@ -0,0 +1,121 @@
+package jp.ac.titech.c.se.stein.core.cache;
+
+import jp.ac.titech.c.se.stein.core.RepositoryAccess;
+import jp.ac.titech.c.se.stein.core.Context;
+
+import jp.ac.titech.c.se.stein.Application;
+import jp.ac.titech.c.se.stein.app.Identity;
+import jp.ac.titech.c.se.stein.app.blob.HistorageViaJDT;
+import jp.ac.titech.c.se.stein.rewriter.BlobTranslator;
+import jp.ac.titech.c.se.stein.rewriter.RepositoryRewriter;
+import jp.ac.titech.c.se.stein.testing.TestRepo;
+import org.eclipse.jgit.lib.Repository;
+import org.eclipse.jgit.revwalk.RevCommit;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+public class PersistentEntryCacheTest {
+ static RepositoryAccess source;
+
+ @BeforeAll
+ static void setUp() throws IOException {
+ source = TestRepo.createSample(true);
+ }
+
+ @AfterAll
+ static void tearDown() {
+ source.close();
+ }
+
+ private Application.Config cacheConfig() {
+ final Application.Config config = new Application.Config();
+ config.isCachingEnabled = true;
+ return config;
+ }
+
+ private void rewriteWithCache(RepositoryRewriter rewriter, Repository targetRepo) {
+ rewriter.setConfig(cacheConfig());
+ rewriter.initialize(source.repo, targetRepo);
+ rewriter.rewrite(Context.init());
+ }
+
+ @Test
+ public void testCacheProducesCorrectResult() {
+ try (RepositoryAccess target = TestRepo.create(true)) {
+ rewriteWithCache(new Identity(), target.repo);
+ final List<RevCommit> firstRun = target.collectCommits("refs/heads/main");
+
+ assertTrue(new File(target.repo.getDirectory(), "cache.mv.db").exists());
+
+ rewriteWithCache(new Identity(), target.repo);
+ final List<RevCommit> secondRun = target.collectCommits("refs/heads/main");
+
+ assertEquals(firstRun.size(), secondRun.size());
+ for (int i = 0; i < firstRun.size(); i++) {
+ assertEquals(firstRun.get(i).getId(), secondRun.get(i).getId());
+ }
+ }
+ }
+
+ @Test
+ public void testCacheMatchesNonCachedResult() {
+ try (RepositoryAccess noCacheResult = TestRepo.rewrite(source, new Identity())) {
+ final List<RevCommit> noCacheCommits = noCacheResult.collectCommits("refs/heads/main");
+
+ try (RepositoryAccess target = TestRepo.create(true)) {
+ rewriteWithCache(new Identity(), target.repo);
+ final List<RevCommit> cachedCommits = target.collectCommits("refs/heads/main");
+
+ assertEquals(noCacheCommits.size(), cachedCommits.size());
+ for (int i = 0; i < noCacheCommits.size(); i++) {
+ assertEquals(noCacheCommits.get(i).getId(), cachedCommits.get(i).getId());
+ }
+ }
+ }
+ }
+
+ @Test
+ public void testCacheWithHistorage() {
+ try (RepositoryAccess target = TestRepo.create(true)) {
+ rewriteWithCache(new HistorageViaJDT().toRewriter(), target.repo);
+ final List<RevCommit> firstRun = target.collectCommits("refs/heads/main");
+ assertFalse(firstRun.isEmpty());
+
+ rewriteWithCache(new HistorageViaJDT().toRewriter(), target.repo);
+ final List<RevCommit> secondRun = target.collectCommits("refs/heads/main");
+
+ assertEquals(firstRun.size(), secondRun.size());
+ for (int i = 0; i < firstRun.size(); i++) {
+ assertEquals(firstRun.get(i).getId(), secondRun.get(i).getId());
+ }
+ }
+ }
+
+ @Test
+ public void testSecondRunHasZeroTranslations() {
+ final AtomicInteger count = new AtomicInteger();
+ final BlobTranslator counting = (entry, c) -> {
+ count.incrementAndGet();
+ return entry;
+ };
+
+ try (RepositoryAccess target = TestRepo.create(true)) {
+ count.set(0);
+ rewriteWithCache(counting.toRewriter(), target.repo);
+ assertTrue(count.get() > 0, "First run should translate blobs");
+
+ count.set(0);
+ rewriteWithCache(counting.toRewriter(), target.repo);
+ assertEquals(0, count.get(),
+ "Second run should have 100% cache hit (0 translations), but got " + count.get());
+ }
+ }
+}
diff --git a/src/test/java/jp/ac/titech/c/se/stein/rewriter/BlobTranslatorTest.java b/src/test/java/jp/ac/titech/c/se/stein/rewriter/BlobTranslatorTest.java
index d6ad813..5e772ef 100644
--- a/src/test/java/jp/ac/titech/c/se/stein/rewriter/BlobTranslatorTest.java
+++ b/src/test/java/jp/ac/titech/c/se/stein/rewriter/BlobTranslatorTest.java
@@ -4,12 +4,10 @@
import jp.ac.titech.c.se.stein.app.blob.TokenizeViaJDT;
import jp.ac.titech.c.se.stein.core.Context;
import jp.ac.titech.c.se.stein.entry.AnyHotEntry;
-import jp.ac.titech.c.se.stein.entry.BlobEntry;
import jp.ac.titech.c.se.stein.entry.Entry;
import jp.ac.titech.c.se.stein.entry.HotEntry;
import jp.ac.titech.c.se.stein.core.RepositoryAccess;
import jp.ac.titech.c.se.stein.testing.TestRepo;
-import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.revwalk.RevCommit;
import org.junit.jupiter.api.Test;
diff --git a/src/test/java/jp/ac/titech/c/se/stein/testing/MemoryProfile.java b/src/test/java/jp/ac/titech/c/se/stein/testing/MemoryProfile.java
new file mode 100644
index 0000000..2eac5bf
--- /dev/null
+++ b/src/test/java/jp/ac/titech/c/se/stein/testing/MemoryProfile.java
@@ -0,0 +1,123 @@
+package jp.ac.titech.c.se.stein.testing;
+
+import jp.ac.titech.c.se.stein.Application;
+import jp.ac.titech.c.se.stein.app.Identity;
+import jp.ac.titech.c.se.stein.app.blob.HistorageViaJDT;
+import jp.ac.titech.c.se.stein.core.Context;
+import jp.ac.titech.c.se.stein.rewriter.RepositoryRewriter;
+import jp.ac.titech.c.se.stein.util.TemporaryFile;
+import org.eclipse.jgit.internal.storage.file.FileRepository;
+import org.eclipse.jgit.storage.file.FileRepositoryBuilder;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.util.Map;
+
+/**
+ * Profiles memory usage of entryMapping during rewrite.
+ * Usage: java -Xmx4g -cp ... MemoryProfile <repo-path> [command]
+ * command: identity (default) or historage
+ */
+public class MemoryProfile {
+ public static void main(String[] args) throws Exception {
+ final String repoPath = args.length > 0 ? args[0] : ".";
+ final String command = args.length > 1 ? args[1] : "identity";
+ final File sourceDir = new File(repoPath);
+
+ if (!new File(sourceDir, ".git").exists() && !new File(sourceDir, "HEAD").exists()) {
+ System.err.println("Not a git repository: " + sourceDir.getAbsolutePath());
+ System.exit(1);
+ }
+
+ final Runtime rt = Runtime.getRuntime();
+ System.out.printf("Max heap: %d MB%n", rt.maxMemory() / (1024 * 1024));
+ System.out.printf("Repo: %s%n", sourceDir.getAbsolutePath());
+ System.out.printf("Command: %s%n%n", command);
+
+ final boolean isBare = !new File(sourceDir, ".git").exists();
+ final FileRepository sourceRepo = openRepository(sourceDir, isBare);
+
+ try (TemporaryFile.Directory tmp = TemporaryFile.directoryOf("mem-profile-")) {
+ final FileRepository targetRepo = createRepository(tmp.getPath().toFile());
+
+ final RepositoryRewriter rewriter = switch (command) {
+ case "historage" -> new HistorageViaJDT().toRewriter();
+ default -> new Identity();
+ };
+ rewriter.setConfig(new Application.Config());
+ rewriter.initialize(sourceRepo, targetRepo);
+
+ // Before
+ System.gc();
+ Thread.sleep(500);
+ System.gc();
+ final long heapBefore = usedHeap();
+ System.out.printf("Before rewrite:%n");
+ System.out.printf(" Heap used: %d MB%n%n", heapBefore / (1024 * 1024));
+
+ // Run
+ rewriter.rewrite(Context.init());
+
+ // After (before GC)
+ final long heapAfterNoGC = usedHeap();
+
+ // After (after GC)
+ System.gc();
+ Thread.sleep(500);
+ System.gc();
+ final long heapAfterGC = usedHeap();
+
+ // entryMapping size
+ final int entryMappingSize = getEntryMappingSize(rewriter);
+
+ System.out.printf("After rewrite:%n");
+ System.out.printf(" Heap used (before GC): %d MB%n", heapAfterNoGC / (1024 * 1024));
+ System.out.printf(" Heap used (after GC): %d MB%n", heapAfterGC / (1024 * 1024));
+ System.out.printf(" Heap delta (after GC): %d MB%n", (heapAfterGC - heapBefore) / (1024 * 1024));
+ System.out.printf(" entryMapping size: %d entries%n", entryMappingSize);
+ if (entryMappingSize > 0) {
+ final long deltaBytes = heapAfterGC - heapBefore;
+ System.out.printf(" Approx bytes/entry: %d bytes%n", deltaBytes / entryMappingSize);
+ }
+
+ sourceRepo.close();
+ targetRepo.close();
+ }
+ }
+
+ static int getEntryMappingSize(RepositoryRewriter rewriter) {
+ try {
+ Field f = RepositoryRewriter.class.getDeclaredField("entryMapping");
+ f.setAccessible(true);
+ Map<?, ?> map = (Map<?, ?>) f.get(rewriter);
+ return map.size();
+ } catch (Exception e) {
+ System.err.println("Could not access entryMapping: " + e.getMessage());
+ return -1;
+ }
+ }
+
+ static long usedHeap() {
+ final Runtime rt = Runtime.getRuntime();
+ return rt.totalMemory() - rt.freeMemory();
+ }
+
+ static FileRepository openRepository(File dir, boolean isBare) throws IOException {
+ final FileRepositoryBuilder builder = new FileRepositoryBuilder();
+ if (isBare) {
+ builder.setGitDir(dir).setBare();
+ } else {
+ builder.setWorkTree(dir).setGitDir(new File(dir, ".git"));
+ }
+ return (FileRepository) builder.readEnvironment().build();
+ }
+
+ static FileRepository createRepository(File dir) throws IOException {
+ final FileRepositoryBuilder builder = new FileRepositoryBuilder();
+ builder.setGitDir(dir).setBare();
+ final FileRepository repo = (FileRepository) builder.build();
+ repo.create(true);
+ return repo;
+ }
+}
diff --git a/src/test/java/jp/ac/titech/c/se/stein/testing/RewriteBenchmark.java b/src/test/java/jp/ac/titech/c/se/stein/testing/RewriteBenchmark.java
index 2ca7d6b..1cc4539 100644
--- a/src/test/java/jp/ac/titech/c/se/stein/testing/RewriteBenchmark.java
+++ b/src/test/java/jp/ac/titech/c/se/stein/testing/RewriteBenchmark.java
@@ -43,17 +43,20 @@ public static void main(String[] args) throws Exception {
}
final boolean alternates = Arrays.asList(args).contains("--alternates");
+ final boolean cache = Arrays.asList(args).contains("--cache");
- System.out.println("Benchmarking: " + sourceDir.getAbsolutePath() + (alternates ? " (alternates)" : ""));
+ System.out.println("Benchmarking: " + sourceDir.getAbsolutePath()
+ + (alternates ? " (alternates)" : "")
+ + (cache ? " (cache)" : ""));
System.out.println();
final List<JsonObject> results = new ArrayList<>();
- results.add(benchmark("identity", sourceDir, new Identity(), alternates));
- results.add(benchmark("tokenize-jdt", sourceDir, new TokenizeViaJDT().toRewriter(), alternates));
- results.add(benchmark("historage-jdt", sourceDir, new HistorageViaJDT().toRewriter(), alternates));
+ results.add(benchmark("identity", sourceDir, Identity::new, alternates, cache));
+ results.add(benchmark("tokenize-jdt", sourceDir, () -> new TokenizeViaJDT().toRewriter(), alternates, cache));
+ results.add(benchmark("historage-jdt", sourceDir, () -> new HistorageViaJDT().toRewriter(), alternates, cache));
results.add(benchmark("historage+tokenize", sourceDir,
- new BlobTranslator.Composite(new HistorageViaJDT(), new TokenizeViaJDT()), alternates));
+ () -> new BlobTranslator.Composite(new HistorageViaJDT(), new TokenizeViaJDT()), alternates, cache));
// summary
System.out.println();
@@ -76,7 +79,13 @@ public static void main(String[] args) throws Exception {
System.out.println(GSON.toJson(report));
}
- static JsonObject benchmark(String name, File sourceDir, RepositoryRewriter rewriter, boolean useAlternates) throws IOException {
+ @FunctionalInterface
+ interface RewriterFactory {
+ RepositoryRewriter create();
+ }
+
+ static JsonObject benchmark(String name, File sourceDir, RewriterFactory factory,
+ boolean useAlternates, boolean useCache) throws IOException {
System.out.printf("Running %-25s ... ", name);
System.out.flush();
@@ -96,7 +105,12 @@ static JsonObject benchmark(String name, File sourceDir, RepositoryRewriter rewr
targetRepo = openRepository(tmp.getPath().toFile(), true);
}
- rewriter.setConfig(new Application.Config());
+ final Application.Config config = new Application.Config();
+ if (useCache) {
+ config.isCachingEnabled = true;
+ }
+ final RepositoryRewriter rewriter = factory.create();
+ rewriter.setConfig(config);
rewriter.initialize(sourceRepo, targetRepo);
System.gc();
@@ -105,18 +119,45 @@ static JsonObject benchmark(String name, File sourceDir, RepositoryRewriter rewr
final Instant start = Instant.now();
rewriter.rewrite(Context.init());
final Instant end = Instant.now();
+ final long timeMs = Duration.between(start, end).toMillis();
+ final long heapMb = Math.max(0, (usedHeap() - heapBefore) / (1024 * 1024));
+ final int commits = countCommits(targetRepo);
- final JsonObject result = new JsonObject();
+ System.out.printf("%d ms, %d MB heap%n", timeMs, heapMb);
+
+ // If cache is enabled, run a second time (incremental) with a fresh rewriter
+ JsonObject result = new JsonObject();
result.addProperty("name", name);
- result.addProperty("timeMs", Duration.between(start, end).toMillis());
- result.addProperty("heapMb", Math.max(0, (usedHeap() - heapBefore) / (1024 * 1024)));
- result.addProperty("commits", countCommits(targetRepo));
+ result.addProperty("timeMs", timeMs);
+ result.addProperty("heapMb", heapMb);
+ result.addProperty("commits", commits);
+
+ // Second run: incremental (notes skip already-processed commits)
+ {
+ System.out.printf(" (2nd run) %-21s ... ", name);
+ System.out.flush();
+
+ final RepositoryRewriter rewriter2 = factory.create();
+ rewriter2.setConfig(config);
+ rewriter2.initialize(sourceRepo, targetRepo);
+
+ System.gc();
+ final long heapBefore2 = usedHeap();
+ final Instant start2 = Instant.now();
+ rewriter2.rewrite(Context.init());
+ final Instant end2 = Instant.now();
+ final long timeMs2 = Duration.between(start2, end2).toMillis();
+ final long heapMb2 = Math.max(0, (usedHeap() - heapBefore2) / (1024 * 1024));
+
+ System.out.printf("%d ms, %d MB heap%n", timeMs2, heapMb2);
+
+ result.addProperty("secondTimeMs", timeMs2);
+ result.addProperty("secondHeapMb", heapMb2);
+ }
sourceRepo.close();
targetRepo.close();
- System.out.printf("%d ms, %d MB heap%n",
- result.get("timeMs").getAsLong(), result.get("heapMb").getAsLong());
return result;
}
}