From 4ffb3aa4ea2e662351d550b197f328826d635842 Mon Sep 17 00:00:00 2001 From: Michael Vinther Date: Sat, 18 Apr 2026 23:05:51 +0200 Subject: [PATCH 1/7] Optimize local contrast --- .../BitmapOperations/IIRMinMaxOperation.cs | 144 ++++++++++++++---- .../BitmapOperations/IIRSmoothOperation.cs | 49 +++++- 2 files changed, 162 insertions(+), 31 deletions(-) diff --git a/PhotoLocator/BitmapOperations/IIRMinMaxOperation.cs b/PhotoLocator/BitmapOperations/IIRMinMaxOperation.cs index 0433ed5..804b672 100644 --- a/PhotoLocator/BitmapOperations/IIRMinMaxOperation.cs +++ b/PhotoLocator/BitmapOperations/IIRMinMaxOperation.cs @@ -48,7 +48,59 @@ static public void MinFilter(FloatBitmap plane, float filterSize) } }); // Vertical - Parallel.For(0, plane.Stride, x => + int quaterWidth = plane.Stride / 4; + Parallel.For(0, quaterWidth, x4 => + { + int x = x4 * 4; + var width = plane.Stride; + var height = plane.Height; + fixed (float* pixels = plane.Elements) + { + var pix0 = &pixels[x]; + var prev0 = *pix0; + var pix1 = pix0 + 1; + var prev1 = *pix1; + var pix2 = pix0 + 2; + var prev2 = *pix2; + var pix3 = pix0 + 3; + var prev3 = *pix3; + for (var y = 1; y < height; y++) + { + pix0 += width; + if (prev0 < *pix0) { prev0 = (prev0 * filterSize + *pix0) * scale; *pix0 = prev0; } + else prev0 = *pix0; + pix1 += width; + if (prev1 < *pix1) { prev1 = (prev1 * filterSize + *pix1) * scale; *pix1 = prev1; } + else prev1 = *pix1; + pix2 += width; + if (prev2 < *pix2) { prev2 = (prev2 * filterSize + *pix2) * scale; *pix2 = prev2; } + else prev2 = *pix2; + pix3 += width; + if (prev3 < *pix3) { prev3 = (prev3 * filterSize + *pix3) * scale; *pix3 = prev3; } + else prev3 = *pix3; + } + prev0 = *pix0; + prev1 = *pix1; + prev2 = *pix2; + prev3 = *pix3; + for (var y = 1; y < height; y++) + { + pix0 -= width; + if (prev0 < *pix0) { prev0 = (prev0 * filterSize + *pix0) * scale; *pix0 = prev0; } + else prev0 = *pix0; + pix1 -= width; + if (prev1 < *pix1) { prev1 = (prev1 * filterSize + *pix1) * scale; *pix1 = prev1; } + else prev1 = *pix1; + pix2 -= width; + if (prev2 < *pix2) { prev2 = (prev2 * filterSize + *pix2) * scale; *pix2 = prev2; } + else prev2 = *pix2; + pix3 -= width; + if (prev3 < *pix3) { prev3 = (prev3 * filterSize + *pix3) * scale; *pix3 = prev3; } + else prev3 = *pix3; + } + } + }); + Parallel.For(quaterWidth * 4, plane.Stride, x => { var width = plane.Stride; var height = plane.Height; @@ -59,25 +111,15 @@ static public void MinFilter(FloatBitmap plane, float filterSize) for (var y = 1; y < height; y++) { pix += width; - if (prev < *pix) - { - prev = (prev * filterSize + *pix) * scale; - *pix = prev; - } - else - prev = *pix; + if (prev < *pix) { prev = (prev * filterSize + *pix) * scale; *pix = prev; } + else prev = *pix; } prev = *pix; for (var y = 1; y < height; y++) { pix -= width; - if (prev < *pix) - { - prev = (prev * filterSize + *pix) * scale; - *pix = prev; - } - else - prev = *pix; + if (prev < *pix) { prev = (prev * filterSize + *pix) * scale; *pix = prev; } + else prev = *pix; } } }); @@ -129,7 +171,59 @@ static public void MaxFilter(FloatBitmap plane, float filterSize) } }); // Vertical - Parallel.For(0, plane.Stride, x => + int quaterWidth = plane.Stride / 4; + Parallel.For(0, quaterWidth, x4 => + { + int x = x4 * 4; + var width = plane.Stride; + var height = plane.Height; + fixed (float* pixels = plane.Elements) + { + var pix0 = &pixels[x]; + var prev0 = *pix0; + var pix1 = pix0 + 1; + var prev1 = *pix1; + var pix2 = pix0 + 2; + var prev2 = *pix2; + var pix3 = pix0 + 3; + var prev3 = *pix3; + for (var y = 1; y < height; y++) + { + pix0 += width; + if (prev0 > *pix0) { prev0 = (prev0 * filterSize + *pix0) * scale; *pix0 = prev0; } + else prev0 = *pix0; + pix1 += width; + if (prev1 > *pix1) { prev1 = (prev1 * filterSize + *pix1) * scale; *pix1 = prev1; } + else prev1 = *pix1; + pix2 += width; + if (prev2 > *pix2) { prev2 = (prev2 * filterSize + *pix2) * scale; *pix2 = prev2; } + else prev2 = *pix2; + pix3 += width; + if (prev3 > *pix3) { prev3 = (prev3 * filterSize + *pix3) * scale; *pix3 = prev3; } + else prev3 = *pix3; + } + prev0 = *pix0; + prev1 = *pix1; + prev2 = *pix2; + prev3 = *pix3; + for (var y = 1; y < height; y++) + { + pix0 -= width; + if (prev0 > *pix0) { prev0 = (prev0 * filterSize + *pix0) * scale; *pix0 = prev0; } + else prev0 = *pix0; + pix1 -= width; + if (prev1 > *pix1) { prev1 = (prev1 * filterSize + *pix1) * scale; *pix1 = prev1; } + else prev1 = *pix1; + pix2 -= width; + if (prev2 > *pix2) { prev2 = (prev2 * filterSize + *pix2) * scale; *pix2 = prev2; } + else prev2 = *pix2; + pix3 -= width; + if (prev3 > *pix3) { prev3 = (prev3 * filterSize + *pix3) * scale; *pix3 = prev3; } + else prev3 = *pix3; + } + } + }); + Parallel.For(quaterWidth * 4, plane.Stride, x => { var width = plane.Stride; var height = plane.Height; @@ -140,25 +234,15 @@ static public void MaxFilter(FloatBitmap plane, float filterSize) for (var y = 1; y < height; y++) { pix += width; - if (prev > *pix) - { - prev = (prev * filterSize + *pix) * scale; - *pix = prev; - } - else - prev = *pix; + if (prev > *pix) { prev = (prev * filterSize + *pix) * scale; *pix = prev; } + else prev = *pix; } prev = *pix; for (var y = 1; y < height; y++) { pix -= width; - if (prev > *pix) - { - prev = (prev * filterSize + *pix) * scale; - *pix = prev; - } - else - prev = *pix; + if (prev > *pix) { prev = (prev * filterSize + *pix) * scale; *pix = prev; } + else prev = *pix; } } }); diff --git a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs index 790af72..117de92 100644 --- a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs +++ b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs @@ -37,7 +37,54 @@ public static void Apply(FloatBitmap plane, float filterSize) } }); // Vertical smooth - Parallel.For(0, plane.Stride, x => + int quaterWidth = plane.Stride / 4; + Parallel.For(0, quaterWidth, x4 => + { + int x = x4 * 4; + var width = plane.Stride; + fixed (float* pixels = plane.Elements) + { + var pix0 = &pixels[x]; + var value0 = *pix0; + var pix1 = pix0 + 1; + var value1 = *pix1; + var pix2 = pix0 + 2; + var value2 = *pix2; + var pix3 = pix0 + 3; + var value3 = *pix3; + for (var y = plane.Height; y > 1; y--) + { + pix0 += width; + value0 = (value0 * filterSize + *pix0) * scale; + *pix0 = value0; + pix1 += width; + value1 = (value1 * filterSize + *pix1) * scale; + *pix1 = value1; + pix2 += width; + value2 = (value2 * filterSize + *pix2) * scale; + *pix2 = value2; + pix3 += width; + value3 = (value3 * filterSize + *pix3) * scale; + *pix3 = value3; + } + for (var y = plane.Height; y > 1; y--) + { + pix0 -= width; + value0 = (value0 * filterSize + *pix0) * scale; + *pix0 = value0; + pix1 -= width; + value1 = (value1 * filterSize + *pix1) * scale; + *pix1 = value1; + pix2 -= width; + value2 = (value2 * filterSize + *pix2) * scale; + *pix2 = value2; + pix3 -= width; + value3 = (value3 * filterSize + *pix3) * scale; + *pix3 = value3; + } + } + }); + Parallel.For(quaterWidth * 4, plane.Stride, x => { var width = plane.Stride; fixed (float* pixels = plane.Elements) From 8a8c2cd2b4b30f5ca6f5fcd7914a66be29161f6e Mon Sep 17 00:00:00 2001 From: Michael Vinther Date: Sun, 19 Apr 2026 10:25:31 +0200 Subject: [PATCH 2/7] Use vector operations --- .../BitmapOperations/IIRSmoothOperation.cs | 197 ++++++++++++------ 1 file changed, 137 insertions(+), 60 deletions(-) diff --git a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs index 117de92..69c61f4 100644 --- a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs +++ b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs @@ -1,5 +1,7 @@ using System.Diagnostics; using System.Threading.Tasks; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; namespace PhotoLocator.BitmapOperations { @@ -36,75 +38,150 @@ public static void Apply(FloatBitmap plane, float filterSize) } } }); - // Vertical smooth - int quaterWidth = plane.Stride / 4; - Parallel.For(0, quaterWidth, x4 => + + // Vertical smooth - vectorized over columns when possible + unsafe { - int x = x4 * 4; - var width = plane.Stride; - fixed (float* pixels = plane.Elements) + // Prefer hardware intrinsics if available (AVX/ SSE). Fallback to scalar per-column processing. + bool hasAvx = false;//Avx.IsSupported; + bool hasSse = false;//Sse.IsSupported; + int vectorSize = hasAvx ? 8 : 4; + int vectorizedSegments = plane.Stride / vectorSize; + + if (hasAvx) { - var pix0 = &pixels[x]; - var value0 = *pix0; - var pix1 = pix0 + 1; - var value1 = *pix1; - var pix2 = pix0 + 2; - var value2 = *pix2; - var pix3 = pix0 + 3; - var value3 = *pix3; - for (var y = plane.Height; y > 1; y--) + var filterSizeV = Vector256.Create(filterSize); + var scaleV = Vector256.Create(scale); + Parallel.For(0, vectorizedSegments, vi => { - pix0 += width; - value0 = (value0 * filterSize + *pix0) * scale; - *pix0 = value0; - pix1 += width; - value1 = (value1 * filterSize + *pix1) * scale; - *pix1 = value1; - pix2 += width; - value2 = (value2 * filterSize + *pix2) * scale; - *pix2 = value2; - pix3 += width; - value3 = (value3 * filterSize + *pix3) * scale; - *pix3 = value3; - } - for (var y = plane.Height; y > 1; y--) + int x = vi * 8; + var width = plane.Stride; + var height = plane.Height; + fixed (float* pixels = plane.Elements) + { + float* colPtr = &pixels[x]; + var v = Avx.LoadVector256(colPtr); + Avx.Store(colPtr, v); + for (int y = 1; y < height; y++) + { + colPtr += width; + var inV = Avx.LoadVector256(colPtr); + v = Avx.Multiply(Avx.Add(Avx.Multiply(v, filterSizeV), inV), scaleV); + Avx.Store(colPtr, v); + } + for (int y = height - 1; y > 0; y--) + { + colPtr -= width; + var inV = Avx.LoadVector256(colPtr); + v = Avx.Multiply(Avx.Add(Avx.Multiply(v, filterSizeV), inV), scaleV); + Avx.Store(colPtr, v); + } + } + }); + } + else if (hasSse) + { + var filterSizeV = Vector128.Create(filterSize); + var scaleV = Vector128.Create(scale); + Parallel.For(0, vectorizedSegments, vi => { - pix0 -= width; - value0 = (value0 * filterSize + *pix0) * scale; - *pix0 = value0; - pix1 -= width; - value1 = (value1 * filterSize + *pix1) * scale; - *pix1 = value1; - pix2 -= width; - value2 = (value2 * filterSize + *pix2) * scale; - *pix2 = value2; - pix3 -= width; - value3 = (value3 * filterSize + *pix3) * scale; - *pix3 = value3; - } + int x = vi * 4; + var width = plane.Stride; + var height = plane.Height; + fixed (float* pixels = plane.Elements) + { + float* colPtr = &pixels[x]; + var v = Sse.LoadVector128(colPtr); + Sse.Store(colPtr, v); + for (int y = 1; y < height; y++) + { + colPtr += width; + var inV = Sse.LoadVector128(colPtr); + v = Sse.Multiply(Sse.Add(Sse.Multiply(v, filterSizeV), inV), scaleV); + Sse.Store(colPtr, v); + } + for (int y = height - 1; y > 0; y--) + { + colPtr -= width; + var inV = Sse.LoadVector128(colPtr); + v = Sse.Multiply(Sse.Add(Sse.Multiply(v, filterSizeV), inV), scaleV); + Sse.Store(colPtr, v); + } + } + }); } - }); - Parallel.For(quaterWidth * 4, plane.Stride, x => - { - var width = plane.Stride; - fixed (float* pixels = plane.Elements) + else { - var pix = &pixels[x]; - var value = *pix; - for (var y = plane.Height; y > 1; y--) + Parallel.For(0, vectorizedSegments, vi => { - pix += width; - value = (value * filterSize + *pix) * scale; - *pix = value; - } - for (var y = plane.Height; y > 1; y--) + int x = vi * 4; + var width = plane.Stride; + fixed (float* pixels = plane.Elements) + { + var pix0 = &pixels[x]; + var value0 = *pix0; + var pix1 = pix0 + 1; + var value1 = *pix1; + var pix2 = pix0 + 2; + var value2 = *pix2; + var pix3 = pix0 + 3; + var value3 = *pix3; + for (var y = plane.Height; y > 1; y--) + { + pix0 += width; + value0 = (value0 * filterSize + *pix0) * scale; + *pix0 = value0; + pix1 += width; + value1 = (value1 * filterSize + *pix1) * scale; + *pix1 = value1; + pix2 += width; + value2 = (value2 * filterSize + *pix2) * scale; + *pix2 = value2; + pix3 += width; + value3 = (value3 * filterSize + *pix3) * scale; + *pix3 = value3; + } + for (var y = plane.Height; y > 1; y--) + { + pix0 -= width; + value0 = (value0 * filterSize + *pix0) * scale; + *pix0 = value0; + pix1 -= width; + value1 = (value1 * filterSize + *pix1) * scale; + *pix1 = value1; + pix2 -= width; + value2 = (value2 * filterSize + *pix2) * scale; + *pix2 = value2; + pix3 -= width; + value3 = (value3 * filterSize + *pix3) * scale; + *pix3 = value3; + } + } + }); + } + // Remaining columns (scalar) - includes columns not divisible by vectorSize + Parallel.For(vectorizedSegments * vectorSize, plane.Stride, x => + { + var width = plane.Stride; + fixed (float* pixels = plane.Elements) { - pix -= width; - value = (value * filterSize + *pix) * scale; - *pix = value; + var pix = &pixels[x]; + var value = *pix; + for (var y = plane.Height; y > 1; y--) + { + pix += width; + value = (value * filterSize + *pix) * scale; + *pix = value; + } + for (var y = plane.Height; y > 1; y--) + { + pix -= width; + value = (value * filterSize + *pix) * scale; + *pix = value; + } } - } - }); + }); + } } } } From a3a4b335a474a67715dca6757a6d1e2b53d31085 Mon Sep 17 00:00:00 2001 From: Michael Vinther Date: Sun, 19 Apr 2026 16:18:55 +0200 Subject: [PATCH 3/7] Benchmark class --- PhotoLocator/BitmapOperations/FloatBitmap.cs | 2 + .../BitmapOperations/IIRSmoothOperation.cs | 69 +++++++++---------- PhotoLocatorTest/BenchmarkHelper.cs | 33 +++++++++ 3 files changed, 69 insertions(+), 35 deletions(-) create mode 100644 PhotoLocatorTest/BenchmarkHelper.cs diff --git a/PhotoLocator/BitmapOperations/FloatBitmap.cs b/PhotoLocator/BitmapOperations/FloatBitmap.cs index 66f12e2..5194dc9 100644 --- a/PhotoLocator/BitmapOperations/FloatBitmap.cs +++ b/PhotoLocator/BitmapOperations/FloatBitmap.cs @@ -227,6 +227,8 @@ public BitmapSource ToBitmapSource(double dpiX, double dpiY, double gamma, Pixel private byte[] ToPixels8(double gamma) { + if (Elements is null) + throw new InvalidOperationException("Bitmap not initialized"); var pixels = ArrayPool.Shared.Rent(Height * Stride); var gammaLut = CreateGammaLookupFloatToByte(gamma); unsafe diff --git a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs index 69c61f4..58a3b15 100644 --- a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs +++ b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs @@ -14,23 +14,24 @@ public static void Apply(FloatBitmap plane, float filterSize) Debug.Assert(plane.PlaneCount == 1); filterSize /= 4f; var scale = 1f / (1f + filterSize); + var height = plane.Height; unsafe { // Horizontal smooth - Parallel.For(0, plane.Height, y => + Parallel.For(0, height, y => { - var width = plane.Stride; + var stride = plane.Stride; fixed (float* pixels = plane.Elements) { - var pix = &pixels[y * width]; + var pix = &pixels[y * stride]; var value = *pix; - for (var x = width; x > 1; x--) + for (var x = stride; x > 1; x--) { pix++; value = (value * filterSize + *pix) * scale; *pix = value; } - for (var x = width; x > 1; x--) + for (var x = stride; x > 1; x--) { pix--; value = (value * filterSize + *pix) * scale; @@ -43,8 +44,8 @@ public static void Apply(FloatBitmap plane, float filterSize) unsafe { // Prefer hardware intrinsics if available (AVX/ SSE). Fallback to scalar per-column processing. - bool hasAvx = false;//Avx.IsSupported; - bool hasSse = false;//Sse.IsSupported; + bool hasAvx = Avx.IsSupported; + bool hasSse = Sse.IsSupported; int vectorSize = hasAvx ? 8 : 4; int vectorizedSegments = plane.Stride / vectorSize; @@ -55,23 +56,22 @@ public static void Apply(FloatBitmap plane, float filterSize) Parallel.For(0, vectorizedSegments, vi => { int x = vi * 8; - var width = plane.Stride; - var height = plane.Height; + var stride = plane.Stride; fixed (float* pixels = plane.Elements) { float* colPtr = &pixels[x]; var v = Avx.LoadVector256(colPtr); Avx.Store(colPtr, v); - for (int y = 1; y < height; y++) + for (var y = height; y > 1; y--) { - colPtr += width; + colPtr += stride; var inV = Avx.LoadVector256(colPtr); v = Avx.Multiply(Avx.Add(Avx.Multiply(v, filterSizeV), inV), scaleV); Avx.Store(colPtr, v); } - for (int y = height - 1; y > 0; y--) + for (var y = height; y > 1; y--) { - colPtr -= width; + colPtr -= stride; var inV = Avx.LoadVector256(colPtr); v = Avx.Multiply(Avx.Add(Avx.Multiply(v, filterSizeV), inV), scaleV); Avx.Store(colPtr, v); @@ -86,23 +86,22 @@ public static void Apply(FloatBitmap plane, float filterSize) Parallel.For(0, vectorizedSegments, vi => { int x = vi * 4; - var width = plane.Stride; - var height = plane.Height; + var stride = plane.Stride; fixed (float* pixels = plane.Elements) { float* colPtr = &pixels[x]; var v = Sse.LoadVector128(colPtr); Sse.Store(colPtr, v); - for (int y = 1; y < height; y++) + for (var y = height; y > 1; y--) { - colPtr += width; + colPtr += stride; var inV = Sse.LoadVector128(colPtr); v = Sse.Multiply(Sse.Add(Sse.Multiply(v, filterSizeV), inV), scaleV); Sse.Store(colPtr, v); } - for (int y = height - 1; y > 0; y--) + for (var y = height; y > 1; y--) { - colPtr -= width; + colPtr -= stride; var inV = Sse.LoadVector128(colPtr); v = Sse.Multiply(Sse.Add(Sse.Multiply(v, filterSizeV), inV), scaleV); Sse.Store(colPtr, v); @@ -115,7 +114,7 @@ public static void Apply(FloatBitmap plane, float filterSize) Parallel.For(0, vectorizedSegments, vi => { int x = vi * 4; - var width = plane.Stride; + var stride = plane.Stride; fixed (float* pixels = plane.Elements) { var pix0 = &pixels[x]; @@ -126,33 +125,33 @@ public static void Apply(FloatBitmap plane, float filterSize) var value2 = *pix2; var pix3 = pix0 + 3; var value3 = *pix3; - for (var y = plane.Height; y > 1; y--) + for (var y = height; y > 1; y--) { - pix0 += width; + pix0 += stride; value0 = (value0 * filterSize + *pix0) * scale; *pix0 = value0; - pix1 += width; + pix1 += stride; value1 = (value1 * filterSize + *pix1) * scale; *pix1 = value1; - pix2 += width; + pix2 += stride; value2 = (value2 * filterSize + *pix2) * scale; *pix2 = value2; - pix3 += width; + pix3 += stride; value3 = (value3 * filterSize + *pix3) * scale; *pix3 = value3; } - for (var y = plane.Height; y > 1; y--) + for (var y = height; y > 1; y--) { - pix0 -= width; + pix0 -= stride; value0 = (value0 * filterSize + *pix0) * scale; *pix0 = value0; - pix1 -= width; + pix1 -= stride; value1 = (value1 * filterSize + *pix1) * scale; *pix1 = value1; - pix2 -= width; + pix2 -= stride; value2 = (value2 * filterSize + *pix2) * scale; *pix2 = value2; - pix3 -= width; + pix3 -= stride; value3 = (value3 * filterSize + *pix3) * scale; *pix3 = value3; } @@ -162,20 +161,20 @@ public static void Apply(FloatBitmap plane, float filterSize) // Remaining columns (scalar) - includes columns not divisible by vectorSize Parallel.For(vectorizedSegments * vectorSize, plane.Stride, x => { - var width = plane.Stride; + var stride = plane.Stride; fixed (float* pixels = plane.Elements) { var pix = &pixels[x]; var value = *pix; - for (var y = plane.Height; y > 1; y--) + for (var y = height; y > 1; y--) { - pix += width; + pix += stride; value = (value * filterSize + *pix) * scale; *pix = value; } - for (var y = plane.Height; y > 1; y--) + for (var y = height; y > 1; y--) { - pix -= width; + pix -= stride; value = (value * filterSize + *pix) * scale; *pix = value; } diff --git a/PhotoLocatorTest/BenchmarkHelper.cs b/PhotoLocatorTest/BenchmarkHelper.cs new file mode 100644 index 0000000..ca406e6 --- /dev/null +++ b/PhotoLocatorTest/BenchmarkHelper.cs @@ -0,0 +1,33 @@ +using System.Diagnostics; +using System.Linq; + +namespace PhotoLocator +{ + public class BenchmarkHelper + { + public static void Run(Action action, int innerLoops = 1, int outerIterations = 5) + { +#if DEBUG + Console.WriteLine("WARNING: Running benchmark in DEBUG mode. Results may not reflect release performance."); +#endif + var iterationTimes = new long[outerIterations]; + for (int i = 0; i < outerIterations; i++) + { + GC.Collect(); + GC.TryStartNoGCRegion(1024 * 1024 * 100); + var sw = Stopwatch.StartNew(); + for (int j = 0; j < innerLoops; j++) + action(); + sw.Stop(); + Console.WriteLine($"Iteration {i + 1}: {sw.ElapsedMilliseconds} ms"); + GC.EndNoGCRegion(); + iterationTimes[i] = sw.ElapsedMilliseconds; + } + var median = iterationTimes.Order().Skip(outerIterations / 2).First(); + var min = iterationTimes.Min(); + Console.WriteLine($"Median time: {median} ms"); + Console.WriteLine($"Minimum time: {min} ms"); + throw new AssertFailedException($"Min={min} ms, median={median} ms"); + } + } +} From 54a39279c7ed24a90e504ce79e6ef9a907a0291a Mon Sep 17 00:00:00 2001 From: Michael Vinther Date: Sun, 19 Apr 2026 17:18:38 +0200 Subject: [PATCH 4/7] Use Vector4 --- .../BitmapOperations/IIRSmoothOperation.cs | 78 ++++--------------- 1 file changed, 14 insertions(+), 64 deletions(-) diff --git a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs index 58a3b15..e1be398 100644 --- a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs +++ b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs @@ -2,6 +2,7 @@ using System.Threading.Tasks; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using System.Numerics; namespace PhotoLocator.BitmapOperations { @@ -43,9 +44,7 @@ public static void Apply(FloatBitmap plane, float filterSize) // Vertical smooth - vectorized over columns when possible unsafe { - // Prefer hardware intrinsics if available (AVX/ SSE). Fallback to scalar per-column processing. bool hasAvx = Avx.IsSupported; - bool hasSse = Sse.IsSupported; int vectorSize = hasAvx ? 8 : 4; int vectorizedSegments = plane.Stride / vectorSize; @@ -61,7 +60,6 @@ public static void Apply(FloatBitmap plane, float filterSize) { float* colPtr = &pixels[x]; var v = Avx.LoadVector256(colPtr); - Avx.Store(colPtr, v); for (var y = height; y > 1; y--) { colPtr += stride; @@ -79,10 +77,10 @@ public static void Apply(FloatBitmap plane, float filterSize) } }); } - else if (hasSse) + else { - var filterSizeV = Vector128.Create(filterSize); - var scaleV = Vector128.Create(scale); + var filterSizeV = Vector4.Create(filterSize); + var scaleV = Vector4.Create(scale); Parallel.For(0, vectorizedSegments, vi => { int x = vi * 4; @@ -90,75 +88,27 @@ public static void Apply(FloatBitmap plane, float filterSize) fixed (float* pixels = plane.Elements) { float* colPtr = &pixels[x]; - var v = Sse.LoadVector128(colPtr); - Sse.Store(colPtr, v); + int vectorizedSegments = plane.Stride / vectorSize; + var v = Vector4.Load(colPtr); for (var y = height; y > 1; y--) { colPtr += stride; - var inV = Sse.LoadVector128(colPtr); - v = Sse.Multiply(Sse.Add(Sse.Multiply(v, filterSizeV), inV), scaleV); - Sse.Store(colPtr, v); + var inV = Vector4.Load(colPtr); + v = Vector4.Multiply(Vector4.Add(Vector4.Multiply(v, filterSizeV), inV), scaleV); + v.Store(colPtr); } for (var y = height; y > 1; y--) { colPtr -= stride; - var inV = Sse.LoadVector128(colPtr); - v = Sse.Multiply(Sse.Add(Sse.Multiply(v, filterSizeV), inV), scaleV); - Sse.Store(colPtr, v); - } - } - }); - } - else - { - Parallel.For(0, vectorizedSegments, vi => - { - int x = vi * 4; - var stride = plane.Stride; - fixed (float* pixels = plane.Elements) - { - var pix0 = &pixels[x]; - var value0 = *pix0; - var pix1 = pix0 + 1; - var value1 = *pix1; - var pix2 = pix0 + 2; - var value2 = *pix2; - var pix3 = pix0 + 3; - var value3 = *pix3; - for (var y = height; y > 1; y--) - { - pix0 += stride; - value0 = (value0 * filterSize + *pix0) * scale; - *pix0 = value0; - pix1 += stride; - value1 = (value1 * filterSize + *pix1) * scale; - *pix1 = value1; - pix2 += stride; - value2 = (value2 * filterSize + *pix2) * scale; - *pix2 = value2; - pix3 += stride; - value3 = (value3 * filterSize + *pix3) * scale; - *pix3 = value3; - } - for (var y = height; y > 1; y--) - { - pix0 -= stride; - value0 = (value0 * filterSize + *pix0) * scale; - *pix0 = value0; - pix1 -= stride; - value1 = (value1 * filterSize + *pix1) * scale; - *pix1 = value1; - pix2 -= stride; - value2 = (value2 * filterSize + *pix2) * scale; - *pix2 = value2; - pix3 -= stride; - value3 = (value3 * filterSize + *pix3) * scale; - *pix3 = value3; + var inV = Vector4.Load(colPtr); + v = Vector4.Multiply(Vector4.Add(Vector4.Multiply(v, filterSizeV), inV), scaleV); + v.Store(colPtr); } } }); } - // Remaining columns (scalar) - includes columns not divisible by vectorSize + + // Remaining columns - columns not divisible by vectorSize Parallel.For(vectorizedSegments * vectorSize, plane.Stride, x => { var stride = plane.Stride; From af11a9d3be696750088ec8de905162282caae7c8 Mon Sep 17 00:00:00 2001 From: Michael Vinther Date: Sun, 26 Apr 2026 21:15:48 +0200 Subject: [PATCH 5/7] cr --- PhotoLocator/BitmapOperations/IIRMinMaxOperation.cs | 12 ++++++------ PhotoLocator/BitmapOperations/IIRSmoothOperation.cs | 1 - PhotoLocatorTest/BenchmarkHelper.cs | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/PhotoLocator/BitmapOperations/IIRMinMaxOperation.cs b/PhotoLocator/BitmapOperations/IIRMinMaxOperation.cs index 804b672..4c66fb3 100644 --- a/PhotoLocator/BitmapOperations/IIRMinMaxOperation.cs +++ b/PhotoLocator/BitmapOperations/IIRMinMaxOperation.cs @@ -48,8 +48,8 @@ static public void MinFilter(FloatBitmap plane, float filterSize) } }); // Vertical - int quaterWidth = plane.Stride / 4; - Parallel.For(0, quaterWidth, x4 => + int quarterWidth = plane.Stride / 4; + Parallel.For(0, quarterWidth, x4 => { int x = x4 * 4; var width = plane.Stride; @@ -100,7 +100,7 @@ static public void MinFilter(FloatBitmap plane, float filterSize) } } }); - Parallel.For(quaterWidth * 4, plane.Stride, x => + Parallel.For(quarterWidth * 4, plane.Stride, x => { var width = plane.Stride; var height = plane.Height; @@ -171,8 +171,8 @@ static public void MaxFilter(FloatBitmap plane, float filterSize) } }); // Vertical - int quaterWidth = plane.Stride / 4; - Parallel.For(0, quaterWidth, x4 => + int quarterWidth = plane.Stride / 4; + Parallel.For(0, quarterWidth, x4 => { int x = x4 * 4; var width = plane.Stride; @@ -223,7 +223,7 @@ static public void MaxFilter(FloatBitmap plane, float filterSize) } } }); - Parallel.For(quaterWidth * 4, plane.Stride, x => + Parallel.For(quarterWidth * 4, plane.Stride, x => { var width = plane.Stride; var height = plane.Height; diff --git a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs index e1be398..36fb738 100644 --- a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs +++ b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs @@ -88,7 +88,6 @@ public static void Apply(FloatBitmap plane, float filterSize) fixed (float* pixels = plane.Elements) { float* colPtr = &pixels[x]; - int vectorizedSegments = plane.Stride / vectorSize; var v = Vector4.Load(colPtr); for (var y = height; y > 1; y--) { diff --git a/PhotoLocatorTest/BenchmarkHelper.cs b/PhotoLocatorTest/BenchmarkHelper.cs index ca406e6..a3d0750 100644 --- a/PhotoLocatorTest/BenchmarkHelper.cs +++ b/PhotoLocatorTest/BenchmarkHelper.cs @@ -3,7 +3,7 @@ namespace PhotoLocator { - public class BenchmarkHelper + public static class BenchmarkHelper { public static void Run(Action action, int innerLoops = 1, int outerIterations = 5) { @@ -27,7 +27,7 @@ public static void Run(Action action, int innerLoops = 1, int outerIterations = var min = iterationTimes.Min(); Console.WriteLine($"Median time: {median} ms"); Console.WriteLine($"Minimum time: {min} ms"); - throw new AssertFailedException($"Min={min} ms, median={median} ms"); + throw new AssertInconclusiveException($"Min={min} ms, median={median} ms"); } } } From 56420da77d8483b14ec5bc7418b5087003180f05 Mon Sep 17 00:00:00 2001 From: Michael Vinther Date: Sun, 26 Apr 2026 22:35:16 +0200 Subject: [PATCH 6/7] Use generic vector class --- .../BitmapOperations/IIRSmoothOperation.cs | 76 +++++-------------- 1 file changed, 21 insertions(+), 55 deletions(-) diff --git a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs index 36fb738..d5d1346 100644 --- a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs +++ b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs @@ -44,68 +44,34 @@ public static void Apply(FloatBitmap plane, float filterSize) // Vertical smooth - vectorized over columns when possible unsafe { - bool hasAvx = Avx.IsSupported; - int vectorSize = hasAvx ? 8 : 4; + int vectorSize = Vector.Count; int vectorizedSegments = plane.Stride / vectorSize; - - if (hasAvx) + var filterSizeV = Vector.Create(filterSize); + var scaleV = Vector.Create(scale); + Parallel.For(0, vectorizedSegments, vi => { - var filterSizeV = Vector256.Create(filterSize); - var scaleV = Vector256.Create(scale); - Parallel.For(0, vectorizedSegments, vi => + int x = vi * vectorSize; + var stride = plane.Stride; + fixed (float* pixels = plane.Elements) { - int x = vi * 8; - var stride = plane.Stride; - fixed (float* pixels = plane.Elements) + float* colPtr = &pixels[x]; + var v = Vector.Load(colPtr); + for (var y = height; y > 1; y--) { - float* colPtr = &pixels[x]; - var v = Avx.LoadVector256(colPtr); - for (var y = height; y > 1; y--) - { - colPtr += stride; - var inV = Avx.LoadVector256(colPtr); - v = Avx.Multiply(Avx.Add(Avx.Multiply(v, filterSizeV), inV), scaleV); - Avx.Store(colPtr, v); - } - for (var y = height; y > 1; y--) - { - colPtr -= stride; - var inV = Avx.LoadVector256(colPtr); - v = Avx.Multiply(Avx.Add(Avx.Multiply(v, filterSizeV), inV), scaleV); - Avx.Store(colPtr, v); - } + colPtr += stride; + var inV = Vector.Load(colPtr); + v = Vector.Multiply(Vector.Add(Vector.Multiply(v, filterSizeV), inV), scaleV); + v.Store(colPtr); } - }); - } - else - { - var filterSizeV = Vector4.Create(filterSize); - var scaleV = Vector4.Create(scale); - Parallel.For(0, vectorizedSegments, vi => - { - int x = vi * 4; - var stride = plane.Stride; - fixed (float* pixels = plane.Elements) + for (var y = height; y > 1; y--) { - float* colPtr = &pixels[x]; - var v = Vector4.Load(colPtr); - for (var y = height; y > 1; y--) - { - colPtr += stride; - var inV = Vector4.Load(colPtr); - v = Vector4.Multiply(Vector4.Add(Vector4.Multiply(v, filterSizeV), inV), scaleV); - v.Store(colPtr); - } - for (var y = height; y > 1; y--) - { - colPtr -= stride; - var inV = Vector4.Load(colPtr); - v = Vector4.Multiply(Vector4.Add(Vector4.Multiply(v, filterSizeV), inV), scaleV); - v.Store(colPtr); - } + colPtr -= stride; + var inV = Vector.Load(colPtr); + v = Vector.Multiply(Vector.Add(Vector.Multiply(v, filterSizeV), inV), scaleV); + v.Store(colPtr); } - }); - } + } + }); // Remaining columns - columns not divisible by vectorSize Parallel.For(vectorizedSegments * vectorSize, plane.Stride, x => From 97cd31958c3bc45966c486397c238eb8807d36ec Mon Sep 17 00:00:00 2001 From: Michael Vinther Date: Sun, 26 Apr 2026 22:42:46 +0200 Subject: [PATCH 7/7] Improve test --- .../BitmapOperations/IIRSmoothOperation.cs | 91 +++++++++---------- .../IncreaseLocalContrastOperationTest.cs | 1 + 2 files changed, 45 insertions(+), 47 deletions(-) diff --git a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs index d5d1346..180c696 100644 --- a/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs +++ b/PhotoLocator/BitmapOperations/IIRSmoothOperation.cs @@ -42,60 +42,57 @@ public static void Apply(FloatBitmap plane, float filterSize) }); // Vertical smooth - vectorized over columns when possible - unsafe + int vectorSize = Vector.Count; + int vectorizedSegments = plane.Stride / vectorSize; + var filterSizeV = Vector.Create(filterSize); + var scaleV = Vector.Create(scale); + Parallel.For(0, vectorizedSegments, vi => { - int vectorSize = Vector.Count; - int vectorizedSegments = plane.Stride / vectorSize; - var filterSizeV = Vector.Create(filterSize); - var scaleV = Vector.Create(scale); - Parallel.For(0, vectorizedSegments, vi => + int x = vi * vectorSize; + var stride = plane.Stride; + fixed (float* pixels = plane.Elements) { - int x = vi * vectorSize; - var stride = plane.Stride; - fixed (float* pixels = plane.Elements) + float* colPtr = &pixels[x]; + var v = Vector.Load(colPtr); + for (var y = height; y > 1; y--) { - float* colPtr = &pixels[x]; - var v = Vector.Load(colPtr); - for (var y = height; y > 1; y--) - { - colPtr += stride; - var inV = Vector.Load(colPtr); - v = Vector.Multiply(Vector.Add(Vector.Multiply(v, filterSizeV), inV), scaleV); - v.Store(colPtr); - } - for (var y = height; y > 1; y--) - { - colPtr -= stride; - var inV = Vector.Load(colPtr); - v = Vector.Multiply(Vector.Add(Vector.Multiply(v, filterSizeV), inV), scaleV); - v.Store(colPtr); - } + colPtr += stride; + var inV = Vector.Load(colPtr); + v = Vector.Multiply(Vector.Add(Vector.Multiply(v, filterSizeV), inV), scaleV); + v.Store(colPtr); } - }); - - // Remaining columns - columns not divisible by vectorSize - Parallel.For(vectorizedSegments * vectorSize, plane.Stride, x => + for (var y = height; y > 1; y--) + { + colPtr -= stride; + var inV = Vector.Load(colPtr); + v = Vector.Multiply(Vector.Add(Vector.Multiply(v, filterSizeV), inV), scaleV); + v.Store(colPtr); + } + } + }); + + // Remaining columns - columns not divisible by vectorSize + Parallel.For(vectorizedSegments * vectorSize, plane.Stride, x => + { + var stride = plane.Stride; + fixed (float* pixels = plane.Elements) { - var stride = plane.Stride; - fixed (float* pixels = plane.Elements) + var pix = &pixels[x]; + var value = *pix; + for (var y = height; y > 1; y--) { - var pix = &pixels[x]; - var value = *pix; - for (var y = height; y > 1; y--) - { - pix += stride; - value = (value * filterSize + *pix) * scale; - *pix = value; - } - for (var y = height; y > 1; y--) - { - pix -= stride; - value = (value * filterSize + *pix) * scale; - *pix = value; - } + pix += stride; + value = (value * filterSize + *pix) * scale; + *pix = value; } - }); - } + for (var y = height; y > 1; y--) + { + pix -= stride; + value = (value * filterSize + *pix) * scale; + *pix = value; + } + } + }); } } } diff --git a/PhotoLocatorTest/BitmapOperations/IncreaseLocalContrastOperationTest.cs b/PhotoLocatorTest/BitmapOperations/IncreaseLocalContrastOperationTest.cs index 229546e..b3fa651 100644 --- a/PhotoLocatorTest/BitmapOperations/IncreaseLocalContrastOperationTest.cs +++ b/PhotoLocatorTest/BitmapOperations/IncreaseLocalContrastOperationTest.cs @@ -32,6 +32,7 @@ public void Apply_IncreaseLocalContrast() #if DEBUG GeneralFileFormatHandler.SaveToFile(result, "localContrast.png"); #endif + Assert.AreEqual(0.23394798570186837, op.DstBitmap.Mean(), 1e-5); } } }