-
Notifications
You must be signed in to change notification settings - Fork 1
Implement AVX2 SIMD optimization #10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
- Add l2_simd with AVX2 intrinsics - Enable -march=native for vectorization
5000user5000
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
實作 AVX2 SIMD 加速
- 增加 -march=native 啟用 CPU 指令集
- 原有 L2 計算新增 AVX SIMD 計算
- 將舊的 L2 函式名替換成新的
| @@ -1,5 +1,5 @@ | |||
| CXX := g++ | |||
| CXXFLAGS := -std=c++17 -O3 -fPIC -fopenmp | |||
| CXXFLAGS := -std=c++17 -O3 -fPIC -march=native -fopenmp | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
增加 -march=native,啟用 CPU 支援的指令集
| inline float l2_simd(const float* __restrict a, | ||
| const float* __restrict b, | ||
| size_t dim) { | ||
| #if defined(__AVX2__) | ||
| const size_t step = 8; // 8 × 32-bit floats | ||
| __m256 acc = _mm256_setzero_ps(); | ||
| size_t i = 0; | ||
| for (; i + step - 1 < dim; i += step) { | ||
| __m256 va = _mm256_loadu_ps(a + i); | ||
| __m256 vb = _mm256_loadu_ps(b + i); | ||
| __m256 diff = _mm256_sub_ps(va, vb); | ||
| acc = _mm256_fmadd_ps(diff, diff, acc); // acc += diff² | ||
| } | ||
| float buf[step]; | ||
| _mm256_storeu_ps(buf, acc); | ||
| float d = 0.f; | ||
| for (int j = 0; j < step; ++j) d += buf[j]; | ||
|
|
||
| for (; i < dim; ++i) { | ||
| float diff = a[i] - b[i]; | ||
| d += diff * diff; | ||
| } | ||
| return d; | ||
| #else | ||
| float d = 0.f; | ||
| for (size_t i = 0; i < dim; ++i) { | ||
| float diff = a[i] - b[i]; | ||
| d += diff * diff; | ||
| } | ||
| return d; | ||
| #endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
L2 計算新增 AVX SIMD 版本,如果不支援 AVX2,則會退回原版
| #pragma omp parallel for schedule(static) | ||
| for (size_t c = 0; c < nlist_; ++c) { | ||
| float d = l2_naive(query.data(), centroids_[c].data(), dimension_); | ||
| float d = l2_simd(query.data(), centroids_[c].data(), dimension_); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
將既有 l2_naive 替換成新的 l2_simd
實作 AVX2 SIMD 加速