From dbeccfa2bdbd05925f7371ad8b4531a1b3ba92e1 Mon Sep 17 00:00:00 2001 From: owen <37291975+kudanowen@users.noreply.github.com> Date: Mon, 8 Jul 2019 17:20:39 +0100 Subject: [PATCH 1/3] Fix performance issues close to image boundary. --- KFAST.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/KFAST.h b/KFAST.h index ea7e3b6..79f1d6c 100644 --- a/KFAST.h +++ b/KFAST.h @@ -372,9 +372,11 @@ void _KFAST(const uint8_t* __restrict const data, const int32_t cols, const int3 processCols(num_corners, ptr, j, offsets, ushft, t, cols, consec, corners, cur, keypoints, i, start_row); } - // handle last few columns - processCols(num_corners, ptr, j, offsets, ushft, t, - cols, consec, corners, cur, keypoints, i, start_row); + if (cols - j - 3 > 0) { + // handle last few columns + processCols(num_corners, ptr, j, offsets, ushft, t, + cols, consec, corners, cur, keypoints, i, start_row); + } } if (nonmax_suppression) { From 5b975a3c68baea424ac83a0cf085c9c8bc668a68 Mon Sep 17 00:00:00 2001 From: owen <37291975+kudanowen@users.noreply.github.com> Date: Mon, 8 Jul 2019 17:22:56 +0100 Subject: [PATCH 2/3] Fix undefined behaviour when cols == 35. --- KFAST.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KFAST.h b/KFAST.h index 79f1d6c..75bcd0b 100644 --- a/KFAST.h +++ b/KFAST.h @@ -156,7 +156,7 @@ void processCols(int32_t& num_corners, const uint8_t* __restrict & ptr, int32_t& // the normal full 32 columns, or special handling for the last few columns if they // don't divide up evenly into 32 uint32_t last_cols_mask; - if (!full) last_cols_mask = (1 << (cols - j - 3)) - 1; + if (!full) last_cols_mask = static_cast((1ull << (cols - j - 3)) - 1); // 'mask' now contains one bit for each element // which is SET if that element COULD be a corner based on the 2 consective cardinal point test From 6e7fd6345a18ddcf2f551daa3512a2033cac09d3 Mon Sep 17 00:00:00 2001 From: Owen Date: Mon, 8 Jul 2019 18:02:51 +0100 Subject: [PATCH 3/3] Fixed reading into the next row which could generate false keypoints. --- KFAST.h | 58 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/KFAST.h b/KFAST.h index 75bcd0b..91261a4 100644 --- a/KFAST.h +++ b/KFAST.h @@ -62,6 +62,7 @@ #include #include +#include #include #include #include @@ -88,7 +89,38 @@ void processCols(int32_t& num_corners, const uint8_t* __restrict & ptr, int32_t& const __m256i& consec, int32_t* const __restrict corners, uint8_t* const __restrict cur, std::vector& keypoints, const int32_t i, const int32_t start_row) { // ppt is an integer vector that now holds 32 of point p - __m256i ppt = _mm256_loadu_si256(reinterpret_cast(ptr)); + __m256i ppt, p9, p5, p1, p13; + if (full) { + ppt = _mm256_loadu_si256(reinterpret_cast(ptr)); + + // Rosten's point 9 for all 32 pixels in consideration + p9 = _mm256_loadu_si256(reinterpret_cast(ptr + *offsets)); + + // Rosten's point 5 + p5 = _mm256_loadu_si256(reinterpret_cast(ptr + offsets[4])); + + // Rosten's point 1 + p1 = _mm256_loadu_si256(reinterpret_cast(ptr + offsets[8])); + + // Rosten's point 13 + p13 = _mm256_loadu_si256(reinterpret_cast(ptr + offsets[12])); + } + else { + // to avoid reading into the next row and generating false features. + ppt = _mm256_setzero_si256(); + p9 = p5 = p1 = p13 = ppt; + + memcpy(&ppt, ptr, cols - j - 3); + memcpy(&p9, ptr + offsets[0], cols - j - 3); + memcpy(&p5, ptr + offsets[4], cols - j - 3); + memcpy(&p1, ptr + offsets[8], cols - j - 3); + memcpy(&p13, ptr + offsets[12], cols - j - 3); + } + + p9 = _mm256_xor_si256(p9, ushft); + p5 = _mm256_xor_si256(p5, ushft); + p1 = _mm256_xor_si256(p1, ushft); + p13 = _mm256_xor_si256(p13, ushft); // we subtract (and clamp) the threshold value from all 32 pixels // pmt represents p - t @@ -98,18 +130,6 @@ void processCols(int32_t& num_corners, const uint8_t* __restrict & ptr, int32_t& // ppt represents p + t ppt = _mm256_xor_si256(_mm256_adds_epu8(ppt, t), ushft); - // Rosten's point 9 for all 32 pixels in consideration - __m256i p9 = _mm256_xor_si256(_mm256_loadu_si256(reinterpret_cast(ptr + *offsets)), ushft); - - // Rosten's point 5 - __m256i p5 = _mm256_xor_si256(_mm256_loadu_si256(reinterpret_cast(ptr + offsets[4])), ushft); - - // Rosten's point 1 - __m256i p1 = _mm256_xor_si256(_mm256_loadu_si256(reinterpret_cast(ptr + offsets[8])), ushft); - - // Rosten's point 13 - __m256i p13 = _mm256_xor_si256(_mm256_loadu_si256(reinterpret_cast(ptr + offsets[12])), ushft); - // compare p's against p9's // compare p's against p5's // AND the result into ppt_accum @@ -187,7 +207,17 @@ void processCols(int32_t& num_corners, const uint8_t* __restrict & ptr, int32_t& // for each of the 24 pixels in the circle (wrapping around extra 8 at the end) for (int32_t k = 0; k < 24; ++k) { // x is a vector of the kth member of the circle - __m256i p = _mm256_xor_si256(_mm256_loadu_si256(reinterpret_cast(ptr + offsets[k])), ushft); + __m256i p; + if (full) { + p = _mm256_xor_si256( + _mm256_loadu_si256(reinterpret_cast(ptr + offsets[k])), ushft); + } + else { + p = _mm256_setzero_si256(); + + memcpy(&p, ptr + offsets[k], cols - j - 3); + p = _mm256_xor_si256(p, ushft); + } ppt_accum = _mm256_cmpgt_epi8(p, ppt); pmt_accum = _mm256_cmpgt_epi8(pmt, p);