summaryrefslogtreecommitdiff
path: root/media/libjxl/src/lib/jxl/convolve.cc
diff options
context:
space:
mode:
Diffstat (limited to 'media/libjxl/src/lib/jxl/convolve.cc')
-rw-r--r--media/libjxl/src/lib/jxl/convolve.cc1332
1 files changed, 1332 insertions, 0 deletions
diff --git a/media/libjxl/src/lib/jxl/convolve.cc b/media/libjxl/src/lib/jxl/convolve.cc
new file mode 100644
index 0000000000..b3e81f0ee7
--- /dev/null
+++ b/media/libjxl/src/lib/jxl/convolve.cc
@@ -0,0 +1,1332 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/convolve.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/convolve.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/common.h" // RoundUpTo
+#include "lib/jxl/convolve-inl.h"
+#include "lib/jxl/image_ops.h"
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Vec;
+
+// Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2].
+template <class WrapY>
+static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y,
+ const int64_t ix, const int64_t iy,
+ const size_t xsize, const size_t ysize,
+ const float wx0, const float wx1,
+ const float wx2) {
+ const WrapMirror wrap_x;
+ const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize));
+ const float in_m2 = row[wrap_x(ix - 2, xsize)];
+ const float in_p2 = row[wrap_x(ix + 2, xsize)];
+ const float in_m1 = row[wrap_x(ix - 1, xsize)];
+ const float in_p1 = row[wrap_x(ix + 1, xsize)];
+ const float in_00 = row[ix];
+ const float sum_2 = wx2 * (in_m2 + in_p2);
+ const float sum_1 = wx1 * (in_m1 + in_p1);
+ const float sum_0 = wx0 * in_00;
+ return sum_2 + sum_1 + sum_0;
+}
+
+template <class WrapY, class V>
+static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
+ const int64_t iy, const size_t ysize, const V wx0,
+ const V wx1, const V wx2) {
+ const HWY_FULL(float) d;
+ const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
+ const auto in_m2 = LoadU(d, center - 2);
+ const auto in_p2 = LoadU(d, center + 2);
+ const auto in_m1 = LoadU(d, center - 1);
+ const auto in_p1 = LoadU(d, center + 1);
+ const auto in_00 = Load(d, center);
+ const auto sum_2 = wx2 * (in_m2 + in_p2);
+ const auto sum_1 = wx1 * (in_m1 + in_p1);
+ const auto sum_0 = wx0 * in_00;
+ return sum_2 + sum_1 + sum_0;
+}
+
+// Produces result for one pixel
+template <class WrapY>
+float Symmetric5Border(const ImageF& in, const Rect& rect, const int64_t ix,
+ const int64_t iy, const WeightsSymmetric5& weights) {
+ const float w0 = weights.c[0];
+ const float w1 = weights.r[0];
+ const float w2 = weights.R[0];
+ const float w4 = weights.d[0];
+ const float w5 = weights.L[0];
+ const float w8 = weights.D[0];
+
+ const size_t xsize = rect.xsize();
+ const size_t ysize = rect.ysize();
+ const WrapY wrap_y;
+ // Unrolled loop over all 5 rows of the kernel.
+ float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2);
+
+ sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8);
+ float sum1 =
+ WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8);
+
+ sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5);
+ sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5);
+
+ return sum0 + sum1;
+}
+
+// Produces result for one vector's worth of pixels
+template <class WrapY>
+static void Symmetric5Interior(const ImageF& in, const Rect& rect,
+ const int64_t ix, const int64_t iy,
+ const WeightsSymmetric5& weights,
+ float* JXL_RESTRICT row_out) {
+ const HWY_FULL(float) d;
+
+ const auto w0 = LoadDup128(d, weights.c);
+ const auto w1 = LoadDup128(d, weights.r);
+ const auto w2 = LoadDup128(d, weights.R);
+ const auto w4 = LoadDup128(d, weights.d);
+ const auto w5 = LoadDup128(d, weights.L);
+ const auto w8 = LoadDup128(d, weights.D);
+
+ const size_t ysize = rect.ysize();
+ const WrapY wrap_y;
+ // Unrolled loop over all 5 rows of the kernel.
+ auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
+
+ sum0 += WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8);
+ auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8);
+
+ sum0 += WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5);
+ sum1 += WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5);
+
+ Store(sum0 + sum1, d, row_out + ix);
+}
+
+template <class WrapY>
+static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy,
+ const WeightsSymmetric5& weights,
+ float* JXL_RESTRICT row_out) {
+ const int64_t kRadius = 2;
+ const size_t xsize = rect.xsize();
+
+ size_t ix = 0;
+ const HWY_FULL(float) d;
+ const size_t N = Lanes(d);
+ const size_t aligned_x = RoundUpTo(kRadius, N);
+ for (; ix < std::min(aligned_x, xsize); ++ix) {
+ row_out[ix] = Symmetric5Border<WrapY>(in, rect, ix, iy, weights);
+ }
+ for (; ix + N + kRadius <= xsize; ix += N) {
+ Symmetric5Interior<WrapY>(in, rect, ix, iy, weights, row_out);
+ }
+ for (; ix < xsize; ++ix) {
+ row_out[ix] = Symmetric5Border<WrapY>(in, rect, ix, iy, weights);
+ }
+}
+
+static JXL_NOINLINE void Symmetric5BorderRow(const ImageF& in, const Rect& rect,
+ const int64_t iy,
+ const WeightsSymmetric5& weights,
+ float* JXL_RESTRICT row_out) {
+ return Symmetric5Row<WrapMirror>(in, rect, iy, weights, row_out);
+}
+
+#if HWY_TARGET != HWY_SCALAR
+
+// Returns indices for SetTableIndices such that TableLookupLanes on the
+// rightmost unaligned vector (rightmost sample in its most-significant lane)
+// returns the mirrored values, with the mirror outside the last valid sample.
+static inline const int32_t* MirrorLanes(const size_t mod) {
+ const HWY_CAPPED(float, 16) d;
+ constexpr size_t kN = MaxLanes(d);
+
+ // For mod = `image width mod 16` 0..15:
+ // last full vec mirrored (mem order) loadedVec mirrorVec idxVec
+ // 0123456789abcdef| fedcba9876543210 fed..210 012..def 012..def
+ // 0123456789abcdef|0 0fedcba98765432 0fe..321 234..f00 123..eff
+ // 0123456789abcdef|01 10fedcba987654 10f..432 456..110 234..ffe
+ // 0123456789abcdef|012 210fedcba9876 210..543 67..2210 34..ffed
+ // 0123456789abcdef|0123 3210fedcba98 321..654 8..33210 4..ffedc
+ // 0123456789abcdef|01234 43210fedcba
+ // 0123456789abcdef|012345 543210fedc
+ // 0123456789abcdef|0123456 6543210fe
+ // 0123456789abcdef|01234567 76543210
+ // 0123456789abcdef|012345678 8765432
+ // 0123456789abcdef|0123456789 987654
+ // 0123456789abcdef|0123456789A A9876
+ // 0123456789abcdef|0123456789AB BA98
+ // 0123456789abcdef|0123456789ABC CBA
+ // 0123456789abcdef|0123456789ABCD DC
+ // 0123456789abcdef|0123456789ABCDE E EDC..10f EED..210 ffe..321
+#if HWY_CAP_GE512
+ HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, //
+ 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+#elif HWY_CAP_GE256
+ HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {
+ 1, 2, 3, 4, 5, 6, 7, 7, //
+ 6, 5, 4, 3, 2, 1, 0};
+#else // 128-bit
+ HWY_ALIGN static constexpr int32_t idx_lanes[2 * kN - 1] = {1, 2, 3, 3, //
+ 2, 1, 0};
+#endif
+ return idx_lanes + kN - 1 - mod;
+}
+
+#endif // HWY_TARGET != HWY_SCALAR
+
+namespace strategy {
+
+struct StrategyBase {
+ using D = HWY_CAPPED(float, 16);
+ using V = Vec<D>;
+};
+
+// 3x3 convolution by symmetric kernel with a single scan through the input.
+class Symmetric3 : public StrategyBase {
+ public:
+ static constexpr int64_t kRadius = 1;
+
+ // Only accesses pixels in [0, xsize).
+ template <size_t kSizeModN, class WrapRow>
+ static JXL_INLINE void ConvolveRow(const float* const JXL_RESTRICT row_m,
+ const size_t xsize, const int64_t stride,
+ const WrapRow& wrap_row,
+ const WeightsSymmetric3& weights,
+ float* const JXL_RESTRICT row_out) {
+ const D d;
+ // t, m, b = top, middle, bottom row;
+ const float* const JXL_RESTRICT row_t = wrap_row(row_m - stride, stride);
+ const float* const JXL_RESTRICT row_b = wrap_row(row_m + stride, stride);
+
+ // Must load in advance - compiler doesn't understand LoadDup128 and
+ // schedules them too late.
+ const V w0 = LoadDup128(d, weights.c);
+ const V w1 = LoadDup128(d, weights.r);
+ const V w2 = LoadDup128(d, weights.d);
+
+ // l, c, r = left, center, right. Leftmost vector: need FirstL1.
+ {
+ const V tc = LoadU(d, row_t + 0);
+ const V mc = LoadU(d, row_m + 0);
+ const V bc = LoadU(d, row_b + 0);
+ const V tl = Neighbors::FirstL1(tc);
+ const V tr = LoadU(d, row_t + 0 + 1);
+ const V ml = Neighbors::FirstL1(mc);
+ const V mr = LoadU(d, row_m + 0 + 1);
+ const V bl = Neighbors::FirstL1(bc);
+ const V br = LoadU(d, row_b + 0 + 1);
+ const V conv =
+ WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
+ Store(conv, d, row_out + 0);
+ }
+
+ // Loop as long as we can load enough new values:
+ const size_t N = Lanes(d);
+ size_t x = N;
+ for (; x + N + kRadius <= xsize; x += N) {
+ const auto conv = ConvolveValid(row_t, row_m, row_b, x, w0, w1, w2);
+ Store(conv, d, row_out + x);
+ }
+
+ // For final (partial) vector:
+ const V tc = LoadU(d, row_t + x);
+ const V mc = LoadU(d, row_m + x);
+ const V bc = LoadU(d, row_b + x);
+
+ V tr, mr, br;
+#if HWY_TARGET == HWY_SCALAR
+ tr = tc; // Single-lane => mirrored right neighbor = center value.
+ mr = mc;
+ br = bc;
+#else
+ if (kSizeModN == 0) {
+ // The above loop didn't handle the last vector because it needs an
+ // additional right neighbor (generated via mirroring).
+ auto mirror = SetTableIndices(d, MirrorLanes(N - 1));
+ tr = TableLookupLanes(tc, mirror);
+ mr = TableLookupLanes(mc, mirror);
+ br = TableLookupLanes(bc, mirror);
+ } else {
+ auto mirror = SetTableIndices(d, MirrorLanes((xsize % N) - 1));
+ // Loads last valid value into uppermost lane and mirrors.
+ tr = TableLookupLanes(LoadU(d, row_t + xsize - N), mirror);
+ mr = TableLookupLanes(LoadU(d, row_m + xsize - N), mirror);
+ br = TableLookupLanes(LoadU(d, row_b + xsize - N), mirror);
+ }
+#endif
+
+ const V tl = LoadU(d, row_t + x - 1);
+ const V ml = LoadU(d, row_m + x - 1);
+ const V bl = LoadU(d, row_b + x - 1);
+ const V conv = WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
+ Store(conv, d, row_out + x);
+ }
+
+ private:
+ // Returns sum{x_i * w_i}.
+ template <class V>
+ static JXL_INLINE V WeightedSum(const V tl, const V tc, const V tr,
+ const V ml, const V mc, const V mr,
+ const V bl, const V bc, const V br,
+ const V w0, const V w1, const V w2) {
+ const V sum_tb = tc + bc;
+
+ // Faster than 5 mul + 4 FMA.
+ const V mul0 = mc * w0;
+ const V sum_lr = ml + mr;
+
+ const V x1 = sum_tb + sum_lr;
+ const V mul1 = MulAdd(x1, w1, mul0);
+
+ const V sum_t2 = tl + tr;
+ const V sum_b2 = bl + br;
+ const V x2 = sum_t2 + sum_b2;
+ const V mul2 = MulAdd(x2, w2, mul1);
+ return mul2;
+ }
+
+ static JXL_INLINE V ConvolveValid(const float* JXL_RESTRICT row_t,
+ const float* JXL_RESTRICT row_m,
+ const float* JXL_RESTRICT row_b,
+ const int64_t x, const V w0, const V w1,
+ const V w2) {
+ const D d;
+ const V tc = LoadU(d, row_t + x);
+ const V mc = LoadU(d, row_m + x);
+ const V bc = LoadU(d, row_b + x);
+ const V tl = LoadU(d, row_t + x - 1);
+ const V tr = LoadU(d, row_t + x + 1);
+ const V ml = LoadU(d, row_m + x - 1);
+ const V mr = LoadU(d, row_m + x + 1);
+ const V bl = LoadU(d, row_b + x - 1);
+ const V br = LoadU(d, row_b + x + 1);
+ return WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
+ }
+};
+
+// 5x5 convolution by separable kernel with a single scan through the input.
+// This is more cache-efficient than separate horizontal/vertical passes, and
+// possibly faster (given enough registers) than tiling and/or transposing.
+//
+// Overview: imagine a 5x5 window around a central pixel. First convolve the
+// rows by multiplying the pixels with the corresponding weights from
+// WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these
+// intermediate results by the corresponding vertical weight, i.e.
+// vert[abs(y_offset) * 4]. Finally, store the sum of these values as the
+// convolution result at the position of the central pixel in the output.
+//
+// Each of these operations uses SIMD vectors. The central pixel and most
+// importantly the output are aligned, so neighnoring pixels (e.g. x_offset=1)
+// require unaligned loads. Because weights are supplied in identical groups of
+// 4, we can use LoadDup128 to load them (slightly faster).
+//
+// Uses mirrored boundary handling. Until x >= kRadius, the horizontal
+// convolution uses Neighbors class to shuffle vectors as if each of its lanes
+// had been loaded from the mirrored offset. Similarly, the last full vector to
+// write uses mirroring. In the case of scalar vectors, Neighbors is not usable
+// and the value is loaded directly. Otherwise, the number of valid pixels
+// modulo the vector size enables a small optimization: for smaller offsets,
+// a non-mirrored load is sufficient.
+class Separable5 : public StrategyBase {
+ public:
+ static constexpr int64_t kRadius = 2;
+
+ template <size_t kSizeModN, class WrapRow>
+ static JXL_INLINE void ConvolveRow(const float* const JXL_RESTRICT row_m,
+ const size_t xsize, const int64_t stride,
+ const WrapRow& wrap_row,
+ const WeightsSeparable5& weights,
+ float* const JXL_RESTRICT row_out) {
+ const D d;
+ const int64_t neg_stride = -stride; // allows LEA addressing.
+ const float* const JXL_RESTRICT row_t2 =
+ wrap_row(row_m + 2 * neg_stride, stride);
+ const float* const JXL_RESTRICT row_t1 =
+ wrap_row(row_m + 1 * neg_stride, stride);
+ const float* const JXL_RESTRICT row_b1 =
+ wrap_row(row_m + 1 * stride, stride);
+ const float* const JXL_RESTRICT row_b2 =
+ wrap_row(row_m + 2 * stride, stride);
+
+ const V wh0 = LoadDup128(d, weights.horz + 0 * 4);
+ const V wh1 = LoadDup128(d, weights.horz + 1 * 4);
+ const V wh2 = LoadDup128(d, weights.horz + 2 * 4);
+ const V wv0 = LoadDup128(d, weights.vert + 0 * 4);
+ const V wv1 = LoadDup128(d, weights.vert + 1 * 4);
+ const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
+
+ size_t x = 0;
+
+ // More than one iteration for scalars.
+ for (; x < kRadius; x += Lanes(d)) {
+ const V conv0 = HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2) * wv0;
+
+ const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2);
+ const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2);
+ const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0);
+
+ const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2);
+ const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2);
+ const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1);
+ Store(conv2, d, row_out + x);
+ }
+
+ // Main loop: load inputs without padding
+ for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
+ const V conv0 = HorzConvolve(row_m + x, wh0, wh1, wh2) * wv0;
+
+ const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2);
+ const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2);
+ const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0);
+
+ const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2);
+ const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2);
+ const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1);
+ Store(conv2, d, row_out + x);
+ }
+
+ // Last full vector to write (the above loop handled mod >= kRadius)
+#if HWY_TARGET == HWY_SCALAR
+ while (x < xsize) {
+#else
+ if (kSizeModN < kRadius) {
+#endif
+ const V conv0 =
+ HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2) * wv0;
+
+ const V conv1t =
+ HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2);
+ const V conv1b =
+ HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2);
+ const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0);
+
+ const V conv2t =
+ HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2);
+ const V conv2b =
+ HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2);
+ const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1);
+ Store(conv2, d, row_out + x);
+ x += Lanes(d);
+ }
+
+ // If mod = 0, the above vector was the last.
+ if (kSizeModN != 0) {
+ for (; x < xsize; ++x) {
+ float mul = 0.0f;
+ for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
+ const float wy = weights.vert[std::abs(dy) * 4];
+ const float* clamped_row = wrap_row(row_m + dy * stride, stride);
+ for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
+ const float wx = weights.horz[std::abs(dx) * 4];
+ const int64_t clamped_x = Mirror(x + dx, xsize);
+ mul += clamped_row[clamped_x] * wx * wy;
+ }
+ }
+ row_out[x] = mul;
+ }
+ }
+ }
+
+ private:
+ // Same as HorzConvolve for the first/last vector in a row.
+ static JXL_INLINE V HorzConvolveFirst(const float* const JXL_RESTRICT row,
+ const int64_t x, const int64_t xsize,
+ const V wh0, const V wh1, const V wh2) {
+ const D d;
+ const V c = LoadU(d, row + x);
+ const V mul0 = c * wh0;
+
+#if HWY_TARGET == HWY_SCALAR
+ const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
+ const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
+#else
+ (void)xsize;
+ const V l1 = Neighbors::FirstL1(c);
+ const V l2 = Neighbors::FirstL2(c);
+#endif
+
+ const V r1 = LoadU(d, row + x + 1);
+ const V r2 = LoadU(d, row + x + 2);
+
+ const V mul1 = MulAdd(l1 + r1, wh1, mul0);
+ const V mul2 = MulAdd(l2 + r2, wh2, mul1);
+ return mul2;
+ }
+
+ template <size_t kSizeModN>
+ static JXL_INLINE V HorzConvolveLast(const float* const JXL_RESTRICT row,
+ const int64_t x, const int64_t xsize,
+ const V wh0, const V wh1, const V wh2) {
+ const D d;
+ const V c = LoadU(d, row + x);
+ const V mul0 = c * wh0;
+
+ const V l1 = LoadU(d, row + x - 1);
+ const V l2 = LoadU(d, row + x - 2);
+
+ V r1, r2;
+#if HWY_TARGET == HWY_SCALAR
+ r1 = LoadU(d, row + Mirror(x + 1, xsize));
+ r2 = LoadU(d, row + Mirror(x + 2, xsize));
+#else
+ const size_t N = Lanes(d);
+ if (kSizeModN == 0) {
+ r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2)));
+ r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1)));
+ } else { // == 1
+ const auto last = LoadU(d, row + xsize - N);
+ r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
+ r1 = last;
+ }
+#endif
+
+ // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+ const V sum1 = l1 + r1;
+ const V mul1 = MulAdd(sum1, wh1, mul0);
+ const V sum2 = l2 + r2;
+ const V mul2 = MulAdd(sum2, wh2, mul1);
+ return mul2;
+ }
+
+ // Requires kRadius valid pixels before/after pos.
+ static JXL_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
+ const V wh0, const V wh1, const V wh2) {
+ const D d;
+ const V c = LoadU(d, pos);
+ const V mul0 = c * wh0;
+
+ // Loading anew is faster than combining vectors.
+ const V l1 = LoadU(d, pos - 1);
+ const V r1 = LoadU(d, pos + 1);
+ const V l2 = LoadU(d, pos - 2);
+ const V r2 = LoadU(d, pos + 2);
+ // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+ const V sum1 = l1 + r1;
+ const V mul1 = MulAdd(sum1, wh1, mul0);
+ const V sum2 = l2 + r2;
+ const V mul2 = MulAdd(sum2, wh2, mul1);
+ return mul2;
+ }
+}; // namespace strategy
+
+// 7x7 convolution by separable kernel with a single scan through the input.
+// Extended version of Separable5, see documentation there.
+class Separable7 : public StrategyBase {
+ public:
+ static constexpr int64_t kRadius = 3;
+
+ template <size_t kSizeModN, class WrapRow>
+ static JXL_INLINE void ConvolveRow(const float* const JXL_RESTRICT row_m,
+ const size_t xsize, const int64_t stride,
+ const WrapRow& wrap_row,
+ const WeightsSeparable7& weights,
+ float* const JXL_RESTRICT row_out) {
+ const D d;
+ const int64_t neg_stride = -stride; // allows LEA addressing.
+ const float* const JXL_RESTRICT row_t3 =
+ wrap_row(row_m + 3 * neg_stride, stride);
+ const float* const JXL_RESTRICT row_t2 =
+ wrap_row(row_m + 2 * neg_stride, stride);
+ const float* const JXL_RESTRICT row_t1 =
+ wrap_row(row_m + 1 * neg_stride, stride);
+ const float* const JXL_RESTRICT row_b1 =
+ wrap_row(row_m + 1 * stride, stride);
+ const float* const JXL_RESTRICT row_b2 =
+ wrap_row(row_m + 2 * stride, stride);
+ const float* const JXL_RESTRICT row_b3 =
+ wrap_row(row_m + 3 * stride, stride);
+
+ const V wh0 = LoadDup128(d, weights.horz + 0 * 4);
+ const V wh1 = LoadDup128(d, weights.horz + 1 * 4);
+ const V wh2 = LoadDup128(d, weights.horz + 2 * 4);
+ const V wh3 = LoadDup128(d, weights.horz + 3 * 4);
+ const V wv0 = LoadDup128(d, weights.vert + 0 * 4);
+ const V wv1 = LoadDup128(d, weights.vert + 1 * 4);
+ const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
+ const V wv3 = LoadDup128(d, weights.vert + 3 * 4);
+
+ size_t x = 0;
+
+ // More than one iteration for scalars.
+ for (; x < kRadius; x += Lanes(d)) {
+ const V conv0 =
+ HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2, wh3) * wv0;
+
+ const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0);
+
+ const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1);
+
+ const V conv3t = HorzConvolveFirst(row_t3, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv3b = HorzConvolveFirst(row_b3, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv3 = MulAdd(conv3t + conv3b, wv3, conv2);
+
+ Store(conv3, d, row_out + x);
+ }
+
+ // Main loop: load inputs without padding
+ for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
+ const V conv0 = HorzConvolve(row_m + x, wh0, wh1, wh2, wh3) * wv0;
+
+ const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2, wh3);
+ const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2, wh3);
+ const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0);
+
+ const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2, wh3);
+ const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2, wh3);
+ const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1);
+
+ const V conv3t = HorzConvolve(row_t3 + x, wh0, wh1, wh2, wh3);
+ const V conv3b = HorzConvolve(row_b3 + x, wh0, wh1, wh2, wh3);
+ const V conv3 = MulAdd(conv3t + conv3b, wv3, conv2);
+
+ Store(conv3, d, row_out + x);
+ }
+
+ // Last full vector to write (the above loop handled mod >= kRadius)
+#if HWY_TARGET == HWY_SCALAR
+ while (x < xsize) {
+#else
+ if (kSizeModN < kRadius) {
+#endif
+ const V conv0 =
+ HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2, wh3) *
+ wv0;
+
+ const V conv1t =
+ HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv1b =
+ HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv1 = MulAdd(conv1t + conv1b, wv1, conv0);
+
+ const V conv2t =
+ HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv2b =
+ HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv2 = MulAdd(conv2t + conv2b, wv2, conv1);
+
+ const V conv3t =
+ HorzConvolveLast<kSizeModN>(row_t3, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv3b =
+ HorzConvolveLast<kSizeModN>(row_b3, x, xsize, wh0, wh1, wh2, wh3);
+ const V conv3 = MulAdd(conv3t + conv3b, wv3, conv2);
+
+ Store(conv3, d, row_out + x);
+ x += Lanes(d);
+ }
+
+ // If mod = 0, the above vector was the last.
+ if (kSizeModN != 0) {
+ for (; x < xsize; ++x) {
+ float mul = 0.0f;
+ for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
+ const float wy = weights.vert[std::abs(dy) * 4];
+ const float* clamped_row = wrap_row(row_m + dy * stride, stride);
+ for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
+ const float wx = weights.horz[std::abs(dx) * 4];
+ const int64_t clamped_x = Mirror(x + dx, xsize);
+ mul += clamped_row[clamped_x] * wx * wy;
+ }
+ }
+ row_out[x] = mul;
+ }
+ }
+ }
+
+ private:
+ // Same as HorzConvolve for the first/last vector in a row.
+ static JXL_INLINE V HorzConvolveFirst(const float* const JXL_RESTRICT row,
+ const int64_t x, const int64_t xsize,
+ const V wh0, const V wh1, const V wh2,
+ const V wh3) {
+ const D d;
+ const V c = LoadU(d, row + x);
+ const V mul0 = c * wh0;
+
+#if HWY_TARGET == HWY_SCALAR
+ const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
+ const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
+ const V l3 = LoadU(d, row + Mirror(x - 3, xsize));
+#else
+ (void)xsize;
+ const V l1 = Neighbors::FirstL1(c);
+ const V l2 = Neighbors::FirstL2(c);
+ const V l3 = Neighbors::FirstL3(c);
+#endif
+
+ const V r1 = LoadU(d, row + x + 1);
+ const V r2 = LoadU(d, row + x + 2);
+ const V r3 = LoadU(d, row + x + 3);
+
+ const V mul1 = MulAdd(l1 + r1, wh1, mul0);
+ const V mul2 = MulAdd(l2 + r2, wh2, mul1);
+ const V mul3 = MulAdd(l3 + r3, wh3, mul2);
+ return mul3;
+ }
+
+ template <size_t kSizeModN>
+ static JXL_INLINE V HorzConvolveLast(const float* const JXL_RESTRICT row,
+ const int64_t x, const int64_t xsize,
+ const V wh0, const V wh1, const V wh2,
+ const V wh3) {
+ const D d;
+ const V c = LoadU(d, row + x);
+ const V mul0 = c * wh0;
+
+ const V l1 = LoadU(d, row + x - 1);
+ const V l2 = LoadU(d, row + x - 2);
+ const V l3 = LoadU(d, row + x - 3);
+
+ V r1, r2, r3;
+#if HWY_TARGET == HWY_SCALAR
+ r1 = LoadU(d, row + Mirror(x + 1, xsize));
+ r2 = LoadU(d, row + Mirror(x + 2, xsize));
+ r3 = LoadU(d, row + Mirror(x + 3, xsize));
+#else
+ const size_t N = Lanes(d);
+ if (kSizeModN == 0) {
+ r3 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 3)));
+ r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2)));
+ r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1)));
+ } else if (kSizeModN == 1) {
+ const auto last = LoadU(d, row + xsize - N);
+ r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 2)));
+ r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
+ r1 = last;
+ } else /* kSizeModN >= 2 */ {
+ const auto last = LoadU(d, row + xsize - N);
+ r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
+ r2 = last;
+ r1 = LoadU(d, row + x + 1);
+ }
+#endif
+
+ // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+ const V sum1 = l1 + r1;
+ const V mul1 = MulAdd(sum1, wh1, mul0);
+ const V sum2 = l2 + r2;
+ const V mul2 = MulAdd(sum2, wh2, mul1);
+ const V sum3 = l3 + r3;
+ const V mul3 = MulAdd(sum3, wh3, mul2);
+ return mul3;
+ }
+
+ // Returns one vector of horizontal convolution results; lane i is the result
+ // for pixel pos + i. This is the fast path for interior pixels, i.e. kRadius
+ // valid pixels before/after pos.
+ static JXL_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
+ const V wh0, const V wh1, const V wh2,
+ const V wh3) {
+ const D d;
+ const V c = LoadU(d, pos);
+ const V mul0 = c * wh0;
+
+ // TODO(janwas): better to Combine
+ const V l1 = LoadU(d, pos - 1);
+ const V r1 = LoadU(d, pos + 1);
+ const V l2 = LoadU(d, pos - 2);
+ const V r2 = LoadU(d, pos + 2);
+ const V l3 = LoadU(d, pos - 3);
+ const V r3 = LoadU(d, pos + 3);
+ // Sum of pixels with Manhattan distance i, multiplied by weights[i].
+ const V sum1 = l1 + r1;
+ const V mul1 = MulAdd(sum1, wh1, mul0);
+ const V sum2 = l2 + r2;
+ const V mul2 = MulAdd(sum2, wh2, mul1);
+ const V sum3 = l3 + r3;
+ const V mul3 = MulAdd(sum3, wh3, mul2);
+ return mul3;
+ }
+}; // namespace HWY_NAMESPACE
+
+} // namespace strategy
+
+// Single entry point for convolution.
+// "Strategy" (Direct*/Separable*) decides kernel size and how to evaluate it.
+template <class Strategy>
+class ConvolveT {
+ static constexpr int64_t kRadius = Strategy::kRadius;
+ using Simd = HWY_CAPPED(float, 16);
+
+ public:
+ static size_t MinWidth() {
+#if HWY_TARGET == HWY_SCALAR
+ // First/Last use mirrored loads of up to +/- kRadius.
+ return 2 * kRadius;
+#else
+ return Lanes(Simd()) + kRadius;
+#endif
+ }
+
+ // "Image" is ImageF or Image3F.
+ template <class Image, class Weights>
+ static void Run(const Image& in, const Rect& rect, const Weights& weights,
+ ThreadPool* pool, Image* out) {
+ PROFILER_ZONE("ConvolveT::Run");
+ JXL_CHECK(SameSize(rect, *out));
+ JXL_CHECK(rect.xsize() >= MinWidth());
+
+ static_assert(int64_t(kRadius) <= 3,
+ "Must handle [0, kRadius) and >= kRadius");
+ switch (rect.xsize() % Lanes(Simd())) {
+ case 0:
+ return RunRows<0>(in, rect, weights, pool, out);
+ case 1:
+ return RunRows<1>(in, rect, weights, pool, out);
+ case 2:
+ return RunRows<2>(in, rect, weights, pool, out);
+ default:
+ return RunRows<3>(in, rect, weights, pool, out);
+ }
+ }
+
+ private:
+ template <size_t kSizeModN, class WrapRow, class Weights>
+ static JXL_INLINE void RunRow(const float* JXL_RESTRICT in,
+ const size_t xsize, const int64_t stride,
+ const WrapRow& wrap_row, const Weights& weights,
+ float* JXL_RESTRICT out) {
+ Strategy::template ConvolveRow<kSizeModN>(in, xsize, stride, wrap_row,
+ weights, out);
+ }
+
+ template <size_t kSizeModN, class Weights>
+ static JXL_INLINE void RunBorderRows(const ImageF& in, const Rect& rect,
+ const int64_t ybegin, const int64_t yend,
+ const Weights& weights, ImageF* out) {
+ const int64_t stride = in.PixelsPerRow();
+ const WrapRowMirror wrap_row(in, rect.ysize());
+ for (int64_t y = ybegin; y < yend; ++y) {
+ RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride, wrap_row,
+ weights, out->Row(y));
+ }
+ }
+
+ // Image3F.
+ template <size_t kSizeModN, class Weights>
+ static JXL_INLINE void RunBorderRows(const Image3F& in, const Rect& rect,
+ const int64_t ybegin, const int64_t yend,
+ const Weights& weights, Image3F* out) {
+ const int64_t stride = in.PixelsPerRow();
+ for (int64_t y = ybegin; y < yend; ++y) {
+ for (size_t c = 0; c < 3; ++c) {
+ const WrapRowMirror wrap_row(in.Plane(c), rect.ysize());
+ RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(), stride,
+ wrap_row, weights, out->PlaneRow(c, y));
+ }
+ }
+ }
+
+ template <size_t kSizeModN, class Weights>
+ static JXL_INLINE void RunInteriorRows(const ImageF& in, const Rect& rect,
+ const int64_t ybegin,
+ const int64_t yend,
+ const Weights& weights,
+ ThreadPool* pool, ImageF* out) {
+ const int64_t stride = in.PixelsPerRow();
+ JXL_CHECK(RunOnPool(
+ pool, ybegin, yend, ThreadPool::NoInit,
+ [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+ RunRow<kSizeModN>(rect.ConstRow(in, y), rect.xsize(), stride,
+ WrapRowUnchanged(), weights, out->Row(y));
+ },
+ "Convolve"));
+ }
+
+ // Image3F.
+ template <size_t kSizeModN, class Weights>
+ static JXL_INLINE void RunInteriorRows(const Image3F& in, const Rect& rect,
+ const int64_t ybegin,
+ const int64_t yend,
+ const Weights& weights,
+ ThreadPool* pool, Image3F* out) {
+ const int64_t stride = in.PixelsPerRow();
+ JXL_CHECK(RunOnPool(
+ pool, ybegin, yend, ThreadPool::NoInit,
+ [&](const uint32_t y, size_t /*thread*/) HWY_ATTR {
+ for (size_t c = 0; c < 3; ++c) {
+ RunRow<kSizeModN>(rect.ConstPlaneRow(in, c, y), rect.xsize(),
+ stride, WrapRowUnchanged(), weights,
+ out->PlaneRow(c, y));
+ }
+ },
+ "Convolve3"));
+ }
+
+ template <size_t kSizeModN, class Image, class Weights>
+ static JXL_INLINE void RunRows(const Image& in, const Rect& rect,
+ const Weights& weights, ThreadPool* pool,
+ Image* out) {
+ const int64_t ysize = rect.ysize();
+ RunBorderRows<kSizeModN>(in, rect, 0, std::min(int64_t(kRadius), ysize),
+ weights, out);
+ if (ysize > 2 * int64_t(kRadius)) {
+ RunInteriorRows<kSizeModN>(in, rect, int64_t(kRadius),
+ ysize - int64_t(kRadius), weights, pool, out);
+ }
+ if (ysize > int64_t(kRadius)) {
+ RunBorderRows<kSizeModN>(in, rect, ysize - int64_t(kRadius), ysize,
+ weights, out);
+ }
+ }
+};
+
+void Symmetric3(const ImageF& in, const Rect& rect,
+ const WeightsSymmetric3& weights, ThreadPool* pool,
+ ImageF* out) {
+ using Conv = ConvolveT<strategy::Symmetric3>;
+ if (rect.xsize() >= Conv::MinWidth()) {
+ return Conv::Run(in, rect, weights, pool, out);
+ }
+
+ return SlowSymmetric3(in, rect, weights, pool, out);
+}
+
+// Symmetric5 is implemented above without ConvolveT.
+
+void Separable5(const ImageF& in, const Rect& rect,
+ const WeightsSeparable5& weights, ThreadPool* pool,
+ ImageF* out) {
+ using Conv = ConvolveT<strategy::Separable5>;
+ if (rect.xsize() >= Conv::MinWidth()) {
+ return Conv::Run(in, rect, weights, pool, out);
+ }
+
+ return SlowSeparable5(in, rect, weights, pool, out);
+}
+void Separable5_3(const Image3F& in, const Rect& rect,
+ const WeightsSeparable5& weights, ThreadPool* pool,
+ Image3F* out) {
+ using Conv = ConvolveT<strategy::Separable5>;
+ if (rect.xsize() >= Conv::MinWidth()) {
+ return Conv::Run(in, rect, weights, pool, out);
+ }
+
+ return SlowSeparable5(in, rect, weights, pool, out);
+}
+
+void Separable7(const ImageF& in, const Rect& rect,
+ const WeightsSeparable7& weights, ThreadPool* pool,
+ ImageF* out) {
+ using Conv = ConvolveT<strategy::Separable7>;
+ if (rect.xsize() >= Conv::MinWidth()) {
+ return Conv::Run(in, rect, weights, pool, out);
+ }
+
+ return SlowSeparable7(in, rect, weights, pool, out);
+}
+void Separable7_3(const Image3F& in, const Rect& rect,
+ const WeightsSeparable7& weights, ThreadPool* pool,
+ Image3F* out) {
+ using Conv = ConvolveT<strategy::Separable7>;
+ if (rect.xsize() >= Conv::MinWidth()) {
+ return Conv::Run(in, rect, weights, pool, out);
+ }
+
+ return SlowSeparable7(in, rect, weights, pool, out);
+}
+
+// Semi-vectorized (interior pixels Fonly); called directly like slow::, unlike
+// the fully vectorized strategies below.
+void Symmetric5(const ImageF& in, const Rect& rect,
+ const WeightsSymmetric5& weights, ThreadPool* pool,
+ ImageF* JXL_RESTRICT out) {
+ PROFILER_FUNC;
+
+ const size_t ysize = rect.ysize();
+ JXL_CHECK(RunOnPool(
+ pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+ [&](const uint32_t task, size_t /*thread*/) {
+ const int64_t iy = task;
+
+ if (iy < 2 || iy >= static_cast<ssize_t>(ysize) - 2) {
+ Symmetric5BorderRow(in, rect, iy, weights, out->Row(iy));
+ } else {
+ Symmetric5Row<WrapUnchanged>(in, rect, iy, weights, out->Row(iy));
+ }
+ },
+ "Symmetric5x5Convolution"));
+}
+
+void Symmetric5_3(const Image3F& in, const Rect& rect,
+ const WeightsSymmetric5& weights, ThreadPool* pool,
+ Image3F* JXL_RESTRICT out) {
+ PROFILER_FUNC;
+
+ const size_t ysize = rect.ysize();
+ JXL_CHECK(RunOnPool(
+ pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+ [&](const uint32_t task, size_t /*thread*/) {
+ const size_t iy = task;
+
+ if (iy < 2 || iy >= ysize - 2) {
+ for (size_t c = 0; c < 3; ++c) {
+ Symmetric5BorderRow(in.Plane(c), rect, iy, weights,
+ out->PlaneRow(c, iy));
+ }
+ } else {
+ for (size_t c = 0; c < 3; ++c) {
+ Symmetric5Row<WrapUnchanged>(in.Plane(c), rect, iy, weights,
+ out->PlaneRow(c, iy));
+ }
+ }
+ },
+ "Symmetric5x5Convolution3"));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(Symmetric3);
+void Symmetric3(const ImageF& in, const Rect& rect,
+ const WeightsSymmetric3& weights, ThreadPool* pool,
+ ImageF* out) {
+ return HWY_DYNAMIC_DISPATCH(Symmetric3)(in, rect, weights, pool, out);
+}
+
+HWY_EXPORT(Symmetric5);
+void Symmetric5(const ImageF& in, const Rect& rect,
+ const WeightsSymmetric5& weights, ThreadPool* pool,
+ ImageF* JXL_RESTRICT out) {
+ return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, rect, weights, pool, out);
+}
+
+HWY_EXPORT(Symmetric5_3);
+void Symmetric5_3(const Image3F& in, const Rect& rect,
+ const WeightsSymmetric5& weights, ThreadPool* pool,
+ Image3F* JXL_RESTRICT out) {
+ return HWY_DYNAMIC_DISPATCH(Symmetric5_3)(in, rect, weights, pool, out);
+}
+
+HWY_EXPORT(Separable5);
+void Separable5(const ImageF& in, const Rect& rect,
+ const WeightsSeparable5& weights, ThreadPool* pool,
+ ImageF* out) {
+ return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out);
+}
+
+HWY_EXPORT(Separable5_3);
+void Separable5_3(const Image3F& in, const Rect& rect,
+ const WeightsSeparable5& weights, ThreadPool* pool,
+ Image3F* out) {
+ return HWY_DYNAMIC_DISPATCH(Separable5_3)(in, rect, weights, pool, out);
+}
+
+HWY_EXPORT(Separable7);
+void Separable7(const ImageF& in, const Rect& rect,
+ const WeightsSeparable7& weights, ThreadPool* pool,
+ ImageF* out) {
+ return HWY_DYNAMIC_DISPATCH(Separable7)(in, rect, weights, pool, out);
+}
+
+HWY_EXPORT(Separable7_3);
+void Separable7_3(const Image3F& in, const Rect& rect,
+ const WeightsSeparable7& weights, ThreadPool* pool,
+ Image3F* out) {
+ return HWY_DYNAMIC_DISPATCH(Separable7_3)(in, rect, weights, pool, out);
+}
+
+//------------------------------------------------------------------------------
+// Kernels
+
+// Concentrates energy in low-frequency components (e.g. for antialiasing).
+const WeightsSymmetric3& WeightsSymmetric3Lowpass() {
+ // Computed by research/convolve_weights.py's cubic spline approximations of
+ // prolate spheroidal wave functions.
+ constexpr float w0 = 0.36208932f;
+ constexpr float w1 = 0.12820096f;
+ constexpr float w2 = 0.03127668f;
+ static constexpr WeightsSymmetric3 weights = {
+ {HWY_REP4(w0)}, {HWY_REP4(w1)}, {HWY_REP4(w2)}};
+ return weights;
+}
+
+const WeightsSeparable5& WeightsSeparable5Lowpass() {
+ constexpr float w0 = 0.41714928f;
+ constexpr float w1 = 0.25539268f;
+ constexpr float w2 = 0.03603267f;
+ static constexpr WeightsSeparable5 weights = {
+ {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+ {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}};
+ return weights;
+}
+
+const WeightsSymmetric5& WeightsSymmetric5Lowpass() {
+ static constexpr WeightsSymmetric5 weights = {
+ {HWY_REP4(0.1740135f)}, {HWY_REP4(0.1065369f)}, {HWY_REP4(0.0150310f)},
+ {HWY_REP4(0.0652254f)}, {HWY_REP4(0.0012984f)}, {HWY_REP4(0.0092025f)}};
+ return weights;
+}
+
+const WeightsSeparable5& WeightsSeparable5Gaussian1() {
+ constexpr float w0 = 0.38774f;
+ constexpr float w1 = 0.24477f;
+ constexpr float w2 = 0.06136f;
+ static constexpr WeightsSeparable5 weights = {
+ {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+ {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}};
+ return weights;
+}
+
+const WeightsSeparable5& WeightsSeparable5Gaussian2() {
+ constexpr float w0 = 0.250301f;
+ constexpr float w1 = 0.221461f;
+ constexpr float w2 = 0.153388f;
+ static constexpr WeightsSeparable5 weights = {
+ {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
+ {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)}};
+ return weights;
+}
+
+//------------------------------------------------------------------------------
+// Slow
+
+namespace {
+
+template <class WrapX, class WrapY>
+float SlowSymmetric3Pixel(const ImageF& in, const int64_t ix, const int64_t iy,
+ const int64_t xsize, const int64_t ysize,
+ const WeightsSymmetric3& weights) {
+ float sum = 0.0f;
+
+ // ix: image; kx: kernel
+ for (int64_t ky = -1; ky <= 1; ky++) {
+ const int64_t y = WrapY()(iy + ky, ysize);
+ const float* JXL_RESTRICT row_in = in.ConstRow(static_cast<size_t>(y));
+
+ const float wc = ky == 0 ? weights.c[0] : weights.r[0];
+ const float wlr = ky == 0 ? weights.r[0] : weights.d[0];
+
+ const int64_t xm1 = WrapX()(ix - 1, xsize);
+ const int64_t xp1 = WrapX()(ix + 1, xsize);
+ sum += row_in[ix] * wc + (row_in[xm1] + row_in[xp1]) * wlr;
+ }
+ return sum;
+}
+
+template <class WrapY>
+void SlowSymmetric3Row(const ImageF& in, const int64_t iy, const int64_t xsize,
+ const int64_t ysize, const WeightsSymmetric3& weights,
+ float* JXL_RESTRICT row_out) {
+ row_out[0] =
+ SlowSymmetric3Pixel<WrapMirror, WrapY>(in, 0, iy, xsize, ysize, weights);
+ for (int64_t ix = 1; ix < xsize - 1; ix++) {
+ row_out[ix] = SlowSymmetric3Pixel<WrapUnchanged, WrapY>(in, ix, iy, xsize,
+ ysize, weights);
+ }
+ {
+ const int64_t ix = xsize - 1;
+ row_out[ix] = SlowSymmetric3Pixel<WrapMirror, WrapY>(in, ix, iy, xsize,
+ ysize, weights);
+ }
+}
+
+} // namespace
+
+void SlowSymmetric3(const ImageF& in, const Rect& rect,
+ const WeightsSymmetric3& weights, ThreadPool* pool,
+ ImageF* JXL_RESTRICT out) {
+ PROFILER_FUNC;
+
+ const int64_t xsize = static_cast<int64_t>(rect.xsize());
+ const int64_t ysize = static_cast<int64_t>(rect.ysize());
+ const int64_t kRadius = 1;
+
+ JXL_CHECK(RunOnPool(
+ pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+ [&](const uint32_t task, size_t /*thread*/) {
+ const int64_t iy = task;
+ float* JXL_RESTRICT out_row = out->Row(static_cast<size_t>(iy));
+
+ if (iy < kRadius || iy >= ysize - kRadius) {
+ SlowSymmetric3Row<WrapMirror>(in, iy, xsize, ysize, weights, out_row);
+ } else {
+ SlowSymmetric3Row<WrapUnchanged>(in, iy, xsize, ysize, weights,
+ out_row);
+ }
+ },
+ "SlowSymmetric3"));
+}
+
+void SlowSymmetric3(const Image3F& in, const Rect& rect,
+ const WeightsSymmetric3& weights, ThreadPool* pool,
+ Image3F* JXL_RESTRICT out) {
+ PROFILER_FUNC;
+
+ const int64_t xsize = static_cast<int64_t>(rect.xsize());
+ const int64_t ysize = static_cast<int64_t>(rect.ysize());
+ const int64_t kRadius = 1;
+
+ JXL_CHECK(RunOnPool(
+ pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+ [&](const uint32_t task, size_t /*thread*/) {
+ const int64_t iy = task;
+ const size_t oy = static_cast<size_t>(iy);
+
+ if (iy < kRadius || iy >= ysize - kRadius) {
+ for (size_t c = 0; c < 3; ++c) {
+ SlowSymmetric3Row<WrapMirror>(in.Plane(c), iy, xsize, ysize,
+ weights, out->PlaneRow(c, oy));
+ }
+ } else {
+ for (size_t c = 0; c < 3; ++c) {
+ SlowSymmetric3Row<WrapUnchanged>(in.Plane(c), iy, xsize, ysize,
+ weights, out->PlaneRow(c, oy));
+ }
+ }
+ },
+ "SlowSymmetric3"));
+}
+
+namespace {
+
+// Separable kernels, any radius.
+float SlowSeparablePixel(const ImageF& in, const Rect& rect, const int64_t x,
+ const int64_t y, const int64_t radius,
+ const float* JXL_RESTRICT horz_weights,
+ const float* JXL_RESTRICT vert_weights) {
+ const size_t xsize = rect.xsize();
+ const size_t ysize = rect.ysize();
+ const WrapMirror wrap;
+
+ float mul = 0.0f;
+ for (int dy = -radius; dy <= radius; ++dy) {
+ const float wy = vert_weights[std::abs(dy) * 4];
+ const size_t sy = wrap(y + dy, ysize);
+ JXL_CHECK(sy < ysize);
+ const float* const JXL_RESTRICT row = rect.ConstRow(in, sy);
+ for (int dx = -radius; dx <= radius; ++dx) {
+ const float wx = horz_weights[std::abs(dx) * 4];
+ const size_t sx = wrap(x + dx, xsize);
+ JXL_CHECK(sx < xsize);
+ mul += row[sx] * wx * wy;
+ }
+ }
+ return mul;
+}
+
+} // namespace
+
+void SlowSeparable5(const ImageF& in, const Rect& rect,
+ const WeightsSeparable5& weights, ThreadPool* pool,
+ ImageF* out) {
+ PROFILER_FUNC;
+ const float* horz_weights = &weights.horz[0];
+ const float* vert_weights = &weights.vert[0];
+
+ const size_t ysize = rect.ysize();
+ JXL_CHECK(RunOnPool(
+ pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+ [&](const uint32_t task, size_t /*thread*/) {
+ const int64_t y = task;
+
+ float* const JXL_RESTRICT row_out = out->Row(y);
+ for (size_t x = 0; x < rect.xsize(); ++x) {
+ row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/2,
+ horz_weights, vert_weights);
+ }
+ },
+ "SlowSeparable5"));
+}
+
+void SlowSeparable5(const Image3F& in, const Rect& rect,
+ const WeightsSeparable5& weights, ThreadPool* pool,
+ Image3F* out) {
+ for (size_t c = 0; c < 3; ++c) {
+ SlowSeparable5(in.Plane(c), rect, weights, pool, &out->Plane(c));
+ }
+}
+
+void SlowSeparable7(const ImageF& in, const Rect& rect,
+ const WeightsSeparable7& weights, ThreadPool* pool,
+ ImageF* out) {
+ PROFILER_FUNC;
+ const float* horz_weights = &weights.horz[0];
+ const float* vert_weights = &weights.vert[0];
+
+ const size_t ysize = rect.ysize();
+ JXL_CHECK(RunOnPool(
+ pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+ [&](const uint32_t task, size_t /*thread*/) {
+ const int64_t y = task;
+
+ float* const JXL_RESTRICT row_out = out->Row(y);
+ for (size_t x = 0; x < rect.xsize(); ++x) {
+ row_out[x] = SlowSeparablePixel(in, rect, x, y, /*radius=*/3,
+ horz_weights, vert_weights);
+ }
+ },
+ "SlowSeparable7"));
+}
+
+void SlowSeparable7(const Image3F& in, const Rect& rect,
+ const WeightsSeparable7& weights, ThreadPool* pool,
+ Image3F* out) {
+ for (size_t c = 0; c < 3; ++c) {
+ SlowSeparable7(in.Plane(c), rect, weights, pool, &out->Plane(c));
+ }
+}
+
+void SlowLaplacian5(const ImageF& in, const Rect& rect, ThreadPool* pool,
+ ImageF* out) {
+ PROFILER_FUNC;
+ JXL_CHECK(SameSize(rect, *out));
+
+ const size_t xsize = rect.xsize();
+ const size_t ysize = rect.ysize();
+ const WrapMirror wrap;
+
+ JXL_CHECK(RunOnPool(
+ pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
+ [&](const uint32_t task, size_t /*thread*/) {
+ const int64_t y = task;
+
+ const float* const JXL_RESTRICT row_t =
+ rect.ConstRow(in, wrap(y - 2, ysize));
+ const float* const JXL_RESTRICT row_m = rect.ConstRow(in, y);
+ const float* const JXL_RESTRICT row_b =
+ rect.ConstRow(in, wrap(y + 2, ysize));
+ float* const JXL_RESTRICT row_out = out->Row(y);
+
+ for (int64_t x = 0; static_cast<size_t>(x) < xsize; ++x) {
+ const int64_t xm2 = wrap(x - 2, xsize);
+ const int64_t xp2 = wrap(x + 2, xsize);
+ float r = 0.0f;
+ r += /* */ 1.0f * row_t[x];
+ r += 1.0f * row_m[xm2] - 4.0f * row_m[x] + 1.0f * row_m[xp2];
+ r += /* */ 1.0f * row_b[x];
+ row_out[x] = r;
+ }
+ },
+ "SlowLaplacian5"));
+}
+
+void SlowLaplacian5(const Image3F& in, const Rect& rect, ThreadPool* pool,
+ Image3F* out) {
+ for (size_t c = 0; c < 3; ++c) {
+ SlowLaplacian5(in.Plane(c), rect, pool, &out->Plane(c));
+ }
+}
+
+} // namespace jxl
+#endif // HWY_ONCE