diff options
Diffstat (limited to 'media/highway/src/hwy/contrib')
46 files changed, 9530 insertions, 0 deletions
diff --git a/media/highway/src/hwy/contrib/algo/copy-inl.h b/media/highway/src/hwy/contrib/algo/copy-inl.h new file mode 100644 index 0000000000..34e926a915 --- /dev/null +++ b/media/highway/src/hwy/contrib/algo/copy-inl.h @@ -0,0 +1,138 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ +#endif + +#include <string.h> // memcpy + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// These functions avoid having to write a loop plus remainder handling in the +// (unfortunately still common) case where arrays are not aligned/padded. If the +// inputs are known to be aligned/padded, it is more efficient to write a single +// loop using Load(). We do not provide a CopyAlignedPadded because it +// would be more verbose than such a loop. + +// Fills `to`[0, `count`) with `value`. 
+template <class D, typename T = TFromD<D>> +void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) { + const size_t N = Lanes(d); + const Vec<D> v = Set(d, value); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + StoreU(v, d, to + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + SafeFillN(remaining, value, d, to + idx); +} + +// Copies `from`[0, `count`) to `to`, which must not overlap `from`. +template <class D, typename T = TFromD<D>> +void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec<D> v = LoadU(d, from + idx); + StoreU(v, d, to + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + SafeCopyN(remaining, d, from + idx, to + idx); +} + +// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the +// corresponding mask element of `func(d, v)` is true. Returns the STL-style end +// of the newly written elements in `to`. +// +// `func` is either a functor with a templated operator()(d, v) returning a +// mask, or a generic lambda if using C++14. Due to apparent limitations of +// Clang on Windows, it is currently necessary to add HWY_ATTR before the +// opening { of the lambda to avoid errors about "function .. requires target". +// +// NOTE: this is only supported for 16-, 32- or 64-bit types. +// NOTE: Func may be called a second time for elements it has already seen, but +// these elements will not be written to `to` again. 
+template <class D, class Func, typename T = TFromD<D>> +T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to, + const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec<D> v = LoadU(d, from + idx); + to += CompressBlendedStore(v, func(d, v), d, to); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return to; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag<T, 1> d1; + for (; idx < count; ++idx) { + using V1 = Vec<decltype(d1)>; + // Workaround for -Waggressive-loop-optimizations on GCC 8 + // (iteration 2305843009213693951 invokes undefined behavior for T=i64) + const uintptr_t addr = reinterpret_cast<uintptr_t>(from); + const T* HWY_RESTRICT from_idx = + reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T))); + const V1 v = LoadU(d1, from_idx); + // Avoid storing to `to` unless we know it should be kept - otherwise, we + // might overrun the end if it was allocated for the exact count. + if (CountTrue(d1, func(d1, v)) == 0) continue; + StoreU(v, d1, to); + to += 1; + } +#else + // Start index of the last unaligned whole vector, ending at the array end. + const size_t last = count - N; + // Number of elements before `from` or already written. 
+ const size_t invalid = idx - last; + HWY_DASSERT(0 != invalid && invalid < N); + const Mask<D> mask = Not(FirstN(d, invalid)); + const Vec<D> v = MaskedLoad(mask, d, from + last); + to += CompressBlendedStore(v, And(mask, func(d, v)), d, to); +#endif + return to; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ diff --git a/media/highway/src/hwy/contrib/algo/copy_test.cc b/media/highway/src/hwy/contrib/algo/copy_test.cc new file mode 100644 index 0000000000..8e55cd5bec --- /dev/null +++ b/media/highway/src/hwy/contrib/algo/copy_test.cc @@ -0,0 +1,199 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/aligned_allocator.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc" +#include "hwy/foreach_target.h" + +#include "hwy/contrib/algo/copy-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to Transform, without requiring an lvalue as we do here for C++11. 
+#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns random integer in [0, 128), which fits in any lane type. +template <typename T> +T Random7Bit(RandomState& rng) { + return static_cast<T>(Random32(&rng) & 127); +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +struct IsOdd { + template <class D, class V> + Mask<D> operator()(D d, V v) const { + return TestBit(v, Set(d, TFromD<D>{1})); + } +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestCopyIf) with all arg combinations. T comes from +// ForFloatTypes. +template <class Test> +struct ForeachCountAndMisalign { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + for (size_t count = 0; count < 2 * N; ++count) { + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test()(d, count, ma, mb, rng); + } + } + } + } +}; + +struct TestFill { + template <class D> + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD<D>; + // HWY_MAX prevents error when misalign == count == 0. 
+ AlignedFreeUniquePtr<T[]> pa = + AllocateAligned<T>(HWY_MAX(1, misalign_a + count)); + T* expected = pa.get() + misalign_a; + const T value = Random7Bit<T>(rng); + for (size_t i = 0; i < count; ++i) { + expected[i] = value; + } + AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + count + 1); + T* actual = pb.get() + misalign_b; + + actual[count] = T{0}; // sentinel + Fill(d, value, count, actual); + HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end + + const auto info = hwy::detail::MakeTypeInfo<T>(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name, + __FILE__, __LINE__); + } +}; + +void TestAllFill() { + ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFill>>()); +} + +struct TestCopy { + template <class D> + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD<D>; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr<T[]> pa = + AllocateAligned<T>(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random7Bit<T>(rng); + } + AlignedFreeUniquePtr<T[]> pb = + AllocateAligned<T>(HWY_MAX(1, misalign_b + count)); + T* b = pb.get() + misalign_b; + + Copy(d, a, count, b); + + const auto info = hwy::detail::MakeTypeInfo<T>(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__, + __LINE__); + } +}; + +void TestAllCopy() { + ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestCopy>>()); +} + +struct TestCopyIf { + template <class D> + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD<D>; + // Prevents error if size to allocate is zero. 
+ AlignedFreeUniquePtr<T[]> pa = + AllocateAligned<T>(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random7Bit<T>(rng); + } + const size_t padding = Lanes(ScalableTag<T>()); + AlignedFreeUniquePtr<T[]> pb = + AllocateAligned<T>(HWY_MAX(1, misalign_b + count + padding)); + T* b = pb.get() + misalign_b; + + AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count)); + size_t num_odd = 0; + for (size_t i = 0; i < count; ++i) { + if (a[i] & 1) { + expected[num_odd++] = a[i]; + } + } + +#if HWY_GENERIC_LAMBDA + const auto is_odd = [](const auto d, const auto v) HWY_ATTR { + return TestBit(v, Set(d, TFromD<decltype(d)>{1})); + }; +#else + const IsOdd is_odd; +#endif + T* end = CopyIf(d, a, count, b, is_odd); + const size_t num_written = static_cast<size_t>(end - b); + HWY_ASSERT_EQ(num_odd, num_written); + + const auto info = hwy::detail::MakeTypeInfo<T>(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name, + __FILE__, __LINE__); + } +}; + +void TestAllCopyIf() { + ForUI163264(ForPartialVectors<ForeachCountAndMisalign<TestCopyIf>>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(CopyTest); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy); +HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf); +} // namespace hwy + +#endif diff --git a/media/highway/src/hwy/contrib/algo/find-inl.h b/media/highway/src/hwy/contrib/algo/find-inl.h new file mode 100644 index 0000000000..388842e988 --- /dev/null +++ b/media/highway/src/hwy/contrib/algo/find-inl.h @@ -0,0 +1,109 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns index of the first element equal to `value` in `in[0, count)`, or +// `count` if not found. +template <class D, typename T = TFromD<D>> +size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) { + const size_t N = Lanes(d); + const Vec<D> broadcasted = Set(d, value); + + size_t i = 0; + for (; i + N <= count; i += N) { + const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i))); + if (pos >= 0) return i + static_cast<size_t>(pos); + } + + if (i != count) { +#if HWY_MEM_OPS_MIGHT_FAULT + // Scan single elements. + const CappedTag<T, 1> d1; + using V1 = Vec<decltype(d1)>; + const V1 broadcasted1 = Set(d1, GetLane(broadcasted)); + for (; i < count; ++i) { + if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) { + return i; + } + } +#else + const size_t remaining = count - i; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask<D> mask = FirstN(d, remaining); + const Vec<D> v = MaskedLoad(mask, d, in + i); + // Apply mask so that we don't 'find' the zero-padding from MaskedLoad. 
+ const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask)); + if (pos >= 0) return i + static_cast<size_t>(pos); +#endif // HWY_MEM_OPS_MIGHT_FAULT + } + + return count; // not found +} + +// Returns index of the first element in `in[0, count)` for which `func(d, vec)` +// returns true, otherwise `count`. +template <class D, class Func, typename T = TFromD<D>> +size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) { + const size_t N = Lanes(d); + + size_t i = 0; + for (; i + N <= count; i += N) { + const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i))); + if (pos >= 0) return i + static_cast<size_t>(pos); + } + + if (i != count) { +#if HWY_MEM_OPS_MIGHT_FAULT + // Scan single elements. + const CappedTag<T, 1> d1; + for (; i < count; ++i) { + if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) { + return i; + } + } +#else + const size_t remaining = count - i; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask<D> mask = FirstN(d, remaining); + const Vec<D> v = MaskedLoad(mask, d, in + i); + // Apply mask so that we don't 'find' the zero-padding from MaskedLoad. + const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask)); + if (pos >= 0) return i + static_cast<size_t>(pos); +#endif // HWY_MEM_OPS_MIGHT_FAULT + } + + return count; // not found +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_ diff --git a/media/highway/src/hwy/contrib/algo/find_test.cc b/media/highway/src/hwy/contrib/algo/find_test.cc new file mode 100644 index 0000000000..8caf7e1512 --- /dev/null +++ b/media/highway/src/hwy/contrib/algo/find_test.cc @@ -0,0 +1,219 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <algorithm> +#include <vector> + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/print.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc" +#include "hwy/foreach_target.h" + +#include "hwy/contrib/algo/find-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to FindIf, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Returns random number in [-8, 8) - we use knowledge of the range to Find() +// values we know are not present. +template <typename T> +T Random(RandomState& rng) { + const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023; + const double val = (bits - 512) / 64.0; + // Clamp negative to zero for unsigned types. + return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val)); +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +class GreaterThan { + public: + GreaterThan(int val) : val_(val) {} + template <class D, class V> + Mask<D> operator()(D d, V v) const { + return Gt(v, Set(d, static_cast<TFromD<D>>(val_))); + } + + private: + int val_; +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestFind) with all arg combinations. 
+template <class Test> +struct ForeachCountAndMisalign { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + // Find() checks 8 vectors at a time, so we want to cover a fairly large + // range without oversampling (checking every possible count). + std::vector<size_t> counts(AdjustedReps(512)); + for (size_t& count : counts) { + count = static_cast<size_t>(rng()) % (16 * N + 1); + } + counts[0] = 0; // ensure we test count=0. + + for (size_t count : counts) { + for (size_t m : misalignments) { + Test()(d, count, m, rng); + } + } + } +}; + +struct TestFind { + template <class D> + void operator()(D d, size_t count, size_t misalign, RandomState& rng) { + using T = TFromD<D>; + // Must allocate at least one even if count is zero. + AlignedFreeUniquePtr<T[]> storage = + AllocateAligned<T>(HWY_MAX(1, misalign + count)); + T* in = storage.get() + misalign; + for (size_t i = 0; i < count; ++i) { + in[i] = Random<T>(rng); + } + + // For each position, search for that element (which we know is there) + for (size_t pos = 0; pos < count; ++pos) { + const size_t actual = Find(d, in[pos], in, count); + + // We may have found an earlier occurrence of the same value; ensure the + // value is the same, and that it is the first. 
+ if (!IsEqual(in[pos], in[actual])) { + fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count), + static_cast<double>(in[actual]), static_cast<int>(actual), + static_cast<double>(in[pos])); + HWY_ASSERT(false); + } + for (size_t i = 0; i < actual; ++i) { + if (IsEqual(in[i], in[pos])) { + fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count), + static_cast<double>(in[i]), static_cast<int>(i), + static_cast<int>(actual)); + HWY_ASSERT(false); + } + } + } + + // Also search for values we know not to be present (out of range) + HWY_ASSERT_EQ(count, Find(d, T{9}, in, count)); + HWY_ASSERT_EQ(count, Find(d, static_cast<T>(-9), in, count)); + } +}; + +void TestAllFind() { + ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFind>>()); +} + +struct TestFindIf { + template <class D> + void operator()(D d, size_t count, size_t misalign, RandomState& rng) { + using T = TFromD<D>; + using TI = MakeSigned<T>; + // Must allocate at least one even if count is zero. + AlignedFreeUniquePtr<T[]> storage = + AllocateAligned<T>(HWY_MAX(1, misalign + count)); + T* in = storage.get() + misalign; + for (size_t i = 0; i < count; ++i) { + in[i] = Random<T>(rng); + HWY_ASSERT(in[i] < 8); + HWY_ASSERT(!hwy::IsSigned<T>() || static_cast<TI>(in[i]) >= -8); + } + + bool found_any = false; + bool not_found_any = false; + + // unsigned T would be promoted to signed and compare greater than any + // negative val, whereas Set() would just cast to an unsigned value and the + // comparison remains unsigned, so avoid negative numbers there. + const int min_val = IsSigned<T>() ? -9 : 0; + // Includes out-of-range value 9 to test the not-found path. 
+ for (int val = min_val; val <= 9; ++val) { +#if HWY_GENERIC_LAMBDA + const auto greater = [val](const auto d, const auto v) HWY_ATTR { + return Gt(v, Set(d, static_cast<T>(val))); + }; +#else + const GreaterThan greater(val); +#endif + const size_t actual = FindIf(d, in, count, greater); + found_any |= actual < count; + not_found_any |= actual == count; + + const auto pos = std::find_if( + in, in + count, [val](T x) { return x > static_cast<T>(val); }); + // Convert returned iterator to index. + const size_t expected = static_cast<size_t>(pos - in); + if (expected != actual) { + fprintf(stderr, "%s count %d val %d, expected %d actual %d\n", + hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count), + val, static_cast<int>(expected), static_cast<int>(actual)); + hwy::detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "in", in, count, + 0, count); + HWY_ASSERT(false); + } + } + + // We will always not-find something due to val=9. + HWY_ASSERT(not_found_any); + // We'll find something unless the input is empty or {0} - because 0 > i + // is false for all i=[0,9]. 
+ if (count != 0 && in[0] != 0) { + HWY_ASSERT(found_any); + } + } +}; + +void TestAllFindIf() { + ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFindIf>>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(FindTest); +HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind); +HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf); +} // namespace hwy + +#endif diff --git a/media/highway/src/hwy/contrib/algo/transform-inl.h b/media/highway/src/hwy/contrib/algo/transform-inl.h new file mode 100644 index 0000000000..3e830acb47 --- /dev/null +++ b/media/highway/src/hwy/contrib/algo/transform-inl.h @@ -0,0 +1,262 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target include guard +#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// These functions avoid having to write a loop plus remainder handling in the +// (unfortunately still common) case where arrays are not aligned/padded. 
If the +// inputs are known to be aligned/padded, it is more efficient to write a single +// loop using Load(). We do not provide a TransformAlignedPadded because it +// would be more verbose than such a loop. +// +// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a +// generic lambda if using C++14. Due to apparent limitations of Clang on +// Windows, it is currently necessary to add HWY_ATTR before the opening { of +// the lambda to avoid errors about "always_inline function .. requires target". +// +// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise, +// we used `MaskedLoad` and `BlendedStore` to read/write the final partial +// vector. + +// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`, +// where `index_vec` is `Vec<RebindToUnsigned<D>>`. On the first call to `func`, +// the value of its lane i is i, and increases by `Lanes(d)` after every call. +// Note that some of these indices may be `>= count`, but the elements that +// `func` returns in those lanes will not be written to `out`. +template <class D, class Func, typename T = TFromD<D>> +void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) { + const RebindToUnsigned<D> du; + using TU = TFromD<decltype(du)>; + const size_t N = Lanes(d); + + size_t idx = 0; + Vec<decltype(du)> vidx = Iota(du, 0); + for (; idx + N <= count; idx += N) { + StoreU(func(d, vidx), d, out + idx); + vidx = Add(vidx, Set(du, static_cast<TU>(N))); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. 
+ const CappedTag<T, 1> d1; + const RebindToUnsigned<decltype(d1)> du1; + for (; idx < count; ++idx) { + StoreU(func(d1, Set(du1, static_cast<TU>(idx))), d1, out + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask<D> mask = FirstN(d, remaining); + BlendedStore(func(d, vidx), mask, d, out + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying +// array elements by a constant. +template <class D, class Func, typename T = TFromD<D>> +void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec<D> v = LoadU(d, inout + idx); + StoreU(func(d, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag<T, 1> d1; + for (; idx < count; ++idx) { + using V1 = Vec<decltype(d1)>; + const V1 v = LoadU(d1, inout + idx); + StoreU(func(d1, v), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask<D> mask = FirstN(d, remaining); + const Vec<D> v = MaskedLoad(mask, d, inout + idx); + BlendedStore(func(d, v), mask, d, inout + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage: +// multiplying array elements by those of another array. +template <class D, class Func, typename T = TFromD<D>> +void Transform1(D d, T* HWY_RESTRICT inout, size_t count, + const T* HWY_RESTRICT in1, const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec<D> v = LoadU(d, inout + idx); + const Vec<D> v1 = LoadU(d, in1 + idx); + StoreU(func(d, v, v1), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. 
+ if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. + const CappedTag<T, 1> d1; + for (; idx < count; ++idx) { + using V1 = Vec<decltype(d1)>; + const V1 v = LoadU(d1, inout + idx); + const V1 v1 = LoadU(d1, in1 + idx); + StoreU(func(d1, v, v1), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask<D> mask = FirstN(d, remaining); + const Vec<D> v = MaskedLoad(mask, d, inout + idx); + const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx); + BlendedStore(func(d, v, v1), mask, d, inout + idx); +#endif +} + +// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example +// usage: FMA of elements from three arrays, stored into the first array. +template <class D, class Func, typename T = TFromD<D>> +void Transform2(D d, T* HWY_RESTRICT inout, size_t count, + const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2, + const Func& func) { + const size_t N = Lanes(d); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + const Vec<D> v = LoadU(d, inout + idx); + const Vec<D> v1 = LoadU(d, in1 + idx); + const Vec<D> v2 = LoadU(d, in2 + idx); + StoreU(func(d, v, v1, v2), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. 
+ const CappedTag<T, 1> d1; + for (; idx < count; ++idx) { + using V1 = Vec<decltype(d1)>; + const V1 v = LoadU(d1, inout + idx); + const V1 v1 = LoadU(d1, in1 + idx); + const V1 v2 = LoadU(d1, in2 + idx); + StoreU(func(d1, v, v1, v2), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask<D> mask = FirstN(d, remaining); + const Vec<D> v = MaskedLoad(mask, d, inout + idx); + const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx); + const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx); + BlendedStore(func(d, v, v1, v2), mask, d, inout + idx); +#endif +} + +template <class D, typename T = TFromD<D>> +void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) { + const size_t N = Lanes(d); + const Vec<D> old_v = Set(d, old_t); + const Vec<D> new_v = Set(d, new_t); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + Vec<D> v = LoadU(d, inout + idx); + StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. 
+ const CappedTag<T, 1> d1; + const Vec<decltype(d1)> old_v1 = Set(d1, old_t); + const Vec<decltype(d1)> new_v1 = Set(d1, new_t); + for (; idx < count; ++idx) { + using V1 = Vec<decltype(d1)>; + const V1 v1 = LoadU(d1, inout + idx); + StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask<D> mask = FirstN(d, remaining); + const Vec<D> v = MaskedLoad(mask, d, inout + idx); + BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx); +#endif +} + +template <class D, class Func, typename T = TFromD<D>> +void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t, + const Func& func) { + const size_t N = Lanes(d); + const Vec<D> new_v = Set(d, new_t); + + size_t idx = 0; + for (; idx + N <= count; idx += N) { + Vec<D> v = LoadU(d, inout + idx); + StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx); + } + + // `count` was a multiple of the vector length `N`: already done. + if (HWY_UNLIKELY(idx == count)) return; + +#if HWY_MEM_OPS_MIGHT_FAULT + // Proceed one by one. 
+ const CappedTag<T, 1> d1; + const Vec<decltype(d1)> new_v1 = Set(d1, new_t); + for (; idx < count; ++idx) { + using V1 = Vec<decltype(d1)>; + const V1 v = LoadU(d1, inout + idx); + StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx); + } +#else + const size_t remaining = count - idx; + HWY_DASSERT(0 != remaining && remaining < N); + const Mask<D> mask = FirstN(d, remaining); + const Vec<D> v = MaskedLoad(mask, d, inout + idx); + BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx); +#endif +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_ diff --git a/media/highway/src/hwy/contrib/algo/transform_test.cc b/media/highway/src/hwy/contrib/algo/transform_test.cc new file mode 100644 index 0000000000..52373cca6c --- /dev/null +++ b/media/highway/src/hwy/contrib/algo/transform_test.cc @@ -0,0 +1,372 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <string.h> + +#include "hwy/aligned_allocator.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc" +#include "hwy/foreach_target.h" + +#include "hwy/contrib/algo/transform-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +// If your project requires C++14 or later, you can ignore this and pass lambdas +// directly to Transform, without requiring an lvalue as we do here for C++11. +#if __cplusplus < 201402L +#define HWY_GENERIC_LAMBDA 0 +#else +#define HWY_GENERIC_LAMBDA 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template <typename T> +T Alpha() { + return static_cast<T>(1.5); // arbitrary scalar +} + +// Returns random floating-point number in [-8, 8) to ensure computations do +// not exceed float32 precision. +template <typename T> +T Random(RandomState& rng) { + const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023; + const double val = (bits - 512) / 64.0; + // Clamp negative to zero for unsigned types. + return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val)); +} + +// SCAL, AXPY names are from BLAS. +template <typename T> +HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = Alpha<T>() * x[i]; + } +} + +template <typename T> +HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = Alpha<T>() * x[i] + y[i]; + } +} + +template <typename T> +HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out, + size_t count) { + for (size_t i = 0; i < count; ++i) { + out[i] = x[i] * y[i] + z[i]; + } +} + +// In C++14, we can instead define these as generic lambdas next to where they +// are invoked. +#if !HWY_GENERIC_LAMBDA + +// Generator that returns even numbers by doubling the output indices. 
+struct Gen2 { + template <class D, class VU> + Vec<D> operator()(D d, VU vidx) const { + return BitCast(d, Add(vidx, vidx)); + } +}; + +struct SCAL { + template <class D, class V> + Vec<D> operator()(D d, V v) const { + using T = TFromD<D>; + return Mul(Set(d, Alpha<T>()), v); + } +}; + +struct AXPY { + template <class D, class V> + Vec<D> operator()(D d, V v, V v1) const { + using T = TFromD<D>; + return MulAdd(Set(d, Alpha<T>()), v, v1); + } +}; + +struct FMA4 { + template <class D, class V> + Vec<D> operator()(D /*d*/, V v, V v1, V v2) const { + return MulAdd(v, v1, v2); + } +}; + +#endif // !HWY_GENERIC_LAMBDA + +// Invokes Test (e.g. TestTransform1) with all arg combinations. T comes from +// ForFloatTypes. +template <class Test> +struct ForeachCountAndMisalign { + template <typename T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) const { + RandomState rng; + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + + for (size_t count = 0; count < 2 * N; ++count) { + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test()(d, count, ma, mb, rng); + } + } + } + } +}; + +// Output-only, no loads +struct TestGenerate { + template <class D> + void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/, + RandomState& /*rng*/) { + using T = TFromD<D>; + AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count + 1); + T* actual = pa.get() + misalign_a; + + AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count)); + for (size_t i = 0; i < count; ++i) { + expected[i] = static_cast<T>(2 * i); + } + + // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that + // the attribute also applies to lambdas? If so, remove HWY_ATTR. 
+#if HWY_GENERIC_LAMBDA + const auto gen2 = [](const auto d, const auto vidx) + HWY_ATTR { return BitCast(d, Add(vidx, vidx)); }; +#else + const Gen2 gen2; +#endif + actual[count] = T{0}; // sentinel + Generate(d, actual, count, gen2); + HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end + + const auto info = hwy::detail::MakeTypeInfo<T>(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), actual, count, + target_name, __FILE__, __LINE__); + } +}; + +// Zero extra input arrays +struct TestTransform { + template <class D> + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + if (misalign_b != 0) return; + using T = TFromD<D>; + // Prevents error if size to allocate is zero. + AlignedFreeUniquePtr<T[]> pa = + AllocateAligned<T>(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random<T>(rng); + } + + AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count)); + SimpleSCAL(a, expected.get(), count); + + // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that + // the attribute also applies to lambdas? If so, remove HWY_ATTR. +#if HWY_GENERIC_LAMBDA + const auto scal = [](const auto d, const auto v) + HWY_ATTR { return Mul(Set(d, Alpha<T>()), v); }; +#else + const SCAL scal; +#endif + Transform(d, a, count, scal); + + const auto info = hwy::detail::MakeTypeInfo<T>(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +// One extra input array +struct TestTransform1 { + template <class D> + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD<D>; + // Prevents error if size to allocate is zero. 
+ AlignedFreeUniquePtr<T[]> pa = + AllocateAligned<T>(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr<T[]> pb = + AllocateAligned<T>(HWY_MAX(1, misalign_b + count)); + T* a = pa.get() + misalign_a; + T* b = pb.get() + misalign_b; + for (size_t i = 0; i < count; ++i) { + a[i] = Random<T>(rng); + b[i] = Random<T>(rng); + } + + AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count)); + SimpleAXPY(a, b, expected.get(), count); + +#if HWY_GENERIC_LAMBDA + const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR { + return MulAdd(Set(d, Alpha<T>()), v, v1); + }; +#else + const AXPY axpy; +#endif + Transform1(d, a, count, b, axpy); + + const auto info = hwy::detail::MakeTypeInfo<T>(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +// Two extra input arrays +struct TestTransform2 { + template <class D> + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD<D>; + // Prevents error if size to allocate is zero. 
+ AlignedFreeUniquePtr<T[]> pa = + AllocateAligned<T>(HWY_MAX(1, misalign_a + count)); + AlignedFreeUniquePtr<T[]> pb = + AllocateAligned<T>(HWY_MAX(1, misalign_b + count)); + AlignedFreeUniquePtr<T[]> pc = + AllocateAligned<T>(HWY_MAX(1, misalign_a + count)); + T* a = pa.get() + misalign_a; + T* b = pb.get() + misalign_b; + T* c = pc.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random<T>(rng); + b[i] = Random<T>(rng); + c[i] = Random<T>(rng); + } + + AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count)); + SimpleFMA4(a, b, c, expected.get(), count); + +#if HWY_GENERIC_LAMBDA + const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2) + HWY_ATTR { return MulAdd(v, v1, v2); }; +#else + const FMA4 fma4; +#endif + Transform2(d, a, count, b, c, fma4); + + const auto info = hwy::detail::MakeTypeInfo<T>(); + const char* target_name = hwy::TargetName(HWY_TARGET); + hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name, + __FILE__, __LINE__); + } +}; + +template <typename T> +class IfEq { + public: + IfEq(T val) : val_(val) {} + + template <class D, class V> + Mask<D> operator()(D d, V v) const { + return Eq(v, Set(d, val_)); + } + + private: + T val_; +}; + +struct TestReplace { + template <class D> + void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + if (misalign_b != 0) return; + if (count == 0) return; + using T = TFromD<D>; + AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count); + T* a = pa.get() + misalign_a; + for (size_t i = 0; i < count; ++i) { + a[i] = Random<T>(rng); + } + AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(count); + + AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(count); + + std::vector<size_t> positions(AdjustedReps(count)); + for (size_t& pos : positions) { + pos = static_cast<size_t>(rng()) % count; + } + + for (size_t pos = 0; pos < count; ++pos) { + const T old_t = a[pos]; + const T new_t = Random<T>(rng); 
+ for (size_t i = 0; i < count; ++i) { + expected[i] = IsEqual(a[i], old_t) ? new_t : a[i]; + } + + // Copy so ReplaceIf gets the same input (and thus also outputs expected) + memcpy(pb.get(), a, count * sizeof(T)); + + Replace(d, a, count, new_t, old_t); + HWY_ASSERT_ARRAY_EQ(expected.get(), a, count); + + ReplaceIf(d, pb.get(), count, new_t, IfEq<T>(old_t)); + HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count); + } + } +}; + +void TestAllGenerate() { + // The test BitCast-s the indices, which does not work for floats. + ForIntegerTypes(ForPartialVectors<ForeachCountAndMisalign<TestGenerate>>()); +} + +void TestAllTransform() { + ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform>>()); +} + +void TestAllTransform1() { + ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform1>>()); +} + +void TestAllTransform2() { + ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform2>>()); +} + +void TestAllReplace() { + ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestReplace>>()); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(TransformTest); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2); +HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace); +} // namespace hwy + +#endif diff --git a/media/highway/src/hwy/contrib/dot/dot-inl.h b/media/highway/src/hwy/contrib/dot/dot-inl.h new file mode 100644 index 0000000000..e04636f1b8 --- /dev/null +++ b/media/highway/src/hwy/contrib/dot/dot-inl.h @@ -0,0 +1,252 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Include guard (still compiled once per target) +#include <cmath> + +#if defined(HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_ +#undef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_ +#else +#define HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_ +#endif + +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct Dot { + // Specify zero or more of these, ORed together, as the kAssumptions template + // argument to Compute. Each one may improve performance or reduce code size, + // at the cost of additional requirements on the arguments. + enum Assumptions { + // num_elements is at least N, which may be up to HWY_MAX_BYTES / sizeof(T). + kAtLeastOneVector = 1, + // num_elements is divisible by N (a power of two, so this can be used if + // the problem size is known to be a power of two >= HWY_MAX_BYTES / + // sizeof(T)). + kMultipleOfVector = 2, + // RoundUpTo(num_elements, N) elements are accessible; their value does not + // matter (will be treated as if they were zero). + kPaddedToVector = 4, + }; + + // Returns sum{pa[i] * pb[i]} for float or double inputs. Aligning the + // pointers to a multiple of N elements is helpful but not required. 
+ template <int kAssumptions, class D, typename T = TFromD<D>, + HWY_IF_NOT_LANE_SIZE_D(D, 2)> + static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa, + const T* const HWY_RESTRICT pb, + const size_t num_elements) { + static_assert(IsFloat<T>(), "MulAdd requires float type"); + using V = decltype(Zero(d)); + + const size_t N = Lanes(d); + size_t i = 0; + + constexpr bool kIsAtLeastOneVector = + (kAssumptions & kAtLeastOneVector) != 0; + constexpr bool kIsMultipleOfVector = + (kAssumptions & kMultipleOfVector) != 0; + constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0; + + // Won't be able to do a full vector load without padding => scalar loop. + if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector && + HWY_UNLIKELY(num_elements < N)) { + // Only 2x unroll to avoid excessive code size. + T sum0 = T(0); + T sum1 = T(0); + for (; i + 2 <= num_elements; i += 2) { + sum0 += pa[i + 0] * pb[i + 0]; + sum1 += pa[i + 1] * pb[i + 1]; + } + if (i < num_elements) { + sum1 += pa[i] * pb[i]; + } + return sum0 + sum1; + } + + // Compiler doesn't make independent sum* accumulators, so unroll manually. + // 2 FMA ports * 4 cycle latency = up to 8 in-flight, but that is excessive + // for unaligned inputs (each unaligned pointer halves the throughput + // because it occupies both L1 load ports for a cycle). We cannot have + // arrays of vectors on RVV/SVE, so always unroll 4x. 
+ V sum0 = Zero(d); + V sum1 = Zero(d); + V sum2 = Zero(d); + V sum3 = Zero(d); + + // Main loop: unrolled + for (; i + 4 * N <= num_elements; /* i += 4 * N */) { // incr in loop + const auto a0 = LoadU(d, pa + i); + const auto b0 = LoadU(d, pb + i); + i += N; + sum0 = MulAdd(a0, b0, sum0); + const auto a1 = LoadU(d, pa + i); + const auto b1 = LoadU(d, pb + i); + i += N; + sum1 = MulAdd(a1, b1, sum1); + const auto a2 = LoadU(d, pa + i); + const auto b2 = LoadU(d, pb + i); + i += N; + sum2 = MulAdd(a2, b2, sum2); + const auto a3 = LoadU(d, pa + i); + const auto b3 = LoadU(d, pb + i); + i += N; + sum3 = MulAdd(a3, b3, sum3); + } + + // Up to 3 iterations of whole vectors + for (; i + N <= num_elements; i += N) { + const auto a = LoadU(d, pa + i); + const auto b = LoadU(d, pb + i); + sum0 = MulAdd(a, b, sum0); + } + + if (!kIsMultipleOfVector) { + const size_t remaining = num_elements - i; + if (remaining != 0) { + if (kIsPaddedToVector) { + const auto mask = FirstN(d, remaining); + const auto a = LoadU(d, pa + i); + const auto b = LoadU(d, pb + i); + sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1); + } else { + // Unaligned load such that the last element is in the highest lane - + // ensures we do not touch any elements outside the valid range. + // If we get here, then num_elements >= N. + HWY_DASSERT(i >= N); + i += remaining - N; + const auto skip = FirstN(d, N - remaining); + const auto a = LoadU(d, pa + i); // always unaligned + const auto b = LoadU(d, pb + i); + sum1 = MulAdd(IfThenZeroElse(skip, a), IfThenZeroElse(skip, b), sum1); + } + } + } // kMultipleOfVector + + // Reduction tree: sum of all accumulators by pairs, then across lanes. + sum0 = Add(sum0, sum1); + sum2 = Add(sum2, sum3); + sum0 = Add(sum0, sum2); + return GetLane(SumOfLanes(d, sum0)); + } + + // Returns sum{pa[i] * pb[i]} for bfloat16 inputs. Aligning the pointers to a + // multiple of N elements is helpful but not required. 
+ template <int kAssumptions, class D> + static HWY_INLINE float Compute(const D d, + const bfloat16_t* const HWY_RESTRICT pa, + const bfloat16_t* const HWY_RESTRICT pb, + const size_t num_elements) { + const RebindToUnsigned<D> du16; + const Repartition<float, D> df32; + + using V = decltype(Zero(df32)); + const size_t N = Lanes(d); + size_t i = 0; + + constexpr bool kIsAtLeastOneVector = + (kAssumptions & kAtLeastOneVector) != 0; + constexpr bool kIsMultipleOfVector = + (kAssumptions & kMultipleOfVector) != 0; + constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0; + + // Won't be able to do a full vector load without padding => scalar loop. + if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector && + HWY_UNLIKELY(num_elements < N)) { + float sum0 = 0.0f; // Only 2x unroll to avoid excessive code size for.. + float sum1 = 0.0f; // this unlikely(?) case. + for (; i + 2 <= num_elements; i += 2) { + sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]); + sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]); + } + if (i < num_elements) { + sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]); + } + return sum0 + sum1; + } + + // See comment in the other Compute() overload. Unroll 2x, but we need + // twice as many sums for ReorderWidenMulAccumulate. 
+ V sum0 = Zero(df32); + V sum1 = Zero(df32); + V sum2 = Zero(df32); + V sum3 = Zero(df32); + + // Main loop: unrolled + for (; i + 2 * N <= num_elements; /* i += 2 * N */) { // incr in loop + const auto a0 = LoadU(d, pa + i); + const auto b0 = LoadU(d, pb + i); + i += N; + sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1); + const auto a1 = LoadU(d, pa + i); + const auto b1 = LoadU(d, pb + i); + i += N; + sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3); + } + + // Possibly one more iteration of whole vectors + if (i + N <= num_elements) { + const auto a0 = LoadU(d, pa + i); + const auto b0 = LoadU(d, pb + i); + i += N; + sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1); + } + + if (!kIsMultipleOfVector) { + const size_t remaining = num_elements - i; + if (remaining != 0) { + if (kIsPaddedToVector) { + const auto mask = FirstN(du16, remaining); + const auto va = LoadU(d, pa + i); + const auto vb = LoadU(d, pb + i); + const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va))); + const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb))); + sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3); + + } else { + // Unaligned load such that the last element is in the highest lane - + // ensures we do not touch any elements outside the valid range. + // If we get here, then num_elements >= N. + HWY_DASSERT(i >= N); + i += remaining - N; + const auto skip = FirstN(du16, N - remaining); + const auto va = LoadU(d, pa + i); // always unaligned + const auto vb = LoadU(d, pb + i); + const auto a16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, va))); + const auto b16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, vb))); + sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3); + } + } + } // kMultipleOfVector + + // Reduction tree: sum of all accumulators by pairs, then across lanes. 
+ sum0 = Add(sum0, sum1); + sum2 = Add(sum2, sum3); + sum0 = Add(sum0, sum2); + return GetLane(SumOfLanes(df32, sum0)); + } +}; + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_ diff --git a/media/highway/src/hwy/contrib/dot/dot_test.cc b/media/highway/src/hwy/contrib/dot/dot_test.cc new file mode 100644 index 0000000000..d9e1ac621d --- /dev/null +++ b/media/highway/src/hwy/contrib/dot/dot_test.cc @@ -0,0 +1,167 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include "hwy/aligned_allocator.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc" +#include "hwy/foreach_target.h" + +#include "hwy/contrib/dot/dot-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template <typename T> +HWY_NOINLINE T SimpleDot(const T* pa, const T* pb, size_t num) { + double sum = 0.0; + for (size_t i = 0; i < num; ++i) { + sum += pa[i] * pb[i]; + } + return static_cast<T>(sum); +} + +HWY_NOINLINE float SimpleDot(const bfloat16_t* pa, const bfloat16_t* pb, + size_t num) { + float sum = 0.0f; + for (size_t i = 0; i < num; ++i) { + sum += F32FromBF16(pa[i]) * F32FromBF16(pb[i]); + } + return sum; +} + +template <typename T> +void SetValue(const float value, T* HWY_RESTRICT ptr) { + *ptr = static_cast<T>(value); +} +void SetValue(const float value, bfloat16_t* HWY_RESTRICT ptr) { + *ptr = BF16FromF32(value); +} + +class TestDot { + // Computes/verifies one dot product. + template <int kAssumptions, class D> + void Test(D d, size_t num, size_t misalign_a, size_t misalign_b, + RandomState& rng) { + using T = TFromD<D>; + const size_t N = Lanes(d); + const auto random_t = [&rng]() { + const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023; + return static_cast<float>(bits - 512) * (1.0f / 64); + }; + + const size_t padded = + (kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num; + AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded); + AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + padded); + T* a = pa.get() + misalign_a; + T* b = pb.get() + misalign_b; + size_t i = 0; + for (; i < num; ++i) { + SetValue(random_t(), a + i); + SetValue(random_t(), b + i); + } + // Fill padding with NaN - the values are not used, but avoids MSAN errors. 
+ for (; i < padded; ++i) { + ScalableTag<float> df1; + SetValue(GetLane(NaN(df1)), a + i); + SetValue(GetLane(NaN(df1)), b + i); + } + + const auto expected = SimpleDot(a, b, num); + const auto actual = Dot::Compute<kAssumptions>(d, a, b, num); + const auto max = static_cast<decltype(actual)>(8 * 8 * num); + HWY_ASSERT(-max <= actual && actual <= max); + HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4); + } + + // Runs tests with various alignments. + template <int kAssumptions, class D> + void ForeachMisalign(D d, size_t num, RandomState& rng) { + const size_t N = Lanes(d); + const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; + for (size_t ma : misalignments) { + for (size_t mb : misalignments) { + Test<kAssumptions>(d, num, ma, mb, rng); + } + } + } + + // Runs tests with various lengths compatible with the given assumptions. + template <int kAssumptions, class D> + void ForeachCount(D d, RandomState& rng) { + const size_t N = Lanes(d); + const size_t counts[] = {1, + 3, + 7, + 16, + HWY_MAX(N / 2, 1), + HWY_MAX(2 * N / 3, 1), + N, + N + 1, + 4 * N / 3, + 3 * N, + 8 * N, + 8 * N + 2}; + for (size_t num : counts) { + if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue; + if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue; + ForeachMisalign<kAssumptions>(d, num, rng); + } + } + + public: + template <class T, class D> + HWY_NOINLINE void operator()(T /*unused*/, D d) { + RandomState rng; + + // All 8 combinations of the three length-related flags: + ForeachCount<0>(d, rng); + ForeachCount<Dot::kAtLeastOneVector>(d, rng); + ForeachCount<Dot::kMultipleOfVector>(d, rng); + ForeachCount<Dot::kMultipleOfVector | Dot::kAtLeastOneVector>(d, rng); + ForeachCount<Dot::kPaddedToVector>(d, rng); + ForeachCount<Dot::kPaddedToVector | Dot::kAtLeastOneVector>(d, rng); + ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector>(d, rng); + ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector | + 
Dot::kAtLeastOneVector>(d, rng); + } +}; + +void TestAllDot() { ForFloatTypes(ForPartialVectors<TestDot>()); } +void TestAllDotBF16() { ForShrinkableVectors<TestDot>()(bfloat16_t()); } + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(DotTest); +HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot); +HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16); +} // namespace hwy + +#endif diff --git a/media/highway/src/hwy/contrib/image/image.cc b/media/highway/src/hwy/contrib/image/image.cc new file mode 100644 index 0000000000..3f8f255bab --- /dev/null +++ b/media/highway/src/hwy/contrib/image/image.cc @@ -0,0 +1,145 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/image/image.h" + +#include <algorithm> // swap +#include <cstddef> + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc" +#include "hwy/foreach_target.h" +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +size_t GetVectorSize() { return Lanes(ScalableTag<uint8_t>()); } +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE + +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(GetVectorSize); // Local function. 
+} // namespace + +size_t ImageBase::VectorSize() { + // Do not cache result - must return the current value, which may be greater + // than the first call if it was subject to DisableTargets! + return HWY_DYNAMIC_DISPATCH(GetVectorSize)(); +} + +size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) { + const size_t vec_size = VectorSize(); + size_t valid_bytes = xsize * sizeof_t; + + // Allow unaligned accesses starting at the last valid value - this may raise + // msan errors unless the user calls InitializePaddingForUnalignedAccesses. + // Skip for the scalar case because no extra lanes will be loaded. + if (vec_size != 1) { + HWY_DASSERT(vec_size >= sizeof_t); + valid_bytes += vec_size - sizeof_t; + } + + // Round up to vector and cache line size. + const size_t align = HWY_MAX(vec_size, HWY_ALIGNMENT); + size_t bytes_per_row = RoundUpTo(valid_bytes, align); + + // During the lengthy window before writes are committed to memory, CPUs + // guard against read after write hazards by checking the address, but + // only the lower 11 bits. We avoid a false dependency between writes to + // consecutive rows by ensuring their sizes are not multiples of 2 KiB. + // Avoid2K prevents the same problem for the planes of an Image3. + if (bytes_per_row % HWY_ALIGNMENT == 0) { + bytes_per_row += align; + } + + HWY_DASSERT(bytes_per_row % align == 0); + return bytes_per_row; +} + +ImageBase::ImageBase(const size_t xsize, const size_t ysize, + const size_t sizeof_t) + : xsize_(static_cast<uint32_t>(xsize)), + ysize_(static_cast<uint32_t>(ysize)), + bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) { + HWY_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8); + + bytes_per_row_ = 0; + // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate + // if nonzero, because "zero" bytes still have padding/bookkeeping overhead. 
+ if (xsize != 0 && ysize != 0) { + bytes_per_row_ = BytesPerRow(xsize, sizeof_t); + bytes_ = AllocateAligned<uint8_t>(bytes_per_row_ * ysize); + HWY_ASSERT(bytes_.get() != nullptr); + InitializePadding(sizeof_t, Padding::kRoundUp); + } +} + +ImageBase::ImageBase(const size_t xsize, const size_t ysize, + const size_t bytes_per_row, void* const aligned) + : xsize_(static_cast<uint32_t>(xsize)), + ysize_(static_cast<uint32_t>(ysize)), + bytes_per_row_(bytes_per_row), + bytes_(static_cast<uint8_t*>(aligned), + AlignedFreer(&AlignedFreer::DoNothing, nullptr)) { + const size_t vec_size = VectorSize(); + HWY_ASSERT(bytes_per_row % vec_size == 0); + HWY_ASSERT(reinterpret_cast<uintptr_t>(aligned) % vec_size == 0); +} + +void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) { +#if HWY_IS_MSAN || HWY_IDE + if (xsize_ == 0 || ysize_ == 0) return; + + const size_t vec_size = VectorSize(); // Bytes, independent of sizeof_t! + if (vec_size == 1) return; // Scalar mode: no padding needed + + const size_t valid_size = xsize_ * sizeof_t; + const size_t initialize_size = padding == Padding::kRoundUp + ? RoundUpTo(valid_size, vec_size) + : valid_size + vec_size - sizeof_t; + if (valid_size == initialize_size) return; + + for (size_t y = 0; y < ysize_; ++y) { + uint8_t* HWY_RESTRICT row = static_cast<uint8_t*>(VoidRow(y)); +#if defined(__clang__) && (__clang_major__ <= 6) + // There's a bug in msan in clang-6 when handling AVX2 operations. This + // workaround allows tests to pass on msan, although it is slower and + // prevents msan warnings from uninitialized images. 
+ memset(row, 0, initialize_size); +#else + memset(row + valid_size, 0, initialize_size - valid_size); +#endif // clang6 + } +#else + (void)sizeof_t; + (void)padding; +#endif // HWY_IS_MSAN +} + +void ImageBase::Swap(ImageBase& other) { + std::swap(xsize_, other.xsize_); + std::swap(ysize_, other.ysize_); + std::swap(bytes_per_row_, other.bytes_per_row_); + std::swap(bytes_, other.bytes_); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/image/image.h b/media/highway/src/hwy/contrib/image/image.h new file mode 100644 index 0000000000..bea6e654c4 --- /dev/null +++ b/media/highway/src/hwy/contrib/image/image.h @@ -0,0 +1,476 @@ +// Copyright 2020 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_ +#define HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_ + +// SIMD/multicore-friendly planar image representation with row accessors. + +#include <inttypes.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include <cstddef> +#include <utility> // std::move + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/highway_export.h" + +namespace hwy { + +// Type-independent parts of Image<> - reduces code duplication and facilitates +// moving member function implementations to cc file. +struct HWY_CONTRIB_DLLEXPORT ImageBase { + // Returns required alignment in bytes for externally allocated memory. 
+ static size_t VectorSize(); + + // Returns distance [bytes] between the start of two consecutive rows, a + // multiple of VectorSize but NOT kAlias (see implementation). + static size_t BytesPerRow(const size_t xsize, const size_t sizeof_t); + + // No allocation (for output params or unused images) + ImageBase() + : xsize_(0), + ysize_(0), + bytes_per_row_(0), + bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {} + + // Allocates memory (this is the common case) + ImageBase(size_t xsize, size_t ysize, size_t sizeof_t); + + // References but does not take ownership of external memory. Useful for + // interoperability with other libraries. `aligned` must be aligned to a + // multiple of VectorSize() and `bytes_per_row` must also be a multiple of + // VectorSize() or preferably equal to BytesPerRow(). + ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned); + + // Copy construction/assignment is forbidden to avoid inadvertent copies, + // which can be very expensive. Use CopyImageTo() instead. + ImageBase(const ImageBase& other) = delete; + ImageBase& operator=(const ImageBase& other) = delete; + + // Move constructor (required for returning Image from function) + ImageBase(ImageBase&& other) noexcept = default; + + // Move assignment (required for std::vector) + ImageBase& operator=(ImageBase&& other) noexcept = default; + + void Swap(ImageBase& other); + + // Useful for pre-allocating image with some padding for alignment purposes + // and later reporting the actual valid dimensions. Caller is responsible + // for ensuring xsize/ysize are <= the original dimensions. + void ShrinkTo(const size_t xsize, const size_t ysize) { + xsize_ = static_cast<uint32_t>(xsize); + ysize_ = static_cast<uint32_t>(ysize); + // NOTE: we can't recompute bytes_per_row for more compact storage and + // better locality because that would invalidate the image contents. + } + + // How many pixels. 
+ HWY_INLINE size_t xsize() const { return xsize_; } + HWY_INLINE size_t ysize() const { return ysize_; } + + // NOTE: do not use this for copying rows - the valid xsize may be much less. + HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; } + + // Raw access to byte contents, for interfacing with other libraries. + // Unsigned char instead of char to avoid surprises (sign extension). + HWY_INLINE uint8_t* bytes() { + void* p = bytes_.get(); + return static_cast<uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64)); + } + HWY_INLINE const uint8_t* bytes() const { + const void* p = bytes_.get(); + return static_cast<const uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64)); + } + + protected: + // Returns pointer to the start of a row. + HWY_INLINE void* VoidRow(const size_t y) const { +#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN + if (y >= ysize_) { + HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_); + } +#endif + + void* row = bytes_.get() + y * bytes_per_row_; + return HWY_ASSUME_ALIGNED(row, 64); + } + + enum class Padding { + // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default. + kRoundUp, + // Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra + // vector to be initialized. If done by default, this would suppress + // legitimate msan warnings. We therefore require users to explicitly call + // InitializePadding before using unaligned loads (e.g. convolution). + kUnaligned + }; + + // Initializes the minimum bytes required to suppress msan warnings from + // legitimate (according to Padding mode) vector loads/stores on the right + // border, where some lanes are uninitialized and assumed to be unused. + void InitializePadding(size_t sizeof_t, Padding padding); + + // (Members are non-const to enable assignment during move-assignment.) + uint32_t xsize_; // In valid pixels, not including any padding. + uint32_t ysize_; + size_t bytes_per_row_; // Includes padding. 
+ AlignedFreeUniquePtr<uint8_t[]> bytes_; +}; + +// Single channel, aligned rows separated by padding. T must be POD. +// +// 'Single channel' (one 2D array per channel) simplifies vectorization +// (repeating the same operation on multiple adjacent components) without the +// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients +// can easily iterate over all components in a row and Image requires no +// knowledge of the pixel format beyond the component type "T". +// +// 'Aligned' means each row is aligned to the L1 cache line size. This prevents +// false sharing between two threads operating on adjacent rows. +// +// 'Padding' is still relevant because vectors could potentially be larger than +// a cache line. By rounding up row sizes to the vector size, we allow +// reading/writing ALIGNED vectors whose first lane is a valid sample. This +// avoids needing a separate loop to handle remaining unaligned lanes. +// +// This image layout could also be achieved with a vector and a row accessor +// function, but a class wrapper with support for "deleter" allows wrapping +// existing memory allocated by clients without copying the pixels. It also +// provides convenient accessors for xsize/ysize, which shortens function +// argument lists. Supports move-construction so it can be stored in containers. 
+template <typename ComponentType> +class Image : public ImageBase { + public: + using T = ComponentType; + + Image() = default; + Image(const size_t xsize, const size_t ysize) + : ImageBase(xsize, ysize, sizeof(T)) {} + Image(const size_t xsize, const size_t ysize, size_t bytes_per_row, + void* aligned) + : ImageBase(xsize, ysize, bytes_per_row, aligned) {} + + void InitializePaddingForUnalignedAccesses() { + InitializePadding(sizeof(T), Padding::kUnaligned); + } + + HWY_INLINE const T* ConstRow(const size_t y) const { + return static_cast<const T*>(VoidRow(y)); + } + HWY_INLINE const T* ConstRow(const size_t y) { + return static_cast<const T*>(VoidRow(y)); + } + + // Returns pointer to non-const. This allows passing const Image* parameters + // when the callee is only supposed to fill the pixels, as opposed to + // allocating or resizing the image. + HWY_INLINE T* MutableRow(const size_t y) const { + return static_cast<T*>(VoidRow(y)); + } + HWY_INLINE T* MutableRow(const size_t y) { + return static_cast<T*>(VoidRow(y)); + } + + // Returns number of pixels (some of which are padding) per row. Useful for + // computing other rows via pointer arithmetic. WARNING: this must + // NOT be used to determine xsize. + HWY_INLINE intptr_t PixelsPerRow() const { + return static_cast<intptr_t>(bytes_per_row_ / sizeof(T)); + } +}; + +using ImageF = Image<float>; + +// A bundle of 3 same-sized images. To fill an existing Image3 using +// single-channel producers, we also need access to each const Image*. Const +// prevents breaking the same-size invariant, while still allowing pixels to be +// changed via MutableRow. 
+template <typename ComponentType> +class Image3 { + public: + using T = ComponentType; + using ImageT = Image<T>; + static constexpr size_t kNumPlanes = 3; + + Image3() : planes_{ImageT(), ImageT(), ImageT()} {} + + Image3(const size_t xsize, const size_t ysize) + : planes_{ImageT(xsize, ysize), ImageT(xsize, ysize), + ImageT(xsize, ysize)} {} + + Image3(Image3&& other) noexcept { + for (size_t i = 0; i < kNumPlanes; i++) { + planes_[i] = std::move(other.planes_[i]); + } + } + + Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) { + if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) { + HWY_ABORT("Not same size: %" PRIu64 " x %" PRIu64 ", %" PRIu64 + " x %" PRIu64 ", %" PRIu64 " x %" PRIu64 "\n", + static_cast<uint64_t>(plane0.xsize()), + static_cast<uint64_t>(plane0.ysize()), + static_cast<uint64_t>(plane1.xsize()), + static_cast<uint64_t>(plane1.ysize()), + static_cast<uint64_t>(plane2.xsize()), + static_cast<uint64_t>(plane2.ysize())); + } + planes_[0] = std::move(plane0); + planes_[1] = std::move(plane1); + planes_[2] = std::move(plane2); + } + + // Copy construction/assignment is forbidden to avoid inadvertent copies, + // which can be very expensive. Use CopyImageTo instead. 
+ Image3(const Image3& other) = delete; + Image3& operator=(const Image3& other) = delete; + + Image3& operator=(Image3&& other) noexcept { + for (size_t i = 0; i < kNumPlanes; i++) { + planes_[i] = std::move(other.planes_[i]); + } + return *this; + } + + HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const { + return static_cast<const T*>(VoidPlaneRow(c, y)); + } + HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) { + return static_cast<const T*>(VoidPlaneRow(c, y)); + } + + HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const { + return static_cast<T*>(VoidPlaneRow(c, y)); + } + HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) { + return static_cast<T*>(VoidPlaneRow(c, y)); + } + + HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; } + + void Swap(Image3& other) { + for (size_t c = 0; c < 3; ++c) { + other.planes_[c].Swap(planes_[c]); + } + } + + void ShrinkTo(const size_t xsize, const size_t ysize) { + for (ImageT& plane : planes_) { + plane.ShrinkTo(xsize, ysize); + } + } + + // Sizes of all three images are guaranteed to be equal. + HWY_INLINE size_t xsize() const { return planes_[0].xsize(); } + HWY_INLINE size_t ysize() const { return planes_[0].ysize(); } + // Returns offset [bytes] from one row to the next row of the same plane. + // WARNING: this must NOT be used to determine xsize, nor for copying rows - + // the valid xsize may be much less. + HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); } + // Returns number of pixels (some of which are padding) per row. Useful for + // computing other rows via pointer arithmetic. WARNING: this must NOT be used + // to determine xsize. + HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); } + + private: + // Returns pointer to the start of a row. 
+ HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const { +#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN + if (c >= kNumPlanes || y >= ysize()) { + HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n", + static_cast<uint64_t>(c), static_cast<uint64_t>(y), + static_cast<uint64_t>(ysize())); + } +#endif + // Use the first plane's stride because the compiler might not realize they + // are all equal. Thus we only need a single multiplication for all planes. + const size_t row_offset = y * planes_[0].bytes_per_row(); + const void* row = planes_[c].bytes() + row_offset; + return static_cast<const T * HWY_RESTRICT>( + HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT)); + } + + private: + ImageT planes_[kNumPlanes]; +}; + +using Image3F = Image3<float>; + +// Rectangular region in image(s). Factoring this out of Image instead of +// shifting the pointer by x0/y0 allows this to apply to multiple images with +// different resolutions. Can compare size via SameSize(rect1, rect2). +class Rect { + public: + // Most windows are xsize_max * ysize_max, except those on the borders where + // begin + size_max > end. + constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max, + size_t ysize_max, size_t xend, size_t yend) + : x0_(xbegin), + y0_(ybegin), + xsize_(ClampedSize(xbegin, xsize_max, xend)), + ysize_(ClampedSize(ybegin, ysize_max, yend)) {} + + // Construct with origin and known size (typically from another Rect). + constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize) + : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {} + + // Construct a rect that covers a whole image. 
+ template <typename Image> + explicit Rect(const Image& image) + : Rect(0, 0, image.xsize(), image.ysize()) {} + + Rect() : Rect(0, 0, 0, 0) {} + + Rect(const Rect&) = default; + Rect& operator=(const Rect&) = default; + + Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max, + size_t ysize_max) { + return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, x0_ + xsize_, + y0_ + ysize_); + } + + template <typename T> + const T* ConstRow(const Image<T>* image, size_t y) const { + return image->ConstRow(y + y0_) + x0_; + } + + template <typename T> + T* MutableRow(const Image<T>* image, size_t y) const { + return image->MutableRow(y + y0_) + x0_; + } + + template <typename T> + const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const { + return image.ConstPlaneRow(c, y + y0_) + x0_; + } + + template <typename T> + T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const { + return image->MutablePlaneRow(c, y + y0_) + x0_; + } + + // Returns true if this Rect fully resides in the given image. ImageT could be + // Image<T> or Image3<T>; however if ImageT is Rect, results are nonsensical. + template <class ImageT> + bool IsInside(const ImageT& image) const { + return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize()); + } + + size_t x0() const { return x0_; } + size_t y0() const { return y0_; } + size_t xsize() const { return xsize_; } + size_t ysize() const { return ysize_; } + + private: + // Returns size_max, or whatever is left in [begin, end). + static constexpr size_t ClampedSize(size_t begin, size_t size_max, + size_t end) { + return (begin + size_max <= end) ? size_max + : (end > begin ? end - begin : 0); + } + + size_t x0_; + size_t y0_; + + size_t xsize_; + size_t ysize_; +}; + +// Works for any image-like input type(s). 
+template <class Image1, class Image2> +HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) { + return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize(); +} + +// Mirrors out of bounds coordinates and returns valid coordinates unchanged. +// We assume the radius (distance outside the image) is small compared to the +// image size, otherwise this might not terminate. +// The mirror is outside the last column (border pixel is also replicated). +static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x, + const int64_t xsize) { + HWY_DASSERT(xsize != 0); + + // TODO(janwas): replace with branchless version + while (x < 0 || x >= xsize) { + if (x < 0) { + x = -x - 1; + } else { + x = 2 * xsize - 1 - x; + } + } + return static_cast<size_t>(x); +} + +// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size): + +// Mirrors (repeating the edge pixel once). Useful for convolutions. +struct WrapMirror { + HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const { + return Mirror(coord, static_cast<int64_t>(size)); + } +}; + +// Returns the same coordinate, for when we know "coord" is already valid (e.g. +// interior of an image). +struct WrapUnchanged { + HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const { + return static_cast<size_t>(coord); + } +}; + +// Similar to Wrap* but for row pointers (reduces Row() multiplications). + +class WrapRowMirror { + public: + template <class View> + WrapRowMirror(const View& image, size_t ysize) + : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {} + + const float* operator()(const float* const HWY_RESTRICT row, + const int64_t stride) const { + if (row < first_row_) { + const int64_t num_before = first_row_ - row; + // Mirrored; one row before => row 0, two before = row 1, ... 
+ return first_row_ + num_before - stride; + } + if (row > last_row_) { + const int64_t num_after = row - last_row_; + // Mirrored; one row after => last row, two after = last - 1, ... + return last_row_ - num_after + stride; + } + return row; + } + + private: + const float* const HWY_RESTRICT first_row_; + const float* const HWY_RESTRICT last_row_; +}; + +struct WrapRowUnchanged { + HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row, + int64_t /*stride*/) const { + return row; + } +}; + +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_ diff --git a/media/highway/src/hwy/contrib/image/image_test.cc b/media/highway/src/hwy/contrib/image/image_test.cc new file mode 100644 index 0000000000..a23ec6ccc9 --- /dev/null +++ b/media/highway/src/hwy/contrib/image/image_test.cc @@ -0,0 +1,154 @@ +// Copyright (c) the JPEG XL Project +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/image/image.h" + +#include <cstddef> + +#include "hwy/base.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc" +#include "hwy/foreach_target.h" + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +#include <random> +#include <utility> + +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +// Ensure we can always write full aligned vectors. 
struct TestAlignedT {
  // For images of 1..3 rows and 1..63 columns, fills every row using aligned
  // Store() at x = 0, Lanes(d), 2 * Lanes(d), ... while x < xsize. The last
  // store of a row may therefore extend beyond xsize.
  template <typename T>
  void operator()(T /*unused*/) const {
    // Fixed seed, so the generated values are the same on every run.
    std::mt19937 rng(129);
    std::uniform_int_distribution<int> dist(0, 16);
    const ScalableTag<T> d;

    for (size_t ysize = 1; ysize < 4; ++ysize) {
      for (size_t xsize = 1; xsize < 64; ++xsize) {
        Image<T> img(xsize, ysize);

        for (size_t y = 0; y < ysize; ++y) {
          T* HWY_RESTRICT row = img.MutableRow(y);
          for (size_t x = 0; x < xsize; x += Lanes(d)) {
            // Each vector is Iota() starting from a random value in [0, 16].
            const auto values = Iota(d, static_cast<T>(dist(rng)));
            Store(values, d, row + x);
          }
        }

        // Sanity check to prevent optimizing out the writes
        const auto x = std::uniform_int_distribution<size_t>(0, xsize - 1)(rng);
        const auto y = std::uniform_int_distribution<size_t>(0, ysize - 1)(rng);
        HWY_ASSERT(img.ConstRow(y)[x] < 16 + Lanes(d));
      }
    }
  }
};

// Runs TestAlignedT via ForUnsignedTypes.
void TestAligned() { ForUnsignedTypes(TestAlignedT()); }

// Ensure we can write an unaligned vector starting at the last valid value.
struct TestUnalignedT {
  // For images of 1..3 rows and 1..127 columns, after calling
  // InitializePaddingForUnalignedAccesses(), runs one of two checks selected
  // at compile time (see the #if below).
  template <typename T>
  void operator()(T /*unused*/) const {
    // Fixed seed, so the generated values are the same on every run.
    std::mt19937 rng(129);
    std::uniform_int_distribution<int> dist(0, 3);
    const ScalableTag<T> d;

    for (size_t ysize = 1; ysize < 4; ++ysize) {
      for (size_t xsize = 1; xsize < 128; ++xsize) {
        Image<T> img(xsize, ysize);
        img.InitializePaddingForUnalignedAccesses();

// This test reads padding, which only works if it was initialized,
// which only happens in MSAN builds.
#if HWY_IS_MSAN || HWY_IDE
        // Initialize only the valid samples
        for (size_t y = 0; y < ysize; ++y) {
          T* HWY_RESTRICT row = img.MutableRow(y);
          for (size_t x = 0; x < xsize; ++x) {
            // One of 1, 2, 4, 8, so the OR of any valid samples is below 16.
            row[x] = static_cast<T>(1u << dist(rng));
          }
        }

        // Read padding bits
        auto accum = Zero(d);
        for (size_t y = 0; y < ysize; ++y) {
          T* HWY_RESTRICT row = img.MutableRow(y);
          for (size_t x = 0; x < xsize; ++x) {
            // Full-vector unaligned load from every valid x, which for the
            // last positions also covers lanes beyond xsize.
            accum = Or(accum, LoadU(d, row + x));
          }
        }

        // Ensure padding was zero
        const size_t N = Lanes(d);
        auto lanes = AllocateAligned<T>(N);
        Store(accum, d, lanes.get());
        for (size_t i = 0; i < N; ++i) {
          HWY_ASSERT(lanes[i] < 16);
        }
#else  // Check that writing padding does not overwrite valid samples
        // Initialize only the valid samples
        for (size_t y = 0; y < ysize; ++y) {
          T* HWY_RESTRICT row = img.MutableRow(y);
          for (size_t x = 0; x < xsize; ++x) {
            row[x] = static_cast<T>(x);
          }
        }

        // Zero padding and rightmost sample
        for (size_t y = 0; y < ysize; ++y) {
          T* HWY_RESTRICT row = img.MutableRow(y);
          StoreU(Zero(d), d, row + xsize - 1);
        }

        // Ensure no samples except the rightmost were overwritten
        for (size_t y = 0; y < ysize; ++y) {
          T* HWY_RESTRICT row = img.MutableRow(y);
          for (size_t x = 0; x < xsize - 1; ++x) {
            HWY_ASSERT_EQ(static_cast<T>(x), row[x]);
          }
        }
#endif
      }
    }
  }
};

// Runs TestUnalignedT via ForUnsignedTypes.
void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(ImageTest);
HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
}  // namespace hwy

#endif
diff --git a/media/highway/src/hwy/contrib/math/math-inl.h b/media/highway/src/hwy/contrib/math/math-inl.h new file mode 100644 index 0000000000..b4cbb5d119 --- /dev/null +++ b/media/highway/src/hwy/contrib/math/math-inl.h @@ -0,0 +1,1242 @@
// Copyright
// 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Include guard (still compiled once per target)
#if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
#undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
#endif

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Each math function below is only declared here (HWY_INLINE); its definition
// follows later in this file. Every declaration is paired with a HWY_NOINLINE
// `Call<Name>` wrapper that does nothing except forward `d` and `x` to the
// function of the same name.
// NOTE(review): presumably the wrappers exist to give callers an out-of-line
// entry point - confirm against their users.

/**
 * Highway SIMD version of std::acos(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 2
 *      Valid Range: [-1, +1]
 * @return arc cosine of 'x'
 */
template <class D, class V>
HWY_INLINE V Acos(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) {
  return Acos(d, x);
}

/**
 * Highway SIMD version of std::acosh(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 3
 *      Valid Range: float32[1, +FLT_MAX], float64[1, +DBL_MAX]
 * @return hyperbolic arc cosine of 'x'
 */
template <class D, class V>
HWY_INLINE V Acosh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) {
  return Acosh(d, x);
}

/**
 * Highway SIMD version of std::asin(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 2
 *      Valid Range: [-1, +1]
 * @return arc sine of 'x'
 */
template <class D, class V>
HWY_INLINE V Asin(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) {
  return Asin(d, x);
}

/**
 * Highway SIMD version of std::asinh(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 3
 *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
 * @return hyperbolic arc sine of 'x'
 */
template <class D, class V>
HWY_INLINE V Asinh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) {
  return Asinh(d, x);
}

/**
 * Highway SIMD version of std::atan(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 3
 *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
 * @return arc tangent of 'x'
 */
template <class D, class V>
HWY_INLINE V Atan(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) {
  return Atan(d, x);
}

/**
 * Highway SIMD version of std::atanh(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 3
 *      Valid Range: (-1, +1)
 * @return hyperbolic arc tangent of 'x'
 */
template <class D, class V>
HWY_INLINE V Atanh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAtanh(const D d, VecArg<V> x) {
  return Atanh(d, x);
}

/**
 * Highway SIMD version of std::cos(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 3
 *      Valid Range: [-39000, +39000]
 * @return cosine of 'x'
 */
template <class D, class V>
HWY_INLINE V Cos(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallCos(const D d, VecArg<V> x) {
  return Cos(d, x);
}

/**
 * Highway SIMD version of std::exp(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 1
 *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
 * @return e^x
 */
template <class D, class V>
HWY_INLINE V Exp(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallExp(const D d, VecArg<V> x) {
  return Exp(d, x);
}

/**
 * Highway SIMD version of std::expm1(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 4
 *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
 * @return e^x - 1
 */
template <class D, class V>
HWY_INLINE V Expm1(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallExpm1(const D d, VecArg<V> x) {
  return Expm1(d, x);
}

/**
 * Highway SIMD version of std::log(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 4
 *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
 * @return natural logarithm of 'x'
 */
template <class D, class V>
HWY_INLINE V Log(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog(const D d, VecArg<V> x) {
  return Log(d, x);
}

/**
 * Highway SIMD version of std::log10(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 2
 *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
 * @return base 10 logarithm of 'x'
 */
template <class D, class V>
HWY_INLINE V Log10(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog10(const D d, VecArg<V> x) {
  return Log10(d, x);
}

/**
 * Highway SIMD version of std::log1p(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 2
 *      Valid Range: float32[0, +FLT_MAX], float64[0, +DBL_MAX]
 * @return log(1 + x)
 */
template <class D, class V>
HWY_INLINE V Log1p(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog1p(const D d, VecArg<V> x) {
  return Log1p(d, x);
}

/**
 * Highway SIMD version of std::log2(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 2
 *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
 * @return base 2 logarithm of 'x'
 */
template <class D, class V>
HWY_INLINE V Log2(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) {
  return Log2(d, x);
}

/**
 * Highway SIMD version of std::sin(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 3
 *      Valid Range: [-39000, +39000]
 * @return sine of 'x'
 */
template <class D, class V>
HWY_INLINE V Sin(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallSin(const D d, VecArg<V> x) {
  return Sin(d, x);
}

/**
 * Highway SIMD version of std::sinh(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 4
 *      Valid Range: float32[-88.7228, +88.7228], float64[-709, +709]
 * @return hyperbolic sine of 'x'
 */
template <class D, class V>
HWY_INLINE V Sinh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) {
  return Sinh(d, x);
}

/**
 * Highway SIMD version of std::tanh(x).
 *
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 4
 *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
 * @return hyperbolic tangent of 'x'
 */
template <class D, class V>
HWY_INLINE V Tanh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) {
  return Tanh(d, x);
}

////////////////////////////////////////////////////////////////////////////////
// Implementation
////////////////////////////////////////////////////////////////////////////////
namespace impl {

// Estrin's Scheme is a faster method for evaluating large polynomials on
// super scalar architectures. It works by factoring the Horner's Method
// polynomial into power of two sub-trees that can be evaluated in parallel.
+// Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) { + return MulAdd(c1, x, c0); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) { + T x2 = Mul(x, x); + return MulAdd(x2, c2, MulAdd(c1, x, c0)); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) { + T x2 = Mul(x, x); + return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + return MulAdd(x4, MulAdd(c5, x, c4), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + return MulAdd(x8, c8, + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 
= Mul(x4, x4); + return MulAdd(x8, MulAdd(c9, x, c8), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9, T c10) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9, T c10, T c11) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9, T c10, T c11, + T c12) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + return MulAdd( + x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9, T c10, T c11, + T c12, T c13) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + return MulAdd(x8, + MulAdd(x4, MulAdd(c13, x, c12), + MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9, T c10, T c11, + T c12, T c13, T c14) { + T x2 = 
Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + return MulAdd(x8, + MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)), + MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9, T c10, T c11, + T c12, T c13, T c14, T c15) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + return MulAdd(x8, + MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), + MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9, T c10, T c11, + T c12, T c13, T c14, T c15, T c16) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + T x16 = Mul(x8, x8); + return MulAdd( + x16, c16, + MulAdd(x8, + MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), + MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9, T c10, T c11, + T c12, T c13, T c14, T c15, T c16, T c17) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + T x16 = Mul(x8, x8); + return MulAdd( + x16, MulAdd(c17, x, c16), + MulAdd(x8, + MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), + MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))))); +} +template <class T> +HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T 
c2, T c3, T c4, T c5, + T c6, T c7, T c8, T c9, T c10, T c11, + T c12, T c13, T c14, T c15, T c16, T c17, + T c18) { + T x2 = Mul(x, x); + T x4 = Mul(x2, x2); + T x8 = Mul(x4, x4); + T x16 = Mul(x8, x8); + return MulAdd( + x16, MulAdd(x2, c18, MulAdd(c17, x, c16)), + MulAdd(x8, + MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), + MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), + MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), + MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))))); +} + +template <class FloatOrDouble> +struct AsinImpl {}; +template <class FloatOrDouble> +struct AtanImpl {}; +template <class FloatOrDouble> +struct CosSinImpl {}; +template <class FloatOrDouble> +struct ExpImpl {}; +template <class FloatOrDouble> +struct LogImpl {}; + +template <> +struct AsinImpl<float> { + // Polynomial approximation for asin(x) over the range [0, 0.5). + template <class D, class V> + HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) { + const auto k0 = Set(d, +0.1666677296f); + const auto k1 = Set(d, +0.07495029271f); + const auto k2 = Set(d, +0.04547423869f); + const auto k3 = Set(d, +0.02424046025f); + const auto k4 = Set(d, +0.04197454825f); + + return Estrin(x2, k0, k1, k2, k3, k4); + } +}; + +#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 + +template <> +struct AsinImpl<double> { + // Polynomial approximation for asin(x) over the range [0, 0.5). 
+ template <class D, class V> + HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) { + const auto k0 = Set(d, +0.1666666666666497543); + const auto k1 = Set(d, +0.07500000000378581611); + const auto k2 = Set(d, +0.04464285681377102438); + const auto k3 = Set(d, +0.03038195928038132237); + const auto k4 = Set(d, +0.02237176181932048341); + const auto k5 = Set(d, +0.01735956991223614604); + const auto k6 = Set(d, +0.01388715184501609218); + const auto k7 = Set(d, +0.01215360525577377331); + const auto k8 = Set(d, +0.006606077476277170610); + const auto k9 = Set(d, +0.01929045477267910674); + const auto k10 = Set(d, -0.01581918243329996643); + const auto k11 = Set(d, +0.03161587650653934628); + + return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11); + } +}; + +#endif + +template <> +struct AtanImpl<float> { + // Polynomial approximation for atan(x) over the range [0, 1.0). + template <class D, class V> + HWY_INLINE V AtanPoly(D d, V x) { + const auto k0 = Set(d, -0.333331018686294555664062f); + const auto k1 = Set(d, +0.199926957488059997558594f); + const auto k2 = Set(d, -0.142027363181114196777344f); + const auto k3 = Set(d, +0.106347933411598205566406f); + const auto k4 = Set(d, -0.0748900920152664184570312f); + const auto k5 = Set(d, +0.0425049886107444763183594f); + const auto k6 = Set(d, -0.0159569028764963150024414f); + const auto k7 = Set(d, +0.00282363896258175373077393f); + + const auto y = Mul(x, x); + return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x); + } +}; + +#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 + +template <> +struct AtanImpl<double> { + // Polynomial approximation for atan(x) over the range [0, 1.0). 
+ template <class D, class V> + HWY_INLINE V AtanPoly(D d, V x) { + const auto k0 = Set(d, -0.333333333333311110369124); + const auto k1 = Set(d, +0.199999999996591265594148); + const auto k2 = Set(d, -0.14285714266771329383765); + const auto k3 = Set(d, +0.111111105648261418443745); + const auto k4 = Set(d, -0.090908995008245008229153); + const auto k5 = Set(d, +0.0769219538311769618355029); + const auto k6 = Set(d, -0.0666573579361080525984562); + const auto k7 = Set(d, +0.0587666392926673580854313); + const auto k8 = Set(d, -0.0523674852303482457616113); + const auto k9 = Set(d, +0.0466667150077840625632675); + const auto k10 = Set(d, -0.0407629191276836500001934); + const auto k11 = Set(d, +0.0337852580001353069993897); + const auto k12 = Set(d, -0.0254517624932312641616861); + const auto k13 = Set(d, +0.016599329773529201970117); + const auto k14 = Set(d, -0.00889896195887655491740809); + const auto k15 = Set(d, +0.00370026744188713119232403); + const auto k16 = Set(d, -0.00110611831486672482563471); + const auto k17 = Set(d, +0.000209850076645816976906797); + const auto k18 = Set(d, -1.88796008463073496563746e-5); + + const auto y = Mul(x, x); + return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, + k12, k13, k14, k15, k16, k17, k18), + Mul(y, x), x); + } +}; + +#endif + +template <> +struct CosSinImpl<float> { + // Rounds float toward zero and returns as int32_t. 
+ template <class D, class V> + HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { + return ConvertTo(Rebind<int32_t, D>(), x); + } + + template <class D, class V> + HWY_INLINE V Poly(D d, V x) { + const auto k0 = Set(d, -1.66666597127914428710938e-1f); + const auto k1 = Set(d, +8.33307858556509017944336e-3f); + const auto k2 = Set(d, -1.981069071916863322258e-4f); + const auto k3 = Set(d, +2.6083159809786593541503e-6f); + + const auto y = Mul(x, x); + return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x); + } + + template <class D, class V, class VI32> + HWY_INLINE V CosReduce(D d, V x, VI32 q) { + // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2 + const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f); + const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f); + const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f); + const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f); + + // Extended precision modular arithmetic. + const V qf = ConvertTo(d, q); + x = MulAdd(qf, kHalfPiPart0f, x); + x = MulAdd(qf, kHalfPiPart1f, x); + x = MulAdd(qf, kHalfPiPart2f, x); + x = MulAdd(qf, kHalfPiPart3f, x); + return x; + } + + template <class D, class V, class VI32> + HWY_INLINE V SinReduce(D d, V x, VI32 q) { + // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi + const V kPiPart0f = Set(d, -3.140625f); + const V kPiPart1f = Set(d, -0.0009670257568359375f); + const V kPiPart2f = Set(d, -6.2771141529083251953e-7f); + const V kPiPart3f = Set(d, -1.2154201256553420762e-10f); + + // Extended precision modular arithmetic. + const V qf = ConvertTo(d, q); + x = MulAdd(qf, kPiPart0f, x); + x = MulAdd(qf, kPiPart1f, x); + x = MulAdd(qf, kPiPart2f, x); + x = MulAdd(qf, kPiPart3f, x); + return x; + } + + // (q & 2) == 0 ? 
-0.0 : +0.0 + template <class D, class VI32> + HWY_INLINE Vec<Rebind<float, D>> CosSignFromQuadrant(D d, VI32 q) { + const VI32 kTwo = Set(Rebind<int32_t, D>(), 2); + return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo))); + } + + // ((q & 1) ? -0.0 : +0.0) + template <class D, class VI32> + HWY_INLINE Vec<Rebind<float, D>> SinSignFromQuadrant(D d, VI32 q) { + const VI32 kOne = Set(Rebind<int32_t, D>(), 1); + return BitCast(d, ShiftLeft<31>(And(q, kOne))); + } +}; + +#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 + +template <> +struct CosSinImpl<double> { + // Rounds double toward zero and returns as int32_t. + template <class D, class V> + HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { + return DemoteTo(Rebind<int32_t, D>(), x); + } + + template <class D, class V> + HWY_INLINE V Poly(D d, V x) { + const auto k0 = Set(d, -0.166666666666666657414808); + const auto k1 = Set(d, +0.00833333333333332974823815); + const auto k2 = Set(d, -0.000198412698412696162806809); + const auto k3 = Set(d, +2.75573192239198747630416e-6); + const auto k4 = Set(d, -2.50521083763502045810755e-8); + const auto k5 = Set(d, +1.60590430605664501629054e-10); + const auto k6 = Set(d, -7.64712219118158833288484e-13); + const auto k7 = Set(d, +2.81009972710863200091251e-15); + const auto k8 = Set(d, -7.97255955009037868891952e-18); + + const auto y = Mul(x, x); + return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x); + } + + template <class D, class V, class VI32> + HWY_INLINE V CosReduce(D d, V x, VI32 q) { + // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2 + const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914); + const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8); + const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16); + const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24); + + // Extended precision modular arithmetic. 
+ const V qf = PromoteTo(d, q); + x = MulAdd(qf, kHalfPiPart0d, x); + x = MulAdd(qf, kHalfPiPart1d, x); + x = MulAdd(qf, kHalfPiPart2d, x); + x = MulAdd(qf, kHalfPiPart3d, x); + return x; + } + + template <class D, class V, class VI32> + HWY_INLINE V SinReduce(D d, V x, VI32 q) { + // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi + const V kPiPart0d = Set(d, -3.1415926218032836914); + const V kPiPart1d = Set(d, -3.1786509424591713469e-8); + const V kPiPart2d = Set(d, -1.2246467864107188502e-16); + const V kPiPart3d = Set(d, -1.2736634327021899816e-24); + + // Extended precision modular arithmetic. + const V qf = PromoteTo(d, q); + x = MulAdd(qf, kPiPart0d, x); + x = MulAdd(qf, kPiPart1d, x); + x = MulAdd(qf, kPiPart2d, x); + x = MulAdd(qf, kPiPart3d, x); + return x; + } + + // (q & 2) == 0 ? -0.0 : +0.0 + template <class D, class VI32> + HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) { + const VI32 kTwo = Set(Rebind<int32_t, D>(), 2); + return BitCast( + d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo)))); + } + + // ((q & 1) ? -0.0 : +0.0) + template <class D, class VI32> + HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) { + const VI32 kOne = Set(Rebind<int32_t, D>(), 1); + return BitCast( + d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne)))); + } +}; + +#endif + +template <> +struct ExpImpl<float> { + // Rounds float toward zero and returns as int32_t. 
+ template <class D, class V> + HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { + return ConvertTo(Rebind<int32_t, D>(), x); + } + + template <class D, class V> + HWY_INLINE V ExpPoly(D d, V x) { + const auto k0 = Set(d, +0.5f); + const auto k1 = Set(d, +0.166666671633720397949219f); + const auto k2 = Set(d, +0.0416664853692054748535156f); + const auto k3 = Set(d, +0.00833336077630519866943359f); + const auto k4 = Set(d, +0.00139304355252534151077271f); + const auto k5 = Set(d, +0.000198527617612853646278381f); + + return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x); + } + + // Computes 2^x, where x is an integer. + template <class D, class VI32> + HWY_INLINE Vec<D> Pow2I(D d, VI32 x) { + const Rebind<int32_t, D> di32; + const VI32 kOffset = Set(di32, 0x7F); + return BitCast(d, ShiftLeft<23>(Add(x, kOffset))); + } + + // Sets the exponent of 'x' to 2^e. + template <class D, class V, class VI32> + HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) { + const VI32 y = ShiftRight<1>(e); + return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y))); + } + + template <class D, class V, class VI32> + HWY_INLINE V ExpReduce(D d, V x, VI32 q) { + // kLn2Part0f + kLn2Part1f ~= -ln(2) + const V kLn2Part0f = Set(d, -0.693145751953125f); + const V kLn2Part1f = Set(d, -1.428606765330187045e-6f); + + // Extended precision modular arithmetic. + const V qf = ConvertTo(d, q); + x = MulAdd(qf, kLn2Part0f, x); + x = MulAdd(qf, kLn2Part1f, x); + return x; + } +}; + +template <> +struct LogImpl<float> { + template <class D, class V> + HWY_INLINE Vec<Rebind<int32_t, D>> Log2p1NoSubnormal(D /*d*/, V x) { + const Rebind<int32_t, D> di32; + const Rebind<uint32_t, D> du32; + const auto kBias = Set(di32, 0x7F); + return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias); + } + + // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)]. 
+ template <class D, class V> + HWY_INLINE V LogPoly(D d, V x) { + const V k0 = Set(d, 0.66666662693f); + const V k1 = Set(d, 0.40000972152f); + const V k2 = Set(d, 0.28498786688f); + const V k3 = Set(d, 0.24279078841f); + + const V x2 = Mul(x, x); + const V x4 = Mul(x2, x2); + return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4)); + } +}; + +#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 +template <> +struct ExpImpl<double> { + // Rounds double toward zero and returns as int32_t. + template <class D, class V> + HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { + return DemoteTo(Rebind<int32_t, D>(), x); + } + + template <class D, class V> + HWY_INLINE V ExpPoly(D d, V x) { + const auto k0 = Set(d, +0.5); + const auto k1 = Set(d, +0.166666666666666851703837); + const auto k2 = Set(d, +0.0416666666666665047591422); + const auto k3 = Set(d, +0.00833333333331652721664984); + const auto k4 = Set(d, +0.00138888888889774492207962); + const auto k5 = Set(d, +0.000198412698960509205564975); + const auto k6 = Set(d, +2.4801587159235472998791e-5); + const auto k7 = Set(d, +2.75572362911928827629423e-6); + const auto k8 = Set(d, +2.75573911234900471893338e-7); + const auto k9 = Set(d, +2.51112930892876518610661e-8); + const auto k10 = Set(d, +2.08860621107283687536341e-9); + + return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10), + Mul(x, x), x); + } + + // Computes 2^x, where x is an integer. + template <class D, class VI32> + HWY_INLINE Vec<D> Pow2I(D d, VI32 x) { + const Rebind<int32_t, D> di32; + const Rebind<int64_t, D> di64; + const VI32 kOffset = Set(di32, 0x3FF); + return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset)))); + } + + // Sets the exponent of 'x' to 2^e. 
  // In effect multiplies x by 2^e: first by 2^y with y = e >> 1, then by
  // 2^(e - y).
  template <class D, class V, class VI32>
  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
    const VI32 y = ShiftRight<1>(e);
    return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
  }

  // Returns x + q * (kLn2Part0d + kLn2Part1d), i.e. roughly x - q * ln(2).
  template <class D, class V, class VI32>
  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
    // kLn2Part0d + kLn2Part1d ~= -ln(2)
    const V kLn2Part0d = Set(d, -0.6931471805596629565116018);
    const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12);

    // Extended precision modular arithmetic.
    const V qf = PromoteTo(d, q);
    x = MulAdd(qf, kLn2Part0d, x);
    x = MulAdd(qf, kLn2Part1d, x);
    return x;
  }
};

template <>
struct LogImpl<double> {
  // Shifts the raw double bits right by 52 (past the mantissa) and subtracts
  // the exponent bias 0x3FF.
  template <class D, class V>
  HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
    const Rebind<int64_t, D> di64;
    const Rebind<uint64_t, D> du64;
    return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
               Set(di64, 0x3FF));
  }

  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
  // Even-indexed coefficients (k0, k2, k4, k6) form a polynomial in x^4 that
  // is multiplied by x^2; odd-indexed ones (k1, k3, k5) form one multiplied
  // by x^4.
  template <class D, class V>
  HWY_INLINE V LogPoly(D d, V x) {
    const V k0 = Set(d, 0.6666666666666735130);
    const V k1 = Set(d, 0.3999999999940941908);
    const V k2 = Set(d, 0.2857142874366239149);
    const V k3 = Set(d, 0.2222219843214978396);
    const V k4 = Set(d, 0.1818357216161805012);
    const V k5 = Set(d, 0.1531383769920937332);
    const V k6 = Set(d, 0.1479819860511658591);

    const V x2 = Mul(x, x);
    const V x4 = Mul(x2, x2);
    return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
                  (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
  }
};

#endif

// Shared natural-log implementation for float and double lanes.
// When kAllowSubnormals is true, inputs below the smallest normal value are
// first multiplied by kScale and the exponent is corrected by kExpScale;
// callers passing false skip that step.
template <class D, class V, bool kAllowSubnormals = true>
HWY_INLINE V Log(const D d, V x) {
  // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
  using T = TFromD<D>;
  impl::LogImpl<T> impl;

  constexpr bool kIsF32 = (sizeof(T) == 4);

  // Float Constants
  // kLn2Hi + kLn2Lo ~= ln(2), split into a high and a low part.
  const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
                                 : static_cast<T>(0.693147180369123816490));
  const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
                                 : static_cast<T>(1.90821492927058770002e-10));
  const V kOne = Set(d, static_cast<T>(+1.0));
  const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
                                     : static_cast<T>(2.2250738585072014e-308));
  // 2^25 for float, 2^54 for double; matched by kExpScale (-25 / -54) below.
  const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
                                 : static_cast<T>(1.8014398509481984e+16));

  // Integer Constants
  using TI = MakeSigned<T>;
  const Rebind<TI, D> di;
  using VI = decltype(Zero(di));
  const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
                                       : static_cast<TI>(0xFFFFFFFFLL));
  const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
                                   : static_cast<TI>(0x3FE6A09E00000000LL));
  const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
                                     : static_cast<TI>(0x3FF0000000000000LL));
  const VI kExpScale =
      Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
  const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
                                     : static_cast<TI>(0xFFFFF00000000LL));

  // Scale up 'x' so that it is no longer denormalized.
  VI exp_bits;
  V exp;
  if (kAllowSubnormals == true) {
    const auto is_denormal = Lt(x, kMinNormal);
    x = IfThenElse(is_denormal, Mul(x, kScale), x);

    // Compute the new exponent.
    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
    // kExpScale is applied only to the lanes that were scaled up above.
    const VI exp_scale =
        BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
    exp = ConvertTo(
        d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
  } else {
    // Compute the new exponent.
    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
    exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
  }

  // Renormalize.
  const V y = Or(And(x, BitCast(d, kLowerBits)),
                 BitCast(d, Add(And(exp_bits, kManMask), kMagic)));

  // Approximate and reconstruct.
  const V ym1 = Sub(y, kOne);
  const V z = Div(ym1, Add(y, kOne));

  return MulSub(
      exp, kLn2Hi,
      Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
}

}  // namespace impl

// Inverse cosine. Lanes with |x| < 0.5 use pi/2 - (y + t), with t the
// AsinPoly correction term; other lanes work from y = sqrt((1 - |x|) / 2) and
// are reflected to pi - z when x is negative.
template <class D, class V>
HWY_INLINE V Acos(const D d, V x) {
  using T = TFromD<D>;

  const V kZero = Zero(d);
  const V kHalf = Set(d, static_cast<T>(+0.5));
  const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));

  // Split into sign bit and magnitude.
  const V sign_x = And(SignBit(d), x);
  const V abs_x = Xor(x, sign_x);
  const auto mask = Lt(abs_x, kHalf);
  const V yy =
      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
  const V y = IfThenElse(mask, abs_x, Sqrt(yy));

  impl::AsinImpl<T> impl;
  const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));

  const V t_plus_y = Add(t, y);
  const V z =
      IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
                 Add(t_plus_y, t_plus_y));
  return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
}

// Inverse hyperbolic cosine, built on impl::Log. Three input ranges are
// distinguished: x > kLarge (2^28), 2 < x <= kLarge, and x <= 2.
template <class D, class V>
HWY_INLINE V Acosh(const D d, V x) {
  using T = TFromD<D>;

  const V kLarge = Set(d, static_cast<T>(268435456.0));
  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
  const V kOne = Set(d, static_cast<T>(+1.0));
  const V kTwo = Set(d, static_cast<T>(+2.0));

  const auto is_x_large = Gt(x, kLarge);
  const auto is_x_gt_2 = Gt(x, kTwo);

  const V x_minus_1 = Sub(x, kOne);
  const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
  const V y1 =
      Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
  const V y2 =
      IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);

  // Where y2 == 1 the divisor y2 - 1 would be zero; those lanes return y1
  // directly and get a non-zero placeholder divisor (0 - 1).
  const auto is_pole = Eq(y2, kOne);
  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
  return Add(IfThenElse(is_x_gt_2, z,
                        IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
             IfThenElseZero(is_x_large, kLog2));
}

// Inverse sine. Lanes with |x| < 0.5 evaluate the polynomial directly; other
// lanes use pi/2 - 2 * z0 with y = sqrt((1 - |x|) / 2). The input sign is
// OR-ed back into the result.
template <class D, class V>
HWY_INLINE V Asin(const D d, V x) {
  using T = TFromD<D>;

  const V kHalf = Set(d, static_cast<T>(+0.5));
  const V kTwo = Set(d, static_cast<T>(+2.0));
  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));

  const V sign_x = And(SignBit(d), x);
  const V abs_x = Xor(x, sign_x);
  const auto mask = Lt(abs_x, kHalf);
  const V yy =
      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
  const V y = IfThenElse(mask, abs_x, Sqrt(yy));

  impl::AsinImpl<T> impl;
  const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
  const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
  return Or(IfThenElse(mask, z0, z1), sign_x);
}

// Inverse hyperbolic sine, computed on |x| via impl::Log with the sign OR-ed
// back in. Lanes with |x| < kSmall (2^-28) return x unchanged.
template <class D, class V>
HWY_INLINE V Asinh(const D d, V x) {
  using T = TFromD<D>;

  const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
  const V kLarge = Set(d, static_cast<T>(268435456.0));
  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
  const V kOne = Set(d, static_cast<T>(+1.0));
  const V kTwo = Set(d, static_cast<T>(+2.0));

  const V sign_x = And(SignBit(d), x);  // Extract the sign bit
  const V abs_x = Xor(x, sign_x);

  const auto is_x_large = Gt(abs_x, kLarge);
  const auto is_x_lt_2 = Lt(abs_x, kTwo);

  const V x2 = Mul(x, x);
  const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));

  const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
  const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
  const V y2 =
      IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);

  // Same pole handling as in Acosh: avoid dividing by y2 - 1 == 0.
  const auto is_pole = Eq(y2, kOne);
  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
  const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
  const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
  return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
            sign_x);
}

// Inverse tangent. Lanes with |x| > 1 evaluate the polynomial at 1 / |x| and
// return pi/2 minus that; the input sign is OR-ed back into the result.
template <class D, class V>
HWY_INLINE V Atan(const D d, V x) {
  using T = TFromD<D>;

  const V kOne = Set(d, static_cast<T>(+1.0));
  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));

  const V sign = And(SignBit(d), x);
  const V abs_x = Xor(x, sign);
  const auto mask = Gt(abs_x, kOne);

  impl::AtanImpl<T> impl;
  // Lanes that do not take the reciprocal divide by one instead of |x|.
  const auto divisor = IfThenElse(mask, abs_x, kOne);
  const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
  return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
}

// Inverse hyperbolic tangent: 0.5 * Log1p(2|x| / (1 - |x|)), with the sign of
// x folded into the 0.5 factor.
template <class D, class V>
HWY_INLINE V Atanh(const D d, V x) {
  using T = TFromD<D>;

  const V kHalf = Set(d, static_cast<T>(+0.5));
  const V kOne = Set(d, static_cast<T>(+1.0));

  const V sign = And(SignBit(d), x);  // Extract the sign bit
  const V abs_x = Xor(x, sign);
  return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
             Xor(kHalf, sign));
}

// Cosine, via range reduction by multiples of pi/2 followed by CosSinImpl's
// polynomial.
template <class D, class V>
HWY_INLINE V Cos(const D d, V x) {
  using T = TFromD<D>;
  impl::CosSinImpl<T> impl;

  // Float Constants
  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));

  // Integer Constants
  const Rebind<int32_t, D> di32;
  using VI32 = decltype(Zero(di32));
  const VI32 kOne = Set(di32, 1);

  const V y = Abs(x);  // cos(x) == cos(|x|)

  // Compute the quadrant, q = int(|x| / pi) * 2 + 1
  const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);

  // Reduce range, apply sign, and approximate.
  return impl.Poly(
      d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q)));
}

// Exponential. Lanes with x below kLowerBound (-104 for float, -1000 for
// double) return zero.
template <class D, class V>
HWY_INLINE V Exp(const D d, V x) {
  using T = TFromD<D>;

  const V kHalf = Set(d, static_cast<T>(+0.5));
  const V kLowerBound =
      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
  const V kNegZero = Set(d, static_cast<T>(-0.0));
  const V kOne = Set(d, static_cast<T>(+1.0));
  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));

  impl::ExpImpl<T> impl;

  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
  const auto q =
      impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));

  // Reduce, approximate, and then reconstruct.
  const V y = impl.LoadExpShortRange(
      d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
  return IfThenElseZero(Ge(x, kLowerBound), y);
}

// exp(x) - 1. Lanes with |x| < ln(2)/2 return the polynomial result directly;
// lanes with x below kLowerBound return -1.
template <class D, class V>
HWY_INLINE V Expm1(const D d, V x) {
  using T = TFromD<D>;

  const V kHalf = Set(d, static_cast<T>(+0.5));
  const V kLowerBound =
      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
  const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
  const V kNegOne = Set(d, static_cast<T>(-1.0));
  const V kNegZero = Set(d, static_cast<T>(-0.0));
  const V kOne = Set(d, static_cast<T>(+1.0));
  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));

  impl::ExpImpl<T> impl;

  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
  const auto q =
      impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));

  // Reduce, approximate, and then reconstruct.
  const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
  const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
                         Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
  return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
}

// Natural logarithm; forwards to impl::Log with subnormal handling enabled.
template <class D, class V>
HWY_INLINE V Log(const D d, V x) {
  return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x);
}

// Base-10 logarithm: Log(x) scaled by a constant (~= 1 / ln(10)).
template <class D, class V>
HWY_INLINE V Log10(const D d, V x) {
  using T = TFromD<D>;
  return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
}

// log(1 + x). Lanes where 1 + x rounds to exactly 1 return x; other lanes
// return log(y) * x / (y - 1) with y = 1 + x.
template <class D, class V>
HWY_INLINE V Log1p(const D d, V x) {
  using T = TFromD<D>;
  const V kOne = Set(d, static_cast<T>(+1.0));

  const V y = Add(x, kOne);
  const auto is_pole = Eq(y, kOne);
  // Pole lanes get a non-zero placeholder divisor (0 - 1).
  const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
  const auto non_pole =
      Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
  return IfThenElse(is_pole, x, non_pole);
}

// Base-2 logarithm: Log(x) scaled by a constant (~= 1 / ln(2)).
template <class D, class V>
HWY_INLINE V Log2(const D d, V x) {
  using T = TFromD<D>;
  return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
}

// Sine, via range reduction by multiples of pi followed by CosSinImpl's
// polynomial. The input sign is folded into the reduced argument.
template <class D, class V>
HWY_INLINE V Sin(const D d, V x) {
  using T = TFromD<D>;
  impl::CosSinImpl<T> impl;

  // Float Constants
  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
  const V kHalf = Set(d, static_cast<T>(0.5));

  // Integer Constants
  const Rebind<int32_t, D> di32;
  using VI32 = decltype(Zero(di32));

  const V abs_x = Abs(x);
  const V sign_x = Xor(abs_x, x);

  // Compute the quadrant, q = int((|x| / pi) + 0.5)
  const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf));

  // Reduce range, apply sign, and approximate.
  return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q),
                          Xor(impl.SinSignFromQuadrant(d, q), sign_x)));
}

// Hyperbolic sine: with y = Expm1(|x|), returns ((y + 2) / (y + 1)) * (y / 2)
// carrying the sign of x.
template <class D, class V>
HWY_INLINE V Sinh(const D d, V x) {
  using T = TFromD<D>;
  const V kHalf = Set(d, static_cast<T>(+0.5));
  const V kOne = Set(d, static_cast<T>(+1.0));
  const V kTwo = Set(d, static_cast<T>(+2.0));

  const V sign = And(SignBit(d), x);  // Extract the sign bit
  const V abs_x = Xor(x, sign);
  const V y = Expm1(d, abs_x);
  const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
  return Xor(z, sign);  // Reapply the sign bit
}

// Hyperbolic tangent: with y = Expm1(2|x|), returns y / (y + 2) carrying the
// sign of x. Lanes with |x| > kLimit return +/-1.
template <class D, class V>
HWY_INLINE V Tanh(const D d, V x) {
  using T = TFromD<D>;
  const V kLimit = Set(d, static_cast<T>(18.714973875));
  const V kOne = Set(d, static_cast<T>(+1.0));
  const V kTwo = Set(d, static_cast<T>(+2.0));

  const V sign = And(SignBit(d), x);  // Extract the sign bit
  const V abs_x = Xor(x, sign);
  const V y = Expm1(d, Mul(abs_x, kTwo));
  const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
  return Xor(z, sign);  // Reapply the sign bit
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
diff --git a/media/highway/src/hwy/contrib/math/math_test.cc b/media/highway/src/hwy/contrib/math/math_test.cc
new file mode 100644
index 0000000000..ec6032c6c1
--- /dev/null
+++ b/media/highway/src/hwy/contrib/math/math_test.cc
@@ -0,0 +1,223 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <stdio.h> + +#include <cfloat> // FLT_MAX +#include <type_traits> + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc" +#include "hwy/foreach_target.h" + +#include "hwy/contrib/math/math-inl.h" +#include "hwy/tests/test_util-inl.h" +// clang-format on + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +template <class Out, class In> +inline Out BitCast(const In& in) { + static_assert(sizeof(Out) == sizeof(In), ""); + Out out; + CopyBytes<sizeof(out)>(&in, &out); + return out; +} + +template <class T, class D> +HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T), + Vec<D> (*fxN)(D, VecArg<Vec<D>>), D d, T min, T max, + uint64_t max_error_ulp) { + using UintT = MakeUnsigned<T>; + + const UintT min_bits = BitCast<UintT>(min); + const UintT max_bits = BitCast<UintT>(max); + + // If min is negative and max is positive, the range needs to be broken into + // two pieces, [+0, max] and [-0, min], otherwise [min, max]. + int range_count = 1; + UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}}; + if ((min < 0.0) && (max > 0.0)) { + ranges[0][0] = BitCast<UintT>(static_cast<T>(+0.0)); + ranges[0][1] = max_bits; + ranges[1][0] = BitCast<UintT>(static_cast<T>(-0.0)); + ranges[1][1] = min_bits; + range_count = 2; + } + + uint64_t max_ulp = 0; + // Emulation is slower, so cannot afford as many. 
+ constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000)); + for (int range_index = 0; range_index < range_count; ++range_index) { + const UintT start = ranges[range_index][0]; + const UintT stop = ranges[range_index][1]; + const UintT step = HWY_MAX(1, ((stop - start) / kSamplesPerRange)); + for (UintT value_bits = start; value_bits <= stop; value_bits += step) { + // For reasons unknown, the HWY_MAX is necessary on RVV, otherwise + // value_bits can be less than start, and thus possibly NaN. + const T value = BitCast<T>(HWY_MIN(HWY_MAX(start, value_bits), stop)); + const T actual = GetLane(fxN(d, Set(d, value))); + const T expected = fx1(value); + + // Skip small inputs and outputs on armv7, it flushes subnormals to zero. +#if HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7 + if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) { + continue; + } +#endif + + const auto ulp = hwy::detail::ComputeUlpDelta(actual, expected); + max_ulp = HWY_MAX(max_ulp, ulp); + if (ulp > max_error_ulp) { + fprintf(stderr, + "%s: %s(%f) expected %f actual %f ulp %" PRIu64 " max ulp %u\n", + hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), value, + expected, actual, static_cast<uint64_t>(ulp), + static_cast<uint32_t>(max_error_ulp)); + } + } + } + fprintf(stderr, "%s: %s max_ulp %" PRIu64 "\n", + hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), max_ulp); + HWY_ASSERT(max_ulp <= max_error_ulp); +} + +#define DEFINE_MATH_TEST_FUNC(NAME) \ + HWY_NOINLINE void TestAll##NAME() { \ + ForFloatTypes(ForPartialVectors<Test##NAME>()); \ + } + +#undef DEFINE_MATH_TEST +#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \ + F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR) \ + struct Test##NAME { \ + template <class T, class D> \ + HWY_NOINLINE void operator()(T, D d) { \ + if (sizeof(T) == 4) { \ + TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX, \ + F32_ERROR); \ + } else { \ + TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, \ + 
static_cast<T>(F64_MIN), static_cast<T>(F64_MAX), \ + F64_ERROR); \ + } \ + } \ + }; \ + DEFINE_MATH_TEST_FUNC(NAME) + +// Floating point values closest to but less than 1.0 +const float kNearOneF = BitCast<float>(0x3F7FFFFF); +const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL); + +// The discrepancy is unacceptably large for MSYS2 (less accurate libm?), so +// only increase the error tolerance there. +constexpr uint64_t Cos64ULP() { +#if defined(__MINGW32__) + return 23; +#else + return 3; +#endif +} + +constexpr uint64_t ACosh32ULP() { +#if defined(__MINGW32__) + return 8; +#else + return 3; +#endif +} + +// clang-format off +DEFINE_MATH_TEST(Acos, + std::acos, CallAcos, -1.0f, +1.0f, 3, // NEON is 3 instead of 2 + std::acos, CallAcos, -1.0, +1.0, 2) +DEFINE_MATH_TEST(Acosh, + std::acosh, CallAcosh, +1.0f, +FLT_MAX, ACosh32ULP(), + std::acosh, CallAcosh, +1.0, +DBL_MAX, 3) +DEFINE_MATH_TEST(Asin, + std::asin, CallAsin, -1.0f, +1.0f, 4, // ARMv7 is 4 instead of 2 + std::asin, CallAsin, -1.0, +1.0, 2) +DEFINE_MATH_TEST(Asinh, + std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3, + std::asinh, CallAsinh, -DBL_MAX, +DBL_MAX, 3) +DEFINE_MATH_TEST(Atan, + std::atan, CallAtan, -FLT_MAX, +FLT_MAX, 3, + std::atan, CallAtan, -DBL_MAX, +DBL_MAX, 3) +DEFINE_MATH_TEST(Atanh, + std::atanh, CallAtanh, -kNearOneF, +kNearOneF, 4, // NEON is 4 instead of 3 + std::atanh, CallAtanh, -kNearOneD, +kNearOneD, 3) +DEFINE_MATH_TEST(Cos, + std::cos, CallCos, -39000.0f, +39000.0f, 3, + std::cos, CallCos, -39000.0, +39000.0, Cos64ULP()) +DEFINE_MATH_TEST(Exp, + std::exp, CallExp, -FLT_MAX, +104.0f, 1, + std::exp, CallExp, -DBL_MAX, +104.0, 1) +DEFINE_MATH_TEST(Expm1, + std::expm1, CallExpm1, -FLT_MAX, +104.0f, 4, + std::expm1, CallExpm1, -DBL_MAX, +104.0, 4) +DEFINE_MATH_TEST(Log, + std::log, CallLog, +FLT_MIN, +FLT_MAX, 1, + std::log, CallLog, +DBL_MIN, +DBL_MAX, 1) +DEFINE_MATH_TEST(Log10, + std::log10, CallLog10, +FLT_MIN, +FLT_MAX, 2, + std::log10, CallLog10, +DBL_MIN, +DBL_MAX, 
2) +DEFINE_MATH_TEST(Log1p, + std::log1p, CallLog1p, +0.0f, +1e37f, 3, // NEON is 3 instead of 2 + std::log1p, CallLog1p, +0.0, +DBL_MAX, 2) +DEFINE_MATH_TEST(Log2, + std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2, + std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2) +DEFINE_MATH_TEST(Sin, + std::sin, CallSin, -39000.0f, +39000.0f, 3, + std::sin, CallSin, -39000.0, +39000.0, 4) // MSYS is 4 instead of 3 +DEFINE_MATH_TEST(Sinh, + std::sinh, CallSinh, -80.0f, +80.0f, 4, + std::sinh, CallSinh, -709.0, +709.0, 4) +DEFINE_MATH_TEST(Tanh, + std::tanh, CallTanh, -FLT_MAX, +FLT_MAX, 4, + std::tanh, CallTanh, -DBL_MAX, +DBL_MAX, 4) +// clang-format on + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +HWY_BEFORE_TEST(HwyMathTest); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh); +HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh); +} // namespace hwy + +#endif diff --git a/media/highway/src/hwy/contrib/sort/BUILD b/media/highway/src/hwy/contrib/sort/BUILD new file mode 100644 index 0000000000..2e1ddcc0cc --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/BUILD @@ -0,0 +1,134 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +# Unused on Bazel 
builds, where this is not defined/known; Copybara replaces +# usages with an empty list. +COMPAT = [ + "//buildenv/target:non_prod", # includes mobile/vendor. +] + +cc_library( + name = "vqsort", + srcs = [ + # Split into separate files to reduce MSVC build time. + "vqsort.cc", + "vqsort_i16a.cc", + "vqsort_i16d.cc", + "vqsort_u16a.cc", + "vqsort_u16d.cc", + "vqsort_f32a.cc", + "vqsort_f32d.cc", + "vqsort_i32a.cc", + "vqsort_i32d.cc", + "vqsort_u32a.cc", + "vqsort_u32d.cc", + "vqsort_f64a.cc", + "vqsort_f64d.cc", + "vqsort_i64a.cc", + "vqsort_i64d.cc", + "vqsort_u64a.cc", + "vqsort_u64d.cc", + "vqsort_128a.cc", + "vqsort_128d.cc", + ], + hdrs = [ + "disabled_targets.h", + "vqsort.h", # public interface + ], + compatible_with = [], + local_defines = ["hwy_contrib_EXPORTS"], + textual_hdrs = [ + "shared-inl.h", + "sorting_networks-inl.h", + "traits-inl.h", + "traits128-inl.h", + "vqsort-inl.h", + ], + deps = [ + # Only if VQSORT_SECURE_RNG is set. + # "//third_party/absl/random", + "//:hwy", + ], +) + +# ----------------------------------------------------------------------------- +# Internal-only targets + +cc_library( + name = "helpers", + testonly = 1, + textual_hdrs = [ + "algo-inl.h", + "result-inl.h", + ], + deps = [ + ":vqsort", + "//:nanobenchmark", + # Required for HAVE_PDQSORT, but that is unused and this is + # unavailable to Bazel builds, hence commented out. + # "//third_party/boost/allowed", + # Avoid ips4o and thus TBB to work around hwloc build failure. + ], +) + +cc_binary( + name = "print_network", + testonly = 1, + srcs = ["print_network.cc"], + deps = [ + ":helpers", + ":vqsort", + "//:hwy", + ], +) + +cc_test( + name = "sort_test", + size = "medium", + srcs = ["sort_test.cc"], + features = ["fully_static_link"], + linkstatic = True, + local_defines = ["HWY_IS_TEST"], + # for test_suite. 
+ tags = ["hwy_ops_test"], + deps = [ + ":helpers", + ":vqsort", + "@com_google_googletest//:gtest_main", + "//:hwy", + "//:hwy_test_util", + ], +) + +cc_binary( + name = "bench_sort", + testonly = 1, + srcs = ["bench_sort.cc"], + features = ["fully_static_link"], + linkstatic = True, + local_defines = ["HWY_IS_TEST"], + deps = [ + ":helpers", + ":vqsort", + "@com_google_googletest//:gtest_main", + "//:hwy", + "//:hwy_test_util", + ], +) + +cc_binary( + name = "bench_parallel", + testonly = 1, + srcs = ["bench_parallel.cc"], + features = ["fully_static_link"], + linkstatic = True, + local_defines = ["HWY_IS_TEST"], + deps = [ + ":helpers", + ":vqsort", + "@com_google_googletest//:gtest_main", + "//:hwy", + "//:hwy_test_util", + ], +) diff --git a/media/highway/src/hwy/contrib/sort/algo-inl.h b/media/highway/src/hwy/contrib/sort/algo-inl.h new file mode 100644 index 0000000000..6e85ca681a --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/algo-inl.h @@ -0,0 +1,401 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Normal include guard for target-independent parts +#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ + +#include <stdint.h> +#include <string.h> // memcpy + +#include <algorithm> +#include <cmath> // std::abs +#include <vector> + +#include "hwy/base.h" +#include "hwy/contrib/sort/vqsort.h" + +// Third-party algorithms +#define HAVE_AVX2SORT 0 +#define HAVE_IPS4O 0 +// When enabling, consider changing max_threads (required for Table 1a) +#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1) +#define HAVE_PDQSORT 0 +#define HAVE_SORT512 0 + +#if HAVE_AVX2SORT +HWY_PUSH_ATTRIBUTES("avx2,avx") +#include "avx2sort.h" +HWY_POP_ATTRIBUTES +#endif +#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O +#include "third_party/ips4o/include/ips4o.hpp" +#include "third_party/ips4o/include/ips4o/thread_pool.hpp" +#endif +#if HAVE_PDQSORT +#include "third_party/boost/allowed/sort/sort.hpp" +#endif +#if HAVE_SORT512 +#include "sort512.h" +#endif + +namespace hwy { + +enum class Dist { kUniform8, kUniform16, kUniform32 }; + +std::vector<Dist> AllDist() { + return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32}; +} + +const char* DistName(Dist dist) { + switch (dist) { + case Dist::kUniform8: + return "uniform8"; + case Dist::kUniform16: + return "uniform16"; + case Dist::kUniform32: + return "uniform32"; + } + return "unreachable"; +} + +template <typename T> +class InputStats { + public: + void Notify(T value) { + min_ = std::min(min_, value); + max_ = std::max(max_, value); + // Converting to integer would truncate floats, multiplying to save digits + // risks overflow especially when casting, so instead take the sum of the + // bit representations as the checksum. 
+ uint64_t bits = 0; + static_assert(sizeof(T) <= 8, "Expected a built-in type"); + CopyBytes<sizeof(T)>(&value, &bits); + sum_ += bits; + count_ += 1; + } + + bool operator==(const InputStats& other) const { + if (count_ != other.count_) { + HWY_ABORT("count %d vs %d\n", static_cast<int>(count_), + static_cast<int>(other.count_)); + } + + if (min_ != other.min_ || max_ != other.max_) { + HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_), + double(other.min_), double(other.max_)); + } + + // Sum helps detect duplicated/lost values + if (sum_ != other.sum_) { + HWY_ABORT("Sum mismatch; min %g max %g\n", double(min_), double(max_)); + } + + return true; + } + + private: + T min_ = hwy::HighestValue<T>(); + T max_ = hwy::LowestValue<T>(); + uint64_t sum_ = 0; + size_t count_ = 0; +}; + +enum class Algo { +#if HAVE_AVX2SORT + kSEA, +#endif +#if HAVE_IPS4O + kIPS4O, +#endif +#if HAVE_PARALLEL_IPS4O + kParallelIPS4O, +#endif +#if HAVE_PDQSORT + kPDQ, +#endif +#if HAVE_SORT512 + kSort512, +#endif + kStd, + kVQSort, + kHeap, +}; + +const char* AlgoName(Algo algo) { + switch (algo) { +#if HAVE_AVX2SORT + case Algo::kSEA: + return "sea"; +#endif +#if HAVE_IPS4O + case Algo::kIPS4O: + return "ips4o"; +#endif +#if HAVE_PARALLEL_IPS4O + case Algo::kParallelIPS4O: + return "par_ips4o"; +#endif +#if HAVE_PDQSORT + case Algo::kPDQ: + return "pdq"; +#endif +#if HAVE_SORT512 + case Algo::kSort512: + return "sort512"; +#endif + case Algo::kStd: + return "std"; + case Algo::kVQSort: + return "vq"; + case Algo::kHeap: + return "heap"; + } + return "unreachable"; +} + +} // namespace hwy +#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ + +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE +#endif + +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/traits128-inl.h" 
#include "hwy/contrib/sort/vqsort-inl.h"  // HeapSort
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Vectorized pseudo-random bit generator used to produce benchmark/test input.
// All members are static; the caller owns the two state vectors.
class Xorshift128Plus {
  // Scalar 64-bit mixing function used only to derive the initial seeds.
  static HWY_INLINE uint64_t SplitMix64(uint64_t z) {
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
    return z ^ (z >> 31);
  }

 public:
  // Generates two vectors of 64-bit seeds via SplitMix64 and stores into
  // `seeds`. Generating these afresh in each ChoosePivot is too expensive.
  // `seeds` must have room for 2 * Lanes(du64) elements; each seed is the mix
  // of the previous one, starting from a fixed constant, so the sequence is
  // deterministic.
  template <class DU64>
  static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
    seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);
    for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
      seeds[i] = SplitMix64(seeds[i - 1]);
    }
  }

  // Need to pass in the state because vectors cannot be class members.
  // Returns one vector of random 64-bit lanes and advances `state0`/`state1`
  // in place. The order of the statements below matters: `state0` is
  // overwritten before `s1` is shifted, and `state1` uses the shifted `s1`.
  template <class DU64>
  static Vec<DU64> RandomBits(DU64 /* tag */, Vec<DU64>& state0,
                              Vec<DU64>& state1) {
    Vec<DU64> s1 = state0;
    Vec<DU64> s0 = state1;
    const Vec<DU64> bits = Add(s1, s0);
    state0 = s0;
    s1 = Xor(s1, ShiftLeft<23>(s1));
    state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
    return bits;
  }
};

// Integer overload: returns random bits ANDed with `mask` (see MaskForDist),
// advancing the generator state `s0`/`s1`.
template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
                       const Vec<DU64> mask) {
  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
  return And(bits, mask);
}

// Important to avoid denormals, which are flushed to zero by SIMD but not
// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
// Floating-point overload: produces random lanes of type T (viewed as u64
// vectors) that are neither NaN nor denormal. Advances `s0`/`s1`.
template <typename T, class DU64, HWY_IF_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
                       const Vec<DU64> mask) {
  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
  const Vec<DU64> values = And(bits, mask);
#if HWY_TARGET == HWY_SCALAR  // Cannot repartition u64 to i32
  const RebindToSigned<DU64> di;
#else
  const Repartition<MakeSigned<T>, DU64> di;
#endif
  const RebindToFloat<decltype(di)> df;
  const RebindToUnsigned<decltype(di)> du;
  // Bit patterns of 1.0 and of the mantissa field, broadcast to every T lane.
  const auto k1 = BitCast(du64, Set(df, T{1.0}));
  const auto mantissa = BitCast(du64, Set(du, MantissaMask<T>()));
  // Avoid NaN/denormal by converting from (range-limited) integer.
  // NOTE(review): OrAnd presumably computes k1 | (values & mantissa) - confirm
  // against the Highway op reference.
  const Vec<DU64> no_nan = OrAnd(k1, values, mantissa);
  return BitCast(du64, ConvertTo(df, BitCast(di, no_nan)));
}

// Returns the u64 bit mask that RandomValues() applies for `dist`, given the
// key size in bytes (`sizeof_t` must be 2, 4 or 8; anything else aborts). The
// mask pattern repeats per key within each 64-bit lane.
template <class DU64>
Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
  switch (sizeof_t) {
    case 2:
      return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull
                                                 : 0xFFFFFFFFFFFFFFFFull);
    case 4:
      return Set(du64, (dist == Dist::kUniform8)    ? 0x000000FF000000FFull
                       : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull
                                                    : 0xFFFFFFFFFFFFFFFFull);
    case 8:
      return Set(du64, (dist == Dist::kUniform8)    ? 0x00000000000000FFull
                       : (dist == Dist::kUniform16) ? 0x000000000000FFFFull
                                                    : 0x00000000FFFFFFFFull);
    default:
      HWY_ABORT("Logic error");
      return Zero(du64);
  }
}

// Fills v[0, num) with pseudo-random keys drawn according to `dist` and returns
// the InputStats of the generated keys. The generator is re-seeded from a
// fixed constant on every call, so the output is deterministic.
template <typename T>
InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
  SortTag<uint64_t> du64;
  using VU64 = Vec<decltype(du64)>;
  const size_t N64 = Lanes(du64);
  // Holds the two seed vectors first, then is reused as a staging buffer.
  auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
  Xorshift128Plus::GenerateSeeds(du64, buf.get());
  auto s0 = Load(du64, buf.get());
  auto s1 = Load(du64, buf.get() + N64);

  const VU64 mask = MaskForDist(du64, dist, sizeof(T));

  const Repartition<T, decltype(du64)> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  // Whole vectors.
  for (; i + N <= num; i += N) {
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
#if HWY_ARCH_RVV
    // v may not be 64-bit aligned
    StoreU(bits, du64, buf.get());
    memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
#else
    StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i));
#endif
  }
  // Remainder: generate one more vector and copy only the keys that fit.
  if (i < num) {
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
    StoreU(bits, du64, buf.get());
    memcpy(v + i, buf.get(), (num - i) * sizeof(T));
  }

  InputStats<T> input_stats;
  for (size_t i = 0; i < num; ++i) {
    input_stats.Notify(v[i]);
  }
  return input_stats;
}

// Per-thread state; Run() indexes SharedState::tls by thread.
struct ThreadLocal {
  Sorter sorter;
};

// State shared by all calls to Run(): one ThreadLocal per thread (initially a
// single entry) and, if enabled, the ips4o thread pool.
struct SharedState {
#if HAVE_PARALLEL_IPS4O
  const unsigned max_threads = hwy::LimitsMax<unsigned>();  // 16 for Table 1a
  ips4o::StdThreadPool pool{static_cast<int>(
      HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
#endif
  std::vector<ThreadLocal> tls{1};
};

// Sorts inout[0, num) in the order given by `Order` using `algo`. `thread`
// selects the entry of shared.tls used by kVQSort; it is ignored by the other
// algorithms visible here. Aborts for an `algo` without a case.
template <class Order, typename T>
void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
         size_t thread) {
  using detail::HeapSort;
  using detail::TraitsLane;
  using detail::SharedTraits;

  switch (algo) {
#if HAVE_AVX2SORT
    case Algo::kSEA:
      return avx2::quicksort(inout, static_cast<int>(num));
#endif

#if HAVE_IPS4O
    case Algo::kIPS4O:
      if (Order().IsAscending()) {
        return ips4o::sort(inout, inout + num, std::less<T>());
      } else {
        return ips4o::sort(inout, inout + num, std::greater<T>());
      }
#endif

#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      if (Order().IsAscending()) {
        return ips4o::parallel::sort(inout, inout + num, std::less<T>(),
                                     shared.pool);
      } else {
        return ips4o::parallel::sort(inout, inout + num, std::greater<T>(),
                                     shared.pool);
      }
#endif

#if HAVE_SORT512
    case Algo::kSort512:
      HWY_ABORT("not supported");
      //  return Sort512::Sort(inout, num);
#endif

#if HAVE_PDQSORT
    case Algo::kPDQ:
      if (Order().IsAscending()) {
        return boost::sort::pdqsort_branchless(inout, inout + num,
                                               std::less<T>());
      } else {
        return boost::sort::pdqsort_branchless(inout, inout + num,
                                               std::greater<T>());
      }
#endif

    case Algo::kStd:
      if (Order().IsAscending()) {
        return std::sort(inout, inout + num, std::less<T>());
      } else {
        return std::sort(inout, inout + num, std::greater<T>());
      }

    case Algo::kVQSort:
      return shared.tls[thread].sorter(inout, num, Order());

    case Algo::kHeap:
      // The lane-based traits used below only handle keys under 16 bytes.
      HWY_ASSERT(sizeof(T) < 16);
      if (Order().IsAscending()) {
        const SharedTraits<TraitsLane<detail::OrderAscending>> st;
        return HeapSort(st, inout, num);
      } else {
        const SharedTraits<TraitsLane<detail::OrderDescending>> st;
        return HeapSort(st, inout, num);
      }

    default:
      HWY_ABORT("Not implemented");
  }
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
diff --git a/media/highway/src/hwy/contrib/sort/bench_parallel.cc b/media/highway/src/hwy/contrib/sort/bench_parallel.cc
new file mode 100644
index 0000000000..c0cb058dd2
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/bench_parallel.cc
@@ -0,0 +1,240 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in
compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Concurrent, independent sorts for generating more memory traffic and testing +// scalability. + +#include <stdint.h> +#include <stdio.h> + +#include <condition_variable> //NOLINT +#include <functional> +#include <memory> +#include <mutex> //NOLINT +#include <thread> //NOLINT +#include <utility> +#include <vector> + +// clang-format off +#include "hwy/contrib/sort/vqsort.h" +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc" //NOLINT +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/algo-inl.h" +#include "hwy/contrib/sort/result-inl.h" +#include "hwy/aligned_allocator.h" +// Last +#include "hwy/tests/test_util-inl.h" +// clang-format on + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128 + +class ThreadPool { + public: + // Starts the given number of worker threads and blocks until they are ready. + explicit ThreadPool( + const size_t num_threads = std::thread::hardware_concurrency()) + : num_threads_(num_threads) { + HWY_ASSERT(num_threads_ > 0); + threads_.reserve(num_threads_); + for (size_t i = 0; i < num_threads_; ++i) { + threads_.emplace_back(ThreadFunc, this, i); + } + + WorkersReadyBarrier(); + } + + ThreadPool(const ThreadPool&) = delete; + ThreadPool& operator&(const ThreadPool&) = delete; + + // Waits for all threads to exit. 
+ ~ThreadPool() { + StartWorkers(kWorkerExit); + + for (std::thread& thread : threads_) { + thread.join(); + } + } + + size_t NumThreads() const { return threads_.size(); } + + template <class Func> + void RunOnThreads(size_t max_threads, const Func& func) { + task_ = &CallClosure<Func>; + data_ = &func; + StartWorkers(max_threads); + WorkersReadyBarrier(); + } + + private: + // After construction and between calls to Run, workers are "ready", i.e. + // waiting on worker_start_cv_. They are "started" by sending a "command" + // and notifying all worker_start_cv_ waiters. (That is why all workers + // must be ready/waiting - otherwise, the notification will not reach all of + // them and the main thread waits in vain for them to report readiness.) + using WorkerCommand = uint64_t; + + static constexpr WorkerCommand kWorkerWait = ~1ULL; + static constexpr WorkerCommand kWorkerExit = ~2ULL; + + // Calls a closure (lambda with captures). + template <class Closure> + static void CallClosure(const void* f, size_t thread) { + (*reinterpret_cast<const Closure*>(f))(thread); + } + + void WorkersReadyBarrier() { + std::unique_lock<std::mutex> lock(mutex_); + // Typically only a single iteration. + while (workers_ready_ != threads_.size()) { + workers_ready_cv_.wait(lock); + } + workers_ready_ = 0; + + // Safely handle spurious worker wakeups. + worker_start_command_ = kWorkerWait; + } + + // Precondition: all workers are ready. + void StartWorkers(const WorkerCommand worker_command) { + std::unique_lock<std::mutex> lock(mutex_); + worker_start_command_ = worker_command; + // Workers will need this lock, so release it before they wake up. + lock.unlock(); + worker_start_cv_.notify_all(); + } + + static void ThreadFunc(ThreadPool* self, size_t thread) { + // Until kWorkerExit command received: + for (;;) { + std::unique_lock<std::mutex> lock(self->mutex_); + // Notify main thread that this thread is ready. 
+ if (++self->workers_ready_ == self->num_threads_) { + self->workers_ready_cv_.notify_one(); + } + RESUME_WAIT: + // Wait for a command. + self->worker_start_cv_.wait(lock); + const WorkerCommand command = self->worker_start_command_; + switch (command) { + case kWorkerWait: // spurious wakeup: + goto RESUME_WAIT; // lock still held, avoid incrementing ready. + case kWorkerExit: + return; // exits thread + default: + break; + } + + lock.unlock(); + // Command is the maximum number of threads that should run the task. + HWY_ASSERT(command < self->NumThreads()); + if (thread < command) { + self->task_(self->data_, thread); + } + } + } + + const size_t num_threads_; + + // Unmodified after ctor, but cannot be const because we call thread::join(). + std::vector<std::thread> threads_; + + std::mutex mutex_; // guards both cv and their variables. + std::condition_variable workers_ready_cv_; + size_t workers_ready_ = 0; + std::condition_variable worker_start_cv_; + WorkerCommand worker_start_command_; + + // Written by main thread, read by workers (after mutex lock/unlock). 
+ std::function<void(const void*, size_t)> task_; // points to CallClosure + const void* data_; // points to caller's Func +}; + +template <class Order, typename T> +void RunWithoutVerify(const Dist dist, const size_t num, const Algo algo, + SharedState& shared, size_t thread) { + auto aligned = hwy::AllocateAligned<T>(num); + + (void)GenerateInput(dist, aligned.get(), num); + + const Timestamp t0; + Run<Order>(algo, aligned.get(), num, shared, thread); + HWY_ASSERT(aligned[0] < aligned[num - 1]); +} + +void BenchParallel() { + // Not interested in benchmark results for other targets on x86 + if (HWY_ARCH_X86 && (HWY_TARGET != HWY_AVX2 && HWY_TARGET != HWY_AVX3)) { + return; + } + + ThreadPool pool; + const size_t NT = pool.NumThreads(); + + using T = int64_t; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st; + + size_t num = 100 * 1000 * 1000; + +#if HAVE_IPS4O + const Algo algo = Algo::kIPS4O; +#else + const Algo algo = Algo::kVQSort; +#endif + const Dist dist = Dist::kUniform16; + + SharedState shared; + shared.tls.resize(NT); + + std::vector<Result> results; + for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) { + Timestamp t0; + // Default capture because MSVC wants algo/dist but clang does not. 
+ pool.RunOnThreads(nt, [=, &shared](size_t thread) { + RunWithoutVerify<SortAscending, T>(dist, num, algo, shared, thread); + }); + const double sec = SecondsSince(t0); + results.push_back(MakeResult<T>(algo, dist, st, num, nt, sec)); + results.back().Print(); + } +} + +#else +void BenchParallel() {} +#endif + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +namespace { +HWY_BEFORE_TEST(BenchParallel); +HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel); +} // namespace +} // namespace hwy + +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/bench_sort.cc b/media/highway/src/hwy/contrib/sort/bench_sort.cc new file mode 100644 index 0000000000..65fc3e18d9 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/bench_sort.cc @@ -0,0 +1,267 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"  // SharedTraits
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on

#include <stdint.h>
#include <stdio.h>
#include <string.h>  // memcpy

#include <vector>

HWY_BEFORE_NAMESPACE();
namespace hwy {
// Defined within HWY_ONCE, used by BenchAllSort.
extern uint32_t first_sort_target;

namespace HWY_NAMESPACE {
namespace {
using detail::TraitsLane;
using detail::OrderAscending;
using detail::OrderDescending;
using detail::SharedTraits;

#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128
using detail::OrderAscending128;
using detail::OrderDescending128;
using detail::Traits128;

// Times detail::Partition on freshly generated kUniform8 input with a fixed
// pivot of 128 and prints the summarized result.
template <class Traits, typename T>
HWY_NOINLINE void BenchPartition() {
  const SortTag<T> d;
  detail::SharedTraits<Traits> st;
  const Dist dist = Dist::kUniform8;
  double sum = 0.0;

  const size_t max_log2 = AdjustedLog2Reps(20);
  // As written, this loop runs exactly once (log2 == max_log2).
  for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
    const size_t num = 1ull << log2;
    auto aligned = hwy::AllocateAligned<T>(num);
    auto buf =
        hwy::AllocateAligned<T>(hwy::SortConstants::PartitionBufNum(Lanes(d)));

    std::vector<double> seconds;
    // Fewer repetitions for larger inputs.
    const size_t num_reps = (1ull << (14 - log2 / 2)) * kReps;
    for (size_t rep = 0; rep < num_reps; ++rep) {
      // Regenerate outside the timed region: Partition modifies its input.
      (void)GenerateInput(dist, aligned.get(), num);

      const Timestamp t0;

      detail::Partition(d, st, aligned.get(), 0, num - 1, Set(d, T(128)),
                        buf.get());
      seconds.push_back(SecondsSince(t0));
      // 'Use' the result to prevent optimizing out the partition.
      sum += static_cast<double>(aligned.get()[num / 2]);
    }

    MakeResult<T>(Algo::kVQSort, dist, st, num, 1,
                  SummarizeMeasurements(seconds))
        .Print();
  }
  HWY_ASSERT(sum != 999999);  // Prevent optimizing out
}

// Runs BenchPartition for a selection of key types and orders.
HWY_NOINLINE void BenchAllPartition() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
    return;
  }

  BenchPartition<TraitsLane<OrderDescending>, float>();
  BenchPartition<TraitsLane<OrderAscending>, int64_t>();
  BenchPartition<Traits128<OrderDescending128>, uint64_t>();
}

// Times detail::BaseCase on SortConstants::BaseCaseNum(N) keys, verifies the
// output against the input statistics and appends one Result to `results`.
template <class Traits, typename T>
HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
    return;
  }

  const SortTag<T> d;
  detail::SharedTraits<Traits> st;
  const Dist dist = Dist::kUniform32;

  const size_t N = Lanes(d);
  const size_t num = SortConstants::BaseCaseNum(N);
  auto keys = hwy::AllocateAligned<T>(num);
  auto buf = hwy::AllocateAligned<T>(num + N);

  std::vector<double> seconds;
  double sum = 0;                             // prevents elision
  constexpr size_t kMul = AdjustedReps(600);  // ensures long enough to measure

  for (size_t rep = 0; rep < kReps; ++rep) {
    InputStats<T> input_stats = GenerateInput(dist, keys.get(), num);

    const Timestamp t0;
    // The same keys are sorted kMul times, so only the first call sees
    // unsorted input.
    for (size_t i = 0; i < kMul; ++i) {
      detail::BaseCase(d, st, keys.get(), num, buf.get());
      sum += static_cast<double>(keys[0]);
    }
    seconds.push_back(SecondsSince(t0));
    // printf("%f\n", seconds.back());

    HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num, "BenchBase"));
  }
  HWY_ASSERT(sum < 1E99);
  results.push_back(MakeResult<T>(Algo::kVQSort, dist, st, num * kMul, 1,
                                  SummarizeMeasurements(seconds)));
}

// Runs BenchBase for a selection of key types and prints all results at the
// end.
HWY_NOINLINE void BenchAllBase() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3) {
    return;
  }

  std::vector<Result> results;
  BenchBase<TraitsLane<OrderAscending>, float>(results);
  BenchBase<TraitsLane<OrderDescending>, int64_t>(results);
  BenchBase<Traits128<OrderAscending128>, uint64_t>(results);
  for (const Result& r : results) {
    r.Print();
  }
}

// Returns the algorithms BenchSort compares; the set depends on which HAVE_*
// macros are enabled in algo-inl.h.
std::vector<Algo> AlgoForBench() {
  return {
#if HAVE_AVX2SORT
    Algo::kSEA,
#endif
#if HAVE_PARALLEL_IPS4O
    Algo::kParallelIPS4O,
#elif HAVE_IPS4O
    Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
    Algo::kPDQ,
#endif
#if HAVE_SORT512
    Algo::kSort512,
#endif

// These are 10-20x slower, but that's OK for the default size when we are
// not testing the parallel mode.
#if !HAVE_PARALLEL_IPS4O
    Algo::kStd, Algo::kHeap,

    Algo::kVQSort,  // only ~4x slower, but not required for Table 1a
#endif

  };
}

// For every algorithm and distribution: generates `num` keys, times the sort
// kReps times, verifies each output and prints the summarized result.
template <class Traits, typename T>
HWY_NOINLINE void BenchSort(size_t num) {
  // Remember the first target this runs for (see the filter below).
  if (first_sort_target == 0) first_sort_target = HWY_TARGET;

  SharedState shared;
  detail::SharedTraits<Traits> st;
  auto aligned = hwy::AllocateAligned<T>(num);
  for (Algo algo : AlgoForBench()) {
    // Other algorithms don't depend on the vector instructions, so only run
    // them for the first target.
    if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) continue;

    for (Dist dist : AllDist()) {
      std::vector<double> seconds;
      for (size_t rep = 0; rep < kReps; ++rep) {
        InputStats<T> input_stats = GenerateInput(dist, aligned.get(), num);

        const Timestamp t0;
        Run<typename Traits::Order>(algo, aligned.get(), num, shared,
                                    /*thread=*/0);
        seconds.push_back(SecondsSince(t0));
        // printf("%f\n", seconds.back());

        HWY_ASSERT(
            VerifySort(st, input_stats, aligned.get(), num, "BenchSort"));
      }
      MakeResult<T>(algo, dist, st, num, 1, SummarizeMeasurements(seconds))
          .Print();
    }  // dist
  }    // algo
}

// Runs BenchSort for a selection of key types and orders.
HWY_NOINLINE void BenchAllSort() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
    return;
  }

  constexpr size_t K = 1000;
  constexpr size_t M = K * K;
  // Silence unused-variable warnings: which constant is used depends on the
  // preprocessor branch below.
  (void)K;
  (void)M;
  for (size_t num : {
#if HAVE_PARALLEL_IPS4O
         100 * M,
#else
         AdjustedReps(1 * M),
#endif
       }) {
    BenchSort<TraitsLane<OrderAscending>, float>(num);
    // BenchSort<TraitsLane<OrderDescending>, double>(num);
    // BenchSort<TraitsLane<OrderAscending>, int16_t>(num);
    BenchSort<TraitsLane<OrderDescending>, int32_t>(num);
    BenchSort<TraitsLane<OrderAscending>, int64_t>(num);
    // BenchSort<TraitsLane<OrderDescending>, uint16_t>(num);
    // BenchSort<TraitsLane<OrderDescending>, uint32_t>(num);
    // BenchSort<TraitsLane<OrderAscending>, uint64_t>(num);

    BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
  }
}

#else
void BenchAllPartition() {}
void BenchAllBase() {}
void BenchAllSort() {}
#endif

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
uint32_t first_sort_target = 0;  // none run yet
namespace {
HWY_BEFORE_TEST(BenchSort);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
}  // namespace
}  // namespace hwy

#endif  // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/disabled_targets.h b/media/highway/src/hwy/contrib/sort/disabled_targets.h
new file mode 100644
index 0000000000..4c3f54b45b
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/disabled_targets.h
@@ -0,0 +1,31 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Speed up MSVC builds by building fewer targets. This header must be included
// from all TUs that contain a HWY_DYNAMIC_DISPATCH to vqsort, i.e. vqsort_*.cc.
// However, users of vqsort.h are unaffected.

#ifndef HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
#define HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_

#include "hwy/base.h"

#if HWY_COMPILER_MSVC
#undef HWY_DISABLED_TARGETS
// Either HWY_SCALAR/HWY_EMU128 remains, so we still have a valid target.
#define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
#endif  // HWY_COMPILER_MSVC

#endif  // HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
diff --git a/media/highway/src/hwy/contrib/sort/print_network.cc b/media/highway/src/hwy/contrib/sort/print_network.cc
new file mode 100644
index 0000000000..59cfebcfbd
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/print_network.cc
@@ -0,0 +1,191 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>

#include <algorithm>

#include "hwy/base.h"

// Based on A.7 in "Entwurf und Implementierung vektorisierter
// Sortieralgorithmen" and code by Mark Blacher.
+void PrintMergeNetwork16x2() { + for (int i = 8; i < 16; ++i) { + printf("v%x = st.SwapAdjacent(d, v%x);\n", i, i); + } + for (int i = 0; i < 8; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i); + } + for (int i = 0; i < 4; ++i) { + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 4, i + 4); + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 12, i + 12); + } + for (int i = 0; i < 4; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i); + printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i); + } + for (int i = 0; i < 16; i += 4) { + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 2, i + 2); + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 3, i + 3); + } + for (int i = 0; i < 16; i += 4) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 3); + printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2); + } + for (int i = 0; i < 16; i += 2) { + printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 1, i + 1); + } + for (int i = 0; i < 16; i += 2) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 1); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i); + } + printf("\n"); +} + +void PrintMergeNetwork16x4() { + printf("\n"); + + for (int i = 8; i < 16; ++i) { + printf("v%x = st.Reverse4(d, v%x);\n", i, i); + } + for (int i = 0; i < 8; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i); + } + for (int i = 0; i < 4; ++i) { + printf("v%x = st.Reverse4(d, v%x);\n", i + 4, i + 4); + printf("v%x = st.Reverse4(d, v%x);\n", i + 12, i + 12); + } + for (int i = 0; i < 4; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i); + printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i); + } + for (int i = 0; i < 16; i += 4) { + printf("v%x = st.Reverse4(d, v%x);\n", i + 2, i + 2); + printf("v%x = st.Reverse4(d, v%x);\n", i + 3, i + 3); + } + for (int i = 0; i < 16; i += 4) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 3); + printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2); + } + for (int i = 0; i < 16; i += 2) { + printf("v%x = st.Reverse4(d, v%x);\n", i + 1, i + 
1); + } + for (int i = 0; i < 16; i += 2) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 1); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsReverse4(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i); + } +} + +void PrintMergeNetwork16x8() { + printf("\n"); + + for (int i = 8; i < 16; ++i) { + printf("v%x = st.ReverseKeys8(d, v%x);\n", i, i); + } + for (int i = 0; i < 8; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i); + } + for (int i = 0; i < 4; ++i) { + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 4, i + 4); + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 12, i + 12); + } + for (int i = 0; i < 4; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i); + printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i); + } + for (int i = 0; i < 16; i += 4) { + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 2, i + 2); + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 3, i + 3); + } + for (int i = 0; i < 16; i += 4) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 3); + printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2); + } + for (int i = 0; i < 16; i += 2) { + printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 1, i + 1); + } + for (int i = 0; i < 16; i += 2) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 1); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsReverse8(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i); + } +} + +void PrintMergeNetwork16x16() { + printf("\n"); + + for (int i = 8; i < 16; ++i) { + printf("v%x = st.ReverseKeys16(d, v%x);\n", i, i); + } + for (int i = 0; i < 8; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i); + } + for (int i = 0; i < 4; ++i) { + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 4, i + 4); + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 12, i + 12); + } + 
for (int i = 0; i < 4; ++i) { + printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i); + printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i); + } + for (int i = 0; i < 16; i += 4) { + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 2, i + 2); + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 3, i + 3); + } + for (int i = 0; i < 16; i += 4) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 3); + printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2); + } + for (int i = 0; i < 16; i += 2) { + printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 1, i + 1); + } + for (int i = 0; i < 16; i += 2) { + printf("st.Sort2(d, v%x, v%x);\n", i, i + 1); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsReverse16<kOrder>(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance4<kOrder>(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i); + } + for (int i = 0; i < 16; ++i) { + printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i); + } +} + +int main(int argc, char** argv) { + PrintMergeNetwork16x2(); + PrintMergeNetwork16x4(); + PrintMergeNetwork16x8(); + PrintMergeNetwork16x16(); + return 0; +} diff --git a/media/highway/src/hwy/contrib/sort/result-inl.h b/media/highway/src/hwy/contrib/sort/result-inl.h new file mode 100644 index 0000000000..402f639d5b --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/result-inl.h @@ -0,0 +1,150 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/algo-inl.h" + +// Normal include guard for non-SIMD parts +#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_ + +#include <time.h> + +#include <algorithm> // std::sort +#include <string> + +#include "hwy/base.h" +#include "hwy/nanobenchmark.h" + +namespace hwy { + +struct Timestamp { + Timestamp() { t = platform::Now(); } + double t; +}; + +double SecondsSince(const Timestamp& t0) { + const Timestamp t1; + return t1.t - t0.t; +} + +constexpr size_t kReps = 30; + +// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often +// enough for the mode to be reliable). +double SummarizeMeasurements(std::vector<double>& seconds) { + std::sort(seconds.begin(), seconds.end()); + double sum = 0; + int count = 0; + for (size_t i = kReps / 4; i < seconds.size() - kReps / 2; ++i) { + sum += seconds[i]; + count += 1; + } + return sum / count; +} + +} // namespace hwy +#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_ + +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +struct Result { + Result() {} + Result(const uint32_t target, const Algo algo, Dist dist, bool is128, + size_t num, size_t num_threads, double sec, size_t sizeof_t, + const char* type_name) + : target(target), + algo(algo), + dist(dist), + is128(is128), + num(num), + num_threads(num_threads), + sec(sec), + sizeof_t(sizeof_t), + type_name(type_name) {} + + void Print() const { + const double bytes = static_cast<double>(num) * + static_cast<double>(num_threads) * + static_cast<double>(sizeof_t); + printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu 
threads)\n", + hwy::TargetName(target), AlgoName(algo), + is128 ? "u128" : type_name.c_str(), DistName(dist), + static_cast<double>(num), bytes * 1E-6 / sec, num_threads); + } + + uint32_t target; + Algo algo; + Dist dist; + bool is128; + size_t num = 0; + size_t num_threads = 0; + double sec = 0.0; + size_t sizeof_t = 0; + std::string type_name; +}; + +template <typename T, class Traits> +Result MakeResult(const Algo algo, Dist dist, Traits st, size_t num, + size_t num_threads, double sec) { + char string100[100]; + hwy::detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, string100); + return Result(HWY_TARGET, algo, dist, st.Is128(), num, num_threads, sec, + sizeof(T), string100); +} + +template <class Traits, typename T> +bool VerifySort(Traits st, const InputStats<T>& input_stats, const T* out, + size_t num, const char* caller) { + constexpr size_t N1 = st.Is128() ? 2 : 1; + HWY_ASSERT(num >= N1); + + InputStats<T> output_stats; + // Ensure it matches the sort order + for (size_t i = 0; i < num - N1; i += N1) { + output_stats.Notify(out[i]); + if (N1 == 2) output_stats.Notify(out[i + 1]); + // Reverse order instead of checking !Compare1 so we accept equal keys. + if (st.Compare1(out + i + N1, out + i)) { + printf("%s: i=%d of %d: N1=%d %5.0f %5.0f vs. 
%5.0f %5.0f\n\n", caller, + static_cast<int>(i), static_cast<int>(num), static_cast<int>(N1), + double(out[i + 1]), double(out[i + 0]), double(out[i + N1 + 1]), + double(out[i + N1])); + HWY_ABORT("%d-bit sort is incorrect\n", + static_cast<int>(sizeof(T) * 8 * N1)); + } + } + output_stats.Notify(out[num - N1]); + if (N1 == 2) output_stats.Notify(out[num - N1 + 1]); + + return input_stats == output_stats; +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE diff --git a/media/highway/src/hwy/contrib/sort/shared-inl.h b/media/highway/src/hwy/contrib/sort/shared-inl.h new file mode 100644 index 0000000000..f98a3d5286 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/shared-inl.h @@ -0,0 +1,122 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Definitions shared between vqsort-inl and sorting_networks-inl. + +// Normal include guard for target-independent parts +#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ + +#include "hwy/base.h" + +namespace hwy { + +// Internal constants - these are to avoid magic numbers/literals and cannot be +// changed without also changing the associated code. +struct SortConstants { +// SortingNetwork reshapes its input into a matrix. 
This is the maximum number +// of *keys* per vector. +#if HWY_COMPILER_MSVC + static constexpr size_t kMaxCols = 8; // avoids build timeout +#else + static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector +#endif + + // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers, + // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase + // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the + // extra logN factor for larger networks (for which only loose upper bounds + // on size are known). + static constexpr size_t kMaxRowsLog2 = 4; + static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2; + + static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) { + return kMaxRows * HWY_MIN(N, kMaxCols); + } + + // Unrolling is important (pipelining and amortizing branch mispredictions); + // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but + // somewhat slower for sorting than 4x. + // + // To change, must also update left + 3 * N etc. in the loop. + static constexpr size_t kPartitionUnroll = 4; + + static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) { + // The main loop reads kPartitionUnroll vectors, and first loads from + // both left and right beforehand, so it requires min = 2 * + // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed + // >= BaseCaseNum), we partition the right side into a buffer. We need + // another vector at the end so CompressStore does not overwrite anything. + return (2 * kPartitionUnroll + 1) * N; + } + + // Chunk := group of keys loaded for sampling a pivot. Matches the typical + // cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors + // are larger, use entire vectors to ensure we do not overrun the array. 
+ static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) { + return HWY_MAX(64 / sizeof_t, N); + } + + static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) { + // 3 chunks of medians, 1 chunk of median medians plus two padding vectors. + return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N; + } + + template <typename T> + static constexpr HWY_INLINE size_t BufNum(size_t N) { + // One extra for padding plus another for full-vector loads. + return HWY_MAX(BaseCaseNum(N) + 2 * N, + HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N))); + } + + template <typename T> + static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) { + return sizeof(T) * BufNum<T>(vector_size / sizeof(T)); + } +}; + +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ + +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE +#endif + +#include "hwy/highway.h" + +namespace hwy { +namespace HWY_NAMESPACE { + +// Default tag / vector width selector. +#if HWY_TARGET == HWY_RVV +// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl. 
+template <typename T> +using SortTag = ScalableTag<T, -1>; +#else +template <typename T> +using SortTag = ScalableTag<T>; +#endif + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE diff --git a/media/highway/src/hwy/contrib/sort/sort_test.cc b/media/highway/src/hwy/contrib/sort/sort_test.cc new file mode 100644 index 0000000000..2f44866a26 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/sort_test.cc @@ -0,0 +1,585 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc" +#include "hwy/foreach_target.h" + +#include "hwy/contrib/sort/vqsort.h" +// After foreach_target +#include "hwy/contrib/sort/algo-inl.h" +#include "hwy/contrib/sort/result-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase +#include "hwy/tests/test_util-inl.h" +// clang-format on + +#include <stdint.h> +#include <stdio.h> +#include <string.h> // memcpy + +#include <algorithm> // std::max +#include <vector> + +#undef VQSORT_TEST_IMPL +#if (HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128) || \ + (defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD) +// Scalar does not implement these, and MSVC non-debug builds time out. 
+#define VQSORT_TEST_IMPL 0 +#else +#define VQSORT_TEST_IMPL 1 +#endif + +#undef VQSORT_TEST_SORT +// MSVC non-debug builds time out. +#if defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD +#define VQSORT_TEST_SORT 0 +#else +#define VQSORT_TEST_SORT 1 +#endif + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace { + +#if VQSORT_TEST_IMPL || VQSORT_TEST_SORT +using detail::TraitsLane; +using detail::OrderAscending; +using detail::OrderAscending128; +using detail::OrderDescending; +using detail::OrderDescending128; +using detail::SharedTraits; +using detail::Traits128; +#endif + +#if !VQSORT_TEST_IMPL +static void TestAllMedian() {} +static void TestAllBaseCase() {} +static void TestAllPartition() {} +static void TestAllGenerator() {} +#else + +template <class Traits> +static HWY_NOINLINE void TestMedian3() { + using T = uint64_t; + using D = CappedTag<T, 1>; + SharedTraits<Traits> st; + const D d; + using V = Vec<D>; + for (uint32_t bits = 0; bits < 8; ++bits) { + const V v0 = Set(d, T{(bits & (1u << 0)) ? 1u : 0u}); + const V v1 = Set(d, T{(bits & (1u << 1)) ? 1u : 0u}); + const V v2 = Set(d, T{(bits & (1u << 2)) ? 1u : 0u}); + const T m = GetLane(detail::MedianOf3(st, v0, v1, v2)); + // If at least half(rounded up) of bits are 1, so is the median. + const size_t count = PopCount(bits); + HWY_ASSERT_EQ((count >= 2) ? 
static_cast<T>(1) : 0, m); + } +} + +HWY_NOINLINE void TestAllMedian() { + TestMedian3<TraitsLane<OrderAscending> >(); +} + +template <class Traits, typename T> +static HWY_NOINLINE void TestBaseCaseAscDesc() { + SharedTraits<Traits> st; + const SortTag<T> d; + const size_t N = Lanes(d); + const size_t base_case_num = SortConstants::BaseCaseNum(N); + const size_t N1 = st.LanesPerKey(); + + constexpr int kDebug = 0; + auto aligned_keys = hwy::AllocateAligned<T>(N + base_case_num + N); + auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N); + + std::vector<size_t> lengths; + lengths.push_back(HWY_MAX(1, N1)); + lengths.push_back(3 * N1); + lengths.push_back(base_case_num / 2); + lengths.push_back(base_case_num / 2 + N1); + lengths.push_back(base_case_num - N1); + lengths.push_back(base_case_num); + + std::vector<size_t> misalignments; + misalignments.push_back(0); + misalignments.push_back(1); + if (N >= 6) misalignments.push_back(N / 2 - 1); + misalignments.push_back(N / 2); + misalignments.push_back(N / 2 + 1); + misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1})); + + for (bool asc : {false, true}) { + for (size_t len : lengths) { + for (size_t misalign : misalignments) { + T* HWY_RESTRICT keys = aligned_keys.get() + misalign; + if (kDebug) { + printf("============%s asc %d N1 %d len %d misalign %d\n", + hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(N1), + static_cast<int>(len), static_cast<int>(misalign)); + } + + for (size_t i = 0; i < misalign; ++i) { + aligned_keys[i] = hwy::LowestValue<T>(); + } + InputStats<T> input_stats; + for (size_t i = 0; i < len; ++i) { + keys[i] = + asc ? 
static_cast<T>(T(i) + 1) : static_cast<T>(T(len) - T(i)); + input_stats.Notify(keys[i]); + if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i])); + } + for (size_t i = len; i < base_case_num + N; ++i) { + keys[i] = hwy::LowestValue<T>(); + } + + detail::BaseCase(d, st, keys, len, buf.get()); + + if (kDebug >= 2) { + printf("out>>>>>>\n"); + for (size_t i = 0; i < len; ++i) { + printf("%3zu: %f\n", i, double(keys[i])); + } + } + + HWY_ASSERT(VerifySort(st, input_stats, keys, len, "BaseAscDesc")); + for (size_t i = 0; i < misalign; ++i) { + if (aligned_keys[i] != hwy::LowestValue<T>()) + HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i)); + } + for (size_t i = len; i < base_case_num + N; ++i) { + if (keys[i] != hwy::LowestValue<T>()) + HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); + } + } // misalign + } // len + } // asc +} + +template <class Traits, typename T> +static HWY_NOINLINE void TestBaseCase01() { + SharedTraits<Traits> st; + const SortTag<T> d; + const size_t N = Lanes(d); + const size_t base_case_num = SortConstants::BaseCaseNum(N); + const size_t N1 = st.LanesPerKey(); + + constexpr int kDebug = 0; + auto keys = hwy::AllocateAligned<T>(base_case_num + N); + auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N); + + std::vector<size_t> lengths; + lengths.push_back(HWY_MAX(1, N1)); + lengths.push_back(3 * N1); + lengths.push_back(base_case_num / 2); + lengths.push_back(base_case_num / 2 + N1); + lengths.push_back(base_case_num - N1); + lengths.push_back(base_case_num); + + for (size_t len : lengths) { + if (kDebug) { + printf("============%s 01 N1 %d len %d\n", hwy::TypeName(T(), 1).c_str(), + static_cast<int>(N1), static_cast<int>(len)); + } + const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14})); + for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) { + InputStats<T> input_stats; + for (size_t i = 0; i < len; ++i) { + keys[i] = (i < 64 && (bits & (1ull << i))) ? 
1 : 0; + input_stats.Notify(keys[i]); + if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i])); + } + for (size_t i = len; i < base_case_num + N; ++i) { + keys[i] = hwy::LowestValue<T>(); + } + + detail::BaseCase(d, st, keys.get(), len, buf.get()); + + if (kDebug >= 2) { + printf("out>>>>>>\n"); + for (size_t i = 0; i < len; ++i) { + printf("%3zu: %f\n", i, double(keys[i])); + } + } + + HWY_ASSERT(VerifySort(st, input_stats, keys.get(), len, "Base01")); + for (size_t i = len; i < base_case_num + N; ++i) { + if (keys[i] != hwy::LowestValue<T>()) + HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); + } + } // bits + } // len +} + +template <class Traits, typename T> +static HWY_NOINLINE void TestBaseCase() { + TestBaseCaseAscDesc<Traits, T>(); + TestBaseCase01<Traits, T>(); +} + +HWY_NOINLINE void TestAllBaseCase() { + // Workaround for stack overflow on MSVC debug. +#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3) + return; +#endif + + TestBaseCase<TraitsLane<OrderAscending>, int32_t>(); + TestBaseCase<TraitsLane<OrderDescending>, int64_t>(); + TestBaseCase<Traits128<OrderAscending128>, uint64_t>(); + TestBaseCase<Traits128<OrderDescending128>, uint64_t>(); +} + +template <class Traits, typename T> +static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys, + size_t left, size_t border, + size_t right, const size_t N1, + const T* pivot) { + /* for (size_t i = left; i < right; ++i) { + if (i == border) printf("--\n"); + printf("%4zu: %3d\n", i, keys[i]); + }*/ + + HWY_ASSERT(left % N1 == 0); + HWY_ASSERT(border % N1 == 0); + HWY_ASSERT(right % N1 == 0); + const bool asc = typename Traits::Order().IsAscending(); + for (size_t i = left; i < border; i += N1) { + if (st.Compare1(pivot, keys + i)) { + HWY_ABORT( + "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f " + "border %d", + hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i), + double(pivot[1]), double(pivot[0]), double(keys[i + 1]), + double(keys[i + 
0]), static_cast<int>(border)); + } + } + for (size_t i = border; i < right; i += N1) { + if (!st.Compare1(pivot, keys + i)) { + HWY_ABORT( + "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f " + "border %d", + hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i), + double(pivot[1]), double(pivot[0]), double(keys[i + 1]), + double(keys[i]), static_cast<int>(border)); + } + } +} + +template <class Traits, typename T> +static HWY_NOINLINE void TestPartition() { + const SortTag<T> d; + SharedTraits<Traits> st; + const bool asc = typename Traits::Order().IsAscending(); + const size_t N = Lanes(d); + constexpr int kDebug = 0; + const size_t base_case_num = SortConstants::BaseCaseNum(N); + // left + len + align + const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N; + auto aligned_keys = hwy::AllocateAligned<T>(total); + auto buf = hwy::AllocateAligned<T>(SortConstants::PartitionBufNum(N)); + + const size_t N1 = st.LanesPerKey(); + for (bool in_asc : {false, true}) { + for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) { + const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1); + for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2, + 2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) { + const size_t len = (base_case_num + ofs) & ~(N1 - 1); + for (T pivot1 : + {T(0), T(len / 3), T(len / 2), T(2 * len / 3), T(len)}) { + const T pivot2[2] = {pivot1, 0}; + const auto pivot = st.SetKey(d, pivot2); + for (size_t misalign = 0; misalign < N; + misalign += st.LanesPerKey()) { + T* HWY_RESTRICT keys = aligned_keys.get() + misalign; + const size_t right = left + len; + if (kDebug) { + printf( + "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n", + hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(left), + static_cast<int>(len), static_cast<int>(right), + double(pivot2[1]), double(pivot2[0])); + } + + for (size_t i = 0; i < misalign; ++i) { + aligned_keys[i] = hwy::LowestValue<T>(); + } + for 
(size_t i = 0; i < left; ++i) { + keys[i] = hwy::LowestValue<T>(); + } + for (size_t i = left; i < right; ++i) { + keys[i] = static_cast<T>(in_asc ? T(i + 1) - static_cast<T>(left) + : static_cast<T>(right) - T(i)); + if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i])); + } + for (size_t i = right; i < total - misalign; ++i) { + keys[i] = hwy::LowestValue<T>(); + } + + size_t border = + detail::Partition(d, st, keys, left, right, pivot, buf.get()); + + if (kDebug >= 2) { + printf("out>>>>>>\n"); + for (size_t i = left; i < right; ++i) { + printf("%3zu: %f\n", i, double(keys[i])); + } + for (size_t i = right; i < total - misalign; ++i) { + printf("%3zu: sentinel %f\n", i, double(keys[i])); + } + } + + VerifyPartition(st, keys, left, border, right, N1, pivot2); + for (size_t i = 0; i < misalign; ++i) { + if (aligned_keys[i] != hwy::LowestValue<T>()) + HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i)); + } + for (size_t i = 0; i < left; ++i) { + if (keys[i] != hwy::LowestValue<T>()) + HWY_ABORT("Overrun left at %d\n", static_cast<int>(i)); + } + for (size_t i = right; i < total - misalign; ++i) { + if (keys[i] != hwy::LowestValue<T>()) + HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); + } + } // misalign + } // pivot + } // len + } // left + } // asc +} + +HWY_NOINLINE void TestAllPartition() { + TestPartition<TraitsLane<OrderAscending>, int16_t>(); + TestPartition<TraitsLane<OrderDescending>, int32_t>(); + TestPartition<TraitsLane<OrderAscending>, int64_t>(); + TestPartition<TraitsLane<OrderDescending>, float>(); +#if HWY_HAVE_FLOAT64 + TestPartition<TraitsLane<OrderDescending>, double>(); +#endif + TestPartition<Traits128<OrderAscending128>, uint64_t>(); + TestPartition<Traits128<OrderDescending128>, uint64_t>(); +} + +// (used for sample selection for choosing a pivot) +template <typename TU> +static HWY_NOINLINE void TestRandomGenerator() { + static_assert(!hwy::IsSigned<TU>(), ""); + SortTag<TU> du; + const size_t N = Lanes(du); + + 
detail::Generator rng(&N, N); + + const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N); // power of two + + for (uint32_t num_blocks = 2; num_blocks < 100000; + num_blocks = 3 * num_blocks / 2) { + // Generate some numbers and ensure all are in range + uint64_t sum = 0; + constexpr size_t kReps = 10000; + for (size_t rep = 0; rep < kReps; ++rep) { + const uint32_t bits = rng() & 0xFFFFFFFF; + const size_t index = detail::RandomChunkIndex(num_blocks, bits); + HWY_ASSERT(((index + 1) * lanes_per_block) <= + num_blocks * lanes_per_block); + + sum += index; + } + + // Also ensure the mean is near the middle of the range + const double expected = (num_blocks - 1) / 2.0; + const double actual = double(sum) / kReps; + HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected); + } +} + +HWY_NOINLINE void TestAllGenerator() { + TestRandomGenerator<uint32_t>(); + TestRandomGenerator<uint64_t>(); +} + +#endif // VQSORT_TEST_IMPL + +#if !VQSORT_TEST_SORT +static void TestAllSort() {} +#else + +// Remembers input, and compares results to that of a reference algorithm. 
+template <class Traits, typename T> +class CompareResults { + public: + void SetInput(const T* in, size_t num) { + copy_.resize(num); + memcpy(copy_.data(), in, num * sizeof(T)); + } + + bool Verify(const T* output) { +#if HAVE_PDQSORT + const Algo reference = Algo::kPDQ; +#else + const Algo reference = Algo::kStd; +#endif + SharedState shared; + using Order = typename Traits::Order; + Run<Order>(reference, copy_.data(), copy_.size(), shared, + /*thread=*/0); + + for (size_t i = 0; i < copy_.size(); ++i) { + if (copy_[i] != output[i]) { + fprintf(stderr, "Asc %d mismatch at %d: %A %A\n", Order().IsAscending(), + static_cast<int>(i), double(copy_[i]), double(output[i])); + return false; + } + } + return true; + } + + private: + std::vector<T> copy_; +}; + +std::vector<Algo> AlgoForTest() { + return { +#if HAVE_AVX2SORT + Algo::kSEA, +#endif +#if HAVE_IPS4O + Algo::kIPS4O, +#endif +#if HAVE_PDQSORT + Algo::kPDQ, +#endif +#if HAVE_SORT512 + Algo::kSort512, +#endif + Algo::kHeap, Algo::kVQSort, + }; +} + +template <class Traits, typename T> +void TestSort(size_t num) { + // TODO(janwas): fix + if (HWY_TARGET == HWY_SSSE3) return; +// Workaround for stack overflow on clang-cl (/F 8388608 does not help). +#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3) + return; +#endif + + SharedState shared; + SharedTraits<Traits> st; + + // Round up to a whole number of keys. 
+ num += (st.Is128() && (num & 1)); + + constexpr size_t kMaxMisalign = 16; + auto aligned = hwy::AllocateAligned<T>(kMaxMisalign + num + kMaxMisalign); + for (Algo algo : AlgoForTest()) { +#if HAVE_IPS4O + if (st.Is128() && (algo == Algo::kIPS4O || algo == Algo::kParallelIPS4O)) { + continue; + } +#endif + for (Dist dist : AllDist()) { + for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()}, + size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) { + T* keys = aligned.get() + misalign; + + // Set up red zones before/after the keys to sort + for (size_t i = 0; i < misalign; ++i) { + aligned[i] = hwy::LowestValue<T>(); + } + for (size_t i = 0; i < kMaxMisalign; ++i) { + keys[num + i] = hwy::HighestValue<T>(); + } +#if HWY_IS_MSAN + __msan_poison(aligned.get(), misalign * sizeof(T)); + __msan_poison(keys + num, kMaxMisalign * sizeof(T)); +#endif + InputStats<T> input_stats = GenerateInput(dist, keys, num); + + CompareResults<Traits, T> compare; + compare.SetInput(keys, num); + + Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0); + HWY_ASSERT(compare.Verify(keys)); + HWY_ASSERT(VerifySort(st, input_stats, keys, num, "TestSort")); + + // Check red zones +#if HWY_IS_MSAN + __msan_unpoison(aligned.get(), misalign * sizeof(T)); + __msan_unpoison(keys + num, kMaxMisalign * sizeof(T)); +#endif + for (size_t i = 0; i < misalign; ++i) { + if (aligned[i] != hwy::LowestValue<T>()) + HWY_ABORT("Overrun left at %d\n", static_cast<int>(i)); + } + for (size_t i = num; i < num + kMaxMisalign; ++i) { + if (keys[i] != hwy::HighestValue<T>()) + HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); + } + } // misalign + } // dist + } // algo +} + +void TestAllSort() { + const size_t num = AdjustedReps(20 * 1000); + + TestSort<TraitsLane<OrderAscending>, int16_t>(num); + TestSort<TraitsLane<OrderDescending>, uint16_t>(num); + + TestSort<TraitsLane<OrderDescending>, int32_t>(num); + TestSort<TraitsLane<OrderDescending>, uint32_t>(num); + + 
TestSort<TraitsLane<OrderAscending>, int64_t>(num); + TestSort<TraitsLane<OrderAscending>, uint64_t>(num); + + // WARNING: for float types, SIMD comparisons will flush denormals to zero, + // causing mismatches with scalar sorts. In this test, we avoid generating + // denormal inputs. + TestSort<TraitsLane<OrderAscending>, float>(num); +#if HWY_HAVE_FLOAT64 // protects algo-inl's GenerateRandom + if (Sorter::HaveFloat64()) { + TestSort<TraitsLane<OrderDescending>, double>(num); + } +#endif + + TestSort<Traits128<OrderAscending128>, uint64_t>(num); + TestSort<Traits128<OrderAscending128>, uint64_t>(num); +} + +#endif // VQSORT_TEST_SORT + +} // namespace +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace hwy { +namespace { +HWY_BEFORE_TEST(SortTest); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator); +HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort); +} // namespace +} // namespace hwy + +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/sorting_networks-inl.h b/media/highway/src/hwy/contrib/sort/sorting_networks-inl.h new file mode 100644 index 0000000000..ce72fe6b58 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/sorting_networks-inl.h @@ -0,0 +1,687 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#endif

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

using Constants = hwy::SortConstants;

// ------------------------------ SharedTraits

// Code shared between all traits. It's unclear whether these can profitably be
// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
// Compare/DupOdd.
//
// `Base` must provide Sort2, SwapAdjacentPairs, ReverseKeys, ReverseKeys8,
// OddEvenPairs and OddEvenQuads, which the members below call.
template <class Base>
struct SharedTraits : public Base {
  // Conditionally swaps lane 0 with 2, 1 with 3 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->SwapAdjacentPairs(d, v);
    base->Sort2(d, v, swapped);
    return base->OddEvenPairs(d, swapped, v);
  }

  // Swaps with the vector formed by reversing contiguous groups of 8 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys8(d, v);
    base->Sort2(d, v, swapped);
    return base->OddEvenQuads(d, swapped, v);
  }

  // Swaps with the vector formed by reversing contiguous groups of 16 keys.
  // Because kMaxCols <= 16 (see static_assert), a group of 16 is the entire
  // vector, hence the plain ReverseKeys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
    Vec<D> swapped = base->ReverseKeys(d, v);
    base->Sort2(d, v, swapped);
    return ConcatUpperLower(d, swapped, v);  // 8 = half of the vector
  }
};

// ------------------------------ Sorting network

// (Green's irregular) sorting network for independent columns in 16 vectors.
// Consists of 60 calls to st.Sort2, each of which acts on corresponding lanes
// of two of the 16 vectors; the order of the calls must be preserved.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  st.Sort2(d, v0, v2);
  st.Sort2(d, v1, v3);
  st.Sort2(d, v4, v6);
  st.Sort2(d, v5, v7);
  st.Sort2(d, v8, va);
  st.Sort2(d, v9, vb);
  st.Sort2(d, vc, ve);
  st.Sort2(d, vd, vf);
  st.Sort2(d, v0, v4);
  st.Sort2(d, v1, v5);
  st.Sort2(d, v2, v6);
  st.Sort2(d, v3, v7);
  st.Sort2(d, v8, vc);
  st.Sort2(d, v9, vd);
  st.Sort2(d, va, ve);
  st.Sort2(d, vb, vf);
  st.Sort2(d, v0, v8);
  st.Sort2(d, v1, v9);
  st.Sort2(d, v2, va);
  st.Sort2(d, v3, vb);
  st.Sort2(d, v4, vc);
  st.Sort2(d, v5, vd);
  st.Sort2(d, v6, ve);
  st.Sort2(d, v7, vf);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v7, vb);
  st.Sort2(d, vd, ve);
  st.Sort2(d, v4, v8);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v1, v4);
  st.Sort2(d, v7, vd);
  st.Sort2(d, v2, v8);
  st.Sort2(d, vb, ve);
  st.Sort2(d, v2, v4);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v9, va);
  st.Sort2(d, vb, vd);
  st.Sort2(d, v3, v8);
  st.Sort2(d, v7, vc);
  st.Sort2(d, v3, v5);
  st.Sort2(d, v6, v8);
  st.Sort2(d, v7, v9);
  st.Sort2(d, va, vc);
  st.Sort2(d, v3, v4);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v7, v8);
  st.Sort2(d, v9, va);
  st.Sort2(d, vb, vc);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
}

// ------------------------------ Merging networks

// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
//
// Merge2/4/8/16 share the same structure: four rounds, each of which applies
// ReverseKeysN to eight of the vectors and then calls Sort2 on eight vector
// pairs, followed by per-vector stages (SortPairsReverse*/SortPairsDistance*)
// applied to all 16 vectors.

// Uses ReverseKeys2; the per-vector stage is SortPairsDistance1.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  // Round 1: reverse v8..vf, then Sort2 vi with v(15-i).
  v8 = st.ReverseKeys2(d, v8);
  v9 = st.ReverseKeys2(d, v9);
  va = st.ReverseKeys2(d, va);
  vb = st.ReverseKeys2(d, vb);
  vc = st.ReverseKeys2(d, vc);
  vd = st.ReverseKeys2(d, vd);
  ve = st.ReverseKeys2(d, ve);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  // Round 2: within each group of eight vectors.
  v4 = st.ReverseKeys2(d, v4);
  vc = st.ReverseKeys2(d, vc);
  v5 = st.ReverseKeys2(d, v5);
  vd = st.ReverseKeys2(d, vd);
  v6 = st.ReverseKeys2(d, v6);
  ve = st.ReverseKeys2(d, ve);
  v7 = st.ReverseKeys2(d, v7);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  // Round 3: within each group of four vectors.
  v2 = st.ReverseKeys2(d, v2);
  v3 = st.ReverseKeys2(d, v3);
  v6 = st.ReverseKeys2(d, v6);
  v7 = st.ReverseKeys2(d, v7);
  va = st.ReverseKeys2(d, va);
  vb = st.ReverseKeys2(d, vb);
  ve = st.ReverseKeys2(d, ve);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  // Round 4: within each pair of vectors.
  v1 = st.ReverseKeys2(d, v1);
  v3 = st.ReverseKeys2(d, v3);
  v5 = st.ReverseKeys2(d, v5);
  v7 = st.ReverseKeys2(d, v7);
  v9 = st.ReverseKeys2(d, v9);
  vb = st.ReverseKeys2(d, vb);
  vd = st.ReverseKeys2(d, vd);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  // Per-vector stage.
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

// Uses ReverseKeys4; the per-vector stages are SortPairsReverse4 followed by
// SortPairsDistance1.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  // Round 1: reverse v8..vf, then Sort2 vi with v(15-i).
  v8 = st.ReverseKeys4(d, v8);
  v9 = st.ReverseKeys4(d, v9);
  va = st.ReverseKeys4(d, va);
  vb = st.ReverseKeys4(d, vb);
  vc = st.ReverseKeys4(d, vc);
  vd = st.ReverseKeys4(d, vd);
  ve = st.ReverseKeys4(d, ve);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  // Round 2: within each group of eight vectors.
  v4 = st.ReverseKeys4(d, v4);
  vc = st.ReverseKeys4(d, vc);
  v5 = st.ReverseKeys4(d, v5);
  vd = st.ReverseKeys4(d, vd);
  v6 = st.ReverseKeys4(d, v6);
  ve = st.ReverseKeys4(d, ve);
  v7 = st.ReverseKeys4(d, v7);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  // Round 3: within each group of four vectors.
  v2 = st.ReverseKeys4(d, v2);
  v3 = st.ReverseKeys4(d, v3);
  v6 = st.ReverseKeys4(d, v6);
  v7 = st.ReverseKeys4(d, v7);
  va = st.ReverseKeys4(d, va);
  vb = st.ReverseKeys4(d, vb);
  ve = st.ReverseKeys4(d, ve);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  // Round 4: within each pair of vectors.
  v1 = st.ReverseKeys4(d, v1);
  v3 = st.ReverseKeys4(d, v3);
  v5 = st.ReverseKeys4(d, v5);
  v7 = st.ReverseKeys4(d, v7);
  v9 = st.ReverseKeys4(d, v9);
  vb = st.ReverseKeys4(d, vb);
  vd = st.ReverseKeys4(d, vd);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  // Per-vector stages.
  v0 = st.SortPairsReverse4(d, v0);
  v1 = st.SortPairsReverse4(d, v1);
  v2 = st.SortPairsReverse4(d, v2);
  v3 = st.SortPairsReverse4(d, v3);
  v4 = st.SortPairsReverse4(d, v4);
  v5 = st.SortPairsReverse4(d, v5);
  v6 = st.SortPairsReverse4(d, v6);
  v7 = st.SortPairsReverse4(d, v7);
  v8 = st.SortPairsReverse4(d, v8);
  v9 = st.SortPairsReverse4(d, v9);
  va = st.SortPairsReverse4(d, va);
  vb = st.SortPairsReverse4(d, vb);
  vc = st.SortPairsReverse4(d, vc);
  vd = st.SortPairsReverse4(d, vd);
  ve = st.SortPairsReverse4(d, ve);
  vf = st.SortPairsReverse4(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

// Uses ReverseKeys8; the per-vector stages are SortPairsReverse8,
// SortPairsDistance2 and SortPairsDistance1.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  // Round 1: reverse v8..vf, then Sort2 vi with v(15-i).
  v8 = st.ReverseKeys8(d, v8);
  v9 = st.ReverseKeys8(d, v9);
  va = st.ReverseKeys8(d, va);
  vb = st.ReverseKeys8(d, vb);
  vc = st.ReverseKeys8(d, vc);
  vd = st.ReverseKeys8(d, vd);
  ve = st.ReverseKeys8(d, ve);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  // Round 2: within each group of eight vectors.
  v4 = st.ReverseKeys8(d, v4);
  vc = st.ReverseKeys8(d, vc);
  v5 = st.ReverseKeys8(d, v5);
  vd = st.ReverseKeys8(d, vd);
  v6 = st.ReverseKeys8(d, v6);
  ve = st.ReverseKeys8(d, ve);
  v7 = st.ReverseKeys8(d, v7);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  // Round 3: within each group of four vectors.
  v2 = st.ReverseKeys8(d, v2);
  v3 = st.ReverseKeys8(d, v3);
  v6 = st.ReverseKeys8(d, v6);
  v7 = st.ReverseKeys8(d, v7);
  va = st.ReverseKeys8(d, va);
  vb = st.ReverseKeys8(d, vb);
  ve = st.ReverseKeys8(d, ve);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  // Round 4: within each pair of vectors.
  v1 = st.ReverseKeys8(d, v1);
  v3 = st.ReverseKeys8(d, v3);
  v5 = st.ReverseKeys8(d, v5);
  v7 = st.ReverseKeys8(d, v7);
  v9 = st.ReverseKeys8(d, v9);
  vb = st.ReverseKeys8(d, vb);
  vd = st.ReverseKeys8(d, vd);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  // Per-vector stages.
  v0 = st.SortPairsReverse8(d, v0);
  v1 = st.SortPairsReverse8(d, v1);
  v2 = st.SortPairsReverse8(d, v2);
  v3 = st.SortPairsReverse8(d, v3);
  v4 = st.SortPairsReverse8(d, v4);
  v5 = st.SortPairsReverse8(d, v5);
  v6 = st.SortPairsReverse8(d, v6);
  v7 = st.SortPairsReverse8(d, v7);
  v8 = st.SortPairsReverse8(d, v8);
  v9 = st.SortPairsReverse8(d, v9);
  va = st.SortPairsReverse8(d, va);
  vb = st.SortPairsReverse8(d, vb);
  vc = st.SortPairsReverse8(d, vc);
  vd = st.SortPairsReverse8(d, vd);
  ve = st.SortPairsReverse8(d, ve);
  vf = st.SortPairsReverse8(d, vf);
  v0 = st.SortPairsDistance2(d, v0);
  v1 = st.SortPairsDistance2(d, v1);
  v2 = st.SortPairsDistance2(d, v2);
  v3 = st.SortPairsDistance2(d, v3);
  v4 = st.SortPairsDistance2(d, v4);
  v5 = st.SortPairsDistance2(d, v5);
  v6 = st.SortPairsDistance2(d, v6);
  v7 = st.SortPairsDistance2(d, v7);
  v8 = st.SortPairsDistance2(d, v8);
  v9 = st.SortPairsDistance2(d, v9);
  va = st.SortPairsDistance2(d, va);
  vb = st.SortPairsDistance2(d, vb);
  vc = st.SortPairsDistance2(d, vc);
  vd = st.SortPairsDistance2(d, vd);
  ve = st.SortPairsDistance2(d, ve);
  vf = st.SortPairsDistance2(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

// Unused on MSVC, see below
#if !HWY_COMPILER_MSVC

// Uses ReverseKeys16; the per-vector stages are SortPairsReverse16,
// SortPairsDistance4, SortPairsDistance2 and SortPairsDistance1.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
                        V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc,
                        V& vd, V& ve, V& vf) {
  // Round 1: reverse v8..vf, then Sort2 vi with v(15-i).
  v8 = st.ReverseKeys16(d, v8);
  v9 = st.ReverseKeys16(d, v9);
  va = st.ReverseKeys16(d, va);
  vb = st.ReverseKeys16(d, vb);
  vc = st.ReverseKeys16(d, vc);
  vd = st.ReverseKeys16(d, vd);
  ve = st.ReverseKeys16(d, ve);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  // Round 2: within each group of eight vectors.
  v4 = st.ReverseKeys16(d, v4);
  vc = st.ReverseKeys16(d, vc);
  v5 = st.ReverseKeys16(d, v5);
  vd = st.ReverseKeys16(d, vd);
  v6 = st.ReverseKeys16(d, v6);
  ve = st.ReverseKeys16(d, ve);
  v7 = st.ReverseKeys16(d, v7);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  // Round 3: within each group of four vectors.
  v2 = st.ReverseKeys16(d, v2);
  v3 = st.ReverseKeys16(d, v3);
  v6 = st.ReverseKeys16(d, v6);
  v7 = st.ReverseKeys16(d, v7);
  va = st.ReverseKeys16(d, va);
  vb = st.ReverseKeys16(d, vb);
  ve = st.ReverseKeys16(d, ve);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  // Round 4: within each pair of vectors.
  v1 = st.ReverseKeys16(d, v1);
  v3 = st.ReverseKeys16(d, v3);
  v5 = st.ReverseKeys16(d, v5);
  v7 = st.ReverseKeys16(d, v7);
  v9 = st.ReverseKeys16(d, v9);
  vb = st.ReverseKeys16(d, vb);
  vd = st.ReverseKeys16(d, vd);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  // Per-vector stages.
  v0 = st.SortPairsReverse16(d, v0);
  v1 = st.SortPairsReverse16(d, v1);
  v2 = st.SortPairsReverse16(d, v2);
  v3 = st.SortPairsReverse16(d, v3);
  v4 = st.SortPairsReverse16(d, v4);
  v5 = st.SortPairsReverse16(d, v5);
  v6 = st.SortPairsReverse16(d, v6);
  v7 = st.SortPairsReverse16(d, v7);
  v8 = st.SortPairsReverse16(d, v8);
  v9 = st.SortPairsReverse16(d, v9);
  va = st.SortPairsReverse16(d, va);
  vb = st.SortPairsReverse16(d, vb);
  vc = st.SortPairsReverse16(d, vc);
  vd = st.SortPairsReverse16(d, vd);
  ve = st.SortPairsReverse16(d, ve);
  vf = st.SortPairsReverse16(d, vf);
  v0 = st.SortPairsDistance4(d, v0);
  v1 = st.SortPairsDistance4(d, v1);
  v2 = st.SortPairsDistance4(d, v2);
  v3 = st.SortPairsDistance4(d, v3);
  v4 = st.SortPairsDistance4(d, v4);
  v5 = st.SortPairsDistance4(d, v5);
  v6 = st.SortPairsDistance4(d, v6);
  v7 = st.SortPairsDistance4(d, v7);
  v8 = st.SortPairsDistance4(d, v8);
  v9 = st.SortPairsDistance4(d, v9);
  va = st.SortPairsDistance4(d, va);
  vb = st.SortPairsDistance4(d, vb);
  vc = st.SortPairsDistance4(d, vc);
  vd = st.SortPairsDistance4(d, vd);
  ve = st.SortPairsDistance4(d, ve);
  vf = st.SortPairsDistance4(d, vf);
  v0 = st.SortPairsDistance2(d, v0);
  v1 = st.SortPairsDistance2(d, v1);
  v2 = st.SortPairsDistance2(d, v2);
  v3 = st.SortPairsDistance2(d, v3);
  v4 = st.SortPairsDistance2(d, v4);
  v5 = st.SortPairsDistance2(d, v5);
  v6 = st.SortPairsDistance2(d, v6);
  v7 = st.SortPairsDistance2(d, v7);
  v8 = st.SortPairsDistance2(d, v8);
  v9 = st.SortPairsDistance2(d, v9);
  va = st.SortPairsDistance2(d, va);
  vb = st.SortPairsDistance2(d, vb);
  vc = st.SortPairsDistance2(d, vc);
  vd = st.SortPairsDistance2(d, vd);
  ve = st.SortPairsDistance2(d, ve);
  vf = st.SortPairsDistance2(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

#endif  // !HWY_COMPILER_MSVC

// Reshapes `buf` into a matrix, sorts columns independently, and then merges
// into a sorted 1D array without transposing.
//
// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
// differences in sort order and single-lane vs 128-bit keys.
// `buf` ensures full vectors are aligned, and enables loads/stores without
// bounds checks.
// `cols` is the row stride in lanes: row i is loaded from and stored to
// `buf + i * cols`. It must not exceed Constants::kMaxCols (debug-asserted).
//
// References:
// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
template <class Traits, typename T>
HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
  const CappedTag<T, Constants::kMaxCols> d;
  using V = decltype(Zero(d));

  HWY_DASSERT(cols <= Constants::kMaxCols);

  // The network width depends on the number of keys, not lanes.
  constexpr size_t kLanesPerKey = st.LanesPerKey();
  const size_t keys = cols / kLanesPerKey;
  constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;

  // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
  // offsets to duplicating this code for every value of cols.
  static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
  V v0 = LoadU(d, buf + 0x0 * cols);
  V v1 = LoadU(d, buf + 0x1 * cols);
  V v2 = LoadU(d, buf + 0x2 * cols);
  V v3 = LoadU(d, buf + 0x3 * cols);
  V v4 = LoadU(d, buf + 0x4 * cols);
  V v5 = LoadU(d, buf + 0x5 * cols);
  V v6 = LoadU(d, buf + 0x6 * cols);
  V v7 = LoadU(d, buf + 0x7 * cols);
  V v8 = LoadU(d, buf + 0x8 * cols);
  V v9 = LoadU(d, buf + 0x9 * cols);
  V va = LoadU(d, buf + 0xa * cols);
  V vb = LoadU(d, buf + 0xb * cols);
  V vc = LoadU(d, buf + 0xc * cols);
  V vd = LoadU(d, buf + 0xd * cols);
  V ve = LoadU(d, buf + 0xe * cols);
  V vf = LoadU(d, buf + 0xf * cols);

  // Sort each column (same lane position across the 16 row vectors).
  Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);

  // Apply MergeN for N = 2, 4, 8, 16 in turn, stopping at the first N that
  // exceeds the number of keys per row.
  //
  // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
  // code paths: if MaxLanes < 2, then keys <= cols < 2.
  if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
    Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
           vf);

    if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
      Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
             vf);

      if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
        Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
               ve, vf);

        // Avoids build timeout
#if !HWY_COMPILER_MSVC
        if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
          Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
                  ve, vf);

          static_assert(Constants::kMaxCols <= 16, "Add more branches");
        }
#endif
      }
    }
  }

  // Write the rows back to the locations they were loaded from.
  StoreU(v0, d, buf + 0x0 * cols);
  StoreU(v1, d, buf + 0x1 * cols);
  StoreU(v2, d, buf + 0x2 * cols);
  StoreU(v3, d, buf + 0x3 * cols);
  StoreU(v4, d, buf + 0x4 * cols);
  StoreU(v5, d, buf + 0x5 * cols);
  StoreU(v6, d, buf + 0x6 * cols);
  StoreU(v7, d, buf + 0x7 * cols);
  StoreU(v8, d, buf + 0x8 * cols);
  StoreU(v9, d, buf + 0x9 * cols);
  StoreU(va, d, buf + 0xa * cols);
  StoreU(vb, d, buf + 0xb * cols);
  StoreU(vc, d, buf + 0xc * cols);
  StoreU(vd, d, buf + 0xd * cols);
  StoreU(ve, d, buf + 0xe * cols);
  StoreU(vf, d, buf + 0xf * cols);
}

}  // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
diff --git a/media/highway/src/hwy/contrib/sort/traits-inl.h b/media/highway/src/hwy/contrib/sort/traits-inl.h
new file mode 100644
index 0000000000..e86a2a1eee
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/traits-inl.h
@@ -0,0 +1,325 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#endif

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
#include "hwy/contrib/sort/vqsort.h"      // SortDescending
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
// Key traits for keys that occupy exactly one lane (LanesPerKey() == 1). All
// members are order-independent; they forward to the corresponding Highway
// per-lane operations.
struct KeyLane {
  constexpr size_t LanesPerKey() const { return 1; }

  // For HeapSort
  // Exchanges the values pointed to by `a` and `b`.
  template <typename T>
  HWY_INLINE void Swap(T* a, T* b) const {
    const T temp = *a;
    *a = *b;
    *b = temp;
  }

  // Broadcasts one key into a vector
  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
    return Set(d, *key);
  }

  // ReverseKeys* forward to the Highway Reverse* op of the same group size.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
    return Reverse(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
    return Reverse2(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
    return Reverse4(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
    return Reverse8(d, v);
  }

  // Implemented as a reversal of the whole vector, which the static_assert
  // limits to at most 16 columns.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
    static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
    return ReverseKeys(d, v);
  }

  template <class V>
  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
    return OddEven(odd, even);
  }

  // SwapAdjacentPairs: one overload per lane size (2, 4 or 8 bytes), selected
  // via HWY_IF_LANE_SIZE_D.
  template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
    // Reinterpret as 32-bit lanes so that each pair of 16-bit lanes is moved
    // as a unit.
    const Repartition<uint32_t, D> du32;
    return BitCast(d, Shuffle2301(BitCast(du32, v)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return Shuffle1032(v);
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return SwapAdjacentBlocks(v);
  }

  // SwapAdjacentQuads: for lanes narrower than 8 bytes, implemented as
  // SwapAdjacentPairs on lanes of twice the width.
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
    return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
    // Assumes max vector size = 512
    return ConcatLowerUpper(d, v, v);
  }

  // OddEvenPairs: for lanes narrower than 8 bytes, implemented as OddEven on
  // lanes of twice the width.
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
                                 const Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
    return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
    return OddEvenBlocks(odd, even);
  }

  // OddEvenQuads: for lanes narrower than 8 bytes, implemented as
  // OddEvenPairs on lanes of twice the width.
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
    return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
    return ConcatUpperLower(d, odd, even);
  }
};

// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
// functions with a tag type, class specializations, or separate classes.
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
// Ascending order for single-lane keys: "first" is the smaller value.
struct OrderAscending : public KeyLane {
  using Order = SortAscending;

  // Scalar comparison: returns whether *a comes before *b.
  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return *a < *b;
  }

  // Vector comparison: per-lane mask of a < b.
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(a, b);
  }

  // Two halves of Sort2, used in ScanMinMax.
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  // Reductions across all lanes of `v`; `buf` is unused here.
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  // Vectors filled with the lowest / highest representable value of the lane
  // type.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D>>());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D>>());
  }
};

// Descending order for single-lane keys: mirror image of OrderAscending, i.e.
// "first" is the larger value.
struct OrderDescending : public KeyLane {
  using Order = SortDescending;

  // Scalar comparison: returns whether *a comes before *b.
  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return *b < *a;
  }

  // Vector comparison: per-lane mask of b < a.
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(b, a);
  }

  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  // Reductions across all lanes of `v`; `buf` is unused here.
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  // Vectors filled with the highest / lowest representable value of the lane
  // type.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D>>());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D>>());
  }
};

// Shared code that depends on Order.
template <class Base>
struct TraitsLane : public Base {
  constexpr bool Is128() const { return false; }

  // For each lane i: replaces a[i] with the first and b[i] with the second
  // according to Base.
  // Corresponds to a conditional swap, which is one "node" of a sorting
  // network. Min/Max are cheaper than compare + blend at least for integers.
  template <class D>
  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
    const Base* base = static_cast<const Base*>(this);

    // `a` is overwritten first, so keep its original value for computing `b`.
    const Vec<D> a_copy = a;
    // Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4
    // instructions. We can reduce it to a compare + 2 IfThenElse.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
    if (sizeof(TFromD<D>) == 8) {
      const Mask<D> cmp = base->Compare(d, a, b);
      a = IfThenElse(cmp, a, b);
      b = IfThenElse(cmp, b, a_copy);
      return;
    }
#endif
    a = base->First(d, a, b);
    b = base->Last(d, a_copy, b);
  }

  // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
  // This overload is selected for 8-byte lanes.
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys2(d, v);
    // Further to the above optimization, Sort2+OddEvenKeys compile to four
    // instructions; we can save one by combining two blends.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
    const Vec<D> cmp = VecFromMask(d, base->Compare(d, v, swapped));
    return IfVecThenElse(DupOdd(cmp), swapped, v);
#else
    Sort2(d, v, swapped);
    return base->OddEvenKeys(swapped, v);
#endif
  }

  // (See above - we use Sort2 for non-64-bit types.)
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys2(d, v);
    Sort2(d, v, swapped);
    return base->OddEvenKeys(swapped, v);
  }

  // Swaps with the vector formed by reversing contiguous groups of 4 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys4(d, v);
    Sort2(d, v, swapped);
    return base->OddEvenPairs(d, swapped, v);
  }

  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->SwapAdjacentQuads(d, v);
    // Only used in Merge16, so this will not be used on AVX2 (which only has 4
    // u64 lanes), so skip the above optimization for 64-bit AVX2.
    Sort2(d, v, swapped);
    return base->OddEvenQuads(d, swapped, v);
  }
};

}  // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
diff --git a/media/highway/src/hwy/contrib/sort/traits128-inl.h b/media/highway/src/hwy/contrib/sort/traits128-inl.h
new file mode 100644
index 0000000000..02948d799c
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/traits128-inl.h
@@ -0,0 +1,369 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE +#endif + +#include "hwy/contrib/sort/vqsort.h" // SortDescending +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace detail { + +#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 + +struct OrderAscending128 { + using Order = SortAscending; + + template <typename T> + HWY_INLINE bool Compare1(const T* a, const T* b) { + return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1]; + } +}; + +struct OrderDescending128 { + using Order = SortDescending; + + template <typename T> + HWY_INLINE bool Compare1(const T* a, const T* b) { + return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1]; + } +}; + +template <class Order> +struct Traits128 : public Order { + constexpr bool Is128() const { return true; } + constexpr size_t LanesPerKey() const { return 2; } +}; + +#else + +// Highway does not provide a lane type for 128-bit keys, so we use uint64_t +// along with an abstraction layer for single-lane vs. lane-pair, which is +// independent of the order. 
+struct Key128 { + constexpr size_t LanesPerKey() const { return 2; } + + template <typename T> + HWY_INLINE void Swap(T* a, T* b) const { + const FixedTag<T, 2> d; + const auto temp = LoadU(d, a); + StoreU(LoadU(d, b), d, a); + StoreU(temp, d, b); + } + + template <class D> + HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const { + return LoadDup128(d, key); + } + + template <class D> + HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const { + return ReverseBlocks(d, v); + } + + template <class D> + HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const { + return SwapAdjacentBlocks(v); + } + + // Only called for 4 keys because we do not support >512-bit vectors. + template <class D> + HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const { + HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>)); + return ReverseKeys(d, v); + } + + // Only called for 4 keys because we do not support >512-bit vectors. + template <class D> + HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd, + const Vec<D> even) const { + HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>)); + return ConcatUpperLower(d, odd, even); + } + + template <class V> + HWY_INLINE V OddEvenKeys(const V odd, const V even) const { + return OddEvenBlocks(odd, even); + } + + template <class D> + HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const { + HWY_ASSERT(0); // not supported: would require 1024-bit vectors + } + + template <class D> + HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const { + HWY_ASSERT(0); // not supported: would require 2048-bit vectors + } + + // This is only called for 8/16 col networks (not supported). + template <class D> + HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const { + HWY_ASSERT(0); + } + + // This is only called for 16 col networks (not supported). + template <class D> + HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const { + HWY_ASSERT(0); + } + + // This is only called for 8 col networks (not supported). 
+ template <class D> + HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const { + HWY_ASSERT(0); + } +}; + +// Anything order-related depends on the key traits *and* the order (see +// FirstOfLanes). We cannot implement just one Compare function because Lt128 +// only compiles if the lane type is u64. Thus we need either overloaded +// functions with a tag type, class specializations, or separate classes. +// We avoid overloaded functions because we want all functions to be callable +// from a SortTraits without per-function wrappers. Specializing would work, but +// we are anyway going to specialize at a higher level. +struct OrderAscending128 : public Key128 { + using Order = SortAscending; + + template <typename T> + HWY_INLINE bool Compare1(const T* a, const T* b) { + return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1]; + } + + template <class D> + HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { + return Lt128(d, a, b); + } + + // Used by CompareTop + template <class V> + HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { + return Lt(a, b); + } + + template <class D> + HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { + return Min128(d, a, b); + } + + template <class D> + HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { + return Max128(d, a, b); + } + + template <class D> + HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, + TFromD<D>* HWY_RESTRICT buf) const { + const size_t N = Lanes(d); + Store(v, d, buf); + v = SetKey(d, buf + 0); // result must be broadcasted + for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) { + v = First(d, v, SetKey(d, buf + i)); + } + return v; + } + + template <class D> + HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, + TFromD<D>* HWY_RESTRICT buf) const { + const size_t N = Lanes(d); + Store(v, d, buf); + v = SetKey(d, buf + 0); // result must be broadcasted + for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) { + v = Last(d, v, SetKey(d, buf + i)); + } + return v; + 
} + + // Same as for regular lanes because 128-bit lanes are u64. + template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::LowestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::HighestValue<TFromD<D> >()); + } +}; + +struct OrderDescending128 : public Key128 { + using Order = SortDescending; + + template <typename T> + HWY_INLINE bool Compare1(const T* a, const T* b) { + return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1]; + } + + template <class D> + HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { + return Lt128(d, b, a); + } + + // Used by CompareTop + template <class V> + HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { + return Lt(b, a); + } + + template <class D> + HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { + return Max128(d, a, b); + } + + template <class D> + HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { + return Min128(d, a, b); + } + + template <class D> + HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, + TFromD<D>* HWY_RESTRICT buf) const { + const size_t N = Lanes(d); + Store(v, d, buf); + v = SetKey(d, buf + 0); // result must be broadcasted + for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) { + v = First(d, v, SetKey(d, buf + i)); + } + return v; + } + + template <class D> + HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, + TFromD<D>* HWY_RESTRICT buf) const { + const size_t N = Lanes(d); + Store(v, d, buf); + v = SetKey(d, buf + 0); // result must be broadcasted + for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) { + v = Last(d, v, SetKey(d, buf + i)); + } + return v; + } + + // Same as for regular lanes because 128-bit lanes are u64. 
+ template <class D> + HWY_INLINE Vec<D> FirstValue(D d) const { + return Set(d, hwy::HighestValue<TFromD<D> >()); + } + + template <class D> + HWY_INLINE Vec<D> LastValue(D d) const { + return Set(d, hwy::LowestValue<TFromD<D> >()); + } +}; + +// Shared code that depends on Order. +template <class Base> +class Traits128 : public Base { +#if HWY_TARGET <= HWY_AVX2 + // Returns vector with only the top u64 lane valid. Useful when the next step + // is to replicate the mask anyway. + template <class D> + HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const { + const Base* base = static_cast<const Base*>(this); + const Vec<D> eqHL = VecFromMask(d, Eq(a, b)); + const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b)); + const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL); + return OrAnd(ltHL, eqHL, ltLX); + } + + // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in + // the most-significant of those lanes (the result of CompareTop), so + // replicate it 4x. Only called for >= 256-bit vectors. + template <class V> + HWY_INLINE V ReplicateTop4x(V v) const { +#if HWY_TARGET <= HWY_AVX3 + return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; +#else // AVX2 + return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; +#endif + } +#endif + + public: + constexpr bool Is128() const { return true; } + + template <class D> + HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const { + const Base* base = static_cast<const Base*>(this); + + const Vec<D> a_copy = a; + const auto lt = base->Compare(d, a, b); + a = IfThenElse(lt, a, b); + b = IfThenElse(lt, b, a_copy); + } + + // Conditionally swaps even-numbered lanes with their odd-numbered neighbor. 
+ template <class D> + HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->ReverseKeys2(d, v); + +#if HWY_TARGET <= HWY_AVX2 + const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped)); + return IfVecThenElse(select, swapped, v); +#else + Sort2(d, v, swapped); + return base->OddEvenKeys(swapped, v); +#endif + } + + // Swaps with the vector formed by reversing contiguous groups of 4 keys. + template <class D> + HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const { + const Base* base = static_cast<const Base*>(this); + Vec<D> swapped = base->ReverseKeys4(d, v); + + // Only specialize for AVX3 because this requires 512-bit vectors. +#if HWY_TARGET <= HWY_AVX3 + const Vec512<uint64_t> outHx = CompareTop(d, v, swapped); + // Similar to ReplicateTop4x, we want to gang together 2 comparison results + // (4 lanes). They are not contiguous, so use permute to replicate 4x. + alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7}; + const Vec512<uint64_t> select = + TableLookupLanes(outHx, SetTableIndices(d, kIndices)); + return IfVecThenElse(select, swapped, v); +#else + Sort2(d, v, swapped); + return base->OddEvenPairs(d, swapped, v); +#endif + } + + // Conditionally swaps lane 0 with 4, 1 with 5 etc. + template <class D> + HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const { + // Only used by Merge16, which would require 2048 bit vectors (unsupported). 
+ HWY_ASSERT(0); + } +}; + +#endif // HWY_TARGET + +} // namespace detail +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE diff --git a/media/highway/src/hwy/contrib/sort/vqsort-inl.h b/media/highway/src/hwy/contrib/sort/vqsort-inl.h new file mode 100644 index 0000000000..50b4d16f0c --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort-inl.h @@ -0,0 +1,757 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Normal include guard for target-independent parts +#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_ + +// Makes it harder for adversaries to predict our sampling locations, at the +// cost of 1-2% increased runtime. 
+#ifndef VQSORT_SECURE_RNG +#define VQSORT_SECURE_RNG 0 +#endif + +#if VQSORT_SECURE_RNG +#include "third_party/absl/random/random.h" +#endif + +#include <string.h> // memcpy + +#include "hwy/cache_control.h" // Prefetch +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" // Fill24Bytes + +#if HWY_IS_MSAN +#include <sanitizer/msan_interface.h> +#endif + +#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_ + +// Per-target +#if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) +#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE +#undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE +#else +#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE +#endif + +#include "hwy/contrib/sort/shared-inl.h" +#include "hwy/contrib/sort/sorting_networks-inl.h" +#include "hwy/highway.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +namespace detail { + +#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 + +template <typename T> +void Swap(T* a, T* b) { + T t = *a; + *a = *b; + *b = t; +} + +// Scalar version of HeapSort (see below) +template <class Traits, typename T> +void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) { + if (num < 2) return; + + // Build heap. + for (size_t i = 1; i < num; i += 1) { + size_t j = i; + while (j != 0) { + const size_t idx_parent = ((j - 1) / 1 / 2); + if (!st.Compare1(keys + idx_parent, keys + j)) { + break; + } + Swap(keys + j, keys + idx_parent); + j = idx_parent; + } + } + + for (size_t i = num - 1; i != 0; i -= 1) { + // Swap root with last + Swap(keys + 0, keys + i); + + // Sift down the new root. 
+ size_t j = 0; + while (j < i) { + const size_t left = 2 * j + 1; + const size_t right = 2 * j + 2; + if (left >= i) break; + size_t idx_larger = j; + if (st.Compare1(keys + j, keys + left)) { + idx_larger = left; + } + if (right < i && st.Compare1(keys + idx_larger, keys + right)) { + idx_larger = right; + } + if (idx_larger == j) break; + Swap(keys + j, keys + idx_larger); + j = idx_larger; + } + } +} + +#else + +using Constants = hwy::SortConstants; + +// ------------------------------ HeapSort + +// Heapsort: O(1) space, O(N*logN) worst-case comparisons. +// Based on LLVM sanitizer_common.h, licensed under Apache-2.0. +template <class Traits, typename T> +void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) { + constexpr size_t N1 = st.LanesPerKey(); + const FixedTag<T, N1> d; + + if (num < 2 * N1) return; + + // Build heap. + for (size_t i = N1; i < num; i += N1) { + size_t j = i; + while (j != 0) { + const size_t idx_parent = ((j - N1) / N1 / 2) * N1; + if (AllFalse(d, st.Compare(d, st.SetKey(d, keys + idx_parent), + st.SetKey(d, keys + j)))) { + break; + } + st.Swap(keys + j, keys + idx_parent); + j = idx_parent; + } + } + + for (size_t i = num - N1; i != 0; i -= N1) { + // Swap root with last + st.Swap(keys + 0, keys + i); + + // Sift down the new root. + size_t j = 0; + while (j < i) { + const size_t left = 2 * j + N1; + const size_t right = 2 * j + 2 * N1; + if (left >= i) break; + size_t idx_larger = j; + const auto key_j = st.SetKey(d, keys + j); + if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, keys + left)))) { + idx_larger = left; + } + if (right < i && AllTrue(d, st.Compare(d, st.SetKey(d, keys + idx_larger), + st.SetKey(d, keys + right)))) { + idx_larger = right; + } + if (idx_larger == j) break; + st.Swap(keys + j, keys + idx_larger); + j = idx_larger; + } + } +} + +// ------------------------------ BaseCase + +// Sorts `keys` within the range [0, num) via sorting network. 
+template <class D, class Traits, typename T> +HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num, + T* HWY_RESTRICT buf) { + const size_t N = Lanes(d); + using V = decltype(Zero(d)); + + // _Nonzero32 requires num - 1 != 0. + if (HWY_UNLIKELY(num <= 1)) return; + + // Reshape into a matrix with kMaxRows rows, and columns limited by the + // 1D `num`, which is upper-bounded by the vector width (see BaseCaseNum). + const size_t num_pow2 = size_t{1} + << (32 - Num0BitsAboveMS1Bit_Nonzero32( + static_cast<uint32_t>(num - 1))); + HWY_DASSERT(num <= num_pow2 && num_pow2 <= Constants::BaseCaseNum(N)); + const size_t cols = + HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2); + HWY_DASSERT(cols <= N); + + // Copy `keys` to `buf`. + size_t i; + for (i = 0; i + N <= num; i += N) { + Store(LoadU(d, keys + i), d, buf + i); + } + SafeCopyN(num - i, d, keys + i, buf + i); + i = num; + + // Fill with padding - last in sort order, not copied to keys. + const V kPadding = st.LastValue(d); + // Initialize an extra vector because SortingNetwork loads full vectors, + // which may exceed cols*kMaxRows. + for (; i < (cols * Constants::kMaxRows + N); i += N) { + StoreU(kPadding, d, buf + i); + } + + SortingNetwork(st, buf, cols); + + for (i = 0; i + N <= num; i += N) { + StoreU(Load(d, buf + i), d, keys + i); + } + SafeCopyN(num - i, d, buf + i, keys + i); +} + +// ------------------------------ Partition + +// Consumes from `left` until a multiple of kUnroll*N remains. +// Temporarily stores the right side into `buf`, then moves behind `right`. 
+template <class D, class Traits, class T> +HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st, + T* HWY_RESTRICT keys, + size_t& left, size_t& right, + const Vec<D> pivot, + T* HWY_RESTRICT buf) { + constexpr size_t kUnroll = Constants::kPartitionUnroll; + const size_t N = Lanes(d); + size_t readL = left; + size_t bufR = 0; + const size_t num = right - left; + // Partition requires both a multiple of kUnroll*N and at least + // 2*kUnroll*N for the initial loads. If less, consume all here. + const size_t num_rem = + (num < 2 * kUnroll * N) ? num : (num & (kUnroll * N - 1)); + size_t i = 0; + for (; i + N <= num_rem; i += N) { + const Vec<D> vL = LoadU(d, keys + readL); + readL += N; + + const auto comp = st.Compare(d, pivot, vL); + left += CompressBlendedStore(vL, Not(comp), d, keys + left); + bufR += CompressStore(vL, comp, d, buf + bufR); + } + // Last iteration: only use valid lanes. + if (HWY_LIKELY(i != num_rem)) { + const auto mask = FirstN(d, num_rem - i); + const Vec<D> vL = LoadU(d, keys + readL); + + const auto comp = st.Compare(d, pivot, vL); + left += CompressBlendedStore(vL, AndNot(comp, mask), d, keys + left); + bufR += CompressStore(vL, And(comp, mask), d, buf + bufR); + } + + // MSAN seems not to understand CompressStore. buf[0, bufR) are valid. +#if HWY_IS_MSAN + __msan_unpoison(buf, bufR * sizeof(T)); +#endif + + // Everything we loaded was put into buf, or behind the new `left`, after + // which there is space for bufR items. First move items from `right` to + // `left` to free up space, then copy `buf` into the vacated `right`. + // A loop with masked loads from `buf` is insufficient - we would also need to + // mask from `right`. Combining a loop with memcpy for the remainders is + // slower than just memcpy, so we use that for simplicity. 
+ right -= bufR; + memcpy(keys + left, keys + right, bufR * sizeof(T)); + memcpy(keys + right, buf, bufR * sizeof(T)); +} + +template <class D, class Traits, typename T> +HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v, + const Vec<D> pivot, T* HWY_RESTRICT keys, + size_t& writeL, size_t& writeR) { + const size_t N = Lanes(d); + + const auto comp = st.Compare(d, pivot, v); + + if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) { + // Non-native Compress (e.g. AVX2): we are able to partition a vector using + // a single Compress+two StoreU instead of two Compress[Blended]Store. The + // latter are more expensive. Because we store entire vectors, the contents + // between the updated writeL and writeR are ignored and will be overwritten + // by subsequent calls. This works because writeL and writeR are at least + // two vectors apart. + const auto mask = Not(comp); + const auto lr = Compress(v, mask); + const size_t num_left = CountTrue(d, mask); + StoreU(lr, d, keys + writeL); + writeL += num_left; + // Now write the right-side elements (if any), such that the previous writeR + // is one past the end of the newly written right elements, then advance. + StoreU(lr, d, keys + writeR - N); + writeR -= (N - num_left); + } else { + // Native Compress[Store] (e.g. AVX3), which only keep the left or right + // side, not both, hence we require two calls. 
+ const size_t num_left = CompressStore(v, Not(comp), d, keys + writeL); + writeL += num_left; + + writeR -= (N - num_left); + (void)CompressBlendedStore(v, comp, d, keys + writeR); + } +} + +template <class D, class Traits, typename T> +HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0, + const Vec<D> v1, const Vec<D> v2, + const Vec<D> v3, const Vec<D> pivot, + T* HWY_RESTRICT keys, size_t& writeL, + size_t& writeR) { + StoreLeftRight(d, st, v0, pivot, keys, writeL, writeR); + StoreLeftRight(d, st, v1, pivot, keys, writeL, writeR); + StoreLeftRight(d, st, v2, pivot, keys, writeL, writeR); + StoreLeftRight(d, st, v3, pivot, keys, writeL, writeR); +} + +// Moves "<= pivot" keys to the front, and others to the back. pivot is +// broadcasted. Time-critical! +// +// Aligned loads do not seem to be worthwhile (not bottlenecked by load ports). +template <class D, class Traits, typename T> +HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left, + size_t right, const Vec<D> pivot, + T* HWY_RESTRICT buf) { + using V = decltype(Zero(d)); + const size_t N = Lanes(d); + + // StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all + // lanes happen to be in the right-side partition, this will overrun `keys`, + // which triggers asan errors. Avoid by special-casing the last vector. + HWY_DASSERT(right - left > 2 * N); // ensured by HandleSpecialCases + right -= N; + const size_t last = right; + const V vlast = LoadU(d, keys + last); + + PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf); + constexpr size_t kUnroll = Constants::kPartitionUnroll; + + // Invariant: [left, writeL) and [writeR, right) are already partitioned. + size_t writeL = left; + size_t writeR = right; + + const size_t num = right - left; + // Cannot load if there were fewer than 2 * kUnroll * N. 
+ if (HWY_LIKELY(num != 0)) { + HWY_DASSERT(num >= 2 * kUnroll * N); + HWY_DASSERT((num & (kUnroll * N - 1)) == 0); + + // Make space for writing in-place by reading from left and right. + const V vL0 = LoadU(d, keys + left + 0 * N); + const V vL1 = LoadU(d, keys + left + 1 * N); + const V vL2 = LoadU(d, keys + left + 2 * N); + const V vL3 = LoadU(d, keys + left + 3 * N); + left += kUnroll * N; + right -= kUnroll * N; + const V vR0 = LoadU(d, keys + right + 0 * N); + const V vR1 = LoadU(d, keys + right + 1 * N); + const V vR2 = LoadU(d, keys + right + 2 * N); + const V vR3 = LoadU(d, keys + right + 3 * N); + + // The left/right updates may consume all inputs, so check before the loop. + while (left != right) { + V v0, v1, v2, v3; + + // Free up capacity for writing by loading from the side that has less. + // Data-dependent but branching is faster than forcing branch-free. + const size_t capacityL = left - writeL; + const size_t capacityR = writeR - right; + HWY_DASSERT(capacityL <= num && capacityR <= num); // >= 0 + if (capacityR < capacityL) { + right -= kUnroll * N; + v0 = LoadU(d, keys + right + 0 * N); + v1 = LoadU(d, keys + right + 1 * N); + v2 = LoadU(d, keys + right + 2 * N); + v3 = LoadU(d, keys + right + 3 * N); + hwy::Prefetch(keys + right - 3 * kUnroll * N); + } else { + v0 = LoadU(d, keys + left + 0 * N); + v1 = LoadU(d, keys + left + 1 * N); + v2 = LoadU(d, keys + left + 2 * N); + v3 = LoadU(d, keys + left + 3 * N); + left += kUnroll * N; + hwy::Prefetch(keys + left + 3 * kUnroll * N); + } + + StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, writeR); + } + + // Now finish writing the initial left/right to the middle. + StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, writeR); + StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, writeR); + } + + // We have partitioned [left, right) such that writeL is the boundary. 
+ HWY_DASSERT(writeL == writeR); + // Make space for inserting vlast: move up to N of the first right-side keys + // into the unused space starting at last. If we have fewer, ensure they are + // the last items in that vector by subtracting from the *load* address, + // which is safe because we have at least two vectors (checked above). + const size_t totalR = last - writeL; + const size_t startR = totalR < N ? writeL + totalR - N : writeL; + StoreU(LoadU(d, keys + startR), d, keys + last); + + // Partition vlast: write L, then R, into the single-vector gap at writeL. + const auto comp = st.Compare(d, pivot, vlast); + writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL); + (void)CompressBlendedStore(vlast, comp, d, keys + writeL); + + return writeL; +} + +// ------------------------------ Pivot + +template <class Traits, class V> +HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) { + const DFromV<V> d; + // Slightly faster for 128-bit, apparently because not serially dependent. + if (st.Is128()) { + // Median = XOR-sum 'minus' the first and last. Calling First twice is + // slightly faster than Compare + 2 IfThenElse or even IfThenElse + XOR. + const auto sum = Xor(Xor(v0, v1), v2); + const auto first = st.First(d, st.First(d, v0, v1), v2); + const auto last = st.Last(d, st.Last(d, v0, v1), v2); + return Xor(Xor(sum, first), last); + } + st.Sort2(d, v0, v2); + v1 = st.Last(d, v0, v1); + v1 = st.First(d, v1, v2); + return v1; +} + +// Replaces triplets with their median and recurses until less than 3 keys +// remain. Ignores leftover values (non-whole triplets)! 
+template <class D, class Traits, typename T> +Vec<D> RecursiveMedianOf3(D d, Traits st, T* HWY_RESTRICT keys, size_t num, + T* HWY_RESTRICT buf) { + const size_t N = Lanes(d); + constexpr size_t N1 = st.LanesPerKey(); + + if (num < 3 * N1) return st.SetKey(d, keys); + + size_t read = 0; + size_t written = 0; + + // Triplets of vectors + for (; read + 3 * N <= num; read += 3 * N) { + const auto v0 = Load(d, keys + read + 0 * N); + const auto v1 = Load(d, keys + read + 1 * N); + const auto v2 = Load(d, keys + read + 2 * N); + Store(MedianOf3(st, v0, v1, v2), d, buf + written); + written += N; + } + + // Triplets of keys + for (; read + 3 * N1 <= num; read += 3 * N1) { + const auto v0 = st.SetKey(d, keys + read + 0 * N1); + const auto v1 = st.SetKey(d, keys + read + 1 * N1); + const auto v2 = st.SetKey(d, keys + read + 2 * N1); + StoreU(MedianOf3(st, v0, v1, v2), d, buf + written); + written += N1; + } + + // Tail recursion; swap buffers + return RecursiveMedianOf3(d, st, buf, written, keys); +} + +#if VQSORT_SECURE_RNG +using Generator = absl::BitGen; +#else +// Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028 +#pragma pack(push, 1) +class Generator { + public: + Generator(const void* heap, size_t num) { + Sorter::Fill24Bytes(heap, num, &a_); + k_ = 1; // stream index: must be odd + } + + explicit Generator(uint64_t seed) { + a_ = b_ = w_ = seed; + k_ = 1; + } + + uint64_t operator()() { + const uint64_t b = b_; + w_ += k_; + const uint64_t next = a_ ^ w_; + a_ = (b + (b << 3)) ^ (b >> 11); + const uint64_t rot = (b << 24) | (b >> 40); + b_ = rot + next; + return next; + } + + private: + uint64_t a_; + uint64_t b_; + uint64_t w_; + uint64_t k_; // increment +}; +#pragma pack(pop) + +#endif // !VQSORT_SECURE_RNG + +// Returns slightly biased random index of a chunk in [0, num_chunks). +// See https://www.pcg-random.org/posts/bounded-rands.html. 
+HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) { + const uint64_t chunk_index = (static_cast<uint64_t>(bits) * num_chunks) >> 32; + HWY_DASSERT(chunk_index < num_chunks); + return static_cast<size_t>(chunk_index); +} + +template <class D, class Traits, typename T> +HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys, + const size_t begin, const size_t end, + T* HWY_RESTRICT buf, Generator& rng) { + using V = decltype(Zero(d)); + const size_t N = Lanes(d); + + // Power of two + const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N); + + keys += begin; + size_t num = end - begin; + + // Align start of keys to chunks. We always have at least 2 chunks because the + // base case would have handled anything up to 16 vectors, i.e. >= 4 chunks. + HWY_DASSERT(num >= 2 * lanes_per_chunk); + const size_t misalign = + (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (lanes_per_chunk - 1); + if (misalign != 0) { + const size_t consume = lanes_per_chunk - misalign; + keys += consume; + num -= consume; + } + + // Generate enough random bits for 9 uint32 + uint64_t* bits64 = reinterpret_cast<uint64_t*>(buf); + for (size_t i = 0; i < 5; ++i) { + bits64[i] = rng(); + } + const uint32_t* bits = reinterpret_cast<const uint32_t*>(buf); + + const uint32_t lpc32 = static_cast<uint32_t>(lanes_per_chunk); + // Avoid division + const size_t log2_lpc = Num0BitsBelowLS1Bit_Nonzero32(lpc32); + const size_t num_chunks64 = num >> log2_lpc; + // Clamp to uint32 for RandomChunkIndex + const uint32_t num_chunks = + static_cast<uint32_t>(HWY_MIN(num_chunks64, 0xFFFFFFFFull)); + + const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) << log2_lpc; + const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) << log2_lpc; + const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) << log2_lpc; + const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) << log2_lpc; + const size_t offset4 = RandomChunkIndex(num_chunks, 
bits[4]) << log2_lpc; + const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) << log2_lpc; + const size_t offset6 = RandomChunkIndex(num_chunks, bits[6]) << log2_lpc; + const size_t offset7 = RandomChunkIndex(num_chunks, bits[7]) << log2_lpc; + const size_t offset8 = RandomChunkIndex(num_chunks, bits[8]) << log2_lpc; + for (size_t i = 0; i < lanes_per_chunk; i += N) { + const V v0 = Load(d, keys + offset0 + i); + const V v1 = Load(d, keys + offset1 + i); + const V v2 = Load(d, keys + offset2 + i); + const V medians0 = MedianOf3(st, v0, v1, v2); + Store(medians0, d, buf + i); + + const V v3 = Load(d, keys + offset3 + i); + const V v4 = Load(d, keys + offset4 + i); + const V v5 = Load(d, keys + offset5 + i); + const V medians1 = MedianOf3(st, v3, v4, v5); + Store(medians1, d, buf + i + lanes_per_chunk); + + const V v6 = Load(d, keys + offset6 + i); + const V v7 = Load(d, keys + offset7 + i); + const V v8 = Load(d, keys + offset8 + i); + const V medians2 = MedianOf3(st, v6, v7, v8); + Store(medians2, d, buf + i + lanes_per_chunk * 2); + } + + return RecursiveMedianOf3(d, st, buf, 3 * lanes_per_chunk, + buf + 3 * lanes_per_chunk); +} + +// Compute exact min/max to detect all-equal partitions. Only called after a +// degenerate Partition (none in the right partition). 
// `first` and `last` are outputs: they accumulate st.First / st.Last over every
// vector of keys[0, num) and are finally reduced across lanes with
// st.FirstOfLanes / st.LastOfLanes, to which `buf` is passed. Unless `num` is
// zero or a multiple of Lanes(d), `num` must be at least Lanes(d), because the
// tail is covered by re-reading the last full vector.
template <class D, class Traits, typename T>
HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
                             size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
                             Vec<D>& last) {
  const size_t N = Lanes(d);

  // Start from the opposite extremes so that any key replaces them.
  first = st.LastValue(d);
  last = st.FirstValue(d);

  size_t i = 0;
  for (; i + N <= num; i += N) {
    const Vec<D> v = LoadU(d, keys + i);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  }
  if (HWY_LIKELY(i != num)) {
    HWY_DASSERT(num >= N);  // See HandleSpecialCases
    // Overlapping load of the final N keys; revisiting keys is harmless here.
    const Vec<D> v = LoadU(d, keys + num - N);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  }

  first = st.FirstOfLanes(d, first, buf);
  last = st.LastOfLanes(d, last, buf);
}

// Sorts keys[begin, end) by partitioning around `pivot` and then handling each
// side: sides with at most Constants::BaseCaseNum(Lanes(d)) keys go to BaseCase,
// larger sides get a new pivot from ChoosePivot and recurse. `remaining_levels`
// is decremented on every recursive call; once it reaches zero the range is
// handed to HeapSort. `buf` and `rng` are forwarded to the helpers.
template <class D, class Traits, typename T>
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
             const size_t end, const Vec<D> pivot, T* HWY_RESTRICT buf,
             Generator& rng, size_t remaining_levels) {
  HWY_DASSERT(begin + 1 < end);
  const size_t num = end - begin;  // >= 2

  // Too many degenerate partitions. This is extremely unlikely to happen
  // because we select pivots from large (though still O(1)) samples.
  if (HWY_UNLIKELY(remaining_levels == 0)) {
    HeapSort(st, keys + begin, num);  // Slow but N*logN.
    return;
  }

  const ptrdiff_t base_case_num =
      static_cast<ptrdiff_t>(Constants::BaseCaseNum(Lanes(d)));
  // `bound` is the index separating the two partitions: [begin, bound) and
  // [bound, end).
  const size_t bound = Partition(d, st, keys, begin, end, pivot, buf);

  const ptrdiff_t num_left =
      static_cast<ptrdiff_t>(bound) - static_cast<ptrdiff_t>(begin);
  const ptrdiff_t num_right =
      static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound);

  // Check for degenerate partitions (i.e. Partition did not move any keys):
  if (HWY_UNLIKELY(num_right == 0)) {
    // Because the pivot is one of the keys, it must have been equal to the
    // first or last key in sort order. Scan for the actual min/max:
    // passing the current pivot as the new bound is insufficient because one of
    // the partitions might not actually include that key.
    Vec<D> first, last;
    ScanMinMax(d, st, keys + begin, num, buf, first, last);
    // All keys equal: nothing left to sort.
    if (AllTrue(d, Eq(first, last))) return;

    // Separate recursion to make sure that we don't pick `last` as the
    // pivot - that would again lead to a degenerate partition.
    Recurse(d, st, keys, begin, end, first, buf, rng, remaining_levels - 1);
    return;
  }

  // Left side: [begin, bound).
  if (HWY_UNLIKELY(num_left <= base_case_num)) {
    BaseCase(d, st, keys + begin, static_cast<size_t>(num_left), buf);
  } else {
    const Vec<D> next_pivot = ChoosePivot(d, st, keys, begin, bound, buf, rng);
    Recurse(d, st, keys, begin, bound, next_pivot, buf, rng,
            remaining_levels - 1);
  }
  // Right side: [bound, end).
  if (HWY_UNLIKELY(num_right <= base_case_num)) {
    BaseCase(d, st, keys + bound, static_cast<size_t>(num_right), buf);
  } else {
    const Vec<D> next_pivot = ChoosePivot(d, st, keys, bound, end, buf, rng);
    Recurse(d, st, keys, bound, end, next_pivot, buf, rng,
            remaining_levels - 1);
  }
}

// Returns true if sorting is finished.
// Two cases are handled here: vector configurations the main algorithm does
// not support (sorted with HeapSort) and inputs of at most
// Constants::BaseCaseNum(Lanes(d)) keys (sorted with BaseCase, using `buf`).
template <class D, class Traits, typename T>
bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
                        T* HWY_RESTRICT buf) {
  const size_t N = Lanes(d);
  const size_t base_case_num = Constants::BaseCaseNum(N);

  // 128-bit keys require vectors with at least two u64 lanes, which is always
  // the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the
  // hardware vector width is less than 128bit / fraction.
  const bool partial_128 = N < 2 && st.Is128();
  // Partition assumes its input is at least two vectors. If vectors are huge,
  // base_case_num may actually be smaller. If so, which is only possible on
  // RVV, pass a capped or partial d (LMUL < 1). Use HWY_MAX_BYTES instead of
  // HWY_LANES to account for the largest possible LMUL.
  constexpr bool kPotentiallyHuge =
      HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
  const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
  if (partial_128 || huge_vec) {
    // PERFORMANCE WARNING: falling back to HeapSort.
    HeapSort(st, keys, num);
    return true;
  }

  // Small arrays: use sorting network, no need for other checks.
  if (HWY_UNLIKELY(num <= base_case_num)) {
    BaseCase(d, st, keys, num, buf);
    return true;
  }

  // We could also check for already sorted/reverse/equal, but that's probably
  // counterproductive if vqsort is used as a base case.

  return false;  // not finished sorting
}

#endif  // HWY_TARGET
}  // namespace detail

// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
// In-place i.e. O(1) additional storage. Worst-case N*logN comparisons.
// Non-stable (order of equal keys may change), except for the common case where
// the upper bits of T are the key, and the lower bits are a sequential or at
// least unique ID.
// There is no upper limit on `num`, but note that pivots may be chosen by
// sampling only from the first 256 GiB.
//
// `d` is typically SortTag<T> (chooses between full and partial vectors).
// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
// differences in sort order and single-lane vs 128-bit keys.
// `buf` is the scratch buffer used by the helpers. It is unused on the
// HWY_SCALAR / HWY_EMU128 targets and replaced by stack storage when
// HWY_HAVE_SCALABLE is 0.
template <class D, class Traits, typename T>
void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
          T* HWY_RESTRICT buf) {
#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
  (void)d;
  (void)buf;
  // PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
  return detail::HeapSort(st, keys, num);
#else
#if !HWY_HAVE_SCALABLE
  // On targets with fixed-size vectors, avoid _using_ the allocated memory.
  // We avoid (potentially expensive for small input sizes) allocations on
  // platforms where no targets are scalable. For 512-bit vectors, this fits on
  // the stack (several KiB).
  HWY_ALIGN T storage[SortConstants::BufNum<T>(HWY_LANES(T))] = {};
  static_assert(sizeof(storage) <= 8192, "Unexpectedly large, check size");
  // From here on the caller-provided `buf` is ignored.
  buf = storage;
#endif  // !HWY_HAVE_SCALABLE

  if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;

#if HWY_MAX_BYTES > 64
  // sorting_networks-inl and traits assume no more than 512 bit vectors.
  if (Lanes(d) > 64 / sizeof(T)) {
    // Restart with a tag capped to 64 bytes worth of lanes.
    return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
  }
#endif  // HWY_MAX_BYTES > 64

  // Pulled out of the recursion so we can special-case degenerate partitions.
  detail::Generator rng(keys, num);
  const Vec<D> pivot = detail::ChoosePivot(d, st, keys, 0, num, buf, rng);

  // Introspection: switch to worst-case N*logN heapsort after this many.
  const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;

  detail::Recurse(d, st, keys, 0, num, pivot, buf, rng, max_levels);
#endif  // HWY_TARGET
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort.cc b/media/highway/src/hwy/contrib/sort/vqsort.cc new file mode 100644 index 0000000000..95117d8a58 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort.cc @@ -0,0 +1,182 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +#include "hwy/contrib/sort/vqsort.h" + +#include <string.h> // memset + +#include "hwy/aligned_allocator.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/shared-inl.h" + +// Architectures for which we know HWY_HAVE_SCALABLE == 0. This opts into an +// optimization that replaces dynamic allocation with stack storage. +#ifndef VQSORT_STACK +#if HWY_ARCH_X86 || HWY_ARCH_WASM +#define VQSORT_STACK 1 +#else +#define VQSORT_STACK 0 +#endif +#endif // VQSORT_STACK + +// Check if we have sys/random.h. First skip some systems on which the check +// itself (features.h) might be problematic. +#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV +#define VQSORT_GETRANDOM 0 +#endif + +#if !defined(VQSORT_GETRANDOM) && (defined(linux) || defined(__linux__)) +#include <features.h> + +// ---- which libc +#if defined(__UCLIBC__) +#define VQSORT_GETRANDOM 1 // added Mar 2015, before uclibc-ng 1.0 + +#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 25) +#define VQSORT_GETRANDOM 1 +#else +#define VQSORT_GETRANDOM 0 +#endif + +#else +// Assume MUSL, which has getrandom since 2018. There is no macro to test, see +// https://www.openwall.com/lists/musl/2013/03/29/13. 
+#define VQSORT_GETRANDOM 1 + +#endif // ---- which libc +#endif // linux + +#if !defined(VQSORT_GETRANDOM) +#define VQSORT_GETRANDOM 0 +#endif + +// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom +// (not all Android support the getrandom wrapper) +#ifndef VQSORT_SECURE_SEED + +#if VQSORT_GETRANDOM +#define VQSORT_SECURE_SEED 1 +#elif defined(_WIN32) || defined(_WIN64) +#define VQSORT_SECURE_SEED 2 +#else +#define VQSORT_SECURE_SEED 0 +#endif + +#endif // VQSORT_SECURE_SEED + +#if !VQSORT_SECURE_RNG + +#include <time.h> +#if VQSORT_SECURE_SEED == 1 +#include <sys/random.h> +#elif VQSORT_SECURE_SEED == 2 +#include <windows.h> +#pragma comment(lib, "advapi32.lib") +// Must come after windows.h. +#include <wincrypt.h> +#endif // VQSORT_SECURE_SEED + +#endif // !VQSORT_SECURE_RNG + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { + +size_t VectorSize() { return Lanes(ScalableTag<uint8_t, 3>()); } +bool HaveFloat64() { return HWY_HAVE_FLOAT64; } + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(VectorSize); +HWY_EXPORT(HaveFloat64); + +} // namespace + +Sorter::Sorter() { +#if VQSORT_STACK + ptr_ = nullptr; // Sort will use stack storage instead +#else + // Determine the largest buffer size required for any type by trying them all. + // (The capping of N in BaseCaseNum means that smaller N but larger sizeof_t + // may require a larger buffer.) + const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)(); + const size_t max_bytes = + HWY_MAX(HWY_MAX(SortConstants::BufBytes<uint16_t>(vector_size), + SortConstants::BufBytes<uint32_t>(vector_size)), + SortConstants::BufBytes<uint64_t>(vector_size)); + ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr); + + // Prevent msan errors by initializing. 
+ memset(ptr_, 0, max_bytes); +#endif +} + +void Sorter::Delete() { +#if !VQSORT_STACK + FreeAlignedBytes(ptr_, nullptr, nullptr); + ptr_ = nullptr; +#endif +} + +#if !VQSORT_SECURE_RNG + +void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) { +#if VQSORT_SECURE_SEED == 1 + // May block if urandom is not yet initialized. + const ssize_t ret = getrandom(bytes, 24, /*flags=*/0); + if (ret == 24) return; +#elif VQSORT_SECURE_SEED == 2 + HCRYPTPROV hProvider{}; + if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL, + CRYPT_VERIFYCONTEXT)) { + const BOOL ok = + CryptGenRandom(hProvider, 24, reinterpret_cast<BYTE*>(bytes)); + CryptReleaseContext(hProvider, 0); + if (ok) return; + } +#endif + + // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy from + // stack/heap/code addresses and the clock() timer. + uint64_t* words = reinterpret_cast<uint64_t*>(bytes); + uint64_t** seed_stack = &words; + void (*seed_code)(const void*, size_t, void*) = &Fill24Bytes; + const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack); + const uintptr_t bits_heap = reinterpret_cast<uintptr_t>(seed_heap); + const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code); + const uint64_t bits_time = static_cast<uint64_t>(clock()); + words[0] = bits_stack ^ bits_time ^ seed_num; + words[1] = bits_heap ^ bits_time ^ seed_num; + words[2] = bits_code ^ bits_time ^ seed_num; +} + +#endif // !VQSORT_SECURE_RNG + +bool Sorter::HaveFloat64() { return HWY_DYNAMIC_DISPATCH(HaveFloat64)(); } + +} // namespace hwy +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/vqsort.h b/media/highway/src/hwy/contrib/sort/vqsort.h new file mode 100644 index 0000000000..df1afb07db --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort.h @@ -0,0 +1,106 @@ +// Copyright 2022 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this 
file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Interface to vectorized quicksort with dynamic dispatch. Measurements and +// detailed description: https://arxiv.org/abs/2205.05982 . + +#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ +#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ + +#include "hwy/base.h" + +namespace hwy { + +// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it: +// https://reviews.llvm.org/D86310 +#pragma pack(push, 1) +struct alignas(16) uint128_t { + uint64_t lo; // little-endian layout + uint64_t hi; +}; +#pragma pack(pop) + +// Tag arguments that determine the sort order. +struct SortAscending { + constexpr bool IsAscending() const { return true; } +}; +struct SortDescending { + constexpr bool IsAscending() const { return false; } +}; + +// Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h. +// This allows amortizing the allocation over multiple sorts. +class HWY_CONTRIB_DLLEXPORT Sorter { + public: + Sorter(); + ~Sorter() { Delete(); } + + // Move-only + Sorter(const Sorter&) = delete; + Sorter& operator=(const Sorter&) = delete; + Sorter(Sorter&& other) { + Delete(); + ptr_ = other.ptr_; + other.ptr_ = nullptr; + } + Sorter& operator=(Sorter&& other) { + Delete(); + ptr_ = other.ptr_; + other.ptr_ = nullptr; + return *this; + } + + // Sorts keys[0, n). Dispatches to the best available instruction set, + // and does not allocate memory. 
+ void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + + void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + + void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const; + void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const; + + void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const; + void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const; + + // For internal use only + static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes); + static bool HaveFloat64(); + + private: + void Delete(); + + template <typename T> + T* Get() const { + return static_cast<T*>(ptr_); + } + + void* ptr_ = nullptr; +}; + +} // namespace hwy + +#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ diff --git a/media/highway/src/hwy/contrib/sort/vqsort_128a.cc b/media/highway/src/hwy/contrib/sort/vqsort_128a.cc new file mode 100644 index 0000000000..5ce2057f70 --- /dev/null +++ 
b/media/highway/src/hwy/contrib/sort/vqsort_128a.cc @@ -0,0 +1,56 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Ascending sort of 128-bit keys: per-target implementation plus the
// dynamically dispatched Sorter::operator() overload.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

// HWY_TARGET_INCLUDE names this file itself; per the Highway foreach_target.h
// convention it is re-included once per enabled target, so the include order
// here must not be changed.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Sorts `num` uint64_t lanes in `keys`, interpreted by Traits128 as 128-bit
// keys, in ascending order. `buf` is forwarded to Sort() as its buffer.
void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Presumably true for only one of the per-target passes, so that the code
// below is defined once - confirm against hwy/foreach_target.h.
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Asc);
}  // namespace

// Each uint128_t key is viewed as two uint64_t lanes, hence the count n * 2.
void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(Sort128Asc)
  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_128d.cc b/media/highway/src/hwy/contrib/sort/vqsort_128d.cc new file mode 100644 index 0000000000..7218e1c4d2 --- /dev/null +++
b/media/highway/src/hwy/contrib/sort/vqsort_128d.cc @@ -0,0 +1,56 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Descending sort of 128-bit keys: per-target implementation plus the
// dynamically dispatched Sorter::operator() overload.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

// HWY_TARGET_INCLUDE names this file itself; per the Highway foreach_target.h
// convention it is re-included once per enabled target, so the include order
// here must not be changed.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Sorts `num` uint64_t lanes in `keys`, interpreted by Traits128 as 128-bit
// keys, in descending order. `buf` is forwarded to Sort() as its buffer.
void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Presumably true for only one of the per-target passes, so that the code
// below is defined once - confirm against hwy/foreach_target.h.
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Desc);
}  // namespace

// Each uint128_t key is viewed as two uint64_t lanes, hence the count n * 2.
void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(Sort128Desc)
  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_f32a.cc b/media/highway/src/hwy/contrib/sort/vqsort_f32a.cc new file mode 100644 index 0000000000..5934f8a496 --- /dev/null +++
b/media/highway/src/hwy/contrib/sort/vqsort_f32a.cc @@ -0,0 +1,54 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Ascending sort of float keys: per-target implementation plus the dynamically
// dispatched Sorter::operator() overload.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

// HWY_TARGET_INCLUDE names this file itself; per the Highway foreach_target.h
// convention it is re-included once per enabled target, so the include order
// here must not be changed.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Sorts keys[0, num) in ascending order; `buf` is forwarded to Sort() as its
// buffer.
void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
  SortTag<float> d;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Presumably true for only one of the per-target passes, so that the code
// below is defined once - confirm against hwy/foreach_target.h.
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Asc);
}  // namespace

// Forwards to the SortF32Asc selected by dynamic dispatch, passing this
// Sorter's buffer.
void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, Get<float>());
}

}  // namespace hwy
#endif  // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_f32d.cc b/media/highway/src/hwy/contrib/sort/vqsort_f32d.cc new file mode 100644 index 0000000000..ec0469a744 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_f32d.cc @@ -0,0 +1,55 @@
// Copyright 2021 Google
LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Descending sort of float keys: per-target implementation plus the
// dynamically dispatched Sorter::operator() overload.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

// HWY_TARGET_INCLUDE names this file itself; per the Highway foreach_target.h
// convention it is re-included once per enabled target, so the include order
// here must not be changed.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Sorts keys[0, num) in descending order; `buf` is forwarded to Sort() as its
// buffer.
void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
                 float* HWY_RESTRICT buf) {
  SortTag<float> d;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Presumably true for only one of the per-target passes, so that the code
// below is defined once - confirm against hwy/foreach_target.h.
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Desc);
}  // namespace

// Forwards to the SortF32Desc selected by dynamic dispatch, passing this
// Sorter's buffer.
void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, Get<float>());
}

}  // namespace hwy
#endif  // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_f64a.cc b/media/highway/src/hwy/contrib/sort/vqsort_f64a.cc new file mode 100644 index 0000000000..b701c9f3de --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_f64a.cc @@ -0,0 +1,62 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License,
Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Ascending sort of double keys: per-target implementation plus the
// dynamically dispatched Sorter::operator() overload.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

// HWY_TARGET_INCLUDE names this file itself; per the Highway foreach_target.h
// convention it is re-included once per enabled target, so the include order
// here must not be changed.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Sorts keys[0, num) in ascending order; `buf` is forwarded to Sort() as its
// buffer. On targets where HWY_HAVE_FLOAT64 is 0 this must not be called: it
// asserts unconditionally (callers presumably check Sorter::HaveFloat64()
// first - confirm at call sites).
void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
                double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  SortTag<double> d;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Presumably true for only one of the per-target passes, so that the code
// below is defined once - confirm against hwy/foreach_target.h.
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Asc);
}  // namespace

// Forwards to the SortF64Asc selected by dynamic dispatch, passing this
// Sorter's buffer.
void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, Get<double>());
}

}  // namespace hwy
#endif  // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_f64d.cc b/media/highway/src/hwy/contrib/sort/vqsort_f64d.cc new file mode 100644 index 0000000000..87ae9ca191 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_f64d.cc @@ -0,0 +1,62 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache
License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Descending sort of double keys: per-target implementation plus the
// dynamically dispatched Sorter::operator() overload.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

// HWY_TARGET_INCLUDE names this file itself; per the Highway foreach_target.h
// convention it is re-included once per enabled target, so the include order
// here must not be changed.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Sorts keys[0, num) in descending order; `buf` is forwarded to Sort() as its
// buffer. On targets where HWY_HAVE_FLOAT64 is 0 this must not be called: it
// asserts unconditionally (callers presumably check Sorter::HaveFloat64()
// first - confirm at call sites).
void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
                 double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  SortTag<double> d;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Presumably true for only one of the per-target passes, so that the code
// below is defined once - confirm against hwy/foreach_target.h.
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Desc);
}  // namespace

// Forwards to the SortF64Desc selected by dynamic dispatch, passing this
// Sorter's buffer.
void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, Get<double>());
}

}  // namespace hwy
#endif  // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i16a.cc b/media/highway/src/hwy/contrib/sort/vqsort_i16a.cc new file mode 100644 index 0000000000..6e64eeb0e6 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_i16a.cc @@ -0,0 +1,60 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under
the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Ascending sort of int16_t keys: per-target implementation plus the
// dynamically dispatched Sorter::operator() overload.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

// HWY_TARGET_INCLUDE names this file itself; per the Highway foreach_target.h
// convention it is re-included once per enabled target, so the include order
// here must not be changed.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
// NOTE(review): when this condition is false (MSVC, non-debug build) nothing
// below is compiled, so this file provides no definition for the int16_t
// ascending Sorter::operator() declared in vqsort.h - confirm no caller needs
// it in that configuration.
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Sorts keys[0, num) in ascending order; `buf` is forwarded to Sort() as its
// buffer.
void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
                int16_t* HWY_RESTRICT buf) {
  SortTag<int16_t> d;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Presumably true for only one of the per-target passes, so that the code
// below is defined once - confirm against hwy/foreach_target.h.
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI16Asc);
}  // namespace

// Forwards to the SortI16Asc selected by dynamic dispatch, passing this
// Sorter's buffer.
void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, Get<int16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i16d.cc b/media/highway/src/hwy/contrib/sort/vqsort_i16d.cc new file mode 100644 index 0000000000..922cee3775 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_i16d.cc @@ -0,0 +1,60 @@
// Copyright 2021 Google LLC
//
SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Descending sort of int16_t keys: per-target implementation plus the
// dynamically dispatched Sorter::operator() overload.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

// HWY_TARGET_INCLUDE names this file itself; per the Highway foreach_target.h
// convention it is re-included once per enabled target, so the include order
// here must not be changed.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
// NOTE(review): when this condition is false (MSVC, non-debug build) nothing
// below is compiled, so this file provides no definition for the int16_t
// descending Sorter::operator() declared in vqsort.h - confirm no caller needs
// it in that configuration.
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Sorts keys[0, num) in descending order; `buf` is forwarded to Sort() as its
// buffer.
void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
                 int16_t* HWY_RESTRICT buf) {
  SortTag<int16_t> d;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

// Presumably true for only one of the per-target passes, so that the code
// below is defined once - confirm against hwy/foreach_target.h.
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI16Desc);
}  // namespace

// Forwards to the SortI16Desc selected by dynamic dispatch, passing this
// Sorter's buffer.
void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, Get<int16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i32a.cc b/media/highway/src/hwy/contrib/sort/vqsort_i32a.cc new file mode 100644 index 0000000000..12204fbaee --- /dev/null +++
b/media/highway/src/hwy/contrib/sort/vqsort_i32a.cc @@ -0,0 +1,55 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<int32_t> and OrderAscending lane traits. +void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num, + int32_t* HWY_RESTRICT buf) { + SortTag<int32_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI32Asc); +} // namespace +// Invokes SortI32Asc through HWY_DYNAMIC_DISPATCH; Get<int32_t>() supplies its buf argument. +void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, Get<int32_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i32d.cc b/media/highway/src/hwy/contrib/sort/vqsort_i32d.cc new file mode 100644 index 0000000000..fd2a4ff9fa --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_i32d.cc @@ -0,0 +1,55 @@ +// Copyright 
2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<int32_t> and OrderDescending lane traits. +void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num, + int32_t* HWY_RESTRICT buf) { + SortTag<int32_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI32Desc); +} // namespace +// Invokes SortI32Desc through HWY_DYNAMIC_DISPATCH; Get<int32_t>() supplies its buf argument. +void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, Get<int32_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i64a.cc b/media/highway/src/hwy/contrib/sort/vqsort_i64a.cc new file mode 100644 index 0000000000..6b9d225165 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_i64a.cc @@ -0,0 +1,55 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under 
the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<int64_t> and OrderAscending lane traits. +void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num, + int64_t* HWY_RESTRICT buf) { + SortTag<int64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI64Asc); +} // namespace +// Invokes SortI64Asc through HWY_DYNAMIC_DISPATCH; Get<int64_t>() supplies its buf argument. +void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, Get<int64_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i64d.cc b/media/highway/src/hwy/contrib/sort/vqsort_i64d.cc new file mode 100644 index 0000000000..ef3ac097b8 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_i64d.cc @@ -0,0 +1,55 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<int64_t> and OrderDescending lane traits. +void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num, + int64_t* HWY_RESTRICT buf) { + SortTag<int64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortI64Desc); +} // namespace +// Invokes SortI64Desc through HWY_DYNAMIC_DISPATCH; Get<int64_t>() supplies its buf argument. +void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, Get<int64_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u16a.cc b/media/highway/src/hwy/contrib/sort/vqsort_u16a.cc new file mode 100644 index 0000000000..8bef7fba32 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_u16a.cc @@ -0,0 +1,60 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +// Workaround for build timeout: the code up to the final #endif is skipped when HWY_COMPILER_MSVC is nonzero and HWY_IS_DEBUG_BUILD is zero +#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<uint16_t> and OrderAscending lane traits. +void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num, + uint16_t* HWY_RESTRICT buf) { + SortTag<uint16_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU16Asc); +} // namespace +// Invokes SortU16Asc through HWY_DYNAMIC_DISPATCH; Get<uint16_t>() supplies its buf argument. +void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, Get<uint16_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE + +#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u16d.cc b/media/highway/src/hwy/contrib/sort/vqsort_u16d.cc new file mode 100644 index 0000000000..4120873b9f --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_u16d.cc @@ -0,0 +1,60 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may 
not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +// Workaround for build timeout: the code up to the final #endif is skipped when HWY_COMPILER_MSVC is nonzero and HWY_IS_DEBUG_BUILD is zero +#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<uint16_t> and OrderDescending lane traits. +void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num, + uint16_t* HWY_RESTRICT buf) { + SortTag<uint16_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU16Desc); +} // namespace +// Invokes SortU16Desc through HWY_DYNAMIC_DISPATCH; Get<uint16_t>() supplies its buf argument. +void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, Get<uint16_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE + +#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u32a.cc b/media/highway/src/hwy/contrib/sort/vqsort_u32a.cc new file mode 100644 index 0000000000..4d33705b14 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_u32a.cc @@ -0,0 +1,55 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under 
the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<uint32_t> and OrderAscending lane traits. +void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num, + uint32_t* HWY_RESTRICT buf) { + SortTag<uint32_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU32Asc); +} // namespace +// Invokes SortU32Asc through HWY_DYNAMIC_DISPATCH; Get<uint32_t>() supplies its buf argument. +void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, Get<uint32_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u32d.cc b/media/highway/src/hwy/contrib/sort/vqsort_u32d.cc new file mode 100644 index 0000000000..e73fb82b6d --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_u32d.cc @@ -0,0 +1,55 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<uint32_t> and OrderDescending lane traits. +void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num, + uint32_t* HWY_RESTRICT buf) { + SortTag<uint32_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU32Desc); +} // namespace +// Invokes SortU32Desc through HWY_DYNAMIC_DISPATCH; Get<uint32_t>() supplies its buf argument. +void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, Get<uint32_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u64a.cc b/media/highway/src/hwy/contrib/sort/vqsort_u64a.cc new file mode 100644 index 0000000000..b1dc3f78b3 --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_u64a.cc @@ -0,0 +1,55 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<uint64_t> and OrderAscending lane traits. +void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num, + uint64_t* HWY_RESTRICT buf) { + SortTag<uint64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU64Asc); +} // namespace +// Invokes SortU64Asc through HWY_DYNAMIC_DISPATCH; Get<uint64_t>() supplies its buf argument. +void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n, + SortAscending) const { + HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, Get<uint64_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u64d.cc b/media/highway/src/hwy/contrib/sort/vqsort_u64d.cc new file mode 100644 index 0000000000..43acfd238f --- /dev/null +++ b/media/highway/src/hwy/contrib/sort/vqsort_u64d.cc @@ -0,0 +1,55 @@ +// Copyright 2021 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwy/contrib/sort/disabled_targets.h" +#include "hwy/contrib/sort/vqsort.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc" +#include "hwy/foreach_target.h" + +// After foreach_target +#include "hwy/contrib/sort/traits-inl.h" +#include "hwy/contrib/sort/vqsort-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace hwy { +namespace HWY_NAMESPACE { +// Forwards keys, num and buf to Sort() with a SortTag<uint64_t> and OrderDescending lane traits. +void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num, + uint64_t* HWY_RESTRICT buf) { + SortTag<uint64_t> d; + detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st; + Sort(d, st, keys, num, buf); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace hwy +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace hwy { +namespace { +HWY_EXPORT(SortU64Desc); +} // namespace +// Invokes SortU64Desc through HWY_DYNAMIC_DISPATCH; Get<uint64_t>() supplies its buf argument. +void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n, + SortDescending) const { + HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, Get<uint64_t>()); +} + +} // namespace hwy +#endif // HWY_ONCE |