diff --git a/hwy/base.h b/hwy/base.h
index d6c75bbe72..a02fe7096f 100644
--- a/hwy/base.h
+++ b/hwy/base.h
@@ -217,7 +217,9 @@ namespace hwy {
 //------------------------------------------------------------------------------
 // Macros
 
-#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
+// Note: it is safe to remove `static` for users who want to use modules, but
+// that might be a breaking change for some users, hence we do not by default.
+#define HWY_API static HWY_INLINE HWY_FLATTEN
 
 #define HWY_CONCAT_IMPL(a, b) a##b
 #define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
@@ -2762,7 +2764,7 @@ HWY_API constexpr TTo ConvertScalarTo(TFrom in) {
 template <typename T1, typename T2>
 constexpr inline T1 DivCeil(T1 a, T2 b) {
 #if HWY_CXX_LANG >= 201703L
-  HWY_DASSERT(b != 0);
+  HWY_DASSERT(b != T2{0});
 #endif
   return (a + b - 1) / b;
 }
@@ -2985,9 +2987,10 @@ HWY_INLINE constexpr T AddWithWraparound(T t, T2 n) {
 // 64 x 64 = 128 bit multiplication
 HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
 #if defined(__SIZEOF_INT128__)
-  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
-  *upper = (uint64_t)(product >> 64);
-  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
+  __uint128_t product =
+      static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
+  *upper = static_cast<uint64_t>(product >> 64);
+  return static_cast<uint64_t>(product & 0xFFFFFFFFFFFFFFFFULL);
 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
   return _umul128(a, b, upper);
 #else
@@ -3004,9 +3007,9 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
 
 HWY_API int64_t Mul128(int64_t a, int64_t b, int64_t* HWY_RESTRICT upper) {
 #if defined(__SIZEOF_INT128__)
-  __int128_t product = (__int128_t)a * (__int128_t)b;
-  *upper = (int64_t)(product >> 64);
-  return (int64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
+  __int128_t product = static_cast<__int128_t>(a) * static_cast<__int128_t>(b);
+  *upper = static_cast<int64_t>(product >> 64);
+  return static_cast<int64_t>(product & 0xFFFFFFFFFFFFFFFFULL);
 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
   return _mul128(a, b, upper);
 #else
diff --git a/hwy/contrib/math/math-inl.h b/hwy/contrib/math/math-inl.h
index d741684576..24523dbaeb 100644
--- a/hwy/contrib/math/math-inl.h
+++ b/hwy/contrib/math/math-inl.h
@@ -342,7 +342,6 @@ HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) {
 * Valid Lane Types: float32, float64
 *        Max Error: ULP = 1
 *      Valid Range: [-39000, +39000]
-* @return sine and cosine of 'x'
 */
 template <class D, class V>
 HWY_INLINE void SinCos(D d, V x, V& s, V& c);
diff --git a/hwy/contrib/math/math_test.cc b/hwy/contrib/math/math_test.cc
index f6b0a74d28..1c512f631b 100644
--- a/hwy/contrib/math/math_test.cc
+++ b/hwy/contrib/math/math_test.cc
@@ -88,7 +88,7 @@ HWY_NOINLINE void TestMath(const char* name, T (*fx1)(T),
   // two pieces, [+0, max] and [-0, min], otherwise [min, max].
   int range_count = 1;
   UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}};
-  if ((min < 0.0) && (max > 0.0)) {
+  if ((min < T{0}) && (max > T{0})) {
     ranges[0][0] = BitCastScalar<UintT>(ConvertScalarTo<T>(+0.0));
     ranges[0][1] = max_bits;
     ranges[1][0] = BitCastScalar<UintT>(ConvertScalarTo<T>(-0.0));
@@ -122,9 +122,9 @@ HWY_NOINLINE void TestMath(const char* name, T (*fx1)(T),
       max_ulp = HWY_MAX(max_ulp, ulp);
       if (ulp > max_error_ulp) {
         fprintf(stderr, "%s: %s(%f) expected %E actual %E ulp %g max ulp %u\n",
-                hwy::TypeName(T(), Lanes(d)).c_str(), name, value, expected,
-                actual, static_cast<double>(ulp),
-                static_cast<uint32_t>(max_error_ulp));
+                hwy::TypeName(T(), Lanes(d)).c_str(), name, value,
+                static_cast<double>(expected), static_cast<double>(actual),
+                static_cast<double>(ulp), static_cast<uint32_t>(max_error_ulp));
       }
     }
   }
@@ -139,21 +139,20 @@ HWY_NOINLINE void TestMath(const char* name, T (*fx1)(T),
 }
 
 #undef DEFINE_MATH_TEST
-#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
-                         F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \
-  struct Test##NAME {                                                     \
-    template <class T, class D>                                           \
-    HWY_NOINLINE void operator()(T, D d) {                                \
-      if (sizeof(T) == 4) {                                               \
-        TestMath(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX,        \
-                 F32_ERROR);                                              \
-      } else {                                                            \
-        TestMath(HWY_STR(NAME), F64x1, F64xN, d,                          \
-                 static_cast<T>(F64_MIN), static_cast<T>(F64_MAX),        \
-                 F64_ERROR);                                              \
-      }                                                                   \
-    }                                                                     \
-  };                                                                      \
+#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
+                         F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \
+  struct Test##NAME {                                                     \
+    template <class T, class D, HWY_IF_T_SIZE(T, 4)>                      \
+    HWY_NOINLINE void operator()(T, D d) {                                \
+      TestMath(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX,          \
+               F32_ERROR);                                                \
+    }                                                                     \
+    template <class T, class D, HWY_IF_T_SIZE(T, 8)>                      \
+    HWY_NOINLINE void operator()(T, D d) {                                \
+      TestMath(HWY_STR(NAME), F64x1, F64xN, d, static_cast<T>(F64_MIN),   \
+               static_cast<T>(F64_MAX), F64_ERROR);                       \
+    }                                                                     \
+  };                                                                      \
   DEFINE_MATH_TEST_FUNC(NAME)
 
 // clang-format off
diff --git a/hwy/contrib/thread_pool/thread_pool.h b/hwy/contrib/thread_pool/thread_pool.h
index 5d46622dab..1bd891625d 100644
--- a/hwy/contrib/thread_pool/thread_pool.h
+++ b/hwy/contrib/thread_pool/thread_pool.h
@@ -287,19 +287,21 @@ struct Zones {
   profiler::ZoneHandle run;
 };
 
-#if PROFILER_ENABLED
+#if PROFILER_ENABLED || HWY_IDE
 // Accumulates timings and stats from main thread and workers.
 class Stats {
-  // Offsets passed to `PerThread`.
-  static constexpr size_t kBeforeRun = 0;
-  static constexpr size_t kDRun = 1;
-  static constexpr size_t kTasksStatic = 2;
-  static constexpr size_t kTasksDynamic = 3;
-  static constexpr size_t kTasksStolen = 4;
-  static constexpr size_t kDFuncStatic = 5;
-  static constexpr size_t kDFuncDynamic = 6;
-  static constexpr size_t kSentinel = 7;
+  // Up to `HWY_ALIGNMENT / 8` slots/offsets, passed to `PerThread`.
+  static constexpr size_t kDWait = 0;
+  static constexpr size_t kWaitReps = 1;
+  static constexpr size_t kTBeforeRun = 2;
+  static constexpr size_t kDRun = 3;
+  static constexpr size_t kTasksStatic = 4;
+  static constexpr size_t kTasksDynamic = 5;
+  static constexpr size_t kTasksStolen = 6;
+  static constexpr size_t kDFuncStatic = 7;
+  static constexpr size_t kDFuncDynamic = 8;
+  static constexpr size_t kSentinel = 9;
 
  public:
   Stats() {
@@ -347,14 +349,18 @@
   // Called concurrently by non-main worker threads after their `WorkerRun` and
   // before the barrier.
-  void NotifyThreadRun(size_t worker_idx, Stopwatch& stopwatch) {
+  void NotifyThreadRun(size_t worker_idx, timer::Ticks d_wait, size_t wait_reps,
+                       timer::Ticks t_before_run, timer::Ticks d_run) {
     HWY_DASSERT(worker_idx != 0);  // Not called by main thread.
     const size_t thread_idx = worker_idx - 1;
+    HWY_DASSERT(PerThread(thread_idx, kDWait) == 0);
+    HWY_DASSERT(PerThread(thread_idx, kWaitReps) == 0);
     HWY_DASSERT(PerThread(thread_idx, kBeforeRun) == 0);
     HWY_DASSERT(PerThread(thread_idx, kDRun) == 0);
-    // Also store `Origin` (start time) for computing wake latency.
-    PerThread(thread_idx, kBeforeRun) = stopwatch.Origin();
-    PerThread(thread_idx, kDRun) = stopwatch.Elapsed();
+    PerThread(thread_idx, kDWait) = d_wait;
+    PerThread(thread_idx, kWaitReps) = wait_reps;
+    PerThread(thread_idx, kTBeforeRun) = t_before_run;  // For wake latency.
+    PerThread(thread_idx, kDRun) = d_run;
   }
 
   // Called by the main thread after the barrier, whose store-release and
@@ -362,18 +368,26 @@
   // store `after_barrier`. If workers did, which by definition happens after
   // the barrier, then they would race with this function's reads.
   void NotifyMainRun(size_t num_threads, timer::Ticks t_before_wake,
-                     timer::Ticks d_wake, timer::Ticks d_run,
+                     timer::Ticks d_wake, timer::Ticks d_main_run,
                      timer::Ticks d_barrier) {
     HWY_DASSERT(num_threads <= kMaxThreads);
 
+    timer::Ticks min_d_run = ~timer::Ticks{0};
+    timer::Ticks max_d_run = 0;
+    timer::Ticks sum_d_run = 0;
     for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
       sum_tasks_static_ += PerThread(thread_idx, kTasksStatic);
       sum_tasks_dynamic_ += PerThread(thread_idx, kTasksDynamic);
       sum_tasks_stolen_ += PerThread(thread_idx, kTasksStolen);
       sum_d_func_static_ += PerThread(thread_idx, kDFuncStatic);
       sum_d_func_dynamic_ += PerThread(thread_idx, kDFuncDynamic);
-      sum_d_run_ += PerThread(thread_idx, kDRun);
-      const timer::Ticks t_before_run = PerThread(thread_idx, kBeforeRun);
+      sum_d_wait_ += PerThread(thread_idx, kDWait);
+      sum_wait_reps_ += PerThread(thread_idx, kWaitReps);
+      const timer::Ticks d_thread_run = PerThread(thread_idx, kDRun);
+      min_d_run = HWY_MIN(min_d_run, d_thread_run);
+      max_d_run = HWY_MAX(max_d_run, d_thread_run);
+      sum_d_run += d_thread_run;
+      const timer::Ticks t_before_run = PerThread(thread_idx, kTBeforeRun);
 
       for (size_t offset = 0; offset < kSentinel; ++offset) {
         PerThread(thread_idx, offset) = 0;
@@ -384,14 +398,22 @@
       sum_wake_latency_ += d_latency;
       max_wake_latency_ = HWY_MAX(max_wake_latency_, d_latency);
     }
+    const double inv_avg_d_run =
+        static_cast<double>(num_threads) / static_cast<double>(sum_d_run);
+    // Ratios of min and max run times to the average, for this pool.Run.
+    const double r_min = static_cast<double>(min_d_run) * inv_avg_d_run;
+    const double r_max = static_cast<double>(max_d_run) * inv_avg_d_run;
 
     num_run_++;  // `num_run_*` are incremented by `NotifyRun*`.
+    sum_d_run_ += sum_d_run;
+    sum_r_min_ += r_min;  // For average across all pool.Run.
+    sum_r_max_ += r_max;
     sum_d_wake_ += d_wake;  // `*wake_latency_` are updated above.
     sum_d_barrier_ += d_barrier;
-    sum_d_run_ += d_run;
-    sum_d_run_main_ += d_run;
+    sum_d_run_ += d_main_run;
+    sum_d_run_main_ += d_main_run;
   }
 
   void PrintAndReset(size_t num_threads, timer::Ticks d_thread_lifetime_ticks) {
@@ -402,14 +424,26 @@
 
     const double d_func_static = Seconds(sum_d_func_static_);
     const double d_func_dynamic = Seconds(sum_d_func_dynamic_);
-    const double d_run = Seconds(sum_d_run_);
-    const double d_run_main = Seconds(sum_d_run_main_);
+    const double sum_d_run = Seconds(sum_d_run_);
+    const double func_div_run = (d_func_static + d_func_dynamic) / sum_d_run;
+    if (!(0.95 <= func_div_run && func_div_run <= 1.0)) {
+      HWY_WARN("Func time %f should be similar to total run %f.",
+               d_func_static + d_func_dynamic, sum_d_run);
+    }
+    const double sum_d_run_main = Seconds(sum_d_run_main_);
+    const double max_wake_latency = Seconds(max_wake_latency_);
+    const double sum_d_wait = Seconds(sum_d_wait_);
     const double d_thread_lifetime = Seconds(d_thread_lifetime_ticks);
 
     const double inv_run = 1.0 / static_cast<double>(num_run_);
     const auto per_run = [inv_run](double sum) { return sum * inv_run; };
-    const auto us = [](double sec) { return sec * 1E6; };
-    const auto ns = [](double sec) { return sec * 1E9; };
+    const double avg_d_wake = per_run(Seconds(sum_d_wake_));
+    const double avg_wake_latency = per_run(Seconds(sum_wake_latency_));
+    const double avg_d_wait = per_run(sum_d_wait);
+    const double avg_wait_reps = per_run(static_cast<double>(sum_wait_reps_));
+    const double avg_d_barrier = per_run(Seconds(sum_d_barrier_));
+    const double avg_r_min = per_run(sum_r_min_);
+    const double avg_r_max = per_run(sum_r_max_);
 
     const size_t num_workers = 1 + num_threads;
     const double avg_tasks_static =
@@ -418,20 +452,25 @@
         Avg(sum_tasks_dynamic_, num_run_dynamic_ * num_workers);
     const double avg_steals =
         Avg(sum_tasks_stolen_, num_run_dynamic_ * num_workers);
+    const double avg_d_run = sum_d_run / num_workers;
+
+    const double pc_wait = sum_d_wait / d_thread_lifetime * 100.0;
+    const double pc_run = sum_d_run / d_thread_lifetime * 100.0;
+    const double pc_main = sum_d_run_main / avg_d_run * 100.0;
+    const auto us = [](double sec) { return sec * 1E6; };
+    const auto ns = [](double sec) { return sec * 1E9; };
 
     printf(
-        "%3zu: static %5d, %.2f tasks; dyn %5d, %4.1f tasks, %.2f steals; "
+        "%3zu: %5d x %.2f/%5d x %4.1f tasks, %.2f steals; "
         "wake %7.3f ns, latency %6.3f < %7.3f us, barrier %7.3f us; "
-        "func: static %6.3f + dyn %7.3f = %.1f%% of total run %7.3f s, "
-        "%.1f%% of thread time %7.3f s; main run share %5.1f%%\n",
-        num_threads, static_cast<int>(num_run_static_), avg_tasks_static,
-        static_cast<int>(num_run_dynamic_), avg_tasks_dynamic, avg_steals,
-        ns(per_run(Seconds(sum_d_wake_))),
-        us(per_run(Seconds(sum_wake_latency_))), us(Seconds(max_wake_latency_)),
-        us(per_run(Seconds(sum_d_barrier_))), d_func_static, d_func_dynamic,
-        (d_func_static + d_func_dynamic) / d_run * 100.0, d_run,
-        d_run / d_thread_lifetime * 100.0, d_thread_lifetime,
-        d_run_main * 100.0 / (d_run / (1 + num_threads)));
+        "wait %.1f us (%5.0f reps, %4.1f%%), balance %4.1f%%-%5.1f%%, "
+        "func: %6.3f + %7.3f, "
+        "%.1f%% of thread time %7.3f s; main:worker %5.1f%%\n",
+        num_threads, num_run_static_, avg_tasks_static, num_run_dynamic_,
+        avg_tasks_dynamic, avg_steals, ns(avg_d_wake), us(avg_wake_latency),
+        us(max_wake_latency), us(avg_d_barrier), us(avg_d_wait), avg_wait_reps,
+        pc_wait, avg_r_min * 100.0, avg_r_max * 100.0, d_func_static,
+        d_func_dynamic, pc_run, d_thread_lifetime, pc_main);
 
     Reset(num_threads);
   }
 
@@ -441,17 +480,21 @@
     num_run_static_ = 0;
     num_run_dynamic_ = 0;
+    sum_tasks_stolen_ = 0;
 
     sum_tasks_static_ = 0;
     sum_tasks_dynamic_ = 0;
-    sum_tasks_stolen_ = 0;
 
     sum_d_wake_ = 0;
     sum_wake_latency_ = 0;
     max_wake_latency_ = 0;
+    sum_d_wait_ = 0;
+    sum_wait_reps_ = 0;
     sum_d_barrier_ = 0;
     sum_d_func_static_ = 0;
     sum_d_func_dynamic_ = 0;
+    sum_r_min_ = 0.0;
+    sum_r_max_ = 0.0;
     sum_d_run_ = 0;
     sum_d_run_main_ = 0;
     // ctor and `NotifyMainRun` already reset `PerThread`.
@@ -471,25 +514,28 @@
     return per_thread_[thread_idx * kU64PerLine + offset];
   }
 
-  int64_t num_run_;
-  int64_t num_run_static_;
-  int64_t num_run_dynamic_;
+  int32_t num_run_;
+  int32_t num_run_static_;
+  int32_t num_run_dynamic_;
+  int32_t sum_tasks_stolen_;
 
   int64_t sum_tasks_static_;
   int64_t sum_tasks_dynamic_;
-  int64_t sum_tasks_stolen_;
 
   timer::Ticks sum_d_wake_;
   timer::Ticks sum_wake_latency_;
   timer::Ticks max_wake_latency_;
+  timer::Ticks sum_d_wait_;
+  uint64_t sum_wait_reps_;
   timer::Ticks sum_d_barrier_;
   timer::Ticks sum_d_func_static_;
   timer::Ticks sum_d_func_dynamic_;
+  double sum_r_min_;
+  double sum_r_max_;
   timer::Ticks sum_d_run_;
   timer::Ticks sum_d_run_main_;
-  HWY_MEMBER_VAR_MAYBE_UNUSED uint64_t padding_[kU64PerLine - 14];
 
   // One cache line per pool thread to avoid false sharing.
   uint64_t per_thread_[kMaxThreads * kU64PerLine];
 };
@@ -501,7 +547,8 @@ static_assert(sizeof(Stats) == (kMaxThreads + 1) * HWY_ALIGNMENT, "Wrong size");
 struct Stats {
   void NotifyRunStatic(size_t, timer::Ticks) {}
   void NotifyRunDynamic(size_t, size_t, size_t, timer::Ticks) {}
-  void NotifyThreadRun(size_t, Stopwatch&) {}
+  void NotifyThreadRun(size_t, timer::Ticks, size_t, timer::Ticks,
+                       timer::Ticks) {}
   void NotifyMainRun(size_t, timer::Ticks, timer::Ticks, timer::Ticks,
                      timer::Ticks) {}
   void PrintAndReset(size_t, timer::Ticks) {}
@@ -1194,8 +1241,9 @@ class alignas(HWY_ALIGNMENT) ThreadPool {
       fprintf(stderr,
               "Pool %3zu: %s %8.0f +/- %6.0f. Gain %.2fx [%.2fx, %.2fx]\n",
               NumWorkers(), auto_tuner.Best()->ToString().c_str(), best_cost,
-              AT.Stddev(), s_ratio.GeometricMean(), s_ratio.Min(),
-              s_ratio.Max());
+              AT.Stddev(), s_ratio.GeometricMean(),
+              static_cast<double>(s_ratio.Min()),
+              static_cast<double>(s_ratio.Max()));
     }
   }
   SendConfig(next);
@@ -1261,12 +1309,16 @@ class alignas(HWY_ALIGNMENT) ThreadPool {
     // Main worker also calls this, so their epochs match.
     const uint32_t epoch = worker.AdvanceWorkerEpoch();
 
-    // TODO: log number of spin-wait iterations.
-    (void)wait.UntilWoken(worker, spin);
     Stopwatch stopwatch = worker.MakeStopwatch();
+
+    const size_t wait_reps = wait.UntilWoken(worker, spin);
+    const timer::Ticks d_wait = stopwatch.Elapsed();
+    const timer::Ticks t_before_run = stopwatch.Origin();
+
     tasks.WorkerRun(&worker);
-    shared.stats.NotifyThreadRun(worker.Index(), stopwatch);
+    const timer::Ticks d_run = stopwatch.Elapsed();
+    shared.stats.NotifyThreadRun(worker.Index(), d_wait, wait_reps,
+                                 t_before_run, d_run);
 
     // Notify barrier after `WorkerRun`. Note that we cannot send an
     // after-barrier timestamp, see above.
diff --git a/hwy/contrib/thread_pool/topology.cc b/hwy/contrib/thread_pool/topology.cc
index 8da62a09d6..b041799fca 100644
--- a/hwy/contrib/thread_pool/topology.cc
+++ b/hwy/contrib/thread_pool/topology.cc
@@ -1018,7 +1018,7 @@ bool InitCachesSysfs(Caches& caches) {
   // and their properties. It's OK to return false; callers are responsible for
   // assuming reasonable defaults.
 #ifndef __ANDROID__
-  HWY_WARN("sysfs detected L1=%u L2=%u, err %x\n", caches[1].size_kib,
+  HWY_WARN("sysfs detected L1=%u L2=%u, err %d\n", caches[1].size_kib,
            caches[2].size_kib, errno);
 #endif
   return false;
diff --git a/hwy/highway_test.cc b/hwy/highway_test.cc
index c5d1f15609..b4593434ac 100644
--- a/hwy/highway_test.cc
+++ b/hwy/highway_test.cc
@@ -548,7 +548,7 @@ struct TestBlocks {
 };
 
 HWY_NOINLINE void TestAllBlocks() {
-  ForAllTypes(ForPartialVectors());
+  ForAllTypes(ForPartialVectors());
 }
 
 struct TestBlockDFromD {
diff --git a/hwy/perf_counters.cc b/hwy/perf_counters.cc
index 472d19b0e8..4cad466d67 100644
--- a/hwy/perf_counters.cc
+++ b/hwy/perf_counters.cc
@@ -343,7 +343,7 @@ class PMU {
 
 // Monostate, see header.
 PMU& GetPMU() {
-  static PMU pmu;
+  static PMU& pmu = *new PMU();  // avoids exit-dtor warning (no dtor required)
   return pmu;
 }
diff --git a/hwy/print.cc b/hwy/print.cc
index ed17b4e965..cea41042b6 100644
--- a/hwy/print.cc
+++ b/hwy/print.cc
@@ -72,7 +72,7 @@ HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
     float value;
     CopyBytes<4>(ptr, &value);
     // NOLINTNEXTLINE
-    snprintf(string100, 100, hwy::ScalarAbs(value) < 1E-6 ? "%.9E" : "%.9f",
+    snprintf(string100, 100, hwy::ScalarAbs(value) < 1E-6f ? "%.9E" : "%.9f",
             static_cast<double>(value));
   } else if (info.is_signed) {
     int32_t value;
diff --git a/hwy/stats.cc b/hwy/stats.cc
index 4c53124b5b..c6b7f37d93 100644
--- a/hwy/stats.cc
+++ b/hwy/stats.cc
@@ -69,7 +69,7 @@ std::string Stats::ToString(int exclude) const {
   if (Count() == 0) return std::string("(none)");
 
   char buf[300];
-  int pos = 0;
+  size_t pos = 0;
   int ret;  // snprintf - bytes written or negative for error.
 
   if ((exclude & kNoCount) == 0) {
@@ -93,8 +93,8 @@ std::string Stats::ToString(int exclude) const {
   }
 
   if ((exclude & kNoMinMax) == 0) {
-    ret = snprintf(buf + pos, sizeof(buf) - pos, "Min=%8.5e Max=%8.5e ", Min(),
-                   Max());
+    ret = snprintf(buf + pos, sizeof(buf) - pos, "Min=%8.5e Max=%8.5e ",
+                   static_cast<double>(Min()), static_cast<double>(Max()));
     HWY_ASSERT(ret > 0);
     pos += ret;
   }
@@ -113,7 +113,7 @@ std::string Stats::ToString(int exclude) const {
     pos += ret;
   }
 
-  HWY_ASSERT(pos < static_cast<int>(sizeof(buf)));
+  HWY_ASSERT(pos < sizeof(buf));
   return buf;
 }
diff --git a/hwy/tests/crypto_test.cc b/hwy/tests/crypto_test.cc
index 13da12be68..f6b6dc44a6 100644
--- a/hwy/tests/crypto_test.cc
+++ b/hwy/tests/crypto_test.cc
@@ -660,8 +660,8 @@ struct TestCLMul {
     CopyBytes(kCLMulLower, expected_lower.get());
     CopyBytes(kCLMulUpper, expected_upper.get());
     const size_t padding_size = (padded - kCLMulNum) * sizeof(T);
-    memset(expected_lower.get() + kCLMulNum, 0, padding_size);
-    memset(expected_upper.get() + kCLMulNum, 0, padding_size);
+    ZeroBytes(expected_lower.get() + kCLMulNum, padding_size);
+    ZeroBytes(expected_upper.get() + kCLMulNum, padding_size);
 
     // Random inputs in each lane
     RandomState rng;
diff --git a/hwy/tests/demote_test.cc b/hwy/tests/demote_test.cc
index b2dc7f4a9d..31898e2a79 100644
--- a/hwy/tests/demote_test.cc
+++ b/hwy/tests/demote_test.cc
@@ -511,8 +511,8 @@ class TestReorderDemote2To {
       const auto sum_expected = ReduceSum(d32, Add(f0, f1));
       const auto sum_actual = ReduceSum(d32, Add(promoted0, promoted1));
-      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
-                 sum_actual <= sum_expected + 1E-4);
+      HWY_ASSERT(sum_expected - 1E-4f <= sum_actual &&
+                 sum_actual <= sum_expected + 1E-4f);
 
       // Ensure values are the same after sorting to undo the Reorder
       Store(f0, d32, expected.get() + 0);
diff --git a/hwy/tests/float_test.cc b/hwy/tests/float_test.cc
index f0a00aaedc..f971dde4fe 100644
--- a/hwy/tests/float_test.cc
+++ b/hwy/tests/float_test.cc
@@ -150,7 +150,7 @@ struct TestApproximateReciprocal {
       if (l1 > max_l1) {
         max_l1 = l1;
         worst_expected = expected;
-        worst_actual = actual[i];
+        worst_actual = ConvertScalarTo<double>(actual[i]);
       }
     }
     const double abs_worst_expected = ScalarAbs(worst_expected);
@@ -197,7 +197,7 @@ struct TestMaskedApproximateReciprocal {
       if (l1 > max_l1) {
         max_l1 = l1;
         worst_expected = expected;
-        worst_actual = actual[i];
+        worst_actual = ConvertScalarTo<double>(actual[i]);
      }
     }
     const double abs_worst_expected = ScalarAbs(worst_expected);
@@ -381,19 +381,8 @@ HWY_NOINLINE void TestAllRound() {
   ForFloatTypes(ForPartialVectors<TestRound>());
 }
 
-struct TestNearestInt {
-  static HWY_INLINE int16_t RoundScalarFloatToInt(float16_t f) {
-    return static_cast<int16_t>(std::lrintf(ConvertScalarTo<float>(f)));
-  }
-
-  static HWY_INLINE int32_t RoundScalarFloatToInt(float f) {
-    return static_cast<int32_t>(std::lrintf(f));
-  }
-
-  static HWY_INLINE int64_t RoundScalarFloatToInt(double f) {
-    return static_cast<int64_t>(std::llrint(f));
-  }
-
+class TestNearestInt {
+ public:
   template <class TF, class DF>
   HWY_NOINLINE void operator()(TF tf, const DF df) {
     using TI = MakeSigned<TF>;
@@ -423,6 +412,19 @@ struct TestNearestInt {
       HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan));
     }
   }
+
+ private:
+  static HWY_INLINE int16_t RoundScalarFloatToInt(float16_t f) {
+    return static_cast<int16_t>(std::lrintf(ConvertScalarTo<float>(f)));
+  }
+
+  static HWY_INLINE int32_t RoundScalarFloatToInt(float f) {
+    return static_cast<int32_t>(std::lrintf(f));
+  }
+
+  static HWY_INLINE int64_t RoundScalarFloatToInt(double f) {
+    return static_cast<int64_t>(std::llrint(f));
+  }
 };
 
 HWY_NOINLINE void TestAllNearestInt() {
diff --git a/hwy/tests/list_targets.cc b/hwy/tests/list_targets.cc
index 0ed64f4379..7232cdae35 100644
--- a/hwy/tests/list_targets.cc
+++ b/hwy/tests/list_targets.cc
@@ -30,6 +30,7 @@
 HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {
+namespace {
 
 void PrintCompiler() {
   if (HWY_COMPILER_ICX) {
@@ -141,6 +142,7 @@ void PrintAll() {
   TestVisitor();
 }
 
+}  // namespace
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
diff --git a/hwy/tests/mask_mem_test.cc b/hwy/tests/mask_mem_test.cc
index 0fd84e6e9a..26a172f60d 100644
--- a/hwy/tests/mask_mem_test.cc
+++ b/hwy/tests/mask_mem_test.cc
@@ -333,7 +333,7 @@ class TestStoreMaskBits {
       const auto mask2 = LoadMaskBits(di, actual.get());
      HWY_ASSERT_MASK_EQ(di, mask, mask2);
 
-      memset(expected.get(), 0, expected_num_bytes);
+      ZeroBytes(expected.get(), expected_num_bytes);
       for (size_t i = 0; i < N; ++i) {
         expected[i / 8] =
             static_cast<uint8_t>(expected[i / 8] | (bool_lanes[i] << (i % 8)));
diff --git a/hwy/tests/mask_set_test.cc b/hwy/tests/mask_set_test.cc
index 4c526b26cb..1d765d19ce 100644
--- a/hwy/tests/mask_set_test.cc
+++ b/hwy/tests/mask_set_test.cc
@@ -115,7 +115,7 @@ struct TestSetBeforeFirst {
     const size_t N = Lanes(di);
     auto bool_lanes = AllocateAligned<TI>(N);
     HWY_ASSERT(bool_lanes);
-    memset(bool_lanes.get(), 0, N * sizeof(TI));
+    ZeroBytes(bool_lanes.get(), N * sizeof(TI));
 
     // For all combinations of zero/nonzero state of subset of lanes:
     const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
@@ -149,7 +149,7 @@ struct TestSetAtOrBeforeFirst {
     const size_t N = Lanes(di);
     auto bool_lanes = AllocateAligned<TI>(N);
     HWY_ASSERT(bool_lanes);
-    memset(bool_lanes.get(), 0, N * sizeof(TI));
+    ZeroBytes(bool_lanes.get(), N * sizeof(TI));
 
     // For all combinations of zero/nonzero state of subset of lanes:
     const size_t max_lanes =
         AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
@@ -183,7 +183,7 @@ struct TestSetOnlyFirst {
     const size_t N = Lanes(di);
     auto bool_lanes = AllocateAligned<TI>(N);
     HWY_ASSERT(bool_lanes);
-    memset(bool_lanes.get(), 0, N * sizeof(TI));
+    ZeroBytes(bool_lanes.get(), N * sizeof(TI));
 
     auto expected_lanes = AllocateAligned<TI>(N);
     HWY_ASSERT(expected_lanes);
@@ -194,7 +194,7 @@ struct TestSetOnlyFirst {
       bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
     }
 
-    memset(expected_lanes.get(), 0, N * sizeof(TI));
+    ZeroBytes(expected_lanes.get(), N * sizeof(TI));
     if (code != 0) {
       const size_t idx_of_first_lane =
           Num0BitsBelowLS1Bit_Nonzero64(static_cast<uint64_t>(code));
@@ -222,7 +222,7 @@ struct TestSetAtOrAfterFirst {
     const size_t N = Lanes(di);
     auto bool_lanes = AllocateAligned<TI>(N);
     HWY_ASSERT(bool_lanes);
-    memset(bool_lanes.get(), 0, N * sizeof(TI));
+    ZeroBytes(bool_lanes.get(), N * sizeof(TI));
 
     // For all combinations of zero/nonzero state of subset of lanes:
     const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
diff --git a/hwy/tests/mask_test.cc b/hwy/tests/mask_test.cc
index e73a790c37..ceda38e413 100644
--- a/hwy/tests/mask_test.cc
+++ b/hwy/tests/mask_test.cc
@@ -35,7 +35,7 @@ struct TestMaskFromVec {
     auto lanes = AllocateAligned<T>(N);
     HWY_ASSERT(lanes);
 
-    memset(lanes.get(), 0, N * sizeof(T));
+    ZeroBytes(lanes.get(), N * sizeof(T));
     const Mask<D> actual_false = MaskFromVec(Load(d, lanes.get()));
     HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);
diff --git a/hwy/tests/slide_up_down_test.cc b/hwy/tests/slide_up_down_test.cc
index 1fc2da4629..2befb60cb4 100644
--- a/hwy/tests/slide_up_down_test.cc
+++ b/hwy/tests/slide_up_down_test.cc
@@ -13,8 +13,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <string.h>  // memset
-
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "tests/slide_up_down_test.cc"
 #include "hwy/foreach_target.h"  // IWYU pragma: keep
diff --git a/hwy/tests/test_util-inl.h b/hwy/tests/test_util-inl.h
index 1672016e69..8a7b638481 100644
--- a/hwy/tests/test_util-inl.h
+++ b/hwy/tests/test_util-inl.h
@@ -16,7 +16,6 @@
 // Target-specific helper functions for use by *_test.cc.
 
 #include <stdint.h>
-#include <string.h>  // memset
 
 // IWYU pragma: begin_exports
 #include
@@ -178,8 +177,8 @@ HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
   auto bits_a = AllocateAligned<uint8_t>(HWY_MAX(size_t{8}, N8));
   auto bits_b = AllocateAligned<uint8_t>(size_t{HWY_MAX(8, N8)});
   HWY_ASSERT(bits_a && bits_b);
-  memset(bits_a.get(), 0, N8);
-  memset(bits_b.get(), 0, N8);
+  ZeroBytes(bits_a.get(), N8);
+  ZeroBytes(bits_b.get(), N8);
   const size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
   const size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
   AssertEqual(num_bytes_a, num_bytes_b, target_name, filename, line);
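
For readers skimming the patch, a minimal standalone sketch (not part of the diff) of the three hwy/base.h helpers it touches: Mul128, DivCeil, and ZeroBytes. It uses only the call shapes visible in the hunks above; the surrounding main() and the literal inputs are illustrative assumptions, not code from the patch.

// Illustrative sketch only; exercises helpers whose signatures appear in the diff above.
#include <cstdint>
#include <cstdio>

#include "hwy/base.h"

int main() {
  // Mul128: full 64x64 -> 128-bit product; the high half is written to *upper.
  uint64_t upper = 0;
  const uint64_t lower =
      hwy::Mul128(0x1234567890ABCDEFULL, 1000000007ULL, &upper);
  printf("Mul128 lower=%016llx upper=%016llx\n",
         static_cast<unsigned long long>(lower),
         static_cast<unsigned long long>(upper));

  // DivCeil: integer division rounded up; the patch only tightens its debug
  // assert to compare against T2{0} instead of a plain 0 literal.
  printf("DivCeil(10, 3) = %d\n", hwy::DivCeil(10, 3));  // prints 4

  // ZeroBytes(ptr, num_bytes): the replacement for memset(ptr, 0, num_bytes)
  // used throughout the updated tests.
  uint8_t buf[16];
  hwy::ZeroBytes(buf, sizeof(buf));
  printf("buf[0]=%d buf[15]=%d\n", buf[0], buf[15]);  // both 0
  return 0;
}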