diff --git a/hwy/contrib/thread_pool/thread_pool.h b/hwy/contrib/thread_pool/thread_pool.h index 1f19ab4a8f..41abd7dd51 100644 --- a/hwy/contrib/thread_pool/thread_pool.h +++ b/hwy/contrib/thread_pool/thread_pool.h @@ -458,7 +458,7 @@ class Stats { printf( "%3zu: %5d x %.2f/%5d x %4.1f tasks, %.2f steals; " "wake %7.3f ns, latency %6.3f < %7.3f us, barrier %7.3f us; " - "wait %.1f us (%5.0f reps, %4.1f%%), balance %4.1f%%-%5.1f%%, " + "wait %.1f us (%6.0f reps, %4.1f%%), balance %4.1f%%-%5.1f%%, " "func: %6.3f + %7.3f, " "%.1f%% of thread time %7.3f s; main:worker %5.1f%%\n", num_threads, num_run_static_, avg_tasks_static, num_run_dynamic_, @@ -589,7 +589,7 @@ class CallerAccumulator { const double task_len = avg_elapsed / avg_tasks_per_worker; printf( "%40s: %7.0f x (%3.0f%%) %2zu clusters, %4.1f workers @ " - "%5.1f tasks (%u-%u), " + "%5.1f tasks (%5u-%5u), " "%5.0f us wait, %6.1E us run (task len %6.1E us), total %6.2f s\n", caller, static_cast(calls_), pc_root, active_clusters, avg_workers, avg_tasks_per_worker, static_cast(min_tasks_), @@ -1605,7 +1605,7 @@ class alignas(HWY_ALIGNMENT) ThreadPool { // Returns whether threads were used. If not, there is no need to update // the autotuner config. template - bool RunWithoutAutotune(uint64_t begin, uint64_t end, HWY_MAYBE_UNUSED pool::Caller caller, + bool RunWithoutAutotune(uint64_t begin, uint64_t end, pool::Caller caller, const Closure& closure) { pool::Worker& main = workers_[0]; @@ -1649,6 +1649,8 @@ class alignas(HWY_ALIGNMENT) ThreadPool { PROFILER_END_ROOT_RUN(); shared_.LastRootEnd().Reset(); } +#else + (void)caller; #endif busy_.Clear(); diff --git a/hwy/ops/set_macros-inl.h b/hwy/ops/set_macros-inl.h index 2a5e1dc1ca..bf604d96da 100644 --- a/hwy/ops/set_macros-inl.h +++ b/hwy/ops/set_macros-inl.h @@ -141,7 +141,7 @@ #define HWY_TARGET_STR_AVX2 \ HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C -#ifndef HWY_HAVE_EVEX512 +#ifndef HWY_HAVE_EVEX512 // allow override // evex512 has been removed from clang 22, see // https://github.com/llvm/llvm-project/pull/157034 #if (1400 <= HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1600) || \ @@ -167,15 +167,21 @@ ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \ "avx512vpopcntdq,gfni" -// Force-disable for compilers that do not properly support avx512bf16. -#if !defined(HWY_AVX3_DISABLE_AVX512BF16) && \ +// Opt-out for compilers that do not properly support avx512bf16. +#ifndef HWY_AVX3_ENABLE_AVX512BF16 // allow override +// Default is to disable if the DISABLE macro is defined, or if old compiler. +// clang-cl 21.1.4 reportedly works; feel free to define this to 1 there. +#if defined(HWY_AVX3_DISABLE_AVX512BF16) || \ (HWY_COMPILER_CLANGCL || \ (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900)) -#define HWY_AVX3_DISABLE_AVX512BF16 +#define HWY_AVX3_ENABLE_AVX512BF16 0 +#else +#define HWY_AVX3_ENABLE_AVX512BF16 1 #endif +#endif // HWY_AVX3_ENABLE_AVX512BF16 -#if !defined(HWY_AVX3_DISABLE_AVX512BF16) +#if HWY_AVX3_ENABLE_AVX512BF16 #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16" #else #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL @@ -338,7 +344,7 @@ #define HWY_HAVE_FLOAT64 1 #define HWY_MEM_OPS_MIGHT_FAULT 0 #define HWY_NATIVE_FMA 1 -#if (HWY_TARGET <= HWY_AVX3_ZEN4) && !defined(HWY_AVX3_DISABLE_AVX512BF16) +#if (HWY_TARGET <= HWY_AVX3_ZEN4) && HWY_AVX3_ENABLE_AVX512BF16 #define HWY_NATIVE_DOT_BF16 1 #else #define HWY_NATIVE_DOT_BF16 0 diff --git a/hwy/ops/x86_128-inl.h b/hwy/ops/x86_128-inl.h index e496e3e9e2..5e1573831f 100644 --- a/hwy/ops/x86_128-inl.h +++ b/hwy/ops/x86_128-inl.h @@ -57,7 +57,7 @@ namespace detail { #undef HWY_AVX3_HAVE_F32_TO_BF16C #if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL && \ (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \ - !defined(HWY_AVX3_DISABLE_AVX512BF16) + HWY_AVX3_ENABLE_AVX512BF16 #define HWY_AVX3_HAVE_F32_TO_BF16C 1 #else #define HWY_AVX3_HAVE_F32_TO_BF16C 0