这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions hwy/contrib/thread_pool/thread_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ class Stats {
printf(
"%3zu: %5d x %.2f/%5d x %4.1f tasks, %.2f steals; "
"wake %7.3f ns, latency %6.3f < %7.3f us, barrier %7.3f us; "
"wait %.1f us (%5.0f reps, %4.1f%%), balance %4.1f%%-%5.1f%%, "
"wait %.1f us (%6.0f reps, %4.1f%%), balance %4.1f%%-%5.1f%%, "
"func: %6.3f + %7.3f, "
"%.1f%% of thread time %7.3f s; main:worker %5.1f%%\n",
num_threads, num_run_static_, avg_tasks_static, num_run_dynamic_,
Expand Down Expand Up @@ -589,7 +589,7 @@ class CallerAccumulator {
const double task_len = avg_elapsed / avg_tasks_per_worker;
printf(
"%40s: %7.0f x (%3.0f%%) %2zu clusters, %4.1f workers @ "
"%5.1f tasks (%u-%u), "
"%5.1f tasks (%5u-%5u), "
"%5.0f us wait, %6.1E us run (task len %6.1E us), total %6.2f s\n",
caller, static_cast<double>(calls_), pc_root, active_clusters,
avg_workers, avg_tasks_per_worker, static_cast<uint32_t>(min_tasks_),
Expand Down Expand Up @@ -1605,7 +1605,7 @@ class alignas(HWY_ALIGNMENT) ThreadPool {
// Returns whether threads were used. If not, there is no need to update
// the autotuner config.
template <class Closure>
bool RunWithoutAutotune(uint64_t begin, uint64_t end, HWY_MAYBE_UNUSED pool::Caller caller,
bool RunWithoutAutotune(uint64_t begin, uint64_t end, pool::Caller caller,
const Closure& closure) {
pool::Worker& main = workers_[0];

Expand Down Expand Up @@ -1649,6 +1649,8 @@ class alignas(HWY_ALIGNMENT) ThreadPool {
PROFILER_END_ROOT_RUN();
shared_.LastRootEnd().Reset();
}
#else
(void)caller;
#endif

busy_.Clear();
Expand Down
18 changes: 12 additions & 6 deletions hwy/ops/set_macros-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@
#define HWY_TARGET_STR_AVX2 \
HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C

#ifndef HWY_HAVE_EVEX512
#ifndef HWY_HAVE_EVEX512 // allow override
// evex512 has been removed from clang 22, see
// https://github.com/llvm/llvm-project/pull/157034
#if (1400 <= HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1600) || \
Expand All @@ -167,15 +167,21 @@
",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
"avx512vpopcntdq,gfni"

// Force-disable for compilers that do not properly support avx512bf16.
#if !defined(HWY_AVX3_DISABLE_AVX512BF16) && \
// Opt-out for compilers that do not properly support avx512bf16.
#ifndef HWY_AVX3_ENABLE_AVX512BF16 // allow override
// By default, disable if HWY_AVX3_DISABLE_AVX512BF16 is defined, or if the
// compiler is clang-cl or too old. clang-cl 21.1.4 reportedly works; feel free
// to define HWY_AVX3_ENABLE_AVX512BF16 to 1 there.
#if defined(HWY_AVX3_DISABLE_AVX512BF16) || \
(HWY_COMPILER_CLANGCL || \
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
(HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900))
#define HWY_AVX3_DISABLE_AVX512BF16
#define HWY_AVX3_ENABLE_AVX512BF16 0
#else
#define HWY_AVX3_ENABLE_AVX512BF16 1
#endif
#endif // HWY_AVX3_ENABLE_AVX512BF16

#if !defined(HWY_AVX3_DISABLE_AVX512BF16)
#if HWY_AVX3_ENABLE_AVX512BF16
#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
#else
#define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL
Expand Down Expand Up @@ -338,7 +344,7 @@
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#if (HWY_TARGET <= HWY_AVX3_ZEN4) && !defined(HWY_AVX3_DISABLE_AVX512BF16)
#if (HWY_TARGET <= HWY_AVX3_ZEN4) && HWY_AVX3_ENABLE_AVX512BF16
#define HWY_NATIVE_DOT_BF16 1
#else
#define HWY_NATIVE_DOT_BF16 0
Expand Down
2 changes: 1 addition & 1 deletion hwy/ops/x86_128-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ namespace detail {
#undef HWY_AVX3_HAVE_F32_TO_BF16C
#if HWY_TARGET <= HWY_AVX3_ZEN4 && !HWY_COMPILER_CLANGCL && \
(HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 900) && \
!defined(HWY_AVX3_DISABLE_AVX512BF16)
HWY_AVX3_ENABLE_AVX512BF16
#define HWY_AVX3_HAVE_F32_TO_BF16C 1
#else
#define HWY_AVX3_HAVE_F32_TO_BF16C 0
Expand Down