diff --git a/src/gaussian/f16.rs b/src/gaussian/f16.rs index e1b04288..7c41c0f3 100644 --- a/src/gaussian/f16.rs +++ b/src/gaussian/f16.rs @@ -1,3 +1,4 @@ +#![allow(dead_code)] // ShaderType derives emit unused check helpers use std::marker::Copy; use half::f16; @@ -28,6 +29,7 @@ use crate::gaussian::{ }; +#[allow(dead_code)] #[derive( Clone, Debug, @@ -132,6 +134,7 @@ impl From<[u32; 4]> for RotationScaleOpacityPacked128 { } +#[allow(dead_code)] #[derive( Clone, Debug, @@ -201,6 +204,7 @@ impl From<[u32; 4]> for Covariance3dOpacityPacked128 { +#[allow(dead_code)] #[derive( Clone, Debug, @@ -299,3 +303,4 @@ pub fn unpack_u32_to_f32s(value: u32) -> (f32, f32) { let (upper, lower) = unpack_u32_to_f16s(value); (upper.to_f32(), lower.to_f32()) } + diff --git a/src/gaussian/f32.rs b/src/gaussian/f32.rs index e0d3b9fe..4e589f7a 100644 --- a/src/gaussian/f32.rs +++ b/src/gaussian/f32.rs @@ -1,3 +1,4 @@ +#![allow(dead_code)] // ShaderType derives emit unused check helpers use std::marker::Copy; use bevy::{ @@ -25,6 +26,7 @@ use crate::gaussian::{ pub type Position = [f32; 3]; +#[allow(dead_code)] #[derive( Clone, Debug, @@ -58,6 +60,7 @@ impl From<[f32; 4]> for PositionTimestamp { } +#[allow(dead_code)] #[derive( Clone, Debug, @@ -99,6 +102,7 @@ impl From<[f32; 4]> for PositionVisibility { } +#[allow(dead_code)] #[derive( Clone, Debug, @@ -124,6 +128,7 @@ impl From<[f32; 4]> for Rotation { } +#[allow(dead_code)] #[derive( Clone, Debug, @@ -182,6 +187,7 @@ impl From<[f32; 8]> for IsotropicRotations { } +#[allow(dead_code)] #[derive( Clone, Debug, @@ -215,6 +221,7 @@ impl From<[f32; 4]> for ScaleOpacity { } +#[allow(dead_code)] #[derive( Clone, Debug, @@ -246,6 +253,7 @@ impl From<[f32; 4]> for TimestampTimescale { } +#[allow(dead_code)] #[derive( Clone, Debug, @@ -280,3 +288,4 @@ impl From<&Gaussian3d> for Covariance3dOpacity { } } } + diff --git a/src/material/spherical_harmonics.rs b/src/material/spherical_harmonics.rs index 88d5e816..7e3fe86a 100644 --- a/src/material/spherical_harmonics.rs +++ b/src/material/spherical_harmonics.rs @@ -1,3 +1,4 @@ +#![allow(dead_code)] // ShaderType derives emit unused check helpers use std::marker::Copy; use bevy::{ @@ -97,6 +98,7 @@ pub const SH_VEC4_PLANES: usize = SH_COEFF_COUNT / 4; // } +#[allow(dead_code)] #[derive( Clone, Copy, @@ -240,3 +242,4 @@ where d.deserialize_tuple(SH_COEFF_COUNT, CoefficientsVisitor) } + diff --git a/src/material/spherindrical_harmonics.rs b/src/material/spherindrical_harmonics.rs index 598be565..92a49410 100644 --- a/src/material/spherindrical_harmonics.rs +++ b/src/material/spherindrical_harmonics.rs @@ -1,3 +1,4 @@ +#![allow(dead_code)] // ShaderType derives emit unused check helpers use std::marker::Copy; use bevy::{ @@ -87,6 +88,7 @@ impl Plugin for SpherindricalHarmonicCoefficientsPlugin { // pub coefficients: [[u32; POD_ARRAY_SIZE]; POD_PLANE_COUNT], // } +#[allow(dead_code)] #[derive( Clone, Copy, @@ -252,3 +254,4 @@ where d.deserialize_tuple(SH_4D_COEFF_COUNT, CoefficientsVisitor) } + diff --git a/src/render/mod.rs b/src/render/mod.rs index fab8c109..913ca9cd 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -1,3 +1,4 @@ +#![allow(dead_code)] // ShaderType derives emit unused check helpers use std::{ hash::Hash, num::NonZero, @@ -614,7 +615,7 @@ impl Default for ShaderDefines { let workgroup_entries_a = workgroup_invocations_a * entries_per_invocation_a; let workgroup_entries_c = workgroup_invocations_c * entries_per_invocation_c; let sorting_buffer_size = radix_base * radix_digit_places * - std::mem::size_of::() as u32 + 5 * std::mem::size_of::() as u32; + std::mem::size_of::() as u32 + (5 + radix_base) * std::mem::size_of::() as u32; Self { radix_bits_per_digit, @@ -837,6 +838,7 @@ type DrawGaussians = ( ); +#[allow(dead_code)] #[derive(Component, ShaderType, Clone, Copy)] pub struct CloudUniform { pub transform: Mat4, @@ -1131,6 +1133,7 @@ pub fn queue_gaussian_view_bind_groups( } // Prepare the compute view bind group using the compute_view_layout (for compute pipelines) +#[allow(clippy::too_many_arguments)] pub fn queue_gaussian_compute_view_bind_groups( mut commands: Commands, render_device: Res, @@ -1149,7 +1152,6 @@ pub fn queue_gaussian_compute_view_bind_groups( globals_buffer: Res, ) where - R: PlanarSync, R::GpuPlanarType: GpuPlanarStorage, { if let ( @@ -1396,3 +1398,4 @@ where RenderCommandResult::Success } } + diff --git a/src/sort/mod.rs b/src/sort/mod.rs index 5285d1a2..bfb8e8b0 100644 --- a/src/sort/mod.rs +++ b/src/sort/mod.rs @@ -1,3 +1,4 @@ +#![allow(dead_code)] // ShaderType derives emit unused check helpers use core::time::Duration; use std::marker::PhantomData; @@ -418,6 +419,7 @@ impl From<&SortedEntriesHandle> for AssetId { } +#[allow(dead_code)] #[derive( Clone, Copy, @@ -543,3 +545,4 @@ pub struct GpuSortedEntry { #[cfg(feature = "buffer_texture")] pub texture: Handle, } + diff --git a/src/sort/radix.rs b/src/sort/radix.rs index dc5e4115..20b2f437 100644 --- a/src/sort/radix.rs +++ b/src/sort/radix.rs @@ -495,7 +495,7 @@ where }; let group = render_device.create_bind_group( - format!("radix_sort_bind_group pass={} parity={}", pass_idx, parity).as_str(), + format!("radix_sort_bind_group pass={pass_idx} parity={parity}").as_str(), &radix_pipeline.radix_sort_layout, &[ // sorting_pass_index (u32) == pass_idx regardless of parity diff --git a/src/sort/radix.wgsl b/src/sort/radix.wgsl index 3b75a1a9..f0dca0bd 100644 --- a/src/sort/radix.wgsl +++ b/src/sort/radix.wgsl @@ -35,6 +35,7 @@ struct SortingGlobal { digit_histogram: array, #{RADIX_BASE}>, #{RADIX_DIGIT_PLACES}>, assignment_counter: atomic, + digit_tile_head: array, #{RADIX_BASE}>, } @group(3) @binding(0) var sorting_pass_index: u32; @@ -61,6 +62,9 @@ fn radix_reset( let b = local_id.x; let p = local_id.y; atomicStore(&sorting.digit_histogram[p][b], 0u); + if (p == 0u) { + atomicStore(&sorting.digit_tile_head[b], 0u); + } if (global_id.x == 0u && global_id.y == 0u) { atomicStore(&sorting.assignment_counter, 0u); draw_indirect.instance_count = 0u; @@ -123,7 +127,7 @@ var sorted_tile_entries: array; var local_digit_counts: array; var local_digit_offsets: array; var digit_global_base_ws: array; -var total_valid_in_tile_ws: u32; +var tile_entry_count_ws: u32; const INVALID_KEY: u32 = 0xFFFFFFFFu; @@ -140,31 +144,37 @@ fn radix_sort_c( let threads = #{WORKGROUP_INVOCATIONS_C}u; let global_entry_offset = workgroup_id.y * tile_size; + // Clear per-digit base cache so stale values are never reused across tiles. + if (tid < #{RADIX_BASE}u) { + digit_global_base_ws[tid] = 0u; + } + workgroupBarrier(); + // --- Step 1: Parallel load --- for (var i = tid; i < tile_size; i += threads) { let idx = global_entry_offset + i; if (idx < gaussian_uniforms.count) { tile_input_entries[i] = input_entries[idx]; } else { - tile_input_entries[i].key = INVALID_KEY; + tile_input_entries[i] = Entry(INVALID_KEY, INVALID_KEY); } } workgroupBarrier(); // --- Step 2: Serial, stable sort within the tile by a single thread --- - // This is the key change that guarantees stability by eliminating all race conditions. if (tid == 0u) { for (var i = 0u; i < #{RADIX_BASE}u; i+=1u) { local_digit_counts[i] = 0u; } - var valid_count = 0u; + var entries_in_tile = 0u; for (var i = 0u; i < tile_size; i+=1u) { - if (tile_input_entries[i].key != INVALID_KEY) { - let digit = (tile_input_entries[i].key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u); - local_digit_counts[digit] += 1u; - valid_count += 1u; - } + let entry = tile_input_entries[i]; + if (entry.value == INVALID_KEY) { continue; } // value sentinel marks padding + + let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u); + local_digit_counts[digit] += 1u; + entries_in_tile += 1u; } - total_valid_in_tile_ws = valid_count; + tile_entry_count_ws = entries_in_tile; var sum = 0u; for (var i = 0u; i < #{RADIX_BASE}u; i+=1u) { @@ -173,24 +183,33 @@ fn radix_sort_c( } for (var i = 0u; i < tile_size; i+=1u) { - if (tile_input_entries[i].key != INVALID_KEY) { - let entry = tile_input_entries[i]; - let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u); - let dest_idx = local_digit_offsets[digit]; - local_digit_offsets[digit] = dest_idx + 1u; - sorted_tile_entries[dest_idx] = entry; - } + let entry = tile_input_entries[i]; + if (entry.value == INVALID_KEY) { continue; } // value sentinel marks padding + + let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u); + let dest_idx = local_digit_offsets[digit]; + local_digit_offsets[digit] = dest_idx + 1u; + sorted_tile_entries[dest_idx] = entry; } } workgroupBarrier(); - // --- Step 3: Atomically determine the global base address for this tile --- - // This replaces the fragile spin-lock with a single, robust atomic operation per digit. + // --- Step 3: Determine deterministic global base for each digit --- if (tid < #{RADIX_BASE}u) { let count = local_digit_counts[tid]; - if (count > 0u) { - digit_global_base_ws[tid] = atomicAdd(&sorting.digit_histogram[sorting_pass_index][tid], count); + let tile_count = max((gaussian_uniforms.count + tile_size - 1u) / tile_size, 1u); + let expected = sorting_pass_index * tile_count + workgroup_id.y; + + loop { + let head = atomicLoad(&sorting.digit_tile_head[tid]); + if (head == expected) { + let exchange = atomicCompareExchangeWeak(&sorting.digit_tile_head[tid], expected, expected + 1u); + if (exchange.exchanged) { break; } + } } + + let base = atomicAdd(&sorting.digit_histogram[sorting_pass_index][tid], count); + digit_global_base_ws[tid] = base; } workgroupBarrier(); @@ -205,10 +224,10 @@ fn radix_sort_c( workgroupBarrier(); for (var i = tid; i < tile_size; i += threads) { - if (i < total_valid_in_tile_ws) { + if (i < tile_entry_count_ws) { let entry = sorted_tile_entries[i]; let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u); - + let bin_start_offset = local_digit_offsets[digit]; let rank_in_bin = i - bin_start_offset; let global_base = digit_global_base_ws[digit]; @@ -223,4 +242,4 @@ fn radix_sort_c( if (sorting_pass_index == #{RADIX_DIGIT_PLACES}u - 1u && tid == 0u) { atomicStore(&draw_indirect.instance_count, gaussian_uniforms.count); } -} \ No newline at end of file +}