Merged
5 changes: 5 additions & 0 deletions src/gaussian/f16.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::marker::Copy;

use half::f16;
@@ -28,6 +29,7 @@ use crate::gaussian::{
};


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -132,6 +134,7 @@ impl From<[u32; 4]> for RotationScaleOpacityPacked128 {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -201,6 +204,7 @@ impl From<[u32; 4]> for Covariance3dOpacityPacked128 {



+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -299,3 +303,4 @@ pub fn unpack_u32_to_f32s(value: u32) -> (f32, f32) {
let (upper, lower) = unpack_u32_to_f16s(value);
(upper.to_f32(), lower.to_f32())
}

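The `dead_code` allows here (and repeated in the files below) silence lints raised against the check helpers that the `ShaderType` derive generates, not against hand-written items. A minimal standalone sketch of the pattern — the type and field names are hypothetical, not from this crate:

#![allow(dead_code)] // derive-generated layout helpers may never be called

use bevy::render::render_resource::ShaderType;

#[allow(dead_code)]
#[derive(Clone, Copy, ShaderType)]
pub struct PackedExample {
    pub words: [u32; 4], // read only through the GPU-side layout checks
}
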
9 changes: 9 additions & 0 deletions src/gaussian/f32.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::marker::Copy;

use bevy::{
@@ -25,6 +26,7 @@ use crate::gaussian::{
pub type Position = [f32; 3];


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -58,6 +60,7 @@ impl From<[f32; 4]> for PositionTimestamp {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -99,6 +102,7 @@ impl From<[f32; 4]> for PositionVisibility {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -124,6 +128,7 @@ impl From<[f32; 4]> for Rotation {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -182,6 +187,7 @@ impl From<[f32; 8]> for IsotropicRotations {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -215,6 +221,7 @@ impl From<[f32; 4]> for ScaleOpacity {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -246,6 +253,7 @@ impl From<[f32; 4]> for TimestampTimescale {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -280,3 +288,4 @@ impl From<&Gaussian3d> for Covariance3dOpacity {
}
}
}

3 changes: 3 additions & 0 deletions src/material/spherical_harmonics.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::marker::Copy;

use bevy::{
@@ -97,6 +98,7 @@ pub const SH_VEC4_PLANES: usize = SH_COEFF_COUNT / 4;
// }


+#[allow(dead_code)]
#[derive(
Clone,
Copy,
@@ -240,3 +242,4 @@ where

d.deserialize_tuple(SH_COEFF_COUNT, CoefficientsVisitor)
}

3 changes: 3 additions & 0 deletions src/material/spherindrical_harmonics.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::marker::Copy;

use bevy::{
@@ -87,6 +88,7 @@ impl Plugin for SpherindricalHarmonicCoefficientsPlugin {
// pub coefficients: [[u32; POD_ARRAY_SIZE]; POD_PLANE_COUNT],
// }

+#[allow(dead_code)]
#[derive(
Clone,
Copy,
@@ -252,3 +254,4 @@ where

d.deserialize_tuple(SH_4D_COEFF_COUNT, CoefficientsVisitor)
}

7 changes: 5 additions & 2 deletions src/render/mod.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::{
hash::Hash,
num::NonZero,
@@ -614,7 +615,7 @@ impl Default for ShaderDefines {
let workgroup_entries_a = workgroup_invocations_a * entries_per_invocation_a;
let workgroup_entries_c = workgroup_invocations_c * entries_per_invocation_c;
let sorting_buffer_size = radix_base * radix_digit_places *
-std::mem::size_of::<u32>() as u32 + 5 * std::mem::size_of::<u32>() as u32;
+std::mem::size_of::<u32>() as u32 + (5 + radix_base) * std::mem::size_of::<u32>() as u32;

Self {
radix_bits_per_digit,
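
The extra `radix_base` words reserve space for the `digit_tile_head` array added to `SortingGlobal` in radix.wgsl below, on top of the five existing status words. A rough sketch of the layout arithmetic implied by the expression above (plain Rust, variable names as in this function):

// Histogram: one u32 per digit value, per digit place.
let histogram_bytes = radix_base * radix_digit_places * std::mem::size_of::<u32>() as u32;
// Status block: 5 bookkeeping words (assignment counter and friends),
// plus one tile-head cursor per digit value.
let status_bytes = (5 + radix_base) * std::mem::size_of::<u32>() as u32;
let sorting_buffer_size = histogram_bytes + status_bytes;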
@@ -837,6 +838,7 @@ type DrawGaussians<R: PlanarSync> = (
);


+#[allow(dead_code)]
#[derive(Component, ShaderType, Clone, Copy)]
pub struct CloudUniform {
pub transform: Mat4,
@@ -1131,6 +1133,7 @@ pub fn queue_gaussian_view_bind_groups<R: PlanarSync>(
}

// Prepare the compute view bind group using the compute_view_layout (for compute pipelines)
+#[allow(clippy::too_many_arguments)]
pub fn queue_gaussian_compute_view_bind_groups<R: PlanarSync>(
mut commands: Commands,
render_device: Res<RenderDevice>,
@@ -1149,7 +1152,6 @@ pub fn queue_gaussian_compute_view_bind_groups<R: PlanarSync>(
globals_buffer: Res<GlobalsBuffer>,
)
where
-R: PlanarSync,
R::GpuPlanarType: GpuPlanarStorage,
{
if let (
@@ -1396,3 +1398,4 @@ where
RenderCommandResult::Success
}
}

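Two small cleanups above: the removed `R: PlanarSync` where-clause bound on `queue_gaussian_compute_view_bind_groups` was already implied by the `<R: PlanarSync>` generic parameter, and the new `clippy::too_many_arguments` allow accepts the system's long parameter list instead of restructuring it.
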
3 changes: 3 additions & 0 deletions src/sort/mod.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use core::time::Duration;
use std::marker::PhantomData;

@@ -418,6 +419,7 @@ impl From<&SortedEntriesHandle> for AssetId<SortedEntries> {
}


+#[allow(dead_code)]
#[derive(
Clone,
Copy,
@@ -543,3 +545,4 @@ pub struct GpuSortedEntry {
#[cfg(feature = "buffer_texture")]
pub texture: Handle<Image>,
}

2 changes: 1 addition & 1 deletion src/sort/radix.rs
@@ -495,7 +495,7 @@
};

let group = render_device.create_bind_group(
format!("radix_sort_bind_group pass={} parity={}", pass_idx, parity).as_str(),
format!("radix_sort_bind_group pass={pass_idx} parity={parity}").as_str(),
&radix_pipeline.radix_sort_layout,
&[
// sorting_pass_index (u32) == pass_idx regardless of parity
67 changes: 43 additions & 24 deletions src/sort/radix.wgsl
@@ -35,6 +35,7 @@
struct SortingGlobal {
digit_histogram: array<array<atomic<u32>, #{RADIX_BASE}>, #{RADIX_DIGIT_PLACES}>,
assignment_counter: atomic<u32>,
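+// One cursor per digit value: records which tile may publish its counts
+// next, so tiles claim global histogram ranges in tile order.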
+digit_tile_head: array<atomic<u32>, #{RADIX_BASE}>,
}

@group(3) @binding(0) var<uniform> sorting_pass_index: u32;
@@ -61,6 +62,9 @@ fn radix_reset(
let b = local_id.x;
let p = local_id.y;
atomicStore(&sorting.digit_histogram[p][b], 0u);
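+// Clear the tile cursors alongside the histogram; only one row of
+// invocations (p == 0u) needs to touch the 1-D cursor array.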
+if (p == 0u) {
+atomicStore(&sorting.digit_tile_head[b], 0u);
+}
if (global_id.x == 0u && global_id.y == 0u) {
atomicStore(&sorting.assignment_counter, 0u);
draw_indirect.instance_count = 0u;
@@ -123,7 +127,7 @@ var<workgroup> sorted_tile_entries: array<Entry, #{WORKGROUP_ENTRIES_C}>;
var<workgroup> local_digit_counts: array<u32, #{RADIX_BASE}>;
var<workgroup> local_digit_offsets: array<u32, #{RADIX_BASE}>;
var<workgroup> digit_global_base_ws: array<u32, #{RADIX_BASE}>;
-var<workgroup> total_valid_in_tile_ws: u32;
+var<workgroup> tile_entry_count_ws: u32;
const INVALID_KEY: u32 = 0xFFFFFFFFu;


@@ -140,31 +144,37 @@ fn radix_sort_c(
let threads = #{WORKGROUP_INVOCATIONS_C}u;
let global_entry_offset = workgroup_id.y * tile_size;

+// Clear per-digit base cache so stale values are never reused across tiles.
+if (tid < #{RADIX_BASE}u) {
+digit_global_base_ws[tid] = 0u;
+}
+workgroupBarrier();

// --- Step 1: Parallel load ---
for (var i = tid; i < tile_size; i += threads) {
let idx = global_entry_offset + i;
if (idx < gaussian_uniforms.count) {
tile_input_entries[i] = input_entries[idx];
} else {
-tile_input_entries[i].key = INVALID_KEY;
+tile_input_entries[i] = Entry(INVALID_KEY, INVALID_KEY);
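+// Padding is flagged via the value field: real values are input indices
+// below gaussian_uniforms.count, while a key can legitimately be 0xFFFFFFFFu.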
}
}
workgroupBarrier();

// --- Step 2: Serial, stable sort within the tile by a single thread ---
// This is the key change that guarantees stability by eliminating all race conditions.
if (tid == 0u) {
for (var i = 0u; i < #{RADIX_BASE}u; i+=1u) { local_digit_counts[i] = 0u; }

-var valid_count = 0u;
+var entries_in_tile = 0u;
for (var i = 0u; i < tile_size; i+=1u) {
-if (tile_input_entries[i].key != INVALID_KEY) {
-let digit = (tile_input_entries[i].key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);
-local_digit_counts[digit] += 1u;
-valid_count += 1u;
-}
+let entry = tile_input_entries[i];
+if (entry.value == INVALID_KEY) { continue; } // value sentinel marks padding
+
+let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);
+local_digit_counts[digit] += 1u;
+entries_in_tile += 1u;
}
-total_valid_in_tile_ws = valid_count;
+tile_entry_count_ws = entries_in_tile;

var sum = 0u;
for (var i = 0u; i < #{RADIX_BASE}u; i+=1u) {
Expand All @@ -173,24 +183,33 @@ fn radix_sort_c(
}

for (var i = 0u; i < tile_size; i+=1u) {
-if (tile_input_entries[i].key != INVALID_KEY) {
-let entry = tile_input_entries[i];
-let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);
-let dest_idx = local_digit_offsets[digit];
-local_digit_offsets[digit] = dest_idx + 1u;
-sorted_tile_entries[dest_idx] = entry;
-}
+let entry = tile_input_entries[i];
+if (entry.value == INVALID_KEY) { continue; } // value sentinel marks padding
+
+let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);
+let dest_idx = local_digit_offsets[digit];
+local_digit_offsets[digit] = dest_idx + 1u;
+sorted_tile_entries[dest_idx] = entry;
}
}
workgroupBarrier();

-// --- Step 3: Atomically determine the global base address for this tile ---
-// This replaces the fragile spin-lock with a single, robust atomic operation per digit.
+// --- Step 3: Determine deterministic global base for each digit ---
if (tid < #{RADIX_BASE}u) {
let count = local_digit_counts[tid];
if (count > 0u) {
-digit_global_base_ws[tid] = atomicAdd(&sorting.digit_histogram[sorting_pass_index][tid], count);
+let tile_count = max((gaussian_uniforms.count + tile_size - 1u) / tile_size, 1u);
+let expected = sorting_pass_index * tile_count + workgroup_id.y;

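+// Spin until it is this tile's turn for this digit: heads advance one
+// tile at a time, so bin base addresses are claimed in ascending tile
+// order and the sort stays stable across tiles.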
+loop {
+let head = atomicLoad(&sorting.digit_tile_head[tid]);
+if (head == expected) {
+let exchange = atomicCompareExchangeWeak(&sorting.digit_tile_head[tid], expected, expected + 1u);
+if (exchange.exchanged) { break; }
+}
+}
+
+let base = atomicAdd(&sorting.digit_histogram[sorting_pass_index][tid], count);
+digit_global_base_ws[tid] = base;
}
workgroupBarrier();

@@ -205,10 +224,10 @@
workgroupBarrier();

for (var i = tid; i < tile_size; i += threads) {
-if (i < total_valid_in_tile_ws) {
+if (i < tile_entry_count_ws) {
let entry = sorted_tile_entries[i];
let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);

let bin_start_offset = local_digit_offsets[digit];
let rank_in_bin = i - bin_start_offset;
let global_base = digit_global_base_ws[digit];
@@ -223,4 +242,4 @@
if (sorting_pass_index == #{RADIX_DIGIT_PLACES}u - 1u && tid == 0u) {
atomicStore(&draw_indirect.instance_count, gaussian_uniforms.count);
}
}
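
A note on the tile-head handshake above: `radix_reset` clears `digit_tile_head` once per sort, not between digit passes, which is why `expected` is offset by `sorting_pass_index * tile_count`. With four tiles, for example, a digit's head must read 0 through 3 for tiles to publish during pass 0, then 4 through 7 during pass 1; each successful `atomicCompareExchangeWeak` advances the head and hands the digit to the next tile.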