Merged
5 changes: 5 additions & 0 deletions src/gaussian/f16.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::marker::Copy;

use half::f16;
@@ -28,6 +29,7 @@ use crate::gaussian::{
};


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -132,6 +134,7 @@ impl From<[u32; 4]> for RotationScaleOpacityPacked128 {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -201,6 +204,7 @@ impl From<[u32; 4]> for Covariance3dOpacityPacked128 {



+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -299,3 +303,4 @@ pub fn unpack_u32_to_f32s(value: u32) -> (f32, f32) {
let (upper, lower) = unpack_u32_to_f16s(value);
(upper.to_f32(), lower.to_f32())
}

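The `dead_code` allows here (and repeated in the files below) silence lints raised against the check helpers that the `ShaderType` derive generates, not against hand-written items. A minimal standalone sketch of the pattern — the type and field names are hypothetical, not from this crate:

#![allow(dead_code)] // derive-generated layout helpers may never be called

use bevy::render::render_resource::ShaderType;

#[allow(dead_code)]
#[derive(Clone, Copy, ShaderType)]
pub struct PackedExample {
    pub words: [u32; 4], // read only through the GPU-side layout checks
}
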
9 changes: 9 additions & 0 deletions src/gaussian/f32.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::marker::Copy;

use bevy::{
@@ -25,6 +26,7 @@ use crate::gaussian::{
pub type Position = [f32; 3];


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -58,6 +60,7 @@ impl From<[f32; 4]> for PositionTimestamp {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -99,6 +102,7 @@ impl From<[f32; 4]> for PositionVisibility {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -124,6 +128,7 @@ impl From<[f32; 4]> for Rotation {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -182,6 +187,7 @@ impl From<[f32; 8]> for IsotropicRotations {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -215,6 +221,7 @@ impl From<[f32; 4]> for ScaleOpacity {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -246,6 +253,7 @@ impl From<[f32; 4]> for TimestampTimescale {
}


+#[allow(dead_code)]
#[derive(
Clone,
Debug,
@@ -280,3 +288,4 @@ impl From<&Gaussian3d> for Covariance3dOpacity {
}
}
}

3 changes: 3 additions & 0 deletions src/material/spherical_harmonics.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::marker::Copy;

use bevy::{
@@ -97,6 +98,7 @@ pub const SH_VEC4_PLANES: usize = SH_COEFF_COUNT / 4;
// }


+#[allow(dead_code)]
#[derive(
Clone,
Copy,
@@ -240,3 +242,4 @@ where

d.deserialize_tuple(SH_COEFF_COUNT, CoefficientsVisitor)
}

3 changes: 3 additions & 0 deletions src/material/spherindrical_harmonics.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::marker::Copy;

use bevy::{
@@ -87,6 +88,7 @@ impl Plugin for SpherindricalHarmonicCoefficientsPlugin {
// pub coefficients: [[u32; POD_ARRAY_SIZE]; POD_PLANE_COUNT],
// }

+#[allow(dead_code)]
#[derive(
Clone,
Copy,
@@ -252,3 +254,4 @@ where

d.deserialize_tuple(SH_4D_COEFF_COUNT, CoefficientsVisitor)
}

7 changes: 5 additions & 2 deletions src/render/mod.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use std::{
hash::Hash,
num::NonZero,
@@ -614,7 +615,7 @@ impl Default for ShaderDefines {
let workgroup_entries_a = workgroup_invocations_a * entries_per_invocation_a;
let workgroup_entries_c = workgroup_invocations_c * entries_per_invocation_c;
let sorting_buffer_size = radix_base * radix_digit_places *
-std::mem::size_of::<u32>() as u32 + 5 * std::mem::size_of::<u32>() as u32;
+std::mem::size_of::<u32>() as u32 + (5 + radix_base) * std::mem::size_of::<u32>() as u32;

Self {
radix_bits_per_digit,
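
The extra `radix_base` words reserve space for the `digit_tile_head` array added to `SortingGlobal` in radix.wgsl below, on top of the five existing status words. A rough sketch of the layout arithmetic implied by the expression above (plain Rust, variable names as in this function):

// Histogram: one u32 per digit value, per digit place.
let histogram_bytes = radix_base * radix_digit_places * std::mem::size_of::<u32>() as u32;
// Status block: 5 bookkeeping words (assignment counter and friends),
// plus one tile-head cursor per digit value.
let status_bytes = (5 + radix_base) * std::mem::size_of::<u32>() as u32;
let sorting_buffer_size = histogram_bytes + status_bytes;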
@@ -837,6 +838,7 @@ type DrawGaussians<R: PlanarSync> = (
);


+#[allow(dead_code)]
#[derive(Component, ShaderType, Clone, Copy)]
pub struct CloudUniform {
pub transform: Mat4,
@@ -1131,6 +1133,7 @@ pub fn queue_gaussian_view_bind_groups<R: PlanarSync>(
}

// Prepare the compute view bind group using the compute_view_layout (for compute pipelines)
+#[allow(clippy::too_many_arguments)]
pub fn queue_gaussian_compute_view_bind_groups<R: PlanarSync>(
mut commands: Commands,
render_device: Res<RenderDevice>,
@@ -1149,7 +1152,6 @@ pub fn queue_gaussian_compute_view_bind_groups<R: PlanarSync>(
globals_buffer: Res<GlobalsBuffer>,
)
where
-R: PlanarSync,
R::GpuPlanarType: GpuPlanarStorage,
{
if let (
@@ -1396,3 +1398,4 @@ where
RenderCommandResult::Success
}
}

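Two small cleanups above: the removed `R: PlanarSync` where-clause bound on `queue_gaussian_compute_view_bind_groups` was already implied by the `<R: PlanarSync>` generic parameter, and the new `clippy::too_many_arguments` allow accepts the system's long parameter list instead of restructuring it.
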
3 changes: 3 additions & 0 deletions src/sort/mod.rs
@@ -1,3 +1,4 @@
+#![allow(dead_code)] // ShaderType derives emit unused check helpers
use core::time::Duration;
use std::marker::PhantomData;

@@ -418,6 +419,7 @@ impl From<&SortedEntriesHandle> for AssetId<SortedEntries> {
}


+#[allow(dead_code)]
#[derive(
Clone,
Copy,
@@ -543,3 +545,4 @@ pub struct GpuSortedEntry {
#[cfg(feature = "buffer_texture")]
pub texture: Handle<Image>,
}

2 changes: 1 addition & 1 deletion src/sort/radix.rs
@@ -495,7 +495,7 @@
};

let group = render_device.create_bind_group(
format!("radix_sort_bind_group pass={} parity={}", pass_idx, parity).as_str(),
format!("radix_sort_bind_group pass={pass_idx} parity={parity}").as_str(),
&radix_pipeline.radix_sort_layout,
&[
// sorting_pass_index (u32) == pass_idx regardless of parity
67 changes: 43 additions & 24 deletions src/sort/radix.wgsl
@@ -35,6 +35,7 @@
struct SortingGlobal {
digit_histogram: array<array<atomic<u32>, #{RADIX_BASE}>, #{RADIX_DIGIT_PLACES}>,
assignment_counter: atomic<u32>,
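+// One cursor per digit value: records which tile may publish its counts
+// next, so tiles claim global histogram ranges in tile order.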
+digit_tile_head: array<atomic<u32>, #{RADIX_BASE}>,
}

@group(3) @binding(0) var<uniform> sorting_pass_index: u32;
@@ -61,6 +62,9 @@ fn radix_reset(
let b = local_id.x;
let p = local_id.y;
atomicStore(&sorting.digit_histogram[p][b], 0u);
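+// Clear the tile cursors alongside the histogram; only one row of
+// invocations (p == 0u) needs to touch the 1-D cursor array.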
+if (p == 0u) {
+atomicStore(&sorting.digit_tile_head[b], 0u);
+}
if (global_id.x == 0u && global_id.y == 0u) {
atomicStore(&sorting.assignment_counter, 0u);
draw_indirect.instance_count = 0u;
@@ -123,7 +127,7 @@ var<workgroup> sorted_tile_entries: array<Entry, #{WORKGROUP_ENTRIES_C}>;
var<workgroup> local_digit_counts: array<u32, #{RADIX_BASE}>;
var<workgroup> local_digit_offsets: array<u32, #{RADIX_BASE}>;
var<workgroup> digit_global_base_ws: array<u32, #{RADIX_BASE}>;
-var<workgroup> total_valid_in_tile_ws: u32;
+var<workgroup> tile_entry_count_ws: u32;
const INVALID_KEY: u32 = 0xFFFFFFFFu;


@@ -140,31 +144,37 @@ fn radix_sort_c(
let threads = #{WORKGROUP_INVOCATIONS_C}u;
let global_entry_offset = workgroup_id.y * tile_size;

+// Clear per-digit base cache so stale values are never reused across tiles.
+if (tid < #{RADIX_BASE}u) {
+digit_global_base_ws[tid] = 0u;
+}
+workgroupBarrier();

// --- Step 1: Parallel load ---
for (var i = tid; i < tile_size; i += threads) {
let idx = global_entry_offset + i;
if (idx < gaussian_uniforms.count) {
tile_input_entries[i] = input_entries[idx];
} else {
-tile_input_entries[i].key = INVALID_KEY;
+tile_input_entries[i] = Entry(INVALID_KEY, INVALID_KEY);
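+// Padding is flagged via the value field: real values are input indices
+// below gaussian_uniforms.count, while a key can legitimately be 0xFFFFFFFFu.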
}
}
workgroupBarrier();

// --- Step 2: Serial, stable sort within the tile by a single thread ---
// This is the key change that guarantees stability by eliminating all race conditions.
if (tid == 0u) {
for (var i = 0u; i < #{RADIX_BASE}u; i+=1u) { local_digit_counts[i] = 0u; }

-var valid_count = 0u;
+var entries_in_tile = 0u;
for (var i = 0u; i < tile_size; i+=1u) {
-if (tile_input_entries[i].key != INVALID_KEY) {
-let digit = (tile_input_entries[i].key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);
-local_digit_counts[digit] += 1u;
-valid_count += 1u;
-}
+let entry = tile_input_entries[i];
+if (entry.value == INVALID_KEY) { continue; } // value sentinel marks padding
+
+let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);
+local_digit_counts[digit] += 1u;
+entries_in_tile += 1u;
}
-total_valid_in_tile_ws = valid_count;
+tile_entry_count_ws = entries_in_tile;

var sum = 0u;
for (var i = 0u; i < #{RADIX_BASE}u; i+=1u) {
Expand All @@ -173,24 +183,33 @@ fn radix_sort_c(
}

for (var i = 0u; i < tile_size; i+=1u) {
-if (tile_input_entries[i].key != INVALID_KEY) {
-let entry = tile_input_entries[i];
-let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);
-let dest_idx = local_digit_offsets[digit];
-local_digit_offsets[digit] = dest_idx + 1u;
-sorted_tile_entries[dest_idx] = entry;
-}
+let entry = tile_input_entries[i];
+if (entry.value == INVALID_KEY) { continue; } // value sentinel marks padding
+
+let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);
+let dest_idx = local_digit_offsets[digit];
+local_digit_offsets[digit] = dest_idx + 1u;
+sorted_tile_entries[dest_idx] = entry;
}
}
workgroupBarrier();

-// --- Step 3: Atomically determine the global base address for this tile ---
-// This replaces the fragile spin-lock with a single, robust atomic operation per digit.
+// --- Step 3: Determine deterministic global base for each digit ---
if (tid < #{RADIX_BASE}u) {
let count = local_digit_counts[tid];
if (count > 0u) {
-digit_global_base_ws[tid] = atomicAdd(&sorting.digit_histogram[sorting_pass_index][tid], count);
+let tile_count = max((gaussian_uniforms.count + tile_size - 1u) / tile_size, 1u);
+let expected = sorting_pass_index * tile_count + workgroup_id.y;

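+// Spin until it is this tile's turn for this digit: heads advance one
+// tile at a time, so bin base addresses are claimed in ascending tile
+// order and the sort stays stable across tiles.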
+loop {
+let head = atomicLoad(&sorting.digit_tile_head[tid]);
+if (head == expected) {
+let exchange = atomicCompareExchangeWeak(&sorting.digit_tile_head[tid], expected, expected + 1u);
+if (exchange.exchanged) { break; }
+}
+}
+
+let base = atomicAdd(&sorting.digit_histogram[sorting_pass_index][tid], count);
+digit_global_base_ws[tid] = base;
}
workgroupBarrier();

@@ -205,10 +224,10 @@
workgroupBarrier();

for (var i = tid; i < tile_size; i += threads) {
-if (i < total_valid_in_tile_ws) {
+if (i < tile_entry_count_ws) {
let entry = sorted_tile_entries[i];
let digit = (entry.key >> (sorting_pass_index * #{RADIX_BITS_PER_DIGIT}u)) & (#{RADIX_BASE}u - 1u);

let bin_start_offset = local_digit_offsets[digit];
let rank_in_bin = i - bin_start_offset;
let global_base = digit_global_base_ws[digit];
@@ -223,4 +242,4 @@
if (sorting_pass_index == #{RADIX_DIGIT_PLACES}u - 1u && tid == 0u) {
atomicStore(&draw_indirect.instance_count, gaussian_uniforms.count);
}
}
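
A note on the tile-head handshake above: `radix_reset` clears `digit_tile_head` once per sort, not between digit passes, which is why `expected` is offset by `sorting_pass_index * tile_count`. With four tiles, for example, a digit's head must read 0 through 3 for tiles to publish during pass 0, then 4 through 7 during pass 1; each successful `atomicCompareExchangeWeak` advances the head and hands the digit to the next tile.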