这是indexloc提供的服务,不要输入任何密码
Skip to content

Transitioning from a `nvexec::stream_context` to another seg faults #1563

@romintomasetti

Description

@romintomasetti

I'm using c9d272a.

I'm trying to transition from one nvexec::stream_context to another, but it seg faults.

Here is the reproducer:

// Reproducer: build a sender pipeline that starts on one nvexec stream context,
// transitions to a second one via continues_on, and then waits for the result.
TEST(try, reproducer)
{
    // Two separate stream contexts: the pipeline is scheduled on the first and
    // then moved onto the second.
    ::nvexec::stream_context stream_ctx_0{};
    ::nvexec::stream_context stream_ctx_1{};

    // Step 1 (scheduled via stream_ctx_0): yields 1 if is_on_gpu() reports true, else 0.
    auto snd = ::stdexec::schedule(stream_ctx_0.get_scheduler())
        | ::stdexec::then([=] () -> int {
            if (::nvexec::is_on_gpu())
                return 1;
            else
                return 0;
        })
        // Transition to the scheduler of the *other* context. Per the report,
        // passing stream_ctx_0.get_scheduler() here instead works fine.
        | ::stdexec::continues_on(stream_ctx_1.get_scheduler())
        // Step 2: yields 2 only if is_on_gpu() reports true here as well AND
        // step 1 produced 1; otherwise yields 0.
        | ::stdexec::then([=](const int val) -> int {
            if (::nvexec::is_on_gpu() && val == 1)
                return 2;
            else
                return 0;
        });

    // Run the pipeline to completion and unpack its single int result.
    const auto [result] = ::stdexec::sync_wait(std::move(snd)).value();

    // 2 is only reachable when both `then` steps observed is_on_gpu() == true.
    ASSERT_EQ(result, 2);
}

Note that if I change to ::stdexec::continues_on(stream_ctx_0.get_scheduler()) (i.e. transitioning from stream_ctx_0 to itself), it works fine.

Here is the backtrace I get from cuda-gdb:

[----------] 1 test from try
[ RUN      ] try.reproducer
[New Thread 0x7fff995fe000 (LWP 33576)]
[New Thread 0x7fff98dfd000 (LWP 33577)]
[New Thread 0x7fff79fff000 (LWP 33578)]
[Thread 0x7fff98dfd000 (LWP 33577) exited]

Thread 21 "tests_nvexec_ad" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fff995fe000 (LWP 33576)]
0x00007ffff274276b in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
(cuda-gdb) bt
#0  0x00007ffff274276b in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#1  0x00007ffff284214e in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#2  0x00007ffff27f97c9 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#3  0x00007ffff27df6f0 in cuMemFreeAsync () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#4  0x00007ffff2016540 in ?? () from /usr/local/cuda/lib64/libcudart.so.12
#5  0x00007ffff207e05f in cudaFreeAsync () from /usr/local/cuda/lib64/libcudart.so.12
#6  0x000055555556ecb1 in nvexec::_strm::continuation_task_t<nvexec::_strm::_continues_on::operation_state_t<nvexec::_strm::then_sender_t<nvexec::_strm::stream_scheduler::sender_t, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda()#1}>, nvexec::_strm::_sched_from::receiver_t<nvexec::_strm::continues_on_sender_t<nvexec::_strm::stream_scheduler, nvexec::_strm::then_sender_t<nvexec::_strm::stream_scheduler::sender_t, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda()#1}> >, nvexec::_strm::_then::receiver_t<4ul, nvexec::_strm::propagate_receiver_t<nvexec::_strm::_sync_wait::receiver_t<nvexec::_strm::then_sender_t<nvexec::_strm::schedule_from_sender_t<nvexec::_strm::stream_scheduler, nvexec::_strm::continues_on_sender_t<nvexec::_strm::stream_scheduler, nvexec::_strm::then_sender_t<nvexec::_strm::stream_scheduler::sender_t, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda()#1}> > >, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda(int)#1}> > >, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda(int)#1}> > >::__t::receiver_t, nvexec::variant_t<cuda::std::__4::tuple<nvexec::_strm::set_noop>, cuda::std::__4::tuple<stdexec::__rcvrs::set_value_t, int>, cuda::std::__4::tuple<stdexec::__rcvrs::set_error_t, cudaError>, cuda::std::__4::tuple<stdexec::__rcvrs::set_error_t, std::__exception_ptr::exception_ptr> > >::continuation_task_t(nvexec::_strm::_continues_on::operation_state_t<nvexec::_strm::then_sender_t<nvexec::_strm::stream_scheduler::sender_t, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda()#1}>, nvexec::_strm::_sched_from::receiver_t<nvexec::_strm::continues_on_sender_t<nvexec::_strm::stream_scheduler, nvexec::_strm::then_sender_t<nvexec::_strm::stream_scheduler::sender_t, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda()#1}> >, nvexec::_strm::_then::receiver_t<4ul, 
nvexec::_strm::propagate_receiver_t<nvexec::_strm::_sync_wait::receiver_t<nvexec::_strm::then_sender_t<nvexec::_strm::schedule_from_sender_t<nvexec::_strm::stream_scheduler, nvexec::_strm::continues_on_sender_t<nvexec::_strm::stream_scheduler, nvexec::_strm::then_sender_t<nvexec::_strm::stream_scheduler::sender_t, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda()#1}> > >, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda(int)#1}> > >, tests::nvexec::adaptors::try_reproducer_Test::TestBody()::{lambda(int)#1}> > >::__t::receiver_t, nvexec::variant_t<cuda::std::__4::tuple<nvexec::_strm::set_noop>, cuda::std::__4::tuple<stdexec::__rcvrs::set_value_t, int>, cuda::std::__4::tuple<stdexec::__rcvrs::set_error_t, cudaError>, cuda::std::__4::tuple<stdexec::__rcvrs::set_error_t, std::__exception_ptr::exception_ptr> >*, CUstream_st*, std::pmr::memory_resource*)::{lambda(nvexec::_strm::queue::task_base_t*)#2}::__invoke(nvexec::_strm::queue::task_base_t*) ()

I am on an AMPERE86 GPU with CUDA 12.8, compiling with:

Ubuntu clang version 20.1.2 (++20250331083337+bc65196c0919-1exp120250331203353.96)

Any ideas? I am assuming that my code is legitimate, but I might be wrong. Thanks!

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions