[BUG] `cute::_` and `_` in a C++ structured binding used together cause runtime issues.

### Which component has the problem?

CUTLASS C++

### Bug Report

**Describe the bug**
Using `cute::_` together with `_` as a name in a C++ structured binding causes runtime issues.

**Steps/Code to reproduce bug**
```cpp
// Compile-time settings for the reproducer: tile extents, stage count, the
// global-to-shared copy atom, the tiled copies for A and B, and the block size.
struct Config{
    // Tile extents along M, N and K, plus the stage count.
    static constexpr int TileM = 64;
    static constexpr int TileN = 128;
    static constexpr int TileK = 32;
    static constexpr int Stage = 2;

    // Copy atom built from SM80_CP_ASYNC_CACHEGLOBAL over cute::uint128_t,
    // used here for float elements.
    using g2s_op     = SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>;
    using g2s_traits = Copy_Traits<g2s_op>;
    using g2s_atom   = Copy_Atom<g2s_traits, float>;

    // A:(64, 32) -- 64x8 thread layout with strides (8, 1), 1x4 value layout.
    using G2SCopyA = decltype(make_tiled_copy(
        g2s_atom{},
        Layout<Shape<Int<64>, Int<8>>, Stride<Int<8>, Int<1>>>{},
        Layout<Shape<Int<1>, Int<4>>>{}));

    // B:(32, 128) -- 16x32 thread layout with strides (32, 1), 1x4 value layout.
    using G2SCopyB = decltype(make_tiled_copy(
        g2s_atom{},
        Layout<Shape<Int<16>, Int<32>>, Stride<Int<32>, Int<1>>>{},
        Layout<Shape<Int<1>, Int<4>>>{}));

    // Threads per block; equals 64*8 and 16*32, the sizes of both thread layouts.
    static constexpr int ThreadNum = 512;
};


// Reproducer kernel: selects tiles of A, B and C using the block indices and
// prints the results with PRINT (a macro defined outside this snippet).
//
// A, B, C: tensors passed by value; launch_gemm builds them with tiled_divide.
template<typename Config, typename TensorA, typename TensorB, typename TensorC>
__global__ void gemm_float(TensorA A, TensorB B, TensorC C){
    // Alternative form the reporter also tried (kept for reference):
    // auto [bx, by, bz] = blockIdx; (void)bz;
    // NOTE(review): before C++26, `_` in a structured binding is an ordinary
    // identifier, so this line declares a local variable named `_` bound to the
    // third component of blockIdx. For the rest of this function that local
    // hides any outer `_` (presumably `cute::_`, made visible by a
    // using-directive that is not shown in this snippet -- confirm).
    auto [bx, by, _] = blockIdx;
    // Because of that shadowing, every `_` below names the local integer, not
    // the CuTe slice placeholder, so these calls index with integers in all
    // modes instead of slicing. That is consistent with the reported output,
    // where each PRINT shows a single float rather than a tensor.
    auto gA = A(_, by, _);
    auto gB = B(_, _, bx);
    auto gC = C(_, by, bx);  

    PRINT(gA);
    PRINT(gB);
    PRINT(gC);
}

// Host launcher for the reproducer.
//
// Wraps the raw pointers as global-memory tensors with static extents and
// LayoutRight ordering (A: MxK, B: KxN, C: MxN), splits each one into
// Config-sized tiles with tiled_divide, launches gemm_float on a
// (ceil(N/TileN), ceil(M/TileM), 1) grid with Config::ThreadNum threads per
// block, then checks for launch errors and waits on stream 0.
template<typename Config, int M, int N, int K>
void launch_gemm(const float* Aptr,
                 const float* Bptr,
                 float* Cptr){
    constexpr int TileM = Config::TileM;
    constexpr int TileN = Config::TileN;
    constexpr int TileK = Config::TileK;

    // Untiled views over the three buffers.
    auto fullA = make_tensor(
        make_gmem_ptr(Aptr),
        make_layout(make_shape(Int<M>{}, Int<K>{}), LayoutRight{}));
    auto fullB = make_tensor(
        make_gmem_ptr(Bptr),
        make_layout(make_shape(Int<K>{}, Int<N>{}), LayoutRight{}));
    auto fullC = make_tensor(
        make_gmem_ptr(Cptr),
        make_layout(make_shape(Int<M>{}, Int<N>{}), LayoutRight{}));

    // Tiled views: A by (TileM, TileK), B by (TileK, TileN), C by (TileM, TileN).
    auto tiledA = tiled_divide(fullA, make_shape(Int<TileM>{}, Int<TileK>{}));
    auto tiledB = tiled_divide(fullB, make_shape(Int<TileK>{}, Int<TileN>{}));
    auto tiledC = tiled_divide(fullC, make_shape(Int<TileM>{}, Int<TileN>{}));

    // Launch configuration: x spans the N tiles, y spans the M tiles.
    const dim3 block(Config::ThreadNum, 1, 1);
    const dim3 grid(CEILDIV(N, TileN), CEILDIV(M, TileM), 1);

    gemm_float<Config, decltype(tiledA), decltype(tiledB), decltype(tiledC)>
        <<<grid, block>>>(tiledA, tiledB, tiledC);

    // Report launch-configuration errors first, then wait for the kernel so
    // that execution errors and device-side output are flushed.
    cudaCheckError(cudaGetLastError());
    cudaCheckError(cudaStreamSynchronize(0));
}
```

**Environment details (please complete the following information):**
 - cutlass 4
 - cuda 12.9

**Additional context**
output:
```txt
gA 0.000000
gB 1.000000
gC 0.000000
```

**Other problems**
  If I replace `auto [bx, by, _] = blockIdx;` with `auto [bx, by, bz] = blockIdx; (void)bz;`, a new problem appears:
  https://github.com/NVIDIA/cutlass/issues/2499

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[BUG] `cute::_` and `_` in a C++ structured binding used together cause runtime issues. #2498

Which component has the problem?

Bug Report

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[BUG] `cute::_` and `_` in a C++ structured binding used together cause runtime issues. #2498

Description

Which component has the problem?

Bug Report

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

[BUG] `cute::_` and `_` in a C++ structured binding used together cause runtime issues. #2498