这是indexloc提供的服务,不要输入任何密码
Skip to content

[BUG] cute::_ and _ in a C++ structured binding used together cause runtime issues. #2498

@happyflathead

Description

@happyflathead

Which component has the problem?

CUTLASS C++

Bug Report

Describe the bug
cute::_ and _ in a C++ structured binding used together cause runtime issues.

Steps/Code to reproduce bug

// Compile-time configuration for the reproduction kernel: tile extents, a
// stage count, the copy atom, and one tiled-copy type each for the A and B
// operands.
struct Config{
    // Tile extents along M, N and K; launch_gemm uses them to tile the tensors.
    constexpr static int TileM=64;
    constexpr static int TileN=128;
    constexpr static int TileK=32;
    // Stage count; not referenced anywhere else in this snippet.
    constexpr static int Stage=2;

    // Copy operation / traits / atom. The atom moves `float` data using a
    // 128-bit operation type, i.e. four 32-bit floats per copy.
    // NOTE(review): the "g2s" prefix and the op name suggest an SM80 cp.async
    // global-to-shared copy -- confirm against the CuTe copy-atom headers.
    using g2s_op = SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>;
    using g2s_traits = Copy_Traits<g2s_op>;
    using g2s_atom = Copy_Atom<g2s_traits, float>;
    // Tiled copy for A. First layout: shape (64, 8) with stride (8, 1), i.e.
    // 64 * 8 = 512 entries, matching ThreadNum. Second layout: shape (1, 4).
    // NOTE(review): assuming make_tiled_copy(atom, thread_layout, value_layout),
    // this covers 64 x (8 * 4) = 64 x 32 elements per pass -- confirm.
    using G2SCopyA = decltype( // A:(64, 32)
        make_tiled_copy(g2s_atom{}, 
            make_layout(
                make_shape(Int<64>{}, Int<8>{}),
                make_stride(Int<8>{}, Int<1>{})),
            make_layout(make_shape(Int<1>{}, Int<4>{})))
        );

    // Tiled copy for B. First layout: shape (16, 32) with stride (32, 1), i.e.
    // 16 * 32 = 512 entries, matching ThreadNum. Second layout: shape (1, 4).
    // NOTE(review): under the same assumption this covers 16 x (32 * 4) =
    // 16 x 128 elements per pass, so the (32, 128) tile named in the comment
    // below would presumably take two passes -- confirm.
    using G2SCopyB = decltype( // B:(32, 128)
        make_tiled_copy(g2s_atom{}, 
            make_layout(
                make_shape(Int<16>{},   Int<32>{}),
                make_stride(Int<32>{}, Int<1>{})),
            make_layout(make_shape(Int<1>{}, Int<4>{})))
        );
    // Threads per block; launch_gemm uses this as the x-dimension of the block.
    constexpr static int ThreadNum = 512;
};


// Reproduction kernel: indexes the three tiled tensors by block index and
// prints the results through the PRINT macro (defined outside this snippet).
//
// Template parameters:
//   Config                    - configuration type; not used in the body shown.
//   TensorA/TensorB/TensorC   - tensor types, passed by value and indexed with
//                               three coordinates each.
//
// NOTE(review): the structured binding below declares a local variable named
// `_`. From that point to the end of the function body, every unqualified `_`
// refers to that local (the third component of blockIdx), not to any `_`
// visible from an enclosing scope. If the intent was to use CuTe's slicing
// placeholder (presumably `cute::_`, brought in by a using-directive that is
// not shown), the three indexing expressions instead receive an integer
// coordinate. They would then select single elements rather than slices,
// which is consistent with the scalar values reported in the issue output.
// Naming the third binding something other than `_` (see the commented-out
// line) or writing `cute::_` explicitly avoids the shadowing.
template<typename Config, typename TensorA, typename TensorB, typename TensorC>
__global__ void gemm_float(TensorA A, TensorB B, TensorC C){
    // auto [bx, by, bz] = blockIdx; (void)bz;
    // Decompose blockIdx into three locals; the third one is named `_`.
    auto [bx, by, _] = blockIdx;
    // In each expression below `_` is the local declared above (see NOTE).
    auto gA = A(_, by, _);
    auto gB = B(_, _, bx);
    auto gC = C(_, by, bx);  

    PRINT(gA);
    PRINT(gB);
    PRINT(gC);
}

// Host-side launcher for gemm_float.
//
// Wraps the three raw pointers in tensors with compile-time shapes (M x K,
// K x N, M x N) and LayoutRight, tiles each one with tiled_divide, then
// launches one block of Config::ThreadNum threads per (N-tile, M-tile) pair
// and waits for completion.
//
// Template parameters:
//   Config   - supplies TileM, TileN, TileK and ThreadNum.
//   M, N, K  - compile-time problem extents.
// Parameters:
//   Aptr, Bptr - input pointers, wrapped with make_gmem_ptr.
//   Cptr       - output pointer, wrapped with make_gmem_ptr.
// NOTE(review): the pointers are presumably device allocations of at least
// M*K, K*N and M*N floats respectively -- the caller is not shown.
template<typename Config, int M, int N, int K>
void launch_gemm(const float* Aptr, 
                 const float* Bptr, 
                 float* Cptr){    
    constexpr int TileM = Config::TileM;
    constexpr int TileN = Config::TileN;
    constexpr int TileK = Config::TileK;
    
    // A: (M, K) tensor, then divided into (TileM, TileK) tiles.
    Tensor A_total = make_tensor(make_gmem_ptr(Aptr), 
               make_layout(make_shape(Int<M>{}, Int<K>{}), 
                           LayoutRight{}));
    Tensor A = tiled_divide(A_total, make_shape(Int<TileM>{}, Int<TileK>{}));
    
    // B: (K, N) tensor, then divided into (TileK, TileN) tiles.
    Tensor B_total = make_tensor(make_gmem_ptr(Bptr),
                           make_layout(make_shape(Int<K>{}, Int<N>{}),
                           LayoutRight{}));
    Tensor B = tiled_divide(B_total, make_shape(Int<TileK>{}, Int<TileN>{}));
    

    // C: (M, N) tensor, then divided into (TileM, TileN) tiles.
    Tensor C_total = make_tensor(make_gmem_ptr(Cptr),
                           make_layout(make_shape(Int<M>{}, Int<N>{}),
                           LayoutRight{}));
    Tensor C = tiled_divide(C_total, make_shape(Int<TileM>{}, Int<TileN>{}));

    // Grid x spans the N tiles and grid y spans the M tiles; the kernel reads
    // them back as bx and by.
    // NOTE(review): CEILDIV is defined outside this snippet; presumably a
    // ceiling integer division -- confirm.
    dim3 block{Config::ThreadNum, 1, 1};
    dim3 grid{CEILDIV(N, TileN), CEILDIV(M, TileM), 1};


    gemm_float<Config, decltype(A), decltype(B), decltype(C)>
        <<<grid, block>>>(A, B, C);
    // Pass the launch status, then the result of synchronizing stream 0, to
    // cudaCheckError (defined outside this snippet).
    cudaCheckError(cudaGetLastError());
    cudaCheckError(cudaStreamSynchronize(0));
}

Environment details (please complete the following information):

  • cutlass 4
  • cuda 12.9

Additional context
output:

gA 0.000000
gB 1.000000
gC 0.000000

Other problems
If I replace `auto [bx, by, _] = blockIdx;` with `auto [bx, by, bz] = blockIdx; (void)bz;`, a new problem appears:
#2499

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions