-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Open
Labels
Description
Which component has the problem?
CUTLASS C++
Bug Report
Describe the bug
Using `cute::_` together with `_` as a name in a C++ structured binding causes runtime issues.
Steps/Code to reproduce bug
// Compile-time configuration for the reproducer: block tile extents, the
// global->shared copy atom, the tiled copies built from it, and the launch
// thread count.
struct Config {
  // Block tile extents (M, N, K) and the stage count.
  static constexpr int TileM = 64;
  static constexpr int TileN = 128;
  static constexpr int TileK = 32;
  static constexpr int Stage = 2;

  // Copy atom: a 128-bit SM80_CP_ASYNC_CACHEGLOBAL op applied to float
  // values, i.e. four floats per copy instruction.
  using g2s_op     = SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>;
  using g2s_traits = Copy_Traits<g2s_op>;
  using g2s_atom   = Copy_Atom<g2s_traits, float>;

  // A:(64, 32)
  // Thread layout 64x8 with strides (8, 1) = 512 threads; value layout 1x4.
  using G2SCopyA = decltype(make_tiled_copy(
      g2s_atom{},
      Layout<Shape<Int<64>, Int<8>>, Stride<Int<8>, Int<1>>>{},
      Layout<Shape<Int<1>, Int<4>>>{}));

  // B:(32, 128)
  // Thread layout 16x32 with strides (32, 1) = 512 threads; value layout 1x4.
  using G2SCopyB = decltype(make_tiled_copy(
      g2s_atom{},
      Layout<Shape<Int<16>, Int<32>>, Stride<Int<32>, Int<1>>>{},
      Layout<Shape<Int<1>, Int<4>>>{}));

  // Threads per block used at launch; equals the size of both thread
  // layouts above (64*8 == 16*32 == 512).
  static constexpr int ThreadNum = 512;
};
// Reproducer kernel: indexes the three tensor arguments with the block
// coordinates and hands the results to PRINT (defined elsewhere). It does no
// copy or arithmetic, and the Config template parameter is not used here.
//
// A, B, C are passed by value; launch_gemm builds them with tiled_divide, so
// each takes three coordinates.
template<typename Config, typename TensorA, typename TensorB, typename TensorC>
__global__ void gemm_float(TensorA A, TensorB B, TensorC C){
// auto [bx, by, bz] = blockIdx; (void)bz;
// NOTE(review): the third name in the binding below declares a local variable
// called `_`, bound to the third member of blockIdx (z). For the rest of this
// function that local hides any outer `_`. If the `_` arguments below were
// meant to be the cute::_ slice placeholder (presumably visible through a
// using-directive that is not shown — confirm), they are instead the integer
// block z coordinate, which is 0 for the gridDim.z == 1 launch in launch_gemm.
// The calls then select single elements rather than slices, which is
// consistent with the scalar values in the reported output.
auto [bx, by, _] = blockIdx;
// Second coordinate of A is the block's y index; `_` is the local from above.
auto gA = A(_, by, _);
// Third coordinate of B is the block's x index; `_` is the local from above.
auto gB = B(_, _, bx);
// C is indexed by (y, x) block coordinates; `_` is the local from above.
auto gC = C(_, by, bx);
PRINT(gA);
PRINT(gB);
PRINT(gC);
}
// Host-side launcher for the reproducer kernel.
//
// Wraps the three raw pointers as statically shaped LayoutRight global-memory
// tensors (A is M x K, B is K x N, C is M x N), divides each by the matching
// Config tile shape with tiled_divide, launches gemm_float, then checks the
// launch status and synchronizes stream 0.
//
// Template parameters:
//   Config  - supplies TileM, TileN, TileK and ThreadNum.
//   M, N, K - compile-time problem extents.
// Parameters:
//   Aptr, Bptr, Cptr - base pointers wrapped with make_gmem_ptr; presumably
//                      device pointers (confirm at the call site).
template<typename Config, int M, int N, int K>
void launch_gemm(const float* Aptr,
                 const float* Bptr,
                 float* Cptr){
  constexpr int kTileM = Config::TileM;
  constexpr int kTileN = Config::TileN;
  constexpr int kTileK = Config::TileK;

  // A: (M, K) split into (kTileM, kTileK) tiles.
  auto full_a = make_tensor(
      make_gmem_ptr(Aptr),
      make_layout(make_shape(Int<M>{}, Int<K>{}), LayoutRight{}));
  auto tiles_a = tiled_divide(full_a, make_shape(Int<kTileM>{}, Int<kTileK>{}));

  // B: (K, N) split into (kTileK, kTileN) tiles.
  auto full_b = make_tensor(
      make_gmem_ptr(Bptr),
      make_layout(make_shape(Int<K>{}, Int<N>{}), LayoutRight{}));
  auto tiles_b = tiled_divide(full_b, make_shape(Int<kTileK>{}, Int<kTileN>{}));

  // C: (M, N) split into (kTileM, kTileN) tiles.
  auto full_c = make_tensor(
      make_gmem_ptr(Cptr),
      make_layout(make_shape(Int<M>{}, Int<N>{}), LayoutRight{}));
  auto tiles_c = tiled_divide(full_c, make_shape(Int<kTileM>{}, Int<kTileN>{}));

  // One block per (N tile, M tile) pair: grid.x walks N, grid.y walks M.
  const dim3 threads_per_block(Config::ThreadNum, 1, 1);
  const dim3 blocks_per_grid(CEILDIV(N, kTileN), CEILDIV(M, kTileM), 1);

  gemm_float<Config, decltype(tiles_a), decltype(tiles_b), decltype(tiles_c)>
      <<<blocks_per_grid, threads_per_block>>>(tiles_a, tiles_b, tiles_c);

  // Launch-configuration errors show up here; execution errors at the sync.
  cudaCheckError(cudaGetLastError());
  cudaCheckError(cudaStreamSynchronize(0));
}
Environment details (please complete the following information):
- cutlass 4
- cuda 12.9
Additional context
output:
gA 0.000000
gB 1.000000
gC 0.000000
Other problems
When I replace `auto [bx, by, _] = blockIdx;` with `auto [bx, by, bz] = blockIdx; (void)bz;`, a new problem appears:
#2499