/*******************************************************************************
* Copyright 2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "common_u8.hpp"
#include "jit_generator.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

jit_sse41_kernel_b0_c_gemm_s8u8s32_kern::
jit_sse41_kernel_b0_c_gemm_s8u8s32_kern()
        : jit_generator(nullptr, S8U8S32_COMPUTE_KERNEL_CODE_SIZE) {

#ifndef _WIN32

#define M rdi
#define N rsi
#define K rdx
#define A r8
#define B r9
#define C r10
#define LDC r11

#define AA rcx
#define I r12
#define J r13
#define H rax
#define AO r14
#define BO r15
#define CO1 rbx
#define CO2 rbp

#else

#define M rcx
#define N rdx
#define K r8
#define A rsi
#define B r9
#define C r10
#define LDC r11

#define AA rdi
#define I r12
#define J r13
#define H rax
#define AO r14
#define BO r15
#define CO1 rbx
#define CO2 rbp

#endif

#ifdef _WIN32
#define ARG_A (args_offset - 16) + rsp
#define ARG_B (args_offset - 8) + rsp
#endif
#define ARG_C (args_offset + 0) + rsp
#define ARG_LDC (args_offset + 8) + rsp
#define ARG_COFFSET_C (args_offset + 16) + rsp

#define COFFSET_CX 0 + rsp
#define COFFSET_CY 8 + rsp

    inLocalLabel();
    {

        Xbyak::Label l103c;
        Xbyak::Label l104c;
        Xbyak::Label l10c;
        Xbyak::Label l10e0;
        Xbyak::Label l1118;
        Xbyak::Label l1170;
        Xbyak::Label l11d8;
        Xbyak::Label l1208;
        Xbyak::Label l120c;
        Xbyak::Label l1258;
        Xbyak::Label l1280;
        Xbyak::Label l130c;
        Xbyak::Label l1320;
        Xbyak::Label l13ac;
        Xbyak::Label l13e8;
        Xbyak::Label l143c;
        Xbyak::Label l149c;
        Xbyak::Label l14d4;
        Xbyak::Label l1504;
        Xbyak::Label l1568;
        Xbyak::Label l1574;
        Xbyak::Label l15d8;
        Xbyak::Label l1600;
        Xbyak::Label l1640;
        Xbyak::Label l168c;
        Xbyak::Label l16a8;
        Xbyak::Label l16ac;
        Xbyak::Label l16f8;
        Xbyak::Label l1720;
        Xbyak::Label l17ac;
        Xbyak::Label l17c0;
        Xbyak::Label l184c;
        Xbyak::Label l1888;
        Xbyak::Label l18dc;
        Xbyak::Label l193c;
        Xbyak::Label l1974;
        Xbyak::Label l19a4;
        Xbyak::Label l1a08;
        Xbyak::Label l1a14;
        Xbyak::Label l1a78;
        Xbyak::Label l1aa0;
        Xbyak::Label l1ae0;
        Xbyak::Label l1b2c;
        Xbyak::Label l1b48;
        Xbyak::Label l1b4c;
        Xbyak::Label l1b98;
        Xbyak::Label l1bc0;
        Xbyak::Label l1c4c;
        Xbyak::Label l1c60;
        Xbyak::Label l1cec;
        Xbyak::Label l1d28;
        Xbyak::Label l1d7c;
        Xbyak::Label l1ddc;
        Xbyak::Label l1e14;
        Xbyak::Label l1e44;
        Xbyak::Label l1ea8;
        Xbyak::Label l1eb4;
        Xbyak::Label l1f18;
        Xbyak::Label l1f40;
        Xbyak::Label l1f80;
        Xbyak::Label l1fcc;
        Xbyak::Label l1fe8;
        Xbyak::Label l1fec;
        Xbyak::Label l28c;
        Xbyak::Label l2a0;
        Xbyak::Label l420;
        Xbyak::Label l4c4;
        Xbyak::Label l598;
        Xbyak::Label l688;
        Xbyak::Label l72c;
        Xbyak::Label l774;
        Xbyak::Label l868;
        Xbyak::Label l878;
        Xbyak::Label l96c;
        Xbyak::Label l98;
        Xbyak::Label l9c8;
        Xbyak::Label la50;
        Xbyak::Label laf8;
        Xbyak::Label lb50;
        Xbyak::Label lb64;
        Xbyak::Label lbb0;
        Xbyak::Label lbdc;
        Xbyak::Label lcb8;
        Xbyak::Label lccc;
        Xbyak::Label ld4;
        Xbyak::Label lda8;
        Xbyak::Label le04;
        Xbyak::Label le80;
        Xbyak::Label lf10;
        Xbyak::Label lf6c;
        Xbyak::Label lfa8;

        auto stack_alloc_size = 32;
        auto args_offset = stack_alloc_size + get_size_of_abi_save_regs() + 8;
#ifdef _WIN32
        args_offset += 48;
#endif
        preamble();
        sub(rsp, stack_alloc_size);
#ifdef _WIN32
        mov(A, ptr[ARG_A]);
        mov(B, ptr[ARG_B]);
#endif

        mov(C, qword[ARG_C]);
        mov(LDC, qword[ARG_LDC]);
        sub(A, -128);
        sub(B, -128);
        mov(M, qword[M]);
        mov(N, qword[N]);
        mov(K, qword[K]);
        lea(LDC, ptr[LDC * 4 + 0x0]);
        mov(H, qword[ARG_COFFSET_C]);
        mov(qword[COFFSET_CX], H);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        xorps(xmm12, xmm12);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(H, 0x10001);
        movq(xmm7, H);
        pshufd(xmm7, xmm7, 0x0);
        mov(J, M);
        cmp(J, 0x10);
        jl(lb64, T_NEAR);
        align(4);

        L(l98);
        mov(CO1, C);
        add(C, 0x40);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x20);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_CX]);
        mov(qword[COFFSET_CY], H);
        add(H, 0x40);
        mov(qword[COFFSET_CX], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(l72c, T_NEAR);
        align(4);

        L(ld4);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm2, xword[AO - 0x60]);
        movdqu(xmm3, xword[AO - 0x50]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l420, T_NEAR);
        sub(H, 0x8);
        jle(l28c, T_NEAR);
        align(4);

        L(l10c);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l10c, T_NEAR);
        align(4);

        L(l28c);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l420, T_NEAR);
        align(4);

        L(l2a0);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l2a0, T_NEAR);
        align(4);

        L(l420);
        mov(H, K);
        test(H, 0x4);
        je(l4c4, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        add(AO, 0x40);
        add(BO, 0x8);
        align(4);

        L(l4c4);
        mov(H, K);
        test(H, 0x2);
        je(l598, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movdqu(xmm2, xword[AO - 0x70]);
        movaps(xmm3, xmm2);
        punpcklwd(xmm2, xmm6);
        punpckhwd(xmm3, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        add(AO, 0x20);
        add(BO, 0x4);
        align(4);

        L(l598);
        mov(H, K);
        test(H, 0x1);
        je(l688, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        pshufd(xmm2, xmm3, 0xaa);
        punpcklbw(xmm2, xmm6);
        punpcklwd(xmm2, xmm6);
        pshufd(xmm3, xmm3, 0xff);
        punpcklbw(xmm3, xmm6);
        punpcklwd(xmm3, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        add(AO, 0x10);
        add(BO, 0x2);
        align(4);

        L(l688);
        mov(H, qword[COFFSET_CY]);
        movdqu(xmm0, xword[H]);
        paddd(xmm8, xmm0);
        paddd(xmm9, xmm0);
        movdqu(xmm0, xword[H + 0x10]);
        paddd(xmm10, xmm0);
        paddd(xmm11, xmm0);
        movdqu(xmm0, xword[H + 0x20]);
        paddd(xmm12, xmm0);
        paddd(xmm13, xmm0);
        movdqu(xmm0, xword[H + 0x30]);
        paddd(xmm14, xmm0);
        paddd(xmm15, xmm0);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        movdqu(xword[CO1 + 0x20], xmm12);
        xorps(xmm12, xmm12);
        movdqu(xword[CO1 + 0x30], xmm14);
        xorps(xmm14, xmm14);
        movdqu(xword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        movdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
        xorps(xmm11, xmm11);
        movdqu(xword[CO1 + LDC * 1 + 0x20], xmm13);
        xorps(xmm13, xmm13);
        movdqu(xword[CO1 + LDC * 1 + 0x30], xmm15);
        xorps(xmm15, xmm15);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(ld4, T_NEAR);
        align(4);

        L(l72c);
        test(I, 0x1);
        jle(lb50, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm2, xword[AO - 0x60]);
        movdqu(xmm3, xword[AO - 0x50]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l96c, T_NEAR);
        sub(H, 0x8);
        jle(l868, T_NEAR);
        align(4);

        L(l774);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l774, T_NEAR);
        align(4);

        L(l868);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l96c, T_NEAR);
        align(4);

        L(l878);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l878, T_NEAR);
        align(4);

        L(l96c);
        mov(H, K);
        test(H, 0x4);
        je(l9c8, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        add(AO, 0x40);
        add(BO, 0x4);
        align(4);

        L(l9c8);
        mov(H, K);
        test(H, 0x2);
        je(la50, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movdqu(xmm2, xword[AO - 0x70]);
        movaps(xmm3, xmm2);
        punpcklwd(xmm2, xmm6);
        punpckhwd(xmm3, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        add(AO, 0x20);
        add(BO, 0x2);
        align(4);

        L(la50);
        mov(H, K);
        test(H, 0x1);
        je(laf8, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        pshufd(xmm2, xmm3, 0xaa);
        punpcklbw(xmm2, xmm6);
        punpcklwd(xmm2, xmm6);
        pshufd(xmm3, xmm3, 0xff);
        punpcklbw(xmm3, xmm6);
        punpcklwd(xmm3, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        add(AO, 0x10);
        add(BO, 0x1);
        align(4);

        L(laf8);
        mov(H, qword[COFFSET_CY]);
        movdqu(xmm0, xword[H]);
        paddd(xmm8, xmm0);
        movdqu(xmm0, xword[H + 0x10]);
        paddd(xmm10, xmm0);
        movdqu(xmm0, xword[H + 0x20]);
        paddd(xmm12, xmm0);
        movdqu(xmm0, xword[H + 0x30]);
        paddd(xmm14, xmm0);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        movdqu(xword[CO1 + 0x20], xmm12);
        xorps(xmm12, xmm12);
        movdqu(xword[CO1 + 0x30], xmm14);
        xorps(xmm14, xmm14);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(lb50);
        mov(A, AO);
        sub(J, 0x10);
        cmp(J, 0x10);
        jge(l98, T_NEAR);
        align(4);

        L(lb64);
        test(J, 0x8);
        jle(l120c, T_NEAR);
        mov(CO1, C);
        add(C, 0x20);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x10);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_CX]);
        mov(qword[COFFSET_CY], H);
        add(H, 0x20);
        mov(qword[COFFSET_CX], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(lf6c, T_NEAR);
        align(4);

        L(lbb0);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(lda8, T_NEAR);
        sub(H, 0x8);
        jle(lcb8, T_NEAR);
        align(4);

        L(lbdc);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(lbdc, T_NEAR);
        align(4);

        L(lcb8);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(lda8, T_NEAR);
        align(4);

        L(lccc);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(lccc, T_NEAR);
        align(4);

        L(lda8);
        mov(H, K);
        test(H, 0x4);
        je(le04, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        add(AO, 0x20);
        add(BO, 0x8);
        align(4);

        L(le04);
        mov(H, K);
        test(H, 0x2);
        je(le80, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        add(AO, 0x10);
        add(BO, 0x4);
        align(4);

        L(le80);
        mov(H, K);
        test(H, 0x1);
        je(lf10, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        add(AO, 0x8);
        add(BO, 0x2);
        align(4);

        L(lf10);
        mov(H, qword[COFFSET_CY]);
        movdqu(xmm0, xword[H]);
        paddd(xmm8, xmm0);
        paddd(xmm9, xmm0);
        movdqu(xmm0, xword[H + 0x10]);
        paddd(xmm10, xmm0);
        paddd(xmm11, xmm0);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        movdqu(xword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        movdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
        xorps(xmm11, xmm11);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(lbb0, T_NEAR);
        align(4);

        L(lf6c);
        test(I, 0x1);
        jle(l1208, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l10e0, T_NEAR);
        sub(H, 0x8);
        jle(l103c, T_NEAR);
        align(4);

        L(lfa8);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(lfa8, T_NEAR);
        align(4);

        L(l103c);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l10e0, T_NEAR);
        align(4);

        L(l104c);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l104c, T_NEAR);
        align(4);

        L(l10e0);
        mov(H, K);
        test(H, 0x4);
        je(l1118, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        add(AO, 0x20);
        add(BO, 0x4);
        align(4);

        L(l1118);
        mov(H, K);
        test(H, 0x2);
        je(l1170, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        add(AO, 0x10);
        add(BO, 0x2);
        align(4);

        L(l1170);
        mov(H, K);
        test(H, 0x1);
        je(l11d8, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        add(AO, 0x8);
        add(BO, 0x1);
        align(4);

        L(l11d8);
        mov(H, qword[COFFSET_CY]);
        movdqu(xmm0, xword[H]);
        paddd(xmm8, xmm0);
        movdqu(xmm0, xword[H + 0x10]);
        paddd(xmm10, xmm0);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l1208);
        mov(A, AO);
        align(4);

        L(l120c);
        test(J, 0x4);
        jle(l16ac, T_NEAR);
        mov(CO1, C);
        add(C, 0x10);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x8);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_CX]);
        mov(qword[COFFSET_CY], H);
        add(H, 0x10);
        mov(qword[COFFSET_CX], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(l14d4, T_NEAR);
        align(4);

        L(l1258);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l13ac, T_NEAR);
        sub(H, 0x8);
        jle(l130c, T_NEAR);
        align(4);

        L(l1280);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1280, T_NEAR);
        align(4);

        L(l130c);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l13ac, T_NEAR);
        align(4);

        L(l1320);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1320, T_NEAR);
        align(4);

        L(l13ac);
        mov(H, K);
        test(H, 0x4);
        je(l13e8, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x10);
        add(BO, 0x8);
        align(4);

        L(l13e8);
        mov(H, K);
        test(H, 0x2);
        je(l143c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x8);
        add(BO, 0x4);
        align(4);

        L(l143c);
        mov(H, K);
        test(H, 0x1);
        je(l149c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x4);
        add(BO, 0x2);
        align(4);

        L(l149c);
        mov(H, qword[COFFSET_CY]);
        movdqu(xmm0, xword[H]);
        paddd(xmm8, xmm0);
        paddd(xmm9, xmm0);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(l1258, T_NEAR);
        align(4);

        L(l14d4);
        test(I, 0x1);
        jle(l16a8, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l15d8, T_NEAR);
        sub(H, 0x8);
        jle(l1568, T_NEAR);
        align(4);

        L(l1504);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1504, T_NEAR);
        align(4);

        L(l1568);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l15d8, T_NEAR);
        align(4);

        L(l1574);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1574, T_NEAR);
        align(4);

        L(l15d8);
        mov(H, K);
        test(H, 0x4);
        je(l1600, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x10);
        add(BO, 0x4);
        align(4);

        L(l1600);
        mov(H, K);
        test(H, 0x2);
        je(l1640, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x8);
        add(BO, 0x2);
        align(4);

        L(l1640);
        mov(H, K);
        test(H, 0x1);
        je(l168c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x4);
        add(BO, 0x1);
        align(4);

        L(l168c);
        mov(H, qword[COFFSET_CY]);
        movdqu(xmm0, xword[H]);
        paddd(xmm8, xmm0);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l16a8);
        mov(A, AO);
        align(4);

        L(l16ac);
        test(J, 0x2);
        jle(l1b4c, T_NEAR);
        mov(CO1, C);
        add(C, 0x8);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x4);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_CX]);
        mov(qword[COFFSET_CY], H);
        add(H, 0x8);
        mov(qword[COFFSET_CX], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(l1974, T_NEAR);
        align(4);

        L(l16f8);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l184c, T_NEAR);
        sub(H, 0x8);
        jle(l17ac, T_NEAR);
        align(4);

        L(l1720);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1720, T_NEAR);
        align(4);

        L(l17ac);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l184c, T_NEAR);
        align(4);

        L(l17c0);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l17c0, T_NEAR);
        align(4);

        L(l184c);
        mov(H, K);
        test(H, 0x4);
        je(l1888, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x8);
        add(BO, 0x8);
        align(4);

        L(l1888);
        mov(H, K);
        test(H, 0x2);
        je(l18dc, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x4);
        add(BO, 0x4);
        align(4);

        L(l18dc);
        mov(H, K);
        test(H, 0x1);
        je(l193c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x2);
        add(BO, 0x2);
        align(4);

        L(l193c);
        mov(H, qword[COFFSET_CY]);
        movsd(xmm0, qword[H]);
        paddd(xmm8, xmm0);
        paddd(xmm9, xmm0);
        movlps(qword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movlps(qword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(l16f8, T_NEAR);
        align(4);

        L(l1974);
        test(I, 0x1);
        jle(l1b48, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1a78, T_NEAR);
        sub(H, 0x8);
        jle(l1a08, T_NEAR);
        align(4);

        L(l19a4);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l19a4, T_NEAR);
        align(4);

        L(l1a08);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l1a78, T_NEAR);
        align(4);

        L(l1a14);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1a14, T_NEAR);
        align(4);

        L(l1a78);
        mov(H, K);
        test(H, 0x4);
        je(l1aa0, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x8);
        add(BO, 0x4);
        align(4);

        L(l1aa0);
        mov(H, K);
        test(H, 0x2);
        je(l1ae0, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x4);
        add(BO, 0x2);
        align(4);

        L(l1ae0);
        mov(H, K);
        test(H, 0x1);
        je(l1b2c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x2);
        add(BO, 0x1);
        align(4);

        L(l1b2c);
        mov(H, qword[COFFSET_CY]);
        movsd(xmm0, qword[H]);
        paddd(xmm8, xmm0);
        movlps(qword[CO1], xmm8);
        xorps(xmm8, xmm8);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l1b48);
        mov(A, AO);
        align(4);

        L(l1b4c);
        test(J, 0x1);
        jle(l1fec, T_NEAR);
        mov(CO1, C);
        add(C, 0x4);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x2);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_CX]);
        mov(qword[COFFSET_CY], H);
        add(H, 0x4);
        mov(qword[COFFSET_CX], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(l1e14, T_NEAR);
        align(4);

        L(l1b98);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1cec, T_NEAR);
        sub(H, 0x8);
        jle(l1c4c, T_NEAR);
        align(4);

        L(l1bc0);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1bc0, T_NEAR);
        align(4);

        L(l1c4c);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l1cec, T_NEAR);
        align(4);

        L(l1c60);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1c60, T_NEAR);
        align(4);

        L(l1cec);
        mov(H, K);
        test(H, 0x4);
        je(l1d28, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x4);
        add(BO, 0x8);
        align(4);

        L(l1d28);
        mov(H, K);
        test(H, 0x2);
        je(l1d7c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x2);
        add(BO, 0x4);
        align(4);

        L(l1d7c);
        mov(H, K);
        test(H, 0x1);
        je(l1ddc, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x1);
        add(BO, 0x2);
        align(4);

        L(l1ddc);
        mov(H, qword[COFFSET_CY]);
        movss(xmm0, dword[H]);
        paddd(xmm8, xmm0);
        paddd(xmm9, xmm0);
        movss(dword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movss(dword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(l1b98, T_NEAR);
        align(4);

        L(l1e14);
        test(I, 0x1);
        jle(l1fe8, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1f18, T_NEAR);
        sub(H, 0x8);
        jle(l1ea8, T_NEAR);
        align(4);

        L(l1e44);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1e44, T_NEAR);
        align(4);

        L(l1ea8);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l1f18, T_NEAR);
        align(4);

        L(l1eb4);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1eb4, T_NEAR);
        align(4);

        L(l1f18);
        mov(H, K);
        test(H, 0x4);
        je(l1f40, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x4);
        add(BO, 0x4);
        align(4);

        L(l1f40);
        mov(H, K);
        test(H, 0x2);
        je(l1f80, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x2);
        add(BO, 0x2);
        align(4);

        L(l1f80);
        mov(H, K);
        test(H, 0x1);
        je(l1fcc, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x1);
        add(BO, 0x1);
        align(4);

        L(l1fcc);
        mov(H, qword[COFFSET_CY]);
        movss(xmm0, dword[H]);
        paddd(xmm8, xmm0);
        movss(dword[CO1], xmm8);
        xorps(xmm8, xmm8);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l1fe8);
        mov(A, AO);
        align(4);

        L(l1fec);
        add(rsp, stack_alloc_size);
        postamble();
    }
    outLocalLabel();

#undef M
#undef N
#undef K
#undef A
#undef B
#undef C
#undef LDC
#undef AA
#undef I
#undef J
#undef H
#undef AO
#undef BO
#undef CO1
#undef CO2
#ifdef _WIN32
#undef ARG_A
#undef ARG_B
#endif
#undef ARG_C
#undef ARG_LDC
#undef ARG_COFFSET_C
#undef COFFSET_CX
#undef COFFSET_CY
}

} // namespace cpu
} // namespace impl
} // namespace mkldnn
