rangeify: don't tag consts, they are global (#12247) #43
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Benchmark workflow: triggered by pushes to the branches listed below and by manual dispatch. | |
| name: Benchmarks | |
| on: | |
|   push: | |
|     branches: | |
|       - master | |
|       - update_benchmark | |
|       - update_benchmark_staging | |
|   # Manual trigger; exposes a single optional boolean input. | |
|   workflow_dispatch: | |
|     inputs: | |
|       run_process_replay: | |
|         description: "Run process replay tests" | |
|         required: false | |
|         default: false | |
|         type: boolean | |
| # Workflow-level environment, inherited by every job unless a command overrides it inline. | |
| env: | |
|   # TODO: this rescheduling makes gpt2, mixtral and llama unjitted slower | |
|   # TODO: very slow for llama 70B and resnet training 6 GPU | |
|   CAPTURE_PROCESS_REPLAY: "1" | |
|   ASSERT_PROCESS_REPLAY: "0" | |
|   PYTHONPATH: . | |
|   GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| jobs: | |
|   # Inference, GEMM and short training benchmarks on the self-hosted macOS runner, | |
|   # followed by the USB-attached AMD GPU (AMD_IFACE=USB) checks that need sudo. | |
|   testmacbenchmark: | |
|     name: Mac Benchmark | |
|     if: github.repository_owner == 'tinygrad' | |
|     runs-on: [self-hosted, macOS] | |
|     timeout-minutes: 60 | |
|     env: | |
|       # since sudo is required for usbgpu on macos, move the cache to a new location, as some of the files are owned by root | |
|       PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache | |
|     defaults: | |
|       run: | |
|         # -e plus pipefail: a failing benchmark on the left of `| tee` fails the step | |
|         shell: bash -e -o pipefail {0} | |
|     steps: | |
|       - name: Checkout Code | |
|         uses: actions/checkout@v4 | |
|       # Link weights and datasets kept in the runner's home directory into the checkout. | |
|       - name: Symlink models and datasets | |
|         run: | | |
|           mkdir -p weights | |
|           ln -s ~/tinygrad/extra/disassemblers/applegpu extra/disassemblers/applegpu | |
|           ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt | |
|           ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
|           ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
|           ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
|       # Staging branch only: export CACHEDB for later steps and start from an empty database. | |
|       - name: setup staging db | |
|         if: github.ref == 'refs/heads/update_benchmark_staging' | |
|         run: | | |
|           echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
|           rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
|       - name: reset process replay | |
|         run: python3.11 test/external/process_replay/reset.py | |
|       - name: Run Stable Diffusion | |
|         run: BENCHMARK_LOG=stable_diffusion JIT=1 ASSERT_MIN_STEP_TIME=500 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt | |
|       - name: Run Stable Diffusion without fp16 | |
|         run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 ASSERT_MIN_STEP_TIME=700 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt | |
|       - name: Run Stable Diffusion v2 | |
|         run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 ASSERT_MIN_STEP_TIME=1600 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt | |
|       # process replay can't capture this, the graph is too large | |
|       - name: Run SDXL | |
|         run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=3000 CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt | |
|       - name: Run model inference benchmark | |
|         run: METAL=1 python3.11 test/external/external_model_benchmark.py | |
|       - name: Test speed vs torch | |
|         run: BIG=2 MPS=1 python3.11 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt | |
|       - name: Test tensor cores | |
|         run: METAL=1 python3.11 test/opt/test_tensor_cores.py | |
|       # Each AMX test is run twice: once with CPU_LLVM=0 and once with CPU_LLVM=1. | |
|       - name: Test AMX tensor cores | |
|         run: | | |
|           DEBUG=2 CPU=1 CPU_LLVM=0 AMX=1 python3.11 test/opt/test_tensor_cores.py | |
|           DEBUG=2 CPU=1 CPU_LLVM=1 AMX=1 python3.11 test/opt/test_tensor_cores.py | |
|           DEBUG=2 CPU=1 CPU_LLVM=0 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx | |
|           DEBUG=2 CPU=1 CPU_LLVM=1 AMX=1 python3.11 test/opt/test_gen_float4.py TestFloat4.test_float4_multidim_amx TestFloat4.test_float4_multidim_unaligned_load_amx | |
|       - name: Run Tensor Core GEMM (float) | |
|         run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py | tee matmul.txt | |
|       - name: Run Tensor Core GEMM (half) | |
|         run: DEBUG=2 SHOULD_USE_TC=1 HALF=1 python3.11 extra/gemm/simple_matmul.py | tee matmul_half.txt | |
|       - name: Run Tensor Core GEMM (bfloat16) | |
|         run: DEBUG=2 SHOULD_USE_TC=1 BFLOAT16=1 python3.11 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt | |
|       - name: Fuzz Padded Tensor Core GEMM | |
|         run: METAL=1 M_START=6 M_STOP=10 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=6 K_STOP=24 K_STEP=1 TC_OPT=2 DEBUG=2 python3.11 ./extra/gemm/fuzz_matmul.py | |
|       # Same prompt with JIT=0 and then JIT=1, logged to separate files. | |
|       - name: Run LLaMA | |
|         run: | | |
|           BENCHMARK_LOG=llama_nojit JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt | |
|           BENCHMARK_LOG=llama JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt | |
|       - name: Run LLaMA with BEAM | |
|         run: BENCHMARK_LOG=llama_beam JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt | |
|       - name: Run quantized LLaMA | |
|         run: | | |
|           BENCHMARK_LOG=llama_int8 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt | |
|           BENCHMARK_LOG=llama_nf4 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt | |
|       - name: Run quantized LLaMA3 | |
|         run: | | |
|           BENCHMARK_LOG=llama3_int8 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt | |
|           BENCHMARK_LOG=llama3_nf4 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt | |
|       #- name: Run LLaMA 7B on 4 (virtual) GPUs | |
|       #  run: python3.11 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt | |
|       - name: Run GPT2 | |
|         run: | | |
|           BENCHMARK_LOG=gpt2_nojit JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt | |
|           BENCHMARK_LOG=gpt2 JIT=1 ASSERT_MIN_STEP_TIME=8 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt | |
|       - name: Run GPT2 w HALF | |
|         run: BENCHMARK_LOG=gpt2_half HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt | |
|       - name: Run GPT2 w HALF/BEAM | |
|         run: BENCHMARK_LOG=gpt2_half_beam HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt | |
|       - name: Run OLMoE | |
|         run: BENCHMARK_LOG=olmoe python3.11 examples/olmoe.py | |
|       - name: Train MNIST | |
|         run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | tee beautiful_mnist.txt | |
|       - name: Run 10 CIFAR training steps | |
|         run: BENCHMARK_LOG=cifar_10steps JIT=1 ASSERT_MIN_STEP_TIME=320 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt | |
|       - name: Run 10 CIFAR training steps w HALF | |
|         run: BENCHMARK_LOG=cifar_10steps_half JIT=2 ASSERT_MIN_STEP_TIME=385 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt | |
|       #- name: Run 10 CIFAR training steps w BF16 | |
|       #  run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3.11 examples/hlb_cifar10.py | tee train_cifar_bf16.txt | |
|       - name: Run 10 CIFAR training steps w winograd | |
|         run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 ASSERT_MIN_STEP_TIME=150 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt | |
|       # The UsbGPU steps run under sudo; -E forwards the step environment to the command. | |
|       - name: UsbGPU boot time | |
|         run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus | |
|       - name: UsbGPU tiny tests | |
|         run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/test_tiny.py | |
|       - name: UsbGPU copy speeds | |
|         run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/external/external_test_usb_asm24.py TestDevCopySpeeds | |
|       #- name: UsbGPU openpilot test | |
|       #  run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB NOLOCALS=0 IMAGE=0 GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx | |
|       # Collect the tee'd logs; entries whose producing step is commented out are simply absent. | |
|       - uses: actions/upload-artifact@v4 | |
|         with: | |
|           name: Speed (Mac) | |
|           path: | | |
|             onnx_inference_speed.csv | |
|             torch_speed.txt | |
|             llama_unjitted.txt | |
|             llama_jitted.txt | |
|             llama_beam.txt | |
|             llama_int8.txt | |
|             llama_nf4.txt | |
|             llama3_int8.txt | |
|             llama3_nf4.txt | |
|             llama_four_gpu.txt | |
|             gpt2_unjitted.txt | |
|             gpt2_jitted.txt | |
|             gpt2_half.txt | |
|             gpt2_half_beam.txt | |
|             matmul.txt | |
|             matmul_half.txt | |
|             matmul_bfloat16.txt | |
|             sd.txt | |
|             sd_no_fp16.txt | |
|             sdv2.txt | |
|             sdxl.txt | |
|             beautiful_mnist.txt | |
|             train_cifar.txt | |
|             train_cifar_half.txt | |
|             train_cifar_bf16.txt | |
|             train_cifar_wino.txt | |
|       # Copy the replay script out of the tree first so this commit's version is the one | |
|       # executed after the working tree is switched to origin/master. | |
|       - name: Run process replay tests | |
|         run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3.11 process_replay.py | |
|   # Inference and GEMM benchmarks on the self-hosted "tinyboxgreen" Linux runner, | |
|   # exercising the NV, NV_PTX and CUDA settings. | |
|   testnvidiabenchmark: | |
|     name: tinybox green Benchmark | |
|     runs-on: [self-hosted, Linux, tinyboxgreen] | |
|     timeout-minutes: 60 | |
|     defaults: | |
|       run: | |
|         # -e plus pipefail: a failing benchmark on the left of `| tee` fails the step | |
|         shell: bash -e -o pipefail {0} | |
|     if: github.repository_owner == 'tinygrad' | |
|     steps: | |
|       - name: Checkout Code | |
|         uses: actions/checkout@v4 | |
|       - name: Print nvidia-smi | |
|         run: nvidia-smi | |
|       # Link weights and datasets stored on the runner (home directory and /raid) into the checkout. | |
|       - name: Symlink models and datasets | |
|         run: | | |
|           mkdir -p weights | |
|           ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
|           ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
|           ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
|           ln -s /raid/weights/LLaMA-3 weights/LLaMA-3 | |
|           mkdir -p extra/datasets | |
|           ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
|       # Staging branch only: export CACHEDB for later steps and start from an empty database. | |
|       - name: setup staging db | |
|         if: github.ref == 'refs/heads/update_benchmark_staging' | |
|         run: | | |
|           echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
|           rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
|       - name: reset process replay | |
|         run: test/external/process_replay/reset.py | |
|       - name: Run model inference benchmark | |
|         run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py | |
|       - name: Test speed vs torch | |
|         run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt | |
|       - name: Test speed vs theoretical | |
|         run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20 | |
|       - name: Test benchmark allreduce | |
|         run: NV=1 python test/external/external_benchmark_multitensor_allreduce.py | |
|       - name: Test tensor cores | |
|         run: | | |
|           NV=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py | |
|           NV=1 NV_PTX=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py | |
|       - name: Run Tensor Core GEMM (CUDA) | |
|         run: | | |
|           CUDA=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt | |
|           CUDA=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt | |
|           CUDA=1 SHOULD_USE_TC=1 ALLOW_TF32=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee matmul_tf32.txt | |
|       - name: Run Tensor Core GEMM (PTX) | |
|         run: NV=1 NV_PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt | |
|       - name: Run Tensor Core GEMM (NV) | |
|         run: NV=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_nv.txt | |
|       - name: Test NV=1 | |
|         run: DEBUG=2 NV=1 python -m pytest -rA test/test_tiny.py | |
|       - name: Test CUDA=1 | |
|         run: DEBUG=2 CUDA=1 python -m pytest -rA test/test_tiny.py | |
|       - name: Run Stable Diffusion | |
|         run: BENCHMARK_LOG=stable_diffusion NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt | |
|       # Process replay capture is switched off for this command only (inline override of the workflow env). | |
|       - name: Run SDXL | |
|         run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=2000 CAPTURE_PROCESS_REPLAY=0 NV=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt | |
|       # Same prompt with JIT=0 and then JIT=1, logged to separate files. | |
|       - name: Run LLaMA | |
|         run: | | |
|           BENCHMARK_LOG=llama_nojit NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt | |
|           BENCHMARK_LOG=llama NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt | |
|       - name: Run LLaMA with BEAM | |
|         run: BENCHMARK_LOG=llama_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt | |
|       # - name: Run LLaMA 7B on 4 GPUs | |
|       #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt | |
|       # - name: Run LLaMA 7B on 6 GPUs | |
|       #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt | |
|       - name: Run LLaMA-3 8B BEAM | |
|         run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt | |
|       - name: Run LLaMA-3 8B on 4 GPUs with BEAM | |
|         run: BENCHMARK_LOG=llama3_beam_4gpu NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt | |
|       # - name: Run LLaMA-3 8B on 6 GPUs | |
|       #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt | |
|       # - name: Run LLaMA-2 70B | |
|       #   run: NV=1 CAPTURE_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt | |
|       - name: Run Mixtral 8x7B | |
|         run: time BENCHMARK_LOG=mixtral NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt | |
|       - name: Run GPT2 | |
|         run: | | |
|           BENCHMARK_LOG=gpt2_nojit NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt | |
|           BENCHMARK_LOG=gpt2 NV=1 JIT=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt | |
|       - name: Run GPT2 w HALF | |
|         run: BENCHMARK_LOG=gpt2_half NV=1 HALF=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt | |
|       - name: Run GPT2 w HALF/BEAM | |
|         run: BENCHMARK_LOG=gpt2_half_beam NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt | |
|       # Collect the tee'd logs; entries whose producing step is commented out are simply absent. | |
|       - uses: actions/upload-artifact@v4 | |
|         with: | |
|           name: Speed (NVIDIA) | |
|           path: | | |
|             onnx_inference_speed.csv | |
|             torch_speed.txt | |
|             matmul.txt | |
|             matmul_bfloat16.txt | |
|             matmul_tf32.txt | |
|             matmul_ptx.txt | |
|             matmul_nv.txt | |
|             sd.txt | |
|             sdxl.txt | |
|             llama_unjitted.txt | |
|             llama_jitted.txt | |
|             llama_beam.txt | |
|             llama3_beam.txt | |
|             llama3_four_gpu.txt | |
|             llama3_six_gpu.txt | |
|             llama_2_70B.txt | |
|             mixtral.txt | |
|             gpt2_unjitted.txt | |
|             gpt2_jitted.txt | |
|             gpt2_half.txt | |
|             gpt2_half_beam.txt | |
|       # Copy the replay script out of the tree first so this commit's version is the one | |
|       # executed after the working tree is switched to origin/master. | |
|       - name: Run process replay tests | |
|         run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
|   # Training benchmarks (MNIST, CIFAR, MLPerf resnet eval and BERT) on the | |
|   # self-hosted "tinyboxgreen" Linux runner, all run with NV=1. | |
|   testmorenvidiabenchmark: | |
|     name: tinybox green Training Benchmark | |
|     runs-on: [self-hosted, Linux, tinyboxgreen] | |
|     timeout-minutes: 60 | |
|     defaults: | |
|       run: | |
|         # -e plus pipefail: a failing benchmark on the left of `| tee` fails the step | |
|         shell: bash -e -o pipefail {0} | |
|     if: github.repository_owner == 'tinygrad' | |
|     steps: | |
|       - name: Checkout Code | |
|         uses: actions/checkout@v4 | |
|       # Link weights and datasets stored on the runner (home directory and /raid) into the checkout. | |
|       - name: Symlink models and datasets | |
|         run: | | |
|           # create both target directories before any link is placed inside them | |
|           mkdir -p weights extra/datasets | |
|           ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
|           ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
|           ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
|           ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
|           ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
|           ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
|       # Staging branch only: export CACHEDB for later steps and start from an empty database. | |
|       - name: setup staging db | |
|         if: github.ref == 'refs/heads/update_benchmark_staging' | |
|         run: | | |
|           echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
|           rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
|       - name: reset process replay | |
|         run: test/external/process_replay/reset.py | |
|       - name: Fuzz Padded Tensor Core GEMM (NV) | |
|         run: NV=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py | |
|       - name: Fuzz Padded Tensor Core GEMM (PTX) | |
|         run: NV=1 NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py | |
|       - name: Train MNIST | |
|         run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt | |
|       - name: Run 10 CIFAR training steps | |
|         run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=85 NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt | |
|       - name: Run 10 CIFAR training steps w HALF | |
|         run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=68 NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt | |
|       - name: Run 10 CIFAR training steps w BF16 | |
|         run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=75 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt | |
|       - name: Run 10 CIFAR training steps w winograd | |
|         run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=35 NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt | |
|       - name: Run full CIFAR training w 1 GPU | |
|         run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt | |
|       - name: Run full CIFAR training steps w 6 GPUS | |
|         run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt | |
|       - name: Run MLPerf resnet eval on training data | |
|         run: time BENCHMARK_LOG=resnet_eval NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py | |
|       #- name: Run 10 MLPerf ResNet50 training steps (1 gpu) | |
|       #  run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt | |
|       #- name: Run 10 MLPerf ResNet50 training steps (6 gpu) | |
|       #  run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt | |
|       - name: Run 10 MLPerf Bert training steps (6 gpu) | |
|         # TODO: remove BERT_LAYERS once scheduler is fast | |
|         run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt | |
|       # Collect the tee'd logs; entries whose producing step is commented out are simply absent. | |
|       - uses: actions/upload-artifact@v4 | |
|         with: | |
|           name: Speed (NVIDIA Training) | |
|           path: | | |
|             beautiful_mnist.txt | |
|             train_cifar.txt | |
|             train_cifar_half.txt | |
|             train_cifar_bf16.txt | |
|             train_cifar_wino.txt | |
|             train_cifar_one_gpu.txt | |
|             train_cifar_six_gpu.txt | |
|             train_resnet.txt | |
|             train_resnet_one_gpu.txt | |
|             train_bert.txt | |
|       # Copy the replay script out of the tree first so this commit's version is the one | |
|       # executed after the working tree is switched to origin/master. | |
|       - name: Run process replay tests | |
|         run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
|   # Inference and GEMM benchmarks on the self-hosted "tinybox" Linux runner, run with AMD=1. | |
|   # The amdgpu kernel module is unloaded up front and is not re-inserted by any active step. | |
|   testamdbenchmark: | |
|     name: tinybox red Benchmark | |
|     runs-on: [self-hosted, Linux, tinybox] | |
|     timeout-minutes: 60 | |
|     defaults: | |
|       run: | |
|         # -e plus pipefail: a failing benchmark on the left of `| tee` fails the step | |
|         shell: bash -e -o pipefail {0} | |
|     if: github.repository_owner == 'tinygrad' | |
|     steps: | |
|       - name: Checkout Code | |
|         uses: actions/checkout@v4 | |
|       # `|| true`: a failing rmmod must not fail the step | |
|       - name: Remove amdgpu | |
|         run: sudo rmmod amdgpu || true | |
|       - name: Cleanup running AM processes | |
|         run: python extra/amdpci/am_smi.py --pids --kill | |
|       #- name: Insert amdgpu | |
|       #  run: sudo modprobe amdgpu | |
|       # Link weights and datasets stored on the runner (home directory and /raid) into the checkout. | |
|       - name: Symlink models and datasets | |
|         run: | | |
|           # create both target directories before any link is placed inside them | |
|           mkdir -p weights extra/datasets | |
|           ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
|           ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
|           ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
|           ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
|           ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
|           ln -s /raid/weights/LLaMA-3 weights/LLaMA-3 | |
|           ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
|       # Staging branch only: export CACHEDB for later steps and start from an empty database. | |
|       - name: setup staging db | |
|         if: github.ref == 'refs/heads/update_benchmark_staging' | |
|         run: | | |
|           echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
|           rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
|       - name: reset process replay | |
|         run: test/external/process_replay/reset.py | |
|       #- name: setup perflevel | |
|       #  run: | | |
|       #    examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh | |
|       #    rocm-smi | |
|       #- name: Show off tinybox | |
|       #  run: /opt/rocm/bin/rocm-bandwidth-test | |
|       # TODO: unstable on AMD | |
|       #- name: Run model inference benchmark | |
|       #  run: LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 NOCLANG=1 python3 test/external/external_model_benchmark.py | |
|       # TODO: unstable on AMD | |
|       #- name: Test speed vs torch | |
|       #  run: | | |
|       #    python3 -c "import torch; print(torch.__version__)" | |
|       #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt | |
|       - name: Test speed vs theoretical | |
|         run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20 | |
|       - name: Test tensor cores | |
|         run: | | |
|           AMD=1 AMD_LLVM=0 python3 test/opt/test_tensor_cores.py | |
|           AMD=1 AMD_LLVM=1 python3 test/opt/test_tensor_cores.py | |
|           AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | |
|       - name: Run Tensor Core GEMM (AMD) | |
|         run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee matmul_amd.txt | |
|       - name: Test AMD=1 | |
|         run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py | |
|       #- name: Test HIP=1 | |
|       #  run: DEBUG=2 HIP=1 python -m pytest -rA test/test_tiny.py | |
|       # TODO: AMD compiler bug causes this to fail | |
|       #- name: Fuzz Padded Tensor Core GEMM | |
|       #  run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py | |
|       #- name: Remove amdgpu | |
|       #  run: sleep 10 && sudo rmmod amdgpu # sleep a bit to let the driver unload the prev pid. | |
|       # Same test timed twice: first with AM_RESET=1, then without it. | |
|       - name: Test AM cold start time | |
|         run: time AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus | |
|       - name: Test AM warm start time | |
|         run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus | |
|       - name: Run Stable Diffusion | |
|         run: BENCHMARK_LOG=stable_diffusion ASSERT_MIN_STEP_TIME=450 AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt | |
|       - name: Run SDXL | |
|         run: BENCHMARK_LOG=stable_diffusion_xl ASSERT_MIN_STEP_TIME=1400 CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt | |
|       # Same prompt with JIT=0 and then JIT=1, logged to separate files. | |
|       - name: Run LLaMA 7B | |
|         run: | | |
|           BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt | |
|           BENCHMARK_LOG=llama AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt | |
|       - name: Run LLaMA 7B with BEAM | |
|         run: BENCHMARK_LOG=llama_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt | |
|       # - name: Run LLaMA 7B on 4 GPUs | |
|       #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt | |
|       # - name: Run LLaMA 7B on 6 GPUs | |
|       #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt | |
|       - name: Run LLaMA-3 8B BEAM | |
|         run: BENCHMARK_LOG=llama3_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt | |
|       - name: Run LLaMA-3 8B on 4 GPUs with BEAM | |
|         run: BENCHMARK_LOG=llama3_beam_4gpu AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt | |
|       # - name: Run LLaMA-3 8B on 6 GPUs | |
|       #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt | |
|       #- name: Restore amdgpu | |
|       #  run: sudo modprobe amdgpu | |
|       # - name: Run LLaMA-2 70B | |
|       #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt | |
|       - name: Run Mixtral 8x7B | |
|         run: time BENCHMARK_LOG=mixtral AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt | |
|       - name: Run GPT2 | |
|         run: | | |
|           BENCHMARK_LOG=gpt2_nojit AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt | |
|           BENCHMARK_LOG=gpt2 AMD=1 JIT=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt | |
|       - name: Run GPT2 w HALF | |
|         run: BENCHMARK_LOG=gpt2_half AMD=1 HALF=1 ASSERT_MIN_STEP_TIME=5 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt | |
|       - name: Run GPT2 w HALF/BEAM | |
|         run: BENCHMARK_LOG=gpt2_half_beam AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt | |
|       # Collect the tee'd logs; entries with no active producing step are simply absent. | |
|       - uses: actions/upload-artifact@v4 | |
|         with: | |
|           name: Speed (AMD) | |
|           path: | | |
|             onnx_inference_speed.csv | |
|             torch_speed.txt | |
|             llama_unjitted.txt | |
|             llama_jitted.txt | |
|             llama_beam.txt | |
|             llama3_beam.txt | |
|             llama3_four_gpu.txt | |
|             llama3_six_gpu.txt | |
|             llama_2_70B.txt | |
|             gpt2_unjitted.txt | |
|             gpt2_jitted.txt | |
|             gpt2_half.txt | |
|             gpt2_half_beam.txt | |
|             matmul.txt | |
|             matmul_amd.txt | |
|             sd.txt | |
|             sdxl.txt | |
|             mixtral.txt | |
|       # Copy the replay script out of the tree first so this commit's version is the one | |
|       # executed after the working tree is switched to origin/master. | |
|       - name: Run process replay tests | |
|         run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
|   # Training benchmarks (MNIST and CIFAR) on the self-hosted "tinybox" Linux runner, run with AMD=1. | |
|   testmoreamdbenchmark: | |
|     name: tinybox red Training Benchmark | |
|     runs-on: [self-hosted, Linux, tinybox] | |
|     timeout-minutes: 60 | |
|     defaults: | |
|       run: | |
|         # -e plus pipefail: a failing benchmark on the left of `| tee` fails the step | |
|         shell: bash -e -o pipefail {0} | |
|     if: github.repository_owner == 'tinygrad' | |
|     steps: | |
|       - name: Checkout Code | |
|         uses: actions/checkout@v4 | |
|       # `|| true`: a failing rmmod must not fail the step | |
|       - name: Remove amdgpu | |
|         run: sudo rmmod amdgpu || true | |
|       - name: Cleanup running AM processes | |
|         run: python extra/amdpci/am_smi.py --pids --kill | |
|       # Link weights and datasets stored on the runner (home directory and /raid) into the checkout. | |
|       - name: Symlink models and datasets | |
|         run: | | |
|           # create both target directories before any link is placed inside them | |
|           mkdir -p weights extra/datasets | |
|           ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
|           ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
|           ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
|           ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
|           ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
|           ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
|       # Staging branch only: export CACHEDB for later steps and start from an empty database. | |
|       - name: setup staging db | |
|         if: github.ref == 'refs/heads/update_benchmark_staging' | |
|         run: | | |
|           echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
|           rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
|       - name: reset process replay | |
|         run: test/external/process_replay/reset.py | |
|       - name: Train MNIST | |
|         run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt | |
|       - name: Run 10 CIFAR training steps | |
|         run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=85 AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt | |
|       - name: Run 10 CIFAR training steps w HALF | |
|         run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=188 AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt | |
|       # - name: Run 10 CIFAR training steps w BF16 | |
|       #   run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt | |
|       - name: Run 10 CIFAR training steps w winograd | |
|         run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=66 AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt | |
|       - name: Run full CIFAR training w 1 GPU | |
|         run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt | |
|       #- name: Run full CIFAR training steps w 6 GPUS | |
|       #  run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt | |
|       #- name: Run full CIFAR training steps w 6 GPUS (REMOTE) | |
|       #  run: time BENCHMARK_LOG=cifar_6gpu_remote REMOTE=1 REMOTEDEV=AMD DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu_remote.txt | |
|       # Collect the tee'd logs; entries whose producing step is commented out are simply absent. | |
|       - uses: actions/upload-artifact@v4 | |
|         with: | |
|           name: Speed (AMD Training) | |
|           path: | | |
|             beautiful_mnist.txt | |
|             train_cifar.txt | |
|             train_cifar_half.txt | |
|             train_cifar_bf16.txt | |
|             train_cifar_wino.txt | |
|             train_cifar_one_gpu.txt | |
|             train_cifar_six_gpu.txt | |
|             train_cifar_six_gpu_remote.txt | |
|       # Copy the replay script out of the tree first so this commit's version is the one | |
|       # executed after the working tree is switched to origin/master. | |
|       - name: Run process replay tests | |
|         run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
# MLPerf model benchmarks on the self-hosted tinybox runner, executed with AMD=1.
testmlperfamdbenchmark:
  name: tinybox red MLPerf Benchmark
  runs-on: [self-hosted, Linux, tinybox]
  timeout-minutes: 60
  defaults:
    run:
      # -e plus pipefail: a failing command on the left of `| tee` fails the step
      shell: bash -e -o pipefail {0}
  # only run in repositories owned by the tinygrad organization
  if: github.repository_owner == 'tinygrad'
  steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    # `|| true`: a failing rmmod does not fail the step
    - name: Remove amdgpu
      run: sudo rmmod amdgpu || true
    - name: Cleanup running AM processes
      run: python extra/amdpci/am_smi.py --pids --kill
    # Both target directories are created up front, before anything is linked
    # into them (previously extra/datasets was only created after the first
    # link into it).
    - name: Symlink models and datasets
      run: |
        mkdir -p weights extra/datasets
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    # On the staging branch only: export CACHEDB for the following steps and
    # delete any database files left at that path.
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: Run MLPerf resnet eval
      run: time BENCHMARK_LOG=resnet_eval AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
    # - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
    #   run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
    # - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
    #   run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
    - name: Run 10 MLPerf Bert training steps (6 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
      # CAPTURE_PROCESS_REPLAY=0 overrides the workflow-level value for this command only
      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
    # train_resnet*.txt are written only by the commented-out ResNet steps above,
    # so they are absent while those steps stay disabled.
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AMD MLPerf)
        path: |
          train_resnet.txt
          train_resnet_one_gpu.txt
          train_bert.txt
    # process_replay.py is copied out before origin/master is checked out, so
    # the script that runs is the copy from the commit under test.
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
# openpilot model benchmarks and compiles on the self-hosted comma runner
# (QCOM=1), followed by a quantized MobileNetV2 run with DSP=1.
testqualcommbenchmark:
  name: comma Benchmark
  runs-on: [self-hosted, Linux, comma]
  timeout-minutes: 20
  defaults:
    run:
      # -e plus pipefail: a failing command on the left of `| tee` fails the step
      shell: bash -e -o pipefail {0}
  # only run in repositories owned by the tinygrad organization
  if: github.repository_owner == 'tinygrad'
  steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    # On the staging branch only: export CACHEDB for the following steps and
    # delete any database files left at that path.
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    # Each openpilot 0.9.9 step tees its output to a log file so the artifact
    # upload below has something to collect; `taskset -c 4-7` pins the run to
    # CPUs 4-7.
    - name: benchmark openpilot 0.9.9 driving_vision
      run: BENCHMARK_LOG=openpilot_0_9_9_vision ASSERT_MIN_STEP_TIME=30 PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_vision.onnx | tee openpilot_0_9_9_vision.txt
    - name: benchmark openpilot 0.9.9 driving_policy
      run: BENCHMARK_LOG=openpilot_0_9_9_policy ASSERT_MIN_STEP_TIME=45 PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_policy.onnx | tee openpilot_0_9_9_policy.txt
    - name: benchmark openpilot 0.9.9 dmonitoring
      run: BENCHMARK_LOG=openpilot_0_9_9_dmonitoring ASSERT_MIN_STEP_TIME=70 PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/dmonitoring_model.onnx | tee openpilot_0_9_9_dmonitoring.txt
    - name: openpilot compile3 0.9.9 driving_vision
      run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_vision.onnx | tee openpilot_compile_0_9_9_vision.txt
    - name: openpilot compile3 0.9.9 driving_policy
      run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/driving_policy.onnx | tee openpilot_compile_0_9_9_policy.txt
    - name: openpilot compile3 0.9.9 dmonitoring
      run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.9/selfdrive/modeld/models/dmonitoring_model.onnx | tee openpilot_compile_0_9_9_dmonitoring.txt
    - name: openpilot compile3 Space Lab policy + vision
      run: |
        PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/22aec22a10ce09384d4a4af2a0bbff08d54af7e0c888503508f356fae4ff0e29
        PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/c824f68646a3b94f117f01c70dc8316fb466e05fbd42ccdba440b8a8dc86914b
    # First command writes /tmp/model.quant.onnx, second one runs that file with DSP=1.
    - name: benchmark MobileNetV2 on DSP
      run: |
        # generate quantized weights
        ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
        ln -s /data/home/tiny/tinygrad/testsig-*.so .
        PYTHONPATH=. CC=clang-19 CPU=1 CPU_LLVM=0 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
        # benchmark on DSP with NOOPT=1, the devectorizer has issues
        PYTHONPATH=. CC=clang-19 DSP=1 DONT_REALIZE_EXPAND=1 NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
    # Uploaded before process replay (as in the other benchmark jobs) so the
    # logs are kept even when process replay fails. The paths are exactly the
    # files written by the `tee` calls above.
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (comma)
        path: |
          openpilot_0_9_9_vision.txt
          openpilot_0_9_9_policy.txt
          openpilot_0_9_9_dmonitoring.txt
          openpilot_compile_0_9_9_vision.txt
          openpilot_compile_0_9_9_policy.txt
          openpilot_compile_0_9_9_dmonitoring.txt
    # process_replay.py is copied out before origin/master is checked out, so
    # the script that runs is the copy from the commit under test.
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
# Driver start-up, copy-speed, GEMM and short training benchmarks with AMD=1
# on the self-hosted tinyboxrandom runner.
testreddriverbenchmark:
  name: AM Benchmark
  runs-on: [self-hosted, Linux, tinyboxrandom]
  timeout-minutes: 20
  defaults:
    run:
      # -e plus pipefail: a failing command on the left of `| tee` fails the step
      shell: bash -e -o pipefail {0}
  # only run in repositories owned by the tinygrad organization
  if: github.repository_owner == 'tinygrad'
  steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Remove amd modules
      run: ./extra/hcq/hcq_smi.py amd rmmod
    - name: Kill stale pids
      run: ./extra/hcq/hcq_smi.py amd kill_pids
    # Both target directories are created up front, before anything is linked
    # into them (previously extra/datasets was only created after the first
    # link into it).
    - name: Symlink models and datasets
      run: |
        mkdir -p weights extra/datasets
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    # On the staging branch only: export CACHEDB for the following steps and
    # delete any database files left at that path.
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    # Same test twice: the cold run sets AM_RESET=1, the warm run does not.
    - name: Test driver cold start time
      run: time DEBUG=3 AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus
    - name: Test driver warm start time
      run: time DEBUG=3 AMD=1 python3 test/test_tiny.py TestTiny.test_plus
    # Fails on 9070
    # - name: Test tensor cores
    #   run: |
    #     AMD=1 AMD_LLVM=0 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
    #     AMD=1 AMD_LLVM=1 python3 test/test_linearizer.py test/opt/test_tensor_cores.py
    #     AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
    - name: Run Tensor Core GEMM (AMD)
      run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee am_matmul_amd.txt
    - name: Test AMD=1
      run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py
    - name: Test DISK copy time
      run: AMD=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
    - name: Test CPU copy time
      run: |
        AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
        AMD=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
    - name: Run full CIFAR training w 1 GPU
      run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee am_train_cifar_one_gpu.txt
    # TODO: enable
    # - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
    #   run: BENCHMARK_LOG=resnet_10steps AMD=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee am_train_resnet_one_gpu.txt
    - name: Run 10 MLPerf Bert training steps (1 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
      # CAPTURE_PROCESS_REPLAY=0 overrides the workflow-level value for this command only
      run: BENCHMARK_LOG=bert_10steps AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee am_train_bert_one_gpu.txt
    # am_train_resnet_one_gpu.txt is written only by the commented-out ResNet
    # step above, so it is absent while that step stays disabled.
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AM Driver)
        path: |
          am_matmul_amd.txt
          am_train_cifar_one_gpu.txt
          am_train_resnet_one_gpu.txt
          am_train_bert_one_gpu.txt
    # process_replay.py is copied out before origin/master is checked out, so
    # the script that runs is the copy from the commit under test.
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
# Driver start-up, tensor-core, copy-speed, LLaMA-3 and short training
# benchmarks with NV=1 on the self-hosted tinyboxrandom runner.
testgreendriverbenchmark:
  name: NV Benchmark
  runs-on: [self-hosted, Linux, tinyboxrandom]
  timeout-minutes: 20
  defaults:
    run:
      # -e plus pipefail: a failing command on the left of `| tee` fails the step
      shell: bash -e -o pipefail {0}
  # only run in repositories owned by the tinygrad organization
  if: github.repository_owner == 'tinygrad'
  steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Remove nv modules
      run: ./extra/hcq/hcq_smi.py nv rmmod
    - name: Kill stale pids
      run: ./extra/hcq/hcq_smi.py nv kill_pids
    # Both target directories are created up front, before anything is linked
    # into them (previously extra/datasets was only created after the first
    # link into it).
    - name: Symlink models and datasets
      run: |
        mkdir -p weights extra/datasets
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    # On the staging branch only: export CACHEDB for the following steps and
    # delete any database files left at that path.
    - name: setup staging db
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: Test driver start time
      run: time DEBUG=3 NV=1 python3 test/test_tiny.py TestTiny.test_plus
    - name: Test tensor cores
      run: NV=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
    - name: Test DISK copy time
      run: NV=1 TESTFILE=/raid/downloads/llama3-8b-sfr/model-00001-of-00004.safetensors python3 test/external/external_benchmark_disk_raw.py
    - name: Test CPU copy time
      run: |
        NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyDefaulttoCPUJit
        NV=1 GRAPH_ONE_KERNEL=1 PYTHONPATH=. NSZ=8192 python3 test/speed/external_test_copy_speed.py TestCopySpeed.testCopyCPUtoDefaultJit
    - name: Test LLAMA-3
      run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --benchmark --temperature 0 | tee nv_llama3_beam.txt
    - name: Run full CIFAR training w 1 GPU
      run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee nv_train_cifar_one_gpu.txt
    # - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
    #   run: BENCHMARK_LOG=resnet_10steps NV=1 MNISTMOCK=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee nv_train_resnet_one_gpu.txt
    - name: Run 10 MLPerf Bert training steps (1 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
      # CAPTURE_PROCESS_REPLAY=0 overrides the workflow-level value for this command only
      run: BENCHMARK_LOG=bert_10steps NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=1 BERT_LAYERS=2 MODEL=bert python3 examples/mlperf/model_train.py | tee nv_train_bert_one_gpu.txt
    # nv_train_resnet_one_gpu.txt is written only by the commented-out ResNet
    # step above, so it is absent while that step stays disabled.
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (NV Driver)
        path: |
          nv_llama3_beam.txt
          nv_train_cifar_one_gpu.txt
          nv_train_resnet_one_gpu.txt
          nv_train_bert_one_gpu.txt
    # process_replay.py is copied out before origin/master is checked out, so
    # the script that runs is the copy from the commit under test.
    - name: Run process replay tests
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py