viz: allow end_time=None in trace events (#11092) #29
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmarks workflow: triggered by pushes to the benchmark branches or by a
# manual dispatch; the jobs below run on self-hosted runners.
name: Benchmarks

on:
  push:
    branches: [master, update_benchmark, update_benchmark_staging]
  workflow_dispatch:
    inputs:
      # Boolean input offered on manual dispatch; defaults to false.
      run_process_replay:
        description: 'Run process replay tests'
        required: false
        default: false
        type: boolean

# Workflow-wide environment; individual steps override CAPTURE_PROCESS_REPLAY
# inline where needed.
env:
  # TODO: this rescheduling makes gpt2, mixtral and llama unjitted slower
  # TODO: very slow for llama 70B and resnet training 6 GPU
  CAPTURE_PROCESS_REPLAY: '1'
  ASSERT_PROCESS_REPLAY: '0'
  PYTHONPATH: .
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

jobs:
| testmacbenchmark: | |
| name: Mac Benchmark | |
| env: | |
| # since sudo is required for usbgpu on macos, move the cache to a new location, as some of the files are owned by root | |
| PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache | |
| runs-on: [self-hosted, macOS] | |
| timeout-minutes: 20 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v4 | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| ln -s ~/tinygrad/extra/disassemblers/applegpu extra/disassemblers/applegpu | |
| ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: python3.11 test/external/process_replay/reset.py | |
| - name: Run Stable Diffusion | |
| run: BENCHMARK_LOG=stable_diffusion JIT=1 python3.11 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt | |
| - name: Run Stable Diffusion without fp16 | |
| run: BENCHMARK_LOG=stable_diffusion_fp32 JIT=1 python3.11 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd_no_fp16.txt | |
| - name: Run Stable Diffusion v2 | |
| run: BENCHMARK_LOG=stable_diffusion_v2 JIT=1 python3.11 examples/sdv2.py --fp16 --seed 0 --noshow --timing | tee sdv2.txt | |
| # process replay can't capture this, the graph is too large | |
| - name: Run SDXL | |
| run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 JIT=1 python3.11 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt | |
| - name: Run model inference benchmark | |
| run: METAL=1 python3.11 test/external/external_model_benchmark.py | |
| - name: Run huggingface_onnx test | |
| run: METAL=1 python3.11 extra/huggingface_onnx/run_models.py test --debug FacebookAI/xlm-roberta-large | |
| - name: Test speed vs torch | |
| run: BIG=2 MPS=1 python3.11 test/test_speed_v_torch.py | tee torch_speed.txt | |
| - name: Test tensor cores | |
| run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops | |
| - name: Test AMX tensor cores | |
| run: | | |
| DEBUG=2 CPU=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops | |
| DEBUG=2 LLVM=1 AMX=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops | |
| - name: Run Tensor Core GEMM (float) | |
| run: DEBUG=2 SHOULD_USE_TC=1 python3.11 extra/gemm/simple_matmul.py | tee matmul.txt | |
| - name: Run Tensor Core GEMM (half) | |
| run: DEBUG=2 SHOULD_USE_TC=1 HALF=1 python3.11 extra/gemm/simple_matmul.py | tee matmul_half.txt | |
| - name: Run Tensor Core GEMM (bfloat16) | |
| run: DEBUG=2 SHOULD_USE_TC=1 BFLOAT16=1 python3.11 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt | |
| - name: Fuzz Padded Tensor Core GEMM | |
| run: METAL=1 M_START=6 M_STOP=10 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=6 K_STOP=24 K_STEP=1 TC_OPT=2 DEBUG=2 python3.11 ./extra/gemm/fuzz_matmul.py | |
| - name: Run LLaMA | |
| run: | | |
| BENCHMARK_LOG=llama_nojit JIT=0 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt | |
| BENCHMARK_LOG=llama JIT=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt | |
| - name: Run LLaMA with BEAM | |
| run: BENCHMARK_LOG=llama_beam JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt | |
| - name: Run quantized LLaMA | |
| run: | | |
| BENCHMARK_LOG=llama_int8 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt | |
| BENCHMARK_LOG=llama_nf4 python3.11 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt | |
| - name: Run quantized LLaMA3 | |
| run: | | |
| BENCHMARK_LOG=llama3_int8 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize int8 | tee llama3_int8.txt | |
| BENCHMARK_LOG=llama3_nf4 python3.11 examples/llama3.py --size 8B --temperature 0 --benchmark --quantize nf4 | tee llama3_nf4.txt | |
| #- name: Run LLaMA 7B on 4 (virtual) GPUs | |
| # run: python3.11 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt | |
| - name: Run GPT2 | |
| run: | | |
| BENCHMARK_LOG=gpt2_nojit JIT=0 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt | |
| BENCHMARK_LOG=gpt2 JIT=1 python3.11 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt | |
| - name: Run GPT2 w HALF | |
| run: BENCHMARK_LOG=gpt2_half HALF=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt | |
| - name: Run GPT2 w HALF/BEAM | |
| run: BENCHMARK_LOG=gpt2_half_beam HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3.11 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt | |
| - name: Run OLMoE | |
| run: BENCHMARK_LOG=olmoe python3.11 examples/olmoe.py | |
| - name: Train MNIST | |
| run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.0 python3.11 examples/beautiful_mnist.py | tee beautiful_mnist.txt | |
| - name: Run 10 CIFAR training steps | |
| run: BENCHMARK_LOG=cifar_10steps JIT=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar.txt | |
| - name: Run 10 CIFAR training steps w HALF | |
| run: BENCHMARK_LOG=cifar_10steps_half JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3.11 examples/hlb_cifar10.py | tee train_cifar_half.txt | |
| #- name: Run 10 CIFAR training steps w BF16 | |
| # run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3.11 examples/hlb_cifar10.py | tee train_cifar_bf16.txt | |
| - name: Run 10 CIFAR training steps w winograd | |
| run: BENCHMARK_LOG=cifar_10steps_wino JIT=1 WINO=1 STEPS=10 python3.11 examples/hlb_cifar10.py | tee train_cifar_wino.txt | |
| - name: UsbGPU boot time | |
| run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus | |
| - name: UsbGPU tiny tests | |
| run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/test_tiny.py | |
| - name: UsbGPU copy speeds | |
| run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB python3.11 test/external/external_test_usb_asm24.py TestDevCopySpeeds | |
| - name: UsbGPU openpilot test | |
| run: sudo -E PYTHONPATH=. AMD=1 AMD_IFACE=USB NOLOCALS=0 IMAGE=0 GRAPH_ONE_KERNEL=1 python3.11 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx | |
| - uses: actions/upload-artifact@v4 | |
| with: | |
| name: Speed (Mac) | |
| path: | | |
| onnx_inference_speed.csv | |
| torch_speed.txt | |
| llama_unjitted.txt | |
| llama_jitted.txt | |
| llama_beam.txt | |
| llama_int8.txt | |
| llama_nf4.txt | |
| llama3_int8.txt | |
| llama3_nf4.txt | |
| llama_four_gpu.txt | |
| gpt2_unjitted.txt | |
| gpt2_jitted.txt | |
| gpt2_half.txt | |
| gpt2_half_beam.txt | |
| matmul.txt | |
| matmul_half.txt | |
| matmul_bfloat16.txt | |
| sd.txt | |
| sd_no_fp16.txt | |
| sdv2.txt | |
| sdxl.txt | |
| beautiful_mnist.txt | |
| train_cifar.txt | |
| train_cifar_half.txt | |
| train_cifar_bf16.txt | |
| train_cifar_wino.txt | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3.11 process_replay.py | |
| testnvidiabenchmark: | |
| name: tinybox green Benchmark | |
| runs-on: [self-hosted, Linux, tinyboxgreen] | |
| timeout-minutes: 30 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v4 | |
| - name: Print nvidia-smi | |
| run: nvidia-smi | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| ln -s /raid/weights/LLaMA-3 weights/LLaMA-3 | |
| mkdir -p extra/datasets | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| - name: Run model inference benchmark | |
| run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py | |
| - name: Test speed vs torch | |
| run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt | |
| - name: Test speed vs theoretical | |
| run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20 | |
| - name: Test benchmark allreduce | |
| run: NV=1 python test/external/external_benchmark_multitensor_allreduce.py | |
| - name: Test tensor cores | |
| run: | | |
| NV=1 ALLOW_TF32=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops | |
| PTX=1 ALLOW_TF32=1 NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops | |
| - name: Run Tensor Core GEMM (CUDA) | |
| run: | | |
| CUDA=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt | |
| CUDA=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt | |
| CUDA=1 SHOULD_USE_TC=1 ALLOW_TF32=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee matmul_tf32.txt | |
| - name: Run Tensor Core GEMM (PTX) | |
| run: NV=1 PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt | |
| - name: Run Tensor Core GEMM (NV) | |
| run: NV=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_nv.txt | |
| - name: Test NV=1 | |
| run: DEBUG=2 NV=1 python -m pytest -rA test/test_tiny.py | |
| - name: Test CUDA=1 | |
| run: DEBUG=2 CUDA=1 python -m pytest -rA test/test_tiny.py | |
| - name: Run Stable Diffusion | |
| run: BENCHMARK_LOG=stable_diffusion NV=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt | |
| - name: Run SDXL | |
| run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt | |
| - name: Run LLaMA | |
| run: | | |
| BENCHMARK_LOG=llama_nojit NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt | |
| BENCHMARK_LOG=llama NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt | |
| - name: Run LLaMA with BEAM | |
| run: BENCHMARK_LOG=llama_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt | |
| # - name: Run LLaMA 7B on 4 GPUs | |
| # run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt | |
| # - name: Run LLaMA 7B on 6 GPUs | |
| # run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt | |
| - name: Run LLaMA-3 8B BEAM | |
| run: BENCHMARK_LOG=llama3_beam NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt | |
| - name: Run LLaMA-3 8B on 4 GPUs with BEAM | |
| run: BENCHMARK_LOG=llama3_beam_4gpu NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt | |
| # - name: Run LLaMA-3 8B on 6 GPUs | |
| # run: NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt | |
| # - name: Run LLaMA-2 70B | |
| # run: NV=1 CAPTURE_PROCESS_REPLAY=0 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt | |
| - name: Run Mixtral 8x7B | |
| run: time BENCHMARK_LOG=mixtral NV=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt | |
| - name: Run GPT2 | |
| run: | | |
| BENCHMARK_LOG=gpt2_nojit NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt | |
| BENCHMARK_LOG=gpt2 NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt | |
| - name: Run GPT2 w HALF | |
| run: BENCHMARK_LOG=gpt2_half NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt | |
| - name: Run GPT2 w HALF/BEAM | |
| run: BENCHMARK_LOG=gpt2_half_beam NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt | |
| - uses: actions/upload-artifact@v4 | |
| with: | |
| name: Speed (NVIDIA) | |
| path: | | |
| onnx_inference_speed.csv | |
| torch_speed.txt | |
| matmul.txt | |
| matmul_bfloat16.txt | |
| matmul_tf32.txt | |
| matmul_ptx.txt | |
| matmul_nv.txt | |
| sd.txt | |
| sdxl.txt | |
| llama_unjitted.txt | |
| llama_jitted.txt | |
| llama_beam.txt | |
| llama3_beam.txt | |
| llama3_four_gpu.txt | |
| llama3_six_gpu.txt | |
| llama_2_70B.txt | |
| mixtral.txt | |
| gpt2_unjitted.txt | |
| gpt2_jitted.txt | |
| gpt2_half.txt | |
| gpt2_half_beam.txt | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| testmorenvidiabenchmark: | |
| name: tinybox green Training Benchmark | |
| runs-on: [self-hosted, Linux, tinyboxgreen] | |
| timeout-minutes: 20 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v4 | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| mkdir -p extra/datasets | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| - name: Fuzz Padded Tensor Core GEMM (NV) | |
| run: NV=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py | |
| - name: Fuzz Padded Tensor Core GEMM (PTX) | |
| run: NV=1 PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py | |
| - name: Train MNIST | |
| run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt | |
| - name: Run 10 CIFAR training steps | |
| run: BENCHMARK_LOG=cifar_10steps NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt | |
| - name: Run 10 CIFAR training steps w HALF | |
| run: BENCHMARK_LOG=cifar_10steps_half NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt | |
| - name: Run 10 CIFAR training steps w BF16 | |
| run: BENCHMARK_LOG=cifar_10steps_bf16 NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt | |
| - name: Run 10 CIFAR training steps w winograd | |
| run: BENCHMARK_LOG=cifar_10steps_half_wino NV=1 CAPTURE_PROCESS_REPLAY=0 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt | |
| - name: Run full CIFAR training w 1 GPU | |
| run: time BENCHMARK_LOG=cifar NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt | |
| - name: Run full CIFAR training steps w 6 GPUS | |
| run: time BENCHMARK_LOG=cifar_6gpu CAPTURE_PROCESS_REPLAY=0 NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt | |
| - name: Run MLPerf resnet eval on training data | |
| run: time BENCHMARK_LOG=resnet_eval NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py | |
| - name: Run 10 MLPerf ResNet50 training steps (1 gpu) | |
| run: BENCHMARK_LOG=resnet_10steps NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt | |
| - name: Run 10 MLPerf ResNet50 training steps (6 gpu) | |
| run: BENCHMARK_LOG=resnet_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt | |
| - name: Run 10 MLPerf Bert training steps (6 gpu) | |
| # TODO: remove BERT_LAYERS once scheduler is fast | |
| run: BENCHMARK_LOG=bert_10steps_6gpu NV=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt | |
| - uses: actions/upload-artifact@v4 | |
| with: | |
| name: Speed (NVIDIA Training) | |
| path: | | |
| beautiful_mnist.txt | |
| train_cifar.txt | |
| train_cifar_half.txt | |
| train_cifar_bf16.txt | |
| train_cifar_wino.txt | |
| train_cifar_one_gpu.txt | |
| train_cifar_six_gpu.txt | |
| train_resnet.txt | |
| train_resnet_one_gpu.txt | |
| train_bert.txt | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| testamdbenchmark: | |
| name: tinybox red Benchmark | |
| runs-on: [self-hosted, Linux, tinybox] | |
| timeout-minutes: 20 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v4 | |
| - name: Remove amdgpu | |
| run: sudo rmmod amdgpu || true | |
| - name: Cleanup running AM processes | |
| run: python extra/amdpci/am_smi.py --pids --kill | |
| #- name: Insert amdgpu | |
| # run: sudo modprobe amdgpu | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| ln -s /raid/weights/LLaMA-3 weights/LLaMA-3 | |
| mkdir -p extra/datasets | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| #- name: setup perflevel | |
| # run: | | |
| # examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh | |
| # rocm-smi | |
| #- name: Show off tinybox | |
| # run: /opt/rocm/bin/rocm-bandwidth-test | |
| # TODO: unstable on AMD | |
| #- name: Run model inference benchmark | |
| # run: LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 NOCLANG=1 python3 test/external/external_model_benchmark.py | |
| # TODO: unstable on AMD | |
| #- name: Test speed vs torch | |
| # run: | | |
| # python3 -c "import torch; print(torch.__version__)" | |
| # LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt | |
| - name: Test speed vs theoretical | |
| run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20 | |
| - name: Test tensor cores | |
| run: | | |
| AMD=1 AMD_LLVM=0 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops | |
| AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops | |
| AMD=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | |
| - name: Run Tensor Core GEMM (AMD) | |
| run: AMD=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee matmul_amd.txt | |
| - name: Test AMD=1 | |
| run: DEBUG=2 AMD=1 python -m pytest -rA test/test_tiny.py | |
| #- name: Test HIP=1 | |
| # run: DEBUG=2 HIP=1 python -m pytest -rA test/test_tiny.py | |
| # TODO: AMD compiler bug causes this to fail | |
| #- name: Fuzz Padded Tensor Core GEMM | |
| # run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py | |
| #- name: Remove amdgpu | |
| # run: sleep 10 && sudo rmmod amdgpu # sleep a bit to let the driver unload the prev pid. | |
| - name: Test AM cold start time | |
| run: time AMD=1 AM_RESET=1 python3 test/test_tiny.py TestTiny.test_plus | |
| - name: Test AM warm start time | |
| run: time AMD=1 python3 test/test_tiny.py TestTiny.test_plus | |
| - name: Run Stable Diffusion | |
| run: BENCHMARK_LOG=stable_diffusion AMD=1 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd.txt | |
| - name: Run SDXL | |
| run: BENCHMARK_LOG=stable_diffusion_xl CAPTURE_PROCESS_REPLAY=0 AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt | |
| - name: Run LLaMA 7B | |
| run: | | |
| BENCHMARK_LOG=llama_nojit AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt | |
| BENCHMARK_LOG=llama AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt | |
| - name: Run LLaMA 7B with BEAM | |
| run: BENCHMARK_LOG=llama_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt | |
| # - name: Run LLaMA 7B on 4 GPUs | |
| # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt | |
| # - name: Run LLaMA 7B on 6 GPUs | |
| # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt | |
| - name: Run LLaMA-3 8B BEAM | |
| run: BENCHMARK_LOG=llama3_beam AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt | |
| - name: Run LLaMA-3 8B on 4 GPUs with BEAM | |
| run: BENCHMARK_LOG=llama3_beam_4gpu AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt | |
| # - name: Run LLaMA-3 8B on 6 GPUs | |
| # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt | |
| #- name: Restore amdgpu | |
| # run: sudo modprobe amdgpu | |
| # - name: Run LLaMA-2 70B | |
| # run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt | |
| - name: Run Mixtral 8x7B | |
| run: time BENCHMARK_LOG=mixtral AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt | |
| - name: Run GPT2 | |
| run: | | |
| BENCHMARK_LOG=gpt2_nojit AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt | |
| BENCHMARK_LOG=gpt2 AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt | |
| - name: Run GPT2 w HALF | |
| run: BENCHMARK_LOG=gpt2_half AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt | |
| - name: Run GPT2 w HALF/BEAM | |
| run: BENCHMARK_LOG=gpt2_half_beam AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt | |
| - uses: actions/upload-artifact@v4 | |
| with: | |
| name: Speed (AMD) | |
| path: | | |
| onnx_inference_speed.csv | |
| torch_speed.txt | |
| llama_unjitted.txt | |
| llama_jitted.txt | |
| llama_beam.txt | |
| llama3_beam.txt | |
| llama3_four_gpu.txt | |
| llama3_six_gpu.txt | |
| llama_2_70B.txt | |
| gpt2_unjitted.txt | |
| gpt2_jitted.txt | |
| gpt2_half.txt | |
| gpt2_half_beam.txt | |
| matmul.txt | |
| matmul_amd.txt | |
| sd.txt | |
| sdxl.txt | |
| mixtral.txt | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
| testmoreamdbenchmark: | |
| name: tinybox red Training Benchmark | |
| runs-on: [self-hosted, Linux, tinybox] | |
| timeout-minutes: 30 | |
| defaults: | |
| run: | |
| shell: bash -e -o pipefail {0} | |
| if: github.repository_owner == 'tinygrad' | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@v4 | |
| - name: Remove amdgpu | |
| run: sudo rmmod amdgpu || true | |
| - name: Cleanup running AM processes | |
| run: python extra/amdpci/am_smi.py --pids --kill | |
| - name: Symlink models and datasets | |
| run: | | |
| mkdir -p weights | |
| ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz | |
| ln -s ~/tinygrad/weights/LLaMA weights/LLaMA | |
| ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz | |
| ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen | |
| ln -s /raid/weights/LLaMA-2 weights/LLaMA-2 | |
| mkdir -p extra/datasets | |
| ln -s /raid/datasets/imagenet extra/datasets/imagenet | |
| - name: setup staging db | |
| if: github.ref == 'refs/heads/update_benchmark_staging' | |
| run: | | |
| echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV | |
| rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal | |
| - name: reset process replay | |
| run: test/external/process_replay/reset.py | |
| - name: Train MNIST | |
| run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt | |
| - name: Run 10 CIFAR training steps | |
| run: BENCHMARK_LOG=cifar_10steps AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt | |
| - name: Run 10 CIFAR training steps w HALF | |
| run: BENCHMARK_LOG=cifar_10steps_half AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt | |
| - name: Run 10 CIFAR training steps w BF16 | |
| run: BENCHMARK_LOG=cifar_10steps_bf16 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt | |
| - name: Run 10 CIFAR training steps w winograd | |
| run: BENCHMARK_LOG=cifar_10steps_half_wino AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt | |
| - name: Run full CIFAR training w 1 GPU | |
| run: time BENCHMARK_LOG=cifar AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt | |
| - name: Run full CIFAR training steps w 6 GPUS | |
| run: time BENCHMARK_LOG=cifar_6gpu AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt | |
| - name: Run full CIFAR training steps w 6 GPUS (REMOTE) | |
| run: time BENCHMARK_LOG=cifar_6gpu_remote REMOTE=1 REMOTEDEV=AMD DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu_remote.txt | |
| - uses: actions/upload-artifact@v4 | |
| with: | |
| name: Speed (AMD Training) | |
| path: | | |
| beautiful_mnist.txt | |
| train_cifar.txt | |
| train_cifar_half.txt | |
| train_cifar_bf16.txt | |
| train_cifar_wino.txt | |
| train_cifar_one_gpu.txt | |
| train_cifar_six_gpu.txt | |
| train_cifar_six_gpu_remote.txt | |
| - name: Run process replay tests | |
| run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py | |
# Short MLPerf ResNet50 / BERT benchmark runs with AMD=1 on the self-hosted tinybox runner.
testmlperfamdbenchmark:
  name: tinybox red MLPerf Benchmark
  runs-on: [self-hosted, Linux, tinybox]
  timeout-minutes: 30
  defaults:
    run:
      # -e plus pipefail: a failing command fails the step even when its output is piped into tee
      shell: bash -e -o pipefail {0}
  # only run on the upstream organisation's repository
  if: github.repository_owner == 'tinygrad'
  steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Remove amdgpu
      # best effort: `|| true` keeps the step green when rmmod fails
      run: sudo rmmod amdgpu || true
    - name: Cleanup running AM processes
      run: python extra/amdpci/am_smi.py --pids --kill
    - name: Symlink models and datasets
      run: |
        # create every directory that receives a link before the first ln -s;
        # the shell runs with -e, so a missing parent directory would abort the step
        mkdir -p weights extra/datasets
        ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
        ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
        ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
        ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
        ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
        ln -s /raid/datasets/imagenet extra/datasets/imagenet
    - name: setup staging db
      # the staging branch exports CACHEDB=/tmp/staging.db to later steps and removes any existing copy
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: Run MLPerf resnet eval
      run: time BENCHMARK_LOG=resnet_eval AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
    - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
      run: BENCHMARK_LOG=resnet_10steps AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
    - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
      # CAPTURE_PROCESS_REPLAY=0 overrides the workflow-level "1" for this command only
      run: BENCHMARK_LOG=resnet_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
    - name: Run 10 MLPerf Bert training steps (6 gpu)
      # TODO: remove BERT_LAYERS once scheduler is fast
      run: BENCHMARK_LOG=bert_10steps_6gpu AMD=1 CAPTURE_PROCESS_REPLAY=0 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=66 GPUS=6 BERT_LAYERS=2 FUSE_ARANGE=1 FUSE_ARANGE_UINT=0 MODEL=bert python3 examples/mlperf/model_train.py | tee train_bert.txt
    # upload the tee'd training logs; the resnet eval step above writes no log file
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (AMD MLPerf)
        path: |
          train_resnet.txt
          train_resnet_one_gpu.txt
          train_bert.txt
    - name: Run process replay tests
      # copy the script out of the tree first: the checkout of origin/master replaces the tracked files
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py
# openpilot model validation/benchmarks with QCOM=1 and a MobileNetV2 DSP run on the self-hosted comma runner.
testqualcommbenchmark:
  name: comma Benchmark
  runs-on: [self-hosted, Linux, comma]
  timeout-minutes: 20
  defaults:
    run:
      # -e plus pipefail: a failing command fails the step even when its output is piped into tee
      shell: bash -e -o pipefail {0}
  # only run on the upstream organisation's repository
  if: github.repository_owner == 'tinygrad'
  steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: setup staging db
      # the staging branch exports CACHEDB=/tmp/staging.db to later steps and removes any existing copy
      if: github.ref == 'refs/heads/update_benchmark_staging'
      run: |
        echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
        rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
    - name: reset process replay
      run: test/external/process_replay/reset.py
    - name: validate openpilot 0.9.7
      # written to its own file so the IMAGE=2 benchmark below (which tees to
      # openpilot_image_0_9_7.txt) does not overwrite this log
      run: PYTHONPATH=. FLOAT16=0 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_validate_0_9_7.txt
    - name: benchmark openpilot 0.9.7
      run: BENCHMARK_LOG=openpilot_0_9_7 PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
    - name: benchmark openpilot w IMAGE=2 0.9.7
      run: BENCHMARK_LOG=openpilot_0_9_7_image PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
    - name: openpilot compile3 0.9.7
      run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx
    - name: openpilot compile3 0.9.7+ tomb raider
      # same script, model pinned to a specific openpilot commit instead of the v0.9.7 tag
      run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/e8bea2c78ffa92685ece511e9b554122aaf1a79d/selfdrive/modeld/models/supercombo.onnx
    - name: openpilot dmonitoring compile3 0.9.7
      run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/dmonitoring_model.onnx
    - name: benchmark MobileNetV2 on DSP
      run: |
        # generate quantized weights
        ln -s /data/home/tiny/tinygrad/extra/datasets/imagenet extra/datasets/imagenet
        ln -s /data/home/tiny/tinygrad/testsig-*.so .
        PYTHONPATH=. CC=clang-19 CPU=1 QUANT=1 CNT=0 python3 examples/test_onnx_imagenet.py https://github.com/xamcat/mobcat-samples/raw/refs/heads/master/onnx_runtime/InferencingSample/InferencingSample/mobilenetv2-7.onnx /tmp/model.quant.onnx
        # benchmark on DSP with NOOPT=1, the devectorizer has issues
        PYTHONPATH=. CC=clang-19 DSP=1 DONT_REALIZE_EXPAND=1 NOOPT=1 CNT=2 DEBUG=2 python3 examples/test_onnx_imagenet.py /tmp/model.quant.onnx
    # upload before process replay so the logs are kept even when process replay fails:
    # a failed step skips the following steps by default
    - uses: actions/upload-artifact@v4
      with:
        name: Speed (comma)
        # NOTE(review): no step in this job writes the *_0_9_4 or openpilot_compile_* files;
        # these entries look stale -- confirm before removing
        path: |
          openpilot_compile_0_9_4.txt
          openpilot_compile_0_9_7.txt
          openpilot_0_9_4.txt
          openpilot_0_9_7.txt
          openpilot_image_0_9_4.txt
          openpilot_image_0_9_7.txt
          openpilot_validate_0_9_7.txt
    - name: Run process replay tests
      # copy the script out of the tree first: the checkout of origin/master replaces the tracked files
      run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py