这是indexloc提供的服务,不要输入任何密码
Skip to content

将 tensor_parallel_size 设置为 2 时报错 #428

@Mikivishy

Description

@Mikivishy

默认的 tensor_parallel_size 为 1,我想把它修改为 2,但运行时报错:

2025-07-17 17:00:39 | INFO | stdout | (pid=47761) NCCL version 2.21.5+cuda11.0
2025-07-17 17:00:39 | ERROR | stderr | Traceback (most recent call last):
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/runpy.py", line 196, in _run_module_as_main
2025-07-17 17:00:39 | ERROR | stderr |     return _run_code(code, main_globals, None,
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/runpy.py", line 86, in _run_code
2025-07-17 17:00:39 | ERROR | stderr |     exec(code, run_globals)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/trainer/main.py", line 216, in <module>
2025-07-17 17:00:39 | ERROR | stderr |     main()
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/trainer/main.py", line 212, in main
2025-07-17 17:00:39 | ERROR | stderr |     ray.get(runner.run.remote(ppo_config))
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
2025-07-17 17:00:39 | ERROR | stderr |     return fn(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
2025-07-17 17:00:39 | ERROR | stderr |     return func(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/ray/_private/worker.py", line 2822, in get
2025-07-17 17:00:39 | ERROR | stderr |     values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/ray/_private/worker.py", line 930, in get_objects
2025-07-17 17:00:39 | ERROR | stderr |     raise value.as_instanceof_cause()
2025-07-17 17:00:39 | ERROR | stderr | ray.exceptions.RayTaskError(KeyError): ray::Runner.run() (pid=41424, ip=10.140.37.138, actor_id=aac3cbdc33303e9fb12a736501000000, repr=<main.Runner object at 0x7fc5f5d2cee0>)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/trainer/main.py", line 99, in run
2025-07-17 17:00:39 | ERROR | stderr |     trainer.init_workers()
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/trainer/ray_trainer.py", line 443, in init_workers
2025-07-17 17:00:39 | ERROR | stderr |     self.actor_rollout_wg.init_model()
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/single_controller/ray/base.py", line 47, in func
2025-07-17 17:00:39 | ERROR | stderr |     output = ray.get(output)
2025-07-17 17:00:39 | ERROR | stderr | ray.exceptions.RayTaskError(KeyError): ray::WorkerDict.actor_rollout_init_model() (pid=47760, ip=10.140.37.138, actor_id=c03b337d86fc2c524621f42701000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7f31ad170c10>)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/single_controller/ray/base.py", line 432, in func
2025-07-17 17:00:39 | ERROR | stderr |     return getattr(self.worker_dict[key], name)(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/single_controller/base/decorator.py", line 207, in inner
2025-07-17 17:00:39 | ERROR | stderr |     return func(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/fsdp_workers.py", line 384, in init_model
2025-07-17 17:00:39 | ERROR | stderr |     self._build_rollout()
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/fsdp_workers.py", line 315, in _build_rollout
2025-07-17 17:00:39 | ERROR | stderr |     self.rollout = vLLMRollout(
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/rollout/vllm_rollout_spmd.py", line 76, in __init__
2025-07-17 17:00:39 | ERROR | stderr |     self.inference_engine = LLM(
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/utils.py", line 1161, in inner
2025-07-17 17:00:39 | ERROR | stderr |     return fn(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 247, in __init__
2025-07-17 17:00:39 | ERROR | stderr |     self.llm_engine = LLMEngine.from_engine_args(
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 510, in from_engine_args
2025-07-17 17:00:39 | ERROR | stderr |     return engine_cls.from_vllm_config(
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 112, in from_vllm_config
2025-07-17 17:00:39 | ERROR | stderr |     return cls(vllm_config=vllm_config,
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 92, in __init__
2025-07-17 17:00:39 | ERROR | stderr |     self.engine_core = EngineCoreClient.make_client(
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 75, in make_client
2025-07-17 17:00:39 | ERROR | stderr |     return InprocClient(vllm_config, executor_class, log_stats)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 198, in __init__
2025-07-17 17:00:39 | ERROR | stderr |     self.engine_core = EngineCore(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 64, in __init__
2025-07-17 17:00:39 | ERROR | stderr |     self.model_executor = executor_class(vllm_config)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in __init__
2025-07-17 17:00:39 | ERROR | stderr |     self._init_executor()
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 121, in _init_executor
2025-07-17 17:00:39 | ERROR | stderr |     self.collective_rpc("init_device")
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
2025-07-17 17:00:39 | ERROR | stderr |     answer = run_method(self.driver_worker, method, args, kwargs)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/utils.py", line 2456, in run_method
2025-07-17 17:00:39 | ERROR | stderr |     return func(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 604, in init_device
2025-07-17 17:00:39 | ERROR | stderr |     self.worker.init_device()  # type: ignore
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 135, in init_device
2025-07-17 17:00:39 | ERROR | stderr |     init_worker_distributed_environment(self.vllm_config, self.rank,
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 326, in init_worker_distributed_environment
2025-07-17 17:00:39 | ERROR | stderr |     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 1022, in ensure_model_parallel_initialized
2025-07-17 17:00:39 | ERROR | stderr |     initialize_model_parallel(tensor_model_parallel_size,
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 975, in initialize_model_parallel
2025-07-17 17:00:39 | ERROR | stderr |     _TP = init_model_parallel_group(group_ranks,
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 788, in init_model_parallel_group
2025-07-17 17:00:39 | ERROR | stderr |     return GroupCoordinator(
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 252, in __init__
2025-07-17 17:00:39 | ERROR | stderr |     self.device_communicator = device_comm_cls(
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/cuda_communicator.py", line 47, in __init__
2025-07-17 17:00:39 | ERROR | stderr |     self.ca_comm = CustomAllreduce(
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce.py", line 145, in __init__
2025-07-17 17:00:39 | ERROR | stderr |     if not current_platform.is_rocm() and not _can_p2p(rank, world_size):
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce.py", line 37, in _can_p2p
2025-07-17 17:00:39 | ERROR | stderr |     if not gpu_p2p_access_check(rank, i):
2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce_utils.py", line 248, in gpu_p2p_access_check
2025-07-17 17:00:39 | ERROR | stderr |     return _gpu_p2p_access_cache[f"{src}->{tgt}"]
2025-07-17 17:00:39 | ERROR | stderr | KeyError: '1->0'
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=46880)�[0m 2025-07-17 16:59:58 | ERROR | stderr | 
�[36m(pid=46880)�[0m 2025-07-17 16:59:59 | ERROR | stderr | 
�[36m(pid=46880)�[0m 2025-07-17 16:59:59 | ERROR | stderr | 
�[36m(pid=46880)�[0m 2025-07-17 17:00:05 | ERROR | stderr | 
�[36m(pid=46880)�[0m 2025-07-17 17:00:06 | ERROR | stderr | 
�[36m(pid=46880)�[0m 2025-07-17 17:00:07 | ERROR | stderr | 
�[36m(pid=46880)�[0m 2025-07-17 17:00:07 | ERROR | stderr |
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr | Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): �[36mray::WorkerDict.actor_rollout_init_model()�[39m (pid=47762, ip=10.140.37.138, actor_id=fd8e5e2f1f8107d072c83c9801000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7fc572d50c10>)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/single_controller/ray/base.py", line 432, in func
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return getattr(self.worker_dict[key], name)(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/single_controller/base/decorator.py", line 207, in inner
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return func(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/fsdp_workers.py", line 384, in init_model
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self._build_rollout()
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/fsdp_workers.py", line 315, in _build_rollout
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.rollout = vLLMRollout(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/rollout/vllm_rollout_spmd.py", line 76, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.inference_engine = LLM(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/utils.py", line 1161, in inner
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return fn(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 247, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.llm_engine = LLMEngine.from_engine_args(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 510, in from_engine_args
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return engine_cls.from_vllm_config(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 112, in from_vllm_config
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return cls(vllm_config=vllm_config,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 92, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.engine_core = EngineCoreClient.make_client(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 75, in make_client
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return InprocClient(vllm_config, executor_class, log_stats)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 198, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.engine_core = EngineCore(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 64, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.model_executor = executor_class(vllm_config)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self._init_executor()
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 121, in _init_executor
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.collective_rpc("init_device")
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     answer = run_method(self.driver_worker, method, args, kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/utils.py", line 2456, in run_method
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return func(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 604, in init_device
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.worker.init_device()  # type: ignore
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 135, in init_device
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     init_worker_distributed_environment(self.vllm_config, self.rank,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 326, in init_worker_distributed_environment
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 1022, in ensure_model_parallel_initialized
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     initialize_model_parallel(tensor_model_parallel_size,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 975, in initialize_model_parallel
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     _TP = init_model_parallel_group(group_ranks,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 788, in init_model_parallel_group
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return GroupCoordinator(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 252, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.device_communicator = device_comm_cls(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/cuda_communicator.py", line 47, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.ca_comm = CustomAllreduce(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce.py", line 145, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     if not current_platform.is_rocm() and not _can_p2p(rank, world_size):
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce.py", line 37, in _can_p2p
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     if not gpu_p2p_access_check(rank, i):
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce_utils.py", line 248, in gpu_p2p_access_check
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return _gpu_p2p_access_cache[f"{src}->{tgt}"]
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr | KeyError: '1->0'
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr | Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): �[36mray::WorkerDict.actor_rollout_init_model()�[39m (pid=47761, ip=10.140.37.138, actor_id=47ed97652b87fc5762ebbb2401000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7eeb7b44f0a0>)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/single_controller/ray/base.py", line 432, in func
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return getattr(self.worker_dict[key], name)(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/single_controller/base/decorator.py", line 207, in inner
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return func(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/fsdp_workers.py", line 384, in init_model
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self._build_rollout()
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/fsdp_workers.py", line 315, in _build_rollout
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.rollout = vLLMRollout(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/rollout/vllm_rollout_spmd.py", line 76, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.inference_engine = LLM(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/utils.py", line 1161, in inner
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return fn(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 247, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.llm_engine = LLMEngine.from_engine_args(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 510, in from_engine_args
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return engine_cls.from_vllm_config(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 112, in from_vllm_config
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return cls(vllm_config=vllm_config,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 92, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.engine_core = EngineCoreClient.make_client(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 75, in make_client
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return InprocClient(vllm_config, executor_class, log_stats)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 198, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.engine_core = EngineCore(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 64, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.model_executor = executor_class(vllm_config)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self._init_executor()
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 121, in _init_executor
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.collective_rpc("init_device")
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     answer = run_method(self.driver_worker, method, args, kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/utils.py", line 2456, in run_method
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return func(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 604, in init_device
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.worker.init_device()  # type: ignore
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 135, in init_device
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     init_worker_distributed_environment(self.vllm_config, self.rank,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 326, in init_worker_distributed_environment
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 1022, in ensure_model_parallel_initialized
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     initialize_model_parallel(tensor_model_parallel_size,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 975, in initialize_model_parallel
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     _TP = init_model_parallel_group(group_ranks,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 788, in init_model_parallel_group
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return GroupCoordinator(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 252, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.device_communicator = device_comm_cls(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/cuda_communicator.py", line 47, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.ca_comm = CustomAllreduce(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce.py", line 145, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     if not current_platform.is_rocm() and not _can_p2p(rank, world_size):
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce.py", line 37, in _can_p2p
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     if not gpu_p2p_access_check(rank, i):
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce_utils.py", line 248, in gpu_p2p_access_check
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return _gpu_p2p_access_cache[f"{src}->{tgt}"]
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr | KeyError: '0->1'
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr | Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): �[36mray::WorkerDict.actor_rollout_init_model()�[39m (pid=46880, ip=10.140.37.138, actor_id=f0ee4094614e96a389af02d101000000, repr=<verl.single_controller.ray.base.WorkerDict object at 0x7fca14d141f0>)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/single_controller/ray/base.py", line 432, in func
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return getattr(self.worker_dict[key], name)(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/single_controller/base/decorator.py", line 207, in inner
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return func(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/fsdp_workers.py", line 384, in init_model
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self._build_rollout()
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/fsdp_workers.py", line 315, in _build_rollout
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.rollout = vLLMRollout(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/visual-code/EasyR1/verl/workers/rollout/vllm_rollout_spmd.py", line 76, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.inference_engine = LLM(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/utils.py", line 1161, in inner
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return fn(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 247, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.llm_engine = LLMEngine.from_engine_args(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 510, in from_engine_args
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return engine_cls.from_vllm_config(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 112, in from_vllm_config
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return cls(vllm_config=vllm_config,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/llm_engine.py", line 92, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.engine_core = EngineCoreClient.make_client(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 75, in make_client
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return InprocClient(vllm_config, executor_class, log_stats)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 198, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.engine_core = EngineCore(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 64, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.model_executor = executor_class(vllm_config)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self._init_executor()
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 121, in _init_executor
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.collective_rpc("init_device")
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     answer = run_method(self.driver_worker, method, args, kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/utils.py", line 2456, in run_method
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return func(*args, **kwargs)
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 604, in init_device
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.worker.init_device()  # type: ignore
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 135, in init_device
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     init_worker_distributed_environment(self.vllm_config, self.rank,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 326, in init_worker_distributed_environment
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 1022, in ensure_model_parallel_initialized
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     initialize_model_parallel(tensor_model_parallel_size,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 975, in initialize_model_parallel
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     _TP = init_model_parallel_group(group_ranks,
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 788, in init_model_parallel_group
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return GroupCoordinator(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 252, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.device_communicator = device_comm_cls(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/cuda_communicator.py", line 47, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     self.ca_comm = CustomAllreduce(
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce.py", line 145, in __init__
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     if not current_platform.is_rocm() and not _can_p2p(rank, world_size):
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce.py", line 37, in _can_p2p
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     if not gpu_p2p_access_check(rank, i):
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |   File "/mnt/petrelfs/sunhaoyu/miniconda3/envs/visual/lib/python3.10/site-packages/vllm/distributed/device_communicators/custom_all_reduce_utils.py", line 248, in gpu_p2p_access_check
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr |     return _gpu_p2p_access_cache[f"{src}->{tgt}"]
2025-07-17 17:00:39 | ERROR | stderr | �[36m(pid=41424)�[0m 2025-07-17 17:00:39 | ERROR | stderr | KeyError: '0->1'
```

我尝试了一些方法,比如设置 `export VLLM_RPC_BASE_PATH=/dev/shm`,但是没有用。代码本身是可以正常运行的,我只改了 `tensor_parallel_size` 这一个参数(从 1 改为 2)。

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions