diff --git a/notebooks/community/model_garden/model_garden_pytorch_llama3_1_finetuning.ipynb b/notebooks/community/model_garden/model_garden_pytorch_llama3_1_finetuning.ipynb
index 6caf3bfac..e05be10ac 100644
--- a/notebooks/community/model_garden/model_garden_pytorch_llama3_1_finetuning.ipynb
+++ b/notebooks/community/model_garden/model_garden_pytorch_llama3_1_finetuning.ipynb
@@ -806,35 +806,6 @@
     "    return model, endpoint\n",
     "\n",
     "\n",
-    "def predict_vllm(\n",
-    "    prompt: str,\n",
-    "    max_tokens: int,\n",
-    "    temperature: float,\n",
-    "    top_p: float,\n",
-    "    top_k: int,\n",
-    "    raw_response: bool,\n",
-    "    lora_weight: str = \"\",\n",
-    "):\n",
-    "    # Parameters for inference.\n",
-    "    instance = {\n",
-    "        \"prompt\": prompt,\n",
-    "        \"max_tokens\": max_tokens,\n",
-    "        \"temperature\": temperature,\n",
-    "        \"top_p\": top_p,\n",
-    "        \"top_k\": top_k,\n",
-    "        \"raw_response\": raw_response,\n",
-    "    }\n",
-    "    if lora_weight:\n",
-    "        instance[\"dynamic-lora\"] = lora_weight\n",
-    "    instances = [instance]\n",
-    "    response = endpoints[\"vllm_gpu\"].predict(\n",
-    "        instances=instances, use_dedicated_endpoint=use_dedicated_endpoint\n",
-    "    )\n",
-    "\n",
-    "    for prediction in response.predictions:\n",
-    "        print(prediction)\n",
-    "\n",
-    "\n",
     "# Use FP8 base model for 405B since original model does not fit.\n",
     "deploy_pretrained_model_id = pretrained_model_id\n",
     "if \"Meta-Llama-3.1-405B\" in deploy_pretrained_model_id:\n",
@@ -880,24 +851,32 @@
     "# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.\n",
     "\n",
     "prompt = \"What is a car?\"  # @param {type: \"string\"}\n",
-    "# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.\n",
+    "# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, for example by setting `max_tokens` to 20.\n",
     "max_tokens = 50  # @param {type:\"integer\"}\n",
     "temperature = 1.0  # @param {type:\"number\"}\n",
     "top_p = 1.0  # @param {type:\"number\"}\n",
     "top_k = 1  # @param {type:\"integer\"}\n",
     "raw_response = False  # @param {type:\"boolean\"}\n",
     "\n",
-    "\n",
-    "predict_vllm(\n",
-    "    prompt=prompt,\n",
-    "    max_tokens=max_tokens,\n",
-    "    temperature=temperature,\n",
-    "    top_p=top_p,\n",
-    "    top_k=top_k,\n",
-    "    raw_response=raw_response,\n",
-    "    lora_weight=final_checkpoint,\n",
+    "# Override parameters for inference.\n",
+    "instance = {\n",
+    "    \"prompt\": prompt,\n",
+    "    \"max_tokens\": max_tokens,\n",
+    "    \"temperature\": temperature,\n",
+    "    \"top_p\": top_p,\n",
+    "    \"top_k\": top_k,\n",
+    "    \"raw_response\": raw_response,\n",
+    "}\n",
+    "if final_checkpoint:\n",
+    "    instance[\"dynamic-lora\"] = final_checkpoint\n",
+    "instances = [instance]\n",
+    "response = endpoints[\"vllm_gpu\"].predict(\n",
+    "    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint\n",
     ")\n",
     "\n",
+    "for prediction in response.predictions:\n",
+    "    print(prediction)\n",
+    "\n",
     "# @markdown Click \"Show Code\" to see more details."
    ]
   },
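
The cell changed above now builds the vLLM request payload inline instead of calling the removed predict_vllm helper. Below is a minimal standalone sketch of that same request flow, assuming the vLLM endpoint from the deployment step is already serving and a fine-tuned LoRA checkpoint exists in Cloud Storage; the project ID, region, endpoint ID, and checkpoint URI are placeholders, not values from the notebook.

from google.cloud import aiplatform

# Hypothetical project, region, and endpoint ID; substitute your own.
aiplatform.init(project="my-project", location="us-central1")
endpoint = aiplatform.Endpoint("1234567890")

instance = {
    "prompt": "What is a car?",
    "max_tokens": 50,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 1,
    "raw_response": False,
    # Serve the fine-tuned adapter on top of the base model via dynamic LoRA.
    # Placeholder GCS path to the trained checkpoint.
    "dynamic-lora": "gs://my-bucket/llama3_1-lora/final_checkpoint",
}

response = endpoint.predict(
    instances=[instance],
    # Required when the model was deployed to a dedicated endpoint, as in the notebook.
    use_dedicated_endpoint=True,
)
for prediction in response.predictions:
    print(prediction)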