"""Quick test script to verify the vLLM server is running and responding.

Usage:
    pip install openai
    python test_server.py [--host HOST] [--port PORT] [--api-key KEY]
                          [--model MODEL]
"""

import argparse
import sys

from openai import OpenAI

# Model name sent in the test requests unless overridden with --model.
DEFAULT_MODEL = "qwen3.5-35b-a3b"


def main():
    """Run three smoke tests against an OpenAI-compatible server.

    The tests run in order: list the served models, request one
    non-streaming chat completion, then request one streaming chat
    completion. Progress and results are printed to stdout.

    Command-line options (all optional): ``--host``, ``--port``,
    ``--api-key`` and ``--model``.

    Raises:
        SystemExit: with status 1 as soon as any of the three requests
            fails, after printing an ``ERROR:`` line describing the failure.
    """
    parser = argparse.ArgumentParser(description="Test vLLM inference server")
    parser.add_argument("--host", default="localhost", help="Server hostname")
    parser.add_argument("--port", default=7080, type=int, help="Server port")
    parser.add_argument("--api-key", default="EMPTY", help="API key")
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help="Model name to use in the test requests",
    )
    args = parser.parse_args()

    base_url = f"http://{args.host}:{args.port}/v1"
    model = args.model
    client = OpenAI(base_url=base_url, api_key=args.api_key)
    print(f"Connecting to {base_url} ...")

    print("\n--- Available Models ---")
    try:
        served_ids = [m.id for m in client.models.list().data]
    except Exception as e:
        # Top-level boundary of a diagnostic script: report any failure
        # (connection refused, auth error, bad payload) and stop.
        print(f"ERROR: Cannot connect to server: {e}")
        sys.exit(1)
    for model_id in served_ids:
        print(f" {model_id}")
    if model not in served_ids:
        # Not fatal: the requests below will report the server's own error
        # if the name really is unknown to it.
        print(f" WARNING: model '{model}' is not in the list above")

    print("\n--- Test Chat Completion ---")
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": "What is 2 + 2? Answer in one sentence."}
            ],
            max_tokens=256,
            temperature=0.7,
        )
    except Exception as e:
        print(f"ERROR: Chat completion failed: {e}")
        sys.exit(1)
    if not response.choices:
        print("ERROR: Chat completion returned no choices")
        sys.exit(1)
    print(f" Response: {response.choices[0].message.content}")
    usage = response.usage
    # The client declares usage as optional, so only print it when present.
    if usage is not None:
        print(f" Tokens: prompt={usage.prompt_tokens}, "
              f"completion={usage.completion_tokens}")

    print("\n--- Test Streaming ---")
    try:
        stream = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": "Count from 1 to 5."}
            ],
            max_tokens=128,
            temperature=0.7,
            stream=True,
        )
        print(" Response: ", end="")
        for chunk in stream:
            # Some chunks carry no choices (e.g. a usage-only final chunk);
            # indexing [0] on those would raise IndexError.
            if not chunk.choices:
                continue
            piece = chunk.choices[0].delta.content
            if piece:
                print(piece, end="", flush=True)
    except Exception as e:
        # Leading newline terminates any partially printed response line.
        print(f"\nERROR: Streaming request failed: {e}")
        sys.exit(1)
    print("\n")

    print("All tests passed!")


if __name__ == "__main__":
    main()