# LLM_Inferenz_Server_1/test_server.py
# herzogflorian 076001b07f Add vLLM inference setup for Qwen3.5-35B-A3B on Apptainer
# Scripts to build container, download model, and serve Qwen3.5-35B-A3B
# via vLLM with OpenAI-compatible API on port 7080. Configured for 2x
# NVIDIA L40S GPUs with tensor parallelism, supporting ~15 concurrent
# students.
#
# Made-with: Cursor
# 2026-03-02 14:43:39 +01:00
#
# 71 lines
# 2.0 KiB
# Python
"""
Quick test script to verify the vLLM server is running and responding.
Usage:
pip install openai
python test_server.py [--host HOST] [--port PORT] [--api-key KEY]
"""
import argparse
import sys
from openai import OpenAI
def main() -> None:
    """Smoke-test the vLLM OpenAI-compatible server.

    Runs three checks in order:
      1. the server is reachable and lists its served models,
      2. a non-streaming chat completion succeeds,
      3. a streaming chat completion yields tokens.

    Exits with status 1 if the server cannot be reached; later failures
    surface as uncaught exceptions with full tracebacks, which is
    acceptable for a diagnostic script.
    """
    parser = argparse.ArgumentParser(description="Test vLLM inference server")
    parser.add_argument("--host", default="localhost", help="Server hostname")
    parser.add_argument("--port", default=7080, type=int, help="Server port")
    parser.add_argument("--api-key", default="EMPTY", help="API key")
    args = parser.parse_args()

    base_url = f"http://{args.host}:{args.port}/v1"
    # Must match the served model name configured on the vLLM server.
    model = "qwen3.5-35b-a3b"
    client = OpenAI(base_url=base_url, api_key=args.api_key)
    print(f"Connecting to {base_url} ...")

    print("\n--- Available Models ---")
    try:
        models = client.models.list()
        for m in models.data:
            print(f" {m.id}")
    except Exception as e:
        # Connection-level failure: report and bail out before the
        # completion tests, which would fail with noisier tracebacks.
        print(f"ERROR: Cannot connect to server: {e}")
        sys.exit(1)

    print("\n--- Test Chat Completion ---")
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "What is 2 + 2? Answer in one sentence."}
        ],
        max_tokens=256,
        temperature=0.7,
    )
    print(f" Response: {response.choices[0].message.content}")
    print(f" Tokens: prompt={response.usage.prompt_tokens}, "
          f"completion={response.usage.completion_tokens}")

    print("\n--- Test Streaming ---")
    stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "Count from 1 to 5."}
        ],
        max_tokens=128,
        temperature=0.7,
        stream=True,
    )
    print(" Response: ", end="")
    for chunk in stream:
        # Some stream chunks (e.g. a trailing usage chunk) carry an empty
        # `choices` list; guard before indexing to avoid an IndexError.
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print("\n")

    print("All tests passed!")


if __name__ == "__main__":
    main()