"""
Quick test script to verify the vLLM server is running and responding.

Usage:
    pip install openai
    python test_server.py [--host HOST] [--port PORT] [--api-key KEY]
"""

import argparse
import sys

from openai import OpenAI


def main():
    """Run three smoke tests against a vLLM OpenAI-compatible server.

    Tests, in order:
      1. List available models (also serves as the connectivity check;
         exits with status 1 if the server is unreachable).
      2. A non-streaming chat completion.
      3. A streaming chat completion.

    Configuration comes from the command line (``--host``, ``--port``,
    ``--api-key``, ``--model``); there is no return value.
    """
    parser = argparse.ArgumentParser(description="Test vLLM inference server")
    parser.add_argument("--host", default="localhost", help="Server hostname")
    parser.add_argument("--port", default=7080, type=int, help="Server port")
    parser.add_argument("--api-key", default="EMPTY", help="API key")
    # Previously hard-coded; exposed as a flag with the same default so
    # existing invocations behave identically.
    parser.add_argument("--model", default="qwen3.5-35b-a3b",
                        help="Model name to use for the test requests")
    args = parser.parse_args()

    base_url = f"http://{args.host}:{args.port}/v1"
    model = args.model
    client = OpenAI(base_url=base_url, api_key=args.api_key)

    print(f"Connecting to {base_url} ...")

    # Listing models doubles as the connectivity check: if the server is
    # down we bail out here instead of failing later with a less clear error.
    print("\n--- Available Models ---")
    try:
        models = client.models.list()
        for m in models.data:
            print(f"  {m.id}")
    except Exception as e:
        print(f"ERROR: Cannot connect to server: {e}")
        sys.exit(1)

    print("\n--- Test Chat Completion ---")
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "Create a latex document that derives and explains the principle component analysis (pca). Make a self contain document with introduction, derivation, examples of applications. This is for computer science undergraduate class."}
        ],
        max_tokens=16384,
        temperature=0.7,
    )
    print(f"  Response: {response.choices[0].message.content}")
    print(f"  Tokens: prompt={response.usage.prompt_tokens}, "
          f"completion={response.usage.completion_tokens}")

    print("\n--- Test Streaming ---")
    stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "Count from 1 to 5."}
        ],
        max_tokens=16384,
        temperature=0.7,
        stream=True,
    )
    print("  Response: ", end="")
    for chunk in stream:
        # Deltas can be None (e.g. role-only or final chunks); only print
        # actual content fragments.
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print("\n")

    print("All tests passed!")
|
|
|
|
|
|
# Script entry point: run the smoke tests only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|