Using the vLLM-Compatible OpenAI Interfaces
This section uses the v1/chat streaming inference interface and the v1/completions streaming inference interface as examples to describe how to call the interfaces. For how to call the other interfaces, see the vLLM-Compatible OpenAI Interfaces chapter.
v1/chat streaming inference interface
| Interface name | v1/chat streaming inference interface |
|---|---|
| URL | https://{service IP address}:{port number}/v1/chat/completions |
| Request type | POST |

Request example:

```bash
curl -H "Accept: application/json" -H "Content-type: application/json" --cacert ca.pem --cert client.pem --key client.key.pem -X POST -d '{
  "model": "llama",
  "messages": [
    {
      "role": "user",
      "content": "You are a helpful assistant."
    }
  ],
  "stream": true,
  "presence_penalty": 1.03,
  "frequency_penalty": 1.0,
  "repetition_penalty": 1.0,
  "temperature": 0.5,
  "top_p": 0.95,
  "top_k": 1,
  "seed": 1,
  "max_tokens": 5,
  "n": 2,
  "best_of": 2
}' https://127.0.0.1:1025/v1/chat/completions
```

Response example:

```
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":"You"},"logprobs":null,"finish_reason":null}]}
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":"You"},"logprobs":null,"finish_reason":null}]}
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":" are"},"logprobs":null,"finish_reason":null}]}
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":" are"},"logprobs":null,"finish_reason":null}]}
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":" a"},"logprobs":null,"finish_reason":null}]}
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":" a"},"logprobs":null,"finish_reason":null}]}
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":" helpful"},"logprobs":null,"finish_reason":null}]}
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":" helpful"},"logprobs":null,"finish_reason":null}]}
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","usage":{"prompt_tokens":24,"completion_tokens":5,"total_tokens":29,"batch_size":[1,1,1,1,1],"queue_wait_time":[5318,117,82,72,196]},"choices":[{"index":0,"delta":{"role":"assistant","content":" assistant"},"logprobs":null,"finish_reason":"length"}]}
data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","usage":{"prompt_tokens":24,"completion_tokens":5,"total_tokens":29,"batch_size":[1,1,1,1,1],"queue_wait_time":[5318,117,82,72,196]},"choices":[{"index":1,"delta":{"role":"assistant","content":" assistant"},"logprobs":null,"finish_reason":"length"}]}
data: [DONE]
```
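The response above is delivered as a stream of server-sent events: each data: line carries one incremental chunk, and data: [DONE] marks the end of the stream. The following is a minimal Python sketch of a client that sends a similar request and prints each streamed delta; the service address, certificate file names, model name, and sampling parameters are copied from the curl example above and should be replaced with the values of your own deployment.

```python
# Minimal sketch of a streaming client for the v1/chat/completions interface.
# Assumptions: the service address, certificate files, model name, and sampling
# parameters are taken from the curl example above and must match your deployment.
import json

import requests

URL = "https://127.0.0.1:1025/v1/chat/completions"

payload = {
    "model": "llama",
    "messages": [{"role": "user", "content": "You are a helpful assistant."}],
    "stream": True,
    "temperature": 0.5,
    "top_p": 0.95,
    "max_tokens": 5,
    "n": 2,
}

with requests.post(
    URL,
    json=payload,
    stream=True,
    verify="ca.pem",                        # CA certificate, as in the curl example
    cert=("client.pem", "client.key.pem"),  # client certificate and private key
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # Server-sent events: every payload line starts with "data: ".
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":                # end-of-stream marker
            break
        chunk = json.loads(data)
        for choice in chunk["choices"]:
            delta = choice["delta"].get("content", "")
            print(f'[choice {choice["index"]}] {delta}')
```

Each chunk is parsed as JSON and the delta content of every choice is printed. As the response example shows, when n is greater than 1 the chunks for different choices are interleaved in the stream, so the choice index is printed alongside the text to keep them apart.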
v1/completions streaming inference interface
| Interface name | v1/completions streaming inference interface |
|---|---|
| URL | https://{service IP address}:{port number}/v1/completions |
| Request type | POST |

Request example:

```bash
curl -H "Accept: application/json" -H "Content-type: application/json" --cacert ca.pem --cert client.pem --key client.key.pem -X POST -d '{
  "model": "Qwen2.5-7B-Instruct",
  "prompt": "who are you",
  "temperature": 1,
  "max_tokens": 5,
  "use_beam_search": true,
  "ignore_eos": true,
  "n": 2,
  "best_of": 2,
  "stream": true,
  "logprobs": 2
}' https://127.0.0.1:1025/v1/completions
```

Response example:

```
data: {"id":"endpoint_common_1","object":"text_completion","created":1744948803,"model":"Qwen2.5-7B-Instruct","choices":[{"index":0,"text":"\nI am a large","logprobs":{"text_offset":[0,1,2,5,7],"token_logprobs":[-1.8828125,-0.018310546875,-0.054931640625,-0.435546875,-0.0286865234375],"tokens":["\n","I"," am"," a"," large"],"top_logprobs":[{"\n":-1.8828125,"\n\n":-2.0},{"I":-0.018310546875,"Hello":-4.53125},{" am":-0.054931640625,"'m":-2.9375},{" a":-0.435546875," Q":-1.1875},{" large":-0.0286865234375," language":-4.78125}]},"stop_reason":null,"finish_reason":"length"},{"index":1,"text":"\n\nI am a large","logprobs":{"text_offset":[13,15,16,19,21],"token_logprobs":[-2.0,-0.031494140625,-0.0791015625,-0.5546875,-0.01092529296875],"tokens":["\n\n","I"," am"," a"," large"],"top_logprobs":[{"\n\n":-2.0,"\n":-1.8828125},{"I":-0.031494140625,"Hello":-4.28125},{" am":-0.0791015625,"'m":-2.578125},{" a":-0.5546875," Q":-1.0546875},{" large":-0.01092529296875," language":-6.375}]},"stop_reason":null,"finish_reason":"length"}],"usage":{"prompt_tokens":3,"completion_tokens":10,"total_tokens":13,"batch_size":[1,1,1,1,1,1,1,1,1,1],"queue_wait_time":[5496,146,65,60,111,42,27,70,64,51]}}
data: [DONE]
```
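Because the endpoint is OpenAI-compatible, the stream can also be consumed with the official openai Python SDK (1.x) instead of hand-parsing the data: lines. The sketch below is only an illustration under the same assumptions as before: the base URL, certificate files, and model name come from the curl example, the api_key value is a placeholder, and backend-specific fields such as use_beam_search and ignore_eos are forwarded through extra_body.

```python
# Minimal sketch of calling the v1/completions streaming interface through the
# openai Python SDK (1.x). Assumptions: the base URL, certificate files, and model
# name come from the curl example above; the api_key is a placeholder; fields such
# as use_beam_search and ignore_eos are backend extensions passed via extra_body.
import httpx
from openai import OpenAI

client = OpenAI(
    base_url="https://127.0.0.1:1025/v1",
    api_key="EMPTY",  # placeholder; set a real token if your gateway requires one
    http_client=httpx.Client(
        verify="ca.pem",                        # CA certificate
        cert=("client.pem", "client.key.pem"),  # client certificate and private key
    ),
)

stream = client.completions.create(
    model="Qwen2.5-7B-Instruct",
    prompt="who are you",
    temperature=1,
    max_tokens=5,
    n=2,
    best_of=2,
    logprobs=2,
    stream=True,
    extra_body={"use_beam_search": True, "ignore_eos": True},  # backend extensions
)

for chunk in stream:
    for choice in chunk.choices:
        # Each chunk carries the incremental text (and logprobs) for one choice.
        print(f"[choice {choice.index}] {choice.text}")
```

Each streamed chunk contains the incremental text for one choice; printing the choice index keeps the two candidates from the n=2 example distinguishable.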
Parent topic: Calling Service-Oriented Interfaces