Using the vLLM-Compatible OpenAI Interfaces
This section uses the v1/chat streaming inference interface and the v1/completions streaming inference interface as examples to describe how to call the interfaces. For how to call other interfaces, see the vLLM-Compatible OpenAI Interfaces section.
v1/chat Streaming Inference Interface

| Item | Description |
| --- | --- |
| API name | v1/chat streaming inference interface |
| URL | https://{service IP address}:{port number}/v1/chat/completions |
| Request type | POST |

Request example:

    curl -H "Accept: application/json" -H "Content-type: application/json" --cacert ca.pem --cert client.pem --key client.key.pem -X POST -d '{
      "model": "llama",
      "messages": [
        { "role": "user", "content": "You are a helpful assistant." }
      ],
      "stream": true,
      "presence_penalty": 1.03,
      "frequency_penalty": 1.0,
      "repetition_penalty": 1.0,
      "temperature": 0.5,
      "top_p": 0.95,
      "top_k": 1,
      "seed": 1,
      "max_tokens": 5,
      "n": 2,
      "best_of": 2
    }' https://127.0.0.1:1025/v1/chat/completions

Response example:

    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":"You"},"logprobs":null,"finish_reason":null}]}
    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":"You"},"logprobs":null,"finish_reason":null}]}
    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":" are"},"logprobs":null,"finish_reason":null}]}
    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":" are"},"logprobs":null,"finish_reason":null}]}
    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":" a"},"logprobs":null,"finish_reason":null}]}
    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":" a"},"logprobs":null,"finish_reason":null}]}
    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":" helpful"},"logprobs":null,"finish_reason":null}]}
    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":" helpful"},"logprobs":null,"finish_reason":null}]}
    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","usage":{"prompt_tokens":24,"completion_tokens":5,"total_tokens":29,"batch_size":[1,1,1,1,1],"queue_wait_time":[5318,117,82,72,196]},"choices":[{"index":0,"delta":{"role":"assistant","content":" assistant"},"logprobs":null,"finish_reason":"length"}]}
    data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","usage":{"prompt_tokens":24,"completion_tokens":5,"total_tokens":29,"batch_size":[1,1,1,1,1],"queue_wait_time":[5318,117,82,72,196]},"choices":[{"index":1,"delta":{"role":"assistant","content":" assistant"},"logprobs":null,"finish_reason":"length"}]}
    data: [DONE]
v1/completions Streaming Inference Interface

| Item | Description |
| --- | --- |
| API name | v1/completions streaming inference interface |
| URL | https://{service IP address}:{port number}/v1/completions |
| Request type | POST |

Request example:

    curl -H "Accept: application/json" -H "Content-type: application/json" --cacert ca.pem --cert client.pem --key client.key.pem -X POST -d '{
      "model": "Qwen2.5-7B-Instruct",
      "prompt": "who are you",
      "temperature": 1,
      "max_tokens": 5,
      "use_beam_search": true,
      "ignore_eos": true,
      "n": 2,
      "best_of": 2,
      "stream": true,
      "logprobs": 2
    }' https://127.0.0.1:1025/v1/completions

Response example:

    data: {"id":"endpoint_common_1","object":"text_completion","created":1744948803,"model":"Qwen2.5-7B-Instruct","choices":[{"index":0,"text":"\nI am a large","logprobs":{"text_offset":[0,1,2,5,7],"token_logprobs":[-1.8828125,-0.018310546875,-0.054931640625,-0.435546875,-0.0286865234375],"tokens":["\n","I"," am"," a"," large"],"top_logprobs":[{"\n":-1.8828125,"\n\n":-2.0},{"I":-0.018310546875,"Hello":-4.53125},{" am":-0.054931640625,"'m":-2.9375},{" a":-0.435546875," Q":-1.1875},{" large":-0.0286865234375," language":-4.78125}]},"stop_reason":null,"finish_reason":"length"},{"index":1,"text":"\n\nI am a large","logprobs":{"text_offset":[13,15,16,19,21],"token_logprobs":[-2.0,-0.031494140625,-0.0791015625,-0.5546875,-0.01092529296875],"tokens":["\n\n","I"," am"," a"," large"],"top_logprobs":[{"\n\n":-2.0,"\n":-1.8828125},{"I":-0.031494140625,"Hello":-4.28125},{" am":-0.0791015625,"'m":-2.578125},{" a":-0.5546875," Q":-1.0546875},{" large":-0.01092529296875," language":-6.375}]},"stop_reason":null,"finish_reason":"length"}],"usage":{"prompt_tokens":3,"completion_tokens":10,"total_tokens":13,"batch_size":[1,1,1,1,1,1,1,1,1,1],"queue_wait_time":[5496,146,65,60,111,42,27,70,64,51]}}
    data: [DONE]
Parent topic: Calling Service-Oriented Interfaces