昇腾社区首页
中文
注册

使用vLLM兼容OpenAI接口

本章节以v1/chat流式推理接口和v1/completions流式推理接口为例介绍接口调用，其他接口的调用方法请参见“vLLM兼容OpenAI接口”章节。

v1/chat流式推理接口

接口名

v1/chat流式推理接口

URL

https://{服务IP地址}:{端口号}/v1/chat/completions

请求类型

POST

请求示例

curl -H "Accept: application/json" -H "Content-type: application/json" --cacert ca.pem --cert client.pem  --key client.key.pem -X POST -d '{
    "model": "llama",
    "messages": [
        {
            "role": "user",
            "content": "You are a helpful assistant."
        }
    ],
    "stream": true,
    "presence_penalty": 1.03,
    "frequency_penalty": 1.0,
    "repetition_penalty": 1.0,
    "temperature": 0.5,
    "top_p": 0.95,
    "top_k": 1,
    "seed": 1,
    "max_tokens": 5,
    "n": 2,
    "best_of": 2
}' https://127.0.0.1:1025/v1/chat/completions

返回示例

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":"You"},"logprobs":null,"finish_reason":null}]}

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":"You"},"logprobs":null,"finish_reason":null}]}

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":" are"},"logprobs":null,"finish_reason":null}]}

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":" are"},"logprobs":null,"finish_reason":null}]}

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":" a"},"logprobs":null,"finish_reason":null}]}

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":" a"},"logprobs":null,"finish_reason":null}]}

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":0,"delta":{"role":"assistant","content":" helpful"},"logprobs":null,"finish_reason":null}]}

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","choices":[{"index":1,"delta":{"role":"assistant","content":" helpful"},"logprobs":null,"finish_reason":null}]}

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","usage":{"prompt_tokens":24,"completion_tokens":5,"total_tokens":29,"batch_size":[1,1,1,1,1],"queue_wait_time":[5318,117,82,72,196]},"choices":[{"index":0,"delta":{"role":"assistant","content":" assistant"},"logprobs":null,"finish_reason":"length"}]}

data: {"id":"endpoint_common_10","object":"chat.completion.chunk","created":1744038509,"model":"llama","usage":{"prompt_tokens":24,"completion_tokens":5,"total_tokens":29,"batch_size":[1,1,1,1,1],"queue_wait_time":[5318,117,82,72,196]},"choices":[{"index":1,"delta":{"role":"assistant","content":" assistant"},"logprobs":null,"finish_reason":"length"}]}

data: [DONE]

v1/completions流式推理接口

接口名

v1/completions流式推理接口

URL

https://{服务IP地址}:{端口号}/v1/completions

请求类型

POST

请求示例

curl -H "Accept: application/json" -H "Content-type: application/json" --cacert ca.pem --cert client.pem  --key client.key.pem -X POST -d '{
    "model": "Qwen2.5-7B-Instruct",
    "prompt": "who are you",
    "temperature": 1,
    "max_tokens": 5,
    "use_beam_search": true,
    "ignore_eos":true,
    "n": 2,
    "best_of":2,
    "stream": true,
    "logprobs": 2
}' https://127.0.0.1:1025/v1/completions

返回示例

data: {"id":"endpoint_common_1","object":"text_completion","created":1744948803,"model":"Qwen2.5-7B-Instruct","choices":[{"index":0,"text":"\nI am a large","logprobs":{"text_offset":[0,1,2,5,7],"token_logprobs":[-1.8828125,-0.018310546875,-0.054931640625,-0.435546875,-0.0286865234375],"tokens":["\n","I"," am"," a"," large"],"top_logprobs":[{"\n":-1.8828125,"\n\n":-2.0},{"I":-0.018310546875,"Hello":-4.53125},{" am":-0.054931640625,"'m":-2.9375},{" a":-0.435546875," Q":-1.1875},{" large":-0.0286865234375," language":-4.78125}]},"stop_reason":null,"finish_reason":"length"},{"index":1,"text":"\n\nI am a large","logprobs":{"text_offset":[13,15,16,19,21],"token_logprobs":[-2.0,-0.031494140625,-0.0791015625,-0.5546875,-0.01092529296875],"tokens":["\n\n","I"," am"," a"," large"],"top_logprobs":[{"\n\n":-2.0,"\n":-1.8828125},{"I":-0.031494140625,"Hello":-4.28125},{" am":-0.0791015625,"'m":-2.578125},{" a":-0.5546875," Q":-1.0546875},{" large":-0.01092529296875," language":-6.375}]},"stop_reason":null,"finish_reason":"length"}],"usage":{"prompt_tokens":3,"completion_tokens":10,"total_tokens":13,"batch_size":[1,1,1,1,1,1,1,1,1,1],"queue_wait_time":[5496,146,65,60,111,42,27,70,64,51]}}

data: [DONE]