昇腾社区首页
中文
注册

使用兼容Triton接口

本章节以Token推理接口、文本推理接口和流式推理接口为例介绍接口调用，其他接口的调用方法请参见“兼容Triton接口”章节。

Token推理接口

接口名

Token推理接口

URL

https://{服务IP地址}:{端口号}/v2/models/${MODEL_NAME}/infer

请求类型

POST

请求示例

curl -H "Accept: application/json" -H "Content-type: application/json" --cacert ca.pem --cert client.pem  --key client.key.pem -X POST -d '{
    "id": "42",
    "inputs": [{
        "name": "input0",
        "shape": [
            1,
            10
        ],
        "datatype": "UINT32",
        "data": [
            396, 319, 13996, 29877, 29901, 29907, 3333, 20718, 316, 23924
        ]
    }],
    "outputs": [{
        "name": "output0"
    }],
    "parameters": {
        "temperature": 0.5,
        "top_k": 10,
        "top_p": 0.95,
        "do_sample": true,
        "seed": null,
        "repetition_penalty": 1.03,
        "max_new_tokens": 20,
        "watermark": true,
        "priority": 5,
        "timeout": 10
    }
}' https://127.0.0.1:1025/v2/models/llama_65b/infer

返回示例

{
    "id": "42",
    "outputs": [
        {
            "name": "output0",
            "shape": [
                1,
                20
            ],
            "datatype": "UINT32",
            "data": [
                1,
                396,
                319,
                13996,
                29877,
                29901,
                29907,
                3333,
                20718,
                316,
                23924,
                562,
                2142,
                1702,
                425,
                14015,
                16060,
                316,
                383,
                19498
            ]
        }
    ]
}

文本推理接口

接口名

文本推理接口

URL

https://{服务IP地址}:{端口号}/v2/models/${MODEL_NAME}/generate

请求类型

POST

请求示例

curl -H "Accept: application/json" -H "Content-type: application/json" --cacert ca.pem --cert client.pem  --key client.key.pem -X POST -d '{
    "id":"a123",
    "text_input": "My name is Olivier and I",
    "parameters": {
        "details": true,
        "do_sample": true,
        "max_new_tokens":5,
        "repetition_penalty": 1.1,
        "seed": 123,
        "temperature": 1,
        "top_k": 10,
        "top_p": 0.99,
        "batch_size":100,
        "typical_p": 0.5,
        "watermark": false,
        "perf_stat": true,
        "priority": 5,
        "timeout": 10
    }
}' https://127.0.0.1:1025/v2/models/llama_65b/generate

返回示例

{
    "id": "a123",
    "model_name": "llama_65b",
    "model_version": null,
    "text_output": "live in Paris, France",
    "details": {
        "finish_reason": "length",
        "generated_tokens": 5,
        "first_token_cost": null,
        "decode_cost": null,
        "perf_stat": [
            [
                5735,
                17
            ],
            [
                297,
                8
            ],
            [
                3681,
                7
            ],
            [
                29892,
                7
            ],
            [
                3444,
                7
            ]
        ]
    }
}

流式推理接口

接口名

流式推理接口

URL

https://{服务IP地址}:{端口号}/v2/models/${MODEL_NAME}/generate_stream

请求类型

POST

请求示例

curl -H "Accept: application/json" -H "Content-type: application/json" --cacert ca.pem --cert client.pem  --key client.key.pem -X POST -d '{
    "id":"a123",
    "text_input": "My name is Olivier and I",
    "parameters": {
        "details": true,
        "do_sample": true,
        "max_new_tokens":5,
        "repetition_penalty": 1.1,
        "seed": 123,
        "temperature": 1,
        "top_k": 10,
        "top_p": 0.99,
        "batch_size":100,
        "typical_p": 0.5,
        "watermark": false,
        "perf_stat": true,
        "priority": 5,
        "timeout": 10
    }
}' https://127.0.0.1:1025/v2/models/llama_65b/generate_stream

返回示例

data:{"id":"a123","model_name":"llama_65b","model_version":null,"text_output":"live","details":{"generated_tokens":1,"first_token_cost":null,"decode_cost":null,"perf_stat":[[5735,28]],"batch_size":1,"queue_wait_time":5082},"prefill_time":28,"decode_time":null}

data:{"id":"a123","model_name":"llama_65b","model_version":null,"text_output":" in","details":{"generated_tokens":2,"first_token_cost":null,"decode_cost":null,"perf_stat":[[5735,28],[297,9]],"batch_size":1,"queue_wait_time":36},"prefill_time":null,"decode_time":9}

data:{"id":"a123","model_name":"llama_65b","model_version":null,"text_output":" Paris","details":{"generated_tokens":3,"first_token_cost":null,"decode_cost":null,"perf_stat":[[5735,28],[297,9],[3681,8]],"batch_size":1,"queue_wait_time":30},"prefill_time":null,"decode_time":8}

data:{"id":"a123","model_name":"llama_65b","model_version":null,"text_output":",","details":{"generated_tokens":4,"first_token_cost":null,"decode_cost":null,"perf_stat":[[5735,28],[297,9],[3681,8],[29892,7]],"batch_size":1,"queue_wait_time":23},"prefill_time":null,"decode_time":7}

data:{"id":"a123","model_name":"llama_65b","model_version":null,"text_output":" France","details":{"finish_reason":"length","generated_tokens":5,"first_token_cost":null,"decode_cost":null,"perf_stat":[[5735,28],[297,9],[3681,8],[29892,7],[3444,7]],"batch_size":1,"queue_wait_time":24},"prefill_time":null,"decode_time":7}