The code directory structure is as follows:
```
Triton_MindIE-LLM_Backend
|____build.sh
|____CMakeLists.txt
|____src
| |____mindie.cc
| |____model_state.cc
| |____model_state.h
| |____model_instance_state.cc
| |____model_instance_state.h
| |____mindie_utils.cc
| |____mindie_utils.h
| |____infer_task.cc
| |____infer_task.h
|____example
| |____tritonModels
| | |____llama3_8b
| | | |____1
| | | |____config.pbtxt
| | |____config.json
| |____set_env.sh
| |____launch.sh
| |____submit.py
| |____client_stream.py
| |____infer_client.py
| |____logger_util.py
| |____GSM8K.jsonl
```
The purpose of the source files under src and of the files under the example folder is described in Table 1 and Table 2, respectively.
**Table 1** Source files under src

| Source file | Meaning and purpose |
|---|---|
| mindie.cc | Contains the core interface functions that Triton exposes to a backend: TRITONBACKEND_Backend initialization, TRITONBACKEND_Model initialization and finalization, TRITONBACKEND_ModelInstance initialization and finalization, and the interface through which a model instance receives and executes inference requests. |
| model_state.cc | Source file of the model state class. In its constructor, engineConfigPath and five callback functions are used to construct one LlmManager object per model instance; after initialization, a thread reads all pending requests out of the queue stored in ModelState. |
| model_instance_state.cc | Source file of the model instance state class. Its core is the Enqueue member function, which parses every pending request coming from the triton::core side, converts it into the MindIE request type, and stores it in the ModelState queue for MindIE to read. |
| mindie_utils.cc | Defines several MindIE utility functions: one converts responses from the MindIE type to the Triton type, and the others parse configuration and parameter information from files. |
| infer_task.cc | Source file of the inference task class. Its core is the constructor, which converts the Triton-side request into the MindIE type and creates a response factory for each inference request; the factory is used in the send-response callback to create Triton-side responses. |
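Taken together, the files in Table 1 implement a producer/consumer hand-off: ModelInstanceState::Enqueue converts Triton requests and pushes them into the queue held by ModelState, LlmManager pulls them through the get-requests callback, and generated tokens come back through the send-responses callback using each task's response factory. The sketch below is a deliberately simplified Python illustration of that flow only; the actual implementation is the C++ code listed under "Sample code" further down.

```python
# Simplified illustration of the request flow in Table 1 (not the backend code).
import queue

request_queue = queue.Queue()   # filled by ModelInstanceState::Enqueue
infer_tasks = {}                # request id -> bookkeeping (response factory etc.)

def enqueue(triton_requests):
    """Triton side: convert requests to the MindIE type and queue them."""
    for req in triton_requests:
        infer_tasks[req["id"]] = {"request": req}
        request_queue.put(req)

def get_requests_callback():
    """MindIE side: LlmManager drains everything currently queued."""
    batch = []
    while not request_queue.empty():
        batch.append(request_queue.get())
    return batch

def send_responses_callback(req_id, output_ids, is_end):
    """MindIE side: turn generated tokens into a Triton response."""
    print(f"response for {req_id}: {output_ids} (final={is_end})")
    if is_end:
        infer_tasks.pop(req_id)

# Toy run: one request goes in, one final response comes back.
enqueue([{"id": "1", "input_ids": [1, 2, 3]}])
for req in get_requests_callback():
    send_responses_callback(req["id"], [42], is_end=True)
```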
**Table 2** Files under example

| File | Description |
|---|---|
| tritonModels | The model repository managed by Triton, containing the Triton-side configuration file config.pbtxt and the MindIE-side configuration file config.json for the LLaMa3_8b model. |
| set_env.sh | Environment variable script; modify it as needed according to the installation paths of CANN, ATB Models, and MindIE. |
| launch.sh | Starts the Tritonserver server; it also contains the MindIE LLM and Triton log-level configuration. |
| submit.py | Uses tritonclient to send the requests from the GSM8K.jsonl dataset to the server one by one. |
| client_stream.py | Uses tritonclient to send a single request to the server. |
| GSM8K.jsonl | Inference dataset; GSM8K is used here as an example and must be supplied by the user (see the format sketch below). |
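submit.py loads the dataset with pandas.read_json(..., lines=True) and tokenizes the "question" field of every line, so GSM8K.jsonl is expected to contain one JSON object per line with at least a "question" key. A minimal sketch of producing and reading such a file (the questions are placeholders):

```python
# Minimal sketch of the GSM8K.jsonl format consumed by submit.py.
# Each line is one JSON object; only the "question" field is read.
import json
import pandas as pd

samples = [
    {"question": "Natalia sold clips to 48 of her friends in April ..."},
    {"question": "Weng earns $12 an hour for babysitting ..."},
]

with open("GSM8K.jsonl", "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

# submit.py reads the file the same way:
df = pd.read_json("GSM8K.jsonl", lines=True)
print(df["question"].tolist())
```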
Sample code:
// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <stdint.h> #include <string> #include "model_state.h" #include "model_instance_state.h" #include "triton/backend/backend_common.h" namespace triton { namespace backend { namespace mindie { // // ModelState // ///////////// extern "C" { TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) { const char* cname; RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); std::string name(cname); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); uint32_t api_version_major, api_version_minor; RETURN_IF_ERROR( TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "." + std::to_string(api_version_minor)) .c_str()); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("'") + name + "' TRITONBACKEND API version: " + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + std::to_string(TRITONBACKEND_API_VERSION_MINOR)).c_str()); if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, (std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "." + std::to_string(api_version_minor) + " does not support '" + name + "' TRITONBACKEND API version: " + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + std::to_string(TRITONBACKEND_API_VERSION_MINOR)).c_str()); } return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) { // Create a ModelState object and associate it with the // TRITONBACKEND_Model. If anything goes wrong with initialization // of the model state then an error is returned and Triton will fail // to load the model. 
ModelState* model_state; RETURN_IF_ERROR(ModelState::Create(model, &model_state)); RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state))); return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); ModelState* model_state = reinterpret_cast<ModelState*>(vstate); delete model_state; return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) { // Get the model state associated with this instance's model. TRITONBACKEND_Model* model; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); void* vmodelstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate); // Create a ModelInstanceState object and associate it with the // TRITONBACKEND_ModelInstance. ModelInstanceState* instance_state; RETURN_IF_ERROR(ModelInstanceState::Create(model_state, instance, &instance_state)); RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(instance, reinterpret_cast<void*>(instance_state))); return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); ModelInstanceState* instance_state = reinterpret_cast<ModelInstanceState*>(vstate); delete instance_state; return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) { ModelInstanceState* instance_state; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, reinterpret_cast<void**>(&instance_state))); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("model instance ") + instance_state->Name() + ", executing " + std::to_string(request_count) + " requests") .c_str()); instance_state->Enqueue(requests, request_count); return nullptr; // success } } // extern "C" }}} // namespace triton::backend::mindie
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclr. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef MINDIE_STATE_H #define MINDIE_STATE_H // // ModelState // // State associated with a model that is using this backend. An object // of this class is created and associated with each // TRITONBACKEND_Model. ModelState is derived from BackendModel class // provided in the backend utilities that provides many common // functions. // #include <string> #include <thread> #include <queue> #include <unordered_map> #include <set> #include <mutex> #include <atomic> #include <functional> #include <memory> #include <shared_mutex> #include "infer_task.h" #include "triton/backend/backend_model.h" #include "triton/backend/backend_common.h" #include "triton/core/tritonbackend.h" #include "llm_manager/llm_manager.h" namespace triton::backend::mindie { class ModelState : public BackendModel { public: static TRITONSERVER_Error *Create(TRITONBACKEND_Model *triton_model, ModelState **state); virtual ~ModelState(); // Name of the input and output tensor const std::string &InputTensorName() const { return input_name_; } const std::string &OutputTensorName() const { return output_name_; } // Datatype of the input and output tensor TRITONSERVER_DataType TensorDataType() const { return datatype_; } // Shape of the input and output tensor as given in the model // configuration file. This shape will not include the batch // dimension (if the model has one). const std::vector<int64_t> &TensorNonBatchShape() const { return nb_shape_; } // Validate that this model is supported by this backend. 
TRITONSERVER_Error *ValidateModelConfig(); std::queue<std::shared_ptr<mindie_llm::InferRequest>> &GetRequestsQueue(); std::unordered_map<mindie_llm::InferRequestId, std::shared_ptr<InferTask>> &GetInferTasksMap(); std::shared_mutex &GetMutex(); std::vector<mindie_llm::LlmManager*> GetLlmManagers(); private: ModelState(TRITONBACKEND_Model *triton_model); mindie_llm::GetRequestsCallback CreateGetRequestsCallback(); mindie_llm::SendResponsesCallback CreateSendResponsesCallback(); mindie_llm::ControlSignalCallback CreateControlSignalCallback(); mindie_llm::LlmManagerStatsCallback CreateLlmManagerStatsCallback(); mindie_llm::SendStatusResponseCallback CreateSendStatusResponseCallback(); std::pair<uint32_t, std::vector<std::set<size_t>>> GetInitConfig(); std::string input_name_; std::string output_name_; TRITONSERVER_DataType datatype_; bool shape_initialized_; std::vector<int64_t> nb_shape_; std::vector<int64_t> shape_; std::queue<std::shared_ptr<mindie_llm::InferRequest>> requests_; std::unordered_map<mindie_llm::InferRequestId, std::shared_ptr<InferTask>> inferTasksMap_; std::shared_mutex mutex_; std::vector<mindie_llm::LlmManager*> llmManagers_; }; } // namespace triton::backend::mindie #endif
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <unordered_set> #include "mindie_utils.h" #include "model_state.h" namespace triton::backend::mindie { ModelState::ModelState(TRITONBACKEND_Model *triton_model) : BackendModel(triton_model, true), shape_initialized_(false) { // Validate that the model's configuration matches what is supported // by this backend. 
THROW_IF_BACKEND_MODEL_ERROR(ValidateModelConfig()); std::string configPath = MindIEUtils::GetEngineConfigPath(this); LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("configPath: ") + configPath).c_str()); auto initConfig = GetInitConfig(); uint32_t modelInstanceNumber = initConfig.first; std::vector<std::set<size_t>> npuDeviceIds = initConfig.second; LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("modelInstanceNumber: ") + std::to_string(modelInstanceNumber)).c_str()); auto getRequestsFunc = CreateGetRequestsCallback(); auto sendResponsesFunc = CreateSendResponsesCallback(); auto controlSignalFunc = CreateControlSignalCallback(); auto llmManagerStatsFunc = CreateLlmManagerStatsCallback(); auto sendStatusResponseCallback = CreateSendStatusResponseCallback(); mindie_llm::LlmManager* llm_manager = nullptr; for (uint32_t modelInstanceId = 0; modelInstanceId < modelInstanceNumber; ++modelInstanceId) { llm_manager = new mindie_llm::LlmManager(configPath, getRequestsFunc, sendResponsesFunc, controlSignalFunc, llmManagerStatsFunc, sendStatusResponseCallback); llmManagers_.emplace_back(llm_manager); auto status = llm_manager->Init(modelInstanceId, npuDeviceIds[modelInstanceId]); if (!status.IsOk()) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to init llm_manager"); break; } } } mindie_llm::GetRequestsCallback ModelState::CreateGetRequestsCallback() { return [this]() -> std::vector<std::shared_ptr<mindie_llm::InferRequest>> { std::vector<std::shared_ptr<mindie_llm::InferRequest>> newRequests; std::unique_lock lock(this->mutex_); if (this->requests_.empty()) { return newRequests; } while (!this->requests_.empty()) { auto req = this->requests_.front(); auto requestId = req->GetRequestId(); newRequests.push_back(this->requests_.front()); this->requests_.pop(); } return newRequests; }; } mindie_llm::SendResponsesCallback ModelState::CreateSendResponsesCallback() { return [this](mindie_llm::InferRequestId reqId, const mindie_llm::TensorMap &tensorMap, bool isEnd, const std::string &flag) { auto item = this->inferTasksMap_.find(reqId); if (item == this->inferTasksMap_.end()) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("can't find infertask with reqId: ") + reqId.StringValue()).c_str()); return; } std::shared_ptr<InferTask> inferTask = item->second; TRITONBACKEND_ResponseFactory *responseFactory = inferTask->GetResponseFactory(); if (responseFactory == nullptr) { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, std::string("MindIE inferTask->GetResponseFactory() is nullptr").c_str()); return; } TRITONBACKEND_Response *bResponse; auto tritonErr = TRITONBACKEND_ResponseNewFromFactory(&bResponse, responseFactory); if (tritonErr != nullptr) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("ResponseNewFromFactory failed, reqId: ") + reqId.StringValue()).c_str()); return; } tritonErr = MindIEUtils::ConvertResponse(bResponse, tensorMap); if (tritonErr != nullptr) { LOG_IF_ERROR(tritonErr, std::string("generate triton response failed, reqId: ") + reqId.StringValue()); return; } tritonErr = TRITONBACKEND_ResponseSend(bResponse, isEnd ? 
TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0, tritonErr); if (isEnd) { TRITONBACKEND_RequestRelease(inferTask->GetBackendRequest(), TRITONSERVER_REQUEST_RELEASE_ALL); std::unique_lock lock(this->GetMutex()); this->inferTasksMap_.erase(item); } if (tritonErr != nullptr) { LOG_IF_ERROR(tritonErr, std::string("send triton response failed, reqId: ") + reqId.StringValue()); return; } }; } mindie_llm::ControlSignalCallback ModelState::CreateControlSignalCallback() { return [this]() -> std::vector<std::pair<mindie_llm::InferRequestId, mindie_llm::Operation>> { std::vector<std::pair<mindie_llm::InferRequestId, mindie_llm::Operation>> stopIds; return stopIds; }; } mindie_llm::LlmManagerStatsCallback ModelState::CreateLlmManagerStatsCallback() { return [this](const std::string &strData) { return; }; } mindie_llm::SendStatusResponseCallback ModelState::CreateSendStatusResponseCallback() { return [this](mindie_llm::InferRequestId requestId, mindie_llm::Status status, mindie_llm::StatusResponseType responsetype) { if (!(status == mindie_llm::Status())) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("Failed to stop request ") + requestId.StringValue() + ", Error msg: " + status.StatusMsg() + ", responsetype: ").c_str()); } }; } ModelState::~ModelState() { if (!llmManagers_.empty()) { for (auto& llm_manager:llmManagers_) { llm_manager->Shutdown(); delete llm_manager; llm_manager = nullptr; } llmManagers_.clear(); } } TRITONSERVER_Error *ModelState::Create(TRITONBACKEND_Model *triton_model, ModelState **state) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Create MindIE ModelState:")).c_str()); try { *state = new ModelState(triton_model); } catch (const BackendModelException &ex) { RETURN_ERROR_IF_TRUE(ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelException")); RETURN_IF_ERROR(ex.err_); } return nullptr; // success } std::pair<uint32_t, std::vector<std::set<size_t>>> ModelState::GetInitConfig() { triton::common::TritonJson::Value parameters; TRITONSERVER_Error* err = this->ModelConfig().MemberAsObject("parameters", ¶meters); if (err != nullptr) { TRITONSERVER_ErrorDelete(err); throw std::runtime_error("Model config doesn't have a parameters section"); } std::string modelInstanceNumber_s; LOG_IF_ERROR( GetParameterValue(parameters, "model_instance_number", &modelInstanceNumber_s), "failed to get modelInstanceNumber config."); std::string npuDeviceIds_s; LOG_IF_ERROR( GetParameterValue(parameters, "npu_device_ids", &npuDeviceIds_s), "failed to get npuDeviceIds config."); // 0 : number of modelInstances uint32_t modelInstanceNumber = static_cast<uint32_t>(modelInstanceNumber_s[0] - '0'); std::vector<std::set<size_t>> npuDeviceIds; std::set<size_t> npuDeviceIds_instance; for (const auto& id: npuDeviceIds_s) { if (id == ';') { npuDeviceIds.emplace_back(npuDeviceIds_instance); npuDeviceIds_instance.clear(); continue; } size_t npuId = static_cast<size_t>(id - '0'); npuDeviceIds_instance.emplace(npuId); } auto initConfig = std::pair(modelInstanceNumber, npuDeviceIds); return initConfig; } TRITONSERVER_Error *ModelState::ValidateModelConfig() { return nullptr; // success } std::queue<std::shared_ptr<mindie_llm::InferRequest>> &ModelState::GetRequestsQueue() { return requests_; } std::unordered_map<mindie_llm::InferRequestId, std::shared_ptr<InferTask>> &ModelState::GetInferTasksMap() { return inferTasksMap_; } std::shared_mutex &ModelState::GetMutex() { return mutex_; } std::vector<mindie_llm::LlmManager*> ModelState::GetLlmManagers() { return llmManagers_; 
} } // namespace triton::backend::mindie
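For reference, GetInitConfig above reads the config.pbtxt parameters model_instance_number and npu_device_ids as plain strings and walks them character by character, so only single-digit values are understood: every digit in npu_device_ids is one NPU id and every ';' closes the id set of one model instance. A hedged Python restatement of that parsing (illustration only, not part of the backend):

```python
# Illustration of how GetInitConfig in model_state.cc interprets the
# config.pbtxt parameters (single-digit ids, ';' separates instances).
def parse_init_config(model_instance_number: str, npu_device_ids: str):
    instance_count = int(model_instance_number[0])   # e.g. "1" -> 1
    device_sets, current = [], set()
    for ch in npu_device_ids:
        if ch == ";":
            device_sets.append(current)              # close one instance's id set
            current = set()
        else:
            current.add(int(ch))                     # one single-digit NPU id
    return instance_count, device_sets

# "01;" from the sample config.pbtxt -> one instance running on NPUs {0, 1}
print(parse_init_config("1", "01;"))                 # (1, [{0, 1}])
```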
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef MINDIE_INSTANCE_H #define MINDIE_INSTANCE_H // // ModelInstanceState // // State associated with a model instance. An object of this class is // created and associated with each // TRITONBACKEND_ModelInstance. ModelInstanceState is derived from // BackendModelInstance class provided in the backend utilities that // provides many common functions. // #include "triton/backend/backend_common.h" #include "triton/backend/backend_model_instance.h" #include "triton/backend/backend_model.h" #include "triton/core/tritonbackend.h" #include <string> #include "model_state.h" #include "infer_task.h" #include "mindie_utils.h" namespace triton::backend::mindie { class ModelInstanceState : public BackendModelInstance { public: static TRITONSERVER_Error *Create( ModelState *model_state, TRITONBACKEND_ModelInstance *triton_model_instance, ModelInstanceState **state); virtual ~ModelInstanceState() = default; // Get the state of the model that corresponds to this instance. ModelState *StateForModel() const { return model_state_; } void Enqueue(TRITONBACKEND_Request **requests, const uint32_t request_count); private: ModelInstanceState(ModelState *model_state, TRITONBACKEND_ModelInstance *triton_model_instance); ModelState *model_state_; }; } // namespace triton::backend::mindie #endif
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "model_instance_state.h" #include <cmath> namespace triton::backend::mindie { TRITONSERVER_Error *ModelInstanceState::Create( ModelState *model_state, TRITONBACKEND_ModelInstance *triton_model_instance, ModelInstanceState **state) { try { *state = new ModelInstanceState(model_state, triton_model_instance); } catch (const BackendModelInstanceException &ex) { RETURN_ERROR_IF_TRUE(ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelInstanceException")); RETURN_IF_ERROR(ex.err_); } return nullptr; } ModelInstanceState::ModelInstanceState(ModelState *model_state, TRITONBACKEND_ModelInstance *triton_model_instance) : BackendModelInstance(model_state, triton_model_instance), model_state_(model_state) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Create MindIE ModelInstanceState:")).c_str()); } void ModelInstanceState::Enqueue(TRITONBACKEND_Request **requests, const uint32_t request_count) { LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string("Process Requests: Executing ModelInstanceState::Enqueue")).c_str()); std::unordered_map<mindie_llm::InferRequestId, std::shared_ptr<triton::backend::mindie::InferTask>> newInferTaskMap; std::vector<std::shared_ptr<mindie_llm::InferRequest>> newRequests; for (uint32_t i = 0; i < request_count; i++) { TRITONBACKEND_Request *bRequest = requests[i]; auto inferTask = std::make_shared<InferTask>(bRequest); auto req = inferTask->GetMieRequest(); auto requestId = req->GetRequestId(); newInferTaskMap[requestId] = inferTask; newRequests.push_back(req); } std::unique_lock lock(model_state_->GetMutex()); model_state_->GetInferTasksMap().insert(newInferTaskMap.begin(), newInferTaskMap.end()); for (uint32_t i = 0; i < newRequests.size(); i++) { model_state_->GetRequestsQueue().push(newRequests.at(i)); } } } // namespace triton::backend::mindie
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef MINDIE_UTILS_H #define MINDIE_UTILS_H // // ModelInstanceState // // State associated with a model instance. An object of this class is // created and associated with each // TRITONBACKEND_ModelInstance. ModelInstanceState is derived from // BackendModelInstance class provided in the backend utilities that // provides many common functions. // #include "triton/backend/backend_common.h" #include "triton/backend/backend_model_instance.h" #include "triton/backend/backend_model.h" #include "triton/core/tritonbackend.h" #include "llm_manager/infer_request.h" #include "llm_manager/llm_manager.h" #include "infer_task.h" namespace triton::backend::mindie { class ModelState; class MindIEUtils { public: static TRITONSERVER_Error* ConvertResponse(TRITONBACKEND_Response* bResponse, const mindie_llm::TensorMap& result); static std::string GetEngineConfigPath(ModelState* model_state); }; } #endif
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "mindie_utils.h" #include <iostream> #include <cmath> #include <securec.h> #include "model_state.h" namespace triton::backend::mindie { TRITONSERVER_Error* MindIEUtils::ConvertResponse(TRITONBACKEND_Response* bResponse, const mindie_llm::TensorMap& result) { for (const auto& entry : result) { const std::string& name = entry.first; if (name != "OUTPUT_IDS" && name != "IBIS_EOS_ATTR") { continue; } const auto& tensor = entry.second; TRITONBACKEND_Output* output = nullptr; RETURN_IF_ERROR(TRITONBACKEND_ResponseOutput(bResponse, &output, name.c_str(), (TRITONSERVER_DataType)tensor->GetDataType(), tensor->GetShape().data(), tensor->GetShape().size())); uint64_t byteSize = tensor->GetSize(); void* buffer = nullptr; TRITONSERVER_MemoryType memory_type = (TRITONSERVER_MemoryType)tensor->GetMemType(); int64_t memory_type_id = 0; RETURN_IF_ERROR(TRITONBACKEND_OutputBuffer(output, &buffer, byteSize, &memory_type, &memory_type_id)); if (memory_type != TRITONSERVER_MEMORY_CPU && memory_type != TRITONSERVER_MEMORY_CPU_PINNED) { return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, "Triton failed to allocate output buffer on CPU"); } errno_t memRet = memcpy_s(buffer, byteSize, tensor->GetData(), byteSize); if (memRet != EOK) { return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, "Failed to copy tensor data to Triton output buffer"); } } return nullptr; } std::string MindIEUtils::GetEngineConfigPath(ModelState* model_state) { triton::common::TritonJson::Value parameters; TRITONSERVER_Error* err = model_state->ModelConfig().MemberAsObject("parameters", ¶meters); if (err != nullptr) { TRITONSERVER_ErrorDelete(err); throw std::runtime_error("Model config doesn't have a parameters section"); } std::string engineConfigPath; LOG_IF_ERROR( GetParameterValue(parameters, "engine_config_path", &engineConfigPath), "failed to get log_path config."); return engineConfigPath; } } // namespace triton::backend::mindie
#ifndef MINDIE_INFERTASK_H #define MINDIE_INFERTASK_H #include "triton/backend/backend_common.h" #include "triton/core/tritonbackend.h" #include "llm_manager/infer_request.h" #include "llm_manager/infer_tensor.h" #include <unordered_set> namespace triton::backend::mindie { class InferTask { public: explicit InferTask(TRITONBACKEND_Request* request); ~InferTask(); TRITONBACKEND_ResponseFactory* GetResponseFactory(); std::shared_ptr<mindie_llm::InferRequest> GetMieRequest() const; TRITONBACKEND_Request* GetBackendRequest() const; private: template <typename T> void SetSamplingValue(std::shared_ptr<mindie_llm::InferRequest>& leRequest, TRITONBACKEND_Request* request, const char* postprocessParam, mindie_llm::InferDataType type, T default_sampling_value) { const int64_t paramNum = 1; auto samplingTensor = std::make_shared<mindie_llm::InferTensor>(postprocessParam, type, std::vector<int64_t>{1, paramNum}); bool ret = samplingTensor->Allocate(paramNum * sizeof(T)); samplingTensor->SetRelease(false); if (!ret) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "failed to allocate data for sampling param"); } T *samplingBuffer = static_cast<T*>(samplingTensor->GetData()); TRITONBACKEND_Input* input = nullptr; const void* buffer; size_t bufferByteSize; TRITONSERVER_MemoryType memoryType; int64_t memoryType_id; if (TRITONBACKEND_RequestInput(request, postprocessParam, &input) != nullptr) { *samplingBuffer = static_cast<T>(default_sampling_value); } else { TRITONBACKEND_InputBuffer(input, 0, &buffer, &bufferByteSize, &memoryType, &memoryType_id); *samplingBuffer = static_cast<T>(*reinterpret_cast<const T*>(buffer)); } leRequest->AddTensor(postprocessParam, samplingTensor); } std::shared_ptr<mindie_llm::InferRequest> ConvertRequest(TRITONBACKEND_Request* request); void AddMieInput(TRITONBACKEND_Request* bRequest, std::shared_ptr<mindie_llm::InferRequest> &leRequest, const char* name); void SetSampling(std::shared_ptr<mindie_llm::InferRequest>& leRequest, TRITONBACKEND_Request* request); std::shared_ptr<mindie_llm::InferRequest> mieRequest_; TRITONBACKEND_ResponseFactory* responseFactory_; TRITONBACKEND_Request* backendRequest_; }; } #endif
#include "infer_task.h" #include <securec.h> namespace triton::backend::mindie { InferTask::InferTask(TRITONBACKEND_Request* request) { mieRequest_ = ConvertRequest(request); TRITONSERVER_Error* err = nullptr; err = TRITONBACKEND_ResponseFactoryNew(&responseFactory_, request); LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string("MindIE TRITONBACKEND_ResponseFactoryNew: ") + ((err == nullptr) ? "Success" : TRITONSERVER_ErrorMessage(err))) .c_str()); backendRequest_ = request; } InferTask::~InferTask() { if (responseFactory_ != nullptr) { TRITONBACKEND_ResponseFactoryDelete(responseFactory_); } } TRITONBACKEND_ResponseFactory* InferTask::GetResponseFactory() { return responseFactory_; } std::shared_ptr<mindie_llm::InferRequest> InferTask::GetMieRequest() const { return mieRequest_; } void InferTask::AddMieInput(TRITONBACKEND_Request* bRequest, std::shared_ptr<mindie_llm::InferRequest> &leRequest, const char* name) { TRITONBACKEND_Input* input = nullptr; if (TRITONBACKEND_RequestInput(bRequest, name, &input) != nullptr) { return; } const int64_t* shape; TRITONSERVER_DataType datatype; uint32_t dims_count; size_t buffer_byte_size; TRITONSERVER_MemoryType data_memory_type = TRITONSERVER_MEMORY_CPU; int64_t data_memory_id; const void* buffer = nullptr; TRITONBACKEND_InputProperties( input, &name, &datatype, &shape, &dims_count, nullptr, nullptr); TRITONBACKEND_InputBuffer( input, 0 /* idx */, &buffer, &buffer_byte_size, &data_memory_type, &data_memory_id); auto tensor = std::make_shared<mindie_llm::InferTensor>(name, static_cast<mindie_llm::InferDataType>(datatype), std::vector<int64_t>(shape, shape + dims_count)); bool ret = tensor->Allocate(buffer_byte_size); tensor->SetRelease(false); if (!ret) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "failed to allocate data for INPUT_IDS params"); } errno_t memRet = memcpy_s(tensor->GetData(), buffer_byte_size, buffer, buffer_byte_size); if (memRet != EOK) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Triton failed to allocate output buffer on CPU"); return; } leRequest->AddTensor(name, tensor); } void InferTask::SetSampling(std::shared_ptr<mindie_llm::InferRequest>& leRequest, TRITONBACKEND_Request* request) { SetSamplingValue<float>(leRequest, request, "TEMPERATURE", mindie_llm::InferDataType::TYPE_FP32, 1.0f); SetSamplingValue<int32_t>(leRequest, request, "TOP_K", mindie_llm::InferDataType::TYPE_INT32, int32_t(0)); SetSamplingValue<float>(leRequest, request, "TOP_P", mindie_llm::InferDataType::TYPE_FP32, 1.0f); SetSamplingValue<float>(leRequest, request, "TYPICAL_P", mindie_llm::InferDataType::TYPE_FP32, 1.0f); SetSamplingValue<bool>(leRequest, request, "DO_SAMPLE", mindie_llm::InferDataType::TYPE_BOOL, false); SetSamplingValue<uint64_t>(leRequest, request, "SEED", mindie_llm::InferDataType::TYPE_UINT64, uint64_t(0)); SetSamplingValue<float>(leRequest, request, "REPETITION_PENALTY", mindie_llm::InferDataType::TYPE_FP32, 1.0f); SetSamplingValue<float>(leRequest, request, "FREQUENCY_PENALTY", mindie_llm::InferDataType::TYPE_FP32, 0.0f); SetSamplingValue<float>(leRequest, request, "PRESENCE_PENALTY", mindie_llm::InferDataType::TYPE_FP32, 0.0f); SetSamplingValue<bool>(leRequest, request, "WATERMARK", mindie_llm::InferDataType::TYPE_BOOL, false); } std::shared_ptr<mindie_llm::InferRequest> InferTask::ConvertRequest( TRITONBACKEND_Request* request) { const char* request_id; if (TRITONBACKEND_RequestId(request, &request_id) != nullptr) { return nullptr; } auto leRequest = std::make_shared<mindie_llm::InferRequest>(mindie_llm::InferRequestId(request_id)); 
AddMieInput(request, leRequest, "INPUT_IDS"); SetSampling(leRequest, request); // mindie_llm::TensorPtr tp; // leRequest->GetTensorByName("REPETITION_PENALTY", tp); // float* buffer = static_cast<float*>(tp->GetData()); // std::cout << "************ REPETITION_PENALTY: " << *buffer // << "**************" << std::endl; return leRequest; } TRITONBACKEND_Request* InferTask::GetBackendRequest() const { return backendRequest_; } } // namespace triton::backend::mindie
config.pbtxt:

```
name: "llama3_8b"
max_batch_size : 200
backend: "mindie"

model_transaction_policy {
  decoupled: True
}

dynamic_batching {
  max_queue_delay_microseconds: 1000
  default_queue_policy : {
    timeout_action : 1
    default_timeout_microseconds : 50000000
  }
}

# path of config.json to mindie
parameters {
  key: "engine_config_path",
  value: {string_value: "tritonModels/config.json"}
}

# number of model instances, currently just support a single model instance
parameters {
  key: "model_instance_number"
  value: { string_value: "1" }
}

# npu card number on which each model instance is running, separate them sequentially with a semicolon
parameters {
  key: "npu_device_ids"
  value: { string_value: "01;"}
}

input [
  {
    name: "INPUT_IDS"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "TEMPERATURE"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "TOP_K"
    data_type: TYPE_INT32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "TOP_P"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "TYPICAL_P"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "DO_SAMPLE"
    data_type: TYPE_BOOL
    dims: [ -1 ]
    optional: true
  },
  {
    name: "SEED"
    data_type: TYPE_UINT64
    dims: [ -1 ]
    optional: true
  },
  {
    name: "REPETITION_PENALTY"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "FREQUENCY_PENALTY"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "PRESENCE_PENALTY"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "WATERMARK"
    data_type: TYPE_BOOL
    dims: [ -1 ]
    optional: true
  }
]

output [
  {
    name: "OUTPUT_IDS"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "IBIS_EOS_ATTR"
    data_type: TYPE_INT64
    dims: [ -1 ]
  }
]

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
```
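All sampling inputs in the config.pbtxt above are declared `optional: true`, and infer_task.cc falls back to defaults (TEMPERATURE 1.0, TOP_K 0, TOP_P 1.0, DO_SAMPLE false, and so on) whenever a tensor is missing, so a client only has to supply INPUT_IDS. The following is a minimal sketch, assuming the server started by launch.sh (gRPC port 8111), the llama3_8b model above, and already tokenized input ids; it follows the same streaming pattern as client_stream.py.

```python
# Minimal streaming request that relies on the backend's sampling defaults:
# only INPUT_IDS is sent; all optional sampling tensors are omitted.
from functools import partial
import queue

import numpy as np
import tritonclient.grpc as grpcclient

results = queue.Queue()

def callback(q, result, error):
    # Stream callback: store either the partial result or the error.
    q.put(error if error else result)

input_ids = np.array([[128000, 9906, 1917]], dtype=np.int64)  # placeholder token ids

with grpcclient.InferenceServerClient(url="localhost:8111") as client:
    client.start_stream(callback=partial(callback, results))
    infer_input = grpcclient.InferInput("INPUT_IDS", list(input_ids.shape), "INT64")
    infer_input.set_data_from_numpy(input_ids)
    client.async_stream_infer(
        model_name="llama3_8b",
        inputs=[infer_input],
        request_id="1",
        outputs=[grpcclient.InferRequestedOutput("OUTPUT_IDS")],
    )
    client.stop_stream()  # as in client_stream.py, wait for the stream to finish

# Drain the streamed responses and collect the generated token ids.
output_ids = []
while not results.empty():
    item = results.get()
    if not isinstance(item, Exception):
        output_ids.extend(item.as_numpy("OUTPUT_IDS"))
print(output_ids)
```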
config.json:

```
{
  "Version" : "1.0.0",
  "LogConfig" : {
    "logLevel" : "Info",
    "logFileSize" : 20,
    "logFileNum" : 20,
    "logPath" : "logs/mindservice.log"
  },
  "ServerConfig" : {
    "ipAddress" : "127.0.0.1",
    "managementIpAddress" : "127.0.0.2",
    "port" : 1025,
    "managementPort" : 1026,
    "allowAllZeroIpListening" : false,
    "maxLinkNum" : 1000,
    "httpsEnabled" : false,
    "fullTextEnabled" : false,
    "tlsCaPath" : "security/ca/",
    "tlsCaFile" : ["ca.pem"],
    "tlsCert" : "security/certs/server.pem",
    "tlsPk" : "security/keys/server.key.pem",
    "tlsPkPwd" : "security/pass/key_pwd.txt",
    "tlsCrl" : "security/certs/server_crl.pem",
    "managementTlsCaFile" : ["management_ca.pem"],
    "managementTlsCert" : "security/certs/management/server.pem",
    "managementTlsPk" : "security/keys/management/server.key.pem",
    "managementTlsPkPwd" : "security/pass/management/key_pwd.txt",
    "managementTlsCrl" : "security/certs/management/server_crl.pem",
    "kmcKsfMaster" : "tools/pmt/master/ksfa",
    "kmcKsfStandby" : "tools/pmt/standby/ksfb",
    "inferMode" : "standard",
    "interCommTLSEnabled" : true,
    "interCommPort" : 1121,
    "interCommTlsCaFile" : "security/grpc/ca/ca.pem",
    "interCommTlsCert" : "security/grpc/certs/server.pem",
    "interCommPk" : "security/grpc/keys/server.key.pem",
    "interCommPkPwd" : "security/grpc/pass/key_pwd.txt",
    "interCommTlsCrl" : "security/grpc/certs/server_crl.pem",
    "openAiSupport" : "vllm"
  },
  "BackendConfig" : {
    "backendName" : "mindieservice_llm_engine",
    "modelInstanceNumber" : 1,
    "npuDeviceIds" : [[0]],
    "tokenizerProcessNumber" : 8,
    "multiNodesInferEnabled" : false,
    "multiNodesInferPort" : 1120,
    "interNodeTLSEnabled" : true,
    "interNodeTlsCaFile" : "security/grpc/ca/ca.pem",
    "interNodeTlsCert" : "security/grpc/certs/server.pem",
    "interNodeTlsPk" : "security/grpc/keys/server.key.pem",
    "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt",
    "interNodeTlsCrl" : "security/grpc/certs/server_crl.pem",
    "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa",
    "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb",
    "ModelDeployConfig" : {
      "maxSeqLen" : 2560,
      "maxInputTokenLen" : 2048,
      "truncation" : false,
      "ModelConfig" : [
        {
          "modelInstanceType" : "Standard",
          "modelName" : "llama3_8b",
          "modelWeightPath" : "/path/to/weights/LLaMA3-8B",
          "worldSize" : 2,
          "cpuMemSize" : 5,
          "npuMemSize" : -1,
          "backendType" : "atb"
        }
      ]
    },
    "ScheduleConfig" : {
      "templateType" : "Standard",
      "templateName" : "Standard_LLM",
      "cacheBlockSize" : 128,
      "maxPrefillBatchSize" : 50,
      "maxPrefillTokens" : 8192,
      "prefillTimeMsPerReq" : 150,
      "prefillPolicyType" : 0,
      "decodeTimeMsPerReq" : 50,
      "decodePolicyType" : 0,
      "maxBatchSize" : 200,
      "maxIterTimes" : 512,
      "maxPreemptCount" : 0,
      "supportSelectBatch" : false,
      "maxQueueDelayMicroseconds" : 5000
    }
  }
}
```
set_env.sh:

```
# Set the environment variables for ascend-toolkit, nnal, atb_models, and mindie, and modify them manually based on their installation paths
# source ascend-toolkit/set_env.sh
# source nnal/atb/set_env.sh
# source atb_models/set_env.sh
# source mindie/set_env.sh

export LCCL_DETERMINISTIC=1
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
export ATB_LLM_LCOC_ENABLE=1
export ATB_STREAM_SYNC_EVERY_KERNEL_ENABLE=0
export ATB_STREAM_SYNC_EVERY_RUNNER_ENABLE=0
export ATB_STREAM_SYNC_EVERY_OPERATION_ENABLE=0
export ATB_OPERATION_EXECUTE_ASYNC=0
export TASK_QUEUE_ENABLE=0
export ATB_OPSRUNNER_KERNEL_CACHE_LOCAL_COUNT=8
export ATB_OPSRUNNER_KERNEL_CACHE_GLOABL_COUNT=16
export ATB_CONTEXT_WORKSPACE_RING=1
export HCCL_BUFFSIZE=120
export ATB_LAYER_INTERNAL_TENSOR_REUSE=1
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=12345
```
launch.sh:

```
export MINDIE_LLM_PYTHON_LOG_TO_FILE=0
export MINDIE_LLM_PYTHON_LOG_TO_STDOUT=0
export MINDIE_LLM_PYTHON_LOG_PATH=./mindiepython.log
export MINDIE_LLM_PYTHON_LOG_LEVEL=INFO
export MINDIE_LLM_LOG_TO_STDOUT=0
export MINDIE_LLM_LOG_TO_FILE=0
export MINDIE_LLM_LOG_LEVEL=INFO
export IBIS_PYTHON_LOG=0

rm -f ./tritonserver.log

# The path to the Triton model repository following --model-repository needs to be modified manually.
/opt/tritonserver/bin/tritonserver --model-repository=tritonModels \
    --backend-config=mindie,shm-default-byte-size=134217728 \
    --http-port 8110 --grpc-port 8111 --metrics-port 8112 \
    --log-verbose=1 --log-info=1 --log-warning=0 --log-error=0 \
    --log-file ./tritonserver.log
```
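Before running submit.py or client_stream.py, it can help to confirm that the server started above is reachable and that the model has loaded. A small hedged check using the same tritonclient gRPC API and the port configured in launch.sh:

```python
# Readiness check against the gRPC port (8111) configured in launch.sh.
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8111")
print("server ready:", client.is_server_ready())
print("model ready :", client.is_model_ready("llama3_8b"))
```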
import sys import datetime import queue import random import threading from infer_client import ModelClient, InferRequest import time import json import numpy as np import multiprocessing as mp from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED import argparse import pandas as pd from prettytable import PrettyTable from transformers import AutoTokenizer from logger_util import get_logger logger = get_logger() SAMPLING_Param = {'TEMPERATURE':1.0, # float32 'TOP_K':0, # int32 'TOP_P':1.0, # float32 'TYPICAL_P':1.0, # float32 'DO_SAMPLE':1, # bool 'SEED':0, # uint64 'REPETITION_PENALTY':1.0, # float32 'FREQUENCY_PENALTY':0, # float32 'PRESENCE_PENALTY':0, # float32 'WATERMARK':0 # bool } RESULT_FILE_NAME = '' MODEL_NAME = '' MODEL_PATH = '' TRACE_DATASET = '' DATASET_LANGUAGE = '' # total submission time TOTAL_SUBMIT_TIME = 0 # the number of requests for different request types REQUEST_TYPE_NUM = 2000 # input token quantity (randomly distributed with a uniform distribution within [INPUT_TOKEN_NUM_START, INPUT_TOKEN_NUM_END]) INPUT_TOKEN_NUM_START = 1 INPUT_TOKEN_NUM_END = 16 # output token quantity (randomly distributed with a uniform distribution within [OUTPUT_TOKEN_NUM_START, OUTPUT_TOKEN_NUM_END]) OUTPUT_TOKEN_NUM_START = 1 OUTPUT_TOKEN_NUM_END = 256 THREAD_NUM = 200 SEED = 1 random.seed(SEED) class SubmitService: def __init__(self, token_config_list, thread_num): self.requests = queue.Queue() self.exception_queue = queue.Queue() self.total_request_num = 0 self.submitted_request_num = 0 self.start_time = 0 self.elapsed = 0 self.thread_num = thread_num self.lock = threading.Lock() request_idx = 0 for token_config in token_config_list: request_num = token_config[0] self.total_request_num += request_num input_tensor = token_config[1] for _ in range(request_num): input_lens = token_config[2] question = token_config[3] self.requests.put(InferRequest(str(request_idx), input_tensor, input_lens, question, sampling_param=SAMPLING_Param, model_name=MODEL_NAME)) request_idx += 1 logger.info("total request num:{}".format(self.total_request_num)) self.metrics_queue = dict() def start(self): logger.info("\n############ submit job ###########") all_task = [] init = queue.Queue(self.thread_num) with ThreadPoolExecutor(self.thread_num + 1, thread_name_prefix='submit') as p: for i in range(THREAD_NUM): all_task.append(p.submit(self._submit_requests, i, init)) all_task.append(p.submit(self._monitor, init)) wait(all_task, return_when=ALL_COMPLETED) if not self.exception_queue.empty(): raise self.exception_queue.get() logger.info("############ submit over ###########") def _submit_requests(self, client_id, init): try: client = ModelClient(client_id) init.put(1) # wait all grpc client init ready while not init.full(): time.sleep(0.1) while not self.requests.empty(): with self.lock: if not self.requests.empty(): request = self.requests.get() self.requests.task_done() logger.info("start a new request.client id:{},request_id:{}".format(client_id, request.request_id)) success, metrics = client.infer(request) if not success: logger.info("request {} submit failed! 
triton url: {}, model name: {}, inputs: {}".format( request.request_id, client.server_url, request.model_name, request.inputs)) self._request_finish(request.request_id,metrics) client.triton_client.stop_stream() except Exception as e: self.exception_queue.put(e) def _request_finish(self, request_id, metrics): self.metrics_queue[request_id] = metrics.format() def format_metrics(self): metrics_list = [] for _, metric in self.metrics_queue.items(): metrics_list.append(metric) json_str = json.dumps(metrics_list, ensure_ascii=False) with open(RESULT_FILE_NAME, 'w') as f: f.write(json_str) request_df = pd.read_json(json_str, orient='records') metrics = PrettyTable(['METRICS', 'AVG', 'P75', 'P99', 'MAX', 'N']) origin_metric = ['FirstTokenTime', 'DecodeTime', 'MaxDecodeTime', 'GenerateTime', 'InputCharacters', 'InputTokens', 'GeneratedTokens', 'GenerateTokenSpeed'] for metric in origin_metric: if metric in ['FirstTokenTime', 'DecodeTime', 'MaxDecodeTime', 'GenerateTime']: metrics.add_row([metric, str(round(request_df[metric].mean() * 1000)) + 'ms', str(round(request_df[metric].quantile(0.75) * 1000)) + 'ms', str(round(request_df[metric].quantile(0.99) * 1000)) + 'ms', str(round(request_df[metric].max() * 1000)) + 'ms', request_df[metric].count()]) elif metric in ['InputCharacters', 'InputTokens', 'GeneratedTokens']: metrics.add_row([metric, round(request_df[metric].mean()), round(request_df[metric].quantile(0.75)), round(request_df[metric].quantile(0.99)), round(request_df[metric].max()), request_df[metric].count()]) else: metrics.add_row([metric, str(round(request_df[metric].mean(), 2)) + '/s', str(round(request_df[metric].quantile(0.75), 2)) + '/s', str(round(request_df[metric].quantile(0.99), 2)) + '/s', str(round(request_df[metric].max(), 2)) + '/s', request_df[metric].count()]) logger.info(metrics) total = PrettyTable(['METRICS', 'VALUE']) total.add_row(['Current Time', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')]) total.add_row(['Data Source', TRACE_DATASET]) total.add_row(['NonEmpty', request_df['NotEmpty'].sum()]) total.add_row(['Returned', request_df['InputCharacters'].count()]) total.add_row(['Total', self.total_request_num]) total.add_row(['Concurrency', THREAD_NUM]) total.add_row(['Time Elapsed', round(self.elapsed)]) total.add_row(['Throughput', round(self.total_request_num / self.elapsed, 2)]) total.add_row(['JCT', round(request_df['GenerateTime'].mean(), 2)]) total.add_row(['GenerateSpeed', round(request_df['GeneratedTokens'].sum() / self.elapsed, 2)]) total.add_row( ['GenerateSpeedPerClient', round(request_df['GeneratedTokens'].sum() / self.elapsed / THREAD_NUM, 2)]) logger.info(total) def _monitor(self, init): # wait all grpc client init ready while not init.full(): time.sleep(0.1) self.start_time = time.time() last_submitted_count = -1 while len(self.metrics_queue)<self.total_request_num: submitted_count = len(self.metrics_queue) if last_submitted_count != submitted_count: logger.info("progress: {}/{}, spend time:{}".format(submitted_count, self.total_request_num, int(time.time() - self.start_time))) last_submitted_count = submitted_count time.sleep(0.1) self.elapsed = time.time() - self.start_time self.format_metrics() def gen_requests_from_config(): token_config_list = list() for _ in range(REQUEST_TYPE_NUM): request_num = 1 input_token_num = random.randint(INPUT_TOKEN_NUM_START, INPUT_TOKEN_NUM_END) output_token_num = random.randint(OUTPUT_TOKEN_NUM_START, OUTPUT_TOKEN_NUM_END) token_config_list.append([request_num, input_token_num, output_token_num]) return 
token_config_list def gen_requests_from_trace(trace_file): tokenizer = AutoTokenizer.from_pretrained( MODEL_PATH, trust_remote_code=True, use_fast=True) requests = list() df = pd.read_json(trace_file, lines=True) for i, row in df.iterrows(): request_num = 1 question = row["question"] token = tokenizer([question], return_tensors="np") token = token["input_ids"].astype(np.int64) input_tensor = token.reshape(1, -1) input_lens = len(question) requests.append([request_num, input_tensor, input_lens, question]) return requests def parser_input(): parser = argparse.ArgumentParser(description='Personal information') parser.add_argument('--name', dest='name', required=True, type=str, help='Name of the model') parser.add_argument('--model_path', dest='model_path', required=True, type=str, help='path of the model') parser.add_argument('--trace_dataset', dest='trace_dataset', required=True, type=str, help='Trace dataset with jsonl style file') parser.add_argument('--dataset_language', dest='dataset_language', default='english', type=str, help='Language of the trace dataset') parser.add_argument('--req_num', dest='req_num', default=2000, type=int, help='Number of the request') parser.add_argument('--max_input_len', dest='max_input_len', default=16, type=int, help='Max length of input sequence') parser.add_argument('--max_output_len', dest='max_output_len', default=100, type=int, help='Max length of output sequence') args = parser.parse_args() globals()['MODEL_NAME'] = args.name globals()['MODEL_PATH'] = args.model_path globals()['REQUEST_TYPE_NUM'] = args.req_num globals()['INPUT_TOKEN_NUM_END'] = args.max_input_len globals()['OUTPUT_TOKEN_NUM_END'] = args.max_output_len globals()['TRACE_DATASET'] = args.trace_dataset globals()['DATASET_LANGUAGE'] = args.dataset_language def main(): parser_input() dt = datetime.datetime.now().strftime('%Y-%m-%d|%H:%M:%S') if TRACE_DATASET != '': requests = gen_requests_from_trace(TRACE_DATASET) globals()['RESULT_FILE_NAME'] = './trace.json' else: requests = gen_requests_from_config() globals()['RESULT_FILE_NAME'] = './mock-{}-{}-{}-{}-{}.json'.format(MODEL_NAME, REQUEST_TYPE_NUM, INPUT_TOKEN_NUM_END, OUTPUT_TOKEN_NUM_END, dt) submit_service = SubmitService(requests, THREAD_NUM) submit_service.start() if __name__ == "__main__": try: main() except: ttype, tvalue, traceback = sys.exc_info() print(ttype, tvalue, end="\n") idx = 1 while traceback: print("第{}层堆栈信息".format(idx)) tracebackCode = traceback.tb_frame.f_code print("文件名:{}".format(tracebackCode.co_filename)) print("函数或者模块名:{}".format(tracebackCode.co_name)) traceback = traceback.tb_next idx += 1 sys.exit(1)
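For reference, one way to drive submit.py against the sample dataset from Python; the argument names come from parser_input() above, while the model path and request count are placeholders to adapt:

```python
# Hedged example: launching submit.py; equivalent to running the same command
# in a shell. Paths and request count are placeholders.
import subprocess

subprocess.run(
    [
        "python", "submit.py",
        "--name", "llama3_8b",
        "--model_path", "/path/to/weights/LLaMA3-8B",
        "--trace_dataset", "GSM8K.jsonl",
        "--req_num", "100",
    ],
    check=True,
)
```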
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from functools import partial import numpy as np import queue from tritonclient.utils import * import tritonclient.grpc as grpcclient from transformers import AutoTokenizer SAMPLING_Param = {'TEMPERATURE':1.0, # float32 'TOP_K':0, # int32 'TOP_P':1.0, # float32 'TYPICAL_P':1.0, # float32 'DO_SAMPLE':1, # bool 'SEED':0, # uint64 'REPETITION_PENALTY':1.0, # float32 'FREQUENCY_PENALTY':0, # float32 'PRESENCE_PENALTY':0, # float32 'WATERMARK':0 # bool } MODEL_PATH = "weights/LLaMA3-8B" model_name = "llama3_8b" question = "Please introduce yourself." 
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True, use_fast=True) class UserData: def __init__(self): self._completed_requests = queue.Queue() def callback(user_data, result, error): if error: user_data._completed_requests.put(error) else: user_data._completed_requests.put(result) def main(): user_data = UserData() with grpcclient.InferenceServerClient(url="localhost:8111", verbose=False) as triton_client: triton_client.start_stream(callback=partial(callback, user_data)) request_id = '1' inputs = [] outputs = [] input_ids = tokenizer([question], return_tensors="np") input_ids = input_ids["input_ids"].astype(np.int64) input_ids = input_ids.reshape(1, -1) sampling_param = SAMPLING_Param temperature = np.array([[sampling_param['TEMPERATURE']]], np.float32) top_k = np.array([[sampling_param['TOP_K']]], np.int32) top_p = np.array([[sampling_param['TOP_P']]], np.float32) typical_p = np.array([[sampling_param['TYPICAL_P']]], np.float32) do_sample = np.array([[sampling_param['DO_SAMPLE']]], bool) random_seed = np.array([[sampling_param['SEED']]], np.uint64) repetition_penalty = np.array([[sampling_param['REPETITION_PENALTY']]], np.float32) frequency_penalty = np.array([[sampling_param['FREQUENCY_PENALTY']]], np.float32) presence_penalty = np.array([[sampling_param['PRESENCE_PENALTY']]], np.float32) watermark = np.array([[sampling_param['WATERMARK']]], bool) inputs.append(grpcclient.InferInput('INPUT_IDS', list(input_ids.shape), "INT64")) inputs.append(grpcclient.InferInput('TEMPERATURE', list(temperature.shape), "FP32")) inputs.append(grpcclient.InferInput('TOP_K', list(top_k.shape), "INT32")) inputs.append(grpcclient.InferInput('TOP_P', list(top_p.shape), "FP32")) inputs.append(grpcclient.InferInput('TYPICAL_P', list(typical_p.shape), "FP32")) inputs.append(grpcclient.InferInput('DO_SAMPLE', list(do_sample.shape), "BOOL")) inputs.append(grpcclient.InferInput('SEED', list(random_seed.shape), "UINT64")) inputs.append(grpcclient.InferInput('REPETITION_PENALTY', list(repetition_penalty.shape), "FP32")) inputs.append(grpcclient.InferInput('FREQUENCY_PENALTY', list(frequency_penalty.shape), "FP32")) inputs.append(grpcclient.InferInput('PRESENCE_PENALTY', list(presence_penalty.shape), "FP32")) inputs.append(grpcclient.InferInput('WATERMARK', list(watermark.shape), "BOOL")) inputs[0].set_data_from_numpy(input_ids) inputs[1].set_data_from_numpy(temperature) inputs[2].set_data_from_numpy(top_k) inputs[3].set_data_from_numpy(top_p) inputs[4].set_data_from_numpy(typical_p) inputs[5].set_data_from_numpy(do_sample) inputs[6].set_data_from_numpy(random_seed) inputs[7].set_data_from_numpy(repetition_penalty) inputs[8].set_data_from_numpy(frequency_penalty) inputs[9].set_data_from_numpy(presence_penalty) inputs[10].set_data_from_numpy(watermark) outputs.append(grpcclient.InferRequestedOutput("OUTPUT_IDS")) outputs.append(grpcclient.InferRequestedOutput('IBIS_EOS_ATTR')) triton_client.async_stream_infer(model_name=model_name, inputs=inputs, request_id=request_id, outputs=outputs) triton_client.stop_stream() output_ids = [] while not user_data._completed_requests.empty(): data_item = user_data._completed_requests.get() output_id = data_item.as_numpy("OUTPUT_IDS") output_ids.extend(output_id) output = tokenizer.decode(output_ids) print(f"output:\n{output}") if __name__=='__main__': main()
# infer_client.py
import sys
import queue
import time
from functools import partial

import numpy as np
from tritonclient.utils import *
import tritonclient.grpc as grpcclient
from transformers import AutoTokenizer

from logger_util import get_logger

logger = get_logger()


class UserData:
    def __init__(self):
        self.completed_requests = queue.Queue()


def callback(user_data, result, error):
    # Stream callback: push every (result, error) pair into the per-client queue.
    user_data.completed_requests.put((result, error))


class Metrics:
    """Per-request latency and throughput statistics."""

    def __init__(self):
        self.first_token_time = 0
        self.avg_decode_time = 0
        self.max_decode_time = 0
        self.full_time = 0
        self.prompt_lens = 0
        self.prompt_token_lens = 0
        self.generated_tokens = 0
        self.cost_timestamp_list = []
        self.client_id = -1

    def format(self):
        generate_token_speed = (self.generated_tokens / self.full_time) if self.full_time != 0 else 0
        return {'FirstTokenTime': self.first_token_time,
                'DecodeTime': self.avg_decode_time,
                'MaxDecodeTime': self.max_decode_time,
                'GenerateTime': self.full_time,
                'InputCharacters': self.prompt_lens,
                'InputTokens': self.prompt_token_lens,
                'GeneratedTokens': self.generated_tokens,
                'GenerateTokenSpeed': generate_token_speed,
                'NotEmpty': 0 if self.generated_tokens == 0 else 1,
                'CostTimestampList': self.cost_timestamp_list,
                'ClientID': self.client_id}


class InferRequest:
    """Packs the prompt tensor and sampling parameters into Triton input/output descriptors."""

    def __init__(self, request_id, input_tensor, input_lens, question, sampling_param,
                 max_iter_times=4096, model_name='ibis_benchmark'):
        self.submit_time = 0
        self.request_id = request_id
        self.input_lens = input_lens
        self.question = question
        self.input_token_lens = input_tensor.shape[1]

        temperature = np.array([[sampling_param['TEMPERATURE']]], np.float32)
        top_k = np.array([[sampling_param['TOP_K']]], np.int32)
        top_p = np.array([[sampling_param['TOP_P']]], np.float32)
        typical_p = np.array([[sampling_param['TYPICAL_P']]], np.float32)
        do_sample = np.array([[sampling_param['DO_SAMPLE']]], bool)
        random_seed = np.array([[sampling_param['SEED']]], np.uint64)
        repetition_penalty = np.array([[sampling_param['REPETITION_PENALTY']]], np.float32)
        frequency_penalty = np.array([[sampling_param['FREQUENCY_PENALTY']]], np.float32)
        presence_penalty = np.array([[sampling_param['PRESENCE_PENALTY']]], np.float32)
        watermark = np.array([[sampling_param['WATERMARK']]], bool)

        self.inputs = []
        self.inputs.append(grpcclient.InferInput('INPUT_IDS', list(input_tensor.shape), "INT64"))
        self.inputs.append(grpcclient.InferInput('TEMPERATURE', list(temperature.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('TOP_K', list(top_k.shape), "INT32"))
        self.inputs.append(grpcclient.InferInput('TOP_P', list(top_p.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('TYPICAL_P', list(typical_p.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('DO_SAMPLE', list(do_sample.shape), "BOOL"))
        self.inputs.append(grpcclient.InferInput('SEED', list(random_seed.shape), "UINT64"))
        self.inputs.append(grpcclient.InferInput('REPETITION_PENALTY', list(repetition_penalty.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('FREQUENCY_PENALTY', list(frequency_penalty.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('PRESENCE_PENALTY', list(presence_penalty.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('WATERMARK', list(watermark.shape), "BOOL"))

        self.inputs[0].set_data_from_numpy(input_tensor)
        self.inputs[1].set_data_from_numpy(temperature)
        self.inputs[2].set_data_from_numpy(top_k)
        self.inputs[3].set_data_from_numpy(top_p)
        self.inputs[4].set_data_from_numpy(typical_p)
        self.inputs[5].set_data_from_numpy(do_sample)
        self.inputs[6].set_data_from_numpy(random_seed)
        self.inputs[7].set_data_from_numpy(repetition_penalty)
        self.inputs[8].set_data_from_numpy(frequency_penalty)
        self.inputs[9].set_data_from_numpy(presence_penalty)
        self.inputs[10].set_data_from_numpy(watermark)

        self.outputs = []
        self.outputs.append(grpcclient.InferRequestedOutput('OUTPUT_IDS'))
        self.outputs.append(grpcclient.InferRequestedOutput('IBIS_EOS_ATTR'))

        self.max_iter_times = max_iter_times
        self.model_name = model_name


class ModelClient:
    """Streaming gRPC client that submits one InferRequest at a time and collects per-token metrics."""

    def __init__(self, client_id=0, server_url="localhost:8111", verbose=False):
        self.server_url = server_url
        self.client_id = client_id
        self.triton_client = grpcclient.InferenceServerClient(url=server_url, verbose=verbose)
        self.user_data = UserData()
        self.triton_client.start_stream(callback=partial(callback, self.user_data))

    def infer(self, request: InferRequest):
        metrics = Metrics()
        metrics.client_id = self.client_id
        metrics.prompt_lens = request.input_lens
        metrics.prompt_token_lens = request.input_token_lens
        begin_time = time.time()
        try:
            logger.info("Send a request, request id : {}".format(request.request_id))
            self.triton_client.async_stream_infer(model_name=request.model_name,
                                                  inputs=request.inputs,
                                                  request_id=request.request_id,
                                                  outputs=request.outputs)
        except Exception as e:
            raise e

        tokens = []
        last_token_time = time.time()
        metrics.cost_timestamp_list.append(last_token_time)
        decode_full_time = 0
        for i in range(request.max_iter_times):
            (output_tensor, error) = self.user_data.completed_requests.get()
            if error is not None:
                logger.error(error)
                return False, metrics
            gen_tokens = output_tensor.as_numpy('OUTPUT_IDS')
            for token in gen_tokens:
                tokens.append(token)
            if metrics.first_token_time == 0:
                # First response: record the time-to-first-token.
                metrics.first_token_time = time.time() - last_token_time
            else:
                # Subsequent responses: accumulate decode time and track the slowest per-token step.
                decode_time = time.time() - last_token_time
                gen_token_num = output_tensor.as_numpy('IBIS_EOS_ATTR')[1]
                single_decode_time = decode_time / gen_token_num
                decode_full_time += decode_time
                if single_decode_time > metrics.max_decode_time:
                    metrics.max_decode_time = single_decode_time
            last_token_time = time.time()
            metrics.cost_timestamp_list.append(last_token_time)
            inferparam = output_tensor.get_response().parameters['triton_final_response']
            if inferparam.bool_param:
                logger.info("receive an EOS token, request id : {}".format(request.request_id))
                break

        metrics.full_time = time.time() - begin_time
        metrics.generated_tokens = len(tokens)
        if metrics.generated_tokens > 1:
            metrics.avg_decode_time = decode_full_time / (metrics.generated_tokens - 1)
        logger.info("return from client infer, request id : {}".format(request.request_id))
        return True, metrics
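As a reference for how InferRequest and ModelClient fit together, below is a minimal driver sketch. It is not part of the example code: the tokenizer path weights/LLaMA3-8B, the server address localhost:8111, the model name llama3_8b and the sampling values simply mirror the single-request client above and are assumptions that must match your own deployment and config.pbtxt.

# driver_sketch.py -- hypothetical, for illustration only
import numpy as np
from transformers import AutoTokenizer

# Assumes the listing above is saved as infer_client.py on the Python path.
from infer_client import InferRequest, ModelClient

MODEL_PATH = "weights/LLaMA3-8B"   # assumed local tokenizer/weights path
SAMPLING_PARAM = {'TEMPERATURE': 1.0, 'TOP_K': 0, 'TOP_P': 1.0, 'TYPICAL_P': 1.0,
                  'DO_SAMPLE': 1, 'SEED': 0, 'REPETITION_PENALTY': 1.0,
                  'FREQUENCY_PENALTY': 0, 'PRESENCE_PENALTY': 0, 'WATERMARK': 0}


def run_one(question: str) -> None:
    # Tokenize the prompt into a [1, seq_len] INT64 tensor, as client_stream.py does.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True, use_fast=True)
    input_ids = tokenizer([question], return_tensors="np")["input_ids"].astype(np.int64).reshape(1, -1)

    # InferRequest packs the tensors; ModelClient owns the stream and the timing bookkeeping.
    request = InferRequest(request_id='1',
                           input_tensor=input_ids,
                           input_lens=len(question),
                           question=question,
                           sampling_param=SAMPLING_PARAM,
                           model_name='llama3_8b')
    client = ModelClient(client_id=0, server_url="localhost:8111")
    ok, metrics = client.infer(request)
    if ok:
        print(metrics.format())
    client.triton_client.stop_stream()


if __name__ == '__main__':
    run_one("Please introduce yourself.")

A real batch driver would loop this over a dataset, reuse one ModelClient per worker, and aggregate the dictionaries returned by Metrics.format() instead of printing them.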
# logger_util.py
import sys
import os
import time

import loguru

# Log timestamps in UTC.
os.environ['TZ'] = 'UTC'
time.tzset()


def get_logger():
    logger = loguru.logger
    logger.add("script.log", level="INFO")
    return logger
#!/bin/bash
set -e

export MINDIE_LLM_HOME_PATH=/usr/local/Ascend/mindie/latest/mindie-llm
export TRITON_HOME_PATH=/opt/tritonserver

if [ -z "$MINDIE_LLM_HOME_PATH" ]; then
    echo "env MINDIE_LLM_HOME_PATH is null, please install mindie and source set_env.sh"
    exit 1
fi

COMPILE_OPTIONS=""

# Match the C++ ABI of the installed torch build.
if [ "$(python3 -c 'import torch; print(torch.compiled_with_cxx11_abi())')" == "True" ]; then
    USE_CXX11_ABI=ON
else
    USE_CXX11_ABI=OFF
fi
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DUSE_CXX11_ABI=$USE_CXX11_ABI"

BUILD_TYPE="Release"
TRITON_VERSION="r24.02"

# -d: debug build; -v: Triton repo tag; -p: Triton install path.
while getopts "dv:p:" opt; do
    case ${opt} in
        d)
            BUILD_TYPE="Debug"
            ;;
        v)
            TRITON_VERSION=$OPTARG
            ;;
        p)
            export TRITON_HOME_PATH=$OPTARG
            ;;
        \?)
            echo "Invalid option: -$opt" >&2
            exit 1
            ;;
    esac
done

echo "Triton version: ${TRITON_VERSION}"

if [ -z "$TRITON_HOME_PATH" ]; then
    echo "env TRITON_HOME_PATH is null, please set env or use -p to tell us where triton is installed."
    exit 1
fi
echo "Triton install path: ${TRITON_HOME_PATH}"

if [ ! -d "$TRITON_HOME_PATH" ]; then
    echo "$TRITON_HOME_PATH is not a directory! Please check triton install path."
    exit 1
fi

rm -rf build && mkdir build && cd build

COMPILE_OPTIONS="${COMPILE_OPTIONS} -DCMAKE_BUILD_TYPE=$BUILD_TYPE"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DTRITON_COMMON_REPO_TAG=$TRITON_VERSION"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DTRITON_BACKEND_REPO_TAG=$TRITON_VERSION"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DTRITON_CORE_REPO_TAG=$TRITON_VERSION"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DTRITON_ENABLE_GPU=OFF"

cmake $COMPILE_OPTIONS ..
make install -j$(nproc)
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of NVIDIA CORPORATION nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cmake_minimum_required(VERSION 3.17)

project(tutorialmindiebackend LANGUAGES C CXX)

#
# Options
#
# Must include options required for this project as well as any
# projects included in this one by FetchContent.
#
# GPU support is disabled by default because mindie backend
# doesn't use GPUs.
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)

set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

add_compile_options(-std=c++17)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_SKIP_RPATH TRUE)

if (USE_CXX11_ABI)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
else()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pipe -fstack-protector-all")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--build-id=none")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-z,relro,-z,now")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pie -fexceptions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftrapv -s")
# add_definitions(-D_GLIBCXX_USE_C99=1)

#
# Dependencies
#
# FetchContent requires us to include the transitive closure of all
# repos that we depend on so that we can override the tags.
#
include(FetchContent)

FetchContent_Declare(
  repo-common
  GIT_REPOSITORY https://github.com/triton-inference-server/common.git
  GIT_TAG ${TRITON_COMMON_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-core
  GIT_REPOSITORY https://github.com/triton-inference-server/core.git
  GIT_TAG ${TRITON_CORE_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-backend
  GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
  GIT_TAG ${TRITON_BACKEND_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)

#
# The backend must be built into a shared library. Use an ldscript to
# hide all symbols except for the TRITONBACKEND API.
#
set(ASCEND_DRIVER_DIR /usr/local/Ascend/driver)

LINK_DIRECTORIES(
  $ENV{TRITON_HOME_PATH}/lib
  $ENV{MINDIE_LLM_HOME_PATH}/lib
)

INCLUDE_DIRECTORIES(
  $ENV{MINDIE_LLM_HOME_PATH}/include
  $ENV{MINDIE_LLM_HOME_PATH}/include/llm_manager
  $ENV{MINDIE_LLM_HOME_PATH}/include/llm_manager/utils
  ${ASCEND_DRIVER_DIR}/kernel/libc_sec/include
  $ENV{TRITON_HOME_PATH}/include
)

file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_LIST_DIR}/src/*.cc")

add_library(triton-mindie-backend SHARED ${SOURCE_FILES})

add_library(
  TutorialmindieBackend::triton-mindie-backend ALIAS triton-mindie-backend
)

target_include_directories(
  triton-mindie-backend
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src
)

target_compile_features(triton-mindie-backend PRIVATE cxx_std_11)

target_link_libraries(
  triton-mindie-backend
  PRIVATE
    triton-core-serverapi   # from repo-core
    triton-core-backendapi  # from repo-core
    triton-core-serverstub  # from repo-core
    triton-backend-utils    # from repo-backend
    mindie_llm_manager      # from mindie-llm
)

set_target_properties(
  triton-mindie-backend PROPERTIES
  OUTPUT_NAME triton_mindie
)

# Install
install(
  TARGETS triton-mindie-backend
  DESTINATION $ENV{TRITON_HOME_PATH}/backends/mindie
)