The code directory structure is as follows:
```
Triton_MindIE-LLM_Backend
|____build.sh
|____CMakeLists.txt
|____src
| |____mindie.cc
| |____model_state.cc
| |____model_state.h
| |____model_instance_state.cc
| |____model_instance_state.h
| |____mindie_utils.cc
| |____mindie_utils.h
| |____infer_task.cc
| |____infer_task.h
|____example
| |____tritonModels
| | |____llama3_8b
| | | |____1
| | | |____config.pbtxt
| | |____config.json
| |____set_env.sh
| |____launch.sh
| |____submit.py
| |____client_stream.py
| |____infer_client.py
| |____logger_util.py
| |____GSM8K.jsonl
```
The purpose of the source files under src and of the files under the example folder is described in Table 1 and Table 2, respectively.
**Table 1** Source files under src

| Source file | Meaning and purpose |
|---|---|
| mindie.cc | Contains the core interface functions that Triton exposes to a backend: TRITONBACKEND_Backend initialization, TRITONBACKEND_Model initialization and finalization, TRITONBACKEND_ModelInstance initialization and finalization, and the interface through which a model instance receives and executes inference requests. |
| model_state.cc | Source file of the model state class. In its constructor, engineConfigPath and five callback functions are used to construct one LlmManager object per model instance; after initialization, a thread reads all pending requests out of the queue stored in ModelState. |
| model_instance_state.cc | Source file of the model instance state class. Its core is the Enqueue member function, which parses every pending request coming from the triton::core side, converts it into the MindIE request type, and stores it in the ModelState queue for MindIE to read. |
| mindie_utils.cc | Defines several MindIE utility functions: one converts responses from the MindIE type to the Triton type, and the others parse configuration and parameter information from files. |
| infer_task.cc | Source file of the inference task class. Its core is the constructor, which converts the Triton-side request into the MindIE type and creates a response factory for each inference request; the factory is used in the send-response callback to create Triton-side responses. |
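Taken together, the files in Table 1 implement a producer/consumer hand-off: ModelInstanceState::Enqueue converts Triton requests and pushes them into the queue held by ModelState, LlmManager pulls them through the get-requests callback, and generated tokens come back through the send-responses callback using each task's response factory. The sketch below is a deliberately simplified Python illustration of that flow only; the actual implementation is the C++ code listed under "Sample code" further down.

```python
# Simplified illustration of the request flow in Table 1 (not the backend code).
import queue

request_queue = queue.Queue()   # filled by ModelInstanceState::Enqueue
infer_tasks = {}                # request id -> bookkeeping (response factory etc.)

def enqueue(triton_requests):
    """Triton side: convert requests to the MindIE type and queue them."""
    for req in triton_requests:
        infer_tasks[req["id"]] = {"request": req}
        request_queue.put(req)

def get_requests_callback():
    """MindIE side: LlmManager drains everything currently queued."""
    batch = []
    while not request_queue.empty():
        batch.append(request_queue.get())
    return batch

def send_responses_callback(req_id, output_ids, is_end):
    """MindIE side: turn generated tokens into a Triton response."""
    print(f"response for {req_id}: {output_ids} (final={is_end})")
    if is_end:
        infer_tasks.pop(req_id)

# Toy run: one request goes in, one final response comes back.
enqueue([{"id": "1", "input_ids": [1, 2, 3]}])
for req in get_requests_callback():
    send_responses_callback(req["id"], [42], is_end=True)
```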
**Table 2** Files under example

| File | Description |
|---|---|
| tritonModels | The model repository managed by Triton, containing the Triton-side configuration file config.pbtxt and the MindIE-side configuration file config.json for the LLaMa3_8b model. |
| set_env.sh | Environment variable script; modify it as needed according to the installation paths of CANN, ATB Models, and MindIE. |
| launch.sh | Starts the Tritonserver server; it also contains the MindIE LLM and Triton log-level configuration. |
| submit.py | Uses tritonclient to send the requests from the GSM8K.jsonl dataset to the server one by one. |
| client_stream.py | Uses tritonclient to send a single request to the server. |
| GSM8K.jsonl | Inference dataset; GSM8K is used here as an example and must be supplied by the user (see the format sketch below). |
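submit.py loads the dataset with pandas.read_json(..., lines=True) and tokenizes the "question" field of every line, so GSM8K.jsonl is expected to contain one JSON object per line with at least a "question" key. A minimal sketch of producing and reading such a file (the questions are placeholders):

```python
# Minimal sketch of the GSM8K.jsonl format consumed by submit.py.
# Each line is one JSON object; only the "question" field is read.
import json
import pandas as pd

samples = [
    {"question": "Natalia sold clips to 48 of her friends in April ..."},
    {"question": "Weng earns $12 an hour for babysitting ..."},
]

with open("GSM8K.jsonl", "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

# submit.py reads the file the same way:
df = pd.read_json("GSM8K.jsonl", lines=True)
print(df["question"].tolist())
```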
Sample code:
// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <stdint.h> #include <string> #include "model_state.h" #include "model_instance_state.h" #include "triton/backend/backend_common.h" namespace triton { namespace backend { namespace mindie { // // ModelState // ///////////// extern "C" { TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend) { const char* cname; RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname)); std::string name(cname); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_Initialize: ") + name).c_str()); uint32_t api_version_major, api_version_minor; RETURN_IF_ERROR( TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor)); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "." + std::to_string(api_version_minor)) .c_str()); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("'") + name + "' TRITONBACKEND API version: " + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + std::to_string(TRITONBACKEND_API_VERSION_MINOR)).c_str()); if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) || (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_UNSUPPORTED, (std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "." + std::to_string(api_version_minor) + " does not support '" + name + "' TRITONBACKEND API version: " + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + std::to_string(TRITONBACKEND_API_VERSION_MINOR)).c_str()); } return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model) { // Create a ModelState object and associate it with the // TRITONBACKEND_Model. If anything goes wrong with initialization // of the model state then an error is returned and Triton will fail // to load the model. 
ModelState* model_state; RETURN_IF_ERROR(ModelState::Create(model, &model_state)); RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state))); return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate)); ModelState* model_state = reinterpret_cast<ModelState*>(vstate); delete model_state; return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) { // Get the model state associated with this instance's model. TRITONBACKEND_Model* model; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model)); void* vmodelstate; RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate)); ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate); // Create a ModelInstanceState object and associate it with the // TRITONBACKEND_ModelInstance. ModelInstanceState* instance_state; RETURN_IF_ERROR(ModelInstanceState::Create(model_state, instance, &instance_state)); RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(instance, reinterpret_cast<void*>(instance_state))); return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) { void* vstate; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); ModelInstanceState* instance_state = reinterpret_cast<ModelInstanceState*>(vstate); delete instance_state; return nullptr; // success } TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute( TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests, const uint32_t request_count) { ModelInstanceState* instance_state; RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, reinterpret_cast<void**>(&instance_state))); LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("model instance ") + instance_state->Name() + ", executing " + std::to_string(request_count) + " requests") .c_str()); instance_state->Enqueue(requests, request_count); return nullptr; // success } } // extern "C" }}} // namespace triton::backend::mindie
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclr. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef MINDIE_STATE_H #define MINDIE_STATE_H // // ModelState // // State associated with a model that is using this backend. An object // of this class is created and associated with each // TRITONBACKEND_Model. ModelState is derived from BackendModel class // provided in the backend utilities that provides many common // functions. // #include <string> #include <thread> #include <queue> #include <unordered_map> #include <set> #include <mutex> #include <atomic> #include <functional> #include <memory> #include <shared_mutex> #include "infer_task.h" #include "triton/backend/backend_model.h" #include "triton/backend/backend_common.h" #include "triton/core/tritonbackend.h" #include "llm_manager/llm_manager.h" namespace triton::backend::mindie { class ModelState : public BackendModel { public: static TRITONSERVER_Error *Create(TRITONBACKEND_Model *triton_model, ModelState **state); virtual ~ModelState(); // Name of the input and output tensor const std::string &InputTensorName() const { return input_name_; } const std::string &OutputTensorName() const { return output_name_; } // Datatype of the input and output tensor TRITONSERVER_DataType TensorDataType() const { return datatype_; } // Shape of the input and output tensor as given in the model // configuration file. This shape will not include the batch // dimension (if the model has one). const std::vector<int64_t> &TensorNonBatchShape() const { return nb_shape_; } // Validate that this model is supported by this backend. 
TRITONSERVER_Error *ValidateModelConfig(); std::queue<std::shared_ptr<mindie_llm::InferRequest>> &GetRequestsQueue(); std::unordered_map<mindie_llm::InferRequestId, std::shared_ptr<InferTask>> &GetInferTasksMap(); std::shared_mutex &GetMutex(); std::vector<mindie_llm::LlmManager*> GetLlmManagers(); private: ModelState(TRITONBACKEND_Model *triton_model); mindie_llm::GetRequestsCallback CreateGetRequestsCallback(); mindie_llm::SendResponsesCallback CreateSendResponsesCallback(); mindie_llm::ControlSignalCallback CreateControlSignalCallback(); mindie_llm::LlmManagerStatsCallback CreateLlmManagerStatsCallback(); mindie_llm::SendStatusResponseCallback CreateSendStatusResponseCallback(); std::pair<uint32_t, std::vector<std::set<size_t>>> GetInitConfig(); std::string input_name_; std::string output_name_; TRITONSERVER_DataType datatype_; bool shape_initialized_; std::vector<int64_t> nb_shape_; std::vector<int64_t> shape_; std::queue<std::shared_ptr<mindie_llm::InferRequest>> requests_; std::unordered_map<mindie_llm::InferRequestId, std::shared_ptr<InferTask>> inferTasksMap_; std::shared_mutex mutex_; std::vector<mindie_llm::LlmManager*> llmManagers_; }; } // namespace triton::backend::mindie #endif
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <unordered_set> #include "mindie_utils.h" #include "model_state.h" namespace triton::backend::mindie { ModelState::ModelState(TRITONBACKEND_Model *triton_model) : BackendModel(triton_model, true), shape_initialized_(false) { // Validate that the model's configuration matches what is supported // by this backend. 
THROW_IF_BACKEND_MODEL_ERROR(ValidateModelConfig()); std::string configPath = MindIEUtils::GetEngineConfigPath(this); LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("configPath: ") + configPath).c_str()); auto initConfig = GetInitConfig(); uint32_t modelInstanceNumber = initConfig.first; std::vector<std::set<size_t>> npuDeviceIds = initConfig.second; LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("modelInstanceNumber: ") + std::to_string(modelInstanceNumber)).c_str()); auto getRequestsFunc = CreateGetRequestsCallback(); auto sendResponsesFunc = CreateSendResponsesCallback(); auto controlSignalFunc = CreateControlSignalCallback(); auto llmManagerStatsFunc = CreateLlmManagerStatsCallback(); auto sendStatusResponseCallback = CreateSendStatusResponseCallback(); mindie_llm::LlmManager* llm_manager = nullptr; for (uint32_t modelInstanceId = 0; modelInstanceId < modelInstanceNumber; ++modelInstanceId) { llm_manager = new mindie_llm::LlmManager(configPath, getRequestsFunc, sendResponsesFunc, controlSignalFunc, llmManagerStatsFunc, sendStatusResponseCallback); llmManagers_.emplace_back(llm_manager); auto status = llm_manager->Init(modelInstanceId, npuDeviceIds[modelInstanceId]); if (!status.IsOk()) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Failed to init llm_manager"); break; } } } mindie_llm::GetRequestsCallback ModelState::CreateGetRequestsCallback() { return [this]() -> std::vector<std::shared_ptr<mindie_llm::InferRequest>> { std::vector<std::shared_ptr<mindie_llm::InferRequest>> newRequests; std::unique_lock lock(this->mutex_); if (this->requests_.empty()) { return newRequests; } while (!this->requests_.empty()) { auto req = this->requests_.front(); auto requestId = req->GetRequestId(); newRequests.push_back(this->requests_.front()); this->requests_.pop(); } return newRequests; }; } mindie_llm::SendResponsesCallback ModelState::CreateSendResponsesCallback() { return [this](mindie_llm::InferRequestId reqId, const mindie_llm::TensorMap &tensorMap, bool isEnd, const std::string &flag) { auto item = this->inferTasksMap_.find(reqId); if (item == this->inferTasksMap_.end()) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("can't find infertask with reqId: ") + reqId.StringValue()).c_str()); return; } std::shared_ptr<InferTask> inferTask = item->second; TRITONBACKEND_ResponseFactory *responseFactory = inferTask->GetResponseFactory(); if (responseFactory == nullptr) { LOG_MESSAGE( TRITONSERVER_LOG_ERROR, std::string("MindIE inferTask->GetResponseFactory() is nullptr").c_str()); return; } TRITONBACKEND_Response *bResponse; auto tritonErr = TRITONBACKEND_ResponseNewFromFactory(&bResponse, responseFactory); if (tritonErr != nullptr) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("ResponseNewFromFactory failed, reqId: ") + reqId.StringValue()).c_str()); return; } tritonErr = MindIEUtils::ConvertResponse(bResponse, tensorMap); if (tritonErr != nullptr) { LOG_IF_ERROR(tritonErr, std::string("generate triton response failed, reqId: ") + reqId.StringValue()); return; } tritonErr = TRITONBACKEND_ResponseSend(bResponse, isEnd ? 
TRITONSERVER_RESPONSE_COMPLETE_FINAL : 0, tritonErr); if (isEnd) { TRITONBACKEND_RequestRelease(inferTask->GetBackendRequest(), TRITONSERVER_REQUEST_RELEASE_ALL); std::unique_lock lock(this->GetMutex()); this->inferTasksMap_.erase(item); } if (tritonErr != nullptr) { LOG_IF_ERROR(tritonErr, std::string("send triton response failed, reqId: ") + reqId.StringValue()); return; } }; } mindie_llm::ControlSignalCallback ModelState::CreateControlSignalCallback() { return [this]() -> std::vector<std::pair<mindie_llm::InferRequestId, mindie_llm::Operation>> { std::vector<std::pair<mindie_llm::InferRequestId, mindie_llm::Operation>> stopIds; return stopIds; }; } mindie_llm::LlmManagerStatsCallback ModelState::CreateLlmManagerStatsCallback() { return [this](const std::string &strData) { return; }; } mindie_llm::SendStatusResponseCallback ModelState::CreateSendStatusResponseCallback() { return [this](mindie_llm::InferRequestId requestId, mindie_llm::Status status, mindie_llm::StatusResponseType responsetype) { if (!(status == mindie_llm::Status())) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, (std::string("Failed to stop request ") + requestId.StringValue() + ", Error msg: " + status.StatusMsg() + ", responsetype: ").c_str()); } }; } ModelState::~ModelState() { if (!llmManagers_.empty()) { for (auto& llm_manager:llmManagers_) { llm_manager->Shutdown(); delete llm_manager; llm_manager = nullptr; } llmManagers_.clear(); } } TRITONSERVER_Error *ModelState::Create(TRITONBACKEND_Model *triton_model, ModelState **state) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Create MindIE ModelState:")).c_str()); try { *state = new ModelState(triton_model); } catch (const BackendModelException &ex) { RETURN_ERROR_IF_TRUE(ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelException")); RETURN_IF_ERROR(ex.err_); } return nullptr; // success } std::pair<uint32_t, std::vector<std::set<size_t>>> ModelState::GetInitConfig() { triton::common::TritonJson::Value parameters; TRITONSERVER_Error* err = this->ModelConfig().MemberAsObject("parameters", ¶meters); if (err != nullptr) { TRITONSERVER_ErrorDelete(err); throw std::runtime_error("Model config doesn't have a parameters section"); } std::string modelInstanceNumber_s; LOG_IF_ERROR( GetParameterValue(parameters, "model_instance_number", &modelInstanceNumber_s), "failed to get modelInstanceNumber config."); std::string npuDeviceIds_s; LOG_IF_ERROR( GetParameterValue(parameters, "npu_device_ids", &npuDeviceIds_s), "failed to get npuDeviceIds config."); // 0 : number of modelInstances uint32_t modelInstanceNumber = static_cast<uint32_t>(modelInstanceNumber_s[0] - '0'); std::vector<std::set<size_t>> npuDeviceIds; std::set<size_t> npuDeviceIds_instance; for (const auto& id: npuDeviceIds_s) { if (id == ';') { npuDeviceIds.emplace_back(npuDeviceIds_instance); npuDeviceIds_instance.clear(); continue; } size_t npuId = static_cast<size_t>(id - '0'); npuDeviceIds_instance.emplace(npuId); } auto initConfig = std::pair(modelInstanceNumber, npuDeviceIds); return initConfig; } TRITONSERVER_Error *ModelState::ValidateModelConfig() { return nullptr; // success } std::queue<std::shared_ptr<mindie_llm::InferRequest>> &ModelState::GetRequestsQueue() { return requests_; } std::unordered_map<mindie_llm::InferRequestId, std::shared_ptr<InferTask>> &ModelState::GetInferTasksMap() { return inferTasksMap_; } std::shared_mutex &ModelState::GetMutex() { return mutex_; } std::vector<mindie_llm::LlmManager*> ModelState::GetLlmManagers() { return llmManagers_; 
} } // namespace triton::backend::mindie
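For reference, GetInitConfig above reads the config.pbtxt parameters model_instance_number and npu_device_ids as plain strings and walks them character by character, so only single-digit values are understood: every digit in npu_device_ids is one NPU id and every ';' closes the id set of one model instance. A hedged Python restatement of that parsing (illustration only, not part of the backend):

```python
# Illustration of how GetInitConfig in model_state.cc interprets the
# config.pbtxt parameters (single-digit ids, ';' separates instances).
def parse_init_config(model_instance_number: str, npu_device_ids: str):
    instance_count = int(model_instance_number[0])   # e.g. "1" -> 1
    device_sets, current = [], set()
    for ch in npu_device_ids:
        if ch == ";":
            device_sets.append(current)              # close one instance's id set
            current = set()
        else:
            current.add(int(ch))                     # one single-digit NPU id
    return instance_count, device_sets

# "01;" from the sample config.pbtxt -> one instance running on NPUs {0, 1}
print(parse_init_config("1", "01;"))                 # (1, [{0, 1}])
```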
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef MINDIE_INSTANCE_H #define MINDIE_INSTANCE_H // // ModelInstanceState // // State associated with a model instance. An object of this class is // created and associated with each // TRITONBACKEND_ModelInstance. ModelInstanceState is derived from // BackendModelInstance class provided in the backend utilities that // provides many common functions. // #include "triton/backend/backend_common.h" #include "triton/backend/backend_model_instance.h" #include "triton/backend/backend_model.h" #include "triton/core/tritonbackend.h" #include <string> #include "model_state.h" #include "infer_task.h" #include "mindie_utils.h" namespace triton::backend::mindie { class ModelInstanceState : public BackendModelInstance { public: static TRITONSERVER_Error *Create( ModelState *model_state, TRITONBACKEND_ModelInstance *triton_model_instance, ModelInstanceState **state); virtual ~ModelInstanceState() = default; // Get the state of the model that corresponds to this instance. ModelState *StateForModel() const { return model_state_; } void Enqueue(TRITONBACKEND_Request **requests, const uint32_t request_count); private: ModelInstanceState(ModelState *model_state, TRITONBACKEND_ModelInstance *triton_model_instance); ModelState *model_state_; }; } // namespace triton::backend::mindie #endif
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "model_instance_state.h" #include <cmath> namespace triton::backend::mindie { TRITONSERVER_Error *ModelInstanceState::Create( ModelState *model_state, TRITONBACKEND_ModelInstance *triton_model_instance, ModelInstanceState **state) { try { *state = new ModelInstanceState(model_state, triton_model_instance); } catch (const BackendModelInstanceException &ex) { RETURN_ERROR_IF_TRUE(ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL, std::string("unexpected nullptr in BackendModelInstanceException")); RETURN_IF_ERROR(ex.err_); } return nullptr; } ModelInstanceState::ModelInstanceState(ModelState *model_state, TRITONBACKEND_ModelInstance *triton_model_instance) : BackendModelInstance(model_state, triton_model_instance), model_state_(model_state) { LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Create MindIE ModelInstanceState:")).c_str()); } void ModelInstanceState::Enqueue(TRITONBACKEND_Request **requests, const uint32_t request_count) { LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string("Process Requests: Executing ModelInstanceState::Enqueue")).c_str()); std::unordered_map<mindie_llm::InferRequestId, std::shared_ptr<triton::backend::mindie::InferTask>> newInferTaskMap; std::vector<std::shared_ptr<mindie_llm::InferRequest>> newRequests; for (uint32_t i = 0; i < request_count; i++) { TRITONBACKEND_Request *bRequest = requests[i]; auto inferTask = std::make_shared<InferTask>(bRequest); auto req = inferTask->GetMieRequest(); auto requestId = req->GetRequestId(); newInferTaskMap[requestId] = inferTask; newRequests.push_back(req); } std::unique_lock lock(model_state_->GetMutex()); model_state_->GetInferTasksMap().insert(newInferTaskMap.begin(), newInferTaskMap.end()); for (uint32_t i = 0; i < newRequests.size(); i++) { model_state_->GetRequestsQueue().push(newRequests.at(i)); } } } // namespace triton::backend::mindie
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef MINDIE_UTILS_H #define MINDIE_UTILS_H // // ModelInstanceState // // State associated with a model instance. An object of this class is // created and associated with each // TRITONBACKEND_ModelInstance. ModelInstanceState is derived from // BackendModelInstance class provided in the backend utilities that // provides many common functions. // #include "triton/backend/backend_common.h" #include "triton/backend/backend_model_instance.h" #include "triton/backend/backend_model.h" #include "triton/core/tritonbackend.h" #include "llm_manager/infer_request.h" #include "llm_manager/llm_manager.h" #include "infer_task.h" namespace triton::backend::mindie { class ModelState; class MindIEUtils { public: static TRITONSERVER_Error* ConvertResponse(TRITONBACKEND_Response* bResponse, const mindie_llm::TensorMap& result); static std::string GetEngineConfigPath(ModelState* model_state); }; } #endif
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "mindie_utils.h" #include <iostream> #include <cmath> #include <securec.h> #include "model_state.h" namespace triton::backend::mindie { TRITONSERVER_Error* MindIEUtils::ConvertResponse(TRITONBACKEND_Response* bResponse, const mindie_llm::TensorMap& result) { for (const auto& entry : result) { const std::string& name = entry.first; if (name != "OUTPUT_IDS" && name != "IBIS_EOS_ATTR") { continue; } const auto& tensor = entry.second; TRITONBACKEND_Output* output = nullptr; RETURN_IF_ERROR(TRITONBACKEND_ResponseOutput(bResponse, &output, name.c_str(), (TRITONSERVER_DataType)tensor->GetDataType(), tensor->GetShape().data(), tensor->GetShape().size())); uint64_t byteSize = tensor->GetSize(); void* buffer = nullptr; TRITONSERVER_MemoryType memory_type = (TRITONSERVER_MemoryType)tensor->GetMemType(); int64_t memory_type_id = 0; RETURN_IF_ERROR(TRITONBACKEND_OutputBuffer(output, &buffer, byteSize, &memory_type, &memory_type_id)); if (memory_type != TRITONSERVER_MEMORY_CPU && memory_type != TRITONSERVER_MEMORY_CPU_PINNED) { return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, "Triton failed to allocate output buffer on CPU"); } errno_t memRet = memcpy_s(buffer, byteSize, tensor->GetData(), byteSize); if (memRet != EOK) { return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, "Failed to copy tensor data to Triton output buffer"); } } return nullptr; } std::string MindIEUtils::GetEngineConfigPath(ModelState* model_state) { triton::common::TritonJson::Value parameters; TRITONSERVER_Error* err = model_state->ModelConfig().MemberAsObject("parameters", ¶meters); if (err != nullptr) { TRITONSERVER_ErrorDelete(err); throw std::runtime_error("Model config doesn't have a parameters section"); } std::string engineConfigPath; LOG_IF_ERROR( GetParameterValue(parameters, "engine_config_path", &engineConfigPath), "failed to get log_path config."); return engineConfigPath; } } // namespace triton::backend::mindie
#ifndef MINDIE_INFERTASK_H #define MINDIE_INFERTASK_H #include "triton/backend/backend_common.h" #include "triton/core/tritonbackend.h" #include "llm_manager/infer_request.h" #include "llm_manager/infer_tensor.h" #include <unordered_set> namespace triton::backend::mindie { class InferTask { public: explicit InferTask(TRITONBACKEND_Request* request); ~InferTask(); TRITONBACKEND_ResponseFactory* GetResponseFactory(); std::shared_ptr<mindie_llm::InferRequest> GetMieRequest() const; TRITONBACKEND_Request* GetBackendRequest() const; private: template <typename T> void SetSamplingValue(std::shared_ptr<mindie_llm::InferRequest>& leRequest, TRITONBACKEND_Request* request, const char* postprocessParam, mindie_llm::InferDataType type, T default_sampling_value) { const int64_t paramNum = 1; auto samplingTensor = std::make_shared<mindie_llm::InferTensor>(postprocessParam, type, std::vector<int64_t>{1, paramNum}); bool ret = samplingTensor->Allocate(paramNum * sizeof(T)); samplingTensor->SetRelease(false); if (!ret) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "failed to allocate data for sampling param"); } T *samplingBuffer = static_cast<T*>(samplingTensor->GetData()); TRITONBACKEND_Input* input = nullptr; const void* buffer; size_t bufferByteSize; TRITONSERVER_MemoryType memoryType; int64_t memoryType_id; if (TRITONBACKEND_RequestInput(request, postprocessParam, &input) != nullptr) { *samplingBuffer = static_cast<T>(default_sampling_value); } else { TRITONBACKEND_InputBuffer(input, 0, &buffer, &bufferByteSize, &memoryType, &memoryType_id); *samplingBuffer = static_cast<T>(*reinterpret_cast<const T*>(buffer)); } leRequest->AddTensor(postprocessParam, samplingTensor); } std::shared_ptr<mindie_llm::InferRequest> ConvertRequest(TRITONBACKEND_Request* request); void AddMieInput(TRITONBACKEND_Request* bRequest, std::shared_ptr<mindie_llm::InferRequest> &leRequest, const char* name); void SetSampling(std::shared_ptr<mindie_llm::InferRequest>& leRequest, TRITONBACKEND_Request* request); std::shared_ptr<mindie_llm::InferRequest> mieRequest_; TRITONBACKEND_ResponseFactory* responseFactory_; TRITONBACKEND_Request* backendRequest_; }; } #endif
#include "infer_task.h" #include <securec.h> namespace triton::backend::mindie { InferTask::InferTask(TRITONBACKEND_Request* request) { mieRequest_ = ConvertRequest(request); TRITONSERVER_Error* err = nullptr; err = TRITONBACKEND_ResponseFactoryNew(&responseFactory_, request); LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, (std::string("MindIE TRITONBACKEND_ResponseFactoryNew: ") + ((err == nullptr) ? "Success" : TRITONSERVER_ErrorMessage(err))) .c_str()); backendRequest_ = request; } InferTask::~InferTask() { if (responseFactory_ != nullptr) { TRITONBACKEND_ResponseFactoryDelete(responseFactory_); } } TRITONBACKEND_ResponseFactory* InferTask::GetResponseFactory() { return responseFactory_; } std::shared_ptr<mindie_llm::InferRequest> InferTask::GetMieRequest() const { return mieRequest_; } void InferTask::AddMieInput(TRITONBACKEND_Request* bRequest, std::shared_ptr<mindie_llm::InferRequest> &leRequest, const char* name) { TRITONBACKEND_Input* input = nullptr; if (TRITONBACKEND_RequestInput(bRequest, name, &input) != nullptr) { return; } const int64_t* shape; TRITONSERVER_DataType datatype; uint32_t dims_count; size_t buffer_byte_size; TRITONSERVER_MemoryType data_memory_type = TRITONSERVER_MEMORY_CPU; int64_t data_memory_id; const void* buffer = nullptr; TRITONBACKEND_InputProperties( input, &name, &datatype, &shape, &dims_count, nullptr, nullptr); TRITONBACKEND_InputBuffer( input, 0 /* idx */, &buffer, &buffer_byte_size, &data_memory_type, &data_memory_id); auto tensor = std::make_shared<mindie_llm::InferTensor>(name, static_cast<mindie_llm::InferDataType>(datatype), std::vector<int64_t>(shape, shape + dims_count)); bool ret = tensor->Allocate(buffer_byte_size); tensor->SetRelease(false); if (!ret) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "failed to allocate data for INPUT_IDS params"); } errno_t memRet = memcpy_s(tensor->GetData(), buffer_byte_size, buffer, buffer_byte_size); if (memRet != EOK) { LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Triton failed to allocate output buffer on CPU"); return; } leRequest->AddTensor(name, tensor); } void InferTask::SetSampling(std::shared_ptr<mindie_llm::InferRequest>& leRequest, TRITONBACKEND_Request* request) { SetSamplingValue<float>(leRequest, request, "TEMPERATURE", mindie_llm::InferDataType::TYPE_FP32, 1.0f); SetSamplingValue<int32_t>(leRequest, request, "TOP_K", mindie_llm::InferDataType::TYPE_INT32, int32_t(0)); SetSamplingValue<float>(leRequest, request, "TOP_P", mindie_llm::InferDataType::TYPE_FP32, 1.0f); SetSamplingValue<float>(leRequest, request, "TYPICAL_P", mindie_llm::InferDataType::TYPE_FP32, 1.0f); SetSamplingValue<bool>(leRequest, request, "DO_SAMPLE", mindie_llm::InferDataType::TYPE_BOOL, false); SetSamplingValue<uint64_t>(leRequest, request, "SEED", mindie_llm::InferDataType::TYPE_UINT64, uint64_t(0)); SetSamplingValue<float>(leRequest, request, "REPETITION_PENALTY", mindie_llm::InferDataType::TYPE_FP32, 1.0f); SetSamplingValue<float>(leRequest, request, "FREQUENCY_PENALTY", mindie_llm::InferDataType::TYPE_FP32, 0.0f); SetSamplingValue<float>(leRequest, request, "PRESENCE_PENALTY", mindie_llm::InferDataType::TYPE_FP32, 0.0f); SetSamplingValue<bool>(leRequest, request, "WATERMARK", mindie_llm::InferDataType::TYPE_BOOL, false); } std::shared_ptr<mindie_llm::InferRequest> InferTask::ConvertRequest( TRITONBACKEND_Request* request) { const char* request_id; if (TRITONBACKEND_RequestId(request, &request_id) != nullptr) { return nullptr; } auto leRequest = std::make_shared<mindie_llm::InferRequest>(mindie_llm::InferRequestId(request_id)); 
AddMieInput(request, leRequest, "INPUT_IDS"); SetSampling(leRequest, request); // mindie_llm::TensorPtr tp; // leRequest->GetTensorByName("REPETITION_PENALTY", tp); // float* buffer = static_cast<float*>(tp->GetData()); // std::cout << "************ REPETITION_PENALTY: " << *buffer // << "**************" << std::endl; return leRequest; } TRITONBACKEND_Request* InferTask::GetBackendRequest() const { return backendRequest_; } } // namespace triton::backend::mindie
config.pbtxt:

```
name: "llama3_8b"
max_batch_size : 200
backend: "mindie"

model_transaction_policy {
  decoupled: True
}

dynamic_batching {
  max_queue_delay_microseconds: 1000
  default_queue_policy : {
    timeout_action : 1
    default_timeout_microseconds : 50000000
  }
}

# path of config.json to mindie
parameters {
  key: "engine_config_path",
  value: {string_value: "tritonModels/config.json"}
}

# number of model instances, currently just support a single model instance
parameters {
  key: "model_instance_number"
  value: { string_value: "1" }
}

# npu card number on which each model instance is running, separate them sequentially with a semicolon
parameters {
  key: "npu_device_ids"
  value: { string_value: "01;"}
}

input [
  {
    name: "INPUT_IDS"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "TEMPERATURE"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "TOP_K"
    data_type: TYPE_INT32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "TOP_P"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "TYPICAL_P"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "DO_SAMPLE"
    data_type: TYPE_BOOL
    dims: [ -1 ]
    optional: true
  },
  {
    name: "SEED"
    data_type: TYPE_UINT64
    dims: [ -1 ]
    optional: true
  },
  {
    name: "REPETITION_PENALTY"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "FREQUENCY_PENALTY"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "PRESENCE_PENALTY"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  },
  {
    name: "WATERMARK"
    data_type: TYPE_BOOL
    dims: [ -1 ]
    optional: true
  }
]

output [
  {
    name: "OUTPUT_IDS"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "IBIS_EOS_ATTR"
    data_type: TYPE_INT64
    dims: [ -1 ]
  }
]

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
```
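All sampling inputs in the config.pbtxt above are declared `optional: true`, and infer_task.cc falls back to defaults (TEMPERATURE 1.0, TOP_K 0, TOP_P 1.0, DO_SAMPLE false, and so on) whenever a tensor is missing, so a client only has to supply INPUT_IDS. The following is a minimal sketch, assuming the server started by launch.sh (gRPC port 8111), the llama3_8b model above, and already tokenized input ids; it follows the same streaming pattern as client_stream.py.

```python
# Minimal streaming request that relies on the backend's sampling defaults:
# only INPUT_IDS is sent; all optional sampling tensors are omitted.
from functools import partial
import queue

import numpy as np
import tritonclient.grpc as grpcclient

results = queue.Queue()

def callback(q, result, error):
    # Stream callback: store either the partial result or the error.
    q.put(error if error else result)

input_ids = np.array([[128000, 9906, 1917]], dtype=np.int64)  # placeholder token ids

with grpcclient.InferenceServerClient(url="localhost:8111") as client:
    client.start_stream(callback=partial(callback, results))
    infer_input = grpcclient.InferInput("INPUT_IDS", list(input_ids.shape), "INT64")
    infer_input.set_data_from_numpy(input_ids)
    client.async_stream_infer(
        model_name="llama3_8b",
        inputs=[infer_input],
        request_id="1",
        outputs=[grpcclient.InferRequestedOutput("OUTPUT_IDS")],
    )
    client.stop_stream()  # as in client_stream.py, wait for the stream to finish

# Drain the streamed responses and collect the generated token ids.
output_ids = []
while not results.empty():
    item = results.get()
    if not isinstance(item, Exception):
        output_ids.extend(item.as_numpy("OUTPUT_IDS"))
print(output_ids)
```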
config.json:

```
{
  "Version" : "1.0.0",
  "LogConfig" : {
    "logLevel" : "Info",
    "logFileSize" : 20,
    "logFileNum" : 20,
    "logPath" : "logs/mindservice.log"
  },
  "ServerConfig" : {
    "ipAddress" : "127.0.0.1",
    "managementIpAddress" : "127.0.0.2",
    "port" : 1025,
    "managementPort" : 1026,
    "allowAllZeroIpListening" : false,
    "maxLinkNum" : 1000,
    "httpsEnabled" : false,
    "fullTextEnabled" : false,
    "tlsCaPath" : "security/ca/",
    "tlsCaFile" : ["ca.pem"],
    "tlsCert" : "security/certs/server.pem",
    "tlsPk" : "security/keys/server.key.pem",
    "tlsPkPwd" : "security/pass/key_pwd.txt",
    "tlsCrl" : "security/certs/server_crl.pem",
    "managementTlsCaFile" : ["management_ca.pem"],
    "managementTlsCert" : "security/certs/management/server.pem",
    "managementTlsPk" : "security/keys/management/server.key.pem",
    "managementTlsPkPwd" : "security/pass/management/key_pwd.txt",
    "managementTlsCrl" : "security/certs/management/server_crl.pem",
    "kmcKsfMaster" : "tools/pmt/master/ksfa",
    "kmcKsfStandby" : "tools/pmt/standby/ksfb",
    "inferMode" : "standard",
    "interCommTLSEnabled" : true,
    "interCommPort" : 1121,
    "interCommTlsCaFile" : "security/grpc/ca/ca.pem",
    "interCommTlsCert" : "security/grpc/certs/server.pem",
    "interCommPk" : "security/grpc/keys/server.key.pem",
    "interCommPkPwd" : "security/grpc/pass/key_pwd.txt",
    "interCommTlsCrl" : "security/grpc/certs/server_crl.pem",
    "openAiSupport" : "vllm"
  },
  "BackendConfig" : {
    "backendName" : "mindieservice_llm_engine",
    "modelInstanceNumber" : 1,
    "npuDeviceIds" : [[0]],
    "tokenizerProcessNumber" : 8,
    "multiNodesInferEnabled" : false,
    "multiNodesInferPort" : 1120,
    "interNodeTLSEnabled" : true,
    "interNodeTlsCaFile" : "security/grpc/ca/ca.pem",
    "interNodeTlsCert" : "security/grpc/certs/server.pem",
    "interNodeTlsPk" : "security/grpc/keys/server.key.pem",
    "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt",
    "interNodeTlsCrl" : "security/grpc/certs/server_crl.pem",
    "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa",
    "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb",
    "ModelDeployConfig" : {
      "maxSeqLen" : 2560,
      "maxInputTokenLen" : 2048,
      "truncation" : false,
      "ModelConfig" : [
        {
          "modelInstanceType" : "Standard",
          "modelName" : "llama3_8b",
          "modelWeightPath" : "/path/to/weights/LLaMA3-8B",
          "worldSize" : 2,
          "cpuMemSize" : 5,
          "npuMemSize" : -1,
          "backendType" : "atb"
        }
      ]
    },
    "ScheduleConfig" : {
      "templateType" : "Standard",
      "templateName" : "Standard_LLM",
      "cacheBlockSize" : 128,
      "maxPrefillBatchSize" : 50,
      "maxPrefillTokens" : 8192,
      "prefillTimeMsPerReq" : 150,
      "prefillPolicyType" : 0,
      "decodeTimeMsPerReq" : 50,
      "decodePolicyType" : 0,
      "maxBatchSize" : 200,
      "maxIterTimes" : 512,
      "maxPreemptCount" : 0,
      "supportSelectBatch" : false,
      "maxQueueDelayMicroseconds" : 5000
    }
  }
}
```
set_env.sh:

```
# Set the environment variables for ascend-toolkit, nnal, atb_models, and mindie, and modify them manually based on their installation paths
# source ascend-toolkit/set_env.sh
# source nnal/atb/set_env.sh
# source atb_models/set_env.sh
# source mindie/set_env.sh

export LCCL_DETERMINISTIC=1
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
export ATB_LLM_LCOC_ENABLE=1
export ATB_STREAM_SYNC_EVERY_KERNEL_ENABLE=0
export ATB_STREAM_SYNC_EVERY_RUNNER_ENABLE=0
export ATB_STREAM_SYNC_EVERY_OPERATION_ENABLE=0
export ATB_OPERATION_EXECUTE_ASYNC=0
export TASK_QUEUE_ENABLE=0
export ATB_OPSRUNNER_KERNEL_CACHE_LOCAL_COUNT=8
export ATB_OPSRUNNER_KERNEL_CACHE_GLOABL_COUNT=16
export ATB_CONTEXT_WORKSPACE_RING=1
export HCCL_BUFFSIZE=120
export ATB_LAYER_INTERNAL_TENSOR_REUSE=1
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=12345
```
launch.sh:

```
export MINDIE_LLM_PYTHON_LOG_TO_FILE=0
export MINDIE_LLM_PYTHON_LOG_TO_STDOUT=0
export MINDIE_LLM_PYTHON_LOG_PATH=./mindiepython.log
export MINDIE_LLM_PYTHON_LOG_LEVEL=INFO
export MINDIE_LLM_LOG_TO_STDOUT=0
export MINDIE_LLM_LOG_TO_FILE=0
export MINDIE_LLM_LOG_LEVEL=INFO
export IBIS_PYTHON_LOG=0

rm -f ./tritonserver.log

# The path to the Triton model repository following --model-repository needs to be modified manually.
/opt/tritonserver/bin/tritonserver --model-repository=tritonModels \
    --backend-config=mindie,shm-default-byte-size=134217728 \
    --http-port 8110 --grpc-port 8111 --metrics-port 8112 \
    --log-verbose=1 --log-info=1 --log-warning=0 --log-error=0 \
    --log-file ./tritonserver.log
```
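Before running submit.py or client_stream.py, it can help to confirm that the server started above is reachable and that the model has loaded. A small hedged check using the same tritonclient gRPC API and the port configured in launch.sh:

```python
# Readiness check against the gRPC port (8111) configured in launch.sh.
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8111")
print("server ready:", client.is_server_ready())
print("model ready :", client.is_model_ready("llama3_8b"))
```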
import sys import datetime import queue import random import threading from infer_client import ModelClient, InferRequest import time import json import numpy as np import multiprocessing as mp from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED import argparse import pandas as pd from prettytable import PrettyTable from transformers import AutoTokenizer from logger_util import get_logger logger = get_logger() SAMPLING_Param = {'TEMPERATURE':1.0, # float32 'TOP_K':0, # int32 'TOP_P':1.0, # float32 'TYPICAL_P':1.0, # float32 'DO_SAMPLE':1, # bool 'SEED':0, # uint64 'REPETITION_PENALTY':1.0, # float32 'FREQUENCY_PENALTY':0, # float32 'PRESENCE_PENALTY':0, # float32 'WATERMARK':0 # bool } RESULT_FILE_NAME = '' MODEL_NAME = '' MODEL_PATH = '' TRACE_DATASET = '' DATASET_LANGUAGE = '' # total submission time TOTAL_SUBMIT_TIME = 0 # the number of requests for different request types REQUEST_TYPE_NUM = 2000 # input token quantity (randomly distributed with a uniform distribution within [INPUT_TOKEN_NUM_START, INPUT_TOKEN_NUM_END]) INPUT_TOKEN_NUM_START = 1 INPUT_TOKEN_NUM_END = 16 # output token quantity (randomly distributed with a uniform distribution within [OUTPUT_TOKEN_NUM_START, OUTPUT_TOKEN_NUM_END]) OUTPUT_TOKEN_NUM_START = 1 OUTPUT_TOKEN_NUM_END = 256 THREAD_NUM = 200 SEED = 1 random.seed(SEED) class SubmitService: def __init__(self, token_config_list, thread_num): self.requests = queue.Queue() self.exception_queue = queue.Queue() self.total_request_num = 0 self.submitted_request_num = 0 self.start_time = 0 self.elapsed = 0 self.thread_num = thread_num self.lock = threading.Lock() request_idx = 0 for token_config in token_config_list: request_num = token_config[0] self.total_request_num += request_num input_tensor = token_config[1] for _ in range(request_num): input_lens = token_config[2] question = token_config[3] self.requests.put(InferRequest(str(request_idx), input_tensor, input_lens, question, sampling_param=SAMPLING_Param, model_name=MODEL_NAME)) request_idx += 1 logger.info("total request num:{}".format(self.total_request_num)) self.metrics_queue = dict() def start(self): logger.info("\n############ submit job ###########") all_task = [] init = queue.Queue(self.thread_num) with ThreadPoolExecutor(self.thread_num + 1, thread_name_prefix='submit') as p: for i in range(THREAD_NUM): all_task.append(p.submit(self._submit_requests, i, init)) all_task.append(p.submit(self._monitor, init)) wait(all_task, return_when=ALL_COMPLETED) if not self.exception_queue.empty(): raise self.exception_queue.get() logger.info("############ submit over ###########") def _submit_requests(self, client_id, init): try: client = ModelClient(client_id) init.put(1) # wait all grpc client init ready while not init.full(): time.sleep(0.1) while not self.requests.empty(): with self.lock: if not self.requests.empty(): request = self.requests.get() self.requests.task_done() logger.info("start a new request.client id:{},request_id:{}".format(client_id, request.request_id)) success, metrics = client.infer(request) if not success: logger.info("request {} submit failed! 
triton url: {}, model name: {}, inputs: {}".format( request.request_id, client.server_url, request.model_name, request.inputs)) self._request_finish(request.request_id,metrics) client.triton_client.stop_stream() except Exception as e: self.exception_queue.put(e) def _request_finish(self, request_id, metrics): self.metrics_queue[request_id] = metrics.format() def format_metrics(self): metrics_list = [] for _, metric in self.metrics_queue.items(): metrics_list.append(metric) json_str = json.dumps(metrics_list, ensure_ascii=False) with open(RESULT_FILE_NAME, 'w') as f: f.write(json_str) request_df = pd.read_json(json_str, orient='records') metrics = PrettyTable(['METRICS', 'AVG', 'P75', 'P99', 'MAX', 'N']) origin_metric = ['FirstTokenTime', 'DecodeTime', 'MaxDecodeTime', 'GenerateTime', 'InputCharacters', 'InputTokens', 'GeneratedTokens', 'GenerateTokenSpeed'] for metric in origin_metric: if metric in ['FirstTokenTime', 'DecodeTime', 'MaxDecodeTime', 'GenerateTime']: metrics.add_row([metric, str(round(request_df[metric].mean() * 1000)) + 'ms', str(round(request_df[metric].quantile(0.75) * 1000)) + 'ms', str(round(request_df[metric].quantile(0.99) * 1000)) + 'ms', str(round(request_df[metric].max() * 1000)) + 'ms', request_df[metric].count()]) elif metric in ['InputCharacters', 'InputTokens', 'GeneratedTokens']: metrics.add_row([metric, round(request_df[metric].mean()), round(request_df[metric].quantile(0.75)), round(request_df[metric].quantile(0.99)), round(request_df[metric].max()), request_df[metric].count()]) else: metrics.add_row([metric, str(round(request_df[metric].mean(), 2)) + '/s', str(round(request_df[metric].quantile(0.75), 2)) + '/s', str(round(request_df[metric].quantile(0.99), 2)) + '/s', str(round(request_df[metric].max(), 2)) + '/s', request_df[metric].count()]) logger.info(metrics) total = PrettyTable(['METRICS', 'VALUE']) total.add_row(['Current Time', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')]) total.add_row(['Data Source', TRACE_DATASET]) total.add_row(['NonEmpty', request_df['NotEmpty'].sum()]) total.add_row(['Returned', request_df['InputCharacters'].count()]) total.add_row(['Total', self.total_request_num]) total.add_row(['Concurrency', THREAD_NUM]) total.add_row(['Time Elapsed', round(self.elapsed)]) total.add_row(['Throughput', round(self.total_request_num / self.elapsed, 2)]) total.add_row(['JCT', round(request_df['GenerateTime'].mean(), 2)]) total.add_row(['GenerateSpeed', round(request_df['GeneratedTokens'].sum() / self.elapsed, 2)]) total.add_row( ['GenerateSpeedPerClient', round(request_df['GeneratedTokens'].sum() / self.elapsed / THREAD_NUM, 2)]) logger.info(total) def _monitor(self, init): # wait all grpc client init ready while not init.full(): time.sleep(0.1) self.start_time = time.time() last_submitted_count = -1 while len(self.metrics_queue)<self.total_request_num: submitted_count = len(self.metrics_queue) if last_submitted_count != submitted_count: logger.info("progress: {}/{}, spend time:{}".format(submitted_count, self.total_request_num, int(time.time() - self.start_time))) last_submitted_count = submitted_count time.sleep(0.1) self.elapsed = time.time() - self.start_time self.format_metrics() def gen_requests_from_config(): token_config_list = list() for _ in range(REQUEST_TYPE_NUM): request_num = 1 input_token_num = random.randint(INPUT_TOKEN_NUM_START, INPUT_TOKEN_NUM_END) output_token_num = random.randint(OUTPUT_TOKEN_NUM_START, OUTPUT_TOKEN_NUM_END) token_config_list.append([request_num, input_token_num, output_token_num]) return 
token_config_list def gen_requests_from_trace(trace_file): tokenizer = AutoTokenizer.from_pretrained( MODEL_PATH, trust_remote_code=True, use_fast=True) requests = list() df = pd.read_json(trace_file, lines=True) for i, row in df.iterrows(): request_num = 1 question = row["question"] token = tokenizer([question], return_tensors="np") token = token["input_ids"].astype(np.int64) input_tensor = token.reshape(1, -1) input_lens = len(question) requests.append([request_num, input_tensor, input_lens, question]) return requests def parser_input(): parser = argparse.ArgumentParser(description='Personal information') parser.add_argument('--name', dest='name', required=True, type=str, help='Name of the model') parser.add_argument('--model_path', dest='model_path', required=True, type=str, help='path of the model') parser.add_argument('--trace_dataset', dest='trace_dataset', required=True, type=str, help='Trace dataset with jsonl style file') parser.add_argument('--dataset_language', dest='dataset_language', default='english', type=str, help='Language of the trace dataset') parser.add_argument('--req_num', dest='req_num', default=2000, type=int, help='Number of the request') parser.add_argument('--max_input_len', dest='max_input_len', default=16, type=int, help='Max length of input sequence') parser.add_argument('--max_output_len', dest='max_output_len', default=100, type=int, help='Max length of output sequence') args = parser.parse_args() globals()['MODEL_NAME'] = args.name globals()['MODEL_PATH'] = args.model_path globals()['REQUEST_TYPE_NUM'] = args.req_num globals()['INPUT_TOKEN_NUM_END'] = args.max_input_len globals()['OUTPUT_TOKEN_NUM_END'] = args.max_output_len globals()['TRACE_DATASET'] = args.trace_dataset globals()['DATASET_LANGUAGE'] = args.dataset_language def main(): parser_input() dt = datetime.datetime.now().strftime('%Y-%m-%d|%H:%M:%S') if TRACE_DATASET != '': requests = gen_requests_from_trace(TRACE_DATASET) globals()['RESULT_FILE_NAME'] = './trace.json' else: requests = gen_requests_from_config() globals()['RESULT_FILE_NAME'] = './mock-{}-{}-{}-{}-{}.json'.format(MODEL_NAME, REQUEST_TYPE_NUM, INPUT_TOKEN_NUM_END, OUTPUT_TOKEN_NUM_END, dt) submit_service = SubmitService(requests, THREAD_NUM) submit_service.start() if __name__ == "__main__": try: main() except: ttype, tvalue, traceback = sys.exc_info() print(ttype, tvalue, end="\n") idx = 1 while traceback: print("第{}层堆栈信息".format(idx)) tracebackCode = traceback.tb_frame.f_code print("文件名:{}".format(tracebackCode.co_filename)) print("函数或者模块名:{}".format(tracebackCode.co_name)) traceback = traceback.tb_next idx += 1 sys.exit(1)
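For reference, one way to drive submit.py against the sample dataset from Python; the argument names come from parser_input() above, while the model path and request count are placeholders to adapt:

```python
# Hedged example: launching submit.py; equivalent to running the same command
# in a shell. Paths and request count are placeholders.
import subprocess

subprocess.run(
    [
        "python", "submit.py",
        "--name", "llama3_8b",
        "--model_path", "/path/to/weights/LLaMA3-8B",
        "--trace_dataset", "GSM8K.jsonl",
        "--req_num", "100",
    ],
    check=True,
)
```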
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from functools import partial import numpy as np import queue from tritonclient.utils import * import tritonclient.grpc as grpcclient from transformers import AutoTokenizer SAMPLING_Param = {'TEMPERATURE':1.0, # float32 'TOP_K':0, # int32 'TOP_P':1.0, # float32 'TYPICAL_P':1.0, # float32 'DO_SAMPLE':1, # bool 'SEED':0, # uint64 'REPETITION_PENALTY':1.0, # float32 'FREQUENCY_PENALTY':0, # float32 'PRESENCE_PENALTY':0, # float32 'WATERMARK':0 # bool } MODEL_PATH = "weights/LLaMA3-8B" model_name = "llama3_8b" question = "Please introduce yourself." 
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True, use_fast=True) class UserData: def __init__(self): self._completed_requests = queue.Queue() def callback(user_data, result, error): if error: user_data._completed_requests.put(error) else: user_data._completed_requests.put(result) def main(): user_data = UserData() with grpcclient.InferenceServerClient(url="localhost:8111", verbose=False) as triton_client: triton_client.start_stream(callback=partial(callback, user_data)) request_id = '1' inputs = [] outputs = [] input_ids = tokenizer([question], return_tensors="np") input_ids = input_ids["input_ids"].astype(np.int64) input_ids = input_ids.reshape(1, -1) sampling_param = SAMPLING_Param temperature = np.array([[sampling_param['TEMPERATURE']]], np.float32) top_k = np.array([[sampling_param['TOP_K']]], np.int32) top_p = np.array([[sampling_param['TOP_P']]], np.float32) typical_p = np.array([[sampling_param['TYPICAL_P']]], np.float32) do_sample = np.array([[sampling_param['DO_SAMPLE']]], bool) random_seed = np.array([[sampling_param['SEED']]], np.uint64) repetition_penalty = np.array([[sampling_param['REPETITION_PENALTY']]], np.float32) frequency_penalty = np.array([[sampling_param['FREQUENCY_PENALTY']]], np.float32) presence_penalty = np.array([[sampling_param['PRESENCE_PENALTY']]], np.float32) watermark = np.array([[sampling_param['WATERMARK']]], bool) inputs.append(grpcclient.InferInput('INPUT_IDS', list(input_ids.shape), "INT64")) inputs.append(grpcclient.InferInput('TEMPERATURE', list(temperature.shape), "FP32")) inputs.append(grpcclient.InferInput('TOP_K', list(top_k.shape), "INT32")) inputs.append(grpcclient.InferInput('TOP_P', list(top_p.shape), "FP32")) inputs.append(grpcclient.InferInput('TYPICAL_P', list(typical_p.shape), "FP32")) inputs.append(grpcclient.InferInput('DO_SAMPLE', list(do_sample.shape), "BOOL")) inputs.append(grpcclient.InferInput('SEED', list(random_seed.shape), "UINT64")) inputs.append(grpcclient.InferInput('REPETITION_PENALTY', list(repetition_penalty.shape), "FP32")) inputs.append(grpcclient.InferInput('FREQUENCY_PENALTY', list(frequency_penalty.shape), "FP32")) inputs.append(grpcclient.InferInput('PRESENCE_PENALTY', list(presence_penalty.shape), "FP32")) inputs.append(grpcclient.InferInput('WATERMARK', list(watermark.shape), "BOOL")) inputs[0].set_data_from_numpy(input_ids) inputs[1].set_data_from_numpy(temperature) inputs[2].set_data_from_numpy(top_k) inputs[3].set_data_from_numpy(top_p) inputs[4].set_data_from_numpy(typical_p) inputs[5].set_data_from_numpy(do_sample) inputs[6].set_data_from_numpy(random_seed) inputs[7].set_data_from_numpy(repetition_penalty) inputs[8].set_data_from_numpy(frequency_penalty) inputs[9].set_data_from_numpy(presence_penalty) inputs[10].set_data_from_numpy(watermark) outputs.append(grpcclient.InferRequestedOutput("OUTPUT_IDS")) outputs.append(grpcclient.InferRequestedOutput('IBIS_EOS_ATTR')) triton_client.async_stream_infer(model_name=model_name, inputs=inputs, request_id=request_id, outputs=outputs) triton_client.stop_stream() output_ids = [] while not user_data._completed_requests.empty(): data_item = user_data._completed_requests.get() output_id = data_item.as_numpy("OUTPUT_IDS") output_ids.extend(output_id) output = tokenizer.decode(output_ids) print(f"output:\n{output}") if __name__=='__main__': main()
# infer_client.py
import sys
import queue
import time
from functools import partial

import numpy as np
from tritonclient.utils import *
import tritonclient.grpc as grpcclient
from transformers import AutoTokenizer

from logger_util import get_logger

logger = get_logger()


class UserData:
    def __init__(self):
        self.completed_requests = queue.Queue()


def callback(user_data, result, error):
    # Stream callback: push every (result, error) pair into the per-client queue.
    user_data.completed_requests.put((result, error))


class Metrics:
    """Per-request latency and throughput statistics."""

    def __init__(self):
        self.first_token_time = 0
        self.avg_decode_time = 0
        self.max_decode_time = 0
        self.full_time = 0
        self.prompt_lens = 0
        self.prompt_token_lens = 0
        self.generated_tokens = 0
        self.cost_timestamp_list = []
        self.client_id = -1

    def format(self):
        generate_token_speed = (self.generated_tokens / self.full_time) if self.full_time != 0 else 0
        return {'FirstTokenTime': self.first_token_time,
                'DecodeTime': self.avg_decode_time,
                'MaxDecodeTime': self.max_decode_time,
                'GenerateTime': self.full_time,
                'InputCharacters': self.prompt_lens,
                'InputTokens': self.prompt_token_lens,
                'GeneratedTokens': self.generated_tokens,
                'GenerateTokenSpeed': generate_token_speed,
                'NotEmpty': 0 if self.generated_tokens == 0 else 1,
                'CostTimestampList': self.cost_timestamp_list,
                'ClientID': self.client_id}


class InferRequest:
    """Packs the prompt tensor and sampling parameters into Triton input/output descriptors."""

    def __init__(self, request_id, input_tensor, input_lens, question, sampling_param,
                 max_iter_times=4096, model_name='ibis_benchmark'):
        self.submit_time = 0
        self.request_id = request_id
        self.input_lens = input_lens
        self.question = question
        self.input_token_lens = input_tensor.shape[1]

        temperature = np.array([[sampling_param['TEMPERATURE']]], np.float32)
        top_k = np.array([[sampling_param['TOP_K']]], np.int32)
        top_p = np.array([[sampling_param['TOP_P']]], np.float32)
        typical_p = np.array([[sampling_param['TYPICAL_P']]], np.float32)
        do_sample = np.array([[sampling_param['DO_SAMPLE']]], bool)
        random_seed = np.array([[sampling_param['SEED']]], np.uint64)
        repetition_penalty = np.array([[sampling_param['REPETITION_PENALTY']]], np.float32)
        frequency_penalty = np.array([[sampling_param['FREQUENCY_PENALTY']]], np.float32)
        presence_penalty = np.array([[sampling_param['PRESENCE_PENALTY']]], np.float32)
        watermark = np.array([[sampling_param['WATERMARK']]], bool)

        self.inputs = []
        self.inputs.append(grpcclient.InferInput('INPUT_IDS', list(input_tensor.shape), "INT64"))
        self.inputs.append(grpcclient.InferInput('TEMPERATURE', list(temperature.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('TOP_K', list(top_k.shape), "INT32"))
        self.inputs.append(grpcclient.InferInput('TOP_P', list(top_p.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('TYPICAL_P', list(typical_p.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('DO_SAMPLE', list(do_sample.shape), "BOOL"))
        self.inputs.append(grpcclient.InferInput('SEED', list(random_seed.shape), "UINT64"))
        self.inputs.append(grpcclient.InferInput('REPETITION_PENALTY', list(repetition_penalty.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('FREQUENCY_PENALTY', list(frequency_penalty.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('PRESENCE_PENALTY', list(presence_penalty.shape), "FP32"))
        self.inputs.append(grpcclient.InferInput('WATERMARK', list(watermark.shape), "BOOL"))

        self.inputs[0].set_data_from_numpy(input_tensor)
        self.inputs[1].set_data_from_numpy(temperature)
        self.inputs[2].set_data_from_numpy(top_k)
        self.inputs[3].set_data_from_numpy(top_p)
        self.inputs[4].set_data_from_numpy(typical_p)
        self.inputs[5].set_data_from_numpy(do_sample)
        self.inputs[6].set_data_from_numpy(random_seed)
        self.inputs[7].set_data_from_numpy(repetition_penalty)
        self.inputs[8].set_data_from_numpy(frequency_penalty)
        self.inputs[9].set_data_from_numpy(presence_penalty)
        self.inputs[10].set_data_from_numpy(watermark)

        self.outputs = []
        self.outputs.append(grpcclient.InferRequestedOutput('OUTPUT_IDS'))
        self.outputs.append(grpcclient.InferRequestedOutput('IBIS_EOS_ATTR'))

        self.max_iter_times = max_iter_times
        self.model_name = model_name


class ModelClient:
    """Streaming gRPC client that submits one InferRequest at a time and collects per-token metrics."""

    def __init__(self, client_id=0, server_url="localhost:8111", verbose=False):
        self.server_url = server_url
        self.client_id = client_id
        self.triton_client = grpcclient.InferenceServerClient(url=server_url, verbose=verbose)
        self.user_data = UserData()
        self.triton_client.start_stream(callback=partial(callback, self.user_data))

    def infer(self, request: InferRequest):
        metrics = Metrics()
        metrics.client_id = self.client_id
        metrics.prompt_lens = request.input_lens
        metrics.prompt_token_lens = request.input_token_lens
        begin_time = time.time()
        try:
            logger.info("Send a request, request id : {}".format(request.request_id))
            self.triton_client.async_stream_infer(model_name=request.model_name,
                                                  inputs=request.inputs,
                                                  request_id=request.request_id,
                                                  outputs=request.outputs)
        except Exception as e:
            raise e

        tokens = []
        last_token_time = time.time()
        metrics.cost_timestamp_list.append(last_token_time)
        decode_full_time = 0
        for i in range(request.max_iter_times):
            (output_tensor, error) = self.user_data.completed_requests.get()
            if error is not None:
                logger.error(error)
                return False, metrics
            gen_tokens = output_tensor.as_numpy('OUTPUT_IDS')
            for token in gen_tokens:
                tokens.append(token)
            if metrics.first_token_time == 0:
                # First response: record the time-to-first-token.
                metrics.first_token_time = time.time() - last_token_time
            else:
                # Subsequent responses: accumulate decode time and track the slowest per-token step.
                decode_time = time.time() - last_token_time
                gen_token_num = output_tensor.as_numpy('IBIS_EOS_ATTR')[1]
                single_decode_time = decode_time / gen_token_num
                decode_full_time += decode_time
                if single_decode_time > metrics.max_decode_time:
                    metrics.max_decode_time = single_decode_time
            last_token_time = time.time()
            metrics.cost_timestamp_list.append(last_token_time)
            inferparam = output_tensor.get_response().parameters['triton_final_response']
            if inferparam.bool_param:
                logger.info("receive an EOS token, request id : {}".format(request.request_id))
                break

        metrics.full_time = time.time() - begin_time
        metrics.generated_tokens = len(tokens)
        if metrics.generated_tokens > 1:
            metrics.avg_decode_time = decode_full_time / (metrics.generated_tokens - 1)
        logger.info("return from client infer, request id : {}".format(request.request_id))
        return True, metrics
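As a reference for how InferRequest and ModelClient fit together, below is a minimal driver sketch. It is not part of the example code: the tokenizer path weights/LLaMA3-8B, the server address localhost:8111, the model name llama3_8b and the sampling values simply mirror the single-request client above and are assumptions that must match your own deployment and config.pbtxt.

# driver_sketch.py -- hypothetical, for illustration only
import numpy as np
from transformers import AutoTokenizer

# Assumes the listing above is saved as infer_client.py on the Python path.
from infer_client import InferRequest, ModelClient

MODEL_PATH = "weights/LLaMA3-8B"   # assumed local tokenizer/weights path
SAMPLING_PARAM = {'TEMPERATURE': 1.0, 'TOP_K': 0, 'TOP_P': 1.0, 'TYPICAL_P': 1.0,
                  'DO_SAMPLE': 1, 'SEED': 0, 'REPETITION_PENALTY': 1.0,
                  'FREQUENCY_PENALTY': 0, 'PRESENCE_PENALTY': 0, 'WATERMARK': 0}


def run_one(question: str) -> None:
    # Tokenize the prompt into a [1, seq_len] INT64 tensor, as client_stream.py does.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True, use_fast=True)
    input_ids = tokenizer([question], return_tensors="np")["input_ids"].astype(np.int64).reshape(1, -1)

    # InferRequest packs the tensors; ModelClient owns the stream and the timing bookkeeping.
    request = InferRequest(request_id='1',
                           input_tensor=input_ids,
                           input_lens=len(question),
                           question=question,
                           sampling_param=SAMPLING_PARAM,
                           model_name='llama3_8b')
    client = ModelClient(client_id=0, server_url="localhost:8111")
    ok, metrics = client.infer(request)
    if ok:
        print(metrics.format())
    client.triton_client.stop_stream()


if __name__ == '__main__':
    run_one("Please introduce yourself.")

A real batch driver would loop this over a dataset, reuse one ModelClient per worker, and aggregate the dictionaries returned by Metrics.format() instead of printing them.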
# logger_util.py
import sys
import os
import time

import loguru

# Log timestamps in UTC.
os.environ['TZ'] = 'UTC'
time.tzset()


def get_logger():
    logger = loguru.logger
    logger.add("script.log", level="INFO")
    return logger
#!/bin/bash
set -e

export MINDIE_LLM_HOME_PATH=/usr/local/Ascend/mindie/latest/mindie-llm
export TRITON_HOME_PATH=/opt/tritonserver

if [ -z "$MINDIE_LLM_HOME_PATH" ]; then
    echo "env MINDIE_LLM_HOME_PATH is null, please install mindie and source set_env.sh"
    exit 1
fi

COMPILE_OPTIONS=""

# Match the C++ ABI of the installed torch build.
if [ "$(python3 -c 'import torch; print(torch.compiled_with_cxx11_abi())')" == "True" ]; then
    USE_CXX11_ABI=ON
else
    USE_CXX11_ABI=OFF
fi
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DUSE_CXX11_ABI=$USE_CXX11_ABI"

BUILD_TYPE="Release"
TRITON_VERSION="r24.02"

# -d: debug build; -v: Triton repo tag; -p: Triton install path.
while getopts "dv:p:" opt; do
    case ${opt} in
        d)
            BUILD_TYPE="Debug"
            ;;
        v)
            TRITON_VERSION=$OPTARG
            ;;
        p)
            export TRITON_HOME_PATH=$OPTARG
            ;;
        \?)
            echo "Invalid option: -$opt" >&2
            exit 1
            ;;
    esac
done

echo "Triton version: ${TRITON_VERSION}"

if [ -z "$TRITON_HOME_PATH" ]; then
    echo "env TRITON_HOME_PATH is null, please set env or use -p to tell us where triton is installed."
    exit 1
fi
echo "Triton install path: ${TRITON_HOME_PATH}"

if [ ! -d "$TRITON_HOME_PATH" ]; then
    echo "$TRITON_HOME_PATH is not a directory! Please check triton install path."
    exit 1
fi

rm -rf build && mkdir build && cd build

COMPILE_OPTIONS="${COMPILE_OPTIONS} -DCMAKE_BUILD_TYPE=$BUILD_TYPE"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DTRITON_COMMON_REPO_TAG=$TRITON_VERSION"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DTRITON_BACKEND_REPO_TAG=$TRITON_VERSION"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DTRITON_CORE_REPO_TAG=$TRITON_VERSION"
COMPILE_OPTIONS="${COMPILE_OPTIONS} -DTRITON_ENABLE_GPU=OFF"

cmake $COMPILE_OPTIONS ..
make install -j$(nproc)
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#   * Neither the name of NVIDIA CORPORATION nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cmake_minimum_required(VERSION 3.17)

project(tutorialmindiebackend LANGUAGES C CXX)

#
# Options
#
# Must include options required for this project as well as any
# projects included in this one by FetchContent.
#
# GPU support is disabled by default because mindie backend
# doesn't use GPUs.
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" OFF)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)

set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

add_compile_options(-std=c++17)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_SKIP_RPATH TRUE)

if (USE_CXX11_ABI)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
else()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pipe -fstack-protector-all")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-copy")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wl,--build-id=none")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-z,relro,-z,now")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pie -fexceptions")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftrapv -s")
# add_definitions(-D_GLIBCXX_USE_C99=1)

#
# Dependencies
#
# FetchContent requires us to include the transitive closure of all
# repos that we depend on so that we can override the tags.
#
include(FetchContent)

FetchContent_Declare(
  repo-common
  GIT_REPOSITORY https://github.com/triton-inference-server/common.git
  GIT_TAG ${TRITON_COMMON_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-core
  GIT_REPOSITORY https://github.com/triton-inference-server/core.git
  GIT_TAG ${TRITON_CORE_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_Declare(
  repo-backend
  GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
  GIT_TAG ${TRITON_BACKEND_REPO_TAG}
  GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)

#
# The backend must be built into a shared library. Use an ldscript to
# hide all symbols except for the TRITONBACKEND API.
#
set(ASCEND_DRIVER_DIR /usr/local/Ascend/driver)

LINK_DIRECTORIES(
  $ENV{TRITON_HOME_PATH}/lib
  $ENV{MINDIE_LLM_HOME_PATH}/lib
)

INCLUDE_DIRECTORIES(
  $ENV{MINDIE_LLM_HOME_PATH}/include
  $ENV{MINDIE_LLM_HOME_PATH}/include/llm_manager
  $ENV{MINDIE_LLM_HOME_PATH}/include/llm_manager/utils
  ${ASCEND_DRIVER_DIR}/kernel/libc_sec/include
  $ENV{TRITON_HOME_PATH}/include
)

file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_LIST_DIR}/src/*.cc")

add_library(triton-mindie-backend SHARED ${SOURCE_FILES})

add_library(
  TutorialmindieBackend::triton-mindie-backend ALIAS triton-mindie-backend
)

target_include_directories(
  triton-mindie-backend
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/src
)

target_compile_features(triton-mindie-backend PRIVATE cxx_std_11)

target_link_libraries(
  triton-mindie-backend
  PRIVATE
    triton-core-serverapi   # from repo-core
    triton-core-backendapi  # from repo-core
    triton-core-serverstub  # from repo-core
    triton-backend-utils    # from repo-backend
    mindie_llm_manager      # from mindie-llm
)

set_target_properties(
  triton-mindie-backend PROPERTIES
  OUTPUT_NAME triton_mindie
)

# Install
install(
  TARGETS triton-mindie-backend
  DESTINATION $ENV{TRITON_HOME_PATH}/backends/mindie
)