本章节简单介绍如何使用异步调度功能,并使用MindIE Benchmark工具展示调优方式。
export MINDIE_ASYNC_SCHEDULING_ENABLE=1
cd {MindIE安装目录}/latest/mindie-service/
vi conf/config.json
本特性在maxBatchSize较大时收益比较明显,故本示例选取小模型,并将maxBatchSize设置为200;请根据具体业务场景调整配置。
"BackendConfig" : {
    "backendName" : "mindieservice_llm_engine",
    "modelInstanceNumber" : 1,
    "npuDeviceIds" : [[6,7]],
    "tokenizerProcessNumber" : 8,
    "multiNodesInferEnabled" : false,
    "multiNodesInferPort" : 1120,
    "interNodeTLSEnabled" : true,
    "interNodeTlsCaPath" : "security/grpc/ca/",
    "interNodeTlsCaFiles" : ["ca.pem"],
    "interNodeTlsCert" : "security/grpc/certs/server.pem",
    "interNodeTlsPk" : "security/grpc/keys/server.key.pem",
    "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt",
    "interNodeTlsCrlPath" : "security/grpc/certs/",
    "interNodeTlsCrlFiles" : ["server_crl.pem"],
    "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa",
    "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb",
    "ModelDeployConfig" : {
        "maxSeqLen" : 2560,
        "maxInputTokenLen" : 2048,
        "truncation" : false,
        "ModelConfig" : [
            {
                "modelInstanceType" : "Standard",
                "modelName" : "llama3-8b",
                "modelWeightPath" : "/data/atb_testdata/weights/Meta-Llama-3-8B",
                "worldSize" : 2,
                "cpuMemSize" : 5,
                "npuMemSize" : 2,
                "backendType" : "atb",
                "trustRemoteCode" : false
            }
        ]
    },
    "ScheduleConfig" : {
        "templateType" : "Standard",
        "templateName" : "Standard_LLM",
        "cacheBlockSize" : 128,
        "maxPrefillBatchSize" : 50,
        "maxPrefillTokens" : 8192,
        "prefillTimeMsPerReq" : 150,
        "prefillPolicyType" : 0,
        "decodeTimeMsPerReq" : 50,
        "decodePolicyType" : 0,
        "maxBatchSize" : 200,
        "maxIterTimes" : 512,
        "maxPreemptCount" : 0,
        "supportSelectBatch" : false,
        "maxQueueDelayMicroseconds" : 5000
    }
}
./bin/mindieservice_daemon
benchmark \
--DatasetPath "/{数据集路径}/GSM8K" \
--DatasetType "gsm8k" \
--ModelName "llama3-8b" \
--ModelPath "/{模型权重路径}/Meta-Llama-3-8B" \
--TestType client \
--Http https://{ipAddress}:{port} \
--ManagementHttp https://{managementIpAddress}:{managementPort} \
--Tokenizer True \
--MaxOutputLen 512 \
--TaskKind stream \
--WarmupSize 1 \
--DoSampling False \
--Concurrency 200 \
--TestAccuracy True