aclDumpOpTensors
功能说明
模型执行过程中支持Dump算子输入/输出Tensor数据,方便算子输入/输出异常数据的问题定位和分析。
函数原型
aclnnStatus aclDumpOpTensors(const char *opType, const char *opName, aclTensor **tensors, size_t inputTensorNum, size_t outputTensorNum, aclrtStream stream)
参数说明
| 参数名 | 输入/输出 | 说明 |
|---|---|---|
| opType | 输入 | 字符串,表示算子类型,例如“Add”。 |
| opName | 输入 | 字符串,表示算子名称,例如“add_custom”。 |
| tensors | 输入 | 一维数组,表示待Dump的输入/输出Tensor对象指针。注意Tensor顺序,输入Tensor在前,输出Tensor在后。 |
| inputTensorNum | 输入 | 表示待Dump的输入Tensor个数。 |
| outputTensorNum | 输入 | 表示待Dump的输出Tensor个数。 |
| stream | 输入 | 指定执行任务的Stream。 |
返回值说明
返回0表示成功,返回其他值表示失败,返回码列表参见公共接口返回码。
约束说明
本接口需要在开启算子Dump功能时有效,您可以通过aclInit接口开启Dump,也可以通过aclmdlInitDump、aclmdlSetDump、aclmdlFinalizeDump系列接口开启Dump,接口详见《应用开发 (C&C++)》中“aclInit”和“Dump配置”。
调用示例
关键代码示例如下(仅供参考,不支持直接拷贝运行)。
- 通过aclInit接口使能算子Dump功能。关键代码如下:
// AscendCL Init
aclInit("./acl.json");
aclrtSetDevice(0);
aclrtStream stream = nullptr;
aclrtCreateStream(&stream);
acl.json示例如下(具体参见aclInit接口文档中模型Dump配置、单算子Dump配置示例):
{
    "dump": {
        "dump_path": "./",
        "dump_list": [],
        "dump_mode": "all",
        "dump_data": "tensor"
    }
}
- 接口调用的关键伪代码(以torch算子为例)如下:
(示例代码共161行。)
// Example: a PyTorch custom operator backed by an Ascend C "Add" kernel that
// dumps its input/output tensors via aclDumpOpTensors.
// For illustration only; not intended to be copied and run as-is.
#include <torch/extension.h>
#include "torch_npu/csrc/core/npu/NPUStream.h"
#include "torch_npu/csrc/core/npu/NPUFunctions.h"
#include "torch_npu/csrc/framework/OpCommand.h"
#include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h"
#include "torch_npu/csrc/core/npu/register/OptionsManager.h"
#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
#include "torch_npu/csrc/flopcount/FlopCount.h"
#include "torch_npu/csrc/flopcount/FlopCounter.h"
#include "torch_npu/csrc/core/npu/NpuVariables.h"
#include "kernel_operator.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>

// Double-buffering factor used by the kernel's input/output queues.
constexpr int32_t BUFFER_NUM = 2;
// Maximum number of dimensions handled by ConvertTensor below.
constexpr int64_t MAX_DIM_NUM = 5;
constexpr int64_t NCL_DIM_NUM = 3;
constexpr int64_t NCHW_DIM_NUM = 4;
constexpr int64_t NCDHW_DIM_NUM = 5;

// Builds the one-dimensional array of aclTensor pointers to be dumped
// (inputs first, then outputs, as required by aclDumpOpTensors).
#define INIT_ACL_TENSOR_ARRAY(tensors, ...) aclTensor* tensors[] = {__VA_ARGS__}

// Converts an at::Tensor into an aclTensor.
// Simplified for this example: the data type is hard-coded to ACL_FLOAT16 —
// a real operator should derive it from the tensor's dtype.
// The caller owns the returned aclTensor and must release it with
// aclDestroyTensor.
aclTensor *ConvertTensor(const at::Tensor &at_tensor)
{
    aclDataType acl_data_type = ACL_FLOAT16;
    c10::SmallVector<int64_t, MAX_DIM_NUM> storageDims;
    const auto dimNum = at_tensor.sizes().size();
    // Map the dimension count to an ACL format; anything else falls back to ND.
    aclFormat format = ACL_FORMAT_ND;
    switch (dimNum) {
        case NCL_DIM_NUM:
            format = ACL_FORMAT_NCL;
            break;
        case NCHW_DIM_NUM:
            format = ACL_FORMAT_NCHW;
            break;
        case NCDHW_DIM_NUM:
            format = ACL_FORMAT_NCDHW;
            break;
        default:
            format = ACL_FORMAT_ND;
    }
    // if acl_data_type is ACL_STRING, storageDims is empty.
    if (acl_data_type != ACL_STRING) {
        // Element count of the underlying storage (may exceed the view's size).
        storageDims.push_back(at_tensor.storage().nbytes() / at_tensor.itemsize());
    }
    auto acl_tensor = aclCreateTensor(at_tensor.sizes().data(), at_tensor.sizes().size(), acl_data_type,
                                      at_tensor.strides().data(), at_tensor.storage_offset(), format,
                                      storageDims.data(), storageDims.size(),
                                      const_cast<void *>(at_tensor.storage().data()));
    return acl_tensor;
}

// Custom operator implementation (element-wise half-precision add).
// Example only; adapt to the real operator.
class KernelAdd {
public:
    __aicore__ inline KernelAdd() {}
    // Binds this block's slice of the global x/y/z buffers and initializes
    // the tiling queues.
    // NOTE(review): assumes totalLength divides evenly by the block count and
    // by tileNum * BUFFER_NUM — confirm for real shapes.
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength)
    {
        this->blockLength = totalLength / AscendC::GetBlockNum();
        this->tileNum = 8;
        this->tileLength = this->blockLength / this->tileNum / BUFFER_NUM;
        xGm.SetGlobalBuffer((__gm__ half *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        yGm.SetGlobalBuffer((__gm__ half *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        zGm.SetGlobalBuffer((__gm__ half *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(half));
        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(half));
        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(half));
    }
    // Processes every tile: copy-in, compute, copy-out per iteration.
    __aicore__ inline void Process()
    {
        int32_t loopCount = this->tileNum * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    // Stages one tile of x and y from global memory into the input queues.
    __aicore__ inline void CopyIn(int32_t progress)
    {
        AscendC::LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
        AscendC::LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
        inQueueX.EnQue(xLocal);
        inQueueY.EnQue(yLocal);
    }
    // Computes z = x + y for one tile and enqueues the result.
    __aicore__ inline void Compute(int32_t progress)
    {
        AscendC::LocalTensor<half> xLocal = inQueueX.DeQue<half>();
        AscendC::LocalTensor<half> yLocal = inQueueY.DeQue<half>();
        AscendC::LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
        outQueueZ.EnQue<half>(zLocal);
        inQueueX.FreeTensor(xLocal);
        inQueueY.FreeTensor(yLocal);
    }
    // Writes one computed tile back to global memory.
    __aicore__ inline void CopyOut(int32_t progress)
    {
        AscendC::LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
        outQueueZ.FreeTensor(zLocal);
    }

private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
    AscendC::GlobalTensor<half> xGm;
    AscendC::GlobalTensor<half> yGm;
    AscendC::GlobalTensor<half> zGm;
    uint32_t blockLength;  // elements handled by this block
    uint32_t tileNum;      // tiles per block per buffer
    uint32_t tileLength;   // elements per tile
};

// Kernel entry point: each block adds its slice of x and y into z.
__global__ __vector__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength)
{
    KernelAdd op;
    op.Init(x, y, z, totalLength);
    op.Process();
}

namespace ascendc_ops {
// Host-side wrapper: launches the Add kernel on the current NPU stream, then
// dumps the operator's input/output tensors via aclDumpOpTensors.
at::Tensor ascendc_add(const at::Tensor& x, const at::Tensor& y)
{
    auto aclStream = c10_npu::getCurrentNPUStream().stream(false);
    at::Tensor z = at::empty_like(x);
    uint32_t numBlocks = 8;
    uint32_t totalLength = 1;
    for (uint32_t size : x.sizes()) {
        totalLength *= size;
    }
    add_custom<<<numBlocks, nullptr, aclStream>>>((uint8_t*)(x.mutable_data_ptr()), (uint8_t*)(y.mutable_data_ptr()),
                                                  (uint8_t*)(z.mutable_data_ptr()), totalLength);
    // Dump the operator's input/output tensor data: 2 inputs (x, y) followed
    // by 1 output (z), submitted on the same stream as the kernel launch.
    INIT_ACL_TENSOR_ARRAY(tensors, ConvertTensor(x), ConvertTensor(y), ConvertTensor(z));
    aclDumpOpTensors("Add", "add_custom", tensors, 2, 1, aclStream);
    // Release the aclTensor objects.
    for (size_t i = 0; i < 3; i++) {
        aclDestroyTensor(tensors[i]);
    }
    return z;
}
} // namespace ascendc_ops

// Register the operator schema and its PrivateUse1 (NPU) implementation so it
// is callable as torch.ops.ascendc_ops.ascendc_add.
TORCH_LIBRARY(ascendc_ops, m)
{
    m.def("ascendc_add(Tensor x, Tensor y) -> Tensor");
}
TORCH_LIBRARY_IMPL(ascendc_ops, PrivateUse1, m)
{
    m.impl("ascendc_add", TORCH_FN(ascendc_ops::ascendc_add));
}
父主题: 公共接口