支持自动同步

使用Ascend C编写算子时，通过设置毕昇编译器自动同步编译选项 “--cce-auto-sync”（kernel直调算子工程和自定义算子开发工程已默认开启，无需开发者设置），能够自动插入如下AI Core内部执行单元间的同步指令：

MTE2与Scalar之间
MTE3与Scalar之间
Vector与Scalar之间
Vector与Vector之间

Ascend C编程框架和编译器为开发者提供以下自动同步功能，详细内容请参考同步控制简介。

单流水同步：PIPE_V由编译器自动完成同步插入；PIPE_MTE2/PIPE_MTE3在搬运地址有重叠的情况下，需要开发者手动插入同步。
多流水同步：PIPE_V、PIPE_MTE2、PIPE_MTE3、PIPE_S之间的多流水同步，都是双向的，如下图所示，黄色线条表示的同步由编译器自动完成同步插入，剩余的同步由Ascend C框架完成。

自动同步使用约束

使用自动同步功能需要满足以下约束：核函数中调用的所有函数需要为inline函数；规范使用Ascend C编程模型。

核函数中调用的所有函数需要为inline函数

下面的样例中，核函数中调用的函数没有声明为inline函数，因此不支持自动同步功能。

...
// 算子类的实现中Process函数
// Runs the three stages in order: CopyIn, then Compute, then CopyOut.
// This version is declared without `inline`; the accompanying text presents it
// as the form for which automatic synchronization is not supported.
__aicore__ void Process()
{
    CopyIn();
    Compute();
    CopyOut();
}
// Input stage: takes a tensor from inQueueSrc, fills it from srcGlobal with
// DataCopy, and enqueues it on inQueueSrc so that Compute can dequeue it.
// Declared without `inline` (part of the example that auto-sync does not support).
__aicore__ void CopyIn()
{
    LocalTensor<int32_t> srcLocal = inQueueSrc.AllocTensor<int32_t>();
    // NOTE(review): 512 is presumably the element count -- confirm against the DataCopy API.
    DataCopy(srcLocal, srcGlobal, 512);
    inQueueSrc.EnQue(srcLocal);
}
// Compute stage: dequeues the input tensor from inQueueSrc, allocates an output
// tensor from outQueueDst, copies input to output with Copy, then enqueues the
// output on outQueueDst and returns the input tensor to inQueueSrc.
// Intentionally left without `inline`: the accompanying text uses this version
// as the example that automatic synchronization does not support.
__aicore__ void Compute()
{
    LocalTensor<int32_t> srcLocal = inQueueSrc.DeQue<int32_t>();
    LocalTensor<int32_t> dstLocal = outQueueDst.AllocTensor<int32_t>();
    uint64_t mask = 64;
    // NOTE(review): 4 and { 1, 1, 8, 8 } are presumably the repeat count and the
    // stride parameters of Copy -- confirm against the Copy API reference.
    Copy(dstLocal, srcLocal, mask, 4, { 1, 1, 8, 8 });
    outQueueDst.EnQue<int32_t>(dstLocal);
    inQueueSrc.FreeTensor(srcLocal);
}
// Output stage: dequeues the result tensor from outQueueDst, copies it to
// dstGlobal with DataCopy, and returns the tensor to outQueueDst.
// Declared without `inline` (part of the example that auto-sync does not support).
__aicore__ void CopyOut()
{
    LocalTensor<int32_t> dstLocal = outQueueDst.DeQue<int32_t>();
    // NOTE(review): 512 is presumably the element count -- confirm against the DataCopy API.
    DataCopy(dstGlobal, dstLocal, 512);
    outQueueDst.FreeTensor(dstLocal);
}
...

上述样例需要改写为如下形式才可以支持自动同步。

...
// 算子类的实现中Process函数
// Runs the three stages in order: CopyIn, then Compute, then CopyOut.
// Declared `inline`, which the accompanying text requires of every function
// called from the kernel function for automatic synchronization to apply.
__aicore__ inline void Process()
{
    CopyIn();
    Compute();
    CopyOut();
}
// Input stage: takes a tensor from inQueueSrc, fills it from srcGlobal with
// DataCopy, and enqueues it on inQueueSrc so that Compute can dequeue it.
__aicore__ inline void CopyIn()
{
    LocalTensor<int32_t> srcLocal = inQueueSrc.AllocTensor<int32_t>();
    // NOTE(review): 512 is presumably the element count -- confirm against the DataCopy API.
    DataCopy(srcLocal, srcGlobal, 512);
    inQueueSrc.EnQue(srcLocal);
}

// Compute stage: dequeues the input tensor from inQueueSrc, allocates an output
// tensor from outQueueDst, copies input to output with Copy, then enqueues the
// output on outQueueDst and returns the input tensor to inQueueSrc.
__aicore__ inline void Compute()
{
    LocalTensor<int32_t> srcLocal = inQueueSrc.DeQue<int32_t>();
    LocalTensor<int32_t> dstLocal = outQueueDst.AllocTensor<int32_t>();
    uint64_t mask = 64;
    // NOTE(review): 4 and { 1, 1, 8, 8 } are presumably the repeat count and the
    // stride parameters of Copy -- confirm against the Copy API reference.
    Copy(dstLocal, srcLocal, mask, 4, { 1, 1, 8, 8 });
    outQueueDst.EnQue<int32_t>(dstLocal);
    inQueueSrc.FreeTensor(srcLocal);
}

// Output stage: dequeues the result tensor from outQueueDst, copies it to
// dstGlobal with DataCopy, and returns the tensor to outQueueDst.
__aicore__ inline void CopyOut()
{
    LocalTensor<int32_t> dstLocal = outQueueDst.DeQue<int32_t>();
    // NOTE(review): 512 is presumably the element count -- confirm against the DataCopy API.
    DataCopy(dstGlobal, dstLocal, 512);
    outQueueDst.FreeTensor(dstLocal);
}
...

规范使用Ascend C编程模型

下面的样例中，没有使用Ascend C编程模型（EnQue()、DeQue()、AllocTensor()、FreeTensor()接口），不支持自动同步。

...
// 不使用Ascend C编程模型
// Input stage of the example that bypasses the queue interfaces: copies from
// srcGlobal straight into the member tensor srcLocal, with no AllocTensor or
// EnQue call. The accompanying text states this form does not support auto-sync.
__aicore__ inline void CopyIn()
{
    DataCopy(srcLocal, srcGlobal, 512);
}
// Compute stage of the example that bypasses the queue interfaces: copies
// dstDataSize elements one at a time from the member tensor srcLocal to the
// member tensor dstLocal via GetValue/SetValue, with no DeQue/EnQue calls.
__aicore__ inline void Compute()
{
    for(int i = 0;i<dstDataSize; i++) {
       dstLocal.SetValue(i,srcLocal.GetValue(i));
    }
}
// Output stage of the example that bypasses the queue interfaces: copies the
// member tensor dstLocal to dstGlobal, with no DeQue or FreeTensor call.
__aicore__ inline void CopyOut()
{
    DataCopy(dstGlobal, dstLocal, 512);
}
private:
    TPipe pipe;
    // Local tensors are held directly as members here; no TQue members are declared.
    LocalTensor<int32_t> srcLocal, dstLocal;
    GlobalTensor<int32_t> srcGlobal, dstGlobal;
    // Loop bound used by Compute in this example.
    int dstDataSize = 512;
...

需要改写为如下形式才可以支持自动同步。

// 按照编程范式，规范使用EnQue()、DeQue()、AllocTensor()以及FreeTensor()等内存管理与同步控制API接口
...
// Input stage using the queue interfaces: takes a tensor from inQueueSrc,
// fills it from srcGlobal with DataCopy, and enqueues it on inQueueSrc so that
// Compute can dequeue it.
__aicore__ inline void CopyIn()
{
    LocalTensor<int32_t> srcLocal = inQueueSrc.AllocTensor<int32_t>();
    // NOTE(review): 512 is presumably the element count -- confirm against the DataCopy API.
    DataCopy(srcLocal, srcGlobal, 512);
    inQueueSrc.EnQue(srcLocal);
}
// Compute stage using the queue interfaces: dequeues the input tensor from
// inQueueSrc, allocates an output tensor from outQueueDst, copies dstDataSize
// elements one at a time via GetValue/SetValue, then enqueues the output on
// outQueueDst and returns the input tensor to inQueueSrc.
__aicore__ inline void Compute()
{
    LocalTensor<int32_t> srcLocal = inQueueSrc.DeQue<int32_t>();
    LocalTensor<int32_t> dstLocal = outQueueDst.AllocTensor<int32_t>();
    for(int i = 0;i<dstDataSize; i++) {
       dstLocal.SetValue(i,srcLocal.GetValue(i));
    }
    outQueueDst.EnQue<int32_t>(dstLocal);
    inQueueSrc.FreeTensor(srcLocal);
}
// Output stage using the queue interfaces: dequeues the result tensor from
// outQueueDst, copies it to dstGlobal with DataCopy, and returns the tensor to
// outQueueDst.
__aicore__ inline void CopyOut()
{
    LocalTensor<int32_t> dstLocal = outQueueDst.DeQue<int32_t>();
    // NOTE(review): 512 is presumably the element count -- confirm against the DataCopy API.
    DataCopy(dstGlobal, dstLocal, 512);
    outQueueDst.FreeTensor(dstLocal);
}
private:
    TPipe pipe;
    // Queues used by CopyIn/Compute/CopyOut in place of member LocalTensor objects.
    // NOTE(review): the second template argument (1) is presumably the queue depth -- confirm.
    TQue<QuePosition::VECIN, 1> inQueueSrc;
    TQue<QuePosition::VECOUT, 1> outQueueDst;
    GlobalTensor<int32_t> srcGlobal, dstGlobal;
    // Loop bound used by Compute in this example.
    int dstDataSize = 512;
...

自动同步debug日志功能

毕昇编译器提供“--cce-auto-sync-log=<file>”编译选项，可以将同步插入信息输出到<file>文件中，帮助开发者显式地识别编译器在算子文件中插入的同步指令信息。使用该功能时，需要以debug模式（添加-g编译选项）编译算子，以便获取算子代码文件的行号。

直接使用毕昇编译器的场景，可以直接在编译命令中添加该编译选项
使用Ascend C kernel直调算子工程，可以通过ascendc_compile_options添加该编译选项
使用Ascend C自定义算子开发工程，可以通过add_ops_compile_options添加该编译选项

如下的代码文件sync_log_test.h：

LocalTensor<T> dstLocal;  // comments here are trailing-only so the line numbers cited in the log (4, 5, 6) still match
T ave_tmp = 0;
Vector_OP1(dstLocal, params);  // placeholder operation on dstLocal (presumably a vector instruction -- not defined here)
ave_tmp = dstLocal.GetValue(0);  // line 4: reads element 0 of dstLocal into ave_tmp
Vector_OP2(dstLocal, params);  // line 5: second placeholder operation on dstLocal
for (int i = 0; i < ave_tmp; ++i) {  // line 6: loop bounded by the value read on line 4
    dstLocal.SetValue(i,0);
}

开启自动同步后，同步指令的插入位置如下：

LocalTensor<T> dstLocal;  // same snippet as sync_log_test.h, shown with the synchronization instructions inserted
T ave_tmp = 0;
Vector_OP1(dstLocal, params); 
SetFlag<HardEvent::V_S>(EVENT_ID0);   // inserted before the GetValue line
WaitFlag<HardEvent::V_S>(EVENT_ID0);  // inserted before the GetValue line
ave_tmp = dstLocal.GetValue(0);
PipeBarrier<PIPE_V>();                // inserted before the Vector_OP2 line
SetFlag<HardEvent::S_V>(EVENT_ID0);   // inserted before the Vector_OP2 line
WaitFlag<HardEvent::S_V>(EVENT_ID0);  // inserted before the Vector_OP2 line
Vector_OP2(dstLocal, params); 
SetFlag<HardEvent::V_S>(EVENT_ID0);   // inserted before the for loop
WaitFlag<HardEvent::V_S>(EVENT_ID0);  // inserted before the for loop
for (int i = 0; i < ave_tmp; ++i) {
    dstLocal.SetValue(i,0);
}

开启自动同步debug日志功能后，输出日志如下：

The BiSheng Auto Sync log of sync_log_test :  
Position: absolute-path/sync_log_test.h:4 : line before insert sync : SetFlag<HardEvent::V_S>(EVENT_ID0);
Position: absolute-path/sync_log_test.h:4 : line before insert sync : WaitFlag<HardEvent::V_S>(EVENT_ID0);
Position: absolute-path/sync_log_test.h:5 : line before insert sync : PipeBarrier<PIPE_V>();
Position: absolute-path/sync_log_test.h:5 : line before insert sync : SetFlag<HardEvent::S_V>(EVENT_ID0);
Position: absolute-path/sync_log_test.h:5 : line before insert sync : WaitFlag<HardEvent::S_V>(EVENT_ID0);
Position: absolute-path/sync_log_test.h:6 : line before insert sync : SetFlag<HardEvent::V_S>(EVENT_ID0);
Position: absolute-path/sync_log_test.h:6 : line before insert sync : WaitFlag<HardEvent::V_S>(EVENT_ID0);

其中，“line before insert sync”表示其后所列的同步指令被插入在“Position”所指向的代码行之前（紧邻该行）。

父主题： 基本编程指导