DataStoreBarrier

Product Support

Product	Supported
Atlas A3 training products / Atlas A3 inference products	√
Atlas A2 training products / Atlas A2 inference products	√
Atlas 200I/500 A2 inference products	x
Atlas inference product 's AI Core	x
Atlas inference product 's Vector Core	x
Atlas training products	x

Function

Data synchronization barrier instruction, which blocks the execution of the current thread. Subsequent instructions can be executed only after all previous memory write operations are complete and visible to other hardware units. It is used to ensure data consistency between the AI CPU and AI Core multi-cores.

Prototype

      
           DataStoreBarrier(void)

Parameters

None

Returns

None

Restrictions

This API can be called only through <<<...>>> and is used in the heterogeneous compilation scenario.

Example

Call AscendC::DataStoreBarrier() in the kernel implementation code of the AI CPU operator to ensure that the AI CPU operator writes the modified tiling data to the memory so that the AI Core operator can correctly read the tiling data.

struct TilingInfo {
    uint64_t lock; // Lock for synchronization between the AI CPU and AI Core
    int8_t type;
    int8_t mode;
    int8_t len;
};
struct KernelArgs {
    uint32_t *xDevice;
    uint32_t *yDevice;
    uint32_t *zDevice;
    TilingInfo *ti; // Parameter shared with the AI Core, used to synchronize tiling selection
};

template<typename T, int8_t mode, int8_t len>
__aicore__ void hello_world_impl(GM_ADDR m)
{
    if constexpr (std::is_same_v<T, float>) {
       AscendC::printf("Hello World: float mode %u len %u.\n", mode, len);
    } else if constexpr (std::is_same_v<T, int>) {
       AscendC::printf("Hello World: int mode %u len %u.\n", mode, len);
    }
}

// Main entry of the AI Core operator
// tilingInfo: Parameter transferred together with the AI CPU operator for data sharing
template<typename T, int8_t mode, int8_t len>
__mix__(1,2) __global__ __aicore__ void hello_world(GM_ADDR m, GM_ADDR TilingPtr)
{
     __gm__ struct KernelInfo::TilingInfo *ti = (__gm__ struct KernelInfo::TilingInfo *)TilingPtr;
    AscendC::GlobalTensor<uint64_t> lock;
    lock.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t *>(&ti->lock));
    if ASCEND_IS_AIV {
        if (AscendC::GetBlockIdx() == 0) {
            while (*reinterpret_cast<volatile __gm__ uint64_t*>(lock.GetPhyAddr(0)) == 0) {  // Offload mode. The AI Core waits until the AI CPU tiling computation is complete.
                AscendC::DataCacheCleanAndInvalid<uint64_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                    AscendC::DcciDst::CACHELINE_OUT>(lock); // Directly access the global memory to obtain the latest data.
            }
        }
    }
    // The preceding code indicates that one core waits for the AI CPU tiling calculation to complete. The following indicates inter-core synchronization.
    AscendC::SyncAll<false>();
    // Select a template based on the tiling parameter value.
    if (ti->type ==0 && ti->mode == 1 && ti->len == 2) {
        hello_world_impl<float, 1, 2>(m);
    } else if (ti->type == 1 && ti->mode == 2 && ti->len == 4) {
        hello_world_impl<int, 2, 4>(m);
    }
    // After the execution is complete, leave one core to release the lock.
    if ASCEND_IS_AIV {
        if (AscendC::GetBlockIdx() == 0) {
            lock.SetValue(0, 0); // Update the lock.
            AscendC::DataCacheCleanAndInvalid<uint64_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
                AscendC::DcciDst::CACHELINE_OUT>(lock); // Refresh the Dcache and synchronize data with the GM.
        }
    }
}

extern "C" __global__ __aicpu__ uint32_t MyAicpuKernel(void *arg)
{
    KernelArgs* cfg = (KernelArgs*)arg;
    AscendC::printf("MyAicpuKernel inited!\n");
    cfg->ti->lock = 1;
    cfg->ti->type = 1;
    cfg->ti->mode = 2;
    cfg->ti->len = 4;
    AscendC::DataStoreBarrier(); // Perform write synchronization on tilingInfo.
    AscendC::printf("MyAicpuKernel inited type %u mode %u len %u end!\n", cfg->ti->type, cfg->ti->mode, cfg->ti->len);
    return 0;
}

Parent topic: AI CPU API