DataStoreBarrier
Product Support
|
Product |
Supported |
|---|---|
|
|
√ |
|
|
√ |
|
|
x |
|
|
x |
|
|
x |
|
|
x |
Function
Data synchronization barrier instruction, which blocks the execution of the current thread. Subsequent instructions can be executed only after all previous memory write operations are complete and visible to other hardware units. It is used to ensure data consistency between the AI CPU and AI Core multi-cores.
Prototype
1
|
DataStoreBarrier(void) |
Parameters
None
Returns
None
Restrictions
Example
struct TilingInfo {
uint64_t lock; // Lock for synchronization between the AI CPU and AI Core
int8_t type;
int8_t mode;
int8_t len;
};
struct KernelArgs {
uint32_t *xDevice;
uint32_t *yDevice;
uint32_t *zDevice;
TilingInfo *ti; // Parameter shared with the AI Core, used to synchronize tiling selection
};
template<typename T, int8_t mode, int8_t len>
__aicore__ void hello_world_impl(GM_ADDR m)
{
if constexpr (std::is_same_v<T, float>) {
AscendC::printf("Hello World: float mode %u len %u.\n", mode, len);
} else if constexpr (std::is_same_v<T, int>) {
AscendC::printf("Hello World: int mode %u len %u.\n", mode, len);
}
}
// Main entry of the AI Core operator
// tilingInfo: Parameter transferred together with the AI CPU operator for data sharing
template<typename T, int8_t mode, int8_t len>
__mix__(1,2) __global__ __aicore__ void hello_world(GM_ADDR m, GM_ADDR TilingPtr)
{
__gm__ struct KernelInfo::TilingInfo *ti = (__gm__ struct KernelInfo::TilingInfo *)TilingPtr;
AscendC::GlobalTensor<uint64_t> lock;
lock.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t *>(&ti->lock));
if ASCEND_IS_AIV {
if (AscendC::GetBlockIdx() == 0) {
while (*reinterpret_cast<volatile __gm__ uint64_t*>(lock.GetPhyAddr(0)) == 0) { // Offload mode. The AI Core waits until the AI CPU tiling computation is complete.
AscendC::DataCacheCleanAndInvalid<uint64_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
AscendC::DcciDst::CACHELINE_OUT>(lock); // Directly access the global memory to obtain the latest data.
}
}
}
// The preceding code indicates that one core waits for the AI CPU tiling calculation to complete. The following indicates inter-core synchronization.
AscendC::SyncAll<false>();
// Select a template based on the tiling parameter value.
if (ti->type ==0 && ti->mode == 1 && ti->len == 2) {
hello_world_impl<float, 1, 2>(m);
} else if (ti->type == 1 && ti->mode == 2 && ti->len == 4) {
hello_world_impl<int, 2, 4>(m);
}
// After the execution is complete, leave one core to release the lock.
if ASCEND_IS_AIV {
if (AscendC::GetBlockIdx() == 0) {
lock.SetValue(0, 0); // Update the lock.
AscendC::DataCacheCleanAndInvalid<uint64_t, AscendC::CacheLine::SINGLE_CACHE_LINE,
AscendC::DcciDst::CACHELINE_OUT>(lock); // Refresh the Dcache and synchronize data with the GM.
}
}
}
extern "C" __global__ __aicpu__ uint32_t MyAicpuKernel(void *arg)
{
KernelArgs* cfg = (KernelArgs*)arg;
AscendC::printf("MyAicpuKernel inited!\n");
cfg->ti->lock = 1;
cfg->ti->type = 1;
cfg->ti->mode = 2;
cfg->ti->len = 4;
AscendC::DataStoreBarrier(); // Perform write synchronization on tilingInfo.
AscendC::printf("MyAicpuKernel inited type %u mode %u len %u end!\n", cfg->ti->type, cfg->ti->mode, cfg->ti->len);
return 0;
}