The following affinity library applies to PyTorch 1.11.0.
def fuse_add_softmax_dropout(training, dropout, attn_mask, attn_scores, attn_head_size, p=0.5, dim=-1):
Uses an NPU custom operator to replace the native implementation and improve performance.
>>> training = True
>>> dropout = nn.DropoutWithByteMask(0.1)
>>> npu_input1 = torch.rand(96, 12, 384, 384).half().npu()
>>> npu_input2 = torch.rand(96, 12, 384, 384).half().npu()
>>> alpha = 0.125
>>> axis = -1
>>> output = torch_npu.contrib.function.fuse_add_softmax_dropout(training, dropout, npu_input1, npu_input2, alpha, dim=axis)
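For reference, the native pattern that this fused operator replaces looks roughly like the sketch below. It is only a sketch based on the operator's name and arguments; exactly how attn_head_size enters the scaling is an assumption.

import torch

def add_softmax_dropout_native(dropout, attn_mask, attn_scores, scale, dim=-1):
    # scale the raw scores, add the attention mask, softmax, then dropout
    probs = torch.softmax(attn_scores * scale + attn_mask, dim=dim)
    return dropout(probs)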
def npu_diou(boxes1, boxes2, trans=True, is_cross=False, mode=0):
Applies an NPU based DIOU operation.
Taking into account the distance between targets, the overlap rate, and the range, different targets or boundaries tend to be stable.
Until now, the DIoU backward pass only supports trans==True, is_cross==False, mode==0 ('iou') in the current version. If you need backpropagation, please make sure your parameters are correct!
>>> box1 = torch.randn(4, 32).npu()
>>> box1.requires_grad = True
>>> box2 = torch.randn(4, 32).npu()
>>> box2.requires_grad = True
>>> diou = torch_npu.contrib.function.npu_diou(box1, box2)
>>> l = diou.sum()
>>> l.backward()
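For reference, a minimal CPU sketch of the DIoU computation, assuming (N, 4) boxes in (x1, y1, x2, y2) format and element-wise pairing (is_cross=False); npu_diou's actual (4, n) layout and its trans/mode flags are not modeled here.

import torch

def diou_reference(b1, b2, eps=1e-9):
    # element-wise DIoU for (N, 4) boxes in (x1, y1, x2, y2) format
    area1 = (b1[:, 2] - b1[:, 0]) * (b1[:, 3] - b1[:, 1])
    area2 = (b2[:, 2] - b2[:, 0]) * (b2[:, 3] - b2[:, 1])
    lt = torch.max(b1[:, :2], b2[:, :2])      # intersection top-left
    rb = torch.min(b1[:, 2:], b2[:, 2:])      # intersection bottom-right
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]
    iou = inter / (area1 + area2 - inter + eps)
    ctr1 = (b1[:, :2] + b1[:, 2:]) / 2        # box centers
    ctr2 = (b2[:, :2] + b2[:, 2:]) / 2
    rho2 = ((ctr1 - ctr2) ** 2).sum(dim=1)    # squared center distance
    elt = torch.min(b1[:, :2], b2[:, :2])     # enclosing box corners
    erb = torch.max(b1[:, 2:], b2[:, 2:])
    c2 = ((erb - elt) ** 2).sum(dim=1) + eps  # squared enclosing-box diagonal
    return iou - rho2 / c2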
def npu_ciou(boxes1, boxes2, trans=True, is_cross=False, mode=0):
Applies an NPU based CIOU operation.
CIoU adds a penalty term on the basis of DIoU.
Until now, the CIoU backward pass only supports trans==True, is_cross==False, mode==0 ('iou') in the current version. If you need backpropagation, please make sure your parameters are correct!
>>> box1 = torch.randn(4, 32).npu()
>>> box1.requires_grad = True
>>> box2 = torch.randn(4, 32).npu()
>>> box2.requires_grad = True
>>> ciou = torch_npu.contrib.function.npu_ciou(box1, box2)
>>> l = ciou.sum()
>>> l.backward()
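On top of DIoU, the CIoU penalty has the standard form (Zheng et al., 2020): CIoU = IoU - rho^2(b, b_gt) / c^2 - alpha * v, where v = (4 / pi^2) * (arctan(w_gt / h_gt) - arctan(w / h))^2 measures aspect-ratio consistency and alpha = v / ((1 - IoU) + v).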
def npu_single_level_responsible_flags(featmap_size, gt_bboxes, stride, num_base_anchors):
Uses an NPU OP to generate the responsible flags of anchors in a single feature map.
torch.Tensor: The valid flags of each anchor in a single-level feature map. Output size is [featmap_size[0] * featmap_size[1] * num_base_anchors].
>>> featmap_sizes = [[10, 10], [20, 20], [40, 40]]
>>> stride = [[32, 32], [16, 16], [8, 8]]
>>> gt_bboxes = torch.randint(0, 512, size=(128, 4))
>>> num_base_anchors = 3
>>> featmap_level = len(featmap_sizes)
>>> torch.npu.set_device(0)
>>> for i in range(featmap_level):
...     gt_bboxes = gt_bboxes.npu()
...     out = npu_single_level_responsible_flags(featmap_sizes[i], gt_bboxes, stride[i], num_base_anchors)
...     print(out.shape, out.max(), out.min())
def npu_bbox_coder_encode_yolo(bboxes, gt_bboxes, stride):
Uses an NPU OP to get the box regression transformation deltas that can be used to transform bboxes into gt_bboxes.
>>> A = 1024
>>> bboxes = torch.randint(0, 512, size=(A, 4))
>>> gt_bboxes = torch.randint(0, 512, size=(A, 4))
>>> stride = torch.randint(0, 32, size=(A,))
>>> torch.npu.set_device(0)
>>> bboxes = bboxes.npu()
>>> gt_bboxes = gt_bboxes.npu()
>>> stride = stride.npu()
>>> out = npu_bbox_coder_encode_yolo(bboxes, gt_bboxes, stride)
>>> torch.npu.synchronize()
>>> print('_npu_bbox_coder_encode_yolo done. output shape is ', out.shape)
def npu_bbox_coder_encode_xyxy2xywh(bboxes, gt_bboxes, means=None, stds=None, is_normalized=False, normalized_scale=10000.):
Applies an NPU-based bbox format-encode operation from xyxy to xywh.
Dynamic shapes are not supported. Because of the operator semantics, only 2-dimensional (n, 4) shapes are supported. bboxes and gt_bboxes must have the same shape and the same dtype, and only fp16 and fp32 are supported. The third input (stride) only supports 1D, and its first dimension must equal that of the first input (bboxes).
>>> A = 1024
>>> bboxes = torch.randint(0, 512, size=(A, 4))
>>> gt_bboxes = torch.randint(0, 512, size=(A, 4))
>>> torch.npu.set_device(0)
>>> bboxes = bboxes.npu()
>>> gt_bboxes = gt_bboxes.npu()
>>> out = npu_bbox_coder_encode_xyxy2xywh(bboxes, gt_bboxes)
>>> torch.npu.synchronize()
>>> print('_npu_bbox_coder_encode_xyxy2xywh done. output shape is ', out.shape)
def npu_bbox_coder_decode_xywh2xyxy(bboxes, pred_bboxes, means=None, stds=None, max_shape=None, wh_ratio_clip=16 / 1000):
Applies an NPU-based bbox format-decode operation from xywh to xyxy.
Tensor: Boxes with shape (N, 4), where the 4 columns represent tl_x, tl_y, br_x, br_y.
>>> A = 1024
>>> max_shape = 512
>>> bboxes = torch.randint(0, max_shape, size=(A, 4))
>>> pred_bboxes = torch.randn(A, 4)
>>> torch.npu.set_device(0)
>>> bboxes = bboxes.npu()
>>> pred_bboxes = pred_bboxes.npu()
>>> out = npu_bbox_coder_decode_xywh2xyxy(bboxes, pred_bboxes, max_shape=(max_shape, max_shape))
>>> torch.npu.synchronize()
>>> print('_npu_bbox_coder_decode_xywh2xyxy done. output shape is ', out.shape)
def npu_fast_condition_index_put(x, condition, value):
Uses an NPU affinity writing method to replace the native bool-type index_put operation.
>>> import copy
>>> x = torch.randn(128, 8192)
>>> condition = x < 0.5
>>> value = 0.
>>> x1 = copy.deepcopy(x)
>>> x1[condition] = value
>>> x1_opt = npu_fast_condition_index_put(x, condition, value)
class MatmulApply(torch.autograd.Function):
Uses an NPU custom operator to replace the native implementation and improve performance.
In dynamic shape scenarios, due to an operator restriction, broadcasting is not supported.
>>> matmul_transpose = MatmulApply.apply
>>> tensor1 = torch.randn(68, 5, 75, 16).npu()
>>> tensor1.requires_grad_(True)
>>> tensor2 = torch.randn(68, 5, 75, 16).npu()
>>> tensor2.requires_grad_(True)
>>> output = matmul_transpose(tensor1, tensor2)
>>> output.sum().backward()
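A plain-PyTorch equivalent of what matmul_transpose computes, assumed from the name and the example shapes ((68, 5, 75, 16) inputs giving a (68, 5, 75, 75) output):

import torch

def matmul_transpose_native(a, b):
    # batched a @ b^T; the NPU custom op avoids the explicit transpose copy
    return torch.matmul(a, b.transpose(-2, -1))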
def npu_multiclass_nms(multi_bboxes, multi_scores, score_thr=0.05, nms_thr=0.45, max_num=50, score_factors=None):
NMS for multi-class bboxes using npu api.
In the case of dynamic shape, because of NPU op limitations, the maximum number of categories (nmsed_classes) is 20 and the maximum number of boxes (nmsed_boxes) is 10000.
tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels are 0-based.
>>> boxes = torch.randint(1, 255, size=(1000, 4))
>>> scores = torch.randn(1000, 81)
>>> boxes = boxes.npu().half()
>>> scores = scores.npu().half()
>>> det_bboxes, det_labels = npu_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3)
>>> expected_det_bboxes = torch.tensor([[ 57.0000, 198.8750,  45.9688, 221.8750,   4.1484],
...                                     [215.0000, 155.0000, 236.8750, 137.0000,   3.9023],
...                                     [208.8750, 221.0000, 228.0000,  17.0000,   3.8867]],
...                                    dtype=torch.float16)
def npu_batched_multiclass_nms(multi_bboxes, multi_scores, score_thr=0.05, nms_thr=0.45, max_num=50, score_factors=None):
NMS for batched multi-class bboxes using npu api.
In the case of dynamic shape, because of NPU op limitations, the maximum number of categories (nmsed_classes) is 20 and the maximum number of boxes (nmsed_boxes) is 10000.
tuple: (bboxes, labels), tensors of shape (bs, k, 5) and (bs, k, 1). Labels are 0-based.
>>> boxes = torch.randint(1, 255, size=(4, 200, 80, 4))
>>> scores = torch.randn(4, 200, 81)
>>> boxes = boxes.npu().half()
>>> scores = scores.npu().half()
>>> det_bboxes, det_labels = npu_batched_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3)
>>> expected_det_bboxes = torch.tensor([[[221.8750,  60.0000, 183.0000,  22.0000,   3.8867],
...                                      [167.0000, 250.0000, 136.0000, 144.0000,   3.6445],
...                                      [ 45.9688, 147.0000,  67.0000, 241.8750,   3.4844]],
...                                     [[  5.0000, 178.0000, 243.8750, 138.0000,   3.7344],
...                                      [238.0000, 132.0000,  47.0000,  84.0000,   3.6836],
...                                      [ 32.0000, 110.0000, 131.0000,  73.0000,   3.6309]],
...                                     [[111.9375, 120.9375,  54.0000, 231.0000,   3.9219],
...                                      [147.0000, 162.0000,  78.0000,   1.0010,   3.9219],
...                                      [157.0000, 118.0000,  57.0000, 115.0000,   3.6523]],
...                                     [[ 80.0000, 126.9375,  54.0000, 246.8750,   3.7344],
...                                      [ 31.0000, 253.8750,  19.0000, 138.0000,   3.6328],
...                                      [ 54.0000, 253.8750,  78.0000,  75.0000,   3.5586]]],
...                                    dtype=torch.float16)
def dropout_with_byte_mask(input1, p=0.5, training=True, inplace=False):
This dropout_with_byte_mask method generates a stateless random uint8 mask and does dropout according to the mask.
>>> torch.manual_seed(5)
>>> items = [[np.float16, 2, (4, 4)], [np.float16, 0, (32, 384, 1024)]]
>>> for item in items:
...     cpu_input, npu_input = create_common_tensor(item, 0, 1)
...     self.npu_op_exec(npu_input, prob=0.2)
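The example above relies on test-harness helpers (create_common_tensor, npu_op_exec). A minimal standalone call might look like the following sketch, assuming the function is exposed as torch_npu.contrib.function.dropout_with_byte_mask:

import torch
import torch_npu

x = torch.randn(32, 384, 1024).half().npu()
out = torch_npu.contrib.function.dropout_with_byte_mask(x, p=0.2, training=True)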
class NpuRollWithIndexSelect():
Uses an NPU affinity writing method to replace the native roll in Swin Transformer.
>>> input1 = torch.randn(32, 56, 56, 16).npu()
>>> shift_size = 3
>>> shifted_x_npu = roll(input1, shifts=(-shift_size, -shift_size), dims=(1, 2))
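For comparison, the native Swin Transformer code performs the same cyclic shift with torch.roll:

import torch

x = torch.randn(32, 56, 56, 16)
shift_size = 3
# cyclic shift along the two spatial dims, as in Swin Transformer
shifted_x = torch.roll(x, shifts=(-shift_size, -shift_size), dims=(1, 2))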
class Mish(nn.Module):
Applies an NPU based Mish operation.
Mish exists in the official version in PyTorch 1.9.0. Currently, the PyTorch version adapted for NPU is 1.5.0, so Mish needs to be defined as an additional module.
>>> m = Mish()
>>> input_tensor = torch.randn(2, 32, 5, 5)
>>> output = m(input_tensor)
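Functionally, Mish is x * tanh(softplus(x)); a one-line reference implementation:

import torch
import torch.nn.functional as F

def mish_reference(x):
    # mish(x) = x * tanh(softplus(x))
    return x * torch.tanh(F.softplus(x))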
class SiLU(nn.Module):
Applies an NPU based Sigmoid Linear Unit (SiLU) function, element-wise. The SiLU function is also known as the swish function.
SiLU exists in the official version since PyTorch 1.7.0. Currently, the PyTorch version adapted for NPU is 1.5.0, so SiLU needs to be defined as an additional module.
>>> m = SiLU()
>>> input_tensor = torch.randn(2, 32, 5, 5)
>>> output = m(input_tensor)
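Functionally, SiLU is x * sigmoid(x); a one-line reference implementation:

import torch

def silu_reference(x):
    # silu(x) = x * sigmoid(x), a.k.a. swish
    return x * torch.sigmoid(x)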
class ChannelShuffle(nn.Module):
Applies an NPU-compatible channel shuffle operation. To avoid the contiguous operation, which is not efficient on NPU, we replaced the original operation with a rewrite of the same semantics. Two discontiguous operations are replaced: transpose and chunk.
>>> x1 = torch.randn(2, 32, 7, 7)
>>> x2 = torch.randn(2, 32, 7, 7)
>>> m = ChannelShuffle(64, split_shuffle=True)
>>> output = m(x1, x2)
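For reference, the standard (native) channel shuffle that relies on transpose, which the NPU rewrite avoids, is:

import torch

def channel_shuffle_native(x, groups):
    # reshape -> transpose -> flatten; the transpose makes the tensor
    # non-contiguous, which is inefficient on NPU
    n, c, h, w = x.shape
    x = x.view(n, groups, c // groups, h, w)
    x = x.transpose(1, 2).contiguous()
    return x.view(n, c, h, w)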
class LabelSmoothingCrossEntropy(nn.Module):
CrossEntropy with LabelSmoothing using npu api.
Tensor: the scalar label-smoothed cross-entropy loss.
>>> x = torch.randn(2, 10)
>>> y = torch.randint(0, 10, size=(2,))
>>> x = x.npu()
>>> y = y.npu()
>>> x.requires_grad = True
>>> m = LabelSmoothingCrossEntropy(10)
>>> npu_output = m(x, y)
>>> npu_output.backward()
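A plain-PyTorch sketch of label-smoothed cross entropy with smoothing factor eps (the usual uniform-smoothing form is assumed here; the NPU module fuses this computation):

import torch
import torch.nn.functional as F

def label_smoothing_ce_reference(logits, target, eps=0.1):
    # (1 - eps) * NLL of the true class + eps * mean NLL over all classes
    logp = F.log_softmax(logits, dim=-1)
    nll = -logp.gather(1, target.unsqueeze(1)).squeeze(1)
    smooth = -logp.mean(dim=-1)
    return ((1 - eps) * nll + eps * smooth).mean()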
class ModulatedDeformConv(nn.Module):
Applies an NPU based Modulated Deformable 2D convolution operation.
ModulatedDeformConv only implements operations under the fp32 data type. Note that weight and bias in conv_offset must be initialized to 0.
>>> m = ModulatedDeformConv(32, 32, 1)
>>> input_tensor = torch.randn(2, 32, 5, 5)
>>> output = m(input_tensor)
>>> x = torch.randn(2, 32, 7, 7)
>>> model = ModulatedDeformConv(32, 32, 3, 2, 1)
>>> torch.npu.set_device(0)
>>> x = x.npu()
>>> model = model.npu()
>>> o = model(x)
>>> l = o.sum()
>>> l.backward()
>>> print(l)
class NpuDropPath(nn.Module):
Uses an NPU affinity writing method to replace the native drop path in swin_transformer.py. Drop paths (Stochastic Depth) per sample, when applied in the main path of residual blocks.
>>> input1 = torch.randn(68, 5).npu()
>>> input1.requires_grad_(True)
>>> input2 = torch.randn(68, 5).npu()
>>> input2.requires_grad_(True)
>>> fast_drop_path = NpuDropPath(0).npu()
>>> output = input1 + fast_drop_path(input2)
>>> output.sum().backward()
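The native per-sample drop path (stochastic depth) being replaced is usually written as follows (a reference sketch in the style of timm):

import torch

def drop_path_reference(x, drop_prob=0., training=False):
    # randomly zero whole samples along the batch dim, rescale the rest
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = x.new_empty(shape).bernoulli_(keep_prob)
    return x.div(keep_prob) * mask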
class NpuCachedDropout(torch.nn.Dropout):
FairseqDropout for use on NPU devices.
>>> model = NpuMNIST().to("npu")
>>> x = torch.randn(2, 10, 16, 16).to("npu")
>>> NpuCachedDropout.enable_dropout_ensemble(model)
>>> output = model(x)
class Focus(nn.Module):
Uses an NPU affinity writing method to replace the native Focus module in YOLOv5.
>>> input = torch.randn(4, 8, 300, 40).npu()
>>> input.requires_grad_(True)
>>> fast_focus = Focus(8, 13).npu()
>>> output = fast_focus(input)
>>> output.sum().backward()
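For reference, the native YOLOv5 Focus step slices each 2x2 pixel block into channels before the convolution:

import torch

def focus_slice_native(x):
    # (b, c, h, w) -> (b, 4c, h/2, w/2) via strided slicing
    return torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2],
                      x[..., ::2, 1::2], x[..., 1::2, 1::2]], dim=1)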
class FusedColorJitter(torch.nn.Module):
Randomly change the brightness, contrast, saturation and hue of an image.
>>> train_dataset = datasets.ImageFolder(
...     traindir,
...     transforms.Compose([
...         transforms.RandomResizedCrop(224),
...         transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
...         transforms.RandomHorizontalFlip(),
...     ]))
class MultiheadAttention(nn.Module):
Multi-headed attention.
>>> model = MultiheadAttention(embed_dim=1024, num_heads=16, dropout=0.1, kdim=1024, vdim=1024, self_attention=True, encoder_decoder_attention=True)
>>> _, query = create_common_tensor([np.float16, FORMAT_NZ, (1024, 1024)], -1, 1)
>>> _, key = create_common_tensor([np.float16, FORMAT_NZ, (1024, 1024)], -1, 1)
>>> _, value = create_common_tensor([np.float16, FORMAT_NZ, (1024, 1024)], -1, 1)
>>> _, key_padding_mask = create_common_tensor([np.float16, FORMAT_NZ, (16, 16, 64, 64)], -65504, 65504)
>>> bsz = 16
>>> tgt_len = 64
>>> s_len = 64
>>> model = model.to("npu")
>>> output = model(query, key, value, bsz, tgt_len, s_len, key_padding_mask)
class DropoutWithByteMask(Module):
Applies an NPU-compatible DropoutWithByteMask operation. Only supports NPU devices.
max_seed is a hyper-parameter strongly related to the underlying operator. Please check the MAX(2 ** 31 - 1 / 2 ** 10 - 1) in dropout_v2.py in the OPP package for matching settings. By default, it is matched by the PyTorch and OPP packages.
>>> m = nn.DropoutWithByteMask(p=0.5)
>>> input = torch.randn(16, 16)
>>> output = m(input)
class PSROIPool(nn.Module):
PSROIPool using npu api.
>>> model = PSROIPool(pooled_height=7, pooled_width=7, spatial_scale=1 / 16.0, group_size=7, output_dim=22)
class ROIAlign(nn.Module):
ROIAlign using npu api.
Given a continuous coordinate c, its two neighboring pixel indices (in our pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled from the underlying signal at continuous coordinates 0.5 and 1.5). But the original roi_align (aligned=False) does not subtract the 0.5 when computing neighboring pixel indices, and therefore it uses pixels with a slightly incorrect alignment (relative to our pixel model) when performing bilinear interpolation. With aligned=True, we first appropriately scale the ROI and then shift it by -0.5 prior to calling roi_align. This produces the correct neighbors; see detectron2/tests/test_roi_align.py for verification. The difference does not make a difference to the model's performance if ROIAlign is used together with conv layers.
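The c=1.3 example above can be checked directly:

import math

c = 1.3
left, right = math.floor(c - 0.5), math.ceil(c - 0.5)  # -> 0, 1
# pixels 0 and 1 have centers at continuous coordinates 0.5 and 1.5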
Tensor: the pooled features, of shape (num_rois, channels, pooled_height, pooled_width).
>>> input1 = self.generate_input()
>>> roi = torch.tensor([[0, -2.0, -2.0, 22.0, 22.0]]).npu()
>>> output_size = (3, 3)
>>> spatial_scale = 0.25
>>> sampling_ratio = 2
>>> aligned = False
>>> npu_output, npu_inputgrad = self.npu_roi_align(input1, roi, output_size, spatial_scale, sampling_ratio, aligned)
>>> expected_cpu_output = torch.tensor([[[[ 4.5000,  6.5000,  8.5000],
...                                       [16.5000, 18.5000, 20.5000],
...                                       [28.5000, 30.5000, 32.5000]]]],
...                                    dtype=torch.float32)
>>> expected_cpu_inputgrad = torch.tensor([[[[0.2397, 0.2346, 0.2346, 0.2346, 0.2346, 0.2907],
...                                          [0.2346, 0.2296, 0.2296, 0.2296, 0.2296, 0.2845],
...                                          [0.2346, 0.2296, 0.2296, 0.2296, 0.2296, 0.2845],
...                                          [0.2346, 0.2296, 0.2296, 0.2296, 0.2296, 0.2845],
...                                          [0.2346, 0.2296, 0.2296, 0.2296, 0.2296, 0.2845],
...                                          [0.2907, 0.2845, 0.2845, 0.2845, 0.2845, 0.3525]]]],
...                                       dtype=torch.float32)
>>> self.assertRtolEqual(expected_cpu_output, npu_output)
>>> self.assertRtolEqual(expected_cpu_inputgrad, npu_inputgrad)