The following affinity library applies to PyTorch 1.8.1.
def fuse_add_softmax_dropout(training, dropout, attn_mask, attn_scores, attn_head_size, p=0.5, dim=-1):
Use an NPU custom operator to replace the native implementation and improve performance.
>>> training = True
>>> dropout = nn.DropoutWithByteMask(0.1)
>>> npu_input1 = torch.rand(96, 12, 384, 384).half().npu()
>>> npu_input2 = torch.rand(96, 12, 384, 384).half().npu()
>>> alpha = 0.125
>>> axis = -1
>>> output = torch_npu.contrib.function.fuse_add_softmax_dropout(training, dropout, npu_input1, npu_input2, alpha, dim=axis)
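For reference, the eager pattern this helper fuses is, roughly, mask-add + softmax + dropout. A minimal sketch follows; treating the attn_head_size argument as a multiplicative scale (alpha = 0.125 in the example above) and applying it before the mask-add are assumptions, not the kernel's exact definition:

import torch
import torch.nn.functional as F

def add_softmax_dropout_native(dropout, attn_mask, attn_scores, scale, dim=-1):
    # Three separate eager ops; the NPU helper replaces them with one
    # fused custom operator to avoid materializing intermediate tensors.
    x = attn_scores * scale + attn_mask   # assumed ordering of scale and mask-add
    x = F.softmax(x, dim=dim)
    return dropout(x)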
def npu_diou(boxes1, boxes2, trans=True, is_cross=False, mode=0):
Apply an NPU based DIoU operation.
It takes into account the distance between targets, the overlap rate, and the scale, so the regression of different targets or boundaries tends to be more stable.
Currently, the DIoU backward pass only supports trans==True, is_cross==False, mode==0 ('iou'). If back propagation is needed, make sure these parameters are set accordingly.
>>> box1 = torch.randn(4, 32).npu()
>>> box1.requires_grad = True
>>> box2 = torch.randn(4, 32).npu()
>>> box2.requires_grad = True
>>> diou = torch_npu.contrib.function.npu_diou(box1, box2)
>>> l = diou.sum()
>>> l.backward()
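For the semantics, DIoU subtracts the normalized squared center distance from the IoU. A minimal plain-PyTorch sketch, assuming (N, 4) boxes in (x1, y1, x2, y2) format (the NPU operator's trans/is_cross flags select other layouts; this illustrates only the metric, not the kernel):

import torch

def diou_reference(b1, b2, eps=1e-9):
    # Intersection-over-union of axis-aligned boxes.
    lt = torch.max(b1[:, :2], b2[:, :2])
    rb = torch.min(b1[:, 2:], b2[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]
    area1 = (b1[:, 2] - b1[:, 0]) * (b1[:, 3] - b1[:, 1])
    area2 = (b2[:, 2] - b2[:, 0]) * (b2[:, 3] - b2[:, 1])
    iou = inter / (area1 + area2 - inter + eps)
    # Squared center distance, normalized by the squared diagonal
    # of the smallest enclosing box.
    c1 = (b1[:, :2] + b1[:, 2:]) / 2
    c2 = (b2[:, :2] + b2[:, 2:]) / 2
    rho2 = ((c1 - c2) ** 2).sum(dim=1)
    enc_lt = torch.min(b1[:, :2], b2[:, :2])
    enc_rb = torch.max(b1[:, 2:], b2[:, 2:])
    diag2 = ((enc_rb - enc_lt) ** 2).sum(dim=1) + eps
    return iou - rho2 / diag2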
def npu_ciou(boxes1, boxes2, trans=True, is_cross=False, mode=0):
Apply an NPU based CIoU operation.
CIoU adds a penalty term on the basis of DIoU.
Currently, the CIoU backward pass only supports trans==True, is_cross==False, mode==0 ('iou'). If back propagation is needed, make sure these parameters are set accordingly.
>>> box1 = torch.randn(4, 32).npu()
>>> box1.requires_grad = True
>>> box2 = torch.randn(4, 32).npu()
>>> box2.requires_grad = True
>>> ciou = torch_npu.contrib.function.npu_ciou(box1, box2)
>>> l = ciou.sum()
>>> l.backward()
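The penalty term referred to above is the aspect-ratio consistency term v with adaptive weight alpha, so that CIoU = DIoU - alpha * v. A minimal sketch of just that term, using the widths and heights of the two boxes (illustrative only):

import math
import torch

def ciou_penalty(w1, h1, w2, h2, iou, eps=1e-9):
    # v measures aspect-ratio inconsistency between the two boxes;
    # alpha down-weights it when the boxes barely overlap.
    v = (4 / math.pi ** 2) * (torch.atan(w2 / (h2 + eps)) - torch.atan(w1 / (h1 + eps))) ** 2
    alpha = v / (1 - iou + v + eps)
    return alpha * v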
def npu_single_level_responsible_flags(featmap_size, gt_bboxes, stride, num_base_anchors):
Use NPU OP to generate the responsible flags of anchors in a single feature map.
torch.Tensor: the valid flags of each anchor in a single-level feature map. The output size is [featmap_size[0] * featmap_size[1] * num_base_anchors].
>>> featmap_sizes = [[10, 10], [20, 20], [40, 40]]
>>> stride = [[32, 32], [16, 16], [8, 8]]
>>> gt_bboxes = torch.randint(0, 512, size=(128, 4))
>>> num_base_anchors = 3
>>> featmap_level = len(featmap_sizes)
>>> torch.npu.set_device(0)
>>> for i in range(featmap_level):
...     gt_bboxes = gt_bboxes.npu()
...     out = npu_single_level_responsible_flags(featmap_sizes[i], gt_bboxes, stride[i], num_base_anchors)
...     print(out.shape, out.max(), out.min())
def npu_bbox_coder_encode_yolo(bboxes, gt_bboxes, stride):
Use NPU OP to get box regression transformation deltas that can be used to transform the bboxes into the gt_bboxes.
The third input (stride) only supports 1D, and its first dimension must be the same as that of the first input (bboxes).
>>> A = 1024
>>> bboxes = torch.randint(0, 512, size=(A, 4))
>>> gt_bboxes = torch.randint(0, 512, size=(A, 4))
>>> stride = torch.randint(0, 32, size=(A,))
>>> torch.npu.set_device(0)
>>> bboxes = bboxes.npu()
>>> gt_bboxes = gt_bboxes.npu()
>>> stride = stride.npu()
>>> out = npu_bbox_coder_encode_yolo(bboxes, gt_bboxes, stride)
>>> torch.npu.synchronize()
>>> print('_npu_bbox_coder_encode_yolo done. output shape is ', out.shape)
def npu_bbox_coder_encode_xyxy2xywh(bboxes, gt_bboxes, means=None, stds=None, is_normalized=False, normalized_scale=10000.):
Apply an NPU based bbox format-encode operation from xyxy to xywh.
Dynamic shapes are not supported. Because of operator semantics, only 2-dimensional (n, 4) scenes are supported. bboxes and gt_bboxes must have the same shape and the same dtype, and the dtype must be fp16 or fp32.
>>> A = 1024
>>> bboxes = torch.randint(0, 512, size=(A, 4))
>>> gt_bboxes = torch.randint(0, 512, size=(A, 4))
>>> torch.npu.set_device(0)
>>> bboxes = bboxes.npu()
>>> gt_bboxes = gt_bboxes.npu()
>>> out = npu_bbox_coder_encode_xyxy2xywh(bboxes, gt_bboxes)
>>> torch.npu.synchronize()
>>> print('_npu_bbox_coder_encode_xyxy2xywh done. output shape is ', out.shape)
def npu_bbox_coder_decode_xywh2xyxy(bboxes, pred_bboxes, means=None, stds=None, max_shape=None, wh_ratio_clip=16 / 1000):
Apply an NPU based bbox format-decode operation from xywh to xyxy.
Tensor: Boxes with shape (N, 4), where 4 represents tl_x, tl_y, br_x, br_y.
>>> A = 1024
>>> max_shape = 512
>>> bboxes = torch.randint(0, max_shape, size=(A, 4))
>>> pred_bboxes = torch.randn(A, 4)
>>> torch.npu.set_device(0)
>>> bboxes = bboxes.npu()
>>> pred_bboxes = pred_bboxes.npu()
>>> out = npu_bbox_coder_decode_xywh2xyxy(bboxes, pred_bboxes, max_shape=(max_shape, max_shape))
>>> torch.npu.synchronize()
>>> print('_npu_bbox_coder_decode_xywh2xyxy done. output shape is ', out.shape)
def npu_fast_condition_index_put(x, condition, value):
Use the NPU affinity writing method to replace the native bool-type index_put operation.
x (torch.Tensor): normal tensor
condition (torch.BoolTensor): judgment condition
value (int, float): the value to be filled in where condition is True
>>> import copy
>>> x = torch.randn(128, 8192)
>>> condition = x < 0.5
>>> value = 0.
>>> x1 = copy.deepcopy(x)
>>> x1[condition] = value
>>> x1_opt = npu_fast_condition_index_put(x, condition, value)
class MatmulApply(torch.autograd.Function):
Use an NPU custom operator to replace the native implementation and improve performance.
In dynamic shape scenes, the broadcast case is not supported due to operator restrictions.
>>> tensor1 = torch.randn(68, 5, 75, 16).npu()
>>> tensor1.requires_grad_(True)
>>> tensor2 = torch.randn(68, 5, 75, 16).npu()
>>> tensor2.requires_grad_(True)
>>> output = matmul_transpose(tensor1, tensor2)
>>> output.sum().backward()
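The eager equivalent of matmul_transpose is a batched matmul against a transposed second operand; the transpose yields a non-contiguous tensor that the custom operator avoids materializing. A minimal sketch (assuming the fused op computes mat1 @ mat2^T over the last two dimensions, consistent with the example shapes):

import torch

def matmul_transpose_native(tensor1, tensor2):
    # (..., M, K) @ (..., N, K)^T -> (..., M, N)
    return torch.matmul(tensor1, tensor2.transpose(-2, -1))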
def npu_multiclass_nms(multi_bboxes, multi_scores, score_thr=0.05, nms_thr=0.45, max_num=50, score_factors=None):
NMS for multi-class bboxes using the NPU API.
In the dynamic shape case, because of NPU operator limitations, at most 20 categories (nmsed_classes) and at most 10000 boxes (nmsed_boxes) are supported.
Tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels are 0-based.
>>> boxes = torch.randint(1, 255, size=(1000, 4))
>>> scores = torch.randn(1000, 81)
>>> boxes = boxes.npu().half()
>>> scores = scores.npu().half()
>>> det_bboxes, det_labels = npu_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3)
>>> expected_det_bboxes = torch.tensor([[ 57.0000, 198.8750,  45.9688, 221.8750,   4.1484],
...                                     [215.0000, 155.0000, 236.8750, 137.0000,   3.9023],
...                                     [208.8750, 221.0000, 228.0000,  17.0000,   3.8867]], dtype=torch.float16)
def npu_batched_multiclass_nms(multi_bboxes, multi_scores, score_thr=0.05, nms_thr=0.45, max_num=50, score_factors=None):
NMS for batched multi-class bboxes using the NPU API.
In the dynamic shape case, because of NPU operator limitations, at most 20 categories (nmsed_classes) and at most 10000 boxes (nmsed_boxes) are supported.
Tuple: (bboxes, labels), tensors of shape (bs, k, 5) and (bs, k, 1). Labels are 0-based.
>>> boxes = torch.randint(1, 255, size=(4, 200, 80, 4))
>>> scores = torch.randn(4, 200, 81)
>>> boxes = boxes.npu().half()
>>> scores = scores.npu().half()
>>> det_bboxes, det_labels = npu_batched_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3)
>>> expected_det_bboxes = torch.tensor([[[221.8750,  60.0000, 183.0000,  22.0000,   3.8867],
...                                      [167.0000, 250.0000, 136.0000, 144.0000,   3.6445],
...                                      [ 45.9688, 147.0000,  67.0000, 241.8750,   3.4844]],
...                                     [[  5.0000, 178.0000, 243.8750, 138.0000,   3.7344],
...                                      [238.0000, 132.0000,  47.0000,  84.0000,   3.6836],
...                                      [ 32.0000, 110.0000, 131.0000,  73.0000,   3.6309]],
...                                     [[111.9375, 120.9375,  54.0000, 231.0000,   3.9219],
...                                      [147.0000, 162.0000,  78.0000,   1.0010,   3.9219],
...                                      [157.0000, 118.0000,  57.0000, 115.0000,   3.6523]],
...                                     [[ 80.0000, 126.9375,  54.0000, 246.8750,   3.7344],
...                                      [ 31.0000, 253.8750,  19.0000, 138.0000,   3.6328],
...                                      [ 54.0000, 253.8750,  78.0000,  75.0000,   3.5586]]], dtype=torch.float16)
def dropout_with_byte_mask(input1, p=0.5, training=True, inplace=False):
This dropout_with_byte_mask method generates a stateless random uint8 mask and performs dropout according to the mask.
>>> torch.manual_seed(5)
>>> items = [[np.float16, 2, (4, 4)], [np.float16, 0, (32, 384, 1024)]]
>>> for item in items:
...     cpu_input, npu_input = create_common_tensor(item, 0, 1)
...     self.npu_op_exec(npu_input, prob=0.2)
class NpuRollWithIndexSelect():
Use the NPU affinity writing method to replace the native roll in swin-transformer.
>>> input1 = torch.randn(32, 56, 56, 16).npu()
>>> shift_size = 3
>>> shifted_x_npu = roll(input1, shifts=(-shift_size, -shift_size), dims=(1, 2))
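For comparison, the native swin-transformer code performs the cyclic shift with torch.roll, whose strided access pattern is inefficient on NPU; roll above is a drop-in replacement for the following sketch:

import torch

x = torch.randn(32, 56, 56, 16)
shift_size = 3
# Native cyclic shift over the two spatial dimensions.
shifted_x = torch.roll(x, shifts=(-shift_size, -shift_size), dims=(1, 2))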
class Mish(nn.Module):
Apply an NPU based Mish operation.
Mish exists in the official version since PyTorch 1.9.0. Currently, the PyTorch version adapted for NPU is 1.5.0, so Mish needs to be defined as an additional module.
>>> m = Mish()
>>> input_tensor = torch.randn(2, 32, 5, 5)
>>> output = m(input_tensor)
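For reference, Mish is defined as x * tanh(softplus(x)); a minimal plain-PyTorch sketch of those semantics (the NPU module dispatches to a dedicated kernel rather than composing these ops):

import torch
import torch.nn as nn
import torch.nn.functional as F

class MishReference(nn.Module):
    # Reference semantics only: Mish(x) = x * tanh(softplus(x)).
    def forward(self, x):
        return x * torch.tanh(F.softplus(x))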
class SiLU(nn.Module):
Apply an NPU based Sigmoid Linear Unit (SiLU) function, element-wise. The SiLU function is also known as the swish function.
SiLU exists in the official version since PyTorch 1.7.0. Currently, the PyTorch version adapted for NPU is 1.5.0, so SiLU needs to be defined as an additional module.
>>> m = SiLU()
>>> input_tensor = torch.randn(2, 32, 5, 5)
>>> output = m(input_tensor)
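For reference, SiLU is defined as x * sigmoid(x); a minimal plain-PyTorch sketch of those semantics (the NPU module dispatches to a dedicated kernel):

import torch
import torch.nn as nn

class SiLUReference(nn.Module):
    # Reference semantics only: SiLU(x) = x * sigmoid(x).
    def forward(self, x):
        return x * torch.sigmoid(x)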
class ChannelShuffle(nn.Module):
Apply an NPU compatible channel shuffle operation. In order to avoid the contiguous operation, which is not efficient on NPU, we replace the original operation with a rewrite of the same semantics. Two discontiguous operations are replaced: transpose and chunk.
>>> x1 = torch.randn(2, 32, 7, 7)
>>> x2 = torch.randn(2, 32, 7, 7)
>>> m = ChannelShuffle(64, split_shuffle=True)
>>> output = m(x1, x2)
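The original operation being rewritten is the classic ShuffleNet channel shuffle; a sketch of that native form (the transpose makes the tensor non-contiguous, forcing the costly contiguous copy the NPU rewrite avoids):

import torch

def channel_shuffle_native(x, groups):
    # Reshape channels into (groups, channels_per_group), swap, flatten back.
    n, c, h, w = x.shape
    x = x.view(n, groups, c // groups, h, w)
    x = x.transpose(1, 2).contiguous()
    return x.view(n, c, h, w)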
class LabelSmoothingCrossEntropy(nn.Module):
CrossEntropy with LabelSmoothing using the NPU API.
Float: the label-smoothed cross entropy loss.
>>> x = torch.randn(2, 10)
>>> y = torch.randint(0, 10, size=(2,))
>>> x = x.npu()
>>> y = y.npu()
>>> x.requires_grad = True
>>> m = LabelSmoothingCrossEntropy(10)
>>> npu_output = m(x, y)
>>> npu_output.backward()
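For reference, a plain-PyTorch sketch of the standard label-smoothing formula (the NPU module computes this with NPU operators; eps below, the smoothing factor, is an assumed parameter name):

import torch
import torch.nn.functional as F

def label_smoothing_ce(logits, target, eps=0.1):
    # Mix the one-hot target distribution with a uniform distribution.
    logp = F.log_softmax(logits, dim=-1)
    nll = -logp.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
    smooth = -logp.mean(dim=-1)
    return ((1.0 - eps) * nll + eps * smooth).mean()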
class ModulatedDeformConv(nn.Module):
Apply an NPU-based Modulated Deformable 2D convolution operation.
ModulatedDeformConv only implements operations under the FP32 data type. Note that the weight and bias in conv_offset must be initialized to 0.
>>> m = ModulatedDeformConv(32, 32, 1)
>>> input_tensor = torch.randn(2, 32, 5, 5)
>>> output = m(input_tensor)

>>> x = torch.randn(2, 32, 7, 7)
>>> model = ModulatedDeformConv(32, 32, 3, 2, 1)
>>> torch.npu.set_device(0)
>>> x = x.npu()
>>> model = model.npu()
>>> o = model(x)
>>> l = o.sum()
>>> l.backward()
>>> print(l)
class NpuDropPath(nn.Module):
Use the NPU affinity writing method to replace the native drop path in swin_transformer.py. Drop paths (Stochastic Depth) per sample, when applied in the main path of residual blocks.
>>> input1 = torch.randn(68, 5).npu()
>>> input1.requires_grad_(True)
>>> input2 = torch.randn(68, 5).npu()
>>> input2.requires_grad_(True)
>>> fast_drop_path = NpuDropPath(0).npu()
>>> output = input1 + fast_drop_path(input2)
>>> output.sum().backward()
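The stochastic-depth semantics being replaced: during training, each sample's residual branch is zeroed with probability drop_prob and survivors are rescaled by 1/keep_prob. A minimal sketch:

import torch

def drop_path_native(x, drop_prob=0.0, training=False):
    # Per-sample stochastic depth on the residual branch.
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)   # broadcast over all but batch
    mask = torch.empty(mask_shape, dtype=x.dtype, device=x.device).bernoulli_(keep_prob)
    return x / keep_prob * mask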
class NpuCachedDropout(torch.nn.Dropout):
FairseqDropout used on the NPU device.
>>> model = NpuMNIST().to("npu")
>>> x = torch.randn(2, 10, 16, 16).to("npu")
>>> NpuCachedDropout.enable_dropout_ensemble(model)
>>> output = model(x)
class Focus(nn.Module):
Use the NPU affinity writing method to replace the native Focus module in Yolov5.
>>> input = torch.randn(4, 8, 300, 40).npu()
>>> input.requires_grad_(True)
>>> fast_focus = Focus(8, 13).npu()
>>> output = fast_focus(input)
>>> output.sum().backward()
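The native Focus layer slices the input into four stride-2 spatial sub-grids and concatenates them along channels before a convolution; those strided slices are non-contiguous on NPU. A sketch of the slicing being rewritten:

import torch

def focus_slice_native(x):
    # (b, c, h, w) -> (b, 4c, h/2, w/2) via four interleaved sub-grids.
    return torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2],
                      x[..., ::2, 1::2], x[..., 1::2, 1::2]], dim=1)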
class FusedColorJitter(torch.nn.Module):
Randomly change the brightness, contrast, saturation and hue of an image.
>>> train_dataset = datasets.ImageFolder(
...     traindir,
...     transforms.Compose([
...         transforms.RandomResizedCrop(224),
...         FusedColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
...         transforms.RandomHorizontalFlip(),
...     ]))
class MultiheadAttention(nn.Module):
Multi-headed attention.
>>> model = MultiheadAttention(embed_dim=1024, num_heads=16, dropout=0.1, kdim=1024, vdim=1024,
...                            self_attention=True, encoder_decoder_attention=True)
>>> _, query = create_common_tensor([np.float16, FORMAT_NZ, (1024, 1024)], -1, 1)
>>> _, key = create_common_tensor([np.float16, FORMAT_NZ, (1024, 1024)], -1, 1)
>>> _, value = create_common_tensor([np.float16, FORMAT_NZ, (1024, 1024)], -1, 1)
>>> _, key_padding_mask = create_common_tensor([np.float16, FORMAT_NZ, (16, 16, 64, 64)], -65504, 65504)
>>> bsz = 16
>>> tgt_len = 64
>>> s_len = 64
>>> model = model.to("npu")
>>> output = model(query, key, value, bsz, tgt_len, s_len, key_padding_mask)
class DropoutWithByteMask(Module):
Apply an NPU compatible DropoutWithByteMask operation. Only NPU devices are supported.
max_seed is a hyper-parameter strongly related to the underlying operator. Please check the MAX(2 ** 31 - 1 / 2 ** 10 - 1) in dropout_v2.py in the opp package for matching settings. By default, it is matched by the PyTorch and OPP packages.
>>> m = nn.DropoutWithByteMask(p=0.5)
>>> input = torch.randn(16, 16)
>>> output = m(input)
class PSROIPool(nn.Module):
PSROIPool using the NPU API.
Tensor: pooled features of shape (num_rois, output_dim, pooled_height, pooled_width).
>>> model = PSROIPool(pooled_height=7, pooled_width=7, spatial_scale=1 / 16.0, group_size=7, output_dim=22)
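The example above only constructs the module; a hedged usage sketch follows. The (num_rois, 5) rois layout with a leading batch index, the input channel count of output_dim * group_size ** 2, and the output shape are assumptions based on common PSROIPool implementations, not confirmed by this document:

>>> feat = torch.randn(1, 22 * 7 * 7, 50, 50).npu()       # (N, output_dim * group_size**2, H, W)
>>> rois = torch.tensor([[0., 0., 0., 100., 100.]]).npu()  # (num_rois, 5): batch_idx, x1, y1, x2, y2
>>> out = model(feat, rois)                                # assumed -> (num_rois, 22, 7, 7)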
class ROIAlign(nn.Module):
ROIAlign using the NPU API.
Given a continuous coordinate c, its two neighboring pixel indices (in our pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, c = 1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled from the underlying signal at continuous coordinates 0.5 and 1.5). But the original roi_align (aligned=False) does not subtract the 0.5 when computing neighboring pixel indices, and therefore it uses pixels with a slightly incorrect alignment (relative to our pixel model) when performing bilinear interpolation. With aligned=True, we first appropriately scale the ROI and then shift it by -0.5 prior to calling roi_align. This produces the correct neighbors; see detectron2/tests/test_roi_align.py for verification. The difference does not make a difference to the model's performance if ROIAlign is used together with conv layers.
Tensor: pooled features of shape (num_rois, channels, output_size[0], output_size[1]).
>>> input1 = self.generate_input()
>>> roi = torch.tensor([[0, -2.0, -2.0, 22.0, 22.0]]).npu()
>>> output_size = (3, 3)
>>> spatial_scale = 0.25
>>> sampling_ratio = 2
>>> aligned = False
>>> npu_output, npu_inputgrad = self.npu_roi_align(input1, roi, output_size, spatial_scale, sampling_ratio, aligned)
>>> expected_cpu_output = torch.tensor([[[[ 4.5000,  6.5000,  8.5000],
...                                       [16.5000, 18.5000, 20.5000],
...                                       [28.5000, 30.5000, 32.5000]]]], dtype=torch.float32)
>>> expected_cpu_inputgrad = torch.tensor([[[[0.2397, 0.2346, 0.2346, 0.2346, 0.2346, 0.2907],
...                                          [0.2346, 0.2296, 0.2296, 0.2296, 0.2296, 0.2845],
...                                          [0.2346, 0.2296, 0.2296, 0.2296, 0.2296, 0.2845],
...                                          [0.2346, 0.2296, 0.2296, 0.2296, 0.2296, 0.2845],
...                                          [0.2346, 0.2296, 0.2296, 0.2296, 0.2296, 0.2845],
...                                          [0.2907, 0.2845, 0.2845, 0.2845, 0.2845, 0.3525]]]], dtype=torch.float32)
>>> self.assertRtolEqual(expected_cpu_output, npu_output)
>>> self.assertRtolEqual(expected_cpu_inputgrad, npu_inputgrad)