定义
/**
 * Parameter block for a multi-latent-attention operation.
 *
 * Every field carries a default, so a default-constructed instance is fully
 * initialised. The order of the data members is part of the layout and must
 * not be changed.
 *
 * NOTE(review): the semantics of the individual fields and enumerators are
 * not visible in this definition; the descriptions below that go beyond the
 * identifier names are assumptions to be confirmed against the operator
 * documentation.
 */
struct MultiLatentAttentionParam {
    // ---- Nested option types -------------------------------------------

    /** Mask selection; UNDEFINED (0) is the default. */
    enum MaskType : int {
        UNDEFINED = 0,
        MASK_TYPE_SPEC = 1,
        MASK_TYPE_MASK_FREE = 2,
        MASK_TYPE_CAUSAL_MASK = 3
    };

    /** Calculation variant; CALC_TYPE_UNDEFINED (0) is the default. */
    enum CalcType : int {
        CALC_TYPE_UNDEFINED = 0,
        CALC_TYPE_SPEC = 1,
        CALC_TYPE_RING = 2,
        CALC_TYPE_SPEC_AND_RING = 3,
        CALC_TYPE_PREFILL = 4
    };

    /** Cache layout selector, stored in a single byte; KVCACHE (0) is the default. */
    enum CacheMode : uint8_t {
        KVCACHE = 0,
        KROPE_CTKV = 1,
        INT8_NZCACHE = 2,
        NZCACHE = 3
    };

    // ---- Data members (declaration order fixes the memory layout) -------

    /** Presumably the number of query attention heads -- TODO confirm. Defaults to 0. */
    int32_t headNum = 0;

    /** Presumably a scale factor applied to the Q*K product -- TODO confirm. Defaults to 1.0. */
    float qkScale = 1.0f;

    /** Presumably the number of key/value heads -- TODO confirm. Defaults to 0. */
    int32_t kvHeadNum = 0;

    /** Selected mask type. */
    MaskType maskType = UNDEFINED;

    /** Selected calculation type. */
    CalcType calcType = CALC_TYPE_UNDEFINED;

    /** Selected cache mode. */
    CacheMode cacheMode = KVCACHE;

    /**
     * Zero-filled reserved bytes. With a 4-byte int and no compiler-inserted
     * padding the members sum to 4+4+4+4+4+1+43 = 64 bytes; presumably this
     * array exists to pad the struct to that size -- TODO confirm.
     */
    uint8_t rsv[43] = {};
};