|
|
@@ -6573,6 +6573,144 @@
|
|
|
}
|
|
|
]
|
|
|
},
|
|
|
+ {
|
|
|
+ "name": "BitCast",
|
|
|
+ "module": "ai.onnx",
|
|
|
+ "version": 26,
|
|
|
+ "description": "Reinterprets the binary representation of a tensor as a different data type,\nspecified by the 'to' attribute. Unlike Cast, BitCast preserves the exact bit\npattern without any value conversion.\n\nThe target data type must have the same bit-width as the input data type.\nThe output tensor has the same shape as the input tensor.\nAll types except string are supported. Implementations must treat the\nunderlying bytes as little endian.\n",
|
|
|
+ "attributes": [
|
|
|
+ {
|
|
|
+ "name": "to",
|
|
|
+ "type": "int64",
|
|
|
+ "required": true,
|
|
|
+ "description": "The data type to which the input tensor is bitwise reinterpreted. Must be one of the non-string types from DataType enum in TensorProto. The target type must have the same bit-width as the input type."
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "inputs": [
|
|
|
+ {
|
|
|
+ "name": "input",
|
|
|
+ "type": "T1",
|
|
|
+ "description": "Input tensor to be bitcast."
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "min_input": 1,
|
|
|
+ "max_input": 1,
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "output",
|
|
|
+ "type": "T2",
|
|
|
+ "description": "Output tensor with the same shape as the input."
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "min_output": 1,
|
|
|
+ "max_output": 1,
|
|
|
+ "type_constraints": [
|
|
|
+ {
|
|
|
+ "description": "Constrain input types. Bitcasting from string is not supported.",
|
|
|
+ "type_param_str": "T1",
|
|
|
+ "allowed_type_strs": [
|
|
|
+ "tensor(uint8)",
|
|
|
+ "tensor(uint16)",
|
|
|
+ "tensor(uint32)",
|
|
|
+ "tensor(uint64)",
|
|
|
+ "tensor(int8)",
|
|
|
+ "tensor(int16)",
|
|
|
+ "tensor(int32)",
|
|
|
+ "tensor(int64)",
|
|
|
+ "tensor(bfloat16)",
|
|
|
+ "tensor(float16)",
|
|
|
+ "tensor(float)",
|
|
|
+ "tensor(double)",
|
|
|
+ "tensor(bool)",
|
|
|
+ "tensor(complex64)",
|
|
|
+ "tensor(complex128)",
|
|
|
+ "tensor(float8e4m3fn)",
|
|
|
+ "tensor(float8e4m3fnuz)",
|
|
|
+ "tensor(float8e5m2)",
|
|
|
+ "tensor(float8e5m2fnuz)",
|
|
|
+ "tensor(uint4)",
|
|
|
+ "tensor(int4)",
|
|
|
+ "tensor(float4e2m1)",
|
|
|
+ "tensor(float8e8m0)",
|
|
|
+ "tensor(uint2)",
|
|
|
+ "tensor(int2)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "description": "Constrain output types. Bitcasting to string is not supported.",
|
|
|
+ "type_param_str": "T2",
|
|
|
+ "allowed_type_strs": [
|
|
|
+ "tensor(uint8)",
|
|
|
+ "tensor(uint16)",
|
|
|
+ "tensor(uint32)",
|
|
|
+ "tensor(uint64)",
|
|
|
+ "tensor(int8)",
|
|
|
+ "tensor(int16)",
|
|
|
+ "tensor(int32)",
|
|
|
+ "tensor(int64)",
|
|
|
+ "tensor(bfloat16)",
|
|
|
+ "tensor(float16)",
|
|
|
+ "tensor(float)",
|
|
|
+ "tensor(double)",
|
|
|
+ "tensor(bool)",
|
|
|
+ "tensor(complex64)",
|
|
|
+ "tensor(complex128)",
|
|
|
+ "tensor(float8e4m3fn)",
|
|
|
+ "tensor(float8e4m3fnuz)",
|
|
|
+ "tensor(float8e5m2)",
|
|
|
+ "tensor(float8e5m2fnuz)",
|
|
|
+ "tensor(uint4)",
|
|
|
+ "tensor(int4)",
|
|
|
+ "tensor(float4e2m1)",
|
|
|
+ "tensor(float8e8m0)",
|
|
|
+ "tensor(uint2)",
|
|
|
+ "tensor(int2)"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "examples": [
|
|
|
+ {
|
|
|
+ "summary": "bitcast_2d_float32_to_int32",
|
|
|
+ "code": "\"\"\"Test bitcasting 2D array from float32 to int32.\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.INT32,\n)\nx = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)\ny = x.view(np.int32)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_2d_float32_to_int32\")"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "summary": "bitcast_bool_to_uint8",
|
|
|
+ "code": "\"\"\"Test bitcasting from bool to uint8 (same size).\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.UINT8,\n)\nx = np.array([True, False, True, False], dtype=np.bool_)\ny = x.view(np.uint8)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_bool_to_uint8\")"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "summary": "bitcast_float32_to_int32",
|
|
|
+ "code": "\"\"\"Test bitcasting from float32 to int32 (same size).\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.INT32,\n)\nx = np.array([1.0, -2.5, 3.75], dtype=np.float32)\ny = x.view(np.int32)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_float32_to_int32\")"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "summary": "bitcast_float64_to_int64",
|
|
|
+ "code": "\"\"\"Test bitcasting from float64 to int64 (same size).\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.INT64,\n)\nx = np.array([1.0, -2.5, 3.75], dtype=np.float64)\ny = x.view(np.int64)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_float64_to_int64\")"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "summary": "bitcast_int32_to_float32",
|
|
|
+ "code": "\"\"\"Test bitcasting from int32 to float32 (same size).\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.FLOAT,\n)\nx = np.array([1065353216, -1071644672, 1081081856], dtype=np.int32)\ny = x.view(np.float32)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_int32_to_float32\")"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "summary": "bitcast_int64_to_float64",
|
|
|
+ "code": "\"\"\"Test bitcasting from int64 to float64 (same size).\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.DOUBLE,\n)\nx = np.array(\n [4607182418800017408, -4611686018427387904, 4614256656552045184],\n dtype=np.int64,\n)\ny = x.view(np.float64)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_int64_to_float64\")"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "summary": "bitcast_int8_to_uint8",
|
|
|
+ "code": "\"\"\"Test bitcasting from int8 to uint8 (same size, different signedness).\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.UINT8,\n)\nx = np.array([-1, -128, 127, 0], dtype=np.int8)\ny = x.view(np.uint8)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_int8_to_uint8\")"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "summary": "bitcast_scalar_float32_to_int32",
|
|
|
+ "code": "\"\"\"Test bitcasting scalar from float32 to int32.\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.INT32,\n)\nx = np.array(1.0, dtype=np.float32)\ny = x.view(np.int32)\nexpect(\n node, inputs=[x], outputs=[y], name=\"test_bitcast_scalar_float32_to_int32\"\n)"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "summary": "bitcast_uint16_to_int16",
|
|
|
+ "code": "\"\"\"Test bitcasting from uint16 to int16 (same size, different signedness).\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.INT16,\n)\nx = np.array([1, 32768, 65535], dtype=np.uint16)\ny = x.view(np.int16)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_uint16_to_int16\")"
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "summary": "bitcast_uint32_to_int32",
|
|
|
+ "code": "\"\"\"Test bitcasting from uint32 to int32 (same size, different signedness).\"\"\"\nnode = onnx.helper.make_node(\n \"BitCast\",\n inputs=[\"x\"],\n outputs=[\"y\"],\n to=onnx.TensorProto.INT32,\n)\nx = np.array([4294967295, 2147483648, 2147483647], dtype=np.uint32)\ny = x.view(np.int32)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_uint32_to_int32\")"
|
|
|
+ }
|
|
|
+ ]
|
|
|
+ },
|
|
|
{
|
|
|
"name": "BitShift",
|
|
|
"module": "ai.onnx",
|
|
|
@@ -25302,7 +25440,7 @@
|
|
|
"name": "GroupQueryAttention",
|
|
|
"module": "com.microsoft",
|
|
|
"version": 1,
|
|
|
- "description": "Group Query Self/Cross Attention.\n\n*Highly recommend using k-v cache share buffer for both CPU and CUDA. Enabled through IOBinding past and present kv.\nSupports different number of heads for q and kv for CPU and CUDA.\nOnly supports causal and local attention.\nSupports rotary position embedding for CPU and CUDA.\nSupports packed input for CPU and CUDA.\nSupports continuous decoding for batch_size == 1 for CPU and CUDA.\n\n",
|
|
|
+ "description": "Group Query Self/Cross Attention with KV Cache Quantization Support.\n\nThis operator implements causal grouped-query attention with past state (KV cache) support.\nIt also supports optional float8, int8 or int4 quantization for the KV cache to reduce memory footprint.\n\n**Cache Format:**\nThe past and present KV cache tensors are expected in a BNSH format: `(batch_size, num_heads, cache_sequence_length, head_size)`, where `cache_sequence_length` is the length of the cached key/value sequences, or the maximum sequence length when past and present buffer sharing is used.\n\n**Quantization:**\nWhen quantization is enabled, `past_key` and `past_value` inputs can be of type `float8e4m3fn`, `uint8` or `int8`. The corresponding `k_scale` and `v_scale` tensors must be provided.\nThe operator will output `present_key` and `present_value` in the same format as the `past_key` and `past_value`.\n\nFor 4-bit quantization, the data type is uint8 where each byte contains two 4-bit values. The bit width of quantized KV cache can be set using the `kv_cache_bit_width` attribute.\n\nThe shapes of the k_scale, v_scale tensors shall be broadcastable to present_key shape.\n\n**Quantization Modes (`k_quant_type`, `v_quant_type` attributes):**\n- **\"NONE\"**: No quantization.\n- **\"PER_TENSOR\"**: A single scale for the entire tensor. Scale example shape: `[1]`.\n- **\"PER_CHANNEL\"**: A scale for each channel. Scale example shape: `[1, num_heads_k, 1, head_size]`.\n",
|
|
|
"attributes": [
|
|
|
{
|
|
|
"name": "do_rotary",
|
|
|
@@ -25310,6 +25448,19 @@
|
|
|
"required": false,
|
|
|
"description": "Whether to use rotary position embedding. Default value is 0."
|
|
|
},
|
|
|
+ {
|
|
|
+ "name": "k_quant_type",
|
|
|
+ "type": "string",
|
|
|
+ "required": false,
|
|
|
+ "default": "NONE",
|
|
|
+ "description": "Quantization type for K cache. One of 'NONE', 'PER_TENSOR', 'PER_CHANNEL'."
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "kv_cache_bit_width",
|
|
|
+ "type": "int64",
|
|
|
+ "required": false,
|
|
|
+ "description": "Bit width of quantized KV cache. Supported values are 8 and 4."
|
|
|
+ },
|
|
|
{
|
|
|
"name": "kv_num_heads",
|
|
|
"type": "int64",
|
|
|
@@ -25359,6 +25510,13 @@
|
|
|
"type": "float32",
|
|
|
"required": false,
|
|
|
"description": "Softcap value for attention weights. Default value is 0."
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "v_quant_type",
|
|
|
+ "type": "string",
|
|
|
+ "required": false,
|
|
|
+ "default": "NONE",
|
|
|
+ "description": "Quantization type for V cache. One of 'NONE', 'PER_TENSOR', 'PER_CHANNEL'."
|
|
|
}
|
|
|
],
|
|
|
"inputs": [
|
|
|
@@ -25381,13 +25539,13 @@
|
|
|
},
|
|
|
{
|
|
|
"name": "past_key",
|
|
|
- "type": "T",
|
|
|
+ "type": "T_CACHE",
|
|
|
"option": "optional",
|
|
|
"description": "past state key with support for format BNSH. When past_key uses same tensor as present_key(k-v cache), it is of length max_sequence_length... otherwise of length past_sequence_length."
|
|
|
},
|
|
|
{
|
|
|
"name": "past_value",
|
|
|
- "type": "T",
|
|
|
+ "type": "T_CACHE",
|
|
|
"option": "optional",
|
|
|
"description": "past state value with support for format BNSH. When past_value uses same tensor as present_value(k-v cache), it is of length max_sequence_length... otherwise of length past_sequence_length."
|
|
|
},
|
|
|
@@ -25430,10 +25588,22 @@
|
|
|
"type": "T",
|
|
|
"option": "optional",
|
|
|
"description": "1D tensor with shape (num_heads). Each head has a smooth factor adding to the denominator of softmax."
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "k_scale",
|
|
|
+ "type": "T_KV_SCALE",
|
|
|
+ "option": "optional",
|
|
|
+ "description": "Scale tensor for past_key."
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "name": "v_scale",
|
|
|
+ "type": "T_KV_SCALE",
|
|
|
+ "option": "optional",
|
|
|
+ "description": "Scale tensor for past_value."
|
|
|
}
|
|
|
],
|
|
|
"min_input": 7,
|
|
|
- "max_input": 12,
|
|
|
+ "max_input": 14,
|
|
|
"outputs": [
|
|
|
{
|
|
|
"name": "output",
|
|
|
@@ -25442,12 +25612,12 @@
|
|
|
},
|
|
|
{
|
|
|
"name": "present_key",
|
|
|
- "type": "T",
|
|
|
+ "type": "T_CACHE",
|
|
|
"description": "present state key with support for format BNSH. When past_key uses same tensor as present_key(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +kv_sequence_length."
|
|
|
},
|
|
|
{
|
|
|
"name": "present_value",
|
|
|
- "type": "T",
|
|
|
+ "type": "T_CACHE",
|
|
|
"description": "present state value with support for format BNSH. When past_value uses same tensor as present_value(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +kv_sequence_length."
|
|
|
},
|
|
|
{
|
|
|
@@ -25459,7 +25629,7 @@
|
|
|
],
|
|
|
"min_output": 3,
|
|
|
"max_output": 4,
|
|
|
- "inputs_range": "7 - 12",
|
|
|
+ "inputs_range": "7 - 14",
|
|
|
"outputs_range": "3 - 4",
|
|
|
"type_constraints": [
|
|
|
{
|
|
|
@@ -25471,6 +25641,25 @@
|
|
|
"tensor(float)"
|
|
|
]
|
|
|
},
|
|
|
+ {
|
|
|
+ "description": "Constrain KV cache types.",
|
|
|
+ "type_param_str": "T_CACHE",
|
|
|
+ "allowed_type_strs": [
|
|
|
+ "tensor(float)",
|
|
|
+ "tensor(float16)",
|
|
|
+ "tensor(bfloat16)",
|
|
|
+ "tensor(uint8)",
|
|
|
+ "tensor(int8)",
|
|
|
+ "tensor(float8e4m3fn)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "description": "Constrain KV cache scale types.",
|
|
|
+ "type_param_str": "T_KV_SCALE",
|
|
|
+ "allowed_type_strs": [
|
|
|
+ "tensor(float)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
{
|
|
|
"description": "Constrain mask to int tensor.",
|
|
|
"type_param_str": "M",
|