Prechádzať zdrojové kódy

Update onnx-metadata.json

Lutz Roeder 2 týždňov pred
rodič
commit
0d504e7ebf
1 zmenil súbory, kde vykonal 196 pridanie a 7 odobranie
  1. 196 7
      source/onnx-metadata.json

+ 196 - 7
source/onnx-metadata.json

@@ -6573,6 +6573,144 @@
       }
     ]
   },
+  {
+    "name": "BitCast",
+    "module": "ai.onnx",
+    "version": 26,
+    "description": "Reinterprets the binary representation of a tensor as a different data type,\nspecified by the 'to' attribute. Unlike Cast, BitCast preserves the exact bit\npattern without any value conversion.\n\nThe target data type must have the same bit-width as the input data type.\nThe output tensor has the same shape as the input tensor.\nAll types except string are supported. Implementations must treat the\nunderlying bytes as little endian.\n",
+    "attributes": [
+      {
+        "name": "to",
+        "type": "int64",
+        "required": true,
+        "description": "The data type to which the input tensor is bitwise reinterpreted. Must be one of the non-string types from DataType enum in TensorProto. The target type must have the same bit-width as the input type."
+      }
+    ],
+    "inputs": [
+      {
+        "name": "input",
+        "type": "T1",
+        "description": "Input tensor to be bitcast."
+      }
+    ],
+    "min_input": 1,
+    "max_input": 1,
+    "outputs": [
+      {
+        "name": "output",
+        "type": "T2",
+        "description": "Output tensor with the same shape as the input."
+      }
+    ],
+    "min_output": 1,
+    "max_output": 1,
+    "type_constraints": [
+      {
+        "description": "Constrain input types. Bitcasting from string is not supported.",
+        "type_param_str": "T1",
+        "allowed_type_strs": [
+          "tensor(uint8)",
+          "tensor(uint16)",
+          "tensor(uint32)",
+          "tensor(uint64)",
+          "tensor(int8)",
+          "tensor(int16)",
+          "tensor(int32)",
+          "tensor(int64)",
+          "tensor(bfloat16)",
+          "tensor(float16)",
+          "tensor(float)",
+          "tensor(double)",
+          "tensor(bool)",
+          "tensor(complex64)",
+          "tensor(complex128)",
+          "tensor(float8e4m3fn)",
+          "tensor(float8e4m3fnuz)",
+          "tensor(float8e5m2)",
+          "tensor(float8e5m2fnuz)",
+          "tensor(uint4)",
+          "tensor(int4)",
+          "tensor(float4e2m1)",
+          "tensor(float8e8m0)",
+          "tensor(uint2)",
+          "tensor(int2)"
+        ]
+      },
+      {
+        "description": "Constrain output types. Bitcasting to string is not supported.",
+        "type_param_str": "T2",
+        "allowed_type_strs": [
+          "tensor(uint8)",
+          "tensor(uint16)",
+          "tensor(uint32)",
+          "tensor(uint64)",
+          "tensor(int8)",
+          "tensor(int16)",
+          "tensor(int32)",
+          "tensor(int64)",
+          "tensor(bfloat16)",
+          "tensor(float16)",
+          "tensor(float)",
+          "tensor(double)",
+          "tensor(bool)",
+          "tensor(complex64)",
+          "tensor(complex128)",
+          "tensor(float8e4m3fn)",
+          "tensor(float8e4m3fnuz)",
+          "tensor(float8e5m2)",
+          "tensor(float8e5m2fnuz)",
+          "tensor(uint4)",
+          "tensor(int4)",
+          "tensor(float4e2m1)",
+          "tensor(float8e8m0)",
+          "tensor(uint2)",
+          "tensor(int2)"
+        ]
+      }
+    ],
+    "examples": [
+      {
+        "summary": "bitcast_2d_float32_to_int32",
+        "code": "\"\"\"Test bitcasting 2D array from float32 to int32.\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.INT32,\n)\nx = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)\ny = x.view(np.int32)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_2d_float32_to_int32\")"
+      },
+      {
+        "summary": "bitcast_bool_to_uint8",
+        "code": "\"\"\"Test bitcasting from bool to uint8 (same size).\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.UINT8,\n)\nx = np.array([True, False, True, False], dtype=np.bool_)\ny = x.view(np.uint8)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_bool_to_uint8\")"
+      },
+      {
+        "summary": "bitcast_float32_to_int32",
+        "code": "\"\"\"Test bitcasting from float32 to int32 (same size).\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.INT32,\n)\nx = np.array([1.0, -2.5, 3.75], dtype=np.float32)\ny = x.view(np.int32)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_float32_to_int32\")"
+      },
+      {
+        "summary": "bitcast_float64_to_int64",
+        "code": "\"\"\"Test bitcasting from float64 to int64 (same size).\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.INT64,\n)\nx = np.array([1.0, -2.5, 3.75], dtype=np.float64)\ny = x.view(np.int64)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_float64_to_int64\")"
+      },
+      {
+        "summary": "bitcast_int32_to_float32",
+        "code": "\"\"\"Test bitcasting from int32 to float32 (same size).\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.FLOAT,\n)\nx = np.array([1065353216, -1071644672, 1081081856], dtype=np.int32)\ny = x.view(np.float32)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_int32_to_float32\")"
+      },
+      {
+        "summary": "bitcast_int64_to_float64",
+        "code": "\"\"\"Test bitcasting from int64 to float64 (same size).\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.DOUBLE,\n)\nx = np.array(\n    [4607182418800017408, -4611686018427387904, 4614256656552045184],\n    dtype=np.int64,\n)\ny = x.view(np.float64)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_int64_to_float64\")"
+      },
+      {
+        "summary": "bitcast_int8_to_uint8",
+        "code": "\"\"\"Test bitcasting from int8 to uint8 (same size, different signedness).\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.UINT8,\n)\nx = np.array([-1, -128, 127, 0], dtype=np.int8)\ny = x.view(np.uint8)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_int8_to_uint8\")"
+      },
+      {
+        "summary": "bitcast_scalar_float32_to_int32",
+        "code": "\"\"\"Test bitcasting scalar from float32 to int32.\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.INT32,\n)\nx = np.array(1.0, dtype=np.float32)\ny = x.view(np.int32)\nexpect(\n    node, inputs=[x], outputs=[y], name=\"test_bitcast_scalar_float32_to_int32\"\n)"
+      },
+      {
+        "summary": "bitcast_uint16_to_int16",
+        "code": "\"\"\"Test bitcasting from uint16 to int16 (same size, different signedness).\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.INT16,\n)\nx = np.array([1, 32768, 65535], dtype=np.uint16)\ny = x.view(np.int16)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_uint16_to_int16\")"
+      },
+      {
+        "summary": "bitcast_uint32_to_int32",
+        "code": "\"\"\"Test bitcasting from uint32 to int32 (same size, different signedness).\"\"\"\nnode = onnx.helper.make_node(\n    \"BitCast\",\n    inputs=[\"x\"],\n    outputs=[\"y\"],\n    to=onnx.TensorProto.INT32,\n)\nx = np.array([4294967295, 2147483648, 2147483647], dtype=np.uint32)\ny = x.view(np.int32)\nexpect(node, inputs=[x], outputs=[y], name=\"test_bitcast_uint32_to_int32\")"
+      }
+    ]
+  },
   {
     "name": "BitShift",
     "module": "ai.onnx",
@@ -25302,7 +25440,7 @@
     "name": "GroupQueryAttention",
     "module": "com.microsoft",
     "version": 1,
-    "description": "Group Query Self/Cross Attention.\n\n*Highly recommend using k-v cache share buffer for both CPU and CUDA. Enabled through IOBinding past and present kv.\nSupports different number of heads for q and kv for CPU and CUDA.\nOnly supports causal and local attention.\nSupports rotary position embedding for CPU and CUDA.\nSupports packed input for CPU and CUDA.\nSupports continuous decoding for batch_size == 1 for CPU and CUDA.\n\n",
+    "description": "Group Query Self/Cross Attention with KV Cache Quantization Support.\n\nThis operator implements causal grouped-query attention with past state (KV cache) support.\nIt also supports optional float8, int8 or int4 quantization for the KV cache to reduce memory footprint.\n\n**Cache Format:**\nThe past and present KV cache tensors are expected in a BNSH format: `(batch_size, num_heads, cache_sequence_length, head_size)`, where `cache_sequence_length` is the length of the cached key/value sequences, or the maximum sequence length when past and present buffer sharing is used.\n\n**Quantization:**\nWhen quantization is enabled, `past_key` and `past_value` inputs can be of type `float8e4m3fn`, `uint8` or `int8`. The corresponding `k_scale` and `v_scale` tensors must be provided.\nThe operator will output `present_key` and `present_value` in the same format as the `past_key` and `past_value`.\n\nFor 4-bit quantization, the data type is uint8 where each byte contains two 4-bit values. The bit width of quantized KV cache can be set using the `kv_cache_bit_width` attribute.\n\nThe shapes of the k_scale, v_scale tensors shall be broadcastable to present_key shape.\n\n**Quantization Modes (`k_quant_type`, `v_quant_type` attributes):**\n- **\"NONE\"**: No quantization.\n- **\"PER_TENSOR\"**: A single scale for the entire tensor. Scale example shape: `[1]`.\n- **\"PER_CHANNEL\"**: A scale for each channel. Scale example shape: `[1, num_heads_k, 1, head_size]`.\n",
     "attributes": [
       {
         "name": "do_rotary",
@@ -25310,6 +25448,19 @@
         "required": false,
         "description": "Whether to use rotary position embedding. Default value is 0."
       },
+      {
+        "name": "k_quant_type",
+        "type": "string",
+        "required": false,
+        "default": "NONE",
+        "description": "Quantization type for K cache. One of 'NONE', 'PER_TENSOR', 'PER_CHANNEL'."
+      },
+      {
+        "name": "kv_cache_bit_width",
+        "type": "int64",
+        "required": false,
+        "description": "Bit width of quantized KV cache. Supported values are 8 and 4."
+      },
       {
         "name": "kv_num_heads",
         "type": "int64",
@@ -25359,6 +25510,13 @@
         "type": "float32",
         "required": false,
         "description": "Softcap value for attention weights. Default value is 0."
+      },
+      {
+        "name": "v_quant_type",
+        "type": "string",
+        "required": false,
+        "default": "NONE",
+        "description": "Quantization type for V cache. One of 'NONE', 'PER_TENSOR', 'PER_CHANNEL'."
       }
     ],
     "inputs": [
@@ -25381,13 +25539,13 @@
       },
       {
         "name": "past_key",
-        "type": "T",
+        "type": "T_CACHE",
         "option": "optional",
         "description": "past state key with support for format BNSH. When past_key uses same tensor as present_key(k-v cache), it is of length max_sequence_length... otherwise of length past_sequence_length."
       },
       {
         "name": "past_value",
-        "type": "T",
+        "type": "T_CACHE",
         "option": "optional",
         "description": "past state value with support for format BNSH. When past_value uses same tensor as present_value(k-v cache), it is of length max_sequence_length... otherwise of length past_sequence_length."
       },
@@ -25430,10 +25588,22 @@
         "type": "T",
         "option": "optional",
         "description": "1D tensor with shape (num_heads). Each head has a smooth factor adding to the denominator of softmax."
+      },
+      {
+        "name": "k_scale",
+        "type": "T_KV_SCALE",
+        "option": "optional",
+        "description": "Scale tensor for past_key."
+      },
+      {
+        "name": "v_scale",
+        "type": "T_KV_SCALE",
+        "option": "optional",
+        "description": "Scale tensor for past_value."
       }
     ],
     "min_input": 7,
-    "max_input": 12,
+    "max_input": 14,
     "outputs": [
       {
         "name": "output",
@@ -25442,12 +25612,12 @@
       },
       {
         "name": "present_key",
-        "type": "T",
+        "type": "T_CACHE",
         "description": "present state key with support for format BNSH. When past_key uses same tensor as present_key(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +kv_sequence_length."
       },
       {
         "name": "present_value",
-        "type": "T",
+        "type": "T_CACHE",
         "description": "present state value with support for format BNSH. When past_value uses same tensor as present_value(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +kv_sequence_length."
       },
       {
@@ -25459,7 +25629,7 @@
     ],
     "min_output": 3,
     "max_output": 4,
-    "inputs_range": "7 - 12",
+    "inputs_range": "7 - 14",
     "outputs_range": "3 - 4",
     "type_constraints": [
       {
@@ -25471,6 +25641,25 @@
           "tensor(float)"
         ]
       },
+      {
+        "description": "Constrain KV cache types.",
+        "type_param_str": "T_CACHE",
+        "allowed_type_strs": [
+          "tensor(float)",
+          "tensor(float16)",
+          "tensor(bfloat16)",
+          "tensor(uint8)",
+          "tensor(int8)",
+          "tensor(float8e4m3fn)"
+        ]
+      },
+      {
+        "description": "Constrain KV cache scale types.",
+        "type_param_str": "T_KV_SCALE",
+        "allowed_type_strs": [
+          "tensor(float)"
+        ]
+      },
       {
         "description": "Constrain mask to int tensor.",
         "type_param_str": "M",