ソースを参照

Update onnx-metadata.json

Lutz Roeder 5 ヶ月前
コミット
4762f9fc63
1 ファイル変更、18 行追加、24 行削除
  1. 18 24
      source/onnx-metadata.json

+ 18 - 24
source/onnx-metadata.json

@@ -36574,23 +36574,23 @@
       {
         "name": "input",
         "type": "T",
-        "description": "2D input tensor with shape (num_tokens, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)"
+        "description": "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)"
       },
       {
         "name": "router_probs",
         "type": "T",
-        "description": "2D input tensor with shape (num_tokens, num_experts)"
+        "description": "2D input tensor with shape (num_rows, num_experts)"
       },
       {
         "name": "fc1_experts_weights",
         "type": "T",
-        "description": "3D input tensor with shape (num_experts, fusion_size * inter_size, hidden_size), where fusion_size is 2 for fused swiglu, and 1 otherwise"
+        "description": "3D input tensor with shape (num_experts, inter_size, hidden_size), or (num_experts, 2 * inter_size, hidden_size) for swiglu"
       },
       {
         "name": "fc1_experts_bias",
         "type": "T",
         "option": "optional",
-        "description": "2D optional input tensor with shape (num_experts, fusion_size * inter_size)"
+        "description": "2D optional input tensor with shape (num_experts, inter_size), or (num_experts, 2 * inter_size) for swiglu"
       },
       {
         "name": "fc2_experts_weights",
@@ -36622,7 +36622,7 @@
       {
         "name": "output",
         "type": "T",
-        "description": "2D input tensor with shape (num_tokens, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)"
+        "description": "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)"
       }
     ],
     "min_output": 1,
@@ -44013,7 +44013,7 @@
     "name": "QMoE",
     "module": "com.microsoft",
     "version": 1,
-    "description": "Quantized mixture of experts (MoE).\n\n      Only weights are quantized with symmetric quantization.\n      The quantized weights are stored in column major order per expert.\n      The quantization block size can be specified. If not provided, column wise quantization is used.\n\n      The SwiGLU (Swish-Gated Linear Unit) activation function is like:\n         g = xW + b\n         l = xV + c\n         G = clamp(g, max=limit)\n         L = clamp(l, min=-limit, max=limit)\n         swiglu = G * sigmoid(alpha * G) * (L + beta)\n      where x is the input, W and V are weight matrices, b and c are bias vectors, and alpha, beta and limit are constant float parameters.\n      When swiglu_fusion=0, two GEMMs are not fused, and they are FC1 and FC3 in the inputs.\n      When swiglu_fusion=1, two GEMMs are fused so that g and l are computed in a single GEMM (FC1), and g and l are interleaved on each row of size 2 * inter_size.\n      When swiglu_fusion=2, two GEMMs are fused, and g and l are concatenated on each row.\n      ",
+    "description": "Quantized MoE",
     "attributes": [
       {
         "name": "activation_alpha",
@@ -44035,12 +44035,6 @@
         "default": "relu",
         "description": "Activation function to use. Choose from relu, gelu, silu, swiglu and identity. Default is relu"
       },
-      {
-        "name": "block_size",
-        "type": "int64",
-        "required": false,
-        "description": "Size of each quantization block along the K (input feature) dimension. Must be power of two and \u2265 16 (e.g., 16, 32, 64, 128). If provided, both hidden_size and inter_size must be divisible by the block size. Otherwise, there is no blocking and a whole column shares one scaling factor. "
-      },
       {
         "name": "expert_weight_bits",
         "type": "int64",
@@ -44084,62 +44078,62 @@
       {
         "name": "input",
         "type": "T",
-        "description": "2D tensor with shape (num_tokens, hidden_size), or 3D tensor with shape (batch_size, sequence_length, hidden_size)"
+        "description": "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)"
       },
       {
         "name": "router_probs",
         "type": "T",
-        "description": "2D tensor with shape (num_tokens, num_experts)"
+        "description": "2D input tensor with shape (num_rows, num_experts)"
       },
       {
         "name": "fc1_experts_weights",
         "type": "T1",
-        "description": "3D tensor with shape (num_experts, fusion_size * inter_size, hidden_size / pack_size), The fusion_size is 2 for fused swiglu, or 1 otherwise. The pack_size is 8 / expert_weight_bits."
+        "description": "3D input tensor with shape (num_experts, inter_size, hidden_size), or (num_experts, inter_size, hidden_size / 2) for 4 bits. For swiglu, shape can be (num_experts, 2 * inter_size, hidden_size), or (num_experts, 2 * inter_size, hidden_size / 2) for 4 bits."
       },
       {
         "name": "fc1_scales",
         "type": "T2",
-        "description": "2D tensor with shape (num_experts, fusion_size * inter_size), or 3D tensor with shape (num_experts, fusion_size * inter_size, hidden_size / block_size) when block_size is provided."
+        "description": "2D input tensor with shape (num_experts, inter_size), or (num_experts, 2 * inter_size) for swiglu"
       },
       {
         "name": "fc1_experts_bias",
         "type": "T",
         "option": "optional",
-        "description": "2D optional tensor with shape (num_experts, fusion_size * inter_size)"
+        "description": "2D optional input tensor with shape (num_experts, inter_size), or (num_experts, 2 * inter_size) for swiglu"
       },
       {
         "name": "fc2_experts_weights",
         "type": "T1",
-        "description": "3D tensor with shape (num_experts, hidden_size, inter_size / pack_size)"
+        "description": "3D input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2) for 4 bits"
       },
       {
         "name": "fc2_scales",
         "type": "T2",
-        "description": "2D tensor with shape (num_experts, hidden_size), or 3D tensor with shape (num_experts, hidden_size, inter_size / block_size) when block_size is provided."
+        "description": "2D input tensor with shape (num_experts, hidden_size)"
       },
       {
         "name": "fc2_experts_bias",
         "type": "T",
         "option": "optional",
-        "description": "2D optional tensor with shape (num_experts, hidden_size)"
+        "description": "2D optional input tensor with shape (num_experts, hidden_size)"
       },
       {
         "name": "fc3_experts_weights",
         "type": "T1",
         "option": "optional",
-        "description": "3D optional tensor with shape (num_experts, inter_size, hidden_size / pack_size)"
+        "description": "3D optional input tensor with shape (num_experts, inter_size, hidden_size) or (num_experts, inter_size, hidden_size / 2)"
       },
       {
         "name": "fc3_scales",
         "type": "T2",
         "option": "optional",
-        "description": "2D optional tensor with shape (num_experts, inter_size), or 3D optional tensor with shape (num_experts, inter_size, hidden_size / block_size) when block_size is provided."
+        "description": "2D optional input tensor with shape (num_experts, inter_size)"
       },
       {
         "name": "fc3_experts_bias",
         "type": "T",
         "option": "optional",
-        "description": "2D optional tensor with shape (num_experts, inter_size)"
+        "description": "2D optional input tensor with shape (num_experts, inter_size)"
       }
     ],
     "min_input": 7,
@@ -44148,7 +44142,7 @@
       {
         "name": "output",
         "type": "T",
-        "description": "output tensor with same shape of input"
+        "description": "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)"
       }
     ],
     "min_output": 1,