  1. [
  2. {
  3. "name": "Activation",
  4. "category": "Activation",
  5. "description": "*Activation* layer represents an activation function of each neuron in a layer, which is used to add non-linearity to the computational flow.\n**Detailed description**: [Reference](https://medium.com/the-theory-of-everything/understanding-activation-functions-in-neural-networks-9491262884e0)\n**Parameters**: *Activation layer* parameters should be specified in the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n* Sigmoid function:\n \\f[\n f( x ) = \\frac{1}{1+e^{-x}}\n \\f]\n* Tanh function:\n \\f[\n f ( x ) = \\frac{2}{1+e^{-2x}} - 1 = 2sigmoid(2x) - 1\n \\f]\n*\tElu function:\n\t\\f[\n f(x) = \\left\\{\\begin{array}{ll}\n\t\te^{x} - 1 \\quad \\mbox{if } x < 0 \\\\\n\t\tx \\quad \\mbox{if } x \\geq 0\n\t\\end{array}\\right.\n\t\\f]\n*\tRelu6 function:\n\t\\f[\n f(x) = min(max(0, x), 6)\n\t\\f]\n**Example**\n\n```html\n<layer ... type=\"Activation\" ... >\n <data type=\"sigmoid\" />\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  6. "attributes": [
  7. {
  8. "description": "*type* represents particular activation function. For example, *type* equal *sigmoid* means that neurons of this layer have a sigmoid activation function.",
  9. "name": "type",
  10. "required": true
  11. },
  12. {
  13. "default": 1,
  14. "name": "alpha",
  15. "type": "float32"
  16. }
  17. ],
  18. "status": "default"
  19. },
  20. {
  21. "name": "Add",
  22. "inputs": [
  23. { "name": "A" },
  24. { "name": "B" }
  25. ],
  26. "outputs": [
  27. { "name": "C" }
  28. ]
  29. },
  30. {
  31. "name": "ArgMax",
  32. "description": "*ArgMax* layer compute the index of the *K* maximum values for each datum across all dimensions *CxHxW*.\n**Detailed description**: Intended for use after a classification layer to produce a prediction. If parameter *out_max_val* is set to \"true\", output is a vector of pairs *(max_ind, max_val)* for each image. The *axis* parameter specifies an axis along which to maximize.\n**Parameters**: *ArgMax* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*ArgMax* generally does the following with the input blobs:\n\\f[\no_{i} = \\left\\{\nx| x \\in S \\wedge \\forall y \\in S : f(y) \\leq f(x)\n\\right\\}\n\\f]\n**Example**\n\n```html\n<layer ... type=\"ArgMax\" ... >\n <data top_k=\"10\" out_max_val=\"1\" axis=\"-1\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  33. "attributes": [
  34. {
  35. "default": 1,
  36. "description": " if *out_max_val* equals 1, output is a vector of pairs *(max_ind, max_val)*, unless axis is set. Then output is *max_val* along the specified axis.",
  37. "name": "out_max_val",
  38. "required": true,
  39. "type": "int32"
  40. },
  41. {
  42. "default": 1,
  43. "description": " max number of output values (the *K* in top-*K*) to produce per datum.",
  44. "name": "top_k",
  45. "required": true,
  46. "type": "int32"
  47. },
  48. {
  49. "default": 1,
  50. "description": " if set, maximizes along the specified axis, else maximizes the flattened trailing dimensions for each index of the first / num dimension.",
  51. "name": "axis",
  52. "required": true,
  53. "type": "int32"
  54. }
  55. ],
  56. "status": "default"
  57. },
  58. {
  59. "name": "BatchNormalization",
  60. "category": "Normalization",
  61. "description": "[Reference](http://caffe.berkeleyvision.org/tutorial/layers/batchnorm.html)\n**Detailed description**: [Reference](https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html)\n**Parameters**: *BatchNormalization* layer parameters should be specified as the `batch_norm_data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*BatchNormalization* is the normalization of the output in each hidden layer.\n* **Input**: Values of \\f$x\\f$ over a mini-batch:\n \\f[\n \\beta = \\{ x_{1...m} \\}\n \\f]\n* **Parameters to learn**: \\f$ \\gamma, \\beta\\f$\n* **Output**:\n \\f[\n \\{ o_{i} = BN_{\\gamma, \\beta} ( b_{i} ) \\}\n \\f]\n* **Mini-batch mean**:\n \\f[\n \\mu_{\\beta} \\leftarrow \\frac{1}{m}\\sum_{i=1}^{m}b_{i}\n \\f]\n* **Mini-batch variance**:\n \\f[\n \\sigma_{\\beta }^{2}\\leftarrow \\frac{1}{m}\\sum_{i=1}^{m} ( b_{i} - \\mu_{\\beta} )^{2}\n \\f]\n* **Normalize**:\n \\f[\n \\hat{b_{i}} \\leftarrow \\frac{b_{i} - \\mu_{\\beta}}{\\sqrt{\\sigma_{\\beta }^{2} + \\epsilon }}\n \\f]\n* **Scale and shift**:\n \\f[\n o_{i} \\leftarrow \\gamma\\hat{b_{i}} + \\beta = BN_{\\gamma ,\\beta } ( b_{i} )\n \\f]\n**Example**\n\n```html\n<layer ... type=\"BatchNormalization\" ... >\n <batch_norm_data epsilon=\"9.99e-06\" />\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  62. "attributes": [
  63. {
  64. "default": 1,
  65. "description": "*epsilon* is the number to be added to the variance to avoid division by zero when normalizing the value. For example, *epsilon* equal 0.001 means that 0.001 is added to the variance.",
  66. "name": "epsilon",
  67. "required": true,
  68. "type": "float32"
  69. }
  70. ],
  71. "status": "default"
  72. },
  73. {
  74. "name": "BinaryConvolution",
  75. "category": "Layer",
  76. "inputs": [
  77. { "name": "inputs" },
  78. { "name": "weights" },
  79. { "name": "bias" }
  80. ],
  81. "outputs": [
  82. { "name": "output" }
  83. ]
  84. },
  85. {
  86. "name": "Broadcast",
  87. "inputs": [
  88. { "name": "data" },
  89. { "name": "target_shape" },
  90. { "name": "axes_mapping" }
  91. ]
  92. },
  93. {
  94. "name": "Clamp",
  95. "description": "*Clamp* layer represents clipping activation operation.\n**Detailed description**: [Reference](https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/clip_by_value)\n**Parameters**: *Clamp* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*Clamp* generally does the following with the input blobs:\n\\f[\nout_i=\\left\\{\\begin{array}{ll}\n\tmax\\_value \\quad \\mbox{if } \\quad input_i>max\\_value \\\\\n\tmin\\_value \\quad \\mbox{if } \\quad input_i<min\\_value \\\\\n\tinput_i \\quad \\mbox{otherwise}\n\\end{array}\\right.\n\\f]\n**Example**\n\n```html\n<layer ... type=\"Clamp\" ... >\n <data min=\"10\" max=\"50\" />\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  96. "attributes": [
  97. {
  98. "default": 0,
  99. "description": "*min* is the lower bound of values in the output shape. Any value in the input shape that is smaller than the bound, is replaced by the *min* value. For example, *min* equal 10 means that any value in the input shape that is smaller than the bound, is replaced by 10.",
  100. "name": "min",
  101. "required": true,
  102. "type": "int32"
  103. },
  104. {
  105. "default": 1,
  106. "description": "*max* is the upper bound of values in the output shape. Any value in the input shape that is greater than the bound, is replaced by the *max* value. For example, *max* equals 50 means that any value in the input shape that is greater than the bound, is replaced by 50.",
  107. "name": "max",
  108. "required": true,
  109. "type": "int32"
  110. }
  111. ],
  112. "status": "default"
  113. },
  114. {
  115. "name": "Concat",
  116. "category": "Tensor",
  117. "description": "[Reference](http://caffe.berkeleyvision.org/tutorial/layers/concat.html)\n**Parameters**: *Concat* layer parameters should be specified in the `concat_data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*Axis* parameter specifies a blob dimension to concat values. For example, for two input blobs *B1xC1xH1xW1* and *B2xC2xH2xW2* if axis: 1, output blob is: *B1xC1+C2xH1xW1*. This is only possible if *B1=B2*, *H1=H2*, *W1=W2*.\n**Example**\n\n```html\n<layer ... type=\"Concat\" ... >\n <concat_data axis=\"1\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  118. "attributes": [
  119. {
  120. "description": "*axis* is the number of axis over which input blobs are concatenated. For example, *axis* equal 1 means that input blobs are concatenated over the first axis.",
  121. "name": "axis",
  122. "required": true,
  123. "type": "int32"
  124. }
  125. ],
  126. "inputs": [
  127. {
  128. "name": "inputs",
  129. "type": "Tensor[]"
  130. }
  131. ],
  132. "status": "default"
  133. },
  134. {
  135. "name": "Concatenation",
  136. "category": "Tensor"
  137. },
  138. {
  139. "name": "Convolution",
  140. "category": "Layer",
  141. "description": "[Reference](http://caffe.berkeleyvision.org/tutorial/layers/convolution.html)<br>**Detailed description**: [Reference](http://cs231n.github.io/convolutional-networks/#conv)\n**Parameters**: *Convolution* layer parameters should be specified in the `convolution_data` node, which is a child of the layer node.\n**Weights Layout** Weights layout is GOIYX, which means that *X* is changing the fastest, then *Y*, then *Input*, *Output*, then *Group*.\n**Mathematical Formulation**\n* For the convolutional layer, the number of output features in each dimension is calculated using the formula:\n\\f[\nn_{out} = \\left ( \\frac{n_{in} + 2p - k}{s} \\right ) + 1\n\\f]\n* The receptive field in each layer is calculated using the formulas:\n * Jump in the output feature map:\n \\f[\n j_{out} = j_{in} * s\n \\f]\n * Size of the receptive field of output feature:\n \\f[\n r_{out} = r_{in} + ( k - 1 ) * j_{in}\n \\f]\n * Center position of the receptive field of the first output feature:\n \\f[\n start_{out} = start_{in} + ( \\frac{k - 1}{2} - p ) * j_{in}\n \\f]\n * Output is calculated using the following formula:\n \\f[\n out = \\sum_{i = 0}^{n}w_{i}x_{i} + b\n \\f]\n**Example**\n\n```html\n<layer ... type=\"Convolution\" ... >\n <convolution_data stride-x=\"4\" stride-y=\"4\" pad-x=\"0\" pad-y=\"0\" kernel-x=\"11\" kernel-y=\"11\" output=\"96\" group=\"1\" dilation-x=\"2\" dilation-y=\"2\"/>\n <input> ... </input>\n <output> ... </output>\n <weights ... />\n <biases ... />\n </layer>\n```",
  142. "attributes": [
  143. {
  144. "default": [
  145. 1,
  146. null
  147. ],
  148. "description": "*stride* is a distance (in pixels) to slide the filter on the feature map over the (x, y) axis. For example, *stride* equal \"1,1\" means sliding the filter 1 pixel at a time over the (x, y) axis.",
  149. "name": "stride",
  150. "required": true,
  151. "type": "int32[]"
  152. },
  153. {
  154. "default": 1,
  155. "description": "*stride-x* is a distance (in pixels) to slide the filter on the feature map over the x axis. For example, *stride-x* equal 1 means sliding the filter 1 pixel at a time over the x axis.",
  156. "name": "stride-x",
  157. "required": true,
  158. "type": "int32"
  159. },
  160. {
  161. "default": 1,
  162. "description": "*stride-y* is a distance (in pixels) to slide the filter on the feature map over the y axis. For example, *stride-y* equal 1 means sliding the filter 1 pixel at a time over the y axis.",
  163. "name": "stride-y",
  164. "required": true,
  165. "type": "int32"
  166. },
  167. {
  168. "default": [
  169. 1,
  170. null
  171. ],
  172. "name": "strides",
  173. "type": "int32[]"
  174. },
  175. {
  176. "default": 0,
  177. "description": "*pad* is a number of pixels to add to the left and top of the input. For example, *pad* equal 1 means adding 1 pixel to the left of the input. Right and bottom padding should be calculated from the expected output width (height).",
  178. "name": "pad",
  179. "required": true,
  180. "type": "int32"
  181. },
  182. {
  183. "default": 0,
  184. "description": "*pad-x* is a number of pixels to add to the left of the input. For example, *pad-x* equal 1 means adding 1 pixel to the left of the input. Right and bottom padding should be calculated from the expected output width (height).",
  185. "name": "pad-x",
  186. "required": true,
  187. "type": "int32"
  188. },
  189. {
  190. "default": 0,
  191. "description": "*pad-y* is a number of pixels to add to the top of the input. For example, *pad-y* equal 1 means adding 1 pixel to the top of the input. Right and bottom padding should be calculated from the expected output width (height).",
  192. "name": "pad-y",
  193. "required": true,
  194. "type": "int32"
  195. },
  196. {
  197. "default": 0,
  198. "name": "pad-r",
  199. "type": "int32"
  200. },
  201. {
  202. "default": 0,
  203. "name": "pad-b",
  204. "type": "int32"
  205. },
  206. {
  207. "default": [
  208. 1,
  209. 1
  210. ],
  211. "description": "*kernel* is a width and height of each filter. For example, *kernel* equal 3 (3, 3) means that each filter has width and height equal to 3.",
  212. "name": "kernel",
  213. "required": true,
  214. "type": "int32[]"
  215. },
  216. {
  217. "default": 1,
  218. "description": "*kernel-x* is a width of each filter. For example, *kernel* equal 3 means that each filter has width equal to 3.",
  219. "name": "kernel-x",
  220. "required": true,
  221. "type": "int32"
  222. },
  223. {
  224. "default": 1,
  225. "description": "*kernel-y* is a height of each filter. For example, *kernel-y* equal 3 means that each filter has height equal to 3.",
  226. "name": "kernel-y",
  227. "required": true,
  228. "type": "int32"
  229. },
  230. {
  231. "default": 1,
  232. "description": "*output* is a number of output feature maps per whole output (when *group* > 1, *output* still matches the number of output features regardless of *group* value). For example, *output* equals 1 means that there is 1 output feature map in a layer.",
  233. "name": "output",
  234. "required": true,
  235. "type": "int32",
  236. "visible": false
  237. },
  238. {
  239. "default": 1,
  240. "description": "*group* denotes the number of groups to which *output* and *input* should be split. For example, *group* equal 1 means that all the filters are applied to full input (usual convolution), *group* equals 2 means that both *input* and *output* channels are separated into 2 groups and *i-th output* group is connected to *i-th input* group channels. *group* equals number of output feature maps denotes depth-wise separable convolution ([Reference](https://medium.com/towards-data-science/types-of-convolutions-in-deep-learning-717013397f4d#6f51)).",
  241. "name": "group",
  242. "required": true,
  243. "type": "int32"
  244. },
  245. {
  246. "default": 1,
  247. "description": "*dilation* denotes the distance in width and height between elements (weights) in the filter. For example, *dilation* equal \"1,1\" means that all the elements in the filter are neighbors, so it is the same as for the usual convolution. *dilation* equal \"2,2\" means that all the elements in the filter are matched not to adjacent elements in the input matrix, but to those that are adjacent with distance 1.",
  248. "name": "dilation",
  249. "required": true,
  250. "type": "int32"
  251. },
  252. {
  253. "default": 1,
  254. "name": "dilation-x",
  255. "type": "int32"
  256. },
  257. {
  258. "default": [
  259. 1,
  260. null
  261. ],
  262. "name": "dilations",
  263. "type": "int32[]"
  264. },
  265. {
  266. "default": "same_upper",
  267. "name": "auto_pad"
  268. },
  269. {
  270. "default": [
  271. 0,
  272. null
  273. ],
  274. "name": "pads_begin",
  275. "type": "int32[]"
  276. },
  277. {
  278. "default": [
  279. 0,
  280. null
  281. ],
  282. "name": "pads_end",
  283. "type": "int32[]"
  284. },
  285. {
  286. "default": 1,
  287. "description": "*dilation-y* denotes the distance in height between elements (weights) in the filter. For example, *dilation-y* equal 1 means that all the elements in the filter are neighbors, so it is the same as for the usual convolution. *dilation-y* equal 2 means that all the elements in the filter are matched not to adjacent elements in the input matrix, but to those that are adjacent with distance 1.",
  288. "name": "dilation-y",
  289. "required": true,
  290. "type": "int32"
  291. }
  292. ],
  293. "inputs": [
  294. {
  295. "name": "input"
  296. },
  297. {
  298. "name": "weights"
  299. },
  300. {
  301. "name": "bias"
  302. }
  303. ],
  304. "outputs": [
  305. {
  306. "name": "output"
  307. }
  308. ],
  309. "status": "default"
  310. },
  311. {
  312. "name": "Crop",
  313. "category": "Data",
  314. "description": "*Crop* layer changes selected dimensions of the input blob according to the specified parameters.\n**Parameters**: *Crop* layer parameters should be specified in `data` section, which is placed as a child of the layer node. Due to various representation of Crop attributes in existing frameworks, this layer can be described in three independent ways: *Crop* **Type 1** layer takes two input blobs, and the shape of the second blob specifies the *Crop* size. The layer has two attributes: *axis* and *offset*. Crop layer takes two input blobs, and the shape of the second blob specifies the *Crop* size. The *Crop* layer of this type supports shape inference.\n**Inputs**\n* **1**: Multidimensional input blob *(for example, NCHW, NCH, or NC)*\n* **2**: Shape of this input will be used for crop\n**Example**\n\n```html\n<layer id=\"39\" name=\"score_pool4c\" precision=\"FP32\" type=\"Crop\">\n <data axis=\"2,3\" offset=\"0,0\"/>\n <input>\n <port id=\"0\">\n <dim>1</dim>\n <dim>21</dim>\n <dim>44</dim>\n <dim>44</dim>\n </port>\n <port id=\"1\">\n <dim>1</dim>\n <dim>21</dim>\n <dim>34</dim>\n <dim>34</dim>\n </port>\n </input>\n <output>\n <port id=\"2\">\n <dim>1</dim>\n <dim>21</dim>\n <dim>34</dim>\n <dim>34</dim>\n </port>\n </output>\n</layer>\n```",
  315. "attributes": [
  316. {
  317. "default": 1,
  318. "description": "*axis* is a number of a dimension to be used for cropping. For example, *axis* equal to 1 means that cropping is performed over the first dimension.",
  319. "name": "axis",
  320. "required": true,
  321. "type": "int32[]"
  322. },
  323. {
  324. "default": 1,
  325. "description": "*offset* denotes the starting point for crop in the input blob. For example, *offset* equal to 2 means that crop is starting from the second value in the given axis.",
  326. "name": "offset",
  327. "required": true,
  328. "type": "int32[]"
  329. }
  330. ],
  331. "status": "default"
  332. },
  333. {
  334. "name": "CTCGreedyDecoder",
  335. "category": "Layer",
  336. "description": "*CTCGreedyDecoder* performs greedy decoding on the logits given in input (best path).\n**Detailed description**: [Reference](https://www.tensorflow.org/api_docs/python/tf/nn/ctc_greedy_decoder)\n**Parameters**: *CTCGreedyDecoder* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\nGiven an input sequence \\f$X\\f$ of length \\f$T\\f$, *CTCGreedyDecoder* assumes the probability of a length \\f$T\\f$ character sequence \\f$C\\f$ is given by\n\\f[\np(C|X) = \\prod_{t=1}^{T} p(c_{t}|X)\n\\f]\n**Example**\n\n```html\n<layer ... type=\"CTCGreedyDecoder\" ... >\n <data stride=\"1\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  337. "attributes": [
  338. {
  339. "default": 1,
  340. "description": "*ctc_merge_repeated* is a flag for collapsing the repeated labels during the ctc calculation.",
  341. "name": "ctc_merge_repeated",
  342. "required": true,
  343. "type": "int32"
  344. }
  345. ],
  346. "status": "default"
  347. },
  348. {
  349. "name": "Deconvolution",
  350. "category": "Layer",
  351. "description": "*Deconvolution* layer is applied for upsampling the output to the higher image resolution.\n**Detailed description**: [Reference](https://distill.pub/2016/deconv-checkerboard/)\n**Parameters**: *Deconvolution* layer parameters should be specified in the `deconvolution_data` node, which is a child of the layer node.\n**Parameters**: *Convolution* layer parameters should be specified in the `convolution_data` node, which is a child of the layer node.\n**Weights Layout** Weights layout is the following: GOIYX, which means that *X* is changing the fastest, then *Y*, then *Input*, *Output*, then *Group*.\n**Mathematical Formulation**\n*Deconvolution* is also called transpose convolution and performs operation, reverse to convolution.\nThe number of output features for each dimensions is calculated:\n\\f[S_{o}=stride(S_{i} - 1 ) + S_{f} - 2pad \\f]\nWhere \\f$S\\f$ is size of output, input and filter.\nOutput is calculated in the same way as for convolution layer:\n\\f[out = \\sum_{i = 0}^{n}w_{i}x_{i} + b\\f]\n**Example**\n\n```html\n<layer ... type=\"Deconvolution\" ... >\n <deconvolution_data stride-x=\"2\" stride-y=\"2\" pad-x=\"1\" pad-y=\"1\" kernel-x=\"4\" kernel-y=\"4\" output=\"19\" group=\"1\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  352. "attributes": [
  353. {
  354. "default": 1,
  355. "description": "*stride* is a distance (in pixels) to slide the filter on the feature map over the (x, y) axis. For example, *stride* equal \"1,1\" means sliding the filter 1 pixel at a time over the (x, y) axis.",
  356. "name": "stride",
  357. "required": true,
  358. "type": "int32"
  359. },
  360. {
  361. "default": 1,
  362. "description": "*stride-x* is a distance (in pixels) to slide the filter on the feature map over the x axis. For example, *stride-x* equal 1 means sliding the filter 1 pixel at a time over the x axis.",
  363. "name": "stride-x",
  364. "required": true,
  365. "type": "int32"
  366. },
  367. {
  368. "default": 1,
  369. "description": "*stride-y* is a distance (in pixels) to slide the filter on the feature map over the y axis. For example, *stride-y* equal 1 means sliding the filter 1 pixel at a time over the y axis.",
  370. "name": "stride-y",
  371. "required": true,
  372. "type": "int32"
  373. },
  374. {
  375. "default": 1,
  376. "description": "*pad* is a number of pixels to add to the left and top of the input. For example, *pad* equal 1 means adding 1 pixel to the left of the input. Right and bottom padding should be calculated from the expected output width (height).",
  377. "name": "pad",
  378. "required": true,
  379. "type": "int32"
  380. },
  381. {
  382. "default": 1,
  383. "description": "*pad-x* is a number of pixels to add to the left of the input. For example, *pad-x* equal 1 means adding 1 pixel to the left of the input. Right and bottom padding should be calculated from the expected output width (height).",
  384. "name": "pad-x",
  385. "required": true,
  386. "type": "int32"
  387. },
  388. {
  389. "default": 1,
  390. "description": "*pad-y* is a number of pixels to add to the top of the input. For example, *pad-y* equal 1 means adding 1 pixel to the top of the input. Right and bottom padding should be calculated from the expected output width (height).",
  391. "name": "pad-y",
  392. "required": true,
  393. "type": "int32"
  394. },
  395. {
  396. "default": 1,
  397. "description": "*kernel* is a width and height of each filter. For example, *kernel* equal 3 (3, 3) means that each filter has width and height equal to 3.",
  398. "name": "kernel",
  399. "required": true,
  400. "type": "int32"
  401. },
  402. {
  403. "default": 1,
  404. "description": "*kernel-x* is a width of each filter. For example, *kernel* equal 3 means that each filter has width equal to 3.",
  405. "name": "kernel-x",
  406. "required": true,
  407. "type": "int32"
  408. },
  409. {
  410. "default": 1,
  411. "description": "*kernel-y* is a height of each filter. For example, *kernel-y* equal 3 means that each filter has height equal to 3.",
  412. "name": "kernel-y",
  413. "required": true,
  414. "type": "int32"
  415. },
  416. {
  417. "default": 1,
  418. "description": "*output* is a number of output feature maps per whole output (when *group* > 1, *output* still matches the number of output features regardless of *group* value). For example, *output* equals 1 means that there is 1 output feature map in a layer.",
  419. "name": "output",
  420. "required": true,
  421. "type": "int32"
  422. },
  423. {
  424. "default": 1,
  425. "description": "*group* denotes the number of groups to which *output* and *input* should be split. For example, *group* equal 1 means that all the filters are applied to full input (usual convolution), *group* equals 2 means that both *input* and *output* channels are separated into 2 groups and *i-th output* group is connected to *i-th input* group channels. *group* equals number of output feature maps denotes depth-wise separable convolution ([Reference](https://medium.com/towards-data-science/types-of-convolutions-in-deep-learning-717013397f4d#6f51)).",
  426. "name": "group",
  427. "required": true,
  428. "type": "int32"
  429. },
  430. {
  431. "default": 1,
  432. "description": "*dilation* denotes the distance in width and height between elements (weights) in the filter. For example, *dilation* equal \"1,1\" means that all the elements in the filter are neighbors, so it is the same as for the usual convolution. *dilation* equal \"2,2\" means that all the elements in the filter are matched not to adjacent elements in the input matrix, but to those that are adjacent with distance 1.",
  433. "name": "dilation",
  434. "required": true,
  435. "type": "int32"
  436. },
  437. {
  438. "default": 1,
  439. "description": "*dilation-y* denotes the distance in height between elements (weights) in the filter. For example, *dilation-y* equal 1 means that all the elements in the filter are neighbors, so it is the same as for the usual convolution. *dilation-y* equal 2 means that all the elements in the filter are matched not to adjacent elements in the input matrix, but to those that are adjacent with distance 1.",
  440. "name": "dilation-y",
  441. "required": true,
  442. "type": "int32"
  443. }
  444. ],
  445. "status": "default"
  446. },
  447. {
  448. "name": "DetectionOutput",
  449. "description": "*DetectionOutput* layer performs non-maximum suppression to generate the detection output using information on location and confidence predictions.\n**Detailed description**: [Reference](https://arxiv.org/pdf/1512.02325.pdf)\n**Parameters**: *DetectionOutput* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\nAt each feature map cell, *DetectionOutput* predicts the offsets relative to the default box shapes in the cell, as well as the per-class scores that indicate the presence of a class instance in each of those boxes. Specifically, for each box out of k at a given location, *DetectionOutput* computes class scores and the four offsets relative to the original default box shape. This results in a total of \\f$(c + 4)k\\f$ filters that are applied around each location in the feature map, yielding \\f$(c + 4)kmn\\f$ outputs for a m × n feature map.\n**Example**\n\n```html\n<layer ... type=\"DetectionOutput\" ... >\n <data num_classes=\"21\" share_location=\"1\" background_label_id=\"0\" nms_threshold=\"0.450000\" top_k=\"400\" eta=\"1.000000\" output_directory=\"\" output_name_prefix=\"\" output_format=\"\" label_map_file=\"\" name_size_file=\"\" num_test_image=\"0\" prob=\"1.000000\" resize_mode=\"caffe.ResizeParameter.WARP\" height=\"0\" width=\"0\" height_scale=\"0\" width_scale=\"0\" pad_mode=\"caffe.ResizeParameter.CONSTANT\" pad_value=\"#\" interp_mode=\"#\" code_type=\"caffe.PriorBoxParameter.CENTER_SIZE\" variance_encoded_in_target=\"0\" keep_top_k=\"200\" confidence_threshold=\"0.010000\" visualize=\"0\" visualize_threshold=\"0.000000\" save_file=\"\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  450. "attributes": [
  451. {
  452. "default": 1,
  453. "description": " number of classes to be predicted",
  454. "name": "num_classes",
  455. "required": true,
  456. "type": "int32"
  457. },
  458. {
  459. "default": 1,
  460. "description": " background label id. If there is no background class, set it to -1.",
  461. "name": "background_label_id",
  462. "required": true,
  463. "type": "int32"
  464. },
  465. {
  466. "default": 1,
  467. "description": " maximum number of results to be kept on NMS stage",
  468. "name": "top_k",
  469. "required": true,
  470. "type": "int32"
  471. },
  472. {
  473. "default": 1,
  474. "description": " if \"true\", variance is encoded in target. Otherwise, we need to adjust the predicted offset accordingly.",
  475. "name": "variance_encoded_in_target",
  476. "required": true
  477. },
  478. {
  479. "default": 1,
  480. "description": " number of total bboxes to be kept per image after NMS step. -1 means keeping all bboxes after NMS step.",
  481. "name": "keep_top_k",
  482. "required": true,
  483. "type": "int32"
  484. },
  485. {
  486. "default": 1,
  487. "name": "num_orient_classes",
  488. "required": true,
  489. "type": "int32"
  490. },
  491. {
  492. "default": 1,
  493. "description": " type of coding method for bounding boxes. caffe.PriorBoxParameter.CENTER_SIZE and others.",
  494. "name": "code_type",
  495. "required": true,
  496. "type": "int32"
  497. },
  498. {
  499. "default": 1,
  500. "description": " bounding boxes are shared among different classes.",
  501. "name": "share_location",
  502. "required": true
  503. },
  504. {
  505. "default": 1,
  506. "name": "interpolate_orientation",
  507. "required": true,
  508. "type": "int32"
  509. },
  510. {
  511. "default": 1,
  512. "description": " threshold to be used in NMS stage",
  513. "name": "nms_threshold",
  514. "required": true,
  515. "type": "float32"
  516. },
  517. {
  518. "default": 1,
  519. "description": " only consider detections whose confidences are larger than a threshold. If not provided, consider all boxes.",
  520. "name": "confidence_threshold",
  521. "required": true,
  522. "type": "float32"
  523. }
  524. ],
  525. "status": "default"
  526. },
  527. {
  528. "name": "Eltwise",
  529. "description": "*Eltwise* layer performs element-wise operation, which is specified in parameters, over given inputs.\n**Parameters**: *Eltwise* layer parameters should be specified in the `elementwise_data` node, which is placed as a child of the layer node.\n**Mathematical Formulation** *Eltwise* accepts 2 inputs of any number of dimensions - from 1 to 4, however, it is required for both of them to have absolutely same dimensions. The produced blob is also of the same dimension as each of its parents\n*Eltwise* does the following with the input blobs:\n\\f[\no_{i} = f(b_{i}^{1}, b_{i}^{2})\n\\f]\nwhere \\f$b_{i}^{1}\\f$ - first blob \\f$i\\f$-th element, \\f$b_{i}^{2}\\f$ - second blob \\f$i\\f$-th element, \\f$o_{i}\\f$ - output blob \\f$i\\f$-th element, \\f$f(a, b)\\f$ - is a function that performs an operation over its two arguments \\f$a, b\\f$.\n* For *sum* operation, \\f$f(a, b)\\f$ is defined as\n \\f[\n f(a,b) = a + b\n \\f]\n* For *mul* operation, \\f$f(a, b)\\f$ is defined as\n \\f[\n f(a,b) = a * b\n \\f]\n* For *max* operation, \\f$f(a, b)\\f$ is defined as\n \\f[\n f(a,b) = \\left\\{\\begin{array}{ll}\n\t\ta \\quad \\mbox{if } a \\geq b \\\\\n\t\tb \\quad \\mbox{if } b > a\n\t\\end{array}\\right. \\f]\n**Example**\n\n```html\n<layer ... type=\"Eltwise\" ... >\n <elementwise_data operation=\"sum\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  530. "attributes": [
  531. {
  532. "default": "sum",
  533. "description": "*operation* is the simple mathematical operation to be performed over inputs. For example, *operation* equal *mul* means that input blobs are multiplied.",
  534. "name": "operation",
  535. "required": true,
  536. "type": "string"
  537. }
  538. ],
  539. "inputs": [
  540. {
  541. "name": "inputs",
  542. "type": "Tensor[]"
  543. }
  544. ],
  545. "outputs": [
  546. {
  547. "name": "output"
  548. }
  549. ],
  550. "status": "default"
  551. },
  552. {
  553. "name": "Flatten",
  554. "category": "Shape",
  555. "attributes": [
  556. {
  557. "name": "axis",
  558. "type": "int32"
  559. },
  560. {
  561. "name": "end_axis",
  562. "type": "int32",
  563. "default": -1
  564. }
  565. ]
  566. },
  567. {
  568. "name": "FakeQuantize",
  569. "inputs": [
  570. { "name": "X" },
  571. { "name": "input_low" },
  572. { "name": "input_high" },
  573. { "name": "output_low" },
  574. { "name": "output_high" }
  575. ]
  576. },
  577. {
  578. "name": "FullyConnected",
  579. "category": "Layer",
  580. "description": "[Reference](http://caffe.berkeleyvision.org/tutorial/layers/innerproduct.html)\n**Detailed description**: [Reference](http://cs231n.github.io/convolutional-networks/#fc)\n**Parameters**: Specify *FullyConnected* layer parameters in the `fc_data` node, which is a child of the layer node.\n**Weights Layout** OI, which means that Input is changing the fastest, then Output.\n**Mathematical Formulation**\n* If previous layer is *FullyConnected*:\n \\f[\n y_{i} = f( z_{i} ) \\quad with \\quad z_{i} = \\sum_{j=1}^{m_{1}^{( l-1 )}}w_{i,j}^{( l )}y_{i}^{ ( l -1 )}\n \\f]\n* Otherwise:\n \\f[\n y_{i} = f( z_{i} ) \\quad with \\quad z_{i}^{ ( l )} = \\sum_{j=1}^{m_{1}^{( l-1 )}}\\sum_{r=1}^{m_{2}^{ ( l-1 )}}\\sum_{s=1}^{m_{3}^{ ( l-1 )}}w_{i,j,r,s}^{ ( l )} ( Y_{i}^{ (l-1) })_{r,s}\n \\f]\n**Example**\n\n```html\n<layer ... type=\"FullyConnected\" ... >\n <fc_data out-size=\"4096\"/>\n <input> ... </input>\n <output> ... </output>\n </layer>\n```",
  581. "attributes": [
  582. {
  583. "default": 1,
  584. "description": "*out-size* is a length of the output vector. For example, *out-size* equal 4096 means that the output vector length is 4096.",
  585. "name": "out-size",
  586. "required": true,
  587. "type": "int32"
  588. }
  589. ],
  590. "inputs": [
  591. {
  592. "name": "input"
  593. },
  594. {
  595. "name": "weights"
  596. },
  597. {
  598. "name": "bias"
  599. }
  600. ],
  601. "status": "default"
  602. },
  603. {
  604. "name": "Gather",
  605. "category": "Transform",
  606. "inputs": [
  607. { "name": "data" },
  608. { "name": "indices" },
  609. { "name": "axis" }
  610. ]
  611. },
  612. {
  613. "name": "Gelu",
  614. "category": "Activation",
  615. "description": "Gaussian error linear unit element-wise activation function."
  616. },
  617. {
  618. "name": "GRN",
  619. "category": "Normalization",
  620. "description": "*GRN* is Global Response Normalization with L2 norm (across channels only).\n**Parameters**: GRN layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*GRN* computes L2 norm by channels for input blob. *GRN* generally does the following with the input blob:\n\\f[\noutput_{i} = \\frac{input_{i}}{\\sqrt{\\sum_{i}^{C} input_{i}}}\n\\f]\n**Example**\n\n```html\n<layer ... type=\"GRN\" ... >\n <data bias=\"1.0\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  621. "attributes": [
  622. {
  623. "default": 1,
  624. "description": "*bias* is added to the variance.",
  625. "name": "bias",
  626. "required": true,
  627. "type": "float32"
  628. }
  629. ],
  630. "status": "default"
  631. },
  632. {
  633. "name": "GroupConvolution",
  634. "category": "Layer",
  635. "inputs": [
  636. { "name": "input" },
  637. { "name": "weights" }
  638. ]
  639. },
  640. {
  641. "name": "GRUCell",
  642. "category": "Layer",
  643. "description": "GRUCell represents a single GRU Cell that computes the output using the formula described in the [paper](https://arxiv.org/abs/1406.1078).",
  644. "attributes": [
  645. { "name": "hidden_size", "type": "int64", "description": "Specifies hidden state size." },
  646. { "name": "linear_before_reset", "type": "boolean", "optional": true, "default": false, "description": "denotes if the layer behaves according to the modification of GRUCell described in the formula in the [ONNX documentation](https://github.com/onnx/onnx/blob/master/docs/Operators.md#GRU)." }
  647. ],
  648. "inputs": [
  649. { "name": "X", "description": "2D tensor of type T `[batch_size, input_size]`, input data. Required." },
  650. { "name": "initial_hidden_state", "description": "2D tensor of type T `[batch_size, hidden_size]`. Required." },
  651. { "name": "W", "description": "2D tensor of type T `[3 * hidden_size, input_size]`, the weights for matrix multiplication, gate order: zrh. Required." },
  652. { "name": "R", "description": "2D tensor of type T `[3 * hidden_size, hidden_size]`, the recurrence weights for matrix multiplication, gate order: zrh. Required." },
  653. { "name": "B", "description": "1D tensor of type T. If linear_before_reset is set to 1, then the shape is `[4 * hidden_size]` - the sum of biases for z and r gates (weights and recurrence weights), the biases for h gate are placed separately. Otherwise the shape is `[3 * hidden_size]`, the sum of biases (weights and recurrence weights). Optional." }
  654. ]
  655. },
  656. {
  657. "name": "Interpolate",
  658. "inputs": [
  659. { "name": "data" },
  660. { "name": "sizes" },
  661. { "name": "scales" },
  662. { "name": "axes" }
  663. ]
  664. },
  665. {
  666. "name": "LRN",
  667. "category": "Normalization"
  668. },
  669. {
  670. "name": "LSTMCell",
  671. "category": "Layer",
  672. "inputs": [
  673. { "name": "X" },
  674. { "name": "initial_hidden_state" },
  675. { "name": "initial_cell_state" },
  676. { "name": "W" },
  677. { "name": "R" },
  678. { "name": "B" }
  679. ]
  680. },
  681. {
  682. "name": "MaxPool",
  683. "category": "Pool"
  684. },
  685. {
  686. "name": "MatMul",
  687. "inputs": [
  688. { "name": "A" },
  689. { "name": "B" }
  690. ],
  691. "outputs": [
  692. { "name": "C" }
  693. ]
  694. },
  695. {
  696. "name": "Memory",
  697. "description": "*Memory* layer represents delay layer in terms of LSTM terminology. To read more about LSTM topologies please refer this [link](http://colah.github.io/posts/2015-08-Understanding-LSTMs).\n**Detailed description**: *Memory* layer saves state between two infer requests. In the topology, it is the single layer, however, in the Intermediate Representation, it is always represented as a pair of **Memory** layers. One of these layers does not have outputs and another does not have inputs (in terms of the Intermediate Representation).\n**Parameters**: *Memory* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*Memory* save data from the input blob.\n**Example**\n\n```html\n<layer ... type=\"Memory\" ... >\n <data id=\"r_27-28\" index=\"0\" size=\"2\" />\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  698. "attributes": [
  699. {
  700. "default": 1,
  701. "description": "*id* is the id of the pair of *Memory* layers. For example, *id* equals r_27-28 means that layers with id 27 and 28 are in one pair.",
  702. "name": "id",
  703. "required": true,
  704. "type": "int32"
  705. },
  706. {
  707. "default": 1,
  708. "description": "*index* represents if the given layer is input or output. For example, *index* equal 0 means this layer is output one.",
  709. "name": "index",
  710. "required": true,
  711. "type": "int32"
  712. },
  713. {
  714. "default": 1,
  715. "description": "*size* represents the size of the group. For example, *size* equals 2 means this group is a pair.",
  716. "name": "size",
  717. "required": true,
  718. "type": "int32"
  719. }
  720. ],
  721. "status": "default"
  722. },
  723. {
  724. "name": "Minimum",
  725. "description": "Minimum performs element-wise minimum operation with two given tensors applying broadcasting rule specified in the auto_broadcast attribute.",
  726. "inputs": [
  727. { "name": "A" },
  728. { "name": "B" }
  729. ]
  730. },
  731. {
  732. "name": "Multiply",
  733. "inputs": [
  734. { "name": "A" },
  735. { "name": "B" }
  736. ],
  737. "outputs": [
  738. { "name": "C" }
  739. ]
  740. },
  741. {
  742. "name": "MVN",
  743. "category": "Normalization",
  744. "description": "[Reference](http://caffe.berkeleyvision.org/tutorial/layers/mvn.html)\n**Parameters**: *MVN* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*MVN* subtracts mean from the input blob:\n\\f[\no_{i} = i_{i} - \\frac{\\sum{i_{k}}}{C * H * W}\n\\f]\nIf *normalize_variance* is set to 1, the output blob is divided by variance:\n\\f[\no_{i}=\\frac{o_{i}}{\\sum \\sqrt {o_{k}^2}+\\epsilon}\n\\f]\n**Example**\n\n```html\n<layer ... type=\"MVN\">\n <data across_channels=\"1\" eps=\"9.999999717180685e-10\" normalize_variance=\"1\"/>\n <input>\n ...\n </input>\n <output>\n ...\n </output>\n</layer>\n```",
  745. "attributes": [
  746. {
  747. "default": 1,
  748. "description": "*across_channels* is a flag that denotes if mean values are shared across channels. For example, *across_channels* equal 0 means that mean values are not shared across channels.",
  749. "name": "across_channels",
  750. "required": true,
  751. "type": "int32"
  752. },
  753. {
  754. "default": 1,
  755. "description": "*normalize_variance* is a flag that denotes whether to perform variance normalization.",
  756. "name": "normalize_variance",
  757. "required": true,
  758. "type": "int32"
  759. },
  760. {
  761. "default": 1,
  762. "description": "*eps* is the number to be added to the variance to avoid division by zero when normalizing the value. For example, *epsilon* equal 0.001 means that 0.001 is added to the variance.",
  763. "name": "eps",
  764. "required": true,
  765. "type": "float32"
  766. }
  767. ],
  768. "inputs": [
  769. { "name": "input" },
  770. { "name": "data" }
  771. ],
  772. "status": "default"
  773. },
  774. {
  775. "name": "Norm",
  776. "category": "Normalization",
  777. "description": "[Reference](http://caffe.berkeleyvision.org/tutorial/layers/lrn.html)\n**Detailed description**: [Reference](http://yeephycho.github.io/2016/08/03/Normalizations-in-neural-networks/#Local-Response-Normalization-LRN)\n**Parameters**: *Norm* layer parameters should be specified in the `norm_data` node, which is a child of the layer node.\n**Mathematical Formulation**\n\\f[o_{i} = \\left( 1 + \\left( \\frac{\\alpha}{n} \\right)\\sum_{i}x_{i}^{2} \\right)^{\\beta}\\f]\nWhere \\f$n\\f$ is the size of each local region.\n**Example**\n\n```html\n<layer ... type=\"Norm\" ... >\n <norm_data alpha=\"9.9999997e-05\" beta=\"0.75\" local-size=\"5\" region=\"across\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  778. "attributes": [
  779. {
  780. "default": 1,
  781. "description": "*alpha* represents the scaling parameter for the normalizing sum. For example, *alpha* equal 0.0001 means that the normalizing sum is multiplied by 0.0001.",
  782. "name": "alpha",
  783. "required": true,
  784. "type": "float32"
  785. },
  786. {
  787. "default": 1,
  788. "description": "*beta* represents the exponent for the normalizing sum. For example, *beta* equal 0.75 means that the normalizing sum is raised to the power of 0.75.",
  789. "name": "beta",
  790. "required": true,
  791. "type": "float32"
  792. },
  793. {
  794. "default": 1,
  795. "description": "*region* represents strategy of local regions extension. For example, *region* equal *across* means that the normalizing sum is performed over adjacent channels.",
  796. "name": "region",
  797. "required": true,
  798. "type": ""
  799. },
  800. {
  801. "default": 1,
  802. "description": "*local-size* represents the side length of the region to be used for the normalization sum or number of channels depending on the strategy specified in the *region* parameter. For example, *local-size* equal 5 for the across strategy means application of sum across 5 adjacent channels.",
  803. "name": "local-size",
  804. "required": true,
  805. "type": "int32"
  806. }
  807. ],
  808. "status": "default"
  809. },
  810. {
  811. "name": "Normalize",
  812. "category": "Normalization",
  813. "description": "*Normalize* layer performs l-p normalization of 1 of input blob.\n**Parameters**: *Normalize* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n\\f[\no_{i} = \\sum_{i}^{H*W}\\frac{\\left ( n*C*H*W \\right )* scale}{\\sqrt{\\sum_{i=0}^{C*H*W}\\left ( n*C*H*W \\right )^{2}}}\n\\f]\n**Example**\n\n```html\n<layer ... type=\"Normalize\" ... >\n <data across_spatial=\"0\" channel_shared=\"0\" eps=\"0.000000\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  814. "attributes": [
  815. {
  816. "default": 1,
  817. "description": "*across_spatial* is a flag that denotes if normalization is performed over CHW or HW. For example, *across_spatial* equals 0 means that normalization is not shared across channels.",
  818. "name": "across_spatial",
  819. "required": true
  820. },
  821. {
  822. "default": 1,
  823. "description": "*channel_shared* is a flag that denotes if scale parameters are shared across channels. For example, *channel_shared* equal 0 means that scale parameters are not shared across channels.",
  824. "name": "channel_shared",
  825. "required": true
  826. },
  827. {
  828. "default": 1,
  829. "description": "*eps* is the epsilon used to avoid division by zero when normalizing the value. For example, *eps* equals 0.001 means that 0.001 is used if all the values in normalization are equal to zero.",
  830. "name": "eps",
  831. "required": true,
  832. "type": "float32"
  833. }
  834. ],
  835. "status": "default"
  836. },
  837. {
  838. "name": "NormalizeL2",
  839. "category": "Normalization",
  840. "inputs": [
  841. { "name": "data" },
  842. { "name": "axes" }
  843. ]
  844. },
  845. {
  846. "name": "Pad",
  847. "category": "Tensor",
  848. "attributes": [
  849. {
  850. "name": "pad_value",
  851. "type": "float32"
  852. },
  853. {
  854. "name": "pads_begin",
  855. "type": "int32[]"
  856. },
  857. {
  858. "name": "pads_end",
  859. "type": "int32[]"
  860. },
  861. {
  862. "name": "pad_mode"
  863. }
  864. ]
  865. },
  866. {
  867. "name": "Permute",
  868. "category": "Shape",
  869. "description": "*Permute* layer performs reordering of input blob dimensions.\n**Detailed description**: [Reference](http://caffe.help/manual/layers/tile.html)\n**Parameters**: *Permute* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*Permute* layer performs reordering input blob. Source indexes and destination indexes are bound by formula:\n\\f[\nsrc\\_ind_{offset} = n * ordered[1] * ordered[2] * ordered[3] + (h * ordered[3] + w)\n\\f]\n\\f[\nn \\in ( 0, order[0] )\n\\f]\n\\f[\nh \\in ( 0, order[2] )\n\\f]\n\\f[\nw \\in ( 0, order[3] )\n\\f]\n**Example**\n\n```html\n<layer ... type=\"Permute\" ... >\n <data order=\"0,2,3,1\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  870. "attributes": [
  871. {
  872. "description": "*order* is the set of dimensions indexes for output blob. For example, *order* equal 0,2,3,1 means that the output blob has following dimensions: first dimension from the input blob, third dimension from the input blob, fourth dimension from the input blob, second dimension from the input blob.",
  873. "name": "order",
  874. "required": true,
  875. "type": "int32[]"
  876. }
  877. ],
  878. "status": "default"
  879. },
  880. {
  881. "name": "Pooling",
  882. "category": "Pool",
  883. "description": "[Reference](http://caffe.berkeleyvision.org/tutorial/layers/pooling.html)\n**Detailed description**: [Reference](http://cs231n.github.io/convolutional-networks/#pool)\n**Parameters**: Specify pooling layer parameters in the `pooling_data` node, which is a child of the layer node.\n**Mathematical Formulation**\n* For *max pool-method*:\n \\f[\n output_{j} = MAX\\{ x_{0}, ... x_{i}\\}\n \\f]\n* For *avg pool-method*:\n \\f[\n output_{j} = \\frac{\\sum_{i = 0}^{n}x_{i}}{n}\n \\f]\n**Example**\n\n```html\n<layer ... type=\"Pooling\" ... >\n <pooling_data kernel-x=\"3\" kernel-y=\"3\" pad-x=\"0\" pad-y=\"0\" stride-x=\"2\" stride-y=\"2\" pool-method=\"max\" exclude-pad=\"true\" rounding_type=\"floor\"/>\n <input> ... </input>\n <output> ... </output>\n </layer>\n```",
  884. "attributes": [
  885. {
  886. "default": [
  887. 1,
  888. null
  889. ],
  890. "description": "*stride* is a distance (in pixels) to slide the filter on the feature map over the (x, y) axis. For example, *stride* equal \"1,1\" means sliding the filter 1 pixel at a time over the (x, y) axis.",
  891. "name": "stride",
  892. "required": true,
  893. "type": "int32[]"
  894. },
  895. {
  896. "default": 1,
  897. "description": "*stride-x* is a distance (in pixels) to slide the filter on the feature map over the x axis. For example, *stride-x* equal 1 means sliding the filter 1 pixel at a time over the x axis.",
  898. "name": "stride-x",
  899. "required": true,
  900. "type": "int32"
  901. },
  902. {
  903. "default": 1,
  904. "description": "*stride-y* is a distance (in pixels) to slide the filter on the feature map over the y axis. For example, *stride-y* equal 1 means sliding the filter 1 pixel at a time over the y axis.",
  905. "name": "stride-y",
  906. "required": true,
  907. "type": "int32"
  908. },
  909. {
  910. "default": [
  911. 1,
  912. null
  913. ],
  914. "name": "strides",
  915. "type": "int32[]"
  916. },
  917. {
  918. "default": 1,
  919. "description": "*pad* is a number of pixels to add to the left and top of the input. For example, *pad* equal 1 means adding 1 pixel to the left of the input. Right and bottom padding should be calculated from the expected output width (height).",
  920. "name": "pad",
  921. "required": true,
  922. "type": "int32"
  923. },
  924. {
  925. "default": 0,
  926. "description": "*pad-x* is a number of pixels to add to the left of the input. For example, *pad-x* equal 1 means adding 1 pixel to the left of the input. Right and bottom padding should be calculated from the expected output width (height).",
  927. "name": "pad-x",
  928. "required": true,
  929. "type": "int32"
  930. },
  931. {
  932. "default": 0,
  933. "description": "*pad-y* is a number of pixels to add to the top of the input. For example, *pad-y* equal 1 means adding 1 pixel to the top of the input. Right and bottom padding should be calculated from the expected output width (height).",
  934. "name": "pad-y",
  935. "required": true,
  936. "type": "int32"
  937. },
  938. {
  939. "default": 0,
  940. "name": "pad-r",
  941. "type": "int32"
  942. },
  943. {
  944. "default": 0,
  945. "name": "pad-b",
  946. "type": "int32"
  947. },
  948. {
  949. "default": [
  950. 0,
  951. null
  952. ],
  953. "name": "pads_begin",
  954. "type": "int32[]"
  955. },
  956. {
  957. "default": [
  958. 0,
  959. null
  960. ],
  961. "name": "pads_end",
  962. "type": "int32[]"
  963. },
  964. {
  965. "description": "*kernel* is a width and height of each filter. For example, *kernel* equal 3 (3, 3) means that each filter has width and height equal to 3.",
  966. "name": "kernel",
  967. "required": true,
  968. "type": "int32[]"
  969. },
  970. {
  971. "default": 1,
  972. "description": "*kernel-x* is a width of each filter. For example, *kernel* equal 3 means that each filter has width equal to 3.",
  973. "name": "kernel-x",
  974. "required": true,
  975. "type": "int32"
  976. },
  977. {
  978. "default": 1,
  979. "description": "*kernel-y* is a height of each filter. For example, *kernel-y* equal 3 means that each filter has height equal to 3.",
  980. "name": "kernel-y",
  981. "required": true,
  982. "type": "int32"
  983. },
  984. {
  985. "default": "max",
  986. "description": "*pool-method* is a type of pooling strategy for values.",
  987. "name": "pool-method",
  988. "required": true,
  989. "type": ""
  990. },
  991. {
  992. "default": false,
  993. "description": "*exclude-pad* is a type of pooling strategy for values in the padding area. For example, if *exclude-pad* is \"true\", zero-values in the padding are not used.",
  994. "name": "exclude-pad",
  995. "required": true,
  996. "type": "boolean"
  997. },
  998. {
  999. "default": "ceil",
  1000. "description": "*rounding-type* is a type of rounding to be applied.",
  1001. "name": "rounding-type",
  1002. "required": true
  1003. }
  1004. ],
  1005. "status": "default"
  1006. },
  1007. {
  1008. "name": "Power",
  1009. "description": "*Power* layer computes the output as (shift + scale * x) ^ power for each input element x.\n**Parameters**: Power layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n\\f[\np = (shift + scale * x)^{power}\n\\f]\n**Example**\n\n```html\n<layer ... type=\"Power\" ... >\n <data power=\"2\" scale=\"0.1\" shift=\"5\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1010. "status": "default",
  1011. "inputs": [
  1012. { "name": "A" },
  1013. { "name": "B" }
  1014. ],
  1015. "outputs": [
  1016. { "name": "C" }
  1017. ]
  1018. },
  1019. {
  1020. "name": "PReLU",
  1021. "category": "Activation",
  1022. "description": "*PReLU* is the Parametric Rectifier Linear Unit. The difference from *ReLU* is that negative slopes can vary across channels.\n**Parameters**: *PReLU* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*PReLU* accepts one input with four dimensions. The produced blob has the same dimensions as input.\n*PReLU* does the following with the input blob:\n\\f[\no_{i} = max(0, x_{i}) + w_{i} * min(0,x_{i})\n\\f]\nwhere \\f$w_{i}\\f$ is from weights blob.\n**Example**\n\n```html\n<layer ... type=\"PReLU\" ... >\n <data bias=\"1.0\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1023. "attributes": [
  1024. {
  1025. "default": 1,
  1026. "description": "*channel_shared* shows if negative slope shared across channels or not.",
  1027. "name": "channel_shared",
  1028. "required": true,
  1029. "type": "int32"
  1030. },
  1031. {
  1032. "description": "*filler_type* defines initialization type for negative slope.",
  1033. "name": "filler_type",
  1034. "required": true,
  1035. "type": "string"
  1036. },
  1037. {
  1038. "default": 1,
  1039. "description": "*filler_value* defines the value in constant filler.",
  1040. "name": "filler_value",
  1041. "required": true,
  1042. "type": "int32"
  1043. },
  1044. {
  1045. "default": 1,
  1046. "description": "*min(max)* defines the minimal(maximal) value in uniform filler.",
  1047. "name": "min(max)",
  1048. "required": true,
  1049. "type": "int32"
  1050. },
  1051. {
  1052. "default": 1,
  1053. "description": "*mean* defines the mean value in Gaussian filler.",
  1054. "name": "mean",
  1055. "required": true,
  1056. "type": "int32"
  1057. }
  1058. ],
  1059. "status": "default",
  1060. "inputs": [
  1061. { "name": "data" },
  1062. { "name": "slope" }
  1063. ]
  1064. },
  1065. {
  1066. "name": "PriorBox",
  1067. "description": "*PriorBox* layer generates prior boxes of specified sizes and aspect ratios across all dimensions.\n**Parameters**: *PriorBox* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**:\n*PriorBox* computes coordinates of prior boxes by following:\n1. First calculates *center_x* and *center_y* of prior box:\n \\f[\n W \\equiv Width \\quad Of \\quad Image\n \\f]\n \\f[\n H \\equiv Height \\quad Of \\quad Image\n \\f]\n * If step equals 0:\n \\f[\n center_x=(w+0.5)\n \\f]\n \\f[\n center_y=(h+0.5)\n \\f]\n * else:\n \\f[\n center_x=(w+offset)*step\n \\f]\n \\f[\n center_y=(h+offset)*step\n \\f]\n \\f[\n w \\subset \\left( 0, W \\right )\n \\f]\n \\f[\n h \\subset \\left( 0, H \\right )\n \\f]\n2. Then, for each \\f$ s \\subset \\left( 0, min_sizes \\right ) \\f$ calculates coordinates of priorboxes:\n \\f[\n xmin = \\frac{\\frac{center_x - s}{2}}{W}\n \\f]\n \\f[\n ymin = \\frac{\\frac{center_y - s}{2}}{H}\n \\f]\n \\f[\n xmax = \\frac{\\frac{center_x + s}{2}}{W}\n \\f]\n \\f[\n ymax = \\frac{\\frac{center_y + s}{2}}{H}\n \\f]\n**Example**\n\n```html\n<layer ... type=\"PriorBox\" ... >\n <data step=\"64.000000\" min_size=\"162.000000\" max_size=\"213.000000\" offset=\"0.500000\" flip=\"1\" clip=\"0\" aspect_ratio=\"2.000000,3.000000\" variance=\"0.100000,0.100000,0.200000,0.200000\" />\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1068. "attributes": [
  1069. {
  1070. "name": "min_size",
  1071. "required": true,
  1072. "type": "float32"
  1073. },
  1074. {
  1075. "name": "max_size",
  1076. "required": true,
  1077. "type": "float32"
  1078. },
  1079. {
  1080. "default": 1,
  1081. "description": "*aspect_ratio* is a variance of aspect ratios. Duplicate values are ignored. For example, *aspect_ratio* equal 2.000000,3.000000 means that for the first box aspect_ratio is equal to 2 and for the second box - 3.",
  1082. "name": "aspect_ratio",
  1083. "required": true,
  1084. "type": "float32"
  1085. },
  1086. {
  1087. "default": false,
  1088. "description": "*flip* is a flag that denotes that each *aspect_ratio* is duplicated and flipped. For example, *flip* equals 1 and *aspect_ratio* equals 3 mean that aspect_ratio is equal to 1/3.",
  1089. "name": "flip",
  1090. "required": true,
  1091. "type": "boolean"
  1092. },
  1093. {
  1094. "default": false,
  1095. "description": "*clip* is a flag that denotes if each value in the output blob is within [0,1]. For example, *clip* equal 1 means that each value in the output blob is within [0,1].",
  1096. "name": "clip",
  1097. "required": true,
  1098. "type": "boolean"
  1099. },
  1100. {
  1101. "description": "*step* is a distance between box centers. For example, *step* equal 85 means that the distance between neighborhood prior boxes centers is 85.",
  1102. "name": "step",
  1103. "required": true,
  1104. "type": "float32"
  1105. },
  1106. {
  1107. "default": 0.5,
  1108. "description": "*offset* is a shift of box respectively to top left corner. For example, *offset* equal 85 means that the shift of neighborhood prior boxes centers is 85.",
  1109. "name": "offset",
  1110. "required": true,
  1111. "type": "float32"
  1112. },
  1113. {
  1114. "description": "*variance* denotes a variance of adjusting bounding boxes. For example, *variance* equals 85 means that the shift of neighborhood prior boxes centers is 85.",
  1115. "name": "variance",
  1116. "required": true,
  1117. "type": "float32[]"
  1118. },
  1119. {
  1120. "default": 1,
  1121. "description": "*scale_all_sizes* is a flag that denotes type of inference. For example, *scale_all_sizes* equals 0 means that priorbox layer is inferred in MXNet-like manner. In particular, *max_size* parameter is ignored.",
  1122. "name": "scale_all_sizes",
  1123. "required": true,
  1124. "type": "int32"
  1125. }
  1126. ],
  1127. "inputs": [
  1128. { "name": "output_size" },
  1129. { "name": "image_size" }
  1130. ],
  1131. "status": "default"
  1132. },
  1133. {
  1134. "name": "PriorBoxClustered",
  1135. "description": "*PriorBoxClustered* layer generates prior boxes of specified sizes.\n**Parameters**: *PriorBoxClustered* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*PriorBoxClustered* computes coordinates of prior boxes by following:\n1. Calculates the *center_x* and *center_y* of prior box:\n \\f[\n W \\equiv Width \\quad Of \\quad Image\n \\f]\n \\f[\n H \\equiv Height \\quad Of \\quad Image\n \\f]\n \\f[\n center_x=(w+offset)*step\n \\f]\n \\f[\n center_y=(h+offset)*step\n \\f]\n \\f[\n w \\subset \\left( 0, W \\right )\n \\f]\n \\f[\n h \\subset \\left( 0, H \\right )\n \\f]\n2. For each \\f$s \\subset \\left( 0, W \\right )\\f$ calculates the prior boxes coordinates:\n \\f[\n xmin = \\frac{center_x - \\frac{width_s}{2}}{W}\n \\f]\n\t\\f[\n\tymin = \\frac{center_y - \\frac{height_s}{2}}{H}\n\t\\f]\n\t\\f[\n\txmax = \\frac{center_x + \\frac{width_s}{2}}{W}\n\t\\f]\n\t\\f[\n\tymax = \\frac{center_y + \\frac{height_s}{2}}{H}\n\t\\f]\nIf *clip* is defined, the coordinates of prior boxes are recalculated with the formula:\n\\f$coordinate = \\min(\\max(coordinate,0), 1)\\f$\n**Example**\n\n```html\n<layer ... type=\"PriorBoxClustered\">\n <data clip=\"0\" flip=\"0\" height=\"44.0,10.0,30.0,19.0,94.0,32.0,61.0,53.0,17.0\" offset=\"0.5\" step=\"16.0\" variance=\"0.1,0.1,0.2,0.2\"\n width=\"86.0,13.0,57.0,39.0,68.0,34.0,142.0,50.0,23.0\"/>\n <input>\n ...\n </input>\n <output>\n ...\n </output>\n</layer>\n```",
  1136. "attributes": [
  1137. {
  1138. "description": "*width* is a parameter that specifies desired boxes widths in pixels.",
  1139. "name": "width",
  1140. "required": true,
  1141. "type": "float32[]"
  1142. },
  1143. {
  1144. "name": "height",
  1145. "required": true,
  1146. "type": "float32[]"
  1147. },
  1148. {
  1149. "default": false,
  1150. "description": "*clip* is a flag that denotes if each value in the output blob is within [0,1]. For example, *clip* equal 1 means that each value in the output blob is within [0,1].",
  1151. "name": "clip",
  1152. "required": true,
  1153. "type": "boolean"
  1154. },
  1155. {
  1156. "default": false,
  1157. "description": "*flip* is a flag that denotes whether the list of boxes is augmented with the flipped ones.",
  1158. "name": "flip",
  1159. "required": true,
  1160. "type": "boolean"
  1161. },
  1162. {
  1163. "description": "*step* is a distance between box centers. For example, *step* equal 85 means that the distance between neighborhood prior boxes centers is 85.",
  1164. "name": "step",
  1165. "required": true,
  1166. "type": "float32"
  1167. },
  1168. {
  1169. "name": "step_w",
  1170. "required": true,
  1171. "type": "float32"
  1172. },
  1173. {
  1174. "name": "step_h",
  1175. "required": true,
  1176. "type": "float32"
  1177. },
  1178. {
  1179. "default": 1,
  1180. "description": "*offset* is a shift of box respectively to top left corner. For example, *offset* equal 85 means that the shift of neighborhood prior boxes centers is 85.",
  1181. "name": "offset",
  1182. "required": true,
  1183. "type": "float32"
  1184. },
  1185. {
  1186. "description": "*variance* denotes a variance of adjusting bounding boxes. For example, *variance* equal 85 means that the shift of neighborhood prior boxes centers is 85.",
  1187. "name": "variance",
  1188. "required": true,
  1189. "type": "float32[]"
  1190. },
  1191. {
  1192. "description": "*img_h* specifies height of input image. These parameters are calculated unless provided explicitly.",
  1193. "name": "img_h",
  1194. "required": true,
  1195. "type": "float32"
  1196. },
  1197. {
  1198. "name": "img_w",
  1199. "required": true,
  1200. "type": "float32"
  1201. }
  1202. ],
  1203. "status": "default"
  1204. },
  1205. {
  1206. "name": "Proposal",
  1207. "category": "Layer",
  1208. "description": "*Proposal* layer performs filtering of only those bounding boxes and outputs with the highest confidence of prediction.\n**Parameters**: Proposal layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*Proposal* layer accepts three inputs with four dimensions. The produced blob has two dimensions: first one equals *batch_size * post_nms_topn*.\n*Proposal* does the following with the input blob:\n1. Generates initial anchor boxes Left top corner of all boxes in (0, 0). Width and height of boxes are calculated from *base_size* with scale and ratio parameters\n2. For each point in the first input blob:\n * pins anchor boxes to the image according to the second input blob that contains four deltas for each box: for *x* and *y* of center, for *width* and for *height*\n * finds out score in the first input blob\n3. Filters out boxes with size less than *min_size*\n4. Sorts all proposals (*box*, *score*) by score from highest to lowest\n5. Takes top *pre_nms_topn* proposals\n6. Calculates intersections for boxes and filter out all with \\f$intersection/union > nms\\_thresh\\f$\n7. Takes top *post_nms_topn* proposals\n8. Returns top proposals\n**Example**\n\n```html\n<layer ... type=\"Proposal\" ... >\n <data base_size=\"16\" feat_stride=\"16\" min_size=\"16\" nms_thresh=\"0.6\" post_nms_topn=\"200\" pre_nms_topn=\"6000\"\n ratio=\"2.67\" scale=\"4.0,6.0,9.0,16.0,24.0,32.0\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1209. "attributes": [
  1210. {
  1211. "default": 1,
  1212. "description": "*pre_nms_topn (post_nms_topn)* is the quantity of bounding boxes before (after) applying NMS operation. For example, *pre_nms_topn (post_nms_topn)* equal 15 means that at most 15 bounding boxes are taken into consideration before (after) the NMS operation.",
  1213. "name": "pre_nms_topn (post_nms_topn)",
  1214. "required": true,
  1215. "type": "int32"
  1216. },
  1217. {
  1218. "default": 1,
  1219. "description": "*nms_thresh* is the minimum value of the proposal to be taken into consideration. For example, *nms_thresh* equal 0.5 means that all boxes with prediction probability less than 0.5 are filtered out.",
  1220. "name": "nms_thresh",
  1221. "required": true,
  1222. "type": "float32"
  1223. },
  1224. {
  1225. "default": 1,
  1226. "description": "*feat_stride* is the step size to slide over boxes (in pixels). For example, *feat_stride* equal 16 means that all boxes are analyzed with the slide 16.",
  1227. "name": "feat_stride",
  1228. "required": true,
  1229. "type": "int32"
  1230. },
  1231. {
  1232. "default": 1,
  1233. "description": "*min_size* is the minimum size of box to be taken into consideration. For example, *min_size* equal 35 means that all boxes with box size less than 35 are filtered out.",
  1234. "name": "min_size",
  1235. "required": true,
  1236. "type": "int32"
  1237. },
  1238. {
  1239. "default": 1,
  1240. "description": "*ratio* is the ratios for anchor generation.",
  1241. "name": "ratio",
  1242. "required": true,
  1243. "type": "float32[]"
  1244. },
  1252. {
  1253. "default": 1,
  1254. "description": "*scale* is the scales for anchor generation.",
  1255. "name": "scale",
  1256. "required": true,
  1257. "type": "float32[]"
  1258. }
  1259. ],
  1260. "status": "default"
  1261. },
  1262. {
  1263. "name": "PSROIPooling",
  1264. "category": "Pool",
  1265. "description": "*PSROIPooling* layer computes position-sensitive max pooling on regions of interest specified by input, takes as input N position-sensitive score maps and a list of R regions of interest.\n**Detailed description**: [Reference](https://arxiv.org/pdf/1703.06211.pdf)\n**Parameters**: *PSRoiPooling* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\nThe output value for \\f$(i, j)\\f$-th bin is obtained by summation from one score map \\f$x_{i,j}\\f$ corresponding to that bin. In short, the difference from *RoIPooling* is that a general feature map \\f$x\\f$ is replaced by a specific positive-sensitive score map \\f$x_{i,j}\\f$.\n**Example**\n\n```html\n<layer ... type=\"PSROIPooling\" ... >\n <data output_dim=\"10\" out_max_val=\"1\" spatial_scale=\"0.1\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1266. "attributes": [
  1267. {
  1268. "default": 1,
  1269. "description": " pooled output channel number",
  1270. "name": "output_dim",
  1271. "required": true,
  1272. "type": "int32"
  1273. },
  1274. {
  1275. "default": 1,
  1276. "description": " number of groups to encode position-sensitive score maps",
  1277. "name": "group_size",
  1278. "required": true,
  1279. "type": "int32"
  1280. },
  1281. {
  1282. "default": 1,
  1283. "description": " multiplicative spatial scale factor to translate ROI coordinates from their input scale to the scale used when pooling",
  1284. "name": "spatial_scale",
  1285. "required": true,
  1286. "type": "float32"
  1287. }
  1288. ],
  1289. "status": "default"
  1290. },
  1291. {
  1292. "name": "Range",
  1293. "inputs": [
  1294. { "name": "start" },
  1295. { "name": "stop" },
  1296. { "name": "step" }
  1297. ]
  1298. },
  1299. {
  1300. "name": "RegionYolo",
  1301. "category": "Layer",
  1302. "description": "*RegionYolo* computes coordinates of regions with probability for each class.\n**Detailed description**: [Reference][p_yolo]\n**Parameters**: *RegionYolo* layer parameters should be specified as the `data` node, which is a child of the `layer` node.\n**Example**\n\n```html\n<layer ... type=\"RegionYolo\" ... >\n <data bias=\"1.0\"/>\n <input> ... </input>\n <output> ... </output>\n <weights .../>\n</layer>\n```",
  1303. "attributes": [
  1304. {
  1305. "default": 1,
  1306. "description": "*coords* is num coordinates for each region",
  1307. "name": "coords",
  1308. "required": true,
  1309. "type": "int32"
  1310. },
  1311. {
  1312. "default": 1,
  1313. "description": "*classes* is num classes for each region",
  1314. "name": "classes",
  1315. "required": true,
  1316. "type": "int32"
  1317. },
  1318. {
  1319. "default": 1,
  1320. "description": "*num* is number of regions",
  1321. "name": "num",
  1322. "required": true,
  1323. "type": "int32"
  1324. },
  1325. {
  1326. "default": 1,
  1327. "description": "*do_softmax* is a flag which specifies the method of infer",
  1328. "name": "do_softmax",
  1329. "required": true,
  1330. "type": "int32"
  1331. },
  1332. {
  1333. "default": 1,
  1334. "description": "*anchors* coordinates regions",
  1335. "name": "anchors",
  1336. "required": true,
  1337. "type": "float32[]"
  1338. },
  1339. {
  1340. "default": 1,
  1341. "description": "*mask* specifies which anchors to use",
  1342. "name": "mask",
  1343. "required": true,
  1344. "type": "int32[]"
  1345. },
  1353. {
  1354. "default": 1,
  1355. "description": "*axis* is the number of the dimension from which flattening is performed. For example, *axis* equals 1 means that flattening is started from the 1st dimension.",
  1356. "name": "axis",
  1357. "required": true,
  1358. "type": "int32"
  1359. },
  1360. {
  1361. "default": 1,
  1362. "description": "*end_axis* is the number of the dimension on which flattening is ended. For example, *end_axis* equals -1 means that flattening is performed till the last dimension.",
  1363. "name": "end_axis",
  1364. "required": true,
  1365. "type": "int32"
  1366. }
  1367. ],
  1368. "status": "default"
  1369. },
  1370. {
  1371. "name": "ReLU",
  1372. "category": "Activation",
  1373. "description": "[Reference](http://caffe.berkeleyvision.org/tutorial/layers/relu.html)\n**Detailed description**: [Reference](https://github.com/Kulbear/deep-learning-nano-foundation/wiki/ReLU-and-Softmax-Activation-Functions#rectified-linear-units)\n**Parameters**: *ReLU* layer parameters can be (not mandatory) specified in the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n\\f[\nY_{i}^{( l )} = max(0, Y_{i}^{( l - 1 )})\n\\f]\n**Example**\n\n```html\n<layer ... type=\"ReLU\" ... >\n <data negative_slope=\"0.100000\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1374. "attributes": [
  1375. {
  1376. "default": 0,
  1377. "description": "*negative_slope* is a multiplier, which is used if the unit is not active (that is negative). For example, *negative_slope* equal 0.1 means that an inactive unit value would be multiplied by 0.1 and this is the [Leaky ReLU](https://keras.io/layers/advanced-activations/#leakyrelu). If *negative_slope* is equal to 0, this is the usual *ReLU*.",
  1378. "name": "negative_slope",
  1379. "required": true,
  1380. "type": "float64"
  1381. }
  1382. ],
  1383. "status": "default"
  1384. },
  1385. {
  1386. "name": "ReorgYolo",
  1387. "category": "Layer",
  1388. "description": "*ReorgYolo* reorganizes input blob taking into account strides.\n**Detailed description**: [Reference][p_yolo]\n**Parameters**: *ReorgYolo* layer parameters should be specified as the `data` node, which is a child of the `layer` node.\n**Example**\n\n```html\n<layer ... type=\"ReorgYolo\" ... >\n <data stride=\"1\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1389. "attributes": [
  1390. {
  1391. "default": 1,
  1392. "description": "*stride* is distance of cut throws in output blobs.",
  1393. "name": "stride",
  1394. "required": true,
  1395. "type": "int32"
  1396. }
  1397. ],
  1398. "status": "default"
  1399. },
  1400. {
  1401. "name": "Resample",
  1402. "category": "Layer",
  1403. "description": "Layer scales the input blob by the specified parameters.\n**Parameters**: Resample layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Example**\n\n```html\n<layer type=\"Resample\">\n <data antialias=\"0\" factor=\"1.0\" height=\"227\" type=\"caffe.ResampleParameter.LINEAR\" width=\"227\"/>\n <input>\n ...\n </input>\n <output>\n ...\n </output>\n</layer>\n```",
  1404. "attributes": [
  1405. {
  1406. "default": 1,
  1407. "description": "Parameter specifies type of blob interpolation.",
  1408. "name": "type",
  1409. "required": true
  1410. },
  1411. {
  1412. "default": 1,
  1413. "description": "*antialias* is a flag that denotes whether to perform anti-aliasing.",
  1414. "name": "antialias",
  1415. "required": true
  1416. }
  1417. ],
  1418. "status": "default"
  1419. },
  1420. {
  1421. "name": "ReduceMax",
  1422. "description": "*ReduceMax* operation performs the reduction with finding the maximum value on a given input data along dimensions specified by axes input.",
  1423. "inputs": [
  1424. { "name": "data" },
  1425. { "name": "axes" }
  1426. ]
  1427. },
  1428. {
  1429. "name": "ReduceMean",
  1430. "inputs": [
  1431. { "name": "data" },
  1432. { "name": "axes" }
  1433. ]
  1434. },
  1435. {
  1436. "name": "Reshape",
  1437. "category": "Shape",
  1438. "description": "*Reshape* layer changes dimensions of the input blob according to the specified order. Input blob volume is equal to output blob volume, where volume is the product of dimensions.\n**Detailed description**: [Reference](http://caffe.berkeleyvision.org/tutorial/layers/reshape.html)\n**Parameters**: *Reshape* layer parameters should be specified in the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\nIf you want to reshape input blob *BxCxHxW* into *Bx1x(C*H)xW*, the *dim* parameters of your layer should be:\n```html\n layer {\n name: \"reshape\"\n type: \"Reshape\"\n bottom: \"input\"\n top: \"output\"\n reshape_param {\n shape {\n dim: 0 # copy the dimension from below\n dim: 1\n dim: -1 # infer it from the other dimensions\n dim: 0\n }\n }\n }\n```\n**Example**\n\n```html\n<layer ... type=\"Reshape\" ... >\n <data axis=\"0\" dim=\"1, 1001\" num_axes=\"-1\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1439. "attributes": [
  1440. {
  1441. "default": 1,
  1442. "description": "*axis* is the number of the starting axis for reshape. For example, *axis* equal 1 means that *Reshape* replaces dimensions starting from the next after the first dimension.",
  1443. "name": "axis",
  1444. "required": true,
  1445. "type": "int32"
  1446. },
  1447. {
  1448. "description": "*dim* is a set of numbers separated with comma, which denote the dimensions of output blob. For example, *dim* equal 88,1,71 means that output blob gets following dimensions: first dimension equals 88, second dimension equals 1, third dimension equals 71. For more information, refer to the **Description** block. If *dim* is equal to two numbers, it performs [flattening](http://caffe.berkeleyvision.org/tutorial/layers/flatten.html).",
  1449. "name": "dim",
  1450. "required": true,
  1451. "type": "int32[]"
  1452. },
  1453. {
  1454. "default": 1,
  1455. "description": "*num_axes* is the number of dimensions to be replaced with a reshaped blob starting from the dimension number specified in *axis* property. For example, *num_axes* equal 2 means that 2 dimensions are replaced with reshaped blob.",
  1456. "name": "num_axes",
  1457. "required": true,
  1458. "type": "int32"
  1459. },
  1460. {
  1461. "name": "special_zero",
  1462. "type": "boolean"
  1463. }
  1464. ],
  1465. "inputs": [
  1466. { "name": "data" },
  1467. { "name": "shape" }
  1468. ],
  1469. "status": "default"
  1470. },
  1471. {
  1472. "name": "ROIPooling",
  1473. "category": "Layer",
  1474. "description": "It is a *pooling layer* with *max* pooling strategy (see *max* option in the *<a href=\"IRLayersCatalogSpec.html#pooling-layer\">Pooling layer</a>* parameters description). It is used over feature maps of non-uniform sizes and outputs another feature map of a fixed size.\n**Detailed description**: [deepsense.io reference](https://blog.deepsense.ai/region-of-interest-pooling-explained/)\n**Parameters**: Specify *ROIPooling* layer parameters in the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n\\f[\noutput_{j} = MAX\\{ x_{0}, ... x_{i}\\}\n\\f]\n**Example**\n\n```html\n<layer ... type=\"ROIPooling\" ... >\n <data pooled_h=\"6\" pooled_w=\"6\" spatial_scale=\"0.062500\"/>\n <input> ... </input>\n <output> ... </output>\n </layer>\n```",
  1475. "attributes": [
  1476. {
  1477. "default": 1,
  1478. "description": "*pooled_h* is a height of the ROI output feature map. For example, *pooled_h* equal 6 means that the height of the output of *ROIpooling* is 6.",
  1479. "name": "pooled_h",
  1480. "required": true,
  1481. "type": "int32"
  1482. },
  1483. {
  1484. "default": 1,
  1485. "description": "*pooled_w* is a width of the ROI output feature map. For example, *pooled_w* equal 6 means that the width of the output of *ROIpooling* is 6.",
  1486. "name": "pooled_w",
  1487. "required": true,
  1488. "type": "int32"
  1489. },
  1490. {
  1491. "default": 1,
  1492. "description": "*spatial_scale* is a ratio of the input feature map over the input image size.",
  1493. "name": "spatial_scale",
  1494. "required": true,
  1495. "type": "float32"
  1496. }
  1497. ],
  1498. "status": "default"
  1499. },
  1500. {
  1501. "name": "ScaleShift",
  1502. "category": "Layer",
  1503. "description": "*ScaleShift* layer performs linear transformation of the input blobs. Weights denote scaling parameter, biases - a shift.\n**Parameters**: *ScaleShift* layer does not have additional parameters.\n**Mathematical Formulation**\n\\f[\no_{i} =\\gamma b_{i} + \\beta\n\\f]\n**Example**\n\n```\n<layer ... type=\"ScaleShift\" ... >\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1504. "status": "default"
  1505. },
  1506. {
  1507. "name": "Sigmoid",
  1508. "category": "Activation"
  1509. },
  1510. {
  1511. "name": "SimplerNMS",
  1512. "category": "Layer",
  1513. "description": "*SimplerNMS* layer performs filtering of bounding boxes and outputs only those with the highest confidence of prediction.\n**Parameters**: *SimplerNMS* layer parameters should be specified as the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*SimplerNMS* accepts three inputs with four dimensions. Produced blob has two dimensions, the first one equals *post_nms_topn*.\n*SimplerNMS* does the following with the input blob:\n1. Generates initial anchor boxes. Left top corner of all boxes is (0, 0). Width and height of boxes are calculated based on scaled (according to the scale parameter) default widths and heights\n2. For each point in the first input blob:\n * pins anchor boxes to picture according to the second input blob, which contains four deltas for each box: for x and y of center, for width, and for height\n * finds out score in the first input blob\n3. Filters out boxes with size less than *min_bbox_size.*\n4. Sorts all proposals (*box, score*) by score from highest to lowest\n5. Takes top *pre_nms_topn* proposals\n6. Calculates intersections for boxes and filters out all with \\f$intersection/union > iou\\_threshold\\f$\n7. Takes top *post_nms_topn* proposals\n8. Returns top proposals\n**Example**\n\n```html\n<layer ... type=\"SimplerNMS\" ... >\n <data cls_threshold=\"0.500000\" iou_threshold=\"0.700000\" min_bbox_size=\"16\" feat_stride=\"16\" pre_nms_topn=\"6000\" post_nms_topn=\"150\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1514. "attributes": [
  1515. {
  1516. "default": 1,
  1517. "description": "*pre_nms_topn (post_nms_topn)* is the quantity of bounding boxes before (after) applying NMS operation. For example, *pre_nms_topn (post_nms_topn)* equals 15 means that at most 15 bounding boxes are taken into consideration before (after) the NMS operation.",
  1518. "name": "pre_nms_topn (post_nms_topn)",
  1519. "required": true,
  1520. "type": "int32"
  1521. },
  1522. {
  1523. "default": 1,
  1524. "description": "*cls_threshold* is the minimum value of the proposal to be taken into consideration. For example, *cls_threshold* equal 0.5 means that all boxes with prediction probability less than 0.5 are filtered out.",
  1525. "name": "cls_threshold",
  1526. "required": true,
  1527. "type": "float32"
  1528. },
  1529. {
  1530. "default": 1,
  1531. "description": "*iou_threshold* is the minimum ratio of boxes overlapping to be taken into consideration. For example, *iou_threshold* equal 0.7 means that all boxes with overlapping ratio less than 0.7 are filtered out.",
  1532. "name": "iou_threshold",
  1533. "required": true,
  1534. "type": "float32"
  1535. },
  1536. {
  1537. "default": 1,
  1538. "description": "*feat_stride* is the step size to slide over boxes (in pixels). For example, *feat_stride* equal 16 means that all boxes are analyzed with the slide 16.",
  1539. "name": "feat_stride",
  1540. "required": true,
  1541. "type": "int32"
  1542. },
  1543. {
  1544. "default": 1,
  1545. "description": "*min_bbox_size* is the minimum size of box to be taken into consideration. For example, *min_bbox_size* equal 35 means that all boxes with box size less than 35 are filtered out.",
  1546. "name": "min_bbox_size",
  1547. "required": true,
  1548. "type": "int32"
  1549. },
  1550. {
  1551. "default": 1,
  1552. "description": "*scale* is array of scales for anchor boxes generating.",
  1553. "name": "scale",
  1554. "required": true,
  1555. "type": "float32[]"
  1556. }
  1557. ],
  1558. "status": "default"
  1559. },
  1560. {
  1561. "name": "Subtract",
  1562. "inputs": [
  1563. { "name": "A" },
  1564. { "name": "B" }
  1565. ],
  1566. "outputs": [
  1567. { "name": "C" }
  1568. ]
  1569. },
  1570. {
  1571. "name": "SoftMax",
  1572. "category": "Activation",
  1573. "description": "[Reference](https://github.com/Kulbear/deep-learning-nano-foundation/wiki/ReLU-and-Softmax-Activation-Functions#softmax)\n**Detailed description**: [Reference](http://cs231n.github.io/linear-classify/#softmax)\n**Parameters**: *SoftMax* layer parameters can be (not mandatory) specified in the `data` node, which is a child of the layer node.\n**Mathematical Formulation**\n\\f[\ny_{c} = \\frac{e^{Z_{c}}}{\\sum_{d=1}^{C}e^{Z_{d}}}\n\\f]\nwhere \\f$C\\f$ is a number of classes\n**Example**\n\n```html\n<layer ... type=\"SoftMax\" ... >\n <data axis=\"1\" />\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1574. "attributes": [
  1575. {
  1576. "description": "*axis* represents the axis of which the *SoftMax* is calculated. *axis* equal 1 is a default value.",
  1577. "name": "axis",
  1578. "required": true,
  1579. "type": "int32"
  1580. }
  1581. ],
  1582. "status": "default"
  1583. },
  1584. {
  1585. "name": "Split",
  1586. "category": "Tensor",
  1587. "description": "*Split* layer splits the input into several output groups. Group sizes are denoted by the number and the size of output ports.\n**Detailed description**: [Reference](http://caffe.berkeleyvision.org/tutorial/layers/split.html)\n**Parameters**: *None*\n**Mathematical Formulation**\nSplits input blob among children. For example, blob is *BxC+CxHxW* and there are two children. Then, output blob is *BxCxHxW*.\n**Example**\n\n```html\n<layer ... type=\"Split\" ... >\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1588. "attributes": [
  1589. {
  1590. "name": "axis",
  1591. "type": "int32"
  1592. }
  1593. ],
  1594. "status": "default"
  1595. },
  1596. {
  1597. "name": "Squeeze",
  1598. "category": "Transform",
  1599. "inputs": [
  1600. { "name": "input" },
  1601. { "name": "axes" }
  1602. ]
  1603. },
  1604. {
  1605. "name": "StridedSlice",
  1606. "category": "Tensor",
  1607. "inputs": [
  1608. { "name": "data" },
  1609. { "name": "begin" },
  1610. { "name": "end" },
  1611. { "name": "stride" }
  1612. ]
  1613. },
  1614. {
  1615. "name": "Swish",
  1616. "category": "Activation"
  1617. },
  1618. {
  1619. "name": "TensorIterator",
  1620. "description": "*TensorIterator* layer performs recurrent execution of the network, which is described in the **body**, iterating through the data.",
  1621. "inputs": [
  1622. { "name": "inputs", "type": "Tensor[]" }
  1623. ],
  1624. "outputs": [
  1625. { "name": "outputs", "type": "Tensor[]" }
  1626. ]
  1627. },
  1628. {
  1629. "name": "Tile",
  1630. "description": "*Tile* layer extends input blob with copies of data along specific axis.\n**Detailed description**: [Reference](http://caffe.help/manual/layers/tile.html)\n**Parameters**: *Tile* layer parameters should be specified as the `tile_data` node, which is a child of the layer node.\n**Mathematical Formulation**\n*Tile* extends input blobs and filling in output blobs following rules:\n\\f[\nout_i=input_i[inner\\_dim*t]\n\\f]\n\\f[\nt \\in \\left ( 0, \\quad tiles \\right )\n\\f]\n**Example**\n\n```html\n<layer ... type=\"Tile\" ... >\n <tile_data axis=\"3\" tiles=\"88\"/>\n <input> ... </input>\n <output> ... </output>\n</layer>\n```",
  1631. "attributes": [
  1632. {
  1633. "default": 1,
  1634. "description": "*axis* is the index of the axis to tile. For example, *axis* equals 3 means that fourth axis is used for tiling.",
  1635. "name": "axis",
  1636. "required": true,
  1637. "type": "int32"
  1638. },
  1639. {
  1640. "description": "*tiles* is a size of the specified axis in the output blob. For example, *tiles* equal 88 means that output blob gets 88 copies of data from specified axis.",
  1641. "name": "tiles",
  1642. "required": true,
  1643. "type": "int32"
  1644. }
  1645. ],
  1646. "status": "default"
  1647. },
  1648. {
  1649. "name": "Transpose",
  1650. "category": "Transform",
  1651. "inputs": [
  1652. { "name": "arg" },
  1653. { "name": "input_order" }
  1654. ]
  1655. },
  1656. {
  1657. "name": "Unsqueeze",
  1658. "category": "Transform",
  1659. "inputs": [
  1660. { "name": "input" },
  1661. { "name": "axes" }
  1662. ]
  1663. }
  1664. ]