Kaynağa Gözat

add neg_, pow_ and MSELoss

JasonWang 6 yıl önce
ebeveyn
işleme
1506cb8974

+ 11 - 1
README.md

@@ -1,2 +1,12 @@
 # traph
-Traph is a open-source machine learning platform.
+Traph is an open-source `toy` machine learning platform. It is inspired by torch.
+
+# Installation
+To install the CPU-only traph:
+`pip install pytraph`
+
+# Compilation
+
+# License
+MIT
+

+ 6 - 0
traph/include/traph/core/tensor.h

@@ -35,10 +35,12 @@ namespace traph
         virtual device_id device() = 0;
         virtual std::shared_ptr<TensorInterface> inverse() const = 0;
         virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const = 0;
+        virtual void neg_() = 0;
         virtual idx_type offset() const = 0;
 		virtual layout_type order() const = 0;
         // virtual std::shared_ptr<TensorInterface> permute(const DimVector& dims) const = 0;
         virtual platform_type platform() = 0;
+        virtual void pow_(f32 exp) = 0;
         virtual void reshape_(const DimVector& dims) = 0;
         virtual void resize_(const DimVector& dims) = 0;
         virtual std::shared_ptr<TensorInterface> select(const SliceVector& slice) const = 0;
@@ -47,6 +49,7 @@ namespace traph
 		virtual idx_type size(idx_type i) const = 0;
 		virtual DimVector stride() const = 0;
 		virtual idx_type stride(idx_type i) const = 0;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) = 0;
         virtual shared_pointer sum() const = 0;
         virtual std::string to_string() const = 0;
         virtual void transpose_(idx_type dim0, idx_type dim1) = 0;
@@ -84,9 +87,11 @@ namespace traph
         virtual std::shared_ptr<TensorInterface> inverse() const = 0;
         virtual T item() const = 0;
         virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const = 0;
+        virtual void neg_() = 0;
         virtual idx_type offset() const = 0;
 		virtual layout_type order() const = 0;
         virtual platform_type platform() = 0;
+        virtual void pow_(f32 exp) = 0;
         virtual T reduce_(std::function<T(T,T)> f) const = 0;
         virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<T(T,T)> f) const = 0;
         virtual void reshape_(const DimVector& dims) = 0;
@@ -98,6 +103,7 @@ namespace traph
         virtual std::shared_ptr<StorageBase<T>> storage() const = 0;
 		virtual DimVector stride() const = 0;
 		virtual idx_type stride(idx_type i) const = 0;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) = 0;
         virtual TensorInterfacePtr sum() const = 0;
         virtual std::string to_string() const = 0;
         virtual void transpose_(idx_type dim0, idx_type dim1) = 0;

+ 25 - 0
traph/include/traph/nn/function.h

@@ -206,6 +206,31 @@ namespace traph
 		return result;
 	}
 
+	VariableInterfacePtr sub(VariableInterfacePtr left, VariableInterfacePtr right)
+	{
+		DimVector result_dim;
+
+        VariableInterfacePtr result = left->new_empty(result_dim, true);
+		std::shared_ptr<SubOp> op(new SubOp);
+		result->data_(op->forward({ left->data(), right->data() }));
+		result->leaf_(false);
+		if (left->requires_grad() || right->requires_grad())
+		{
+			std::vector<VariableInterfacePtr> result_inputs{ left, right };
+			result->grad_(result->data()->create_grad());
+			result->grad()->fill_(0);
+			result->requires_grad_(true);
+			result->grad_fn_(op);
+			result->inputs_(result_inputs);
+		}
+		else
+		{
+			result->requires_grad_(false);
+		}
+
+		return result;
+	}
+
 	VariableInterfacePtr transpose(VariableInterfacePtr input, idx_type dim0, idx_type dim1)
 	{
 		DimVector result_dim;

+ 42 - 0
traph/include/traph/nn/layers/linear.h

@@ -0,0 +1,42 @@
+#ifndef TRAPH_NN_LAYERS_LINEAR
+#define TRAPH_NN_LAYERS_LINEAR
+
+
+#include <traph/nn/module.h>
+
+namespace traph
+{
+    class Linear: public Module
+    {
+    private:
+        int _in_features;
+        int _out_features;
+        std::shared_ptr<VariableInterface> _weight;
+        std::shared_ptr<VariableInterface> _bias;
+    public:
+        Linear(int in_features, int out_features, bool bias)
+        {
+            _in_features = in_features;
+            _out_features = out_features;
+            _weight = std::shared_ptr<VariableInterface>(new FloatParameter({out_features, in_features}));
+            if(bias)
+                _bias = std::shared_ptr<VariableInterface>(new FloatParameter({out_features}));
+            
+            register_parameter("weight", std::dynamic_pointer_cast<FloatParameter>(_weight));
+            register_parameter("bias", std::dynamic_pointer_cast<FloatParameter>(_bias));
+        }
+
+        std::shared_ptr<VariableInterface> forward(std::shared_ptr<VariableInterface> input)
+        {
+            std::shared_ptr<VariableInterface> result;
+            if(_bias)
+                result = add(matmul(input, transpose(_weight, 0, 1)), _bias);
+            else
+                result = matmul(input, transpose(_weight, 0, 1));
+            
+            return result;
+        }
+    };
+}
+
+#endif // TRAPH_NN_LAYERS_LINEAR

+ 46 - 0
traph/include/traph/nn/layers/loss.h

@@ -0,0 +1,46 @@
+#ifndef TRAPH_NN_LAYERS_LOSS
+#define TRAPH_NN_LAYERS_LOSS
+
+#include <traph/nn/module.h>
+
+namespace traph
+{
+    enum class MSELossReduction
+    {
+        NONE,
+        MEAN,
+        SUM
+    };
+
+    class MSELoss: public Module
+    {
+    private:
+        MSELossReduction _reduction;
+    public:
+        MSELoss(MSELossReduction reduction = MSELossReduction::MEAN)
+            :_reduction(reduction)
+        {
+        }
+
+        std::shared_ptr<VariableInterface> forward(std::shared_ptr<VariableInterface> input, std::shared_ptr<VariableInterface> target)
+        {
+            std::shared_ptr<VariableInterface> ret;
+            if(_reduction == MSELossReduction::SUM)
+            {
+                ret = sum(sub(input, target));
+            }
+            else if(_reduction == MSELossReduction::MEAN)
+            {
+                // fixme: use mean if it impled
+                ret = sum(sub(input, target));
+            }
+            else
+            {
+                ret = sum(sub(input, target));
+            }
+            return ret;
+        }
+    };
+}
+
+#endif // TRAPH_NN_LAYERS_LOSS

+ 0 - 32
traph/include/traph/nn/module.h

@@ -40,38 +40,6 @@ namespace traph
             _parameters[name] = param;
         }
     };
-
-    class LinearModule: public Module
-    {
-    private:
-        int _in_features;
-        int _out_features;
-        std::shared_ptr<VariableInterface> _weight;
-        std::shared_ptr<VariableInterface> _bias;
-    public:
-        LinearModule(int in_features, int out_features, bool bias)
-        {
-            _in_features = in_features;
-            _out_features = out_features;
-            _weight = std::shared_ptr<VariableInterface>(new FloatParameter({out_features, in_features}));
-            if(bias)
-                _bias = std::shared_ptr<VariableInterface>(new FloatParameter({out_features}));
-            
-            register_parameter("weight", std::dynamic_pointer_cast<FloatParameter>(_weight));
-            register_parameter("bias", std::dynamic_pointer_cast<FloatParameter>(_bias));
-        }
-
-        std::shared_ptr<VariableInterface> forward(std::shared_ptr<VariableInterface> input)
-        {
-            std::shared_ptr<VariableInterface> result;
-            if(_bias)
-                result = add(matmul(input, transpose(_weight, 0, 1)), _bias);
-            else
-                result = matmul(input, transpose(_weight, 0, 1));
-            
-            return result;
-        }
-    };
 } // traph
 
 #endif

+ 24 - 0
traph/include/traph/nn/operation.h

@@ -162,6 +162,30 @@ namespace traph
 		}
 	};
 
+	class SubOp : public OpBase
+	{
+	public:
+		virtual TensorInterfacePtr forward(std::vector<TensorInterfacePtr> inputs) override
+		{
+			assert(inputs.size() == 2);
+
+			TensorInterfacePtr left_input = inputs[0];
+			TensorInterfacePtr right_input = inputs[1];
+			TensorInterfacePtr result = left_input->clone();
+            result->sub_(right_input);
+
+			return result;
+		}
+
+		virtual std::vector<TensorBasePtr<f32>> backward(TensorBasePtr<f32> output_grad) override
+		{
+			auto left = output_grad;
+			auto right = output_grad->clone();
+			right->neg_();
+			return { output_grad, std::dynamic_pointer_cast<TensorBase<f32>>(right) };
+		}
+	};
+
 	class TransposeOp : public OpBase
 	{
 	private:

+ 4 - 3
traph/include/traph/nn/variable.h

@@ -189,9 +189,10 @@ namespace traph
 			std::vector<TensorBasePtr<f32>> back_grad = cur_node->grad_fn()->backward(cur_node->grad());
 
 			assert(back_grad.size() == cur_node->inputs().size());
-			for (int i = 0; i < cur_node->inputs().size(); ++i)
+			for (int j = 0; j < cur_node->inputs().size(); ++j)
 			{
-				cur_node->inputs()[i]->grad()->add_(back_grad[i]);
+				if(cur_node->inputs()[j]->requires_grad())
+					cur_node->inputs()[j]->grad()->add_(back_grad[j]);
 			}
 		}
 
@@ -260,7 +261,7 @@ namespace traph
     template<typename T>
     bool Variable<T>::is_leaf() const
     {
-        return _leaf;
+        return !_grad_fn;
     }
 
 	template<typename T>

+ 4 - 1
traph/include/traph/tensor/byte_tensor.h

@@ -66,9 +66,11 @@ namespace traph
 		virtual std::shared_ptr<TensorInterface> inverse() const override;
 		virtual u8 item() const override;
 		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
-		virtual idx_type offset() const override;
+		virtual void neg_() override;
+        virtual idx_type offset() const override;
 		virtual layout_type order() const override;
 		virtual platform_type platform() override;
+        virtual void pow_(f32 exp) override;
 		virtual u8 reduce_(std::function<u8(u8, u8)> f) const override;
 		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<u8(u8, u8)> f) const override;
 		virtual void reshape_(const DimVector& dims) override;
@@ -80,6 +82,7 @@ namespace traph
 		virtual std::shared_ptr<StorageBase<u8>> storage() const override;
 		virtual DimVector stride() const override;
 		virtual idx_type stride(idx_type i) const override;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) override;
 		virtual TensorInterfacePtr sum() const override;
 		virtual std::string to_string() const override;
         virtual void transpose_(idx_type dim0, idx_type dim1) override;

+ 4 - 1
traph/include/traph/tensor/char_tensor.h

@@ -66,9 +66,11 @@ namespace traph
 		virtual std::shared_ptr<TensorInterface> inverse() const override;
 		virtual i8 item() const override;
 		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
-		virtual idx_type offset() const override;
+		virtual void neg_() override;
+        virtual idx_type offset() const override;
 		virtual layout_type order() const override;
 		virtual platform_type platform() override;
+        virtual void pow_(f32 exp) override;
 		virtual i8 reduce_(std::function<i8(i8, i8)> f) const override;
 		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<i8(i8, i8)> f) const override;
 		virtual void reshape_(const DimVector& dims) override;
@@ -80,6 +82,7 @@ namespace traph
 		virtual std::shared_ptr<StorageBase<i8>> storage() const override;
 		virtual DimVector stride() const override;
 		virtual idx_type stride(idx_type i) const override;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) override;
 		virtual TensorInterfacePtr sum() const override;
 		virtual std::string to_string() const override;
         virtual void transpose_(idx_type dim0, idx_type dim1) override;

+ 4 - 1
traph/include/traph/tensor/double_tensor.h

@@ -66,9 +66,11 @@ namespace traph
 		virtual std::shared_ptr<TensorInterface> inverse() const override;
 		virtual f64 item() const override;
 		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
-		virtual idx_type offset() const override;
+		virtual void neg_() override;
+        virtual idx_type offset() const override;
 		virtual layout_type order() const override;
 		virtual platform_type platform() override;
+        virtual void pow_(f32 exp) override;
 		virtual f64 reduce_(std::function<f64(f64, f64)> f) const override;
 		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<f64(f64, f64)> f) const override;
 		virtual void reshape_(const DimVector& dims) override;
@@ -80,6 +82,7 @@ namespace traph
 		virtual std::shared_ptr<StorageBase<f64>> storage() const override;
 		virtual DimVector stride() const override;
 		virtual idx_type stride(idx_type i) const override;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) override;
 		virtual TensorInterfacePtr sum() const override;
 		virtual std::string to_string() const override;
         virtual void transpose_(idx_type dim0, idx_type dim1) override;

+ 4 - 1
traph/include/traph/tensor/float_tensor.h

@@ -67,9 +67,11 @@ namespace traph
 		virtual std::shared_ptr<TensorInterface> inverse() const override;
 		virtual f32 item() const override;
 		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
-		virtual idx_type offset() const override;
+		virtual void neg_() override;
+        virtual idx_type offset() const override;
 		virtual layout_type order() const override;
 		virtual platform_type platform() override;
+        virtual void pow_(f32 exp) override;
 		virtual f32 reduce_(std::function<f32(f32, f32)> f) const override;
 		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<f32(f32, f32)> f) const override;
 		virtual void reshape_(const DimVector& dims) override;
@@ -81,6 +83,7 @@ namespace traph
 		virtual std::shared_ptr<StorageBase<f32>> storage() const override;
 		virtual DimVector stride() const override;
 		virtual idx_type stride(idx_type i) const override;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) override;
 		virtual TensorInterfacePtr sum() const override;
 		virtual std::string to_string() const override;
         virtual void transpose_(idx_type dim0, idx_type dim1) override;

+ 4 - 1
traph/include/traph/tensor/int_tensor.h

@@ -66,9 +66,11 @@ namespace traph
 		virtual std::shared_ptr<TensorInterface> inverse() const override;
 		virtual i32 item() const override;
 		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
-		virtual idx_type offset() const override;
+		virtual void neg_() override;
+        virtual idx_type offset() const override;
 		virtual layout_type order() const override;
 		virtual platform_type platform() override;
+        virtual void pow_(f32 exp) override;
 		virtual i32 reduce_(std::function<i32(i32, i32)> f) const override;
 		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<i32(i32, i32)> f) const override;
 		virtual void reshape_(const DimVector& dims) override;
@@ -80,6 +82,7 @@ namespace traph
 		virtual std::shared_ptr<StorageBase<i32>> storage() const override;
 		virtual DimVector stride() const override;
 		virtual idx_type stride(idx_type i) const override;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) override;
 		virtual TensorInterfacePtr sum() const override;
 		virtual std::string to_string() const override;
         virtual void transpose_(idx_type dim0, idx_type dim1) override;

+ 3 - 0
traph/include/traph/tensor/long_tensor.h

@@ -66,9 +66,11 @@ namespace traph
 		virtual std::shared_ptr<TensorInterface> inverse() const override;
 		virtual i64 item() const override;
 		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
+		virtual void neg_() override;
 		virtual idx_type offset() const override;
 		virtual layout_type order() const override;
 		virtual platform_type platform() override;
+        virtual void pow_(f32 exp) override;
 		virtual i64 reduce_(std::function<i64(i64, i64)> f) const override;
 		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<i64(i64, i64)> f) const override;
 		virtual void reshape_(const DimVector& dims) override;
@@ -80,6 +82,7 @@ namespace traph
 		virtual std::shared_ptr<StorageBase<i64>> storage() const override;
 		virtual DimVector stride() const override;
 		virtual idx_type stride(idx_type i) const override;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) override;
 		virtual TensorInterfacePtr sum() const override;
 		virtual std::string to_string() const override;
         virtual void transpose_(idx_type dim0, idx_type dim1) override;

+ 4 - 1
traph/include/traph/tensor/short_tensor.h

@@ -66,9 +66,11 @@ namespace traph
 		virtual std::shared_ptr<TensorInterface> inverse() const override;
 		virtual i16 item() const override;
 		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
-		virtual idx_type offset() const override;
+		virtual void neg_() override;
+        virtual idx_type offset() const override;
 		virtual layout_type order() const override;
 		virtual platform_type platform() override;
+        virtual void pow_(f32 exp) override;
 		virtual i16 reduce_(std::function<i16(i16, i16)> f) const override;
 		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<i16(i16, i16)> f) const override;
 		virtual void reshape_(const DimVector& dims) override;
@@ -80,6 +82,7 @@ namespace traph
 		virtual std::shared_ptr<StorageBase<i16>> storage() const override;
 		virtual DimVector stride() const override;
 		virtual idx_type stride(idx_type i) const override;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) override;
 		virtual TensorInterfacePtr sum() const override;
 		virtual std::string to_string() const override;
         virtual void transpose_(idx_type dim0, idx_type dim1) override;

+ 4 - 1
traph/include/traph/tensor/tensor.h

@@ -68,9 +68,11 @@ namespace traph
         virtual std::shared_ptr<TensorInterface> inverse() const override;
         virtual T item() const override;
         virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
-		virtual idx_type offset() const override;
+		virtual void neg_() override;
+        virtual idx_type offset() const override;
 		virtual layout_type order() const override;
         virtual platform_type platform() override;
+        virtual void pow_(f32 exp) override;
         virtual T reduce_(std::function<T(T,T)> f) const override;
         virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<T(T,T)> f) const override;
         virtual void reshape_(const DimVector& dims) override;
@@ -82,6 +84,7 @@ namespace traph
         virtual std::shared_ptr<StorageBase<T>> storage() const override;
 		virtual DimVector stride() const override;
 		virtual idx_type stride(idx_type i) const override;
+        virtual void sub_(std::shared_ptr<TensorInterface> other) override;
         virtual TensorInterfacePtr sum() const override;
         virtual std::string to_string() const override;
         virtual void transpose_(idx_type dim0, idx_type dim1) override;

+ 45 - 0
traph/source/tensor/byte_tensor.cpp

@@ -257,12 +257,23 @@ namespace traph
 		return matmul_impl(*this, *right_matrix);
 	}
 
+    void Tensor<u8>::neg_()
+    {
+        apply_([](u8 a)->u8 {return -a; });
+    }
+
     idx_type Tensor<u8>::offset() const { return _offset; }
 
     layout_type Tensor<u8>::order() const { return _order; }
 
     platform_type Tensor<u8>::platform() { return platform_type::none; }
 
+    void Tensor<u8>::pow_(f32 exp)
+    {
+        std::int32_t exp_int = static_cast<std::int32_t>(exp);
+        apply_([&exp_int](u8 a)->u8 {return static_cast<u8>(std::pow(a, exp_int)); });
+    }
+
 	u8 Tensor<u8>::reduce_(std::function<u8(u8, u8)> f) const
     {
 		u8 result{};
@@ -362,6 +373,40 @@ namespace traph
 		else
 			throw std::runtime_error("Stride out of range");
 	}
+
+    void Tensor<u8>::sub_(std::shared_ptr<TensorInterface> other)
+    {
+        Tensor<u8> * lhs = this;
+		Tensor<u8> * rhs = dynamic_cast<Tensor<u8> *>(other.get());
+		std::function<void(Tensor<u8> *, Tensor<u8> *, idx_type, idx_type,idx_type, idx_type)> sub_impl =
+			[&](Tensor<u8> * lhs, Tensor<u8> * rhs, idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
+
+			auto lhs_storage = std::dynamic_pointer_cast<TensorStorage<u8>>(lhs->storage())->data_ptr();
+			auto rhs_storage = std::dynamic_pointer_cast<TensorStorage<u8>>(rhs->storage())->data_ptr();
+
+			if (lhs_dim < -(lhs->size().size()) && rhs_dim < -(rhs->size().size()))
+			{
+				lhs_storage[lhs_idx] -= rhs_storage[rhs_idx];
+				return;
+			}
+
+			idx_type lhs_shape_size = lhs_dim >= -(lhs->size().size())? lhs->size(lhs_dim) : 1;
+			idx_type rhs_shape_size = rhs_dim >= -(rhs->size().size()) ? rhs->size(rhs_dim) : 1;
+			idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
+
+			for (idx_type i = 0; i < max_shape_size; ++i)
+			{
+				sub_impl(lhs, rhs, lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
+
+				if(lhs_shape_size > 1)
+					lhs_idx += lhs->stride(lhs_dim);
+				if (rhs_shape_size > 1)
+					rhs_idx += rhs->stride(rhs_dim);
+			}
+		};
+
+		sub_impl(lhs, rhs, -1, -1, lhs->offset(), rhs->offset());
+    }
     
     TensorInterfacePtr Tensor<u8>::sum() const
     {

+ 45 - 0
traph/source/tensor/char_tensor.cpp

@@ -257,12 +257,23 @@ namespace traph
 		return matmul_impl(*this, *right_matrix);
 	}
 
+    void Tensor<i8>::neg_()
+    {
+        apply_([](i8 a)->i8 {return -a; });
+    }
+
     idx_type Tensor<i8>::offset() const { return _offset; }
 
     layout_type Tensor<i8>::order() const { return _order; }
 
     platform_type Tensor<i8>::platform() { return platform_type::none; }
 
+    void Tensor<i8>::pow_(f32 exp)
+    {
+        std::int32_t exp_int = static_cast<std::int32_t>(exp);
+        apply_([&exp_int](i8 a)->i8 {return static_cast<i8>(std::pow(a, exp_int)); });
+    }
+
 	i8 Tensor<i8>::reduce_(std::function<i8(i8, i8)> f) const
     {
 		i8 result{};
@@ -362,6 +373,40 @@ namespace traph
 		else
 			throw std::runtime_error("Stride out of range");
 	}
+
+    void Tensor<i8>::sub_(std::shared_ptr<TensorInterface> other)
+    {
+        Tensor<i8> * lhs = this;
+		Tensor<i8> * rhs = dynamic_cast<Tensor<i8> *>(other.get());
+		std::function<void(Tensor<i8> *, Tensor<i8> *, idx_type, idx_type,idx_type, idx_type)> sub_impl =
+			[&](Tensor<i8> * lhs, Tensor<i8> * rhs, idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
+
+			auto lhs_storage = std::dynamic_pointer_cast<TensorStorage<i8>>(lhs->storage())->data_ptr();
+			auto rhs_storage = std::dynamic_pointer_cast<TensorStorage<i8>>(rhs->storage())->data_ptr();
+
+			if (lhs_dim < -(lhs->size().size()) && rhs_dim < -(rhs->size().size()))
+			{
+				lhs_storage[lhs_idx] -= rhs_storage[rhs_idx];
+				return;
+			}
+
+			idx_type lhs_shape_size = lhs_dim >= -(lhs->size().size())? lhs->size(lhs_dim) : 1;
+			idx_type rhs_shape_size = rhs_dim >= -(rhs->size().size()) ? rhs->size(rhs_dim) : 1;
+			idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
+
+			for (idx_type i = 0; i < max_shape_size; ++i)
+			{
+				sub_impl(lhs, rhs, lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
+
+				if(lhs_shape_size > 1)
+					lhs_idx += lhs->stride(lhs_dim);
+				if (rhs_shape_size > 1)
+					rhs_idx += rhs->stride(rhs_dim);
+			}
+		};
+
+		sub_impl(lhs, rhs, -1, -1, lhs->offset(), rhs->offset());
+    }
     
     TensorInterfacePtr Tensor<i8>::sum() const
     {

+ 44 - 0
traph/source/tensor/double_tensor.cpp

@@ -257,12 +257,22 @@ namespace traph
 		return matmul_impl(*this, *right_matrix);
 	}
 
+    void Tensor<f64>::neg_()
+    {
+        apply_([](f64 a)->f64 {return -a; });
+    }
+
     idx_type Tensor<f64>::offset() const { return _offset; }
 
     layout_type Tensor<f64>::order() const { return _order; }
 
     platform_type Tensor<f64>::platform() { return platform_type::none; }
 
+    void Tensor<f64>::pow_(f32 exp)
+    {
+        apply_([&exp](f64 a)->f64 {return std::pow(a, exp); });
+    }
+
 	f64 Tensor<f64>::reduce_(std::function<f64(f64, f64)> f) const
     {
 		f64 result{};
@@ -362,6 +372,40 @@ namespace traph
 		else
 			throw std::runtime_error("Stride out of range");
 	}
+
+    void Tensor<f64>::sub_(std::shared_ptr<TensorInterface> other)
+    {
+        Tensor<f64> * lhs = this;
+		Tensor<f64> * rhs = dynamic_cast<Tensor<f64> *>(other.get());
+		std::function<void(Tensor<f64> *, Tensor<f64> *, idx_type, idx_type,idx_type, idx_type)> sub_impl =
+			[&](Tensor<f64> * lhs, Tensor<f64> * rhs, idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
+
+			auto lhs_storage = std::dynamic_pointer_cast<TensorStorage<f64>>(lhs->storage())->data_ptr();
+			auto rhs_storage = std::dynamic_pointer_cast<TensorStorage<f64>>(rhs->storage())->data_ptr();
+
+			if (lhs_dim < -(lhs->size().size()) && rhs_dim < -(rhs->size().size()))
+			{
+				lhs_storage[lhs_idx] -= rhs_storage[rhs_idx];
+				return;
+			}
+
+			idx_type lhs_shape_size = lhs_dim >= -(lhs->size().size())? lhs->size(lhs_dim) : 1;
+			idx_type rhs_shape_size = rhs_dim >= -(rhs->size().size()) ? rhs->size(rhs_dim) : 1;
+			idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
+
+			for (idx_type i = 0; i < max_shape_size; ++i)
+			{
+				sub_impl(lhs, rhs, lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
+
+				if(lhs_shape_size > 1)
+					lhs_idx += lhs->stride(lhs_dim);
+				if (rhs_shape_size > 1)
+					rhs_idx += rhs->stride(rhs_dim);
+			}
+		};
+
+		sub_impl(lhs, rhs, -1, -1, lhs->offset(), rhs->offset());
+    }
     
     TensorInterfacePtr Tensor<f64>::sum() const
     {

+ 44 - 0
traph/source/tensor/float_tensor.cpp

@@ -258,12 +258,22 @@ namespace traph
 		return matmul_impl(*this, *right_matrix);
 	}
 
+    void Tensor<f32>::neg_()
+    {
+        apply_([](f32 a)->f32 {return -a; });
+    }
+
     idx_type Tensor<f32>::offset() const { return _offset; }
 
     layout_type Tensor<f32>::order() const { return _order; }
 
     platform_type Tensor<f32>::platform() { return platform_type::none; }
 
+    void Tensor<f32>::pow_(f32 exp)
+    {
+        apply_([&exp](f32 a)->f32 {return std::pow(a, exp); });
+    }
+
 	f32 Tensor<f32>::reduce_(std::function<f32(f32, f32)> f) const
     {
 		f32 result{};
@@ -363,6 +373,40 @@ namespace traph
 		else
 			throw std::runtime_error("Stride out of range");
 	}
+
+    void Tensor<f32>::sub_(std::shared_ptr<TensorInterface> other)
+    {
+        Tensor<f32> * lhs = this;
+		Tensor<f32> * rhs = dynamic_cast<Tensor<f32> *>(other.get());
+		std::function<void(Tensor<f32> *, Tensor<f32> *, idx_type, idx_type,idx_type, idx_type)> sub_impl =
+			[&](Tensor<f32> * lhs, Tensor<f32> * rhs, idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
+
+			auto lhs_storage = std::dynamic_pointer_cast<TensorStorage<f32>>(lhs->storage())->data_ptr();
+			auto rhs_storage = std::dynamic_pointer_cast<TensorStorage<f32>>(rhs->storage())->data_ptr();
+
+			if (lhs_dim < -(lhs->size().size()) && rhs_dim < -(rhs->size().size()))
+			{
+				lhs_storage[lhs_idx] -= rhs_storage[rhs_idx];
+				return;
+			}
+
+			idx_type lhs_shape_size = lhs_dim >= -(lhs->size().size())? lhs->size(lhs_dim) : 1;
+			idx_type rhs_shape_size = rhs_dim >= -(rhs->size().size()) ? rhs->size(rhs_dim) : 1;
+			idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
+
+			for (idx_type i = 0; i < max_shape_size; ++i)
+			{
+				sub_impl(lhs, rhs, lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
+
+				if(lhs_shape_size > 1)
+					lhs_idx += lhs->stride(lhs_dim);
+				if (rhs_shape_size > 1)
+					rhs_idx += rhs->stride(rhs_dim);
+			}
+		};
+
+		sub_impl(lhs, rhs, -1, -1, lhs->offset(), rhs->offset());
+    }
     
     TensorInterfacePtr Tensor<f32>::sum() const
     {

+ 45 - 0
traph/source/tensor/int_tensor.cpp

@@ -257,12 +257,23 @@ namespace traph
 		return matmul_impl(*this, *right_matrix);
 	}
 
+    void Tensor<i32>::neg_()
+    {
+        apply_([](i32 a)->i32 {return -a; });
+    }
+
     idx_type Tensor<i32>::offset() const { return _offset; }
 
     layout_type Tensor<i32>::order() const { return _order; }
 
     platform_type Tensor<i32>::platform() { return platform_type::none; }
 
+    void Tensor<i32>::pow_(f32 exp)
+    {
+        std::int32_t exp_int = static_cast<std::int32_t>(exp);
+        apply_([&exp_int](i32 a)->i32 {return static_cast<i32>(std::pow(a, exp_int)); });
+    }
+
 	i32 Tensor<i32>::reduce_(std::function<i32(i32, i32)> f) const
     {
 		i32 result{};
@@ -362,6 +373,40 @@ namespace traph
 		else
 			throw std::runtime_error("Stride out of range");
 	}
+
+    void Tensor<i32>::sub_(std::shared_ptr<TensorInterface> other)
+    {
+        Tensor<i32> * lhs = this;
+		Tensor<i32> * rhs = dynamic_cast<Tensor<i32> *>(other.get());
+		std::function<void(Tensor<i32> *, Tensor<i32> *, idx_type, idx_type,idx_type, idx_type)> sub_impl =
+			[&](Tensor<i32> * lhs, Tensor<i32> * rhs, idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
+
+			auto lhs_storage = std::dynamic_pointer_cast<TensorStorage<i32>>(lhs->storage())->data_ptr();
+			auto rhs_storage = std::dynamic_pointer_cast<TensorStorage<i32>>(rhs->storage())->data_ptr();
+
+			if (lhs_dim < -(lhs->size().size()) && rhs_dim < -(rhs->size().size()))
+			{
+				lhs_storage[lhs_idx] -= rhs_storage[rhs_idx];
+				return;
+			}
+
+			idx_type lhs_shape_size = lhs_dim >= -(lhs->size().size())? lhs->size(lhs_dim) : 1;
+			idx_type rhs_shape_size = rhs_dim >= -(rhs->size().size()) ? rhs->size(rhs_dim) : 1;
+			idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
+
+			for (idx_type i = 0; i < max_shape_size; ++i)
+			{
+				sub_impl(lhs, rhs, lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
+
+				if(lhs_shape_size > 1)
+					lhs_idx += lhs->stride(lhs_dim);
+				if (rhs_shape_size > 1)
+					rhs_idx += rhs->stride(rhs_dim);
+			}
+		};
+
+		sub_impl(lhs, rhs, -1, -1, lhs->offset(), rhs->offset());
+    }
     
     TensorInterfacePtr Tensor<i32>::sum() const
     {

+ 45 - 0
traph/source/tensor/long_tensor.cpp

@@ -257,12 +257,23 @@ namespace traph
 		return matmul_impl(*this, *right_matrix);
 	}
 
+    void Tensor<i64>::neg_()
+    {
+        apply_([](i64 a)->i64 {return -a; });
+    }
+
     idx_type Tensor<i64>::offset() const { return _offset; }
 
     layout_type Tensor<i64>::order() const { return _order; }
 
     platform_type Tensor<i64>::platform() { return platform_type::none; }
 
+    void Tensor<i64>::pow_(f32 exp)
+    {
+        std::int32_t exp_int = static_cast<std::int32_t>(exp);
+        apply_([&exp_int](i64 a)->i64 {return static_cast<i64>(std::pow(a, exp_int)); });
+    }
+
 	i64 Tensor<i64>::reduce_(std::function<i64(i64, i64)> f) const
     {
 		i64 result{};
@@ -362,6 +373,40 @@ namespace traph
 		else
 			throw std::runtime_error("Stride out of range");
 	}
+
+    void Tensor<i64>::sub_(std::shared_ptr<TensorInterface> other)
+    {
+        Tensor<i64> * lhs = this;
+		Tensor<i64> * rhs = dynamic_cast<Tensor<i64> *>(other.get());
+		std::function<void(Tensor<i64> *, Tensor<i64> *, idx_type, idx_type,idx_type, idx_type)> sub_impl =
+			[&](Tensor<i64> * lhs, Tensor<i64> * rhs, idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
+
+			auto lhs_storage = std::dynamic_pointer_cast<TensorStorage<i64>>(lhs->storage())->data_ptr();
+			auto rhs_storage = std::dynamic_pointer_cast<TensorStorage<i64>>(rhs->storage())->data_ptr();
+
+			if (lhs_dim < -(lhs->size().size()) && rhs_dim < -(rhs->size().size()))
+			{
+				lhs_storage[lhs_idx] -= rhs_storage[rhs_idx];
+				return;
+			}
+
+			idx_type lhs_shape_size = lhs_dim >= -(lhs->size().size())? lhs->size(lhs_dim) : 1;
+			idx_type rhs_shape_size = rhs_dim >= -(rhs->size().size()) ? rhs->size(rhs_dim) : 1;
+			idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
+
+			for (idx_type i = 0; i < max_shape_size; ++i)
+			{
+				sub_impl(lhs, rhs, lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
+
+				if(lhs_shape_size > 1)
+					lhs_idx += lhs->stride(lhs_dim);
+				if (rhs_shape_size > 1)
+					rhs_idx += rhs->stride(rhs_dim);
+			}
+		};
+
+		sub_impl(lhs, rhs, -1, -1, lhs->offset(), rhs->offset());
+    }
     
     TensorInterfacePtr Tensor<i64>::sum() const
     {

+ 45 - 0
traph/source/tensor/short_tensor.cpp

@@ -257,12 +257,23 @@ namespace traph
 		return matmul_impl(*this, *right_matrix);
 	}
 
+    // In-place negation: flips the sign of every element.
+    void Tensor<i16>::neg_()
+    {
+        apply_([](i16 a)->i16 {return -a; });
+    }
+
     idx_type Tensor<i16>::offset() const { return _offset; }
 
     layout_type Tensor<i16>::order() const { return _order; }
 
     platform_type Tensor<i16>::platform() { return platform_type::none; }
 
+    // In-place element-wise power. NOTE(review): the f32 exponent is
+    // truncated to a std::int32_t before use, so fractional exponents are
+    // silently floored for integer tensors -- confirm this is intended.
+    void Tensor<i16>::pow_(f32 exp)
+    {
+        std::int32_t exp_int = static_cast<std::int32_t>(exp);
+        apply_([&exp_int](i16 a)->i16 {return static_cast<i16>(std::pow(a, exp_int)); });
+    }
+
 	i16 Tensor<i16>::reduce_(std::function<i16(i16, i16)> f) const
     {
 		i16 result{};
@@ -362,6 +373,40 @@ namespace traph
 		else
 			throw std::runtime_error("Stride out of range");
 	}
+
+    // In-place element-wise subtraction with NumPy-style broadcasting.
+    // Recursively walks both tensors from the trailing dimension (dim -1)
+    // toward the leading one; a tensor whose negative dim index runs past
+    // its rank contributes a broadcast extent of 1 and its flat index is
+    // held fixed while the other side iterates.
+    // NOTE(review): the dynamic_cast result is not null-checked -- if
+    // `other` is not a Tensor<i16>, `rhs` is nullptr and this dereferences
+    // it; confirm callers guarantee matching element types.
+    void Tensor<i16>::sub_(std::shared_ptr<TensorInterface> other)
+    {
+        Tensor<i16> * lhs = this;
+		Tensor<i16> * rhs = dynamic_cast<Tensor<i16> *>(other.get());
+		std::function<void(Tensor<i16> *, Tensor<i16> *, idx_type, idx_type,idx_type, idx_type)> sub_impl =
+			[&](Tensor<i16> * lhs, Tensor<i16> * rhs, idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
+
+			auto lhs_storage = std::dynamic_pointer_cast<TensorStorage<i16>>(lhs->storage())->data_ptr();
+			auto rhs_storage = std::dynamic_pointer_cast<TensorStorage<i16>>(rhs->storage())->data_ptr();
+
+			// Base case: both sides ran past their rank -> one scalar update.
+			if (lhs_dim < -(lhs->size().size()) && rhs_dim < -(rhs->size().size()))
+			{
+				lhs_storage[lhs_idx] -= rhs_storage[rhs_idx];
+				return;
+			}
+
+			// Extent at this dim for each operand (1 when broadcast past rank).
+			idx_type lhs_shape_size = lhs_dim >= -(lhs->size().size())? lhs->size(lhs_dim) : 1;
+			idx_type rhs_shape_size = rhs_dim >= -(rhs->size().size()) ? rhs->size(rhs_dim) : 1;
+			idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
+
+			for (idx_type i = 0; i < max_shape_size; ++i)
+			{
+				sub_impl(lhs, rhs, lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
+
+				// Advance only along non-broadcast dims; broadcast dims reuse
+				// the same flat index on every iteration.
+				if(lhs_shape_size > 1)
+					lhs_idx += lhs->stride(lhs_dim);
+				if (rhs_shape_size > 1)
+					rhs_idx += rhs->stride(rhs_dim);
+			}
+		};
+
+		// Start at the last dimension of each operand, at its storage offset.
+		sub_impl(lhs, rhs, -1, -1, lhs->offset(), rhs->offset());
+    }
     
     TensorInterfacePtr Tensor<i16>::sum() const
     {

+ 19 - 0
traph/source/tensor/tensor.cpp

@@ -114,6 +114,13 @@ namespace traph
     {
 		throw std::runtime_error("No implement");
     }
+
+    // Generic fallback: only the explicitly specialized element types
+    // implement neg_; any other T reports "No implement".
+    template<typename T>
+    void Tensor<T>::neg_()
+    {
+        throw std::runtime_error("No implement");
+    }
+
     template<typename T>
     idx_type Tensor<T>::offset() const { throw std::runtime_error("No implement"); }
     template<typename T>
@@ -121,6 +128,11 @@ namespace traph
     template<typename T>
     platform_type Tensor<T>::platform() { throw std::runtime_error("No implement"); }
     template<typename T>
+    // Generic fallback: only the explicitly specialized element types
+    // implement pow_; any other T reports "No implement".
+    void Tensor<T>::pow_(f32 exp)
+    {
+        throw std::runtime_error("No implement");
+    }
+    template<typename T>
     T Tensor<T>::reduce_(std::function<T(T,T)> f) const
     {
         throw std::runtime_error("No implement");
@@ -166,6 +178,13 @@ namespace traph
 	{
 		throw std::runtime_error("No implement");
 	}
+
+    // Generic fallback: only the explicitly specialized element types
+    // implement sub_; any other T reports "No implement".
+    template<typename T>
+    void Tensor<T>::sub_(std::shared_ptr<TensorInterface> other)
+    {
+        throw std::runtime_error("No implement");
+    }
+
     template<typename T>
     TensorInterfacePtr Tensor<T>::sum() const
     {

+ 12 - 4
traph/source/test/main.cpp

@@ -1,6 +1,7 @@
 #include <algorithm>
 
-#include <traph/nn/module.h>
+#include <traph/nn/layers/linear.h>
+#include <traph/nn/layers/loss.h>
 
 #include <iostream>
 
@@ -57,10 +58,17 @@ int main()
 	*/
 
 	int batch_size = 16;
-	auto a = traph::ones<traph::f32>({ batch_size,4 });
+	auto x = traph::ones<traph::f32>({ batch_size,4 });
+	auto y = traph::zeros<traph::f32>({ batch_size,2 });
 
-	traph::LinearModule linear_model(4, 2, false);
-	auto out = linear_model.forward(a);
+	traph::Linear linear_model(4, 2, false);
+	traph::MSELoss loss;
+
+	auto out = linear_model.forward(x);
+	auto result = loss.forward(out, y);
+
+	result->backward();
+	std::cout << result->data()->to_string();
 
     return 0;
 }