6 vuotta sitten · 928e9379d3
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -78,6 +78,7 @@
 
				         "variant": "cpp",
			
 
				         "resumable": "cpp",
			
 
				         "future": "cpp",
			
 
				-        "queue": "cpp"
			
 
				+        "queue": "cpp",
			
 
				+        "cfenv": "cpp"
			
 
				     }
			
 
				 }
			
--- a/traph/include/traph/tensor/byte_tensor.h
+++ b/traph/include/traph/tensor/byte_tensor.h
@@ -8,8 +8,83 @@
 
				 
			
 
				 namespace traph
			
 
				 {
			
 
				+    // ndarray
			
 
				+    template<>
			
 
				+    class Tensor<u8>: public TensorBase<u8>
			
 
				+    {
			
 
				+    public:
			
 
				+        using value_type = u8;
			
 
				+        using self_type = Tensor<u8>;
			
 
				+        using base_type = TensorBase<u8>;
			
 
				+        using storage_type = TensorStorage<value_type>;
			
 
				 
			
 
				+        using raw_pointer = self_type*;
			
 
				+        using shared_pointer = std::shared_ptr<self_type>;
			
 
				+        using reference = self_type&;
			
 
				+        using const_reference = const self_type&;
			
 
				+    private:
			
 
				+        std::shared_ptr<storage_type> _rep;
			
 
				+        DimVector _dimensions;
			
 
				+        idx_type _offset;
			
 
				+		DimVector _strides;
			
 
				+        layout_type _order;
			
 
				 
			
 
				+    private:
			
 
				+        void auto_strides();
			
 
				+
			
 
				+        void apply_impl(idx_type dim, idx_type idx, std::function<value_type(value_type)> f);
			
 
				+
			
 
				+        void reduce_impl(value_type& result, idx_type dim, idx_type idx, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        value_type reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        void reduce_dim_impl(reference result, idx_type dim, idx_type reduce_dim,
			
 
				+            idx_type this_idx, idx_type result_idx,
			
 
				+            std::function<value_type(value_type,value_type)> f) const;
			
 
				+    public:
			
 
				+        Tensor();
			
 
				+        explicit Tensor(const DimVector& dimensions);
			
 
				+        explicit Tensor(const DimVector& dimensions, layout_type order);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order);
			
 
				+        Tensor(const value_type& t);
			
 
				+
			
 
				+        Tensor(const Tensor& other) = delete;
			
 
				+        Tensor(Tensor&& other) = delete;
			
 
				+        Tensor& operator= (const Tensor& other) = delete;
			
 
				+        Tensor& operator= (Tensor&& other) = delete;
			
 
				+
			
 
				+		virtual void add_(TensorInterfacePtr other) override;
			
 
				+		virtual void apply_(std::function<u8(u8)> f) override;
			
 
				+		virtual TensorInterfacePtr clone() const override;
			
 
				+		virtual void cos_() override;
			
 
				+		virtual std::shared_ptr<TensorBase<f32>> create_grad() override;
			
 
				+		virtual u8* data_ptr() override;
			
 
				+		virtual const u8* data_ptr() const override;
			
 
				+		virtual device_id device() override;
			
 
				+		virtual void fill_(u8 value) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> inverse() const override;
			
 
				+		virtual u8 item() const override;
			
 
				+		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
			
 
				+		virtual idx_type offset() const override;
			
 
				+		virtual layout_type order() const override;
			
 
				+		virtual platform_type platform() override;
			
 
				+		virtual u8 reduce_(std::function<u8(u8, u8)> f) const override;
			
 
				+		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<u8(u8, u8)> f) const override;
			
 
				+		virtual void reshape_(const DimVector& dims) override;
			
 
				+		virtual void resize_(const DimVector& dims) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> select(const SliceVector& slice) const override;
			
 
				+		virtual void sin_() override;
			
 
				+		virtual DimVector size() const override;
			
 
				+		virtual idx_type size(idx_type i) const override;
			
 
				+		virtual std::shared_ptr<StorageBase<u8>> storage() const override;
			
 
				+		virtual DimVector stride() const override;
			
 
				+		virtual idx_type stride(idx_type i) const override;
			
 
				+		virtual TensorInterfacePtr sum() const override;
			
 
				+		virtual std::string to_string() const override;
			
 
				+    };
			
 
				+
			
 
				+    using ByteTensor = Tensor<u8>;
			
 
				 }
			
 
				 
			
 
				 #endif
			
--- a/traph/include/traph/tensor/char_tensor.h
+++ b/traph/include/traph/tensor/char_tensor.h
@@ -8,7 +8,83 @@
 
				 
			
 
				 namespace traph
			
 
				 {
			
 
				+    // ndarray
			
 
				+    template<>
			
 
				+    class Tensor<i8>: public TensorBase<i8>
			
 
				+    {
			
 
				+    public:
			
 
				+        using value_type = i8;
			
 
				+        using self_type = Tensor<i8>;
			
 
				+        using base_type = TensorBase<i8>;
			
 
				+        using storage_type = TensorStorage<value_type>;
			
 
				 
			
 
				+        using raw_pointer = self_type*;
			
 
				+        using shared_pointer = std::shared_ptr<self_type>;
			
 
				+        using reference = self_type&;
			
 
				+        using const_reference = const self_type&;
			
 
				+    private:
			
 
				+        std::shared_ptr<storage_type> _rep;
			
 
				+        DimVector _dimensions;
			
 
				+        idx_type _offset;
			
 
				+		DimVector _strides;
			
 
				+        layout_type _order;
			
 
				+
			
 
				+    private:
			
 
				+        void auto_strides();
			
 
				+
			
 
				+        void apply_impl(idx_type dim, idx_type idx, std::function<value_type(value_type)> f);
			
 
				+
			
 
				+        void reduce_impl(value_type& result, idx_type dim, idx_type idx, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        value_type reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        void reduce_dim_impl(reference result, idx_type dim, idx_type reduce_dim,
			
 
				+            idx_type this_idx, idx_type result_idx,
			
 
				+            std::function<value_type(value_type,value_type)> f) const;
			
 
				+    public:
			
 
				+        Tensor();
			
 
				+        explicit Tensor(const DimVector& dimensions);
			
 
				+        explicit Tensor(const DimVector& dimensions, layout_type order);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order);
			
 
				+        Tensor(const value_type& t);
			
 
				+
			
 
				+        Tensor(const Tensor& other) = delete;
			
 
				+        Tensor(Tensor&& other) = delete;
			
 
				+        Tensor& operator= (const Tensor& other) = delete;
			
 
				+        Tensor& operator= (Tensor&& other) = delete;
			
 
				+
			
 
				+		virtual void add_(TensorInterfacePtr other) override;
			
 
				+		virtual void apply_(std::function<i8(i8)> f) override;
			
 
				+		virtual TensorInterfacePtr clone() const override;
			
 
				+		virtual void cos_() override;
			
 
				+		virtual std::shared_ptr<TensorBase<f32>> create_grad() override;
			
 
				+		virtual i8* data_ptr() override;
			
 
				+		virtual const i8* data_ptr() const override;
			
 
				+		virtual device_id device() override;
			
 
				+		virtual void fill_(i8 value) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> inverse() const override;
			
 
				+		virtual i8 item() const override;
			
 
				+		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
			
 
				+		virtual idx_type offset() const override;
			
 
				+		virtual layout_type order() const override;
			
 
				+		virtual platform_type platform() override;
			
 
				+		virtual i8 reduce_(std::function<i8(i8, i8)> f) const override;
			
 
				+		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<i8(i8, i8)> f) const override;
			
 
				+		virtual void reshape_(const DimVector& dims) override;
			
 
				+		virtual void resize_(const DimVector& dims) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> select(const SliceVector& slice) const override;
			
 
				+		virtual void sin_() override;
			
 
				+		virtual DimVector size() const override;
			
 
				+		virtual idx_type size(idx_type i) const override;
			
 
				+		virtual std::shared_ptr<StorageBase<i8>> storage() const override;
			
 
				+		virtual DimVector stride() const override;
			
 
				+		virtual idx_type stride(idx_type i) const override;
			
 
				+		virtual TensorInterfacePtr sum() const override;
			
 
				+		virtual std::string to_string() const override;
			
 
				+    };
			
 
				+
			
 
				+    using CharTensor = Tensor<i8>;
			
 
				 
			
 
				 }
			
 
				 
			
--- a/traph/include/traph/tensor/double_tensor.h
+++ b/traph/include/traph/tensor/double_tensor.h
@@ -8,7 +8,83 @@
 
				 
			
 
				 namespace traph
			
 
				 {
			
 
				+    // ndarray
			
 
				+    template<>
			
 
				+    class Tensor<f64>: public TensorBase<f64>
			
 
				+    {
			
 
				+    public:
			
 
				+        using value_type = f64;
			
 
				+        using self_type = Tensor<f64>;
			
 
				+        using base_type = TensorBase<f64>;
			
 
				+        using storage_type = TensorStorage<value_type>;
			
 
				 
			
 
				+        using raw_pointer = self_type*;
			
 
				+        using shared_pointer = std::shared_ptr<self_type>;
			
 
				+        using reference = self_type&;
			
 
				+        using const_reference = const self_type&;
			
 
				+    private:
			
 
				+        std::shared_ptr<storage_type> _rep;
			
 
				+        DimVector _dimensions;
			
 
				+        idx_type _offset;
			
 
				+		DimVector _strides;
			
 
				+        layout_type _order;
			
 
				+
			
 
				+    private:
			
 
				+        void auto_strides();
			
 
				+
			
 
				+        void apply_impl(idx_type dim, idx_type idx, std::function<value_type(value_type)> f);
			
 
				+
			
 
				+        void reduce_impl(value_type& result, idx_type dim, idx_type idx, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        value_type reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        void reduce_dim_impl(reference result, idx_type dim, idx_type reduce_dim,
			
 
				+            idx_type this_idx, idx_type result_idx,
			
 
				+            std::function<value_type(value_type,value_type)> f) const;
			
 
				+    public:
			
 
				+        Tensor();
			
 
				+        explicit Tensor(const DimVector& dimensions);
			
 
				+        explicit Tensor(const DimVector& dimensions, layout_type order);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order);
			
 
				+        Tensor(const value_type& t);
			
 
				+
			
 
				+        Tensor(const Tensor& other) = delete;
			
 
				+        Tensor(Tensor&& other) = delete;
			
 
				+        Tensor& operator= (const Tensor& other) = delete;
			
 
				+        Tensor& operator= (Tensor&& other) = delete;
			
 
				+
			
 
				+		virtual void add_(TensorInterfacePtr other) override;
			
 
				+		virtual void apply_(std::function<f64(f64)> f) override;
			
 
				+		virtual TensorInterfacePtr clone() const override;
			
 
				+		virtual void cos_() override;
			
 
				+		virtual std::shared_ptr<TensorBase<f32>> create_grad() override;
			
 
				+		virtual f64* data_ptr() override;
			
 
				+		virtual const f64* data_ptr() const override;
			
 
				+		virtual device_id device() override;
			
 
				+		virtual void fill_(f64 value) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> inverse() const override;
			
 
				+		virtual f64 item() const override;
			
 
				+		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
			
 
				+		virtual idx_type offset() const override;
			
 
				+		virtual layout_type order() const override;
			
 
				+		virtual platform_type platform() override;
			
 
				+		virtual f64 reduce_(std::function<f64(f64, f64)> f) const override;
			
 
				+		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<f64(f64, f64)> f) const override;
			
 
				+		virtual void reshape_(const DimVector& dims) override;
			
 
				+		virtual void resize_(const DimVector& dims) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> select(const SliceVector& slice) const override;
			
 
				+		virtual void sin_() override;
			
 
				+		virtual DimVector size() const override;
			
 
				+		virtual idx_type size(idx_type i) const override;
			
 
				+		virtual std::shared_ptr<StorageBase<f64>> storage() const override;
			
 
				+		virtual DimVector stride() const override;
			
 
				+		virtual idx_type stride(idx_type i) const override;
			
 
				+		virtual TensorInterfacePtr sum() const override;
			
 
				+		virtual std::string to_string() const override;
			
 
				+    };
			
 
				+
			
 
				+    using DoubleTensor = Tensor<f64>;
			
 
				 
			
 
				 }
			
 
				 
			
--- a/traph/include/traph/tensor/float_tensor.h
+++ b/traph/include/traph/tensor/float_tensor.h
@@ -3,12 +3,12 @@
 
				 
			
 
				 #include <utility>
			
 
				 #include <cmath>
			
 
				+#include <memory>
			
 
				 
			
 
				 #include <traph/tensor/tensor.h>
			
 
				 
			
 
				 namespace traph
			
 
				 {
			
 
				-    
			
 
				     // ndarray
			
 
				     template<>
			
 
				     class Tensor<f32>: public TensorBase<f32>
			
@@ -55,407 +55,37 @@ namespace traph
 
				         Tensor& operator= (const Tensor& other) = delete;
			
 
				         Tensor& operator= (Tensor&& other) = delete;
			
 
				 
			
 
				-        virtual void add_(TensorInterfacePtr other) override;
			
 
				-        virtual void apply_(std::function<value_type(value_type)> f) override;
			
 
				-        virtual TensorInterfacePtr clone() const override;
			
 
				-        virtual void cos_() override;
			
 
				-        virtual std::shared_ptr<TensorBase<f32>> create_grad() override;
			
 
				-        virtual value_type* data_ptr() override;
			
 
				-        virtual const value_type* data_ptr() const override;
			
 
				-        virtual device_id device() override;
			
 
				-        virtual void fill_(value_type value) override;
			
 
				-        virtual value_type item() const override;
			
 
				+		virtual void add_(TensorInterfacePtr other) override;
			
 
				+		virtual void apply_(std::function<f32(f32)> f) override;
			
 
				+		virtual TensorInterfacePtr clone() const override;
			
 
				+		virtual void cos_() override;
			
 
				+		virtual std::shared_ptr<TensorBase<f32>> create_grad() override;
			
 
				+		virtual f32* data_ptr() override;
			
 
				+		virtual const f32* data_ptr() const override;
			
 
				+		virtual device_id device() override;
			
 
				+		virtual void fill_(f32 value) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> inverse() const override;
			
 
				+		virtual f32 item() const override;
			
 
				+		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
			
 
				 		virtual idx_type offset() const override;
			
 
				 		virtual layout_type order() const override;
			
 
				-        virtual platform_type platform() override;
			
 
				-        virtual value_type reduce_(std::function<value_type(value_type,value_type)> f) const override;
			
 
				-        virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<value_type(value_type,value_type)> f) const override;
			
 
				-        virtual void reshape_(const DimVector& dims) override;
			
 
				-        virtual void resize_(const DimVector& dims) override;
			
 
				-        virtual void sin_() override;
			
 
				+		virtual platform_type platform() override;
			
 
				+		virtual f32 reduce_(std::function<f32(f32, f32)> f) const override;
			
 
				+		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<f32(f32, f32)> f) const override;
			
 
				+		virtual void reshape_(const DimVector& dims) override;
			
 
				+		virtual void resize_(const DimVector& dims) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> select(const SliceVector& slice) const override;
			
 
				+		virtual void sin_() override;
			
 
				 		virtual DimVector size() const override;
			
 
				 		virtual idx_type size(idx_type i) const override;
			
 
				-        virtual std::shared_ptr<storage_type> storage() const override;
			
 
				+		virtual std::shared_ptr<StorageBase<f32>> storage() const override;
			
 
				 		virtual DimVector stride() const override;
			
 
				 		virtual idx_type stride(idx_type i) const override;
			
 
				-        virtual TensorInterfacePtr sum() const override;
			
 
				-        virtual std::string to_string() const override;
			
 
				+		virtual TensorInterfacePtr sum() const override;
			
 
				+		virtual std::string to_string() const override;
			
 
				     };
			
 
				 
			
 
				-    using DoubleTensor = Tensor<f64>;
			
 
				     using FloatTensor = Tensor<f32>;
			
 
				-    using LongTensor = Tensor<i64>;
			
 
				-    using IntTensor = Tensor<i32>;
			
 
				-    using ShortTensor = Tensor<i16>;
			
 
				-    using CharTensor = Tensor<i8>;
			
 
				-    using ByteTensor = Tensor<u8>;
			
 
				-
			
 
				-	// definition
			
 
				-    // private
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::auto_strides()
			
 
				-    {
			
 
				-        idx_type dim_num = _dimensions.size();
			
 
				-        _strides.resize(dim_num);
			
 
				-        idx_type stride = 1;
			
 
				-        if(_order == layout_type::column_major)
			
 
				-        {
			
 
				-            for (idx_type i = dim_num - 1; i >= 0; --i)
			
 
				-            {
			
 
				-                _strides[i] = stride;
			
 
				-                stride *= _dimensions[i];
			
 
				-            }
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            for (idx_type i = 0; i < dim_num; ++i)
			
 
				-            {
			
 
				-                _strides[i] = stride;
			
 
				-                stride *= _dimensions[i];
			
 
				-            }
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::apply_impl(idx_type dim, idx_type idx, std::function<T(T)> f)
			
 
				-    {
			
 
				-        idx_type dim_size = _dimensions.size();
			
 
				-
			
 
				-        idx_type step_len = _strides[dim];
			
 
				-        idx_type step_num = _dimensions[dim];
			
 
				-        
			
 
				-        for(idx_type i = 0; i < step_num; ++i)
			
 
				-        {
			
 
				-            if(dim == dim_size - 1)
			
 
				-                _rep->data[idx] = f(_rep->data[idx]);
			
 
				-            else
			
 
				-                apply_impl(dim + 1, idx, f);
			
 
				-            idx += step_len;
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::reduce_impl(T& result, idx_type dim, idx_type idx, std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        idx_type dim_size = _dimensions.size();
			
 
				-
			
 
				-        idx_type step_len = _strides[dim];
			
 
				-        idx_type step_num = _dimensions[dim];
			
 
				-
			
 
				-        for(idx_type i = 0; i < step_num; ++i)
			
 
				-        {
			
 
				-            if(dim == dim_size - 1)
			
 
				-                result = f(result, _rep->data[idx]);
			
 
				-            else
			
 
				-                reduce_impl(result, dim + 1, idx, f);
			
 
				-            idx += step_len;
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    T Tensor<T>::reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        T result{};
			
 
				-        for(idx_type i = 0; i < step_num; ++i)
			
 
				-        {
			
 
				-            result = f(result, _rep->data[begin]);
			
 
				-            begin += step_len;
			
 
				-        }
			
 
				-        return result;
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::reduce_dim_impl(Tensor<T>& result, idx_type dim, idx_type reduce_dim,
			
 
				-        idx_type this_idx, idx_type result_idx,
			
 
				-        std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        idx_type dim_size = _dimensions.size();
			
 
				-
			
 
				-        if(dim == dim_size)
			
 
				-        {
			
 
				-            result._rep->data[result_idx] = 
			
 
				-                reduce_dim_kernel(this_idx, _strides[reduce_dim], _dimensions[reduce_dim], f);
			
 
				-            return;
			
 
				-        }
			
 
				-
			
 
				-        if(dim == reduce_dim)
			
 
				-        {
			
 
				-            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx,result_idx, f);
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            for(idx_type i = 0; i < _dimensions[dim]; ++i)
			
 
				-            {
			
 
				-                reduce_dim_impl(result, dim + 1, reduce_dim, this_idx,result_idx, f);
			
 
				-                    
			
 
				-                this_idx += _strides[dim];
			
 
				-                result_idx += result._strides[dim];
			
 
				-            }
			
 
				-        }
			
 
				-    }
			
 
				-    // public
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor()
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
			
 
				-    {
			
 
				-    }
			
 
				-
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const DimVector& dimensions)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(dimensions), _offset(0), _strides(), _order(layout_type::column_major)
			
 
				-    {
			
 
				-        auto_strides();
			
 
				-        
			
 
				-        _rep->resize_(_dimensions.flat_size());
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const DimVector& dimensions, layout_type order)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(dimensions), _offset(0), _strides(), _order(order)
			
 
				-    {
			
 
				-        auto_strides();
			
 
				-
			
 
				-        _rep->resize_(_dimensions.flat_size());
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const DimVector& dimensions, const DimVector& strides)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(dimensions), _offset(0), _strides(strides), _order(layout_type::column_major)
			
 
				-    {
			
 
				-        auto_strides();
			
 
				-
			
 
				-        _rep->resize_(_dimensions.flat_size());
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(dimensions), _offset(0), _strides(strides), _order(order)
			
 
				-    {
			
 
				-        auto_strides();
			
 
				-
			
 
				-        _rep->resize_(_dimensions.flat_size());
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const T& t)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(), _offset(0), strides(), _order(order)
			
 
				-    {
			
 
				-        _dimensions.resize(1);
			
 
				-        auto_strides();
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::add_(TensorInterfacePtr other)
			
 
				-    {
			
 
				-		// check tensor other type
			
 
				-
			
 
				-		// check broadcast.shape = this.shape
			
 
				-
			
 
				-		// ok, get lhs, rhs
			
 
				-		Tensor<T> * lhs = this;
			
 
				-		Tensor<T> * rhs = dynamic_cast<Tensor<T> *>(other.get());
			
 
				-		std::function<void(Tensor<T> *, Tensor<T> *, idx_type, idx_type,idx_type, idx_type)> add_impl =
			
 
				-			[&](Tensor<T> * lhs, Tensor<T> * rhs, idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
			
 
				-
			
 
				-			auto lhs_storage = std::dynamic_pointer_cast<TensorStorage<T>>(lhs->storage())->data_ptr();
			
 
				-			auto rhs_storage = std::dynamic_pointer_cast<TensorStorage<T>>(rhs->storage())->data_ptr();
			
 
				-
			
 
				-			if (lhs_dim < -(lhs->size().size()) && rhs_dim < -(rhs->size().size()))
			
 
				-			{
			
 
				-				lhs_storage[lhs_idx] += rhs_storage[rhs_idx];
			
 
				-				return;
			
 
				-			}
			
 
				-
			
 
				-			idx_type lsh_shape_size = lhs_dim >= -(lhs->size().size())? lhs->size(lhs_dim) : 1;
			
 
				-			idx_type rsh_shape_size = rhs_dim >= -(rhs->size().size()) ? rhs->size(rhs_dim) : 1;
			
 
				-			idx_type max_shape_size = std::max(lsh_shape_size, rsh_shape_size);
			
 
				-
			
 
				-			for (idx_type i = 0; i < max_shape_size; ++i)
			
 
				-			{
			
 
				-				add_impl(lhs, rhs, lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
			
 
				-
			
 
				-				if(lsh_shape_size > 1)
			
 
				-					lhs_idx += lhs->stride(lhs_dim);
			
 
				-				if (rsh_shape_size > 1)
			
 
				-					rhs_idx += rhs->stride(rhs_dim);
			
 
				-			}
			
 
				-		};
			
 
				-
			
 
				-		add_impl(lhs, rhs, -1, -1, lhs->offset(), rhs->offset());
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::apply_(std::function<T(T)> f)
			
 
				-    {
			
 
				-        apply_impl(0, _offset, f);
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    TensorInterfacePtr Tensor<T>::clone() const
			
 
				-    {
			
 
				-        std::shared_ptr<Tensor<T>> cloned_tensor(new Tensor<T>);
			
 
				-        cloned_tensor->_rep = std::dynamic_pointer_cast<TensorStorage<T>>(_rep->clone());
			
 
				-        cloned_tensor->_dimensions = _dimensions;
			
 
				-        cloned_tensor->_offset = _offset;
			
 
				-        cloned_tensor->_strides = _strides;
			
 
				-        cloned_tensor->_order = _order;
			
 
				-        
			
 
				-        return cloned_tensor;
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::cos_()
			
 
				-    {
			
 
				-        apply_([](T a)->T {return std::cos(a); });
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    std::shared_ptr<TensorBase<f32>> Tensor<T>::create_grad()
			
 
				-    {
			
 
				-        return std::shared_ptr<TensorBase<f32>>(new Tensor<f32>(_dimensions));
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    T* Tensor<T>::data_ptr()
			
 
				-    {
			
 
				-        return _rep->data_ptr();
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    const T* Tensor<T>::data_ptr() const
			
 
				-    {
			
 
				-        return _rep->data_ptr();
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    device_id Tensor<T>::device() { return 0; }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::fill_(T value)
			
 
				-    {
			
 
				-        apply_([&value](T a)->T {return value; });
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    T Tensor<T>::item() const
			
 
				-    {
			
 
				-        if(_dimensions.flat_size() == 1)
			
 
				-        {
			
 
				-            return _rep->data[_offset];
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            throw std::runtime_error("item: only one element tensors can be converted to scalars");
			
 
				-        }
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    idx_type Tensor<T>::offset() const { return _offset; }
			
 
				-    template<typename T>
			
 
				-    layout_type Tensor<T>::order() const { return _order; }
			
 
				-    template<typename T>
			
 
				-    platform_type Tensor<T>::platform() { return platform_type::none; }
			
 
				-    template<typename T>
			
 
				-    T Tensor<T>::reduce_(std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        T result{};
			
 
				-        reduce_impl(result, 0, _offset, f);
			
 
				-        return result;
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    TensorInterfacePtr Tensor<T>::reduce_dim(idx_type dim, std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        DimVector reduced_dim = _dimensions;
			
 
				-        reduced_dim.erase(dim); // check dim?
			
 
				-        TensorBasePtr result(new Tensor<T>(reduced_dim));
			
 
				-        TensorPtr raw_result = std::dynamic_pointer_cast<Tensor<T>>(result);
			
 
				-        reduce_dim_impl(*(raw_result.get()), 0, dim, _offset, raw_result->_offset, f);
			
 
				-        return std::dynamic_pointer_cast<TensorInterface>(result);
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::reshape_(const DimVector& dims)
			
 
				-    {
			
 
				-
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::resize_(const DimVector& dims)
			
 
				-    {
			
 
				-        _dimensions = dims;
			
 
				-        _rep->resize_(dims.flat_size());
			
 
				-        auto_strides();
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::sin_()
			
 
				-    {
			
 
				-        apply_([](T a)->T {return std::sin(a); });
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    DimVector Tensor<T>::size() const { return _dimensions;}
			
 
				-	template<typename T>
			
 
				-	idx_type Tensor<T>::size(idx_type i) const
			
 
				-	{ 
			
 
				-		auto shape_size = _dimensions.size();
			
 
				-		if (i >= 0 && i < _dimensions.size())
			
 
				-			return _dimensions[i];
			
 
				-		else if (i <= -1 && i >= -_dimensions.size())
			
 
				-			return _dimensions[shape_size + i];
			
 
				-		else
			
 
				-			throw std::runtime_error("Dimension out of range");
			
 
				-	}
			
 
				-    template<typename T>
			
 
				-	std::shared_ptr<StorageBase<T>>  Tensor<T>::storage() const { return _rep; }
			
 
				-    template<typename T>
			
 
				-    DimVector Tensor<T>::stride() const { return _strides; }
			
 
				-	template<typename T>
			
 
				-	idx_type Tensor<T>::stride(idx_type i) const
			
 
				-	{
			
 
				-		auto stride_size = _strides.size();
			
 
				-		if (i >= 0 && i < _strides.size())
			
 
				-			return _strides[i];
			
 
				-		else if (i <= -1 && i >= -_strides.size())
			
 
				-			return _strides[stride_size + i];
			
 
				-		else
			
 
				-			throw std::runtime_error("Stride out of range");
			
 
				-	}
			
 
				-    template<typename T>
			
 
				-    TensorInterfacePtr Tensor<T>::sum() const
			
 
				-    {
			
 
				-        DimVector d(1);
			
 
				-        d[0] = 1;
			
 
				-
			
 
				-        TensorPtr result(new Tensor<T>(d));
			
 
				-        result->_rep->data[0] = reduce_([](T a, T b)->T {return a + b; });
			
 
				-        return std::dynamic_pointer_cast<TensorInterface>(result);
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    std::string Tensor<T>::to_string() const
			
 
				-    {
			
 
				-        std::function<std::string(const Tensor<T>&, idx_type, idx_type)> to_string_impl =
			
 
				-			[&](const Tensor<T>& t, idx_type dim, idx_type idx)->std::string {
			
 
				-            std::string result;
			
 
				-			if (dim == t.size().size())
			
 
				-            {
			
 
				-                result += std::to_string(t.data_ptr()[idx]);
			
 
				-				return result;
			
 
				-            }
			
 
				-
			
 
				-			for (idx_type i = 0; i < t.size(dim); ++i)
			
 
				-			{
			
 
				-				if (dim != t.size().size() - 1 && i != 0) result += ",\n";
			
 
				-				if(dim != t.size().size() - 1)	result += "[";
			
 
				-				result += to_string_impl(t, dim + 1, idx);
			
 
				-				if (i != t.size(dim) - 1 && dim == t.size().size() - 1)
			
 
				-					result += ",";
			
 
				-				if (dim != t.size().size() - 1) result += "]";
			
 
				-
			
 
				-				idx += t.stride(dim);
			
 
				-			}
			
 
				-
			
 
				-			return result;
			
 
				-		};
			
 
				-
			
 
				-		std::string result;
			
 
				-		result += "[" + to_string_impl(*this, 0, offset()) + "]";
			
 
				-		return result;
			
 
				-    }
			
 
				-
			
 
				 }
			
 
				 
			
 
				 #endif
			
--- a/traph/include/traph/tensor/int_tensor.h
+++ b/traph/include/traph/tensor/int_tensor.h
@@ -8,7 +8,83 @@
 
				 
			
 
				 namespace traph
			
 
				 {
			
 
				+    // ndarray
			
 
				+    template<>
			
 
				+    class Tensor<i32>: public TensorBase<i32>
			
 
				+    {
			
 
				+    public:
			
 
				+        using value_type = i32;
			
 
				+        using self_type = Tensor<i32>;
			
 
				+        using base_type = TensorBase<i32>;
			
 
				+        using storage_type = TensorStorage<value_type>;
			
 
				 
			
 
				+        using raw_pointer = self_type*;
			
 
				+        using shared_pointer = std::shared_ptr<self_type>;
			
 
				+        using reference = self_type&;
			
 
				+        using const_reference = const self_type&;
			
 
				+    private:
			
 
				+        std::shared_ptr<storage_type> _rep;
			
 
				+        DimVector _dimensions;
			
 
				+        idx_type _offset;
			
 
				+		DimVector _strides;
			
 
				+        layout_type _order;
			
 
				+
			
 
				+    private:
			
 
				+        void auto_strides();
			
 
				+
			
 
				+        void apply_impl(idx_type dim, idx_type idx, std::function<value_type(value_type)> f);
			
 
				+
			
 
				+        void reduce_impl(value_type& result, idx_type dim, idx_type idx, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        value_type reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        void reduce_dim_impl(reference result, idx_type dim, idx_type reduce_dim,
			
 
				+            idx_type this_idx, idx_type result_idx,
			
 
				+            std::function<value_type(value_type,value_type)> f) const;
			
 
				+    public:
			
 
				+        Tensor();
			
 
				+        explicit Tensor(const DimVector& dimensions);
			
 
				+        explicit Tensor(const DimVector& dimensions, layout_type order);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order);
			
 
				+        Tensor(const value_type& t);
			
 
				+
			
 
				+        Tensor(const Tensor& other) = delete;
			
 
				+        Tensor(Tensor&& other) = delete;
			
 
				+        Tensor& operator= (const Tensor& other) = delete;
			
 
				+        Tensor& operator= (Tensor&& other) = delete;
			
 
				+
			
 
				+		virtual void add_(TensorInterfacePtr other) override;
			
 
				+		virtual void apply_(std::function<i32(i32)> f) override;
			
 
				+		virtual TensorInterfacePtr clone() const override;
			
 
				+		virtual void cos_() override;
			
 
				+		virtual std::shared_ptr<TensorBase<f32>> create_grad() override;
			
 
				+		virtual i32* data_ptr() override;
			
 
				+		virtual const i32* data_ptr() const override;
			
 
				+		virtual device_id device() override;
			
 
				+		virtual void fill_(i32 value) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> inverse() const override;
			
 
				+		virtual i32 item() const override;
			
 
				+		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
			
 
				+		virtual idx_type offset() const override;
			
 
				+		virtual layout_type order() const override;
			
 
				+		virtual platform_type platform() override;
			
 
				+		virtual i32 reduce_(std::function<i32(i32, i32)> f) const override;
			
 
				+		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<i32(i32, i32)> f) const override;
			
 
				+		virtual void reshape_(const DimVector& dims) override;
			
 
				+		virtual void resize_(const DimVector& dims) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> select(const SliceVector& slice) const override;
			
 
				+		virtual void sin_() override;
			
 
				+		virtual DimVector size() const override;
			
 
				+		virtual idx_type size(idx_type i) const override;
			
 
				+		virtual std::shared_ptr<StorageBase<i32>> storage() const override;
			
 
				+		virtual DimVector stride() const override;
			
 
				+		virtual idx_type stride(idx_type i) const override;
			
 
				+		virtual TensorInterfacePtr sum() const override;
			
 
				+		virtual std::string to_string() const override;
			
 
				+    };
			
 
				+
			
 
				+    using IntTensor = Tensor<i32>;
			
 
				 
			
 
				 }
			
 
				 
			
--- a/traph/include/traph/tensor/long_tensor.h
+++ b/traph/include/traph/tensor/long_tensor.h
@@ -8,7 +8,83 @@
 
				 
			
 
				 namespace traph
			
 
				 {
			
 
				+    // ndarray
			
 
				+    template<>
			
 
				+    class Tensor<i64>: public TensorBase<i64>
			
 
				+    {
			
 
				+    public:
			
 
				+        using value_type = i64;
			
 
				+        using self_type = Tensor<i64>;
			
 
				+        using base_type = TensorBase<i64>;
			
 
				+        using storage_type = TensorStorage<value_type>;
			
 
				 
			
 
				+        using raw_pointer = self_type*;
			
 
				+        using shared_pointer = std::shared_ptr<self_type>;
			
 
				+        using reference = self_type&;
			
 
				+        using const_reference = const self_type&;
			
 
				+    private:
			
 
				+        std::shared_ptr<storage_type> _rep;
			
 
				+        DimVector _dimensions;
			
 
				+        idx_type _offset;
			
 
				+		DimVector _strides;
			
 
				+        layout_type _order;
			
 
				+
			
 
				+    private:
			
 
				+        void auto_strides();
			
 
				+
			
 
				+        void apply_impl(idx_type dim, idx_type idx, std::function<value_type(value_type)> f);
			
 
				+
			
 
				+        void reduce_impl(value_type& result, idx_type dim, idx_type idx, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        value_type reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        void reduce_dim_impl(reference result, idx_type dim, idx_type reduce_dim,
			
 
				+            idx_type this_idx, idx_type result_idx,
			
 
				+            std::function<value_type(value_type,value_type)> f) const;
			
 
				+    public:
			
 
				+        Tensor();
			
 
				+        explicit Tensor(const DimVector& dimensions);
			
 
				+        explicit Tensor(const DimVector& dimensions, layout_type order);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order);
			
 
				+        Tensor(const value_type& t);
			
 
				+
			
 
				+        Tensor(const Tensor& other) = delete;
			
 
				+        Tensor(Tensor&& other) = delete;
			
 
				+        Tensor& operator= (const Tensor& other) = delete;
			
 
				+        Tensor& operator= (Tensor&& other) = delete;
			
 
				+
			
 
				+		virtual void add_(TensorInterfacePtr other) override;
			
 
				+		virtual void apply_(std::function<i64(i64)> f) override;
			
 
				+		virtual TensorInterfacePtr clone() const override;
			
 
				+		virtual void cos_() override;
			
 
				+		virtual std::shared_ptr<TensorBase<f32>> create_grad() override;
			
 
				+		virtual i64* data_ptr() override;
			
 
				+		virtual const i64* data_ptr() const override;
			
 
				+		virtual device_id device() override;
			
 
				+		virtual void fill_(i64 value) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> inverse() const override;
			
 
				+		virtual i64 item() const override;
			
 
				+		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
			
 
				+		virtual idx_type offset() const override;
			
 
				+		virtual layout_type order() const override;
			
 
				+		virtual platform_type platform() override;
			
 
				+		virtual i64 reduce_(std::function<i64(i64, i64)> f) const override;
			
 
				+		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<i64(i64, i64)> f) const override;
			
 
				+		virtual void reshape_(const DimVector& dims) override;
			
 
				+		virtual void resize_(const DimVector& dims) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> select(const SliceVector& slice) const override;
			
 
				+		virtual void sin_() override;
			
 
				+		virtual DimVector size() const override;
			
 
				+		virtual idx_type size(idx_type i) const override;
			
 
				+		virtual std::shared_ptr<StorageBase<i64>> storage() const override;
			
 
				+		virtual DimVector stride() const override;
			
 
				+		virtual idx_type stride(idx_type i) const override;
			
 
				+		virtual TensorInterfacePtr sum() const override;
			
 
				+		virtual std::string to_string() const override;
			
 
				+    };
			
 
				+
			
 
				+    using LongTensor = Tensor<i64>;
			
 
				 
			
 
				 }
			
 
				 
			
--- a/traph/include/traph/tensor/short_tensor.h
+++ b/traph/include/traph/tensor/short_tensor.h
@@ -8,8 +8,83 @@
 
				 
			
 
				 namespace traph
			
 
				 {
			
 
				+    // ndarray
			
 
				+    template<>
			
 
				+    class Tensor<i16>: public TensorBase<i16>
			
 
				+    {
			
 
				+    public:
			
 
				+        using value_type = i16;
			
 
				+        using self_type = Tensor<i16>;
			
 
				+        using base_type = TensorBase<i16>;
			
 
				+        using storage_type = TensorStorage<value_type>;
			
 
				 
			
 
				+        using raw_pointer = self_type*;
			
 
				+        using shared_pointer = std::shared_ptr<self_type>;
			
 
				+        using reference = self_type&;
			
 
				+        using const_reference = const self_type&;
			
 
				+    private:
			
 
				+        std::shared_ptr<storage_type> _rep;
			
 
				+        DimVector _dimensions;
			
 
				+        idx_type _offset;
			
 
				+		DimVector _strides;
			
 
				+        layout_type _order;
			
 
				 
			
 
				+    private:
			
 
				+        void auto_strides();
			
 
				+
			
 
				+        void apply_impl(idx_type dim, idx_type idx, std::function<value_type(value_type)> f);
			
 
				+
			
 
				+        void reduce_impl(value_type& result, idx_type dim, idx_type idx, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        value_type reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<value_type(value_type,value_type)> f) const;
			
 
				+
			
 
				+        void reduce_dim_impl(reference result, idx_type dim, idx_type reduce_dim,
			
 
				+            idx_type this_idx, idx_type result_idx,
			
 
				+            std::function<value_type(value_type,value_type)> f) const;
			
 
				+    public:
			
 
				+        Tensor();
			
 
				+        explicit Tensor(const DimVector& dimensions);
			
 
				+        explicit Tensor(const DimVector& dimensions, layout_type order);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides);
			
 
				+        explicit Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order);
			
 
				+        Tensor(const value_type& t);
			
 
				+
			
 
				+        Tensor(const Tensor& other) = delete;
			
 
				+        Tensor(Tensor&& other) = delete;
			
 
				+        Tensor& operator= (const Tensor& other) = delete;
			
 
				+        Tensor& operator= (Tensor&& other) = delete;
			
 
				+
			
 
				+		virtual void add_(TensorInterfacePtr other) override;
			
 
				+		virtual void apply_(std::function<i16(i16)> f) override;
			
 
				+		virtual TensorInterfacePtr clone() const override;
			
 
				+		virtual void cos_() override;
			
 
				+		virtual std::shared_ptr<TensorBase<f32>> create_grad() override;
			
 
				+		virtual i16* data_ptr() override;
			
 
				+		virtual const i16* data_ptr() const override;
			
 
				+		virtual device_id device() override;
			
 
				+		virtual void fill_(i16 value) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> inverse() const override;
			
 
				+		virtual i16 item() const override;
			
 
				+		virtual std::shared_ptr<TensorInterface> matmul(std::shared_ptr<TensorInterface> mat) const override;
			
 
				+		virtual idx_type offset() const override;
			
 
				+		virtual layout_type order() const override;
			
 
				+		virtual platform_type platform() override;
			
 
				+		virtual i16 reduce_(std::function<i16(i16, i16)> f) const override;
			
 
				+		virtual TensorInterfacePtr reduce_dim(idx_type dim, std::function<i16(i16, i16)> f) const override;
			
 
				+		virtual void reshape_(const DimVector& dims) override;
			
 
				+		virtual void resize_(const DimVector& dims) override;
			
 
				+		virtual std::shared_ptr<TensorInterface> select(const SliceVector& slice) const override;
			
 
				+		virtual void sin_() override;
			
 
				+		virtual DimVector size() const override;
			
 
				+		virtual idx_type size(idx_type i) const override;
			
 
				+		virtual std::shared_ptr<StorageBase<i16>> storage() const override;
			
 
				+		virtual DimVector stride() const override;
			
 
				+		virtual idx_type stride(idx_type i) const override;
			
 
				+		virtual TensorInterfacePtr sum() const override;
			
 
				+		virtual std::string to_string() const override;
			
 
				+    };
			
 
				+
			
 
				+    using ShortTensor = Tensor<i16>;
			
 
				 }
			
 
				 
			
 
				 #endif
			
--- a/traph/include/traph/tensor/tensor.h
+++ b/traph/include/traph/tensor/tensor.h
@@ -43,18 +43,6 @@ namespace traph
 
				 		DimVector _strides;
			
 
				         layout_type _order;
			
 
				 
			
 
				-    private:
			
 
				-        void auto_strides();
			
 
				-
			
 
				-        void apply_impl(idx_type dim, idx_type idx, std::function<T(T)> f);
			
 
				-
			
 
				-        void reduce_impl(T& result, idx_type dim, idx_type idx, std::function<T(T,T)> f) const;
			
 
				-
			
 
				-        T reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<T(T,T)> f) const;
			
 
				-
			
 
				-        void reduce_dim_impl(Tensor<T>& result, idx_type dim, idx_type reduce_dim,
			
 
				-            idx_type this_idx, idx_type result_idx,
			
 
				-            std::function<T(T,T)> f) const;
			
 
				     public:
			
 
				         Tensor();
			
 
				         explicit Tensor(const DimVector& dimensions);
			
@@ -98,437 +86,14 @@ namespace traph
 
				         virtual std::string to_string() const override;
			
 
				     };
			
 
				 
			
 
				-    using DoubleTensor = Tensor<f64>;
			
 
				-    using FloatTensor = Tensor<f32>;
			
 
				-    using LongTensor = Tensor<i64>;
			
 
				-    using IntTensor = Tensor<i32>;
			
 
				-    using ShortTensor = Tensor<i16>;
			
 
				-    using CharTensor = Tensor<i8>;
			
 
				-    using ByteTensor = Tensor<u8>;
			
 
				-
			
 
				 	template<typename T>
			
 
				 	using TensorPtr = std::shared_ptr<Tensor<T>>;
			
 
				 	template<typename T>
			
 
				 	using TensorRef = Tensor<T> &;
			
 
				 	template<typename T>
			
 
				 	using TensorConstRef = const Tensor<T>&;
			
 
				-
			
 
				-	// definition
			
 
				-    // private
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::auto_strides()
			
 
				-    {
			
 
				-        idx_type dim_num = _dimensions.size();
			
 
				-        _strides.resize(dim_num);
			
 
				-        idx_type stride = 1;
			
 
				-        if(_order == layout_type::column_major)
			
 
				-        {
			
 
				-            for (idx_type i = dim_num - 1; i >= 0; --i)
			
 
				-            {
			
 
				-                _strides[i] = stride;
			
 
				-                stride *= _dimensions[i];
			
 
				-            }
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            for (idx_type i = 0; i < dim_num; ++i)
			
 
				-            {
			
 
				-                _strides[i] = stride;
			
 
				-                stride *= _dimensions[i];
			
 
				-            }
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::apply_impl(idx_type dim, idx_type idx, std::function<T(T)> f)
			
 
				-    {
			
 
				-        idx_type dim_size = _dimensions.size();
			
 
				-
			
 
				-        idx_type step_len = _strides[dim];
			
 
				-        idx_type step_num = _dimensions[dim];
			
 
				-        
			
 
				-        for(idx_type i = 0; i < step_num; ++i)
			
 
				-        {
			
 
				-            if(dim == dim_size - 1)
			
 
				-                _rep->data[idx] = f(_rep->data[idx]);
			
 
				-            else
			
 
				-                apply_impl(dim + 1, idx, f);
			
 
				-            idx += step_len;
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::reduce_impl(T& result, idx_type dim, idx_type idx, std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        idx_type dim_size = _dimensions.size();
			
 
				-
			
 
				-        idx_type step_len = _strides[dim];
			
 
				-        idx_type step_num = _dimensions[dim];
			
 
				-
			
 
				-        for(idx_type i = 0; i < step_num; ++i)
			
 
				-        {
			
 
				-            if(dim == dim_size - 1)
			
 
				-                result = f(result, _rep->data[idx]);
			
 
				-            else
			
 
				-                reduce_impl(result, dim + 1, idx, f);
			
 
				-            idx += step_len;
			
 
				-        }
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    T Tensor<T>::reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        T result{};
			
 
				-        for(idx_type i = 0; i < step_num; ++i)
			
 
				-        {
			
 
				-            result = f(result, _rep->data[begin]);
			
 
				-            begin += step_len;
			
 
				-        }
			
 
				-        return result;
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::reduce_dim_impl(Tensor<T>& result, idx_type dim, idx_type reduce_dim,
			
 
				-        idx_type this_idx, idx_type result_idx,
			
 
				-        std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        idx_type dim_size = _dimensions.size();
			
 
				-
			
 
				-        if(dim == dim_size)
			
 
				-        {
			
 
				-            result._rep->data[result_idx] = 
			
 
				-                reduce_dim_kernel(this_idx, _strides[reduce_dim], _dimensions[reduce_dim], f);
			
 
				-            return;
			
 
				-        }
			
 
				-
			
 
				-        if(dim == reduce_dim)
			
 
				-        {
			
 
				-            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx,result_idx, f);
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            for(idx_type i = 0; i < _dimensions[dim]; ++i)
			
 
				-            {
			
 
				-                reduce_dim_impl(result, dim + 1, reduce_dim, this_idx,result_idx, f);
			
 
				-                    
			
 
				-                this_idx += _strides[dim];
			
 
				-                result_idx += result._strides[dim];
			
 
				-            }
			
 
				-        }
			
 
				-    }
			
 
				-    // public
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor()
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
			
 
				-    {
			
 
				-    }
			
 
				-
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const DimVector& dimensions)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(dimensions), _offset(0), _strides(), _order(layout_type::column_major)
			
 
				-    {
			
 
				-        auto_strides();
			
 
				-        
			
 
				-        _rep->resize_(_dimensions.flat_size());
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const DimVector& dimensions, layout_type order)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(dimensions), _offset(0), _strides(), _order(order)
			
 
				-    {
			
 
				-        auto_strides();
			
 
				-
			
 
				-        _rep->resize_(_dimensions.flat_size());
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const DimVector& dimensions, const DimVector& strides)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(dimensions), _offset(0), _strides(strides), _order(layout_type::column_major)
			
 
				-    {
			
 
				-        auto_strides();
			
 
				-
			
 
				-        _rep->resize_(_dimensions.flat_size());
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(dimensions), _offset(0), _strides(strides), _order(order)
			
 
				-    {
			
 
				-        auto_strides();
			
 
				-
			
 
				-        _rep->resize_(_dimensions.flat_size());
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    Tensor<T>::Tensor(const T& t)
			
 
				-        :_rep(new TensorStorage<T>),
			
 
				-        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
			
 
				-    {
			
 
				-        _dimensions.resize(1);
			
 
				-        auto_strides();
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::add_(TensorInterfacePtr other)
			
 
				-    {
			
 
				-		// check tensor other type
			
 
				-
			
 
				-		// check broadcast.shape = this.shape
			
 
				-
			
 
				-		// ok, get lhs, rhs
			
 
				-		Tensor<T> * lhs = this;
			
 
				-		Tensor<T> * rhs = dynamic_cast<Tensor<T> *>(other.get());
			
 
				-		std::function<void(Tensor<T> *, Tensor<T> *, idx_type, idx_type,idx_type, idx_type)> add_impl =
			
 
				-			[&](Tensor<T> * lhs, Tensor<T> * rhs, idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
			
 
				-
			
 
				-			auto lhs_storage = std::dynamic_pointer_cast<TensorStorage<T>>(lhs->storage())->data_ptr();
			
 
				-			auto rhs_storage = std::dynamic_pointer_cast<TensorStorage<T>>(rhs->storage())->data_ptr();
			
 
				-
			
 
				-			if (lhs_dim < -(lhs->size().size()) && rhs_dim < -(rhs->size().size()))
			
 
				-			{
			
 
				-				lhs_storage[lhs_idx] += rhs_storage[rhs_idx];
			
 
				-				return;
			
 
				-			}
			
 
				-
			
 
				-			idx_type lsh_shape_size = lhs_dim >= -(lhs->size().size())? lhs->size(lhs_dim) : 1;
			
 
				-			idx_type rsh_shape_size = rhs_dim >= -(rhs->size().size()) ? rhs->size(rhs_dim) : 1;
			
 
				-			idx_type max_shape_size = std::max(lsh_shape_size, rsh_shape_size);
			
 
				-
			
 
				-			for (idx_type i = 0; i < max_shape_size; ++i)
			
 
				-			{
			
 
				-				add_impl(lhs, rhs, lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
			
 
				-
			
 
				-				if(lsh_shape_size > 1)
			
 
				-					lhs_idx += lhs->stride(lhs_dim);
			
 
				-				if (rsh_shape_size > 1)
			
 
				-					rhs_idx += rhs->stride(rhs_dim);
			
 
				-			}
			
 
				-		};
			
 
				-
			
 
				-		add_impl(lhs, rhs, -1, -1, lhs->offset(), rhs->offset());
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::apply_(std::function<T(T)> f)
			
 
				-    {
			
 
				-        apply_impl(0, _offset, f);
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    TensorInterfacePtr Tensor<T>::clone() const
			
 
				-    {
			
 
				-        std::shared_ptr<Tensor<T>> cloned_tensor(new Tensor<T>);
			
 
				-        cloned_tensor->_rep = std::dynamic_pointer_cast<TensorStorage<T>>(_rep->clone());
			
 
				-        cloned_tensor->_dimensions = _dimensions;
			
 
				-        cloned_tensor->_offset = _offset;
			
 
				-        cloned_tensor->_strides = _strides;
			
 
				-        cloned_tensor->_order = _order;
			
 
				-        
			
 
				-        return cloned_tensor;
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::cos_()
			
 
				-    {
			
 
				-        apply_([](T a)->T {return std::cos(a); });
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    std::shared_ptr<TensorBase<f32>> Tensor<T>::create_grad()
			
 
				-    {
			
 
				-        return std::shared_ptr<TensorBase<f32>>(new Tensor<f32>(_dimensions));
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    T* Tensor<T>::data_ptr()
			
 
				-    {
			
 
				-        return _rep->data_ptr();
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    const T* Tensor<T>::data_ptr() const
			
 
				-    {
			
 
				-        return _rep->data_ptr();
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    device_id Tensor<T>::device() { return 0; }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::fill_(T value)
			
 
				-    {
			
 
				-        apply_([&value](T a)->T {return value; });
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    std::shared_ptr<TensorInterface> Tensor<T>::inverse() const
			
 
				-    {
			
 
				-        return std::dynamic_pointer_cast<TensorInterface>(inverse_impl(*this);
			
 
				-    }
			
 
				-
			
 
				-    template<typename T>
			
 
				-    T Tensor<T>::item() const
			
 
				-    {
			
 
				-        if(_dimensions.flat_size() == 1)
			
 
				-        {
			
 
				-            return _rep->data[_offset];
			
 
				-        }
			
 
				-        else
			
 
				-        {
			
 
				-            throw std::runtime_error("item: only one element tensors can be converted to scalars");
			
 
				-        }
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    std::shared_ptr<TensorInterface> Tensor<T>::matmul(std::shared_ptr<TensorInterface> mat) const
			
 
				-    {
			
 
				-		auto right_matrix = std::dynamic_pointer_cast<Tensor<T>>(mat);
			
 
				-		return matmul_impl(*this, *right_matrix);
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    idx_type Tensor<T>::offset() const { return _offset; }
			
 
				-    template<typename T>
			
 
				-    layout_type Tensor<T>::order() const { return _order; }
			
 
				-    template<typename T>
			
 
				-    platform_type Tensor<T>::platform() { return platform_type::none; }
			
 
				-    template<typename T>
			
 
				-    T Tensor<T>::reduce_(std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        T result{};
			
 
				-        reduce_impl(result, 0, _offset, f);
			
 
				-        return result;
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    TensorInterfacePtr Tensor<T>::reduce_dim(idx_type dim, std::function<T(T,T)> f) const
			
 
				-    {
			
 
				-        DimVector reduced_dim = _dimensions;
			
 
				-        reduced_dim.erase(dim); // check dim?
			
 
				-        TensorBasePtr<T> result(new Tensor<T>(reduced_dim));
			
 
				-        TensorPtr<T> raw_result = std::dynamic_pointer_cast<Tensor<T>>(result);
			
 
				-        reduce_dim_impl(*(raw_result.get()), 0, dim, _offset, raw_result->_offset, f);
			
 
				-        return std::dynamic_pointer_cast<TensorInterface>(result);
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::reshape_(const DimVector& dims)
			
 
				-    {
			
 
				-
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::resize_(const DimVector& dims)
			
 
				-    {
			
 
				-        _dimensions = dims;
			
 
				-        _rep->resize_(dims.flat_size());
			
 
				-        auto_strides();
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    std::shared_ptr<TensorInterface> Tensor<T>::select(const SliceVector& slice) const
			
 
				-    {
			
 
				-        std::shared_ptr<Tensor<T>> result(new Tensor<T>);
			
 
				-        result->_rep = _rep;
			
 
				-
			
 
				-        // dimension
			
 
				-        DimVector dim;
			
 
				-		std::fesetround(FE_TONEAREST);
			
 
				-        for(idx_type i = 0; i<slice.size(); ++i)
			
 
				-        {
			
 
				-			auto& each = slice[i];
			
 
				-            dim.push_back(
			
 
				-				std::lrint(std::ceil((each.end.value_or(_dimensions[i]) - each.start.value_or(0))/(float)each.step.value_or(1)))
			
 
				-			);
			
 
				-        }
			
 
				-        result->_dimensions = dim;
			
 
				-
			
 
				-        // offset
			
 
				-        idx_type new_offset =1;
			
 
				-        for(idx_type i = 0; i < slice.size(); ++i)
			
 
				-        {
			
 
				-            new_offset *= _strides[i] * slice[i].start.value_or(0);
			
 
				-        }
			
 
				-        result->_offset = _offset + new_offset;
			
 
				-
			
 
				-        // strides
			
 
				-        DimVector strides;
			
 
				-        for(idx_type i = 0; i < slice.size(); ++i)
			
 
				-        {
			
 
				-            strides.push_back(_strides[i] * slice[i].step.value_or(1));
			
 
				-        }
			
 
				-        result->_strides = strides;
			
 
				-
			
 
				-        result->_order = _order;
			
 
				-
			
 
				-        return std::dynamic_pointer_cast<TensorInterface>(result);
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    void Tensor<T>::sin_()
			
 
				-    {
			
 
				-        apply_([](T a)->T { return std::sin(a); });
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    DimVector Tensor<T>::size() const { return _dimensions;}
			
 
				-	template<typename T>
			
 
				-	idx_type Tensor<T>::size(idx_type i) const
			
 
				-	{ 
			
 
				-		auto shape_size = _dimensions.size();
			
 
				-		if (i >= 0 && i < _dimensions.size())
			
 
				-			return _dimensions[i];
			
 
				-		else if (i <= -1 && i >= -_dimensions.size())
			
 
				-			return _dimensions[shape_size + i];
			
 
				-		else
			
 
				-			throw std::runtime_error("Dimension out of range");
			
 
				-	}
			
 
				-    template<typename T>
			
 
				-	std::shared_ptr<StorageBase<T>>  Tensor<T>::storage() const { return _rep; }
			
 
				-    template<typename T>
			
 
				-    DimVector Tensor<T>::stride() const { return _strides; }
			
 
				-	template<typename T>
			
 
				-	idx_type Tensor<T>::stride(idx_type i) const
			
 
				-	{
			
 
				-		auto stride_size = _strides.size();
			
 
				-		if (i >= 0 && i < _strides.size())
			
 
				-			return _strides[i];
			
 
				-		else if (i <= -1 && i >= -_strides.size())
			
 
				-			return _strides[stride_size + i];
			
 
				-		else
			
 
				-			throw std::runtime_error("Stride out of range");
			
 
				-	}
			
 
				-    template<typename T>
			
 
				-    TensorInterfacePtr Tensor<T>::sum() const
			
 
				-    {
			
 
				-        DimVector d(1);
			
 
				-        d[0] = 1;
			
 
				-
			
 
				-        TensorPtr<T> result(new Tensor<T>(d));
			
 
				-        result->_rep->data[0] = reduce_([](T a, T b)->T {return a + b; });
			
 
				-        return std::dynamic_pointer_cast<TensorInterface>(result);
			
 
				-    }
			
 
				-    template<typename T>
			
 
				-    std::string Tensor<T>::to_string() const
			
 
				-    {
			
 
				-        std::function<std::string(const Tensor<T>&, idx_type, idx_type)> to_string_impl =
			
 
				-			[&](const Tensor<T>& t, idx_type dim, idx_type idx)->std::string {
			
 
				-            std::string result;
			
 
				-			if (dim == t.size().size())
			
 
				-            {
			
 
				-                result += std::to_string(t.data_ptr()[idx]);
			
 
				-				return result;
			
 
				-            }
			
 
				-
			
 
				-			for (idx_type i = 0; i < t.size(dim); ++i)
			
 
				-			{
			
 
				-				if (dim != t.size().size() - 1 && i != 0) result += ",\n";
			
 
				-				if(dim != t.size().size() - 1)	result += "[";
			
 
				-				result += to_string_impl(t, dim + 1, idx);
			
 
				-				if (i != t.size(dim) - 1 && dim == t.size().size() - 1)
			
 
				-					result += ",";
			
 
				-				if (dim != t.size().size() - 1) result += "]";
			
 
				-
			
 
				-				idx += t.stride(dim);
			
 
				-			}
			
 
				-
			
 
				-			return result;
			
 
				-		};
			
 
				-
			
 
				-		std::string result;
			
 
				-		result += "[" + to_string_impl(*this, 0, offset()) + "]";
			
 
				-		return result;
			
 
				-    }
			
 
				 }
			
 
				 
			
 
				+#include<traph/tensor/float_tensor.h>
			
 
				+
			
 
				 #endif // !TRAPH_TENSOR
			
--- a/traph/source/tensor/CMakeLists.txt
+++ b/traph/source/tensor/CMakeLists.txt
@@ -5,6 +5,20 @@ SET(HEADER_PATH ${TRAPH_PATH_HEADER}/${LIB_NAME})
 
				 SET(SOURCE_PATH ${TRAPH_PATH_SOURCE}/${LIB_NAME})
			
 
				 
			
 
				 SET(TENSOR_LIST
			
 
				+	${HEADER_PATH}/float_tensor.h
			
 
				+	${SOURCE_PATH}/float_tensor.cpp
			
 
				+	${HEADER_PATH}/double_tensor.h
			
 
				+	${SOURCE_PATH}/double_tensor.cpp
			
 
				+	${HEADER_PATH}/char_tensor.h
			
 
				+	${SOURCE_PATH}/char_tensor.cpp
			
 
				+	${HEADER_PATH}/byte_tensor.h
			
 
				+	${SOURCE_PATH}/byte_tensor.cpp
			
 
				+	${HEADER_PATH}/short_tensor.h
			
 
				+	${SOURCE_PATH}/short_tensor.cpp
			
 
				+	${HEADER_PATH}/int_tensor.h
			
 
				+	${SOURCE_PATH}/int_tensor.cpp
			
 
				+	${HEADER_PATH}/long_tensor.h
			
 
				+	${SOURCE_PATH}/long_tensor.cpp
			
 
				 	${HEADER_PATH}/tensor.h
			
 
				 	${SOURCE_PATH}/tensor.cpp
			
 
				 	${HEADER_PATH}/arithmetic.h
			
--- a/traph/source/tensor/byte_tensor.cpp
+++ b/traph/source/tensor/byte_tensor.cpp
@@ -0,0 +1,406 @@
 
				+#include <traph/tensor/byte_tensor.h>
			
 
				+
			
 
				+namespace traph
				+{
				+    // Out-of-class definitions for the Tensor<u8> specialization declared in
				+    // traph/tensor/byte_tensor.h.
				+
				+    namespace
				+    {
				+        // Number of storage elements a (dims, strides) view can address:
				+        // 1 + sum((dims[i] - 1) * strides[i]), or 0 when any dimension is empty.
				+        idx_type strided_extent(const DimVector& dims, const DimVector& strides)
				+        {
				+            idx_type extent = 1;
				+            for (idx_type i = 0; i < dims.size(); ++i)
				+            {
				+                if (dims[i] <= 0)
				+                    return 0;
				+                extent += (dims[i] - 1) * strides[i];
				+            }
				+            return extent;
				+        }
				+    }
				+
				+    // private
				+
				+    // Rebuilds _strides as dense strides for the current _dimensions.
				+    // With layout_type::column_major the LAST dimension gets stride 1; with
				+    // any other order the FIRST dimension does.
				+    // NOTE(review): unit stride on the last axis is what is usually called
				+    // row-major, so the enum name and this branch look swapped - confirm.
				+    void Tensor<u8>::auto_strides()
				+    {
				+        idx_type dim_num = _dimensions.size();
				+        _strides.resize(dim_num);
				+        idx_type stride = 1;
				+        if(_order == layout_type::column_major)
				+        {
				+            // The descending loop terminates only if idx_type is signed, which
				+            // the negative-index handling in size(i)/stride(i) also assumes.
				+            for (idx_type i = dim_num - 1; i >= 0; --i)
				+            {
				+                _strides[i] = stride;
				+                stride *= _dimensions[i];
				+            }
				+        }
				+        else
				+        {
				+            for (idx_type i = 0; i < dim_num; ++i)
				+            {
				+                _strides[i] = stride;
				+                stride *= _dimensions[i];
				+            }
				+        }
				+    }
				+
				+    // Applies f in place to every element reachable from flat index idx by
				+    // walking dimensions dim, dim + 1, ... with their strides.
				+    void Tensor<u8>::apply_impl(idx_type dim, idx_type idx, std::function<u8(u8)> f)
				+    {
				+        idx_type dim_size = _dimensions.size();
				+
				+        // A tensor without dimensions has no stride entry to read.
				+        if(dim >= dim_size)
				+            return;
				+
				+        idx_type step_len = _strides[dim];
				+        idx_type step_num = _dimensions[dim];
				+
				+        for(idx_type i = 0; i < step_num; ++i)
				+        {
				+            if(dim == dim_size - 1)
				+                _rep->data[idx] = f(_rep->data[idx]);
				+            else
				+                apply_impl(dim + 1, idx, f);
				+            idx += step_len;
				+        }
				+    }
				+
				+    // Folds every element reachable from idx into result via
				+    // result = f(result, element), walking dimensions dim, dim + 1, ...
				+    void Tensor<u8>::reduce_impl(u8& result, idx_type dim, idx_type idx, std::function<u8(u8,u8)> f) const
				+    {
				+        idx_type dim_size = _dimensions.size();
				+
				+        // A tensor without dimensions has no stride entry to read.
				+        if(dim >= dim_size)
				+            return;
				+
				+        idx_type step_len = _strides[dim];
				+        idx_type step_num = _dimensions[dim];
				+
				+        for(idx_type i = 0; i < step_num; ++i)
				+        {
				+            if(dim == dim_size - 1)
				+                result = f(result, _rep->data[idx]);
				+            else
				+                reduce_impl(result, dim + 1, idx, f);
				+            idx += step_len;
				+        }
				+    }
				+
				+    // Folds step_num elements starting at flat index begin and spaced
				+    // step_len apart. The accumulator starts from u8{} (zero), so f is
				+    // expected to treat zero as its neutral element.
				+    u8 Tensor<u8>::reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<u8(u8,u8)> f) const
				+    {
				+        u8 result{};
				+        for(idx_type i = 0; i < step_num; ++i)
				+        {
				+            result = f(result, _rep->data[begin]);
				+            begin += step_len;
				+        }
				+        return result;
				+    }
				+
				+    // Recursive worker of reduce_dim: enumerates every index combination of
				+    // the non-reduced dimensions and stores the fold over reduce_dim into the
				+    // matching element of result.
				+    //   dim        - dimension of *this currently being enumerated
				+    //   this_idx   - flat index into this tensor's storage
				+    //   result_idx - flat index into result's storage
				+    void Tensor<u8>::reduce_dim_impl(Tensor<u8>& result, idx_type dim, idx_type reduce_dim,
				+        idx_type this_idx, idx_type result_idx,
				+        std::function<u8(u8,u8)> f) const
				+    {
				+        idx_type dim_size = _dimensions.size();
				+
				+        if(dim == dim_size)
				+        {
				+            // Every surviving axis is fixed: fold the reduced axis into one value.
				+            result._rep->data[result_idx] =
				+                reduce_dim_kernel(this_idx, _strides[reduce_dim], _dimensions[reduce_dim], f);
				+            return;
				+        }
				+
				+        if(dim == reduce_dim)
				+        {
				+            // The reduced axis is walked by the kernel, not here.
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+        }
				+        else
				+        {
				+            // result has no slot for reduce_dim, so axes behind it sit one
				+            // position earlier there. Indexing result._strides with dim itself
				+            // would read past its end for those axes.
				+            const idx_type result_dim = dim > reduce_dim ? dim - 1 : dim;
				+            const idx_type result_step = result._strides[result_dim];
				+
				+            for(idx_type i = 0; i < _dimensions[dim]; ++i)
				+            {
				+                reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+
				+                this_idx += _strides[dim];
				+                result_idx += result_step;
				+            }
				+        }
				+    }
				+
				+    // public
				+
				+    // Empty tensor: no dimensions, no strides, freshly created storage.
				+    Tensor<u8>::Tensor()
				+        :_rep(new TensorStorage<u8>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+    }
				+
				+    // Dense tensor of the given shape with the default order.
				+    Tensor<u8>::Tensor(const DimVector& dimensions)
				+        :_rep(new TensorStorage<u8>),
				+        _dimensions(dimensions), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        auto_strides();
				+
				+        _rep->resize_(_dimensions.flat_size());
				+    }
				+
				+    // Dense tensor of the given shape and order.
				+    Tensor<u8>::Tensor(const DimVector& dimensions, layout_type order)
				+        :_rep(new TensorStorage<u8>),
				+        _dimensions(dimensions), _offset(0), _strides(), _order(order)
				+    {
				+        auto_strides();
				+
				+        _rep->resize_(_dimensions.flat_size());
				+    }
				+
				+    // Tensor of the given shape using caller-supplied strides.
				+    // The strides are kept when they cover every dimension; otherwise dense
				+    // strides are generated. Storage is sized to hold whichever is larger:
				+    // the element count or the span the strides can address.
				+    Tensor<u8>::Tensor(const DimVector& dimensions, const DimVector& strides)
				+        :_rep(new TensorStorage<u8>),
				+        _dimensions(dimensions), _offset(0), _strides(strides), _order(layout_type::column_major)
				+    {
				+        if (_strides.size() != _dimensions.size())
				+            auto_strides();
				+
				+        _rep->resize_(std::max<idx_type>(_dimensions.flat_size(), strided_extent(_dimensions, _strides)));
				+    }
				+
				+    // Same as the (dimensions, strides) constructor, with an explicit order.
				+    Tensor<u8>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order)
				+        :_rep(new TensorStorage<u8>),
				+        _dimensions(dimensions), _offset(0), _strides(strides), _order(order)
				+    {
				+        if (_strides.size() != _dimensions.size())
				+            auto_strides();
				+
				+        _rep->resize_(std::max<idx_type>(_dimensions.flat_size(), strided_extent(_dimensions, _strides)));
				+    }
				+
				+    // One-element tensor of shape {1} holding t.
				+    Tensor<u8>::Tensor(const u8& t)
				+        :_rep(new TensorStorage<u8>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        _dimensions.resize(1);
				+        _dimensions[0] = 1;
				+        auto_strides();
				+
				+        _rep->resize_(_dimensions.flat_size());
				+        _rep->data[0] = t;
				+    }
				+
				+    // In-place element-wise addition of other into this tensor.
				+    // Dimensions are paired from the last one backwards; a side whose
				+    // dimension is missing or has size 1 stays on the same element while the
				+    // other side advances.
				+    // Throws std::runtime_error when other is not a Tensor<u8>.
				+    // NOTE(review): nothing checks that the broadcast shape equals this
				+    // tensor's shape (the original TODO is still open).
				+    void Tensor<u8>::add_(TensorInterfacePtr other)
				+    {
				+        // check tensor other type
				+        Tensor<u8> * lhs = this;
				+        Tensor<u8> * rhs = dynamic_cast<Tensor<u8> *>(other.get());
				+        if (rhs == nullptr)
				+            throw std::runtime_error("add_: other is not a tensor of the same element type");
				+
				+        // check broadcast.shape = this.shape
				+
				+        // Looked up once instead of on every recursion step.
				+        u8 * lhs_data = lhs->data_ptr();
				+        const u8 * rhs_data = rhs->data_ptr();
				+        const idx_type lhs_rank = lhs->size().size();
				+        const idx_type rhs_rank = rhs->size().size();
				+
				+        // lhs_dim / rhs_dim are negative indices counted from the last dimension.
				+        std::function<void(idx_type, idx_type, idx_type, idx_type)> add_impl =
				+            [&](idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
				+
				+            // Both sides ran out of dimensions: a single element pair is left.
				+            if (lhs_dim < -lhs_rank && rhs_dim < -rhs_rank)
				+            {
				+                lhs_data[lhs_idx] += rhs_data[rhs_idx];
				+                return;
				+            }
				+
				+            idx_type lhs_shape_size = lhs_dim >= -lhs_rank ? lhs->size(lhs_dim) : 1;
				+            idx_type rhs_shape_size = rhs_dim >= -rhs_rank ? rhs->size(rhs_dim) : 1;
				+            idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
				+
				+            for (idx_type i = 0; i < max_shape_size; ++i)
				+            {
				+                add_impl(lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
				+
				+                if (lhs_shape_size > 1)
				+                    lhs_idx += lhs->stride(lhs_dim);
				+                if (rhs_shape_size > 1)
				+                    rhs_idx += rhs->stride(rhs_dim);
				+            }
				+        };
				+
				+        add_impl(-1, -1, lhs->offset(), rhs->offset());
				+    }
				+
				+    // Replaces every element x of the tensor by f(x).
				+    void Tensor<u8>::apply_(std::function<u8(u8)> f)
				+    {
				+        apply_impl(0, _offset, f);
				+    }
				+
				+    // Returns a new tensor with the same shape, offset, strides and order
				+    // whose storage comes from _rep->clone().
				+    TensorInterfacePtr Tensor<u8>::clone() const
				+    {
				+        std::shared_ptr<Tensor<u8>> cloned_tensor(new Tensor<u8>);
				+        cloned_tensor->_rep = std::dynamic_pointer_cast<TensorStorage<u8>>(_rep->clone());
				+        cloned_tensor->_dimensions = _dimensions;
				+        cloned_tensor->_offset = _offset;
				+        cloned_tensor->_strides = _strides;
				+        cloned_tensor->_order = _order;
				+
				+        return cloned_tensor;
				+    }
				+
				+    // In-place cosine. The floating-point result is truncated back to u8.
				+    void Tensor<u8>::cos_()
				+    {
				+        apply_([](u8 a)->u8 { return static_cast<u8>(std::cos(a)); });
				+    }
				+
				+    // Allocates an f32 tensor with this tensor's shape.
				+    std::shared_ptr<TensorBase<f32>> Tensor<u8>::create_grad()
				+    {
				+        return std::shared_ptr<TensorBase<f32>>(new Tensor<f32>(_dimensions));
				+    }
				+
				+    // Pointer to the start of the underlying storage (_offset is not applied).
				+    u8* Tensor<u8>::data_ptr()
				+    {
				+        return _rep->data_ptr();
				+    }
				+
				+    // Const overload of data_ptr().
				+    const u8* Tensor<u8>::data_ptr() const
				+    {
				+        return _rep->data_ptr();
				+    }
				+
				+    // Always reports device 0.
				+    device_id Tensor<u8>::device() { return 0; }
				+
				+    // Not implemented: always throws std::runtime_error.
				+    std::shared_ptr<TensorInterface> Tensor<u8>::inverse() const
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+
				+    // Sets every element of the tensor to value.
				+    void Tensor<u8>::fill_(u8 value)
				+    {
				+        apply_([value](u8)->u8 { return value; });
				+    }
				+
				+    // Returns the single element of a one-element tensor.
				+    // Throws std::runtime_error when flat_size() is not 1.
				+    u8 Tensor<u8>::item() const
				+    {
				+        if(_dimensions.flat_size() == 1)
				+        {
				+            return _rep->data[_offset];
				+        }
				+        else
				+        {
				+            throw std::runtime_error("item: only one element tensors can be converted to scalars");
				+        }
				+    }
				+
				+    // Matrix product of this tensor and mat, delegated to matmul_impl.
				+    // Throws std::runtime_error when mat is not a Tensor<u8>.
				+    std::shared_ptr<TensorInterface> Tensor<u8>::matmul(std::shared_ptr<TensorInterface> mat) const
				+    {
				+        auto right_matrix = std::dynamic_pointer_cast<Tensor<u8>>(mat);
				+        if (!right_matrix)
				+            throw std::runtime_error("matmul: operand is not a tensor of the same element type");
				+        return matmul_impl(*this, *right_matrix);
				+    }
				+
				+    // Flat index of the first element inside the storage.
				+    idx_type Tensor<u8>::offset() const { return _offset; }
				+
				+    // Layout order given at construction.
				+    layout_type Tensor<u8>::order() const { return _order; }
				+
				+    // Always reports platform_type::none.
				+    platform_type Tensor<u8>::platform() { return platform_type::none; }
				+
				+    // Folds all elements with f, starting from u8{} (zero).
				+    u8 Tensor<u8>::reduce_(std::function<u8(u8, u8)> f) const
				+    {
				+        u8 result{};
				+        reduce_impl(result, 0, _offset, f);
				+        return result;
				+    }
				+
				+    // Folds the tensor along dimension dim with f and returns a tensor whose
				+    // shape is this shape with that dimension removed.
				+    // Negative dim counts from the last dimension, as in size(i)/stride(i).
				+    // Throws std::runtime_error when dim is out of range.
				+    TensorInterfacePtr Tensor<u8>::reduce_dim(idx_type dim, std::function<u8(u8, u8)> f) const
				+    {
				+        const idx_type rank = _dimensions.size();
				+        if (dim < 0)
				+            dim += rank;
				+        if (dim < 0 || dim >= rank)
				+            throw std::runtime_error("Dimension out of range");
				+
				+        DimVector reduced_dim = _dimensions;
				+        reduced_dim.erase(dim);
				+        TensorBasePtr<u8> result(new Tensor<u8>(reduced_dim));
				+        TensorPtr<u8> raw_result = std::dynamic_pointer_cast<Tensor<u8>>(result);
				+        reduce_dim_impl(*(raw_result.get()), 0, dim, _offset, raw_result->_offset, f);
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
				+
				+    // NOTE(review): not implemented yet - the call is silently ignored.
				+    void Tensor<u8>::reshape_(const DimVector& dims)
				+    {
				+
				+    }
				+
				+    // Changes the shape to dims, resizes the storage to dims.flat_size()
				+    // and regenerates dense strides. _offset is left untouched.
				+    void Tensor<u8>::resize_(const DimVector& dims)
				+    {
				+        _dimensions = dims;
				+        _rep->resize_(dims.flat_size());
				+        auto_strides();
				+    }
				+
				+    // Returns a view that shares this tensor's storage and is restricted by
				+    // one slice (start, end, step) per leading dimension. Missing fields
				+    // default to start = 0, end = size of the dimension, step = 1.
				+    // Per sliced dimension i:
				+    //   length = ceil((end - start) / step)
				+    //   stride = _strides[i] * step
				+    // and the view starts at _offset + sum(_strides[i] * start).
				+    // Throws std::runtime_error for a zero step or more slices than dimensions.
				+    // NOTE(review): dimensions beyond slice.size() are dropped from the
				+    // result rather than kept whole - confirm this is intended.
				+    std::shared_ptr<TensorInterface> Tensor<u8>::select(const SliceVector& slice) const
				+    {
				+        const idx_type slice_num = slice.size();
				+        if (slice_num > _dimensions.size())
				+            throw std::runtime_error("select: more slices than dimensions");
				+
				+        std::shared_ptr<Tensor<u8>> result(new Tensor<u8>);
				+        result->_rep = _rep;
				+
				+        DimVector dim;
				+        DimVector strides;
				+        idx_type new_offset = 0;
				+        for (idx_type i = 0; i < slice_num; ++i)
				+        {
				+            auto& each = slice[i];
				+            const idx_type start = each.start.value_or(0);
				+            const idx_type stop = each.end.value_or(_dimensions[i]);
				+            const idx_type step = each.step.value_or(1);
				+            if (step == 0)
				+                throw std::runtime_error("select: slice step cannot be zero");
				+
				+            // Integer ceil((stop - start) / step): exact for any size and free
				+            // of the global floating-point rounding-mode side effect.
				+            const idx_type span = stop - start;
				+            idx_type length = span / step;
				+            if (span % step != 0 && ((span < 0) == (step < 0)))
				+                ++length;
				+
				+            dim.push_back(length);
				+            // Each dimension contributes stride * start; the terms are summed.
				+            new_offset += _strides[i] * start;
				+            strides.push_back(_strides[i] * step);
				+        }
				+
				+        result->_dimensions = dim;
				+        result->_offset = _offset + new_offset;
				+        result->_strides = strides;
				+        result->_order = _order;
				+
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
				+
				+    // In-place sine. The floating-point result is truncated back to u8.
				+    void Tensor<u8>::sin_()
				+    {
				+        apply_([](u8 a)->u8 { return static_cast<u8>(std::sin(a)); });
				+    }
				+
				+    // Copy of the shape.
				+    DimVector Tensor<u8>::size() const { return _dimensions;}
				+
				+    // Size of dimension i. Negative i counts from the last dimension.
				+    // Throws std::runtime_error when i is out of range.
				+    idx_type Tensor<u8>::size(idx_type i) const
				+    {
				+        auto shape_size = _dimensions.size();
				+        if (i >= 0 && i < shape_size)
				+            return _dimensions[i];
				+        else if (i <= -1 && i >= -shape_size)
				+            return _dimensions[shape_size + i];
				+        else
				+            throw std::runtime_error("Dimension out of range");
				+    }
				+
				+    // Shared handle to the underlying storage.
				+    std::shared_ptr<StorageBase<u8>>  Tensor<u8>::storage() const { return _rep; }
				+
				+    // Copy of the strides.
				+    DimVector Tensor<u8>::stride() const { return _strides; }
				+
				+    // Stride of dimension i. Negative i counts from the last dimension.
				+    // Throws std::runtime_error when i is out of range.
				+    idx_type Tensor<u8>::stride(idx_type i) const
				+    {
				+        auto stride_size = _strides.size();
				+        if (i >= 0 && i < stride_size)
				+            return _strides[i];
				+        else if (i <= -1 && i >= -stride_size)
				+            return _strides[stride_size + i];
				+        else
				+            throw std::runtime_error("Stride out of range");
				+    }
				+
				+    // Sum of all elements, returned as a tensor of shape {1}.
				+    // The accumulation happens in u8, so the result is narrowed back to u8
				+    // after every addition.
				+    TensorInterfacePtr Tensor<u8>::sum() const
				+    {
				+        DimVector d(1);
				+        d[0] = 1;
				+
				+        TensorPtr<u8> result(new Tensor<u8>(d));
				+        result->_rep->data[0] = reduce_([](u8 a, u8 b)->u8 { return a + b; });
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
				+
				+    // Renders the tensor as nested bracketed text. Elements of the innermost
				+    // dimension are separated by "," and sibling sub-blocks by ",\n".
				+    std::string Tensor<u8>::to_string() const
				+    {
				+        // dim is the dimension being printed, idx the flat storage index.
				+        std::function<std::string(const Tensor<u8>&, idx_type, idx_type)> to_string_impl =
				+            [&](const Tensor<u8>& t, idx_type dim, idx_type idx)->std::string {
				+            std::string result;
				+            // Fetched once per call instead of copying the shape in every test.
				+            const auto rank = t.size().size();
				+            if (dim == rank)
				+            {
				+                result += std::to_string(t.data_ptr()[idx]);
				+                return result;
				+            }
				+
				+            for (idx_type i = 0; i < t.size(dim); ++i)
				+            {
				+                if (dim != rank - 1 && i != 0) result += ",\n";
				+                if (dim != rank - 1) result += "[";
				+                result += to_string_impl(t, dim + 1, idx);
				+                if (i != t.size(dim) - 1 && dim == rank - 1)
				+                    result += ",";
				+                if (dim != rank - 1) result += "]";
				+
				+                idx += t.stride(dim);
				+            }
				+
				+            return result;
				+        };
				+
				+        std::string result;
				+        result += "[" + to_string_impl(*this, 0, offset()) + "]";
				+        return result;
				+    }
				+}
			
--- a/traph/source/tensor/char_tensor.cpp
+++ b/traph/source/tensor/char_tensor.cpp
@@ -0,0 +1,406 @@
 
				+#include <traph/tensor/char_tensor.h>
			
 
				+
			
 
				+namespace traph
				+{
				+    // Out-of-class definitions for the Tensor<i8> specialization declared in
				+    // traph/tensor/char_tensor.h.
				+
				+    namespace
				+    {
				+        // Number of storage elements a (dims, strides) view can address:
				+        // 1 + sum((dims[i] - 1) * strides[i]), or 0 when any dimension is empty.
				+        idx_type strided_extent(const DimVector& dims, const DimVector& strides)
				+        {
				+            idx_type extent = 1;
				+            for (idx_type i = 0; i < dims.size(); ++i)
				+            {
				+                if (dims[i] <= 0)
				+                    return 0;
				+                extent += (dims[i] - 1) * strides[i];
				+            }
				+            return extent;
				+        }
				+    }
				+
				+    // private
				+
				+    // Rebuilds _strides as dense strides for the current _dimensions.
				+    // With layout_type::column_major the LAST dimension gets stride 1; with
				+    // any other order the FIRST dimension does.
				+    // NOTE(review): unit stride on the last axis is what is usually called
				+    // row-major, so the enum name and this branch look swapped - confirm.
				+    void Tensor<i8>::auto_strides()
				+    {
				+        idx_type dim_num = _dimensions.size();
				+        _strides.resize(dim_num);
				+        idx_type stride = 1;
				+        if(_order == layout_type::column_major)
				+        {
				+            // The descending loop terminates only if idx_type is signed, which
				+            // the negative-index handling in size(i)/stride(i) also assumes.
				+            for (idx_type i = dim_num - 1; i >= 0; --i)
				+            {
				+                _strides[i] = stride;
				+                stride *= _dimensions[i];
				+            }
				+        }
				+        else
				+        {
				+            for (idx_type i = 0; i < dim_num; ++i)
				+            {
				+                _strides[i] = stride;
				+                stride *= _dimensions[i];
				+            }
				+        }
				+    }
				+
				+    // Applies f in place to every element reachable from flat index idx by
				+    // walking dimensions dim, dim + 1, ... with their strides.
				+    void Tensor<i8>::apply_impl(idx_type dim, idx_type idx, std::function<i8(i8)> f)
				+    {
				+        idx_type dim_size = _dimensions.size();
				+
				+        // A tensor without dimensions has no stride entry to read.
				+        if(dim >= dim_size)
				+            return;
				+
				+        idx_type step_len = _strides[dim];
				+        idx_type step_num = _dimensions[dim];
				+
				+        for(idx_type i = 0; i < step_num; ++i)
				+        {
				+            if(dim == dim_size - 1)
				+                _rep->data[idx] = f(_rep->data[idx]);
				+            else
				+                apply_impl(dim + 1, idx, f);
				+            idx += step_len;
				+        }
				+    }
				+
				+    // Folds every element reachable from idx into result via
				+    // result = f(result, element), walking dimensions dim, dim + 1, ...
				+    void Tensor<i8>::reduce_impl(i8& result, idx_type dim, idx_type idx, std::function<i8(i8,i8)> f) const
				+    {
				+        idx_type dim_size = _dimensions.size();
				+
				+        // A tensor without dimensions has no stride entry to read.
				+        if(dim >= dim_size)
				+            return;
				+
				+        idx_type step_len = _strides[dim];
				+        idx_type step_num = _dimensions[dim];
				+
				+        for(idx_type i = 0; i < step_num; ++i)
				+        {
				+            if(dim == dim_size - 1)
				+                result = f(result, _rep->data[idx]);
				+            else
				+                reduce_impl(result, dim + 1, idx, f);
				+            idx += step_len;
				+        }
				+    }
				+
				+    // Folds step_num elements starting at flat index begin and spaced
				+    // step_len apart. The accumulator starts from i8{} (zero), so f is
				+    // expected to treat zero as its neutral element.
				+    i8 Tensor<i8>::reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<i8(i8,i8)> f) const
				+    {
				+        i8 result{};
				+        for(idx_type i = 0; i < step_num; ++i)
				+        {
				+            result = f(result, _rep->data[begin]);
				+            begin += step_len;
				+        }
				+        return result;
				+    }
				+
				+    // Recursive worker of reduce_dim: enumerates every index combination of
				+    // the non-reduced dimensions and stores the fold over reduce_dim into the
				+    // matching element of result.
				+    //   dim        - dimension of *this currently being enumerated
				+    //   this_idx   - flat index into this tensor's storage
				+    //   result_idx - flat index into result's storage
				+    void Tensor<i8>::reduce_dim_impl(Tensor<i8>& result, idx_type dim, idx_type reduce_dim,
				+        idx_type this_idx, idx_type result_idx,
				+        std::function<i8(i8,i8)> f) const
				+    {
				+        idx_type dim_size = _dimensions.size();
				+
				+        if(dim == dim_size)
				+        {
				+            // Every surviving axis is fixed: fold the reduced axis into one value.
				+            result._rep->data[result_idx] =
				+                reduce_dim_kernel(this_idx, _strides[reduce_dim], _dimensions[reduce_dim], f);
				+            return;
				+        }
				+
				+        if(dim == reduce_dim)
				+        {
				+            // The reduced axis is walked by the kernel, not here.
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+        }
				+        else
				+        {
				+            // result has no slot for reduce_dim, so axes behind it sit one
				+            // position earlier there. Indexing result._strides with dim itself
				+            // would read past its end for those axes.
				+            const idx_type result_dim = dim > reduce_dim ? dim - 1 : dim;
				+            const idx_type result_step = result._strides[result_dim];
				+
				+            for(idx_type i = 0; i < _dimensions[dim]; ++i)
				+            {
				+                reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+
				+                this_idx += _strides[dim];
				+                result_idx += result_step;
				+            }
				+        }
				+    }
				+
				+    // public
				+
				+    // Empty tensor: no dimensions, no strides, freshly created storage.
				+    Tensor<i8>::Tensor()
				+        :_rep(new TensorStorage<i8>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+    }
				+
				+    // Dense tensor of the given shape with the default order.
				+    Tensor<i8>::Tensor(const DimVector& dimensions)
				+        :_rep(new TensorStorage<i8>),
				+        _dimensions(dimensions), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        auto_strides();
				+
				+        _rep->resize_(_dimensions.flat_size());
				+    }
				+
				+    // Dense tensor of the given shape and order.
				+    Tensor<i8>::Tensor(const DimVector& dimensions, layout_type order)
				+        :_rep(new TensorStorage<i8>),
				+        _dimensions(dimensions), _offset(0), _strides(), _order(order)
				+    {
				+        auto_strides();
				+
				+        _rep->resize_(_dimensions.flat_size());
				+    }
				+
				+    // Tensor of the given shape using caller-supplied strides.
				+    // The strides are kept when they cover every dimension; otherwise dense
				+    // strides are generated. Storage is sized to hold whichever is larger:
				+    // the element count or the span the strides can address.
				+    Tensor<i8>::Tensor(const DimVector& dimensions, const DimVector& strides)
				+        :_rep(new TensorStorage<i8>),
				+        _dimensions(dimensions), _offset(0), _strides(strides), _order(layout_type::column_major)
				+    {
				+        if (_strides.size() != _dimensions.size())
				+            auto_strides();
				+
				+        _rep->resize_(std::max<idx_type>(_dimensions.flat_size(), strided_extent(_dimensions, _strides)));
				+    }
				+
				+    // Same as the (dimensions, strides) constructor, with an explicit order.
				+    Tensor<i8>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order)
				+        :_rep(new TensorStorage<i8>),
				+        _dimensions(dimensions), _offset(0), _strides(strides), _order(order)
				+    {
				+        if (_strides.size() != _dimensions.size())
				+            auto_strides();
				+
				+        _rep->resize_(std::max<idx_type>(_dimensions.flat_size(), strided_extent(_dimensions, _strides)));
				+    }
				+
				+    // One-element tensor of shape {1} holding t.
				+    Tensor<i8>::Tensor(const i8& t)
				+        :_rep(new TensorStorage<i8>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        _dimensions.resize(1);
				+        _dimensions[0] = 1;
				+        auto_strides();
				+
				+        _rep->resize_(_dimensions.flat_size());
				+        _rep->data[0] = t;
				+    }
				+
				+    // In-place element-wise addition of other into this tensor.
				+    // Dimensions are paired from the last one backwards; a side whose
				+    // dimension is missing or has size 1 stays on the same element while the
				+    // other side advances.
				+    // Throws std::runtime_error when other is not a Tensor<i8>.
				+    // NOTE(review): nothing checks that the broadcast shape equals this
				+    // tensor's shape (the original TODO is still open).
				+    void Tensor<i8>::add_(TensorInterfacePtr other)
				+    {
				+        // check tensor other type
				+        Tensor<i8> * lhs = this;
				+        Tensor<i8> * rhs = dynamic_cast<Tensor<i8> *>(other.get());
				+        if (rhs == nullptr)
				+            throw std::runtime_error("add_: other is not a tensor of the same element type");
				+
				+        // check broadcast.shape = this.shape
				+
				+        // Looked up once instead of on every recursion step.
				+        i8 * lhs_data = lhs->data_ptr();
				+        const i8 * rhs_data = rhs->data_ptr();
				+        const idx_type lhs_rank = lhs->size().size();
				+        const idx_type rhs_rank = rhs->size().size();
				+
				+        // lhs_dim / rhs_dim are negative indices counted from the last dimension.
				+        std::function<void(idx_type, idx_type, idx_type, idx_type)> add_impl =
				+            [&](idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
				+
				+            // Both sides ran out of dimensions: a single element pair is left.
				+            if (lhs_dim < -lhs_rank && rhs_dim < -rhs_rank)
				+            {
				+                lhs_data[lhs_idx] += rhs_data[rhs_idx];
				+                return;
				+            }
				+
				+            idx_type lhs_shape_size = lhs_dim >= -lhs_rank ? lhs->size(lhs_dim) : 1;
				+            idx_type rhs_shape_size = rhs_dim >= -rhs_rank ? rhs->size(rhs_dim) : 1;
				+            idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
				+
				+            for (idx_type i = 0; i < max_shape_size; ++i)
				+            {
				+                add_impl(lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
				+
				+                if (lhs_shape_size > 1)
				+                    lhs_idx += lhs->stride(lhs_dim);
				+                if (rhs_shape_size > 1)
				+                    rhs_idx += rhs->stride(rhs_dim);
				+            }
				+        };
				+
				+        add_impl(-1, -1, lhs->offset(), rhs->offset());
				+    }
				+
				+    // Replaces every element x of the tensor by f(x).
				+    void Tensor<i8>::apply_(std::function<i8(i8)> f)
				+    {
				+        apply_impl(0, _offset, f);
				+    }
				+
				+    // Returns a new tensor with the same shape, offset, strides and order
				+    // whose storage comes from _rep->clone().
				+    TensorInterfacePtr Tensor<i8>::clone() const
				+    {
				+        std::shared_ptr<Tensor<i8>> cloned_tensor(new Tensor<i8>);
				+        cloned_tensor->_rep = std::dynamic_pointer_cast<TensorStorage<i8>>(_rep->clone());
				+        cloned_tensor->_dimensions = _dimensions;
				+        cloned_tensor->_offset = _offset;
				+        cloned_tensor->_strides = _strides;
				+        cloned_tensor->_order = _order;
				+
				+        return cloned_tensor;
				+    }
				+
				+    // In-place cosine. The floating-point result is truncated back to i8.
				+    void Tensor<i8>::cos_()
				+    {
				+        apply_([](i8 a)->i8 { return static_cast<i8>(std::cos(a)); });
				+    }
				+
				+    // Allocates an f32 tensor with this tensor's shape.
				+    std::shared_ptr<TensorBase<f32>> Tensor<i8>::create_grad()
				+    {
				+        return std::shared_ptr<TensorBase<f32>>(new Tensor<f32>(_dimensions));
				+    }
				+
				+    // Pointer to the start of the underlying storage (_offset is not applied).
				+    i8* Tensor<i8>::data_ptr()
				+    {
				+        return _rep->data_ptr();
				+    }
				+
				+    // Const overload of data_ptr().
				+    const i8* Tensor<i8>::data_ptr() const
				+    {
				+        return _rep->data_ptr();
				+    }
				+
				+    // Always reports device 0.
				+    device_id Tensor<i8>::device() { return 0; }
				+
				+    // Not implemented: always throws std::runtime_error.
				+    std::shared_ptr<TensorInterface> Tensor<i8>::inverse() const
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+
				+    // Sets every element of the tensor to value.
				+    void Tensor<i8>::fill_(i8 value)
				+    {
				+        apply_([value](i8)->i8 { return value; });
				+    }
				+
				+    // Returns the single element of a one-element tensor.
				+    // Throws std::runtime_error when flat_size() is not 1.
				+    i8 Tensor<i8>::item() const
				+    {
				+        if(_dimensions.flat_size() == 1)
				+        {
				+            return _rep->data[_offset];
				+        }
				+        else
				+        {
				+            throw std::runtime_error("item: only one element tensors can be converted to scalars");
				+        }
				+    }
				+
				+    // Matrix product of this tensor and mat, delegated to matmul_impl.
				+    // Throws std::runtime_error when mat is not a Tensor<i8>.
				+    std::shared_ptr<TensorInterface> Tensor<i8>::matmul(std::shared_ptr<TensorInterface> mat) const
				+    {
				+        auto right_matrix = std::dynamic_pointer_cast<Tensor<i8>>(mat);
				+        if (!right_matrix)
				+            throw std::runtime_error("matmul: operand is not a tensor of the same element type");
				+        return matmul_impl(*this, *right_matrix);
				+    }
				+
				+    // Flat index of the first element inside the storage.
				+    idx_type Tensor<i8>::offset() const { return _offset; }
				+
				+    // Layout order given at construction.
				+    layout_type Tensor<i8>::order() const { return _order; }
				+
				+    // Always reports platform_type::none.
				+    platform_type Tensor<i8>::platform() { return platform_type::none; }
				+
				+    // Folds all elements with f, starting from i8{} (zero).
				+    i8 Tensor<i8>::reduce_(std::function<i8(i8, i8)> f) const
				+    {
				+        i8 result{};
				+        reduce_impl(result, 0, _offset, f);
				+        return result;
				+    }
				+
				+    // Folds the tensor along dimension dim with f and returns a tensor whose
				+    // shape is this shape with that dimension removed.
				+    // Negative dim counts from the last dimension, as in size(i)/stride(i).
				+    // Throws std::runtime_error when dim is out of range.
				+    TensorInterfacePtr Tensor<i8>::reduce_dim(idx_type dim, std::function<i8(i8, i8)> f) const
				+    {
				+        const idx_type rank = _dimensions.size();
				+        if (dim < 0)
				+            dim += rank;
				+        if (dim < 0 || dim >= rank)
				+            throw std::runtime_error("Dimension out of range");
				+
				+        DimVector reduced_dim = _dimensions;
				+        reduced_dim.erase(dim);
				+        TensorBasePtr<i8> result(new Tensor<i8>(reduced_dim));
				+        TensorPtr<i8> raw_result = std::dynamic_pointer_cast<Tensor<i8>>(result);
				+        reduce_dim_impl(*(raw_result.get()), 0, dim, _offset, raw_result->_offset, f);
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
				+
				+    // NOTE(review): not implemented yet - the call is silently ignored.
				+    void Tensor<i8>::reshape_(const DimVector& dims)
				+    {
				+
				+    }
				+
				+    // Changes the shape to dims, resizes the storage to dims.flat_size()
				+    // and regenerates dense strides. _offset is left untouched.
				+    void Tensor<i8>::resize_(const DimVector& dims)
				+    {
				+        _dimensions = dims;
				+        _rep->resize_(dims.flat_size());
				+        auto_strides();
				+    }
				+
				+    // Returns a view that shares this tensor's storage and is restricted by
				+    // one slice (start, end, step) per leading dimension. Missing fields
				+    // default to start = 0, end = size of the dimension, step = 1.
				+    // Per sliced dimension i:
				+    //   length = ceil((end - start) / step)
				+    //   stride = _strides[i] * step
				+    // and the view starts at _offset + sum(_strides[i] * start).
				+    // Throws std::runtime_error for a zero step or more slices than dimensions.
				+    // NOTE(review): dimensions beyond slice.size() are dropped from the
				+    // result rather than kept whole - confirm this is intended.
				+    std::shared_ptr<TensorInterface> Tensor<i8>::select(const SliceVector& slice) const
				+    {
				+        const idx_type slice_num = slice.size();
				+        if (slice_num > _dimensions.size())
				+            throw std::runtime_error("select: more slices than dimensions");
				+
				+        std::shared_ptr<Tensor<i8>> result(new Tensor<i8>);
				+        result->_rep = _rep;
				+
				+        DimVector dim;
				+        DimVector strides;
				+        idx_type new_offset = 0;
				+        for (idx_type i = 0; i < slice_num; ++i)
				+        {
				+            auto& each = slice[i];
				+            const idx_type start = each.start.value_or(0);
				+            const idx_type stop = each.end.value_or(_dimensions[i]);
				+            const idx_type step = each.step.value_or(1);
				+            if (step == 0)
				+                throw std::runtime_error("select: slice step cannot be zero");
				+
				+            // Integer ceil((stop - start) / step): exact for any size and free
				+            // of the global floating-point rounding-mode side effect.
				+            const idx_type span = stop - start;
				+            idx_type length = span / step;
				+            if (span % step != 0 && ((span < 0) == (step < 0)))
				+                ++length;
				+
				+            dim.push_back(length);
				+            // Each dimension contributes stride * start; the terms are summed.
				+            new_offset += _strides[i] * start;
				+            strides.push_back(_strides[i] * step);
				+        }
				+
				+        result->_dimensions = dim;
				+        result->_offset = _offset + new_offset;
				+        result->_strides = strides;
				+        result->_order = _order;
				+
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
				+
				+    // In-place sine. The floating-point result is truncated back to i8.
				+    void Tensor<i8>::sin_()
				+    {
				+        apply_([](i8 a)->i8 { return static_cast<i8>(std::sin(a)); });
				+    }
				+
				+    // Copy of the shape.
				+    DimVector Tensor<i8>::size() const { return _dimensions;}
				+
				+    // Size of dimension i. Negative i counts from the last dimension.
				+    // Throws std::runtime_error when i is out of range.
				+    idx_type Tensor<i8>::size(idx_type i) const
				+    {
				+        auto shape_size = _dimensions.size();
				+        if (i >= 0 && i < shape_size)
				+            return _dimensions[i];
				+        else if (i <= -1 && i >= -shape_size)
				+            return _dimensions[shape_size + i];
				+        else
				+            throw std::runtime_error("Dimension out of range");
				+    }
				+
				+    // Shared handle to the underlying storage.
				+    std::shared_ptr<StorageBase<i8>>  Tensor<i8>::storage() const { return _rep; }
				+
				+    // Copy of the strides.
				+    DimVector Tensor<i8>::stride() const { return _strides; }
				+
				+    // Stride of dimension i. Negative i counts from the last dimension.
				+    // Throws std::runtime_error when i is out of range.
				+    idx_type Tensor<i8>::stride(idx_type i) const
				+    {
				+        auto stride_size = _strides.size();
				+        if (i >= 0 && i < stride_size)
				+            return _strides[i];
				+        else if (i <= -1 && i >= -stride_size)
				+            return _strides[stride_size + i];
				+        else
				+            throw std::runtime_error("Stride out of range");
				+    }
				+
				+    // Sum of all elements, returned as a tensor of shape {1}.
				+    // The accumulation happens in i8, so the result is narrowed back to i8
				+    // after every addition.
				+    TensorInterfacePtr Tensor<i8>::sum() const
				+    {
				+        DimVector d(1);
				+        d[0] = 1;
				+
				+        TensorPtr<i8> result(new Tensor<i8>(d));
				+        result->_rep->data[0] = reduce_([](i8 a, i8 b)->i8 { return a + b; });
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
				+
				+    // Renders the tensor as nested bracketed text. Elements of the innermost
				+    // dimension are separated by "," and sibling sub-blocks by ",\n".
				+    std::string Tensor<i8>::to_string() const
				+    {
				+        // dim is the dimension being printed, idx the flat storage index.
				+        std::function<std::string(const Tensor<i8>&, idx_type, idx_type)> to_string_impl =
				+            [&](const Tensor<i8>& t, idx_type dim, idx_type idx)->std::string {
				+            std::string result;
				+            // Fetched once per call instead of copying the shape in every test.
				+            const auto rank = t.size().size();
				+            if (dim == rank)
				+            {
				+                result += std::to_string(t.data_ptr()[idx]);
				+                return result;
				+            }
				+
				+            for (idx_type i = 0; i < t.size(dim); ++i)
				+            {
				+                if (dim != rank - 1 && i != 0) result += ",\n";
				+                if (dim != rank - 1) result += "[";
				+                result += to_string_impl(t, dim + 1, idx);
				+                if (i != t.size(dim) - 1 && dim == rank - 1)
				+                    result += ",";
				+                if (dim != rank - 1) result += "]";
				+
				+                idx += t.stride(dim);
				+            }
				+
				+            return result;
				+        };
				+
				+        std::string result;
				+        result += "[" + to_string_impl(*this, 0, offset()) + "]";
				+        return result;
				+    }
				+}
			
--- a/traph/source/tensor/double_tensor.cpp
+++ b/traph/source/tensor/double_tensor.cpp
@@ -0,0 +1,406 @@
 
				+#include <traph/tensor/double_tensor.h>
			
 
				+
			
 
				+namespace traph
			
 
				+{
			
 
				+    // definition
			
 
				+    // private
			
 
				+    // Recomputes _strides from _dimensions as a dense (gap-free) layout.
				+    // For layout_type::column_major the walk runs from the last axis to the
				+    // first, so the LAST axis receives stride 1; for any other order the walk
				+    // runs first-to-last and the FIRST axis receives stride 1.
				+    void Tensor<f64>::auto_strides()
				+    {
				+        const idx_type rank = _dimensions.size();
				+        _strides.resize(rank);
				+
				+        const bool walk_backwards = (_order == layout_type::column_major);
				+        idx_type running = 1;
				+        for (idx_type n = 0; n < rank; ++n)
				+        {
				+            const idx_type axis = walk_backwards ? rank - 1 - n : n;
				+            _strides[axis] = running;
				+            running *= _dimensions[axis];
				+        }
				+    }
			
 
				+
			
 
				+    // Recursive worker for apply_: visits every element reachable from storage
				+    // index idx along axes dim..last and overwrites it with f(element).
				+    //   dim - axis handled at this recursion level
				+    //   idx - storage index of the first element for this level
				+    //   f   - element-wise transform
				+    void Tensor<f64>::apply_impl(idx_type dim, idx_type idx, std::function<f64(f64)> f)
				+    {
				+        const idx_type rank = _dimensions.size();
				+        const idx_type stride = _strides[dim];
				+        const idx_type extent = _dimensions[dim];
				+        const bool innermost = (dim == rank - 1);
				+
				+        idx_type pos = idx;
				+        for (idx_type left = extent; left > 0; --left, pos += stride)
				+        {
				+            if (innermost)
				+                _rep->data[pos] = f(_rep->data[pos]);
				+            else
				+                apply_impl(dim + 1, pos, f);
				+        }
				+    }
			
 
				+
			
 
				+    // Recursive worker for reduce_: folds every element reachable from storage
				+    // index idx along axes dim..last into result using result = f(result, element).
				+    //   result - running accumulator, updated in place
				+    //   dim    - axis handled at this recursion level
				+    //   idx    - storage index of the first element for this level
				+    void Tensor<f64>::reduce_impl(f64& result, idx_type dim, idx_type idx, std::function<f64(f64,f64)> f) const
				+    {
				+        const idx_type rank = _dimensions.size();
				+        const idx_type stride = _strides[dim];
				+        const idx_type extent = _dimensions[dim];
				+        const bool innermost = (dim == rank - 1);
				+
				+        idx_type pos = idx;
				+        for (idx_type left = extent; left > 0; --left, pos += stride)
				+        {
				+            if (innermost)
				+                result = f(result, _rep->data[pos]);
				+            else
				+                reduce_impl(result, dim + 1, pos, f);
				+        }
				+    }
			
 
				+
			
 
				+    // Folds step_num elements, starting at storage index begin and spaced
				+    // step_len apart, into a value-initialized accumulator using f.
				+    // Returns the accumulator (f64{} when step_num <= 0).
				+    f64 Tensor<f64>::reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<f64(f64,f64)> f) const
				+    {
				+        f64 acc{};
				+        idx_type pos = begin;
				+        for (idx_type left = step_num; left > 0; --left)
				+        {
				+            acc = f(acc, _rep->data[pos]);
				+            pos += step_len;
				+        }
				+        return acc;
				+    }
			
 
				+
			
 
				+    // Recursive worker for reduce_dim: walks every axis except reduce_dim and,
				+    // at each leaf, stores the fold of the reduce_dim line into result.
				+    //   result     - output tensor; it has one axis fewer than *this
				+    //   dim        - axis of *this handled at this recursion level
				+    //   reduce_dim - axis of *this being folded away
				+    //   this_idx   - storage index into *this for the current position
				+    //   result_idx - storage index into result for the current position
				+    //   f          - binary combiner passed to reduce_dim_kernel
				+    void Tensor<f64>::reduce_dim_impl(Tensor<f64>& result, idx_type dim, idx_type reduce_dim,
				+        idx_type this_idx, idx_type result_idx,
				+        std::function<f64(f64,f64)> f) const
				+    {
				+        const idx_type dim_size = _dimensions.size();
				+
				+        // All axes consumed: fold along reduce_dim starting at this_idx.
				+        if(dim == dim_size)
				+        {
				+            result._rep->data[result_idx] =
				+                reduce_dim_kernel(this_idx, _strides[reduce_dim], _dimensions[reduce_dim], f);
				+            return;
				+        }
				+
				+        // The reduced axis contributes no loop of its own here.
				+        if(dim == reduce_dim)
				+        {
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+            return;
				+        }
				+
				+        // result lacks the reduced axis, so every source axis after reduce_dim
				+        // maps to the result axis one position earlier. Using the source axis
				+        // number directly would read the wrong (or an out-of-range) stride.
				+        const idx_type result_dim = (dim > reduce_dim) ? dim - 1 : dim;
				+        const idx_type this_step = _strides[dim];
				+        const idx_type result_step = result._strides[result_dim];
				+
				+        for(idx_type i = 0; i < _dimensions[dim]; ++i)
				+        {
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+
				+            this_idx += this_step;
				+            result_idx += result_step;
				+        }
				+    }
			
 
				+    // public
			
 
				+    // Creates a tensor with fresh storage, no dimensions, zero offset and
				+    // layout_type::column_major order.
				+    Tensor<f64>::Tensor()
				+        : _rep(new TensorStorage<f64>)
				+        , _dimensions()
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+    }
				+
				+    // Creates a tensor of the given shape with layout_type::column_major order.
				+    // Storage is sized to dimensions.flat_size() elements and strides are
				+    // derived by auto_strides().
				+    Tensor<f64>::Tensor(const DimVector& dimensions)
				+        : _rep(new TensorStorage<f64>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+        _rep->resize_(_dimensions.flat_size());
				+        auto_strides();
				+    }
				+
				+    // Creates a tensor of the given shape and explicit layout order.
				+    // Storage is sized to dimensions.flat_size() elements and strides are
				+    // derived by auto_strides().
				+    Tensor<f64>::Tensor(const DimVector& dimensions, layout_type order)
				+        : _rep(new TensorStorage<f64>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(order)
				+    {
				+        _rep->resize_(_dimensions.flat_size());
				+        auto_strides();
				+    }
			
 
				+
			
 
				+    // Creates a tensor from a shape and a strides vector, column_major order.
				+    // NOTE(review): auto_strides() below recomputes _strides from the shape,
				+    // so the strides argument is overwritten - confirm whether that is intended.
				+    Tensor<f64>::Tensor(const DimVector& dimensions, const DimVector& strides)
				+        : _rep(new TensorStorage<f64>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides(strides)
				+        , _order(layout_type::column_major)
				+    {
				+        _rep->resize_(_dimensions.flat_size());
				+        auto_strides();
				+    }
				+
				+    // Creates a tensor from a shape, a strides vector and a layout order.
				+    // NOTE(review): auto_strides() below recomputes _strides from the shape,
				+    // so the strides argument is overwritten - confirm whether that is intended.
				+    Tensor<f64>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order)
				+        : _rep(new TensorStorage<f64>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides(strides)
				+        , _order(order)
				+    {
				+        _rep->resize_(_dimensions.flat_size());
				+        auto_strides();
				+    }
			
 
				+
			
 
				+    // Creates a one-element tensor (shape {1}) holding the scalar t.
				+    // _order is initialized explicitly because auto_strides() reads it;
				+    // previously it was left uninitialized, storage was never sized and the
				+    // value t was discarded.
				+    Tensor<f64>::Tensor(const f64& t)
				+        :_rep(new TensorStorage<f64>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        _dimensions.resize(1);
				+        _dimensions[0] = 1;
				+        auto_strides();
				+
				+        _rep->resize_(_dimensions.flat_size());
				+        _rep->data[_offset] = t;
				+    }
			
 
				+
			
 
				+    // In-place element-wise addition: *this += other.
				+    // Axes are matched from the last one backwards (negative axis numbers); an
				+    // operand that has run out of axes, or whose extent on an axis is 1, keeps
				+    // its index fixed on that axis while the other operand advances.
				+    // Throws std::runtime_error when other is null or is not a Tensor<f64>.
				+    // NOTE(review): shape compatibility is still not validated (see the TODO
				+    // comments below, carried over from the original).
				+    void Tensor<f64>::add_(TensorInterfacePtr other)
				+    {
				+        // check tensor other type
				+        Tensor<f64> * lhs = this;
				+        Tensor<f64> * rhs = dynamic_cast<Tensor<f64> *>(other.get());
				+        if (rhs == nullptr)
				+            throw std::runtime_error("add_: other must be a non-null Tensor<f64>");
				+
				+        // check broadcast.shape = this.shape
				+
				+        // Resolve the raw data pointers and ranks once instead of at every
				+        // recursion level; the shared_ptrs stay alive for the whole call.
				+        auto lhs_rep = std::dynamic_pointer_cast<TensorStorage<f64>>(lhs->storage());
				+        auto rhs_rep = std::dynamic_pointer_cast<TensorStorage<f64>>(rhs->storage());
				+        auto lhs_storage = lhs_rep->data_ptr();
				+        auto rhs_storage = rhs_rep->data_ptr();
				+        const auto lhs_rank = lhs->size().size();
				+        const auto rhs_rank = rhs->size().size();
				+
				+        std::function<void(idx_type, idx_type, idx_type, idx_type)> add_impl =
				+            [&](idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
				+
				+            // Both operands have run out of axes: add one element.
				+            if (lhs_dim < -lhs_rank && rhs_dim < -rhs_rank)
				+            {
				+                lhs_storage[lhs_idx] += rhs_storage[rhs_idx];
				+                return;
				+            }
				+
				+            // An operand without this axis behaves as if its extent were 1.
				+            const idx_type lhs_extent = lhs_dim >= -lhs_rank ? lhs->size(lhs_dim) : 1;
				+            const idx_type rhs_extent = rhs_dim >= -rhs_rank ? rhs->size(rhs_dim) : 1;
				+            const idx_type max_extent = std::max(lhs_extent, rhs_extent);
				+
				+            for (idx_type i = 0; i < max_extent; ++i)
				+            {
				+                add_impl(lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
				+
				+                // Extent-1 axes stay put, which repeats that element.
				+                if (lhs_extent > 1)
				+                    lhs_idx += lhs->stride(lhs_dim);
				+                if (rhs_extent > 1)
				+                    rhs_idx += rhs->stride(rhs_dim);
				+            }
				+        };
				+
				+        add_impl(-1, -1, lhs->offset(), rhs->offset());
				+    }
			
 
				+
			
 
				+    // Replaces every element x of this tensor with f(x), in place.
				+    // Delegates to apply_impl starting at axis 0 and the tensor's offset.
				+    void Tensor<f64>::apply_(std::function<f64(f64)> f)
				+    {
				+        const idx_type first_axis = 0;
				+        apply_impl(first_axis, _offset, f);
				+    }
			
 
				+
			
 
				+    // Returns a new tensor with the same shape, offset, strides and order,
				+    // whose storage is the result of _rep->clone().
				+    TensorInterfacePtr Tensor<f64>::clone() const
				+    {
				+        std::shared_ptr<Tensor<f64>> copy(new Tensor<f64>);
				+
				+        copy->_rep = std::dynamic_pointer_cast<TensorStorage<f64>>(_rep->clone());
				+
				+        // Metadata is copied verbatim.
				+        copy->_order = _order;
				+        copy->_offset = _offset;
				+        copy->_dimensions = _dimensions;
				+        copy->_strides = _strides;
				+
				+        return copy;
				+    }
			
 
				+
			
 
				+    // Replaces every element with its cosine, in place (via apply_).
				+    void Tensor<f64>::cos_()
				+    {
				+        const auto cosine = [](f64 x) -> f64 { return std::cos(x); };
				+        apply_(cosine);
				+    }
			
 
				+
			
 
				+    // Allocates a new Tensor<f32> with the same shape as this tensor and
				+    // returns it through the TensorBase<f32> interface.
				+    std::shared_ptr<TensorBase<f32>> Tensor<f64>::create_grad()
				+    {
				+        std::shared_ptr<TensorBase<f32>> grad(new Tensor<f32>(_dimensions));
				+        return grad;
				+    }
			
 
				+
			
 
				+    // Returns the storage's raw element pointer (not adjusted by the offset).
				+    f64* Tensor<f64>::data_ptr()
				+    {
				+        f64* raw = _rep->data_ptr();
				+        return raw;
				+    }
				+
				+    // Read-only overload of data_ptr().
				+    const f64* Tensor<f64>::data_ptr() const
				+    {
				+        const f64* raw = _rep->data_ptr();
				+        return raw;
				+    }
				+
				+    // Device identifier of this tensor; this implementation always reports 0.
				+    device_id Tensor<f64>::device()
				+    {
				+        return 0;
				+    }
			
 
				+
			
 
				+    // Returns the result of inverse_impl(*this) through the TensorInterface
				+    // pointer type. This tensor is not modified (const method).
				+    std::shared_ptr<TensorInterface> Tensor<f64>::inverse() const
				+    {
				+        auto inverted = inverse_impl(*this);
				+        return std::dynamic_pointer_cast<TensorInterface>(inverted);
				+    }
			
 
				+
			
 
				+    // Overwrites every element of this tensor with value, in place.
				+    void Tensor<f64>::fill_(f64 value)
				+    {
				+        // The previous element is ignored; each slot simply receives value.
				+        const auto constant = [value](f64) -> f64 { return value; };
				+        apply_(constant);
				+    }
			
 
				+
			
 
				+    // Returns the single element of a one-element tensor.
				+    // Throws std::runtime_error when the tensor's flat size is not exactly 1.
				+    f64 Tensor<f64>::item() const
				+    {
				+        if (_dimensions.flat_size() != 1)
				+            throw std::runtime_error("item: only one element tensors can be converted to scalars");
				+
				+        return _rep->data[_offset];
				+    }
			
 
				+
			
 
				+    // Returns matmul_impl(*this, mat) for a right-hand operand of the same
				+    // element type.
				+    // Throws std::runtime_error when mat is null or is not a Tensor<f64>;
				+    // previously the failed cast was dereferenced.
				+    std::shared_ptr<TensorInterface> Tensor<f64>::matmul(std::shared_ptr<TensorInterface> mat) const
				+    {
				+        auto right_matrix = std::dynamic_pointer_cast<Tensor<f64>>(mat);
				+        if (!right_matrix)
				+            throw std::runtime_error("matmul: mat must be a non-null Tensor<f64>");
				+
				+        return matmul_impl(*this, *right_matrix);
				+    }
			
 
				+
			
 
				+    // Storage index of this tensor's first element.
				+    idx_type Tensor<f64>::offset() const
				+    {
				+        return _offset;
				+    }
				+
				+    // Layout order this tensor was created with.
				+    layout_type Tensor<f64>::order() const
				+    {
				+        return _order;
				+    }
				+
				+    // Platform of this tensor; this implementation always reports platform_type::none.
				+    platform_type Tensor<f64>::platform()
				+    {
				+        return platform_type::none;
				+    }
			
 
				+
			
 
				+    // Folds all elements into one value with f, starting from a
				+    // value-initialized accumulator (f64{}), and returns the accumulator.
				+    f64 Tensor<f64>::reduce_(std::function<f64(f64, f64)> f) const
				+    {
				+        f64 accumulator{};
				+        const idx_type first_axis = 0;
				+        reduce_impl(accumulator, first_axis, _offset, f);
				+        return accumulator;
				+    }
			
 
				+    
			
 
				+    // Folds this tensor along axis dim with f and returns a new tensor whose
				+    // shape is this shape with that axis erased.
				+    // NOTE(review): dim is not range-checked before DimVector::erase.
				+    TensorInterfacePtr Tensor<f64>::reduce_dim(idx_type dim, std::function<f64(f64, f64)> f) const
				+    {
				+        DimVector out_shape = _dimensions;
				+        out_shape.erase(dim); // check dim?
				+
				+        TensorBasePtr<f64> out_base(new Tensor<f64>(out_shape));
				+        TensorPtr<f64> out = std::dynamic_pointer_cast<Tensor<f64>>(out_base);
				+
				+        Tensor<f64>& out_ref = *out;
				+        reduce_dim_impl(out_ref, 0, dim, _offset, out->_offset, f);
				+
				+        return std::dynamic_pointer_cast<TensorInterface>(out_base);
				+    }
			
 
				+    
			
 
				+    // Not implemented: currently a no-op that ignores dims.
				+    void Tensor<f64>::reshape_(const DimVector& dims)
				+    {
				+    }
				+
				+    // Adopts dims as the new shape, resizes the storage to the new flat size
				+    // and recomputes dense strides.
				+    void Tensor<f64>::resize_(const DimVector& dims)
				+    {
				+        _dimensions = dims;
				+
				+        const auto element_count = dims.flat_size();
				+        _rep->resize_(element_count);
				+
				+        auto_strides();
				+    }
			
 
				+
			
 
				+    // Returns a view that shares this tensor's storage and covers the region
				+    // described by slice (one start/end/step entry per leading axis).
				+    // Per axis i: missing start -> 0, missing end -> _dimensions[i],
				+    // missing step -> 1.
				+    //   extent_i = ceil((end - start) / step)
				+    //   stride_i = _strides[i] * step
				+    //   offset   = _offset + sum_i(_strides[i] * start_i)
				+    // The offset used to be accumulated as a product seeded with 1, which gave
				+    // 0 whenever any start was 0 and a meaningless value otherwise.
				+    // NOTE(review): only the first slice.size() axes appear in the result, and
				+    // std::fesetround changes the process-wide rounding mode (kept as-is).
				+    std::shared_ptr<TensorInterface> Tensor<f64>::select(const SliceVector& slice) const
				+    {
				+        std::shared_ptr<Tensor<f64>> result(new Tensor<f64>);
				+        result->_rep = _rep; // view: no element data is copied
				+
				+        DimVector dim;
				+        DimVector strides;
				+        idx_type start_offset = 0;
				+
				+        std::fesetround(FE_TONEAREST);
				+        for (idx_type i = 0; i < slice.size(); ++i)
				+        {
				+            const auto& each = slice[i];
				+            const auto start = each.start.value_or(0);
				+            const auto stop = each.end.value_or(_dimensions[i]);
				+            const auto step = each.step.value_or(1);
				+
				+            // dimension
				+            dim.push_back(
				+                std::lrint(std::ceil((stop - start) / static_cast<float>(step)))
				+            );
				+
				+            // offset: each axis contributes stride * start, summed over axes
				+            start_offset += _strides[i] * start;
				+
				+            // strides
				+            strides.push_back(_strides[i] * step);
				+        }
				+
				+        result->_dimensions = dim;
				+        result->_offset = _offset + start_offset;
				+        result->_strides = strides;
				+        result->_order = _order;
				+
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
			
 
				+    
			
 
				+    // Replaces every element with its sine, in place (via apply_).
				+    void Tensor<f64>::sin_()
				+    {
				+        const auto sine = [](f64 x) -> f64 { return std::sin(x); };
				+        apply_(sine);
				+    }
			
 
				+    
			
 
				+    // Returns a copy of the shape.
				+    DimVector Tensor<f64>::size() const
				+    {
				+        return _dimensions;
				+    }
				+
				+    // Returns the extent of axis i. A negative i counts back from the last
				+    // axis (-1 is the last one).
				+    // Throws std::runtime_error when i is outside both accepted ranges.
				+    idx_type Tensor<f64>::size(idx_type i) const
				+    {
				+        const auto rank = _dimensions.size();
				+
				+        if (i >= 0 && i < rank)
				+            return _dimensions[i];
				+
				+        if (i <= -1 && i >= -rank)
				+            return _dimensions[rank + i];
				+
				+        throw std::runtime_error("Dimension out of range");
				+    }
			
 
				+    
			
 
				+    // Returns the shared storage object backing this tensor.
				+    std::shared_ptr<StorageBase<f64>> Tensor<f64>::storage() const
				+    {
				+        return _rep;
				+    }
				+
				+    // Returns a copy of the strides vector.
				+    DimVector Tensor<f64>::stride() const
				+    {
				+        return _strides;
				+    }
				+
				+    // Returns the stride of axis i. A negative i counts back from the last
				+    // axis (-1 is the last one).
				+    // Throws std::runtime_error when i is outside both accepted ranges.
				+    idx_type Tensor<f64>::stride(idx_type i) const
				+    {
				+        const auto rank = _strides.size();
				+
				+        if (i >= 0 && i < rank)
				+            return _strides[i];
				+
				+        if (i <= -1 && i >= -rank)
				+            return _strides[rank + i];
				+
				+        throw std::runtime_error("Stride out of range");
				+    }
			
 
				+    
			
 
				+    // Folds every element with addition (via reduce_) and returns the total
				+    // stored in a freshly allocated one-element Tensor<f64>.
				+    TensorInterfacePtr Tensor<f64>::sum() const
				+    {
				+        DimVector shape(1);
				+        shape[0] = 1;
				+
				+        TensorPtr<f64> total(new Tensor<f64>(shape));
				+        const auto add = [](f64 lhs, f64 rhs) -> f64 { return lhs + rhs; };
				+        total->_rep->data[0] = reduce_(add);
				+        return std::dynamic_pointer_cast<TensorInterface>(total);
				+    }
			
 
				+    
			
 
				+    // Renders the tensor as nested bracketed text.
				+    // Elements of the innermost axis are separated by ","; every outer-axis
				+    // entry is wrapped in "[" "]" and separated from its predecessor by ",\n".
				+    // The whole output is wrapped in one additional pair of brackets.
				+    std::string Tensor<f64>::to_string() const
				+    {
				+        std::function<std::string(const Tensor<f64>&, idx_type, idx_type)> render =
				+            [&render](const Tensor<f64>& src, idx_type axis, idx_type pos) -> std::string {
				+            // Past the last axis: emit the single element stored at pos.
				+            if (axis == src.size().size())
				+                return std::to_string(src.data_ptr()[pos]);
				+
				+            std::string text;
				+            for (idx_type k = 0; k < src.size(axis); ++k)
				+            {
				+                const bool innermost = (axis == src.size().size() - 1);
				+                if (innermost)
				+                {
				+                    text += render(src, axis + 1, pos);
				+                    if (k != src.size(axis) - 1)
				+                        text += ",";
				+                }
				+                else
				+                {
				+                    if (k != 0)
				+                        text += ",\n";
				+                    text += "[";
				+                    text += render(src, axis + 1, pos);
				+                    text += "]";
				+                }
				+
				+                // Step to the next entry along this axis.
				+                pos += src.stride(axis);
				+            }
				+            return text;
				+        };
				+
				+        return "[" + render(*this, 0, offset()) + "]";
				+    }
			
 
				+}
			
--- a/traph/source/tensor/float_tensor.cpp
+++ b/traph/source/tensor/float_tensor.cpp
@@ -0,0 +1,406 @@
 
				+#include <traph/tensor/float_tensor.h>
			
 
				+
			
 
				+namespace traph
			
 
				+{
			
 
				+	// definition
			
 
				+    // private
			
 
				+    // Recomputes _strides from _dimensions as a dense (gap-free) layout.
				+    // For layout_type::column_major the walk runs from the last axis to the
				+    // first, so the LAST axis receives stride 1; for any other order the walk
				+    // runs first-to-last and the FIRST axis receives stride 1.
				+    void Tensor<f32>::auto_strides()
				+    {
				+        const idx_type rank = _dimensions.size();
				+        _strides.resize(rank);
				+
				+        const bool walk_backwards = (_order == layout_type::column_major);
				+        idx_type running = 1;
				+        for (idx_type n = 0; n < rank; ++n)
				+        {
				+            const idx_type axis = walk_backwards ? rank - 1 - n : n;
				+            _strides[axis] = running;
				+            running *= _dimensions[axis];
				+        }
				+    }
			
 
				+
			
 
				+    // Recursive worker for apply_: visits every element reachable from storage
				+    // index idx along axes dim..last and overwrites it with f(element).
				+    //   dim - axis handled at this recursion level
				+    //   idx - storage index of the first element for this level
				+    //   f   - element-wise transform
				+    void Tensor<f32>::apply_impl(idx_type dim, idx_type idx, std::function<f32(f32)> f)
				+    {
				+        const idx_type rank = _dimensions.size();
				+        const idx_type stride = _strides[dim];
				+        const idx_type extent = _dimensions[dim];
				+        const bool innermost = (dim == rank - 1);
				+
				+        idx_type pos = idx;
				+        for (idx_type left = extent; left > 0; --left, pos += stride)
				+        {
				+            if (innermost)
				+                _rep->data[pos] = f(_rep->data[pos]);
				+            else
				+                apply_impl(dim + 1, pos, f);
				+        }
				+    }
			
 
				+
			
 
				+    // Recursive worker for reduce_: folds every element reachable from storage
				+    // index idx along axes dim..last into result using result = f(result, element).
				+    //   result - running accumulator, updated in place
				+    //   dim    - axis handled at this recursion level
				+    //   idx    - storage index of the first element for this level
				+    void Tensor<f32>::reduce_impl(f32& result, idx_type dim, idx_type idx, std::function<f32(f32,f32)> f) const
				+    {
				+        const idx_type rank = _dimensions.size();
				+        const idx_type stride = _strides[dim];
				+        const idx_type extent = _dimensions[dim];
				+        const bool innermost = (dim == rank - 1);
				+
				+        idx_type pos = idx;
				+        for (idx_type left = extent; left > 0; --left, pos += stride)
				+        {
				+            if (innermost)
				+                result = f(result, _rep->data[pos]);
				+            else
				+                reduce_impl(result, dim + 1, pos, f);
				+        }
				+    }
			
 
				+
			
 
				+    // Folds step_num elements, starting at storage index begin and spaced
				+    // step_len apart, into a value-initialized accumulator using f.
				+    // Returns the accumulator (f32{} when step_num <= 0).
				+    f32 Tensor<f32>::reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<f32(f32,f32)> f) const
				+    {
				+        f32 acc{};
				+        idx_type pos = begin;
				+        for (idx_type left = step_num; left > 0; --left)
				+        {
				+            acc = f(acc, _rep->data[pos]);
				+            pos += step_len;
				+        }
				+        return acc;
				+    }
			
 
				+
			
 
				+    // Recursive worker for reduce_dim: walks every axis except reduce_dim and,
				+    // at each leaf, stores the fold of the reduce_dim line into result.
				+    //   result     - output tensor; it has one axis fewer than *this
				+    //   dim        - axis of *this handled at this recursion level
				+    //   reduce_dim - axis of *this being folded away
				+    //   this_idx   - storage index into *this for the current position
				+    //   result_idx - storage index into result for the current position
				+    //   f          - binary combiner passed to reduce_dim_kernel
				+    void Tensor<f32>::reduce_dim_impl(Tensor<f32>& result, idx_type dim, idx_type reduce_dim,
				+        idx_type this_idx, idx_type result_idx,
				+        std::function<f32(f32,f32)> f) const
				+    {
				+        const idx_type dim_size = _dimensions.size();
				+
				+        // All axes consumed: fold along reduce_dim starting at this_idx.
				+        if(dim == dim_size)
				+        {
				+            result._rep->data[result_idx] =
				+                reduce_dim_kernel(this_idx, _strides[reduce_dim], _dimensions[reduce_dim], f);
				+            return;
				+        }
				+
				+        // The reduced axis contributes no loop of its own here.
				+        if(dim == reduce_dim)
				+        {
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+            return;
				+        }
				+
				+        // result lacks the reduced axis, so every source axis after reduce_dim
				+        // maps to the result axis one position earlier. Using the source axis
				+        // number directly would read the wrong (or an out-of-range) stride.
				+        const idx_type result_dim = (dim > reduce_dim) ? dim - 1 : dim;
				+        const idx_type this_step = _strides[dim];
				+        const idx_type result_step = result._strides[result_dim];
				+
				+        for(idx_type i = 0; i < _dimensions[dim]; ++i)
				+        {
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+
				+            this_idx += this_step;
				+            result_idx += result_step;
				+        }
				+    }
			
 
				+    // public
			
 
				+    // Creates a tensor with fresh storage, no dimensions, zero offset and
				+    // layout_type::column_major order.
				+    Tensor<f32>::Tensor()
				+        : _rep(new TensorStorage<f32>)
				+        , _dimensions()
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+    }
				+
				+    // Creates a tensor of the given shape with layout_type::column_major order.
				+    // Storage is sized to dimensions.flat_size() elements and strides are
				+    // derived by auto_strides().
				+    Tensor<f32>::Tensor(const DimVector& dimensions)
				+        : _rep(new TensorStorage<f32>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+        _rep->resize_(_dimensions.flat_size());
				+        auto_strides();
				+    }
				+
				+    // Creates a tensor of the given shape and explicit layout order.
				+    // Storage is sized to dimensions.flat_size() elements and strides are
				+    // derived by auto_strides().
				+    Tensor<f32>::Tensor(const DimVector& dimensions, layout_type order)
				+        : _rep(new TensorStorage<f32>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(order)
				+    {
				+        _rep->resize_(_dimensions.flat_size());
				+        auto_strides();
				+    }
			
 
				+
			
 
				+    // Creates a tensor from a shape and a strides vector, column_major order.
				+    // NOTE(review): auto_strides() below recomputes _strides from the shape,
				+    // so the strides argument is overwritten - confirm whether that is intended.
				+    Tensor<f32>::Tensor(const DimVector& dimensions, const DimVector& strides)
				+        : _rep(new TensorStorage<f32>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides(strides)
				+        , _order(layout_type::column_major)
				+    {
				+        _rep->resize_(_dimensions.flat_size());
				+        auto_strides();
				+    }
				+
				+    // Creates a tensor from a shape, a strides vector and a layout order.
				+    // NOTE(review): auto_strides() below recomputes _strides from the shape,
				+    // so the strides argument is overwritten - confirm whether that is intended.
				+    Tensor<f32>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order)
				+        : _rep(new TensorStorage<f32>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides(strides)
				+        , _order(order)
				+    {
				+        _rep->resize_(_dimensions.flat_size());
				+        auto_strides();
				+    }
			
 
				+
			
 
				+    // Creates a one-element tensor (shape {1}) holding the scalar t.
				+    // _order is initialized explicitly because auto_strides() reads it;
				+    // previously it was left uninitialized, storage was never sized and the
				+    // value t was discarded.
				+    Tensor<f32>::Tensor(const f32& t)
				+        :_rep(new TensorStorage<f32>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        _dimensions.resize(1);
				+        _dimensions[0] = 1;
				+        auto_strides();
				+
				+        _rep->resize_(_dimensions.flat_size());
				+        _rep->data[_offset] = t;
				+    }
			
 
				+
			
 
				+    // In-place element-wise addition: *this += other.
				+    // Axes are matched from the last one backwards (negative axis numbers); an
				+    // operand that has run out of axes, or whose extent on an axis is 1, keeps
				+    // its index fixed on that axis while the other operand advances.
				+    // Throws std::runtime_error when other is null or is not a Tensor<f32>.
				+    // NOTE(review): shape compatibility is still not validated (see the TODO
				+    // comments below, carried over from the original).
				+    void Tensor<f32>::add_(TensorInterfacePtr other)
				+    {
				+        // check tensor other type
				+        Tensor<f32> * lhs = this;
				+        Tensor<f32> * rhs = dynamic_cast<Tensor<f32> *>(other.get());
				+        if (rhs == nullptr)
				+            throw std::runtime_error("add_: other must be a non-null Tensor<f32>");
				+
				+        // check broadcast.shape = this.shape
				+
				+        // Resolve the raw data pointers and ranks once instead of at every
				+        // recursion level; the shared_ptrs stay alive for the whole call.
				+        auto lhs_rep = std::dynamic_pointer_cast<TensorStorage<f32>>(lhs->storage());
				+        auto rhs_rep = std::dynamic_pointer_cast<TensorStorage<f32>>(rhs->storage());
				+        auto lhs_storage = lhs_rep->data_ptr();
				+        auto rhs_storage = rhs_rep->data_ptr();
				+        const auto lhs_rank = lhs->size().size();
				+        const auto rhs_rank = rhs->size().size();
				+
				+        std::function<void(idx_type, idx_type, idx_type, idx_type)> add_impl =
				+            [&](idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
				+
				+            // Both operands have run out of axes: add one element.
				+            if (lhs_dim < -lhs_rank && rhs_dim < -rhs_rank)
				+            {
				+                lhs_storage[lhs_idx] += rhs_storage[rhs_idx];
				+                return;
				+            }
				+
				+            // An operand without this axis behaves as if its extent were 1.
				+            const idx_type lhs_extent = lhs_dim >= -lhs_rank ? lhs->size(lhs_dim) : 1;
				+            const idx_type rhs_extent = rhs_dim >= -rhs_rank ? rhs->size(rhs_dim) : 1;
				+            const idx_type max_extent = std::max(lhs_extent, rhs_extent);
				+
				+            for (idx_type i = 0; i < max_extent; ++i)
				+            {
				+                add_impl(lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
				+
				+                // Extent-1 axes stay put, which repeats that element.
				+                if (lhs_extent > 1)
				+                    lhs_idx += lhs->stride(lhs_dim);
				+                if (rhs_extent > 1)
				+                    rhs_idx += rhs->stride(rhs_dim);
				+            }
				+        };
				+
				+        add_impl(-1, -1, lhs->offset(), rhs->offset());
				+    }
			
 
				+
			
 
				+    // Replaces every element x of this tensor with f(x), in place.
				+    // Delegates to apply_impl starting at axis 0 and the tensor's offset.
				+    void Tensor<f32>::apply_(std::function<f32(f32)> f)
				+    {
				+        const idx_type first_axis = 0;
				+        apply_impl(first_axis, _offset, f);
				+    }
			
 
				+
			
 
				+    // Returns a new tensor with the same shape, offset, strides and order,
				+    // whose storage is the result of _rep->clone().
				+    TensorInterfacePtr Tensor<f32>::clone() const
				+    {
				+        std::shared_ptr<Tensor<f32>> copy(new Tensor<f32>);
				+
				+        copy->_rep = std::dynamic_pointer_cast<TensorStorage<f32>>(_rep->clone());
				+
				+        // Metadata is copied verbatim.
				+        copy->_order = _order;
				+        copy->_offset = _offset;
				+        copy->_dimensions = _dimensions;
				+        copy->_strides = _strides;
				+
				+        return copy;
				+    }
			
 
				+
			
 
				+    // Replaces every element with its cosine, in place (via apply_).
				+    void Tensor<f32>::cos_()
				+    {
				+        const auto cosine = [](f32 x) -> f32 { return std::cos(x); };
				+        apply_(cosine);
				+    }
			
 
				+
			
 
				+    // Allocates a new Tensor<f32> with the same shape as this tensor and
				+    // returns it through the TensorBase<f32> interface.
				+    std::shared_ptr<TensorBase<f32>> Tensor<f32>::create_grad()
				+    {
				+        std::shared_ptr<TensorBase<f32>> grad(new Tensor<f32>(_dimensions));
				+        return grad;
				+    }
			
 
				+
			
 
				+    // Returns the storage's raw element pointer (not adjusted by the offset).
				+    f32* Tensor<f32>::data_ptr()
				+    {
				+        f32* raw = _rep->data_ptr();
				+        return raw;
				+    }
				+
				+    // Read-only overload of data_ptr().
				+    const f32* Tensor<f32>::data_ptr() const
				+    {
				+        const f32* raw = _rep->data_ptr();
				+        return raw;
				+    }
				+
				+    // Device identifier of this tensor; this implementation always reports 0.
				+    device_id Tensor<f32>::device()
				+    {
				+        return 0;
				+    }
			
 
				+
			
 
				+    // Returns the result of inverse_impl(*this) through the TensorInterface
				+    // pointer type. This tensor is not modified (const method).
				+    std::shared_ptr<TensorInterface> Tensor<f32>::inverse() const
				+    {
				+        auto inverted = inverse_impl(*this);
				+        return std::dynamic_pointer_cast<TensorInterface>(inverted);
				+    }
			
 
				+
			
 
				+    // Overwrites every element of this tensor with value, in place.
				+    void Tensor<f32>::fill_(f32 value)
				+    {
				+        // The previous element is ignored; each slot simply receives value.
				+        const auto constant = [value](f32) -> f32 { return value; };
				+        apply_(constant);
				+    }
			
 
				+
			
 
				+    // Returns the single element of a one-element tensor.
				+    // Throws std::runtime_error when the tensor's flat size is not exactly 1.
				+    f32 Tensor<f32>::item() const
				+    {
				+        if (_dimensions.flat_size() != 1)
				+            throw std::runtime_error("item: only one element tensors can be converted to scalars");
				+
				+        return _rep->data[_offset];
				+    }
			
 
				+
			
 
				+    // Returns matmul_impl(*this, mat) for a right-hand operand of the same
				+    // element type.
				+    // Throws std::runtime_error when mat is null or is not a Tensor<f32>;
				+    // previously the failed cast was dereferenced.
				+    std::shared_ptr<TensorInterface> Tensor<f32>::matmul(std::shared_ptr<TensorInterface> mat) const
				+    {
				+        auto right_matrix = std::dynamic_pointer_cast<Tensor<f32>>(mat);
				+        if (!right_matrix)
				+            throw std::runtime_error("matmul: mat must be a non-null Tensor<f32>");
				+
				+        return matmul_impl(*this, *right_matrix);
				+    }
			
 
				+
			
 
				+    // Storage index of this tensor's first element.
				+    idx_type Tensor<f32>::offset() const
				+    {
				+        return _offset;
				+    }
				+
				+    // Layout order this tensor was created with.
				+    layout_type Tensor<f32>::order() const
				+    {
				+        return _order;
				+    }
				+
				+    // Platform of this tensor; this implementation always reports platform_type::none.
				+    platform_type Tensor<f32>::platform()
				+    {
				+        return platform_type::none;
				+    }
			
 
				+
			
 
				+    // Folds all elements into one value with f, starting from a
				+    // value-initialized accumulator (f32{}), and returns the accumulator.
				+    f32 Tensor<f32>::reduce_(std::function<f32(f32, f32)> f) const
				+    {
				+        f32 accumulator{};
				+        const idx_type first_axis = 0;
				+        reduce_impl(accumulator, first_axis, _offset, f);
				+        return accumulator;
				+    }
			
 
				+    
			
 
				+    // Folds this tensor along axis dim with f and returns a new tensor whose
				+    // shape is this shape with that axis erased.
				+    // NOTE(review): dim is not range-checked before DimVector::erase.
				+    TensorInterfacePtr Tensor<f32>::reduce_dim(idx_type dim, std::function<f32(f32, f32)> f) const
				+    {
				+        DimVector out_shape = _dimensions;
				+        out_shape.erase(dim); // check dim?
				+
				+        TensorBasePtr<f32> out_base(new Tensor<f32>(out_shape));
				+        TensorPtr<f32> out = std::dynamic_pointer_cast<Tensor<f32>>(out_base);
				+
				+        Tensor<f32>& out_ref = *out;
				+        reduce_dim_impl(out_ref, 0, dim, _offset, out->_offset, f);
				+
				+        return std::dynamic_pointer_cast<TensorInterface>(out_base);
				+    }
			
 
				+    
			
 
				+    // Not implemented: currently a no-op that ignores dims.
				+    void Tensor<f32>::reshape_(const DimVector& dims)
				+    {
				+    }
				+
				+    // Adopts dims as the new shape, resizes the storage to the new flat size
				+    // and recomputes dense strides.
				+    void Tensor<f32>::resize_(const DimVector& dims)
				+    {
				+        _dimensions = dims;
				+
				+        const auto element_count = dims.flat_size();
				+        _rep->resize_(element_count);
				+
				+        auto_strides();
				+    }
			
 
				+
			
 
				+    // Returns a view that shares this tensor's storage and covers the region
				+    // described by slice (one start/end/step entry per leading axis).
				+    // Per axis i: missing start -> 0, missing end -> _dimensions[i],
				+    // missing step -> 1.
				+    //   extent_i = ceil((end - start) / step)
				+    //   stride_i = _strides[i] * step
				+    //   offset   = _offset + sum_i(_strides[i] * start_i)
				+    // The offset used to be accumulated as a product seeded with 1, which gave
				+    // 0 whenever any start was 0 and a meaningless value otherwise.
				+    // NOTE(review): only the first slice.size() axes appear in the result, and
				+    // std::fesetround changes the process-wide rounding mode (kept as-is).
				+    std::shared_ptr<TensorInterface> Tensor<f32>::select(const SliceVector& slice) const
				+    {
				+        std::shared_ptr<Tensor<f32>> result(new Tensor<f32>);
				+        result->_rep = _rep; // view: no element data is copied
				+
				+        DimVector dim;
				+        DimVector strides;
				+        idx_type start_offset = 0;
				+
				+        std::fesetround(FE_TONEAREST);
				+        for (idx_type i = 0; i < slice.size(); ++i)
				+        {
				+            const auto& each = slice[i];
				+            const auto start = each.start.value_or(0);
				+            const auto stop = each.end.value_or(_dimensions[i]);
				+            const auto step = each.step.value_or(1);
				+
				+            // dimension
				+            dim.push_back(
				+                std::lrint(std::ceil((stop - start) / static_cast<float>(step)))
				+            );
				+
				+            // offset: each axis contributes stride * start, summed over axes
				+            start_offset += _strides[i] * start;
				+
				+            // strides
				+            strides.push_back(_strides[i] * step);
				+        }
				+
				+        result->_dimensions = dim;
				+        result->_offset = _offset + start_offset;
				+        result->_strides = strides;
				+        result->_order = _order;
				+
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
			
 
				+    
			
 
				+    // Replaces every element with its sine, in place (via apply_).
				+    void Tensor<f32>::sin_()
				+    {
				+        const auto sine = [](f32 x) -> f32 { return std::sin(x); };
				+        apply_(sine);
				+    }
			
 
				+    
			
 
				+    // Returns a copy of the shape.
				+    DimVector Tensor<f32>::size() const
				+    {
				+        return _dimensions;
				+    }
				+
				+    // Returns the extent of axis i. A negative i counts back from the last
				+    // axis (-1 is the last one).
				+    // Throws std::runtime_error when i is outside both accepted ranges.
				+    idx_type Tensor<f32>::size(idx_type i) const
				+    {
				+        const auto rank = _dimensions.size();
				+
				+        if (i >= 0 && i < rank)
				+            return _dimensions[i];
				+
				+        if (i <= -1 && i >= -rank)
				+            return _dimensions[rank + i];
				+
				+        throw std::runtime_error("Dimension out of range");
				+    }
			
 
				+    
			
 
				+    // Returns the shared storage object backing this tensor.
				+    std::shared_ptr<StorageBase<f32>> Tensor<f32>::storage() const
				+    {
				+        return _rep;
				+    }
				+
				+    // Returns a copy of the strides vector.
				+    DimVector Tensor<f32>::stride() const
				+    {
				+        return _strides;
				+    }
				+
				+    // Returns the stride of axis i. A negative i counts back from the last
				+    // axis (-1 is the last one).
				+    // Throws std::runtime_error when i is outside both accepted ranges.
				+    idx_type Tensor<f32>::stride(idx_type i) const
				+    {
				+        const auto rank = _strides.size();
				+
				+        if (i >= 0 && i < rank)
				+            return _strides[i];
				+
				+        if (i <= -1 && i >= -rank)
				+            return _strides[rank + i];
				+
				+        throw std::runtime_error("Stride out of range");
				+    }
			
 
				+    
			
 
				+    // Folds every element with addition (via reduce_) and returns the total
				+    // stored in a freshly allocated one-element Tensor<f32>.
				+    TensorInterfacePtr Tensor<f32>::sum() const
				+    {
				+        DimVector shape(1);
				+        shape[0] = 1;
				+
				+        TensorPtr<f32> total(new Tensor<f32>(shape));
				+        const auto add = [](f32 lhs, f32 rhs) -> f32 { return lhs + rhs; };
				+        total->_rep->data[0] = reduce_(add);
				+        return std::dynamic_pointer_cast<TensorInterface>(total);
				+    }
			
 
				+    
			
 
				+    // Renders the tensor as nested bracketed text.
				+    // Elements of the innermost axis are separated by ","; every outer-axis
				+    // entry is wrapped in "[" "]" and separated from its predecessor by ",\n".
				+    // The whole output is wrapped in one additional pair of brackets.
				+    std::string Tensor<f32>::to_string() const
				+    {
				+        std::function<std::string(const Tensor<f32>&, idx_type, idx_type)> render =
				+            [&render](const Tensor<f32>& src, idx_type axis, idx_type pos) -> std::string {
				+            // Past the last axis: emit the single element stored at pos.
				+            if (axis == src.size().size())
				+                return std::to_string(src.data_ptr()[pos]);
				+
				+            std::string text;
				+            for (idx_type k = 0; k < src.size(axis); ++k)
				+            {
				+                const bool innermost = (axis == src.size().size() - 1);
				+                if (innermost)
				+                {
				+                    text += render(src, axis + 1, pos);
				+                    if (k != src.size(axis) - 1)
				+                        text += ",";
				+                }
				+                else
				+                {
				+                    if (k != 0)
				+                        text += ",\n";
				+                    text += "[";
				+                    text += render(src, axis + 1, pos);
				+                    text += "]";
				+                }
				+
				+                // Step to the next entry along this axis.
				+                pos += src.stride(axis);
				+            }
				+            return text;
				+        };
				+
				+        return "[" + render(*this, 0, offset()) + "]";
				+    }
			
 
				+}
			
--- a/traph/source/tensor/int_tensor.cpp
+++ b/traph/source/tensor/int_tensor.cpp
@@ -0,0 +1,406 @@
 
				+#include <traph/tensor/int_tensor.h>
			
 
				+
			
 
				+namespace traph
			
 
				+{
			
 
				+    // definition
			
 
				+    // private
			
 
				+    // Recomputes _strides as the dense (gap-free) strides for _dimensions.
				+    // With _order == layout_type::column_major the LAST dimension receives
				+    // stride 1 and strides grow towards dimension 0; with any other order
				+    // dimension 0 receives stride 1.
				+    // NOTE(review): stride 1 on the last axis is what is usually called
				+    // row-major; confirm the enum naming is intentional.
				+    void Tensor<i32>::auto_strides()
				+    {
				+        const idx_type rank = _dimensions.size();
				+        _strides.resize(rank);
				+
				+        const bool last_axis_fastest = (_order == layout_type::column_major);
				+        idx_type running = 1;
				+        for (idx_type n = 0; n < rank; ++n)
				+        {
				+            // Visit axes starting from the fastest-varying one.
				+            const idx_type axis = last_axis_fastest ? rank - 1 - n : n;
				+            _strides[axis] = running;
				+            running *= _dimensions[axis];
				+        }
				+    }
			
 
				+
			
 
				+    // Applies f in place to every element of the sub-tensor that starts at
				+    // storage index idx and spans dimensions dim..last, recursing one
				+    // dimension at a time.
				+    //   dim: dimension handled by this call
				+    //   idx: storage index of the sub-tensor's first element
				+    //   f:   element-wise transformation
				+    void Tensor<i32>::apply_impl(idx_type dim, idx_type idx, std::function<i32(i32)> f)
				+    {
				+        const idx_type dim_size = _dimensions.size();
				+
				+        // A tensor without dimensions (e.g. default-constructed) has nothing
				+        // to visit; return before _strides/_dimensions are indexed out of range.
				+        if (dim < 0 || dim >= dim_size)
				+            return;
				+
				+        const idx_type step_len = _strides[dim];
				+        const idx_type step_num = _dimensions[dim];
				+        const bool innermost = (dim == dim_size - 1);
				+
				+        for (idx_type i = 0; i < step_num; ++i)
				+        {
				+            if (innermost)
				+                _rep->data[idx] = f(_rep->data[idx]);
				+            else
				+                apply_impl(dim + 1, idx, f);
				+            idx += step_len;
				+        }
				+    }
			
 
				+
			
 
				+    // Folds every element of the sub-tensor that starts at storage index idx
				+    // and spans dimensions dim..last into result, using result = f(result, x).
				+    //   result: accumulator, updated in place
				+    //   dim:    dimension handled by this call
				+    //   idx:    storage index of the sub-tensor's first element
				+    //   f:      binary reduction function
				+    void Tensor<i32>::reduce_impl(i32& result, idx_type dim, idx_type idx, std::function<i32(i32,i32)> f) const
				+    {
				+        const idx_type dim_size = _dimensions.size();
				+
				+        // A tensor without dimensions contributes nothing; return before
				+        // _strides/_dimensions are indexed out of range.
				+        if (dim < 0 || dim >= dim_size)
				+            return;
				+
				+        const idx_type step_len = _strides[dim];
				+        const idx_type step_num = _dimensions[dim];
				+        const bool innermost = (dim == dim_size - 1);
				+
				+        for (idx_type i = 0; i < step_num; ++i)
				+        {
				+            if (innermost)
				+                result = f(result, _rep->data[idx]);
				+            else
				+                reduce_impl(result, dim + 1, idx, f);
				+            idx += step_len;
				+        }
				+    }
			
 
				+
			
 
				+    // Folds step_num elements, starting at storage index begin and spaced
				+    // step_len apart, into a value-initialised accumulator using f, and
				+    // returns the accumulator.
				+    i32 Tensor<i32>::reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<i32(i32,i32)> f) const
				+    {
				+        i32 acc{};
				+        idx_type pos = begin;
				+        for (idx_type visited = 0; visited < step_num; ++visited, pos += step_len)
				+            acc = f(acc, _rep->data[pos]);
				+        return acc;
				+    }
			
 
				+
			
 
				+    // Walks every dimension of this tensor except reduce_dim and, for each
				+    // combination of the remaining indices, writes the reduction along
				+    // reduce_dim into result.
				+    //   result:     destination; expected to have this tensor's dimensions
				+    //               with reduce_dim erased (as built by reduce_dim())
				+    //   dim:        source dimension handled by this call
				+    //   reduce_dim: source dimension being reduced away
				+    //   this_idx:   current storage index in this tensor
				+    //   result_idx: current storage index in result
				+    //   f:          binary reduction function
				+    void Tensor<i32>::reduce_dim_impl(Tensor<i32>& result, idx_type dim, idx_type reduce_dim,
				+        idx_type this_idx, idx_type result_idx,
				+        std::function<i32(i32,i32)> f) const
				+    {
				+        const idx_type dim_size = _dimensions.size();
				+
				+        // All non-reduced indices are fixed: reduce along reduce_dim.
				+        if(dim == dim_size)
				+        {
				+            result._rep->data[result_idx] =
				+                reduce_dim_kernel(this_idx, _strides[reduce_dim], _dimensions[reduce_dim], f);
				+            return;
				+        }
				+
				+        // The reduced dimension is not iterated here; the kernel handles it.
				+        if(dim == reduce_dim)
				+        {
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+            return;
				+        }
				+
				+        // result has no slot for reduce_dim, so source dimensions that come
				+        // after it sit one position earlier in result. Indexing
				+        // result._strides with the source dimension (as before) used the
				+        // wrong stride, or read past the end for the last dimension.
				+        const idx_type result_dim = (dim < reduce_dim) ? dim : dim - 1;
				+        const idx_type this_step = _strides[dim];
				+        const idx_type result_step = result._strides[result_dim];
				+
				+        for(idx_type i = 0; i < _dimensions[dim]; ++i)
				+        {
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+
				+            this_idx += this_step;
				+            result_idx += result_step;
				+        }
				+    }
			
 
				+    // public
			
 
				+    // Builds an empty tensor: fresh storage object, no dimensions, no
				+    // strides, offset 0, layout column_major.
				+    Tensor<i32>::Tensor()
				+        : _rep(new TensorStorage<i32>)
				+        , _dimensions()
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+    }
			
 
				+
			
 
				+    // Builds a tensor with the given dimensions, layout column_major, dense
				+    // strides, and storage resized to hold _dimensions.flat_size() elements.
				+    Tensor<i32>::Tensor(const DimVector& dimensions)
				+        : _rep(new TensorStorage<i32>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+        // Derive strides from the shape, then size the backing storage.
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a tensor with the given dimensions and layout, dense strides,
				+    // and storage resized to hold _dimensions.flat_size() elements.
				+    Tensor<i32>::Tensor(const DimVector& dimensions, layout_type order)
				+        : _rep(new TensorStorage<i32>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(order)
				+    {
				+        // Derive strides from the shape, then size the backing storage.
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a column_major tensor with the given dimensions and storage
				+    // resized to hold _dimensions.flat_size() elements.
				+    // NOTE(review): auto_strides() below recomputes _strides from
				+    // _dimensions, so the strides argument does not survive construction.
				+    // Confirm whether that is intended.
				+    Tensor<i32>::Tensor(const DimVector& dimensions, const DimVector& strides)
				+        : _rep(new TensorStorage<i32>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides(strides)
				+        , _order(layout_type::column_major)
				+    {
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a tensor with the given dimensions and layout and storage
				+    // resized to hold _dimensions.flat_size() elements.
				+    // NOTE(review): auto_strides() below recomputes _strides from
				+    // _dimensions, so the strides argument does not survive construction.
				+    // Confirm whether that is intended.
				+    Tensor<i32>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order)
				+        : _rep(new TensorStorage<i32>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides(strides)
				+        , _order(order)
				+    {
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a one-element tensor of shape {1} that holds the value t.
				+    // Previously _order was left uninitialised (yet read by auto_strides()),
				+    // the storage was never sized, and t itself was discarded.
				+    Tensor<i32>::Tensor(const i32& t)
				+        :_rep(new TensorStorage<i32>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        _dimensions.resize(1);
				+        _dimensions[0] = 1;
				+        auto_strides();
				+
				+        // Make room for the single element and store it.
				+        _rep->resize_(_dimensions.flat_size());
				+        _rep->data[0] = t;
				+    }
			
 
				+
			
 
				+    // In-place element-wise addition: this += other. Dimensions are matched
				+    // from the last one backwards; a dimension that is missing or has
				+    // extent 1 on one side is reused for every index of the other side.
				+    // Throws std::runtime_error if other is null or is not a Tensor<i32>.
				+    // NOTE(review): shape compatibility is still not verified, and the
				+    // result is written into this tensor's existing storage, so this must
				+    // already have the broadcast shape.
				+    void Tensor<i32>::add_(TensorInterfacePtr other)
				+    {
				+        // check tensor other type
				+        Tensor<i32> * rhs = dynamic_cast<Tensor<i32> *>(other.get());
				+        if (rhs == nullptr)
				+            throw std::runtime_error("add_: other must be a non-null Tensor<i32>");
				+        Tensor<i32> * lhs = this;
				+
				+        // check broadcast.shape = this.shape (not implemented)
				+
				+        // Resolved once here instead of on every recursive call.
				+        i32 * lhs_storage = lhs->data_ptr();
				+        i32 * rhs_storage = rhs->data_ptr();
				+        const idx_type lhs_rank = lhs->size().size();
				+        const idx_type rhs_rank = rhs->size().size();
				+
				+        // lhs_dim / rhs_dim are negative indices counted from the last
				+        // dimension (-1); lhs_idx / rhs_idx are storage indices.
				+        std::function<void(idx_type, idx_type, idx_type, idx_type)> add_impl =
				+            [&](idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
				+
				+            const bool lhs_has_dim = lhs_dim >= -lhs_rank;
				+            const bool rhs_has_dim = rhs_dim >= -rhs_rank;
				+
				+            // Both sides ran out of dimensions: a single element pair is left.
				+            if (!lhs_has_dim && !rhs_has_dim)
				+            {
				+                lhs_storage[lhs_idx] += rhs_storage[rhs_idx];
				+                return;
				+            }
				+
				+            const idx_type lhs_shape_size = lhs_has_dim ? lhs->size(lhs_dim) : 1;
				+            const idx_type rhs_shape_size = rhs_has_dim ? rhs->size(rhs_dim) : 1;
				+            const idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
				+
				+            // A side with extent 1 (or no such dimension) stays in place.
				+            const idx_type lhs_step = lhs_shape_size > 1 ? lhs->stride(lhs_dim) : 0;
				+            const idx_type rhs_step = rhs_shape_size > 1 ? rhs->stride(rhs_dim) : 0;
				+
				+            for (idx_type i = 0; i < max_shape_size; ++i)
				+            {
				+                add_impl(lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
				+
				+                lhs_idx += lhs_step;
				+                rhs_idx += rhs_step;
				+            }
				+        };
				+
				+        add_impl(-1, -1, lhs->offset(), rhs->offset());
				+    }
			
 
				+
			
 
				+    // Replaces every element x of this tensor with f(x), in place.
				+    void Tensor<i32>::apply_(std::function<i32(i32)> f)
				+    {
				+        // Start the recursive walk at dimension 0, at this tensor's first
				+        // element in storage.
				+        const idx_type first_dim = 0;
				+        apply_impl(first_dim, _offset, f);
				+    }
			
 
				+
			
 
				+    // Returns a new tensor whose storage comes from _rep->clone() and whose
				+    // dimensions, offset, strides and layout are copied from this tensor.
				+    TensorInterfacePtr Tensor<i32>::clone() const
				+    {
				+        std::shared_ptr<Tensor<i32>> copy(new Tensor<i32>);
				+
				+        // Storage first, then the metadata describing how to read it.
				+        copy->_rep = std::dynamic_pointer_cast<TensorStorage<i32>>(_rep->clone());
				+        copy->_dimensions = _dimensions;
				+        copy->_strides = _strides;
				+        copy->_offset = _offset;
				+        copy->_order = _order;
				+
				+        return copy;
				+    }
			
 
				+
			
 
				+    // Replaces each element with its cosine, in place. std::cos yields a
				+    // floating-point value; the lambda's i32 return type converts it back
				+    // to an integer.
				+    void Tensor<i32>::cos_()
				+    {
				+        const auto integer_cos = [](i32 x) -> i32 { return std::cos(x); };
				+        apply_(integer_cos);
				+    }
			
 
				+
			
 
				+    // Allocates a new f32 tensor with the same dimensions as this tensor and
				+    // returns it through the TensorBase<f32> interface.
				+    std::shared_ptr<TensorBase<f32>> Tensor<i32>::create_grad()
				+    {
				+        std::shared_ptr<TensorBase<f32>> grad(new Tensor<f32>(_dimensions));
				+        return grad;
				+    }
			
 
				+
			
 
				+	// Mutable pointer to the start of the underlying storage; the tensor's
				+	// offset is not applied.
				+	i32* Tensor<i32>::data_ptr()
				+	{
				+		i32* const base = _rep->data_ptr();
				+		return base;
				+	}
			
 
				+
			
 
				+    // Read-only pointer to the start of the underlying storage; the
				+    // tensor's offset is not applied.
				+    const i32* Tensor<i32>::data_ptr() const
				+    {
				+        const i32* const base = _rep->data_ptr();
				+        return base;
				+    }
			
 
				+
			
 
				+    // Device identifier of this tensor; this implementation always reports 0.
				+    device_id Tensor<i32>::device()
				+    {
				+        return 0;
				+    }
			
 
				+
			
 
				+	// Not implemented for this element type: always throws
				+	// std::runtime_error.
				+	std::shared_ptr<TensorInterface> Tensor<i32>::inverse() const
				+	{
				+		const char* const message = "No implement";
				+		throw std::runtime_error(message);
				+	}
			
 
				+
			
 
				+    // Overwrites every element of this tensor with value.
				+    void Tensor<i32>::fill_(i32 value)
				+    {
				+        // The previous content of each element is irrelevant, so the
				+        // lambda ignores its argument.
				+        apply_([value](i32) -> i32 { return value; });
				+    }
			
 
				+
			
 
				+	// Returns the single element of a one-element tensor.
				+	// Throws std::runtime_error when the tensor does not hold exactly one
				+	// element.
				+	i32 Tensor<i32>::item() const
				+	{
				+		const bool single_element = (_dimensions.flat_size() == 1);
				+		if (!single_element)
				+			throw std::runtime_error("item: only one element tensors can be converted to scalars");
				+
				+		return _rep->data[_offset];
				+	}
			
 
				+
			
 
				+	// Multiplies this tensor (left operand) by mat (right operand) through
				+	// matmul_impl and returns the product.
				+	// Throws std::runtime_error if mat is null or is not a Tensor<i32>;
				+	// previously that case dereferenced a null pointer.
				+	std::shared_ptr<TensorInterface> Tensor<i32>::matmul(std::shared_ptr<TensorInterface> mat) const
				+	{
				+		auto right_matrix = std::dynamic_pointer_cast<Tensor<i32>>(mat);
				+		if (!right_matrix)
				+			throw std::runtime_error("matmul: mat must be a non-null Tensor<i32>");
				+		return matmul_impl(*this, *right_matrix);
				+	}
			
 
				+
			
 
				+    // Storage index of this tensor's first element.
				+    idx_type Tensor<i32>::offset() const
				+    {
				+        return _offset;
				+    }
			
 
				+
			
 
				+    // Memory layout this tensor was created with.
				+    layout_type Tensor<i32>::order() const
				+    {
				+        return _order;
				+    }
			
 
				+
			
 
				+    // Platform of this tensor; this implementation always reports
				+    // platform_type::none.
				+    platform_type Tensor<i32>::platform()
				+    {
				+        return platform_type::none;
				+    }
			
 
				+
			
 
				+	// Folds all elements into one value: starts from a value-initialised
				+	// accumulator and repeatedly applies acc = f(acc, element).
				+	i32 Tensor<i32>::reduce_(std::function<i32(i32, i32)> f) const
				+	{
				+		i32 acc = i32();
				+		// Walk from dimension 0, starting at this tensor's first element.
				+		reduce_impl(acc, 0, _offset, f);
				+		return acc;
				+	}
			
 
				+    
			
 
				+    // Reduces this tensor along dimension dim with f and returns a new
				+    // tensor whose dimensions are this tensor's with dim removed.
				+    //   dim: dimension to reduce, 0 <= dim < number of dimensions
				+    //   f:   binary reduction function, applied as acc = f(acc, element)
				+    //        starting from a value-initialised accumulator
				+    // Throws std::runtime_error when dim is out of range.
				+    TensorInterfacePtr Tensor<i32>::reduce_dim(idx_type dim, std::function<i32(i32, i32)> f) const
				+    {
				+        const idx_type rank = _dimensions.size();
				+        if (dim < 0 || dim >= rank)
				+            throw std::runtime_error("reduce_dim: dimension out of range");
				+
				+        DimVector reduced_dim = _dimensions;
				+        reduced_dim.erase(dim);
				+
				+        // Built as the concrete type directly; no round trip through the
				+        // base pointer is needed to reach the private members.
				+        TensorPtr<i32> result(new Tensor<i32>(reduced_dim));
				+        reduce_dim_impl(*result, 0, dim, _offset, result->_offset, f);
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
			
 
				+    
			
 
				+    // In-place reshape to dims. Currently an empty stub: dims is ignored and
				+    // the tensor is left unchanged.
				+    // NOTE(review): callers get no error or other signal that nothing happened.
				+    void Tensor<i32>::reshape_(const DimVector& dims)
			
 
				+    {
			
 
				+
			
 
				+    }
			
 
				+    
			
 
				+    // Gives this tensor the dimensions dims: stores them, resizes the
				+    // storage to dims.flat_size() elements and recomputes dense strides.
				+    void Tensor<i32>::resize_(const DimVector& dims)
				+    {
				+        _dimensions = dims;
				+
				+        const auto element_count = dims.flat_size();
				+        _rep->resize_(element_count);
				+
				+        // Strides depend on the dimensions assigned above.
				+        auto_strides();
				+    }
			
 
				+
			
 
				+	// Returns a view of this tensor restricted by slice. The view shares
				+	// this tensor's storage (no elements are copied). slice[i] applies to
				+	// dimension i; a missing start defaults to 0, a missing end to the
				+	// extent of the dimension and a missing step to 1.
				+	// NOTE(review): the view gets exactly slice.size() dimensions, so a
				+	// slice shorter than this tensor's rank drops the trailing dimensions.
				+	// Confirm that callers always pass one entry per dimension.
				+	std::shared_ptr<TensorInterface> Tensor<i32>::select(const SliceVector& slice) const
				+	{
				+		std::shared_ptr<Tensor<i32>> result(new Tensor<i32>);
				+		result->_rep = _rep;
				+		result->_order = _order;
				+
				+		DimVector dim;
				+		DimVector strides;
				+		// Distance, in storage elements, from this tensor's first element to
				+		// the view's first element: the SUM over dimensions of
				+		// stride * start. It was previously accumulated as a product seeded
				+		// with 1, which gave a wrong offset for any multi-dimensional slice
				+		// and an offset of 1 for an empty slice.
				+		idx_type start_shift = 0;
				+
				+		std::fesetround(FE_TONEAREST);
				+		for (idx_type i = 0; i < slice.size(); ++i)
				+		{
				+			auto& each = slice[i];
				+
				+			// dimension: number of steps that fit in [start, end)
				+			dim.push_back(
				+				std::lrint(std::ceil((each.end.value_or(_dimensions[i]) - each.start.value_or(0)) / static_cast<float>(each.step.value_or(1))))
				+			);
				+
				+			// offset
				+			start_shift += _strides[i] * each.start.value_or(0);
				+
				+			// strides: one step in the view spans step elements of the source
				+			strides.push_back(_strides[i] * each.step.value_or(1));
				+		}
				+
				+		result->_dimensions = dim;
				+		result->_offset = _offset + start_shift;
				+		result->_strides = strides;
				+
				+		return std::dynamic_pointer_cast<TensorInterface>(result);
				+	}
			
 
				+    
			
 
				+    // Replaces each element with its sine, in place. std::sin yields a
				+    // floating-point value; the lambda's i32 return type converts it back
				+    // to an integer.
				+    void Tensor<i32>::sin_()
				+    {
				+        const auto integer_sin = [](i32 x) -> i32 { return std::sin(x); };
				+        apply_(integer_sin);
				+    }
			
 
				+    
			
 
				+    // Copy of this tensor's dimensions.
				+    DimVector Tensor<i32>::size() const
				+    {
				+        return _dimensions;
				+    }
			
 
				+	
			
 
				+	// Extent of dimension i. Non-negative i counts from the first
				+	// dimension; negative i counts from the last one (-1 is the last).
				+	// Throws std::runtime_error when i is outside both ranges.
				+	idx_type Tensor<i32>::size(idx_type i) const
				+	{
				+		const idx_type shape_size = _dimensions.size();
				+		const bool from_front = (i >= 0 && i < shape_size);
				+		const bool from_back = (i <= -1 && i >= -shape_size);
				+
				+		if (!from_front && !from_back)
				+			throw std::runtime_error("Dimension out of range");
				+
				+		return from_front ? _dimensions[i] : _dimensions[shape_size + i];
				+	}
			
 
				+    
			
 
				+	// Shared handle to the storage object backing this tensor.
				+	std::shared_ptr<StorageBase<i32>> Tensor<i32>::storage() const
				+	{
				+		return _rep;
				+	}
			
 
				+    
			
 
				+    // Copy of this tensor's strides.
				+    DimVector Tensor<i32>::stride() const
				+    {
				+        return _strides;
				+    }
			
 
				+	
			
 
				+	// Stride of dimension i. Non-negative i counts from the first
				+	// dimension; negative i counts from the last one (-1 is the last).
				+	// Throws std::runtime_error when i is outside both ranges.
				+	idx_type Tensor<i32>::stride(idx_type i) const
				+	{
				+		const idx_type stride_size = _strides.size();
				+		const bool from_front = (i >= 0 && i < stride_size);
				+		const bool from_back = (i <= -1 && i >= -stride_size);
				+
				+		if (!from_front && !from_back)
				+			throw std::runtime_error("Stride out of range");
				+
				+		return from_front ? _strides[i] : _strides[stride_size + i];
				+	}
			
 
				+    
			
 
				+    // Adds up every element of this tensor and returns the total wrapped in a
				+    // newly allocated tensor of shape {1}.
				+    TensorInterfacePtr Tensor<i32>::sum() const
				+    {
				+        // Fold all elements with plain addition.
				+        const i32 total = reduce_([](i32 lhs, i32 rhs) -> i32 { return lhs + rhs; });
				+
				+        // One-element container for the result.
				+        DimVector scalar_shape(1);
				+        scalar_shape[0] = 1;
				+        TensorPtr<i32> total_tensor(new Tensor<i32>(scalar_shape));
				+        total_tensor->_rep->data[0] = total;
				+
				+        return std::dynamic_pointer_cast<TensorInterface>(total_tensor);
				+    }
			
 
				+    
			
 
				+    // Renders the tensor as nested bracketed text. Elements of the innermost
				+    // dimension are separated by ","; sub-tensors of outer dimensions are each
				+    // wrapped in "[...]" and separated by ",\n". The whole output is wrapped in
				+    // one extra pair of brackets.
				+    std::string Tensor<i32>::to_string() const
				+    {
				+        using Printer = std::function<std::string(const Tensor<i32>&, idx_type, idx_type)>;
				+
				+        // axis: dimension currently being printed.
				+        // pos:  storage index of the first element of the current sub-tensor.
				+        Printer print_axis =
				+            [&print_axis](const Tensor<i32>& src, idx_type axis, idx_type pos) -> std::string {
				+            // Past the last axis, pos addresses exactly one element.
				+            if (axis == src.size().size())
				+                return std::to_string(src.data_ptr()[pos]);
				+
				+            const bool innermost = (axis == src.size().size() - 1);
				+            const idx_type extent = src.size(axis);
				+
				+            std::string text;
				+            for (idx_type i = 0; i < extent; ++i)
				+            {
				+                if (innermost)
				+                {
				+                    text += print_axis(src, axis + 1, pos);
				+                    if (i != extent - 1)
				+                        text += ",";
				+                }
				+                else
				+                {
				+                    if (i != 0)
				+                        text += ",\n";
				+                    text += "[";
				+                    text += print_axis(src, axis + 1, pos);
				+                    text += "]";
				+                }
				+
				+                pos += src.stride(axis);
				+            }
				+            return text;
				+        };
				+
				+        return "[" + print_axis(*this, 0, offset()) + "]";
				+    }
			
 
				+}
			
--- a/traph/source/tensor/long_tensor.cpp
+++ b/traph/source/tensor/long_tensor.cpp
@@ -0,0 +1,406 @@
 
				+#include <traph/tensor/long_tensor.h>
			
 
				+
			
 
				+namespace traph
			
 
				+{
			
 
				+    // definition
			
 
				+    // private
			
 
				+    // Recomputes _strides as the dense (gap-free) strides for _dimensions.
				+    // With _order == layout_type::column_major the LAST dimension receives
				+    // stride 1 and strides grow towards dimension 0; with any other order
				+    // dimension 0 receives stride 1.
				+    // NOTE(review): stride 1 on the last axis is what is usually called
				+    // row-major; confirm the enum naming is intentional.
				+    void Tensor<i64>::auto_strides()
				+    {
				+        const idx_type rank = _dimensions.size();
				+        _strides.resize(rank);
				+
				+        const bool last_axis_fastest = (_order == layout_type::column_major);
				+        idx_type running = 1;
				+        for (idx_type n = 0; n < rank; ++n)
				+        {
				+            // Visit axes starting from the fastest-varying one.
				+            const idx_type axis = last_axis_fastest ? rank - 1 - n : n;
				+            _strides[axis] = running;
				+            running *= _dimensions[axis];
				+        }
				+    }
			
 
				+
			
 
				+    // Applies f in place to every element of the sub-tensor that starts at
				+    // storage index idx and spans dimensions dim..last, recursing one
				+    // dimension at a time.
				+    //   dim: dimension handled by this call
				+    //   idx: storage index of the sub-tensor's first element
				+    //   f:   element-wise transformation
				+    void Tensor<i64>::apply_impl(idx_type dim, idx_type idx, std::function<i64(i64)> f)
				+    {
				+        const idx_type dim_size = _dimensions.size();
				+
				+        // A tensor without dimensions (e.g. default-constructed) has nothing
				+        // to visit; return before _strides/_dimensions are indexed out of range.
				+        if (dim < 0 || dim >= dim_size)
				+            return;
				+
				+        const idx_type step_len = _strides[dim];
				+        const idx_type step_num = _dimensions[dim];
				+        const bool innermost = (dim == dim_size - 1);
				+
				+        for (idx_type i = 0; i < step_num; ++i)
				+        {
				+            if (innermost)
				+                _rep->data[idx] = f(_rep->data[idx]);
				+            else
				+                apply_impl(dim + 1, idx, f);
				+            idx += step_len;
				+        }
				+    }
			
 
				+
			
 
				+    // Folds every element of the sub-tensor that starts at storage index idx
				+    // and spans dimensions dim..last into result, using result = f(result, x).
				+    //   result: accumulator, updated in place
				+    //   dim:    dimension handled by this call
				+    //   idx:    storage index of the sub-tensor's first element
				+    //   f:      binary reduction function
				+    void Tensor<i64>::reduce_impl(i64& result, idx_type dim, idx_type idx, std::function<i64(i64,i64)> f) const
				+    {
				+        const idx_type dim_size = _dimensions.size();
				+
				+        // A tensor without dimensions contributes nothing; return before
				+        // _strides/_dimensions are indexed out of range.
				+        if (dim < 0 || dim >= dim_size)
				+            return;
				+
				+        const idx_type step_len = _strides[dim];
				+        const idx_type step_num = _dimensions[dim];
				+        const bool innermost = (dim == dim_size - 1);
				+
				+        for (idx_type i = 0; i < step_num; ++i)
				+        {
				+            if (innermost)
				+                result = f(result, _rep->data[idx]);
				+            else
				+                reduce_impl(result, dim + 1, idx, f);
				+            idx += step_len;
				+        }
				+    }
			
 
				+
			
 
				+    // Folds step_num elements, starting at storage index begin and spaced
				+    // step_len apart, into a value-initialised accumulator using f, and
				+    // returns the accumulator.
				+    i64 Tensor<i64>::reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<i64(i64,i64)> f) const
				+    {
				+        i64 acc{};
				+        idx_type pos = begin;
				+        for (idx_type visited = 0; visited < step_num; ++visited, pos += step_len)
				+            acc = f(acc, _rep->data[pos]);
				+        return acc;
				+    }
			
 
				+
			
 
				+    // Walks every dimension of this tensor except reduce_dim and, for each
				+    // combination of the remaining indices, writes the reduction along
				+    // reduce_dim into result.
				+    //   result:     destination; expected to have this tensor's dimensions
				+    //               with reduce_dim erased (as built by reduce_dim())
				+    //   dim:        source dimension handled by this call
				+    //   reduce_dim: source dimension being reduced away
				+    //   this_idx:   current storage index in this tensor
				+    //   result_idx: current storage index in result
				+    //   f:          binary reduction function
				+    void Tensor<i64>::reduce_dim_impl(Tensor<i64>& result, idx_type dim, idx_type reduce_dim,
				+        idx_type this_idx, idx_type result_idx,
				+        std::function<i64(i64,i64)> f) const
				+    {
				+        const idx_type dim_size = _dimensions.size();
				+
				+        // All non-reduced indices are fixed: reduce along reduce_dim.
				+        if(dim == dim_size)
				+        {
				+            result._rep->data[result_idx] =
				+                reduce_dim_kernel(this_idx, _strides[reduce_dim], _dimensions[reduce_dim], f);
				+            return;
				+        }
				+
				+        // The reduced dimension is not iterated here; the kernel handles it.
				+        if(dim == reduce_dim)
				+        {
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+            return;
				+        }
				+
				+        // result has no slot for reduce_dim, so source dimensions that come
				+        // after it sit one position earlier in result. Indexing
				+        // result._strides with the source dimension (as before) used the
				+        // wrong stride, or read past the end for the last dimension.
				+        const idx_type result_dim = (dim < reduce_dim) ? dim : dim - 1;
				+        const idx_type this_step = _strides[dim];
				+        const idx_type result_step = result._strides[result_dim];
				+
				+        for(idx_type i = 0; i < _dimensions[dim]; ++i)
				+        {
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+
				+            this_idx += this_step;
				+            result_idx += result_step;
				+        }
				+    }
			
 
				+    // public
			
 
				+    // Builds an empty tensor: fresh storage object, no dimensions, no
				+    // strides, offset 0, layout column_major.
				+    Tensor<i64>::Tensor()
				+        : _rep(new TensorStorage<i64>)
				+        , _dimensions()
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+    }
			
 
				+
			
 
				+    // Builds a tensor with the given dimensions, layout column_major, dense
				+    // strides, and storage resized to hold _dimensions.flat_size() elements.
				+    Tensor<i64>::Tensor(const DimVector& dimensions)
				+        : _rep(new TensorStorage<i64>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+        // Derive strides from the shape, then size the backing storage.
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a tensor with the given dimensions and layout, dense strides,
				+    // and storage resized to hold _dimensions.flat_size() elements.
				+    Tensor<i64>::Tensor(const DimVector& dimensions, layout_type order)
				+        : _rep(new TensorStorage<i64>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(order)
				+    {
				+        // Derive strides from the shape, then size the backing storage.
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a column_major tensor with the given dimensions and storage
				+    // resized to hold _dimensions.flat_size() elements.
				+    // NOTE(review): auto_strides() below recomputes _strides from
				+    // _dimensions, so the strides argument does not survive construction.
				+    // Confirm whether that is intended.
				+    Tensor<i64>::Tensor(const DimVector& dimensions, const DimVector& strides)
				+        : _rep(new TensorStorage<i64>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides(strides)
				+        , _order(layout_type::column_major)
				+    {
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a tensor with the given dimensions and layout and storage
				+    // resized to hold _dimensions.flat_size() elements.
				+    // NOTE(review): auto_strides() below recomputes _strides from
				+    // _dimensions, so the strides argument does not survive construction.
				+    // Confirm whether that is intended.
				+    Tensor<i64>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order)
				+        : _rep(new TensorStorage<i64>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides(strides)
				+        , _order(order)
				+    {
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a one-element tensor of shape {1} that holds the value t.
				+    // Previously _order was left uninitialised (yet read by auto_strides()),
				+    // the storage was never sized, and t itself was discarded.
				+    Tensor<i64>::Tensor(const i64& t)
				+        :_rep(new TensorStorage<i64>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        _dimensions.resize(1);
				+        _dimensions[0] = 1;
				+        auto_strides();
				+
				+        // Make room for the single element and store it.
				+        _rep->resize_(_dimensions.flat_size());
				+        _rep->data[0] = t;
				+    }
			
 
				+
			
 
				+    // In-place element-wise addition: this += other. Dimensions are matched
				+    // from the last one backwards; a dimension that is missing or has
				+    // extent 1 on one side is reused for every index of the other side.
				+    // Throws std::runtime_error if other is null or is not a Tensor<i64>.
				+    // NOTE(review): shape compatibility is still not verified, and the
				+    // result is written into this tensor's existing storage, so this must
				+    // already have the broadcast shape.
				+    void Tensor<i64>::add_(TensorInterfacePtr other)
				+    {
				+        // check tensor other type
				+        Tensor<i64> * rhs = dynamic_cast<Tensor<i64> *>(other.get());
				+        if (rhs == nullptr)
				+            throw std::runtime_error("add_: other must be a non-null Tensor<i64>");
				+        Tensor<i64> * lhs = this;
				+
				+        // check broadcast.shape = this.shape (not implemented)
				+
				+        // Resolved once here instead of on every recursive call.
				+        i64 * lhs_storage = lhs->data_ptr();
				+        i64 * rhs_storage = rhs->data_ptr();
				+        const idx_type lhs_rank = lhs->size().size();
				+        const idx_type rhs_rank = rhs->size().size();
				+
				+        // lhs_dim / rhs_dim are negative indices counted from the last
				+        // dimension (-1); lhs_idx / rhs_idx are storage indices.
				+        std::function<void(idx_type, idx_type, idx_type, idx_type)> add_impl =
				+            [&](idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
				+
				+            const bool lhs_has_dim = lhs_dim >= -lhs_rank;
				+            const bool rhs_has_dim = rhs_dim >= -rhs_rank;
				+
				+            // Both sides ran out of dimensions: a single element pair is left.
				+            if (!lhs_has_dim && !rhs_has_dim)
				+            {
				+                lhs_storage[lhs_idx] += rhs_storage[rhs_idx];
				+                return;
				+            }
				+
				+            const idx_type lhs_shape_size = lhs_has_dim ? lhs->size(lhs_dim) : 1;
				+            const idx_type rhs_shape_size = rhs_has_dim ? rhs->size(rhs_dim) : 1;
				+            const idx_type max_shape_size = std::max(lhs_shape_size, rhs_shape_size);
				+
				+            // A side with extent 1 (or no such dimension) stays in place.
				+            const idx_type lhs_step = lhs_shape_size > 1 ? lhs->stride(lhs_dim) : 0;
				+            const idx_type rhs_step = rhs_shape_size > 1 ? rhs->stride(rhs_dim) : 0;
				+
				+            for (idx_type i = 0; i < max_shape_size; ++i)
				+            {
				+                add_impl(lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
				+
				+                lhs_idx += lhs_step;
				+                rhs_idx += rhs_step;
				+            }
				+        };
				+
				+        add_impl(-1, -1, lhs->offset(), rhs->offset());
				+    }
			
 
				+
			
 
				+    // Replaces every element x of this tensor with f(x), in place.
				+    void Tensor<i64>::apply_(std::function<i64(i64)> f)
				+    {
				+        // Start the recursive walk at dimension 0, at this tensor's first
				+        // element in storage.
				+        const idx_type first_dim = 0;
				+        apply_impl(first_dim, _offset, f);
				+    }
			
 
				+
			
 
				+    // Returns a new tensor whose storage comes from _rep->clone() and whose
				+    // dimensions, offset, strides and layout are copied from this tensor.
				+    TensorInterfacePtr Tensor<i64>::clone() const
				+    {
				+        std::shared_ptr<Tensor<i64>> copy(new Tensor<i64>);
				+
				+        // Storage first, then the metadata describing how to read it.
				+        copy->_rep = std::dynamic_pointer_cast<TensorStorage<i64>>(_rep->clone());
				+        copy->_dimensions = _dimensions;
				+        copy->_strides = _strides;
				+        copy->_offset = _offset;
				+        copy->_order = _order;
				+
				+        return copy;
				+    }
			
 
				+
			
 
				+    // Replaces each element with its cosine, in place. std::cos yields a
				+    // floating-point value; the lambda's i64 return type converts it back
				+    // to an integer.
				+    void Tensor<i64>::cos_()
				+    {
				+        const auto integer_cos = [](i64 x) -> i64 { return std::cos(x); };
				+        apply_(integer_cos);
				+    }
			
 
				+
			
 
				+    // Allocates a new f32 tensor with the same dimensions as this tensor and
				+    // returns it through the TensorBase<f32> interface.
				+    std::shared_ptr<TensorBase<f32>> Tensor<i64>::create_grad()
				+    {
				+        std::shared_ptr<TensorBase<f32>> grad(new Tensor<f32>(_dimensions));
				+        return grad;
				+    }
			
 
				+
			
 
				+	// Mutable pointer to the start of the underlying storage; the tensor's
				+	// offset is not applied.
				+	i64* Tensor<i64>::data_ptr()
				+	{
				+		i64* const base = _rep->data_ptr();
				+		return base;
				+	}
			
 
				+
			
 
				+    // Read-only pointer to the start of the underlying storage; the
				+    // tensor's offset is not applied.
				+    const i64* Tensor<i64>::data_ptr() const
				+    {
				+        const i64* const base = _rep->data_ptr();
				+        return base;
				+    }
			
 
				+
			
 
				+    // Device identifier of this tensor; this implementation always reports 0.
				+    device_id Tensor<i64>::device()
				+    {
				+        return 0;
				+    }
			
 
				+
			
 
				+	// Not implemented for this element type: always throws
				+	// std::runtime_error.
				+	std::shared_ptr<TensorInterface> Tensor<i64>::inverse() const
				+	{
				+		const char* const message = "No implement";
				+		throw std::runtime_error(message);
				+	}
			
 
				+
			
 
				+    // Overwrites every element of this tensor with value.
				+    void Tensor<i64>::fill_(i64 value)
				+    {
				+        // The previous content of each element is irrelevant, so the
				+        // lambda ignores its argument.
				+        apply_([value](i64) -> i64 { return value; });
				+    }
			
 
				+
			
 
				+	// Returns the single element of a one-element tensor.
				+	// Throws std::runtime_error when the tensor does not hold exactly one
				+	// element.
				+	i64 Tensor<i64>::item() const
				+	{
				+		const bool single_element = (_dimensions.flat_size() == 1);
				+		if (!single_element)
				+			throw std::runtime_error("item: only one element tensors can be converted to scalars");
				+
				+		return _rep->data[_offset];
				+	}
			
 
				+
			
 
				+	// Multiplies this tensor (left operand) by mat (right operand) through
				+	// matmul_impl and returns the product.
				+	// Throws std::runtime_error if mat is null or is not a Tensor<i64>;
				+	// previously that case dereferenced a null pointer.
				+	std::shared_ptr<TensorInterface> Tensor<i64>::matmul(std::shared_ptr<TensorInterface> mat) const
				+	{
				+		auto right_matrix = std::dynamic_pointer_cast<Tensor<i64>>(mat);
				+		if (!right_matrix)
				+			throw std::runtime_error("matmul: mat must be a non-null Tensor<i64>");
				+		return matmul_impl(*this, *right_matrix);
				+	}
			
 
				+
			
 
				+    // Storage index of this tensor's first element.
				+    idx_type Tensor<i64>::offset() const
				+    {
				+        return _offset;
				+    }
			
 
				+
			
 
				+    // Memory layout this tensor was created with.
				+    layout_type Tensor<i64>::order() const
				+    {
				+        return _order;
				+    }
			
 
				+
			
 
				+    // Platform of this tensor; this implementation always reports
				+    // platform_type::none.
				+    platform_type Tensor<i64>::platform()
				+    {
				+        return platform_type::none;
				+    }
			
 
				+
			
 
				+	// Folds all elements into one value: starts from a value-initialised
				+	// accumulator and repeatedly applies acc = f(acc, element).
				+	i64 Tensor<i64>::reduce_(std::function<i64(i64, i64)> f) const
				+	{
				+		i64 acc = i64();
				+		// Walk from dimension 0, starting at this tensor's first element.
				+		reduce_impl(acc, 0, _offset, f);
				+		return acc;
				+	}
			
 
				+    
			
 
				+    // Reduces this tensor along dimension dim with f and returns a new
				+    // tensor whose dimensions are this tensor's with dim removed.
				+    //   dim: dimension to reduce, 0 <= dim < number of dimensions
				+    //   f:   binary reduction function, applied as acc = f(acc, element)
				+    //        starting from a value-initialised accumulator
				+    // Throws std::runtime_error when dim is out of range.
				+    TensorInterfacePtr Tensor<i64>::reduce_dim(idx_type dim, std::function<i64(i64, i64)> f) const
				+    {
				+        const idx_type rank = _dimensions.size();
				+        if (dim < 0 || dim >= rank)
				+            throw std::runtime_error("reduce_dim: dimension out of range");
				+
				+        DimVector reduced_dim = _dimensions;
				+        reduced_dim.erase(dim);
				+
				+        // Built as the concrete type directly; no round trip through the
				+        // base pointer is needed to reach the private members.
				+        TensorPtr<i64> result(new Tensor<i64>(reduced_dim));
				+        reduce_dim_impl(*result, 0, dim, _offset, result->_offset, f);
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
			
 
				+    
			
 
				+    // In-place reshape to dims. Currently an empty stub: dims is ignored and
				+    // the tensor is left unchanged.
				+    // NOTE(review): callers get no error or other signal that nothing happened.
				+    void Tensor<i64>::reshape_(const DimVector& dims)
			
 
				+    {
			
 
				+
			
 
				+    }
			
 
				+    
			
 
				+    // Gives this tensor the dimensions dims: stores them, resizes the
				+    // storage to dims.flat_size() elements and recomputes dense strides.
				+    void Tensor<i64>::resize_(const DimVector& dims)
				+    {
				+        _dimensions = dims;
				+
				+        const auto element_count = dims.flat_size();
				+        _rep->resize_(element_count);
				+
				+        // Strides depend on the dimensions assigned above.
				+        auto_strides();
				+    }
			
 
				+
			
 
				+	// Returns a view of this tensor restricted by slice. The view shares
				+	// this tensor's storage (no elements are copied). slice[i] applies to
				+	// dimension i; a missing start defaults to 0, a missing end to the
				+	// extent of the dimension and a missing step to 1.
				+	// NOTE(review): the view gets exactly slice.size() dimensions, so a
				+	// slice shorter than this tensor's rank drops the trailing dimensions.
				+	// Confirm that callers always pass one entry per dimension.
				+	std::shared_ptr<TensorInterface> Tensor<i64>::select(const SliceVector& slice) const
				+	{
				+		std::shared_ptr<Tensor<i64>> result(new Tensor<i64>);
				+		result->_rep = _rep;
				+		result->_order = _order;
				+
				+		DimVector dim;
				+		DimVector strides;
				+		// Distance, in storage elements, from this tensor's first element to
				+		// the view's first element: the SUM over dimensions of
				+		// stride * start. It was previously accumulated as a product seeded
				+		// with 1, which gave a wrong offset for any multi-dimensional slice
				+		// and an offset of 1 for an empty slice.
				+		idx_type start_shift = 0;
				+
				+		std::fesetround(FE_TONEAREST);
				+		for (idx_type i = 0; i < slice.size(); ++i)
				+		{
				+			auto& each = slice[i];
				+
				+			// dimension: number of steps that fit in [start, end)
				+			dim.push_back(
				+				std::lrint(std::ceil((each.end.value_or(_dimensions[i]) - each.start.value_or(0)) / static_cast<float>(each.step.value_or(1))))
				+			);
				+
				+			// offset
				+			start_shift += _strides[i] * each.start.value_or(0);
				+
				+			// strides: one step in the view spans step elements of the source
				+			strides.push_back(_strides[i] * each.step.value_or(1));
				+		}
				+
				+		result->_dimensions = dim;
				+		result->_offset = _offset + start_shift;
				+		result->_strides = strides;
				+
				+		return std::dynamic_pointer_cast<TensorInterface>(result);
				+	}
			
 
				+    
			
 
				+    // Replaces each element with its sine, in place. std::sin yields a
				+    // floating-point value; the lambda's i64 return type converts it back
				+    // to an integer.
				+    void Tensor<i64>::sin_()
				+    {
				+        const auto integer_sin = [](i64 x) -> i64 { return std::sin(x); };
				+        apply_(integer_sin);
				+    }
			
 
				+    
			
 
				+    // Copy of this tensor's dimensions.
				+    DimVector Tensor<i64>::size() const
				+    {
				+        return _dimensions;
				+    }
			
 
				+	
			
 
				+	// Extent of dimension i. Non-negative i counts from the first
				+	// dimension; negative i counts from the last one (-1 is the last).
				+	// Throws std::runtime_error when i is outside both ranges.
				+	idx_type Tensor<i64>::size(idx_type i) const
				+	{
				+		const idx_type shape_size = _dimensions.size();
				+		const bool from_front = (i >= 0 && i < shape_size);
				+		const bool from_back = (i <= -1 && i >= -shape_size);
				+
				+		if (!from_front && !from_back)
				+			throw std::runtime_error("Dimension out of range");
				+
				+		return from_front ? _dimensions[i] : _dimensions[shape_size + i];
				+	}
			
 
				+    
			
 
				+	// Shared handle to the storage object backing this tensor.
				+	std::shared_ptr<StorageBase<i64>> Tensor<i64>::storage() const
				+	{
				+		return _rep;
				+	}
			
 
				+    
			
 
				+    // Copy of this tensor's strides.
				+    DimVector Tensor<i64>::stride() const
				+    {
				+        return _strides;
				+    }
			
 
				+	
			
 
				+	// Stride of dimension i. Non-negative i counts from the first
				+	// dimension; negative i counts from the last one (-1 is the last).
				+	// Throws std::runtime_error when i is outside both ranges.
				+	idx_type Tensor<i64>::stride(idx_type i) const
				+	{
				+		const idx_type stride_size = _strides.size();
				+		const bool from_front = (i >= 0 && i < stride_size);
				+		const bool from_back = (i <= -1 && i >= -stride_size);
				+
				+		if (!from_front && !from_back)
				+			throw std::runtime_error("Stride out of range");
				+
				+		return from_front ? _strides[i] : _strides[stride_size + i];
				+	}
			
 
				+    
			
 
				+    // Adds up every element of this tensor and returns the total wrapped in a
				+    // newly allocated tensor of shape {1}.
				+    TensorInterfacePtr Tensor<i64>::sum() const
				+    {
				+        // Fold all elements with plain addition.
				+        const i64 total = reduce_([](i64 lhs, i64 rhs) -> i64 { return lhs + rhs; });
				+
				+        // One-element container for the result.
				+        DimVector scalar_shape(1);
				+        scalar_shape[0] = 1;
				+        TensorPtr<i64> total_tensor(new Tensor<i64>(scalar_shape));
				+        total_tensor->_rep->data[0] = total;
				+
				+        return std::dynamic_pointer_cast<TensorInterface>(total_tensor);
				+    }
			
 
				+    
			
 
				+    // Renders the tensor as nested bracketed text. Elements of the innermost
				+    // dimension are separated by ","; sub-tensors of outer dimensions are each
				+    // wrapped in "[...]" and separated by ",\n". The whole output is wrapped in
				+    // one extra pair of brackets.
				+    std::string Tensor<i64>::to_string() const
				+    {
				+        using Printer = std::function<std::string(const Tensor<i64>&, idx_type, idx_type)>;
				+
				+        // axis: dimension currently being printed.
				+        // pos:  storage index of the first element of the current sub-tensor.
				+        Printer print_axis =
				+            [&print_axis](const Tensor<i64>& src, idx_type axis, idx_type pos) -> std::string {
				+            // Past the last axis, pos addresses exactly one element.
				+            if (axis == src.size().size())
				+                return std::to_string(src.data_ptr()[pos]);
				+
				+            const bool innermost = (axis == src.size().size() - 1);
				+            const idx_type extent = src.size(axis);
				+
				+            std::string text;
				+            for (idx_type i = 0; i < extent; ++i)
				+            {
				+                if (innermost)
				+                {
				+                    text += print_axis(src, axis + 1, pos);
				+                    if (i != extent - 1)
				+                        text += ",";
				+                }
				+                else
				+                {
				+                    if (i != 0)
				+                        text += ",\n";
				+                    text += "[";
				+                    text += print_axis(src, axis + 1, pos);
				+                    text += "]";
				+                }
				+
				+                pos += src.stride(axis);
				+            }
				+            return text;
				+        };
				+
				+        return "[" + print_axis(*this, 0, offset()) + "]";
				+    }
			
 
				+}
			
--- a/traph/source/tensor/short_tensor.cpp
+++ b/traph/source/tensor/short_tensor.cpp
@@ -0,0 +1,406 @@
 
				+#include <traph/tensor/short_tensor.h>
			
 
				+
			
 
				+namespace traph
			
 
				+{
			
 
				+    // definition
			
 
				+    // private
			
 
				+    // Recomputes _strides from _dimensions as a dense layout.
				+    // For layout_type::column_major the last dimension receives stride 1 and
				+    // strides grow towards dimension 0; for any other order dimension 0
				+    // receives stride 1 and strides grow towards the last dimension.
				+    void Tensor<i16>::auto_strides()
				+    {
				+        const idx_type rank = _dimensions.size();
				+        _strides.resize(rank);
				+
				+        const bool back_to_front = (_order == layout_type::column_major);
				+        idx_type running = 1;
				+        for (idx_type n = 0; n < rank; ++n)
				+        {
				+            // Visit axes in the order in which their strides accumulate.
				+            const idx_type axis = back_to_front ? rank - 1 - n : n;
				+            _strides[axis] = running;
				+            running *= _dimensions[axis];
				+        }
				+    }
			
 
				+
			
 
				+    // Recursively walks dimensions dim..last starting at storage index idx and
				+    // overwrites each visited element with f(element).
				+    void Tensor<i16>::apply_impl(idx_type dim, idx_type idx, std::function<i16(i16)> f)
				+    {
				+        const idx_type rank = _dimensions.size();
				+        const idx_type stride = _strides[dim];
				+        const idx_type extent = _dimensions[dim];
				+
				+        if (dim == rank - 1)
				+        {
				+            // Innermost dimension: transform the elements in place.
				+            for (idx_type n = 0; n < extent; ++n, idx += stride)
				+                _rep->data[idx] = f(_rep->data[idx]);
				+            return;
				+        }
				+
				+        // Outer dimension: descend once per position along this axis.
				+        for (idx_type n = 0; n < extent; ++n, idx += stride)
				+            apply_impl(dim + 1, idx, f);
				+    }
			
 
				+
			
 
				+    // Recursively walks dimensions dim..last starting at storage index idx and
				+    // folds every visited element into `result` via result = f(result, element).
				+    void Tensor<i16>::reduce_impl(i16& result, idx_type dim, idx_type idx, std::function<i16(i16,i16)> f) const
				+    {
				+        const idx_type rank = _dimensions.size();
				+        const idx_type stride = _strides[dim];
				+        const idx_type extent = _dimensions[dim];
				+
				+        if (dim == rank - 1)
				+        {
				+            // Innermost dimension: accumulate the elements themselves.
				+            for (idx_type n = 0; n < extent; ++n, idx += stride)
				+                result = f(result, _rep->data[idx]);
				+            return;
				+        }
				+
				+        // Outer dimension: descend once per position along this axis.
				+        for (idx_type n = 0; n < extent; ++n, idx += stride)
				+            reduce_impl(result, dim + 1, idx, f);
				+    }
			
 
				+
			
 
				+    // Folds step_num elements, spaced step_len apart and starting at storage
				+    // index `begin`, with f. The accumulator starts as a value-initialized i16.
				+    i16 Tensor<i16>::reduce_dim_kernel(idx_type begin, idx_type step_len, idx_type step_num, std::function<i16(i16,i16)> f) const
				+    {
				+        i16 acc{};
				+        idx_type pos = begin;
				+        idx_type remaining = step_num;
				+        while (remaining > 0)
				+        {
				+            acc = f(acc, _rep->data[pos]);
				+            pos += step_len;
				+            --remaining;
				+        }
				+        return acc;
				+    }
			
 
				+
			
 
				+    // Recursive driver for reduce_dim. Walks every dimension of this tensor
				+    // except `reduce_dim`; once all dimensions are consumed it folds the
				+    // elements along `reduce_dim` with reduce_dim_kernel and writes the value
				+    // into `result` at result_idx.
				+    //
				+    // result      tensor receiving the reduced values (one dimension fewer than this)
				+    // dim         dimension of this tensor currently being walked
				+    // reduce_dim  dimension being folded away
				+    // this_idx    storage index into this tensor reached so far
				+    // result_idx  storage index into `result` reached so far
				+    // f           binary fold function
				+    void Tensor<i16>::reduce_dim_impl(Tensor<i16>& result, idx_type dim, idx_type reduce_dim,
				+        idx_type this_idx, idx_type result_idx,
				+        std::function<i16(i16,i16)> f) const
				+    {
				+        const idx_type dim_size = _dimensions.size();
				+
				+        if(dim == dim_size)
				+        {
				+            result._rep->data[result_idx] =
				+                reduce_dim_kernel(this_idx, _strides[reduce_dim], _dimensions[reduce_dim], f);
				+            return;
				+        }
				+
				+        if(dim == reduce_dim)
				+        {
				+            // The folded dimension contributes no index movement here.
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+            return;
				+        }
				+
				+        // `result` lacks the reduced dimension, so dimensions after it sit one
				+        // position earlier there. Indexing result._strides with `dim` directly
				+        // used the wrong stride (or read past the end) for those dimensions.
				+        const idx_type result_dim = (dim > reduce_dim) ? dim - 1 : dim;
				+        const idx_type extent = _dimensions[dim];
				+
				+        for(idx_type i = 0; i < extent; ++i)
				+        {
				+            reduce_dim_impl(result, dim + 1, reduce_dim, this_idx, result_idx, f);
				+
				+            this_idx += _strides[dim];
				+            result_idx += result._strides[result_dim];
				+        }
				+    }
			
 
				+    // public
			
 
				+    // Default constructor: new storage object, no dimensions or strides,
				+    // zero offset, column_major order.
				+    Tensor<i16>::Tensor()
				+        : _rep(new TensorStorage<i16>)
				+        , _dimensions()
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+    }
			
 
				+
			
 
				+    // Builds a column_major tensor of the given shape: strides are derived
				+    // from the shape and storage is sized to hold flat_size() elements.
				+    Tensor<i16>::Tensor(const DimVector& dimensions)
				+        : _rep(new TensorStorage<i16>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(layout_type::column_major)
				+    {
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a tensor of the given shape and layout order: strides are derived
				+    // from the shape and order, storage is sized to hold flat_size() elements.
				+    Tensor<i16>::Tensor(const DimVector& dimensions, layout_type order)
				+        : _rep(new TensorStorage<i16>)
				+        , _dimensions(dimensions)
				+        , _offset(0)
				+        , _strides()
				+        , _order(order)
				+    {
				+        auto_strides();
				+        _rep->resize_(_dimensions.flat_size());
				+    }
			
 
				+
			
 
				+    // Builds a column_major tensor of the given shape using caller-provided strides.
				+    //
				+    // dimensions  extent of each dimension
				+    // strides     storage step of each dimension; honoured when it has one
				+    //             entry per dimension, otherwise replaced by auto_strides()
				+    Tensor<i16>::Tensor(const DimVector& dimensions, const DimVector& strides)
				+        :_rep(new TensorStorage<i16>),
				+        _dimensions(dimensions), _offset(0), _strides(strides), _order(layout_type::column_major)
				+    {
				+        // auto_strides() used to run unconditionally and discarded `strides`.
				+        // Fall back to it only when the list does not describe this shape.
				+        if (_strides.size() != _dimensions.size())
				+            auto_strides();
				+
				+        // Storage has to reach the furthest addressable element, which can lie
				+        // beyond flat_size() when the strides are not dense.
				+        idx_type needed = _dimensions.flat_size();
				+        if (needed > 0)
				+        {
				+            idx_type span = 1;
				+            for (idx_type i = 0; i < _dimensions.size(); ++i)
				+                span += (_dimensions[i] - 1) * _strides[i];
				+            if (span > needed)
				+                needed = span;
				+        }
				+        _rep->resize_(needed);
				+    }
			
 
				+
			
 
				+    // Builds a tensor of the given shape and layout order using caller-provided strides.
				+    //
				+    // dimensions  extent of each dimension
				+    // strides     storage step of each dimension; honoured when it has one
				+    //             entry per dimension, otherwise replaced by auto_strides()
				+    // order       layout order recorded on the tensor (used by auto_strides())
				+    Tensor<i16>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order)
				+        :_rep(new TensorStorage<i16>),
				+        _dimensions(dimensions), _offset(0), _strides(strides), _order(order)
				+    {
				+        // auto_strides() used to run unconditionally and discarded `strides`.
				+        // Fall back to it only when the list does not describe this shape.
				+        if (_strides.size() != _dimensions.size())
				+            auto_strides();
				+
				+        // Storage has to reach the furthest addressable element, which can lie
				+        // beyond flat_size() when the strides are not dense.
				+        idx_type needed = _dimensions.flat_size();
				+        if (needed > 0)
				+        {
				+            idx_type span = 1;
				+            for (idx_type i = 0; i < _dimensions.size(); ++i)
				+                span += (_dimensions[i] - 1) * _strides[i];
				+            if (span > needed)
				+                needed = span;
				+        }
				+        _rep->resize_(needed);
				+    }
			
 
				+
			
 
				+    // Builds a one-element tensor of shape [1] holding the scalar `t`.
				+    Tensor<i16>::Tensor(const i16& t)
				+        :_rep(new TensorStorage<i16>),
				+        // _order was previously left uninitialised although auto_strides() reads it.
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        _dimensions.resize(1);
				+        _dimensions[0] = 1; // the extent was never set before
				+        auto_strides();
				+
				+        // Allocate the single slot and store the value; the original neither
				+        // sized the storage nor used `t`.
				+        _rep->resize_(_dimensions.flat_size());
				+        _rep->data[_offset] = t;
				+    }
			
 
				+
			
 
				+    // In-place element-wise addition: this += other, pairing dimensions from
				+    // the last one backwards. A dimension of extent 1 (or a missing leading
				+    // dimension) is not advanced while the other operand steps through its
				+    // extent.
				+    //
				+    // other  right-hand operand; must be a Tensor<i16>
				+    // Throws std::runtime_error when `other` is null or not a Tensor<i16>.
				+    void Tensor<i16>::add_(TensorInterfacePtr other)
				+    {
				+        // TODO (kept from the original): check that the broadcast shape equals this shape.
				+
				+        Tensor<i16> * lhs = this;
				+        Tensor<i16> * rhs = dynamic_cast<Tensor<i16> *>(other.get());
				+        // The cast result used to be dereferenced without a check.
				+        if (rhs == nullptr)
				+            throw std::runtime_error("add_: operand is null or not an i16 tensor");
				+
				+        // Resolve the raw buffers and ranks once; the original repeated two
				+        // dynamic_pointer_casts at every recursion step, including per element.
				+        i16 * lhs_data = lhs->data_ptr();
				+        const i16 * rhs_data = rhs->data_ptr();
				+        const idx_type lhs_rank = lhs->size().size();
				+        const idx_type rhs_rank = rhs->size().size();
				+
				+        // Dimensions are addressed with negative indices (-1 is the last one).
				+        std::function<void(idx_type, idx_type, idx_type, idx_type)> add_impl =
				+            [&](idx_type lhs_dim, idx_type rhs_dim, idx_type lhs_idx, idx_type rhs_idx) {
				+
				+            const bool lhs_done = lhs_dim < -lhs_rank;
				+            const bool rhs_done = rhs_dim < -rhs_rank;
				+
				+            if (lhs_done && rhs_done)
				+            {
				+                lhs_data[lhs_idx] += rhs_data[rhs_idx];
				+                return;
				+            }
				+
				+            const idx_type lhs_extent = lhs_done ? 1 : lhs->size(lhs_dim);
				+            const idx_type rhs_extent = rhs_done ? 1 : rhs->size(rhs_dim);
				+            const idx_type steps = std::max(lhs_extent, rhs_extent);
				+
				+            // An operand only advances along dimensions where it has more than one element.
				+            const idx_type lhs_step = lhs_extent > 1 ? lhs->stride(lhs_dim) : 0;
				+            const idx_type rhs_step = rhs_extent > 1 ? rhs->stride(rhs_dim) : 0;
				+
				+            for (idx_type i = 0; i < steps; ++i)
				+            {
				+                add_impl(lhs_dim - 1, rhs_dim - 1, lhs_idx, rhs_idx);
				+                lhs_idx += lhs_step;
				+                rhs_idx += rhs_step;
				+            }
				+        };
				+
				+        add_impl(-1, -1, lhs->offset(), rhs->offset());
				+    }
			
 
				+
			
 
				+    // Replaces every element x of the tensor with f(x), starting the walk at
				+    // dimension 0 and the tensor's storage offset.
				+    void Tensor<i16>::apply_(std::function<i16(i16)> f)
				+    {
				+        const idx_type first_dim = 0;
				+        apply_impl(first_dim, _offset, f);
				+    }
			
 
				+
			
 
				+    // Returns a new tensor with a cloned storage object and the same
				+    // dimensions, offset, strides and order as this one.
				+    TensorInterfacePtr Tensor<i16>::clone() const
				+    {
				+        std::shared_ptr<Tensor<i16>> copy(new Tensor<i16>);
				+
				+        // Layout metadata is copied verbatim.
				+        copy->_dimensions = _dimensions;
				+        copy->_strides = _strides;
				+        copy->_offset = _offset;
				+        copy->_order = _order;
				+
				+        // The storage itself is duplicated through its own clone().
				+        copy->_rep = std::dynamic_pointer_cast<TensorStorage<i16>>(_rep->clone());
				+
				+        return copy;
				+    }
			
 
				+
			
 
				+    // Replaces each element with std::cos(element) converted back to i16.
				+    void Tensor<i16>::cos_()
				+    {
				+        const auto cosine = [](i16 v)->i16 { return static_cast<i16>(std::cos(v)); };
				+        apply_(cosine);
				+    }
			
 
				+
			
 
				+    // Allocates an f32 tensor with the same dimensions as this tensor.
				+    std::shared_ptr<TensorBase<f32>> Tensor<i16>::create_grad()
				+    {
				+        std::shared_ptr<TensorBase<f32>> grad(new Tensor<f32>(_dimensions));
				+        return grad;
				+    }
			
 
				+
			
 
				+	// Mutable pointer to the underlying storage buffer (offset not applied).
				+	i16* Tensor<i16>::data_ptr()
				+	{
				+		i16* const buffer = _rep->data_ptr();
				+		return buffer;
				+	}
			
 
				+
			
 
				+    // Read-only pointer to the underlying storage buffer (offset not applied).
				+    const i16* Tensor<i16>::data_ptr() const
				+    {
				+        const i16* const buffer = _rep->data_ptr();
				+        return buffer;
				+    }
			
 
				+
			
 
				+    // Device identifier of this tensor; this implementation always reports 0.
				+    device_id Tensor<i16>::device()
				+    {
				+        return 0;
				+    }
			
 
				+
			
 
				+	// Matrix inverse is not provided for i16 tensors: always throws std::runtime_error.
				+	std::shared_ptr<TensorInterface> Tensor<i16>::inverse() const
				+	{
				+		throw std::runtime_error(
				+			"No implement");
				+	}
			
 
				+
			
 
				+    // Overwrites every element of the tensor with `value`.
				+    void Tensor<i16>::fill_(i16 value)
				+    {
				+        // The current element is ignored; each slot simply receives `value`.
				+        const auto constant = [value](i16)->i16 { return value; };
				+        apply_(constant);
				+    }
			
 
				+
			
 
				+	// Returns the single element of a one-element tensor.
				+	// Throws std::runtime_error when the tensor does not hold exactly one element.
				+	i16 Tensor<i16>::item() const
				+	{
				+		if (_dimensions.flat_size() != 1)
				+			throw std::runtime_error("item: only one element tensors can be converted to scalars");
				+
				+		return _rep->data[_offset];
				+	}
			
 
				+
			
 
				+	// Matrix product of this tensor with `mat`, delegated to matmul_impl.
				+	//
				+	// mat  right-hand operand; must be a Tensor<i16>
				+	// Throws std::runtime_error when `mat` is null or not a Tensor<i16>.
				+	std::shared_ptr<TensorInterface> Tensor<i16>::matmul(std::shared_ptr<TensorInterface> mat) const
				+	{
				+		auto right_matrix = std::dynamic_pointer_cast<Tensor<i16>>(mat);
				+		// A failed cast yields an empty pointer, which used to be dereferenced unchecked.
				+		if (!right_matrix)
				+			throw std::runtime_error("matmul: operand is null or not an i16 tensor");
				+		return matmul_impl(*this, *right_matrix);
				+	}
			
 
				+
			
 
				+    // Storage index at which this tensor's first element lives.
				+    idx_type Tensor<i16>::offset() const
				+    {
				+        return _offset;
				+    }
			
 
				+
			
 
				+    // Layout order recorded for this tensor.
				+    layout_type Tensor<i16>::order() const
				+    {
				+        return _order;
				+    }
			
 
				+
			
 
				+    // Platform of this tensor; this implementation always reports platform_type::none.
				+    platform_type Tensor<i16>::platform()
				+    {
				+        return platform_type::none;
				+    }
			
 
				+
			
 
				+	// Folds every element of the tensor with f, starting from a
				+	// value-initialized i16 accumulator, and returns the final accumulator.
				+	i16 Tensor<i16>::reduce_(std::function<i16(i16, i16)> f) const
				+	{
				+		i16 accumulator{};
				+		const idx_type first_dim = 0;
				+		reduce_impl(accumulator, first_dim, _offset, f);
				+		return accumulator;
				+	}
			
 
				+    
			
 
				+    // Folds the tensor along dimension `dim` with f and returns a new tensor
				+    // whose shape is this shape with `dim` removed.
				+    //
				+    // dim  dimension to fold away; must satisfy 0 <= dim < number of dimensions
				+    // f    binary fold function, applied as acc = f(acc, element)
				+    // Throws std::runtime_error when `dim` is out of range.
				+    TensorInterfacePtr Tensor<i16>::reduce_dim(idx_type dim, std::function<i16(i16, i16)> f) const
				+    {
				+        // The original erased `dim` without checking it ("check dim?").
				+        const idx_type rank = _dimensions.size();
				+        if (dim < 0 || dim >= rank)
				+            throw std::runtime_error("Dimension out of range");
				+
				+        DimVector reduced_dim = _dimensions;
				+        reduced_dim.erase(dim);
				+
				+        // Create the concrete tensor directly instead of round-tripping
				+        // through a base pointer and a dynamic cast.
				+        TensorPtr<i16> result(new Tensor<i16>(reduced_dim));
				+        reduce_dim_impl(*result, 0, dim, _offset, result->_offset, f);
				+        return std::dynamic_pointer_cast<TensorInterface>(result);
				+    }
			
 
				+    
			
 
				+    void Tensor<i16>::reshape_(const DimVector& dims) // NOTE: currently a no-op; `dims` is ignored
				+    {
				+ // Not implemented: the body is empty, so dimensions, strides and storage are left unchanged.
				+    }
			
 
				+    
			
 
				+    // Gives the tensor a new shape: storage is resized to dims.flat_size()
				+    // elements and strides are recomputed from the new dimensions.
				+    // The offset is left as it was.
				+    void Tensor<i16>::resize_(const DimVector& dims)
				+    {
				+        _dimensions = dims;
				+
				+        const auto element_count = dims.flat_size();
				+        _rep->resize_(element_count);
				+
				+        auto_strides();
				+    }
			
 
				+
			
 
				+	// Returns a view that shares this tensor's storage and covers the region
				+	// described by `slice` (one entry per leading dimension).
				+	//
				+	// For each entry, missing start/end/step default to 0 / the dimension's
				+	// extent / 1. The view's extent in that dimension is
				+	// ceil((end - start) / step), clamped to zero, and its stride is this
				+	// tensor's stride multiplied by step.
				+	//
				+	// Throws std::runtime_error when there are more slices than dimensions or
				+	// when a step is zero.
				+	// NOTE(review): as before, only the first slice.size() dimensions appear in
				+	// the result; confirm whether trailing dimensions should be kept.
				+	std::shared_ptr<TensorInterface> Tensor<i16>::select(const SliceVector& slice) const
				+	{
				+		const idx_type slice_num = static_cast<idx_type>(slice.size());
				+		if (slice_num > static_cast<idx_type>(_dimensions.size()))
				+			throw std::runtime_error("select: more slices than tensor dimensions");
				+
				+		std::shared_ptr<Tensor<i16>> result(new Tensor<i16>);
				+		result->_rep = _rep; // the view aliases the same storage
				+
				+		DimVector dim;
				+		DimVector strides;
				+		// The offset is the SUM of start * stride over the sliced dimensions.
				+		// It was previously accumulated as a product starting from 1.
				+		idx_type new_offset = 0;
				+
				+		for (idx_type i = 0; i < slice_num; ++i)
				+		{
				+			auto& each = slice[i];
				+			const idx_type start = each.start.value_or(0);
				+			const idx_type end = each.end.value_or(_dimensions[i]);
				+			const idx_type step = each.step.value_or(1);
				+			if (step == 0)
				+				throw std::runtime_error("select: slice step cannot be zero");
				+
				+			// Integer ceil((end - start) / step). This replaces the float
				+			// division + std::ceil + std::lrint chain, which also changed the
				+			// process-wide rounding mode through std::fesetround.
				+			const idx_type span = end - start;
				+			idx_type count = span / step;
				+			if (span % step != 0 && ((span < 0) == (step < 0)))
				+				++count;
				+			if (count < 0)
				+				count = 0; // empty selection rather than a negative extent
				+
				+			dim.push_back(count);
				+			strides.push_back(_strides[i] * step);
				+			new_offset += _strides[i] * start;
				+		}
				+
				+		result->_dimensions = dim;
				+		result->_offset = _offset + new_offset;
				+		result->_strides = strides;
				+		result->_order = _order;
				+
				+		return std::dynamic_pointer_cast<TensorInterface>(result);
				+	}
			
 
				+    
			
 
				+    // Replaces each element with std::sin(element) converted back to i16.
				+    void Tensor<i16>::sin_()
				+    {
				+        const auto sine = [](i16 v)->i16 { return static_cast<i16>(std::sin(v)); };
				+        apply_(sine);
				+    }
			
 
				+    
			
 
				+    // Copy of the tensor's dimension vector.
				+    DimVector Tensor<i16>::size() const
				+    {
				+        return _dimensions;
				+    }
			
 
				+	
			
 
				+	// Extent of dimension i. Negative i counts from the end (-1 is the last
				+	// dimension). Throws std::runtime_error when i is outside [-rank, rank).
				+	idx_type Tensor<i16>::size(idx_type i) const
				+	{
				+		const idx_type rank = static_cast<idx_type>(_dimensions.size());
				+
				+		if (i >= 0 && i < rank)
				+			return _dimensions[i];
				+
				+		if (i < 0 && i >= -rank)
				+			return _dimensions[rank + i];
				+
				+		throw std::runtime_error("Dimension out of range");
				+	}
			
 
				+    
			
 
				+	// Shared handle to the storage object backing this tensor.
				+	std::shared_ptr<StorageBase<i16>> Tensor<i16>::storage() const
				+	{
				+		return _rep;
				+	}
			
 
				+    
			
 
				+    // Copy of the tensor's stride vector.
				+    DimVector Tensor<i16>::stride() const
				+    {
				+        return _strides;
				+    }
			
 
				+	
			
 
				+	// Stride of dimension i. Negative i counts from the end (-1 is the last
				+	// dimension). Throws std::runtime_error when i is outside [-rank, rank).
				+	idx_type Tensor<i16>::stride(idx_type i) const
				+	{
				+		const idx_type rank = static_cast<idx_type>(_strides.size());
				+
				+		if (i >= 0 && i < rank)
				+			return _strides[i];
				+
				+		if (i < 0 && i >= -rank)
				+			return _strides[rank + i];
				+
				+		throw std::runtime_error("Stride out of range");
				+	}
			
 
				+    
			
 
				+    // Sums every element and returns the total in a newly allocated tensor of shape [1].
				+    TensorInterfacePtr Tensor<i16>::sum() const
				+    {
				+        DimVector shape(1);
				+        shape[0] = 1;
				+        TensorPtr<i16> total(new Tensor<i16>(shape));
				+
				+        // Fold all elements with addition and store the outcome in the single slot.
				+        const i16 folded = reduce_([](i16 lhs, i16 rhs)->i16 { return lhs + rhs; });
				+        total->_rep->data[0] = folded;
				+
				+        return std::dynamic_pointer_cast<TensorInterface>(total);
				+    }
			
 
				+    
			
 
				+    // Renders the tensor as nested bracketed lists. Elements of the innermost
				+    // dimension are separated by "," and sub-lists of outer dimensions by ",\n".
				+    std::string Tensor<i16>::to_string() const
				+    {
				+        // Recursive printer: `dim` is the dimension being walked, `idx` the
				+        // storage index reached so far.
				+        std::function<std::string(const Tensor<i16>&, idx_type, idx_type)> render =
				+            [&render](const Tensor<i16>& t, idx_type dim, idx_type idx)->std::string {
				+            const auto rank = t.size().size();
				+
				+            // Past the last dimension: emit the single element at idx.
				+            if (dim == rank)
				+                return std::to_string(t.data_ptr()[idx]);
				+
				+            const bool innermost = (dim == rank - 1);
				+            const idx_type extent = t.size(dim);
				+            const idx_type step = t.stride(dim);
				+
				+            std::string text;
				+            for (idx_type i = 0; i < extent; ++i, idx += step)
				+            {
				+                if (innermost)
				+                {
				+                    text += render(t, dim + 1, idx);
				+                    if (i != extent - 1)
				+                        text += ",";
				+                }
				+                else
				+                {
				+                    if (i != 0)
				+                        text += ",\n";
				+                    text += "[";
				+                    text += render(t, dim + 1, idx);
				+                    text += "]";
				+                }
				+            }
				+            return text;
				+        };
				+
				+        return "[" + render(*this, 0, offset()) + "]";
				+    }
			
 
				+}
			
--- a/traph/source/tensor/tensor.cpp
+++ b/traph/source/tensor/tensor.cpp
@@ -0,0 +1,179 @@
 
				+#include <traph/tensor/tensor.h>
			
 
				+
			
 
				+
			
 
				+namespace traph
				+{
				+    // Definitions of the generic (unspecialized) Tensor<T> members. Every constructor and member below throws std::runtime_error("No implement").
				+    // public interface; presumably concrete element types are served by explicit specializations such as Tensor<i16> - TODO confirm
				+    template<typename T>
				+    Tensor<T>::Tensor() // default constructor stub
				+        :_rep(new TensorStorage<T>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        throw std::runtime_error("No implement"); // members are initialized first, then construction is aborted
				+    }
				+
				+
				+    template<typename T>
				+    Tensor<T>::Tensor(const DimVector& dimensions) // shape-only constructor stub
				+        :_rep(new TensorStorage<T>),
				+        _dimensions(dimensions), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+
				+    template<typename T>
				+    Tensor<T>::Tensor(const DimVector& dimensions, layout_type order) // shape + order constructor stub
				+        :_rep(new TensorStorage<T>),
				+        _dimensions(dimensions), _offset(0), _strides(), _order(order)
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+
				+    template<typename T>
				+    Tensor<T>::Tensor(const DimVector& dimensions, const DimVector& strides) // shape + strides constructor stub
				+        :_rep(new TensorStorage<T>),
				+        _dimensions(dimensions), _offset(0), _strides(strides), _order(layout_type::column_major)
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+
				+    template<typename T>
				+    Tensor<T>::Tensor(const DimVector& dimensions, const DimVector& strides, layout_type order) // shape + strides + order constructor stub
				+        :_rep(new TensorStorage<T>),
				+        _dimensions(dimensions), _offset(0), _strides(strides), _order(order)
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+
				+    template<typename T>
				+    Tensor<T>::Tensor(const T& t) // scalar constructor stub; `t` is not used
				+        :_rep(new TensorStorage<T>),
				+        _dimensions(), _offset(0), _strides(), _order(layout_type::column_major)
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+
				+    template<typename T>
				+    void Tensor<T>::add_(TensorInterfacePtr other) // stub: always throws
				+    {
				+		throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    void Tensor<T>::apply_(std::function<T(T)> f) // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    TensorInterfacePtr Tensor<T>::clone() const // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    void Tensor<T>::cos_() // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    std::shared_ptr<TensorBase<f32>> Tensor<T>::create_grad() // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    T* Tensor<T>::data_ptr() // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    const T* Tensor<T>::data_ptr() const // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    device_id Tensor<T>::device() { throw std::runtime_error("No implement"); } // stub: always throws
				+    template<typename T>
				+    void Tensor<T>::fill_(T value) // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+
				+    template<typename T>
				+    std::shared_ptr<TensorInterface> Tensor<T>::inverse() const // stub: always throws
				+    {
				+        // Disabled delegation to inverse_impl, kept for reference: return std::dynamic_pointer_cast<TensorInterface>(inverse_impl(*this));
				+		throw std::runtime_error("No implement");
				+    }
				+
				+    template<typename T>
				+    T Tensor<T>::item() const // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    std::shared_ptr<TensorInterface> Tensor<T>::matmul(std::shared_ptr<TensorInterface> mat) const // stub: always throws
				+    {
				+		throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    idx_type Tensor<T>::offset() const { throw std::runtime_error("No implement"); } // stub: always throws
				+    template<typename T>
				+    layout_type Tensor<T>::order() const { throw std::runtime_error("No implement"); } // stub: always throws
				+    template<typename T>
				+    platform_type Tensor<T>::platform() { throw std::runtime_error("No implement"); } // stub: always throws
				+    template<typename T>
				+    T Tensor<T>::reduce_(std::function<T(T,T)> f) const // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    TensorInterfacePtr Tensor<T>::reduce_dim(idx_type dim, std::function<T(T,T)> f) const // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    void Tensor<T>::reshape_(const DimVector& dims) // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    void Tensor<T>::resize_(const DimVector& dims) // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    std::shared_ptr<TensorInterface> Tensor<T>::select(const SliceVector& slice) const // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    void Tensor<T>::sin_() // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    DimVector Tensor<T>::size() const { throw std::runtime_error("No implement"); } // stub: always throws
				+	template<typename T>
				+	idx_type Tensor<T>::size(idx_type i) const // stub: always throws
				+	{ 
				+		throw std::runtime_error("No implement");
				+	}
				+    template<typename T>
				+	std::shared_ptr<StorageBase<T>>  Tensor<T>::storage() const { throw std::runtime_error("No implement"); } // stub: always throws
				+    template<typename T>
				+    DimVector Tensor<T>::stride() const { throw std::runtime_error("No implement"); } // stub: always throws
				+	template<typename T>
				+	idx_type Tensor<T>::stride(idx_type i) const // stub: always throws
				+	{
				+		throw std::runtime_error("No implement");
				+	}
				+    template<typename T>
				+    TensorInterfacePtr Tensor<T>::sum() const // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+    template<typename T>
				+    std::string Tensor<T>::to_string() const // stub: always throws
				+    {
				+        throw std::runtime_error("No implement");
				+    }
				+}