TensorBlock.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// This Source Code Form is subject to the terms of the Mozilla
5// Public License v. 2.0. If a copy of the MPL was not distributed
6// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
8#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
9#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
10
11namespace Eigen {
12namespace internal {
13
14// -------------------------------------------------------------------------- //
15// Forward declarations for templates defined below.
16template <typename Scalar, typename IndexType, int NumDims, int Layout>
17class TensorBlockIO;
18
19// -------------------------------------------------------------------------- //
20// Helper function to compute strides for densely stored buffer of given
21// dimensions.
22
23// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
24// this function instead everywhere.
25template <int Layout, typename IndexType, int NumDims>
26EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
27 const DSizes<IndexType, NumDims>& dimensions) {
28 DSizes<IndexType, NumDims> strides;
29 if (NumDims == 0) return strides;
30
31 // TODO(ezhulenev): Use templates to unroll this loop (similar to
32 // h_array_reduce in CXX11meta.h)? Benchmark it.
33 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
34 strides[0] = 1;
35 for (int i = 1; i < NumDims; ++i) {
36 strides[i] = strides[i - 1] * dimensions[i - 1];
37 }
38 } else {
39 strides[NumDims - 1] = 1;
40 for (int i = NumDims - 2; i >= 0; --i) {
41 strides[i] = strides[i + 1] * dimensions[i + 1];
42 }
43 }
44
45 return strides;
46}
47
48template <int Layout, typename IndexType, size_t NumDims>
49EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
50 const Eigen::array<IndexType, NumDims>& dimensions) {
51 return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
52}
53
54template <int Layout, std::ptrdiff_t... Indices>
55EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
56 const Sizes<Indices...>& sizes) {
57 return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
58}
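//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// dense strides of a [2, 3, 4] tensor depend on the layout.
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   DSizes<Eigen::Index, 3> cm = internal::strides<ColMajor>(dims);  // {1, 2, 6}
//   DSizes<Eigen::Index, 3> rm = internal::strides<RowMajor>(dims);  // {12, 4, 1}
//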
59
60// -------------------------------------------------------------------------- //
61
62// Tensor block shape type defines the shape preference for the blocks
63// extracted from the larger tensor.
64//
65// Example: blocks of 100 elements from the large 100x100 tensor:
66// - tensor: 100x100
67// - target_block_size: 100
68//
69// TensorBlockShapeType:
70// - kUniformAllDims: 100 blocks of size 10x10
71// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
72// or row major layout)
73enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
74
75struct TensorBlockResourceRequirements {
76 TensorBlockShapeType shape_type; // target block shape
77 size_t size; // target block size
78 TensorOpCost cost_per_coeff; // cost of computing a single block element
79
80#ifdef EIGEN_HIPCC
81 // For HIPCC, we need to explicitly declare the constructor that is implicitly
82 // invoked in the "merge" / "any" routines as a "device fun"; otherwise HIPCC
83 // errors out complaining about the lack of a matching constructor.
84 EIGEN_DEVICE_FUNC
85 TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
86 TensorOpCost cost_)
87 : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
88 {}
89#endif
90
91 template <typename Scalar>
92 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
93 TensorBlockShapeType shape_type, size_t size_in_bytes,
94 TensorOpCost cost) {
95 const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
96 return {shape_type, size, cost};
97 }
98
99 template <typename Scalar>
100 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
101 TensorBlockShapeType shape_type, size_t size_in_bytes) {
102 // This default cost per coefficient is valid for most materialized tensor
103 // block evaluation implementations, because they typically just read
104 // coefficients from the underlying tensor storage, and write to the tensor
105 // block buffer (scratch or destination memory, reads and writes have linear
106 // access pattern). We ignore the fixed cost of block evaluation, because in
107 // practice it should be negligible.
108 //
109 // Lazy block evaluation adds the cost of calling a functor for each
110 // coefficient.
111 //
112 // All non-trivial block evaluation implementations must provide their own
113 // cost approximation (e.g. shuffling inner dimension has a much higher cost
114 // because it reads memory randomly, although the total number of moved
115 // bytes is the same).
116 return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
117 {/*bytes_loaded=*/sizeof(Scalar),
118 /*bytes_stored=*/sizeof(Scalar),
119 /*compute_cycles=*/0});
120 }
121
122 template <typename Scalar>
123 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
124 size_t size_in_bytes) {
125 return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
126 size_in_bytes);
127 }
128
129 template <typename Scalar>
130 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
131 size_t size_in_bytes) {
132 return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
133 size_in_bytes);
134 }
135
136 EIGEN_DEVICE_FUNC
137 static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
138 merge(const TensorBlockResourceRequirements& lhs,
139 const TensorBlockResourceRequirements& rhs) {
140 return {merge(lhs.shape_type, rhs.shape_type), // shape_type
141 merge(lhs.size, rhs.size), // size
142 merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
143 }
144
145 EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
146 TensorOpCost cost) {
147 cost_per_coeff += cost;
148 return *this;
149 }
150
151 // This is a resource requirement that should be returned from expressions
152 // that do not have any block evaluation preference (e.g. default tensor
153 // expression with raw buffer access).
154 EIGEN_DEVICE_FUNC
155 static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
156 return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
157 }
158
159 private:
160 using Requirements = TensorBlockResourceRequirements;
161
162 EIGEN_DEVICE_FUNC
163 static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
164 return numext::maxi(lhs_size, rhs_size);
165 }
166
167 EIGEN_DEVICE_FUNC
168 static EIGEN_STRONG_INLINE TensorBlockShapeType
169 merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
170 return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
171 rhs == TensorBlockShapeType::kSkewedInnerDims)
172 ? TensorBlockShapeType::kSkewedInnerDims
173 : TensorBlockShapeType::kUniformAllDims;
174 }
175
176 EIGEN_DEVICE_FUNC
177 static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
178 TensorOpCost rhs_cost) {
179 return lhs_cost + rhs_cost;
180 }
181};
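//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// combining two block evaluation preferences with `merge` keeps the larger
// coefficient count and prefers the skewed shape.
//
//   TensorBlockResourceRequirements lhs =
//       TensorBlockResourceRequirements::skewed<float>(/*size_in_bytes=*/48 * 1024);
//   TensorBlockResourceRequirements rhs =
//       TensorBlockResourceRequirements::uniform<float>(/*size_in_bytes=*/16 * 1024);
//   TensorBlockResourceRequirements req =
//       TensorBlockResourceRequirements::merge(lhs, rhs);
//   // req.shape_type == TensorBlockShapeType::kSkewedInnerDims
//   // req.size == (48 * 1024) / sizeof(float)
//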
182
183// -------------------------------------------------------------------------- //
184// TensorBlockDescriptor specifies a block offset within a tensor and the block
185// sizes along each of the tensor dimensions.
186
187template <int NumDims, typename IndexType = Eigen::Index>
188class TensorBlockDescriptor {
189 public:
190 typedef DSizes<IndexType, NumDims> Dimensions;
191
192 // If we evaluate a Tensor assignment, and the expression on the left already
193 // has a memory buffer, then we might do a performance optimization, and
194 // evaluate the root expression directly into the final output memory. Sometimes
195 // it's possible to reuse it for materializing subexpressions inside an
196 // expression tree, to avoid dynamic memory allocation.
197 //
198 // The pointer type of the underlying storage is erased, because passing
199 // Scalar type through all the expression evaluation layers is way too many
200 // templates. In practice the destination buffer type should always match the
201 // evaluated expression scalar type.
202 class DestinationBuffer {
203 public:
204 enum DestinationBufferKind : int {
205 // The above explicit specification of "int" as the enum basetype is
206 // needed to get around a HIPCC link error ("the field type is not
207 // amp-compatible")
208 // which is issued for class members with the enum type.
209 // TODO(rocm):
210 // remove the "int" basetype once HIPCC has been fixed to not error out
211 // in the above scenario.
212
213 // Destination buffer is not defined (`m_data` == nullptr).
214 kEmpty,
215
216 // Tensor block defined by an owning tensor block descriptor can fit
217 // contiguously into the destination buffer. In this case it's safe to
218 // materialize tensor block in the destination buffer, wrap it in a
219 // TensorMap, and use it to build an Eigen expression on top of it.
220 kContiguous,
221
222 // Destination buffer strides do not match strides of the contiguously
223 // stored block, and it's impossible to define a TensorMap over this
224 // buffer. However, if we are evaluating a root of an expression tree, we
225 // can still materialize an output into this destination, because we can
226 // guarantee that no one will ever access it through block API.
227 //
228 // In theory it is possible to build valid TensorStriding<TensorMap>
229 // expression on top of this destination buffer, however it has
230 // inefficient coeff/packet access, and defeats the purpose of fast block
231 // evaluation API.
232 kStrided
233 };
234
235 template <typename Scalar>
236 Scalar* data() const {
237 eigen_assert(m_data_type_size == sizeof(Scalar));
238 return static_cast<Scalar*>(m_data);
239 }
240
241 const Dimensions& strides() const { return m_strides; }
242 const DestinationBufferKind& kind() const { return m_kind; }
243
244 private:
245 friend class TensorBlockDescriptor;
246
247 DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
248
249 template <typename Scalar>
250 DestinationBuffer(Scalar* data, const Dimensions& strides,
251 DestinationBufferKind kind)
252 : m_data(static_cast<void*>(data)),
253 m_data_type_size(sizeof(Scalar)),
254 m_strides(strides),
255 m_kind(kind) {}
256
257 template <int Layout, typename Scalar>
258 static DestinationBuffer make(const TensorBlockDescriptor& desc,
259 Scalar* data, const Dimensions& strides) {
260 return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
261 }
262
263 template <int Layout>
264 static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
265 const Dimensions& strides) {
266 const Dimensions& desc_dims = desc.dimensions();
267 const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
268 for (int i = 0; i < NumDims; ++i) {
269 if (desc_dims[i] == 1) continue;
270 if (desc_strides[i] != strides[i]) return kStrided;
271 }
272 return kContiguous;
273 }
274
275 // Storage pointer is type erased, to reduce template bloat, but we still
276 // keep the size of the underlying element type for error checking.
277 void* m_data;
278 size_t m_data_type_size;
279
280 // Destination buffer dimensions always match the dimensions of the tensor
281 // block descriptor it belongs to; however, strides might be different.
282 Dimensions m_strides;
283
284 DestinationBufferKind m_kind;
285 };
286
287 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
288 const DestinationBuffer& destination)
289 : m_offset(offset),
290 m_dimensions(dimensions),
291 m_destination(destination) {}
292
293 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
294 : m_offset(offset),
295 m_dimensions(dimensions),
296 m_destination(DestinationBuffer()) {}
297
298 IndexType offset() const { return m_offset; }
299 const Dimensions& dimensions() const { return m_dimensions; }
300 IndexType dimension(int index) const { return m_dimensions[index]; }
301 IndexType size() const { return array_prod<IndexType>(m_dimensions); }
302
303 const DestinationBuffer& destination() const { return m_destination; }
304
305 template <int Layout, typename Scalar>
306 void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
307 eigen_assert(dst_base != NULL);
308 m_destination =
309 DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
310 }
311
312 template <int Layout, typename Scalar, typename DstStridesIndexType>
313 void AddDestinationBuffer(
314 Scalar* dst_base,
315 const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
316 // DSizes constructor will do index type promotion if it's safe.
317 AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
318 }
319
320 TensorBlockDescriptor& DropDestinationBuffer() {
321 m_destination.m_data = NULL;
322 m_destination.m_kind = DestinationBuffer::kEmpty;
323 return *this;
324 }
325
326 bool HasDestinationBuffer() const {
327 return m_destination.kind() != DestinationBuffer::kEmpty;
328 }
329
330 // Returns a copy of `*this` with updated offset.
331 TensorBlockDescriptor WithOffset(IndexType offset) const {
332 return TensorBlockDescriptor(offset, m_dimensions, m_destination);
333 }
334
335 private:
336 // Offset and dimensions are immutable after construction. Block descriptor
337 // can only be mutated by adding or dropping destination.
338 const IndexType m_offset;
339 const Dimensions m_dimensions;
340 DestinationBuffer m_destination;
341};
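//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// describe a 2x3 block at linear offset 10 and attach a destination buffer
// whose strides match the dense block strides, so it is kContiguous.
//
//   typedef TensorBlockDescriptor<2> BlockDesc;
//   BlockDesc::Dimensions dims(2, 3);
//   BlockDesc desc(/*offset=*/10, dims);
//   float out[6];  // destination buffer
//   desc.AddDestinationBuffer<ColMajor>(out, internal::strides<ColMajor>(dims));
//   // desc.destination().kind() == BlockDesc::DestinationBuffer::kContiguous
//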
342
343// -------------------------------------------------------------------------- //
344// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
345
346template <int NumDims, int Layout, typename IndexType = Eigen::Index>
347class TensorBlockMapper {
348 typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
349
350 public:
351 typedef DSizes<IndexType, NumDims> Dimensions;
352
353 TensorBlockMapper() = default;
354 TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
355 const TensorBlockResourceRequirements& requirements)
356 : m_tensor_dimensions(dimensions), m_requirements(requirements) {
357 // Compute block dimensions and the total number of blocks.
358 InitializeBlockDimensions();
359 }
360
361 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
362 return m_total_block_count;
363 }
364
365 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
366 return m_block_dimensions.TotalSize();
367 }
368
369 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
370 blockDimensions() const {
371 return m_block_dimensions;
372 }
373
374 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
375 blockDescriptor(IndexType block_index) const {
376 static const bool isColMajor = Layout == static_cast<int>(ColMajor);
377
378 IndexType offset = 0;
379 DSizes<IndexType, NumDims> dimensions = m_block_dimensions;
380
381 if (NumDims == 0) return BlockDescriptor(offset, dimensions);
382
383 // Iterate outer -> inner dimensions.
384 for (int i = NumDims - 1; i >= 0; --i) {
385 const int dim = isColMajor ? i : NumDims - i - 1;
386
387 const IndexType idx = block_index / m_block_strides[dim];
388 block_index -= idx * m_block_strides[dim];
389
390 const IndexType coord = idx * m_block_dimensions[dim];
391 dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
392 m_block_dimensions[dim]);
393 offset += coord * m_tensor_strides[dim];
394 }
395
396 return {offset, dimensions};
397 }
398
399 private:
400 void InitializeBlockDimensions() {
401 // Requested block shape and size.
402 const TensorBlockShapeType shape_type = m_requirements.shape_type;
403 IndexType target_block_size =
404 numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
405
406 IndexType tensor_size = m_tensor_dimensions.TotalSize();
407
408 // Corner case: one of the dimensions is zero. Logic below is too complex
409 // to handle this case on a general basis, just use unit block size.
410 // Note: we must not yield blocks with zero dimensions (recipe for
411 // overflows/underflows, divisions by zero and NaNs later).
412 if (tensor_size == 0) {
413 for (int i = 0; i < NumDims; ++i) {
414 m_block_dimensions[i] = 1;
415 }
416 m_total_block_count = 0;
417 return;
418 }
419
420 // If tensor fits into a target block size, evaluate it as a single block.
421 if (tensor_size <= target_block_size) {
422 m_block_dimensions = m_tensor_dimensions;
423 m_total_block_count = 1;
424 // The only valid block index is `0`, and in this case we do not need
425 // to compute real strides for tensor or blocks (see blockDescriptor).
426 for (int i = 0; i < NumDims; ++i) {
427 m_tensor_strides[i] = 0;
428 m_block_strides[i] = 1;
429 }
430 return;
431 }
432
433 static const bool isColMajor = Layout == static_cast<int>(ColMajor);
434
435 // Block shape skewed towards inner dimension.
436 if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
437 IndexType coeff_to_allocate = target_block_size;
438
439 for (int i = 0; i < NumDims; ++i) {
440 const int dim = isColMajor ? i : NumDims - i - 1;
441 m_block_dimensions[dim] =
442 numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
443 coeff_to_allocate = divup(
444 coeff_to_allocate,
445 numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
446 }
447 eigen_assert(coeff_to_allocate == 1);
448
449 } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
450 // Tensor will not fit within 'target_block_size' budget: calculate tensor
451 // block dimension sizes based on "square" dimension size target.
452 const IndexType dim_size_target = convert_index<IndexType>(
453 std::pow(static_cast<float>(target_block_size),
454 1.0f / static_cast<float>(m_block_dimensions.rank())));
455
456 for (int i = 0; i < NumDims; ++i) {
457 // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
458 // a multiple of the packet size. Note that reducing
459 // 'block_dim_size' in this manner can increase the number of
460 // blocks, and so will amplify any per-block overhead.
461 m_block_dimensions[i] =
462 numext::mini(dim_size_target, m_tensor_dimensions[i]);
463 }
464
465 // Add any un-allocated coefficients to inner dimension(s).
466 IndexType total_size = m_block_dimensions.TotalSize();
467 for (int i = 0; i < NumDims; ++i) {
468 const int dim = isColMajor ? i : NumDims - i - 1;
469
470 if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
471 const IndexType total_size_other_dims =
472 total_size / m_block_dimensions[dim];
473 const IndexType alloc_avail =
474 divup<IndexType>(target_block_size, total_size_other_dims);
475 if (alloc_avail == m_block_dimensions[dim]) {
476 // Insufficient excess coefficients to allocate.
477 break;
478 }
479 m_block_dimensions[dim] =
480 numext::mini(m_tensor_dimensions[dim], alloc_avail);
481 total_size = total_size_other_dims * m_block_dimensions[dim];
482 }
483 }
484
485 } else {
486 eigen_assert(false); // unknown block shape
487 }
488
489 eigen_assert(m_block_dimensions.TotalSize() >=
490 numext::mini<IndexType>(target_block_size,
491 m_tensor_dimensions.TotalSize()));
492
493 // Calculate block counts by dimension and total block count.
494 DSizes<IndexType, NumDims> block_count;
495 for (int i = 0; i < NumDims; ++i) {
496 block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
497 }
498 m_total_block_count = array_prod(block_count);
499
500 // Calculate block strides (used for enumerating blocks).
501 m_tensor_strides = strides<Layout>(m_tensor_dimensions);
502 m_block_strides = strides<Layout>(block_count);
503 }
504
505 DSizes<IndexType, NumDims> m_tensor_dimensions;
506 TensorBlockResourceRequirements m_requirements;
507
508 DSizes<IndexType, NumDims> m_block_dimensions;
509 IndexType m_total_block_count;
510
511 DSizes<IndexType, NumDims> m_tensor_strides;
512 DSizes<IndexType, NumDims> m_block_strides;
513};
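//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// split a 100x100 float tensor into blocks of at most 2500 coefficients and
// enumerate them.
//
//   DSizes<Eigen::Index, 2> dims(100, 100);
//   TensorBlockMapper<2, ColMajor> mapper(
//       dims, TensorBlockResourceRequirements::uniform<float>(2500 * sizeof(float)));
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(i);
//     // desc.offset() and desc.dimensions() identify block `i` within `dims`.
//   }
//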
514
515// -------------------------------------------------------------------------- //
516// TensorBlockScratchAllocator is responsible for allocating temporary buffers
517// for block evaluation (output or input block materialization). Given that
518// Eigen expression traversal order is deterministic, all temporary allocations
519// happen in the same order, and usually have exactly the same size.
520// The scratch allocator keeps track of all dynamic allocations, and after the
521// first block evaluation is completed, we should be able to reuse all the
522// temporary buffers for the next block evaluation.
523
524template <typename Device>
525class TensorBlockScratchAllocator {
526 public:
527 explicit TensorBlockScratchAllocator(const Device& device)
528 : m_device(device), m_allocation_index(0) {}
529
530 ~TensorBlockScratchAllocator() {
531 for (size_t i = 0; i < m_allocations.size(); ++i) {
532 m_device.deallocate(m_allocations[i].ptr);
533 }
534 }
535
536 void* allocate(size_t size) {
537 // TODO(ezhulenev): Remove when replaced with inlined vector.
538 if (m_allocations.capacity() == 0) m_allocations.reserve(8);
539
540 // Check if we already have an existing allocation at the current index.
541 const int num_allocations = static_cast<int>(m_allocations.size());
542 const bool has_allocation = m_allocation_index < num_allocations;
543
544 // Allocation index can't be larger than the number of allocations.
545 eigen_assert(m_allocation_index <= num_allocations);
546
547 // If we have an existing allocation, and its size is larger than or equal
548 // to the requested size, we do nothing.
549
550 // If current allocation can't fit requested size, we deallocate it, and
551 // replace with a larger allocation.
552 if (has_allocation && m_allocations[m_allocation_index].size < size) {
553 m_device.deallocate(m_allocations[m_allocation_index].ptr);
554 m_allocations[m_allocation_index].ptr = m_device.allocate(size);
555 m_allocations[m_allocation_index].size = size;
556 }
557
558 // Make a new allocation if we don't have an existing one.
559 if (!has_allocation) {
560 Allocation allocation;
561 allocation.ptr = m_device.allocate(size);
562 allocation.size = size;
563 m_allocations.push_back(allocation);
564 }
565
566 eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
567 eigen_assert(m_allocations[m_allocation_index].size >= size);
568
569 return m_allocations[m_allocation_index++].ptr;
570 }
571
572 void reset() { m_allocation_index = 0; }
573
574 private:
575 struct Allocation {
576 void* ptr;
577 size_t size;
578 };
579
580 const Device& m_device;
581 int m_allocation_index;
582 // TODO(ezhulenev): This should be an inlined vector.
583 std::vector<Allocation> m_allocations;
584};
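//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// after reset() the allocator hands out the buffers acquired during the
// previous block evaluation again, instead of allocating new ones.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   void* buf0 = scratch.allocate(1024);  // fresh device allocation
//   scratch.reset();
//   void* buf1 = scratch.allocate(1024);  // reuses the first buffer: buf0 == buf1
//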
585
586// -------------------------------------------------------------------------- //
587// TensorBlockKind represents all possible block kinds that can be produced by
588// the TensorEvaluator::evalBlock function.
589enum TensorBlockKind {
590 // Tensor block that is a lazy expression that must be assigned to a
591 // destination using TensorBlockAssign.
592 kExpr,
593
594 // Tensor block that is a view into a memory buffer owned by an underlying
595 // Tensor expression (e.g. it can be a view into a Tensor buffer).
596 kView,
597
598 // Tensor block that was materialized in a scratch memory buffer, allocated
599 // with TensorBlockScratchAllocator. This block must be copied to a
600 // destination, similar to a block of `kExpr` type.
601 kMaterializedInScratch,
602
603 // Tensor block that was materialized directly into the final output memory
604 // buffer. For example if the left side of an assignment is a Tensor, we can
605 // directly materialize the block in the destination memory.
606 //
607 // If strides in the output buffer do not match tensor block strides, the
608 // Tensor expression will be invalid, and should not be used by
609 // TensorBlockAssign or for constructing another block expression.
610 kMaterializedInOutput
611};
612
613// -------------------------------------------------------------------------- //
614// TensorBlockNotImplemented should be used to define the TensorBlock typedef in
615// TensorEvaluators that do not support block evaluation.
616
617class TensorBlockNotImplemented {
618 public:
619 typedef void XprType;
620};
621
622// -------------------------------------------------------------------------- //
623// XprScalar extracts Scalar type from the Eigen expressions (if expression type
624// is not void). It's required to be able to define lazy block expression for
625// argument types that do not support block evaluation.
626
627template <typename XprType>
628struct XprScalar {
629 typedef typename XprType::Scalar type;
630};
631template <>
632struct XprScalar<void> {
633 typedef void type;
634};
635
636// -------------------------------------------------------------------------- //
637// TensorMaterializedBlock is a fully evaluated block of the original tensor,
638// and XprType is just a TensorMap over the data. This block type is typically
639// used to materialize blocks of tensor expressions that can't be efficiently
640// represented as lazy Tensor expressions with fast coeff/packet operations,
641// e.g. we materialize all broadcasts into evaluated blocks.
642//
643// TensorMaterializedBlock does not own its memory buffer; it's either a memory
644// buffer that backs the original expression (e.g. block is just a view into a
645// Tensor), or a memory buffer allocated with scratch allocator, and in this
646// case the scratch allocator will deallocate it at the end of block based
647// expression execution.
648//
649// If the block was evaluated directly into the output buffer, and strides in
650// the output buffer do not match block strides, the TensorMap expression will
651// be invalid, and should never be used in block assignment or any other tensor
652// expression.
653
654template <typename Scalar, int NumDims, int Layout,
655 typename IndexType = Eigen::Index>
656class TensorMaterializedBlock {
657 public:
658 typedef DSizes<IndexType, NumDims> Dimensions;
659 typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
660
661 TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
662 const Dimensions& dimensions, bool valid_expr = true)
663 : m_kind(kind),
664 m_data(data),
665 m_dimensions(dimensions),
666 m_expr(m_data, m_dimensions),
667 m_valid_expr(valid_expr) {
668 eigen_assert(m_kind == internal::TensorBlockKind::kView ||
669 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
670 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
671 }
672
673 TensorBlockKind kind() const { return m_kind; }
674 // NOTE(ezhulenev): Returning XprType by value like in other block types
675 // causes asan failures. The theory is that XprType::Nested doesn't work
676 // properly for TensorMap.
677 const XprType& expr() const {
678 eigen_assert(m_valid_expr);
679 return m_expr;
680 }
681 const Scalar* data() const { return m_data; }
682 void cleanup() {}
683
684 typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
685
686 // TensorMaterializedBlock can be backed by different types of storage:
687 //
688 // (1) Contiguous block of memory allocated with scratch allocator.
689 // (2) Contiguous block of memory reused from tensor block descriptor
690 // destination buffer.
691 // (3) Strided block of memory reused from tensor block descriptor
692 // destination buffer.
693 //
694 class Storage {
695 public:
696 Scalar* data() const { return m_data; }
697 const Dimensions& dimensions() const { return m_dimensions; }
698 const Dimensions& strides() const { return m_strides; }
699
700 TensorMaterializedBlock AsTensorMaterializedBlock() const {
701 return TensorMaterializedBlock(
702 m_materialized_in_output
703 ? internal::TensorBlockKind::kMaterializedInOutput
704 : internal::TensorBlockKind::kMaterializedInScratch,
705 m_data, m_dimensions, !m_strided_storage);
706 }
707
708 private:
709 friend class TensorMaterializedBlock;
710
711 Storage(Scalar* data, const Dimensions& dimensions,
712 const Dimensions& strides, bool materialized_in_output,
713 bool strided_storage)
714 : m_data(data),
715 m_dimensions(dimensions),
716 m_strides(strides),
717 m_materialized_in_output(materialized_in_output),
718 m_strided_storage(strided_storage) {}
719
720 Scalar* m_data;
721 Dimensions m_dimensions;
722 Dimensions m_strides;
723 bool m_materialized_in_output;
724 bool m_strided_storage;
725 };
726
727 // Creates a storage for materialized block either from the block descriptor
728 // destination buffer, or allocates a new buffer with scratch allocator.
729 template <typename TensorBlockScratch>
730 static EIGEN_STRONG_INLINE Storage prepareStorage(
731 TensorBlockDesc& desc, TensorBlockScratch& scratch,
732 bool allow_strided_storage = false) {
733 // Try to reuse destination as an output block buffer.
734 typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
735
736 if (desc.destination().kind() == DestinationBuffer::kContiguous) {
737 Scalar* buffer = desc.destination().template data<Scalar>();
738 desc.DropDestinationBuffer();
739 return Storage(buffer, desc.dimensions(),
740 internal::strides<Layout>(desc.dimensions()),
741 /*materialized_in_output=*/true,
742 /*strided_storage=*/false);
743
744 } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
745 allow_strided_storage) {
746 Scalar* buffer = desc.destination().template data<Scalar>();
747 desc.DropDestinationBuffer();
748 return Storage(buffer, desc.dimensions(), desc.destination().strides(),
749 /*materialized_in_output=*/true, /*strided_storage=*/true);
750
751 } else {
752 void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
753 return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
754 internal::strides<Layout>(desc.dimensions()),
755 /*materialized_in_output=*/false,
756 /*strided_storage=*/false);
757 }
758 }
759
760 // Creates a materialized block for the given descriptor from a memory buffer.
761 template <typename DataDimensions, typename TensorBlockScratch>
762 static EIGEN_STRONG_INLINE TensorMaterializedBlock materialize(
763 const Scalar* data, const DataDimensions& data_dims,
764 TensorBlockDesc& desc, TensorBlockScratch& scratch) {
765 eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
766
767 // If the tensor block dimensions cover a contiguous block of the underlying
768 // memory, we can skip block buffer memory allocation, and construct a block
769 // from existing `data` memory buffer.
770 //
771 // Example: (RowMajor layout)
772 // data_dims: [11, 12, 13, 14]
773 // desc.dimensions(): [1, 1, 3, 14]
774 //
775 // In this case we can construct a TensorBlock starting at
776 // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
777 static const bool is_col_major = Layout == ColMajor;
778
779 // Find out how many inner dimensions have a matching size.
780 int num_matching_inner_dims = 0;
781 for (int i = 0; i < NumDims; ++i) {
782 int dim = is_col_major ? i : NumDims - i - 1;
783 if (data_dims[dim] != desc.dimensions()[dim]) break;
784 ++num_matching_inner_dims;
785 }
786
787 // All the outer dimensions must be of size `1`, except a single dimension
788 // before the matching inner dimension (`3` in the example above).
789 bool can_use_direct_access = true;
790 for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
791 int dim = is_col_major ? i : NumDims - i - 1;
792 if (desc.dimension(dim) != 1) {
793 can_use_direct_access = false;
794 break;
795 }
796 }
797
798 if (can_use_direct_access) {
799 const Scalar* block_start = data + desc.offset();
800 return TensorMaterializedBlock(internal::TensorBlockKind::kView,
801 block_start, desc.dimensions());
802
803 } else {
804 // Reuse destination buffer or allocate new buffer with scratch allocator.
805 const Storage storage = prepareStorage(desc, scratch);
806
807 typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
808 TensorBlockIO;
809 typedef typename TensorBlockIO::Dst TensorBlockIODst;
810 typedef typename TensorBlockIO::Src TensorBlockIOSrc;
811
812 TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
813 data, desc.offset());
814 TensorBlockIODst dst(storage.dimensions(), storage.strides(),
815 storage.data());
816
817 TensorBlockIO::Copy(dst, src);
818 return storage.AsTensorMaterializedBlock();
819 }
820 }
821
822 private:
823 TensorBlockKind m_kind;
824 const Scalar* m_data;
825 Dimensions m_dimensions;
826 XprType m_expr;
827 bool m_valid_expr;
828};
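//
// Usage sketch (illustrative addition, not part of the original Eigen header;
// `data` stands for a hypothetical pointer to a 100x100 ColMajor float
// buffer): a block that is a contiguous prefix of the buffer becomes a kView
// and no copy is made, otherwise the data is copied via TensorBlockIO.
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> Block;
//   DSizes<Eigen::Index, 2> data_dims(100, 100);
//   TensorBlockDescriptor<2> desc(/*offset=*/0, DSizes<Eigen::Index, 2>(100, 10));
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   Block block = Block::materialize(data, data_dims, desc, scratch);
//   // block.kind() == internal::TensorBlockKind::kView
//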
829
830// -------------------------------------------------------------------------- //
831// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
832// functor to the blocks produced by the underlying Tensor expression.
833
834template <typename UnaryOp, typename ArgTensorBlock>
835class TensorCwiseUnaryBlock {
836 static const bool NoArgBlockAccess =
837 internal::is_void<typename ArgTensorBlock::XprType>::value;
838
839 public:
840 typedef typename conditional<
841 NoArgBlockAccess, void,
842 TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::type
843 XprType;
844
845 typedef typename XprScalar<XprType>::type Scalar;
846
847 TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
848 : m_arg_block(arg_block), m_functor(functor) {}
849
850 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
851
852 XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
853 const Scalar* data() const { return NULL; }
854 void cleanup() { m_arg_block.cleanup(); }
855
856 private:
857 ArgTensorBlock m_arg_block;
858 UnaryOp m_functor;
859};
860
861// -------------------------------------------------------------------------- //
862// TensorCwiseBinaryBlock is a lazy tensor expression block that applies BinaryOp
863// functor to the blocks produced by the underlying Tensor expression.
864
865template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
866class TensorCwiseBinaryBlock {
867 static const bool NoArgBlockAccess =
868 internal::is_void<typename LhsTensorBlock::XprType>::value ||
869 internal::is_void<typename RhsTensorBlock::XprType>::value;
870
871 public:
872 typedef typename conditional<
873 NoArgBlockAccess, void,
874 TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
875 const typename RhsTensorBlock::XprType> >::type
876 XprType;
877
878 typedef typename XprScalar<XprType>::type Scalar;
879
880 TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
881 const RhsTensorBlock& right_block,
882 const BinaryOp& functor)
883 : m_left_block(left_block),
884 m_right_block(right_block),
885 m_functor(functor) {}
886
887 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
888
889 XprType expr() const {
890 return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
891 }
892
893 const Scalar* data() const { return NULL; }
894
895 void cleanup() {
896 m_left_block.cleanup();
897 m_right_block.cleanup();
898 }
899
900 private:
901 LhsTensorBlock m_left_block;
902 RhsTensorBlock m_right_block;
903 BinaryOp m_functor;
904};
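//
// Usage sketch (illustrative addition, not part of the original Eigen header;
// `LhsBlock`/`RhsBlock` stand for hypothetical blocks produced by two argument
// evaluators): the binary block stays lazy, expr() builds the coefficient-wise
// expression on demand.
//
//   TensorCwiseBinaryBlock<internal::scalar_sum_op<float>, LhsBlock, RhsBlock>
//       block(lhs_block, rhs_block, internal::scalar_sum_op<float>());
//   // block.data() == NULL; block.expr() is the sum of the argument exprs.
//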
905
906// -------------------------------------------------------------------------- //
907// TensorUnaryExprBlock is a lazy tensor expression block that can construct
908// an arbitrary tensor expression from a block of the underlying type (this is a
909// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
910
911template <typename BlockFactory, typename ArgTensorBlock>
912class TensorUnaryExprBlock {
913 typedef typename ArgTensorBlock::XprType ArgXprType;
914 static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
915
916 public:
917 typedef typename conditional<
918 NoArgBlockAccess, void,
919 typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
920
921 typedef typename XprScalar<XprType>::type Scalar;
922
923 TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
924 const BlockFactory& factory)
925 : m_arg_block(arg_block), m_factory(factory) {}
926
927 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
928 XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
929 const Scalar* data() const { return NULL; }
930 void cleanup() { m_arg_block.cleanup(); }
931
932 private:
933 ArgTensorBlock m_arg_block;
934 BlockFactory m_factory;
935};
936
937// -------------------------------------------------------------------------- //
938// TensorTernaryExprBlock is a lazy tensor expression block that can construct
939// an arbitrary tensor expression from three blocks of the underlying type.
940
941template <typename BlockFactory, typename Arg1TensorBlock,
942 typename Arg2TensorBlock, typename Arg3TensorBlock>
943class TensorTernaryExprBlock {
944 typedef typename Arg1TensorBlock::XprType Arg1XprType;
945 typedef typename Arg2TensorBlock::XprType Arg2XprType;
946 typedef typename Arg3TensorBlock::XprType Arg3XprType;
947
948 static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
949 internal::is_void<Arg2XprType>::value ||
950 internal::is_void<Arg3XprType>::value;
951
952 public:
953 typedef typename conditional<
954 NoArgBlockAccess, void,
955 typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
956 Arg3XprType>::type>::type XprType;
957
958 typedef typename XprScalar<XprType>::type Scalar;
959
960 TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
961 const Arg2TensorBlock& arg2_block,
962 const Arg3TensorBlock& arg3_block,
963 const BlockFactory& factory)
964 : m_arg1_block(arg1_block),
965 m_arg2_block(arg2_block),
966 m_arg3_block(arg3_block),
967 m_factory(factory) {}
968
969 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
970 XprType expr() const {
971 return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
972 m_arg3_block.expr());
973 }
974 const Scalar* data() const { return NULL; }
975 void cleanup() {
976 m_arg1_block.cleanup();
977 m_arg2_block.cleanup();
978 m_arg3_block.cleanup();
979 }
980
981 private:
982 Arg1TensorBlock m_arg1_block;
983 Arg2TensorBlock m_arg2_block;
984 Arg3TensorBlock m_arg3_block;
985 BlockFactory m_factory;
986};
987
988// -------------------------------------------------------------------------- //
989// StridedLinearBufferCopy provides a method to copy data between two linear
990// buffers with different strides, with optimized paths for scatter/gather.
991
992template <typename Scalar, typename IndexType>
993class StridedLinearBufferCopy {
994 typedef typename packet_traits<Scalar>::type Packet;
995 enum {
996 Vectorizable = packet_traits<Scalar>::Vectorizable,
997 PacketSize = packet_traits<Scalar>::size
998 };
999
1000 public:
1001 // Specifying linear copy kind statically gives ~30% speedup for small sizes.
1002 enum class Kind {
1003 Linear = 0, // src_stride == 1 && dst_stride == 1
1004 Scatter = 1, // src_stride == 1 && dst_stride != 1
1005 FillLinear = 2, // src_stride == 0 && dst_stride == 1
1006 FillScatter = 3, // src_stride == 0 && dst_stride != 1
1007 Gather = 4, // dst_stride == 1
1008 Random = 5 // everything else
1009 };
1010
1011 struct Dst {
1012 Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
1013
1014 IndexType offset;
1015 IndexType stride;
1016 Scalar* data;
1017 };
1018
1019 struct Src {
1020 Src(IndexType o, IndexType s, const Scalar* d)
1021 : offset(o), stride(s), data(d) {}
1022
1023 IndexType offset;
1024 IndexType stride;
1025 const Scalar* data;
1026 };
1027
1028 template <typename StridedLinearBufferCopy::Kind kind>
1029 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
1030 const Src& src,
1031 const size_t count) {
1032 Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
1033 src.data);
1034 }
1035
1036 private:
1037 template <typename StridedLinearBufferCopy::Kind kind>
1038 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1039 const IndexType count, const IndexType dst_offset,
1040 const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
1041 const IndexType src_offset, const IndexType src_stride,
1042 const Scalar* EIGEN_RESTRICT src_data) {
1043 const Scalar* src = &src_data[src_offset];
1044 Scalar* dst = &dst_data[dst_offset];
1045
1046 if (!Vectorizable) {
1047 for (Index i = 0; i < count; ++i) {
1048 dst[i * dst_stride] = src[i * src_stride];
1049 }
1050 return;
1051 }
1052
1053 const IndexType vectorized_size = count - PacketSize;
1054 IndexType i = 0;
1055
1056 if (kind == StridedLinearBufferCopy::Kind::Linear) {
1057 // ******************************************************************** //
1058 // Linear copy from `src` to `dst`.
1059 const IndexType unrolled_size = count - 4 * PacketSize;
1060 eigen_assert(src_stride == 1 && dst_stride == 1);
1061 for (; i <= unrolled_size; i += 4 * PacketSize) {
1062 for (int j = 0; j < 4; ++j) {
1063 Packet p = ploadu<Packet>(src + i + j * PacketSize);
1064 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1065 }
1066 }
1067 for (; i <= vectorized_size; i += PacketSize) {
1068 Packet p = ploadu<Packet>(src + i);
1069 pstoreu<Scalar, Packet>(dst + i, p);
1070 }
1071 for (; i < count; ++i) {
1072 dst[i] = src[i];
1073 }
1074 // ******************************************************************** //
1075 } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
1076 // Scatter from `src` to `dst`.
1077 eigen_assert(src_stride == 1 && dst_stride != 1);
1078 for (; i <= vectorized_size; i += PacketSize) {
1079 Packet p = ploadu<Packet>(src + i);
1080 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1081 }
1082 for (; i < count; ++i) {
1083 dst[i * dst_stride] = src[i];
1084 }
1085 // ******************************************************************** //
1086 } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
1087 // Fill `dst` with value at `*src`.
1088 eigen_assert(src_stride == 0 && dst_stride == 1);
1089 const IndexType unrolled_size = count - 4 * PacketSize;
1090 Packet p = pload1<Packet>(src);
1091 for (; i <= unrolled_size; i += 4 * PacketSize) {
1092 for (int j = 0; j < 4; ++j) {
1093 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1094 }
1095 }
1096 for (; i <= vectorized_size; i += PacketSize) {
1097 pstoreu<Scalar, Packet>(dst + i, p);
1098 }
1099 for (; i < count; ++i) {
1100 dst[i] = *src;
1101 }
1102 // ******************************************************************** //
1103 } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
1104 // Scatter `*src` into `dst`.
1105 eigen_assert(src_stride == 0 && dst_stride != 1);
1106 Packet p = pload1<Packet>(src);
1107 for (; i <= vectorized_size; i += PacketSize) {
1108 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1109 }
1110 for (; i < count; ++i) {
1111 dst[i * dst_stride] = *src;
1112 }
1113 // ******************************************************************** //
1114 } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
1115 // Gather from `src` into `dst`.
1116 eigen_assert(dst_stride == 1);
1117 for (; i <= vectorized_size; i += PacketSize) {
1118 Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
1119 pstoreu<Scalar, Packet>(dst + i, p);
1120 }
1121 for (; i < count; ++i) {
1122 dst[i] = src[i * src_stride];
1123 }
1124 // ******************************************************************** //
1125 } else if (kind == StridedLinearBufferCopy::Kind::Random) {
1126 // Random.
1127 for (; i < count; ++i) {
1128 dst[i * dst_stride] = src[i * src_stride];
1129 }
1130 } else {
1131 eigen_assert(false);
1132 }
1133 }
1134};
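//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// scatter four consecutive floats into every other slot of `dst`.
//
//   float src[4] = {1.f, 2.f, 3.f, 4.f};
//   float dst[8] = {0.f};
//   typedef StridedLinearBufferCopy<float, Eigen::Index> LinCopy;
//   LinCopy::Run<LinCopy::Kind::Scatter>(
//       LinCopy::Dst(/*offset=*/0, /*stride=*/2, dst),
//       LinCopy::Src(/*offset=*/0, /*stride=*/1, src), /*count=*/4);
//   // dst == {1, 0, 2, 0, 3, 0, 4, 0}
//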
1135
1136// -------------------------------------------------------------------------- //
1137// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor block.
1138// It's possible to specify src->dst dimension mapping for the copy operation.
1139// Dimensions of `dst` specify how many elements have to be copied; for the
1140// `src` we only need to know the strides to navigate through the source memory buffer.
1141
1142template <typename Scalar, typename IndexType, int NumDims, int Layout>
1143class TensorBlockIO {
1144 static const bool IsColMajor = (Layout == ColMajor);
1145
1146 typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
1147
1148 public:
1149 typedef DSizes<IndexType, NumDims> Dimensions;
1150 typedef DSizes<int, NumDims> DimensionsMap;
1151
1152 struct Dst {
1153 Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
1154 IndexType dst_offset = 0)
1155 : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
1156
1157 Dimensions dims;
1158 Dimensions strides;
1159 Scalar* data;
1160 IndexType offset;
1161 };
1162
1163 struct Src {
1164 Src(const Dimensions& src_strides, const Scalar* src,
1165 IndexType src_offset = 0)
1166 : strides(src_strides), data(src), offset(src_offset) {}
1167
1168 Dimensions strides;
1169 const Scalar* data;
1170 IndexType offset;
1171 };
1172
1173 // Copies data to `dst` from `src`, using provided dimensions mapping:
1174 //
1175 // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
1176 //
1177 // Returns the number of copied elements.
1178 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
1179 const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
1180 // Copy single scalar value from `src` to `dst`.
1181 if (NumDims == 0) {
1182 *(dst.data + dst.offset) = *(src.data + src.offset);
1183 return 1;
1184 }
1185
1186 // Both `dst` and `src` must have contiguous innermost dimension. We also
1187 // accept the special case with stride '0', because it's used as a trick to
1188 // implement broadcasting.
1189 {
1190 int inner_dim = IsColMajor ? 0 : NumDims - 1;
1191 EIGEN_UNUSED_VARIABLE(inner_dim);
1192 eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
1193 eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
1194 }
1195
1196 // Give a shorter name to `dst_to_src_dim_map`.
1197 const DimensionsMap& dim_map = dst_to_src_dim_map;
1198
1199 // Do not squeeze reordered inner dimensions.
1200 int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
1201
1202 // NOTE: We find the innermost dimension (contiguous in memory) in the dst
1203 // block, and we write data linearly into that dimension, reading it from
1204 // the src. If dimensions are reordered, we might end up reading data from
1205 // the src with `stride != 1`.
1206 //
1207 // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
1208 // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
1209
1210 // Find the innermost dimension in the dst whose size is not 1. This is the
1211 // effective inner dim.
1212 int num_size_one_inner_dims = 0;
1213 for (int i = 0; i < num_squeezable_dims; ++i) {
1214 const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1215 if (dst.dims[dst_dim] != 1) break;
1216 num_size_one_inner_dims++;
1217 }
1218
1219 // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
1220 if (num_size_one_inner_dims == NumDims) {
1221 *(dst.data + dst.offset) = *(src.data + src.offset);
1222 return 1;
1223 }
1224
1225 // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
1226 const int dst_stride1_dim = IsColMajor
1227 ? num_size_one_inner_dims
1228 : NumDims - num_size_one_inner_dims - 1;
1229
1230 // Dimension in the src that corresponds to the dst innermost dimension.
1231 const int src_dim_for_dst_stride1_dim =
1232 NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
1233
1234 // Size of the innermost dimension (length of contiguous blocks of memory).
1235 IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
1236
1237 // Squeeze multiple inner dims into one if they are contiguous in `dst` and
1238 // `src` memory, so we can do fewer linear copy calls.
1239 for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
1240 const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1241 const IndexType dst_stride = dst.strides[dst_dim];
1242 const IndexType src_stride = src.strides[dim_map[dst_dim]];
1243 if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
1244 dst_inner_dim_size *= dst.dims[dst_dim];
1245 ++num_size_one_inner_dims;
1246 } else {
1247 break;
1248 }
1249 }
1250
1251 // Setup strides to read data from `src` and write to `dst`.
1252 IndexType input_offset = src.offset;
1253 IndexType output_offset = dst.offset;
1254 IndexType input_stride =
1255 NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
1256 IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
1257
1258 const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
1259 array<BlockIteratorState, at_least_1_dim> it;
1260
1261 // Initialize block iterator state. Squeeze away any dimension of size 1.
1262 int idx = 0; // currently initialized iterator state index
1263 for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
1264 const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
1265 if (dst.dims[dst_dim] == 1) continue;
1266
1267 it[idx].size = dst.dims[dst_dim];
1268 it[idx].input_stride = src.strides[dim_map[dst_dim]];
1269 it[idx].output_stride = dst.strides[dst_dim];
1270
1271 it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
1272 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1273
1274 idx++;
1275 }
1276
1277 // Iterate copying data from src to dst.
1278 const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
1279
1280#define COPY_INNER_DIM(KIND) \
1281 IndexType num_copied = 0; \
1282 for (num_copied = 0; num_copied < block_total_size; \
1283 num_copied += dst_inner_dim_size) { \
1284 LinCopy::template Run<KIND>( \
1285 typename LinCopy::Dst(output_offset, output_stride, dst.data), \
1286 typename LinCopy::Src(input_offset, input_stride, src.data), \
1287 dst_inner_dim_size); \
1288 \
1289 for (int j = 0; j < idx; ++j) { \
1290 if (++it[j].count < it[j].size) { \
1291 input_offset += it[j].input_stride; \
1292 output_offset += it[j].output_stride; \
1293 break; \
1294 } \
1295 it[j].count = 0; \
1296 input_offset -= it[j].input_span; \
1297 output_offset -= it[j].output_span; \
1298 } \
1299 } \
1300 return num_copied;
1301
1302 if (input_stride == 1 && output_stride == 1) {
1303 COPY_INNER_DIM(LinCopy::Kind::Linear);
1304 } else if (input_stride == 1 && output_stride != 1) {
1305 COPY_INNER_DIM(LinCopy::Kind::Scatter);
1306 } else if (input_stride == 0 && output_stride == 1) {
1307 COPY_INNER_DIM(LinCopy::Kind::FillLinear);
1308 } else if (input_stride == 0 && output_stride != 1) {
1309 COPY_INNER_DIM(LinCopy::Kind::FillScatter);
1310 } else if (output_stride == 1) {
1311 COPY_INNER_DIM(LinCopy::Kind::Gather);
1312 } else {
1313 COPY_INNER_DIM(LinCopy::Kind::Random);
1314 }
1315
1316#undef COPY_INNER_DIM
1317 }
1318
1319 // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
1320 // the number of copied elements.
1321 static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
1322 const Src& src) {
1323 DimensionsMap dst_to_src_map;
1324 for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
1325 return Copy(dst, src, dst_to_src_map);
1326 }
1327
1328 private:
1329 struct BlockIteratorState {
1330 BlockIteratorState()
1331 : size(0),
1332 count(0),
1333 input_stride(0),
1334 output_stride(0),
1335 input_span(0),
1336 output_span(0) {}
1337
1338 IndexType size;
1339 IndexType count;
1340 IndexType input_stride;
1341 IndexType output_stride;
1342 IndexType input_span;
1343 IndexType output_span;
1344 };
1345
1346 // Compute how many inner dimensions it's allowed to squeeze when doing IO
1347 // between two tensor blocks. It's safe to squeeze inner dimensions only
1348 // if they are not reordered.
1349 static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
1350 int num_squeezable_dims = 0;
1351 for (int i = 0; i < NumDims; ++i) {
1352 const int dim = IsColMajor ? i : NumDims - i - 1;
1353 if (dim_map[dim] != dim) break;
1354 num_squeezable_dims++;
1355 }
1356 return num_squeezable_dims;
1357 }
1358};
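//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// copy a 2x3 block out of a 4x3 ColMajor buffer into a dense 2x3 buffer using
// the identity dimension map.
//
//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> IO;
//   float src[12];  // 4x3 ColMajor source (contents not shown)
//   float dst[6];   // dense 2x3 destination
//   IO::Dimensions dst_dims(2, 3);
//   IO::Dimensions src_strides(1, 4);  // strides of the 4x3 source buffer
//   IO::Dst io_dst(dst_dims, internal::strides<ColMajor>(dst_dims), dst);
//   IO::Src io_src(src_strides, src);
//   IO::Copy(io_dst, io_src);  // returns 6, the number of copied coefficients
//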
1359
1360// -------------------------------------------------------------------------- //
1361// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
1362// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
1363//
1364// Currently there is no way to write from a Tensor expression to a block of
1365// memory, if dimensions are reordered. If you need to do that, you should
1366// materialize a Tensor block expression into a memory buffer, and then use
1367// TensorBlockIO to copy data between two memory buffers with a custom
1368// `target->src` dimension map (see definition above).
1369//
1370// Also currently the innermost dimension of `target` must have a stride '1'
1371// (contiguous in memory). This restriction could be lifted with a `pscatter`,
1372// but in practice it's never needed, and there is a similar TensorBlockIO
1373// workaround for that.
1374//
1375// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
1376// where `src` is a tensor expression. Explore if it is possible to rewrite IO
1377// to use expressions instead of pointers, and after that TensorBlockAssignment
1378// will become an alias to IO.
1379template <typename Scalar, int NumDims, typename TensorBlockExpr,
1380 typename IndexType = Eigen::Index>
1381class TensorBlockAssignment {
1382 // We will use coeff/packet path to evaluate block expressions.
1383 typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
1384 TensorBlockEvaluator;
1385
1386 typedef DSizes<IndexType, NumDims> Dimensions;
1387
1388 enum {
1389 Vectorizable = packet_traits<Scalar>::Vectorizable,
1390 PacketSize = packet_traits<Scalar>::size
1391 };
1392
1393 template <bool Vectorizable, typename Evaluator>
1394 struct InnerDimAssign {
1395 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1396 const Evaluator& eval,
1397 IndexType eval_offset) {
1398 for (IndexType i = 0; i < count; ++i) {
1399 target[i] = eval.coeff(eval_offset + i);
1400 }
1401 }
1402 };
1403
1404 template <typename Evaluator>
1405 struct InnerDimAssign<true, Evaluator> {
1406 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1407 const Evaluator& eval,
1408 IndexType eval_offset) {
1409 typedef typename packet_traits<Scalar>::type Packet;
1410
1411 const IndexType unrolled_size = count - 4 * PacketSize;
1412 const IndexType vectorized_size = count - PacketSize;
1413 IndexType i = 0;
1414
1415 for (; i <= unrolled_size; i += 4 * PacketSize) {
1416 for (int j = 0; j < 4; ++j) {
1417 const IndexType idx = eval_offset + i + j * PacketSize;
1418 Packet p = eval.template packet<Unaligned>(idx);
1419 pstoreu<Scalar>(target + i + j * PacketSize, p);
1420 }
1421 }
1422
1423 for (; i <= vectorized_size; i += PacketSize) {
1424 Packet p = eval.template packet<Unaligned>(eval_offset + i);
1425 pstoreu<Scalar>(target + i, p);
1426 }
1427
1428 for (; i < count; ++i) {
1429 target[i] = eval.coeff(eval_offset + i);
1430 }
1431 }
1432 };
1433
1434 public:
1435 struct Target {
1436 Target(const Dimensions& target_dims, const Dimensions& target_strides,
1437 Scalar* target_data, IndexType target_offset = 0)
1438 : dims(target_dims),
1439 strides(target_strides),
1440 data(target_data),
1441 offset(target_offset) {}
1442
1443 Dimensions dims;
1444 Dimensions strides;
1445 Scalar* data;
1446 IndexType offset;
1447 };
1448
1449 static Target target(const Dimensions& target_dims,
1450 const Dimensions& target_strides, Scalar* target_data,
1451 IndexType target_offset = 0) {
1452 return Target(target_dims, target_strides, target_data, target_offset);
1453 }
1454
1455 template <typename TargetDimsIndexType, typename TargetStridesIndexType>
1456 static Target target(
1457 const DSizes<TargetDimsIndexType, NumDims>& target_dims,
1458 const DSizes<TargetStridesIndexType, NumDims>& target_strides,
1459 Scalar* target_data, IndexType target_offset = 0) {
1460 // DSizes constructor will do index type promotion if it's safe.
1461 return target(Dimensions(target_dims), Dimensions(target_strides),
1462 target_data, target_offset);
1463 }
1464
1465 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1466 const Target& target, const TensorBlockExpr& expr) {
1467 // Prepare evaluator for block expression.
1468 DefaultDevice default_device;
1469 TensorBlockEvaluator eval(expr, default_device);
1470
1471 // Tensor block expression dimension should match destination dimensions.
1472 eigen_assert(dimensions_match(target.dims, eval.dimensions()));
1473
1474 static const int Layout = TensorBlockEvaluator::Layout;
1475 static const bool is_col_major = Layout == ColMajor;
1476
1477 // Initialize output inner dimension size based on a layout.
1478 const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
1479 const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
1480 IndexType output_inner_dim_size = target.dims[inner_dim_idx];
1481
1482 // Target inner dimension stride must be '1'.
1483 eigen_assert(target.strides[inner_dim_idx] == 1);
1484
1485 // Squeeze multiple inner dims into one if they are contiguous in `target`.
1486 IndexType num_squeezed_dims = 0;
1487 for (Index i = 1; i < NumDims; ++i) {
1488 const Index dim = is_col_major ? i : NumDims - i - 1;
1489 const IndexType target_stride = target.strides[dim];
1490
1491 if (output_inner_dim_size == target_stride) {
1492 output_inner_dim_size *= target.dims[dim];
1493 num_squeezed_dims++;
1494 } else {
1495 break;
1496 }
1497 }
1498
1499 // Initialize output block iterator state. Dimensions in this array are
1500 // always in inner_most -> outer_most order (col major layout).
1501 array<BlockIteratorState, NumDims> it;
1502
1503 int idx = 0; // currently initialized iterator state index
1504 for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
1505 const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
1506
1507 it[idx].count = 0;
1508 it[idx].size = target.dims[dim];
1509 it[idx].output_stride = target.strides[dim];
1510 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1511 idx++;
1512 }
1513
1514 // We read block expression from the beginning, and start writing data to
1515 // `target` at given offset.
1516 IndexType input_offset = 0;
1517 IndexType output_offset = target.offset;
1518
1519 // Iterate copying data from `eval` to `target`.
1520 for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
1521 // Assign to `target` at current offset.
1522 InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
1523 TensorBlockEvaluator>::Run(target.data + output_offset,
1524 output_inner_dim_size, eval,
1525 input_offset);
1526
1527 // Move input offset forward by the number of assigned coefficients.
1528 input_offset += output_inner_dim_size;
1529
1530 // Update index.
1531 for (int j = 0; j < idx; ++j) {
1532 if (++it[j].count < it[j].size) {
1533 output_offset += it[j].output_stride;
1534 break;
1535 }
1536 it[j].count = 0;
1537 output_offset -= it[j].output_span;
1538 }
1539 }
1540 }
1541
1542 private:
1543 struct BlockIteratorState {
1544 BlockIteratorState()
1545 : count(0), size(0), output_stride(0), output_span(0) {}
1546
1547 IndexType count;
1548 IndexType size;
1549 IndexType output_stride;
1550 IndexType output_span;
1551 };
1552};
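//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// evaluate a block expression (here simply a TensorMap over `src`) into a
// dense target buffer.
//
//   typedef TensorMap<const Tensor<float, 2> > SrcMap;
//   typedef TensorBlockAssignment<float, 2, SrcMap> Assign;
//   DSizes<Eigen::Index, 2> dims(2, 3);
//   const float src[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
//   float dst[6];
//   Assign::Run(Assign::target(dims, internal::strides<ColMajor>(dims), dst),
//               SrcMap(src, dims));
//   // dst now holds the six coefficients of `src`.
//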
1553
1554// -------------------------------------------------------------------------- //
1555
1556} // namespace internal
1557} // namespace Eigen
1558
1559#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H