TensorBlock.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// This Source Code Form is subject to the terms of the Mozilla
5// Public License v. 2.0. If a copy of the MPL was not distributed
6// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
8#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
9#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
10
11namespace Eigen {
12namespace internal {
13
14// -------------------------------------------------------------------------- //
15// Forward declarations for templates defined below.
16template <typename Scalar, typename IndexType, int NumDims, int Layout>
17class TensorBlockIO;
18
19// -------------------------------------------------------------------------- //
20// Helper function to compute strides for densely stored buffer of given
21// dimensions.
22
23// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
24// this function instead everywhere.
25template <int Layout, typename IndexType, int NumDims>
26EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
27 const DSizes<IndexType, NumDims>& dimensions) {
28 DSizes<IndexType, NumDims> strides;
29 if (NumDims == 0) return strides;
30
31 // TODO(ezhulenev): Use templates to unroll this loop (similar to
32 // h_array_reduce in CXX11meta.h)? Benchmark it.
33 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
34 strides[0] = 1;
35 for (int i = 1; i < NumDims; ++i) {
36 strides[i] = strides[i - 1] * dimensions[i - 1];
37 }
38 } else {
39 strides[NumDims - 1] = 1;
40 for (int i = NumDims - 2; i >= 0; --i) {
41 strides[i] = strides[i + 1] * dimensions[i + 1];
42 }
43 }
44
45 return strides;
46}
47
48template <int Layout, typename IndexType, size_t NumDims>
49EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
50 const Eigen::array<IndexType, NumDims>& dimensions) {
51 return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
52}
53
54template <int Layout, std::ptrdiff_t... Indices>
55EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
56 const Sizes<Indices...>& sizes) {
57 return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
58}
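//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// dense strides of a [2, 3, 4] tensor depend on the layout.
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   DSizes<Eigen::Index, 3> cm = internal::strides<ColMajor>(dims);  // {1, 2, 6}
//   DSizes<Eigen::Index, 3> rm = internal::strides<RowMajor>(dims);  // {12, 4, 1}
//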
59
60// -------------------------------------------------------------------------- //
61
62// Tensor block shape type defines the shape preference for the blocks
63// extracted from the larger tensor.
64//
65// Example: blocks of 100 elements from the large 100x100 tensor:
66// - tensor: 100x100
67// - target_block_size: 100
68//
69// TensorBlockShapeType:
70// - kUniformAllDims: 100 blocks of size 10x10
71// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
72// or row major layout)
73enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
74
75struct TensorBlockResourceRequirements {
76 TensorBlockShapeType shape_type; // target block shape
77 size_t size; // target block size
78 TensorOpCost cost_per_coeff; // cost of computing a single block element
79
80#ifdef EIGEN_HIPCC
81 // For HIPCC, we need to explicitly declare the constructor that is implicitly
82 // invoked in the "merge" / "any" routines as a "device fun"; otherwise HIPCC
83 // errors out complaining about the lack of a matching constructor.
84 EIGEN_DEVICE_FUNC
85 TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
86 TensorOpCost cost_)
87 : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
88 {}
89#endif
90
91 template <typename Scalar>
92 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
93 TensorBlockShapeType shape_type, size_t size_in_bytes,
94 TensorOpCost cost) {
95 const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
96 return {shape_type, size, cost};
97 }
98
99 template <typename Scalar>
100 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
101 TensorBlockShapeType shape_type, size_t size_in_bytes) {
102 // This default cost per coefficient is valid for most materialized tensor
103 // block evaluation implementations, because they typically just read
104 // coefficients from the underlying tensor storage, and write to the tensor
105 // block buffer (scratch or destination memory, reads and writes have linear
106 // access pattern). We ignore the fixed cost of block evaluation, because in
107 // practice it should be negligible.
108 //
109 // Lazy block evaluation adds the cost of calling a functor for each
110 // coefficient.
111 //
112 // All non-trivial block evaluation implementations must provide their own
113 // cost approximation (e.g. shuffling inner dimension has a much higher cost
114 // because it reads memory randomly, although the total number of moved
115 // bytes is the same).
116 return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
117 {/*bytes_loaded=*/sizeof(Scalar),
118 /*bytes_stored=*/sizeof(Scalar),
119 /*compute_cycles=*/0});
120 }
121
122 template <typename Scalar>
123 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
124 size_t size_in_bytes) {
125 return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
126 size_in_bytes);
127 }
128
129 template <typename Scalar>
130 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
131 size_t size_in_bytes) {
132 return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
133 size_in_bytes);
134 }
135
136 EIGEN_DEVICE_FUNC
137 static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
138 merge(const TensorBlockResourceRequirements& lhs,
139 const TensorBlockResourceRequirements& rhs) {
140 return {merge(lhs.shape_type, rhs.shape_type), // shape_type
141 merge(lhs.size, rhs.size), // size
142 merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
143 }
144
145 EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
146 TensorOpCost cost) {
147 cost_per_coeff += cost;
148 return *this;
149 }
150
151 // This is a resource requirement that should be returned from expressions
152 // that do not have any block evaluation preference (e.g. default tensor
153 // expression with raw buffer access).
154 EIGEN_DEVICE_FUNC
155 static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
156 return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
157 }
158
159 private:
160 using Requirements = TensorBlockResourceRequirements;
161
162 EIGEN_DEVICE_FUNC
163 static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
164 return numext::maxi(lhs_size, rhs_size);
165 }
166
167 EIGEN_DEVICE_FUNC
168 static EIGEN_STRONG_INLINE TensorBlockShapeType
169 merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
170 return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
171 rhs == TensorBlockShapeType::kSkewedInnerDims)
172 ? TensorBlockShapeType::kSkewedInnerDims
173 : TensorBlockShapeType::kUniformAllDims;
174 }
175
176 EIGEN_DEVICE_FUNC
177 static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
178 TensorOpCost rhs_cost) {
179 return lhs_cost + rhs_cost;
180 }
181};
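//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// combining two block evaluation preferences with `merge` keeps the larger
// coefficient count and prefers the skewed shape.
//
//   TensorBlockResourceRequirements lhs =
//       TensorBlockResourceRequirements::skewed<float>(/*size_in_bytes=*/48 * 1024);
//   TensorBlockResourceRequirements rhs =
//       TensorBlockResourceRequirements::uniform<float>(/*size_in_bytes=*/16 * 1024);
//   TensorBlockResourceRequirements req =
//       TensorBlockResourceRequirements::merge(lhs, rhs);
//   // req.shape_type == TensorBlockShapeType::kSkewedInnerDims
//   // req.size == (48 * 1024) / sizeof(float)
//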
182
183// -------------------------------------------------------------------------- //
184// TensorBlockDescriptor specifies a block offset within a tensor and the block
185// sizes along each of the tensor dimensions.
186
187template <int NumDims, typename IndexType = Eigen::Index>
188class TensorBlockDescriptor {
189 public:
190 typedef DSizes<IndexType, NumDims> Dimensions;
191
192 // If we evaluate a Tensor assignment, and the expression on the left already
193 // has a memory buffer, then we might do a performance optimization, and
194 // evaluate the root expression directly into the final output memory. Sometimes
195 // it's possible to reuse it for materializing subexpressions inside an
196 // expression tree, to avoid dynamic memory allocation.
197 //
198 // The pointer type of the underlying storage is erased, because passing
199 // Scalar type through all the expression evaluation layers is way too many
200 // templates. In practice the destination buffer type should always match the
201 // evaluated expression scalar type.
202 class DestinationBuffer {
203 public:
204 enum DestinationBufferKind : int {
205 // The above explicit specification of "int" as the enum basetype is
206 // needed to get around a HIPCC link error ("the field type is not
207 // amp-compatible")
208 // which is issued for class members with the enum type.
209 // TODO(rocm):
210 // remove the "int" basetype once HIPCC has been fixed to not error out
211 // in the above scenario.
212
213 // Destination buffer is not defined (`m_data` == nullptr).
214 kEmpty,
215
216 // Tensor block defined by an owning tensor block descriptor can fit
217 // contiguously into the destination buffer. In this case it's safe to
218 // materialize tensor block in the destination buffer, wrap it in a
219 // TensorMap, and use it to build an Eigen expression on top of it.
220 kContiguous,
221
222 // Destination buffer strides do not match strides of the contiguously
223 // stored block, and it's impossible to define a TensorMap over this
224 // buffer. However, if we are evaluating a root of an expression tree, we
225 // can still materialize an output into this destination, because we can
226 // guarantee that no one will ever access it through block API.
227 //
228 // In theory it is possible to build valid TensorStriding<TensorMap>
229 // expression on top of this destination buffer, however it has
230 // inefficient coeff/packet access, and defeats the purpose of fast block
231 // evaluation API.
232 kStrided
233 };
234
235 template <typename Scalar>
236 Scalar* data() const {
237 eigen_assert(m_data_type_size == sizeof(Scalar));
238 return static_cast<Scalar*>(m_data);
239 }
240
241 const Dimensions& strides() const { return m_strides; }
242 const DestinationBufferKind& kind() const { return m_kind; }
243
244 private:
245 friend class TensorBlockDescriptor;
246
247 DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
248
249 template <typename Scalar>
250 DestinationBuffer(Scalar* data, const Dimensions& strides,
251 DestinationBufferKind kind)
252 : m_data(static_cast<void*>(data)),
253 m_data_type_size(sizeof(Scalar)),
254 m_strides(strides),
255 m_kind(kind) {}
256
257 template <int Layout, typename Scalar>
258 static DestinationBuffer make(const TensorBlockDescriptor& desc,
259 Scalar* data, const Dimensions& strides) {
260 return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
261 }
262
263 template <int Layout>
264 static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
265 const Dimensions& strides) {
266 const Dimensions& desc_dims = desc.dimensions();
267 const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
268 for (int i = 0; i < NumDims; ++i) {
269 if (desc_dims[i] == 1) continue;
270 if (desc_strides[i] != strides[i]) return kStrided;
271 }
272 return kContiguous;
273 }
274
275 // Storage pointer is type erased, to reduce template bloat, but we still
276 // keep the size of the underlying element type for error checking.
277 void* m_data;
278 size_t m_data_type_size;
279
280 // Destination buffer dimensions always match the dimensions of the tensor
281 // block descriptor it belongs to; however, strides might be different.
282 Dimensions m_strides;
283
284 DestinationBufferKind m_kind;
285 };
286
287 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
288 const DestinationBuffer& destination)
289 : m_offset(offset),
290 m_dimensions(dimensions),
291 m_destination(destination) {}
292
293 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
294 : m_offset(offset),
295 m_dimensions(dimensions),
296 m_destination(DestinationBuffer()) {}
297
298 IndexType offset() const { return m_offset; }
299 const Dimensions& dimensions() const { return m_dimensions; }
300 IndexType dimension(int index) const { return m_dimensions[index]; }
301 IndexType size() const { return array_prod<IndexType>(m_dimensions); }
302
303 const DestinationBuffer& destination() const { return m_destination; }
304
305 template <int Layout, typename Scalar>
306 void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
307 eigen_assert(dst_base != NULL);
308 m_destination =
309 DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
310 }
311
312 template <int Layout, typename Scalar, typename DstStridesIndexType>
313 void AddDestinationBuffer(
314 Scalar* dst_base,
315 const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
316 // DSizes constructor will do index type promotion if it's safe.
317 AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
318 }
319
320 TensorBlockDescriptor& DropDestinationBuffer() {
321 m_destination.m_data = NULL;
322 m_destination.m_kind = DestinationBuffer::kEmpty;
323 return *this;
324 }
325
326 bool HasDestinationBuffer() const {
327 return m_destination.kind() != DestinationBuffer::kEmpty;
328 }
329
330 // Returns a copy of `*this` with updated offset.
331 TensorBlockDescriptor WithOffset(IndexType offset) const {
332 return TensorBlockDescriptor(offset, m_dimensions, m_destination);
333 }
334
335 private:
336 // Offset and dimensions are immutable after construction. Block descriptor
337 // can only be mutated by adding or dropping destination.
338 const IndexType m_offset;
339 const Dimensions m_dimensions;
340 DestinationBuffer m_destination;
341};
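//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// describe a 2x3 block at linear offset 10 and attach a destination buffer
// whose strides match the dense block strides, so it is kContiguous.
//
//   typedef TensorBlockDescriptor<2> BlockDesc;
//   BlockDesc::Dimensions dims(2, 3);
//   BlockDesc desc(/*offset=*/10, dims);
//   float out[6];  // destination buffer
//   desc.AddDestinationBuffer<ColMajor>(out, internal::strides<ColMajor>(dims));
//   // desc.destination().kind() == BlockDesc::DestinationBuffer::kContiguous
//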
342
343// -------------------------------------------------------------------------- //
344// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
345
346template <int NumDims, int Layout, typename IndexType = Eigen::Index>
347class TensorBlockMapper {
348 typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
349
350 public:
351 typedef DSizes<IndexType, NumDims> Dimensions;
352
353 TensorBlockMapper() = default;
354 TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
355 const TensorBlockResourceRequirements& requirements)
356 : m_tensor_dimensions(dimensions), m_requirements(requirements) {
357 // Compute block dimensions and the total number of blocks.
358 InitializeBlockDimensions();
359 }
360
361 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
362 return m_total_block_count;
363 }
364
365 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
366 return m_block_dimensions.TotalSize();
367 }
368
369 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
370 blockDimensions() const {
371 return m_block_dimensions;
372 }
373
374 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
375 blockDescriptor(IndexType block_index) const {
376 static const bool isColMajor = Layout == static_cast<int>(ColMajor);
377
378 IndexType offset = 0;
379 DSizes<IndexType, NumDims> dimensions = m_block_dimensions;
380
381 if (NumDims == 0) return BlockDescriptor(offset, dimensions);
382
383 // Iterate outer -> inner dimensions.
384 for (int i = NumDims - 1; i >= 0; --i) {
385 const int dim = isColMajor ? i : NumDims - i - 1;
386
387 const IndexType idx = block_index / m_block_strides[dim];
388 block_index -= idx * m_block_strides[dim];
389
390 const IndexType coord = idx * m_block_dimensions[dim];
391 dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
392 m_block_dimensions[dim]);
393 offset += coord * m_tensor_strides[dim];
394 }
395
396 return {offset, dimensions};
397 }
398
399 private:
400 void InitializeBlockDimensions() {
401 // Requested block shape and size.
402 const TensorBlockShapeType shape_type = m_requirements.shape_type;
403 IndexType target_block_size =
404 numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
405
406 IndexType tensor_size = m_tensor_dimensions.TotalSize();
407
408 // Corner case: one of the dimensions is zero. Logic below is too complex
409 // to handle this case on a general basis, just use unit block size.
410 // Note: we must not yield blocks with zero dimensions (recipe for
411 // overflows/underflows, divisions by zero and NaNs later).
412 if (tensor_size == 0) {
413 for (int i = 0; i < NumDims; ++i) {
414 m_block_dimensions[i] = 1;
415 }
416 m_total_block_count = 0;
417 return;
418 }
419
420 // If tensor fits into a target block size, evaluate it as a single block.
421 if (tensor_size <= target_block_size) {
422 m_block_dimensions = m_tensor_dimensions;
423 m_total_block_count = 1;
424 // The only valid block index is `0`, and in this case we do not need
425 // to compute real strides for tensor or blocks (see blockDescriptor).
426 for (int i = 0; i < NumDims; ++i) {
427 m_tensor_strides[i] = 0;
428 m_block_strides[i] = 1;
429 }
430 return;
431 }
432
433 static const bool isColMajor = Layout == static_cast<int>(ColMajor);
434
435 // Block shape skewed towards inner dimension.
436 if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
437 IndexType coeff_to_allocate = target_block_size;
438
439 for (int i = 0; i < NumDims; ++i) {
440 const int dim = isColMajor ? i : NumDims - i - 1;
441 m_block_dimensions[dim] =
442 numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
443 coeff_to_allocate = divup(
444 coeff_to_allocate,
445 numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
446 }
447 eigen_assert(coeff_to_allocate == 1);
448
449 } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
450 // Tensor will not fit within 'target_block_size' budget: calculate tensor
451 // block dimension sizes based on "square" dimension size target.
452 const IndexType dim_size_target = convert_index<IndexType>(
453 std::pow(static_cast<float>(target_block_size),
454 1.0f / static_cast<float>(m_block_dimensions.rank())));
455
456 for (int i = 0; i < NumDims; ++i) {
457 // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
458 // a multiple of the packet size. Note that reducing
459 // 'block_dim_size' in this manner can increase the number of
460 // blocks, and so will amplify any per-block overhead.
461 m_block_dimensions[i] =
462 numext::mini(dim_size_target, m_tensor_dimensions[i]);
463 }
464
465 // Add any un-allocated coefficients to inner dimension(s).
466 IndexType total_size = m_block_dimensions.TotalSize();
467 for (int i = 0; i < NumDims; ++i) {
468 const int dim = isColMajor ? i : NumDims - i - 1;
469
470 if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
471 const IndexType total_size_other_dims =
472 total_size / m_block_dimensions[dim];
473 const IndexType alloc_avail =
474 divup<IndexType>(target_block_size, total_size_other_dims);
475 if (alloc_avail == m_block_dimensions[dim]) {
476 // Insufficient excess coefficients to allocate.
477 break;
478 }
479 m_block_dimensions[dim] =
480 numext::mini(m_tensor_dimensions[dim], alloc_avail);
481 total_size = total_size_other_dims * m_block_dimensions[dim];
482 }
483 }
484
485 } else {
486 eigen_assert(false); // unknown block shape
487 }
488
489 eigen_assert(m_block_dimensions.TotalSize() >=
490 numext::mini<IndexType>(target_block_size,
491 m_tensor_dimensions.TotalSize()));
492
493 // Calculate block counts by dimension and total block count.
494 DSizes<IndexType, NumDims> block_count;
495 for (int i = 0; i < NumDims; ++i) {
496 block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
497 }
498 m_total_block_count = array_prod(block_count);
499
500 // Calculate block strides (used for enumerating blocks).
501 m_tensor_strides = strides<Layout>(m_tensor_dimensions);
502 m_block_strides = strides<Layout>(block_count);
503 }
504
505 DSizes<IndexType, NumDims> m_tensor_dimensions;
506 TensorBlockResourceRequirements m_requirements;
507
508 DSizes<IndexType, NumDims> m_block_dimensions;
509 IndexType m_total_block_count;
510
511 DSizes<IndexType, NumDims> m_tensor_strides;
512 DSizes<IndexType, NumDims> m_block_strides;
513};
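//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// split a 100x100 float tensor into blocks of at most 2500 coefficients and
// enumerate them.
//
//   DSizes<Eigen::Index, 2> dims(100, 100);
//   TensorBlockMapper<2, ColMajor> mapper(
//       dims, TensorBlockResourceRequirements::uniform<float>(2500 * sizeof(float)));
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(i);
//     // desc.offset() and desc.dimensions() identify block `i` within `dims`.
//   }
//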
514
515// -------------------------------------------------------------------------- //
516// TensorBlockScratchAllocator is responsible for allocating temporary buffers
517// for block evaluation (output or input block materialization). Given that
518// Eigen expression traversal order is deterministic, all temporary allocations
519// happen in the same order, and usually have exactly the same size.
520// The scratch allocator keeps track of all dynamic allocations, and after the
521// first block evaluation is completed, we should be able to reuse all the
522// temporary buffers for the next block evaluation.
523
524template <typename Device>
525class TensorBlockScratchAllocator {
526 public:
527 explicit TensorBlockScratchAllocator(const Device& device)
528 : m_device(device), m_allocation_index(0) {}
529
530 ~TensorBlockScratchAllocator() {
531 for (size_t i = 0; i < m_allocations.size(); ++i) {
532 m_device.deallocate(m_allocations[i].ptr);
533 }
534 }
535
536 void* allocate(size_t size) {
537 // TODO(ezhulenev): Remove when replaced with inlined vector.
538 if (m_allocations.capacity() == 0) m_allocations.reserve(8);
539
540 // Check if we already have an existing allocation at the current index.
541 const int num_allocations = static_cast<int>(m_allocations.size());
542 const bool has_allocation = m_allocation_index < num_allocations;
543
544 // Allocation index can't be larger than the number of allocations.
545 eigen_assert(m_allocation_index <= num_allocations);
546
547 // If we have an existing allocation, and its size is larger than or equal
548 // to the requested size, we do nothing.
549
550 // If current allocation can't fit requested size, we deallocate it, and
551 // replace with a larger allocation.
552 if (has_allocation && m_allocations[m_allocation_index].size < size) {
553 m_device.deallocate(m_allocations[m_allocation_index].ptr);
554 m_allocations[m_allocation_index].ptr = m_device.allocate(size);
555 m_allocations[m_allocation_index].size = size;
556 }
557
558 // Make a new allocation if we don't have an existing one.
559 if (!has_allocation) {
560 Allocation allocation;
561 allocation.ptr = m_device.allocate(size);
562 allocation.size = size;
563 m_allocations.push_back(allocation);
564 }
565
566 eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
567 eigen_assert(m_allocations[m_allocation_index].size >= size);
568
569 return m_allocations[m_allocation_index++].ptr;
570 }
571
572 void reset() { m_allocation_index = 0; }
573
574 private:
575 struct Allocation {
576 void* ptr;
577 size_t size;
578 };
579
580 const Device& m_device;
581 int m_allocation_index;
582 // TODO(ezhulenev): This should be an inlined vector.
583 std::vector<Allocation> m_allocations;
584};
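//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// after reset() the allocator hands out the buffers acquired during the
// previous block evaluation again, instead of allocating new ones.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   void* buf0 = scratch.allocate(1024);  // fresh device allocation
//   scratch.reset();
//   void* buf1 = scratch.allocate(1024);  // reuses the first buffer: buf0 == buf1
//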
585
586// -------------------------------------------------------------------------- //
587// TensorBlockKind represents all possible block kinds that can be produced by
588// the TensorEvaluator::evalBlock function.
589enum TensorBlockKind {
590 // Tensor block that is a lazy expression that must be assigned to a
591 // destination using TensorBlockAssign.
592 kExpr,
593
594 // Tensor block that is a view into a memory buffer owned by an underlying
595 // Tensor expression (e.g. it can be a view into a Tensor buffer).
596 kView,
597
598 // Tensor block that was materialized in a scratch memory buffer, allocated
599 // with TensorBlockScratchAllocator. This block must be copied to a
600 // destination, similar to a block of `kExpr` type.
601 kMaterializedInScratch,
602
603 // Tensor block that was materialized directly into the final output memory
604 // buffer. For example if the left side of an assignment is a Tensor, we can
605 // directly materialize the block in the destination memory.
606 //
607 // If strides in the output buffer do not match tensor block strides, the
608 // Tensor expression will be invalid, and should not be used by
609 // TensorBlockAssign or for constructing another block expression.
610 kMaterializedInOutput
611};
612
613// -------------------------------------------------------------------------- //
614// TensorBlockNotImplemented should be used to define the TensorBlock typedef in
615// TensorEvaluators that do not support block evaluation.
616
617class TensorBlockNotImplemented {
618 public:
619 typedef void XprType;
620};
621
622// -------------------------------------------------------------------------- //
623// XprScalar extracts Scalar type from the Eigen expressions (if expression type
624// is not void). It's required to be able to define lazy block expression for
625// argument types that do not support block evaluation.
626
627template <typename XprType>
628struct XprScalar {
629 typedef typename XprType::Scalar type;
630};
631template <>
632struct XprScalar<void> {
633 typedef void type;
634};
635
636// -------------------------------------------------------------------------- //
637// TensorMaterializedBlock is a fully evaluated block of the original tensor,
638// and XprType is just a TensorMap over the data. This block type is typically
639// used to materialize blocks of tensor expressions that can't be efficiently
640// represented as lazy Tensor expressions with fast coeff/packet operations,
641// e.g. we materialize all broadcasts into evaluated blocks.
642//
643// TensorMaterializedBlock does not own its memory buffer; it's either a memory
644// buffer that backs the original expression (e.g. block is just a view into a
645// Tensor), or a memory buffer allocated with scratch allocator, and in this
646// case the scratch allocator will deallocate it at the end of block based
647// expression execution.
648//
649// If the block was evaluated directly into the output buffer, and strides in
650// the output buffer do not match block strides, the TensorMap expression will
651// be invalid, and should never be used in block assignment or any other tensor
652// expression.
653
654template <typename Scalar, int NumDims, int Layout,
655 typename IndexType = Eigen::Index>
656class TensorMaterializedBlock {
657 public:
658 typedef DSizes<IndexType, NumDims> Dimensions;
659 typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
660
661 TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
662 const Dimensions& dimensions, bool valid_expr = true)
663 : m_kind(kind),
664 m_data(data),
665 m_dimensions(dimensions),
666 m_expr(m_data, m_dimensions),
667 m_valid_expr(valid_expr) {
668 eigen_assert(m_kind == internal::TensorBlockKind::kView ||
669 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
670 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
671 }
672
673 TensorBlockKind kind() const { return m_kind; }
674 // NOTE(ezhulenev): Returning XprType by value like in other block types
675 // causes asan failures. The theory is that XprType::Nested doesn't work
676 // properly for TensorMap.
677 const XprType& expr() const {
678 eigen_assert(m_valid_expr);
679 return m_expr;
680 }
681 const Scalar* data() const { return m_data; }
682 void cleanup() {}
683
684 typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
685
686 // TensorMaterializedBlock can be backed by different types of storage:
687 //
688 // (1) Contiguous block of memory allocated with scratch allocator.
689 // (2) Contiguous block of memory reused from tensor block descriptor
690 // destination buffer.
691 // (3) Strided block of memory reused from tensor block descriptor
692 // destination buffer.
693 //
694 class Storage {
695 public:
696 Scalar* data() const { return m_data; }
697 const Dimensions& dimensions() const { return m_dimensions; }
698 const Dimensions& strides() const { return m_strides; }
699
700 TensorMaterializedBlock AsTensorMaterializedBlock() const {
701 return TensorMaterializedBlock(
702 m_materialized_in_output
703 ? internal::TensorBlockKind::kMaterializedInOutput
704 : internal::TensorBlockKind::kMaterializedInScratch,
705 m_data, m_dimensions, !m_strided_storage);
706 }
707
708 private:
709 friend class TensorMaterializedBlock;
710
711 Storage(Scalar* data, const Dimensions& dimensions,
712 const Dimensions& strides, bool materialized_in_output,
713 bool strided_storage)
714 : m_data(data),
715 m_dimensions(dimensions),
716 m_strides(strides),
717 m_materialized_in_output(materialized_in_output),
718 m_strided_storage(strided_storage) {}
719
720 Scalar* m_data;
721 Dimensions m_dimensions;
722 Dimensions m_strides;
723 bool m_materialized_in_output;
724 bool m_strided_storage;
725 };
726
727 // Creates a storage for materialized block either from the block descriptor
728 // destination buffer, or allocates a new buffer with scratch allocator.
729 template <typename TensorBlockScratch>
730 static EIGEN_STRONG_INLINE Storage prepareStorage(
731 TensorBlockDesc& desc, TensorBlockScratch& scratch,
732 bool allow_strided_storage = false) {
733 // Try to reuse destination as an output block buffer.
734 typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
735
736 if (desc.destination().kind() == DestinationBuffer::kContiguous) {
737 Scalar* buffer = desc.destination().template data<Scalar>();
738 desc.DropDestinationBuffer();
739 return Storage(buffer, desc.dimensions(),
740 internal::strides<Layout>(desc.dimensions()),
741 /*materialized_in_output=*/true,
742 /*strided_storage=*/false);
743
744 } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
745 allow_strided_storage) {
746 Scalar* buffer = desc.destination().template data<Scalar>();
747 desc.DropDestinationBuffer();
748 return Storage(buffer, desc.dimensions(), desc.destination().strides(),
749 /*materialized_in_output=*/true, /*strided_storage=*/true);
750
751 } else {
752 void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
753 return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
754 internal::strides<Layout>(desc.dimensions()),
755 /*materialized_in_output=*/false,
756 /*strided_storage=*/false);
757 }
758 }
759
760 // Creates a materialized block for the given descriptor from a memory buffer.
761 template <typename DataDimensions, typename TensorBlockScratch>
762 static EIGEN_STRONG_INLINE TensorMaterializedBlock materialize(
763 const Scalar* data, const DataDimensions& data_dims,
764 TensorBlockDesc& desc, TensorBlockScratch& scratch) {
765 eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
766
767 // If the tensor block dimensions cover a contiguous block of the underlying
768 // memory, we can skip block buffer memory allocation, and construct a block
769 // from existing `data` memory buffer.
770 //
771 // Example: (RowMajor layout)
772 // data_dims: [11, 12, 13, 14]
773 // desc.dimensions(): [1, 1, 3, 14]
774 //
775 // In this case we can construct a TensorBlock starting at
776 // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
777 static const bool is_col_major = Layout == ColMajor;
778
779 // Find out how many inner dimensions have a matching size.
780 int num_matching_inner_dims = 0;
781 for (int i = 0; i < NumDims; ++i) {
782 int dim = is_col_major ? i : NumDims - i - 1;
783 if (data_dims[dim] != desc.dimensions()[dim]) break;
784 ++num_matching_inner_dims;
785 }
786
787 // All the outer dimensions must be of size `1`, except a single dimension
788 // before the matching inner dimension (`3` in the example above).
789 bool can_use_direct_access = true;
790 for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
791 int dim = is_col_major ? i : NumDims - i - 1;
792 if (desc.dimension(dim) != 1) {
793 can_use_direct_access = false;
794 break;
795 }
796 }
797
798 if (can_use_direct_access) {
799 const Scalar* block_start = data + desc.offset();
800 return TensorMaterializedBlock(internal::TensorBlockKind::kView,
801 block_start, desc.dimensions());
802
803 } else {
804 // Reuse destination buffer or allocate new buffer with scratch allocator.
805 const Storage storage = prepareStorage(desc, scratch);
806
807 typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
808 TensorBlockIO;
809 typedef typename TensorBlockIO::Dst TensorBlockIODst;
810 typedef typename TensorBlockIO::Src TensorBlockIOSrc;
811
812 TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
813 data, desc.offset());
814 TensorBlockIODst dst(storage.dimensions(), storage.strides(),
815 storage.data());
816
817 TensorBlockIO::Copy(dst, src);
818 return storage.AsTensorMaterializedBlock();
819 }
820 }
821
822 private:
823 TensorBlockKind m_kind;
824 const Scalar* m_data;
825 Dimensions m_dimensions;
826 XprType m_expr;
827 bool m_valid_expr;
828};
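//
// Usage sketch (illustrative addition, not part of the original Eigen header;
// `data` stands for a hypothetical pointer to a 100x100 ColMajor float
// buffer): a block that is a contiguous prefix of the buffer becomes a kView
// and no copy is made, otherwise the data is copied via TensorBlockIO.
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> Block;
//   DSizes<Eigen::Index, 2> data_dims(100, 100);
//   TensorBlockDescriptor<2> desc(/*offset=*/0, DSizes<Eigen::Index, 2>(100, 10));
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   Block block = Block::materialize(data, data_dims, desc, scratch);
//   // block.kind() == internal::TensorBlockKind::kView
//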
829
830// -------------------------------------------------------------------------- //
831// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
832// functor to the blocks produced by the underlying Tensor expression.
833
834template <typename UnaryOp, typename ArgTensorBlock>
835class TensorCwiseUnaryBlock {
836 static const bool NoArgBlockAccess =
837 internal::is_void<typename ArgTensorBlock::XprType>::value;
838
839 public:
840 typedef typename conditional<
841 NoArgBlockAccess, void,
842 TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::type
843 XprType;
844
845 typedef typename XprScalar<XprType>::type Scalar;
846
847 TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
848 : m_arg_block(arg_block), m_functor(functor) {}
849
850 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
851
852 XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
853 const Scalar* data() const { return NULL; }
854 void cleanup() { m_arg_block.cleanup(); }
855
856 private:
857 ArgTensorBlock m_arg_block;
858 UnaryOp m_functor;
859};
860
861// -------------------------------------------------------------------------- //
862// TensorCwiseBinaryBlock is a lazy tensor expression block that applies BinaryOp
863// functor to the blocks produced by the underlying Tensor expression.
864
865template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
866class TensorCwiseBinaryBlock {
867 static const bool NoArgBlockAccess =
868 internal::is_void<typename LhsTensorBlock::XprType>::value ||
869 internal::is_void<typename RhsTensorBlock::XprType>::value;
870
871 public:
872 typedef typename conditional<
873 NoArgBlockAccess, void,
874 TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
875 const typename RhsTensorBlock::XprType> >::type
876 XprType;
877
878 typedef typename XprScalar<XprType>::type Scalar;
879
880 TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
881 const RhsTensorBlock& right_block,
882 const BinaryOp& functor)
883 : m_left_block(left_block),
884 m_right_block(right_block),
885 m_functor(functor) {}
886
887 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
888
889 XprType expr() const {
890 return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
891 }
892
893 const Scalar* data() const { return NULL; }
894
895 void cleanup() {
896 m_left_block.cleanup();
897 m_right_block.cleanup();
898 }
899
900 private:
901 LhsTensorBlock m_left_block;
902 RhsTensorBlock m_right_block;
903 BinaryOp m_functor;
904};
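//
// Usage sketch (illustrative addition, not part of the original Eigen header;
// `LhsBlock`/`RhsBlock` stand for hypothetical blocks produced by two argument
// evaluators): the binary block stays lazy, expr() builds the coefficient-wise
// expression on demand.
//
//   TensorCwiseBinaryBlock<internal::scalar_sum_op<float>, LhsBlock, RhsBlock>
//       block(lhs_block, rhs_block, internal::scalar_sum_op<float>());
//   // block.data() == NULL; block.expr() is the sum of the argument exprs.
//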
905
906// -------------------------------------------------------------------------- //
907// TensorUnaryExprBlock is a lazy tensor expression block that can construct
908// an arbitrary tensor expression from a block of the underlying type (this is a
909// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
910
911template <typename BlockFactory, typename ArgTensorBlock>
912class TensorUnaryExprBlock {
913 typedef typename ArgTensorBlock::XprType ArgXprType;
914 static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
915
916 public:
917 typedef typename conditional<
918 NoArgBlockAccess, void,
919 typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
920
921 typedef typename XprScalar<XprType>::type Scalar;
922
923 TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
924 const BlockFactory& factory)
925 : m_arg_block(arg_block), m_factory(factory) {}
926
927 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
928 XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
929 const Scalar* data() const { return NULL; }
930 void cleanup() { m_arg_block.cleanup(); }
931
932 private:
933 ArgTensorBlock m_arg_block;
934 BlockFactory m_factory;
935};
936
937// -------------------------------------------------------------------------- //
938// TensorTernaryExprBlock is a lazy tensor expression block that can construct
939// an arbitrary tensor expression from three blocks of the underlying type.
940
941template <typename BlockFactory, typename Arg1TensorBlock,
942 typename Arg2TensorBlock, typename Arg3TensorBlock>
943class TensorTernaryExprBlock {
944 typedef typename Arg1TensorBlock::XprType Arg1XprType;
945 typedef typename Arg2TensorBlock::XprType Arg2XprType;
946 typedef typename Arg3TensorBlock::XprType Arg3XprType;
947
948 static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
949 internal::is_void<Arg2XprType>::value ||
950 internal::is_void<Arg3XprType>::value;
951
952 public:
953 typedef typename conditional<
954 NoArgBlockAccess, void,
955 typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
956 Arg3XprType>::type>::type XprType;
957
958 typedef typename XprScalar<XprType>::type Scalar;
959
960 TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
961 const Arg2TensorBlock& arg2_block,
962 const Arg3TensorBlock& arg3_block,
963 const BlockFactory& factory)
964 : m_arg1_block(arg1_block),
965 m_arg2_block(arg2_block),
966 m_arg3_block(arg3_block),
967 m_factory(factory) {}
968
969 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
970 XprType expr() const {
971 return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
972 m_arg3_block.expr());
973 }
974 const Scalar* data() const { return NULL; }
975 void cleanup() {
976 m_arg1_block.cleanup();
977 m_arg2_block.cleanup();
978 m_arg3_block.cleanup();
979 }
980
981 private:
982 Arg1TensorBlock m_arg1_block;
983 Arg2TensorBlock m_arg2_block;
984 Arg3TensorBlock m_arg3_block;
985 BlockFactory m_factory;
986};
987
988// -------------------------------------------------------------------------- //
989// StridedLinearBufferCopy provides a method to copy data between two linear
990// buffers with different strides, with optimized paths for scatter/gather.
991
992template <typename Scalar, typename IndexType>
993class StridedLinearBufferCopy {
994 typedef typename packet_traits<Scalar>::type Packet;
995 enum {
996 Vectorizable = packet_traits<Scalar>::Vectorizable,
997 PacketSize = packet_traits<Scalar>::size
998 };
999
1000 public:
1001 // Specifying linear copy kind statically gives ~30% speedup for small sizes.
1002 enum class Kind {
1003 Linear = 0, // src_stride == 1 && dst_stride == 1
1004 Scatter = 1, // src_stride == 1 && dst_stride != 1
1005 FillLinear = 2, // src_stride == 0 && dst_stride == 1
1006 FillScatter = 3, // src_stride == 0 && dst_stride != 1
1007 Gather = 4, // dst_stride == 1
1008 Random = 5 // everything else
1009 };
1010
1011 struct Dst {
1012 Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
1013
1014 IndexType offset;
1015 IndexType stride;
1016 Scalar* data;
1017 };
1018
1019 struct Src {
1020 Src(IndexType o, IndexType s, const Scalar* d)
1021 : offset(o), stride(s), data(d) {}
1022
1023 IndexType offset;
1024 IndexType stride;
1025 const Scalar* data;
1026 };
1027
1028 template <typename StridedLinearBufferCopy::Kind kind>
1029 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
1030 const Src& src,
1031 const size_t count) {
1032 Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
1033 src.data);
1034 }
1035
1036 private:
1037 template <typename StridedLinearBufferCopy::Kind kind>
1038 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1039 const IndexType count, const IndexType dst_offset,
1040 const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
1041 const IndexType src_offset, const IndexType src_stride,
1042 const Scalar* EIGEN_RESTRICT src_data) {
1043 const Scalar* src = &src_data[src_offset];
1044 Scalar* dst = &dst_data[dst_offset];
1045
1046 if (!Vectorizable) {
1047 for (Index i = 0; i < count; ++i) {
1048 dst[i * dst_stride] = src[i * src_stride];
1049 }
1050 return;
1051 }
1052
1053 const IndexType vectorized_size = count - PacketSize;
1054 IndexType i = 0;
1055
1056 if (kind == StridedLinearBufferCopy::Kind::Linear) {
1057 // ******************************************************************** //
1058 // Linear copy from `src` to `dst`.
1059 const IndexType unrolled_size = count - 4 * PacketSize;
1060 eigen_assert(src_stride == 1 && dst_stride == 1);
1061 for (; i <= unrolled_size; i += 4 * PacketSize) {
1062 for (int j = 0; j < 4; ++j) {
1063 Packet p = ploadu<Packet>(src + i + j * PacketSize);
1064 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1065 }
1066 }
1067 for (; i <= vectorized_size; i += PacketSize) {
1068 Packet p = ploadu<Packet>(src + i);
1069 pstoreu<Scalar, Packet>(dst + i, p);
1070 }
1071 for (; i < count; ++i) {
1072 dst[i] = src[i];
1073 }
1074 // ******************************************************************** //
1075 } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
1076 // Scatter from `src` to `dst`.
1077 eigen_assert(src_stride == 1 && dst_stride != 1);
1078 for (; i <= vectorized_size; i += PacketSize) {
1079 Packet p = ploadu<Packet>(src + i);
1080 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1081 }
1082 for (; i < count; ++i) {
1083 dst[i * dst_stride] = src[i];
1084 }
1085 // ******************************************************************** //
1086 } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
1087 // Fill `dst` with value at `*src`.
1088 eigen_assert(src_stride == 0 && dst_stride == 1);
1089 const IndexType unrolled_size = count - 4 * PacketSize;
1090 Packet p = pload1<Packet>(src);
1091 for (; i <= unrolled_size; i += 4 * PacketSize) {
1092 for (int j = 0; j < 4; ++j) {
1093 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1094 }
1095 }
1096 for (; i <= vectorized_size; i += PacketSize) {
1097 pstoreu<Scalar, Packet>(dst + i, p);
1098 }
1099 for (; i < count; ++i) {
1100 dst[i] = *src;
1101 }
1102 // ******************************************************************** //
1103 } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
1104 // Scatter `*src` into `dst`.
1105 eigen_assert(src_stride == 0 && dst_stride != 1);
1106 Packet p = pload1<Packet>(src);
1107 for (; i <= vectorized_size; i += PacketSize) {
1108 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1109 }
1110 for (; i < count; ++i) {
1111 dst[i * dst_stride] = *src;
1112 }
1113 // ******************************************************************** //
1114 } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
1115 // Gather from `src` into `dst`.
1116 eigen_assert(dst_stride == 1);
1117 for (; i <= vectorized_size; i += PacketSize) {
1118 Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
1119 pstoreu<Scalar, Packet>(dst + i, p);
1120 }
1121 for (; i < count; ++i) {
1122 dst[i] = src[i * src_stride];
1123 }
1124 // ******************************************************************** //
1125 } else if (kind == StridedLinearBufferCopy::Kind::Random) {
1126 // Random.
1127 for (; i < count; ++i) {
1128 dst[i * dst_stride] = src[i * src_stride];
1129 }
1130 } else {
1131 eigen_assert(false);
1132 }
1133 }
1134};
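//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// scatter four consecutive floats into every other slot of `dst`.
//
//   float src[4] = {1.f, 2.f, 3.f, 4.f};
//   float dst[8] = {0.f};
//   typedef StridedLinearBufferCopy<float, Eigen::Index> LinCopy;
//   LinCopy::Run<LinCopy::Kind::Scatter>(
//       LinCopy::Dst(/*offset=*/0, /*stride=*/2, dst),
//       LinCopy::Src(/*offset=*/0, /*stride=*/1, src), /*count=*/4);
//   // dst == {1, 0, 2, 0, 3, 0, 4, 0}
//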
1135
1136// -------------------------------------------------------------------------- //
1137// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor block.
1138// It's possible to specify src->dst dimension mapping for the copy operation.
1139// Dimensions of `dst` specify how many elements have to be copied; for the
1140// `src` we only need to know the strides to navigate through the source memory buffer.
1141
1142template <typename Scalar, typename IndexType, int NumDims, int Layout>
1143class TensorBlockIO {
1144 static const bool IsColMajor = (Layout == ColMajor);
1145
1146 typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
1147
1148 public:
1149 typedef DSizes<IndexType, NumDims> Dimensions;
1150 typedef DSizes<int, NumDims> DimensionsMap;
1151
1152 struct Dst {
1153 Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
1154 IndexType dst_offset = 0)
1155 : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
1156
1157 Dimensions dims;
1158 Dimensions strides;
1159 Scalar* data;
1160 IndexType offset;
1161 };
1162
1163 struct Src {
1164 Src(const Dimensions& src_strides, const Scalar* src,
1165 IndexType src_offset = 0)
1166 : strides(src_strides), data(src), offset(src_offset) {}
1167
1168 Dimensions strides;
1169 const Scalar* data;
1170 IndexType offset;
1171 };
1172
1173 // Copies data to `dst` from `src`, using provided dimensions mapping:
1174 //
1175 // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
1176 //
1177 // Returns the number of copied elements.
1178 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
1179 const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
1180 // Copy single scalar value from `src` to `dst`.
1181 if (NumDims == 0) {
1182 *(dst.data + dst.offset) = *(src.data + src.offset);
1183 return 1;
1184 }
1185
1186 // Both `dst` and `src` must have contiguous innermost dimension. We also
1187 // accept the special case with stride '0', because it's used as a trick to
1188 // implement broadcasting.
1189 {
1190 int inner_dim = IsColMajor ? 0 : NumDims - 1;
1191 EIGEN_UNUSED_VARIABLE(inner_dim);
1192 eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
1193 eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
1194 }
1195
1196 // Give a shorter name to `dst_to_src_dim_map`.
1197 const DimensionsMap& dim_map = dst_to_src_dim_map;
1198
1199 // Do not squeeze reordered inner dimensions.
1200 int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
1201
1202 // NOTE: We find the innermost dimension (contiguous in memory) in the dst
1203 // block, and we write data linearly into that dimension, reading it from
1204 // the src. If dimensions are reordered, we might end up reading data from
1205 // the src with `stride != 1`.
1206 //
1207 // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
1208 // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
1209
1210 // Find the innermost dimension in the dst whose size is not 1. This is the
1211 // effective inner dim.
1212 int num_size_one_inner_dims = 0;
1213 for (int i = 0; i < num_squeezable_dims; ++i) {
1214 const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1215 if (dst.dims[dst_dim] != 1) break;
1216 num_size_one_inner_dims++;
1217 }
1218
1219 // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
1220 if (num_size_one_inner_dims == NumDims) {
1221 *(dst.data + dst.offset) = *(src.data + src.offset);
1222 return 1;
1223 }
1224
1225 // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
1226 const int dst_stride1_dim = IsColMajor
1227 ? num_size_one_inner_dims
1228 : NumDims - num_size_one_inner_dims - 1;
1229
1230 // Dimension in the src that corresponds to the dst innermost dimension.
1231 const int src_dim_for_dst_stride1_dim =
1232 NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
1233
1234 // Size of the innermost dimension (length of contiguous blocks of memory).
1235 IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
1236
1237 // Squeeze multiple inner dims into one if they are contiguous in `dst` and
1238 // `src` memory, so we can do fewer linear copy calls.
1239 for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
1240 const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1241 const IndexType dst_stride = dst.strides[dst_dim];
1242 const IndexType src_stride = src.strides[dim_map[dst_dim]];
1243 if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
1244 dst_inner_dim_size *= dst.dims[dst_dim];
1245 ++num_size_one_inner_dims;
1246 } else {
1247 break;
1248 }
1249 }
1250
1251 // Setup strides to read data from `src` and write to `dst`.
1252 IndexType input_offset = src.offset;
1253 IndexType output_offset = dst.offset;
1254 IndexType input_stride =
1255 NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
1256 IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
1257
1258 const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
1259 array<BlockIteratorState, at_least_1_dim> it;
1260
1261 // Initialize block iterator state. Squeeze away any dimension of size 1.
1262 int idx = 0; // currently initialized iterator state index
1263 for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
1264 const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
1265 if (dst.dims[dst_dim] == 1) continue;
1266
1267 it[idx].size = dst.dims[dst_dim];
1268 it[idx].input_stride = src.strides[dim_map[dst_dim]];
1269 it[idx].output_stride = dst.strides[dst_dim];
1270
1271 it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
1272 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1273
1274 idx++;
1275 }
1276
1277 // Iterate copying data from src to dst.
1278 const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
1279
1280#define COPY_INNER_DIM(KIND) \
1281 IndexType num_copied = 0; \
1282 for (num_copied = 0; num_copied < block_total_size; \
1283 num_copied += dst_inner_dim_size) { \
1284 LinCopy::template Run<KIND>( \
1285 typename LinCopy::Dst(output_offset, output_stride, dst.data), \
1286 typename LinCopy::Src(input_offset, input_stride, src.data), \
1287 dst_inner_dim_size); \
1288 \
1289 for (int j = 0; j < idx; ++j) { \
1290 if (++it[j].count < it[j].size) { \
1291 input_offset += it[j].input_stride; \
1292 output_offset += it[j].output_stride; \
1293 break; \
1294 } \
1295 it[j].count = 0; \
1296 input_offset -= it[j].input_span; \
1297 output_offset -= it[j].output_span; \
1298 } \
1299 } \
1300 return num_copied;
1301
1302 if (input_stride == 1 && output_stride == 1) {
1303 COPY_INNER_DIM(LinCopy::Kind::Linear);
1304 } else if (input_stride == 1 && output_stride != 1) {
1305 COPY_INNER_DIM(LinCopy::Kind::Scatter);
1306 } else if (input_stride == 0 && output_stride == 1) {
1307 COPY_INNER_DIM(LinCopy::Kind::FillLinear);
1308 } else if (input_stride == 0 && output_stride != 1) {
1309 COPY_INNER_DIM(LinCopy::Kind::FillScatter);
1310 } else if (output_stride == 1) {
1311 COPY_INNER_DIM(LinCopy::Kind::Gather);
1312 } else {
1313 COPY_INNER_DIM(LinCopy::Kind::Random);
1314 }
1315
1316#undef COPY_INNER_DIM
1317 }
1318
1319 // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
1320 // the number of copied elements.
1321 static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
1322 const Src& src) {
1323 DimensionsMap dst_to_src_map;
1324 for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
1325 return Copy(dst, src, dst_to_src_map);
1326 }
1327
1328 private:
1329 struct BlockIteratorState {
1330 BlockIteratorState()
1331 : size(0),
1332 count(0),
1333 input_stride(0),
1334 output_stride(0),
1335 input_span(0),
1336 output_span(0) {}
1337
1338 IndexType size;
1339 IndexType count;
1340 IndexType input_stride;
1341 IndexType output_stride;
1342 IndexType input_span;
1343 IndexType output_span;
1344 };
1345
1346 // Compute how many inner dimensions it's allowed to squeeze when doing IO
1347 // between two tensor blocks. It's safe to squeeze inner dimensions only
1348 // if they are not reordered.
1349 static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
1350 int num_squeezable_dims = 0;
1351 for (int i = 0; i < NumDims; ++i) {
1352 const int dim = IsColMajor ? i : NumDims - i - 1;
1353 if (dim_map[dim] != dim) break;
1354 num_squeezable_dims++;
1355 }
1356 return num_squeezable_dims;
1357 }
1358};
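//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// copy a 2x3 block out of a 4x3 ColMajor buffer into a dense 2x3 buffer using
// the identity dimension map.
//
//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> IO;
//   float src[12];  // 4x3 ColMajor source (contents not shown)
//   float dst[6];   // dense 2x3 destination
//   IO::Dimensions dst_dims(2, 3);
//   IO::Dimensions src_strides(1, 4);  // strides of the 4x3 source buffer
//   IO::Dst io_dst(dst_dims, internal::strides<ColMajor>(dst_dims), dst);
//   IO::Src io_src(src_strides, src);
//   IO::Copy(io_dst, io_src);  // returns 6, the number of copied coefficients
//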
1359
1360// -------------------------------------------------------------------------- //
1361// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
1362// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
1363//
1364// Currently there is no way to write from a Tensor expression to a block of
1365// memory, if dimensions are reordered. If you need to do that, you should
1366// materialize a Tensor block expression into a memory buffer, and then use
1367// TensorBlockIO to copy data between two memory buffers with a custom
1368// `target->src` dimension map (see definition above).
1369//
1370// Also currently the innermost dimension of `target` must have a stride '1'
1371// (contiguous in memory). This restriction could be lifted with a `pscatter`,
1372// but in practice it's never needed, and there is a similar TensorBlockIO
1373// workaround for that.
1374//
1375// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
1376// where `src` is a tensor expression. Explore if it is possible to rewrite IO
1377// to use expressions instead of pointers, and after that TensorBlockAssignment
1378// will become an alias to IO.
1379template <typename Scalar, int NumDims, typename TensorBlockExpr,
1380 typename IndexType = Eigen::Index>
1381class TensorBlockAssignment {
1382 // We will use coeff/packet path to evaluate block expressions.
1383 typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
1384 TensorBlockEvaluator;
1385
1386 typedef DSizes<IndexType, NumDims> Dimensions;
1387
1388 enum {
1389 Vectorizable = packet_traits<Scalar>::Vectorizable,
1390 PacketSize = packet_traits<Scalar>::size
1391 };
1392
1393 template <bool Vectorizable, typename Evaluator>
1394 struct InnerDimAssign {
1395 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1396 const Evaluator& eval,
1397 IndexType eval_offset) {
1398 for (IndexType i = 0; i < count; ++i) {
1399 target[i] = eval.coeff(eval_offset + i);
1400 }
1401 }
1402 };
1403
1404 template <typename Evaluator>
1405 struct InnerDimAssign<true, Evaluator> {
1406 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1407 const Evaluator& eval,
1408 IndexType eval_offset) {
1409 typedef typename packet_traits<Scalar>::type Packet;
1410
1411 const IndexType unrolled_size = count - 4 * PacketSize;
1412 const IndexType vectorized_size = count - PacketSize;
1413 IndexType i = 0;
1414
1415 for (; i <= unrolled_size; i += 4 * PacketSize) {
1416 for (int j = 0; j < 4; ++j) {
1417 const IndexType idx = eval_offset + i + j * PacketSize;
1418 Packet p = eval.template packet<Unaligned>(idx);
1419 pstoreu<Scalar>(target + i + j * PacketSize, p);
1420 }
1421 }
1422
1423 for (; i <= vectorized_size; i += PacketSize) {
1424 Packet p = eval.template packet<Unaligned>(eval_offset + i);
1425 pstoreu<Scalar>(target + i, p);
1426 }
1427
1428 for (; i < count; ++i) {
1429 target[i] = eval.coeff(eval_offset + i);
1430 }
1431 }
1432 };
1433
1434 public:
1435 struct Target {
1436 Target(const Dimensions& target_dims, const Dimensions& target_strides,
1437 Scalar* target_data, IndexType target_offset = 0)
1438 : dims(target_dims),
1439 strides(target_strides),
1440 data(target_data),
1441 offset(target_offset) {}
1442
1443 Dimensions dims;
1444 Dimensions strides;
1445 Scalar* data;
1446 IndexType offset;
1447 };
1448
1449 static Target target(const Dimensions& target_dims,
1450 const Dimensions& target_strides, Scalar* target_data,
1451 IndexType target_offset = 0) {
1452 return Target(target_dims, target_strides, target_data, target_offset);
1453 }
1454
1455 template <typename TargetDimsIndexType, typename TargetStridesIndexType>
1456 static Target target(
1457 const DSizes<TargetDimsIndexType, NumDims>& target_dims,
1458 const DSizes<TargetStridesIndexType, NumDims>& target_strides,
1459 Scalar* target_data, IndexType target_offset = 0) {
1460 // DSizes constructor will do index type promotion if it's safe.
1461 return target(Dimensions(target_dims), Dimensions(target_strides),
1462 target_data, target_offset);
1463 }
1464
1465 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1466 const Target& target, const TensorBlockExpr& expr) {
1467 // Prepare evaluator for block expression.
1468 DefaultDevice default_device;
1469 TensorBlockEvaluator eval(expr, default_device);
1470
1471 // Tensor block expression dimension should match destination dimensions.
1472 eigen_assert(dimensions_match(target.dims, eval.dimensions()));
1473
1474 static const int Layout = TensorBlockEvaluator::Layout;
1475 static const bool is_col_major = Layout == ColMajor;
1476
1477 // Initialize output inner dimension size based on a layout.
1478 const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
1479 const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
1480 IndexType output_inner_dim_size = target.dims[inner_dim_idx];
1481
1482 // Target inner dimension stride must be '1'.
1483 eigen_assert(target.strides[inner_dim_idx] == 1);
1484
1485 // Squeeze multiple inner dims into one if they are contiguous in `target`.
1486 IndexType num_squeezed_dims = 0;
1487 for (Index i = 1; i < NumDims; ++i) {
1488 const Index dim = is_col_major ? i : NumDims - i - 1;
1489 const IndexType target_stride = target.strides[dim];
1490
1491 if (output_inner_dim_size == target_stride) {
1492 output_inner_dim_size *= target.dims[dim];
1493 num_squeezed_dims++;
1494 } else {
1495 break;
1496 }
1497 }
1498
1499 // Initialize output block iterator state. Dimensions in this array are
1500 // always in inner_most -> outer_most order (col major layout).
1501 array<BlockIteratorState, NumDims> it;
1502
1503 int idx = 0; // currently initialized iterator state index
1504 for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
1505 const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
1506
1507 it[idx].count = 0;
1508 it[idx].size = target.dims[dim];
1509 it[idx].output_stride = target.strides[dim];
1510 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1511 idx++;
1512 }
1513
1514 // We read block expression from the beginning, and start writing data to
1515 // `target` at given offset.
1516 IndexType input_offset = 0;
1517 IndexType output_offset = target.offset;
1518
1519 // Iterate copying data from `eval` to `target`.
1520 for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
1521 // Assign to `target` at current offset.
1522 InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
1523 TensorBlockEvaluator>::Run(target.data + output_offset,
1524 output_inner_dim_size, eval,
1525 input_offset);
1526
1527 // Move input offset forward by the number of assigned coefficients.
1528 input_offset += output_inner_dim_size;
1529
1530 // Update index.
1531 for (int j = 0; j < idx; ++j) {
1532 if (++it[j].count < it[j].size) {
1533 output_offset += it[j].output_stride;
1534 break;
1535 }
1536 it[j].count = 0;
1537 output_offset -= it[j].output_span;
1538 }
1539 }
1540 }
1541
1542 private:
1543 struct BlockIteratorState {
1544 BlockIteratorState()
1545 : count(0), size(0), output_stride(0), output_span(0) {}
1546
1547 IndexType count;
1548 IndexType size;
1549 IndexType output_stride;
1550 IndexType output_span;
1551 };
1552};
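//
// Usage sketch (illustrative addition, not part of the original Eigen header):
// evaluate a block expression (here simply a TensorMap over `src`) into a
// dense target buffer.
//
//   typedef TensorMap<const Tensor<float, 2> > SrcMap;
//   typedef TensorBlockAssignment<float, 2, SrcMap> Assign;
//   DSizes<Eigen::Index, 2> dims(2, 3);
//   const float src[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
//   float dst[6];
//   Assign::Run(Assign::target(dims, internal::strides<ColMajor>(dims), dst),
//               SrcMap(src, dims));
//   // dst now holds the six coefficients of `src`.
//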
1553
1554// -------------------------------------------------------------------------- //
1555
1556} // namespace internal
1557} // namespace Eigen
1558
1559#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H