#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
namespace Eigen {
namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> > : public traits<XprType> {
  typedef traits<XprType> XprTraits;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};
template <typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense> {
  typedef const TensorScanOp<Op, XprType>& type;
};
template <typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type> {
  typedef TensorScanOp<Op, XprType> type;
};

}  // end namespace internal
/** \class TensorScanOp
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
 public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; }

 protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};
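// In user code this node is normally built through TensorBase::cumsum() and
// TensorBase::cumprod() rather than constructed directly. A minimal usage
// sketch (illustrative, separate translation unit, not part of this header):
#if 0
#include <unsupported/Eigen/CXX11/Tensor>

void scan_example() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();
  // Inclusive scan along dimension 0: out(i, j) = sum over k <= i of t(k, j).
  Eigen::Tensor<float, 2> inclusive = t.cumsum(0);
  // Exclusive scan: out(i, j) = sum over k < i of t(k, j), so out(0, j) == 0.
  Eigen::Tensor<float, 2> exclusive = t.cumsum(0, /*exclusive=*/true);
}
#endif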
namespace internal {

// Computes one scalar scan along the scan axis, starting at `offset`.
template <typename Self>
EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  // Compute the scan along the axis, starting at the given offset.
  typename Self::CoeffReturnType accum = self.accumulator().initialize();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
}
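// The addressing above in isolation: each scan touches `size` elements spaced
// `stride` apart, starting at `offset`. A standalone sketch for a sum reducer
// (plain arrays, hypothetical sizes; not part of this header):
#if 0
void strided_inclusive_sum(const float* in, float* out,
                           int offset, int size, int stride) {
  float accum = 0.0f;                    // accumulator().initialize()
  for (int idx3 = 0; idx3 < size; ++idx3) {
    const int curr = offset + idx3 * stride;
    accum += in[curr];                   // accumulator().reduce()
    out[curr] = accum;                   // accumulator().finalize()
  }
}
#endif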
// Computes PacketSize adjacent scans at once, starting at `offset`.
template <typename Self>
EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
                                      typename Self::CoeffReturnType* data) {
  using Scalar = typename Self::CoeffReturnType;
  using Packet = typename Self::PacketReturnType;
  // Compute the scan along the axis, starting at the calculated offset.
  Packet accum = self.accumulator().template initializePacket<Packet>();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  }
}
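// ReducePacket advances PacketSize neighboring scans in lockstep, one per
// SIMD lane; the scans are adjacent in memory because they differ only in
// the sub-stride index. A scalar emulation (hypothetical lane count, not
// part of this header):
#if 0
constexpr int kLanes = 4;  // stand-in for unpacket_traits<Packet>::size

void lane_parallel_inclusive_sum(const float* in, float* out,
                                 int offset, int size, int stride) {
  float accum[kLanes] = {};                  // initializePacket()
  for (int idx3 = 0; idx3 < size; ++idx3) {
    const int curr = offset + idx3 * stride;
    for (int lane = 0; lane < kLanes; ++lane) {
      accum[lane] += in[curr + lane];        // reducePacket()
      out[curr + lane] = accum[lane];        // finalizePacket() + pstoreu()
    }
  }
}
#endif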
// Default (scalar, sequential) block reduction: one scan per inner offset.
template <typename Self, bool Vectorize, bool Parallel>
struct ReduceBlock {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan.
      ReduceScalar(self, idx1 + idx2, data);
    }
  }
};
// Specialization for vectorized reduction: packet scans first, scalar tail.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index idx2 = 0;
    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
      ReducePacket(self, idx1 + idx2, data);
    }
    for (; idx2 < self.stride(); idx2++) {
      ReduceScalar(self, idx1 + idx2, data);
    }
  }
};
// Single-threaded CPU implementation of scan.
template <typename Self, typename Reducer, typename Device,
          bool Vectorize =
              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
               internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    // Fix the index along the scan axis to 0 and perform one scan per
    // remaining entry; idx1 walks over outer blocks so no division is needed.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
      block_reducer(self, idx1, data);
    }
  }
};
#ifdef EIGEN_USE_THREADS

// Adjust block_size to avoid false sharing of cachelines among threads.
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
  const Index items_per_cacheline =
      numext::maxi<Index>(1, kBlockAlignment / item_size);
  return items_per_cacheline * divup(block_size, items_per_cacheline);
}
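// Worked example: with 4-byte floats, items_per_cacheline = 128 / 4 = 32,
// so a requested shard of 100 items is rounded up to 32 * divup(100, 32)
// = 32 * 4 = 128 items, keeping shard boundaries cacheline-aligned.
static_assert(32 * ((100 + 32 - 1) / 32) == 128,
              "AdjustBlockSize worked example");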
// Multi-threaded + vectorized block reduction.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index num_scalars = self.stride();
    Index num_packets = 0;
    if (self.stride() >= PacketSize) {
      num_packets = self.stride() / PacketSize;
      self.device().parallelFor(
          num_packets,
          TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
                       16 * PacketSize * self.size(), true, PacketSize),
          // Make the shard size large enough that two neighboring threads
          // won't write to the same cacheline of `data`.
          [=](Index blk_size) {
            return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index packet = first; packet < last; ++packet) {
              const Index idx2 = packet * PacketSize;
              ReducePacket(self, idx1 + idx2, data);
            }
          });
      num_scalars -= num_packets * PacketSize;
    }
    self.device().parallelFor(
        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index scalar = first; scalar < last; ++scalar) {
            const Index idx2 = num_packets * PacketSize + scalar;
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};
// Multi-threaded, scalar-only block reduction.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
                                      typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    self.device().parallelFor(
        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
        [=](Index blk_size) {
          return AdjustBlockSize(sizeof(Scalar), blk_size);
        },
        [&](Index first, Index last) {
          for (Index idx2 = first; idx2 < last; ++idx2) {
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};
// Specialization for multi-threaded execution.
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    const Index total_size = internal::array_prod(self.dimensions());
    const Index inner_block_size = self.stride() * self.size();
    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));

    // Small problems, or inner blocks too narrow to shard: run sequentially.
    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
      launcher(self, data);
      return;
    }

    if (parallelize_by_outer_blocks) {
      // Parallelize over the independent outer blocks.
      const Index num_outer_blocks = total_size / inner_block_size;
      self.device().parallelFor(
          num_outer_blocks,
          TensorOpCost(inner_block_size, inner_block_size,
                       16 * PacketSize * inner_block_size, Vectorize,
                       PacketSize),
          [=](Index blk_size) {
            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
          },
          [&](Index first, Index last) {
            for (Index idx1 = first; idx1 < last; ++idx1) {
              ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
              block_reducer(self, idx1 * inner_block_size, data);
            }
          });
    } else {
      // Parallelize over the packets/scalars inside each block.
      ReduceBlock<Self, Vectorize, /*Parallel=*/true> block_reducer;
      for (Index idx1 = 0; idx1 < total_size;
           idx1 += self.stride() * self.size()) {
        block_reducer(self, idx1, data);
      }
    }
  }
};
#endif  // EIGEN_USE_THREADS
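// Worked examples of the dispatch heuristic above (illustrative shapes; for
// a ColMajor tensor of shape (s0, s1, s2) scanned along axis 1, stride = s0
// and size = s1, so inner_block_size = s0 * s1):
//
//   shape (4, 8, 1024):  stride=4, size=8, inner_block_size=32,
//     total_size=32768 >= 4*32      -> parallelize over 1024 outer blocks.
//   shape (1024, 8, 4):  stride=1024, size=8, inner_block_size=8192,
//     total_size=32768 <  1024*8192 -> parallelize inside each block.
//   shape (4, 8, 16):    total_size=512 <= 4096 -> single-threaded fallback.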
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))

// GPU implementation of scan: one thread per independent scan, each thread
// walking its scan axis sequentially.
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(
    Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute the starting offset as in the CPU version.
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset.
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;

    LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0,
                      self.device(), self, total_size, data);
  }
};
#endif  // EIGEN_USE_GPU && EIGEN_GPUCC

}  // end namespace internal
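// Launch arithmetic in the GPU path above (illustrative numbers): a
// 1000 x 8 tensor scanned along its length-8 axis has
// total_size / size = 8000 / 8 = 1000 independent scans, so
// num_blocks = (1000 + 63) / 64 = 16 blocks of 64 threads; the final
// 24 threads fail the bounds check in ScanKernel and do no work.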
// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef const ArgType ChildTypeNoConst;
  typedef const ArgType ChildType;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  typedef internal::TensorBlockNotImplemented TensorBlock;

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1), m_consume_dim(op.axis()),
        m_output(NULL) {
    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute the stride of the scan axis: the product of the dimensions
    // that vary faster than the scan axis in memory.
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      // dims can only be indexed through unsigned integers, so use an
      // unsigned loop variable to avoid spurious compiler warnings.
      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
      for (unsigned int i = NumDims - 1; i > axis; --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { return m_stride; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { return m_consume_dim; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { return m_size; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const { return m_impl; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    internal::ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }
    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<EvaluatorPointerType>(
        m_device.get((Scalar*)m_device.allocate_temp(total_size * sizeof(Scalar))));
    launcher(*this, m_output);
    return true;
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_STRONG_INLINE void cleanup() {
    if (m_output) {
      m_device.deallocate_temp(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

 protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device EIGEN_DEVICE_REF m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  Index m_consume_dim;
  EvaluatorPointerType m_output;
};

}  // end namespace Eigen
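// End-to-end sketch (illustrative, not part of this header): evaluating a
// scan on the thread-pool device. evalSubExprsIfNeeded() above allocates the
// temporary output buffer and dispatches to the ThreadPoolDevice
// ScanLauncher specialization; coeff()/packet() then read from that buffer.
#if 0
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(/*num_threads=*/4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> input(256, 256);
  input.setConstant(1.0f);

  Eigen::Tensor<float, 2> result(256, 256);
  result.device(device) = input.cumsum(/*axis=*/0);
  // result(i, j) == float(i + 1): inclusive sum of ones along axis 0.
  return 0;
}
#endif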
#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H