TR-mbed 1.0
Loading...
Searching...
No Matches
TensorRandom.h
Go to the documentation of this file.
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
6//
7// This Source Code Form is subject to the terms of the Mozilla
8// Public License v. 2.0. If a copy of the MPL was not distributed
9// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
11#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
12#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
13
14namespace Eigen {
15namespace internal {
16
17namespace {
18
19EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
20#if defined(EIGEN_GPU_COMPILE_PHASE)
21 // We don't support 3d kernels since we currently only use 1 and
22 // 2d kernels.
23 gpu_assert(threadIdx.z == 0);
24 return blockIdx.x * blockDim.x + threadIdx.x
25 + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
26#else
27 // Rely on Eigen's random implementation.
28 return random<uint64_t>();
29#endif
30}
31
32static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
33 // TODO: Unify with the implementation in the non blocking thread pool.
34 uint64_t current = *state;
35 // Update the internal state
36 *state = current * 6364136223846793005ULL + (stream << 1 | 1);
37 // Generate the random output (using the PCG-XSH-RS scheme)
38 return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
39}
40
41static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
42 seed = seed ? seed : get_random_seed();
43 return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
44}
45
46} // namespace
47
48
49template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
50T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
51 unsigned rnd = PCG_XSH_RS_generator(state, stream);
52 return static_cast<T>(rnd);
53}
54
55
58 // Generate 10 random bits for the mantissa, merge with exponent.
59 unsigned rnd = PCG_XSH_RS_generator(state, stream);
60 const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
61 Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
62 // Return the final result
63 return result - Eigen::half(1.0f);
64}
65
68
69 // Generate 7 random bits for the mantissa, merge with exponent.
70 unsigned rnd = PCG_XSH_RS_generator(state, stream);
71 const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
72 Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
73 // Return the final result
74 return result - Eigen::bfloat16(1.0f);
75}
76
78float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
79 typedef union {
80 uint32_t raw;
81 float fp;
82 } internal;
84 // Generate 23 random bits for the mantissa mantissa
85 const unsigned rnd = PCG_XSH_RS_generator(state, stream);
86 result.raw = rnd & 0x7fffffu;
87 // Set the exponent
88 result.raw |= (static_cast<uint32_t>(127) << 23);
89 // Return the final result
90 return result.fp - 1.0f;
91}
92
94double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
95 typedef union {
96 uint64_t raw;
97 double dp;
98 } internal;
100 result.raw = 0;
101 // Generate 52 random bits for the mantissa
102 // First generate the upper 20 bits
103 unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
104 // The generate the lower 32 bits
105 unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
106 result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
107 // Set the exponent
108 result.raw |= (static_cast<uint64_t>(1023) << 52);
109 // Return the final result
110 return result.dp - 1.0;
111}
112
114std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
115 return std::complex<float>(RandomToTypeUniform<float>(state, stream),
117}
119std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
120 return std::complex<double>(RandomToTypeUniform<double>(state, stream),
122}
123
124template <typename T> class UniformRandomGenerator {
125 public:
126 static const bool PacketAccess = true;
127
128 // Uses the given "seed" if non-zero, otherwise uses a random seed.
130 uint64_t seed = 0) {
131 m_state = PCG_XSH_RS_state(seed);
132 #ifdef EIGEN_USE_SYCL
133 // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
134 // Therefor, we need two step to initializate the m_state.
135 // IN SYCL, the constructor of the functor is s called on the CPU
136 // and we get the clock seed here from the CPU. However, This seed is
137 //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
138 // and only available on the Operator() function (which is called on the GPU).
139 // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
140 // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
141 // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
142 // similar to CUDA Therefore, the thread Id injection is not available at this stage.
143 //However when the operator() is called the thread ID will be avilable. So inside the opeator,
144 // we add the thrreadID, BlockId,... (which is equivalent of i)
145 //to the seed and construct the unique m_state per thead similar to cuda.
146 m_exec_once =false;
147 #endif
148 }
150 const UniformRandomGenerator& other) {
151 m_state = other.m_state;
152 #ifdef EIGEN_USE_SYCL
153 m_exec_once =other.m_exec_once;
154 #endif
155 }
156
157 template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
159 #ifdef EIGEN_USE_SYCL
160 if(!m_exec_once) {
161 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
162 // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
163 m_state += (i * 6364136223846793005ULL);
164 m_exec_once =true;
165 }
166 #endif
167 T result = RandomToTypeUniform<T>(&m_state, i);
168 return result;
169 }
170
171 template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
175 #ifdef EIGEN_USE_SYCL
176 if(!m_exec_once) {
177 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
178 m_state += (i * 6364136223846793005ULL);
179 m_exec_once =true;
180 }
181 #endif
183 for (int j = 0; j < packetSize; ++j) {
184 values[j] = RandomToTypeUniform<T>(&m_state, i);
185 }
186 return internal::pload<Packet>(values);
187 }
188
189 private:
190 mutable uint64_t m_state;
191 #ifdef EIGEN_USE_SYCL
192 mutable bool m_exec_once;
193 #endif
194};
195
196template <typename Scalar>
198 enum {
199 // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
201 ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
203 };
204};
205
206
207
208template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
209T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
210 // Use the ratio of uniform method to generate numbers following a normal
211 // distribution. See for example Numerical Recipes chapter 7.3.9 for the
212 // details.
213 T u, v, q;
214 do {
215 u = RandomToTypeUniform<T>(state, stream);
216 v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
217 const T x = u - T(0.449871);
218 const T y = numext::abs(v) + T(0.386595);
219 q = x*x + y * (T(0.196)*y - T(0.25472)*x);
220 } while (q > T(0.27597) &&
221 (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
222
223 return v/u;
224}
225
227std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
228 return std::complex<float>(RandomToTypeNormal<float>(state, stream),
230}
232std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
233 return std::complex<double>(RandomToTypeNormal<double>(state, stream),
235}
236
237
238template <typename T> class NormalRandomGenerator {
239 public:
240 static const bool PacketAccess = true;
241
242 // Uses the given "seed" if non-zero, otherwise uses a random seed.
244 m_state = PCG_XSH_RS_state(seed);
245 #ifdef EIGEN_USE_SYCL
246 // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
247 // Therefor, we need two steps to initializate the m_state.
248 // IN SYCL, the constructor of the functor is s called on the CPU
249 // and we get the clock seed here from the CPU. However, This seed is
250 //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
251 // and only available on the Operator() function (which is called on the GPU).
252 // Therefore, the thread Id injection is not available at this stage. However when the operator()
253 //is called the thread ID will be avilable. So inside the opeator,
254 // we add the thrreadID, BlockId,... (which is equivalent of i)
255 //to the seed and construct the unique m_state per thead similar to cuda.
256 m_exec_once =false;
257 #endif
258 }
260 const NormalRandomGenerator& other) {
261 m_state = other.m_state;
262#ifdef EIGEN_USE_SYCL
263 m_exec_once=other.m_exec_once;
264#endif
265 }
266
267 template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
269 #ifdef EIGEN_USE_SYCL
270 if(!m_exec_once) {
271 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
272 m_state += (i * 6364136223846793005ULL);
273 m_exec_once =true;
274 }
275 #endif
276 T result = RandomToTypeNormal<T>(&m_state, i);
277 return result;
278 }
279
280 template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
284 #ifdef EIGEN_USE_SYCL
285 if(!m_exec_once) {
286 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
287 m_state += (i * 6364136223846793005ULL);
288 m_exec_once =true;
289 }
290 #endif
292 for (int j = 0; j < packetSize; ++j) {
293 values[j] = RandomToTypeNormal<T>(&m_state, i);
294 }
295 return internal::pload<Packet>(values);
296 }
297
298 private:
299 mutable uint64_t m_state;
300 #ifdef EIGEN_USE_SYCL
301 mutable bool m_exec_once;
302 #endif
303};
304
305
306template <typename Scalar>
308 enum {
309 // On average, we need to generate about 3 random numbers
310 // 15 mul, 8 add, 1.5 logs
315 };
316};
317
318
319} // end namespace internal
320} // end namespace Eigen
321
322#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
Array< int, Dynamic, 1 > v
Definition Array_initializer_list_vector_cxx11.cpp:1
int i
Definition BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ALIGN_MAX
Definition ConfigureVectorization.h:157
#define EIGEN_UNROLL_LOOP
Definition Macros.h:1461
#define EIGEN_DEVICE_FUNC
Definition Macros.h:976
#define EIGEN_STRONG_INLINE
Definition Macros.h:917
Eigen::Triplet< double > T
Definition Tutorial_sparse_example.cpp:6
SCALAR Scalar
Definition bench_gemm.cpp:46
Definition TensorRandom.h:238
static const bool PacketAccess
Definition TensorRandom.h:240
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed=0)
Definition TensorRandom.h:243
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const
Definition TensorRandom.h:281
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator &other)
Definition TensorRandom.h:259
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const
Definition TensorRandom.h:268
Definition TensorRandom.h:124
static const bool PacketAccess
Definition TensorRandom.h:126
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const
Definition TensorRandom.h:158
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const
Definition TensorRandom.h:172
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator &other)
Definition TensorRandom.h:149
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed=0)
Definition TensorRandom.h:129
set noclip points set clip one set noclip two set bar set border lt lw set xdata set ydata set zdata set x2data set y2data set boxwidth set dummy x
Definition gnuplot_common_settings.hh:12
dim3 threadIdx
Definition gpu_common.h:19
dim3 blockDim
Definition gpu_common.h:19
dim3 blockIdx
Definition gpu_common.h:19
const Scalar & y
Definition MathFunctions.h:821
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t *state, uint64_t stream)
Definition TensorRandom.h:209
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform< Eigen::half >(uint64_t *state, uint64_t stream)
Definition TensorRandom.h:57
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform< float >(uint64_t *state, uint64_t stream)
Definition TensorRandom.h:78
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t *state, uint64_t stream)
Definition TensorRandom.h:50
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform< double >(uint64_t *state, uint64_t stream)
Definition TensorRandom.h:94
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform< Eigen::bfloat16 >(uint64_t *state, uint64_t stream)
Definition TensorRandom.h:67
::uint64_t uint64_t
Definition Meta.h:58
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T &x)
Definition MathFunctions.h:1489
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE internal::enable_if< NumTraits< T >::IsSigned||NumTraits< T >::IsComplex, typenameNumTraits< T >::Real >::type abs(const T &x)
Definition MathFunctions.h:1509
Namespace containing all symbols from the Eigen library.
Definition bench_norm.cpp:85
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
Definition BandTriangularSolver.h:13
Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
Definition NumTraits.h:233
Definition BFloat16.h:58
Definition Half.h:142
Definition XprHelper.h:176
@ PacketAccess
Definition XprHelper.h:180
@ Cost
Definition XprHelper.h:179
Definition ForwardDeclarations.h:17
Definition GenericPacketMath.h:133
std::ptrdiff_t j
Definition tut_arithmetic_redux_minmax.cpp:2
Definition PacketMath.h:47