TR-mbed 1.0
Loading...
Searching...
No Matches
InverseSize4.h
Go to the documentation of this file.
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2001 Intel Corporation
5// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
6// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
7//
8// This Source Code Form is subject to the terms of the Mozilla
9// Public License v. 2.0. If a copy of the MPL was not distributed
10// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11//
12// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
13// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
14// adjugate of M and determinant of M respectively. M# is computed block-wise
15// using specific formulae. For proof, see:
16// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
17// Variable names are adopted from \src\LU\Inverse_SSE.h.
18//
19// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
20// comes from the following Intel's library:
21// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
22//
23// Here is the respective copyright and license statement:
24//
25// Copyright (c) 2001 Intel Corporation.
26//
27// Permition is granted to use, copy, distribute and prepare derivative works
28// of this library for any purpose and without fee, provided, that the above
29// copyright notice and this statement appear in all copies.
30// Intel makes no representations about the suitability of this software for
31// any purpose, and specifically disclaims all warranties.
32// See LEGAL.TXT for all the legal information.
33//
34// TODO: Unify implementations of different data types (i.e. float and double).
35#ifndef EIGEN_INVERSE_SIZE_4_H
36#define EIGEN_INVERSE_SIZE_4_H
37
38namespace Eigen
39{
40namespace internal
41{
42template <typename MatrixType, typename ResultType>
43struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
44{
45 enum
46 {
49 StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
50 };
51 typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
52
53 static void run(const MatrixType &mat, ResultType &result)
54 {
56
57 const float* data = matrix.data();
58 const Index stride = matrix.innerStride();
63
64 // Four 2x2 sub-matrices of the input matrix
65 // input = [[A, B],
66 // [C, D]]
67 Packet4f A, B, C, D;
68
69 if (!StorageOrdersMatch)
70 {
75 }
76 else
77 {
82 }
83
84 Packet4f AB, DC;
85
86 // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
87 AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
88 AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
89
90 // DC = D#*C
91 DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
92 DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
93
94 // determinants of the sub-matrices
95 Packet4f dA, dB, dC, dD;
96
97 dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
98 dA = psub(dA, vec4f_movehl(dA, dA));
99
100 dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
101 dB = psub(dB, vec4f_movehl(dB, dB));
102
103 dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
104 dC = psub(dC, vec4f_movehl(dC, dC));
105
106 dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
107 dD = psub(dD, vec4f_movehl(dD, dD));
108
109 Packet4f d, d1, d2;
110
111 d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
112 d = padd(d, vec4f_movehl(d, d));
113 d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
114 d1 = pmul(dA, dD);
115 d2 = pmul(dB, dC);
116
117 // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
118 Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
119
120 // reciprocal of the determinant of the input matrix, rd = 1/det
122
123 // Four sub-matrices of the inverse
124 Packet4f iA, iB, iC, iD;
125
126 // iD = D*|A| - C*A#*B
127 iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
128 iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
129 iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
130
131 // iA = A*|D| - B*D#*C
132 iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
133 iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
134 iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
135
136 // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
137 iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
138 iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
139 iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
140
141 // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
142 iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
143 iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
144 iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
145
146 const float sign_mask[4] = {0.0f, numext::bit_cast<float>(0x80000000u), numext::bit_cast<float>(0x80000000u), 0.0f};
149 iA = pmul(iA, rd);
150 iB = pmul(iB, rd);
151 iC = pmul(iC, rd);
152 iD = pmul(iD, rd);
153
154 Index res_stride = result.outerStride();
155 float *res = result.data();
156
161 }
162};
163
164#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
165// same algorithm as above, except that each operand is split into
166// halves for two registers to hold.
167template <typename MatrixType, typename ResultType>
168struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
169{
170 enum
171 {
174 StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
175 };
176 typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
177 MatrixType const &,
178 typename MatrixType::PlainObject>::type
180
181 static void run(const MatrixType &mat, ResultType &result)
182 {
184
185 // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
186 // row e.g. A1, upper row of A, A2, lower row of A
187 // input = [[A, B], = [[[A1, [B1,
188 // [C, D]] A2], B2]],
189 // [[C1, [D1,
190 // C2], D2]]]
191
192 Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
193
194 const double* data = matrix.data();
195 const Index stride = matrix.innerStride();
196 if (StorageOrdersMatch)
197 {
199 B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
201 B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
206 }
207 else
208 {
209 Packet2d temp;
214 temp = A1;
216 A2 = vec2d_unpackhi(temp, A2);
217
218 temp = C1;
220 C2 = vec2d_unpackhi(temp, C2);
221
222 B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
224 B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
226
227 temp = B1;
228 B1 = vec2d_unpacklo(B1, B2);
229 B2 = vec2d_unpackhi(temp, B2);
230
231 temp = D1;
233 D2 = vec2d_unpackhi(temp, D2);
234 }
235
236 // determinants of the sub-matrices
237 Packet2d dA, dB, dC, dD;
238
239 dA = vec2d_swizzle2(A2, A2, 1);
240 dA = pmul(A1, dA);
241 dA = psub(dA, vec2d_duplane(dA, 1));
242
243 dB = vec2d_swizzle2(B2, B2, 1);
244 dB = pmul(B1, dB);
245 dB = psub(dB, vec2d_duplane(dB, 1));
246
247 dC = vec2d_swizzle2(C2, C2, 1);
248 dC = pmul(C1, dC);
249 dC = psub(dC, vec2d_duplane(dC, 1));
250
251 dD = vec2d_swizzle2(D2, D2, 1);
252 dD = pmul(D1, dD);
253 dD = psub(dD, vec2d_duplane(dD, 1));
254
256
257 // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
258 AB1 = pmul(B1, vec2d_duplane(A2, 1));
259 AB2 = pmul(B2, vec2d_duplane(A1, 0));
260 AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
261 AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
262
263 // DC = D#*C
264 DC1 = pmul(C1, vec2d_duplane(D2, 1));
265 DC2 = pmul(C2, vec2d_duplane(D1, 0));
266 DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
267 DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
268
269 Packet2d d1, d2;
270
271 // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
273
274 // reciprocal of the determinant of the input matrix, rd = 1/det
275 Packet2d rd;
276
277 d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
278 d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
279 rd = padd(d1, d2);
280 rd = padd(rd, vec2d_duplane(rd, 1));
281
282 d1 = pmul(dA, dD);
283 d2 = pmul(dB, dC);
284
285 det = padd(d1, d2);
286 det = psub(det, rd);
287 det = vec2d_duplane(det, 0);
288 rd = pdiv(pset1<Packet2d>(1.0), det);
289
290 // rows of four sub-matrices of the inverse
292
293 // iD = D*|A| - C*A#*B
294 iD1 = pmul(AB1, vec2d_duplane(C1, 0));
295 iD2 = pmul(AB1, vec2d_duplane(C2, 0));
296 iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
297 iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
298 dA = vec2d_duplane(dA, 0);
299 iD1 = psub(pmul(D1, dA), iD1);
300 iD2 = psub(pmul(D2, dA), iD2);
301
302 // iA = A*|D| - B*D#*C
303 iA1 = pmul(DC1, vec2d_duplane(B1, 0));
304 iA2 = pmul(DC1, vec2d_duplane(B2, 0));
305 iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
306 iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
307 dD = vec2d_duplane(dD, 0);
308 iA1 = psub(pmul(A1, dD), iA1);
309 iA2 = psub(pmul(A2, dD), iA2);
310
311 // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
312 iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
313 iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
316 dB = vec2d_duplane(dB, 0);
317 iB1 = psub(pmul(C1, dB), iB1);
318 iB2 = psub(pmul(C2, dB), iB2);
319
320 // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
321 iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
322 iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
325 dC = vec2d_duplane(dC, 0);
326 iC1 = psub(pmul(B1, dC), iC1);
327 iC2 = psub(pmul(B2, dC), iC2);
328
329 const double sign_mask1[2] = {0.0, numext::bit_cast<double>(0x8000000000000000ull)};
330 const double sign_mask2[2] = {numext::bit_cast<double>(0x8000000000000000ull), 0.0};
333 d1 = pxor(rd, sign_PN);
334 d2 = pxor(rd, sign_NP);
335
336 Index res_stride = result.outerStride();
337 double *res = result.data();
346 }
347};
348#endif
349} // namespace internal
350} // namespace Eigen
351#endif
MatrixXcd D
Definition EigenSolver_EigenSolver_MatrixType.cpp:14
int data[]
Definition Map_placement_new.cpp:1
#define vec4f_duplane(a, p)
Definition PacketMath.h:137
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition PartialRedux_count.cpp:3
#define vec4f_swizzle2(a, b, p, q, r, s)
Definition PacketMath.h:70
#define vec2d_duplane(a, p)
Definition PacketMath.h:106
#define vec2d_swizzle2(a, b, mask)
Definition PacketMath.h:95
MatrixXf mat
Definition Tutorial_AdvancedInitialization_CommaTemporary.cpp:1
Matrix< Scalar, Dynamic, Dynamic > C
Definition bench_gemm.cpp:50
Matrix< SCALARA, Dynamic, Dynamic, opt_A > A
Definition bench_gemm.cpp:48
Matrix< SCALARB, Dynamic, Dynamic, opt_B > B
Definition bench_gemm.cpp:49
MatrixXf MatrixType
Definition benchmark-blocking-sizes.cpp:52
The matrix class, also used for vectors and row-vectors.
Definition Matrix.h:180
Map< Matrix< T, Dynamic, Dynamic, ColMajor >, 0, OuterStride<> > matrix(T *data, int rows, int cols, int stride)
Definition common.h:110
const unsigned int LinearAccessBit
Definition Constants.h:130
const unsigned int RowMajorBit
Definition Constants.h:66
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:215
EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f &a, const Packet4f &b)
Definition PacketMath.h:121
EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d &a, const Packet2d &b)
Definition PacketMath.h:102
EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f &a, const Packet4f &b)
Definition PacketMath.h:125
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:244
EIGEN_DEVICE_FUNC Packet pmul(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:237
EIGEN_STRONG_INLINE Packet2d ploadu< Packet2d >(const double *from)
Definition PacketMath.h:1004
EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f &a, const Packet4f &b)
Definition PacketMath.h:133
EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f &a, const Packet4f &b)
Definition PacketMath.h:129
EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h &a, const Packet8h &b)
Definition PacketMath.h:1047
EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d &a, const Packet2d &b)
Definition PacketMath.h:98
EIGEN_STRONG_INLINE Packet4f ploadu< Packet4f >(const float *from)
Definition PacketMath.h:968
EIGEN_DEVICE_FUNC Packet psub(const Packet &a, const Packet &b)
Definition GenericPacketMath.h:222
Namespace containing all symbols from the Eigen library.
Definition bench_norm.cpp:85
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
Definition BandTriangularSolver.h:13
static void run(const MatrixType &mat, ResultType &result)
Definition InverseSize4.h:181
static void run(const MatrixType &mat, ResultType &result)
Definition InverseSize4.h:53
Definition InverseImpl.h:235
Definition Meta.h:109
Definition ForwardDeclarations.h:17