/src/Geometry_Eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
C Header | 403 lines | 277 code | 51 blank | 75 comment | 27 complexity | 5727edaa2fbe4675f68639107b5bb574 MD5
Possible License(s): AGPL-3.0, LGPL-2.1, LGPL-3.0, GPL-2.0
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// Eigen is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3 of the License, or (at your option) any later version.
//
// Alternatively, you can redistribute it and/or
// modify it under the terms of the GNU General Public License as
// published by the Free Software Foundation; either version 2 of
// the License, or (at your option) any later version.
//
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License and a copy of the GNU General Public License along with
// Eigen. If not, see <http://www.gnu.org/licenses/>.

#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H
#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H

namespace internal {

// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode>
// struct gemm_pack_lhs_triangular
// {
//   Matrix<Scalar,mr,mr,
//   void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
//   {
//     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
//     const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
//     int count = 0;
//     const int peeled_mc = (rows/mr)*mr;
//     for(int i=0; i<peeled_mc; i+=mr)
//     {
//       for(int k=0; k<depth; k++)
//         for(int w=0; w<mr; w++)
//           blockA[count++] = cj(lhs(i+w, k));
//     }
//     for(int i=peeled_mc; i<rows; i++)
//     {
//       for(int k=0; k<depth; k++)
//         blockA[count++] = cj(lhs(i, k));
//     }
//   }
// };

/* Optimized triangular matrix * matrix (_TRMM++) product built on top of
 * the general matrix matrix product.
 */
template <typename Scalar, typename Index,
          int Mode, bool LhsIsTriangular,
          int LhsStorageOrder, bool ConjugateLhs,
          int RhsStorageOrder, bool ConjugateRhs,
          int ResStorageOrder>
struct product_triangular_matrix_matrix;

template <typename Scalar, typename Index,
          int Mode, bool LhsIsTriangular,
          int LhsStorageOrder, bool ConjugateLhs,
          int RhsStorageOrder, bool ConjugateRhs>
struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
                                        LhsStorageOrder,ConjugateLhs,
                                        RhsStorageOrder,ConjugateRhs,RowMajor>
{
  static EIGEN_STRONG_INLINE void run(
    Index rows, Index cols, Index depth,
    const Scalar* lhs, Index lhsStride,
    const Scalar* rhs, Index rhsStride,
    Scalar* res,       Index resStride,
    Scalar alpha)
  {
    product_triangular_matrix_matrix<Scalar, Index,
      (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper),
      (!LhsIsTriangular),
      RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
      ConjugateRhs,
      LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
      ConjugateLhs,
      ColMajor>
      ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha);
  }
};

// implements col-major += alpha * op(triangular) * op(general)
template <typename Scalar, typename Index, int Mode,
          int LhsStorageOrder, bool ConjugateLhs,
          int RhsStorageOrder, bool ConjugateRhs>
struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
                                        LhsStorageOrder,ConjugateLhs,
                                        RhsStorageOrder,ConjugateRhs,ColMajor>
{

  typedef gebp_traits<Scalar,Scalar> Traits;
  enum {
    SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
    IsLower = (Mode&Lower) == Lower,
    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
  };

  static EIGEN_DONT_INLINE void run(
    Index _rows, Index _cols, Index _depth,
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
    Scalar* res,        Index resStride,
    Scalar alpha)
  {
    // strip zeros
    Index diagSize = (std::min)(_rows,_depth);
    Index rows     = IsLower ? _rows : diagSize;
    Index depth    = IsLower ? diagSize : _depth;
    Index cols     = _cols;

    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);

    Index kc = depth; // cache block size along the K direction
    Index mc = rows;  // cache block size along the M direction
    Index nc = cols;  // cache block size along the N direction
    computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);
    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
    std::size_t sizeB = sizeW + kc*cols;
    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
    Scalar* blockB = allocatedBlockB + sizeW;

    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
    triangularBuffer.setZero();
    if((Mode&ZeroDiag)==ZeroDiag)
      triangularBuffer.diagonal().setZero();
    else
      triangularBuffer.diagonal().setOnes();

    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;

    for(Index k2=IsLower ? depth : 0;
        IsLower ? k2>0 : k2<depth;
        IsLower ? k2-=kc : k2+=kc)
    {
      Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc);
      Index actual_k2 = IsLower ? k2-actual_kc : k2;

      // align blocks with the end of the triangular part for trapezoidal lhs
      if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows))
      {
        actual_kc = rows-k2;
        k2 = k2+actual_kc-kc;
      }

      pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);

      // the selected lhs's panel has to be split in three different parts:
      //  1 - the part which is zero => skip it
      //  2 - the diagonal block => special kernel
      //  3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP

      // the block diagonal, if any:
      if(IsLower || actual_k2<rows)
      {
        // for each small vertical panels of lhs
        for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
        {
          Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
          Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
          Index startBlock   = actual_k2+k1;
          Index blockBOffset = k1;

          // => GEBP with the micro triangular block
          // The trick is to pack this micro block while filling the opposite triangular part with zeros.
          // To this end we do an extra triangular copy to a small temporary buffer
          for (Index k=0;k<actualPanelWidth;++k)
          {
            if (SetDiag)
              triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k);
            for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
              triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
          }
          pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);

          gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
                      actualPanelWidth, actual_kc, 0, blockBOffset);

          // GEBP with remaining micro panel
          if (lengthTarget>0)
          {
            Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;

            pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);

            gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
                        actualPanelWidth, actual_kc, 0, blockBOffset);
          }
        }
      }
      // the part below (lower case) or above (upper case) the diagonal => GEPP
      {
        Index start = IsLower ? k2 : 0;
        Index end   = IsLower ? rows : (std::min)(actual_k2,rows);
        for(Index i2=start; i2<end; i2+=mc)
        {
          const Index actual_mc = (std::min)(i2+mc,end)-i2;
          gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
            (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);

          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
        }
      }
    }
  }
};

// implements col-major += alpha * op(general) * op(triangular)
template <typename Scalar, typename Index, int Mode,
          int LhsStorageOrder, bool ConjugateLhs,
          int RhsStorageOrder, bool ConjugateRhs>
struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
                                        LhsStorageOrder,ConjugateLhs,
                                        RhsStorageOrder,ConjugateRhs,ColMajor>
{
  typedef gebp_traits<Scalar,Scalar> Traits;
  enum {
    SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
    IsLower = (Mode&Lower) == Lower,
    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
  };

  static EIGEN_DONT_INLINE void run(
    Index _rows, Index _cols, Index _depth,
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
    Scalar* res,        Index resStride,
    Scalar alpha)
  {
    // strip zeros
    Index diagSize = (std::min)(_cols,_depth);
    Index rows     = _rows;
    Index depth    = IsLower ? _depth : diagSize;
    Index cols     = IsLower ? diagSize : _cols;

    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);

    Index kc = depth; // cache block size along the K direction
    Index mc = rows;  // cache block size along the M direction
    Index nc = cols;  // cache block size along the N direction
    computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc);

    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
    std::size_t sizeB = sizeW + kc*cols;
    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
    Scalar* blockB = allocatedBlockB + sizeW;

    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
    triangularBuffer.setZero();
    if((Mode&ZeroDiag)==ZeroDiag)
      triangularBuffer.diagonal().setZero();
    else
      triangularBuffer.diagonal().setOnes();

    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;

    for(Index k2=IsLower ? 0 : depth;
        IsLower ? k2<depth : k2>0;
        IsLower ? k2+=kc : k2-=kc)
    {
      Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc);
      Index actual_k2 = IsLower ? k2 : k2-actual_kc;

      // align blocks with the end of the triangular part for trapezoidal rhs
      if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols))
      {
        actual_kc = cols-k2;
        k2 = actual_k2 + actual_kc - kc;
      }

      // remaining size
      Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2;
      // size of the triangular part
      Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;

      Scalar* geb = blockB+ts*ts;

      pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);

      // pack the triangular part of the rhs padding the unrolled blocks with zeros
      if(ts>0)
      {
        for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
        {
          Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
          Index actual_j2 = actual_k2 + j2;
          Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
          Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
          // general part
          pack_rhs_panel(blockB+j2*actual_kc,
                         &rhs(actual_k2+panelOffset, actual_j2), rhsStride,
                         panelLength, actualPanelWidth,
                         actual_kc, panelOffset);

          // append the triangular part via a temporary buffer
          for (Index j=0;j<actualPanelWidth;++j)
          {
            if (SetDiag)
              triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j);
            for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k)
              triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
          }

          pack_rhs_panel(blockB+j2*actual_kc,
                         triangularBuffer.data(), triangularBuffer.outerStride(),
                         actualPanelWidth, actualPanelWidth,
                         actual_kc, j2);
        }
      }

      for (Index i2=0; i2<rows; i2+=mc)
      {
        const Index actual_mc = (std::min)(mc,rows-i2);
        pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);

        // triangular kernel
        if(ts>0)
        {
          for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
          {
            Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
            Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
            Index blockOffset = IsLower ? j2 : 0;

            gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
                        blockA, blockB+j2*actual_kc,
                        actual_mc, panelLength, actualPanelWidth,
                        alpha,
                        actual_kc, actual_kc,     // strides
                        blockOffset, blockOffset, // offsets
                        allocatedBlockB);         // workspace
          }
        }
        gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
                    blockA, geb, actual_mc, actual_kc, rs,
                    alpha,
                    -1, -1, 0, 0, allocatedBlockB);
      }
    }
  }
};

/***************************************************************************
* Wrapper to product_triangular_matrix_matrix
***************************************************************************/

template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
  : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
{};

} // end namespace internal

template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
  : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
{
  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)

  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}

  template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const
  {
    const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
    const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);

    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
                               * RhsBlasTraits::extractScalarFactor(m_rhs);

    internal::product_triangular_matrix_matrix<Scalar, Index,
      Mode, LhsIsTriangular,
      (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
      (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
      (internal::traits<Dest          >::Flags&RowMajorBit) ? RowMajor : ColMajor>
      ::run(
        lhs.rows(), rhs.cols(), lhs.cols(),// LhsIsTriangular ? rhs.cols() : lhs.rows(), // sizes
        &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
        &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
        &dst.coeffRef(0,0), dst.outerStride(), // result info
        actualAlpha                            // alpha
      );
  }
};


#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
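
Usage note: the kernels in this header are internal and are normally reached through Eigen's public triangular-view API, as dispatched by TriangularProduct::scaleAndAddTo above. The following is a minimal sketch of how that happens, assuming dynamic-size double matrices; the matrix names A, B, C, D are illustrative only.

#include <Eigen/Dense>

int main()
{
  // Only the lower part of A is read when it is wrapped in a lower triangular view.
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(4,4);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(4,3);

  // triangular * general: dispatches to product_triangular_matrix_matrix
  // with LhsIsTriangular = true.
  Eigen::MatrixXd C = A.triangularView<Eigen::Lower>() * B;

  // general * triangular: dispatches with LhsIsTriangular = false.
  Eigen::MatrixXd D = B.transpose() * A.triangularView<Eigen::Upper>();

  return 0;
}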