/src/mat/impls/baij/seq/dgefa4.c

Source: https://bitbucket.org/tisaac/petsc

/*
   Inverts 4 by 4 matrix using partial pivoting.

   Used by the sparse factorization routines in
   src/mat/impls/baij/seq

   This is a combination of the Linpack routines
   dgefa() and dgedi() specialized for a size of 4.
*/
#include <petscsys.h>

#undef __FUNCT__
#define __FUNCT__ "PetscKernel_A_gets_inverse_A_4"
PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar *a,PetscReal shift)
{
  PetscInt  i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
  PetscInt  k4,j3;
  MatScalar *aa,*ax,*ay,work[16],stmp;
  MatReal   tmp,max;

  /* gaussian elimination with partial pivoting */

  PetscFunctionBegin;
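  /*
     Rescale the caller's shift into a relative perturbation: 0.25 times the
     sum of the diagonal magnitudes is their mean, and the 1.e-12 term keeps
     the perturbation nonzero even when the diagonal is entirely zero.
  */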
  shift = .25*shift*(1.e-12 + PetscAbsScalar(a[0]) + PetscAbsScalar(a[5]) + PetscAbsScalar(a[10]) + PetscAbsScalar(a[15]));

  /* Parameter adjustments */
  a -= 5;

  for (k = 1; k <= 3; ++k) {
    kp1 = k + 1;
    k3  = 4*k;
    k4  = k3 + k;

    /* find l = pivot index */
    i__2 = 5 - k;
    aa   = &a[k4];
    max  = PetscAbsScalar(aa[0]);
    l    = 1;
    for (ll=1; ll<i__2; ll++) {
      tmp = PetscAbsScalar(aa[ll]);
      if (tmp > max) { max = tmp; l = ll+1; }
    }
    l        += k - 1;
    ipvt[k-1] = l;

    if (a[l + k3] == 0.0) {
      if (shift == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
      else {
        /* SHIFT is applied to SINGLE diagonal entry; does this make any sense? */
        a[l + k3] = shift;
      }
    }

    /* interchange if necessary */
    if (l != k) {
      stmp      = a[l + k3];
      a[l + k3] = a[k4];
      a[k4]     = stmp;
    }

    /* compute multipliers */
    stmp = -1. / a[k4];
    i__2 = 4 - k;
    aa   = &a[1 + k4];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;

    /* row elimination with column indexing */
    ax = &a[k4+1];
    for (j = kp1; j <= 4; ++j) {
      j3   = 4*j;
      stmp = a[l + j3];
      if (l != k) {
        a[l + j3] = a[k + j3];
        a[k + j3] = stmp;
      }

      i__3 = 4 - k;
      ay   = &a[1+k+j3];
      for (ll=0; ll<i__3; ll++) ay[ll] += stmp*ax[ll];
    }
  }
  ipvt[3] = 4;
  if (a[20] == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);

  /*
     Now form the inverse
  */

  /* compute inverse(u) */
  for (k = 1; k <= 4; ++k) {
    k3    = 4*k;
    k4    = k3 + k;
    a[k4] = 1.0 / a[k4];
    stmp  = -a[k4];
    i__2  = k - 1;
    aa    = &a[k3 + 1];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;

    kp1 = k + 1;
    if (4 < kp1) continue;
    ax = aa;
    for (j = kp1; j <= 4; ++j) {
      j3        = 4*j;
      stmp      = a[k + j3];
      a[k + j3] = 0.0;
      ay        = &a[j3 + 1];
      for (ll=0; ll<k; ll++) ay[ll] += stmp*ax[ll];
    }
  }

  /* form inverse(u)*inverse(l) */
  for (kb = 1; kb <= 3; ++kb) {
    k   = 4 - kb;
    k3  = 4*k;
    kp1 = k + 1;
    aa  = a + k3;
    for (i = kp1; i <= 4; ++i) {
      work[i-1] = aa[i];
      aa[i]     = 0.0;
    }
    for (j = kp1; j <= 4; ++j) {
      stmp   = work[j-1];
      ax     = &a[4*j + 1];
      ay     = &a[k3 + 1];
      ay[0] += stmp*ax[0];
      ay[1] += stmp*ax[1];
      ay[2] += stmp*ax[2];
      ay[3] += stmp*ax[3];
    }
    l = ipvt[k-1];
    if (l != k) {
      ax   = &a[k3 + 1];
      ay   = &a[4*l + 1];
      stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
      stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
      stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
      stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
    }
  }
  PetscFunctionReturn(0);
}
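
/*
   Usage sketch: invert one 4x4 block in place.  The kernel expects the block
   stored so that a[i + 4*j] holds entry (i,j); passing a zero shift makes a
   zero pivot an error rather than a perturbation.  The guard macro, helper
   name, and values below are illustrative only.
*/
#if defined(PETSC_DGEFA4_USAGE_SKETCH)
static PetscErrorCode ExampleInvert4x4Block(void)
{
  MatScalar      blk[16] = {4,1,0,0, 1,4,1,0, 0,1,4,1, 0,0,1,4};
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscKernel_A_gets_inverse_A_4(blk,0.0);CHKERRQ(ierr); /* blk now holds its inverse */
  PetscFunctionReturn(0);
}
#endif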
#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "PetscKernel_A_gets_inverse_A_4_SSE"
PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
          Order Number: 245043-001
          March 1999
          http://www.intel.com

     Inverse of a 4x4 matrix via Cramer's Rule:
       bool Invert4x4(SMLXMatrix &);
  */
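  /*
     Outline of the instruction stream below (a reading of the code, following
     Intel's application note): the initial loads and shuffles regroup the 16
     entries across registers XMM3-XMM6, the multiply/shuffle groups then
     accumulate the rows of the adjugate (cofactor) matrix, and the final
     group forms the determinant and scales every row by its reciprocal.
  */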
  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  SSE_INLINE_BEGIN_1(a)

    /* ----------------------------------------------- */
    SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
    SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)
    SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
    SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)
    SSE_COPY_PS(XMM3,XMM0)
    SSE_SHUFFLE(XMM3,XMM5,0x88)
    SSE_SHUFFLE(XMM5,XMM0,0xDD)
    SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
    SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)
    SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
    SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)
    SSE_COPY_PS(XMM4,XMM0)
    SSE_SHUFFLE(XMM4,XMM6,0x88)
    SSE_SHUFFLE(XMM6,XMM0,0xDD)
    /* ----------------------------------------------- */
    SSE_COPY_PS(XMM7,XMM4)
    SSE_MULT_PS(XMM7,XMM6)
    SSE_SHUFFLE(XMM7,XMM7,0xB1)
    SSE_COPY_PS(XMM0,XMM5)
    SSE_MULT_PS(XMM0,XMM7)
    SSE_COPY_PS(XMM2,XMM3)
    SSE_MULT_PS(XMM2,XMM7)
    SSE_SHUFFLE(XMM7,XMM7,0x4E)
    SSE_COPY_PS(XMM1,XMM5)
    SSE_MULT_PS(XMM1,XMM7)
    SSE_SUB_PS(XMM1,XMM0)
    SSE_MULT_PS(XMM7,XMM3)
    SSE_SUB_PS(XMM7,XMM2)
    SSE_SHUFFLE(XMM7,XMM7,0x4E)
    SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)
    /* ----------------------------------------------- */
    SSE_COPY_PS(XMM0,XMM5)
    SSE_MULT_PS(XMM0,XMM4)
    SSE_SHUFFLE(XMM0,XMM0,0xB1)
    SSE_COPY_PS(XMM2,XMM6)
    SSE_MULT_PS(XMM2,XMM0)
    SSE_ADD_PS(XMM2,XMM1)
    SSE_COPY_PS(XMM7,XMM3)
    SSE_MULT_PS(XMM7,XMM0)
    SSE_SHUFFLE(XMM0,XMM0,0x4E)
    SSE_COPY_PS(XMM1,XMM6)
    SSE_MULT_PS(XMM1,XMM0)
    SSE_SUB_PS(XMM2,XMM1)
    SSE_MULT_PS(XMM0,XMM3)
    SSE_SUB_PS(XMM0,XMM7)
    SSE_SHUFFLE(XMM0,XMM0,0x4E)
    SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)
    /* ----------------------------------------------- */
    SSE_COPY_PS(XMM7,XMM5)
    SSE_SHUFFLE(XMM7,XMM5,0x4E)
    SSE_MULT_PS(XMM7,XMM6)
    SSE_SHUFFLE(XMM7,XMM7,0xB1)
    SSE_SHUFFLE(XMM4,XMM4,0x4E)
    SSE_COPY_PS(XMM0,XMM4)
    SSE_MULT_PS(XMM0,XMM7)
    SSE_ADD_PS(XMM0,XMM2)
    SSE_COPY_PS(XMM2,XMM3)
    SSE_MULT_PS(XMM2,XMM7)
    SSE_SHUFFLE(XMM7,XMM7,0x4E)
    SSE_COPY_PS(XMM1,XMM4)
    SSE_MULT_PS(XMM1,XMM7)
    SSE_SUB_PS(XMM0,XMM1)
    SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)
    SSE_MULT_PS(XMM7,XMM3)
    SSE_SUB_PS(XMM7,XMM2)
    SSE_SHUFFLE(XMM7,XMM7,0x4E)
    /* ----------------------------------------------- */
    SSE_COPY_PS(XMM1,XMM3)
    SSE_MULT_PS(XMM1,XMM5)
    SSE_SHUFFLE(XMM1,XMM1,0xB1)
    SSE_COPY_PS(XMM0,XMM6)
    SSE_MULT_PS(XMM0,XMM1)
    SSE_ADD_PS(XMM0,XMM7)
    SSE_COPY_PS(XMM2,XMM4)
    SSE_MULT_PS(XMM2,XMM1)
    SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)
    SSE_SHUFFLE(XMM1,XMM1,0x4E)
    SSE_COPY_PS(XMM7,XMM6)
    SSE_MULT_PS(XMM7,XMM1)
    SSE_SUB_PS(XMM7,XMM0)
    SSE_MULT_PS(XMM1,XMM4)
    SSE_SUB_PS(XMM2,XMM1)
    SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)
    /* ----------------------------------------------- */
    SSE_COPY_PS(XMM1,XMM3)
    SSE_MULT_PS(XMM1,XMM6)
    SSE_SHUFFLE(XMM1,XMM1,0xB1)
    SSE_COPY_PS(XMM2,XMM4)
    SSE_MULT_PS(XMM2,XMM1)
    SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
    SSE_SUB_PS(XMM0,XMM2)
    SSE_COPY_PS(XMM2,XMM5)
    SSE_MULT_PS(XMM2,XMM1)
    SSE_ADD_PS(XMM2,XMM7)
    SSE_SHUFFLE(XMM1,XMM1,0x4E)
    SSE_COPY_PS(XMM7,XMM4)
    SSE_MULT_PS(XMM7,XMM1)
    SSE_ADD_PS(XMM7,XMM0)
    SSE_MULT_PS(XMM1,XMM5)
    SSE_SUB_PS(XMM2,XMM1)
    /* ----------------------------------------------- */
    SSE_MULT_PS(XMM4,XMM3)
    SSE_SHUFFLE(XMM4,XMM4,0xB1)
    SSE_COPY_PS(XMM1,XMM6)
    SSE_MULT_PS(XMM1,XMM4)
    SSE_ADD_PS(XMM1,XMM7)
    SSE_COPY_PS(XMM0,XMM5)
    SSE_MULT_PS(XMM0,XMM4)
    SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
    SSE_SUB_PS(XMM7,XMM0)
    SSE_SHUFFLE(XMM4,XMM4,0x4E)
    SSE_MULT_PS(XMM6,XMM4)
    SSE_SUB_PS(XMM1,XMM6)
    SSE_MULT_PS(XMM5,XMM4)
    SSE_ADD_PS(XMM5,XMM7)
    /* ----------------------------------------------- */
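    /*
       Final stage: multiply one row of cofactors by the corresponding matrix
       entries and sum across the register to obtain the determinant, then
       approximate 1/det with a reciprocal estimate refined by one
       Newton-Raphson step (x1 = 2*x0 - det*x0*x0), broadcast it, and scale
       all four rows of the adjugate.
    */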
    SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
    SSE_MULT_PS(XMM3,XMM0)
    SSE_COPY_PS(XMM4,XMM3)
    SSE_SHUFFLE(XMM4,XMM3,0x4E)
    SSE_ADD_PS(XMM4,XMM3)
    SSE_COPY_PS(XMM6,XMM4)
    SSE_SHUFFLE(XMM6,XMM4,0xB1)
    SSE_ADD_SS(XMM6,XMM4)
    SSE_COPY_PS(XMM3,XMM6)
    SSE_RECIP_SS(XMM3,XMM6)
    SSE_COPY_SS(XMM4,XMM3)
    SSE_ADD_SS(XMM4,XMM3)
    SSE_MULT_SS(XMM3,XMM3)
    SSE_MULT_SS(XMM6,XMM3)
    SSE_SUB_SS(XMM4,XMM6)
    SSE_SHUFFLE(XMM4,XMM4,0x00)
    SSE_MULT_PS(XMM0,XMM4)
    SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
    SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)
    SSE_MULT_PS(XMM1,XMM4)
    SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
    SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)
    SSE_MULT_PS(XMM2,XMM4)
    SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
    SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)
    SSE_MULT_PS(XMM4,XMM5)
    SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
    SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
    /* ----------------------------------------------- */
  SSE_INLINE_END_1;
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
#endif