PageRenderTime 37ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/ATLAS/tune/blas/gemv/CASES/ATL_gemvT_4x16_1.c

#
C | 426 lines | 381 code | 16 blank | 29 comment | 36 complexity | a71abd384b6e03d2500d83aa2e416985 MD5 | raw file
  1. /*
  2. * Automatically Tuned Linear Algebra Software v3.8.4
  3. * (C) Copyright 1999 R. Clint Whaley
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions, and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. * 3. The name of the ATLAS group or the names of its contributers may
  14. * not be used to endorse or promote products derived from this
  15. * software without specific written permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18. * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  19. * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  20. * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
  21. * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  22. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  23. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  24. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  25. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  26. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  27. * POSSIBILITY OF SUCH DAMAGE.
  28. *
  29. */
  30. #include "atlas_misc.h"
  31. #include "atlas_level1.h"
  32. #include "atlas_level2.h"
  33. #include "atlas_prefetch.h"
  34. static void gemvT_Nsmall(const int M, const int N, const TYPE *A, const int lda,
  35. const TYPE *X, const SCALAR beta, TYPE *Y)
  36. {
  37. int i;
  38. register TYPE x0, x1, x2, x3, x4, x5, x6, x7, y0;
  39. TYPE *stY = Y + M;
  40. switch(N)
  41. {
  42. case 1:
  43. #ifdef BETA0
  44. Mjoin(PATL,cpsc)(M, *X, A, lda, Y, 1);
  45. #elif defined(BETAX)
  46. Mjoin(PATL,axpby)(M, *X, A, lda, beta, Y, 1);
  47. #else
  48. Mjoin(PATL,axpy)(M, *X, A, lda, Y, 1);
  49. #endif
  50. break;
  51. case 2:
  52. x0 = *X;
  53. x1 = X[1];
  54. do
  55. {
  56. #ifdef BETA0
  57. *Y = x0 * *A + x1 * A[1];
  58. #elif defined(BETAX)
  59. y0 = *Y;
  60. *Y = y0 * beta + x0 * *A + x1 * A[1];
  61. #else
  62. *Y += x0 * *A + x1 * A[1];
  63. #endif
  64. A += lda;
  65. }
  66. while (++Y != stY);
  67. break;
  68. case 3:
  69. x0 = *X;
  70. x1 = X[1];
  71. x2 = X[2];
  72. do
  73. {
  74. #ifdef BETA0
  75. *Y = x0 * *A + x1 * A[1] + x2 * A[2];
  76. #elif defined(BETAX)
  77. y0 = *Y;
  78. *Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2];
  79. #else
  80. *Y += x0 * *A + x1 * A[1] + x2 * A[2];
  81. #endif
  82. A += lda;
  83. }
  84. while (++Y != stY);
  85. break;
  86. case 4:
  87. x0 = *X;
  88. x1 = X[1];
  89. x2 = X[2];
  90. x3 = X[3];
  91. do
  92. {
  93. #ifdef BETA0
  94. *Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3];
  95. #elif defined(BETAX)
  96. y0 = *Y;
  97. *Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3];
  98. #else
  99. *Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3];
  100. #endif
  101. A += lda;
  102. }
  103. while (++Y != stY);
  104. break;
  105. case 5:
  106. x0 = *X;
  107. x1 = X[1];
  108. x2 = X[2];
  109. x3 = X[3];
  110. x4 = X[4];
  111. do
  112. {
  113. #ifdef BETA0
  114. *Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  115. + x4 * A[4];
  116. #elif defined(BETAX)
  117. y0 = *Y;
  118. *Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  119. + x4 * A[4];
  120. #else
  121. *Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3] + x4 * A[4];
  122. #endif
  123. A += lda;
  124. }
  125. while (++Y != stY);
  126. break;
  127. case 6:
  128. x0 = *X;
  129. x1 = X[1];
  130. x2 = X[2];
  131. x3 = X[3];
  132. x4 = X[4];
  133. x5 = X[5];
  134. do
  135. {
  136. #ifdef BETA0
  137. *Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  138. + x4 * A[4] + x5 * A[5];
  139. #elif defined(BETAX)
  140. y0 = *Y;
  141. *Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  142. + x4 * A[4] + x5 * A[5];
  143. #else
  144. *Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  145. + x4 * A[4] + x5 * A[5];
  146. #endif
  147. A += lda;
  148. }
  149. while (++Y != stY);
  150. break;
  151. case 7:
  152. x0 = *X;
  153. x1 = X[1];
  154. x2 = X[2];
  155. x3 = X[3];
  156. x4 = X[4];
  157. x5 = X[5];
  158. x6 = X[6];
  159. do
  160. {
  161. #ifdef BETA0
  162. *Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  163. + x4 * A[4] + x5 * A[5] + x6 * A[6];
  164. #elif defined(BETAX)
  165. y0 = *Y;
  166. *Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  167. + x4 * A[4] + x5 * A[5] + x6 * A[6];
  168. #else
  169. *Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  170. + x4 * A[4] + x5 * A[5] + x6 * A[6];
  171. #endif
  172. A += lda;
  173. }
  174. while (++Y != stY);
  175. break;
  176. case 8:
  177. x0 = *X;
  178. x1 = X[1];
  179. x2 = X[2];
  180. x3 = X[3];
  181. x4 = X[4];
  182. x5 = X[5];
  183. x6 = X[6];
  184. x7 = X[7];
  185. do
  186. {
  187. #ifdef BETA0
  188. *Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  189. + x4 * A[4] + x5 * A[5] + x6 * A[6] + x7 * A[7];
  190. #elif defined(BETAX)
  191. y0 = *Y;
  192. *Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  193. + x4 * A[4] + x5 * A[5] + x6 * A[6] + x7 * A[7];
  194. #else
  195. *Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
  196. + x4 * A[4] + x5 * A[5] + x6 * A[6] + x7 * A[7];
  197. #endif
  198. A += lda;
  199. }
  200. while (++Y != stY);
  201. break;
  202. default:
  203. if (M)
  204. {
  205. do
  206. {
  207. #ifdef BETA0
  208. y0 = ATL_rzero;
  209. #elif defined(BETAX)
  210. y0 = *Y * beta;
  211. #else
  212. y0 = *Y;
  213. #endif
  214. for (i=0; i != N; i++) y0 += A[i] * X[i];
  215. *Y++ = y0;
  216. A += lda;
  217. }
  218. while (Y != stY);
  219. }
  220. }
  221. }
  222. static void gemvT_Msmall(const int M, const int N, const TYPE *A, const int lda,
  223. const TYPE *X, const SCALAR beta, TYPE *Y)
  224. {
  225. const TYPE *stY = Y + M;
  226. #ifndef BETA0
  227. register TYPE y0;
  228. #endif
  229. do
  230. {
  231. #ifdef BETA0
  232. *Y = Mjoin(PATL,dot)(N, X, 1, A, 1);
  233. #else
  234. y0 = *Y;
  235. #ifdef BETAX
  236. y0 *= beta;
  237. #endif
  238. y0 += Mjoin(PATL,dot)(N, X, 1, A, 1);
  239. *Y = y0;
  240. #endif
  241. Y++;
  242. A += lda;
  243. }
  244. while (Y != stY);
  245. }
  246. static void gemvT4x16(const int M, const int N, const TYPE *A, const int lda,
  247. const TYPE *X, const SCALAR beta, TYPE *Y)
  248. {
  249. int j;
  250. const int M4 = (M>>2)<<2, N16 = (N>>4)<<4, incAm = (lda<<2) - N;
  251. const int nr = ( N16 ? N-N16+16 : N );
  252. const TYPE *stX = X + N16-16, *x;
  253. const TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda;
  254. TYPE *stY = Y + M4;
  255. register TYPE a00, a01, a02, a03, a10, a11, a12, a13;
  256. register TYPE b00, b01, b02, b03, b10, b11, b12, b13;
  257. register TYPE y0, y1, y2, y3, z0, z1, z2, z3;
  258. register TYPE x0, x1, x2, x3;
  259. #ifdef ATL_AltiVec
  260. int cwrd = ATL_MulBySize(N)>>4;
  261. if (cwrd >= 64) cwrd = ATL_GetCtrl(512, (cwrd+31)>>5, 0);
  262. else cwrd = ATL_GetCtrl(64, (cwrd+3)>>2, 4);
  263. #endif
  264. if (N >= 10)
  265. {
  266. if (M4)
  267. {
  268. do
  269. {
  270. #ifdef ATL_AltiVec
  271. ATL_pfavR(A0, cwrd, 0);
  272. ATL_pfavR(A1, cwrd, 1);
  273. ATL_pfavR(A2, cwrd, 2);
  274. ATL_pfavR(A3, cwrd, 3);
  275. #endif
  276. a00 = *A0; a01 = *A1; a02 = *A2; a03 = *A3;
  277. a10 = A0[1]; a11 = A1[1]; a12 = A2[1]; a13 = A3[1];
  278. b00 = A0[8]; b01 = A1[8]; b02 = A2[8]; b03 = A3[8];
  279. b10 = A0[9]; b11 = A1[9]; b12 = A2[9]; b13 = A3[9];
  280. #ifdef BETA0
  281. y0 = y1 = y2 = y3 = z0 = z1 = z2 = z3 = ATL_rzero;
  282. #elif defined(BETAX)
  283. y0 = beta;
  284. z0 = *Y; z1 = Y[1]; z2 = Y[2]; z3 = Y[3];
  285. z0 *= y0; z1 *= y0; z2 *= y0; z3 *= y0;
  286. y0 = y1 = y2 = y3 = ATL_rzero;
  287. #else
  288. z0 = *Y; z1 = Y[1]; z2 = Y[2]; z3 = Y[3];
  289. y0 = y1 = y2 = y3 = ATL_rzero;
  290. #endif
  291. x0 = *X; x1 = X[1]; x2 = X[8]; x3 = X[9]; x = X;
  292. if (N16 > 16)
  293. {
  294. do
  295. {
  296. y0 += a00 * x0; a00 = A0[2];
  297. y1 += a01 * x0; a01 = A1[2];
  298. y2 += a02 * x0; a02 = A2[2];
  299. y3 += a03 * x0; a03 = A3[2]; x0 = x[2];
  300. z0 += b00 * x2; b00 = A0[10];
  301. z1 += b01 * x2; b01 = A1[10];
  302. z2 += b02 * x2; b02 = A2[10];
  303. z3 += b03 * x2; b03 = A3[10]; x2 = x[10];
  304. y0 += a10 * x1; a10 = A0[3];
  305. y1 += a11 * x1; a11 = A1[3];
  306. y2 += a12 * x1; a12 = A2[3];
  307. y3 += a13 * x1; a13 = A3[3]; x1 = x[3];
  308. z0 += b10 * x3; b10 = A0[11];
  309. z1 += b11 * x3; b11 = A1[11];
  310. z2 += b12 * x3; b12 = A2[11];
  311. z3 += b13 * x3; b13 = A3[11]; x3 = x[11];
  312. y0 += a00 * x0; a00 = A0[4];
  313. y1 += a01 * x0; a01 = A1[4];
  314. y2 += a02 * x0; a02 = A2[4];
  315. y3 += a03 * x0; a03 = A3[4]; x0 = x[4];
  316. z0 += b00 * x2; b00 = A0[12];
  317. z1 += b01 * x2; b01 = A1[12];
  318. z2 += b02 * x2; b02 = A2[12];
  319. z3 += b03 * x2; b03 = A3[12]; x2 = x[12];
  320. y0 += a10 * x1; a10 = A0[5];
  321. y1 += a11 * x1; a11 = A1[5];
  322. y2 += a12 * x1; a12 = A2[5];
  323. y3 += a13 * x1; a13 = A3[5]; x1 = x[5];
  324. z0 += b10 * x3; b10 = A0[13];
  325. z1 += b11 * x3; b11 = A1[13];
  326. z2 += b12 * x3; b12 = A2[13];
  327. z3 += b13 * x3; b13 = A3[13]; x3 = x[13];
  328. y0 += a00 * x0; a00 = A0[6];
  329. y1 += a01 * x0; a01 = A1[6];
  330. y2 += a02 * x0; a02 = A2[6];
  331. y3 += a03 * x0; a03 = A3[6]; x0 = x[6];
  332. z0 += b00 * x2; b00 = A0[14];
  333. z1 += b01 * x2; b01 = A1[14];
  334. z2 += b02 * x2; b02 = A2[14];
  335. z3 += b03 * x2; b03 = A3[14]; x2 = x[14];
  336. y0 += a10 * x1; a10 = A0[7];
  337. y1 += a11 * x1; a11 = A1[7];
  338. y2 += a12 * x1; a12 = A2[7];
  339. y3 += a13 * x1; a13 = A3[7]; x1 = x[7];
  340. z0 += b10 * x3; b10 = A0[15]; A0 += 16;
  341. z1 += b11 * x3; b11 = A1[15]; A1 += 16;
  342. z2 += b12 * x3; b12 = A2[15]; A2 += 16;
  343. z3 += b13 * x3; b13 = A3[15]; x3 = x[15]; x += 16; A3 += 16;
  344. y0 += a00 * x0; a00 = *A0;
  345. y1 += a01 * x0; a01 = *A1;
  346. y2 += a02 * x0; a02 = *A2;
  347. y3 += a03 * x0; a03 = *A3; x0 = *x;
  348. z0 += b00 * x2; b00 = A0[8];
  349. z1 += b01 * x2; b01 = A1[8];
  350. z2 += b02 * x2; b02 = A2[8];
  351. z3 += b03 * x2; b03 = A3[8]; x2 = x[8];
  352. y0 += a10 * x1; a10 = A0[1];
  353. y1 += a11 * x1; a11 = A1[1];
  354. y2 += a12 * x1; a12 = A2[1];
  355. y3 += a13 * x1; a13 = A3[1]; x1 = x[1];
  356. z0 += b10 * x3; b10 = A0[9];
  357. z1 += b11 * x3; b11 = A1[9];
  358. z2 += b12 * x3; b12 = A2[9];
  359. z3 += b13 * x3; b13 = A3[9]; x3 = x[9];
  360. }
  361. while (x != stX);
  362. }
  363. for (j=(nr>>1); j; j--, A0 += 2, A1 += 2, A2 += 2, A3 += 2, x += 2)
  364. {
  365. x0 = *x; x1 = x[1];
  366. y0 += *A0 * x0;
  367. y1 += *A1 * x0;
  368. y2 += *A2 * x0;
  369. y3 += *A3 * x0;
  370. z0 += A0[1] * x1;
  371. z1 += A1[1] * x1;
  372. z2 += A2[1] * x1;
  373. z3 += A3[1] * x1;
  374. }
  375. if (nr != (nr>>1)<<1)
  376. {
  377. x0 = *x;
  378. z0 += *A0++ * x0;
  379. z1 += *A1++ * x0;
  380. z2 += *A2++ * x0;
  381. z3 += *A3++ * x0;
  382. }
  383. A0 += incAm;
  384. y0 += z0;
  385. A1 += incAm;
  386. y1 += z1;
  387. A2 += incAm;
  388. y2 += z2;
  389. A3 += incAm;
  390. y3 += z3;
  391. *Y = y0;
  392. Y[1] = y1;
  393. Y[2] = y2;
  394. Y[3] = y3;
  395. Y += 4;
  396. }
  397. while (Y != stY);
  398. }
  399. if (M-M4) gemvT_Msmall(M-M4, N, A0, lda, X, beta, Y);
  400. }
  401. else if (M) gemvT_Nsmall(M, N, A, lda, X, beta, Y);
  402. }
  403. #define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvT,NM),_x1),BNM),_y1)
  404. void Mjoin(PATL,gemv0)
  405. (const int M, const int N, const SCALAR alpha,
  406. const TYPE *A, const int lda, const TYPE *X, const int incX,
  407. const SCALAR beta, TYPE *Y, const int incY)
  408. {
  409. gemvT4x16(M, N, A, lda, X, beta, Y);
  410. }