/xblas-1.0.248/src/symv2/BLAS_dsymv2_s_s_x.c

# · C · 588 lines · 386 code · 103 blank · 99 comment · 67 complexity · fd12b155137e4f609f9b31c0b7d5daa0 MD5 · raw file

  1. #include <blas_extended.h>
  2. #include <blas_extended_private.h>
  3. #include <blas_fpu.h>
  4. void BLAS_dsymv2_s_s_x(enum blas_order_type order, enum blas_uplo_type uplo,
  5. int n, double alpha, const float *a, int lda,
  6. const float *x_head, const float *x_tail, int incx,
  7. double beta, double *y, int incy,
  8. enum blas_prec_type prec)
  9. /*
  10. * Purpose
  11. * =======
  12. *
  13. * This routines computes the matrix product:
  14. *
  15. * y <- alpha * A * (x_head + x_tail) + beta * y
  16. *
  17. * where A is a symmetric matrix.
  18. *
  19. * Arguments
  20. * =========
  21. *
  22. * order (input) enum blas_order_type
  23. * Storage format of input symmetric matrix A.
  24. *
  25. * uplo (input) enum blas_uplo_type
  26. * Determines which half of matrix A (upper or lower triangle)
  27. * is accessed.
  28. *
  29. * n (input) int
  30. * Dimension of A and size of vectors x, y.
  31. *
  32. * alpha (input) double
  33. *
  34. * a (input) float*
  35. * Matrix A.
  36. *
  37. * lda (input) int
  38. * Leading dimension of matrix A.
  39. *
  40. * x_head (input) float*
  41. * Vector x_head
  42. *
  43. * x_tail (input) float*
  44. * Vector x_tail
  45. *
  46. * incx (input) int
  47. * Stride for vector x.
  48. *
  49. * beta (input) double
  50. *
  51. * y (input) float*
  52. * Vector y.
  53. *
  54. * incy (input) int
  55. * Stride for vector y.
  56. *
  57. * prec (input) enum blas_prec_type
  58. * Specifies the internal precision to be used.
  59. * = blas_prec_single: single precision.
  60. * = blas_prec_double: double precision.
  61. * = blas_prec_extra : anything at least 1.5 times as accurate
  62. * than double, and wider than 80-bits.
  63. * We use double-double in our implementation.
  64. *
  65. */
  66. {
  67. /* Routine name */
  68. const char routine_name[] = "BLAS_dsymv2_s_s_x";
  69. switch (prec) {
  70. case blas_prec_single:{
  71. int i, j;
  72. int xi, yi, xi0, yi0;
  73. int aij, ai;
  74. int incai;
  75. int incaij, incaij2;
  76. const float *a_i = a;
  77. const float *x_head_i = x_head;
  78. const float *x_tail_i = x_tail;
  79. double *y_i = y;
  80. double alpha_i = alpha;
  81. double beta_i = beta;
  82. float a_elem;
  83. float x_elem;
  84. double y_elem;
  85. double prod1;
  86. double prod2;
  87. double sum1;
  88. double sum2;
  89. double tmp1;
  90. double tmp2;
  91. double tmp3;
  92. /* Test for no-op */
  93. if (n <= 0) {
  94. return;
  95. }
  96. if (alpha_i == 0.0 && beta_i == 1.0) {
  97. return;
  98. }
  99. /* Check for error conditions. */
  100. if (n < 0) {
  101. BLAS_error(routine_name, -3, n, NULL);
  102. }
  103. if (lda < n) {
  104. BLAS_error(routine_name, -6, n, NULL);
  105. }
  106. if (incx == 0) {
  107. BLAS_error(routine_name, -9, incx, NULL);
  108. }
  109. if (incy == 0) {
  110. BLAS_error(routine_name, -12, incy, NULL);
  111. }
  112. if ((order == blas_colmajor && uplo == blas_upper) ||
  113. (order == blas_rowmajor && uplo == blas_lower)) {
  114. incai = lda;
  115. incaij = 1;
  116. incaij2 = lda;
  117. } else {
  118. incai = 1;
  119. incaij = lda;
  120. incaij2 = 1;
  121. }
  122. xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
  123. yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
  124. /* The most general form, y <--- alpha * A * (x_head + x_tail) + beta * y */
  125. for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
  126. sum1 = 0.0;
  127. sum2 = 0.0;
  128. for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
  129. a_elem = a_i[aij];
  130. x_elem = x_head_i[xi];
  131. prod1 = (double) a_elem *x_elem;
  132. sum1 = sum1 + prod1;
  133. x_elem = x_tail_i[xi];
  134. prod2 = (double) a_elem *x_elem;
  135. sum2 = sum2 + prod2;
  136. }
  137. for (; j < n; j++, aij += incaij2, xi += incx) {
  138. a_elem = a_i[aij];
  139. x_elem = x_head_i[xi];
  140. prod1 = (double) a_elem *x_elem;
  141. sum1 = sum1 + prod1;
  142. x_elem = x_tail_i[xi];
  143. prod2 = (double) a_elem *x_elem;
  144. sum2 = sum2 + prod2;
  145. }
  146. sum1 = sum1 + sum2;
  147. tmp1 = sum1 * alpha_i;
  148. y_elem = y_i[yi];
  149. tmp2 = y_elem * beta_i;
  150. tmp3 = tmp1 + tmp2;
  151. y_i[yi] = tmp3;
  152. }
  153. break;
  154. }
  155. case blas_prec_double:
  156. case blas_prec_indigenous:{
  157. int i, j;
  158. int xi, yi, xi0, yi0;
  159. int aij, ai;
  160. int incai;
  161. int incaij, incaij2;
  162. const float *a_i = a;
  163. const float *x_head_i = x_head;
  164. const float *x_tail_i = x_tail;
  165. double *y_i = y;
  166. double alpha_i = alpha;
  167. double beta_i = beta;
  168. float a_elem;
  169. float x_elem;
  170. double y_elem;
  171. double prod1;
  172. double prod2;
  173. double sum1;
  174. double sum2;
  175. double tmp1;
  176. double tmp2;
  177. double tmp3;
  178. /* Test for no-op */
  179. if (n <= 0) {
  180. return;
  181. }
  182. if (alpha_i == 0.0 && beta_i == 1.0) {
  183. return;
  184. }
  185. /* Check for error conditions. */
  186. if (n < 0) {
  187. BLAS_error(routine_name, -3, n, NULL);
  188. }
  189. if (lda < n) {
  190. BLAS_error(routine_name, -6, n, NULL);
  191. }
  192. if (incx == 0) {
  193. BLAS_error(routine_name, -9, incx, NULL);
  194. }
  195. if (incy == 0) {
  196. BLAS_error(routine_name, -12, incy, NULL);
  197. }
  198. if ((order == blas_colmajor && uplo == blas_upper) ||
  199. (order == blas_rowmajor && uplo == blas_lower)) {
  200. incai = lda;
  201. incaij = 1;
  202. incaij2 = lda;
  203. } else {
  204. incai = 1;
  205. incaij = lda;
  206. incaij2 = 1;
  207. }
  208. xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
  209. yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
  210. /* The most general form, y <--- alpha * A * (x_head + x_tail) + beta * y */
  211. for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
  212. sum1 = 0.0;
  213. sum2 = 0.0;
  214. for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
  215. a_elem = a_i[aij];
  216. x_elem = x_head_i[xi];
  217. prod1 = (double) a_elem *x_elem;
  218. sum1 = sum1 + prod1;
  219. x_elem = x_tail_i[xi];
  220. prod2 = (double) a_elem *x_elem;
  221. sum2 = sum2 + prod2;
  222. }
  223. for (; j < n; j++, aij += incaij2, xi += incx) {
  224. a_elem = a_i[aij];
  225. x_elem = x_head_i[xi];
  226. prod1 = (double) a_elem *x_elem;
  227. sum1 = sum1 + prod1;
  228. x_elem = x_tail_i[xi];
  229. prod2 = (double) a_elem *x_elem;
  230. sum2 = sum2 + prod2;
  231. }
  232. sum1 = sum1 + sum2;
  233. tmp1 = sum1 * alpha_i;
  234. y_elem = y_i[yi];
  235. tmp2 = y_elem * beta_i;
  236. tmp3 = tmp1 + tmp2;
  237. y_i[yi] = tmp3;
  238. }
  239. break;
  240. }
  241. case blas_prec_extra:{
  242. int i, j;
  243. int xi, yi, xi0, yi0;
  244. int aij, ai;
  245. int incai;
  246. int incaij, incaij2;
  247. const float *a_i = a;
  248. const float *x_head_i = x_head;
  249. const float *x_tail_i = x_tail;
  250. double *y_i = y;
  251. double alpha_i = alpha;
  252. double beta_i = beta;
  253. float a_elem;
  254. float x_elem;
  255. double y_elem;
  256. double head_prod1, tail_prod1;
  257. double head_prod2, tail_prod2;
  258. double head_sum1, tail_sum1;
  259. double head_sum2, tail_sum2;
  260. double head_tmp1, tail_tmp1;
  261. double head_tmp2, tail_tmp2;
  262. double head_tmp3, tail_tmp3;
  263. FPU_FIX_DECL;
  264. /* Test for no-op */
  265. if (n <= 0) {
  266. return;
  267. }
  268. if (alpha_i == 0.0 && beta_i == 1.0) {
  269. return;
  270. }
  271. /* Check for error conditions. */
  272. if (n < 0) {
  273. BLAS_error(routine_name, -3, n, NULL);
  274. }
  275. if (lda < n) {
  276. BLAS_error(routine_name, -6, n, NULL);
  277. }
  278. if (incx == 0) {
  279. BLAS_error(routine_name, -9, incx, NULL);
  280. }
  281. if (incy == 0) {
  282. BLAS_error(routine_name, -12, incy, NULL);
  283. }
  284. if ((order == blas_colmajor && uplo == blas_upper) ||
  285. (order == blas_rowmajor && uplo == blas_lower)) {
  286. incai = lda;
  287. incaij = 1;
  288. incaij2 = lda;
  289. } else {
  290. incai = 1;
  291. incaij = lda;
  292. incaij2 = 1;
  293. }
  294. xi0 = (incx > 0) ? 0 : ((-n + 1) * incx);
  295. yi0 = (incy > 0) ? 0 : ((-n + 1) * incy);
  296. FPU_FIX_START;
  297. /* The most general form, y <--- alpha * A * (x_head + x_tail) + beta * y */
  298. for (i = 0, yi = yi0, ai = 0; i < n; i++, yi += incy, ai += incai) {
  299. head_sum1 = tail_sum1 = 0.0;
  300. head_sum2 = tail_sum2 = 0.0;
  301. for (j = 0, aij = ai, xi = xi0; j < i; j++, aij += incaij, xi += incx) {
  302. a_elem = a_i[aij];
  303. x_elem = x_head_i[xi];
  304. head_prod1 = (double) a_elem *x_elem;
  305. tail_prod1 = 0.0;
  306. {
  307. /* Compute double-double = double-double + double-double. */
  308. double bv;
  309. double s1, s2, t1, t2;
  310. /* Add two hi words. */
  311. s1 = head_sum1 + head_prod1;
  312. bv = s1 - head_sum1;
  313. s2 = ((head_prod1 - bv) + (head_sum1 - (s1 - bv)));
  314. /* Add two lo words. */
  315. t1 = tail_sum1 + tail_prod1;
  316. bv = t1 - tail_sum1;
  317. t2 = ((tail_prod1 - bv) + (tail_sum1 - (t1 - bv)));
  318. s2 += t1;
  319. /* Renormalize (s1, s2) to (t1, s2) */
  320. t1 = s1 + s2;
  321. s2 = s2 - (t1 - s1);
  322. t2 += s2;
  323. /* Renormalize (t1, t2) */
  324. head_sum1 = t1 + t2;
  325. tail_sum1 = t2 - (head_sum1 - t1);
  326. }
  327. x_elem = x_tail_i[xi];
  328. head_prod2 = (double) a_elem *x_elem;
  329. tail_prod2 = 0.0;
  330. {
  331. /* Compute double-double = double-double + double-double. */
  332. double bv;
  333. double s1, s2, t1, t2;
  334. /* Add two hi words. */
  335. s1 = head_sum2 + head_prod2;
  336. bv = s1 - head_sum2;
  337. s2 = ((head_prod2 - bv) + (head_sum2 - (s1 - bv)));
  338. /* Add two lo words. */
  339. t1 = tail_sum2 + tail_prod2;
  340. bv = t1 - tail_sum2;
  341. t2 = ((tail_prod2 - bv) + (tail_sum2 - (t1 - bv)));
  342. s2 += t1;
  343. /* Renormalize (s1, s2) to (t1, s2) */
  344. t1 = s1 + s2;
  345. s2 = s2 - (t1 - s1);
  346. t2 += s2;
  347. /* Renormalize (t1, t2) */
  348. head_sum2 = t1 + t2;
  349. tail_sum2 = t2 - (head_sum2 - t1);
  350. }
  351. }
  352. for (; j < n; j++, aij += incaij2, xi += incx) {
  353. a_elem = a_i[aij];
  354. x_elem = x_head_i[xi];
  355. head_prod1 = (double) a_elem *x_elem;
  356. tail_prod1 = 0.0;
  357. {
  358. /* Compute double-double = double-double + double-double. */
  359. double bv;
  360. double s1, s2, t1, t2;
  361. /* Add two hi words. */
  362. s1 = head_sum1 + head_prod1;
  363. bv = s1 - head_sum1;
  364. s2 = ((head_prod1 - bv) + (head_sum1 - (s1 - bv)));
  365. /* Add two lo words. */
  366. t1 = tail_sum1 + tail_prod1;
  367. bv = t1 - tail_sum1;
  368. t2 = ((tail_prod1 - bv) + (tail_sum1 - (t1 - bv)));
  369. s2 += t1;
  370. /* Renormalize (s1, s2) to (t1, s2) */
  371. t1 = s1 + s2;
  372. s2 = s2 - (t1 - s1);
  373. t2 += s2;
  374. /* Renormalize (t1, t2) */
  375. head_sum1 = t1 + t2;
  376. tail_sum1 = t2 - (head_sum1 - t1);
  377. }
  378. x_elem = x_tail_i[xi];
  379. head_prod2 = (double) a_elem *x_elem;
  380. tail_prod2 = 0.0;
  381. {
  382. /* Compute double-double = double-double + double-double. */
  383. double bv;
  384. double s1, s2, t1, t2;
  385. /* Add two hi words. */
  386. s1 = head_sum2 + head_prod2;
  387. bv = s1 - head_sum2;
  388. s2 = ((head_prod2 - bv) + (head_sum2 - (s1 - bv)));
  389. /* Add two lo words. */
  390. t1 = tail_sum2 + tail_prod2;
  391. bv = t1 - tail_sum2;
  392. t2 = ((tail_prod2 - bv) + (tail_sum2 - (t1 - bv)));
  393. s2 += t1;
  394. /* Renormalize (s1, s2) to (t1, s2) */
  395. t1 = s1 + s2;
  396. s2 = s2 - (t1 - s1);
  397. t2 += s2;
  398. /* Renormalize (t1, t2) */
  399. head_sum2 = t1 + t2;
  400. tail_sum2 = t2 - (head_sum2 - t1);
  401. }
  402. }
  403. {
  404. /* Compute double-double = double-double + double-double. */
  405. double bv;
  406. double s1, s2, t1, t2;
  407. /* Add two hi words. */
  408. s1 = head_sum1 + head_sum2;
  409. bv = s1 - head_sum1;
  410. s2 = ((head_sum2 - bv) + (head_sum1 - (s1 - bv)));
  411. /* Add two lo words. */
  412. t1 = tail_sum1 + tail_sum2;
  413. bv = t1 - tail_sum1;
  414. t2 = ((tail_sum2 - bv) + (tail_sum1 - (t1 - bv)));
  415. s2 += t1;
  416. /* Renormalize (s1, s2) to (t1, s2) */
  417. t1 = s1 + s2;
  418. s2 = s2 - (t1 - s1);
  419. t2 += s2;
  420. /* Renormalize (t1, t2) */
  421. head_sum1 = t1 + t2;
  422. tail_sum1 = t2 - (head_sum1 - t1);
  423. }
  424. {
  425. /* Compute double-double = double-double * double. */
  426. double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;
  427. con = head_sum1 * split;
  428. a11 = con - head_sum1;
  429. a11 = con - a11;
  430. a21 = head_sum1 - a11;
  431. con = alpha_i * split;
  432. b1 = con - alpha_i;
  433. b1 = con - b1;
  434. b2 = alpha_i - b1;
  435. c11 = head_sum1 * alpha_i;
  436. c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;
  437. c2 = tail_sum1 * alpha_i;
  438. t1 = c11 + c2;
  439. t2 = (c2 - (t1 - c11)) + c21;
  440. head_tmp1 = t1 + t2;
  441. tail_tmp1 = t2 - (head_tmp1 - t1);
  442. }
  443. y_elem = y_i[yi];
  444. {
  445. /* Compute double_double = double * double. */
  446. double a1, a2, b1, b2, con;
  447. con = y_elem * split;
  448. a1 = con - y_elem;
  449. a1 = con - a1;
  450. a2 = y_elem - a1;
  451. con = beta_i * split;
  452. b1 = con - beta_i;
  453. b1 = con - b1;
  454. b2 = beta_i - b1;
  455. head_tmp2 = y_elem * beta_i;
  456. tail_tmp2 = (((a1 * b1 - head_tmp2) + a1 * b2) + a2 * b1) + a2 * b2;
  457. }
  458. {
  459. /* Compute double-double = double-double + double-double. */
  460. double bv;
  461. double s1, s2, t1, t2;
  462. /* Add two hi words. */
  463. s1 = head_tmp1 + head_tmp2;
  464. bv = s1 - head_tmp1;
  465. s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));
  466. /* Add two lo words. */
  467. t1 = tail_tmp1 + tail_tmp2;
  468. bv = t1 - tail_tmp1;
  469. t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));
  470. s2 += t1;
  471. /* Renormalize (s1, s2) to (t1, s2) */
  472. t1 = s1 + s2;
  473. s2 = s2 - (t1 - s1);
  474. t2 += s2;
  475. /* Renormalize (t1, t2) */
  476. head_tmp3 = t1 + t2;
  477. tail_tmp3 = t2 - (head_tmp3 - t1);
  478. }
  479. y_i[yi] = head_tmp3;
  480. }
  481. FPU_FIX_STOP;
  482. break;
  483. }
  484. }
  485. } /* end BLAS_dsymv2_s_s_x */