/driver/level2/trmv_thread.c

https://github.com/nega0/gbp · C · 440 lines · 326 code · 77 blank · 37 comment · 33 complexity · 59253141a196405bce0f6481c4de3f04 MD5 · raw file

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include <stdlib.h>
  40. #include "common.h"
  41. #include "symcopy.h"
  42. #ifndef COMPLEX
  43. #ifndef TRANSA
  44. #define MYGEMV GEMV_N
  45. #undef TRANS
  46. #else
  47. #define MYGEMV GEMV_T
  48. #define TRANS
  49. #endif
  50. #define MYDOT DOTU_K
  51. #define MYAXPY AXPYU_K
  52. #else
  53. #if TRANSA == 1
  54. #define MYGEMV GEMV_N
  55. #undef TRANS
  56. #define MYDOT DOTU_K
  57. #define MYAXPY AXPYU_K
  58. #elif TRANSA == 2
  59. #define MYGEMV GEMV_T
  60. #define TRANS
  61. #define MYDOT DOTU_K
  62. #define MYAXPY AXPYU_K
  63. #elif TRANSA == 3
  64. #define MYGEMV GEMV_R
  65. #undef TRANS
  66. #define MYDOT DOTC_K
  67. #define MYAXPY AXPYC_K
  68. #else
  69. #define MYGEMV GEMV_C
  70. #define TRANS
  71. #define MYDOT DOTC_K
  72. #define MYAXPY AXPYC_K
  73. #endif
  74. #endif
  75. static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
  76. FLOAT *a, *x, *y;
  77. BLASLONG lda, incx;
  78. BLASLONG m_from, m_to;
  79. BLASLONG i, is, min_i;
  80. #ifdef TRANS
  81. #ifndef COMPLEX
  82. FLOAT result;
  83. #else
  84. FLOAT _Complex result;
  85. #endif
  86. #endif
  87. #if defined(COMPLEX) && !defined(UNIT)
  88. FLOAT ar, ai, xr, xi;
  89. #endif
  90. a = (FLOAT *)args -> a;
  91. x = (FLOAT *)args -> b;
  92. y = (FLOAT *)args -> c;
  93. lda = args -> lda;
  94. incx = args -> ldb;
  95. m_from = 0;
  96. m_to = args -> m;
  97. if (range_m) {
  98. m_from = *(range_m + 0);
  99. m_to = *(range_m + 1);
  100. }
  101. if (incx != 1) {
  102. #ifndef LOWER
  103. COPY_K(m_to, x, incx, buffer, 1);
  104. #else
  105. COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
  106. #endif
  107. x = buffer;
  108. buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
  109. }
  110. #ifndef TRANS
  111. if (range_n) y += *range_n * COMPSIZE;
  112. #ifndef LOWER
  113. SCAL_K(m_to, 0, 0, ZERO,
  114. #ifdef COMPLEX
  115. ZERO,
  116. #endif
  117. y, 1, NULL, 0, NULL, 0);
  118. #else
  119. SCAL_K(args -> m - m_from, 0, 0, ZERO,
  120. #ifdef COMPLEX
  121. ZERO,
  122. #endif
  123. y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
  124. #endif
  125. #else
  126. SCAL_K(m_to - m_from, 0, 0, ZERO,
  127. #ifdef COMPLEX
  128. ZERO,
  129. #endif
  130. y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
  131. #endif
  132. for (is = m_from; is < m_to; is += DTB_ENTRIES){
  133. min_i = MIN(m_to - is, DTB_ENTRIES);
  134. #ifndef LOWER
  135. if (is > 0){
  136. MYGEMV(is, min_i, 0,
  137. ONE,
  138. #ifdef COMPLEX
  139. ZERO,
  140. #endif
  141. a + is * lda * COMPSIZE, lda,
  142. #ifndef TRANS
  143. x + is * COMPSIZE, 1,
  144. y, 1,
  145. #else
  146. x, 1,
  147. y + is * COMPSIZE, 1,
  148. #endif
  149. buffer);
  150. }
  151. #endif
  152. for (i = is; i < is + min_i; i++) {
  153. #ifndef LOWER
  154. if (i - is > 0) {
  155. #ifndef TRANS
  156. MYAXPY(i - is, 0, 0,
  157. *(x + i * COMPSIZE + 0),
  158. #ifdef COMPLEX
  159. *(x + i * COMPSIZE + 1),
  160. #endif
  161. a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0);
  162. #else
  163. result = MYDOT(i - is, a + (is + i * lda) * COMPSIZE, 1, x + is * COMPSIZE, 1);
  164. #ifndef COMPLEX
  165. *(y + i * COMPSIZE + 0) += result;
  166. #else
  167. *(y + i * COMPSIZE + 0) += CREAL(result);
  168. *(y + i * COMPSIZE + 1) += CIMAG(result);
  169. #endif
  170. #endif
  171. }
  172. #endif
  173. #ifndef COMPLEX
  174. #ifdef UNIT
  175. *(y + i * COMPSIZE) += *(x + i * COMPSIZE);
  176. #else
  177. *(y + i * COMPSIZE) += *(a + (i + i * lda) * COMPSIZE) * *(x + i * COMPSIZE);
  178. #endif
  179. #else
  180. #ifdef UNIT
  181. *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0);
  182. *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1);
  183. #else
  184. ar = *(a + (i + i * lda) * COMPSIZE + 0);
  185. ai = *(a + (i + i * lda) * COMPSIZE + 1);
  186. xr = *(x + i * COMPSIZE + 0);
  187. xi = *(x + i * COMPSIZE + 1);
  188. #if (TRANSA == 1) || (TRANSA == 2)
  189. *(y + i * COMPSIZE + 0) += ar * xr - ai * xi;
  190. *(y + i * COMPSIZE + 1) += ar * xi + ai * xr;
  191. #else
  192. *(y + i * COMPSIZE + 0) += ar * xr + ai * xi;
  193. *(y + i * COMPSIZE + 1) += ar * xi - ai * xr;
  194. #endif
  195. #endif
  196. #endif
  197. #ifdef LOWER
  198. if (is + min_i > i + 1) {
  199. #ifndef TRANS
  200. MYAXPY(is + min_i - i - 1, 0, 0,
  201. *(x + i * COMPSIZE + 0),
  202. #ifdef COMPLEX
  203. *(x + i * COMPSIZE + 1),
  204. #endif
  205. a + (i + 1 + i * lda) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
  206. #else
  207. result = MYDOT(is + min_i - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);
  208. #ifndef COMPLEX
  209. *(y + i * COMPSIZE + 0) += result;
  210. #else
  211. *(y + i * COMPSIZE + 0) += CREAL(result);
  212. *(y + i * COMPSIZE + 1) += CIMAG(result);
  213. #endif
  214. #endif
  215. }
  216. #endif
  217. }
  218. #ifdef LOWER
  219. if (args -> m > is + min_i){
  220. MYGEMV(args -> m - is - min_i, min_i, 0,
  221. ONE,
  222. #ifdef COMPLEX
  223. ZERO,
  224. #endif
  225. a + (is + min_i + is * lda) * COMPSIZE, lda,
  226. #ifndef TRANS
  227. x + is * COMPSIZE, 1,
  228. y + (is + min_i) * COMPSIZE, 1,
  229. #else
  230. x + (is + min_i) * COMPSIZE, 1,
  231. y + is * COMPSIZE, 1,
  232. #endif
  233. buffer);
  234. }
  235. #endif
  236. }
  237. return 0;
  238. }
  239. #ifndef COMPLEX
  240. int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){
  241. #else
  242. int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){
  243. #endif
  244. blas_arg_t args;
  245. blas_queue_t queue[MAX_CPU_NUMBER];
  246. BLASLONG range_m[MAX_CPU_NUMBER + 1];
  247. BLASLONG range_n[MAX_CPU_NUMBER];
  248. BLASLONG width, i, num_cpu;
  249. double dnum;
  250. int mask = 7;
  251. #ifdef SMP
  252. #ifndef COMPLEX
  253. #ifdef XDOUBLE
  254. int mode = BLAS_XDOUBLE | BLAS_REAL;
  255. #elif defined(DOUBLE)
  256. int mode = BLAS_DOUBLE | BLAS_REAL;
  257. #else
  258. int mode = BLAS_SINGLE | BLAS_REAL;
  259. #endif
  260. #else
  261. #ifdef XDOUBLE
  262. int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
  263. #elif defined(DOUBLE)
  264. int mode = BLAS_DOUBLE | BLAS_COMPLEX;
  265. #else
  266. int mode = BLAS_SINGLE | BLAS_COMPLEX;
  267. #endif
  268. #endif
  269. #endif
  270. args.m = m;
  271. args.a = (void *)a;
  272. args.b = (void *)x;
  273. args.c = (void *)(buffer);
  274. args.lda = lda;
  275. args.ldb = incx;
  276. args.ldc = incx;
  277. dnum = (double)m * (double)m / (double)nthreads;
  278. num_cpu = 0;
  279. #ifndef LOWER
  280. range_m[MAX_CPU_NUMBER] = m;
  281. i = 0;
  282. while (i < m){
  283. if (nthreads - num_cpu > 1) {
  284. double di = (double)(m - i);
  285. if (di * di - dnum > 0) {
  286. width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
  287. } else {
  288. width = m - i;
  289. }
  290. if (width < 16) width = 16;
  291. if (width > m - i) width = m - i;
  292. } else {
  293. width = m - i;
  294. }
  295. range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
  296. range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
  297. queue[num_cpu].mode = mode;
  298. queue[num_cpu].routine = trmv_kernel;
  299. queue[num_cpu].args = &args;
  300. queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1];
  301. queue[num_cpu].range_n = &range_n[num_cpu];
  302. queue[num_cpu].sa = NULL;
  303. queue[num_cpu].sb = NULL;
  304. queue[num_cpu].next = &queue[num_cpu + 1];
  305. num_cpu ++;
  306. i += width;
  307. }
  308. #else
  309. range_m[0] = 0;
  310. i = 0;
  311. while (i < m){
  312. if (nthreads - num_cpu > 1) {
  313. double di = (double)(m - i);
  314. if (di * di - dnum > 0) {
  315. width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
  316. } else {
  317. width = m - i;
  318. }
  319. if (width < 16) width = 16;
  320. if (width > m - i) width = m - i;
  321. } else {
  322. width = m - i;
  323. }
  324. range_m[num_cpu + 1] = range_m[num_cpu] + width;
  325. range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
  326. queue[num_cpu].mode = mode;
  327. queue[num_cpu].routine = trmv_kernel;
  328. queue[num_cpu].args = &args;
  329. queue[num_cpu].range_m = &range_m[num_cpu];
  330. queue[num_cpu].range_n = &range_n[num_cpu];
  331. queue[num_cpu].sa = NULL;
  332. queue[num_cpu].sb = NULL;
  333. queue[num_cpu].next = &queue[num_cpu + 1];
  334. num_cpu ++;
  335. i += width;
  336. }
  337. #endif
  338. if (num_cpu) {
  339. queue[0].sa = NULL;
  340. queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
  341. queue[num_cpu - 1].next = NULL;
  342. exec_blas(num_cpu, queue);
  343. }
  344. #ifndef TRANS
  345. for (i = 1; i < num_cpu; i ++) {
  346. #ifndef LOWER
  347. AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE,
  348. #ifdef COMPLEX
  349. ZERO,
  350. #endif
  351. buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
  352. #else
  353. AXPYU_K(m - range_m[i], 0, 0, ONE,
  354. #ifdef COMPLEX
  355. ZERO,
  356. #endif
  357. buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0);
  358. #endif
  359. }
  360. #endif
  361. COPY_K(m, buffer, 1, x, incx);
  362. return 0;
  363. }