/xbmc/visualizations/Goom/goom2k4-0/src/xmmx.c

http://github.com/xbmc/xbmc · C · 390 lines · 244 code · 59 blank · 87 comment · 36 complexity · 0b35e33c4a9cf5dc7777a2b85c4589d4 MD5 · raw file

  1. #ifdef HAVE_MMX
  2. /* a definir pour avoir exactement le meme resultat que la fonction C
  3. * (un chouillat plus lent).. mais la difference est assez peu notable.
  4. */
  5. // #define STRICT_COMPAT
  6. #define BUFFPOINTNB 16
  7. #define BUFFPOINTMASK 0xffff
  8. #define BUFFINCR 0xff
  9. #define sqrtperte 16
  10. /* faire : a % sqrtperte <=> a & pertemask*/
  11. #define PERTEMASK 0xf
  12. /* faire : a / sqrtperte <=> a >> PERTEDEC*/
  13. #define PERTEDEC 4
  14. /*#define MMX_TRACE*/
  15. #include "mmx.h"
  16. /*#include "xmmx.h"*/
  17. #include "goom_graphic.h"
  18. int xmmx_supported (void) {
  19. return (mm_support()&0x8)>>3;
  20. }
  21. void zoom_filter_xmmx (int prevX, int prevY,
  22. Pixel *expix1, Pixel *expix2,
  23. int *lbruS, int *lbruD, int buffratio,
  24. int precalCoef[16][16])
  25. {
  26. int bufsize = prevX * prevY; /* taille du buffer */
  27. volatile int loop; /* variable de boucle */
  28. mmx_t *brutS = (mmx_t*)lbruS; /* buffer de transformation source */
  29. mmx_t *brutD = (mmx_t*)lbruD; /* buffer de transformation dest */
  30. volatile mmx_t prevXY;
  31. volatile mmx_t ratiox;
  32. /* volatile mmx_t interpix; */
  33. expix1[0].val=expix1[prevX-1].val=expix1[prevX*prevY-1].val=expix1[prevX*prevY-prevX].val=0;
  34. prevXY.ud[0] = (prevX-1)<<PERTEDEC;
  35. prevXY.ud[1] = (prevY-1)<<PERTEDEC;
  36. ratiox.d[0] = buffratio;
  37. ratiox.d[1] = buffratio;
  38. asm volatile
  39. ("\n\t movq %[ratio], %%mm6"
  40. "\n\t pslld $16, %%mm6" /* mm6 = [rat16=buffratio<<16 | rat16=buffratio<<16] */
  41. "\n\t pxor %%mm7, %%mm7" /* mm7 = 0 */
  42. ::[ratio]"m"(ratiox));
  43. loop=0;
  44. /*
  45. * NOTE : mm6 et mm7 ne sont pas modifies dans la boucle.
  46. */
  47. while (loop < bufsize)
  48. {
  49. /* Thread #1
  50. * pre : mm6 = [rat16|rat16]
  51. * post : mm0 = S + ((D-S)*rat16 format [X|Y]
  52. * modified = mm0,mm1,mm2
  53. */
  54. asm volatile ("#1 \n\t movq 0(%[brutS]), %%mm0"
  55. "#1 \n\t movq 0(%[brutD]), %%mm1"
  56. "#1 \n\t psubd %%mm0, %%mm1" /* mm1 = D - S */
  57. "#1 \n\t movq %%mm1, %%mm2" /* mm2 = D - S */
  58. "#1 \n\t pslld $16, %%mm1"
  59. "#1 \n\t pmullw %%mm6, %%mm2"
  60. "#1 \n\t pmulhuw %%mm6, %%mm1"
  61. "#1 \n\t pslld $16, %%mm0"
  62. "#1 \n\t paddd %%mm2, %%mm1" /* mm1 = (D - S) * buffratio >> 16 */
  63. "#1 \n\t paddd %%mm1, %%mm0" /* mm0 = S + mm1 */
  64. "#1 \n\t psrld $16, %%mm0"
  65. :
  66. :[brutS] "r" (&brutS[loop]) ,[brutD] "r" (&brutD[loop])
  67. ); /* mm0 = S */
  68. /*
  69. * pre : mm0 : position vector on screen
  70. * prevXY : coordinate of the lower-right point on screen
  71. * post : clipped mm0
  72. * modified : mm0,mm1,mm2
  73. */
  74. asm volatile
  75. ("#1 \n\t movq %[prevXY], %%mm1"
  76. "#1 \n\t pcmpgtd %%mm0, %%mm1"
  77. /* mm0 en X contient (idem pour Y) :
  78. * 1111 si prevXY > px
  79. * 0000 si prevXY <= px */
  80. #ifdef STRICT_COMPAT
  81. "#1 \n\t movq %%mm1, %%mm2"
  82. "#1 \n\t punpckhdq %%mm2, %%mm2"
  83. "#1 \n\t punpckldq %%mm1, %%mm1"
  84. "#1 \n\t pand %%mm2, %%mm0"
  85. #endif
  86. "#1 \n\t pand %%mm1, %%mm0" /* on met a zero la partie qui deborde */
  87. ::[prevXY]"m"(prevXY));
  88. /* Thread #2
  89. * pre : mm0 : clipped position on screen
  90. *
  91. * post : mm3 : coefs for this position
  92. * mm1 : X vector [0|X]
  93. *
  94. * modif : eax,esi
  95. */
  96. __asm__ __volatile__ (
  97. "#2 \n\t movd %%mm0,%%esi"
  98. "#2 \n\t movq %%mm0,%%mm1"
  99. "#2 \n\t andl $15,%%esi"
  100. "#2 \n\t psrlq $32,%%mm1"
  101. "#2 \n\t shll $6,%%esi"
  102. "#2 \n\t movd %%mm1,%%eax"
  103. "#2 \n\t addl %[precalCoef],%%esi"
  104. "#2 \n\t andl $15,%%eax"
  105. "#2 \n\t movd (%%esi,%%eax,4),%%mm3"
  106. ::[precalCoef]"g"(precalCoef):"eax","esi");
  107. /*
  108. * extraction des coefficients... (Thread #3)
  109. *
  110. * pre : coef dans mm3
  111. *
  112. * post : coef extraits dans mm3 (c1 & c2)
  113. * et mm4 (c3 & c4)
  114. *
  115. * modif : mm5
  116. */
  117. /* (Thread #4)
  118. * pre : mm0 : Y pos [*|Y]
  119. * mm1 : X pos [*|X]
  120. *
  121. * post : mm0 : expix1[position]
  122. * mm2 : expix1[position+largeur]
  123. *
  124. * modif : eax, esi
  125. */
  126. __asm__ __volatile__ (
  127. "#2 \n\t psrld $4, %%mm0"
  128. "#2 \n\t psrld $4, %%mm1" /* PERTEDEC = $4 */
  129. "#4 \n\t movd %%mm1,%%eax"
  130. "#3 \n\t movq %%mm3,%%mm5"
  131. "#4 \n\t mull %[prevX]"
  132. "#4 \n\t movd %%mm0,%%esi"
  133. "#3 \n\t punpcklbw %%mm5, %%mm3"
  134. "#4 \n\t addl %%esi, %%eax"
  135. "#3 \n\t movq %%mm3, %%mm4"
  136. "#3 \n\t movq %%mm3, %%mm5"
  137. "#4 \n\t movl %[expix1], %%esi"
  138. "#3 \n\t punpcklbw %%mm5, %%mm3"
  139. "#4 \n\t movq (%%esi,%%eax,4),%%mm0"
  140. "#3 \n\t punpckhbw %%mm5, %%mm4"
  141. "#4 \n\t addl %[prevX],%%eax"
  142. "#4 \n\t movq (%%esi,%%eax,4),%%mm2"
  143. :
  144. : [expix1] "g"(expix1)
  145. , [prevX] "g"(prevX)
  146. :"eax","esi"
  147. );
  148. /*
  149. * pre : mm0 : expix1[position]
  150. * mm2 : expix1[position+largeur]
  151. * mm3 & mm4 : coefs
  152. */
  153. /* recopie des deux premiers pixels dans mm0 et mm1 */
  154. movq_r2r (mm0, mm1); /* b1-v1-r1-a1-b2-v2-r2-a2 */
  155. /* depackage du premier pixel */
  156. punpcklbw_r2r (mm7, mm0); /* 00-b2-00-v2-00-r2-00-a2 */
  157. /* extraction des coefficients... */
  158. movq_r2r (mm3, mm5); /* c2-c2-c2-c2-c1-c1-c1-c1 */
  159. /*^en parrallele^*/ /* depackage du 2ieme pixel */
  160. /*^*/ punpckhbw_r2r (mm7, mm1); /* 00-b1-00-v1-00-r1-00-a1 */
  161. punpcklbw_r2r (mm7, mm5); /* 00-c1-00-c1-00-c1-00-c1 */
  162. punpckhbw_r2r (mm7, mm3); /* 00-c2-00-c2-00-c2-00-c2 */
  163. /* multiplication des pixels par les coefficients */
  164. pmullw_r2r (mm5, mm0); /* c1*b2-c1*v2-c1*r2-c1*a2 */
  165. pmullw_r2r (mm3, mm1); /* c2*b1-c2*v1-c2*r1-c2*a1 */
  166. paddw_r2r (mm1, mm0);
  167. /* ...extraction des 2 derniers coefficients */
  168. movq_r2r (mm4, mm5); /* c4-c4-c4-c4-c3-c3-c3-c3 */
  169. punpcklbw_r2r (mm7, mm4); /* 00-c3-00-c3-00-c3-00-c3 */
  170. punpckhbw_r2r (mm7, mm5); /* 00-c4-00-c4-00-c4-00-c4 */
  171. /* recuperation des 2 derniers pixels */
  172. movq_r2r (mm2, mm1);
  173. /* depackage des pixels */
  174. punpcklbw_r2r (mm7, mm1);
  175. punpckhbw_r2r (mm7, mm2);
  176. /* multiplication pas les coeffs */
  177. pmullw_r2r (mm4, mm1);
  178. pmullw_r2r (mm5, mm2);
  179. /* ajout des valeurs obtenues ? la valeur finale */
  180. paddw_r2r (mm1, mm0);
  181. paddw_r2r (mm2, mm0);
  182. /* division par 256 = 16+16+16+16, puis repackage du pixel final */
  183. psrlw_i2r (8, mm0);
  184. packuswb_r2r (mm7, mm0);
  185. movd_r2m (mm0,expix2[loop]);
  186. ++loop;
  187. }
  188. __asm__ __volatile__ ("emms\n");
  189. }
  190. #define DRAWMETHOD_PLUS_XMMX(_out,_backbuf,_col) \
  191. { \
  192. movd_m2r(_backbuf, mm0); \
  193. paddusb_m2r(_col, mm0); \
  194. movd_r2m(mm0, _out); \
  195. }
  196. #define DRAWMETHOD DRAWMETHOD_PLUS_XMMX(*p,*p,col)
  197. void draw_line_xmmx (Pixel *data, int x1, int y1, int x2, int y2, int col, int screenx, int screeny)
  198. {
  199. int x, y, dx, dy, yy, xx;
  200. Pixel *p;
  201. if ((y1 < 0) || (y2 < 0) || (x1 < 0) || (x2 < 0) || (y1 >= screeny) || (y2 >= screeny) || (x1 >= screenx) || (x2 >= screenx))
  202. goto end_of_line;
  203. dx = x2 - x1;
  204. dy = y2 - y1;
  205. if (x1 >= x2) {
  206. int tmp;
  207. tmp = x1;
  208. x1 = x2;
  209. x2 = tmp;
  210. tmp = y1;
  211. y1 = y2;
  212. y2 = tmp;
  213. dx = x2 - x1;
  214. dy = y2 - y1;
  215. }
  216. /* vertical line */
  217. if (dx == 0) {
  218. if (y1 < y2) {
  219. p = &(data[(screenx * y1) + x1]);
  220. for (y = y1; y <= y2; y++) {
  221. DRAWMETHOD;
  222. p += screenx;
  223. }
  224. }
  225. else {
  226. p = &(data[(screenx * y2) + x1]);
  227. for (y = y2; y <= y1; y++) {
  228. DRAWMETHOD;
  229. p += screenx;
  230. }
  231. }
  232. goto end_of_line;
  233. }
  234. /* horizontal line */
  235. if (dy == 0) {
  236. if (x1 < x2) {
  237. p = &(data[(screenx * y1) + x1]);
  238. for (x = x1; x <= x2; x++) {
  239. DRAWMETHOD;
  240. p++;
  241. }
  242. goto end_of_line;
  243. }
  244. else {
  245. p = &(data[(screenx * y1) + x2]);
  246. for (x = x2; x <= x1; x++) {
  247. DRAWMETHOD;
  248. p++;
  249. }
  250. goto end_of_line;
  251. }
  252. }
  253. /* 1 */
  254. /* \ */
  255. /* \ */
  256. /* 2 */
  257. if (y2 > y1) {
  258. /* steep */
  259. if (dy > dx) {
  260. dx = ((dx << 16) / dy);
  261. x = x1 << 16;
  262. for (y = y1; y <= y2; y++) {
  263. xx = x >> 16;
  264. p = &(data[(screenx * y) + xx]);
  265. DRAWMETHOD;
  266. if (xx < (screenx - 1)) {
  267. p++;
  268. /* DRAWMETHOD; */
  269. }
  270. x += dx;
  271. }
  272. goto end_of_line;
  273. }
  274. /* shallow */
  275. else {
  276. dy = ((dy << 16) / dx);
  277. y = y1 << 16;
  278. for (x = x1; x <= x2; x++) {
  279. yy = y >> 16;
  280. p = &(data[(screenx * yy) + x]);
  281. DRAWMETHOD;
  282. if (yy < (screeny - 1)) {
  283. p += screeny;
  284. /* DRAWMETHOD; */
  285. }
  286. y += dy;
  287. }
  288. }
  289. }
  290. /* 2 */
  291. /* / */
  292. /* / */
  293. /* 1 */
  294. else {
  295. /* steep */
  296. if (-dy > dx) {
  297. dx = ((dx << 16) / -dy);
  298. x = (x1 + 1) << 16;
  299. for (y = y1; y >= y2; y--) {
  300. xx = x >> 16;
  301. p = &(data[(screenx * y) + xx]);
  302. DRAWMETHOD;
  303. if (xx < (screenx - 1)) {
  304. p--;
  305. /* DRAWMETHOD; */
  306. }
  307. x += dx;
  308. }
  309. goto end_of_line;
  310. }
  311. /* shallow */
  312. else {
  313. dy = ((dy << 16) / dx);
  314. y = y1 << 16;
  315. for (x = x1; x <= x2; x++) {
  316. yy = y >> 16;
  317. p = &(data[(screenx * yy) + x]);
  318. DRAWMETHOD;
  319. if (yy < (screeny - 1)) {
  320. p += screeny;
  321. /* DRAWMETHOD; */
  322. }
  323. y += dy;
  324. }
  325. goto end_of_line;
  326. }
  327. }
  328. end_of_line:
  329. __asm__ __volatile__ ("emms\n");
  330. }
  331. #endif