/media/libvpx/vp8/encoder/x86/variance_sse2.c

http://github.com/zpao/v8monkey · C · 558 lines · 490 code · 57 blank · 11 comment · 56 complexity · b7ace89f5306687aee96c184da06aa0c MD5 · raw file

  1. /*
  2. * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "vpx_config.h"
  11. #include "vp8/encoder/variance.h"
  12. #include "vp8/common/pragmas.h"
  13. #include "vpx_ports/mem.h"
  14. extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
  15. extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
  16. extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
  17. extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
  18. extern void vp8_filter_block2d_bil4x4_var_mmx
  19. (
  20. const unsigned char *ref_ptr,
  21. int ref_pixels_per_line,
  22. const unsigned char *src_ptr,
  23. int src_pixels_per_line,
  24. const short *HFilter,
  25. const short *VFilter,
  26. int *sum,
  27. unsigned int *sumsquared
  28. );
  29. extern unsigned int vp8_get4x4var_mmx
  30. (
  31. const unsigned char *src_ptr,
  32. int source_stride,
  33. const unsigned char *ref_ptr,
  34. int recon_stride,
  35. unsigned int *SSE,
  36. int *Sum
  37. );
  38. unsigned int vp8_get_mb_ss_sse2
  39. (
  40. const short *src_ptr
  41. );
  42. unsigned int vp8_get16x16var_sse2
  43. (
  44. const unsigned char *src_ptr,
  45. int source_stride,
  46. const unsigned char *ref_ptr,
  47. int recon_stride,
  48. unsigned int *SSE,
  49. int *Sum
  50. );
  51. unsigned int vp8_get8x8var_sse2
  52. (
  53. const unsigned char *src_ptr,
  54. int source_stride,
  55. const unsigned char *ref_ptr,
  56. int recon_stride,
  57. unsigned int *SSE,
  58. int *Sum
  59. );
  60. void vp8_filter_block2d_bil_var_sse2
  61. (
  62. const unsigned char *ref_ptr,
  63. int ref_pixels_per_line,
  64. const unsigned char *src_ptr,
  65. int src_pixels_per_line,
  66. unsigned int Height,
  67. int xoffset,
  68. int yoffset,
  69. int *sum,
  70. unsigned int *sumsquared
  71. );
  72. void vp8_half_horiz_vert_variance8x_h_sse2
  73. (
  74. const unsigned char *ref_ptr,
  75. int ref_pixels_per_line,
  76. const unsigned char *src_ptr,
  77. int src_pixels_per_line,
  78. unsigned int Height,
  79. int *sum,
  80. unsigned int *sumsquared
  81. );
  82. void vp8_half_horiz_vert_variance16x_h_sse2
  83. (
  84. const unsigned char *ref_ptr,
  85. int ref_pixels_per_line,
  86. const unsigned char *src_ptr,
  87. int src_pixels_per_line,
  88. unsigned int Height,
  89. int *sum,
  90. unsigned int *sumsquared
  91. );
  92. void vp8_half_horiz_variance8x_h_sse2
  93. (
  94. const unsigned char *ref_ptr,
  95. int ref_pixels_per_line,
  96. const unsigned char *src_ptr,
  97. int src_pixels_per_line,
  98. unsigned int Height,
  99. int *sum,
  100. unsigned int *sumsquared
  101. );
  102. void vp8_half_horiz_variance16x_h_sse2
  103. (
  104. const unsigned char *ref_ptr,
  105. int ref_pixels_per_line,
  106. const unsigned char *src_ptr,
  107. int src_pixels_per_line,
  108. unsigned int Height,
  109. int *sum,
  110. unsigned int *sumsquared
  111. );
  112. void vp8_half_vert_variance8x_h_sse2
  113. (
  114. const unsigned char *ref_ptr,
  115. int ref_pixels_per_line,
  116. const unsigned char *src_ptr,
  117. int src_pixels_per_line,
  118. unsigned int Height,
  119. int *sum,
  120. unsigned int *sumsquared
  121. );
  122. void vp8_half_vert_variance16x_h_sse2
  123. (
  124. const unsigned char *ref_ptr,
  125. int ref_pixels_per_line,
  126. const unsigned char *src_ptr,
  127. int src_pixels_per_line,
  128. unsigned int Height,
  129. int *sum,
  130. unsigned int *sumsquared
  131. );
  132. DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
  133. unsigned int vp8_variance4x4_wmt(
  134. const unsigned char *src_ptr,
  135. int source_stride,
  136. const unsigned char *ref_ptr,
  137. int recon_stride,
  138. unsigned int *sse)
  139. {
  140. unsigned int var;
  141. int avg;
  142. vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
  143. *sse = var;
  144. return (var - ((avg * avg) >> 4));
  145. }
  146. unsigned int vp8_variance8x8_wmt
  147. (
  148. const unsigned char *src_ptr,
  149. int source_stride,
  150. const unsigned char *ref_ptr,
  151. int recon_stride,
  152. unsigned int *sse)
  153. {
  154. unsigned int var;
  155. int avg;
  156. vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
  157. *sse = var;
  158. return (var - ((avg * avg) >> 6));
  159. }
  160. unsigned int vp8_variance16x16_wmt
  161. (
  162. const unsigned char *src_ptr,
  163. int source_stride,
  164. const unsigned char *ref_ptr,
  165. int recon_stride,
  166. unsigned int *sse)
  167. {
  168. unsigned int sse0;
  169. int sum0;
  170. vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
  171. *sse = sse0;
  172. return (sse0 - ((sum0 * sum0) >> 8));
  173. }
  174. unsigned int vp8_mse16x16_wmt(
  175. const unsigned char *src_ptr,
  176. int source_stride,
  177. const unsigned char *ref_ptr,
  178. int recon_stride,
  179. unsigned int *sse)
  180. {
  181. unsigned int sse0;
  182. int sum0;
  183. vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
  184. *sse = sse0;
  185. return sse0;
  186. }
  187. unsigned int vp8_variance16x8_wmt
  188. (
  189. const unsigned char *src_ptr,
  190. int source_stride,
  191. const unsigned char *ref_ptr,
  192. int recon_stride,
  193. unsigned int *sse)
  194. {
  195. unsigned int sse0, sse1, var;
  196. int sum0, sum1, avg;
  197. vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
  198. vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
  199. var = sse0 + sse1;
  200. avg = sum0 + sum1;
  201. *sse = var;
  202. return (var - ((avg * avg) >> 7));
  203. }
  204. unsigned int vp8_variance8x16_wmt
  205. (
  206. const unsigned char *src_ptr,
  207. int source_stride,
  208. const unsigned char *ref_ptr,
  209. int recon_stride,
  210. unsigned int *sse)
  211. {
  212. unsigned int sse0, sse1, var;
  213. int sum0, sum1, avg;
  214. vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
  215. vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
  216. var = sse0 + sse1;
  217. avg = sum0 + sum1;
  218. *sse = var;
  219. return (var - ((avg * avg) >> 7));
  220. }
  221. unsigned int vp8_sub_pixel_variance4x4_wmt
  222. (
  223. const unsigned char *src_ptr,
  224. int src_pixels_per_line,
  225. int xoffset,
  226. int yoffset,
  227. const unsigned char *dst_ptr,
  228. int dst_pixels_per_line,
  229. unsigned int *sse
  230. )
  231. {
  232. int xsum;
  233. unsigned int xxsum;
  234. vp8_filter_block2d_bil4x4_var_mmx(
  235. src_ptr, src_pixels_per_line,
  236. dst_ptr, dst_pixels_per_line,
  237. vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
  238. &xsum, &xxsum
  239. );
  240. *sse = xxsum;
  241. return (xxsum - ((xsum * xsum) >> 4));
  242. }
  243. unsigned int vp8_sub_pixel_variance8x8_wmt
  244. (
  245. const unsigned char *src_ptr,
  246. int src_pixels_per_line,
  247. int xoffset,
  248. int yoffset,
  249. const unsigned char *dst_ptr,
  250. int dst_pixels_per_line,
  251. unsigned int *sse
  252. )
  253. {
  254. int xsum;
  255. unsigned int xxsum;
  256. if (xoffset == 4 && yoffset == 0)
  257. {
  258. vp8_half_horiz_variance8x_h_sse2(
  259. src_ptr, src_pixels_per_line,
  260. dst_ptr, dst_pixels_per_line, 8,
  261. &xsum, &xxsum);
  262. }
  263. else if (xoffset == 0 && yoffset == 4)
  264. {
  265. vp8_half_vert_variance8x_h_sse2(
  266. src_ptr, src_pixels_per_line,
  267. dst_ptr, dst_pixels_per_line, 8,
  268. &xsum, &xxsum);
  269. }
  270. else if (xoffset == 4 && yoffset == 4)
  271. {
  272. vp8_half_horiz_vert_variance8x_h_sse2(
  273. src_ptr, src_pixels_per_line,
  274. dst_ptr, dst_pixels_per_line, 8,
  275. &xsum, &xxsum);
  276. }
  277. else
  278. {
  279. vp8_filter_block2d_bil_var_sse2(
  280. src_ptr, src_pixels_per_line,
  281. dst_ptr, dst_pixels_per_line, 8,
  282. xoffset, yoffset,
  283. &xsum, &xxsum);
  284. }
  285. *sse = xxsum;
  286. return (xxsum - ((xsum * xsum) >> 6));
  287. }
  288. unsigned int vp8_sub_pixel_variance16x16_wmt
  289. (
  290. const unsigned char *src_ptr,
  291. int src_pixels_per_line,
  292. int xoffset,
  293. int yoffset,
  294. const unsigned char *dst_ptr,
  295. int dst_pixels_per_line,
  296. unsigned int *sse
  297. )
  298. {
  299. int xsum0, xsum1;
  300. unsigned int xxsum0, xxsum1;
  301. // note we could avoid these if statements if the calling function
  302. // just called the appropriate functions inside.
  303. if (xoffset == 4 && yoffset == 0)
  304. {
  305. vp8_half_horiz_variance16x_h_sse2(
  306. src_ptr, src_pixels_per_line,
  307. dst_ptr, dst_pixels_per_line, 16,
  308. &xsum0, &xxsum0);
  309. }
  310. else if (xoffset == 0 && yoffset == 4)
  311. {
  312. vp8_half_vert_variance16x_h_sse2(
  313. src_ptr, src_pixels_per_line,
  314. dst_ptr, dst_pixels_per_line, 16,
  315. &xsum0, &xxsum0);
  316. }
  317. else if (xoffset == 4 && yoffset == 4)
  318. {
  319. vp8_half_horiz_vert_variance16x_h_sse2(
  320. src_ptr, src_pixels_per_line,
  321. dst_ptr, dst_pixels_per_line, 16,
  322. &xsum0, &xxsum0);
  323. }
  324. else
  325. {
  326. vp8_filter_block2d_bil_var_sse2(
  327. src_ptr, src_pixels_per_line,
  328. dst_ptr, dst_pixels_per_line, 16,
  329. xoffset, yoffset,
  330. &xsum0, &xxsum0
  331. );
  332. vp8_filter_block2d_bil_var_sse2(
  333. src_ptr + 8, src_pixels_per_line,
  334. dst_ptr + 8, dst_pixels_per_line, 16,
  335. xoffset, yoffset,
  336. &xsum1, &xxsum1
  337. );
  338. xsum0 += xsum1;
  339. xxsum0 += xxsum1;
  340. }
  341. *sse = xxsum0;
  342. return (xxsum0 - ((xsum0 * xsum0) >> 8));
  343. }
  344. unsigned int vp8_sub_pixel_mse16x16_wmt(
  345. const unsigned char *src_ptr,
  346. int src_pixels_per_line,
  347. int xoffset,
  348. int yoffset,
  349. const unsigned char *dst_ptr,
  350. int dst_pixels_per_line,
  351. unsigned int *sse
  352. )
  353. {
  354. vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
  355. return *sse;
  356. }
  357. unsigned int vp8_sub_pixel_variance16x8_wmt
  358. (
  359. const unsigned char *src_ptr,
  360. int src_pixels_per_line,
  361. int xoffset,
  362. int yoffset,
  363. const unsigned char *dst_ptr,
  364. int dst_pixels_per_line,
  365. unsigned int *sse
  366. )
  367. {
  368. int xsum0, xsum1;
  369. unsigned int xxsum0, xxsum1;
  370. if (xoffset == 4 && yoffset == 0)
  371. {
  372. vp8_half_horiz_variance16x_h_sse2(
  373. src_ptr, src_pixels_per_line,
  374. dst_ptr, dst_pixels_per_line, 8,
  375. &xsum0, &xxsum0);
  376. }
  377. else if (xoffset == 0 && yoffset == 4)
  378. {
  379. vp8_half_vert_variance16x_h_sse2(
  380. src_ptr, src_pixels_per_line,
  381. dst_ptr, dst_pixels_per_line, 8,
  382. &xsum0, &xxsum0);
  383. }
  384. else if (xoffset == 4 && yoffset == 4)
  385. {
  386. vp8_half_horiz_vert_variance16x_h_sse2(
  387. src_ptr, src_pixels_per_line,
  388. dst_ptr, dst_pixels_per_line, 8,
  389. &xsum0, &xxsum0);
  390. }
  391. else
  392. {
  393. vp8_filter_block2d_bil_var_sse2(
  394. src_ptr, src_pixels_per_line,
  395. dst_ptr, dst_pixels_per_line, 8,
  396. xoffset, yoffset,
  397. &xsum0, &xxsum0);
  398. vp8_filter_block2d_bil_var_sse2(
  399. src_ptr + 8, src_pixels_per_line,
  400. dst_ptr + 8, dst_pixels_per_line, 8,
  401. xoffset, yoffset,
  402. &xsum1, &xxsum1);
  403. xsum0 += xsum1;
  404. xxsum0 += xxsum1;
  405. }
  406. *sse = xxsum0;
  407. return (xxsum0 - ((xsum0 * xsum0) >> 7));
  408. }
  409. unsigned int vp8_sub_pixel_variance8x16_wmt
  410. (
  411. const unsigned char *src_ptr,
  412. int src_pixels_per_line,
  413. int xoffset,
  414. int yoffset,
  415. const unsigned char *dst_ptr,
  416. int dst_pixels_per_line,
  417. unsigned int *sse
  418. )
  419. {
  420. int xsum;
  421. unsigned int xxsum;
  422. if (xoffset == 4 && yoffset == 0)
  423. {
  424. vp8_half_horiz_variance8x_h_sse2(
  425. src_ptr, src_pixels_per_line,
  426. dst_ptr, dst_pixels_per_line, 16,
  427. &xsum, &xxsum);
  428. }
  429. else if (xoffset == 0 && yoffset == 4)
  430. {
  431. vp8_half_vert_variance8x_h_sse2(
  432. src_ptr, src_pixels_per_line,
  433. dst_ptr, dst_pixels_per_line, 16,
  434. &xsum, &xxsum);
  435. }
  436. else if (xoffset == 4 && yoffset == 4)
  437. {
  438. vp8_half_horiz_vert_variance8x_h_sse2(
  439. src_ptr, src_pixels_per_line,
  440. dst_ptr, dst_pixels_per_line, 16,
  441. &xsum, &xxsum);
  442. }
  443. else
  444. {
  445. vp8_filter_block2d_bil_var_sse2(
  446. src_ptr, src_pixels_per_line,
  447. dst_ptr, dst_pixels_per_line, 16,
  448. xoffset, yoffset,
  449. &xsum, &xxsum);
  450. }
  451. *sse = xxsum;
  452. return (xxsum - ((xsum * xsum) >> 7));
  453. }
  454. unsigned int vp8_variance_halfpixvar16x16_h_wmt(
  455. const unsigned char *src_ptr,
  456. int src_pixels_per_line,
  457. const unsigned char *dst_ptr,
  458. int dst_pixels_per_line,
  459. unsigned int *sse)
  460. {
  461. int xsum0;
  462. unsigned int xxsum0;
  463. vp8_half_horiz_variance16x_h_sse2(
  464. src_ptr, src_pixels_per_line,
  465. dst_ptr, dst_pixels_per_line, 16,
  466. &xsum0, &xxsum0);
  467. *sse = xxsum0;
  468. return (xxsum0 - ((xsum0 * xsum0) >> 8));
  469. }
  470. unsigned int vp8_variance_halfpixvar16x16_v_wmt(
  471. const unsigned char *src_ptr,
  472. int src_pixels_per_line,
  473. const unsigned char *dst_ptr,
  474. int dst_pixels_per_line,
  475. unsigned int *sse)
  476. {
  477. int xsum0;
  478. unsigned int xxsum0;
  479. vp8_half_vert_variance16x_h_sse2(
  480. src_ptr, src_pixels_per_line,
  481. dst_ptr, dst_pixels_per_line, 16,
  482. &xsum0, &xxsum0);
  483. *sse = xxsum0;
  484. return (xxsum0 - ((xsum0 * xsum0) >> 8));
  485. }
  486. unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
  487. const unsigned char *src_ptr,
  488. int src_pixels_per_line,
  489. const unsigned char *dst_ptr,
  490. int dst_pixels_per_line,
  491. unsigned int *sse)
  492. {
  493. int xsum0;
  494. unsigned int xxsum0;
  495. vp8_half_horiz_vert_variance16x_h_sse2(
  496. src_ptr, src_pixels_per_line,
  497. dst_ptr, dst_pixels_per_line, 16,
  498. &xsum0, &xxsum0);
  499. *sse = xxsum0;
  500. return (xxsum0 - ((xsum0 * xsum0) >> 8));
  501. }