PageRenderTime 51ms CodeModel.GetById 15ms app.highlight 32ms RepoModel.GetById 1ms app.codeStats 0ms

/media/libvpx/vp8/encoder/x86/variance_sse2.c

http://github.com/zpao/v8monkey
C | 558 lines | 490 code | 57 blank | 11 comment | 56 complexity | b7ace89f5306687aee96c184da06aa0c MD5 | raw file
  1/*
  2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3 *
  4 *  Use of this source code is governed by a BSD-style license
  5 *  that can be found in the LICENSE file in the root of the source
  6 *  tree. An additional intellectual property rights grant can be found
  7 *  in the file PATENTS.  All contributing project authors may
  8 *  be found in the AUTHORS file in the root of the source tree.
  9 */
 10
 11#include "vpx_config.h"
 12#include "vp8/encoder/variance.h"
 13#include "vp8/common/pragmas.h"
 14#include "vpx_ports/mem.h"
 15
 16extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
 17extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
 18extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
 19extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
 20
 21extern void vp8_filter_block2d_bil4x4_var_mmx
 22(
 23    const unsigned char *ref_ptr,
 24    int ref_pixels_per_line,
 25    const unsigned char *src_ptr,
 26    int src_pixels_per_line,
 27    const short *HFilter,
 28    const short *VFilter,
 29    int *sum,
 30    unsigned int *sumsquared
 31);
 32
 33extern unsigned int vp8_get4x4var_mmx
 34(
 35    const unsigned char *src_ptr,
 36    int  source_stride,
 37    const unsigned char *ref_ptr,
 38    int  recon_stride,
 39    unsigned int *SSE,
 40    int *Sum
 41);
 42
 43unsigned int vp8_get_mb_ss_sse2
 44(
 45    const short *src_ptr
 46);
 47unsigned int vp8_get16x16var_sse2
 48(
 49    const unsigned char *src_ptr,
 50    int source_stride,
 51    const unsigned char *ref_ptr,
 52    int recon_stride,
 53    unsigned int *SSE,
 54    int *Sum
 55);
 56unsigned int vp8_get8x8var_sse2
 57(
 58    const unsigned char *src_ptr,
 59    int source_stride,
 60    const unsigned char *ref_ptr,
 61    int recon_stride,
 62    unsigned int *SSE,
 63    int *Sum
 64);
 65void vp8_filter_block2d_bil_var_sse2
 66(
 67    const unsigned char *ref_ptr,
 68    int ref_pixels_per_line,
 69    const unsigned char *src_ptr,
 70    int src_pixels_per_line,
 71    unsigned int Height,
 72    int  xoffset,
 73    int  yoffset,
 74    int *sum,
 75    unsigned int *sumsquared
 76);
 77void vp8_half_horiz_vert_variance8x_h_sse2
 78(
 79    const unsigned char *ref_ptr,
 80    int ref_pixels_per_line,
 81    const unsigned char *src_ptr,
 82    int src_pixels_per_line,
 83    unsigned int Height,
 84    int *sum,
 85    unsigned int *sumsquared
 86);
 87void vp8_half_horiz_vert_variance16x_h_sse2
 88(
 89    const unsigned char *ref_ptr,
 90    int ref_pixels_per_line,
 91    const unsigned char *src_ptr,
 92    int src_pixels_per_line,
 93    unsigned int Height,
 94    int *sum,
 95    unsigned int *sumsquared
 96);
 97void vp8_half_horiz_variance8x_h_sse2
 98(
 99    const unsigned char *ref_ptr,
100    int ref_pixels_per_line,
101    const unsigned char *src_ptr,
102    int src_pixels_per_line,
103    unsigned int Height,
104    int *sum,
105    unsigned int *sumsquared
106);
107void vp8_half_horiz_variance16x_h_sse2
108(
109    const unsigned char *ref_ptr,
110    int ref_pixels_per_line,
111    const unsigned char *src_ptr,
112    int src_pixels_per_line,
113    unsigned int Height,
114    int *sum,
115    unsigned int *sumsquared
116);
117void vp8_half_vert_variance8x_h_sse2
118(
119    const unsigned char *ref_ptr,
120    int ref_pixels_per_line,
121    const unsigned char *src_ptr,
122    int src_pixels_per_line,
123    unsigned int Height,
124    int *sum,
125    unsigned int *sumsquared
126);
127void vp8_half_vert_variance16x_h_sse2
128(
129    const unsigned char *ref_ptr,
130    int ref_pixels_per_line,
131    const unsigned char *src_ptr,
132    int src_pixels_per_line,
133    unsigned int Height,
134    int *sum,
135    unsigned int *sumsquared
136);
137
/*
 * Filter coefficient table indexed by sub-pixel offset: rows selected by
 * xoffset and yoffset are handed to vp8_filter_block2d_bil4x4_var_mmx() as
 * its HFilter and VFilter arguments.  Declared through DECLARE_ALIGNED with
 * 16 as the alignment argument; the storage itself is defined elsewhere.
 */
DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
139
/*
 * 4x4 variance built on vp8_get4x4var_mmx().
 *
 * The helper's SSE output is stored in *sse.  The return value is that SSE
 * minus (Sum * Sum) >> 4, where Sum is the helper's second output.
 */
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total;
    int diff_total;
    int mean_term;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_total, &diff_total);

    /* Shift by 4 == divide by the 16 samples of a 4x4 block. */
    mean_term = (diff_total * diff_total) >> 4;
    *sse = sq_total;

    return sq_total - mean_term;
}
155
/*
 * 8x8 variance built on vp8_get8x8var_sse2().
 *
 * The helper's SSE output is stored in *sse.  The return value is that SSE
 * minus (Sum * Sum) >> 6, where Sum is the helper's second output.
 */
unsigned int vp8_variance8x8_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total;
    int diff_total;
    int mean_term;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_total, &diff_total);

    /* Shift by 6 == divide by the 64 samples of an 8x8 block. */
    mean_term = (diff_total * diff_total) >> 6;
    *sse = sq_total;

    return sq_total - mean_term;
}
172
173
/*
 * 16x16 variance built on vp8_get16x16var_sse2().
 *
 * src_ptr, source_stride : source block and its stride.
 * ref_ptr, recon_stride  : reference block and its stride.
 * sse                    : receives the helper's SSE output.
 *
 * Returns the SSE minus (Sum * Sum) / 256, where Sum is the helper's second
 * output.
 */
unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;

    /*
     * Square in unsigned arithmetic.  Assuming sum0 is the signed sum of 256
     * differences of 8-bit samples, |sum0| can reach 255 * 256 = 65280, and
     * 65280 * 65280 = 4261478400 does not fit in a 32-bit int: the signed
     * multiply overflowed (undefined behaviour; in practice a negative term
     * and a wrong variance).  The square always fits in 32 unsigned bits,
     * and the modular conversion of a negative sum0 cancels in the product.
     */
    return (sse0 - (((unsigned int)sum0 * (unsigned int)sum0) >> 8));
}
/*
 * 16x16 error measure built on vp8_get16x16var_sse2().
 *
 * Stores the helper's SSE output in *sse and returns the same value; the
 * helper's Sum output is requested but not used.
 */
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_total;
    int unused_sum;   /* the helper requires somewhere to write Sum */

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_total, &unused_sum);

    *sse = sq_total;
    return sq_total;
}
205
206
/*
 * 16x8 variance assembled from two vp8_get8x8var_sse2() calls placed side
 * by side (the second call starts 8 bytes further along both rows).
 *
 * The combined SSE is stored in *sse.  The return value is that SSE minus
 * (Sum * Sum) >> 7, with Sum being the combined Sum outputs.
 */
unsigned int vp8_variance16x8_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_left, sq_right, sq_total;
    int diff_left, diff_right, diff_total;

    /* Left half. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sq_left, &diff_left);
    /* Right half. */
    vp8_get8x8var_sse2(src_ptr + 8, source_stride,
                       ref_ptr + 8, recon_stride,
                       &sq_right, &diff_right);

    sq_total = sq_left + sq_right;
    diff_total = diff_left + diff_right;

    *sse = sq_total;
    /* Shift by 7 == divide by the 128 samples of a 16x8 block. */
    return sq_total - ((diff_total * diff_total) >> 7);
}
227
/*
 * 8x16 variance assembled from two vp8_get8x8var_sse2() calls stacked
 * vertically (the second call starts 8 rows further down both blocks).
 *
 * The combined SSE is stored in *sse.  The return value is that SSE minus
 * (Sum * Sum) >> 7, with Sum being the combined Sum outputs.
 */
unsigned int vp8_variance8x16_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_top, sq_bottom, sq_total;
    int diff_top, diff_bottom, diff_total;

    /* Top half. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sq_top, &diff_top);
    /* Bottom half. */
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride,
                       &sq_bottom, &diff_bottom);

    sq_total = sq_top + sq_bottom;
    diff_total = diff_top + diff_bottom;

    *sse = sq_total;
    /* Shift by 7 == divide by the 128 samples of an 8x16 block. */
    return sq_total - ((diff_total * diff_total) >> 7);
}
248
249unsigned int vp8_sub_pixel_variance4x4_wmt
250(
251    const unsigned char  *src_ptr,
252    int  src_pixels_per_line,
253    int  xoffset,
254    int  yoffset,
255    const unsigned char *dst_ptr,
256    int dst_pixels_per_line,
257    unsigned int *sse
258)
259{
260    int xsum;
261    unsigned int xxsum;
262    vp8_filter_block2d_bil4x4_var_mmx(
263        src_ptr, src_pixels_per_line,
264        dst_ptr, dst_pixels_per_line,
265        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
266        &xsum, &xxsum
267    );
268    *sse = xxsum;
269    return (xxsum - ((xsum * xsum) >> 4));
270}
271
272
/*
 * 8x8 sub-pixel variance.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) are routed to the dedicated
 * half-pixel helpers; every other combination goes through
 * vp8_filter_block2d_bil_var_sse2().  The helper's `sumsquared` output is
 * stored in *sse and the return value is that output minus
 * (sum * sum) >> 6.
 */
unsigned int vp8_sub_pixel_variance8x8_wmt(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_total;
    unsigned int sq_total;

    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              8, &diff_total, &sq_total);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_total, &sq_total);
    }
    else if (y_is_half && xoffset == 0)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, &diff_total, &sq_total);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_total, &sq_total);
    }

    *sse = sq_total;
    return sq_total - ((diff_total * diff_total) >> 6);
}
320
/*
 * 16x16 sub-pixel variance.
 *
 * src_ptr, src_pixels_per_line : first block and its stride.
 * xoffset, yoffset             : sub-pixel offsets.
 * dst_ptr, dst_pixels_per_line : second block and its stride.
 * sse                          : receives the accumulated `sumsquared`
 *                                output of the helper(s).
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) use the 16-wide half-pixel
 * helpers.  Any other combination calls vp8_filter_block2d_bil_var_sse2()
 * twice, the second time with both pointers advanced by 8, and adds the two
 * results.
 *
 * Returns the accumulated `sumsquared` minus (sum * sum) / 256.
 */
unsigned int vp8_sub_pixel_variance16x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    /* note we could avoid these if statements if the calling function
     * just called the appropriate functions inside. */
    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else
    {
        /* First 8 columns. */
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum0, &xxsum0
        );

        /* Remaining 8 columns. */
        vp8_filter_block2d_bil_var_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum1, &xxsum1
        );
        xsum0 += xsum1;
        xxsum0 += xxsum1;
    }

    *sse = xxsum0;

    /*
     * Square in unsigned arithmetic.  Assuming xsum0 is the signed sum of
     * 256 differences of 8-bit samples, |xsum0| can reach 255 * 256 = 65280,
     * and 65280 * 65280 = 4261478400 does not fit in a 32-bit int: the
     * signed multiply overflowed (undefined behaviour; in practice a
     * negative term and a wrong variance).  The square always fits in 32
     * unsigned bits, and the modular conversion of a negative xsum0 cancels
     * in the product.
     */
    return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8));
}
381
/*
 * 16x16 sub-pixel error measure.
 *
 * Delegates to vp8_sub_pixel_variance16x16_wmt() purely for the value it
 * stores in *sse, and returns that stored value instead of the variance.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    /* The variance return value is intentionally discarded. */
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);

    return *sse;
}
395
/*
 * 16x8 sub-pixel variance.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) use the 16-wide half-pixel
 * helpers with a height of 8.  Any other combination calls
 * vp8_filter_block2d_bil_var_sse2() twice, the second time with both
 * pointers advanced by 8, and adds the two results.  The accumulated
 * `sumsquared` is stored in *sse and the return value is that figure minus
 * (sum * sum) >> 7.
 */
unsigned int vp8_sub_pixel_variance16x8_wmt(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_total, diff_right;
    unsigned int sq_total, sq_right;

    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               8, &diff_total, &sq_total);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          8, &diff_total, &sq_total);
    }
    else if (y_is_half && xoffset == 0)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_total, &sq_total);
    }
    else
    {
        /* First 8 columns. */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_total, &sq_total);

        /* Remaining 8 columns. */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_right, &sq_right);

        diff_total += diff_right;
        sq_total += sq_right;
    }

    *sse = sq_total;
    return sq_total - ((diff_total * diff_total) >> 7);
}
452
/*
 * 8x16 sub-pixel variance.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) use the 8-wide half-pixel
 * helpers with a height of 16; every other combination goes through
 * vp8_filter_block2d_bil_var_sse2().  The helper's `sumsquared` output is
 * stored in *sse and the return value is that output minus
 * (sum * sum) >> 7.
 */
unsigned int vp8_sub_pixel_variance8x16_wmt(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_total;
    unsigned int sq_total;

    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              16, &diff_total, &sq_total);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_total, &sq_total);
    }
    else if (y_is_half && xoffset == 0)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, &diff_total, &sq_total);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_total, &sq_total);
    }

    *sse = sq_total;
    return sq_total - ((diff_total * diff_total) >> 7);
}
500
501
/*
 * 16x16 variance through vp8_half_horiz_variance16x_h_sse2() with a height
 * of 16.
 *
 * src_ptr, src_pixels_per_line : first block and its stride.
 * dst_ptr, dst_pixels_per_line : second block and its stride.
 * sse                          : receives the helper's `sumsquared` output.
 *
 * Returns `sumsquared` minus (sum * sum) / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /*
     * Square in unsigned arithmetic.  Assuming xsum0 is the signed sum of
     * 256 differences of 8-bit samples, |xsum0| can reach 65280, whose
     * square (4261478400) overflows a 32-bit int (undefined behaviour; in
     * practice a negative term and a wrong variance) but always fits in 32
     * unsigned bits.
     */
    return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8));
}
520
521
/*
 * 16x16 variance through vp8_half_vert_variance16x_h_sse2() with a height
 * of 16.
 *
 * src_ptr, src_pixels_per_line : first block and its stride.
 * dst_ptr, dst_pixels_per_line : second block and its stride.
 * sse                          : receives the helper's `sumsquared` output.
 *
 * Returns `sumsquared` minus (sum * sum) / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /*
     * Square in unsigned arithmetic.  Assuming xsum0 is the signed sum of
     * 256 differences of 8-bit samples, |xsum0| can reach 65280, whose
     * square (4261478400) overflows a 32-bit int (undefined behaviour; in
     * practice a negative term and a wrong variance) but always fits in 32
     * unsigned bits.
     */
    return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8));
}
539
540
/*
 * 16x16 variance through vp8_half_horiz_vert_variance16x_h_sse2() with a
 * height of 16.
 *
 * src_ptr, src_pixels_per_line : first block and its stride.
 * dst_ptr, dst_pixels_per_line : second block and its stride.
 * sse                          : receives the helper's `sumsquared` output.
 *
 * Returns `sumsquared` minus (sum * sum) / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /*
     * Square in unsigned arithmetic.  Assuming xsum0 is the signed sum of
     * 256 differences of 8-bit samples, |xsum0| can reach 65280, whose
     * square (4261478400) overflows a 32-bit int (undefined behaviour; in
     * practice a negative term and a wrong variance) but always fits in 32
     * unsigned bits.
     */
    return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8));
}