PageRenderTime 42ms CodeModel.GetById 35ms app.highlight 4ms RepoModel.GetById 2ms app.codeStats 0ms

/FFdecsa/parallel_064_mmx.h

https://bitbucket.org/a84/gs_public/
C++ Header | 83 lines | 47 code | 16 blank | 20 comment | 0 complexity | aa76ab238583a4be431c53a26c1a7c90 MD5 | raw file
 1/* FFdecsa -- fast decsa algorithm
 2 *
 3 * Copyright (C) 2007 Dark Avenger
 4 *               2003-2004  fatih89r
 5 *
 6 * This program is free software; you can redistribute it and/or modify
 7 * it under the terms of the GNU General Public License as published by
 8 * the Free Software Foundation; either version 2 of the License, or
 9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21#include <mmintrin.h>
22
/*
 * MEMALIGN: attribute suffix that requests 16-byte alignment for the
 * object or member it is attached to (GCC/Clang attribute syntax).
 * The attribute name is spelled with surrounding underscores so that a
 * user macro named `aligned` cannot interfere with it.
 */
#define MEMALIGN __attribute__((__aligned__(16)))
24
/*
 * Holder for a 64-bit constant: the same storage viewed either as two
 * unsigned ints or as a single __m64 value.
 *
 * `u` has to remain the first member: the constants in this header are
 * written with positional brace initializers, which fill the first member.
 * NOTE(review): the layout assumes unsigned int is 32 bits wide, so that
 * u[2] covers exactly the 8 bytes of the __m64 -- confirm for new targets.
 */
union __u64 {
    unsigned u[2];  /* integer view, used to write the initializers */
    __m64    v;     /* MMX view of the same bits */
};
29
30static const union __u64 ff0 = {{0x00000000U, 0x00000000U}};
31static const union __u64 ff1 = {{0xffffffffU, 0xffffffffU}};
32
/*
 * `group` is the 64-bit MMX value type; GROUP_PARALLELISM is its width in
 * bits.  NOTE(review): presumably each bit position carries one independent
 * item processed in parallel -- confirm against the code using this header.
 */
typedef __m64 group;
#define GROUP_PARALLELISM 64

/* Constant groups: all bits clear / all bits set. */
#define FF0()       (ff0.v)
#define FF1()       (ff1.v)

/* Bitwise AND / OR / XOR over the whole 64-bit value. */
#define FFAND(x, y) _mm_and_si64((x), (y))
#define FFOR(x, y)  _mm_or_si64((x), (y))
#define FFXOR(x, y) _mm_xor_si64((x), (y))

/* Bitwise NOT, expressed as XOR with the all-ones constant. */
#define FFNOT(x)    FFXOR((x), FF1())
41
42/* 64 rows of 64 bits */
43
44static const union __u64 ff29 = {{0x29292929U, 0x29292929U}};
45static const union __u64 ff02 = {{0x02020202U, 0x02020202U}};
46static const union __u64 ff04 = {{0x04040404U, 0x04040404U}};
47static const union __u64 ff10 = {{0x10101010U, 0x10101010U}};
48static const union __u64 ff40 = {{0x40404040U, 0x40404040U}};
49static const union __u64 ff80 = {{0x80808080U, 0x80808080U}};
50
/*
 * `batch` is the 64-bit MMX value type viewed as BYTES_PER_BATCH (8) bytes.
 */
typedef __m64 batch;
#define BYTES_PER_BATCH 8

/* Bitwise operations on a batch reuse the group macros unchanged. */
#define B_FFAND(x, y) FFAND((x), (y))
#define B_FFOR(x, y)  FFOR((x), (y))
#define B_FFXOR(x, y) FFXOR((x), (y))

/* Accessors for the byte-replicated constants (same byte in all 8 lanes). */
#define B_FFN_ALL_29() (ff29.v)
#define B_FFN_ALL_02() (ff02.v)
#define B_FFN_ALL_04() (ff04.v)
#define B_FFN_ALL_10() (ff10.v)
#define B_FFN_ALL_40() (ff40.v)
#define B_FFN_ALL_80() (ff80.v)

/*
 * Logical shift left / right by `bits` bit positions.  These shift the
 * 64-bit value as a whole, so bits do cross byte boundaries.
 * NOTE(review): the "8" in the names suggests per-byte use; presumably
 * callers mask operands or results so no bit leaks into a neighbouring
 * byte -- confirm at the call sites.
 */
#define B_FFSH8L(x, bits) _mm_slli_si64((x), (bits))
#define B_FFSH8R(x, bits) _mm_srli_si64((x), (bits))
64
/*
 * M_EMPTY: wrapper around _mm_empty(), the intrinsic that empties the MMX
 * state.  Intended to be invoked after a run of MMX operations and before
 * any x87 floating-point code, which shares registers with MMX.
 */
#define M_EMPTY()  _mm_empty()
66
67
/*
 * 8-byte span helpers implemented with 64-bit MMX loads and stores.
 * Each macro first #undefs any earlier definition of the same name.
 *
 * All pointer arguments are parenthesized before being cast, so expression
 * arguments such as `buf + 8` work.  Source operands are read through
 * pointers to const, so passing a const buffer does not drop the qualifier.
 *
 * NOTE(review): the pointers are dereferenced as __m64 with no alignment
 * check -- confirm what alignment the callers guarantee.
 */

/* XOR_8_BY(d, s1, s2): store the XOR of the 8 bytes at s1 and s2 into d. */
#undef XOR_8_BY
#define XOR_8_BY(d,s1,s2)    do { *(__m64 *)(d) = _mm_xor_si64(*(const __m64 *)(s1), *(const __m64 *)(s2)); } while(0)

/*
 * XOREQ_8_BY(d, s): XOR the 8 bytes at s into the 8 bytes at d, in place.
 * `d` is evaluated twice, so it must not have side effects.
 */
#undef XOREQ_8_BY
#define XOREQ_8_BY(d,s)      XOR_8_BY((d), (d), (s))

/* COPY_8_BY(d, s): copy the 8 bytes at s to d. */
#undef COPY_8_BY
#define COPY_8_BY(d,s)       do { *(__m64 *)(d) = *(const __m64 *)(s); } while(0)

/* Number of bytes handled by one XOR_BEST_BY step. */
#undef BEST_SPAN
#define BEST_SPAN            8

/* XOR_BEST_BY(d, s1, s2): XOR over BEST_SPAN bytes; here the 8-byte XOR. */
#undef XOR_BEST_BY
#define XOR_BEST_BY(d,s1,s2) XOR_8_BY((d), (s1), (s2))
82
83#include "fftable.h"