/* 4 x 32-bit integer (V4SI) SSE wrappers.  The destination is always the
 * first macro argument.  `arr` is parenthesized in every expansion so that
 * pointer expressions such as `p + 4` bind to the cast correctly. */
#define SIMD_LOAD_V4SI(vec,arr) (vec)=_mm_loadu_si128((__m128i*)(arr))
#define SIMD_LOADA_V4SI(vec,arr) (vec)=_mm_load_si128((__m128i*)(arr))
/* Fix: `_mm_set1_si128` is not an SSE intrinsic; `_mm_set1_epi32` (SSE2)
 * broadcasts the 32-bit value to all four lanes, which is what a V4SI
 * broadcast must do. */
#define SIMD_LOAD_BROADCAST_V4SI(vec,val) (vec)=_mm_set1_epi32(val)
/* Fix: `_mm_mul_epi32` multiplies only the two EVEN 32-bit lanes into
 * 64-bit products; `_mm_mullo_epi32` (also SSE4.1) is the element-wise
 * 4 x 32-bit multiply that matches SIMD_ADDD/SIMD_SUBD below. */
#define SIMD_MULD(vec1,vec2,vec3) (vec1)=_mm_mullo_epi32(vec2,vec3)
#define SIMD_ADDD(vec1,vec2,vec3) (vec1)=_mm_add_epi32(vec2,vec3)
#define SIMD_SUBD(vec1,vec2,vec3) (vec1)=_mm_sub_epi32(vec2,vec3)
#define SIMD_STORE_V4SI(vec,arr) _mm_storeu_si128((__m128i*)(arr),vec)
#define SIMD_STOREA_V4SI(vec,arr) _mm_store_si128((__m128i*)(arr),vec)
/* Packed single-precision (4 x float) SSE arithmetic wrappers.
 * The result vector is always the first macro argument. */
#define SIMD_LOAD_V4SF(vec, arr)           (vec) = _mm_loadu_ps(arr)
#define SIMD_LOADA_V4SF(vec, arr)          (vec) = _mm_load_ps(arr)
#define SIMD_LOAD_BROADCAST_V4SF(vec, val) (vec) = _mm_set1_ps(val)
#define SIMD_MULPS(vec1, vec2, vec3)       (vec1) = _mm_mul_ps(vec2, vec3)
#define SIMD_DIVPS(vec1, vec2, vec3)       (vec1) = _mm_div_ps(vec2, vec3)
#define SIMD_ADDPS(vec1, vec2, vec3)       (vec1) = _mm_add_ps(vec2, vec3)
#define SIMD_SUBPS(vec1, vec2, vec3)       (vec1) = _mm_sub_ps(vec2, vec3)
34 #define SIMD_MULADDPS(vec1, vec2, vec3, vec4) \
37 SIMD_MULPS(__pips_tmp, vec3, vec4);\
38 SIMD_ADDPS(vec1, __pips_tmp, vec2); \
/* dist lane k receives src lane i_k (k = 0..3, selectors must be 0-3).
 * Fix: the original expansion was missing the closing parenthesis of the
 * _mm_shuffle_ps call, so any use of this macro failed to compile. */
#define SIMD_SHUFFLE_V4SF(dist,src,i0,i1,i2,i3) (dist)=_mm_shuffle_ps(src,src,_MM_SHUFFLE(i3,i2,i1,i0))
44 #define SIMD_UMINPS(vec1, vec2) \
47 __pips_tmp = _mm_setzero_ps(); \
48 vec1 = _mm_sub_ps(__pips_tmp, vec2); \
/* Store 4 packed floats: unaligned (STORE) and 16-byte-aligned (STOREA). */
#define SIMD_STORE_V4SF(vec, arr)  _mm_storeu_ps((arr), (vec))
#define SIMD_STOREA_V4SF(vec, arr) _mm_store_ps((arr), (vec))
53 #define SIMD_STORE_GENERIC_V4SF(vec,v0,v1,v2,v3) \
55 float __pips_tmp[4] __attribute__ ((aligned (16))); \
56 SIMD_STOREA_V4SF(vec,&__pips_tmp[0]); \
57 *(v0)=__pips_tmp[0]; \
58 *(v1)=__pips_tmp[1]; \
59 *(v2)=__pips_tmp[2]; \
60 *(v3)=__pips_tmp[3]; \
/* Set all four float lanes to 0.0f. */
#define SIMD_ZERO_V4SF(vec) (vec) = _mm_setzero_ps()
/* Reverse the four lanes of vec (lane 0 <-> lane 3, lane 1 <-> lane 2).
 * Fix: the original used _MM_SHUFFLE(4,3,2,1), but shuffle selectors must
 * be in 0..3 -- selector 4 overflows the 8-bit immediate and is invalid.
 * _MM_SHUFFLE(0,1,2,3) selects lanes 3,2,1,0, i.e. the reversal that the
 * name "invert" suggests (assumption: invert == reverse lane order). */
#define SIMD_INVERT_V4SF(vec) (vec) = _mm_shuffle_ps(vec,vec,_MM_SHUFFLE(0,1,2,3))
66 #define SIMD_LOAD_GENERIC_V4SF(vec,v0,v1,v2,v3) \
68 float __pips_v[4] __attribute ((aligned (16)));\
73 SIMD_LOADA_V4SF(vec,&__pips_v[0]); \
77 #define SIMD_STORE_MASKED_V4SF(vec,arr) \
79 float __pips_tmp[4] __attribute__ ((aligned (16))); \
80 SIMD_STOREA_V4SF(vec,&__pips_tmp[0]); \
81 (arr)[0] = __pips_tmp[0]; \
82 (arr)[1] = __pips_tmp[1]; \
83 (arr)[2] = __pips_tmp[2]; \
86 #define SIMD_LOAD_V4SI_TO_V4SF(v, f) \
88 float __pips_tmp[4]; \
89 __pips_tmp[0] = (f)[0]; \
90 __pips_tmp[1] = (f)[1]; \
91 __pips_tmp[2] = (f)[2]; \
92 __pips_tmp[3] = (f)[3]; \
93 SIMD_LOAD_V4SF(v, __pips_tmp); \
/* Packed double-precision (2 x double) SSE2 wrappers.
 * The result vector is always the first macro argument. */
#define SIMD_LOAD_V2DF(vec, arr)     (vec) = _mm_loadu_pd(arr)
#define SIMD_MULPD(vec1, vec2, vec3) (vec1) = _mm_mul_pd(vec2, vec3)
#define SIMD_ADDPD(vec1, vec2, vec3) (vec1) = _mm_add_pd(vec2, vec3)
100 #define SIMD_MULADDPD(vec1, vec2, vec3, vec4) \
103 SIMD_MULPD(__pips_tmp, vec3, vec4);\
104 SIMD_ADDPD(vec1, __pips_tmp, vec2); \
106 #define SIMD_UMINPD(vec1, vec2) \
108 __m128d __pips_tmp; \
109 __pips_tmp = _mm_setzero_pd(); \
110 vec1 = _mm_sub_pd(__pips_tmp, vec2); \
113 #define SIMD_COSPD(vec1, vec2) \
115 double __pips_tmp[2] __attribute__ ((aligned (16))); \
116 SIMD_STORE_V2DF(vec2, __pips_tmp); \
117 __pips_tmp[0] = cos(__pips_tmp[0]); \
118 __pips_tmp[1] = cos(__pips_tmp[1]); \
119 SIMD_LOAD_V2DF(vec1, __pips_tmp); \
122 #define SIMD_SINPD(vec1, vec2) \
124 double __pips_tmp[2] __attribute__ ((aligned (16))); \
125 SIMD_STORE_V2DF(vec2, __pips_tmp); \
126 __pips_tmp[0] = sin(__pips_tmp[0]); \
127 __pips_tmp[1] = sin(__pips_tmp[1]); \
128 SIMD_LOAD_V2DF(vec1, __pips_tmp); \
/* Store 2 packed doubles to arr (no alignment requirement). */
#define SIMD_STORE_V2DF(vec, arr) _mm_storeu_pd((arr), (vec))
132 #define SIMD_STORE_GENERIC_V2DF(vec, v0, v1) \
134 double __pips_tmp[2]; \
135 SIMD_STORE_V2DF(vec,&__pips_tmp[0]); \
136 *(v0)=__pips_tmp[0]; \
137 *(v1)=__pips_tmp[1]; \
139 #define SIMD_LOAD_GENERIC_V2DF(vec,v0,v1) \
141 double v[2] = { v0,v1}; \
142 SIMD_LOAD_V2DF(vec,&v[0]); \
146 #define SIMD_STORE_V2DF_TO_V2SF(vec,f) \
148 double __pips_tmp[2]; \
149 SIMD_STORE_V2DF(vec, __pips_tmp); \
150 (f)[0] = __pips_tmp[0]; \
151 (f)[1] = __pips_tmp[1]; \
/* Load two floats f[0], f[1] widened to a vector of two doubles; the
 * float->double conversion happens in SIMD_LOAD_GENERIC_V2DF's
 * double-array initializer. */
#define SIMD_LOAD_V2SF_TO_V2DF(vec, f) SIMD_LOAD_GENERIC_V2DF(vec, (f)[0], (f)[1])
/* Load 8 x 16-bit integers from arr into vec.
 * Fix: the original expanded to `vec = (__m128i*)(arr)`, which assigns the
 * POINTER itself to the vector variable instead of loading the 16 bytes it
 * points at (a type error for a __m128i destination).  Use an explicit
 * load; the unaligned form is safe for any arr. */
#define SIMD_LOAD_V8HI(vec, arr) \
	(vec) = _mm_loadu_si128((__m128i *)(arr))
/* Store 8 x 16-bit integers to arr.  This is an aligned 128-bit store:
 * arr must be 16-byte aligned (same contract as the original direct
 * vector-pointer assignment). */
#define SIMD_STORE_V8HI(vec, arr) \
	_mm_store_si128((__m128i *)&(arr)[0], (vec))
/* Alias for the plain 8 x 16-bit store.
 * NOTE(review): despite the _TO_V8SI name, no 16->32-bit widening is
 * performed here -- presumably the conversion is handled elsewhere;
 * confirm against the callers. */
#define SIMD_STORE_V8HI_TO_V8SI(vec, arr) SIMD_STORE_V8HI(vec, arr)
/* Alias for the plain 8 x 16-bit load.
 * NOTE(review): despite the _V8SI name, no 32->16-bit narrowing happens
 * here -- verify that callers really pass 16-bit data. */
#define SIMD_LOAD_V8SI_TO_V8HI(vec, arr) SIMD_LOAD_V8HI(vec, arr)
float a2sf[2] __attribute__((aligned(16)))