/* Single-precision AVX primitives: 8 floats per __m256 vector.
 * Each macro assigns its result into the first argument.  Plain LOAD/STORE
 * variants use unaligned accesses; the "A" variants require the pointer to
 * be 32-byte aligned. */
#define SIMD_LOAD_V8SF(vec,arr) vec=_mm256_loadu_ps(arr)
/* Broadcast one float scalar into all 8 lanes. */
#define SIMD_LOAD_BROADCAST_V8SF(vec,arr) vec=_mm256_set1_ps(arr)
/* Broadcast one double scalar into all 4 lanes of a __m256d. */
#define SIMD_LOAD_BROADCAST_V4DF(vec,arr) vec=_mm256_set1_pd(arr)
/* 32-byte-aligned load of 8 floats. */
#define SIMD_LOADA_V8SF(vec,arr) vec=_mm256_load_ps(arr)
/* Elementwise arithmetic: vec1 = vec2 OP vec3. */
#define SIMD_MULPS(vec1,vec2,vec3) vec1=_mm256_mul_ps(vec2,vec3)
#define SIMD_DIVPS(vec1,vec2,vec3) vec1=_mm256_div_ps(vec2,vec3)
#define SIMD_ADDPS(vec1,vec2,vec3) vec1=_mm256_add_ps(vec2,vec3)
#define SIMD_SUBPS(vec1, vec2, vec3) vec1 = _mm256_sub_ps(vec2, vec3)
/* vec1 = vec2 + vec3*vec4, emulated as a separate multiply then add
 * (no FMA instruction is used).
 * NOTE(review): relies on a __m256 scratch variable __pips_tmp that is
 * presumably declared by wrapper lines of this macro not visible in this
 * chunk — confirm against the full file. */
#define SIMD_MULADDPS(vec1, vec2, vec3, vec4) \
SIMD_MULPS(__pips_tmp,vec3,vec4); \
SIMD_ADDPS(vec1,__pips_tmp,vec2); \
/* Lane shuffles: dist = shuffle of src with the 4-index pattern (i0..i3).
 * Bug fixes: the original computed the shuffle and discarded it (dist was
 * never assigned, making both macros no-ops), and the V8SF variant used the
 * double-precision _mm256_shuffle_pd on single-precision data (a type
 * mismatch).  The V4SF variant operates on a 4-float __m128, so it uses the
 * 128-bit _mm_shuffle_ps.
 * NOTE(review): _mm256_shuffle_ps applies the 4-index pattern within each
 * 128-bit lane independently — confirm that is the intended semantics for
 * V8SF. */
#define SIMD_SHUFFLE_V8SF(dist,src,i0,i1,i2,i3) dist=_mm256_shuffle_ps(src,src,_MM_SHUFFLE(i3,i2,i1,i0))
#define SIMD_SHUFFLE_V4SF(dist,src,i0,i1,i2,i3) dist=_mm_shuffle_ps(src,src,_MM_SHUFFLE(i3,i2,i1,i0))
/* Unary minus: vec1 = -vec2, computed as (0.0f - vec2) per lane.
 * NOTE(review): assigns into a __pips_tmp scratch presumably declared by
 * wrapper lines of this macro not visible in this chunk. */
#define SIMD_UMINPS(vec1, vec2) \
__pips_tmp = _mm256_setzero_ps(); \
vec1 = _mm256_sub_ps(__pips_tmp, vec2); \
/* Store 8 floats from vec to arr: unaligned (STORE) and 32-byte-aligned
 * (STOREA) variants. */
#define SIMD_STORE_V8SF(vec,arr) _mm256_storeu_ps(arr,vec)
#define SIMD_STOREA_V8SF(vec,arr) _mm256_store_ps(arr,vec)
/* Scatter the 8 floats of vec to 8 independent float* destinations:
 * lane 0 -> *(v0), ..., lane 7 -> *(v7).
 * Bug fix: the staging buffer was declared float[4], but the aligned
 * 32-byte store writes 8 floats into it and indices 4..7 are read back —
 * an out-of-bounds write and read.  The buffer must hold 8 floats. */
#define SIMD_STORE_GENERIC_V8SF(vec,v0,v1,v2,v3,v4,v5,v6,v7) \
do { \
float __pips_tmp[8] __attribute__ ((aligned (32))); \
SIMD_STOREA_V8SF(vec,&__pips_tmp[0]); \
*(v0)=__pips_tmp[0]; \
*(v1)=__pips_tmp[1]; \
*(v2)=__pips_tmp[2]; \
*(v3)=__pips_tmp[3]; \
*(v4)=__pips_tmp[4]; \
*(v5)=__pips_tmp[5]; \
*(v6)=__pips_tmp[6]; \
*(v7)=__pips_tmp[7]; \
} while(0)
/* vec = vector of 8 zero floats. */
#define SIMD_ZERO_V8SF(vec) vec = _mm256_setzero_ps()
/* Gather 8 scalar floats into vec so that lane 0 receives v0, ...,
 * lane 7 receives v7 — the inverse of SIMD_STORE_GENERIC_V8SF, which
 * writes lane 0 to *(v0).
 * Bug fix: _mm256_set_ps lists elements from the HIGHEST lane down, so the
 * original _mm256_set_ps(v0,...,v7) put v7 in lane 0 and broke the
 * load/store round-trip; the arguments must be passed in reverse order.
 * Also removes an unused staging array (with a misspelled __attribute). */
#define SIMD_LOAD_GENERIC_V8SF(vec,v0,v1,v2,v3,v4,v5,v6,v7) \
do { \
vec=_mm256_set_ps(v7,v6,v5,v4,v3,v2,v1,v0);\
} while(0)
/* Convert 8 integers at f to 8 floats in v: each element is converted
 * scalar-wise (implicit int -> float on assignment) through a staging
 * buffer, which is then loaded unaligned into the vector. */
#define SIMD_LOAD_V8SI_TO_V8SF(v, f) \
float __pips_tmp[8]; \
__pips_tmp[0] = (f)[0]; \
__pips_tmp[1] = (f)[1]; \
__pips_tmp[2] = (f)[2]; \
__pips_tmp[3] = (f)[3]; \
__pips_tmp[4] = (f)[4]; \
__pips_tmp[5] = (f)[5]; \
__pips_tmp[6] = (f)[6]; \
__pips_tmp[7] = (f)[7]; \
SIMD_LOAD_V8SF(v, __pips_tmp); \
/* Double-precision AVX primitives: 4 doubles per __m256d vector.
 * Unaligned load of 4 doubles. */
#define SIMD_LOAD_V4DF(vec,arr) vec=_mm256_loadu_pd(arr)
/* Elementwise arithmetic: vec1 = vec2 OP vec3. */
#define SIMD_MULPD(vec1,vec2,vec3) vec1=_mm256_mul_pd(vec2,vec3)
#define SIMD_ADDPD(vec1,vec2,vec3) vec1=_mm256_add_pd(vec2,vec3)
/* vec1 = vec2 + vec3*vec4, emulated as a separate multiply then add.
 * NOTE(review): relies on a __m256d scratch __pips_tmp presumably declared
 * by wrapper lines of this macro not visible in this chunk. */
#define SIMD_MULADDPD(vec1, vec2, vec3, vec4) \
SIMD_MULPD(__pips_tmp,vec3,vec4); \
SIMD_ADDPD(vec1,__pips_tmp,vec2); \
/* Unary minus: vec1 = -vec2, computed as (0.0 - vec2) per lane.
 * NOTE(review): assigns into a __pips_tmp scratch presumably declared by
 * wrapper lines of this macro not visible in this chunk. */
#define SIMD_UMINPD(vec1, vec2) \
__pips_tmp = _mm256_setzero_pd(); \
vec1 = _mm256_sub_pd(__pips_tmp, vec2); \
/* vec1 = cos(vec2) elementwise: spill the vector to memory, apply libm
 * cos() per lane, reload.  Requires <math.h>.
 * NOTE(review): the buffer is only 16-byte aligned while a __m256d is 32
 * bytes; this still works because both the store and the reload are
 * unaligned intrinsics, but aligned(32) would be the natural choice. */
#define SIMD_COSPD(vec1, vec2) \
double __pips_tmp[4] __attribute__ ((aligned (16))); \
SIMD_STORE_V4DF(vec2, __pips_tmp); \
__pips_tmp[0] = cos(__pips_tmp[0]); \
__pips_tmp[1] = cos(__pips_tmp[1]); \
__pips_tmp[2] = cos(__pips_tmp[2]); \
__pips_tmp[3] = cos(__pips_tmp[3]); \
SIMD_LOAD_V4DF(vec1, __pips_tmp); \
/* vec1 = sin(vec2) elementwise: spill the vector to memory, apply libm
 * sin() per lane, reload.  Requires <math.h>.
 * NOTE(review): same 16- vs 32-byte alignment remark as SIMD_COSPD — safe
 * only because the store/load intrinsics used are unaligned. */
#define SIMD_SINPD(vec1, vec2) \
double __pips_tmp[4] __attribute__ ((aligned (16))); \
SIMD_STORE_V4DF(vec2, __pips_tmp); \
__pips_tmp[0] = sin(__pips_tmp[0]); \
__pips_tmp[1] = sin(__pips_tmp[1]); \
__pips_tmp[2] = sin(__pips_tmp[2]); \
__pips_tmp[3] = sin(__pips_tmp[3]); \
SIMD_LOAD_V4DF(vec1, __pips_tmp); \
/* Unaligned store of 4 doubles from vec to arr. */
#define SIMD_STORE_V4DF(vec,arr) _mm256_storeu_pd(arr,vec)
/* Scatter the 4 doubles of vec to 4 independent double* destinations:
 * lane 0 -> *(v0), ..., lane 3 -> *(v3), via a staging buffer. */
#define SIMD_STORE_GENERIC_V4DF(vec, v0, v1, v2, v3) \
double __pips_tmp[4]; \
SIMD_STORE_V4DF(vec,&__pips_tmp[0]); \
*(v0)=__pips_tmp[0]; \
*(v1)=__pips_tmp[1]; \
*(v2)=__pips_tmp[2]; \
*(v3)=__pips_tmp[3]; \
/* Gather 4 scalar doubles into vec so that lane 0 receives v0, ...,
 * lane 3 receives v3 — the inverse of SIMD_STORE_GENERIC_V4DF, which
 * writes lane 0 to *(v0).
 * Bug fix: _mm256_set_pd lists elements from the HIGHEST lane down, so the
 * original _mm256_set_pd(v0,v1,v2,v3) put v3 in lane 0 and broke the
 * load/store round-trip; the arguments must be passed in reverse order. */
#define SIMD_LOAD_GENERIC_V4DF(vec,v0,v1,v2,v3) \
do { \
vec=_mm256_set_pd(v3,v2,v1,v0);\
} while(0)
/* Narrow 4 doubles in vec to 4 floats at f: spill to a staging buffer,
 * then convert scalar-wise (implicit double -> float on assignment). */
#define SIMD_STORE_V4DF_TO_V4SF(vec,f) \
double __pips_tmp[4]; \
SIMD_STORE_V4DF(vec, __pips_tmp); \
(f)[0] = __pips_tmp[0]; \
(f)[1] = __pips_tmp[1]; \
(f)[2] = __pips_tmp[2]; \
(f)[3] = __pips_tmp[3]; \
/* Widen 4 packed floats at f to 4 doubles in vec.
 * Bug fix: the original used the aligned _mm_load_ps, but f carries no
 * 16-byte alignment guarantee (all the plain LOAD macros in this file use
 * unaligned accesses), which risks a fault; use _mm_loadu_ps instead. */
#define SIMD_LOAD_V4SF_TO_V4DF(vec,f) \
do { \
__m128 vecsf = _mm_loadu_ps(f);\
vec=_mm256_cvtps_pd(vecsf) ; \
} while(0)
/* 256-bit integer vector viewed as 4 x 64-bit (v4di); arr is a __m256i*.
 * Bug fix: _mm256_store_si256/_mm256_storeu_si256 take (dst, vec) and
 * return void — the originals passed a single argument and assigned the
 * (void) result back into vec, which cannot compile. */
#define SIMD_LOADA_V4DI(vec,arr) \
vec=_mm256_load_si256(arr)

#define SIMD_STOREA_V4DI(vec,arr)\
_mm256_store_si256(arr,vec)

#define SIMD_LOAD_V4DI(vec,arr) \
vec=_mm256_loadu_si256(arr)

#define SIMD_STORE_V4DI(vec,arr) \
_mm256_storeu_si256(arr,vec)
/* 256-bit integer vector viewed as 8 x 32-bit (v8si); arr is a __m256i*.
 * Bug fix: the store intrinsics take (dst, vec) and return void — the
 * originals omitted the data argument and assigned the (void) result back
 * into vec, which cannot compile. */
#define SIMD_LOADA_V8SI(vec,arr) \
vec=_mm256_load_si256(arr)

#define SIMD_STOREA_V8SI(vec,arr)\
_mm256_store_si256(arr,vec)

#define SIMD_LOAD_V8SI(vec,arr) \
vec=_mm256_loadu_si256(arr)

#define SIMD_STORE_V8SI(vec,arr) \
_mm256_storeu_si256(arr,vec)
/* 256-bit integer vector viewed as 16 x 16-bit (v16hi); arr is a __m256i*.
 * Bug fix: the store intrinsics take (dst, vec) and return void — the
 * originals omitted the data argument and assigned the (void) result back
 * into vec, which cannot compile. */
#define SIMD_LOADA_V16HI(vec,arr) \
vec=_mm256_load_si256(arr)

#define SIMD_STOREA_V16HI(vec,arr)\
_mm256_store_si256(arr,vec)

#define SIMD_LOAD_V16HI(vec,arr) \
vec=_mm256_loadu_si256(arr)

#define SIMD_STORE_V16HI(vec,arr) \
_mm256_storeu_si256(arr,vec)
/* 256-bit integer vector viewed as 32 x 8-bit (v32qi); arr is a __m256i*.
 * Bug fix: the store intrinsics take (dst, vec) and return void — the
 * originals omitted the data argument and assigned the (void) result back
 * into vec, which cannot compile. */
#define SIMD_LOADA_V32QI(vec,arr) \
vec=_mm256_load_si256(arr)

#define SIMD_STOREA_V32QI(vec,arr)\
_mm256_store_si256(arr,vec)

#define SIMD_LOAD_V32QI(vec,arr) \
vec=_mm256_loadu_si256(arr)

#define SIMD_STORE_V32QI(vec,arr) \
_mm256_storeu_si256(arr,vec)
double a4df[4] __attribute__((aligned(32)))