PIPS
neon.h
Go to the documentation of this file.
1
#include <arm_neon.h>
2
3
4
/* Uses 128-bits NEON instructions.
5
Notes :
6
* NEON can also operate on 64-bits vectors.
7
* NEON does not operate on double-precision float. However, VFP can work on double-precision 64-bits vectors,
8
but VFP is not a simd unit : it processes vectors scalar by scalar.
9
* __reminder__ : NEON data types : signed/unsigned 8-bit, 16-bit, 32-bit, 64-bit, single precision floating point
10
* TODO: alignment: [1] says that each instruction has an alignment offset argument, but I can't find it in
11
the intrinsics... Another option is to use an instruction that sets the alignment offset before each call,
12
but it sounds like wasted cycles...
13
* TODO: a feature of NEON is to load/store up to 4 vectors with just one instruction
14
(see vst{1,2,3,4}/vld{1,2,3,4} variants and [2] for examples). This needs (I think) a modification in SAC.
15
* [2] is a nice summary of the intrinsics used here
16
17
[1] : http://infocenter.arm.com/help/topic/com.arm.doc.dui0489b/CIHGIAEH.html
18
[2] : http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html
19
*/
20
21
22
/* Memory-side float buffer: 8 float32_t elements on a 32-byte boundary.
 * NOTE(review): a 128-bit NEON vector holds 4 floats, yet this array has 8
 * elements; confirm the extra room is intended. */
typedef float32_t a4sf[8] __attribute__((aligned(32)));
23
/* Memory-side buffer of 4 int64_t elements on a 32-byte boundary. */
typedef int64_t a2di[4] __attribute__((aligned(32)));
24
/* Memory-side buffer of 8 int32_t elements on a 32-byte boundary. */
typedef int32_t a4si[8] __attribute__((aligned(32)));
25
/* Memory-side buffer of 16 int16_t elements on a 32-byte boundary. */
typedef int16_t a8hi[16] __attribute__((aligned(32)));
26
/* Memory-side buffer of 32 int8_t elements on a 32-byte boundary. */
typedef int8_t a16qi[32] __attribute__((aligned(32)));
27
28
typedef
float32x4_t
v4sf
;
29
typedef
int64x2_t
v2di
;
30
typedef
int32x4_t
v4si
;
31
typedef
int16x8_t
v8hi
;
32
typedef
int8x16_t
v16qi
;
33
34
/* float */
35
/* dst <- four consecutive floats read from src. */
#define SIMD_LOAD_V4SF(dst, src) dst = vld1q_f32(src)
/* "Aligned" flavour; expands to the very same vld1q_f32 call. */
#define SIMD_LOADA_V4SF(dst, src) dst = vld1q_f32(src)
/* dst <- lhs * rhs, lane by lane. */
#define SIMD_MULPS(dst, lhs, rhs) dst = vmulq_f32(lhs, rhs)
38
/* vec1 <- vec2 / vec3, lane by lane.
 *
 * The divisor argument is no longer overwritten: the reciprocal is built in a
 * local temporary instead of being stored back into vec3.
 *
 * AArch64 has a true vector divide (vdivq_f32). Elsewhere the reciprocal
 * estimate from vrecpeq_f32 is refined by two Newton-Raphson steps
 * (vrecpsq_f32 returns 2 - d*x, so x <- x * (2 - d*x)), because the raw
 * estimate alone is only accurate to a few bits. The result is close to, but
 * not guaranteed identical to, an IEEE-754 division. */
#if defined(__aarch64__)
#define SIMD_DIVPS(vec1,vec2,vec3) \
    do { \
        (vec1) = vdivq_f32((vec2), (vec3)); \
    } while (0)
#else
#define SIMD_DIVPS(vec1,vec2,vec3) \
    do { \
        float32x4_t __pips_den = (vec3); \
        float32x4_t __pips_inv = vrecpeq_f32(__pips_den); \
        __pips_inv = vmulq_f32(vrecpsq_f32(__pips_den, __pips_inv), __pips_inv); \
        __pips_inv = vmulq_f32(vrecpsq_f32(__pips_den, __pips_inv), __pips_inv); \
        (vec1) = vmulq_f32((vec2), __pips_inv); \
    } while (0)
#endif
44
45
/* dst <- lhs + rhs, lane by lane. */
#define SIMD_ADDPS(dst, lhs, rhs) dst = vaddq_f32(lhs, rhs)
/* dst <- lhs - rhs, lane by lane. */
#define SIMD_SUBPS(dst, lhs, rhs) dst = vsubq_f32(lhs, rhs)
/* dst <- acc + (fac1 * fac2), lane by lane (vmlaq_f32 argument order). */
#define SIMD_MULADDPS(dst, acc, fac1, fac2) dst = vmlaq_f32(acc, fac1, fac2)
/* Unary minus: dst <- -src. */
#define SIMD_UMINPS(dst, src) dst = vnegq_f32(src)
/* Write the four lanes of src to memory at mem. */
#define SIMD_STORE_V4SF(src, mem) vst1q_f32(mem, src)
/* "Aligned" flavour; expands to the very same vst1q_f32 call. */
#define SIMD_STOREA_V4SF(src, mem) vst1q_f32(mem, src)
51
/* Scatter the four lanes of vec to four independent float locations.
 * v0..v3 are pointers; lane i is written to *vi. The vector is first spilled
 * to a 16-byte aligned scratch array, then copied out element by element. */
#define SIMD_STORE_GENERIC_V4SF(vec, v0, v1, v2, v3) \
    do { \
        float __pips_lanes[4] __attribute__((aligned(16))); \
        vst1q_f32(&__pips_lanes[0], vec); \
        *(v0) = __pips_lanes[0]; \
        *(v1) = __pips_lanes[1]; \
        *(v2) = __pips_lanes[2]; \
        *(v3) = __pips_lanes[3]; \
    } while (0)
60
61
/* Set all four lanes of vec to 0.0f.
 * A constant broadcast is used rather than vec - vec: the subtraction reads
 * vec before it has a defined value, and yields NaN (not zero) whenever a
 * lane happens to hold NaN or an infinity. */
#define SIMD_ZERO_V4SF(vec) vec = vdupq_n_f32(0.0f)
62
63
/* Gather four scalar values into one vector: lane i of vec receives vi.
 * The values are packed, in order, into a 16-byte aligned scratch array that
 * is then loaded as a whole. */
#define SIMD_LOAD_GENERIC_V4SF(vec, v0, v1, v2, v3) \
    do { \
        float __pips_pack[4] __attribute__((aligned(16))); \
        __pips_pack[0] = (v0); \
        __pips_pack[1] = (v1); \
        __pips_pack[2] = (v2); \
        __pips_pack[3] = (v3); \
        vec = vld1q_f32(&__pips_pack[0]); \
    } while (0)
72
73
/* Store for a padded vector: only lanes 0, 1 and 2 are written to arr, the
 * fourth lane is discarded. This goes through a scratch array and is
 * acknowledged to be a very poor implementation. */
#define SIMD_STORE_MASKED_V4SF(vec, arr) \
    do { \
        float __pips_lanes[4] __attribute__((aligned(16))); \
        vst1q_f32(&__pips_lanes[0], vec); \
        (arr)[0] = __pips_lanes[0]; \
        (arr)[1] = __pips_lanes[1]; \
        (arr)[2] = __pips_lanes[2]; \
    } while (0)
82
83
/* Load four elements of f into the float vector v, converting each element
 * to float through an ordinary C assignment before the vector load. */
#define SIMD_LOAD_V4SI_TO_V4SF(v, f) \
    do { \
        float __pips_cvt[4]; \
        __pips_cvt[0] = (f)[0]; \
        __pips_cvt[1] = (f)[1]; \
        __pips_cvt[2] = (f)[2]; \
        __pips_cvt[3] = (f)[3]; \
        v = vld1q_f32(__pips_cvt); \
    } while (0)
92
93
/* int64_t */
94
/* vec <- two consecutive int64_t values read from arr. */
#define SIMD_LOAD_V2DI(vec,arr) vec=vld1q_s64(arr)
/* Write the two lanes of vec to memory at arr. */
#define SIMD_STORE_V2DI(vec,arr) vst1q_s64(arr,vec)

/* Set both lanes of vec to zero.
 * A constant broadcast replaces the former veorq_s64(vec,vec), which read vec
 * before it had been given a value. */
#define SIMD_ZERO_V2DI(vec) vec = vdupq_n_s64(0)

/* v1 <- v2 + v3, lane by lane. */
#define SIMD_ADDDI(v1,v2,v3) v1=vaddq_s64(v2,v3)
/* v1 <- v2 - v3, lane by lane. */
#define SIMD_SUBDI(v1,v2,v3) v1=vsubq_s64(v2,v3)
101
/* 64-bit integer divide / multiply / multiply-accumulate.
 *
 * arm_neon.h provides no vrecpeq_s64, vmulq_s64 or vmlaq_s64, so the previous
 * definitions could not compile once expanded. Each operation is therefore
 * carried out lane by lane on scratch arrays and the result is reloaded.
 * Unlike the former SIMD_DIVDI, the divisor argument is not modified.
 * These three macros are statements, not expressions. */

/* vec1 <- vec2 / vec3 (C integer division, truncating toward zero).
 * As in scalar C, a zero divisor or INT64_MIN / -1 is undefined. */
#define SIMD_DIVDI(vec1,vec2,vec3) \
    do { \
        int64_t __pips_num[2]; \
        int64_t __pips_den[2]; \
        vst1q_s64(__pips_num, (vec2)); \
        vst1q_s64(__pips_den, (vec3)); \
        __pips_num[0] = __pips_num[0] / __pips_den[0]; \
        __pips_num[1] = __pips_num[1] / __pips_den[1]; \
        (vec1) = vld1q_s64(__pips_num); \
    } while (0)

/* v1 <- v2 * v3. The product is computed on uint64_t so that it wraps
 * modulo 2^64 instead of triggering signed-overflow undefined behavior. */
#define SIMD_MULDI(v1,v2,v3) \
    do { \
        int64_t __pips_lhs[2]; \
        int64_t __pips_rhs[2]; \
        vst1q_s64(__pips_lhs, (v2)); \
        vst1q_s64(__pips_rhs, (v3)); \
        __pips_lhs[0] = (int64_t)((uint64_t)__pips_lhs[0] * (uint64_t)__pips_rhs[0]); \
        __pips_lhs[1] = (int64_t)((uint64_t)__pips_lhs[1] * (uint64_t)__pips_rhs[1]); \
        (v1) = vld1q_s64(__pips_lhs); \
    } while (0)

/* vec1 <- vec2 + (vec3 * vec4), same operand order as the vmlaq_* family. */
#define SIMD_MULADDDI(vec1, vec2, vec3, vec4) \
    do { \
        int64_t __pips_lhs[2]; \
        int64_t __pips_rhs[2]; \
        vst1q_s64(__pips_lhs, (vec3)); \
        vst1q_s64(__pips_rhs, (vec4)); \
        __pips_lhs[0] = (int64_t)((uint64_t)__pips_lhs[0] * (uint64_t)__pips_rhs[0]); \
        __pips_lhs[1] = (int64_t)((uint64_t)__pips_lhs[1] * (uint64_t)__pips_rhs[1]); \
        (vec1) = vaddq_s64((vec2), vld1q_s64(__pips_lhs)); \
    } while (0)
109
110
/* int32_t */
111
/* vec <- four consecutive int32_t values read from arr. */
#define SIMD_LOAD_V4SI(vec,arr) vec=vld1q_s32(arr)
/* Write the four lanes of vec to memory at arr. */
#define SIMD_STORE_V4SI(vec,arr) vst1q_s32(arr,vec)

/* Set all four lanes of vec to zero.
 * A constant broadcast replaces the former veorq_s32(vec,vec), which read vec
 * before it had been given a value. */
#define SIMD_ZERO_V4SI(vec) vec = vdupq_n_s32(0)

/* v1 <- v2 + v3, lane by lane. */
#define SIMD_ADDD(v1,v2,v3) v1=vaddq_s32(v2,v3)
/* v1 <- v2 - v3, lane by lane. */
#define SIMD_SUBD(v1,v2,v3) v1=vsubq_s32(v2,v3)
118
/* vec1 <- vec2 / vec3 on four int32_t lanes (C integer division, truncating
 * toward zero).
 *
 * arm_neon.h has no vrecpeq_s32, and a reciprocal estimate would not give an
 * integer quotient anyway, so the division is done lane by lane on scratch
 * arrays. The divisor argument is no longer modified.
 * As in scalar C, a zero divisor or INT32_MIN / -1 is undefined. */
#define SIMD_DIVD(vec1,vec2,vec3) \
    do { \
        int32_t __pips_num[4]; \
        int32_t __pips_den[4]; \
        int __pips_i; \
        vst1q_s32(__pips_num, (vec2)); \
        vst1q_s32(__pips_den, (vec3)); \
        for (__pips_i = 0; __pips_i < 4; __pips_i++) \
            __pips_num[__pips_i] = __pips_num[__pips_i] / __pips_den[__pips_i]; \
        (vec1) = vld1q_s32(__pips_num); \
    } while (0)
124
/* dst <- lhs * rhs on four int32 lanes. */
#define SIMD_MULD(dst, lhs, rhs) dst = vmulq_s32(lhs, rhs)
/* dst <- acc + (fac1 * fac2) on four int32 lanes (vmlaq_s32 argument order). */
#define SIMD_MULADDD(dst, acc, fac1, fac2) dst = vmlaq_s32(acc, fac1, fac2)
126
127
/* int16_t */
128
/* vec <- eight consecutive int16_t values read from arr. */
#define SIMD_LOAD_V8HI(vec,arr) vec=vld1q_s16(arr)
/* Write the eight lanes of vec to memory at arr. */
#define SIMD_STORE_V8HI(vec,arr) vst1q_s16(arr,vec)

/* Set all eight lanes of vec to zero.
 * A constant broadcast replaces the former veorq_s16(vec,vec), which read vec
 * before it had been given a value. */
#define SIMD_ZERO_V8HI(vec) vec = vdupq_n_s16(0)

/* v1 <- v2 + v3, lane by lane. */
#define SIMD_ADDHI(v1,v2,v3) v1=vaddq_s16(v2,v3)
/* v1 <- v2 - v3, lane by lane. */
#define SIMD_SUBHI(v1,v2,v3) v1=vsubq_s16(v2,v3)
135
/* vec1 <- vec2 / vec3 on eight int16_t lanes (C integer division, truncating
 * toward zero, result converted back to int16_t).
 *
 * arm_neon.h has no vrecpeq_s16, and a reciprocal estimate would not give an
 * integer quotient anyway, so the division is done lane by lane on scratch
 * arrays. The divisor argument is no longer modified.
 * As in scalar C, a zero divisor is undefined. */
#define SIMD_DIVHI(vec1,vec2,vec3) \
    do { \
        int16_t __pips_num[8]; \
        int16_t __pips_den[8]; \
        int __pips_i; \
        vst1q_s16(__pips_num, (vec2)); \
        vst1q_s16(__pips_den, (vec3)); \
        for (__pips_i = 0; __pips_i < 8; __pips_i++) \
            __pips_num[__pips_i] = (int16_t)(__pips_num[__pips_i] / __pips_den[__pips_i]); \
        (vec1) = vld1q_s16(__pips_num); \
    } while (0)
141
/* dst <- lhs * rhs on eight int16 lanes. */
#define SIMD_MULHI(dst, lhs, rhs) dst = vmulq_s16(lhs, rhs)

/* The two V8HI<->V8SI helpers simply forward to the plain int16 store/load.
 * NOTE(review): no widening or narrowing happens here; confirm this matches
 * what callers expect from the "V8SI" names. */
#define SIMD_STORE_V8HI_TO_V8SI(src, mem) \
    SIMD_STORE_V8HI(src, mem)
#define SIMD_LOAD_V8SI_TO_V8HI(dst, mem) \
    SIMD_LOAD_V8HI(dst, mem)

/* dst <- acc + (fac1 * fac2) on eight int16 lanes (vmlaq_s16 argument order). */
#define SIMD_MULADDHI(dst, acc, fac1, fac2) dst = vmlaq_s16(acc, fac1, fac2)
149
150
/* int8_t */
151
/* vec <- sixteen consecutive int8_t values read from arr. */
#define SIMD_LOAD_V16QI(vec,arr) vec=vld1q_s8(arr)
/* Write the sixteen lanes of vec to memory at arr. */
#define SIMD_STORE_V16QI(vec,arr) vst1q_s8(arr,vec)

/* Set all sixteen lanes of vec to zero.
 * A constant broadcast replaces the former veorq_s8(vec,vec), which read vec
 * before it had been given a value. */
#define SIMD_ZERO_V16QI(vec) vec = vdupq_n_s8(0)

/* v1 <- v2 + v3, lane by lane. */
#define SIMD_ADDQI(v1,v2,v3) v1=vaddq_s8(v2,v3)
/* v1 <- v2 - v3, lane by lane. */
#define SIMD_SUBQI(v1,v2,v3) v1=vsubq_s8(v2,v3)
158
/* vec1 <- vec2 / vec3 on sixteen int8_t lanes (C integer division, truncating
 * toward zero, result converted back to int8_t).
 *
 * arm_neon.h has no vrecpeq_s8, and a reciprocal estimate would not give an
 * integer quotient anyway, so the division is done lane by lane on scratch
 * arrays. The divisor argument is no longer modified.
 * As in scalar C, a zero divisor is undefined. */
#define SIMD_DIVQI(vec1,vec2,vec3) \
    do { \
        int8_t __pips_num[16]; \
        int8_t __pips_den[16]; \
        int __pips_i; \
        vst1q_s8(__pips_num, (vec2)); \
        vst1q_s8(__pips_den, (vec3)); \
        for (__pips_i = 0; __pips_i < 16; __pips_i++) \
            __pips_num[__pips_i] = (int8_t)(__pips_num[__pips_i] / __pips_den[__pips_i]); \
        (vec1) = vld1q_s8(__pips_num); \
    } while (0)
164
/* dst <- lhs * rhs on sixteen int8 lanes. */
#define SIMD_MULQI(dst, lhs, rhs) dst = vmulq_s8(lhs, rhs)

/* dst <- acc + (fac1 * fac2) on sixteen int8 lanes (vmlaq_s8 argument order). */
#define SIMD_MULADDQI(dst, acc, fac1, fac2) dst = vmlaq_s8(acc, fac1, fac2)
a4sf
float a4sf[4]
Definition:
SIMD_types.h:5
a4si
int a4si[4]
Definition:
SIMD_types.h:8
v4sf
float32x4_t v4sf
Definition:
neon.h:28
__attribute__
float32_t a4sf[8] __attribute__((aligned(32)))
Uses 128-bits NEON instructions.
Definition:
neon.h:22
v8hi
int16x8_t v8hi
Definition:
neon.h:31
v2di
int64x2_t v2di
Definition:
neon.h:29
v16qi
int8x16_t v16qi
Definition:
neon.h:32
v4si
int32x4_t v4si
Definition:
neon.h:30
int32_t
#define int32_t
Definition:
stdint.in.h:155
int8_t
#define int8_t
Definition:
stdint.in.h:141
int16_t
#define int16_t
Definition:
stdint.in.h:148
pips
src
Passes
pyps
drivers
sac
impl
neon.h
Generated on Thu Sep 26 2024 22:11:50 for PIPS by
1.9.1