PIPS
|
#include <arm_neon.h>
Go to the source code of this file.
Macros | |
#define | SIMD_LOAD_V4SF(vec, arr) vec=vld1q_f32(arr) |
float More... | |
#define | SIMD_LOADA_V4SF(vec, arr) vec=vld1q_f32(arr) |
#define | SIMD_MULPS(vec1, vec2, vec3) vec1=vmulq_f32(vec2,vec3) |
#define | SIMD_DIVPS(vec1, vec2, vec3) |
#define | SIMD_ADDPS(vec1, vec2, vec3) vec1=vaddq_f32(vec2,vec3) |
#define | SIMD_SUBPS(vec1, vec2, vec3) vec1=vsubq_f32(vec2, vec3) |
#define | SIMD_MULADDPS(vec1, vec2, vec3, vec4) vec1=vmlaq_f32(vec2,vec3,vec4) |
#define | SIMD_UMINPS(vec1, vec2) vec1=vnegq_f32(vec2) |
#define | SIMD_STORE_V4SF(vec, arr) vst1q_f32(arr,vec) |
#define | SIMD_STOREA_V4SF(vec, arr) vst1q_f32(arr,vec) |
#define | SIMD_STORE_GENERIC_V4SF(vec, v0, v1, v2, v3) |
#define | SIMD_ZERO_V4SF(vec) vec = vsubq_f32(vec,vec) |
#define | SIMD_LOAD_GENERIC_V4SF(vec, v0, v1, v2, v3) |
#define | SIMD_STORE_MASKED_V4SF(vec, arr) |
handle padded value, this is a very bad implementation ... More... | |
#define | SIMD_LOAD_V4SI_TO_V4SF(v, f) |
#define | SIMD_LOAD_V2DI(vec, arr) vec=vld1q_s64(arr) |
int64_t More... | |
#define | SIMD_STORE_V2DI(vec, arr) vst1q_s64(arr,vec) |
#define | SIMD_ZERO_V2DI(vec) vec = veorq_s64(vec,vec) |
#define | SIMD_ADDDI(v1, v2, v3) v1=vaddq_s64(v2,v3) |
#define | SIMD_SUBDI(v1, v2, v3) v1=vsubq_s64(v2,v3) |
#define | SIMD_DIVDI(vec1, vec2, vec3) |
#define | SIMD_MULDI(v1, v2, v3) v1=vmulq_s64(v2,v3) |
#define | SIMD_MULADDDI(vec1, vec2, vec3, vec4) vec1=vmlaq_s64(vec2,vec3,vec4) |
#define | SIMD_LOAD_V4SI(vec, arr) vec=vld1q_s32(arr) |
int32_t More... | |
#define | SIMD_STORE_V4SI(vec, arr) vst1q_s32(arr,vec) |
#define | SIMD_ZERO_V4SI(vec) vec = veorq_s32(vec,vec) |
#define | SIMD_ADDD(v1, v2, v3) v1=vaddq_s32(v2,v3) |
#define | SIMD_SUBD(v1, v2, v3) v1=vsubq_s32(v2,v3) |
#define | SIMD_DIVD(vec1, vec2, vec3) |
#define | SIMD_MULD(v1, v2, v3) v1=vmulq_s32(v2,v3) |
#define | SIMD_MULADDD(vec1, vec2, vec3, vec4) vec1=vmlaq_s32(vec2,vec3,vec4) |
#define | SIMD_LOAD_V8HI(vec, arr) vec=vld1q_s16(arr) |
int16_t More... | |
#define | SIMD_STORE_V8HI(vec, arr) vst1q_s16(arr,vec) |
#define | SIMD_ZERO_V8HI(vec) vec = veorq_s16(vec,vec) |
#define | SIMD_ADDHI(v1, v2, v3) v1=vaddq_s16(v2,v3) |
#define | SIMD_SUBHI(v1, v2, v3) v1=vsubq_s16(v2,v3) |
#define | SIMD_DIVHI(vec1, vec2, vec3) |
#define | SIMD_MULHI(v1, v2, v3) v1=vmulq_s16(v2,v3) |
#define | SIMD_STORE_V8HI_TO_V8SI(vec, arr) SIMD_STORE_V8HI(vec,arr) |
#define | SIMD_LOAD_V8SI_TO_V8HI(vec, arr) SIMD_LOAD_V8HI(vec,arr) |
#define | SIMD_MULADDHI(vec1, vec2, vec3, vec4) vec1=vmlaq_s16(vec2,vec3,vec4) |
#define | SIMD_LOAD_V16QI(vec, arr) vec=vld1q_s8(arr) |
int8_t More... | |
#define | SIMD_STORE_V16QI(vec, arr) vst1q_s8(arr,vec) |
#define | SIMD_ZERO_V16QI(vec) vec = veorq_s8(vec,vec) |
#define | SIMD_ADDQI(v1, v2, v3) v1=vaddq_s8(v2,v3) |
#define | SIMD_SUBQI(v1, v2, v3) v1=vsubq_s8(v2,v3) |
#define | SIMD_DIVQI(vec1, vec2, vec3) |
#define | SIMD_MULQI(v1, v2, v3) v1=vmulq_s8(v2,v3) |
#define | SIMD_MULADDQI(vec1, vec2, vec3, vec4) vec1=vmlaq_s8(vec2,vec3,vec4) |
Typedefs | |
typedef float32_t a4sf[8] | __attribute__((aligned(32))) |
Uses 128-bits NEON instructions. More... | |
typedef float32x4_t | v4sf |
typedef int64x2_t | v2di |
typedef int32x4_t | v4si |
typedef int16x8_t | v8hi |
typedef int8x16_t | v16qi |
#define SIMD_ADDPS | ( | vec1, | |
vec2, | |||
vec3 | |||
) | vec1=vaddq_f32(vec2,vec3) |
#define SIMD_DIVD | ( | vec1, | |
vec2, | |||
vec3 | |||
) |
#define SIMD_DIVDI | ( | vec1, | |
vec2, | |||
vec3 | |||
) |
#define SIMD_DIVHI | ( | vec1, | |
vec2, | |||
vec3 | |||
) |
#define SIMD_DIVPS | ( | vec1, | |
vec2, | |||
vec3 | |||
) |
#define SIMD_DIVQI | ( | vec1, | |
vec2, | |||
vec3 | |||
) |
#define SIMD_LOAD_GENERIC_V4SF | ( | vec, | |
v0, | |||
v1, | |||
v2, | |||
v3 | |||
) |
#define SIMD_LOAD_V16QI | ( | vec, | |
arr | |||
) | vec=vld1q_s8(arr) |
#define SIMD_LOAD_V2DI | ( | vec, | |
arr | |||
) | vec=vld1q_s64(arr) |
#define SIMD_LOAD_V4SI | ( | vec, | |
arr | |||
) | vec=vld1q_s32(arr) |
#define SIMD_LOAD_V4SI_TO_V4SF | ( | v, | |
f | |||
) |
#define SIMD_LOAD_V8HI | ( | vec, | |
arr | |||
) | vec=vld1q_s16(arr) |
#define SIMD_LOAD_V8SI_TO_V8HI | ( | vec, | |
arr | |||
) | SIMD_LOAD_V8HI(vec,arr) |
#define SIMD_MULADDD | ( | vec1, | |
vec2, | |||
vec3, | |||
vec4 | |||
) | vec1=vmlaq_s32(vec2,vec3,vec4) |
#define SIMD_MULADDDI | ( | vec1, | |
vec2, | |||
vec3, | |||
vec4 | |||
) | vec1=vmlaq_s64(vec2,vec3,vec4) |
#define SIMD_MULADDHI | ( | vec1, | |
vec2, | |||
vec3, | |||
vec4 | |||
) | vec1=vmlaq_s16(vec2,vec3,vec4) |
#define SIMD_MULADDPS | ( | vec1, | |
vec2, | |||
vec3, | |||
vec4 | |||
) | vec1=vmlaq_f32(vec2,vec3,vec4) |
#define SIMD_MULADDQI | ( | vec1, | |
vec2, | |||
vec3, | |||
vec4 | |||
) | vec1=vmlaq_s8(vec2,vec3,vec4) |
#define SIMD_MULPS | ( | vec1, | |
vec2, | |||
vec3 | |||
) | vec1=vmulq_f32(vec2,vec3) |
#define SIMD_STORE_GENERIC_V4SF | ( | vec, | |
v0, | |||
v1, | |||
v2, | |||
v3 | |||
) |
#define SIMD_STORE_MASKED_V4SF | ( | vec, | |
arr | |||
) |
handle padded values; this is a very bad implementation ...
#define SIMD_STORE_V8HI_TO_V8SI | ( | vec, | |
arr | |||
) | SIMD_STORE_V8HI(vec,arr) |
#define SIMD_SUBPS | ( | vec1, | |
vec2, | |||
vec3 | |||
) | vec1=vsubq_f32(vec2, vec3) |
typedef int8_t a16qi [32] __attribute__((aligned(32))) |
Uses 128-bit NEON instructions.
Notes: NEON can also operate on 64-bit vectors. NEON does not operate on double-precision floats. However, VFP can work on double-precision 64-bit vectors, but VFP is not a SIMD unit: it processes vectors scalar by scalar. Reminder — NEON data types: signed/unsigned 8-bit, 16-bit, 32-bit, 64-bit, and single-precision floating point. TODO: alignment: [1] says that each instruction has an alignment offset argument, but I can't find it in the intrinsics... Another option is to use an instruction that sets the alignment offset before each call, but that sounds like wasted cycles... TODO: a feature of NEON is to load/store up to 4 vectors with just one instruction (see the vst{1,2,3,4}/vld{1,2,3,4} variants and [2] for examples). This needs (I think) a modification in SAC. [2] is a nice summary of the intrinsics used here.
[1] : http://infocenter.arm.com/help/topic/com.arm.doc.dui0489b/CIHGIAEH.html [2] : http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html