#include <algorithm>
#include <bit>
#include <cstddef>
#include <cstdint>

// Detect Clang so the GCC-only inline-asm fast paths below can be skipped.
#if defined(__clang__)
#define COMPILER_CLANG 1
#else
#define COMPILER_CLANG 0
#endif
// These feature-test macros are undefined when compiling off-target;
// default them to 0 so the constexpr flags below always compile.
#ifndef __ARM_ARCH
#define __ARM_ARCH 0
#endif

#ifndef __ARM_ARCH_7A__
#define __ARM_ARCH_7A__ 0
#endif

constexpr bool ARMv7a = (__ARM_ARCH == 7 && __ARM_ARCH_7A__);
constexpr bool kCompilerClang = COMPILER_CLANG;
// Returns the high 32 bits of the 64-bit product a * b, i.e. (a * b) >> 32.
[[gnu::always_inline]]
static constexpr int32_t multiply_32x32_rshift32(int32_t a, int32_t b) {
    if constexpr (ARMv7a && !kCompilerClang) {
        int32_t out;
        // SMMUL: signed most-significant-word multiply.
        asm("smmul %0, %1, %2" : "=r"(out) : "r"(a), "r"(b));
        return out;
    }
    else {
        return (int32_t)(((int64_t)a * b) >> 32);
    }
}
// As above, but rounds the result instead of truncating it.
[[gnu::always_inline]]
static constexpr int32_t multiply_32x32_rshift32_rounded(int32_t a, int32_t b) {
    if constexpr (ARMv7a && !kCompilerClang) {
        int32_t out;
        // SMMULR: signed most-significant-word multiply, rounded.
        asm("smmulr %0, %1, %2" : "=r"(out) : "r"(a), "r"(b));
        return out;
    }
    else {
        return (int32_t)(((int64_t)a * b + 0x80000000) >> 32);
    }
}
// Q31 multiply: both operands are fixed-point fractions in [-1, 1), so the
// product needs (a * b) >> 31, done as a >> 32 followed by a left shift.
[[gnu::always_inline]]
static constexpr int32_t q31_mult(int32_t a, int32_t b) {
    if constexpr (ARMv7a) {
        return multiply_32x32_rshift32(a, b) << 1;
    }
    else {
        return (int32_t)(((int64_t)a * b) >> 32) << 1;
    }
}
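
// Usage sketch (values traced through the portable fallback path):
// 0.5 in Q31 is 0x40000000, and 0.5 * 0.5 should give 0.25 (0x20000000):
//   q31_mult(0x40000000, 0x40000000)
//     == (int32_t)((0x40000000LL * 0x40000000) >> 32) << 1
//     == 0x10000000 << 1 == 0x20000000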
// Returns sum + round((a * b) >> 32).
[[gnu::always_inline]]
static constexpr int32_t multiply_accumulate_32x32_rshift32_rounded(int32_t sum, int32_t a,
                                                                    int32_t b) {
    if constexpr (ARMv7a && !kCompilerClang) {
        int32_t out;
        // SMMLAR: signed most-significant-word multiply-accumulate, rounded.
        asm("smmlar %0, %1, %2, %3" : "=r"(out) : "r"(a), "r"(b), "r"(sum));
        return out;
    }
    else {
        return (int32_t)(((((int64_t)sum) << 32) + ((int64_t)a * b) + 0x80000000) >> 32);
    }
}
// Returns sum + ((a * b) >> 32), truncated.
[[gnu::always_inline]]
static constexpr int32_t multiply_accumulate_32x32_rshift32(int32_t sum, int32_t a, int32_t b) {
    if constexpr (ARMv7a && !kCompilerClang) {
        int32_t out;
        // SMMLA: signed most-significant-word multiply-accumulate.
        asm("smmla %0, %1, %2, %3" : "=r"(out) : "r"(a), "r"(b), "r"(sum));
        return out;
    }
    else {
        return (int32_t)(((((int64_t)sum) << 32) + ((int64_t)a * b)) >> 32);
    }
}
// Returns sum - round((a * b) >> 32).
[[gnu::always_inline]]
static constexpr int32_t multiply_subtract_32x32_rshift32_rounded(int32_t sum, int32_t a,
                                                                  int32_t b) {
    if constexpr (ARMv7a && !kCompilerClang) {
        int32_t out;
        // SMMLSR: signed most-significant-word multiply-subtract, rounded.
        asm("smmlsr %0, %1, %2, %3" : "=r"(out) : "r"(a), "r"(b), "r"(sum));
        return out;
    }
    else {
        return (int32_t)((((((int64_t)sum) << 32) - ((int64_t)a * b)) + 0x80000000) >> 32);
    }
}
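
// Usage sketch: the accumulate variant chains naturally for small dot
// products, e.g. a two-tap filter (c0/c1 and x0/x1 are hypothetical names):
//   int32_t acc = multiply_32x32_rshift32(c0, x0);
//   acc = multiply_accumulate_32x32_rshift32(acc, c1, x1);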
// Saturate val to a signed `bits`-wide range. Bounds assumed from the SSAT
// encoding, which accepts saturation widths 1-32.
template <size_t bits>
requires(bits >= 1 && bits <= 32)
[[gnu::always_inline]]
static constexpr int32_t signed_saturate(int32_t val) {
    if constexpr (ARMv7a) {
        int32_t out;
        asm("ssat %0, %1, %2" : "=r"(out) : "I"(bits), "r"(val));
        return out;
    }
    else {
        return std::clamp<int32_t>(val, -(1LL << (bits - 1)), (1LL << (bits - 1)) - 1);
    }
}
// Saturate val to an unsigned `bits`-wide range. Bounds assumed from the
// USAT encoding, which accepts saturation widths 0-31; they also keep the
// fallback's shift in range.
template <size_t bits>
requires(bits >= 1 && bits <= 31)
[[gnu::always_inline]]
static constexpr uint32_t unsigned_saturate(uint32_t val) {
    if constexpr (ARMv7a) {
        uint32_t out;
        asm("usat %0, %1, %2" : "=r"(out) : "I"(bits), "r"(val));
        return out;
    }
    else {
        return std::clamp<uint32_t>(val, 0u, (1uL << bits) - 1);
    }
}
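
// Usage sketch (portable fallback path): clamping into narrow ranges:
//   signed_saturate<16>(70000)  == 32767
//   signed_saturate<16>(-70000) == -32768
//   unsigned_saturate<8>(300u)  == 255u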
// Shift val left by `shift`, saturating the result to a signed `bits`-wide
// range. SSAT applies the shift and the saturation in a single instruction.
template <size_t shift, size_t bits = 32>
requires(shift < 32 && bits <= 32)
[[gnu::always_inline]]
static constexpr int32_t shift_left_saturate(int32_t val) {
    if constexpr (ARMv7a) {
        int32_t out;
        asm("ssat %0, %1, %2, LSL %3" : "=r"(out) : "I"(bits), "r"(val), "I"(shift));
        return out;
    }
    else {
        return (int32_t)std::clamp<int64_t>((int64_t)val << shift, -(1LL << (bits - 1)),
                                            (1LL << (bits - 1)) - 1);
    }
}
// Unsigned variant. Note that USAT only encodes saturation widths 0-31, so
// the asm path needs bits <= 31 even though the fallback accepts 32.
template <size_t shift, size_t bits = 32>
requires(shift < 32 && bits <= 32)
[[gnu::always_inline]]
static constexpr uint32_t shift_left_saturate(uint32_t val) {
    if constexpr (ARMv7a) {
        uint32_t out;
        asm("usat %0, %1, %2, LSL %3" : "=r"(out) : "I"(bits), "r"(val), "I"(shift));
        return out;
    }
    else {
        return (uint32_t)std::clamp<uint64_t>((uint64_t)val << shift, 0u, (1uLL << bits) - 1);
    }
}
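
// Usage sketch (portable fallback path, default bits = 32):
//   shift_left_saturate<1>(0x60000000) == 0x7FFFFFFF  // shift overflows, saturates
//   shift_left_saturate<1>(0x20000000) == 0x40000000  // in range, plain shift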
// Saturating 32-bit addition.
[[gnu::always_inline]]
static constexpr int32_t add_saturate(int32_t a, int32_t b) {
    if constexpr (ARMv7a) {
        int32_t out;
        // QADD: saturating signed add.
        asm("qadd %0, %1, %2" : "=r"(out) : "r"(a), "r"(b));
        return out;
    }
    else {
        // Widen before adding so the un-saturated sum cannot overflow.
        return (int32_t)std::clamp<int64_t>((int64_t)a + b, -(1LL << 31), (1LL << 31) - 1);
    }
}
// Saturating 32-bit subtraction.
[[gnu::always_inline]]
static constexpr int32_t subtract_saturate(int32_t a, int32_t b) {
    if constexpr (ARMv7a) {
        int32_t out;
        // QSUB: saturating signed subtract.
        asm("qsub %0, %1, %2" : "=r"(out) : "r"(a), "r"(b));
        return out;
    }
    else {
        // Widen before subtracting so the un-saturated difference cannot overflow.
        return (int32_t)std::clamp<int64_t>((int64_t)a - b, -(1LL << 31), (1LL << 31) - 1);
    }
}
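
// Usage sketch: near the rails the result pins instead of wrapping:
//   add_saturate(INT32_MAX, 1)      == INT32_MAX
//   subtract_saturate(INT32_MIN, 1) == INT32_MIN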
// Convert a float in [-1.0, 1.0) to Q31. VCVT with 31 fractional bits does the
// scaling, truncation, and saturation in one instruction; the converted bits
// land in the same register, hence the bit_cast. "+t" ties the input and
// output to one VFP register, matching the in-place "%0, %0" operands.
static inline int32_t q31_from_float(float value) {
    if constexpr (ARMv7a) {
        asm("vcvt.s32.f32 %0, %0, #31" : "+t"(value));
        return std::bit_cast<int32_t>(value);
    }
    else {
        // Portable fallback (an addition; the original body was asm-only):
        // scale by 2^31, then truncate and saturate as VCVT would.
        float scaled = value * 2147483648.0f;
        if (scaled >= 2147483648.0f) return INT32_MAX;
        if (scaled < -2147483648.0f) return INT32_MIN;
        return (int32_t)scaled;
    }
}
// Convert Q31 back to a float in [-1.0, 1.0).
static inline float int32_to_float(int32_t value) {
    if constexpr (ARMv7a) {
        float out = std::bit_cast<float>(value);
        asm("vcvt.f32.s32 %0, %0, #31" : "+t"(out));
        return out;
    }
    else {
        // Portable fallback (an addition; the original body was asm-only).
        return (float)value * (1.0f / 2147483648.0f);
    }
}
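
// Usage sketch: round-tripping 0.5 through Q31:
//   q31_from_float(0.5f)       == 0x40000000
//   int32_to_float(0x40000000) == 0.5f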