juce_sse_SIMDNativeOps.h
/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/

namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

#if JUCE_GCC && (__GNUC__ >= 6)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

#ifdef _MSC_VER
 #define DECLARE_SSE_SIMD_CONST(type, name) \
     static __declspec(align(16)) const type name [16 / sizeof (type)]

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
     __declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]

#else
 #define DECLARE_SSE_SIMD_CONST(type, name) \
     static const type name [16 / sizeof (type)] __attribute__((aligned(16)))

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
     const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))

#endif

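/*  DECLARE_SSE_SIMD_CONST declares a 16-byte-aligned static array that exactly
    fills one XMM register, and DEFINE_SSE_SIMD_CONST supplies the matching
    out-of-line definition. An illustrative pairing (a sketch only; the actual
    definitions live elsewhere in the library, e.g. in the corresponding .cpp):

        DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);    // inside a specialization
        DEFINE_SSE_SIMD_CONST (int32_t, float, kAllBitsSet) = { -1, -1, -1, -1 };
*/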
template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<float>
{
    //==============================================================================
    using vSIMDType = __m128;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128 a, __m128 b) noexcept { return (_mm_movemask_ps (equal (a, b)) == 0xf); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }
    static forcedinline float JUCE_VECTOR_CALLTYPE get (__m128 v, size_t i) noexcept { return SIMDFallbackOps<float, __m128>::get (v, i); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE set (__m128 v, size_t i, float s) noexcept { return SIMDFallbackOps<float, __m128>::set (v, i, s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE truncate (__m128 a) noexcept { return _mm_cvtepi32_ps (_mm_cvttps_epi32 (a)); }

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
    {
        __m128 rr_ir = mul (a, dupeven (b));
        __m128 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
    }
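
    /*  cmplxmul treats each register as interleaved complex numbers (re, im).
        kEvenHighBit has the sign bit set in its even lanes, so the xor negates
        the ai*bi products, and the final add yields
        (ar*br - ai*bi, ar*bi + ai*br) for every (re, im) pair: the standard
        complex-multiplication identity.
    */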

    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
    {
       #if defined(__SSE4_1__)
        __m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a);
       #else
        __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
        retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
       #endif
        return _mm_cvtss_f32 (retval);
    }
};
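
/*  Example (an illustrative sketch, not part of the library): loading, adding
    and summing with the float specialization. load/store map directly onto
    _mm_load_ps/_mm_store_ps, so the pointers must be 16-byte aligned.

        alignas (16) float data[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        auto v       = SIMDNativeOps<float>::load (data);
        auto doubled = SIMDNativeOps<float>::add (v, v);
        float total  = SIMDNativeOps<float>::sum (doubled);   // total == 20.0f
*/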

//==============================================================================
/** Double-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<double>
{
    //==============================================================================
    using vSIMDType = __m128d;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm_castsi128_pd (_mm_load_si128 (reinterpret_cast<const __m128i*> (a))); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128d a, __m128d b) noexcept { return (_mm_movemask_pd (equal (a, b)) == 0x3); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE oddevensum (__m128d a) noexcept { return a; }   // a __m128d already holds a single (even, odd) pair
    static forcedinline double JUCE_VECTOR_CALLTYPE get (__m128d v, size_t i) noexcept { return SIMDFallbackOps<double, __m128d>::get (v, i); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE set (__m128d v, size_t i, double s) noexcept { return SIMDFallbackOps<double, __m128d>::set (v, i, s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE truncate (__m128d a) noexcept { return _mm_cvtepi32_pd (_mm_cvttpd_epi32 (a)); }

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
    {
        __m128d rr_ir = mul (a, dupeven (b));
        __m128d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
    {
       #if defined(__SSE4_1__)
        __m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128d retval = _mm_hadd_pd (a, a);
       #else
        __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
       #endif
        return _mm_cvtsd_f64 (retval);
    }
};
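
/*  Example (an illustrative sketch): a __m128d holds exactly one complex pair,
    so cmplxmul computes a single complex product, e.g. (1 + 2i) * (3 + 4i):

        alignas (16) double x[2] = { 1.0, 2.0 };   // 1 + 2i
        alignas (16) double y[2] = { 3.0, 4.0 };   // 3 + 4i
        auto p = SIMDNativeOps<double>::cmplxmul (SIMDNativeOps<double>::load (x),
                                                  SIMDNativeOps<double>::load (y));
        alignas (16) double r[2];
        SIMDNativeOps<double>::store (p, r);       // r == { -5.0, 10.0 }
*/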

//==============================================================================
/** Signed 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int8_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int8_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
   #if defined(__SSE4_1__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif
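
    /*  The pre-SSE4.1 fallbacks above use a branchless select: with
        mask = greaterThan (b, a), the expression
        bit_or (bit_and (mask, a), bit_andnot (mask, b)) keeps a wherever
        a < b and b everywhere else. The same idiom recurs in the wider
        integer specializations below.
    */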
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int8_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps<int8_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        return static_cast<int8_t> ((_mm_cvtsi128_si32 (lo) & 0xff) + (_mm_cvtsi128_si32 (hi) & 0xff));
       #else
        return SIMDFallbackOps<int8_t, __m128i>::sum (a);
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        // multiply the even- and odd-indexed bytes in 16-bit lanes, then recombine the low bytes
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
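
/*  Why the 8-bit mul trick above works: SSE2 has no 8-bit multiply, but
    _mm_mullo_epi16 on the raw registers leaves the product of each pair of
    even-indexed bytes in the low byte of its 16-bit lane, and shifting both
    inputs right by 8 bits does the same for the odd-indexed bytes. Shifting
    the odd products back up and masking the even products down to their low
    bytes interleaves all sixteen byte products into one register.
*/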

//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint8_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        return static_cast<uint8_t> ((static_cast<uint32_t> (_mm_cvtsi128_si32 (lo)) & 0xffu)
                                   + (static_cast<uint32_t> (_mm_cvtsi128_si32 (hi)) & 0xffu));
       #else
        return SIMDFallbackOps<uint8_t, __m128i>::sum (a);
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        // multiply the even- and odd-indexed bytes in 16-bit lanes, then recombine the low bytes
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
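
/*  The unsigned comparisons above rely on ssign, which xors each lane with
    kHighBit: flipping the top bit maps unsigned order onto signed order, so
    the signed _mm_cmpgt_epi8 can stand in for the missing unsigned compare.
    For example, comparing 255 with 1: after the flip they become 127 and
    -127, and the signed compare correctly ranks 255 above 1.
*/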

//==============================================================================
/** Signed 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int16_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int16_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps<int16_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return static_cast<int16_t> (_mm_cvtsi128_si32 (tmp) & 0xffff);
       #else
        return SIMDFallbackOps<int16_t, __m128i>::sum (a);
       #endif
    }
};
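
/*  Example (an illustrative sketch): element-wise arithmetic across the eight
    int16_t lanes of an __m128i.

        auto ones = SIMDNativeOps<int16_t>::expand (1);        // eight lanes of 1
        auto twos = SIMDNativeOps<int16_t>::add (ones, ones);  // eight lanes of 2
        int16_t s = SIMDNativeOps<int16_t>::sum (twos);        // s == 16
*/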

//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint16_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
   #if defined(__SSE4_1__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return static_cast<uint16_t> (static_cast<uint32_t> (_mm_cvtsi128_si32 (tmp)) & 0xffffu);
       #else
        return SIMDFallbackOps<uint16_t, __m128i>::sum (a);
       #endif
    }
};

//==============================================================================
/** Signed 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int32_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int32_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps<int32_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        return _mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp));
       #else
        return SIMDFallbackOps<int32_t, __m128i>::sum (a);
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));
        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};
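
/*  The SSE2 mul fallback above emulates _mm_mullo_epi32: _mm_mul_epu32 only
    multiplies lanes 0 and 2 (producing two 64-bit results), so the odd lanes
    are shifted down by four bytes and multiplied separately. The low 32 bits
    of a product are identical for signed and unsigned operands, which is why
    the unsigned multiply is safe here. The two shuffles gather the low halves
    of the 64-bit products, and the unpack re-interleaves them into lane order
    0, 1, 2, 3.
*/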

//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint32_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        return static_cast<uint32_t> (_mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp)));
       #else
        return SIMDFallbackOps<uint32_t, __m128i>::sum (a);
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));
        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epu32 (a, b);   // unsigned min: the signed _mm_min_epi32 would mis-order values with the top bit set
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epu32 (a, b);   // unsigned max, matching the unsigned fallback below
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};

//==============================================================================
/** Signed 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm_set1_epi64x (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int64_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m128i>::set (v, i, s); }
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<int64_t, __m128i>::sum (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<int64_t, __m128i>::mul (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_2__)
        return _mm_cmpgt_epi64 (a, b);
       #else
        return SIMDFallbackOps<int64_t, __m128i>::greaterThan (a, b);
       #endif
    }
};
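
/*  The SSE2 equal fallback above assembles a 64-bit comparison from 32-bit
    ones: _mm_cmpeq_epi32 compares each dword, anding that mask with its
    pair-swapped copy leaves all-ones only where both halves of a 64-bit lane
    matched, and the final shuffle broadcasts each pair's result across the
    whole lane.
*/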

//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm_set1_epi64x ((int64_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint64_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::set (v, i, s); }
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::sum (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::mul (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_2__)
        return _mm_cmpgt_epi64 (ssign (a), ssign (b));
       #else
        return SIMDFallbackOps<uint64_t, __m128i>::greaterThan (a, b);
       #endif
    }
};

#endif

#if JUCE_GCC && (__GNUC__ >= 6)
 #pragma GCC diagnostic pop
#endif

} // namespace dsp
} // namespace juce