/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)
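
/* Each _(t, s, c, i) entry describes one vector type: t is the scalar type
   prefix, s the scalar width in bits, c the lane count and i the intrinsic
   suffix; e.g. _(u,32,16,epi32) instantiates the u32x16 functions on top of
   the _mm512_*_epi32 intrinsics. */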

/* splat, load_aligned, store_aligned, load_unaligned, store_unaligned,
   is_all_zero, is_equal, is_all_equal, is_zero_mask, interleave_lo,
   interleave_hi */
#define _(t, s, c, i) \
static_always_inline t##s##x##c t##s##x##c##_splat (t##s x) \
{ \
  return (t##s##x##c) _mm512_set1_##i (x); \
} \
\
static_always_inline t##s##x##c t##s##x##c##_load_aligned (void *p) \
{ \
  return (t##s##x##c) _mm512_load_si512 (p); \
} \
\
static_always_inline void t##s##x##c##_store_aligned (t##s##x##c v, void *p) \
{ \
  _mm512_store_si512 ((__m512i *) p, (__m512i) v); \
} \
\
static_always_inline t##s##x##c t##s##x##c##_load_unaligned (void *p) \
{ \
  return (t##s##x##c) _mm512_loadu_si512 (p); \
} \
\
static_always_inline void t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ \
  _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); \
} \
\
static_always_inline int t##s##x##c##_is_all_zero (t##s##x##c v) \
{ \
  return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); \
} \
\
static_always_inline int t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ \
  return (_mm512_cmpneq_epi64_mask ((__m512i) a, (__m512i) b) == 0); \
} \
\
static_always_inline int t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ \
  return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); \
} \
\
static_always_inline u##c t##s##x##c##_is_zero_mask (t##s##x##c v) \
{ \
  return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); \
} \
\
static_always_inline t##s##x##c t##s##x##c##_interleave_lo (t##s##x##c a, \
                                                            t##s##x##c b) \
{ \
  return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); \
} \
\
static_always_inline t##s##x##c t##s##x##c##_interleave_hi (t##s##x##c a, \
                                                            t##s##x##c b) \
{ \
  return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); \
}

foreach_avx512_vec512i foreach_avx512_vec512u
#undef _
/* *INDENT-ON* */

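/* return a 32-bit mask with bit i set when the most significant bit of
   element i is set */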
static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}

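/* gather eight u64 values from base + index[i] * scale */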
#define u64x8_i64gather(index, base, scale) \
  (u64x8) _mm512_i64gather_epi64 ((__m512i) index, base, scale)

/* 512-bit packs: narrow two input vectors into one with signed (packs) or
   unsigned (packus) saturation; like the underlying intrinsics, lo and hi
   are combined within each 128-bit lane */
#define _(f, t, fn) \
always_inline t t##_pack (f lo, f hi) \
{ \
  return (t) fn ((__m512i) lo, (__m512i) hi); \
}

_ (i16x32, i8x64, _mm512_packs_epi16)
_ (i16x32, u8x64, _mm512_packus_epi16)
_ (i32x16, i16x32, _mm512_packs_epi32)
_ (i32x16, u16x32, _mm512_packus_epi32)
#undef _

static_always_inline u64x8
u64x8_byte_swap (u64x8 v)
{
  u8x64 swap = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
  };
  return (u64x8) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

#define _(f, t) \
static_always_inline t f##_extract_lo (f v) \
{ \
  return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 0); \
} \
static_always_inline t f##_extract_hi (f v) \
{ \
  return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 1); \
}

_ (u64x8, u64x4)
_ (u32x16, u32x8)
_ (u16x32, u16x16)
_ (u8x64, u8x32)
#undef _

static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
				      u32x16_extract_hi (v)));
}

static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}

static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}

static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
					    (__m512i) b);
}

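/* the last argument is an 8-bit truth table applied bitwise to (a, b, c) */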
#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)

#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)

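/* shift bytes left/right by n within each 128-bit lane */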
#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)

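/* three-way xor: truth table 0x96 evaluates to a ^ b ^ c */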
static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}

static_always_inline u64x8
u64x8_xor3 (u64x8 a, u64x8 b, u64x8 c)
{
  return (u64x8) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}

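/* reverse the byte order within each 128-bit lane */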
static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}

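/* u8x64_align_right concatenates and shifts within each 128-bit lane;
   u64x8_align_right shifts across the whole 512-bit vector */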
#define u8x64_align_right(a, b, imm) \
  (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm)

#define u64x8_align_right(a, b, imm) \
  (u64x8) _mm512_alignr_epi64 ((__m512i) a, (__m512i) b, imm)

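/* horizontal sum of all 16 u32 elements */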
static_always_inline u32
u32x16_sum_elts (u32x16 sum16)
{
  u32x8 sum8;
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 8);
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 4);
  sum8 = u32x16_extract_hi (sum16) + u32x16_extract_lo (sum16);
  return sum8[0] + sum8[4];
}

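/* masked loads and stores: only lanes whose mask bit is set are read or
   written; _mask_load keeps the inactive lanes of a, _mask_load_zero zeroes
   them */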
#define _(t, m, p, i, e) \
static_always_inline t t##_mask_load (t a, void *p, m mask) \
{ \
  return (t) p##_mask_loadu_##e ((i) a, mask, p); \
} \
static_always_inline t t##_mask_load_zero (void *p, m mask) \
{ \
  return (t) p##_maskz_loadu_##e (mask, p); \
} \
static_always_inline void t##_mask_store (t a, void *p, m mask) \
{ \
  p##_mask_storeu_##e (p, mask, (i) a); \
}

_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u8x16, u16, _mm, __m128i, epi8)
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _

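/* masked bitwise operations: inactive lanes keep the value of a */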
#define _(t, m, p, i, e) \
static_always_inline t t##_mask_and (t a, t b, m mask) \
{ \
  return (t) p##_mask_and_##e ((i) a, mask, (i) a, (i) b); \
} \
static_always_inline t t##_mask_andnot (t a, t b, m mask) \
{ \
  return (t) p##_mask_andnot_##e ((i) a, mask, (i) a, (i) b); \
} \
static_always_inline t t##_mask_xor (t a, t b, m mask) \
{ \
  return (t) p##_mask_xor_##e ((i) a, mask, (i) a, (i) b); \
} \
static_always_inline t t##_mask_or (t a, t b, m mask) \
{ \
  return (t) p##_mask_or_##e ((i) a, mask, (i) a, (i) b); \
}
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _

#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_MASK_LOAD_STORE
#define CLIB_HAVE_VEC512_MASK_BITWISE_OPS
#endif
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_MASK_LOAD_STORE
#define CLIB_HAVE_VEC256_MASK_BITWISE_OPS
#endif
#ifdef CLIB_HAVE_VEC128
#define CLIB_HAVE_VEC128_MASK_LOAD_STORE
#define CLIB_HAVE_VEC128_MASK_BITWISE_OPS
#endif

static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u64x8
u64x8_splat_u64x2 (u64x2 a)
{
  return (u64x8) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_permute (u8x64 idx, u8x64 a)
{
  return (u8x64) _mm512_permutexvar_epi8 ((__m512i) idx, (__m512i) a);
}

static_always_inline u8x64
u8x64_permute2 (u8x64 idx, u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_permutex2var_epi8 ((__m512i) a, (__m512i) idx,
					   (__m512i) b);
}

#define _(t, m, e, p, it) \
static_always_inline m t##_is_equal_mask (t a, t b) \
{ \
  return p##_cmpeq_##e##_mask ((it) a, (it) b); \
}
_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _

#define _(t, m, e, p, it) \
static_always_inline m t##_is_not_equal_mask (t a, t b) \
{ \
  return p##_cmpneq_##e##_mask ((it) a, (it) b); \
}
_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _

#define _(f, t, fn, it) \
static_always_inline t t##_from_##f (f x) { return (t) fn ((it) x); }
_ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
_ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
_ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
_ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
#undef _

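/* compress packs the lanes selected by mask into the low lanes and zeroes
   the rest; expand is the inverse; compress_store writes the selected lanes
   contiguously to memory */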
#define _(vt, mt, p, it, epi) \
static_always_inline vt vt##_compress (vt a, mt mask) \
{ \
  return (vt) p##_maskz_compress_##epi (mask, (it) a); \
} \
static_always_inline vt vt##_expand (vt a, mt mask) \
{ \
  return (vt) p##_maskz_expand_##epi (mask, (it) a); \
} \
static_always_inline void vt##_compress_store (vt v, mt mask, void *p) \
{ \
  p##_mask_compressstoreu_##epi (p, mask, (it) v); \
}

_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u64x2, u8, _mm, __m128i, epi64)
_ (u32x4, u8, _mm, __m128i, epi32)
#ifdef __AVX512VBMI2__
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u8x16, u16, _mm, __m128i, epi8)
#endif
#undef _

#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC256_COMPRESS_U8_U16
#endif

#endif
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC512_COMPRESS_U8_U16
#endif

#endif

#ifndef __AVX512VBMI2__
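/* without AVX512VBMI2 there is no native 16-bit compress, so widen to
   32-bit lanes, compress, then narrow back */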
static_always_inline u16x16
u16x16_compress (u16x16 v, u16 mask)
{
  return u16x16_from_u32x16 (u32x16_compress (u32x16_from_u16x16 (v), mask));
}

static_always_inline u16x8
u16x8_compress (u16x8 v, u8 mask)
{
  return u16x8_from_u32x8 (u32x8_compress (u32x8_from_u16x8 (v), mask));
}
#endif

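/* horizontal xor of all 8 u64 lanes */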
static_always_inline u64
u64x8_hxor (u64x8 v)
{
  v ^= u64x8_align_right (v, v, 4);
  v ^= u64x8_align_right (v, v, 2);
  return v[0] ^ v[1];
}

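/* in-place transpose of a 16x16 matrix of u32, stored as 16 row vectors */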
static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}

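/* in-place transpose of an 8x8 matrix of u64 */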
static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}

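/* load / store only the first n bytes of a 64-byte vector using a byte mask */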
static_always_inline u8x64
u8x64_load_partial (u8 *data, uword n)
{
  return u8x64_mask_load_zero (data, pow2_mask (n));
}

static_always_inline void
u8x64_store_partial (u8x64 r, u8 *data, uword n)
{
  u8x64_mask_store (r, data, pow2_mask (n));
}

#endif /* included_vector_avx512_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */