/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

/* *INDENT-OFF* */
#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32)  _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32)  _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)

/* splat, load/store (aligned and unaligned), is_all_zero, is_equal,
   is_all_equal, is_zero_mask, interleave_lo, interleave_hi */
#define _(t, s, c, i)                                                         \
  static_always_inline t##s##x##c t##s##x##c##_splat (t##s x)                 \
  {                                                                           \
    return (t##s##x##c) _mm512_set1_##i (x);                                  \
  }                                                                           \
                                                                              \
  static_always_inline t##s##x##c t##s##x##c##_load_aligned (void *p)         \
  {                                                                           \
    return (t##s##x##c) _mm512_load_si512 (p);                                \
  }                                                                           \
                                                                              \
  static_always_inline void t##s##x##c##_store_aligned (t##s##x##c v,         \
                                                        void *p)              \
  {                                                                           \
    _mm512_store_si512 ((__m512i *) p, (__m512i) v);                          \
  }                                                                           \
                                                                              \
  static_always_inline t##s##x##c t##s##x##c##_load_unaligned (void *p)       \
  {                                                                           \
    return (t##s##x##c) _mm512_loadu_si512 (p);                               \
  }                                                                           \
                                                                              \
  static_always_inline void t##s##x##c##_store_unaligned (t##s##x##c v,       \
                                                          void *p)            \
  {                                                                           \
    _mm512_storeu_si512 ((__m512i *) p, (__m512i) v);                         \
  }                                                                           \
                                                                              \
  static_always_inline int t##s##x##c##_is_all_zero (t##s##x##c v)            \
  {                                                                           \
    return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0);          \
  }                                                                           \
                                                                              \
  static_always_inline int t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
  {                                                                           \
    return (_mm512_cmpneq_epi64_mask ((__m512i) a, (__m512i) b) == 0);        \
  }                                                                           \
                                                                              \
  static_always_inline int t##s##x##c##_is_all_equal (t##s##x##c v, t##s x)   \
  {                                                                           \
    return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x));                 \
  }                                                                           \
                                                                              \
  static_always_inline u##c t##s##x##c##_is_zero_mask (t##s##x##c v)          \
  {                                                                           \
    return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v);                 \
  }                                                                           \
                                                                              \
  static_always_inline t##s##x##c t##s##x##c##_interleave_lo (t##s##x##c a,   \
                                                              t##s##x##c b)   \
  {                                                                           \
    return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b);       \
  }                                                                           \
                                                                              \
  static_always_inline t##s##x##c t##s##x##c##_interleave_hi (t##s##x##c a,   \
                                                              t##s##x##c b)   \
  {                                                                           \
    return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b);       \
  }

foreach_avx512_vec512i foreach_avx512_vec512u
#undef _
/* *INDENT-ON* */
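
/* The foreach macros above stamp out one copy of each helper per element
   type, e.g. u32x16_splat, u32x16_load_unaligned, u32x16_is_all_zero.
   Illustrative sketch of a caller (hypothetical, assumes CLIB_HAVE_VEC512):

     u32 buf[16] = { 0 };
     u32x16 v = u32x16_load_unaligned (buf);
     if (!u32x16_is_all_zero (v))
       u32x16_store_unaligned (v + u32x16_splat (1), buf);
*/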

static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}

#define u64x8_i64gather(index, base, scale)                                   \
  (u64x8) _mm512_i64gather_epi64 ((__m512i) index, base, scale)
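
/* The gather above loads eight independent u64 values: element j of the
   result comes from address base + index[j] * scale. Illustrative sketch
   (hypothetical caller) reading every other u64 from an array:

     u64x8 idx = { 0, 2, 4, 6, 8, 10, 12, 14 };
     u64x8 v = u64x8_i64gather (idx, array, 8);
*/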

/* 512-bit packs */
#define _(f, t, fn)                                                           \
  always_inline t t##_pack (f lo, f hi)                                       \
  {                                                                           \
    return (t) fn ((__m512i) lo, (__m512i) hi);                               \
  }

_ (i16x32, i8x64, _mm512_packs_epi16)
_ (i16x32, u8x64, _mm512_packus_epi16)
_ (i32x16, i16x32, _mm512_packs_epi32)
_ (i32x16, u16x32, _mm512_packus_epi32)
#undef _
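
/* Note that the pack intrinsics above saturate and, like their SSE
   ancestors, operate independently within each 128-bit lane, so the
   narrowed elements of lo and hi are interleaved per lane rather than
   laid out in source order; callers that need in-order narrowing must
   permute afterwards. */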

static_always_inline u64x8
u64x8_byte_swap (u64x8 v)
{
  u8x64 swap = {
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
    7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
  };
  return (u64x8) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}

static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}
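
/* The byte swaps above are handy for bulk endianness conversion, e.g.
   converting 32 network-order u16 fields to host order in one shot
   (illustrative sketch, hypothetical caller):

     u16x32 host = u16x32_byte_swap (u16x32_load_unaligned (pkt));
*/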

#define _(f, t)                                                               \
  static_always_inline t f##_extract_lo (f v)                                 \
  {                                                                           \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 0);                    \
  }                                                                           \
  static_always_inline t f##_extract_hi (f v)                                 \
  {                                                                           \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 1);                    \
  }

_ (u64x8, u64x4)
_ (u32x16, u32x8)
_ (u16x32, u16x16)
_ (u8x64, u8x32)
#undef _

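/* Horizontal u32 minimum: fold the 512-bit vector to 256 bits by taking
   the element-wise min of its halves, then reuse the 256-bit reduction. */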
static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
                                      u32x16_extract_hi (v)));
}

static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}

static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}

static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
                                            (__m512i) b);
}

#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)

#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)

#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)

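/* _mm512_ternarylogic_epi32 evaluates an arbitrary three-input boolean
   function: at each bit position the bits (a, b, c) form a 3-bit index
   into the imm8 truth table. 0x96 is the table for a ^ b ^ c, so the
   xor3 helpers below compute a three-way XOR in a single instruction. */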
static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
                                            (__m512i) c, 0x96);
}

static_always_inline u64x8
u64x8_xor3 (u64x8 a, u64x8 b, u64x8 c)
{
  return (u64x8) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
                                            (__m512i) c, 0x96);
}

static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}

#define u8x64_align_right(a, b, imm) \
  (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm)

#define u64x8_align_right(a, b, imm)                                          \
  (u64x8) _mm512_alignr_epi64 ((__m512i) a, (__m512i) b, imm)

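/* Horizontal u32 sum: the two in-lane rotations below add elements at
   distance 2, then distance 1, leaving every element of each 128-bit
   lane equal to that lane's total; adding the high and low 256-bit
   halves and picking elements 0 and 4 combines the four lane totals. */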
static_always_inline u32
u32x16_sum_elts (u32x16 sum16)
{
  u32x8 sum8;
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 8);
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 4);
  sum8 = u32x16_extract_hi (sum16) + u32x16_extract_lo (sum16);
  return sum8[0] + sum8[4];
}

#define _(t, m, p, i, e)                                                      \
  static_always_inline t t##_mask_load (t a, void *p, m mask)                 \
  {                                                                           \
    return (t) p##_mask_loadu_##e ((i) a, mask, p);                           \
  }                                                                           \
  static_always_inline t t##_mask_load_zero (void *p, m mask)                 \
  {                                                                           \
    return (t) p##_maskz_loadu_##e (mask, p);                                 \
  }                                                                           \
  static_always_inline void t##_mask_store (t a, void *p, m mask)             \
  {                                                                           \
    p##_mask_storeu_##e (p, mask, (i) a);                                     \
  }

_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u8x16, u16, _mm, __m128i, epi8)
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _
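
/* Note that the macro parameter p above does double duty: token pasting
   turns it into the intrinsic prefix (_mm, _mm256, _mm512), while plain
   substitution also renames the void * argument, which is harmless since
   both uses expand consistently. Illustrative sketch (hypothetical
   caller) loading only the first n valid bytes of a buffer:

     u8x32 v = u8x32_mask_load_zero (data, pow2_mask (n));
*/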

#define _(t, m, p, i, e)                                                      \
  static_always_inline t t##_mask_and (t a, t b, m mask)                      \
  {                                                                           \
    return (t) p##_mask_and_##e ((i) a, mask, (i) a, (i) b);                  \
  }                                                                           \
  static_always_inline t t##_mask_andnot (t a, t b, m mask)                   \
  {                                                                           \
    return (t) p##_mask_andnot_##e ((i) a, mask, (i) a, (i) b);               \
  }                                                                           \
  static_always_inline t t##_mask_xor (t a, t b, m mask)                      \
  {                                                                           \
    return (t) p##_mask_xor_##e ((i) a, mask, (i) a, (i) b);                  \
  }                                                                           \
  static_always_inline t t##_mask_or (t a, t b, m mask)                       \
  {                                                                           \
    return (t) p##_mask_or_##e ((i) a, mask, (i) a, (i) b);                   \
  }
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _

#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_MASK_LOAD_STORE
#define CLIB_HAVE_VEC512_MASK_BITWISE_OPS
#endif
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_MASK_LOAD_STORE
#define CLIB_HAVE_VEC256_MASK_BITWISE_OPS
#endif
#ifdef CLIB_HAVE_VEC128
#define CLIB_HAVE_VEC128_MASK_LOAD_STORE
#define CLIB_HAVE_VEC128_MASK_BITWISE_OPS
#endif

static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u64x8
u64x8_splat_u64x2 (u64x2 a)
{
  return (u64x8) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}

static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}
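
/* The mask blends above select, per element, from b where the matching
   mask bit is set and from a elsewhere. Illustrative sketch (hypothetical
   values): u32x16_mask_blend (a, b, 0x00ff) yields b's elements 0-7
   followed by a's elements 8-15. */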

static_always_inline u8x64
u8x64_permute (u8x64 idx, u8x64 a)
{
  return (u8x64) _mm512_permutexvar_epi8 ((__m512i) idx, (__m512i) a);
}

static_always_inline u8x64
u8x64_permute2 (u8x64 idx, u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_permutex2var_epi8 ((__m512i) a, (__m512i) idx,
                                           (__m512i) b);
}

#define _(t, m, e, p, it)                                                     \
  static_always_inline m t##_is_equal_mask (t a, t b)                         \
  {                                                                           \
    return p##_cmpeq_##e##_mask ((it) a, (it) b);                             \
  }
_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _
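
/* The comparison masks above pair naturally with bit counting, e.g.
   counting the lanes of haystack that match key (illustrative sketch,
   assumes vppinfra's count_set_bits is available):

     int n_hits = count_set_bits (u32x8_is_equal_mask (haystack,
                                                       u32x8_splat (key)));
*/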

#define _(t, m, e, p, it)                                                     \
  static_always_inline m t##_is_not_equal_mask (t a, t b)                     \
  {                                                                           \
    return p##_cmpneq_##e##_mask ((it) a, (it) b);                            \
  }
_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _

#define _(f, t, fn, it)                                                       \
  static_always_inline t t##_from_##f (f x) { return (t) fn ((it) x); }
_ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
_ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
_ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
_ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
#undef _

#define _(vt, mt, p, it, epi)                                                 \
  static_always_inline vt vt##_compress (vt a, mt mask)                       \
  {                                                                           \
    return (vt) p##_maskz_compress_##epi (mask, (it) a);                      \
  }                                                                           \
  static_always_inline vt vt##_expand (vt a, mt mask)                         \
  {                                                                           \
    return (vt) p##_maskz_expand_##epi (mask, (it) a);                        \
  }                                                                           \
  static_always_inline void vt##_compress_store (vt v, mt mask, void *p)      \
  {                                                                           \
    p##_mask_compressstoreu_##epi (p, mask, (it) v);                          \
  }

_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u64x2, u8, _mm, __m128i, epi64)
_ (u32x4, u8, _mm, __m128i, epi32)
#ifdef __AVX512VBMI2__
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u8x16, u16, _mm, __m128i, epi8)
#endif
#undef _
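
/* compress packs the elements selected by mask into the low slots of the
   result, zeroing the rest; expand performs the inverse. Illustrative
   sketch (hypothetical values):

     u32x8 v = { 10, 11, 12, 13, 14, 15, 16, 17 };
     u32x8 c = u32x8_compress (v, 0x2a);
     c is now { 11, 13, 15, 0, 0, 0, 0, 0 }, since mask 0x2a selects
     elements 1, 3 and 5.
*/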

#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC256_COMPRESS_U8_U16
#endif
#endif
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_COMPRESS
#ifdef __AVX512VBMI2__
#define CLIB_HAVE_VEC512_COMPRESS_U8_U16
#endif
#endif

#ifndef __AVX512VBMI2__
static_always_inline u16x16
u16x16_compress (u16x16 v, u16 mask)
{
  return u16x16_from_u32x16 (u32x16_compress (u32x16_from_u16x16 (v), mask));
}

static_always_inline u16x8
u16x8_compress (u16x8 v, u8 mask)
{
  return u16x8_from_u32x8 (u32x8_compress (u32x8_from_u16x8 (v), mask));
}
#endif

static_always_inline u64
u64x8_hxor (u64x8 v)
{
  v ^= u64x8_align_right (v, v, 4);
  v ^= u64x8_align_right (v, v, 2);
  return v[0] ^ v[1];
}

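/* The transpose routines below work in stages: unpack instructions pair
   up adjacent rows within 128-bit lanes, then permutex2var passes with
   the four u64 index patterns move whole 128-bit blocks into their final
   rows. */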
static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}

static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  /* *INDENT-OFF* */
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
  /* *INDENT-ON* */

  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);

  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}

static_always_inline u8x64
u8x64_load_partial (u8 *data, uword n)
{
  return u8x64_mask_load_zero (data, pow2_mask (n));
}

static_always_inline void
u8x64_store_partial (u8x64 r, u8 *data, uword n)
{
  u8x64_mask_store (r, data, pow2_mask (n));
}
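
/* pow2_mask (n) yields a mask with the n least significant bits set, so
   the partial helpers above touch exactly the first n bytes. Illustrative
   sketch (hypothetical caller) copying a chunk of fewer than 64 bytes:

     u8x64_store_partial (u8x64_load_partial (src, n), dst, n);
*/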

#endif /* included_vector_avx512_h */
/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */