LCOV - code coverage report
Current view: top level - vppinfra/crypto - ghash.h (source / functions) Hit Total Coverage
Test: coverage-filtered.info Lines: 61 107 57.0 %
Date: 2023-10-26 01:39:38 Functions: 11 20 55.0 %

          Line data    Source code
       1             : /*
       2             :  *------------------------------------------------------------------
       3             :  * Copyright (c) 2019 Cisco and/or its affiliates.
       4             :  * Licensed under the Apache License, Version 2.0 (the "License");
       5             :  * you may not use this file except in compliance with the License.
       6             :  * You may obtain a copy of the License at:
       7             :  *
       8             :  *     http://www.apache.org/licenses/LICENSE-2.0
       9             :  *
      10             :  * Unless required by applicable law or agreed to in writing, software
      11             :  * distributed under the License is distributed on an "AS IS" BASIS,
      12             :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      13             :  * See the License for the specific language governing permissions and
      14             :  * limitations under the License.
      15             :  *------------------------------------------------------------------
      16             :  */
      17             : 
      18             : /*
      19             :  *------------------------------------------------------------------
      20             :  *  Copyright(c) 2018, Intel Corporation All rights reserved.
      21             :  *
      22             :  *  Redistribution and use in source and binary forms, with or without
      23             :  *  modification, are permitted provided that the following conditions
      24             :  *  are met:
      25             :  *    * Redistributions of source code must retain the above copyright
      26             :  *      notice, this list of conditions and the following disclaimer.
      27             :  *    * Redistributions in binary form must reproduce the above copyright
      28             :  *      notice, this list of conditions and the following disclaimer in
      29             :  *      the documentation and/or other materials provided with the
      30             :  *      distribution.
      31             :  *    * Neither the name of Intel Corporation nor the names of its
      32             :  *      contributors may be used to endorse or promote products derived
      33             :  *      from this software without specific prior written permission.
      34             :  *
      35             :  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
      36             :  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
      37             :  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
      38             :  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
      39             :  *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
      40             :  *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
      43             :  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
      44             :  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
      45             :  *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      46             :  *------------------------------------------------------------------
      47             :  */
      48             : 
      49             : /*
      50             :  * Based on work by: Shay Gueron, Michael E. Kounavis, Erdinc Ozturk,
      51             :  *                   Vinodh Gopal, James Guilford, Tomasz Kantecki
      52             :  *
      53             :  * References:
      54             :  * [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
      55             :  *     Intel Architecture Processors. August, 2010
      56             :  * [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
      57             :  *     Intel Architecture Processors. October, 2012.
      58             :  * [3] intel-ipsec-mb library, https://github.com/01org/intel-ipsec-mb.git
      59             :  *
      60             :  * Definitions:
      61             :  *  GF    Galois Extension Field GF(2^128) - finite field where elements are
      62             :  *        represented as polynomials with coefficients in GF(2) with the
      63             :  *        highest degree of 127. Polynomials are represented as 128-bit binary
      64             :  *        numbers where each bit represents one coefficient.
      65             :  *        e.g. polynomial x^5 + x^3 + x + 1 is represented in binary 101011.
      66             :  *  H     hash key (128 bit)
      67             :  *  POLY  irreducible polynomial x^127 + x^7 + x^2 + x + 1
      68             :  *  RPOLY irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1
      69             :  *  +     addition in GF, which equals to XOR operation
      70             :  *  *     multiplication in GF
      71             :  *
      72             :  * GF multiplication consists of 2 steps:
      73             :  *  - carry-less multiplication of two 128-bit operands into 256-bit result
      74             :  *  - reduction of 256-bit result into 128-bit with modulo POLY
      75             :  *
      76             :  * GHash is calculated on 128-bit blocks of data according to the following
      77             :  * formula:
      78             :  *    GH = (GH + data) * hash_key
      79             :  *
 * To avoid bit-reflection of data, this code uses GF multiplication
 * with reversed polynomial:
 *   a * b * x^-127 mod RPOLY
      83             :  *
      84             :  * To improve computation speed table Hi is precomputed with powers of H',
      85             :  * where H' is calculated as H<<1 mod RPOLY.
 * This allows us to improve performance by deferring reduction. For example
 * to calculate the ghash of 4 128-bit blocks of data (b0, b1, b2, b3), we can do:
      88             :  *
      89             :  * u8x16 Hi[4];
      90             :  * ghash_precompute (H, Hi, 4);
      91             :  *
      92             :  * ghash_ctx_t _gd, *gd = &_gd;
      93             :  * ghash_mul_first (gd, GH ^ b0, Hi[3]);
      94             :  * ghash_mul_next (gd, b1, Hi[2]);
      95             :  * ghash_mul_next (gd, b2, Hi[1]);
      96             :  * ghash_mul_next (gd, b3, Hi[0]);
      97             :  * ghash_reduce (gd);
      98             :  * ghash_reduce2 (gd);
      99             :  * GH = ghash_final (gd);
     100             :  *
     101             :  * Reduction step is split into 3 functions so it can be better interleaved
     102             :  * with other code, (i.e. with AES computation).
     103             :  */
     104             : 
     105             : #ifndef __ghash_h__
     106             : #define __ghash_h__
     107             : 
/* Carry-less multiply of the low 64-bit halves of a and b (a0 * b0),
   producing a 128-bit product. */
static_always_inline u8x16
gmul_lo_lo (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x00);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
                            (poly64_t) vget_low_p64 ((poly64x2_t) b));
#endif
  /* NOTE(review): no fallback branch — on a target with neither PCLMUL nor
     ARM crypto this falls off the end; presumably the header is only
     compiled where one of the two is available — confirm. */
}
     118             : 
/* Carry-less multiply: high 64 bits of a times low 64 bits of b (a1 * b0),
   producing a 128-bit product. */
static_always_inline u8x16
gmul_hi_lo (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x01);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_high_p64 ((poly64x2_t) a),
                            (poly64_t) vget_low_p64 ((poly64x2_t) b));
#endif
}
     129             : 
/* Carry-less multiply: low 64 bits of a times high 64 bits of b (a0 * b1),
   producing a 128-bit product. */
static_always_inline u8x16
gmul_lo_hi (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x10);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_p64 ((poly64_t) vget_low_p64 ((poly64x2_t) a),
                            (poly64_t) vget_high_p64 ((poly64x2_t) b));
#endif
}
     140             : 
/* Carry-less multiply of the high 64-bit halves of a and b (a1 * b1),
   producing a 128-bit product. */
static_always_inline u8x16
gmul_hi_hi (u8x16 a, u8x16 b)
{
#if defined (__PCLMUL__)
  return (u8x16) _mm_clmulepi64_si128 ((__m128i) a, (__m128i) b, 0x11);
#elif defined (__ARM_FEATURE_CRYPTO)
  return (u8x16) vmull_high_p64 ((poly64x2_t) a, (poly64x2_t) b);
#endif
}
     150             : 
/* Deferred-reduction GHASH state carried across ghash*_mul_first /
   ghash*_mul_next / ghash*_reduce / ghash*_final calls.  Three member sets
   exist for the three vector widths: 128-bit (hi/lo/mid), 256-bit (two
   lanes, *2 members) and 512-bit (four lanes, *4 members).  'pending'
   flags partial products parked in the tmp_* members that still need to
   be XORed into hi/lo. */
typedef struct
{
  u8x16 mid, hi, lo, tmp_lo, tmp_hi;
  u8x32 hi2, lo2, mid2, tmp_lo2, tmp_hi2;
  u8x64 hi4, lo4, mid4, tmp_lo4, tmp_hi4;
  int pending;
} ghash_ctx_t;
     158             : 
/* Reduction constant XORed into the shifted hash key in ghash_precompute
   when the top bit carries out (presumably the RPOLY representation from
   the header comment — see ghash_precompute usage). */
static const u8x16 ghash_poly = {
  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
};
     163             : 
/* Reduction constant used as a carry-less-multiply operand in
   ghash_reduce / ghash_reduce2; both 64-bit halves carry polynomial
   material so the hi_lo / lo_lo / lo_hi multiplies each pick up the
   appropriate part. */
static const u8x16 ghash_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2
};
     168             : 
/* First block of a multi-block GHASH multiply: compute the 256-bit
   carry-less product of a and b into gd->{hi,mid,lo}, leaving the
   modular reduction for ghash_reduce/ghash_reduce2. */
static_always_inline void
ghash_mul_first (ghash_ctx_t *gd, u8x16 a, u8x16 b)
{
  /* a1 * b1 */
  gd->hi = gmul_hi_hi (a, b);
  /* a0 * b0 */
  gd->lo = gmul_lo_lo (a, b);
  /* a0 * b1 ^ a1 * b0 */
  gd->mid = gmul_hi_lo (a, b) ^ gmul_lo_hi (a, b);

  /* set gd->pending to 0 so next invocation of ghash_mul_next(...) knows that
     there is no pending data in tmp_lo and tmp_hi */
  gd->pending = 0;
}
     183             : 
/* Accumulate one more block into the deferred-reduction product:
   gd->{hi,mid,lo} ^= a * b.  The hi/lo XOR is postponed every other call
   so pairs can be combined with a single three-input XOR. */
static_always_inline void
ghash_mul_next (ghash_ctx_t *gd, u8x16 a, u8x16 b)
{
  /* a1 * b1 */
  u8x16 hi = gmul_hi_hi (a, b);
  /* a0 * b0 */
  u8x16 lo = gmul_lo_lo (a, b);

  /* this branch will be optimized out by the compiler, and it allows us to
     reduce number of XOR operations by using ternary logic */
  if (gd->pending)
    {
      /* there is pending data from previous invocation so we can XOR */
      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, hi);
      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from previous invocation so we postpone XOR */
      gd->tmp_hi = hi;
      gd->tmp_lo = lo;
      gd->pending = 1;
    }

  /* gd->mid ^= a0 * b1 ^ a1 * b0  */
  gd->mid = u8x16_xor3 (gd->mid, gmul_hi_lo (a, b), gmul_lo_hi (a, b));
}
     212             : 
/* First reduction step: fold the middle partial product into hi/lo
   (flushing any still-pending tmp data) and begin the polynomial
   reduction of the low half. */
static_always_inline void
ghash_reduce (ghash_ctx_t *gd)
{
  u8x16 r;

  /* Final combination:
     gd->lo ^= gd->mid << 64
     gd->hi ^= gd->mid >> 64 */
  u8x16 midl = u8x16_word_shift_left (gd->mid, 8);
  u8x16 midr = u8x16_word_shift_right (gd->mid, 8);

  if (gd->pending)
    {
      gd->lo = u8x16_xor3 (gd->lo, gd->tmp_lo, midl);
      gd->hi = u8x16_xor3 (gd->hi, gd->tmp_hi, midr);
    }
  else
    {
      gd->lo ^= midl;
      gd->hi ^= midr;
    }
  /* start reducing the 256-bit product modulo the polynomial */
  r = gmul_hi_lo (ghash_poly2, gd->lo);
  gd->lo ^= u8x16_word_shift_left (r, 8);
}
     237             : 
/* Second reduction step: two more carry-less multiplies of the partially
   reduced low half by the polynomial constant; results are parked in
   tmp_lo/tmp_hi for ghash_final to combine. */
static_always_inline void
ghash_reduce2 (ghash_ctx_t *gd)
{
  gd->tmp_lo = gmul_lo_lo (ghash_poly2, gd->lo);
  gd->tmp_hi = gmul_lo_hi (ghash_poly2, gd->lo);
}
     244             : 
/* Final reduction step: combine the high half with the shifted outputs of
   ghash_reduce2 to yield the fully reduced 128-bit GHASH value. */
static_always_inline u8x16
ghash_final (ghash_ctx_t *gd)
{
  return u8x16_xor3 (gd->hi, u8x16_word_shift_right (gd->tmp_lo, 4),
                     u8x16_word_shift_left (gd->tmp_hi, 4));
}
     251             : 
     252             : static_always_inline u8x16
     253     3610390 : ghash_mul (u8x16 a, u8x16 b)
     254             : {
     255     3610390 :   ghash_ctx_t _gd, *gd = &_gd;
     256     3610390 :   ghash_mul_first (gd, a, b);
     257     3610390 :   ghash_reduce (gd);
     258     3610390 :   ghash_reduce2 (gd);
     259     3610390 :   return ghash_final (gd);
     260             : }
     261             : 
     262             : #if defined(__VPCLMULQDQ__) && defined(__AVX512F__)
     263             : 
/* ghash_poly2 replicated into all four 128-bit lanes of a 512-bit vector,
   for the VPCLMULQDQ/AVX512 4-lane reduction path. */
static const u8x64 ghash4_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};
     274             : 
/* 4-lane carry-less multiply: per 128-bit lane, low 64 bits of a times
   low 64 bits of b (a0 * b0). */
static_always_inline u8x64
gmul4_lo_lo (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x00);
}
     280             : 
/* 4-lane carry-less multiply: per 128-bit lane, high 64 bits of a times
   low 64 bits of b (a1 * b0). */
static_always_inline u8x64
gmul4_hi_lo (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x01);
}
     286             : 
/* 4-lane carry-less multiply: per 128-bit lane, low 64 bits of a times
   high 64 bits of b (a0 * b1). */
static_always_inline u8x64
gmul4_lo_hi (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x10);
}
     292             : 
/* 4-lane carry-less multiply: per 128-bit lane, high 64 bits of a times
   high 64 bits of b (a1 * b1). */
static_always_inline u8x64
gmul4_hi_hi (u8x64 a, u8x64 b)
{
  return (u8x64) _mm512_clmulepi64_epi128 ((__m512i) a, (__m512i) b, 0x11);
}
     298             : 
/* 4-lane variant of ghash_mul_first: start a deferred-reduction multiply
   of four independent 128-bit lane pairs into gd->{hi4,mid4,lo4}. */
static_always_inline void
ghash4_mul_first (ghash_ctx_t *gd, u8x64 a, u8x64 b)
{
  gd->hi4 = gmul4_hi_hi (a, b);   /* a1 * b1 per lane */
  gd->lo4 = gmul4_lo_lo (a, b);   /* a0 * b0 per lane */
  gd->mid4 = gmul4_hi_lo (a, b) ^ gmul4_lo_hi (a, b);   /* cross terms */
  gd->pending = 0;                /* nothing parked in tmp_*4 yet */
}
     307             : 
/* 4-lane variant of ghash_mul_next: accumulate one more 512-bit block of
   lane products, deferring the hi/lo XOR every other call. */
static_always_inline void
ghash4_mul_next (ghash_ctx_t *gd, u8x64 a, u8x64 b)
{
  u8x64 hi = gmul4_hi_hi (a, b);   /* a1 * b1 per lane */
  u8x64 lo = gmul4_lo_lo (a, b);   /* a0 * b0 per lane */

  if (gd->pending)
    {
      /* there is pending data from previous invocation so we can XOR */
      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, hi);
      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from previous invocation so we postpone XOR */
      gd->tmp_hi4 = hi;
      gd->tmp_lo4 = lo;
      gd->pending = 1;
    }
  /* gd->mid4 ^= a0 * b1 ^ a1 * b0 (per lane) */
  gd->mid4 = u8x64_xor3 (gd->mid4, gmul4_hi_lo (a, b), gmul4_lo_hi (a, b));
}
     330             : 
/* 4-lane variant of ghash_reduce: fold mid4 into hi4/lo4 (flushing any
   pending tmp data) and start the per-lane polynomial reduction. */
static_always_inline void
ghash4_reduce (ghash_ctx_t *gd)
{
  u8x64 r;

  /* Final combination:
     gd->lo4 ^= gd->mid4 << 64
     gd->hi4 ^= gd->mid4 >> 64 */

  u8x64 midl = u8x64_word_shift_left (gd->mid4, 8);
  u8x64 midr = u8x64_word_shift_right (gd->mid4, 8);

  if (gd->pending)
    {
      gd->lo4 = u8x64_xor3 (gd->lo4, gd->tmp_lo4, midl);
      gd->hi4 = u8x64_xor3 (gd->hi4, gd->tmp_hi4, midr);
    }
  else
    {
      gd->lo4 ^= midl;
      gd->hi4 ^= midr;
    }

  /* start reducing each lane modulo the polynomial */
  r = gmul4_hi_lo (ghash4_poly2, gd->lo4);
  gd->lo4 ^= u8x64_word_shift_left (r, 8);
}
     357             : 
/* 4-lane variant of ghash_reduce2: per-lane polynomial multiplies of the
   partially reduced low halves, parked for ghash4_final. */
static_always_inline void
ghash4_reduce2 (ghash_ctx_t *gd)
{
  gd->tmp_lo4 = gmul4_lo_lo (ghash4_poly2, gd->lo4);
  gd->tmp_hi4 = gmul4_lo_hi (ghash4_poly2, gd->lo4);
}
     364             : 
/* 4-lane variant of ghash_final: finish the per-lane reduction, then
   horizontally XOR the four 128-bit lanes into a single result. */
static_always_inline u8x16
ghash4_final (ghash_ctx_t *gd)
{
  u8x64 r;
  u8x32 t;

  r = u8x64_xor3 (gd->hi4, u8x64_word_shift_right (gd->tmp_lo4, 4),
                  u8x64_word_shift_left (gd->tmp_hi4, 4));

  /* horizontal XOR of 4 128-bit lanes */
  t = u8x64_extract_lo (r) ^ u8x64_extract_hi (r);
  return u8x32_extract_hi (t) ^ u8x32_extract_lo (t);
}
     378             : #endif
     379             : 
     380             : #if defined(__VPCLMULQDQ__)
     381             : 
/* ghash_poly2 replicated into both 128-bit lanes of a 256-bit vector,
   for the VPCLMULQDQ/AVX2 2-lane reduction path. */
static const u8x32 ghash2_poly2 = {
  0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00,
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc2,
};
     387             : 
/* 2-lane carry-less multiply: per 128-bit lane, low 64 bits of a times
   low 64 bits of b (a0 * b0). */
static_always_inline u8x32
gmul2_lo_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x00);
}
     393             : 
/* 2-lane carry-less multiply: per 128-bit lane, high 64 bits of a times
   low 64 bits of b (a1 * b0). */
static_always_inline u8x32
gmul2_hi_lo (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x01);
}
     399             : 
/* 2-lane carry-less multiply: per 128-bit lane, low 64 bits of a times
   high 64 bits of b (a0 * b1). */
static_always_inline u8x32
gmul2_lo_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x10);
}
     405             : 
/* 2-lane carry-less multiply: per 128-bit lane, high 64 bits of a times
   high 64 bits of b (a1 * b1). */
static_always_inline u8x32
gmul2_hi_hi (u8x32 a, u8x32 b)
{
  return (u8x32) _mm256_clmulepi64_epi128 ((__m256i) a, (__m256i) b, 0x11);
}
     411             : 
/* 2-lane variant of ghash_mul_first: start a deferred-reduction multiply
   of two independent 128-bit lane pairs into gd->{hi2,mid2,lo2}. */
static_always_inline void
ghash2_mul_first (ghash_ctx_t *gd, u8x32 a, u8x32 b)
{
  gd->hi2 = gmul2_hi_hi (a, b);   /* a1 * b1 per lane */
  gd->lo2 = gmul2_lo_lo (a, b);   /* a0 * b0 per lane */
  gd->mid2 = gmul2_hi_lo (a, b) ^ gmul2_lo_hi (a, b);   /* cross terms */
  gd->pending = 0;                /* nothing parked in tmp_*2 yet */
}
     420             : 
/* 2-lane variant of ghash_mul_next: accumulate one more 256-bit block of
   lane products, deferring the hi/lo XOR every other call. */
static_always_inline void
ghash2_mul_next (ghash_ctx_t *gd, u8x32 a, u8x32 b)
{
  u8x32 hi = gmul2_hi_hi (a, b);   /* a1 * b1 per lane */
  u8x32 lo = gmul2_lo_lo (a, b);   /* a0 * b0 per lane */

  if (gd->pending)
    {
      /* there is pending data from previous invocation so we can XOR */
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, hi);
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, lo);
      gd->pending = 0;
    }
  else
    {
      /* there is no pending data from previous invocation so we postpone XOR */
      gd->tmp_hi2 = hi;
      gd->tmp_lo2 = lo;
      gd->pending = 1;
    }
  /* gd->mid2 ^= a0 * b1 ^ a1 * b0 (per lane) */
  gd->mid2 = u8x32_xor3 (gd->mid2, gmul2_hi_lo (a, b), gmul2_lo_hi (a, b));
}
     443             : 
/* 2-lane variant of ghash_reduce: fold mid2 into hi2/lo2 (flushing any
   pending tmp data) and start the per-lane polynomial reduction. */
static_always_inline void
ghash2_reduce (ghash_ctx_t *gd)
{
  u8x32 r;

  /* Final combination:
     gd->lo2 ^= gd->mid2 << 64
     gd->hi2 ^= gd->mid2 >> 64 */

  u8x32 midl = u8x32_word_shift_left (gd->mid2, 8);
  u8x32 midr = u8x32_word_shift_right (gd->mid2, 8);

  if (gd->pending)
    {
      gd->lo2 = u8x32_xor3 (gd->lo2, gd->tmp_lo2, midl);
      gd->hi2 = u8x32_xor3 (gd->hi2, gd->tmp_hi2, midr);
    }
  else
    {
      gd->lo2 ^= midl;
      gd->hi2 ^= midr;
    }

  /* start reducing each lane modulo the polynomial */
  r = gmul2_hi_lo (ghash2_poly2, gd->lo2);
  gd->lo2 ^= u8x32_word_shift_left (r, 8);
}
     470             : 
/* 2-lane variant of ghash_reduce2: per-lane polynomial multiplies of the
   partially reduced low halves, parked for ghash2_final. */
static_always_inline void
ghash2_reduce2 (ghash_ctx_t *gd)
{
  gd->tmp_lo2 = gmul2_lo_lo (ghash2_poly2, gd->lo2);
  gd->tmp_hi2 = gmul2_lo_hi (ghash2_poly2, gd->lo2);
}
     477             : 
/* 2-lane variant of ghash_final: finish the per-lane reduction, then
   horizontally XOR the two 128-bit lanes into a single result. */
static_always_inline u8x16
ghash2_final (ghash_ctx_t *gd)
{
  u8x32 r;

  r = u8x32_xor3 (gd->hi2, u8x32_word_shift_right (gd->tmp_lo2, 4),
                  u8x32_word_shift_left (gd->tmp_hi2, 4));

  /* horizontal XOR of 2 128-bit lanes */
  return u8x32_extract_hi (r) ^ u8x32_extract_lo (r);
}
     489             : #endif
     490             : 
/* Precompute n powers of the hash key for deferred-reduction GHASH.
   Hi[n-1] receives H' = H<<1 mod RPOLY; each lower index holds the next
   higher power (Hi[n-1-i] = H'^(i+1)), so Hi[0] is the highest power —
   matching the consumption order shown in the file header example. */
static_always_inline void
ghash_precompute (u8x16 H, u8x16 * Hi, int n)
{
  u8x16 r8;
  u32x4 r32;
  /* calculate H<<1 mod poly from the hash key */
  r8 = (u8x16) ((u64x2) H >> 63);   /* carry bit out of each 64-bit half */
  H = (u8x16) ((u64x2) H << 1);
  H |= u8x16_word_shift_left (r8, 8);   /* propagate low-half carry into high half */
  r32 = (u32x4) u8x16_word_shift_right (r8, 8);   /* carry out of bit 127 */
#ifdef __SSE2__
  r32 = u32x4_shuffle (r32, 0, 1, 2, 0);
#else
  r32[3] = r32[0];
#endif
  /* presumably builds an all-ones mask when H's top bit carried out, so
     the reduction constant is conditionally XORed in — confirm */
  r32 = r32 == (u32x4) {1, 0, 0, 1};
  Hi[n - 1] = H = H ^ ((u8x16) r32 & ghash_poly);

  /* calculate H^(i + 1) */
  for (int i = n - 2; i >= 0; i--)
    Hi[i] = ghash_mul (H, Hi[i + 1]);
}
     513             : 
     514             : #endif /* __ghash_h__ */
     515             : 

Generated by: LCOV version 1.14