LCOV - code coverage report
Current view: top level - vnet/tcp - tcp.c (source / functions) Hit Total Coverage
Test: coverage-filtered.info Lines: 458 750 61.1 %
Date: 2023-07-05 22:20:52 Functions: 53 70 75.7 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016-2019 Cisco and/or its affiliates.
       3             :  * Licensed under the Apache License, Version 2.0 (the "License");
       4             :  * you may not use this file except in compliance with the License.
       5             :  * You may obtain a copy of the License at:
       6             :  *
       7             :  *     http://www.apache.org/licenses/LICENSE-2.0
       8             :  *
       9             :  * Unless required by applicable law or agreed to in writing, software
      10             :  * distributed under the License is distributed on an "AS IS" BASIS,
      11             :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      12             :  * See the License for the specific language governing permissions and
      13             :  * limitations under the License.
      14             :  */
      15             : 
      16             : /**
      17             :  * @file
      18             :  * @brief TCP host stack utilities
      19             :  */
      20             : 
      21             : #include <vnet/tcp/tcp.h>
      22             : #include <vnet/tcp/tcp_inlines.h>
      23             : #include <vnet/session/session.h>
      24             : #include <vnet/fib/fib.h>
      25             : #include <vnet/dpo/load_balance.h>
      26             : #include <math.h>
      27             : 
      28             : #include <vlib/stats/stats.h>
      29             : 
      30             : tcp_main_t tcp_main;
      31             : 
      32             : typedef struct /* arguments handed to tcp_add_del_adj_cb through vlib_rpc_call_main_thread */
      33             : {
      34             :   fib_protocol_t nh_proto; /* passed to adj_nbr_add_or_lock on add */
      35             :   vnet_link_t link_type; /* passed to adj_nbr_add_or_lock on add */
      36             :   ip46_address_t ip; /* neighbor address used to add or find the adjacency */
      37             :   u32 sw_if_index; /* interface used to add or find the adjacency */
      38             :   u8 is_add; /* non-zero: add or lock; zero: find and unlock */
      39             : } tcp_add_del_adj_args_t;
      40             : 
      41             : static void /* RPC callback: add/lock or find/unlock the neighbor adjacency described by args */
      42           0 : tcp_add_del_adj_cb (tcp_add_del_adj_args_t * args)
      43             : {
      44             :   u32 ai;
      45           0 :   if (args->is_add)
      46             :     {
      47           0 :       adj_nbr_add_or_lock (args->nh_proto, args->link_type, &args->ip,
      48             :                            args->sw_if_index);
      49             :     }
      50             :   else
      51             :     { /* look up with the same protocol/link type the add path uses */
      52           0 :       ai = adj_nbr_find (args->nh_proto, args->link_type, &args->ip,
      53             :                          args->sw_if_index);
      54           0 :       if (ai != ADJ_INDEX_INVALID) /* nothing to unlock when no adjacency was found */
      55           0 :         adj_unlock (ai);
      56             :     }
      57           0 : }
      58             : 
      59             : static void /* Ask for tc's IPv6 peer adjacency to be added (is_add != 0) or released (is_add == 0) */
      60           0 : tcp_add_del_adjacency (tcp_connection_t * tc, u8 is_add)
      61             : {
      62           0 :   tcp_add_del_adj_args_t args = {
      63             :     .nh_proto = FIB_PROTOCOL_IP6,
      64             :     .link_type = VNET_LINK_IP6,
      65             :     .ip = tc->c_rmt_ip,
      66           0 :     .sw_if_index = tc->sw_if_index,
      67             :     .is_add = is_add
      68             :   };
      69           0 :   vlib_rpc_call_main_thread (tcp_add_del_adj_cb, (u8 *) & args, /* work is done by tcp_add_del_adj_cb */
      70             :                              sizeof (args));
      71           0 : }
      72             : 
      73             : static void /* Seed snd_congestion when applicable, then run the congestion-control algorithm's init hook */
      74         267 : tcp_cc_init (tcp_connection_t * tc)
      75             : {
      76             :   /* As per RFC 6582 initialize "recover" to iss */
      77         267 :   if (tcp_opts_sack_permitted (&tc->rcv_opts)) /* NOTE(review): RFC 6582 covers NewReno without SACK, yet this runs when SACK is permitted - confirm */
      78         264 :     tc->snd_congestion = tc->iss;
      79             : 
      80         267 :   tc->cc_algo->init (tc); /* called unconditionally, so cc_algo and its init hook must be set */
      81         267 : }
      82             : 
      83             : static void /* Run the congestion-control algorithm's cleanup hook when one is provided */
      84         134 : tcp_cc_cleanup (tcp_connection_t * tc)
      85             : {
      86         134 :   if (tc->cc_algo->cleanup) /* the hook is optional */
      87           0 :     tc->cc_algo->cleanup (tc);
      88         134 : }
      89             : 
      90             : void /* Store a copy of *vft at index type in tm->cc_algos and map vft->name to type */
      91        1118 : tcp_cc_algo_register (tcp_cc_algorithm_type_e type,
      92             :                       const tcp_cc_algorithm_t * vft)
      93             : {
      94        1118 :   tcp_main_t *tm = vnet_get_tcp_main ();
      95        1118 :   vec_validate (tm->cc_algos, type); /* presumably grows the vector so index type is valid - confirm */
      96             : 
      97        1118 :   tm->cc_algos[type] = *vft; /* struct copy: later changes to *vft are not seen */
      98        2236 :   hash_set_mem (tm->cc_algo_by_name, vft->name, type);
      99        1118 : }
     100             : 
     101             : tcp_cc_algorithm_t * /* Return the address of the cc_algos entry for type; no range check is done here */
     102         171 : tcp_cc_algo_get (tcp_cc_algorithm_type_e type)
     103             : {
     104         171 :   tcp_main_t *tm = vnet_get_tcp_main ();
     105         171 :   return &tm->cc_algos[type];
     106             : }
     107             : 
     108             : tcp_cc_algorithm_type_e /* Register vft under the next type id and return that id */
     109           0 : tcp_cc_algo_new_type (const tcp_cc_algorithm_t * vft)
     110             : {
     111           0 :   tcp_main_t *tm = vnet_get_tcp_main ();
     112           0 :   tcp_cc_algo_register (++tm->cc_last_type, vft); /* pre-increment: the new id is cc_last_type + 1 */
     113           0 :   return tm->cc_last_type;
     114             : }
     115             : 
     116             : static u32 /* Allocate a listener from tm->listener_pool, fill it from lcl, return its pool index */
     117          39 : tcp_connection_bind (u32 session_index, transport_endpoint_cfg_t *lcl)
     118             : {
     119          39 :   tcp_main_t *tm = &tcp_main;
     120             :   tcp_connection_t *listener;
     121             :   void *iface_ip;
     122             : 
     123          39 :   pool_get (tm->listener_pool, listener);
     124          39 :   clib_memset (listener, 0, sizeof (*listener));
     125             : 
     126          39 :   listener->c_c_index = listener - tm->listener_pool;
     127          39 :   listener->c_lcl_port = lcl->port;
     128             : 
     129             :   /* If we are provided a sw_if_index, bind using one of its ips */
     130          39 :   if (ip_is_zero (&lcl->ip, 1) && lcl->sw_if_index != ENDPOINT_INVALID_INDEX)
     131             :     {
     132          29 :       if ((iface_ip = ip_interface_get_first_ip (lcl->sw_if_index,
     133          29 :                                                  lcl->is_ip4)))
     134          28 :         ip_set (&lcl->ip, iface_ip, lcl->is_ip4); /* overwrites the caller's lcl->ip */
     135             :     }
     136          39 :   ip_copy (&listener->c_lcl_ip, &lcl->ip, lcl->is_ip4);
     137          39 :   listener->c_is_ip4 = lcl->is_ip4;
     138          39 :   listener->c_proto = TRANSPORT_PROTO_TCP;
     139          39 :   listener->c_s_index = session_index;
     140          39 :   listener->c_fib_index = lcl->fib_index;
     141          39 :   listener->state = TCP_STATE_LISTEN;
     142          39 :   listener->cc_algo = tcp_cc_algo_get (tcp_cfg.cc_algo); /* listener gets the configured default algorithm */
     143             : 
     144          39 :   tcp_connection_timers_init (listener);
     145             : 
     146             :   TCP_EVT (TCP_EVT_BIND, listener);
     147             : 
     148          39 :   return listener->c_c_index;
     149             : }
     150             : 
     151             : static u32 /* Thin wrapper: forwards both arguments to tcp_connection_bind and returns its result */
     152          39 : tcp_session_bind (u32 session_index, transport_endpoint_cfg_t *tep)
     153             : {
     154          39 :   return tcp_connection_bind (session_index, tep);
     155             : }
     156             : 
     157             : static void /* Return the listener at listener_index to tm->listener_pool */
     158          33 : tcp_connection_unbind (u32 listener_index)
     159             : {
     160          33 :   tcp_main_t *tm = vnet_get_tcp_main ();
     161             :   tcp_connection_t *tc;
     162             : 
     163          33 :   tc = pool_elt_at_index (tm->listener_pool, listener_index);
     164             : 
     165             :   TCP_EVT (TCP_EVT_UNBIND, tc);
     166             : 
     167             :   /* Poison the entry */
     168             :   if (CLIB_DEBUG > 0) /* only when CLIB_DEBUG is positive */
     169          33 :     clib_memset (tc, 0xFA, sizeof (*tc));
     170             : 
     171          33 :   pool_put_index (tm->listener_pool, listener_index);
     172          33 : }
     173             : 
     174             : static u32 /* Wrapper around tcp_connection_unbind; always returns 0 */
     175          33 : tcp_session_unbind (u32 listener_index)
     176             : {
     177          33 :   tcp_connection_unbind (listener_index);
     178          33 :   return 0;
     179             : }
     180             : 
     181             : static transport_connection_t * /* Look up a listener by pool index and return its embedded transport connection */
     182         418 : tcp_session_get_listener (u32 listener_index)
     183             : {
     184         418 :   tcp_main_t *tm = vnet_get_tcp_main ();
     185             :   tcp_connection_t *tc;
     186         418 :   tc = pool_elt_at_index (tm->listener_pool, listener_index);
     187         418 :   return &tc->connection;
     188             : }
     189             : 
     190             : static tcp_connection_t * /* Allocate a connection on the thread returned by transport_cl_thread () */
     191         132 : tcp_half_open_connection_alloc (void)
     192             : {
     193         132 :   return tcp_connection_alloc (transport_cl_thread ());
     194             : }
     195             : 
     196             : /**
     197             :  * Cleanup half-open connection
     198             :  *
     199             :  * Asserts the caller is tc's thread or vlib_thread_is_main_w_barrier () holds, then frees tc via tcp_connection_free.
     200             : static void
     201         132 : tcp_half_open_connection_free (tcp_connection_t * tc)
     202             : {
     203         132 :   ASSERT (vlib_get_thread_index () == tc->c_thread_index ||
     204             :           vlib_thread_is_main_w_barrier ());
     205         132 :   return tcp_connection_free (tc);
     206             : }
     207             : 
     208             : /**
     209             :  * Try to cleanup half-open connection
     210             :  *
     211             :  * If called from a thread that doesn't own tc, the call won't have any
     212             :  * effect.
     213             :  *
     214             :  * @param tc - connection to be cleaned up
     215             :  * @return non-zero if cleanup failed.
     216             :  */
     217             : int
     218         132 : tcp_half_open_connection_cleanup (tcp_connection_t * tc)
     219             : {
     220             :   tcp_worker_ctx_t *wrk;
     221             : 
     222             :   /* Make sure this is the owning thread */
     223         132 :   if (tc->c_thread_index != vlib_get_thread_index ())
     224           0 :     return 1; /* not the owner: nothing was touched */
     225             : 
     226         132 :   session_half_open_delete_notify (&tc->connection); /* session layer is notified before tc is freed */
     227         132 :   wrk = tcp_get_worker (tc->c_thread_index);
     228         132 :   tcp_timer_reset (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN); /* only the SYN retransmit timer is reset here */
     229         132 :   tcp_half_open_connection_free (tc);
     230         132 :   return 0;
     231             : }
     232             : 
     233             : /**
     234             :  * Cleans up connection state.
     235             :  *
     236             :  * No notifications.
     237             :  */
     238             : void
     239         134 : tcp_connection_cleanup (tcp_connection_t * tc)
     240             : {
     241             :   TCP_EVT (TCP_EVT_DELETE, tc);
     242             : 
     243             :   /* Cleanup local endpoint if this was an active connect */
     244         134 :   if (!(tc->cfg_flags & TCP_CFG_F_NO_ENDPOINT)) /* skipped only when the NO_ENDPOINT config flag is set */
     245         134 :     transport_release_local_endpoint (TRANSPORT_PROTO_TCP, &tc->c_lcl_ip,
     246         134 :                                       tc->c_lcl_port);
     247             : 
     248             :   /* Check if connection is not yet fully established */
     249         134 :   if (tc->state == TCP_STATE_SYN_SENT)
     250             :     {
     251             :       /* Try to remove the half-open connection. If this is not the owning
     252             :        * thread, tc won't be removed. Retransmit or establish timers will
     253             :        * eventually expire and call again cleanup on the right thread. */
     254           0 :       if (tcp_half_open_connection_cleanup (tc))
     255           0 :         tc->flags |= TCP_CONN_HALF_OPEN_DONE; /* non-zero return: tc was not freed, so it is still valid here */
     256             :     }
     257             :   else
     258             :     {
     259             :       /* Make sure all timers are cleared */
     260         134 :       tcp_connection_timers_reset (tc);
     261             : 
     262         134 :       if (!tc->c_is_ip4 && ip6_address_is_link_local_unicast (&tc->c_rmt_ip6))
     263           0 :         tcp_add_del_adjacency (tc, 0); /* is_add = 0: release the adjacency for a link-local IPv6 peer */
     264             : 
     265         134 :       tcp_cc_cleanup (tc);
     266         134 :       vec_free (tc->snd_sacks);
     267         134 :       vec_free (tc->snd_sacks_fl);
     268         134 :       vec_free (tc->rcv_opts.sacks);
     269         134 :       pool_free (tc->sack_sb.holes);
     270             : 
     271         134 :       if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
     272           0 :         tcp_bt_cleanup (tc);
     273             : 
     274         134 :       tcp_connection_free (tc); /* tc must not be used after this point */
     275             :     }
     276         134 : }
     277             : 
     278             : /**
     279             :  * Connection removal.
     280             :  *
     281             :  * This should be called only once connection enters CLOSED state. Note
     282             :  * that it notifies the session of the removal event, so if the goal is to
     283             :  * just remove the connection, call tcp_connection_cleanup instead.
     284             :  */
     285             : void
     286           0 : tcp_connection_del (tcp_connection_t * tc)
     287             : {
     288           0 :   session_transport_delete_notify (&tc->connection); /* notify first, while tc is still allocated */
     289           0 :   tcp_connection_cleanup (tc);
     290           0 : }
     291             : 
     292             : tcp_connection_t * /* Get a zeroed connection from thread_index's worker pool with index and thread fields set */
     293         269 : tcp_connection_alloc (u8 thread_index)
     294             : {
     295         269 :   tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
     296             :   tcp_connection_t *tc;
     297             : 
     298         269 :   pool_get_aligned_safe (wrk->connections, tc, CLIB_CACHE_LINE_BYTES);
     299         269 :   clib_memset (tc, 0, sizeof (*tc));
     300         269 :   tc->c_c_index = tc - wrk->connections; /* pool index of the new element */
     301         269 :   tc->c_thread_index = thread_index;
     302         269 :   return tc;
     303             : }
     304             : 
     305             : tcp_connection_t * /* Allocate a connection on thread_index as a byte copy of *base, then fix up index and thread */
     306         132 : tcp_connection_alloc_w_base (u8 thread_index, tcp_connection_t **base)
     307             : {
     308         132 :   tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
     309             :   tcp_connection_t *tc;
     310             : 
     311             :   /* Make sure connection is still valid if pool moves */
     312         132 :   if ((*base)->c_thread_index == thread_index)
     313             :     {
     314         132 :       u32 base_index = (*base)->c_c_index; /* remember the index before the pool may change */
     315         132 :       pool_get_aligned_safe (wrk->connections, tc, CLIB_CACHE_LINE_BYTES);
     316         132 :       *base = tcp_connection_get (base_index, thread_index); /* caller's pointer is refreshed too */
     317             :     }
     318             :   else
     319             :     {
     320           0 :       pool_get_aligned_safe (wrk->connections, tc, CLIB_CACHE_LINE_BYTES);
     321             :     }
     322         132 :   clib_memcpy_fast (tc, *base, sizeof (*tc));
     323         132 :   tc->c_c_index = tc - wrk->connections; /* overwrite the two fields copied from *base */
     324         132 :   tc->c_thread_index = thread_index;
     325         132 :   return tc;
     326             : }
     327             : 
     328             : void /* Return tc to its worker's connection pool */
     329         266 : tcp_connection_free (tcp_connection_t * tc)
     330             : {
     331         266 :   tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index); /* read before tc may be overwritten below */
     332             :   if (CLIB_DEBUG)
     333             :     {
     334         266 :       clib_memset (tc, 0xFA, sizeof (*tc)); /* fill with 0xFA when CLIB_DEBUG is non-zero */
     335         266 :       pool_put (wrk->connections, tc);
     336         266 :       return;
     337             :     }
     338             :   pool_put (wrk->connections, tc);
     339             : }
     340             : 
     341             : void /* Append a cleanup request for tc to wrk->pending_cleanups */
     342         135 : tcp_program_cleanup (tcp_worker_ctx_t * wrk, tcp_connection_t * tc)
     343             : {
     344             :   tcp_cleanup_req_t *req;
     345             :   clib_time_type_t now;
     346             : 
     347         135 :   now = tcp_time_now_us (tc->c_thread_index);
     348         135 :   clib_fifo_add2 (wrk->pending_cleanups, req);
     349         135 :   req->connection_index = tc->c_c_index; /* request stores the index, not the pointer */
     350         135 :   req->free_time = now + tcp_cfg.cleanup_time; /* free_time is now plus the configured cleanup_time */
     351         135 : }
     352             : 
     353             : /**
     354             :  * Begin connection closing procedure.
     355             :  *
     356             :  * If at the end the connection is not in CLOSED state, it is not removed.
     357             :  * Instead, we rely on TCP to advance through state machine to either
     358             :  * 1) LAST_ACK (passive close) whereby when the last ACK is received
     359             :  * tcp_connection_del is called. This notifies session of the delete and
     360             :  * calls cleanup.
     361             :  * 2) TIME_WAIT (active close) whereby after 2MSL the 2MSL timer triggers
     362             :  * and cleanup is called.
     363             :  *
     364             :  */
     365             : void
     366         260 : tcp_connection_close (tcp_connection_t * tc)
     367             : {
     368         260 :   tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
     369             : 
     370             :   TCP_EVT (TCP_EVT_CLOSE, tc);
     371             : 
     372             :   /* Send/Program FIN if needed and switch state */
     373         260 :   switch (tc->state)
     374             :     {
     375           0 :     case TCP_STATE_SYN_SENT:
     376             :       /* Try to cleanup. If not on the right thread, mark as half-open done.
     377             :        * Connection will be cleaned up when establish timer pops */
     378           0 :       tcp_connection_cleanup (tc);
     379           0 :       break;
     380           0 :     case TCP_STATE_SYN_RCVD:
     381           0 :       tcp_connection_timers_reset (tc);
     382           0 :       tcp_send_fin (tc);
     383           0 :       tcp_connection_set_state (tc, TCP_STATE_FIN_WAIT_1);
     384           0 :       tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_WAITCLOSE,
     385             :                         tcp_cfg.finwait1_time);
     386           0 :       break;
     387         133 :     case TCP_STATE_ESTABLISHED:
     388             :       /* If closing with unread data, reset the connection */
     389         133 :       if (transport_max_rx_dequeue (&tc->connection))
     390             :         {
     391           4 :           tcp_send_reset (tc);
     392           4 :           tcp_connection_timers_reset (tc);
     393           4 :           tcp_connection_set_state (tc, TCP_STATE_CLOSED);
     394           4 :           session_transport_closed_notify (&tc->connection);
     395           4 :           tcp_program_cleanup (tcp_get_worker (tc->c_thread_index), tc); /* same lookup that produced wrk above */
     396           4 :           tcp_worker_stats_inc (wrk, rst_unread, 1);
     397           4 :           break;
     398             :         }
     399         129 :       if (!transport_max_tx_dequeue (&tc->connection))
     400         127 :         tcp_send_fin (tc);
     401             :       else
     402           2 :         tc->flags |= TCP_CONN_FINPNDG; /* tx data still queued: no FIN sent here, only the flag is set */
     403         129 :       tcp_connection_set_state (tc, TCP_STATE_FIN_WAIT_1);
     404             :       /* Set a timer in case the peer stops responding. Otherwise the
     405             :        * connection will be stuck here forever. */
     406         129 :       ASSERT (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID);
     407         129 :       tcp_timer_set (&wrk->timer_wheel, tc, TCP_TIMER_WAITCLOSE,
     408             :                      tcp_cfg.finwait1_time);
     409         129 :       break;
     410         127 :     case TCP_STATE_CLOSE_WAIT:
     411         127 :       if (!transport_max_tx_dequeue (&tc->connection))
     412             :         {
     413         125 :           tcp_send_fin (tc);
     414         125 :           tcp_connection_timers_reset (tc);
     415         125 :           tcp_connection_set_state (tc, TCP_STATE_LAST_ACK);
     416         125 :           tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_WAITCLOSE,
     417             :                             tcp_cfg.lastack_time);
     418             :         }
     419             :       else
     420           2 :         tc->flags |= TCP_CONN_FINPNDG; /* state stays CLOSE_WAIT in this branch */
     421         127 :       break;
     422           0 :     case TCP_STATE_FIN_WAIT_1:
     423           0 :       tcp_timer_update (&wrk->timer_wheel, tc, TCP_TIMER_WAITCLOSE, /* already closing: only the wait-close timer is updated */
     424             :                         tcp_cfg.finwait1_time);
     425           0 :       break;
     426           0 :     case TCP_STATE_CLOSED:
     427             :       /* Cleanup should've been programmed already */
     428           0 :       break;
     429         260 :     default: /* any other state: only the debug trace below */
     430             :       TCP_DBG ("state: %u", tc->state);
     431             :     }
     432         260 : }
     433             : 
     434             : static void /* For an ESTABLISHED connection: send FIN or set FINPNDG, move to FIN_WAIT_1, arm the wait-close timer */
     435           0 : tcp_session_half_close (u32 conn_index, u32 thread_index)
     436             : {
     437             :   tcp_worker_ctx_t *wrk;
     438             :   tcp_connection_t *tc;
     439             : 
     440           0 :   tc = tcp_connection_get (conn_index, thread_index);
     441           0 :   wrk = tcp_get_worker (tc->c_thread_index); /* tc is dereferenced without a NULL check */
     442             : 
     443             :   /* If the connection is not in ESTABLISHED state, ignore it */
     444           0 :   if (tc->state != TCP_STATE_ESTABLISHED)
     445           0 :     return;
     446           0 :   if (!transport_max_tx_dequeue (&tc->connection))
     447           0 :     tcp_send_fin (tc);
     448             :   else
     449           0 :     tc->flags |= TCP_CONN_FINPNDG; /* tx data still queued: no FIN sent here, only the flag is set */
     450           0 :   tcp_connection_set_state (tc, TCP_STATE_FIN_WAIT_1);
     451             :   /* Set a timer in case the peer stops responding. Otherwise the
     452             :    * connection will be stuck here forever. */
     453           0 :   ASSERT (tc->timers[TCP_TIMER_WAITCLOSE] == TCP_TIMER_HANDLE_INVALID);
     454           0 :   tcp_timer_set (&wrk->timer_wheel, tc, TCP_TIMER_WAITCLOSE,
     455             :                  tcp_cfg.finwait1_time);
     456             : }
     457             : 
     458             : static void /* Look up the connection and run tcp_connection_close on it */
     459         260 : tcp_session_close (u32 conn_index, u32 thread_index)
     460             : {
     461             :   tcp_connection_t *tc;
     462         260 :   tc = tcp_connection_get (conn_index, thread_index);
     463         260 :   tcp_connection_close (tc);
     464         260 : }
     465             : 
     466             : static void /* Mark the connection CLOSED and clean it up; does nothing when the lookup returns NULL */
     467           0 : tcp_session_cleanup (u32 conn_index, u32 thread_index)
     468             : {
     469             :   tcp_connection_t *tc;
     470           0 :   tc = tcp_connection_get (conn_index, thread_index);
     471           0 :   if (!tc)
     472           0 :     return;
     473           0 :   tcp_connection_set_state (tc, TCP_STATE_CLOSED); /* CLOSED makes tcp_connection_cleanup take its non-SYN_SENT branch */
     474           0 :   tcp_connection_cleanup (tc);
     475             : }
     476             : 
     477             : static void /* Reset the SYN retransmit timer and free the half-open connection at conn_index */
     478           0 : tcp_session_cleanup_ho (u32 conn_index)
     479             : {
     480             :   tcp_worker_ctx_t *wrk;
     481             :   tcp_connection_t *tc;
     482             : 
     483           0 :   tc = tcp_half_open_connection_get (conn_index);
     484           0 :   wrk = tcp_get_worker (tc->c_thread_index); /* tc is dereferenced without a NULL check */
     485           0 :   tcp_timer_reset (&wrk->timer_wheel, tc, TCP_TIMER_RETRANSMIT_SYN);
     486           0 :   tcp_half_open_connection_free (tc);
     487           0 : }
     488             : 
     489             : static void /* SYN_SENT connections are cleaned up; otherwise send RST, close, notify session, program cleanup */
     490           0 : tcp_session_reset (u32 conn_index, u32 thread_index)
     491             : {
     492             :   tcp_connection_t *tc;
     493           0 :   tc = tcp_connection_get (conn_index, thread_index);
     494             : 
     495             :   /* For half-opens just cleanup */
     496           0 :   if (tc->state == TCP_STATE_SYN_SENT)
     497             :     {
     498           0 :       tcp_connection_cleanup (tc);
     499           0 :       return;
     500             :     }
     501             : 
     502           0 :   tcp_send_reset (tc);
     503           0 :   tcp_connection_timers_reset (tc);
     504           0 :   tcp_cong_recovery_off (tc);
     505           0 :   tcp_connection_set_state (tc, TCP_STATE_CLOSED);
     506           0 :   session_transport_closed_notify (&tc->connection);
     507           0 :   tcp_program_cleanup (tcp_get_worker (thread_index), tc); /* tc is not freed here; a cleanup request is queued */
     508             : }
     509             : 
     510             : /**
     511             :  * Initialize all connection timers as invalid and set rto to TCP_RTO_INIT
     512             :  */
     513             : void
     514         438 : tcp_connection_timers_init (tcp_connection_t * tc)
     515             : {
     516             :   int i;
     517             : 
     518             :   /* Set all to invalid */
     519        2190 :   for (i = 0; i < TCP_N_TIMERS; i++)
     520             :     {
     521        1752 :       tc->timers[i] = TCP_TIMER_HANDLE_INVALID;
     522             :     }
     523             : 
     524         438 :   tc->rto = TCP_RTO_INIT;
     525         438 : }
     526             : 
     527             : /**
     528             :  * Stop all connection timers
     529             :  */
     530             : void
     531         652 : tcp_connection_timers_reset (tcp_connection_t * tc)
     532             : {
     533         652 :   tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);
     534             :   int i;
     535             : 
     536        3260 :   for (i = 0; i < TCP_N_TIMERS; i++) /* every timer id, without checking tc->timers[i] here */
     537        2608 :     tcp_timer_reset (&wrk->timer_wheel, tc, i);
     538         652 : }
     539             : 
     540             : #if 0 /* compiled out: nothing down to the matching #endif is built */
     541             : typedef struct ip4_tcp_hdr
     542             : {
     543             :   ip4_header_t ip;
     544             :   tcp_header_t tcp;
     545             : } ip4_tcp_hdr_t;
     546             : 
     547             : typedef struct ip6_tcp_hdr
     548             : {
     549             :   ip6_header_t ip;
     550             :   tcp_header_t tcp;
     551             : } ip6_tcp_hdr_t;
     552             : 
     553             : static void /* Pick a load-balance bucket from a flow hash over a header built from tc's addresses and ports */
     554             : tcp_connection_select_lb_bucket (tcp_connection_t * tc, const dpo_id_t * dpo,
     555             :                                  dpo_id_t * result)
     556             : {
     557             :   const dpo_id_t *choice;
     558             :   load_balance_t *lb;
     559             :   int hash;
     560             : 
     561             :   lb = load_balance_get (dpo->dpoi_index);
     562             :   if (tc->c_is_ip4)
     563             :     {
     564             :       ip4_tcp_hdr_t hdr;
     565             :       clib_memset (&hdr, 0, sizeof (hdr));
     566             :       hdr.ip.protocol = IP_PROTOCOL_TCP;
     567             :       hdr.ip.address_pair.src.as_u32 = tc->c_lcl_ip.ip4.as_u32;
     568             :       hdr.ip.address_pair.dst.as_u32 = tc->c_rmt_ip.ip4.as_u32;
     569             :       hdr.tcp.src_port = tc->c_lcl_port;
     570             :       hdr.tcp.dst_port = tc->c_rmt_port;
     571             :       hash = ip4_compute_flow_hash (&hdr.ip, lb->lb_hash_config);
     572             :     }
     573             :   else
     574             :     {
     575             :       ip6_tcp_hdr_t hdr;
     576             :       clib_memset (&hdr, 0, sizeof (hdr));
     577             :       hdr.ip.protocol = IP_PROTOCOL_TCP;
     578             :       clib_memcpy_fast (&hdr.ip.src_address, &tc->c_lcl_ip.ip6,
     579             :                         sizeof (ip6_address_t));
     580             :       clib_memcpy_fast (&hdr.ip.dst_address, &tc->c_rmt_ip.ip6,
     581             :                         sizeof (ip6_address_t));
     582             :       hdr.tcp.src_port = tc->c_lcl_port;
     583             :       hdr.tcp.dst_port = tc->c_rmt_port;
     584             :       hash = ip6_compute_flow_hash (&hdr.ip, lb->lb_hash_config);
     585             :     }
     586             :   choice = load_balance_get_bucket_i (lb, hash & lb->lb_n_buckets_minus_1);
     587             :   dpo_copy (result, choice);
     588             : }
     589             : 
     590             : fib_node_index_t /* Host-prefix (/32 or /128) lookup of tc's remote address in the FIB found from tc->c_fib_index */
     591             : tcp_lookup_rmt_in_fib (tcp_connection_t * tc)
     592             : {
     593             :   fib_prefix_t prefix;
     594             :   u32 fib_index;
     595             : 
     596             :   clib_memcpy_fast (&prefix.fp_addr, &tc->c_rmt_ip, sizeof (prefix.fp_addr));
     597             :   prefix.fp_proto = tc->c_is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6;
     598             :   prefix.fp_len = tc->c_is_ip4 ? 32 : 128;
     599             :   fib_index = fib_table_find (prefix.fp_proto, tc->c_fib_index);
     600             :   return fib_table_lookup (fib_index, &prefix);
     601             : }
     602             : 
     603             : static int /* Returns -1 when the fib entry's dpo is not a load-balance, 0 after stacking */
     604             : tcp_connection_stack_on_fib_entry (tcp_connection_t * tc)
     605             : {
     606             :   dpo_id_t choice = DPO_INVALID;
     607             :   u32 output_node_index;
     608             :   fib_entry_t *fe;
     609             : 
     610             :   fe = fib_entry_get (tc->c_rmt_fei);
     611             :   if (fe->fe_lb.dpoi_type != DPO_LOAD_BALANCE)
     612             :     return -1;
     613             : 
     614             :   tcp_connection_select_lb_bucket (tc, &fe->fe_lb, &choice);
     615             : 
     616             :   output_node_index =
     617             :     tc->c_is_ip4 ? tcp4_output_node.index : tcp6_output_node.index;
     618             :   dpo_stack_from_node (output_node_index, &tc->c_rmt_dpo, &choice);
     619             :   return 0;
     620             : }
     621             : 
     622             : /** Stack tcp connection on peer's fib entry.
     623             :  *
     624             :  * This ultimately populates the dpo the connection will use to send packets.
     625             :  */
     626             : static void
     627             : tcp_connection_fib_attach (tcp_connection_t * tc)
     628             : {
     629             :   tc->c_rmt_fei = tcp_lookup_rmt_in_fib (tc);
     630             : 
     631             :   ASSERT (tc->c_rmt_fei != FIB_NODE_INDEX_INVALID);
     632             : 
     633             :   tcp_connection_stack_on_fib_entry (tc); /* return value is ignored */
     634             : }
     635             : #endif /* 0 */
     636             : 
     637             : /**
     638             :  * Generate random iss as per rfc6528
     639             :  */
     640             : static u32
     641         267 : tcp_generate_random_iss (tcp_connection_t * tc)
     642             : {
     643         267 :   tcp_main_t *tm = &tcp_main;
     644             :   u64 tmp;
     645             : 
     646         267 :   if (tc->c_is_ip4)
     647         265 :     tmp = (u64) tc->c_lcl_ip.ip4.as_u32 << 32 | (u64) tc->c_rmt_ip.ip4.as_u32;
     648             :   else
     649           2 :     tmp = tc->c_lcl_ip.ip6.as_u64[0] ^ tc->c_lcl_ip.ip6.as_u64[1]
     650           2 :       ^ tc->c_rmt_ip.ip6.as_u64[0] ^ tc->c_rmt_ip.ip6.as_u64[1];
     651             : 
     652         267 :   tmp ^= tm->iss_seed.first | ((u64) tc->c_lcl_port << 16 | tc->c_rmt_port);
     653         267 :   tmp ^= tm->iss_seed.second;
     654         267 :   tmp = clib_xxhash (tmp) + clib_cpu_time_now ();
     655         267 :   return ((tmp >> 32) ^ (tmp & 0xffffffff));
     656             : }
     657             : 
     658             : /**
     659             :  * Initialize max segment size we're able to process.
     660             :  *
     661             :  * The value is constrained by the output interface's MTU and by the size
     662             :  * of the IP and TCP headers (see RFC6691). It is also what we advertise
     663             :  * to our peer.
     664             :  */
     665             : static void
     666         534 : tcp_init_rcv_mss (tcp_connection_t * tc)
     667             : {
     668             :   u8 ip_hdr_len;
     669             : 
     670             :   /* Already provided at connection init time */
     671         534 :   if (tc->mss)
     672         267 :     return;
     673             : 
     674         267 :   ip_hdr_len = tc->c_is_ip4 ? sizeof (ip4_header_t) : sizeof (ip6_header_t);
     675         267 :   tc->mss = tcp_cfg.default_mtu - sizeof (tcp_header_t) - ip_hdr_len;
     676             : }
     677             : 
     678             : static void
     679         267 : tcp_init_mss (tcp_connection_t * tc)
     680             : {
     681         267 :   u16 default_min_mss = 536;
     682             : 
     683         267 :   tcp_init_rcv_mss (tc);
     684             : 
     685             :   /* TODO consider PMTU discovery */
     686         267 :   tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss);
     687             : 
     688         267 :   if (tc->snd_mss < 45)
     689             :     {
     690             :       /* Assume that at least the min default mss works */
     691           3 :       tc->snd_mss = default_min_mss;
     692           3 :       tc->rcv_opts.mss = default_min_mss;
     693             :     }
     694             : 
     695             :   /* We should have enough space for 40 bytes of options */
     696         267 :   ASSERT (tc->snd_mss > 45);
     697             : 
     698             :   /* If we use timestamp option, account for it and make sure
     699             :    * the options are 4-byte aligned */
     700         267 :   if (tcp_opts_tstamp (&tc->rcv_opts))
     701         264 :     tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP + 2 /* alignment */;
     702         267 : }
     703             : 
     704             : /**
     705             :  * Initialize connection send variables.
     706             :  */
     707             : void
     708         267 : tcp_init_snd_vars (tcp_connection_t * tc)
     709             : {
     710             :   /*
     711             :    * We use the time to randomize iss and for setting up the initial
     712             :    * timestamp. Make sure it's updated otherwise syn and ack in the
     713             :    * handshake may make it look as if time has flown in the opposite
     714             :    * direction for us.
     715             :    */
     716         267 :   tcp_update_time_now (tcp_get_worker (vlib_get_thread_index ()));
     717             : 
     718         267 :   tcp_init_rcv_mss (tc);
     719             :   /*
     720             :    * In special case of early-kill of timewait socket, the iss will already
     721             :    * be initialized to ensure it is greater than the last incarnation of the
     722             :    * connection. see syn_during_timewait() for more details.
     723             :    */
     724         267 :   if (!tc->iss)
     725         267 :     tc->iss = tcp_generate_random_iss (tc);
     726         267 :   tc->snd_una = tc->iss;
     727         267 :   tc->snd_nxt = tc->iss + 1;
     728         267 :   tc->srtt = 0.1 * THZ;              /* 100 ms */
     729             : 
     730         267 :   if (!tcp_cfg.csum_offload)
     731           0 :     tc->cfg_flags |= TCP_CFG_F_NO_CSUM_OFFLOAD;
     732         267 : }
     733             : 
     734             : void
     735         267 : tcp_enable_pacing (tcp_connection_t * tc)
     736             : {
     737             :   u32 byte_rate;
     738         267 :   byte_rate = tc->cwnd / (tc->srtt * TCP_TICK);
     739         267 :   transport_connection_tx_pacer_init (&tc->connection, byte_rate, tc->cwnd);
     740         267 :   tc->mrtt_us = (u32) ~ 0;
     741         267 : }
     742             : 
     743             : /** Initialize tcp connection variables
     744             :  *
     745             :  * Should be called after having received a msg from the peer, i.e., a SYN or
     746             :  * a SYNACK, such that connection options have already been exchanged. */
     747             : void
     748         267 : tcp_connection_init_vars (tcp_connection_t * tc)
     749             : {
     750         267 :   tcp_connection_timers_init (tc);
     751         267 :   tcp_init_mss (tc);
     752         267 :   scoreboard_init (&tc->sack_sb);
     753         267 :   if (tc->state == TCP_STATE_SYN_RCVD)
     754         135 :     tcp_init_snd_vars (tc);
     755             : 
     756         267 :   tcp_cc_init (tc);
     757             : 
     758         267 :   if (!tc->c_is_ip4 && ip6_address_is_link_local_unicast (&tc->c_rmt_ip6))
     759           0 :     tcp_add_del_adjacency (tc, 1);
     760             : 
     761             :   /*  tcp_connection_fib_attach (tc); */
     762             : 
     763         267 :   if (transport_connection_is_tx_paced (&tc->connection)
     764         267 :       || tcp_cfg.enable_tx_pacing)
     765         267 :     tcp_enable_pacing (tc);
     766             : 
     767         267 :   if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
     768           0 :     tcp_bt_init (tc);
     769             : 
     770         267 :   if (!tcp_cfg.allow_tso)
     771         267 :     tc->cfg_flags |= TCP_CFG_F_NO_TSO;
     772             : 
     773         267 :   tc->start_ts = tcp_time_now_us (tc->c_thread_index);
     774         267 : }
     775             : 
     776             : static int
     777           0 : tcp_alloc_custom_local_endpoint (ip46_address_t *lcl_addr, u16 *lcl_port,
     778             :                                  transport_endpoint_cfg_t *rmt)
     779             : {
     780           0 :   tcp_main_t *tm = vnet_get_tcp_main ();
     781             :   int index, port;
     782             : 
     783           0 :   if (rmt->is_ip4)
     784             :     {
     785           0 :       index = tm->last_v4_addr_rotor++;
     786           0 :       if (tm->last_v4_addr_rotor >= vec_len (tcp_cfg.ip4_src_addrs))
     787           0 :         tm->last_v4_addr_rotor = 0;
     788           0 :       clib_memset (lcl_addr, 0, sizeof (*lcl_addr));
     789           0 :       lcl_addr->ip4.as_u32 = tcp_cfg.ip4_src_addrs[index].as_u32;
     790             :     }
     791             :   else
     792             :     {
     793           0 :       index = tm->last_v6_addr_rotor++;
     794           0 :       if (tm->last_v6_addr_rotor >= vec_len (tcp_cfg.ip6_src_addrs))
     795           0 :         tm->last_v6_addr_rotor = 0;
     796           0 :       clib_memcpy_fast (&lcl_addr->ip6, &tcp_cfg.ip6_src_addrs[index],
     797             :                         sizeof (ip6_address_t));
     798             :     }
     799           0 :   port = transport_alloc_local_port (TRANSPORT_PROTO_TCP, lcl_addr, rmt);
     800           0 :   if (port < 1)
     801           0 :     return SESSION_E_NOPORT;
     802           0 :   *lcl_port = port;
     803           0 :   return 0;
     804             : }
     805             : 
/**
 * Open a tcp connection to a remote endpoint.
 *
 * Allocates a local endpoint, creates a half-open connection populated from
 * the endpoint config, moves it to SYN_SENT and sends the SYN.
 *
 * @param rmt remote endpoint configuration
 * @return index of the half-open connection on success, or the non-zero
 *         value returned by the local endpoint allocation on failure
 */
static int
tcp_session_open (transport_endpoint_cfg_t * rmt)
{
  tcp_connection_t *tc;
  ip46_address_t lcl_addr;
  u16 lcl_port;
  int rv;

  /*
   * Allocate local endpoint
   */
  /* Use the configured source addresses if any exist for this address
   * family, otherwise let the transport layer pick address and port */
  if ((rmt->is_ip4 && vec_len (tcp_cfg.ip4_src_addrs))
      || (!rmt->is_ip4 && vec_len (tcp_cfg.ip6_src_addrs)))
    rv = tcp_alloc_custom_local_endpoint (&lcl_addr, &lcl_port, rmt);
  else
    rv = transport_alloc_local_endpoint (TRANSPORT_PROTO_TCP, rmt, &lcl_addr,
                                         &lcl_port);

  if (rv)
    return rv;

  /*
   * Create connection and send SYN
   */
  tc = tcp_half_open_connection_alloc ();
  ip_copy (&tc->c_rmt_ip, &rmt->ip, rmt->is_ip4);
  ip_copy (&tc->c_lcl_ip, &lcl_addr, rmt->is_ip4);
  tc->c_rmt_port = rmt->port;
  /* The allocated local port is converted to network order here, the
   * remote port is copied from the endpoint config as is */
  tc->c_lcl_port = clib_host_to_net_u16 (lcl_port);
  tc->c_is_ip4 = rmt->is_ip4;
  tc->c_proto = TRANSPORT_PROTO_TCP;
  tc->c_fib_index = rmt->fib_index;
  tc->cc_algo = tcp_cc_algo_get (tcp_cfg.cc_algo);
  /* The other connection vars will be initialized after SYN ACK */
  tcp_connection_timers_init (tc);
  /* A non-zero mss here is kept as is by tcp_init_rcv_mss */
  tc->mss = rmt->mss;
  if (rmt->peer.sw_if_index != ENDPOINT_INVALID_INDEX)
    tc->sw_if_index = rmt->peer.sw_if_index;
  tc->next_node_index = rmt->next_node_index;
  tc->next_node_opaque = rmt->next_node_opaque;

  TCP_EVT (TCP_EVT_OPEN, tc);
  tc->state = TCP_STATE_SYN_SENT;
  tcp_init_snd_vars (tc);
  tcp_send_syn (tc);

  return tc->c_c_index;
}
     854             : 
     855             : static u8 *
     856          14 : format_tcp_session (u8 * s, va_list * args)
     857             : {
     858          14 :   u32 tci = va_arg (*args, u32);
     859          14 :   u32 thread_index = va_arg (*args, u32);
     860          14 :   u32 verbose = va_arg (*args, u32);
     861             :   tcp_connection_t *tc;
     862             : 
     863          14 :   tc = tcp_connection_get (tci, thread_index);
     864          14 :   if (tc)
     865          14 :     s = format (s, "%U", format_tcp_connection, tc, verbose);
     866             :   else
     867           0 :     s = format (s, "empty\n");
     868          14 :   return s;
     869             : }
     870             : 
     871             : static u8 *
     872           0 : format_tcp_listener_session (u8 * s, va_list * args)
     873             : {
     874           0 :   u32 tci = va_arg (*args, u32);
     875           0 :   u32 __clib_unused thread_index = va_arg (*args, u32);
     876           0 :   u32 verbose = va_arg (*args, u32);
     877           0 :   tcp_connection_t *tc = tcp_listener_get (tci);
     878           0 :   s = format (s, "%-" SESSION_CLI_ID_LEN "U", format_tcp_connection_id, tc);
     879           0 :   if (verbose)
     880           0 :     s = format (s, "%-" SESSION_CLI_STATE_LEN "U", format_tcp_state,
     881           0 :                 tc->state);
     882           0 :   return s;
     883             : }
     884             : 
     885             : static u8 *
     886           0 : format_tcp_half_open_session (u8 * s, va_list * args)
     887             : {
     888           0 :   u32 tci = va_arg (*args, u32);
     889           0 :   u32 __clib_unused thread_index = va_arg (*args, u32);
     890           0 :   u32 verbose = va_arg (*args, u32);
     891             :   tcp_connection_t *tc;
     892           0 :   u8 *state = 0;
     893             : 
     894           0 :   tc = tcp_half_open_connection_get (tci);
     895           0 :   if (tc->flags & TCP_CONN_HALF_OPEN_DONE)
     896           0 :     state = format (state, "%s", "CLOSED");
     897             :   else
     898           0 :     state = format (state, "%U", format_tcp_state, tc->state);
     899           0 :   s = format (s, "%-" SESSION_CLI_ID_LEN "U", format_tcp_connection_id, tc);
     900           0 :   if (verbose)
     901           0 :     s = format (s, "%-" SESSION_CLI_STATE_LEN "v", state);
     902           0 :   vec_free (state);
     903           0 :   return s;
     904             : }
     905             : 
     906             : static transport_connection_t *
     907     5581160 : tcp_session_get_transport (u32 conn_index, u32 thread_index)
     908             : {
     909     5581160 :   tcp_connection_t *tc = tcp_connection_get (conn_index, thread_index);
     910     5581160 :   if (PREDICT_FALSE (!tc))
     911           0 :     return 0;
     912     5581160 :   return &tc->connection;
     913             : }
     914             : 
     915             : static transport_connection_t *
     916         264 : tcp_half_open_session_get_transport (u32 conn_index)
     917             : {
     918         264 :   tcp_connection_t *tc = tcp_half_open_connection_get (conn_index);
     919         264 :   return &tc->connection;
     920             : }
     921             : 
     922             : static int
     923           0 : tcp_set_attribute (tcp_connection_t *tc, transport_endpt_attr_t *attr)
     924             : {
     925           0 :   int rv = 0;
     926             : 
     927           0 :   switch (attr->type)
     928             :     {
     929           0 :     case TRANSPORT_ENDPT_ATTR_NEXT_OUTPUT_NODE:
     930           0 :       tc->next_node_index = attr->next_output_node & 0xffffffff;
     931           0 :       tc->next_node_opaque = attr->next_output_node >> 32;
     932           0 :       break;
     933           0 :     case TRANSPORT_ENDPT_ATTR_MSS:
     934           0 :       tc->mss = attr->mss;
     935           0 :       tc->snd_mss = clib_min (tc->snd_mss, tc->mss);
     936           0 :       break;
     937           0 :     case TRANSPORT_ENDPT_ATTR_FLAGS:
     938           0 :       if (attr->flags & TRANSPORT_ENDPT_ATTR_F_CSUM_OFFLOAD)
     939           0 :         tc->cfg_flags |= TCP_CFG_F_NO_CSUM_OFFLOAD;
     940             :       else
     941           0 :         tc->cfg_flags &= ~TCP_CFG_F_NO_CSUM_OFFLOAD;
     942           0 :       if (attr->flags & TRANSPORT_ENDPT_ATTR_F_GSO)
     943             :         {
     944           0 :           if (!(tc->cfg_flags & TCP_CFG_F_TSO))
     945           0 :             tcp_check_gso (tc);
     946           0 :           tc->cfg_flags &= ~TCP_CFG_F_NO_TSO;
     947             :         }
     948             :       else
     949             :         {
     950           0 :           tc->cfg_flags |= TCP_CFG_F_NO_TSO;
     951           0 :           tc->cfg_flags &= ~TCP_CFG_F_TSO;
     952             :         }
     953           0 :       if (attr->flags & TRANSPORT_ENDPT_ATTR_F_RATE_SAMPLING)
     954             :         {
     955           0 :           if (!(tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE))
     956           0 :             tcp_bt_init (tc);
     957           0 :           tc->cfg_flags |= TCP_CFG_F_RATE_SAMPLE;
     958             :         }
     959             :       else
     960             :         {
     961           0 :           if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
     962           0 :             tcp_bt_cleanup (tc);
     963           0 :           tc->cfg_flags &= ~TCP_CFG_F_RATE_SAMPLE;
     964             :         }
     965           0 :       break;
     966           0 :     case TRANSPORT_ENDPT_ATTR_CC_ALGO:
     967           0 :       if (tc->cc_algo == tcp_cc_algo_get (attr->cc_algo))
     968           0 :         break;
     969           0 :       tcp_cc_cleanup (tc);
     970           0 :       tc->cc_algo = tcp_cc_algo_get (attr->cc_algo);
     971           0 :       tcp_cc_init (tc);
     972           0 :       break;
     973           0 :     default:
     974           0 :       rv = -1;
     975           0 :       break;
     976             :     }
     977             : 
     978           0 :   return rv;
     979             : }
     980             : 
     981             : static int
     982           3 : tcp_get_attribute (tcp_connection_t *tc, transport_endpt_attr_t *attr)
     983             : {
     984           3 :   int rv = 0;
     985             :   u64 non;
     986             : 
     987           3 :   switch (attr->type)
     988             :     {
     989           0 :     case TRANSPORT_ENDPT_ATTR_NEXT_OUTPUT_NODE:
     990           0 :       non = (u64) tc->next_node_opaque << 32 | tc->next_node_index;
     991           0 :       attr->next_output_node = non;
     992           0 :       break;
     993           3 :     case TRANSPORT_ENDPT_ATTR_MSS:
     994           3 :       attr->mss = tc->snd_mss;
     995           3 :       break;
     996           0 :     case TRANSPORT_ENDPT_ATTR_FLAGS:
     997           0 :       attr->flags = 0;
     998           0 :       if (!(tc->cfg_flags & TCP_CFG_F_NO_CSUM_OFFLOAD))
     999           0 :         attr->flags |= TRANSPORT_ENDPT_ATTR_F_CSUM_OFFLOAD;
    1000           0 :       if (tc->cfg_flags & TCP_CFG_F_TSO)
    1001           0 :         attr->flags |= TRANSPORT_ENDPT_ATTR_F_GSO;
    1002           0 :       if (tc->cfg_flags & TCP_CFG_F_RATE_SAMPLE)
    1003           0 :         attr->flags |= TRANSPORT_ENDPT_ATTR_F_RATE_SAMPLING;
    1004           0 :       break;
    1005           0 :     case TRANSPORT_ENDPT_ATTR_CC_ALGO:
    1006           0 :       attr->cc_algo = tc->cc_algo - tcp_main.cc_algos;
    1007           0 :       break;
    1008           0 :     default:
    1009           0 :       rv = -1;
    1010           0 :       break;
    1011             :     }
    1012             : 
    1013           3 :   return rv;
    1014             : }
    1015             : 
    1016             : static int
    1017           3 : tcp_session_attribute (u32 conn_index, u32 thread_index, u8 is_get,
    1018             :                        transport_endpt_attr_t *attr)
    1019             : {
    1020           3 :   tcp_connection_t *tc = tcp_connection_get (conn_index, thread_index);
    1021             : 
    1022           3 :   if (PREDICT_FALSE (!tc))
    1023           0 :     return -1;
    1024             : 
    1025           3 :   if (is_get)
    1026           3 :     return tcp_get_attribute (tc, attr);
    1027             :   else
    1028           0 :     return tcp_set_attribute (tc, attr);
    1029             : }
    1030             : 
    1031             : static u16
    1032           0 : tcp_session_cal_goal_size (tcp_connection_t * tc)
    1033             : {
    1034           0 :   u16 goal_size = tc->snd_mss;
    1035             : 
    1036           0 :   goal_size = tcp_cfg.max_gso_size - tc->snd_mss % tcp_cfg.max_gso_size;
    1037           0 :   goal_size = clib_min (goal_size, tc->snd_wnd / 2);
    1038             : 
    1039           0 :   return goal_size > tc->snd_mss ? goal_size : tc->snd_mss;
    1040             : }
    1041             : 
    1042             : always_inline u32
    1043       99722 : tcp_round_snd_space (tcp_connection_t * tc, u32 snd_space)
    1044             : {
    1045       99722 :   if (PREDICT_FALSE (tc->snd_wnd < tc->snd_mss))
    1046             :     {
    1047           0 :       return tc->snd_wnd <= snd_space ? tc->snd_wnd : 0;
    1048             :     }
    1049             : 
    1050             :   /* If not snd_wnd constrained and we can't write at least a segment,
    1051             :    * don't try at all */
    1052       99722 :   if (PREDICT_FALSE (snd_space < tc->snd_mss))
    1053        2840 :     return snd_space < tc->cwnd ? 0 : snd_space;
    1054             : 
    1055             :   /* round down to mss multiple */
    1056       96882 :   return snd_space - (snd_space % tc->snd_mss);
    1057             : }
    1058             : 
/**
 * Compute tx window session is allowed to fill.
 *
 * Takes into account available send space, snd_mss and the congestion
 * state of the connection. If possible, the value returned is a multiple
 * of snd_mss.
 *
 * Returns 0 while the connection is in congestion recovery or closed. May
 * update tc->limited_transmit when dupacks or sacked bytes are present.
 *
 * @param tc tcp connection
 * @return number of bytes session is allowed to write
 */
static inline u32
tcp_snd_space_inline (tcp_connection_t * tc)
{
  int snd_space;

  /* Fast path is disabled when recovery is on. @ref tcp_session_custom_tx
   * controls both retransmits and the sending of new data while congested
   */
  if (PREDICT_FALSE (tcp_in_cong_recovery (tc)
                     || tc->state == TCP_STATE_CLOSED))
    return 0;

  snd_space = tcp_available_output_snd_space (tc);

  /* If we got dupacks or sacked bytes but we're not yet in recovery, try
   * to force the peer to send enough dupacks to start retransmitting as
   * per Limited Transmit (RFC3042)
   */
  if (PREDICT_FALSE (tc->rcv_dupacks || tc->sack_sb.sacked_bytes))
    {
      int snt_limited, n_pkts;

      /* Segment budget: scoreboard reorder - 1 with sack, 2 without */
      n_pkts = tcp_opts_sack_permitted (&tc->rcv_opts)
        ? tc->sack_sb.reorder - 1 : 2;

      /* Restart the limited transmit marker at snd_nxt if it fell more than
       * the budget behind snd_nxt or ended up ahead of it */
      if ((seq_lt (tc->limited_transmit, tc->snd_nxt - n_pkts * tc->snd_mss)
           || seq_gt (tc->limited_transmit, tc->snd_nxt)))
        tc->limited_transmit = tc->snd_nxt;

      ASSERT (seq_leq (tc->limited_transmit, tc->snd_nxt));

      /* Bytes already sent past the marker count against the budget. Note
       * that this replaces the send space computed above. */
      snt_limited = tc->snd_nxt - tc->limited_transmit;
      snd_space = clib_max (n_pkts * tc->snd_mss - snt_limited, 0);
    }
  return tcp_round_snd_space (tc, snd_space);
}
    1105             : 
/**
 * Non-inline wrapper around tcp_snd_space_inline.
 *
 * @param tc tcp connection
 * @return number of bytes session is allowed to write
 */
u32
tcp_snd_space (tcp_connection_t * tc)
{
  return tcp_snd_space_inline (tc);
}
    1111             : 
    1112             : static int
    1113       70832 : tcp_session_send_params (transport_connection_t * trans_conn,
    1114             :                          transport_send_params_t * sp)
    1115             : {
    1116       70832 :   tcp_connection_t *tc = (tcp_connection_t *) trans_conn;
    1117             : 
    1118             :   /* Ensure snd_mss does accurately reflect the amount of data we can push
    1119             :    * in a segment. This also makes sure that options are updated according to
    1120             :    * the current state of the connection. */
    1121       70832 :   tcp_update_burst_snd_vars (tc);
    1122             : 
    1123       70832 :   if (PREDICT_FALSE (tc->cfg_flags & TCP_CFG_F_TSO))
    1124           0 :     sp->snd_mss = tcp_session_cal_goal_size (tc);
    1125             :   else
    1126       70832 :     sp->snd_mss = tc->snd_mss;
    1127             : 
    1128       70832 :   sp->snd_space = clib_min (tcp_snd_space_inline (tc),
    1129             :                             tc->snd_wnd - (tc->snd_nxt - tc->snd_una));
    1130             : 
    1131       70832 :   ASSERT (seq_geq (tc->snd_nxt, tc->snd_una));
    1132             :   /* This still works if fast retransmit is on */
    1133       70832 :   sp->tx_offset = tc->snd_nxt - tc->snd_una;
    1134             : 
    1135       70832 :   sp->flags = sp->snd_space ? 0 : TRANSPORT_SND_F_DESCHED;
    1136             : 
    1137       70832 :   return 0;
    1138             : }
    1139             : 
/**
 * Handle expiration of the waitclose timer.
 *
 * What happens depends on the connection state. In all handled states but
 * CLOSE_WAIT with a pending FIN, the connection is moved to CLOSED and
 * programmed for cleanup. Unhandled states only log a warning.
 *
 * @param tc tcp connection whose waitclose timer expired
 */
static void
tcp_timer_waitclose_handler (tcp_connection_t * tc)
{
  tcp_worker_ctx_t *wrk = tcp_get_worker (tc->c_thread_index);

  switch (tc->state)
    {
    case TCP_STATE_CLOSE_WAIT:
      tcp_connection_timers_reset (tc);
      /* App never returned with a close */
      if (!(tc->flags & TCP_CONN_FINPNDG))
        {
          tcp_connection_set_state (tc, TCP_STATE_CLOSED);
          session_transport_closed_notify (&tc->connection);
          tcp_program_cleanup (wrk, tc);
          tcp_worker_stats_inc (wrk, to_closewait, 1);
          break;
        }

      /* Send FIN either way and switch to LAST_ACK. */
      tcp_cong_recovery_off (tc);
      /* Make sure we don't try to send unsent data */
      tc->snd_nxt = tc->snd_una;
      tcp_send_fin (tc);
      tcp_connection_set_state (tc, TCP_STATE_LAST_ACK);
      session_transport_closed_notify (&tc->connection);

      /* Make sure we don't wait in LAST ACK forever */
      tcp_timer_set (&wrk->timer_wheel, tc, TCP_TIMER_WAITCLOSE,
                     tcp_cfg.lastack_time);
      tcp_worker_stats_inc (wrk, to_closewait2, 1);

      /* Don't delete the connection yet */
      break;
    case TCP_STATE_FIN_WAIT_1:
      tcp_connection_timers_reset (tc);
      if (tc->flags & TCP_CONN_FINPNDG)
        {
          /* If FIN pending, we haven't sent everything, but we did try.
           * Notify session layer that transport is closed. */
          tcp_connection_set_state (tc, TCP_STATE_CLOSED);
          tcp_send_reset (tc);
          tcp_program_cleanup (wrk, tc);
        }
      else
        {
          /* We've sent the fin but no progress. Close the connection and
           * to make sure everything is flushed, setup a cleanup timer */
          tcp_connection_set_state (tc, TCP_STATE_CLOSED);
          tcp_program_cleanup (wrk, tc);
        }
      session_transport_closed_notify (&tc->connection);
      tcp_worker_stats_inc (wrk, to_finwait1, 1);
      break;
    case TCP_STATE_LAST_ACK:
      tcp_connection_timers_reset (tc);
      tcp_connection_set_state (tc, TCP_STATE_CLOSED);
      session_transport_closed_notify (&tc->connection);
      tcp_program_cleanup (wrk, tc);
      tcp_worker_stats_inc (wrk, to_lastack, 1);
      break;
    case TCP_STATE_CLOSING:
      tcp_connection_timers_reset (tc);
      tcp_connection_set_state (tc, TCP_STATE_CLOSED);
      session_transport_closed_notify (&tc->connection);
      tcp_program_cleanup (wrk, tc);
      tcp_worker_stats_inc (wrk, to_closing, 1);
      break;
    case TCP_STATE_FIN_WAIT_2:
      /* Unlike the other states, the reset is sent before timers are reset
       * and the state is changed */
      tcp_send_reset (tc);
      tcp_connection_timers_reset (tc);
      tcp_connection_set_state (tc, TCP_STATE_CLOSED);
      session_transport_closed_notify (&tc->connection);
      tcp_program_cleanup (wrk, tc);
      tcp_worker_stats_inc (wrk, to_finwait2, 1);
      break;
    case TCP_STATE_TIME_WAIT:
      /* No timer reset, session notification or stats update here */
      tcp_connection_set_state (tc, TCP_STATE_CLOSED);
      tcp_program_cleanup (wrk, tc);
      break;
    default:
      clib_warning ("waitclose in state: %U", format_tcp_state, tc->state);
      break;
    }
}
    1225             : 
/*
 * Timer expiration handlers, indexed by timer id when pending timers are
 * dispatched.
 * NOTE(review): entry order presumably has to match the TCP_TIMER_* ids
 * (retransmit, persist, waitclose, retransmit syn) - confirm against the
 * enum before reordering.
 */
/* *INDENT-OFF* */
static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] =
{
    tcp_timer_retransmit_handler,
    tcp_timer_persist_handler,
    tcp_timer_waitclose_handler,
    tcp_timer_retransmit_syn_handler,
};
/* *INDENT-ON* */
    1235             : 
/**
 * Dispatch expired timers queued on a worker.
 *
 * Pops at most max_timers_per_loop handles off the worker's pending timers
 * fifo and runs the matching expiration handler for each one that is still
 * valid. If timers are left over on thread 0,
 * session_queue_run_on_main_thread is called.
 *
 * @param wrk tcp worker whose pending timers are dispatched
 */
static void
tcp_dispatch_pending_timers (tcp_worker_ctx_t * wrk)
{
  u32 n_timers, connection_index, timer_id, thread_index, timer_handle;
  tcp_connection_t *tc;
  int i;

  if (!(n_timers = clib_fifo_elts (wrk->pending_timers)))
    return;

  thread_index = wrk->vm->thread_index;
  /* n_timers is sampled once, timers queued meanwhile wait for a later run */
  for (i = 0; i < clib_min (n_timers, wrk->max_timers_per_loop); i++)
    {
      clib_fifo_sub1 (wrk->pending_timers, timer_handle);
      /* Handle layout: timer id in the top 4 bits, connection index in the
       * low 28 bits */
      connection_index = timer_handle & 0x0FFFFFFF;
      timer_id = timer_handle >> 28;

      /* Syn retransmit timers belong to half-open connections, which are
       * looked up without a thread index */
      if (PREDICT_TRUE (timer_id != TCP_TIMER_RETRANSMIT_SYN))
        tc = tcp_connection_get (connection_index, thread_index);
      else
        tc = tcp_half_open_connection_get (connection_index);

      if (PREDICT_FALSE (!tc))
        continue;

      /* Skip if the timer is not pending. Probably it was reset while
       * waiting for dispatch */
      if (PREDICT_FALSE (!(tc->pending_timers & (1 << timer_id))))
        continue;

      tc->pending_timers &= ~(1 << timer_id);

      /* Skip timer if it was rearmed while pending dispatch */
      if (PREDICT_FALSE (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID))
        continue;

      (*timer_expiration_handlers[timer_id]) (tc);
    }

  /* Only thread 0 asks for another run when timers are left in the fifo */
  if (thread_index == 0 && clib_fifo_elts (wrk->pending_timers))
    session_queue_run_on_main_thread (wrk->vm);
}
    1278             : 
    /*
     * Process connection cleanup requests whose free_time has been reached.
     *
     * Requests are taken from the head of wrk->pending_cleanups and the loop
     * stops at the first request that is not yet due.
     * NOTE(review): this assumes requests are queued in non-decreasing
     * free_time order - confirm against the code that enqueues them.
     *
     * @param wrk  tcp worker context owning the cleanup fifo
     * @param now  current time, compared against each request's free_time
     */
    1279             : static void
    1280    74980300 : tcp_handle_cleanups (tcp_worker_ctx_t * wrk, clib_time_type_t now)
    1281             : {
    1282    74980300 :   u32 thread_index = wrk->vm->thread_index;
    1283             :   tcp_cleanup_req_t *req;
    1284             :   tcp_connection_t *tc;
    1285             : 
    1286    74980500 :   while (clib_fifo_elts (wrk->pending_cleanups))
    1287             :     {
                             /* Peek at the head first, it is only removed once it is due */
    1288     1741580 :       req = clib_fifo_head (wrk->pending_cleanups);
    1289     1741580 :       if (req->free_time > now)
    1290     1741450 :         break;
    1291         134 :       clib_fifo_sub2 (wrk->pending_cleanups, req);
    1292         134 :       tc = tcp_connection_get (req->connection_index, thread_index);
                             /* Lookup returned no connection, drop the request */
    1293         134 :       if (PREDICT_FALSE (!tc))
    1294           0 :         continue;
                             /* Notify the session layer, then clean up the connection */
    1295         134 :       session_transport_delete_notify (&tc->connection);
    1296         134 :       tcp_connection_cleanup (tc);
    1297             :     }
    1298    74950600 : }
    1299             : 
    /*
     * Per-thread time update, installed as the .update_time callback of the
     * tcp_proto transport vft.
     *
     * In order: records the new time on the worker, handles due connection
     * cleanups, expires timers on the worker's timer wheel and dispatches
     * the timers that are pending.
     *
     * @param now           current time
     * @param thread_index  index of the thread whose tcp worker is updated
     */
    1300             : static void
    1301    75003500 : tcp_update_time (f64 now, u8 thread_index)
    1302             : {
    1303    75003500 :   tcp_worker_ctx_t *wrk = tcp_get_worker (thread_index);
    1304             : 
    1305    74994700 :   tcp_set_time_now (wrk, now);
    1306    74985200 :   tcp_handle_cleanups (wrk, now);
    1307    74948500 :   tcp_timer_expire_timers (&wrk->timer_wheel, now);
    1308    74948200 :   tcp_dispatch_pending_timers (wrk);
    1309    74901800 : }
    1310             : 
    /*
     * Transport .flush_data callback: mark the connection as having a PSH
     * pending, unless one is pending already.
     *
     * @param tconn  transport connection, cast to the tcp connection type
     */
    1311             : static void
    1312          79 : tcp_session_flush_data (transport_connection_t * tconn)
    1313             : {
    1314          79 :   tcp_connection_t *tc = (tcp_connection_t *) tconn;
                         /* A PSH is already pending, keep the psh_seq recorded earlier */
    1315          79 :   if (tc->flags & TCP_CONN_PSH_PENDING)
    1316          55 :     return;
    1317          24 :   tc->flags |= TCP_CONN_PSH_PENDING;
                         /* NOTE(review): presumably the sequence number of the last byte
                          * currently available for tx - confirm the semantics of
                          * transport_max_tx_dequeue() */
    1318          24 :   tc->psh_seq = tc->snd_una + transport_max_tx_dequeue (tconn) - 1;
    1319             : }
    1320             : 
    /*
     * Transport .app_rx_evt callback.
     *
     * Does nothing unless TCP_CONN_ZERO_RWND_SENT is set on the connection.
     * Otherwise, once the rx fifo has at least min_free space to enqueue, an
     * ack is sent; while it has less, a dequeue notification is requested.
     *
     * @param conn  transport connection, cast to the tcp connection type
     * @return 0 on every path
     */
    1321             : static int
    1322           0 : tcp_session_app_rx_evt (transport_connection_t *conn)
    1323             : {
    1324           0 :   tcp_connection_t *tc = (tcp_connection_t *) conn;
                         /* Clamp bounds for min_free: 4 << 10 = 4096, 128 << 10 = 131072 */
    1325           0 :   u32 min_free, lo = 4 << 10, hi = 128 << 10;
    1326             : 
    1327           0 :   if (!(tc->flags & TCP_CONN_ZERO_RWND_SENT))
    1328           0 :     return 0;
    1329             : 
                         /* Threshold is 1/8 of the rx fifo size, clamped to [lo, hi] */
    1330           0 :   min_free = clib_clamp (transport_rx_fifo_size (conn) >> 3, lo, hi);
    1331           0 :   if (transport_max_rx_enqueue (conn) < min_free)
    1332             :     {
                             /* Not enough free space yet, ask to be notified on dequeue */
    1333           0 :       transport_rx_fifo_req_deq_ntf (conn);
    1334           0 :       return 0;
    1335             :     }
    1336             : 
    1337           0 :   tcp_send_ack (tc);
    1338             : 
    1339           0 :   return 0;
    1340             : }
    1341             : 
    1342             : /* *INDENT-OFF* */
    /*
     * TCP transport virtual function table. tcp_init() registers it with the
     * session layer for TRANSPORT_PROTO_TCP, once for IP4 and once for IP6.
     */
    1343             : const static transport_proto_vft_t tcp_proto = {
    1344             :   .enable = vnet_tcp_enable_disable,
    1345             :   .start_listen = tcp_session_bind,
    1346             :   .stop_listen = tcp_session_unbind,
    1347             :   .push_header = tcp_session_push_header,
    1348             :   .get_connection = tcp_session_get_transport,
    1349             :   .get_listener = tcp_session_get_listener,
    1350             :   .get_half_open = tcp_half_open_session_get_transport,
    1351             :   .attribute = tcp_session_attribute,
    1352             :   .connect = tcp_session_open,
    1353             :   .half_close = tcp_session_half_close,
    1354             :   .close = tcp_session_close,
    1355             :   .cleanup = tcp_session_cleanup,
    1356             :   .cleanup_ho = tcp_session_cleanup_ho,
    1357             :   .reset = tcp_session_reset,
    1358             :   .send_params = tcp_session_send_params,
    1359             :   .update_time = tcp_update_time,
    1360             :   .flush_data = tcp_session_flush_data,
    1361             :   .custom_tx = tcp_session_custom_tx,
    1362             :   .app_rx_evt = tcp_session_app_rx_evt,
    1363             :   .format_connection = format_tcp_session,
    1364             :   .format_listener = format_tcp_listener_session,
    1365             :   .format_half_open = format_tcp_half_open_session,
    1366             :   .transport_options = {
    1367             :     .name = "tcp",
    1368             :     .short_name = "T",
    1369             :     .tx_type = TRANSPORT_TX_PEEK,
    1370             :     .service_type = TRANSPORT_SERVICE_VC,
    1371             :   },
    1372             : };
    1373             : /* *INDENT-ON* */
    1374             : 
    /*
     * Update the connection's tx pacer with the current pacing rate and an
     * rtt estimate. No-op if the connection is not tx paced.
     *
     * @param tc  tcp connection whose pacer is updated
     */
    1375             : void
    1376       40937 : tcp_connection_tx_pacer_update (tcp_connection_t * tc)
    1377             : {
    1378       40937 :   if (!transport_connection_is_tx_paced (&tc->connection))
    1379           0 :     return;
    1380             : 
                         /* Use the smaller of the smoothed rtt (tc->srtt scaled by
                          * TCP_TICK) and tc->mrtt_us */
    1381       40937 :   f64 srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us);
    1382             : 
                         /* NOTE(review): the CLIB_US_TIME_FREQ factor presumably converts
                          * seconds to microseconds - confirm the unit the pacer expects */
    1383       40937 :   transport_connection_tx_pacer_update (&tc->connection,
    1384             :                                         tcp_cc_get_pacing_rate (tc),
    1385       40937 :                                         srtt * CLIB_US_TIME_FREQ);
    1386             : }
    1387             : 
    /*
     * Reset the connection's tx pacer with the current pacing rate, the
     * given start bucket and an rtt estimate.
     *
     * @param tc            tcp connection whose pacer is reset
     * @param window        not used by this function body
     * @param start_bucket  passed through to the transport pacer reset
     */
    1388             : void
    1389           0 : tcp_connection_tx_pacer_reset (tcp_connection_t * tc, u32 window,
    1390             :                                u32 start_bucket)
    1391             : {
                         /* Same rtt estimate as in tcp_connection_tx_pacer_update() */
    1392           0 :   f64 srtt = clib_min ((f64) tc->srtt * TCP_TICK, tc->mrtt_us);
    1393           0 :   transport_connection_tx_pacer_reset (&tc->connection,
    1394             :                                        tcp_cc_get_pacing_rate (tc),
    1395             :                                        start_bucket,
    1396           0 :                                        srtt * CLIB_US_TIME_FREQ);
    1397           0 : }
    1398             : 
    /*
     * Ask the transport layer to reschedule the connection if it is in
     * congestion recovery or tcp_snd_space_inline() reports a non-zero value.
     *
     * @param tc  tcp connection to reschedule
     */
    1399             : void
    1400       28890 : tcp_reschedule (tcp_connection_t * tc)
    1401             : {
    1402       28890 :   if (tcp_in_cong_recovery (tc) || tcp_snd_space_inline (tc))
    1403       27076 :     transport_connection_reschedule (&tc->connection);
    1404       28890 : }
    1405             : 
    /*
     * Timer wheel expiration callback (installed by tcp_main_enable()).
     *
     * Marks every expired timer as pending on its connection, queues the
     * handles on the worker's pending_timers fifo and recomputes how many
     * timers tcp_dispatch_pending_timers() may handle per loop.
     *
     * @param expired_timers  vector of expired timer handles; low 28 bits of
     *                        a handle are the connection index, the top 4
     *                        bits the timer id
     */
    1406             : static void
    1407          12 : tcp_expired_timers_dispatch (u32 * expired_timers)
    1408             : {
    1409          12 :   u32 thread_index = vlib_get_thread_index (), n_left, max_per_loop;
    1410             :   u32 connection_index, timer_id, n_expired, max_loops;
    1411             :   tcp_worker_ctx_t *wrk;
    1412             :   tcp_connection_t *tc;
    1413             :   int i;
    1414             : 
    1415          12 :   wrk = tcp_get_worker (thread_index);
    1416          12 :   n_expired = vec_len (expired_timers);
    1417          12 :   tcp_worker_stats_inc (wrk, timer_expirations, n_expired);
                         /* Handles still waiting for dispatch from earlier expirations */
    1418          12 :   n_left = clib_fifo_elts (wrk->pending_timers);
    1419             : 
    1420             :   /*
    1421             :    * Invalidate all timer handles before dispatching. This avoids dangling
    1422             :    * index references to timer wheel pool entries that have been freed.
    1423             :    */
    1424          24 :   for (i = 0; i < n_expired; i++)
    1425             :     {
    1426          12 :       connection_index = expired_timers[i] & 0x0FFFFFFF;
    1427          12 :       timer_id = expired_timers[i] >> 28;
    1428             : 
    1429          12 :       if (timer_id != TCP_TIMER_RETRANSMIT_SYN)
    1430          12 :         tc = tcp_connection_get (connection_index, thread_index);
    1431             :       else
    1432           0 :         tc = tcp_half_open_connection_get (connection_index);
    1433             : 
    1434             :       TCP_EVT (TCP_EVT_TIMER_POP, connection_index, timer_id);
    1435             : 
                             /* NOTE(review): unlike tcp_dispatch_pending_timers(), tc is not
                              * checked against 0 here - confirm a lookup cannot fail for a
                              * timer that has just expired */
    1436          12 :       tc->timers[timer_id] = TCP_TIMER_HANDLE_INVALID;
    1437          12 :       tc->pending_timers |= (1 << timer_id);
    1438             :     }
    1439             : 
    1440          12 :   clib_fifo_add (wrk->pending_timers, expired_timers, n_expired);
    1441             : 
                         /* Number of loops over which to spread the dispatch: the loops
                          * that fit into half a timer tick, at least 1. The cast must wrap
                          * the whole product: "(u32) 0.5 * ..." converts 0.5 to 0 first,
                          * which made max_loops always 1. */
    1442          12 :   max_loops =
    1443          12 :     clib_max ((u32) (0.5 * TCP_TIMER_TICK * wrk->vm->loops_per_second), 1);
                         /* Per-loop budget: at least 10, at most VLIB_FRAME_SIZE */
    1444          12 :   max_per_loop = clib_max ((n_left + n_expired) / max_loops, 10);
    1445          12 :   max_per_loop = clib_min (max_per_loop, VLIB_FRAME_SIZE);
                         /* Never lower the budget while older handles are still queued */
    1446          12 :   wrk->max_timers_per_loop = clib_max (n_left ? wrk->max_timers_per_loop : 0,
    1447             :                                        max_per_loop);
    1448             : 
    1449          12 :   if (thread_index == 0)
    1450          12 :     session_queue_run_on_main_thread (wrk->vm);
    1451          12 : }
    1452             : 
    /*
     * Initialize the two 64-bit words of tm->iss_seed with random values.
     *
     * @param tm  tcp main whose iss_seed is written
     */
    1453             : static void
    1454          49 : tcp_initialize_iss_seed (tcp_main_t * tm)
    1455             : {
    1456          49 :   u32 default_seed = random_default_seed ();
    1457          49 :   u64 time_now = clib_cpu_time_now ();
    1458             : 
                         /* First word: a random u32 placed in the upper 32 bits, the
                          * lower 32 bits are left zero */
    1459          49 :   tm->iss_seed.first = (u64) random_u32 (&default_seed) << 32;
                         /* Second word: random u64 seeded with the cpu time read above */
    1460          49 :   tm->iss_seed.second = random_u64 (&time_now);
    1461          49 : }
    1462             : 
    /*
     * Stats collector callback registered by tcp_counters_init().
     *
     * Sums every per-worker tcp stat across all entries of tm->wrk_ctx and
     * stores each total in the first counter vector of the stats entry, at
     * index TCP_STAT_<name>.
     *
     * @param d  collector data; d->entry->data is read as a vector of
     *           counter vectors
     */
    1463             : static void
    1464          25 : tcp_stats_collector_fn (vlib_stats_collector_data_t *d)
    1465             : {
    1466          25 :   tcp_main_t *tm = vnet_get_tcp_main ();
    1467          25 :   counter_t **counters = d->entry->data;
    1468          25 :   counter_t *cb = counters[0];
    1469          25 :   tcp_wrk_stats_t acc = {};
    1470             :   tcp_worker_ctx_t *wrk;
    1471             : 
                         /* Accumulate each stat over all workers */
    1472          71 :   vec_foreach (wrk, tm->wrk_ctx)
    1473             :     {
    1474             : #define _(name, type, str) acc.name += wrk->stats.name;
    1475          46 :       foreach_tcp_wrk_stat
    1476             : #undef _
    1477             :     }
    1478             : 
                         /* Publish the accumulated totals */
    1479             : #define _(name, type, str) cb[TCP_STAT_##name] = acc.name;
    1480          25 :   foreach_tcp_wrk_stat
    1481             : #undef _
    1482          25 : }
    1483             : 
    /*
     * Create the "/sys/tcp" counter vector, add one "/sys/tcp/<name>" symlink
     * per tcp worker stat and register tcp_stats_collector_fn() as collector.
     * Runs only once: later calls return early because tm->counters_init is
     * already set.
     *
     * @param tm  tcp main holding the counters_init flag
     */
    1484             : static void
    1485          49 : tcp_counters_init (tcp_main_t *tm)
    1486             : {
    1487          49 :   vlib_stats_collector_reg_t r = {};
    1488             :   u32 idx;
    1489             : 
    1490          49 :   if (tm->counters_init)
    1491           0 :     return;
    1492             : 
    1493          49 :   r.entry_index = idx = vlib_stats_add_counter_vector ("/sys/tcp");
    1494          49 :   r.collect_fn = tcp_stats_collector_fn;
                         /* NOTE(review): presumably sizes the vector so that index
                          * TCP_STAT_no_buffer is valid, which relies on no_buffer being
                          * the last stat - confirm against foreach_tcp_wrk_stat */
    1495          49 :   vlib_stats_validate (idx, 0, TCP_STAT_no_buffer);
    1496             : 
    1497             : #define _(name, type, str)                                                    \
    1498             :   vlib_stats_add_symlink (idx, TCP_STAT_##name, "/sys/tcp/%s",                \
    1499             :                           CLIB_STRING_MACRO (name));
    1500          49 :   foreach_tcp_wrk_stat
    1501             : #undef _
    1502             : 
    1503          49 :     vlib_stats_register_collector_fn (&r);
    1504             : 
    1505          49 :   tm->counters_init = 1;
    1506             : }
    1507             : 
    /*
     * Enable the tcp host stack: run the ip init functions it depends on,
     * register the tcp input nodes with ip4/ip6 and initialize the per-thread
     * worker contexts, the iss seed and the stats counters.
     * NOTE(review): nothing in this function sets tm->is_enabled, although
     * vnet_tcp_enable_disable() tests it - confirm where it is set.
     *
     * @param vm  vlib main
     * @return error from a failed init function call, 0 otherwise
     */
    1508             : static clib_error_t *
    1509          49 : tcp_main_enable (vlib_main_t * vm)
    1510             : {
    1511          49 :   vlib_thread_main_t *vtm = vlib_get_thread_main ();
    1512             :   u32 num_threads, n_workers, prealloc_conn_per_wrk;
    1513             :   tcp_connection_t *tc __attribute__ ((unused));
    1514          49 :   tcp_main_t *tm = vnet_get_tcp_main ();
    1515             :   tcp_worker_ctx_t *wrk;
    1516          49 :   clib_error_t *error = 0;
    1517             :   int thread;
    1518             : 
    1519          49 :   if ((error = vlib_call_init_function (vm, ip_main_init)))
    1520           0 :     return error;
    1521          49 :   if ((error = vlib_call_init_function (vm, ip4_lookup_init)))
    1522           0 :     return error;
    1523          49 :   if ((error = vlib_call_init_function (vm, ip6_lookup_init)))
    1524           0 :     return error;
    1525             : 
    1526             :   /*
    1527             :    * Registrations
    1528             :    */
    1529             : 
    1530          49 :   ip4_register_protocol (IP_PROTOCOL_TCP, tcp4_input_node.index);
    1531          49 :   ip6_register_protocol (IP_PROTOCOL_TCP, tcp6_input_node.index);
    1532             : 
    1533             :   /*
    1534             :    * Initialize data structures
    1535             :    */
    1536             : 
                         /* One worker context per thread, main thread included */
    1537          49 :   num_threads = 1 /* main thread */  + vtm->n_threads;
    1538          49 :   vec_validate (tm->wrk_ctx, num_threads - 1);
                         /* Preallocated connections are split among the worker threads,
                          * or given to the main thread if it is the only thread */
    1539          49 :   n_workers = num_threads == 1 ? 1 : vtm->n_threads;
    1540          49 :   prealloc_conn_per_wrk = tcp_cfg.preallocated_connections / n_workers;
    1541             : 
                         /* Resolve the tcp4/tcp6 output next node indices once on worker 0;
                          * the loop below copies them to the other workers */
    1542          49 :   wrk = &tm->wrk_ctx[0];
    1543          98 :   wrk->tco_next_node[0] = vlib_node_get_next (vm, session_queue_node.index,
    1544          49 :                                               tcp4_output_node.index);
    1545          98 :   wrk->tco_next_node[1] = vlib_node_get_next (vm, session_queue_node.index,
    1546          49 :                                               tcp6_output_node.index);
    1547             : 
    1548         119 :   for (thread = 0; thread < num_threads; thread++)
    1549             :     {
    1550          70 :       wrk = &tm->wrk_ctx[thread];
    1551             : 
                             /* NOTE(review): validate followed by reset length presumably
                              * pre-sizes the vectors to 256 entries while leaving them
                              * empty - confirm against vppinfra vec semantics */
    1552          70 :       vec_validate (wrk->pending_deq_acked, 255);
    1553          70 :       vec_validate (wrk->pending_disconnects, 255);
    1554          70 :       vec_validate (wrk->pending_resets, 255);
    1555          70 :       vec_reset_length (wrk->pending_deq_acked);
    1556          70 :       vec_reset_length (wrk->pending_disconnects);
    1557          70 :       vec_reset_length (wrk->pending_resets);
    1558          70 :       wrk->vm = vlib_get_main_by_index (thread);
                             /* Initial budget; tcp_expired_timers_dispatch() recomputes it */
    1559          70 :       wrk->max_timers_per_loop = 10;
    1560             : 
    1561          70 :       if (thread > 0)
    1562             :         {
    1563          21 :           wrk->tco_next_node[0] = tm->wrk_ctx[0].tco_next_node[0];
    1564          21 :           wrk->tco_next_node[1] = tm->wrk_ctx[0].tco_next_node[1];
    1565             :         }
    1566             : 
    1567             :       /*
    1568             :        * Preallocate connections. Assume that thread 0 won't
    1569             :        * use preallocated connections when running multi-core
    1570             :        */
    1571          70 :       if ((thread > 0 || num_threads == 1) && prealloc_conn_per_wrk)
    1572           0 :         pool_init_fixed (wrk->connections, prealloc_conn_per_wrk);
    1573             : 
                             /* Expired timers are reported to tcp_expired_timers_dispatch() */
    1574          70 :       tcp_timer_initialize_wheel (&wrk->timer_wheel,
    1575             :                                   tcp_expired_timers_dispatch,
    1576             :                                   vlib_time_now (vm));
    1577             :     }
    1578             : 
    1579          49 :   tcp_initialize_iss_seed (tm);
    1580             : 
    1581          49 :   tm->bytes_per_buffer = vlib_buffer_get_default_data_size (vm);
    1582          49 :   tm->cc_last_type = TCP_CC_LAST;
    1583             : 
    1584          49 :   tcp_counters_init (tm);
    1585             : 
    1586          49 :   return error;
    1587             : }
    1588             : 
    /*
     * Enable or disable tcp; installed as the .enable callback of tcp_proto.
     *
     * On enable, returns 0 right away if tcp_main.is_enabled is already set,
     * otherwise returns the result of tcp_main_enable(). On disable, only
     * clears tcp_main.is_enabled; nothing is torn down here.
     *
     * @param vm     vlib main, passed on to tcp_main_enable()
     * @param is_en  non-zero to enable, zero to disable
     * @return error from tcp_main_enable(), 0 otherwise
     */
    1589             : clib_error_t *
    1590          57 : vnet_tcp_enable_disable (vlib_main_t * vm, u8 is_en)
    1591             : {
    1592          57 :   if (is_en)
    1593             :     {
    1594          49 :       if (tcp_main.is_enabled)
    1595           0 :         return 0;
    1596             : 
    1597          49 :       return tcp_main_enable (vm);
    1598             :     }
    1599             :   else
    1600             :     {
    1601           8 :       tcp_main.is_enabled = 0;
    1602             :     }
    1603             : 
    1604           8 :   return 0;
    1605             : }
    1606             : 
    /*
     * Set the per address family punt_unknown flag in tcp_main.
     *
     * @param vm      not used by this function body
     * @param is_ip4  non-zero selects punt_unknown4, zero punt_unknown6
     * @param is_add  value stored in the selected flag
     */
    1607             : void
    1608           4 : tcp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add)
    1609             : {
    1610           4 :   tcp_main_t *tm = &tcp_main;
    1611           4 :   if (is_ip4)
    1612           2 :     tm->punt_unknown4 = is_add;
    1613             :   else
    1614           2 :     tm->punt_unknown6 = is_add;
    1615           4 : }
    1616             : 
    1617             : /**
    1618             :  * Initialize default values for tcp parameters
                       *
                       * Writes the defaults into tcp_cfg. Called from tcp_init().
    1619             :  */
    1620             : static void
    1621         559 : tcp_configuration_init (void)
    1622             : {
    1623             :   /* Initial wnd for SYN. Fifos are not allocated at that point so use some
    1624             :    * predefined value. For SYN-ACK we still want the scale to be computed in
    1625             :    * the same way */
                         /* 32 << 20 = 33554432 and 4 << 10 = 4096
                          * NOTE(review): unit is presumably bytes - confirm */
    1626         559 :   tcp_cfg.max_rx_fifo = 32 << 20;
    1627         559 :   tcp_cfg.min_rx_fifo = 4 << 10;
    1628             : 
    1629         559 :   tcp_cfg.default_mtu = 1500;
    1630         559 :   tcp_cfg.initial_cwnd_multiplier = 0;
    1631         559 :   tcp_cfg.enable_tx_pacing = 1;
    1632         559 :   tcp_cfg.allow_tso = 0;
    1633         559 :   tcp_cfg.csum_offload = 1;
    1634         559 :   tcp_cfg.cc_algo = TCP_CC_CUBIC;
    1635         559 :   tcp_cfg.rwnd_min_update_ack = 1;
    1636         559 :   tcp_cfg.max_gso_size = TCP_MAX_GSO_SZ;
    1637             : 
    1638             :   /* Time constants defined as timer tick (100us) multiples */
    1639         559 :   tcp_cfg.closewait_time = 20000;       /* 2s */
    1640         559 :   tcp_cfg.timewait_time = 100000;       /* 10s */
    1641         559 :   tcp_cfg.finwait1_time = 600000;       /* 60s */
    1642         559 :   tcp_cfg.lastack_time = 300000;        /* 30s */
    1643         559 :   tcp_cfg.finwait2_time = 300000;       /* 30s */
    1644         559 :   tcp_cfg.closing_time = 300000;        /* 30s */
    1645         559 :   tcp_cfg.alloc_err_timeout = 1000;     /* 100ms */
    1646             : 
    1647             :   /* This value is seconds */
    1648         559 :   tcp_cfg.cleanup_time = 0.1;   /* 100ms */
    1649         559 : }
    1650             : 
    /*
     * tcp init function, registered through VLIB_INIT_FUNCTION below.
     *
     * Leaves tcp disabled, hooks the tcp header format/unformat functions
     * into the ip protocol info, registers tcp_proto as a transport with the
     * session layer for IP4 and IP6, loads the default configuration and
     * creates the congestion control algorithm name hash.
     *
     * @param vm  vlib main (not used by this function body)
     * @return error if no ip protocol info exists for tcp, 0 otherwise
     */
    1651             : static clib_error_t *
    1652         559 : tcp_init (vlib_main_t * vm)
    1653             : {
    1654         559 :   tcp_main_t *tm = vnet_get_tcp_main ();
    1655         559 :   ip_main_t *im = &ip_main;
    1656             :   ip_protocol_info_t *pi;
    1657             : 
    1658             :   /* Session layer, and by implication tcp, are disabled by default */
    1659         559 :   tm->is_enabled = 0;
    1660             : 
    1661             :   /* Register with IP for header parsing */
    1662         559 :   pi = ip_get_protocol_info (im, IP_PROTOCOL_TCP);
    1663         559 :   if (pi == 0)
    1664           0 :     return clib_error_return (0, "TCP protocol info AWOL");
    1665         559 :   pi->format_header = format_tcp_header;
    1666         559 :   pi->unformat_pg_edit = unformat_pg_tcp_header;
    1667             : 
    1668             :   /* Register as transport with session layer */
    1669         559 :   transport_register_protocol (TRANSPORT_PROTO_TCP, &tcp_proto,
    1670             :                                FIB_PROTOCOL_IP4, tcp4_output_node.index);
    1671         559 :   transport_register_protocol (TRANSPORT_PROTO_TCP, &tcp_proto,
    1672             :                                FIB_PROTOCOL_IP6, tcp6_output_node.index);
    1673             : 
    1674         559 :   tcp_configuration_init ();
    1675             : 
                         /* String-keyed hash with uword-sized values, stored in
                          * tm->cc_algo_by_name */
    1676         559 :   tm->cc_algo_by_name = hash_create_string (0, sizeof (uword));
    1677             : 
    1678         559 :   return 0;
    1679             : }
    1680             : 
    1681       58799 : VLIB_INIT_FUNCTION (tcp_init);
    1682             : 
    1683             : /*
    1684             :  * fd.io coding-style-patch-verification: ON
    1685             :  *
    1686             :  * Local Variables:
    1687             :  * eval: (c-set-style "gnu")
    1688             :  * End:
    1689             :  */

Generated by: LCOV version 1.14